{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007380073800738007, "grad_norm": 0.3079667328722514, "learning_rate": 4.914004914004914e-07, "loss": 0.117, "step": 1 }, { "epoch": 0.0014760147601476014, "grad_norm": 0.1894923336307764, "learning_rate": 9.828009828009828e-07, "loss": 0.0514, "step": 2 }, { "epoch": 0.002214022140221402, "grad_norm": 0.41427050240843993, "learning_rate": 1.4742014742014743e-06, "loss": 0.0393, "step": 3 }, { "epoch": 0.002952029520295203, "grad_norm": 0.10714009392805368, "learning_rate": 1.9656019656019657e-06, "loss": 0.0281, "step": 4 }, { "epoch": 0.0036900369003690036, "grad_norm": 0.18018227187153746, "learning_rate": 2.457002457002457e-06, "loss": 0.0297, "step": 5 }, { "epoch": 0.004428044280442804, "grad_norm": 0.15336773070251217, "learning_rate": 2.9484029484029485e-06, "loss": 0.0415, "step": 6 }, { "epoch": 0.0051660516605166054, "grad_norm": 0.22246182935826153, "learning_rate": 3.43980343980344e-06, "loss": 0.0751, "step": 7 }, { "epoch": 0.005904059040590406, "grad_norm": 0.40373870984152727, "learning_rate": 3.931203931203931e-06, "loss": 0.1348, "step": 8 }, { "epoch": 0.006642066420664207, "grad_norm": 0.34926234878990686, "learning_rate": 4.422604422604422e-06, "loss": 0.0781, "step": 9 }, { "epoch": 0.007380073800738007, "grad_norm": 0.18585368096142374, "learning_rate": 4.914004914004914e-06, "loss": 0.0407, "step": 10 }, { "epoch": 0.008118081180811807, "grad_norm": 0.16646386028432728, "learning_rate": 5.405405405405406e-06, "loss": 0.0566, "step": 11 }, { "epoch": 0.008856088560885609, "grad_norm": 0.3155667553038005, "learning_rate": 5.896805896805897e-06, "loss": 0.0839, "step": 12 }, { "epoch": 0.00959409594095941, "grad_norm": 0.12569975672727676, "learning_rate": 6.388206388206389e-06, "loss": 0.0323, "step": 13 }, { "epoch": 0.010332103321033211, "grad_norm": 0.1809263244618434, "learning_rate": 6.87960687960688e-06, "loss": 0.0582, "step": 14 }, { "epoch": 0.01107011070110701, "grad_norm": 0.22566160651998468, "learning_rate": 7.371007371007371e-06, "loss": 0.0625, "step": 15 }, { "epoch": 0.011808118081180811, "grad_norm": 0.18348998141794176, "learning_rate": 7.862407862407863e-06, "loss": 0.0393, "step": 16 }, { "epoch": 0.012546125461254613, "grad_norm": 0.19747550328339092, "learning_rate": 8.353808353808354e-06, "loss": 0.064, "step": 17 }, { "epoch": 0.013284132841328414, "grad_norm": 0.1462229812033792, "learning_rate": 8.845208845208845e-06, "loss": 0.0296, "step": 18 }, { "epoch": 0.014022140221402213, "grad_norm": 0.2606086358656917, "learning_rate": 9.336609336609337e-06, "loss": 0.0861, "step": 19 }, { "epoch": 0.014760147601476014, "grad_norm": 0.17685124032855626, "learning_rate": 9.828009828009828e-06, "loss": 0.0561, "step": 20 }, { "epoch": 0.015498154981549815, "grad_norm": 0.21219801781126488, "learning_rate": 1.031941031941032e-05, "loss": 0.0413, "step": 21 }, { "epoch": 0.016236162361623615, "grad_norm": 0.35381693806150394, "learning_rate": 1.0810810810810812e-05, "loss": 0.0804, "step": 22 }, { "epoch": 0.016974169741697416, "grad_norm": 0.21679184554109615, "learning_rate": 1.1302211302211303e-05, "loss": 0.0697, "step": 23 }, { "epoch": 0.017712177121771217, "grad_norm": 0.22612749770915638, "learning_rate": 1.1793611793611794e-05, "loss": 0.0507, "step": 24 }, { "epoch": 0.01845018450184502, "grad_norm": 0.32767299572165864, "learning_rate": 1.2285012285012287e-05, "loss": 0.0995, "step": 25 }, { "epoch": 0.01918819188191882, "grad_norm": 0.15181547458794578, "learning_rate": 1.2776412776412778e-05, "loss": 0.0482, "step": 26 }, { "epoch": 0.01992619926199262, "grad_norm": 0.5786379467910463, "learning_rate": 1.3267813267813267e-05, "loss": 0.0724, "step": 27 }, { "epoch": 0.020664206642066422, "grad_norm": 0.345247198858824, "learning_rate": 1.375921375921376e-05, "loss": 0.1133, "step": 28 }, { "epoch": 0.021402214022140223, "grad_norm": 0.623402316353307, "learning_rate": 1.4250614250614252e-05, "loss": 0.1244, "step": 29 }, { "epoch": 0.02214022140221402, "grad_norm": 0.13698149251928105, "learning_rate": 1.4742014742014742e-05, "loss": 0.0416, "step": 30 }, { "epoch": 0.022878228782287822, "grad_norm": 0.2439381182877413, "learning_rate": 1.5233415233415234e-05, "loss": 0.0681, "step": 31 }, { "epoch": 0.023616236162361623, "grad_norm": 0.5837565378400934, "learning_rate": 1.5724815724815725e-05, "loss": 0.1092, "step": 32 }, { "epoch": 0.024354243542435424, "grad_norm": 0.23572898899942377, "learning_rate": 1.6216216216216218e-05, "loss": 0.0454, "step": 33 }, { "epoch": 0.025092250922509225, "grad_norm": 0.380940650325304, "learning_rate": 1.6707616707616707e-05, "loss": 0.0733, "step": 34 }, { "epoch": 0.025830258302583026, "grad_norm": 0.1772727506806864, "learning_rate": 1.71990171990172e-05, "loss": 0.0481, "step": 35 }, { "epoch": 0.026568265682656828, "grad_norm": 0.2602169618722721, "learning_rate": 1.769041769041769e-05, "loss": 0.0566, "step": 36 }, { "epoch": 0.02730627306273063, "grad_norm": 0.14837669385123137, "learning_rate": 1.8181818181818182e-05, "loss": 0.0473, "step": 37 }, { "epoch": 0.028044280442804426, "grad_norm": 0.45996350548805853, "learning_rate": 1.8673218673218675e-05, "loss": 0.0436, "step": 38 }, { "epoch": 0.028782287822878228, "grad_norm": 0.1351720794967364, "learning_rate": 1.9164619164619167e-05, "loss": 0.0681, "step": 39 }, { "epoch": 0.02952029520295203, "grad_norm": 0.28645491432798775, "learning_rate": 1.9656019656019657e-05, "loss": 0.1173, "step": 40 }, { "epoch": 0.03025830258302583, "grad_norm": 0.21416238822136383, "learning_rate": 2.014742014742015e-05, "loss": 0.058, "step": 41 }, { "epoch": 0.03099630996309963, "grad_norm": 0.21693748594327164, "learning_rate": 2.063882063882064e-05, "loss": 0.0573, "step": 42 }, { "epoch": 0.03173431734317343, "grad_norm": 0.3316143645743977, "learning_rate": 2.113022113022113e-05, "loss": 0.1287, "step": 43 }, { "epoch": 0.03247232472324723, "grad_norm": 0.2251563447432854, "learning_rate": 2.1621621621621624e-05, "loss": 0.0581, "step": 44 }, { "epoch": 0.033210332103321034, "grad_norm": 0.23014710158484994, "learning_rate": 2.2113022113022113e-05, "loss": 0.0611, "step": 45 }, { "epoch": 0.03394833948339483, "grad_norm": 0.34069097833712514, "learning_rate": 2.2604422604422606e-05, "loss": 0.1004, "step": 46 }, { "epoch": 0.03468634686346864, "grad_norm": 0.1489051327999646, "learning_rate": 2.3095823095823095e-05, "loss": 0.0416, "step": 47 }, { "epoch": 0.035424354243542434, "grad_norm": 0.09921816680158742, "learning_rate": 2.3587223587223588e-05, "loss": 0.0312, "step": 48 }, { "epoch": 0.03616236162361624, "grad_norm": 0.15033389883787268, "learning_rate": 2.4078624078624077e-05, "loss": 0.0212, "step": 49 }, { "epoch": 0.03690036900369004, "grad_norm": 0.08074677781986132, "learning_rate": 2.4570024570024573e-05, "loss": 0.0212, "step": 50 }, { "epoch": 0.037638376383763834, "grad_norm": 0.1075852093905305, "learning_rate": 2.5061425061425066e-05, "loss": 0.0347, "step": 51 }, { "epoch": 0.03837638376383764, "grad_norm": 0.44750777570449113, "learning_rate": 2.5552825552825555e-05, "loss": 0.0509, "step": 52 }, { "epoch": 0.03911439114391144, "grad_norm": 0.4978003723692939, "learning_rate": 2.6044226044226045e-05, "loss": 0.0873, "step": 53 }, { "epoch": 0.03985239852398524, "grad_norm": 0.2237287566155572, "learning_rate": 2.6535626535626534e-05, "loss": 0.0466, "step": 54 }, { "epoch": 0.04059040590405904, "grad_norm": 0.36579023076620915, "learning_rate": 2.702702702702703e-05, "loss": 0.1198, "step": 55 }, { "epoch": 0.041328413284132844, "grad_norm": 0.24978079004415094, "learning_rate": 2.751842751842752e-05, "loss": 0.046, "step": 56 }, { "epoch": 0.04206642066420664, "grad_norm": 0.25562221569665533, "learning_rate": 2.800982800982801e-05, "loss": 0.0558, "step": 57 }, { "epoch": 0.042804428044280446, "grad_norm": 0.30637527535485903, "learning_rate": 2.8501228501228505e-05, "loss": 0.0617, "step": 58 }, { "epoch": 0.043542435424354244, "grad_norm": 0.16259555762236164, "learning_rate": 2.8992628992628994e-05, "loss": 0.0441, "step": 59 }, { "epoch": 0.04428044280442804, "grad_norm": 0.12801458020732173, "learning_rate": 2.9484029484029483e-05, "loss": 0.0391, "step": 60 }, { "epoch": 0.045018450184501846, "grad_norm": 0.11059227636162758, "learning_rate": 2.9975429975429976e-05, "loss": 0.0295, "step": 61 }, { "epoch": 0.045756457564575644, "grad_norm": 0.2397650315810163, "learning_rate": 3.046683046683047e-05, "loss": 0.0706, "step": 62 }, { "epoch": 0.04649446494464945, "grad_norm": 0.23088746574178057, "learning_rate": 3.095823095823096e-05, "loss": 0.0605, "step": 63 }, { "epoch": 0.047232472324723246, "grad_norm": 0.17079308014965944, "learning_rate": 3.144963144963145e-05, "loss": 0.032, "step": 64 }, { "epoch": 0.04797047970479705, "grad_norm": 0.23054200098770644, "learning_rate": 3.1941031941031943e-05, "loss": 0.0355, "step": 65 }, { "epoch": 0.04870848708487085, "grad_norm": 0.2744280328024215, "learning_rate": 3.2432432432432436e-05, "loss": 0.072, "step": 66 }, { "epoch": 0.04944649446494465, "grad_norm": 0.21076008193885137, "learning_rate": 3.292383292383293e-05, "loss": 0.0525, "step": 67 }, { "epoch": 0.05018450184501845, "grad_norm": 0.1072724985500534, "learning_rate": 3.3415233415233415e-05, "loss": 0.036, "step": 68 }, { "epoch": 0.05092250922509225, "grad_norm": 0.19221814687814887, "learning_rate": 3.390663390663391e-05, "loss": 0.061, "step": 69 }, { "epoch": 0.05166051660516605, "grad_norm": 0.20400598670484027, "learning_rate": 3.43980343980344e-05, "loss": 0.0461, "step": 70 }, { "epoch": 0.05239852398523985, "grad_norm": 0.25422094549834584, "learning_rate": 3.488943488943489e-05, "loss": 0.071, "step": 71 }, { "epoch": 0.053136531365313655, "grad_norm": 0.39492851559289327, "learning_rate": 3.538083538083538e-05, "loss": 0.0838, "step": 72 }, { "epoch": 0.05387453874538745, "grad_norm": 0.32276015044342155, "learning_rate": 3.587223587223588e-05, "loss": 0.0898, "step": 73 }, { "epoch": 0.05461254612546126, "grad_norm": 0.3477452478480123, "learning_rate": 3.6363636363636364e-05, "loss": 0.0418, "step": 74 }, { "epoch": 0.055350553505535055, "grad_norm": 0.2583394867814939, "learning_rate": 3.685503685503686e-05, "loss": 0.0768, "step": 75 }, { "epoch": 0.05608856088560885, "grad_norm": 0.1872176579722729, "learning_rate": 3.734643734643735e-05, "loss": 0.0577, "step": 76 }, { "epoch": 0.05682656826568266, "grad_norm": 0.11883143906818665, "learning_rate": 3.783783783783784e-05, "loss": 0.0313, "step": 77 }, { "epoch": 0.057564575645756455, "grad_norm": 0.30575570451228984, "learning_rate": 3.8329238329238335e-05, "loss": 0.0866, "step": 78 }, { "epoch": 0.05830258302583026, "grad_norm": 0.2660296852299853, "learning_rate": 3.882063882063882e-05, "loss": 0.0663, "step": 79 }, { "epoch": 0.05904059040590406, "grad_norm": 0.2704004340198259, "learning_rate": 3.9312039312039314e-05, "loss": 0.0695, "step": 80 }, { "epoch": 0.05977859778597786, "grad_norm": 0.19205939379073853, "learning_rate": 3.9803439803439806e-05, "loss": 0.0349, "step": 81 }, { "epoch": 0.06051660516605166, "grad_norm": 0.34671465252949485, "learning_rate": 4.02948402948403e-05, "loss": 0.0841, "step": 82 }, { "epoch": 0.061254612546125464, "grad_norm": 0.1849468746990807, "learning_rate": 4.0786240786240785e-05, "loss": 0.0597, "step": 83 }, { "epoch": 0.06199261992619926, "grad_norm": 0.16085830526170317, "learning_rate": 4.127764127764128e-05, "loss": 0.0405, "step": 84 }, { "epoch": 0.06273062730627306, "grad_norm": 0.14730117843687723, "learning_rate": 4.176904176904177e-05, "loss": 0.0565, "step": 85 }, { "epoch": 0.06346863468634686, "grad_norm": 0.19009570607214485, "learning_rate": 4.226044226044226e-05, "loss": 0.0371, "step": 86 }, { "epoch": 0.06420664206642067, "grad_norm": 0.4221115331961075, "learning_rate": 4.2751842751842756e-05, "loss": 0.1078, "step": 87 }, { "epoch": 0.06494464944649446, "grad_norm": 0.16962476846095836, "learning_rate": 4.324324324324325e-05, "loss": 0.0442, "step": 88 }, { "epoch": 0.06568265682656826, "grad_norm": 0.14109233872592777, "learning_rate": 4.373464373464374e-05, "loss": 0.0305, "step": 89 }, { "epoch": 0.06642066420664207, "grad_norm": 0.1783802429420408, "learning_rate": 4.422604422604423e-05, "loss": 0.0481, "step": 90 }, { "epoch": 0.06715867158671587, "grad_norm": 0.25215995546463166, "learning_rate": 4.471744471744472e-05, "loss": 0.0658, "step": 91 }, { "epoch": 0.06789667896678966, "grad_norm": 0.19379565196604576, "learning_rate": 4.520884520884521e-05, "loss": 0.0728, "step": 92 }, { "epoch": 0.06863468634686347, "grad_norm": 0.15735742908234843, "learning_rate": 4.5700245700245705e-05, "loss": 0.0396, "step": 93 }, { "epoch": 0.06937269372693727, "grad_norm": 0.24185693805080044, "learning_rate": 4.619164619164619e-05, "loss": 0.0307, "step": 94 }, { "epoch": 0.07011070110701106, "grad_norm": 0.4727421262894176, "learning_rate": 4.6683046683046684e-05, "loss": 0.1512, "step": 95 }, { "epoch": 0.07084870848708487, "grad_norm": 0.252186638370853, "learning_rate": 4.7174447174447176e-05, "loss": 0.0582, "step": 96 }, { "epoch": 0.07158671586715867, "grad_norm": 0.2474150049343888, "learning_rate": 4.766584766584767e-05, "loss": 0.0972, "step": 97 }, { "epoch": 0.07232472324723248, "grad_norm": 0.29174821221627284, "learning_rate": 4.8157248157248155e-05, "loss": 0.0828, "step": 98 }, { "epoch": 0.07306273062730627, "grad_norm": 0.24175539280492186, "learning_rate": 4.8648648648648654e-05, "loss": 0.0961, "step": 99 }, { "epoch": 0.07380073800738007, "grad_norm": 0.15827123383932454, "learning_rate": 4.914004914004915e-05, "loss": 0.04, "step": 100 }, { "epoch": 0.07453874538745388, "grad_norm": 0.27422446145455054, "learning_rate": 4.963144963144963e-05, "loss": 0.0636, "step": 101 }, { "epoch": 0.07527675276752767, "grad_norm": 0.17386580546632605, "learning_rate": 5.012285012285013e-05, "loss": 0.0632, "step": 102 }, { "epoch": 0.07601476014760147, "grad_norm": 0.28557412871181864, "learning_rate": 5.061425061425061e-05, "loss": 0.0608, "step": 103 }, { "epoch": 0.07675276752767528, "grad_norm": 0.303690483427473, "learning_rate": 5.110565110565111e-05, "loss": 0.0703, "step": 104 }, { "epoch": 0.07749077490774908, "grad_norm": 0.19616302230173488, "learning_rate": 5.1597051597051604e-05, "loss": 0.0414, "step": 105 }, { "epoch": 0.07822878228782287, "grad_norm": 0.14271910339666544, "learning_rate": 5.208845208845209e-05, "loss": 0.0361, "step": 106 }, { "epoch": 0.07896678966789668, "grad_norm": 0.38563158375721984, "learning_rate": 5.257985257985258e-05, "loss": 0.1002, "step": 107 }, { "epoch": 0.07970479704797048, "grad_norm": 0.22508710780101765, "learning_rate": 5.307125307125307e-05, "loss": 0.0566, "step": 108 }, { "epoch": 0.08044280442804429, "grad_norm": 0.15334305767100687, "learning_rate": 5.356265356265356e-05, "loss": 0.0466, "step": 109 }, { "epoch": 0.08118081180811808, "grad_norm": 0.205612111918147, "learning_rate": 5.405405405405406e-05, "loss": 0.0435, "step": 110 }, { "epoch": 0.08191881918819188, "grad_norm": 0.18505061921736368, "learning_rate": 5.4545454545454546e-05, "loss": 0.0505, "step": 111 }, { "epoch": 0.08265682656826569, "grad_norm": 0.2599663355255438, "learning_rate": 5.503685503685504e-05, "loss": 0.0965, "step": 112 }, { "epoch": 0.08339483394833948, "grad_norm": 0.27238883528481367, "learning_rate": 5.552825552825554e-05, "loss": 0.0852, "step": 113 }, { "epoch": 0.08413284132841328, "grad_norm": 0.16603890951751793, "learning_rate": 5.601965601965602e-05, "loss": 0.0615, "step": 114 }, { "epoch": 0.08487084870848709, "grad_norm": 0.16884983208897433, "learning_rate": 5.651105651105652e-05, "loss": 0.0454, "step": 115 }, { "epoch": 0.08560885608856089, "grad_norm": 0.3614849204548111, "learning_rate": 5.700245700245701e-05, "loss": 0.1225, "step": 116 }, { "epoch": 0.08634686346863468, "grad_norm": 0.1350101283940287, "learning_rate": 5.7493857493857496e-05, "loss": 0.0399, "step": 117 }, { "epoch": 0.08708487084870849, "grad_norm": 0.15756671046391316, "learning_rate": 5.798525798525799e-05, "loss": 0.0422, "step": 118 }, { "epoch": 0.08782287822878229, "grad_norm": 0.24841551304021953, "learning_rate": 5.8476658476658474e-05, "loss": 0.0831, "step": 119 }, { "epoch": 0.08856088560885608, "grad_norm": 0.17526589097042913, "learning_rate": 5.896805896805897e-05, "loss": 0.0636, "step": 120 }, { "epoch": 0.08929889298892989, "grad_norm": 0.20229834879450168, "learning_rate": 5.9459459459459466e-05, "loss": 0.0535, "step": 121 }, { "epoch": 0.09003690036900369, "grad_norm": 0.19750143211261492, "learning_rate": 5.995085995085995e-05, "loss": 0.0548, "step": 122 }, { "epoch": 0.0907749077490775, "grad_norm": 0.0917438301905323, "learning_rate": 6.0442260442260445e-05, "loss": 0.025, "step": 123 }, { "epoch": 0.09151291512915129, "grad_norm": 0.25207433332403634, "learning_rate": 6.093366093366094e-05, "loss": 0.0616, "step": 124 }, { "epoch": 0.09225092250922509, "grad_norm": 0.19372466437491445, "learning_rate": 6.142506142506142e-05, "loss": 0.0418, "step": 125 }, { "epoch": 0.0929889298892989, "grad_norm": 0.18950852721349126, "learning_rate": 6.191646191646192e-05, "loss": 0.0622, "step": 126 }, { "epoch": 0.09372693726937269, "grad_norm": 0.18116610123981686, "learning_rate": 6.240786240786242e-05, "loss": 0.0403, "step": 127 }, { "epoch": 0.09446494464944649, "grad_norm": 0.09231303358531776, "learning_rate": 6.28992628992629e-05, "loss": 0.0332, "step": 128 }, { "epoch": 0.0952029520295203, "grad_norm": 0.2566174925111758, "learning_rate": 6.33906633906634e-05, "loss": 0.0476, "step": 129 }, { "epoch": 0.0959409594095941, "grad_norm": 0.1514939718007959, "learning_rate": 6.388206388206389e-05, "loss": 0.0723, "step": 130 }, { "epoch": 0.09667896678966789, "grad_norm": 0.2139372561190259, "learning_rate": 6.437346437346438e-05, "loss": 0.0616, "step": 131 }, { "epoch": 0.0974169741697417, "grad_norm": 0.2321185294597769, "learning_rate": 6.486486486486487e-05, "loss": 0.078, "step": 132 }, { "epoch": 0.0981549815498155, "grad_norm": 0.24942603576036193, "learning_rate": 6.535626535626535e-05, "loss": 0.0648, "step": 133 }, { "epoch": 0.0988929889298893, "grad_norm": 0.19084945331684627, "learning_rate": 6.584766584766586e-05, "loss": 0.0644, "step": 134 }, { "epoch": 0.0996309963099631, "grad_norm": 0.2769077593682619, "learning_rate": 6.633906633906635e-05, "loss": 0.0816, "step": 135 }, { "epoch": 0.1003690036900369, "grad_norm": 0.17017143407685192, "learning_rate": 6.683046683046683e-05, "loss": 0.049, "step": 136 }, { "epoch": 0.1011070110701107, "grad_norm": 0.36063477380535147, "learning_rate": 6.732186732186732e-05, "loss": 0.1182, "step": 137 }, { "epoch": 0.1018450184501845, "grad_norm": 0.19657328256100048, "learning_rate": 6.781326781326781e-05, "loss": 0.0546, "step": 138 }, { "epoch": 0.1025830258302583, "grad_norm": 0.18650506389875304, "learning_rate": 6.830466830466831e-05, "loss": 0.0662, "step": 139 }, { "epoch": 0.1033210332103321, "grad_norm": 0.15467634294669633, "learning_rate": 6.87960687960688e-05, "loss": 0.063, "step": 140 }, { "epoch": 0.10405904059040591, "grad_norm": 0.1540912643298967, "learning_rate": 6.928746928746929e-05, "loss": 0.0518, "step": 141 }, { "epoch": 0.1047970479704797, "grad_norm": 0.11944434223621488, "learning_rate": 6.977886977886979e-05, "loss": 0.0238, "step": 142 }, { "epoch": 0.1055350553505535, "grad_norm": 0.18361470058089868, "learning_rate": 7.027027027027028e-05, "loss": 0.0563, "step": 143 }, { "epoch": 0.10627306273062731, "grad_norm": 0.09756988862460299, "learning_rate": 7.076167076167076e-05, "loss": 0.032, "step": 144 }, { "epoch": 0.1070110701107011, "grad_norm": 0.22765245212812235, "learning_rate": 7.125307125307126e-05, "loss": 0.0545, "step": 145 }, { "epoch": 0.1077490774907749, "grad_norm": 0.10107485737447015, "learning_rate": 7.174447174447176e-05, "loss": 0.0252, "step": 146 }, { "epoch": 0.10848708487084871, "grad_norm": 0.18759395368409446, "learning_rate": 7.223587223587224e-05, "loss": 0.0817, "step": 147 }, { "epoch": 0.10922509225092251, "grad_norm": 0.24805196997337634, "learning_rate": 7.272727272727273e-05, "loss": 0.0515, "step": 148 }, { "epoch": 0.1099630996309963, "grad_norm": 0.164345276209045, "learning_rate": 7.321867321867322e-05, "loss": 0.0449, "step": 149 }, { "epoch": 0.11070110701107011, "grad_norm": 0.33146660759708485, "learning_rate": 7.371007371007371e-05, "loss": 0.0894, "step": 150 }, { "epoch": 0.11143911439114391, "grad_norm": 0.18650083874304627, "learning_rate": 7.42014742014742e-05, "loss": 0.0752, "step": 151 }, { "epoch": 0.1121771217712177, "grad_norm": 0.23614574999466256, "learning_rate": 7.46928746928747e-05, "loss": 0.0759, "step": 152 }, { "epoch": 0.11291512915129151, "grad_norm": 0.31733983228925033, "learning_rate": 7.518427518427519e-05, "loss": 0.1289, "step": 153 }, { "epoch": 0.11365313653136531, "grad_norm": 0.17671556092433016, "learning_rate": 7.567567567567568e-05, "loss": 0.0268, "step": 154 }, { "epoch": 0.11439114391143912, "grad_norm": 0.2418851166541128, "learning_rate": 7.616707616707616e-05, "loss": 0.0621, "step": 155 }, { "epoch": 0.11512915129151291, "grad_norm": 0.25386302515913034, "learning_rate": 7.665847665847667e-05, "loss": 0.0671, "step": 156 }, { "epoch": 0.11586715867158671, "grad_norm": 0.16044210276145973, "learning_rate": 7.714987714987716e-05, "loss": 0.0451, "step": 157 }, { "epoch": 0.11660516605166052, "grad_norm": 0.16691269144288384, "learning_rate": 7.764127764127764e-05, "loss": 0.0593, "step": 158 }, { "epoch": 0.11734317343173432, "grad_norm": 0.09392761643641052, "learning_rate": 7.813267813267813e-05, "loss": 0.0202, "step": 159 }, { "epoch": 0.11808118081180811, "grad_norm": 0.22035578472299172, "learning_rate": 7.862407862407863e-05, "loss": 0.0505, "step": 160 }, { "epoch": 0.11881918819188192, "grad_norm": 0.13972069034827747, "learning_rate": 7.911547911547912e-05, "loss": 0.0364, "step": 161 }, { "epoch": 0.11955719557195572, "grad_norm": 0.38838208532847557, "learning_rate": 7.960687960687961e-05, "loss": 0.1034, "step": 162 }, { "epoch": 0.12029520295202951, "grad_norm": 0.38253372404002384, "learning_rate": 8.00982800982801e-05, "loss": 0.0769, "step": 163 }, { "epoch": 0.12103321033210332, "grad_norm": 0.4154511237052845, "learning_rate": 8.05896805896806e-05, "loss": 0.0882, "step": 164 }, { "epoch": 0.12177121771217712, "grad_norm": 0.2475825976260678, "learning_rate": 8.108108108108109e-05, "loss": 0.0418, "step": 165 }, { "epoch": 0.12250922509225093, "grad_norm": 0.2683206196821622, "learning_rate": 8.157248157248157e-05, "loss": 0.1336, "step": 166 }, { "epoch": 0.12324723247232472, "grad_norm": 0.7365510091691913, "learning_rate": 8.206388206388208e-05, "loss": 0.1071, "step": 167 }, { "epoch": 0.12398523985239852, "grad_norm": 0.19970577643716397, "learning_rate": 8.255528255528255e-05, "loss": 0.066, "step": 168 }, { "epoch": 0.12472324723247233, "grad_norm": 0.22522362656362388, "learning_rate": 8.304668304668305e-05, "loss": 0.0566, "step": 169 }, { "epoch": 0.12546125461254612, "grad_norm": 0.36085582608968114, "learning_rate": 8.353808353808354e-05, "loss": 0.1082, "step": 170 }, { "epoch": 0.12619926199261994, "grad_norm": 0.23593421104283632, "learning_rate": 8.402948402948403e-05, "loss": 0.0999, "step": 171 }, { "epoch": 0.12693726937269373, "grad_norm": 0.2516066852982473, "learning_rate": 8.452088452088453e-05, "loss": 0.0643, "step": 172 }, { "epoch": 0.12767527675276752, "grad_norm": 0.11464491564544342, "learning_rate": 8.501228501228502e-05, "loss": 0.0366, "step": 173 }, { "epoch": 0.12841328413284134, "grad_norm": 0.29618121302953543, "learning_rate": 8.550368550368551e-05, "loss": 0.065, "step": 174 }, { "epoch": 0.12915129151291513, "grad_norm": 0.3294752645784318, "learning_rate": 8.5995085995086e-05, "loss": 0.0732, "step": 175 }, { "epoch": 0.12988929889298892, "grad_norm": 0.24859270180807308, "learning_rate": 8.64864864864865e-05, "loss": 0.063, "step": 176 }, { "epoch": 0.13062730627306274, "grad_norm": 0.18442728844470332, "learning_rate": 8.697788697788698e-05, "loss": 0.0447, "step": 177 }, { "epoch": 0.13136531365313653, "grad_norm": 0.11471934482215519, "learning_rate": 8.746928746928748e-05, "loss": 0.0186, "step": 178 }, { "epoch": 0.13210332103321032, "grad_norm": 0.13497539263112, "learning_rate": 8.796068796068796e-05, "loss": 0.0358, "step": 179 }, { "epoch": 0.13284132841328414, "grad_norm": 0.21543217087838204, "learning_rate": 8.845208845208845e-05, "loss": 0.07, "step": 180 }, { "epoch": 0.13357933579335793, "grad_norm": 0.39284891616013307, "learning_rate": 8.894348894348895e-05, "loss": 0.085, "step": 181 }, { "epoch": 0.13431734317343175, "grad_norm": 0.11028189484562091, "learning_rate": 8.943488943488944e-05, "loss": 0.0327, "step": 182 }, { "epoch": 0.13505535055350554, "grad_norm": 0.3166001940963968, "learning_rate": 8.992628992628993e-05, "loss": 0.0947, "step": 183 }, { "epoch": 0.13579335793357933, "grad_norm": 0.12490919180481078, "learning_rate": 9.041769041769042e-05, "loss": 0.0257, "step": 184 }, { "epoch": 0.13653136531365315, "grad_norm": 0.14989277094908446, "learning_rate": 9.090909090909092e-05, "loss": 0.0315, "step": 185 }, { "epoch": 0.13726937269372694, "grad_norm": 0.14916777982944213, "learning_rate": 9.140049140049141e-05, "loss": 0.0516, "step": 186 }, { "epoch": 0.13800738007380073, "grad_norm": 0.1777883725644129, "learning_rate": 9.18918918918919e-05, "loss": 0.0586, "step": 187 }, { "epoch": 0.13874538745387455, "grad_norm": 0.21552831996332747, "learning_rate": 9.238329238329238e-05, "loss": 0.0662, "step": 188 }, { "epoch": 0.13948339483394834, "grad_norm": 0.2680369129939743, "learning_rate": 9.287469287469289e-05, "loss": 0.069, "step": 189 }, { "epoch": 0.14022140221402213, "grad_norm": 0.12610033720839467, "learning_rate": 9.336609336609337e-05, "loss": 0.0319, "step": 190 }, { "epoch": 0.14095940959409595, "grad_norm": 0.3461093646280557, "learning_rate": 9.385749385749386e-05, "loss": 0.083, "step": 191 }, { "epoch": 0.14169741697416974, "grad_norm": 0.22824743406048967, "learning_rate": 9.434889434889435e-05, "loss": 0.0709, "step": 192 }, { "epoch": 0.14243542435424356, "grad_norm": 0.11306760252014494, "learning_rate": 9.484029484029485e-05, "loss": 0.0387, "step": 193 }, { "epoch": 0.14317343173431735, "grad_norm": 0.17492507685622777, "learning_rate": 9.533169533169534e-05, "loss": 0.04, "step": 194 }, { "epoch": 0.14391143911439114, "grad_norm": 0.38010023043797647, "learning_rate": 9.582309582309583e-05, "loss": 0.1014, "step": 195 }, { "epoch": 0.14464944649446496, "grad_norm": 0.16975099960762727, "learning_rate": 9.631449631449631e-05, "loss": 0.0387, "step": 196 }, { "epoch": 0.14538745387453875, "grad_norm": 0.17015717534935715, "learning_rate": 9.680589680589682e-05, "loss": 0.0402, "step": 197 }, { "epoch": 0.14612546125461254, "grad_norm": 0.2157698865435607, "learning_rate": 9.729729729729731e-05, "loss": 0.052, "step": 198 }, { "epoch": 0.14686346863468636, "grad_norm": 0.13581875464919918, "learning_rate": 9.778869778869779e-05, "loss": 0.0334, "step": 199 }, { "epoch": 0.14760147601476015, "grad_norm": 0.2344990452162473, "learning_rate": 9.82800982800983e-05, "loss": 0.036, "step": 200 }, { "epoch": 0.14833948339483394, "grad_norm": 0.21325765739659414, "learning_rate": 9.877149877149877e-05, "loss": 0.0485, "step": 201 }, { "epoch": 0.14907749077490776, "grad_norm": 0.30739952663265985, "learning_rate": 9.926289926289927e-05, "loss": 0.1448, "step": 202 }, { "epoch": 0.14981549815498155, "grad_norm": 0.170791257576569, "learning_rate": 9.975429975429976e-05, "loss": 0.0441, "step": 203 }, { "epoch": 0.15055350553505534, "grad_norm": 0.1579642646069598, "learning_rate": 0.00010024570024570026, "loss": 0.0439, "step": 204 }, { "epoch": 0.15129151291512916, "grad_norm": 0.20816315382744544, "learning_rate": 0.00010073710073710074, "loss": 0.0782, "step": 205 }, { "epoch": 0.15202952029520295, "grad_norm": 0.12840822198641538, "learning_rate": 0.00010122850122850122, "loss": 0.0347, "step": 206 }, { "epoch": 0.15276752767527677, "grad_norm": 0.18155963844038425, "learning_rate": 0.00010171990171990173, "loss": 0.045, "step": 207 }, { "epoch": 0.15350553505535056, "grad_norm": 0.30328641612015, "learning_rate": 0.00010221130221130222, "loss": 0.0897, "step": 208 }, { "epoch": 0.15424354243542435, "grad_norm": 0.18707435537045453, "learning_rate": 0.0001027027027027027, "loss": 0.0452, "step": 209 }, { "epoch": 0.15498154981549817, "grad_norm": 0.10755687410434948, "learning_rate": 0.00010319410319410321, "loss": 0.0287, "step": 210 }, { "epoch": 0.15571955719557196, "grad_norm": 0.3253350953971806, "learning_rate": 0.0001036855036855037, "loss": 0.0934, "step": 211 }, { "epoch": 0.15645756457564575, "grad_norm": 0.39046490139401185, "learning_rate": 0.00010417690417690418, "loss": 0.104, "step": 212 }, { "epoch": 0.15719557195571957, "grad_norm": 0.2695648112514124, "learning_rate": 0.00010466830466830469, "loss": 0.064, "step": 213 }, { "epoch": 0.15793357933579336, "grad_norm": 0.23937722221404636, "learning_rate": 0.00010515970515970516, "loss": 0.0827, "step": 214 }, { "epoch": 0.15867158671586715, "grad_norm": 0.24924194837321803, "learning_rate": 0.00010565110565110566, "loss": 0.0809, "step": 215 }, { "epoch": 0.15940959409594097, "grad_norm": 0.20289073155044463, "learning_rate": 0.00010614250614250614, "loss": 0.0716, "step": 216 }, { "epoch": 0.16014760147601476, "grad_norm": 0.1801160823592329, "learning_rate": 0.00010663390663390664, "loss": 0.0498, "step": 217 }, { "epoch": 0.16088560885608857, "grad_norm": 0.14994859211189604, "learning_rate": 0.00010712530712530712, "loss": 0.1092, "step": 218 }, { "epoch": 0.16162361623616237, "grad_norm": 0.10005463497094133, "learning_rate": 0.00010761670761670761, "loss": 0.0291, "step": 219 }, { "epoch": 0.16236162361623616, "grad_norm": 0.12490482509898662, "learning_rate": 0.00010810810810810812, "loss": 0.0145, "step": 220 }, { "epoch": 0.16309963099630997, "grad_norm": 0.12820423441512308, "learning_rate": 0.0001085995085995086, "loss": 0.0374, "step": 221 }, { "epoch": 0.16383763837638377, "grad_norm": 0.1768244195947338, "learning_rate": 0.00010909090909090909, "loss": 0.0548, "step": 222 }, { "epoch": 0.16457564575645756, "grad_norm": 0.2866400834177101, "learning_rate": 0.0001095823095823096, "loss": 0.0831, "step": 223 }, { "epoch": 0.16531365313653137, "grad_norm": 0.2536801248255819, "learning_rate": 0.00011007371007371008, "loss": 0.0501, "step": 224 }, { "epoch": 0.16605166051660517, "grad_norm": 0.20908257757378618, "learning_rate": 0.00011056511056511056, "loss": 0.0479, "step": 225 }, { "epoch": 0.16678966789667896, "grad_norm": 0.1322553692689183, "learning_rate": 0.00011105651105651108, "loss": 0.0392, "step": 226 }, { "epoch": 0.16752767527675277, "grad_norm": 0.13471765952728856, "learning_rate": 0.00011154791154791156, "loss": 0.0402, "step": 227 }, { "epoch": 0.16826568265682657, "grad_norm": 0.18099811061770404, "learning_rate": 0.00011203931203931204, "loss": 0.0586, "step": 228 }, { "epoch": 0.16900369003690036, "grad_norm": 0.31836360349301346, "learning_rate": 0.00011253071253071254, "loss": 0.0593, "step": 229 }, { "epoch": 0.16974169741697417, "grad_norm": 0.3437451773023117, "learning_rate": 0.00011302211302211303, "loss": 0.0983, "step": 230 }, { "epoch": 0.17047970479704797, "grad_norm": 0.15315047699921625, "learning_rate": 0.00011351351351351351, "loss": 0.047, "step": 231 }, { "epoch": 0.17121771217712178, "grad_norm": 0.18586278779504414, "learning_rate": 0.00011400491400491402, "loss": 0.0339, "step": 232 }, { "epoch": 0.17195571955719557, "grad_norm": 0.3345439680308082, "learning_rate": 0.0001144963144963145, "loss": 0.08, "step": 233 }, { "epoch": 0.17269372693726937, "grad_norm": 0.1592484008801119, "learning_rate": 0.00011498771498771499, "loss": 0.0265, "step": 234 }, { "epoch": 0.17343173431734318, "grad_norm": 0.12424643377490861, "learning_rate": 0.00011547911547911547, "loss": 0.0263, "step": 235 }, { "epoch": 0.17416974169741697, "grad_norm": 0.23338646496571508, "learning_rate": 0.00011597051597051598, "loss": 0.0614, "step": 236 }, { "epoch": 0.17490774907749077, "grad_norm": 0.30516878630257044, "learning_rate": 0.00011646191646191647, "loss": 0.0855, "step": 237 }, { "epoch": 0.17564575645756458, "grad_norm": 0.20735343305952433, "learning_rate": 0.00011695331695331695, "loss": 0.056, "step": 238 }, { "epoch": 0.17638376383763837, "grad_norm": 0.1192228170961843, "learning_rate": 0.00011744471744471745, "loss": 0.0285, "step": 239 }, { "epoch": 0.17712177121771217, "grad_norm": 0.18078849728110313, "learning_rate": 0.00011793611793611793, "loss": 0.0546, "step": 240 }, { "epoch": 0.17785977859778598, "grad_norm": 0.6604781290345418, "learning_rate": 0.00011842751842751843, "loss": 0.1092, "step": 241 }, { "epoch": 0.17859778597785977, "grad_norm": 0.1552257440196663, "learning_rate": 0.00011891891891891893, "loss": 0.0474, "step": 242 }, { "epoch": 0.1793357933579336, "grad_norm": 0.15272560366675095, "learning_rate": 0.00011941031941031941, "loss": 0.0546, "step": 243 }, { "epoch": 0.18007380073800738, "grad_norm": 0.19165205044827022, "learning_rate": 0.0001199017199017199, "loss": 0.044, "step": 244 }, { "epoch": 0.18081180811808117, "grad_norm": 0.15708315401507722, "learning_rate": 0.00012039312039312041, "loss": 0.055, "step": 245 }, { "epoch": 0.181549815498155, "grad_norm": 0.22060376892504324, "learning_rate": 0.00012088452088452089, "loss": 0.0809, "step": 246 }, { "epoch": 0.18228782287822878, "grad_norm": 0.5453688224343797, "learning_rate": 0.00012137592137592137, "loss": 0.1192, "step": 247 }, { "epoch": 0.18302583025830257, "grad_norm": 0.1260151028422771, "learning_rate": 0.00012186732186732188, "loss": 0.0292, "step": 248 }, { "epoch": 0.1837638376383764, "grad_norm": 0.16876978910836665, "learning_rate": 0.00012235872235872235, "loss": 0.0835, "step": 249 }, { "epoch": 0.18450184501845018, "grad_norm": 0.11192128993452585, "learning_rate": 0.00012285012285012285, "loss": 0.0411, "step": 250 }, { "epoch": 0.18523985239852397, "grad_norm": 0.2761378853186356, "learning_rate": 0.00012334152334152337, "loss": 0.0964, "step": 251 }, { "epoch": 0.1859778597785978, "grad_norm": 0.3636892212764276, "learning_rate": 0.00012383292383292383, "loss": 0.0928, "step": 252 }, { "epoch": 0.18671586715867158, "grad_norm": 0.5134015792803261, "learning_rate": 0.00012432432432432433, "loss": 0.1168, "step": 253 }, { "epoch": 0.18745387453874537, "grad_norm": 0.24934785343688792, "learning_rate": 0.00012481572481572484, "loss": 0.0426, "step": 254 }, { "epoch": 0.1881918819188192, "grad_norm": 0.23622001594464606, "learning_rate": 0.0001253071253071253, "loss": 0.0558, "step": 255 }, { "epoch": 0.18892988929889298, "grad_norm": 0.25168983979327103, "learning_rate": 0.0001257985257985258, "loss": 0.0778, "step": 256 }, { "epoch": 0.1896678966789668, "grad_norm": 0.20953801803294408, "learning_rate": 0.0001262899262899263, "loss": 0.0684, "step": 257 }, { "epoch": 0.1904059040590406, "grad_norm": 0.11749399708304124, "learning_rate": 0.0001267813267813268, "loss": 0.032, "step": 258 }, { "epoch": 0.19114391143911438, "grad_norm": 0.119485049861363, "learning_rate": 0.00012727272727272728, "loss": 0.0355, "step": 259 }, { "epoch": 0.1918819188191882, "grad_norm": 0.16104907182297357, "learning_rate": 0.00012776412776412777, "loss": 0.0392, "step": 260 }, { "epoch": 0.192619926199262, "grad_norm": 0.1592154814963246, "learning_rate": 0.00012825552825552827, "loss": 0.033, "step": 261 }, { "epoch": 0.19335793357933578, "grad_norm": 0.24688878828924235, "learning_rate": 0.00012874692874692876, "loss": 0.0652, "step": 262 }, { "epoch": 0.1940959409594096, "grad_norm": 0.16790559039340483, "learning_rate": 0.00012923832923832922, "loss": 0.0504, "step": 263 }, { "epoch": 0.1948339483394834, "grad_norm": 0.12402102422705129, "learning_rate": 0.00012972972972972974, "loss": 0.0355, "step": 264 }, { "epoch": 0.19557195571955718, "grad_norm": 0.29432667857441647, "learning_rate": 0.00013022113022113024, "loss": 0.0786, "step": 265 }, { "epoch": 0.196309963099631, "grad_norm": 0.2521194799597439, "learning_rate": 0.0001307125307125307, "loss": 0.0552, "step": 266 }, { "epoch": 0.1970479704797048, "grad_norm": 0.2545202595232484, "learning_rate": 0.00013120393120393122, "loss": 0.0443, "step": 267 }, { "epoch": 0.1977859778597786, "grad_norm": 0.13481425233394576, "learning_rate": 0.00013169533169533172, "loss": 0.0303, "step": 268 }, { "epoch": 0.1985239852398524, "grad_norm": 0.2951463641350795, "learning_rate": 0.00013218673218673218, "loss": 0.0744, "step": 269 }, { "epoch": 0.1992619926199262, "grad_norm": 0.25055815171392704, "learning_rate": 0.0001326781326781327, "loss": 0.0549, "step": 270 }, { "epoch": 0.2, "grad_norm": 0.17918241141516136, "learning_rate": 0.00013316953316953317, "loss": 0.0584, "step": 271 }, { "epoch": 0.2007380073800738, "grad_norm": 0.4035183367809682, "learning_rate": 0.00013366093366093366, "loss": 0.1191, "step": 272 }, { "epoch": 0.2014760147601476, "grad_norm": 0.3383039943989612, "learning_rate": 0.00013415233415233418, "loss": 0.1153, "step": 273 }, { "epoch": 0.2022140221402214, "grad_norm": 0.22074582252635105, "learning_rate": 0.00013464373464373464, "loss": 0.0684, "step": 274 }, { "epoch": 0.2029520295202952, "grad_norm": 0.17946898364394795, "learning_rate": 0.00013513513513513514, "loss": 0.0466, "step": 275 }, { "epoch": 0.203690036900369, "grad_norm": 0.14738376450644688, "learning_rate": 0.00013562653562653563, "loss": 0.035, "step": 276 }, { "epoch": 0.2044280442804428, "grad_norm": 0.3966425525795394, "learning_rate": 0.00013611793611793612, "loss": 0.101, "step": 277 }, { "epoch": 0.2051660516605166, "grad_norm": 0.33732973638667346, "learning_rate": 0.00013660933660933662, "loss": 0.0451, "step": 278 }, { "epoch": 0.2059040590405904, "grad_norm": 0.3708698141622069, "learning_rate": 0.0001371007371007371, "loss": 0.0962, "step": 279 }, { "epoch": 0.2066420664206642, "grad_norm": 0.3104690569184297, "learning_rate": 0.0001375921375921376, "loss": 0.0577, "step": 280 }, { "epoch": 0.207380073800738, "grad_norm": 0.24440173268542303, "learning_rate": 0.0001380835380835381, "loss": 0.089, "step": 281 }, { "epoch": 0.20811808118081182, "grad_norm": 0.2471343946174232, "learning_rate": 0.00013857493857493859, "loss": 0.0631, "step": 282 }, { "epoch": 0.2088560885608856, "grad_norm": 0.21719240956707214, "learning_rate": 0.00013906633906633908, "loss": 0.0598, "step": 283 }, { "epoch": 0.2095940959409594, "grad_norm": 0.654029396135032, "learning_rate": 0.00013955773955773957, "loss": 0.1936, "step": 284 }, { "epoch": 0.21033210332103322, "grad_norm": 0.13640107683623348, "learning_rate": 0.00014004914004914004, "loss": 0.0459, "step": 285 }, { "epoch": 0.211070110701107, "grad_norm": 0.28526318560586555, "learning_rate": 0.00014054054054054056, "loss": 0.0783, "step": 286 }, { "epoch": 0.2118081180811808, "grad_norm": 0.14195451035202797, "learning_rate": 0.00014103194103194105, "loss": 0.0293, "step": 287 }, { "epoch": 0.21254612546125462, "grad_norm": 0.376323375805187, "learning_rate": 0.00014152334152334152, "loss": 0.0888, "step": 288 }, { "epoch": 0.2132841328413284, "grad_norm": 0.1582894783703348, "learning_rate": 0.00014201474201474203, "loss": 0.04, "step": 289 }, { "epoch": 0.2140221402214022, "grad_norm": 0.14560442173323204, "learning_rate": 0.00014250614250614253, "loss": 0.048, "step": 290 }, { "epoch": 0.21476014760147602, "grad_norm": 0.19304507858483194, "learning_rate": 0.000142997542997543, "loss": 0.0663, "step": 291 }, { "epoch": 0.2154981549815498, "grad_norm": 0.13162252123947515, "learning_rate": 0.0001434889434889435, "loss": 0.0374, "step": 292 }, { "epoch": 0.21623616236162363, "grad_norm": 0.1567243866302455, "learning_rate": 0.00014398034398034398, "loss": 0.0515, "step": 293 }, { "epoch": 0.21697416974169742, "grad_norm": 0.34747364140831566, "learning_rate": 0.00014447174447174447, "loss": 0.0522, "step": 294 }, { "epoch": 0.2177121771217712, "grad_norm": 0.14759834577718983, "learning_rate": 0.000144963144963145, "loss": 0.0546, "step": 295 }, { "epoch": 0.21845018450184503, "grad_norm": 0.14323382781274852, "learning_rate": 0.00014545454545454546, "loss": 0.0531, "step": 296 }, { "epoch": 0.21918819188191882, "grad_norm": 0.26887606475276266, "learning_rate": 0.00014594594594594595, "loss": 0.0481, "step": 297 }, { "epoch": 0.2199261992619926, "grad_norm": 0.21462051329950202, "learning_rate": 0.00014643734643734644, "loss": 0.0796, "step": 298 }, { "epoch": 0.22066420664206643, "grad_norm": 0.263616307912286, "learning_rate": 0.00014692874692874693, "loss": 0.1399, "step": 299 }, { "epoch": 0.22140221402214022, "grad_norm": 0.13384833172583954, "learning_rate": 0.00014742014742014743, "loss": 0.0675, "step": 300 }, { "epoch": 0.222140221402214, "grad_norm": 0.38989800034666083, "learning_rate": 0.00014791154791154792, "loss": 0.0737, "step": 301 }, { "epoch": 0.22287822878228783, "grad_norm": 0.5570233482826172, "learning_rate": 0.0001484029484029484, "loss": 0.068, "step": 302 }, { "epoch": 0.22361623616236162, "grad_norm": 0.24803308710722477, "learning_rate": 0.0001488943488943489, "loss": 0.0811, "step": 303 }, { "epoch": 0.2243542435424354, "grad_norm": 0.23416440384643417, "learning_rate": 0.0001493857493857494, "loss": 0.1007, "step": 304 }, { "epoch": 0.22509225092250923, "grad_norm": 0.23946640125560953, "learning_rate": 0.0001498771498771499, "loss": 0.0514, "step": 305 }, { "epoch": 0.22583025830258302, "grad_norm": 0.24220656951343952, "learning_rate": 0.00015036855036855038, "loss": 0.0593, "step": 306 }, { "epoch": 0.22656826568265684, "grad_norm": 0.46585970787009173, "learning_rate": 0.00015085995085995085, "loss": 0.0637, "step": 307 }, { "epoch": 0.22730627306273063, "grad_norm": 0.17468634234265576, "learning_rate": 0.00015135135135135137, "loss": 0.0457, "step": 308 }, { "epoch": 0.22804428044280442, "grad_norm": 0.39504863068047824, "learning_rate": 0.00015184275184275186, "loss": 0.0699, "step": 309 }, { "epoch": 0.22878228782287824, "grad_norm": 0.408708506924362, "learning_rate": 0.00015233415233415233, "loss": 0.0808, "step": 310 }, { "epoch": 0.22952029520295203, "grad_norm": 0.2862894055237847, "learning_rate": 0.00015282555282555285, "loss": 0.037, "step": 311 }, { "epoch": 0.23025830258302582, "grad_norm": 0.1372937193082202, "learning_rate": 0.00015331695331695334, "loss": 0.0421, "step": 312 }, { "epoch": 0.23099630996309964, "grad_norm": 0.3235344198047746, "learning_rate": 0.0001538083538083538, "loss": 0.0524, "step": 313 }, { "epoch": 0.23173431734317343, "grad_norm": 0.17504497226388668, "learning_rate": 0.00015429975429975432, "loss": 0.0421, "step": 314 }, { "epoch": 0.23247232472324722, "grad_norm": 0.2762059844666907, "learning_rate": 0.0001547911547911548, "loss": 0.0772, "step": 315 }, { "epoch": 0.23321033210332104, "grad_norm": 0.3532312976412736, "learning_rate": 0.00015528255528255528, "loss": 0.0899, "step": 316 }, { "epoch": 0.23394833948339483, "grad_norm": 0.17200516260890217, "learning_rate": 0.00015577395577395578, "loss": 0.0417, "step": 317 }, { "epoch": 0.23468634686346865, "grad_norm": 0.16206463773010152, "learning_rate": 0.00015626535626535627, "loss": 0.0623, "step": 318 }, { "epoch": 0.23542435424354244, "grad_norm": 0.3356879527814838, "learning_rate": 0.00015675675675675676, "loss": 0.1209, "step": 319 }, { "epoch": 0.23616236162361623, "grad_norm": 0.3114734661536784, "learning_rate": 0.00015724815724815725, "loss": 0.069, "step": 320 }, { "epoch": 0.23690036900369005, "grad_norm": 0.08704347733121312, "learning_rate": 0.00015773955773955775, "loss": 0.0241, "step": 321 }, { "epoch": 0.23763837638376384, "grad_norm": 0.22450873765282728, "learning_rate": 0.00015823095823095824, "loss": 0.0809, "step": 322 }, { "epoch": 0.23837638376383763, "grad_norm": 0.47323787117354493, "learning_rate": 0.00015872235872235873, "loss": 0.1279, "step": 323 }, { "epoch": 0.23911439114391145, "grad_norm": 0.1455019745587109, "learning_rate": 0.00015921375921375922, "loss": 0.0386, "step": 324 }, { "epoch": 0.23985239852398524, "grad_norm": 0.10685450644758768, "learning_rate": 0.00015970515970515972, "loss": 0.0294, "step": 325 }, { "epoch": 0.24059040590405903, "grad_norm": 0.21050067229985542, "learning_rate": 0.0001601965601965602, "loss": 0.0646, "step": 326 }, { "epoch": 0.24132841328413285, "grad_norm": 0.28510219141599047, "learning_rate": 0.0001606879606879607, "loss": 0.0713, "step": 327 }, { "epoch": 0.24206642066420664, "grad_norm": 0.2853576870298494, "learning_rate": 0.0001611793611793612, "loss": 0.0599, "step": 328 }, { "epoch": 0.24280442804428043, "grad_norm": 0.3606971233964612, "learning_rate": 0.00016167076167076166, "loss": 0.0909, "step": 329 }, { "epoch": 0.24354243542435425, "grad_norm": 0.23385743908884243, "learning_rate": 0.00016216216216216218, "loss": 0.0653, "step": 330 }, { "epoch": 0.24428044280442804, "grad_norm": 0.40058205612996917, "learning_rate": 0.00016265356265356267, "loss": 0.0781, "step": 331 }, { "epoch": 0.24501845018450186, "grad_norm": 0.38462848266506944, "learning_rate": 0.00016314496314496314, "loss": 0.1017, "step": 332 }, { "epoch": 0.24575645756457565, "grad_norm": 0.2768201636209361, "learning_rate": 0.00016363636363636366, "loss": 0.059, "step": 333 }, { "epoch": 0.24649446494464944, "grad_norm": 0.4657215028088312, "learning_rate": 0.00016412776412776415, "loss": 0.0848, "step": 334 }, { "epoch": 0.24723247232472326, "grad_norm": 0.17983079797331944, "learning_rate": 0.00016461916461916462, "loss": 0.0586, "step": 335 }, { "epoch": 0.24797047970479705, "grad_norm": 0.11358579856222634, "learning_rate": 0.0001651105651105651, "loss": 0.0273, "step": 336 }, { "epoch": 0.24870848708487084, "grad_norm": 0.1679593200398307, "learning_rate": 0.0001656019656019656, "loss": 0.0393, "step": 337 }, { "epoch": 0.24944649446494466, "grad_norm": 0.1336506804819816, "learning_rate": 0.0001660933660933661, "loss": 0.034, "step": 338 }, { "epoch": 0.25018450184501845, "grad_norm": 0.13297552517059452, "learning_rate": 0.0001665847665847666, "loss": 0.0314, "step": 339 }, { "epoch": 0.25092250922509224, "grad_norm": 0.16671850110876388, "learning_rate": 0.00016707616707616708, "loss": 0.0419, "step": 340 }, { "epoch": 0.25166051660516603, "grad_norm": 0.2570920854994893, "learning_rate": 0.00016756756756756757, "loss": 0.0576, "step": 341 }, { "epoch": 0.2523985239852399, "grad_norm": 0.2220983256059624, "learning_rate": 0.00016805896805896807, "loss": 0.0363, "step": 342 }, { "epoch": 0.25313653136531367, "grad_norm": 0.14993425991031478, "learning_rate": 0.00016855036855036856, "loss": 0.0347, "step": 343 }, { "epoch": 0.25387453874538746, "grad_norm": 0.3101243645401874, "learning_rate": 0.00016904176904176905, "loss": 0.0322, "step": 344 }, { "epoch": 0.25461254612546125, "grad_norm": 0.2621022983317749, "learning_rate": 0.00016953316953316954, "loss": 0.0731, "step": 345 }, { "epoch": 0.25535055350553504, "grad_norm": 0.31894148734371874, "learning_rate": 0.00017002457002457004, "loss": 0.0831, "step": 346 }, { "epoch": 0.25608856088560883, "grad_norm": 0.1626551489941373, "learning_rate": 0.00017051597051597053, "loss": 0.0375, "step": 347 }, { "epoch": 0.2568265682656827, "grad_norm": 0.36738924327386685, "learning_rate": 0.00017100737100737102, "loss": 0.121, "step": 348 }, { "epoch": 0.25756457564575647, "grad_norm": 0.28277257504117936, "learning_rate": 0.00017149877149877151, "loss": 0.0355, "step": 349 }, { "epoch": 0.25830258302583026, "grad_norm": 0.23722268195704624, "learning_rate": 0.000171990171990172, "loss": 0.0555, "step": 350 }, { "epoch": 0.25904059040590405, "grad_norm": 0.34520915894428694, "learning_rate": 0.00017248157248157247, "loss": 0.1235, "step": 351 }, { "epoch": 0.25977859778597784, "grad_norm": 0.22906947189092303, "learning_rate": 0.000172972972972973, "loss": 0.0569, "step": 352 }, { "epoch": 0.2605166051660517, "grad_norm": 0.08323091698243222, "learning_rate": 0.00017346437346437349, "loss": 0.023, "step": 353 }, { "epoch": 0.2612546125461255, "grad_norm": 0.25624710955724067, "learning_rate": 0.00017395577395577395, "loss": 0.1681, "step": 354 }, { "epoch": 0.26199261992619927, "grad_norm": 0.36869292184846253, "learning_rate": 0.00017444717444717447, "loss": 0.0888, "step": 355 }, { "epoch": 0.26273062730627306, "grad_norm": 0.4458092949665933, "learning_rate": 0.00017493857493857496, "loss": 0.0778, "step": 356 }, { "epoch": 0.26346863468634685, "grad_norm": 0.18851504810200903, "learning_rate": 0.00017542997542997543, "loss": 0.0272, "step": 357 }, { "epoch": 0.26420664206642064, "grad_norm": 0.11204628972017197, "learning_rate": 0.00017592137592137592, "loss": 0.0323, "step": 358 }, { "epoch": 0.2649446494464945, "grad_norm": 0.2843621066860922, "learning_rate": 0.00017641277641277641, "loss": 0.0424, "step": 359 }, { "epoch": 0.2656826568265683, "grad_norm": 0.3846606458470074, "learning_rate": 0.0001769041769041769, "loss": 0.0854, "step": 360 }, { "epoch": 0.26642066420664207, "grad_norm": 0.20535403714917114, "learning_rate": 0.0001773955773955774, "loss": 0.0514, "step": 361 }, { "epoch": 0.26715867158671586, "grad_norm": 0.31165432965490936, "learning_rate": 0.0001778869778869779, "loss": 0.0693, "step": 362 }, { "epoch": 0.26789667896678965, "grad_norm": 0.5327625278915684, "learning_rate": 0.00017837837837837839, "loss": 0.1345, "step": 363 }, { "epoch": 0.2686346863468635, "grad_norm": 0.23475462981506276, "learning_rate": 0.00017886977886977888, "loss": 0.0629, "step": 364 }, { "epoch": 0.2693726937269373, "grad_norm": 0.2308579498551145, "learning_rate": 0.00017936117936117937, "loss": 0.0566, "step": 365 }, { "epoch": 0.2701107011070111, "grad_norm": 0.24935565709294677, "learning_rate": 0.00017985257985257986, "loss": 0.0547, "step": 366 }, { "epoch": 0.27084870848708487, "grad_norm": 0.21717912237126022, "learning_rate": 0.00018034398034398036, "loss": 0.0412, "step": 367 }, { "epoch": 0.27158671586715866, "grad_norm": 0.2752554790203395, "learning_rate": 0.00018083538083538085, "loss": 0.0414, "step": 368 }, { "epoch": 0.27232472324723245, "grad_norm": 0.22081724821397983, "learning_rate": 0.00018132678132678134, "loss": 0.0523, "step": 369 }, { "epoch": 0.2730627306273063, "grad_norm": 0.2612024283289758, "learning_rate": 0.00018181818181818183, "loss": 0.0512, "step": 370 }, { "epoch": 0.2738007380073801, "grad_norm": 0.6591349376538295, "learning_rate": 0.00018230958230958233, "loss": 0.0589, "step": 371 }, { "epoch": 0.2745387453874539, "grad_norm": 0.29280923904771616, "learning_rate": 0.00018280098280098282, "loss": 0.0666, "step": 372 }, { "epoch": 0.27527675276752767, "grad_norm": 0.1436690014485436, "learning_rate": 0.00018329238329238329, "loss": 0.0389, "step": 373 }, { "epoch": 0.27601476014760146, "grad_norm": 0.21660001607434123, "learning_rate": 0.0001837837837837838, "loss": 0.0689, "step": 374 }, { "epoch": 0.2767527675276753, "grad_norm": 0.27459546850868316, "learning_rate": 0.0001842751842751843, "loss": 0.0698, "step": 375 }, { "epoch": 0.2774907749077491, "grad_norm": 0.3128655105470955, "learning_rate": 0.00018476658476658476, "loss": 0.0997, "step": 376 }, { "epoch": 0.2782287822878229, "grad_norm": 0.246841472042025, "learning_rate": 0.00018525798525798526, "loss": 0.0687, "step": 377 }, { "epoch": 0.2789667896678967, "grad_norm": 0.15548235872144173, "learning_rate": 0.00018574938574938578, "loss": 0.0493, "step": 378 }, { "epoch": 0.27970479704797047, "grad_norm": 0.2407289631923915, "learning_rate": 0.00018624078624078624, "loss": 0.0449, "step": 379 }, { "epoch": 0.28044280442804426, "grad_norm": 0.4626599855221367, "learning_rate": 0.00018673218673218673, "loss": 0.06, "step": 380 }, { "epoch": 0.2811808118081181, "grad_norm": 0.20471963319282488, "learning_rate": 0.00018722358722358723, "loss": 0.051, "step": 381 }, { "epoch": 0.2819188191881919, "grad_norm": 0.32806700787010257, "learning_rate": 0.00018771498771498772, "loss": 0.0452, "step": 382 }, { "epoch": 0.2826568265682657, "grad_norm": 0.665201228598982, "learning_rate": 0.0001882063882063882, "loss": 0.1909, "step": 383 }, { "epoch": 0.2833948339483395, "grad_norm": 0.32508884208582556, "learning_rate": 0.0001886977886977887, "loss": 0.0698, "step": 384 }, { "epoch": 0.28413284132841327, "grad_norm": 0.36447067440139885, "learning_rate": 0.0001891891891891892, "loss": 0.098, "step": 385 }, { "epoch": 0.2848708487084871, "grad_norm": 0.1536715020049387, "learning_rate": 0.0001896805896805897, "loss": 0.0446, "step": 386 }, { "epoch": 0.2856088560885609, "grad_norm": 0.37414580946201786, "learning_rate": 0.00019017199017199018, "loss": 0.073, "step": 387 }, { "epoch": 0.2863468634686347, "grad_norm": 0.1289900495957867, "learning_rate": 0.00019066339066339068, "loss": 0.0394, "step": 388 }, { "epoch": 0.2870848708487085, "grad_norm": 0.3773700093363717, "learning_rate": 0.00019115479115479117, "loss": 0.121, "step": 389 }, { "epoch": 0.2878228782287823, "grad_norm": 0.5602462471392217, "learning_rate": 0.00019164619164619166, "loss": 0.1772, "step": 390 }, { "epoch": 0.28856088560885607, "grad_norm": 0.18256593296254037, "learning_rate": 0.00019213759213759215, "loss": 0.0654, "step": 391 }, { "epoch": 0.2892988929889299, "grad_norm": 0.17118056359207673, "learning_rate": 0.00019262899262899262, "loss": 0.0509, "step": 392 }, { "epoch": 0.2900369003690037, "grad_norm": 1.151241132854487, "learning_rate": 0.00019312039312039314, "loss": 0.1786, "step": 393 }, { "epoch": 0.2907749077490775, "grad_norm": 0.14430412960247463, "learning_rate": 0.00019361179361179363, "loss": 0.0542, "step": 394 }, { "epoch": 0.2915129151291513, "grad_norm": 0.15400555240720634, "learning_rate": 0.0001941031941031941, "loss": 0.053, "step": 395 }, { "epoch": 0.2922509225092251, "grad_norm": 0.3718285815933612, "learning_rate": 0.00019459459459459462, "loss": 0.1132, "step": 396 }, { "epoch": 0.29298892988929887, "grad_norm": 0.40572438667258687, "learning_rate": 0.0001950859950859951, "loss": 0.0853, "step": 397 }, { "epoch": 0.2937269372693727, "grad_norm": 0.3714783200546804, "learning_rate": 0.00019557739557739558, "loss": 0.0999, "step": 398 }, { "epoch": 0.2944649446494465, "grad_norm": 0.30125395336793637, "learning_rate": 0.00019606879606879607, "loss": 0.0549, "step": 399 }, { "epoch": 0.2952029520295203, "grad_norm": 0.20185858413315486, "learning_rate": 0.0001965601965601966, "loss": 0.0575, "step": 400 }, { "epoch": 0.2959409594095941, "grad_norm": 0.6290362129822138, "learning_rate": 0.00019705159705159705, "loss": 0.0814, "step": 401 }, { "epoch": 0.2966789667896679, "grad_norm": 0.4000744735919927, "learning_rate": 0.00019754299754299755, "loss": 0.0897, "step": 402 }, { "epoch": 0.2974169741697417, "grad_norm": 0.49371323687831636, "learning_rate": 0.00019803439803439804, "loss": 0.1109, "step": 403 }, { "epoch": 0.2981549815498155, "grad_norm": 0.47091231566605846, "learning_rate": 0.00019852579852579853, "loss": 0.1837, "step": 404 }, { "epoch": 0.2988929889298893, "grad_norm": 0.4203849742496023, "learning_rate": 0.00019901719901719902, "loss": 0.0845, "step": 405 }, { "epoch": 0.2996309963099631, "grad_norm": 0.3711107784086492, "learning_rate": 0.00019950859950859952, "loss": 0.133, "step": 406 }, { "epoch": 0.3003690036900369, "grad_norm": 0.3463538928780147, "learning_rate": 0.0002, "loss": 0.0922, "step": 407 }, { "epoch": 0.3011070110701107, "grad_norm": 0.1741385202266641, "learning_rate": 0.0001999999631207296, "loss": 0.0378, "step": 408 }, { "epoch": 0.3018450184501845, "grad_norm": 0.3638395058485124, "learning_rate": 0.00019999985248294558, "loss": 0.0828, "step": 409 }, { "epoch": 0.3025830258302583, "grad_norm": 0.48642573575549886, "learning_rate": 0.00019999966808672951, "loss": 0.1076, "step": 410 }, { "epoch": 0.3033210332103321, "grad_norm": 0.19793560623930428, "learning_rate": 0.00019999940993221745, "loss": 0.0381, "step": 411 }, { "epoch": 0.3040590405904059, "grad_norm": 0.15874712739405938, "learning_rate": 0.0001999990780195998, "loss": 0.0383, "step": 412 }, { "epoch": 0.3047970479704797, "grad_norm": 0.19341557350271632, "learning_rate": 0.00019999867234912134, "loss": 0.0527, "step": 413 }, { "epoch": 0.30553505535055353, "grad_norm": 0.1890531588971373, "learning_rate": 0.00019999819292108135, "loss": 0.05, "step": 414 }, { "epoch": 0.3062730627306273, "grad_norm": 0.37416977783796185, "learning_rate": 0.00019999763973583342, "loss": 0.1096, "step": 415 }, { "epoch": 0.3070110701107011, "grad_norm": 0.30246554396565384, "learning_rate": 0.00019999701279378552, "loss": 0.0895, "step": 416 }, { "epoch": 0.3077490774907749, "grad_norm": 0.422620792051147, "learning_rate": 0.00019999631209540012, "loss": 0.0614, "step": 417 }, { "epoch": 0.3084870848708487, "grad_norm": 0.3475728914003585, "learning_rate": 0.00019999553764119408, "loss": 0.0783, "step": 418 }, { "epoch": 0.3092250922509225, "grad_norm": 0.2259572380331385, "learning_rate": 0.00019999468943173856, "loss": 0.0795, "step": 419 }, { "epoch": 0.30996309963099633, "grad_norm": 0.326266735559831, "learning_rate": 0.0001999937674676592, "loss": 0.0999, "step": 420 }, { "epoch": 0.3107011070110701, "grad_norm": 0.17351671548016484, "learning_rate": 0.00019999277174963606, "loss": 0.041, "step": 421 }, { "epoch": 0.3114391143911439, "grad_norm": 0.3475822168268343, "learning_rate": 0.00019999170227840357, "loss": 0.1193, "step": 422 }, { "epoch": 0.3121771217712177, "grad_norm": 0.16337554188937317, "learning_rate": 0.00019999055905475053, "loss": 0.0503, "step": 423 }, { "epoch": 0.3129151291512915, "grad_norm": 0.27656885703350315, "learning_rate": 0.00019998934207952015, "loss": 0.0656, "step": 424 }, { "epoch": 0.31365313653136534, "grad_norm": 0.1973316488469492, "learning_rate": 0.00019998805135361007, "loss": 0.0389, "step": 425 }, { "epoch": 0.31439114391143913, "grad_norm": 0.2427508323067013, "learning_rate": 0.00019998668687797234, "loss": 0.0663, "step": 426 }, { "epoch": 0.3151291512915129, "grad_norm": 0.40937481265916476, "learning_rate": 0.00019998524865361331, "loss": 0.0696, "step": 427 }, { "epoch": 0.3158671586715867, "grad_norm": 0.262553807363941, "learning_rate": 0.0001999837366815939, "loss": 0.0481, "step": 428 }, { "epoch": 0.3166051660516605, "grad_norm": 0.2490916760973363, "learning_rate": 0.00019998215096302918, "loss": 0.0496, "step": 429 }, { "epoch": 0.3173431734317343, "grad_norm": 0.295713288222171, "learning_rate": 0.00019998049149908887, "loss": 0.063, "step": 430 }, { "epoch": 0.31808118081180814, "grad_norm": 0.1782213662877202, "learning_rate": 0.00019997875829099693, "loss": 0.0406, "step": 431 }, { "epoch": 0.31881918819188193, "grad_norm": 0.2250501421937674, "learning_rate": 0.00019997695134003172, "loss": 0.0655, "step": 432 }, { "epoch": 0.3195571955719557, "grad_norm": 0.18686561820006758, "learning_rate": 0.00019997507064752602, "loss": 0.045, "step": 433 }, { "epoch": 0.3202952029520295, "grad_norm": 0.28997313132829167, "learning_rate": 0.00019997311621486707, "loss": 0.0721, "step": 434 }, { "epoch": 0.3210332103321033, "grad_norm": 0.3027674355380553, "learning_rate": 0.00019997108804349636, "loss": 0.1145, "step": 435 }, { "epoch": 0.32177121771217715, "grad_norm": 0.4398112710843622, "learning_rate": 0.0001999689861349099, "loss": 0.0673, "step": 436 }, { "epoch": 0.32250922509225094, "grad_norm": 0.18333897428720417, "learning_rate": 0.00019996681049065792, "loss": 0.0476, "step": 437 }, { "epoch": 0.32324723247232473, "grad_norm": 0.28292539334558725, "learning_rate": 0.00019996456111234527, "loss": 0.0649, "step": 438 }, { "epoch": 0.3239852398523985, "grad_norm": 0.3296439040221405, "learning_rate": 0.000199962238001631, "loss": 0.0761, "step": 439 }, { "epoch": 0.3247232472324723, "grad_norm": 0.2589304992192621, "learning_rate": 0.0001999598411602286, "loss": 0.0565, "step": 440 }, { "epoch": 0.3254612546125461, "grad_norm": 0.1694826269263915, "learning_rate": 0.00019995737058990591, "loss": 0.0378, "step": 441 }, { "epoch": 0.32619926199261995, "grad_norm": 0.3230729397765286, "learning_rate": 0.0001999548262924853, "loss": 0.0844, "step": 442 }, { "epoch": 0.32693726937269374, "grad_norm": 0.1582764694204351, "learning_rate": 0.00019995220826984328, "loss": 0.0317, "step": 443 }, { "epoch": 0.32767527675276753, "grad_norm": 0.39627151753787665, "learning_rate": 0.00019994951652391093, "loss": 0.1253, "step": 444 }, { "epoch": 0.3284132841328413, "grad_norm": 0.2385258782551192, "learning_rate": 0.00019994675105667367, "loss": 0.0679, "step": 445 }, { "epoch": 0.3291512915129151, "grad_norm": 0.34722770548547477, "learning_rate": 0.00019994391187017118, "loss": 0.1425, "step": 446 }, { "epoch": 0.3298892988929889, "grad_norm": 0.37824051329009983, "learning_rate": 0.00019994099896649767, "loss": 0.058, "step": 447 }, { "epoch": 0.33062730627306275, "grad_norm": 0.24473148779895082, "learning_rate": 0.00019993801234780166, "loss": 0.0392, "step": 448 }, { "epoch": 0.33136531365313654, "grad_norm": 0.15154374792771624, "learning_rate": 0.00019993495201628598, "loss": 0.0409, "step": 449 }, { "epoch": 0.33210332103321033, "grad_norm": 0.618070771803953, "learning_rate": 0.00019993181797420796, "loss": 0.0828, "step": 450 }, { "epoch": 0.3328413284132841, "grad_norm": 0.2027195118249247, "learning_rate": 0.00019992861022387915, "loss": 0.0596, "step": 451 }, { "epoch": 0.3335793357933579, "grad_norm": 0.17603190540262825, "learning_rate": 0.0001999253287676656, "loss": 0.0439, "step": 452 }, { "epoch": 0.33431734317343176, "grad_norm": 0.37538178558767255, "learning_rate": 0.00019992197360798762, "loss": 0.0546, "step": 453 }, { "epoch": 0.33505535055350555, "grad_norm": 0.8146821073306602, "learning_rate": 0.00019991854474731992, "loss": 0.1835, "step": 454 }, { "epoch": 0.33579335793357934, "grad_norm": 0.18285025963236579, "learning_rate": 0.00019991504218819166, "loss": 0.0446, "step": 455 }, { "epoch": 0.33653136531365313, "grad_norm": 0.19060801616985304, "learning_rate": 0.00019991146593318618, "loss": 0.0241, "step": 456 }, { "epoch": 0.3372693726937269, "grad_norm": 0.1313935718086762, "learning_rate": 0.00019990781598494133, "loss": 0.0307, "step": 457 }, { "epoch": 0.3380073800738007, "grad_norm": 0.3693668120395353, "learning_rate": 0.00019990409234614924, "loss": 0.0927, "step": 458 }, { "epoch": 0.33874538745387456, "grad_norm": 0.1417509057722203, "learning_rate": 0.0001999002950195564, "loss": 0.0487, "step": 459 }, { "epoch": 0.33948339483394835, "grad_norm": 0.14710555617468116, "learning_rate": 0.0001998964240079637, "loss": 0.0375, "step": 460 }, { "epoch": 0.34022140221402214, "grad_norm": 0.20886603637088205, "learning_rate": 0.0001998924793142263, "loss": 0.0469, "step": 461 }, { "epoch": 0.34095940959409593, "grad_norm": 0.40361932957735835, "learning_rate": 0.00019988846094125376, "loss": 0.1177, "step": 462 }, { "epoch": 0.3416974169741697, "grad_norm": 0.23475431413336076, "learning_rate": 0.00019988436889201, "loss": 0.0627, "step": 463 }, { "epoch": 0.34243542435424357, "grad_norm": 0.3364298375178266, "learning_rate": 0.0001998802031695132, "loss": 0.0871, "step": 464 }, { "epoch": 0.34317343173431736, "grad_norm": 0.3478714580930183, "learning_rate": 0.00019987596377683603, "loss": 0.1195, "step": 465 }, { "epoch": 0.34391143911439115, "grad_norm": 0.34031033164898217, "learning_rate": 0.00019987165071710527, "loss": 0.1215, "step": 466 }, { "epoch": 0.34464944649446494, "grad_norm": 0.18554267638340302, "learning_rate": 0.0001998672639935023, "loss": 0.0542, "step": 467 }, { "epoch": 0.34538745387453873, "grad_norm": 0.22458068808003603, "learning_rate": 0.00019986280360926264, "loss": 0.0504, "step": 468 }, { "epoch": 0.3461254612546125, "grad_norm": 0.2235537876421932, "learning_rate": 0.0001998582695676762, "loss": 0.0529, "step": 469 }, { "epoch": 0.34686346863468637, "grad_norm": 0.2820236237250365, "learning_rate": 0.00019985366187208725, "loss": 0.0806, "step": 470 }, { "epoch": 0.34760147601476016, "grad_norm": 0.45975646249354596, "learning_rate": 0.00019984898052589434, "loss": 0.0884, "step": 471 }, { "epoch": 0.34833948339483395, "grad_norm": 0.6948424714841445, "learning_rate": 0.00019984422553255036, "loss": 0.0457, "step": 472 }, { "epoch": 0.34907749077490774, "grad_norm": 0.24768533368222384, "learning_rate": 0.00019983939689556253, "loss": 0.0607, "step": 473 }, { "epoch": 0.34981549815498153, "grad_norm": 0.37250730602203475, "learning_rate": 0.0001998344946184924, "loss": 0.0695, "step": 474 }, { "epoch": 0.3505535055350554, "grad_norm": 0.3678324684961192, "learning_rate": 0.00019982951870495578, "loss": 0.073, "step": 475 }, { "epoch": 0.35129151291512917, "grad_norm": 0.39589330461748773, "learning_rate": 0.00019982446915862284, "loss": 0.0726, "step": 476 }, { "epoch": 0.35202952029520296, "grad_norm": 0.34407024844289313, "learning_rate": 0.0001998193459832181, "loss": 0.0688, "step": 477 }, { "epoch": 0.35276752767527675, "grad_norm": 0.24562431329977347, "learning_rate": 0.0001998141491825203, "loss": 0.0545, "step": 478 }, { "epoch": 0.35350553505535054, "grad_norm": 0.4863647729051156, "learning_rate": 0.00019980887876036251, "loss": 0.0657, "step": 479 }, { "epoch": 0.35424354243542433, "grad_norm": 0.27467459353812435, "learning_rate": 0.00019980353472063216, "loss": 0.0984, "step": 480 }, { "epoch": 0.3549815498154982, "grad_norm": 0.40225160448835373, "learning_rate": 0.00019979811706727086, "loss": 0.0559, "step": 481 }, { "epoch": 0.35571955719557197, "grad_norm": 0.268956461553656, "learning_rate": 0.00019979262580427468, "loss": 0.0536, "step": 482 }, { "epoch": 0.35645756457564576, "grad_norm": 0.3817132532017439, "learning_rate": 0.00019978706093569387, "loss": 0.1197, "step": 483 }, { "epoch": 0.35719557195571955, "grad_norm": 0.19984362721375526, "learning_rate": 0.00019978142246563296, "loss": 0.049, "step": 484 }, { "epoch": 0.35793357933579334, "grad_norm": 0.20739811473236996, "learning_rate": 0.00019977571039825085, "loss": 0.0608, "step": 485 }, { "epoch": 0.3586715867158672, "grad_norm": 0.30580392449918353, "learning_rate": 0.00019976992473776063, "loss": 0.0597, "step": 486 }, { "epoch": 0.359409594095941, "grad_norm": 0.3375213454043639, "learning_rate": 0.00019976406548842976, "loss": 0.1051, "step": 487 }, { "epoch": 0.36014760147601477, "grad_norm": 0.25686824536858865, "learning_rate": 0.00019975813265457991, "loss": 0.0898, "step": 488 }, { "epoch": 0.36088560885608856, "grad_norm": 0.32752516676689025, "learning_rate": 0.00019975212624058708, "loss": 0.0735, "step": 489 }, { "epoch": 0.36162361623616235, "grad_norm": 0.29246948648398247, "learning_rate": 0.00019974604625088146, "loss": 0.0911, "step": 490 }, { "epoch": 0.36236162361623614, "grad_norm": 0.44635668601051476, "learning_rate": 0.00019973989268994764, "loss": 0.092, "step": 491 }, { "epoch": 0.36309963099631, "grad_norm": 0.2181678759376621, "learning_rate": 0.0001997336655623243, "loss": 0.0517, "step": 492 }, { "epoch": 0.3638376383763838, "grad_norm": 0.26619298998858154, "learning_rate": 0.00019972736487260456, "loss": 0.0631, "step": 493 }, { "epoch": 0.36457564575645757, "grad_norm": 0.19105091508823005, "learning_rate": 0.0001997209906254357, "loss": 0.06, "step": 494 }, { "epoch": 0.36531365313653136, "grad_norm": 0.17827867563462999, "learning_rate": 0.00019971454282551924, "loss": 0.0484, "step": 495 }, { "epoch": 0.36605166051660515, "grad_norm": 0.18678225779696478, "learning_rate": 0.00019970802147761102, "loss": 0.0506, "step": 496 }, { "epoch": 0.36678966789667894, "grad_norm": 0.2902030480548603, "learning_rate": 0.0001997014265865211, "loss": 0.0517, "step": 497 }, { "epoch": 0.3675276752767528, "grad_norm": 0.528632059458026, "learning_rate": 0.00019969475815711368, "loss": 0.0737, "step": 498 }, { "epoch": 0.3682656826568266, "grad_norm": 0.5246222436015356, "learning_rate": 0.00019968801619430743, "loss": 0.0639, "step": 499 }, { "epoch": 0.36900369003690037, "grad_norm": 0.293216127153962, "learning_rate": 0.000199681200703075, "loss": 0.0801, "step": 500 }, { "epoch": 0.36974169741697416, "grad_norm": 0.5910961902260434, "learning_rate": 0.0001996743116884435, "loss": 0.1184, "step": 501 }, { "epoch": 0.37047970479704795, "grad_norm": 0.28331529000162253, "learning_rate": 0.00019966734915549412, "loss": 0.0462, "step": 502 }, { "epoch": 0.3712177121771218, "grad_norm": 0.45963896726455944, "learning_rate": 0.00019966031310936233, "loss": 0.1349, "step": 503 }, { "epoch": 0.3719557195571956, "grad_norm": 0.4847285157382847, "learning_rate": 0.0001996532035552378, "loss": 0.1036, "step": 504 }, { "epoch": 0.3726937269372694, "grad_norm": 0.22071485936521154, "learning_rate": 0.00019964602049836445, "loss": 0.0792, "step": 505 }, { "epoch": 0.37343173431734317, "grad_norm": 0.1561663732535802, "learning_rate": 0.00019963876394404038, "loss": 0.0472, "step": 506 }, { "epoch": 0.37416974169741696, "grad_norm": 0.6555296873969741, "learning_rate": 0.00019963143389761795, "loss": 0.1275, "step": 507 }, { "epoch": 0.37490774907749075, "grad_norm": 0.23419048267721024, "learning_rate": 0.00019962403036450366, "loss": 0.057, "step": 508 }, { "epoch": 0.3756457564575646, "grad_norm": 0.30507725048403694, "learning_rate": 0.00019961655335015826, "loss": 0.2112, "step": 509 }, { "epoch": 0.3763837638376384, "grad_norm": 0.17095578939628797, "learning_rate": 0.00019960900286009671, "loss": 0.0383, "step": 510 }, { "epoch": 0.3771217712177122, "grad_norm": 0.4285092162476954, "learning_rate": 0.0001996013788998881, "loss": 0.0707, "step": 511 }, { "epoch": 0.37785977859778597, "grad_norm": 0.6180063828637314, "learning_rate": 0.0001995936814751558, "loss": 0.1443, "step": 512 }, { "epoch": 0.37859778597785976, "grad_norm": 0.4150474385205031, "learning_rate": 0.00019958591059157727, "loss": 0.0874, "step": 513 }, { "epoch": 0.3793357933579336, "grad_norm": 0.2865131442747489, "learning_rate": 0.00019957806625488423, "loss": 0.0673, "step": 514 }, { "epoch": 0.3800738007380074, "grad_norm": 0.20838509223668936, "learning_rate": 0.00019957014847086252, "loss": 0.0429, "step": 515 }, { "epoch": 0.3808118081180812, "grad_norm": 0.3401983378910352, "learning_rate": 0.00019956215724535224, "loss": 0.0544, "step": 516 }, { "epoch": 0.381549815498155, "grad_norm": 0.41596021696981617, "learning_rate": 0.00019955409258424754, "loss": 0.0694, "step": 517 }, { "epoch": 0.38228782287822877, "grad_norm": 0.2581935333238041, "learning_rate": 0.00019954595449349686, "loss": 0.0661, "step": 518 }, { "epoch": 0.38302583025830256, "grad_norm": 0.6063075517320448, "learning_rate": 0.00019953774297910265, "loss": 0.1538, "step": 519 }, { "epoch": 0.3837638376383764, "grad_norm": 0.27887126438773946, "learning_rate": 0.00019952945804712166, "loss": 0.0698, "step": 520 }, { "epoch": 0.3845018450184502, "grad_norm": 0.16945513183063776, "learning_rate": 0.00019952109970366473, "loss": 0.0408, "step": 521 }, { "epoch": 0.385239852398524, "grad_norm": 0.2144460871320436, "learning_rate": 0.00019951266795489685, "loss": 0.0691, "step": 522 }, { "epoch": 0.3859778597785978, "grad_norm": 0.18874745295572906, "learning_rate": 0.00019950416280703715, "loss": 0.0498, "step": 523 }, { "epoch": 0.38671586715867157, "grad_norm": 0.22779188968100106, "learning_rate": 0.0001994955842663589, "loss": 0.0558, "step": 524 }, { "epoch": 0.3874538745387454, "grad_norm": 0.32144658818900074, "learning_rate": 0.00019948693233918952, "loss": 0.1014, "step": 525 }, { "epoch": 0.3881918819188192, "grad_norm": 0.41884230261358707, "learning_rate": 0.00019947820703191053, "loss": 0.1218, "step": 526 }, { "epoch": 0.388929889298893, "grad_norm": 0.21715336365734417, "learning_rate": 0.00019946940835095762, "loss": 0.0872, "step": 527 }, { "epoch": 0.3896678966789668, "grad_norm": 0.2977594079474515, "learning_rate": 0.00019946053630282053, "loss": 0.0688, "step": 528 }, { "epoch": 0.3904059040590406, "grad_norm": 0.14553659650916823, "learning_rate": 0.00019945159089404315, "loss": 0.0523, "step": 529 }, { "epoch": 0.39114391143911437, "grad_norm": 0.2200967488831972, "learning_rate": 0.0001994425721312235, "loss": 0.0632, "step": 530 }, { "epoch": 0.3918819188191882, "grad_norm": 0.1686741865371252, "learning_rate": 0.00019943348002101371, "loss": 0.054, "step": 531 }, { "epoch": 0.392619926199262, "grad_norm": 0.2116723465683281, "learning_rate": 0.00019942431457011997, "loss": 0.0714, "step": 532 }, { "epoch": 0.3933579335793358, "grad_norm": 0.5514808195139407, "learning_rate": 0.00019941507578530255, "loss": 0.134, "step": 533 }, { "epoch": 0.3940959409594096, "grad_norm": 0.1834701503534051, "learning_rate": 0.00019940576367337594, "loss": 0.0474, "step": 534 }, { "epoch": 0.3948339483394834, "grad_norm": 0.27413855338085946, "learning_rate": 0.0001993963782412085, "loss": 0.0895, "step": 535 }, { "epoch": 0.3955719557195572, "grad_norm": 0.2424300005107514, "learning_rate": 0.00019938691949572283, "loss": 0.0736, "step": 536 }, { "epoch": 0.396309963099631, "grad_norm": 0.2474868069309868, "learning_rate": 0.00019937738744389558, "loss": 0.061, "step": 537 }, { "epoch": 0.3970479704797048, "grad_norm": 0.22926117423936707, "learning_rate": 0.00019936778209275744, "loss": 0.0728, "step": 538 }, { "epoch": 0.3977859778597786, "grad_norm": 0.3918653857115021, "learning_rate": 0.00019935810344939321, "loss": 0.0585, "step": 539 }, { "epoch": 0.3985239852398524, "grad_norm": 0.2864783033231258, "learning_rate": 0.00019934835152094166, "loss": 0.0427, "step": 540 }, { "epoch": 0.3992619926199262, "grad_norm": 0.2138762950743913, "learning_rate": 0.00019933852631459571, "loss": 0.0427, "step": 541 }, { "epoch": 0.4, "grad_norm": 0.28920348065502705, "learning_rate": 0.00019932862783760227, "loss": 0.0877, "step": 542 }, { "epoch": 0.4007380073800738, "grad_norm": 0.22042740287560367, "learning_rate": 0.0001993186560972623, "loss": 0.036, "step": 543 }, { "epoch": 0.4014760147601476, "grad_norm": 0.4008966920747419, "learning_rate": 0.00019930861110093085, "loss": 0.079, "step": 544 }, { "epoch": 0.4022140221402214, "grad_norm": 0.26266945284340143, "learning_rate": 0.00019929849285601692, "loss": 0.0346, "step": 545 }, { "epoch": 0.4029520295202952, "grad_norm": 0.31069490965749064, "learning_rate": 0.0001992883013699836, "loss": 0.0611, "step": 546 }, { "epoch": 0.40369003690036903, "grad_norm": 0.49453117133224533, "learning_rate": 0.000199278036650348, "loss": 0.078, "step": 547 }, { "epoch": 0.4044280442804428, "grad_norm": 0.3488739831715587, "learning_rate": 0.0001992676987046812, "loss": 0.0711, "step": 548 }, { "epoch": 0.4051660516605166, "grad_norm": 0.4096354275361043, "learning_rate": 0.00019925728754060834, "loss": 0.1202, "step": 549 }, { "epoch": 0.4059040590405904, "grad_norm": 0.38337284589244874, "learning_rate": 0.00019924680316580853, "loss": 0.0458, "step": 550 }, { "epoch": 0.4066420664206642, "grad_norm": 0.28470418061626845, "learning_rate": 0.0001992362455880149, "loss": 0.0433, "step": 551 }, { "epoch": 0.407380073800738, "grad_norm": 0.27160512100506135, "learning_rate": 0.0001992256148150145, "loss": 0.0531, "step": 552 }, { "epoch": 0.40811808118081183, "grad_norm": 0.2785375064343035, "learning_rate": 0.0001992149108546485, "loss": 0.0714, "step": 553 }, { "epoch": 0.4088560885608856, "grad_norm": 0.26885289953951996, "learning_rate": 0.00019920413371481204, "loss": 0.0608, "step": 554 }, { "epoch": 0.4095940959409594, "grad_norm": 0.3465242242230134, "learning_rate": 0.00019919328340345407, "loss": 0.066, "step": 555 }, { "epoch": 0.4103321033210332, "grad_norm": 0.19221337266562458, "learning_rate": 0.00019918235992857767, "loss": 0.0376, "step": 556 }, { "epoch": 0.411070110701107, "grad_norm": 0.28469893897119897, "learning_rate": 0.00019917136329823985, "loss": 0.0616, "step": 557 }, { "epoch": 0.4118081180811808, "grad_norm": 0.167214196180655, "learning_rate": 0.00019916029352055152, "loss": 0.034, "step": 558 }, { "epoch": 0.41254612546125463, "grad_norm": 0.292175543508982, "learning_rate": 0.00019914915060367764, "loss": 0.0757, "step": 559 }, { "epoch": 0.4132841328413284, "grad_norm": 0.17932326270101864, "learning_rate": 0.00019913793455583702, "loss": 0.0597, "step": 560 }, { "epoch": 0.4140221402214022, "grad_norm": 0.29897540487194363, "learning_rate": 0.00019912664538530248, "loss": 0.0675, "step": 561 }, { "epoch": 0.414760147601476, "grad_norm": 0.2295324078282633, "learning_rate": 0.00019911528310040074, "loss": 0.0517, "step": 562 }, { "epoch": 0.4154981549815498, "grad_norm": 0.4581756386258036, "learning_rate": 0.00019910384770951243, "loss": 0.0954, "step": 563 }, { "epoch": 0.41623616236162364, "grad_norm": 0.22109982299376588, "learning_rate": 0.00019909233922107218, "loss": 0.0637, "step": 564 }, { "epoch": 0.41697416974169743, "grad_norm": 0.24504805012033415, "learning_rate": 0.0001990807576435684, "loss": 0.0461, "step": 565 }, { "epoch": 0.4177121771217712, "grad_norm": 0.27298310657818065, "learning_rate": 0.0001990691029855436, "loss": 0.0919, "step": 566 }, { "epoch": 0.418450184501845, "grad_norm": 0.26425323394248496, "learning_rate": 0.00019905737525559403, "loss": 0.0433, "step": 567 }, { "epoch": 0.4191881918819188, "grad_norm": 0.3811654169350471, "learning_rate": 0.00019904557446236986, "loss": 0.0831, "step": 568 }, { "epoch": 0.4199261992619926, "grad_norm": 0.21876015682395344, "learning_rate": 0.00019903370061457522, "loss": 0.0554, "step": 569 }, { "epoch": 0.42066420664206644, "grad_norm": 0.3690773152951223, "learning_rate": 0.00019902175372096812, "loss": 0.108, "step": 570 }, { "epoch": 0.42140221402214023, "grad_norm": 0.2181719082992289, "learning_rate": 0.00019900973379036033, "loss": 0.0458, "step": 571 }, { "epoch": 0.422140221402214, "grad_norm": 0.2275218382139671, "learning_rate": 0.00019899764083161766, "loss": 0.0463, "step": 572 }, { "epoch": 0.4228782287822878, "grad_norm": 0.25336874151225364, "learning_rate": 0.00019898547485365967, "loss": 0.0741, "step": 573 }, { "epoch": 0.4236162361623616, "grad_norm": 0.41199551294071673, "learning_rate": 0.00019897323586545978, "loss": 0.0655, "step": 574 }, { "epoch": 0.42435424354243545, "grad_norm": 0.18746333269995671, "learning_rate": 0.0001989609238760453, "loss": 0.0435, "step": 575 }, { "epoch": 0.42509225092250924, "grad_norm": 0.18225651193784123, "learning_rate": 0.00019894853889449742, "loss": 0.0469, "step": 576 }, { "epoch": 0.42583025830258303, "grad_norm": 0.40402677744725607, "learning_rate": 0.00019893608092995106, "loss": 0.0687, "step": 577 }, { "epoch": 0.4265682656826568, "grad_norm": 0.38968157807782283, "learning_rate": 0.00019892354999159507, "loss": 0.0628, "step": 578 }, { "epoch": 0.4273062730627306, "grad_norm": 0.43630462716866014, "learning_rate": 0.00019891094608867206, "loss": 0.094, "step": 579 }, { "epoch": 0.4280442804428044, "grad_norm": 0.36588088745943637, "learning_rate": 0.00019889826923047852, "loss": 0.0917, "step": 580 }, { "epoch": 0.42878228782287825, "grad_norm": 0.2053897904071956, "learning_rate": 0.00019888551942636468, "loss": 0.1021, "step": 581 }, { "epoch": 0.42952029520295204, "grad_norm": 0.2136560103516423, "learning_rate": 0.00019887269668573463, "loss": 0.0424, "step": 582 }, { "epoch": 0.43025830258302583, "grad_norm": 0.1922693540577077, "learning_rate": 0.00019885980101804623, "loss": 0.0313, "step": 583 }, { "epoch": 0.4309963099630996, "grad_norm": 0.18231047028739533, "learning_rate": 0.00019884683243281116, "loss": 0.0561, "step": 584 }, { "epoch": 0.4317343173431734, "grad_norm": 0.386182141906, "learning_rate": 0.0001988337909395948, "loss": 0.1027, "step": 585 }, { "epoch": 0.43247232472324726, "grad_norm": 0.20138528694069285, "learning_rate": 0.00019882067654801645, "loss": 0.0556, "step": 586 }, { "epoch": 0.43321033210332105, "grad_norm": 0.28039614222657827, "learning_rate": 0.000198807489267749, "loss": 0.0492, "step": 587 }, { "epoch": 0.43394833948339484, "grad_norm": 0.17627973062432725, "learning_rate": 0.0001987942291085193, "loss": 0.0387, "step": 588 }, { "epoch": 0.43468634686346863, "grad_norm": 0.24882904836394354, "learning_rate": 0.00019878089608010773, "loss": 0.0563, "step": 589 }, { "epoch": 0.4354243542435424, "grad_norm": 0.2525190390265357, "learning_rate": 0.0001987674901923486, "loss": 0.0457, "step": 590 }, { "epoch": 0.4361623616236162, "grad_norm": 0.12933847310368518, "learning_rate": 0.00019875401145512994, "loss": 0.0301, "step": 591 }, { "epoch": 0.43690036900369006, "grad_norm": 0.3805958694604493, "learning_rate": 0.0001987404598783934, "loss": 0.1061, "step": 592 }, { "epoch": 0.43763837638376385, "grad_norm": 0.22927111667905156, "learning_rate": 0.00019872683547213446, "loss": 0.0694, "step": 593 }, { "epoch": 0.43837638376383764, "grad_norm": 0.15235996404301103, "learning_rate": 0.0001987131382464023, "loss": 0.0354, "step": 594 }, { "epoch": 0.43911439114391143, "grad_norm": 0.4513249136124996, "learning_rate": 0.00019869936821129974, "loss": 0.1551, "step": 595 }, { "epoch": 0.4398523985239852, "grad_norm": 0.38097373331248247, "learning_rate": 0.00019868552537698339, "loss": 0.1117, "step": 596 }, { "epoch": 0.44059040590405907, "grad_norm": 0.26167024188986043, "learning_rate": 0.0001986716097536635, "loss": 0.0402, "step": 597 }, { "epoch": 0.44132841328413286, "grad_norm": 0.4202200359580107, "learning_rate": 0.00019865762135160407, "loss": 0.0952, "step": 598 }, { "epoch": 0.44206642066420665, "grad_norm": 0.3784548702651073, "learning_rate": 0.0001986435601811227, "loss": 0.1155, "step": 599 }, { "epoch": 0.44280442804428044, "grad_norm": 0.17061602747053525, "learning_rate": 0.00019862942625259076, "loss": 0.0452, "step": 600 }, { "epoch": 0.44354243542435423, "grad_norm": 0.1679653952401722, "learning_rate": 0.00019861521957643318, "loss": 0.0604, "step": 601 }, { "epoch": 0.444280442804428, "grad_norm": 0.16021827251862683, "learning_rate": 0.0001986009401631286, "loss": 0.0493, "step": 602 }, { "epoch": 0.44501845018450187, "grad_norm": 0.4978953821770148, "learning_rate": 0.00019858658802320933, "loss": 0.1217, "step": 603 }, { "epoch": 0.44575645756457566, "grad_norm": 0.26846035419646963, "learning_rate": 0.00019857216316726127, "loss": 0.0692, "step": 604 }, { "epoch": 0.44649446494464945, "grad_norm": 0.3988977569738387, "learning_rate": 0.000198557665605924, "loss": 0.0625, "step": 605 }, { "epoch": 0.44723247232472324, "grad_norm": 0.1270492351198551, "learning_rate": 0.00019854309534989074, "loss": 0.0319, "step": 606 }, { "epoch": 0.44797047970479703, "grad_norm": 0.20821848062931672, "learning_rate": 0.00019852845240990826, "loss": 0.0522, "step": 607 }, { "epoch": 0.4487084870848708, "grad_norm": 0.45161159367026027, "learning_rate": 0.00019851373679677695, "loss": 0.0576, "step": 608 }, { "epoch": 0.44944649446494467, "grad_norm": 0.31857737332747105, "learning_rate": 0.00019849894852135092, "loss": 0.0711, "step": 609 }, { "epoch": 0.45018450184501846, "grad_norm": 0.2493021765079013, "learning_rate": 0.00019848408759453768, "loss": 0.055, "step": 610 }, { "epoch": 0.45092250922509225, "grad_norm": 0.1716688206538959, "learning_rate": 0.00019846915402729854, "loss": 0.0349, "step": 611 }, { "epoch": 0.45166051660516604, "grad_norm": 0.339397138336281, "learning_rate": 0.00019845414783064823, "loss": 0.0752, "step": 612 }, { "epoch": 0.45239852398523983, "grad_norm": 0.5524392726810267, "learning_rate": 0.00019843906901565505, "loss": 0.0439, "step": 613 }, { "epoch": 0.4531365313653137, "grad_norm": 0.2995622629781232, "learning_rate": 0.000198423917593441, "loss": 0.0693, "step": 614 }, { "epoch": 0.45387453874538747, "grad_norm": 0.21581415556478614, "learning_rate": 0.0001984086935751815, "loss": 0.0457, "step": 615 }, { "epoch": 0.45461254612546126, "grad_norm": 0.7178645343586281, "learning_rate": 0.00019839339697210557, "loss": 0.0914, "step": 616 }, { "epoch": 0.45535055350553505, "grad_norm": 0.3582478340713005, "learning_rate": 0.00019837802779549578, "loss": 0.0694, "step": 617 }, { "epoch": 0.45608856088560884, "grad_norm": 0.2256245531580918, "learning_rate": 0.00019836258605668817, "loss": 0.0639, "step": 618 }, { "epoch": 0.45682656826568263, "grad_norm": 0.33464775526167195, "learning_rate": 0.00019834707176707243, "loss": 0.0623, "step": 619 }, { "epoch": 0.4575645756457565, "grad_norm": 0.27446923631697623, "learning_rate": 0.00019833148493809155, "loss": 0.0494, "step": 620 }, { "epoch": 0.45830258302583027, "grad_norm": 0.49091859104012797, "learning_rate": 0.00019831582558124225, "loss": 0.1157, "step": 621 }, { "epoch": 0.45904059040590406, "grad_norm": 0.2629255361295235, "learning_rate": 0.00019830009370807458, "loss": 0.0727, "step": 622 }, { "epoch": 0.45977859778597785, "grad_norm": 0.2313483567140697, "learning_rate": 0.0001982842893301922, "loss": 0.1043, "step": 623 }, { "epoch": 0.46051660516605164, "grad_norm": 0.721216500334867, "learning_rate": 0.00019826841245925212, "loss": 0.1397, "step": 624 }, { "epoch": 0.4612546125461255, "grad_norm": 0.40709927643683763, "learning_rate": 0.0001982524631069649, "loss": 0.097, "step": 625 }, { "epoch": 0.4619926199261993, "grad_norm": 0.1606370705591231, "learning_rate": 0.0001982364412850946, "loss": 0.0631, "step": 626 }, { "epoch": 0.46273062730627307, "grad_norm": 0.4379012165151906, "learning_rate": 0.00019822034700545867, "loss": 0.1153, "step": 627 }, { "epoch": 0.46346863468634686, "grad_norm": 0.10089261481159706, "learning_rate": 0.00019820418027992795, "loss": 0.0246, "step": 628 }, { "epoch": 0.46420664206642065, "grad_norm": 0.1536061328376787, "learning_rate": 0.00019818794112042685, "loss": 0.0248, "step": 629 }, { "epoch": 0.46494464944649444, "grad_norm": 0.3474531545944857, "learning_rate": 0.0001981716295389331, "loss": 0.0693, "step": 630 }, { "epoch": 0.4656826568265683, "grad_norm": 0.19426741503425382, "learning_rate": 0.00019815524554747793, "loss": 0.0534, "step": 631 }, { "epoch": 0.4664206642066421, "grad_norm": 0.29872080568592646, "learning_rate": 0.0001981387891581459, "loss": 0.0668, "step": 632 }, { "epoch": 0.46715867158671587, "grad_norm": 0.34750929759802107, "learning_rate": 0.00019812226038307498, "loss": 0.0683, "step": 633 }, { "epoch": 0.46789667896678966, "grad_norm": 0.31082564631889026, "learning_rate": 0.00019810565923445662, "loss": 0.061, "step": 634 }, { "epoch": 0.46863468634686345, "grad_norm": 0.26625665135413024, "learning_rate": 0.00019808898572453552, "loss": 0.0617, "step": 635 }, { "epoch": 0.4693726937269373, "grad_norm": 0.2547041216993461, "learning_rate": 0.0001980722398656098, "loss": 0.0535, "step": 636 }, { "epoch": 0.4701107011070111, "grad_norm": 0.3863512935002946, "learning_rate": 0.00019805542167003107, "loss": 0.1084, "step": 637 }, { "epoch": 0.4708487084870849, "grad_norm": 0.29608156690586046, "learning_rate": 0.00019803853115020408, "loss": 0.0941, "step": 638 }, { "epoch": 0.47158671586715867, "grad_norm": 0.19168368256655058, "learning_rate": 0.0001980215683185871, "loss": 0.0367, "step": 639 }, { "epoch": 0.47232472324723246, "grad_norm": 0.14698605209256782, "learning_rate": 0.00019800453318769159, "loss": 0.0596, "step": 640 }, { "epoch": 0.47306273062730625, "grad_norm": 0.3495547693244105, "learning_rate": 0.0001979874257700825, "loss": 0.0808, "step": 641 }, { "epoch": 0.4738007380073801, "grad_norm": 0.222589855135223, "learning_rate": 0.00019797024607837795, "loss": 0.0455, "step": 642 }, { "epoch": 0.4745387453874539, "grad_norm": 0.29397782205149636, "learning_rate": 0.00019795299412524945, "loss": 0.0729, "step": 643 }, { "epoch": 0.4752767527675277, "grad_norm": 0.2859039255822088, "learning_rate": 0.0001979356699234218, "loss": 0.0512, "step": 644 }, { "epoch": 0.47601476014760147, "grad_norm": 0.17692725222598482, "learning_rate": 0.0001979182734856731, "loss": 0.0381, "step": 645 }, { "epoch": 0.47675276752767526, "grad_norm": 0.1923179029330154, "learning_rate": 0.0001979008048248346, "loss": 0.0483, "step": 646 }, { "epoch": 0.4774907749077491, "grad_norm": 0.2923833468531121, "learning_rate": 0.00019788326395379108, "loss": 0.0648, "step": 647 }, { "epoch": 0.4782287822878229, "grad_norm": 0.2516869322879514, "learning_rate": 0.00019786565088548034, "loss": 0.0645, "step": 648 }, { "epoch": 0.4789667896678967, "grad_norm": 0.5315965434121672, "learning_rate": 0.00019784796563289354, "loss": 0.0752, "step": 649 }, { "epoch": 0.4797047970479705, "grad_norm": 0.40687016545653565, "learning_rate": 0.00019783020820907506, "loss": 0.0964, "step": 650 }, { "epoch": 0.48044280442804427, "grad_norm": 0.21656716002775234, "learning_rate": 0.00019781237862712253, "loss": 0.0854, "step": 651 }, { "epoch": 0.48118081180811806, "grad_norm": 0.5292267641694639, "learning_rate": 0.00019779447690018676, "loss": 0.0962, "step": 652 }, { "epoch": 0.4819188191881919, "grad_norm": 0.3451936366687803, "learning_rate": 0.00019777650304147183, "loss": 0.0923, "step": 653 }, { "epoch": 0.4826568265682657, "grad_norm": 0.3903507265627889, "learning_rate": 0.00019775845706423496, "loss": 0.0581, "step": 654 }, { "epoch": 0.4833948339483395, "grad_norm": 0.1771929541689091, "learning_rate": 0.00019774033898178667, "loss": 0.0534, "step": 655 }, { "epoch": 0.4841328413284133, "grad_norm": 0.3462091874925928, "learning_rate": 0.00019772214880749056, "loss": 0.1593, "step": 656 }, { "epoch": 0.48487084870848707, "grad_norm": 0.3201105834922449, "learning_rate": 0.00019770388655476339, "loss": 0.071, "step": 657 }, { "epoch": 0.48560885608856086, "grad_norm": 0.18421739172294385, "learning_rate": 0.00019768555223707518, "loss": 0.0551, "step": 658 }, { "epoch": 0.4863468634686347, "grad_norm": 0.1819826903341884, "learning_rate": 0.00019766714586794904, "loss": 0.0484, "step": 659 }, { "epoch": 0.4870848708487085, "grad_norm": 0.34490490024909454, "learning_rate": 0.00019764866746096129, "loss": 0.1102, "step": 660 }, { "epoch": 0.4878228782287823, "grad_norm": 0.25738587951231967, "learning_rate": 0.00019763011702974125, "loss": 0.075, "step": 661 }, { "epoch": 0.4885608856088561, "grad_norm": 0.3566980297664823, "learning_rate": 0.0001976114945879715, "loss": 0.1382, "step": 662 }, { "epoch": 0.48929889298892987, "grad_norm": 0.25675720884542486, "learning_rate": 0.00019759280014938763, "loss": 0.0648, "step": 663 }, { "epoch": 0.4900369003690037, "grad_norm": 0.29685807174819673, "learning_rate": 0.00019757403372777847, "loss": 0.0523, "step": 664 }, { "epoch": 0.4907749077490775, "grad_norm": 0.14149390712022306, "learning_rate": 0.0001975551953369858, "loss": 0.0449, "step": 665 }, { "epoch": 0.4915129151291513, "grad_norm": 0.34660654212124264, "learning_rate": 0.00019753628499090452, "loss": 0.0802, "step": 666 }, { "epoch": 0.4922509225092251, "grad_norm": 0.23239973130630054, "learning_rate": 0.00019751730270348267, "loss": 0.0655, "step": 667 }, { "epoch": 0.4929889298892989, "grad_norm": 0.22809148685253236, "learning_rate": 0.00019749824848872135, "loss": 0.0603, "step": 668 }, { "epoch": 0.49372693726937267, "grad_norm": 0.30532105186849356, "learning_rate": 0.00019747912236067454, "loss": 0.0365, "step": 669 }, { "epoch": 0.4944649446494465, "grad_norm": 0.2258201583886479, "learning_rate": 0.0001974599243334495, "loss": 0.0392, "step": 670 }, { "epoch": 0.4952029520295203, "grad_norm": 0.13826709189184505, "learning_rate": 0.00019744065442120641, "loss": 0.0459, "step": 671 }, { "epoch": 0.4959409594095941, "grad_norm": 0.14850905061688432, "learning_rate": 0.00019742131263815842, "loss": 0.029, "step": 672 }, { "epoch": 0.4966789667896679, "grad_norm": 0.36728305311013265, "learning_rate": 0.00019740189899857178, "loss": 0.1121, "step": 673 }, { "epoch": 0.4974169741697417, "grad_norm": 0.3907559304722591, "learning_rate": 0.0001973824135167657, "loss": 0.0998, "step": 674 }, { "epoch": 0.4981549815498155, "grad_norm": 0.22187781221970432, "learning_rate": 0.00019736285620711242, "loss": 0.0513, "step": 675 }, { "epoch": 0.4988929889298893, "grad_norm": 0.20538582478705406, "learning_rate": 0.00019734322708403706, "loss": 0.0779, "step": 676 }, { "epoch": 0.4996309963099631, "grad_norm": 0.24165579639336152, "learning_rate": 0.00019732352616201783, "loss": 0.0872, "step": 677 }, { "epoch": 0.5003690036900369, "grad_norm": 0.16086882248029474, "learning_rate": 0.00019730375345558584, "loss": 0.051, "step": 678 }, { "epoch": 0.5011070110701107, "grad_norm": 0.32874986750261637, "learning_rate": 0.0001972839089793251, "loss": 0.0575, "step": 679 }, { "epoch": 0.5018450184501845, "grad_norm": 0.2872292948094666, "learning_rate": 0.0001972639927478727, "loss": 0.0719, "step": 680 }, { "epoch": 0.5025830258302583, "grad_norm": 0.21415147819750427, "learning_rate": 0.00019724400477591844, "loss": 0.0539, "step": 681 }, { "epoch": 0.5033210332103321, "grad_norm": 0.17854456208769745, "learning_rate": 0.00019722394507820526, "loss": 0.0315, "step": 682 }, { "epoch": 0.5040590405904058, "grad_norm": 0.16999780657607902, "learning_rate": 0.00019720381366952885, "loss": 0.0478, "step": 683 }, { "epoch": 0.5047970479704798, "grad_norm": 0.2672962620660825, "learning_rate": 0.00019718361056473785, "loss": 0.0913, "step": 684 }, { "epoch": 0.5055350553505535, "grad_norm": 0.3833342135701629, "learning_rate": 0.00019716333577873377, "loss": 0.0983, "step": 685 }, { "epoch": 0.5062730627306273, "grad_norm": 0.34228502918205955, "learning_rate": 0.00019714298932647098, "loss": 0.0752, "step": 686 }, { "epoch": 0.5070110701107011, "grad_norm": 0.3421907096673348, "learning_rate": 0.0001971225712229568, "loss": 0.0397, "step": 687 }, { "epoch": 0.5077490774907749, "grad_norm": 0.17752321708339702, "learning_rate": 0.00019710208148325127, "loss": 0.0405, "step": 688 }, { "epoch": 0.5084870848708487, "grad_norm": 0.21024064331066933, "learning_rate": 0.0001970815201224673, "loss": 0.0484, "step": 689 }, { "epoch": 0.5092250922509225, "grad_norm": 0.32600624607874307, "learning_rate": 0.0001970608871557707, "loss": 0.1076, "step": 690 }, { "epoch": 0.5099630996309963, "grad_norm": 0.16166216746702602, "learning_rate": 0.00019704018259838004, "loss": 0.0579, "step": 691 }, { "epoch": 0.5107011070110701, "grad_norm": 0.2169660713182678, "learning_rate": 0.00019701940646556665, "loss": 0.0779, "step": 692 }, { "epoch": 0.5114391143911439, "grad_norm": 0.11963874644488266, "learning_rate": 0.00019699855877265476, "loss": 0.0227, "step": 693 }, { "epoch": 0.5121771217712177, "grad_norm": 0.20280950087305788, "learning_rate": 0.00019697763953502128, "loss": 0.0627, "step": 694 }, { "epoch": 0.5129151291512916, "grad_norm": 0.1473733167558669, "learning_rate": 0.00019695664876809597, "loss": 0.0279, "step": 695 }, { "epoch": 0.5136531365313654, "grad_norm": 0.2686750352651002, "learning_rate": 0.0001969355864873613, "loss": 0.0472, "step": 696 }, { "epoch": 0.5143911439114391, "grad_norm": 0.2574817344515824, "learning_rate": 0.0001969144527083525, "loss": 0.0691, "step": 697 }, { "epoch": 0.5151291512915129, "grad_norm": 0.3364679057576319, "learning_rate": 0.00019689324744665752, "loss": 0.094, "step": 698 }, { "epoch": 0.5158671586715867, "grad_norm": 0.12081291467686636, "learning_rate": 0.00019687197071791707, "loss": 0.0236, "step": 699 }, { "epoch": 0.5166051660516605, "grad_norm": 0.19797701652723024, "learning_rate": 0.00019685062253782455, "loss": 0.0509, "step": 700 }, { "epoch": 0.5173431734317343, "grad_norm": 0.2497660892659575, "learning_rate": 0.00019682920292212608, "loss": 0.0758, "step": 701 }, { "epoch": 0.5180811808118081, "grad_norm": 0.300333977981209, "learning_rate": 0.00019680771188662044, "loss": 0.0697, "step": 702 }, { "epoch": 0.5188191881918819, "grad_norm": 0.3884593062164175, "learning_rate": 0.00019678614944715908, "loss": 0.0654, "step": 703 }, { "epoch": 0.5195571955719557, "grad_norm": 0.23288485726593033, "learning_rate": 0.00019676451561964622, "loss": 0.0537, "step": 704 }, { "epoch": 0.5202952029520295, "grad_norm": 0.15679500356413748, "learning_rate": 0.00019674281042003858, "loss": 0.0266, "step": 705 }, { "epoch": 0.5210332103321034, "grad_norm": 0.23737690510674744, "learning_rate": 0.00019672103386434562, "loss": 0.0458, "step": 706 }, { "epoch": 0.5217712177121772, "grad_norm": 0.20041271591293533, "learning_rate": 0.0001966991859686294, "loss": 0.049, "step": 707 }, { "epoch": 0.522509225092251, "grad_norm": 0.433259500674165, "learning_rate": 0.00019667726674900467, "loss": 0.0696, "step": 708 }, { "epoch": 0.5232472324723247, "grad_norm": 0.17644892473045368, "learning_rate": 0.00019665527622163864, "loss": 0.0437, "step": 709 }, { "epoch": 0.5239852398523985, "grad_norm": 0.4496154561478462, "learning_rate": 0.00019663321440275124, "loss": 0.0734, "step": 710 }, { "epoch": 0.5247232472324723, "grad_norm": 0.4699419910622859, "learning_rate": 0.00019661108130861497, "loss": 0.1097, "step": 711 }, { "epoch": 0.5254612546125461, "grad_norm": 0.29491452323761974, "learning_rate": 0.00019658887695555484, "loss": 0.039, "step": 712 }, { "epoch": 0.5261992619926199, "grad_norm": 0.1701728283386931, "learning_rate": 0.00019656660135994845, "loss": 0.0405, "step": 713 }, { "epoch": 0.5269372693726937, "grad_norm": 0.2726473421560641, "learning_rate": 0.00019654425453822597, "loss": 0.0598, "step": 714 }, { "epoch": 0.5276752767527675, "grad_norm": 0.24524333806963433, "learning_rate": 0.00019652183650687013, "loss": 0.0393, "step": 715 }, { "epoch": 0.5284132841328413, "grad_norm": 0.19235401549151018, "learning_rate": 0.0001964993472824161, "loss": 0.0503, "step": 716 }, { "epoch": 0.5291512915129152, "grad_norm": 0.27397005829780074, "learning_rate": 0.0001964767868814516, "loss": 0.0937, "step": 717 }, { "epoch": 0.529889298892989, "grad_norm": 0.27625818847169775, "learning_rate": 0.00019645415532061687, "loss": 0.0593, "step": 718 }, { "epoch": 0.5306273062730628, "grad_norm": 0.22840667431051065, "learning_rate": 0.0001964314526166046, "loss": 0.0703, "step": 719 }, { "epoch": 0.5313653136531366, "grad_norm": 0.27663303392666777, "learning_rate": 0.00019640867878616, "loss": 0.0857, "step": 720 }, { "epoch": 0.5321033210332103, "grad_norm": 0.11193187391875623, "learning_rate": 0.0001963858338460807, "loss": 0.0261, "step": 721 }, { "epoch": 0.5328413284132841, "grad_norm": 0.31495501208524873, "learning_rate": 0.00019636291781321679, "loss": 0.0646, "step": 722 }, { "epoch": 0.5335793357933579, "grad_norm": 0.6478714985300059, "learning_rate": 0.0001963399307044708, "loss": 0.1787, "step": 723 }, { "epoch": 0.5343173431734317, "grad_norm": 0.17188640533987026, "learning_rate": 0.00019631687253679768, "loss": 0.0369, "step": 724 }, { "epoch": 0.5350553505535055, "grad_norm": 0.26950559188947876, "learning_rate": 0.00019629374332720488, "loss": 0.0668, "step": 725 }, { "epoch": 0.5357933579335793, "grad_norm": 0.24661492296006018, "learning_rate": 0.00019627054309275202, "loss": 0.0737, "step": 726 }, { "epoch": 0.5365313653136531, "grad_norm": 0.5123456318334597, "learning_rate": 0.00019624727185055135, "loss": 0.0981, "step": 727 }, { "epoch": 0.537269372693727, "grad_norm": 0.37208897767369564, "learning_rate": 0.0001962239296177674, "loss": 0.0878, "step": 728 }, { "epoch": 0.5380073800738008, "grad_norm": 0.26018908374651484, "learning_rate": 0.00019620051641161705, "loss": 0.0584, "step": 729 }, { "epoch": 0.5387453874538746, "grad_norm": 0.2616271488891231, "learning_rate": 0.0001961770322493695, "loss": 0.0548, "step": 730 }, { "epoch": 0.5394833948339484, "grad_norm": 0.3823028058413703, "learning_rate": 0.00019615347714834638, "loss": 0.0943, "step": 731 }, { "epoch": 0.5402214022140222, "grad_norm": 0.21032023875415198, "learning_rate": 0.00019612985112592155, "loss": 0.0626, "step": 732 }, { "epoch": 0.5409594095940959, "grad_norm": 0.3089971408983108, "learning_rate": 0.00019610615419952124, "loss": 0.0947, "step": 733 }, { "epoch": 0.5416974169741697, "grad_norm": 0.4017984646243148, "learning_rate": 0.00019608238638662396, "loss": 0.0889, "step": 734 }, { "epoch": 0.5424354243542435, "grad_norm": 0.2307415273516294, "learning_rate": 0.00019605854770476046, "loss": 0.0414, "step": 735 }, { "epoch": 0.5431734317343173, "grad_norm": 0.19587033140187485, "learning_rate": 0.00019603463817151386, "loss": 0.0579, "step": 736 }, { "epoch": 0.5439114391143911, "grad_norm": 0.5291940456927469, "learning_rate": 0.00019601065780451945, "loss": 0.0762, "step": 737 }, { "epoch": 0.5446494464944649, "grad_norm": 0.2984244411467919, "learning_rate": 0.00019598660662146483, "loss": 0.0438, "step": 738 }, { "epoch": 0.5453874538745388, "grad_norm": 0.29662479649032036, "learning_rate": 0.00019596248464008977, "loss": 0.0775, "step": 739 }, { "epoch": 0.5461254612546126, "grad_norm": 0.1619100668875189, "learning_rate": 0.0001959382918781863, "loss": 0.0395, "step": 740 }, { "epoch": 0.5468634686346864, "grad_norm": 0.2647434678008207, "learning_rate": 0.00019591402835359865, "loss": 0.0635, "step": 741 }, { "epoch": 0.5476014760147602, "grad_norm": 0.26819688520900026, "learning_rate": 0.00019588969408422324, "loss": 0.0668, "step": 742 }, { "epoch": 0.548339483394834, "grad_norm": 0.26305450817306875, "learning_rate": 0.0001958652890880087, "loss": 0.0435, "step": 743 }, { "epoch": 0.5490774907749078, "grad_norm": 0.17329105983607962, "learning_rate": 0.00019584081338295574, "loss": 0.0551, "step": 744 }, { "epoch": 0.5498154981549815, "grad_norm": 0.35210575264552363, "learning_rate": 0.00019581626698711733, "loss": 0.0739, "step": 745 }, { "epoch": 0.5505535055350553, "grad_norm": 0.19385620607212334, "learning_rate": 0.0001957916499185985, "loss": 0.04, "step": 746 }, { "epoch": 0.5512915129151291, "grad_norm": 0.21159077655336414, "learning_rate": 0.0001957669621955565, "loss": 0.0655, "step": 747 }, { "epoch": 0.5520295202952029, "grad_norm": 0.2103492617536082, "learning_rate": 0.00019574220383620055, "loss": 0.0762, "step": 748 }, { "epoch": 0.5527675276752767, "grad_norm": 0.24185417984654628, "learning_rate": 0.0001957173748587921, "loss": 0.0427, "step": 749 }, { "epoch": 0.5535055350553506, "grad_norm": 0.2935567850269055, "learning_rate": 0.00019569247528164468, "loss": 0.0744, "step": 750 }, { "epoch": 0.5542435424354244, "grad_norm": 0.5776126908511267, "learning_rate": 0.00019566750512312378, "loss": 0.0753, "step": 751 }, { "epoch": 0.5549815498154982, "grad_norm": 0.2285459332560346, "learning_rate": 0.0001956424644016471, "loss": 0.0664, "step": 752 }, { "epoch": 0.555719557195572, "grad_norm": 0.37532985807797054, "learning_rate": 0.00019561735313568422, "loss": 0.1076, "step": 753 }, { "epoch": 0.5564575645756458, "grad_norm": 0.49491040721579016, "learning_rate": 0.0001955921713437569, "loss": 0.1153, "step": 754 }, { "epoch": 0.5571955719557196, "grad_norm": 0.25366745368598914, "learning_rate": 0.0001955669190444389, "loss": 0.0573, "step": 755 }, { "epoch": 0.5579335793357934, "grad_norm": 0.376014478840611, "learning_rate": 0.00019554159625635587, "loss": 0.0669, "step": 756 }, { "epoch": 0.5586715867158671, "grad_norm": 0.20941272606074607, "learning_rate": 0.00019551620299818558, "loss": 0.0381, "step": 757 }, { "epoch": 0.5594095940959409, "grad_norm": 0.39860686320238875, "learning_rate": 0.00019549073928865768, "loss": 0.0991, "step": 758 }, { "epoch": 0.5601476014760147, "grad_norm": 0.7118556284313808, "learning_rate": 0.00019546520514655388, "loss": 0.1007, "step": 759 }, { "epoch": 0.5608856088560885, "grad_norm": 0.3229669316719533, "learning_rate": 0.00019543960059070775, "loss": 0.0894, "step": 760 }, { "epoch": 0.5616236162361624, "grad_norm": 0.12885563543604694, "learning_rate": 0.00019541392564000488, "loss": 0.031, "step": 761 }, { "epoch": 0.5623616236162362, "grad_norm": 0.6106965187401378, "learning_rate": 0.0001953881803133827, "loss": 0.1003, "step": 762 }, { "epoch": 0.56309963099631, "grad_norm": 0.3803136186715307, "learning_rate": 0.00019536236462983065, "loss": 0.1064, "step": 763 }, { "epoch": 0.5638376383763838, "grad_norm": 0.5300468952419799, "learning_rate": 0.0001953364786083899, "loss": 0.1065, "step": 764 }, { "epoch": 0.5645756457564576, "grad_norm": 0.9280682055821953, "learning_rate": 0.00019531052226815366, "loss": 0.1356, "step": 765 }, { "epoch": 0.5653136531365314, "grad_norm": 0.4681490064402982, "learning_rate": 0.000195284495628267, "loss": 0.1055, "step": 766 }, { "epoch": 0.5660516605166052, "grad_norm": 0.24559745409226985, "learning_rate": 0.00019525839870792667, "loss": 0.0532, "step": 767 }, { "epoch": 0.566789667896679, "grad_norm": 0.2627068102741798, "learning_rate": 0.00019523223152638147, "loss": 0.0688, "step": 768 }, { "epoch": 0.5675276752767527, "grad_norm": 0.17087251588115876, "learning_rate": 0.0001952059941029319, "loss": 0.056, "step": 769 }, { "epoch": 0.5682656826568265, "grad_norm": 0.1950707724281016, "learning_rate": 0.00019517968645693028, "loss": 0.0511, "step": 770 }, { "epoch": 0.5690036900369003, "grad_norm": 0.31836784793487993, "learning_rate": 0.00019515330860778082, "loss": 0.0721, "step": 771 }, { "epoch": 0.5697416974169742, "grad_norm": 0.32187648907492794, "learning_rate": 0.00019512686057493933, "loss": 0.0816, "step": 772 }, { "epoch": 0.570479704797048, "grad_norm": 0.1989405224232227, "learning_rate": 0.0001951003423779136, "loss": 0.055, "step": 773 }, { "epoch": 0.5712177121771218, "grad_norm": 0.2122651135932569, "learning_rate": 0.00019507375403626296, "loss": 0.0746, "step": 774 }, { "epoch": 0.5719557195571956, "grad_norm": 0.2041255964955471, "learning_rate": 0.00019504709556959868, "loss": 0.1017, "step": 775 }, { "epoch": 0.5726937269372694, "grad_norm": 0.18915053633080717, "learning_rate": 0.0001950203669975836, "loss": 0.0353, "step": 776 }, { "epoch": 0.5734317343173432, "grad_norm": 0.31266105053636867, "learning_rate": 0.00019499356833993235, "loss": 0.0549, "step": 777 }, { "epoch": 0.574169741697417, "grad_norm": 0.630680037660709, "learning_rate": 0.0001949666996164112, "loss": 0.0958, "step": 778 }, { "epoch": 0.5749077490774908, "grad_norm": 0.32828752307688985, "learning_rate": 0.00019493976084683813, "loss": 0.0534, "step": 779 }, { "epoch": 0.5756457564575646, "grad_norm": 0.36912306773647435, "learning_rate": 0.0001949127520510828, "loss": 0.1291, "step": 780 }, { "epoch": 0.5763837638376383, "grad_norm": 0.15807846365580977, "learning_rate": 0.00019488567324906655, "loss": 0.0426, "step": 781 }, { "epoch": 0.5771217712177121, "grad_norm": 0.6178608423908574, "learning_rate": 0.00019485852446076224, "loss": 0.1249, "step": 782 }, { "epoch": 0.5778597785977859, "grad_norm": 0.47489038388325017, "learning_rate": 0.00019483130570619443, "loss": 0.1252, "step": 783 }, { "epoch": 0.5785977859778598, "grad_norm": 0.16121400287564985, "learning_rate": 0.0001948040170054393, "loss": 0.0383, "step": 784 }, { "epoch": 0.5793357933579336, "grad_norm": 0.40507765599055895, "learning_rate": 0.0001947766583786246, "loss": 0.0767, "step": 785 }, { "epoch": 0.5800738007380074, "grad_norm": 0.32792874052951465, "learning_rate": 0.0001947492298459296, "loss": 0.0693, "step": 786 }, { "epoch": 0.5808118081180812, "grad_norm": 0.38035124175785556, "learning_rate": 0.00019472173142758524, "loss": 0.0646, "step": 787 }, { "epoch": 0.581549815498155, "grad_norm": 0.32530939536510417, "learning_rate": 0.00019469416314387393, "loss": 0.0427, "step": 788 }, { "epoch": 0.5822878228782288, "grad_norm": 0.42845684939548206, "learning_rate": 0.00019466652501512962, "loss": 0.1129, "step": 789 }, { "epoch": 0.5830258302583026, "grad_norm": 0.31492535369245506, "learning_rate": 0.00019463881706173786, "loss": 0.1462, "step": 790 }, { "epoch": 0.5837638376383764, "grad_norm": 0.1974357564062002, "learning_rate": 0.00019461103930413555, "loss": 0.0488, "step": 791 }, { "epoch": 0.5845018450184502, "grad_norm": 0.19798801869403201, "learning_rate": 0.0001945831917628112, "loss": 0.0443, "step": 792 }, { "epoch": 0.5852398523985239, "grad_norm": 0.36181483943084974, "learning_rate": 0.00019455527445830475, "loss": 0.1052, "step": 793 }, { "epoch": 0.5859778597785977, "grad_norm": 0.31739187312737166, "learning_rate": 0.00019452728741120758, "loss": 0.063, "step": 794 }, { "epoch": 0.5867158671586716, "grad_norm": 0.28264741772837715, "learning_rate": 0.00019449923064216256, "loss": 0.0584, "step": 795 }, { "epoch": 0.5874538745387454, "grad_norm": 0.42068249682769193, "learning_rate": 0.00019447110417186389, "loss": 0.0788, "step": 796 }, { "epoch": 0.5881918819188192, "grad_norm": 0.31891318576909045, "learning_rate": 0.0001944429080210573, "loss": 0.0852, "step": 797 }, { "epoch": 0.588929889298893, "grad_norm": 0.2128440801724733, "learning_rate": 0.00019441464221053986, "loss": 0.043, "step": 798 }, { "epoch": 0.5896678966789668, "grad_norm": 0.5046561554222836, "learning_rate": 0.00019438630676116, "loss": 0.0932, "step": 799 }, { "epoch": 0.5904059040590406, "grad_norm": 0.3100395173863811, "learning_rate": 0.00019435790169381752, "loss": 0.0655, "step": 800 }, { "epoch": 0.5911439114391144, "grad_norm": 0.34911671733777094, "learning_rate": 0.0001943294270294636, "loss": 0.1112, "step": 801 }, { "epoch": 0.5918819188191882, "grad_norm": 0.409823969392575, "learning_rate": 0.00019430088278910072, "loss": 0.0661, "step": 802 }, { "epoch": 0.592619926199262, "grad_norm": 0.41759926170789013, "learning_rate": 0.00019427226899378273, "loss": 0.1168, "step": 803 }, { "epoch": 0.5933579335793358, "grad_norm": 0.18388693266725717, "learning_rate": 0.00019424358566461474, "loss": 0.054, "step": 804 }, { "epoch": 0.5940959409594095, "grad_norm": 0.6161580553361382, "learning_rate": 0.00019421483282275315, "loss": 0.0998, "step": 805 }, { "epoch": 0.5948339483394834, "grad_norm": 0.2240052107642673, "learning_rate": 0.0001941860104894056, "loss": 0.0534, "step": 806 }, { "epoch": 0.5955719557195572, "grad_norm": 0.18846179968018678, "learning_rate": 0.00019415711868583108, "loss": 0.0573, "step": 807 }, { "epoch": 0.596309963099631, "grad_norm": 0.15141852382012938, "learning_rate": 0.00019412815743333973, "loss": 0.0564, "step": 808 }, { "epoch": 0.5970479704797048, "grad_norm": 0.2550527301188835, "learning_rate": 0.00019409912675329293, "loss": 0.1077, "step": 809 }, { "epoch": 0.5977859778597786, "grad_norm": 0.2097224693496229, "learning_rate": 0.00019407002666710336, "loss": 0.0553, "step": 810 }, { "epoch": 0.5985239852398524, "grad_norm": 0.21686351333901488, "learning_rate": 0.0001940408571962347, "loss": 0.0562, "step": 811 }, { "epoch": 0.5992619926199262, "grad_norm": 0.3223176785433863, "learning_rate": 0.00019401161836220206, "loss": 0.0836, "step": 812 }, { "epoch": 0.6, "grad_norm": 0.4519668991396666, "learning_rate": 0.00019398231018657146, "loss": 0.0834, "step": 813 }, { "epoch": 0.6007380073800738, "grad_norm": 0.23966115267167426, "learning_rate": 0.00019395293269096027, "loss": 0.0379, "step": 814 }, { "epoch": 0.6014760147601476, "grad_norm": 0.30143584864268624, "learning_rate": 0.00019392348589703686, "loss": 0.0845, "step": 815 }, { "epoch": 0.6022140221402214, "grad_norm": 0.4064254219203514, "learning_rate": 0.00019389396982652076, "loss": 0.1081, "step": 816 }, { "epoch": 0.6029520295202953, "grad_norm": 0.7055589489067154, "learning_rate": 0.00019386438450118257, "loss": 0.109, "step": 817 }, { "epoch": 0.603690036900369, "grad_norm": 0.327098310256075, "learning_rate": 0.00019383472994284406, "loss": 0.0946, "step": 818 }, { "epoch": 0.6044280442804428, "grad_norm": 0.19969119539530683, "learning_rate": 0.00019380500617337796, "loss": 0.0607, "step": 819 }, { "epoch": 0.6051660516605166, "grad_norm": 0.14975776309971317, "learning_rate": 0.00019377521321470805, "loss": 0.0404, "step": 820 }, { "epoch": 0.6059040590405904, "grad_norm": 0.3305710223717638, "learning_rate": 0.00019374535108880925, "loss": 0.0991, "step": 821 }, { "epoch": 0.6066420664206642, "grad_norm": 0.23921198685652453, "learning_rate": 0.00019371541981770738, "loss": 0.0591, "step": 822 }, { "epoch": 0.607380073800738, "grad_norm": 0.146857753869367, "learning_rate": 0.00019368541942347932, "loss": 0.0572, "step": 823 }, { "epoch": 0.6081180811808118, "grad_norm": 0.1955369123346895, "learning_rate": 0.00019365534992825295, "loss": 0.0492, "step": 824 }, { "epoch": 0.6088560885608856, "grad_norm": 0.3036485329825656, "learning_rate": 0.00019362521135420706, "loss": 0.0998, "step": 825 }, { "epoch": 0.6095940959409594, "grad_norm": 0.16582177035369933, "learning_rate": 0.00019359500372357144, "loss": 0.0541, "step": 826 }, { "epoch": 0.6103321033210332, "grad_norm": 0.22512913045820804, "learning_rate": 0.00019356472705862678, "loss": 0.0635, "step": 827 }, { "epoch": 0.6110701107011071, "grad_norm": 0.22307005807353386, "learning_rate": 0.00019353438138170473, "loss": 0.0462, "step": 828 }, { "epoch": 0.6118081180811809, "grad_norm": 0.20183655583868546, "learning_rate": 0.0001935039667151878, "loss": 0.0484, "step": 829 }, { "epoch": 0.6125461254612546, "grad_norm": 0.476085806576139, "learning_rate": 0.0001934734830815094, "loss": 0.1085, "step": 830 }, { "epoch": 0.6132841328413284, "grad_norm": 0.18828533657516092, "learning_rate": 0.00019344293050315383, "loss": 0.0388, "step": 831 }, { "epoch": 0.6140221402214022, "grad_norm": 0.3719333128636075, "learning_rate": 0.00019341230900265624, "loss": 0.079, "step": 832 }, { "epoch": 0.614760147601476, "grad_norm": 0.12451645062572286, "learning_rate": 0.00019338161860260253, "loss": 0.0267, "step": 833 }, { "epoch": 0.6154981549815498, "grad_norm": 0.2480437313755228, "learning_rate": 0.00019335085932562957, "loss": 0.0512, "step": 834 }, { "epoch": 0.6162361623616236, "grad_norm": 0.4679932328838113, "learning_rate": 0.00019332003119442494, "loss": 0.1268, "step": 835 }, { "epoch": 0.6169741697416974, "grad_norm": 0.27650605969847836, "learning_rate": 0.000193289134231727, "loss": 0.0522, "step": 836 }, { "epoch": 0.6177121771217712, "grad_norm": 0.32996786329406325, "learning_rate": 0.00019325816846032487, "loss": 0.0735, "step": 837 }, { "epoch": 0.618450184501845, "grad_norm": 0.39224338466405007, "learning_rate": 0.0001932271339030585, "loss": 0.0555, "step": 838 }, { "epoch": 0.6191881918819189, "grad_norm": 0.13196344708501553, "learning_rate": 0.00019319603058281856, "loss": 0.0287, "step": 839 }, { "epoch": 0.6199261992619927, "grad_norm": 0.25745015129541027, "learning_rate": 0.00019316485852254628, "loss": 0.0651, "step": 840 }, { "epoch": 0.6206642066420665, "grad_norm": 0.36730671142892724, "learning_rate": 0.00019313361774523385, "loss": 0.07, "step": 841 }, { "epoch": 0.6214022140221402, "grad_norm": 0.5938202220633125, "learning_rate": 0.00019310230827392395, "loss": 0.0753, "step": 842 }, { "epoch": 0.622140221402214, "grad_norm": 0.26562935261043524, "learning_rate": 0.00019307093013170995, "loss": 0.0622, "step": 843 }, { "epoch": 0.6228782287822878, "grad_norm": 0.45713023183763374, "learning_rate": 0.00019303948334173604, "loss": 0.0836, "step": 844 }, { "epoch": 0.6236162361623616, "grad_norm": 0.3488716514254794, "learning_rate": 0.00019300796792719676, "loss": 0.0686, "step": 845 }, { "epoch": 0.6243542435424354, "grad_norm": 0.29404287036493043, "learning_rate": 0.0001929763839113375, "loss": 0.0417, "step": 846 }, { "epoch": 0.6250922509225092, "grad_norm": 0.6878446953359597, "learning_rate": 0.00019294473131745417, "loss": 0.1713, "step": 847 }, { "epoch": 0.625830258302583, "grad_norm": 0.2934847512359203, "learning_rate": 0.00019291301016889322, "loss": 0.0718, "step": 848 }, { "epoch": 0.6265682656826568, "grad_norm": 0.25762562438529873, "learning_rate": 0.00019288122048905177, "loss": 0.0941, "step": 849 }, { "epoch": 0.6273062730627307, "grad_norm": 0.13059825220315643, "learning_rate": 0.00019284936230137736, "loss": 0.0369, "step": 850 }, { "epoch": 0.6280442804428045, "grad_norm": 0.33911778773611745, "learning_rate": 0.00019281743562936816, "loss": 0.0777, "step": 851 }, { "epoch": 0.6287822878228783, "grad_norm": 0.18741371622470973, "learning_rate": 0.00019278544049657282, "loss": 0.0565, "step": 852 }, { "epoch": 0.629520295202952, "grad_norm": 0.29131457194418064, "learning_rate": 0.0001927533769265905, "loss": 0.0723, "step": 853 }, { "epoch": 0.6302583025830258, "grad_norm": 0.2904029579883963, "learning_rate": 0.00019272124494307074, "loss": 0.0598, "step": 854 }, { "epoch": 0.6309963099630996, "grad_norm": 0.3152630454441556, "learning_rate": 0.0001926890445697137, "loss": 0.0745, "step": 855 }, { "epoch": 0.6317343173431734, "grad_norm": 0.12398054804714302, "learning_rate": 0.00019265677583026988, "loss": 0.0317, "step": 856 }, { "epoch": 0.6324723247232472, "grad_norm": 0.37780290619980533, "learning_rate": 0.00019262443874854026, "loss": 0.109, "step": 857 }, { "epoch": 0.633210332103321, "grad_norm": 0.37798145886630724, "learning_rate": 0.00019259203334837612, "loss": 0.1127, "step": 858 }, { "epoch": 0.6339483394833948, "grad_norm": 0.5647374554739166, "learning_rate": 0.0001925595596536793, "loss": 0.0891, "step": 859 }, { "epoch": 0.6346863468634686, "grad_norm": 0.20995298348563238, "learning_rate": 0.00019252701768840189, "loss": 0.0309, "step": 860 }, { "epoch": 0.6354243542435425, "grad_norm": 0.1499747568144894, "learning_rate": 0.00019249440747654638, "loss": 0.0371, "step": 861 }, { "epoch": 0.6361623616236163, "grad_norm": 0.2739118465302556, "learning_rate": 0.00019246172904216553, "loss": 0.0624, "step": 862 }, { "epoch": 0.6369003690036901, "grad_norm": 0.46314104396425304, "learning_rate": 0.00019242898240936254, "loss": 0.1035, "step": 863 }, { "epoch": 0.6376383763837639, "grad_norm": 0.18608988617589822, "learning_rate": 0.00019239616760229083, "loss": 0.0511, "step": 864 }, { "epoch": 0.6383763837638377, "grad_norm": 0.22108005912432857, "learning_rate": 0.00019236328464515413, "loss": 0.0518, "step": 865 }, { "epoch": 0.6391143911439114, "grad_norm": 0.25079736367445504, "learning_rate": 0.0001923303335622064, "loss": 0.0943, "step": 866 }, { "epoch": 0.6398523985239852, "grad_norm": 0.3443807897323052, "learning_rate": 0.0001922973143777519, "loss": 0.0884, "step": 867 }, { "epoch": 0.640590405904059, "grad_norm": 0.148765154832008, "learning_rate": 0.00019226422711614508, "loss": 0.0315, "step": 868 }, { "epoch": 0.6413284132841328, "grad_norm": 0.21359062046308244, "learning_rate": 0.0001922310718017907, "loss": 0.034, "step": 869 }, { "epoch": 0.6420664206642066, "grad_norm": 0.4019417983878902, "learning_rate": 0.00019219784845914354, "loss": 0.0652, "step": 870 }, { "epoch": 0.6428044280442804, "grad_norm": 0.3561069342083499, "learning_rate": 0.00019216455711270865, "loss": 0.0614, "step": 871 }, { "epoch": 0.6435424354243543, "grad_norm": 0.12074560942365548, "learning_rate": 0.00019213119778704128, "loss": 0.0274, "step": 872 }, { "epoch": 0.6442804428044281, "grad_norm": 0.3934140578385458, "learning_rate": 0.00019209777050674683, "loss": 0.0437, "step": 873 }, { "epoch": 0.6450184501845019, "grad_norm": 0.2659095794730061, "learning_rate": 0.0001920642752964807, "loss": 0.0466, "step": 874 }, { "epoch": 0.6457564575645757, "grad_norm": 0.2434094309341207, "learning_rate": 0.0001920307121809485, "loss": 0.0718, "step": 875 }, { "epoch": 0.6464944649446495, "grad_norm": 0.20216729289702443, "learning_rate": 0.00019199708118490587, "loss": 0.0512, "step": 876 }, { "epoch": 0.6472324723247233, "grad_norm": 0.16917523634562057, "learning_rate": 0.0001919633823331586, "loss": 0.0331, "step": 877 }, { "epoch": 0.647970479704797, "grad_norm": 0.46933017319017734, "learning_rate": 0.00019192961565056238, "loss": 0.0931, "step": 878 }, { "epoch": 0.6487084870848708, "grad_norm": 0.3800945921924304, "learning_rate": 0.00019189578116202307, "loss": 0.0871, "step": 879 }, { "epoch": 0.6494464944649446, "grad_norm": 0.24855131704391736, "learning_rate": 0.00019186187889249653, "loss": 0.0517, "step": 880 }, { "epoch": 0.6501845018450184, "grad_norm": 0.2184569145245582, "learning_rate": 0.00019182790886698852, "loss": 0.0442, "step": 881 }, { "epoch": 0.6509225092250922, "grad_norm": 0.10716387372309505, "learning_rate": 0.00019179387111055486, "loss": 0.0394, "step": 882 }, { "epoch": 0.6516605166051661, "grad_norm": 0.30032480190063027, "learning_rate": 0.0001917597656483013, "loss": 0.0753, "step": 883 }, { "epoch": 0.6523985239852399, "grad_norm": 0.23481923654652448, "learning_rate": 0.00019172559250538358, "loss": 0.0436, "step": 884 }, { "epoch": 0.6531365313653137, "grad_norm": 0.22240114910959494, "learning_rate": 0.00019169135170700723, "loss": 0.0612, "step": 885 }, { "epoch": 0.6538745387453875, "grad_norm": 0.21865580024889733, "learning_rate": 0.00019165704327842782, "loss": 0.0626, "step": 886 }, { "epoch": 0.6546125461254613, "grad_norm": 0.275476084872021, "learning_rate": 0.00019162266724495071, "loss": 0.0786, "step": 887 }, { "epoch": 0.6553505535055351, "grad_norm": 0.244461726369598, "learning_rate": 0.0001915882236319312, "loss": 0.0447, "step": 888 }, { "epoch": 0.6560885608856089, "grad_norm": 0.1161794720654763, "learning_rate": 0.00019155371246477434, "loss": 0.0334, "step": 889 }, { "epoch": 0.6568265682656826, "grad_norm": 0.3653200853580867, "learning_rate": 0.0001915191337689351, "loss": 0.0898, "step": 890 }, { "epoch": 0.6575645756457564, "grad_norm": 0.302057548900222, "learning_rate": 0.00019148448756991823, "loss": 0.0537, "step": 891 }, { "epoch": 0.6583025830258302, "grad_norm": 0.3360787785125817, "learning_rate": 0.00019144977389327824, "loss": 0.0984, "step": 892 }, { "epoch": 0.659040590405904, "grad_norm": 0.31141920692038993, "learning_rate": 0.00019141499276461947, "loss": 0.0815, "step": 893 }, { "epoch": 0.6597785977859778, "grad_norm": 0.18856300319637478, "learning_rate": 0.00019138014420959593, "loss": 0.0527, "step": 894 }, { "epoch": 0.6605166051660517, "grad_norm": 0.19303970011910163, "learning_rate": 0.0001913452282539114, "loss": 0.0539, "step": 895 }, { "epoch": 0.6612546125461255, "grad_norm": 0.25412855972932086, "learning_rate": 0.00019131024492331943, "loss": 0.0367, "step": 896 }, { "epoch": 0.6619926199261993, "grad_norm": 0.3695737155566506, "learning_rate": 0.00019127519424362314, "loss": 0.0871, "step": 897 }, { "epoch": 0.6627306273062731, "grad_norm": 0.5006346009245367, "learning_rate": 0.00019124007624067547, "loss": 0.0761, "step": 898 }, { "epoch": 0.6634686346863469, "grad_norm": 0.27828244073243963, "learning_rate": 0.00019120489094037892, "loss": 0.0591, "step": 899 }, { "epoch": 0.6642066420664207, "grad_norm": 0.1837880867327322, "learning_rate": 0.00019116963836868564, "loss": 0.0519, "step": 900 }, { "epoch": 0.6649446494464945, "grad_norm": 0.3261224720172735, "learning_rate": 0.00019113431855159743, "loss": 0.1076, "step": 901 }, { "epoch": 0.6656826568265682, "grad_norm": 0.168429382351957, "learning_rate": 0.00019109893151516568, "loss": 0.0427, "step": 902 }, { "epoch": 0.666420664206642, "grad_norm": 0.29209206016359884, "learning_rate": 0.00019106347728549135, "loss": 0.1033, "step": 903 }, { "epoch": 0.6671586715867158, "grad_norm": 0.34983171641339855, "learning_rate": 0.00019102795588872492, "loss": 0.1153, "step": 904 }, { "epoch": 0.6678966789667896, "grad_norm": 0.19425950871628309, "learning_rate": 0.0001909923673510665, "loss": 0.0491, "step": 905 }, { "epoch": 0.6686346863468635, "grad_norm": 0.1493504031770382, "learning_rate": 0.00019095671169876567, "loss": 0.025, "step": 906 }, { "epoch": 0.6693726937269373, "grad_norm": 0.3308673080093318, "learning_rate": 0.00019092098895812147, "loss": 0.0852, "step": 907 }, { "epoch": 0.6701107011070111, "grad_norm": 0.19725397512672618, "learning_rate": 0.00019088519915548254, "loss": 0.0498, "step": 908 }, { "epoch": 0.6708487084870849, "grad_norm": 0.2478210510124813, "learning_rate": 0.00019084934231724688, "loss": 0.0719, "step": 909 }, { "epoch": 0.6715867158671587, "grad_norm": 0.16670951121770844, "learning_rate": 0.000190813418469862, "loss": 0.0352, "step": 910 }, { "epoch": 0.6723247232472325, "grad_norm": 0.20642524886145505, "learning_rate": 0.00019077742763982478, "loss": 0.0687, "step": 911 }, { "epoch": 0.6730627306273063, "grad_norm": 0.6195341735536203, "learning_rate": 0.00019074136985368153, "loss": 0.0982, "step": 912 }, { "epoch": 0.67380073800738, "grad_norm": 0.14219808704022266, "learning_rate": 0.00019070524513802796, "loss": 0.0322, "step": 913 }, { "epoch": 0.6745387453874538, "grad_norm": 0.5414096068376657, "learning_rate": 0.0001906690535195091, "loss": 0.1516, "step": 914 }, { "epoch": 0.6752767527675276, "grad_norm": 0.17867861513308414, "learning_rate": 0.0001906327950248194, "loss": 0.0377, "step": 915 }, { "epoch": 0.6760147601476014, "grad_norm": 0.3830528364744962, "learning_rate": 0.0001905964696807026, "loss": 0.0874, "step": 916 }, { "epoch": 0.6767527675276753, "grad_norm": 0.5737501362242396, "learning_rate": 0.00019056007751395174, "loss": 0.0958, "step": 917 }, { "epoch": 0.6774907749077491, "grad_norm": 0.1939175355152512, "learning_rate": 0.0001905236185514091, "loss": 0.058, "step": 918 }, { "epoch": 0.6782287822878229, "grad_norm": 0.38133464718558163, "learning_rate": 0.00019048709281996632, "loss": 0.0562, "step": 919 }, { "epoch": 0.6789667896678967, "grad_norm": 0.21303973447176533, "learning_rate": 0.00019045050034656428, "loss": 0.0421, "step": 920 }, { "epoch": 0.6797047970479705, "grad_norm": 0.39067371197817524, "learning_rate": 0.00019041384115819297, "loss": 0.0679, "step": 921 }, { "epoch": 0.6804428044280443, "grad_norm": 1.3870472774151192, "learning_rate": 0.00019037711528189174, "loss": 0.2786, "step": 922 }, { "epoch": 0.6811808118081181, "grad_norm": 0.15285973782464576, "learning_rate": 0.00019034032274474905, "loss": 0.0333, "step": 923 }, { "epoch": 0.6819188191881919, "grad_norm": 0.1712316360076446, "learning_rate": 0.0001903034635739025, "loss": 0.0519, "step": 924 }, { "epoch": 0.6826568265682657, "grad_norm": 0.26601652926168134, "learning_rate": 0.00019026653779653893, "loss": 0.0732, "step": 925 }, { "epoch": 0.6833948339483394, "grad_norm": 0.16279653187198054, "learning_rate": 0.00019022954543989422, "loss": 0.0456, "step": 926 }, { "epoch": 0.6841328413284132, "grad_norm": 0.17475721084891124, "learning_rate": 0.0001901924865312534, "loss": 0.0509, "step": 927 }, { "epoch": 0.6848708487084871, "grad_norm": 0.27559029803578394, "learning_rate": 0.0001901553610979506, "loss": 0.1077, "step": 928 }, { "epoch": 0.6856088560885609, "grad_norm": 0.47149463736790387, "learning_rate": 0.00019011816916736897, "loss": 0.0834, "step": 929 }, { "epoch": 0.6863468634686347, "grad_norm": 0.23556307320080255, "learning_rate": 0.00019008091076694076, "loss": 0.0931, "step": 930 }, { "epoch": 0.6870848708487085, "grad_norm": 0.24008539367852416, "learning_rate": 0.00019004358592414718, "loss": 0.0701, "step": 931 }, { "epoch": 0.6878228782287823, "grad_norm": 0.26575472187206867, "learning_rate": 0.00019000619466651855, "loss": 0.0883, "step": 932 }, { "epoch": 0.6885608856088561, "grad_norm": 0.13973936193405181, "learning_rate": 0.00018996873702163404, "loss": 0.0396, "step": 933 }, { "epoch": 0.6892988929889299, "grad_norm": 0.22881000076511104, "learning_rate": 0.00018993121301712193, "loss": 0.0392, "step": 934 }, { "epoch": 0.6900369003690037, "grad_norm": 0.1262417505770138, "learning_rate": 0.00018989362268065935, "loss": 0.0201, "step": 935 }, { "epoch": 0.6907749077490775, "grad_norm": 0.3085525463091507, "learning_rate": 0.00018985596603997239, "loss": 0.0588, "step": 936 }, { "epoch": 0.6915129151291513, "grad_norm": 0.23323670565408766, "learning_rate": 0.00018981824312283604, "loss": 0.0371, "step": 937 }, { "epoch": 0.692250922509225, "grad_norm": 0.3449582225216414, "learning_rate": 0.00018978045395707418, "loss": 0.0617, "step": 938 }, { "epoch": 0.6929889298892989, "grad_norm": 0.31333662410281254, "learning_rate": 0.0001897425985705595, "loss": 0.0755, "step": 939 }, { "epoch": 0.6937269372693727, "grad_norm": 0.32025142573224563, "learning_rate": 0.00018970467699121367, "loss": 0.0555, "step": 940 }, { "epoch": 0.6944649446494465, "grad_norm": 0.3608282781149688, "learning_rate": 0.000189666689247007, "loss": 0.0649, "step": 941 }, { "epoch": 0.6952029520295203, "grad_norm": 0.3455696439401912, "learning_rate": 0.00018962863536595877, "loss": 0.0518, "step": 942 }, { "epoch": 0.6959409594095941, "grad_norm": 0.20219214994930254, "learning_rate": 0.0001895905153761369, "loss": 0.0476, "step": 943 }, { "epoch": 0.6966789667896679, "grad_norm": 0.22740362447567547, "learning_rate": 0.0001895523293056582, "loss": 0.0503, "step": 944 }, { "epoch": 0.6974169741697417, "grad_norm": 0.32224227350038775, "learning_rate": 0.0001895140771826881, "loss": 0.0954, "step": 945 }, { "epoch": 0.6981549815498155, "grad_norm": 0.5686368780397619, "learning_rate": 0.00018947575903544088, "loss": 0.1555, "step": 946 }, { "epoch": 0.6988929889298893, "grad_norm": 0.30290450153851195, "learning_rate": 0.00018943737489217938, "loss": 0.0756, "step": 947 }, { "epoch": 0.6996309963099631, "grad_norm": 0.3874481861658373, "learning_rate": 0.00018939892478121522, "loss": 0.0745, "step": 948 }, { "epoch": 0.7003690036900369, "grad_norm": 0.31093814688479143, "learning_rate": 0.00018936040873090862, "loss": 0.0877, "step": 949 }, { "epoch": 0.7011070110701108, "grad_norm": 0.2912305680713261, "learning_rate": 0.00018932182676966846, "loss": 0.0861, "step": 950 }, { "epoch": 0.7018450184501845, "grad_norm": 0.6214859731445413, "learning_rate": 0.00018928317892595223, "loss": 0.1626, "step": 951 }, { "epoch": 0.7025830258302583, "grad_norm": 0.16988546214405167, "learning_rate": 0.00018924446522826607, "loss": 0.0324, "step": 952 }, { "epoch": 0.7033210332103321, "grad_norm": 0.26920286798738, "learning_rate": 0.00018920568570516454, "loss": 0.075, "step": 953 }, { "epoch": 0.7040590405904059, "grad_norm": 0.14637783322576742, "learning_rate": 0.00018916684038525094, "loss": 0.0406, "step": 954 }, { "epoch": 0.7047970479704797, "grad_norm": 0.19505954865336755, "learning_rate": 0.00018912792929717695, "loss": 0.0866, "step": 955 }, { "epoch": 0.7055350553505535, "grad_norm": 0.1584391291324113, "learning_rate": 0.00018908895246964286, "loss": 0.0437, "step": 956 }, { "epoch": 0.7062730627306273, "grad_norm": 0.3098275890799006, "learning_rate": 0.0001890499099313974, "loss": 0.0721, "step": 957 }, { "epoch": 0.7070110701107011, "grad_norm": 0.12682222311338454, "learning_rate": 0.00018901080171123774, "loss": 0.0284, "step": 958 }, { "epoch": 0.7077490774907749, "grad_norm": 0.1885009276536557, "learning_rate": 0.0001889716278380096, "loss": 0.0659, "step": 959 }, { "epoch": 0.7084870848708487, "grad_norm": 0.25083500207152587, "learning_rate": 0.000188932388340607, "loss": 0.0651, "step": 960 }, { "epoch": 0.7092250922509226, "grad_norm": 0.5264573556136557, "learning_rate": 0.00018889308324797246, "loss": 0.1117, "step": 961 }, { "epoch": 0.7099630996309964, "grad_norm": 0.25421062847726905, "learning_rate": 0.00018885371258909678, "loss": 0.0549, "step": 962 }, { "epoch": 0.7107011070110701, "grad_norm": 0.5625494466180504, "learning_rate": 0.00018881427639301927, "loss": 0.137, "step": 963 }, { "epoch": 0.7114391143911439, "grad_norm": 0.30065319220345943, "learning_rate": 0.00018877477468882744, "loss": 0.1099, "step": 964 }, { "epoch": 0.7121771217712177, "grad_norm": 0.23997894825492536, "learning_rate": 0.00018873520750565718, "loss": 0.0562, "step": 965 }, { "epoch": 0.7129151291512915, "grad_norm": 0.21083474739944094, "learning_rate": 0.00018869557487269264, "loss": 0.0475, "step": 966 }, { "epoch": 0.7136531365313653, "grad_norm": 0.3114438014071507, "learning_rate": 0.00018865587681916632, "loss": 0.0773, "step": 967 }, { "epoch": 0.7143911439114391, "grad_norm": 0.35511334676591133, "learning_rate": 0.0001886161133743589, "loss": 0.0618, "step": 968 }, { "epoch": 0.7151291512915129, "grad_norm": 0.34874280990852186, "learning_rate": 0.00018857628456759936, "loss": 0.0835, "step": 969 }, { "epoch": 0.7158671586715867, "grad_norm": 0.29997538037178545, "learning_rate": 0.00018853639042826478, "loss": 0.0564, "step": 970 }, { "epoch": 0.7166051660516605, "grad_norm": 0.35669428171614154, "learning_rate": 0.0001884964309857805, "loss": 0.0741, "step": 971 }, { "epoch": 0.7173431734317344, "grad_norm": 0.29766570305258144, "learning_rate": 0.00018845640626962006, "loss": 0.0901, "step": 972 }, { "epoch": 0.7180811808118082, "grad_norm": 0.16976259058875476, "learning_rate": 0.0001884163163093051, "loss": 0.0471, "step": 973 }, { "epoch": 0.718819188191882, "grad_norm": 0.2681061380364504, "learning_rate": 0.00018837616113440538, "loss": 0.0701, "step": 974 }, { "epoch": 0.7195571955719557, "grad_norm": 0.16519310904850257, "learning_rate": 0.00018833594077453876, "loss": 0.0374, "step": 975 }, { "epoch": 0.7202952029520295, "grad_norm": 0.3824613077625915, "learning_rate": 0.0001882956552593712, "loss": 0.0682, "step": 976 }, { "epoch": 0.7210332103321033, "grad_norm": 0.4494233406000593, "learning_rate": 0.0001882553046186167, "loss": 0.0998, "step": 977 }, { "epoch": 0.7217712177121771, "grad_norm": 0.3495861507481814, "learning_rate": 0.00018821488888203736, "loss": 0.0495, "step": 978 }, { "epoch": 0.7225092250922509, "grad_norm": 0.39250817648583264, "learning_rate": 0.00018817440807944317, "loss": 0.0937, "step": 979 }, { "epoch": 0.7232472324723247, "grad_norm": 0.3335057967072848, "learning_rate": 0.0001881338622406922, "loss": 0.0476, "step": 980 }, { "epoch": 0.7239852398523985, "grad_norm": 0.205816383528613, "learning_rate": 0.00018809325139569047, "loss": 0.0524, "step": 981 }, { "epoch": 0.7247232472324723, "grad_norm": 0.1904200413001589, "learning_rate": 0.00018805257557439193, "loss": 0.078, "step": 982 }, { "epoch": 0.7254612546125462, "grad_norm": 0.21583189813572298, "learning_rate": 0.0001880118348067985, "loss": 0.0536, "step": 983 }, { "epoch": 0.72619926199262, "grad_norm": 0.3444222208975718, "learning_rate": 0.00018797102912295998, "loss": 0.0566, "step": 984 }, { "epoch": 0.7269372693726938, "grad_norm": 0.213962520950063, "learning_rate": 0.00018793015855297403, "loss": 0.0595, "step": 985 }, { "epoch": 0.7276752767527676, "grad_norm": 0.21150429637174334, "learning_rate": 0.00018788922312698616, "loss": 0.0561, "step": 986 }, { "epoch": 0.7284132841328413, "grad_norm": 0.20918848152909778, "learning_rate": 0.0001878482228751898, "loss": 0.0405, "step": 987 }, { "epoch": 0.7291512915129151, "grad_norm": 0.3307503822520408, "learning_rate": 0.00018780715782782607, "loss": 0.1726, "step": 988 }, { "epoch": 0.7298892988929889, "grad_norm": 0.5905023657477413, "learning_rate": 0.00018776602801518405, "loss": 0.0781, "step": 989 }, { "epoch": 0.7306273062730627, "grad_norm": 0.21012689674962234, "learning_rate": 0.00018772483346760036, "loss": 0.0857, "step": 990 }, { "epoch": 0.7313653136531365, "grad_norm": 0.18372988491944192, "learning_rate": 0.00018768357421545964, "loss": 0.044, "step": 991 }, { "epoch": 0.7321033210332103, "grad_norm": 0.23337245483218866, "learning_rate": 0.00018764225028919398, "loss": 0.053, "step": 992 }, { "epoch": 0.7328413284132841, "grad_norm": 0.25184166187285845, "learning_rate": 0.00018760086171928337, "loss": 0.053, "step": 993 }, { "epoch": 0.7335793357933579, "grad_norm": 0.18855296729429177, "learning_rate": 0.00018755940853625543, "loss": 0.0383, "step": 994 }, { "epoch": 0.7343173431734318, "grad_norm": 0.27209769953595553, "learning_rate": 0.00018751789077068538, "loss": 0.0666, "step": 995 }, { "epoch": 0.7350553505535056, "grad_norm": 0.20028292787376117, "learning_rate": 0.00018747630845319612, "loss": 0.0542, "step": 996 }, { "epoch": 0.7357933579335794, "grad_norm": 0.32135560579302963, "learning_rate": 0.00018743466161445823, "loss": 0.0676, "step": 997 }, { "epoch": 0.7365313653136532, "grad_norm": 0.31230227702620805, "learning_rate": 0.00018739295028518971, "loss": 0.0942, "step": 998 }, { "epoch": 0.7372693726937269, "grad_norm": 0.34227969631962646, "learning_rate": 0.0001873511744961563, "loss": 0.1117, "step": 999 }, { "epoch": 0.7380073800738007, "grad_norm": 0.32900151909498476, "learning_rate": 0.0001873093342781712, "loss": 0.0933, "step": 1000 }, { "epoch": 0.7387453874538745, "grad_norm": 0.14619130968775493, "learning_rate": 0.00018726742966209515, "loss": 0.0385, "step": 1001 }, { "epoch": 0.7394833948339483, "grad_norm": 0.14887825120504064, "learning_rate": 0.00018722546067883632, "loss": 0.0539, "step": 1002 }, { "epoch": 0.7402214022140221, "grad_norm": 0.39045385480550404, "learning_rate": 0.00018718342735935052, "loss": 0.0915, "step": 1003 }, { "epoch": 0.7409594095940959, "grad_norm": 0.26014882033268355, "learning_rate": 0.0001871413297346408, "loss": 0.0528, "step": 1004 }, { "epoch": 0.7416974169741697, "grad_norm": 0.2808538357256002, "learning_rate": 0.00018709916783575783, "loss": 0.0895, "step": 1005 }, { "epoch": 0.7424354243542436, "grad_norm": 0.13419024155563786, "learning_rate": 0.00018705694169379963, "loss": 0.0374, "step": 1006 }, { "epoch": 0.7431734317343174, "grad_norm": 0.45900555155080247, "learning_rate": 0.00018701465133991153, "loss": 0.0775, "step": 1007 }, { "epoch": 0.7439114391143912, "grad_norm": 0.18034891460092242, "learning_rate": 0.0001869722968052863, "loss": 0.0611, "step": 1008 }, { "epoch": 0.744649446494465, "grad_norm": 0.24504052741395962, "learning_rate": 0.000186929878121164, "loss": 0.0614, "step": 1009 }, { "epoch": 0.7453874538745388, "grad_norm": 0.12254714881306453, "learning_rate": 0.00018688739531883211, "loss": 0.0295, "step": 1010 }, { "epoch": 0.7461254612546125, "grad_norm": 0.5427361890211475, "learning_rate": 0.00018684484842962525, "loss": 0.1279, "step": 1011 }, { "epoch": 0.7468634686346863, "grad_norm": 0.18687942219250409, "learning_rate": 0.00018680223748492538, "loss": 0.0544, "step": 1012 }, { "epoch": 0.7476014760147601, "grad_norm": 0.1378698114831249, "learning_rate": 0.0001867595625161618, "loss": 0.0338, "step": 1013 }, { "epoch": 0.7483394833948339, "grad_norm": 0.24275190903280053, "learning_rate": 0.00018671682355481085, "loss": 0.0768, "step": 1014 }, { "epoch": 0.7490774907749077, "grad_norm": 0.7343018533745056, "learning_rate": 0.0001866740206323962, "loss": 0.1925, "step": 1015 }, { "epoch": 0.7498154981549815, "grad_norm": 0.15566501350920825, "learning_rate": 0.00018663115378048862, "loss": 0.0356, "step": 1016 }, { "epoch": 0.7505535055350554, "grad_norm": 0.20484548293268723, "learning_rate": 0.00018658822303070616, "loss": 0.0676, "step": 1017 }, { "epoch": 0.7512915129151292, "grad_norm": 0.14296983686355602, "learning_rate": 0.00018654522841471386, "loss": 0.0355, "step": 1018 }, { "epoch": 0.752029520295203, "grad_norm": 0.5287497738755342, "learning_rate": 0.00018650216996422394, "loss": 0.114, "step": 1019 }, { "epoch": 0.7527675276752768, "grad_norm": 0.41022058916633386, "learning_rate": 0.00018645904771099567, "loss": 0.1516, "step": 1020 }, { "epoch": 0.7535055350553506, "grad_norm": 0.11960771110294738, "learning_rate": 0.00018641586168683538, "loss": 0.035, "step": 1021 }, { "epoch": 0.7542435424354244, "grad_norm": 0.27607727886335165, "learning_rate": 0.00018637261192359648, "loss": 0.0584, "step": 1022 }, { "epoch": 0.7549815498154981, "grad_norm": 0.13937778911916487, "learning_rate": 0.00018632929845317935, "loss": 0.0553, "step": 1023 }, { "epoch": 0.7557195571955719, "grad_norm": 0.1595935435462446, "learning_rate": 0.0001862859213075314, "loss": 0.1053, "step": 1024 }, { "epoch": 0.7564575645756457, "grad_norm": 0.27983398885637406, "learning_rate": 0.0001862424805186469, "loss": 0.075, "step": 1025 }, { "epoch": 0.7571955719557195, "grad_norm": 0.24142534001673965, "learning_rate": 0.00018619897611856726, "loss": 0.0731, "step": 1026 }, { "epoch": 0.7579335793357933, "grad_norm": 0.17588554458627048, "learning_rate": 0.0001861554081393806, "loss": 0.0388, "step": 1027 }, { "epoch": 0.7586715867158672, "grad_norm": 0.2948409205077282, "learning_rate": 0.0001861117766132221, "loss": 0.0971, "step": 1028 }, { "epoch": 0.759409594095941, "grad_norm": 0.16073832485025327, "learning_rate": 0.00018606808157227366, "loss": 0.0242, "step": 1029 }, { "epoch": 0.7601476014760148, "grad_norm": 0.2549966417839563, "learning_rate": 0.0001860243230487641, "loss": 0.0374, "step": 1030 }, { "epoch": 0.7608856088560886, "grad_norm": 0.23866443099522294, "learning_rate": 0.00018598050107496915, "loss": 0.0401, "step": 1031 }, { "epoch": 0.7616236162361624, "grad_norm": 0.47590696535090277, "learning_rate": 0.00018593661568321124, "loss": 0.0626, "step": 1032 }, { "epoch": 0.7623616236162362, "grad_norm": 0.24338892751262595, "learning_rate": 0.00018589266690585953, "loss": 0.0571, "step": 1033 }, { "epoch": 0.76309963099631, "grad_norm": 0.3045094399959726, "learning_rate": 0.00018584865477533008, "loss": 0.0953, "step": 1034 }, { "epoch": 0.7638376383763837, "grad_norm": 0.17216938315720606, "learning_rate": 0.0001858045793240855, "loss": 0.0306, "step": 1035 }, { "epoch": 0.7645756457564575, "grad_norm": 0.33334907273910663, "learning_rate": 0.00018576044058463525, "loss": 0.0582, "step": 1036 }, { "epoch": 0.7653136531365313, "grad_norm": 0.28858920522439546, "learning_rate": 0.00018571623858953547, "loss": 0.0582, "step": 1037 }, { "epoch": 0.7660516605166051, "grad_norm": 0.38850451683927184, "learning_rate": 0.0001856719733713888, "loss": 0.0782, "step": 1038 }, { "epoch": 0.766789667896679, "grad_norm": 0.340969550650013, "learning_rate": 0.00018562764496284472, "loss": 0.0731, "step": 1039 }, { "epoch": 0.7675276752767528, "grad_norm": 0.2291285975028127, "learning_rate": 0.00018558325339659916, "loss": 0.043, "step": 1040 }, { "epoch": 0.7682656826568266, "grad_norm": 0.30903230383329144, "learning_rate": 0.0001855387987053947, "loss": 0.0891, "step": 1041 }, { "epoch": 0.7690036900369004, "grad_norm": 0.3518102756466016, "learning_rate": 0.00018549428092202048, "loss": 0.0856, "step": 1042 }, { "epoch": 0.7697416974169742, "grad_norm": 0.5202155704894743, "learning_rate": 0.00018544970007931214, "loss": 0.0674, "step": 1043 }, { "epoch": 0.770479704797048, "grad_norm": 0.29533764280299274, "learning_rate": 0.00018540505621015193, "loss": 0.0656, "step": 1044 }, { "epoch": 0.7712177121771218, "grad_norm": 0.269810403046293, "learning_rate": 0.00018536034934746846, "loss": 0.0666, "step": 1045 }, { "epoch": 0.7719557195571956, "grad_norm": 0.1531971171201266, "learning_rate": 0.00018531557952423686, "loss": 0.0459, "step": 1046 }, { "epoch": 0.7726937269372693, "grad_norm": 0.2286666316387559, "learning_rate": 0.00018527074677347871, "loss": 0.0696, "step": 1047 }, { "epoch": 0.7734317343173431, "grad_norm": 0.19978895265677124, "learning_rate": 0.000185225851128262, "loss": 0.048, "step": 1048 }, { "epoch": 0.7741697416974169, "grad_norm": 0.17163817967387507, "learning_rate": 0.0001851808926217011, "loss": 0.0385, "step": 1049 }, { "epoch": 0.7749077490774908, "grad_norm": 0.40500231587908647, "learning_rate": 0.0001851358712869567, "loss": 0.1073, "step": 1050 }, { "epoch": 0.7756457564575646, "grad_norm": 0.2953526911146118, "learning_rate": 0.00018509078715723596, "loss": 0.0832, "step": 1051 }, { "epoch": 0.7763837638376384, "grad_norm": 0.4184179187181103, "learning_rate": 0.0001850456402657922, "loss": 0.0754, "step": 1052 }, { "epoch": 0.7771217712177122, "grad_norm": 0.25494272104156895, "learning_rate": 0.0001850004306459252, "loss": 0.0559, "step": 1053 }, { "epoch": 0.777859778597786, "grad_norm": 0.28471553254243537, "learning_rate": 0.00018495515833098086, "loss": 0.0692, "step": 1054 }, { "epoch": 0.7785977859778598, "grad_norm": 0.34756150929429436, "learning_rate": 0.0001849098233543513, "loss": 0.0782, "step": 1055 }, { "epoch": 0.7793357933579336, "grad_norm": 1.7080721203494516, "learning_rate": 0.00018486442574947511, "loss": 0.1151, "step": 1056 }, { "epoch": 0.7800738007380074, "grad_norm": 0.16499247984459506, "learning_rate": 0.00018481896554983679, "loss": 0.0326, "step": 1057 }, { "epoch": 0.7808118081180812, "grad_norm": 0.2171874327566465, "learning_rate": 0.0001847734427889671, "loss": 0.058, "step": 1058 }, { "epoch": 0.7815498154981549, "grad_norm": 0.34406530593652596, "learning_rate": 0.00018472785750044303, "loss": 0.0579, "step": 1059 }, { "epoch": 0.7822878228782287, "grad_norm": 0.1448400765962087, "learning_rate": 0.00018468220971788762, "loss": 0.036, "step": 1060 }, { "epoch": 0.7830258302583026, "grad_norm": 0.22493365609146976, "learning_rate": 0.00018463649947496994, "loss": 0.0818, "step": 1061 }, { "epoch": 0.7837638376383764, "grad_norm": 0.30350015766847654, "learning_rate": 0.00018459072680540527, "loss": 0.0485, "step": 1062 }, { "epoch": 0.7845018450184502, "grad_norm": 0.19492206904977633, "learning_rate": 0.00018454489174295482, "loss": 0.0428, "step": 1063 }, { "epoch": 0.785239852398524, "grad_norm": 0.2589888423745474, "learning_rate": 0.00018449899432142588, "loss": 0.0296, "step": 1064 }, { "epoch": 0.7859778597785978, "grad_norm": 0.3761093233278008, "learning_rate": 0.00018445303457467174, "loss": 0.1555, "step": 1065 }, { "epoch": 0.7867158671586716, "grad_norm": 0.4703017663986114, "learning_rate": 0.0001844070125365916, "loss": 0.0601, "step": 1066 }, { "epoch": 0.7874538745387454, "grad_norm": 0.2806322342238289, "learning_rate": 0.00018436092824113066, "loss": 0.1132, "step": 1067 }, { "epoch": 0.7881918819188192, "grad_norm": 0.3000822840582249, "learning_rate": 0.00018431478172228002, "loss": 0.0499, "step": 1068 }, { "epoch": 0.788929889298893, "grad_norm": 0.20270525968709618, "learning_rate": 0.00018426857301407672, "loss": 0.1086, "step": 1069 }, { "epoch": 0.7896678966789668, "grad_norm": 0.3668594463284011, "learning_rate": 0.00018422230215060355, "loss": 0.054, "step": 1070 }, { "epoch": 0.7904059040590405, "grad_norm": 0.19755086854812315, "learning_rate": 0.00018417596916598931, "loss": 0.0339, "step": 1071 }, { "epoch": 0.7911439114391144, "grad_norm": 0.41082427136461164, "learning_rate": 0.00018412957409440846, "loss": 0.0824, "step": 1072 }, { "epoch": 0.7918819188191882, "grad_norm": 0.20774836604041394, "learning_rate": 0.00018408311697008136, "loss": 0.0421, "step": 1073 }, { "epoch": 0.792619926199262, "grad_norm": 0.1898864017438984, "learning_rate": 0.0001840365978272741, "loss": 0.0381, "step": 1074 }, { "epoch": 0.7933579335793358, "grad_norm": 0.2647679010718246, "learning_rate": 0.00018399001670029854, "loss": 0.0685, "step": 1075 }, { "epoch": 0.7940959409594096, "grad_norm": 0.21431489547351823, "learning_rate": 0.0001839433736235122, "loss": 0.0764, "step": 1076 }, { "epoch": 0.7948339483394834, "grad_norm": 0.32069130792737044, "learning_rate": 0.00018389666863131838, "loss": 0.0673, "step": 1077 }, { "epoch": 0.7955719557195572, "grad_norm": 0.28119662149847635, "learning_rate": 0.00018384990175816598, "loss": 0.072, "step": 1078 }, { "epoch": 0.796309963099631, "grad_norm": 0.29997928014175357, "learning_rate": 0.00018380307303854953, "loss": 0.1155, "step": 1079 }, { "epoch": 0.7970479704797048, "grad_norm": 0.24809898900647895, "learning_rate": 0.00018375618250700927, "loss": 0.0852, "step": 1080 }, { "epoch": 0.7977859778597786, "grad_norm": 0.14190805308709695, "learning_rate": 0.00018370923019813096, "loss": 0.0346, "step": 1081 }, { "epoch": 0.7985239852398524, "grad_norm": 0.34886065357664253, "learning_rate": 0.00018366221614654588, "loss": 0.0788, "step": 1082 }, { "epoch": 0.7992619926199263, "grad_norm": 0.31107801903991317, "learning_rate": 0.00018361514038693099, "loss": 0.0774, "step": 1083 }, { "epoch": 0.8, "grad_norm": 0.2136116332718074, "learning_rate": 0.0001835680029540086, "loss": 0.0433, "step": 1084 }, { "epoch": 0.8007380073800738, "grad_norm": 0.39461785840562236, "learning_rate": 0.00018352080388254666, "loss": 0.0979, "step": 1085 }, { "epoch": 0.8014760147601476, "grad_norm": 0.3585632219108951, "learning_rate": 0.0001834735432073585, "loss": 0.1087, "step": 1086 }, { "epoch": 0.8022140221402214, "grad_norm": 0.36425353758390727, "learning_rate": 0.00018342622096330287, "loss": 0.0738, "step": 1087 }, { "epoch": 0.8029520295202952, "grad_norm": 0.2944913863069485, "learning_rate": 0.00018337883718528403, "loss": 0.0753, "step": 1088 }, { "epoch": 0.803690036900369, "grad_norm": 0.4333636633517539, "learning_rate": 0.0001833313919082515, "loss": 0.1077, "step": 1089 }, { "epoch": 0.8044280442804428, "grad_norm": 0.16706589371263433, "learning_rate": 0.00018328388516720027, "loss": 0.0355, "step": 1090 }, { "epoch": 0.8051660516605166, "grad_norm": 0.3855083680786975, "learning_rate": 0.0001832363169971706, "loss": 0.142, "step": 1091 }, { "epoch": 0.8059040590405904, "grad_norm": 0.4214911134361564, "learning_rate": 0.00018318868743324806, "loss": 0.0622, "step": 1092 }, { "epoch": 0.8066420664206642, "grad_norm": 0.21436032952439232, "learning_rate": 0.00018314099651056353, "loss": 0.0436, "step": 1093 }, { "epoch": 0.8073800738007381, "grad_norm": 0.20991422631539788, "learning_rate": 0.0001830932442642932, "loss": 0.0448, "step": 1094 }, { "epoch": 0.8081180811808119, "grad_norm": 0.2938241094826623, "learning_rate": 0.00018304543072965833, "loss": 0.0966, "step": 1095 }, { "epoch": 0.8088560885608856, "grad_norm": 0.35541934505292644, "learning_rate": 0.00018299755594192556, "loss": 0.0713, "step": 1096 }, { "epoch": 0.8095940959409594, "grad_norm": 0.37493151260693836, "learning_rate": 0.00018294961993640658, "loss": 0.0825, "step": 1097 }, { "epoch": 0.8103321033210332, "grad_norm": 0.1577590833578493, "learning_rate": 0.00018290162274845834, "loss": 0.0366, "step": 1098 }, { "epoch": 0.811070110701107, "grad_norm": 0.322271969568289, "learning_rate": 0.00018285356441348282, "loss": 0.0986, "step": 1099 }, { "epoch": 0.8118081180811808, "grad_norm": 0.22434377875637673, "learning_rate": 0.0001828054449669272, "loss": 0.0358, "step": 1100 }, { "epoch": 0.8125461254612546, "grad_norm": 0.2320579793872821, "learning_rate": 0.00018275726444428366, "loss": 0.0493, "step": 1101 }, { "epoch": 0.8132841328413284, "grad_norm": 0.22621679833968225, "learning_rate": 0.00018270902288108939, "loss": 0.0785, "step": 1102 }, { "epoch": 0.8140221402214022, "grad_norm": 0.36740432413171653, "learning_rate": 0.00018266072031292673, "loss": 0.0795, "step": 1103 }, { "epoch": 0.814760147601476, "grad_norm": 0.09901724642931124, "learning_rate": 0.00018261235677542295, "loss": 0.0231, "step": 1104 }, { "epoch": 0.8154981549815498, "grad_norm": 0.35058912520673935, "learning_rate": 0.00018256393230425027, "loss": 0.0518, "step": 1105 }, { "epoch": 0.8162361623616237, "grad_norm": 0.2155641234757761, "learning_rate": 0.00018251544693512588, "loss": 0.0542, "step": 1106 }, { "epoch": 0.8169741697416975, "grad_norm": 0.11744113297588464, "learning_rate": 0.00018246690070381188, "loss": 0.0323, "step": 1107 }, { "epoch": 0.8177121771217712, "grad_norm": 0.26489185649349445, "learning_rate": 0.00018241829364611524, "loss": 0.0619, "step": 1108 }, { "epoch": 0.818450184501845, "grad_norm": 0.15637652877483368, "learning_rate": 0.00018236962579788784, "loss": 0.0369, "step": 1109 }, { "epoch": 0.8191881918819188, "grad_norm": 0.19369997637383962, "learning_rate": 0.00018232089719502636, "loss": 0.0558, "step": 1110 }, { "epoch": 0.8199261992619926, "grad_norm": 0.2510783798578483, "learning_rate": 0.00018227210787347234, "loss": 0.0653, "step": 1111 }, { "epoch": 0.8206642066420664, "grad_norm": 0.42936504918075336, "learning_rate": 0.00018222325786921203, "loss": 0.0881, "step": 1112 }, { "epoch": 0.8214022140221402, "grad_norm": 0.35523272611554785, "learning_rate": 0.00018217434721827652, "loss": 0.085, "step": 1113 }, { "epoch": 0.822140221402214, "grad_norm": 0.149521772001882, "learning_rate": 0.00018212537595674156, "loss": 0.0371, "step": 1114 }, { "epoch": 0.8228782287822878, "grad_norm": 0.2542959702316622, "learning_rate": 0.00018207634412072764, "loss": 0.0467, "step": 1115 }, { "epoch": 0.8236162361623616, "grad_norm": 0.32537907339985783, "learning_rate": 0.00018202725174639993, "loss": 0.0616, "step": 1116 }, { "epoch": 0.8243542435424355, "grad_norm": 0.25331730396871477, "learning_rate": 0.0001819780988699683, "loss": 0.0726, "step": 1117 }, { "epoch": 0.8250922509225093, "grad_norm": 0.4758714785586844, "learning_rate": 0.0001819288855276871, "loss": 0.0827, "step": 1118 }, { "epoch": 0.825830258302583, "grad_norm": 0.18700248870572497, "learning_rate": 0.00018187961175585544, "loss": 0.0493, "step": 1119 }, { "epoch": 0.8265682656826568, "grad_norm": 0.5341052332234686, "learning_rate": 0.0001818302775908169, "loss": 0.0498, "step": 1120 }, { "epoch": 0.8273062730627306, "grad_norm": 0.2920584196807437, "learning_rate": 0.0001817808830689597, "loss": 0.0633, "step": 1121 }, { "epoch": 0.8280442804428044, "grad_norm": 0.3477614209679698, "learning_rate": 0.00018173142822671646, "loss": 0.0766, "step": 1122 }, { "epoch": 0.8287822878228782, "grad_norm": 0.5455207599806655, "learning_rate": 0.00018168191310056434, "loss": 0.0652, "step": 1123 }, { "epoch": 0.829520295202952, "grad_norm": 0.4063388052898803, "learning_rate": 0.000181632337727025, "loss": 0.0964, "step": 1124 }, { "epoch": 0.8302583025830258, "grad_norm": 0.815236962580914, "learning_rate": 0.00018158270214266455, "loss": 0.1781, "step": 1125 }, { "epoch": 0.8309963099630996, "grad_norm": 0.22338651155325112, "learning_rate": 0.00018153300638409342, "loss": 0.0424, "step": 1126 }, { "epoch": 0.8317343173431734, "grad_norm": 0.3907674397693053, "learning_rate": 0.0001814832504879665, "loss": 0.0984, "step": 1127 }, { "epoch": 0.8324723247232473, "grad_norm": 0.17052089674747467, "learning_rate": 0.00018143343449098298, "loss": 0.0511, "step": 1128 }, { "epoch": 0.8332103321033211, "grad_norm": 0.20211094411730854, "learning_rate": 0.00018138355842988645, "loss": 0.0422, "step": 1129 }, { "epoch": 0.8339483394833949, "grad_norm": 0.2304939826774484, "learning_rate": 0.00018133362234146473, "loss": 0.0345, "step": 1130 }, { "epoch": 0.8346863468634687, "grad_norm": 0.17950890168437608, "learning_rate": 0.00018128362626255, "loss": 0.0582, "step": 1131 }, { "epoch": 0.8354243542435424, "grad_norm": 0.44260600570194597, "learning_rate": 0.0001812335702300186, "loss": 0.0744, "step": 1132 }, { "epoch": 0.8361623616236162, "grad_norm": 0.25910296373425235, "learning_rate": 0.00018118345428079114, "loss": 0.0637, "step": 1133 }, { "epoch": 0.83690036900369, "grad_norm": 0.9459543310156328, "learning_rate": 0.00018113327845183244, "loss": 0.1557, "step": 1134 }, { "epoch": 0.8376383763837638, "grad_norm": 0.2688560339942466, "learning_rate": 0.0001810830427801514, "loss": 0.0511, "step": 1135 }, { "epoch": 0.8383763837638376, "grad_norm": 0.25001425437753005, "learning_rate": 0.00018103274730280115, "loss": 0.0867, "step": 1136 }, { "epoch": 0.8391143911439114, "grad_norm": 0.3486981684810533, "learning_rate": 0.00018098239205687893, "loss": 0.044, "step": 1137 }, { "epoch": 0.8398523985239852, "grad_norm": 0.18059371970562396, "learning_rate": 0.000180931977079526, "loss": 0.0404, "step": 1138 }, { "epoch": 0.8405904059040591, "grad_norm": 0.24405777191799624, "learning_rate": 0.00018088150240792768, "loss": 0.0556, "step": 1139 }, { "epoch": 0.8413284132841329, "grad_norm": 0.5121910563228455, "learning_rate": 0.00018083096807931342, "loss": 0.0688, "step": 1140 }, { "epoch": 0.8420664206642067, "grad_norm": 0.3830081909194935, "learning_rate": 0.00018078037413095656, "loss": 0.1172, "step": 1141 }, { "epoch": 0.8428044280442805, "grad_norm": 0.38519991507740986, "learning_rate": 0.00018072972060017447, "loss": 0.0694, "step": 1142 }, { "epoch": 0.8435424354243543, "grad_norm": 0.2769860590043756, "learning_rate": 0.00018067900752432846, "loss": 0.0578, "step": 1143 }, { "epoch": 0.844280442804428, "grad_norm": 0.40940309839774763, "learning_rate": 0.00018062823494082375, "loss": 0.0552, "step": 1144 }, { "epoch": 0.8450184501845018, "grad_norm": 0.3379293593421777, "learning_rate": 0.00018057740288710946, "loss": 0.0626, "step": 1145 }, { "epoch": 0.8457564575645756, "grad_norm": 0.20174080232656422, "learning_rate": 0.00018052651140067856, "loss": 0.0562, "step": 1146 }, { "epoch": 0.8464944649446494, "grad_norm": 0.1437465576555623, "learning_rate": 0.00018047556051906786, "loss": 0.0277, "step": 1147 }, { "epoch": 0.8472324723247232, "grad_norm": 0.2254312458556272, "learning_rate": 0.00018042455027985802, "loss": 0.0511, "step": 1148 }, { "epoch": 0.847970479704797, "grad_norm": 0.4827777264185114, "learning_rate": 0.00018037348072067345, "loss": 0.0638, "step": 1149 }, { "epoch": 0.8487084870848709, "grad_norm": 0.37197317173824546, "learning_rate": 0.00018032235187918224, "loss": 0.0852, "step": 1150 }, { "epoch": 0.8494464944649447, "grad_norm": 0.34950929651388984, "learning_rate": 0.00018027116379309638, "loss": 0.0941, "step": 1151 }, { "epoch": 0.8501845018450185, "grad_norm": 0.585689971503132, "learning_rate": 0.00018021991650017137, "loss": 0.1742, "step": 1152 }, { "epoch": 0.8509225092250923, "grad_norm": 0.2929786432639156, "learning_rate": 0.0001801686100382065, "loss": 0.0958, "step": 1153 }, { "epoch": 0.8516605166051661, "grad_norm": 0.23835706243431967, "learning_rate": 0.0001801172444450447, "loss": 0.0528, "step": 1154 }, { "epoch": 0.8523985239852399, "grad_norm": 0.38557701235769065, "learning_rate": 0.00018006581975857244, "loss": 0.0719, "step": 1155 }, { "epoch": 0.8531365313653136, "grad_norm": 0.314459757747131, "learning_rate": 0.0001800143360167198, "loss": 0.0607, "step": 1156 }, { "epoch": 0.8538745387453874, "grad_norm": 0.2332054038532555, "learning_rate": 0.00017996279325746051, "loss": 0.0507, "step": 1157 }, { "epoch": 0.8546125461254612, "grad_norm": 0.30576039762974844, "learning_rate": 0.00017991119151881168, "loss": 0.0717, "step": 1158 }, { "epoch": 0.855350553505535, "grad_norm": 0.30243444035792794, "learning_rate": 0.00017985953083883406, "loss": 0.0426, "step": 1159 }, { "epoch": 0.8560885608856088, "grad_norm": 0.5541204417496758, "learning_rate": 0.00017980781125563174, "loss": 0.0963, "step": 1160 }, { "epoch": 0.8568265682656827, "grad_norm": 0.45629556734558024, "learning_rate": 0.0001797560328073524, "loss": 0.0746, "step": 1161 }, { "epoch": 0.8575645756457565, "grad_norm": 0.18999892662803433, "learning_rate": 0.00017970419553218703, "loss": 0.0316, "step": 1162 }, { "epoch": 0.8583025830258303, "grad_norm": 0.3874618297283181, "learning_rate": 0.00017965229946837009, "loss": 0.1054, "step": 1163 }, { "epoch": 0.8590405904059041, "grad_norm": 0.5319578430153784, "learning_rate": 0.0001796003446541793, "loss": 0.1083, "step": 1164 }, { "epoch": 0.8597785977859779, "grad_norm": 0.5015569780836241, "learning_rate": 0.00017954833112793583, "loss": 0.0961, "step": 1165 }, { "epoch": 0.8605166051660517, "grad_norm": 0.31319717013993714, "learning_rate": 0.00017949625892800403, "loss": 0.1169, "step": 1166 }, { "epoch": 0.8612546125461255, "grad_norm": 0.23968993920313086, "learning_rate": 0.00017944412809279168, "loss": 0.0487, "step": 1167 }, { "epoch": 0.8619926199261992, "grad_norm": 0.19302959715693724, "learning_rate": 0.00017939193866074965, "loss": 0.0375, "step": 1168 }, { "epoch": 0.862730627306273, "grad_norm": 0.3628518437481467, "learning_rate": 0.00017933969067037214, "loss": 0.0723, "step": 1169 }, { "epoch": 0.8634686346863468, "grad_norm": 0.42962701405089787, "learning_rate": 0.00017928738416019653, "loss": 0.0457, "step": 1170 }, { "epoch": 0.8642066420664206, "grad_norm": 0.23912845207067301, "learning_rate": 0.00017923501916880326, "loss": 0.0504, "step": 1171 }, { "epoch": 0.8649446494464945, "grad_norm": 0.4410767565114343, "learning_rate": 0.00017918259573481606, "loss": 0.0973, "step": 1172 }, { "epoch": 0.8656826568265683, "grad_norm": 0.2978577581384058, "learning_rate": 0.00017913011389690165, "loss": 0.0592, "step": 1173 }, { "epoch": 0.8664206642066421, "grad_norm": 0.19825224708669864, "learning_rate": 0.00017907757369376985, "loss": 0.0237, "step": 1174 }, { "epoch": 0.8671586715867159, "grad_norm": 0.2810179200662466, "learning_rate": 0.00017902497516417363, "loss": 0.0647, "step": 1175 }, { "epoch": 0.8678966789667897, "grad_norm": 0.32777585028522926, "learning_rate": 0.0001789723183469088, "loss": 0.0726, "step": 1176 }, { "epoch": 0.8686346863468635, "grad_norm": 0.5788648018452449, "learning_rate": 0.00017891960328081434, "loss": 0.0719, "step": 1177 }, { "epoch": 0.8693726937269373, "grad_norm": 0.8795268601452537, "learning_rate": 0.00017886683000477204, "loss": 0.0972, "step": 1178 }, { "epoch": 0.870110701107011, "grad_norm": 0.3872141542440756, "learning_rate": 0.00017881399855770676, "loss": 0.0835, "step": 1179 }, { "epoch": 0.8708487084870848, "grad_norm": 0.23428608331715822, "learning_rate": 0.00017876110897858616, "loss": 0.0637, "step": 1180 }, { "epoch": 0.8715867158671586, "grad_norm": 0.26119204781978694, "learning_rate": 0.00017870816130642085, "loss": 0.046, "step": 1181 }, { "epoch": 0.8723247232472324, "grad_norm": 0.17350726446824744, "learning_rate": 0.00017865515558026428, "loss": 0.0386, "step": 1182 }, { "epoch": 0.8730627306273063, "grad_norm": 0.4841965021971355, "learning_rate": 0.00017860209183921262, "loss": 0.0805, "step": 1183 }, { "epoch": 0.8738007380073801, "grad_norm": 0.27112965577447995, "learning_rate": 0.000178548970122405, "loss": 0.0649, "step": 1184 }, { "epoch": 0.8745387453874539, "grad_norm": 0.2872652006760143, "learning_rate": 0.00017849579046902317, "loss": 0.0726, "step": 1185 }, { "epoch": 0.8752767527675277, "grad_norm": 0.2661390867851809, "learning_rate": 0.00017844255291829167, "loss": 0.0839, "step": 1186 }, { "epoch": 0.8760147601476015, "grad_norm": 0.5265386862387067, "learning_rate": 0.0001783892575094778, "loss": 0.0984, "step": 1187 }, { "epoch": 0.8767527675276753, "grad_norm": 0.3033901695999675, "learning_rate": 0.00017833590428189137, "loss": 0.0523, "step": 1188 }, { "epoch": 0.8774907749077491, "grad_norm": 0.22060607766923, "learning_rate": 0.00017828249327488503, "loss": 0.0796, "step": 1189 }, { "epoch": 0.8782287822878229, "grad_norm": 0.20647093467003433, "learning_rate": 0.00017822902452785394, "loss": 0.0425, "step": 1190 }, { "epoch": 0.8789667896678967, "grad_norm": 0.46215994248673836, "learning_rate": 0.00017817549808023586, "loss": 0.0503, "step": 1191 }, { "epoch": 0.8797047970479704, "grad_norm": 0.2814473778556937, "learning_rate": 0.0001781219139715111, "loss": 0.0477, "step": 1192 }, { "epoch": 0.8804428044280442, "grad_norm": 0.22381072392149004, "learning_rate": 0.00017806827224120254, "loss": 0.0614, "step": 1193 }, { "epoch": 0.8811808118081181, "grad_norm": 0.22398536556171852, "learning_rate": 0.00017801457292887553, "loss": 0.0543, "step": 1194 }, { "epoch": 0.8819188191881919, "grad_norm": 0.2568739156112108, "learning_rate": 0.0001779608160741379, "loss": 0.07, "step": 1195 }, { "epoch": 0.8826568265682657, "grad_norm": 0.18580120902773178, "learning_rate": 0.0001779070017166399, "loss": 0.0582, "step": 1196 }, { "epoch": 0.8833948339483395, "grad_norm": 0.27306898647330125, "learning_rate": 0.00017785312989607426, "loss": 0.0387, "step": 1197 }, { "epoch": 0.8841328413284133, "grad_norm": 0.5238314716848871, "learning_rate": 0.000177799200652176, "loss": 0.0671, "step": 1198 }, { "epoch": 0.8848708487084871, "grad_norm": 0.21502348752057762, "learning_rate": 0.00017774521402472257, "loss": 0.058, "step": 1199 }, { "epoch": 0.8856088560885609, "grad_norm": 0.2970216673073859, "learning_rate": 0.00017769117005353376, "loss": 0.0577, "step": 1200 }, { "epoch": 0.8863468634686347, "grad_norm": 0.20361935117947103, "learning_rate": 0.00017763706877847152, "loss": 0.0712, "step": 1201 }, { "epoch": 0.8870848708487085, "grad_norm": 0.24131654153144458, "learning_rate": 0.0001775829102394402, "loss": 0.0504, "step": 1202 }, { "epoch": 0.8878228782287823, "grad_norm": 0.2161492565559766, "learning_rate": 0.0001775286944763864, "loss": 0.1357, "step": 1203 }, { "epoch": 0.888560885608856, "grad_norm": 0.20096736229994708, "learning_rate": 0.00017747442152929883, "loss": 0.0474, "step": 1204 }, { "epoch": 0.8892988929889298, "grad_norm": 0.24978526039090715, "learning_rate": 0.00017742009143820842, "loss": 0.0521, "step": 1205 }, { "epoch": 0.8900369003690037, "grad_norm": 0.3729676966818513, "learning_rate": 0.00017736570424318825, "loss": 0.1277, "step": 1206 }, { "epoch": 0.8907749077490775, "grad_norm": 0.19615069678281113, "learning_rate": 0.00017731125998435355, "loss": 0.0436, "step": 1207 }, { "epoch": 0.8915129151291513, "grad_norm": 0.1729937310564577, "learning_rate": 0.00017725675870186157, "loss": 0.0315, "step": 1208 }, { "epoch": 0.8922509225092251, "grad_norm": 0.20162060135249973, "learning_rate": 0.0001772022004359117, "loss": 0.0743, "step": 1209 }, { "epoch": 0.8929889298892989, "grad_norm": 0.4715280068515938, "learning_rate": 0.00017714758522674532, "loss": 0.052, "step": 1210 }, { "epoch": 0.8937269372693727, "grad_norm": 0.24689182764702503, "learning_rate": 0.0001770929131146458, "loss": 0.0377, "step": 1211 }, { "epoch": 0.8944649446494465, "grad_norm": 0.26084277854594384, "learning_rate": 0.00017703818413993845, "loss": 0.068, "step": 1212 }, { "epoch": 0.8952029520295203, "grad_norm": 0.20870291206275118, "learning_rate": 0.00017698339834299061, "loss": 0.0393, "step": 1213 }, { "epoch": 0.8959409594095941, "grad_norm": 0.195752893969912, "learning_rate": 0.00017692855576421153, "loss": 0.0483, "step": 1214 }, { "epoch": 0.8966789667896679, "grad_norm": 0.2659738957966421, "learning_rate": 0.00017687365644405222, "loss": 0.0484, "step": 1215 }, { "epoch": 0.8974169741697416, "grad_norm": 0.22161802090846644, "learning_rate": 0.0001768187004230056, "loss": 0.0389, "step": 1216 }, { "epoch": 0.8981549815498155, "grad_norm": 0.09297454751474475, "learning_rate": 0.00017676368774160648, "loss": 0.0198, "step": 1217 }, { "epoch": 0.8988929889298893, "grad_norm": 0.29131282373837886, "learning_rate": 0.0001767086184404314, "loss": 0.0512, "step": 1218 }, { "epoch": 0.8996309963099631, "grad_norm": 0.17986968262769837, "learning_rate": 0.0001766534925600987, "loss": 0.0435, "step": 1219 }, { "epoch": 0.9003690036900369, "grad_norm": 0.2881095127344619, "learning_rate": 0.00017659831014126839, "loss": 0.043, "step": 1220 }, { "epoch": 0.9011070110701107, "grad_norm": 0.2444214319735383, "learning_rate": 0.00017654307122464219, "loss": 0.0683, "step": 1221 }, { "epoch": 0.9018450184501845, "grad_norm": 0.1717047939219709, "learning_rate": 0.0001764877758509636, "loss": 0.0445, "step": 1222 }, { "epoch": 0.9025830258302583, "grad_norm": 0.24475404202326645, "learning_rate": 0.0001764324240610176, "loss": 0.0595, "step": 1223 }, { "epoch": 0.9033210332103321, "grad_norm": 0.3006725129486126, "learning_rate": 0.00017637701589563092, "loss": 0.0803, "step": 1224 }, { "epoch": 0.9040590405904059, "grad_norm": 0.2674895435884546, "learning_rate": 0.00017632155139567178, "loss": 0.0686, "step": 1225 }, { "epoch": 0.9047970479704797, "grad_norm": 0.17997773190881325, "learning_rate": 0.00017626603060205, "loss": 0.0328, "step": 1226 }, { "epoch": 0.9055350553505535, "grad_norm": 0.40528410395896863, "learning_rate": 0.0001762104535557169, "loss": 0.0715, "step": 1227 }, { "epoch": 0.9062730627306274, "grad_norm": 0.27281091693823767, "learning_rate": 0.0001761548202976653, "loss": 0.0585, "step": 1228 }, { "epoch": 0.9070110701107011, "grad_norm": 0.2343512043253985, "learning_rate": 0.00017609913086892947, "loss": 0.0798, "step": 1229 }, { "epoch": 0.9077490774907749, "grad_norm": 0.3533673726441194, "learning_rate": 0.00017604338531058516, "loss": 0.0603, "step": 1230 }, { "epoch": 0.9084870848708487, "grad_norm": 0.3606371538106186, "learning_rate": 0.00017598758366374945, "loss": 0.0635, "step": 1231 }, { "epoch": 0.9092250922509225, "grad_norm": 0.24528568549705324, "learning_rate": 0.00017593172596958083, "loss": 0.0721, "step": 1232 }, { "epoch": 0.9099630996309963, "grad_norm": 0.1475592913032354, "learning_rate": 0.0001758758122692791, "loss": 0.0407, "step": 1233 }, { "epoch": 0.9107011070110701, "grad_norm": 0.23560804066154886, "learning_rate": 0.0001758198426040854, "loss": 0.061, "step": 1234 }, { "epoch": 0.9114391143911439, "grad_norm": 0.1684548576979405, "learning_rate": 0.00017576381701528212, "loss": 0.0453, "step": 1235 }, { "epoch": 0.9121771217712177, "grad_norm": 0.36355903564520237, "learning_rate": 0.000175707735544193, "loss": 0.061, "step": 1236 }, { "epoch": 0.9129151291512915, "grad_norm": 0.21143010783556482, "learning_rate": 0.0001756515982321828, "loss": 0.044, "step": 1237 }, { "epoch": 0.9136531365313653, "grad_norm": 0.37455881515603817, "learning_rate": 0.00017559540512065763, "loss": 0.0448, "step": 1238 }, { "epoch": 0.9143911439114392, "grad_norm": 0.43647769069843395, "learning_rate": 0.00017553915625106474, "loss": 0.0796, "step": 1239 }, { "epoch": 0.915129151291513, "grad_norm": 0.1266354228200273, "learning_rate": 0.00017548285166489244, "loss": 0.0312, "step": 1240 }, { "epoch": 0.9158671586715867, "grad_norm": 0.32314426577856414, "learning_rate": 0.0001754264914036702, "loss": 0.0731, "step": 1241 }, { "epoch": 0.9166051660516605, "grad_norm": 0.31779450434778544, "learning_rate": 0.00017537007550896849, "loss": 0.1014, "step": 1242 }, { "epoch": 0.9173431734317343, "grad_norm": 0.27795890042694427, "learning_rate": 0.00017531360402239888, "loss": 0.0542, "step": 1243 }, { "epoch": 0.9180811808118081, "grad_norm": 0.1419775753762223, "learning_rate": 0.00017525707698561385, "loss": 0.0393, "step": 1244 }, { "epoch": 0.9188191881918819, "grad_norm": 0.21441678901098743, "learning_rate": 0.000175200494440307, "loss": 0.0748, "step": 1245 }, { "epoch": 0.9195571955719557, "grad_norm": 0.2753365301378536, "learning_rate": 0.00017514385642821277, "loss": 0.0745, "step": 1246 }, { "epoch": 0.9202952029520295, "grad_norm": 0.27178370466525714, "learning_rate": 0.00017508716299110652, "loss": 0.0467, "step": 1247 }, { "epoch": 0.9210332103321033, "grad_norm": 0.23600431810209405, "learning_rate": 0.00017503041417080451, "loss": 0.0606, "step": 1248 }, { "epoch": 0.9217712177121771, "grad_norm": 0.3625932878260121, "learning_rate": 0.00017497361000916382, "loss": 0.1269, "step": 1249 }, { "epoch": 0.922509225092251, "grad_norm": 0.26635214508787497, "learning_rate": 0.00017491675054808237, "loss": 0.0305, "step": 1250 }, { "epoch": 0.9232472324723248, "grad_norm": 0.19608739228317695, "learning_rate": 0.00017485983582949893, "loss": 0.0671, "step": 1251 }, { "epoch": 0.9239852398523986, "grad_norm": 0.23956308398420686, "learning_rate": 0.00017480286589539287, "loss": 0.1228, "step": 1252 }, { "epoch": 0.9247232472324723, "grad_norm": 0.12924825505708312, "learning_rate": 0.00017474584078778447, "loss": 0.037, "step": 1253 }, { "epoch": 0.9254612546125461, "grad_norm": 0.5994042137172557, "learning_rate": 0.00017468876054873455, "loss": 0.1158, "step": 1254 }, { "epoch": 0.9261992619926199, "grad_norm": 0.2705776468825327, "learning_rate": 0.0001746316252203447, "loss": 0.0592, "step": 1255 }, { "epoch": 0.9269372693726937, "grad_norm": 0.31335875498868193, "learning_rate": 0.0001745744348447571, "loss": 0.0781, "step": 1256 }, { "epoch": 0.9276752767527675, "grad_norm": 0.17862968189940692, "learning_rate": 0.00017451718946415455, "loss": 0.0492, "step": 1257 }, { "epoch": 0.9284132841328413, "grad_norm": 0.15898614773684608, "learning_rate": 0.00017445988912076035, "loss": 0.032, "step": 1258 }, { "epoch": 0.9291512915129151, "grad_norm": 0.20574366858696605, "learning_rate": 0.00017440253385683844, "loss": 0.0422, "step": 1259 }, { "epoch": 0.9298892988929889, "grad_norm": 0.33700674438306194, "learning_rate": 0.00017434512371469326, "loss": 0.0505, "step": 1260 }, { "epoch": 0.9306273062730628, "grad_norm": 0.36533488085121657, "learning_rate": 0.00017428765873666962, "loss": 0.0692, "step": 1261 }, { "epoch": 0.9313653136531366, "grad_norm": 0.2156418665146771, "learning_rate": 0.00017423013896515288, "loss": 0.0223, "step": 1262 }, { "epoch": 0.9321033210332104, "grad_norm": 0.1808345957804376, "learning_rate": 0.00017417256444256883, "loss": 0.0342, "step": 1263 }, { "epoch": 0.9328413284132842, "grad_norm": 0.2058823086194036, "learning_rate": 0.00017411493521138352, "loss": 0.067, "step": 1264 }, { "epoch": 0.933579335793358, "grad_norm": 0.17229949750967213, "learning_rate": 0.00017405725131410348, "loss": 0.0565, "step": 1265 }, { "epoch": 0.9343173431734317, "grad_norm": 0.31662938033545274, "learning_rate": 0.0001739995127932755, "loss": 0.079, "step": 1266 }, { "epoch": 0.9350553505535055, "grad_norm": 0.2835372654881873, "learning_rate": 0.00017394171969148666, "loss": 0.0616, "step": 1267 }, { "epoch": 0.9357933579335793, "grad_norm": 0.2558626399014762, "learning_rate": 0.00017388387205136428, "loss": 0.0623, "step": 1268 }, { "epoch": 0.9365313653136531, "grad_norm": 0.3415863095346416, "learning_rate": 0.00017382596991557603, "loss": 0.0572, "step": 1269 }, { "epoch": 0.9372693726937269, "grad_norm": 0.1301144647248, "learning_rate": 0.0001737680133268296, "loss": 0.0291, "step": 1270 }, { "epoch": 0.9380073800738007, "grad_norm": 0.17015592873383917, "learning_rate": 0.00017371000232787296, "loss": 0.0396, "step": 1271 }, { "epoch": 0.9387453874538746, "grad_norm": 0.33302938783730907, "learning_rate": 0.00017365193696149413, "loss": 0.0998, "step": 1272 }, { "epoch": 0.9394833948339484, "grad_norm": 0.16440227962391568, "learning_rate": 0.00017359381727052132, "loss": 0.0503, "step": 1273 }, { "epoch": 0.9402214022140222, "grad_norm": 0.1433467229537589, "learning_rate": 0.0001735356432978228, "loss": 0.0283, "step": 1274 }, { "epoch": 0.940959409594096, "grad_norm": 0.24880079642533073, "learning_rate": 0.00017347741508630672, "loss": 0.0641, "step": 1275 }, { "epoch": 0.9416974169741698, "grad_norm": 0.19574781043482417, "learning_rate": 0.0001734191326789215, "loss": 0.0655, "step": 1276 }, { "epoch": 0.9424354243542435, "grad_norm": 0.35391440420221304, "learning_rate": 0.00017336079611865533, "loss": 0.085, "step": 1277 }, { "epoch": 0.9431734317343173, "grad_norm": 0.2510809275695405, "learning_rate": 0.0001733024054485364, "loss": 0.039, "step": 1278 }, { "epoch": 0.9439114391143911, "grad_norm": 0.21866586131127452, "learning_rate": 0.0001732439607116328, "loss": 0.0735, "step": 1279 }, { "epoch": 0.9446494464944649, "grad_norm": 0.2751871352551382, "learning_rate": 0.00017318546195105254, "loss": 0.0536, "step": 1280 }, { "epoch": 0.9453874538745387, "grad_norm": 0.08286229899160635, "learning_rate": 0.00017312690920994345, "loss": 0.019, "step": 1281 }, { "epoch": 0.9461254612546125, "grad_norm": 0.1431913304416073, "learning_rate": 0.00017306830253149317, "loss": 0.0394, "step": 1282 }, { "epoch": 0.9468634686346864, "grad_norm": 0.20091782771669253, "learning_rate": 0.00017300964195892917, "loss": 0.041, "step": 1283 }, { "epoch": 0.9476014760147602, "grad_norm": 0.36691439689554245, "learning_rate": 0.00017295092753551858, "loss": 0.0827, "step": 1284 }, { "epoch": 0.948339483394834, "grad_norm": 0.23948185233020727, "learning_rate": 0.00017289215930456833, "loss": 0.0444, "step": 1285 }, { "epoch": 0.9490774907749078, "grad_norm": 0.3058793156645788, "learning_rate": 0.000172833337309425, "loss": 0.0395, "step": 1286 }, { "epoch": 0.9498154981549816, "grad_norm": 0.3330739711462121, "learning_rate": 0.00017277446159347487, "loss": 0.0466, "step": 1287 }, { "epoch": 0.9505535055350554, "grad_norm": 0.3429347760437051, "learning_rate": 0.00017271553220014373, "loss": 0.0509, "step": 1288 }, { "epoch": 0.9512915129151291, "grad_norm": 0.18395865756653731, "learning_rate": 0.00017265654917289708, "loss": 0.0443, "step": 1289 }, { "epoch": 0.9520295202952029, "grad_norm": 0.2569361558155463, "learning_rate": 0.00017259751255523998, "loss": 0.0641, "step": 1290 }, { "epoch": 0.9527675276752767, "grad_norm": 0.15279802080956062, "learning_rate": 0.00017253842239071693, "loss": 0.0259, "step": 1291 }, { "epoch": 0.9535055350553505, "grad_norm": 0.4780389924592629, "learning_rate": 0.000172479278722912, "loss": 0.0742, "step": 1292 }, { "epoch": 0.9542435424354243, "grad_norm": 0.16337344230629539, "learning_rate": 0.0001724200815954487, "loss": 0.0568, "step": 1293 }, { "epoch": 0.9549815498154982, "grad_norm": 0.20664957795158223, "learning_rate": 0.00017236083105198993, "loss": 0.051, "step": 1294 }, { "epoch": 0.955719557195572, "grad_norm": 0.3570693747022546, "learning_rate": 0.00017230152713623804, "loss": 0.072, "step": 1295 }, { "epoch": 0.9564575645756458, "grad_norm": 0.3249578696573, "learning_rate": 0.00017224216989193474, "loss": 0.0638, "step": 1296 }, { "epoch": 0.9571955719557196, "grad_norm": 0.26836152476135094, "learning_rate": 0.0001721827593628611, "loss": 0.0631, "step": 1297 }, { "epoch": 0.9579335793357934, "grad_norm": 0.147153869946955, "learning_rate": 0.0001721232955928374, "loss": 0.0367, "step": 1298 }, { "epoch": 0.9586715867158672, "grad_norm": 0.2052958473425218, "learning_rate": 0.0001720637786257233, "loss": 0.0596, "step": 1299 }, { "epoch": 0.959409594095941, "grad_norm": 0.27732676296946934, "learning_rate": 0.00017200420850541762, "loss": 0.0542, "step": 1300 }, { "epoch": 0.9601476014760147, "grad_norm": 0.19295788200418468, "learning_rate": 0.0001719445852758584, "loss": 0.0445, "step": 1301 }, { "epoch": 0.9608856088560885, "grad_norm": 0.16105331497825626, "learning_rate": 0.00017188490898102288, "loss": 0.0406, "step": 1302 }, { "epoch": 0.9616236162361623, "grad_norm": 0.17636253223683382, "learning_rate": 0.00017182517966492743, "loss": 0.0497, "step": 1303 }, { "epoch": 0.9623616236162361, "grad_norm": 0.3007770629804012, "learning_rate": 0.0001717653973716275, "loss": 0.0549, "step": 1304 }, { "epoch": 0.9630996309963099, "grad_norm": 0.20998055506998456, "learning_rate": 0.00017170556214521766, "loss": 0.0486, "step": 1305 }, { "epoch": 0.9638376383763838, "grad_norm": 0.17863103088722823, "learning_rate": 0.00017164567402983152, "loss": 0.0405, "step": 1306 }, { "epoch": 0.9645756457564576, "grad_norm": 0.20563959964938597, "learning_rate": 0.00017158573306964164, "loss": 0.0546, "step": 1307 }, { "epoch": 0.9653136531365314, "grad_norm": 0.258427060476488, "learning_rate": 0.0001715257393088596, "loss": 0.0402, "step": 1308 }, { "epoch": 0.9660516605166052, "grad_norm": 0.3016793512318396, "learning_rate": 0.00017146569279173594, "loss": 0.0731, "step": 1309 }, { "epoch": 0.966789667896679, "grad_norm": 0.26213132677864287, "learning_rate": 0.00017140559356256007, "loss": 0.0922, "step": 1310 }, { "epoch": 0.9675276752767528, "grad_norm": 0.231439776283776, "learning_rate": 0.00017134544166566036, "loss": 0.0564, "step": 1311 }, { "epoch": 0.9682656826568266, "grad_norm": 0.14408366541416484, "learning_rate": 0.0001712852371454039, "loss": 0.0323, "step": 1312 }, { "epoch": 0.9690036900369003, "grad_norm": 0.22839808777531623, "learning_rate": 0.00017122498004619672, "loss": 0.0478, "step": 1313 }, { "epoch": 0.9697416974169741, "grad_norm": 0.2081441459569531, "learning_rate": 0.00017116467041248355, "loss": 0.058, "step": 1314 }, { "epoch": 0.9704797047970479, "grad_norm": 0.22062580928613215, "learning_rate": 0.00017110430828874788, "loss": 0.0502, "step": 1315 }, { "epoch": 0.9712177121771217, "grad_norm": 0.31362341754682654, "learning_rate": 0.00017104389371951198, "loss": 0.0499, "step": 1316 }, { "epoch": 0.9719557195571956, "grad_norm": 0.32349549034906344, "learning_rate": 0.00017098342674933673, "loss": 0.049, "step": 1317 }, { "epoch": 0.9726937269372694, "grad_norm": 0.14619442813257888, "learning_rate": 0.00017092290742282167, "loss": 0.0592, "step": 1318 }, { "epoch": 0.9734317343173432, "grad_norm": 0.19246867448255853, "learning_rate": 0.000170862335784605, "loss": 0.0321, "step": 1319 }, { "epoch": 0.974169741697417, "grad_norm": 0.27774463662184573, "learning_rate": 0.00017080171187936345, "loss": 0.0365, "step": 1320 }, { "epoch": 0.9749077490774908, "grad_norm": 0.3382913573654017, "learning_rate": 0.00017074103575181232, "loss": 0.1864, "step": 1321 }, { "epoch": 0.9756457564575646, "grad_norm": 0.1840529128903544, "learning_rate": 0.0001706803074467055, "loss": 0.029, "step": 1322 }, { "epoch": 0.9763837638376384, "grad_norm": 0.22011489236370718, "learning_rate": 0.00017061952700883523, "loss": 0.0581, "step": 1323 }, { "epoch": 0.9771217712177122, "grad_norm": 0.24010207967065264, "learning_rate": 0.00017055869448303232, "loss": 0.0568, "step": 1324 }, { "epoch": 0.977859778597786, "grad_norm": 0.15899513166325246, "learning_rate": 0.0001704978099141659, "loss": 0.0519, "step": 1325 }, { "epoch": 0.9785977859778597, "grad_norm": 0.33439326276356707, "learning_rate": 0.00017043687334714362, "loss": 0.0598, "step": 1326 }, { "epoch": 0.9793357933579335, "grad_norm": 0.37995043671455303, "learning_rate": 0.00017037588482691135, "loss": 0.0698, "step": 1327 }, { "epoch": 0.9800738007380074, "grad_norm": 0.23905185518586045, "learning_rate": 0.0001703148443984533, "loss": 0.0393, "step": 1328 }, { "epoch": 0.9808118081180812, "grad_norm": 0.41561386098287945, "learning_rate": 0.00017025375210679209, "loss": 0.0698, "step": 1329 }, { "epoch": 0.981549815498155, "grad_norm": 0.1905641309944041, "learning_rate": 0.00017019260799698842, "loss": 0.0699, "step": 1330 }, { "epoch": 0.9822878228782288, "grad_norm": 0.15443373494880092, "learning_rate": 0.00017013141211414133, "loss": 0.045, "step": 1331 }, { "epoch": 0.9830258302583026, "grad_norm": 0.15934323758407923, "learning_rate": 0.00017007016450338802, "loss": 0.0497, "step": 1332 }, { "epoch": 0.9837638376383764, "grad_norm": 0.2521743078852868, "learning_rate": 0.0001700088652099038, "loss": 0.0427, "step": 1333 }, { "epoch": 0.9845018450184502, "grad_norm": 0.277143349057529, "learning_rate": 0.0001699475142789022, "loss": 0.0697, "step": 1334 }, { "epoch": 0.985239852398524, "grad_norm": 0.1635996197204461, "learning_rate": 0.0001698861117556347, "loss": 0.0407, "step": 1335 }, { "epoch": 0.9859778597785978, "grad_norm": 0.2504187802579673, "learning_rate": 0.00016982465768539088, "loss": 0.0537, "step": 1336 }, { "epoch": 0.9867158671586715, "grad_norm": 0.3482672069065628, "learning_rate": 0.0001697631521134985, "loss": 0.0698, "step": 1337 }, { "epoch": 0.9874538745387453, "grad_norm": 0.2671184588229695, "learning_rate": 0.00016970159508532305, "loss": 0.0794, "step": 1338 }, { "epoch": 0.9881918819188192, "grad_norm": 0.2500561719506563, "learning_rate": 0.00016963998664626812, "loss": 0.0567, "step": 1339 }, { "epoch": 0.988929889298893, "grad_norm": 0.29789168070483674, "learning_rate": 0.00016957832684177522, "loss": 0.0858, "step": 1340 }, { "epoch": 0.9896678966789668, "grad_norm": 0.14723189430099956, "learning_rate": 0.0001695166157173237, "loss": 0.0197, "step": 1341 }, { "epoch": 0.9904059040590406, "grad_norm": 0.2998812584709244, "learning_rate": 0.00016945485331843084, "loss": 0.0646, "step": 1342 }, { "epoch": 0.9911439114391144, "grad_norm": 0.171835311613462, "learning_rate": 0.0001693930396906516, "loss": 0.0401, "step": 1343 }, { "epoch": 0.9918819188191882, "grad_norm": 0.23146056148528787, "learning_rate": 0.00016933117487957889, "loss": 0.068, "step": 1344 }, { "epoch": 0.992619926199262, "grad_norm": 0.17438540777783315, "learning_rate": 0.00016926925893084323, "loss": 0.0455, "step": 1345 }, { "epoch": 0.9933579335793358, "grad_norm": 0.17032128961632054, "learning_rate": 0.00016920729189011293, "loss": 0.0263, "step": 1346 }, { "epoch": 0.9940959409594096, "grad_norm": 0.296257864319931, "learning_rate": 0.000169145273803094, "loss": 0.0447, "step": 1347 }, { "epoch": 0.9948339483394834, "grad_norm": 0.7267894867105623, "learning_rate": 0.00016908320471553006, "loss": 0.1425, "step": 1348 }, { "epoch": 0.9955719557195571, "grad_norm": 0.14629876541853945, "learning_rate": 0.00016902108467320242, "loss": 0.0292, "step": 1349 }, { "epoch": 0.996309963099631, "grad_norm": 0.19919463563961828, "learning_rate": 0.0001689589137219298, "loss": 0.0328, "step": 1350 }, { "epoch": 0.9970479704797048, "grad_norm": 0.2565629449198025, "learning_rate": 0.00016889669190756868, "loss": 0.052, "step": 1351 }, { "epoch": 0.9977859778597786, "grad_norm": 0.22369214995541722, "learning_rate": 0.00016883441927601292, "loss": 0.044, "step": 1352 }, { "epoch": 0.9985239852398524, "grad_norm": 0.4543458247875668, "learning_rate": 0.0001687720958731939, "loss": 0.056, "step": 1353 }, { "epoch": 0.9992619926199262, "grad_norm": 0.29131388731283725, "learning_rate": 0.00016870972174508052, "loss": 0.051, "step": 1354 }, { "epoch": 1.0, "grad_norm": 0.41267357120518927, "learning_rate": 0.00016864729693767894, "loss": 0.074, "step": 1355 }, { "epoch": 1.0, "eval_loss": 0.06882239133119583, "eval_runtime": 581.8534, "eval_samples_per_second": 18.434, "eval_steps_per_second": 2.305, "step": 1355 }, { "epoch": 1.0007380073800738, "grad_norm": 0.25489797363040234, "learning_rate": 0.00016858482149703286, "loss": 0.0574, "step": 1356 }, { "epoch": 1.0014760147601476, "grad_norm": 0.27491586457927897, "learning_rate": 0.00016852229546922317, "loss": 0.0464, "step": 1357 }, { "epoch": 1.0022140221402214, "grad_norm": 0.22305901733183153, "learning_rate": 0.00016845971890036823, "loss": 0.029, "step": 1358 }, { "epoch": 1.0029520295202952, "grad_norm": 0.16472956873020184, "learning_rate": 0.00016839709183662357, "loss": 0.0428, "step": 1359 }, { "epoch": 1.003690036900369, "grad_norm": 0.2494435674668996, "learning_rate": 0.00016833441432418202, "loss": 0.046, "step": 1360 }, { "epoch": 1.0044280442804427, "grad_norm": 0.31848578943106165, "learning_rate": 0.00016827168640927358, "loss": 0.0603, "step": 1361 }, { "epoch": 1.0051660516605165, "grad_norm": 0.22835242122352262, "learning_rate": 0.00016820890813816543, "loss": 0.0586, "step": 1362 }, { "epoch": 1.0059040590405903, "grad_norm": 0.1255870475371614, "learning_rate": 0.00016814607955716198, "loss": 0.031, "step": 1363 }, { "epoch": 1.0066420664206641, "grad_norm": 0.46126051131746004, "learning_rate": 0.00016808320071260457, "loss": 0.0799, "step": 1364 }, { "epoch": 1.007380073800738, "grad_norm": 0.27453964639310957, "learning_rate": 0.00016802027165087178, "loss": 0.0487, "step": 1365 }, { "epoch": 1.0081180811808117, "grad_norm": 0.32328425793825294, "learning_rate": 0.00016795729241837913, "loss": 0.0747, "step": 1366 }, { "epoch": 1.0088560885608857, "grad_norm": 0.13279779523215662, "learning_rate": 0.00016789426306157925, "loss": 0.0149, "step": 1367 }, { "epoch": 1.0095940959409595, "grad_norm": 0.19461979688709002, "learning_rate": 0.00016783118362696163, "loss": 0.0372, "step": 1368 }, { "epoch": 1.0103321033210333, "grad_norm": 0.13281321339834984, "learning_rate": 0.00016776805416105273, "loss": 0.0218, "step": 1369 }, { "epoch": 1.011070110701107, "grad_norm": 0.5752335067957748, "learning_rate": 0.00016770487471041593, "loss": 0.1352, "step": 1370 }, { "epoch": 1.0118081180811809, "grad_norm": 0.32465660862194373, "learning_rate": 0.0001676416453216515, "loss": 0.032, "step": 1371 }, { "epoch": 1.0125461254612547, "grad_norm": 0.17495878248263896, "learning_rate": 0.00016757836604139648, "loss": 0.0424, "step": 1372 }, { "epoch": 1.0132841328413285, "grad_norm": 0.18636002808384006, "learning_rate": 0.00016751503691632476, "loss": 0.0319, "step": 1373 }, { "epoch": 1.0140221402214022, "grad_norm": 0.1981430754383351, "learning_rate": 0.00016745165799314694, "loss": 0.0415, "step": 1374 }, { "epoch": 1.014760147601476, "grad_norm": 0.20746841600359575, "learning_rate": 0.00016738822931861046, "loss": 0.0583, "step": 1375 }, { "epoch": 1.0154981549815498, "grad_norm": 0.14589415981552234, "learning_rate": 0.00016732475093949936, "loss": 0.026, "step": 1376 }, { "epoch": 1.0162361623616236, "grad_norm": 0.1998509406382449, "learning_rate": 0.00016726122290263432, "loss": 0.045, "step": 1377 }, { "epoch": 1.0169741697416974, "grad_norm": 0.13003751833374408, "learning_rate": 0.00016719764525487273, "loss": 0.0268, "step": 1378 }, { "epoch": 1.0177121771217712, "grad_norm": 0.29265790472205816, "learning_rate": 0.00016713401804310855, "loss": 0.0536, "step": 1379 }, { "epoch": 1.018450184501845, "grad_norm": 0.17503070033758736, "learning_rate": 0.0001670703413142723, "loss": 0.0277, "step": 1380 }, { "epoch": 1.0191881918819188, "grad_norm": 0.2553132352184256, "learning_rate": 0.00016700661511533088, "loss": 0.0529, "step": 1381 }, { "epoch": 1.0199261992619926, "grad_norm": 0.1240623424507521, "learning_rate": 0.00016694283949328798, "loss": 0.0177, "step": 1382 }, { "epoch": 1.0206642066420664, "grad_norm": 0.283408883367768, "learning_rate": 0.00016687901449518347, "loss": 0.0595, "step": 1383 }, { "epoch": 1.0214022140221402, "grad_norm": 0.5281172028945703, "learning_rate": 0.00016681514016809372, "loss": 0.119, "step": 1384 }, { "epoch": 1.022140221402214, "grad_norm": 0.14852639198705628, "learning_rate": 0.00016675121655913155, "loss": 0.0387, "step": 1385 }, { "epoch": 1.0228782287822877, "grad_norm": 0.14149136008280203, "learning_rate": 0.00016668724371544607, "loss": 0.0395, "step": 1386 }, { "epoch": 1.0236162361623615, "grad_norm": 0.20706521715520138, "learning_rate": 0.00016662322168422268, "loss": 0.0362, "step": 1387 }, { "epoch": 1.0243542435424353, "grad_norm": 0.9257655245685804, "learning_rate": 0.00016655915051268317, "loss": 0.1798, "step": 1388 }, { "epoch": 1.0250922509225093, "grad_norm": 0.09129004679599531, "learning_rate": 0.00016649503024808543, "loss": 0.0147, "step": 1389 }, { "epoch": 1.0258302583025831, "grad_norm": 0.44928365955368654, "learning_rate": 0.00016643086093772366, "loss": 0.076, "step": 1390 }, { "epoch": 1.026568265682657, "grad_norm": 0.26014038291618663, "learning_rate": 0.00016636664262892822, "loss": 0.0617, "step": 1391 }, { "epoch": 1.0273062730627307, "grad_norm": 0.10850699272096494, "learning_rate": 0.00016630237536906556, "loss": 0.0182, "step": 1392 }, { "epoch": 1.0280442804428045, "grad_norm": 0.20706862413613417, "learning_rate": 0.00016623805920553832, "loss": 0.0441, "step": 1393 }, { "epoch": 1.0287822878228783, "grad_norm": 0.22429514786378027, "learning_rate": 0.00016617369418578512, "loss": 0.0507, "step": 1394 }, { "epoch": 1.029520295202952, "grad_norm": 0.22937099326197, "learning_rate": 0.00016610928035728072, "loss": 0.0551, "step": 1395 }, { "epoch": 1.0302583025830259, "grad_norm": 0.2367227671984206, "learning_rate": 0.00016604481776753575, "loss": 0.0358, "step": 1396 }, { "epoch": 1.0309963099630997, "grad_norm": 0.2723536357285045, "learning_rate": 0.00016598030646409692, "loss": 0.0351, "step": 1397 }, { "epoch": 1.0317343173431734, "grad_norm": 0.20358645312078272, "learning_rate": 0.0001659157464945468, "loss": 0.0467, "step": 1398 }, { "epoch": 1.0324723247232472, "grad_norm": 0.2976193314424844, "learning_rate": 0.00016585113790650388, "loss": 0.0317, "step": 1399 }, { "epoch": 1.033210332103321, "grad_norm": 0.258935135082472, "learning_rate": 0.00016578648074762253, "loss": 0.0568, "step": 1400 }, { "epoch": 1.0339483394833948, "grad_norm": 0.24905350961859243, "learning_rate": 0.00016572177506559292, "loss": 0.0463, "step": 1401 }, { "epoch": 1.0346863468634686, "grad_norm": 0.253732852985182, "learning_rate": 0.00016565702090814104, "loss": 0.0897, "step": 1402 }, { "epoch": 1.0354243542435424, "grad_norm": 0.20485024602255936, "learning_rate": 0.0001655922183230286, "loss": 0.0193, "step": 1403 }, { "epoch": 1.0361623616236162, "grad_norm": 0.2765779125904751, "learning_rate": 0.000165527367358053, "loss": 0.0601, "step": 1404 }, { "epoch": 1.03690036900369, "grad_norm": 0.16570576579402582, "learning_rate": 0.0001654624680610474, "loss": 0.0248, "step": 1405 }, { "epoch": 1.0376383763837638, "grad_norm": 0.15023936174353297, "learning_rate": 0.00016539752047988056, "loss": 0.0238, "step": 1406 }, { "epoch": 1.0383763837638376, "grad_norm": 0.18960982560753964, "learning_rate": 0.0001653325246624569, "loss": 0.0501, "step": 1407 }, { "epoch": 1.0391143911439114, "grad_norm": 0.37717782469145344, "learning_rate": 0.0001652674806567164, "loss": 0.061, "step": 1408 }, { "epoch": 1.0398523985239851, "grad_norm": 0.13966862961921253, "learning_rate": 0.00016520238851063448, "loss": 0.0251, "step": 1409 }, { "epoch": 1.040590405904059, "grad_norm": 0.17725306794061432, "learning_rate": 0.00016513724827222227, "loss": 0.0331, "step": 1410 }, { "epoch": 1.041328413284133, "grad_norm": 0.20511884750363688, "learning_rate": 0.00016507205998952612, "loss": 0.0463, "step": 1411 }, { "epoch": 1.0420664206642067, "grad_norm": 0.3305910428012982, "learning_rate": 0.0001650068237106281, "loss": 0.0752, "step": 1412 }, { "epoch": 1.0428044280442805, "grad_norm": 0.34877338460316853, "learning_rate": 0.00016494153948364547, "loss": 0.0309, "step": 1413 }, { "epoch": 1.0435424354243543, "grad_norm": 0.2467408036013942, "learning_rate": 0.00016487620735673088, "loss": 0.0443, "step": 1414 }, { "epoch": 1.044280442804428, "grad_norm": 0.14390888715875694, "learning_rate": 0.00016481082737807246, "loss": 0.0217, "step": 1415 }, { "epoch": 1.045018450184502, "grad_norm": 0.1988297037757085, "learning_rate": 0.00016474539959589345, "loss": 0.022, "step": 1416 }, { "epoch": 1.0457564575645757, "grad_norm": 0.24910599790837173, "learning_rate": 0.00016467992405845246, "loss": 0.0494, "step": 1417 }, { "epoch": 1.0464944649446495, "grad_norm": 0.3118970839411372, "learning_rate": 0.00016461440081404324, "loss": 0.0982, "step": 1418 }, { "epoch": 1.0472324723247233, "grad_norm": 0.32171763415210547, "learning_rate": 0.00016454882991099486, "loss": 0.0564, "step": 1419 }, { "epoch": 1.047970479704797, "grad_norm": 0.20973958811344848, "learning_rate": 0.0001644832113976714, "loss": 0.028, "step": 1420 }, { "epoch": 1.0487084870848709, "grad_norm": 0.2821546562587399, "learning_rate": 0.00016441754532247216, "loss": 0.0557, "step": 1421 }, { "epoch": 1.0494464944649446, "grad_norm": 0.19922972797357213, "learning_rate": 0.0001643518317338314, "loss": 0.0429, "step": 1422 }, { "epoch": 1.0501845018450184, "grad_norm": 0.23599584889473907, "learning_rate": 0.00016428607068021863, "loss": 0.0404, "step": 1423 }, { "epoch": 1.0509225092250922, "grad_norm": 0.32473735933986164, "learning_rate": 0.00016422026221013812, "loss": 0.0577, "step": 1424 }, { "epoch": 1.051660516605166, "grad_norm": 0.1575429631481631, "learning_rate": 0.00016415440637212932, "loss": 0.0351, "step": 1425 }, { "epoch": 1.0523985239852398, "grad_norm": 0.31336650076855627, "learning_rate": 0.00016408850321476652, "loss": 0.0591, "step": 1426 }, { "epoch": 1.0531365313653136, "grad_norm": 0.26997707886241157, "learning_rate": 0.0001640225527866589, "loss": 0.0781, "step": 1427 }, { "epoch": 1.0538745387453874, "grad_norm": 0.3794179681909011, "learning_rate": 0.00016395655513645055, "loss": 0.0656, "step": 1428 }, { "epoch": 1.0546125461254612, "grad_norm": 0.47366501519361787, "learning_rate": 0.00016389051031282033, "loss": 0.1338, "step": 1429 }, { "epoch": 1.055350553505535, "grad_norm": 0.28452091049852657, "learning_rate": 0.00016382441836448202, "loss": 0.062, "step": 1430 }, { "epoch": 1.0560885608856088, "grad_norm": 0.25521380243344033, "learning_rate": 0.00016375827934018403, "loss": 0.0588, "step": 1431 }, { "epoch": 1.0568265682656826, "grad_norm": 0.20383659591427689, "learning_rate": 0.00016369209328870953, "loss": 0.0465, "step": 1432 }, { "epoch": 1.0575645756457566, "grad_norm": 0.3027594883076589, "learning_rate": 0.0001636258602588764, "loss": 0.0578, "step": 1433 }, { "epoch": 1.0583025830258304, "grad_norm": 0.17347271969007955, "learning_rate": 0.0001635595802995372, "loss": 0.038, "step": 1434 }, { "epoch": 1.0590405904059041, "grad_norm": 0.2561701446702711, "learning_rate": 0.00016349325345957897, "loss": 0.0715, "step": 1435 }, { "epoch": 1.059778597785978, "grad_norm": 0.16473921983842746, "learning_rate": 0.0001634268797879235, "loss": 0.0616, "step": 1436 }, { "epoch": 1.0605166051660517, "grad_norm": 0.08846295384741577, "learning_rate": 0.000163360459333527, "loss": 0.0188, "step": 1437 }, { "epoch": 1.0612546125461255, "grad_norm": 0.15484558747099741, "learning_rate": 0.0001632939921453802, "loss": 0.0442, "step": 1438 }, { "epoch": 1.0619926199261993, "grad_norm": 0.28534946982465215, "learning_rate": 0.0001632274782725084, "loss": 0.0739, "step": 1439 }, { "epoch": 1.062730627306273, "grad_norm": 0.2655606722323625, "learning_rate": 0.00016316091776397121, "loss": 0.0486, "step": 1440 }, { "epoch": 1.063468634686347, "grad_norm": 0.3042199973577984, "learning_rate": 0.00016309431066886273, "loss": 0.0615, "step": 1441 }, { "epoch": 1.0642066420664207, "grad_norm": 0.348033259874292, "learning_rate": 0.00016302765703631137, "loss": 0.053, "step": 1442 }, { "epoch": 1.0649446494464945, "grad_norm": 0.3580798240134443, "learning_rate": 0.00016296095691547982, "loss": 0.0749, "step": 1443 }, { "epoch": 1.0656826568265683, "grad_norm": 0.2830683959473586, "learning_rate": 0.00016289421035556518, "loss": 0.041, "step": 1444 }, { "epoch": 1.066420664206642, "grad_norm": 0.20624901553072764, "learning_rate": 0.00016282741740579872, "loss": 0.0297, "step": 1445 }, { "epoch": 1.0671586715867158, "grad_norm": 0.25229597723803265, "learning_rate": 0.00016276057811544594, "loss": 0.0371, "step": 1446 }, { "epoch": 1.0678966789667896, "grad_norm": 0.23195370824311562, "learning_rate": 0.00016269369253380656, "loss": 0.0594, "step": 1447 }, { "epoch": 1.0686346863468634, "grad_norm": 0.2991626820914835, "learning_rate": 0.00016262676071021433, "loss": 0.0545, "step": 1448 }, { "epoch": 1.0693726937269372, "grad_norm": 0.3392816280721102, "learning_rate": 0.00016255978269403727, "loss": 0.0369, "step": 1449 }, { "epoch": 1.070110701107011, "grad_norm": 0.3510160110068108, "learning_rate": 0.00016249275853467735, "loss": 0.0523, "step": 1450 }, { "epoch": 1.0708487084870848, "grad_norm": 0.2561885738711088, "learning_rate": 0.0001624256882815706, "loss": 0.0497, "step": 1451 }, { "epoch": 1.0715867158671586, "grad_norm": 0.258513797547377, "learning_rate": 0.0001623585719841871, "loss": 0.0411, "step": 1452 }, { "epoch": 1.0723247232472324, "grad_norm": 0.2763487507137373, "learning_rate": 0.0001622914096920308, "loss": 0.0567, "step": 1453 }, { "epoch": 1.0730627306273062, "grad_norm": 0.41667567146157664, "learning_rate": 0.00016222420145463966, "loss": 0.0713, "step": 1454 }, { "epoch": 1.07380073800738, "grad_norm": 0.18109138012796452, "learning_rate": 0.00016215694732158549, "loss": 0.0333, "step": 1455 }, { "epoch": 1.074538745387454, "grad_norm": 0.15872045459926598, "learning_rate": 0.00016208964734247395, "loss": 0.0333, "step": 1456 }, { "epoch": 1.0752767527675278, "grad_norm": 0.4606008665936142, "learning_rate": 0.00016202230156694457, "loss": 0.0722, "step": 1457 }, { "epoch": 1.0760147601476016, "grad_norm": 0.22566940553062237, "learning_rate": 0.00016195491004467052, "loss": 0.0646, "step": 1458 }, { "epoch": 1.0767527675276753, "grad_norm": 0.19163224374190482, "learning_rate": 0.00016188747282535885, "loss": 0.0565, "step": 1459 }, { "epoch": 1.0774907749077491, "grad_norm": 0.27492299227226036, "learning_rate": 0.0001618199899587503, "loss": 0.0529, "step": 1460 }, { "epoch": 1.078228782287823, "grad_norm": 0.08606764589026145, "learning_rate": 0.0001617524614946192, "loss": 0.0202, "step": 1461 }, { "epoch": 1.0789667896678967, "grad_norm": 0.23852138309421472, "learning_rate": 0.00016168488748277357, "loss": 0.0621, "step": 1462 }, { "epoch": 1.0797047970479705, "grad_norm": 0.2827626984223062, "learning_rate": 0.00016161726797305506, "loss": 0.0629, "step": 1463 }, { "epoch": 1.0804428044280443, "grad_norm": 0.192502095596514, "learning_rate": 0.0001615496030153388, "loss": 0.0544, "step": 1464 }, { "epoch": 1.081180811808118, "grad_norm": 0.13398991104001523, "learning_rate": 0.00016148189265953344, "loss": 0.0313, "step": 1465 }, { "epoch": 1.0819188191881919, "grad_norm": 0.24171806885894598, "learning_rate": 0.00016141413695558118, "loss": 0.0424, "step": 1466 }, { "epoch": 1.0826568265682657, "grad_norm": 0.27045536218791855, "learning_rate": 0.00016134633595345766, "loss": 0.0559, "step": 1467 }, { "epoch": 1.0833948339483395, "grad_norm": 0.38501756398308296, "learning_rate": 0.0001612784897031719, "loss": 0.0963, "step": 1468 }, { "epoch": 1.0841328413284133, "grad_norm": 0.21332061837084038, "learning_rate": 0.0001612105982547663, "loss": 0.0554, "step": 1469 }, { "epoch": 1.084870848708487, "grad_norm": 0.38590993923289263, "learning_rate": 0.00016114266165831657, "loss": 0.0573, "step": 1470 }, { "epoch": 1.0856088560885608, "grad_norm": 0.25773844769309207, "learning_rate": 0.00016107467996393182, "loss": 0.0641, "step": 1471 }, { "epoch": 1.0863468634686346, "grad_norm": 0.21057332223577138, "learning_rate": 0.00016100665322175427, "loss": 0.0435, "step": 1472 }, { "epoch": 1.0870848708487084, "grad_norm": 0.2663766158754811, "learning_rate": 0.00016093858148195954, "loss": 0.0354, "step": 1473 }, { "epoch": 1.0878228782287822, "grad_norm": 0.20409402637962387, "learning_rate": 0.00016087046479475628, "loss": 0.035, "step": 1474 }, { "epoch": 1.088560885608856, "grad_norm": 0.3237337187430215, "learning_rate": 0.00016080230321038644, "loss": 0.0714, "step": 1475 }, { "epoch": 1.0892988929889298, "grad_norm": 0.25406342960024203, "learning_rate": 0.0001607340967791249, "loss": 0.0529, "step": 1476 }, { "epoch": 1.0900369003690038, "grad_norm": 0.10439965672410732, "learning_rate": 0.00016066584555127987, "loss": 0.0279, "step": 1477 }, { "epoch": 1.0907749077490776, "grad_norm": 0.4747510093979123, "learning_rate": 0.0001605975495771923, "loss": 0.1013, "step": 1478 }, { "epoch": 1.0915129151291514, "grad_norm": 0.19600443475383314, "learning_rate": 0.00016052920890723645, "loss": 0.0479, "step": 1479 }, { "epoch": 1.0922509225092252, "grad_norm": 0.18689797519387014, "learning_rate": 0.0001604608235918193, "loss": 0.0417, "step": 1480 }, { "epoch": 1.092988929889299, "grad_norm": 0.25319324754945616, "learning_rate": 0.00016039239368138093, "loss": 0.0243, "step": 1481 }, { "epoch": 1.0937269372693728, "grad_norm": 0.2250264604440475, "learning_rate": 0.00016032391922639417, "loss": 0.053, "step": 1482 }, { "epoch": 1.0944649446494465, "grad_norm": 0.20022293074882888, "learning_rate": 0.00016025540027736485, "loss": 0.066, "step": 1483 }, { "epoch": 1.0952029520295203, "grad_norm": 0.39229555320808707, "learning_rate": 0.00016018683688483155, "loss": 0.1136, "step": 1484 }, { "epoch": 1.0959409594095941, "grad_norm": 0.21461388061532502, "learning_rate": 0.00016011822909936556, "loss": 0.0422, "step": 1485 }, { "epoch": 1.096678966789668, "grad_norm": 0.3203237726771068, "learning_rate": 0.00016004957697157102, "loss": 0.0557, "step": 1486 }, { "epoch": 1.0974169741697417, "grad_norm": 0.18190340603802288, "learning_rate": 0.00015998088055208472, "loss": 0.0412, "step": 1487 }, { "epoch": 1.0981549815498155, "grad_norm": 0.1927517439461037, "learning_rate": 0.0001599121398915762, "loss": 0.0424, "step": 1488 }, { "epoch": 1.0988929889298893, "grad_norm": 0.2377692190500979, "learning_rate": 0.0001598433550407475, "loss": 0.0469, "step": 1489 }, { "epoch": 1.099630996309963, "grad_norm": 0.30868513063151287, "learning_rate": 0.0001597745260503333, "loss": 0.0512, "step": 1490 }, { "epoch": 1.1003690036900369, "grad_norm": 0.3292619502961305, "learning_rate": 0.00015970565297110097, "loss": 0.0954, "step": 1491 }, { "epoch": 1.1011070110701107, "grad_norm": 0.19117314961842144, "learning_rate": 0.00015963673585385016, "loss": 0.0324, "step": 1492 }, { "epoch": 1.1018450184501845, "grad_norm": 0.31421328700655776, "learning_rate": 0.00015956777474941322, "loss": 0.0655, "step": 1493 }, { "epoch": 1.1025830258302582, "grad_norm": 0.2140744985825421, "learning_rate": 0.0001594987697086548, "loss": 0.0455, "step": 1494 }, { "epoch": 1.103321033210332, "grad_norm": 0.3614845534997021, "learning_rate": 0.00015942972078247206, "loss": 0.057, "step": 1495 }, { "epoch": 1.1040590405904058, "grad_norm": 0.180961032589615, "learning_rate": 0.00015936062802179445, "loss": 0.038, "step": 1496 }, { "epoch": 1.1047970479704796, "grad_norm": 0.3607064620131565, "learning_rate": 0.00015929149147758377, "loss": 0.0269, "step": 1497 }, { "epoch": 1.1055350553505534, "grad_norm": 0.14448439594720766, "learning_rate": 0.00015922231120083416, "loss": 0.0316, "step": 1498 }, { "epoch": 1.1062730627306272, "grad_norm": 0.21471449720013544, "learning_rate": 0.00015915308724257198, "loss": 0.0314, "step": 1499 }, { "epoch": 1.1070110701107012, "grad_norm": 0.23253137309247104, "learning_rate": 0.00015908381965385577, "loss": 0.051, "step": 1500 }, { "epoch": 1.107749077490775, "grad_norm": 0.1371923834535075, "learning_rate": 0.00015901450848577635, "loss": 0.0141, "step": 1501 }, { "epoch": 1.1084870848708488, "grad_norm": 0.22277172595128641, "learning_rate": 0.00015894515378945658, "loss": 0.0342, "step": 1502 }, { "epoch": 1.1092250922509226, "grad_norm": 0.4457631889075792, "learning_rate": 0.00015887575561605147, "loss": 0.1033, "step": 1503 }, { "epoch": 1.1099630996309964, "grad_norm": 0.41877096062630537, "learning_rate": 0.00015880631401674818, "loss": 0.0888, "step": 1504 }, { "epoch": 1.1107011070110702, "grad_norm": 0.2969415675328396, "learning_rate": 0.0001587368290427657, "loss": 0.0565, "step": 1505 }, { "epoch": 1.111439114391144, "grad_norm": 0.20162205240492118, "learning_rate": 0.00015866730074535522, "loss": 0.0444, "step": 1506 }, { "epoch": 1.1121771217712177, "grad_norm": 0.5013596510458762, "learning_rate": 0.00015859772917579975, "loss": 0.0299, "step": 1507 }, { "epoch": 1.1129151291512915, "grad_norm": 0.2416968146996416, "learning_rate": 0.00015852811438541432, "loss": 0.0761, "step": 1508 }, { "epoch": 1.1136531365313653, "grad_norm": 0.36936002071280094, "learning_rate": 0.0001584584564255457, "loss": 0.1723, "step": 1509 }, { "epoch": 1.1143911439114391, "grad_norm": 0.23908689401527944, "learning_rate": 0.00015838875534757266, "loss": 0.0438, "step": 1510 }, { "epoch": 1.115129151291513, "grad_norm": 0.26044526441865373, "learning_rate": 0.00015831901120290568, "loss": 0.038, "step": 1511 }, { "epoch": 1.1158671586715867, "grad_norm": 0.19453093744716796, "learning_rate": 0.000158249224042987, "loss": 0.0449, "step": 1512 }, { "epoch": 1.1166051660516605, "grad_norm": 0.2311782948481607, "learning_rate": 0.00015817939391929065, "loss": 0.0561, "step": 1513 }, { "epoch": 1.1173431734317343, "grad_norm": 0.4518469780093551, "learning_rate": 0.00015810952088332223, "loss": 0.0417, "step": 1514 }, { "epoch": 1.118081180811808, "grad_norm": 0.20827459575943524, "learning_rate": 0.00015803960498661916, "loss": 0.0316, "step": 1515 }, { "epoch": 1.1188191881918819, "grad_norm": 0.19085578656571336, "learning_rate": 0.00015796964628075037, "loss": 0.0295, "step": 1516 }, { "epoch": 1.1195571955719557, "grad_norm": 0.3145970134410295, "learning_rate": 0.00015789964481731632, "loss": 0.0552, "step": 1517 }, { "epoch": 1.1202952029520294, "grad_norm": 0.15943159597458115, "learning_rate": 0.00015782960064794917, "loss": 0.0549, "step": 1518 }, { "epoch": 1.1210332103321032, "grad_norm": 0.17871752200652996, "learning_rate": 0.00015775951382431238, "loss": 0.0366, "step": 1519 }, { "epoch": 1.121771217712177, "grad_norm": 0.4250843582441842, "learning_rate": 0.00015768938439810102, "loss": 0.112, "step": 1520 }, { "epoch": 1.122509225092251, "grad_norm": 0.12431632243052151, "learning_rate": 0.0001576192124210415, "loss": 0.0299, "step": 1521 }, { "epoch": 1.1232472324723246, "grad_norm": 0.22610447283861884, "learning_rate": 0.00015754899794489166, "loss": 0.046, "step": 1522 }, { "epoch": 1.1239852398523986, "grad_norm": 0.4066186949731492, "learning_rate": 0.0001574787410214407, "loss": 0.0724, "step": 1523 }, { "epoch": 1.1247232472324724, "grad_norm": 0.3873660279948996, "learning_rate": 0.0001574084417025091, "loss": 0.0618, "step": 1524 }, { "epoch": 1.1254612546125462, "grad_norm": 0.1858058479744769, "learning_rate": 0.00015733810003994852, "loss": 0.0979, "step": 1525 }, { "epoch": 1.12619926199262, "grad_norm": 0.3216993489386817, "learning_rate": 0.0001572677160856421, "loss": 0.0257, "step": 1526 }, { "epoch": 1.1269372693726938, "grad_norm": 0.8012683357917613, "learning_rate": 0.00015719728989150387, "loss": 0.0886, "step": 1527 }, { "epoch": 1.1276752767527676, "grad_norm": 0.21531158215664503, "learning_rate": 0.00015712682150947923, "loss": 0.0393, "step": 1528 }, { "epoch": 1.1284132841328414, "grad_norm": 0.18884930065755415, "learning_rate": 0.00015705631099154465, "loss": 0.0318, "step": 1529 }, { "epoch": 1.1291512915129152, "grad_norm": 0.12709093847050115, "learning_rate": 0.00015698575838970764, "loss": 0.0224, "step": 1530 }, { "epoch": 1.129889298892989, "grad_norm": 0.3440982566905263, "learning_rate": 0.00015691516375600673, "loss": 0.0846, "step": 1531 }, { "epoch": 1.1306273062730627, "grad_norm": 0.6949712532019751, "learning_rate": 0.00015684452714251153, "loss": 0.1071, "step": 1532 }, { "epoch": 1.1313653136531365, "grad_norm": 0.17979019167749863, "learning_rate": 0.0001567738486013226, "loss": 0.0574, "step": 1533 }, { "epoch": 1.1321033210332103, "grad_norm": 0.4683620616551937, "learning_rate": 0.0001567031281845714, "loss": 0.0532, "step": 1534 }, { "epoch": 1.132841328413284, "grad_norm": 0.25140737936685925, "learning_rate": 0.00015663236594442022, "loss": 0.0521, "step": 1535 }, { "epoch": 1.133579335793358, "grad_norm": 0.3679651930704428, "learning_rate": 0.00015656156193306225, "loss": 0.0646, "step": 1536 }, { "epoch": 1.1343173431734317, "grad_norm": 0.30241213009075474, "learning_rate": 0.00015649071620272155, "loss": 0.0489, "step": 1537 }, { "epoch": 1.1350553505535055, "grad_norm": 0.25910776168702726, "learning_rate": 0.00015641982880565291, "loss": 0.06, "step": 1538 }, { "epoch": 1.1357933579335793, "grad_norm": 0.3269625227052126, "learning_rate": 0.00015634889979414178, "loss": 0.0589, "step": 1539 }, { "epoch": 1.136531365313653, "grad_norm": 0.25661272249147, "learning_rate": 0.0001562779292205044, "loss": 0.0393, "step": 1540 }, { "epoch": 1.1372693726937269, "grad_norm": 0.33724450180463095, "learning_rate": 0.00015620691713708762, "loss": 0.0929, "step": 1541 }, { "epoch": 1.1380073800738006, "grad_norm": 0.29747197260232655, "learning_rate": 0.00015613586359626894, "loss": 0.0516, "step": 1542 }, { "epoch": 1.1387453874538744, "grad_norm": 0.1730777867149846, "learning_rate": 0.00015606476865045633, "loss": 0.0499, "step": 1543 }, { "epoch": 1.1394833948339484, "grad_norm": 0.22511362666179688, "learning_rate": 0.00015599363235208852, "loss": 0.0371, "step": 1544 }, { "epoch": 1.140221402214022, "grad_norm": 0.1494673992662814, "learning_rate": 0.0001559224547536345, "loss": 0.0223, "step": 1545 }, { "epoch": 1.140959409594096, "grad_norm": 0.146642302163798, "learning_rate": 0.00015585123590759387, "loss": 0.0244, "step": 1546 }, { "epoch": 1.1416974169741698, "grad_norm": 0.1327377096283868, "learning_rate": 0.00015577997586649657, "loss": 0.0267, "step": 1547 }, { "epoch": 1.1424354243542436, "grad_norm": 0.4521430004162991, "learning_rate": 0.00015570867468290303, "loss": 0.0662, "step": 1548 }, { "epoch": 1.1431734317343174, "grad_norm": 0.16609656586471938, "learning_rate": 0.00015563733240940396, "loss": 0.038, "step": 1549 }, { "epoch": 1.1439114391143912, "grad_norm": 0.16209368499374763, "learning_rate": 0.0001555659490986203, "loss": 0.0208, "step": 1550 }, { "epoch": 1.144649446494465, "grad_norm": 0.2542211654247066, "learning_rate": 0.00015549452480320342, "loss": 0.0387, "step": 1551 }, { "epoch": 1.1453874538745388, "grad_norm": 0.2955315393578285, "learning_rate": 0.00015542305957583482, "loss": 0.079, "step": 1552 }, { "epoch": 1.1461254612546126, "grad_norm": 0.2660371510650943, "learning_rate": 0.00015535155346922618, "loss": 0.0357, "step": 1553 }, { "epoch": 1.1468634686346864, "grad_norm": 0.31053342182180527, "learning_rate": 0.00015528000653611935, "loss": 0.0541, "step": 1554 }, { "epoch": 1.1476014760147601, "grad_norm": 0.3208091797432244, "learning_rate": 0.00015520841882928635, "loss": 0.0401, "step": 1555 }, { "epoch": 1.148339483394834, "grad_norm": 0.24749701343048128, "learning_rate": 0.00015513679040152922, "loss": 0.1011, "step": 1556 }, { "epoch": 1.1490774907749077, "grad_norm": 0.20224039241228622, "learning_rate": 0.00015506512130568004, "loss": 0.0363, "step": 1557 }, { "epoch": 1.1498154981549815, "grad_norm": 0.2374723792193898, "learning_rate": 0.00015499341159460088, "loss": 0.0334, "step": 1558 }, { "epoch": 1.1505535055350553, "grad_norm": 0.1862371945671972, "learning_rate": 0.00015492166132118377, "loss": 0.0457, "step": 1559 }, { "epoch": 1.151291512915129, "grad_norm": 0.1531561793783959, "learning_rate": 0.00015484987053835067, "loss": 0.032, "step": 1560 }, { "epoch": 1.152029520295203, "grad_norm": 0.1305776153094441, "learning_rate": 0.0001547780392990534, "loss": 0.0238, "step": 1561 }, { "epoch": 1.1527675276752767, "grad_norm": 0.22703602709821752, "learning_rate": 0.0001547061676562737, "loss": 0.0429, "step": 1562 }, { "epoch": 1.1535055350553505, "grad_norm": 0.2979290315283791, "learning_rate": 0.00015463425566302296, "loss": 0.0556, "step": 1563 }, { "epoch": 1.1542435424354243, "grad_norm": 0.2771245624604091, "learning_rate": 0.00015456230337234245, "loss": 0.0739, "step": 1564 }, { "epoch": 1.1549815498154983, "grad_norm": 0.20845917756374846, "learning_rate": 0.00015449031083730316, "loss": 0.0367, "step": 1565 }, { "epoch": 1.1557195571955718, "grad_norm": 0.3542596096200844, "learning_rate": 0.0001544182781110057, "loss": 0.076, "step": 1566 }, { "epoch": 1.1564575645756459, "grad_norm": 0.365177034713894, "learning_rate": 0.00015434620524658037, "loss": 0.0554, "step": 1567 }, { "epoch": 1.1571955719557196, "grad_norm": 0.3155017166778687, "learning_rate": 0.00015427409229718704, "loss": 0.078, "step": 1568 }, { "epoch": 1.1579335793357934, "grad_norm": 0.3072142229440177, "learning_rate": 0.00015420193931601518, "loss": 0.067, "step": 1569 }, { "epoch": 1.1586715867158672, "grad_norm": 0.20997882588009223, "learning_rate": 0.0001541297463562838, "loss": 0.0362, "step": 1570 }, { "epoch": 1.159409594095941, "grad_norm": 0.1821939845330264, "learning_rate": 0.00015405751347124136, "loss": 0.0313, "step": 1571 }, { "epoch": 1.1601476014760148, "grad_norm": 0.27299411418567787, "learning_rate": 0.0001539852407141658, "loss": 0.0416, "step": 1572 }, { "epoch": 1.1608856088560886, "grad_norm": 0.22495112910924567, "learning_rate": 0.0001539129281383644, "loss": 0.0387, "step": 1573 }, { "epoch": 1.1616236162361624, "grad_norm": 0.1970584980672287, "learning_rate": 0.0001538405757971739, "loss": 0.0407, "step": 1574 }, { "epoch": 1.1623616236162362, "grad_norm": 0.11859347676514753, "learning_rate": 0.00015376818374396027, "loss": 0.0218, "step": 1575 }, { "epoch": 1.16309963099631, "grad_norm": 0.15151093101104632, "learning_rate": 0.00015369575203211892, "loss": 0.0342, "step": 1576 }, { "epoch": 1.1638376383763838, "grad_norm": 0.651138434403327, "learning_rate": 0.00015362328071507443, "loss": 0.0938, "step": 1577 }, { "epoch": 1.1645756457564576, "grad_norm": 0.4186846563938751, "learning_rate": 0.0001535507698462805, "loss": 0.11, "step": 1578 }, { "epoch": 1.1653136531365313, "grad_norm": 0.20117003815643802, "learning_rate": 0.0001534782194792201, "loss": 0.0363, "step": 1579 }, { "epoch": 1.1660516605166051, "grad_norm": 0.3032228363052906, "learning_rate": 0.00015340562966740541, "loss": 0.068, "step": 1580 }, { "epoch": 1.166789667896679, "grad_norm": 0.39493164461193114, "learning_rate": 0.00015333300046437755, "loss": 0.0617, "step": 1581 }, { "epoch": 1.1675276752767527, "grad_norm": 0.364605075618005, "learning_rate": 0.00015326033192370673, "loss": 0.0671, "step": 1582 }, { "epoch": 1.1682656826568265, "grad_norm": 0.15229210483005898, "learning_rate": 0.0001531876240989923, "loss": 0.0172, "step": 1583 }, { "epoch": 1.1690036900369003, "grad_norm": 0.4606741241949783, "learning_rate": 0.0001531148770438624, "loss": 0.0799, "step": 1584 }, { "epoch": 1.169741697416974, "grad_norm": 0.18132492308627637, "learning_rate": 0.00015304209081197425, "loss": 0.0462, "step": 1585 }, { "epoch": 1.1704797047970479, "grad_norm": 0.2165852035499414, "learning_rate": 0.0001529692654570139, "loss": 0.0366, "step": 1586 }, { "epoch": 1.1712177121771217, "grad_norm": 0.3273161212262212, "learning_rate": 0.00015289640103269625, "loss": 0.0394, "step": 1587 }, { "epoch": 1.1719557195571957, "grad_norm": 0.0949601613732476, "learning_rate": 0.00015282349759276507, "loss": 0.0175, "step": 1588 }, { "epoch": 1.1726937269372693, "grad_norm": 0.323194001370753, "learning_rate": 0.00015275055519099284, "loss": 0.0845, "step": 1589 }, { "epoch": 1.1734317343173433, "grad_norm": 0.28113980886388645, "learning_rate": 0.0001526775738811808, "loss": 0.0373, "step": 1590 }, { "epoch": 1.174169741697417, "grad_norm": 0.527346111204985, "learning_rate": 0.000152604553717159, "loss": 0.0721, "step": 1591 }, { "epoch": 1.1749077490774908, "grad_norm": 0.43394206004558183, "learning_rate": 0.0001525314947527859, "loss": 0.0944, "step": 1592 }, { "epoch": 1.1756457564575646, "grad_norm": 0.7657403028151729, "learning_rate": 0.0001524583970419488, "loss": 0.1154, "step": 1593 }, { "epoch": 1.1763837638376384, "grad_norm": 0.23048015300044994, "learning_rate": 0.00015238526063856352, "loss": 0.0368, "step": 1594 }, { "epoch": 1.1771217712177122, "grad_norm": 0.23418443544868092, "learning_rate": 0.00015231208559657439, "loss": 0.0963, "step": 1595 }, { "epoch": 1.177859778597786, "grad_norm": 0.24388694279485218, "learning_rate": 0.00015223887196995426, "loss": 0.0471, "step": 1596 }, { "epoch": 1.1785977859778598, "grad_norm": 0.3014865693729447, "learning_rate": 0.00015216561981270442, "loss": 0.0666, "step": 1597 }, { "epoch": 1.1793357933579336, "grad_norm": 0.2402412610264188, "learning_rate": 0.00015209232917885458, "loss": 0.0325, "step": 1598 }, { "epoch": 1.1800738007380074, "grad_norm": 0.30668626445579256, "learning_rate": 0.00015201900012246284, "loss": 0.0706, "step": 1599 }, { "epoch": 1.1808118081180812, "grad_norm": 0.2912590315838508, "learning_rate": 0.0001519456326976157, "loss": 0.0978, "step": 1600 }, { "epoch": 1.181549815498155, "grad_norm": 0.2556295774988081, "learning_rate": 0.00015187222695842785, "loss": 0.0555, "step": 1601 }, { "epoch": 1.1822878228782288, "grad_norm": 0.20696442721471717, "learning_rate": 0.00015179878295904227, "loss": 0.0359, "step": 1602 }, { "epoch": 1.1830258302583025, "grad_norm": 0.1869329304364725, "learning_rate": 0.00015172530075363024, "loss": 0.0275, "step": 1603 }, { "epoch": 1.1837638376383763, "grad_norm": 0.256880183776681, "learning_rate": 0.00015165178039639113, "loss": 0.0498, "step": 1604 }, { "epoch": 1.1845018450184501, "grad_norm": 0.1458604051395323, "learning_rate": 0.00015157822194155252, "loss": 0.0294, "step": 1605 }, { "epoch": 1.185239852398524, "grad_norm": 0.16084965801047343, "learning_rate": 0.00015150462544337, "loss": 0.0318, "step": 1606 }, { "epoch": 1.1859778597785977, "grad_norm": 0.2739720175327312, "learning_rate": 0.0001514309909561273, "loss": 0.0558, "step": 1607 }, { "epoch": 1.1867158671586715, "grad_norm": 0.28625589003959545, "learning_rate": 0.00015135731853413616, "loss": 0.0575, "step": 1608 }, { "epoch": 1.1874538745387453, "grad_norm": 0.14747189902957503, "learning_rate": 0.0001512836082317362, "loss": 0.0236, "step": 1609 }, { "epoch": 1.188191881918819, "grad_norm": 0.20990958220866995, "learning_rate": 0.0001512098601032952, "loss": 0.0326, "step": 1610 }, { "epoch": 1.188929889298893, "grad_norm": 0.18754001703494771, "learning_rate": 0.00015113607420320858, "loss": 0.03, "step": 1611 }, { "epoch": 1.1896678966789669, "grad_norm": 0.24913589959905885, "learning_rate": 0.00015106225058589983, "loss": 0.0411, "step": 1612 }, { "epoch": 1.1904059040590407, "grad_norm": 0.15162727845534796, "learning_rate": 0.00015098838930582012, "loss": 0.0224, "step": 1613 }, { "epoch": 1.1911439114391145, "grad_norm": 0.29318958645241305, "learning_rate": 0.00015091449041744847, "loss": 0.0519, "step": 1614 }, { "epoch": 1.1918819188191883, "grad_norm": 0.5231567328810218, "learning_rate": 0.0001508405539752916, "loss": 0.0572, "step": 1615 }, { "epoch": 1.192619926199262, "grad_norm": 0.2351209107376181, "learning_rate": 0.000150766580033884, "loss": 0.0403, "step": 1616 }, { "epoch": 1.1933579335793358, "grad_norm": 0.4799206127751697, "learning_rate": 0.00015069256864778773, "loss": 0.0611, "step": 1617 }, { "epoch": 1.1940959409594096, "grad_norm": 0.1687259727769252, "learning_rate": 0.0001506185198715925, "loss": 0.0289, "step": 1618 }, { "epoch": 1.1948339483394834, "grad_norm": 0.22306190313377697, "learning_rate": 0.0001505444337599157, "loss": 0.0555, "step": 1619 }, { "epoch": 1.1955719557195572, "grad_norm": 0.2582204970130303, "learning_rate": 0.00015047031036740201, "loss": 0.0368, "step": 1620 }, { "epoch": 1.196309963099631, "grad_norm": 0.14717610930269526, "learning_rate": 0.00015039614974872388, "loss": 0.021, "step": 1621 }, { "epoch": 1.1970479704797048, "grad_norm": 0.4262303527910849, "learning_rate": 0.00015032195195858105, "loss": 0.0788, "step": 1622 }, { "epoch": 1.1977859778597786, "grad_norm": 0.33376227882597215, "learning_rate": 0.00015024771705170076, "loss": 0.0603, "step": 1623 }, { "epoch": 1.1985239852398524, "grad_norm": 0.26605466338483047, "learning_rate": 0.00015017344508283756, "loss": 0.0431, "step": 1624 }, { "epoch": 1.1992619926199262, "grad_norm": 0.2357108687994845, "learning_rate": 0.0001500991361067734, "loss": 0.0419, "step": 1625 }, { "epoch": 1.2, "grad_norm": 0.23484692238451338, "learning_rate": 0.00015002479017831748, "loss": 0.0521, "step": 1626 }, { "epoch": 1.2007380073800737, "grad_norm": 0.31404630822026786, "learning_rate": 0.00014995040735230625, "loss": 0.0728, "step": 1627 }, { "epoch": 1.2014760147601475, "grad_norm": 0.3833151941180018, "learning_rate": 0.00014987598768360347, "loss": 0.0948, "step": 1628 }, { "epoch": 1.2022140221402213, "grad_norm": 0.17090199429605554, "learning_rate": 0.0001498015312270999, "loss": 0.0307, "step": 1629 }, { "epoch": 1.2029520295202951, "grad_norm": 0.19992594507373102, "learning_rate": 0.00014972703803771363, "loss": 0.0442, "step": 1630 }, { "epoch": 1.203690036900369, "grad_norm": 0.17532238292065375, "learning_rate": 0.00014965250817038968, "loss": 0.0327, "step": 1631 }, { "epoch": 1.204428044280443, "grad_norm": 0.2667557880354261, "learning_rate": 0.00014957794168010024, "loss": 0.0605, "step": 1632 }, { "epoch": 1.2051660516605165, "grad_norm": 0.18495861908415984, "learning_rate": 0.00014950333862184445, "loss": 0.0389, "step": 1633 }, { "epoch": 1.2059040590405905, "grad_norm": 0.27938612160699927, "learning_rate": 0.00014942869905064843, "loss": 0.0572, "step": 1634 }, { "epoch": 1.2066420664206643, "grad_norm": 0.15213700828342674, "learning_rate": 0.00014935402302156524, "loss": 0.0409, "step": 1635 }, { "epoch": 1.207380073800738, "grad_norm": 0.21161226495478314, "learning_rate": 0.00014927931058967482, "loss": 0.0343, "step": 1636 }, { "epoch": 1.2081180811808119, "grad_norm": 0.3992592293181765, "learning_rate": 0.00014920456181008397, "loss": 0.0341, "step": 1637 }, { "epoch": 1.2088560885608857, "grad_norm": 0.2265127154159529, "learning_rate": 0.00014912977673792635, "loss": 0.0449, "step": 1638 }, { "epoch": 1.2095940959409595, "grad_norm": 0.20538144615604167, "learning_rate": 0.00014905495542836227, "loss": 0.0657, "step": 1639 }, { "epoch": 1.2103321033210332, "grad_norm": 0.18238630116593088, "learning_rate": 0.0001489800979365789, "loss": 0.037, "step": 1640 }, { "epoch": 1.211070110701107, "grad_norm": 0.25168609182849455, "learning_rate": 0.00014890520431778997, "loss": 0.0832, "step": 1641 }, { "epoch": 1.2118081180811808, "grad_norm": 0.29381517810611274, "learning_rate": 0.00014883027462723596, "loss": 0.0417, "step": 1642 }, { "epoch": 1.2125461254612546, "grad_norm": 0.23753029473180842, "learning_rate": 0.00014875530892018385, "loss": 0.04, "step": 1643 }, { "epoch": 1.2132841328413284, "grad_norm": 0.2440642459489409, "learning_rate": 0.0001486803072519274, "loss": 0.0617, "step": 1644 }, { "epoch": 1.2140221402214022, "grad_norm": 0.528779921577869, "learning_rate": 0.00014860526967778656, "loss": 0.0414, "step": 1645 }, { "epoch": 1.214760147601476, "grad_norm": 0.1888523842969093, "learning_rate": 0.00014853019625310813, "loss": 0.0338, "step": 1646 }, { "epoch": 1.2154981549815498, "grad_norm": 0.3556802761507575, "learning_rate": 0.00014845508703326504, "loss": 0.0522, "step": 1647 }, { "epoch": 1.2162361623616236, "grad_norm": 0.2017586219584752, "learning_rate": 0.0001483799420736568, "loss": 0.0667, "step": 1648 }, { "epoch": 1.2169741697416974, "grad_norm": 0.29868382399569043, "learning_rate": 0.00014830476142970925, "loss": 0.0377, "step": 1649 }, { "epoch": 1.2177121771217712, "grad_norm": 0.18809180866879474, "learning_rate": 0.00014822954515687447, "loss": 0.0347, "step": 1650 }, { "epoch": 1.218450184501845, "grad_norm": 0.3029186629468145, "learning_rate": 0.00014815429331063097, "loss": 0.0483, "step": 1651 }, { "epoch": 1.2191881918819187, "grad_norm": 0.41641970596515954, "learning_rate": 0.0001480790059464834, "loss": 0.0908, "step": 1652 }, { "epoch": 1.2199261992619925, "grad_norm": 0.2537287892779746, "learning_rate": 0.00014800368311996263, "loss": 0.0503, "step": 1653 }, { "epoch": 1.2206642066420663, "grad_norm": 0.33706724924355214, "learning_rate": 0.0001479283248866256, "loss": 0.0626, "step": 1654 }, { "epoch": 1.2214022140221403, "grad_norm": 0.210370366900377, "learning_rate": 0.00014785293130205549, "loss": 0.0556, "step": 1655 }, { "epoch": 1.222140221402214, "grad_norm": 0.22557176087674075, "learning_rate": 0.00014777750242186153, "loss": 0.0374, "step": 1656 }, { "epoch": 1.222878228782288, "grad_norm": 0.2779737318627406, "learning_rate": 0.0001477020383016789, "loss": 0.0426, "step": 1657 }, { "epoch": 1.2236162361623617, "grad_norm": 0.10375002127867872, "learning_rate": 0.00014762653899716893, "loss": 0.0183, "step": 1658 }, { "epoch": 1.2243542435424355, "grad_norm": 0.20489288392574134, "learning_rate": 0.00014755100456401873, "loss": 0.0475, "step": 1659 }, { "epoch": 1.2250922509225093, "grad_norm": 0.2788612349816279, "learning_rate": 0.00014747543505794143, "loss": 0.0433, "step": 1660 }, { "epoch": 1.225830258302583, "grad_norm": 0.14536127681388356, "learning_rate": 0.00014739983053467596, "loss": 0.0331, "step": 1661 }, { "epoch": 1.2265682656826569, "grad_norm": 0.13114861933118585, "learning_rate": 0.00014732419104998716, "loss": 0.0275, "step": 1662 }, { "epoch": 1.2273062730627307, "grad_norm": 0.3686633103529748, "learning_rate": 0.00014724851665966554, "loss": 0.0511, "step": 1663 }, { "epoch": 1.2280442804428044, "grad_norm": 0.19221932858185908, "learning_rate": 0.00014717280741952753, "loss": 0.0375, "step": 1664 }, { "epoch": 1.2287822878228782, "grad_norm": 0.17624109201795402, "learning_rate": 0.00014709706338541506, "loss": 0.0319, "step": 1665 }, { "epoch": 1.229520295202952, "grad_norm": 0.28427846671977475, "learning_rate": 0.0001470212846131959, "loss": 0.0376, "step": 1666 }, { "epoch": 1.2302583025830258, "grad_norm": 0.19547046080121228, "learning_rate": 0.00014694547115876331, "loss": 0.0383, "step": 1667 }, { "epoch": 1.2309963099630996, "grad_norm": 0.15646928733403231, "learning_rate": 0.0001468696230780362, "loss": 0.0709, "step": 1668 }, { "epoch": 1.2317343173431734, "grad_norm": 0.21458481187422587, "learning_rate": 0.00014679374042695905, "loss": 0.0462, "step": 1669 }, { "epoch": 1.2324723247232472, "grad_norm": 0.3980799673587076, "learning_rate": 0.00014671782326150173, "loss": 0.0736, "step": 1670 }, { "epoch": 1.233210332103321, "grad_norm": 0.24817624128528, "learning_rate": 0.00014664187163765968, "loss": 0.0502, "step": 1671 }, { "epoch": 1.2339483394833948, "grad_norm": 0.22881174185253944, "learning_rate": 0.0001465658856114537, "loss": 0.0422, "step": 1672 }, { "epoch": 1.2346863468634686, "grad_norm": 0.24604396240552825, "learning_rate": 0.00014648986523892998, "loss": 0.0568, "step": 1673 }, { "epoch": 1.2354243542435424, "grad_norm": 0.1898894386179584, "learning_rate": 0.00014641381057616004, "loss": 0.0259, "step": 1674 }, { "epoch": 1.2361623616236161, "grad_norm": 0.1646900540470739, "learning_rate": 0.00014633772167924064, "loss": 0.0424, "step": 1675 }, { "epoch": 1.2369003690036902, "grad_norm": 0.36689069634460497, "learning_rate": 0.00014626159860429391, "loss": 0.0457, "step": 1676 }, { "epoch": 1.2376383763837637, "grad_norm": 0.15228426972407064, "learning_rate": 0.0001461854414074671, "loss": 0.0352, "step": 1677 }, { "epoch": 1.2383763837638377, "grad_norm": 0.28120435606599875, "learning_rate": 0.0001461092501449326, "loss": 0.041, "step": 1678 }, { "epoch": 1.2391143911439115, "grad_norm": 0.11554050089432241, "learning_rate": 0.000146033024872888, "loss": 0.0259, "step": 1679 }, { "epoch": 1.2398523985239853, "grad_norm": 0.19528030287243378, "learning_rate": 0.000145956765647556, "loss": 0.0531, "step": 1680 }, { "epoch": 1.2405904059040591, "grad_norm": 0.32166670972471195, "learning_rate": 0.00014588047252518424, "loss": 0.0924, "step": 1681 }, { "epoch": 1.241328413284133, "grad_norm": 0.09196580972943344, "learning_rate": 0.0001458041455620454, "loss": 0.0173, "step": 1682 }, { "epoch": 1.2420664206642067, "grad_norm": 0.09542442102369494, "learning_rate": 0.00014572778481443716, "loss": 0.0257, "step": 1683 }, { "epoch": 1.2428044280442805, "grad_norm": 0.3450063946508746, "learning_rate": 0.00014565139033868212, "loss": 0.0489, "step": 1684 }, { "epoch": 1.2435424354243543, "grad_norm": 0.19339250983721765, "learning_rate": 0.00014557496219112765, "loss": 0.0388, "step": 1685 }, { "epoch": 1.244280442804428, "grad_norm": 0.195568661774384, "learning_rate": 0.00014549850042814611, "loss": 0.0296, "step": 1686 }, { "epoch": 1.2450184501845019, "grad_norm": 0.5000978006204018, "learning_rate": 0.0001454220051061346, "loss": 0.0509, "step": 1687 }, { "epoch": 1.2457564575645756, "grad_norm": 0.12736984594160944, "learning_rate": 0.00014534547628151486, "loss": 0.0288, "step": 1688 }, { "epoch": 1.2464944649446494, "grad_norm": 0.21107503823415352, "learning_rate": 0.0001452689140107335, "loss": 0.0418, "step": 1689 }, { "epoch": 1.2472324723247232, "grad_norm": 0.17227926105898506, "learning_rate": 0.00014519231835026175, "loss": 0.0265, "step": 1690 }, { "epoch": 1.247970479704797, "grad_norm": 0.31314782998381707, "learning_rate": 0.00014511568935659538, "loss": 0.028, "step": 1691 }, { "epoch": 1.2487084870848708, "grad_norm": 0.157524701260555, "learning_rate": 0.00014503902708625486, "loss": 0.0259, "step": 1692 }, { "epoch": 1.2494464944649446, "grad_norm": 0.4112742177717687, "learning_rate": 0.00014496233159578517, "loss": 0.0712, "step": 1693 }, { "epoch": 1.2501845018450184, "grad_norm": 0.18948112472702633, "learning_rate": 0.00014488560294175577, "loss": 0.0368, "step": 1694 }, { "epoch": 1.2509225092250922, "grad_norm": 0.1808895178605321, "learning_rate": 0.0001448088411807606, "loss": 0.0473, "step": 1695 }, { "epoch": 1.251660516605166, "grad_norm": 0.39061220324196994, "learning_rate": 0.00014473204636941803, "loss": 0.0854, "step": 1696 }, { "epoch": 1.25239852398524, "grad_norm": 0.3521720925691335, "learning_rate": 0.00014465521856437077, "loss": 0.0533, "step": 1697 }, { "epoch": 1.2531365313653136, "grad_norm": 0.17652587462795066, "learning_rate": 0.0001445783578222859, "loss": 0.0317, "step": 1698 }, { "epoch": 1.2538745387453876, "grad_norm": 0.13708688790397774, "learning_rate": 0.00014450146419985475, "loss": 0.0252, "step": 1699 }, { "epoch": 1.2546125461254611, "grad_norm": 0.38583149898288377, "learning_rate": 0.00014442453775379298, "loss": 0.0464, "step": 1700 }, { "epoch": 1.2553505535055351, "grad_norm": 0.24065313067411065, "learning_rate": 0.00014434757854084042, "loss": 0.0459, "step": 1701 }, { "epoch": 1.2560885608856087, "grad_norm": 0.19721534676990898, "learning_rate": 0.000144270586617761, "loss": 0.0406, "step": 1702 }, { "epoch": 1.2568265682656827, "grad_norm": 0.27294590371324, "learning_rate": 0.00014419356204134292, "loss": 0.0748, "step": 1703 }, { "epoch": 1.2575645756457565, "grad_norm": 0.17400700835542793, "learning_rate": 0.0001441165048683983, "loss": 0.0262, "step": 1704 }, { "epoch": 1.2583025830258303, "grad_norm": 0.2357968697072145, "learning_rate": 0.00014403941515576344, "loss": 0.0571, "step": 1705 }, { "epoch": 1.259040590405904, "grad_norm": 0.2857312231090668, "learning_rate": 0.00014396229296029857, "loss": 0.0479, "step": 1706 }, { "epoch": 1.259778597785978, "grad_norm": 0.23547094620930795, "learning_rate": 0.00014388513833888793, "loss": 0.0511, "step": 1707 }, { "epoch": 1.2605166051660517, "grad_norm": 0.18486943005413897, "learning_rate": 0.00014380795134843956, "loss": 0.0412, "step": 1708 }, { "epoch": 1.2612546125461255, "grad_norm": 0.1825171338028126, "learning_rate": 0.00014373073204588556, "loss": 0.044, "step": 1709 }, { "epoch": 1.2619926199261993, "grad_norm": 0.2667364021641549, "learning_rate": 0.00014365348048818167, "loss": 0.0388, "step": 1710 }, { "epoch": 1.262730627306273, "grad_norm": 0.17768213823280002, "learning_rate": 0.00014357619673230758, "loss": 0.0272, "step": 1711 }, { "epoch": 1.2634686346863468, "grad_norm": 0.10400399796396317, "learning_rate": 0.00014349888083526664, "loss": 0.0161, "step": 1712 }, { "epoch": 1.2642066420664206, "grad_norm": 0.38790938633057914, "learning_rate": 0.0001434215328540859, "loss": 0.0508, "step": 1713 }, { "epoch": 1.2649446494464944, "grad_norm": 0.12272919876191656, "learning_rate": 0.00014334415284581614, "loss": 0.0242, "step": 1714 }, { "epoch": 1.2656826568265682, "grad_norm": 0.45701571303807725, "learning_rate": 0.0001432667408675317, "loss": 0.1127, "step": 1715 }, { "epoch": 1.266420664206642, "grad_norm": 0.3125303974760942, "learning_rate": 0.00014318929697633052, "loss": 0.0573, "step": 1716 }, { "epoch": 1.2671586715867158, "grad_norm": 0.16778521632948282, "learning_rate": 0.00014311182122933412, "loss": 0.0465, "step": 1717 }, { "epoch": 1.2678966789667896, "grad_norm": 0.27174990059772997, "learning_rate": 0.00014303431368368745, "loss": 0.044, "step": 1718 }, { "epoch": 1.2686346863468634, "grad_norm": 0.11476439808870126, "learning_rate": 0.00014295677439655897, "loss": 0.0459, "step": 1719 }, { "epoch": 1.2693726937269374, "grad_norm": 0.12190675173875236, "learning_rate": 0.0001428792034251405, "loss": 0.0266, "step": 1720 }, { "epoch": 1.270110701107011, "grad_norm": 0.1860315839912534, "learning_rate": 0.0001428016008266473, "loss": 0.0403, "step": 1721 }, { "epoch": 1.270848708487085, "grad_norm": 0.580338099900932, "learning_rate": 0.00014272396665831783, "loss": 0.1126, "step": 1722 }, { "epoch": 1.2715867158671585, "grad_norm": 0.27648799370048316, "learning_rate": 0.000142646300977414, "loss": 0.0467, "step": 1723 }, { "epoch": 1.2723247232472326, "grad_norm": 0.09745760953379083, "learning_rate": 0.00014256860384122082, "loss": 0.0136, "step": 1724 }, { "epoch": 1.2730627306273063, "grad_norm": 0.2162536557998576, "learning_rate": 0.00014249087530704662, "loss": 0.0273, "step": 1725 }, { "epoch": 1.2738007380073801, "grad_norm": 0.21932878960530242, "learning_rate": 0.0001424131154322228, "loss": 0.0611, "step": 1726 }, { "epoch": 1.274538745387454, "grad_norm": 0.38273926603386604, "learning_rate": 0.00014233532427410396, "loss": 0.0857, "step": 1727 }, { "epoch": 1.2752767527675277, "grad_norm": 0.38212941793842176, "learning_rate": 0.00014225750189006764, "loss": 0.0609, "step": 1728 }, { "epoch": 1.2760147601476015, "grad_norm": 0.2719948700747201, "learning_rate": 0.00014217964833751457, "loss": 0.0503, "step": 1729 }, { "epoch": 1.2767527675276753, "grad_norm": 0.15446806615514422, "learning_rate": 0.0001421017636738683, "loss": 0.0257, "step": 1730 }, { "epoch": 1.277490774907749, "grad_norm": 0.19353571579154893, "learning_rate": 0.00014202384795657555, "loss": 0.0293, "step": 1731 }, { "epoch": 1.2782287822878229, "grad_norm": 0.3468466808635855, "learning_rate": 0.0001419459012431057, "loss": 0.0637, "step": 1732 }, { "epoch": 1.2789667896678967, "grad_norm": 0.2561329868061887, "learning_rate": 0.0001418679235909512, "loss": 0.0597, "step": 1733 }, { "epoch": 1.2797047970479705, "grad_norm": 0.29232502235959146, "learning_rate": 0.00014178991505762719, "loss": 0.0579, "step": 1734 }, { "epoch": 1.2804428044280443, "grad_norm": 0.3566462307160927, "learning_rate": 0.0001417118757006716, "loss": 0.147, "step": 1735 }, { "epoch": 1.281180811808118, "grad_norm": 0.23483279269697108, "learning_rate": 0.00014163380557764515, "loss": 0.0557, "step": 1736 }, { "epoch": 1.2819188191881918, "grad_norm": 0.2142327580808574, "learning_rate": 0.0001415557047461312, "loss": 0.0318, "step": 1737 }, { "epoch": 1.2826568265682656, "grad_norm": 0.13974757081419148, "learning_rate": 0.0001414775732637358, "loss": 0.0306, "step": 1738 }, { "epoch": 1.2833948339483394, "grad_norm": 0.21842878325591927, "learning_rate": 0.00014139941118808763, "loss": 0.0361, "step": 1739 }, { "epoch": 1.2841328413284132, "grad_norm": 0.23785080910079962, "learning_rate": 0.00014132121857683783, "loss": 0.0334, "step": 1740 }, { "epoch": 1.2848708487084872, "grad_norm": 0.43690481321911373, "learning_rate": 0.0001412429954876602, "loss": 0.0376, "step": 1741 }, { "epoch": 1.2856088560885608, "grad_norm": 0.13440725794398542, "learning_rate": 0.00014116474197825083, "loss": 0.0288, "step": 1742 }, { "epoch": 1.2863468634686348, "grad_norm": 0.22478084932743278, "learning_rate": 0.0001410864581063285, "loss": 0.0563, "step": 1743 }, { "epoch": 1.2870848708487084, "grad_norm": 0.23508578786008993, "learning_rate": 0.00014100814392963416, "loss": 0.0561, "step": 1744 }, { "epoch": 1.2878228782287824, "grad_norm": 0.281940659103068, "learning_rate": 0.00014092979950593125, "loss": 0.031, "step": 1745 }, { "epoch": 1.288560885608856, "grad_norm": 0.24907742772101701, "learning_rate": 0.00014085142489300546, "loss": 0.0329, "step": 1746 }, { "epoch": 1.28929889298893, "grad_norm": 0.1583613940783398, "learning_rate": 0.00014077302014866482, "loss": 0.032, "step": 1747 }, { "epoch": 1.2900369003690038, "grad_norm": 0.11518031020877559, "learning_rate": 0.00014069458533073938, "loss": 0.0395, "step": 1748 }, { "epoch": 1.2907749077490775, "grad_norm": 0.4950204203591652, "learning_rate": 0.00014061612049708167, "loss": 0.0877, "step": 1749 }, { "epoch": 1.2915129151291513, "grad_norm": 0.17145374271969968, "learning_rate": 0.00014053762570556607, "loss": 0.036, "step": 1750 }, { "epoch": 1.2922509225092251, "grad_norm": 0.1895087386821946, "learning_rate": 0.00014045910101408935, "loss": 0.0423, "step": 1751 }, { "epoch": 1.292988929889299, "grad_norm": 0.17398114151217503, "learning_rate": 0.00014038054648057003, "loss": 0.0248, "step": 1752 }, { "epoch": 1.2937269372693727, "grad_norm": 0.1503514157282056, "learning_rate": 0.00014030196216294888, "loss": 0.0489, "step": 1753 }, { "epoch": 1.2944649446494465, "grad_norm": 0.17311538569271412, "learning_rate": 0.00014022334811918853, "loss": 0.0302, "step": 1754 }, { "epoch": 1.2952029520295203, "grad_norm": 0.19831896601397644, "learning_rate": 0.0001401447044072735, "loss": 0.0366, "step": 1755 }, { "epoch": 1.295940959409594, "grad_norm": 0.14962358195204956, "learning_rate": 0.0001400660310852103, "loss": 0.0208, "step": 1756 }, { "epoch": 1.2966789667896679, "grad_norm": 0.4461184275107165, "learning_rate": 0.00013998732821102723, "loss": 0.106, "step": 1757 }, { "epoch": 1.2974169741697417, "grad_norm": 0.16329700389479704, "learning_rate": 0.00013990859584277432, "loss": 0.0312, "step": 1758 }, { "epoch": 1.2981549815498155, "grad_norm": 0.23032209827757844, "learning_rate": 0.00013982983403852347, "loss": 0.0502, "step": 1759 }, { "epoch": 1.2988929889298892, "grad_norm": 0.2639011456269338, "learning_rate": 0.0001397510428563682, "loss": 0.0322, "step": 1760 }, { "epoch": 1.299630996309963, "grad_norm": 0.19840926231978465, "learning_rate": 0.0001396722223544238, "loss": 0.0492, "step": 1761 }, { "epoch": 1.3003690036900368, "grad_norm": 0.5759119585642678, "learning_rate": 0.00013959337259082704, "loss": 0.1078, "step": 1762 }, { "epoch": 1.3011070110701106, "grad_norm": 0.2943186953817033, "learning_rate": 0.00013951449362373643, "loss": 0.0733, "step": 1763 }, { "epoch": 1.3018450184501846, "grad_norm": 0.2911648155229898, "learning_rate": 0.00013943558551133186, "loss": 0.0523, "step": 1764 }, { "epoch": 1.3025830258302582, "grad_norm": 0.40773901038716226, "learning_rate": 0.0001393566483118149, "loss": 0.0412, "step": 1765 }, { "epoch": 1.3033210332103322, "grad_norm": 0.23937308722398, "learning_rate": 0.00013927768208340838, "loss": 0.0432, "step": 1766 }, { "epoch": 1.3040590405904058, "grad_norm": 0.17891528674933954, "learning_rate": 0.00013919868688435676, "loss": 0.0736, "step": 1767 }, { "epoch": 1.3047970479704798, "grad_norm": 0.24789204721227812, "learning_rate": 0.00013911966277292562, "loss": 0.0622, "step": 1768 }, { "epoch": 1.3055350553505536, "grad_norm": 0.4121051662542013, "learning_rate": 0.00013904060980740203, "loss": 0.0965, "step": 1769 }, { "epoch": 1.3062730627306274, "grad_norm": 0.21374482234754016, "learning_rate": 0.00013896152804609434, "loss": 0.0591, "step": 1770 }, { "epoch": 1.3070110701107012, "grad_norm": 0.3155430655542807, "learning_rate": 0.00013888241754733208, "loss": 0.0532, "step": 1771 }, { "epoch": 1.307749077490775, "grad_norm": 0.2574220652865253, "learning_rate": 0.000138803278369466, "loss": 0.056, "step": 1772 }, { "epoch": 1.3084870848708487, "grad_norm": 0.2472710038873888, "learning_rate": 0.000138724110570868, "loss": 0.0828, "step": 1773 }, { "epoch": 1.3092250922509225, "grad_norm": 0.13910550174741562, "learning_rate": 0.00013864491420993112, "loss": 0.0196, "step": 1774 }, { "epoch": 1.3099630996309963, "grad_norm": 0.3860985032768373, "learning_rate": 0.0001385656893450694, "loss": 0.0346, "step": 1775 }, { "epoch": 1.3107011070110701, "grad_norm": 0.2912733367973824, "learning_rate": 0.0001384864360347179, "loss": 0.0472, "step": 1776 }, { "epoch": 1.311439114391144, "grad_norm": 0.2847895926489688, "learning_rate": 0.00013840715433733288, "loss": 0.071, "step": 1777 }, { "epoch": 1.3121771217712177, "grad_norm": 0.1933283944339423, "learning_rate": 0.00013832784431139117, "loss": 0.0457, "step": 1778 }, { "epoch": 1.3129151291512915, "grad_norm": 0.17830300895056103, "learning_rate": 0.0001382485060153908, "loss": 0.0334, "step": 1779 }, { "epoch": 1.3136531365313653, "grad_norm": 0.3333791435205872, "learning_rate": 0.00013816913950785047, "loss": 0.0598, "step": 1780 }, { "epoch": 1.314391143911439, "grad_norm": 0.18389111840371217, "learning_rate": 0.00013808974484730982, "loss": 0.0463, "step": 1781 }, { "epoch": 1.3151291512915129, "grad_norm": 0.41194658419886354, "learning_rate": 0.00013801032209232917, "loss": 0.0558, "step": 1782 }, { "epoch": 1.3158671586715867, "grad_norm": 0.2847032805458432, "learning_rate": 0.0001379308713014896, "loss": 0.0494, "step": 1783 }, { "epoch": 1.3166051660516604, "grad_norm": 0.09095978453709994, "learning_rate": 0.00013785139253339279, "loss": 0.0226, "step": 1784 }, { "epoch": 1.3173431734317342, "grad_norm": 0.24101912224818434, "learning_rate": 0.0001377718858466612, "loss": 0.0453, "step": 1785 }, { "epoch": 1.318081180811808, "grad_norm": 0.39793296133776906, "learning_rate": 0.00013769235129993773, "loss": 0.0604, "step": 1786 }, { "epoch": 1.318819188191882, "grad_norm": 0.46298587185864787, "learning_rate": 0.00013761278895188598, "loss": 0.0815, "step": 1787 }, { "epoch": 1.3195571955719556, "grad_norm": 0.4630036019048938, "learning_rate": 0.00013753319886118995, "loss": 0.0786, "step": 1788 }, { "epoch": 1.3202952029520296, "grad_norm": 0.3112276457287947, "learning_rate": 0.0001374535810865541, "loss": 0.0564, "step": 1789 }, { "epoch": 1.3210332103321032, "grad_norm": 0.19398946015469995, "learning_rate": 0.00013737393568670334, "loss": 0.0262, "step": 1790 }, { "epoch": 1.3217712177121772, "grad_norm": 0.45264571939711795, "learning_rate": 0.00013729426272038298, "loss": 0.082, "step": 1791 }, { "epoch": 1.322509225092251, "grad_norm": 0.20159038168976476, "learning_rate": 0.0001372145622463586, "loss": 0.0459, "step": 1792 }, { "epoch": 1.3232472324723248, "grad_norm": 0.1489995197538031, "learning_rate": 0.00013713483432341617, "loss": 0.0341, "step": 1793 }, { "epoch": 1.3239852398523986, "grad_norm": 0.21698139007596637, "learning_rate": 0.00013705507901036178, "loss": 0.0373, "step": 1794 }, { "epoch": 1.3247232472324724, "grad_norm": 0.35753551382865384, "learning_rate": 0.00013697529636602182, "loss": 0.0354, "step": 1795 }, { "epoch": 1.3254612546125462, "grad_norm": 0.2538695383034875, "learning_rate": 0.00013689548644924278, "loss": 0.0502, "step": 1796 }, { "epoch": 1.32619926199262, "grad_norm": 0.18342363688138344, "learning_rate": 0.00013681564931889136, "loss": 0.0323, "step": 1797 }, { "epoch": 1.3269372693726937, "grad_norm": 0.31133139231128903, "learning_rate": 0.00013673578503385416, "loss": 0.0638, "step": 1798 }, { "epoch": 1.3276752767527675, "grad_norm": 0.2018121977043779, "learning_rate": 0.00013665589365303798, "loss": 0.05, "step": 1799 }, { "epoch": 1.3284132841328413, "grad_norm": 0.1800959706108409, "learning_rate": 0.00013657597523536948, "loss": 0.0508, "step": 1800 }, { "epoch": 1.3291512915129151, "grad_norm": 0.13767496536459153, "learning_rate": 0.0001364960298397954, "loss": 0.0208, "step": 1801 }, { "epoch": 1.329889298892989, "grad_norm": 0.34897040678510743, "learning_rate": 0.00013641605752528224, "loss": 0.056, "step": 1802 }, { "epoch": 1.3306273062730627, "grad_norm": 0.4050029470381421, "learning_rate": 0.0001363360583508164, "loss": 0.0574, "step": 1803 }, { "epoch": 1.3313653136531365, "grad_norm": 0.5384151981133406, "learning_rate": 0.00013625603237540416, "loss": 0.0621, "step": 1804 }, { "epoch": 1.3321033210332103, "grad_norm": 0.17213242230892858, "learning_rate": 0.00013617597965807145, "loss": 0.0356, "step": 1805 }, { "epoch": 1.332841328413284, "grad_norm": 0.7730448381959296, "learning_rate": 0.00013609590025786403, "loss": 0.0673, "step": 1806 }, { "epoch": 1.3335793357933579, "grad_norm": 0.15956177491857756, "learning_rate": 0.0001360157942338473, "loss": 0.0386, "step": 1807 }, { "epoch": 1.3343173431734319, "grad_norm": 0.12439016384137483, "learning_rate": 0.00013593566164510628, "loss": 0.0416, "step": 1808 }, { "epoch": 1.3350553505535054, "grad_norm": 0.2135217834426172, "learning_rate": 0.00013585550255074553, "loss": 0.0333, "step": 1809 }, { "epoch": 1.3357933579335795, "grad_norm": 0.23269866459335492, "learning_rate": 0.00013577531700988935, "loss": 0.0496, "step": 1810 }, { "epoch": 1.336531365313653, "grad_norm": 0.3550009025331513, "learning_rate": 0.00013569510508168136, "loss": 0.0963, "step": 1811 }, { "epoch": 1.337269372693727, "grad_norm": 0.23293528039871877, "learning_rate": 0.0001356148668252847, "loss": 0.0371, "step": 1812 }, { "epoch": 1.3380073800738006, "grad_norm": 0.25464492609830647, "learning_rate": 0.000135534602299882, "loss": 0.0596, "step": 1813 }, { "epoch": 1.3387453874538746, "grad_norm": 0.34595619533058114, "learning_rate": 0.0001354543115646751, "loss": 0.0939, "step": 1814 }, { "epoch": 1.3394833948339484, "grad_norm": 0.21515911510488223, "learning_rate": 0.00013537399467888537, "loss": 0.1306, "step": 1815 }, { "epoch": 1.3402214022140222, "grad_norm": 0.22757465136749414, "learning_rate": 0.00013529365170175333, "loss": 0.0613, "step": 1816 }, { "epoch": 1.340959409594096, "grad_norm": 0.18032610716226127, "learning_rate": 0.00013521328269253878, "loss": 0.0276, "step": 1817 }, { "epoch": 1.3416974169741698, "grad_norm": 0.18361006448696426, "learning_rate": 0.00013513288771052073, "loss": 0.0317, "step": 1818 }, { "epoch": 1.3424354243542436, "grad_norm": 0.3459600831583869, "learning_rate": 0.00013505246681499734, "loss": 0.0453, "step": 1819 }, { "epoch": 1.3431734317343174, "grad_norm": 0.14182014150782046, "learning_rate": 0.00013497202006528596, "loss": 0.0302, "step": 1820 }, { "epoch": 1.3439114391143911, "grad_norm": 0.14468895817304572, "learning_rate": 0.00013489154752072287, "loss": 0.0273, "step": 1821 }, { "epoch": 1.344649446494465, "grad_norm": 0.3107119692170402, "learning_rate": 0.00013481104924066342, "loss": 0.065, "step": 1822 }, { "epoch": 1.3453874538745387, "grad_norm": 0.36420787959776046, "learning_rate": 0.00013473052528448201, "loss": 0.0522, "step": 1823 }, { "epoch": 1.3461254612546125, "grad_norm": 0.4940935170217732, "learning_rate": 0.00013464997571157198, "loss": 0.0816, "step": 1824 }, { "epoch": 1.3468634686346863, "grad_norm": 0.2780284451382539, "learning_rate": 0.00013456940058134543, "loss": 0.0795, "step": 1825 }, { "epoch": 1.34760147601476, "grad_norm": 0.17236085750802976, "learning_rate": 0.00013448879995323345, "loss": 0.0442, "step": 1826 }, { "epoch": 1.348339483394834, "grad_norm": 0.13362040036951406, "learning_rate": 0.00013440817388668584, "loss": 0.0198, "step": 1827 }, { "epoch": 1.3490774907749077, "grad_norm": 0.25687551897377886, "learning_rate": 0.00013432752244117133, "loss": 0.0474, "step": 1828 }, { "epoch": 1.3498154981549815, "grad_norm": 0.09792548504158978, "learning_rate": 0.00013424684567617712, "loss": 0.0175, "step": 1829 }, { "epoch": 1.3505535055350553, "grad_norm": 0.19962962137777715, "learning_rate": 0.00013416614365120924, "loss": 0.0474, "step": 1830 }, { "epoch": 1.3512915129151293, "grad_norm": 0.20031344109164925, "learning_rate": 0.00013408541642579238, "loss": 0.0533, "step": 1831 }, { "epoch": 1.3520295202952028, "grad_norm": 0.2992433176167721, "learning_rate": 0.00013400466405946973, "loss": 0.0632, "step": 1832 }, { "epoch": 1.3527675276752769, "grad_norm": 0.29725741001906125, "learning_rate": 0.00013392388661180303, "loss": 0.0828, "step": 1833 }, { "epoch": 1.3535055350553504, "grad_norm": 0.14058992184340172, "learning_rate": 0.0001338430841423726, "loss": 0.0214, "step": 1834 }, { "epoch": 1.3542435424354244, "grad_norm": 0.1923249739758672, "learning_rate": 0.00013376225671077714, "loss": 0.0346, "step": 1835 }, { "epoch": 1.3549815498154982, "grad_norm": 0.12155997486921089, "learning_rate": 0.00013368140437663376, "loss": 0.0178, "step": 1836 }, { "epoch": 1.355719557195572, "grad_norm": 0.23280210135921484, "learning_rate": 0.000133600527199578, "loss": 0.0485, "step": 1837 }, { "epoch": 1.3564575645756458, "grad_norm": 0.5236535795267556, "learning_rate": 0.00013351962523926365, "loss": 0.0791, "step": 1838 }, { "epoch": 1.3571955719557196, "grad_norm": 0.14937797822163676, "learning_rate": 0.00013343869855536285, "loss": 0.0308, "step": 1839 }, { "epoch": 1.3579335793357934, "grad_norm": 0.66119565493917, "learning_rate": 0.0001333577472075659, "loss": 0.0667, "step": 1840 }, { "epoch": 1.3586715867158672, "grad_norm": 0.38836761254726554, "learning_rate": 0.0001332767712555814, "loss": 0.1153, "step": 1841 }, { "epoch": 1.359409594095941, "grad_norm": 0.31228194044486546, "learning_rate": 0.00013319577075913597, "loss": 0.0578, "step": 1842 }, { "epoch": 1.3601476014760148, "grad_norm": 0.308582883771972, "learning_rate": 0.0001331147457779744, "loss": 0.0397, "step": 1843 }, { "epoch": 1.3608856088560886, "grad_norm": 0.21979363993903292, "learning_rate": 0.00013303369637185958, "loss": 0.0343, "step": 1844 }, { "epoch": 1.3616236162361623, "grad_norm": 0.29815407308619063, "learning_rate": 0.00013295262260057232, "loss": 0.0502, "step": 1845 }, { "epoch": 1.3623616236162361, "grad_norm": 0.27964195738029246, "learning_rate": 0.00013287152452391146, "loss": 0.0637, "step": 1846 }, { "epoch": 1.36309963099631, "grad_norm": 0.15321465835090017, "learning_rate": 0.00013279040220169375, "loss": 0.0211, "step": 1847 }, { "epoch": 1.3638376383763837, "grad_norm": 0.24279670346325563, "learning_rate": 0.00013270925569375388, "loss": 0.0278, "step": 1848 }, { "epoch": 1.3645756457564575, "grad_norm": 0.34246793558995164, "learning_rate": 0.00013262808505994425, "loss": 0.0394, "step": 1849 }, { "epoch": 1.3653136531365313, "grad_norm": 0.36519137966041215, "learning_rate": 0.00013254689036013524, "loss": 0.0584, "step": 1850 }, { "epoch": 1.366051660516605, "grad_norm": 0.1360797655559626, "learning_rate": 0.00013246567165421476, "loss": 0.0461, "step": 1851 }, { "epoch": 1.3667896678966789, "grad_norm": 0.3136424445656738, "learning_rate": 0.00013238442900208864, "loss": 0.0817, "step": 1852 }, { "epoch": 1.3675276752767527, "grad_norm": 0.1521695965715814, "learning_rate": 0.00013230316246368021, "loss": 0.0287, "step": 1853 }, { "epoch": 1.3682656826568267, "grad_norm": 0.13819711880169191, "learning_rate": 0.00013222187209893053, "loss": 0.0329, "step": 1854 }, { "epoch": 1.3690036900369003, "grad_norm": 0.2584966908879223, "learning_rate": 0.00013214055796779815, "loss": 0.0553, "step": 1855 }, { "epoch": 1.3697416974169743, "grad_norm": 0.19525351961674142, "learning_rate": 0.00013205922013025923, "loss": 0.0338, "step": 1856 }, { "epoch": 1.3704797047970478, "grad_norm": 0.3391909465665553, "learning_rate": 0.0001319778586463073, "loss": 0.0584, "step": 1857 }, { "epoch": 1.3712177121771219, "grad_norm": 0.17244994734565489, "learning_rate": 0.00013189647357595346, "loss": 0.0312, "step": 1858 }, { "epoch": 1.3719557195571956, "grad_norm": 0.32882196669905345, "learning_rate": 0.00013181506497922613, "loss": 0.0454, "step": 1859 }, { "epoch": 1.3726937269372694, "grad_norm": 0.4353381160877063, "learning_rate": 0.00013173363291617114, "loss": 0.0652, "step": 1860 }, { "epoch": 1.3734317343173432, "grad_norm": 0.33477717470126617, "learning_rate": 0.0001316521774468515, "loss": 0.0484, "step": 1861 }, { "epoch": 1.374169741697417, "grad_norm": 0.16389077554442294, "learning_rate": 0.00013157069863134772, "loss": 0.0339, "step": 1862 }, { "epoch": 1.3749077490774908, "grad_norm": 0.3706793967052937, "learning_rate": 0.00013148919652975725, "loss": 0.0446, "step": 1863 }, { "epoch": 1.3756457564575646, "grad_norm": 0.13316375060871505, "learning_rate": 0.0001314076712021949, "loss": 0.0236, "step": 1864 }, { "epoch": 1.3763837638376384, "grad_norm": 0.1293906202395038, "learning_rate": 0.00013132612270879256, "loss": 0.0197, "step": 1865 }, { "epoch": 1.3771217712177122, "grad_norm": 0.2653844833967906, "learning_rate": 0.00013124455110969925, "loss": 0.0553, "step": 1866 }, { "epoch": 1.377859778597786, "grad_norm": 0.18414365089447757, "learning_rate": 0.0001311629564650809, "loss": 0.039, "step": 1867 }, { "epoch": 1.3785977859778598, "grad_norm": 0.14074965376601048, "learning_rate": 0.00013108133883512065, "loss": 0.0238, "step": 1868 }, { "epoch": 1.3793357933579335, "grad_norm": 0.3315591034709597, "learning_rate": 0.00013099969828001836, "loss": 0.0392, "step": 1869 }, { "epoch": 1.3800738007380073, "grad_norm": 0.5065610333256214, "learning_rate": 0.000130918034859991, "loss": 0.0782, "step": 1870 }, { "epoch": 1.3808118081180811, "grad_norm": 0.23983919018006286, "learning_rate": 0.00013083634863527221, "loss": 0.0403, "step": 1871 }, { "epoch": 1.381549815498155, "grad_norm": 0.4862656324643851, "learning_rate": 0.00013075463966611268, "loss": 0.1404, "step": 1872 }, { "epoch": 1.3822878228782287, "grad_norm": 0.2019626063300029, "learning_rate": 0.00013067290801277968, "loss": 0.09, "step": 1873 }, { "epoch": 1.3830258302583025, "grad_norm": 0.2419650705939459, "learning_rate": 0.0001305911537355573, "loss": 0.0569, "step": 1874 }, { "epoch": 1.3837638376383765, "grad_norm": 0.29626202554233816, "learning_rate": 0.0001305093768947463, "loss": 0.0639, "step": 1875 }, { "epoch": 1.38450184501845, "grad_norm": 0.16252168809377016, "learning_rate": 0.0001304275775506641, "loss": 0.026, "step": 1876 }, { "epoch": 1.385239852398524, "grad_norm": 0.3505044699709797, "learning_rate": 0.00013034575576364467, "loss": 0.0439, "step": 1877 }, { "epoch": 1.3859778597785977, "grad_norm": 0.2563916981827426, "learning_rate": 0.0001302639115940386, "loss": 0.0596, "step": 1878 }, { "epoch": 1.3867158671586717, "grad_norm": 0.39235577894010176, "learning_rate": 0.00013018204510221293, "loss": 0.1027, "step": 1879 }, { "epoch": 1.3874538745387455, "grad_norm": 0.12076400830627383, "learning_rate": 0.00013010015634855123, "loss": 0.0252, "step": 1880 }, { "epoch": 1.3881918819188193, "grad_norm": 0.21199429242824616, "learning_rate": 0.0001300182453934534, "loss": 0.0365, "step": 1881 }, { "epoch": 1.388929889298893, "grad_norm": 0.1761859732388295, "learning_rate": 0.00012993631229733582, "loss": 0.0453, "step": 1882 }, { "epoch": 1.3896678966789668, "grad_norm": 0.14740350188581797, "learning_rate": 0.00012985435712063108, "loss": 0.0303, "step": 1883 }, { "epoch": 1.3904059040590406, "grad_norm": 0.24993424496789154, "learning_rate": 0.00012977237992378818, "loss": 0.0422, "step": 1884 }, { "epoch": 1.3911439114391144, "grad_norm": 0.13190015147225276, "learning_rate": 0.00012969038076727225, "loss": 0.0293, "step": 1885 }, { "epoch": 1.3918819188191882, "grad_norm": 0.2636440573786571, "learning_rate": 0.0001296083597115647, "loss": 0.0474, "step": 1886 }, { "epoch": 1.392619926199262, "grad_norm": 0.3314838172847557, "learning_rate": 0.0001295263168171631, "loss": 0.0589, "step": 1887 }, { "epoch": 1.3933579335793358, "grad_norm": 0.19889947417062084, "learning_rate": 0.00012944425214458103, "loss": 0.0396, "step": 1888 }, { "epoch": 1.3940959409594096, "grad_norm": 0.17787157538597057, "learning_rate": 0.00012936216575434823, "loss": 0.0425, "step": 1889 }, { "epoch": 1.3948339483394834, "grad_norm": 0.13764665913519644, "learning_rate": 0.0001292800577070104, "loss": 0.0261, "step": 1890 }, { "epoch": 1.3955719557195572, "grad_norm": 0.11108986659815306, "learning_rate": 0.00012919792806312928, "loss": 0.0234, "step": 1891 }, { "epoch": 1.396309963099631, "grad_norm": 0.08370165496975455, "learning_rate": 0.00012911577688328246, "loss": 0.0172, "step": 1892 }, { "epoch": 1.3970479704797047, "grad_norm": 0.19401879969930097, "learning_rate": 0.00012903360422806347, "loss": 0.0374, "step": 1893 }, { "epoch": 1.3977859778597785, "grad_norm": 0.20047183356091247, "learning_rate": 0.00012895141015808163, "loss": 0.0413, "step": 1894 }, { "epoch": 1.3985239852398523, "grad_norm": 0.5309912512083907, "learning_rate": 0.0001288691947339621, "loss": 0.1315, "step": 1895 }, { "epoch": 1.3992619926199261, "grad_norm": 0.22378544328166702, "learning_rate": 0.00012878695801634582, "loss": 0.0497, "step": 1896 }, { "epoch": 1.4, "grad_norm": 0.22621278578842385, "learning_rate": 0.00012870470006588934, "loss": 0.0212, "step": 1897 }, { "epoch": 1.400738007380074, "grad_norm": 0.23614058605743501, "learning_rate": 0.00012862242094326498, "loss": 0.045, "step": 1898 }, { "epoch": 1.4014760147601475, "grad_norm": 0.17078026708467292, "learning_rate": 0.00012854012070916053, "loss": 0.0389, "step": 1899 }, { "epoch": 1.4022140221402215, "grad_norm": 0.11338163051055541, "learning_rate": 0.00012845779942427955, "loss": 0.0228, "step": 1900 }, { "epoch": 1.402952029520295, "grad_norm": 0.23856093294083927, "learning_rate": 0.00012837545714934091, "loss": 0.042, "step": 1901 }, { "epoch": 1.403690036900369, "grad_norm": 0.125518314995796, "learning_rate": 0.00012829309394507915, "loss": 0.0259, "step": 1902 }, { "epoch": 1.4044280442804429, "grad_norm": 0.2317980009158916, "learning_rate": 0.00012821070987224415, "loss": 0.0534, "step": 1903 }, { "epoch": 1.4051660516605167, "grad_norm": 0.22358561164995713, "learning_rate": 0.0001281283049916012, "loss": 0.0496, "step": 1904 }, { "epoch": 1.4059040590405905, "grad_norm": 0.2952231413466771, "learning_rate": 0.0001280458793639309, "loss": 0.0531, "step": 1905 }, { "epoch": 1.4066420664206642, "grad_norm": 0.11693765860241973, "learning_rate": 0.00012796343305002925, "loss": 0.0333, "step": 1906 }, { "epoch": 1.407380073800738, "grad_norm": 0.19837451634966308, "learning_rate": 0.0001278809661107074, "loss": 0.0261, "step": 1907 }, { "epoch": 1.4081180811808118, "grad_norm": 0.22229572029713696, "learning_rate": 0.00012779847860679177, "loss": 0.0321, "step": 1908 }, { "epoch": 1.4088560885608856, "grad_norm": 0.3012028688011334, "learning_rate": 0.000127715970599124, "loss": 0.0526, "step": 1909 }, { "epoch": 1.4095940959409594, "grad_norm": 0.40496531189040186, "learning_rate": 0.00012763344214856067, "loss": 0.0581, "step": 1910 }, { "epoch": 1.4103321033210332, "grad_norm": 0.31254333480167446, "learning_rate": 0.00012755089331597367, "loss": 0.064, "step": 1911 }, { "epoch": 1.411070110701107, "grad_norm": 0.20121787332631938, "learning_rate": 0.0001274683241622498, "loss": 0.0391, "step": 1912 }, { "epoch": 1.4118081180811808, "grad_norm": 0.17640590038965634, "learning_rate": 0.0001273857347482908, "loss": 0.0399, "step": 1913 }, { "epoch": 1.4125461254612546, "grad_norm": 0.42662241780723087, "learning_rate": 0.00012730312513501346, "loss": 0.0594, "step": 1914 }, { "epoch": 1.4132841328413284, "grad_norm": 0.26832440173956157, "learning_rate": 0.0001272204953833494, "loss": 0.0384, "step": 1915 }, { "epoch": 1.4140221402214022, "grad_norm": 0.12231564181453164, "learning_rate": 0.0001271378455542452, "loss": 0.0271, "step": 1916 }, { "epoch": 1.414760147601476, "grad_norm": 0.2606251296658726, "learning_rate": 0.00012705517570866208, "loss": 0.044, "step": 1917 }, { "epoch": 1.4154981549815497, "grad_norm": 0.29945411865392557, "learning_rate": 0.0001269724859075761, "loss": 0.0471, "step": 1918 }, { "epoch": 1.4162361623616238, "grad_norm": 0.09731254179497746, "learning_rate": 0.00012688977621197814, "loss": 0.0216, "step": 1919 }, { "epoch": 1.4169741697416973, "grad_norm": 0.43920433407616744, "learning_rate": 0.00012680704668287363, "loss": 0.1144, "step": 1920 }, { "epoch": 1.4177121771217713, "grad_norm": 0.15778584787892594, "learning_rate": 0.0001267242973812826, "loss": 0.0297, "step": 1921 }, { "epoch": 1.418450184501845, "grad_norm": 0.21946573759185373, "learning_rate": 0.00012664152836823982, "loss": 0.0358, "step": 1922 }, { "epoch": 1.419188191881919, "grad_norm": 0.1931866390227871, "learning_rate": 0.00012655873970479444, "loss": 0.0394, "step": 1923 }, { "epoch": 1.4199261992619925, "grad_norm": 0.28788400147985493, "learning_rate": 0.00012647593145201017, "loss": 0.0694, "step": 1924 }, { "epoch": 1.4206642066420665, "grad_norm": 0.12400662399563114, "learning_rate": 0.00012639310367096524, "loss": 0.0256, "step": 1925 }, { "epoch": 1.4214022140221403, "grad_norm": 0.16618405468660763, "learning_rate": 0.00012631025642275212, "loss": 0.0642, "step": 1926 }, { "epoch": 1.422140221402214, "grad_norm": 0.20486543722566317, "learning_rate": 0.0001262273897684778, "loss": 0.0484, "step": 1927 }, { "epoch": 1.4228782287822879, "grad_norm": 0.19867498900157554, "learning_rate": 0.0001261445037692635, "loss": 0.0311, "step": 1928 }, { "epoch": 1.4236162361623617, "grad_norm": 0.35152050023034986, "learning_rate": 0.00012606159848624473, "loss": 0.0852, "step": 1929 }, { "epoch": 1.4243542435424354, "grad_norm": 0.359373810421075, "learning_rate": 0.00012597867398057115, "loss": 0.0758, "step": 1930 }, { "epoch": 1.4250922509225092, "grad_norm": 0.3054682152176191, "learning_rate": 0.00012589573031340673, "loss": 0.0622, "step": 1931 }, { "epoch": 1.425830258302583, "grad_norm": 0.28748454119211225, "learning_rate": 0.0001258127675459295, "loss": 0.0648, "step": 1932 }, { "epoch": 1.4265682656826568, "grad_norm": 0.2724047650018948, "learning_rate": 0.0001257297857393316, "loss": 0.0591, "step": 1933 }, { "epoch": 1.4273062730627306, "grad_norm": 0.3084667903158492, "learning_rate": 0.00012564678495481917, "loss": 0.0476, "step": 1934 }, { "epoch": 1.4280442804428044, "grad_norm": 0.13875031305177163, "learning_rate": 0.00012556376525361238, "loss": 0.0292, "step": 1935 }, { "epoch": 1.4287822878228782, "grad_norm": 0.20869788431912267, "learning_rate": 0.00012548072669694537, "loss": 0.0322, "step": 1936 }, { "epoch": 1.429520295202952, "grad_norm": 0.08052383538252542, "learning_rate": 0.00012539766934606617, "loss": 0.0106, "step": 1937 }, { "epoch": 1.4302583025830258, "grad_norm": 0.3566058145135655, "learning_rate": 0.00012531459326223663, "loss": 0.06, "step": 1938 }, { "epoch": 1.4309963099630996, "grad_norm": 0.23246428084650267, "learning_rate": 0.0001252314985067325, "loss": 0.0797, "step": 1939 }, { "epoch": 1.4317343173431734, "grad_norm": 0.4703331973056688, "learning_rate": 0.00012514838514084324, "loss": 0.069, "step": 1940 }, { "epoch": 1.4324723247232471, "grad_norm": 0.1737794970310243, "learning_rate": 0.00012506525322587207, "loss": 0.0312, "step": 1941 }, { "epoch": 1.4332103321033212, "grad_norm": 0.18284376602756208, "learning_rate": 0.00012498210282313582, "loss": 0.0404, "step": 1942 }, { "epoch": 1.4339483394833947, "grad_norm": 0.28484200493832684, "learning_rate": 0.00012489893399396515, "loss": 0.0364, "step": 1943 }, { "epoch": 1.4346863468634687, "grad_norm": 0.861745558112136, "learning_rate": 0.00012481574679970402, "loss": 0.1052, "step": 1944 }, { "epoch": 1.4354243542435423, "grad_norm": 0.23941912175411126, "learning_rate": 0.00012473254130171017, "loss": 0.0648, "step": 1945 }, { "epoch": 1.4361623616236163, "grad_norm": 0.1868763882372484, "learning_rate": 0.00012464931756135474, "loss": 0.029, "step": 1946 }, { "epoch": 1.4369003690036901, "grad_norm": 0.16991909664026877, "learning_rate": 0.00012456607564002235, "loss": 0.0223, "step": 1947 }, { "epoch": 1.437638376383764, "grad_norm": 0.19637883776882345, "learning_rate": 0.00012448281559911104, "loss": 0.0358, "step": 1948 }, { "epoch": 1.4383763837638377, "grad_norm": 0.08918532628380625, "learning_rate": 0.0001243995375000322, "loss": 0.0223, "step": 1949 }, { "epoch": 1.4391143911439115, "grad_norm": 0.47541507509657155, "learning_rate": 0.00012431624140421055, "loss": 0.1269, "step": 1950 }, { "epoch": 1.4398523985239853, "grad_norm": 0.1965604772231933, "learning_rate": 0.00012423292737308403, "loss": 0.0394, "step": 1951 }, { "epoch": 1.440590405904059, "grad_norm": 0.3033270761898276, "learning_rate": 0.00012414959546810388, "loss": 0.0675, "step": 1952 }, { "epoch": 1.4413284132841329, "grad_norm": 0.36627144706817116, "learning_rate": 0.0001240662457507345, "loss": 0.0323, "step": 1953 }, { "epoch": 1.4420664206642066, "grad_norm": 0.2804632353824766, "learning_rate": 0.0001239828782824534, "loss": 0.0886, "step": 1954 }, { "epoch": 1.4428044280442804, "grad_norm": 0.22878965687549524, "learning_rate": 0.00012389949312475128, "loss": 0.063, "step": 1955 }, { "epoch": 1.4435424354243542, "grad_norm": 0.47284870844322346, "learning_rate": 0.00012381609033913175, "loss": 0.0569, "step": 1956 }, { "epoch": 1.444280442804428, "grad_norm": 0.20532798509189915, "learning_rate": 0.0001237326699871115, "loss": 0.0466, "step": 1957 }, { "epoch": 1.4450184501845018, "grad_norm": 0.28578165362282, "learning_rate": 0.00012364923213022014, "loss": 0.066, "step": 1958 }, { "epoch": 1.4457564575645756, "grad_norm": 0.08673274425029724, "learning_rate": 0.0001235657768300003, "loss": 0.0207, "step": 1959 }, { "epoch": 1.4464944649446494, "grad_norm": 0.401472566831202, "learning_rate": 0.0001234823041480073, "loss": 0.0518, "step": 1960 }, { "epoch": 1.4472324723247232, "grad_norm": 0.22699982241131558, "learning_rate": 0.00012339881414580943, "loss": 0.0556, "step": 1961 }, { "epoch": 1.447970479704797, "grad_norm": 0.29199829137750627, "learning_rate": 0.00012331530688498764, "loss": 0.0585, "step": 1962 }, { "epoch": 1.4487084870848708, "grad_norm": 0.17656709119417616, "learning_rate": 0.00012323178242713576, "loss": 0.0341, "step": 1963 }, { "epoch": 1.4494464944649446, "grad_norm": 0.13025163565159323, "learning_rate": 0.0001231482408338601, "loss": 0.0252, "step": 1964 }, { "epoch": 1.4501845018450186, "grad_norm": 0.725445070437212, "learning_rate": 0.0001230646821667798, "loss": 0.1166, "step": 1965 }, { "epoch": 1.4509225092250921, "grad_norm": 0.17682783257410734, "learning_rate": 0.00012298110648752649, "loss": 0.0307, "step": 1966 }, { "epoch": 1.4516605166051662, "grad_norm": 0.17092228310162452, "learning_rate": 0.00012289751385774437, "loss": 0.0323, "step": 1967 }, { "epoch": 1.4523985239852397, "grad_norm": 0.15337902065533113, "learning_rate": 0.00012281390433909012, "loss": 0.0332, "step": 1968 }, { "epoch": 1.4531365313653137, "grad_norm": 0.33607067862508, "learning_rate": 0.00012273027799323297, "loss": 0.0529, "step": 1969 }, { "epoch": 1.4538745387453875, "grad_norm": 0.3064478754205691, "learning_rate": 0.0001226466348818544, "loss": 0.0439, "step": 1970 }, { "epoch": 1.4546125461254613, "grad_norm": 0.25153323118848614, "learning_rate": 0.00012256297506664843, "loss": 0.0344, "step": 1971 }, { "epoch": 1.455350553505535, "grad_norm": 0.5461185005379814, "learning_rate": 0.00012247929860932126, "loss": 0.0829, "step": 1972 }, { "epoch": 1.456088560885609, "grad_norm": 0.20080226688180888, "learning_rate": 0.00012239560557159146, "loss": 0.0392, "step": 1973 }, { "epoch": 1.4568265682656827, "grad_norm": 0.3830066028322324, "learning_rate": 0.00012231189601518978, "loss": 0.0762, "step": 1974 }, { "epoch": 1.4575645756457565, "grad_norm": 0.18701582771786554, "learning_rate": 0.00012222817000185918, "loss": 0.0584, "step": 1975 }, { "epoch": 1.4583025830258303, "grad_norm": 0.18765549027157902, "learning_rate": 0.00012214442759335471, "loss": 0.0587, "step": 1976 }, { "epoch": 1.459040590405904, "grad_norm": 0.20807701469490597, "learning_rate": 0.00012206066885144362, "loss": 0.0478, "step": 1977 }, { "epoch": 1.4597785977859778, "grad_norm": 0.3200371804584016, "learning_rate": 0.00012197689383790504, "loss": 0.0572, "step": 1978 }, { "epoch": 1.4605166051660516, "grad_norm": 0.2715108080722178, "learning_rate": 0.00012189310261453028, "loss": 0.0423, "step": 1979 }, { "epoch": 1.4612546125461254, "grad_norm": 0.7483610915938957, "learning_rate": 0.00012180929524312246, "loss": 0.0654, "step": 1980 }, { "epoch": 1.4619926199261992, "grad_norm": 0.4830488601401838, "learning_rate": 0.00012172547178549674, "loss": 0.0502, "step": 1981 }, { "epoch": 1.462730627306273, "grad_norm": 0.10667659719257458, "learning_rate": 0.00012164163230348, "loss": 0.026, "step": 1982 }, { "epoch": 1.4634686346863468, "grad_norm": 0.22574599906319365, "learning_rate": 0.00012155777685891112, "loss": 0.0453, "step": 1983 }, { "epoch": 1.4642066420664206, "grad_norm": 0.3350853012887793, "learning_rate": 0.00012147390551364054, "loss": 0.057, "step": 1984 }, { "epoch": 1.4649446494464944, "grad_norm": 0.3360231880425821, "learning_rate": 0.00012139001832953063, "loss": 0.0526, "step": 1985 }, { "epoch": 1.4656826568265684, "grad_norm": 0.117238228931146, "learning_rate": 0.00012130611536845532, "loss": 0.0524, "step": 1986 }, { "epoch": 1.466420664206642, "grad_norm": 0.46056331629709585, "learning_rate": 0.00012122219669230017, "loss": 0.1465, "step": 1987 }, { "epoch": 1.467158671586716, "grad_norm": 0.1868471773611989, "learning_rate": 0.00012113826236296244, "loss": 0.055, "step": 1988 }, { "epoch": 1.4678966789667895, "grad_norm": 0.821899503084465, "learning_rate": 0.00012105431244235084, "loss": 0.0732, "step": 1989 }, { "epoch": 1.4686346863468636, "grad_norm": 0.34068884168018937, "learning_rate": 0.00012097034699238559, "loss": 0.0987, "step": 1990 }, { "epoch": 1.4693726937269374, "grad_norm": 0.11642665259869332, "learning_rate": 0.00012088636607499842, "loss": 0.0234, "step": 1991 }, { "epoch": 1.4701107011070111, "grad_norm": 0.32791495880735694, "learning_rate": 0.00012080236975213235, "loss": 0.0704, "step": 1992 }, { "epoch": 1.470848708487085, "grad_norm": 0.2837396579872884, "learning_rate": 0.00012071835808574192, "loss": 0.0447, "step": 1993 }, { "epoch": 1.4715867158671587, "grad_norm": 0.19148854686051167, "learning_rate": 0.00012063433113779288, "loss": 0.0277, "step": 1994 }, { "epoch": 1.4723247232472325, "grad_norm": 0.4378671354280766, "learning_rate": 0.0001205502889702623, "loss": 0.0155, "step": 1995 }, { "epoch": 1.4730627306273063, "grad_norm": 0.41230305632205394, "learning_rate": 0.00012046623164513842, "loss": 0.0868, "step": 1996 }, { "epoch": 1.47380073800738, "grad_norm": 0.12487261629425041, "learning_rate": 0.00012038215922442076, "loss": 0.0295, "step": 1997 }, { "epoch": 1.4745387453874539, "grad_norm": 0.14846562046760414, "learning_rate": 0.0001202980717701198, "loss": 0.0288, "step": 1998 }, { "epoch": 1.4752767527675277, "grad_norm": 0.3112181308232579, "learning_rate": 0.00012021396934425735, "loss": 0.0311, "step": 1999 }, { "epoch": 1.4760147601476015, "grad_norm": 0.11728151238569348, "learning_rate": 0.00012012985200886602, "loss": 0.0253, "step": 2000 }, { "epoch": 1.4767527675276753, "grad_norm": 0.23552067363943882, "learning_rate": 0.0001200457198259896, "loss": 0.0527, "step": 2001 }, { "epoch": 1.477490774907749, "grad_norm": 0.2509259556232819, "learning_rate": 0.00011996157285768273, "loss": 0.0443, "step": 2002 }, { "epoch": 1.4782287822878228, "grad_norm": 0.15548488960100398, "learning_rate": 0.000119877411166011, "loss": 0.0515, "step": 2003 }, { "epoch": 1.4789667896678966, "grad_norm": 0.22514871227820316, "learning_rate": 0.00011979323481305088, "loss": 0.0465, "step": 2004 }, { "epoch": 1.4797047970479704, "grad_norm": 0.2415401029446883, "learning_rate": 0.00011970904386088952, "loss": 0.058, "step": 2005 }, { "epoch": 1.4804428044280442, "grad_norm": 0.13458713491351165, "learning_rate": 0.00011962483837162502, "loss": 0.0229, "step": 2006 }, { "epoch": 1.481180811808118, "grad_norm": 0.5702474659628711, "learning_rate": 0.0001195406184073661, "loss": 0.0654, "step": 2007 }, { "epoch": 1.4819188191881918, "grad_norm": 0.14764154124079032, "learning_rate": 0.00011945638403023216, "loss": 0.0262, "step": 2008 }, { "epoch": 1.4826568265682658, "grad_norm": 0.20898084499613287, "learning_rate": 0.0001193721353023533, "loss": 0.0541, "step": 2009 }, { "epoch": 1.4833948339483394, "grad_norm": 0.2571201426116716, "learning_rate": 0.0001192878722858701, "loss": 0.0321, "step": 2010 }, { "epoch": 1.4841328413284134, "grad_norm": 0.2267356615380508, "learning_rate": 0.00011920359504293373, "loss": 0.0482, "step": 2011 }, { "epoch": 1.484870848708487, "grad_norm": 0.31498397427158165, "learning_rate": 0.00011911930363570588, "loss": 0.0632, "step": 2012 }, { "epoch": 1.485608856088561, "grad_norm": 0.5860976859169188, "learning_rate": 0.00011903499812635865, "loss": 0.09, "step": 2013 }, { "epoch": 1.4863468634686348, "grad_norm": 0.1091819823387396, "learning_rate": 0.00011895067857707455, "loss": 0.0232, "step": 2014 }, { "epoch": 1.4870848708487086, "grad_norm": 0.3286725267092742, "learning_rate": 0.00011886634505004647, "loss": 0.0509, "step": 2015 }, { "epoch": 1.4878228782287823, "grad_norm": 0.2649332626067396, "learning_rate": 0.00011878199760747757, "loss": 0.0863, "step": 2016 }, { "epoch": 1.4885608856088561, "grad_norm": 0.17245089128593957, "learning_rate": 0.00011869763631158129, "loss": 0.03, "step": 2017 }, { "epoch": 1.48929889298893, "grad_norm": 0.28146698229465733, "learning_rate": 0.00011861326122458132, "loss": 0.0502, "step": 2018 }, { "epoch": 1.4900369003690037, "grad_norm": 0.22588146105179, "learning_rate": 0.00011852887240871145, "loss": 0.0619, "step": 2019 }, { "epoch": 1.4907749077490775, "grad_norm": 0.23267813401005466, "learning_rate": 0.00011844446992621565, "loss": 0.0374, "step": 2020 }, { "epoch": 1.4915129151291513, "grad_norm": 0.10022548235877643, "learning_rate": 0.000118360053839348, "loss": 0.0158, "step": 2021 }, { "epoch": 1.492250922509225, "grad_norm": 0.426512463304245, "learning_rate": 0.00011827562421037252, "loss": 0.0695, "step": 2022 }, { "epoch": 1.4929889298892989, "grad_norm": 0.11918919096293316, "learning_rate": 0.0001181911811015633, "loss": 0.0342, "step": 2023 }, { "epoch": 1.4937269372693727, "grad_norm": 0.47211760619718784, "learning_rate": 0.00011810672457520437, "loss": 0.0862, "step": 2024 }, { "epoch": 1.4944649446494465, "grad_norm": 0.32047627701378495, "learning_rate": 0.00011802225469358956, "loss": 0.0753, "step": 2025 }, { "epoch": 1.4952029520295202, "grad_norm": 0.1485904930890296, "learning_rate": 0.0001179377715190227, "loss": 0.0296, "step": 2026 }, { "epoch": 1.495940959409594, "grad_norm": 0.3301690326316182, "learning_rate": 0.00011785327511381728, "loss": 0.0341, "step": 2027 }, { "epoch": 1.4966789667896678, "grad_norm": 0.19498928983909736, "learning_rate": 0.00011776876554029666, "loss": 0.0284, "step": 2028 }, { "epoch": 1.4974169741697416, "grad_norm": 0.19894819111834752, "learning_rate": 0.00011768424286079387, "loss": 0.03, "step": 2029 }, { "epoch": 1.4981549815498156, "grad_norm": 0.21654566928662855, "learning_rate": 0.00011759970713765156, "loss": 0.025, "step": 2030 }, { "epoch": 1.4988929889298892, "grad_norm": 0.2042993799423091, "learning_rate": 0.0001175151584332221, "loss": 0.0272, "step": 2031 }, { "epoch": 1.4996309963099632, "grad_norm": 0.23884772680210414, "learning_rate": 0.00011743059680986736, "loss": 0.0326, "step": 2032 }, { "epoch": 1.5003690036900368, "grad_norm": 0.3648259306682912, "learning_rate": 0.00011734602232995872, "loss": 0.0657, "step": 2033 }, { "epoch": 1.5011070110701108, "grad_norm": 0.3545972247086483, "learning_rate": 0.00011726143505587716, "loss": 0.0512, "step": 2034 }, { "epoch": 1.5018450184501844, "grad_norm": 0.30563363734835036, "learning_rate": 0.00011717683505001296, "loss": 0.0593, "step": 2035 }, { "epoch": 1.5025830258302584, "grad_norm": 0.1416135792235131, "learning_rate": 0.00011709222237476587, "loss": 0.0176, "step": 2036 }, { "epoch": 1.503321033210332, "grad_norm": 0.1885391175688639, "learning_rate": 0.00011700759709254496, "loss": 0.0506, "step": 2037 }, { "epoch": 1.504059040590406, "grad_norm": 0.1289975794807549, "learning_rate": 0.00011692295926576861, "loss": 0.0259, "step": 2038 }, { "epoch": 1.5047970479704798, "grad_norm": 0.1881164371528411, "learning_rate": 0.00011683830895686445, "loss": 0.0331, "step": 2039 }, { "epoch": 1.5055350553505535, "grad_norm": 0.11664618276622589, "learning_rate": 0.0001167536462282693, "loss": 0.0278, "step": 2040 }, { "epoch": 1.5062730627306273, "grad_norm": 0.1694756761802084, "learning_rate": 0.00011666897114242914, "loss": 0.0269, "step": 2041 }, { "epoch": 1.5070110701107011, "grad_norm": 0.17596690660731218, "learning_rate": 0.00011658428376179911, "loss": 0.0367, "step": 2042 }, { "epoch": 1.507749077490775, "grad_norm": 0.15058029966304548, "learning_rate": 0.00011649958414884335, "loss": 0.0205, "step": 2043 }, { "epoch": 1.5084870848708487, "grad_norm": 0.26676113780730887, "learning_rate": 0.00011641487236603512, "loss": 0.0443, "step": 2044 }, { "epoch": 1.5092250922509225, "grad_norm": 0.39065757575122867, "learning_rate": 0.00011633014847585652, "loss": 0.0663, "step": 2045 }, { "epoch": 1.5099630996309963, "grad_norm": 0.5256920764050561, "learning_rate": 0.0001162454125407987, "loss": 0.1215, "step": 2046 }, { "epoch": 1.51070110701107, "grad_norm": 0.2787458799725925, "learning_rate": 0.00011616066462336163, "loss": 0.0398, "step": 2047 }, { "epoch": 1.5114391143911439, "grad_norm": 0.31221273728377275, "learning_rate": 0.00011607590478605417, "loss": 0.074, "step": 2048 }, { "epoch": 1.5121771217712177, "grad_norm": 0.26471100314511975, "learning_rate": 0.00011599113309139388, "loss": 0.0447, "step": 2049 }, { "epoch": 1.5129151291512914, "grad_norm": 0.2282702445665535, "learning_rate": 0.00011590634960190721, "loss": 0.0251, "step": 2050 }, { "epoch": 1.5136531365313655, "grad_norm": 0.16965479666282435, "learning_rate": 0.00011582155438012917, "loss": 0.0432, "step": 2051 }, { "epoch": 1.514391143911439, "grad_norm": 0.1426390870283024, "learning_rate": 0.00011573674748860346, "loss": 0.0372, "step": 2052 }, { "epoch": 1.515129151291513, "grad_norm": 0.19390463238961297, "learning_rate": 0.00011565192898988242, "loss": 0.0319, "step": 2053 }, { "epoch": 1.5158671586715866, "grad_norm": 0.14716947626484478, "learning_rate": 0.00011556709894652696, "loss": 0.0205, "step": 2054 }, { "epoch": 1.5166051660516606, "grad_norm": 0.20890931982876965, "learning_rate": 0.00011548225742110646, "loss": 0.0343, "step": 2055 }, { "epoch": 1.5173431734317342, "grad_norm": 0.26627026395391595, "learning_rate": 0.00011539740447619882, "loss": 0.0508, "step": 2056 }, { "epoch": 1.5180811808118082, "grad_norm": 0.23412821460625727, "learning_rate": 0.00011531254017439028, "loss": 0.0432, "step": 2057 }, { "epoch": 1.5188191881918818, "grad_norm": 0.22650647544701955, "learning_rate": 0.0001152276645782756, "loss": 0.0271, "step": 2058 }, { "epoch": 1.5195571955719558, "grad_norm": 0.3204522905868553, "learning_rate": 0.00011514277775045768, "loss": 0.0541, "step": 2059 }, { "epoch": 1.5202952029520294, "grad_norm": 0.20648665601154084, "learning_rate": 0.00011505787975354788, "loss": 0.0355, "step": 2060 }, { "epoch": 1.5210332103321034, "grad_norm": 0.22057621732438668, "learning_rate": 0.00011497297065016565, "loss": 0.0293, "step": 2061 }, { "epoch": 1.5217712177121772, "grad_norm": 0.4000344583301706, "learning_rate": 0.00011488805050293879, "loss": 0.0371, "step": 2062 }, { "epoch": 1.522509225092251, "grad_norm": 0.49542925365456636, "learning_rate": 0.0001148031193745031, "loss": 0.1009, "step": 2063 }, { "epoch": 1.5232472324723247, "grad_norm": 0.24084772130155319, "learning_rate": 0.00011471817732750261, "loss": 0.0323, "step": 2064 }, { "epoch": 1.5239852398523985, "grad_norm": 0.13782125740220197, "learning_rate": 0.00011463322442458921, "loss": 0.0333, "step": 2065 }, { "epoch": 1.5247232472324723, "grad_norm": 0.23002335358358092, "learning_rate": 0.00011454826072842307, "loss": 0.0414, "step": 2066 }, { "epoch": 1.5254612546125461, "grad_norm": 0.40899028254438086, "learning_rate": 0.00011446328630167205, "loss": 0.1029, "step": 2067 }, { "epoch": 1.52619926199262, "grad_norm": 0.18073872653153664, "learning_rate": 0.00011437830120701211, "loss": 0.0539, "step": 2068 }, { "epoch": 1.5269372693726937, "grad_norm": 0.4698950292067621, "learning_rate": 0.00011429330550712703, "loss": 0.0555, "step": 2069 }, { "epoch": 1.5276752767527675, "grad_norm": 0.3324902506272999, "learning_rate": 0.00011420829926470835, "loss": 0.061, "step": 2070 }, { "epoch": 1.5284132841328413, "grad_norm": 0.5576551019132504, "learning_rate": 0.00011412328254245547, "loss": 0.083, "step": 2071 }, { "epoch": 1.5291512915129153, "grad_norm": 0.22187043479119978, "learning_rate": 0.00011403825540307546, "loss": 0.0407, "step": 2072 }, { "epoch": 1.5298892988929889, "grad_norm": 0.23690703198890925, "learning_rate": 0.0001139532179092831, "loss": 0.0409, "step": 2073 }, { "epoch": 1.5306273062730629, "grad_norm": 0.22955932006071258, "learning_rate": 0.00011386817012380084, "loss": 0.0392, "step": 2074 }, { "epoch": 1.5313653136531364, "grad_norm": 0.33087103650410926, "learning_rate": 0.00011378311210935864, "loss": 0.0425, "step": 2075 }, { "epoch": 1.5321033210332105, "grad_norm": 0.18714358923092544, "learning_rate": 0.00011369804392869408, "loss": 0.0303, "step": 2076 }, { "epoch": 1.532841328413284, "grad_norm": 0.2045575965750196, "learning_rate": 0.00011361296564455218, "loss": 0.0326, "step": 2077 }, { "epoch": 1.533579335793358, "grad_norm": 0.15575567550174213, "learning_rate": 0.00011352787731968549, "loss": 0.0266, "step": 2078 }, { "epoch": 1.5343173431734316, "grad_norm": 0.16436092595133628, "learning_rate": 0.00011344277901685383, "loss": 0.044, "step": 2079 }, { "epoch": 1.5350553505535056, "grad_norm": 0.2666578799081357, "learning_rate": 0.00011335767079882456, "loss": 0.0379, "step": 2080 }, { "epoch": 1.5357933579335792, "grad_norm": 0.37569724562365686, "learning_rate": 0.00011327255272837221, "loss": 0.0733, "step": 2081 }, { "epoch": 1.5365313653136532, "grad_norm": 0.19476042787037615, "learning_rate": 0.00011318742486827865, "loss": 0.0462, "step": 2082 }, { "epoch": 1.537269372693727, "grad_norm": 0.11932287615049622, "learning_rate": 0.0001131022872813329, "loss": 0.0201, "step": 2083 }, { "epoch": 1.5380073800738008, "grad_norm": 0.3808012332491812, "learning_rate": 0.00011301714003033126, "loss": 0.0491, "step": 2084 }, { "epoch": 1.5387453874538746, "grad_norm": 0.16392632467964177, "learning_rate": 0.0001129319831780771, "loss": 0.031, "step": 2085 }, { "epoch": 1.5394833948339484, "grad_norm": 0.28868417963775567, "learning_rate": 0.00011284681678738082, "loss": 0.0574, "step": 2086 }, { "epoch": 1.5402214022140222, "grad_norm": 0.14207759984341065, "learning_rate": 0.00011276164092105994, "loss": 0.0325, "step": 2087 }, { "epoch": 1.540959409594096, "grad_norm": 0.288818775057493, "learning_rate": 0.00011267645564193894, "loss": 0.0344, "step": 2088 }, { "epoch": 1.5416974169741697, "grad_norm": 0.21737423187273874, "learning_rate": 0.0001125912610128492, "loss": 0.051, "step": 2089 }, { "epoch": 1.5424354243542435, "grad_norm": 0.4770607294291507, "learning_rate": 0.00011250605709662911, "loss": 0.0653, "step": 2090 }, { "epoch": 1.5431734317343173, "grad_norm": 0.31386863496215206, "learning_rate": 0.00011242084395612377, "loss": 0.0458, "step": 2091 }, { "epoch": 1.543911439114391, "grad_norm": 0.19348684887496448, "learning_rate": 0.00011233562165418519, "loss": 0.0537, "step": 2092 }, { "epoch": 1.544649446494465, "grad_norm": 0.3431742586720289, "learning_rate": 0.00011225039025367203, "loss": 0.0585, "step": 2093 }, { "epoch": 1.5453874538745387, "grad_norm": 0.3288142772804701, "learning_rate": 0.00011216514981744981, "loss": 0.0542, "step": 2094 }, { "epoch": 1.5461254612546127, "grad_norm": 0.16753163922693676, "learning_rate": 0.00011207990040839058, "loss": 0.0335, "step": 2095 }, { "epoch": 1.5468634686346863, "grad_norm": 0.24784493400177926, "learning_rate": 0.0001119946420893731, "loss": 0.0472, "step": 2096 }, { "epoch": 1.5476014760147603, "grad_norm": 0.11966901928308597, "learning_rate": 0.0001119093749232826, "loss": 0.0301, "step": 2097 }, { "epoch": 1.5483394833948338, "grad_norm": 0.1284167570959698, "learning_rate": 0.00011182409897301099, "loss": 0.0259, "step": 2098 }, { "epoch": 1.5490774907749079, "grad_norm": 0.13528053113689625, "learning_rate": 0.00011173881430145646, "loss": 0.0278, "step": 2099 }, { "epoch": 1.5498154981549814, "grad_norm": 0.12382447159790363, "learning_rate": 0.00011165352097152381, "loss": 0.0306, "step": 2100 }, { "epoch": 1.5505535055350554, "grad_norm": 0.5287456172540654, "learning_rate": 0.00011156821904612411, "loss": 0.1302, "step": 2101 }, { "epoch": 1.551291512915129, "grad_norm": 0.16549064724838744, "learning_rate": 0.0001114829085881749, "loss": 0.0205, "step": 2102 }, { "epoch": 1.552029520295203, "grad_norm": 0.25162880636704155, "learning_rate": 0.00011139758966059981, "loss": 0.0424, "step": 2103 }, { "epoch": 1.5527675276752766, "grad_norm": 0.3433679897491047, "learning_rate": 0.00011131226232632895, "loss": 0.0625, "step": 2104 }, { "epoch": 1.5535055350553506, "grad_norm": 0.43829538816300206, "learning_rate": 0.00011122692664829844, "loss": 0.077, "step": 2105 }, { "epoch": 1.5542435424354244, "grad_norm": 0.32755647868260507, "learning_rate": 0.00011114158268945066, "loss": 0.0539, "step": 2106 }, { "epoch": 1.5549815498154982, "grad_norm": 0.39589109760122715, "learning_rate": 0.00011105623051273404, "loss": 0.0472, "step": 2107 }, { "epoch": 1.555719557195572, "grad_norm": 0.20149332503362896, "learning_rate": 0.00011097087018110315, "loss": 0.0395, "step": 2108 }, { "epoch": 1.5564575645756458, "grad_norm": 0.13683781886962068, "learning_rate": 0.00011088550175751849, "loss": 0.0237, "step": 2109 }, { "epoch": 1.5571955719557196, "grad_norm": 0.10961095887545266, "learning_rate": 0.00011080012530494656, "loss": 0.0221, "step": 2110 }, { "epoch": 1.5579335793357934, "grad_norm": 0.16721649958986462, "learning_rate": 0.00011071474088635983, "loss": 0.0495, "step": 2111 }, { "epoch": 1.5586715867158671, "grad_norm": 0.6894368197518503, "learning_rate": 0.00011062934856473655, "loss": 0.0966, "step": 2112 }, { "epoch": 1.559409594095941, "grad_norm": 0.25056415004934945, "learning_rate": 0.00011054394840306088, "loss": 0.0852, "step": 2113 }, { "epoch": 1.5601476014760147, "grad_norm": 0.385354259463071, "learning_rate": 0.00011045854046432272, "loss": 0.0288, "step": 2114 }, { "epoch": 1.5608856088560885, "grad_norm": 0.17972030967500743, "learning_rate": 0.0001103731248115177, "loss": 0.0335, "step": 2115 }, { "epoch": 1.5616236162361625, "grad_norm": 0.23394006324259592, "learning_rate": 0.0001102877015076472, "loss": 0.0455, "step": 2116 }, { "epoch": 1.562361623616236, "grad_norm": 0.23387832482917056, "learning_rate": 0.00011020227061571817, "loss": 0.0489, "step": 2117 }, { "epoch": 1.56309963099631, "grad_norm": 0.15764703017223672, "learning_rate": 0.00011011683219874323, "loss": 0.032, "step": 2118 }, { "epoch": 1.5638376383763837, "grad_norm": 0.22929890607240905, "learning_rate": 0.00011003138631974048, "loss": 0.0837, "step": 2119 }, { "epoch": 1.5645756457564577, "grad_norm": 0.6353874775701084, "learning_rate": 0.00010994593304173353, "loss": 0.0935, "step": 2120 }, { "epoch": 1.5653136531365313, "grad_norm": 0.4481312690700046, "learning_rate": 0.00010986047242775151, "loss": 0.1024, "step": 2121 }, { "epoch": 1.5660516605166053, "grad_norm": 0.2402804034527931, "learning_rate": 0.00010977500454082892, "loss": 0.0529, "step": 2122 }, { "epoch": 1.5667896678966788, "grad_norm": 0.2733028451742954, "learning_rate": 0.00010968952944400559, "loss": 0.0342, "step": 2123 }, { "epoch": 1.5675276752767529, "grad_norm": 0.2627827050814134, "learning_rate": 0.00010960404720032675, "loss": 0.0466, "step": 2124 }, { "epoch": 1.5682656826568264, "grad_norm": 0.22577087091447998, "learning_rate": 0.00010951855787284284, "loss": 0.032, "step": 2125 }, { "epoch": 1.5690036900369004, "grad_norm": 0.31782386620364234, "learning_rate": 0.0001094330615246095, "loss": 0.0365, "step": 2126 }, { "epoch": 1.5697416974169742, "grad_norm": 0.2526575309017709, "learning_rate": 0.00010934755821868767, "loss": 0.0495, "step": 2127 }, { "epoch": 1.570479704797048, "grad_norm": 0.2441534189409438, "learning_rate": 0.00010926204801814328, "loss": 0.0516, "step": 2128 }, { "epoch": 1.5712177121771218, "grad_norm": 0.13805197269021208, "learning_rate": 0.00010917653098604741, "loss": 0.0199, "step": 2129 }, { "epoch": 1.5719557195571956, "grad_norm": 0.46328722114506415, "learning_rate": 0.0001090910071854762, "loss": 0.0362, "step": 2130 }, { "epoch": 1.5726937269372694, "grad_norm": 0.29843790009796956, "learning_rate": 0.00010900547667951071, "loss": 0.074, "step": 2131 }, { "epoch": 1.5734317343173432, "grad_norm": 0.23504105275519346, "learning_rate": 0.00010891993953123708, "loss": 0.0411, "step": 2132 }, { "epoch": 1.574169741697417, "grad_norm": 0.3016151527219854, "learning_rate": 0.00010883439580374619, "loss": 0.054, "step": 2133 }, { "epoch": 1.5749077490774908, "grad_norm": 0.2831395487522448, "learning_rate": 0.00010874884556013383, "loss": 0.0658, "step": 2134 }, { "epoch": 1.5756457564575646, "grad_norm": 0.2560814873750882, "learning_rate": 0.00010866328886350068, "loss": 0.0375, "step": 2135 }, { "epoch": 1.5763837638376383, "grad_norm": 0.18787234735621242, "learning_rate": 0.00010857772577695209, "loss": 0.0273, "step": 2136 }, { "epoch": 1.5771217712177121, "grad_norm": 0.23573347921787152, "learning_rate": 0.00010849215636359809, "loss": 0.04, "step": 2137 }, { "epoch": 1.577859778597786, "grad_norm": 0.2098322753613992, "learning_rate": 0.00010840658068655352, "loss": 0.051, "step": 2138 }, { "epoch": 1.57859778597786, "grad_norm": 0.15191697247237237, "learning_rate": 0.00010832099880893766, "loss": 0.043, "step": 2139 }, { "epoch": 1.5793357933579335, "grad_norm": 0.4781247007914278, "learning_rate": 0.00010823541079387451, "loss": 0.0398, "step": 2140 }, { "epoch": 1.5800738007380075, "grad_norm": 0.19440742713448456, "learning_rate": 0.00010814981670449254, "loss": 0.0449, "step": 2141 }, { "epoch": 1.580811808118081, "grad_norm": 0.19822950848223553, "learning_rate": 0.00010806421660392467, "loss": 0.0259, "step": 2142 }, { "epoch": 1.581549815498155, "grad_norm": 0.24859579270171148, "learning_rate": 0.00010797861055530831, "loss": 0.0459, "step": 2143 }, { "epoch": 1.5822878228782287, "grad_norm": 0.14742879457916475, "learning_rate": 0.00010789299862178523, "loss": 0.0164, "step": 2144 }, { "epoch": 1.5830258302583027, "grad_norm": 0.23969820085114626, "learning_rate": 0.00010780738086650158, "loss": 0.0528, "step": 2145 }, { "epoch": 1.5837638376383762, "grad_norm": 0.14990708623155266, "learning_rate": 0.00010772175735260765, "loss": 0.0327, "step": 2146 }, { "epoch": 1.5845018450184503, "grad_norm": 0.1506556684521434, "learning_rate": 0.00010763612814325821, "loss": 0.0128, "step": 2147 }, { "epoch": 1.5852398523985238, "grad_norm": 0.30441946243885665, "learning_rate": 0.00010755049330161207, "loss": 0.0432, "step": 2148 }, { "epoch": 1.5859778597785978, "grad_norm": 0.1590486089703164, "learning_rate": 0.00010746485289083226, "loss": 0.0378, "step": 2149 }, { "epoch": 1.5867158671586716, "grad_norm": 0.19669549910624884, "learning_rate": 0.00010737920697408585, "loss": 0.0371, "step": 2150 }, { "epoch": 1.5874538745387454, "grad_norm": 0.3954604664572585, "learning_rate": 0.00010729355561454408, "loss": 0.081, "step": 2151 }, { "epoch": 1.5881918819188192, "grad_norm": 0.42223172615837046, "learning_rate": 0.00010720789887538212, "loss": 0.1034, "step": 2152 }, { "epoch": 1.588929889298893, "grad_norm": 0.28263975411458186, "learning_rate": 0.00010712223681977913, "loss": 0.0625, "step": 2153 }, { "epoch": 1.5896678966789668, "grad_norm": 0.27714904037954236, "learning_rate": 0.00010703656951091816, "loss": 0.0334, "step": 2154 }, { "epoch": 1.5904059040590406, "grad_norm": 0.21187682202829292, "learning_rate": 0.0001069508970119862, "loss": 0.0292, "step": 2155 }, { "epoch": 1.5911439114391144, "grad_norm": 0.3088817267332269, "learning_rate": 0.00010686521938617402, "loss": 0.0446, "step": 2156 }, { "epoch": 1.5918819188191882, "grad_norm": 0.23398885423501736, "learning_rate": 0.00010677953669667623, "loss": 0.031, "step": 2157 }, { "epoch": 1.592619926199262, "grad_norm": 0.31734132469798293, "learning_rate": 0.00010669384900669106, "loss": 0.0481, "step": 2158 }, { "epoch": 1.5933579335793358, "grad_norm": 0.2868854712832581, "learning_rate": 0.00010660815637942058, "loss": 0.0355, "step": 2159 }, { "epoch": 1.5940959409594095, "grad_norm": 0.18479607836814058, "learning_rate": 0.00010652245887807036, "loss": 0.0478, "step": 2160 }, { "epoch": 1.5948339483394833, "grad_norm": 0.22810391969227684, "learning_rate": 0.00010643675656584964, "loss": 0.0418, "step": 2161 }, { "epoch": 1.5955719557195573, "grad_norm": 0.2857146974677276, "learning_rate": 0.0001063510495059712, "loss": 0.0486, "step": 2162 }, { "epoch": 1.596309963099631, "grad_norm": 0.13866523270316025, "learning_rate": 0.00010626533776165133, "loss": 0.0238, "step": 2163 }, { "epoch": 1.597047970479705, "grad_norm": 0.22528123259736388, "learning_rate": 0.00010617962139610973, "loss": 0.0276, "step": 2164 }, { "epoch": 1.5977859778597785, "grad_norm": 0.14276062284491126, "learning_rate": 0.00010609390047256957, "loss": 0.0436, "step": 2165 }, { "epoch": 1.5985239852398525, "grad_norm": 0.337698588563982, "learning_rate": 0.00010600817505425735, "loss": 0.075, "step": 2166 }, { "epoch": 1.599261992619926, "grad_norm": 0.3396649683036686, "learning_rate": 0.00010592244520440289, "loss": 0.0566, "step": 2167 }, { "epoch": 1.6, "grad_norm": 0.2974060685419305, "learning_rate": 0.00010583671098623922, "loss": 0.0263, "step": 2168 }, { "epoch": 1.6007380073800737, "grad_norm": 0.17905466669304534, "learning_rate": 0.00010575097246300274, "loss": 0.035, "step": 2169 }, { "epoch": 1.6014760147601477, "grad_norm": 0.2031676713833064, "learning_rate": 0.00010566522969793286, "loss": 0.0498, "step": 2170 }, { "epoch": 1.6022140221402212, "grad_norm": 0.17320408489451095, "learning_rate": 0.00010557948275427223, "loss": 0.0309, "step": 2171 }, { "epoch": 1.6029520295202953, "grad_norm": 0.17404030266615597, "learning_rate": 0.00010549373169526655, "loss": 0.0245, "step": 2172 }, { "epoch": 1.603690036900369, "grad_norm": 0.2475139958668255, "learning_rate": 0.00010540797658416453, "loss": 0.0414, "step": 2173 }, { "epoch": 1.6044280442804428, "grad_norm": 0.20485149424803453, "learning_rate": 0.00010532221748421787, "loss": 0.0381, "step": 2174 }, { "epoch": 1.6051660516605166, "grad_norm": 0.23425376318982224, "learning_rate": 0.00010523645445868129, "loss": 0.0364, "step": 2175 }, { "epoch": 1.6059040590405904, "grad_norm": 0.21268231992661096, "learning_rate": 0.00010515068757081228, "loss": 0.0306, "step": 2176 }, { "epoch": 1.6066420664206642, "grad_norm": 0.29611844422418254, "learning_rate": 0.00010506491688387127, "loss": 0.0308, "step": 2177 }, { "epoch": 1.607380073800738, "grad_norm": 0.29400586227306846, "learning_rate": 0.00010497914246112148, "loss": 0.0519, "step": 2178 }, { "epoch": 1.6081180811808118, "grad_norm": 0.25503445707342454, "learning_rate": 0.0001048933643658289, "loss": 0.0392, "step": 2179 }, { "epoch": 1.6088560885608856, "grad_norm": 0.19755980809106596, "learning_rate": 0.00010480758266126214, "loss": 0.0443, "step": 2180 }, { "epoch": 1.6095940959409594, "grad_norm": 0.3597150153958353, "learning_rate": 0.00010472179741069257, "loss": 0.0635, "step": 2181 }, { "epoch": 1.6103321033210332, "grad_norm": 0.44104233454303654, "learning_rate": 0.0001046360086773941, "loss": 0.0527, "step": 2182 }, { "epoch": 1.6110701107011072, "grad_norm": 0.1607121853227598, "learning_rate": 0.0001045502165246433, "loss": 0.0203, "step": 2183 }, { "epoch": 1.6118081180811807, "grad_norm": 0.320616899051173, "learning_rate": 0.00010446442101571916, "loss": 0.0684, "step": 2184 }, { "epoch": 1.6125461254612548, "grad_norm": 0.26284543042344466, "learning_rate": 0.00010437862221390327, "loss": 0.0301, "step": 2185 }, { "epoch": 1.6132841328413283, "grad_norm": 0.1707061111458451, "learning_rate": 0.0001042928201824795, "loss": 0.023, "step": 2186 }, { "epoch": 1.6140221402214023, "grad_norm": 0.24409781975425504, "learning_rate": 0.00010420701498473422, "loss": 0.0494, "step": 2187 }, { "epoch": 1.614760147601476, "grad_norm": 0.5793287446658966, "learning_rate": 0.00010412120668395604, "loss": 0.083, "step": 2188 }, { "epoch": 1.61549815498155, "grad_norm": 0.11667318473289671, "learning_rate": 0.00010403539534343598, "loss": 0.0215, "step": 2189 }, { "epoch": 1.6162361623616235, "grad_norm": 0.13544311766426625, "learning_rate": 0.00010394958102646716, "loss": 0.0179, "step": 2190 }, { "epoch": 1.6169741697416975, "grad_norm": 0.15304931305819316, "learning_rate": 0.00010386376379634506, "loss": 0.0229, "step": 2191 }, { "epoch": 1.617712177121771, "grad_norm": 0.18741531276321086, "learning_rate": 0.00010377794371636712, "loss": 0.0314, "step": 2192 }, { "epoch": 1.618450184501845, "grad_norm": 0.2840514102011649, "learning_rate": 0.00010369212084983307, "loss": 0.045, "step": 2193 }, { "epoch": 1.6191881918819189, "grad_norm": 0.32546420022871464, "learning_rate": 0.0001036062952600445, "loss": 0.0782, "step": 2194 }, { "epoch": 1.6199261992619927, "grad_norm": 0.2606817595070766, "learning_rate": 0.0001035204670103052, "loss": 0.0508, "step": 2195 }, { "epoch": 1.6206642066420665, "grad_norm": 0.20638518051229962, "learning_rate": 0.00010343463616392078, "loss": 0.0291, "step": 2196 }, { "epoch": 1.6214022140221402, "grad_norm": 0.18226852378597763, "learning_rate": 0.00010334880278419884, "loss": 0.039, "step": 2197 }, { "epoch": 1.622140221402214, "grad_norm": 0.5228548376412713, "learning_rate": 0.00010326296693444885, "loss": 0.0755, "step": 2198 }, { "epoch": 1.6228782287822878, "grad_norm": 0.30276255409925246, "learning_rate": 0.0001031771286779821, "loss": 0.0767, "step": 2199 }, { "epoch": 1.6236162361623616, "grad_norm": 0.18604022571139178, "learning_rate": 0.00010309128807811153, "loss": 0.0218, "step": 2200 }, { "epoch": 1.6243542435424354, "grad_norm": 0.17817079631669608, "learning_rate": 0.00010300544519815203, "loss": 0.0262, "step": 2201 }, { "epoch": 1.6250922509225092, "grad_norm": 0.29414304752045994, "learning_rate": 0.00010291960010141997, "loss": 0.0325, "step": 2202 }, { "epoch": 1.625830258302583, "grad_norm": 0.19471364255028342, "learning_rate": 0.00010283375285123349, "loss": 0.0356, "step": 2203 }, { "epoch": 1.6265682656826568, "grad_norm": 0.19689870691683337, "learning_rate": 0.00010274790351091223, "loss": 0.044, "step": 2204 }, { "epoch": 1.6273062730627306, "grad_norm": 0.29036008254443063, "learning_rate": 0.00010266205214377748, "loss": 0.0278, "step": 2205 }, { "epoch": 1.6280442804428046, "grad_norm": 0.14872796462999927, "learning_rate": 0.0001025761988131519, "loss": 0.0243, "step": 2206 }, { "epoch": 1.6287822878228781, "grad_norm": 0.14742506932645216, "learning_rate": 0.0001024903435823597, "loss": 0.0289, "step": 2207 }, { "epoch": 1.6295202952029522, "grad_norm": 0.24924895246204418, "learning_rate": 0.00010240448651472634, "loss": 0.0448, "step": 2208 }, { "epoch": 1.6302583025830257, "grad_norm": 0.1862644336234115, "learning_rate": 0.00010231862767357888, "loss": 0.0472, "step": 2209 }, { "epoch": 1.6309963099630997, "grad_norm": 0.10500250964428233, "learning_rate": 0.00010223276712224541, "loss": 0.0212, "step": 2210 }, { "epoch": 1.6317343173431733, "grad_norm": 0.32971482614626624, "learning_rate": 0.00010214690492405554, "loss": 0.0745, "step": 2211 }, { "epoch": 1.6324723247232473, "grad_norm": 0.10921828204029509, "learning_rate": 0.00010206104114233993, "loss": 0.0134, "step": 2212 }, { "epoch": 1.633210332103321, "grad_norm": 0.1953892166066278, "learning_rate": 0.00010197517584043043, "loss": 0.0389, "step": 2213 }, { "epoch": 1.633948339483395, "grad_norm": 0.5169854659491295, "learning_rate": 0.00010188930908166006, "loss": 0.079, "step": 2214 }, { "epoch": 1.6346863468634685, "grad_norm": 0.19566641135592347, "learning_rate": 0.00010180344092936287, "loss": 0.0217, "step": 2215 }, { "epoch": 1.6354243542435425, "grad_norm": 0.14908620419151986, "learning_rate": 0.00010171757144687397, "loss": 0.0234, "step": 2216 }, { "epoch": 1.6361623616236163, "grad_norm": 0.24516233559045647, "learning_rate": 0.00010163170069752943, "loss": 0.0329, "step": 2217 }, { "epoch": 1.63690036900369, "grad_norm": 0.25677633897340835, "learning_rate": 0.00010154582874466625, "loss": 0.0319, "step": 2218 }, { "epoch": 1.6376383763837639, "grad_norm": 0.24949058707391306, "learning_rate": 0.00010145995565162239, "loss": 0.0419, "step": 2219 }, { "epoch": 1.6383763837638377, "grad_norm": 0.2352672652853383, "learning_rate": 0.0001013740814817365, "loss": 0.0298, "step": 2220 }, { "epoch": 1.6391143911439114, "grad_norm": 0.391107898629875, "learning_rate": 0.00010128820629834819, "loss": 0.1424, "step": 2221 }, { "epoch": 1.6398523985239852, "grad_norm": 0.19655058852143942, "learning_rate": 0.0001012023301647977, "loss": 0.0376, "step": 2222 }, { "epoch": 1.640590405904059, "grad_norm": 0.19112815753201937, "learning_rate": 0.00010111645314442602, "loss": 0.0441, "step": 2223 }, { "epoch": 1.6413284132841328, "grad_norm": 0.43036381184196987, "learning_rate": 0.00010103057530057478, "loss": 0.063, "step": 2224 }, { "epoch": 1.6420664206642066, "grad_norm": 0.6063204618920417, "learning_rate": 0.00010094469669658626, "loss": 0.114, "step": 2225 }, { "epoch": 1.6428044280442804, "grad_norm": 0.2850442660436834, "learning_rate": 0.00010085881739580325, "loss": 0.0317, "step": 2226 }, { "epoch": 1.6435424354243544, "grad_norm": 0.1169323864313085, "learning_rate": 0.00010077293746156902, "loss": 0.0205, "step": 2227 }, { "epoch": 1.644280442804428, "grad_norm": 0.289787368706083, "learning_rate": 0.00010068705695722742, "loss": 0.0735, "step": 2228 }, { "epoch": 1.645018450184502, "grad_norm": 0.2513393225301587, "learning_rate": 0.00010060117594612264, "loss": 0.1868, "step": 2229 }, { "epoch": 1.6457564575645756, "grad_norm": 0.1950459399659805, "learning_rate": 0.00010051529449159925, "loss": 0.0546, "step": 2230 }, { "epoch": 1.6464944649446496, "grad_norm": 0.22071792146858693, "learning_rate": 0.00010042941265700217, "loss": 0.0559, "step": 2231 }, { "epoch": 1.6472324723247231, "grad_norm": 0.20731631282510726, "learning_rate": 0.00010034353050567655, "loss": 0.0443, "step": 2232 }, { "epoch": 1.6479704797047972, "grad_norm": 0.19342101801150077, "learning_rate": 0.00010025764810096787, "loss": 0.0359, "step": 2233 }, { "epoch": 1.6487084870848707, "grad_norm": 0.38050180784853693, "learning_rate": 0.00010017176550622171, "loss": 0.0304, "step": 2234 }, { "epoch": 1.6494464944649447, "grad_norm": 0.17254279565419386, "learning_rate": 0.00010008588278478379, "loss": 0.0376, "step": 2235 }, { "epoch": 1.6501845018450183, "grad_norm": 0.24607083827787227, "learning_rate": 0.0001, "loss": 0.0342, "step": 2236 }, { "epoch": 1.6509225092250923, "grad_norm": 0.24321668648259323, "learning_rate": 9.991411721521623e-05, "loss": 0.0828, "step": 2237 }, { "epoch": 1.651660516605166, "grad_norm": 0.47288793788783323, "learning_rate": 9.982823449377831e-05, "loss": 0.0699, "step": 2238 }, { "epoch": 1.65239852398524, "grad_norm": 0.39588073069466345, "learning_rate": 9.974235189903217e-05, "loss": 0.1028, "step": 2239 }, { "epoch": 1.6531365313653137, "grad_norm": 0.1939545344975628, "learning_rate": 9.965646949432346e-05, "loss": 0.0465, "step": 2240 }, { "epoch": 1.6538745387453875, "grad_norm": 0.2931165734793973, "learning_rate": 9.957058734299787e-05, "loss": 0.0546, "step": 2241 }, { "epoch": 1.6546125461254613, "grad_norm": 0.1271108228254121, "learning_rate": 9.948470550840075e-05, "loss": 0.0231, "step": 2242 }, { "epoch": 1.655350553505535, "grad_norm": 0.17970765881729855, "learning_rate": 9.939882405387737e-05, "loss": 0.0314, "step": 2243 }, { "epoch": 1.6560885608856089, "grad_norm": 0.1594160377788668, "learning_rate": 9.931294304277262e-05, "loss": 0.0229, "step": 2244 }, { "epoch": 1.6568265682656826, "grad_norm": 0.362655648476971, "learning_rate": 9.922706253843101e-05, "loss": 0.0338, "step": 2245 }, { "epoch": 1.6575645756457564, "grad_norm": 0.3462716780883896, "learning_rate": 9.91411826041968e-05, "loss": 0.0484, "step": 2246 }, { "epoch": 1.6583025830258302, "grad_norm": 0.4649979383554981, "learning_rate": 9.905530330341376e-05, "loss": 0.0465, "step": 2247 }, { "epoch": 1.659040590405904, "grad_norm": 0.31307103913517526, "learning_rate": 9.896942469942524e-05, "loss": 0.0941, "step": 2248 }, { "epoch": 1.6597785977859778, "grad_norm": 0.2895886888111909, "learning_rate": 9.888354685557399e-05, "loss": 0.0495, "step": 2249 }, { "epoch": 1.6605166051660518, "grad_norm": 0.20409568521097898, "learning_rate": 9.879766983520233e-05, "loss": 0.0459, "step": 2250 }, { "epoch": 1.6612546125461254, "grad_norm": 0.2986081463248114, "learning_rate": 9.871179370165184e-05, "loss": 0.0728, "step": 2251 }, { "epoch": 1.6619926199261994, "grad_norm": 0.2586764149990882, "learning_rate": 9.862591851826351e-05, "loss": 0.0332, "step": 2252 }, { "epoch": 1.662730627306273, "grad_norm": 0.2586568154187048, "learning_rate": 9.854004434837765e-05, "loss": 0.0326, "step": 2253 }, { "epoch": 1.663468634686347, "grad_norm": 0.1258507852983789, "learning_rate": 9.845417125533374e-05, "loss": 0.0217, "step": 2254 }, { "epoch": 1.6642066420664205, "grad_norm": 0.15520517317443538, "learning_rate": 9.836829930247059e-05, "loss": 0.0505, "step": 2255 }, { "epoch": 1.6649446494464946, "grad_norm": 0.1431189051345211, "learning_rate": 9.828242855312604e-05, "loss": 0.0411, "step": 2256 }, { "epoch": 1.6656826568265681, "grad_norm": 0.12800256407480703, "learning_rate": 9.819655907063715e-05, "loss": 0.0288, "step": 2257 }, { "epoch": 1.6664206642066421, "grad_norm": 0.18296375133685994, "learning_rate": 9.811069091833999e-05, "loss": 0.0445, "step": 2258 }, { "epoch": 1.6671586715867157, "grad_norm": 0.19528746384503987, "learning_rate": 9.802482415956958e-05, "loss": 0.0346, "step": 2259 }, { "epoch": 1.6678966789667897, "grad_norm": 0.1295243435319013, "learning_rate": 9.793895885766011e-05, "loss": 0.0307, "step": 2260 }, { "epoch": 1.6686346863468635, "grad_norm": 0.09087133450127319, "learning_rate": 9.785309507594447e-05, "loss": 0.018, "step": 2261 }, { "epoch": 1.6693726937269373, "grad_norm": 0.3768333453589822, "learning_rate": 9.77672328777546e-05, "loss": 0.0422, "step": 2262 }, { "epoch": 1.670110701107011, "grad_norm": 0.11671421085898862, "learning_rate": 9.768137232642119e-05, "loss": 0.0226, "step": 2263 }, { "epoch": 1.6708487084870849, "grad_norm": 0.16550984009892147, "learning_rate": 9.759551348527367e-05, "loss": 0.0295, "step": 2264 }, { "epoch": 1.6715867158671587, "grad_norm": 0.23608457634377453, "learning_rate": 9.750965641764035e-05, "loss": 0.0357, "step": 2265 }, { "epoch": 1.6723247232472325, "grad_norm": 0.30405868359026095, "learning_rate": 9.742380118684811e-05, "loss": 0.0654, "step": 2266 }, { "epoch": 1.6730627306273063, "grad_norm": 0.30061498169621065, "learning_rate": 9.733794785622253e-05, "loss": 0.0476, "step": 2267 }, { "epoch": 1.67380073800738, "grad_norm": 0.28456252962777984, "learning_rate": 9.725209648908775e-05, "loss": 0.0433, "step": 2268 }, { "epoch": 1.6745387453874538, "grad_norm": 0.2688775525677339, "learning_rate": 9.716624714876654e-05, "loss": 0.0293, "step": 2269 }, { "epoch": 1.6752767527675276, "grad_norm": 0.12976018017126906, "learning_rate": 9.708039989858008e-05, "loss": 0.0293, "step": 2270 }, { "epoch": 1.6760147601476014, "grad_norm": 0.2132108794989587, "learning_rate": 9.6994554801848e-05, "loss": 0.0372, "step": 2271 }, { "epoch": 1.6767527675276752, "grad_norm": 0.3226907734668686, "learning_rate": 9.690871192188851e-05, "loss": 0.0612, "step": 2272 }, { "epoch": 1.6774907749077492, "grad_norm": 0.23996122902624897, "learning_rate": 9.682287132201793e-05, "loss": 0.0327, "step": 2273 }, { "epoch": 1.6782287822878228, "grad_norm": 0.17315962400961019, "learning_rate": 9.673703306555116e-05, "loss": 0.027, "step": 2274 }, { "epoch": 1.6789667896678968, "grad_norm": 0.2767819638938092, "learning_rate": 9.665119721580114e-05, "loss": 0.0567, "step": 2275 }, { "epoch": 1.6797047970479704, "grad_norm": 0.19046795281352555, "learning_rate": 9.656536383607925e-05, "loss": 0.0226, "step": 2276 }, { "epoch": 1.6804428044280444, "grad_norm": 0.1679989569197366, "learning_rate": 9.647953298969484e-05, "loss": 0.0573, "step": 2277 }, { "epoch": 1.681180811808118, "grad_norm": 0.1559228732275705, "learning_rate": 9.639370473995553e-05, "loss": 0.0333, "step": 2278 }, { "epoch": 1.681918819188192, "grad_norm": 0.2572754444604959, "learning_rate": 9.630787915016698e-05, "loss": 0.0386, "step": 2279 }, { "epoch": 1.6826568265682655, "grad_norm": 0.2812631550573641, "learning_rate": 9.62220562836329e-05, "loss": 0.0326, "step": 2280 }, { "epoch": 1.6833948339483396, "grad_norm": 0.2046556143851847, "learning_rate": 9.613623620365497e-05, "loss": 0.0414, "step": 2281 }, { "epoch": 1.6841328413284131, "grad_norm": 0.153972663456334, "learning_rate": 9.605041897353283e-05, "loss": 0.0316, "step": 2282 }, { "epoch": 1.6848708487084871, "grad_norm": 0.30012107638271723, "learning_rate": 9.596460465656404e-05, "loss": 0.0597, "step": 2283 }, { "epoch": 1.685608856088561, "grad_norm": 0.21826634384857854, "learning_rate": 9.587879331604399e-05, "loss": 0.0481, "step": 2284 }, { "epoch": 1.6863468634686347, "grad_norm": 0.2199283824341777, "learning_rate": 9.57929850152658e-05, "loss": 0.0375, "step": 2285 }, { "epoch": 1.6870848708487085, "grad_norm": 0.2957315490276422, "learning_rate": 9.570717981752053e-05, "loss": 0.0303, "step": 2286 }, { "epoch": 1.6878228782287823, "grad_norm": 0.19046250843739804, "learning_rate": 9.562137778609673e-05, "loss": 0.0433, "step": 2287 }, { "epoch": 1.688560885608856, "grad_norm": 0.1924841393685909, "learning_rate": 9.553557898428085e-05, "loss": 0.0352, "step": 2288 }, { "epoch": 1.6892988929889299, "grad_norm": 0.29821572247308287, "learning_rate": 9.544978347535671e-05, "loss": 0.0633, "step": 2289 }, { "epoch": 1.6900369003690037, "grad_norm": 0.27951287405121134, "learning_rate": 9.536399132260593e-05, "loss": 0.1332, "step": 2290 }, { "epoch": 1.6907749077490775, "grad_norm": 0.28115783479595685, "learning_rate": 9.527820258930749e-05, "loss": 0.0762, "step": 2291 }, { "epoch": 1.6915129151291513, "grad_norm": 0.2599144851375211, "learning_rate": 9.519241733873789e-05, "loss": 0.0421, "step": 2292 }, { "epoch": 1.692250922509225, "grad_norm": 0.15301804826966708, "learning_rate": 9.510663563417113e-05, "loss": 0.0343, "step": 2293 }, { "epoch": 1.692988929889299, "grad_norm": 0.18852153598225352, "learning_rate": 9.502085753887851e-05, "loss": 0.0471, "step": 2294 }, { "epoch": 1.6937269372693726, "grad_norm": 0.4144341752464025, "learning_rate": 9.493508311612874e-05, "loss": 0.0397, "step": 2295 }, { "epoch": 1.6944649446494466, "grad_norm": 0.27655682621815586, "learning_rate": 9.484931242918773e-05, "loss": 0.041, "step": 2296 }, { "epoch": 1.6952029520295202, "grad_norm": 0.34772583802262147, "learning_rate": 9.476354554131874e-05, "loss": 0.0681, "step": 2297 }, { "epoch": 1.6959409594095942, "grad_norm": 0.20579073806900278, "learning_rate": 9.467778251578217e-05, "loss": 0.0342, "step": 2298 }, { "epoch": 1.6966789667896678, "grad_norm": 0.27246781228120504, "learning_rate": 9.459202341583548e-05, "loss": 0.0581, "step": 2299 }, { "epoch": 1.6974169741697418, "grad_norm": 0.23409405053958882, "learning_rate": 9.450626830473349e-05, "loss": 0.0709, "step": 2300 }, { "epoch": 1.6981549815498154, "grad_norm": 0.1639825689096591, "learning_rate": 9.442051724572776e-05, "loss": 0.0207, "step": 2301 }, { "epoch": 1.6988929889298894, "grad_norm": 0.18089863886912283, "learning_rate": 9.433477030206716e-05, "loss": 0.0411, "step": 2302 }, { "epoch": 1.699630996309963, "grad_norm": 0.20331499919180895, "learning_rate": 9.424902753699726e-05, "loss": 0.0426, "step": 2303 }, { "epoch": 1.700369003690037, "grad_norm": 0.3155113227917467, "learning_rate": 9.416328901376079e-05, "loss": 0.0471, "step": 2304 }, { "epoch": 1.7011070110701108, "grad_norm": 0.21856344773283498, "learning_rate": 9.407755479559716e-05, "loss": 0.0361, "step": 2305 }, { "epoch": 1.7018450184501845, "grad_norm": 0.6754091916267999, "learning_rate": 9.399182494574267e-05, "loss": 0.085, "step": 2306 }, { "epoch": 1.7025830258302583, "grad_norm": 0.23590812626191346, "learning_rate": 9.390609952743045e-05, "loss": 0.0354, "step": 2307 }, { "epoch": 1.7033210332103321, "grad_norm": 0.3978394084487871, "learning_rate": 9.382037860389028e-05, "loss": 0.0597, "step": 2308 }, { "epoch": 1.704059040590406, "grad_norm": 0.299503516651813, "learning_rate": 9.373466223834869e-05, "loss": 0.0504, "step": 2309 }, { "epoch": 1.7047970479704797, "grad_norm": 0.14355031030444004, "learning_rate": 9.36489504940288e-05, "loss": 0.0421, "step": 2310 }, { "epoch": 1.7055350553505535, "grad_norm": 0.13503052142379438, "learning_rate": 9.356324343415037e-05, "loss": 0.0237, "step": 2311 }, { "epoch": 1.7062730627306273, "grad_norm": 0.318048732928874, "learning_rate": 9.347754112192967e-05, "loss": 0.0331, "step": 2312 }, { "epoch": 1.707011070110701, "grad_norm": 0.17951245298519097, "learning_rate": 9.339184362057943e-05, "loss": 0.0444, "step": 2313 }, { "epoch": 1.7077490774907749, "grad_norm": 0.19422091485661408, "learning_rate": 9.330615099330897e-05, "loss": 0.0343, "step": 2314 }, { "epoch": 1.7084870848708487, "grad_norm": 0.22626305377507083, "learning_rate": 9.322046330332377e-05, "loss": 0.0834, "step": 2315 }, { "epoch": 1.7092250922509225, "grad_norm": 0.19505039747858466, "learning_rate": 9.3134780613826e-05, "loss": 0.046, "step": 2316 }, { "epoch": 1.7099630996309965, "grad_norm": 0.2780112812955657, "learning_rate": 9.304910298801384e-05, "loss": 0.0329, "step": 2317 }, { "epoch": 1.71070110701107, "grad_norm": 0.2803917812580586, "learning_rate": 9.296343048908187e-05, "loss": 0.0687, "step": 2318 }, { "epoch": 1.711439114391144, "grad_norm": 0.22088925381084368, "learning_rate": 9.287776318022092e-05, "loss": 0.0802, "step": 2319 }, { "epoch": 1.7121771217712176, "grad_norm": 0.1577094800290124, "learning_rate": 9.27921011246179e-05, "loss": 0.0281, "step": 2320 }, { "epoch": 1.7129151291512916, "grad_norm": 0.07969462881544088, "learning_rate": 9.270644438545594e-05, "loss": 0.0127, "step": 2321 }, { "epoch": 1.7136531365313652, "grad_norm": 0.17415309811145643, "learning_rate": 9.262079302591415e-05, "loss": 0.017, "step": 2322 }, { "epoch": 1.7143911439114392, "grad_norm": 0.2512406281858992, "learning_rate": 9.253514710916776e-05, "loss": 0.047, "step": 2323 }, { "epoch": 1.7151291512915128, "grad_norm": 0.16491519269684116, "learning_rate": 9.244950669838795e-05, "loss": 0.0233, "step": 2324 }, { "epoch": 1.7158671586715868, "grad_norm": 0.2569667277947152, "learning_rate": 9.23638718567418e-05, "loss": 0.0485, "step": 2325 }, { "epoch": 1.7166051660516604, "grad_norm": 0.3038946874634404, "learning_rate": 9.227824264739236e-05, "loss": 0.0176, "step": 2326 }, { "epoch": 1.7173431734317344, "grad_norm": 0.33329528524416413, "learning_rate": 9.219261913349846e-05, "loss": 0.0582, "step": 2327 }, { "epoch": 1.7180811808118082, "grad_norm": 0.1673619303054244, "learning_rate": 9.210700137821478e-05, "loss": 0.0269, "step": 2328 }, { "epoch": 1.718819188191882, "grad_norm": 0.34567037835234127, "learning_rate": 9.202138944469168e-05, "loss": 0.0405, "step": 2329 }, { "epoch": 1.7195571955719557, "grad_norm": 0.18547454841857844, "learning_rate": 9.193578339607535e-05, "loss": 0.0316, "step": 2330 }, { "epoch": 1.7202952029520295, "grad_norm": 0.18225519394949438, "learning_rate": 9.185018329550751e-05, "loss": 0.0444, "step": 2331 }, { "epoch": 1.7210332103321033, "grad_norm": 0.2051352417833268, "learning_rate": 9.176458920612552e-05, "loss": 0.0376, "step": 2332 }, { "epoch": 1.7217712177121771, "grad_norm": 0.25757854836404565, "learning_rate": 9.167900119106237e-05, "loss": 0.0395, "step": 2333 }, { "epoch": 1.722509225092251, "grad_norm": 0.23439881635183243, "learning_rate": 9.159341931344652e-05, "loss": 0.0422, "step": 2334 }, { "epoch": 1.7232472324723247, "grad_norm": 0.6957071294309731, "learning_rate": 9.150784363640192e-05, "loss": 0.0639, "step": 2335 }, { "epoch": 1.7239852398523985, "grad_norm": 0.28435962415839017, "learning_rate": 9.142227422304794e-05, "loss": 0.0398, "step": 2336 }, { "epoch": 1.7247232472324723, "grad_norm": 0.15632624781684526, "learning_rate": 9.133671113649933e-05, "loss": 0.0271, "step": 2337 }, { "epoch": 1.7254612546125463, "grad_norm": 0.32047904013342776, "learning_rate": 9.125115443986618e-05, "loss": 0.0561, "step": 2338 }, { "epoch": 1.7261992619926199, "grad_norm": 0.18606403294302074, "learning_rate": 9.116560419625385e-05, "loss": 0.0403, "step": 2339 }, { "epoch": 1.7269372693726939, "grad_norm": 0.1599308880861448, "learning_rate": 9.108006046876295e-05, "loss": 0.0432, "step": 2340 }, { "epoch": 1.7276752767527674, "grad_norm": 0.19681370145834615, "learning_rate": 9.099452332048928e-05, "loss": 0.0378, "step": 2341 }, { "epoch": 1.7284132841328415, "grad_norm": 0.35900569153994494, "learning_rate": 9.090899281452383e-05, "loss": 0.042, "step": 2342 }, { "epoch": 1.729151291512915, "grad_norm": 0.2248419130401738, "learning_rate": 9.08234690139526e-05, "loss": 0.0351, "step": 2343 }, { "epoch": 1.729889298892989, "grad_norm": 0.3066796569220508, "learning_rate": 9.073795198185674e-05, "loss": 0.0599, "step": 2344 }, { "epoch": 1.7306273062730626, "grad_norm": 0.20714821834627, "learning_rate": 9.065244178131238e-05, "loss": 0.0406, "step": 2345 }, { "epoch": 1.7313653136531366, "grad_norm": 0.27929016746374347, "learning_rate": 9.056693847539051e-05, "loss": 0.0605, "step": 2346 }, { "epoch": 1.7321033210332102, "grad_norm": 0.21351389962229977, "learning_rate": 9.04814421271572e-05, "loss": 0.0468, "step": 2347 }, { "epoch": 1.7328413284132842, "grad_norm": 0.4208789564416645, "learning_rate": 9.039595279967327e-05, "loss": 0.0388, "step": 2348 }, { "epoch": 1.7335793357933578, "grad_norm": 0.3004724237154168, "learning_rate": 9.031047055599443e-05, "loss": 0.0496, "step": 2349 }, { "epoch": 1.7343173431734318, "grad_norm": 0.724920939398076, "learning_rate": 9.02249954591711e-05, "loss": 0.0264, "step": 2350 }, { "epoch": 1.7350553505535056, "grad_norm": 0.26399666702275176, "learning_rate": 9.01395275722485e-05, "loss": 0.0327, "step": 2351 }, { "epoch": 1.7357933579335794, "grad_norm": 0.21750362093059208, "learning_rate": 9.00540669582665e-05, "loss": 0.0271, "step": 2352 }, { "epoch": 1.7365313653136532, "grad_norm": 0.6875984518374424, "learning_rate": 8.996861368025955e-05, "loss": 0.0712, "step": 2353 }, { "epoch": 1.737269372693727, "grad_norm": 0.21680975260530883, "learning_rate": 8.98831678012568e-05, "loss": 0.0339, "step": 2354 }, { "epoch": 1.7380073800738007, "grad_norm": 0.1500674272747066, "learning_rate": 8.979772938428182e-05, "loss": 0.0308, "step": 2355 }, { "epoch": 1.7387453874538745, "grad_norm": 0.20255123773702646, "learning_rate": 8.971229849235281e-05, "loss": 0.0557, "step": 2356 }, { "epoch": 1.7394833948339483, "grad_norm": 0.22906252119386056, "learning_rate": 8.96268751884823e-05, "loss": 0.0186, "step": 2357 }, { "epoch": 1.740221402214022, "grad_norm": 0.16834237328836044, "learning_rate": 8.95414595356773e-05, "loss": 0.0197, "step": 2358 }, { "epoch": 1.740959409594096, "grad_norm": 0.2606682973579256, "learning_rate": 8.945605159693917e-05, "loss": 0.017, "step": 2359 }, { "epoch": 1.7416974169741697, "grad_norm": 0.11618008657231421, "learning_rate": 8.937065143526347e-05, "loss": 0.0248, "step": 2360 }, { "epoch": 1.7424354243542437, "grad_norm": 0.2538265725380013, "learning_rate": 8.928525911364021e-05, "loss": 0.034, "step": 2361 }, { "epoch": 1.7431734317343173, "grad_norm": 0.27262151748410274, "learning_rate": 8.919987469505345e-05, "loss": 0.037, "step": 2362 }, { "epoch": 1.7439114391143913, "grad_norm": 0.20344821008741282, "learning_rate": 8.911449824248153e-05, "loss": 0.0329, "step": 2363 }, { "epoch": 1.7446494464944649, "grad_norm": 0.32012464011101743, "learning_rate": 8.902912981889686e-05, "loss": 0.0616, "step": 2364 }, { "epoch": 1.7453874538745389, "grad_norm": 0.19461509169497865, "learning_rate": 8.894376948726597e-05, "loss": 0.0458, "step": 2365 }, { "epoch": 1.7461254612546124, "grad_norm": 0.17212090523211343, "learning_rate": 8.885841731054938e-05, "loss": 0.027, "step": 2366 }, { "epoch": 1.7468634686346864, "grad_norm": 0.2265901530280327, "learning_rate": 8.877307335170158e-05, "loss": 0.0273, "step": 2367 }, { "epoch": 1.74760147601476, "grad_norm": 0.16236390543304513, "learning_rate": 8.868773767367109e-05, "loss": 0.0278, "step": 2368 }, { "epoch": 1.748339483394834, "grad_norm": 0.19146695856462592, "learning_rate": 8.860241033940018e-05, "loss": 0.0306, "step": 2369 }, { "epoch": 1.7490774907749076, "grad_norm": 0.2335423731585324, "learning_rate": 8.851709141182514e-05, "loss": 0.0253, "step": 2370 }, { "epoch": 1.7498154981549816, "grad_norm": 0.20787819827014928, "learning_rate": 8.843178095387592e-05, "loss": 0.0271, "step": 2371 }, { "epoch": 1.7505535055350554, "grad_norm": 0.23133995833689217, "learning_rate": 8.834647902847621e-05, "loss": 0.0283, "step": 2372 }, { "epoch": 1.7512915129151292, "grad_norm": 0.23809319731404455, "learning_rate": 8.826118569854359e-05, "loss": 0.0426, "step": 2373 }, { "epoch": 1.752029520295203, "grad_norm": 0.24960205127491483, "learning_rate": 8.817590102698905e-05, "loss": 0.0622, "step": 2374 }, { "epoch": 1.7527675276752768, "grad_norm": 0.3945934086727215, "learning_rate": 8.809062507671743e-05, "loss": 0.0477, "step": 2375 }, { "epoch": 1.7535055350553506, "grad_norm": 0.1771264718252818, "learning_rate": 8.800535791062694e-05, "loss": 0.0195, "step": 2376 }, { "epoch": 1.7542435424354244, "grad_norm": 0.18531984267516383, "learning_rate": 8.792009959160945e-05, "loss": 0.0382, "step": 2377 }, { "epoch": 1.7549815498154981, "grad_norm": 0.28265687771560805, "learning_rate": 8.783485018255023e-05, "loss": 0.0443, "step": 2378 }, { "epoch": 1.755719557195572, "grad_norm": 0.3949505437354537, "learning_rate": 8.774960974632799e-05, "loss": 0.0474, "step": 2379 }, { "epoch": 1.7564575645756457, "grad_norm": 0.2669654309220272, "learning_rate": 8.766437834581486e-05, "loss": 0.0379, "step": 2380 }, { "epoch": 1.7571955719557195, "grad_norm": 0.14565452243974428, "learning_rate": 8.757915604387625e-05, "loss": 0.0294, "step": 2381 }, { "epoch": 1.7579335793357933, "grad_norm": 0.16254499978418396, "learning_rate": 8.749394290337091e-05, "loss": 0.0401, "step": 2382 }, { "epoch": 1.758671586715867, "grad_norm": 0.38092656395068153, "learning_rate": 8.74087389871508e-05, "loss": 0.0956, "step": 2383 }, { "epoch": 1.759409594095941, "grad_norm": 0.3594873682985174, "learning_rate": 8.732354435806109e-05, "loss": 0.0614, "step": 2384 }, { "epoch": 1.7601476014760147, "grad_norm": 0.24564873182054836, "learning_rate": 8.723835907894012e-05, "loss": 0.0393, "step": 2385 }, { "epoch": 1.7608856088560887, "grad_norm": 0.16228898886592277, "learning_rate": 8.71531832126192e-05, "loss": 0.0317, "step": 2386 }, { "epoch": 1.7616236162361623, "grad_norm": 0.13572326550946642, "learning_rate": 8.706801682192295e-05, "loss": 0.0238, "step": 2387 }, { "epoch": 1.7623616236162363, "grad_norm": 0.3278664362637598, "learning_rate": 8.698285996966873e-05, "loss": 0.0499, "step": 2388 }, { "epoch": 1.7630996309963098, "grad_norm": 0.10347940705712172, "learning_rate": 8.689771271866713e-05, "loss": 0.0217, "step": 2389 }, { "epoch": 1.7638376383763839, "grad_norm": 0.2819702750070552, "learning_rate": 8.681257513172136e-05, "loss": 0.064, "step": 2390 }, { "epoch": 1.7645756457564574, "grad_norm": 0.2819991205087313, "learning_rate": 8.672744727162781e-05, "loss": 0.0401, "step": 2391 }, { "epoch": 1.7653136531365314, "grad_norm": 0.11923202745922168, "learning_rate": 8.664232920117548e-05, "loss": 0.0214, "step": 2392 }, { "epoch": 1.766051660516605, "grad_norm": 0.20514554561134762, "learning_rate": 8.655722098314617e-05, "loss": 0.0311, "step": 2393 }, { "epoch": 1.766789667896679, "grad_norm": 0.3788824829972903, "learning_rate": 8.647212268031456e-05, "loss": 0.0455, "step": 2394 }, { "epoch": 1.7675276752767528, "grad_norm": 0.14239069403614185, "learning_rate": 8.638703435544783e-05, "loss": 0.0262, "step": 2395 }, { "epoch": 1.7682656826568266, "grad_norm": 0.2539145743816121, "learning_rate": 8.630195607130596e-05, "loss": 0.1119, "step": 2396 }, { "epoch": 1.7690036900369004, "grad_norm": 0.174632984173497, "learning_rate": 8.621688789064136e-05, "loss": 0.0294, "step": 2397 }, { "epoch": 1.7697416974169742, "grad_norm": 0.18360034317571602, "learning_rate": 8.613182987619918e-05, "loss": 0.0351, "step": 2398 }, { "epoch": 1.770479704797048, "grad_norm": 0.6333585995012826, "learning_rate": 8.604678209071693e-05, "loss": 0.0774, "step": 2399 }, { "epoch": 1.7712177121771218, "grad_norm": 0.2328633469840744, "learning_rate": 8.596174459692455e-05, "loss": 0.0255, "step": 2400 }, { "epoch": 1.7719557195571956, "grad_norm": 0.15693579264649574, "learning_rate": 8.587671745754458e-05, "loss": 0.0179, "step": 2401 }, { "epoch": 1.7726937269372693, "grad_norm": 0.2907722473415754, "learning_rate": 8.579170073529164e-05, "loss": 0.0503, "step": 2402 }, { "epoch": 1.7734317343173431, "grad_norm": 0.22623055009200196, "learning_rate": 8.570669449287301e-05, "loss": 0.0304, "step": 2403 }, { "epoch": 1.774169741697417, "grad_norm": 0.3949257281370934, "learning_rate": 8.562169879298787e-05, "loss": 0.1105, "step": 2404 }, { "epoch": 1.774907749077491, "grad_norm": 0.22439843597210854, "learning_rate": 8.553671369832798e-05, "loss": 0.0395, "step": 2405 }, { "epoch": 1.7756457564575645, "grad_norm": 0.272985903484342, "learning_rate": 8.545173927157698e-05, "loss": 0.062, "step": 2406 }, { "epoch": 1.7763837638376385, "grad_norm": 0.1706131951242555, "learning_rate": 8.53667755754108e-05, "loss": 0.0249, "step": 2407 }, { "epoch": 1.777121771217712, "grad_norm": 0.32698931011492793, "learning_rate": 8.528182267249745e-05, "loss": 0.0441, "step": 2408 }, { "epoch": 1.777859778597786, "grad_norm": 0.2617962858374623, "learning_rate": 8.519688062549691e-05, "loss": 0.0566, "step": 2409 }, { "epoch": 1.7785977859778597, "grad_norm": 0.14825133928056267, "learning_rate": 8.511194949706124e-05, "loss": 0.0258, "step": 2410 }, { "epoch": 1.7793357933579337, "grad_norm": 0.08117877494180274, "learning_rate": 8.502702934983436e-05, "loss": 0.0163, "step": 2411 }, { "epoch": 1.7800738007380073, "grad_norm": 0.20911654028242324, "learning_rate": 8.494212024645216e-05, "loss": 0.0294, "step": 2412 }, { "epoch": 1.7808118081180813, "grad_norm": 0.23373793638432533, "learning_rate": 8.485722224954237e-05, "loss": 0.0426, "step": 2413 }, { "epoch": 1.7815498154981548, "grad_norm": 0.33690216300601755, "learning_rate": 8.477233542172442e-05, "loss": 0.0381, "step": 2414 }, { "epoch": 1.7822878228782288, "grad_norm": 0.32517998043104096, "learning_rate": 8.468745982560974e-05, "loss": 0.0588, "step": 2415 }, { "epoch": 1.7830258302583026, "grad_norm": 0.18040102022417367, "learning_rate": 8.460259552380119e-05, "loss": 0.0351, "step": 2416 }, { "epoch": 1.7837638376383764, "grad_norm": 0.20759853208886966, "learning_rate": 8.451774257889356e-05, "loss": 0.0354, "step": 2417 }, { "epoch": 1.7845018450184502, "grad_norm": 0.28340392491704963, "learning_rate": 8.443290105347304e-05, "loss": 0.0747, "step": 2418 }, { "epoch": 1.785239852398524, "grad_norm": 0.19233533639094688, "learning_rate": 8.43480710101176e-05, "loss": 0.0305, "step": 2419 }, { "epoch": 1.7859778597785978, "grad_norm": 0.20576470571219654, "learning_rate": 8.426325251139659e-05, "loss": 0.0444, "step": 2420 }, { "epoch": 1.7867158671586716, "grad_norm": 0.2600831006595423, "learning_rate": 8.417844561987086e-05, "loss": 0.0348, "step": 2421 }, { "epoch": 1.7874538745387454, "grad_norm": 0.12345935537126823, "learning_rate": 8.409365039809281e-05, "loss": 0.0319, "step": 2422 }, { "epoch": 1.7881918819188192, "grad_norm": 0.2181675882577023, "learning_rate": 8.40088669086061e-05, "loss": 0.0315, "step": 2423 }, { "epoch": 1.788929889298893, "grad_norm": 0.16971723566685396, "learning_rate": 8.392409521394584e-05, "loss": 0.0418, "step": 2424 }, { "epoch": 1.7896678966789668, "grad_norm": 0.4161068839629117, "learning_rate": 8.383933537663839e-05, "loss": 0.0836, "step": 2425 }, { "epoch": 1.7904059040590405, "grad_norm": 0.1671664040101032, "learning_rate": 8.37545874592013e-05, "loss": 0.0324, "step": 2426 }, { "epoch": 1.7911439114391143, "grad_norm": 0.23500225561249696, "learning_rate": 8.366985152414349e-05, "loss": 0.0455, "step": 2427 }, { "epoch": 1.7918819188191883, "grad_norm": 0.2943072163898591, "learning_rate": 8.35851276339649e-05, "loss": 0.0508, "step": 2428 }, { "epoch": 1.792619926199262, "grad_norm": 0.17153689642745804, "learning_rate": 8.350041585115668e-05, "loss": 0.0366, "step": 2429 }, { "epoch": 1.793357933579336, "grad_norm": 0.5914826469144138, "learning_rate": 8.34157162382009e-05, "loss": 0.0984, "step": 2430 }, { "epoch": 1.7940959409594095, "grad_norm": 0.1975159371163137, "learning_rate": 8.333102885757089e-05, "loss": 0.0341, "step": 2431 }, { "epoch": 1.7948339483394835, "grad_norm": 0.18519932100003936, "learning_rate": 8.324635377173075e-05, "loss": 0.0309, "step": 2432 }, { "epoch": 1.795571955719557, "grad_norm": 0.27357377943251315, "learning_rate": 8.316169104313558e-05, "loss": 0.0346, "step": 2433 }, { "epoch": 1.796309963099631, "grad_norm": 0.4616100296702875, "learning_rate": 8.307704073423141e-05, "loss": 0.179, "step": 2434 }, { "epoch": 1.7970479704797047, "grad_norm": 0.22967499230743035, "learning_rate": 8.299240290745505e-05, "loss": 0.0742, "step": 2435 }, { "epoch": 1.7977859778597787, "grad_norm": 0.14360513339459008, "learning_rate": 8.290777762523415e-05, "loss": 0.0313, "step": 2436 }, { "epoch": 1.7985239852398522, "grad_norm": 0.4191919569991887, "learning_rate": 8.282316494998705e-05, "loss": 0.0901, "step": 2437 }, { "epoch": 1.7992619926199263, "grad_norm": 0.25568834349889236, "learning_rate": 8.273856494412285e-05, "loss": 0.0364, "step": 2438 }, { "epoch": 1.8, "grad_norm": 0.3643616533232209, "learning_rate": 8.265397767004129e-05, "loss": 0.0644, "step": 2439 }, { "epoch": 1.8007380073800738, "grad_norm": 0.18879994189837973, "learning_rate": 8.256940319013266e-05, "loss": 0.0268, "step": 2440 }, { "epoch": 1.8014760147601476, "grad_norm": 0.3440412291758541, "learning_rate": 8.248484156677791e-05, "loss": 0.062, "step": 2441 }, { "epoch": 1.8022140221402214, "grad_norm": 0.1375880541798873, "learning_rate": 8.240029286234844e-05, "loss": 0.0329, "step": 2442 }, { "epoch": 1.8029520295202952, "grad_norm": 0.1783079476495784, "learning_rate": 8.231575713920616e-05, "loss": 0.0446, "step": 2443 }, { "epoch": 1.803690036900369, "grad_norm": 0.4302054309527638, "learning_rate": 8.223123445970333e-05, "loss": 0.0734, "step": 2444 }, { "epoch": 1.8044280442804428, "grad_norm": 0.17609046495217504, "learning_rate": 8.214672488618275e-05, "loss": 0.0286, "step": 2445 }, { "epoch": 1.8051660516605166, "grad_norm": 0.20358579567186494, "learning_rate": 8.206222848097736e-05, "loss": 0.0277, "step": 2446 }, { "epoch": 1.8059040590405904, "grad_norm": 0.2265296278749522, "learning_rate": 8.197774530641046e-05, "loss": 0.0351, "step": 2447 }, { "epoch": 1.8066420664206642, "grad_norm": 0.14548129032607104, "learning_rate": 8.189327542479568e-05, "loss": 0.0301, "step": 2448 }, { "epoch": 1.8073800738007382, "grad_norm": 0.17637311832155592, "learning_rate": 8.180881889843672e-05, "loss": 0.0478, "step": 2449 }, { "epoch": 1.8081180811808117, "grad_norm": 0.4347443567282224, "learning_rate": 8.17243757896275e-05, "loss": 0.0698, "step": 2450 }, { "epoch": 1.8088560885608858, "grad_norm": 0.10993393268134602, "learning_rate": 8.163994616065202e-05, "loss": 0.0187, "step": 2451 }, { "epoch": 1.8095940959409593, "grad_norm": 0.3079491083949138, "learning_rate": 8.155553007378436e-05, "loss": 0.0353, "step": 2452 }, { "epoch": 1.8103321033210333, "grad_norm": 0.2991885540433099, "learning_rate": 8.147112759128859e-05, "loss": 0.0789, "step": 2453 }, { "epoch": 1.811070110701107, "grad_norm": 0.1530587495770722, "learning_rate": 8.138673877541871e-05, "loss": 0.0199, "step": 2454 }, { "epoch": 1.811808118081181, "grad_norm": 0.25211562422366207, "learning_rate": 8.130236368841872e-05, "loss": 0.0368, "step": 2455 }, { "epoch": 1.8125461254612545, "grad_norm": 0.21812487654943988, "learning_rate": 8.121800239252244e-05, "loss": 0.0306, "step": 2456 }, { "epoch": 1.8132841328413285, "grad_norm": 0.19122842920702549, "learning_rate": 8.113365494995355e-05, "loss": 0.0256, "step": 2457 }, { "epoch": 1.814022140221402, "grad_norm": 0.5478423626003653, "learning_rate": 8.104932142292546e-05, "loss": 0.0781, "step": 2458 }, { "epoch": 1.814760147601476, "grad_norm": 0.17970625865132905, "learning_rate": 8.096500187364136e-05, "loss": 0.0329, "step": 2459 }, { "epoch": 1.8154981549815496, "grad_norm": 0.20197010300938203, "learning_rate": 8.088069636429416e-05, "loss": 0.0301, "step": 2460 }, { "epoch": 1.8162361623616237, "grad_norm": 0.11427660297300182, "learning_rate": 8.07964049570663e-05, "loss": 0.0125, "step": 2461 }, { "epoch": 1.8169741697416975, "grad_norm": 0.15444136027383631, "learning_rate": 8.071212771412994e-05, "loss": 0.0233, "step": 2462 }, { "epoch": 1.8177121771217712, "grad_norm": 0.10765935928735193, "learning_rate": 8.062786469764672e-05, "loss": 0.0255, "step": 2463 }, { "epoch": 1.818450184501845, "grad_norm": 0.2794399209494395, "learning_rate": 8.054361596976785e-05, "loss": 0.047, "step": 2464 }, { "epoch": 1.8191881918819188, "grad_norm": 0.1331070057631753, "learning_rate": 8.045938159263391e-05, "loss": 0.017, "step": 2465 }, { "epoch": 1.8199261992619926, "grad_norm": 0.15642770597519282, "learning_rate": 8.037516162837499e-05, "loss": 0.0255, "step": 2466 }, { "epoch": 1.8206642066420664, "grad_norm": 0.168367553728185, "learning_rate": 8.02909561391105e-05, "loss": 0.0235, "step": 2467 }, { "epoch": 1.8214022140221402, "grad_norm": 0.17987960279543694, "learning_rate": 8.020676518694916e-05, "loss": 0.0556, "step": 2468 }, { "epoch": 1.822140221402214, "grad_norm": 0.36395348980436293, "learning_rate": 8.0122588833989e-05, "loss": 0.0505, "step": 2469 }, { "epoch": 1.8228782287822878, "grad_norm": 0.15296666103244888, "learning_rate": 8.003842714231728e-05, "loss": 0.0459, "step": 2470 }, { "epoch": 1.8236162361623616, "grad_norm": 0.2907292315536597, "learning_rate": 7.995428017401042e-05, "loss": 0.0291, "step": 2471 }, { "epoch": 1.8243542435424356, "grad_norm": 0.5932381199037793, "learning_rate": 7.987014799113397e-05, "loss": 0.1098, "step": 2472 }, { "epoch": 1.8250922509225092, "grad_norm": 0.45680568399714777, "learning_rate": 7.978603065574269e-05, "loss": 0.0646, "step": 2473 }, { "epoch": 1.8258302583025832, "grad_norm": 0.17113221522847138, "learning_rate": 7.970192822988024e-05, "loss": 0.0497, "step": 2474 }, { "epoch": 1.8265682656826567, "grad_norm": 0.20918194439425972, "learning_rate": 7.961784077557928e-05, "loss": 0.0267, "step": 2475 }, { "epoch": 1.8273062730627307, "grad_norm": 0.17426519226800286, "learning_rate": 7.953376835486161e-05, "loss": 0.0367, "step": 2476 }, { "epoch": 1.8280442804428043, "grad_norm": 0.2649493912018413, "learning_rate": 7.944971102973772e-05, "loss": 0.0552, "step": 2477 }, { "epoch": 1.8287822878228783, "grad_norm": 0.3322366363395717, "learning_rate": 7.936566886220714e-05, "loss": 0.0412, "step": 2478 }, { "epoch": 1.829520295202952, "grad_norm": 0.11132188493864775, "learning_rate": 7.92816419142581e-05, "loss": 0.0219, "step": 2479 }, { "epoch": 1.830258302583026, "grad_norm": 0.1739114151174803, "learning_rate": 7.919763024786767e-05, "loss": 0.0266, "step": 2480 }, { "epoch": 1.8309963099630995, "grad_norm": 0.15614144361288437, "learning_rate": 7.911363392500164e-05, "loss": 0.0275, "step": 2481 }, { "epoch": 1.8317343173431735, "grad_norm": 0.20863673462553467, "learning_rate": 7.902965300761442e-05, "loss": 0.0316, "step": 2482 }, { "epoch": 1.8324723247232473, "grad_norm": 0.1836223266224756, "learning_rate": 7.89456875576492e-05, "loss": 0.0304, "step": 2483 }, { "epoch": 1.833210332103321, "grad_norm": 0.22343203430099326, "learning_rate": 7.886173763703757e-05, "loss": 0.0493, "step": 2484 }, { "epoch": 1.8339483394833949, "grad_norm": 0.16636015970183526, "learning_rate": 7.877780330769984e-05, "loss": 0.0319, "step": 2485 }, { "epoch": 1.8346863468634687, "grad_norm": 0.12826589543229563, "learning_rate": 7.869388463154475e-05, "loss": 0.0219, "step": 2486 }, { "epoch": 1.8354243542435424, "grad_norm": 0.21259990187417674, "learning_rate": 7.860998167046938e-05, "loss": 0.0236, "step": 2487 }, { "epoch": 1.8361623616236162, "grad_norm": 0.18760304145057904, "learning_rate": 7.852609448635949e-05, "loss": 0.0264, "step": 2488 }, { "epoch": 1.83690036900369, "grad_norm": 0.24963205203234215, "learning_rate": 7.84422231410889e-05, "loss": 0.0351, "step": 2489 }, { "epoch": 1.8376383763837638, "grad_norm": 0.24568342124550613, "learning_rate": 7.835836769652001e-05, "loss": 0.041, "step": 2490 }, { "epoch": 1.8383763837638376, "grad_norm": 0.17305399387034703, "learning_rate": 7.827452821450327e-05, "loss": 0.0329, "step": 2491 }, { "epoch": 1.8391143911439114, "grad_norm": 0.22473828086391914, "learning_rate": 7.819070475687755e-05, "loss": 0.0714, "step": 2492 }, { "epoch": 1.8398523985239852, "grad_norm": 0.20817820971577009, "learning_rate": 7.810689738546977e-05, "loss": 0.0446, "step": 2493 }, { "epoch": 1.840590405904059, "grad_norm": 0.2566824351402871, "learning_rate": 7.802310616209498e-05, "loss": 0.0567, "step": 2494 }, { "epoch": 1.841328413284133, "grad_norm": 0.1493195796464928, "learning_rate": 7.793933114855643e-05, "loss": 0.0413, "step": 2495 }, { "epoch": 1.8420664206642066, "grad_norm": 0.1397976388188946, "learning_rate": 7.785557240664528e-05, "loss": 0.0291, "step": 2496 }, { "epoch": 1.8428044280442806, "grad_norm": 0.1351301415816085, "learning_rate": 7.777182999814084e-05, "loss": 0.0341, "step": 2497 }, { "epoch": 1.8435424354243541, "grad_norm": 0.3236900003692387, "learning_rate": 7.768810398481022e-05, "loss": 0.0746, "step": 2498 }, { "epoch": 1.8442804428044282, "grad_norm": 0.2734109856412319, "learning_rate": 7.760439442840854e-05, "loss": 0.0368, "step": 2499 }, { "epoch": 1.8450184501845017, "grad_norm": 0.2280982670015393, "learning_rate": 7.752070139067878e-05, "loss": 0.0238, "step": 2500 }, { "epoch": 1.8457564575645757, "grad_norm": 0.32194990429195863, "learning_rate": 7.743702493335159e-05, "loss": 0.11, "step": 2501 }, { "epoch": 1.8464944649446493, "grad_norm": 0.1272066217481909, "learning_rate": 7.735336511814563e-05, "loss": 0.0231, "step": 2502 }, { "epoch": 1.8472324723247233, "grad_norm": 0.20302530452962209, "learning_rate": 7.726972200676704e-05, "loss": 0.0487, "step": 2503 }, { "epoch": 1.8479704797047969, "grad_norm": 0.17065259490022808, "learning_rate": 7.71860956609099e-05, "loss": 0.0432, "step": 2504 }, { "epoch": 1.848708487084871, "grad_norm": 0.1490426480923922, "learning_rate": 7.710248614225564e-05, "loss": 0.0315, "step": 2505 }, { "epoch": 1.8494464944649447, "grad_norm": 0.3190131365213238, "learning_rate": 7.701889351247354e-05, "loss": 0.0555, "step": 2506 }, { "epoch": 1.8501845018450185, "grad_norm": 0.26557499087742187, "learning_rate": 7.693531783322023e-05, "loss": 0.0365, "step": 2507 }, { "epoch": 1.8509225092250923, "grad_norm": 0.18146394948244754, "learning_rate": 7.685175916613992e-05, "loss": 0.0284, "step": 2508 }, { "epoch": 1.851660516605166, "grad_norm": 0.19621896921660612, "learning_rate": 7.676821757286427e-05, "loss": 0.0357, "step": 2509 }, { "epoch": 1.8523985239852399, "grad_norm": 0.13882765915850448, "learning_rate": 7.668469311501237e-05, "loss": 0.0309, "step": 2510 }, { "epoch": 1.8531365313653136, "grad_norm": 0.27259333406150144, "learning_rate": 7.66011858541906e-05, "loss": 0.0466, "step": 2511 }, { "epoch": 1.8538745387453874, "grad_norm": 0.2433773958199022, "learning_rate": 7.651769585199271e-05, "loss": 0.0548, "step": 2512 }, { "epoch": 1.8546125461254612, "grad_norm": 0.13106066423096788, "learning_rate": 7.643422316999971e-05, "loss": 0.0209, "step": 2513 }, { "epoch": 1.855350553505535, "grad_norm": 0.4089881734045373, "learning_rate": 7.635076786977989e-05, "loss": 0.1154, "step": 2514 }, { "epoch": 1.8560885608856088, "grad_norm": 0.24344042404926733, "learning_rate": 7.626733001288851e-05, "loss": 0.0489, "step": 2515 }, { "epoch": 1.8568265682656828, "grad_norm": 0.1396194954623259, "learning_rate": 7.61839096608683e-05, "loss": 0.0264, "step": 2516 }, { "epoch": 1.8575645756457564, "grad_norm": 0.24349896808596108, "learning_rate": 7.610050687524872e-05, "loss": 0.0272, "step": 2517 }, { "epoch": 1.8583025830258304, "grad_norm": 0.18338859127565993, "learning_rate": 7.601712171754662e-05, "loss": 0.0412, "step": 2518 }, { "epoch": 1.859040590405904, "grad_norm": 0.2477037042048173, "learning_rate": 7.59337542492655e-05, "loss": 0.0136, "step": 2519 }, { "epoch": 1.859778597785978, "grad_norm": 0.17062809934256803, "learning_rate": 7.585040453189615e-05, "loss": 0.0244, "step": 2520 }, { "epoch": 1.8605166051660516, "grad_norm": 0.22499302288935955, "learning_rate": 7.576707262691602e-05, "loss": 0.0313, "step": 2521 }, { "epoch": 1.8612546125461256, "grad_norm": 0.20899874527804416, "learning_rate": 7.568375859578948e-05, "loss": 0.0315, "step": 2522 }, { "epoch": 1.8619926199261991, "grad_norm": 0.17830167154023963, "learning_rate": 7.560046249996782e-05, "loss": 0.0349, "step": 2523 }, { "epoch": 1.8627306273062731, "grad_norm": 0.18291664780811898, "learning_rate": 7.551718440088896e-05, "loss": 0.0214, "step": 2524 }, { "epoch": 1.8634686346863467, "grad_norm": 0.43280162185743853, "learning_rate": 7.543392435997766e-05, "loss": 0.0476, "step": 2525 }, { "epoch": 1.8642066420664207, "grad_norm": 0.2689084174526102, "learning_rate": 7.535068243864527e-05, "loss": 0.0423, "step": 2526 }, { "epoch": 1.8649446494464945, "grad_norm": 0.10515020904663391, "learning_rate": 7.526745869828985e-05, "loss": 0.0235, "step": 2527 }, { "epoch": 1.8656826568265683, "grad_norm": 0.23607172889635705, "learning_rate": 7.5184253200296e-05, "loss": 0.0426, "step": 2528 }, { "epoch": 1.866420664206642, "grad_norm": 0.35628307673816606, "learning_rate": 7.510106600603488e-05, "loss": 0.061, "step": 2529 }, { "epoch": 1.867158671586716, "grad_norm": 0.3864436247746933, "learning_rate": 7.501789717686418e-05, "loss": 0.0493, "step": 2530 }, { "epoch": 1.8678966789667897, "grad_norm": 0.21721151476035858, "learning_rate": 7.493474677412794e-05, "loss": 0.0405, "step": 2531 }, { "epoch": 1.8686346863468635, "grad_norm": 0.12182630396229298, "learning_rate": 7.48516148591568e-05, "loss": 0.0173, "step": 2532 }, { "epoch": 1.8693726937269373, "grad_norm": 0.16917009925056933, "learning_rate": 7.476850149326754e-05, "loss": 0.0138, "step": 2533 }, { "epoch": 1.870110701107011, "grad_norm": 0.14287854767732064, "learning_rate": 7.468540673776339e-05, "loss": 0.0244, "step": 2534 }, { "epoch": 1.8708487084870848, "grad_norm": 0.1724291517196122, "learning_rate": 7.460233065393387e-05, "loss": 0.0287, "step": 2535 }, { "epoch": 1.8715867158671586, "grad_norm": 0.28988479892789387, "learning_rate": 7.451927330305464e-05, "loss": 0.0375, "step": 2536 }, { "epoch": 1.8723247232472324, "grad_norm": 0.3732775486875892, "learning_rate": 7.443623474638763e-05, "loss": 0.0334, "step": 2537 }, { "epoch": 1.8730627306273062, "grad_norm": 0.2026428273888483, "learning_rate": 7.435321504518085e-05, "loss": 0.0466, "step": 2538 }, { "epoch": 1.8738007380073802, "grad_norm": 0.27847977621817455, "learning_rate": 7.427021426066843e-05, "loss": 0.0546, "step": 2539 }, { "epoch": 1.8745387453874538, "grad_norm": 0.35945901450569856, "learning_rate": 7.41872324540705e-05, "loss": 0.0509, "step": 2540 }, { "epoch": 1.8752767527675278, "grad_norm": 0.22169331657964952, "learning_rate": 7.410426968659327e-05, "loss": 0.0585, "step": 2541 }, { "epoch": 1.8760147601476014, "grad_norm": 0.3771554262549975, "learning_rate": 7.402132601942889e-05, "loss": 0.0292, "step": 2542 }, { "epoch": 1.8767527675276754, "grad_norm": 0.1955767159237433, "learning_rate": 7.39384015137553e-05, "loss": 0.0484, "step": 2543 }, { "epoch": 1.877490774907749, "grad_norm": 0.4066653627965495, "learning_rate": 7.38554962307365e-05, "loss": 0.0867, "step": 2544 }, { "epoch": 1.878228782287823, "grad_norm": 0.22911554966361494, "learning_rate": 7.377261023152219e-05, "loss": 0.0415, "step": 2545 }, { "epoch": 1.8789667896678965, "grad_norm": 0.3928430634530449, "learning_rate": 7.368974357724789e-05, "loss": 0.0451, "step": 2546 }, { "epoch": 1.8797047970479706, "grad_norm": 0.28573398183261756, "learning_rate": 7.36068963290348e-05, "loss": 0.0439, "step": 2547 }, { "epoch": 1.8804428044280441, "grad_norm": 0.2002837353408956, "learning_rate": 7.352406854798983e-05, "loss": 0.0306, "step": 2548 }, { "epoch": 1.8811808118081181, "grad_norm": 0.26662764789442744, "learning_rate": 7.34412602952056e-05, "loss": 0.0344, "step": 2549 }, { "epoch": 1.881918819188192, "grad_norm": 0.10971269450820395, "learning_rate": 7.335847163176021e-05, "loss": 0.0222, "step": 2550 }, { "epoch": 1.8826568265682657, "grad_norm": 0.2815288228689489, "learning_rate": 7.327570261871742e-05, "loss": 0.0989, "step": 2551 }, { "epoch": 1.8833948339483395, "grad_norm": 0.1607333667945276, "learning_rate": 7.319295331712638e-05, "loss": 0.0245, "step": 2552 }, { "epoch": 1.8841328413284133, "grad_norm": 0.23910382473078493, "learning_rate": 7.311022378802187e-05, "loss": 0.0351, "step": 2553 }, { "epoch": 1.884870848708487, "grad_norm": 0.13549260230790142, "learning_rate": 7.30275140924239e-05, "loss": 0.0287, "step": 2554 }, { "epoch": 1.8856088560885609, "grad_norm": 0.4069754140551746, "learning_rate": 7.294482429133796e-05, "loss": 0.0585, "step": 2555 }, { "epoch": 1.8863468634686347, "grad_norm": 0.2455210806113813, "learning_rate": 7.286215444575483e-05, "loss": 0.0423, "step": 2556 }, { "epoch": 1.8870848708487085, "grad_norm": 0.373276383581375, "learning_rate": 7.277950461665059e-05, "loss": 0.0702, "step": 2557 }, { "epoch": 1.8878228782287823, "grad_norm": 0.1285221929028311, "learning_rate": 7.269687486498656e-05, "loss": 0.0281, "step": 2558 }, { "epoch": 1.888560885608856, "grad_norm": 0.26094065411180556, "learning_rate": 7.261426525170922e-05, "loss": 0.0577, "step": 2559 }, { "epoch": 1.8892988929889298, "grad_norm": 0.25349204584267604, "learning_rate": 7.253167583775025e-05, "loss": 0.0274, "step": 2560 }, { "epoch": 1.8900369003690036, "grad_norm": 0.12360236822353675, "learning_rate": 7.244910668402637e-05, "loss": 0.0333, "step": 2561 }, { "epoch": 1.8907749077490776, "grad_norm": 0.06913103175210993, "learning_rate": 7.236655785143935e-05, "loss": 0.0177, "step": 2562 }, { "epoch": 1.8915129151291512, "grad_norm": 0.14429896031601724, "learning_rate": 7.228402940087606e-05, "loss": 0.0631, "step": 2563 }, { "epoch": 1.8922509225092252, "grad_norm": 0.3124668084367191, "learning_rate": 7.220152139320824e-05, "loss": 0.0763, "step": 2564 }, { "epoch": 1.8929889298892988, "grad_norm": 0.160031414796651, "learning_rate": 7.211903388929264e-05, "loss": 0.0505, "step": 2565 }, { "epoch": 1.8937269372693728, "grad_norm": 0.2884764329940227, "learning_rate": 7.203656694997078e-05, "loss": 0.0416, "step": 2566 }, { "epoch": 1.8944649446494464, "grad_norm": 0.28670063880142627, "learning_rate": 7.195412063606912e-05, "loss": 0.0652, "step": 2567 }, { "epoch": 1.8952029520295204, "grad_norm": 0.20911351841634063, "learning_rate": 7.187169500839884e-05, "loss": 0.0223, "step": 2568 }, { "epoch": 1.895940959409594, "grad_norm": 0.23223296548847286, "learning_rate": 7.178929012775586e-05, "loss": 0.049, "step": 2569 }, { "epoch": 1.896678966789668, "grad_norm": 0.1558022103535015, "learning_rate": 7.170690605492086e-05, "loss": 0.026, "step": 2570 }, { "epoch": 1.8974169741697415, "grad_norm": 0.22072404144975022, "learning_rate": 7.16245428506591e-05, "loss": 0.0414, "step": 2571 }, { "epoch": 1.8981549815498155, "grad_norm": 0.15539963097160728, "learning_rate": 7.154220057572049e-05, "loss": 0.0217, "step": 2572 }, { "epoch": 1.8988929889298893, "grad_norm": 0.15361857661759598, "learning_rate": 7.145987929083946e-05, "loss": 0.0325, "step": 2573 }, { "epoch": 1.8996309963099631, "grad_norm": 0.21067875198480282, "learning_rate": 7.137757905673506e-05, "loss": 0.0335, "step": 2574 }, { "epoch": 1.900369003690037, "grad_norm": 0.39860406449553376, "learning_rate": 7.12952999341107e-05, "loss": 0.0498, "step": 2575 }, { "epoch": 1.9011070110701107, "grad_norm": 0.21474408692375233, "learning_rate": 7.121304198365421e-05, "loss": 0.0309, "step": 2576 }, { "epoch": 1.9018450184501845, "grad_norm": 0.23954183087716271, "learning_rate": 7.113080526603792e-05, "loss": 0.0565, "step": 2577 }, { "epoch": 1.9025830258302583, "grad_norm": 0.23819573659668028, "learning_rate": 7.10485898419184e-05, "loss": 0.0449, "step": 2578 }, { "epoch": 1.903321033210332, "grad_norm": 0.33870457329031883, "learning_rate": 7.096639577193658e-05, "loss": 0.0481, "step": 2579 }, { "epoch": 1.9040590405904059, "grad_norm": 0.09875014286551671, "learning_rate": 7.088422311671756e-05, "loss": 0.0139, "step": 2580 }, { "epoch": 1.9047970479704797, "grad_norm": 0.11866311096116064, "learning_rate": 7.080207193687076e-05, "loss": 0.0166, "step": 2581 }, { "epoch": 1.9055350553505535, "grad_norm": 0.20218195836888803, "learning_rate": 7.071994229298962e-05, "loss": 0.0488, "step": 2582 }, { "epoch": 1.9062730627306275, "grad_norm": 0.14723210510631712, "learning_rate": 7.06378342456518e-05, "loss": 0.0322, "step": 2583 }, { "epoch": 1.907011070110701, "grad_norm": 0.2155993676980411, "learning_rate": 7.055574785541901e-05, "loss": 0.0296, "step": 2584 }, { "epoch": 1.907749077490775, "grad_norm": 0.6194583653283506, "learning_rate": 7.047368318283692e-05, "loss": 0.0518, "step": 2585 }, { "epoch": 1.9084870848708486, "grad_norm": 0.3318439923531232, "learning_rate": 7.03916402884353e-05, "loss": 0.0374, "step": 2586 }, { "epoch": 1.9092250922509226, "grad_norm": 0.09568619194409499, "learning_rate": 7.03096192327278e-05, "loss": 0.0111, "step": 2587 }, { "epoch": 1.9099630996309962, "grad_norm": 1.1377147905493195, "learning_rate": 7.022762007621186e-05, "loss": 0.0488, "step": 2588 }, { "epoch": 1.9107011070110702, "grad_norm": 0.3181781503512673, "learning_rate": 7.014564287936896e-05, "loss": 0.0411, "step": 2589 }, { "epoch": 1.9114391143911438, "grad_norm": 0.23633821257197346, "learning_rate": 7.006368770266421e-05, "loss": 0.0309, "step": 2590 }, { "epoch": 1.9121771217712178, "grad_norm": 0.11977509822637726, "learning_rate": 6.998175460654662e-05, "loss": 0.0216, "step": 2591 }, { "epoch": 1.9129151291512914, "grad_norm": 0.20627689205727698, "learning_rate": 6.989984365144878e-05, "loss": 0.0233, "step": 2592 }, { "epoch": 1.9136531365313654, "grad_norm": 0.15050245653825972, "learning_rate": 6.981795489778709e-05, "loss": 0.019, "step": 2593 }, { "epoch": 1.9143911439114392, "grad_norm": 0.21970278638928958, "learning_rate": 6.973608840596144e-05, "loss": 0.0313, "step": 2594 }, { "epoch": 1.915129151291513, "grad_norm": 0.14973403922253287, "learning_rate": 6.965424423635535e-05, "loss": 0.041, "step": 2595 }, { "epoch": 1.9158671586715867, "grad_norm": 0.41722685882284455, "learning_rate": 6.957242244933593e-05, "loss": 0.0519, "step": 2596 }, { "epoch": 1.9166051660516605, "grad_norm": 0.3114128175139535, "learning_rate": 6.949062310525371e-05, "loss": 0.0456, "step": 2597 }, { "epoch": 1.9173431734317343, "grad_norm": 0.1448128971308313, "learning_rate": 6.940884626444273e-05, "loss": 0.0272, "step": 2598 }, { "epoch": 1.9180811808118081, "grad_norm": 0.15343252731927018, "learning_rate": 6.932709198722034e-05, "loss": 0.021, "step": 2599 }, { "epoch": 1.918819188191882, "grad_norm": 0.342831296289626, "learning_rate": 6.924536033388734e-05, "loss": 0.0451, "step": 2600 }, { "epoch": 1.9195571955719557, "grad_norm": 0.21011197879489327, "learning_rate": 6.916365136472782e-05, "loss": 0.0378, "step": 2601 }, { "epoch": 1.9202952029520295, "grad_norm": 0.1879566929689338, "learning_rate": 6.908196514000905e-05, "loss": 0.0196, "step": 2602 }, { "epoch": 1.9210332103321033, "grad_norm": 0.21478663463864023, "learning_rate": 6.900030171998169e-05, "loss": 0.0341, "step": 2603 }, { "epoch": 1.921771217712177, "grad_norm": 0.18873178939598087, "learning_rate": 6.891866116487938e-05, "loss": 0.0425, "step": 2604 }, { "epoch": 1.9225092250922509, "grad_norm": 0.32580226812643837, "learning_rate": 6.883704353491911e-05, "loss": 0.0632, "step": 2605 }, { "epoch": 1.9232472324723249, "grad_norm": 0.18848863663495657, "learning_rate": 6.875544889030077e-05, "loss": 0.0553, "step": 2606 }, { "epoch": 1.9239852398523984, "grad_norm": 0.2004377295069113, "learning_rate": 6.867387729120746e-05, "loss": 0.0357, "step": 2607 }, { "epoch": 1.9247232472324725, "grad_norm": 0.1468241673894683, "learning_rate": 6.859232879780515e-05, "loss": 0.0236, "step": 2608 }, { "epoch": 1.925461254612546, "grad_norm": 0.14199204293018775, "learning_rate": 6.851080347024279e-05, "loss": 0.0283, "step": 2609 }, { "epoch": 1.92619926199262, "grad_norm": 0.3860529441490429, "learning_rate": 6.842930136865233e-05, "loss": 0.0354, "step": 2610 }, { "epoch": 1.9269372693726936, "grad_norm": 0.2683818611352399, "learning_rate": 6.834782255314849e-05, "loss": 0.0498, "step": 2611 }, { "epoch": 1.9276752767527676, "grad_norm": 0.29011090260081274, "learning_rate": 6.82663670838289e-05, "loss": 0.0495, "step": 2612 }, { "epoch": 1.9284132841328412, "grad_norm": 0.10014483062014516, "learning_rate": 6.818493502077388e-05, "loss": 0.0198, "step": 2613 }, { "epoch": 1.9291512915129152, "grad_norm": 0.16216149637109198, "learning_rate": 6.810352642404656e-05, "loss": 0.0275, "step": 2614 }, { "epoch": 1.9298892988929888, "grad_norm": 0.1768328932534676, "learning_rate": 6.802214135369274e-05, "loss": 0.0383, "step": 2615 }, { "epoch": 1.9306273062730628, "grad_norm": 0.12776437089652198, "learning_rate": 6.79407798697408e-05, "loss": 0.0226, "step": 2616 }, { "epoch": 1.9313653136531366, "grad_norm": 0.4788157977513444, "learning_rate": 6.785944203220189e-05, "loss": 0.0683, "step": 2617 }, { "epoch": 1.9321033210332104, "grad_norm": 0.1844636450272322, "learning_rate": 6.777812790106948e-05, "loss": 0.0445, "step": 2618 }, { "epoch": 1.9328413284132842, "grad_norm": 0.2972308450610834, "learning_rate": 6.769683753631981e-05, "loss": 0.0542, "step": 2619 }, { "epoch": 1.933579335793358, "grad_norm": 0.4840901916590707, "learning_rate": 6.761557099791136e-05, "loss": 0.0464, "step": 2620 }, { "epoch": 1.9343173431734317, "grad_norm": 0.2622682587910348, "learning_rate": 6.753432834578525e-05, "loss": 0.0495, "step": 2621 }, { "epoch": 1.9350553505535055, "grad_norm": 0.2017146951885201, "learning_rate": 6.745310963986479e-05, "loss": 0.0281, "step": 2622 }, { "epoch": 1.9357933579335793, "grad_norm": 0.11560737525849829, "learning_rate": 6.737191494005574e-05, "loss": 0.0327, "step": 2623 }, { "epoch": 1.936531365313653, "grad_norm": 0.19802537923200547, "learning_rate": 6.729074430624615e-05, "loss": 0.0357, "step": 2624 }, { "epoch": 1.937269372693727, "grad_norm": 0.3137371195003947, "learning_rate": 6.720959779830626e-05, "loss": 0.0314, "step": 2625 }, { "epoch": 1.9380073800738007, "grad_norm": 0.18411729636270355, "learning_rate": 6.712847547608857e-05, "loss": 0.0409, "step": 2626 }, { "epoch": 1.9387453874538747, "grad_norm": 0.3403350598657615, "learning_rate": 6.70473773994277e-05, "loss": 0.0636, "step": 2627 }, { "epoch": 1.9394833948339483, "grad_norm": 0.25651804192793215, "learning_rate": 6.696630362814045e-05, "loss": 0.0463, "step": 2628 }, { "epoch": 1.9402214022140223, "grad_norm": 0.18267028028626958, "learning_rate": 6.688525422202563e-05, "loss": 0.0456, "step": 2629 }, { "epoch": 1.9409594095940959, "grad_norm": 0.15237665118025887, "learning_rate": 6.680422924086404e-05, "loss": 0.0234, "step": 2630 }, { "epoch": 1.9416974169741699, "grad_norm": 0.11129185439648426, "learning_rate": 6.672322874441863e-05, "loss": 0.0268, "step": 2631 }, { "epoch": 1.9424354243542434, "grad_norm": 0.44029231553888176, "learning_rate": 6.664225279243408e-05, "loss": 0.0795, "step": 2632 }, { "epoch": 1.9431734317343174, "grad_norm": 0.5999693245862872, "learning_rate": 6.656130144463718e-05, "loss": 0.0723, "step": 2633 }, { "epoch": 1.943911439114391, "grad_norm": 0.23181019365874414, "learning_rate": 6.648037476073635e-05, "loss": 0.0434, "step": 2634 }, { "epoch": 1.944649446494465, "grad_norm": 0.7980842834440237, "learning_rate": 6.639947280042202e-05, "loss": 0.1107, "step": 2635 }, { "epoch": 1.9453874538745386, "grad_norm": 0.2879461260597641, "learning_rate": 6.631859562336627e-05, "loss": 0.0387, "step": 2636 }, { "epoch": 1.9461254612546126, "grad_norm": 0.22546957023465733, "learning_rate": 6.623774328922289e-05, "loss": 0.0372, "step": 2637 }, { "epoch": 1.9468634686346864, "grad_norm": 0.09476482175781961, "learning_rate": 6.615691585762742e-05, "loss": 0.0158, "step": 2638 }, { "epoch": 1.9476014760147602, "grad_norm": 0.3260180407947504, "learning_rate": 6.607611338819697e-05, "loss": 0.0638, "step": 2639 }, { "epoch": 1.948339483394834, "grad_norm": 0.19313150691234177, "learning_rate": 6.59953359405303e-05, "loss": 0.04, "step": 2640 }, { "epoch": 1.9490774907749078, "grad_norm": 0.275382136165543, "learning_rate": 6.591458357420764e-05, "loss": 0.0527, "step": 2641 }, { "epoch": 1.9498154981549816, "grad_norm": 0.19025179362613073, "learning_rate": 6.583385634879075e-05, "loss": 0.0538, "step": 2642 }, { "epoch": 1.9505535055350554, "grad_norm": 0.07802085418456817, "learning_rate": 6.57531543238229e-05, "loss": 0.0092, "step": 2643 }, { "epoch": 1.9512915129151291, "grad_norm": 0.23969261608747575, "learning_rate": 6.567247755882868e-05, "loss": 0.0306, "step": 2644 }, { "epoch": 1.952029520295203, "grad_norm": 0.17159846575629528, "learning_rate": 6.559182611331415e-05, "loss": 0.0379, "step": 2645 }, { "epoch": 1.9527675276752767, "grad_norm": 0.3452804646232381, "learning_rate": 6.551120004676654e-05, "loss": 0.0402, "step": 2646 }, { "epoch": 1.9535055350553505, "grad_norm": 0.28254460051656316, "learning_rate": 6.543059941865459e-05, "loss": 0.0231, "step": 2647 }, { "epoch": 1.9542435424354243, "grad_norm": 0.15328075780448233, "learning_rate": 6.535002428842807e-05, "loss": 0.0522, "step": 2648 }, { "epoch": 1.954981549815498, "grad_norm": 0.17965724010212447, "learning_rate": 6.526947471551798e-05, "loss": 0.0195, "step": 2649 }, { "epoch": 1.9557195571955721, "grad_norm": 0.21437273059574707, "learning_rate": 6.51889507593366e-05, "loss": 0.0293, "step": 2650 }, { "epoch": 1.9564575645756457, "grad_norm": 0.24611609306419205, "learning_rate": 6.510845247927716e-05, "loss": 0.0591, "step": 2651 }, { "epoch": 1.9571955719557197, "grad_norm": 0.2876800892165275, "learning_rate": 6.502797993471406e-05, "loss": 0.0356, "step": 2652 }, { "epoch": 1.9579335793357933, "grad_norm": 0.19407311994921614, "learning_rate": 6.494753318500265e-05, "loss": 0.0313, "step": 2653 }, { "epoch": 1.9586715867158673, "grad_norm": 0.23133370926970095, "learning_rate": 6.48671122894793e-05, "loss": 0.0295, "step": 2654 }, { "epoch": 1.9594095940959408, "grad_norm": 0.14173042686057805, "learning_rate": 6.478671730746126e-05, "loss": 0.0293, "step": 2655 }, { "epoch": 1.9601476014760149, "grad_norm": 0.26395836222107916, "learning_rate": 6.47063482982467e-05, "loss": 0.0336, "step": 2656 }, { "epoch": 1.9608856088560884, "grad_norm": 0.2362466795991651, "learning_rate": 6.462600532111466e-05, "loss": 0.0392, "step": 2657 }, { "epoch": 1.9616236162361624, "grad_norm": 0.17080754855969604, "learning_rate": 6.454568843532489e-05, "loss": 0.0313, "step": 2658 }, { "epoch": 1.962361623616236, "grad_norm": 0.1701567684636816, "learning_rate": 6.446539770011804e-05, "loss": 0.0487, "step": 2659 }, { "epoch": 1.96309963099631, "grad_norm": 0.18251685257370773, "learning_rate": 6.438513317471529e-05, "loss": 0.041, "step": 2660 }, { "epoch": 1.9638376383763838, "grad_norm": 0.3377259176736572, "learning_rate": 6.430489491831868e-05, "loss": 0.0253, "step": 2661 }, { "epoch": 1.9645756457564576, "grad_norm": 0.29074219358318165, "learning_rate": 6.422468299011069e-05, "loss": 0.0452, "step": 2662 }, { "epoch": 1.9653136531365314, "grad_norm": 0.1890414792509614, "learning_rate": 6.414449744925448e-05, "loss": 0.039, "step": 2663 }, { "epoch": 1.9660516605166052, "grad_norm": 0.37509735137766564, "learning_rate": 6.406433835489379e-05, "loss": 0.0416, "step": 2664 }, { "epoch": 1.966789667896679, "grad_norm": 0.1951365289772601, "learning_rate": 6.398420576615274e-05, "loss": 0.0459, "step": 2665 }, { "epoch": 1.9675276752767528, "grad_norm": 0.23545162111697077, "learning_rate": 6.3904099742136e-05, "loss": 0.0379, "step": 2666 }, { "epoch": 1.9682656826568266, "grad_norm": 0.26893156787272604, "learning_rate": 6.382402034192856e-05, "loss": 0.0551, "step": 2667 }, { "epoch": 1.9690036900369003, "grad_norm": 0.325248565188754, "learning_rate": 6.374396762459586e-05, "loss": 0.0449, "step": 2668 }, { "epoch": 1.9697416974169741, "grad_norm": 0.20482344117888332, "learning_rate": 6.366394164918363e-05, "loss": 0.0321, "step": 2669 }, { "epoch": 1.970479704797048, "grad_norm": 0.13694353966840553, "learning_rate": 6.358394247471778e-05, "loss": 0.0242, "step": 2670 }, { "epoch": 1.9712177121771217, "grad_norm": 0.18312477292399454, "learning_rate": 6.350397016020463e-05, "loss": 0.041, "step": 2671 }, { "epoch": 1.9719557195571955, "grad_norm": 0.383978598814017, "learning_rate": 6.342402476463051e-05, "loss": 0.0515, "step": 2672 }, { "epoch": 1.9726937269372695, "grad_norm": 0.23308549687714938, "learning_rate": 6.334410634696203e-05, "loss": 0.0495, "step": 2673 }, { "epoch": 1.973431734317343, "grad_norm": 0.20514501017896355, "learning_rate": 6.326421496614585e-05, "loss": 0.0345, "step": 2674 }, { "epoch": 1.974169741697417, "grad_norm": 0.21490510959838216, "learning_rate": 6.318435068110866e-05, "loss": 0.0405, "step": 2675 }, { "epoch": 1.9749077490774907, "grad_norm": 0.1001323457791152, "learning_rate": 6.310451355075724e-05, "loss": 0.0234, "step": 2676 }, { "epoch": 1.9756457564575647, "grad_norm": 0.21877776931924708, "learning_rate": 6.30247036339782e-05, "loss": 0.0763, "step": 2677 }, { "epoch": 1.9763837638376383, "grad_norm": 0.22335847319012933, "learning_rate": 6.294492098963824e-05, "loss": 0.0403, "step": 2678 }, { "epoch": 1.9771217712177123, "grad_norm": 0.14852926565278243, "learning_rate": 6.286516567658386e-05, "loss": 0.0172, "step": 2679 }, { "epoch": 1.9778597785977858, "grad_norm": 0.2064535757359176, "learning_rate": 6.278543775364143e-05, "loss": 0.0431, "step": 2680 }, { "epoch": 1.9785977859778598, "grad_norm": 0.3141273816673359, "learning_rate": 6.270573727961705e-05, "loss": 0.0375, "step": 2681 }, { "epoch": 1.9793357933579334, "grad_norm": 0.3223345596757006, "learning_rate": 6.262606431329669e-05, "loss": 0.0253, "step": 2682 }, { "epoch": 1.9800738007380074, "grad_norm": 0.3328429063787528, "learning_rate": 6.254641891344595e-05, "loss": 0.0327, "step": 2683 }, { "epoch": 1.9808118081180812, "grad_norm": 0.1286068851074561, "learning_rate": 6.246680113881007e-05, "loss": 0.0103, "step": 2684 }, { "epoch": 1.981549815498155, "grad_norm": 0.19324935208311558, "learning_rate": 6.238721104811403e-05, "loss": 0.0254, "step": 2685 }, { "epoch": 1.9822878228782288, "grad_norm": 0.1833180806570257, "learning_rate": 6.230764870006225e-05, "loss": 0.0293, "step": 2686 }, { "epoch": 1.9830258302583026, "grad_norm": 0.20300489027949803, "learning_rate": 6.222811415333883e-05, "loss": 0.0394, "step": 2687 }, { "epoch": 1.9837638376383764, "grad_norm": 0.24359029215759911, "learning_rate": 6.214860746660721e-05, "loss": 0.0351, "step": 2688 }, { "epoch": 1.9845018450184502, "grad_norm": 0.33471876176861354, "learning_rate": 6.206912869851043e-05, "loss": 0.0839, "step": 2689 }, { "epoch": 1.985239852398524, "grad_norm": 0.14483564948314828, "learning_rate": 6.198967790767087e-05, "loss": 0.032, "step": 2690 }, { "epoch": 1.9859778597785978, "grad_norm": 0.36199160926133217, "learning_rate": 6.191025515269018e-05, "loss": 0.072, "step": 2691 }, { "epoch": 1.9867158671586715, "grad_norm": 0.12290882697452515, "learning_rate": 6.183086049214955e-05, "loss": 0.027, "step": 2692 }, { "epoch": 1.9874538745387453, "grad_norm": 0.18017286931872697, "learning_rate": 6.175149398460924e-05, "loss": 0.0368, "step": 2693 }, { "epoch": 1.9881918819188193, "grad_norm": 0.24907693776929984, "learning_rate": 6.167215568860887e-05, "loss": 0.0312, "step": 2694 }, { "epoch": 1.988929889298893, "grad_norm": 0.2100835021928704, "learning_rate": 6.159284566266719e-05, "loss": 0.0249, "step": 2695 }, { "epoch": 1.989667896678967, "grad_norm": 0.3218339458984578, "learning_rate": 6.15135639652821e-05, "loss": 0.065, "step": 2696 }, { "epoch": 1.9904059040590405, "grad_norm": 0.24303739450536077, "learning_rate": 6.143431065493066e-05, "loss": 0.0581, "step": 2697 }, { "epoch": 1.9911439114391145, "grad_norm": 0.14710456833283211, "learning_rate": 6.135508579006892e-05, "loss": 0.0309, "step": 2698 }, { "epoch": 1.991881918819188, "grad_norm": 0.17818521272726978, "learning_rate": 6.127588942913203e-05, "loss": 0.0375, "step": 2699 }, { "epoch": 1.992619926199262, "grad_norm": 0.21362583121014897, "learning_rate": 6.119672163053402e-05, "loss": 0.0429, "step": 2700 }, { "epoch": 1.9933579335793357, "grad_norm": 0.18515280769219725, "learning_rate": 6.111758245266794e-05, "loss": 0.0246, "step": 2701 }, { "epoch": 1.9940959409594097, "grad_norm": 0.1734976992246722, "learning_rate": 6.10384719539057e-05, "loss": 0.0403, "step": 2702 }, { "epoch": 1.9948339483394832, "grad_norm": 0.26964050542797297, "learning_rate": 6.0959390192597976e-05, "loss": 0.0539, "step": 2703 }, { "epoch": 1.9955719557195573, "grad_norm": 0.1095156673558249, "learning_rate": 6.0880337227074444e-05, "loss": 0.0223, "step": 2704 }, { "epoch": 1.996309963099631, "grad_norm": 0.28929550512486957, "learning_rate": 6.080131311564328e-05, "loss": 0.0403, "step": 2705 }, { "epoch": 1.9970479704797048, "grad_norm": 0.44946759665366454, "learning_rate": 6.0722317916591645e-05, "loss": 0.041, "step": 2706 }, { "epoch": 1.9977859778597786, "grad_norm": 0.33799256675276984, "learning_rate": 6.0643351688185114e-05, "loss": 0.0688, "step": 2707 }, { "epoch": 1.9985239852398524, "grad_norm": 0.3553322578304875, "learning_rate": 6.0564414488668165e-05, "loss": 0.0543, "step": 2708 }, { "epoch": 1.9992619926199262, "grad_norm": 0.35075238566648137, "learning_rate": 6.048550637626362e-05, "loss": 0.0831, "step": 2709 }, { "epoch": 2.0, "grad_norm": 0.14973454366541392, "learning_rate": 6.040662740917298e-05, "loss": 0.045, "step": 2710 }, { "epoch": 2.0, "eval_loss": 0.0575236901640892, "eval_runtime": 578.4994, "eval_samples_per_second": 18.541, "eval_steps_per_second": 2.318, "step": 2710 }, { "epoch": 2.000738007380074, "grad_norm": 0.47519638742117404, "learning_rate": 6.032777764557624e-05, "loss": 0.0453, "step": 2711 }, { "epoch": 2.0014760147601476, "grad_norm": 0.2511689791958012, "learning_rate": 6.02489571436318e-05, "loss": 0.0247, "step": 2712 }, { "epoch": 2.0022140221402216, "grad_norm": 0.19150506431464254, "learning_rate": 6.017016596147656e-05, "loss": 0.0311, "step": 2713 }, { "epoch": 2.002952029520295, "grad_norm": 0.30453780172022776, "learning_rate": 6.0091404157225696e-05, "loss": 0.0445, "step": 2714 }, { "epoch": 2.003690036900369, "grad_norm": 0.1316710018394525, "learning_rate": 6.0012671788972806e-05, "loss": 0.0208, "step": 2715 }, { "epoch": 2.0044280442804427, "grad_norm": 0.23656923518005754, "learning_rate": 5.9933968914789727e-05, "loss": 0.0257, "step": 2716 }, { "epoch": 2.0051660516605168, "grad_norm": 0.21277407460221942, "learning_rate": 5.98552955927265e-05, "loss": 0.0816, "step": 2717 }, { "epoch": 2.0059040590405903, "grad_norm": 0.17433097993132846, "learning_rate": 5.9776651880811516e-05, "loss": 0.0295, "step": 2718 }, { "epoch": 2.0066420664206643, "grad_norm": 0.14574419908405228, "learning_rate": 5.9698037837051116e-05, "loss": 0.0129, "step": 2719 }, { "epoch": 2.007380073800738, "grad_norm": 0.3591679555751239, "learning_rate": 5.961945351942999e-05, "loss": 0.0394, "step": 2720 }, { "epoch": 2.008118081180812, "grad_norm": 0.18448131040152665, "learning_rate": 5.9540898985910666e-05, "loss": 0.0306, "step": 2721 }, { "epoch": 2.0088560885608855, "grad_norm": 0.2082476046208205, "learning_rate": 5.946237429443393e-05, "loss": 0.0273, "step": 2722 }, { "epoch": 2.0095940959409595, "grad_norm": 0.1422969586961379, "learning_rate": 5.9383879502918394e-05, "loss": 0.0243, "step": 2723 }, { "epoch": 2.010332103321033, "grad_norm": 0.15354060373150125, "learning_rate": 5.930541466926064e-05, "loss": 0.0389, "step": 2724 }, { "epoch": 2.011070110701107, "grad_norm": 0.11424882775612055, "learning_rate": 5.9226979851335254e-05, "loss": 0.0108, "step": 2725 }, { "epoch": 2.0118081180811807, "grad_norm": 0.17747088200560682, "learning_rate": 5.914857510699454e-05, "loss": 0.0408, "step": 2726 }, { "epoch": 2.0125461254612547, "grad_norm": 0.13663564432748718, "learning_rate": 5.907020049406877e-05, "loss": 0.0185, "step": 2727 }, { "epoch": 2.0132841328413282, "grad_norm": 0.13588374689433716, "learning_rate": 5.899185607036586e-05, "loss": 0.0158, "step": 2728 }, { "epoch": 2.0140221402214022, "grad_norm": 0.2861879370367172, "learning_rate": 5.891354189367153e-05, "loss": 0.0329, "step": 2729 }, { "epoch": 2.014760147601476, "grad_norm": 0.19695878437050396, "learning_rate": 5.8835258021749205e-05, "loss": 0.1408, "step": 2730 }, { "epoch": 2.01549815498155, "grad_norm": 0.10880562717864711, "learning_rate": 5.875700451233985e-05, "loss": 0.0219, "step": 2731 }, { "epoch": 2.0162361623616234, "grad_norm": 0.15344669743760472, "learning_rate": 5.867878142316221e-05, "loss": 0.0202, "step": 2732 }, { "epoch": 2.0169741697416974, "grad_norm": 0.24274859486864955, "learning_rate": 5.860058881191237e-05, "loss": 0.034, "step": 2733 }, { "epoch": 2.0177121771217714, "grad_norm": 0.12336342126866445, "learning_rate": 5.852242673626421e-05, "loss": 0.0102, "step": 2734 }, { "epoch": 2.018450184501845, "grad_norm": 0.25650287722479775, "learning_rate": 5.844429525386878e-05, "loss": 0.0403, "step": 2735 }, { "epoch": 2.019188191881919, "grad_norm": 0.190096464151048, "learning_rate": 5.8366194422354894e-05, "loss": 0.016, "step": 2736 }, { "epoch": 2.0199261992619926, "grad_norm": 0.3417220904834986, "learning_rate": 5.828812429932844e-05, "loss": 0.0271, "step": 2737 }, { "epoch": 2.0206642066420666, "grad_norm": 0.22318749653955008, "learning_rate": 5.821008494237281e-05, "loss": 0.043, "step": 2738 }, { "epoch": 2.02140221402214, "grad_norm": 0.22307805675362682, "learning_rate": 5.813207640904883e-05, "loss": 0.0238, "step": 2739 }, { "epoch": 2.022140221402214, "grad_norm": 0.19637682200878173, "learning_rate": 5.8054098756894295e-05, "loss": 0.0202, "step": 2740 }, { "epoch": 2.0228782287822877, "grad_norm": 0.16532108560297026, "learning_rate": 5.797615204342447e-05, "loss": 0.022, "step": 2741 }, { "epoch": 2.0236162361623617, "grad_norm": 0.12967749434905076, "learning_rate": 5.789823632613167e-05, "loss": 0.0202, "step": 2742 }, { "epoch": 2.0243542435424353, "grad_norm": 0.2595434871179249, "learning_rate": 5.782035166248549e-05, "loss": 0.0206, "step": 2743 }, { "epoch": 2.0250922509225093, "grad_norm": 0.24214796035724137, "learning_rate": 5.7742498109932394e-05, "loss": 0.0238, "step": 2744 }, { "epoch": 2.025830258302583, "grad_norm": 0.1634435772865654, "learning_rate": 5.7664675725896064e-05, "loss": 0.0228, "step": 2745 }, { "epoch": 2.026568265682657, "grad_norm": 0.11011271533985514, "learning_rate": 5.75868845677772e-05, "loss": 0.018, "step": 2746 }, { "epoch": 2.0273062730627305, "grad_norm": 0.09802479590431985, "learning_rate": 5.75091246929534e-05, "loss": 0.0092, "step": 2747 }, { "epoch": 2.0280442804428045, "grad_norm": 0.23856082238714893, "learning_rate": 5.7431396158779215e-05, "loss": 0.0248, "step": 2748 }, { "epoch": 2.028782287822878, "grad_norm": 0.17294483208346792, "learning_rate": 5.735369902258606e-05, "loss": 0.0123, "step": 2749 }, { "epoch": 2.029520295202952, "grad_norm": 0.24281094011595486, "learning_rate": 5.727603334168219e-05, "loss": 0.0435, "step": 2750 }, { "epoch": 2.0302583025830256, "grad_norm": 0.45699203153672735, "learning_rate": 5.719839917335275e-05, "loss": 0.0798, "step": 2751 }, { "epoch": 2.0309963099630997, "grad_norm": 0.14378971719761052, "learning_rate": 5.7120796574859516e-05, "loss": 0.0206, "step": 2752 }, { "epoch": 2.0317343173431732, "grad_norm": 0.33746530717363066, "learning_rate": 5.704322560344104e-05, "loss": 0.0935, "step": 2753 }, { "epoch": 2.0324723247232472, "grad_norm": 0.1727381872135281, "learning_rate": 5.696568631631252e-05, "loss": 0.0372, "step": 2754 }, { "epoch": 2.0332103321033212, "grad_norm": 0.1890667329691629, "learning_rate": 5.68881787706659e-05, "loss": 0.034, "step": 2755 }, { "epoch": 2.033948339483395, "grad_norm": 0.18778857429391385, "learning_rate": 5.681070302366951e-05, "loss": 0.0215, "step": 2756 }, { "epoch": 2.034686346863469, "grad_norm": 0.14056633002667887, "learning_rate": 5.673325913246832e-05, "loss": 0.0146, "step": 2757 }, { "epoch": 2.0354243542435424, "grad_norm": 0.19676338364740426, "learning_rate": 5.6655847154183885e-05, "loss": 0.0192, "step": 2758 }, { "epoch": 2.0361623616236164, "grad_norm": 0.06461507463003657, "learning_rate": 5.657846714591413e-05, "loss": 0.007, "step": 2759 }, { "epoch": 2.03690036900369, "grad_norm": 0.19485948021599492, "learning_rate": 5.65011191647334e-05, "loss": 0.0276, "step": 2760 }, { "epoch": 2.037638376383764, "grad_norm": 0.14918177757433587, "learning_rate": 5.642380326769241e-05, "loss": 0.0313, "step": 2761 }, { "epoch": 2.0383763837638376, "grad_norm": 0.19899135521212952, "learning_rate": 5.634651951181833e-05, "loss": 0.0287, "step": 2762 }, { "epoch": 2.0391143911439116, "grad_norm": 0.140606389714255, "learning_rate": 5.626926795411447e-05, "loss": 0.0158, "step": 2763 }, { "epoch": 2.039852398523985, "grad_norm": 0.197657308489543, "learning_rate": 5.619204865156045e-05, "loss": 0.0406, "step": 2764 }, { "epoch": 2.040590405904059, "grad_norm": 0.2703238890031185, "learning_rate": 5.611486166111213e-05, "loss": 0.0304, "step": 2765 }, { "epoch": 2.0413284132841327, "grad_norm": 0.13647238968339925, "learning_rate": 5.6037707039701416e-05, "loss": 0.0203, "step": 2766 }, { "epoch": 2.0420664206642067, "grad_norm": 0.1142979981263087, "learning_rate": 5.596058484423656e-05, "loss": 0.0165, "step": 2767 }, { "epoch": 2.0428044280442803, "grad_norm": 0.16112332384093192, "learning_rate": 5.5883495131601714e-05, "loss": 0.0172, "step": 2768 }, { "epoch": 2.0435424354243543, "grad_norm": 0.21531216021718885, "learning_rate": 5.580643795865712e-05, "loss": 0.021, "step": 2769 }, { "epoch": 2.044280442804428, "grad_norm": 0.34676171105514164, "learning_rate": 5.572941338223902e-05, "loss": 0.0316, "step": 2770 }, { "epoch": 2.045018450184502, "grad_norm": 0.22728579309709132, "learning_rate": 5.565242145915962e-05, "loss": 0.0273, "step": 2771 }, { "epoch": 2.0457564575645755, "grad_norm": 0.22017292516738568, "learning_rate": 5.5575462246207046e-05, "loss": 0.0357, "step": 2772 }, { "epoch": 2.0464944649446495, "grad_norm": 0.17138938414524144, "learning_rate": 5.549853580014525e-05, "loss": 0.0234, "step": 2773 }, { "epoch": 2.047232472324723, "grad_norm": 0.20327928368444875, "learning_rate": 5.5421642177714126e-05, "loss": 0.0293, "step": 2774 }, { "epoch": 2.047970479704797, "grad_norm": 0.25952307398085445, "learning_rate": 5.5344781435629255e-05, "loss": 0.0206, "step": 2775 }, { "epoch": 2.0487084870848706, "grad_norm": 0.19949298566307733, "learning_rate": 5.526795363058199e-05, "loss": 0.0181, "step": 2776 }, { "epoch": 2.0494464944649446, "grad_norm": 0.35396339648557756, "learning_rate": 5.519115881923943e-05, "loss": 0.1067, "step": 2777 }, { "epoch": 2.0501845018450187, "grad_norm": 0.1143545019937446, "learning_rate": 5.5114397058244236e-05, "loss": 0.0126, "step": 2778 }, { "epoch": 2.0509225092250922, "grad_norm": 0.12605545708122667, "learning_rate": 5.5037668404214845e-05, "loss": 0.0112, "step": 2779 }, { "epoch": 2.0516605166051662, "grad_norm": 0.10416063682703058, "learning_rate": 5.4960972913745155e-05, "loss": 0.017, "step": 2780 }, { "epoch": 2.05239852398524, "grad_norm": 0.40422277698382525, "learning_rate": 5.4884310643404654e-05, "loss": 0.0104, "step": 2781 }, { "epoch": 2.053136531365314, "grad_norm": 0.14708945250729422, "learning_rate": 5.480768164973826e-05, "loss": 0.0306, "step": 2782 }, { "epoch": 2.0538745387453874, "grad_norm": 0.2530600717263199, "learning_rate": 5.47310859892665e-05, "loss": 0.0211, "step": 2783 }, { "epoch": 2.0546125461254614, "grad_norm": 0.23497546998821886, "learning_rate": 5.465452371848519e-05, "loss": 0.0232, "step": 2784 }, { "epoch": 2.055350553505535, "grad_norm": 0.24415740455255114, "learning_rate": 5.457799489386543e-05, "loss": 0.0273, "step": 2785 }, { "epoch": 2.056088560885609, "grad_norm": 0.33651985297577874, "learning_rate": 5.450149957185389e-05, "loss": 0.0251, "step": 2786 }, { "epoch": 2.0568265682656826, "grad_norm": 0.22619015833508171, "learning_rate": 5.442503780887236e-05, "loss": 0.0219, "step": 2787 }, { "epoch": 2.0575645756457566, "grad_norm": 0.1596901851293603, "learning_rate": 5.4348609661317926e-05, "loss": 0.0127, "step": 2788 }, { "epoch": 2.05830258302583, "grad_norm": 0.3768750212819495, "learning_rate": 5.4272215185562834e-05, "loss": 0.0401, "step": 2789 }, { "epoch": 2.059040590405904, "grad_norm": 0.21259896709696455, "learning_rate": 5.4195854437954606e-05, "loss": 0.0194, "step": 2790 }, { "epoch": 2.0597785977859777, "grad_norm": 0.21148555865723628, "learning_rate": 5.411952747481579e-05, "loss": 0.0233, "step": 2791 }, { "epoch": 2.0605166051660517, "grad_norm": 0.11259598877181502, "learning_rate": 5.404323435244403e-05, "loss": 0.0146, "step": 2792 }, { "epoch": 2.0612546125461253, "grad_norm": 0.16242630665699254, "learning_rate": 5.396697512711202e-05, "loss": 0.017, "step": 2793 }, { "epoch": 2.0619926199261993, "grad_norm": 0.1536136587061611, "learning_rate": 5.38907498550674e-05, "loss": 0.0155, "step": 2794 }, { "epoch": 2.062730627306273, "grad_norm": 0.2968357955363484, "learning_rate": 5.381455859253293e-05, "loss": 0.0201, "step": 2795 }, { "epoch": 2.063468634686347, "grad_norm": 0.10706628163970466, "learning_rate": 5.3738401395706095e-05, "loss": 0.0099, "step": 2796 }, { "epoch": 2.0642066420664205, "grad_norm": 0.12288438607086372, "learning_rate": 5.3662278320759366e-05, "loss": 0.0181, "step": 2797 }, { "epoch": 2.0649446494464945, "grad_norm": 0.18698151506608757, "learning_rate": 5.3586189423839995e-05, "loss": 0.021, "step": 2798 }, { "epoch": 2.065682656826568, "grad_norm": 0.16332642889802157, "learning_rate": 5.351013476107001e-05, "loss": 0.0192, "step": 2799 }, { "epoch": 2.066420664206642, "grad_norm": 0.11614275237455376, "learning_rate": 5.343411438854633e-05, "loss": 0.006, "step": 2800 }, { "epoch": 2.067158671586716, "grad_norm": 0.20605015641383895, "learning_rate": 5.335812836234032e-05, "loss": 0.0309, "step": 2801 }, { "epoch": 2.0678966789667896, "grad_norm": 0.21454408523254814, "learning_rate": 5.328217673849829e-05, "loss": 0.0287, "step": 2802 }, { "epoch": 2.0686346863468636, "grad_norm": 0.10495781344367638, "learning_rate": 5.3206259573041e-05, "loss": 0.0095, "step": 2803 }, { "epoch": 2.069372693726937, "grad_norm": 0.1986841760781456, "learning_rate": 5.313037692196383e-05, "loss": 0.0248, "step": 2804 }, { "epoch": 2.0701107011070112, "grad_norm": 0.17120257611059103, "learning_rate": 5.3054528841236736e-05, "loss": 0.0195, "step": 2805 }, { "epoch": 2.070848708487085, "grad_norm": 0.13729474869189598, "learning_rate": 5.2978715386804123e-05, "loss": 0.0229, "step": 2806 }, { "epoch": 2.071586715867159, "grad_norm": 0.2046428411073854, "learning_rate": 5.2902936614584945e-05, "loss": 0.0175, "step": 2807 }, { "epoch": 2.0723247232472324, "grad_norm": 0.20587928919051907, "learning_rate": 5.28271925804725e-05, "loss": 0.0201, "step": 2808 }, { "epoch": 2.0730627306273064, "grad_norm": 0.13411999852624182, "learning_rate": 5.275148334033446e-05, "loss": 0.0475, "step": 2809 }, { "epoch": 2.07380073800738, "grad_norm": 0.3413812683707656, "learning_rate": 5.2675808950012885e-05, "loss": 0.0178, "step": 2810 }, { "epoch": 2.074538745387454, "grad_norm": 0.13961102555364774, "learning_rate": 5.260016946532405e-05, "loss": 0.0354, "step": 2811 }, { "epoch": 2.0752767527675275, "grad_norm": 0.3972879801129293, "learning_rate": 5.2524564942058616e-05, "loss": 0.0413, "step": 2812 }, { "epoch": 2.0760147601476016, "grad_norm": 0.2526307473485124, "learning_rate": 5.244899543598127e-05, "loss": 0.0304, "step": 2813 }, { "epoch": 2.076752767527675, "grad_norm": 0.13795869283821838, "learning_rate": 5.237346100283109e-05, "loss": 0.024, "step": 2814 }, { "epoch": 2.077490774907749, "grad_norm": 0.1363382062532104, "learning_rate": 5.229796169832106e-05, "loss": 0.0111, "step": 2815 }, { "epoch": 2.0782287822878227, "grad_norm": 0.42508726905469624, "learning_rate": 5.222249757813852e-05, "loss": 0.0594, "step": 2816 }, { "epoch": 2.0789667896678967, "grad_norm": 0.20212185222138568, "learning_rate": 5.214706869794456e-05, "loss": 0.0107, "step": 2817 }, { "epoch": 2.0797047970479703, "grad_norm": 0.2126723535714553, "learning_rate": 5.207167511337443e-05, "loss": 0.0356, "step": 2818 }, { "epoch": 2.0804428044280443, "grad_norm": 0.13018930411181834, "learning_rate": 5.199631688003741e-05, "loss": 0.0179, "step": 2819 }, { "epoch": 2.081180811808118, "grad_norm": 0.07625607086542362, "learning_rate": 5.19209940535166e-05, "loss": 0.0088, "step": 2820 }, { "epoch": 2.081918819188192, "grad_norm": 0.18399827276344338, "learning_rate": 5.1845706689369033e-05, "loss": 0.027, "step": 2821 }, { "epoch": 2.082656826568266, "grad_norm": 0.15004484636185528, "learning_rate": 5.1770454843125506e-05, "loss": 0.0176, "step": 2822 }, { "epoch": 2.0833948339483395, "grad_norm": 0.1750465766671461, "learning_rate": 5.169523857029077e-05, "loss": 0.0609, "step": 2823 }, { "epoch": 2.0841328413284135, "grad_norm": 0.13051197386408958, "learning_rate": 5.162005792634326e-05, "loss": 0.0111, "step": 2824 }, { "epoch": 2.084870848708487, "grad_norm": 0.10401927631270745, "learning_rate": 5.1544912966734994e-05, "loss": 0.0062, "step": 2825 }, { "epoch": 2.085608856088561, "grad_norm": 0.09977450803243099, "learning_rate": 5.146980374689192e-05, "loss": 0.0134, "step": 2826 }, { "epoch": 2.0863468634686346, "grad_norm": 0.31226127164744316, "learning_rate": 5.13947303222134e-05, "loss": 0.0292, "step": 2827 }, { "epoch": 2.0870848708487086, "grad_norm": 0.3153213414932311, "learning_rate": 5.1319692748072666e-05, "loss": 0.0228, "step": 2828 }, { "epoch": 2.087822878228782, "grad_norm": 0.20912757828914696, "learning_rate": 5.1244691079816134e-05, "loss": 0.0367, "step": 2829 }, { "epoch": 2.088560885608856, "grad_norm": 0.12874772290848283, "learning_rate": 5.1169725372764076e-05, "loss": 0.0157, "step": 2830 }, { "epoch": 2.08929889298893, "grad_norm": 0.18982345125713948, "learning_rate": 5.109479568221007e-05, "loss": 0.0287, "step": 2831 }, { "epoch": 2.090036900369004, "grad_norm": 0.24641933293472004, "learning_rate": 5.101990206342115e-05, "loss": 0.0241, "step": 2832 }, { "epoch": 2.0907749077490774, "grad_norm": 0.20286183586623038, "learning_rate": 5.094504457163776e-05, "loss": 0.0108, "step": 2833 }, { "epoch": 2.0915129151291514, "grad_norm": 0.1359825200083365, "learning_rate": 5.087022326207366e-05, "loss": 0.0173, "step": 2834 }, { "epoch": 2.092250922509225, "grad_norm": 0.19624357588854985, "learning_rate": 5.0795438189916024e-05, "loss": 0.0393, "step": 2835 }, { "epoch": 2.092988929889299, "grad_norm": 0.25566436442626805, "learning_rate": 5.0720689410325196e-05, "loss": 0.0309, "step": 2836 }, { "epoch": 2.0937269372693725, "grad_norm": 0.12317136623184971, "learning_rate": 5.0645976978434805e-05, "loss": 0.0091, "step": 2837 }, { "epoch": 2.0944649446494465, "grad_norm": 0.1673011255807617, "learning_rate": 5.057130094935161e-05, "loss": 0.0253, "step": 2838 }, { "epoch": 2.09520295202952, "grad_norm": 0.24499015909255373, "learning_rate": 5.049666137815556e-05, "loss": 0.0336, "step": 2839 }, { "epoch": 2.095940959409594, "grad_norm": 0.08344543168688476, "learning_rate": 5.04220583198998e-05, "loss": 0.0072, "step": 2840 }, { "epoch": 2.0966789667896677, "grad_norm": 0.06615269846584594, "learning_rate": 5.034749182961033e-05, "loss": 0.0065, "step": 2841 }, { "epoch": 2.0974169741697417, "grad_norm": 0.16262048450378794, "learning_rate": 5.0272961962286394e-05, "loss": 0.0285, "step": 2842 }, { "epoch": 2.0981549815498157, "grad_norm": 0.1865190581850637, "learning_rate": 5.0198468772900085e-05, "loss": 0.0162, "step": 2843 }, { "epoch": 2.0988929889298893, "grad_norm": 0.28711330242212696, "learning_rate": 5.0124012316396583e-05, "loss": 0.0224, "step": 2844 }, { "epoch": 2.0996309963099633, "grad_norm": 0.2700925841236794, "learning_rate": 5.004959264769378e-05, "loss": 0.0263, "step": 2845 }, { "epoch": 2.100369003690037, "grad_norm": 0.19958195887015623, "learning_rate": 4.997520982168253e-05, "loss": 0.0188, "step": 2846 }, { "epoch": 2.101107011070111, "grad_norm": 0.20634922205621953, "learning_rate": 4.9900863893226615e-05, "loss": 0.0224, "step": 2847 }, { "epoch": 2.1018450184501845, "grad_norm": 0.14640157497605183, "learning_rate": 4.982655491716246e-05, "loss": 0.0142, "step": 2848 }, { "epoch": 2.1025830258302585, "grad_norm": 0.2554534312676779, "learning_rate": 4.9752282948299265e-05, "loss": 0.0308, "step": 2849 }, { "epoch": 2.103321033210332, "grad_norm": 0.20009760398848567, "learning_rate": 4.9678048041418934e-05, "loss": 0.0355, "step": 2850 }, { "epoch": 2.104059040590406, "grad_norm": 0.2075794367418422, "learning_rate": 4.9603850251276116e-05, "loss": 0.0153, "step": 2851 }, { "epoch": 2.1047970479704796, "grad_norm": 0.27701087461593304, "learning_rate": 4.9529689632597996e-05, "loss": 0.0248, "step": 2852 }, { "epoch": 2.1055350553505536, "grad_norm": 0.19234252375668645, "learning_rate": 4.945556624008434e-05, "loss": 0.0138, "step": 2853 }, { "epoch": 2.106273062730627, "grad_norm": 0.15027249050313315, "learning_rate": 4.93814801284075e-05, "loss": 0.0252, "step": 2854 }, { "epoch": 2.107011070110701, "grad_norm": 0.23648228279870298, "learning_rate": 4.930743135221225e-05, "loss": 0.0474, "step": 2855 }, { "epoch": 2.107749077490775, "grad_norm": 0.07963091692814533, "learning_rate": 4.9233419966116036e-05, "loss": 0.0069, "step": 2856 }, { "epoch": 2.108487084870849, "grad_norm": 0.2672776995310827, "learning_rate": 4.9159446024708434e-05, "loss": 0.0383, "step": 2857 }, { "epoch": 2.1092250922509224, "grad_norm": 0.1328786878855606, "learning_rate": 4.9085509582551545e-05, "loss": 0.0127, "step": 2858 }, { "epoch": 2.1099630996309964, "grad_norm": 0.2703040009400093, "learning_rate": 4.90116106941799e-05, "loss": 0.0325, "step": 2859 }, { "epoch": 2.11070110701107, "grad_norm": 0.2802466078221653, "learning_rate": 4.8937749414100196e-05, "loss": 0.0286, "step": 2860 }, { "epoch": 2.111439114391144, "grad_norm": 0.14805741854497198, "learning_rate": 4.8863925796791445e-05, "loss": 0.0124, "step": 2861 }, { "epoch": 2.1121771217712175, "grad_norm": 0.204672633145481, "learning_rate": 4.8790139896704815e-05, "loss": 0.0166, "step": 2862 }, { "epoch": 2.1129151291512915, "grad_norm": 0.4252196562315076, "learning_rate": 4.871639176826379e-05, "loss": 0.0392, "step": 2863 }, { "epoch": 2.113653136531365, "grad_norm": 0.5480070804835727, "learning_rate": 4.864268146586387e-05, "loss": 0.0286, "step": 2864 }, { "epoch": 2.114391143911439, "grad_norm": 0.20386563055881338, "learning_rate": 4.856900904387273e-05, "loss": 0.0202, "step": 2865 }, { "epoch": 2.115129151291513, "grad_norm": 0.3377585338755015, "learning_rate": 4.8495374556630024e-05, "loss": 0.0164, "step": 2866 }, { "epoch": 2.1158671586715867, "grad_norm": 0.24190443912455148, "learning_rate": 4.842177805844747e-05, "loss": 0.0258, "step": 2867 }, { "epoch": 2.1166051660516607, "grad_norm": 0.08309166790537775, "learning_rate": 4.8348219603608856e-05, "loss": 0.0078, "step": 2868 }, { "epoch": 2.1173431734317343, "grad_norm": 0.42756862581386995, "learning_rate": 4.8274699246369756e-05, "loss": 0.032, "step": 2869 }, { "epoch": 2.1180811808118083, "grad_norm": 0.1725359074512714, "learning_rate": 4.820121704095774e-05, "loss": 0.0134, "step": 2870 }, { "epoch": 2.118819188191882, "grad_norm": 0.2623581734348995, "learning_rate": 4.812777304157219e-05, "loss": 0.0123, "step": 2871 }, { "epoch": 2.119557195571956, "grad_norm": 0.14086655417042, "learning_rate": 4.805436730238434e-05, "loss": 0.0147, "step": 2872 }, { "epoch": 2.1202952029520294, "grad_norm": 0.1637022580134176, "learning_rate": 4.798099987753719e-05, "loss": 0.0134, "step": 2873 }, { "epoch": 2.1210332103321035, "grad_norm": 0.35274126439059794, "learning_rate": 4.790767082114543e-05, "loss": 0.0319, "step": 2874 }, { "epoch": 2.121771217712177, "grad_norm": 0.09137822298435197, "learning_rate": 4.7834380187295616e-05, "loss": 0.0113, "step": 2875 }, { "epoch": 2.122509225092251, "grad_norm": 0.243166593628047, "learning_rate": 4.7761128030045765e-05, "loss": 0.0302, "step": 2876 }, { "epoch": 2.1232472324723246, "grad_norm": 0.1578141225946763, "learning_rate": 4.768791440342564e-05, "loss": 0.0151, "step": 2877 }, { "epoch": 2.1239852398523986, "grad_norm": 0.13225813316371837, "learning_rate": 4.761473936143651e-05, "loss": 0.0095, "step": 2878 }, { "epoch": 2.124723247232472, "grad_norm": 0.19568080018829329, "learning_rate": 4.75416029580512e-05, "loss": 0.0236, "step": 2879 }, { "epoch": 2.125461254612546, "grad_norm": 0.19065521502372132, "learning_rate": 4.746850524721412e-05, "loss": 0.0478, "step": 2880 }, { "epoch": 2.1261992619926198, "grad_norm": 0.35325668881174954, "learning_rate": 4.739544628284105e-05, "loss": 0.0185, "step": 2881 }, { "epoch": 2.126937269372694, "grad_norm": 0.20651255110030242, "learning_rate": 4.732242611881921e-05, "loss": 0.0221, "step": 2882 }, { "epoch": 2.1276752767527674, "grad_norm": 0.2899783197031282, "learning_rate": 4.724944480900716e-05, "loss": 0.0425, "step": 2883 }, { "epoch": 2.1284132841328414, "grad_norm": 0.11940629592668818, "learning_rate": 4.717650240723493e-05, "loss": 0.0167, "step": 2884 }, { "epoch": 2.129151291512915, "grad_norm": 0.14064630535396308, "learning_rate": 4.710359896730379e-05, "loss": 0.0185, "step": 2885 }, { "epoch": 2.129889298892989, "grad_norm": 0.1262302076285799, "learning_rate": 4.703073454298611e-05, "loss": 0.0171, "step": 2886 }, { "epoch": 2.1306273062730625, "grad_norm": 0.24364983939037366, "learning_rate": 4.695790918802576e-05, "loss": 0.0335, "step": 2887 }, { "epoch": 2.1313653136531365, "grad_norm": 0.21221736678539982, "learning_rate": 4.688512295613762e-05, "loss": 0.0321, "step": 2888 }, { "epoch": 2.1321033210332105, "grad_norm": 0.2561258807986593, "learning_rate": 4.6812375901007734e-05, "loss": 0.0283, "step": 2889 }, { "epoch": 2.132841328413284, "grad_norm": 0.2867029647842029, "learning_rate": 4.6739668076293255e-05, "loss": 0.0439, "step": 2890 }, { "epoch": 2.133579335793358, "grad_norm": 0.16455845774335592, "learning_rate": 4.6666999535622466e-05, "loss": 0.0256, "step": 2891 }, { "epoch": 2.1343173431734317, "grad_norm": 0.21346740792355692, "learning_rate": 4.659437033259461e-05, "loss": 0.0286, "step": 2892 }, { "epoch": 2.1350553505535057, "grad_norm": 0.3997890521973631, "learning_rate": 4.652178052077991e-05, "loss": 0.0419, "step": 2893 }, { "epoch": 2.1357933579335793, "grad_norm": 0.18505133587026484, "learning_rate": 4.644923015371955e-05, "loss": 0.0427, "step": 2894 }, { "epoch": 2.1365313653136533, "grad_norm": 0.31682690689597676, "learning_rate": 4.63767192849256e-05, "loss": 0.0423, "step": 2895 }, { "epoch": 2.137269372693727, "grad_norm": 0.1457374764188297, "learning_rate": 4.6304247967881074e-05, "loss": 0.0118, "step": 2896 }, { "epoch": 2.138007380073801, "grad_norm": 0.3314430483133812, "learning_rate": 4.623181625603974e-05, "loss": 0.0194, "step": 2897 }, { "epoch": 2.1387453874538744, "grad_norm": 0.15226109067252916, "learning_rate": 4.615942420282615e-05, "loss": 0.014, "step": 2898 }, { "epoch": 2.1394833948339484, "grad_norm": 0.26529278912782733, "learning_rate": 4.6087071861635655e-05, "loss": 0.026, "step": 2899 }, { "epoch": 2.140221402214022, "grad_norm": 0.167473247043022, "learning_rate": 4.601475928583422e-05, "loss": 0.0203, "step": 2900 }, { "epoch": 2.140959409594096, "grad_norm": 0.44693271488913805, "learning_rate": 4.5942486528758675e-05, "loss": 0.0573, "step": 2901 }, { "epoch": 2.1416974169741696, "grad_norm": 0.12824612425655632, "learning_rate": 4.58702536437162e-05, "loss": 0.0182, "step": 2902 }, { "epoch": 2.1424354243542436, "grad_norm": 0.1979311120404551, "learning_rate": 4.5798060683984826e-05, "loss": 0.024, "step": 2903 }, { "epoch": 2.143173431734317, "grad_norm": 0.3027463809772365, "learning_rate": 4.572590770281298e-05, "loss": 0.0387, "step": 2904 }, { "epoch": 2.143911439114391, "grad_norm": 0.465574619228449, "learning_rate": 4.565379475341966e-05, "loss": 0.034, "step": 2905 }, { "epoch": 2.1446494464944648, "grad_norm": 0.1430883609958541, "learning_rate": 4.558172188899433e-05, "loss": 0.0123, "step": 2906 }, { "epoch": 2.1453874538745388, "grad_norm": 0.15075024364982859, "learning_rate": 4.5509689162696834e-05, "loss": 0.0194, "step": 2907 }, { "epoch": 2.1461254612546123, "grad_norm": 0.08557644033246432, "learning_rate": 4.543769662765754e-05, "loss": 0.006, "step": 2908 }, { "epoch": 2.1468634686346864, "grad_norm": 0.13484615439605688, "learning_rate": 4.5365744336977054e-05, "loss": 0.0361, "step": 2909 }, { "epoch": 2.14760147601476, "grad_norm": 0.2581895932091352, "learning_rate": 4.5293832343726327e-05, "loss": 0.0294, "step": 2910 }, { "epoch": 2.148339483394834, "grad_norm": 0.3478326523297735, "learning_rate": 4.522196070094661e-05, "loss": 0.0317, "step": 2911 }, { "epoch": 2.149077490774908, "grad_norm": 0.4408433994474609, "learning_rate": 4.515012946164934e-05, "loss": 0.0645, "step": 2912 }, { "epoch": 2.1498154981549815, "grad_norm": 0.1311878345422652, "learning_rate": 4.507833867881629e-05, "loss": 0.0233, "step": 2913 }, { "epoch": 2.1505535055350555, "grad_norm": 0.17022172661356982, "learning_rate": 4.500658840539914e-05, "loss": 0.0168, "step": 2914 }, { "epoch": 2.151291512915129, "grad_norm": 0.09772953309795068, "learning_rate": 4.4934878694319983e-05, "loss": 0.0125, "step": 2915 }, { "epoch": 2.152029520295203, "grad_norm": 0.23839794749806228, "learning_rate": 4.48632095984708e-05, "loss": 0.0373, "step": 2916 }, { "epoch": 2.1527675276752767, "grad_norm": 0.23609853737344244, "learning_rate": 4.4791581170713685e-05, "loss": 0.024, "step": 2917 }, { "epoch": 2.1535055350553507, "grad_norm": 0.5303922325661017, "learning_rate": 4.47199934638807e-05, "loss": 0.029, "step": 2918 }, { "epoch": 2.1542435424354243, "grad_norm": 0.13993958357752173, "learning_rate": 4.464844653077386e-05, "loss": 0.0177, "step": 2919 }, { "epoch": 2.1549815498154983, "grad_norm": 0.16189382725775428, "learning_rate": 4.4576940424165226e-05, "loss": 0.012, "step": 2920 }, { "epoch": 2.155719557195572, "grad_norm": 0.22526557354592824, "learning_rate": 4.450547519679661e-05, "loss": 0.027, "step": 2921 }, { "epoch": 2.156457564575646, "grad_norm": 0.1192446472521915, "learning_rate": 4.443405090137972e-05, "loss": 0.0145, "step": 2922 }, { "epoch": 2.1571955719557194, "grad_norm": 0.1070945466839018, "learning_rate": 4.436266759059605e-05, "loss": 0.0167, "step": 2923 }, { "epoch": 2.1579335793357934, "grad_norm": 0.1953498583067829, "learning_rate": 4.4291325317096964e-05, "loss": 0.0206, "step": 2924 }, { "epoch": 2.158671586715867, "grad_norm": 0.23846007766622374, "learning_rate": 4.422002413350346e-05, "loss": 0.0183, "step": 2925 }, { "epoch": 2.159409594095941, "grad_norm": 0.2659687222364268, "learning_rate": 4.414876409240616e-05, "loss": 0.0177, "step": 2926 }, { "epoch": 2.1601476014760146, "grad_norm": 0.09807504875700257, "learning_rate": 4.4077545246365525e-05, "loss": 0.0148, "step": 2927 }, { "epoch": 2.1608856088560886, "grad_norm": 0.24438657953185142, "learning_rate": 4.400636764791148e-05, "loss": 0.0329, "step": 2928 }, { "epoch": 2.161623616236162, "grad_norm": 0.15205228607738616, "learning_rate": 4.393523134954368e-05, "loss": 0.0275, "step": 2929 }, { "epoch": 2.162361623616236, "grad_norm": 0.23204560575214295, "learning_rate": 4.386413640373108e-05, "loss": 0.0314, "step": 2930 }, { "epoch": 2.16309963099631, "grad_norm": 0.07945672563332636, "learning_rate": 4.379308286291239e-05, "loss": 0.0136, "step": 2931 }, { "epoch": 2.1638376383763838, "grad_norm": 0.22502090557632562, "learning_rate": 4.372207077949562e-05, "loss": 0.0703, "step": 2932 }, { "epoch": 2.1645756457564573, "grad_norm": 0.6624749199933246, "learning_rate": 4.365110020585824e-05, "loss": 0.0451, "step": 2933 }, { "epoch": 2.1653136531365313, "grad_norm": 0.11988551227571884, "learning_rate": 4.358017119434713e-05, "loss": 0.016, "step": 2934 }, { "epoch": 2.1660516605166054, "grad_norm": 0.22266583528989173, "learning_rate": 4.3509283797278436e-05, "loss": 0.0094, "step": 2935 }, { "epoch": 2.166789667896679, "grad_norm": 0.25433534289591836, "learning_rate": 4.343843806693776e-05, "loss": 0.0211, "step": 2936 }, { "epoch": 2.167527675276753, "grad_norm": 0.400740235894262, "learning_rate": 4.336763405557982e-05, "loss": 0.077, "step": 2937 }, { "epoch": 2.1682656826568265, "grad_norm": 0.10296263434632462, "learning_rate": 4.329687181542864e-05, "loss": 0.0117, "step": 2938 }, { "epoch": 2.1690036900369005, "grad_norm": 0.09043889240990971, "learning_rate": 4.3226151398677404e-05, "loss": 0.0106, "step": 2939 }, { "epoch": 2.169741697416974, "grad_norm": 0.21718019128808352, "learning_rate": 4.3155472857488445e-05, "loss": 0.0162, "step": 2940 }, { "epoch": 2.170479704797048, "grad_norm": 0.15389002577991148, "learning_rate": 4.30848362439933e-05, "loss": 0.0233, "step": 2941 }, { "epoch": 2.1712177121771217, "grad_norm": 0.29753191904089177, "learning_rate": 4.3014241610292386e-05, "loss": 0.0368, "step": 2942 }, { "epoch": 2.1719557195571957, "grad_norm": 0.3734268870749441, "learning_rate": 4.294368900845537e-05, "loss": 0.0166, "step": 2943 }, { "epoch": 2.1726937269372693, "grad_norm": 0.14062900281696109, "learning_rate": 4.287317849052075e-05, "loss": 0.022, "step": 2944 }, { "epoch": 2.1734317343173433, "grad_norm": 0.2138684322131796, "learning_rate": 4.280271010849617e-05, "loss": 0.0283, "step": 2945 }, { "epoch": 2.174169741697417, "grad_norm": 0.2031648230241446, "learning_rate": 4.273228391435796e-05, "loss": 0.0292, "step": 2946 }, { "epoch": 2.174907749077491, "grad_norm": 0.22738785550741675, "learning_rate": 4.266189996005148e-05, "loss": 0.0347, "step": 2947 }, { "epoch": 2.1756457564575644, "grad_norm": 0.2455401983445805, "learning_rate": 4.259155829749094e-05, "loss": 0.0278, "step": 2948 }, { "epoch": 2.1763837638376384, "grad_norm": 0.23024344450198297, "learning_rate": 4.252125897855932e-05, "loss": 0.0322, "step": 2949 }, { "epoch": 2.177121771217712, "grad_norm": 0.2070608217981396, "learning_rate": 4.245100205510836e-05, "loss": 0.0275, "step": 2950 }, { "epoch": 2.177859778597786, "grad_norm": 0.2378811284966944, "learning_rate": 4.23807875789585e-05, "loss": 0.0529, "step": 2951 }, { "epoch": 2.1785977859778596, "grad_norm": 0.1418175592086017, "learning_rate": 4.2310615601899006e-05, "loss": 0.0232, "step": 2952 }, { "epoch": 2.1793357933579336, "grad_norm": 0.42223577752916236, "learning_rate": 4.2240486175687676e-05, "loss": 0.0318, "step": 2953 }, { "epoch": 2.1800738007380076, "grad_norm": 0.17329646665182172, "learning_rate": 4.217039935205087e-05, "loss": 0.0096, "step": 2954 }, { "epoch": 2.180811808118081, "grad_norm": 0.20470328637920795, "learning_rate": 4.210035518268369e-05, "loss": 0.0601, "step": 2955 }, { "epoch": 2.181549815498155, "grad_norm": 0.1015830202516315, "learning_rate": 4.203035371924964e-05, "loss": 0.0222, "step": 2956 }, { "epoch": 2.1822878228782288, "grad_norm": 0.3026314555696497, "learning_rate": 4.196039501338087e-05, "loss": 0.0335, "step": 2957 }, { "epoch": 2.1830258302583028, "grad_norm": 0.12158033689431917, "learning_rate": 4.189047911667777e-05, "loss": 0.0154, "step": 2958 }, { "epoch": 2.1837638376383763, "grad_norm": 0.19462936977720796, "learning_rate": 4.182060608070939e-05, "loss": 0.0278, "step": 2959 }, { "epoch": 2.1845018450184504, "grad_norm": 0.2733922350630924, "learning_rate": 4.175077595701303e-05, "loss": 0.0184, "step": 2960 }, { "epoch": 2.185239852398524, "grad_norm": 0.23016295323671188, "learning_rate": 4.1680988797094355e-05, "loss": 0.0278, "step": 2961 }, { "epoch": 2.185977859778598, "grad_norm": 0.26053015157838594, "learning_rate": 4.161124465242737e-05, "loss": 0.0435, "step": 2962 }, { "epoch": 2.1867158671586715, "grad_norm": 0.1510258928500228, "learning_rate": 4.15415435744543e-05, "loss": 0.0177, "step": 2963 }, { "epoch": 2.1874538745387455, "grad_norm": 0.47764912876163224, "learning_rate": 4.147188561458572e-05, "loss": 0.0314, "step": 2964 }, { "epoch": 2.188191881918819, "grad_norm": 0.14448923218827603, "learning_rate": 4.140227082420026e-05, "loss": 0.0264, "step": 2965 }, { "epoch": 2.188929889298893, "grad_norm": 0.13832597172983155, "learning_rate": 4.133269925464481e-05, "loss": 0.0252, "step": 2966 }, { "epoch": 2.1896678966789667, "grad_norm": 0.4583929755878972, "learning_rate": 4.126317095723433e-05, "loss": 0.0685, "step": 2967 }, { "epoch": 2.1904059040590407, "grad_norm": 0.23526014084282185, "learning_rate": 4.119368598325184e-05, "loss": 0.0209, "step": 2968 }, { "epoch": 2.1911439114391142, "grad_norm": 0.4977482087778479, "learning_rate": 4.112424438394855e-05, "loss": 0.0372, "step": 2969 }, { "epoch": 2.1918819188191883, "grad_norm": 0.366018403782202, "learning_rate": 4.105484621054344e-05, "loss": 0.0357, "step": 2970 }, { "epoch": 2.192619926199262, "grad_norm": 0.3371791223130593, "learning_rate": 4.098549151422367e-05, "loss": 0.0351, "step": 2971 }, { "epoch": 2.193357933579336, "grad_norm": 0.202172468435924, "learning_rate": 4.091618034614425e-05, "loss": 0.0462, "step": 2972 }, { "epoch": 2.1940959409594094, "grad_norm": 0.38830712683223884, "learning_rate": 4.084691275742806e-05, "loss": 0.1077, "step": 2973 }, { "epoch": 2.1948339483394834, "grad_norm": 0.22628265104914222, "learning_rate": 4.077768879916587e-05, "loss": 0.0222, "step": 2974 }, { "epoch": 2.195571955719557, "grad_norm": 0.1026052558089843, "learning_rate": 4.070850852241623e-05, "loss": 0.0133, "step": 2975 }, { "epoch": 2.196309963099631, "grad_norm": 0.17204798903271815, "learning_rate": 4.063937197820558e-05, "loss": 0.027, "step": 2976 }, { "epoch": 2.197047970479705, "grad_norm": 0.1759073091704481, "learning_rate": 4.057027921752797e-05, "loss": 0.0175, "step": 2977 }, { "epoch": 2.1977859778597786, "grad_norm": 0.35310051454395625, "learning_rate": 4.050123029134523e-05, "loss": 0.1718, "step": 2978 }, { "epoch": 2.1985239852398526, "grad_norm": 0.08843146287823389, "learning_rate": 4.043222525058683e-05, "loss": 0.0143, "step": 2979 }, { "epoch": 2.199261992619926, "grad_norm": 0.3384521364497068, "learning_rate": 4.036326414614985e-05, "loss": 0.0343, "step": 2980 }, { "epoch": 2.2, "grad_norm": 0.1470808450066313, "learning_rate": 4.029434702889907e-05, "loss": 0.0232, "step": 2981 }, { "epoch": 2.2007380073800737, "grad_norm": 0.18505496432906757, "learning_rate": 4.022547394966671e-05, "loss": 0.0224, "step": 2982 }, { "epoch": 2.2014760147601478, "grad_norm": 0.1424733624820613, "learning_rate": 4.0156644959252556e-05, "loss": 0.0145, "step": 2983 }, { "epoch": 2.2022140221402213, "grad_norm": 0.11286544278155737, "learning_rate": 4.008786010842381e-05, "loss": 0.0117, "step": 2984 }, { "epoch": 2.2029520295202953, "grad_norm": 0.3208748014854251, "learning_rate": 4.00191194479153e-05, "loss": 0.0204, "step": 2985 }, { "epoch": 2.203690036900369, "grad_norm": 0.16320893119869484, "learning_rate": 3.995042302842903e-05, "loss": 0.0276, "step": 2986 }, { "epoch": 2.204428044280443, "grad_norm": 0.172168562399739, "learning_rate": 3.9881770900634466e-05, "loss": 0.0267, "step": 2987 }, { "epoch": 2.2051660516605165, "grad_norm": 0.09819134537376847, "learning_rate": 3.981316311516848e-05, "loss": 0.0104, "step": 2988 }, { "epoch": 2.2059040590405905, "grad_norm": 0.2692410875305959, "learning_rate": 3.974459972263516e-05, "loss": 0.0168, "step": 2989 }, { "epoch": 2.206642066420664, "grad_norm": 0.17020703325569994, "learning_rate": 3.967608077360584e-05, "loss": 0.0348, "step": 2990 }, { "epoch": 2.207380073800738, "grad_norm": 0.26687090507818767, "learning_rate": 3.9607606318619087e-05, "loss": 0.0435, "step": 2991 }, { "epoch": 2.2081180811808117, "grad_norm": 0.16789848653244, "learning_rate": 3.95391764081807e-05, "loss": 0.0102, "step": 2992 }, { "epoch": 2.2088560885608857, "grad_norm": 0.3447730697637232, "learning_rate": 3.947079109276358e-05, "loss": 0.0375, "step": 2993 }, { "epoch": 2.2095940959409592, "grad_norm": 0.205552745064486, "learning_rate": 3.9402450422807715e-05, "loss": 0.0475, "step": 2994 }, { "epoch": 2.2103321033210332, "grad_norm": 0.2163221441312269, "learning_rate": 3.9334154448720184e-05, "loss": 0.028, "step": 2995 }, { "epoch": 2.211070110701107, "grad_norm": 0.2000149718672191, "learning_rate": 3.926590322087509e-05, "loss": 0.033, "step": 2996 }, { "epoch": 2.211808118081181, "grad_norm": 0.26364207815881135, "learning_rate": 3.9197696789613595e-05, "loss": 0.0301, "step": 2997 }, { "epoch": 2.2125461254612544, "grad_norm": 0.13775828543264407, "learning_rate": 3.9129535205243714e-05, "loss": 0.0149, "step": 2998 }, { "epoch": 2.2132841328413284, "grad_norm": 0.12315204692827399, "learning_rate": 3.906141851804048e-05, "loss": 0.0123, "step": 2999 }, { "epoch": 2.2140221402214024, "grad_norm": 0.12048405690965644, "learning_rate": 3.8993346778245745e-05, "loss": 0.0125, "step": 3000 }, { "epoch": 2.214760147601476, "grad_norm": 0.1441827334315984, "learning_rate": 3.892532003606823e-05, "loss": 0.0205, "step": 3001 }, { "epoch": 2.21549815498155, "grad_norm": 0.2813827212994813, "learning_rate": 3.885733834168346e-05, "loss": 0.0459, "step": 3002 }, { "epoch": 2.2162361623616236, "grad_norm": 0.1464407290469383, "learning_rate": 3.878940174523371e-05, "loss": 0.0177, "step": 3003 }, { "epoch": 2.2169741697416976, "grad_norm": 0.198398334710946, "learning_rate": 3.872151029682811e-05, "loss": 0.0308, "step": 3004 }, { "epoch": 2.217712177121771, "grad_norm": 0.1902347506294478, "learning_rate": 3.865366404654235e-05, "loss": 0.0155, "step": 3005 }, { "epoch": 2.218450184501845, "grad_norm": 0.23999926381787123, "learning_rate": 3.858586304441883e-05, "loss": 0.0208, "step": 3006 }, { "epoch": 2.2191881918819187, "grad_norm": 0.1289122772280206, "learning_rate": 3.85181073404666e-05, "loss": 0.0154, "step": 3007 }, { "epoch": 2.2199261992619927, "grad_norm": 0.2634467837852995, "learning_rate": 3.845039698466122e-05, "loss": 0.0408, "step": 3008 }, { "epoch": 2.2206642066420663, "grad_norm": 0.43607157388744944, "learning_rate": 3.838273202694495e-05, "loss": 0.0465, "step": 3009 }, { "epoch": 2.2214022140221403, "grad_norm": 0.1773523386119776, "learning_rate": 3.831511251722643e-05, "loss": 0.0446, "step": 3010 }, { "epoch": 2.222140221402214, "grad_norm": 0.37984156768937244, "learning_rate": 3.824753850538082e-05, "loss": 0.0446, "step": 3011 }, { "epoch": 2.222878228782288, "grad_norm": 0.21140034399850816, "learning_rate": 3.81800100412497e-05, "loss": 0.0209, "step": 3012 }, { "epoch": 2.2236162361623615, "grad_norm": 0.2234962895630497, "learning_rate": 3.811252717464114e-05, "loss": 0.0285, "step": 3013 }, { "epoch": 2.2243542435424355, "grad_norm": 0.25834740425809394, "learning_rate": 3.804508995532954e-05, "loss": 0.0472, "step": 3014 }, { "epoch": 2.225092250922509, "grad_norm": 0.12496843673076806, "learning_rate": 3.7977698433055476e-05, "loss": 0.0227, "step": 3015 }, { "epoch": 2.225830258302583, "grad_norm": 0.32950418252547126, "learning_rate": 3.791035265752606e-05, "loss": 0.0443, "step": 3016 }, { "epoch": 2.2265682656826566, "grad_norm": 0.1861991909237716, "learning_rate": 3.784305267841454e-05, "loss": 0.0136, "step": 3017 }, { "epoch": 2.2273062730627307, "grad_norm": 0.3404041623944496, "learning_rate": 3.7775798545360374e-05, "loss": 0.0654, "step": 3018 }, { "epoch": 2.2280442804428042, "grad_norm": 0.24647684701878447, "learning_rate": 3.770859030796924e-05, "loss": 0.0215, "step": 3019 }, { "epoch": 2.2287822878228782, "grad_norm": 0.18442374758635832, "learning_rate": 3.764142801581292e-05, "loss": 0.0228, "step": 3020 }, { "epoch": 2.229520295202952, "grad_norm": 0.30839034318384634, "learning_rate": 3.757431171842941e-05, "loss": 0.0308, "step": 3021 }, { "epoch": 2.230258302583026, "grad_norm": 0.1826186600426321, "learning_rate": 3.750724146532267e-05, "loss": 0.0262, "step": 3022 }, { "epoch": 2.2309963099631, "grad_norm": 0.211225839082285, "learning_rate": 3.7440217305962755e-05, "loss": 0.0494, "step": 3023 }, { "epoch": 2.2317343173431734, "grad_norm": 0.10772980118669309, "learning_rate": 3.7373239289785655e-05, "loss": 0.0154, "step": 3024 }, { "epoch": 2.2324723247232474, "grad_norm": 0.1236095096826585, "learning_rate": 3.7306307466193454e-05, "loss": 0.0093, "step": 3025 }, { "epoch": 2.233210332103321, "grad_norm": 0.10918227843750992, "learning_rate": 3.723942188455409e-05, "loss": 0.013, "step": 3026 }, { "epoch": 2.233948339483395, "grad_norm": 0.16727358529583994, "learning_rate": 3.71725825942013e-05, "loss": 0.0326, "step": 3027 }, { "epoch": 2.2346863468634686, "grad_norm": 0.25411907217205043, "learning_rate": 3.710578964443484e-05, "loss": 0.0246, "step": 3028 }, { "epoch": 2.2354243542435426, "grad_norm": 0.11331930541348871, "learning_rate": 3.703904308452017e-05, "loss": 0.0158, "step": 3029 }, { "epoch": 2.236162361623616, "grad_norm": 0.14089981822772385, "learning_rate": 3.697234296368869e-05, "loss": 0.0172, "step": 3030 }, { "epoch": 2.23690036900369, "grad_norm": 0.13354055693020797, "learning_rate": 3.690568933113728e-05, "loss": 0.0115, "step": 3031 }, { "epoch": 2.2376383763837637, "grad_norm": 0.22230055107146277, "learning_rate": 3.683908223602879e-05, "loss": 0.0405, "step": 3032 }, { "epoch": 2.2383763837638377, "grad_norm": 0.112013232426904, "learning_rate": 3.677252172749161e-05, "loss": 0.0113, "step": 3033 }, { "epoch": 2.2391143911439113, "grad_norm": 0.36646969446510796, "learning_rate": 3.670600785461982e-05, "loss": 0.0432, "step": 3034 }, { "epoch": 2.2398523985239853, "grad_norm": 0.22097815192932835, "learning_rate": 3.663954066647306e-05, "loss": 0.0092, "step": 3035 }, { "epoch": 2.240590405904059, "grad_norm": 0.18610407548173322, "learning_rate": 3.6573120212076516e-05, "loss": 0.04, "step": 3036 }, { "epoch": 2.241328413284133, "grad_norm": 0.1054574543131389, "learning_rate": 3.650674654042105e-05, "loss": 0.0142, "step": 3037 }, { "epoch": 2.2420664206642065, "grad_norm": 0.15085807928072809, "learning_rate": 3.6440419700462837e-05, "loss": 0.0167, "step": 3038 }, { "epoch": 2.2428044280442805, "grad_norm": 0.28299184660752763, "learning_rate": 3.63741397411236e-05, "loss": 0.0531, "step": 3039 }, { "epoch": 2.243542435424354, "grad_norm": 0.13598256947286894, "learning_rate": 3.63079067112905e-05, "loss": 0.0205, "step": 3040 }, { "epoch": 2.244280442804428, "grad_norm": 0.15728628331678934, "learning_rate": 3.624172065981598e-05, "loss": 0.023, "step": 3041 }, { "epoch": 2.245018450184502, "grad_norm": 0.1512121991240895, "learning_rate": 3.617558163551802e-05, "loss": 0.0171, "step": 3042 }, { "epoch": 2.2457564575645756, "grad_norm": 0.23974093950960332, "learning_rate": 3.610948968717968e-05, "loss": 0.0311, "step": 3043 }, { "epoch": 2.246494464944649, "grad_norm": 0.2236503406797666, "learning_rate": 3.604344486354949e-05, "loss": 0.0203, "step": 3044 }, { "epoch": 2.2472324723247232, "grad_norm": 0.1883767227728935, "learning_rate": 3.597744721334111e-05, "loss": 0.0233, "step": 3045 }, { "epoch": 2.2479704797047972, "grad_norm": 0.2028865941410593, "learning_rate": 3.5911496785233524e-05, "loss": 0.0217, "step": 3046 }, { "epoch": 2.248708487084871, "grad_norm": 0.16565977043896774, "learning_rate": 3.58455936278707e-05, "loss": 0.0406, "step": 3047 }, { "epoch": 2.249446494464945, "grad_norm": 0.09584677251558477, "learning_rate": 3.577973778986187e-05, "loss": 0.0094, "step": 3048 }, { "epoch": 2.2501845018450184, "grad_norm": 0.42543883636796304, "learning_rate": 3.571392931978139e-05, "loss": 0.0165, "step": 3049 }, { "epoch": 2.2509225092250924, "grad_norm": 0.17615444791219545, "learning_rate": 3.564816826616859e-05, "loss": 0.025, "step": 3050 }, { "epoch": 2.251660516605166, "grad_norm": 0.2597713154137622, "learning_rate": 3.558245467752788e-05, "loss": 0.0272, "step": 3051 }, { "epoch": 2.25239852398524, "grad_norm": 0.15746256273202236, "learning_rate": 3.55167886023286e-05, "loss": 0.0166, "step": 3052 }, { "epoch": 2.2531365313653136, "grad_norm": 0.23155599531984022, "learning_rate": 3.5451170089005146e-05, "loss": 0.0246, "step": 3053 }, { "epoch": 2.2538745387453876, "grad_norm": 0.17919713216788133, "learning_rate": 3.53855991859568e-05, "loss": 0.0313, "step": 3054 }, { "epoch": 2.254612546125461, "grad_norm": 0.2409040284306177, "learning_rate": 3.532007594154757e-05, "loss": 0.0387, "step": 3055 }, { "epoch": 2.255350553505535, "grad_norm": 0.23456565473890148, "learning_rate": 3.525460040410658e-05, "loss": 0.0253, "step": 3056 }, { "epoch": 2.2560885608856087, "grad_norm": 0.33923337065815556, "learning_rate": 3.518917262192753e-05, "loss": 0.0149, "step": 3057 }, { "epoch": 2.2568265682656827, "grad_norm": 0.1628325693665739, "learning_rate": 3.512379264326914e-05, "loss": 0.0177, "step": 3058 }, { "epoch": 2.2575645756457563, "grad_norm": 0.225305359859797, "learning_rate": 3.5058460516354565e-05, "loss": 0.0241, "step": 3059 }, { "epoch": 2.2583025830258303, "grad_norm": 0.2045038536833088, "learning_rate": 3.499317628937192e-05, "loss": 0.0606, "step": 3060 }, { "epoch": 2.259040590405904, "grad_norm": 0.2788921144751696, "learning_rate": 3.492794001047389e-05, "loss": 0.0103, "step": 3061 }, { "epoch": 2.259778597785978, "grad_norm": 0.24940901887841113, "learning_rate": 3.4862751727777797e-05, "loss": 0.0332, "step": 3062 }, { "epoch": 2.2605166051660515, "grad_norm": 0.20683870797681192, "learning_rate": 3.479761148936556e-05, "loss": 0.0475, "step": 3063 }, { "epoch": 2.2612546125461255, "grad_norm": 0.1611717725868623, "learning_rate": 3.4732519343283634e-05, "loss": 0.0241, "step": 3064 }, { "epoch": 2.2619926199261995, "grad_norm": 0.13507343823807294, "learning_rate": 3.4667475337543095e-05, "loss": 0.0261, "step": 3065 }, { "epoch": 2.262730627306273, "grad_norm": 0.29468135080126145, "learning_rate": 3.4602479520119445e-05, "loss": 0.0253, "step": 3066 }, { "epoch": 2.2634686346863466, "grad_norm": 0.18691001462442125, "learning_rate": 3.453753193895263e-05, "loss": 0.0088, "step": 3067 }, { "epoch": 2.2642066420664206, "grad_norm": 0.1818242500696856, "learning_rate": 3.447263264194703e-05, "loss": 0.0275, "step": 3068 }, { "epoch": 2.2649446494464947, "grad_norm": 0.18244272854049, "learning_rate": 3.440778167697142e-05, "loss": 0.0264, "step": 3069 }, { "epoch": 2.265682656826568, "grad_norm": 0.18464678595732242, "learning_rate": 3.4342979091859e-05, "loss": 0.0309, "step": 3070 }, { "epoch": 2.2664206642066422, "grad_norm": 0.11405029034610012, "learning_rate": 3.427822493440708e-05, "loss": 0.0176, "step": 3071 }, { "epoch": 2.267158671586716, "grad_norm": 0.12621924990256425, "learning_rate": 3.421351925237749e-05, "loss": 0.0128, "step": 3072 }, { "epoch": 2.26789667896679, "grad_norm": 0.09612985888058365, "learning_rate": 3.414886209349615e-05, "loss": 0.0128, "step": 3073 }, { "epoch": 2.2686346863468634, "grad_norm": 0.17496368398577408, "learning_rate": 3.408425350545324e-05, "loss": 0.0332, "step": 3074 }, { "epoch": 2.2693726937269374, "grad_norm": 0.15624700385496826, "learning_rate": 3.401969353590313e-05, "loss": 0.0228, "step": 3075 }, { "epoch": 2.270110701107011, "grad_norm": 0.18064259631429483, "learning_rate": 3.395518223246427e-05, "loss": 0.0158, "step": 3076 }, { "epoch": 2.270848708487085, "grad_norm": 0.10314661229294421, "learning_rate": 3.3890719642719306e-05, "loss": 0.0138, "step": 3077 }, { "epoch": 2.2715867158671585, "grad_norm": 0.18853896041222334, "learning_rate": 3.3826305814214885e-05, "loss": 0.0167, "step": 3078 }, { "epoch": 2.2723247232472326, "grad_norm": 0.1722361059533546, "learning_rate": 3.37619407944617e-05, "loss": 0.0159, "step": 3079 }, { "epoch": 2.273062730627306, "grad_norm": 0.14941939622552022, "learning_rate": 3.3697624630934466e-05, "loss": 0.0113, "step": 3080 }, { "epoch": 2.27380073800738, "grad_norm": 0.1925024822817914, "learning_rate": 3.3633357371071796e-05, "loss": 0.0195, "step": 3081 }, { "epoch": 2.2745387453874537, "grad_norm": 0.1383284847670398, "learning_rate": 3.3569139062276346e-05, "loss": 0.0107, "step": 3082 }, { "epoch": 2.2752767527675277, "grad_norm": 0.13689673857559403, "learning_rate": 3.35049697519146e-05, "loss": 0.0134, "step": 3083 }, { "epoch": 2.2760147601476013, "grad_norm": 0.2119172900467532, "learning_rate": 3.344084948731686e-05, "loss": 0.0198, "step": 3084 }, { "epoch": 2.2767527675276753, "grad_norm": 0.17908669467176003, "learning_rate": 3.33767783157773e-05, "loss": 0.0106, "step": 3085 }, { "epoch": 2.277490774907749, "grad_norm": 0.21449583573565165, "learning_rate": 3.331275628455398e-05, "loss": 0.0258, "step": 3086 }, { "epoch": 2.278228782287823, "grad_norm": 0.25865617059562507, "learning_rate": 3.324878344086849e-05, "loss": 0.0536, "step": 3087 }, { "epoch": 2.278966789667897, "grad_norm": 0.16568526106910875, "learning_rate": 3.3184859831906303e-05, "loss": 0.012, "step": 3088 }, { "epoch": 2.2797047970479705, "grad_norm": 0.4752854030289613, "learning_rate": 3.312098550481657e-05, "loss": 0.0433, "step": 3089 }, { "epoch": 2.280442804428044, "grad_norm": 0.3184675408741053, "learning_rate": 3.3057160506712046e-05, "loss": 0.0222, "step": 3090 }, { "epoch": 2.281180811808118, "grad_norm": 0.24734975654250904, "learning_rate": 3.299338488466912e-05, "loss": 0.0467, "step": 3091 }, { "epoch": 2.281918819188192, "grad_norm": 0.1840492456741707, "learning_rate": 3.292965868572773e-05, "loss": 0.0394, "step": 3092 }, { "epoch": 2.2826568265682656, "grad_norm": 0.315572613773467, "learning_rate": 3.286598195689145e-05, "loss": 0.0282, "step": 3093 }, { "epoch": 2.2833948339483396, "grad_norm": 0.34242801705637244, "learning_rate": 3.2802354745127264e-05, "loss": 0.0723, "step": 3094 }, { "epoch": 2.284132841328413, "grad_norm": 0.2943558786380991, "learning_rate": 3.2738777097365695e-05, "loss": 0.0362, "step": 3095 }, { "epoch": 2.2848708487084872, "grad_norm": 0.15003714246761043, "learning_rate": 3.267524906050068e-05, "loss": 0.0169, "step": 3096 }, { "epoch": 2.285608856088561, "grad_norm": 0.11799344667528526, "learning_rate": 3.261177068138953e-05, "loss": 0.0085, "step": 3097 }, { "epoch": 2.286346863468635, "grad_norm": 0.2286765582756585, "learning_rate": 3.254834200685305e-05, "loss": 0.0169, "step": 3098 }, { "epoch": 2.2870848708487084, "grad_norm": 0.13898691228744103, "learning_rate": 3.248496308367527e-05, "loss": 0.0294, "step": 3099 }, { "epoch": 2.2878228782287824, "grad_norm": 0.13849441556225808, "learning_rate": 3.242163395860355e-05, "loss": 0.0165, "step": 3100 }, { "epoch": 2.288560885608856, "grad_norm": 0.26846352032102394, "learning_rate": 3.235835467834854e-05, "loss": 0.0386, "step": 3101 }, { "epoch": 2.28929889298893, "grad_norm": 0.3042368856644062, "learning_rate": 3.2295125289584095e-05, "loss": 0.0246, "step": 3102 }, { "epoch": 2.2900369003690035, "grad_norm": 0.24898487888473306, "learning_rate": 3.223194583894731e-05, "loss": 0.0232, "step": 3103 }, { "epoch": 2.2907749077490775, "grad_norm": 0.30561197794566697, "learning_rate": 3.216881637303839e-05, "loss": 0.041, "step": 3104 }, { "epoch": 2.291512915129151, "grad_norm": 0.39244830904942685, "learning_rate": 3.210573693842076e-05, "loss": 0.0399, "step": 3105 }, { "epoch": 2.292250922509225, "grad_norm": 0.11832563403931787, "learning_rate": 3.204270758162088e-05, "loss": 0.0139, "step": 3106 }, { "epoch": 2.2929889298892987, "grad_norm": 0.14478359750602118, "learning_rate": 3.1979728349128256e-05, "loss": 0.0228, "step": 3107 }, { "epoch": 2.2937269372693727, "grad_norm": 0.14979910391109783, "learning_rate": 3.1916799287395483e-05, "loss": 0.0102, "step": 3108 }, { "epoch": 2.2944649446494463, "grad_norm": 0.1525919650167939, "learning_rate": 3.1853920442838045e-05, "loss": 0.0138, "step": 3109 }, { "epoch": 2.2952029520295203, "grad_norm": 0.6256252921501854, "learning_rate": 3.179109186183457e-05, "loss": 0.0151, "step": 3110 }, { "epoch": 2.2959409594095943, "grad_norm": 0.28859500660555004, "learning_rate": 3.1728313590726444e-05, "loss": 0.0387, "step": 3111 }, { "epoch": 2.296678966789668, "grad_norm": 0.17340443489792884, "learning_rate": 3.1665585675818e-05, "loss": 0.0351, "step": 3112 }, { "epoch": 2.297416974169742, "grad_norm": 0.10953208914359581, "learning_rate": 3.1602908163376423e-05, "loss": 0.0102, "step": 3113 }, { "epoch": 2.2981549815498155, "grad_norm": 0.17637805371964704, "learning_rate": 3.1540281099631764e-05, "loss": 0.0154, "step": 3114 }, { "epoch": 2.2988929889298895, "grad_norm": 0.21169481786190505, "learning_rate": 3.147770453077686e-05, "loss": 0.0278, "step": 3115 }, { "epoch": 2.299630996309963, "grad_norm": 0.3132387490430097, "learning_rate": 3.141517850296717e-05, "loss": 0.0441, "step": 3116 }, { "epoch": 2.300369003690037, "grad_norm": 0.2486020827026522, "learning_rate": 3.1352703062321076e-05, "loss": 0.0367, "step": 3117 }, { "epoch": 2.3011070110701106, "grad_norm": 0.3269766323823423, "learning_rate": 3.129027825491951e-05, "loss": 0.0269, "step": 3118 }, { "epoch": 2.3018450184501846, "grad_norm": 0.4333561442224005, "learning_rate": 3.1227904126806115e-05, "loss": 0.0377, "step": 3119 }, { "epoch": 2.302583025830258, "grad_norm": 0.127784767314355, "learning_rate": 3.1165580723987084e-05, "loss": 0.021, "step": 3120 }, { "epoch": 2.303321033210332, "grad_norm": 0.13367733991793937, "learning_rate": 3.110330809243134e-05, "loss": 0.0193, "step": 3121 }, { "epoch": 2.304059040590406, "grad_norm": 0.2485471098160285, "learning_rate": 3.104108627807022e-05, "loss": 0.0328, "step": 3122 }, { "epoch": 2.30479704797048, "grad_norm": 0.2215673421109872, "learning_rate": 3.0978915326797634e-05, "loss": 0.0274, "step": 3123 }, { "epoch": 2.3055350553505534, "grad_norm": 0.1301711893606739, "learning_rate": 3.0916795284469945e-05, "loss": 0.0122, "step": 3124 }, { "epoch": 2.3062730627306274, "grad_norm": 0.21000802532650464, "learning_rate": 3.0854726196905994e-05, "loss": 0.018, "step": 3125 }, { "epoch": 2.307011070110701, "grad_norm": 0.21521210533005836, "learning_rate": 3.079270810988707e-05, "loss": 0.0246, "step": 3126 }, { "epoch": 2.307749077490775, "grad_norm": 0.14319394609246, "learning_rate": 3.0730741069156824e-05, "loss": 0.0111, "step": 3127 }, { "epoch": 2.3084870848708485, "grad_norm": 0.30978045591163245, "learning_rate": 3.066882512042114e-05, "loss": 0.0278, "step": 3128 }, { "epoch": 2.3092250922509225, "grad_norm": 0.1695051615391064, "learning_rate": 3.060696030934841e-05, "loss": 0.0187, "step": 3129 }, { "epoch": 2.3099630996309966, "grad_norm": 0.3802133909603062, "learning_rate": 3.054514668156916e-05, "loss": 0.0412, "step": 3130 }, { "epoch": 2.31070110701107, "grad_norm": 0.2723517715589625, "learning_rate": 3.048338428267632e-05, "loss": 0.0244, "step": 3131 }, { "epoch": 2.3114391143911437, "grad_norm": 0.20739234818480512, "learning_rate": 3.0421673158224785e-05, "loss": 0.0342, "step": 3132 }, { "epoch": 2.3121771217712177, "grad_norm": 0.14955371901399014, "learning_rate": 3.03600133537319e-05, "loss": 0.0215, "step": 3133 }, { "epoch": 2.3129151291512917, "grad_norm": 0.21604053451040728, "learning_rate": 3.0298404914676994e-05, "loss": 0.0143, "step": 3134 }, { "epoch": 2.3136531365313653, "grad_norm": 0.18762377936165056, "learning_rate": 3.0236847886501542e-05, "loss": 0.0329, "step": 3135 }, { "epoch": 2.3143911439114393, "grad_norm": 0.36097401012124364, "learning_rate": 3.0175342314609135e-05, "loss": 0.0315, "step": 3136 }, { "epoch": 2.315129151291513, "grad_norm": 0.31307702425576056, "learning_rate": 3.011388824436533e-05, "loss": 0.0431, "step": 3137 }, { "epoch": 2.315867158671587, "grad_norm": 0.154136488925181, "learning_rate": 3.0052485721097833e-05, "loss": 0.0128, "step": 3138 }, { "epoch": 2.3166051660516604, "grad_norm": 0.24254882927456425, "learning_rate": 2.9991134790096197e-05, "loss": 0.032, "step": 3139 }, { "epoch": 2.3173431734317345, "grad_norm": 0.20919981737100124, "learning_rate": 2.9929835496612003e-05, "loss": 0.0259, "step": 3140 }, { "epoch": 2.318081180811808, "grad_norm": 0.42567594173367956, "learning_rate": 2.986858788585869e-05, "loss": 0.0541, "step": 3141 }, { "epoch": 2.318819188191882, "grad_norm": 0.28498539770626186, "learning_rate": 2.980739200301158e-05, "loss": 0.0328, "step": 3142 }, { "epoch": 2.3195571955719556, "grad_norm": 0.24211063641135477, "learning_rate": 2.9746247893207957e-05, "loss": 0.0275, "step": 3143 }, { "epoch": 2.3202952029520296, "grad_norm": 0.44589710354943046, "learning_rate": 2.96851556015467e-05, "loss": 0.0498, "step": 3144 }, { "epoch": 2.321033210332103, "grad_norm": 0.24075956982287727, "learning_rate": 2.9624115173088683e-05, "loss": 0.0249, "step": 3145 }, { "epoch": 2.321771217712177, "grad_norm": 0.12960245062181364, "learning_rate": 2.9563126652856376e-05, "loss": 0.0135, "step": 3146 }, { "epoch": 2.3225092250922508, "grad_norm": 0.09091321466120844, "learning_rate": 2.9502190085834114e-05, "loss": 0.0154, "step": 3147 }, { "epoch": 2.323247232472325, "grad_norm": 0.18712822513562943, "learning_rate": 2.944130551696772e-05, "loss": 0.038, "step": 3148 }, { "epoch": 2.3239852398523984, "grad_norm": 0.17403017525827913, "learning_rate": 2.9380472991164776e-05, "loss": 0.02, "step": 3149 }, { "epoch": 2.3247232472324724, "grad_norm": 0.37027251112040216, "learning_rate": 2.931969255329452e-05, "loss": 0.0554, "step": 3150 }, { "epoch": 2.325461254612546, "grad_norm": 0.23615926567278997, "learning_rate": 2.925896424818768e-05, "loss": 0.0134, "step": 3151 }, { "epoch": 2.32619926199262, "grad_norm": 0.2724208259254927, "learning_rate": 2.9198288120636586e-05, "loss": 0.0238, "step": 3152 }, { "epoch": 2.326937269372694, "grad_norm": 0.15491256633169895, "learning_rate": 2.9137664215395012e-05, "loss": 0.0188, "step": 3153 }, { "epoch": 2.3276752767527675, "grad_norm": 0.15623995737282576, "learning_rate": 2.9077092577178345e-05, "loss": 0.0169, "step": 3154 }, { "epoch": 2.328413284132841, "grad_norm": 0.2075779869502997, "learning_rate": 2.9016573250663326e-05, "loss": 0.0107, "step": 3155 }, { "epoch": 2.329151291512915, "grad_norm": 0.31797072119485087, "learning_rate": 2.8956106280488037e-05, "loss": 0.0544, "step": 3156 }, { "epoch": 2.329889298892989, "grad_norm": 0.3324827390861213, "learning_rate": 2.8895691711252137e-05, "loss": 0.0343, "step": 3157 }, { "epoch": 2.3306273062730627, "grad_norm": 0.15441038455129583, "learning_rate": 2.8835329587516456e-05, "loss": 0.0214, "step": 3158 }, { "epoch": 2.3313653136531367, "grad_norm": 0.16512144969823478, "learning_rate": 2.8775019953803317e-05, "loss": 0.036, "step": 3159 }, { "epoch": 2.3321033210332103, "grad_norm": 0.29019067024169454, "learning_rate": 2.8714762854596112e-05, "loss": 0.0242, "step": 3160 }, { "epoch": 2.3328413284132843, "grad_norm": 0.24512865611253032, "learning_rate": 2.8654558334339666e-05, "loss": 0.0415, "step": 3161 }, { "epoch": 2.333579335793358, "grad_norm": 0.10294588195054627, "learning_rate": 2.8594406437439935e-05, "loss": 0.0147, "step": 3162 }, { "epoch": 2.334317343173432, "grad_norm": 0.17377144150561186, "learning_rate": 2.853430720826409e-05, "loss": 0.0274, "step": 3163 }, { "epoch": 2.3350553505535054, "grad_norm": 0.2760198659162367, "learning_rate": 2.847426069114043e-05, "loss": 0.0273, "step": 3164 }, { "epoch": 2.3357933579335795, "grad_norm": 0.29295967788090943, "learning_rate": 2.8414266930358367e-05, "loss": 0.04, "step": 3165 }, { "epoch": 2.336531365313653, "grad_norm": 0.1953183323709871, "learning_rate": 2.8354325970168484e-05, "loss": 0.0078, "step": 3166 }, { "epoch": 2.337269372693727, "grad_norm": 0.5841973297326986, "learning_rate": 2.829443785478233e-05, "loss": 0.0336, "step": 3167 }, { "epoch": 2.3380073800738006, "grad_norm": 0.0931321573893058, "learning_rate": 2.8234602628372508e-05, "loss": 0.0115, "step": 3168 }, { "epoch": 2.3387453874538746, "grad_norm": 0.15352331785322862, "learning_rate": 2.8174820335072595e-05, "loss": 0.0161, "step": 3169 }, { "epoch": 2.339483394833948, "grad_norm": 0.42108968112445444, "learning_rate": 2.8115091018977126e-05, "loss": 0.0187, "step": 3170 }, { "epoch": 2.340221402214022, "grad_norm": 0.34727457122514815, "learning_rate": 2.8055414724141647e-05, "loss": 0.0489, "step": 3171 }, { "epoch": 2.3409594095940958, "grad_norm": 0.1380639313337896, "learning_rate": 2.79957914945824e-05, "loss": 0.0177, "step": 3172 }, { "epoch": 2.3416974169741698, "grad_norm": 0.10089809713654659, "learning_rate": 2.7936221374276727e-05, "loss": 0.0116, "step": 3173 }, { "epoch": 2.3424354243542433, "grad_norm": 0.2291739426020743, "learning_rate": 2.787670440716259e-05, "loss": 0.0183, "step": 3174 }, { "epoch": 2.3431734317343174, "grad_norm": 0.3232518912407666, "learning_rate": 2.781724063713893e-05, "loss": 0.032, "step": 3175 }, { "epoch": 2.3439114391143914, "grad_norm": 0.3868246435870277, "learning_rate": 2.7757830108065276e-05, "loss": 0.0364, "step": 3176 }, { "epoch": 2.344649446494465, "grad_norm": 0.09490151510525402, "learning_rate": 2.769847286376197e-05, "loss": 0.0148, "step": 3177 }, { "epoch": 2.3453874538745385, "grad_norm": 0.2738118183091752, "learning_rate": 2.7639168948010097e-05, "loss": 0.0394, "step": 3178 }, { "epoch": 2.3461254612546125, "grad_norm": 0.09538179336664657, "learning_rate": 2.757991840455133e-05, "loss": 0.0087, "step": 3179 }, { "epoch": 2.3468634686346865, "grad_norm": 0.09304195396555316, "learning_rate": 2.7520721277088024e-05, "loss": 0.011, "step": 3180 }, { "epoch": 2.34760147601476, "grad_norm": 0.17239281186051486, "learning_rate": 2.7461577609283096e-05, "loss": 0.0305, "step": 3181 }, { "epoch": 2.348339483394834, "grad_norm": 0.220269405420583, "learning_rate": 2.7402487444760028e-05, "loss": 0.0256, "step": 3182 }, { "epoch": 2.3490774907749077, "grad_norm": 0.12295434242624591, "learning_rate": 2.7343450827102923e-05, "loss": 0.0272, "step": 3183 }, { "epoch": 2.3498154981549817, "grad_norm": 0.11461310485107849, "learning_rate": 2.7284467799856294e-05, "loss": 0.0126, "step": 3184 }, { "epoch": 2.3505535055350553, "grad_norm": 0.21128387242358396, "learning_rate": 2.7225538406525185e-05, "loss": 0.0346, "step": 3185 }, { "epoch": 2.3512915129151293, "grad_norm": 0.32134307139217794, "learning_rate": 2.7166662690574996e-05, "loss": 0.0193, "step": 3186 }, { "epoch": 2.352029520295203, "grad_norm": 0.17966101626253572, "learning_rate": 2.7107840695431706e-05, "loss": 0.0153, "step": 3187 }, { "epoch": 2.352767527675277, "grad_norm": 0.1610429210591076, "learning_rate": 2.7049072464481462e-05, "loss": 0.0137, "step": 3188 }, { "epoch": 2.3535055350553504, "grad_norm": 0.3222100840036858, "learning_rate": 2.6990358041070852e-05, "loss": 0.0515, "step": 3189 }, { "epoch": 2.3542435424354244, "grad_norm": 0.2609762966300961, "learning_rate": 2.6931697468506846e-05, "loss": 0.0497, "step": 3190 }, { "epoch": 2.354981549815498, "grad_norm": 0.2001271210704725, "learning_rate": 2.6873090790056586e-05, "loss": 0.0422, "step": 3191 }, { "epoch": 2.355719557195572, "grad_norm": 0.27492968599317996, "learning_rate": 2.6814538048947503e-05, "loss": 0.0263, "step": 3192 }, { "epoch": 2.3564575645756456, "grad_norm": 0.17272679299969623, "learning_rate": 2.675603928836723e-05, "loss": 0.0196, "step": 3193 }, { "epoch": 2.3571955719557196, "grad_norm": 0.39645531605563156, "learning_rate": 2.6697594551463647e-05, "loss": 0.0433, "step": 3194 }, { "epoch": 2.357933579335793, "grad_norm": 0.18838262427404137, "learning_rate": 2.663920388134471e-05, "loss": 0.0205, "step": 3195 }, { "epoch": 2.358671586715867, "grad_norm": 0.1578147252281312, "learning_rate": 2.658086732107853e-05, "loss": 0.0376, "step": 3196 }, { "epoch": 2.3594095940959408, "grad_norm": 0.5070172918412836, "learning_rate": 2.6522584913693294e-05, "loss": 0.1647, "step": 3197 }, { "epoch": 2.3601476014760148, "grad_norm": 0.18479014280475134, "learning_rate": 2.6464356702177228e-05, "loss": 0.0242, "step": 3198 }, { "epoch": 2.360885608856089, "grad_norm": 0.43002112231061196, "learning_rate": 2.6406182729478678e-05, "loss": 0.0377, "step": 3199 }, { "epoch": 2.3616236162361623, "grad_norm": 0.21381370559369606, "learning_rate": 2.6348063038505875e-05, "loss": 0.0346, "step": 3200 }, { "epoch": 2.362361623616236, "grad_norm": 0.09784237217132832, "learning_rate": 2.6289997672127077e-05, "loss": 0.0102, "step": 3201 }, { "epoch": 2.36309963099631, "grad_norm": 0.1040162830869226, "learning_rate": 2.6231986673170416e-05, "loss": 0.0078, "step": 3202 }, { "epoch": 2.363837638376384, "grad_norm": 0.14265940280549783, "learning_rate": 2.6174030084423997e-05, "loss": 0.0301, "step": 3203 }, { "epoch": 2.3645756457564575, "grad_norm": 0.27509970638170067, "learning_rate": 2.6116127948635728e-05, "loss": 0.0306, "step": 3204 }, { "epoch": 2.3653136531365315, "grad_norm": 0.23147848687117042, "learning_rate": 2.605828030851336e-05, "loss": 0.0216, "step": 3205 }, { "epoch": 2.366051660516605, "grad_norm": 0.18124190912837412, "learning_rate": 2.6000487206724534e-05, "loss": 0.0291, "step": 3206 }, { "epoch": 2.366789667896679, "grad_norm": 0.31388160643375934, "learning_rate": 2.5942748685896546e-05, "loss": 0.0409, "step": 3207 }, { "epoch": 2.3675276752767527, "grad_norm": 0.20256074029287885, "learning_rate": 2.588506478861651e-05, "loss": 0.1024, "step": 3208 }, { "epoch": 2.3682656826568267, "grad_norm": 0.11107821082525453, "learning_rate": 2.5827435557431212e-05, "loss": 0.0167, "step": 3209 }, { "epoch": 2.3690036900369003, "grad_norm": 0.09767934181967744, "learning_rate": 2.576986103484711e-05, "loss": 0.0133, "step": 3210 }, { "epoch": 2.3697416974169743, "grad_norm": 0.2384515675483358, "learning_rate": 2.5712341263330387e-05, "loss": 0.028, "step": 3211 }, { "epoch": 2.370479704797048, "grad_norm": 0.2001797414674445, "learning_rate": 2.565487628530676e-05, "loss": 0.0213, "step": 3212 }, { "epoch": 2.371217712177122, "grad_norm": 0.22956499163745184, "learning_rate": 2.5597466143161562e-05, "loss": 0.0323, "step": 3213 }, { "epoch": 2.3719557195571954, "grad_norm": 0.21520565216323004, "learning_rate": 2.5540110879239644e-05, "loss": 0.031, "step": 3214 }, { "epoch": 2.3726937269372694, "grad_norm": 0.1404695258985454, "learning_rate": 2.548281053584547e-05, "loss": 0.0196, "step": 3215 }, { "epoch": 2.373431734317343, "grad_norm": 0.18277675249505293, "learning_rate": 2.5425565155242935e-05, "loss": 0.0209, "step": 3216 }, { "epoch": 2.374169741697417, "grad_norm": 0.13160969423365756, "learning_rate": 2.5368374779655303e-05, "loss": 0.0192, "step": 3217 }, { "epoch": 2.3749077490774906, "grad_norm": 0.34117037692664487, "learning_rate": 2.531123945126547e-05, "loss": 0.03, "step": 3218 }, { "epoch": 2.3756457564575646, "grad_norm": 0.33639210408969145, "learning_rate": 2.5254159212215568e-05, "loss": 0.0534, "step": 3219 }, { "epoch": 2.376383763837638, "grad_norm": 0.3116437319081959, "learning_rate": 2.5197134104607145e-05, "loss": 0.0456, "step": 3220 }, { "epoch": 2.377121771217712, "grad_norm": 0.10936798032607715, "learning_rate": 2.514016417050109e-05, "loss": 0.0128, "step": 3221 }, { "epoch": 2.377859778597786, "grad_norm": 0.16113250661021003, "learning_rate": 2.5083249451917622e-05, "loss": 0.0188, "step": 3222 }, { "epoch": 2.3785977859778598, "grad_norm": 0.1786015642486653, "learning_rate": 2.5026389990836195e-05, "loss": 0.0171, "step": 3223 }, { "epoch": 2.3793357933579338, "grad_norm": 0.135916496486306, "learning_rate": 2.496958582919552e-05, "loss": 0.0177, "step": 3224 }, { "epoch": 2.3800738007380073, "grad_norm": 0.19886228105438114, "learning_rate": 2.4912837008893498e-05, "loss": 0.0161, "step": 3225 }, { "epoch": 2.3808118081180814, "grad_norm": 0.25728552820810746, "learning_rate": 2.4856143571787214e-05, "loss": 0.0407, "step": 3226 }, { "epoch": 2.381549815498155, "grad_norm": 0.25818378499401884, "learning_rate": 2.4799505559692994e-05, "loss": 0.0283, "step": 3227 }, { "epoch": 2.382287822878229, "grad_norm": 0.17585990259000525, "learning_rate": 2.4742923014386156e-05, "loss": 0.0194, "step": 3228 }, { "epoch": 2.3830258302583025, "grad_norm": 0.4112112949739828, "learning_rate": 2.4686395977601163e-05, "loss": 0.0444, "step": 3229 }, { "epoch": 2.3837638376383765, "grad_norm": 0.3450198058062559, "learning_rate": 2.462992449103154e-05, "loss": 0.0424, "step": 3230 }, { "epoch": 2.38450184501845, "grad_norm": 0.22734104010757472, "learning_rate": 2.457350859632981e-05, "loss": 0.0244, "step": 3231 }, { "epoch": 2.385239852398524, "grad_norm": 0.34456359543976084, "learning_rate": 2.4517148335107587e-05, "loss": 0.0454, "step": 3232 }, { "epoch": 2.3859778597785977, "grad_norm": 0.10610255882462101, "learning_rate": 2.446084374893526e-05, "loss": 0.0114, "step": 3233 }, { "epoch": 2.3867158671586717, "grad_norm": 0.17432286921077003, "learning_rate": 2.440459487934237e-05, "loss": 0.0195, "step": 3234 }, { "epoch": 2.3874538745387452, "grad_norm": 0.4127723130588193, "learning_rate": 2.4348401767817218e-05, "loss": 0.0496, "step": 3235 }, { "epoch": 2.3881918819188193, "grad_norm": 0.1997310581982227, "learning_rate": 2.4292264455807036e-05, "loss": 0.0383, "step": 3236 }, { "epoch": 2.388929889298893, "grad_norm": 0.2072212535662147, "learning_rate": 2.4236182984717883e-05, "loss": 0.0169, "step": 3237 }, { "epoch": 2.389667896678967, "grad_norm": 0.3230347767253843, "learning_rate": 2.4180157395914606e-05, "loss": 0.0384, "step": 3238 }, { "epoch": 2.3904059040590404, "grad_norm": 0.16375008863700216, "learning_rate": 2.4124187730720917e-05, "loss": 0.0228, "step": 3239 }, { "epoch": 2.3911439114391144, "grad_norm": 0.2205394516338925, "learning_rate": 2.406827403041918e-05, "loss": 0.0229, "step": 3240 }, { "epoch": 2.3918819188191884, "grad_norm": 0.17629260157560134, "learning_rate": 2.4012416336250553e-05, "loss": 0.0159, "step": 3241 }, { "epoch": 2.392619926199262, "grad_norm": 0.11808415005453606, "learning_rate": 2.3956614689414846e-05, "loss": 0.0178, "step": 3242 }, { "epoch": 2.3933579335793356, "grad_norm": 0.16479915569728312, "learning_rate": 2.3900869131070504e-05, "loss": 0.0248, "step": 3243 }, { "epoch": 2.3940959409594096, "grad_norm": 0.2404422513853399, "learning_rate": 2.384517970233473e-05, "loss": 0.0245, "step": 3244 }, { "epoch": 2.3948339483394836, "grad_norm": 0.16253973064624544, "learning_rate": 2.3789546444283105e-05, "loss": 0.0224, "step": 3245 }, { "epoch": 2.395571955719557, "grad_norm": 0.21337766475966732, "learning_rate": 2.373396939795002e-05, "loss": 0.0208, "step": 3246 }, { "epoch": 2.396309963099631, "grad_norm": 0.20745088195610842, "learning_rate": 2.3678448604328207e-05, "loss": 0.0156, "step": 3247 }, { "epoch": 2.3970479704797047, "grad_norm": 0.2759178549646178, "learning_rate": 2.3622984104369106e-05, "loss": 0.0955, "step": 3248 }, { "epoch": 2.3977859778597788, "grad_norm": 0.23259459166882152, "learning_rate": 2.3567575938982422e-05, "loss": 0.0379, "step": 3249 }, { "epoch": 2.3985239852398523, "grad_norm": 0.1649414110149184, "learning_rate": 2.351222414903642e-05, "loss": 0.0137, "step": 3250 }, { "epoch": 2.3992619926199263, "grad_norm": 0.1849445901911328, "learning_rate": 2.345692877535781e-05, "loss": 0.0193, "step": 3251 }, { "epoch": 2.4, "grad_norm": 0.17642119616121557, "learning_rate": 2.3401689858731644e-05, "loss": 0.0381, "step": 3252 }, { "epoch": 2.400738007380074, "grad_norm": 0.14190853037745613, "learning_rate": 2.3346507439901333e-05, "loss": 0.0198, "step": 3253 }, { "epoch": 2.4014760147601475, "grad_norm": 0.2755011022797041, "learning_rate": 2.3291381559568593e-05, "loss": 0.0193, "step": 3254 }, { "epoch": 2.4022140221402215, "grad_norm": 0.19451927139274092, "learning_rate": 2.3236312258393522e-05, "loss": 0.0383, "step": 3255 }, { "epoch": 2.402952029520295, "grad_norm": 0.14456691796559396, "learning_rate": 2.3181299576994454e-05, "loss": 0.0208, "step": 3256 }, { "epoch": 2.403690036900369, "grad_norm": 0.20593051297845155, "learning_rate": 2.3126343555947825e-05, "loss": 0.0224, "step": 3257 }, { "epoch": 2.4044280442804427, "grad_norm": 0.12594528858360468, "learning_rate": 2.307144423578851e-05, "loss": 0.0148, "step": 3258 }, { "epoch": 2.4051660516605167, "grad_norm": 0.14708975659905213, "learning_rate": 2.301660165700936e-05, "loss": 0.0144, "step": 3259 }, { "epoch": 2.4059040590405902, "grad_norm": 0.2000796370776719, "learning_rate": 2.2961815860061576e-05, "loss": 0.0368, "step": 3260 }, { "epoch": 2.4066420664206642, "grad_norm": 0.22247403902378, "learning_rate": 2.2907086885354223e-05, "loss": 0.0473, "step": 3261 }, { "epoch": 2.407380073800738, "grad_norm": 0.16949404959329215, "learning_rate": 2.2852414773254694e-05, "loss": 0.0157, "step": 3262 }, { "epoch": 2.408118081180812, "grad_norm": 0.20374968040342845, "learning_rate": 2.2797799564088308e-05, "loss": 0.0218, "step": 3263 }, { "epoch": 2.408856088560886, "grad_norm": 0.3070673448880489, "learning_rate": 2.274324129813844e-05, "loss": 0.0269, "step": 3264 }, { "epoch": 2.4095940959409594, "grad_norm": 0.16661973617990686, "learning_rate": 2.2688740015646482e-05, "loss": 0.0274, "step": 3265 }, { "epoch": 2.410332103321033, "grad_norm": 0.18735341586695547, "learning_rate": 2.2634295756811752e-05, "loss": 0.0211, "step": 3266 }, { "epoch": 2.411070110701107, "grad_norm": 0.4002244933256233, "learning_rate": 2.2579908561791596e-05, "loss": 0.0454, "step": 3267 }, { "epoch": 2.411808118081181, "grad_norm": 0.1540986439263939, "learning_rate": 2.2525578470701192e-05, "loss": 0.017, "step": 3268 }, { "epoch": 2.4125461254612546, "grad_norm": 0.12284990147276235, "learning_rate": 2.2471305523613616e-05, "loss": 0.0187, "step": 3269 }, { "epoch": 2.4132841328413286, "grad_norm": 0.30819351889945246, "learning_rate": 2.2417089760559807e-05, "loss": 0.0245, "step": 3270 }, { "epoch": 2.414022140221402, "grad_norm": 0.30036528909085547, "learning_rate": 2.2362931221528495e-05, "loss": 0.045, "step": 3271 }, { "epoch": 2.414760147601476, "grad_norm": 0.3050965755355254, "learning_rate": 2.2308829946466302e-05, "loss": 0.0382, "step": 3272 }, { "epoch": 2.4154981549815497, "grad_norm": 0.17786406568509042, "learning_rate": 2.2254785975277437e-05, "loss": 0.0278, "step": 3273 }, { "epoch": 2.4162361623616238, "grad_norm": 0.14059951472355703, "learning_rate": 2.220079934782402e-05, "loss": 0.0271, "step": 3274 }, { "epoch": 2.4169741697416973, "grad_norm": 0.20549571614945555, "learning_rate": 2.2146870103925743e-05, "loss": 0.035, "step": 3275 }, { "epoch": 2.4177121771217713, "grad_norm": 0.17062919508653548, "learning_rate": 2.2092998283360122e-05, "loss": 0.0246, "step": 3276 }, { "epoch": 2.418450184501845, "grad_norm": 0.3366266650209467, "learning_rate": 2.203918392586215e-05, "loss": 0.0387, "step": 3277 }, { "epoch": 2.419188191881919, "grad_norm": 0.15031664595564537, "learning_rate": 2.1985427071124488e-05, "loss": 0.0156, "step": 3278 }, { "epoch": 2.4199261992619925, "grad_norm": 0.10696007791595248, "learning_rate": 2.1931727758797484e-05, "loss": 0.0129, "step": 3279 }, { "epoch": 2.4206642066420665, "grad_norm": 0.1428351625232273, "learning_rate": 2.187808602848892e-05, "loss": 0.0258, "step": 3280 }, { "epoch": 2.42140221402214, "grad_norm": 0.17384921646836501, "learning_rate": 2.1824501919764163e-05, "loss": 0.0213, "step": 3281 }, { "epoch": 2.422140221402214, "grad_norm": 0.09424857245985316, "learning_rate": 2.177097547214605e-05, "loss": 0.0102, "step": 3282 }, { "epoch": 2.4228782287822876, "grad_norm": 0.15125365985163436, "learning_rate": 2.1717506725114955e-05, "loss": 0.0227, "step": 3283 }, { "epoch": 2.4236162361623617, "grad_norm": 0.1694135859906646, "learning_rate": 2.1664095718108625e-05, "loss": 0.0227, "step": 3284 }, { "epoch": 2.4243542435424352, "grad_norm": 0.1513937814986632, "learning_rate": 2.161074249052223e-05, "loss": 0.0246, "step": 3285 }, { "epoch": 2.4250922509225092, "grad_norm": 0.0947375532334934, "learning_rate": 2.155744708170834e-05, "loss": 0.0101, "step": 3286 }, { "epoch": 2.4258302583025833, "grad_norm": 0.1750388435923026, "learning_rate": 2.1504209530976828e-05, "loss": 0.0185, "step": 3287 }, { "epoch": 2.426568265682657, "grad_norm": 0.09436036424695818, "learning_rate": 2.1451029877595042e-05, "loss": 0.0133, "step": 3288 }, { "epoch": 2.4273062730627304, "grad_norm": 0.07674861498499175, "learning_rate": 2.1397908160787415e-05, "loss": 0.0123, "step": 3289 }, { "epoch": 2.4280442804428044, "grad_norm": 0.22202765730261967, "learning_rate": 2.1344844419735755e-05, "loss": 0.0271, "step": 3290 }, { "epoch": 2.4287822878228784, "grad_norm": 0.19675077420623793, "learning_rate": 2.129183869357917e-05, "loss": 0.0247, "step": 3291 }, { "epoch": 2.429520295202952, "grad_norm": 0.5326844500685934, "learning_rate": 2.1238891021413863e-05, "loss": 0.044, "step": 3292 }, { "epoch": 2.430258302583026, "grad_norm": 0.10563908419390547, "learning_rate": 2.118600144229328e-05, "loss": 0.0155, "step": 3293 }, { "epoch": 2.4309963099630996, "grad_norm": 0.38816758412179103, "learning_rate": 2.1133169995227963e-05, "loss": 0.0567, "step": 3294 }, { "epoch": 2.4317343173431736, "grad_norm": 0.1837222691613184, "learning_rate": 2.108039671918568e-05, "loss": 0.0179, "step": 3295 }, { "epoch": 2.432472324723247, "grad_norm": 0.09477577139993966, "learning_rate": 2.1027681653091215e-05, "loss": 0.0109, "step": 3296 }, { "epoch": 2.433210332103321, "grad_norm": 0.1993495396193277, "learning_rate": 2.0975024835826397e-05, "loss": 0.0202, "step": 3297 }, { "epoch": 2.4339483394833947, "grad_norm": 0.2084240371674774, "learning_rate": 2.092242630623016e-05, "loss": 0.0273, "step": 3298 }, { "epoch": 2.4346863468634687, "grad_norm": 0.2730603920840704, "learning_rate": 2.0869886103098357e-05, "loss": 0.0371, "step": 3299 }, { "epoch": 2.4354243542435423, "grad_norm": 0.114582393140407, "learning_rate": 2.0817404265183958e-05, "loss": 0.0162, "step": 3300 }, { "epoch": 2.4361623616236163, "grad_norm": 0.11925559302489061, "learning_rate": 2.0764980831196745e-05, "loss": 0.0158, "step": 3301 }, { "epoch": 2.43690036900369, "grad_norm": 0.15090343699277436, "learning_rate": 2.0712615839803507e-05, "loss": 0.0213, "step": 3302 }, { "epoch": 2.437638376383764, "grad_norm": 0.2698877421117423, "learning_rate": 2.066030932962787e-05, "loss": 0.0623, "step": 3303 }, { "epoch": 2.4383763837638375, "grad_norm": 0.402124311087859, "learning_rate": 2.0608061339250373e-05, "loss": 0.0402, "step": 3304 }, { "epoch": 2.4391143911439115, "grad_norm": 0.25164396030719743, "learning_rate": 2.0555871907208358e-05, "loss": 0.0249, "step": 3305 }, { "epoch": 2.439852398523985, "grad_norm": 0.192066632865975, "learning_rate": 2.0503741071995965e-05, "loss": 0.0167, "step": 3306 }, { "epoch": 2.440590405904059, "grad_norm": 0.21780237580443146, "learning_rate": 2.04516688720642e-05, "loss": 0.0304, "step": 3307 }, { "epoch": 2.4413284132841326, "grad_norm": 0.24652995015860388, "learning_rate": 2.039965534582071e-05, "loss": 0.0187, "step": 3308 }, { "epoch": 2.4420664206642066, "grad_norm": 0.1634654023122102, "learning_rate": 2.034770053162994e-05, "loss": 0.0264, "step": 3309 }, { "epoch": 2.4428044280442807, "grad_norm": 0.19107935741341917, "learning_rate": 2.0295804467812984e-05, "loss": 0.0241, "step": 3310 }, { "epoch": 2.4435424354243542, "grad_norm": 0.15637154878454296, "learning_rate": 2.0243967192647606e-05, "loss": 0.0251, "step": 3311 }, { "epoch": 2.444280442804428, "grad_norm": 0.10566588029174721, "learning_rate": 2.0192188744368268e-05, "loss": 0.0088, "step": 3312 }, { "epoch": 2.445018450184502, "grad_norm": 0.0832645910073872, "learning_rate": 2.0140469161165975e-05, "loss": 0.0144, "step": 3313 }, { "epoch": 2.445756457564576, "grad_norm": 0.13576095528873, "learning_rate": 2.0088808481188337e-05, "loss": 0.0208, "step": 3314 }, { "epoch": 2.4464944649446494, "grad_norm": 0.22370581188299693, "learning_rate": 2.0037206742539495e-05, "loss": 0.0238, "step": 3315 }, { "epoch": 2.4472324723247234, "grad_norm": 0.23044754677830512, "learning_rate": 1.998566398328019e-05, "loss": 0.0116, "step": 3316 }, { "epoch": 2.447970479704797, "grad_norm": 0.24543991131215392, "learning_rate": 1.9934180241427604e-05, "loss": 0.0297, "step": 3317 }, { "epoch": 2.448708487084871, "grad_norm": 0.17196843723895627, "learning_rate": 1.98827555549553e-05, "loss": 0.0213, "step": 3318 }, { "epoch": 2.4494464944649446, "grad_norm": 0.29157112293918624, "learning_rate": 1.983138996179349e-05, "loss": 0.0337, "step": 3319 }, { "epoch": 2.4501845018450186, "grad_norm": 0.2068827429366198, "learning_rate": 1.9780083499828637e-05, "loss": 0.0212, "step": 3320 }, { "epoch": 2.450922509225092, "grad_norm": 0.10097757554704208, "learning_rate": 1.9728836206903656e-05, "loss": 0.0136, "step": 3321 }, { "epoch": 2.451660516605166, "grad_norm": 0.18888836337898027, "learning_rate": 1.9677648120817748e-05, "loss": 0.0517, "step": 3322 }, { "epoch": 2.4523985239852397, "grad_norm": 0.15563828619501582, "learning_rate": 1.962651927932657e-05, "loss": 0.025, "step": 3323 }, { "epoch": 2.4531365313653137, "grad_norm": 0.15782428870414594, "learning_rate": 1.957544972014199e-05, "loss": 0.019, "step": 3324 }, { "epoch": 2.4538745387453873, "grad_norm": 0.13292573242089128, "learning_rate": 1.9524439480932144e-05, "loss": 0.0178, "step": 3325 }, { "epoch": 2.4546125461254613, "grad_norm": 0.15465665931766737, "learning_rate": 1.9473488599321465e-05, "loss": 0.0246, "step": 3326 }, { "epoch": 2.455350553505535, "grad_norm": 0.2239454382510741, "learning_rate": 1.942259711289055e-05, "loss": 0.0233, "step": 3327 }, { "epoch": 2.456088560885609, "grad_norm": 0.3640434341757827, "learning_rate": 1.937176505917626e-05, "loss": 0.0242, "step": 3328 }, { "epoch": 2.4568265682656825, "grad_norm": 0.14551392275372732, "learning_rate": 1.932099247567155e-05, "loss": 0.0298, "step": 3329 }, { "epoch": 2.4575645756457565, "grad_norm": 0.2457704392435993, "learning_rate": 1.927027939982554e-05, "loss": 0.0645, "step": 3330 }, { "epoch": 2.45830258302583, "grad_norm": 0.18864186873890582, "learning_rate": 1.9219625869043457e-05, "loss": 0.0275, "step": 3331 }, { "epoch": 2.459040590405904, "grad_norm": 0.2014064015408733, "learning_rate": 1.9169031920686586e-05, "loss": 0.0176, "step": 3332 }, { "epoch": 2.459778597785978, "grad_norm": 0.2501076965786885, "learning_rate": 1.911849759207235e-05, "loss": 0.0398, "step": 3333 }, { "epoch": 2.4605166051660516, "grad_norm": 0.1370391793611074, "learning_rate": 1.9068022920474025e-05, "loss": 0.0274, "step": 3334 }, { "epoch": 2.4612546125461257, "grad_norm": 0.16904308351595512, "learning_rate": 1.9017607943121085e-05, "loss": 0.022, "step": 3335 }, { "epoch": 2.461992619926199, "grad_norm": 0.08704830806271431, "learning_rate": 1.8967252697198856e-05, "loss": 0.013, "step": 3336 }, { "epoch": 2.4627306273062732, "grad_norm": 0.22031195735779135, "learning_rate": 1.891695721984862e-05, "loss": 0.0179, "step": 3337 }, { "epoch": 2.463468634686347, "grad_norm": 0.2038311524054918, "learning_rate": 1.8866721548167598e-05, "loss": 0.0203, "step": 3338 }, { "epoch": 2.464206642066421, "grad_norm": 0.12691859390322, "learning_rate": 1.8816545719208857e-05, "loss": 0.0201, "step": 3339 }, { "epoch": 2.4649446494464944, "grad_norm": 0.2871527897545908, "learning_rate": 1.87664297699814e-05, "loss": 0.0204, "step": 3340 }, { "epoch": 2.4656826568265684, "grad_norm": 0.39095469708437136, "learning_rate": 1.871637373745001e-05, "loss": 0.0328, "step": 3341 }, { "epoch": 2.466420664206642, "grad_norm": 0.4344337258872477, "learning_rate": 1.8666377658535284e-05, "loss": 0.036, "step": 3342 }, { "epoch": 2.467158671586716, "grad_norm": 0.25600148383106314, "learning_rate": 1.8616441570113586e-05, "loss": 0.0567, "step": 3343 }, { "epoch": 2.4678966789667895, "grad_norm": 0.11345307746368283, "learning_rate": 1.856656550901703e-05, "loss": 0.0314, "step": 3344 }, { "epoch": 2.4686346863468636, "grad_norm": 0.582078458866324, "learning_rate": 1.851674951203356e-05, "loss": 0.0725, "step": 3345 }, { "epoch": 2.469372693726937, "grad_norm": 0.22965032070096345, "learning_rate": 1.8466993615906603e-05, "loss": 0.0181, "step": 3346 }, { "epoch": 2.470110701107011, "grad_norm": 0.2766242461060504, "learning_rate": 1.841729785733547e-05, "loss": 0.031, "step": 3347 }, { "epoch": 2.4708487084870847, "grad_norm": 0.17093839138324307, "learning_rate": 1.8367662272974985e-05, "loss": 0.0221, "step": 3348 }, { "epoch": 2.4715867158671587, "grad_norm": 0.20267442382269765, "learning_rate": 1.8318086899435693e-05, "loss": 0.0247, "step": 3349 }, { "epoch": 2.4723247232472323, "grad_norm": 0.16028568902619594, "learning_rate": 1.8268571773283595e-05, "loss": 0.0208, "step": 3350 }, { "epoch": 2.4730627306273063, "grad_norm": 0.3636661673831604, "learning_rate": 1.8219116931040327e-05, "loss": 0.0319, "step": 3351 }, { "epoch": 2.4738007380073803, "grad_norm": 0.4607622988649372, "learning_rate": 1.8169722409183097e-05, "loss": 0.0362, "step": 3352 }, { "epoch": 2.474538745387454, "grad_norm": 0.4801826598745878, "learning_rate": 1.8120388244144583e-05, "loss": 0.0264, "step": 3353 }, { "epoch": 2.4752767527675275, "grad_norm": 0.19333416300138706, "learning_rate": 1.8071114472312922e-05, "loss": 0.0196, "step": 3354 }, { "epoch": 2.4760147601476015, "grad_norm": 0.1830625556530548, "learning_rate": 1.8021901130031714e-05, "loss": 0.0156, "step": 3355 }, { "epoch": 2.4767527675276755, "grad_norm": 0.29881146815023113, "learning_rate": 1.7972748253600058e-05, "loss": 0.0262, "step": 3356 }, { "epoch": 2.477490774907749, "grad_norm": 0.1433939036504998, "learning_rate": 1.7923655879272393e-05, "loss": 0.0118, "step": 3357 }, { "epoch": 2.478228782287823, "grad_norm": 0.10003433062961715, "learning_rate": 1.787462404325846e-05, "loss": 0.0161, "step": 3358 }, { "epoch": 2.4789667896678966, "grad_norm": 0.0725306881697179, "learning_rate": 1.78256527817235e-05, "loss": 0.0081, "step": 3359 }, { "epoch": 2.4797047970479706, "grad_norm": 0.2374340029952281, "learning_rate": 1.777674213078796e-05, "loss": 0.0318, "step": 3360 }, { "epoch": 2.480442804428044, "grad_norm": 0.1483264350851746, "learning_rate": 1.772789212652769e-05, "loss": 0.0278, "step": 3361 }, { "epoch": 2.4811808118081182, "grad_norm": 0.2894104127014279, "learning_rate": 1.7679102804973635e-05, "loss": 0.0447, "step": 3362 }, { "epoch": 2.481918819188192, "grad_norm": 0.08209882136002478, "learning_rate": 1.7630374202112177e-05, "loss": 0.0092, "step": 3363 }, { "epoch": 2.482656826568266, "grad_norm": 0.20760068105074714, "learning_rate": 1.7581706353884786e-05, "loss": 0.0233, "step": 3364 }, { "epoch": 2.4833948339483394, "grad_norm": 0.19255454120290377, "learning_rate": 1.753309929618816e-05, "loss": 0.01, "step": 3365 }, { "epoch": 2.4841328413284134, "grad_norm": 0.0759842753827046, "learning_rate": 1.7484553064874155e-05, "loss": 0.0071, "step": 3366 }, { "epoch": 2.484870848708487, "grad_norm": 0.1567561799964635, "learning_rate": 1.7436067695749736e-05, "loss": 0.0241, "step": 3367 }, { "epoch": 2.485608856088561, "grad_norm": 0.1699229633334765, "learning_rate": 1.7387643224577054e-05, "loss": 0.0145, "step": 3368 }, { "epoch": 2.4863468634686345, "grad_norm": 0.10354868799269043, "learning_rate": 1.7339279687073273e-05, "loss": 0.0178, "step": 3369 }, { "epoch": 2.4870848708487086, "grad_norm": 0.4761498474139563, "learning_rate": 1.7290977118910634e-05, "loss": 0.061, "step": 3370 }, { "epoch": 2.487822878228782, "grad_norm": 0.15681936987958178, "learning_rate": 1.7242735555716395e-05, "loss": 0.0205, "step": 3371 }, { "epoch": 2.488560885608856, "grad_norm": 0.5617277857509819, "learning_rate": 1.71945550330728e-05, "loss": 0.0476, "step": 3372 }, { "epoch": 2.4892988929889297, "grad_norm": 0.23612344860881376, "learning_rate": 1.7146435586517195e-05, "loss": 0.0179, "step": 3373 }, { "epoch": 2.4900369003690037, "grad_norm": 0.19634216329952844, "learning_rate": 1.7098377251541676e-05, "loss": 0.0232, "step": 3374 }, { "epoch": 2.4907749077490777, "grad_norm": 0.32210457900364, "learning_rate": 1.705038006359343e-05, "loss": 0.019, "step": 3375 }, { "epoch": 2.4915129151291513, "grad_norm": 0.08164744654973913, "learning_rate": 1.700244405807445e-05, "loss": 0.0079, "step": 3376 }, { "epoch": 2.492250922509225, "grad_norm": 0.23107716961224936, "learning_rate": 1.6954569270341692e-05, "loss": 0.0599, "step": 3377 }, { "epoch": 2.492988929889299, "grad_norm": 0.5846164546003884, "learning_rate": 1.6906755735706847e-05, "loss": 0.0312, "step": 3378 }, { "epoch": 2.493726937269373, "grad_norm": 0.2115321757542089, "learning_rate": 1.6859003489436464e-05, "loss": 0.0192, "step": 3379 }, { "epoch": 2.4944649446494465, "grad_norm": 0.18036999701489956, "learning_rate": 1.6811312566751956e-05, "loss": 0.0181, "step": 3380 }, { "epoch": 2.4952029520295205, "grad_norm": 0.18790796219747283, "learning_rate": 1.6763683002829433e-05, "loss": 0.0248, "step": 3381 }, { "epoch": 2.495940959409594, "grad_norm": 0.3735512214423692, "learning_rate": 1.6716114832799757e-05, "loss": 0.0381, "step": 3382 }, { "epoch": 2.496678966789668, "grad_norm": 0.1318971337139899, "learning_rate": 1.6668608091748495e-05, "loss": 0.0185, "step": 3383 }, { "epoch": 2.4974169741697416, "grad_norm": 0.18172114058502348, "learning_rate": 1.6621162814715973e-05, "loss": 0.0203, "step": 3384 }, { "epoch": 2.4981549815498156, "grad_norm": 0.3649291810848237, "learning_rate": 1.6573779036697123e-05, "loss": 0.0357, "step": 3385 }, { "epoch": 2.498892988929889, "grad_norm": 0.21338632929586973, "learning_rate": 1.652645679264152e-05, "loss": 0.0341, "step": 3386 }, { "epoch": 2.499630996309963, "grad_norm": 0.1148203797463043, "learning_rate": 1.6479196117453355e-05, "loss": 0.0132, "step": 3387 }, { "epoch": 2.500369003690037, "grad_norm": 0.2613407726231511, "learning_rate": 1.64319970459914e-05, "loss": 0.0291, "step": 3388 }, { "epoch": 2.501107011070111, "grad_norm": 0.20986428995501913, "learning_rate": 1.6384859613069058e-05, "loss": 0.0178, "step": 3389 }, { "epoch": 2.5018450184501844, "grad_norm": 0.17304987957122656, "learning_rate": 1.6337783853454126e-05, "loss": 0.0209, "step": 3390 }, { "epoch": 2.5025830258302584, "grad_norm": 0.11500231492003732, "learning_rate": 1.6290769801869078e-05, "loss": 0.0137, "step": 3391 }, { "epoch": 2.503321033210332, "grad_norm": 0.1281546942528599, "learning_rate": 1.624381749299074e-05, "loss": 0.0228, "step": 3392 }, { "epoch": 2.504059040590406, "grad_norm": 0.2770089409272177, "learning_rate": 1.6196926961450488e-05, "loss": 0.0181, "step": 3393 }, { "epoch": 2.50479704797048, "grad_norm": 0.10836046996008941, "learning_rate": 1.6150098241834067e-05, "loss": 0.0126, "step": 3394 }, { "epoch": 2.5055350553505535, "grad_norm": 0.2387843944126391, "learning_rate": 1.6103331368681628e-05, "loss": 0.0227, "step": 3395 }, { "epoch": 2.506273062730627, "grad_norm": 0.3202855328195141, "learning_rate": 1.6056626376487814e-05, "loss": 0.0387, "step": 3396 }, { "epoch": 2.507011070110701, "grad_norm": 0.18973345623720278, "learning_rate": 1.600998329970149e-05, "loss": 0.032, "step": 3397 }, { "epoch": 2.507749077490775, "grad_norm": 0.15100054856649783, "learning_rate": 1.5963402172725928e-05, "loss": 0.0353, "step": 3398 }, { "epoch": 2.5084870848708487, "grad_norm": 0.1222591870087853, "learning_rate": 1.591688302991867e-05, "loss": 0.0143, "step": 3399 }, { "epoch": 2.5092250922509223, "grad_norm": 0.06637587976637706, "learning_rate": 1.587042590559156e-05, "loss": 0.0077, "step": 3400 }, { "epoch": 2.5099630996309963, "grad_norm": 0.11470602312694109, "learning_rate": 1.582403083401074e-05, "loss": 0.0148, "step": 3401 }, { "epoch": 2.5107011070110703, "grad_norm": 0.1381473305523559, "learning_rate": 1.5777697849396445e-05, "loss": 0.0177, "step": 3402 }, { "epoch": 2.511439114391144, "grad_norm": 0.1947063510150245, "learning_rate": 1.5731426985923302e-05, "loss": 0.0424, "step": 3403 }, { "epoch": 2.5121771217712174, "grad_norm": 0.17052583279803374, "learning_rate": 1.5685218277719982e-05, "loss": 0.0187, "step": 3404 }, { "epoch": 2.5129151291512914, "grad_norm": 0.28043876608539, "learning_rate": 1.563907175886935e-05, "loss": 0.0463, "step": 3405 }, { "epoch": 2.5136531365313655, "grad_norm": 0.11285822844521773, "learning_rate": 1.5592987463408424e-05, "loss": 0.0074, "step": 3406 }, { "epoch": 2.514391143911439, "grad_norm": 0.1345171723156449, "learning_rate": 1.5546965425328273e-05, "loss": 0.0161, "step": 3407 }, { "epoch": 2.515129151291513, "grad_norm": 0.5403848115572503, "learning_rate": 1.550100567857412e-05, "loss": 0.0393, "step": 3408 }, { "epoch": 2.5158671586715866, "grad_norm": 0.14775635906075338, "learning_rate": 1.5455108257045205e-05, "loss": 0.0141, "step": 3409 }, { "epoch": 2.5166051660516606, "grad_norm": 0.1450302119059882, "learning_rate": 1.5409273194594765e-05, "loss": 0.0118, "step": 3410 }, { "epoch": 2.517343173431734, "grad_norm": 0.2761936025055043, "learning_rate": 1.5363500525030096e-05, "loss": 0.0453, "step": 3411 }, { "epoch": 2.518081180811808, "grad_norm": 0.25912545521795055, "learning_rate": 1.531779028211241e-05, "loss": 0.0277, "step": 3412 }, { "epoch": 2.5188191881918818, "grad_norm": 0.15651542751603353, "learning_rate": 1.5272142499556983e-05, "loss": 0.0171, "step": 3413 }, { "epoch": 2.519557195571956, "grad_norm": 0.14991612127900844, "learning_rate": 1.522655721103291e-05, "loss": 0.0148, "step": 3414 }, { "epoch": 2.5202952029520294, "grad_norm": 0.11113313459387927, "learning_rate": 1.5181034450163245e-05, "loss": 0.0092, "step": 3415 }, { "epoch": 2.5210332103321034, "grad_norm": 0.3386566854435621, "learning_rate": 1.5135574250524897e-05, "loss": 0.0236, "step": 3416 }, { "epoch": 2.5217712177121774, "grad_norm": 0.16964358087482448, "learning_rate": 1.5090176645648702e-05, "loss": 0.0164, "step": 3417 }, { "epoch": 2.522509225092251, "grad_norm": 0.22023336962853687, "learning_rate": 1.5044841669019194e-05, "loss": 0.0459, "step": 3418 }, { "epoch": 2.5232472324723245, "grad_norm": 0.31388750707197655, "learning_rate": 1.4999569354074817e-05, "loss": 0.0197, "step": 3419 }, { "epoch": 2.5239852398523985, "grad_norm": 0.08515171225590898, "learning_rate": 1.4954359734207791e-05, "loss": 0.0109, "step": 3420 }, { "epoch": 2.5247232472324725, "grad_norm": 0.22147685867370764, "learning_rate": 1.4909212842764064e-05, "loss": 0.0377, "step": 3421 }, { "epoch": 2.525461254612546, "grad_norm": 0.13537745742178908, "learning_rate": 1.4864128713043313e-05, "loss": 0.0168, "step": 3422 }, { "epoch": 2.5261992619926197, "grad_norm": 0.3763732738195972, "learning_rate": 1.4819107378298923e-05, "loss": 0.0672, "step": 3423 }, { "epoch": 2.5269372693726937, "grad_norm": 0.16940939363277194, "learning_rate": 1.4774148871738014e-05, "loss": 0.0179, "step": 3424 }, { "epoch": 2.5276752767527677, "grad_norm": 0.25590137702463056, "learning_rate": 1.47292532265213e-05, "loss": 0.0268, "step": 3425 }, { "epoch": 2.5284132841328413, "grad_norm": 0.11603480560881858, "learning_rate": 1.468442047576315e-05, "loss": 0.0157, "step": 3426 }, { "epoch": 2.5291512915129153, "grad_norm": 0.12526484784282454, "learning_rate": 1.4639650652531556e-05, "loss": 0.0092, "step": 3427 }, { "epoch": 2.529889298892989, "grad_norm": 0.5050455584053489, "learning_rate": 1.459494378984806e-05, "loss": 0.0165, "step": 3428 }, { "epoch": 2.530627306273063, "grad_norm": 0.22611480992101377, "learning_rate": 1.4550299920687838e-05, "loss": 0.0258, "step": 3429 }, { "epoch": 2.5313653136531364, "grad_norm": 0.1442269238789023, "learning_rate": 1.450571907797953e-05, "loss": 0.0213, "step": 3430 }, { "epoch": 2.5321033210332105, "grad_norm": 0.09763409024324714, "learning_rate": 1.446120129460532e-05, "loss": 0.0132, "step": 3431 }, { "epoch": 2.532841328413284, "grad_norm": 0.29157556831339504, "learning_rate": 1.4416746603400865e-05, "loss": 0.037, "step": 3432 }, { "epoch": 2.533579335793358, "grad_norm": 0.15090303067831576, "learning_rate": 1.4372355037155315e-05, "loss": 0.0188, "step": 3433 }, { "epoch": 2.5343173431734316, "grad_norm": 0.1895107897681443, "learning_rate": 1.4328026628611224e-05, "loss": 0.0326, "step": 3434 }, { "epoch": 2.5350553505535056, "grad_norm": 0.21683645312640984, "learning_rate": 1.4283761410464559e-05, "loss": 0.0452, "step": 3435 }, { "epoch": 2.535793357933579, "grad_norm": 0.3076260707806977, "learning_rate": 1.4239559415364757e-05, "loss": 0.0265, "step": 3436 }, { "epoch": 2.536531365313653, "grad_norm": 0.13703437736921795, "learning_rate": 1.4195420675914527e-05, "loss": 0.0183, "step": 3437 }, { "epoch": 2.5372693726937268, "grad_norm": 0.198601867419984, "learning_rate": 1.4151345224669966e-05, "loss": 0.0174, "step": 3438 }, { "epoch": 2.538007380073801, "grad_norm": 0.1616767558172505, "learning_rate": 1.4107333094140485e-05, "loss": 0.0273, "step": 3439 }, { "epoch": 2.538745387453875, "grad_norm": 0.12595719605651073, "learning_rate": 1.4063384316788775e-05, "loss": 0.019, "step": 3440 }, { "epoch": 2.5394833948339484, "grad_norm": 0.18714652599188356, "learning_rate": 1.401949892503084e-05, "loss": 0.0149, "step": 3441 }, { "epoch": 2.540221402214022, "grad_norm": 0.17220200069257546, "learning_rate": 1.3975676951235882e-05, "loss": 0.0123, "step": 3442 }, { "epoch": 2.540959409594096, "grad_norm": 0.18985196188573858, "learning_rate": 1.3931918427726365e-05, "loss": 0.0196, "step": 3443 }, { "epoch": 2.54169741697417, "grad_norm": 0.20250824551337104, "learning_rate": 1.388822338677791e-05, "loss": 0.0157, "step": 3444 }, { "epoch": 2.5424354243542435, "grad_norm": 0.2340359210865124, "learning_rate": 1.3844591860619383e-05, "loss": 0.0319, "step": 3445 }, { "epoch": 2.543173431734317, "grad_norm": 0.17592265638691304, "learning_rate": 1.3801023881432761e-05, "loss": 0.0248, "step": 3446 }, { "epoch": 2.543911439114391, "grad_norm": 0.1428399481023847, "learning_rate": 1.3757519481353088e-05, "loss": 0.0094, "step": 3447 }, { "epoch": 2.544649446494465, "grad_norm": 0.07516501998691129, "learning_rate": 1.3714078692468634e-05, "loss": 0.0083, "step": 3448 }, { "epoch": 2.5453874538745387, "grad_norm": 0.197257148978408, "learning_rate": 1.3670701546820663e-05, "loss": 0.0153, "step": 3449 }, { "epoch": 2.5461254612546127, "grad_norm": 0.247475839615643, "learning_rate": 1.3627388076403547e-05, "loss": 0.0446, "step": 3450 }, { "epoch": 2.5468634686346863, "grad_norm": 0.24752997893669645, "learning_rate": 1.3584138313164652e-05, "loss": 0.0166, "step": 3451 }, { "epoch": 2.5476014760147603, "grad_norm": 0.1639935796632329, "learning_rate": 1.354095228900435e-05, "loss": 0.0178, "step": 3452 }, { "epoch": 2.548339483394834, "grad_norm": 0.4040705251063059, "learning_rate": 1.3497830035776082e-05, "loss": 0.0533, "step": 3453 }, { "epoch": 2.549077490774908, "grad_norm": 0.15864473118837463, "learning_rate": 1.3454771585286152e-05, "loss": 0.0334, "step": 3454 }, { "epoch": 2.5498154981549814, "grad_norm": 0.29932872668085936, "learning_rate": 1.3411776969293854e-05, "loss": 0.0287, "step": 3455 }, { "epoch": 2.5505535055350554, "grad_norm": 0.24243073225700026, "learning_rate": 1.3368846219511366e-05, "loss": 0.0161, "step": 3456 }, { "epoch": 2.551291512915129, "grad_norm": 0.12698623698423828, "learning_rate": 1.3325979367603825e-05, "loss": 0.0468, "step": 3457 }, { "epoch": 2.552029520295203, "grad_norm": 0.15654806953662867, "learning_rate": 1.3283176445189193e-05, "loss": 0.0136, "step": 3458 }, { "epoch": 2.5527675276752766, "grad_norm": 0.18609180778978762, "learning_rate": 1.324043748383823e-05, "loss": 0.0216, "step": 3459 }, { "epoch": 2.5535055350553506, "grad_norm": 0.13651580668958574, "learning_rate": 1.3197762515074618e-05, "loss": 0.0285, "step": 3460 }, { "epoch": 2.554243542435424, "grad_norm": 0.1911686281036351, "learning_rate": 1.3155151570374758e-05, "loss": 0.0327, "step": 3461 }, { "epoch": 2.554981549815498, "grad_norm": 0.23464076580759338, "learning_rate": 1.3112604681167928e-05, "loss": 0.0322, "step": 3462 }, { "epoch": 2.555719557195572, "grad_norm": 0.10474602561839734, "learning_rate": 1.3070121878835995e-05, "loss": 0.0119, "step": 3463 }, { "epoch": 2.5564575645756458, "grad_norm": 0.12047295215453382, "learning_rate": 1.3027703194713714e-05, "loss": 0.0141, "step": 3464 }, { "epoch": 2.5571955719557193, "grad_norm": 0.19709633086354197, "learning_rate": 1.2985348660088492e-05, "loss": 0.0145, "step": 3465 }, { "epoch": 2.5579335793357934, "grad_norm": 0.25875373433417054, "learning_rate": 1.2943058306200394e-05, "loss": 0.0347, "step": 3466 }, { "epoch": 2.5586715867158674, "grad_norm": 0.1266912212264831, "learning_rate": 1.2900832164242183e-05, "loss": 0.0143, "step": 3467 }, { "epoch": 2.559409594095941, "grad_norm": 0.11117232606353668, "learning_rate": 1.2858670265359207e-05, "loss": 0.0181, "step": 3468 }, { "epoch": 2.5601476014760145, "grad_norm": 0.17766296583613528, "learning_rate": 1.2816572640649516e-05, "loss": 0.021, "step": 3469 }, { "epoch": 2.5608856088560885, "grad_norm": 0.11622324081077953, "learning_rate": 1.2774539321163692e-05, "loss": 0.0113, "step": 3470 }, { "epoch": 2.5616236162361625, "grad_norm": 0.2200223632688348, "learning_rate": 1.2732570337904892e-05, "loss": 0.0231, "step": 3471 }, { "epoch": 2.562361623616236, "grad_norm": 0.11476958757136488, "learning_rate": 1.269066572182882e-05, "loss": 0.009, "step": 3472 }, { "epoch": 2.56309963099631, "grad_norm": 0.203612970917563, "learning_rate": 1.2648825503843686e-05, "loss": 0.029, "step": 3473 }, { "epoch": 2.5638376383763837, "grad_norm": 0.18401277820433148, "learning_rate": 1.2607049714810303e-05, "loss": 0.0317, "step": 3474 }, { "epoch": 2.5645756457564577, "grad_norm": 0.14744746472411288, "learning_rate": 1.2565338385541792e-05, "loss": 0.0175, "step": 3475 }, { "epoch": 2.5653136531365313, "grad_norm": 0.19307409681603305, "learning_rate": 1.2523691546803873e-05, "loss": 0.0128, "step": 3476 }, { "epoch": 2.5660516605166053, "grad_norm": 0.32472799185871953, "learning_rate": 1.2482109229314621e-05, "loss": 0.0247, "step": 3477 }, { "epoch": 2.566789667896679, "grad_norm": 0.11503458431931653, "learning_rate": 1.24405914637446e-05, "loss": 0.0185, "step": 3478 }, { "epoch": 2.567527675276753, "grad_norm": 0.4121197924426362, "learning_rate": 1.239913828071665e-05, "loss": 0.0276, "step": 3479 }, { "epoch": 2.5682656826568264, "grad_norm": 0.29983104260336013, "learning_rate": 1.2357749710806032e-05, "loss": 0.03, "step": 3480 }, { "epoch": 2.5690036900369004, "grad_norm": 0.18729837624496887, "learning_rate": 1.2316425784540398e-05, "loss": 0.0232, "step": 3481 }, { "epoch": 2.5697416974169744, "grad_norm": 0.18301474016262972, "learning_rate": 1.227516653239964e-05, "loss": 0.0179, "step": 3482 }, { "epoch": 2.570479704797048, "grad_norm": 0.10116977104633654, "learning_rate": 1.2233971984815984e-05, "loss": 0.0097, "step": 3483 }, { "epoch": 2.5712177121771216, "grad_norm": 0.33851617180283267, "learning_rate": 1.2192842172173913e-05, "loss": 0.0448, "step": 3484 }, { "epoch": 2.5719557195571956, "grad_norm": 0.2640238753599735, "learning_rate": 1.2151777124810215e-05, "loss": 0.0242, "step": 3485 }, { "epoch": 2.5726937269372696, "grad_norm": 0.18129985628582337, "learning_rate": 1.2110776873013862e-05, "loss": 0.0212, "step": 3486 }, { "epoch": 2.573431734317343, "grad_norm": 0.2647971500084496, "learning_rate": 1.2069841447025998e-05, "loss": 0.0301, "step": 3487 }, { "epoch": 2.5741697416974167, "grad_norm": 0.6913713487849497, "learning_rate": 1.2028970877040047e-05, "loss": 0.0744, "step": 3488 }, { "epoch": 2.5749077490774908, "grad_norm": 0.1785048347606724, "learning_rate": 1.1988165193201496e-05, "loss": 0.0199, "step": 3489 }, { "epoch": 2.5756457564575648, "grad_norm": 0.17773310830998437, "learning_rate": 1.1947424425608088e-05, "loss": 0.0345, "step": 3490 }, { "epoch": 2.5763837638376383, "grad_norm": 0.26728873103467554, "learning_rate": 1.1906748604309548e-05, "loss": 0.0222, "step": 3491 }, { "epoch": 2.577121771217712, "grad_norm": 0.15947737131995238, "learning_rate": 1.1866137759307816e-05, "loss": 0.0157, "step": 3492 }, { "epoch": 2.577859778597786, "grad_norm": 0.07980867161768172, "learning_rate": 1.1825591920556855e-05, "loss": 0.0079, "step": 3493 }, { "epoch": 2.57859778597786, "grad_norm": 0.41146975513559325, "learning_rate": 1.1785111117962665e-05, "loss": 0.0852, "step": 3494 }, { "epoch": 2.5793357933579335, "grad_norm": 0.21898562114082445, "learning_rate": 1.1744695381383297e-05, "loss": 0.0162, "step": 3495 }, { "epoch": 2.5800738007380075, "grad_norm": 0.15012096036547568, "learning_rate": 1.1704344740628803e-05, "loss": 0.0186, "step": 3496 }, { "epoch": 2.580811808118081, "grad_norm": 0.1891565562607102, "learning_rate": 1.1664059225461255e-05, "loss": 0.0218, "step": 3497 }, { "epoch": 2.581549815498155, "grad_norm": 0.1713201559053694, "learning_rate": 1.1623838865594639e-05, "loss": 0.0676, "step": 3498 }, { "epoch": 2.5822878228782287, "grad_norm": 0.18833872984827268, "learning_rate": 1.1583683690694925e-05, "loss": 0.0495, "step": 3499 }, { "epoch": 2.5830258302583027, "grad_norm": 0.11529917474952284, "learning_rate": 1.1543593730379954e-05, "loss": 0.017, "step": 3500 }, { "epoch": 2.5837638376383762, "grad_norm": 0.1489939171132524, "learning_rate": 1.1503569014219506e-05, "loss": 0.0177, "step": 3501 }, { "epoch": 2.5845018450184503, "grad_norm": 0.1809909974555379, "learning_rate": 1.1463609571735267e-05, "loss": 0.0166, "step": 3502 }, { "epoch": 2.585239852398524, "grad_norm": 0.19134377659711999, "learning_rate": 1.1423715432400661e-05, "loss": 0.0193, "step": 3503 }, { "epoch": 2.585977859778598, "grad_norm": 0.1616592151055066, "learning_rate": 1.1383886625641094e-05, "loss": 0.02, "step": 3504 }, { "epoch": 2.586715867158672, "grad_norm": 0.2753915168425404, "learning_rate": 1.1344123180833687e-05, "loss": 0.0389, "step": 3505 }, { "epoch": 2.5874538745387454, "grad_norm": 0.18341083328576674, "learning_rate": 1.1304425127307371e-05, "loss": 0.0187, "step": 3506 }, { "epoch": 2.588191881918819, "grad_norm": 0.25243523691597297, "learning_rate": 1.1264792494342857e-05, "loss": 0.0171, "step": 3507 }, { "epoch": 2.588929889298893, "grad_norm": 0.12845335507904745, "learning_rate": 1.1225225311172572e-05, "loss": 0.0195, "step": 3508 }, { "epoch": 2.589667896678967, "grad_norm": 0.2126297598049534, "learning_rate": 1.1185723606980747e-05, "loss": 0.0209, "step": 3509 }, { "epoch": 2.5904059040590406, "grad_norm": 0.2448310091712584, "learning_rate": 1.1146287410903223e-05, "loss": 0.1168, "step": 3510 }, { "epoch": 2.591143911439114, "grad_norm": 0.21202990103378755, "learning_rate": 1.1106916752027574e-05, "loss": 0.0224, "step": 3511 }, { "epoch": 2.591881918819188, "grad_norm": 0.2258613574067767, "learning_rate": 1.1067611659393017e-05, "loss": 0.0223, "step": 3512 }, { "epoch": 2.592619926199262, "grad_norm": 0.3594471288284067, "learning_rate": 1.1028372161990407e-05, "loss": 0.0291, "step": 3513 }, { "epoch": 2.5933579335793358, "grad_norm": 0.1165858060538867, "learning_rate": 1.0989198288762259e-05, "loss": 0.0133, "step": 3514 }, { "epoch": 2.5940959409594093, "grad_norm": 0.4239596733295635, "learning_rate": 1.0950090068602614e-05, "loss": 0.0382, "step": 3515 }, { "epoch": 2.5948339483394833, "grad_norm": 0.10121520795434545, "learning_rate": 1.0911047530357155e-05, "loss": 0.01, "step": 3516 }, { "epoch": 2.5955719557195573, "grad_norm": 0.12417856381141278, "learning_rate": 1.0872070702823033e-05, "loss": 0.0112, "step": 3517 }, { "epoch": 2.596309963099631, "grad_norm": 0.2570674931772926, "learning_rate": 1.0833159614749077e-05, "loss": 0.0158, "step": 3518 }, { "epoch": 2.597047970479705, "grad_norm": 0.13630007916717857, "learning_rate": 1.0794314294835473e-05, "loss": 0.0167, "step": 3519 }, { "epoch": 2.5977859778597785, "grad_norm": 0.09523791492536043, "learning_rate": 1.0755534771733955e-05, "loss": 0.0125, "step": 3520 }, { "epoch": 2.5985239852398525, "grad_norm": 0.26724128540012404, "learning_rate": 1.0716821074047767e-05, "loss": 0.0334, "step": 3521 }, { "epoch": 2.599261992619926, "grad_norm": 0.25500789684037467, "learning_rate": 1.0678173230331557e-05, "loss": 0.0263, "step": 3522 }, { "epoch": 2.6, "grad_norm": 0.4220217553590804, "learning_rate": 1.063959126909141e-05, "loss": 0.0557, "step": 3523 }, { "epoch": 2.6007380073800737, "grad_norm": 0.2807550562101159, "learning_rate": 1.0601075218784794e-05, "loss": 0.0231, "step": 3524 }, { "epoch": 2.6014760147601477, "grad_norm": 0.35728235527790575, "learning_rate": 1.0562625107820634e-05, "loss": 0.0224, "step": 3525 }, { "epoch": 2.6022140221402212, "grad_norm": 0.1610451061477112, "learning_rate": 1.052424096455914e-05, "loss": 0.015, "step": 3526 }, { "epoch": 2.6029520295202953, "grad_norm": 0.23409341570485392, "learning_rate": 1.04859228173119e-05, "loss": 0.0156, "step": 3527 }, { "epoch": 2.6036900369003693, "grad_norm": 0.22139129705676044, "learning_rate": 1.0447670694341827e-05, "loss": 0.0343, "step": 3528 }, { "epoch": 2.604428044280443, "grad_norm": 0.16829814339606936, "learning_rate": 1.0409484623863097e-05, "loss": 0.0179, "step": 3529 }, { "epoch": 2.6051660516605164, "grad_norm": 0.1610810631604792, "learning_rate": 1.0371364634041248e-05, "loss": 0.0197, "step": 3530 }, { "epoch": 2.6059040590405904, "grad_norm": 0.17089558489050943, "learning_rate": 1.0333310752993009e-05, "loss": 0.0196, "step": 3531 }, { "epoch": 2.6066420664206644, "grad_norm": 0.40440917456252506, "learning_rate": 1.029532300878635e-05, "loss": 0.0233, "step": 3532 }, { "epoch": 2.607380073800738, "grad_norm": 0.16085057877983006, "learning_rate": 1.0257401429440494e-05, "loss": 0.0068, "step": 3533 }, { "epoch": 2.6081180811808116, "grad_norm": 0.1567268631680671, "learning_rate": 1.0219546042925843e-05, "loss": 0.0425, "step": 3534 }, { "epoch": 2.6088560885608856, "grad_norm": 0.22857774543652284, "learning_rate": 1.0181756877163972e-05, "loss": 0.0244, "step": 3535 }, { "epoch": 2.6095940959409596, "grad_norm": 0.10725775081212455, "learning_rate": 1.014403396002761e-05, "loss": 0.0141, "step": 3536 }, { "epoch": 2.610332103321033, "grad_norm": 0.26531224176183177, "learning_rate": 1.0106377319340655e-05, "loss": 0.0279, "step": 3537 }, { "epoch": 2.611070110701107, "grad_norm": 0.1661833238808769, "learning_rate": 1.0068786982878087e-05, "loss": 0.0245, "step": 3538 }, { "epoch": 2.6118081180811807, "grad_norm": 0.16916677007700237, "learning_rate": 1.0031262978365974e-05, "loss": 0.0141, "step": 3539 }, { "epoch": 2.6125461254612548, "grad_norm": 0.21284495661602987, "learning_rate": 9.993805333481499e-06, "loss": 0.0212, "step": 3540 }, { "epoch": 2.6132841328413283, "grad_norm": 0.164046479810621, "learning_rate": 9.956414075852827e-06, "loss": 0.0106, "step": 3541 }, { "epoch": 2.6140221402214023, "grad_norm": 0.11507452684476568, "learning_rate": 9.919089233059265e-06, "loss": 0.0211, "step": 3542 }, { "epoch": 2.614760147601476, "grad_norm": 0.13008384984169435, "learning_rate": 9.881830832631045e-06, "loss": 0.0166, "step": 3543 }, { "epoch": 2.61549815498155, "grad_norm": 0.12320631245051514, "learning_rate": 9.844638902049419e-06, "loss": 0.0163, "step": 3544 }, { "epoch": 2.6162361623616235, "grad_norm": 0.278622853376671, "learning_rate": 9.807513468746587e-06, "loss": 0.0341, "step": 3545 }, { "epoch": 2.6169741697416975, "grad_norm": 0.19197805842623233, "learning_rate": 9.77045456010578e-06, "loss": 0.026, "step": 3546 }, { "epoch": 2.617712177121771, "grad_norm": 0.2102117930977618, "learning_rate": 9.733462203461097e-06, "loss": 0.0654, "step": 3547 }, { "epoch": 2.618450184501845, "grad_norm": 0.17805775524081624, "learning_rate": 9.696536426097503e-06, "loss": 0.0174, "step": 3548 }, { "epoch": 2.6191881918819186, "grad_norm": 0.12270722547937496, "learning_rate": 9.659677255250977e-06, "loss": 0.0152, "step": 3549 }, { "epoch": 2.6199261992619927, "grad_norm": 0.14114345034471362, "learning_rate": 9.622884718108272e-06, "loss": 0.0135, "step": 3550 }, { "epoch": 2.6206642066420667, "grad_norm": 0.10931725860780316, "learning_rate": 9.586158841807047e-06, "loss": 0.0058, "step": 3551 }, { "epoch": 2.6214022140221402, "grad_norm": 0.15768182956788132, "learning_rate": 9.549499653435745e-06, "loss": 0.0213, "step": 3552 }, { "epoch": 2.622140221402214, "grad_norm": 0.14083614337016967, "learning_rate": 9.51290718003368e-06, "loss": 0.0184, "step": 3553 }, { "epoch": 2.622878228782288, "grad_norm": 0.2668089110690295, "learning_rate": 9.47638144859091e-06, "loss": 0.0275, "step": 3554 }, { "epoch": 2.623616236162362, "grad_norm": 0.24678398442729202, "learning_rate": 9.439922486048292e-06, "loss": 0.0267, "step": 3555 }, { "epoch": 2.6243542435424354, "grad_norm": 0.11266295547168496, "learning_rate": 9.403530319297404e-06, "loss": 0.0095, "step": 3556 }, { "epoch": 2.625092250922509, "grad_norm": 0.14652152157973952, "learning_rate": 9.367204975180577e-06, "loss": 0.0193, "step": 3557 }, { "epoch": 2.625830258302583, "grad_norm": 0.14533910271137915, "learning_rate": 9.330946480490888e-06, "loss": 0.0202, "step": 3558 }, { "epoch": 2.626568265682657, "grad_norm": 0.2833394131782413, "learning_rate": 9.294754861972076e-06, "loss": 0.0186, "step": 3559 }, { "epoch": 2.6273062730627306, "grad_norm": 0.18396449848379332, "learning_rate": 9.25863014631848e-06, "loss": 0.0175, "step": 3560 }, { "epoch": 2.6280442804428046, "grad_norm": 0.17046193641005064, "learning_rate": 9.222572360175242e-06, "loss": 0.0166, "step": 3561 }, { "epoch": 2.628782287822878, "grad_norm": 0.19316948031436101, "learning_rate": 9.186581530137994e-06, "loss": 0.019, "step": 3562 }, { "epoch": 2.629520295202952, "grad_norm": 0.47381233110774995, "learning_rate": 9.150657682753127e-06, "loss": 0.0614, "step": 3563 }, { "epoch": 2.6302583025830257, "grad_norm": 0.20094579194899745, "learning_rate": 9.114800844517469e-06, "loss": 0.0368, "step": 3564 }, { "epoch": 2.6309963099630997, "grad_norm": 0.18333451695265096, "learning_rate": 9.079011041878538e-06, "loss": 0.019, "step": 3565 }, { "epoch": 2.6317343173431733, "grad_norm": 0.203934950324133, "learning_rate": 9.043288301234377e-06, "loss": 0.0227, "step": 3566 }, { "epoch": 2.6324723247232473, "grad_norm": 0.20411672131628328, "learning_rate": 9.007632648933528e-06, "loss": 0.0366, "step": 3567 }, { "epoch": 2.633210332103321, "grad_norm": 0.12039426367411325, "learning_rate": 8.972044111275113e-06, "loss": 0.0147, "step": 3568 }, { "epoch": 2.633948339483395, "grad_norm": 0.31609641957216905, "learning_rate": 8.936522714508678e-06, "loss": 0.0179, "step": 3569 }, { "epoch": 2.6346863468634685, "grad_norm": 0.12425796576416881, "learning_rate": 8.90106848483433e-06, "loss": 0.0188, "step": 3570 }, { "epoch": 2.6354243542435425, "grad_norm": 0.10032422494788962, "learning_rate": 8.865681448402575e-06, "loss": 0.0131, "step": 3571 }, { "epoch": 2.636162361623616, "grad_norm": 0.12661864028905723, "learning_rate": 8.830361631314377e-06, "loss": 0.0187, "step": 3572 }, { "epoch": 2.63690036900369, "grad_norm": 0.3291204471387356, "learning_rate": 8.795109059621109e-06, "loss": 0.0708, "step": 3573 }, { "epoch": 2.637638376383764, "grad_norm": 0.1865725466353189, "learning_rate": 8.759923759324539e-06, "loss": 0.0217, "step": 3574 }, { "epoch": 2.6383763837638377, "grad_norm": 0.12822724329703927, "learning_rate": 8.724805756376886e-06, "loss": 0.0127, "step": 3575 }, { "epoch": 2.639114391143911, "grad_norm": 0.44638933243945333, "learning_rate": 8.689755076680595e-06, "loss": 0.0738, "step": 3576 }, { "epoch": 2.6398523985239852, "grad_norm": 0.1633083514989677, "learning_rate": 8.654771746088608e-06, "loss": 0.0149, "step": 3577 }, { "epoch": 2.6405904059040592, "grad_norm": 0.1409868095001439, "learning_rate": 8.619855790404086e-06, "loss": 0.0242, "step": 3578 }, { "epoch": 2.641328413284133, "grad_norm": 0.16176339103295576, "learning_rate": 8.585007235380548e-06, "loss": 0.0266, "step": 3579 }, { "epoch": 2.6420664206642064, "grad_norm": 0.2886400426839416, "learning_rate": 8.55022610672177e-06, "loss": 0.0192, "step": 3580 }, { "epoch": 2.6428044280442804, "grad_norm": 0.26269361148628634, "learning_rate": 8.515512430081773e-06, "loss": 0.0204, "step": 3581 }, { "epoch": 2.6435424354243544, "grad_norm": 0.15378666423772844, "learning_rate": 8.480866231064898e-06, "loss": 0.0206, "step": 3582 }, { "epoch": 2.644280442804428, "grad_norm": 0.34217901917909505, "learning_rate": 8.446287535225683e-06, "loss": 0.1298, "step": 3583 }, { "epoch": 2.645018450184502, "grad_norm": 0.12164284805422482, "learning_rate": 8.411776368068835e-06, "loss": 0.0114, "step": 3584 }, { "epoch": 2.6457564575645756, "grad_norm": 0.2603258988718242, "learning_rate": 8.377332755049294e-06, "loss": 0.048, "step": 3585 }, { "epoch": 2.6464944649446496, "grad_norm": 0.31656494503798355, "learning_rate": 8.342956721572193e-06, "loss": 0.0277, "step": 3586 }, { "epoch": 2.647232472324723, "grad_norm": 0.305465937099564, "learning_rate": 8.308648292992793e-06, "loss": 0.0246, "step": 3587 }, { "epoch": 2.647970479704797, "grad_norm": 0.14160270763032692, "learning_rate": 8.274407494616432e-06, "loss": 0.0158, "step": 3588 }, { "epoch": 2.6487084870848707, "grad_norm": 0.20698852101876442, "learning_rate": 8.240234351698694e-06, "loss": 0.0316, "step": 3589 }, { "epoch": 2.6494464944649447, "grad_norm": 0.15277861556335653, "learning_rate": 8.206128889445131e-06, "loss": 0.0212, "step": 3590 }, { "epoch": 2.6501845018450183, "grad_norm": 0.12328257192343868, "learning_rate": 8.172091133011506e-06, "loss": 0.0077, "step": 3591 }, { "epoch": 2.6509225092250923, "grad_norm": 0.5708400305368098, "learning_rate": 8.138121107503494e-06, "loss": 0.0448, "step": 3592 }, { "epoch": 2.6516605166051663, "grad_norm": 0.26884543571299274, "learning_rate": 8.10421883797694e-06, "loss": 0.0221, "step": 3593 }, { "epoch": 2.65239852398524, "grad_norm": 0.12997763888003802, "learning_rate": 8.070384349437655e-06, "loss": 0.0241, "step": 3594 }, { "epoch": 2.6531365313653135, "grad_norm": 0.12014349192058316, "learning_rate": 8.03661766684145e-06, "loss": 0.0205, "step": 3595 }, { "epoch": 2.6538745387453875, "grad_norm": 0.3569464785113816, "learning_rate": 8.002918815094152e-06, "loss": 0.06, "step": 3596 }, { "epoch": 2.6546125461254615, "grad_norm": 0.14451160006946037, "learning_rate": 7.969287819051508e-06, "loss": 0.0215, "step": 3597 }, { "epoch": 2.655350553505535, "grad_norm": 0.6209057202997168, "learning_rate": 7.93572470351931e-06, "loss": 0.0489, "step": 3598 }, { "epoch": 2.6560885608856086, "grad_norm": 0.10116054823656673, "learning_rate": 7.902229493253177e-06, "loss": 0.0162, "step": 3599 }, { "epoch": 2.6568265682656826, "grad_norm": 0.1715006275744975, "learning_rate": 7.868802212958703e-06, "loss": 0.0217, "step": 3600 }, { "epoch": 2.6575645756457567, "grad_norm": 0.21756887007105255, "learning_rate": 7.835442887291367e-06, "loss": 0.0299, "step": 3601 }, { "epoch": 2.6583025830258302, "grad_norm": 0.2861097472275615, "learning_rate": 7.802151540856496e-06, "loss": 0.038, "step": 3602 }, { "epoch": 2.659040590405904, "grad_norm": 0.23282543486864857, "learning_rate": 7.768928198209346e-06, "loss": 0.0328, "step": 3603 }, { "epoch": 2.659778597785978, "grad_norm": 0.6183875313001537, "learning_rate": 7.735772883854908e-06, "loss": 0.0625, "step": 3604 }, { "epoch": 2.660516605166052, "grad_norm": 0.4272708285954988, "learning_rate": 7.702685622248107e-06, "loss": 0.0175, "step": 3605 }, { "epoch": 2.6612546125461254, "grad_norm": 0.1549554703772098, "learning_rate": 7.6696664377936e-06, "loss": 0.0161, "step": 3606 }, { "epoch": 2.6619926199261994, "grad_norm": 0.1417728034453145, "learning_rate": 7.636715354845902e-06, "loss": 0.0151, "step": 3607 }, { "epoch": 2.662730627306273, "grad_norm": 0.19982429990596232, "learning_rate": 7.603832397709187e-06, "loss": 0.0131, "step": 3608 }, { "epoch": 2.663468634686347, "grad_norm": 0.12857099350763637, "learning_rate": 7.571017590637464e-06, "loss": 0.0127, "step": 3609 }, { "epoch": 2.6642066420664205, "grad_norm": 0.16489802535448705, "learning_rate": 7.5382709578344815e-06, "loss": 0.0221, "step": 3610 }, { "epoch": 2.6649446494464946, "grad_norm": 0.31163455824663033, "learning_rate": 7.505592523453653e-06, "loss": 0.0428, "step": 3611 }, { "epoch": 2.665682656826568, "grad_norm": 0.3159993190987622, "learning_rate": 7.47298231159812e-06, "loss": 0.0466, "step": 3612 }, { "epoch": 2.666420664206642, "grad_norm": 0.10273826245614932, "learning_rate": 7.440440346320709e-06, "loss": 0.0162, "step": 3613 }, { "epoch": 2.6671586715867157, "grad_norm": 0.2508625648235423, "learning_rate": 7.4079666516238765e-06, "loss": 0.0456, "step": 3614 }, { "epoch": 2.6678966789667897, "grad_norm": 0.2717374660011146, "learning_rate": 7.375561251459772e-06, "loss": 0.0299, "step": 3615 }, { "epoch": 2.6686346863468637, "grad_norm": 0.3691351951156122, "learning_rate": 7.343224169730134e-06, "loss": 0.0387, "step": 3616 }, { "epoch": 2.6693726937269373, "grad_norm": 0.21185443959982508, "learning_rate": 7.310955430286315e-06, "loss": 0.0198, "step": 3617 }, { "epoch": 2.670110701107011, "grad_norm": 0.2548512884079733, "learning_rate": 7.278755056929265e-06, "loss": 0.0348, "step": 3618 }, { "epoch": 2.670848708487085, "grad_norm": 0.12071530021277238, "learning_rate": 7.246623073409553e-06, "loss": 0.0191, "step": 3619 }, { "epoch": 2.671586715867159, "grad_norm": 0.18177127433657678, "learning_rate": 7.214559503427198e-06, "loss": 0.0235, "step": 3620 }, { "epoch": 2.6723247232472325, "grad_norm": 0.1434285424287746, "learning_rate": 7.182564370631839e-06, "loss": 0.0167, "step": 3621 }, { "epoch": 2.673062730627306, "grad_norm": 0.10797972601312063, "learning_rate": 7.150637698622653e-06, "loss": 0.0193, "step": 3622 }, { "epoch": 2.67380073800738, "grad_norm": 0.12508807567483055, "learning_rate": 7.118779510948259e-06, "loss": 0.0233, "step": 3623 }, { "epoch": 2.674538745387454, "grad_norm": 0.2702218763340325, "learning_rate": 7.086989831106794e-06, "loss": 0.0306, "step": 3624 }, { "epoch": 2.6752767527675276, "grad_norm": 0.24331663616952182, "learning_rate": 7.0552686825458455e-06, "loss": 0.0216, "step": 3625 }, { "epoch": 2.676014760147601, "grad_norm": 0.11364827208412548, "learning_rate": 7.02361608866251e-06, "loss": 0.0139, "step": 3626 }, { "epoch": 2.676752767527675, "grad_norm": 0.11225023343031731, "learning_rate": 6.992032072803267e-06, "loss": 0.0112, "step": 3627 }, { "epoch": 2.6774907749077492, "grad_norm": 0.23184189748751238, "learning_rate": 6.960516658264005e-06, "loss": 0.0184, "step": 3628 }, { "epoch": 2.678228782287823, "grad_norm": 0.23845345067197005, "learning_rate": 6.929069868290039e-06, "loss": 0.0271, "step": 3629 }, { "epoch": 2.678966789667897, "grad_norm": 0.09316373162635505, "learning_rate": 6.897691726076061e-06, "loss": 0.0176, "step": 3630 }, { "epoch": 2.6797047970479704, "grad_norm": 0.10422340708248325, "learning_rate": 6.866382254766157e-06, "loss": 0.018, "step": 3631 }, { "epoch": 2.6804428044280444, "grad_norm": 0.17297182913366635, "learning_rate": 6.83514147745371e-06, "loss": 0.022, "step": 3632 }, { "epoch": 2.681180811808118, "grad_norm": 0.09955364848690829, "learning_rate": 6.8039694171814776e-06, "loss": 0.0094, "step": 3633 }, { "epoch": 2.681918819188192, "grad_norm": 0.1799226593704204, "learning_rate": 6.772866096941499e-06, "loss": 0.0305, "step": 3634 }, { "epoch": 2.6826568265682655, "grad_norm": 0.1594254013330715, "learning_rate": 6.741831539675148e-06, "loss": 0.0228, "step": 3635 }, { "epoch": 2.6833948339483396, "grad_norm": 0.17532177593891207, "learning_rate": 6.710865768273044e-06, "loss": 0.0267, "step": 3636 }, { "epoch": 2.684132841328413, "grad_norm": 0.3140978422609889, "learning_rate": 6.679968805575077e-06, "loss": 0.0599, "step": 3637 }, { "epoch": 2.684870848708487, "grad_norm": 0.12807585344269606, "learning_rate": 6.649140674370436e-06, "loss": 0.0169, "step": 3638 }, { "epoch": 2.685608856088561, "grad_norm": 0.10661608488038903, "learning_rate": 6.618381397397477e-06, "loss": 0.0292, "step": 3639 }, { "epoch": 2.6863468634686347, "grad_norm": 0.19892581436595613, "learning_rate": 6.587690997343799e-06, "loss": 0.0237, "step": 3640 }, { "epoch": 2.6870848708487083, "grad_norm": 0.3374752015223856, "learning_rate": 6.557069496846191e-06, "loss": 0.0359, "step": 3641 }, { "epoch": 2.6878228782287823, "grad_norm": 0.3344458316823594, "learning_rate": 6.526516918490611e-06, "loss": 0.03, "step": 3642 }, { "epoch": 2.6885608856088563, "grad_norm": 0.11166911280914729, "learning_rate": 6.496033284812219e-06, "loss": 0.0088, "step": 3643 }, { "epoch": 2.68929889298893, "grad_norm": 0.18159178993423888, "learning_rate": 6.465618618295288e-06, "loss": 0.0222, "step": 3644 }, { "epoch": 2.6900369003690034, "grad_norm": 0.18540454821638386, "learning_rate": 6.435272941373227e-06, "loss": 0.018, "step": 3645 }, { "epoch": 2.6907749077490775, "grad_norm": 0.09719452653893856, "learning_rate": 6.404996276428566e-06, "loss": 0.0094, "step": 3646 }, { "epoch": 2.6915129151291515, "grad_norm": 0.15122528986849956, "learning_rate": 6.3747886457929394e-06, "loss": 0.0199, "step": 3647 }, { "epoch": 2.692250922509225, "grad_norm": 0.2100737358066136, "learning_rate": 6.3446500717470715e-06, "loss": 0.0383, "step": 3648 }, { "epoch": 2.692988929889299, "grad_norm": 0.31247858319695554, "learning_rate": 6.314580576520679e-06, "loss": 0.0248, "step": 3649 }, { "epoch": 2.6937269372693726, "grad_norm": 0.3257138699163694, "learning_rate": 6.284580182292632e-06, "loss": 0.0269, "step": 3650 }, { "epoch": 2.6944649446494466, "grad_norm": 0.20752100102322524, "learning_rate": 6.254648911190775e-06, "loss": 0.0186, "step": 3651 }, { "epoch": 2.69520295202952, "grad_norm": 0.17859485171756978, "learning_rate": 6.22478678529197e-06, "loss": 0.0197, "step": 3652 }, { "epoch": 2.695940959409594, "grad_norm": 0.2843577883832028, "learning_rate": 6.194993826622064e-06, "loss": 0.037, "step": 3653 }, { "epoch": 2.696678966789668, "grad_norm": 0.25571278507949835, "learning_rate": 6.1652700571559474e-06, "loss": 0.0242, "step": 3654 }, { "epoch": 2.697416974169742, "grad_norm": 0.16276326581097583, "learning_rate": 6.135615498817426e-06, "loss": 0.0149, "step": 3655 }, { "epoch": 2.6981549815498154, "grad_norm": 0.20311122014733707, "learning_rate": 6.10603017347926e-06, "loss": 0.0147, "step": 3656 }, { "epoch": 2.6988929889298894, "grad_norm": 0.16700018215492785, "learning_rate": 6.07651410296316e-06, "loss": 0.0199, "step": 3657 }, { "epoch": 2.699630996309963, "grad_norm": 0.10463148386731974, "learning_rate": 6.0470673090397335e-06, "loss": 0.0123, "step": 3658 }, { "epoch": 2.700369003690037, "grad_norm": 0.1343170858661748, "learning_rate": 6.01768981342854e-06, "loss": 0.013, "step": 3659 }, { "epoch": 2.7011070110701105, "grad_norm": 0.1688623062773538, "learning_rate": 5.988381637797957e-06, "loss": 0.0149, "step": 3660 }, { "epoch": 2.7018450184501845, "grad_norm": 0.21073413778680766, "learning_rate": 5.959142803765294e-06, "loss": 0.0196, "step": 3661 }, { "epoch": 2.7025830258302586, "grad_norm": 0.10174090365642412, "learning_rate": 5.929973332896677e-06, "loss": 0.0099, "step": 3662 }, { "epoch": 2.703321033210332, "grad_norm": 0.1487346561179246, "learning_rate": 5.900873246707062e-06, "loss": 0.0255, "step": 3663 }, { "epoch": 2.7040590405904057, "grad_norm": 0.11772282364784985, "learning_rate": 5.871842566660302e-06, "loss": 0.013, "step": 3664 }, { "epoch": 2.7047970479704797, "grad_norm": 0.20627457008110966, "learning_rate": 5.842881314168935e-06, "loss": 0.0231, "step": 3665 }, { "epoch": 2.7055350553505537, "grad_norm": 0.2773892587845547, "learning_rate": 5.813989510594409e-06, "loss": 0.0329, "step": 3666 }, { "epoch": 2.7062730627306273, "grad_norm": 0.3856567711682024, "learning_rate": 5.785167177246875e-06, "loss": 0.0477, "step": 3667 }, { "epoch": 2.707011070110701, "grad_norm": 0.24517336712396975, "learning_rate": 5.756414335385274e-06, "loss": 0.0234, "step": 3668 }, { "epoch": 2.707749077490775, "grad_norm": 0.09616630084637465, "learning_rate": 5.727731006217285e-06, "loss": 0.012, "step": 3669 }, { "epoch": 2.708487084870849, "grad_norm": 0.16106087572246838, "learning_rate": 5.699117210899285e-06, "loss": 0.0492, "step": 3670 }, { "epoch": 2.7092250922509225, "grad_norm": 0.22579555787863534, "learning_rate": 5.6705729705364255e-06, "loss": 0.0292, "step": 3671 }, { "epoch": 2.7099630996309965, "grad_norm": 0.13671341619518332, "learning_rate": 5.642098306182509e-06, "loss": 0.0223, "step": 3672 }, { "epoch": 2.71070110701107, "grad_norm": 0.09859389817569594, "learning_rate": 5.613693238840034e-06, "loss": 0.0091, "step": 3673 }, { "epoch": 2.711439114391144, "grad_norm": 0.15056522423245466, "learning_rate": 5.585357789460166e-06, "loss": 0.0157, "step": 3674 }, { "epoch": 2.7121771217712176, "grad_norm": 0.2298722172871924, "learning_rate": 5.557091978942697e-06, "loss": 0.0176, "step": 3675 }, { "epoch": 2.7129151291512916, "grad_norm": 0.2123508656515992, "learning_rate": 5.528895828136127e-06, "loss": 0.0227, "step": 3676 }, { "epoch": 2.713653136531365, "grad_norm": 0.21866210556489502, "learning_rate": 5.500769357837465e-06, "loss": 0.0213, "step": 3677 }, { "epoch": 2.714391143911439, "grad_norm": 0.17962186323684173, "learning_rate": 5.472712588792428e-06, "loss": 0.0459, "step": 3678 }, { "epoch": 2.7151291512915128, "grad_norm": 0.15567157527196485, "learning_rate": 5.4447255416952505e-06, "loss": 0.0294, "step": 3679 }, { "epoch": 2.715867158671587, "grad_norm": 0.2912125444533761, "learning_rate": 5.416808237188808e-06, "loss": 0.0397, "step": 3680 }, { "epoch": 2.7166051660516604, "grad_norm": 0.176390842063875, "learning_rate": 5.388960695864465e-06, "loss": 0.0222, "step": 3681 }, { "epoch": 2.7173431734317344, "grad_norm": 0.08316414885911941, "learning_rate": 5.361182938262155e-06, "loss": 0.021, "step": 3682 }, { "epoch": 2.718081180811808, "grad_norm": 0.21607930865238614, "learning_rate": 5.3334749848703794e-06, "loss": 0.0299, "step": 3683 }, { "epoch": 2.718819188191882, "grad_norm": 0.14976045612970015, "learning_rate": 5.3058368561261e-06, "loss": 0.0218, "step": 3684 }, { "epoch": 2.719557195571956, "grad_norm": 0.1554157568821293, "learning_rate": 5.278268572414802e-06, "loss": 0.0164, "step": 3685 }, { "epoch": 2.7202952029520295, "grad_norm": 0.3023720642559195, "learning_rate": 5.250770154070428e-06, "loss": 0.0722, "step": 3686 }, { "epoch": 2.721033210332103, "grad_norm": 0.2174488813412612, "learning_rate": 5.223341621375444e-06, "loss": 0.0149, "step": 3687 }, { "epoch": 2.721771217712177, "grad_norm": 0.15113521508407846, "learning_rate": 5.195982994560744e-06, "loss": 0.0129, "step": 3688 }, { "epoch": 2.722509225092251, "grad_norm": 0.24502872786245236, "learning_rate": 5.168694293805587e-06, "loss": 0.0273, "step": 3689 }, { "epoch": 2.7232472324723247, "grad_norm": 0.15109026488347174, "learning_rate": 5.1414755392377835e-06, "loss": 0.0182, "step": 3690 }, { "epoch": 2.7239852398523983, "grad_norm": 0.18111442259484747, "learning_rate": 5.114326750933452e-06, "loss": 0.0227, "step": 3691 }, { "epoch": 2.7247232472324723, "grad_norm": 0.1927467014439258, "learning_rate": 5.087247948917195e-06, "loss": 0.0192, "step": 3692 }, { "epoch": 2.7254612546125463, "grad_norm": 0.19262603486930668, "learning_rate": 5.060239153161872e-06, "loss": 0.0259, "step": 3693 }, { "epoch": 2.72619926199262, "grad_norm": 0.12596223470648715, "learning_rate": 5.033300383588823e-06, "loss": 0.0138, "step": 3694 }, { "epoch": 2.726937269372694, "grad_norm": 0.11672995343527624, "learning_rate": 5.006431660067679e-06, "loss": 0.0175, "step": 3695 }, { "epoch": 2.7276752767527674, "grad_norm": 0.43497723441863473, "learning_rate": 4.979633002416417e-06, "loss": 0.05, "step": 3696 }, { "epoch": 2.7284132841328415, "grad_norm": 0.14550228378088217, "learning_rate": 4.952904430401339e-06, "loss": 0.0267, "step": 3697 }, { "epoch": 2.729151291512915, "grad_norm": 0.22058840931486862, "learning_rate": 4.926245963737042e-06, "loss": 0.0369, "step": 3698 }, { "epoch": 2.729889298892989, "grad_norm": 0.12517305375470905, "learning_rate": 4.899657622086428e-06, "loss": 0.0113, "step": 3699 }, { "epoch": 2.7306273062730626, "grad_norm": 0.1516253826242098, "learning_rate": 4.87313942506068e-06, "loss": 0.0137, "step": 3700 }, { "epoch": 2.7313653136531366, "grad_norm": 0.11676131555541458, "learning_rate": 4.846691392219216e-06, "loss": 0.0198, "step": 3701 }, { "epoch": 2.73210332103321, "grad_norm": 0.12344094993910118, "learning_rate": 4.820313543069732e-06, "loss": 0.0111, "step": 3702 }, { "epoch": 2.732841328413284, "grad_norm": 0.11833286793280245, "learning_rate": 4.794005897068121e-06, "loss": 0.0225, "step": 3703 }, { "epoch": 2.7335793357933578, "grad_norm": 0.17985696175425694, "learning_rate": 4.767768473618562e-06, "loss": 0.0225, "step": 3704 }, { "epoch": 2.734317343173432, "grad_norm": 0.1188689795484356, "learning_rate": 4.741601292073339e-06, "loss": 0.0181, "step": 3705 }, { "epoch": 2.7350553505535053, "grad_norm": 0.3211947426319188, "learning_rate": 4.7155043717330374e-06, "loss": 0.0669, "step": 3706 }, { "epoch": 2.7357933579335794, "grad_norm": 0.13970653899358038, "learning_rate": 4.689477731846326e-06, "loss": 0.0169, "step": 3707 }, { "epoch": 2.7365313653136534, "grad_norm": 0.3473526539818314, "learning_rate": 4.663521391610115e-06, "loss": 0.0331, "step": 3708 }, { "epoch": 2.737269372693727, "grad_norm": 0.09949732641248504, "learning_rate": 4.63763537016938e-06, "loss": 0.0123, "step": 3709 }, { "epoch": 2.7380073800738005, "grad_norm": 0.10565817917280403, "learning_rate": 4.6118196866172804e-06, "loss": 0.0127, "step": 3710 }, { "epoch": 2.7387453874538745, "grad_norm": 0.24935862715593132, "learning_rate": 4.586074359995119e-06, "loss": 0.0148, "step": 3711 }, { "epoch": 2.7394833948339485, "grad_norm": 0.12092927176575563, "learning_rate": 4.560399409292238e-06, "loss": 0.0119, "step": 3712 }, { "epoch": 2.740221402214022, "grad_norm": 0.10418030578917757, "learning_rate": 4.534794853446134e-06, "loss": 0.0157, "step": 3713 }, { "epoch": 2.7409594095940957, "grad_norm": 0.19707382873374205, "learning_rate": 4.509260711342322e-06, "loss": 0.0344, "step": 3714 }, { "epoch": 2.7416974169741697, "grad_norm": 0.38055968179758665, "learning_rate": 4.483797001814438e-06, "loss": 0.075, "step": 3715 }, { "epoch": 2.7424354243542437, "grad_norm": 0.09216643462260199, "learning_rate": 4.458403743644135e-06, "loss": 0.0101, "step": 3716 }, { "epoch": 2.7431734317343173, "grad_norm": 0.2550259083434673, "learning_rate": 4.433080955561109e-06, "loss": 0.0195, "step": 3717 }, { "epoch": 2.7439114391143913, "grad_norm": 0.07980363048717061, "learning_rate": 4.407828656243085e-06, "loss": 0.0106, "step": 3718 }, { "epoch": 2.744649446494465, "grad_norm": 0.17202687921680568, "learning_rate": 4.3826468643157755e-06, "loss": 0.0465, "step": 3719 }, { "epoch": 2.745387453874539, "grad_norm": 0.22860784783793153, "learning_rate": 4.357535598352936e-06, "loss": 0.021, "step": 3720 }, { "epoch": 2.7461254612546124, "grad_norm": 0.34849067294822905, "learning_rate": 4.332494876876225e-06, "loss": 0.0272, "step": 3721 }, { "epoch": 2.7468634686346864, "grad_norm": 0.19429335941859846, "learning_rate": 4.307524718355327e-06, "loss": 0.0336, "step": 3722 }, { "epoch": 2.74760147601476, "grad_norm": 0.32002394817621266, "learning_rate": 4.2826251412078855e-06, "loss": 0.0295, "step": 3723 }, { "epoch": 2.748339483394834, "grad_norm": 0.41473834048474434, "learning_rate": 4.257796163799455e-06, "loss": 0.0446, "step": 3724 }, { "epoch": 2.7490774907749076, "grad_norm": 0.18245059839905264, "learning_rate": 4.23303780444353e-06, "loss": 0.0387, "step": 3725 }, { "epoch": 2.7498154981549816, "grad_norm": 0.37560021822867456, "learning_rate": 4.208350081401491e-06, "loss": 0.0246, "step": 3726 }, { "epoch": 2.7505535055350556, "grad_norm": 0.3459144991361729, "learning_rate": 4.183733012882685e-06, "loss": 0.034, "step": 3727 }, { "epoch": 2.751291512915129, "grad_norm": 0.06750854201062248, "learning_rate": 4.159186617044275e-06, "loss": 0.0069, "step": 3728 }, { "epoch": 2.7520295202952028, "grad_norm": 0.26400316854306943, "learning_rate": 4.134710911991324e-06, "loss": 0.0259, "step": 3729 }, { "epoch": 2.7527675276752768, "grad_norm": 0.3244937940890163, "learning_rate": 4.110305915776769e-06, "loss": 0.0482, "step": 3730 }, { "epoch": 2.753505535055351, "grad_norm": 0.24842636701959533, "learning_rate": 4.085971646401343e-06, "loss": 0.0264, "step": 3731 }, { "epoch": 2.7542435424354244, "grad_norm": 0.06778345353416969, "learning_rate": 4.061708121813701e-06, "loss": 0.0086, "step": 3732 }, { "epoch": 2.754981549815498, "grad_norm": 0.11393436550711528, "learning_rate": 4.037515359910238e-06, "loss": 0.0205, "step": 3733 }, { "epoch": 2.755719557195572, "grad_norm": 0.37681687840046113, "learning_rate": 4.01339337853518e-06, "loss": 0.0201, "step": 3734 }, { "epoch": 2.756457564575646, "grad_norm": 0.21416886669682617, "learning_rate": 3.98934219548055e-06, "loss": 0.0258, "step": 3735 }, { "epoch": 2.7571955719557195, "grad_norm": 0.11622166606186744, "learning_rate": 3.965361828486147e-06, "loss": 0.015, "step": 3736 }, { "epoch": 2.757933579335793, "grad_norm": 0.3301167781251834, "learning_rate": 3.941452295239556e-06, "loss": 0.041, "step": 3737 }, { "epoch": 2.758671586715867, "grad_norm": 0.09619071189537738, "learning_rate": 3.91761361337607e-06, "loss": 0.0124, "step": 3738 }, { "epoch": 2.759409594095941, "grad_norm": 0.23525231060196744, "learning_rate": 3.8938458004787795e-06, "loss": 0.0187, "step": 3739 }, { "epoch": 2.7601476014760147, "grad_norm": 0.25394784580962976, "learning_rate": 3.870148874078472e-06, "loss": 0.0376, "step": 3740 }, { "epoch": 2.7608856088560887, "grad_norm": 0.10461999713724021, "learning_rate": 3.846522851653645e-06, "loss": 0.0079, "step": 3741 }, { "epoch": 2.7616236162361623, "grad_norm": 0.4463884487743026, "learning_rate": 3.8229677506305125e-06, "loss": 0.0458, "step": 3742 }, { "epoch": 2.7623616236162363, "grad_norm": 0.22351833584670314, "learning_rate": 3.799483588382968e-06, "loss": 0.017, "step": 3743 }, { "epoch": 2.76309963099631, "grad_norm": 0.21650734102763128, "learning_rate": 3.7760703822325996e-06, "loss": 0.0301, "step": 3744 }, { "epoch": 2.763837638376384, "grad_norm": 0.13327024841438945, "learning_rate": 3.7527281494486475e-06, "loss": 0.0143, "step": 3745 }, { "epoch": 2.7645756457564574, "grad_norm": 0.16384716630002916, "learning_rate": 3.7294569072479855e-06, "loss": 0.035, "step": 3746 }, { "epoch": 2.7653136531365314, "grad_norm": 0.17586304278314777, "learning_rate": 3.7062566727951496e-06, "loss": 0.0145, "step": 3747 }, { "epoch": 2.766051660516605, "grad_norm": 0.39468233509953166, "learning_rate": 3.683127463202296e-06, "loss": 0.0578, "step": 3748 }, { "epoch": 2.766789667896679, "grad_norm": 0.12160981154114495, "learning_rate": 3.6600692955292114e-06, "loss": 0.0197, "step": 3749 }, { "epoch": 2.767527675276753, "grad_norm": 0.14316841002529632, "learning_rate": 3.637082186783225e-06, "loss": 0.0141, "step": 3750 }, { "epoch": 2.7682656826568266, "grad_norm": 0.19129614320490412, "learning_rate": 3.6141661539193183e-06, "loss": 0.0189, "step": 3751 }, { "epoch": 2.7690036900369, "grad_norm": 0.32521423833047813, "learning_rate": 3.5913212138400153e-06, "loss": 0.0321, "step": 3752 }, { "epoch": 2.769741697416974, "grad_norm": 0.09165139300637669, "learning_rate": 3.568547383395404e-06, "loss": 0.0143, "step": 3753 }, { "epoch": 2.770479704797048, "grad_norm": 0.25472246892076694, "learning_rate": 3.5458446793831367e-06, "loss": 0.0447, "step": 3754 }, { "epoch": 2.7712177121771218, "grad_norm": 0.21222743848926295, "learning_rate": 3.5232131185484076e-06, "loss": 0.0115, "step": 3755 }, { "epoch": 2.7719557195571953, "grad_norm": 0.2400153966358625, "learning_rate": 3.5006527175839078e-06, "loss": 0.0174, "step": 3756 }, { "epoch": 2.7726937269372693, "grad_norm": 0.3194834146805711, "learning_rate": 3.4781634931298714e-06, "loss": 0.0482, "step": 3757 }, { "epoch": 2.7734317343173434, "grad_norm": 0.21027858956251164, "learning_rate": 3.455745461774018e-06, "loss": 0.0288, "step": 3758 }, { "epoch": 2.774169741697417, "grad_norm": 0.25462215881500566, "learning_rate": 3.433398640051555e-06, "loss": 0.0388, "step": 3759 }, { "epoch": 2.774907749077491, "grad_norm": 0.19271551411061738, "learning_rate": 3.4111230444451857e-06, "loss": 0.0312, "step": 3760 }, { "epoch": 2.7756457564575645, "grad_norm": 0.18168008480713416, "learning_rate": 3.388918691385057e-06, "loss": 0.0259, "step": 3761 }, { "epoch": 2.7763837638376385, "grad_norm": 0.8990513666417568, "learning_rate": 3.366785597248767e-06, "loss": 0.0986, "step": 3762 }, { "epoch": 2.777121771217712, "grad_norm": 0.13510226646697826, "learning_rate": 3.3447237783613807e-06, "loss": 0.0137, "step": 3763 }, { "epoch": 2.777859778597786, "grad_norm": 0.6654626168568285, "learning_rate": 3.322733250995347e-06, "loss": 0.0444, "step": 3764 }, { "epoch": 2.7785977859778597, "grad_norm": 0.3699621575617719, "learning_rate": 3.3008140313705917e-06, "loss": 0.0602, "step": 3765 }, { "epoch": 2.7793357933579337, "grad_norm": 0.09242777858861688, "learning_rate": 3.278966135654382e-06, "loss": 0.0125, "step": 3766 }, { "epoch": 2.7800738007380073, "grad_norm": 0.2261436992092219, "learning_rate": 3.2571895799614285e-06, "loss": 0.0305, "step": 3767 }, { "epoch": 2.7808118081180813, "grad_norm": 0.331011931008004, "learning_rate": 3.235484380353793e-06, "loss": 0.0325, "step": 3768 }, { "epoch": 2.781549815498155, "grad_norm": 0.2433084239915278, "learning_rate": 3.2138505528409136e-06, "loss": 0.0254, "step": 3769 }, { "epoch": 2.782287822878229, "grad_norm": 0.19847247054262268, "learning_rate": 3.1922881133795825e-06, "loss": 0.0231, "step": 3770 }, { "epoch": 2.7830258302583024, "grad_norm": 0.17923335898642587, "learning_rate": 3.1707970778739217e-06, "loss": 0.0195, "step": 3771 }, { "epoch": 2.7837638376383764, "grad_norm": 0.17846576316373006, "learning_rate": 3.149377462175451e-06, "loss": 0.0197, "step": 3772 }, { "epoch": 2.7845018450184504, "grad_norm": 0.1211439945319876, "learning_rate": 3.128029282082945e-06, "loss": 0.009, "step": 3773 }, { "epoch": 2.785239852398524, "grad_norm": 0.13004296013712086, "learning_rate": 3.106752553342496e-06, "loss": 0.0146, "step": 3774 }, { "epoch": 2.7859778597785976, "grad_norm": 0.20136854007556076, "learning_rate": 3.085547291647528e-06, "loss": 0.0322, "step": 3775 }, { "epoch": 2.7867158671586716, "grad_norm": 0.25577073986099746, "learning_rate": 3.0644135126387087e-06, "loss": 0.0269, "step": 3776 }, { "epoch": 2.7874538745387456, "grad_norm": 0.47864697845152376, "learning_rate": 3.0433512319040456e-06, "loss": 0.0376, "step": 3777 }, { "epoch": 2.788191881918819, "grad_norm": 0.2331921531165614, "learning_rate": 3.022360464978724e-06, "loss": 0.039, "step": 3778 }, { "epoch": 2.7889298892988927, "grad_norm": 0.11013466729575135, "learning_rate": 3.0014412273452586e-06, "loss": 0.015, "step": 3779 }, { "epoch": 2.7896678966789668, "grad_norm": 0.1868343442961323, "learning_rate": 2.9805935344333403e-06, "loss": 0.0236, "step": 3780 }, { "epoch": 2.7904059040590408, "grad_norm": 0.18119636701872327, "learning_rate": 2.9598174016199798e-06, "loss": 0.0319, "step": 3781 }, { "epoch": 2.7911439114391143, "grad_norm": 0.2115407734429676, "learning_rate": 2.9391128442293083e-06, "loss": 0.0178, "step": 3782 }, { "epoch": 2.7918819188191883, "grad_norm": 0.20018002858386263, "learning_rate": 2.9184798775326984e-06, "loss": 0.015, "step": 3783 }, { "epoch": 2.792619926199262, "grad_norm": 0.2168324992028329, "learning_rate": 2.8979185167487433e-06, "loss": 0.0323, "step": 3784 }, { "epoch": 2.793357933579336, "grad_norm": 0.11972258443185958, "learning_rate": 2.8774287770432007e-06, "loss": 0.0134, "step": 3785 }, { "epoch": 2.7940959409594095, "grad_norm": 0.1535288359555435, "learning_rate": 2.857010673529015e-06, "loss": 0.0166, "step": 3786 }, { "epoch": 2.7948339483394835, "grad_norm": 0.13042829041309636, "learning_rate": 2.8366642212662386e-06, "loss": 0.013, "step": 3787 }, { "epoch": 2.795571955719557, "grad_norm": 0.09521389316092564, "learning_rate": 2.816389435262168e-06, "loss": 0.0112, "step": 3788 }, { "epoch": 2.796309963099631, "grad_norm": 0.5023401135591089, "learning_rate": 2.7961863304711843e-06, "loss": 0.0189, "step": 3789 }, { "epoch": 2.7970479704797047, "grad_norm": 0.19105077951411706, "learning_rate": 2.7760549217947573e-06, "loss": 0.0162, "step": 3790 }, { "epoch": 2.7977859778597787, "grad_norm": 0.12306711091961108, "learning_rate": 2.7559952240815646e-06, "loss": 0.0171, "step": 3791 }, { "epoch": 2.7985239852398522, "grad_norm": 0.19011148507338316, "learning_rate": 2.736007252127326e-06, "loss": 0.011, "step": 3792 }, { "epoch": 2.7992619926199263, "grad_norm": 0.27123921995420014, "learning_rate": 2.7160910206749046e-06, "loss": 0.0345, "step": 3793 }, { "epoch": 2.8, "grad_norm": 0.1293785013116161, "learning_rate": 2.6962465444141716e-06, "loss": 0.0082, "step": 3794 }, { "epoch": 2.800738007380074, "grad_norm": 0.2573200103128409, "learning_rate": 2.676473837982174e-06, "loss": 0.0154, "step": 3795 }, { "epoch": 2.801476014760148, "grad_norm": 0.1302673575921616, "learning_rate": 2.6567729159629463e-06, "loss": 0.0176, "step": 3796 }, { "epoch": 2.8022140221402214, "grad_norm": 0.06816030761380627, "learning_rate": 2.6371437928876085e-06, "loss": 0.0059, "step": 3797 }, { "epoch": 2.802952029520295, "grad_norm": 0.16780784824036374, "learning_rate": 2.6175864832343134e-06, "loss": 0.0217, "step": 3798 }, { "epoch": 2.803690036900369, "grad_norm": 0.1352794997800779, "learning_rate": 2.5981010014282326e-06, "loss": 0.0114, "step": 3799 }, { "epoch": 2.804428044280443, "grad_norm": 0.4190912831395229, "learning_rate": 2.578687361841603e-06, "loss": 0.024, "step": 3800 }, { "epoch": 2.8051660516605166, "grad_norm": 0.5299773278212025, "learning_rate": 2.559345578793615e-06, "loss": 0.0363, "step": 3801 }, { "epoch": 2.80590405904059, "grad_norm": 0.17289236521847753, "learning_rate": 2.540075666550501e-06, "loss": 0.0171, "step": 3802 }, { "epoch": 2.806642066420664, "grad_norm": 0.2772572658100902, "learning_rate": 2.5208776393254696e-06, "loss": 0.0518, "step": 3803 }, { "epoch": 2.807380073800738, "grad_norm": 0.26590899242436017, "learning_rate": 2.5017515112786826e-06, "loss": 0.0428, "step": 3804 }, { "epoch": 2.8081180811808117, "grad_norm": 0.1325589842471044, "learning_rate": 2.4826972965173333e-06, "loss": 0.0187, "step": 3805 }, { "epoch": 2.8088560885608858, "grad_norm": 0.15977602626863716, "learning_rate": 2.4637150090954796e-06, "loss": 0.0241, "step": 3806 }, { "epoch": 2.8095940959409593, "grad_norm": 0.13287524728998668, "learning_rate": 2.4448046630142216e-06, "loss": 0.0404, "step": 3807 }, { "epoch": 2.8103321033210333, "grad_norm": 0.14130434706896422, "learning_rate": 2.4259662722215357e-06, "loss": 0.0115, "step": 3808 }, { "epoch": 2.811070110701107, "grad_norm": 0.38185923622562074, "learning_rate": 2.4071998506123626e-06, "loss": 0.033, "step": 3809 }, { "epoch": 2.811808118081181, "grad_norm": 0.12438727077166893, "learning_rate": 2.38850541202853e-06, "loss": 0.019, "step": 3810 }, { "epoch": 2.8125461254612545, "grad_norm": 0.12549374392256155, "learning_rate": 2.3698829702587633e-06, "loss": 0.02, "step": 3811 }, { "epoch": 2.8132841328413285, "grad_norm": 0.17310836028131837, "learning_rate": 2.351332539038731e-06, "loss": 0.0174, "step": 3812 }, { "epoch": 2.814022140221402, "grad_norm": 0.6128713524958658, "learning_rate": 2.3328541320509433e-06, "loss": 0.0498, "step": 3813 }, { "epoch": 2.814760147601476, "grad_norm": 0.14935091493218117, "learning_rate": 2.3144477629248207e-06, "loss": 0.0227, "step": 3814 }, { "epoch": 2.8154981549815496, "grad_norm": 0.10023424812805355, "learning_rate": 2.296113445236614e-06, "loss": 0.0117, "step": 3815 }, { "epoch": 2.8162361623616237, "grad_norm": 0.6411471866347482, "learning_rate": 2.2778511925094613e-06, "loss": 0.0314, "step": 3816 }, { "epoch": 2.8169741697416972, "grad_norm": 0.16868070133516824, "learning_rate": 2.259661018213333e-06, "loss": 0.0301, "step": 3817 }, { "epoch": 2.8177121771217712, "grad_norm": 0.17689639118075412, "learning_rate": 2.2415429357650398e-06, "loss": 0.0163, "step": 3818 }, { "epoch": 2.8184501845018453, "grad_norm": 0.10006551317305909, "learning_rate": 2.223496958528193e-06, "loss": 0.0094, "step": 3819 }, { "epoch": 2.819188191881919, "grad_norm": 0.290853271612824, "learning_rate": 2.205523099813267e-06, "loss": 0.0271, "step": 3820 }, { "epoch": 2.8199261992619924, "grad_norm": 0.17266092750420972, "learning_rate": 2.187621372877513e-06, "loss": 0.0276, "step": 3821 }, { "epoch": 2.8206642066420664, "grad_norm": 0.12980362474236926, "learning_rate": 2.1697917909249575e-06, "loss": 0.013, "step": 3822 }, { "epoch": 2.8214022140221404, "grad_norm": 0.3095017529871103, "learning_rate": 2.1520343671064815e-06, "loss": 0.0369, "step": 3823 }, { "epoch": 2.822140221402214, "grad_norm": 0.2048519046443386, "learning_rate": 2.1343491145196735e-06, "loss": 0.0177, "step": 3824 }, { "epoch": 2.8228782287822876, "grad_norm": 0.29076982769322907, "learning_rate": 2.1167360462089335e-06, "loss": 0.0389, "step": 3825 }, { "epoch": 2.8236162361623616, "grad_norm": 0.21249813743972676, "learning_rate": 2.0991951751653914e-06, "loss": 0.033, "step": 3826 }, { "epoch": 2.8243542435424356, "grad_norm": 0.09393638455391623, "learning_rate": 2.0817265143269316e-06, "loss": 0.0068, "step": 3827 }, { "epoch": 2.825092250922509, "grad_norm": 0.09386825050909399, "learning_rate": 2.0643300765782026e-06, "loss": 0.0094, "step": 3828 }, { "epoch": 2.825830258302583, "grad_norm": 0.18602328192618295, "learning_rate": 2.0470058747505516e-06, "loss": 0.0198, "step": 3829 }, { "epoch": 2.8265682656826567, "grad_norm": 0.20280232019744282, "learning_rate": 2.0297539216220683e-06, "loss": 0.0062, "step": 3830 }, { "epoch": 2.8273062730627307, "grad_norm": 0.13149691599220562, "learning_rate": 2.012574229917519e-06, "loss": 0.0134, "step": 3831 }, { "epoch": 2.8280442804428043, "grad_norm": 0.260497536883162, "learning_rate": 1.9954668123084107e-06, "loss": 0.0309, "step": 3832 }, { "epoch": 2.8287822878228783, "grad_norm": 0.1544739073758566, "learning_rate": 1.978431681412929e-06, "loss": 0.0199, "step": 3833 }, { "epoch": 2.829520295202952, "grad_norm": 0.5467364404386739, "learning_rate": 1.9614688497959333e-06, "loss": 0.0559, "step": 3834 }, { "epoch": 2.830258302583026, "grad_norm": 0.11515087220972499, "learning_rate": 1.944578329968949e-06, "loss": 0.0174, "step": 3835 }, { "epoch": 2.8309963099630995, "grad_norm": 0.1654374165729414, "learning_rate": 1.9277601343901997e-06, "loss": 0.0173, "step": 3836 }, { "epoch": 2.8317343173431735, "grad_norm": 0.15886821349061994, "learning_rate": 1.9110142754645177e-06, "loss": 0.0177, "step": 3837 }, { "epoch": 2.8324723247232475, "grad_norm": 0.1362304665089127, "learning_rate": 1.894340765543412e-06, "loss": 0.0069, "step": 3838 }, { "epoch": 2.833210332103321, "grad_norm": 0.1408289776680267, "learning_rate": 1.8777396169250228e-06, "loss": 0.019, "step": 3839 }, { "epoch": 2.8339483394833946, "grad_norm": 0.2780264124129058, "learning_rate": 1.8612108418541219e-06, "loss": 0.0316, "step": 3840 }, { "epoch": 2.8346863468634687, "grad_norm": 0.08750421096398586, "learning_rate": 1.8447544525220794e-06, "loss": 0.0132, "step": 3841 }, { "epoch": 2.8354243542435427, "grad_norm": 0.07980311241230623, "learning_rate": 1.8283704610668972e-06, "loss": 0.0124, "step": 3842 }, { "epoch": 2.8361623616236162, "grad_norm": 0.06253189670702652, "learning_rate": 1.8120588795731641e-06, "loss": 0.0045, "step": 3843 }, { "epoch": 2.83690036900369, "grad_norm": 0.11786601282534066, "learning_rate": 1.795819720072056e-06, "loss": 0.0241, "step": 3844 }, { "epoch": 2.837638376383764, "grad_norm": 0.1366596412223378, "learning_rate": 1.7796529945413587e-06, "loss": 0.0133, "step": 3845 }, { "epoch": 2.838376383763838, "grad_norm": 0.187551265526138, "learning_rate": 1.7635587149054112e-06, "loss": 0.0237, "step": 3846 }, { "epoch": 2.8391143911439114, "grad_norm": 0.25226965288646197, "learning_rate": 1.7475368930351067e-06, "loss": 0.0347, "step": 3847 }, { "epoch": 2.839852398523985, "grad_norm": 0.14206530465133815, "learning_rate": 1.7315875407479032e-06, "loss": 0.0912, "step": 3848 }, { "epoch": 2.840590405904059, "grad_norm": 0.19535633693312343, "learning_rate": 1.7157106698078352e-06, "loss": 0.0092, "step": 3849 }, { "epoch": 2.841328413284133, "grad_norm": 0.1521420618524522, "learning_rate": 1.6999062919254238e-06, "loss": 0.0406, "step": 3850 }, { "epoch": 2.8420664206642066, "grad_norm": 0.25917971504422416, "learning_rate": 1.6841744187577557e-06, "loss": 0.0224, "step": 3851 }, { "epoch": 2.8428044280442806, "grad_norm": 0.17235752693559597, "learning_rate": 1.6685150619084489e-06, "loss": 0.0368, "step": 3852 }, { "epoch": 2.843542435424354, "grad_norm": 0.30224780653301947, "learning_rate": 1.6529282329275974e-06, "loss": 0.0336, "step": 3853 }, { "epoch": 2.844280442804428, "grad_norm": 0.14527129088951377, "learning_rate": 1.6374139433118164e-06, "loss": 0.0244, "step": 3854 }, { "epoch": 2.8450184501845017, "grad_norm": 0.17828624240089078, "learning_rate": 1.6219722045042297e-06, "loss": 0.0251, "step": 3855 }, { "epoch": 2.8457564575645757, "grad_norm": 0.33459327771877645, "learning_rate": 1.6066030278944376e-06, "loss": 0.0413, "step": 3856 }, { "epoch": 2.8464944649446493, "grad_norm": 0.21373655176872966, "learning_rate": 1.5913064248185173e-06, "loss": 0.0363, "step": 3857 }, { "epoch": 2.8472324723247233, "grad_norm": 0.20847517138142577, "learning_rate": 1.5760824065590207e-06, "loss": 0.0192, "step": 3858 }, { "epoch": 2.847970479704797, "grad_norm": 0.09620733296929435, "learning_rate": 1.560930984344966e-06, "loss": 0.0109, "step": 3859 }, { "epoch": 2.848708487084871, "grad_norm": 0.36446887997008043, "learning_rate": 1.5458521693518023e-06, "loss": 0.0147, "step": 3860 }, { "epoch": 2.849446494464945, "grad_norm": 0.2058092686766045, "learning_rate": 1.5308459727014669e-06, "loss": 0.0166, "step": 3861 }, { "epoch": 2.8501845018450185, "grad_norm": 0.1390779014729248, "learning_rate": 1.5159124054623053e-06, "loss": 0.0249, "step": 3862 }, { "epoch": 2.850922509225092, "grad_norm": 0.08337388360461481, "learning_rate": 1.5010514786490958e-06, "loss": 0.0071, "step": 3863 }, { "epoch": 2.851660516605166, "grad_norm": 0.37727651445860094, "learning_rate": 1.4862632032230484e-06, "loss": 0.0269, "step": 3864 }, { "epoch": 2.85239852398524, "grad_norm": 0.2125960683420323, "learning_rate": 1.4715475900917598e-06, "loss": 0.0337, "step": 3865 }, { "epoch": 2.8531365313653136, "grad_norm": 0.1486800360489583, "learning_rate": 1.4569046501092697e-06, "loss": 0.0135, "step": 3866 }, { "epoch": 2.853874538745387, "grad_norm": 0.1874263444926136, "learning_rate": 1.4423343940759947e-06, "loss": 0.0603, "step": 3867 }, { "epoch": 2.8546125461254612, "grad_norm": 0.22106375921476196, "learning_rate": 1.4278368327387382e-06, "loss": 0.0393, "step": 3868 }, { "epoch": 2.8553505535055352, "grad_norm": 0.13974185321971408, "learning_rate": 1.4134119767906906e-06, "loss": 0.0151, "step": 3869 }, { "epoch": 2.856088560885609, "grad_norm": 0.3396021566859321, "learning_rate": 1.399059836871408e-06, "loss": 0.0444, "step": 3870 }, { "epoch": 2.856826568265683, "grad_norm": 0.11794061974419735, "learning_rate": 1.384780423566845e-06, "loss": 0.0099, "step": 3871 }, { "epoch": 2.8575645756457564, "grad_norm": 0.19492030107706138, "learning_rate": 1.370573747409254e-06, "loss": 0.0209, "step": 3872 }, { "epoch": 2.8583025830258304, "grad_norm": 0.36825594332932626, "learning_rate": 1.3564398188772975e-06, "loss": 0.0294, "step": 3873 }, { "epoch": 2.859040590405904, "grad_norm": 0.15126568312728156, "learning_rate": 1.3423786483959478e-06, "loss": 0.0188, "step": 3874 }, { "epoch": 2.859778597785978, "grad_norm": 0.2524896134152087, "learning_rate": 1.3283902463365082e-06, "loss": 0.0136, "step": 3875 }, { "epoch": 2.8605166051660516, "grad_norm": 0.1470512644324954, "learning_rate": 1.3144746230166372e-06, "loss": 0.0313, "step": 3876 }, { "epoch": 2.8612546125461256, "grad_norm": 0.1000828202437831, "learning_rate": 1.3006317887002794e-06, "loss": 0.0188, "step": 3877 }, { "epoch": 2.861992619926199, "grad_norm": 0.10096907005983995, "learning_rate": 1.2868617535977346e-06, "loss": 0.0121, "step": 3878 }, { "epoch": 2.862730627306273, "grad_norm": 0.2869475424219153, "learning_rate": 1.2731645278655445e-06, "loss": 0.0444, "step": 3879 }, { "epoch": 2.8634686346863467, "grad_norm": 0.09707241338346798, "learning_rate": 1.259540121606606e-06, "loss": 0.0121, "step": 3880 }, { "epoch": 2.8642066420664207, "grad_norm": 0.2054259837569282, "learning_rate": 1.2459885448700692e-06, "loss": 0.0255, "step": 3881 }, { "epoch": 2.8649446494464943, "grad_norm": 0.2478139747994922, "learning_rate": 1.2325098076513941e-06, "loss": 0.0307, "step": 3882 }, { "epoch": 2.8656826568265683, "grad_norm": 0.2662513030320209, "learning_rate": 1.2191039198922948e-06, "loss": 0.039, "step": 3883 }, { "epoch": 2.8664206642066423, "grad_norm": 0.14851944084406085, "learning_rate": 1.2057708914807398e-06, "loss": 0.0206, "step": 3884 }, { "epoch": 2.867158671586716, "grad_norm": 0.1715489018503992, "learning_rate": 1.1925107322510066e-06, "loss": 0.0245, "step": 3885 }, { "epoch": 2.8678966789667895, "grad_norm": 0.13820954966865762, "learning_rate": 1.1793234519835828e-06, "loss": 0.0205, "step": 3886 }, { "epoch": 2.8686346863468635, "grad_norm": 0.14300728344400854, "learning_rate": 1.166209060405199e-06, "loss": 0.0247, "step": 3887 }, { "epoch": 2.8693726937269375, "grad_norm": 0.17477393886592651, "learning_rate": 1.1531675671888619e-06, "loss": 0.0215, "step": 3888 }, { "epoch": 2.870110701107011, "grad_norm": 0.1718984542970949, "learning_rate": 1.1401989819537772e-06, "loss": 0.015, "step": 3889 }, { "epoch": 2.8708487084870846, "grad_norm": 0.13836871870644832, "learning_rate": 1.1273033142653821e-06, "loss": 0.007, "step": 3890 }, { "epoch": 2.8715867158671586, "grad_norm": 0.19601698646534957, "learning_rate": 1.1144805736353347e-06, "loss": 0.0287, "step": 3891 }, { "epoch": 2.8723247232472326, "grad_norm": 0.17572450148402916, "learning_rate": 1.1017307695215028e-06, "loss": 0.015, "step": 3892 }, { "epoch": 2.873062730627306, "grad_norm": 0.28597669937870274, "learning_rate": 1.0890539113279418e-06, "loss": 0.0208, "step": 3893 }, { "epoch": 2.8738007380073802, "grad_norm": 0.12871277442153783, "learning_rate": 1.07645000840495e-06, "loss": 0.0122, "step": 3894 }, { "epoch": 2.874538745387454, "grad_norm": 0.1667785637660987, "learning_rate": 1.0639190700489465e-06, "loss": 0.0124, "step": 3895 }, { "epoch": 2.875276752767528, "grad_norm": 0.195212221248094, "learning_rate": 1.0514611055025936e-06, "loss": 0.0287, "step": 3896 }, { "epoch": 2.8760147601476014, "grad_norm": 0.10932440098912462, "learning_rate": 1.0390761239546964e-06, "loss": 0.0181, "step": 3897 }, { "epoch": 2.8767527675276754, "grad_norm": 0.25220074225106326, "learning_rate": 1.0267641345402367e-06, "loss": 0.0252, "step": 3898 }, { "epoch": 2.877490774907749, "grad_norm": 0.20457180937098327, "learning_rate": 1.0145251463403505e-06, "loss": 0.0327, "step": 3899 }, { "epoch": 2.878228782287823, "grad_norm": 0.22933878853994766, "learning_rate": 1.0023591683823386e-06, "loss": 0.0237, "step": 3900 }, { "epoch": 2.8789667896678965, "grad_norm": 0.22819793114003373, "learning_rate": 9.902662096396564e-07, "loss": 0.0184, "step": 3901 }, { "epoch": 2.8797047970479706, "grad_norm": 0.23503664091783844, "learning_rate": 9.782462790318913e-07, "loss": 0.0326, "step": 3902 }, { "epoch": 2.880442804428044, "grad_norm": 0.1891170464527559, "learning_rate": 9.662993854247736e-07, "loss": 0.0186, "step": 3903 }, { "epoch": 2.881180811808118, "grad_norm": 0.12096812724486274, "learning_rate": 9.544255376301547e-07, "loss": 0.0164, "step": 3904 }, { "epoch": 2.8819188191881917, "grad_norm": 0.3215107992545522, "learning_rate": 9.426247444059954e-07, "loss": 0.024, "step": 3905 }, { "epoch": 2.8826568265682657, "grad_norm": 0.12757788648138432, "learning_rate": 9.308970144564111e-07, "loss": 0.026, "step": 3906 }, { "epoch": 2.8833948339483397, "grad_norm": 0.13953861356890382, "learning_rate": 9.192423564315933e-07, "loss": 0.014, "step": 3907 }, { "epoch": 2.8841328413284133, "grad_norm": 0.12102034372368901, "learning_rate": 9.076607789278435e-07, "loss": 0.0129, "step": 3908 }, { "epoch": 2.884870848708487, "grad_norm": 0.3125381444965218, "learning_rate": 8.96152290487573e-07, "loss": 0.0292, "step": 3909 }, { "epoch": 2.885608856088561, "grad_norm": 0.2573884122741035, "learning_rate": 8.847168995992916e-07, "loss": 0.0375, "step": 3910 }, { "epoch": 2.886346863468635, "grad_norm": 0.4586333444534214, "learning_rate": 8.733546146975414e-07, "loss": 0.0294, "step": 3911 }, { "epoch": 2.8870848708487085, "grad_norm": 0.21912266728083843, "learning_rate": 8.62065444162985e-07, "loss": 0.0578, "step": 3912 }, { "epoch": 2.887822878228782, "grad_norm": 0.10453402579899485, "learning_rate": 8.508493963223729e-07, "loss": 0.019, "step": 3913 }, { "epoch": 2.888560885608856, "grad_norm": 0.24047399146037837, "learning_rate": 8.397064794484877e-07, "loss": 0.038, "step": 3914 }, { "epoch": 2.88929889298893, "grad_norm": 0.20156419846345075, "learning_rate": 8.286367017601659e-07, "loss": 0.0241, "step": 3915 }, { "epoch": 2.8900369003690036, "grad_norm": 0.3331009925960031, "learning_rate": 8.17640071422332e-07, "loss": 0.0456, "step": 3916 }, { "epoch": 2.8907749077490776, "grad_norm": 0.28254183781425896, "learning_rate": 8.067165965459423e-07, "loss": 0.0161, "step": 3917 }, { "epoch": 2.891512915129151, "grad_norm": 0.17805814035124035, "learning_rate": 7.958662851879851e-07, "loss": 0.0198, "step": 3918 }, { "epoch": 2.892250922509225, "grad_norm": 0.12270221389609341, "learning_rate": 7.850891453514808e-07, "loss": 0.0107, "step": 3919 }, { "epoch": 2.892988929889299, "grad_norm": 0.1598380797862102, "learning_rate": 7.743851849855044e-07, "loss": 0.0119, "step": 3920 }, { "epoch": 2.893726937269373, "grad_norm": 0.2474702163775979, "learning_rate": 7.637544119851403e-07, "loss": 0.0302, "step": 3921 }, { "epoch": 2.8944649446494464, "grad_norm": 0.24044394985211112, "learning_rate": 7.531968341914941e-07, "loss": 0.028, "step": 3922 }, { "epoch": 2.8952029520295204, "grad_norm": 0.19919285084792865, "learning_rate": 7.427124593916701e-07, "loss": 0.0274, "step": 3923 }, { "epoch": 2.895940959409594, "grad_norm": 0.2663325680942599, "learning_rate": 7.323012953188047e-07, "loss": 0.0144, "step": 3924 }, { "epoch": 2.896678966789668, "grad_norm": 0.09684889577861386, "learning_rate": 7.219633496520107e-07, "loss": 0.0176, "step": 3925 }, { "epoch": 2.8974169741697415, "grad_norm": 0.1606739967403476, "learning_rate": 7.116986300163997e-07, "loss": 0.0117, "step": 3926 }, { "epoch": 2.8981549815498155, "grad_norm": 0.2758125672292223, "learning_rate": 7.015071439830934e-07, "loss": 0.0235, "step": 3927 }, { "epoch": 2.898892988929889, "grad_norm": 0.1130224624333915, "learning_rate": 6.913888990691675e-07, "loss": 0.0119, "step": 3928 }, { "epoch": 2.899630996309963, "grad_norm": 0.35577519996985413, "learning_rate": 6.813439027377077e-07, "loss": 0.0313, "step": 3929 }, { "epoch": 2.900369003690037, "grad_norm": 0.10582914298918827, "learning_rate": 6.713721623977542e-07, "loss": 0.0126, "step": 3930 }, { "epoch": 2.9011070110701107, "grad_norm": 0.13730562764286097, "learning_rate": 6.614736854043124e-07, "loss": 0.0084, "step": 3931 }, { "epoch": 2.9018450184501843, "grad_norm": 0.29968317825712903, "learning_rate": 6.516484790583533e-07, "loss": 0.041, "step": 3932 }, { "epoch": 2.9025830258302583, "grad_norm": 0.17321396986620766, "learning_rate": 6.418965506068019e-07, "loss": 0.0061, "step": 3933 }, { "epoch": 2.9033210332103323, "grad_norm": 0.19781216433802817, "learning_rate": 6.322179072425605e-07, "loss": 0.0252, "step": 3934 }, { "epoch": 2.904059040590406, "grad_norm": 0.29557642751016755, "learning_rate": 6.226125561044294e-07, "loss": 0.0232, "step": 3935 }, { "epoch": 2.9047970479704794, "grad_norm": 0.17072976617070357, "learning_rate": 6.130805042771859e-07, "loss": 0.0734, "step": 3936 }, { "epoch": 2.9055350553505535, "grad_norm": 0.07832791308353536, "learning_rate": 6.036217587915282e-07, "loss": 0.0087, "step": 3937 }, { "epoch": 2.9062730627306275, "grad_norm": 0.18971219037167275, "learning_rate": 5.942363266240869e-07, "loss": 0.0259, "step": 3938 }, { "epoch": 2.907011070110701, "grad_norm": 0.23749701285681382, "learning_rate": 5.849242146974355e-07, "loss": 0.0408, "step": 3939 }, { "epoch": 2.907749077490775, "grad_norm": 0.29926182548936964, "learning_rate": 5.756854298800352e-07, "loss": 0.0287, "step": 3940 }, { "epoch": 2.9084870848708486, "grad_norm": 0.25193484502396024, "learning_rate": 5.665199789862907e-07, "loss": 0.0209, "step": 3941 }, { "epoch": 2.9092250922509226, "grad_norm": 0.2074197890206099, "learning_rate": 5.574278687764944e-07, "loss": 0.0159, "step": 3942 }, { "epoch": 2.909963099630996, "grad_norm": 0.16652140688644412, "learning_rate": 5.484091059568597e-07, "loss": 0.0084, "step": 3943 }, { "epoch": 2.91070110701107, "grad_norm": 0.17039514893664176, "learning_rate": 5.394636971794987e-07, "loss": 0.0233, "step": 3944 }, { "epoch": 2.911439114391144, "grad_norm": 0.11831518229190562, "learning_rate": 5.305916490424001e-07, "loss": 0.0189, "step": 3945 }, { "epoch": 2.912177121771218, "grad_norm": 0.2704936228071665, "learning_rate": 5.217929680894739e-07, "loss": 0.0293, "step": 3946 }, { "epoch": 2.9129151291512914, "grad_norm": 0.2787132553389123, "learning_rate": 5.130676608104845e-07, "loss": 0.0484, "step": 3947 }, { "epoch": 2.9136531365313654, "grad_norm": 0.08745447499590679, "learning_rate": 5.04415733641106e-07, "loss": 0.0053, "step": 3948 }, { "epoch": 2.9143911439114394, "grad_norm": 0.17784987366858107, "learning_rate": 4.958371929628558e-07, "loss": 0.0218, "step": 3949 }, { "epoch": 2.915129151291513, "grad_norm": 0.15242789922397854, "learning_rate": 4.873320451031616e-07, "loss": 0.0184, "step": 3950 }, { "epoch": 2.9158671586715865, "grad_norm": 0.13124655450405168, "learning_rate": 4.789002963352828e-07, "loss": 0.0192, "step": 3951 }, { "epoch": 2.9166051660516605, "grad_norm": 0.17496163795303743, "learning_rate": 4.70541952878345e-07, "loss": 0.0189, "step": 3952 }, { "epoch": 2.9173431734317345, "grad_norm": 0.22230314939473175, "learning_rate": 4.622570208973609e-07, "loss": 0.0349, "step": 3953 }, { "epoch": 2.918081180811808, "grad_norm": 0.12348810682102257, "learning_rate": 4.5404550650317566e-07, "loss": 0.0106, "step": 3954 }, { "epoch": 2.9188191881918817, "grad_norm": 0.15335645943068055, "learning_rate": 4.459074157524556e-07, "loss": 0.0238, "step": 3955 }, { "epoch": 2.9195571955719557, "grad_norm": 0.22810279251511767, "learning_rate": 4.378427546477659e-07, "loss": 0.0241, "step": 3956 }, { "epoch": 2.9202952029520297, "grad_norm": 0.16173178674031044, "learning_rate": 4.298515291374705e-07, "loss": 0.02, "step": 3957 }, { "epoch": 2.9210332103321033, "grad_norm": 0.1443270862137707, "learning_rate": 4.2193374511577675e-07, "loss": 0.0265, "step": 3958 }, { "epoch": 2.921771217712177, "grad_norm": 0.15453586478091966, "learning_rate": 4.1408940842273534e-07, "loss": 0.0259, "step": 3959 }, { "epoch": 2.922509225092251, "grad_norm": 0.08948594084990803, "learning_rate": 4.0631852484421804e-07, "loss": 0.0093, "step": 3960 }, { "epoch": 2.923247232472325, "grad_norm": 0.12647237510354645, "learning_rate": 3.9862110011189557e-07, "loss": 0.0065, "step": 3961 }, { "epoch": 2.9239852398523984, "grad_norm": 0.15555170559431664, "learning_rate": 3.909971399033041e-07, "loss": 0.0145, "step": 3962 }, { "epoch": 2.9247232472324725, "grad_norm": 0.21703005113282658, "learning_rate": 3.834466498417455e-07, "loss": 0.0265, "step": 3963 }, { "epoch": 2.925461254612546, "grad_norm": 0.15572155735054236, "learning_rate": 3.759696354963538e-07, "loss": 0.0232, "step": 3964 }, { "epoch": 2.92619926199262, "grad_norm": 0.1700460030904301, "learning_rate": 3.685661023820619e-07, "loss": 0.0291, "step": 3965 }, { "epoch": 2.9269372693726936, "grad_norm": 0.22071068501358534, "learning_rate": 3.6123605595962396e-07, "loss": 0.0171, "step": 3966 }, { "epoch": 2.9276752767527676, "grad_norm": 0.16637179105704963, "learning_rate": 3.539795016355596e-07, "loss": 0.0492, "step": 3967 }, { "epoch": 2.928413284132841, "grad_norm": 0.37439697423700946, "learning_rate": 3.467964447622096e-07, "loss": 0.0306, "step": 3968 }, { "epoch": 2.929151291512915, "grad_norm": 0.2915873421505405, "learning_rate": 3.3968689063768043e-07, "loss": 0.0157, "step": 3969 }, { "epoch": 2.9298892988929888, "grad_norm": 0.17715140663052778, "learning_rate": 3.3265084450587735e-07, "loss": 0.0243, "step": 3970 }, { "epoch": 2.930627306273063, "grad_norm": 0.2611567424222216, "learning_rate": 3.2568831155649346e-07, "loss": 0.0433, "step": 3971 }, { "epoch": 2.931365313653137, "grad_norm": 0.21822417227332813, "learning_rate": 3.1879929692498757e-07, "loss": 0.0233, "step": 3972 }, { "epoch": 2.9321033210332104, "grad_norm": 0.08589069734071912, "learning_rate": 3.119838056925839e-07, "loss": 0.009, "step": 3973 }, { "epoch": 2.932841328413284, "grad_norm": 0.3592522111317456, "learning_rate": 3.0524184288631686e-07, "loss": 0.0322, "step": 3974 }, { "epoch": 2.933579335793358, "grad_norm": 0.1005683178492493, "learning_rate": 2.9857341347893085e-07, "loss": 0.009, "step": 3975 }, { "epoch": 2.934317343173432, "grad_norm": 0.3934917417447196, "learning_rate": 2.919785223889804e-07, "loss": 0.0453, "step": 3976 }, { "epoch": 2.9350553505535055, "grad_norm": 0.2443691977840749, "learning_rate": 2.8545717448075217e-07, "loss": 0.0325, "step": 3977 }, { "epoch": 2.935793357933579, "grad_norm": 0.2540463436136096, "learning_rate": 2.7900937456430967e-07, "loss": 0.0139, "step": 3978 }, { "epoch": 2.936531365313653, "grad_norm": 0.29586863356554133, "learning_rate": 2.726351273954375e-07, "loss": 0.0205, "step": 3979 }, { "epoch": 2.937269372693727, "grad_norm": 0.18640790864155296, "learning_rate": 2.663344376756971e-07, "loss": 0.029, "step": 3980 }, { "epoch": 2.9380073800738007, "grad_norm": 0.2771739785660235, "learning_rate": 2.6010731005239317e-07, "loss": 0.0894, "step": 3981 }, { "epoch": 2.9387453874538747, "grad_norm": 0.15173563690977723, "learning_rate": 2.5395374911854063e-07, "loss": 0.0183, "step": 3982 }, { "epoch": 2.9394833948339483, "grad_norm": 0.5895558038307765, "learning_rate": 2.478737594129421e-07, "loss": 0.0376, "step": 3983 }, { "epoch": 2.9402214022140223, "grad_norm": 0.14838408883124807, "learning_rate": 2.4186734542009926e-07, "loss": 0.0106, "step": 3984 }, { "epoch": 2.940959409594096, "grad_norm": 0.24355634007213714, "learning_rate": 2.3593451157024603e-07, "loss": 0.0174, "step": 3985 }, { "epoch": 2.94169741697417, "grad_norm": 0.13675104005696018, "learning_rate": 2.3007526223937093e-07, "loss": 0.0115, "step": 3986 }, { "epoch": 2.9424354243542434, "grad_norm": 0.29593905699130696, "learning_rate": 2.2428960174916135e-07, "loss": 0.022, "step": 3987 }, { "epoch": 2.9431734317343174, "grad_norm": 0.12617420541888527, "learning_rate": 2.185775343670371e-07, "loss": 0.0218, "step": 3988 }, { "epoch": 2.943911439114391, "grad_norm": 0.11776091458788618, "learning_rate": 2.1293906430612797e-07, "loss": 0.0145, "step": 3989 }, { "epoch": 2.944649446494465, "grad_norm": 0.11263975738971828, "learning_rate": 2.0737419572530725e-07, "loss": 0.0049, "step": 3990 }, { "epoch": 2.9453874538745386, "grad_norm": 0.2770139509225665, "learning_rate": 2.0188293272912496e-07, "loss": 0.0247, "step": 3991 }, { "epoch": 2.9461254612546126, "grad_norm": 0.21489807531742272, "learning_rate": 1.964652793678523e-07, "loss": 0.0149, "step": 3992 }, { "epoch": 2.946863468634686, "grad_norm": 0.13742123625726615, "learning_rate": 1.9112123963749285e-07, "loss": 0.013, "step": 3993 }, { "epoch": 2.94760147601476, "grad_norm": 0.1696937265715892, "learning_rate": 1.8585081747970468e-07, "loss": 0.014, "step": 3994 }, { "epoch": 2.948339483394834, "grad_norm": 0.24445860131808975, "learning_rate": 1.806540167819004e-07, "loss": 0.0128, "step": 3995 }, { "epoch": 2.9490774907749078, "grad_norm": 0.11069857629067012, "learning_rate": 1.7553084137714726e-07, "loss": 0.0334, "step": 3996 }, { "epoch": 2.9498154981549813, "grad_norm": 0.23044761125018384, "learning_rate": 1.704812950442336e-07, "loss": 0.0376, "step": 3997 }, { "epoch": 2.9505535055350554, "grad_norm": 0.26949084254221606, "learning_rate": 1.655053815076135e-07, "loss": 0.0137, "step": 3998 }, { "epoch": 2.9512915129151294, "grad_norm": 0.3005360579310435, "learning_rate": 1.6060310443747339e-07, "loss": 0.0322, "step": 3999 }, { "epoch": 2.952029520295203, "grad_norm": 0.1964687274645349, "learning_rate": 1.557744674496542e-07, "loss": 0.0252, "step": 4000 }, { "epoch": 2.9527675276752765, "grad_norm": 0.5528256576443588, "learning_rate": 1.5101947410567364e-07, "loss": 0.0876, "step": 4001 }, { "epoch": 2.9535055350553505, "grad_norm": 0.21860597207286453, "learning_rate": 1.463381279127596e-07, "loss": 0.0164, "step": 4002 }, { "epoch": 2.9542435424354245, "grad_norm": 0.28981323224060634, "learning_rate": 1.4173043232380557e-07, "loss": 0.0309, "step": 4003 }, { "epoch": 2.954981549815498, "grad_norm": 0.40022638419847917, "learning_rate": 1.3719639073737079e-07, "loss": 0.023, "step": 4004 }, { "epoch": 2.955719557195572, "grad_norm": 0.22247699180452657, "learning_rate": 1.3273600649770235e-07, "loss": 0.0288, "step": 4005 }, { "epoch": 2.9564575645756457, "grad_norm": 0.2756086378090502, "learning_rate": 1.2834928289472416e-07, "loss": 0.008, "step": 4006 }, { "epoch": 2.9571955719557197, "grad_norm": 0.16080760178861253, "learning_rate": 1.2403622316400355e-07, "loss": 0.0215, "step": 4007 }, { "epoch": 2.9579335793357933, "grad_norm": 0.31739622865400247, "learning_rate": 1.197968304867958e-07, "loss": 0.0086, "step": 4008 }, { "epoch": 2.9586715867158673, "grad_norm": 0.22760327721375997, "learning_rate": 1.1563110799002185e-07, "loss": 0.0241, "step": 4009 }, { "epoch": 2.959409594095941, "grad_norm": 0.18160874832682475, "learning_rate": 1.1153905874624615e-07, "loss": 0.0232, "step": 4010 }, { "epoch": 2.960147601476015, "grad_norm": 0.17723290762106195, "learning_rate": 1.0752068577370988e-07, "loss": 0.0132, "step": 4011 }, { "epoch": 2.9608856088560884, "grad_norm": 0.4174899130429082, "learning_rate": 1.0357599203631996e-07, "loss": 0.0086, "step": 4012 }, { "epoch": 2.9616236162361624, "grad_norm": 0.33982565064213555, "learning_rate": 9.970498044360455e-08, "loss": 0.076, "step": 4013 }, { "epoch": 2.962361623616236, "grad_norm": 0.3652016368371941, "learning_rate": 9.590765385076861e-08, "loss": 0.0178, "step": 4014 }, { "epoch": 2.96309963099631, "grad_norm": 0.3751739487847985, "learning_rate": 9.218401505868279e-08, "loss": 0.0651, "step": 4015 }, { "epoch": 2.9638376383763836, "grad_norm": 0.3197712971719323, "learning_rate": 8.853406681382792e-08, "loss": 0.0297, "step": 4016 }, { "epoch": 2.9645756457564576, "grad_norm": 0.13817595589439843, "learning_rate": 8.49578118083505e-08, "loss": 0.0183, "step": 4017 }, { "epoch": 2.9653136531365316, "grad_norm": 0.1063076330657269, "learning_rate": 8.145525268007382e-08, "loss": 0.013, "step": 4018 }, { "epoch": 2.966051660516605, "grad_norm": 0.3559970576632683, "learning_rate": 7.802639201239803e-08, "loss": 0.0674, "step": 4019 }, { "epoch": 2.9667896678966788, "grad_norm": 0.2994269984008706, "learning_rate": 7.467123233442231e-08, "loss": 0.0131, "step": 4020 }, { "epoch": 2.9675276752767528, "grad_norm": 0.172509516033699, "learning_rate": 7.138977612086706e-08, "loss": 0.0284, "step": 4021 }, { "epoch": 2.9682656826568268, "grad_norm": 0.06888286240312977, "learning_rate": 6.81820257920629e-08, "loss": 0.0067, "step": 4022 }, { "epoch": 2.9690036900369003, "grad_norm": 0.2821161043671789, "learning_rate": 6.504798371402832e-08, "loss": 0.0471, "step": 4023 }, { "epoch": 2.969741697416974, "grad_norm": 0.1574844136134434, "learning_rate": 6.198765219835867e-08, "loss": 0.013, "step": 4024 }, { "epoch": 2.970479704797048, "grad_norm": 0.2853459279911946, "learning_rate": 5.900103350233721e-08, "loss": 0.0297, "step": 4025 }, { "epoch": 2.971217712177122, "grad_norm": 0.1374493537923582, "learning_rate": 5.608812982882405e-08, "loss": 0.028, "step": 4026 }, { "epoch": 2.9719557195571955, "grad_norm": 0.47793805380903553, "learning_rate": 5.3248943326356104e-08, "loss": 0.0342, "step": 4027 }, { "epoch": 2.9726937269372695, "grad_norm": 0.1435219488532863, "learning_rate": 5.0483476089069335e-08, "loss": 0.013, "step": 4028 }, { "epoch": 2.973431734317343, "grad_norm": 0.16991160565582808, "learning_rate": 4.7791730156732107e-08, "loss": 0.0162, "step": 4029 }, { "epoch": 2.974169741697417, "grad_norm": 0.11039322257879244, "learning_rate": 4.517370751472294e-08, "loss": 0.0096, "step": 4030 }, { "epoch": 2.9749077490774907, "grad_norm": 0.16433797237253253, "learning_rate": 4.262941009408605e-08, "loss": 0.0145, "step": 4031 }, { "epoch": 2.9756457564575647, "grad_norm": 0.2731458462285727, "learning_rate": 4.015883977143142e-08, "loss": 0.0231, "step": 4032 }, { "epoch": 2.9763837638376383, "grad_norm": 0.26086822345890653, "learning_rate": 3.776199836902361e-08, "loss": 0.0389, "step": 4033 }, { "epoch": 2.9771217712177123, "grad_norm": 0.1270189396057529, "learning_rate": 3.5438887654737355e-08, "loss": 0.018, "step": 4034 }, { "epoch": 2.977859778597786, "grad_norm": 0.07794721040423513, "learning_rate": 3.318950934207976e-08, "loss": 0.0066, "step": 4035 }, { "epoch": 2.97859778597786, "grad_norm": 0.18121607679024193, "learning_rate": 3.1013865090134816e-08, "loss": 0.0249, "step": 4036 }, { "epoch": 2.9793357933579334, "grad_norm": 0.27135076052421125, "learning_rate": 2.8911956503652193e-08, "loss": 0.0349, "step": 4037 }, { "epoch": 2.9800738007380074, "grad_norm": 0.3062512863481714, "learning_rate": 2.6883785132947316e-08, "loss": 0.0213, "step": 4038 }, { "epoch": 2.980811808118081, "grad_norm": 0.15761954801104, "learning_rate": 2.4929352473979094e-08, "loss": 0.0138, "step": 4039 }, { "epoch": 2.981549815498155, "grad_norm": 0.2520147113863704, "learning_rate": 2.304865996830552e-08, "loss": 0.0206, "step": 4040 }, { "epoch": 2.982287822878229, "grad_norm": 0.2255725733603836, "learning_rate": 2.1241709003094746e-08, "loss": 0.0264, "step": 4041 }, { "epoch": 2.9830258302583026, "grad_norm": 0.18485697713354393, "learning_rate": 1.9508500911136208e-08, "loss": 0.0084, "step": 4042 }, { "epoch": 2.983763837638376, "grad_norm": 0.11861529936449769, "learning_rate": 1.784903697081841e-08, "loss": 0.0179, "step": 4043 }, { "epoch": 2.98450184501845, "grad_norm": 0.2315037946571553, "learning_rate": 1.6263318406128936e-08, "loss": 0.0515, "step": 4044 }, { "epoch": 2.985239852398524, "grad_norm": 0.13958091807894293, "learning_rate": 1.4751346386687736e-08, "loss": 0.014, "step": 4045 }, { "epoch": 2.9859778597785978, "grad_norm": 0.13718761256180412, "learning_rate": 1.3313122027680536e-08, "loss": 0.0163, "step": 4046 }, { "epoch": 2.9867158671586713, "grad_norm": 0.12158715224997045, "learning_rate": 1.1948646389936535e-08, "loss": 0.0119, "step": 4047 }, { "epoch": 2.9874538745387453, "grad_norm": 0.10133153043045678, "learning_rate": 1.0657920479861805e-08, "loss": 0.009, "step": 4048 }, { "epoch": 2.9881918819188193, "grad_norm": 0.22641749321583504, "learning_rate": 9.440945249494793e-09, "loss": 0.0198, "step": 4049 }, { "epoch": 2.988929889298893, "grad_norm": 0.17348665548495265, "learning_rate": 8.297721596439712e-09, "loss": 0.0178, "step": 4050 }, { "epoch": 2.989667896678967, "grad_norm": 0.22348994672615094, "learning_rate": 7.228250363933153e-09, "loss": 0.0229, "step": 4051 }, { "epoch": 2.9904059040590405, "grad_norm": 0.13220453583046113, "learning_rate": 6.232532340788577e-09, "loss": 0.0221, "step": 4052 }, { "epoch": 2.9911439114391145, "grad_norm": 0.3125472906425021, "learning_rate": 5.31056826145182e-09, "loss": 0.0135, "step": 4053 }, { "epoch": 2.991881918819188, "grad_norm": 0.1632159803037222, "learning_rate": 4.462358805934486e-09, "loss": 0.0363, "step": 4054 }, { "epoch": 2.992619926199262, "grad_norm": 0.25620626581946954, "learning_rate": 3.6879045998694517e-09, "loss": 0.0369, "step": 4055 }, { "epoch": 2.9933579335793357, "grad_norm": 0.3556688437661072, "learning_rate": 2.987206214488669e-09, "loss": 0.0353, "step": 4056 }, { "epoch": 2.9940959409594097, "grad_norm": 0.16914964347288836, "learning_rate": 2.3602641666120585e-09, "loss": 0.0126, "step": 4057 }, { "epoch": 2.9948339483394832, "grad_norm": 0.3161909233321807, "learning_rate": 1.8070789186586113e-09, "loss": 0.0445, "step": 4058 }, { "epoch": 2.9955719557195573, "grad_norm": 0.19409408733292874, "learning_rate": 1.3276508786463916e-09, "loss": 0.0219, "step": 4059 }, { "epoch": 2.9963099630996313, "grad_norm": 0.17545739462050894, "learning_rate": 9.21980400203637e-10, "loss": 0.0155, "step": 4060 }, { "epoch": 2.997047970479705, "grad_norm": 0.13680997550273247, "learning_rate": 5.900677825465551e-10, "loss": 0.0158, "step": 4061 }, { "epoch": 2.9977859778597784, "grad_norm": 0.10356661587018835, "learning_rate": 3.3191327049042487e-10, "loss": 0.0129, "step": 4062 }, { "epoch": 2.9985239852398524, "grad_norm": 0.10190542852622929, "learning_rate": 1.475170544495974e-10, "loss": 0.0085, "step": 4063 }, { "epoch": 2.9992619926199264, "grad_norm": 0.12479246714199313, "learning_rate": 3.6879270415290936e-11, "loss": 0.0204, "step": 4064 }, { "epoch": 3.0, "grad_norm": 0.29238102672045707, "learning_rate": 0.0, "loss": 0.0207, "step": 4065 }, { "epoch": 3.0, "eval_loss": 0.059077899903059006, "eval_runtime": 581.8363, "eval_samples_per_second": 18.435, "eval_steps_per_second": 2.305, "step": 4065 }, { "epoch": 3.0, "step": 4065, "total_flos": 1151005435797504.0, "train_loss": 0.04631142559911492, "train_runtime": 12255.7942, "train_samples_per_second": 5.306, "train_steps_per_second": 0.332 } ], "logging_steps": 1, "max_steps": 4065, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1151005435797504.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }