diff --git "a/checkpoint-1664/trainer_state.json" "b/checkpoint-1664/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1664/trainer_state.json" @@ -0,0 +1,5857 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1664, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001201923076923077, + "grad_norm": 354.53998230392546, + "learning_rate": 2.5000000000000004e-07, + "loss": 7.4318, + "step": 2 + }, + { + "epoch": 0.002403846153846154, + "grad_norm": 371.91393245440116, + "learning_rate": 5.000000000000001e-07, + "loss": 7.4355, + "step": 4 + }, + { + "epoch": 0.003605769230769231, + "grad_norm": 346.9801049377746, + "learning_rate": 7.5e-07, + "loss": 7.2172, + "step": 6 + }, + { + "epoch": 0.004807692307692308, + "grad_norm": 449.5218610086392, + "learning_rate": 1.0000000000000002e-06, + "loss": 5.8268, + "step": 8 + }, + { + "epoch": 0.006009615384615385, + "grad_norm": 139.92367559356336, + "learning_rate": 1.25e-06, + "loss": 2.5854, + "step": 10 + }, + { + "epoch": 0.007211538461538462, + "grad_norm": 52.715150799729045, + "learning_rate": 1.5e-06, + "loss": 0.5361, + "step": 12 + }, + { + "epoch": 0.008413461538461538, + "grad_norm": 19.976853421159532, + "learning_rate": 1.75e-06, + "loss": 0.2878, + "step": 14 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 20.500149107050714, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.2495, + "step": 16 + }, + { + "epoch": 0.010817307692307692, + "grad_norm": 17.263902163149385, + "learning_rate": 2.25e-06, + "loss": 0.2177, + "step": 18 + }, + { + "epoch": 0.01201923076923077, + "grad_norm": 12.128764817788255, + "learning_rate": 2.5e-06, + "loss": 0.212, + "step": 20 + }, + { + "epoch": 0.013221153846153846, + "grad_norm": 8.417030179298662, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.2035, + "step": 22 + }, + { + "epoch": 0.014423076923076924, + "grad_norm": 10.874846551654207, + "learning_rate": 3e-06, + "loss": 0.1999, + "step": 24 + }, + { + "epoch": 0.015625, + "grad_norm": 2.553082597942841, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.1884, + "step": 26 + }, + { + "epoch": 0.016826923076923076, + "grad_norm": 8.091183712435873, + "learning_rate": 3.5e-06, + "loss": 0.1729, + "step": 28 + }, + { + "epoch": 0.018028846153846152, + "grad_norm": 6.473289695229128, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1858, + "step": 30 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 10.845224583341055, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1779, + "step": 32 + }, + { + "epoch": 0.020432692307692308, + "grad_norm": 7.588560990570617, + "learning_rate": 4.25e-06, + "loss": 0.1807, + "step": 34 + }, + { + "epoch": 0.021634615384615384, + "grad_norm": 3.2833536176531437, + "learning_rate": 4.5e-06, + "loss": 0.1741, + "step": 36 + }, + { + "epoch": 0.02283653846153846, + "grad_norm": 9.48172518986478, + "learning_rate": 4.75e-06, + "loss": 0.17, + "step": 38 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 10.178982049068438, + "learning_rate": 5e-06, + "loss": 0.1603, + "step": 40 + }, + { + "epoch": 0.025240384615384616, + "grad_norm": 2.410777001535273, + "learning_rate": 4.999981288993133e-06, + "loss": 0.1772, + "step": 42 + }, + { + "epoch": 0.026442307692307692, + "grad_norm": 11.143463053654319, + "learning_rate": 4.999925156252611e-06, + "loss": 0.1799, + "step": 44 + }, + { + "epoch": 0.027644230769230768, + "grad_norm": 9.886918600788055, + "learning_rate": 4.9998316026186755e-06, + "loss": 0.1773, + "step": 46 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 7.527819471799396, + "learning_rate": 4.999700629491713e-06, + "loss": 0.176, + "step": 48 + }, + { + "epoch": 0.030048076923076924, + "grad_norm": 7.572713386840099, + "learning_rate": 4.999532238832233e-06, + "loss": 0.153, + "step": 50 + }, + { + "epoch": 0.03125, + "grad_norm": 3.477539644705173, + "learning_rate": 4.999326433160844e-06, + "loss": 0.1588, + "step": 52 + }, + { + "epoch": 0.03245192307692308, + "grad_norm": 2.5927408403215266, + "learning_rate": 4.999083215558211e-06, + "loss": 0.1657, + "step": 54 + }, + { + "epoch": 0.03365384615384615, + "grad_norm": 1.8986386268295627, + "learning_rate": 4.998802589665009e-06, + "loss": 0.1624, + "step": 56 + }, + { + "epoch": 0.03485576923076923, + "grad_norm": 4.511287232603737, + "learning_rate": 4.998484559681875e-06, + "loss": 0.1604, + "step": 58 + }, + { + "epoch": 0.036057692307692304, + "grad_norm": 3.1615021035675586, + "learning_rate": 4.998129130369338e-06, + "loss": 0.1541, + "step": 60 + }, + { + "epoch": 0.037259615384615384, + "grad_norm": 2.3641598718509163, + "learning_rate": 4.997736307047748e-06, + "loss": 0.1609, + "step": 62 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 2.1050208268263018, + "learning_rate": 4.997306095597203e-06, + "loss": 0.1628, + "step": 64 + }, + { + "epoch": 0.039663461538461536, + "grad_norm": 4.670481996347925, + "learning_rate": 4.996838502457453e-06, + "loss": 0.1605, + "step": 66 + }, + { + "epoch": 0.040865384615384616, + "grad_norm": 2.812878853953604, + "learning_rate": 4.99633353462781e-06, + "loss": 0.1394, + "step": 68 + }, + { + "epoch": 0.042067307692307696, + "grad_norm": 3.3453547259139658, + "learning_rate": 4.995791199667038e-06, + "loss": 0.1353, + "step": 70 + }, + { + "epoch": 0.04326923076923077, + "grad_norm": 9.508467033910211, + "learning_rate": 4.9952115056932445e-06, + "loss": 0.1464, + "step": 72 + }, + { + "epoch": 0.04447115384615385, + "grad_norm": 6.12221208629695, + "learning_rate": 4.994594461383756e-06, + "loss": 0.1534, + "step": 74 + }, + { + "epoch": 0.04567307692307692, + "grad_norm": 3.2936985183570644, + "learning_rate": 4.993940075974988e-06, + "loss": 0.1551, + "step": 76 + }, + { + "epoch": 0.046875, + "grad_norm": 4.147948524170783, + "learning_rate": 4.993248359262308e-06, + "loss": 0.1599, + "step": 78 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 6.328245841206747, + "learning_rate": 4.99251932159989e-06, + "loss": 0.1433, + "step": 80 + }, + { + "epoch": 0.04927884615384615, + "grad_norm": 3.454386437171457, + "learning_rate": 4.991752973900558e-06, + "loss": 0.1589, + "step": 82 + }, + { + "epoch": 0.05048076923076923, + "grad_norm": 4.628897164295182, + "learning_rate": 4.9909493276356184e-06, + "loss": 0.16, + "step": 84 + }, + { + "epoch": 0.051682692307692304, + "grad_norm": 4.959693562958358, + "learning_rate": 4.990108394834698e-06, + "loss": 0.1504, + "step": 86 + }, + { + "epoch": 0.052884615384615384, + "grad_norm": 2.037506021404142, + "learning_rate": 4.9892301880855565e-06, + "loss": 0.1469, + "step": 88 + }, + { + "epoch": 0.054086538461538464, + "grad_norm": 3.2873483486362898, + "learning_rate": 4.988314720533899e-06, + "loss": 0.152, + "step": 90 + }, + { + "epoch": 0.055288461538461536, + "grad_norm": 3.5444104019705165, + "learning_rate": 4.987362005883182e-06, + "loss": 0.1396, + "step": 92 + }, + { + "epoch": 0.056490384615384616, + "grad_norm": 2.43315278564924, + "learning_rate": 4.986372058394404e-06, + "loss": 0.1365, + "step": 94 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 3.1953964952618015, + "learning_rate": 4.985344892885899e-06, + "loss": 0.158, + "step": 96 + }, + { + "epoch": 0.05889423076923077, + "grad_norm": 2.0524573584491814, + "learning_rate": 4.984280524733107e-06, + "loss": 0.1571, + "step": 98 + }, + { + "epoch": 0.06009615384615385, + "grad_norm": 6.255981390751074, + "learning_rate": 4.983178969868346e-06, + "loss": 0.1464, + "step": 100 + }, + { + "epoch": 0.06129807692307692, + "grad_norm": 2.401646579094803, + "learning_rate": 4.98204024478058e-06, + "loss": 0.1417, + "step": 102 + }, + { + "epoch": 0.0625, + "grad_norm": 4.086961173737914, + "learning_rate": 4.980864366515159e-06, + "loss": 0.1541, + "step": 104 + }, + { + "epoch": 0.06370192307692307, + "grad_norm": 5.143484575050959, + "learning_rate": 4.97965135267358e-06, + "loss": 0.1499, + "step": 106 + }, + { + "epoch": 0.06490384615384616, + "grad_norm": 9.335258253064257, + "learning_rate": 4.978401221413209e-06, + "loss": 0.1684, + "step": 108 + }, + { + "epoch": 0.06610576923076923, + "grad_norm": 7.980472809228967, + "learning_rate": 4.977113991447017e-06, + "loss": 0.1663, + "step": 110 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 5.567091210665335, + "learning_rate": 4.9757896820433015e-06, + "loss": 0.1496, + "step": 112 + }, + { + "epoch": 0.06850961538461539, + "grad_norm": 5.007400500292786, + "learning_rate": 4.9744283130253905e-06, + "loss": 0.1415, + "step": 114 + }, + { + "epoch": 0.06971153846153846, + "grad_norm": 2.720822334329411, + "learning_rate": 4.973029904771353e-06, + "loss": 0.1541, + "step": 116 + }, + { + "epoch": 0.07091346153846154, + "grad_norm": 6.0293181825471605, + "learning_rate": 4.97159447821369e-06, + "loss": 0.1334, + "step": 118 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 2.190960264135707, + "learning_rate": 4.9701220548390215e-06, + "loss": 0.1353, + "step": 120 + }, + { + "epoch": 0.0733173076923077, + "grad_norm": 1.8394820554560345, + "learning_rate": 4.968612656687768e-06, + "loss": 0.1424, + "step": 122 + }, + { + "epoch": 0.07451923076923077, + "grad_norm": 1.890530118257683, + "learning_rate": 4.967066306353816e-06, + "loss": 0.161, + "step": 124 + }, + { + "epoch": 0.07572115384615384, + "grad_norm": 1.74782914530488, + "learning_rate": 4.965483026984182e-06, + "loss": 0.1391, + "step": 126 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 2.529223967308172, + "learning_rate": 4.963862842278669e-06, + "loss": 0.1509, + "step": 128 + }, + { + "epoch": 0.078125, + "grad_norm": 3.2375323783057546, + "learning_rate": 4.962205776489506e-06, + "loss": 0.1452, + "step": 130 + }, + { + "epoch": 0.07932692307692307, + "grad_norm": 3.289988413600293, + "learning_rate": 4.9605118544209874e-06, + "loss": 0.1369, + "step": 132 + }, + { + "epoch": 0.08052884615384616, + "grad_norm": 2.533449561050558, + "learning_rate": 4.958781101429104e-06, + "loss": 0.157, + "step": 134 + }, + { + "epoch": 0.08173076923076923, + "grad_norm": 2.3698097270600846, + "learning_rate": 4.9570135434211615e-06, + "loss": 0.1586, + "step": 136 + }, + { + "epoch": 0.0829326923076923, + "grad_norm": 2.9294128800046835, + "learning_rate": 4.95520920685539e-06, + "loss": 0.1438, + "step": 138 + }, + { + "epoch": 0.08413461538461539, + "grad_norm": 3.199532338265642, + "learning_rate": 4.953368118740555e-06, + "loss": 0.1404, + "step": 140 + }, + { + "epoch": 0.08533653846153846, + "grad_norm": 5.007008541932283, + "learning_rate": 4.951490306635543e-06, + "loss": 0.1595, + "step": 142 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 6.472389725628937, + "learning_rate": 4.949575798648962e-06, + "loss": 0.1589, + "step": 144 + }, + { + "epoch": 0.08774038461538461, + "grad_norm": 2.0940111362516998, + "learning_rate": 4.947624623438707e-06, + "loss": 0.1352, + "step": 146 + }, + { + "epoch": 0.0889423076923077, + "grad_norm": 2.6543426483078214, + "learning_rate": 4.9456368102115414e-06, + "loss": 0.1396, + "step": 148 + }, + { + "epoch": 0.09014423076923077, + "grad_norm": 4.492736133720869, + "learning_rate": 4.943612388722654e-06, + "loss": 0.1362, + "step": 150 + }, + { + "epoch": 0.09134615384615384, + "grad_norm": 3.1413400916428698, + "learning_rate": 4.941551389275217e-06, + "loss": 0.1398, + "step": 152 + }, + { + "epoch": 0.09254807692307693, + "grad_norm": 7.121010742342612, + "learning_rate": 4.9394538427199305e-06, + "loss": 0.1612, + "step": 154 + }, + { + "epoch": 0.09375, + "grad_norm": 2.0494879939162773, + "learning_rate": 4.937319780454559e-06, + "loss": 0.1372, + "step": 156 + }, + { + "epoch": 0.09495192307692307, + "grad_norm": 3.41511576689734, + "learning_rate": 4.935149234423468e-06, + "loss": 0.1463, + "step": 158 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 4.070726562354956, + "learning_rate": 4.9329422371171375e-06, + "loss": 0.1534, + "step": 160 + }, + { + "epoch": 0.09735576923076923, + "grad_norm": 4.8008312467816125, + "learning_rate": 4.930698821571681e-06, + "loss": 0.1603, + "step": 162 + }, + { + "epoch": 0.0985576923076923, + "grad_norm": 6.5840675047315225, + "learning_rate": 4.928419021368349e-06, + "loss": 0.1472, + "step": 164 + }, + { + "epoch": 0.09975961538461539, + "grad_norm": 4.906437852842057, + "learning_rate": 4.926102870633029e-06, + "loss": 0.1518, + "step": 166 + }, + { + "epoch": 0.10096153846153846, + "grad_norm": 2.2753956803841424, + "learning_rate": 4.923750404035729e-06, + "loss": 0.132, + "step": 168 + }, + { + "epoch": 0.10216346153846154, + "grad_norm": 10.360890258671878, + "learning_rate": 4.921361656790065e-06, + "loss": 0.1615, + "step": 170 + }, + { + "epoch": 0.10336538461538461, + "grad_norm": 3.781202441981427, + "learning_rate": 4.918936664652729e-06, + "loss": 0.1317, + "step": 172 + }, + { + "epoch": 0.1045673076923077, + "grad_norm": 4.612706315229004, + "learning_rate": 4.9164754639229575e-06, + "loss": 0.1556, + "step": 174 + }, + { + "epoch": 0.10576923076923077, + "grad_norm": 10.089023799727872, + "learning_rate": 4.913978091441985e-06, + "loss": 0.1366, + "step": 176 + }, + { + "epoch": 0.10697115384615384, + "grad_norm": 2.3844132422742215, + "learning_rate": 4.911444584592495e-06, + "loss": 0.1364, + "step": 178 + }, + { + "epoch": 0.10817307692307693, + "grad_norm": 8.01833294402442, + "learning_rate": 4.908874981298058e-06, + "loss": 0.1367, + "step": 180 + }, + { + "epoch": 0.109375, + "grad_norm": 2.6816022197266083, + "learning_rate": 4.906269320022566e-06, + "loss": 0.1357, + "step": 182 + }, + { + "epoch": 0.11057692307692307, + "grad_norm": 3.540312508006275, + "learning_rate": 4.903627639769656e-06, + "loss": 0.1485, + "step": 184 + }, + { + "epoch": 0.11177884615384616, + "grad_norm": 2.127391987641345, + "learning_rate": 4.900949980082127e-06, + "loss": 0.1491, + "step": 186 + }, + { + "epoch": 0.11298076923076923, + "grad_norm": 1.8381751149591552, + "learning_rate": 4.898236381041343e-06, + "loss": 0.1378, + "step": 188 + }, + { + "epoch": 0.1141826923076923, + "grad_norm": 2.4143842380581355, + "learning_rate": 4.895486883266644e-06, + "loss": 0.134, + "step": 190 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 2.33879580368458, + "learning_rate": 4.892701527914725e-06, + "loss": 0.1274, + "step": 192 + }, + { + "epoch": 0.11658653846153846, + "grad_norm": 2.677161136207072, + "learning_rate": 4.88988035667903e-06, + "loss": 0.1247, + "step": 194 + }, + { + "epoch": 0.11778846153846154, + "grad_norm": 2.224205454303052, + "learning_rate": 4.88702341178912e-06, + "loss": 0.1171, + "step": 196 + }, + { + "epoch": 0.11899038461538461, + "grad_norm": 2.331487573448718, + "learning_rate": 4.88413073601005e-06, + "loss": 0.1304, + "step": 198 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 3.904053698054214, + "learning_rate": 4.8812023726417194e-06, + "loss": 0.1441, + "step": 200 + }, + { + "epoch": 0.12139423076923077, + "grad_norm": 1.9333636759461283, + "learning_rate": 4.878238365518231e-06, + "loss": 0.1473, + "step": 202 + }, + { + "epoch": 0.12259615384615384, + "grad_norm": 4.5752867405646205, + "learning_rate": 4.87523875900723e-06, + "loss": 0.1337, + "step": 204 + }, + { + "epoch": 0.12379807692307693, + "grad_norm": 1.857974634859215, + "learning_rate": 4.872203598009244e-06, + "loss": 0.127, + "step": 206 + }, + { + "epoch": 0.125, + "grad_norm": 3.153527922810332, + "learning_rate": 4.869132927957007e-06, + "loss": 0.1484, + "step": 208 + }, + { + "epoch": 0.12620192307692307, + "grad_norm": 2.2228179237011534, + "learning_rate": 4.866026794814781e-06, + "loss": 0.1306, + "step": 210 + }, + { + "epoch": 0.12740384615384615, + "grad_norm": 1.7350718661408604, + "learning_rate": 4.862885245077669e-06, + "loss": 0.1352, + "step": 212 + }, + { + "epoch": 0.12860576923076922, + "grad_norm": 2.1132426954959924, + "learning_rate": 4.859708325770919e-06, + "loss": 0.1416, + "step": 214 + }, + { + "epoch": 0.12980769230769232, + "grad_norm": 1.8563726212012472, + "learning_rate": 4.856496084449218e-06, + "loss": 0.1461, + "step": 216 + }, + { + "epoch": 0.1310096153846154, + "grad_norm": 1.8179558309835169, + "learning_rate": 4.85324856919598e-06, + "loss": 0.1322, + "step": 218 + }, + { + "epoch": 0.13221153846153846, + "grad_norm": 4.497720485766678, + "learning_rate": 4.849965828622632e-06, + "loss": 0.1275, + "step": 220 + }, + { + "epoch": 0.13341346153846154, + "grad_norm": 3.0404012207264843, + "learning_rate": 4.846647911867877e-06, + "loss": 0.1436, + "step": 222 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 3.224075088217143, + "learning_rate": 4.8432948685969646e-06, + "loss": 0.1656, + "step": 224 + }, + { + "epoch": 0.13581730769230768, + "grad_norm": 3.2443798600258686, + "learning_rate": 4.83990674900095e-06, + "loss": 0.1393, + "step": 226 + }, + { + "epoch": 0.13701923076923078, + "grad_norm": 1.786558241709454, + "learning_rate": 4.836483603795935e-06, + "loss": 0.1263, + "step": 228 + }, + { + "epoch": 0.13822115384615385, + "grad_norm": 2.1509725801613513, + "learning_rate": 4.8330254842223155e-06, + "loss": 0.1409, + "step": 230 + }, + { + "epoch": 0.13942307692307693, + "grad_norm": 2.835049924036413, + "learning_rate": 4.829532442044008e-06, + "loss": 0.1319, + "step": 232 + }, + { + "epoch": 0.140625, + "grad_norm": 4.679921143965946, + "learning_rate": 4.8260045295476846e-06, + "loss": 0.1506, + "step": 234 + }, + { + "epoch": 0.14182692307692307, + "grad_norm": 1.9142698244457717, + "learning_rate": 4.822441799541979e-06, + "loss": 0.15, + "step": 236 + }, + { + "epoch": 0.14302884615384615, + "grad_norm": 8.216926584060278, + "learning_rate": 4.818844305356705e-06, + "loss": 0.1508, + "step": 238 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 1.5774872715864894, + "learning_rate": 4.815212100842053e-06, + "loss": 0.1365, + "step": 240 + }, + { + "epoch": 0.14543269230769232, + "grad_norm": 4.90886123617284, + "learning_rate": 4.811545240367785e-06, + "loss": 0.1488, + "step": 242 + }, + { + "epoch": 0.1466346153846154, + "grad_norm": 2.867193865140862, + "learning_rate": 4.807843778822424e-06, + "loss": 0.1403, + "step": 244 + }, + { + "epoch": 0.14783653846153846, + "grad_norm": 2.8742525824591123, + "learning_rate": 4.804107771612427e-06, + "loss": 0.1543, + "step": 246 + }, + { + "epoch": 0.14903846153846154, + "grad_norm": 2.3762533430208563, + "learning_rate": 4.800337274661358e-06, + "loss": 0.1375, + "step": 248 + }, + { + "epoch": 0.1502403846153846, + "grad_norm": 2.0839909923885447, + "learning_rate": 4.796532344409055e-06, + "loss": 0.1501, + "step": 250 + }, + { + "epoch": 0.15144230769230768, + "grad_norm": 4.198893033771618, + "learning_rate": 4.7926930378107765e-06, + "loss": 0.1323, + "step": 252 + }, + { + "epoch": 0.15264423076923078, + "grad_norm": 6.846739098146311, + "learning_rate": 4.788819412336358e-06, + "loss": 0.1399, + "step": 254 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 5.537919034600803, + "learning_rate": 4.784911525969344e-06, + "loss": 0.1233, + "step": 256 + }, + { + "epoch": 0.15504807692307693, + "grad_norm": 3.033292609600203, + "learning_rate": 4.780969437206128e-06, + "loss": 0.1478, + "step": 258 + }, + { + "epoch": 0.15625, + "grad_norm": 4.62635643989095, + "learning_rate": 4.776993205055067e-06, + "loss": 0.1465, + "step": 260 + }, + { + "epoch": 0.15745192307692307, + "grad_norm": 1.6930995723930686, + "learning_rate": 4.772982889035609e-06, + "loss": 0.134, + "step": 262 + }, + { + "epoch": 0.15865384615384615, + "grad_norm": 5.190694670180204, + "learning_rate": 4.7689385491773934e-06, + "loss": 0.1397, + "step": 264 + }, + { + "epoch": 0.15985576923076922, + "grad_norm": 3.0169437457346104, + "learning_rate": 4.764860246019356e-06, + "loss": 0.1462, + "step": 266 + }, + { + "epoch": 0.16105769230769232, + "grad_norm": 6.609894055693969, + "learning_rate": 4.760748040608826e-06, + "loss": 0.1349, + "step": 268 + }, + { + "epoch": 0.1622596153846154, + "grad_norm": 4.93984379875883, + "learning_rate": 4.756601994500604e-06, + "loss": 0.1336, + "step": 270 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 4.75518188154229, + "learning_rate": 4.752422169756048e-06, + "loss": 0.146, + "step": 272 + }, + { + "epoch": 0.16466346153846154, + "grad_norm": 2.086098294528403, + "learning_rate": 4.748208628942143e-06, + "loss": 0.1419, + "step": 274 + }, + { + "epoch": 0.1658653846153846, + "grad_norm": 5.396963944335693, + "learning_rate": 4.7439614351305614e-06, + "loss": 0.1432, + "step": 276 + }, + { + "epoch": 0.16706730769230768, + "grad_norm": 7.532501685521908, + "learning_rate": 4.739680651896721e-06, + "loss": 0.145, + "step": 278 + }, + { + "epoch": 0.16826923076923078, + "grad_norm": 2.336463554377762, + "learning_rate": 4.7353663433188325e-06, + "loss": 0.1475, + "step": 280 + }, + { + "epoch": 0.16947115384615385, + "grad_norm": 5.48654115007209, + "learning_rate": 4.731018573976943e-06, + "loss": 0.1544, + "step": 282 + }, + { + "epoch": 0.17067307692307693, + "grad_norm": 2.120931360446975, + "learning_rate": 4.726637408951966e-06, + "loss": 0.1286, + "step": 284 + }, + { + "epoch": 0.171875, + "grad_norm": 3.3875599595704498, + "learning_rate": 4.7222229138247076e-06, + "loss": 0.1383, + "step": 286 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 2.0989272460873796, + "learning_rate": 4.717775154674888e-06, + "loss": 0.1168, + "step": 288 + }, + { + "epoch": 0.17427884615384615, + "grad_norm": 3.817152405138102, + "learning_rate": 4.713294198080149e-06, + "loss": 0.1257, + "step": 290 + }, + { + "epoch": 0.17548076923076922, + "grad_norm": 2.293976238111847, + "learning_rate": 4.708780111115058e-06, + "loss": 0.1358, + "step": 292 + }, + { + "epoch": 0.17668269230769232, + "grad_norm": 2.0161046467731407, + "learning_rate": 4.7042329613501035e-06, + "loss": 0.1214, + "step": 294 + }, + { + "epoch": 0.1778846153846154, + "grad_norm": 2.3356279678505674, + "learning_rate": 4.699652816850686e-06, + "loss": 0.1296, + "step": 296 + }, + { + "epoch": 0.17908653846153846, + "grad_norm": 2.034038118147649, + "learning_rate": 4.6950397461761e-06, + "loss": 0.1163, + "step": 298 + }, + { + "epoch": 0.18028846153846154, + "grad_norm": 2.63792392669932, + "learning_rate": 4.690393818378501e-06, + "loss": 0.1269, + "step": 300 + }, + { + "epoch": 0.1814903846153846, + "grad_norm": 2.75722633258936, + "learning_rate": 4.685715103001879e-06, + "loss": 0.1243, + "step": 302 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 2.0819705788021183, + "learning_rate": 4.681003670081015e-06, + "loss": 0.1304, + "step": 304 + }, + { + "epoch": 0.18389423076923078, + "grad_norm": 3.4298950454490176, + "learning_rate": 4.676259590140431e-06, + "loss": 0.1377, + "step": 306 + }, + { + "epoch": 0.18509615384615385, + "grad_norm": 2.471860576622299, + "learning_rate": 4.671482934193337e-06, + "loss": 0.1356, + "step": 308 + }, + { + "epoch": 0.18629807692307693, + "grad_norm": 4.8199475175470825, + "learning_rate": 4.666673773740568e-06, + "loss": 0.125, + "step": 310 + }, + { + "epoch": 0.1875, + "grad_norm": 3.178092400708656, + "learning_rate": 4.66183218076951e-06, + "loss": 0.1365, + "step": 312 + }, + { + "epoch": 0.18870192307692307, + "grad_norm": 5.888487682413386, + "learning_rate": 4.656958227753028e-06, + "loss": 0.1415, + "step": 314 + }, + { + "epoch": 0.18990384615384615, + "grad_norm": 1.7792863825981573, + "learning_rate": 4.652051987648375e-06, + "loss": 0.1416, + "step": 316 + }, + { + "epoch": 0.19110576923076922, + "grad_norm": 2.9722465375990836, + "learning_rate": 4.647113533896106e-06, + "loss": 0.1396, + "step": 318 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 2.4672094667500475, + "learning_rate": 4.642142940418973e-06, + "loss": 0.1248, + "step": 320 + }, + { + "epoch": 0.1935096153846154, + "grad_norm": 2.7077423897634914, + "learning_rate": 4.637140281620825e-06, + "loss": 0.1383, + "step": 322 + }, + { + "epoch": 0.19471153846153846, + "grad_norm": 4.807221287056447, + "learning_rate": 4.632105632385488e-06, + "loss": 0.1361, + "step": 324 + }, + { + "epoch": 0.19591346153846154, + "grad_norm": 5.728075789744543, + "learning_rate": 4.627039068075647e-06, + "loss": 0.1444, + "step": 326 + }, + { + "epoch": 0.1971153846153846, + "grad_norm": 4.398279898396926, + "learning_rate": 4.621940664531718e-06, + "loss": 0.1486, + "step": 328 + }, + { + "epoch": 0.19831730769230768, + "grad_norm": 2.610023546419106, + "learning_rate": 4.6168104980707105e-06, + "loss": 0.1263, + "step": 330 + }, + { + "epoch": 0.19951923076923078, + "grad_norm": 3.5181999115242943, + "learning_rate": 4.61164864548509e-06, + "loss": 0.1308, + "step": 332 + }, + { + "epoch": 0.20072115384615385, + "grad_norm": 2.1510452933939903, + "learning_rate": 4.606455184041623e-06, + "loss": 0.14, + "step": 334 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 3.244280076674407, + "learning_rate": 4.6012301914802245e-06, + "loss": 0.1211, + "step": 336 + }, + { + "epoch": 0.203125, + "grad_norm": 2.4060103217800726, + "learning_rate": 4.595973746012791e-06, + "loss": 0.1331, + "step": 338 + }, + { + "epoch": 0.20432692307692307, + "grad_norm": 5.706691276517432, + "learning_rate": 4.590685926322032e-06, + "loss": 0.1275, + "step": 340 + }, + { + "epoch": 0.20552884615384615, + "grad_norm": 1.982976110922252, + "learning_rate": 4.585366811560293e-06, + "loss": 0.1236, + "step": 342 + }, + { + "epoch": 0.20673076923076922, + "grad_norm": 4.23602021986134, + "learning_rate": 4.580016481348367e-06, + "loss": 0.1361, + "step": 344 + }, + { + "epoch": 0.20793269230769232, + "grad_norm": 2.211392940952842, + "learning_rate": 4.574635015774308e-06, + "loss": 0.1255, + "step": 346 + }, + { + "epoch": 0.2091346153846154, + "grad_norm": 6.442272520375928, + "learning_rate": 4.569222495392227e-06, + "loss": 0.1344, + "step": 348 + }, + { + "epoch": 0.21033653846153846, + "grad_norm": 3.8749351925382594, + "learning_rate": 4.563779001221087e-06, + "loss": 0.1501, + "step": 350 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 1.7448331765525071, + "learning_rate": 4.558304614743496e-06, + "loss": 0.1381, + "step": 352 + }, + { + "epoch": 0.2127403846153846, + "grad_norm": 5.668086585104286, + "learning_rate": 4.5527994179044785e-06, + "loss": 0.1306, + "step": 354 + }, + { + "epoch": 0.21394230769230768, + "grad_norm": 2.5525220134836677, + "learning_rate": 4.547263493110257e-06, + "loss": 0.1386, + "step": 356 + }, + { + "epoch": 0.21514423076923078, + "grad_norm": 4.733640695947825, + "learning_rate": 4.54169692322701e-06, + "loss": 0.131, + "step": 358 + }, + { + "epoch": 0.21634615384615385, + "grad_norm": 2.4560081882135965, + "learning_rate": 4.536099791579643e-06, + "loss": 0.1332, + "step": 360 + }, + { + "epoch": 0.21754807692307693, + "grad_norm": 3.30310384335084, + "learning_rate": 4.530472181950528e-06, + "loss": 0.1452, + "step": 362 + }, + { + "epoch": 0.21875, + "grad_norm": 3.96117046469673, + "learning_rate": 4.524814178578261e-06, + "loss": 0.1258, + "step": 364 + }, + { + "epoch": 0.21995192307692307, + "grad_norm": 2.1571934099507324, + "learning_rate": 4.519125866156392e-06, + "loss": 0.1268, + "step": 366 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 2.731599995456764, + "learning_rate": 4.5134073298321655e-06, + "loss": 0.1275, + "step": 368 + }, + { + "epoch": 0.22235576923076922, + "grad_norm": 6.257464871792732, + "learning_rate": 4.5076586552052375e-06, + "loss": 0.136, + "step": 370 + }, + { + "epoch": 0.22355769230769232, + "grad_norm": 2.1253597649510496, + "learning_rate": 4.501879928326402e-06, + "loss": 0.1097, + "step": 372 + }, + { + "epoch": 0.2247596153846154, + "grad_norm": 4.866268104213111, + "learning_rate": 4.496071235696296e-06, + "loss": 0.1172, + "step": 374 + }, + { + "epoch": 0.22596153846153846, + "grad_norm": 2.65071594422531, + "learning_rate": 4.49023266426411e-06, + "loss": 0.1167, + "step": 376 + }, + { + "epoch": 0.22716346153846154, + "grad_norm": 3.5486758869705266, + "learning_rate": 4.484364301426285e-06, + "loss": 0.1276, + "step": 378 + }, + { + "epoch": 0.2283653846153846, + "grad_norm": 6.057336639310383, + "learning_rate": 4.478466235025203e-06, + "loss": 0.1393, + "step": 380 + }, + { + "epoch": 0.22956730769230768, + "grad_norm": 3.204768478540457, + "learning_rate": 4.472538553347871e-06, + "loss": 0.1208, + "step": 382 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 3.509914333448296, + "learning_rate": 4.466581345124605e-06, + "loss": 0.138, + "step": 384 + }, + { + "epoch": 0.23197115384615385, + "grad_norm": 1.9711436203178656, + "learning_rate": 4.460594699527695e-06, + "loss": 0.1263, + "step": 386 + }, + { + "epoch": 0.23317307692307693, + "grad_norm": 3.331060762805983, + "learning_rate": 4.454578706170075e-06, + "loss": 0.1424, + "step": 388 + }, + { + "epoch": 0.234375, + "grad_norm": 2.467895906628356, + "learning_rate": 4.448533455103979e-06, + "loss": 0.1324, + "step": 390 + }, + { + "epoch": 0.23557692307692307, + "grad_norm": 1.5497019030552028, + "learning_rate": 4.442459036819595e-06, + "loss": 0.1319, + "step": 392 + }, + { + "epoch": 0.23677884615384615, + "grad_norm": 2.822888845766881, + "learning_rate": 4.4363555422437095e-06, + "loss": 0.1272, + "step": 394 + }, + { + "epoch": 0.23798076923076922, + "grad_norm": 2.5029600610225695, + "learning_rate": 4.430223062738344e-06, + "loss": 0.128, + "step": 396 + }, + { + "epoch": 0.23918269230769232, + "grad_norm": 1.9911030442643596, + "learning_rate": 4.424061690099392e-06, + "loss": 0.1365, + "step": 398 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 2.062089943745463, + "learning_rate": 4.417871516555241e-06, + "loss": 0.1287, + "step": 400 + }, + { + "epoch": 0.24158653846153846, + "grad_norm": 3.3929814258858784, + "learning_rate": 4.411652634765398e-06, + "loss": 0.1354, + "step": 402 + }, + { + "epoch": 0.24278846153846154, + "grad_norm": 2.47220919536628, + "learning_rate": 4.4054051378190915e-06, + "loss": 0.1243, + "step": 404 + }, + { + "epoch": 0.2439903846153846, + "grad_norm": 4.8206335139544265, + "learning_rate": 4.39912911923389e-06, + "loss": 0.1225, + "step": 406 + }, + { + "epoch": 0.24519230769230768, + "grad_norm": 6.242330191426054, + "learning_rate": 4.392824672954295e-06, + "loss": 0.1495, + "step": 408 + }, + { + "epoch": 0.24639423076923078, + "grad_norm": 4.388543194744337, + "learning_rate": 4.386491893350334e-06, + "loss": 0.1225, + "step": 410 + }, + { + "epoch": 0.24759615384615385, + "grad_norm": 3.190150517704449, + "learning_rate": 4.380130875216156e-06, + "loss": 0.1255, + "step": 412 + }, + { + "epoch": 0.24879807692307693, + "grad_norm": 2.213731831929862, + "learning_rate": 4.373741713768605e-06, + "loss": 0.1356, + "step": 414 + }, + { + "epoch": 0.25, + "grad_norm": 2.7046643388864546, + "learning_rate": 4.367324504645793e-06, + "loss": 0.1374, + "step": 416 + }, + { + "epoch": 0.2512019230769231, + "grad_norm": 2.1009699551445977, + "learning_rate": 4.360879343905677e-06, + "loss": 0.1332, + "step": 418 + }, + { + "epoch": 0.25240384615384615, + "grad_norm": 2.650338768654261, + "learning_rate": 4.354406328024613e-06, + "loss": 0.1314, + "step": 420 + }, + { + "epoch": 0.2536057692307692, + "grad_norm": 2.810331149813075, + "learning_rate": 4.347905553895918e-06, + "loss": 0.1295, + "step": 422 + }, + { + "epoch": 0.2548076923076923, + "grad_norm": 3.814782755239228, + "learning_rate": 4.341377118828415e-06, + "loss": 0.1193, + "step": 424 + }, + { + "epoch": 0.25600961538461536, + "grad_norm": 2.9998629650762405, + "learning_rate": 4.33482112054498e-06, + "loss": 0.131, + "step": 426 + }, + { + "epoch": 0.25721153846153844, + "grad_norm": 2.7743819788707365, + "learning_rate": 4.3282376571810745e-06, + "loss": 0.1262, + "step": 428 + }, + { + "epoch": 0.25841346153846156, + "grad_norm": 2.9124275491739255, + "learning_rate": 4.32162682728328e-06, + "loss": 0.1256, + "step": 430 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 2.1687486550981805, + "learning_rate": 4.3149887298078275e-06, + "loss": 0.1355, + "step": 432 + }, + { + "epoch": 0.2608173076923077, + "grad_norm": 3.46835598153599, + "learning_rate": 4.308323464119103e-06, + "loss": 0.1294, + "step": 434 + }, + { + "epoch": 0.2620192307692308, + "grad_norm": 3.1610918409226603, + "learning_rate": 4.301631129988174e-06, + "loss": 0.1179, + "step": 436 + }, + { + "epoch": 0.26322115384615385, + "grad_norm": 3.5692561412500914, + "learning_rate": 4.294911827591288e-06, + "loss": 0.1316, + "step": 438 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 3.1685646314642955, + "learning_rate": 4.288165657508377e-06, + "loss": 0.1287, + "step": 440 + }, + { + "epoch": 0.265625, + "grad_norm": 2.6180618695713695, + "learning_rate": 4.281392720721546e-06, + "loss": 0.1225, + "step": 442 + }, + { + "epoch": 0.2668269230769231, + "grad_norm": 1.8843206020979073, + "learning_rate": 4.274593118613569e-06, + "loss": 0.1116, + "step": 444 + }, + { + "epoch": 0.26802884615384615, + "grad_norm": 2.541701491013687, + "learning_rate": 4.267766952966369e-06, + "loss": 0.131, + "step": 446 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 2.7752887478918185, + "learning_rate": 4.260914325959491e-06, + "loss": 0.134, + "step": 448 + }, + { + "epoch": 0.2704326923076923, + "grad_norm": 2.2244522364780748, + "learning_rate": 4.254035340168577e-06, + "loss": 0.1331, + "step": 450 + }, + { + "epoch": 0.27163461538461536, + "grad_norm": 2.1577691411009186, + "learning_rate": 4.247130098563825e-06, + "loss": 0.1356, + "step": 452 + }, + { + "epoch": 0.27283653846153844, + "grad_norm": 2.495807078564746, + "learning_rate": 4.2401987045084544e-06, + "loss": 0.1285, + "step": 454 + }, + { + "epoch": 0.27403846153846156, + "grad_norm": 2.315378410961849, + "learning_rate": 4.233241261757155e-06, + "loss": 0.1314, + "step": 456 + }, + { + "epoch": 0.27524038461538464, + "grad_norm": 2.3360381121240485, + "learning_rate": 4.226257874454535e-06, + "loss": 0.1335, + "step": 458 + }, + { + "epoch": 0.2764423076923077, + "grad_norm": 4.342066939412811, + "learning_rate": 4.219248647133559e-06, + "loss": 0.1407, + "step": 460 + }, + { + "epoch": 0.2776442307692308, + "grad_norm": 2.3663888374606032, + "learning_rate": 4.212213684713987e-06, + "loss": 0.1224, + "step": 462 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 3.0614706455153553, + "learning_rate": 4.205153092500805e-06, + "loss": 0.1229, + "step": 464 + }, + { + "epoch": 0.2800480769230769, + "grad_norm": 2.380259494398439, + "learning_rate": 4.198066976182644e-06, + "loss": 0.1292, + "step": 466 + }, + { + "epoch": 0.28125, + "grad_norm": 4.013842010005791, + "learning_rate": 4.1909554418302e-06, + "loss": 0.134, + "step": 468 + }, + { + "epoch": 0.2824519230769231, + "grad_norm": 1.778945958084193, + "learning_rate": 4.183818595894648e-06, + "loss": 0.1428, + "step": 470 + }, + { + "epoch": 0.28365384615384615, + "grad_norm": 3.750377365680276, + "learning_rate": 4.176656545206046e-06, + "loss": 0.1291, + "step": 472 + }, + { + "epoch": 0.2848557692307692, + "grad_norm": 1.9066583171893872, + "learning_rate": 4.169469396971739e-06, + "loss": 0.1176, + "step": 474 + }, + { + "epoch": 0.2860576923076923, + "grad_norm": 3.7582224188634736, + "learning_rate": 4.16225725877475e-06, + "loss": 0.1249, + "step": 476 + }, + { + "epoch": 0.28725961538461536, + "grad_norm": 2.7825989989563564, + "learning_rate": 4.155020238572174e-06, + "loss": 0.1109, + "step": 478 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 4.879245102252371, + "learning_rate": 4.147758444693557e-06, + "loss": 0.1364, + "step": 480 + }, + { + "epoch": 0.28966346153846156, + "grad_norm": 3.2182991915950394, + "learning_rate": 4.140471985839281e-06, + "loss": 0.1271, + "step": 482 + }, + { + "epoch": 0.29086538461538464, + "grad_norm": 2.166479148262207, + "learning_rate": 4.13316097107893e-06, + "loss": 0.1213, + "step": 484 + }, + { + "epoch": 0.2920673076923077, + "grad_norm": 2.47776248902879, + "learning_rate": 4.125825509849662e-06, + "loss": 0.1193, + "step": 486 + }, + { + "epoch": 0.2932692307692308, + "grad_norm": 2.540451340281278, + "learning_rate": 4.11846571195457e-06, + "loss": 0.119, + "step": 488 + }, + { + "epoch": 0.29447115384615385, + "grad_norm": 3.2230059766589814, + "learning_rate": 4.111081687561036e-06, + "loss": 0.1276, + "step": 490 + }, + { + "epoch": 0.2956730769230769, + "grad_norm": 2.835333516397744, + "learning_rate": 4.103673547199087e-06, + "loss": 0.1241, + "step": 492 + }, + { + "epoch": 0.296875, + "grad_norm": 2.752629007829119, + "learning_rate": 4.096241401759732e-06, + "loss": 0.1239, + "step": 494 + }, + { + "epoch": 0.2980769230769231, + "grad_norm": 1.8919133248892268, + "learning_rate": 4.0887853624933134e-06, + "loss": 0.1239, + "step": 496 + }, + { + "epoch": 0.29927884615384615, + "grad_norm": 2.8561871397763774, + "learning_rate": 4.081305541007832e-06, + "loss": 0.1289, + "step": 498 + }, + { + "epoch": 0.3004807692307692, + "grad_norm": 1.6600940797126917, + "learning_rate": 4.07380204926728e-06, + "loss": 0.1384, + "step": 500 + }, + { + "epoch": 0.3016826923076923, + "grad_norm": 2.404290817625276, + "learning_rate": 4.066274999589967e-06, + "loss": 0.1299, + "step": 502 + }, + { + "epoch": 0.30288461538461536, + "grad_norm": 1.9475394667243153, + "learning_rate": 4.058724504646834e-06, + "loss": 0.1259, + "step": 504 + }, + { + "epoch": 0.30408653846153844, + "grad_norm": 3.0051337393851143, + "learning_rate": 4.051150677459772e-06, + "loss": 0.1237, + "step": 506 + }, + { + "epoch": 0.30528846153846156, + "grad_norm": 2.1578955093603063, + "learning_rate": 4.043553631399928e-06, + "loss": 0.1202, + "step": 508 + }, + { + "epoch": 0.30649038461538464, + "grad_norm": 6.142783994800525, + "learning_rate": 4.035933480186005e-06, + "loss": 0.1347, + "step": 510 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 2.954830082548502, + "learning_rate": 4.028290337882565e-06, + "loss": 0.1437, + "step": 512 + }, + { + "epoch": 0.3088942307692308, + "grad_norm": 2.2019067150054386, + "learning_rate": 4.020624318898319e-06, + "loss": 0.1307, + "step": 514 + }, + { + "epoch": 0.31009615384615385, + "grad_norm": 4.54110621977567, + "learning_rate": 4.012935537984414e-06, + "loss": 0.1335, + "step": 516 + }, + { + "epoch": 0.3112980769230769, + "grad_norm": 2.85625320530939, + "learning_rate": 4.005224110232715e-06, + "loss": 0.1317, + "step": 518 + }, + { + "epoch": 0.3125, + "grad_norm": 2.3576527639858895, + "learning_rate": 3.997490151074085e-06, + "loss": 0.1284, + "step": 520 + }, + { + "epoch": 0.3137019230769231, + "grad_norm": 2.4537651255404214, + "learning_rate": 3.989733776276654e-06, + "loss": 0.1211, + "step": 522 + }, + { + "epoch": 0.31490384615384615, + "grad_norm": 3.352379617409583, + "learning_rate": 3.981955101944088e-06, + "loss": 0.1223, + "step": 524 + }, + { + "epoch": 0.3161057692307692, + "grad_norm": 2.233759105251149, + "learning_rate": 3.9741542445138505e-06, + "loss": 0.1279, + "step": 526 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 3.26893899586464, + "learning_rate": 3.966331320755457e-06, + "loss": 0.1308, + "step": 528 + }, + { + "epoch": 0.31850961538461536, + "grad_norm": 1.9938930635011727, + "learning_rate": 3.958486447768736e-06, + "loss": 0.1191, + "step": 530 + }, + { + "epoch": 0.31971153846153844, + "grad_norm": 1.6739004551575976, + "learning_rate": 3.95061974298206e-06, + "loss": 0.1114, + "step": 532 + }, + { + "epoch": 0.32091346153846156, + "grad_norm": 2.1002098020462574, + "learning_rate": 3.942731324150606e-06, + "loss": 0.1229, + "step": 534 + }, + { + "epoch": 0.32211538461538464, + "grad_norm": 3.227784545692672, + "learning_rate": 3.934821309354581e-06, + "loss": 0.1282, + "step": 536 + }, + { + "epoch": 0.3233173076923077, + "grad_norm": 2.6927966633468134, + "learning_rate": 3.926889816997457e-06, + "loss": 0.1274, + "step": 538 + }, + { + "epoch": 0.3245192307692308, + "grad_norm": 3.841849853659577, + "learning_rate": 3.9189369658042e-06, + "loss": 0.1316, + "step": 540 + }, + { + "epoch": 0.32572115384615385, + "grad_norm": 2.6872062042849727, + "learning_rate": 3.910962874819495e-06, + "loss": 0.1275, + "step": 542 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 3.6657337480434946, + "learning_rate": 3.9029676634059565e-06, + "loss": 0.1254, + "step": 544 + }, + { + "epoch": 0.328125, + "grad_norm": 2.8137841340293352, + "learning_rate": 3.894951451242351e-06, + "loss": 0.1316, + "step": 546 + }, + { + "epoch": 0.3293269230769231, + "grad_norm": 1.5503149824535458, + "learning_rate": 3.886914358321796e-06, + "loss": 0.1199, + "step": 548 + }, + { + "epoch": 0.33052884615384615, + "grad_norm": 1.9124225846435765, + "learning_rate": 3.8788565049499746e-06, + "loss": 0.1144, + "step": 550 + }, + { + "epoch": 0.3317307692307692, + "grad_norm": 2.2194257928538974, + "learning_rate": 3.8707780117433276e-06, + "loss": 0.1203, + "step": 552 + }, + { + "epoch": 0.3329326923076923, + "grad_norm": 2.2430374522475556, + "learning_rate": 3.8626789996272466e-06, + "loss": 0.1254, + "step": 554 + }, + { + "epoch": 0.33413461538461536, + "grad_norm": 1.656547967694163, + "learning_rate": 3.854559589834269e-06, + "loss": 0.1155, + "step": 556 + }, + { + "epoch": 0.33533653846153844, + "grad_norm": 2.71535491729536, + "learning_rate": 3.846419903902261e-06, + "loss": 0.1248, + "step": 558 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 4.963796667259708, + "learning_rate": 3.838260063672599e-06, + "loss": 0.1201, + "step": 560 + }, + { + "epoch": 0.33774038461538464, + "grad_norm": 2.02605376529183, + "learning_rate": 3.830080191288342e-06, + "loss": 0.1264, + "step": 562 + }, + { + "epoch": 0.3389423076923077, + "grad_norm": 2.2760213880197124, + "learning_rate": 3.82188040919241e-06, + "loss": 0.1121, + "step": 564 + }, + { + "epoch": 0.3401442307692308, + "grad_norm": 2.458521927082506, + "learning_rate": 3.813660840125747e-06, + "loss": 0.1322, + "step": 566 + }, + { + "epoch": 0.34134615384615385, + "grad_norm": 4.210654399847963, + "learning_rate": 3.805421607125482e-06, + "loss": 0.128, + "step": 568 + }, + { + "epoch": 0.3425480769230769, + "grad_norm": 2.161926215111614, + "learning_rate": 3.7971628335230932e-06, + "loss": 0.13, + "step": 570 + }, + { + "epoch": 0.34375, + "grad_norm": 3.904255891368641, + "learning_rate": 3.788884642942555e-06, + "loss": 0.1317, + "step": 572 + }, + { + "epoch": 0.3449519230769231, + "grad_norm": 2.71934531169795, + "learning_rate": 3.780587159298492e-06, + "loss": 0.1359, + "step": 574 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 2.8064237134830274, + "learning_rate": 3.7722705067943227e-06, + "loss": 0.133, + "step": 576 + }, + { + "epoch": 0.3473557692307692, + "grad_norm": 2.5669808093942272, + "learning_rate": 3.763934809920401e-06, + "loss": 0.1312, + "step": 578 + }, + { + "epoch": 0.3485576923076923, + "grad_norm": 2.6878698838883883, + "learning_rate": 3.755580193452153e-06, + "loss": 0.126, + "step": 580 + }, + { + "epoch": 0.34975961538461536, + "grad_norm": 1.9940547887564615, + "learning_rate": 3.747206782448207e-06, + "loss": 0.1215, + "step": 582 + }, + { + "epoch": 0.35096153846153844, + "grad_norm": 2.4246119443294147, + "learning_rate": 3.738814702248524e-06, + "loss": 0.1259, + "step": 584 + }, + { + "epoch": 0.35216346153846156, + "grad_norm": 2.448624947878468, + "learning_rate": 3.7304040784725183e-06, + "loss": 0.1265, + "step": 586 + }, + { + "epoch": 0.35336538461538464, + "grad_norm": 2.6611405194352544, + "learning_rate": 3.7219750370171843e-06, + "loss": 0.1258, + "step": 588 + }, + { + "epoch": 0.3545673076923077, + "grad_norm": 3.9151580028753092, + "learning_rate": 3.7135277040552014e-06, + "loss": 0.1269, + "step": 590 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 1.902396245377977, + "learning_rate": 3.7050622060330553e-06, + "loss": 0.1269, + "step": 592 + }, + { + "epoch": 0.35697115384615385, + "grad_norm": 2.200109114807576, + "learning_rate": 3.6965786696691386e-06, + "loss": 0.1297, + "step": 594 + }, + { + "epoch": 0.3581730769230769, + "grad_norm": 2.640515221983352, + "learning_rate": 3.688077221951857e-06, + "loss": 0.1217, + "step": 596 + }, + { + "epoch": 0.359375, + "grad_norm": 2.9478456557798194, + "learning_rate": 3.6795579901377277e-06, + "loss": 0.1206, + "step": 598 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 4.499371410793944, + "learning_rate": 3.671021101749476e-06, + "loss": 0.1159, + "step": 600 + }, + { + "epoch": 0.36177884615384615, + "grad_norm": 3.2861013876529266, + "learning_rate": 3.662466684574122e-06, + "loss": 0.1147, + "step": 602 + }, + { + "epoch": 0.3629807692307692, + "grad_norm": 2.936797344536718, + "learning_rate": 3.653894866661073e-06, + "loss": 0.1218, + "step": 604 + }, + { + "epoch": 0.3641826923076923, + "grad_norm": 2.5284722183745347, + "learning_rate": 3.645305776320205e-06, + "loss": 0.1277, + "step": 606 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 2.0656418154561416, + "learning_rate": 3.636699542119939e-06, + "loss": 0.1226, + "step": 608 + }, + { + "epoch": 0.36658653846153844, + "grad_norm": 2.761257208121012, + "learning_rate": 3.628076292885322e-06, + "loss": 0.1176, + "step": 610 + }, + { + "epoch": 0.36778846153846156, + "grad_norm": 4.409264331933305, + "learning_rate": 3.6194361576960944e-06, + "loss": 0.1303, + "step": 612 + }, + { + "epoch": 0.36899038461538464, + "grad_norm": 2.2897088881849483, + "learning_rate": 3.6107792658847597e-06, + "loss": 0.1166, + "step": 614 + }, + { + "epoch": 0.3701923076923077, + "grad_norm": 2.556001831241419, + "learning_rate": 3.602105747034646e-06, + "loss": 0.1238, + "step": 616 + }, + { + "epoch": 0.3713942307692308, + "grad_norm": 2.3832438875718442, + "learning_rate": 3.5934157309779714e-06, + "loss": 0.1189, + "step": 618 + }, + { + "epoch": 0.37259615384615385, + "grad_norm": 2.256691965422808, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.1324, + "step": 620 + }, + { + "epoch": 0.3737980769230769, + "grad_norm": 4.4764970926214325, + "learning_rate": 3.5759867278065752e-06, + "loss": 0.1266, + "step": 622 + }, + { + "epoch": 0.375, + "grad_norm": 2.8438597379920045, + "learning_rate": 3.5672480015832117e-06, + "loss": 0.1258, + "step": 624 + }, + { + "epoch": 0.3762019230769231, + "grad_norm": 2.5547304438348193, + "learning_rate": 3.5584932999320986e-06, + "loss": 0.1189, + "step": 626 + }, + { + "epoch": 0.37740384615384615, + "grad_norm": 3.861193208078938, + "learning_rate": 3.549722753900662e-06, + "loss": 0.12, + "step": 628 + }, + { + "epoch": 0.3786057692307692, + "grad_norm": 2.0271164351237076, + "learning_rate": 3.5409364947734994e-06, + "loss": 0.1034, + "step": 630 + }, + { + "epoch": 0.3798076923076923, + "grad_norm": 2.661574124686293, + "learning_rate": 3.532134654070415e-06, + "loss": 0.1179, + "step": 632 + }, + { + "epoch": 0.38100961538461536, + "grad_norm": 4.444020843792755, + "learning_rate": 3.523317363544449e-06, + "loss": 0.1383, + "step": 634 + }, + { + "epoch": 0.38221153846153844, + "grad_norm": 2.0898293018736145, + "learning_rate": 3.5144847551799105e-06, + "loss": 0.128, + "step": 636 + }, + { + "epoch": 0.38341346153846156, + "grad_norm": 6.381896171861657, + "learning_rate": 3.5056369611903945e-06, + "loss": 0.135, + "step": 638 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 3.3029527185373913, + "learning_rate": 3.496774114016809e-06, + "loss": 0.1367, + "step": 640 + }, + { + "epoch": 0.3858173076923077, + "grad_norm": 2.3200365246792094, + "learning_rate": 3.487896346325389e-06, + "loss": 0.1244, + "step": 642 + }, + { + "epoch": 0.3870192307692308, + "grad_norm": 3.598439324678028, + "learning_rate": 3.4790037910057128e-06, + "loss": 0.131, + "step": 644 + }, + { + "epoch": 0.38822115384615385, + "grad_norm": 1.4871335164149173, + "learning_rate": 3.4700965811687106e-06, + "loss": 0.1194, + "step": 646 + }, + { + "epoch": 0.3894230769230769, + "grad_norm": 2.4184023479090024, + "learning_rate": 3.461174850144674e-06, + "loss": 0.1213, + "step": 648 + }, + { + "epoch": 0.390625, + "grad_norm": 3.436257185320764, + "learning_rate": 3.4522387314812606e-06, + "loss": 0.1324, + "step": 650 + }, + { + "epoch": 0.3918269230769231, + "grad_norm": 1.8151625861479124, + "learning_rate": 3.443288358941491e-06, + "loss": 0.1108, + "step": 652 + }, + { + "epoch": 0.39302884615384615, + "grad_norm": 1.5261810547328365, + "learning_rate": 3.4343238665017512e-06, + "loss": 0.1105, + "step": 654 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 2.8091934186049063, + "learning_rate": 3.425345388349787e-06, + "loss": 0.1348, + "step": 656 + }, + { + "epoch": 0.3954326923076923, + "grad_norm": 2.002504867469609, + "learning_rate": 3.4163530588826877e-06, + "loss": 0.1075, + "step": 658 + }, + { + "epoch": 0.39663461538461536, + "grad_norm": 1.925848303593358, + "learning_rate": 3.4073470127048867e-06, + "loss": 0.121, + "step": 660 + }, + { + "epoch": 0.39783653846153844, + "grad_norm": 3.4486630510150134, + "learning_rate": 3.3983273846261373e-06, + "loss": 0.13, + "step": 662 + }, + { + "epoch": 0.39903846153846156, + "grad_norm": 2.29190337434423, + "learning_rate": 3.3892943096594968e-06, + "loss": 0.1175, + "step": 664 + }, + { + "epoch": 0.40024038461538464, + "grad_norm": 2.7382806950058574, + "learning_rate": 3.3802479230193074e-06, + "loss": 0.1355, + "step": 666 + }, + { + "epoch": 0.4014423076923077, + "grad_norm": 3.8969395559370286, + "learning_rate": 3.371188360119173e-06, + "loss": 0.1265, + "step": 668 + }, + { + "epoch": 0.4026442307692308, + "grad_norm": 2.0972867493422567, + "learning_rate": 3.3621157565699265e-06, + "loss": 0.1182, + "step": 670 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 3.7477223788217673, + "learning_rate": 3.3530302481776062e-06, + "loss": 0.1147, + "step": 672 + }, + { + "epoch": 0.4050480769230769, + "grad_norm": 2.585644020351654, + "learning_rate": 3.343931970941421e-06, + "loss": 0.1184, + "step": 674 + }, + { + "epoch": 0.40625, + "grad_norm": 2.6033563821440664, + "learning_rate": 3.3348210610517117e-06, + "loss": 0.1221, + "step": 676 + }, + { + "epoch": 0.4074519230769231, + "grad_norm": 3.1763777004125067, + "learning_rate": 3.3256976548879183e-06, + "loss": 0.1149, + "step": 678 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 2.7352894929472535, + "learning_rate": 3.3165618890165306e-06, + "loss": 0.1205, + "step": 680 + }, + { + "epoch": 0.4098557692307692, + "grad_norm": 3.574807534485726, + "learning_rate": 3.307413900189054e-06, + "loss": 0.1073, + "step": 682 + }, + { + "epoch": 0.4110576923076923, + "grad_norm": 3.311593916021147, + "learning_rate": 3.29825382533995e-06, + "loss": 0.1152, + "step": 684 + }, + { + "epoch": 0.41225961538461536, + "grad_norm": 2.6214370492688692, + "learning_rate": 3.289081801584601e-06, + "loss": 0.1178, + "step": 686 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 2.28098423314985, + "learning_rate": 3.2798979662172446e-06, + "loss": 0.1175, + "step": 688 + }, + { + "epoch": 0.41466346153846156, + "grad_norm": 4.235250427718613, + "learning_rate": 3.2707024567089267e-06, + "loss": 0.1504, + "step": 690 + }, + { + "epoch": 0.41586538461538464, + "grad_norm": 1.9122767567805194, + "learning_rate": 3.2614954107054405e-06, + "loss": 0.1294, + "step": 692 + }, + { + "epoch": 0.4170673076923077, + "grad_norm": 3.054582085992648, + "learning_rate": 3.2522769660252673e-06, + "loss": 0.1223, + "step": 694 + }, + { + "epoch": 0.4182692307692308, + "grad_norm": 1.6351923608702348, + "learning_rate": 3.243047260657511e-06, + "loss": 0.1197, + "step": 696 + }, + { + "epoch": 0.41947115384615385, + "grad_norm": 2.7477487145437576, + "learning_rate": 3.233806432759837e-06, + "loss": 0.1293, + "step": 698 + }, + { + "epoch": 0.4206730769230769, + "grad_norm": 2.4016286502537505, + "learning_rate": 3.2245546206564015e-06, + "loss": 0.1154, + "step": 700 + }, + { + "epoch": 0.421875, + "grad_norm": 1.9800234381047233, + "learning_rate": 3.215291962835779e-06, + "loss": 0.123, + "step": 702 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 3.217074666511334, + "learning_rate": 3.206018597948893e-06, + "loss": 0.1208, + "step": 704 + }, + { + "epoch": 0.42427884615384615, + "grad_norm": 3.25172973443265, + "learning_rate": 3.1967346648069397e-06, + "loss": 0.1244, + "step": 706 + }, + { + "epoch": 0.4254807692307692, + "grad_norm": 2.2450714988867353, + "learning_rate": 3.1874403023793078e-06, + "loss": 0.1179, + "step": 708 + }, + { + "epoch": 0.4266826923076923, + "grad_norm": 3.2488238286410875, + "learning_rate": 3.1781356497914995e-06, + "loss": 0.1245, + "step": 710 + }, + { + "epoch": 0.42788461538461536, + "grad_norm": 2.218601857724757, + "learning_rate": 3.168820846323053e-06, + "loss": 0.1251, + "step": 712 + }, + { + "epoch": 0.42908653846153844, + "grad_norm": 2.088964444672931, + "learning_rate": 3.1594960314054455e-06, + "loss": 0.1193, + "step": 714 + }, + { + "epoch": 0.43028846153846156, + "grad_norm": 4.741704269019802, + "learning_rate": 3.150161344620021e-06, + "loss": 0.1322, + "step": 716 + }, + { + "epoch": 0.43149038461538464, + "grad_norm": 3.493342583852878, + "learning_rate": 3.1408169256958888e-06, + "loss": 0.1278, + "step": 718 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 2.351714268349835, + "learning_rate": 3.1314629145078377e-06, + "loss": 0.116, + "step": 720 + }, + { + "epoch": 0.4338942307692308, + "grad_norm": 3.8649842638324015, + "learning_rate": 3.1220994510742432e-06, + "loss": 0.1297, + "step": 722 + }, + { + "epoch": 0.43509615384615385, + "grad_norm": 2.841739719188719, + "learning_rate": 3.1127266755549673e-06, + "loss": 0.1238, + "step": 724 + }, + { + "epoch": 0.4362980769230769, + "grad_norm": 2.0373254493345843, + "learning_rate": 3.1033447282492645e-06, + "loss": 0.1339, + "step": 726 + }, + { + "epoch": 0.4375, + "grad_norm": 1.8332876940880098, + "learning_rate": 3.0939537495936784e-06, + "loss": 0.1255, + "step": 728 + }, + { + "epoch": 0.4387019230769231, + "grad_norm": 1.9574438212255216, + "learning_rate": 3.0845538801599423e-06, + "loss": 0.1197, + "step": 730 + }, + { + "epoch": 0.43990384615384615, + "grad_norm": 1.7871551779346857, + "learning_rate": 3.075145260652873e-06, + "loss": 0.1344, + "step": 732 + }, + { + "epoch": 0.4411057692307692, + "grad_norm": 3.6706640863007416, + "learning_rate": 3.0657280319082657e-06, + "loss": 0.116, + "step": 734 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 1.6394420662743008, + "learning_rate": 3.056302334890786e-06, + "loss": 0.123, + "step": 736 + }, + { + "epoch": 0.44350961538461536, + "grad_norm": 1.8174034087550737, + "learning_rate": 3.0468683106918608e-06, + "loss": 0.1203, + "step": 738 + }, + { + "epoch": 0.44471153846153844, + "grad_norm": 2.028279546605494, + "learning_rate": 3.0374261005275606e-06, + "loss": 0.1153, + "step": 740 + }, + { + "epoch": 0.44591346153846156, + "grad_norm": 3.1742172663448893, + "learning_rate": 3.0279758457364943e-06, + "loss": 0.1119, + "step": 742 + }, + { + "epoch": 0.44711538461538464, + "grad_norm": 2.1542693819149994, + "learning_rate": 3.018517687777688e-06, + "loss": 0.1152, + "step": 744 + }, + { + "epoch": 0.4483173076923077, + "grad_norm": 4.6204720149874605, + "learning_rate": 3.009051768228468e-06, + "loss": 0.1297, + "step": 746 + }, + { + "epoch": 0.4495192307692308, + "grad_norm": 2.0445376227310095, + "learning_rate": 2.9995782287823428e-06, + "loss": 0.115, + "step": 748 + }, + { + "epoch": 0.45072115384615385, + "grad_norm": 2.320840534894566, + "learning_rate": 2.9900972112468823e-06, + "loss": 0.1257, + "step": 750 + }, + { + "epoch": 0.4519230769230769, + "grad_norm": 4.0732649101420915, + "learning_rate": 2.9806088575415926e-06, + "loss": 0.1182, + "step": 752 + }, + { + "epoch": 0.453125, + "grad_norm": 3.8261178694802327, + "learning_rate": 2.971113309695796e-06, + "loss": 0.1202, + "step": 754 + }, + { + "epoch": 0.4543269230769231, + "grad_norm": 2.393271946060094, + "learning_rate": 2.961610709846501e-06, + "loss": 0.1171, + "step": 756 + }, + { + "epoch": 0.45552884615384615, + "grad_norm": 1.8371462666695046, + "learning_rate": 2.9521012002362766e-06, + "loss": 0.1142, + "step": 758 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 2.08485758756134, + "learning_rate": 2.942584923211121e-06, + "loss": 0.1154, + "step": 760 + }, + { + "epoch": 0.4579326923076923, + "grad_norm": 2.6562279999651257, + "learning_rate": 2.933062021218337e-06, + "loss": 0.1063, + "step": 762 + }, + { + "epoch": 0.45913461538461536, + "grad_norm": 2.533470915365061, + "learning_rate": 2.9235326368043885e-06, + "loss": 0.1135, + "step": 764 + }, + { + "epoch": 0.46033653846153844, + "grad_norm": 2.4011631762333905, + "learning_rate": 2.9139969126127803e-06, + "loss": 0.1134, + "step": 766 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 2.252103330371488, + "learning_rate": 2.9044549913819125e-06, + "loss": 0.1329, + "step": 768 + }, + { + "epoch": 0.46274038461538464, + "grad_norm": 2.111392303163354, + "learning_rate": 2.8949070159429473e-06, + "loss": 0.1167, + "step": 770 + }, + { + "epoch": 0.4639423076923077, + "grad_norm": 2.10465453166218, + "learning_rate": 2.885353129217671e-06, + "loss": 0.1294, + "step": 772 + }, + { + "epoch": 0.4651442307692308, + "grad_norm": 1.7606762750864913, + "learning_rate": 2.875793474216358e-06, + "loss": 0.1195, + "step": 774 + }, + { + "epoch": 0.46634615384615385, + "grad_norm": 3.4911755377127665, + "learning_rate": 2.8662281940356234e-06, + "loss": 0.1197, + "step": 776 + }, + { + "epoch": 0.4675480769230769, + "grad_norm": 2.485129458685194, + "learning_rate": 2.8566574318562855e-06, + "loss": 0.1257, + "step": 778 + }, + { + "epoch": 0.46875, + "grad_norm": 3.0980789105745536, + "learning_rate": 2.8470813309412222e-06, + "loss": 0.1159, + "step": 780 + }, + { + "epoch": 0.4699519230769231, + "grad_norm": 2.06101810490773, + "learning_rate": 2.8375000346332256e-06, + "loss": 0.1114, + "step": 782 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 2.5211271193230567, + "learning_rate": 2.827913686352856e-06, + "loss": 0.1278, + "step": 784 + }, + { + "epoch": 0.4723557692307692, + "grad_norm": 2.1529408157219825, + "learning_rate": 2.818322429596297e-06, + "loss": 0.1206, + "step": 786 + }, + { + "epoch": 0.4735576923076923, + "grad_norm": 2.366887732661358, + "learning_rate": 2.808726407933205e-06, + "loss": 0.1149, + "step": 788 + }, + { + "epoch": 0.47475961538461536, + "grad_norm": 2.16343980990941, + "learning_rate": 2.7991257650045606e-06, + "loss": 0.1208, + "step": 790 + }, + { + "epoch": 0.47596153846153844, + "grad_norm": 2.8342000182216345, + "learning_rate": 2.7895206445205226e-06, + "loss": 0.1217, + "step": 792 + }, + { + "epoch": 0.47716346153846156, + "grad_norm": 1.852391269800072, + "learning_rate": 2.7799111902582697e-06, + "loss": 0.1155, + "step": 794 + }, + { + "epoch": 0.47836538461538464, + "grad_norm": 2.5799284357343484, + "learning_rate": 2.7702975460598545e-06, + "loss": 0.1283, + "step": 796 + }, + { + "epoch": 0.4795673076923077, + "grad_norm": 1.881492308096937, + "learning_rate": 2.760679855830047e-06, + "loss": 0.1081, + "step": 798 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 2.5186830859263436, + "learning_rate": 2.7510582635341815e-06, + "loss": 0.1187, + "step": 800 + }, + { + "epoch": 0.48197115384615385, + "grad_norm": 2.6559499054158615, + "learning_rate": 2.7414329131960004e-06, + "loss": 0.1233, + "step": 802 + }, + { + "epoch": 0.4831730769230769, + "grad_norm": 4.62630178829242, + "learning_rate": 2.731803948895503e-06, + "loss": 0.124, + "step": 804 + }, + { + "epoch": 0.484375, + "grad_norm": 1.914060815314394, + "learning_rate": 2.722171514766781e-06, + "loss": 0.1123, + "step": 806 + }, + { + "epoch": 0.4855769230769231, + "grad_norm": 2.4270202069774145, + "learning_rate": 2.7125357549958687e-06, + "loss": 0.1287, + "step": 808 + }, + { + "epoch": 0.48677884615384615, + "grad_norm": 2.1602891567758746, + "learning_rate": 2.7028968138185783e-06, + "loss": 0.1143, + "step": 810 + }, + { + "epoch": 0.4879807692307692, + "grad_norm": 2.68436864433482, + "learning_rate": 2.6932548355183476e-06, + "loss": 0.1166, + "step": 812 + }, + { + "epoch": 0.4891826923076923, + "grad_norm": 2.4944384936946196, + "learning_rate": 2.6836099644240727e-06, + "loss": 0.1133, + "step": 814 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 1.8613202081753457, + "learning_rate": 2.673962344907953e-06, + "loss": 0.109, + "step": 816 + }, + { + "epoch": 0.49158653846153844, + "grad_norm": 2.219693506080579, + "learning_rate": 2.6643121213833306e-06, + "loss": 0.1145, + "step": 818 + }, + { + "epoch": 0.49278846153846156, + "grad_norm": 2.8619481470099117, + "learning_rate": 2.6546594383025214e-06, + "loss": 0.1115, + "step": 820 + }, + { + "epoch": 0.49399038461538464, + "grad_norm": 2.666948981163753, + "learning_rate": 2.6450044401546632e-06, + "loss": 0.1305, + "step": 822 + }, + { + "epoch": 0.4951923076923077, + "grad_norm": 2.09947237601635, + "learning_rate": 2.6353472714635443e-06, + "loss": 0.1099, + "step": 824 + }, + { + "epoch": 0.4963942307692308, + "grad_norm": 1.9473099754220278, + "learning_rate": 2.625688076785445e-06, + "loss": 0.1208, + "step": 826 + }, + { + "epoch": 0.49759615384615385, + "grad_norm": 2.0864251934157774, + "learning_rate": 2.6160270007069703e-06, + "loss": 0.1257, + "step": 828 + }, + { + "epoch": 0.4987980769230769, + "grad_norm": 1.9893158881100514, + "learning_rate": 2.606364187842891e-06, + "loss": 0.1264, + "step": 830 + }, + { + "epoch": 0.5, + "grad_norm": 2.3582369528291083, + "learning_rate": 2.5966997828339724e-06, + "loss": 0.1147, + "step": 832 + }, + { + "epoch": 0.5012019230769231, + "grad_norm": 2.146899490039593, + "learning_rate": 2.5870339303448127e-06, + "loss": 0.1152, + "step": 834 + }, + { + "epoch": 0.5024038461538461, + "grad_norm": 1.990060058415754, + "learning_rate": 2.5773667750616783e-06, + "loss": 0.1041, + "step": 836 + }, + { + "epoch": 0.5036057692307693, + "grad_norm": 2.178599936980344, + "learning_rate": 2.5676984616903367e-06, + "loss": 0.1286, + "step": 838 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 2.562250086026024, + "learning_rate": 2.5580291349538895e-06, + "loss": 0.1146, + "step": 840 + }, + { + "epoch": 0.5060096153846154, + "grad_norm": 2.4548795707580418, + "learning_rate": 2.5483589395906084e-06, + "loss": 0.1232, + "step": 842 + }, + { + "epoch": 0.5072115384615384, + "grad_norm": 2.0576956102764536, + "learning_rate": 2.5386880203517665e-06, + "loss": 0.1091, + "step": 844 + }, + { + "epoch": 0.5084134615384616, + "grad_norm": 1.7798937747570411, + "learning_rate": 2.5290165219994734e-06, + "loss": 0.122, + "step": 846 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 3.2665103557785473, + "learning_rate": 2.5193445893045054e-06, + "loss": 0.119, + "step": 848 + }, + { + "epoch": 0.5108173076923077, + "grad_norm": 2.3751458473034175, + "learning_rate": 2.5096723670441437e-06, + "loss": 0.1161, + "step": 850 + }, + { + "epoch": 0.5120192307692307, + "grad_norm": 1.7591316722682409, + "learning_rate": 2.5e-06, + "loss": 0.1151, + "step": 852 + }, + { + "epoch": 0.5132211538461539, + "grad_norm": 2.2115382855282464, + "learning_rate": 2.4903276329558567e-06, + "loss": 0.1313, + "step": 854 + }, + { + "epoch": 0.5144230769230769, + "grad_norm": 3.714925572378303, + "learning_rate": 2.480655410695495e-06, + "loss": 0.118, + "step": 856 + }, + { + "epoch": 0.515625, + "grad_norm": 2.292092125779591, + "learning_rate": 2.4709834780005283e-06, + "loss": 0.1105, + "step": 858 + }, + { + "epoch": 0.5168269230769231, + "grad_norm": 2.8062030763080066, + "learning_rate": 2.4613119796482343e-06, + "loss": 0.1279, + "step": 860 + }, + { + "epoch": 0.5180288461538461, + "grad_norm": 3.0016696690528684, + "learning_rate": 2.4516410604093924e-06, + "loss": 0.124, + "step": 862 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 2.6910032305249776, + "learning_rate": 2.441970865046111e-06, + "loss": 0.1164, + "step": 864 + }, + { + "epoch": 0.5204326923076923, + "grad_norm": 2.790603434708355, + "learning_rate": 2.4323015383096645e-06, + "loss": 0.1284, + "step": 866 + }, + { + "epoch": 0.5216346153846154, + "grad_norm": 1.88418937937736, + "learning_rate": 2.422633224938323e-06, + "loss": 0.1197, + "step": 868 + }, + { + "epoch": 0.5228365384615384, + "grad_norm": 2.268135867297592, + "learning_rate": 2.412966069655188e-06, + "loss": 0.1087, + "step": 870 + }, + { + "epoch": 0.5240384615384616, + "grad_norm": 1.7727390247554256, + "learning_rate": 2.403300217166028e-06, + "loss": 0.1047, + "step": 872 + }, + { + "epoch": 0.5252403846153846, + "grad_norm": 2.6452934155833, + "learning_rate": 2.39363581215711e-06, + "loss": 0.1209, + "step": 874 + }, + { + "epoch": 0.5264423076923077, + "grad_norm": 2.231648348284633, + "learning_rate": 2.38397299929303e-06, + "loss": 0.1225, + "step": 876 + }, + { + "epoch": 0.5276442307692307, + "grad_norm": 3.1589321862401323, + "learning_rate": 2.374311923214556e-06, + "loss": 0.1278, + "step": 878 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 2.538600702095854, + "learning_rate": 2.3646527285364565e-06, + "loss": 0.1133, + "step": 880 + }, + { + "epoch": 0.5300480769230769, + "grad_norm": 1.8010218639998627, + "learning_rate": 2.3549955598453384e-06, + "loss": 0.1102, + "step": 882 + }, + { + "epoch": 0.53125, + "grad_norm": 1.9859781619064247, + "learning_rate": 2.3453405616974794e-06, + "loss": 0.1223, + "step": 884 + }, + { + "epoch": 0.5324519230769231, + "grad_norm": 2.48433192428649, + "learning_rate": 2.3356878786166703e-06, + "loss": 0.1276, + "step": 886 + }, + { + "epoch": 0.5336538461538461, + "grad_norm": 1.9977524593562115, + "learning_rate": 2.3260376550920472e-06, + "loss": 0.1219, + "step": 888 + }, + { + "epoch": 0.5348557692307693, + "grad_norm": 1.9708632642889377, + "learning_rate": 2.3163900355759277e-06, + "loss": 0.117, + "step": 890 + }, + { + "epoch": 0.5360576923076923, + "grad_norm": 2.3393380650509146, + "learning_rate": 2.3067451644816537e-06, + "loss": 0.1328, + "step": 892 + }, + { + "epoch": 0.5372596153846154, + "grad_norm": 2.016237150348988, + "learning_rate": 2.2971031861814225e-06, + "loss": 0.115, + "step": 894 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 2.254008950804627, + "learning_rate": 2.287464245004132e-06, + "loss": 0.1184, + "step": 896 + }, + { + "epoch": 0.5396634615384616, + "grad_norm": 3.2043282624716403, + "learning_rate": 2.27782848523322e-06, + "loss": 0.1193, + "step": 898 + }, + { + "epoch": 0.5408653846153846, + "grad_norm": 1.4454861904743852, + "learning_rate": 2.268196051104499e-06, + "loss": 0.1104, + "step": 900 + }, + { + "epoch": 0.5420673076923077, + "grad_norm": 2.397925567908216, + "learning_rate": 2.2585670868040004e-06, + "loss": 0.1173, + "step": 902 + }, + { + "epoch": 0.5432692307692307, + "grad_norm": 2.6349415212538503, + "learning_rate": 2.2489417364658194e-06, + "loss": 0.1175, + "step": 904 + }, + { + "epoch": 0.5444711538461539, + "grad_norm": 1.912841995108057, + "learning_rate": 2.2393201441699535e-06, + "loss": 0.1124, + "step": 906 + }, + { + "epoch": 0.5456730769230769, + "grad_norm": 2.2214483754728396, + "learning_rate": 2.2297024539401463e-06, + "loss": 0.1169, + "step": 908 + }, + { + "epoch": 0.546875, + "grad_norm": 2.6645784394778995, + "learning_rate": 2.2200888097417308e-06, + "loss": 0.1124, + "step": 910 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 2.3991327890112757, + "learning_rate": 2.2104793554794783e-06, + "loss": 0.1082, + "step": 912 + }, + { + "epoch": 0.5492788461538461, + "grad_norm": 2.4642009420576487, + "learning_rate": 2.2008742349954394e-06, + "loss": 0.119, + "step": 914 + }, + { + "epoch": 0.5504807692307693, + "grad_norm": 2.5918285453531116, + "learning_rate": 2.1912735920667966e-06, + "loss": 0.1055, + "step": 916 + }, + { + "epoch": 0.5516826923076923, + "grad_norm": 2.0680446180956373, + "learning_rate": 2.181677570403704e-06, + "loss": 0.1109, + "step": 918 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 2.193301046368466, + "learning_rate": 2.1720863136471447e-06, + "loss": 0.1277, + "step": 920 + }, + { + "epoch": 0.5540865384615384, + "grad_norm": 2.5163737723965736, + "learning_rate": 2.162499965366775e-06, + "loss": 0.1219, + "step": 922 + }, + { + "epoch": 0.5552884615384616, + "grad_norm": 3.521848753217605, + "learning_rate": 2.1529186690587786e-06, + "loss": 0.114, + "step": 924 + }, + { + "epoch": 0.5564903846153846, + "grad_norm": 3.069616221629034, + "learning_rate": 2.1433425681437154e-06, + "loss": 0.1071, + "step": 926 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 3.53398612074779, + "learning_rate": 2.1337718059643774e-06, + "loss": 0.1236, + "step": 928 + }, + { + "epoch": 0.5588942307692307, + "grad_norm": 5.058223699592573, + "learning_rate": 2.124206525783643e-06, + "loss": 0.1109, + "step": 930 + }, + { + "epoch": 0.5600961538461539, + "grad_norm": 2.5478159897083352, + "learning_rate": 2.114646870782329e-06, + "loss": 0.1167, + "step": 932 + }, + { + "epoch": 0.5612980769230769, + "grad_norm": 2.354071051813213, + "learning_rate": 2.1050929840570544e-06, + "loss": 0.1011, + "step": 934 + }, + { + "epoch": 0.5625, + "grad_norm": 4.853864942677267, + "learning_rate": 2.0955450086180883e-06, + "loss": 0.116, + "step": 936 + }, + { + "epoch": 0.5637019230769231, + "grad_norm": 2.722700474105122, + "learning_rate": 2.08600308738722e-06, + "loss": 0.1108, + "step": 938 + }, + { + "epoch": 0.5649038461538461, + "grad_norm": 4.630914205750646, + "learning_rate": 2.0764673631956115e-06, + "loss": 0.1172, + "step": 940 + }, + { + "epoch": 0.5661057692307693, + "grad_norm": 2.0844097872671616, + "learning_rate": 2.0669379787816644e-06, + "loss": 0.1086, + "step": 942 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 2.2079993034525147, + "learning_rate": 2.0574150767888795e-06, + "loss": 0.1199, + "step": 944 + }, + { + "epoch": 0.5685096153846154, + "grad_norm": 3.624361624117408, + "learning_rate": 2.0478987997637246e-06, + "loss": 0.1028, + "step": 946 + }, + { + "epoch": 0.5697115384615384, + "grad_norm": 2.304855132990531, + "learning_rate": 2.0383892901534995e-06, + "loss": 0.1143, + "step": 948 + }, + { + "epoch": 0.5709134615384616, + "grad_norm": 2.303913178369359, + "learning_rate": 2.0288866903042055e-06, + "loss": 0.1149, + "step": 950 + }, + { + "epoch": 0.5721153846153846, + "grad_norm": 2.5242944062982944, + "learning_rate": 2.0193911424584082e-06, + "loss": 0.1271, + "step": 952 + }, + { + "epoch": 0.5733173076923077, + "grad_norm": 2.2840209986395643, + "learning_rate": 2.0099027887531186e-06, + "loss": 0.1025, + "step": 954 + }, + { + "epoch": 0.5745192307692307, + "grad_norm": 2.1475236126016757, + "learning_rate": 2.0004217712176576e-06, + "loss": 0.1052, + "step": 956 + }, + { + "epoch": 0.5757211538461539, + "grad_norm": 2.2062922770065625, + "learning_rate": 1.9909482317715335e-06, + "loss": 0.1261, + "step": 958 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 2.573858967297316, + "learning_rate": 1.9814823122223125e-06, + "loss": 0.1206, + "step": 960 + }, + { + "epoch": 0.578125, + "grad_norm": 3.5542312782650267, + "learning_rate": 1.972024154263506e-06, + "loss": 0.118, + "step": 962 + }, + { + "epoch": 0.5793269230769231, + "grad_norm": 3.043328928606157, + "learning_rate": 1.96257389947244e-06, + "loss": 0.1148, + "step": 964 + }, + { + "epoch": 0.5805288461538461, + "grad_norm": 2.186779926589517, + "learning_rate": 1.9531316893081396e-06, + "loss": 0.1028, + "step": 966 + }, + { + "epoch": 0.5817307692307693, + "grad_norm": 1.9507127168704683, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.1069, + "step": 968 + }, + { + "epoch": 0.5829326923076923, + "grad_norm": 2.2374696361826403, + "learning_rate": 1.934271968091735e-06, + "loss": 0.1172, + "step": 970 + }, + { + "epoch": 0.5841346153846154, + "grad_norm": 2.55684351637379, + "learning_rate": 1.924854739347128e-06, + "loss": 0.1084, + "step": 972 + }, + { + "epoch": 0.5853365384615384, + "grad_norm": 2.325113870079778, + "learning_rate": 1.9154461198400585e-06, + "loss": 0.1235, + "step": 974 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 2.6657491779145976, + "learning_rate": 1.9060462504063229e-06, + "loss": 0.1071, + "step": 976 + }, + { + "epoch": 0.5877403846153846, + "grad_norm": 1.956462181600069, + "learning_rate": 1.8966552717507364e-06, + "loss": 0.119, + "step": 978 + }, + { + "epoch": 0.5889423076923077, + "grad_norm": 2.705164990543757, + "learning_rate": 1.8872733244450331e-06, + "loss": 0.1023, + "step": 980 + }, + { + "epoch": 0.5901442307692307, + "grad_norm": 1.9312443310397687, + "learning_rate": 1.8779005489257572e-06, + "loss": 0.1053, + "step": 982 + }, + { + "epoch": 0.5913461538461539, + "grad_norm": 2.855051765002529, + "learning_rate": 1.8685370854921631e-06, + "loss": 0.1072, + "step": 984 + }, + { + "epoch": 0.5925480769230769, + "grad_norm": 2.3926479059613373, + "learning_rate": 1.8591830743041123e-06, + "loss": 0.1226, + "step": 986 + }, + { + "epoch": 0.59375, + "grad_norm": 3.5632611766021465, + "learning_rate": 1.8498386553799802e-06, + "loss": 0.1003, + "step": 988 + }, + { + "epoch": 0.5949519230769231, + "grad_norm": 3.0673131415514803, + "learning_rate": 1.8405039685945547e-06, + "loss": 0.1103, + "step": 990 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 3.875636229689881, + "learning_rate": 1.8311791536769485e-06, + "loss": 0.1301, + "step": 992 + }, + { + "epoch": 0.5973557692307693, + "grad_norm": 3.2290497985605775, + "learning_rate": 1.821864350208501e-06, + "loss": 0.1149, + "step": 994 + }, + { + "epoch": 0.5985576923076923, + "grad_norm": 1.9347455813936323, + "learning_rate": 1.8125596976206933e-06, + "loss": 0.1087, + "step": 996 + }, + { + "epoch": 0.5997596153846154, + "grad_norm": 2.04110053593177, + "learning_rate": 1.8032653351930607e-06, + "loss": 0.112, + "step": 998 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 2.4285768876217637, + "learning_rate": 1.793981402051107e-06, + "loss": 0.1184, + "step": 1000 + }, + { + "epoch": 0.6021634615384616, + "grad_norm": 2.056516574333895, + "learning_rate": 1.7847080371642222e-06, + "loss": 0.1089, + "step": 1002 + }, + { + "epoch": 0.6033653846153846, + "grad_norm": 2.872014799730928, + "learning_rate": 1.7754453793435995e-06, + "loss": 0.1203, + "step": 1004 + }, + { + "epoch": 0.6045673076923077, + "grad_norm": 3.1357344499654225, + "learning_rate": 1.7661935672401635e-06, + "loss": 0.1057, + "step": 1006 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 2.2990506646197684, + "learning_rate": 1.7569527393424894e-06, + "loss": 0.1102, + "step": 1008 + }, + { + "epoch": 0.6069711538461539, + "grad_norm": 1.9080185886472223, + "learning_rate": 1.7477230339747342e-06, + "loss": 0.1128, + "step": 1010 + }, + { + "epoch": 0.6081730769230769, + "grad_norm": 2.4866351605585217, + "learning_rate": 1.7385045892945603e-06, + "loss": 0.1082, + "step": 1012 + }, + { + "epoch": 0.609375, + "grad_norm": 2.408386002877475, + "learning_rate": 1.7292975432910738e-06, + "loss": 0.1065, + "step": 1014 + }, + { + "epoch": 0.6105769230769231, + "grad_norm": 2.3425864575127724, + "learning_rate": 1.7201020337827556e-06, + "loss": 0.0992, + "step": 1016 + }, + { + "epoch": 0.6117788461538461, + "grad_norm": 3.3764842505199897, + "learning_rate": 1.7109181984154e-06, + "loss": 0.0994, + "step": 1018 + }, + { + "epoch": 0.6129807692307693, + "grad_norm": 3.0511908446927105, + "learning_rate": 1.7017461746600506e-06, + "loss": 0.1116, + "step": 1020 + }, + { + "epoch": 0.6141826923076923, + "grad_norm": 2.0835754940115048, + "learning_rate": 1.6925860998109472e-06, + "loss": 0.1027, + "step": 1022 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 2.1369310697938873, + "learning_rate": 1.6834381109834696e-06, + "loss": 0.1082, + "step": 1024 + }, + { + "epoch": 0.6165865384615384, + "grad_norm": 3.633547678025306, + "learning_rate": 1.6743023451120831e-06, + "loss": 0.1179, + "step": 1026 + }, + { + "epoch": 0.6177884615384616, + "grad_norm": 3.222183839199512, + "learning_rate": 1.6651789389482885e-06, + "loss": 0.1047, + "step": 1028 + }, + { + "epoch": 0.6189903846153846, + "grad_norm": 3.373496600017691, + "learning_rate": 1.6560680290585798e-06, + "loss": 0.1174, + "step": 1030 + }, + { + "epoch": 0.6201923076923077, + "grad_norm": 3.355771088882065, + "learning_rate": 1.646969751822394e-06, + "loss": 0.1225, + "step": 1032 + }, + { + "epoch": 0.6213942307692307, + "grad_norm": 2.132501441209862, + "learning_rate": 1.6378842434300746e-06, + "loss": 0.1085, + "step": 1034 + }, + { + "epoch": 0.6225961538461539, + "grad_norm": 1.8201904843022139, + "learning_rate": 1.6288116398808278e-06, + "loss": 0.1072, + "step": 1036 + }, + { + "epoch": 0.6237980769230769, + "grad_norm": 1.9943546756148034, + "learning_rate": 1.619752076980693e-06, + "loss": 0.1175, + "step": 1038 + }, + { + "epoch": 0.625, + "grad_norm": 1.9417071481978827, + "learning_rate": 1.6107056903405038e-06, + "loss": 0.1031, + "step": 1040 + }, + { + "epoch": 0.6262019230769231, + "grad_norm": 2.0812507755491776, + "learning_rate": 1.6016726153738638e-06, + "loss": 0.1181, + "step": 1042 + }, + { + "epoch": 0.6274038461538461, + "grad_norm": 1.9437266222472136, + "learning_rate": 1.5926529872951144e-06, + "loss": 0.1104, + "step": 1044 + }, + { + "epoch": 0.6286057692307693, + "grad_norm": 2.0078937220346265, + "learning_rate": 1.583646941117313e-06, + "loss": 0.1044, + "step": 1046 + }, + { + "epoch": 0.6298076923076923, + "grad_norm": 2.2331084033833366, + "learning_rate": 1.574654611650214e-06, + "loss": 0.1147, + "step": 1048 + }, + { + "epoch": 0.6310096153846154, + "grad_norm": 2.133371687932722, + "learning_rate": 1.5656761334982487e-06, + "loss": 0.1159, + "step": 1050 + }, + { + "epoch": 0.6322115384615384, + "grad_norm": 2.068123773517536, + "learning_rate": 1.5567116410585101e-06, + "loss": 0.1038, + "step": 1052 + }, + { + "epoch": 0.6334134615384616, + "grad_norm": 2.5576918982500683, + "learning_rate": 1.5477612685187405e-06, + "loss": 0.1169, + "step": 1054 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 2.694751080220668, + "learning_rate": 1.5388251498553263e-06, + "loss": 0.1081, + "step": 1056 + }, + { + "epoch": 0.6358173076923077, + "grad_norm": 2.135244446442495, + "learning_rate": 1.52990341883129e-06, + "loss": 0.1075, + "step": 1058 + }, + { + "epoch": 0.6370192307692307, + "grad_norm": 2.1823074476166764, + "learning_rate": 1.5209962089942885e-06, + "loss": 0.1085, + "step": 1060 + }, + { + "epoch": 0.6382211538461539, + "grad_norm": 1.9277746702424785, + "learning_rate": 1.5121036536746119e-06, + "loss": 0.1049, + "step": 1062 + }, + { + "epoch": 0.6394230769230769, + "grad_norm": 2.365543759553611, + "learning_rate": 1.5032258859831916e-06, + "loss": 0.1093, + "step": 1064 + }, + { + "epoch": 0.640625, + "grad_norm": 2.4257341316404406, + "learning_rate": 1.4943630388096055e-06, + "loss": 0.1175, + "step": 1066 + }, + { + "epoch": 0.6418269230769231, + "grad_norm": 2.653293916979889, + "learning_rate": 1.4855152448200901e-06, + "loss": 0.1153, + "step": 1068 + }, + { + "epoch": 0.6430288461538461, + "grad_norm": 2.419944610975381, + "learning_rate": 1.4766826364555514e-06, + "loss": 0.1159, + "step": 1070 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 2.0103810925549626, + "learning_rate": 1.467865345929586e-06, + "loss": 0.1143, + "step": 1072 + }, + { + "epoch": 0.6454326923076923, + "grad_norm": 2.01089727654853, + "learning_rate": 1.4590635052265008e-06, + "loss": 0.1106, + "step": 1074 + }, + { + "epoch": 0.6466346153846154, + "grad_norm": 1.748446439918439, + "learning_rate": 1.4502772460993387e-06, + "loss": 0.1018, + "step": 1076 + }, + { + "epoch": 0.6478365384615384, + "grad_norm": 2.484572897708403, + "learning_rate": 1.4415067000679029e-06, + "loss": 0.1104, + "step": 1078 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 2.4037649077365657, + "learning_rate": 1.4327519984167887e-06, + "loss": 0.1189, + "step": 1080 + }, + { + "epoch": 0.6502403846153846, + "grad_norm": 1.8720994441559204, + "learning_rate": 1.4240132721934256e-06, + "loss": 0.118, + "step": 1082 + }, + { + "epoch": 0.6514423076923077, + "grad_norm": 1.9961620517391614, + "learning_rate": 1.415290652206105e-06, + "loss": 0.1062, + "step": 1084 + }, + { + "epoch": 0.6526442307692307, + "grad_norm": 3.3559687716616, + "learning_rate": 1.4065842690220294e-06, + "loss": 0.1192, + "step": 1086 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 2.1228084765105373, + "learning_rate": 1.3978942529653549e-06, + "loss": 0.0997, + "step": 1088 + }, + { + "epoch": 0.6550480769230769, + "grad_norm": 2.609409554692004, + "learning_rate": 1.3892207341152416e-06, + "loss": 0.1146, + "step": 1090 + }, + { + "epoch": 0.65625, + "grad_norm": 3.084566569938987, + "learning_rate": 1.3805638423039056e-06, + "loss": 0.1238, + "step": 1092 + }, + { + "epoch": 0.6574519230769231, + "grad_norm": 2.755372215903661, + "learning_rate": 1.371923707114679e-06, + "loss": 0.1091, + "step": 1094 + }, + { + "epoch": 0.6586538461538461, + "grad_norm": 2.119982444557482, + "learning_rate": 1.3633004578800613e-06, + "loss": 0.099, + "step": 1096 + }, + { + "epoch": 0.6598557692307693, + "grad_norm": 2.701943705630255, + "learning_rate": 1.354694223679796e-06, + "loss": 0.1235, + "step": 1098 + }, + { + "epoch": 0.6610576923076923, + "grad_norm": 2.383471150976908, + "learning_rate": 1.3461051333389275e-06, + "loss": 0.1031, + "step": 1100 + }, + { + "epoch": 0.6622596153846154, + "grad_norm": 3.00768304842994, + "learning_rate": 1.3375333154258788e-06, + "loss": 0.1087, + "step": 1102 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 2.088505043527597, + "learning_rate": 1.328978898250525e-06, + "loss": 0.1166, + "step": 1104 + }, + { + "epoch": 0.6646634615384616, + "grad_norm": 2.434276624558114, + "learning_rate": 1.3204420098622727e-06, + "loss": 0.11, + "step": 1106 + }, + { + "epoch": 0.6658653846153846, + "grad_norm": 1.8412804984656046, + "learning_rate": 1.3119227780481442e-06, + "loss": 0.113, + "step": 1108 + }, + { + "epoch": 0.6670673076923077, + "grad_norm": 2.0956844206733405, + "learning_rate": 1.3034213303308627e-06, + "loss": 0.1144, + "step": 1110 + }, + { + "epoch": 0.6682692307692307, + "grad_norm": 2.1476124760530566, + "learning_rate": 1.294937793966946e-06, + "loss": 0.1095, + "step": 1112 + }, + { + "epoch": 0.6694711538461539, + "grad_norm": 2.292664553276864, + "learning_rate": 1.286472295944799e-06, + "loss": 0.1146, + "step": 1114 + }, + { + "epoch": 0.6706730769230769, + "grad_norm": 2.1662467131117404, + "learning_rate": 1.2780249629828161e-06, + "loss": 0.1097, + "step": 1116 + }, + { + "epoch": 0.671875, + "grad_norm": 2.906015346971846, + "learning_rate": 1.2695959215274817e-06, + "loss": 0.1148, + "step": 1118 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 2.1982439434562737, + "learning_rate": 1.261185297751477e-06, + "loss": 0.1053, + "step": 1120 + }, + { + "epoch": 0.6742788461538461, + "grad_norm": 2.018201703916458, + "learning_rate": 1.2527932175517934e-06, + "loss": 0.115, + "step": 1122 + }, + { + "epoch": 0.6754807692307693, + "grad_norm": 2.6111890149300976, + "learning_rate": 1.2444198065478475e-06, + "loss": 0.1224, + "step": 1124 + }, + { + "epoch": 0.6766826923076923, + "grad_norm": 2.5284325319117267, + "learning_rate": 1.2360651900795995e-06, + "loss": 0.1207, + "step": 1126 + }, + { + "epoch": 0.6778846153846154, + "grad_norm": 2.2545340347392955, + "learning_rate": 1.2277294932056783e-06, + "loss": 0.112, + "step": 1128 + }, + { + "epoch": 0.6790865384615384, + "grad_norm": 3.362156324890133, + "learning_rate": 1.2194128407015094e-06, + "loss": 0.1164, + "step": 1130 + }, + { + "epoch": 0.6802884615384616, + "grad_norm": 1.632189263569225, + "learning_rate": 1.2111153570574454e-06, + "loss": 0.1012, + "step": 1132 + }, + { + "epoch": 0.6814903846153846, + "grad_norm": 2.092206166748186, + "learning_rate": 1.202837166476907e-06, + "loss": 0.1085, + "step": 1134 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 2.7031603463833704, + "learning_rate": 1.1945783928745187e-06, + "loss": 0.1109, + "step": 1136 + }, + { + "epoch": 0.6838942307692307, + "grad_norm": 3.4147286461299355, + "learning_rate": 1.1863391598742535e-06, + "loss": 0.1133, + "step": 1138 + }, + { + "epoch": 0.6850961538461539, + "grad_norm": 2.1129785563716994, + "learning_rate": 1.1781195908075903e-06, + "loss": 0.1097, + "step": 1140 + }, + { + "epoch": 0.6862980769230769, + "grad_norm": 3.2026130054118593, + "learning_rate": 1.169919808711659e-06, + "loss": 0.1184, + "step": 1142 + }, + { + "epoch": 0.6875, + "grad_norm": 2.249630204645609, + "learning_rate": 1.1617399363274024e-06, + "loss": 0.1106, + "step": 1144 + }, + { + "epoch": 0.6887019230769231, + "grad_norm": 3.2963891692649514, + "learning_rate": 1.1535800960977398e-06, + "loss": 0.1196, + "step": 1146 + }, + { + "epoch": 0.6899038461538461, + "grad_norm": 3.186358499780556, + "learning_rate": 1.1454404101657319e-06, + "loss": 0.1121, + "step": 1148 + }, + { + "epoch": 0.6911057692307693, + "grad_norm": 2.47209843153002, + "learning_rate": 1.1373210003727536e-06, + "loss": 0.1167, + "step": 1150 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 2.4518332512722876, + "learning_rate": 1.1292219882566726e-06, + "loss": 0.1148, + "step": 1152 + }, + { + "epoch": 0.6935096153846154, + "grad_norm": 3.262824991958051, + "learning_rate": 1.121143495050026e-06, + "loss": 0.106, + "step": 1154 + }, + { + "epoch": 0.6947115384615384, + "grad_norm": 2.338537712422274, + "learning_rate": 1.1130856416782046e-06, + "loss": 0.106, + "step": 1156 + }, + { + "epoch": 0.6959134615384616, + "grad_norm": 2.2204770447922297, + "learning_rate": 1.1050485487576506e-06, + "loss": 0.1101, + "step": 1158 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 2.0597064409649892, + "learning_rate": 1.0970323365940443e-06, + "loss": 0.0959, + "step": 1160 + }, + { + "epoch": 0.6983173076923077, + "grad_norm": 2.222014493052337, + "learning_rate": 1.089037125180506e-06, + "loss": 0.1034, + "step": 1162 + }, + { + "epoch": 0.6995192307692307, + "grad_norm": 3.1568264036888265, + "learning_rate": 1.0810630341958004e-06, + "loss": 0.1224, + "step": 1164 + }, + { + "epoch": 0.7007211538461539, + "grad_norm": 2.1388673626652817, + "learning_rate": 1.0731101830025442e-06, + "loss": 0.1024, + "step": 1166 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 2.844127901014447, + "learning_rate": 1.0651786906454192e-06, + "loss": 0.1236, + "step": 1168 + }, + { + "epoch": 0.703125, + "grad_norm": 1.9936288576811623, + "learning_rate": 1.057268675849395e-06, + "loss": 0.1006, + "step": 1170 + }, + { + "epoch": 0.7043269230769231, + "grad_norm": 1.8480926642214928, + "learning_rate": 1.0493802570179411e-06, + "loss": 0.1001, + "step": 1172 + }, + { + "epoch": 0.7055288461538461, + "grad_norm": 2.261808036207062, + "learning_rate": 1.041513552231265e-06, + "loss": 0.1038, + "step": 1174 + }, + { + "epoch": 0.7067307692307693, + "grad_norm": 2.2099301841197545, + "learning_rate": 1.0336686792445424e-06, + "loss": 0.1101, + "step": 1176 + }, + { + "epoch": 0.7079326923076923, + "grad_norm": 2.180203910907892, + "learning_rate": 1.0258457554861502e-06, + "loss": 0.1057, + "step": 1178 + }, + { + "epoch": 0.7091346153846154, + "grad_norm": 2.797064097348832, + "learning_rate": 1.0180448980559125e-06, + "loss": 0.0926, + "step": 1180 + }, + { + "epoch": 0.7103365384615384, + "grad_norm": 3.1147260554752147, + "learning_rate": 1.0102662237233465e-06, + "loss": 0.1191, + "step": 1182 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 2.488698925587082, + "learning_rate": 1.0025098489259161e-06, + "loss": 0.1014, + "step": 1184 + }, + { + "epoch": 0.7127403846153846, + "grad_norm": 2.2420640526045927, + "learning_rate": 9.947758897672855e-07, + "loss": 0.1125, + "step": 1186 + }, + { + "epoch": 0.7139423076923077, + "grad_norm": 2.634277342582424, + "learning_rate": 9.870644620155878e-07, + "loss": 0.1104, + "step": 1188 + }, + { + "epoch": 0.7151442307692307, + "grad_norm": 2.278093380222903, + "learning_rate": 9.793756811016824e-07, + "loss": 0.1045, + "step": 1190 + }, + { + "epoch": 0.7163461538461539, + "grad_norm": 2.1408407961096088, + "learning_rate": 9.717096621174355e-07, + "loss": 0.1154, + "step": 1192 + }, + { + "epoch": 0.7175480769230769, + "grad_norm": 2.2023983168340413, + "learning_rate": 9.640665198139957e-07, + "loss": 0.1147, + "step": 1194 + }, + { + "epoch": 0.71875, + "grad_norm": 1.9253814839904362, + "learning_rate": 9.564463686000728e-07, + "loss": 0.1157, + "step": 1196 + }, + { + "epoch": 0.7199519230769231, + "grad_norm": 2.036041516452903, + "learning_rate": 9.488493225402282e-07, + "loss": 0.0948, + "step": 1198 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 3.306618789071036, + "learning_rate": 9.412754953531664e-07, + "loss": 0.101, + "step": 1200 + }, + { + "epoch": 0.7223557692307693, + "grad_norm": 2.818632480308661, + "learning_rate": 9.337250004100337e-07, + "loss": 0.1232, + "step": 1202 + }, + { + "epoch": 0.7235576923076923, + "grad_norm": 2.3587067360069334, + "learning_rate": 9.261979507327204e-07, + "loss": 0.1062, + "step": 1204 + }, + { + "epoch": 0.7247596153846154, + "grad_norm": 2.3843210548687908, + "learning_rate": 9.186944589921687e-07, + "loss": 0.1161, + "step": 1206 + }, + { + "epoch": 0.7259615384615384, + "grad_norm": 2.069552811499533, + "learning_rate": 9.112146375066872e-07, + "loss": 0.1037, + "step": 1208 + }, + { + "epoch": 0.7271634615384616, + "grad_norm": 2.5490211308951487, + "learning_rate": 9.037585982402678e-07, + "loss": 0.1182, + "step": 1210 + }, + { + "epoch": 0.7283653846153846, + "grad_norm": 2.2537446863547177, + "learning_rate": 8.96326452800915e-07, + "loss": 0.1024, + "step": 1212 + }, + { + "epoch": 0.7295673076923077, + "grad_norm": 2.1542852130856085, + "learning_rate": 8.889183124389645e-07, + "loss": 0.1102, + "step": 1214 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 1.8957554942236439, + "learning_rate": 8.815342880454312e-07, + "loss": 0.107, + "step": 1216 + }, + { + "epoch": 0.7319711538461539, + "grad_norm": 2.4674322862732314, + "learning_rate": 8.741744901503387e-07, + "loss": 0.114, + "step": 1218 + }, + { + "epoch": 0.7331730769230769, + "grad_norm": 2.039475353958351, + "learning_rate": 8.66839028921071e-07, + "loss": 0.1106, + "step": 1220 + }, + { + "epoch": 0.734375, + "grad_norm": 2.3489571512912364, + "learning_rate": 8.595280141607198e-07, + "loss": 0.1073, + "step": 1222 + }, + { + "epoch": 0.7355769230769231, + "grad_norm": 3.073761818193723, + "learning_rate": 8.522415553064433e-07, + "loss": 0.1069, + "step": 1224 + }, + { + "epoch": 0.7367788461538461, + "grad_norm": 2.4347506007521433, + "learning_rate": 8.44979761427826e-07, + "loss": 0.1064, + "step": 1226 + }, + { + "epoch": 0.7379807692307693, + "grad_norm": 2.3883683060647134, + "learning_rate": 8.377427412252495e-07, + "loss": 0.1063, + "step": 1228 + }, + { + "epoch": 0.7391826923076923, + "grad_norm": 2.7706472616211077, + "learning_rate": 8.305306030282618e-07, + "loss": 0.1126, + "step": 1230 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 2.5634994337657413, + "learning_rate": 8.233434547939539e-07, + "loss": 0.112, + "step": 1232 + }, + { + "epoch": 0.7415865384615384, + "grad_norm": 2.397107165704345, + "learning_rate": 8.161814041053526e-07, + "loss": 0.1106, + "step": 1234 + }, + { + "epoch": 0.7427884615384616, + "grad_norm": 2.2450002047020807, + "learning_rate": 8.090445581698006e-07, + "loss": 0.108, + "step": 1236 + }, + { + "epoch": 0.7439903846153846, + "grad_norm": 2.014500102466641, + "learning_rate": 8.019330238173568e-07, + "loss": 0.1077, + "step": 1238 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 2.168024712104591, + "learning_rate": 7.948469074991955e-07, + "loss": 0.1045, + "step": 1240 + }, + { + "epoch": 0.7463942307692307, + "grad_norm": 3.0079126945368904, + "learning_rate": 7.877863152860133e-07, + "loss": 0.1092, + "step": 1242 + }, + { + "epoch": 0.7475961538461539, + "grad_norm": 2.4795980921136294, + "learning_rate": 7.807513528664415e-07, + "loss": 0.1107, + "step": 1244 + }, + { + "epoch": 0.7487980769230769, + "grad_norm": 2.247412162226902, + "learning_rate": 7.737421255454661e-07, + "loss": 0.1198, + "step": 1246 + }, + { + "epoch": 0.75, + "grad_norm": 2.3971310721116983, + "learning_rate": 7.667587382428455e-07, + "loss": 0.1161, + "step": 1248 + }, + { + "epoch": 0.7512019230769231, + "grad_norm": 1.8099165767446914, + "learning_rate": 7.598012954915457e-07, + "loss": 0.0973, + "step": 1250 + }, + { + "epoch": 0.7524038461538461, + "grad_norm": 2.3170574343599286, + "learning_rate": 7.528699014361757e-07, + "loss": 0.1093, + "step": 1252 + }, + { + "epoch": 0.7536057692307693, + "grad_norm": 1.7417689564815537, + "learning_rate": 7.459646598314246e-07, + "loss": 0.1021, + "step": 1254 + }, + { + "epoch": 0.7548076923076923, + "grad_norm": 2.012989717897973, + "learning_rate": 7.390856740405092e-07, + "loss": 0.1022, + "step": 1256 + }, + { + "epoch": 0.7560096153846154, + "grad_norm": 2.7755030823894082, + "learning_rate": 7.322330470336314e-07, + "loss": 0.108, + "step": 1258 + }, + { + "epoch": 0.7572115384615384, + "grad_norm": 2.7553825309268305, + "learning_rate": 7.254068813864315e-07, + "loss": 0.1164, + "step": 1260 + }, + { + "epoch": 0.7584134615384616, + "grad_norm": 3.2871528914249164, + "learning_rate": 7.186072792784549e-07, + "loss": 0.1018, + "step": 1262 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 2.3825605880656826, + "learning_rate": 7.118343424916249e-07, + "loss": 0.1006, + "step": 1264 + }, + { + "epoch": 0.7608173076923077, + "grad_norm": 2.8679237655683627, + "learning_rate": 7.050881724087125e-07, + "loss": 0.1043, + "step": 1266 + }, + { + "epoch": 0.7620192307692307, + "grad_norm": 2.6274099068260557, + "learning_rate": 6.983688700118257e-07, + "loss": 0.1084, + "step": 1268 + }, + { + "epoch": 0.7632211538461539, + "grad_norm": 2.432380483126836, + "learning_rate": 6.916765358808969e-07, + "loss": 0.1098, + "step": 1270 + }, + { + "epoch": 0.7644230769230769, + "grad_norm": 2.2356194365705218, + "learning_rate": 6.850112701921735e-07, + "loss": 0.0974, + "step": 1272 + }, + { + "epoch": 0.765625, + "grad_norm": 2.322442564380917, + "learning_rate": 6.783731727167195e-07, + "loss": 0.1149, + "step": 1274 + }, + { + "epoch": 0.7668269230769231, + "grad_norm": 2.7155413629798777, + "learning_rate": 6.717623428189262e-07, + "loss": 0.1107, + "step": 1276 + }, + { + "epoch": 0.7680288461538461, + "grad_norm": 2.5926151388895184, + "learning_rate": 6.65178879455021e-07, + "loss": 0.0961, + "step": 1278 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 2.053872678475484, + "learning_rate": 6.586228811715853e-07, + "loss": 0.104, + "step": 1280 + }, + { + "epoch": 0.7704326923076923, + "grad_norm": 2.414054056484151, + "learning_rate": 6.520944461040829e-07, + "loss": 0.0987, + "step": 1282 + }, + { + "epoch": 0.7716346153846154, + "grad_norm": 2.2800029934734014, + "learning_rate": 6.455936719753883e-07, + "loss": 0.1109, + "step": 1284 + }, + { + "epoch": 0.7728365384615384, + "grad_norm": 2.254565376531854, + "learning_rate": 6.391206560943241e-07, + "loss": 0.0972, + "step": 1286 + }, + { + "epoch": 0.7740384615384616, + "grad_norm": 2.4180234435201866, + "learning_rate": 6.326754953542086e-07, + "loss": 0.1055, + "step": 1288 + }, + { + "epoch": 0.7752403846153846, + "grad_norm": 2.864294623339486, + "learning_rate": 6.262582862313968e-07, + "loss": 0.1073, + "step": 1290 + }, + { + "epoch": 0.7764423076923077, + "grad_norm": 2.5508498340470465, + "learning_rate": 6.198691247838437e-07, + "loss": 0.1072, + "step": 1292 + }, + { + "epoch": 0.7776442307692307, + "grad_norm": 2.7551427113131233, + "learning_rate": 6.135081066496662e-07, + "loss": 0.0988, + "step": 1294 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 2.891789692416902, + "learning_rate": 6.071753270457065e-07, + "loss": 0.1214, + "step": 1296 + }, + { + "epoch": 0.7800480769230769, + "grad_norm": 2.1917615480612165, + "learning_rate": 6.00870880766111e-07, + "loss": 0.1027, + "step": 1298 + }, + { + "epoch": 0.78125, + "grad_norm": 2.323581668550887, + "learning_rate": 5.945948621809092e-07, + "loss": 0.0992, + "step": 1300 + }, + { + "epoch": 0.7824519230769231, + "grad_norm": 2.166198462254229, + "learning_rate": 5.883473652346031e-07, + "loss": 0.1107, + "step": 1302 + }, + { + "epoch": 0.7836538461538461, + "grad_norm": 2.1325952846722043, + "learning_rate": 5.821284834447586e-07, + "loss": 0.1137, + "step": 1304 + }, + { + "epoch": 0.7848557692307693, + "grad_norm": 2.3389500061042856, + "learning_rate": 5.759383099006094e-07, + "loss": 0.114, + "step": 1306 + }, + { + "epoch": 0.7860576923076923, + "grad_norm": 2.392023151903911, + "learning_rate": 5.697769372616565e-07, + "loss": 0.1154, + "step": 1308 + }, + { + "epoch": 0.7872596153846154, + "grad_norm": 1.9514556840096247, + "learning_rate": 5.636444577562911e-07, + "loss": 0.1071, + "step": 1310 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 1.7463857786932386, + "learning_rate": 5.575409631804049e-07, + "loss": 0.0932, + "step": 1312 + }, + { + "epoch": 0.7896634615384616, + "grad_norm": 2.5321085368327023, + "learning_rate": 5.51466544896021e-07, + "loss": 0.1249, + "step": 1314 + }, + { + "epoch": 0.7908653846153846, + "grad_norm": 2.7367754184042794, + "learning_rate": 5.454212938299256e-07, + "loss": 0.1083, + "step": 1316 + }, + { + "epoch": 0.7920673076923077, + "grad_norm": 2.1480042151176795, + "learning_rate": 5.39405300472306e-07, + "loss": 0.1135, + "step": 1318 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 2.1126995444295895, + "learning_rate": 5.334186548753961e-07, + "loss": 0.0993, + "step": 1320 + }, + { + "epoch": 0.7944711538461539, + "grad_norm": 1.9387325114766338, + "learning_rate": 5.2746144665213e-07, + "loss": 0.0975, + "step": 1322 + }, + { + "epoch": 0.7956730769230769, + "grad_norm": 2.5557991339707193, + "learning_rate": 5.215337649747986e-07, + "loss": 0.1062, + "step": 1324 + }, + { + "epoch": 0.796875, + "grad_norm": 1.9233646398585384, + "learning_rate": 5.156356985737154e-07, + "loss": 0.0983, + "step": 1326 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 2.2467131024558182, + "learning_rate": 5.097673357358906e-07, + "loss": 0.0968, + "step": 1328 + }, + { + "epoch": 0.7992788461538461, + "grad_norm": 2.4109454813538442, + "learning_rate": 5.039287643037058e-07, + "loss": 0.0979, + "step": 1330 + }, + { + "epoch": 0.8004807692307693, + "grad_norm": 3.124394231436496, + "learning_rate": 4.981200716735993e-07, + "loss": 0.1265, + "step": 1332 + }, + { + "epoch": 0.8016826923076923, + "grad_norm": 2.6675264999412, + "learning_rate": 4.92341344794763e-07, + "loss": 0.1049, + "step": 1334 + }, + { + "epoch": 0.8028846153846154, + "grad_norm": 2.848770862565795, + "learning_rate": 4.865926701678353e-07, + "loss": 0.1025, + "step": 1336 + }, + { + "epoch": 0.8040865384615384, + "grad_norm": 2.6854316431958867, + "learning_rate": 4.808741338436082e-07, + "loss": 0.1073, + "step": 1338 + }, + { + "epoch": 0.8052884615384616, + "grad_norm": 3.1092668803437515, + "learning_rate": 4.7518582142174e-07, + "loss": 0.0928, + "step": 1340 + }, + { + "epoch": 0.8064903846153846, + "grad_norm": 2.1214191642164266, + "learning_rate": 4.695278180494725e-07, + "loss": 0.1012, + "step": 1342 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 2.5730076842528553, + "learning_rate": 4.6390020842035755e-07, + "loss": 0.11, + "step": 1344 + }, + { + "epoch": 0.8088942307692307, + "grad_norm": 2.68220531087873, + "learning_rate": 4.5830307677298984e-07, + "loss": 0.1188, + "step": 1346 + }, + { + "epoch": 0.8100961538461539, + "grad_norm": 2.363649492332498, + "learning_rate": 4.5273650688974437e-07, + "loss": 0.1021, + "step": 1348 + }, + { + "epoch": 0.8112980769230769, + "grad_norm": 2.541964709244174, + "learning_rate": 4.4720058209552163e-07, + "loss": 0.0925, + "step": 1350 + }, + { + "epoch": 0.8125, + "grad_norm": 3.265981110682609, + "learning_rate": 4.4169538525650453e-07, + "loss": 0.1037, + "step": 1352 + }, + { + "epoch": 0.8137019230769231, + "grad_norm": 2.4485095937525854, + "learning_rate": 4.362209987789129e-07, + "loss": 0.1086, + "step": 1354 + }, + { + "epoch": 0.8149038461538461, + "grad_norm": 2.2907426923363805, + "learning_rate": 4.307775046077739e-07, + "loss": 0.0986, + "step": 1356 + }, + { + "epoch": 0.8161057692307693, + "grad_norm": 2.0945358815806387, + "learning_rate": 4.2536498422569237e-07, + "loss": 0.0955, + "step": 1358 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 2.211078181765995, + "learning_rate": 4.1998351865163323e-07, + "loss": 0.1005, + "step": 1360 + }, + { + "epoch": 0.8185096153846154, + "grad_norm": 2.3888674275205473, + "learning_rate": 4.1463318843970727e-07, + "loss": 0.0946, + "step": 1362 + }, + { + "epoch": 0.8197115384615384, + "grad_norm": 2.8100928189396783, + "learning_rate": 4.093140736779691e-07, + "loss": 0.1072, + "step": 1364 + }, + { + "epoch": 0.8209134615384616, + "grad_norm": 2.5814035620911775, + "learning_rate": 4.0402625398721056e-07, + "loss": 0.1085, + "step": 1366 + }, + { + "epoch": 0.8221153846153846, + "grad_norm": 2.2204309134850604, + "learning_rate": 3.987698085197761e-07, + "loss": 0.1057, + "step": 1368 + }, + { + "epoch": 0.8233173076923077, + "grad_norm": 2.284890393659316, + "learning_rate": 3.935448159583774e-07, + "loss": 0.1095, + "step": 1370 + }, + { + "epoch": 0.8245192307692307, + "grad_norm": 2.9277446873455233, + "learning_rate": 3.8835135451491037e-07, + "loss": 0.0972, + "step": 1372 + }, + { + "epoch": 0.8257211538461539, + "grad_norm": 2.624827973263955, + "learning_rate": 3.831895019292897e-07, + "loss": 0.1103, + "step": 1374 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 2.680261506966643, + "learning_rate": 3.7805933546828265e-07, + "loss": 0.1172, + "step": 1376 + }, + { + "epoch": 0.828125, + "grad_norm": 2.194552961136517, + "learning_rate": 3.7296093192435325e-07, + "loss": 0.1003, + "step": 1378 + }, + { + "epoch": 0.8293269230769231, + "grad_norm": 2.559847310791807, + "learning_rate": 3.6789436761451135e-07, + "loss": 0.1039, + "step": 1380 + }, + { + "epoch": 0.8305288461538461, + "grad_norm": 2.1332823235020575, + "learning_rate": 3.6285971837917514e-07, + "loss": 0.1004, + "step": 1382 + }, + { + "epoch": 0.8317307692307693, + "grad_norm": 2.260620258768886, + "learning_rate": 3.578570595810274e-07, + "loss": 0.1043, + "step": 1384 + }, + { + "epoch": 0.8329326923076923, + "grad_norm": 2.1460191050714768, + "learning_rate": 3.5288646610389497e-07, + "loss": 0.0973, + "step": 1386 + }, + { + "epoch": 0.8341346153846154, + "grad_norm": 2.4453293937330804, + "learning_rate": 3.4794801235162575e-07, + "loss": 0.0982, + "step": 1388 + }, + { + "epoch": 0.8353365384615384, + "grad_norm": 2.5470784403076823, + "learning_rate": 3.4304177224697284e-07, + "loss": 0.1071, + "step": 1390 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 2.1819545974434194, + "learning_rate": 3.3816781923049047e-07, + "loss": 0.0977, + "step": 1392 + }, + { + "epoch": 0.8377403846153846, + "grad_norm": 2.66559740829053, + "learning_rate": 3.333262262594328e-07, + "loss": 0.1013, + "step": 1394 + }, + { + "epoch": 0.8389423076923077, + "grad_norm": 2.4531040234693493, + "learning_rate": 3.285170658066636e-07, + "loss": 0.1136, + "step": 1396 + }, + { + "epoch": 0.8401442307692307, + "grad_norm": 2.222643442815888, + "learning_rate": 3.2374040985957005e-07, + "loss": 0.1069, + "step": 1398 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 2.513474365488169, + "learning_rate": 3.1899632991898634e-07, + "loss": 0.1115, + "step": 1400 + }, + { + "epoch": 0.8425480769230769, + "grad_norm": 2.4676487353640537, + "learning_rate": 3.1428489699812187e-07, + "loss": 0.1134, + "step": 1402 + }, + { + "epoch": 0.84375, + "grad_norm": 2.5478148761024344, + "learning_rate": 3.096061816214993e-07, + "loss": 0.1125, + "step": 1404 + }, + { + "epoch": 0.8449519230769231, + "grad_norm": 2.693250913616855, + "learning_rate": 3.0496025382390023e-07, + "loss": 0.1101, + "step": 1406 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 2.1582519269517846, + "learning_rate": 3.0034718314931376e-07, + "loss": 0.0987, + "step": 1408 + }, + { + "epoch": 0.8473557692307693, + "grad_norm": 2.356390319809914, + "learning_rate": 2.9576703864989705e-07, + "loss": 0.1103, + "step": 1410 + }, + { + "epoch": 0.8485576923076923, + "grad_norm": 2.4939149281676944, + "learning_rate": 2.9121988888494297e-07, + "loss": 0.1075, + "step": 1412 + }, + { + "epoch": 0.8497596153846154, + "grad_norm": 2.560971562380158, + "learning_rate": 2.8670580191985096e-07, + "loss": 0.1047, + "step": 1414 + }, + { + "epoch": 0.8509615384615384, + "grad_norm": 2.328897343184531, + "learning_rate": 2.822248453251117e-07, + "loss": 0.0952, + "step": 1416 + }, + { + "epoch": 0.8521634615384616, + "grad_norm": 2.741941178369846, + "learning_rate": 2.7777708617529263e-07, + "loss": 0.114, + "step": 1418 + }, + { + "epoch": 0.8533653846153846, + "grad_norm": 2.8152555502780747, + "learning_rate": 2.73362591048035e-07, + "loss": 0.1118, + "step": 1420 + }, + { + "epoch": 0.8545673076923077, + "grad_norm": 2.308625479951822, + "learning_rate": 2.689814260230575e-07, + "loss": 0.0916, + "step": 1422 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 2.7282177204631655, + "learning_rate": 2.646336566811686e-07, + "loss": 0.0998, + "step": 1424 + }, + { + "epoch": 0.8569711538461539, + "grad_norm": 2.3678387797587415, + "learning_rate": 2.6031934810328006e-07, + "loss": 0.097, + "step": 1426 + }, + { + "epoch": 0.8581730769230769, + "grad_norm": 2.2659866299053864, + "learning_rate": 2.560385648694394e-07, + "loss": 0.1035, + "step": 1428 + }, + { + "epoch": 0.859375, + "grad_norm": 2.1167601912463665, + "learning_rate": 2.5179137105785733e-07, + "loss": 0.1133, + "step": 1430 + }, + { + "epoch": 0.8605769230769231, + "grad_norm": 3.156290005065614, + "learning_rate": 2.4757783024395244e-07, + "loss": 0.1083, + "step": 1432 + }, + { + "epoch": 0.8617788461538461, + "grad_norm": 2.3830390104253514, + "learning_rate": 2.43398005499397e-07, + "loss": 0.1142, + "step": 1434 + }, + { + "epoch": 0.8629807692307693, + "grad_norm": 2.1651688894763805, + "learning_rate": 2.3925195939117516e-07, + "loss": 0.1008, + "step": 1436 + }, + { + "epoch": 0.8641826923076923, + "grad_norm": 2.906596215817537, + "learning_rate": 2.3513975398064382e-07, + "loss": 0.109, + "step": 1438 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 2.822390125625684, + "learning_rate": 2.3106145082260777e-07, + "loss": 0.11, + "step": 1440 + }, + { + "epoch": 0.8665865384615384, + "grad_norm": 4.20785338557248, + "learning_rate": 2.2701711096439177e-07, + "loss": 0.0926, + "step": 1442 + }, + { + "epoch": 0.8677884615384616, + "grad_norm": 2.401735024395661, + "learning_rate": 2.23006794944933e-07, + "loss": 0.1096, + "step": 1444 + }, + { + "epoch": 0.8689903846153846, + "grad_norm": 2.324535843969192, + "learning_rate": 2.1903056279387242e-07, + "loss": 0.0979, + "step": 1446 + }, + { + "epoch": 0.8701923076923077, + "grad_norm": 2.3309020366100395, + "learning_rate": 2.1508847403065582e-07, + "loss": 0.1003, + "step": 1448 + }, + { + "epoch": 0.8713942307692307, + "grad_norm": 1.8021632811568191, + "learning_rate": 2.1118058766364245e-07, + "loss": 0.0973, + "step": 1450 + }, + { + "epoch": 0.8725961538461539, + "grad_norm": 3.2593847440771753, + "learning_rate": 2.0730696218922376e-07, + "loss": 0.1181, + "step": 1452 + }, + { + "epoch": 0.8737980769230769, + "grad_norm": 2.822198349147213, + "learning_rate": 2.0346765559094566e-07, + "loss": 0.1011, + "step": 1454 + }, + { + "epoch": 0.875, + "grad_norm": 2.281439077352929, + "learning_rate": 1.9966272533864183e-07, + "loss": 0.1078, + "step": 1456 + }, + { + "epoch": 0.8762019230769231, + "grad_norm": 2.113587059455818, + "learning_rate": 1.9589222838757416e-07, + "loss": 0.101, + "step": 1458 + }, + { + "epoch": 0.8774038461538461, + "grad_norm": 2.8223349033116034, + "learning_rate": 1.9215622117757683e-07, + "loss": 0.1061, + "step": 1460 + }, + { + "epoch": 0.8786057692307693, + "grad_norm": 3.472491327729482, + "learning_rate": 1.8845475963221504e-07, + "loss": 0.1025, + "step": 1462 + }, + { + "epoch": 0.8798076923076923, + "grad_norm": 1.8581268435798293, + "learning_rate": 1.847878991579477e-07, + "loss": 0.095, + "step": 1464 + }, + { + "epoch": 0.8810096153846154, + "grad_norm": 2.730059314985803, + "learning_rate": 1.8115569464329602e-07, + "loss": 0.1186, + "step": 1466 + }, + { + "epoch": 0.8822115384615384, + "grad_norm": 2.1077055410907906, + "learning_rate": 1.7755820045802146e-07, + "loss": 0.1038, + "step": 1468 + }, + { + "epoch": 0.8834134615384616, + "grad_norm": 2.425923169061633, + "learning_rate": 1.7399547045231612e-07, + "loss": 0.1052, + "step": 1470 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 3.112007013321009, + "learning_rate": 1.7046755795599224e-07, + "loss": 0.1081, + "step": 1472 + }, + { + "epoch": 0.8858173076923077, + "grad_norm": 2.569797668666943, + "learning_rate": 1.6697451577768558e-07, + "loss": 0.1066, + "step": 1474 + }, + { + "epoch": 0.8870192307692307, + "grad_norm": 2.2793145814741815, + "learning_rate": 1.6351639620406506e-07, + "loss": 0.093, + "step": 1476 + }, + { + "epoch": 0.8882211538461539, + "grad_norm": 2.5222639348107148, + "learning_rate": 1.600932509990502e-07, + "loss": 0.1044, + "step": 1478 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 2.588170889871738, + "learning_rate": 1.567051314030349e-07, + "loss": 0.1095, + "step": 1480 + }, + { + "epoch": 0.890625, + "grad_norm": 2.291713127389421, + "learning_rate": 1.5335208813212376e-07, + "loss": 0.108, + "step": 1482 + }, + { + "epoch": 0.8918269230769231, + "grad_norm": 2.466650814807856, + "learning_rate": 1.500341713773687e-07, + "loss": 0.0961, + "step": 1484 + }, + { + "epoch": 0.8930288461538461, + "grad_norm": 2.7084007905892733, + "learning_rate": 1.4675143080401965e-07, + "loss": 0.1085, + "step": 1486 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 2.3729169086566286, + "learning_rate": 1.4350391555078253e-07, + "loss": 0.0961, + "step": 1488 + }, + { + "epoch": 0.8954326923076923, + "grad_norm": 2.6818163643038995, + "learning_rate": 1.4029167422908107e-07, + "loss": 0.1043, + "step": 1490 + }, + { + "epoch": 0.8966346153846154, + "grad_norm": 2.8498937973846066, + "learning_rate": 1.3711475492233116e-07, + "loss": 0.1005, + "step": 1492 + }, + { + "epoch": 0.8978365384615384, + "grad_norm": 2.3727561897542184, + "learning_rate": 1.3397320518521993e-07, + "loss": 0.1083, + "step": 1494 + }, + { + "epoch": 0.8990384615384616, + "grad_norm": 2.864804942811416, + "learning_rate": 1.3086707204299415e-07, + "loss": 0.1042, + "step": 1496 + }, + { + "epoch": 0.9002403846153846, + "grad_norm": 2.5133132508801537, + "learning_rate": 1.2779640199075627e-07, + "loss": 0.1155, + "step": 1498 + }, + { + "epoch": 0.9014423076923077, + "grad_norm": 2.534644984379523, + "learning_rate": 1.2476124099277038e-07, + "loss": 0.1136, + "step": 1500 + }, + { + "epoch": 0.9026442307692307, + "grad_norm": 2.3008784304419714, + "learning_rate": 1.217616344817693e-07, + "loss": 0.0916, + "step": 1502 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 3.1775796763443047, + "learning_rate": 1.1879762735828081e-07, + "loss": 0.1042, + "step": 1504 + }, + { + "epoch": 0.9050480769230769, + "grad_norm": 2.497072810226958, + "learning_rate": 1.1586926398995057e-07, + "loss": 0.1107, + "step": 1506 + }, + { + "epoch": 0.90625, + "grad_norm": 2.5866908472155923, + "learning_rate": 1.129765882108802e-07, + "loss": 0.1043, + "step": 1508 + }, + { + "epoch": 0.9074519230769231, + "grad_norm": 2.2123227768120626, + "learning_rate": 1.1011964332097114e-07, + "loss": 0.1056, + "step": 1510 + }, + { + "epoch": 0.9086538461538461, + "grad_norm": 2.2624610691482134, + "learning_rate": 1.0729847208527516e-07, + "loss": 0.1097, + "step": 1512 + }, + { + "epoch": 0.9098557692307693, + "grad_norm": 2.4925186199498115, + "learning_rate": 1.045131167333563e-07, + "loss": 0.1055, + "step": 1514 + }, + { + "epoch": 0.9110576923076923, + "grad_norm": 2.0402886616020846, + "learning_rate": 1.0176361895865683e-07, + "loss": 0.1012, + "step": 1516 + }, + { + "epoch": 0.9122596153846154, + "grad_norm": 3.0557112542579583, + "learning_rate": 9.9050019917874e-08, + "loss": 0.0904, + "step": 1518 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 2.8003064126365653, + "learning_rate": 9.637236023034403e-08, + "loss": 0.096, + "step": 1520 + }, + { + "epoch": 0.9146634615384616, + "grad_norm": 2.3950834236132548, + "learning_rate": 9.373067997743429e-08, + "loss": 0.1103, + "step": 1522 + }, + { + "epoch": 0.9158653846153846, + "grad_norm": 2.3223933444523275, + "learning_rate": 9.112501870194273e-08, + "loss": 0.1051, + "step": 1524 + }, + { + "epoch": 0.9170673076923077, + "grad_norm": 2.6778326848037084, + "learning_rate": 8.855541540750579e-08, + "loss": 0.1079, + "step": 1526 + }, + { + "epoch": 0.9182692307692307, + "grad_norm": 2.527199338042573, + "learning_rate": 8.602190855801523e-08, + "loss": 0.1109, + "step": 1528 + }, + { + "epoch": 0.9194711538461539, + "grad_norm": 2.2105422763119598, + "learning_rate": 8.352453607704286e-08, + "loss": 0.0994, + "step": 1530 + }, + { + "epoch": 0.9206730769230769, + "grad_norm": 2.4639244734521357, + "learning_rate": 8.106333534727145e-08, + "loss": 0.1108, + "step": 1532 + }, + { + "epoch": 0.921875, + "grad_norm": 2.2497655156731162, + "learning_rate": 7.86383432099358e-08, + "loss": 0.0991, + "step": 1534 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 2.3748694066193115, + "learning_rate": 7.624959596427145e-08, + "loss": 0.0998, + "step": 1536 + }, + { + "epoch": 0.9242788461538461, + "grad_norm": 3.0743080890043983, + "learning_rate": 7.38971293669713e-08, + "loss": 0.1068, + "step": 1538 + }, + { + "epoch": 0.9254807692307693, + "grad_norm": 2.1876563987125675, + "learning_rate": 7.15809786316507e-08, + "loss": 0.1021, + "step": 1540 + }, + { + "epoch": 0.9266826923076923, + "grad_norm": 2.0975147997242143, + "learning_rate": 6.930117842831958e-08, + "loss": 0.1046, + "step": 1542 + }, + { + "epoch": 0.9278846153846154, + "grad_norm": 2.075830478745358, + "learning_rate": 6.705776288286281e-08, + "loss": 0.0954, + "step": 1544 + }, + { + "epoch": 0.9290865384615384, + "grad_norm": 2.6184435003671362, + "learning_rate": 6.485076557653236e-08, + "loss": 0.1175, + "step": 1546 + }, + { + "epoch": 0.9302884615384616, + "grad_norm": 1.9245903381939464, + "learning_rate": 6.268021954544095e-08, + "loss": 0.1013, + "step": 1548 + }, + { + "epoch": 0.9314903846153846, + "grad_norm": 2.2746719047460853, + "learning_rate": 6.05461572800703e-08, + "loss": 0.1126, + "step": 1550 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 2.340635789861226, + "learning_rate": 5.844861072478336e-08, + "loss": 0.1123, + "step": 1552 + }, + { + "epoch": 0.9338942307692307, + "grad_norm": 2.1875192748623418, + "learning_rate": 5.6387611277346486e-08, + "loss": 0.1207, + "step": 1554 + }, + { + "epoch": 0.9350961538461539, + "grad_norm": 2.7002532298324997, + "learning_rate": 5.436318978845917e-08, + "loss": 0.1021, + "step": 1556 + }, + { + "epoch": 0.9362980769230769, + "grad_norm": 2.50568764777332, + "learning_rate": 5.237537656129332e-08, + "loss": 0.0963, + "step": 1558 + }, + { + "epoch": 0.9375, + "grad_norm": 2.4255108084712806, + "learning_rate": 5.042420135103865e-08, + "loss": 0.1056, + "step": 1560 + }, + { + "epoch": 0.9387019230769231, + "grad_norm": 2.3846332604623215, + "learning_rate": 4.850969336445688e-08, + "loss": 0.1018, + "step": 1562 + }, + { + "epoch": 0.9399038461538461, + "grad_norm": 1.9410217717252691, + "learning_rate": 4.663188125944601e-08, + "loss": 0.1034, + "step": 1564 + }, + { + "epoch": 0.9411057692307693, + "grad_norm": 1.9719340906948433, + "learning_rate": 4.47907931446101e-08, + "loss": 0.1002, + "step": 1566 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 2.7209374640824073, + "learning_rate": 4.298645657883904e-08, + "loss": 0.1025, + "step": 1568 + }, + { + "epoch": 0.9435096153846154, + "grad_norm": 2.5221543954885, + "learning_rate": 4.121889857089584e-08, + "loss": 0.1129, + "step": 1570 + }, + { + "epoch": 0.9447115384615384, + "grad_norm": 2.362069229118289, + "learning_rate": 3.948814557901276e-08, + "loss": 0.1076, + "step": 1572 + }, + { + "epoch": 0.9459134615384616, + "grad_norm": 2.383853603153857, + "learning_rate": 3.779422351049417e-08, + "loss": 0.116, + "step": 1574 + }, + { + "epoch": 0.9471153846153846, + "grad_norm": 1.9719712080104705, + "learning_rate": 3.613715772133097e-08, + "loss": 0.0939, + "step": 1576 + }, + { + "epoch": 0.9483173076923077, + "grad_norm": 2.302141720175791, + "learning_rate": 3.451697301581791e-08, + "loss": 0.1108, + "step": 1578 + }, + { + "epoch": 0.9495192307692307, + "grad_norm": 2.259505291636599, + "learning_rate": 3.293369364618465e-08, + "loss": 0.0928, + "step": 1580 + }, + { + "epoch": 0.9507211538461539, + "grad_norm": 3.306570471168316, + "learning_rate": 3.138734331223248e-08, + "loss": 0.1092, + "step": 1582 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 2.132657723925501, + "learning_rate": 2.987794516097875e-08, + "loss": 0.1076, + "step": 1584 + }, + { + "epoch": 0.953125, + "grad_norm": 2.4599044268457995, + "learning_rate": 2.8405521786310508e-08, + "loss": 0.1032, + "step": 1586 + }, + { + "epoch": 0.9543269230769231, + "grad_norm": 2.743173912553656, + "learning_rate": 2.6970095228647243e-08, + "loss": 0.1006, + "step": 1588 + }, + { + "epoch": 0.9555288461538461, + "grad_norm": 2.7902694032785678, + "learning_rate": 2.5571686974609766e-08, + "loss": 0.1082, + "step": 1590 + }, + { + "epoch": 0.9567307692307693, + "grad_norm": 2.1596954453730617, + "learning_rate": 2.4210317956698814e-08, + "loss": 0.0968, + "step": 1592 + }, + { + "epoch": 0.9579326923076923, + "grad_norm": 2.1748717942295452, + "learning_rate": 2.2886008552983064e-08, + "loss": 0.1159, + "step": 1594 + }, + { + "epoch": 0.9591346153846154, + "grad_norm": 2.772837831508022, + "learning_rate": 2.1598778586792158e-08, + "loss": 0.1188, + "step": 1596 + }, + { + "epoch": 0.9603365384615384, + "grad_norm": 2.3289207936080105, + "learning_rate": 2.0348647326420835e-08, + "loss": 0.1177, + "step": 1598 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 2.2952508596930685, + "learning_rate": 1.91356334848411e-08, + "loss": 0.1076, + "step": 1600 + }, + { + "epoch": 0.9627403846153846, + "grad_norm": 2.273702791234848, + "learning_rate": 1.795975521942106e-08, + "loss": 0.1046, + "step": 1602 + }, + { + "epoch": 0.9639423076923077, + "grad_norm": 2.592999996375681, + "learning_rate": 1.682103013165376e-08, + "loss": 0.114, + "step": 1604 + }, + { + "epoch": 0.9651442307692307, + "grad_norm": 2.227735041940276, + "learning_rate": 1.571947526689349e-08, + "loss": 0.1054, + "step": 1606 + }, + { + "epoch": 0.9663461538461539, + "grad_norm": 2.26729275395297, + "learning_rate": 1.4655107114101008e-08, + "loss": 0.0916, + "step": 1608 + }, + { + "epoch": 0.9675480769230769, + "grad_norm": 2.546233331956148, + "learning_rate": 1.362794160559594e-08, + "loss": 0.1151, + "step": 1610 + }, + { + "epoch": 0.96875, + "grad_norm": 2.496560671829686, + "learning_rate": 1.263799411681893e-08, + "loss": 0.1161, + "step": 1612 + }, + { + "epoch": 0.9699519230769231, + "grad_norm": 2.2682911816628715, + "learning_rate": 1.1685279466101817e-08, + "loss": 0.1008, + "step": 1614 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 2.7304953540259405, + "learning_rate": 1.0769811914444206e-08, + "loss": 0.1041, + "step": 1616 + }, + { + "epoch": 0.9723557692307693, + "grad_norm": 2.6466979182161885, + "learning_rate": 9.89160516530252e-09, + "loss": 0.1044, + "step": 1618 + }, + { + "epoch": 0.9735576923076923, + "grad_norm": 2.394702205679758, + "learning_rate": 9.050672364382118e-09, + "loss": 0.0955, + "step": 1620 + }, + { + "epoch": 0.9747596153846154, + "grad_norm": 3.3455759598567836, + "learning_rate": 8.247026099443279e-09, + "loss": 0.1109, + "step": 1622 + }, + { + "epoch": 0.9759615384615384, + "grad_norm": 2.275839668759994, + "learning_rate": 7.480678400109965e-09, + "loss": 0.1061, + "step": 1624 + }, + { + "epoch": 0.9771634615384616, + "grad_norm": 2.133232464295508, + "learning_rate": 6.751640737691911e-09, + "loss": 0.1042, + "step": 1626 + }, + { + "epoch": 0.9783653846153846, + "grad_norm": 2.304963137531874, + "learning_rate": 6.059924025012542e-09, + "loss": 0.1038, + "step": 1628 + }, + { + "epoch": 0.9795673076923077, + "grad_norm": 1.9133668945458344, + "learning_rate": 5.405538616244377e-09, + "loss": 0.0946, + "step": 1630 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 2.7193573070283596, + "learning_rate": 4.788494306755542e-09, + "loss": 0.1021, + "step": 1632 + }, + { + "epoch": 0.9819711538461539, + "grad_norm": 2.8786636757179878, + "learning_rate": 4.208800332961838e-09, + "loss": 0.1102, + "step": 1634 + }, + { + "epoch": 0.9831730769230769, + "grad_norm": 2.4551588884935, + "learning_rate": 3.666465372190453e-09, + "loss": 0.0962, + "step": 1636 + }, + { + "epoch": 0.984375, + "grad_norm": 2.1960677965536957, + "learning_rate": 3.1614975425470207e-09, + "loss": 0.1151, + "step": 1638 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 3.4510437899419992, + "learning_rate": 2.693904402797376e-09, + "loss": 0.0972, + "step": 1640 + }, + { + "epoch": 0.9867788461538461, + "grad_norm": 2.4216851513924205, + "learning_rate": 2.2636929522520945e-09, + "loss": 0.1199, + "step": 1642 + }, + { + "epoch": 0.9879807692307693, + "grad_norm": 2.5149641594525214, + "learning_rate": 1.8708696306624087e-09, + "loss": 0.0947, + "step": 1644 + }, + { + "epoch": 0.9891826923076923, + "grad_norm": 2.8133693313833303, + "learning_rate": 1.5154403181247279e-09, + "loss": 0.102, + "step": 1646 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 1.9008875611487197, + "learning_rate": 1.1974103349909894e-09, + "loss": 0.0975, + "step": 1648 + }, + { + "epoch": 0.9915865384615384, + "grad_norm": 2.3226078974966184, + "learning_rate": 9.167844417901084e-10, + "loss": 0.115, + "step": 1650 + }, + { + "epoch": 0.9927884615384616, + "grad_norm": 3.1578748196963695, + "learning_rate": 6.735668391566475e-10, + "loss": 0.1127, + "step": 1652 + }, + { + "epoch": 0.9939903846153846, + "grad_norm": 2.273472912036705, + "learning_rate": 4.677611677675331e-10, + "loss": 0.096, + "step": 1654 + }, + { + "epoch": 0.9951923076923077, + "grad_norm": 2.0942516236835993, + "learning_rate": 2.993705082879328e-10, + "loss": 0.1131, + "step": 1656 + }, + { + "epoch": 0.9963942307692307, + "grad_norm": 2.5707284998861346, + "learning_rate": 1.683973813249029e-10, + "loss": 0.0986, + "step": 1658 + }, + { + "epoch": 0.9975961538461539, + "grad_norm": 2.5538888383981013, + "learning_rate": 7.484374738936373e-11, + "loss": 0.1101, + "step": 1660 + }, + { + "epoch": 0.9987980769230769, + "grad_norm": 2.0051743259281305, + "learning_rate": 1.8711006867788707e-11, + "loss": 0.1041, + "step": 1662 + }, + { + "epoch": 1.0, + "grad_norm": 2.79621961431732, + "learning_rate": 0.0, + "loss": 0.1053, + "step": 1664 + } + ], + "logging_steps": 2, + "max_steps": 1664, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 522572361891840.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}