{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 7758, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003866976024748647, "grad_norm": 6.7041209811527915, "learning_rate": 1.2886597938144331e-08, "loss": 0.3578, "step": 1 }, { "epoch": 0.0007733952049497294, "grad_norm": 6.402703415499569, "learning_rate": 2.5773195876288662e-08, "loss": 0.3196, "step": 2 }, { "epoch": 0.001160092807424594, "grad_norm": 6.656089226396188, "learning_rate": 3.865979381443299e-08, "loss": 0.3398, "step": 3 }, { "epoch": 0.0015467904098994587, "grad_norm": 6.560314504699263, "learning_rate": 5.1546391752577325e-08, "loss": 0.3315, "step": 4 }, { "epoch": 0.0019334880123743233, "grad_norm": 6.626549340669369, "learning_rate": 6.443298969072165e-08, "loss": 0.3077, "step": 5 }, { "epoch": 0.002320185614849188, "grad_norm": 6.83302592968739, "learning_rate": 7.731958762886598e-08, "loss": 0.3474, "step": 6 }, { "epoch": 0.0027068832173240526, "grad_norm": 6.249810975875102, "learning_rate": 9.02061855670103e-08, "loss": 0.2979, "step": 7 }, { "epoch": 0.0030935808197989174, "grad_norm": 6.020176015749318, "learning_rate": 1.0309278350515465e-07, "loss": 0.3116, "step": 8 }, { "epoch": 0.0034802784222737818, "grad_norm": 6.393381555585113, "learning_rate": 1.1597938144329898e-07, "loss": 0.2891, "step": 9 }, { "epoch": 0.0038669760247486465, "grad_norm": 7.219745816660879, "learning_rate": 1.288659793814433e-07, "loss": 0.3206, "step": 10 }, { "epoch": 0.004253673627223511, "grad_norm": 7.375510829611577, "learning_rate": 1.4175257731958764e-07, "loss": 0.3672, "step": 11 }, { "epoch": 0.004640371229698376, "grad_norm": 6.243342330144109, "learning_rate": 1.5463917525773197e-07, "loss": 0.3087, "step": 12 }, { "epoch": 0.005027068832173241, "grad_norm": 6.135506284131878, "learning_rate": 1.675257731958763e-07, "loss": 0.2955, "step": 13 }, { "epoch": 0.005413766434648105, "grad_norm": 6.4008922963817465, "learning_rate": 1.804123711340206e-07, "loss": 0.3064, "step": 14 }, { "epoch": 0.00580046403712297, "grad_norm": 6.2641737355227445, "learning_rate": 1.9329896907216497e-07, "loss": 0.3555, "step": 15 }, { "epoch": 0.006187161639597835, "grad_norm": 7.0852621397043185, "learning_rate": 2.061855670103093e-07, "loss": 0.3969, "step": 16 }, { "epoch": 0.006573859242072699, "grad_norm": 6.521888997937717, "learning_rate": 2.190721649484536e-07, "loss": 0.3706, "step": 17 }, { "epoch": 0.0069605568445475635, "grad_norm": 7.206774811265903, "learning_rate": 2.3195876288659797e-07, "loss": 0.3637, "step": 18 }, { "epoch": 0.007347254447022429, "grad_norm": 5.995603762054923, "learning_rate": 2.448453608247423e-07, "loss": 0.3106, "step": 19 }, { "epoch": 0.007733952049497293, "grad_norm": 6.270714562737045, "learning_rate": 2.577319587628866e-07, "loss": 0.3617, "step": 20 }, { "epoch": 0.008120649651972157, "grad_norm": 6.484291180975977, "learning_rate": 2.7061855670103096e-07, "loss": 0.3379, "step": 21 }, { "epoch": 0.008507347254447023, "grad_norm": 5.433722804507635, "learning_rate": 2.8350515463917527e-07, "loss": 0.2668, "step": 22 }, { "epoch": 0.008894044856921888, "grad_norm": 6.337000450492004, "learning_rate": 2.963917525773196e-07, "loss": 0.3765, "step": 23 }, { "epoch": 0.009280742459396751, "grad_norm": 5.783624540391704, "learning_rate": 3.0927835051546394e-07, "loss": 0.2889, "step": 24 }, { "epoch": 0.009667440061871617, "grad_norm": 5.9341381750848745, "learning_rate": 3.2216494845360824e-07, "loss": 0.3075, "step": 25 }, { "epoch": 0.010054137664346482, "grad_norm": 6.285520745576489, "learning_rate": 3.350515463917526e-07, "loss": 0.383, "step": 26 }, { "epoch": 0.010440835266821345, "grad_norm": 4.947490258387687, "learning_rate": 3.4793814432989696e-07, "loss": 0.2836, "step": 27 }, { "epoch": 0.01082753286929621, "grad_norm": 5.499146299297474, "learning_rate": 3.608247422680412e-07, "loss": 0.324, "step": 28 }, { "epoch": 0.011214230471771076, "grad_norm": 4.763494215373806, "learning_rate": 3.737113402061856e-07, "loss": 0.3165, "step": 29 }, { "epoch": 0.01160092807424594, "grad_norm": 5.648989143664179, "learning_rate": 3.8659793814432993e-07, "loss": 0.3067, "step": 30 }, { "epoch": 0.011987625676720804, "grad_norm": 5.248299415439264, "learning_rate": 3.9948453608247424e-07, "loss": 0.2843, "step": 31 }, { "epoch": 0.01237432327919567, "grad_norm": 4.850478169202229, "learning_rate": 4.123711340206186e-07, "loss": 0.2974, "step": 32 }, { "epoch": 0.012761020881670533, "grad_norm": 4.279498630245538, "learning_rate": 4.2525773195876296e-07, "loss": 0.2325, "step": 33 }, { "epoch": 0.013147718484145398, "grad_norm": 4.726965707276182, "learning_rate": 4.381443298969072e-07, "loss": 0.2776, "step": 34 }, { "epoch": 0.013534416086620264, "grad_norm": 4.062465305013222, "learning_rate": 4.5103092783505157e-07, "loss": 0.2401, "step": 35 }, { "epoch": 0.013921113689095127, "grad_norm": 3.650332966942205, "learning_rate": 4.6391752577319593e-07, "loss": 0.2494, "step": 36 }, { "epoch": 0.014307811291569992, "grad_norm": 3.1023603624332323, "learning_rate": 4.7680412371134024e-07, "loss": 0.2308, "step": 37 }, { "epoch": 0.014694508894044857, "grad_norm": 2.706303641642022, "learning_rate": 4.896907216494846e-07, "loss": 0.1917, "step": 38 }, { "epoch": 0.015081206496519721, "grad_norm": 2.9225969169709556, "learning_rate": 5.02577319587629e-07, "loss": 0.2469, "step": 39 }, { "epoch": 0.015467904098994586, "grad_norm": 2.361159949028339, "learning_rate": 5.154639175257732e-07, "loss": 0.2276, "step": 40 }, { "epoch": 0.01585460170146945, "grad_norm": 2.744650920411295, "learning_rate": 5.283505154639176e-07, "loss": 0.2479, "step": 41 }, { "epoch": 0.016241299303944315, "grad_norm": 2.241602471561204, "learning_rate": 5.412371134020619e-07, "loss": 0.2024, "step": 42 }, { "epoch": 0.016627996906419182, "grad_norm": 2.279023536398088, "learning_rate": 5.541237113402062e-07, "loss": 0.214, "step": 43 }, { "epoch": 0.017014694508894045, "grad_norm": 2.3579724398947755, "learning_rate": 5.670103092783505e-07, "loss": 0.1921, "step": 44 }, { "epoch": 0.01740139211136891, "grad_norm": 1.8515901164900208, "learning_rate": 5.798969072164949e-07, "loss": 0.1902, "step": 45 }, { "epoch": 0.017788089713843776, "grad_norm": 1.9716961607609775, "learning_rate": 5.927835051546392e-07, "loss": 0.2007, "step": 46 }, { "epoch": 0.01817478731631864, "grad_norm": 1.9138988076595183, "learning_rate": 6.056701030927835e-07, "loss": 0.1926, "step": 47 }, { "epoch": 0.018561484918793503, "grad_norm": 2.097331459486274, "learning_rate": 6.185567010309279e-07, "loss": 0.1957, "step": 48 }, { "epoch": 0.01894818252126837, "grad_norm": 1.9582592759483881, "learning_rate": 6.314432989690722e-07, "loss": 0.2234, "step": 49 }, { "epoch": 0.019334880123743233, "grad_norm": 1.7816145182684962, "learning_rate": 6.443298969072165e-07, "loss": 0.1955, "step": 50 }, { "epoch": 0.019721577726218097, "grad_norm": 1.5418354682235804, "learning_rate": 6.57216494845361e-07, "loss": 0.2042, "step": 51 }, { "epoch": 0.020108275328692964, "grad_norm": 1.5039222695029162, "learning_rate": 6.701030927835052e-07, "loss": 0.1647, "step": 52 }, { "epoch": 0.020494972931167827, "grad_norm": 1.520482402209741, "learning_rate": 6.829896907216495e-07, "loss": 0.1639, "step": 53 }, { "epoch": 0.02088167053364269, "grad_norm": 1.3665234511115374, "learning_rate": 6.958762886597939e-07, "loss": 0.178, "step": 54 }, { "epoch": 0.021268368136117557, "grad_norm": 1.4705167502526055, "learning_rate": 7.087628865979382e-07, "loss": 0.183, "step": 55 }, { "epoch": 0.02165506573859242, "grad_norm": 1.3882861839307346, "learning_rate": 7.216494845360824e-07, "loss": 0.205, "step": 56 }, { "epoch": 0.022041763341067284, "grad_norm": 1.4614483982645416, "learning_rate": 7.345360824742269e-07, "loss": 0.1822, "step": 57 }, { "epoch": 0.02242846094354215, "grad_norm": 1.5280018874316637, "learning_rate": 7.474226804123711e-07, "loss": 0.1816, "step": 58 }, { "epoch": 0.022815158546017015, "grad_norm": 1.3869113605315677, "learning_rate": 7.603092783505155e-07, "loss": 0.1598, "step": 59 }, { "epoch": 0.02320185614849188, "grad_norm": 1.2179751106806995, "learning_rate": 7.731958762886599e-07, "loss": 0.1556, "step": 60 }, { "epoch": 0.023588553750966745, "grad_norm": 1.6349934208780694, "learning_rate": 7.860824742268041e-07, "loss": 0.1635, "step": 61 }, { "epoch": 0.02397525135344161, "grad_norm": 1.169203352687778, "learning_rate": 7.989690721649485e-07, "loss": 0.1514, "step": 62 }, { "epoch": 0.024361948955916472, "grad_norm": 1.003098546372894, "learning_rate": 8.118556701030928e-07, "loss": 0.1398, "step": 63 }, { "epoch": 0.02474864655839134, "grad_norm": 1.3777348922766224, "learning_rate": 8.247422680412372e-07, "loss": 0.1765, "step": 64 }, { "epoch": 0.025135344160866203, "grad_norm": 1.0912543055270787, "learning_rate": 8.376288659793815e-07, "loss": 0.1319, "step": 65 }, { "epoch": 0.025522041763341066, "grad_norm": 1.0283687625771774, "learning_rate": 8.505154639175259e-07, "loss": 0.1471, "step": 66 }, { "epoch": 0.025908739365815933, "grad_norm": 1.7499183857584755, "learning_rate": 8.634020618556702e-07, "loss": 0.137, "step": 67 }, { "epoch": 0.026295436968290797, "grad_norm": 1.161976463826632, "learning_rate": 8.762886597938144e-07, "loss": 0.1544, "step": 68 }, { "epoch": 0.02668213457076566, "grad_norm": 0.9759478880112308, "learning_rate": 8.891752577319589e-07, "loss": 0.111, "step": 69 }, { "epoch": 0.027068832173240527, "grad_norm": 1.3843395802086353, "learning_rate": 9.020618556701031e-07, "loss": 0.1591, "step": 70 }, { "epoch": 0.02745552977571539, "grad_norm": 1.3509025379649824, "learning_rate": 9.149484536082474e-07, "loss": 0.1465, "step": 71 }, { "epoch": 0.027842227378190254, "grad_norm": 0.9684979142025556, "learning_rate": 9.278350515463919e-07, "loss": 0.1183, "step": 72 }, { "epoch": 0.02822892498066512, "grad_norm": 1.4983987361048416, "learning_rate": 9.407216494845361e-07, "loss": 0.1727, "step": 73 }, { "epoch": 0.028615622583139984, "grad_norm": 1.3787189509563318, "learning_rate": 9.536082474226805e-07, "loss": 0.1634, "step": 74 }, { "epoch": 0.029002320185614848, "grad_norm": 1.2087387785413246, "learning_rate": 9.664948453608248e-07, "loss": 0.1254, "step": 75 }, { "epoch": 0.029389017788089715, "grad_norm": 1.0890415848015333, "learning_rate": 9.793814432989692e-07, "loss": 0.15, "step": 76 }, { "epoch": 0.02977571539056458, "grad_norm": 1.012927673645412, "learning_rate": 9.922680412371133e-07, "loss": 0.1311, "step": 77 }, { "epoch": 0.030162412993039442, "grad_norm": 1.0112561259030004, "learning_rate": 1.005154639175258e-06, "loss": 0.1574, "step": 78 }, { "epoch": 0.03054911059551431, "grad_norm": 1.1072580801749476, "learning_rate": 1.018041237113402e-06, "loss": 0.1253, "step": 79 }, { "epoch": 0.030935808197989172, "grad_norm": 1.5722722742835495, "learning_rate": 1.0309278350515464e-06, "loss": 0.1554, "step": 80 }, { "epoch": 0.031322505800464036, "grad_norm": 0.9402512960901913, "learning_rate": 1.0438144329896908e-06, "loss": 0.1445, "step": 81 }, { "epoch": 0.0317092034029389, "grad_norm": 1.0615783473218448, "learning_rate": 1.0567010309278351e-06, "loss": 0.1302, "step": 82 }, { "epoch": 0.03209590100541377, "grad_norm": 1.0755291484488314, "learning_rate": 1.0695876288659795e-06, "loss": 0.1305, "step": 83 }, { "epoch": 0.03248259860788863, "grad_norm": 1.4752393780094955, "learning_rate": 1.0824742268041239e-06, "loss": 0.1526, "step": 84 }, { "epoch": 0.0328692962103635, "grad_norm": 0.8845097218376421, "learning_rate": 1.0953608247422682e-06, "loss": 0.1272, "step": 85 }, { "epoch": 0.033255993812838364, "grad_norm": 1.0700431583176342, "learning_rate": 1.1082474226804124e-06, "loss": 0.1515, "step": 86 }, { "epoch": 0.033642691415313224, "grad_norm": 1.0971853227240675, "learning_rate": 1.121134020618557e-06, "loss": 0.1362, "step": 87 }, { "epoch": 0.03402938901778809, "grad_norm": 0.818126403840349, "learning_rate": 1.134020618556701e-06, "loss": 0.121, "step": 88 }, { "epoch": 0.03441608662026296, "grad_norm": 1.024432023261264, "learning_rate": 1.1469072164948454e-06, "loss": 0.1451, "step": 89 }, { "epoch": 0.03480278422273782, "grad_norm": 0.9239394232111241, "learning_rate": 1.1597938144329898e-06, "loss": 0.1328, "step": 90 }, { "epoch": 0.035189481825212685, "grad_norm": 0.9315118082093131, "learning_rate": 1.1726804123711342e-06, "loss": 0.1389, "step": 91 }, { "epoch": 0.03557617942768755, "grad_norm": 0.9551324529666022, "learning_rate": 1.1855670103092783e-06, "loss": 0.1426, "step": 92 }, { "epoch": 0.03596287703016241, "grad_norm": 0.9043002968544921, "learning_rate": 1.1984536082474229e-06, "loss": 0.1347, "step": 93 }, { "epoch": 0.03634957463263728, "grad_norm": 0.9107943310998122, "learning_rate": 1.211340206185567e-06, "loss": 0.116, "step": 94 }, { "epoch": 0.036736272235112145, "grad_norm": 1.1485310908301236, "learning_rate": 1.2242268041237114e-06, "loss": 0.1309, "step": 95 }, { "epoch": 0.037122969837587005, "grad_norm": 1.3382857069728502, "learning_rate": 1.2371134020618557e-06, "loss": 0.1548, "step": 96 }, { "epoch": 0.03750966744006187, "grad_norm": 1.3164077895291841, "learning_rate": 1.25e-06, "loss": 0.1635, "step": 97 }, { "epoch": 0.03789636504253674, "grad_norm": 1.1204576160733302, "learning_rate": 1.2628865979381445e-06, "loss": 0.1227, "step": 98 }, { "epoch": 0.0382830626450116, "grad_norm": 0.9252283523339128, "learning_rate": 1.2757731958762886e-06, "loss": 0.1294, "step": 99 }, { "epoch": 0.038669760247486466, "grad_norm": 0.9651109623826672, "learning_rate": 1.288659793814433e-06, "loss": 0.1455, "step": 100 }, { "epoch": 0.03905645784996133, "grad_norm": 1.1818535477496281, "learning_rate": 1.3015463917525775e-06, "loss": 0.1401, "step": 101 }, { "epoch": 0.03944315545243619, "grad_norm": 0.8370123649541144, "learning_rate": 1.314432989690722e-06, "loss": 0.1131, "step": 102 }, { "epoch": 0.03982985305491106, "grad_norm": 1.1838695388474125, "learning_rate": 1.327319587628866e-06, "loss": 0.1867, "step": 103 }, { "epoch": 0.04021655065738593, "grad_norm": 1.0020830250172887, "learning_rate": 1.3402061855670104e-06, "loss": 0.1209, "step": 104 }, { "epoch": 0.04060324825986079, "grad_norm": 1.0055240174451414, "learning_rate": 1.3530927835051548e-06, "loss": 0.1332, "step": 105 }, { "epoch": 0.040989945862335654, "grad_norm": 1.0580275987405954, "learning_rate": 1.365979381443299e-06, "loss": 0.1174, "step": 106 }, { "epoch": 0.04137664346481052, "grad_norm": 0.7756736756331193, "learning_rate": 1.3788659793814435e-06, "loss": 0.1031, "step": 107 }, { "epoch": 0.04176334106728538, "grad_norm": 0.790147310298227, "learning_rate": 1.3917525773195878e-06, "loss": 0.0955, "step": 108 }, { "epoch": 0.04215003866976025, "grad_norm": 1.0959321968404074, "learning_rate": 1.404639175257732e-06, "loss": 0.1257, "step": 109 }, { "epoch": 0.042536736272235115, "grad_norm": 1.0924852630480424, "learning_rate": 1.4175257731958764e-06, "loss": 0.1037, "step": 110 }, { "epoch": 0.042923433874709975, "grad_norm": 0.908826334075511, "learning_rate": 1.4304123711340207e-06, "loss": 0.1203, "step": 111 }, { "epoch": 0.04331013147718484, "grad_norm": 1.1717505431225104, "learning_rate": 1.4432989690721649e-06, "loss": 0.1334, "step": 112 }, { "epoch": 0.04369682907965971, "grad_norm": 0.8124820592753748, "learning_rate": 1.4561855670103094e-06, "loss": 0.0984, "step": 113 }, { "epoch": 0.04408352668213457, "grad_norm": 1.2956398650742311, "learning_rate": 1.4690721649484538e-06, "loss": 0.1201, "step": 114 }, { "epoch": 0.044470224284609436, "grad_norm": 1.3003083711625336, "learning_rate": 1.4819587628865981e-06, "loss": 0.1457, "step": 115 }, { "epoch": 0.0448569218870843, "grad_norm": 1.0401184762437548, "learning_rate": 1.4948453608247423e-06, "loss": 0.098, "step": 116 }, { "epoch": 0.04524361948955916, "grad_norm": 1.0666278625616314, "learning_rate": 1.5077319587628867e-06, "loss": 0.1171, "step": 117 }, { "epoch": 0.04563031709203403, "grad_norm": 1.1347657777019091, "learning_rate": 1.520618556701031e-06, "loss": 0.0951, "step": 118 }, { "epoch": 0.0460170146945089, "grad_norm": 1.2100300252945768, "learning_rate": 1.5335051546391756e-06, "loss": 0.1339, "step": 119 }, { "epoch": 0.04640371229698376, "grad_norm": 1.0543306903711906, "learning_rate": 1.5463917525773197e-06, "loss": 0.1256, "step": 120 }, { "epoch": 0.046790409899458624, "grad_norm": 1.1205028332844844, "learning_rate": 1.559278350515464e-06, "loss": 0.1187, "step": 121 }, { "epoch": 0.04717710750193349, "grad_norm": 1.3729123705965631, "learning_rate": 1.5721649484536082e-06, "loss": 0.1798, "step": 122 }, { "epoch": 0.04756380510440835, "grad_norm": 0.8570207263653069, "learning_rate": 1.5850515463917526e-06, "loss": 0.1104, "step": 123 }, { "epoch": 0.04795050270688322, "grad_norm": 0.9957566545205869, "learning_rate": 1.597938144329897e-06, "loss": 0.1039, "step": 124 }, { "epoch": 0.048337200309358085, "grad_norm": 1.3253282695740258, "learning_rate": 1.6108247422680415e-06, "loss": 0.1016, "step": 125 }, { "epoch": 0.048723897911832945, "grad_norm": 1.1435802373609811, "learning_rate": 1.6237113402061857e-06, "loss": 0.1227, "step": 126 }, { "epoch": 0.04911059551430781, "grad_norm": 1.0857713507716968, "learning_rate": 1.63659793814433e-06, "loss": 0.1281, "step": 127 }, { "epoch": 0.04949729311678268, "grad_norm": 0.8970538673574773, "learning_rate": 1.6494845360824744e-06, "loss": 0.1135, "step": 128 }, { "epoch": 0.04988399071925754, "grad_norm": 1.1613711736063523, "learning_rate": 1.6623711340206185e-06, "loss": 0.1404, "step": 129 }, { "epoch": 0.050270688321732405, "grad_norm": 1.554310049559413, "learning_rate": 1.675257731958763e-06, "loss": 0.1334, "step": 130 }, { "epoch": 0.05065738592420727, "grad_norm": 0.9276060932780231, "learning_rate": 1.6881443298969075e-06, "loss": 0.1068, "step": 131 }, { "epoch": 0.05104408352668213, "grad_norm": 0.939690756120488, "learning_rate": 1.7010309278350518e-06, "loss": 0.1234, "step": 132 }, { "epoch": 0.051430781129157, "grad_norm": 1.424093464208867, "learning_rate": 1.713917525773196e-06, "loss": 0.1411, "step": 133 }, { "epoch": 0.051817478731631866, "grad_norm": 0.8691440714928186, "learning_rate": 1.7268041237113403e-06, "loss": 0.0859, "step": 134 }, { "epoch": 0.052204176334106726, "grad_norm": 0.9687927156594589, "learning_rate": 1.7396907216494847e-06, "loss": 0.1196, "step": 135 }, { "epoch": 0.05259087393658159, "grad_norm": 1.1021440751644536, "learning_rate": 1.7525773195876288e-06, "loss": 0.1241, "step": 136 }, { "epoch": 0.05297757153905646, "grad_norm": 0.8729211767075024, "learning_rate": 1.7654639175257734e-06, "loss": 0.1063, "step": 137 }, { "epoch": 0.05336426914153132, "grad_norm": 1.0079835472296614, "learning_rate": 1.7783505154639178e-06, "loss": 0.1092, "step": 138 }, { "epoch": 0.05375096674400619, "grad_norm": 1.028993045522652, "learning_rate": 1.791237113402062e-06, "loss": 0.1162, "step": 139 }, { "epoch": 0.054137664346481054, "grad_norm": 0.6880773888336736, "learning_rate": 1.8041237113402063e-06, "loss": 0.0894, "step": 140 }, { "epoch": 0.054524361948955914, "grad_norm": 0.90522739374868, "learning_rate": 1.8170103092783506e-06, "loss": 0.0919, "step": 141 }, { "epoch": 0.05491105955143078, "grad_norm": 1.0804101327798608, "learning_rate": 1.8298969072164948e-06, "loss": 0.1301, "step": 142 }, { "epoch": 0.05529775715390565, "grad_norm": 0.8931397486838035, "learning_rate": 1.8427835051546394e-06, "loss": 0.105, "step": 143 }, { "epoch": 0.05568445475638051, "grad_norm": 0.9730971488061428, "learning_rate": 1.8556701030927837e-06, "loss": 0.1149, "step": 144 }, { "epoch": 0.056071152358855375, "grad_norm": 0.8874634144950599, "learning_rate": 1.868556701030928e-06, "loss": 0.1269, "step": 145 }, { "epoch": 0.05645784996133024, "grad_norm": 0.8897073994813484, "learning_rate": 1.8814432989690722e-06, "loss": 0.1121, "step": 146 }, { "epoch": 0.0568445475638051, "grad_norm": 1.1136767481255905, "learning_rate": 1.8943298969072166e-06, "loss": 0.11, "step": 147 }, { "epoch": 0.05723124516627997, "grad_norm": 1.3563154452184856, "learning_rate": 1.907216494845361e-06, "loss": 0.1302, "step": 148 }, { "epoch": 0.057617942768754836, "grad_norm": 0.8368181626675972, "learning_rate": 1.9201030927835053e-06, "loss": 0.0873, "step": 149 }, { "epoch": 0.058004640371229696, "grad_norm": 1.0036645364200323, "learning_rate": 1.9329896907216497e-06, "loss": 0.1114, "step": 150 }, { "epoch": 0.05839133797370456, "grad_norm": 0.9465947229789281, "learning_rate": 1.945876288659794e-06, "loss": 0.1294, "step": 151 }, { "epoch": 0.05877803557617943, "grad_norm": 1.0554877677052055, "learning_rate": 1.9587628865979384e-06, "loss": 0.1321, "step": 152 }, { "epoch": 0.05916473317865429, "grad_norm": 1.0430374934142534, "learning_rate": 1.9716494845360827e-06, "loss": 0.1137, "step": 153 }, { "epoch": 0.05955143078112916, "grad_norm": 0.8870246198108427, "learning_rate": 1.9845360824742267e-06, "loss": 0.0818, "step": 154 }, { "epoch": 0.059938128383604024, "grad_norm": 0.8740394658725257, "learning_rate": 1.9974226804123715e-06, "loss": 0.1098, "step": 155 }, { "epoch": 0.060324825986078884, "grad_norm": 1.3329628667242435, "learning_rate": 2.010309278350516e-06, "loss": 0.1053, "step": 156 }, { "epoch": 0.06071152358855375, "grad_norm": 0.9365109986609086, "learning_rate": 2.02319587628866e-06, "loss": 0.134, "step": 157 }, { "epoch": 0.06109822119102862, "grad_norm": 0.8931261879776613, "learning_rate": 2.036082474226804e-06, "loss": 0.1116, "step": 158 }, { "epoch": 0.06148491879350348, "grad_norm": 0.8871929680953761, "learning_rate": 2.0489690721649485e-06, "loss": 0.0845, "step": 159 }, { "epoch": 0.061871616395978345, "grad_norm": 0.8946751651997866, "learning_rate": 2.061855670103093e-06, "loss": 0.0949, "step": 160 }, { "epoch": 0.06225831399845321, "grad_norm": 0.6952496918587947, "learning_rate": 2.0747422680412376e-06, "loss": 0.0979, "step": 161 }, { "epoch": 0.06264501160092807, "grad_norm": 0.8945556273019543, "learning_rate": 2.0876288659793816e-06, "loss": 0.1105, "step": 162 }, { "epoch": 0.06303170920340294, "grad_norm": 0.7592931017428645, "learning_rate": 2.100515463917526e-06, "loss": 0.091, "step": 163 }, { "epoch": 0.0634184068058778, "grad_norm": 0.8433904149019388, "learning_rate": 2.1134020618556703e-06, "loss": 0.1142, "step": 164 }, { "epoch": 0.06380510440835267, "grad_norm": 0.900831570741077, "learning_rate": 2.1262886597938146e-06, "loss": 0.1115, "step": 165 }, { "epoch": 0.06419180201082754, "grad_norm": 0.8113890635956366, "learning_rate": 2.139175257731959e-06, "loss": 0.1002, "step": 166 }, { "epoch": 0.06457849961330239, "grad_norm": 1.0789015869514411, "learning_rate": 2.1520618556701033e-06, "loss": 0.1316, "step": 167 }, { "epoch": 0.06496519721577726, "grad_norm": 0.8951775242549909, "learning_rate": 2.1649484536082477e-06, "loss": 0.1102, "step": 168 }, { "epoch": 0.06535189481825213, "grad_norm": 0.7811031299037886, "learning_rate": 2.177835051546392e-06, "loss": 0.0864, "step": 169 }, { "epoch": 0.065738592420727, "grad_norm": 0.8888276155112317, "learning_rate": 2.1907216494845364e-06, "loss": 0.1138, "step": 170 }, { "epoch": 0.06612529002320186, "grad_norm": 0.900027164103491, "learning_rate": 2.2036082474226804e-06, "loss": 0.0926, "step": 171 }, { "epoch": 0.06651198762567673, "grad_norm": 0.9133068038740638, "learning_rate": 2.2164948453608247e-06, "loss": 0.0917, "step": 172 }, { "epoch": 0.06689868522815158, "grad_norm": 0.6971841261546431, "learning_rate": 2.2293814432989695e-06, "loss": 0.091, "step": 173 }, { "epoch": 0.06728538283062645, "grad_norm": 0.8728158485455457, "learning_rate": 2.242268041237114e-06, "loss": 0.103, "step": 174 }, { "epoch": 0.06767208043310131, "grad_norm": 0.8769767010998937, "learning_rate": 2.255154639175258e-06, "loss": 0.1149, "step": 175 }, { "epoch": 0.06805877803557618, "grad_norm": 1.1884569511080465, "learning_rate": 2.268041237113402e-06, "loss": 0.1024, "step": 176 }, { "epoch": 0.06844547563805105, "grad_norm": 0.9263789677662791, "learning_rate": 2.2809278350515465e-06, "loss": 0.1049, "step": 177 }, { "epoch": 0.06883217324052592, "grad_norm": 0.7459201063578768, "learning_rate": 2.293814432989691e-06, "loss": 0.0991, "step": 178 }, { "epoch": 0.06921887084300077, "grad_norm": 0.6185528482583521, "learning_rate": 2.3067010309278352e-06, "loss": 0.0795, "step": 179 }, { "epoch": 0.06960556844547564, "grad_norm": 1.0591203572183774, "learning_rate": 2.3195876288659796e-06, "loss": 0.1145, "step": 180 }, { "epoch": 0.0699922660479505, "grad_norm": 1.1290476748768599, "learning_rate": 2.332474226804124e-06, "loss": 0.1015, "step": 181 }, { "epoch": 0.07037896365042537, "grad_norm": 0.8231981516829948, "learning_rate": 2.3453608247422683e-06, "loss": 0.0941, "step": 182 }, { "epoch": 0.07076566125290024, "grad_norm": 0.6327987522073508, "learning_rate": 2.3582474226804127e-06, "loss": 0.0781, "step": 183 }, { "epoch": 0.0711523588553751, "grad_norm": 0.9357827302052948, "learning_rate": 2.3711340206185566e-06, "loss": 0.1086, "step": 184 }, { "epoch": 0.07153905645784996, "grad_norm": 0.9428699863075525, "learning_rate": 2.3840206185567014e-06, "loss": 0.0767, "step": 185 }, { "epoch": 0.07192575406032482, "grad_norm": 1.0897235233820879, "learning_rate": 2.3969072164948458e-06, "loss": 0.1166, "step": 186 }, { "epoch": 0.07231245166279969, "grad_norm": 0.8949479543163881, "learning_rate": 2.40979381443299e-06, "loss": 0.0867, "step": 187 }, { "epoch": 0.07269914926527456, "grad_norm": 0.807777665858959, "learning_rate": 2.422680412371134e-06, "loss": 0.1062, "step": 188 }, { "epoch": 0.07308584686774942, "grad_norm": 1.1921181742701576, "learning_rate": 2.4355670103092784e-06, "loss": 0.154, "step": 189 }, { "epoch": 0.07347254447022429, "grad_norm": 0.8509955699023334, "learning_rate": 2.4484536082474228e-06, "loss": 0.0961, "step": 190 }, { "epoch": 0.07385924207269914, "grad_norm": 0.9106664551052476, "learning_rate": 2.4613402061855676e-06, "loss": 0.1204, "step": 191 }, { "epoch": 0.07424593967517401, "grad_norm": 0.7048076201753687, "learning_rate": 2.4742268041237115e-06, "loss": 0.0899, "step": 192 }, { "epoch": 0.07463263727764888, "grad_norm": 0.8212268763002206, "learning_rate": 2.487113402061856e-06, "loss": 0.1138, "step": 193 }, { "epoch": 0.07501933488012374, "grad_norm": 1.115254549453141, "learning_rate": 2.5e-06, "loss": 0.088, "step": 194 }, { "epoch": 0.07540603248259861, "grad_norm": 1.261598476637348, "learning_rate": 2.5128865979381446e-06, "loss": 0.1376, "step": 195 }, { "epoch": 0.07579273008507348, "grad_norm": 0.7589598932444372, "learning_rate": 2.525773195876289e-06, "loss": 0.0871, "step": 196 }, { "epoch": 0.07617942768754833, "grad_norm": 0.9769034372099656, "learning_rate": 2.538659793814433e-06, "loss": 0.0974, "step": 197 }, { "epoch": 0.0765661252900232, "grad_norm": 1.0477722391044093, "learning_rate": 2.5515463917525772e-06, "loss": 0.099, "step": 198 }, { "epoch": 0.07695282289249807, "grad_norm": 0.76494609286112, "learning_rate": 2.5644329896907216e-06, "loss": 0.0908, "step": 199 }, { "epoch": 0.07733952049497293, "grad_norm": 0.848410139943853, "learning_rate": 2.577319587628866e-06, "loss": 0.0997, "step": 200 }, { "epoch": 0.0777262180974478, "grad_norm": 1.2885943033556224, "learning_rate": 2.5902061855670107e-06, "loss": 0.1222, "step": 201 }, { "epoch": 0.07811291569992267, "grad_norm": 1.3423700603665634, "learning_rate": 2.603092783505155e-06, "loss": 0.1487, "step": 202 }, { "epoch": 0.07849961330239752, "grad_norm": 0.8685750142822143, "learning_rate": 2.6159793814432994e-06, "loss": 0.1043, "step": 203 }, { "epoch": 0.07888631090487239, "grad_norm": 0.8465441460885929, "learning_rate": 2.628865979381444e-06, "loss": 0.0936, "step": 204 }, { "epoch": 0.07927300850734725, "grad_norm": 1.1188605276562367, "learning_rate": 2.6417525773195877e-06, "loss": 0.1157, "step": 205 }, { "epoch": 0.07965970610982212, "grad_norm": 0.9121900697287191, "learning_rate": 2.654639175257732e-06, "loss": 0.108, "step": 206 }, { "epoch": 0.08004640371229699, "grad_norm": 1.5476498757444497, "learning_rate": 2.6675257731958765e-06, "loss": 0.103, "step": 207 }, { "epoch": 0.08043310131477185, "grad_norm": 0.9149311254707395, "learning_rate": 2.680412371134021e-06, "loss": 0.0955, "step": 208 }, { "epoch": 0.08081979891724671, "grad_norm": 1.165069030594075, "learning_rate": 2.693298969072165e-06, "loss": 0.1115, "step": 209 }, { "epoch": 0.08120649651972157, "grad_norm": 1.058936074418461, "learning_rate": 2.7061855670103095e-06, "loss": 0.0872, "step": 210 }, { "epoch": 0.08159319412219644, "grad_norm": 0.662147874257291, "learning_rate": 2.7190721649484535e-06, "loss": 0.0655, "step": 211 }, { "epoch": 0.08197989172467131, "grad_norm": 0.9570797435389508, "learning_rate": 2.731958762886598e-06, "loss": 0.1445, "step": 212 }, { "epoch": 0.08236658932714618, "grad_norm": 0.797890561247842, "learning_rate": 2.7448453608247426e-06, "loss": 0.074, "step": 213 }, { "epoch": 0.08275328692962104, "grad_norm": 2.0566524655627703, "learning_rate": 2.757731958762887e-06, "loss": 0.1108, "step": 214 }, { "epoch": 0.0831399845320959, "grad_norm": 1.985993883399437, "learning_rate": 2.7706185567010313e-06, "loss": 0.1174, "step": 215 }, { "epoch": 0.08352668213457076, "grad_norm": 1.1122989709459543, "learning_rate": 2.7835051546391757e-06, "loss": 0.1122, "step": 216 }, { "epoch": 0.08391337973704563, "grad_norm": 1.1250613179359783, "learning_rate": 2.79639175257732e-06, "loss": 0.1007, "step": 217 }, { "epoch": 0.0843000773395205, "grad_norm": 1.0966661133944562, "learning_rate": 2.809278350515464e-06, "loss": 0.0977, "step": 218 }, { "epoch": 0.08468677494199536, "grad_norm": 1.2760401551720568, "learning_rate": 2.8221649484536083e-06, "loss": 0.1098, "step": 219 }, { "epoch": 0.08507347254447023, "grad_norm": 1.4563954507423325, "learning_rate": 2.8350515463917527e-06, "loss": 0.1323, "step": 220 }, { "epoch": 0.08546017014694508, "grad_norm": 1.0962460830317229, "learning_rate": 2.847938144329897e-06, "loss": 0.0957, "step": 221 }, { "epoch": 0.08584686774941995, "grad_norm": 1.274082138302703, "learning_rate": 2.8608247422680414e-06, "loss": 0.1022, "step": 222 }, { "epoch": 0.08623356535189482, "grad_norm": 1.0147597959053443, "learning_rate": 2.8737113402061858e-06, "loss": 0.0912, "step": 223 }, { "epoch": 0.08662026295436968, "grad_norm": 0.7108671095712679, "learning_rate": 2.8865979381443297e-06, "loss": 0.0849, "step": 224 }, { "epoch": 0.08700696055684455, "grad_norm": 0.8232111202181209, "learning_rate": 2.899484536082475e-06, "loss": 0.1218, "step": 225 }, { "epoch": 0.08739365815931942, "grad_norm": 1.145064434459928, "learning_rate": 2.912371134020619e-06, "loss": 0.0848, "step": 226 }, { "epoch": 0.08778035576179427, "grad_norm": 1.5740412395165904, "learning_rate": 2.9252577319587632e-06, "loss": 0.1438, "step": 227 }, { "epoch": 0.08816705336426914, "grad_norm": 0.8678780133557927, "learning_rate": 2.9381443298969076e-06, "loss": 0.0855, "step": 228 }, { "epoch": 0.088553750966744, "grad_norm": 0.7737339768340524, "learning_rate": 2.951030927835052e-06, "loss": 0.0937, "step": 229 }, { "epoch": 0.08894044856921887, "grad_norm": 0.9798060836960556, "learning_rate": 2.9639175257731963e-06, "loss": 0.1008, "step": 230 }, { "epoch": 0.08932714617169374, "grad_norm": 0.8833987311267308, "learning_rate": 2.9768041237113402e-06, "loss": 0.0902, "step": 231 }, { "epoch": 0.0897138437741686, "grad_norm": 0.8182976242565493, "learning_rate": 2.9896907216494846e-06, "loss": 0.1041, "step": 232 }, { "epoch": 0.09010054137664346, "grad_norm": 0.9421336732915081, "learning_rate": 3.002577319587629e-06, "loss": 0.1173, "step": 233 }, { "epoch": 0.09048723897911833, "grad_norm": 0.7917172738193541, "learning_rate": 3.0154639175257733e-06, "loss": 0.0713, "step": 234 }, { "epoch": 0.09087393658159319, "grad_norm": 1.2416115951423794, "learning_rate": 3.0283505154639177e-06, "loss": 0.1069, "step": 235 }, { "epoch": 0.09126063418406806, "grad_norm": 0.7371191139247785, "learning_rate": 3.041237113402062e-06, "loss": 0.069, "step": 236 }, { "epoch": 0.09164733178654293, "grad_norm": 0.7915833218163134, "learning_rate": 3.054123711340207e-06, "loss": 0.0912, "step": 237 }, { "epoch": 0.0920340293890178, "grad_norm": 0.9473503411429703, "learning_rate": 3.067010309278351e-06, "loss": 0.1078, "step": 238 }, { "epoch": 0.09242072699149265, "grad_norm": 0.8406286676335835, "learning_rate": 3.079896907216495e-06, "loss": 0.091, "step": 239 }, { "epoch": 0.09280742459396751, "grad_norm": 1.0692528360230782, "learning_rate": 3.0927835051546395e-06, "loss": 0.1039, "step": 240 }, { "epoch": 0.09319412219644238, "grad_norm": 0.8722841974626556, "learning_rate": 3.105670103092784e-06, "loss": 0.1044, "step": 241 }, { "epoch": 0.09358081979891725, "grad_norm": 1.2273314964528699, "learning_rate": 3.118556701030928e-06, "loss": 0.0801, "step": 242 }, { "epoch": 0.09396751740139211, "grad_norm": 1.252534301438174, "learning_rate": 3.1314432989690725e-06, "loss": 0.1152, "step": 243 }, { "epoch": 0.09435421500386698, "grad_norm": 1.03066490699483, "learning_rate": 3.1443298969072165e-06, "loss": 0.0991, "step": 244 }, { "epoch": 0.09474091260634183, "grad_norm": 0.9451171627246024, "learning_rate": 3.157216494845361e-06, "loss": 0.1081, "step": 245 }, { "epoch": 0.0951276102088167, "grad_norm": 0.8183997441360881, "learning_rate": 3.170103092783505e-06, "loss": 0.0991, "step": 246 }, { "epoch": 0.09551430781129157, "grad_norm": 0.6360664362084811, "learning_rate": 3.1829896907216496e-06, "loss": 0.0844, "step": 247 }, { "epoch": 0.09590100541376644, "grad_norm": 0.8487010666793622, "learning_rate": 3.195876288659794e-06, "loss": 0.1109, "step": 248 }, { "epoch": 0.0962877030162413, "grad_norm": 0.9263214332510412, "learning_rate": 3.2087628865979387e-06, "loss": 0.1082, "step": 249 }, { "epoch": 0.09667440061871617, "grad_norm": 0.7101245219581213, "learning_rate": 3.221649484536083e-06, "loss": 0.0947, "step": 250 }, { "epoch": 0.09706109822119102, "grad_norm": 0.5920764362967631, "learning_rate": 3.2345360824742274e-06, "loss": 0.0725, "step": 251 }, { "epoch": 0.09744779582366589, "grad_norm": 0.9302392248883257, "learning_rate": 3.2474226804123714e-06, "loss": 0.1196, "step": 252 }, { "epoch": 0.09783449342614076, "grad_norm": 1.127917028669162, "learning_rate": 3.2603092783505157e-06, "loss": 0.1273, "step": 253 }, { "epoch": 0.09822119102861562, "grad_norm": 0.7340958578386589, "learning_rate": 3.27319587628866e-06, "loss": 0.0901, "step": 254 }, { "epoch": 0.09860788863109049, "grad_norm": 0.9083940379824932, "learning_rate": 3.2860824742268044e-06, "loss": 0.1038, "step": 255 }, { "epoch": 0.09899458623356536, "grad_norm": 0.962965134736419, "learning_rate": 3.298969072164949e-06, "loss": 0.1075, "step": 256 }, { "epoch": 0.09938128383604021, "grad_norm": 0.7143711579151675, "learning_rate": 3.311855670103093e-06, "loss": 0.075, "step": 257 }, { "epoch": 0.09976798143851508, "grad_norm": 0.7096391773289191, "learning_rate": 3.324742268041237e-06, "loss": 0.0749, "step": 258 }, { "epoch": 0.10015467904098994, "grad_norm": 0.8380905892407408, "learning_rate": 3.3376288659793814e-06, "loss": 0.0767, "step": 259 }, { "epoch": 0.10054137664346481, "grad_norm": 1.5385262733111844, "learning_rate": 3.350515463917526e-06, "loss": 0.0882, "step": 260 }, { "epoch": 0.10092807424593968, "grad_norm": 0.9891862276849915, "learning_rate": 3.3634020618556706e-06, "loss": 0.1131, "step": 261 }, { "epoch": 0.10131477184841454, "grad_norm": 0.9042248682691031, "learning_rate": 3.376288659793815e-06, "loss": 0.09, "step": 262 }, { "epoch": 0.1017014694508894, "grad_norm": 0.875926380891838, "learning_rate": 3.3891752577319593e-06, "loss": 0.0856, "step": 263 }, { "epoch": 0.10208816705336426, "grad_norm": 0.749716157694066, "learning_rate": 3.4020618556701037e-06, "loss": 0.0974, "step": 264 }, { "epoch": 0.10247486465583913, "grad_norm": 0.7840042961521685, "learning_rate": 3.4149484536082476e-06, "loss": 0.084, "step": 265 }, { "epoch": 0.102861562258314, "grad_norm": 0.7185404192312065, "learning_rate": 3.427835051546392e-06, "loss": 0.0855, "step": 266 }, { "epoch": 0.10324825986078887, "grad_norm": 1.2013955030257022, "learning_rate": 3.4407216494845363e-06, "loss": 0.144, "step": 267 }, { "epoch": 0.10363495746326373, "grad_norm": 1.1834222935886984, "learning_rate": 3.4536082474226807e-06, "loss": 0.1205, "step": 268 }, { "epoch": 0.10402165506573859, "grad_norm": 0.8409258386815291, "learning_rate": 3.466494845360825e-06, "loss": 0.1012, "step": 269 }, { "epoch": 0.10440835266821345, "grad_norm": 1.0291102518329347, "learning_rate": 3.4793814432989694e-06, "loss": 0.1149, "step": 270 }, { "epoch": 0.10479505027068832, "grad_norm": 0.6896272876769647, "learning_rate": 3.4922680412371133e-06, "loss": 0.0693, "step": 271 }, { "epoch": 0.10518174787316319, "grad_norm": 1.0229677300943767, "learning_rate": 3.5051546391752577e-06, "loss": 0.1134, "step": 272 }, { "epoch": 0.10556844547563805, "grad_norm": 0.7621850154062054, "learning_rate": 3.5180412371134025e-06, "loss": 0.0925, "step": 273 }, { "epoch": 0.10595514307811292, "grad_norm": 0.9638004733405466, "learning_rate": 3.530927835051547e-06, "loss": 0.1388, "step": 274 }, { "epoch": 0.10634184068058777, "grad_norm": 0.6973862440975864, "learning_rate": 3.543814432989691e-06, "loss": 0.0862, "step": 275 }, { "epoch": 0.10672853828306264, "grad_norm": 0.8129840562848517, "learning_rate": 3.5567010309278356e-06, "loss": 0.0939, "step": 276 }, { "epoch": 0.10711523588553751, "grad_norm": 0.6808744199626738, "learning_rate": 3.56958762886598e-06, "loss": 0.084, "step": 277 }, { "epoch": 0.10750193348801237, "grad_norm": 0.897113436485918, "learning_rate": 3.582474226804124e-06, "loss": 0.103, "step": 278 }, { "epoch": 0.10788863109048724, "grad_norm": 0.9434810806823265, "learning_rate": 3.595360824742268e-06, "loss": 0.0877, "step": 279 }, { "epoch": 0.10827532869296211, "grad_norm": 0.8132609017307095, "learning_rate": 3.6082474226804126e-06, "loss": 0.0834, "step": 280 }, { "epoch": 0.10866202629543696, "grad_norm": 0.9809552808646648, "learning_rate": 3.621134020618557e-06, "loss": 0.1007, "step": 281 }, { "epoch": 0.10904872389791183, "grad_norm": 0.8036916974194168, "learning_rate": 3.6340206185567013e-06, "loss": 0.0983, "step": 282 }, { "epoch": 0.1094354215003867, "grad_norm": 0.9018423651921023, "learning_rate": 3.6469072164948456e-06, "loss": 0.0991, "step": 283 }, { "epoch": 0.10982211910286156, "grad_norm": 0.9827877928244836, "learning_rate": 3.6597938144329896e-06, "loss": 0.1014, "step": 284 }, { "epoch": 0.11020881670533643, "grad_norm": 0.5431048686703187, "learning_rate": 3.6726804123711348e-06, "loss": 0.0654, "step": 285 }, { "epoch": 0.1105955143078113, "grad_norm": 0.8961333844162951, "learning_rate": 3.6855670103092787e-06, "loss": 0.0933, "step": 286 }, { "epoch": 0.11098221191028615, "grad_norm": 1.1157470158076923, "learning_rate": 3.698453608247423e-06, "loss": 0.1229, "step": 287 }, { "epoch": 0.11136890951276102, "grad_norm": 1.1116017504359068, "learning_rate": 3.7113402061855674e-06, "loss": 0.1024, "step": 288 }, { "epoch": 0.11175560711523588, "grad_norm": 0.6946198725641521, "learning_rate": 3.724226804123712e-06, "loss": 0.1011, "step": 289 }, { "epoch": 0.11214230471771075, "grad_norm": 1.0956946466320725, "learning_rate": 3.737113402061856e-06, "loss": 0.1117, "step": 290 }, { "epoch": 0.11252900232018562, "grad_norm": 0.7686628990287813, "learning_rate": 3.7500000000000005e-06, "loss": 0.0863, "step": 291 }, { "epoch": 0.11291569992266048, "grad_norm": 0.8336803700967059, "learning_rate": 3.7628865979381445e-06, "loss": 0.0855, "step": 292 }, { "epoch": 0.11330239752513535, "grad_norm": 1.036382184385844, "learning_rate": 3.775773195876289e-06, "loss": 0.1013, "step": 293 }, { "epoch": 0.1136890951276102, "grad_norm": 0.7204375109333548, "learning_rate": 3.788659793814433e-06, "loss": 0.0849, "step": 294 }, { "epoch": 0.11407579273008507, "grad_norm": 0.8413636766526129, "learning_rate": 3.8015463917525775e-06, "loss": 0.0992, "step": 295 }, { "epoch": 0.11446249033255994, "grad_norm": 0.9383203105738177, "learning_rate": 3.814432989690722e-06, "loss": 0.0922, "step": 296 }, { "epoch": 0.1148491879350348, "grad_norm": 0.7873472841547311, "learning_rate": 3.827319587628866e-06, "loss": 0.1026, "step": 297 }, { "epoch": 0.11523588553750967, "grad_norm": 0.8177160471163044, "learning_rate": 3.840206185567011e-06, "loss": 0.1041, "step": 298 }, { "epoch": 0.11562258313998454, "grad_norm": 1.0482933570720412, "learning_rate": 3.853092783505155e-06, "loss": 0.0984, "step": 299 }, { "epoch": 0.11600928074245939, "grad_norm": 0.6797841364997734, "learning_rate": 3.865979381443299e-06, "loss": 0.066, "step": 300 }, { "epoch": 0.11639597834493426, "grad_norm": 0.7569200079906829, "learning_rate": 3.878865979381444e-06, "loss": 0.0783, "step": 301 }, { "epoch": 0.11678267594740913, "grad_norm": 0.9349817948720307, "learning_rate": 3.891752577319588e-06, "loss": 0.0826, "step": 302 }, { "epoch": 0.11716937354988399, "grad_norm": 0.7511194707632698, "learning_rate": 3.904639175257732e-06, "loss": 0.0892, "step": 303 }, { "epoch": 0.11755607115235886, "grad_norm": 0.7967594488626344, "learning_rate": 3.917525773195877e-06, "loss": 0.0937, "step": 304 }, { "epoch": 0.11794276875483373, "grad_norm": 1.0491001672322793, "learning_rate": 3.930412371134021e-06, "loss": 0.1193, "step": 305 }, { "epoch": 0.11832946635730858, "grad_norm": 1.0942889972814198, "learning_rate": 3.9432989690721655e-06, "loss": 0.1195, "step": 306 }, { "epoch": 0.11871616395978345, "grad_norm": 0.7732624546191981, "learning_rate": 3.956185567010309e-06, "loss": 0.0777, "step": 307 }, { "epoch": 0.11910286156225831, "grad_norm": 0.8687673940557797, "learning_rate": 3.969072164948453e-06, "loss": 0.0887, "step": 308 }, { "epoch": 0.11948955916473318, "grad_norm": 0.9152475448346877, "learning_rate": 3.981958762886598e-06, "loss": 0.1149, "step": 309 }, { "epoch": 0.11987625676720805, "grad_norm": 2.3955663019790467, "learning_rate": 3.994845360824743e-06, "loss": 0.0814, "step": 310 }, { "epoch": 0.12026295436968291, "grad_norm": 0.9812184381434256, "learning_rate": 4.007731958762887e-06, "loss": 0.0796, "step": 311 }, { "epoch": 0.12064965197215777, "grad_norm": 0.7674868985833461, "learning_rate": 4.020618556701032e-06, "loss": 0.0757, "step": 312 }, { "epoch": 0.12103634957463263, "grad_norm": 0.9679271183186337, "learning_rate": 4.033505154639176e-06, "loss": 0.0929, "step": 313 }, { "epoch": 0.1214230471771075, "grad_norm": 1.0962907173744096, "learning_rate": 4.04639175257732e-06, "loss": 0.0946, "step": 314 }, { "epoch": 0.12180974477958237, "grad_norm": 0.9731728043660695, "learning_rate": 4.059278350515464e-06, "loss": 0.0859, "step": 315 }, { "epoch": 0.12219644238205724, "grad_norm": 0.7149095526756181, "learning_rate": 4.072164948453608e-06, "loss": 0.071, "step": 316 }, { "epoch": 0.1225831399845321, "grad_norm": 0.9651449551943639, "learning_rate": 4.085051546391753e-06, "loss": 0.0805, "step": 317 }, { "epoch": 0.12296983758700696, "grad_norm": 0.973517483879309, "learning_rate": 4.097938144329897e-06, "loss": 0.1246, "step": 318 }, { "epoch": 0.12335653518948182, "grad_norm": 0.6519980428267449, "learning_rate": 4.110824742268042e-06, "loss": 0.0674, "step": 319 }, { "epoch": 0.12374323279195669, "grad_norm": 0.8104670165202404, "learning_rate": 4.123711340206186e-06, "loss": 0.0922, "step": 320 }, { "epoch": 0.12412993039443156, "grad_norm": 0.6858030351595606, "learning_rate": 4.13659793814433e-06, "loss": 0.0814, "step": 321 }, { "epoch": 0.12451662799690642, "grad_norm": 1.1236932462832376, "learning_rate": 4.149484536082475e-06, "loss": 0.0969, "step": 322 }, { "epoch": 0.12490332559938129, "grad_norm": 0.7453106503061325, "learning_rate": 4.162371134020619e-06, "loss": 0.0832, "step": 323 }, { "epoch": 0.12529002320185614, "grad_norm": 1.8514672924213231, "learning_rate": 4.175257731958763e-06, "loss": 0.0805, "step": 324 }, { "epoch": 0.125676720804331, "grad_norm": 0.9838236839418637, "learning_rate": 4.188144329896908e-06, "loss": 0.1038, "step": 325 }, { "epoch": 0.12606341840680588, "grad_norm": 0.794733956937775, "learning_rate": 4.201030927835052e-06, "loss": 0.0728, "step": 326 }, { "epoch": 0.12645011600928074, "grad_norm": 0.9271052696885549, "learning_rate": 4.213917525773197e-06, "loss": 0.1031, "step": 327 }, { "epoch": 0.1268368136117556, "grad_norm": 0.6663230450137797, "learning_rate": 4.2268041237113405e-06, "loss": 0.0763, "step": 328 }, { "epoch": 0.12722351121423048, "grad_norm": 1.0055441226394581, "learning_rate": 4.2396907216494845e-06, "loss": 0.0783, "step": 329 }, { "epoch": 0.12761020881670534, "grad_norm": 0.6812657901488884, "learning_rate": 4.252577319587629e-06, "loss": 0.0673, "step": 330 }, { "epoch": 0.1279969064191802, "grad_norm": 0.874928638600509, "learning_rate": 4.265463917525773e-06, "loss": 0.1025, "step": 331 }, { "epoch": 0.12838360402165508, "grad_norm": 0.7371897250078046, "learning_rate": 4.278350515463918e-06, "loss": 0.0903, "step": 332 }, { "epoch": 0.12877030162412992, "grad_norm": 0.7940941190518299, "learning_rate": 4.291237113402062e-06, "loss": 0.0791, "step": 333 }, { "epoch": 0.12915699922660479, "grad_norm": 0.8112780095697195, "learning_rate": 4.304123711340207e-06, "loss": 0.0942, "step": 334 }, { "epoch": 0.12954369682907965, "grad_norm": 0.6766786997716805, "learning_rate": 4.3170103092783515e-06, "loss": 0.0948, "step": 335 }, { "epoch": 0.12993039443155452, "grad_norm": 1.3831618525201574, "learning_rate": 4.329896907216495e-06, "loss": 0.0716, "step": 336 }, { "epoch": 0.13031709203402939, "grad_norm": 0.7913491250939042, "learning_rate": 4.342783505154639e-06, "loss": 0.0777, "step": 337 }, { "epoch": 0.13070378963650425, "grad_norm": 0.9560120868367478, "learning_rate": 4.355670103092784e-06, "loss": 0.1007, "step": 338 }, { "epoch": 0.13109048723897912, "grad_norm": 0.6881163393770976, "learning_rate": 4.368556701030928e-06, "loss": 0.0805, "step": 339 }, { "epoch": 0.131477184841454, "grad_norm": 0.6712908635317006, "learning_rate": 4.381443298969073e-06, "loss": 0.0803, "step": 340 }, { "epoch": 0.13186388244392885, "grad_norm": 1.0246480786528698, "learning_rate": 4.394329896907217e-06, "loss": 0.0963, "step": 341 }, { "epoch": 0.13225058004640372, "grad_norm": 0.7206023185091615, "learning_rate": 4.407216494845361e-06, "loss": 0.0774, "step": 342 }, { "epoch": 0.1326372776488786, "grad_norm": 0.5696067521040848, "learning_rate": 4.4201030927835055e-06, "loss": 0.0663, "step": 343 }, { "epoch": 0.13302397525135345, "grad_norm": 0.7672758541847725, "learning_rate": 4.4329896907216494e-06, "loss": 0.069, "step": 344 }, { "epoch": 0.1334106728538283, "grad_norm": 0.8892529827007091, "learning_rate": 4.445876288659794e-06, "loss": 0.0969, "step": 345 }, { "epoch": 0.13379737045630316, "grad_norm": 0.767417038186944, "learning_rate": 4.458762886597939e-06, "loss": 0.0909, "step": 346 }, { "epoch": 0.13418406805877803, "grad_norm": 0.6253993101600073, "learning_rate": 4.471649484536083e-06, "loss": 0.0617, "step": 347 }, { "epoch": 0.1345707656612529, "grad_norm": 0.7074302592018953, "learning_rate": 4.484536082474228e-06, "loss": 0.0723, "step": 348 }, { "epoch": 0.13495746326372776, "grad_norm": 0.6471892911329291, "learning_rate": 4.497422680412372e-06, "loss": 0.0859, "step": 349 }, { "epoch": 0.13534416086620263, "grad_norm": 1.1171045002954587, "learning_rate": 4.510309278350516e-06, "loss": 0.0764, "step": 350 }, { "epoch": 0.1357308584686775, "grad_norm": 0.7910657925371162, "learning_rate": 4.52319587628866e-06, "loss": 0.0735, "step": 351 }, { "epoch": 0.13611755607115236, "grad_norm": 1.0143826442544885, "learning_rate": 4.536082474226804e-06, "loss": 0.0867, "step": 352 }, { "epoch": 0.13650425367362723, "grad_norm": 1.864241117554201, "learning_rate": 4.548969072164949e-06, "loss": 0.1064, "step": 353 }, { "epoch": 0.1368909512761021, "grad_norm": 0.7819829033948802, "learning_rate": 4.561855670103093e-06, "loss": 0.0981, "step": 354 }, { "epoch": 0.13727764887857696, "grad_norm": 0.7819257783570431, "learning_rate": 4.574742268041237e-06, "loss": 0.1092, "step": 355 }, { "epoch": 0.13766434648105183, "grad_norm": 1.1735255440328356, "learning_rate": 4.587628865979382e-06, "loss": 0.1137, "step": 356 }, { "epoch": 0.13805104408352667, "grad_norm": 0.6759387225321871, "learning_rate": 4.600515463917526e-06, "loss": 0.0666, "step": 357 }, { "epoch": 0.13843774168600154, "grad_norm": 0.6365758870714017, "learning_rate": 4.6134020618556705e-06, "loss": 0.077, "step": 358 }, { "epoch": 0.1388244392884764, "grad_norm": 0.9262307175236822, "learning_rate": 4.626288659793815e-06, "loss": 0.1035, "step": 359 }, { "epoch": 0.13921113689095127, "grad_norm": 0.8188277387657003, "learning_rate": 4.639175257731959e-06, "loss": 0.081, "step": 360 }, { "epoch": 0.13959783449342614, "grad_norm": 0.81240673378559, "learning_rate": 4.652061855670104e-06, "loss": 0.0994, "step": 361 }, { "epoch": 0.139984532095901, "grad_norm": 2.7801905007671373, "learning_rate": 4.664948453608248e-06, "loss": 0.0948, "step": 362 }, { "epoch": 0.14037122969837587, "grad_norm": 1.1868487389370557, "learning_rate": 4.677835051546392e-06, "loss": 0.0912, "step": 363 }, { "epoch": 0.14075792730085074, "grad_norm": 0.774253718652216, "learning_rate": 4.690721649484537e-06, "loss": 0.078, "step": 364 }, { "epoch": 0.1411446249033256, "grad_norm": 0.8637914023924564, "learning_rate": 4.7036082474226806e-06, "loss": 0.1014, "step": 365 }, { "epoch": 0.14153132250580047, "grad_norm": 0.6147151870697279, "learning_rate": 4.716494845360825e-06, "loss": 0.064, "step": 366 }, { "epoch": 0.14191802010827534, "grad_norm": 1.1303739423502879, "learning_rate": 4.729381443298969e-06, "loss": 0.0869, "step": 367 }, { "epoch": 0.1423047177107502, "grad_norm": 0.9180590756173936, "learning_rate": 4.742268041237113e-06, "loss": 0.0767, "step": 368 }, { "epoch": 0.14269141531322505, "grad_norm": 1.0835703522687157, "learning_rate": 4.755154639175258e-06, "loss": 0.1124, "step": 369 }, { "epoch": 0.1430781129156999, "grad_norm": 0.8725019984266548, "learning_rate": 4.768041237113403e-06, "loss": 0.091, "step": 370 }, { "epoch": 0.14346481051817478, "grad_norm": 0.9241353498829304, "learning_rate": 4.780927835051547e-06, "loss": 0.0717, "step": 371 }, { "epoch": 0.14385150812064965, "grad_norm": 1.66354365077923, "learning_rate": 4.7938144329896915e-06, "loss": 0.0783, "step": 372 }, { "epoch": 0.1442382057231245, "grad_norm": 0.8254964464422547, "learning_rate": 4.8067010309278354e-06, "loss": 0.0789, "step": 373 }, { "epoch": 0.14462490332559938, "grad_norm": 0.9852898111805476, "learning_rate": 4.81958762886598e-06, "loss": 0.1016, "step": 374 }, { "epoch": 0.14501160092807425, "grad_norm": 0.5838516663948649, "learning_rate": 4.832474226804124e-06, "loss": 0.1007, "step": 375 }, { "epoch": 0.1453982985305491, "grad_norm": 0.8378137673101768, "learning_rate": 4.845360824742268e-06, "loss": 0.0869, "step": 376 }, { "epoch": 0.14578499613302398, "grad_norm": 0.6588636827224352, "learning_rate": 4.858247422680413e-06, "loss": 0.0631, "step": 377 }, { "epoch": 0.14617169373549885, "grad_norm": 0.7316524474885403, "learning_rate": 4.871134020618557e-06, "loss": 0.101, "step": 378 }, { "epoch": 0.14655839133797371, "grad_norm": 0.5706429188695605, "learning_rate": 4.884020618556702e-06, "loss": 0.0598, "step": 379 }, { "epoch": 0.14694508894044858, "grad_norm": 0.8442292980381358, "learning_rate": 4.8969072164948455e-06, "loss": 0.1005, "step": 380 }, { "epoch": 0.14733178654292342, "grad_norm": 0.796363691617808, "learning_rate": 4.9097938144329895e-06, "loss": 0.066, "step": 381 }, { "epoch": 0.1477184841453983, "grad_norm": 0.7686221776760322, "learning_rate": 4.922680412371135e-06, "loss": 0.0734, "step": 382 }, { "epoch": 0.14810518174787315, "grad_norm": 0.7934368321766282, "learning_rate": 4.935567010309279e-06, "loss": 0.1077, "step": 383 }, { "epoch": 0.14849187935034802, "grad_norm": 0.6617841499122797, "learning_rate": 4.948453608247423e-06, "loss": 0.0934, "step": 384 }, { "epoch": 0.1488785769528229, "grad_norm": 0.6498662844181182, "learning_rate": 4.961340206185568e-06, "loss": 0.0963, "step": 385 }, { "epoch": 0.14926527455529776, "grad_norm": 1.3333511002869542, "learning_rate": 4.974226804123712e-06, "loss": 0.0911, "step": 386 }, { "epoch": 0.14965197215777262, "grad_norm": 0.5041620174347647, "learning_rate": 4.9871134020618565e-06, "loss": 0.0543, "step": 387 }, { "epoch": 0.1500386697602475, "grad_norm": 0.9406071652631274, "learning_rate": 5e-06, "loss": 0.0892, "step": 388 }, { "epoch": 0.15042536736272236, "grad_norm": 0.7274292447359951, "learning_rate": 5.012886597938144e-06, "loss": 0.0828, "step": 389 }, { "epoch": 0.15081206496519722, "grad_norm": 1.1367544493523007, "learning_rate": 5.025773195876289e-06, "loss": 0.1344, "step": 390 }, { "epoch": 0.1511987625676721, "grad_norm": 0.8306666903967171, "learning_rate": 5.038659793814433e-06, "loss": 0.1059, "step": 391 }, { "epoch": 0.15158546017014696, "grad_norm": 0.7480027957861988, "learning_rate": 5.051546391752578e-06, "loss": 0.0924, "step": 392 }, { "epoch": 0.1519721577726218, "grad_norm": 1.087562139469761, "learning_rate": 5.064432989690722e-06, "loss": 0.0961, "step": 393 }, { "epoch": 0.15235885537509666, "grad_norm": 0.7792816778029468, "learning_rate": 5.077319587628866e-06, "loss": 0.0837, "step": 394 }, { "epoch": 0.15274555297757153, "grad_norm": 0.7755145441161435, "learning_rate": 5.0902061855670105e-06, "loss": 0.0887, "step": 395 }, { "epoch": 0.1531322505800464, "grad_norm": 0.7573088617400257, "learning_rate": 5.1030927835051544e-06, "loss": 0.0854, "step": 396 }, { "epoch": 0.15351894818252126, "grad_norm": 0.5606921241659946, "learning_rate": 5.115979381443299e-06, "loss": 0.0759, "step": 397 }, { "epoch": 0.15390564578499613, "grad_norm": 0.6463285833194466, "learning_rate": 5.128865979381443e-06, "loss": 0.0744, "step": 398 }, { "epoch": 0.154292343387471, "grad_norm": 0.9053736510449214, "learning_rate": 5.141752577319588e-06, "loss": 0.114, "step": 399 }, { "epoch": 0.15467904098994587, "grad_norm": 0.7391308883111105, "learning_rate": 5.154639175257732e-06, "loss": 0.0848, "step": 400 }, { "epoch": 0.15506573859242073, "grad_norm": 0.7291580560367706, "learning_rate": 5.1675257731958775e-06, "loss": 0.1013, "step": 401 }, { "epoch": 0.1554524361948956, "grad_norm": 0.6324999735694116, "learning_rate": 5.1804123711340214e-06, "loss": 0.0699, "step": 402 }, { "epoch": 0.15583913379737047, "grad_norm": 1.2196618654196123, "learning_rate": 5.193298969072165e-06, "loss": 0.099, "step": 403 }, { "epoch": 0.15622583139984533, "grad_norm": 0.6488614396299249, "learning_rate": 5.20618556701031e-06, "loss": 0.0752, "step": 404 }, { "epoch": 0.15661252900232017, "grad_norm": 0.7376847058758096, "learning_rate": 5.219072164948454e-06, "loss": 0.0757, "step": 405 }, { "epoch": 0.15699922660479504, "grad_norm": 0.9152002637480194, "learning_rate": 5.231958762886599e-06, "loss": 0.1134, "step": 406 }, { "epoch": 0.1573859242072699, "grad_norm": 0.8050738641308446, "learning_rate": 5.244845360824743e-06, "loss": 0.0752, "step": 407 }, { "epoch": 0.15777262180974477, "grad_norm": 0.778892067231076, "learning_rate": 5.257731958762888e-06, "loss": 0.0623, "step": 408 }, { "epoch": 0.15815931941221964, "grad_norm": 0.6978232189867338, "learning_rate": 5.2706185567010315e-06, "loss": 0.0691, "step": 409 }, { "epoch": 0.1585460170146945, "grad_norm": 0.8978689102750096, "learning_rate": 5.2835051546391755e-06, "loss": 0.0964, "step": 410 }, { "epoch": 0.15893271461716937, "grad_norm": 0.7514977568074243, "learning_rate": 5.29639175257732e-06, "loss": 0.0862, "step": 411 }, { "epoch": 0.15931941221964424, "grad_norm": 0.7114624906639938, "learning_rate": 5.309278350515464e-06, "loss": 0.0813, "step": 412 }, { "epoch": 0.1597061098221191, "grad_norm": 0.7553463524751955, "learning_rate": 5.322164948453609e-06, "loss": 0.0726, "step": 413 }, { "epoch": 0.16009280742459397, "grad_norm": 0.9312039190011528, "learning_rate": 5.335051546391753e-06, "loss": 0.0811, "step": 414 }, { "epoch": 0.16047950502706884, "grad_norm": 0.5819125209287048, "learning_rate": 5.347938144329897e-06, "loss": 0.0689, "step": 415 }, { "epoch": 0.1608662026295437, "grad_norm": 0.9259885985008737, "learning_rate": 5.360824742268042e-06, "loss": 0.0987, "step": 416 }, { "epoch": 0.16125290023201855, "grad_norm": 0.7472158438791875, "learning_rate": 5.3737113402061856e-06, "loss": 0.079, "step": 417 }, { "epoch": 0.16163959783449341, "grad_norm": 0.7085333558991854, "learning_rate": 5.38659793814433e-06, "loss": 0.0884, "step": 418 }, { "epoch": 0.16202629543696828, "grad_norm": 1.0896910201173873, "learning_rate": 5.399484536082474e-06, "loss": 0.0815, "step": 419 }, { "epoch": 0.16241299303944315, "grad_norm": 0.7226761264297983, "learning_rate": 5.412371134020619e-06, "loss": 0.0767, "step": 420 }, { "epoch": 0.16279969064191802, "grad_norm": 0.8842387845297123, "learning_rate": 5.425257731958763e-06, "loss": 0.0742, "step": 421 }, { "epoch": 0.16318638824439288, "grad_norm": 0.9256745232288898, "learning_rate": 5.438144329896907e-06, "loss": 0.0776, "step": 422 }, { "epoch": 0.16357308584686775, "grad_norm": 0.6934275818904952, "learning_rate": 5.451030927835052e-06, "loss": 0.0616, "step": 423 }, { "epoch": 0.16395978344934262, "grad_norm": 0.833594639940566, "learning_rate": 5.463917525773196e-06, "loss": 0.0694, "step": 424 }, { "epoch": 0.16434648105181748, "grad_norm": 1.3792544798308772, "learning_rate": 5.476804123711341e-06, "loss": 0.1018, "step": 425 }, { "epoch": 0.16473317865429235, "grad_norm": 0.825963596089703, "learning_rate": 5.489690721649485e-06, "loss": 0.0765, "step": 426 }, { "epoch": 0.16511987625676722, "grad_norm": 0.8073874986997392, "learning_rate": 5.50257731958763e-06, "loss": 0.0824, "step": 427 }, { "epoch": 0.16550657385924208, "grad_norm": 0.7006305006157533, "learning_rate": 5.515463917525774e-06, "loss": 0.0697, "step": 428 }, { "epoch": 0.16589327146171692, "grad_norm": 0.8742274073656549, "learning_rate": 5.528350515463919e-06, "loss": 0.083, "step": 429 }, { "epoch": 0.1662799690641918, "grad_norm": 0.635617580095699, "learning_rate": 5.541237113402063e-06, "loss": 0.067, "step": 430 }, { "epoch": 0.16666666666666666, "grad_norm": 0.756615863501289, "learning_rate": 5.554123711340207e-06, "loss": 0.0699, "step": 431 }, { "epoch": 0.16705336426914152, "grad_norm": 0.7722347162772399, "learning_rate": 5.567010309278351e-06, "loss": 0.0748, "step": 432 }, { "epoch": 0.1674400618716164, "grad_norm": 0.8227978135996165, "learning_rate": 5.579896907216495e-06, "loss": 0.0898, "step": 433 }, { "epoch": 0.16782675947409126, "grad_norm": 0.9139666408684457, "learning_rate": 5.59278350515464e-06, "loss": 0.0931, "step": 434 }, { "epoch": 0.16821345707656613, "grad_norm": 0.6642226979881745, "learning_rate": 5.605670103092784e-06, "loss": 0.0854, "step": 435 }, { "epoch": 0.168600154679041, "grad_norm": 0.9691619066019612, "learning_rate": 5.618556701030928e-06, "loss": 0.0802, "step": 436 }, { "epoch": 0.16898685228151586, "grad_norm": 0.7833761869125645, "learning_rate": 5.631443298969073e-06, "loss": 0.0729, "step": 437 }, { "epoch": 0.16937354988399073, "grad_norm": 0.9407676498844555, "learning_rate": 5.644329896907217e-06, "loss": 0.1143, "step": 438 }, { "epoch": 0.1697602474864656, "grad_norm": 0.872820831097649, "learning_rate": 5.6572164948453615e-06, "loss": 0.0933, "step": 439 }, { "epoch": 0.17014694508894046, "grad_norm": 0.6870206370030209, "learning_rate": 5.670103092783505e-06, "loss": 0.0813, "step": 440 }, { "epoch": 0.17053364269141533, "grad_norm": 0.8328490992012477, "learning_rate": 5.682989690721649e-06, "loss": 0.1214, "step": 441 }, { "epoch": 0.17092034029389017, "grad_norm": 0.7626210650531695, "learning_rate": 5.695876288659794e-06, "loss": 0.071, "step": 442 }, { "epoch": 0.17130703789636503, "grad_norm": 0.8294467575371175, "learning_rate": 5.708762886597938e-06, "loss": 0.0792, "step": 443 }, { "epoch": 0.1716937354988399, "grad_norm": 0.5603449512767011, "learning_rate": 5.721649484536083e-06, "loss": 0.0642, "step": 444 }, { "epoch": 0.17208043310131477, "grad_norm": 1.0187743885166765, "learning_rate": 5.734536082474227e-06, "loss": 0.1342, "step": 445 }, { "epoch": 0.17246713070378963, "grad_norm": 0.5494180514953456, "learning_rate": 5.7474226804123716e-06, "loss": 0.0637, "step": 446 }, { "epoch": 0.1728538283062645, "grad_norm": 0.7332540806829057, "learning_rate": 5.7603092783505155e-06, "loss": 0.0824, "step": 447 }, { "epoch": 0.17324052590873937, "grad_norm": 0.7820108650962433, "learning_rate": 5.7731958762886594e-06, "loss": 0.1052, "step": 448 }, { "epoch": 0.17362722351121423, "grad_norm": 0.7095954288575764, "learning_rate": 5.786082474226805e-06, "loss": 0.0743, "step": 449 }, { "epoch": 0.1740139211136891, "grad_norm": 0.6510674627175873, "learning_rate": 5.79896907216495e-06, "loss": 0.0753, "step": 450 }, { "epoch": 0.17440061871616397, "grad_norm": 0.6412683209624107, "learning_rate": 5.811855670103094e-06, "loss": 0.0721, "step": 451 }, { "epoch": 0.17478731631863884, "grad_norm": 0.8705389544013832, "learning_rate": 5.824742268041238e-06, "loss": 0.078, "step": 452 }, { "epoch": 0.1751740139211137, "grad_norm": 0.6990057562639903, "learning_rate": 5.8376288659793825e-06, "loss": 0.0916, "step": 453 }, { "epoch": 0.17556071152358854, "grad_norm": 0.6920941549670482, "learning_rate": 5.8505154639175264e-06, "loss": 0.0885, "step": 454 }, { "epoch": 0.1759474091260634, "grad_norm": 0.5224524963576197, "learning_rate": 5.863402061855671e-06, "loss": 0.0628, "step": 455 }, { "epoch": 0.17633410672853828, "grad_norm": 0.5466820540649178, "learning_rate": 5.876288659793815e-06, "loss": 0.0654, "step": 456 }, { "epoch": 0.17672080433101314, "grad_norm": 0.6271972993982132, "learning_rate": 5.889175257731959e-06, "loss": 0.0596, "step": 457 }, { "epoch": 0.177107501933488, "grad_norm": 0.5859247678226751, "learning_rate": 5.902061855670104e-06, "loss": 0.0623, "step": 458 }, { "epoch": 0.17749419953596288, "grad_norm": 0.5303344592408451, "learning_rate": 5.914948453608248e-06, "loss": 0.0687, "step": 459 }, { "epoch": 0.17788089713843774, "grad_norm": 0.567665902058051, "learning_rate": 5.927835051546393e-06, "loss": 0.0759, "step": 460 }, { "epoch": 0.1782675947409126, "grad_norm": 0.5945798273187098, "learning_rate": 5.9407216494845365e-06, "loss": 0.0575, "step": 461 }, { "epoch": 0.17865429234338748, "grad_norm": 0.5705646137843546, "learning_rate": 5.9536082474226805e-06, "loss": 0.0559, "step": 462 }, { "epoch": 0.17904098994586234, "grad_norm": 0.6582474646995378, "learning_rate": 5.966494845360825e-06, "loss": 0.0872, "step": 463 }, { "epoch": 0.1794276875483372, "grad_norm": 0.7398134668814395, "learning_rate": 5.979381443298969e-06, "loss": 0.0962, "step": 464 }, { "epoch": 0.17981438515081208, "grad_norm": 0.7889456437835641, "learning_rate": 5.992268041237114e-06, "loss": 0.1037, "step": 465 }, { "epoch": 0.18020108275328692, "grad_norm": 1.6538944723302431, "learning_rate": 6.005154639175258e-06, "loss": 0.0841, "step": 466 }, { "epoch": 0.18058778035576178, "grad_norm": 0.5709928104242434, "learning_rate": 6.018041237113403e-06, "loss": 0.0764, "step": 467 }, { "epoch": 0.18097447795823665, "grad_norm": 0.6188076410916248, "learning_rate": 6.030927835051547e-06, "loss": 0.0583, "step": 468 }, { "epoch": 0.18136117556071152, "grad_norm": 0.692428015528814, "learning_rate": 6.0438144329896906e-06, "loss": 0.0851, "step": 469 }, { "epoch": 0.18174787316318639, "grad_norm": 0.6910227371851557, "learning_rate": 6.056701030927835e-06, "loss": 0.0624, "step": 470 }, { "epoch": 0.18213457076566125, "grad_norm": 0.7172004569753664, "learning_rate": 6.069587628865979e-06, "loss": 0.082, "step": 471 }, { "epoch": 0.18252126836813612, "grad_norm": 0.590213552993389, "learning_rate": 6.082474226804124e-06, "loss": 0.0799, "step": 472 }, { "epoch": 0.182907965970611, "grad_norm": 0.5586416278996121, "learning_rate": 6.095360824742269e-06, "loss": 0.0637, "step": 473 }, { "epoch": 0.18329466357308585, "grad_norm": 0.5848962758571585, "learning_rate": 6.108247422680414e-06, "loss": 0.0664, "step": 474 }, { "epoch": 0.18368136117556072, "grad_norm": 0.7346251411074433, "learning_rate": 6.1211340206185576e-06, "loss": 0.0726, "step": 475 }, { "epoch": 0.1840680587780356, "grad_norm": 0.7274066631589422, "learning_rate": 6.134020618556702e-06, "loss": 0.094, "step": 476 }, { "epoch": 0.18445475638051045, "grad_norm": 0.5978920424985926, "learning_rate": 6.146907216494846e-06, "loss": 0.1005, "step": 477 }, { "epoch": 0.1848414539829853, "grad_norm": 0.7905500895567459, "learning_rate": 6.15979381443299e-06, "loss": 0.0812, "step": 478 }, { "epoch": 0.18522815158546016, "grad_norm": 0.6180769807924281, "learning_rate": 6.172680412371135e-06, "loss": 0.0801, "step": 479 }, { "epoch": 0.18561484918793503, "grad_norm": 0.7148520599008757, "learning_rate": 6.185567010309279e-06, "loss": 0.0661, "step": 480 }, { "epoch": 0.1860015467904099, "grad_norm": 0.7757445888689807, "learning_rate": 6.198453608247424e-06, "loss": 0.0855, "step": 481 }, { "epoch": 0.18638824439288476, "grad_norm": 0.741783008211247, "learning_rate": 6.211340206185568e-06, "loss": 0.0663, "step": 482 }, { "epoch": 0.18677494199535963, "grad_norm": 1.020813882995714, "learning_rate": 6.224226804123712e-06, "loss": 0.0729, "step": 483 }, { "epoch": 0.1871616395978345, "grad_norm": 0.7583543275139447, "learning_rate": 6.237113402061856e-06, "loss": 0.0667, "step": 484 }, { "epoch": 0.18754833720030936, "grad_norm": 0.5163406064449441, "learning_rate": 6.25e-06, "loss": 0.0554, "step": 485 }, { "epoch": 0.18793503480278423, "grad_norm": 0.6784337516355998, "learning_rate": 6.262886597938145e-06, "loss": 0.0752, "step": 486 }, { "epoch": 0.1883217324052591, "grad_norm": 0.6371913570323648, "learning_rate": 6.275773195876289e-06, "loss": 0.0782, "step": 487 }, { "epoch": 0.18870843000773396, "grad_norm": 0.6494278299333583, "learning_rate": 6.288659793814433e-06, "loss": 0.0593, "step": 488 }, { "epoch": 0.18909512761020883, "grad_norm": 0.8541253335366732, "learning_rate": 6.301546391752578e-06, "loss": 0.0759, "step": 489 }, { "epoch": 0.18948182521268367, "grad_norm": 0.7680521663723728, "learning_rate": 6.314432989690722e-06, "loss": 0.0771, "step": 490 }, { "epoch": 0.18986852281515854, "grad_norm": 0.685183884053568, "learning_rate": 6.3273195876288665e-06, "loss": 0.0819, "step": 491 }, { "epoch": 0.1902552204176334, "grad_norm": 0.6009007509847308, "learning_rate": 6.34020618556701e-06, "loss": 0.0732, "step": 492 }, { "epoch": 0.19064191802010827, "grad_norm": 0.6641261660285925, "learning_rate": 6.353092783505155e-06, "loss": 0.0937, "step": 493 }, { "epoch": 0.19102861562258314, "grad_norm": 0.7438492951465095, "learning_rate": 6.365979381443299e-06, "loss": 0.0827, "step": 494 }, { "epoch": 0.191415313225058, "grad_norm": 0.6262818953207294, "learning_rate": 6.378865979381443e-06, "loss": 0.0675, "step": 495 }, { "epoch": 0.19180201082753287, "grad_norm": 0.6495568058307297, "learning_rate": 6.391752577319588e-06, "loss": 0.0891, "step": 496 }, { "epoch": 0.19218870843000774, "grad_norm": 0.6318859384507892, "learning_rate": 6.404639175257732e-06, "loss": 0.0761, "step": 497 }, { "epoch": 0.1925754060324826, "grad_norm": 0.6636927113238602, "learning_rate": 6.417525773195877e-06, "loss": 0.0698, "step": 498 }, { "epoch": 0.19296210363495747, "grad_norm": 0.8570171302575307, "learning_rate": 6.430412371134021e-06, "loss": 0.0754, "step": 499 }, { "epoch": 0.19334880123743234, "grad_norm": 0.5392521481814903, "learning_rate": 6.443298969072166e-06, "loss": 0.0674, "step": 500 }, { "epoch": 0.1937354988399072, "grad_norm": 1.2944963522490862, "learning_rate": 6.45618556701031e-06, "loss": 0.0887, "step": 501 }, { "epoch": 0.19412219644238204, "grad_norm": 0.6178290028905495, "learning_rate": 6.469072164948455e-06, "loss": 0.0706, "step": 502 }, { "epoch": 0.1945088940448569, "grad_norm": 0.5308017338532148, "learning_rate": 6.481958762886599e-06, "loss": 0.0829, "step": 503 }, { "epoch": 0.19489559164733178, "grad_norm": 0.8383946703065455, "learning_rate": 6.494845360824743e-06, "loss": 0.0882, "step": 504 }, { "epoch": 0.19528228924980665, "grad_norm": 0.7105186721427094, "learning_rate": 6.5077319587628875e-06, "loss": 0.0684, "step": 505 }, { "epoch": 0.1956689868522815, "grad_norm": 0.6388662051855111, "learning_rate": 6.520618556701031e-06, "loss": 0.0752, "step": 506 }, { "epoch": 0.19605568445475638, "grad_norm": 0.65798038315927, "learning_rate": 6.533505154639176e-06, "loss": 0.0693, "step": 507 }, { "epoch": 0.19644238205723125, "grad_norm": 0.8936240557377984, "learning_rate": 6.54639175257732e-06, "loss": 0.0998, "step": 508 }, { "epoch": 0.1968290796597061, "grad_norm": 0.6354917282223251, "learning_rate": 6.559278350515464e-06, "loss": 0.083, "step": 509 }, { "epoch": 0.19721577726218098, "grad_norm": 0.5685165901773499, "learning_rate": 6.572164948453609e-06, "loss": 0.063, "step": 510 }, { "epoch": 0.19760247486465585, "grad_norm": 0.7111259887300069, "learning_rate": 6.585051546391753e-06, "loss": 0.0761, "step": 511 }, { "epoch": 0.19798917246713071, "grad_norm": 0.953885617216303, "learning_rate": 6.597938144329898e-06, "loss": 0.0921, "step": 512 }, { "epoch": 0.19837587006960558, "grad_norm": 0.5855955519615186, "learning_rate": 6.6108247422680415e-06, "loss": 0.074, "step": 513 }, { "epoch": 0.19876256767208042, "grad_norm": 0.7013788431961574, "learning_rate": 6.623711340206186e-06, "loss": 0.0843, "step": 514 }, { "epoch": 0.1991492652745553, "grad_norm": 0.717003663189454, "learning_rate": 6.63659793814433e-06, "loss": 0.0694, "step": 515 }, { "epoch": 0.19953596287703015, "grad_norm": 0.8126382967445018, "learning_rate": 6.649484536082474e-06, "loss": 0.0871, "step": 516 }, { "epoch": 0.19992266047950502, "grad_norm": 0.4626968234005786, "learning_rate": 6.662371134020619e-06, "loss": 0.067, "step": 517 }, { "epoch": 0.2003093580819799, "grad_norm": 0.5709042848335617, "learning_rate": 6.675257731958763e-06, "loss": 0.0645, "step": 518 }, { "epoch": 0.20069605568445475, "grad_norm": 0.5014105904437585, "learning_rate": 6.688144329896908e-06, "loss": 0.0641, "step": 519 }, { "epoch": 0.20108275328692962, "grad_norm": 0.8205626522114214, "learning_rate": 6.701030927835052e-06, "loss": 0.0964, "step": 520 }, { "epoch": 0.2014694508894045, "grad_norm": 0.753126084952189, "learning_rate": 6.7139175257731955e-06, "loss": 0.0897, "step": 521 }, { "epoch": 0.20185614849187936, "grad_norm": 0.6859679423775203, "learning_rate": 6.726804123711341e-06, "loss": 0.0616, "step": 522 }, { "epoch": 0.20224284609435422, "grad_norm": 0.5792242337347234, "learning_rate": 6.739690721649486e-06, "loss": 0.0741, "step": 523 }, { "epoch": 0.2026295436968291, "grad_norm": 0.8494773607148065, "learning_rate": 6.75257731958763e-06, "loss": 0.0794, "step": 524 }, { "epoch": 0.20301624129930396, "grad_norm": 0.7299951990982172, "learning_rate": 6.765463917525774e-06, "loss": 0.069, "step": 525 }, { "epoch": 0.2034029389017788, "grad_norm": 0.6042657531899733, "learning_rate": 6.778350515463919e-06, "loss": 0.0659, "step": 526 }, { "epoch": 0.20378963650425366, "grad_norm": 0.533578012989336, "learning_rate": 6.7912371134020625e-06, "loss": 0.0598, "step": 527 }, { "epoch": 0.20417633410672853, "grad_norm": 0.555862876772962, "learning_rate": 6.804123711340207e-06, "loss": 0.0602, "step": 528 }, { "epoch": 0.2045630317092034, "grad_norm": 0.6974053521649494, "learning_rate": 6.817010309278351e-06, "loss": 0.0754, "step": 529 }, { "epoch": 0.20494972931167826, "grad_norm": 1.2128414861379981, "learning_rate": 6.829896907216495e-06, "loss": 0.0761, "step": 530 }, { "epoch": 0.20533642691415313, "grad_norm": 0.6121581017506389, "learning_rate": 6.84278350515464e-06, "loss": 0.0858, "step": 531 }, { "epoch": 0.205723124516628, "grad_norm": 1.0469965480660661, "learning_rate": 6.855670103092784e-06, "loss": 0.1296, "step": 532 }, { "epoch": 0.20610982211910286, "grad_norm": 0.7477270273746296, "learning_rate": 6.868556701030929e-06, "loss": 0.1008, "step": 533 }, { "epoch": 0.20649651972157773, "grad_norm": 0.6639767971320859, "learning_rate": 6.881443298969073e-06, "loss": 0.076, "step": 534 }, { "epoch": 0.2068832173240526, "grad_norm": 0.6833182061783034, "learning_rate": 6.8943298969072166e-06, "loss": 0.068, "step": 535 }, { "epoch": 0.20726991492652747, "grad_norm": 0.6289703866120361, "learning_rate": 6.907216494845361e-06, "loss": 0.0644, "step": 536 }, { "epoch": 0.20765661252900233, "grad_norm": 0.6573685951313722, "learning_rate": 6.920103092783505e-06, "loss": 0.0817, "step": 537 }, { "epoch": 0.20804331013147717, "grad_norm": 0.7073100586159449, "learning_rate": 6.93298969072165e-06, "loss": 0.09, "step": 538 }, { "epoch": 0.20843000773395204, "grad_norm": 0.781602568659379, "learning_rate": 6.945876288659794e-06, "loss": 0.0995, "step": 539 }, { "epoch": 0.2088167053364269, "grad_norm": 0.6995821803832646, "learning_rate": 6.958762886597939e-06, "loss": 0.0762, "step": 540 }, { "epoch": 0.20920340293890177, "grad_norm": 0.6993573850847832, "learning_rate": 6.971649484536083e-06, "loss": 0.0742, "step": 541 }, { "epoch": 0.20959010054137664, "grad_norm": 1.1056865183223605, "learning_rate": 6.984536082474227e-06, "loss": 0.0868, "step": 542 }, { "epoch": 0.2099767981438515, "grad_norm": 0.6580300452948826, "learning_rate": 6.9974226804123714e-06, "loss": 0.0739, "step": 543 }, { "epoch": 0.21036349574632637, "grad_norm": 0.9284191073317334, "learning_rate": 7.010309278350515e-06, "loss": 0.0803, "step": 544 }, { "epoch": 0.21075019334880124, "grad_norm": 0.5119377879108816, "learning_rate": 7.02319587628866e-06, "loss": 0.0611, "step": 545 }, { "epoch": 0.2111368909512761, "grad_norm": 0.4803042669223074, "learning_rate": 7.036082474226805e-06, "loss": 0.0486, "step": 546 }, { "epoch": 0.21152358855375097, "grad_norm": 0.6515786977490622, "learning_rate": 7.04896907216495e-06, "loss": 0.0684, "step": 547 }, { "epoch": 0.21191028615622584, "grad_norm": 0.7623795707972331, "learning_rate": 7.061855670103094e-06, "loss": 0.0825, "step": 548 }, { "epoch": 0.2122969837587007, "grad_norm": 0.525217348400502, "learning_rate": 7.0747422680412384e-06, "loss": 0.0637, "step": 549 }, { "epoch": 0.21268368136117555, "grad_norm": 1.6485264994633673, "learning_rate": 7.087628865979382e-06, "loss": 0.1036, "step": 550 }, { "epoch": 0.21307037896365041, "grad_norm": 0.5718812324179546, "learning_rate": 7.100515463917526e-06, "loss": 0.0881, "step": 551 }, { "epoch": 0.21345707656612528, "grad_norm": 0.9746251892383238, "learning_rate": 7.113402061855671e-06, "loss": 0.0929, "step": 552 }, { "epoch": 0.21384377416860015, "grad_norm": 0.4921445442756644, "learning_rate": 7.126288659793815e-06, "loss": 0.0627, "step": 553 }, { "epoch": 0.21423047177107502, "grad_norm": 0.6284209334655265, "learning_rate": 7.13917525773196e-06, "loss": 0.055, "step": 554 }, { "epoch": 0.21461716937354988, "grad_norm": 0.658777794589967, "learning_rate": 7.152061855670104e-06, "loss": 0.0642, "step": 555 }, { "epoch": 0.21500386697602475, "grad_norm": 0.6392351313527719, "learning_rate": 7.164948453608248e-06, "loss": 0.0541, "step": 556 }, { "epoch": 0.21539056457849962, "grad_norm": 0.6371951606332418, "learning_rate": 7.1778350515463925e-06, "loss": 0.0728, "step": 557 }, { "epoch": 0.21577726218097448, "grad_norm": 0.5259704310222735, "learning_rate": 7.190721649484536e-06, "loss": 0.0559, "step": 558 }, { "epoch": 0.21616395978344935, "grad_norm": 0.7689004904054364, "learning_rate": 7.203608247422681e-06, "loss": 0.0948, "step": 559 }, { "epoch": 0.21655065738592422, "grad_norm": 0.7407659340781657, "learning_rate": 7.216494845360825e-06, "loss": 0.0812, "step": 560 }, { "epoch": 0.21693735498839908, "grad_norm": 0.8316271342446914, "learning_rate": 7.22938144329897e-06, "loss": 0.0886, "step": 561 }, { "epoch": 0.21732405259087392, "grad_norm": 1.595743795715186, "learning_rate": 7.242268041237114e-06, "loss": 0.0756, "step": 562 }, { "epoch": 0.2177107501933488, "grad_norm": 0.7623516868801284, "learning_rate": 7.255154639175258e-06, "loss": 0.0781, "step": 563 }, { "epoch": 0.21809744779582366, "grad_norm": 1.0246250389151077, "learning_rate": 7.2680412371134026e-06, "loss": 0.0739, "step": 564 }, { "epoch": 0.21848414539829852, "grad_norm": 0.8505316554017638, "learning_rate": 7.2809278350515465e-06, "loss": 0.0814, "step": 565 }, { "epoch": 0.2188708430007734, "grad_norm": 0.7221684142191289, "learning_rate": 7.293814432989691e-06, "loss": 0.0656, "step": 566 }, { "epoch": 0.21925754060324826, "grad_norm": 0.8478364787061523, "learning_rate": 7.306701030927835e-06, "loss": 0.0729, "step": 567 }, { "epoch": 0.21964423820572312, "grad_norm": 0.7226308401900639, "learning_rate": 7.319587628865979e-06, "loss": 0.0693, "step": 568 }, { "epoch": 0.220030935808198, "grad_norm": 0.6106450384122375, "learning_rate": 7.332474226804124e-06, "loss": 0.0526, "step": 569 }, { "epoch": 0.22041763341067286, "grad_norm": 0.7810111782764099, "learning_rate": 7.3453608247422696e-06, "loss": 0.0629, "step": 570 }, { "epoch": 0.22080433101314773, "grad_norm": 0.5510391902869886, "learning_rate": 7.3582474226804135e-06, "loss": 0.0597, "step": 571 }, { "epoch": 0.2211910286156226, "grad_norm": 0.7074163764411348, "learning_rate": 7.3711340206185574e-06, "loss": 0.085, "step": 572 }, { "epoch": 0.22157772621809746, "grad_norm": 0.8583808514902358, "learning_rate": 7.384020618556702e-06, "loss": 0.0913, "step": 573 }, { "epoch": 0.2219644238205723, "grad_norm": 1.0327063229834392, "learning_rate": 7.396907216494846e-06, "loss": 0.0954, "step": 574 }, { "epoch": 0.22235112142304717, "grad_norm": 0.588835604493498, "learning_rate": 7.409793814432991e-06, "loss": 0.0799, "step": 575 }, { "epoch": 0.22273781902552203, "grad_norm": 0.6094677700530767, "learning_rate": 7.422680412371135e-06, "loss": 0.0575, "step": 576 }, { "epoch": 0.2231245166279969, "grad_norm": 0.6777414701243908, "learning_rate": 7.435567010309279e-06, "loss": 0.0713, "step": 577 }, { "epoch": 0.22351121423047177, "grad_norm": 0.7398155500332643, "learning_rate": 7.448453608247424e-06, "loss": 0.0903, "step": 578 }, { "epoch": 0.22389791183294663, "grad_norm": 0.694637319104862, "learning_rate": 7.4613402061855675e-06, "loss": 0.0817, "step": 579 }, { "epoch": 0.2242846094354215, "grad_norm": 0.5003163197046672, "learning_rate": 7.474226804123712e-06, "loss": 0.0603, "step": 580 }, { "epoch": 0.22467130703789637, "grad_norm": 0.6215805859120587, "learning_rate": 7.487113402061856e-06, "loss": 0.072, "step": 581 }, { "epoch": 0.22505800464037123, "grad_norm": 0.877494618336339, "learning_rate": 7.500000000000001e-06, "loss": 0.0817, "step": 582 }, { "epoch": 0.2254447022428461, "grad_norm": 0.5783661218107835, "learning_rate": 7.512886597938145e-06, "loss": 0.0672, "step": 583 }, { "epoch": 0.22583139984532097, "grad_norm": 0.559134470214501, "learning_rate": 7.525773195876289e-06, "loss": 0.0802, "step": 584 }, { "epoch": 0.22621809744779584, "grad_norm": 0.7363643660358357, "learning_rate": 7.538659793814434e-06, "loss": 0.0683, "step": 585 }, { "epoch": 0.2266047950502707, "grad_norm": 0.6263036781964506, "learning_rate": 7.551546391752578e-06, "loss": 0.0671, "step": 586 }, { "epoch": 0.22699149265274554, "grad_norm": 0.5383499736377071, "learning_rate": 7.564432989690722e-06, "loss": 0.0562, "step": 587 }, { "epoch": 0.2273781902552204, "grad_norm": 0.5269120552197847, "learning_rate": 7.577319587628866e-06, "loss": 0.0601, "step": 588 }, { "epoch": 0.22776488785769528, "grad_norm": 0.6258001426545546, "learning_rate": 7.59020618556701e-06, "loss": 0.0609, "step": 589 }, { "epoch": 0.22815158546017014, "grad_norm": 0.6077418427605358, "learning_rate": 7.603092783505155e-06, "loss": 0.0856, "step": 590 }, { "epoch": 0.228538283062645, "grad_norm": 0.5267966024866764, "learning_rate": 7.615979381443299e-06, "loss": 0.0641, "step": 591 }, { "epoch": 0.22892498066511988, "grad_norm": 0.8271390673728625, "learning_rate": 7.628865979381444e-06, "loss": 0.0897, "step": 592 }, { "epoch": 0.22931167826759474, "grad_norm": 0.45063759381508117, "learning_rate": 7.641752577319589e-06, "loss": 0.062, "step": 593 }, { "epoch": 0.2296983758700696, "grad_norm": 0.7408765071383501, "learning_rate": 7.654639175257732e-06, "loss": 0.0711, "step": 594 }, { "epoch": 0.23008507347254448, "grad_norm": 0.5797552019883465, "learning_rate": 7.667525773195878e-06, "loss": 0.0665, "step": 595 }, { "epoch": 0.23047177107501934, "grad_norm": 0.550443312457893, "learning_rate": 7.680412371134021e-06, "loss": 0.0729, "step": 596 }, { "epoch": 0.2308584686774942, "grad_norm": 0.5206326179874262, "learning_rate": 7.693298969072166e-06, "loss": 0.0514, "step": 597 }, { "epoch": 0.23124516627996908, "grad_norm": 1.1409372893436593, "learning_rate": 7.70618556701031e-06, "loss": 0.0781, "step": 598 }, { "epoch": 0.23163186388244392, "grad_norm": 0.6617805836591524, "learning_rate": 7.719072164948454e-06, "loss": 0.0638, "step": 599 }, { "epoch": 0.23201856148491878, "grad_norm": 1.057403113588234, "learning_rate": 7.731958762886599e-06, "loss": 0.0746, "step": 600 }, { "epoch": 0.23240525908739365, "grad_norm": 0.6727837227484215, "learning_rate": 7.744845360824743e-06, "loss": 0.0876, "step": 601 }, { "epoch": 0.23279195668986852, "grad_norm": 0.7214981361084651, "learning_rate": 7.757731958762888e-06, "loss": 0.0579, "step": 602 }, { "epoch": 0.23317865429234338, "grad_norm": 0.6502752670696409, "learning_rate": 7.770618556701031e-06, "loss": 0.074, "step": 603 }, { "epoch": 0.23356535189481825, "grad_norm": 0.535066096601764, "learning_rate": 7.783505154639176e-06, "loss": 0.0578, "step": 604 }, { "epoch": 0.23395204949729312, "grad_norm": 0.9683367519776378, "learning_rate": 7.796391752577321e-06, "loss": 0.0711, "step": 605 }, { "epoch": 0.23433874709976799, "grad_norm": 0.7171531666049156, "learning_rate": 7.809278350515464e-06, "loss": 0.0734, "step": 606 }, { "epoch": 0.23472544470224285, "grad_norm": 0.6460893126255546, "learning_rate": 7.822164948453609e-06, "loss": 0.0546, "step": 607 }, { "epoch": 0.23511214230471772, "grad_norm": 0.7928987008235284, "learning_rate": 7.835051546391754e-06, "loss": 0.089, "step": 608 }, { "epoch": 0.2354988399071926, "grad_norm": 0.6706601670855703, "learning_rate": 7.847938144329897e-06, "loss": 0.06, "step": 609 }, { "epoch": 0.23588553750966745, "grad_norm": 0.47825236411709815, "learning_rate": 7.860824742268041e-06, "loss": 0.0673, "step": 610 }, { "epoch": 0.2362722351121423, "grad_norm": 0.553117582217593, "learning_rate": 7.873711340206186e-06, "loss": 0.0637, "step": 611 }, { "epoch": 0.23665893271461716, "grad_norm": 0.9412496099828036, "learning_rate": 7.886597938144331e-06, "loss": 0.0916, "step": 612 }, { "epoch": 0.23704563031709203, "grad_norm": 0.7771023952010917, "learning_rate": 7.899484536082474e-06, "loss": 0.0936, "step": 613 }, { "epoch": 0.2374323279195669, "grad_norm": 0.40424026975263544, "learning_rate": 7.912371134020619e-06, "loss": 0.0535, "step": 614 }, { "epoch": 0.23781902552204176, "grad_norm": 0.8724163245949341, "learning_rate": 7.925257731958764e-06, "loss": 0.0588, "step": 615 }, { "epoch": 0.23820572312451663, "grad_norm": 0.8300177495746421, "learning_rate": 7.938144329896907e-06, "loss": 0.0747, "step": 616 }, { "epoch": 0.2385924207269915, "grad_norm": 0.8898383393294954, "learning_rate": 7.951030927835051e-06, "loss": 0.0767, "step": 617 }, { "epoch": 0.23897911832946636, "grad_norm": 0.6501021668513073, "learning_rate": 7.963917525773196e-06, "loss": 0.0758, "step": 618 }, { "epoch": 0.23936581593194123, "grad_norm": 0.9907866600081798, "learning_rate": 7.976804123711341e-06, "loss": 0.1039, "step": 619 }, { "epoch": 0.2397525135344161, "grad_norm": 0.608041778385665, "learning_rate": 7.989690721649486e-06, "loss": 0.0745, "step": 620 }, { "epoch": 0.24013921113689096, "grad_norm": 0.8416743441682312, "learning_rate": 8.00257731958763e-06, "loss": 0.071, "step": 621 }, { "epoch": 0.24052590873936583, "grad_norm": 0.5060468668755908, "learning_rate": 8.015463917525774e-06, "loss": 0.0565, "step": 622 }, { "epoch": 0.24091260634184067, "grad_norm": 0.6105711971781802, "learning_rate": 8.028350515463918e-06, "loss": 0.0754, "step": 623 }, { "epoch": 0.24129930394431554, "grad_norm": 0.7793106574679652, "learning_rate": 8.041237113402063e-06, "loss": 0.1002, "step": 624 }, { "epoch": 0.2416860015467904, "grad_norm": 0.6140950654710874, "learning_rate": 8.054123711340206e-06, "loss": 0.0871, "step": 625 }, { "epoch": 0.24207269914926527, "grad_norm": 0.5309517589003278, "learning_rate": 8.067010309278351e-06, "loss": 0.0551, "step": 626 }, { "epoch": 0.24245939675174014, "grad_norm": 1.0678882985093543, "learning_rate": 8.079896907216496e-06, "loss": 0.062, "step": 627 }, { "epoch": 0.242846094354215, "grad_norm": 0.6109737035682361, "learning_rate": 8.09278350515464e-06, "loss": 0.0553, "step": 628 }, { "epoch": 0.24323279195668987, "grad_norm": 0.7016965321894286, "learning_rate": 8.105670103092784e-06, "loss": 0.0732, "step": 629 }, { "epoch": 0.24361948955916474, "grad_norm": 0.6226468700578907, "learning_rate": 8.118556701030929e-06, "loss": 0.0714, "step": 630 }, { "epoch": 0.2440061871616396, "grad_norm": 0.8349938450874137, "learning_rate": 8.131443298969073e-06, "loss": 0.078, "step": 631 }, { "epoch": 0.24439288476411447, "grad_norm": 0.5076922681531024, "learning_rate": 8.144329896907216e-06, "loss": 0.0712, "step": 632 }, { "epoch": 0.24477958236658934, "grad_norm": 0.7144444113469366, "learning_rate": 8.157216494845361e-06, "loss": 0.0913, "step": 633 }, { "epoch": 0.2451662799690642, "grad_norm": 0.7863425943508017, "learning_rate": 8.170103092783506e-06, "loss": 0.1094, "step": 634 }, { "epoch": 0.24555297757153904, "grad_norm": 0.493788666701036, "learning_rate": 8.182989690721649e-06, "loss": 0.0601, "step": 635 }, { "epoch": 0.2459396751740139, "grad_norm": 0.5895641994113247, "learning_rate": 8.195876288659794e-06, "loss": 0.058, "step": 636 }, { "epoch": 0.24632637277648878, "grad_norm": 0.6991258515685677, "learning_rate": 8.208762886597939e-06, "loss": 0.0742, "step": 637 }, { "epoch": 0.24671307037896364, "grad_norm": 0.6723550392137057, "learning_rate": 8.221649484536083e-06, "loss": 0.0637, "step": 638 }, { "epoch": 0.2470997679814385, "grad_norm": 1.0862233204007954, "learning_rate": 8.234536082474227e-06, "loss": 0.0763, "step": 639 }, { "epoch": 0.24748646558391338, "grad_norm": 0.6083856369441467, "learning_rate": 8.247422680412371e-06, "loss": 0.0619, "step": 640 }, { "epoch": 0.24787316318638825, "grad_norm": 0.695930150881801, "learning_rate": 8.260309278350516e-06, "loss": 0.0714, "step": 641 }, { "epoch": 0.2482598607888631, "grad_norm": 0.7184347523592379, "learning_rate": 8.27319587628866e-06, "loss": 0.1023, "step": 642 }, { "epoch": 0.24864655839133798, "grad_norm": 0.548959815310898, "learning_rate": 8.286082474226806e-06, "loss": 0.0571, "step": 643 }, { "epoch": 0.24903325599381285, "grad_norm": 0.45550945454537556, "learning_rate": 8.29896907216495e-06, "loss": 0.0507, "step": 644 }, { "epoch": 0.2494199535962877, "grad_norm": 0.4445257390321096, "learning_rate": 8.311855670103094e-06, "loss": 0.0509, "step": 645 }, { "epoch": 0.24980665119876258, "grad_norm": 0.4313899074763436, "learning_rate": 8.324742268041238e-06, "loss": 0.0619, "step": 646 }, { "epoch": 0.25019334880123745, "grad_norm": 0.6327856684841737, "learning_rate": 8.337628865979383e-06, "loss": 0.0709, "step": 647 }, { "epoch": 0.2505800464037123, "grad_norm": 0.7227029163902324, "learning_rate": 8.350515463917526e-06, "loss": 0.0759, "step": 648 }, { "epoch": 0.2509667440061872, "grad_norm": 0.7268488410176571, "learning_rate": 8.363402061855671e-06, "loss": 0.0626, "step": 649 }, { "epoch": 0.251353441608662, "grad_norm": 0.6314131819586887, "learning_rate": 8.376288659793816e-06, "loss": 0.0947, "step": 650 }, { "epoch": 0.2517401392111369, "grad_norm": 0.7459177825270831, "learning_rate": 8.389175257731959e-06, "loss": 0.0896, "step": 651 }, { "epoch": 0.25212683681361175, "grad_norm": 0.6344310435541886, "learning_rate": 8.402061855670104e-06, "loss": 0.0628, "step": 652 }, { "epoch": 0.2525135344160866, "grad_norm": 0.6764770749041525, "learning_rate": 8.414948453608248e-06, "loss": 0.0641, "step": 653 }, { "epoch": 0.2529002320185615, "grad_norm": 0.5466497085568601, "learning_rate": 8.427835051546393e-06, "loss": 0.0544, "step": 654 }, { "epoch": 0.2532869296210363, "grad_norm": 0.7014855878046217, "learning_rate": 8.440721649484536e-06, "loss": 0.0714, "step": 655 }, { "epoch": 0.2536736272235112, "grad_norm": 0.499195204514521, "learning_rate": 8.453608247422681e-06, "loss": 0.0577, "step": 656 }, { "epoch": 0.25406032482598606, "grad_norm": 0.5290224676339346, "learning_rate": 8.466494845360826e-06, "loss": 0.0618, "step": 657 }, { "epoch": 0.25444702242846096, "grad_norm": 0.7345682560665986, "learning_rate": 8.479381443298969e-06, "loss": 0.0746, "step": 658 }, { "epoch": 0.2548337200309358, "grad_norm": 0.6230210416960449, "learning_rate": 8.492268041237114e-06, "loss": 0.0986, "step": 659 }, { "epoch": 0.2552204176334107, "grad_norm": 0.4263319883858693, "learning_rate": 8.505154639175259e-06, "loss": 0.0532, "step": 660 }, { "epoch": 0.25560711523588553, "grad_norm": 0.527994458296436, "learning_rate": 8.518041237113403e-06, "loss": 0.0603, "step": 661 }, { "epoch": 0.2559938128383604, "grad_norm": 0.7735128040397784, "learning_rate": 8.530927835051546e-06, "loss": 0.0732, "step": 662 }, { "epoch": 0.25638051044083526, "grad_norm": 0.7556644213060477, "learning_rate": 8.543814432989691e-06, "loss": 0.0867, "step": 663 }, { "epoch": 0.25676720804331016, "grad_norm": 0.5217950818353502, "learning_rate": 8.556701030927836e-06, "loss": 0.0708, "step": 664 }, { "epoch": 0.257153905645785, "grad_norm": 1.027420380955457, "learning_rate": 8.569587628865979e-06, "loss": 0.0799, "step": 665 }, { "epoch": 0.25754060324825984, "grad_norm": 0.5157394923759699, "learning_rate": 8.582474226804124e-06, "loss": 0.0594, "step": 666 }, { "epoch": 0.25792730085073473, "grad_norm": 0.8490726319287246, "learning_rate": 8.595360824742269e-06, "loss": 0.082, "step": 667 }, { "epoch": 0.25831399845320957, "grad_norm": 0.5406298749035567, "learning_rate": 8.608247422680413e-06, "loss": 0.0827, "step": 668 }, { "epoch": 0.25870069605568446, "grad_norm": 0.7979854018658817, "learning_rate": 8.621134020618558e-06, "loss": 0.0757, "step": 669 }, { "epoch": 0.2590873936581593, "grad_norm": 0.44250649734033165, "learning_rate": 8.634020618556703e-06, "loss": 0.0509, "step": 670 }, { "epoch": 0.2594740912606342, "grad_norm": 0.9486991310538139, "learning_rate": 8.646907216494846e-06, "loss": 0.1038, "step": 671 }, { "epoch": 0.25986078886310904, "grad_norm": 0.6672183229854332, "learning_rate": 8.65979381443299e-06, "loss": 0.0726, "step": 672 }, { "epoch": 0.26024748646558393, "grad_norm": 0.5016682405365674, "learning_rate": 8.672680412371136e-06, "loss": 0.0633, "step": 673 }, { "epoch": 0.26063418406805877, "grad_norm": 0.5957690719963589, "learning_rate": 8.685567010309279e-06, "loss": 0.0593, "step": 674 }, { "epoch": 0.26102088167053367, "grad_norm": 0.7808195935681645, "learning_rate": 8.698453608247423e-06, "loss": 0.08, "step": 675 }, { "epoch": 0.2614075792730085, "grad_norm": 0.7263396386908969, "learning_rate": 8.711340206185568e-06, "loss": 0.0729, "step": 676 }, { "epoch": 0.26179427687548334, "grad_norm": 0.6101383705492915, "learning_rate": 8.724226804123711e-06, "loss": 0.0706, "step": 677 }, { "epoch": 0.26218097447795824, "grad_norm": 0.7955696817737804, "learning_rate": 8.737113402061856e-06, "loss": 0.0877, "step": 678 }, { "epoch": 0.2625676720804331, "grad_norm": 0.6407673598016912, "learning_rate": 8.750000000000001e-06, "loss": 0.0679, "step": 679 }, { "epoch": 0.262954369682908, "grad_norm": 0.7091986650458306, "learning_rate": 8.762886597938146e-06, "loss": 0.0799, "step": 680 }, { "epoch": 0.2633410672853828, "grad_norm": 0.7727865315922753, "learning_rate": 8.775773195876289e-06, "loss": 0.0777, "step": 681 }, { "epoch": 0.2637277648878577, "grad_norm": 0.6260266198920548, "learning_rate": 8.788659793814434e-06, "loss": 0.0719, "step": 682 }, { "epoch": 0.26411446249033255, "grad_norm": 0.6200448690811576, "learning_rate": 8.801546391752578e-06, "loss": 0.064, "step": 683 }, { "epoch": 0.26450116009280744, "grad_norm": 0.7401564713616687, "learning_rate": 8.814432989690721e-06, "loss": 0.0625, "step": 684 }, { "epoch": 0.2648878576952823, "grad_norm": 0.722132986179434, "learning_rate": 8.827319587628866e-06, "loss": 0.0726, "step": 685 }, { "epoch": 0.2652745552977572, "grad_norm": 1.0687473745111635, "learning_rate": 8.840206185567011e-06, "loss": 0.0962, "step": 686 }, { "epoch": 0.265661252900232, "grad_norm": 0.6179070396135158, "learning_rate": 8.853092783505156e-06, "loss": 0.0694, "step": 687 }, { "epoch": 0.2660479505027069, "grad_norm": 0.6796417761656101, "learning_rate": 8.865979381443299e-06, "loss": 0.0691, "step": 688 }, { "epoch": 0.26643464810518175, "grad_norm": 0.5437851149007312, "learning_rate": 8.878865979381444e-06, "loss": 0.0434, "step": 689 }, { "epoch": 0.2668213457076566, "grad_norm": 0.7262313525668356, "learning_rate": 8.891752577319588e-06, "loss": 0.0727, "step": 690 }, { "epoch": 0.2672080433101315, "grad_norm": 0.8189617294038807, "learning_rate": 8.904639175257732e-06, "loss": 0.082, "step": 691 }, { "epoch": 0.2675947409126063, "grad_norm": 0.9044124784549938, "learning_rate": 8.917525773195878e-06, "loss": 0.0735, "step": 692 }, { "epoch": 0.2679814385150812, "grad_norm": 0.5552541585858207, "learning_rate": 8.930412371134021e-06, "loss": 0.0595, "step": 693 }, { "epoch": 0.26836813611755606, "grad_norm": 0.8119806623306439, "learning_rate": 8.943298969072166e-06, "loss": 0.0646, "step": 694 }, { "epoch": 0.26875483372003095, "grad_norm": 0.85627655362849, "learning_rate": 8.95618556701031e-06, "loss": 0.0658, "step": 695 }, { "epoch": 0.2691415313225058, "grad_norm": 0.6753548229998914, "learning_rate": 8.969072164948455e-06, "loss": 0.0698, "step": 696 }, { "epoch": 0.2695282289249807, "grad_norm": 0.6272305564973448, "learning_rate": 8.981958762886599e-06, "loss": 0.0936, "step": 697 }, { "epoch": 0.2699149265274555, "grad_norm": 0.49848661528059485, "learning_rate": 8.994845360824743e-06, "loss": 0.0576, "step": 698 }, { "epoch": 0.2703016241299304, "grad_norm": 0.9473507191628878, "learning_rate": 9.007731958762888e-06, "loss": 0.1227, "step": 699 }, { "epoch": 0.27068832173240526, "grad_norm": 0.664025767728487, "learning_rate": 9.020618556701031e-06, "loss": 0.0553, "step": 700 }, { "epoch": 0.2710750193348801, "grad_norm": 0.6439785250648512, "learning_rate": 9.033505154639176e-06, "loss": 0.0658, "step": 701 }, { "epoch": 0.271461716937355, "grad_norm": 0.8129385130907251, "learning_rate": 9.04639175257732e-06, "loss": 0.0747, "step": 702 }, { "epoch": 0.27184841453982983, "grad_norm": 1.011081505829676, "learning_rate": 9.059278350515464e-06, "loss": 0.0867, "step": 703 }, { "epoch": 0.2722351121423047, "grad_norm": 1.0828559266595417, "learning_rate": 9.072164948453609e-06, "loss": 0.0779, "step": 704 }, { "epoch": 0.27262180974477956, "grad_norm": 0.5273899076276066, "learning_rate": 9.085051546391753e-06, "loss": 0.0657, "step": 705 }, { "epoch": 0.27300850734725446, "grad_norm": 0.6433703241864313, "learning_rate": 9.097938144329898e-06, "loss": 0.0901, "step": 706 }, { "epoch": 0.2733952049497293, "grad_norm": 0.7062309703108304, "learning_rate": 9.110824742268041e-06, "loss": 0.057, "step": 707 }, { "epoch": 0.2737819025522042, "grad_norm": 0.6735051210946806, "learning_rate": 9.123711340206186e-06, "loss": 0.0732, "step": 708 }, { "epoch": 0.27416860015467903, "grad_norm": 0.8342464115975586, "learning_rate": 9.136597938144331e-06, "loss": 0.0645, "step": 709 }, { "epoch": 0.2745552977571539, "grad_norm": 1.0215761850240033, "learning_rate": 9.149484536082474e-06, "loss": 0.086, "step": 710 }, { "epoch": 0.27494199535962877, "grad_norm": 0.7070677719340416, "learning_rate": 9.162371134020619e-06, "loss": 0.056, "step": 711 }, { "epoch": 0.27532869296210366, "grad_norm": 0.636698289222279, "learning_rate": 9.175257731958764e-06, "loss": 0.0612, "step": 712 }, { "epoch": 0.2757153905645785, "grad_norm": 0.5957771096531898, "learning_rate": 9.188144329896908e-06, "loss": 0.0684, "step": 713 }, { "epoch": 0.27610208816705334, "grad_norm": 0.7181952464353456, "learning_rate": 9.201030927835051e-06, "loss": 0.076, "step": 714 }, { "epoch": 0.27648878576952823, "grad_norm": 0.6436542943680317, "learning_rate": 9.213917525773196e-06, "loss": 0.0722, "step": 715 }, { "epoch": 0.2768754833720031, "grad_norm": 0.606973991739112, "learning_rate": 9.226804123711341e-06, "loss": 0.0612, "step": 716 }, { "epoch": 0.27726218097447797, "grad_norm": 0.5905695932974315, "learning_rate": 9.239690721649486e-06, "loss": 0.0781, "step": 717 }, { "epoch": 0.2776488785769528, "grad_norm": 0.6305719741654584, "learning_rate": 9.25257731958763e-06, "loss": 0.0729, "step": 718 }, { "epoch": 0.2780355761794277, "grad_norm": 0.4956783638944779, "learning_rate": 9.265463917525774e-06, "loss": 0.0636, "step": 719 }, { "epoch": 0.27842227378190254, "grad_norm": 0.5610411843837684, "learning_rate": 9.278350515463918e-06, "loss": 0.062, "step": 720 }, { "epoch": 0.27880897138437744, "grad_norm": 1.0750405794603204, "learning_rate": 9.291237113402063e-06, "loss": 0.0903, "step": 721 }, { "epoch": 0.2791956689868523, "grad_norm": 0.651699559673203, "learning_rate": 9.304123711340208e-06, "loss": 0.0737, "step": 722 }, { "epoch": 0.27958236658932717, "grad_norm": 0.9766880515750446, "learning_rate": 9.317010309278351e-06, "loss": 0.0872, "step": 723 }, { "epoch": 0.279969064191802, "grad_norm": 0.5030442035221198, "learning_rate": 9.329896907216496e-06, "loss": 0.0545, "step": 724 }, { "epoch": 0.2803557617942769, "grad_norm": 0.5089346055517512, "learning_rate": 9.34278350515464e-06, "loss": 0.054, "step": 725 }, { "epoch": 0.28074245939675174, "grad_norm": 0.571106031978767, "learning_rate": 9.355670103092784e-06, "loss": 0.0607, "step": 726 }, { "epoch": 0.2811291569992266, "grad_norm": 0.9315842101641564, "learning_rate": 9.368556701030928e-06, "loss": 0.0959, "step": 727 }, { "epoch": 0.2815158546017015, "grad_norm": 0.6146931386880108, "learning_rate": 9.381443298969073e-06, "loss": 0.0655, "step": 728 }, { "epoch": 0.2819025522041763, "grad_norm": 0.4853273971019521, "learning_rate": 9.394329896907216e-06, "loss": 0.0616, "step": 729 }, { "epoch": 0.2822892498066512, "grad_norm": 0.8947757098878881, "learning_rate": 9.407216494845361e-06, "loss": 0.0826, "step": 730 }, { "epoch": 0.28267594740912605, "grad_norm": 0.5443543256121964, "learning_rate": 9.420103092783506e-06, "loss": 0.0532, "step": 731 }, { "epoch": 0.28306264501160094, "grad_norm": 0.8728298582059498, "learning_rate": 9.43298969072165e-06, "loss": 0.0859, "step": 732 }, { "epoch": 0.2834493426140758, "grad_norm": 0.5750817741017774, "learning_rate": 9.445876288659794e-06, "loss": 0.087, "step": 733 }, { "epoch": 0.2838360402165507, "grad_norm": 0.5313244553890849, "learning_rate": 9.458762886597939e-06, "loss": 0.0431, "step": 734 }, { "epoch": 0.2842227378190255, "grad_norm": 0.7646673518754808, "learning_rate": 9.471649484536083e-06, "loss": 0.0813, "step": 735 }, { "epoch": 0.2846094354215004, "grad_norm": 0.6303303681800612, "learning_rate": 9.484536082474226e-06, "loss": 0.0548, "step": 736 }, { "epoch": 0.28499613302397525, "grad_norm": 0.6058320502535719, "learning_rate": 9.497422680412371e-06, "loss": 0.0715, "step": 737 }, { "epoch": 0.2853828306264501, "grad_norm": 0.6766186494772161, "learning_rate": 9.510309278350516e-06, "loss": 0.0738, "step": 738 }, { "epoch": 0.285769528228925, "grad_norm": 0.6037982895537563, "learning_rate": 9.52319587628866e-06, "loss": 0.0676, "step": 739 }, { "epoch": 0.2861562258313998, "grad_norm": 0.6921048735524884, "learning_rate": 9.536082474226806e-06, "loss": 0.0782, "step": 740 }, { "epoch": 0.2865429234338747, "grad_norm": 0.6952114090055467, "learning_rate": 9.54896907216495e-06, "loss": 0.066, "step": 741 }, { "epoch": 0.28692962103634956, "grad_norm": 0.7440228072450382, "learning_rate": 9.561855670103093e-06, "loss": 0.0853, "step": 742 }, { "epoch": 0.28731631863882445, "grad_norm": 0.54962338102697, "learning_rate": 9.574742268041238e-06, "loss": 0.067, "step": 743 }, { "epoch": 0.2877030162412993, "grad_norm": 0.6043240794282718, "learning_rate": 9.587628865979383e-06, "loss": 0.0673, "step": 744 }, { "epoch": 0.2880897138437742, "grad_norm": 0.5183167007417208, "learning_rate": 9.600515463917526e-06, "loss": 0.0552, "step": 745 }, { "epoch": 0.288476411446249, "grad_norm": 0.7035734029210619, "learning_rate": 9.613402061855671e-06, "loss": 0.0665, "step": 746 }, { "epoch": 0.2888631090487239, "grad_norm": 0.5113145961481558, "learning_rate": 9.626288659793816e-06, "loss": 0.0508, "step": 747 }, { "epoch": 0.28924980665119876, "grad_norm": 0.5002173066783324, "learning_rate": 9.63917525773196e-06, "loss": 0.0656, "step": 748 }, { "epoch": 0.28963650425367365, "grad_norm": 0.4358246779914215, "learning_rate": 9.652061855670104e-06, "loss": 0.0559, "step": 749 }, { "epoch": 0.2900232018561485, "grad_norm": 0.49114245643651844, "learning_rate": 9.664948453608248e-06, "loss": 0.0581, "step": 750 }, { "epoch": 0.29040989945862333, "grad_norm": 0.549830941907731, "learning_rate": 9.677835051546393e-06, "loss": 0.0497, "step": 751 }, { "epoch": 0.2907965970610982, "grad_norm": 0.5670587888227043, "learning_rate": 9.690721649484536e-06, "loss": 0.0697, "step": 752 }, { "epoch": 0.29118329466357307, "grad_norm": 0.5438164242386709, "learning_rate": 9.703608247422681e-06, "loss": 0.0549, "step": 753 }, { "epoch": 0.29156999226604796, "grad_norm": 0.5716722224962597, "learning_rate": 9.716494845360826e-06, "loss": 0.06, "step": 754 }, { "epoch": 0.2919566898685228, "grad_norm": 0.427464830198663, "learning_rate": 9.72938144329897e-06, "loss": 0.0507, "step": 755 }, { "epoch": 0.2923433874709977, "grad_norm": 0.471746763623389, "learning_rate": 9.742268041237114e-06, "loss": 0.0577, "step": 756 }, { "epoch": 0.29273008507347253, "grad_norm": 0.598790503756667, "learning_rate": 9.755154639175258e-06, "loss": 0.0607, "step": 757 }, { "epoch": 0.29311678267594743, "grad_norm": 0.6711526534978062, "learning_rate": 9.768041237113403e-06, "loss": 0.0847, "step": 758 }, { "epoch": 0.29350348027842227, "grad_norm": 0.8950957740313106, "learning_rate": 9.780927835051546e-06, "loss": 0.0788, "step": 759 }, { "epoch": 0.29389017788089716, "grad_norm": 0.8683053648664752, "learning_rate": 9.793814432989691e-06, "loss": 0.0784, "step": 760 }, { "epoch": 0.294276875483372, "grad_norm": 1.1249977919978569, "learning_rate": 9.806701030927836e-06, "loss": 0.0777, "step": 761 }, { "epoch": 0.29466357308584684, "grad_norm": 0.72672324836126, "learning_rate": 9.819587628865979e-06, "loss": 0.0718, "step": 762 }, { "epoch": 0.29505027068832174, "grad_norm": 0.5526171945701548, "learning_rate": 9.832474226804124e-06, "loss": 0.0592, "step": 763 }, { "epoch": 0.2954369682907966, "grad_norm": 0.5183950089049724, "learning_rate": 9.84536082474227e-06, "loss": 0.0481, "step": 764 }, { "epoch": 0.29582366589327147, "grad_norm": 1.3392205323645943, "learning_rate": 9.858247422680413e-06, "loss": 0.0885, "step": 765 }, { "epoch": 0.2962103634957463, "grad_norm": 0.4581700232143298, "learning_rate": 9.871134020618558e-06, "loss": 0.0496, "step": 766 }, { "epoch": 0.2965970610982212, "grad_norm": 0.9529476692851886, "learning_rate": 9.884020618556703e-06, "loss": 0.0874, "step": 767 }, { "epoch": 0.29698375870069604, "grad_norm": 0.4844861739800673, "learning_rate": 9.896907216494846e-06, "loss": 0.0599, "step": 768 }, { "epoch": 0.29737045630317094, "grad_norm": 0.7900974125676071, "learning_rate": 9.90979381443299e-06, "loss": 0.0758, "step": 769 }, { "epoch": 0.2977571539056458, "grad_norm": 0.6136164546427771, "learning_rate": 9.922680412371136e-06, "loss": 0.0535, "step": 770 }, { "epoch": 0.29814385150812067, "grad_norm": 0.581290411541795, "learning_rate": 9.935567010309279e-06, "loss": 0.0627, "step": 771 }, { "epoch": 0.2985305491105955, "grad_norm": 0.5560067830685645, "learning_rate": 9.948453608247423e-06, "loss": 0.0723, "step": 772 }, { "epoch": 0.2989172467130704, "grad_norm": 0.84067274883239, "learning_rate": 9.961340206185568e-06, "loss": 0.085, "step": 773 }, { "epoch": 0.29930394431554525, "grad_norm": 0.5030223288018769, "learning_rate": 9.974226804123713e-06, "loss": 0.077, "step": 774 }, { "epoch": 0.2996906419180201, "grad_norm": 0.5548449185468834, "learning_rate": 9.987113402061856e-06, "loss": 0.0643, "step": 775 }, { "epoch": 0.300077339520495, "grad_norm": 0.4019940789380299, "learning_rate": 1e-05, "loss": 0.056, "step": 776 }, { "epoch": 0.3004640371229698, "grad_norm": 0.4706139977207997, "learning_rate": 9.999999493849048e-06, "loss": 0.0623, "step": 777 }, { "epoch": 0.3008507347254447, "grad_norm": 0.6444755406314291, "learning_rate": 9.999997975396295e-06, "loss": 0.0788, "step": 778 }, { "epoch": 0.30123743232791955, "grad_norm": 0.5426856350456143, "learning_rate": 9.99999544464205e-06, "loss": 0.0835, "step": 779 }, { "epoch": 0.30162412993039445, "grad_norm": 0.5719839786609306, "learning_rate": 9.999991901586824e-06, "loss": 0.0486, "step": 780 }, { "epoch": 0.3020108275328693, "grad_norm": 0.48314493230267586, "learning_rate": 9.999987346231333e-06, "loss": 0.0545, "step": 781 }, { "epoch": 0.3023975251353442, "grad_norm": 0.9525130547927393, "learning_rate": 9.999981778576499e-06, "loss": 0.1229, "step": 782 }, { "epoch": 0.302784222737819, "grad_norm": 0.5339578904705017, "learning_rate": 9.999975198623454e-06, "loss": 0.0637, "step": 783 }, { "epoch": 0.3031709203402939, "grad_norm": 0.5590493764320357, "learning_rate": 9.999967606373523e-06, "loss": 0.0647, "step": 784 }, { "epoch": 0.30355761794276875, "grad_norm": 0.7558710254199348, "learning_rate": 9.99995900182825e-06, "loss": 0.0742, "step": 785 }, { "epoch": 0.3039443155452436, "grad_norm": 0.5260468273210764, "learning_rate": 9.999949384989374e-06, "loss": 0.0602, "step": 786 }, { "epoch": 0.3043310131477185, "grad_norm": 0.4731735294012572, "learning_rate": 9.99993875585884e-06, "loss": 0.0607, "step": 787 }, { "epoch": 0.3047177107501933, "grad_norm": 0.7937306611712004, "learning_rate": 9.999927114438803e-06, "loss": 0.0759, "step": 788 }, { "epoch": 0.3051044083526682, "grad_norm": 0.7549080024381307, "learning_rate": 9.99991446073162e-06, "loss": 0.0648, "step": 789 }, { "epoch": 0.30549110595514306, "grad_norm": 0.7545535532336851, "learning_rate": 9.999900794739853e-06, "loss": 0.0805, "step": 790 }, { "epoch": 0.30587780355761796, "grad_norm": 0.4350942921893401, "learning_rate": 9.999886116466263e-06, "loss": 0.0552, "step": 791 }, { "epoch": 0.3062645011600928, "grad_norm": 0.5523585721787605, "learning_rate": 9.999870425913831e-06, "loss": 0.0577, "step": 792 }, { "epoch": 0.3066511987625677, "grad_norm": 0.7354504848659215, "learning_rate": 9.999853723085729e-06, "loss": 0.0587, "step": 793 }, { "epoch": 0.30703789636504253, "grad_norm": 0.8763159248040281, "learning_rate": 9.999836007985338e-06, "loss": 0.0688, "step": 794 }, { "epoch": 0.3074245939675174, "grad_norm": 0.5497495433604525, "learning_rate": 9.999817280616244e-06, "loss": 0.0733, "step": 795 }, { "epoch": 0.30781129156999226, "grad_norm": 0.44659604042943274, "learning_rate": 9.999797540982241e-06, "loss": 0.0536, "step": 796 }, { "epoch": 0.30819798917246716, "grad_norm": 0.4398575609751322, "learning_rate": 9.999776789087325e-06, "loss": 0.0428, "step": 797 }, { "epoch": 0.308584686774942, "grad_norm": 0.5721198685585271, "learning_rate": 9.999755024935697e-06, "loss": 0.0647, "step": 798 }, { "epoch": 0.30897138437741684, "grad_norm": 0.5859846396196956, "learning_rate": 9.999732248531763e-06, "loss": 0.0725, "step": 799 }, { "epoch": 0.30935808197989173, "grad_norm": 0.5724477196812394, "learning_rate": 9.999708459880134e-06, "loss": 0.0769, "step": 800 }, { "epoch": 0.30974477958236657, "grad_norm": 0.8387685755424121, "learning_rate": 9.999683658985628e-06, "loss": 0.0902, "step": 801 }, { "epoch": 0.31013147718484146, "grad_norm": 0.6238946850384559, "learning_rate": 9.999657845853265e-06, "loss": 0.0787, "step": 802 }, { "epoch": 0.3105181747873163, "grad_norm": 0.6182553592421567, "learning_rate": 9.999631020488272e-06, "loss": 0.0604, "step": 803 }, { "epoch": 0.3109048723897912, "grad_norm": 0.5548409878979409, "learning_rate": 9.999603182896077e-06, "loss": 0.0695, "step": 804 }, { "epoch": 0.31129156999226604, "grad_norm": 0.6074674990791001, "learning_rate": 9.99957433308232e-06, "loss": 0.0692, "step": 805 }, { "epoch": 0.31167826759474093, "grad_norm": 0.6479192421456527, "learning_rate": 9.99954447105284e-06, "loss": 0.0661, "step": 806 }, { "epoch": 0.31206496519721577, "grad_norm": 0.6496793006878071, "learning_rate": 9.999513596813682e-06, "loss": 0.0716, "step": 807 }, { "epoch": 0.31245166279969067, "grad_norm": 0.53388688941301, "learning_rate": 9.9994817103711e-06, "loss": 0.054, "step": 808 }, { "epoch": 0.3128383604021655, "grad_norm": 0.48153253750914154, "learning_rate": 9.999448811731547e-06, "loss": 0.0623, "step": 809 }, { "epoch": 0.31322505800464034, "grad_norm": 0.5176902662574686, "learning_rate": 9.999414900901684e-06, "loss": 0.0524, "step": 810 }, { "epoch": 0.31361175560711524, "grad_norm": 0.5253184271292107, "learning_rate": 9.999379977888378e-06, "loss": 0.0547, "step": 811 }, { "epoch": 0.3139984532095901, "grad_norm": 0.618345308490765, "learning_rate": 9.999344042698697e-06, "loss": 0.059, "step": 812 }, { "epoch": 0.314385150812065, "grad_norm": 0.4555081604679781, "learning_rate": 9.999307095339918e-06, "loss": 0.0554, "step": 813 }, { "epoch": 0.3147718484145398, "grad_norm": 0.44419828484769114, "learning_rate": 9.999269135819522e-06, "loss": 0.0634, "step": 814 }, { "epoch": 0.3151585460170147, "grad_norm": 0.5985271971618881, "learning_rate": 9.999230164145195e-06, "loss": 0.0763, "step": 815 }, { "epoch": 0.31554524361948955, "grad_norm": 0.5045996124679334, "learning_rate": 9.999190180324824e-06, "loss": 0.0491, "step": 816 }, { "epoch": 0.31593194122196444, "grad_norm": 0.4973768269616357, "learning_rate": 9.999149184366505e-06, "loss": 0.053, "step": 817 }, { "epoch": 0.3163186388244393, "grad_norm": 0.7239013262256084, "learning_rate": 9.999107176278542e-06, "loss": 0.0645, "step": 818 }, { "epoch": 0.3167053364269142, "grad_norm": 0.5712172572567139, "learning_rate": 9.999064156069436e-06, "loss": 0.0647, "step": 819 }, { "epoch": 0.317092034029389, "grad_norm": 0.5501979904802516, "learning_rate": 9.999020123747897e-06, "loss": 0.066, "step": 820 }, { "epoch": 0.3174787316318639, "grad_norm": 0.5032756124476997, "learning_rate": 9.998975079322842e-06, "loss": 0.0574, "step": 821 }, { "epoch": 0.31786542923433875, "grad_norm": 0.6403081568519022, "learning_rate": 9.998929022803387e-06, "loss": 0.0763, "step": 822 }, { "epoch": 0.3182521268368136, "grad_norm": 0.5779203932961345, "learning_rate": 9.998881954198861e-06, "loss": 0.0701, "step": 823 }, { "epoch": 0.3186388244392885, "grad_norm": 0.9167628439278658, "learning_rate": 9.998833873518792e-06, "loss": 0.0839, "step": 824 }, { "epoch": 0.3190255220417633, "grad_norm": 0.4712789630719075, "learning_rate": 9.998784780772912e-06, "loss": 0.0604, "step": 825 }, { "epoch": 0.3194122196442382, "grad_norm": 0.7639745242342065, "learning_rate": 9.998734675971165e-06, "loss": 0.0635, "step": 826 }, { "epoch": 0.31979891724671305, "grad_norm": 0.6922254687926921, "learning_rate": 9.99868355912369e-06, "loss": 0.0789, "step": 827 }, { "epoch": 0.32018561484918795, "grad_norm": 0.5631639151428467, "learning_rate": 9.99863143024084e-06, "loss": 0.0637, "step": 828 }, { "epoch": 0.3205723124516628, "grad_norm": 0.6786263975750112, "learning_rate": 9.998578289333167e-06, "loss": 0.0582, "step": 829 }, { "epoch": 0.3209590100541377, "grad_norm": 0.6283699010856082, "learning_rate": 9.998524136411432e-06, "loss": 0.0777, "step": 830 }, { "epoch": 0.3213457076566125, "grad_norm": 0.45684278986994276, "learning_rate": 9.998468971486597e-06, "loss": 0.0658, "step": 831 }, { "epoch": 0.3217324052590874, "grad_norm": 0.4609128226980892, "learning_rate": 9.998412794569832e-06, "loss": 0.0534, "step": 832 }, { "epoch": 0.32211910286156226, "grad_norm": 0.671469874791923, "learning_rate": 9.998355605672507e-06, "loss": 0.0964, "step": 833 }, { "epoch": 0.3225058004640371, "grad_norm": 0.5003364595273461, "learning_rate": 9.998297404806206e-06, "loss": 0.0611, "step": 834 }, { "epoch": 0.322892498066512, "grad_norm": 0.6205728506828355, "learning_rate": 9.99823819198271e-06, "loss": 0.0773, "step": 835 }, { "epoch": 0.32327919566898683, "grad_norm": 0.4917619433697458, "learning_rate": 9.998177967214004e-06, "loss": 0.0632, "step": 836 }, { "epoch": 0.3236658932714617, "grad_norm": 0.47855186365521346, "learning_rate": 9.998116730512287e-06, "loss": 0.0668, "step": 837 }, { "epoch": 0.32405259087393656, "grad_norm": 0.5283565851228643, "learning_rate": 9.998054481889953e-06, "loss": 0.065, "step": 838 }, { "epoch": 0.32443928847641146, "grad_norm": 0.5204919364541929, "learning_rate": 9.997991221359605e-06, "loss": 0.0757, "step": 839 }, { "epoch": 0.3248259860788863, "grad_norm": 0.47588753097269265, "learning_rate": 9.997926948934054e-06, "loss": 0.0576, "step": 840 }, { "epoch": 0.3252126836813612, "grad_norm": 0.4215578708365482, "learning_rate": 9.997861664626307e-06, "loss": 0.0499, "step": 841 }, { "epoch": 0.32559938128383603, "grad_norm": 0.32502328887398724, "learning_rate": 9.997795368449587e-06, "loss": 0.0489, "step": 842 }, { "epoch": 0.3259860788863109, "grad_norm": 0.40481477658421006, "learning_rate": 9.997728060417313e-06, "loss": 0.0533, "step": 843 }, { "epoch": 0.32637277648878577, "grad_norm": 0.5491043080753264, "learning_rate": 9.997659740543115e-06, "loss": 0.0525, "step": 844 }, { "epoch": 0.32675947409126066, "grad_norm": 0.5267922255389085, "learning_rate": 9.997590408840823e-06, "loss": 0.0553, "step": 845 }, { "epoch": 0.3271461716937355, "grad_norm": 0.6910755385433012, "learning_rate": 9.997520065324474e-06, "loss": 0.0864, "step": 846 }, { "epoch": 0.32753286929621034, "grad_norm": 0.5420991253268458, "learning_rate": 9.99744871000831e-06, "loss": 0.0825, "step": 847 }, { "epoch": 0.32791956689868523, "grad_norm": 0.6588138121051968, "learning_rate": 9.997376342906777e-06, "loss": 0.0522, "step": 848 }, { "epoch": 0.32830626450116007, "grad_norm": 0.596767987245927, "learning_rate": 9.997302964034527e-06, "loss": 0.0646, "step": 849 }, { "epoch": 0.32869296210363497, "grad_norm": 0.5581856945279121, "learning_rate": 9.997228573406419e-06, "loss": 0.051, "step": 850 }, { "epoch": 0.3290796597061098, "grad_norm": 0.38439680824632255, "learning_rate": 9.997153171037509e-06, "loss": 0.0571, "step": 851 }, { "epoch": 0.3294663573085847, "grad_norm": 0.4580306389170435, "learning_rate": 9.997076756943067e-06, "loss": 0.0565, "step": 852 }, { "epoch": 0.32985305491105954, "grad_norm": 0.5782557094732568, "learning_rate": 9.996999331138562e-06, "loss": 0.0709, "step": 853 }, { "epoch": 0.33023975251353443, "grad_norm": 0.4754874322290346, "learning_rate": 9.996920893639672e-06, "loss": 0.0614, "step": 854 }, { "epoch": 0.3306264501160093, "grad_norm": 0.4288182618450654, "learning_rate": 9.996841444462273e-06, "loss": 0.0592, "step": 855 }, { "epoch": 0.33101314771848417, "grad_norm": 0.5513502297884308, "learning_rate": 9.996760983622453e-06, "loss": 0.0646, "step": 856 }, { "epoch": 0.331399845320959, "grad_norm": 0.43947007401412463, "learning_rate": 9.996679511136504e-06, "loss": 0.0575, "step": 857 }, { "epoch": 0.33178654292343385, "grad_norm": 0.4804873682807887, "learning_rate": 9.996597027020917e-06, "loss": 0.0809, "step": 858 }, { "epoch": 0.33217324052590874, "grad_norm": 0.40468196443035886, "learning_rate": 9.996513531292396e-06, "loss": 0.0548, "step": 859 }, { "epoch": 0.3325599381283836, "grad_norm": 1.0395570827438696, "learning_rate": 9.996429023967841e-06, "loss": 0.0706, "step": 860 }, { "epoch": 0.3329466357308585, "grad_norm": 0.641724712885977, "learning_rate": 9.996343505064368e-06, "loss": 0.0534, "step": 861 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5944973852554191, "learning_rate": 9.996256974599283e-06, "loss": 0.0669, "step": 862 }, { "epoch": 0.3337200309358082, "grad_norm": 0.4573439967580408, "learning_rate": 9.99616943259011e-06, "loss": 0.0464, "step": 863 }, { "epoch": 0.33410672853828305, "grad_norm": 0.48084644311113856, "learning_rate": 9.996080879054571e-06, "loss": 0.0616, "step": 864 }, { "epoch": 0.33449342614075794, "grad_norm": 1.336437340600204, "learning_rate": 9.995991314010597e-06, "loss": 0.0578, "step": 865 }, { "epoch": 0.3348801237432328, "grad_norm": 0.5285307831940175, "learning_rate": 9.995900737476319e-06, "loss": 0.0594, "step": 866 }, { "epoch": 0.3352668213457077, "grad_norm": 0.5013221941399281, "learning_rate": 9.995809149470075e-06, "loss": 0.0647, "step": 867 }, { "epoch": 0.3356535189481825, "grad_norm": 0.35648219448046414, "learning_rate": 9.99571655001041e-06, "loss": 0.0456, "step": 868 }, { "epoch": 0.3360402165506574, "grad_norm": 0.5284255134904973, "learning_rate": 9.99562293911607e-06, "loss": 0.059, "step": 869 }, { "epoch": 0.33642691415313225, "grad_norm": 0.5616824481335005, "learning_rate": 9.995528316806007e-06, "loss": 0.0644, "step": 870 }, { "epoch": 0.3368136117556071, "grad_norm": 0.5261402015343671, "learning_rate": 9.99543268309938e-06, "loss": 0.0667, "step": 871 }, { "epoch": 0.337200309358082, "grad_norm": 0.4688763236223633, "learning_rate": 9.995336038015551e-06, "loss": 0.0529, "step": 872 }, { "epoch": 0.3375870069605568, "grad_norm": 0.6754891886290719, "learning_rate": 9.995238381574087e-06, "loss": 0.0905, "step": 873 }, { "epoch": 0.3379737045630317, "grad_norm": 0.5880816465003307, "learning_rate": 9.995139713794756e-06, "loss": 0.0715, "step": 874 }, { "epoch": 0.33836040216550656, "grad_norm": 0.4115800473075557, "learning_rate": 9.995040034697539e-06, "loss": 0.0472, "step": 875 }, { "epoch": 0.33874709976798145, "grad_norm": 0.5049018741117791, "learning_rate": 9.994939344302613e-06, "loss": 0.0512, "step": 876 }, { "epoch": 0.3391337973704563, "grad_norm": 0.8998564732077532, "learning_rate": 9.994837642630369e-06, "loss": 0.0664, "step": 877 }, { "epoch": 0.3395204949729312, "grad_norm": 1.2311229874715524, "learning_rate": 9.99473492970139e-06, "loss": 0.0841, "step": 878 }, { "epoch": 0.339907192575406, "grad_norm": 0.757668095396291, "learning_rate": 9.99463120553648e-06, "loss": 0.0687, "step": 879 }, { "epoch": 0.3402938901778809, "grad_norm": 0.5670169562246878, "learning_rate": 9.994526470156634e-06, "loss": 0.0632, "step": 880 }, { "epoch": 0.34068058778035576, "grad_norm": 0.6067487466033227, "learning_rate": 9.994420723583055e-06, "loss": 0.065, "step": 881 }, { "epoch": 0.34106728538283065, "grad_norm": 0.4537330485166544, "learning_rate": 9.994313965837157e-06, "loss": 0.0532, "step": 882 }, { "epoch": 0.3414539829853055, "grad_norm": 0.5199793723271581, "learning_rate": 9.994206196940552e-06, "loss": 0.0649, "step": 883 }, { "epoch": 0.34184068058778033, "grad_norm": 0.38838417350101456, "learning_rate": 9.99409741691506e-06, "loss": 0.0474, "step": 884 }, { "epoch": 0.3422273781902552, "grad_norm": 0.4615709891288872, "learning_rate": 9.993987625782702e-06, "loss": 0.0627, "step": 885 }, { "epoch": 0.34261407579273007, "grad_norm": 0.549226413960182, "learning_rate": 9.993876823565708e-06, "loss": 0.0713, "step": 886 }, { "epoch": 0.34300077339520496, "grad_norm": 0.5227978657026056, "learning_rate": 9.993765010286514e-06, "loss": 0.0655, "step": 887 }, { "epoch": 0.3433874709976798, "grad_norm": 0.5555973242852047, "learning_rate": 9.993652185967753e-06, "loss": 0.0816, "step": 888 }, { "epoch": 0.3437741686001547, "grad_norm": 0.4977363128600217, "learning_rate": 9.99353835063227e-06, "loss": 0.0717, "step": 889 }, { "epoch": 0.34416086620262953, "grad_norm": 0.6157936491836619, "learning_rate": 9.993423504303111e-06, "loss": 0.0633, "step": 890 }, { "epoch": 0.34454756380510443, "grad_norm": 0.5319626577954446, "learning_rate": 9.993307647003529e-06, "loss": 0.055, "step": 891 }, { "epoch": 0.34493426140757927, "grad_norm": 0.5738187147834469, "learning_rate": 9.993190778756981e-06, "loss": 0.0671, "step": 892 }, { "epoch": 0.34532095901005416, "grad_norm": 0.4574675650881015, "learning_rate": 9.993072899587124e-06, "loss": 0.0552, "step": 893 }, { "epoch": 0.345707656612529, "grad_norm": 0.6224556583845869, "learning_rate": 9.99295400951783e-06, "loss": 0.0688, "step": 894 }, { "epoch": 0.34609435421500384, "grad_norm": 0.4937898209431141, "learning_rate": 9.992834108573164e-06, "loss": 0.0536, "step": 895 }, { "epoch": 0.34648105181747874, "grad_norm": 0.4636594863857891, "learning_rate": 9.992713196777406e-06, "loss": 0.0553, "step": 896 }, { "epoch": 0.3468677494199536, "grad_norm": 0.5285873411740356, "learning_rate": 9.992591274155033e-06, "loss": 0.0654, "step": 897 }, { "epoch": 0.34725444702242847, "grad_norm": 0.8808847637985185, "learning_rate": 9.99246834073073e-06, "loss": 0.0739, "step": 898 }, { "epoch": 0.3476411446249033, "grad_norm": 0.7197150622348065, "learning_rate": 9.992344396529385e-06, "loss": 0.0699, "step": 899 }, { "epoch": 0.3480278422273782, "grad_norm": 0.5569780336347386, "learning_rate": 9.992219441576095e-06, "loss": 0.0606, "step": 900 }, { "epoch": 0.34841453982985304, "grad_norm": 0.5118458985420365, "learning_rate": 9.992093475896154e-06, "loss": 0.052, "step": 901 }, { "epoch": 0.34880123743232794, "grad_norm": 0.4731941614482374, "learning_rate": 9.991966499515069e-06, "loss": 0.0614, "step": 902 }, { "epoch": 0.3491879350348028, "grad_norm": 0.8728383176322817, "learning_rate": 9.991838512458545e-06, "loss": 0.0935, "step": 903 }, { "epoch": 0.34957463263727767, "grad_norm": 0.7450661825052104, "learning_rate": 9.991709514752497e-06, "loss": 0.071, "step": 904 }, { "epoch": 0.3499613302397525, "grad_norm": 0.6289621993047319, "learning_rate": 9.99157950642304e-06, "loss": 0.0684, "step": 905 }, { "epoch": 0.3503480278422274, "grad_norm": 0.7231375519035692, "learning_rate": 9.991448487496496e-06, "loss": 0.0796, "step": 906 }, { "epoch": 0.35073472544470224, "grad_norm": 0.6232564768238267, "learning_rate": 9.99131645799939e-06, "loss": 0.0714, "step": 907 }, { "epoch": 0.3511214230471771, "grad_norm": 0.5451412611597958, "learning_rate": 9.991183417958456e-06, "loss": 0.0517, "step": 908 }, { "epoch": 0.351508120649652, "grad_norm": 0.6231818763187501, "learning_rate": 9.991049367400624e-06, "loss": 0.0531, "step": 909 }, { "epoch": 0.3518948182521268, "grad_norm": 0.4818967395329347, "learning_rate": 9.99091430635304e-06, "loss": 0.0527, "step": 910 }, { "epoch": 0.3522815158546017, "grad_norm": 0.5414326444291431, "learning_rate": 9.990778234843043e-06, "loss": 0.065, "step": 911 }, { "epoch": 0.35266821345707655, "grad_norm": 0.5266582797736398, "learning_rate": 9.990641152898188e-06, "loss": 0.0563, "step": 912 }, { "epoch": 0.35305491105955145, "grad_norm": 0.4201258853878456, "learning_rate": 9.990503060546224e-06, "loss": 0.0488, "step": 913 }, { "epoch": 0.3534416086620263, "grad_norm": 0.533618886307557, "learning_rate": 9.99036395781511e-06, "loss": 0.0603, "step": 914 }, { "epoch": 0.3538283062645012, "grad_norm": 0.7831500994029064, "learning_rate": 9.99022384473301e-06, "loss": 0.0598, "step": 915 }, { "epoch": 0.354215003866976, "grad_norm": 0.44846612910500994, "learning_rate": 9.99008272132829e-06, "loss": 0.0478, "step": 916 }, { "epoch": 0.3546017014694509, "grad_norm": 0.5195842060331396, "learning_rate": 9.98994058762952e-06, "loss": 0.0547, "step": 917 }, { "epoch": 0.35498839907192575, "grad_norm": 0.5719888581690001, "learning_rate": 9.989797443665484e-06, "loss": 0.065, "step": 918 }, { "epoch": 0.3553750966744006, "grad_norm": 0.8504675711562705, "learning_rate": 9.989653289465155e-06, "loss": 0.0567, "step": 919 }, { "epoch": 0.3557617942768755, "grad_norm": 0.3887739944834127, "learning_rate": 9.989508125057723e-06, "loss": 0.0566, "step": 920 }, { "epoch": 0.3561484918793503, "grad_norm": 0.5511389067510651, "learning_rate": 9.989361950472576e-06, "loss": 0.0479, "step": 921 }, { "epoch": 0.3565351894818252, "grad_norm": 0.5350631332742368, "learning_rate": 9.989214765739308e-06, "loss": 0.0611, "step": 922 }, { "epoch": 0.35692188708430006, "grad_norm": 0.5147667649766557, "learning_rate": 9.98906657088772e-06, "loss": 0.0446, "step": 923 }, { "epoch": 0.35730858468677495, "grad_norm": 0.42410260948108247, "learning_rate": 9.988917365947817e-06, "loss": 0.0466, "step": 924 }, { "epoch": 0.3576952822892498, "grad_norm": 0.41868643789051985, "learning_rate": 9.988767150949802e-06, "loss": 0.0555, "step": 925 }, { "epoch": 0.3580819798917247, "grad_norm": 0.5130121403642889, "learning_rate": 9.988615925924092e-06, "loss": 0.0507, "step": 926 }, { "epoch": 0.35846867749419953, "grad_norm": 0.9278879695308337, "learning_rate": 9.988463690901302e-06, "loss": 0.0821, "step": 927 }, { "epoch": 0.3588553750966744, "grad_norm": 0.47534734929430517, "learning_rate": 9.988310445912255e-06, "loss": 0.07, "step": 928 }, { "epoch": 0.35924207269914926, "grad_norm": 0.6153513450369307, "learning_rate": 9.988156190987974e-06, "loss": 0.08, "step": 929 }, { "epoch": 0.35962877030162416, "grad_norm": 0.4825102600850624, "learning_rate": 9.988000926159694e-06, "loss": 0.0494, "step": 930 }, { "epoch": 0.360015467904099, "grad_norm": 0.6417090923240601, "learning_rate": 9.987844651458847e-06, "loss": 0.0547, "step": 931 }, { "epoch": 0.36040216550657383, "grad_norm": 0.39040936482453437, "learning_rate": 9.987687366917073e-06, "loss": 0.0448, "step": 932 }, { "epoch": 0.36078886310904873, "grad_norm": 0.47889757774809133, "learning_rate": 9.987529072566215e-06, "loss": 0.0447, "step": 933 }, { "epoch": 0.36117556071152357, "grad_norm": 0.6704658979012652, "learning_rate": 9.987369768438324e-06, "loss": 0.0435, "step": 934 }, { "epoch": 0.36156225831399846, "grad_norm": 1.0550202477944803, "learning_rate": 9.98720945456565e-06, "loss": 0.0889, "step": 935 }, { "epoch": 0.3619489559164733, "grad_norm": 0.49312145206526214, "learning_rate": 9.987048130980651e-06, "loss": 0.0538, "step": 936 }, { "epoch": 0.3623356535189482, "grad_norm": 0.396636473231423, "learning_rate": 9.98688579771599e-06, "loss": 0.0478, "step": 937 }, { "epoch": 0.36272235112142304, "grad_norm": 0.5518218978527093, "learning_rate": 9.986722454804531e-06, "loss": 0.0489, "step": 938 }, { "epoch": 0.36310904872389793, "grad_norm": 0.7221247479507237, "learning_rate": 9.986558102279347e-06, "loss": 0.0797, "step": 939 }, { "epoch": 0.36349574632637277, "grad_norm": 0.5629958211573362, "learning_rate": 9.98639274017371e-06, "loss": 0.0601, "step": 940 }, { "epoch": 0.36388244392884767, "grad_norm": 0.9697350914069193, "learning_rate": 9.9862263685211e-06, "loss": 0.0863, "step": 941 }, { "epoch": 0.3642691415313225, "grad_norm": 0.5992423670579664, "learning_rate": 9.986058987355203e-06, "loss": 0.0729, "step": 942 }, { "epoch": 0.36465583913379734, "grad_norm": 0.5210377716952499, "learning_rate": 9.985890596709905e-06, "loss": 0.0681, "step": 943 }, { "epoch": 0.36504253673627224, "grad_norm": 0.9317158190543812, "learning_rate": 9.985721196619297e-06, "loss": 0.0652, "step": 944 }, { "epoch": 0.3654292343387471, "grad_norm": 0.5690735238244989, "learning_rate": 9.98555078711768e-06, "loss": 0.0685, "step": 945 }, { "epoch": 0.365815931941222, "grad_norm": 0.6549117403390879, "learning_rate": 9.98537936823955e-06, "loss": 0.0717, "step": 946 }, { "epoch": 0.3662026295436968, "grad_norm": 0.8790838103498281, "learning_rate": 9.985206940019618e-06, "loss": 0.1027, "step": 947 }, { "epoch": 0.3665893271461717, "grad_norm": 0.6858252633075806, "learning_rate": 9.98503350249279e-06, "loss": 0.0538, "step": 948 }, { "epoch": 0.36697602474864655, "grad_norm": 0.5117938607130412, "learning_rate": 9.984859055694181e-06, "loss": 0.0421, "step": 949 }, { "epoch": 0.36736272235112144, "grad_norm": 0.5202515805682434, "learning_rate": 9.98468359965911e-06, "loss": 0.0461, "step": 950 }, { "epoch": 0.3677494199535963, "grad_norm": 0.9229230591185166, "learning_rate": 9.9845071344231e-06, "loss": 0.0764, "step": 951 }, { "epoch": 0.3681361175560712, "grad_norm": 0.5145946029554698, "learning_rate": 9.984329660021877e-06, "loss": 0.0678, "step": 952 }, { "epoch": 0.368522815158546, "grad_norm": 0.5247467274449752, "learning_rate": 9.984151176491376e-06, "loss": 0.0539, "step": 953 }, { "epoch": 0.3689095127610209, "grad_norm": 0.5293904347033866, "learning_rate": 9.983971683867729e-06, "loss": 0.0508, "step": 954 }, { "epoch": 0.36929621036349575, "grad_norm": 0.5688456416616888, "learning_rate": 9.983791182187276e-06, "loss": 0.0508, "step": 955 }, { "epoch": 0.3696829079659706, "grad_norm": 0.4853064031028958, "learning_rate": 9.983609671486567e-06, "loss": 0.0558, "step": 956 }, { "epoch": 0.3700696055684455, "grad_norm": 0.5001270197400045, "learning_rate": 9.983427151802343e-06, "loss": 0.0567, "step": 957 }, { "epoch": 0.3704563031709203, "grad_norm": 0.49021790420475225, "learning_rate": 9.983243623171564e-06, "loss": 0.0504, "step": 958 }, { "epoch": 0.3708430007733952, "grad_norm": 0.4849346148190077, "learning_rate": 9.98305908563138e-06, "loss": 0.0557, "step": 959 }, { "epoch": 0.37122969837587005, "grad_norm": 0.5518916195147923, "learning_rate": 9.98287353921916e-06, "loss": 0.054, "step": 960 }, { "epoch": 0.37161639597834495, "grad_norm": 0.3873687214106472, "learning_rate": 9.982686983972464e-06, "loss": 0.0437, "step": 961 }, { "epoch": 0.3720030935808198, "grad_norm": 0.8185088592393813, "learning_rate": 9.982499419929067e-06, "loss": 0.0678, "step": 962 }, { "epoch": 0.3723897911832947, "grad_norm": 0.5180822750014515, "learning_rate": 9.982310847126939e-06, "loss": 0.0674, "step": 963 }, { "epoch": 0.3727764887857695, "grad_norm": 0.5451937041931791, "learning_rate": 9.982121265604262e-06, "loss": 0.0559, "step": 964 }, { "epoch": 0.3731631863882444, "grad_norm": 0.8288034117624636, "learning_rate": 9.981930675399416e-06, "loss": 0.0669, "step": 965 }, { "epoch": 0.37354988399071926, "grad_norm": 1.0867643991673905, "learning_rate": 9.981739076550987e-06, "loss": 0.0958, "step": 966 }, { "epoch": 0.3739365815931941, "grad_norm": 0.42301324291686365, "learning_rate": 9.98154646909777e-06, "loss": 0.0518, "step": 967 }, { "epoch": 0.374323279195669, "grad_norm": 0.5950160285537011, "learning_rate": 9.981352853078758e-06, "loss": 0.0673, "step": 968 }, { "epoch": 0.37470997679814383, "grad_norm": 0.4472819715526235, "learning_rate": 9.981158228533152e-06, "loss": 0.0599, "step": 969 }, { "epoch": 0.3750966744006187, "grad_norm": 0.5070490194964148, "learning_rate": 9.980962595500353e-06, "loss": 0.0494, "step": 970 }, { "epoch": 0.37548337200309356, "grad_norm": 0.6725690578560927, "learning_rate": 9.980765954019972e-06, "loss": 0.0592, "step": 971 }, { "epoch": 0.37587006960556846, "grad_norm": 0.42341267659591675, "learning_rate": 9.98056830413182e-06, "loss": 0.0582, "step": 972 }, { "epoch": 0.3762567672080433, "grad_norm": 0.5123513881147962, "learning_rate": 9.980369645875913e-06, "loss": 0.0664, "step": 973 }, { "epoch": 0.3766434648105182, "grad_norm": 0.46276170411328776, "learning_rate": 9.980169979292471e-06, "loss": 0.0498, "step": 974 }, { "epoch": 0.37703016241299303, "grad_norm": 0.48866765768204845, "learning_rate": 9.97996930442192e-06, "loss": 0.0569, "step": 975 }, { "epoch": 0.3774168600154679, "grad_norm": 0.7048190721898252, "learning_rate": 9.979767621304886e-06, "loss": 0.0597, "step": 976 }, { "epoch": 0.37780355761794276, "grad_norm": 0.5900599680251604, "learning_rate": 9.979564929982206e-06, "loss": 0.0722, "step": 977 }, { "epoch": 0.37819025522041766, "grad_norm": 0.45202981138429305, "learning_rate": 9.979361230494914e-06, "loss": 0.0792, "step": 978 }, { "epoch": 0.3785769528228925, "grad_norm": 0.4256088061640823, "learning_rate": 9.97915652288425e-06, "loss": 0.0769, "step": 979 }, { "epoch": 0.37896365042536734, "grad_norm": 0.38992481174119337, "learning_rate": 9.978950807191664e-06, "loss": 0.0482, "step": 980 }, { "epoch": 0.37935034802784223, "grad_norm": 0.3763954754676402, "learning_rate": 9.9787440834588e-06, "loss": 0.0437, "step": 981 }, { "epoch": 0.37973704563031707, "grad_norm": 0.7133517877151354, "learning_rate": 9.978536351727514e-06, "loss": 0.0822, "step": 982 }, { "epoch": 0.38012374323279197, "grad_norm": 0.5007976125777767, "learning_rate": 9.978327612039863e-06, "loss": 0.059, "step": 983 }, { "epoch": 0.3805104408352668, "grad_norm": 0.5864195936555736, "learning_rate": 9.978117864438109e-06, "loss": 0.0885, "step": 984 }, { "epoch": 0.3808971384377417, "grad_norm": 0.40157500045510547, "learning_rate": 9.977907108964714e-06, "loss": 0.0576, "step": 985 }, { "epoch": 0.38128383604021654, "grad_norm": 0.4515698831962873, "learning_rate": 9.977695345662354e-06, "loss": 0.0492, "step": 986 }, { "epoch": 0.38167053364269143, "grad_norm": 0.4419313308540604, "learning_rate": 9.9774825745739e-06, "loss": 0.064, "step": 987 }, { "epoch": 0.3820572312451663, "grad_norm": 0.3433473088195942, "learning_rate": 9.977268795742427e-06, "loss": 0.0465, "step": 988 }, { "epoch": 0.38244392884764117, "grad_norm": 0.41079978289074615, "learning_rate": 9.97705400921122e-06, "loss": 0.0574, "step": 989 }, { "epoch": 0.382830626450116, "grad_norm": 0.472970915289181, "learning_rate": 9.976838215023764e-06, "loss": 0.0602, "step": 990 }, { "epoch": 0.38321732405259085, "grad_norm": 0.3959155133756218, "learning_rate": 9.976621413223749e-06, "loss": 0.0452, "step": 991 }, { "epoch": 0.38360402165506574, "grad_norm": 0.5018166120090164, "learning_rate": 9.976403603855065e-06, "loss": 0.0622, "step": 992 }, { "epoch": 0.3839907192575406, "grad_norm": 0.8295370837586684, "learning_rate": 9.976184786961815e-06, "loss": 0.0775, "step": 993 }, { "epoch": 0.3843774168600155, "grad_norm": 0.5144535680144968, "learning_rate": 9.9759649625883e-06, "loss": 0.0601, "step": 994 }, { "epoch": 0.3847641144624903, "grad_norm": 0.5325184908633243, "learning_rate": 9.975744130779021e-06, "loss": 0.0679, "step": 995 }, { "epoch": 0.3851508120649652, "grad_norm": 0.4807996950401152, "learning_rate": 9.975522291578695e-06, "loss": 0.0576, "step": 996 }, { "epoch": 0.38553750966744005, "grad_norm": 0.6211961482802161, "learning_rate": 9.97529944503223e-06, "loss": 0.0654, "step": 997 }, { "epoch": 0.38592420726991494, "grad_norm": 0.4422863212097768, "learning_rate": 9.975075591184743e-06, "loss": 0.0425, "step": 998 }, { "epoch": 0.3863109048723898, "grad_norm": 0.3467511215052906, "learning_rate": 9.974850730081561e-06, "loss": 0.0459, "step": 999 }, { "epoch": 0.3866976024748647, "grad_norm": 0.4332883513055862, "learning_rate": 9.974624861768207e-06, "loss": 0.0633, "step": 1000 }, { "epoch": 0.3870843000773395, "grad_norm": 0.47151080452710586, "learning_rate": 9.974397986290408e-06, "loss": 0.0527, "step": 1001 }, { "epoch": 0.3874709976798144, "grad_norm": 0.704728870017558, "learning_rate": 9.974170103694098e-06, "loss": 0.0802, "step": 1002 }, { "epoch": 0.38785769528228925, "grad_norm": 0.5373592689651576, "learning_rate": 9.973941214025417e-06, "loss": 0.0665, "step": 1003 }, { "epoch": 0.3882443928847641, "grad_norm": 0.36764022466766816, "learning_rate": 9.973711317330705e-06, "loss": 0.0401, "step": 1004 }, { "epoch": 0.388631090487239, "grad_norm": 0.5076114548878248, "learning_rate": 9.973480413656504e-06, "loss": 0.0733, "step": 1005 }, { "epoch": 0.3890177880897138, "grad_norm": 0.43814982409265846, "learning_rate": 9.973248503049567e-06, "loss": 0.0466, "step": 1006 }, { "epoch": 0.3894044856921887, "grad_norm": 0.7825505694475446, "learning_rate": 9.973015585556845e-06, "loss": 0.07, "step": 1007 }, { "epoch": 0.38979118329466356, "grad_norm": 0.4748847660360474, "learning_rate": 9.972781661225493e-06, "loss": 0.0676, "step": 1008 }, { "epoch": 0.39017788089713845, "grad_norm": 0.3728610594318924, "learning_rate": 9.972546730102872e-06, "loss": 0.0504, "step": 1009 }, { "epoch": 0.3905645784996133, "grad_norm": 0.3685343835817228, "learning_rate": 9.97231079223655e-06, "loss": 0.0555, "step": 1010 }, { "epoch": 0.3909512761020882, "grad_norm": 0.36707145134566666, "learning_rate": 9.97207384767429e-06, "loss": 0.0569, "step": 1011 }, { "epoch": 0.391337973704563, "grad_norm": 0.4240225523878421, "learning_rate": 9.971835896464066e-06, "loss": 0.0569, "step": 1012 }, { "epoch": 0.3917246713070379, "grad_norm": 0.3958966572468328, "learning_rate": 9.971596938654055e-06, "loss": 0.0447, "step": 1013 }, { "epoch": 0.39211136890951276, "grad_norm": 0.3897523018275613, "learning_rate": 9.971356974292635e-06, "loss": 0.0513, "step": 1014 }, { "epoch": 0.39249806651198765, "grad_norm": 0.5440501457982967, "learning_rate": 9.97111600342839e-06, "loss": 0.056, "step": 1015 }, { "epoch": 0.3928847641144625, "grad_norm": 0.4393574685226052, "learning_rate": 9.970874026110104e-06, "loss": 0.0534, "step": 1016 }, { "epoch": 0.39327146171693733, "grad_norm": 0.36486659956877904, "learning_rate": 9.970631042386773e-06, "loss": 0.054, "step": 1017 }, { "epoch": 0.3936581593194122, "grad_norm": 0.5063043700541708, "learning_rate": 9.970387052307588e-06, "loss": 0.0521, "step": 1018 }, { "epoch": 0.39404485692188707, "grad_norm": 0.6066217225253538, "learning_rate": 9.970142055921948e-06, "loss": 0.0663, "step": 1019 }, { "epoch": 0.39443155452436196, "grad_norm": 0.4984919979178298, "learning_rate": 9.969896053279455e-06, "loss": 0.0684, "step": 1020 }, { "epoch": 0.3948182521268368, "grad_norm": 0.5596624228334957, "learning_rate": 9.969649044429915e-06, "loss": 0.0631, "step": 1021 }, { "epoch": 0.3952049497293117, "grad_norm": 0.48951971018173446, "learning_rate": 9.969401029423339e-06, "loss": 0.0609, "step": 1022 }, { "epoch": 0.39559164733178653, "grad_norm": 0.3721454300732626, "learning_rate": 9.969152008309937e-06, "loss": 0.0471, "step": 1023 }, { "epoch": 0.39597834493426143, "grad_norm": 0.5460103220274977, "learning_rate": 9.96890198114013e-06, "loss": 0.0588, "step": 1024 }, { "epoch": 0.39636504253673627, "grad_norm": 0.3547767245423931, "learning_rate": 9.968650947964534e-06, "loss": 0.0533, "step": 1025 }, { "epoch": 0.39675174013921116, "grad_norm": 0.4061870046600322, "learning_rate": 9.968398908833978e-06, "loss": 0.0493, "step": 1026 }, { "epoch": 0.397138437741686, "grad_norm": 1.1140687166637417, "learning_rate": 9.968145863799485e-06, "loss": 0.0785, "step": 1027 }, { "epoch": 0.39752513534416084, "grad_norm": 0.3825617861567935, "learning_rate": 9.967891812912292e-06, "loss": 0.0551, "step": 1028 }, { "epoch": 0.39791183294663574, "grad_norm": 0.3954092250154181, "learning_rate": 9.967636756223828e-06, "loss": 0.0485, "step": 1029 }, { "epoch": 0.3982985305491106, "grad_norm": 0.47185088400864217, "learning_rate": 9.967380693785738e-06, "loss": 0.0636, "step": 1030 }, { "epoch": 0.39868522815158547, "grad_norm": 0.40286453190987337, "learning_rate": 9.967123625649861e-06, "loss": 0.044, "step": 1031 }, { "epoch": 0.3990719257540603, "grad_norm": 0.7502590482559126, "learning_rate": 9.966865551868244e-06, "loss": 0.0872, "step": 1032 }, { "epoch": 0.3994586233565352, "grad_norm": 0.639390524944662, "learning_rate": 9.966606472493135e-06, "loss": 0.066, "step": 1033 }, { "epoch": 0.39984532095901004, "grad_norm": 0.5938129604958269, "learning_rate": 9.96634638757699e-06, "loss": 0.0601, "step": 1034 }, { "epoch": 0.40023201856148494, "grad_norm": 0.37752641460230574, "learning_rate": 9.966085297172465e-06, "loss": 0.0569, "step": 1035 }, { "epoch": 0.4006187161639598, "grad_norm": 0.41139635281535886, "learning_rate": 9.96582320133242e-06, "loss": 0.0401, "step": 1036 }, { "epoch": 0.40100541376643467, "grad_norm": 0.458802129153944, "learning_rate": 9.96556010010992e-06, "loss": 0.077, "step": 1037 }, { "epoch": 0.4013921113689095, "grad_norm": 0.8298395286034634, "learning_rate": 9.965295993558231e-06, "loss": 0.1072, "step": 1038 }, { "epoch": 0.4017788089713844, "grad_norm": 0.390701157461639, "learning_rate": 9.965030881730826e-06, "loss": 0.0479, "step": 1039 }, { "epoch": 0.40216550657385924, "grad_norm": 0.4952360785877923, "learning_rate": 9.964764764681378e-06, "loss": 0.0615, "step": 1040 }, { "epoch": 0.4025522041763341, "grad_norm": 0.46568838867056844, "learning_rate": 9.964497642463765e-06, "loss": 0.0584, "step": 1041 }, { "epoch": 0.402938901778809, "grad_norm": 0.3728941479162945, "learning_rate": 9.96422951513207e-06, "loss": 0.0498, "step": 1042 }, { "epoch": 0.4033255993812838, "grad_norm": 0.667847854225959, "learning_rate": 9.963960382740579e-06, "loss": 0.0653, "step": 1043 }, { "epoch": 0.4037122969837587, "grad_norm": 0.4930639297132769, "learning_rate": 9.963690245343776e-06, "loss": 0.052, "step": 1044 }, { "epoch": 0.40409899458623355, "grad_norm": 0.3843767525703053, "learning_rate": 9.96341910299636e-06, "loss": 0.0407, "step": 1045 }, { "epoch": 0.40448569218870845, "grad_norm": 0.46013609309054554, "learning_rate": 9.96314695575322e-06, "loss": 0.0681, "step": 1046 }, { "epoch": 0.4048723897911833, "grad_norm": 0.476880629768895, "learning_rate": 9.96287380366946e-06, "loss": 0.0485, "step": 1047 }, { "epoch": 0.4052590873936582, "grad_norm": 0.795755356522573, "learning_rate": 9.962599646800378e-06, "loss": 0.0598, "step": 1048 }, { "epoch": 0.405645784996133, "grad_norm": 0.44668433822679365, "learning_rate": 9.962324485201484e-06, "loss": 0.0549, "step": 1049 }, { "epoch": 0.4060324825986079, "grad_norm": 0.4854218431402898, "learning_rate": 9.962048318928486e-06, "loss": 0.0734, "step": 1050 }, { "epoch": 0.40641918020108275, "grad_norm": 0.5873604926992492, "learning_rate": 9.961771148037296e-06, "loss": 0.0609, "step": 1051 }, { "epoch": 0.4068058778035576, "grad_norm": 0.44986956849638193, "learning_rate": 9.96149297258403e-06, "loss": 0.05, "step": 1052 }, { "epoch": 0.4071925754060325, "grad_norm": 0.5108861067694761, "learning_rate": 9.961213792625008e-06, "loss": 0.0557, "step": 1053 }, { "epoch": 0.4075792730085073, "grad_norm": 0.5765627080118766, "learning_rate": 9.960933608216755e-06, "loss": 0.0682, "step": 1054 }, { "epoch": 0.4079659706109822, "grad_norm": 0.6307344568609409, "learning_rate": 9.960652419415992e-06, "loss": 0.0595, "step": 1055 }, { "epoch": 0.40835266821345706, "grad_norm": 0.5457611929817113, "learning_rate": 9.960370226279653e-06, "loss": 0.0568, "step": 1056 }, { "epoch": 0.40873936581593195, "grad_norm": 0.6217966535519959, "learning_rate": 9.960087028864871e-06, "loss": 0.0655, "step": 1057 }, { "epoch": 0.4091260634184068, "grad_norm": 0.5884802207692984, "learning_rate": 9.95980282722898e-06, "loss": 0.0479, "step": 1058 }, { "epoch": 0.4095127610208817, "grad_norm": 0.5339783691522887, "learning_rate": 9.95951762142952e-06, "loss": 0.0466, "step": 1059 }, { "epoch": 0.4098994586233565, "grad_norm": 0.5145565441025453, "learning_rate": 9.959231411524235e-06, "loss": 0.0578, "step": 1060 }, { "epoch": 0.4102861562258314, "grad_norm": 0.4767266880776152, "learning_rate": 9.95894419757107e-06, "loss": 0.0498, "step": 1061 }, { "epoch": 0.41067285382830626, "grad_norm": 0.5677784839179061, "learning_rate": 9.958655979628175e-06, "loss": 0.0731, "step": 1062 }, { "epoch": 0.41105955143078116, "grad_norm": 0.4165947442266816, "learning_rate": 9.958366757753902e-06, "loss": 0.0514, "step": 1063 }, { "epoch": 0.411446249033256, "grad_norm": 0.40446283017720824, "learning_rate": 9.958076532006808e-06, "loss": 0.0455, "step": 1064 }, { "epoch": 0.41183294663573083, "grad_norm": 0.9939606005923368, "learning_rate": 9.957785302445652e-06, "loss": 0.0625, "step": 1065 }, { "epoch": 0.41221964423820573, "grad_norm": 0.5346249333651013, "learning_rate": 9.957493069129398e-06, "loss": 0.0898, "step": 1066 }, { "epoch": 0.41260634184068057, "grad_norm": 0.6297315009364611, "learning_rate": 9.95719983211721e-06, "loss": 0.0769, "step": 1067 }, { "epoch": 0.41299303944315546, "grad_norm": 0.5005448848307944, "learning_rate": 9.956905591468455e-06, "loss": 0.0725, "step": 1068 }, { "epoch": 0.4133797370456303, "grad_norm": 0.6323490446178901, "learning_rate": 9.956610347242708e-06, "loss": 0.0539, "step": 1069 }, { "epoch": 0.4137664346481052, "grad_norm": 0.5689231130123967, "learning_rate": 9.956314099499742e-06, "loss": 0.055, "step": 1070 }, { "epoch": 0.41415313225058004, "grad_norm": 0.5731413751267299, "learning_rate": 9.956016848299537e-06, "loss": 0.0671, "step": 1071 }, { "epoch": 0.41453982985305493, "grad_norm": 0.5943655298626591, "learning_rate": 9.955718593702275e-06, "loss": 0.0558, "step": 1072 }, { "epoch": 0.41492652745552977, "grad_norm": 0.4441173074959687, "learning_rate": 9.955419335768341e-06, "loss": 0.0522, "step": 1073 }, { "epoch": 0.41531322505800466, "grad_norm": 0.4982713391804292, "learning_rate": 9.95511907455832e-06, "loss": 0.0476, "step": 1074 }, { "epoch": 0.4156999226604795, "grad_norm": 0.5586645663234033, "learning_rate": 9.954817810133007e-06, "loss": 0.0704, "step": 1075 }, { "epoch": 0.41608662026295434, "grad_norm": 0.48663138719549265, "learning_rate": 9.954515542553392e-06, "loss": 0.0564, "step": 1076 }, { "epoch": 0.41647331786542924, "grad_norm": 0.3616363143104271, "learning_rate": 9.954212271880673e-06, "loss": 0.0504, "step": 1077 }, { "epoch": 0.4168600154679041, "grad_norm": 0.47544841846950275, "learning_rate": 9.953907998176253e-06, "loss": 0.0454, "step": 1078 }, { "epoch": 0.41724671307037897, "grad_norm": 0.4741712311216125, "learning_rate": 9.953602721501734e-06, "loss": 0.0516, "step": 1079 }, { "epoch": 0.4176334106728538, "grad_norm": 0.5663341882476278, "learning_rate": 9.953296441918923e-06, "loss": 0.0668, "step": 1080 }, { "epoch": 0.4180201082753287, "grad_norm": 0.5366688638486927, "learning_rate": 9.952989159489827e-06, "loss": 0.0578, "step": 1081 }, { "epoch": 0.41840680587780354, "grad_norm": 0.426067251716693, "learning_rate": 9.952680874276662e-06, "loss": 0.0499, "step": 1082 }, { "epoch": 0.41879350348027844, "grad_norm": 0.48302889529184595, "learning_rate": 9.95237158634184e-06, "loss": 0.0578, "step": 1083 }, { "epoch": 0.4191802010827533, "grad_norm": 0.5332504064627314, "learning_rate": 9.952061295747983e-06, "loss": 0.0697, "step": 1084 }, { "epoch": 0.4195668986852282, "grad_norm": 0.46651994331517016, "learning_rate": 9.951750002557909e-06, "loss": 0.0485, "step": 1085 }, { "epoch": 0.419953596287703, "grad_norm": 0.5062373707685688, "learning_rate": 9.951437706834646e-06, "loss": 0.0687, "step": 1086 }, { "epoch": 0.4203402938901779, "grad_norm": 0.42190617559776983, "learning_rate": 9.951124408641419e-06, "loss": 0.0583, "step": 1087 }, { "epoch": 0.42072699149265275, "grad_norm": 0.5619774958377082, "learning_rate": 9.950810108041662e-06, "loss": 0.0568, "step": 1088 }, { "epoch": 0.4211136890951276, "grad_norm": 0.5350401106572803, "learning_rate": 9.950494805099003e-06, "loss": 0.0658, "step": 1089 }, { "epoch": 0.4215003866976025, "grad_norm": 0.37618803692088637, "learning_rate": 9.950178499877281e-06, "loss": 0.0398, "step": 1090 }, { "epoch": 0.4218870843000773, "grad_norm": 0.3758279183163323, "learning_rate": 9.949861192440537e-06, "loss": 0.0483, "step": 1091 }, { "epoch": 0.4222737819025522, "grad_norm": 0.6752123771112816, "learning_rate": 9.949542882853012e-06, "loss": 0.0722, "step": 1092 }, { "epoch": 0.42266047950502705, "grad_norm": 0.4327211716521404, "learning_rate": 9.949223571179149e-06, "loss": 0.0441, "step": 1093 }, { "epoch": 0.42304717710750195, "grad_norm": 0.5473489017719713, "learning_rate": 9.9489032574836e-06, "loss": 0.0703, "step": 1094 }, { "epoch": 0.4234338747099768, "grad_norm": 0.4503298962450428, "learning_rate": 9.948581941831212e-06, "loss": 0.0631, "step": 1095 }, { "epoch": 0.4238205723124517, "grad_norm": 0.3684031143909234, "learning_rate": 9.94825962428704e-06, "loss": 0.0464, "step": 1096 }, { "epoch": 0.4242072699149265, "grad_norm": 0.4299741190921024, "learning_rate": 9.947936304916342e-06, "loss": 0.0546, "step": 1097 }, { "epoch": 0.4245939675174014, "grad_norm": 0.40198921278468575, "learning_rate": 9.947611983784575e-06, "loss": 0.047, "step": 1098 }, { "epoch": 0.42498066511987626, "grad_norm": 0.6230909486714461, "learning_rate": 9.947286660957402e-06, "loss": 0.0685, "step": 1099 }, { "epoch": 0.4253673627223511, "grad_norm": 0.4048000945684485, "learning_rate": 9.94696033650069e-06, "loss": 0.0552, "step": 1100 }, { "epoch": 0.425754060324826, "grad_norm": 0.5156995002962098, "learning_rate": 9.946633010480504e-06, "loss": 0.0515, "step": 1101 }, { "epoch": 0.42614075792730083, "grad_norm": 0.5428357551082301, "learning_rate": 9.946304682963115e-06, "loss": 0.0874, "step": 1102 }, { "epoch": 0.4265274555297757, "grad_norm": 0.35659314561006644, "learning_rate": 9.945975354014997e-06, "loss": 0.047, "step": 1103 }, { "epoch": 0.42691415313225056, "grad_norm": 0.6713303615477236, "learning_rate": 9.945645023702826e-06, "loss": 0.0527, "step": 1104 }, { "epoch": 0.42730085073472546, "grad_norm": 0.5298857235235616, "learning_rate": 9.945313692093482e-06, "loss": 0.0536, "step": 1105 }, { "epoch": 0.4276875483372003, "grad_norm": 0.4940263793111875, "learning_rate": 9.944981359254044e-06, "loss": 0.0507, "step": 1106 }, { "epoch": 0.4280742459396752, "grad_norm": 0.6868867845716515, "learning_rate": 9.9446480252518e-06, "loss": 0.0554, "step": 1107 }, { "epoch": 0.42846094354215003, "grad_norm": 0.45535956136274536, "learning_rate": 9.944313690154232e-06, "loss": 0.056, "step": 1108 }, { "epoch": 0.4288476411446249, "grad_norm": 0.4082887160720819, "learning_rate": 9.943978354029033e-06, "loss": 0.0506, "step": 1109 }, { "epoch": 0.42923433874709976, "grad_norm": 0.5398495145796117, "learning_rate": 9.943642016944094e-06, "loss": 0.0702, "step": 1110 }, { "epoch": 0.42962103634957466, "grad_norm": 0.4300306641233883, "learning_rate": 9.943304678967509e-06, "loss": 0.0469, "step": 1111 }, { "epoch": 0.4300077339520495, "grad_norm": 0.38739238160831657, "learning_rate": 9.94296634016758e-06, "loss": 0.0548, "step": 1112 }, { "epoch": 0.43039443155452434, "grad_norm": 0.4761866698354302, "learning_rate": 9.9426270006128e-06, "loss": 0.0568, "step": 1113 }, { "epoch": 0.43078112915699923, "grad_norm": 0.4457211887148132, "learning_rate": 9.94228666037188e-06, "loss": 0.0793, "step": 1114 }, { "epoch": 0.43116782675947407, "grad_norm": 0.6078199566978735, "learning_rate": 9.94194531951372e-06, "loss": 0.0645, "step": 1115 }, { "epoch": 0.43155452436194897, "grad_norm": 0.35699878060168416, "learning_rate": 9.941602978107429e-06, "loss": 0.0464, "step": 1116 }, { "epoch": 0.4319412219644238, "grad_norm": 0.5423466264404657, "learning_rate": 9.941259636222317e-06, "loss": 0.0517, "step": 1117 }, { "epoch": 0.4323279195668987, "grad_norm": 0.6540436928626443, "learning_rate": 9.9409152939279e-06, "loss": 0.0501, "step": 1118 }, { "epoch": 0.43271461716937354, "grad_norm": 0.49746340066879696, "learning_rate": 9.940569951293888e-06, "loss": 0.0671, "step": 1119 }, { "epoch": 0.43310131477184843, "grad_norm": 0.7652971216240854, "learning_rate": 9.940223608390206e-06, "loss": 0.0532, "step": 1120 }, { "epoch": 0.4334880123743233, "grad_norm": 0.4596453810914782, "learning_rate": 9.939876265286972e-06, "loss": 0.0685, "step": 1121 }, { "epoch": 0.43387470997679817, "grad_norm": 0.4744829731149929, "learning_rate": 9.939527922054507e-06, "loss": 0.0835, "step": 1122 }, { "epoch": 0.434261407579273, "grad_norm": 0.42596902929185565, "learning_rate": 9.93917857876334e-06, "loss": 0.0503, "step": 1123 }, { "epoch": 0.43464810518174785, "grad_norm": 0.41209522870590815, "learning_rate": 9.938828235484198e-06, "loss": 0.0473, "step": 1124 }, { "epoch": 0.43503480278422274, "grad_norm": 0.410534172008193, "learning_rate": 9.93847689228801e-06, "loss": 0.0484, "step": 1125 }, { "epoch": 0.4354215003866976, "grad_norm": 0.6652130894026292, "learning_rate": 9.938124549245909e-06, "loss": 0.0519, "step": 1126 }, { "epoch": 0.4358081979891725, "grad_norm": 0.4144242528546377, "learning_rate": 9.937771206429235e-06, "loss": 0.0496, "step": 1127 }, { "epoch": 0.4361948955916473, "grad_norm": 0.46128508800201673, "learning_rate": 9.937416863909521e-06, "loss": 0.0662, "step": 1128 }, { "epoch": 0.4365815931941222, "grad_norm": 0.37752376114771, "learning_rate": 9.93706152175851e-06, "loss": 0.0509, "step": 1129 }, { "epoch": 0.43696829079659705, "grad_norm": 0.5335372806662612, "learning_rate": 9.936705180048143e-06, "loss": 0.0506, "step": 1130 }, { "epoch": 0.43735498839907194, "grad_norm": 0.5744507136075062, "learning_rate": 9.936347838850567e-06, "loss": 0.0582, "step": 1131 }, { "epoch": 0.4377416860015468, "grad_norm": 0.5026051970488705, "learning_rate": 9.935989498238126e-06, "loss": 0.0628, "step": 1132 }, { "epoch": 0.4381283836040217, "grad_norm": 0.5051510242517794, "learning_rate": 9.935630158283375e-06, "loss": 0.0535, "step": 1133 }, { "epoch": 0.4385150812064965, "grad_norm": 0.30788209105168557, "learning_rate": 9.93526981905906e-06, "loss": 0.0414, "step": 1134 }, { "epoch": 0.4389017788089714, "grad_norm": 0.3964684709430627, "learning_rate": 9.934908480638143e-06, "loss": 0.0512, "step": 1135 }, { "epoch": 0.43928847641144625, "grad_norm": 0.3892153731317427, "learning_rate": 9.934546143093774e-06, "loss": 0.0598, "step": 1136 }, { "epoch": 0.4396751740139211, "grad_norm": 0.5665973776801057, "learning_rate": 9.934182806499315e-06, "loss": 0.0635, "step": 1137 }, { "epoch": 0.440061871616396, "grad_norm": 0.478303367261044, "learning_rate": 9.933818470928325e-06, "loss": 0.0498, "step": 1138 }, { "epoch": 0.4404485692188708, "grad_norm": 0.5530354240577331, "learning_rate": 9.933453136454572e-06, "loss": 0.0597, "step": 1139 }, { "epoch": 0.4408352668213457, "grad_norm": 0.3615092562839036, "learning_rate": 9.933086803152017e-06, "loss": 0.0387, "step": 1140 }, { "epoch": 0.44122196442382056, "grad_norm": 0.3481161240516007, "learning_rate": 9.93271947109483e-06, "loss": 0.0447, "step": 1141 }, { "epoch": 0.44160866202629545, "grad_norm": 0.45101532490397866, "learning_rate": 9.932351140357382e-06, "loss": 0.0451, "step": 1142 }, { "epoch": 0.4419953596287703, "grad_norm": 0.5158811225147313, "learning_rate": 9.931981811014242e-06, "loss": 0.0492, "step": 1143 }, { "epoch": 0.4423820572312452, "grad_norm": 0.48248673929788494, "learning_rate": 9.93161148314019e-06, "loss": 0.0572, "step": 1144 }, { "epoch": 0.44276875483372, "grad_norm": 0.356022049020988, "learning_rate": 9.931240156810198e-06, "loss": 0.0395, "step": 1145 }, { "epoch": 0.4431554524361949, "grad_norm": 0.5291041888339757, "learning_rate": 9.930867832099448e-06, "loss": 0.0549, "step": 1146 }, { "epoch": 0.44354215003866976, "grad_norm": 0.3482550536889346, "learning_rate": 9.93049450908332e-06, "loss": 0.0414, "step": 1147 }, { "epoch": 0.4439288476411446, "grad_norm": 0.49683173616126447, "learning_rate": 9.930120187837394e-06, "loss": 0.0516, "step": 1148 }, { "epoch": 0.4443155452436195, "grad_norm": 1.150584287241269, "learning_rate": 9.92974486843746e-06, "loss": 0.0758, "step": 1149 }, { "epoch": 0.44470224284609433, "grad_norm": 0.3628181772485553, "learning_rate": 9.929368550959503e-06, "loss": 0.0533, "step": 1150 }, { "epoch": 0.4450889404485692, "grad_norm": 0.5245123001898563, "learning_rate": 9.928991235479713e-06, "loss": 0.0617, "step": 1151 }, { "epoch": 0.44547563805104406, "grad_norm": 0.3906210422266261, "learning_rate": 9.92861292207448e-06, "loss": 0.0563, "step": 1152 }, { "epoch": 0.44586233565351896, "grad_norm": 0.4467522248679796, "learning_rate": 9.9282336108204e-06, "loss": 0.0514, "step": 1153 }, { "epoch": 0.4462490332559938, "grad_norm": 0.52534053973824, "learning_rate": 9.927853301794266e-06, "loss": 0.0668, "step": 1154 }, { "epoch": 0.4466357308584687, "grad_norm": 0.3680918930701901, "learning_rate": 9.927471995073076e-06, "loss": 0.0453, "step": 1155 }, { "epoch": 0.44702242846094353, "grad_norm": 0.2910830989671139, "learning_rate": 9.92708969073403e-06, "loss": 0.0332, "step": 1156 }, { "epoch": 0.4474091260634184, "grad_norm": 0.4315721508670282, "learning_rate": 9.926706388854533e-06, "loss": 0.0521, "step": 1157 }, { "epoch": 0.44779582366589327, "grad_norm": 0.3725477504633664, "learning_rate": 9.92632208951218e-06, "loss": 0.0521, "step": 1158 }, { "epoch": 0.44818252126836816, "grad_norm": 0.39363770933609554, "learning_rate": 9.925936792784784e-06, "loss": 0.0418, "step": 1159 }, { "epoch": 0.448569218870843, "grad_norm": 0.5591942763835195, "learning_rate": 9.925550498750352e-06, "loss": 0.0511, "step": 1160 }, { "epoch": 0.44895591647331784, "grad_norm": 0.5213405794856998, "learning_rate": 9.925163207487087e-06, "loss": 0.0659, "step": 1161 }, { "epoch": 0.44934261407579273, "grad_norm": 0.2639159614419688, "learning_rate": 9.924774919073405e-06, "loss": 0.0364, "step": 1162 }, { "epoch": 0.4497293116782676, "grad_norm": 0.4044753871155672, "learning_rate": 9.92438563358792e-06, "loss": 0.0606, "step": 1163 }, { "epoch": 0.45011600928074247, "grad_norm": 0.3263713465256438, "learning_rate": 9.923995351109443e-06, "loss": 0.0404, "step": 1164 }, { "epoch": 0.4505027068832173, "grad_norm": 0.5197721003465927, "learning_rate": 9.923604071716994e-06, "loss": 0.0504, "step": 1165 }, { "epoch": 0.4508894044856922, "grad_norm": 0.3477788231227501, "learning_rate": 9.923211795489792e-06, "loss": 0.0505, "step": 1166 }, { "epoch": 0.45127610208816704, "grad_norm": 0.3760969876205044, "learning_rate": 9.922818522507251e-06, "loss": 0.0489, "step": 1167 }, { "epoch": 0.45166279969064194, "grad_norm": 0.3670731651768137, "learning_rate": 9.922424252849002e-06, "loss": 0.0431, "step": 1168 }, { "epoch": 0.4520494972931168, "grad_norm": 0.7658247441660944, "learning_rate": 9.922028986594862e-06, "loss": 0.0891, "step": 1169 }, { "epoch": 0.45243619489559167, "grad_norm": 0.5767921639486664, "learning_rate": 9.921632723824862e-06, "loss": 0.0653, "step": 1170 }, { "epoch": 0.4528228924980665, "grad_norm": 0.37854005285133674, "learning_rate": 9.921235464619225e-06, "loss": 0.0482, "step": 1171 }, { "epoch": 0.4532095901005414, "grad_norm": 0.5276612960112592, "learning_rate": 9.920837209058385e-06, "loss": 0.0602, "step": 1172 }, { "epoch": 0.45359628770301624, "grad_norm": 0.4819289738854553, "learning_rate": 9.920437957222967e-06, "loss": 0.0546, "step": 1173 }, { "epoch": 0.4539829853054911, "grad_norm": 0.4151551102272604, "learning_rate": 9.92003770919381e-06, "loss": 0.0504, "step": 1174 }, { "epoch": 0.454369682907966, "grad_norm": 0.35216427033253017, "learning_rate": 9.919636465051945e-06, "loss": 0.0471, "step": 1175 }, { "epoch": 0.4547563805104408, "grad_norm": 0.2930113395484997, "learning_rate": 9.919234224878609e-06, "loss": 0.0257, "step": 1176 }, { "epoch": 0.4551430781129157, "grad_norm": 0.4585713064993994, "learning_rate": 9.918830988755237e-06, "loss": 0.0498, "step": 1177 }, { "epoch": 0.45552977571539055, "grad_norm": 0.6562220808034819, "learning_rate": 9.918426756763473e-06, "loss": 0.0666, "step": 1178 }, { "epoch": 0.45591647331786544, "grad_norm": 0.7418745755106425, "learning_rate": 9.918021528985153e-06, "loss": 0.0736, "step": 1179 }, { "epoch": 0.4563031709203403, "grad_norm": 0.5606740842980097, "learning_rate": 9.917615305502323e-06, "loss": 0.0603, "step": 1180 }, { "epoch": 0.4566898685228152, "grad_norm": 0.684685649275116, "learning_rate": 9.917208086397228e-06, "loss": 0.0676, "step": 1181 }, { "epoch": 0.45707656612529, "grad_norm": 0.43806169568554276, "learning_rate": 9.91679987175231e-06, "loss": 0.0586, "step": 1182 }, { "epoch": 0.4574632637277649, "grad_norm": 0.6325250208780754, "learning_rate": 9.916390661650218e-06, "loss": 0.0596, "step": 1183 }, { "epoch": 0.45784996133023975, "grad_norm": 0.5302869214350849, "learning_rate": 9.915980456173802e-06, "loss": 0.0637, "step": 1184 }, { "epoch": 0.4582366589327146, "grad_norm": 0.45444748034261084, "learning_rate": 9.915569255406111e-06, "loss": 0.0587, "step": 1185 }, { "epoch": 0.4586233565351895, "grad_norm": 0.6668979108403864, "learning_rate": 9.915157059430398e-06, "loss": 0.0463, "step": 1186 }, { "epoch": 0.4590100541376643, "grad_norm": 0.6122232342897236, "learning_rate": 9.914743868330115e-06, "loss": 0.0582, "step": 1187 }, { "epoch": 0.4593967517401392, "grad_norm": 0.40469913949958664, "learning_rate": 9.914329682188916e-06, "loss": 0.0614, "step": 1188 }, { "epoch": 0.45978344934261406, "grad_norm": 0.5664580006214291, "learning_rate": 9.913914501090661e-06, "loss": 0.0646, "step": 1189 }, { "epoch": 0.46017014694508895, "grad_norm": 0.299340330395943, "learning_rate": 9.913498325119404e-06, "loss": 0.0345, "step": 1190 }, { "epoch": 0.4605568445475638, "grad_norm": 0.43522547888898844, "learning_rate": 9.913081154359408e-06, "loss": 0.0563, "step": 1191 }, { "epoch": 0.4609435421500387, "grad_norm": 0.8063464595014194, "learning_rate": 9.91266298889513e-06, "loss": 0.0786, "step": 1192 }, { "epoch": 0.4613302397525135, "grad_norm": 0.5544080988610366, "learning_rate": 9.912243828811233e-06, "loss": 0.0492, "step": 1193 }, { "epoch": 0.4617169373549884, "grad_norm": 0.4618487621389859, "learning_rate": 9.91182367419258e-06, "loss": 0.0607, "step": 1194 }, { "epoch": 0.46210363495746326, "grad_norm": 0.38899039261695606, "learning_rate": 9.911402525124235e-06, "loss": 0.0447, "step": 1195 }, { "epoch": 0.46249033255993816, "grad_norm": 0.3868905248765946, "learning_rate": 9.910980381691466e-06, "loss": 0.0513, "step": 1196 }, { "epoch": 0.462877030162413, "grad_norm": 0.5360243264100125, "learning_rate": 9.91055724397974e-06, "loss": 0.0726, "step": 1197 }, { "epoch": 0.46326372776488783, "grad_norm": 0.5013791888875476, "learning_rate": 9.910133112074724e-06, "loss": 0.0626, "step": 1198 }, { "epoch": 0.46365042536736273, "grad_norm": 0.4170345727403757, "learning_rate": 9.909707986062291e-06, "loss": 0.0614, "step": 1199 }, { "epoch": 0.46403712296983757, "grad_norm": 0.4928846325761155, "learning_rate": 9.909281866028508e-06, "loss": 0.0506, "step": 1200 }, { "epoch": 0.46442382057231246, "grad_norm": 0.3833669856166002, "learning_rate": 9.908854752059651e-06, "loss": 0.0528, "step": 1201 }, { "epoch": 0.4648105181747873, "grad_norm": 0.6443026944615676, "learning_rate": 9.90842664424219e-06, "loss": 0.0657, "step": 1202 }, { "epoch": 0.4651972157772622, "grad_norm": 0.520708111491466, "learning_rate": 9.907997542662804e-06, "loss": 0.0459, "step": 1203 }, { "epoch": 0.46558391337973704, "grad_norm": 0.4160220677004582, "learning_rate": 9.907567447408367e-06, "loss": 0.0565, "step": 1204 }, { "epoch": 0.46597061098221193, "grad_norm": 0.3847228041122695, "learning_rate": 9.907136358565956e-06, "loss": 0.0618, "step": 1205 }, { "epoch": 0.46635730858468677, "grad_norm": 0.4872085038745401, "learning_rate": 9.906704276222849e-06, "loss": 0.0651, "step": 1206 }, { "epoch": 0.46674400618716166, "grad_norm": 0.7230777869254934, "learning_rate": 9.906271200466528e-06, "loss": 0.0688, "step": 1207 }, { "epoch": 0.4671307037896365, "grad_norm": 0.44536568232294416, "learning_rate": 9.905837131384672e-06, "loss": 0.0542, "step": 1208 }, { "epoch": 0.46751740139211134, "grad_norm": 0.5004129307939688, "learning_rate": 9.905402069065161e-06, "loss": 0.0468, "step": 1209 }, { "epoch": 0.46790409899458624, "grad_norm": 0.4900935245699128, "learning_rate": 9.904966013596081e-06, "loss": 0.081, "step": 1210 }, { "epoch": 0.4682907965970611, "grad_norm": 0.3468667511569187, "learning_rate": 9.904528965065714e-06, "loss": 0.0444, "step": 1211 }, { "epoch": 0.46867749419953597, "grad_norm": 0.5129433609952109, "learning_rate": 9.904090923562547e-06, "loss": 0.0652, "step": 1212 }, { "epoch": 0.4690641918020108, "grad_norm": 0.42354255075710673, "learning_rate": 9.903651889175263e-06, "loss": 0.053, "step": 1213 }, { "epoch": 0.4694508894044857, "grad_norm": 0.8682119982220625, "learning_rate": 9.903211861992752e-06, "loss": 0.0551, "step": 1214 }, { "epoch": 0.46983758700696054, "grad_norm": 0.43654373992703754, "learning_rate": 9.9027708421041e-06, "loss": 0.0552, "step": 1215 }, { "epoch": 0.47022428460943544, "grad_norm": 0.5076524402423342, "learning_rate": 9.902328829598597e-06, "loss": 0.0531, "step": 1216 }, { "epoch": 0.4706109822119103, "grad_norm": 0.39942915741165436, "learning_rate": 9.901885824565735e-06, "loss": 0.0562, "step": 1217 }, { "epoch": 0.4709976798143852, "grad_norm": 0.44697549068855025, "learning_rate": 9.901441827095201e-06, "loss": 0.0713, "step": 1218 }, { "epoch": 0.47138437741686, "grad_norm": 0.6985195280231931, "learning_rate": 9.90099683727689e-06, "loss": 0.0552, "step": 1219 }, { "epoch": 0.4717710750193349, "grad_norm": 0.35668622275059336, "learning_rate": 9.900550855200892e-06, "loss": 0.0445, "step": 1220 }, { "epoch": 0.47215777262180975, "grad_norm": 0.40216542883744416, "learning_rate": 9.900103880957503e-06, "loss": 0.0384, "step": 1221 }, { "epoch": 0.4725444702242846, "grad_norm": 0.34269654369985975, "learning_rate": 9.899655914637216e-06, "loss": 0.0413, "step": 1222 }, { "epoch": 0.4729311678267595, "grad_norm": 0.4148892293647555, "learning_rate": 9.89920695633073e-06, "loss": 0.0554, "step": 1223 }, { "epoch": 0.4733178654292343, "grad_norm": 0.3778826939578081, "learning_rate": 9.898757006128935e-06, "loss": 0.0404, "step": 1224 }, { "epoch": 0.4737045630317092, "grad_norm": 0.43490502654493324, "learning_rate": 9.898306064122933e-06, "loss": 0.0574, "step": 1225 }, { "epoch": 0.47409126063418405, "grad_norm": 0.36765031760614214, "learning_rate": 9.897854130404022e-06, "loss": 0.0452, "step": 1226 }, { "epoch": 0.47447795823665895, "grad_norm": 0.34415279592664566, "learning_rate": 9.897401205063697e-06, "loss": 0.0416, "step": 1227 }, { "epoch": 0.4748646558391338, "grad_norm": 0.4908420118610618, "learning_rate": 9.896947288193662e-06, "loss": 0.0587, "step": 1228 }, { "epoch": 0.4752513534416087, "grad_norm": 0.5002074478789393, "learning_rate": 9.896492379885814e-06, "loss": 0.0508, "step": 1229 }, { "epoch": 0.4756380510440835, "grad_norm": 0.36704232730655706, "learning_rate": 9.896036480232255e-06, "loss": 0.0442, "step": 1230 }, { "epoch": 0.4760247486465584, "grad_norm": 0.4145181127232641, "learning_rate": 9.895579589325285e-06, "loss": 0.0537, "step": 1231 }, { "epoch": 0.47641144624903325, "grad_norm": 0.4184561749376356, "learning_rate": 9.895121707257409e-06, "loss": 0.0481, "step": 1232 }, { "epoch": 0.4767981438515081, "grad_norm": 0.45646038897728325, "learning_rate": 9.894662834121329e-06, "loss": 0.0626, "step": 1233 }, { "epoch": 0.477184841453983, "grad_norm": 0.6721924890507573, "learning_rate": 9.894202970009947e-06, "loss": 0.0679, "step": 1234 }, { "epoch": 0.47757153905645783, "grad_norm": 0.4741480801079458, "learning_rate": 9.893742115016369e-06, "loss": 0.0636, "step": 1235 }, { "epoch": 0.4779582366589327, "grad_norm": 0.5592726457040439, "learning_rate": 9.8932802692339e-06, "loss": 0.0716, "step": 1236 }, { "epoch": 0.47834493426140756, "grad_norm": 0.4419762602980459, "learning_rate": 9.892817432756044e-06, "loss": 0.0562, "step": 1237 }, { "epoch": 0.47873163186388246, "grad_norm": 0.3785600074919249, "learning_rate": 9.89235360567651e-06, "loss": 0.0507, "step": 1238 }, { "epoch": 0.4791183294663573, "grad_norm": 0.39296342345889074, "learning_rate": 9.8918887880892e-06, "loss": 0.0597, "step": 1239 }, { "epoch": 0.4795050270688322, "grad_norm": 0.4480845067164168, "learning_rate": 9.891422980088225e-06, "loss": 0.0502, "step": 1240 }, { "epoch": 0.47989172467130703, "grad_norm": 0.48912656441389113, "learning_rate": 9.89095618176789e-06, "loss": 0.0579, "step": 1241 }, { "epoch": 0.4802784222737819, "grad_norm": 0.4788438344893242, "learning_rate": 9.890488393222706e-06, "loss": 0.0605, "step": 1242 }, { "epoch": 0.48066511987625676, "grad_norm": 0.5259796338836912, "learning_rate": 9.890019614547381e-06, "loss": 0.0623, "step": 1243 }, { "epoch": 0.48105181747873166, "grad_norm": 0.35514898787278076, "learning_rate": 9.889549845836821e-06, "loss": 0.0462, "step": 1244 }, { "epoch": 0.4814385150812065, "grad_norm": 0.42982104666459753, "learning_rate": 9.88907908718614e-06, "loss": 0.0413, "step": 1245 }, { "epoch": 0.48182521268368134, "grad_norm": 0.6368705294495326, "learning_rate": 9.888607338690645e-06, "loss": 0.0669, "step": 1246 }, { "epoch": 0.48221191028615623, "grad_norm": 0.492654382237399, "learning_rate": 9.888134600445846e-06, "loss": 0.0535, "step": 1247 }, { "epoch": 0.48259860788863107, "grad_norm": 0.444481353673135, "learning_rate": 9.887660872547457e-06, "loss": 0.0639, "step": 1248 }, { "epoch": 0.48298530549110597, "grad_norm": 0.4712201702473108, "learning_rate": 9.887186155091386e-06, "loss": 0.0529, "step": 1249 }, { "epoch": 0.4833720030935808, "grad_norm": 0.44204767164410574, "learning_rate": 9.886710448173747e-06, "loss": 0.0558, "step": 1250 }, { "epoch": 0.4837587006960557, "grad_norm": 0.32880177453761494, "learning_rate": 9.886233751890848e-06, "loss": 0.0409, "step": 1251 }, { "epoch": 0.48414539829853054, "grad_norm": 0.3526036880361751, "learning_rate": 9.885756066339204e-06, "loss": 0.045, "step": 1252 }, { "epoch": 0.48453209590100543, "grad_norm": 0.6672316095041897, "learning_rate": 9.88527739161553e-06, "loss": 0.0567, "step": 1253 }, { "epoch": 0.48491879350348027, "grad_norm": 0.5220660506924161, "learning_rate": 9.884797727816732e-06, "loss": 0.0587, "step": 1254 }, { "epoch": 0.48530549110595517, "grad_norm": 0.6147314648093127, "learning_rate": 9.884317075039927e-06, "loss": 0.063, "step": 1255 }, { "epoch": 0.48569218870843, "grad_norm": 0.34634911723978884, "learning_rate": 9.88383543338243e-06, "loss": 0.0421, "step": 1256 }, { "epoch": 0.48607888631090485, "grad_norm": 0.5337233600424439, "learning_rate": 9.883352802941748e-06, "loss": 0.0447, "step": 1257 }, { "epoch": 0.48646558391337974, "grad_norm": 0.6755237183348837, "learning_rate": 9.882869183815603e-06, "loss": 0.063, "step": 1258 }, { "epoch": 0.4868522815158546, "grad_norm": 3.3369701989745297, "learning_rate": 9.8823845761019e-06, "loss": 0.0559, "step": 1259 }, { "epoch": 0.4872389791183295, "grad_norm": 0.3781971355291517, "learning_rate": 9.88189897989876e-06, "loss": 0.059, "step": 1260 }, { "epoch": 0.4876256767208043, "grad_norm": 0.3891982371511904, "learning_rate": 9.881412395304492e-06, "loss": 0.048, "step": 1261 }, { "epoch": 0.4880123743232792, "grad_norm": 0.42844633474911525, "learning_rate": 9.880924822417615e-06, "loss": 0.042, "step": 1262 }, { "epoch": 0.48839907192575405, "grad_norm": 0.2955242421804777, "learning_rate": 9.880436261336838e-06, "loss": 0.0447, "step": 1263 }, { "epoch": 0.48878576952822894, "grad_norm": 0.3629532615214767, "learning_rate": 9.879946712161078e-06, "loss": 0.0531, "step": 1264 }, { "epoch": 0.4891724671307038, "grad_norm": 0.6906917406756481, "learning_rate": 9.87945617498945e-06, "loss": 0.0585, "step": 1265 }, { "epoch": 0.4895591647331787, "grad_norm": 0.5188924957558806, "learning_rate": 9.878964649921265e-06, "loss": 0.0411, "step": 1266 }, { "epoch": 0.4899458623356535, "grad_norm": 0.6018172635177467, "learning_rate": 9.878472137056044e-06, "loss": 0.0546, "step": 1267 }, { "epoch": 0.4903325599381284, "grad_norm": 0.6268528554655345, "learning_rate": 9.877978636493493e-06, "loss": 0.0436, "step": 1268 }, { "epoch": 0.49071925754060325, "grad_norm": 0.3983912776495477, "learning_rate": 9.877484148333532e-06, "loss": 0.0434, "step": 1269 }, { "epoch": 0.4911059551430781, "grad_norm": 0.5704418201728969, "learning_rate": 9.876988672676273e-06, "loss": 0.068, "step": 1270 }, { "epoch": 0.491492652745553, "grad_norm": 0.6508359448934651, "learning_rate": 9.87649220962203e-06, "loss": 0.0602, "step": 1271 }, { "epoch": 0.4918793503480278, "grad_norm": 0.4567139378124477, "learning_rate": 9.875994759271318e-06, "loss": 0.0519, "step": 1272 }, { "epoch": 0.4922660479505027, "grad_norm": 0.4872636745796592, "learning_rate": 9.875496321724853e-06, "loss": 0.0637, "step": 1273 }, { "epoch": 0.49265274555297756, "grad_norm": 0.4878168675606023, "learning_rate": 9.874996897083545e-06, "loss": 0.0566, "step": 1274 }, { "epoch": 0.49303944315545245, "grad_norm": 0.40752802459244664, "learning_rate": 9.87449648544851e-06, "loss": 0.0545, "step": 1275 }, { "epoch": 0.4934261407579273, "grad_norm": 0.48269070221070914, "learning_rate": 9.87399508692106e-06, "loss": 0.0591, "step": 1276 }, { "epoch": 0.4938128383604022, "grad_norm": 0.35373422029413665, "learning_rate": 9.87349270160271e-06, "loss": 0.0624, "step": 1277 }, { "epoch": 0.494199535962877, "grad_norm": 0.40516769014829285, "learning_rate": 9.872989329595174e-06, "loss": 0.042, "step": 1278 }, { "epoch": 0.4945862335653519, "grad_norm": 0.518338046820317, "learning_rate": 9.87248497100036e-06, "loss": 0.063, "step": 1279 }, { "epoch": 0.49497293116782676, "grad_norm": 0.4083162109534896, "learning_rate": 9.871979625920384e-06, "loss": 0.0456, "step": 1280 }, { "epoch": 0.4953596287703016, "grad_norm": 0.4019881359041864, "learning_rate": 9.871473294457561e-06, "loss": 0.0399, "step": 1281 }, { "epoch": 0.4957463263727765, "grad_norm": 1.2647734770971335, "learning_rate": 9.8709659767144e-06, "loss": 0.0679, "step": 1282 }, { "epoch": 0.49613302397525133, "grad_norm": 0.45642774817419374, "learning_rate": 9.870457672793612e-06, "loss": 0.0622, "step": 1283 }, { "epoch": 0.4965197215777262, "grad_norm": 0.5313690967323638, "learning_rate": 9.869948382798109e-06, "loss": 0.0496, "step": 1284 }, { "epoch": 0.49690641918020106, "grad_norm": 0.6252821376757547, "learning_rate": 9.869438106831004e-06, "loss": 0.0556, "step": 1285 }, { "epoch": 0.49729311678267596, "grad_norm": 0.3437846964643638, "learning_rate": 9.868926844995605e-06, "loss": 0.051, "step": 1286 }, { "epoch": 0.4976798143851508, "grad_norm": 0.409447755233917, "learning_rate": 9.868414597395424e-06, "loss": 0.0382, "step": 1287 }, { "epoch": 0.4980665119876257, "grad_norm": 0.49036005480200673, "learning_rate": 9.867901364134169e-06, "loss": 0.0472, "step": 1288 }, { "epoch": 0.49845320959010053, "grad_norm": 0.7103344049843234, "learning_rate": 9.867387145315752e-06, "loss": 0.0589, "step": 1289 }, { "epoch": 0.4988399071925754, "grad_norm": 0.5829776709948511, "learning_rate": 9.86687194104428e-06, "loss": 0.0452, "step": 1290 }, { "epoch": 0.49922660479505027, "grad_norm": 0.3810602644520353, "learning_rate": 9.866355751424064e-06, "loss": 0.047, "step": 1291 }, { "epoch": 0.49961330239752516, "grad_norm": 0.3626680522765469, "learning_rate": 9.865838576559608e-06, "loss": 0.035, "step": 1292 }, { "epoch": 0.5, "grad_norm": 0.584763492295887, "learning_rate": 9.865320416555621e-06, "loss": 0.0582, "step": 1293 }, { "epoch": 0.5003866976024749, "grad_norm": 0.5790623542922426, "learning_rate": 9.864801271517013e-06, "loss": 0.0618, "step": 1294 }, { "epoch": 0.5007733952049497, "grad_norm": 0.4841545319309988, "learning_rate": 9.864281141548886e-06, "loss": 0.0632, "step": 1295 }, { "epoch": 0.5011600928074246, "grad_norm": 0.40921807993415243, "learning_rate": 9.863760026756545e-06, "loss": 0.0504, "step": 1296 }, { "epoch": 0.5015467904098995, "grad_norm": 0.4686825793408633, "learning_rate": 9.863237927245502e-06, "loss": 0.0409, "step": 1297 }, { "epoch": 0.5019334880123744, "grad_norm": 0.72566148459106, "learning_rate": 9.862714843121455e-06, "loss": 0.0801, "step": 1298 }, { "epoch": 0.5023201856148491, "grad_norm": 0.490521770566993, "learning_rate": 9.862190774490308e-06, "loss": 0.0582, "step": 1299 }, { "epoch": 0.502706883217324, "grad_norm": 0.5394261203570054, "learning_rate": 9.861665721458168e-06, "loss": 0.0646, "step": 1300 }, { "epoch": 0.5030935808197989, "grad_norm": 0.5052968674786326, "learning_rate": 9.861139684131335e-06, "loss": 0.047, "step": 1301 }, { "epoch": 0.5034802784222738, "grad_norm": 1.3571696996969305, "learning_rate": 9.860612662616308e-06, "loss": 0.0803, "step": 1302 }, { "epoch": 0.5038669760247486, "grad_norm": 0.4232448572204575, "learning_rate": 9.860084657019795e-06, "loss": 0.05, "step": 1303 }, { "epoch": 0.5042536736272235, "grad_norm": 0.5687439605076784, "learning_rate": 9.859555667448691e-06, "loss": 0.0579, "step": 1304 }, { "epoch": 0.5046403712296984, "grad_norm": 0.35167313668861655, "learning_rate": 9.859025694010096e-06, "loss": 0.0463, "step": 1305 }, { "epoch": 0.5050270688321732, "grad_norm": 0.5090725442510405, "learning_rate": 9.85849473681131e-06, "loss": 0.0505, "step": 1306 }, { "epoch": 0.5054137664346481, "grad_norm": 0.4696028988490019, "learning_rate": 9.85796279595983e-06, "loss": 0.0614, "step": 1307 }, { "epoch": 0.505800464037123, "grad_norm": 0.5095041866515092, "learning_rate": 9.857429871563353e-06, "loss": 0.0637, "step": 1308 }, { "epoch": 0.5061871616395979, "grad_norm": 0.4465582063648533, "learning_rate": 9.856895963729774e-06, "loss": 0.0484, "step": 1309 }, { "epoch": 0.5065738592420727, "grad_norm": 0.540113278383033, "learning_rate": 9.85636107256719e-06, "loss": 0.062, "step": 1310 }, { "epoch": 0.5069605568445475, "grad_norm": 0.47780168604521145, "learning_rate": 9.855825198183895e-06, "loss": 0.0528, "step": 1311 }, { "epoch": 0.5073472544470224, "grad_norm": 0.3922111353721201, "learning_rate": 9.855288340688382e-06, "loss": 0.0412, "step": 1312 }, { "epoch": 0.5077339520494973, "grad_norm": 0.6438526383655857, "learning_rate": 9.854750500189343e-06, "loss": 0.0732, "step": 1313 }, { "epoch": 0.5081206496519721, "grad_norm": 0.4672216812106418, "learning_rate": 9.854211676795669e-06, "loss": 0.0534, "step": 1314 }, { "epoch": 0.508507347254447, "grad_norm": 0.4242588302495938, "learning_rate": 9.85367187061645e-06, "loss": 0.06, "step": 1315 }, { "epoch": 0.5088940448569219, "grad_norm": 0.5613733655446794, "learning_rate": 9.853131081760979e-06, "loss": 0.0502, "step": 1316 }, { "epoch": 0.5092807424593968, "grad_norm": 0.334631561330477, "learning_rate": 9.852589310338741e-06, "loss": 0.0354, "step": 1317 }, { "epoch": 0.5096674400618716, "grad_norm": 0.5779534905200726, "learning_rate": 9.852046556459422e-06, "loss": 0.0488, "step": 1318 }, { "epoch": 0.5100541376643465, "grad_norm": 0.4238967625981374, "learning_rate": 9.851502820232912e-06, "loss": 0.0645, "step": 1319 }, { "epoch": 0.5104408352668214, "grad_norm": 0.46126916593201595, "learning_rate": 9.850958101769292e-06, "loss": 0.0583, "step": 1320 }, { "epoch": 0.5108275328692962, "grad_norm": 0.42625762847504256, "learning_rate": 9.85041240117885e-06, "loss": 0.068, "step": 1321 }, { "epoch": 0.5112142304717711, "grad_norm": 0.568811769459661, "learning_rate": 9.849865718572067e-06, "loss": 0.0517, "step": 1322 }, { "epoch": 0.511600928074246, "grad_norm": 0.40830993968467383, "learning_rate": 9.849318054059625e-06, "loss": 0.0516, "step": 1323 }, { "epoch": 0.5119876256767208, "grad_norm": 0.3290657018453001, "learning_rate": 9.848769407752401e-06, "loss": 0.0337, "step": 1324 }, { "epoch": 0.5123743232791956, "grad_norm": 0.5150055500104612, "learning_rate": 9.848219779761479e-06, "loss": 0.0599, "step": 1325 }, { "epoch": 0.5127610208816705, "grad_norm": 0.4374007774296615, "learning_rate": 9.847669170198132e-06, "loss": 0.0546, "step": 1326 }, { "epoch": 0.5131477184841454, "grad_norm": 0.4508594239967237, "learning_rate": 9.847117579173844e-06, "loss": 0.0557, "step": 1327 }, { "epoch": 0.5135344160866203, "grad_norm": 0.6963293533231855, "learning_rate": 9.846565006800283e-06, "loss": 0.076, "step": 1328 }, { "epoch": 0.5139211136890951, "grad_norm": 0.39652634420592187, "learning_rate": 9.846011453189326e-06, "loss": 0.0462, "step": 1329 }, { "epoch": 0.51430781129157, "grad_norm": 0.509399214110446, "learning_rate": 9.845456918453045e-06, "loss": 0.0735, "step": 1330 }, { "epoch": 0.5146945088940449, "grad_norm": 0.47970898889640207, "learning_rate": 9.844901402703713e-06, "loss": 0.0575, "step": 1331 }, { "epoch": 0.5150812064965197, "grad_norm": 0.8075225521609055, "learning_rate": 9.844344906053797e-06, "loss": 0.0616, "step": 1332 }, { "epoch": 0.5154679040989946, "grad_norm": 0.4905012619856899, "learning_rate": 9.843787428615968e-06, "loss": 0.0567, "step": 1333 }, { "epoch": 0.5158546017014695, "grad_norm": 0.3541703298259199, "learning_rate": 9.843228970503091e-06, "loss": 0.0522, "step": 1334 }, { "epoch": 0.5162412993039444, "grad_norm": 0.5682349526313809, "learning_rate": 9.842669531828235e-06, "loss": 0.0689, "step": 1335 }, { "epoch": 0.5166279969064191, "grad_norm": 0.502182448103057, "learning_rate": 9.84210911270466e-06, "loss": 0.0538, "step": 1336 }, { "epoch": 0.517014694508894, "grad_norm": 0.43674962853942517, "learning_rate": 9.841547713245834e-06, "loss": 0.0504, "step": 1337 }, { "epoch": 0.5174013921113689, "grad_norm": 0.5053042550019405, "learning_rate": 9.840985333565412e-06, "loss": 0.039, "step": 1338 }, { "epoch": 0.5177880897138438, "grad_norm": 0.3930456838256593, "learning_rate": 9.840421973777256e-06, "loss": 0.0448, "step": 1339 }, { "epoch": 0.5181747873163186, "grad_norm": 0.437202988698919, "learning_rate": 9.839857633995426e-06, "loss": 0.0486, "step": 1340 }, { "epoch": 0.5185614849187935, "grad_norm": 0.4968584679298795, "learning_rate": 9.839292314334178e-06, "loss": 0.0585, "step": 1341 }, { "epoch": 0.5189481825212684, "grad_norm": 0.5632661712683714, "learning_rate": 9.838726014907965e-06, "loss": 0.055, "step": 1342 }, { "epoch": 0.5193348801237432, "grad_norm": 0.503971422679721, "learning_rate": 9.83815873583144e-06, "loss": 0.0558, "step": 1343 }, { "epoch": 0.5197215777262181, "grad_norm": 0.3230057765118212, "learning_rate": 9.837590477219457e-06, "loss": 0.0329, "step": 1344 }, { "epoch": 0.520108275328693, "grad_norm": 0.412722846904093, "learning_rate": 9.837021239187063e-06, "loss": 0.0616, "step": 1345 }, { "epoch": 0.5204949729311679, "grad_norm": 0.4512565213572088, "learning_rate": 9.836451021849509e-06, "loss": 0.0678, "step": 1346 }, { "epoch": 0.5208816705336426, "grad_norm": 0.39523232712722883, "learning_rate": 9.835879825322239e-06, "loss": 0.0475, "step": 1347 }, { "epoch": 0.5212683681361175, "grad_norm": 0.31009976458235883, "learning_rate": 9.835307649720898e-06, "loss": 0.0402, "step": 1348 }, { "epoch": 0.5216550657385924, "grad_norm": 0.6534423048658711, "learning_rate": 9.834734495161331e-06, "loss": 0.0667, "step": 1349 }, { "epoch": 0.5220417633410673, "grad_norm": 0.40049915127684504, "learning_rate": 9.834160361759576e-06, "loss": 0.0467, "step": 1350 }, { "epoch": 0.5224284609435421, "grad_norm": 0.4505189576414681, "learning_rate": 9.833585249631876e-06, "loss": 0.0432, "step": 1351 }, { "epoch": 0.522815158546017, "grad_norm": 0.41019190004082307, "learning_rate": 9.833009158894663e-06, "loss": 0.0681, "step": 1352 }, { "epoch": 0.5232018561484919, "grad_norm": 0.38644642341442115, "learning_rate": 9.832432089664579e-06, "loss": 0.0491, "step": 1353 }, { "epoch": 0.5235885537509667, "grad_norm": 0.43326056238474997, "learning_rate": 9.831854042058454e-06, "loss": 0.0472, "step": 1354 }, { "epoch": 0.5239752513534416, "grad_norm": 0.5201921052620397, "learning_rate": 9.83127501619332e-06, "loss": 0.0412, "step": 1355 }, { "epoch": 0.5243619489559165, "grad_norm": 0.568199337386787, "learning_rate": 9.830695012186407e-06, "loss": 0.0483, "step": 1356 }, { "epoch": 0.5247486465583914, "grad_norm": 0.4501896968429634, "learning_rate": 9.830114030155143e-06, "loss": 0.0516, "step": 1357 }, { "epoch": 0.5251353441608662, "grad_norm": 0.43700179305945447, "learning_rate": 9.829532070217153e-06, "loss": 0.0465, "step": 1358 }, { "epoch": 0.525522041763341, "grad_norm": 0.2990289941541428, "learning_rate": 9.828949132490262e-06, "loss": 0.0373, "step": 1359 }, { "epoch": 0.525908739365816, "grad_norm": 0.36327243377921675, "learning_rate": 9.828365217092491e-06, "loss": 0.0544, "step": 1360 }, { "epoch": 0.5262954369682908, "grad_norm": 0.50231354803876, "learning_rate": 9.82778032414206e-06, "loss": 0.0528, "step": 1361 }, { "epoch": 0.5266821345707656, "grad_norm": 0.4935287726952576, "learning_rate": 9.827194453757388e-06, "loss": 0.0548, "step": 1362 }, { "epoch": 0.5270688321732405, "grad_norm": 0.5395427567529207, "learning_rate": 9.826607606057087e-06, "loss": 0.0631, "step": 1363 }, { "epoch": 0.5274555297757154, "grad_norm": 0.381188086685741, "learning_rate": 9.826019781159975e-06, "loss": 0.0456, "step": 1364 }, { "epoch": 0.5278422273781903, "grad_norm": 0.3171772427626862, "learning_rate": 9.82543097918506e-06, "loss": 0.0512, "step": 1365 }, { "epoch": 0.5282289249806651, "grad_norm": 0.4859135751539673, "learning_rate": 9.824841200251552e-06, "loss": 0.0566, "step": 1366 }, { "epoch": 0.52861562258314, "grad_norm": 0.4041679747148513, "learning_rate": 9.824250444478859e-06, "loss": 0.0513, "step": 1367 }, { "epoch": 0.5290023201856149, "grad_norm": 0.39784425086193315, "learning_rate": 9.823658711986582e-06, "loss": 0.0535, "step": 1368 }, { "epoch": 0.5293890177880897, "grad_norm": 0.4665202179023926, "learning_rate": 9.823066002894528e-06, "loss": 0.0796, "step": 1369 }, { "epoch": 0.5297757153905646, "grad_norm": 0.4040843227969607, "learning_rate": 9.822472317322694e-06, "loss": 0.0554, "step": 1370 }, { "epoch": 0.5301624129930395, "grad_norm": 0.6148386877611921, "learning_rate": 9.821877655391279e-06, "loss": 0.057, "step": 1371 }, { "epoch": 0.5305491105955144, "grad_norm": 0.40030107925029756, "learning_rate": 9.821282017220679e-06, "loss": 0.0435, "step": 1372 }, { "epoch": 0.5309358081979891, "grad_norm": 0.3423299504240558, "learning_rate": 9.820685402931484e-06, "loss": 0.0318, "step": 1373 }, { "epoch": 0.531322505800464, "grad_norm": 0.34793796564675183, "learning_rate": 9.82008781264449e-06, "loss": 0.0443, "step": 1374 }, { "epoch": 0.5317092034029389, "grad_norm": 0.3827511926296242, "learning_rate": 9.819489246480682e-06, "loss": 0.0397, "step": 1375 }, { "epoch": 0.5320959010054138, "grad_norm": 0.5279206626373096, "learning_rate": 9.818889704561247e-06, "loss": 0.0492, "step": 1376 }, { "epoch": 0.5324825986078886, "grad_norm": 0.3443252193656321, "learning_rate": 9.818289187007565e-06, "loss": 0.0561, "step": 1377 }, { "epoch": 0.5328692962103635, "grad_norm": 0.39106958370119194, "learning_rate": 9.817687693941222e-06, "loss": 0.043, "step": 1378 }, { "epoch": 0.5332559938128384, "grad_norm": 0.41548329161570874, "learning_rate": 9.817085225483996e-06, "loss": 0.0407, "step": 1379 }, { "epoch": 0.5336426914153132, "grad_norm": 0.4426713118450878, "learning_rate": 9.816481781757859e-06, "loss": 0.0504, "step": 1380 }, { "epoch": 0.5340293890177881, "grad_norm": 0.4141896203579085, "learning_rate": 9.815877362884989e-06, "loss": 0.0445, "step": 1381 }, { "epoch": 0.534416086620263, "grad_norm": 0.4113087658693946, "learning_rate": 9.815271968987753e-06, "loss": 0.0619, "step": 1382 }, { "epoch": 0.5348027842227379, "grad_norm": 0.41549326364542893, "learning_rate": 9.814665600188722e-06, "loss": 0.0555, "step": 1383 }, { "epoch": 0.5351894818252126, "grad_norm": 0.3602341533521777, "learning_rate": 9.81405825661066e-06, "loss": 0.0575, "step": 1384 }, { "epoch": 0.5355761794276875, "grad_norm": 0.36317567159144065, "learning_rate": 9.813449938376532e-06, "loss": 0.0506, "step": 1385 }, { "epoch": 0.5359628770301624, "grad_norm": 0.6833864106633932, "learning_rate": 9.812840645609496e-06, "loss": 0.0431, "step": 1386 }, { "epoch": 0.5363495746326373, "grad_norm": 0.457356793018531, "learning_rate": 9.81223037843291e-06, "loss": 0.0407, "step": 1387 }, { "epoch": 0.5367362722351121, "grad_norm": 0.45084758682993287, "learning_rate": 9.811619136970331e-06, "loss": 0.052, "step": 1388 }, { "epoch": 0.537122969837587, "grad_norm": 0.3579755409649572, "learning_rate": 9.81100692134551e-06, "loss": 0.043, "step": 1389 }, { "epoch": 0.5375096674400619, "grad_norm": 0.3397957017247582, "learning_rate": 9.810393731682396e-06, "loss": 0.0268, "step": 1390 }, { "epoch": 0.5378963650425367, "grad_norm": 0.42600146243826076, "learning_rate": 9.809779568105136e-06, "loss": 0.06, "step": 1391 }, { "epoch": 0.5382830626450116, "grad_norm": 0.5093012965220558, "learning_rate": 9.809164430738072e-06, "loss": 0.0562, "step": 1392 }, { "epoch": 0.5386697602474865, "grad_norm": 0.5119931704788178, "learning_rate": 9.808548319705748e-06, "loss": 0.0438, "step": 1393 }, { "epoch": 0.5390564578499614, "grad_norm": 0.37043897855473473, "learning_rate": 9.807931235132902e-06, "loss": 0.0484, "step": 1394 }, { "epoch": 0.5394431554524362, "grad_norm": 0.44515950482636923, "learning_rate": 9.807313177144466e-06, "loss": 0.0438, "step": 1395 }, { "epoch": 0.539829853054911, "grad_norm": 0.5491314178627604, "learning_rate": 9.806694145865574e-06, "loss": 0.0586, "step": 1396 }, { "epoch": 0.5402165506573859, "grad_norm": 0.652079813485557, "learning_rate": 9.806074141421556e-06, "loss": 0.0507, "step": 1397 }, { "epoch": 0.5406032482598608, "grad_norm": 0.3785483865365917, "learning_rate": 9.805453163937939e-06, "loss": 0.0544, "step": 1398 }, { "epoch": 0.5409899458623356, "grad_norm": 0.37284450070298014, "learning_rate": 9.804831213540443e-06, "loss": 0.0385, "step": 1399 }, { "epoch": 0.5413766434648105, "grad_norm": 0.3951540570527633, "learning_rate": 9.80420829035499e-06, "loss": 0.0428, "step": 1400 }, { "epoch": 0.5417633410672854, "grad_norm": 0.3474560220719124, "learning_rate": 9.803584394507701e-06, "loss": 0.0569, "step": 1401 }, { "epoch": 0.5421500386697602, "grad_norm": 0.5259914646446183, "learning_rate": 9.802959526124885e-06, "loss": 0.0622, "step": 1402 }, { "epoch": 0.5425367362722351, "grad_norm": 0.36771148768482814, "learning_rate": 9.802333685333054e-06, "loss": 0.0492, "step": 1403 }, { "epoch": 0.54292343387471, "grad_norm": 0.5449823697460119, "learning_rate": 9.801706872258918e-06, "loss": 0.0551, "step": 1404 }, { "epoch": 0.5433101314771849, "grad_norm": 0.5915824356356167, "learning_rate": 9.801079087029379e-06, "loss": 0.0847, "step": 1405 }, { "epoch": 0.5436968290796597, "grad_norm": 0.3813159787985441, "learning_rate": 9.800450329771543e-06, "loss": 0.0502, "step": 1406 }, { "epoch": 0.5440835266821346, "grad_norm": 0.44350914132276037, "learning_rate": 9.799820600612705e-06, "loss": 0.0428, "step": 1407 }, { "epoch": 0.5444702242846094, "grad_norm": 0.4705063571109686, "learning_rate": 9.799189899680359e-06, "loss": 0.0436, "step": 1408 }, { "epoch": 0.5448569218870843, "grad_norm": 0.38845666817276026, "learning_rate": 9.798558227102201e-06, "loss": 0.0464, "step": 1409 }, { "epoch": 0.5452436194895591, "grad_norm": 0.5447147106731819, "learning_rate": 9.797925583006116e-06, "loss": 0.0731, "step": 1410 }, { "epoch": 0.545630317092034, "grad_norm": 0.3034563130438838, "learning_rate": 9.797291967520193e-06, "loss": 0.0361, "step": 1411 }, { "epoch": 0.5460170146945089, "grad_norm": 0.3390964170553518, "learning_rate": 9.796657380772712e-06, "loss": 0.0481, "step": 1412 }, { "epoch": 0.5464037122969838, "grad_norm": 0.4948275287770094, "learning_rate": 9.79602182289215e-06, "loss": 0.0595, "step": 1413 }, { "epoch": 0.5467904098994586, "grad_norm": 0.36292480644693914, "learning_rate": 9.795385294007184e-06, "loss": 0.0425, "step": 1414 }, { "epoch": 0.5471771075019335, "grad_norm": 0.3364879658977519, "learning_rate": 9.794747794246686e-06, "loss": 0.0339, "step": 1415 }, { "epoch": 0.5475638051044084, "grad_norm": 0.5849416524051946, "learning_rate": 9.794109323739727e-06, "loss": 0.0545, "step": 1416 }, { "epoch": 0.5479505027068832, "grad_norm": 0.6401532564615433, "learning_rate": 9.793469882615565e-06, "loss": 0.041, "step": 1417 }, { "epoch": 0.5483372003093581, "grad_norm": 0.4198876853043725, "learning_rate": 9.792829471003668e-06, "loss": 0.0428, "step": 1418 }, { "epoch": 0.548723897911833, "grad_norm": 0.3773121848170281, "learning_rate": 9.792188089033693e-06, "loss": 0.0494, "step": 1419 }, { "epoch": 0.5491105955143079, "grad_norm": 0.4155297013406437, "learning_rate": 9.791545736835491e-06, "loss": 0.0427, "step": 1420 }, { "epoch": 0.5494972931167826, "grad_norm": 0.34732530231863257, "learning_rate": 9.790902414539116e-06, "loss": 0.0474, "step": 1421 }, { "epoch": 0.5498839907192575, "grad_norm": 0.45102962288376025, "learning_rate": 9.790258122274817e-06, "loss": 0.0545, "step": 1422 }, { "epoch": 0.5502706883217324, "grad_norm": 0.350939238535524, "learning_rate": 9.78961286017303e-06, "loss": 0.0356, "step": 1423 }, { "epoch": 0.5506573859242073, "grad_norm": 0.3459930467880876, "learning_rate": 9.788966628364403e-06, "loss": 0.0434, "step": 1424 }, { "epoch": 0.5510440835266821, "grad_norm": 0.41744914038364966, "learning_rate": 9.788319426979769e-06, "loss": 0.0517, "step": 1425 }, { "epoch": 0.551430781129157, "grad_norm": 0.5221863048234009, "learning_rate": 9.787671256150161e-06, "loss": 0.0553, "step": 1426 }, { "epoch": 0.5518174787316319, "grad_norm": 0.33648044387947257, "learning_rate": 9.787022116006808e-06, "loss": 0.0411, "step": 1427 }, { "epoch": 0.5522041763341067, "grad_norm": 0.32242878931559593, "learning_rate": 9.786372006681133e-06, "loss": 0.0503, "step": 1428 }, { "epoch": 0.5525908739365816, "grad_norm": 0.39909961381234843, "learning_rate": 9.785720928304762e-06, "loss": 0.0568, "step": 1429 }, { "epoch": 0.5529775715390565, "grad_norm": 0.4369160903695525, "learning_rate": 9.78506888100951e-06, "loss": 0.0418, "step": 1430 }, { "epoch": 0.5533642691415314, "grad_norm": 0.39245384666032923, "learning_rate": 9.78441586492739e-06, "loss": 0.0412, "step": 1431 }, { "epoch": 0.5537509667440061, "grad_norm": 0.4837783939723066, "learning_rate": 9.783761880190612e-06, "loss": 0.055, "step": 1432 }, { "epoch": 0.554137664346481, "grad_norm": 0.4150172177689991, "learning_rate": 9.783106926931584e-06, "loss": 0.0571, "step": 1433 }, { "epoch": 0.5545243619489559, "grad_norm": 0.4326798207109173, "learning_rate": 9.782451005282905e-06, "loss": 0.0564, "step": 1434 }, { "epoch": 0.5549110595514308, "grad_norm": 0.4045635565911878, "learning_rate": 9.781794115377377e-06, "loss": 0.0423, "step": 1435 }, { "epoch": 0.5552977571539056, "grad_norm": 0.4377977960230161, "learning_rate": 9.78113625734799e-06, "loss": 0.048, "step": 1436 }, { "epoch": 0.5556844547563805, "grad_norm": 0.48250264182189867, "learning_rate": 9.780477431327938e-06, "loss": 0.0491, "step": 1437 }, { "epoch": 0.5560711523588554, "grad_norm": 0.4786915941356229, "learning_rate": 9.779817637450603e-06, "loss": 0.0448, "step": 1438 }, { "epoch": 0.5564578499613302, "grad_norm": 0.4648466349482373, "learning_rate": 9.779156875849573e-06, "loss": 0.064, "step": 1439 }, { "epoch": 0.5568445475638051, "grad_norm": 0.3414705850304527, "learning_rate": 9.77849514665862e-06, "loss": 0.0427, "step": 1440 }, { "epoch": 0.55723124516628, "grad_norm": 0.3713832567599116, "learning_rate": 9.777832450011722e-06, "loss": 0.0501, "step": 1441 }, { "epoch": 0.5576179427687549, "grad_norm": 0.477365605653362, "learning_rate": 9.777168786043046e-06, "loss": 0.0413, "step": 1442 }, { "epoch": 0.5580046403712297, "grad_norm": 0.40227469201160715, "learning_rate": 9.77650415488696e-06, "loss": 0.0499, "step": 1443 }, { "epoch": 0.5583913379737045, "grad_norm": 0.4318602972895715, "learning_rate": 9.775838556678023e-06, "loss": 0.0621, "step": 1444 }, { "epoch": 0.5587780355761794, "grad_norm": 0.5596774985566126, "learning_rate": 9.775171991550996e-06, "loss": 0.0687, "step": 1445 }, { "epoch": 0.5591647331786543, "grad_norm": 0.41243772557616665, "learning_rate": 9.774504459640828e-06, "loss": 0.0508, "step": 1446 }, { "epoch": 0.5595514307811291, "grad_norm": 0.4678445331214343, "learning_rate": 9.773835961082671e-06, "loss": 0.0576, "step": 1447 }, { "epoch": 0.559938128383604, "grad_norm": 0.48390972003325805, "learning_rate": 9.773166496011867e-06, "loss": 0.0579, "step": 1448 }, { "epoch": 0.5603248259860789, "grad_norm": 0.49811885247371995, "learning_rate": 9.772496064563957e-06, "loss": 0.0595, "step": 1449 }, { "epoch": 0.5607115235885538, "grad_norm": 0.5898297456505789, "learning_rate": 9.771824666874678e-06, "loss": 0.0512, "step": 1450 }, { "epoch": 0.5610982211910286, "grad_norm": 0.38367390540127505, "learning_rate": 9.77115230307996e-06, "loss": 0.0458, "step": 1451 }, { "epoch": 0.5614849187935035, "grad_norm": 0.453064312300978, "learning_rate": 9.77047897331593e-06, "loss": 0.0653, "step": 1452 }, { "epoch": 0.5618716163959784, "grad_norm": 0.36876312093342734, "learning_rate": 9.76980467771891e-06, "loss": 0.042, "step": 1453 }, { "epoch": 0.5622583139984532, "grad_norm": 0.3249217990966604, "learning_rate": 9.76912941642542e-06, "loss": 0.0412, "step": 1454 }, { "epoch": 0.5626450116009281, "grad_norm": 0.44834542342937195, "learning_rate": 9.768453189572173e-06, "loss": 0.0531, "step": 1455 }, { "epoch": 0.563031709203403, "grad_norm": 0.36934257876733456, "learning_rate": 9.76777599729608e-06, "loss": 0.0438, "step": 1456 }, { "epoch": 0.5634184068058778, "grad_norm": 0.5499819543040004, "learning_rate": 9.76709783973424e-06, "loss": 0.0614, "step": 1457 }, { "epoch": 0.5638051044083526, "grad_norm": 0.33373412419518417, "learning_rate": 9.76641871702396e-06, "loss": 0.043, "step": 1458 }, { "epoch": 0.5641918020108275, "grad_norm": 0.42074510920735997, "learning_rate": 9.76573862930273e-06, "loss": 0.0726, "step": 1459 }, { "epoch": 0.5645784996133024, "grad_norm": 0.37161879408878434, "learning_rate": 9.765057576708243e-06, "loss": 0.0428, "step": 1460 }, { "epoch": 0.5649651972157773, "grad_norm": 0.39129873561084044, "learning_rate": 9.764375559378386e-06, "loss": 0.0396, "step": 1461 }, { "epoch": 0.5653518948182521, "grad_norm": 0.5354820007390818, "learning_rate": 9.76369257745124e-06, "loss": 0.0751, "step": 1462 }, { "epoch": 0.565738592420727, "grad_norm": 0.4508954457872216, "learning_rate": 9.76300863106508e-06, "loss": 0.0576, "step": 1463 }, { "epoch": 0.5661252900232019, "grad_norm": 0.4646469556433587, "learning_rate": 9.76232372035838e-06, "loss": 0.0712, "step": 1464 }, { "epoch": 0.5665119876256767, "grad_norm": 0.47261160144588577, "learning_rate": 9.761637845469805e-06, "loss": 0.0492, "step": 1465 }, { "epoch": 0.5668986852281516, "grad_norm": 0.4148947380839024, "learning_rate": 9.760951006538221e-06, "loss": 0.0477, "step": 1466 }, { "epoch": 0.5672853828306265, "grad_norm": 0.38415006839565113, "learning_rate": 9.760263203702682e-06, "loss": 0.0437, "step": 1467 }, { "epoch": 0.5676720804331014, "grad_norm": 0.5134482184899085, "learning_rate": 9.759574437102444e-06, "loss": 0.0556, "step": 1468 }, { "epoch": 0.5680587780355761, "grad_norm": 0.38172639899166194, "learning_rate": 9.758884706876954e-06, "loss": 0.0446, "step": 1469 }, { "epoch": 0.568445475638051, "grad_norm": 0.4347498720970354, "learning_rate": 9.758194013165854e-06, "loss": 0.0569, "step": 1470 }, { "epoch": 0.5688321732405259, "grad_norm": 0.45556962944815305, "learning_rate": 9.757502356108983e-06, "loss": 0.0407, "step": 1471 }, { "epoch": 0.5692188708430008, "grad_norm": 0.32212780926646073, "learning_rate": 9.756809735846374e-06, "loss": 0.0487, "step": 1472 }, { "epoch": 0.5696055684454756, "grad_norm": 0.3215587060126289, "learning_rate": 9.756116152518255e-06, "loss": 0.0366, "step": 1473 }, { "epoch": 0.5699922660479505, "grad_norm": 0.4065627613711272, "learning_rate": 9.755421606265049e-06, "loss": 0.0439, "step": 1474 }, { "epoch": 0.5703789636504254, "grad_norm": 0.34460190226259707, "learning_rate": 9.754726097227374e-06, "loss": 0.0365, "step": 1475 }, { "epoch": 0.5707656612529002, "grad_norm": 0.4516824946300991, "learning_rate": 9.754029625546044e-06, "loss": 0.0494, "step": 1476 }, { "epoch": 0.5711523588553751, "grad_norm": 0.3782762462200411, "learning_rate": 9.753332191362066e-06, "loss": 0.0374, "step": 1477 }, { "epoch": 0.57153905645785, "grad_norm": 0.3305625613880441, "learning_rate": 9.752633794816643e-06, "loss": 0.0367, "step": 1478 }, { "epoch": 0.5719257540603249, "grad_norm": 0.5111999284798917, "learning_rate": 9.751934436051174e-06, "loss": 0.0742, "step": 1479 }, { "epoch": 0.5723124516627996, "grad_norm": 0.3668246808841585, "learning_rate": 9.751234115207247e-06, "loss": 0.0472, "step": 1480 }, { "epoch": 0.5726991492652745, "grad_norm": 0.4351563296887029, "learning_rate": 9.750532832426655e-06, "loss": 0.0509, "step": 1481 }, { "epoch": 0.5730858468677494, "grad_norm": 0.35844154818271423, "learning_rate": 9.749830587851377e-06, "loss": 0.0419, "step": 1482 }, { "epoch": 0.5734725444702243, "grad_norm": 0.38818793118763245, "learning_rate": 9.74912738162359e-06, "loss": 0.0515, "step": 1483 }, { "epoch": 0.5738592420726991, "grad_norm": 0.32894731699595203, "learning_rate": 9.748423213885663e-06, "loss": 0.046, "step": 1484 }, { "epoch": 0.574245939675174, "grad_norm": 0.4418310089733912, "learning_rate": 9.747718084780167e-06, "loss": 0.0476, "step": 1485 }, { "epoch": 0.5746326372776489, "grad_norm": 0.6737542764574898, "learning_rate": 9.747011994449858e-06, "loss": 0.057, "step": 1486 }, { "epoch": 0.5750193348801237, "grad_norm": 0.2809999961717934, "learning_rate": 9.746304943037697e-06, "loss": 0.0308, "step": 1487 }, { "epoch": 0.5754060324825986, "grad_norm": 0.3542832742562085, "learning_rate": 9.745596930686827e-06, "loss": 0.0408, "step": 1488 }, { "epoch": 0.5757927300850735, "grad_norm": 0.32019174920797255, "learning_rate": 9.744887957540598e-06, "loss": 0.0338, "step": 1489 }, { "epoch": 0.5761794276875484, "grad_norm": 0.40316047352850853, "learning_rate": 9.744178023742546e-06, "loss": 0.0478, "step": 1490 }, { "epoch": 0.5765661252900232, "grad_norm": 0.8361631264390859, "learning_rate": 9.743467129436406e-06, "loss": 0.0445, "step": 1491 }, { "epoch": 0.576952822892498, "grad_norm": 0.5327922307229875, "learning_rate": 9.742755274766103e-06, "loss": 0.0446, "step": 1492 }, { "epoch": 0.577339520494973, "grad_norm": 0.4860034196584748, "learning_rate": 9.742042459875765e-06, "loss": 0.0496, "step": 1493 }, { "epoch": 0.5777262180974478, "grad_norm": 0.3572387742218114, "learning_rate": 9.741328684909702e-06, "loss": 0.0573, "step": 1494 }, { "epoch": 0.5781129156999226, "grad_norm": 0.411889319543381, "learning_rate": 9.740613950012431e-06, "loss": 0.0454, "step": 1495 }, { "epoch": 0.5784996133023975, "grad_norm": 0.35377974942913865, "learning_rate": 9.739898255328654e-06, "loss": 0.0516, "step": 1496 }, { "epoch": 0.5788863109048724, "grad_norm": 0.4036213151480714, "learning_rate": 9.73918160100327e-06, "loss": 0.0505, "step": 1497 }, { "epoch": 0.5792730085073473, "grad_norm": 0.37965952700325817, "learning_rate": 9.73846398718138e-06, "loss": 0.0434, "step": 1498 }, { "epoch": 0.5796597061098221, "grad_norm": 0.4894715559818342, "learning_rate": 9.737745414008263e-06, "loss": 0.0448, "step": 1499 }, { "epoch": 0.580046403712297, "grad_norm": 0.4479458495077911, "learning_rate": 9.737025881629407e-06, "loss": 0.0711, "step": 1500 }, { "epoch": 0.5804331013147719, "grad_norm": 0.30886142366831015, "learning_rate": 9.736305390190488e-06, "loss": 0.0465, "step": 1501 }, { "epoch": 0.5808197989172467, "grad_norm": 0.44439259194812336, "learning_rate": 9.735583939837376e-06, "loss": 0.0531, "step": 1502 }, { "epoch": 0.5812064965197216, "grad_norm": 0.40961081782891334, "learning_rate": 9.734861530716137e-06, "loss": 0.0472, "step": 1503 }, { "epoch": 0.5815931941221965, "grad_norm": 0.4686120458130824, "learning_rate": 9.73413816297303e-06, "loss": 0.0521, "step": 1504 }, { "epoch": 0.5819798917246713, "grad_norm": 0.3227411488741675, "learning_rate": 9.733413836754509e-06, "loss": 0.0455, "step": 1505 }, { "epoch": 0.5823665893271461, "grad_norm": 0.3360020993248578, "learning_rate": 9.73268855220722e-06, "loss": 0.0326, "step": 1506 }, { "epoch": 0.582753286929621, "grad_norm": 0.33800254145911585, "learning_rate": 9.731962309478008e-06, "loss": 0.0468, "step": 1507 }, { "epoch": 0.5831399845320959, "grad_norm": 0.48065840937504917, "learning_rate": 9.7312351087139e-06, "loss": 0.0512, "step": 1508 }, { "epoch": 0.5835266821345708, "grad_norm": 0.4403335136292439, "learning_rate": 9.730506950062134e-06, "loss": 0.0629, "step": 1509 }, { "epoch": 0.5839133797370456, "grad_norm": 0.34257140611513565, "learning_rate": 9.729777833670132e-06, "loss": 0.039, "step": 1510 }, { "epoch": 0.5843000773395205, "grad_norm": 0.5045392221123939, "learning_rate": 9.729047759685507e-06, "loss": 0.0636, "step": 1511 }, { "epoch": 0.5846867749419954, "grad_norm": 0.3995612059705348, "learning_rate": 9.72831672825607e-06, "loss": 0.0451, "step": 1512 }, { "epoch": 0.5850734725444702, "grad_norm": 0.29934117350177586, "learning_rate": 9.727584739529832e-06, "loss": 0.0461, "step": 1513 }, { "epoch": 0.5854601701469451, "grad_norm": 0.31837287617461013, "learning_rate": 9.726851793654987e-06, "loss": 0.0403, "step": 1514 }, { "epoch": 0.58584686774942, "grad_norm": 0.4197918331122782, "learning_rate": 9.726117890779927e-06, "loss": 0.047, "step": 1515 }, { "epoch": 0.5862335653518949, "grad_norm": 0.5722365631233349, "learning_rate": 9.725383031053241e-06, "loss": 0.0447, "step": 1516 }, { "epoch": 0.5866202629543696, "grad_norm": 0.43457393297234714, "learning_rate": 9.724647214623709e-06, "loss": 0.0687, "step": 1517 }, { "epoch": 0.5870069605568445, "grad_norm": 0.4171146657026387, "learning_rate": 9.723910441640301e-06, "loss": 0.0519, "step": 1518 }, { "epoch": 0.5873936581593194, "grad_norm": 0.31352087462485, "learning_rate": 9.723172712252188e-06, "loss": 0.0361, "step": 1519 }, { "epoch": 0.5877803557617943, "grad_norm": 0.4091710338602005, "learning_rate": 9.722434026608727e-06, "loss": 0.0442, "step": 1520 }, { "epoch": 0.5881670533642691, "grad_norm": 0.538440929619704, "learning_rate": 9.721694384859476e-06, "loss": 0.058, "step": 1521 }, { "epoch": 0.588553750966744, "grad_norm": 0.32267884072815345, "learning_rate": 9.720953787154186e-06, "loss": 0.044, "step": 1522 }, { "epoch": 0.5889404485692189, "grad_norm": 0.32891553405837093, "learning_rate": 9.720212233642792e-06, "loss": 0.0439, "step": 1523 }, { "epoch": 0.5893271461716937, "grad_norm": 0.5926108899823513, "learning_rate": 9.719469724475433e-06, "loss": 0.0498, "step": 1524 }, { "epoch": 0.5897138437741686, "grad_norm": 0.4013350099253238, "learning_rate": 9.718726259802439e-06, "loss": 0.0456, "step": 1525 }, { "epoch": 0.5901005413766435, "grad_norm": 0.4017025087247575, "learning_rate": 9.717981839774326e-06, "loss": 0.0505, "step": 1526 }, { "epoch": 0.5904872389791184, "grad_norm": 0.7013743773625379, "learning_rate": 9.717236464541816e-06, "loss": 0.0653, "step": 1527 }, { "epoch": 0.5908739365815932, "grad_norm": 0.44070327991030317, "learning_rate": 9.716490134255817e-06, "loss": 0.0393, "step": 1528 }, { "epoch": 0.591260634184068, "grad_norm": 0.3406558838829299, "learning_rate": 9.715742849067426e-06, "loss": 0.0441, "step": 1529 }, { "epoch": 0.5916473317865429, "grad_norm": 0.4373125519856973, "learning_rate": 9.714994609127946e-06, "loss": 0.0607, "step": 1530 }, { "epoch": 0.5920340293890178, "grad_norm": 0.5042451854473881, "learning_rate": 9.71424541458886e-06, "loss": 0.0414, "step": 1531 }, { "epoch": 0.5924207269914926, "grad_norm": 0.7129788931502912, "learning_rate": 9.713495265601855e-06, "loss": 0.0897, "step": 1532 }, { "epoch": 0.5928074245939675, "grad_norm": 0.3401284211754683, "learning_rate": 9.712744162318804e-06, "loss": 0.0362, "step": 1533 }, { "epoch": 0.5931941221964424, "grad_norm": 0.748835837578454, "learning_rate": 9.711992104891774e-06, "loss": 0.0342, "step": 1534 }, { "epoch": 0.5935808197989172, "grad_norm": 0.38026751364600037, "learning_rate": 9.711239093473031e-06, "loss": 0.0503, "step": 1535 }, { "epoch": 0.5939675174013921, "grad_norm": 0.39883198813810394, "learning_rate": 9.710485128215025e-06, "loss": 0.0643, "step": 1536 }, { "epoch": 0.594354215003867, "grad_norm": 0.36217199850508824, "learning_rate": 9.70973020927041e-06, "loss": 0.049, "step": 1537 }, { "epoch": 0.5947409126063419, "grad_norm": 0.4914880884900422, "learning_rate": 9.708974336792021e-06, "loss": 0.0532, "step": 1538 }, { "epoch": 0.5951276102088167, "grad_norm": 0.3917014931212936, "learning_rate": 9.708217510932897e-06, "loss": 0.0396, "step": 1539 }, { "epoch": 0.5955143078112916, "grad_norm": 0.3711510542446065, "learning_rate": 9.707459731846263e-06, "loss": 0.0521, "step": 1540 }, { "epoch": 0.5959010054137664, "grad_norm": 0.3440085761690635, "learning_rate": 9.706700999685538e-06, "loss": 0.0415, "step": 1541 }, { "epoch": 0.5962877030162413, "grad_norm": 0.3422922315048507, "learning_rate": 9.705941314604339e-06, "loss": 0.045, "step": 1542 }, { "epoch": 0.5966744006187161, "grad_norm": 0.4520641145233483, "learning_rate": 9.705180676756469e-06, "loss": 0.04, "step": 1543 }, { "epoch": 0.597061098221191, "grad_norm": 0.32121685322177307, "learning_rate": 9.70441908629593e-06, "loss": 0.042, "step": 1544 }, { "epoch": 0.5974477958236659, "grad_norm": 0.34019946938554535, "learning_rate": 9.703656543376909e-06, "loss": 0.0289, "step": 1545 }, { "epoch": 0.5978344934261408, "grad_norm": 0.5271680124144138, "learning_rate": 9.702893048153797e-06, "loss": 0.0722, "step": 1546 }, { "epoch": 0.5982211910286156, "grad_norm": 0.34219192066560794, "learning_rate": 9.702128600781164e-06, "loss": 0.0413, "step": 1547 }, { "epoch": 0.5986078886310905, "grad_norm": 0.36981316975972817, "learning_rate": 9.701363201413786e-06, "loss": 0.0474, "step": 1548 }, { "epoch": 0.5989945862335654, "grad_norm": 0.47330831243578025, "learning_rate": 9.700596850206626e-06, "loss": 0.0499, "step": 1549 }, { "epoch": 0.5993812838360402, "grad_norm": 0.42559890102750486, "learning_rate": 9.699829547314837e-06, "loss": 0.0512, "step": 1550 }, { "epoch": 0.5997679814385151, "grad_norm": 0.5082289609675045, "learning_rate": 9.69906129289377e-06, "loss": 0.0531, "step": 1551 }, { "epoch": 0.60015467904099, "grad_norm": 0.40820689373580704, "learning_rate": 9.698292087098965e-06, "loss": 0.0352, "step": 1552 }, { "epoch": 0.6005413766434649, "grad_norm": 0.7740723878428828, "learning_rate": 9.697521930086153e-06, "loss": 0.0641, "step": 1553 }, { "epoch": 0.6009280742459396, "grad_norm": 0.4077800259159616, "learning_rate": 9.696750822011264e-06, "loss": 0.0401, "step": 1554 }, { "epoch": 0.6013147718484145, "grad_norm": 0.4282424047962682, "learning_rate": 9.695978763030416e-06, "loss": 0.042, "step": 1555 }, { "epoch": 0.6017014694508894, "grad_norm": 0.9235073485309792, "learning_rate": 9.695205753299919e-06, "loss": 0.0524, "step": 1556 }, { "epoch": 0.6020881670533643, "grad_norm": 0.38483405655801173, "learning_rate": 9.694431792976278e-06, "loss": 0.0417, "step": 1557 }, { "epoch": 0.6024748646558391, "grad_norm": 0.3639987362050531, "learning_rate": 9.693656882216192e-06, "loss": 0.0387, "step": 1558 }, { "epoch": 0.602861562258314, "grad_norm": 0.5783028347296993, "learning_rate": 9.692881021176543e-06, "loss": 0.0824, "step": 1559 }, { "epoch": 0.6032482598607889, "grad_norm": 0.33496387658806503, "learning_rate": 9.692104210014417e-06, "loss": 0.0453, "step": 1560 }, { "epoch": 0.6036349574632637, "grad_norm": 0.34781950587169047, "learning_rate": 9.691326448887087e-06, "loss": 0.0398, "step": 1561 }, { "epoch": 0.6040216550657386, "grad_norm": 0.3475443019950894, "learning_rate": 9.690547737952019e-06, "loss": 0.0545, "step": 1562 }, { "epoch": 0.6044083526682135, "grad_norm": 0.421146753153643, "learning_rate": 9.689768077366869e-06, "loss": 0.0516, "step": 1563 }, { "epoch": 0.6047950502706884, "grad_norm": 0.35530771918589465, "learning_rate": 9.68898746728949e-06, "loss": 0.0473, "step": 1564 }, { "epoch": 0.6051817478731631, "grad_norm": 0.3412887800240831, "learning_rate": 9.688205907877923e-06, "loss": 0.0373, "step": 1565 }, { "epoch": 0.605568445475638, "grad_norm": 0.5137660111198974, "learning_rate": 9.687423399290402e-06, "loss": 0.0544, "step": 1566 }, { "epoch": 0.6059551430781129, "grad_norm": 0.4178467190619986, "learning_rate": 9.686639941685357e-06, "loss": 0.0541, "step": 1567 }, { "epoch": 0.6063418406805878, "grad_norm": 0.3600043070424862, "learning_rate": 9.685855535221403e-06, "loss": 0.0403, "step": 1568 }, { "epoch": 0.6067285382830626, "grad_norm": 0.3315432297732052, "learning_rate": 9.685070180057354e-06, "loss": 0.0316, "step": 1569 }, { "epoch": 0.6071152358855375, "grad_norm": 0.4148331060272119, "learning_rate": 9.684283876352213e-06, "loss": 0.0408, "step": 1570 }, { "epoch": 0.6075019334880124, "grad_norm": 0.3243133012718558, "learning_rate": 9.683496624265177e-06, "loss": 0.0358, "step": 1571 }, { "epoch": 0.6078886310904872, "grad_norm": 0.7689934077929351, "learning_rate": 9.682708423955631e-06, "loss": 0.0509, "step": 1572 }, { "epoch": 0.6082753286929621, "grad_norm": 0.3335337902383706, "learning_rate": 9.681919275583154e-06, "loss": 0.0478, "step": 1573 }, { "epoch": 0.608662026295437, "grad_norm": 0.3359767537025353, "learning_rate": 9.681129179307517e-06, "loss": 0.0423, "step": 1574 }, { "epoch": 0.6090487238979119, "grad_norm": 0.3876140808359027, "learning_rate": 9.680338135288687e-06, "loss": 0.0392, "step": 1575 }, { "epoch": 0.6094354215003867, "grad_norm": 0.32822403671237094, "learning_rate": 9.679546143686815e-06, "loss": 0.0458, "step": 1576 }, { "epoch": 0.6098221191028615, "grad_norm": 0.4710252746761268, "learning_rate": 9.67875320466225e-06, "loss": 0.0436, "step": 1577 }, { "epoch": 0.6102088167053364, "grad_norm": 0.3967446389845192, "learning_rate": 9.67795931837553e-06, "loss": 0.0478, "step": 1578 }, { "epoch": 0.6105955143078113, "grad_norm": 0.34440754540620494, "learning_rate": 9.677164484987385e-06, "loss": 0.0462, "step": 1579 }, { "epoch": 0.6109822119102861, "grad_norm": 0.362549920307705, "learning_rate": 9.676368704658739e-06, "loss": 0.0465, "step": 1580 }, { "epoch": 0.611368909512761, "grad_norm": 0.3759614247979355, "learning_rate": 9.675571977550702e-06, "loss": 0.0498, "step": 1581 }, { "epoch": 0.6117556071152359, "grad_norm": 0.29875045327406513, "learning_rate": 9.674774303824585e-06, "loss": 0.0367, "step": 1582 }, { "epoch": 0.6121423047177108, "grad_norm": 0.43927428721837064, "learning_rate": 9.673975683641883e-06, "loss": 0.0459, "step": 1583 }, { "epoch": 0.6125290023201856, "grad_norm": 0.36001398968647985, "learning_rate": 9.673176117164284e-06, "loss": 0.0582, "step": 1584 }, { "epoch": 0.6129156999226605, "grad_norm": 0.3635786914909202, "learning_rate": 9.672375604553668e-06, "loss": 0.0312, "step": 1585 }, { "epoch": 0.6133023975251354, "grad_norm": 0.34173692979494186, "learning_rate": 9.67157414597211e-06, "loss": 0.0307, "step": 1586 }, { "epoch": 0.6136890951276102, "grad_norm": 0.337970189308892, "learning_rate": 9.670771741581873e-06, "loss": 0.0414, "step": 1587 }, { "epoch": 0.6140757927300851, "grad_norm": 0.5018663295444469, "learning_rate": 9.66996839154541e-06, "loss": 0.0406, "step": 1588 }, { "epoch": 0.61446249033256, "grad_norm": 0.31798530738699027, "learning_rate": 9.669164096025368e-06, "loss": 0.0391, "step": 1589 }, { "epoch": 0.6148491879350348, "grad_norm": 0.40517927081602817, "learning_rate": 9.668358855184586e-06, "loss": 0.0508, "step": 1590 }, { "epoch": 0.6152358855375096, "grad_norm": 0.3588499742301946, "learning_rate": 9.667552669186094e-06, "loss": 0.0458, "step": 1591 }, { "epoch": 0.6156225831399845, "grad_norm": 0.6377662958407174, "learning_rate": 9.666745538193112e-06, "loss": 0.0733, "step": 1592 }, { "epoch": 0.6160092807424594, "grad_norm": 0.41771880581258014, "learning_rate": 9.66593746236905e-06, "loss": 0.0544, "step": 1593 }, { "epoch": 0.6163959783449343, "grad_norm": 0.4358689923078484, "learning_rate": 9.665128441877514e-06, "loss": 0.0733, "step": 1594 }, { "epoch": 0.6167826759474091, "grad_norm": 0.33262234318771194, "learning_rate": 9.664318476882298e-06, "loss": 0.0363, "step": 1595 }, { "epoch": 0.617169373549884, "grad_norm": 0.43986245568557125, "learning_rate": 9.663507567547386e-06, "loss": 0.0425, "step": 1596 }, { "epoch": 0.6175560711523589, "grad_norm": 0.4705855685928936, "learning_rate": 9.662695714036958e-06, "loss": 0.062, "step": 1597 }, { "epoch": 0.6179427687548337, "grad_norm": 0.455658999308194, "learning_rate": 9.661882916515382e-06, "loss": 0.0418, "step": 1598 }, { "epoch": 0.6183294663573086, "grad_norm": 0.41804462143375215, "learning_rate": 9.661069175147213e-06, "loss": 0.0419, "step": 1599 }, { "epoch": 0.6187161639597835, "grad_norm": 0.5805264427845701, "learning_rate": 9.660254490097208e-06, "loss": 0.0483, "step": 1600 }, { "epoch": 0.6191028615622584, "grad_norm": 0.4694753855107609, "learning_rate": 9.6594388615303e-06, "loss": 0.0534, "step": 1601 }, { "epoch": 0.6194895591647331, "grad_norm": 0.31691394154036884, "learning_rate": 9.658622289611631e-06, "loss": 0.0324, "step": 1602 }, { "epoch": 0.619876256767208, "grad_norm": 0.37474179296467675, "learning_rate": 9.657804774506519e-06, "loss": 0.0389, "step": 1603 }, { "epoch": 0.6202629543696829, "grad_norm": 0.5634572491586235, "learning_rate": 9.656986316380476e-06, "loss": 0.0455, "step": 1604 }, { "epoch": 0.6206496519721578, "grad_norm": 0.3287063211409154, "learning_rate": 9.656166915399214e-06, "loss": 0.0404, "step": 1605 }, { "epoch": 0.6210363495746326, "grad_norm": 0.29725276790977834, "learning_rate": 9.655346571728625e-06, "loss": 0.0398, "step": 1606 }, { "epoch": 0.6214230471771075, "grad_norm": 0.3526932201347924, "learning_rate": 9.654525285534797e-06, "loss": 0.0389, "step": 1607 }, { "epoch": 0.6218097447795824, "grad_norm": 0.289761235295998, "learning_rate": 9.653703056984008e-06, "loss": 0.0351, "step": 1608 }, { "epoch": 0.6221964423820572, "grad_norm": 0.35932022046612366, "learning_rate": 9.652879886242725e-06, "loss": 0.0413, "step": 1609 }, { "epoch": 0.6225831399845321, "grad_norm": 0.44924288494699044, "learning_rate": 9.65205577347761e-06, "loss": 0.0649, "step": 1610 }, { "epoch": 0.622969837587007, "grad_norm": 0.3081265414323956, "learning_rate": 9.65123071885551e-06, "loss": 0.0367, "step": 1611 }, { "epoch": 0.6233565351894819, "grad_norm": 0.3119992625713686, "learning_rate": 9.650404722543472e-06, "loss": 0.036, "step": 1612 }, { "epoch": 0.6237432327919566, "grad_norm": 0.5827239099665135, "learning_rate": 9.64957778470872e-06, "loss": 0.056, "step": 1613 }, { "epoch": 0.6241299303944315, "grad_norm": 0.4231854537518781, "learning_rate": 9.648749905518682e-06, "loss": 0.0475, "step": 1614 }, { "epoch": 0.6245166279969064, "grad_norm": 0.5552906681499779, "learning_rate": 9.647921085140967e-06, "loss": 0.0423, "step": 1615 }, { "epoch": 0.6249033255993813, "grad_norm": 0.3698129624973388, "learning_rate": 9.647091323743382e-06, "loss": 0.046, "step": 1616 }, { "epoch": 0.6252900232018561, "grad_norm": 0.7519432522335898, "learning_rate": 9.646260621493916e-06, "loss": 0.0563, "step": 1617 }, { "epoch": 0.625676720804331, "grad_norm": 0.44435600570817874, "learning_rate": 9.645428978560758e-06, "loss": 0.054, "step": 1618 }, { "epoch": 0.6260634184068059, "grad_norm": 0.34324394444368, "learning_rate": 9.644596395112277e-06, "loss": 0.0333, "step": 1619 }, { "epoch": 0.6264501160092807, "grad_norm": 0.5169014339618127, "learning_rate": 9.643762871317047e-06, "loss": 0.0576, "step": 1620 }, { "epoch": 0.6268368136117556, "grad_norm": 0.3614320829676001, "learning_rate": 9.642928407343815e-06, "loss": 0.0525, "step": 1621 }, { "epoch": 0.6272235112142305, "grad_norm": 0.5748562398562245, "learning_rate": 9.64209300336153e-06, "loss": 0.0547, "step": 1622 }, { "epoch": 0.6276102088167054, "grad_norm": 0.5886214118685934, "learning_rate": 9.641256659539328e-06, "loss": 0.0591, "step": 1623 }, { "epoch": 0.6279969064191802, "grad_norm": 0.48218544326210633, "learning_rate": 9.640419376046537e-06, "loss": 0.0509, "step": 1624 }, { "epoch": 0.628383604021655, "grad_norm": 0.3680691879259511, "learning_rate": 9.639581153052673e-06, "loss": 0.0405, "step": 1625 }, { "epoch": 0.62877030162413, "grad_norm": 0.3430296663749092, "learning_rate": 9.638741990727442e-06, "loss": 0.0301, "step": 1626 }, { "epoch": 0.6291569992266048, "grad_norm": 0.37928941823335954, "learning_rate": 9.637901889240742e-06, "loss": 0.0481, "step": 1627 }, { "epoch": 0.6295436968290796, "grad_norm": 0.3174760085775219, "learning_rate": 9.63706084876266e-06, "loss": 0.039, "step": 1628 }, { "epoch": 0.6299303944315545, "grad_norm": 0.5149343498469872, "learning_rate": 9.636218869463471e-06, "loss": 0.0723, "step": 1629 }, { "epoch": 0.6303170920340294, "grad_norm": 0.7872650575076059, "learning_rate": 9.635375951513647e-06, "loss": 0.0555, "step": 1630 }, { "epoch": 0.6307037896365043, "grad_norm": 0.7922351453004514, "learning_rate": 9.634532095083844e-06, "loss": 0.0543, "step": 1631 }, { "epoch": 0.6310904872389791, "grad_norm": 0.4197128440230904, "learning_rate": 9.633687300344907e-06, "loss": 0.0446, "step": 1632 }, { "epoch": 0.631477184841454, "grad_norm": 0.5525811385825178, "learning_rate": 9.632841567467877e-06, "loss": 0.0669, "step": 1633 }, { "epoch": 0.6318638824439289, "grad_norm": 0.39017108750335483, "learning_rate": 9.631994896623979e-06, "loss": 0.0442, "step": 1634 }, { "epoch": 0.6322505800464037, "grad_norm": 0.4405529177603424, "learning_rate": 9.63114728798463e-06, "loss": 0.0416, "step": 1635 }, { "epoch": 0.6326372776488786, "grad_norm": 0.48487009028488387, "learning_rate": 9.630298741721437e-06, "loss": 0.0756, "step": 1636 }, { "epoch": 0.6330239752513535, "grad_norm": 0.37587709440744377, "learning_rate": 9.6294492580062e-06, "loss": 0.0421, "step": 1637 }, { "epoch": 0.6334106728538283, "grad_norm": 0.6757411096228917, "learning_rate": 9.628598837010905e-06, "loss": 0.0541, "step": 1638 }, { "epoch": 0.6337973704563031, "grad_norm": 0.39988572685852003, "learning_rate": 9.627747478907725e-06, "loss": 0.0564, "step": 1639 }, { "epoch": 0.634184068058778, "grad_norm": 0.4312491472453284, "learning_rate": 9.62689518386903e-06, "loss": 0.0558, "step": 1640 }, { "epoch": 0.6345707656612529, "grad_norm": 0.5584584751392906, "learning_rate": 9.626041952067375e-06, "loss": 0.0513, "step": 1641 }, { "epoch": 0.6349574632637278, "grad_norm": 0.4050610162759147, "learning_rate": 9.625187783675505e-06, "loss": 0.0492, "step": 1642 }, { "epoch": 0.6353441608662026, "grad_norm": 0.3934331846078309, "learning_rate": 9.624332678866354e-06, "loss": 0.0455, "step": 1643 }, { "epoch": 0.6357308584686775, "grad_norm": 0.3738708585362167, "learning_rate": 9.623476637813051e-06, "loss": 0.0437, "step": 1644 }, { "epoch": 0.6361175560711524, "grad_norm": 0.3201183810512126, "learning_rate": 9.622619660688906e-06, "loss": 0.0482, "step": 1645 }, { "epoch": 0.6365042536736272, "grad_norm": 0.35345844003376886, "learning_rate": 9.621761747667426e-06, "loss": 0.0333, "step": 1646 }, { "epoch": 0.6368909512761021, "grad_norm": 0.42550874366777747, "learning_rate": 9.620902898922303e-06, "loss": 0.0622, "step": 1647 }, { "epoch": 0.637277648878577, "grad_norm": 0.33544629049854563, "learning_rate": 9.620043114627419e-06, "loss": 0.0356, "step": 1648 }, { "epoch": 0.6376643464810519, "grad_norm": 0.35615738740001096, "learning_rate": 9.619182394956847e-06, "loss": 0.0422, "step": 1649 }, { "epoch": 0.6380510440835266, "grad_norm": 0.48798926004487, "learning_rate": 9.61832074008485e-06, "loss": 0.055, "step": 1650 }, { "epoch": 0.6384377416860015, "grad_norm": 0.3834452220923011, "learning_rate": 9.617458150185878e-06, "loss": 0.0377, "step": 1651 }, { "epoch": 0.6388244392884764, "grad_norm": 0.5126155131214877, "learning_rate": 9.61659462543457e-06, "loss": 0.0588, "step": 1652 }, { "epoch": 0.6392111368909513, "grad_norm": 0.3935738367786806, "learning_rate": 9.61573016600576e-06, "loss": 0.0455, "step": 1653 }, { "epoch": 0.6395978344934261, "grad_norm": 0.2962930633304181, "learning_rate": 9.61486477207446e-06, "loss": 0.0385, "step": 1654 }, { "epoch": 0.639984532095901, "grad_norm": 0.3392637033859755, "learning_rate": 9.613998443815882e-06, "loss": 0.0371, "step": 1655 }, { "epoch": 0.6403712296983759, "grad_norm": 0.5674710647609772, "learning_rate": 9.613131181405422e-06, "loss": 0.073, "step": 1656 }, { "epoch": 0.6407579273008507, "grad_norm": 0.4619729037913427, "learning_rate": 9.612262985018667e-06, "loss": 0.0589, "step": 1657 }, { "epoch": 0.6411446249033256, "grad_norm": 0.3883960662249087, "learning_rate": 9.611393854831395e-06, "loss": 0.0379, "step": 1658 }, { "epoch": 0.6415313225058005, "grad_norm": 0.7029756120877059, "learning_rate": 9.610523791019565e-06, "loss": 0.0668, "step": 1659 }, { "epoch": 0.6419180201082754, "grad_norm": 0.40905150767285453, "learning_rate": 9.609652793759334e-06, "loss": 0.0391, "step": 1660 }, { "epoch": 0.6423047177107502, "grad_norm": 0.4485012026715848, "learning_rate": 9.608780863227044e-06, "loss": 0.0455, "step": 1661 }, { "epoch": 0.642691415313225, "grad_norm": 0.32417143157464695, "learning_rate": 9.607907999599225e-06, "loss": 0.0378, "step": 1662 }, { "epoch": 0.6430781129156999, "grad_norm": 0.6802282250728932, "learning_rate": 9.6070342030526e-06, "loss": 0.056, "step": 1663 }, { "epoch": 0.6434648105181748, "grad_norm": 0.4548659091686903, "learning_rate": 9.606159473764073e-06, "loss": 0.0489, "step": 1664 }, { "epoch": 0.6438515081206496, "grad_norm": 0.6320223312058879, "learning_rate": 9.605283811910748e-06, "loss": 0.0593, "step": 1665 }, { "epoch": 0.6442382057231245, "grad_norm": 0.3743198296127785, "learning_rate": 9.604407217669908e-06, "loss": 0.0451, "step": 1666 }, { "epoch": 0.6446249033255994, "grad_norm": 0.40420096103692493, "learning_rate": 9.60352969121903e-06, "loss": 0.0451, "step": 1667 }, { "epoch": 0.6450116009280742, "grad_norm": 0.5279825140764829, "learning_rate": 9.60265123273578e-06, "loss": 0.0732, "step": 1668 }, { "epoch": 0.6453982985305491, "grad_norm": 0.42159774534949407, "learning_rate": 9.60177184239801e-06, "loss": 0.0468, "step": 1669 }, { "epoch": 0.645784996133024, "grad_norm": 1.1128586393474154, "learning_rate": 9.600891520383758e-06, "loss": 0.0391, "step": 1670 }, { "epoch": 0.6461716937354989, "grad_norm": 0.4400725829745443, "learning_rate": 9.600010266871257e-06, "loss": 0.0447, "step": 1671 }, { "epoch": 0.6465583913379737, "grad_norm": 0.4622011529683878, "learning_rate": 9.599128082038928e-06, "loss": 0.045, "step": 1672 }, { "epoch": 0.6469450889404486, "grad_norm": 0.42425451441520856, "learning_rate": 9.598244966065377e-06, "loss": 0.0413, "step": 1673 }, { "epoch": 0.6473317865429234, "grad_norm": 0.37710116241357605, "learning_rate": 9.5973609191294e-06, "loss": 0.04, "step": 1674 }, { "epoch": 0.6477184841453983, "grad_norm": 0.40646555548724966, "learning_rate": 9.59647594140998e-06, "loss": 0.0477, "step": 1675 }, { "epoch": 0.6481051817478731, "grad_norm": 0.37633701368935074, "learning_rate": 9.595590033086292e-06, "loss": 0.0361, "step": 1676 }, { "epoch": 0.648491879350348, "grad_norm": 0.5585728324622973, "learning_rate": 9.594703194337696e-06, "loss": 0.0466, "step": 1677 }, { "epoch": 0.6488785769528229, "grad_norm": 0.3501646456422594, "learning_rate": 9.593815425343743e-06, "loss": 0.0406, "step": 1678 }, { "epoch": 0.6492652745552978, "grad_norm": 0.4843245873332521, "learning_rate": 9.59292672628417e-06, "loss": 0.0519, "step": 1679 }, { "epoch": 0.6496519721577726, "grad_norm": 0.5742472265614759, "learning_rate": 9.592037097338902e-06, "loss": 0.0583, "step": 1680 }, { "epoch": 0.6500386697602475, "grad_norm": 0.3922693313974978, "learning_rate": 9.591146538688056e-06, "loss": 0.0484, "step": 1681 }, { "epoch": 0.6504253673627224, "grad_norm": 0.38020198647705, "learning_rate": 9.590255050511935e-06, "loss": 0.0474, "step": 1682 }, { "epoch": 0.6508120649651972, "grad_norm": 0.4760482804109816, "learning_rate": 9.589362632991028e-06, "loss": 0.0449, "step": 1683 }, { "epoch": 0.6511987625676721, "grad_norm": 0.46466857599567446, "learning_rate": 9.588469286306015e-06, "loss": 0.0752, "step": 1684 }, { "epoch": 0.651585460170147, "grad_norm": 0.40826556940578207, "learning_rate": 9.587575010637764e-06, "loss": 0.0407, "step": 1685 }, { "epoch": 0.6519721577726219, "grad_norm": 0.43214582775801313, "learning_rate": 9.58667980616733e-06, "loss": 0.0435, "step": 1686 }, { "epoch": 0.6523588553750966, "grad_norm": 0.3299826965144364, "learning_rate": 9.585783673075955e-06, "loss": 0.0375, "step": 1687 }, { "epoch": 0.6527455529775715, "grad_norm": 0.3070802947307931, "learning_rate": 9.584886611545074e-06, "loss": 0.0358, "step": 1688 }, { "epoch": 0.6531322505800464, "grad_norm": 0.2976215791446596, "learning_rate": 9.583988621756304e-06, "loss": 0.0364, "step": 1689 }, { "epoch": 0.6535189481825213, "grad_norm": 0.4810463087211608, "learning_rate": 9.58308970389145e-06, "loss": 0.037, "step": 1690 }, { "epoch": 0.6539056457849961, "grad_norm": 0.41251671225667014, "learning_rate": 9.582189858132514e-06, "loss": 0.0528, "step": 1691 }, { "epoch": 0.654292343387471, "grad_norm": 0.3290016361060707, "learning_rate": 9.58128908466167e-06, "loss": 0.0483, "step": 1692 }, { "epoch": 0.6546790409899459, "grad_norm": 0.47957236159031114, "learning_rate": 9.580387383661298e-06, "loss": 0.0541, "step": 1693 }, { "epoch": 0.6550657385924207, "grad_norm": 0.38872982055988675, "learning_rate": 9.579484755313953e-06, "loss": 0.0474, "step": 1694 }, { "epoch": 0.6554524361948956, "grad_norm": 0.3763880104998817, "learning_rate": 9.578581199802378e-06, "loss": 0.0413, "step": 1695 }, { "epoch": 0.6558391337973705, "grad_norm": 0.40651431352892886, "learning_rate": 9.577676717309512e-06, "loss": 0.0485, "step": 1696 }, { "epoch": 0.6562258313998454, "grad_norm": 0.42808614549058194, "learning_rate": 9.576771308018475e-06, "loss": 0.0519, "step": 1697 }, { "epoch": 0.6566125290023201, "grad_norm": 0.5602620867160116, "learning_rate": 9.575864972112577e-06, "loss": 0.0338, "step": 1698 }, { "epoch": 0.656999226604795, "grad_norm": 0.3824648329238015, "learning_rate": 9.574957709775315e-06, "loss": 0.0485, "step": 1699 }, { "epoch": 0.6573859242072699, "grad_norm": 0.349506273369756, "learning_rate": 9.574049521190373e-06, "loss": 0.0483, "step": 1700 }, { "epoch": 0.6577726218097448, "grad_norm": 0.6860940473470556, "learning_rate": 9.573140406541623e-06, "loss": 0.035, "step": 1701 }, { "epoch": 0.6581593194122196, "grad_norm": 0.5060654074580773, "learning_rate": 9.572230366013126e-06, "loss": 0.0541, "step": 1702 }, { "epoch": 0.6585460170146945, "grad_norm": 0.763847619363347, "learning_rate": 9.571319399789129e-06, "loss": 0.0435, "step": 1703 }, { "epoch": 0.6589327146171694, "grad_norm": 0.35925594969353264, "learning_rate": 9.570407508054065e-06, "loss": 0.04, "step": 1704 }, { "epoch": 0.6593194122196442, "grad_norm": 0.49787671938895595, "learning_rate": 9.569494690992557e-06, "loss": 0.0562, "step": 1705 }, { "epoch": 0.6597061098221191, "grad_norm": 0.36057094303126824, "learning_rate": 9.568580948789416e-06, "loss": 0.0429, "step": 1706 }, { "epoch": 0.660092807424594, "grad_norm": 0.34204097694674857, "learning_rate": 9.567666281629636e-06, "loss": 0.0332, "step": 1707 }, { "epoch": 0.6604795050270689, "grad_norm": 0.49418245275548617, "learning_rate": 9.566750689698401e-06, "loss": 0.0531, "step": 1708 }, { "epoch": 0.6608662026295437, "grad_norm": 0.5392935003588937, "learning_rate": 9.565834173181086e-06, "loss": 0.0651, "step": 1709 }, { "epoch": 0.6612529002320185, "grad_norm": 0.4180567155390562, "learning_rate": 9.564916732263243e-06, "loss": 0.0481, "step": 1710 }, { "epoch": 0.6616395978344934, "grad_norm": 0.33889684096861106, "learning_rate": 9.563998367130622e-06, "loss": 0.0425, "step": 1711 }, { "epoch": 0.6620262954369683, "grad_norm": 0.41597533267560177, "learning_rate": 9.563079077969153e-06, "loss": 0.0421, "step": 1712 }, { "epoch": 0.6624129930394431, "grad_norm": 0.3850984586497909, "learning_rate": 9.562158864964958e-06, "loss": 0.0349, "step": 1713 }, { "epoch": 0.662799690641918, "grad_norm": 0.289173040670742, "learning_rate": 9.561237728304341e-06, "loss": 0.0251, "step": 1714 }, { "epoch": 0.6631863882443929, "grad_norm": 0.41384003064794184, "learning_rate": 9.560315668173799e-06, "loss": 0.055, "step": 1715 }, { "epoch": 0.6635730858468677, "grad_norm": 0.3514354951472323, "learning_rate": 9.55939268476001e-06, "loss": 0.0371, "step": 1716 }, { "epoch": 0.6639597834493426, "grad_norm": 0.37239491308708306, "learning_rate": 9.558468778249841e-06, "loss": 0.0448, "step": 1717 }, { "epoch": 0.6643464810518175, "grad_norm": 0.5199302751104387, "learning_rate": 9.55754394883035e-06, "loss": 0.0462, "step": 1718 }, { "epoch": 0.6647331786542924, "grad_norm": 0.44817108693325985, "learning_rate": 9.556618196688775e-06, "loss": 0.0436, "step": 1719 }, { "epoch": 0.6651198762567672, "grad_norm": 0.46811136639191, "learning_rate": 9.555691522012546e-06, "loss": 0.0453, "step": 1720 }, { "epoch": 0.6655065738592421, "grad_norm": 0.3063460896011617, "learning_rate": 9.554763924989276e-06, "loss": 0.0341, "step": 1721 }, { "epoch": 0.665893271461717, "grad_norm": 0.3611122798419338, "learning_rate": 9.553835405806769e-06, "loss": 0.0457, "step": 1722 }, { "epoch": 0.6662799690641918, "grad_norm": 0.32073908632854087, "learning_rate": 9.552905964653011e-06, "loss": 0.0384, "step": 1723 }, { "epoch": 0.6666666666666666, "grad_norm": 0.4487859634752963, "learning_rate": 9.55197560171618e-06, "loss": 0.0449, "step": 1724 }, { "epoch": 0.6670533642691415, "grad_norm": 0.2928999967442906, "learning_rate": 9.551044317184634e-06, "loss": 0.0444, "step": 1725 }, { "epoch": 0.6674400618716164, "grad_norm": 0.4278560242243563, "learning_rate": 9.550112111246923e-06, "loss": 0.0507, "step": 1726 }, { "epoch": 0.6678267594740913, "grad_norm": 0.3684564598755489, "learning_rate": 9.549178984091784e-06, "loss": 0.047, "step": 1727 }, { "epoch": 0.6682134570765661, "grad_norm": 0.9261232208193899, "learning_rate": 9.548244935908135e-06, "loss": 0.0515, "step": 1728 }, { "epoch": 0.668600154679041, "grad_norm": 0.3566080528512873, "learning_rate": 9.547309966885084e-06, "loss": 0.0422, "step": 1729 }, { "epoch": 0.6689868522815159, "grad_norm": 0.3648162939331308, "learning_rate": 9.546374077211926e-06, "loss": 0.0413, "step": 1730 }, { "epoch": 0.6693735498839907, "grad_norm": 0.25343308952211147, "learning_rate": 9.545437267078143e-06, "loss": 0.0347, "step": 1731 }, { "epoch": 0.6697602474864656, "grad_norm": 0.49387905544930005, "learning_rate": 9.5444995366734e-06, "loss": 0.0398, "step": 1732 }, { "epoch": 0.6701469450889405, "grad_norm": 0.37941393606647095, "learning_rate": 9.54356088618755e-06, "loss": 0.0493, "step": 1733 }, { "epoch": 0.6705336426914154, "grad_norm": 0.5191076196095081, "learning_rate": 9.542621315810634e-06, "loss": 0.0704, "step": 1734 }, { "epoch": 0.6709203402938901, "grad_norm": 0.39278947070758663, "learning_rate": 9.541680825732878e-06, "loss": 0.0494, "step": 1735 }, { "epoch": 0.671307037896365, "grad_norm": 0.3478143046270935, "learning_rate": 9.540739416144692e-06, "loss": 0.0333, "step": 1736 }, { "epoch": 0.6716937354988399, "grad_norm": 0.3001652157220205, "learning_rate": 9.539797087236675e-06, "loss": 0.0421, "step": 1737 }, { "epoch": 0.6720804331013148, "grad_norm": 0.3544491728771151, "learning_rate": 9.538853839199611e-06, "loss": 0.0358, "step": 1738 }, { "epoch": 0.6724671307037896, "grad_norm": 0.3209175748427175, "learning_rate": 9.537909672224473e-06, "loss": 0.0348, "step": 1739 }, { "epoch": 0.6728538283062645, "grad_norm": 0.4997172071966922, "learning_rate": 9.536964586502413e-06, "loss": 0.0562, "step": 1740 }, { "epoch": 0.6732405259087394, "grad_norm": 0.36117697088801526, "learning_rate": 9.536018582224777e-06, "loss": 0.0409, "step": 1741 }, { "epoch": 0.6736272235112142, "grad_norm": 0.478606966403985, "learning_rate": 9.535071659583091e-06, "loss": 0.0529, "step": 1742 }, { "epoch": 0.6740139211136891, "grad_norm": 0.5127526318214235, "learning_rate": 9.534123818769073e-06, "loss": 0.0715, "step": 1743 }, { "epoch": 0.674400618716164, "grad_norm": 0.3410370785305336, "learning_rate": 9.533175059974617e-06, "loss": 0.0495, "step": 1744 }, { "epoch": 0.6747873163186389, "grad_norm": 0.3220141629812554, "learning_rate": 9.532225383391815e-06, "loss": 0.0332, "step": 1745 }, { "epoch": 0.6751740139211136, "grad_norm": 0.4573150742915447, "learning_rate": 9.531274789212937e-06, "loss": 0.0505, "step": 1746 }, { "epoch": 0.6755607115235885, "grad_norm": 0.37579352518725, "learning_rate": 9.530323277630438e-06, "loss": 0.0434, "step": 1747 }, { "epoch": 0.6759474091260634, "grad_norm": 0.512775443297149, "learning_rate": 9.529370848836965e-06, "loss": 0.0632, "step": 1748 }, { "epoch": 0.6763341067285383, "grad_norm": 0.4483025127816612, "learning_rate": 9.528417503025346e-06, "loss": 0.0521, "step": 1749 }, { "epoch": 0.6767208043310131, "grad_norm": 0.48007995795581176, "learning_rate": 9.527463240388594e-06, "loss": 0.0577, "step": 1750 }, { "epoch": 0.677107501933488, "grad_norm": 0.5501250395063059, "learning_rate": 9.526508061119912e-06, "loss": 0.0637, "step": 1751 }, { "epoch": 0.6774941995359629, "grad_norm": 0.4332375792542968, "learning_rate": 9.525551965412685e-06, "loss": 0.0434, "step": 1752 }, { "epoch": 0.6778808971384377, "grad_norm": 0.4419606457428007, "learning_rate": 9.524594953460484e-06, "loss": 0.0499, "step": 1753 }, { "epoch": 0.6782675947409126, "grad_norm": 0.43325837214421803, "learning_rate": 9.523637025457065e-06, "loss": 0.0474, "step": 1754 }, { "epoch": 0.6786542923433875, "grad_norm": 0.32428048285228156, "learning_rate": 9.522678181596374e-06, "loss": 0.0378, "step": 1755 }, { "epoch": 0.6790409899458624, "grad_norm": 0.38125245473525066, "learning_rate": 9.521718422072535e-06, "loss": 0.0488, "step": 1756 }, { "epoch": 0.6794276875483372, "grad_norm": 0.47895983487163707, "learning_rate": 9.520757747079862e-06, "loss": 0.0463, "step": 1757 }, { "epoch": 0.679814385150812, "grad_norm": 0.35923106274931194, "learning_rate": 9.519796156812857e-06, "loss": 0.0369, "step": 1758 }, { "epoch": 0.680201082753287, "grad_norm": 0.46781475987953297, "learning_rate": 9.518833651466202e-06, "loss": 0.0443, "step": 1759 }, { "epoch": 0.6805877803557618, "grad_norm": 0.4156759029814697, "learning_rate": 9.517870231234764e-06, "loss": 0.0403, "step": 1760 }, { "epoch": 0.6809744779582366, "grad_norm": 0.586264028989562, "learning_rate": 9.5169058963136e-06, "loss": 0.0467, "step": 1761 }, { "epoch": 0.6813611755607115, "grad_norm": 0.567566986796573, "learning_rate": 9.515940646897948e-06, "loss": 0.039, "step": 1762 }, { "epoch": 0.6817478731631864, "grad_norm": 0.39086534418457963, "learning_rate": 9.514974483183233e-06, "loss": 0.0446, "step": 1763 }, { "epoch": 0.6821345707656613, "grad_norm": 0.4304995019073666, "learning_rate": 9.514007405365066e-06, "loss": 0.0456, "step": 1764 }, { "epoch": 0.6825212683681361, "grad_norm": 0.3802704917256468, "learning_rate": 9.51303941363924e-06, "loss": 0.0443, "step": 1765 }, { "epoch": 0.682907965970611, "grad_norm": 0.32428427130554593, "learning_rate": 9.512070508201738e-06, "loss": 0.0302, "step": 1766 }, { "epoch": 0.6832946635730859, "grad_norm": 0.5249578314215211, "learning_rate": 9.511100689248723e-06, "loss": 0.0592, "step": 1767 }, { "epoch": 0.6836813611755607, "grad_norm": 0.35872395408075464, "learning_rate": 9.510129956976546e-06, "loss": 0.0435, "step": 1768 }, { "epoch": 0.6840680587780356, "grad_norm": 0.30874224893293195, "learning_rate": 9.50915831158174e-06, "loss": 0.0423, "step": 1769 }, { "epoch": 0.6844547563805105, "grad_norm": 0.3508339748818848, "learning_rate": 9.508185753261025e-06, "loss": 0.0405, "step": 1770 }, { "epoch": 0.6848414539829853, "grad_norm": 0.4215457659045958, "learning_rate": 9.507212282211307e-06, "loss": 0.053, "step": 1771 }, { "epoch": 0.6852281515854601, "grad_norm": 0.40262413242714473, "learning_rate": 9.506237898629675e-06, "loss": 0.05, "step": 1772 }, { "epoch": 0.685614849187935, "grad_norm": 0.45533619816930915, "learning_rate": 9.505262602713402e-06, "loss": 0.0416, "step": 1773 }, { "epoch": 0.6860015467904099, "grad_norm": 0.3601100582794497, "learning_rate": 9.504286394659948e-06, "loss": 0.0501, "step": 1774 }, { "epoch": 0.6863882443928848, "grad_norm": 0.3957644856321168, "learning_rate": 9.503309274666954e-06, "loss": 0.0574, "step": 1775 }, { "epoch": 0.6867749419953596, "grad_norm": 0.6359567915872274, "learning_rate": 9.502331242932251e-06, "loss": 0.0446, "step": 1776 }, { "epoch": 0.6871616395978345, "grad_norm": 0.31532805541186704, "learning_rate": 9.50135229965385e-06, "loss": 0.0323, "step": 1777 }, { "epoch": 0.6875483372003094, "grad_norm": 0.3090695975424638, "learning_rate": 9.50037244502995e-06, "loss": 0.0389, "step": 1778 }, { "epoch": 0.6879350348027842, "grad_norm": 0.38879749867874874, "learning_rate": 9.499391679258932e-06, "loss": 0.0471, "step": 1779 }, { "epoch": 0.6883217324052591, "grad_norm": 0.37329204115375203, "learning_rate": 9.498410002539362e-06, "loss": 0.0513, "step": 1780 }, { "epoch": 0.688708430007734, "grad_norm": 0.4643734884445846, "learning_rate": 9.497427415069989e-06, "loss": 0.052, "step": 1781 }, { "epoch": 0.6890951276102089, "grad_norm": 0.48676338882496106, "learning_rate": 9.49644391704975e-06, "loss": 0.0445, "step": 1782 }, { "epoch": 0.6894818252126836, "grad_norm": 0.7373101170962407, "learning_rate": 9.495459508677763e-06, "loss": 0.06, "step": 1783 }, { "epoch": 0.6898685228151585, "grad_norm": 0.306638918894206, "learning_rate": 9.494474190153333e-06, "loss": 0.0365, "step": 1784 }, { "epoch": 0.6902552204176334, "grad_norm": 0.4671845588904037, "learning_rate": 9.493487961675947e-06, "loss": 0.0605, "step": 1785 }, { "epoch": 0.6906419180201083, "grad_norm": 0.32357205265011957, "learning_rate": 9.49250082344528e-06, "loss": 0.0361, "step": 1786 }, { "epoch": 0.6910286156225831, "grad_norm": 0.40072458939811867, "learning_rate": 9.491512775661185e-06, "loss": 0.035, "step": 1787 }, { "epoch": 0.691415313225058, "grad_norm": 0.8139441774725312, "learning_rate": 9.490523818523703e-06, "loss": 0.0528, "step": 1788 }, { "epoch": 0.6918020108275329, "grad_norm": 0.4916324310537645, "learning_rate": 9.489533952233058e-06, "loss": 0.0363, "step": 1789 }, { "epoch": 0.6921887084300077, "grad_norm": 0.46104648483749383, "learning_rate": 9.488543176989662e-06, "loss": 0.0538, "step": 1790 }, { "epoch": 0.6925754060324826, "grad_norm": 0.26624246525406353, "learning_rate": 9.487551492994105e-06, "loss": 0.0299, "step": 1791 }, { "epoch": 0.6929621036349575, "grad_norm": 0.34888258405054484, "learning_rate": 9.486558900447165e-06, "loss": 0.0379, "step": 1792 }, { "epoch": 0.6933488012374324, "grad_norm": 0.36465870742608736, "learning_rate": 9.4855653995498e-06, "loss": 0.0339, "step": 1793 }, { "epoch": 0.6937354988399071, "grad_norm": 0.551000616819729, "learning_rate": 9.48457099050316e-06, "loss": 0.0673, "step": 1794 }, { "epoch": 0.694122196442382, "grad_norm": 0.38155630054816864, "learning_rate": 9.483575673508567e-06, "loss": 0.0454, "step": 1795 }, { "epoch": 0.6945088940448569, "grad_norm": 0.541001194490977, "learning_rate": 9.48257944876754e-06, "loss": 0.052, "step": 1796 }, { "epoch": 0.6948955916473318, "grad_norm": 0.6203548102774539, "learning_rate": 9.481582316481768e-06, "loss": 0.0564, "step": 1797 }, { "epoch": 0.6952822892498066, "grad_norm": 0.30647203192666883, "learning_rate": 9.480584276853134e-06, "loss": 0.0401, "step": 1798 }, { "epoch": 0.6956689868522815, "grad_norm": 0.44555616840867257, "learning_rate": 9.479585330083702e-06, "loss": 0.0335, "step": 1799 }, { "epoch": 0.6960556844547564, "grad_norm": 0.3809173333396466, "learning_rate": 9.47858547637572e-06, "loss": 0.0466, "step": 1800 }, { "epoch": 0.6964423820572312, "grad_norm": 0.5865363844863943, "learning_rate": 9.477584715931619e-06, "loss": 0.0577, "step": 1801 }, { "epoch": 0.6968290796597061, "grad_norm": 0.428844503668442, "learning_rate": 9.476583048954009e-06, "loss": 0.0444, "step": 1802 }, { "epoch": 0.697215777262181, "grad_norm": 0.45702432839185386, "learning_rate": 9.475580475645691e-06, "loss": 0.0392, "step": 1803 }, { "epoch": 0.6976024748646559, "grad_norm": 0.5051793447904049, "learning_rate": 9.474576996209647e-06, "loss": 0.0584, "step": 1804 }, { "epoch": 0.6979891724671307, "grad_norm": 0.58275924890383, "learning_rate": 9.473572610849042e-06, "loss": 0.057, "step": 1805 }, { "epoch": 0.6983758700696056, "grad_norm": 0.43412332846880736, "learning_rate": 9.47256731976722e-06, "loss": 0.0521, "step": 1806 }, { "epoch": 0.6987625676720804, "grad_norm": 0.24726191297430752, "learning_rate": 9.471561123167719e-06, "loss": 0.0317, "step": 1807 }, { "epoch": 0.6991492652745553, "grad_norm": 0.4778689861934416, "learning_rate": 9.47055402125425e-06, "loss": 0.049, "step": 1808 }, { "epoch": 0.6995359628770301, "grad_norm": 0.34803351271737615, "learning_rate": 9.469546014230711e-06, "loss": 0.0493, "step": 1809 }, { "epoch": 0.699922660479505, "grad_norm": 0.5210055894766181, "learning_rate": 9.468537102301186e-06, "loss": 0.0388, "step": 1810 }, { "epoch": 0.7003093580819799, "grad_norm": 1.8319643561356567, "learning_rate": 9.467527285669938e-06, "loss": 0.0576, "step": 1811 }, { "epoch": 0.7006960556844548, "grad_norm": 0.4500277703546245, "learning_rate": 9.466516564541412e-06, "loss": 0.0472, "step": 1812 }, { "epoch": 0.7010827532869296, "grad_norm": 0.32463616294783787, "learning_rate": 9.465504939120245e-06, "loss": 0.0367, "step": 1813 }, { "epoch": 0.7014694508894045, "grad_norm": 0.44052002455709227, "learning_rate": 9.464492409611247e-06, "loss": 0.0592, "step": 1814 }, { "epoch": 0.7018561484918794, "grad_norm": 0.33572552767105884, "learning_rate": 9.463478976219417e-06, "loss": 0.041, "step": 1815 }, { "epoch": 0.7022428460943542, "grad_norm": 0.4335116054754791, "learning_rate": 9.462464639149935e-06, "loss": 0.0582, "step": 1816 }, { "epoch": 0.7026295436968291, "grad_norm": 0.43204030339364186, "learning_rate": 9.461449398608162e-06, "loss": 0.0338, "step": 1817 }, { "epoch": 0.703016241299304, "grad_norm": 0.500515848376238, "learning_rate": 9.460433254799645e-06, "loss": 0.0512, "step": 1818 }, { "epoch": 0.7034029389017789, "grad_norm": 0.578236166406584, "learning_rate": 9.459416207930114e-06, "loss": 0.037, "step": 1819 }, { "epoch": 0.7037896365042536, "grad_norm": 0.5315095602401135, "learning_rate": 9.458398258205482e-06, "loss": 0.0404, "step": 1820 }, { "epoch": 0.7041763341067285, "grad_norm": 0.4930061190417461, "learning_rate": 9.457379405831838e-06, "loss": 0.0733, "step": 1821 }, { "epoch": 0.7045630317092034, "grad_norm": 0.34625054201216643, "learning_rate": 9.456359651015463e-06, "loss": 0.0391, "step": 1822 }, { "epoch": 0.7049497293116783, "grad_norm": 0.39335172210920216, "learning_rate": 9.455338993962817e-06, "loss": 0.0458, "step": 1823 }, { "epoch": 0.7053364269141531, "grad_norm": 0.44036980289113137, "learning_rate": 9.454317434880544e-06, "loss": 0.0704, "step": 1824 }, { "epoch": 0.705723124516628, "grad_norm": 0.4034666858554019, "learning_rate": 9.453294973975466e-06, "loss": 0.0408, "step": 1825 }, { "epoch": 0.7061098221191029, "grad_norm": 0.37932926648756926, "learning_rate": 9.452271611454594e-06, "loss": 0.0522, "step": 1826 }, { "epoch": 0.7064965197215777, "grad_norm": 0.36906904218101527, "learning_rate": 9.451247347525115e-06, "loss": 0.0504, "step": 1827 }, { "epoch": 0.7068832173240526, "grad_norm": 0.43669582553537956, "learning_rate": 9.450222182394403e-06, "loss": 0.0377, "step": 1828 }, { "epoch": 0.7072699149265275, "grad_norm": 0.4237811973817155, "learning_rate": 9.449196116270014e-06, "loss": 0.0546, "step": 1829 }, { "epoch": 0.7076566125290024, "grad_norm": 0.3652767493355737, "learning_rate": 9.448169149359687e-06, "loss": 0.0548, "step": 1830 }, { "epoch": 0.7080433101314771, "grad_norm": 0.409473336528831, "learning_rate": 9.44714128187134e-06, "loss": 0.0451, "step": 1831 }, { "epoch": 0.708430007733952, "grad_norm": 0.327991896093441, "learning_rate": 9.446112514013074e-06, "loss": 0.04, "step": 1832 }, { "epoch": 0.7088167053364269, "grad_norm": 0.3542254254648809, "learning_rate": 9.445082845993178e-06, "loss": 0.039, "step": 1833 }, { "epoch": 0.7092034029389018, "grad_norm": 0.4035021826435994, "learning_rate": 9.444052278020117e-06, "loss": 0.0398, "step": 1834 }, { "epoch": 0.7095901005413766, "grad_norm": 0.5315240770111241, "learning_rate": 9.443020810302541e-06, "loss": 0.0533, "step": 1835 }, { "epoch": 0.7099767981438515, "grad_norm": 0.25902673603932097, "learning_rate": 9.44198844304928e-06, "loss": 0.0365, "step": 1836 }, { "epoch": 0.7103634957463264, "grad_norm": 0.5022661998869145, "learning_rate": 9.440955176469347e-06, "loss": 0.0616, "step": 1837 }, { "epoch": 0.7107501933488012, "grad_norm": 0.3147168494247597, "learning_rate": 9.439921010771938e-06, "loss": 0.0424, "step": 1838 }, { "epoch": 0.7111368909512761, "grad_norm": 0.37558489016197344, "learning_rate": 9.43888594616643e-06, "loss": 0.0451, "step": 1839 }, { "epoch": 0.711523588553751, "grad_norm": 0.32824278148073704, "learning_rate": 9.437849982862385e-06, "loss": 0.0376, "step": 1840 }, { "epoch": 0.7119102861562259, "grad_norm": 0.5234076486474056, "learning_rate": 9.436813121069543e-06, "loss": 0.0527, "step": 1841 }, { "epoch": 0.7122969837587007, "grad_norm": 0.29847474503059057, "learning_rate": 9.435775360997828e-06, "loss": 0.0456, "step": 1842 }, { "epoch": 0.7126836813611755, "grad_norm": 0.4568049499950976, "learning_rate": 9.434736702857343e-06, "loss": 0.0569, "step": 1843 }, { "epoch": 0.7130703789636504, "grad_norm": 0.642465717621089, "learning_rate": 9.433697146858379e-06, "loss": 0.0472, "step": 1844 }, { "epoch": 0.7134570765661253, "grad_norm": 0.47987737778609907, "learning_rate": 9.432656693211401e-06, "loss": 0.0532, "step": 1845 }, { "epoch": 0.7138437741686001, "grad_norm": 0.38404659291648635, "learning_rate": 9.431615342127062e-06, "loss": 0.0421, "step": 1846 }, { "epoch": 0.714230471771075, "grad_norm": 0.42925215244483045, "learning_rate": 9.430573093816196e-06, "loss": 0.0593, "step": 1847 }, { "epoch": 0.7146171693735499, "grad_norm": 0.4659135986956194, "learning_rate": 9.429529948489813e-06, "loss": 0.0416, "step": 1848 }, { "epoch": 0.7150038669760247, "grad_norm": 0.4306224263323763, "learning_rate": 9.42848590635911e-06, "loss": 0.0541, "step": 1849 }, { "epoch": 0.7153905645784996, "grad_norm": 0.31333192430561857, "learning_rate": 9.427440967635463e-06, "loss": 0.0328, "step": 1850 }, { "epoch": 0.7157772621809745, "grad_norm": 0.5716780240938789, "learning_rate": 9.426395132530436e-06, "loss": 0.0472, "step": 1851 }, { "epoch": 0.7161639597834494, "grad_norm": 0.3676241472411481, "learning_rate": 9.425348401255762e-06, "loss": 0.0586, "step": 1852 }, { "epoch": 0.7165506573859242, "grad_norm": 0.382909076470163, "learning_rate": 9.42430077402337e-06, "loss": 0.0501, "step": 1853 }, { "epoch": 0.7169373549883991, "grad_norm": 0.40771984302885445, "learning_rate": 9.423252251045357e-06, "loss": 0.0449, "step": 1854 }, { "epoch": 0.717324052590874, "grad_norm": 0.2968140681432667, "learning_rate": 9.422202832534008e-06, "loss": 0.0268, "step": 1855 }, { "epoch": 0.7177107501933488, "grad_norm": 0.3750424501778786, "learning_rate": 9.42115251870179e-06, "loss": 0.0433, "step": 1856 }, { "epoch": 0.7180974477958236, "grad_norm": 0.45654789847125693, "learning_rate": 9.42010130976135e-06, "loss": 0.0533, "step": 1857 }, { "epoch": 0.7184841453982985, "grad_norm": 0.4123270832266151, "learning_rate": 9.419049205925519e-06, "loss": 0.0457, "step": 1858 }, { "epoch": 0.7188708430007734, "grad_norm": 0.3910975145046956, "learning_rate": 9.417996207407302e-06, "loss": 0.043, "step": 1859 }, { "epoch": 0.7192575406032483, "grad_norm": 0.3270872160709016, "learning_rate": 9.416942314419892e-06, "loss": 0.0323, "step": 1860 }, { "epoch": 0.7196442382057231, "grad_norm": 0.4055019174143548, "learning_rate": 9.415887527176657e-06, "loss": 0.0575, "step": 1861 }, { "epoch": 0.720030935808198, "grad_norm": 0.38511092484924775, "learning_rate": 9.414831845891155e-06, "loss": 0.0378, "step": 1862 }, { "epoch": 0.7204176334106729, "grad_norm": 0.4158926620699745, "learning_rate": 9.413775270777115e-06, "loss": 0.0436, "step": 1863 }, { "epoch": 0.7208043310131477, "grad_norm": 0.48849360324845975, "learning_rate": 9.412717802048454e-06, "loss": 0.0539, "step": 1864 }, { "epoch": 0.7211910286156226, "grad_norm": 0.44313017749887196, "learning_rate": 9.411659439919268e-06, "loss": 0.0377, "step": 1865 }, { "epoch": 0.7215777262180975, "grad_norm": 0.4264944879625542, "learning_rate": 9.41060018460383e-06, "loss": 0.0585, "step": 1866 }, { "epoch": 0.7219644238205724, "grad_norm": 0.3266997118464345, "learning_rate": 9.409540036316602e-06, "loss": 0.0404, "step": 1867 }, { "epoch": 0.7223511214230471, "grad_norm": 0.3596595282311552, "learning_rate": 9.408478995272219e-06, "loss": 0.0442, "step": 1868 }, { "epoch": 0.722737819025522, "grad_norm": 0.39216044618972956, "learning_rate": 9.407417061685499e-06, "loss": 0.0347, "step": 1869 }, { "epoch": 0.7231245166279969, "grad_norm": 0.3915682997644164, "learning_rate": 9.406354235771444e-06, "loss": 0.043, "step": 1870 }, { "epoch": 0.7235112142304718, "grad_norm": 0.3412823358450297, "learning_rate": 9.405290517745232e-06, "loss": 0.0383, "step": 1871 }, { "epoch": 0.7238979118329466, "grad_norm": 0.3518354926638862, "learning_rate": 9.404225907822226e-06, "loss": 0.0466, "step": 1872 }, { "epoch": 0.7242846094354215, "grad_norm": 0.4544860294561059, "learning_rate": 9.403160406217966e-06, "loss": 0.0491, "step": 1873 }, { "epoch": 0.7246713070378964, "grad_norm": 0.41255780142860243, "learning_rate": 9.402094013148174e-06, "loss": 0.0523, "step": 1874 }, { "epoch": 0.7250580046403712, "grad_norm": 0.37270919959468407, "learning_rate": 9.401026728828752e-06, "loss": 0.0611, "step": 1875 }, { "epoch": 0.7254447022428461, "grad_norm": 0.525817965294853, "learning_rate": 9.399958553475783e-06, "loss": 0.0368, "step": 1876 }, { "epoch": 0.725831399845321, "grad_norm": 0.34427141752651325, "learning_rate": 9.39888948730553e-06, "loss": 0.0416, "step": 1877 }, { "epoch": 0.7262180974477959, "grad_norm": 0.4821771639911883, "learning_rate": 9.397819530534437e-06, "loss": 0.055, "step": 1878 }, { "epoch": 0.7266047950502706, "grad_norm": 0.4546951617427501, "learning_rate": 9.396748683379128e-06, "loss": 0.0486, "step": 1879 }, { "epoch": 0.7269914926527455, "grad_norm": 0.3715922185567698, "learning_rate": 9.395676946056406e-06, "loss": 0.0428, "step": 1880 }, { "epoch": 0.7273781902552204, "grad_norm": 0.5402576742086274, "learning_rate": 9.394604318783258e-06, "loss": 0.0507, "step": 1881 }, { "epoch": 0.7277648878576953, "grad_norm": 0.30662946360129245, "learning_rate": 9.393530801776844e-06, "loss": 0.0459, "step": 1882 }, { "epoch": 0.7281515854601701, "grad_norm": 0.29027552246504723, "learning_rate": 9.392456395254513e-06, "loss": 0.0355, "step": 1883 }, { "epoch": 0.728538283062645, "grad_norm": 0.4235747478521092, "learning_rate": 9.391381099433788e-06, "loss": 0.0434, "step": 1884 }, { "epoch": 0.7289249806651199, "grad_norm": 0.30039311445433686, "learning_rate": 9.390304914532374e-06, "loss": 0.0323, "step": 1885 }, { "epoch": 0.7293116782675947, "grad_norm": 0.2638576244336945, "learning_rate": 9.389227840768156e-06, "loss": 0.0289, "step": 1886 }, { "epoch": 0.7296983758700696, "grad_norm": 0.49107516219371633, "learning_rate": 9.388149878359199e-06, "loss": 0.0406, "step": 1887 }, { "epoch": 0.7300850734725445, "grad_norm": 0.3670024564487179, "learning_rate": 9.387071027523744e-06, "loss": 0.0402, "step": 1888 }, { "epoch": 0.7304717710750194, "grad_norm": 0.36280394070478217, "learning_rate": 9.385991288480221e-06, "loss": 0.0335, "step": 1889 }, { "epoch": 0.7308584686774942, "grad_norm": 0.39494346867868235, "learning_rate": 9.384910661447234e-06, "loss": 0.0484, "step": 1890 }, { "epoch": 0.731245166279969, "grad_norm": 0.27599077050674964, "learning_rate": 9.383829146643562e-06, "loss": 0.0399, "step": 1891 }, { "epoch": 0.731631863882444, "grad_norm": 0.31312112354902405, "learning_rate": 9.382746744288174e-06, "loss": 0.0475, "step": 1892 }, { "epoch": 0.7320185614849188, "grad_norm": 0.4476008170279303, "learning_rate": 9.38166345460021e-06, "loss": 0.0427, "step": 1893 }, { "epoch": 0.7324052590873936, "grad_norm": 0.3935904954676824, "learning_rate": 9.380579277798996e-06, "loss": 0.0515, "step": 1894 }, { "epoch": 0.7327919566898685, "grad_norm": 0.37270486429115324, "learning_rate": 9.379494214104033e-06, "loss": 0.0331, "step": 1895 }, { "epoch": 0.7331786542923434, "grad_norm": 0.39134711062514976, "learning_rate": 9.378408263735006e-06, "loss": 0.0425, "step": 1896 }, { "epoch": 0.7335653518948183, "grad_norm": 0.3167995264455294, "learning_rate": 9.377321426911775e-06, "loss": 0.0414, "step": 1897 }, { "epoch": 0.7339520494972931, "grad_norm": 0.44415943763536897, "learning_rate": 9.37623370385438e-06, "loss": 0.041, "step": 1898 }, { "epoch": 0.734338747099768, "grad_norm": 0.3905224718328915, "learning_rate": 9.375145094783044e-06, "loss": 0.0363, "step": 1899 }, { "epoch": 0.7347254447022429, "grad_norm": 0.475094613716428, "learning_rate": 9.374055599918165e-06, "loss": 0.073, "step": 1900 }, { "epoch": 0.7351121423047177, "grad_norm": 0.6767913936839486, "learning_rate": 9.372965219480327e-06, "loss": 0.065, "step": 1901 }, { "epoch": 0.7354988399071926, "grad_norm": 0.416718149350025, "learning_rate": 9.371873953690284e-06, "loss": 0.0546, "step": 1902 }, { "epoch": 0.7358855375096675, "grad_norm": 0.3682384069155538, "learning_rate": 9.370781802768977e-06, "loss": 0.0565, "step": 1903 }, { "epoch": 0.7362722351121423, "grad_norm": 0.378615764653346, "learning_rate": 9.369688766937522e-06, "loss": 0.0444, "step": 1904 }, { "epoch": 0.7366589327146171, "grad_norm": 0.4795721125658108, "learning_rate": 9.368594846417216e-06, "loss": 0.0664, "step": 1905 }, { "epoch": 0.737045630317092, "grad_norm": 0.37822036543824683, "learning_rate": 9.367500041429535e-06, "loss": 0.04, "step": 1906 }, { "epoch": 0.7374323279195669, "grad_norm": 0.3472056159204241, "learning_rate": 9.366404352196132e-06, "loss": 0.0392, "step": 1907 }, { "epoch": 0.7378190255220418, "grad_norm": 0.27235597369048403, "learning_rate": 9.365307778938841e-06, "loss": 0.0419, "step": 1908 }, { "epoch": 0.7382057231245166, "grad_norm": 0.31737632970880614, "learning_rate": 9.364210321879677e-06, "loss": 0.0421, "step": 1909 }, { "epoch": 0.7385924207269915, "grad_norm": 0.3761392762144653, "learning_rate": 9.363111981240829e-06, "loss": 0.0421, "step": 1910 }, { "epoch": 0.7389791183294664, "grad_norm": 0.45540508319140266, "learning_rate": 9.362012757244669e-06, "loss": 0.0664, "step": 1911 }, { "epoch": 0.7393658159319412, "grad_norm": 0.30329849060220904, "learning_rate": 9.360912650113745e-06, "loss": 0.0282, "step": 1912 }, { "epoch": 0.7397525135344161, "grad_norm": 0.30372014596247654, "learning_rate": 9.359811660070785e-06, "loss": 0.0419, "step": 1913 }, { "epoch": 0.740139211136891, "grad_norm": 0.3452644535945752, "learning_rate": 9.358709787338697e-06, "loss": 0.0402, "step": 1914 }, { "epoch": 0.7405259087393659, "grad_norm": 0.7282801406013995, "learning_rate": 9.357607032140568e-06, "loss": 0.0482, "step": 1915 }, { "epoch": 0.7409126063418406, "grad_norm": 0.3662567565563927, "learning_rate": 9.356503394699658e-06, "loss": 0.048, "step": 1916 }, { "epoch": 0.7412993039443155, "grad_norm": 0.5101098691508079, "learning_rate": 9.355398875239414e-06, "loss": 0.0459, "step": 1917 }, { "epoch": 0.7416860015467904, "grad_norm": 0.25665048603779167, "learning_rate": 9.354293473983454e-06, "loss": 0.0283, "step": 1918 }, { "epoch": 0.7420726991492653, "grad_norm": 0.4372312944281087, "learning_rate": 9.353187191155582e-06, "loss": 0.0416, "step": 1919 }, { "epoch": 0.7424593967517401, "grad_norm": 0.37731143613625856, "learning_rate": 9.352080026979771e-06, "loss": 0.0511, "step": 1920 }, { "epoch": 0.742846094354215, "grad_norm": 0.40393532633895346, "learning_rate": 9.350971981680184e-06, "loss": 0.046, "step": 1921 }, { "epoch": 0.7432327919566899, "grad_norm": 0.6642255283031521, "learning_rate": 9.349863055481152e-06, "loss": 0.057, "step": 1922 }, { "epoch": 0.7436194895591647, "grad_norm": 0.3375720675717647, "learning_rate": 9.34875324860719e-06, "loss": 0.0486, "step": 1923 }, { "epoch": 0.7440061871616396, "grad_norm": 0.4615723075817448, "learning_rate": 9.34764256128299e-06, "loss": 0.0492, "step": 1924 }, { "epoch": 0.7443928847641145, "grad_norm": 0.4011183592012996, "learning_rate": 9.34653099373342e-06, "loss": 0.051, "step": 1925 }, { "epoch": 0.7447795823665894, "grad_norm": 0.3225201548548953, "learning_rate": 9.345418546183532e-06, "loss": 0.0427, "step": 1926 }, { "epoch": 0.7451662799690641, "grad_norm": 0.4552511841417585, "learning_rate": 9.344305218858551e-06, "loss": 0.0527, "step": 1927 }, { "epoch": 0.745552977571539, "grad_norm": 0.25620045194425684, "learning_rate": 9.34319101198388e-06, "loss": 0.0421, "step": 1928 }, { "epoch": 0.7459396751740139, "grad_norm": 0.5933180304000206, "learning_rate": 9.342075925785107e-06, "loss": 0.0409, "step": 1929 }, { "epoch": 0.7463263727764888, "grad_norm": 0.3386122662649574, "learning_rate": 9.340959960487988e-06, "loss": 0.0441, "step": 1930 }, { "epoch": 0.7467130703789636, "grad_norm": 0.2869745665282421, "learning_rate": 9.339843116318462e-06, "loss": 0.0403, "step": 1931 }, { "epoch": 0.7470997679814385, "grad_norm": 0.44127962858321745, "learning_rate": 9.338725393502646e-06, "loss": 0.0472, "step": 1932 }, { "epoch": 0.7474864655839134, "grad_norm": 0.3931369250397279, "learning_rate": 9.337606792266834e-06, "loss": 0.037, "step": 1933 }, { "epoch": 0.7478731631863882, "grad_norm": 0.3005308110760948, "learning_rate": 9.336487312837501e-06, "loss": 0.0352, "step": 1934 }, { "epoch": 0.7482598607888631, "grad_norm": 0.32518027519682463, "learning_rate": 9.335366955441297e-06, "loss": 0.0306, "step": 1935 }, { "epoch": 0.748646558391338, "grad_norm": 0.3569215162141167, "learning_rate": 9.334245720305049e-06, "loss": 0.044, "step": 1936 }, { "epoch": 0.7490332559938129, "grad_norm": 0.3896228416157557, "learning_rate": 9.333123607655761e-06, "loss": 0.0436, "step": 1937 }, { "epoch": 0.7494199535962877, "grad_norm": 0.36279556769633886, "learning_rate": 9.332000617720618e-06, "loss": 0.0433, "step": 1938 }, { "epoch": 0.7498066511987626, "grad_norm": 0.31662315348174225, "learning_rate": 9.33087675072698e-06, "loss": 0.0442, "step": 1939 }, { "epoch": 0.7501933488012374, "grad_norm": 0.5194247845609847, "learning_rate": 9.329752006902389e-06, "loss": 0.0463, "step": 1940 }, { "epoch": 0.7505800464037123, "grad_norm": 0.29530937144500397, "learning_rate": 9.328626386474556e-06, "loss": 0.0414, "step": 1941 }, { "epoch": 0.7509667440061871, "grad_norm": 0.415924180883789, "learning_rate": 9.327499889671377e-06, "loss": 0.0532, "step": 1942 }, { "epoch": 0.751353441608662, "grad_norm": 0.3602690224497562, "learning_rate": 9.326372516720924e-06, "loss": 0.0495, "step": 1943 }, { "epoch": 0.7517401392111369, "grad_norm": 0.32001121405456356, "learning_rate": 9.325244267851442e-06, "loss": 0.0325, "step": 1944 }, { "epoch": 0.7521268368136118, "grad_norm": 0.4468889839003064, "learning_rate": 9.324115143291363e-06, "loss": 0.0372, "step": 1945 }, { "epoch": 0.7525135344160866, "grad_norm": 0.28859079955392075, "learning_rate": 9.322985143269285e-06, "loss": 0.0399, "step": 1946 }, { "epoch": 0.7529002320185615, "grad_norm": 0.3551047126841683, "learning_rate": 9.321854268013988e-06, "loss": 0.0433, "step": 1947 }, { "epoch": 0.7532869296210364, "grad_norm": 0.31613742181238325, "learning_rate": 9.320722517754431e-06, "loss": 0.0331, "step": 1948 }, { "epoch": 0.7536736272235112, "grad_norm": 0.39437395138238257, "learning_rate": 9.319589892719747e-06, "loss": 0.0403, "step": 1949 }, { "epoch": 0.7540603248259861, "grad_norm": 0.3461781698469521, "learning_rate": 9.31845639313925e-06, "loss": 0.0459, "step": 1950 }, { "epoch": 0.754447022428461, "grad_norm": 0.45713646659517015, "learning_rate": 9.317322019242427e-06, "loss": 0.0483, "step": 1951 }, { "epoch": 0.7548337200309359, "grad_norm": 0.5461837894494609, "learning_rate": 9.316186771258946e-06, "loss": 0.0443, "step": 1952 }, { "epoch": 0.7552204176334106, "grad_norm": 0.3164765979397139, "learning_rate": 9.315050649418648e-06, "loss": 0.0397, "step": 1953 }, { "epoch": 0.7556071152358855, "grad_norm": 0.3050102122558607, "learning_rate": 9.313913653951551e-06, "loss": 0.0311, "step": 1954 }, { "epoch": 0.7559938128383604, "grad_norm": 0.45084152014992707, "learning_rate": 9.312775785087856e-06, "loss": 0.0467, "step": 1955 }, { "epoch": 0.7563805104408353, "grad_norm": 0.37031223862695717, "learning_rate": 9.311637043057933e-06, "loss": 0.0657, "step": 1956 }, { "epoch": 0.7567672080433101, "grad_norm": 0.34716959133235253, "learning_rate": 9.31049742809233e-06, "loss": 0.0334, "step": 1957 }, { "epoch": 0.757153905645785, "grad_norm": 0.38195572719487686, "learning_rate": 9.309356940421781e-06, "loss": 0.0345, "step": 1958 }, { "epoch": 0.7575406032482599, "grad_norm": 0.2879010505566296, "learning_rate": 9.308215580277184e-06, "loss": 0.038, "step": 1959 }, { "epoch": 0.7579273008507347, "grad_norm": 0.36532295093447453, "learning_rate": 9.307073347889619e-06, "loss": 0.038, "step": 1960 }, { "epoch": 0.7583139984532096, "grad_norm": 0.347084906844417, "learning_rate": 9.305930243490345e-06, "loss": 0.0393, "step": 1961 }, { "epoch": 0.7587006960556845, "grad_norm": 0.8026621731795484, "learning_rate": 9.304786267310796e-06, "loss": 0.0488, "step": 1962 }, { "epoch": 0.7590873936581594, "grad_norm": 0.2912599791148767, "learning_rate": 9.30364141958258e-06, "loss": 0.0325, "step": 1963 }, { "epoch": 0.7594740912606341, "grad_norm": 0.28827657206659846, "learning_rate": 9.302495700537483e-06, "loss": 0.0289, "step": 1964 }, { "epoch": 0.759860788863109, "grad_norm": 0.37348082665414106, "learning_rate": 9.30134911040747e-06, "loss": 0.0398, "step": 1965 }, { "epoch": 0.7602474864655839, "grad_norm": 0.3538641972746564, "learning_rate": 9.300201649424678e-06, "loss": 0.0372, "step": 1966 }, { "epoch": 0.7606341840680588, "grad_norm": 0.30761148221886264, "learning_rate": 9.299053317821422e-06, "loss": 0.0364, "step": 1967 }, { "epoch": 0.7610208816705336, "grad_norm": 0.3769660947582738, "learning_rate": 9.297904115830196e-06, "loss": 0.0426, "step": 1968 }, { "epoch": 0.7614075792730085, "grad_norm": 0.34550068464668054, "learning_rate": 9.296754043683667e-06, "loss": 0.0408, "step": 1969 }, { "epoch": 0.7617942768754834, "grad_norm": 0.4381017748973492, "learning_rate": 9.295603101614676e-06, "loss": 0.0694, "step": 1970 }, { "epoch": 0.7621809744779582, "grad_norm": 0.34799161397387884, "learning_rate": 9.294451289856248e-06, "loss": 0.03, "step": 1971 }, { "epoch": 0.7625676720804331, "grad_norm": 0.46677919827995734, "learning_rate": 9.293298608641576e-06, "loss": 0.0453, "step": 1972 }, { "epoch": 0.762954369682908, "grad_norm": 0.27677640874230347, "learning_rate": 9.292145058204035e-06, "loss": 0.0385, "step": 1973 }, { "epoch": 0.7633410672853829, "grad_norm": 0.5883755607926787, "learning_rate": 9.29099063877717e-06, "loss": 0.0703, "step": 1974 }, { "epoch": 0.7637277648878577, "grad_norm": 0.38578197770452816, "learning_rate": 9.289835350594709e-06, "loss": 0.0475, "step": 1975 }, { "epoch": 0.7641144624903325, "grad_norm": 0.5755487849108301, "learning_rate": 9.288679193890547e-06, "loss": 0.0392, "step": 1976 }, { "epoch": 0.7645011600928074, "grad_norm": 0.5230389288791049, "learning_rate": 9.287522168898763e-06, "loss": 0.0511, "step": 1977 }, { "epoch": 0.7648878576952823, "grad_norm": 0.4773329152056393, "learning_rate": 9.28636427585361e-06, "loss": 0.0485, "step": 1978 }, { "epoch": 0.7652745552977571, "grad_norm": 0.45092429078393986, "learning_rate": 9.285205514989514e-06, "loss": 0.0402, "step": 1979 }, { "epoch": 0.765661252900232, "grad_norm": 0.4443426174202063, "learning_rate": 9.284045886541077e-06, "loss": 0.0409, "step": 1980 }, { "epoch": 0.7660479505027069, "grad_norm": 0.4444731265947755, "learning_rate": 9.282885390743081e-06, "loss": 0.0367, "step": 1981 }, { "epoch": 0.7664346481051817, "grad_norm": 0.5931738121468442, "learning_rate": 9.281724027830477e-06, "loss": 0.0587, "step": 1982 }, { "epoch": 0.7668213457076566, "grad_norm": 0.5217227911368073, "learning_rate": 9.280561798038395e-06, "loss": 0.0567, "step": 1983 }, { "epoch": 0.7672080433101315, "grad_norm": 0.41342919954387547, "learning_rate": 9.279398701602143e-06, "loss": 0.0469, "step": 1984 }, { "epoch": 0.7675947409126064, "grad_norm": 0.3640357465764493, "learning_rate": 9.278234738757202e-06, "loss": 0.0409, "step": 1985 }, { "epoch": 0.7679814385150812, "grad_norm": 0.33682009963652093, "learning_rate": 9.277069909739226e-06, "loss": 0.039, "step": 1986 }, { "epoch": 0.768368136117556, "grad_norm": 0.26350746834478467, "learning_rate": 9.275904214784048e-06, "loss": 0.039, "step": 1987 }, { "epoch": 0.768754833720031, "grad_norm": 0.3393546725636686, "learning_rate": 9.274737654127674e-06, "loss": 0.0467, "step": 1988 }, { "epoch": 0.7691415313225058, "grad_norm": 0.3854304356681642, "learning_rate": 9.273570228006289e-06, "loss": 0.0347, "step": 1989 }, { "epoch": 0.7695282289249806, "grad_norm": 0.3697482638156762, "learning_rate": 9.272401936656249e-06, "loss": 0.0421, "step": 1990 }, { "epoch": 0.7699149265274555, "grad_norm": 0.4128246009206392, "learning_rate": 9.271232780314084e-06, "loss": 0.0544, "step": 1991 }, { "epoch": 0.7703016241299304, "grad_norm": 0.3196788574798021, "learning_rate": 9.270062759216506e-06, "loss": 0.0391, "step": 1992 }, { "epoch": 0.7706883217324053, "grad_norm": 0.33109847156474187, "learning_rate": 9.268891873600396e-06, "loss": 0.0412, "step": 1993 }, { "epoch": 0.7710750193348801, "grad_norm": 0.37675127728268243, "learning_rate": 9.267720123702812e-06, "loss": 0.052, "step": 1994 }, { "epoch": 0.771461716937355, "grad_norm": 0.33341411657503583, "learning_rate": 9.266547509760988e-06, "loss": 0.036, "step": 1995 }, { "epoch": 0.7718484145398299, "grad_norm": 0.37783184544722287, "learning_rate": 9.26537403201233e-06, "loss": 0.0399, "step": 1996 }, { "epoch": 0.7722351121423047, "grad_norm": 0.4247607164098058, "learning_rate": 9.264199690694422e-06, "loss": 0.0489, "step": 1997 }, { "epoch": 0.7726218097447796, "grad_norm": 0.34699855543601915, "learning_rate": 9.26302448604502e-06, "loss": 0.0395, "step": 1998 }, { "epoch": 0.7730085073472545, "grad_norm": 0.371677838737501, "learning_rate": 9.261848418302059e-06, "loss": 0.0357, "step": 1999 }, { "epoch": 0.7733952049497294, "grad_norm": 0.29671695979753726, "learning_rate": 9.260671487703644e-06, "loss": 0.0325, "step": 2000 }, { "epoch": 0.7737819025522041, "grad_norm": 0.4272548554883361, "learning_rate": 9.25949369448806e-06, "loss": 0.0371, "step": 2001 }, { "epoch": 0.774168600154679, "grad_norm": 0.26567009873722874, "learning_rate": 9.25831503889376e-06, "loss": 0.035, "step": 2002 }, { "epoch": 0.7745552977571539, "grad_norm": 0.47094624746764047, "learning_rate": 9.257135521159375e-06, "loss": 0.0569, "step": 2003 }, { "epoch": 0.7749419953596288, "grad_norm": 0.3730363075068027, "learning_rate": 9.255955141523714e-06, "loss": 0.0443, "step": 2004 }, { "epoch": 0.7753286929621036, "grad_norm": 0.3446902816912548, "learning_rate": 9.254773900225753e-06, "loss": 0.0296, "step": 2005 }, { "epoch": 0.7757153905645785, "grad_norm": 0.45005859230781714, "learning_rate": 9.253591797504648e-06, "loss": 0.0425, "step": 2006 }, { "epoch": 0.7761020881670534, "grad_norm": 0.320744179189913, "learning_rate": 9.25240883359973e-06, "loss": 0.0384, "step": 2007 }, { "epoch": 0.7764887857695282, "grad_norm": 0.4839807759279548, "learning_rate": 9.2512250087505e-06, "loss": 0.0438, "step": 2008 }, { "epoch": 0.7768754833720031, "grad_norm": 0.32776749193978333, "learning_rate": 9.250040323196636e-06, "loss": 0.0373, "step": 2009 }, { "epoch": 0.777262180974478, "grad_norm": 0.37850471014457804, "learning_rate": 9.24885477717799e-06, "loss": 0.0479, "step": 2010 }, { "epoch": 0.7776488785769529, "grad_norm": 0.3575227909259011, "learning_rate": 9.24766837093459e-06, "loss": 0.0449, "step": 2011 }, { "epoch": 0.7780355761794276, "grad_norm": 0.5796170775923972, "learning_rate": 9.246481104706633e-06, "loss": 0.0379, "step": 2012 }, { "epoch": 0.7784222737819025, "grad_norm": 0.4177173105471873, "learning_rate": 9.245292978734496e-06, "loss": 0.0534, "step": 2013 }, { "epoch": 0.7788089713843774, "grad_norm": 0.25166271949478086, "learning_rate": 9.244103993258727e-06, "loss": 0.0335, "step": 2014 }, { "epoch": 0.7791956689868523, "grad_norm": 0.19513970774939177, "learning_rate": 9.242914148520045e-06, "loss": 0.0197, "step": 2015 }, { "epoch": 0.7795823665893271, "grad_norm": 0.4744267815198986, "learning_rate": 9.241723444759351e-06, "loss": 0.0581, "step": 2016 }, { "epoch": 0.779969064191802, "grad_norm": 0.28742784693892853, "learning_rate": 9.240531882217716e-06, "loss": 0.0434, "step": 2017 }, { "epoch": 0.7803557617942769, "grad_norm": 0.5419397627255754, "learning_rate": 9.239339461136378e-06, "loss": 0.0516, "step": 2018 }, { "epoch": 0.7807424593967517, "grad_norm": 0.41895091688638947, "learning_rate": 9.238146181756762e-06, "loss": 0.0358, "step": 2019 }, { "epoch": 0.7811291569992266, "grad_norm": 0.33418194545493696, "learning_rate": 9.236952044320455e-06, "loss": 0.0356, "step": 2020 }, { "epoch": 0.7815158546017015, "grad_norm": 0.3494088211883013, "learning_rate": 9.235757049069225e-06, "loss": 0.0461, "step": 2021 }, { "epoch": 0.7819025522041764, "grad_norm": 0.31823441703622796, "learning_rate": 9.23456119624501e-06, "loss": 0.0412, "step": 2022 }, { "epoch": 0.7822892498066512, "grad_norm": 0.3713039518343329, "learning_rate": 9.233364486089922e-06, "loss": 0.0348, "step": 2023 }, { "epoch": 0.782675947409126, "grad_norm": 0.3583215092436028, "learning_rate": 9.232166918846249e-06, "loss": 0.0497, "step": 2024 }, { "epoch": 0.7830626450116009, "grad_norm": 0.2983640183805472, "learning_rate": 9.230968494756452e-06, "loss": 0.0295, "step": 2025 }, { "epoch": 0.7834493426140758, "grad_norm": 0.3520953259231617, "learning_rate": 9.22976921406316e-06, "loss": 0.04, "step": 2026 }, { "epoch": 0.7838360402165506, "grad_norm": 0.45471481501166433, "learning_rate": 9.228569077009186e-06, "loss": 0.0555, "step": 2027 }, { "epoch": 0.7842227378190255, "grad_norm": 0.31289325869727275, "learning_rate": 9.227368083837505e-06, "loss": 0.0312, "step": 2028 }, { "epoch": 0.7846094354215004, "grad_norm": 0.34666861500371127, "learning_rate": 9.226166234791274e-06, "loss": 0.0502, "step": 2029 }, { "epoch": 0.7849961330239753, "grad_norm": 0.3715021140854643, "learning_rate": 9.224963530113818e-06, "loss": 0.0376, "step": 2030 }, { "epoch": 0.7853828306264501, "grad_norm": 0.33610125400528074, "learning_rate": 9.223759970048636e-06, "loss": 0.0408, "step": 2031 }, { "epoch": 0.785769528228925, "grad_norm": 0.28850823601076303, "learning_rate": 9.222555554839404e-06, "loss": 0.0278, "step": 2032 }, { "epoch": 0.7861562258313999, "grad_norm": 0.3863186661643496, "learning_rate": 9.221350284729967e-06, "loss": 0.0431, "step": 2033 }, { "epoch": 0.7865429234338747, "grad_norm": 0.3759850493045158, "learning_rate": 9.220144159964341e-06, "loss": 0.0358, "step": 2034 }, { "epoch": 0.7869296210363496, "grad_norm": 0.35559764055397364, "learning_rate": 9.218937180786726e-06, "loss": 0.0506, "step": 2035 }, { "epoch": 0.7873163186388245, "grad_norm": 0.3206411890687523, "learning_rate": 9.217729347441483e-06, "loss": 0.0294, "step": 2036 }, { "epoch": 0.7877030162412993, "grad_norm": 0.42880178526752893, "learning_rate": 9.216520660173148e-06, "loss": 0.0524, "step": 2037 }, { "epoch": 0.7880897138437741, "grad_norm": 0.2734754385163893, "learning_rate": 9.215311119226436e-06, "loss": 0.0309, "step": 2038 }, { "epoch": 0.788476411446249, "grad_norm": 0.3192418924340534, "learning_rate": 9.214100724846231e-06, "loss": 0.0376, "step": 2039 }, { "epoch": 0.7888631090487239, "grad_norm": 0.36546713533160613, "learning_rate": 9.21288947727759e-06, "loss": 0.043, "step": 2040 }, { "epoch": 0.7892498066511988, "grad_norm": 0.27920088176646174, "learning_rate": 9.21167737676574e-06, "loss": 0.0397, "step": 2041 }, { "epoch": 0.7896365042536736, "grad_norm": 0.3529461349069079, "learning_rate": 9.210464423556083e-06, "loss": 0.0351, "step": 2042 }, { "epoch": 0.7900232018561485, "grad_norm": 0.2691027971707512, "learning_rate": 9.209250617894199e-06, "loss": 0.031, "step": 2043 }, { "epoch": 0.7904098994586234, "grad_norm": 0.33465015780976015, "learning_rate": 9.208035960025833e-06, "loss": 0.0361, "step": 2044 }, { "epoch": 0.7907965970610982, "grad_norm": 0.2550126803570466, "learning_rate": 9.206820450196903e-06, "loss": 0.0308, "step": 2045 }, { "epoch": 0.7911832946635731, "grad_norm": 0.3378480012627455, "learning_rate": 9.205604088653502e-06, "loss": 0.0315, "step": 2046 }, { "epoch": 0.791569992266048, "grad_norm": 0.3947089187281523, "learning_rate": 9.204386875641898e-06, "loss": 0.0453, "step": 2047 }, { "epoch": 0.7919566898685229, "grad_norm": 0.37034754261319264, "learning_rate": 9.203168811408526e-06, "loss": 0.0424, "step": 2048 }, { "epoch": 0.7923433874709976, "grad_norm": 0.40820646859394444, "learning_rate": 9.201949896199998e-06, "loss": 0.0483, "step": 2049 }, { "epoch": 0.7927300850734725, "grad_norm": 0.46394163612782485, "learning_rate": 9.200730130263093e-06, "loss": 0.0487, "step": 2050 }, { "epoch": 0.7931167826759474, "grad_norm": 0.4005011482335009, "learning_rate": 9.199509513844767e-06, "loss": 0.0564, "step": 2051 }, { "epoch": 0.7935034802784223, "grad_norm": 0.4561291558618522, "learning_rate": 9.198288047192146e-06, "loss": 0.0489, "step": 2052 }, { "epoch": 0.7938901778808971, "grad_norm": 0.3598790133478172, "learning_rate": 9.197065730552528e-06, "loss": 0.0364, "step": 2053 }, { "epoch": 0.794276875483372, "grad_norm": 0.28990252331989397, "learning_rate": 9.195842564173385e-06, "loss": 0.0339, "step": 2054 }, { "epoch": 0.7946635730858469, "grad_norm": 0.4476991570334629, "learning_rate": 9.194618548302361e-06, "loss": 0.042, "step": 2055 }, { "epoch": 0.7950502706883217, "grad_norm": 0.3397666509525285, "learning_rate": 9.193393683187266e-06, "loss": 0.0369, "step": 2056 }, { "epoch": 0.7954369682907966, "grad_norm": 0.42253137408441926, "learning_rate": 9.192167969076092e-06, "loss": 0.0361, "step": 2057 }, { "epoch": 0.7958236658932715, "grad_norm": 0.3765118956473476, "learning_rate": 9.190941406216994e-06, "loss": 0.0391, "step": 2058 }, { "epoch": 0.7962103634957464, "grad_norm": 0.512157539061835, "learning_rate": 9.189713994858303e-06, "loss": 0.0531, "step": 2059 }, { "epoch": 0.7965970610982211, "grad_norm": 0.5218466437526283, "learning_rate": 9.188485735248523e-06, "loss": 0.058, "step": 2060 }, { "epoch": 0.796983758700696, "grad_norm": 0.30558270016798506, "learning_rate": 9.187256627636325e-06, "loss": 0.0312, "step": 2061 }, { "epoch": 0.7973704563031709, "grad_norm": 0.32316717014265667, "learning_rate": 9.186026672270556e-06, "loss": 0.0311, "step": 2062 }, { "epoch": 0.7977571539056458, "grad_norm": 0.34437183615378336, "learning_rate": 9.184795869400235e-06, "loss": 0.0424, "step": 2063 }, { "epoch": 0.7981438515081206, "grad_norm": 0.38253840212282414, "learning_rate": 9.183564219274547e-06, "loss": 0.046, "step": 2064 }, { "epoch": 0.7985305491105955, "grad_norm": 0.2673745424385019, "learning_rate": 9.182331722142857e-06, "loss": 0.0325, "step": 2065 }, { "epoch": 0.7989172467130704, "grad_norm": 0.6507714976015732, "learning_rate": 9.181098378254694e-06, "loss": 0.0417, "step": 2066 }, { "epoch": 0.7993039443155452, "grad_norm": 0.30962627647736607, "learning_rate": 9.179864187859761e-06, "loss": 0.0353, "step": 2067 }, { "epoch": 0.7996906419180201, "grad_norm": 0.6084566003943205, "learning_rate": 9.178629151207934e-06, "loss": 0.0483, "step": 2068 }, { "epoch": 0.800077339520495, "grad_norm": 0.39169418730785566, "learning_rate": 9.177393268549257e-06, "loss": 0.0344, "step": 2069 }, { "epoch": 0.8004640371229699, "grad_norm": 0.40312949083775734, "learning_rate": 9.176156540133949e-06, "loss": 0.0444, "step": 2070 }, { "epoch": 0.8008507347254447, "grad_norm": 0.465872284880614, "learning_rate": 9.174918966212399e-06, "loss": 0.0467, "step": 2071 }, { "epoch": 0.8012374323279196, "grad_norm": 0.36147068296530194, "learning_rate": 9.173680547035167e-06, "loss": 0.0532, "step": 2072 }, { "epoch": 0.8016241299303944, "grad_norm": 0.36598276292799425, "learning_rate": 9.17244128285298e-06, "loss": 0.0387, "step": 2073 }, { "epoch": 0.8020108275328693, "grad_norm": 0.3403415720702072, "learning_rate": 9.171201173916742e-06, "loss": 0.0353, "step": 2074 }, { "epoch": 0.8023975251353441, "grad_norm": 0.3486619604475852, "learning_rate": 9.169960220477529e-06, "loss": 0.0371, "step": 2075 }, { "epoch": 0.802784222737819, "grad_norm": 1.0081650790632883, "learning_rate": 9.16871842278658e-06, "loss": 0.0437, "step": 2076 }, { "epoch": 0.8031709203402939, "grad_norm": 0.3272699828739487, "learning_rate": 9.167475781095315e-06, "loss": 0.0491, "step": 2077 }, { "epoch": 0.8035576179427688, "grad_norm": 0.33338352424977036, "learning_rate": 9.166232295655313e-06, "loss": 0.0354, "step": 2078 }, { "epoch": 0.8039443155452436, "grad_norm": 0.5060367392638123, "learning_rate": 9.164987966718337e-06, "loss": 0.0427, "step": 2079 }, { "epoch": 0.8043310131477185, "grad_norm": 0.97856272802287, "learning_rate": 9.163742794536312e-06, "loss": 0.0457, "step": 2080 }, { "epoch": 0.8047177107501934, "grad_norm": 0.4225304193609868, "learning_rate": 9.162496779361337e-06, "loss": 0.0538, "step": 2081 }, { "epoch": 0.8051044083526682, "grad_norm": 0.541321785696796, "learning_rate": 9.161249921445676e-06, "loss": 0.0545, "step": 2082 }, { "epoch": 0.8054911059551431, "grad_norm": 0.33457150189525287, "learning_rate": 9.160002221041773e-06, "loss": 0.0367, "step": 2083 }, { "epoch": 0.805877803557618, "grad_norm": 0.45254192995037185, "learning_rate": 9.158753678402238e-06, "loss": 0.0438, "step": 2084 }, { "epoch": 0.8062645011600929, "grad_norm": 0.3996070683704904, "learning_rate": 9.157504293779848e-06, "loss": 0.043, "step": 2085 }, { "epoch": 0.8066511987625676, "grad_norm": 0.44488718026183927, "learning_rate": 9.156254067427558e-06, "loss": 0.051, "step": 2086 }, { "epoch": 0.8070378963650425, "grad_norm": 0.3148495614865018, "learning_rate": 9.155002999598486e-06, "loss": 0.0311, "step": 2087 }, { "epoch": 0.8074245939675174, "grad_norm": 0.425978376660224, "learning_rate": 9.153751090545926e-06, "loss": 0.0362, "step": 2088 }, { "epoch": 0.8078112915699923, "grad_norm": 1.3287169207355949, "learning_rate": 9.152498340523339e-06, "loss": 0.0617, "step": 2089 }, { "epoch": 0.8081979891724671, "grad_norm": 0.3301967160746616, "learning_rate": 9.151244749784356e-06, "loss": 0.0426, "step": 2090 }, { "epoch": 0.808584686774942, "grad_norm": 0.3596689498290518, "learning_rate": 9.149990318582781e-06, "loss": 0.0448, "step": 2091 }, { "epoch": 0.8089713843774169, "grad_norm": 0.4578045998043632, "learning_rate": 9.148735047172586e-06, "loss": 0.0654, "step": 2092 }, { "epoch": 0.8093580819798917, "grad_norm": 0.3875789834405183, "learning_rate": 9.147478935807916e-06, "loss": 0.0373, "step": 2093 }, { "epoch": 0.8097447795823666, "grad_norm": 0.4411591218240339, "learning_rate": 9.14622198474308e-06, "loss": 0.057, "step": 2094 }, { "epoch": 0.8101314771848415, "grad_norm": 0.4477779996650148, "learning_rate": 9.144964194232562e-06, "loss": 0.0322, "step": 2095 }, { "epoch": 0.8105181747873164, "grad_norm": 0.41360989750066135, "learning_rate": 9.143705564531017e-06, "loss": 0.0399, "step": 2096 }, { "epoch": 0.8109048723897911, "grad_norm": 0.31578115734589557, "learning_rate": 9.142446095893266e-06, "loss": 0.032, "step": 2097 }, { "epoch": 0.811291569992266, "grad_norm": 0.4191533168498602, "learning_rate": 9.1411857885743e-06, "loss": 0.0466, "step": 2098 }, { "epoch": 0.8116782675947409, "grad_norm": 0.3992180988293265, "learning_rate": 9.139924642829283e-06, "loss": 0.0313, "step": 2099 }, { "epoch": 0.8120649651972158, "grad_norm": 0.4351266981606623, "learning_rate": 9.138662658913549e-06, "loss": 0.0517, "step": 2100 }, { "epoch": 0.8124516627996906, "grad_norm": 0.37675070533886085, "learning_rate": 9.137399837082594e-06, "loss": 0.0401, "step": 2101 }, { "epoch": 0.8128383604021655, "grad_norm": 0.34642803256219595, "learning_rate": 9.136136177592096e-06, "loss": 0.0432, "step": 2102 }, { "epoch": 0.8132250580046404, "grad_norm": 0.3460148893125311, "learning_rate": 9.134871680697893e-06, "loss": 0.0472, "step": 2103 }, { "epoch": 0.8136117556071152, "grad_norm": 0.4371557802381087, "learning_rate": 9.133606346655992e-06, "loss": 0.0527, "step": 2104 }, { "epoch": 0.8139984532095901, "grad_norm": 0.38443523181105493, "learning_rate": 9.13234017572258e-06, "loss": 0.0415, "step": 2105 }, { "epoch": 0.814385150812065, "grad_norm": 0.33115313920210865, "learning_rate": 9.131073168154001e-06, "loss": 0.0395, "step": 2106 }, { "epoch": 0.8147718484145399, "grad_norm": 0.2755475878664706, "learning_rate": 9.129805324206777e-06, "loss": 0.0399, "step": 2107 }, { "epoch": 0.8151585460170147, "grad_norm": 0.28547060272422686, "learning_rate": 9.128536644137593e-06, "loss": 0.0322, "step": 2108 }, { "epoch": 0.8155452436194895, "grad_norm": 0.3684990960380758, "learning_rate": 9.127267128203312e-06, "loss": 0.0424, "step": 2109 }, { "epoch": 0.8159319412219644, "grad_norm": 0.2649442395915846, "learning_rate": 9.125996776660953e-06, "loss": 0.0304, "step": 2110 }, { "epoch": 0.8163186388244393, "grad_norm": 0.34431431694464987, "learning_rate": 9.124725589767717e-06, "loss": 0.0362, "step": 2111 }, { "epoch": 0.8167053364269141, "grad_norm": 0.2881601176217222, "learning_rate": 9.12345356778097e-06, "loss": 0.0405, "step": 2112 }, { "epoch": 0.817092034029389, "grad_norm": 0.47567426061488466, "learning_rate": 9.122180710958242e-06, "loss": 0.0506, "step": 2113 }, { "epoch": 0.8174787316318639, "grad_norm": 0.41743067751267077, "learning_rate": 9.120907019557239e-06, "loss": 0.0422, "step": 2114 }, { "epoch": 0.8178654292343387, "grad_norm": 0.2973640312603333, "learning_rate": 9.119632493835832e-06, "loss": 0.0383, "step": 2115 }, { "epoch": 0.8182521268368136, "grad_norm": 0.3531091001904011, "learning_rate": 9.11835713405206e-06, "loss": 0.0373, "step": 2116 }, { "epoch": 0.8186388244392885, "grad_norm": 0.4552287639691133, "learning_rate": 9.117080940464139e-06, "loss": 0.0524, "step": 2117 }, { "epoch": 0.8190255220417634, "grad_norm": 0.5225819688893499, "learning_rate": 9.11580391333044e-06, "loss": 0.0437, "step": 2118 }, { "epoch": 0.8194122196442382, "grad_norm": 0.46729510625190385, "learning_rate": 9.114526052909515e-06, "loss": 0.0409, "step": 2119 }, { "epoch": 0.819798917246713, "grad_norm": 0.5879739477767872, "learning_rate": 9.11324735946008e-06, "loss": 0.0525, "step": 2120 }, { "epoch": 0.820185614849188, "grad_norm": 0.6039281296010306, "learning_rate": 9.111967833241016e-06, "loss": 0.0566, "step": 2121 }, { "epoch": 0.8205723124516628, "grad_norm": 0.38969187365389946, "learning_rate": 9.110687474511381e-06, "loss": 0.0488, "step": 2122 }, { "epoch": 0.8209590100541376, "grad_norm": 0.3352756088149255, "learning_rate": 9.109406283530396e-06, "loss": 0.0407, "step": 2123 }, { "epoch": 0.8213457076566125, "grad_norm": 0.3507033670865344, "learning_rate": 9.108124260557449e-06, "loss": 0.033, "step": 2124 }, { "epoch": 0.8217324052590874, "grad_norm": 0.4507972711056142, "learning_rate": 9.1068414058521e-06, "loss": 0.0469, "step": 2125 }, { "epoch": 0.8221191028615623, "grad_norm": 0.31060941669221487, "learning_rate": 9.105557719674077e-06, "loss": 0.0348, "step": 2126 }, { "epoch": 0.8225058004640371, "grad_norm": 0.3654328532911747, "learning_rate": 9.104273202283275e-06, "loss": 0.0334, "step": 2127 }, { "epoch": 0.822892498066512, "grad_norm": 0.41327829433341173, "learning_rate": 9.102987853939759e-06, "loss": 0.046, "step": 2128 }, { "epoch": 0.8232791956689869, "grad_norm": 0.4276852774031115, "learning_rate": 9.10170167490376e-06, "loss": 0.0347, "step": 2129 }, { "epoch": 0.8236658932714617, "grad_norm": 0.4573793544667996, "learning_rate": 9.100414665435677e-06, "loss": 0.0435, "step": 2130 }, { "epoch": 0.8240525908739366, "grad_norm": 0.4750213265668397, "learning_rate": 9.099126825796083e-06, "loss": 0.0592, "step": 2131 }, { "epoch": 0.8244392884764115, "grad_norm": 0.34247594663451103, "learning_rate": 9.097838156245708e-06, "loss": 0.0398, "step": 2132 }, { "epoch": 0.8248259860788864, "grad_norm": 0.3611144466288654, "learning_rate": 9.096548657045463e-06, "loss": 0.0406, "step": 2133 }, { "epoch": 0.8252126836813611, "grad_norm": 0.3236300816729297, "learning_rate": 9.095258328456415e-06, "loss": 0.037, "step": 2134 }, { "epoch": 0.825599381283836, "grad_norm": 0.4565259090823549, "learning_rate": 9.093967170739809e-06, "loss": 0.053, "step": 2135 }, { "epoch": 0.8259860788863109, "grad_norm": 0.5057970524462408, "learning_rate": 9.092675184157048e-06, "loss": 0.0528, "step": 2136 }, { "epoch": 0.8263727764887858, "grad_norm": 0.3456970270875201, "learning_rate": 9.091382368969712e-06, "loss": 0.0421, "step": 2137 }, { "epoch": 0.8267594740912606, "grad_norm": 0.3813927609134159, "learning_rate": 9.090088725439545e-06, "loss": 0.0407, "step": 2138 }, { "epoch": 0.8271461716937355, "grad_norm": 0.35832474385919116, "learning_rate": 9.08879425382846e-06, "loss": 0.0399, "step": 2139 }, { "epoch": 0.8275328692962104, "grad_norm": 0.6972363389917994, "learning_rate": 9.08749895439853e-06, "loss": 0.0531, "step": 2140 }, { "epoch": 0.8279195668986852, "grad_norm": 0.45100571054798944, "learning_rate": 9.086202827412007e-06, "loss": 0.0457, "step": 2141 }, { "epoch": 0.8283062645011601, "grad_norm": 0.34160421795562257, "learning_rate": 9.084905873131305e-06, "loss": 0.0343, "step": 2142 }, { "epoch": 0.828692962103635, "grad_norm": 0.4765158436943409, "learning_rate": 9.083608091819002e-06, "loss": 0.0428, "step": 2143 }, { "epoch": 0.8290796597061099, "grad_norm": 0.47975878937155286, "learning_rate": 9.082309483737854e-06, "loss": 0.0382, "step": 2144 }, { "epoch": 0.8294663573085846, "grad_norm": 0.3688777738042674, "learning_rate": 9.081010049150771e-06, "loss": 0.0453, "step": 2145 }, { "epoch": 0.8298530549110595, "grad_norm": 0.6378318211484849, "learning_rate": 9.07970978832084e-06, "loss": 0.0444, "step": 2146 }, { "epoch": 0.8302397525135344, "grad_norm": 0.3281376485203529, "learning_rate": 9.078408701511314e-06, "loss": 0.034, "step": 2147 }, { "epoch": 0.8306264501160093, "grad_norm": 0.39071964051772673, "learning_rate": 9.07710678898561e-06, "loss": 0.0359, "step": 2148 }, { "epoch": 0.8310131477184841, "grad_norm": 0.5831501811439268, "learning_rate": 9.07580405100731e-06, "loss": 0.0624, "step": 2149 }, { "epoch": 0.831399845320959, "grad_norm": 0.38189361428208934, "learning_rate": 9.074500487840174e-06, "loss": 0.0379, "step": 2150 }, { "epoch": 0.8317865429234339, "grad_norm": 0.355592814334192, "learning_rate": 9.073196099748117e-06, "loss": 0.0551, "step": 2151 }, { "epoch": 0.8321732405259087, "grad_norm": 0.38654344675885094, "learning_rate": 9.071890886995228e-06, "loss": 0.0489, "step": 2152 }, { "epoch": 0.8325599381283836, "grad_norm": 0.3714242299211017, "learning_rate": 9.070584849845758e-06, "loss": 0.0531, "step": 2153 }, { "epoch": 0.8329466357308585, "grad_norm": 0.46204146433585125, "learning_rate": 9.06927798856413e-06, "loss": 0.0555, "step": 2154 }, { "epoch": 0.8333333333333334, "grad_norm": 0.3572295939361985, "learning_rate": 9.067970303414933e-06, "loss": 0.0452, "step": 2155 }, { "epoch": 0.8337200309358082, "grad_norm": 0.4318358169617031, "learning_rate": 9.066661794662917e-06, "loss": 0.0575, "step": 2156 }, { "epoch": 0.834106728538283, "grad_norm": 0.28130338641975333, "learning_rate": 9.065352462573007e-06, "loss": 0.0232, "step": 2157 }, { "epoch": 0.8344934261407579, "grad_norm": 0.43296211346417407, "learning_rate": 9.06404230741029e-06, "loss": 0.0473, "step": 2158 }, { "epoch": 0.8348801237432328, "grad_norm": 0.5841168502190871, "learning_rate": 9.062731329440023e-06, "loss": 0.0449, "step": 2159 }, { "epoch": 0.8352668213457076, "grad_norm": 0.4231124662429941, "learning_rate": 9.06141952892762e-06, "loss": 0.0516, "step": 2160 }, { "epoch": 0.8356535189481825, "grad_norm": 0.3835444733899793, "learning_rate": 9.060106906138676e-06, "loss": 0.0436, "step": 2161 }, { "epoch": 0.8360402165506574, "grad_norm": 0.317127029571776, "learning_rate": 9.058793461338942e-06, "loss": 0.0363, "step": 2162 }, { "epoch": 0.8364269141531323, "grad_norm": 0.5333019369331304, "learning_rate": 9.057479194794337e-06, "loss": 0.055, "step": 2163 }, { "epoch": 0.8368136117556071, "grad_norm": 0.3364783112805771, "learning_rate": 9.05616410677095e-06, "loss": 0.0401, "step": 2164 }, { "epoch": 0.837200309358082, "grad_norm": 0.3761408716070129, "learning_rate": 9.054848197535035e-06, "loss": 0.0434, "step": 2165 }, { "epoch": 0.8375870069605569, "grad_norm": 0.7420992601840545, "learning_rate": 9.05353146735301e-06, "loss": 0.0332, "step": 2166 }, { "epoch": 0.8379737045630317, "grad_norm": 0.36240694380307464, "learning_rate": 9.052213916491462e-06, "loss": 0.0497, "step": 2167 }, { "epoch": 0.8383604021655066, "grad_norm": 0.4318529927365121, "learning_rate": 9.05089554521714e-06, "loss": 0.0524, "step": 2168 }, { "epoch": 0.8387470997679815, "grad_norm": 0.3764979520127407, "learning_rate": 9.049576353796965e-06, "loss": 0.0388, "step": 2169 }, { "epoch": 0.8391337973704563, "grad_norm": 0.6738916078717975, "learning_rate": 9.048256342498019e-06, "loss": 0.0469, "step": 2170 }, { "epoch": 0.8395204949729311, "grad_norm": 0.5268988442306334, "learning_rate": 9.046935511587553e-06, "loss": 0.0508, "step": 2171 }, { "epoch": 0.839907192575406, "grad_norm": 0.32239999504987793, "learning_rate": 9.045613861332982e-06, "loss": 0.039, "step": 2172 }, { "epoch": 0.8402938901778809, "grad_norm": 0.3180217718423391, "learning_rate": 9.044291392001889e-06, "loss": 0.0351, "step": 2173 }, { "epoch": 0.8406805877803558, "grad_norm": 0.27750749505815975, "learning_rate": 9.042968103862022e-06, "loss": 0.0382, "step": 2174 }, { "epoch": 0.8410672853828306, "grad_norm": 0.3738044214704607, "learning_rate": 9.041643997181293e-06, "loss": 0.0413, "step": 2175 }, { "epoch": 0.8414539829853055, "grad_norm": 0.2789478079445284, "learning_rate": 9.040319072227783e-06, "loss": 0.0367, "step": 2176 }, { "epoch": 0.8418406805877804, "grad_norm": 0.3798985225318177, "learning_rate": 9.038993329269733e-06, "loss": 0.0417, "step": 2177 }, { "epoch": 0.8422273781902552, "grad_norm": 0.3409164145282661, "learning_rate": 9.037666768575559e-06, "loss": 0.0374, "step": 2178 }, { "epoch": 0.8426140757927301, "grad_norm": 0.32895824067807283, "learning_rate": 9.036339390413833e-06, "loss": 0.0426, "step": 2179 }, { "epoch": 0.843000773395205, "grad_norm": 0.39054006400835994, "learning_rate": 9.035011195053296e-06, "loss": 0.0454, "step": 2180 }, { "epoch": 0.8433874709976799, "grad_norm": 0.8226983949693456, "learning_rate": 9.033682182762858e-06, "loss": 0.0478, "step": 2181 }, { "epoch": 0.8437741686001546, "grad_norm": 0.3446821531008199, "learning_rate": 9.032352353811588e-06, "loss": 0.0444, "step": 2182 }, { "epoch": 0.8441608662026295, "grad_norm": 0.42351589080069485, "learning_rate": 9.031021708468728e-06, "loss": 0.0437, "step": 2183 }, { "epoch": 0.8445475638051044, "grad_norm": 0.3164726264631792, "learning_rate": 9.029690247003676e-06, "loss": 0.0398, "step": 2184 }, { "epoch": 0.8449342614075793, "grad_norm": 0.4510634188622118, "learning_rate": 9.028357969686004e-06, "loss": 0.0422, "step": 2185 }, { "epoch": 0.8453209590100541, "grad_norm": 0.4548543808653387, "learning_rate": 9.027024876785445e-06, "loss": 0.0435, "step": 2186 }, { "epoch": 0.845707656612529, "grad_norm": 0.35619796476963445, "learning_rate": 9.025690968571892e-06, "loss": 0.0389, "step": 2187 }, { "epoch": 0.8460943542150039, "grad_norm": 0.2668269274237047, "learning_rate": 9.024356245315418e-06, "loss": 0.0371, "step": 2188 }, { "epoch": 0.8464810518174787, "grad_norm": 0.29238078898254627, "learning_rate": 9.023020707286243e-06, "loss": 0.0382, "step": 2189 }, { "epoch": 0.8468677494199536, "grad_norm": 0.2767543147238385, "learning_rate": 9.021684354754766e-06, "loss": 0.0301, "step": 2190 }, { "epoch": 0.8472544470224285, "grad_norm": 0.3066503245586736, "learning_rate": 9.020347187991544e-06, "loss": 0.0332, "step": 2191 }, { "epoch": 0.8476411446249034, "grad_norm": 0.266492566237681, "learning_rate": 9.019009207267298e-06, "loss": 0.0347, "step": 2192 }, { "epoch": 0.8480278422273781, "grad_norm": 0.38636066541332104, "learning_rate": 9.01767041285292e-06, "loss": 0.0447, "step": 2193 }, { "epoch": 0.848414539829853, "grad_norm": 0.44678269253744823, "learning_rate": 9.016330805019458e-06, "loss": 0.041, "step": 2194 }, { "epoch": 0.8488012374323279, "grad_norm": 0.32854273177483595, "learning_rate": 9.014990384038133e-06, "loss": 0.0389, "step": 2195 }, { "epoch": 0.8491879350348028, "grad_norm": 0.527885053213165, "learning_rate": 9.013649150180328e-06, "loss": 0.0499, "step": 2196 }, { "epoch": 0.8495746326372776, "grad_norm": 0.2733254711303735, "learning_rate": 9.012307103717588e-06, "loss": 0.0333, "step": 2197 }, { "epoch": 0.8499613302397525, "grad_norm": 0.5295780761157406, "learning_rate": 9.010964244921625e-06, "loss": 0.0423, "step": 2198 }, { "epoch": 0.8503480278422274, "grad_norm": 0.299488533161423, "learning_rate": 9.009620574064312e-06, "loss": 0.0341, "step": 2199 }, { "epoch": 0.8507347254447022, "grad_norm": 0.32154749323203713, "learning_rate": 9.008276091417692e-06, "loss": 0.0368, "step": 2200 }, { "epoch": 0.8511214230471771, "grad_norm": 0.34259246377289543, "learning_rate": 9.00693079725397e-06, "loss": 0.0426, "step": 2201 }, { "epoch": 0.851508120649652, "grad_norm": 0.28882203819450986, "learning_rate": 9.005584691845512e-06, "loss": 0.0359, "step": 2202 }, { "epoch": 0.8518948182521269, "grad_norm": 0.27889225075644986, "learning_rate": 9.004237775464852e-06, "loss": 0.0282, "step": 2203 }, { "epoch": 0.8522815158546017, "grad_norm": 0.27855517668822005, "learning_rate": 9.00289004838469e-06, "loss": 0.0331, "step": 2204 }, { "epoch": 0.8526682134570766, "grad_norm": 0.3646414742558235, "learning_rate": 9.001541510877882e-06, "loss": 0.0331, "step": 2205 }, { "epoch": 0.8530549110595514, "grad_norm": 0.33814139429068335, "learning_rate": 9.000192163217457e-06, "loss": 0.0329, "step": 2206 }, { "epoch": 0.8534416086620263, "grad_norm": 0.31857073936351865, "learning_rate": 8.998842005676604e-06, "loss": 0.0334, "step": 2207 }, { "epoch": 0.8538283062645011, "grad_norm": 0.36343413504884475, "learning_rate": 8.997491038528678e-06, "loss": 0.0345, "step": 2208 }, { "epoch": 0.854215003866976, "grad_norm": 0.4677461683707757, "learning_rate": 8.996139262047193e-06, "loss": 0.0478, "step": 2209 }, { "epoch": 0.8546017014694509, "grad_norm": 0.4690870351666983, "learning_rate": 8.99478667650583e-06, "loss": 0.0402, "step": 2210 }, { "epoch": 0.8549883990719258, "grad_norm": 0.4691671986003197, "learning_rate": 8.99343328217844e-06, "loss": 0.0502, "step": 2211 }, { "epoch": 0.8553750966744006, "grad_norm": 0.5230484730093937, "learning_rate": 8.992079079339024e-06, "loss": 0.0522, "step": 2212 }, { "epoch": 0.8557617942768755, "grad_norm": 0.3279607690866833, "learning_rate": 8.990724068261761e-06, "loss": 0.0461, "step": 2213 }, { "epoch": 0.8561484918793504, "grad_norm": 0.40490781689083416, "learning_rate": 8.989368249220983e-06, "loss": 0.0538, "step": 2214 }, { "epoch": 0.8565351894818252, "grad_norm": 0.2784750410973289, "learning_rate": 8.988011622491189e-06, "loss": 0.0299, "step": 2215 }, { "epoch": 0.8569218870843001, "grad_norm": 0.3633472511991136, "learning_rate": 8.986654188347043e-06, "loss": 0.0314, "step": 2216 }, { "epoch": 0.857308584686775, "grad_norm": 0.4555114075600702, "learning_rate": 8.985295947063374e-06, "loss": 0.045, "step": 2217 }, { "epoch": 0.8576952822892498, "grad_norm": 0.3212225037130318, "learning_rate": 8.983936898915171e-06, "loss": 0.0345, "step": 2218 }, { "epoch": 0.8580819798917246, "grad_norm": 0.551233723632404, "learning_rate": 8.982577044177586e-06, "loss": 0.0406, "step": 2219 }, { "epoch": 0.8584686774941995, "grad_norm": 0.34033053547406183, "learning_rate": 8.981216383125936e-06, "loss": 0.0397, "step": 2220 }, { "epoch": 0.8588553750966744, "grad_norm": 0.3402659975235014, "learning_rate": 8.979854916035705e-06, "loss": 0.045, "step": 2221 }, { "epoch": 0.8592420726991493, "grad_norm": 0.493022585163963, "learning_rate": 8.978492643182527e-06, "loss": 0.0786, "step": 2222 }, { "epoch": 0.8596287703016241, "grad_norm": 0.3288692397992617, "learning_rate": 8.977129564842217e-06, "loss": 0.0335, "step": 2223 }, { "epoch": 0.860015467904099, "grad_norm": 0.4145124858808473, "learning_rate": 8.975765681290742e-06, "loss": 0.0602, "step": 2224 }, { "epoch": 0.8604021655065739, "grad_norm": 0.25062179316372174, "learning_rate": 8.974400992804233e-06, "loss": 0.027, "step": 2225 }, { "epoch": 0.8607888631090487, "grad_norm": 0.4395898430061998, "learning_rate": 8.973035499658985e-06, "loss": 0.045, "step": 2226 }, { "epoch": 0.8611755607115236, "grad_norm": 0.2928437011685637, "learning_rate": 8.971669202131458e-06, "loss": 0.0377, "step": 2227 }, { "epoch": 0.8615622583139985, "grad_norm": 0.3379985516649998, "learning_rate": 8.970302100498272e-06, "loss": 0.0354, "step": 2228 }, { "epoch": 0.8619489559164734, "grad_norm": 0.5486163725192793, "learning_rate": 8.968934195036212e-06, "loss": 0.0398, "step": 2229 }, { "epoch": 0.8623356535189481, "grad_norm": 0.425587243556553, "learning_rate": 8.967565486022224e-06, "loss": 0.0451, "step": 2230 }, { "epoch": 0.862722351121423, "grad_norm": 0.37745582556512586, "learning_rate": 8.966195973733418e-06, "loss": 0.0462, "step": 2231 }, { "epoch": 0.8631090487238979, "grad_norm": 0.31746458164110275, "learning_rate": 8.964825658447063e-06, "loss": 0.0386, "step": 2232 }, { "epoch": 0.8634957463263728, "grad_norm": 0.2237253007847635, "learning_rate": 8.963454540440597e-06, "loss": 0.0253, "step": 2233 }, { "epoch": 0.8638824439288476, "grad_norm": 0.37518064786239136, "learning_rate": 8.962082619991616e-06, "loss": 0.0367, "step": 2234 }, { "epoch": 0.8642691415313225, "grad_norm": 0.39924113697027475, "learning_rate": 8.96070989737788e-06, "loss": 0.0349, "step": 2235 }, { "epoch": 0.8646558391337974, "grad_norm": 0.3359825909573115, "learning_rate": 8.959336372877311e-06, "loss": 0.0362, "step": 2236 }, { "epoch": 0.8650425367362722, "grad_norm": 0.3598714510054823, "learning_rate": 8.957962046767991e-06, "loss": 0.0374, "step": 2237 }, { "epoch": 0.8654292343387471, "grad_norm": 0.3662319934795733, "learning_rate": 8.95658691932817e-06, "loss": 0.0436, "step": 2238 }, { "epoch": 0.865815931941222, "grad_norm": 0.3295597820094079, "learning_rate": 8.955210990836254e-06, "loss": 0.0452, "step": 2239 }, { "epoch": 0.8662026295436969, "grad_norm": 0.3224639204597179, "learning_rate": 8.953834261570815e-06, "loss": 0.0409, "step": 2240 }, { "epoch": 0.8665893271461717, "grad_norm": 0.659465589133375, "learning_rate": 8.952456731810585e-06, "loss": 0.0419, "step": 2241 }, { "epoch": 0.8669760247486465, "grad_norm": 0.4500966911461079, "learning_rate": 8.951078401834463e-06, "loss": 0.0404, "step": 2242 }, { "epoch": 0.8673627223511214, "grad_norm": 0.5539510655732663, "learning_rate": 8.949699271921502e-06, "loss": 0.0466, "step": 2243 }, { "epoch": 0.8677494199535963, "grad_norm": 0.36938370384055635, "learning_rate": 8.948319342350924e-06, "loss": 0.042, "step": 2244 }, { "epoch": 0.8681361175560711, "grad_norm": 0.39361269789137215, "learning_rate": 8.946938613402109e-06, "loss": 0.0342, "step": 2245 }, { "epoch": 0.868522815158546, "grad_norm": 0.646337391902557, "learning_rate": 8.945557085354599e-06, "loss": 0.0279, "step": 2246 }, { "epoch": 0.8689095127610209, "grad_norm": 0.3959222464657782, "learning_rate": 8.9441747584881e-06, "loss": 0.0405, "step": 2247 }, { "epoch": 0.8692962103634957, "grad_norm": 0.366828910052707, "learning_rate": 8.942791633082478e-06, "loss": 0.0333, "step": 2248 }, { "epoch": 0.8696829079659706, "grad_norm": 0.454043053064041, "learning_rate": 8.94140770941776e-06, "loss": 0.0435, "step": 2249 }, { "epoch": 0.8700696055684455, "grad_norm": 0.3374843756565058, "learning_rate": 8.940022987774137e-06, "loss": 0.0332, "step": 2250 }, { "epoch": 0.8704563031709204, "grad_norm": 0.42512731110915136, "learning_rate": 8.938637468431962e-06, "loss": 0.0378, "step": 2251 }, { "epoch": 0.8708430007733952, "grad_norm": 0.4357719309847745, "learning_rate": 8.937251151671743e-06, "loss": 0.0377, "step": 2252 }, { "epoch": 0.87122969837587, "grad_norm": 0.3935200211264893, "learning_rate": 8.935864037774158e-06, "loss": 0.0461, "step": 2253 }, { "epoch": 0.871616395978345, "grad_norm": 0.23708321384826297, "learning_rate": 8.934476127020043e-06, "loss": 0.0225, "step": 2254 }, { "epoch": 0.8720030935808198, "grad_norm": 0.3374633191336888, "learning_rate": 8.933087419690392e-06, "loss": 0.0293, "step": 2255 }, { "epoch": 0.8723897911832946, "grad_norm": 0.49457602458616323, "learning_rate": 8.931697916066364e-06, "loss": 0.0534, "step": 2256 }, { "epoch": 0.8727764887857695, "grad_norm": 0.3610230121477349, "learning_rate": 8.930307616429282e-06, "loss": 0.0334, "step": 2257 }, { "epoch": 0.8731631863882444, "grad_norm": 0.2762614058216952, "learning_rate": 8.92891652106062e-06, "loss": 0.0351, "step": 2258 }, { "epoch": 0.8735498839907193, "grad_norm": 0.3615440538503273, "learning_rate": 8.927524630242026e-06, "loss": 0.0396, "step": 2259 }, { "epoch": 0.8739365815931941, "grad_norm": 0.46720745581228706, "learning_rate": 8.9261319442553e-06, "loss": 0.041, "step": 2260 }, { "epoch": 0.874323279195669, "grad_norm": 0.33592088070988474, "learning_rate": 8.924738463382405e-06, "loss": 0.0324, "step": 2261 }, { "epoch": 0.8747099767981439, "grad_norm": 0.3720415541835371, "learning_rate": 8.923344187905467e-06, "loss": 0.0412, "step": 2262 }, { "epoch": 0.8750966744006187, "grad_norm": 0.36703820189325675, "learning_rate": 8.921949118106772e-06, "loss": 0.0465, "step": 2263 }, { "epoch": 0.8754833720030936, "grad_norm": 0.4702744757251938, "learning_rate": 8.920553254268766e-06, "loss": 0.0306, "step": 2264 }, { "epoch": 0.8758700696055685, "grad_norm": 0.3536305184539658, "learning_rate": 8.919156596674054e-06, "loss": 0.042, "step": 2265 }, { "epoch": 0.8762567672080434, "grad_norm": 0.449346949981791, "learning_rate": 8.917759145605407e-06, "loss": 0.0468, "step": 2266 }, { "epoch": 0.8766434648105181, "grad_norm": 0.41028925486584233, "learning_rate": 8.91636090134575e-06, "loss": 0.0474, "step": 2267 }, { "epoch": 0.877030162412993, "grad_norm": 0.604974857198001, "learning_rate": 8.914961864178175e-06, "loss": 0.0471, "step": 2268 }, { "epoch": 0.8774168600154679, "grad_norm": 0.6393131408515618, "learning_rate": 8.913562034385933e-06, "loss": 0.0558, "step": 2269 }, { "epoch": 0.8778035576179428, "grad_norm": 0.4259577024915279, "learning_rate": 8.912161412252429e-06, "loss": 0.0423, "step": 2270 }, { "epoch": 0.8781902552204176, "grad_norm": 0.4974261007561685, "learning_rate": 8.910759998061236e-06, "loss": 0.045, "step": 2271 }, { "epoch": 0.8785769528228925, "grad_norm": 0.4135795849252917, "learning_rate": 8.909357792096087e-06, "loss": 0.0372, "step": 2272 }, { "epoch": 0.8789636504253674, "grad_norm": 0.38599457120137004, "learning_rate": 8.907954794640869e-06, "loss": 0.0385, "step": 2273 }, { "epoch": 0.8793503480278422, "grad_norm": 0.49824293065047615, "learning_rate": 8.906551005979636e-06, "loss": 0.0666, "step": 2274 }, { "epoch": 0.8797370456303171, "grad_norm": 0.6521570352410346, "learning_rate": 8.9051464263966e-06, "loss": 0.043, "step": 2275 }, { "epoch": 0.880123743232792, "grad_norm": 0.6838488142977158, "learning_rate": 8.903741056176131e-06, "loss": 0.0387, "step": 2276 }, { "epoch": 0.8805104408352669, "grad_norm": 0.44069682509438973, "learning_rate": 8.902334895602763e-06, "loss": 0.0585, "step": 2277 }, { "epoch": 0.8808971384377416, "grad_norm": 0.3517554271640157, "learning_rate": 8.900927944961186e-06, "loss": 0.0385, "step": 2278 }, { "epoch": 0.8812838360402165, "grad_norm": 0.4260751612094906, "learning_rate": 8.899520204536252e-06, "loss": 0.058, "step": 2279 }, { "epoch": 0.8816705336426914, "grad_norm": 0.3349140275323176, "learning_rate": 8.898111674612973e-06, "loss": 0.0381, "step": 2280 }, { "epoch": 0.8820572312451663, "grad_norm": 0.2920814265072497, "learning_rate": 8.89670235547652e-06, "loss": 0.0408, "step": 2281 }, { "epoch": 0.8824439288476411, "grad_norm": 0.43580167488677296, "learning_rate": 8.895292247412226e-06, "loss": 0.0549, "step": 2282 }, { "epoch": 0.882830626450116, "grad_norm": 0.3045468407713868, "learning_rate": 8.89388135070558e-06, "loss": 0.0322, "step": 2283 }, { "epoch": 0.8832173240525909, "grad_norm": 0.4860943656323564, "learning_rate": 8.892469665642234e-06, "loss": 0.0549, "step": 2284 }, { "epoch": 0.8836040216550657, "grad_norm": 0.3259517556742216, "learning_rate": 8.891057192507996e-06, "loss": 0.0389, "step": 2285 }, { "epoch": 0.8839907192575406, "grad_norm": 0.36482410010240024, "learning_rate": 8.889643931588838e-06, "loss": 0.0417, "step": 2286 }, { "epoch": 0.8843774168600155, "grad_norm": 0.39742413179426705, "learning_rate": 8.88822988317089e-06, "loss": 0.0455, "step": 2287 }, { "epoch": 0.8847641144624904, "grad_norm": 1.1827893227734276, "learning_rate": 8.88681504754044e-06, "loss": 0.03, "step": 2288 }, { "epoch": 0.8851508120649652, "grad_norm": 0.4038185036546568, "learning_rate": 8.885399424983933e-06, "loss": 0.0348, "step": 2289 }, { "epoch": 0.88553750966744, "grad_norm": 0.4132688190627585, "learning_rate": 8.883983015787983e-06, "loss": 0.0333, "step": 2290 }, { "epoch": 0.8859242072699149, "grad_norm": 0.46743490405756366, "learning_rate": 8.88256582023935e-06, "loss": 0.0509, "step": 2291 }, { "epoch": 0.8863109048723898, "grad_norm": 0.4662085110625304, "learning_rate": 8.881147838624965e-06, "loss": 0.0541, "step": 2292 }, { "epoch": 0.8866976024748646, "grad_norm": 0.56864496759269, "learning_rate": 8.87972907123191e-06, "loss": 0.0463, "step": 2293 }, { "epoch": 0.8870843000773395, "grad_norm": 0.3802848868106582, "learning_rate": 8.878309518347432e-06, "loss": 0.0333, "step": 2294 }, { "epoch": 0.8874709976798144, "grad_norm": 0.390261120329278, "learning_rate": 8.876889180258931e-06, "loss": 0.0356, "step": 2295 }, { "epoch": 0.8878576952822892, "grad_norm": 0.3969409133918033, "learning_rate": 8.875468057253971e-06, "loss": 0.0547, "step": 2296 }, { "epoch": 0.8882443928847641, "grad_norm": 0.496403355432098, "learning_rate": 8.874046149620272e-06, "loss": 0.0372, "step": 2297 }, { "epoch": 0.888631090487239, "grad_norm": 0.4299041533579138, "learning_rate": 8.872623457645716e-06, "loss": 0.0444, "step": 2298 }, { "epoch": 0.8890177880897139, "grad_norm": 0.5469758887118007, "learning_rate": 8.87119998161834e-06, "loss": 0.0532, "step": 2299 }, { "epoch": 0.8894044856921887, "grad_norm": 0.3262154800688651, "learning_rate": 8.869775721826342e-06, "loss": 0.0385, "step": 2300 }, { "epoch": 0.8897911832946636, "grad_norm": 0.4635380520057581, "learning_rate": 8.86835067855808e-06, "loss": 0.0439, "step": 2301 }, { "epoch": 0.8901778808971385, "grad_norm": 0.3821290798166801, "learning_rate": 8.866924852102064e-06, "loss": 0.0404, "step": 2302 }, { "epoch": 0.8905645784996133, "grad_norm": 0.3836598756779588, "learning_rate": 8.865498242746972e-06, "loss": 0.042, "step": 2303 }, { "epoch": 0.8909512761020881, "grad_norm": 0.4562000523862278, "learning_rate": 8.864070850781634e-06, "loss": 0.0309, "step": 2304 }, { "epoch": 0.891337973704563, "grad_norm": 0.4165173792805896, "learning_rate": 8.86264267649504e-06, "loss": 0.0402, "step": 2305 }, { "epoch": 0.8917246713070379, "grad_norm": 0.3039134535884858, "learning_rate": 8.861213720176341e-06, "loss": 0.028, "step": 2306 }, { "epoch": 0.8921113689095128, "grad_norm": 0.4101596719315101, "learning_rate": 8.85978398211484e-06, "loss": 0.0392, "step": 2307 }, { "epoch": 0.8924980665119876, "grad_norm": 0.5405031105986156, "learning_rate": 8.858353462600007e-06, "loss": 0.0439, "step": 2308 }, { "epoch": 0.8928847641144625, "grad_norm": 0.34710187196729403, "learning_rate": 8.856922161921461e-06, "loss": 0.0498, "step": 2309 }, { "epoch": 0.8932714617169374, "grad_norm": 0.5195060729341497, "learning_rate": 8.855490080368988e-06, "loss": 0.0427, "step": 2310 }, { "epoch": 0.8936581593194122, "grad_norm": 0.37660942607513725, "learning_rate": 8.854057218232524e-06, "loss": 0.0464, "step": 2311 }, { "epoch": 0.8940448569218871, "grad_norm": 0.3178777854413134, "learning_rate": 8.85262357580217e-06, "loss": 0.042, "step": 2312 }, { "epoch": 0.894431554524362, "grad_norm": 0.513234518181234, "learning_rate": 8.851189153368179e-06, "loss": 0.0447, "step": 2313 }, { "epoch": 0.8948182521268369, "grad_norm": 0.6363707616398515, "learning_rate": 8.849753951220965e-06, "loss": 0.0394, "step": 2314 }, { "epoch": 0.8952049497293116, "grad_norm": 0.3156003598026051, "learning_rate": 8.848317969651103e-06, "loss": 0.042, "step": 2315 }, { "epoch": 0.8955916473317865, "grad_norm": 0.3809284778438872, "learning_rate": 8.846881208949318e-06, "loss": 0.045, "step": 2316 }, { "epoch": 0.8959783449342614, "grad_norm": 0.5225988530670238, "learning_rate": 8.845443669406499e-06, "loss": 0.0378, "step": 2317 }, { "epoch": 0.8963650425367363, "grad_norm": 0.3855042404127461, "learning_rate": 8.844005351313692e-06, "loss": 0.0382, "step": 2318 }, { "epoch": 0.8967517401392111, "grad_norm": 0.3547346898253072, "learning_rate": 8.842566254962097e-06, "loss": 0.0378, "step": 2319 }, { "epoch": 0.897138437741686, "grad_norm": 0.3355996773461769, "learning_rate": 8.841126380643076e-06, "loss": 0.0401, "step": 2320 }, { "epoch": 0.8975251353441609, "grad_norm": 0.38778592625250924, "learning_rate": 8.839685728648145e-06, "loss": 0.0317, "step": 2321 }, { "epoch": 0.8979118329466357, "grad_norm": 0.4448811893673452, "learning_rate": 8.838244299268981e-06, "loss": 0.0387, "step": 2322 }, { "epoch": 0.8982985305491106, "grad_norm": 0.3898590694455895, "learning_rate": 8.836802092797415e-06, "loss": 0.0429, "step": 2323 }, { "epoch": 0.8986852281515855, "grad_norm": 0.41682867740669827, "learning_rate": 8.835359109525434e-06, "loss": 0.0497, "step": 2324 }, { "epoch": 0.8990719257540604, "grad_norm": 0.504839936572292, "learning_rate": 8.83391534974519e-06, "loss": 0.0534, "step": 2325 }, { "epoch": 0.8994586233565351, "grad_norm": 0.4558598211848119, "learning_rate": 8.832470813748983e-06, "loss": 0.047, "step": 2326 }, { "epoch": 0.89984532095901, "grad_norm": 0.34222965426501367, "learning_rate": 8.831025501829278e-06, "loss": 0.045, "step": 2327 }, { "epoch": 0.9002320185614849, "grad_norm": 0.4032099704530826, "learning_rate": 8.829579414278692e-06, "loss": 0.0409, "step": 2328 }, { "epoch": 0.9006187161639598, "grad_norm": 0.38292434557604593, "learning_rate": 8.828132551389999e-06, "loss": 0.0388, "step": 2329 }, { "epoch": 0.9010054137664346, "grad_norm": 0.42263299277637095, "learning_rate": 8.826684913456132e-06, "loss": 0.0405, "step": 2330 }, { "epoch": 0.9013921113689095, "grad_norm": 0.398738351738655, "learning_rate": 8.82523650077018e-06, "loss": 0.04, "step": 2331 }, { "epoch": 0.9017788089713844, "grad_norm": 0.27992376816114994, "learning_rate": 8.823787313625392e-06, "loss": 0.0355, "step": 2332 }, { "epoch": 0.9021655065738592, "grad_norm": 0.29811075642280555, "learning_rate": 8.822337352315167e-06, "loss": 0.0378, "step": 2333 }, { "epoch": 0.9025522041763341, "grad_norm": 0.3284996477044495, "learning_rate": 8.820886617133068e-06, "loss": 0.0415, "step": 2334 }, { "epoch": 0.902938901778809, "grad_norm": 0.3373760168654902, "learning_rate": 8.819435108372809e-06, "loss": 0.0344, "step": 2335 }, { "epoch": 0.9033255993812839, "grad_norm": 0.37786899108590627, "learning_rate": 8.817982826328264e-06, "loss": 0.0513, "step": 2336 }, { "epoch": 0.9037122969837587, "grad_norm": 0.3862897267954175, "learning_rate": 8.816529771293462e-06, "loss": 0.0459, "step": 2337 }, { "epoch": 0.9040989945862336, "grad_norm": 0.3371304045087073, "learning_rate": 8.81507594356259e-06, "loss": 0.0314, "step": 2338 }, { "epoch": 0.9044856921887084, "grad_norm": 0.3589608932013755, "learning_rate": 8.813621343429992e-06, "loss": 0.0385, "step": 2339 }, { "epoch": 0.9048723897911833, "grad_norm": 0.35720551613971613, "learning_rate": 8.812165971190164e-06, "loss": 0.0551, "step": 2340 }, { "epoch": 0.9052590873936581, "grad_norm": 0.3631842664736449, "learning_rate": 8.81070982713776e-06, "loss": 0.0348, "step": 2341 }, { "epoch": 0.905645784996133, "grad_norm": 0.3027865207466154, "learning_rate": 8.809252911567596e-06, "loss": 0.0373, "step": 2342 }, { "epoch": 0.9060324825986079, "grad_norm": 0.39200283795233887, "learning_rate": 8.807795224774636e-06, "loss": 0.0437, "step": 2343 }, { "epoch": 0.9064191802010828, "grad_norm": 0.41955441804251903, "learning_rate": 8.806336767054007e-06, "loss": 0.042, "step": 2344 }, { "epoch": 0.9068058778035576, "grad_norm": 0.4129093107983274, "learning_rate": 8.804877538700986e-06, "loss": 0.033, "step": 2345 }, { "epoch": 0.9071925754060325, "grad_norm": 0.3608364892737636, "learning_rate": 8.80341754001101e-06, "loss": 0.041, "step": 2346 }, { "epoch": 0.9075792730085074, "grad_norm": 0.38668565868317545, "learning_rate": 8.801956771279671e-06, "loss": 0.0376, "step": 2347 }, { "epoch": 0.9079659706109822, "grad_norm": 0.279359651016413, "learning_rate": 8.800495232802717e-06, "loss": 0.0289, "step": 2348 }, { "epoch": 0.9083526682134571, "grad_norm": 0.32694886104738424, "learning_rate": 8.799032924876052e-06, "loss": 0.041, "step": 2349 }, { "epoch": 0.908739365815932, "grad_norm": 0.5241186235808029, "learning_rate": 8.797569847795733e-06, "loss": 0.0592, "step": 2350 }, { "epoch": 0.9091260634184068, "grad_norm": 0.3690981789158699, "learning_rate": 8.796106001857979e-06, "loss": 0.0468, "step": 2351 }, { "epoch": 0.9095127610208816, "grad_norm": 0.43363753059976484, "learning_rate": 8.794641387359157e-06, "loss": 0.0332, "step": 2352 }, { "epoch": 0.9098994586233565, "grad_norm": 0.25322722484713506, "learning_rate": 8.793176004595798e-06, "loss": 0.04, "step": 2353 }, { "epoch": 0.9102861562258314, "grad_norm": 0.43054828770186304, "learning_rate": 8.791709853864578e-06, "loss": 0.0469, "step": 2354 }, { "epoch": 0.9106728538283063, "grad_norm": 0.4569578976470584, "learning_rate": 8.790242935462339e-06, "loss": 0.0367, "step": 2355 }, { "epoch": 0.9110595514307811, "grad_norm": 0.3442214736908071, "learning_rate": 8.788775249686073e-06, "loss": 0.0333, "step": 2356 }, { "epoch": 0.911446249033256, "grad_norm": 0.29074093290958225, "learning_rate": 8.787306796832926e-06, "loss": 0.025, "step": 2357 }, { "epoch": 0.9118329466357309, "grad_norm": 0.2856050600476902, "learning_rate": 8.785837577200205e-06, "loss": 0.0354, "step": 2358 }, { "epoch": 0.9122196442382057, "grad_norm": 0.4139541961650813, "learning_rate": 8.784367591085366e-06, "loss": 0.0503, "step": 2359 }, { "epoch": 0.9126063418406806, "grad_norm": 0.27104615211976896, "learning_rate": 8.782896838786024e-06, "loss": 0.0379, "step": 2360 }, { "epoch": 0.9129930394431555, "grad_norm": 0.4319300883465495, "learning_rate": 8.781425320599947e-06, "loss": 0.0436, "step": 2361 }, { "epoch": 0.9133797370456304, "grad_norm": 0.3237277418528955, "learning_rate": 8.779953036825061e-06, "loss": 0.0404, "step": 2362 }, { "epoch": 0.9137664346481051, "grad_norm": 0.6347967689715663, "learning_rate": 8.778479987759443e-06, "loss": 0.0378, "step": 2363 }, { "epoch": 0.91415313225058, "grad_norm": 0.4141136359275656, "learning_rate": 8.77700617370133e-06, "loss": 0.0352, "step": 2364 }, { "epoch": 0.9145398298530549, "grad_norm": 0.35112241259929233, "learning_rate": 8.775531594949109e-06, "loss": 0.0344, "step": 2365 }, { "epoch": 0.9149265274555298, "grad_norm": 0.6233510435614911, "learning_rate": 8.774056251801324e-06, "loss": 0.0485, "step": 2366 }, { "epoch": 0.9153132250580046, "grad_norm": 0.42744580693990175, "learning_rate": 8.772580144556672e-06, "loss": 0.0357, "step": 2367 }, { "epoch": 0.9156999226604795, "grad_norm": 0.379826752028121, "learning_rate": 8.77110327351401e-06, "loss": 0.0296, "step": 2368 }, { "epoch": 0.9160866202629544, "grad_norm": 0.28359226799035564, "learning_rate": 8.769625638972343e-06, "loss": 0.0292, "step": 2369 }, { "epoch": 0.9164733178654292, "grad_norm": 0.3642903772087456, "learning_rate": 8.768147241230833e-06, "loss": 0.0357, "step": 2370 }, { "epoch": 0.9168600154679041, "grad_norm": 0.4143600755447172, "learning_rate": 8.766668080588798e-06, "loss": 0.0415, "step": 2371 }, { "epoch": 0.917246713070379, "grad_norm": 0.3866351564682915, "learning_rate": 8.76518815734571e-06, "loss": 0.0536, "step": 2372 }, { "epoch": 0.9176334106728539, "grad_norm": 0.3916764199180711, "learning_rate": 8.763707471801195e-06, "loss": 0.0441, "step": 2373 }, { "epoch": 0.9180201082753287, "grad_norm": 0.4251357734537538, "learning_rate": 8.762226024255032e-06, "loss": 0.04, "step": 2374 }, { "epoch": 0.9184068058778035, "grad_norm": 0.5070047551233775, "learning_rate": 8.760743815007155e-06, "loss": 0.0375, "step": 2375 }, { "epoch": 0.9187935034802784, "grad_norm": 0.442209594831471, "learning_rate": 8.759260844357656e-06, "loss": 0.0488, "step": 2376 }, { "epoch": 0.9191802010827533, "grad_norm": 0.5761718968662154, "learning_rate": 8.757777112606774e-06, "loss": 0.0474, "step": 2377 }, { "epoch": 0.9195668986852281, "grad_norm": 0.31384033801270805, "learning_rate": 8.756292620054907e-06, "loss": 0.0303, "step": 2378 }, { "epoch": 0.919953596287703, "grad_norm": 0.3721431285155936, "learning_rate": 8.754807367002607e-06, "loss": 0.0396, "step": 2379 }, { "epoch": 0.9203402938901779, "grad_norm": 0.29423196560174614, "learning_rate": 8.753321353750577e-06, "loss": 0.0271, "step": 2380 }, { "epoch": 0.9207269914926527, "grad_norm": 0.3438012127603406, "learning_rate": 8.751834580599678e-06, "loss": 0.0364, "step": 2381 }, { "epoch": 0.9211136890951276, "grad_norm": 0.43427737489198615, "learning_rate": 8.75034704785092e-06, "loss": 0.0342, "step": 2382 }, { "epoch": 0.9215003866976025, "grad_norm": 0.43431770825063154, "learning_rate": 8.748858755805473e-06, "loss": 0.0403, "step": 2383 }, { "epoch": 0.9218870843000774, "grad_norm": 0.37299574057586266, "learning_rate": 8.747369704764654e-06, "loss": 0.0466, "step": 2384 }, { "epoch": 0.9222737819025522, "grad_norm": 0.3616838104553121, "learning_rate": 8.745879895029938e-06, "loss": 0.0409, "step": 2385 }, { "epoch": 0.922660479505027, "grad_norm": 0.3296012493467962, "learning_rate": 8.744389326902954e-06, "loss": 0.0361, "step": 2386 }, { "epoch": 0.923047177107502, "grad_norm": 0.3058214792500728, "learning_rate": 8.742898000685478e-06, "loss": 0.0345, "step": 2387 }, { "epoch": 0.9234338747099768, "grad_norm": 0.36494772563341354, "learning_rate": 8.741405916679451e-06, "loss": 0.0416, "step": 2388 }, { "epoch": 0.9238205723124516, "grad_norm": 0.3913166347255968, "learning_rate": 8.739913075186957e-06, "loss": 0.0419, "step": 2389 }, { "epoch": 0.9242072699149265, "grad_norm": 0.3153574685951916, "learning_rate": 8.738419476510237e-06, "loss": 0.0325, "step": 2390 }, { "epoch": 0.9245939675174014, "grad_norm": 0.4808056233358263, "learning_rate": 8.73692512095169e-06, "loss": 0.0445, "step": 2391 }, { "epoch": 0.9249806651198763, "grad_norm": 0.27758885213849654, "learning_rate": 8.735430008813857e-06, "loss": 0.038, "step": 2392 }, { "epoch": 0.9253673627223511, "grad_norm": 0.30800623885909756, "learning_rate": 8.733934140399442e-06, "loss": 0.0382, "step": 2393 }, { "epoch": 0.925754060324826, "grad_norm": 0.8026875815454569, "learning_rate": 8.732437516011303e-06, "loss": 0.0445, "step": 2394 }, { "epoch": 0.9261407579273009, "grad_norm": 0.24073603462873128, "learning_rate": 8.73094013595244e-06, "loss": 0.0253, "step": 2395 }, { "epoch": 0.9265274555297757, "grad_norm": 0.4324078397767801, "learning_rate": 8.729442000526019e-06, "loss": 0.0285, "step": 2396 }, { "epoch": 0.9269141531322506, "grad_norm": 1.124088002145699, "learning_rate": 8.72794311003535e-06, "loss": 0.0517, "step": 2397 }, { "epoch": 0.9273008507347255, "grad_norm": 0.3826790811173698, "learning_rate": 8.7264434647839e-06, "loss": 0.0395, "step": 2398 }, { "epoch": 0.9276875483372004, "grad_norm": 0.9885285873450617, "learning_rate": 8.72494306507529e-06, "loss": 0.0466, "step": 2399 }, { "epoch": 0.9280742459396751, "grad_norm": 0.3832783923900663, "learning_rate": 8.723441911213283e-06, "loss": 0.0398, "step": 2400 }, { "epoch": 0.92846094354215, "grad_norm": 0.257034563264198, "learning_rate": 8.721940003501813e-06, "loss": 0.0299, "step": 2401 }, { "epoch": 0.9288476411446249, "grad_norm": 0.41398068603101934, "learning_rate": 8.720437342244953e-06, "loss": 0.0348, "step": 2402 }, { "epoch": 0.9292343387470998, "grad_norm": 0.3886153360178636, "learning_rate": 8.718933927746933e-06, "loss": 0.0445, "step": 2403 }, { "epoch": 0.9296210363495746, "grad_norm": 0.3885073171430969, "learning_rate": 8.717429760312135e-06, "loss": 0.0421, "step": 2404 }, { "epoch": 0.9300077339520495, "grad_norm": 0.5450016420519124, "learning_rate": 8.71592484024509e-06, "loss": 0.0325, "step": 2405 }, { "epoch": 0.9303944315545244, "grad_norm": 0.46212539276586984, "learning_rate": 8.71441916785049e-06, "loss": 0.0433, "step": 2406 }, { "epoch": 0.9307811291569992, "grad_norm": 0.5443723346853874, "learning_rate": 8.712912743433168e-06, "loss": 0.0347, "step": 2407 }, { "epoch": 0.9311678267594741, "grad_norm": 0.3954301105381744, "learning_rate": 8.71140556729812e-06, "loss": 0.042, "step": 2408 }, { "epoch": 0.931554524361949, "grad_norm": 0.3033240965399214, "learning_rate": 8.709897639750488e-06, "loss": 0.0287, "step": 2409 }, { "epoch": 0.9319412219644239, "grad_norm": 0.6217218534431653, "learning_rate": 8.708388961095568e-06, "loss": 0.043, "step": 2410 }, { "epoch": 0.9323279195668986, "grad_norm": 0.450397179541005, "learning_rate": 8.706879531638807e-06, "loss": 0.0441, "step": 2411 }, { "epoch": 0.9327146171693735, "grad_norm": 0.3802200606788175, "learning_rate": 8.705369351685804e-06, "loss": 0.0446, "step": 2412 }, { "epoch": 0.9331013147718484, "grad_norm": 0.3707982253013681, "learning_rate": 8.703858421542312e-06, "loss": 0.0355, "step": 2413 }, { "epoch": 0.9334880123743233, "grad_norm": 0.45034740392905265, "learning_rate": 8.702346741514237e-06, "loss": 0.0563, "step": 2414 }, { "epoch": 0.9338747099767981, "grad_norm": 0.4731211876157296, "learning_rate": 8.700834311907627e-06, "loss": 0.0376, "step": 2415 }, { "epoch": 0.934261407579273, "grad_norm": 0.39955925423312905, "learning_rate": 8.699321133028695e-06, "loss": 0.04, "step": 2416 }, { "epoch": 0.9346481051817479, "grad_norm": 0.3119338383767187, "learning_rate": 8.697807205183798e-06, "loss": 0.0317, "step": 2417 }, { "epoch": 0.9350348027842227, "grad_norm": 0.45608236805596747, "learning_rate": 8.696292528679446e-06, "loss": 0.04, "step": 2418 }, { "epoch": 0.9354215003866976, "grad_norm": 0.34302626590125274, "learning_rate": 8.694777103822303e-06, "loss": 0.0401, "step": 2419 }, { "epoch": 0.9358081979891725, "grad_norm": 0.3424433761008471, "learning_rate": 8.693260930919181e-06, "loss": 0.0399, "step": 2420 }, { "epoch": 0.9361948955916474, "grad_norm": 0.36909993749532327, "learning_rate": 8.691744010277043e-06, "loss": 0.0512, "step": 2421 }, { "epoch": 0.9365815931941222, "grad_norm": 0.26609288002341125, "learning_rate": 8.690226342203008e-06, "loss": 0.0344, "step": 2422 }, { "epoch": 0.936968290796597, "grad_norm": 0.401457607460734, "learning_rate": 8.688707927004344e-06, "loss": 0.0414, "step": 2423 }, { "epoch": 0.9373549883990719, "grad_norm": 0.3686351084251006, "learning_rate": 8.687188764988468e-06, "loss": 0.0509, "step": 2424 }, { "epoch": 0.9377416860015468, "grad_norm": 0.4432668446966689, "learning_rate": 8.685668856462953e-06, "loss": 0.0405, "step": 2425 }, { "epoch": 0.9381283836040216, "grad_norm": 0.2957950036355022, "learning_rate": 8.684148201735516e-06, "loss": 0.0389, "step": 2426 }, { "epoch": 0.9385150812064965, "grad_norm": 0.32475999553636703, "learning_rate": 8.68262680111403e-06, "loss": 0.0564, "step": 2427 }, { "epoch": 0.9389017788089714, "grad_norm": 0.4647170049952091, "learning_rate": 8.681104654906523e-06, "loss": 0.0481, "step": 2428 }, { "epoch": 0.9392884764114462, "grad_norm": 0.36038338586627794, "learning_rate": 8.679581763421165e-06, "loss": 0.0327, "step": 2429 }, { "epoch": 0.9396751740139211, "grad_norm": 0.412993113427477, "learning_rate": 8.678058126966284e-06, "loss": 0.048, "step": 2430 }, { "epoch": 0.940061871616396, "grad_norm": 0.4330869727227075, "learning_rate": 8.676533745850353e-06, "loss": 0.0354, "step": 2431 }, { "epoch": 0.9404485692188709, "grad_norm": 0.4637439163730528, "learning_rate": 8.675008620382002e-06, "loss": 0.0518, "step": 2432 }, { "epoch": 0.9408352668213457, "grad_norm": 0.30771859992837847, "learning_rate": 8.673482750870003e-06, "loss": 0.033, "step": 2433 }, { "epoch": 0.9412219644238206, "grad_norm": 0.26672636041604014, "learning_rate": 8.671956137623291e-06, "loss": 0.0318, "step": 2434 }, { "epoch": 0.9416086620262955, "grad_norm": 0.3053688139644245, "learning_rate": 8.67042878095094e-06, "loss": 0.0349, "step": 2435 }, { "epoch": 0.9419953596287703, "grad_norm": 0.28578136002829485, "learning_rate": 8.668900681162183e-06, "loss": 0.0328, "step": 2436 }, { "epoch": 0.9423820572312451, "grad_norm": 0.4553668838983159, "learning_rate": 8.667371838566396e-06, "loss": 0.0484, "step": 2437 }, { "epoch": 0.94276875483372, "grad_norm": 0.3761414921895271, "learning_rate": 8.66584225347311e-06, "loss": 0.039, "step": 2438 }, { "epoch": 0.9431554524361949, "grad_norm": 0.30337181666049934, "learning_rate": 8.664311926192006e-06, "loss": 0.0315, "step": 2439 }, { "epoch": 0.9435421500386698, "grad_norm": 0.2668124901720114, "learning_rate": 8.662780857032913e-06, "loss": 0.0299, "step": 2440 }, { "epoch": 0.9439288476411446, "grad_norm": 0.3021523640311698, "learning_rate": 8.661249046305816e-06, "loss": 0.0417, "step": 2441 }, { "epoch": 0.9443155452436195, "grad_norm": 0.3284118176797601, "learning_rate": 8.659716494320842e-06, "loss": 0.0473, "step": 2442 }, { "epoch": 0.9447022428460944, "grad_norm": 0.35806537980694414, "learning_rate": 8.658183201388273e-06, "loss": 0.0395, "step": 2443 }, { "epoch": 0.9450889404485692, "grad_norm": 0.2930513092294572, "learning_rate": 8.656649167818542e-06, "loss": 0.0352, "step": 2444 }, { "epoch": 0.9454756380510441, "grad_norm": 0.3206594531653908, "learning_rate": 8.655114393922228e-06, "loss": 0.045, "step": 2445 }, { "epoch": 0.945862335653519, "grad_norm": 0.3265182657079514, "learning_rate": 8.65357888001006e-06, "loss": 0.0364, "step": 2446 }, { "epoch": 0.9462490332559939, "grad_norm": 0.2814178116421935, "learning_rate": 8.652042626392924e-06, "loss": 0.0341, "step": 2447 }, { "epoch": 0.9466357308584686, "grad_norm": 0.30201978254312317, "learning_rate": 8.650505633381848e-06, "loss": 0.0288, "step": 2448 }, { "epoch": 0.9470224284609435, "grad_norm": 0.42733333224589953, "learning_rate": 8.648967901288009e-06, "loss": 0.0398, "step": 2449 }, { "epoch": 0.9474091260634184, "grad_norm": 0.32935134750260103, "learning_rate": 8.64742943042274e-06, "loss": 0.0315, "step": 2450 }, { "epoch": 0.9477958236658933, "grad_norm": 0.5646304543741556, "learning_rate": 8.645890221097521e-06, "loss": 0.0404, "step": 2451 }, { "epoch": 0.9481825212683681, "grad_norm": 0.30254980531320974, "learning_rate": 8.64435027362398e-06, "loss": 0.0449, "step": 2452 }, { "epoch": 0.948569218870843, "grad_norm": 0.3264244952910399, "learning_rate": 8.642809588313897e-06, "loss": 0.0314, "step": 2453 }, { "epoch": 0.9489559164733179, "grad_norm": 0.6449687664007092, "learning_rate": 8.641268165479195e-06, "loss": 0.061, "step": 2454 }, { "epoch": 0.9493426140757927, "grad_norm": 0.3770539056118282, "learning_rate": 8.639726005431957e-06, "loss": 0.0381, "step": 2455 }, { "epoch": 0.9497293116782676, "grad_norm": 0.2694904877520714, "learning_rate": 8.638183108484404e-06, "loss": 0.0269, "step": 2456 }, { "epoch": 0.9501160092807425, "grad_norm": 0.2849292724947755, "learning_rate": 8.636639474948914e-06, "loss": 0.0322, "step": 2457 }, { "epoch": 0.9505027068832174, "grad_norm": 0.4258515422078839, "learning_rate": 8.635095105138011e-06, "loss": 0.0424, "step": 2458 }, { "epoch": 0.9508894044856921, "grad_norm": 0.3410087423680014, "learning_rate": 8.633549999364371e-06, "loss": 0.0353, "step": 2459 }, { "epoch": 0.951276102088167, "grad_norm": 0.3931865039892419, "learning_rate": 8.632004157940815e-06, "loss": 0.0416, "step": 2460 }, { "epoch": 0.9516627996906419, "grad_norm": 0.345726704705122, "learning_rate": 8.630457581180312e-06, "loss": 0.0385, "step": 2461 }, { "epoch": 0.9520494972931168, "grad_norm": 0.35180781494276026, "learning_rate": 8.628910269395987e-06, "loss": 0.025, "step": 2462 }, { "epoch": 0.9524361948955916, "grad_norm": 0.4395717932927166, "learning_rate": 8.627362222901109e-06, "loss": 0.0564, "step": 2463 }, { "epoch": 0.9528228924980665, "grad_norm": 0.32841444572297085, "learning_rate": 8.62581344200909e-06, "loss": 0.035, "step": 2464 }, { "epoch": 0.9532095901005414, "grad_norm": 0.594117447214709, "learning_rate": 8.624263927033505e-06, "loss": 0.0571, "step": 2465 }, { "epoch": 0.9535962877030162, "grad_norm": 0.2859662291614291, "learning_rate": 8.622713678288064e-06, "loss": 0.0297, "step": 2466 }, { "epoch": 0.9539829853054911, "grad_norm": 0.39641823618873406, "learning_rate": 8.621162696086634e-06, "loss": 0.0312, "step": 2467 }, { "epoch": 0.954369682907966, "grad_norm": 0.367589703726885, "learning_rate": 8.619610980743226e-06, "loss": 0.0399, "step": 2468 }, { "epoch": 0.9547563805104409, "grad_norm": 0.37305893882395114, "learning_rate": 8.618058532572e-06, "loss": 0.0462, "step": 2469 }, { "epoch": 0.9551430781129157, "grad_norm": 0.2876594840011683, "learning_rate": 8.616505351887266e-06, "loss": 0.0337, "step": 2470 }, { "epoch": 0.9555297757153906, "grad_norm": 0.23979517795860142, "learning_rate": 8.614951439003482e-06, "loss": 0.0272, "step": 2471 }, { "epoch": 0.9559164733178654, "grad_norm": 0.3103393216865617, "learning_rate": 8.613396794235253e-06, "loss": 0.0447, "step": 2472 }, { "epoch": 0.9563031709203403, "grad_norm": 0.2992491156690106, "learning_rate": 8.611841417897333e-06, "loss": 0.0375, "step": 2473 }, { "epoch": 0.9566898685228151, "grad_norm": 0.3024713086505571, "learning_rate": 8.610285310304627e-06, "loss": 0.0436, "step": 2474 }, { "epoch": 0.95707656612529, "grad_norm": 0.3492233593324274, "learning_rate": 8.60872847177218e-06, "loss": 0.0375, "step": 2475 }, { "epoch": 0.9574632637277649, "grad_norm": 0.43087028781464887, "learning_rate": 8.607170902615194e-06, "loss": 0.0566, "step": 2476 }, { "epoch": 0.9578499613302398, "grad_norm": 0.3726580898809164, "learning_rate": 8.605612603149015e-06, "loss": 0.0528, "step": 2477 }, { "epoch": 0.9582366589327146, "grad_norm": 0.38579785291711943, "learning_rate": 8.604053573689135e-06, "loss": 0.0371, "step": 2478 }, { "epoch": 0.9586233565351895, "grad_norm": 0.3330240994264893, "learning_rate": 8.602493814551196e-06, "loss": 0.0387, "step": 2479 }, { "epoch": 0.9590100541376644, "grad_norm": 0.29348974816354917, "learning_rate": 8.600933326050989e-06, "loss": 0.025, "step": 2480 }, { "epoch": 0.9593967517401392, "grad_norm": 0.48534363646937856, "learning_rate": 8.599372108504447e-06, "loss": 0.0573, "step": 2481 }, { "epoch": 0.9597834493426141, "grad_norm": 0.476222564158098, "learning_rate": 8.597810162227662e-06, "loss": 0.0386, "step": 2482 }, { "epoch": 0.960170146945089, "grad_norm": 0.3292478535975779, "learning_rate": 8.596247487536858e-06, "loss": 0.0395, "step": 2483 }, { "epoch": 0.9605568445475638, "grad_norm": 0.3988154662823465, "learning_rate": 8.594684084748422e-06, "loss": 0.0267, "step": 2484 }, { "epoch": 0.9609435421500386, "grad_norm": 0.43755050079259034, "learning_rate": 8.593119954178876e-06, "loss": 0.0506, "step": 2485 }, { "epoch": 0.9613302397525135, "grad_norm": 0.41944039442824993, "learning_rate": 8.591555096144895e-06, "loss": 0.0339, "step": 2486 }, { "epoch": 0.9617169373549884, "grad_norm": 0.460292928669495, "learning_rate": 8.589989510963305e-06, "loss": 0.0395, "step": 2487 }, { "epoch": 0.9621036349574633, "grad_norm": 0.33868052797042814, "learning_rate": 8.58842319895107e-06, "loss": 0.0488, "step": 2488 }, { "epoch": 0.9624903325599381, "grad_norm": 0.3061825036318028, "learning_rate": 8.586856160425308e-06, "loss": 0.0254, "step": 2489 }, { "epoch": 0.962877030162413, "grad_norm": 0.46855598927937653, "learning_rate": 8.585288395703282e-06, "loss": 0.0412, "step": 2490 }, { "epoch": 0.9632637277648879, "grad_norm": 0.34434353462804873, "learning_rate": 8.583719905102405e-06, "loss": 0.0335, "step": 2491 }, { "epoch": 0.9636504253673627, "grad_norm": 0.3706056444652187, "learning_rate": 8.58215068894023e-06, "loss": 0.0324, "step": 2492 }, { "epoch": 0.9640371229698376, "grad_norm": 0.3122064568305266, "learning_rate": 8.580580747534463e-06, "loss": 0.0358, "step": 2493 }, { "epoch": 0.9644238205723125, "grad_norm": 0.28468353023951953, "learning_rate": 8.579010081202954e-06, "loss": 0.0295, "step": 2494 }, { "epoch": 0.9648105181747874, "grad_norm": 0.4046885928450535, "learning_rate": 8.577438690263702e-06, "loss": 0.0385, "step": 2495 }, { "epoch": 0.9651972157772621, "grad_norm": 0.3280225355322726, "learning_rate": 8.575866575034853e-06, "loss": 0.0343, "step": 2496 }, { "epoch": 0.965583913379737, "grad_norm": 0.30252441067018093, "learning_rate": 8.574293735834693e-06, "loss": 0.0339, "step": 2497 }, { "epoch": 0.9659706109822119, "grad_norm": 0.3957852409620677, "learning_rate": 8.572720172981663e-06, "loss": 0.0388, "step": 2498 }, { "epoch": 0.9663573085846868, "grad_norm": 0.6413353653302307, "learning_rate": 8.571145886794349e-06, "loss": 0.0409, "step": 2499 }, { "epoch": 0.9667440061871616, "grad_norm": 0.4042726813192561, "learning_rate": 8.569570877591478e-06, "loss": 0.0432, "step": 2500 }, { "epoch": 0.9671307037896365, "grad_norm": 0.3002055083686511, "learning_rate": 8.567995145691928e-06, "loss": 0.0373, "step": 2501 }, { "epoch": 0.9675174013921114, "grad_norm": 0.37101926763561804, "learning_rate": 8.566418691414722e-06, "loss": 0.0456, "step": 2502 }, { "epoch": 0.9679040989945862, "grad_norm": 0.3474413077476996, "learning_rate": 8.564841515079031e-06, "loss": 0.0441, "step": 2503 }, { "epoch": 0.9682907965970611, "grad_norm": 0.2828757384571081, "learning_rate": 8.56326361700417e-06, "loss": 0.0283, "step": 2504 }, { "epoch": 0.968677494199536, "grad_norm": 0.27718680592604433, "learning_rate": 8.5616849975096e-06, "loss": 0.0322, "step": 2505 }, { "epoch": 0.9690641918020109, "grad_norm": 0.38887256413345417, "learning_rate": 8.560105656914928e-06, "loss": 0.035, "step": 2506 }, { "epoch": 0.9694508894044856, "grad_norm": 0.3206672315653493, "learning_rate": 8.558525595539913e-06, "loss": 0.0441, "step": 2507 }, { "epoch": 0.9698375870069605, "grad_norm": 0.5062254037768132, "learning_rate": 8.556944813704449e-06, "loss": 0.0469, "step": 2508 }, { "epoch": 0.9702242846094354, "grad_norm": 0.23726551171408072, "learning_rate": 8.555363311728584e-06, "loss": 0.0265, "step": 2509 }, { "epoch": 0.9706109822119103, "grad_norm": 0.3646116014280537, "learning_rate": 8.55378108993251e-06, "loss": 0.0394, "step": 2510 }, { "epoch": 0.9709976798143851, "grad_norm": 0.3109629655929484, "learning_rate": 8.552198148636563e-06, "loss": 0.0407, "step": 2511 }, { "epoch": 0.97138437741686, "grad_norm": 0.3081781926356206, "learning_rate": 8.550614488161226e-06, "loss": 0.0468, "step": 2512 }, { "epoch": 0.9717710750193349, "grad_norm": 0.33588642544074326, "learning_rate": 8.54903010882713e-06, "loss": 0.0302, "step": 2513 }, { "epoch": 0.9721577726218097, "grad_norm": 0.3248508573378233, "learning_rate": 8.547445010955047e-06, "loss": 0.0358, "step": 2514 }, { "epoch": 0.9725444702242846, "grad_norm": 0.25475158125774694, "learning_rate": 8.545859194865896e-06, "loss": 0.0342, "step": 2515 }, { "epoch": 0.9729311678267595, "grad_norm": 0.3270378216864922, "learning_rate": 8.54427266088074e-06, "loss": 0.033, "step": 2516 }, { "epoch": 0.9733178654292344, "grad_norm": 0.362517568267449, "learning_rate": 8.542685409320795e-06, "loss": 0.0556, "step": 2517 }, { "epoch": 0.9737045630317092, "grad_norm": 0.47229287598374686, "learning_rate": 8.541097440507413e-06, "loss": 0.0469, "step": 2518 }, { "epoch": 0.974091260634184, "grad_norm": 0.3649727273043589, "learning_rate": 8.539508754762095e-06, "loss": 0.0326, "step": 2519 }, { "epoch": 0.974477958236659, "grad_norm": 0.27685154168660875, "learning_rate": 8.537919352406488e-06, "loss": 0.0419, "step": 2520 }, { "epoch": 0.9748646558391338, "grad_norm": 0.3472291941602803, "learning_rate": 8.53632923376238e-06, "loss": 0.041, "step": 2521 }, { "epoch": 0.9752513534416086, "grad_norm": 0.36905618520919264, "learning_rate": 8.53473839915171e-06, "loss": 0.0467, "step": 2522 }, { "epoch": 0.9756380510440835, "grad_norm": 0.2928096639815303, "learning_rate": 8.53314684889656e-06, "loss": 0.0356, "step": 2523 }, { "epoch": 0.9760247486465584, "grad_norm": 0.2631439316835429, "learning_rate": 8.53155458331915e-06, "loss": 0.0286, "step": 2524 }, { "epoch": 0.9764114462490333, "grad_norm": 0.4205701917451624, "learning_rate": 8.529961602741857e-06, "loss": 0.0428, "step": 2525 }, { "epoch": 0.9767981438515081, "grad_norm": 0.37637699342230424, "learning_rate": 8.528367907487194e-06, "loss": 0.0403, "step": 2526 }, { "epoch": 0.977184841453983, "grad_norm": 0.302819352884846, "learning_rate": 8.52677349787782e-06, "loss": 0.0274, "step": 2527 }, { "epoch": 0.9775715390564579, "grad_norm": 0.34271499779277287, "learning_rate": 8.525178374236543e-06, "loss": 0.0413, "step": 2528 }, { "epoch": 0.9779582366589327, "grad_norm": 0.35168668744628495, "learning_rate": 8.523582536886311e-06, "loss": 0.027, "step": 2529 }, { "epoch": 0.9783449342614076, "grad_norm": 0.37652025299012704, "learning_rate": 8.521985986150216e-06, "loss": 0.051, "step": 2530 }, { "epoch": 0.9787316318638825, "grad_norm": 0.3721382755682063, "learning_rate": 8.520388722351496e-06, "loss": 0.051, "step": 2531 }, { "epoch": 0.9791183294663574, "grad_norm": 0.3280270892717001, "learning_rate": 8.518790745813536e-06, "loss": 0.036, "step": 2532 }, { "epoch": 0.9795050270688321, "grad_norm": 0.41644622801652464, "learning_rate": 8.517192056859865e-06, "loss": 0.0471, "step": 2533 }, { "epoch": 0.979891724671307, "grad_norm": 0.43940361914359044, "learning_rate": 8.515592655814146e-06, "loss": 0.0484, "step": 2534 }, { "epoch": 0.9802784222737819, "grad_norm": 0.48338613468512337, "learning_rate": 8.513992543000205e-06, "loss": 0.0511, "step": 2535 }, { "epoch": 0.9806651198762568, "grad_norm": 0.2818492633742265, "learning_rate": 8.512391718741993e-06, "loss": 0.0302, "step": 2536 }, { "epoch": 0.9810518174787316, "grad_norm": 0.3850355682827806, "learning_rate": 8.510790183363616e-06, "loss": 0.0601, "step": 2537 }, { "epoch": 0.9814385150812065, "grad_norm": 0.47082937753516535, "learning_rate": 8.509187937189322e-06, "loss": 0.0524, "step": 2538 }, { "epoch": 0.9818252126836814, "grad_norm": 0.364765292872421, "learning_rate": 8.507584980543504e-06, "loss": 0.0407, "step": 2539 }, { "epoch": 0.9822119102861562, "grad_norm": 0.4092446797789383, "learning_rate": 8.505981313750695e-06, "loss": 0.0528, "step": 2540 }, { "epoch": 0.9825986078886311, "grad_norm": 0.7507643165212071, "learning_rate": 8.504376937135574e-06, "loss": 0.0514, "step": 2541 }, { "epoch": 0.982985305491106, "grad_norm": 0.28500283846996366, "learning_rate": 8.502771851022965e-06, "loss": 0.0289, "step": 2542 }, { "epoch": 0.9833720030935809, "grad_norm": 0.32017898387059374, "learning_rate": 8.501166055737834e-06, "loss": 0.033, "step": 2543 }, { "epoch": 0.9837587006960556, "grad_norm": 0.5303264558763537, "learning_rate": 8.499559551605289e-06, "loss": 0.0325, "step": 2544 }, { "epoch": 0.9841453982985305, "grad_norm": 1.048284222334851, "learning_rate": 8.497952338950586e-06, "loss": 0.0613, "step": 2545 }, { "epoch": 0.9845320959010054, "grad_norm": 0.41452148098328373, "learning_rate": 8.496344418099121e-06, "loss": 0.0389, "step": 2546 }, { "epoch": 0.9849187935034803, "grad_norm": 0.39835344062965117, "learning_rate": 8.494735789376434e-06, "loss": 0.0432, "step": 2547 }, { "epoch": 0.9853054911059551, "grad_norm": 0.37715695177448433, "learning_rate": 8.493126453108208e-06, "loss": 0.0459, "step": 2548 }, { "epoch": 0.98569218870843, "grad_norm": 0.4398233647551593, "learning_rate": 8.491516409620272e-06, "loss": 0.0473, "step": 2549 }, { "epoch": 0.9860788863109049, "grad_norm": 0.5058953684309623, "learning_rate": 8.489905659238592e-06, "loss": 0.0621, "step": 2550 }, { "epoch": 0.9864655839133797, "grad_norm": 0.7327419258326384, "learning_rate": 8.488294202289286e-06, "loss": 0.0468, "step": 2551 }, { "epoch": 0.9868522815158546, "grad_norm": 0.7139943993448039, "learning_rate": 8.486682039098608e-06, "loss": 0.0517, "step": 2552 }, { "epoch": 0.9872389791183295, "grad_norm": 0.6248442224930599, "learning_rate": 8.485069169992956e-06, "loss": 0.0438, "step": 2553 }, { "epoch": 0.9876256767208044, "grad_norm": 0.6327869151736891, "learning_rate": 8.483455595298873e-06, "loss": 0.052, "step": 2554 }, { "epoch": 0.9880123743232792, "grad_norm": 0.49215706075206456, "learning_rate": 8.481841315343044e-06, "loss": 0.0643, "step": 2555 }, { "epoch": 0.988399071925754, "grad_norm": 0.4626920477007546, "learning_rate": 8.480226330452298e-06, "loss": 0.0467, "step": 2556 }, { "epoch": 0.9887857695282289, "grad_norm": 0.48505146831244106, "learning_rate": 8.478610640953603e-06, "loss": 0.0575, "step": 2557 }, { "epoch": 0.9891724671307038, "grad_norm": 0.714434110372915, "learning_rate": 8.476994247174074e-06, "loss": 0.0483, "step": 2558 }, { "epoch": 0.9895591647331786, "grad_norm": 0.3498445357575546, "learning_rate": 8.475377149440966e-06, "loss": 0.0377, "step": 2559 }, { "epoch": 0.9899458623356535, "grad_norm": 0.38572198834728094, "learning_rate": 8.473759348081679e-06, "loss": 0.0358, "step": 2560 }, { "epoch": 0.9903325599381284, "grad_norm": 0.34681734295949845, "learning_rate": 8.47214084342375e-06, "loss": 0.0426, "step": 2561 }, { "epoch": 0.9907192575406032, "grad_norm": 0.5051231753380173, "learning_rate": 8.470521635794864e-06, "loss": 0.0568, "step": 2562 }, { "epoch": 0.9911059551430781, "grad_norm": 0.3707483739867154, "learning_rate": 8.468901725522846e-06, "loss": 0.0408, "step": 2563 }, { "epoch": 0.991492652745553, "grad_norm": 0.47647859737320525, "learning_rate": 8.467281112935666e-06, "loss": 0.0375, "step": 2564 }, { "epoch": 0.9918793503480279, "grad_norm": 0.40408905802637024, "learning_rate": 8.46565979836143e-06, "loss": 0.0331, "step": 2565 }, { "epoch": 0.9922660479505027, "grad_norm": 0.4291101778144944, "learning_rate": 8.464037782128394e-06, "loss": 0.0474, "step": 2566 }, { "epoch": 0.9926527455529776, "grad_norm": 0.33824703551467517, "learning_rate": 8.462415064564949e-06, "loss": 0.0349, "step": 2567 }, { "epoch": 0.9930394431554525, "grad_norm": 0.491365372975367, "learning_rate": 8.46079164599963e-06, "loss": 0.0545, "step": 2568 }, { "epoch": 0.9934261407579273, "grad_norm": 0.39187418974284566, "learning_rate": 8.45916752676112e-06, "loss": 0.0518, "step": 2569 }, { "epoch": 0.9938128383604021, "grad_norm": 0.3474825153055417, "learning_rate": 8.457542707178233e-06, "loss": 0.0406, "step": 2570 }, { "epoch": 0.994199535962877, "grad_norm": 0.4183686379597766, "learning_rate": 8.455917187579935e-06, "loss": 0.0379, "step": 2571 }, { "epoch": 0.9945862335653519, "grad_norm": 0.5729711268139099, "learning_rate": 8.454290968295326e-06, "loss": 0.0377, "step": 2572 }, { "epoch": 0.9949729311678268, "grad_norm": 0.46623844101440826, "learning_rate": 8.452664049653655e-06, "loss": 0.0321, "step": 2573 }, { "epoch": 0.9953596287703016, "grad_norm": 0.4471931561058597, "learning_rate": 8.451036431984305e-06, "loss": 0.0526, "step": 2574 }, { "epoch": 0.9957463263727765, "grad_norm": 0.2740860578943622, "learning_rate": 8.449408115616807e-06, "loss": 0.0254, "step": 2575 }, { "epoch": 0.9961330239752514, "grad_norm": 0.3479724955501463, "learning_rate": 8.447779100880826e-06, "loss": 0.0379, "step": 2576 }, { "epoch": 0.9965197215777262, "grad_norm": 0.374913745994494, "learning_rate": 8.446149388106176e-06, "loss": 0.0426, "step": 2577 }, { "epoch": 0.9969064191802011, "grad_norm": 0.4387605304339527, "learning_rate": 8.44451897762281e-06, "loss": 0.0361, "step": 2578 }, { "epoch": 0.997293116782676, "grad_norm": 0.4132905788274935, "learning_rate": 8.442887869760821e-06, "loss": 0.0422, "step": 2579 }, { "epoch": 0.9976798143851509, "grad_norm": 0.34795330986696865, "learning_rate": 8.441256064850441e-06, "loss": 0.0407, "step": 2580 }, { "epoch": 0.9980665119876256, "grad_norm": 0.4772436226833803, "learning_rate": 8.43962356322205e-06, "loss": 0.0457, "step": 2581 }, { "epoch": 0.9984532095901005, "grad_norm": 0.3028646376598682, "learning_rate": 8.437990365206164e-06, "loss": 0.0317, "step": 2582 }, { "epoch": 0.9988399071925754, "grad_norm": 0.38174386241820357, "learning_rate": 8.436356471133438e-06, "loss": 0.0382, "step": 2583 }, { "epoch": 0.9992266047950503, "grad_norm": 0.3926885864807864, "learning_rate": 8.434721881334673e-06, "loss": 0.0434, "step": 2584 }, { "epoch": 0.9996133023975251, "grad_norm": 0.3139319395721139, "learning_rate": 8.433086596140808e-06, "loss": 0.0277, "step": 2585 }, { "epoch": 1.0, "grad_norm": 0.2908820189728672, "learning_rate": 8.431450615882925e-06, "loss": 0.0341, "step": 2586 }, { "epoch": 1.0, "eval_loss": 0.039489828050136566, "eval_runtime": 796.8547, "eval_samples_per_second": 24.69, "eval_steps_per_second": 0.772, "step": 2586 }, { "epoch": 1.000386697602475, "grad_norm": 0.28604433522188305, "learning_rate": 8.429813940892242e-06, "loss": 0.0291, "step": 2587 }, { "epoch": 1.0007733952049498, "grad_norm": 0.38057548402374475, "learning_rate": 8.428176571500125e-06, "loss": 0.0506, "step": 2588 }, { "epoch": 1.0011600928074247, "grad_norm": 0.3937970350495037, "learning_rate": 8.426538508038073e-06, "loss": 0.0329, "step": 2589 }, { "epoch": 1.0015467904098994, "grad_norm": 0.30549158869965226, "learning_rate": 8.424899750837731e-06, "loss": 0.0314, "step": 2590 }, { "epoch": 1.0019334880123743, "grad_norm": 0.31464076724369217, "learning_rate": 8.423260300230882e-06, "loss": 0.0284, "step": 2591 }, { "epoch": 1.0023201856148491, "grad_norm": 0.3536538725452888, "learning_rate": 8.421620156549448e-06, "loss": 0.0322, "step": 2592 }, { "epoch": 1.002706883217324, "grad_norm": 0.26005976188529106, "learning_rate": 8.419979320125496e-06, "loss": 0.0244, "step": 2593 }, { "epoch": 1.003093580819799, "grad_norm": 0.3892734859179721, "learning_rate": 8.418337791291228e-06, "loss": 0.0308, "step": 2594 }, { "epoch": 1.0034802784222738, "grad_norm": 0.4617304713763621, "learning_rate": 8.416695570378991e-06, "loss": 0.0344, "step": 2595 }, { "epoch": 1.0038669760247487, "grad_norm": 0.27541976035882765, "learning_rate": 8.415052657721267e-06, "loss": 0.0273, "step": 2596 }, { "epoch": 1.0042536736272236, "grad_norm": 0.29074265283017947, "learning_rate": 8.41340905365068e-06, "loss": 0.0247, "step": 2597 }, { "epoch": 1.0046403712296983, "grad_norm": 0.5031572575419431, "learning_rate": 8.411764758500001e-06, "loss": 0.0376, "step": 2598 }, { "epoch": 1.0050270688321732, "grad_norm": 0.33245594663991956, "learning_rate": 8.410119772602127e-06, "loss": 0.0401, "step": 2599 }, { "epoch": 1.005413766434648, "grad_norm": 0.38070921956684456, "learning_rate": 8.408474096290105e-06, "loss": 0.0367, "step": 2600 }, { "epoch": 1.005800464037123, "grad_norm": 0.3951649097306544, "learning_rate": 8.406827729897123e-06, "loss": 0.0358, "step": 2601 }, { "epoch": 1.0061871616395979, "grad_norm": 0.25827645073174826, "learning_rate": 8.405180673756499e-06, "loss": 0.034, "step": 2602 }, { "epoch": 1.0065738592420728, "grad_norm": 0.3916772227309293, "learning_rate": 8.403532928201702e-06, "loss": 0.0415, "step": 2603 }, { "epoch": 1.0069605568445477, "grad_norm": 0.22001572433753117, "learning_rate": 8.40188449356633e-06, "loss": 0.0253, "step": 2604 }, { "epoch": 1.0073472544470223, "grad_norm": 0.29361717742856003, "learning_rate": 8.40023537018413e-06, "loss": 0.0359, "step": 2605 }, { "epoch": 1.0077339520494972, "grad_norm": 0.31645925705959765, "learning_rate": 8.398585558388981e-06, "loss": 0.033, "step": 2606 }, { "epoch": 1.0081206496519721, "grad_norm": 0.32230710679456864, "learning_rate": 8.396935058514908e-06, "loss": 0.0351, "step": 2607 }, { "epoch": 1.008507347254447, "grad_norm": 0.3465144170928507, "learning_rate": 8.39528387089607e-06, "loss": 0.0369, "step": 2608 }, { "epoch": 1.008894044856922, "grad_norm": 0.3955394070509779, "learning_rate": 8.393631995866767e-06, "loss": 0.0347, "step": 2609 }, { "epoch": 1.0092807424593968, "grad_norm": 0.3595404509382633, "learning_rate": 8.391979433761437e-06, "loss": 0.0347, "step": 2610 }, { "epoch": 1.0096674400618717, "grad_norm": 0.3479136720262509, "learning_rate": 8.39032618491466e-06, "loss": 0.0331, "step": 2611 }, { "epoch": 1.0100541376643464, "grad_norm": 0.3455527479009051, "learning_rate": 8.388672249661154e-06, "loss": 0.0406, "step": 2612 }, { "epoch": 1.0104408352668213, "grad_norm": 0.35071711700451436, "learning_rate": 8.387017628335772e-06, "loss": 0.0344, "step": 2613 }, { "epoch": 1.0108275328692962, "grad_norm": 0.3964960495568213, "learning_rate": 8.385362321273514e-06, "loss": 0.0276, "step": 2614 }, { "epoch": 1.011214230471771, "grad_norm": 0.33793493974176675, "learning_rate": 8.383706328809512e-06, "loss": 0.0369, "step": 2615 }, { "epoch": 1.011600928074246, "grad_norm": 0.3703718809263619, "learning_rate": 8.382049651279037e-06, "loss": 0.0423, "step": 2616 }, { "epoch": 1.0119876256767208, "grad_norm": 0.3582339780383617, "learning_rate": 8.380392289017501e-06, "loss": 0.0328, "step": 2617 }, { "epoch": 1.0123743232791957, "grad_norm": 0.318071897195481, "learning_rate": 8.378734242360456e-06, "loss": 0.0317, "step": 2618 }, { "epoch": 1.0127610208816706, "grad_norm": 0.32155603977301567, "learning_rate": 8.37707551164359e-06, "loss": 0.0269, "step": 2619 }, { "epoch": 1.0131477184841453, "grad_norm": 0.7146833769687339, "learning_rate": 8.375416097202733e-06, "loss": 0.0378, "step": 2620 }, { "epoch": 1.0135344160866202, "grad_norm": 0.3391016067666973, "learning_rate": 8.373755999373845e-06, "loss": 0.0351, "step": 2621 }, { "epoch": 1.013921113689095, "grad_norm": 0.3127160523204977, "learning_rate": 8.372095218493035e-06, "loss": 0.0259, "step": 2622 }, { "epoch": 1.01430781129157, "grad_norm": 0.2766205985876372, "learning_rate": 8.370433754896543e-06, "loss": 0.0278, "step": 2623 }, { "epoch": 1.0146945088940449, "grad_norm": 0.2778377899088999, "learning_rate": 8.368771608920748e-06, "loss": 0.0354, "step": 2624 }, { "epoch": 1.0150812064965198, "grad_norm": 0.34705306560802196, "learning_rate": 8.367108780902173e-06, "loss": 0.0447, "step": 2625 }, { "epoch": 1.0154679040989947, "grad_norm": 0.3682720557921605, "learning_rate": 8.36544527117747e-06, "loss": 0.0457, "step": 2626 }, { "epoch": 1.0158546017014694, "grad_norm": 0.346633363441478, "learning_rate": 8.363781080083439e-06, "loss": 0.0461, "step": 2627 }, { "epoch": 1.0162412993039442, "grad_norm": 0.33589824955907854, "learning_rate": 8.36211620795701e-06, "loss": 0.0341, "step": 2628 }, { "epoch": 1.0166279969064191, "grad_norm": 0.3077157181915021, "learning_rate": 8.36045065513525e-06, "loss": 0.0245, "step": 2629 }, { "epoch": 1.017014694508894, "grad_norm": 0.37860025439781986, "learning_rate": 8.358784421955375e-06, "loss": 0.0438, "step": 2630 }, { "epoch": 1.017401392111369, "grad_norm": 0.38308441232244445, "learning_rate": 8.357117508754725e-06, "loss": 0.0331, "step": 2631 }, { "epoch": 1.0177880897138438, "grad_norm": 0.36055762110299433, "learning_rate": 8.355449915870787e-06, "loss": 0.0345, "step": 2632 }, { "epoch": 1.0181747873163187, "grad_norm": 0.2774166380757577, "learning_rate": 8.353781643641181e-06, "loss": 0.029, "step": 2633 }, { "epoch": 1.0185614849187936, "grad_norm": 0.3173098025804593, "learning_rate": 8.352112692403668e-06, "loss": 0.0327, "step": 2634 }, { "epoch": 1.0189481825212683, "grad_norm": 0.38274021434245636, "learning_rate": 8.350443062496142e-06, "loss": 0.0412, "step": 2635 }, { "epoch": 1.0193348801237432, "grad_norm": 0.5401877785220365, "learning_rate": 8.34877275425664e-06, "loss": 0.0603, "step": 2636 }, { "epoch": 1.019721577726218, "grad_norm": 0.35292306836277365, "learning_rate": 8.34710176802333e-06, "loss": 0.0382, "step": 2637 }, { "epoch": 1.020108275328693, "grad_norm": 0.3626407756793344, "learning_rate": 8.34543010413452e-06, "loss": 0.0386, "step": 2638 }, { "epoch": 1.0204949729311679, "grad_norm": 0.3205993532785203, "learning_rate": 8.343757762928661e-06, "loss": 0.0353, "step": 2639 }, { "epoch": 1.0208816705336428, "grad_norm": 0.3800108183318496, "learning_rate": 8.34208474474433e-06, "loss": 0.0361, "step": 2640 }, { "epoch": 1.0212683681361177, "grad_norm": 0.4119178779326424, "learning_rate": 8.34041104992025e-06, "loss": 0.0347, "step": 2641 }, { "epoch": 1.0216550657385923, "grad_norm": 0.35171354899309476, "learning_rate": 8.338736678795279e-06, "loss": 0.0344, "step": 2642 }, { "epoch": 1.0220417633410672, "grad_norm": 0.28877300552962815, "learning_rate": 8.337061631708407e-06, "loss": 0.0359, "step": 2643 }, { "epoch": 1.0224284609435421, "grad_norm": 0.419058393987725, "learning_rate": 8.335385908998768e-06, "loss": 0.0406, "step": 2644 }, { "epoch": 1.022815158546017, "grad_norm": 0.32233068603321635, "learning_rate": 8.333709511005625e-06, "loss": 0.0261, "step": 2645 }, { "epoch": 1.023201856148492, "grad_norm": 0.2582970633331964, "learning_rate": 8.33203243806839e-06, "loss": 0.0226, "step": 2646 }, { "epoch": 1.0235885537509668, "grad_norm": 0.2970309456601428, "learning_rate": 8.330354690526595e-06, "loss": 0.0339, "step": 2647 }, { "epoch": 1.0239752513534417, "grad_norm": 0.33382899092999624, "learning_rate": 8.328676268719923e-06, "loss": 0.0363, "step": 2648 }, { "epoch": 1.0243619489559164, "grad_norm": 0.7585111550844564, "learning_rate": 8.326997172988189e-06, "loss": 0.0364, "step": 2649 }, { "epoch": 1.0247486465583913, "grad_norm": 0.37581093762051865, "learning_rate": 8.325317403671338e-06, "loss": 0.0395, "step": 2650 }, { "epoch": 1.0251353441608662, "grad_norm": 0.31780013973425286, "learning_rate": 8.32363696110946e-06, "loss": 0.0284, "step": 2651 }, { "epoch": 1.025522041763341, "grad_norm": 0.3472572678948667, "learning_rate": 8.321955845642778e-06, "loss": 0.0343, "step": 2652 }, { "epoch": 1.025908739365816, "grad_norm": 0.41312925578457904, "learning_rate": 8.32027405761165e-06, "loss": 0.0316, "step": 2653 }, { "epoch": 1.0262954369682908, "grad_norm": 0.32834018237722046, "learning_rate": 8.318591597356573e-06, "loss": 0.0307, "step": 2654 }, { "epoch": 1.0266821345707657, "grad_norm": 0.2737075596792407, "learning_rate": 8.31690846521818e-06, "loss": 0.0359, "step": 2655 }, { "epoch": 1.0270688321732406, "grad_norm": 0.3661008440933735, "learning_rate": 8.315224661537232e-06, "loss": 0.0384, "step": 2656 }, { "epoch": 1.0274555297757153, "grad_norm": 0.33641785386701056, "learning_rate": 8.313540186654639e-06, "loss": 0.0258, "step": 2657 }, { "epoch": 1.0278422273781902, "grad_norm": 0.42442602036423677, "learning_rate": 8.311855040911438e-06, "loss": 0.0329, "step": 2658 }, { "epoch": 1.028228924980665, "grad_norm": 0.3467151358390194, "learning_rate": 8.310169224648807e-06, "loss": 0.0321, "step": 2659 }, { "epoch": 1.02861562258314, "grad_norm": 0.48669335937114533, "learning_rate": 8.308482738208052e-06, "loss": 0.025, "step": 2660 }, { "epoch": 1.0290023201856149, "grad_norm": 0.4625838807967639, "learning_rate": 8.306795581930625e-06, "loss": 0.0481, "step": 2661 }, { "epoch": 1.0293890177880898, "grad_norm": 0.3581433732223238, "learning_rate": 8.305107756158105e-06, "loss": 0.0342, "step": 2662 }, { "epoch": 1.0297757153905647, "grad_norm": 0.3704440179541943, "learning_rate": 8.303419261232208e-06, "loss": 0.0374, "step": 2663 }, { "epoch": 1.0301624129930393, "grad_norm": 0.5127744967036244, "learning_rate": 8.30173009749479e-06, "loss": 0.0383, "step": 2664 }, { "epoch": 1.0305491105955142, "grad_norm": 0.4499735471096172, "learning_rate": 8.300040265287842e-06, "loss": 0.0457, "step": 2665 }, { "epoch": 1.0309358081979891, "grad_norm": 0.2749439269030395, "learning_rate": 8.298349764953483e-06, "loss": 0.0309, "step": 2666 }, { "epoch": 1.031322505800464, "grad_norm": 0.3951403929332403, "learning_rate": 8.296658596833977e-06, "loss": 0.0363, "step": 2667 }, { "epoch": 1.031709203402939, "grad_norm": 0.37272372441830287, "learning_rate": 8.294966761271718e-06, "loss": 0.0276, "step": 2668 }, { "epoch": 1.0320959010054138, "grad_norm": 0.34005114336197506, "learning_rate": 8.293274258609231e-06, "loss": 0.0352, "step": 2669 }, { "epoch": 1.0324825986078887, "grad_norm": 0.2682891036274546, "learning_rate": 8.291581089189186e-06, "loss": 0.0317, "step": 2670 }, { "epoch": 1.0328692962103636, "grad_norm": 0.27521682336619857, "learning_rate": 8.28988725335438e-06, "loss": 0.0249, "step": 2671 }, { "epoch": 1.0332559938128383, "grad_norm": 0.34870271564503097, "learning_rate": 8.288192751447749e-06, "loss": 0.0385, "step": 2672 }, { "epoch": 1.0336426914153132, "grad_norm": 0.24954229005023118, "learning_rate": 8.28649758381236e-06, "loss": 0.0228, "step": 2673 }, { "epoch": 1.034029389017788, "grad_norm": 0.4035946513073069, "learning_rate": 8.284801750791419e-06, "loss": 0.0382, "step": 2674 }, { "epoch": 1.034416086620263, "grad_norm": 0.446890246226145, "learning_rate": 8.283105252728267e-06, "loss": 0.0382, "step": 2675 }, { "epoch": 1.0348027842227379, "grad_norm": 0.32846972995343576, "learning_rate": 8.281408089966375e-06, "loss": 0.0308, "step": 2676 }, { "epoch": 1.0351894818252128, "grad_norm": 0.3747431832182304, "learning_rate": 8.27971026284935e-06, "loss": 0.0409, "step": 2677 }, { "epoch": 1.0355761794276876, "grad_norm": 0.5359186117924981, "learning_rate": 8.27801177172094e-06, "loss": 0.0325, "step": 2678 }, { "epoch": 1.0359628770301623, "grad_norm": 0.35433523497029096, "learning_rate": 8.276312616925016e-06, "loss": 0.034, "step": 2679 }, { "epoch": 1.0363495746326372, "grad_norm": 0.34713932636008427, "learning_rate": 8.274612798805593e-06, "loss": 0.0339, "step": 2680 }, { "epoch": 1.036736272235112, "grad_norm": 0.42766303726185195, "learning_rate": 8.272912317706817e-06, "loss": 0.0465, "step": 2681 }, { "epoch": 1.037122969837587, "grad_norm": 0.38093242543108, "learning_rate": 8.271211173972968e-06, "loss": 0.0453, "step": 2682 }, { "epoch": 1.037509667440062, "grad_norm": 0.3278025509288705, "learning_rate": 8.269509367948458e-06, "loss": 0.0275, "step": 2683 }, { "epoch": 1.0378963650425368, "grad_norm": 0.3552967121389711, "learning_rate": 8.267806899977836e-06, "loss": 0.026, "step": 2684 }, { "epoch": 1.0382830626450117, "grad_norm": 0.2879023366976655, "learning_rate": 8.266103770405787e-06, "loss": 0.0334, "step": 2685 }, { "epoch": 1.0386697602474864, "grad_norm": 0.3618078735800212, "learning_rate": 8.264399979577124e-06, "loss": 0.0353, "step": 2686 }, { "epoch": 1.0390564578499613, "grad_norm": 0.36239765347236985, "learning_rate": 8.2626955278368e-06, "loss": 0.0376, "step": 2687 }, { "epoch": 1.0394431554524362, "grad_norm": 0.25957682519597014, "learning_rate": 8.260990415529895e-06, "loss": 0.0255, "step": 2688 }, { "epoch": 1.039829853054911, "grad_norm": 0.6260254177642914, "learning_rate": 8.25928464300163e-06, "loss": 0.0477, "step": 2689 }, { "epoch": 1.040216550657386, "grad_norm": 0.2953778542562066, "learning_rate": 8.257578210597356e-06, "loss": 0.0286, "step": 2690 }, { "epoch": 1.0406032482598608, "grad_norm": 0.44737084892100004, "learning_rate": 8.255871118662557e-06, "loss": 0.0406, "step": 2691 }, { "epoch": 1.0409899458623357, "grad_norm": 0.6099409650760299, "learning_rate": 8.254163367542853e-06, "loss": 0.0357, "step": 2692 }, { "epoch": 1.0413766434648106, "grad_norm": 0.35612057029200905, "learning_rate": 8.252454957583993e-06, "loss": 0.031, "step": 2693 }, { "epoch": 1.0417633410672853, "grad_norm": 0.30570654781485096, "learning_rate": 8.250745889131866e-06, "loss": 0.0399, "step": 2694 }, { "epoch": 1.0421500386697602, "grad_norm": 0.35936810216369725, "learning_rate": 8.249036162532487e-06, "loss": 0.0425, "step": 2695 }, { "epoch": 1.042536736272235, "grad_norm": 0.3165906176139552, "learning_rate": 8.247325778132011e-06, "loss": 0.0225, "step": 2696 }, { "epoch": 1.04292343387471, "grad_norm": 0.3880095259198951, "learning_rate": 8.245614736276721e-06, "loss": 0.0302, "step": 2697 }, { "epoch": 1.0433101314771849, "grad_norm": 0.34087151229743384, "learning_rate": 8.243903037313037e-06, "loss": 0.0254, "step": 2698 }, { "epoch": 1.0436968290796598, "grad_norm": 0.35454176413368516, "learning_rate": 8.242190681587507e-06, "loss": 0.033, "step": 2699 }, { "epoch": 1.0440835266821347, "grad_norm": 0.2996269330336937, "learning_rate": 8.240477669446819e-06, "loss": 0.0347, "step": 2700 }, { "epoch": 1.0444702242846093, "grad_norm": 0.36111029501632974, "learning_rate": 8.238764001237788e-06, "loss": 0.0333, "step": 2701 }, { "epoch": 1.0448569218870842, "grad_norm": 0.4747549058184568, "learning_rate": 8.237049677307365e-06, "loss": 0.0475, "step": 2702 }, { "epoch": 1.0452436194895591, "grad_norm": 0.43877076864057457, "learning_rate": 8.235334698002632e-06, "loss": 0.0461, "step": 2703 }, { "epoch": 1.045630317092034, "grad_norm": 0.2766454861766778, "learning_rate": 8.233619063670803e-06, "loss": 0.0295, "step": 2704 }, { "epoch": 1.046017014694509, "grad_norm": 0.3170990458897529, "learning_rate": 8.231902774659229e-06, "loss": 0.0235, "step": 2705 }, { "epoch": 1.0464037122969838, "grad_norm": 0.36338014153981496, "learning_rate": 8.230185831315387e-06, "loss": 0.034, "step": 2706 }, { "epoch": 1.0467904098994587, "grad_norm": 0.5238860848230708, "learning_rate": 8.228468233986894e-06, "loss": 0.0402, "step": 2707 }, { "epoch": 1.0471771075019336, "grad_norm": 0.35023185369705956, "learning_rate": 8.226749983021494e-06, "loss": 0.0333, "step": 2708 }, { "epoch": 1.0475638051044083, "grad_norm": 0.45427211944719287, "learning_rate": 8.225031078767062e-06, "loss": 0.0445, "step": 2709 }, { "epoch": 1.0479505027068832, "grad_norm": 0.32741769194398207, "learning_rate": 8.223311521571611e-06, "loss": 0.0278, "step": 2710 }, { "epoch": 1.048337200309358, "grad_norm": 0.37851455328463635, "learning_rate": 8.221591311783282e-06, "loss": 0.0311, "step": 2711 }, { "epoch": 1.048723897911833, "grad_norm": 0.5939804952853841, "learning_rate": 8.21987044975035e-06, "loss": 0.0571, "step": 2712 }, { "epoch": 1.0491105955143079, "grad_norm": 0.239700883360625, "learning_rate": 8.218148935821219e-06, "loss": 0.0338, "step": 2713 }, { "epoch": 1.0494972931167827, "grad_norm": 0.348372387204146, "learning_rate": 8.216426770344432e-06, "loss": 0.0338, "step": 2714 }, { "epoch": 1.0498839907192576, "grad_norm": 0.32860311961748356, "learning_rate": 8.214703953668655e-06, "loss": 0.0293, "step": 2715 }, { "epoch": 1.0502706883217323, "grad_norm": 0.3279741413145628, "learning_rate": 8.212980486142693e-06, "loss": 0.0272, "step": 2716 }, { "epoch": 1.0506573859242072, "grad_norm": 0.3582872048185305, "learning_rate": 8.211256368115479e-06, "loss": 0.033, "step": 2717 }, { "epoch": 1.051044083526682, "grad_norm": 0.48745588662328876, "learning_rate": 8.209531599936075e-06, "loss": 0.0346, "step": 2718 }, { "epoch": 1.051430781129157, "grad_norm": 0.25318541157765884, "learning_rate": 8.207806181953686e-06, "loss": 0.0247, "step": 2719 }, { "epoch": 1.051817478731632, "grad_norm": 0.5028929550243572, "learning_rate": 8.206080114517633e-06, "loss": 0.0408, "step": 2720 }, { "epoch": 1.0522041763341068, "grad_norm": 0.27775509914476526, "learning_rate": 8.20435339797738e-06, "loss": 0.029, "step": 2721 }, { "epoch": 1.0525908739365817, "grad_norm": 0.5226297843367894, "learning_rate": 8.202626032682518e-06, "loss": 0.0505, "step": 2722 }, { "epoch": 1.0529775715390564, "grad_norm": 0.43522524783333155, "learning_rate": 8.200898018982773e-06, "loss": 0.0306, "step": 2723 }, { "epoch": 1.0533642691415313, "grad_norm": 0.34540955469305523, "learning_rate": 8.199169357227993e-06, "loss": 0.0414, "step": 2724 }, { "epoch": 1.0537509667440061, "grad_norm": 0.25569613952922526, "learning_rate": 8.19744004776817e-06, "loss": 0.0233, "step": 2725 }, { "epoch": 1.054137664346481, "grad_norm": 0.4095718188202182, "learning_rate": 8.195710090953416e-06, "loss": 0.0433, "step": 2726 }, { "epoch": 1.054524361948956, "grad_norm": 0.34967565213561363, "learning_rate": 8.19397948713398e-06, "loss": 0.0391, "step": 2727 }, { "epoch": 1.0549110595514308, "grad_norm": 0.21742145086018858, "learning_rate": 8.192248236660244e-06, "loss": 0.0209, "step": 2728 }, { "epoch": 1.0552977571539057, "grad_norm": 0.23838514012044276, "learning_rate": 8.190516339882713e-06, "loss": 0.0246, "step": 2729 }, { "epoch": 1.0556844547563804, "grad_norm": 0.5328006775916866, "learning_rate": 8.188783797152028e-06, "loss": 0.0613, "step": 2730 }, { "epoch": 1.0560711523588553, "grad_norm": 0.30125038363900114, "learning_rate": 8.187050608818963e-06, "loss": 0.0335, "step": 2731 }, { "epoch": 1.0564578499613302, "grad_norm": 0.28649022306668714, "learning_rate": 8.185316775234418e-06, "loss": 0.0267, "step": 2732 }, { "epoch": 1.056844547563805, "grad_norm": 0.4700922533494361, "learning_rate": 8.183582296749426e-06, "loss": 0.051, "step": 2733 }, { "epoch": 1.05723124516628, "grad_norm": 0.3559795691463978, "learning_rate": 8.18184717371515e-06, "loss": 0.0434, "step": 2734 }, { "epoch": 1.0576179427687549, "grad_norm": 0.3912031825367695, "learning_rate": 8.180111406482885e-06, "loss": 0.0323, "step": 2735 }, { "epoch": 1.0580046403712298, "grad_norm": 0.3728724513872796, "learning_rate": 8.178374995404053e-06, "loss": 0.0474, "step": 2736 }, { "epoch": 1.0583913379737047, "grad_norm": 0.6634242435696885, "learning_rate": 8.17663794083021e-06, "loss": 0.0538, "step": 2737 }, { "epoch": 1.0587780355761793, "grad_norm": 0.2830794291449087, "learning_rate": 8.17490024311304e-06, "loss": 0.0284, "step": 2738 }, { "epoch": 1.0591647331786542, "grad_norm": 0.48402604062829363, "learning_rate": 8.173161902604358e-06, "loss": 0.0424, "step": 2739 }, { "epoch": 1.0595514307811291, "grad_norm": 0.28027567474766346, "learning_rate": 8.171422919656108e-06, "loss": 0.0238, "step": 2740 }, { "epoch": 1.059938128383604, "grad_norm": 0.3234778016205677, "learning_rate": 8.169683294620369e-06, "loss": 0.0312, "step": 2741 }, { "epoch": 1.060324825986079, "grad_norm": 0.3046535495349869, "learning_rate": 8.167943027849343e-06, "loss": 0.0318, "step": 2742 }, { "epoch": 1.0607115235885538, "grad_norm": 0.30765475962133404, "learning_rate": 8.166202119695365e-06, "loss": 0.0283, "step": 2743 }, { "epoch": 1.0610982211910287, "grad_norm": 0.35796409799344936, "learning_rate": 8.1644605705109e-06, "loss": 0.0333, "step": 2744 }, { "epoch": 1.0614849187935034, "grad_norm": 0.367995093845418, "learning_rate": 8.162718380648544e-06, "loss": 0.0327, "step": 2745 }, { "epoch": 1.0618716163959783, "grad_norm": 0.3000253668880032, "learning_rate": 8.16097555046102e-06, "loss": 0.0317, "step": 2746 }, { "epoch": 1.0622583139984532, "grad_norm": 0.32697758752714856, "learning_rate": 8.159232080301182e-06, "loss": 0.0343, "step": 2747 }, { "epoch": 1.062645011600928, "grad_norm": 0.4156960467540396, "learning_rate": 8.157487970522018e-06, "loss": 0.0334, "step": 2748 }, { "epoch": 1.063031709203403, "grad_norm": 0.4136993380542769, "learning_rate": 8.155743221476635e-06, "loss": 0.0412, "step": 2749 }, { "epoch": 1.0634184068058778, "grad_norm": 0.5742713840796717, "learning_rate": 8.153997833518276e-06, "loss": 0.039, "step": 2750 }, { "epoch": 1.0638051044083527, "grad_norm": 0.31616120527103303, "learning_rate": 8.15225180700032e-06, "loss": 0.031, "step": 2751 }, { "epoch": 1.0641918020108276, "grad_norm": 0.2943659801035774, "learning_rate": 8.15050514227626e-06, "loss": 0.0314, "step": 2752 }, { "epoch": 1.0645784996133023, "grad_norm": 0.4242024758904585, "learning_rate": 8.148757839699729e-06, "loss": 0.0426, "step": 2753 }, { "epoch": 1.0649651972157772, "grad_norm": 0.28576237196304954, "learning_rate": 8.147009899624487e-06, "loss": 0.0344, "step": 2754 }, { "epoch": 1.065351894818252, "grad_norm": 0.2906264758119608, "learning_rate": 8.145261322404424e-06, "loss": 0.0255, "step": 2755 }, { "epoch": 1.065738592420727, "grad_norm": 0.34119674668567546, "learning_rate": 8.143512108393556e-06, "loss": 0.0404, "step": 2756 }, { "epoch": 1.0661252900232019, "grad_norm": 0.33683050858601715, "learning_rate": 8.141762257946029e-06, "loss": 0.0381, "step": 2757 }, { "epoch": 1.0665119876256768, "grad_norm": 0.5903114018296438, "learning_rate": 8.14001177141612e-06, "loss": 0.0602, "step": 2758 }, { "epoch": 1.0668986852281517, "grad_norm": 0.5204900616855601, "learning_rate": 8.138260649158233e-06, "loss": 0.0502, "step": 2759 }, { "epoch": 1.0672853828306264, "grad_norm": 0.29807513648829215, "learning_rate": 8.1365088915269e-06, "loss": 0.035, "step": 2760 }, { "epoch": 1.0676720804331012, "grad_norm": 0.3064085083739329, "learning_rate": 8.134756498876783e-06, "loss": 0.0273, "step": 2761 }, { "epoch": 1.0680587780355761, "grad_norm": 0.44600666856440013, "learning_rate": 8.13300347156267e-06, "loss": 0.0438, "step": 2762 }, { "epoch": 1.068445475638051, "grad_norm": 0.38531080681238755, "learning_rate": 8.131249809939484e-06, "loss": 0.0312, "step": 2763 }, { "epoch": 1.068832173240526, "grad_norm": 0.3281143095065217, "learning_rate": 8.129495514362269e-06, "loss": 0.0338, "step": 2764 }, { "epoch": 1.0692188708430008, "grad_norm": 0.3935696268816983, "learning_rate": 8.1277405851862e-06, "loss": 0.0351, "step": 2765 }, { "epoch": 1.0696055684454757, "grad_norm": 0.49583608515710603, "learning_rate": 8.12598502276658e-06, "loss": 0.0427, "step": 2766 }, { "epoch": 1.0699922660479504, "grad_norm": 0.35353762089264973, "learning_rate": 8.124228827458845e-06, "loss": 0.0321, "step": 2767 }, { "epoch": 1.0703789636504253, "grad_norm": 0.24973178064890922, "learning_rate": 8.122471999618551e-06, "loss": 0.0172, "step": 2768 }, { "epoch": 1.0707656612529002, "grad_norm": 0.46876106738100465, "learning_rate": 8.120714539601387e-06, "loss": 0.0438, "step": 2769 }, { "epoch": 1.071152358855375, "grad_norm": 0.4602016919205706, "learning_rate": 8.11895644776317e-06, "loss": 0.0345, "step": 2770 }, { "epoch": 1.07153905645785, "grad_norm": 0.3176384935774323, "learning_rate": 8.117197724459844e-06, "loss": 0.0298, "step": 2771 }, { "epoch": 1.0719257540603249, "grad_norm": 0.28613888617457695, "learning_rate": 8.115438370047479e-06, "loss": 0.0236, "step": 2772 }, { "epoch": 1.0723124516627998, "grad_norm": 0.2661614942452397, "learning_rate": 8.113678384882276e-06, "loss": 0.0275, "step": 2773 }, { "epoch": 1.0726991492652747, "grad_norm": 0.37429137896078557, "learning_rate": 8.11191776932056e-06, "loss": 0.0405, "step": 2774 }, { "epoch": 1.0730858468677493, "grad_norm": 0.3677439622040804, "learning_rate": 8.110156523718791e-06, "loss": 0.0358, "step": 2775 }, { "epoch": 1.0734725444702242, "grad_norm": 0.38407061052940794, "learning_rate": 8.108394648433546e-06, "loss": 0.0409, "step": 2776 }, { "epoch": 1.0738592420726991, "grad_norm": 0.4521284803669408, "learning_rate": 8.106632143821538e-06, "loss": 0.0426, "step": 2777 }, { "epoch": 1.074245939675174, "grad_norm": 0.399518277425635, "learning_rate": 8.104869010239605e-06, "loss": 0.0348, "step": 2778 }, { "epoch": 1.074632637277649, "grad_norm": 0.2561430775246142, "learning_rate": 8.103105248044707e-06, "loss": 0.0223, "step": 2779 }, { "epoch": 1.0750193348801238, "grad_norm": 0.31822157518340016, "learning_rate": 8.10134085759394e-06, "loss": 0.0263, "step": 2780 }, { "epoch": 1.0754060324825987, "grad_norm": 0.5800938197636822, "learning_rate": 8.099575839244524e-06, "loss": 0.028, "step": 2781 }, { "epoch": 1.0757927300850736, "grad_norm": 0.3198640890923043, "learning_rate": 8.097810193353803e-06, "loss": 0.0465, "step": 2782 }, { "epoch": 1.0761794276875483, "grad_norm": 0.2276666111237598, "learning_rate": 8.09604392027925e-06, "loss": 0.0234, "step": 2783 }, { "epoch": 1.0765661252900232, "grad_norm": 0.29971881713705406, "learning_rate": 8.094277020378468e-06, "loss": 0.0339, "step": 2784 }, { "epoch": 1.076952822892498, "grad_norm": 0.29054066622029084, "learning_rate": 8.092509494009182e-06, "loss": 0.0264, "step": 2785 }, { "epoch": 1.077339520494973, "grad_norm": 0.33818057922722794, "learning_rate": 8.090741341529245e-06, "loss": 0.0316, "step": 2786 }, { "epoch": 1.0777262180974478, "grad_norm": 0.3212696885172994, "learning_rate": 8.08897256329664e-06, "loss": 0.0344, "step": 2787 }, { "epoch": 1.0781129156999227, "grad_norm": 0.4069983101678968, "learning_rate": 8.087203159669476e-06, "loss": 0.0377, "step": 2788 }, { "epoch": 1.0784996133023976, "grad_norm": 0.2913536954333628, "learning_rate": 8.085433131005982e-06, "loss": 0.0239, "step": 2789 }, { "epoch": 1.0788863109048723, "grad_norm": 0.28196251375669734, "learning_rate": 8.083662477664525e-06, "loss": 0.0291, "step": 2790 }, { "epoch": 1.0792730085073472, "grad_norm": 0.34222532894724844, "learning_rate": 8.081891200003586e-06, "loss": 0.0284, "step": 2791 }, { "epoch": 1.079659706109822, "grad_norm": 0.37752193196444916, "learning_rate": 8.080119298381783e-06, "loss": 0.0402, "step": 2792 }, { "epoch": 1.080046403712297, "grad_norm": 0.2624398977440741, "learning_rate": 8.078346773157853e-06, "loss": 0.03, "step": 2793 }, { "epoch": 1.0804331013147719, "grad_norm": 0.3086610056425077, "learning_rate": 8.076573624690664e-06, "loss": 0.0237, "step": 2794 }, { "epoch": 1.0808197989172468, "grad_norm": 0.34388536188924973, "learning_rate": 8.07479985333921e-06, "loss": 0.0259, "step": 2795 }, { "epoch": 1.0812064965197217, "grad_norm": 0.30621820343505646, "learning_rate": 8.073025459462604e-06, "loss": 0.0294, "step": 2796 }, { "epoch": 1.0815931941221963, "grad_norm": 0.28408938919496574, "learning_rate": 8.071250443420096e-06, "loss": 0.0275, "step": 2797 }, { "epoch": 1.0819798917246712, "grad_norm": 0.39517759306428646, "learning_rate": 8.069474805571052e-06, "loss": 0.0322, "step": 2798 }, { "epoch": 1.0823665893271461, "grad_norm": 0.41477001040498346, "learning_rate": 8.067698546274972e-06, "loss": 0.0338, "step": 2799 }, { "epoch": 1.082753286929621, "grad_norm": 0.3696061041551299, "learning_rate": 8.065921665891475e-06, "loss": 0.0332, "step": 2800 }, { "epoch": 1.083139984532096, "grad_norm": 0.3865776814524402, "learning_rate": 8.064144164780312e-06, "loss": 0.0422, "step": 2801 }, { "epoch": 1.0835266821345708, "grad_norm": 0.40889066031953397, "learning_rate": 8.062366043301354e-06, "loss": 0.0428, "step": 2802 }, { "epoch": 1.0839133797370457, "grad_norm": 0.2517564824968195, "learning_rate": 8.060587301814602e-06, "loss": 0.0232, "step": 2803 }, { "epoch": 1.0843000773395204, "grad_norm": 0.42047039439812, "learning_rate": 8.058807940680177e-06, "loss": 0.0555, "step": 2804 }, { "epoch": 1.0846867749419953, "grad_norm": 0.463664974676807, "learning_rate": 8.057027960258335e-06, "loss": 0.0428, "step": 2805 }, { "epoch": 1.0850734725444702, "grad_norm": 0.3500413303487954, "learning_rate": 8.055247360909447e-06, "loss": 0.0422, "step": 2806 }, { "epoch": 1.085460170146945, "grad_norm": 0.3506791021907888, "learning_rate": 8.053466142994015e-06, "loss": 0.0322, "step": 2807 }, { "epoch": 1.08584686774942, "grad_norm": 0.3893616961010892, "learning_rate": 8.051684306872667e-06, "loss": 0.0398, "step": 2808 }, { "epoch": 1.0862335653518949, "grad_norm": 0.23071434532341809, "learning_rate": 8.049901852906152e-06, "loss": 0.0256, "step": 2809 }, { "epoch": 1.0866202629543698, "grad_norm": 0.2958433240236609, "learning_rate": 8.048118781455346e-06, "loss": 0.0317, "step": 2810 }, { "epoch": 1.0870069605568446, "grad_norm": 0.32764864318144904, "learning_rate": 8.04633509288125e-06, "loss": 0.0437, "step": 2811 }, { "epoch": 1.0873936581593193, "grad_norm": 0.3626699983450668, "learning_rate": 8.044550787544994e-06, "loss": 0.0385, "step": 2812 }, { "epoch": 1.0877803557617942, "grad_norm": 0.30430687406806006, "learning_rate": 8.042765865807825e-06, "loss": 0.0396, "step": 2813 }, { "epoch": 1.088167053364269, "grad_norm": 0.30873049601868846, "learning_rate": 8.04098032803112e-06, "loss": 0.0379, "step": 2814 }, { "epoch": 1.088553750966744, "grad_norm": 0.3708112973015729, "learning_rate": 8.039194174576382e-06, "loss": 0.037, "step": 2815 }, { "epoch": 1.088940448569219, "grad_norm": 0.40445076305861666, "learning_rate": 8.037407405805233e-06, "loss": 0.0289, "step": 2816 }, { "epoch": 1.0893271461716938, "grad_norm": 0.5137060102514559, "learning_rate": 8.035620022079424e-06, "loss": 0.0396, "step": 2817 }, { "epoch": 1.0897138437741687, "grad_norm": 0.328818693795021, "learning_rate": 8.033832023760833e-06, "loss": 0.0302, "step": 2818 }, { "epoch": 1.0901005413766434, "grad_norm": 0.2960475658715547, "learning_rate": 8.032043411211453e-06, "loss": 0.0328, "step": 2819 }, { "epoch": 1.0904872389791183, "grad_norm": 0.31000166094744713, "learning_rate": 8.03025418479341e-06, "loss": 0.0311, "step": 2820 }, { "epoch": 1.0908739365815932, "grad_norm": 0.32722531177695996, "learning_rate": 8.028464344868949e-06, "loss": 0.0288, "step": 2821 }, { "epoch": 1.091260634184068, "grad_norm": 0.2706757382033973, "learning_rate": 8.026673891800446e-06, "loss": 0.0283, "step": 2822 }, { "epoch": 1.091647331786543, "grad_norm": 0.7098392512608923, "learning_rate": 8.024882825950395e-06, "loss": 0.0299, "step": 2823 }, { "epoch": 1.0920340293890178, "grad_norm": 0.32790550702740895, "learning_rate": 8.023091147681414e-06, "loss": 0.0377, "step": 2824 }, { "epoch": 1.0924207269914927, "grad_norm": 0.3162267091112136, "learning_rate": 8.02129885735625e-06, "loss": 0.0316, "step": 2825 }, { "epoch": 1.0928074245939676, "grad_norm": 0.2953514720164199, "learning_rate": 8.019505955337769e-06, "loss": 0.0333, "step": 2826 }, { "epoch": 1.0931941221964423, "grad_norm": 0.5540660494122205, "learning_rate": 8.017712441988962e-06, "loss": 0.0486, "step": 2827 }, { "epoch": 1.0935808197989172, "grad_norm": 0.3732380772820561, "learning_rate": 8.015918317672946e-06, "loss": 0.0391, "step": 2828 }, { "epoch": 1.093967517401392, "grad_norm": 0.2955223898374131, "learning_rate": 8.01412358275296e-06, "loss": 0.0246, "step": 2829 }, { "epoch": 1.094354215003867, "grad_norm": 0.28180963028779765, "learning_rate": 8.012328237592363e-06, "loss": 0.0239, "step": 2830 }, { "epoch": 1.0947409126063419, "grad_norm": 0.44032282930160854, "learning_rate": 8.010532282554648e-06, "loss": 0.0373, "step": 2831 }, { "epoch": 1.0951276102088168, "grad_norm": 0.34774821744921747, "learning_rate": 8.008735718003418e-06, "loss": 0.045, "step": 2832 }, { "epoch": 1.0955143078112917, "grad_norm": 0.3053833512836554, "learning_rate": 8.00693854430241e-06, "loss": 0.0293, "step": 2833 }, { "epoch": 1.0959010054137663, "grad_norm": 0.5146409341601128, "learning_rate": 8.00514076181548e-06, "loss": 0.0471, "step": 2834 }, { "epoch": 1.0962877030162412, "grad_norm": 0.2612625691324244, "learning_rate": 8.003342370906608e-06, "loss": 0.025, "step": 2835 }, { "epoch": 1.0966744006187161, "grad_norm": 0.26230639051394, "learning_rate": 8.001543371939896e-06, "loss": 0.027, "step": 2836 }, { "epoch": 1.097061098221191, "grad_norm": 0.51831225826512, "learning_rate": 7.99974376527957e-06, "loss": 0.0505, "step": 2837 }, { "epoch": 1.097447795823666, "grad_norm": 0.35233396557073654, "learning_rate": 7.99794355128998e-06, "loss": 0.036, "step": 2838 }, { "epoch": 1.0978344934261408, "grad_norm": 0.3306196759015474, "learning_rate": 7.996142730335595e-06, "loss": 0.0332, "step": 2839 }, { "epoch": 1.0982211910286157, "grad_norm": 0.48588121567963205, "learning_rate": 7.994341302781015e-06, "loss": 0.0455, "step": 2840 }, { "epoch": 1.0986078886310904, "grad_norm": 0.3991068988913961, "learning_rate": 7.992539268990954e-06, "loss": 0.0331, "step": 2841 }, { "epoch": 1.0989945862335653, "grad_norm": 0.4764272812181763, "learning_rate": 7.990736629330253e-06, "loss": 0.047, "step": 2842 }, { "epoch": 1.0993812838360402, "grad_norm": 0.4178899649206485, "learning_rate": 7.988933384163876e-06, "loss": 0.0409, "step": 2843 }, { "epoch": 1.099767981438515, "grad_norm": 0.45060226181528107, "learning_rate": 7.987129533856907e-06, "loss": 0.0327, "step": 2844 }, { "epoch": 1.10015467904099, "grad_norm": 0.8246940474863937, "learning_rate": 7.985325078774557e-06, "loss": 0.0617, "step": 2845 }, { "epoch": 1.1005413766434649, "grad_norm": 0.2905968686052987, "learning_rate": 7.983520019282155e-06, "loss": 0.0321, "step": 2846 }, { "epoch": 1.1009280742459397, "grad_norm": 0.3039331162626491, "learning_rate": 7.981714355745154e-06, "loss": 0.0281, "step": 2847 }, { "epoch": 1.1013147718484146, "grad_norm": 0.5126199638016773, "learning_rate": 7.979908088529129e-06, "loss": 0.0507, "step": 2848 }, { "epoch": 1.1017014694508893, "grad_norm": 0.36639390138908456, "learning_rate": 7.978101217999779e-06, "loss": 0.0298, "step": 2849 }, { "epoch": 1.1020881670533642, "grad_norm": 0.4347512283144462, "learning_rate": 7.97629374452292e-06, "loss": 0.0316, "step": 2850 }, { "epoch": 1.102474864655839, "grad_norm": 0.41754211048037926, "learning_rate": 7.974485668464499e-06, "loss": 0.0279, "step": 2851 }, { "epoch": 1.102861562258314, "grad_norm": 0.4301358694291662, "learning_rate": 7.972676990190577e-06, "loss": 0.04, "step": 2852 }, { "epoch": 1.103248259860789, "grad_norm": 1.884677834704718, "learning_rate": 7.970867710067338e-06, "loss": 0.0645, "step": 2853 }, { "epoch": 1.1036349574632638, "grad_norm": 0.22512893595859734, "learning_rate": 7.969057828461095e-06, "loss": 0.0233, "step": 2854 }, { "epoch": 1.1040216550657387, "grad_norm": 0.30676745979685927, "learning_rate": 7.967247345738272e-06, "loss": 0.033, "step": 2855 }, { "epoch": 1.1044083526682134, "grad_norm": 0.29617559670596155, "learning_rate": 7.96543626226542e-06, "loss": 0.0245, "step": 2856 }, { "epoch": 1.1047950502706883, "grad_norm": 0.3797519967092428, "learning_rate": 7.963624578409215e-06, "loss": 0.0313, "step": 2857 }, { "epoch": 1.1051817478731631, "grad_norm": 0.34851860262539025, "learning_rate": 7.96181229453645e-06, "loss": 0.031, "step": 2858 }, { "epoch": 1.105568445475638, "grad_norm": 0.6403604442449568, "learning_rate": 7.95999941101404e-06, "loss": 0.0396, "step": 2859 }, { "epoch": 1.105955143078113, "grad_norm": 0.4629518291942353, "learning_rate": 7.95818592820902e-06, "loss": 0.0339, "step": 2860 }, { "epoch": 1.1063418406805878, "grad_norm": 0.3426736856250729, "learning_rate": 7.956371846488555e-06, "loss": 0.0338, "step": 2861 }, { "epoch": 1.1067285382830627, "grad_norm": 0.43477185287091225, "learning_rate": 7.95455716621992e-06, "loss": 0.0368, "step": 2862 }, { "epoch": 1.1071152358855376, "grad_norm": 0.3298637437295614, "learning_rate": 7.952741887770514e-06, "loss": 0.0299, "step": 2863 }, { "epoch": 1.1075019334880123, "grad_norm": 0.32493606799718666, "learning_rate": 7.950926011507862e-06, "loss": 0.0333, "step": 2864 }, { "epoch": 1.1078886310904872, "grad_norm": 0.4112332918644492, "learning_rate": 7.949109537799607e-06, "loss": 0.0301, "step": 2865 }, { "epoch": 1.108275328692962, "grad_norm": 0.3929679316672959, "learning_rate": 7.947292467013512e-06, "loss": 0.035, "step": 2866 }, { "epoch": 1.108662026295437, "grad_norm": 0.5443032204241129, "learning_rate": 7.945474799517463e-06, "loss": 0.0597, "step": 2867 }, { "epoch": 1.1090487238979119, "grad_norm": 0.2832934987343566, "learning_rate": 7.943656535679465e-06, "loss": 0.0273, "step": 2868 }, { "epoch": 1.1094354215003868, "grad_norm": 0.23860099298290327, "learning_rate": 7.941837675867643e-06, "loss": 0.0285, "step": 2869 }, { "epoch": 1.1098221191028617, "grad_norm": 0.41804541196837547, "learning_rate": 7.940018220450244e-06, "loss": 0.0428, "step": 2870 }, { "epoch": 1.1102088167053363, "grad_norm": 0.24865713377642484, "learning_rate": 7.93819816979564e-06, "loss": 0.0288, "step": 2871 }, { "epoch": 1.1105955143078112, "grad_norm": 0.4590953972931345, "learning_rate": 7.936377524272314e-06, "loss": 0.0369, "step": 2872 }, { "epoch": 1.1109822119102861, "grad_norm": 0.5175607543402124, "learning_rate": 7.934556284248878e-06, "loss": 0.0426, "step": 2873 }, { "epoch": 1.111368909512761, "grad_norm": 0.3935144105473328, "learning_rate": 7.932734450094058e-06, "loss": 0.0327, "step": 2874 }, { "epoch": 1.111755607115236, "grad_norm": 0.2997975819898972, "learning_rate": 7.930912022176705e-06, "loss": 0.0302, "step": 2875 }, { "epoch": 1.1121423047177108, "grad_norm": 0.4450428730099987, "learning_rate": 7.92908900086579e-06, "loss": 0.0356, "step": 2876 }, { "epoch": 1.1125290023201857, "grad_norm": 0.3946204397761512, "learning_rate": 7.927265386530398e-06, "loss": 0.0405, "step": 2877 }, { "epoch": 1.1129156999226604, "grad_norm": 0.4241968526635646, "learning_rate": 7.925441179539744e-06, "loss": 0.0417, "step": 2878 }, { "epoch": 1.1133023975251353, "grad_norm": 0.4683617946234156, "learning_rate": 7.923616380263153e-06, "loss": 0.0455, "step": 2879 }, { "epoch": 1.1136890951276102, "grad_norm": 0.32442436926865403, "learning_rate": 7.921790989070078e-06, "loss": 0.0246, "step": 2880 }, { "epoch": 1.114075792730085, "grad_norm": 0.544755844827933, "learning_rate": 7.919965006330089e-06, "loss": 0.0518, "step": 2881 }, { "epoch": 1.11446249033256, "grad_norm": 0.44929799505253704, "learning_rate": 7.918138432412871e-06, "loss": 0.0341, "step": 2882 }, { "epoch": 1.1148491879350348, "grad_norm": 0.36419143289802025, "learning_rate": 7.916311267688235e-06, "loss": 0.0358, "step": 2883 }, { "epoch": 1.1152358855375097, "grad_norm": 0.3923882393436111, "learning_rate": 7.91448351252611e-06, "loss": 0.0308, "step": 2884 }, { "epoch": 1.1156225831399846, "grad_norm": 0.5952705341762782, "learning_rate": 7.912655167296544e-06, "loss": 0.0468, "step": 2885 }, { "epoch": 1.1160092807424593, "grad_norm": 0.4239489204883244, "learning_rate": 7.910826232369701e-06, "loss": 0.042, "step": 2886 }, { "epoch": 1.1163959783449342, "grad_norm": 0.3563885552797971, "learning_rate": 7.908996708115875e-06, "loss": 0.0239, "step": 2887 }, { "epoch": 1.116782675947409, "grad_norm": 0.37189748196655864, "learning_rate": 7.907166594905467e-06, "loss": 0.0328, "step": 2888 }, { "epoch": 1.117169373549884, "grad_norm": 0.360605687806293, "learning_rate": 7.905335893109001e-06, "loss": 0.0347, "step": 2889 }, { "epoch": 1.1175560711523589, "grad_norm": 0.32684402321766154, "learning_rate": 7.903504603097127e-06, "loss": 0.054, "step": 2890 }, { "epoch": 1.1179427687548338, "grad_norm": 0.320880251420474, "learning_rate": 7.901672725240605e-06, "loss": 0.03, "step": 2891 }, { "epoch": 1.1183294663573087, "grad_norm": 0.2797977848141085, "learning_rate": 7.899840259910318e-06, "loss": 0.0352, "step": 2892 }, { "epoch": 1.1187161639597833, "grad_norm": 0.5707823548760051, "learning_rate": 7.898007207477267e-06, "loss": 0.0546, "step": 2893 }, { "epoch": 1.1191028615622582, "grad_norm": 0.2580616549904406, "learning_rate": 7.896173568312577e-06, "loss": 0.0265, "step": 2894 }, { "epoch": 1.1194895591647331, "grad_norm": 0.28130917421465784, "learning_rate": 7.894339342787481e-06, "loss": 0.0237, "step": 2895 }, { "epoch": 1.119876256767208, "grad_norm": 0.33072969397065843, "learning_rate": 7.892504531273339e-06, "loss": 0.0394, "step": 2896 }, { "epoch": 1.120262954369683, "grad_norm": 0.2635697992242552, "learning_rate": 7.89066913414163e-06, "loss": 0.0274, "step": 2897 }, { "epoch": 1.1206496519721578, "grad_norm": 0.2647647899188567, "learning_rate": 7.888833151763949e-06, "loss": 0.0329, "step": 2898 }, { "epoch": 1.1210363495746327, "grad_norm": 0.301144762516099, "learning_rate": 7.886996584512007e-06, "loss": 0.0399, "step": 2899 }, { "epoch": 1.1214230471771076, "grad_norm": 0.4167000529621517, "learning_rate": 7.885159432757637e-06, "loss": 0.03, "step": 2900 }, { "epoch": 1.1218097447795823, "grad_norm": 0.343233601604009, "learning_rate": 7.883321696872792e-06, "loss": 0.0317, "step": 2901 }, { "epoch": 1.1221964423820572, "grad_norm": 0.22798057028875504, "learning_rate": 7.881483377229536e-06, "loss": 0.0195, "step": 2902 }, { "epoch": 1.122583139984532, "grad_norm": 0.33342195475208164, "learning_rate": 7.879644474200059e-06, "loss": 0.0266, "step": 2903 }, { "epoch": 1.122969837587007, "grad_norm": 0.4017885912800055, "learning_rate": 7.877804988156665e-06, "loss": 0.0415, "step": 2904 }, { "epoch": 1.1233565351894819, "grad_norm": 0.4858515582199975, "learning_rate": 7.87596491947178e-06, "loss": 0.0425, "step": 2905 }, { "epoch": 1.1237432327919568, "grad_norm": 0.382250010700915, "learning_rate": 7.87412426851794e-06, "loss": 0.0417, "step": 2906 }, { "epoch": 1.1241299303944317, "grad_norm": 0.3212032416720621, "learning_rate": 7.87228303566781e-06, "loss": 0.0221, "step": 2907 }, { "epoch": 1.1245166279969063, "grad_norm": 0.5429521345907404, "learning_rate": 7.87044122129416e-06, "loss": 0.0485, "step": 2908 }, { "epoch": 1.1249033255993812, "grad_norm": 0.311028260744726, "learning_rate": 7.868598825769888e-06, "loss": 0.0287, "step": 2909 }, { "epoch": 1.1252900232018561, "grad_norm": 0.39316948285151926, "learning_rate": 7.866755849468007e-06, "loss": 0.039, "step": 2910 }, { "epoch": 1.125676720804331, "grad_norm": 0.33770680319394364, "learning_rate": 7.864912292761644e-06, "loss": 0.0359, "step": 2911 }, { "epoch": 1.126063418406806, "grad_norm": 0.5366065639356439, "learning_rate": 7.863068156024048e-06, "loss": 0.0422, "step": 2912 }, { "epoch": 1.1264501160092808, "grad_norm": 0.3481424774809158, "learning_rate": 7.861223439628583e-06, "loss": 0.0276, "step": 2913 }, { "epoch": 1.1268368136117557, "grad_norm": 0.521471098720829, "learning_rate": 7.859378143948732e-06, "loss": 0.0508, "step": 2914 }, { "epoch": 1.1272235112142304, "grad_norm": 0.27490316050860625, "learning_rate": 7.857532269358095e-06, "loss": 0.0295, "step": 2915 }, { "epoch": 1.1276102088167053, "grad_norm": 0.35144912261886074, "learning_rate": 7.855685816230385e-06, "loss": 0.0333, "step": 2916 }, { "epoch": 1.1279969064191802, "grad_norm": 0.3761561779372147, "learning_rate": 7.853838784939437e-06, "loss": 0.0352, "step": 2917 }, { "epoch": 1.128383604021655, "grad_norm": 0.3164015722235571, "learning_rate": 7.851991175859203e-06, "loss": 0.027, "step": 2918 }, { "epoch": 1.12877030162413, "grad_norm": 0.3349618999688671, "learning_rate": 7.850142989363749e-06, "loss": 0.0329, "step": 2919 }, { "epoch": 1.1291569992266048, "grad_norm": 0.2026085101149657, "learning_rate": 7.84829422582726e-06, "loss": 0.0215, "step": 2920 }, { "epoch": 1.1295436968290797, "grad_norm": 0.31052988742929216, "learning_rate": 7.84644488562404e-06, "loss": 0.0263, "step": 2921 }, { "epoch": 1.1299303944315544, "grad_norm": 0.3551104403276906, "learning_rate": 7.844594969128503e-06, "loss": 0.0422, "step": 2922 }, { "epoch": 1.1303170920340293, "grad_norm": 0.3967445889739598, "learning_rate": 7.842744476715187e-06, "loss": 0.0338, "step": 2923 }, { "epoch": 1.1307037896365042, "grad_norm": 0.5461748972852628, "learning_rate": 7.840893408758741e-06, "loss": 0.0465, "step": 2924 }, { "epoch": 1.131090487238979, "grad_norm": 0.3949629105055978, "learning_rate": 7.839041765633934e-06, "loss": 0.0322, "step": 2925 }, { "epoch": 1.131477184841454, "grad_norm": 0.4236294949449925, "learning_rate": 7.837189547715651e-06, "loss": 0.0455, "step": 2926 }, { "epoch": 1.1318638824439289, "grad_norm": 0.6310448876492659, "learning_rate": 7.835336755378891e-06, "loss": 0.0369, "step": 2927 }, { "epoch": 1.1322505800464038, "grad_norm": 0.26553937466432176, "learning_rate": 7.833483388998775e-06, "loss": 0.0294, "step": 2928 }, { "epoch": 1.1326372776488787, "grad_norm": 0.4319673742295662, "learning_rate": 7.83162944895053e-06, "loss": 0.0508, "step": 2929 }, { "epoch": 1.1330239752513536, "grad_norm": 0.5752079204321036, "learning_rate": 7.829774935609512e-06, "loss": 0.0516, "step": 2930 }, { "epoch": 1.1334106728538282, "grad_norm": 0.23515758294919564, "learning_rate": 7.82791984935118e-06, "loss": 0.0269, "step": 2931 }, { "epoch": 1.1337973704563031, "grad_norm": 0.2864845709150602, "learning_rate": 7.82606419055112e-06, "loss": 0.0317, "step": 2932 }, { "epoch": 1.134184068058778, "grad_norm": 0.36240843105546766, "learning_rate": 7.824207959585028e-06, "loss": 0.0305, "step": 2933 }, { "epoch": 1.134570765661253, "grad_norm": 0.3126647018451781, "learning_rate": 7.822351156828719e-06, "loss": 0.0328, "step": 2934 }, { "epoch": 1.1349574632637278, "grad_norm": 0.6121976751488666, "learning_rate": 7.820493782658118e-06, "loss": 0.041, "step": 2935 }, { "epoch": 1.1353441608662027, "grad_norm": 0.3726502048583313, "learning_rate": 7.818635837449273e-06, "loss": 0.0258, "step": 2936 }, { "epoch": 1.1357308584686776, "grad_norm": 0.24244440815143312, "learning_rate": 7.816777321578344e-06, "loss": 0.0226, "step": 2937 }, { "epoch": 1.1361175560711523, "grad_norm": 0.33111363979780134, "learning_rate": 7.814918235421606e-06, "loss": 0.0374, "step": 2938 }, { "epoch": 1.1365042536736272, "grad_norm": 0.400115821304739, "learning_rate": 7.81305857935545e-06, "loss": 0.0346, "step": 2939 }, { "epoch": 1.136890951276102, "grad_norm": 0.4047174261475301, "learning_rate": 7.811198353756384e-06, "loss": 0.0272, "step": 2940 }, { "epoch": 1.137277648878577, "grad_norm": 0.3298465366245158, "learning_rate": 7.80933755900103e-06, "loss": 0.0267, "step": 2941 }, { "epoch": 1.1376643464810519, "grad_norm": 0.2807886503334379, "learning_rate": 7.807476195466121e-06, "loss": 0.0268, "step": 2942 }, { "epoch": 1.1380510440835268, "grad_norm": 0.3353159685650416, "learning_rate": 7.805614263528516e-06, "loss": 0.029, "step": 2943 }, { "epoch": 1.1384377416860016, "grad_norm": 0.3191132234914456, "learning_rate": 7.803751763565178e-06, "loss": 0.0242, "step": 2944 }, { "epoch": 1.1388244392884763, "grad_norm": 0.36105796456973516, "learning_rate": 7.801888695953189e-06, "loss": 0.0376, "step": 2945 }, { "epoch": 1.1392111368909512, "grad_norm": 0.360815361781836, "learning_rate": 7.800025061069749e-06, "loss": 0.0416, "step": 2946 }, { "epoch": 1.139597834493426, "grad_norm": 0.3355885783936543, "learning_rate": 7.79816085929217e-06, "loss": 0.0327, "step": 2947 }, { "epoch": 1.139984532095901, "grad_norm": 0.22157619333766182, "learning_rate": 7.796296090997879e-06, "loss": 0.0206, "step": 2948 }, { "epoch": 1.140371229698376, "grad_norm": 0.4594695104221502, "learning_rate": 7.794430756564416e-06, "loss": 0.0326, "step": 2949 }, { "epoch": 1.1407579273008508, "grad_norm": 0.3301622542864772, "learning_rate": 7.792564856369436e-06, "loss": 0.0268, "step": 2950 }, { "epoch": 1.1411446249033257, "grad_norm": 0.31648330395984847, "learning_rate": 7.790698390790715e-06, "loss": 0.0362, "step": 2951 }, { "epoch": 1.1415313225058004, "grad_norm": 0.28680384973169737, "learning_rate": 7.788831360206135e-06, "loss": 0.0375, "step": 2952 }, { "epoch": 1.1419180201082753, "grad_norm": 0.354981128733844, "learning_rate": 7.786963764993695e-06, "loss": 0.0327, "step": 2953 }, { "epoch": 1.1423047177107502, "grad_norm": 0.6065471210073302, "learning_rate": 7.78509560553151e-06, "loss": 0.0493, "step": 2954 }, { "epoch": 1.142691415313225, "grad_norm": 0.31774536013889676, "learning_rate": 7.78322688219781e-06, "loss": 0.0271, "step": 2955 }, { "epoch": 1.1430781129157, "grad_norm": 0.3201014470299638, "learning_rate": 7.781357595370935e-06, "loss": 0.0331, "step": 2956 }, { "epoch": 1.1434648105181748, "grad_norm": 0.4499800264216485, "learning_rate": 7.779487745429343e-06, "loss": 0.0495, "step": 2957 }, { "epoch": 1.1438515081206497, "grad_norm": 0.5152495689699648, "learning_rate": 7.777617332751604e-06, "loss": 0.0421, "step": 2958 }, { "epoch": 1.1442382057231244, "grad_norm": 0.6125056419226053, "learning_rate": 7.775746357716401e-06, "loss": 0.0533, "step": 2959 }, { "epoch": 1.1446249033255993, "grad_norm": 0.31388826453694585, "learning_rate": 7.773874820702534e-06, "loss": 0.032, "step": 2960 }, { "epoch": 1.1450116009280742, "grad_norm": 0.3084546955986095, "learning_rate": 7.772002722088918e-06, "loss": 0.0315, "step": 2961 }, { "epoch": 1.145398298530549, "grad_norm": 0.2871721484167467, "learning_rate": 7.770130062254573e-06, "loss": 0.0264, "step": 2962 }, { "epoch": 1.145784996133024, "grad_norm": 0.2797232065995428, "learning_rate": 7.76825684157864e-06, "loss": 0.028, "step": 2963 }, { "epoch": 1.1461716937354989, "grad_norm": 0.38841880101511544, "learning_rate": 7.766383060440374e-06, "loss": 0.0334, "step": 2964 }, { "epoch": 1.1465583913379738, "grad_norm": 0.2587842949192418, "learning_rate": 7.764508719219142e-06, "loss": 0.0262, "step": 2965 }, { "epoch": 1.1469450889404487, "grad_norm": 0.331366631947136, "learning_rate": 7.76263381829442e-06, "loss": 0.0371, "step": 2966 }, { "epoch": 1.1473317865429233, "grad_norm": 0.3616691566444925, "learning_rate": 7.760758358045806e-06, "loss": 0.0302, "step": 2967 }, { "epoch": 1.1477184841453982, "grad_norm": 0.3500039697537271, "learning_rate": 7.758882338853002e-06, "loss": 0.0317, "step": 2968 }, { "epoch": 1.1481051817478731, "grad_norm": 0.6099807721394676, "learning_rate": 7.757005761095831e-06, "loss": 0.0492, "step": 2969 }, { "epoch": 1.148491879350348, "grad_norm": 0.4597428409686071, "learning_rate": 7.755128625154223e-06, "loss": 0.0482, "step": 2970 }, { "epoch": 1.148878576952823, "grad_norm": 0.3257328282401976, "learning_rate": 7.753250931408224e-06, "loss": 0.0252, "step": 2971 }, { "epoch": 1.1492652745552978, "grad_norm": 0.32450437013803324, "learning_rate": 7.751372680237994e-06, "loss": 0.0438, "step": 2972 }, { "epoch": 1.1496519721577727, "grad_norm": 0.338538370209082, "learning_rate": 7.749493872023804e-06, "loss": 0.0346, "step": 2973 }, { "epoch": 1.1500386697602476, "grad_norm": 0.36399510155759807, "learning_rate": 7.747614507146037e-06, "loss": 0.0452, "step": 2974 }, { "epoch": 1.1504253673627223, "grad_norm": 0.3214002156148467, "learning_rate": 7.745734585985193e-06, "loss": 0.0291, "step": 2975 }, { "epoch": 1.1508120649651972, "grad_norm": 0.2827641696093035, "learning_rate": 7.743854108921877e-06, "loss": 0.0276, "step": 2976 }, { "epoch": 1.151198762567672, "grad_norm": 0.35752092102139815, "learning_rate": 7.741973076336816e-06, "loss": 0.0279, "step": 2977 }, { "epoch": 1.151585460170147, "grad_norm": 0.5784724583673043, "learning_rate": 7.74009148861084e-06, "loss": 0.0441, "step": 2978 }, { "epoch": 1.1519721577726219, "grad_norm": 0.3288724149423513, "learning_rate": 7.738209346124899e-06, "loss": 0.0298, "step": 2979 }, { "epoch": 1.1523588553750967, "grad_norm": 0.305248570529525, "learning_rate": 7.736326649260051e-06, "loss": 0.0273, "step": 2980 }, { "epoch": 1.1527455529775716, "grad_norm": 0.2693868294761882, "learning_rate": 7.73444339839747e-06, "loss": 0.0224, "step": 2981 }, { "epoch": 1.1531322505800463, "grad_norm": 0.24354644480702559, "learning_rate": 7.732559593918436e-06, "loss": 0.0229, "step": 2982 }, { "epoch": 1.1535189481825212, "grad_norm": 0.346439773461077, "learning_rate": 7.730675236204347e-06, "loss": 0.0278, "step": 2983 }, { "epoch": 1.153905645784996, "grad_norm": 0.43034004700214534, "learning_rate": 7.728790325636708e-06, "loss": 0.037, "step": 2984 }, { "epoch": 1.154292343387471, "grad_norm": 0.5389241630140997, "learning_rate": 7.726904862597142e-06, "loss": 0.0387, "step": 2985 }, { "epoch": 1.154679040989946, "grad_norm": 0.5669074277011901, "learning_rate": 7.725018847467381e-06, "loss": 0.0581, "step": 2986 }, { "epoch": 1.1550657385924208, "grad_norm": 0.29143469061932537, "learning_rate": 7.723132280629266e-06, "loss": 0.0314, "step": 2987 }, { "epoch": 1.1554524361948957, "grad_norm": 0.3914296772217591, "learning_rate": 7.721245162464753e-06, "loss": 0.033, "step": 2988 }, { "epoch": 1.1558391337973704, "grad_norm": 0.31536024153919007, "learning_rate": 7.719357493355905e-06, "loss": 0.048, "step": 2989 }, { "epoch": 1.1562258313998452, "grad_norm": 0.27010282031350064, "learning_rate": 7.717469273684907e-06, "loss": 0.0263, "step": 2990 }, { "epoch": 1.1566125290023201, "grad_norm": 0.31309555611442474, "learning_rate": 7.715580503834047e-06, "loss": 0.0369, "step": 2991 }, { "epoch": 1.156999226604795, "grad_norm": 0.3339096093915523, "learning_rate": 7.713691184185721e-06, "loss": 0.0297, "step": 2992 }, { "epoch": 1.15738592420727, "grad_norm": 0.46157047874306417, "learning_rate": 7.711801315122447e-06, "loss": 0.0515, "step": 2993 }, { "epoch": 1.1577726218097448, "grad_norm": 0.28623535472535305, "learning_rate": 7.709910897026846e-06, "loss": 0.0299, "step": 2994 }, { "epoch": 1.1581593194122197, "grad_norm": 0.6755800898048745, "learning_rate": 7.708019930281651e-06, "loss": 0.0342, "step": 2995 }, { "epoch": 1.1585460170146944, "grad_norm": 0.3039198352950578, "learning_rate": 7.706128415269714e-06, "loss": 0.0305, "step": 2996 }, { "epoch": 1.1589327146171693, "grad_norm": 0.3004389648887865, "learning_rate": 7.704236352373983e-06, "loss": 0.0235, "step": 2997 }, { "epoch": 1.1593194122196442, "grad_norm": 0.2983330460162529, "learning_rate": 7.702343741977535e-06, "loss": 0.0321, "step": 2998 }, { "epoch": 1.159706109822119, "grad_norm": 0.3888317074458211, "learning_rate": 7.700450584463542e-06, "loss": 0.0428, "step": 2999 }, { "epoch": 1.160092807424594, "grad_norm": 0.5592423436923009, "learning_rate": 7.698556880215296e-06, "loss": 0.0701, "step": 3000 }, { "epoch": 1.1604795050270689, "grad_norm": 0.3183087272753406, "learning_rate": 7.696662629616198e-06, "loss": 0.0289, "step": 3001 }, { "epoch": 1.1608662026295438, "grad_norm": 0.2792421422698878, "learning_rate": 7.694767833049757e-06, "loss": 0.0314, "step": 3002 }, { "epoch": 1.1612529002320184, "grad_norm": 0.3335183197768573, "learning_rate": 7.692872490899592e-06, "loss": 0.0297, "step": 3003 }, { "epoch": 1.1616395978344933, "grad_norm": 0.31551173768427265, "learning_rate": 7.690976603549441e-06, "loss": 0.0395, "step": 3004 }, { "epoch": 1.1620262954369682, "grad_norm": 0.3073911697760464, "learning_rate": 7.68908017138314e-06, "loss": 0.0273, "step": 3005 }, { "epoch": 1.1624129930394431, "grad_norm": 0.28362381700061756, "learning_rate": 7.687183194784645e-06, "loss": 0.032, "step": 3006 }, { "epoch": 1.162799690641918, "grad_norm": 0.45905100519879194, "learning_rate": 7.685285674138018e-06, "loss": 0.0469, "step": 3007 }, { "epoch": 1.163186388244393, "grad_norm": 0.3142310057436528, "learning_rate": 7.68338760982743e-06, "loss": 0.0242, "step": 3008 }, { "epoch": 1.1635730858468678, "grad_norm": 0.3220284654582613, "learning_rate": 7.681489002237166e-06, "loss": 0.0313, "step": 3009 }, { "epoch": 1.1639597834493427, "grad_norm": 0.537968626528977, "learning_rate": 7.679589851751617e-06, "loss": 0.0405, "step": 3010 }, { "epoch": 1.1643464810518176, "grad_norm": 0.2681840554837464, "learning_rate": 7.677690158755286e-06, "loss": 0.0281, "step": 3011 }, { "epoch": 1.1647331786542923, "grad_norm": 0.24608067923149513, "learning_rate": 7.675789923632786e-06, "loss": 0.0248, "step": 3012 }, { "epoch": 1.1651198762567672, "grad_norm": 0.32000897895598407, "learning_rate": 7.67388914676884e-06, "loss": 0.0329, "step": 3013 }, { "epoch": 1.165506573859242, "grad_norm": 0.26312477011530755, "learning_rate": 7.671987828548283e-06, "loss": 0.029, "step": 3014 }, { "epoch": 1.165893271461717, "grad_norm": 0.38772549662413575, "learning_rate": 7.670085969356048e-06, "loss": 0.0326, "step": 3015 }, { "epoch": 1.1662799690641918, "grad_norm": 0.5350572442939138, "learning_rate": 7.668183569577193e-06, "loss": 0.0507, "step": 3016 }, { "epoch": 1.1666666666666667, "grad_norm": 0.37289475431597374, "learning_rate": 7.666280629596876e-06, "loss": 0.0355, "step": 3017 }, { "epoch": 1.1670533642691416, "grad_norm": 0.28606906281218225, "learning_rate": 7.664377149800368e-06, "loss": 0.0281, "step": 3018 }, { "epoch": 1.1674400618716163, "grad_norm": 0.7514133802101606, "learning_rate": 7.662473130573049e-06, "loss": 0.0551, "step": 3019 }, { "epoch": 1.1678267594740912, "grad_norm": 0.35126705121661544, "learning_rate": 7.660568572300406e-06, "loss": 0.0353, "step": 3020 }, { "epoch": 1.168213457076566, "grad_norm": 0.6001457580538186, "learning_rate": 7.658663475368038e-06, "loss": 0.0333, "step": 3021 }, { "epoch": 1.168600154679041, "grad_norm": 0.33145460235273333, "learning_rate": 7.656757840161648e-06, "loss": 0.0325, "step": 3022 }, { "epoch": 1.1689868522815159, "grad_norm": 0.28711087050953155, "learning_rate": 7.654851667067058e-06, "loss": 0.0339, "step": 3023 }, { "epoch": 1.1693735498839908, "grad_norm": 0.4164300056801829, "learning_rate": 7.652944956470185e-06, "loss": 0.0287, "step": 3024 }, { "epoch": 1.1697602474864657, "grad_norm": 0.28217862452226117, "learning_rate": 7.651037708757068e-06, "loss": 0.0312, "step": 3025 }, { "epoch": 1.1701469450889403, "grad_norm": 0.30541545386106994, "learning_rate": 7.649129924313847e-06, "loss": 0.0342, "step": 3026 }, { "epoch": 1.1705336426914152, "grad_norm": 0.36150179895841356, "learning_rate": 7.647221603526773e-06, "loss": 0.0374, "step": 3027 }, { "epoch": 1.1709203402938901, "grad_norm": 0.3125343411727398, "learning_rate": 7.645312746782208e-06, "loss": 0.0255, "step": 3028 }, { "epoch": 1.171307037896365, "grad_norm": 0.2972397500316982, "learning_rate": 7.643403354466614e-06, "loss": 0.0278, "step": 3029 }, { "epoch": 1.17169373549884, "grad_norm": 0.5376833178069447, "learning_rate": 7.641493426966571e-06, "loss": 0.02, "step": 3030 }, { "epoch": 1.1720804331013148, "grad_norm": 0.44283236775443596, "learning_rate": 7.639582964668764e-06, "loss": 0.0487, "step": 3031 }, { "epoch": 1.1724671307037897, "grad_norm": 0.2797625133578102, "learning_rate": 7.637671967959986e-06, "loss": 0.0226, "step": 3032 }, { "epoch": 1.1728538283062644, "grad_norm": 0.39304413707881525, "learning_rate": 7.635760437227135e-06, "loss": 0.0383, "step": 3033 }, { "epoch": 1.1732405259087393, "grad_norm": 0.33458647475335607, "learning_rate": 7.633848372857225e-06, "loss": 0.0326, "step": 3034 }, { "epoch": 1.1736272235112142, "grad_norm": 0.27815359536365997, "learning_rate": 7.631935775237369e-06, "loss": 0.0258, "step": 3035 }, { "epoch": 1.174013921113689, "grad_norm": 0.3735667041210378, "learning_rate": 7.630022644754793e-06, "loss": 0.0328, "step": 3036 }, { "epoch": 1.174400618716164, "grad_norm": 0.24514831492892508, "learning_rate": 7.628108981796832e-06, "loss": 0.0234, "step": 3037 }, { "epoch": 1.1747873163186389, "grad_norm": 0.3426741353872363, "learning_rate": 7.626194786750926e-06, "loss": 0.0242, "step": 3038 }, { "epoch": 1.1751740139211138, "grad_norm": 0.3364776266226224, "learning_rate": 7.624280060004625e-06, "loss": 0.0286, "step": 3039 }, { "epoch": 1.1755607115235884, "grad_norm": 0.4784611037949659, "learning_rate": 7.622364801945584e-06, "loss": 0.0406, "step": 3040 }, { "epoch": 1.1759474091260633, "grad_norm": 0.9985526523677448, "learning_rate": 7.620449012961566e-06, "loss": 0.0443, "step": 3041 }, { "epoch": 1.1763341067285382, "grad_norm": 0.2960601286904227, "learning_rate": 7.6185326934404435e-06, "loss": 0.03, "step": 3042 }, { "epoch": 1.1767208043310131, "grad_norm": 0.30208049866829884, "learning_rate": 7.616615843770196e-06, "loss": 0.0297, "step": 3043 }, { "epoch": 1.177107501933488, "grad_norm": 0.4684890409895218, "learning_rate": 7.614698464338908e-06, "loss": 0.0306, "step": 3044 }, { "epoch": 1.177494199535963, "grad_norm": 0.5007362430668637, "learning_rate": 7.6127805555347735e-06, "loss": 0.0386, "step": 3045 }, { "epoch": 1.1778808971384378, "grad_norm": 0.3638459964579541, "learning_rate": 7.610862117746092e-06, "loss": 0.0391, "step": 3046 }, { "epoch": 1.1782675947409127, "grad_norm": 0.43975077087151393, "learning_rate": 7.608943151361274e-06, "loss": 0.0394, "step": 3047 }, { "epoch": 1.1786542923433876, "grad_norm": 0.320675760055585, "learning_rate": 7.60702365676883e-06, "loss": 0.0311, "step": 3048 }, { "epoch": 1.1790409899458623, "grad_norm": 0.5437493713570246, "learning_rate": 7.605103634357387e-06, "loss": 0.0394, "step": 3049 }, { "epoch": 1.1794276875483372, "grad_norm": 0.32438097744484234, "learning_rate": 7.603183084515668e-06, "loss": 0.0302, "step": 3050 }, { "epoch": 1.179814385150812, "grad_norm": 0.41285191395343224, "learning_rate": 7.601262007632513e-06, "loss": 0.037, "step": 3051 }, { "epoch": 1.180201082753287, "grad_norm": 0.254650423470574, "learning_rate": 7.59934040409686e-06, "loss": 0.0255, "step": 3052 }, { "epoch": 1.1805877803557618, "grad_norm": 0.3104539110550006, "learning_rate": 7.5974182742977595e-06, "loss": 0.0302, "step": 3053 }, { "epoch": 1.1809744779582367, "grad_norm": 0.39742026353807475, "learning_rate": 7.595495618624367e-06, "loss": 0.0553, "step": 3054 }, { "epoch": 1.1813611755607116, "grad_norm": 0.32704499564513767, "learning_rate": 7.593572437465944e-06, "loss": 0.024, "step": 3055 }, { "epoch": 1.1817478731631863, "grad_norm": 0.37769312743762495, "learning_rate": 7.591648731211856e-06, "loss": 0.0346, "step": 3056 }, { "epoch": 1.1821345707656612, "grad_norm": 0.3445600842276799, "learning_rate": 7.5897245002515805e-06, "loss": 0.0275, "step": 3057 }, { "epoch": 1.182521268368136, "grad_norm": 0.32727150851551545, "learning_rate": 7.587799744974697e-06, "loss": 0.0333, "step": 3058 }, { "epoch": 1.182907965970611, "grad_norm": 0.29040398208495644, "learning_rate": 7.585874465770893e-06, "loss": 0.0325, "step": 3059 }, { "epoch": 1.1832946635730859, "grad_norm": 0.26407569064092357, "learning_rate": 7.583948663029957e-06, "loss": 0.0245, "step": 3060 }, { "epoch": 1.1836813611755608, "grad_norm": 0.367917997099939, "learning_rate": 7.582022337141795e-06, "loss": 0.0398, "step": 3061 }, { "epoch": 1.1840680587780357, "grad_norm": 0.3526505682258965, "learning_rate": 7.580095488496405e-06, "loss": 0.0458, "step": 3062 }, { "epoch": 1.1844547563805103, "grad_norm": 0.27735252446815734, "learning_rate": 7.578168117483901e-06, "loss": 0.0283, "step": 3063 }, { "epoch": 1.1848414539829852, "grad_norm": 0.4027308916565138, "learning_rate": 7.576240224494498e-06, "loss": 0.036, "step": 3064 }, { "epoch": 1.1852281515854601, "grad_norm": 0.2367122759985592, "learning_rate": 7.574311809918519e-06, "loss": 0.0276, "step": 3065 }, { "epoch": 1.185614849187935, "grad_norm": 0.32715040195799244, "learning_rate": 7.572382874146391e-06, "loss": 0.0258, "step": 3066 }, { "epoch": 1.18600154679041, "grad_norm": 0.3539463330398732, "learning_rate": 7.570453417568648e-06, "loss": 0.0387, "step": 3067 }, { "epoch": 1.1863882443928848, "grad_norm": 0.3359265736207355, "learning_rate": 7.568523440575926e-06, "loss": 0.0302, "step": 3068 }, { "epoch": 1.1867749419953597, "grad_norm": 0.3242896003115111, "learning_rate": 7.566592943558971e-06, "loss": 0.0275, "step": 3069 }, { "epoch": 1.1871616395978344, "grad_norm": 0.35847291803952613, "learning_rate": 7.564661926908632e-06, "loss": 0.0409, "step": 3070 }, { "epoch": 1.1875483372003093, "grad_norm": 0.3754036019112419, "learning_rate": 7.562730391015861e-06, "loss": 0.0394, "step": 3071 }, { "epoch": 1.1879350348027842, "grad_norm": 0.8536002963679367, "learning_rate": 7.560798336271722e-06, "loss": 0.0527, "step": 3072 }, { "epoch": 1.188321732405259, "grad_norm": 0.34765635764673086, "learning_rate": 7.558865763067375e-06, "loss": 0.0346, "step": 3073 }, { "epoch": 1.188708430007734, "grad_norm": 0.298472644440635, "learning_rate": 7.556932671794093e-06, "loss": 0.0284, "step": 3074 }, { "epoch": 1.1890951276102089, "grad_norm": 0.3562160346582943, "learning_rate": 7.554999062843248e-06, "loss": 0.0313, "step": 3075 }, { "epoch": 1.1894818252126838, "grad_norm": 0.33416103305684386, "learning_rate": 7.55306493660632e-06, "loss": 0.0285, "step": 3076 }, { "epoch": 1.1898685228151584, "grad_norm": 0.24752826650796259, "learning_rate": 7.551130293474893e-06, "loss": 0.0264, "step": 3077 }, { "epoch": 1.1902552204176333, "grad_norm": 0.39099181597631255, "learning_rate": 7.549195133840657e-06, "loss": 0.0383, "step": 3078 }, { "epoch": 1.1906419180201082, "grad_norm": 0.8665906400665421, "learning_rate": 7.547259458095402e-06, "loss": 0.0397, "step": 3079 }, { "epoch": 1.191028615622583, "grad_norm": 0.550891415325239, "learning_rate": 7.5453232666310284e-06, "loss": 0.0521, "step": 3080 }, { "epoch": 1.191415313225058, "grad_norm": 0.36970963064741846, "learning_rate": 7.543386559839538e-06, "loss": 0.0374, "step": 3081 }, { "epoch": 1.191802010827533, "grad_norm": 0.22843250928307643, "learning_rate": 7.541449338113035e-06, "loss": 0.023, "step": 3082 }, { "epoch": 1.1921887084300078, "grad_norm": 0.4464668710042819, "learning_rate": 7.539511601843732e-06, "loss": 0.0446, "step": 3083 }, { "epoch": 1.1925754060324827, "grad_norm": 0.37925376150701884, "learning_rate": 7.5375733514239434e-06, "loss": 0.0325, "step": 3084 }, { "epoch": 1.1929621036349576, "grad_norm": 0.39986500149557136, "learning_rate": 7.535634587246088e-06, "loss": 0.0385, "step": 3085 }, { "epoch": 1.1933488012374323, "grad_norm": 0.3787398978803447, "learning_rate": 7.533695309702689e-06, "loss": 0.0448, "step": 3086 }, { "epoch": 1.1937354988399071, "grad_norm": 0.3780007111469678, "learning_rate": 7.531755519186374e-06, "loss": 0.0525, "step": 3087 }, { "epoch": 1.194122196442382, "grad_norm": 0.25949197032863947, "learning_rate": 7.529815216089871e-06, "loss": 0.0191, "step": 3088 }, { "epoch": 1.194508894044857, "grad_norm": 0.3219085758925435, "learning_rate": 7.527874400806017e-06, "loss": 0.0257, "step": 3089 }, { "epoch": 1.1948955916473318, "grad_norm": 0.4764507168611572, "learning_rate": 7.525933073727749e-06, "loss": 0.0444, "step": 3090 }, { "epoch": 1.1952822892498067, "grad_norm": 0.3189617122565271, "learning_rate": 7.523991235248111e-06, "loss": 0.0261, "step": 3091 }, { "epoch": 1.1956689868522816, "grad_norm": 0.6295538704642745, "learning_rate": 7.522048885760245e-06, "loss": 0.0482, "step": 3092 }, { "epoch": 1.1960556844547563, "grad_norm": 0.26741325563276624, "learning_rate": 7.520106025657402e-06, "loss": 0.0223, "step": 3093 }, { "epoch": 1.1964423820572312, "grad_norm": 0.24749008001668285, "learning_rate": 7.518162655332935e-06, "loss": 0.031, "step": 3094 }, { "epoch": 1.196829079659706, "grad_norm": 0.2676830715513694, "learning_rate": 7.516218775180295e-06, "loss": 0.0244, "step": 3095 }, { "epoch": 1.197215777262181, "grad_norm": 0.29755941325863644, "learning_rate": 7.514274385593047e-06, "loss": 0.0376, "step": 3096 }, { "epoch": 1.1976024748646559, "grad_norm": 0.31911684687146286, "learning_rate": 7.512329486964848e-06, "loss": 0.038, "step": 3097 }, { "epoch": 1.1979891724671308, "grad_norm": 0.2967236075713835, "learning_rate": 7.510384079689466e-06, "loss": 0.0303, "step": 3098 }, { "epoch": 1.1983758700696057, "grad_norm": 0.37106721375652757, "learning_rate": 7.508438164160765e-06, "loss": 0.0249, "step": 3099 }, { "epoch": 1.1987625676720803, "grad_norm": 0.2752253441250679, "learning_rate": 7.506491740772721e-06, "loss": 0.0238, "step": 3100 }, { "epoch": 1.1991492652745552, "grad_norm": 0.31825196144824475, "learning_rate": 7.504544809919405e-06, "loss": 0.0279, "step": 3101 }, { "epoch": 1.1995359628770301, "grad_norm": 0.3739604259806867, "learning_rate": 7.502597371994993e-06, "loss": 0.0527, "step": 3102 }, { "epoch": 1.199922660479505, "grad_norm": 0.37701192803046235, "learning_rate": 7.500649427393763e-06, "loss": 0.0366, "step": 3103 }, { "epoch": 1.20030935808198, "grad_norm": 0.32704706169555814, "learning_rate": 7.498700976510099e-06, "loss": 0.0281, "step": 3104 }, { "epoch": 1.2006960556844548, "grad_norm": 0.8880360706292847, "learning_rate": 7.4967520197384845e-06, "loss": 0.0257, "step": 3105 }, { "epoch": 1.2010827532869297, "grad_norm": 0.3818216136615942, "learning_rate": 7.4948025574735055e-06, "loss": 0.0342, "step": 3106 }, { "epoch": 1.2014694508894044, "grad_norm": 0.30424826648426734, "learning_rate": 7.49285259010985e-06, "loss": 0.0415, "step": 3107 }, { "epoch": 1.2018561484918793, "grad_norm": 0.35197070504969086, "learning_rate": 7.490902118042311e-06, "loss": 0.0309, "step": 3108 }, { "epoch": 1.2022428460943542, "grad_norm": 0.36523502547450737, "learning_rate": 7.488951141665781e-06, "loss": 0.0343, "step": 3109 }, { "epoch": 1.202629543696829, "grad_norm": 0.4450144118489087, "learning_rate": 7.486999661375254e-06, "loss": 0.0322, "step": 3110 }, { "epoch": 1.203016241299304, "grad_norm": 0.47019822156684477, "learning_rate": 7.485047677565831e-06, "loss": 0.0368, "step": 3111 }, { "epoch": 1.2034029389017789, "grad_norm": 0.47646878824410344, "learning_rate": 7.4830951906327075e-06, "loss": 0.0392, "step": 3112 }, { "epoch": 1.2037896365042537, "grad_norm": 0.4418271067390028, "learning_rate": 7.481142200971188e-06, "loss": 0.0387, "step": 3113 }, { "epoch": 1.2041763341067284, "grad_norm": 0.313410616700052, "learning_rate": 7.479188708976673e-06, "loss": 0.039, "step": 3114 }, { "epoch": 1.2045630317092033, "grad_norm": 0.312431316233879, "learning_rate": 7.4772347150446674e-06, "loss": 0.0295, "step": 3115 }, { "epoch": 1.2049497293116782, "grad_norm": 0.4751496760660762, "learning_rate": 7.47528021957078e-06, "loss": 0.0418, "step": 3116 }, { "epoch": 1.205336426914153, "grad_norm": 0.36238870476010454, "learning_rate": 7.473325222950716e-06, "loss": 0.0322, "step": 3117 }, { "epoch": 1.205723124516628, "grad_norm": 0.7774888317414232, "learning_rate": 7.471369725580286e-06, "loss": 0.0454, "step": 3118 }, { "epoch": 1.206109822119103, "grad_norm": 0.4715948019788958, "learning_rate": 7.4694137278554e-06, "loss": 0.0406, "step": 3119 }, { "epoch": 1.2064965197215778, "grad_norm": 0.4461263759876195, "learning_rate": 7.4674572301720715e-06, "loss": 0.0368, "step": 3120 }, { "epoch": 1.2068832173240527, "grad_norm": 0.28080752825513244, "learning_rate": 7.4655002329264126e-06, "loss": 0.0269, "step": 3121 }, { "epoch": 1.2072699149265276, "grad_norm": 0.4540718337868732, "learning_rate": 7.4635427365146375e-06, "loss": 0.036, "step": 3122 }, { "epoch": 1.2076566125290022, "grad_norm": 0.30765423525469254, "learning_rate": 7.461584741333063e-06, "loss": 0.0279, "step": 3123 }, { "epoch": 1.2080433101314771, "grad_norm": 0.44674946911093605, "learning_rate": 7.459626247778104e-06, "loss": 0.0322, "step": 3124 }, { "epoch": 1.208430007733952, "grad_norm": 0.3297011640383503, "learning_rate": 7.4576672562462795e-06, "loss": 0.036, "step": 3125 }, { "epoch": 1.208816705336427, "grad_norm": 0.2750518821840527, "learning_rate": 7.455707767134205e-06, "loss": 0.0261, "step": 3126 }, { "epoch": 1.2092034029389018, "grad_norm": 0.27905708134016055, "learning_rate": 7.453747780838603e-06, "loss": 0.0251, "step": 3127 }, { "epoch": 1.2095901005413767, "grad_norm": 0.3334427921892724, "learning_rate": 7.45178729775629e-06, "loss": 0.0271, "step": 3128 }, { "epoch": 1.2099767981438516, "grad_norm": 0.5660196405551263, "learning_rate": 7.4498263182841865e-06, "loss": 0.0537, "step": 3129 }, { "epoch": 1.2103634957463263, "grad_norm": 0.3062489270520299, "learning_rate": 7.447864842819316e-06, "loss": 0.0308, "step": 3130 }, { "epoch": 1.2107501933488012, "grad_norm": 0.40256130955858277, "learning_rate": 7.445902871758797e-06, "loss": 0.0334, "step": 3131 }, { "epoch": 1.211136890951276, "grad_norm": 0.423719694192633, "learning_rate": 7.443940405499851e-06, "loss": 0.0376, "step": 3132 }, { "epoch": 1.211523588553751, "grad_norm": 0.4386944808247939, "learning_rate": 7.4419774444398e-06, "loss": 0.0302, "step": 3133 }, { "epoch": 1.2119102861562259, "grad_norm": 1.4865673432637758, "learning_rate": 7.440013988976067e-06, "loss": 0.0418, "step": 3134 }, { "epoch": 1.2122969837587008, "grad_norm": 0.4947149457757649, "learning_rate": 7.438050039506171e-06, "loss": 0.0416, "step": 3135 }, { "epoch": 1.2126836813611757, "grad_norm": 0.33731670078862025, "learning_rate": 7.436085596427738e-06, "loss": 0.0367, "step": 3136 }, { "epoch": 1.2130703789636503, "grad_norm": 0.3568925254184513, "learning_rate": 7.434120660138486e-06, "loss": 0.0354, "step": 3137 }, { "epoch": 1.2134570765661252, "grad_norm": 0.34119648475865927, "learning_rate": 7.43215523103624e-06, "loss": 0.0312, "step": 3138 }, { "epoch": 1.2138437741686001, "grad_norm": 0.42475483233963923, "learning_rate": 7.430189309518919e-06, "loss": 0.0303, "step": 3139 }, { "epoch": 1.214230471771075, "grad_norm": 0.48734313134641893, "learning_rate": 7.428222895984547e-06, "loss": 0.0311, "step": 3140 }, { "epoch": 1.21461716937355, "grad_norm": 0.46294721125032184, "learning_rate": 7.426255990831242e-06, "loss": 0.0366, "step": 3141 }, { "epoch": 1.2150038669760248, "grad_norm": 0.3730577959496593, "learning_rate": 7.424288594457225e-06, "loss": 0.0313, "step": 3142 }, { "epoch": 1.2153905645784997, "grad_norm": 0.3759929486300316, "learning_rate": 7.422320707260816e-06, "loss": 0.0354, "step": 3143 }, { "epoch": 1.2157772621809744, "grad_norm": 0.2642724246768793, "learning_rate": 7.420352329640434e-06, "loss": 0.0325, "step": 3144 }, { "epoch": 1.2161639597834493, "grad_norm": 0.3224492175008997, "learning_rate": 7.4183834619946005e-06, "loss": 0.0265, "step": 3145 }, { "epoch": 1.2165506573859242, "grad_norm": 0.3359189158353851, "learning_rate": 7.416414104721928e-06, "loss": 0.0282, "step": 3146 }, { "epoch": 1.216937354988399, "grad_norm": 0.39840815775549265, "learning_rate": 7.414444258221138e-06, "loss": 0.0468, "step": 3147 }, { "epoch": 1.217324052590874, "grad_norm": 0.469911554988596, "learning_rate": 7.412473922891044e-06, "loss": 0.044, "step": 3148 }, { "epoch": 1.2177107501933488, "grad_norm": 0.3544550516924325, "learning_rate": 7.410503099130562e-06, "loss": 0.0373, "step": 3149 }, { "epoch": 1.2180974477958237, "grad_norm": 0.31621191301088, "learning_rate": 7.408531787338702e-06, "loss": 0.0295, "step": 3150 }, { "epoch": 1.2184841453982984, "grad_norm": 0.305234172960162, "learning_rate": 7.4065599879145815e-06, "loss": 0.0355, "step": 3151 }, { "epoch": 1.2188708430007733, "grad_norm": 0.38519400421443295, "learning_rate": 7.404587701257409e-06, "loss": 0.0398, "step": 3152 }, { "epoch": 1.2192575406032482, "grad_norm": 0.6317289487856617, "learning_rate": 7.402614927766496e-06, "loss": 0.0461, "step": 3153 }, { "epoch": 1.219644238205723, "grad_norm": 0.3707744369622754, "learning_rate": 7.40064166784125e-06, "loss": 0.0358, "step": 3154 }, { "epoch": 1.220030935808198, "grad_norm": 0.31857775626197116, "learning_rate": 7.398667921881178e-06, "loss": 0.0221, "step": 3155 }, { "epoch": 1.2204176334106729, "grad_norm": 0.2500873093185792, "learning_rate": 7.396693690285884e-06, "loss": 0.0269, "step": 3156 }, { "epoch": 1.2208043310131478, "grad_norm": 0.2435143965042991, "learning_rate": 7.394718973455075e-06, "loss": 0.0214, "step": 3157 }, { "epoch": 1.2211910286156227, "grad_norm": 0.31446950018729064, "learning_rate": 7.392743771788549e-06, "loss": 0.0357, "step": 3158 }, { "epoch": 1.2215777262180976, "grad_norm": 0.3361495855032909, "learning_rate": 7.39076808568621e-06, "loss": 0.0265, "step": 3159 }, { "epoch": 1.2219644238205722, "grad_norm": 0.31213081693402506, "learning_rate": 7.388791915548054e-06, "loss": 0.0318, "step": 3160 }, { "epoch": 1.2223511214230471, "grad_norm": 0.5207542612437647, "learning_rate": 7.386815261774176e-06, "loss": 0.0349, "step": 3161 }, { "epoch": 1.222737819025522, "grad_norm": 0.2853482019507058, "learning_rate": 7.384838124764772e-06, "loss": 0.0309, "step": 3162 }, { "epoch": 1.223124516627997, "grad_norm": 0.3431551516203049, "learning_rate": 7.382860504920133e-06, "loss": 0.0354, "step": 3163 }, { "epoch": 1.2235112142304718, "grad_norm": 0.29381186274006177, "learning_rate": 7.380882402640651e-06, "loss": 0.0249, "step": 3164 }, { "epoch": 1.2238979118329467, "grad_norm": 0.392626422595803, "learning_rate": 7.378903818326809e-06, "loss": 0.0343, "step": 3165 }, { "epoch": 1.2242846094354216, "grad_norm": 0.26110725156383147, "learning_rate": 7.376924752379197e-06, "loss": 0.0274, "step": 3166 }, { "epoch": 1.2246713070378963, "grad_norm": 0.3727079111697775, "learning_rate": 7.3749452051984936e-06, "loss": 0.0288, "step": 3167 }, { "epoch": 1.2250580046403712, "grad_norm": 0.4233979557623197, "learning_rate": 7.37296517718548e-06, "loss": 0.0345, "step": 3168 }, { "epoch": 1.225444702242846, "grad_norm": 0.3638393629560069, "learning_rate": 7.370984668741031e-06, "loss": 0.0345, "step": 3169 }, { "epoch": 1.225831399845321, "grad_norm": 0.31962634896175335, "learning_rate": 7.369003680266127e-06, "loss": 0.0275, "step": 3170 }, { "epoch": 1.2262180974477959, "grad_norm": 0.2966646625150684, "learning_rate": 7.367022212161833e-06, "loss": 0.0264, "step": 3171 }, { "epoch": 1.2266047950502708, "grad_norm": 0.302554437033713, "learning_rate": 7.3650402648293225e-06, "loss": 0.0363, "step": 3172 }, { "epoch": 1.2269914926527457, "grad_norm": 0.33380241991999693, "learning_rate": 7.3630578386698595e-06, "loss": 0.0389, "step": 3173 }, { "epoch": 1.2273781902552203, "grad_norm": 0.31997014157546166, "learning_rate": 7.361074934084806e-06, "loss": 0.042, "step": 3174 }, { "epoch": 1.2277648878576952, "grad_norm": 0.42021155138649313, "learning_rate": 7.359091551475624e-06, "loss": 0.0503, "step": 3175 }, { "epoch": 1.2281515854601701, "grad_norm": 0.2836580703737619, "learning_rate": 7.357107691243868e-06, "loss": 0.0264, "step": 3176 }, { "epoch": 1.228538283062645, "grad_norm": 0.3310692474167899, "learning_rate": 7.355123353791192e-06, "loss": 0.0329, "step": 3177 }, { "epoch": 1.22892498066512, "grad_norm": 0.34350562472766527, "learning_rate": 7.353138539519342e-06, "loss": 0.0299, "step": 3178 }, { "epoch": 1.2293116782675948, "grad_norm": 0.47462089479792385, "learning_rate": 7.351153248830172e-06, "loss": 0.0402, "step": 3179 }, { "epoch": 1.2296983758700697, "grad_norm": 0.3050567521898424, "learning_rate": 7.349167482125618e-06, "loss": 0.0249, "step": 3180 }, { "epoch": 1.2300850734725444, "grad_norm": 0.3435118430703107, "learning_rate": 7.347181239807721e-06, "loss": 0.0355, "step": 3181 }, { "epoch": 1.2304717710750193, "grad_norm": 0.2947757552410232, "learning_rate": 7.345194522278617e-06, "loss": 0.0285, "step": 3182 }, { "epoch": 1.2308584686774942, "grad_norm": 0.27806066589669914, "learning_rate": 7.343207329940538e-06, "loss": 0.0229, "step": 3183 }, { "epoch": 1.231245166279969, "grad_norm": 0.3079423579796199, "learning_rate": 7.34121966319581e-06, "loss": 0.0279, "step": 3184 }, { "epoch": 1.231631863882444, "grad_norm": 0.4204288426755316, "learning_rate": 7.339231522446858e-06, "loss": 0.0407, "step": 3185 }, { "epoch": 1.2320185614849188, "grad_norm": 0.32911319911272446, "learning_rate": 7.3372429080962e-06, "loss": 0.0261, "step": 3186 }, { "epoch": 1.2324052590873937, "grad_norm": 0.3497694573610909, "learning_rate": 7.335253820546457e-06, "loss": 0.0235, "step": 3187 }, { "epoch": 1.2327919566898684, "grad_norm": 0.29162539813630933, "learning_rate": 7.3332642602003336e-06, "loss": 0.0256, "step": 3188 }, { "epoch": 1.2331786542923433, "grad_norm": 0.5980531690423664, "learning_rate": 7.3312742274606405e-06, "loss": 0.0268, "step": 3189 }, { "epoch": 1.2335653518948182, "grad_norm": 0.3886949607778157, "learning_rate": 7.329283722730279e-06, "loss": 0.0417, "step": 3190 }, { "epoch": 1.233952049497293, "grad_norm": 0.3402384975871346, "learning_rate": 7.32729274641225e-06, "loss": 0.0411, "step": 3191 }, { "epoch": 1.234338747099768, "grad_norm": 0.39352768406443833, "learning_rate": 7.325301298909644e-06, "loss": 0.0305, "step": 3192 }, { "epoch": 1.2347254447022429, "grad_norm": 0.41618410756701946, "learning_rate": 7.323309380625653e-06, "loss": 0.035, "step": 3193 }, { "epoch": 1.2351121423047178, "grad_norm": 0.2387261040017833, "learning_rate": 7.321316991963559e-06, "loss": 0.0242, "step": 3194 }, { "epoch": 1.2354988399071927, "grad_norm": 0.3365993718803836, "learning_rate": 7.319324133326744e-06, "loss": 0.0407, "step": 3195 }, { "epoch": 1.2358855375096676, "grad_norm": 0.2940981039683737, "learning_rate": 7.317330805118682e-06, "loss": 0.032, "step": 3196 }, { "epoch": 1.2362722351121422, "grad_norm": 0.2956763401235374, "learning_rate": 7.315337007742943e-06, "loss": 0.0238, "step": 3197 }, { "epoch": 1.2366589327146171, "grad_norm": 0.7704473662641156, "learning_rate": 7.313342741603192e-06, "loss": 0.0518, "step": 3198 }, { "epoch": 1.237045630317092, "grad_norm": 0.2935623589587392, "learning_rate": 7.311348007103188e-06, "loss": 0.0341, "step": 3199 }, { "epoch": 1.237432327919567, "grad_norm": 0.35641481724401397, "learning_rate": 7.30935280464679e-06, "loss": 0.0342, "step": 3200 }, { "epoch": 1.2378190255220418, "grad_norm": 0.3189186149899984, "learning_rate": 7.30735713463794e-06, "loss": 0.0284, "step": 3201 }, { "epoch": 1.2382057231245167, "grad_norm": 0.3477203361237451, "learning_rate": 7.305360997480688e-06, "loss": 0.0399, "step": 3202 }, { "epoch": 1.2385924207269916, "grad_norm": 0.31555230553976504, "learning_rate": 7.303364393579171e-06, "loss": 0.0259, "step": 3203 }, { "epoch": 1.2389791183294663, "grad_norm": 0.3190922179779031, "learning_rate": 7.301367323337622e-06, "loss": 0.0373, "step": 3204 }, { "epoch": 1.2393658159319412, "grad_norm": 0.31529245498508335, "learning_rate": 7.299369787160367e-06, "loss": 0.0316, "step": 3205 }, { "epoch": 1.239752513534416, "grad_norm": 0.4524889134363989, "learning_rate": 7.297371785451831e-06, "loss": 0.0305, "step": 3206 }, { "epoch": 1.240139211136891, "grad_norm": 0.35665194262232514, "learning_rate": 7.295373318616529e-06, "loss": 0.0284, "step": 3207 }, { "epoch": 1.2405259087393659, "grad_norm": 0.3537368270281943, "learning_rate": 7.29337438705907e-06, "loss": 0.0448, "step": 3208 }, { "epoch": 1.2409126063418408, "grad_norm": 0.245187055647261, "learning_rate": 7.29137499118416e-06, "loss": 0.0249, "step": 3209 }, { "epoch": 1.2412993039443156, "grad_norm": 0.24557051493536922, "learning_rate": 7.289375131396597e-06, "loss": 0.0297, "step": 3210 }, { "epoch": 1.2416860015467903, "grad_norm": 0.4347401969586231, "learning_rate": 7.287374808101274e-06, "loss": 0.04, "step": 3211 }, { "epoch": 1.2420726991492652, "grad_norm": 0.2772660997605896, "learning_rate": 7.285374021703176e-06, "loss": 0.0279, "step": 3212 }, { "epoch": 1.24245939675174, "grad_norm": 0.35193132531948884, "learning_rate": 7.283372772607385e-06, "loss": 0.0351, "step": 3213 }, { "epoch": 1.242846094354215, "grad_norm": 0.5784332657542243, "learning_rate": 7.281371061219072e-06, "loss": 0.0582, "step": 3214 }, { "epoch": 1.24323279195669, "grad_norm": 0.27047000722077935, "learning_rate": 7.279368887943505e-06, "loss": 0.0254, "step": 3215 }, { "epoch": 1.2436194895591648, "grad_norm": 0.3226303373663441, "learning_rate": 7.277366253186044e-06, "loss": 0.042, "step": 3216 }, { "epoch": 1.2440061871616397, "grad_norm": 0.40988712953343986, "learning_rate": 7.275363157352148e-06, "loss": 0.0406, "step": 3217 }, { "epoch": 1.2443928847641144, "grad_norm": 0.33538884962257476, "learning_rate": 7.273359600847359e-06, "loss": 0.04, "step": 3218 }, { "epoch": 1.2447795823665893, "grad_norm": 0.2811130197667965, "learning_rate": 7.27135558407732e-06, "loss": 0.0328, "step": 3219 }, { "epoch": 1.2451662799690641, "grad_norm": 0.284061733622866, "learning_rate": 7.269351107447764e-06, "loss": 0.0248, "step": 3220 }, { "epoch": 1.245552977571539, "grad_norm": 0.2786335550692187, "learning_rate": 7.2673461713645195e-06, "loss": 0.0248, "step": 3221 }, { "epoch": 1.245939675174014, "grad_norm": 0.2839087368482974, "learning_rate": 7.265340776233506e-06, "loss": 0.031, "step": 3222 }, { "epoch": 1.2463263727764888, "grad_norm": 0.2751466001520827, "learning_rate": 7.263334922460737e-06, "loss": 0.0287, "step": 3223 }, { "epoch": 1.2467130703789637, "grad_norm": 0.4355098327918121, "learning_rate": 7.261328610452317e-06, "loss": 0.0357, "step": 3224 }, { "epoch": 1.2470997679814384, "grad_norm": 0.3250271445592564, "learning_rate": 7.259321840614446e-06, "loss": 0.0352, "step": 3225 }, { "epoch": 1.2474864655839133, "grad_norm": 0.6305600547631722, "learning_rate": 7.2573146133534155e-06, "loss": 0.0505, "step": 3226 }, { "epoch": 1.2478731631863882, "grad_norm": 0.5249734334726444, "learning_rate": 7.255306929075609e-06, "loss": 0.0584, "step": 3227 }, { "epoch": 1.248259860788863, "grad_norm": 0.4592748857922137, "learning_rate": 7.253298788187504e-06, "loss": 0.0385, "step": 3228 }, { "epoch": 1.248646558391338, "grad_norm": 0.530756857975358, "learning_rate": 7.2512901910956665e-06, "loss": 0.0487, "step": 3229 }, { "epoch": 1.2490332559938129, "grad_norm": 0.23954182205313992, "learning_rate": 7.249281138206762e-06, "loss": 0.0274, "step": 3230 }, { "epoch": 1.2494199535962878, "grad_norm": 0.24452154009383317, "learning_rate": 7.2472716299275396e-06, "loss": 0.0296, "step": 3231 }, { "epoch": 1.2498066511987627, "grad_norm": 0.2925078417529508, "learning_rate": 7.245261666664849e-06, "loss": 0.0327, "step": 3232 }, { "epoch": 1.2501933488012376, "grad_norm": 0.2649040167193265, "learning_rate": 7.243251248825627e-06, "loss": 0.0364, "step": 3233 }, { "epoch": 1.2505800464037122, "grad_norm": 0.3105837861644695, "learning_rate": 7.241240376816904e-06, "loss": 0.0313, "step": 3234 }, { "epoch": 1.2509667440061871, "grad_norm": 0.3144419215038623, "learning_rate": 7.239229051045799e-06, "loss": 0.0212, "step": 3235 }, { "epoch": 1.251353441608662, "grad_norm": 0.30881192990276374, "learning_rate": 7.237217271919529e-06, "loss": 0.028, "step": 3236 }, { "epoch": 1.251740139211137, "grad_norm": 0.4449495847501961, "learning_rate": 7.235205039845397e-06, "loss": 0.0335, "step": 3237 }, { "epoch": 1.2521268368136118, "grad_norm": 0.6489147105893278, "learning_rate": 7.233192355230804e-06, "loss": 0.0595, "step": 3238 }, { "epoch": 1.2525135344160865, "grad_norm": 0.4612758144616136, "learning_rate": 7.231179218483235e-06, "loss": 0.0314, "step": 3239 }, { "epoch": 1.2529002320185616, "grad_norm": 0.2902812930791192, "learning_rate": 7.229165630010273e-06, "loss": 0.0268, "step": 3240 }, { "epoch": 1.2532869296210363, "grad_norm": 0.32183690735556236, "learning_rate": 7.227151590219589e-06, "loss": 0.0242, "step": 3241 }, { "epoch": 1.2536736272235112, "grad_norm": 0.3494295661835745, "learning_rate": 7.225137099518945e-06, "loss": 0.0282, "step": 3242 }, { "epoch": 1.254060324825986, "grad_norm": 0.24678644934547123, "learning_rate": 7.223122158316198e-06, "loss": 0.026, "step": 3243 }, { "epoch": 1.254447022428461, "grad_norm": 0.36158503677348364, "learning_rate": 7.221106767019291e-06, "loss": 0.0313, "step": 3244 }, { "epoch": 1.2548337200309359, "grad_norm": 0.5099665694603799, "learning_rate": 7.219090926036264e-06, "loss": 0.0348, "step": 3245 }, { "epoch": 1.2552204176334107, "grad_norm": 0.4735560821950894, "learning_rate": 7.217074635775242e-06, "loss": 0.0369, "step": 3246 }, { "epoch": 1.2556071152358856, "grad_norm": 0.3724955421082857, "learning_rate": 7.2150578966444476e-06, "loss": 0.032, "step": 3247 }, { "epoch": 1.2559938128383603, "grad_norm": 0.5937752807923737, "learning_rate": 7.213040709052186e-06, "loss": 0.0326, "step": 3248 }, { "epoch": 1.2563805104408352, "grad_norm": 0.3868783904010236, "learning_rate": 7.211023073406861e-06, "loss": 0.028, "step": 3249 }, { "epoch": 1.25676720804331, "grad_norm": 0.4347808135761637, "learning_rate": 7.209004990116962e-06, "loss": 0.032, "step": 3250 }, { "epoch": 1.257153905645785, "grad_norm": 0.4444230974871714, "learning_rate": 7.206986459591073e-06, "loss": 0.0313, "step": 3251 }, { "epoch": 1.25754060324826, "grad_norm": 0.2967021237161494, "learning_rate": 7.204967482237864e-06, "loss": 0.0238, "step": 3252 }, { "epoch": 1.2579273008507348, "grad_norm": 0.45239287468511286, "learning_rate": 7.202948058466102e-06, "loss": 0.0358, "step": 3253 }, { "epoch": 1.2583139984532097, "grad_norm": 0.39541997077045427, "learning_rate": 7.200928188684634e-06, "loss": 0.0299, "step": 3254 }, { "epoch": 1.2587006960556844, "grad_norm": 0.4337189121685805, "learning_rate": 7.198907873302408e-06, "loss": 0.0377, "step": 3255 }, { "epoch": 1.2590873936581592, "grad_norm": 0.3364868709647789, "learning_rate": 7.196887112728457e-06, "loss": 0.031, "step": 3256 }, { "epoch": 1.2594740912606341, "grad_norm": 0.3430038651586682, "learning_rate": 7.1948659073719055e-06, "loss": 0.0221, "step": 3257 }, { "epoch": 1.259860788863109, "grad_norm": 0.22973767941783169, "learning_rate": 7.192844257641965e-06, "loss": 0.0296, "step": 3258 }, { "epoch": 1.260247486465584, "grad_norm": 0.2988394585512486, "learning_rate": 7.190822163947943e-06, "loss": 0.0279, "step": 3259 }, { "epoch": 1.2606341840680588, "grad_norm": 0.29511023883427534, "learning_rate": 7.188799626699232e-06, "loss": 0.0261, "step": 3260 }, { "epoch": 1.2610208816705337, "grad_norm": 0.36873779838419213, "learning_rate": 7.186776646305314e-06, "loss": 0.0346, "step": 3261 }, { "epoch": 1.2614075792730084, "grad_norm": 0.33595175707375363, "learning_rate": 7.184753223175764e-06, "loss": 0.0453, "step": 3262 }, { "epoch": 1.2617942768754833, "grad_norm": 0.4156891626798153, "learning_rate": 7.182729357720245e-06, "loss": 0.0264, "step": 3263 }, { "epoch": 1.2621809744779582, "grad_norm": 0.32762209050838054, "learning_rate": 7.18070505034851e-06, "loss": 0.0343, "step": 3264 }, { "epoch": 1.262567672080433, "grad_norm": 0.3198655920541631, "learning_rate": 7.1786803014704e-06, "loss": 0.0359, "step": 3265 }, { "epoch": 1.262954369682908, "grad_norm": 0.254699695879521, "learning_rate": 7.176655111495846e-06, "loss": 0.0251, "step": 3266 }, { "epoch": 1.2633410672853829, "grad_norm": 0.311741180944192, "learning_rate": 7.174629480834871e-06, "loss": 0.0338, "step": 3267 }, { "epoch": 1.2637277648878578, "grad_norm": 0.3509006606628104, "learning_rate": 7.172603409897583e-06, "loss": 0.0418, "step": 3268 }, { "epoch": 1.2641144624903324, "grad_norm": 0.2637333868655635, "learning_rate": 7.1705768990941805e-06, "loss": 0.0245, "step": 3269 }, { "epoch": 1.2645011600928076, "grad_norm": 0.3266448275051571, "learning_rate": 7.168549948834953e-06, "loss": 0.0345, "step": 3270 }, { "epoch": 1.2648878576952822, "grad_norm": 0.27689424601842394, "learning_rate": 7.1665225595302775e-06, "loss": 0.0244, "step": 3271 }, { "epoch": 1.2652745552977571, "grad_norm": 0.24985190391196366, "learning_rate": 7.164494731590621e-06, "loss": 0.0215, "step": 3272 }, { "epoch": 1.265661252900232, "grad_norm": 0.275174955501957, "learning_rate": 7.162466465426536e-06, "loss": 0.027, "step": 3273 }, { "epoch": 1.266047950502707, "grad_norm": 0.3634938845763194, "learning_rate": 7.160437761448667e-06, "loss": 0.0375, "step": 3274 }, { "epoch": 1.2664346481051818, "grad_norm": 0.3244191394451, "learning_rate": 7.158408620067747e-06, "loss": 0.0269, "step": 3275 }, { "epoch": 1.2668213457076565, "grad_norm": 0.5071905126559726, "learning_rate": 7.156379041694595e-06, "loss": 0.0454, "step": 3276 }, { "epoch": 1.2672080433101316, "grad_norm": 0.2673295266633372, "learning_rate": 7.154349026740123e-06, "loss": 0.0265, "step": 3277 }, { "epoch": 1.2675947409126063, "grad_norm": 0.5421159982736518, "learning_rate": 7.152318575615326e-06, "loss": 0.0345, "step": 3278 }, { "epoch": 1.2679814385150812, "grad_norm": 0.28761182871109114, "learning_rate": 7.150287688731291e-06, "loss": 0.0241, "step": 3279 }, { "epoch": 1.268368136117556, "grad_norm": 0.6860377682505557, "learning_rate": 7.148256366499192e-06, "loss": 0.0297, "step": 3280 }, { "epoch": 1.268754833720031, "grad_norm": 0.4510932469496028, "learning_rate": 7.146224609330292e-06, "loss": 0.0376, "step": 3281 }, { "epoch": 1.2691415313225058, "grad_norm": 0.2915386113170865, "learning_rate": 7.144192417635939e-06, "loss": 0.0253, "step": 3282 }, { "epoch": 1.2695282289249807, "grad_norm": 0.2680065448445078, "learning_rate": 7.142159791827574e-06, "loss": 0.0215, "step": 3283 }, { "epoch": 1.2699149265274556, "grad_norm": 0.2986199357173248, "learning_rate": 7.140126732316721e-06, "loss": 0.026, "step": 3284 }, { "epoch": 1.2703016241299303, "grad_norm": 0.33445947552210487, "learning_rate": 7.138093239514996e-06, "loss": 0.0294, "step": 3285 }, { "epoch": 1.2706883217324052, "grad_norm": 0.36433839719946715, "learning_rate": 7.136059313834099e-06, "loss": 0.0338, "step": 3286 }, { "epoch": 1.27107501933488, "grad_norm": 0.2854922708810868, "learning_rate": 7.134024955685823e-06, "loss": 0.0283, "step": 3287 }, { "epoch": 1.271461716937355, "grad_norm": 0.41121102959263245, "learning_rate": 7.131990165482039e-06, "loss": 0.036, "step": 3288 }, { "epoch": 1.2718484145398299, "grad_norm": 0.20975921263647135, "learning_rate": 7.129954943634716e-06, "loss": 0.0254, "step": 3289 }, { "epoch": 1.2722351121423048, "grad_norm": 0.30515372965849713, "learning_rate": 7.127919290555902e-06, "loss": 0.0318, "step": 3290 }, { "epoch": 1.2726218097447797, "grad_norm": 0.3968941763749918, "learning_rate": 7.125883206657741e-06, "loss": 0.0337, "step": 3291 }, { "epoch": 1.2730085073472543, "grad_norm": 0.3165884878060185, "learning_rate": 7.123846692352455e-06, "loss": 0.0273, "step": 3292 }, { "epoch": 1.2733952049497292, "grad_norm": 0.2738197074314633, "learning_rate": 7.12180974805236e-06, "loss": 0.0255, "step": 3293 }, { "epoch": 1.2737819025522041, "grad_norm": 0.2869326472434286, "learning_rate": 7.1197723741698555e-06, "loss": 0.0319, "step": 3294 }, { "epoch": 1.274168600154679, "grad_norm": 0.25768267068138795, "learning_rate": 7.11773457111743e-06, "loss": 0.0283, "step": 3295 }, { "epoch": 1.274555297757154, "grad_norm": 0.2503055982177114, "learning_rate": 7.115696339307656e-06, "loss": 0.0277, "step": 3296 }, { "epoch": 1.2749419953596288, "grad_norm": 0.28137960466728157, "learning_rate": 7.113657679153195e-06, "loss": 0.0218, "step": 3297 }, { "epoch": 1.2753286929621037, "grad_norm": 0.40811129465113866, "learning_rate": 7.111618591066798e-06, "loss": 0.0323, "step": 3298 }, { "epoch": 1.2757153905645784, "grad_norm": 0.2576861436670399, "learning_rate": 7.109579075461295e-06, "loss": 0.0271, "step": 3299 }, { "epoch": 1.2761020881670533, "grad_norm": 0.24162497962971335, "learning_rate": 7.107539132749612e-06, "loss": 0.0296, "step": 3300 }, { "epoch": 1.2764887857695282, "grad_norm": 0.3448828474310214, "learning_rate": 7.105498763344751e-06, "loss": 0.0311, "step": 3301 }, { "epoch": 1.276875483372003, "grad_norm": 0.29275102564019057, "learning_rate": 7.103457967659809e-06, "loss": 0.0313, "step": 3302 }, { "epoch": 1.277262180974478, "grad_norm": 0.5876838566850873, "learning_rate": 7.101416746107968e-06, "loss": 0.0191, "step": 3303 }, { "epoch": 1.2776488785769529, "grad_norm": 0.286668531008976, "learning_rate": 7.0993750991024916e-06, "loss": 0.0318, "step": 3304 }, { "epoch": 1.2780355761794278, "grad_norm": 0.23333976183864258, "learning_rate": 7.097333027056732e-06, "loss": 0.0232, "step": 3305 }, { "epoch": 1.2784222737819024, "grad_norm": 0.2828370479259549, "learning_rate": 7.095290530384131e-06, "loss": 0.0256, "step": 3306 }, { "epoch": 1.2788089713843775, "grad_norm": 0.24786661180843828, "learning_rate": 7.093247609498212e-06, "loss": 0.0302, "step": 3307 }, { "epoch": 1.2791956689868522, "grad_norm": 0.2512456186780955, "learning_rate": 7.0912042648125835e-06, "loss": 0.0276, "step": 3308 }, { "epoch": 1.2795823665893271, "grad_norm": 0.2464441342577933, "learning_rate": 7.089160496740945e-06, "loss": 0.0287, "step": 3309 }, { "epoch": 1.279969064191802, "grad_norm": 0.3238831895101824, "learning_rate": 7.087116305697076e-06, "loss": 0.0323, "step": 3310 }, { "epoch": 1.280355761794277, "grad_norm": 0.24666400397372665, "learning_rate": 7.085071692094847e-06, "loss": 0.0246, "step": 3311 }, { "epoch": 1.2807424593967518, "grad_norm": 0.27927800713953116, "learning_rate": 7.083026656348207e-06, "loss": 0.026, "step": 3312 }, { "epoch": 1.2811291569992265, "grad_norm": 0.3077525932100984, "learning_rate": 7.0809811988712e-06, "loss": 0.0326, "step": 3313 }, { "epoch": 1.2815158546017016, "grad_norm": 0.43109429735963006, "learning_rate": 7.078935320077944e-06, "loss": 0.0337, "step": 3314 }, { "epoch": 1.2819025522041763, "grad_norm": 0.30295419690001235, "learning_rate": 7.076889020382654e-06, "loss": 0.0279, "step": 3315 }, { "epoch": 1.2822892498066512, "grad_norm": 0.28238019184075247, "learning_rate": 7.074842300199621e-06, "loss": 0.0332, "step": 3316 }, { "epoch": 1.282675947409126, "grad_norm": 0.2032037904778722, "learning_rate": 7.072795159943227e-06, "loss": 0.0227, "step": 3317 }, { "epoch": 1.283062645011601, "grad_norm": 0.33325759707451963, "learning_rate": 7.070747600027935e-06, "loss": 0.0295, "step": 3318 }, { "epoch": 1.2834493426140758, "grad_norm": 0.3523287408959678, "learning_rate": 7.0686996208682965e-06, "loss": 0.0395, "step": 3319 }, { "epoch": 1.2838360402165507, "grad_norm": 0.34705405857325444, "learning_rate": 7.0666512228789465e-06, "loss": 0.0296, "step": 3320 }, { "epoch": 1.2842227378190256, "grad_norm": 0.28901626281867676, "learning_rate": 7.064602406474601e-06, "loss": 0.0328, "step": 3321 }, { "epoch": 1.2846094354215003, "grad_norm": 0.31152131867326427, "learning_rate": 7.0625531720700665e-06, "loss": 0.0343, "step": 3322 }, { "epoch": 1.2849961330239752, "grad_norm": 0.3120103390987622, "learning_rate": 7.060503520080233e-06, "loss": 0.0276, "step": 3323 }, { "epoch": 1.28538283062645, "grad_norm": 0.3460837226471983, "learning_rate": 7.058453450920071e-06, "loss": 0.0272, "step": 3324 }, { "epoch": 1.285769528228925, "grad_norm": 0.2934867262998221, "learning_rate": 7.05640296500464e-06, "loss": 0.035, "step": 3325 }, { "epoch": 1.2861562258313999, "grad_norm": 0.3115358336627465, "learning_rate": 7.054352062749082e-06, "loss": 0.037, "step": 3326 }, { "epoch": 1.2865429234338748, "grad_norm": 0.4024402424684608, "learning_rate": 7.052300744568623e-06, "loss": 0.0308, "step": 3327 }, { "epoch": 1.2869296210363497, "grad_norm": 0.2595824948104933, "learning_rate": 7.050249010878575e-06, "loss": 0.0259, "step": 3328 }, { "epoch": 1.2873163186388243, "grad_norm": 0.3705217328668406, "learning_rate": 7.048196862094329e-06, "loss": 0.0429, "step": 3329 }, { "epoch": 1.2877030162412992, "grad_norm": 0.32251893653611935, "learning_rate": 7.04614429863137e-06, "loss": 0.0235, "step": 3330 }, { "epoch": 1.2880897138437741, "grad_norm": 0.3406272063928076, "learning_rate": 7.044091320905255e-06, "loss": 0.0327, "step": 3331 }, { "epoch": 1.288476411446249, "grad_norm": 0.2834502421222847, "learning_rate": 7.042037929331636e-06, "loss": 0.0279, "step": 3332 }, { "epoch": 1.288863109048724, "grad_norm": 0.2577962533457804, "learning_rate": 7.039984124326238e-06, "loss": 0.0189, "step": 3333 }, { "epoch": 1.2892498066511988, "grad_norm": 0.6354110110113002, "learning_rate": 7.037929906304879e-06, "loss": 0.0432, "step": 3334 }, { "epoch": 1.2896365042536737, "grad_norm": 0.23549156096407992, "learning_rate": 7.035875275683454e-06, "loss": 0.0209, "step": 3335 }, { "epoch": 1.2900232018561484, "grad_norm": 0.4404178138908116, "learning_rate": 7.033820232877946e-06, "loss": 0.0365, "step": 3336 }, { "epoch": 1.2904098994586233, "grad_norm": 0.45007744962398805, "learning_rate": 7.0317647783044195e-06, "loss": 0.0432, "step": 3337 }, { "epoch": 1.2907965970610982, "grad_norm": 0.284942556109195, "learning_rate": 7.029708912379023e-06, "loss": 0.0277, "step": 3338 }, { "epoch": 1.291183294663573, "grad_norm": 0.37080000385241657, "learning_rate": 7.027652635517987e-06, "loss": 0.0414, "step": 3339 }, { "epoch": 1.291569992266048, "grad_norm": 0.3574981981773122, "learning_rate": 7.025595948137627e-06, "loss": 0.0342, "step": 3340 }, { "epoch": 1.2919566898685229, "grad_norm": 0.3314321236156023, "learning_rate": 7.023538850654339e-06, "loss": 0.0292, "step": 3341 }, { "epoch": 1.2923433874709978, "grad_norm": 0.3494837823390977, "learning_rate": 7.021481343484606e-06, "loss": 0.0326, "step": 3342 }, { "epoch": 1.2927300850734724, "grad_norm": 0.25915214841848383, "learning_rate": 7.01942342704499e-06, "loss": 0.025, "step": 3343 }, { "epoch": 1.2931167826759475, "grad_norm": 0.45015501741848185, "learning_rate": 7.0173651017521384e-06, "loss": 0.0244, "step": 3344 }, { "epoch": 1.2935034802784222, "grad_norm": 0.3319650627176774, "learning_rate": 7.015306368022781e-06, "loss": 0.0267, "step": 3345 }, { "epoch": 1.293890177880897, "grad_norm": 0.37834939077951285, "learning_rate": 7.013247226273728e-06, "loss": 0.0275, "step": 3346 }, { "epoch": 1.294276875483372, "grad_norm": 0.2506329139957056, "learning_rate": 7.011187676921878e-06, "loss": 0.0306, "step": 3347 }, { "epoch": 1.294663573085847, "grad_norm": 0.27665052331617923, "learning_rate": 7.009127720384202e-06, "loss": 0.0279, "step": 3348 }, { "epoch": 1.2950502706883218, "grad_norm": 0.2735489548154064, "learning_rate": 7.007067357077765e-06, "loss": 0.0262, "step": 3349 }, { "epoch": 1.2954369682907965, "grad_norm": 0.2760982160509551, "learning_rate": 7.005006587419705e-06, "loss": 0.0265, "step": 3350 }, { "epoch": 1.2958236658932716, "grad_norm": 0.30777223380160146, "learning_rate": 7.002945411827249e-06, "loss": 0.0269, "step": 3351 }, { "epoch": 1.2962103634957463, "grad_norm": 0.3102994145459354, "learning_rate": 7.000883830717702e-06, "loss": 0.0309, "step": 3352 }, { "epoch": 1.2965970610982211, "grad_norm": 0.2382530991882006, "learning_rate": 6.998821844508455e-06, "loss": 0.0344, "step": 3353 }, { "epoch": 1.296983758700696, "grad_norm": 0.4452524952745933, "learning_rate": 6.996759453616974e-06, "loss": 0.0415, "step": 3354 }, { "epoch": 1.297370456303171, "grad_norm": 0.706582757995257, "learning_rate": 6.994696658460815e-06, "loss": 0.0511, "step": 3355 }, { "epoch": 1.2977571539056458, "grad_norm": 0.2942784687734618, "learning_rate": 6.9926334594576105e-06, "loss": 0.0361, "step": 3356 }, { "epoch": 1.2981438515081207, "grad_norm": 0.2776162565249855, "learning_rate": 6.990569857025078e-06, "loss": 0.0247, "step": 3357 }, { "epoch": 1.2985305491105956, "grad_norm": 0.24528089479865367, "learning_rate": 6.988505851581013e-06, "loss": 0.0317, "step": 3358 }, { "epoch": 1.2989172467130703, "grad_norm": 0.6039031715783924, "learning_rate": 6.986441443543299e-06, "loss": 0.0358, "step": 3359 }, { "epoch": 1.2993039443155452, "grad_norm": 0.2412907730051657, "learning_rate": 6.984376633329892e-06, "loss": 0.0208, "step": 3360 }, { "epoch": 1.29969064191802, "grad_norm": 0.3193719177338144, "learning_rate": 6.982311421358837e-06, "loss": 0.0298, "step": 3361 }, { "epoch": 1.300077339520495, "grad_norm": 0.2502697641009402, "learning_rate": 6.980245808048257e-06, "loss": 0.0326, "step": 3362 }, { "epoch": 1.3004640371229699, "grad_norm": 0.2596285381140403, "learning_rate": 6.978179793816356e-06, "loss": 0.0294, "step": 3363 }, { "epoch": 1.3008507347254448, "grad_norm": 0.2783962236522658, "learning_rate": 6.976113379081423e-06, "loss": 0.0317, "step": 3364 }, { "epoch": 1.3012374323279197, "grad_norm": 0.26260737558470704, "learning_rate": 6.974046564261821e-06, "loss": 0.0304, "step": 3365 }, { "epoch": 1.3016241299303943, "grad_norm": 0.3864632445846274, "learning_rate": 6.9719793497760015e-06, "loss": 0.0255, "step": 3366 }, { "epoch": 1.3020108275328692, "grad_norm": 0.2568185901170703, "learning_rate": 6.969911736042492e-06, "loss": 0.0249, "step": 3367 }, { "epoch": 1.3023975251353441, "grad_norm": 0.30508136593172713, "learning_rate": 6.967843723479901e-06, "loss": 0.0451, "step": 3368 }, { "epoch": 1.302784222737819, "grad_norm": 0.296069272410513, "learning_rate": 6.965775312506922e-06, "loss": 0.0255, "step": 3369 }, { "epoch": 1.303170920340294, "grad_norm": 0.41228882154075275, "learning_rate": 6.963706503542324e-06, "loss": 0.036, "step": 3370 }, { "epoch": 1.3035576179427688, "grad_norm": 0.32457548769957545, "learning_rate": 6.961637297004961e-06, "loss": 0.0287, "step": 3371 }, { "epoch": 1.3039443155452437, "grad_norm": 0.29392833018441766, "learning_rate": 6.959567693313762e-06, "loss": 0.0283, "step": 3372 }, { "epoch": 1.3043310131477184, "grad_norm": 0.4024699598851943, "learning_rate": 6.957497692887744e-06, "loss": 0.0356, "step": 3373 }, { "epoch": 1.3047177107501933, "grad_norm": 0.2956994387482094, "learning_rate": 6.9554272961459966e-06, "loss": 0.0275, "step": 3374 }, { "epoch": 1.3051044083526682, "grad_norm": 0.2793330111040093, "learning_rate": 6.953356503507696e-06, "loss": 0.0323, "step": 3375 }, { "epoch": 1.305491105955143, "grad_norm": 0.32881288518601015, "learning_rate": 6.9512853153920924e-06, "loss": 0.0343, "step": 3376 }, { "epoch": 1.305877803557618, "grad_norm": 0.43141022729388556, "learning_rate": 6.949213732218522e-06, "loss": 0.0229, "step": 3377 }, { "epoch": 1.3062645011600929, "grad_norm": 0.30663786581601055, "learning_rate": 6.947141754406397e-06, "loss": 0.0372, "step": 3378 }, { "epoch": 1.3066511987625677, "grad_norm": 0.35077636096998316, "learning_rate": 6.945069382375211e-06, "loss": 0.0333, "step": 3379 }, { "epoch": 1.3070378963650424, "grad_norm": 0.3492972073766244, "learning_rate": 6.942996616544539e-06, "loss": 0.0275, "step": 3380 }, { "epoch": 1.3074245939675175, "grad_norm": 0.4358535229305178, "learning_rate": 6.94092345733403e-06, "loss": 0.0409, "step": 3381 }, { "epoch": 1.3078112915699922, "grad_norm": 0.2672526326968958, "learning_rate": 6.938849905163419e-06, "loss": 0.028, "step": 3382 }, { "epoch": 1.308197989172467, "grad_norm": 0.3488400969889651, "learning_rate": 6.93677596045252e-06, "loss": 0.0387, "step": 3383 }, { "epoch": 1.308584686774942, "grad_norm": 0.3008549688693114, "learning_rate": 6.93470162362122e-06, "loss": 0.0262, "step": 3384 }, { "epoch": 1.308971384377417, "grad_norm": 0.29581822321598933, "learning_rate": 6.932626895089496e-06, "loss": 0.0317, "step": 3385 }, { "epoch": 1.3093580819798918, "grad_norm": 0.46189909100775023, "learning_rate": 6.930551775277394e-06, "loss": 0.0379, "step": 3386 }, { "epoch": 1.3097447795823665, "grad_norm": 0.2967894812940358, "learning_rate": 6.928476264605045e-06, "loss": 0.0296, "step": 3387 }, { "epoch": 1.3101314771848416, "grad_norm": 0.2785631106677307, "learning_rate": 6.9264003634926556e-06, "loss": 0.0374, "step": 3388 }, { "epoch": 1.3105181747873162, "grad_norm": 0.34370061487854653, "learning_rate": 6.9243240723605176e-06, "loss": 0.0336, "step": 3389 }, { "epoch": 1.3109048723897911, "grad_norm": 0.5603688022148537, "learning_rate": 6.922247391628993e-06, "loss": 0.0294, "step": 3390 }, { "epoch": 1.311291569992266, "grad_norm": 0.27337783141052674, "learning_rate": 6.920170321718531e-06, "loss": 0.0299, "step": 3391 }, { "epoch": 1.311678267594741, "grad_norm": 0.25041660144993994, "learning_rate": 6.918092863049654e-06, "loss": 0.0232, "step": 3392 }, { "epoch": 1.3120649651972158, "grad_norm": 0.63212784712029, "learning_rate": 6.916015016042966e-06, "loss": 0.024, "step": 3393 }, { "epoch": 1.3124516627996907, "grad_norm": 0.3646930999313354, "learning_rate": 6.913936781119149e-06, "loss": 0.0363, "step": 3394 }, { "epoch": 1.3128383604021656, "grad_norm": 0.33781596959485743, "learning_rate": 6.911858158698963e-06, "loss": 0.0365, "step": 3395 }, { "epoch": 1.3132250580046403, "grad_norm": 0.27559193125313075, "learning_rate": 6.909779149203246e-06, "loss": 0.0288, "step": 3396 }, { "epoch": 1.3136117556071152, "grad_norm": 0.3629050426027472, "learning_rate": 6.907699753052914e-06, "loss": 0.0222, "step": 3397 }, { "epoch": 1.31399845320959, "grad_norm": 0.4021110733309718, "learning_rate": 6.905619970668966e-06, "loss": 0.051, "step": 3398 }, { "epoch": 1.314385150812065, "grad_norm": 0.4671493920000694, "learning_rate": 6.903539802472472e-06, "loss": 0.0321, "step": 3399 }, { "epoch": 1.3147718484145399, "grad_norm": 0.31300736977461074, "learning_rate": 6.901459248884586e-06, "loss": 0.0348, "step": 3400 }, { "epoch": 1.3151585460170148, "grad_norm": 0.34259873958555437, "learning_rate": 6.899378310326535e-06, "loss": 0.036, "step": 3401 }, { "epoch": 1.3155452436194897, "grad_norm": 0.43756149861609694, "learning_rate": 6.897296987219631e-06, "loss": 0.0338, "step": 3402 }, { "epoch": 1.3159319412219643, "grad_norm": 0.31225097440547706, "learning_rate": 6.895215279985253e-06, "loss": 0.028, "step": 3403 }, { "epoch": 1.3163186388244392, "grad_norm": 0.33539926202637804, "learning_rate": 6.893133189044871e-06, "loss": 0.0337, "step": 3404 }, { "epoch": 1.3167053364269141, "grad_norm": 0.32849035923292175, "learning_rate": 6.89105071482002e-06, "loss": 0.0298, "step": 3405 }, { "epoch": 1.317092034029389, "grad_norm": 0.3250191005715964, "learning_rate": 6.888967857732324e-06, "loss": 0.0252, "step": 3406 }, { "epoch": 1.317478731631864, "grad_norm": 0.315292276556328, "learning_rate": 6.886884618203475e-06, "loss": 0.0357, "step": 3407 }, { "epoch": 1.3178654292343388, "grad_norm": 0.36782062943102783, "learning_rate": 6.884800996655249e-06, "loss": 0.0393, "step": 3408 }, { "epoch": 1.3182521268368137, "grad_norm": 0.259121611191994, "learning_rate": 6.882716993509495e-06, "loss": 0.0309, "step": 3409 }, { "epoch": 1.3186388244392884, "grad_norm": 0.32503265440864887, "learning_rate": 6.88063260918814e-06, "loss": 0.0318, "step": 3410 }, { "epoch": 1.3190255220417633, "grad_norm": 0.39814161807138215, "learning_rate": 6.878547844113195e-06, "loss": 0.0282, "step": 3411 }, { "epoch": 1.3194122196442382, "grad_norm": 0.35048185006164795, "learning_rate": 6.876462698706735e-06, "loss": 0.0337, "step": 3412 }, { "epoch": 1.319798917246713, "grad_norm": 0.38706344705882073, "learning_rate": 6.874377173390925e-06, "loss": 0.025, "step": 3413 }, { "epoch": 1.320185614849188, "grad_norm": 0.6343666528610111, "learning_rate": 6.872291268587997e-06, "loss": 0.0342, "step": 3414 }, { "epoch": 1.3205723124516628, "grad_norm": 0.23611559409898314, "learning_rate": 6.870204984720268e-06, "loss": 0.02, "step": 3415 }, { "epoch": 1.3209590100541377, "grad_norm": 0.4746492652340421, "learning_rate": 6.868118322210124e-06, "loss": 0.0406, "step": 3416 }, { "epoch": 1.3213457076566124, "grad_norm": 0.2787970389184098, "learning_rate": 6.866031281480034e-06, "loss": 0.0257, "step": 3417 }, { "epoch": 1.3217324052590875, "grad_norm": 0.32631172595009333, "learning_rate": 6.86394386295254e-06, "loss": 0.0233, "step": 3418 }, { "epoch": 1.3221191028615622, "grad_norm": 0.26460005361111055, "learning_rate": 6.861856067050263e-06, "loss": 0.027, "step": 3419 }, { "epoch": 1.322505800464037, "grad_norm": 0.3134516461107503, "learning_rate": 6.859767894195898e-06, "loss": 0.025, "step": 3420 }, { "epoch": 1.322892498066512, "grad_norm": 0.32567012189411126, "learning_rate": 6.857679344812216e-06, "loss": 0.0257, "step": 3421 }, { "epoch": 1.3232791956689869, "grad_norm": 0.2772393409368174, "learning_rate": 6.855590419322067e-06, "loss": 0.0333, "step": 3422 }, { "epoch": 1.3236658932714618, "grad_norm": 0.33664195417381626, "learning_rate": 6.853501118148375e-06, "loss": 0.0332, "step": 3423 }, { "epoch": 1.3240525908739365, "grad_norm": 0.3131297318280238, "learning_rate": 6.851411441714142e-06, "loss": 0.0242, "step": 3424 }, { "epoch": 1.3244392884764116, "grad_norm": 0.2649441851907265, "learning_rate": 6.849321390442443e-06, "loss": 0.0272, "step": 3425 }, { "epoch": 1.3248259860788862, "grad_norm": 0.4042943128840668, "learning_rate": 6.847230964756431e-06, "loss": 0.0413, "step": 3426 }, { "epoch": 1.3252126836813611, "grad_norm": 0.26343996582767354, "learning_rate": 6.8451401650793355e-06, "loss": 0.028, "step": 3427 }, { "epoch": 1.325599381283836, "grad_norm": 0.6042771326928056, "learning_rate": 6.84304899183446e-06, "loss": 0.0438, "step": 3428 }, { "epoch": 1.325986078886311, "grad_norm": 0.4437179490548376, "learning_rate": 6.840957445445182e-06, "loss": 0.0245, "step": 3429 }, { "epoch": 1.3263727764887858, "grad_norm": 0.3032834927844517, "learning_rate": 6.838865526334961e-06, "loss": 0.0297, "step": 3430 }, { "epoch": 1.3267594740912607, "grad_norm": 0.38122028695234345, "learning_rate": 6.8367732349273245e-06, "loss": 0.0326, "step": 3431 }, { "epoch": 1.3271461716937356, "grad_norm": 0.3965951491376518, "learning_rate": 6.8346805716458795e-06, "loss": 0.0287, "step": 3432 }, { "epoch": 1.3275328692962103, "grad_norm": 0.40768942654985735, "learning_rate": 6.832587536914308e-06, "loss": 0.0291, "step": 3433 }, { "epoch": 1.3279195668986852, "grad_norm": 0.28725430504216554, "learning_rate": 6.830494131156367e-06, "loss": 0.0309, "step": 3434 }, { "epoch": 1.32830626450116, "grad_norm": 0.33545273484881016, "learning_rate": 6.828400354795886e-06, "loss": 0.0325, "step": 3435 }, { "epoch": 1.328692962103635, "grad_norm": 0.3254428513389614, "learning_rate": 6.826306208256774e-06, "loss": 0.0424, "step": 3436 }, { "epoch": 1.3290796597061099, "grad_norm": 0.28649556230837686, "learning_rate": 6.8242116919630095e-06, "loss": 0.0342, "step": 3437 }, { "epoch": 1.3294663573085848, "grad_norm": 0.4733914956955735, "learning_rate": 6.822116806338654e-06, "loss": 0.0337, "step": 3438 }, { "epoch": 1.3298530549110597, "grad_norm": 0.3024124455033654, "learning_rate": 6.820021551807835e-06, "loss": 0.0283, "step": 3439 }, { "epoch": 1.3302397525135343, "grad_norm": 0.5205488555654258, "learning_rate": 6.8179259287947604e-06, "loss": 0.0493, "step": 3440 }, { "epoch": 1.3306264501160092, "grad_norm": 0.3003884489959236, "learning_rate": 6.8158299377237106e-06, "loss": 0.0275, "step": 3441 }, { "epoch": 1.3310131477184841, "grad_norm": 0.34946537820976203, "learning_rate": 6.813733579019039e-06, "loss": 0.0343, "step": 3442 }, { "epoch": 1.331399845320959, "grad_norm": 0.3701861402468206, "learning_rate": 6.811636853105178e-06, "loss": 0.0327, "step": 3443 }, { "epoch": 1.331786542923434, "grad_norm": 0.31310565927771605, "learning_rate": 6.809539760406629e-06, "loss": 0.0257, "step": 3444 }, { "epoch": 1.3321732405259088, "grad_norm": 0.26593666564675567, "learning_rate": 6.807442301347973e-06, "loss": 0.0356, "step": 3445 }, { "epoch": 1.3325599381283837, "grad_norm": 0.3254931691496042, "learning_rate": 6.805344476353859e-06, "loss": 0.0367, "step": 3446 }, { "epoch": 1.3329466357308584, "grad_norm": 0.24769563086457644, "learning_rate": 6.8032462858490154e-06, "loss": 0.024, "step": 3447 }, { "epoch": 1.3333333333333333, "grad_norm": 0.24199479109717292, "learning_rate": 6.801147730258242e-06, "loss": 0.0253, "step": 3448 }, { "epoch": 1.3337200309358082, "grad_norm": 0.3131754560672383, "learning_rate": 6.799048810006415e-06, "loss": 0.0301, "step": 3449 }, { "epoch": 1.334106728538283, "grad_norm": 0.3607103416233516, "learning_rate": 6.796949525518478e-06, "loss": 0.0351, "step": 3450 }, { "epoch": 1.334493426140758, "grad_norm": 0.40703937377502636, "learning_rate": 6.794849877219458e-06, "loss": 0.0354, "step": 3451 }, { "epoch": 1.3348801237432328, "grad_norm": 0.2635685581827177, "learning_rate": 6.792749865534448e-06, "loss": 0.0188, "step": 3452 }, { "epoch": 1.3352668213457077, "grad_norm": 0.38726035718146035, "learning_rate": 6.79064949088862e-06, "loss": 0.0303, "step": 3453 }, { "epoch": 1.3356535189481824, "grad_norm": 0.4076180089796587, "learning_rate": 6.788548753707211e-06, "loss": 0.0437, "step": 3454 }, { "epoch": 1.3360402165506575, "grad_norm": 0.26024379446383195, "learning_rate": 6.7864476544155436e-06, "loss": 0.0331, "step": 3455 }, { "epoch": 1.3364269141531322, "grad_norm": 0.5233524724908799, "learning_rate": 6.784346193439001e-06, "loss": 0.0501, "step": 3456 }, { "epoch": 1.336813611755607, "grad_norm": 0.28293322557934125, "learning_rate": 6.782244371203051e-06, "loss": 0.0223, "step": 3457 }, { "epoch": 1.337200309358082, "grad_norm": 0.2785699626023878, "learning_rate": 6.780142188133225e-06, "loss": 0.033, "step": 3458 }, { "epoch": 1.3375870069605569, "grad_norm": 0.25915832787527, "learning_rate": 6.778039644655136e-06, "loss": 0.027, "step": 3459 }, { "epoch": 1.3379737045630318, "grad_norm": 0.2615257253317243, "learning_rate": 6.775936741194464e-06, "loss": 0.0214, "step": 3460 }, { "epoch": 1.3383604021655064, "grad_norm": 0.5382973110079121, "learning_rate": 6.773833478176961e-06, "loss": 0.0315, "step": 3461 }, { "epoch": 1.3387470997679816, "grad_norm": 0.4333889582482502, "learning_rate": 6.771729856028459e-06, "loss": 0.0396, "step": 3462 }, { "epoch": 1.3391337973704562, "grad_norm": 0.3796658882342298, "learning_rate": 6.769625875174854e-06, "loss": 0.0235, "step": 3463 }, { "epoch": 1.3395204949729311, "grad_norm": 0.28004256583017817, "learning_rate": 6.767521536042122e-06, "loss": 0.023, "step": 3464 }, { "epoch": 1.339907192575406, "grad_norm": 0.28629601825891327, "learning_rate": 6.765416839056306e-06, "loss": 0.0283, "step": 3465 }, { "epoch": 1.340293890177881, "grad_norm": 0.40979540200640013, "learning_rate": 6.763311784643527e-06, "loss": 0.0308, "step": 3466 }, { "epoch": 1.3406805877803558, "grad_norm": 0.27132005686825506, "learning_rate": 6.76120637322997e-06, "loss": 0.0297, "step": 3467 }, { "epoch": 1.3410672853828307, "grad_norm": 0.537938956305941, "learning_rate": 6.759100605241901e-06, "loss": 0.0561, "step": 3468 }, { "epoch": 1.3414539829853056, "grad_norm": 0.26834253027371613, "learning_rate": 6.756994481105654e-06, "loss": 0.0238, "step": 3469 }, { "epoch": 1.3418406805877803, "grad_norm": 0.3259221557569971, "learning_rate": 6.754888001247637e-06, "loss": 0.0321, "step": 3470 }, { "epoch": 1.3422273781902552, "grad_norm": 0.43964865428750316, "learning_rate": 6.752781166094324e-06, "loss": 0.0396, "step": 3471 }, { "epoch": 1.34261407579273, "grad_norm": 0.3215985699797482, "learning_rate": 6.7506739760722714e-06, "loss": 0.0325, "step": 3472 }, { "epoch": 1.343000773395205, "grad_norm": 0.32459099164530963, "learning_rate": 6.748566431608099e-06, "loss": 0.0267, "step": 3473 }, { "epoch": 1.3433874709976799, "grad_norm": 0.2873798157011744, "learning_rate": 6.746458533128501e-06, "loss": 0.0274, "step": 3474 }, { "epoch": 1.3437741686001548, "grad_norm": 0.2866873997096841, "learning_rate": 6.744350281060242e-06, "loss": 0.0275, "step": 3475 }, { "epoch": 1.3441608662026296, "grad_norm": 0.39483977484157, "learning_rate": 6.742241675830163e-06, "loss": 0.0385, "step": 3476 }, { "epoch": 1.3445475638051043, "grad_norm": 0.27419965550440123, "learning_rate": 6.74013271786517e-06, "loss": 0.0296, "step": 3477 }, { "epoch": 1.3449342614075792, "grad_norm": 0.38695288332979266, "learning_rate": 6.738023407592245e-06, "loss": 0.0379, "step": 3478 }, { "epoch": 1.345320959010054, "grad_norm": 0.3253467232580454, "learning_rate": 6.735913745438438e-06, "loss": 0.0382, "step": 3479 }, { "epoch": 1.345707656612529, "grad_norm": 0.3216095804879307, "learning_rate": 6.733803731830876e-06, "loss": 0.033, "step": 3480 }, { "epoch": 1.346094354215004, "grad_norm": 0.2866947030270902, "learning_rate": 6.7316933671967484e-06, "loss": 0.0305, "step": 3481 }, { "epoch": 1.3464810518174788, "grad_norm": 0.31629269615470434, "learning_rate": 6.729582651963322e-06, "loss": 0.0285, "step": 3482 }, { "epoch": 1.3468677494199537, "grad_norm": 0.28991152354752564, "learning_rate": 6.727471586557935e-06, "loss": 0.0303, "step": 3483 }, { "epoch": 1.3472544470224284, "grad_norm": 0.33873348277743115, "learning_rate": 6.725360171407992e-06, "loss": 0.0345, "step": 3484 }, { "epoch": 1.3476411446249033, "grad_norm": 0.26912584464782985, "learning_rate": 6.723248406940972e-06, "loss": 0.0301, "step": 3485 }, { "epoch": 1.3480278422273781, "grad_norm": 0.3099981560570871, "learning_rate": 6.721136293584425e-06, "loss": 0.0296, "step": 3486 }, { "epoch": 1.348414539829853, "grad_norm": 0.3859800741495482, "learning_rate": 6.719023831765967e-06, "loss": 0.0358, "step": 3487 }, { "epoch": 1.348801237432328, "grad_norm": 0.3151549039386121, "learning_rate": 6.7169110219132905e-06, "loss": 0.0359, "step": 3488 }, { "epoch": 1.3491879350348028, "grad_norm": 0.23064024500691083, "learning_rate": 6.714797864454155e-06, "loss": 0.0313, "step": 3489 }, { "epoch": 1.3495746326372777, "grad_norm": 0.2541350550147551, "learning_rate": 6.71268435981639e-06, "loss": 0.0308, "step": 3490 }, { "epoch": 1.3499613302397524, "grad_norm": 0.31610820748123214, "learning_rate": 6.710570508427898e-06, "loss": 0.036, "step": 3491 }, { "epoch": 1.3503480278422275, "grad_norm": 0.36574832940354207, "learning_rate": 6.708456310716649e-06, "loss": 0.0354, "step": 3492 }, { "epoch": 1.3507347254447022, "grad_norm": 0.3466697031176437, "learning_rate": 6.706341767110685e-06, "loss": 0.0363, "step": 3493 }, { "epoch": 1.351121423047177, "grad_norm": 0.6205962328026741, "learning_rate": 6.7042268780381194e-06, "loss": 0.0328, "step": 3494 }, { "epoch": 1.351508120649652, "grad_norm": 0.29154714525366254, "learning_rate": 6.70211164392713e-06, "loss": 0.0327, "step": 3495 }, { "epoch": 1.3518948182521269, "grad_norm": 0.23673855555863885, "learning_rate": 6.6999960652059694e-06, "loss": 0.028, "step": 3496 }, { "epoch": 1.3522815158546018, "grad_norm": 0.43719987174590236, "learning_rate": 6.6978801423029575e-06, "loss": 0.0254, "step": 3497 }, { "epoch": 1.3526682134570764, "grad_norm": 0.2887179895653923, "learning_rate": 6.695763875646486e-06, "loss": 0.0239, "step": 3498 }, { "epoch": 1.3530549110595516, "grad_norm": 0.31582548216588374, "learning_rate": 6.693647265665015e-06, "loss": 0.0256, "step": 3499 }, { "epoch": 1.3534416086620262, "grad_norm": 0.26960871231991523, "learning_rate": 6.691530312787075e-06, "loss": 0.0252, "step": 3500 }, { "epoch": 1.3538283062645011, "grad_norm": 0.2915134428159724, "learning_rate": 6.689413017441262e-06, "loss": 0.0295, "step": 3501 }, { "epoch": 1.354215003866976, "grad_norm": 0.3851501899226635, "learning_rate": 6.6872953800562465e-06, "loss": 0.0419, "step": 3502 }, { "epoch": 1.354601701469451, "grad_norm": 0.26112759344775865, "learning_rate": 6.685177401060766e-06, "loss": 0.0274, "step": 3503 }, { "epoch": 1.3549883990719258, "grad_norm": 0.32486156831859486, "learning_rate": 6.683059080883628e-06, "loss": 0.0304, "step": 3504 }, { "epoch": 1.3553750966744005, "grad_norm": 0.2351088398120209, "learning_rate": 6.6809404199537075e-06, "loss": 0.0233, "step": 3505 }, { "epoch": 1.3557617942768756, "grad_norm": 0.27477447613366834, "learning_rate": 6.67882141869995e-06, "loss": 0.0276, "step": 3506 }, { "epoch": 1.3561484918793503, "grad_norm": 0.26671280765096866, "learning_rate": 6.676702077551369e-06, "loss": 0.0281, "step": 3507 }, { "epoch": 1.3565351894818252, "grad_norm": 0.3306234813341256, "learning_rate": 6.674582396937047e-06, "loss": 0.0409, "step": 3508 }, { "epoch": 1.3569218870843, "grad_norm": 0.27039992326351686, "learning_rate": 6.672462377286136e-06, "loss": 0.0246, "step": 3509 }, { "epoch": 1.357308584686775, "grad_norm": 0.2746512173330655, "learning_rate": 6.670342019027853e-06, "loss": 0.0204, "step": 3510 }, { "epoch": 1.3576952822892498, "grad_norm": 0.3683585074317367, "learning_rate": 6.668221322591492e-06, "loss": 0.0326, "step": 3511 }, { "epoch": 1.3580819798917247, "grad_norm": 0.3425360805728988, "learning_rate": 6.666100288406405e-06, "loss": 0.0318, "step": 3512 }, { "epoch": 1.3584686774941996, "grad_norm": 0.42913446739156735, "learning_rate": 6.663978916902021e-06, "loss": 0.0403, "step": 3513 }, { "epoch": 1.3588553750966743, "grad_norm": 0.4491524921255563, "learning_rate": 6.661857208507831e-06, "loss": 0.0303, "step": 3514 }, { "epoch": 1.3592420726991492, "grad_norm": 0.35581309461781885, "learning_rate": 6.659735163653398e-06, "loss": 0.0406, "step": 3515 }, { "epoch": 1.359628770301624, "grad_norm": 0.3133877243167549, "learning_rate": 6.6576127827683525e-06, "loss": 0.0352, "step": 3516 }, { "epoch": 1.360015467904099, "grad_norm": 0.27722072717473206, "learning_rate": 6.655490066282391e-06, "loss": 0.0289, "step": 3517 }, { "epoch": 1.360402165506574, "grad_norm": 0.2720490456244613, "learning_rate": 6.65336701462528e-06, "loss": 0.029, "step": 3518 }, { "epoch": 1.3607888631090488, "grad_norm": 0.3273446239987742, "learning_rate": 6.651243628226855e-06, "loss": 0.0322, "step": 3519 }, { "epoch": 1.3611755607115237, "grad_norm": 0.3198852722877272, "learning_rate": 6.649119907517016e-06, "loss": 0.0259, "step": 3520 }, { "epoch": 1.3615622583139984, "grad_norm": 0.30267331765562855, "learning_rate": 6.646995852925735e-06, "loss": 0.022, "step": 3521 }, { "epoch": 1.3619489559164732, "grad_norm": 0.2915400290969437, "learning_rate": 6.644871464883042e-06, "loss": 0.033, "step": 3522 }, { "epoch": 1.3623356535189481, "grad_norm": 0.33635210943854776, "learning_rate": 6.642746743819051e-06, "loss": 0.0298, "step": 3523 }, { "epoch": 1.362722351121423, "grad_norm": 0.2720496207603095, "learning_rate": 6.6406216901639256e-06, "loss": 0.023, "step": 3524 }, { "epoch": 1.363109048723898, "grad_norm": 0.2362297685850582, "learning_rate": 6.638496304347909e-06, "loss": 0.0242, "step": 3525 }, { "epoch": 1.3634957463263728, "grad_norm": 0.398746548229541, "learning_rate": 6.636370586801307e-06, "loss": 0.033, "step": 3526 }, { "epoch": 1.3638824439288477, "grad_norm": 0.6135689247858449, "learning_rate": 6.634244537954493e-06, "loss": 0.0317, "step": 3527 }, { "epoch": 1.3642691415313224, "grad_norm": 0.358619397196188, "learning_rate": 6.632118158237908e-06, "loss": 0.0309, "step": 3528 }, { "epoch": 1.3646558391337973, "grad_norm": 0.31288562938309317, "learning_rate": 6.629991448082058e-06, "loss": 0.0284, "step": 3529 }, { "epoch": 1.3650425367362722, "grad_norm": 0.32747257485337583, "learning_rate": 6.627864407917521e-06, "loss": 0.0417, "step": 3530 }, { "epoch": 1.365429234338747, "grad_norm": 0.32866685083602976, "learning_rate": 6.625737038174935e-06, "loss": 0.03, "step": 3531 }, { "epoch": 1.365815931941222, "grad_norm": 0.4165821196832786, "learning_rate": 6.62360933928501e-06, "loss": 0.0438, "step": 3532 }, { "epoch": 1.3662026295436969, "grad_norm": 0.49491887594954537, "learning_rate": 6.6214813116785205e-06, "loss": 0.0437, "step": 3533 }, { "epoch": 1.3665893271461718, "grad_norm": 0.2577679135592763, "learning_rate": 6.619352955786307e-06, "loss": 0.0252, "step": 3534 }, { "epoch": 1.3669760247486464, "grad_norm": 0.27294044330268463, "learning_rate": 6.617224272039277e-06, "loss": 0.0223, "step": 3535 }, { "epoch": 1.3673627223511216, "grad_norm": 0.25022697047858733, "learning_rate": 6.615095260868405e-06, "loss": 0.0263, "step": 3536 }, { "epoch": 1.3677494199535962, "grad_norm": 0.40197235868942904, "learning_rate": 6.612965922704731e-06, "loss": 0.0375, "step": 3537 }, { "epoch": 1.3681361175560711, "grad_norm": 0.3847806451309904, "learning_rate": 6.610836257979365e-06, "loss": 0.025, "step": 3538 }, { "epoch": 1.368522815158546, "grad_norm": 0.3905114686598771, "learning_rate": 6.608706267123476e-06, "loss": 0.0334, "step": 3539 }, { "epoch": 1.368909512761021, "grad_norm": 0.31584383331543286, "learning_rate": 6.606575950568302e-06, "loss": 0.0361, "step": 3540 }, { "epoch": 1.3692962103634958, "grad_norm": 0.21932570853246675, "learning_rate": 6.604445308745151e-06, "loss": 0.0204, "step": 3541 }, { "epoch": 1.3696829079659705, "grad_norm": 0.21890499415378695, "learning_rate": 6.602314342085392e-06, "loss": 0.0234, "step": 3542 }, { "epoch": 1.3700696055684456, "grad_norm": 0.29424167291477055, "learning_rate": 6.600183051020461e-06, "loss": 0.028, "step": 3543 }, { "epoch": 1.3704563031709203, "grad_norm": 0.28487117453958044, "learning_rate": 6.598051435981859e-06, "loss": 0.0336, "step": 3544 }, { "epoch": 1.3708430007733952, "grad_norm": 0.3850977817269631, "learning_rate": 6.595919497401158e-06, "loss": 0.0309, "step": 3545 }, { "epoch": 1.37122969837587, "grad_norm": 0.27407053140335363, "learning_rate": 6.593787235709985e-06, "loss": 0.0258, "step": 3546 }, { "epoch": 1.371616395978345, "grad_norm": 0.2502126208407924, "learning_rate": 6.591654651340044e-06, "loss": 0.0272, "step": 3547 }, { "epoch": 1.3720030935808198, "grad_norm": 0.31024471889524974, "learning_rate": 6.5895217447230945e-06, "loss": 0.0362, "step": 3548 }, { "epoch": 1.3723897911832947, "grad_norm": 0.2660974493473264, "learning_rate": 6.587388516290969e-06, "loss": 0.0359, "step": 3549 }, { "epoch": 1.3727764887857696, "grad_norm": 0.2594061278211308, "learning_rate": 6.585254966475558e-06, "loss": 0.0315, "step": 3550 }, { "epoch": 1.3731631863882443, "grad_norm": 0.23905937325718682, "learning_rate": 6.583121095708826e-06, "loss": 0.0254, "step": 3551 }, { "epoch": 1.3735498839907192, "grad_norm": 0.3774355092331003, "learning_rate": 6.5809869044227915e-06, "loss": 0.0318, "step": 3552 }, { "epoch": 1.373936581593194, "grad_norm": 0.2693705242488239, "learning_rate": 6.578852393049547e-06, "loss": 0.0287, "step": 3553 }, { "epoch": 1.374323279195669, "grad_norm": 0.34004897413894725, "learning_rate": 6.576717562021248e-06, "loss": 0.0261, "step": 3554 }, { "epoch": 1.3747099767981439, "grad_norm": 0.29187552726651156, "learning_rate": 6.574582411770109e-06, "loss": 0.0297, "step": 3555 }, { "epoch": 1.3750966744006188, "grad_norm": 0.3572792871982733, "learning_rate": 6.572446942728416e-06, "loss": 0.024, "step": 3556 }, { "epoch": 1.3754833720030937, "grad_norm": 0.3246892213317544, "learning_rate": 6.570311155328517e-06, "loss": 0.0327, "step": 3557 }, { "epoch": 1.3758700696055683, "grad_norm": 0.4602286126735849, "learning_rate": 6.568175050002823e-06, "loss": 0.0362, "step": 3558 }, { "epoch": 1.3762567672080432, "grad_norm": 0.24642538439228429, "learning_rate": 6.566038627183813e-06, "loss": 0.0233, "step": 3559 }, { "epoch": 1.3766434648105181, "grad_norm": 0.30484285410365547, "learning_rate": 6.563901887304025e-06, "loss": 0.0269, "step": 3560 }, { "epoch": 1.377030162412993, "grad_norm": 0.4495568288571857, "learning_rate": 6.5617648307960646e-06, "loss": 0.0333, "step": 3561 }, { "epoch": 1.377416860015468, "grad_norm": 0.3552284022095931, "learning_rate": 6.559627458092604e-06, "loss": 0.0324, "step": 3562 }, { "epoch": 1.3778035576179428, "grad_norm": 0.3643769300947549, "learning_rate": 6.557489769626373e-06, "loss": 0.0435, "step": 3563 }, { "epoch": 1.3781902552204177, "grad_norm": 0.7472513027008978, "learning_rate": 6.555351765830171e-06, "loss": 0.0408, "step": 3564 }, { "epoch": 1.3785769528228924, "grad_norm": 0.399289132058217, "learning_rate": 6.553213447136858e-06, "loss": 0.0355, "step": 3565 }, { "epoch": 1.3789636504253673, "grad_norm": 0.3651550922732247, "learning_rate": 6.551074813979361e-06, "loss": 0.0229, "step": 3566 }, { "epoch": 1.3793503480278422, "grad_norm": 0.29712467872054077, "learning_rate": 6.548935866790664e-06, "loss": 0.0311, "step": 3567 }, { "epoch": 1.379737045630317, "grad_norm": 0.4122456828035931, "learning_rate": 6.546796606003825e-06, "loss": 0.0301, "step": 3568 }, { "epoch": 1.380123743232792, "grad_norm": 0.3871934219202428, "learning_rate": 6.544657032051953e-06, "loss": 0.0367, "step": 3569 }, { "epoch": 1.3805104408352669, "grad_norm": 0.43203800346063503, "learning_rate": 6.542517145368233e-06, "loss": 0.0452, "step": 3570 }, { "epoch": 1.3808971384377418, "grad_norm": 0.4008604839234141, "learning_rate": 6.5403769463859015e-06, "loss": 0.0252, "step": 3571 }, { "epoch": 1.3812838360402164, "grad_norm": 0.3694783363106255, "learning_rate": 6.538236435538269e-06, "loss": 0.0298, "step": 3572 }, { "epoch": 1.3816705336426915, "grad_norm": 0.26093097432361173, "learning_rate": 6.5360956132587026e-06, "loss": 0.0181, "step": 3573 }, { "epoch": 1.3820572312451662, "grad_norm": 0.3816513888653915, "learning_rate": 6.533954479980632e-06, "loss": 0.0369, "step": 3574 }, { "epoch": 1.3824439288476411, "grad_norm": 0.4156337354327435, "learning_rate": 6.531813036137556e-06, "loss": 0.0292, "step": 3575 }, { "epoch": 1.382830626450116, "grad_norm": 0.29470724025692047, "learning_rate": 6.529671282163026e-06, "loss": 0.0317, "step": 3576 }, { "epoch": 1.383217324052591, "grad_norm": 0.31385059460545756, "learning_rate": 6.527529218490668e-06, "loss": 0.0356, "step": 3577 }, { "epoch": 1.3836040216550658, "grad_norm": 0.27988885488671184, "learning_rate": 6.525386845554163e-06, "loss": 0.027, "step": 3578 }, { "epoch": 1.3839907192575405, "grad_norm": 0.3244730239010006, "learning_rate": 6.523244163787257e-06, "loss": 0.0254, "step": 3579 }, { "epoch": 1.3843774168600156, "grad_norm": 0.25022386394925644, "learning_rate": 6.521101173623756e-06, "loss": 0.02, "step": 3580 }, { "epoch": 1.3847641144624903, "grad_norm": 0.377817097459521, "learning_rate": 6.518957875497534e-06, "loss": 0.0333, "step": 3581 }, { "epoch": 1.3851508120649652, "grad_norm": 0.45325123115263705, "learning_rate": 6.516814269842521e-06, "loss": 0.0233, "step": 3582 }, { "epoch": 1.38553750966744, "grad_norm": 0.4989894106233219, "learning_rate": 6.514670357092715e-06, "loss": 0.0468, "step": 3583 }, { "epoch": 1.385924207269915, "grad_norm": 0.3103816725015238, "learning_rate": 6.51252613768217e-06, "loss": 0.0262, "step": 3584 }, { "epoch": 1.3863109048723898, "grad_norm": 0.22880837353941158, "learning_rate": 6.510381612045009e-06, "loss": 0.0282, "step": 3585 }, { "epoch": 1.3866976024748647, "grad_norm": 0.24173924295821644, "learning_rate": 6.508236780615412e-06, "loss": 0.0324, "step": 3586 }, { "epoch": 1.3870843000773396, "grad_norm": 0.37135328199515966, "learning_rate": 6.50609164382762e-06, "loss": 0.0429, "step": 3587 }, { "epoch": 1.3874709976798143, "grad_norm": 0.442740791713809, "learning_rate": 6.503946202115943e-06, "loss": 0.0362, "step": 3588 }, { "epoch": 1.3878576952822892, "grad_norm": 0.27807087926692237, "learning_rate": 6.501800455914743e-06, "loss": 0.0272, "step": 3589 }, { "epoch": 1.388244392884764, "grad_norm": 0.27585612750878674, "learning_rate": 6.499654405658453e-06, "loss": 0.0274, "step": 3590 }, { "epoch": 1.388631090487239, "grad_norm": 0.3540166795628697, "learning_rate": 6.49750805178156e-06, "loss": 0.0311, "step": 3591 }, { "epoch": 1.3890177880897139, "grad_norm": 0.48224373393389786, "learning_rate": 6.495361394718618e-06, "loss": 0.0445, "step": 3592 }, { "epoch": 1.3894044856921888, "grad_norm": 0.2587039586070836, "learning_rate": 6.493214434904238e-06, "loss": 0.0243, "step": 3593 }, { "epoch": 1.3897911832946637, "grad_norm": 0.37908051333892867, "learning_rate": 6.491067172773096e-06, "loss": 0.0342, "step": 3594 }, { "epoch": 1.3901778808971383, "grad_norm": 0.32469933348594504, "learning_rate": 6.488919608759925e-06, "loss": 0.0339, "step": 3595 }, { "epoch": 1.3905645784996132, "grad_norm": 0.3461762499753608, "learning_rate": 6.4867717432995235e-06, "loss": 0.0382, "step": 3596 }, { "epoch": 1.3909512761020881, "grad_norm": 0.44624067243034304, "learning_rate": 6.484623576826749e-06, "loss": 0.0369, "step": 3597 }, { "epoch": 1.391337973704563, "grad_norm": 0.29731806874498806, "learning_rate": 6.48247510977652e-06, "loss": 0.0269, "step": 3598 }, { "epoch": 1.391724671307038, "grad_norm": 0.33100254329725665, "learning_rate": 6.480326342583817e-06, "loss": 0.0251, "step": 3599 }, { "epoch": 1.3921113689095128, "grad_norm": 0.29441858459206094, "learning_rate": 6.478177275683679e-06, "loss": 0.0221, "step": 3600 }, { "epoch": 1.3924980665119877, "grad_norm": 0.2540485682152353, "learning_rate": 6.476027909511205e-06, "loss": 0.0229, "step": 3601 }, { "epoch": 1.3928847641144624, "grad_norm": 0.25345074188562433, "learning_rate": 6.4738782445015615e-06, "loss": 0.0212, "step": 3602 }, { "epoch": 1.3932714617169373, "grad_norm": 0.28385072222723146, "learning_rate": 6.471728281089964e-06, "loss": 0.0298, "step": 3603 }, { "epoch": 1.3936581593194122, "grad_norm": 0.4023696299354552, "learning_rate": 6.469578019711701e-06, "loss": 0.0387, "step": 3604 }, { "epoch": 1.394044856921887, "grad_norm": 0.5235258156157939, "learning_rate": 6.467427460802112e-06, "loss": 0.034, "step": 3605 }, { "epoch": 1.394431554524362, "grad_norm": 0.28157872802549194, "learning_rate": 6.465276604796601e-06, "loss": 0.0264, "step": 3606 }, { "epoch": 1.3948182521268369, "grad_norm": 0.3095493594270625, "learning_rate": 6.463125452130631e-06, "loss": 0.0361, "step": 3607 }, { "epoch": 1.3952049497293117, "grad_norm": 0.32363341025336195, "learning_rate": 6.460974003239724e-06, "loss": 0.0285, "step": 3608 }, { "epoch": 1.3955916473317864, "grad_norm": 0.2559367868731056, "learning_rate": 6.458822258559465e-06, "loss": 0.0259, "step": 3609 }, { "epoch": 1.3959783449342615, "grad_norm": 0.3906911710785992, "learning_rate": 6.456670218525495e-06, "loss": 0.0304, "step": 3610 }, { "epoch": 1.3963650425367362, "grad_norm": 0.2908024812904141, "learning_rate": 6.454517883573519e-06, "loss": 0.0249, "step": 3611 }, { "epoch": 1.396751740139211, "grad_norm": 0.31885957369954043, "learning_rate": 6.452365254139298e-06, "loss": 0.0322, "step": 3612 }, { "epoch": 1.397138437741686, "grad_norm": 0.3251095653246821, "learning_rate": 6.450212330658656e-06, "loss": 0.0285, "step": 3613 }, { "epoch": 1.397525135344161, "grad_norm": 0.361172073967012, "learning_rate": 6.448059113567473e-06, "loss": 0.0243, "step": 3614 }, { "epoch": 1.3979118329466358, "grad_norm": 0.22325995829253534, "learning_rate": 6.44590560330169e-06, "loss": 0.0245, "step": 3615 }, { "epoch": 1.3982985305491105, "grad_norm": 0.29873187906470866, "learning_rate": 6.443751800297307e-06, "loss": 0.0352, "step": 3616 }, { "epoch": 1.3986852281515856, "grad_norm": 0.7120973848594413, "learning_rate": 6.441597704990388e-06, "loss": 0.0319, "step": 3617 }, { "epoch": 1.3990719257540603, "grad_norm": 0.3159221055291224, "learning_rate": 6.439443317817047e-06, "loss": 0.0314, "step": 3618 }, { "epoch": 1.3994586233565351, "grad_norm": 0.3400022975854136, "learning_rate": 6.4372886392134635e-06, "loss": 0.0332, "step": 3619 }, { "epoch": 1.39984532095901, "grad_norm": 0.3512998130686768, "learning_rate": 6.4351336696158765e-06, "loss": 0.0258, "step": 3620 }, { "epoch": 1.400232018561485, "grad_norm": 0.2643711282500721, "learning_rate": 6.432978409460578e-06, "loss": 0.0273, "step": 3621 }, { "epoch": 1.4006187161639598, "grad_norm": 0.5033550477540517, "learning_rate": 6.430822859183928e-06, "loss": 0.0321, "step": 3622 }, { "epoch": 1.4010054137664347, "grad_norm": 0.495227604826131, "learning_rate": 6.428667019222334e-06, "loss": 0.0293, "step": 3623 }, { "epoch": 1.4013921113689096, "grad_norm": 0.24876922340131508, "learning_rate": 6.426510890012274e-06, "loss": 0.0254, "step": 3624 }, { "epoch": 1.4017788089713843, "grad_norm": 0.3641070844903612, "learning_rate": 6.424354471990275e-06, "loss": 0.0342, "step": 3625 }, { "epoch": 1.4021655065738592, "grad_norm": 0.25447729376031175, "learning_rate": 6.42219776559293e-06, "loss": 0.0253, "step": 3626 }, { "epoch": 1.402552204176334, "grad_norm": 0.37691056952014995, "learning_rate": 6.4200407712568815e-06, "loss": 0.0262, "step": 3627 }, { "epoch": 1.402938901778809, "grad_norm": 0.5673857872322128, "learning_rate": 6.417883489418838e-06, "loss": 0.0638, "step": 3628 }, { "epoch": 1.4033255993812839, "grad_norm": 0.25014293328499776, "learning_rate": 6.415725920515563e-06, "loss": 0.0238, "step": 3629 }, { "epoch": 1.4037122969837588, "grad_norm": 0.39567113195639164, "learning_rate": 6.4135680649838805e-06, "loss": 0.0256, "step": 3630 }, { "epoch": 1.4040989945862337, "grad_norm": 0.29377456759104625, "learning_rate": 6.411409923260668e-06, "loss": 0.0301, "step": 3631 }, { "epoch": 1.4044856921887083, "grad_norm": 0.22794680554526203, "learning_rate": 6.409251495782867e-06, "loss": 0.0264, "step": 3632 }, { "epoch": 1.4048723897911832, "grad_norm": 0.28421302391743786, "learning_rate": 6.4070927829874705e-06, "loss": 0.0217, "step": 3633 }, { "epoch": 1.4052590873936581, "grad_norm": 0.3060605772697404, "learning_rate": 6.404933785311535e-06, "loss": 0.0273, "step": 3634 }, { "epoch": 1.405645784996133, "grad_norm": 0.31611465674525796, "learning_rate": 6.402774503192168e-06, "loss": 0.0334, "step": 3635 }, { "epoch": 1.406032482598608, "grad_norm": 0.3608823462747229, "learning_rate": 6.4006149370665425e-06, "loss": 0.0193, "step": 3636 }, { "epoch": 1.4064191802010828, "grad_norm": 0.36309125596881603, "learning_rate": 6.398455087371882e-06, "loss": 0.0382, "step": 3637 }, { "epoch": 1.4068058778035577, "grad_norm": 0.3234320961493819, "learning_rate": 6.396294954545474e-06, "loss": 0.0345, "step": 3638 }, { "epoch": 1.4071925754060324, "grad_norm": 0.33976598377522554, "learning_rate": 6.394134539024659e-06, "loss": 0.0396, "step": 3639 }, { "epoch": 1.4075792730085073, "grad_norm": 0.42168032292742474, "learning_rate": 6.3919738412468324e-06, "loss": 0.0329, "step": 3640 }, { "epoch": 1.4079659706109822, "grad_norm": 0.2906440089038686, "learning_rate": 6.3898128616494525e-06, "loss": 0.0307, "step": 3641 }, { "epoch": 1.408352668213457, "grad_norm": 0.2728111791771065, "learning_rate": 6.387651600670031e-06, "loss": 0.028, "step": 3642 }, { "epoch": 1.408739365815932, "grad_norm": 0.2759059704570874, "learning_rate": 6.38549005874614e-06, "loss": 0.027, "step": 3643 }, { "epoch": 1.4091260634184068, "grad_norm": 0.33287289826194627, "learning_rate": 6.383328236315401e-06, "loss": 0.0292, "step": 3644 }, { "epoch": 1.4095127610208817, "grad_norm": 0.334596760096833, "learning_rate": 6.381166133815504e-06, "loss": 0.0361, "step": 3645 }, { "epoch": 1.4098994586233564, "grad_norm": 0.26359393202719744, "learning_rate": 6.379003751684185e-06, "loss": 0.0347, "step": 3646 }, { "epoch": 1.4102861562258315, "grad_norm": 0.27151198287238754, "learning_rate": 6.376841090359241e-06, "loss": 0.0288, "step": 3647 }, { "epoch": 1.4106728538283062, "grad_norm": 0.29686207714683255, "learning_rate": 6.374678150278526e-06, "loss": 0.0248, "step": 3648 }, { "epoch": 1.411059551430781, "grad_norm": 0.4731087844718751, "learning_rate": 6.372514931879949e-06, "loss": 0.0427, "step": 3649 }, { "epoch": 1.411446249033256, "grad_norm": 0.30435269457257685, "learning_rate": 6.3703514356014765e-06, "loss": 0.0346, "step": 3650 }, { "epoch": 1.411832946635731, "grad_norm": 0.2236967683616865, "learning_rate": 6.368187661881131e-06, "loss": 0.0228, "step": 3651 }, { "epoch": 1.4122196442382058, "grad_norm": 0.25778992280093366, "learning_rate": 6.366023611156991e-06, "loss": 0.0245, "step": 3652 }, { "epoch": 1.4126063418406805, "grad_norm": 0.4284091617949091, "learning_rate": 6.363859283867189e-06, "loss": 0.0277, "step": 3653 }, { "epoch": 1.4129930394431556, "grad_norm": 0.24598743405917997, "learning_rate": 6.361694680449918e-06, "loss": 0.0236, "step": 3654 }, { "epoch": 1.4133797370456302, "grad_norm": 0.2850639934442848, "learning_rate": 6.3595298013434234e-06, "loss": 0.0301, "step": 3655 }, { "epoch": 1.4137664346481051, "grad_norm": 0.3255325724359742, "learning_rate": 6.357364646986008e-06, "loss": 0.0308, "step": 3656 }, { "epoch": 1.41415313225058, "grad_norm": 0.3081766010549718, "learning_rate": 6.35519921781603e-06, "loss": 0.0306, "step": 3657 }, { "epoch": 1.414539829853055, "grad_norm": 0.37908381207371894, "learning_rate": 6.353033514271902e-06, "loss": 0.0366, "step": 3658 }, { "epoch": 1.4149265274555298, "grad_norm": 0.25961019813553593, "learning_rate": 6.350867536792093e-06, "loss": 0.0208, "step": 3659 }, { "epoch": 1.4153132250580047, "grad_norm": 0.3055336178792763, "learning_rate": 6.3487012858151305e-06, "loss": 0.0243, "step": 3660 }, { "epoch": 1.4156999226604796, "grad_norm": 0.23336388503313063, "learning_rate": 6.34653476177959e-06, "loss": 0.021, "step": 3661 }, { "epoch": 1.4160866202629543, "grad_norm": 0.28694502360485075, "learning_rate": 6.344367965124109e-06, "loss": 0.0253, "step": 3662 }, { "epoch": 1.4164733178654292, "grad_norm": 0.2933738335883093, "learning_rate": 6.342200896287378e-06, "loss": 0.0269, "step": 3663 }, { "epoch": 1.416860015467904, "grad_norm": 0.26303697189761194, "learning_rate": 6.340033555708145e-06, "loss": 0.023, "step": 3664 }, { "epoch": 1.417246713070379, "grad_norm": 0.2512911672751295, "learning_rate": 6.3378659438252055e-06, "loss": 0.0293, "step": 3665 }, { "epoch": 1.4176334106728539, "grad_norm": 0.2215780215643222, "learning_rate": 6.33569806107742e-06, "loss": 0.0249, "step": 3666 }, { "epoch": 1.4180201082753288, "grad_norm": 0.2269903573018994, "learning_rate": 6.333529907903695e-06, "loss": 0.0231, "step": 3667 }, { "epoch": 1.4184068058778037, "grad_norm": 0.3139388368814683, "learning_rate": 6.3313614847429975e-06, "loss": 0.03, "step": 3668 }, { "epoch": 1.4187935034802783, "grad_norm": 0.3310151741913759, "learning_rate": 6.329192792034347e-06, "loss": 0.0296, "step": 3669 }, { "epoch": 1.4191802010827532, "grad_norm": 0.2517512644413476, "learning_rate": 6.3270238302168176e-06, "loss": 0.0261, "step": 3670 }, { "epoch": 1.4195668986852281, "grad_norm": 0.24294245624272606, "learning_rate": 6.324854599729537e-06, "loss": 0.0207, "step": 3671 }, { "epoch": 1.419953596287703, "grad_norm": 0.4145251536963985, "learning_rate": 6.322685101011692e-06, "loss": 0.0321, "step": 3672 }, { "epoch": 1.420340293890178, "grad_norm": 0.4439905561646275, "learning_rate": 6.320515334502518e-06, "loss": 0.0338, "step": 3673 }, { "epoch": 1.4207269914926528, "grad_norm": 0.3145497202464342, "learning_rate": 6.318345300641305e-06, "loss": 0.0217, "step": 3674 }, { "epoch": 1.4211136890951277, "grad_norm": 0.45357119759749187, "learning_rate": 6.316174999867402e-06, "loss": 0.0439, "step": 3675 }, { "epoch": 1.4215003866976024, "grad_norm": 0.28732079858814275, "learning_rate": 6.314004432620207e-06, "loss": 0.036, "step": 3676 }, { "epoch": 1.4218870843000773, "grad_norm": 0.26953342493062854, "learning_rate": 6.311833599339175e-06, "loss": 0.0227, "step": 3677 }, { "epoch": 1.4222737819025522, "grad_norm": 0.2673702465828427, "learning_rate": 6.309662500463811e-06, "loss": 0.0307, "step": 3678 }, { "epoch": 1.422660479505027, "grad_norm": 0.3877096573682511, "learning_rate": 6.307491136433681e-06, "loss": 0.0314, "step": 3679 }, { "epoch": 1.423047177107502, "grad_norm": 0.3244728605808721, "learning_rate": 6.3053195076883965e-06, "loss": 0.0313, "step": 3680 }, { "epoch": 1.4234338747099768, "grad_norm": 0.23271875496502273, "learning_rate": 6.303147614667629e-06, "loss": 0.022, "step": 3681 }, { "epoch": 1.4238205723124517, "grad_norm": 0.2880414627849361, "learning_rate": 6.300975457811097e-06, "loss": 0.0244, "step": 3682 }, { "epoch": 1.4242072699149264, "grad_norm": 0.22826544042185085, "learning_rate": 6.298803037558581e-06, "loss": 0.0191, "step": 3683 }, { "epoch": 1.4245939675174015, "grad_norm": 0.2929857896782944, "learning_rate": 6.296630354349904e-06, "loss": 0.0322, "step": 3684 }, { "epoch": 1.4249806651198762, "grad_norm": 0.5857552044090328, "learning_rate": 6.2944574086249554e-06, "loss": 0.0363, "step": 3685 }, { "epoch": 1.425367362722351, "grad_norm": 0.28750722055320865, "learning_rate": 6.292284200823666e-06, "loss": 0.0288, "step": 3686 }, { "epoch": 1.425754060324826, "grad_norm": 0.30288718469849385, "learning_rate": 6.290110731386025e-06, "loss": 0.0298, "step": 3687 }, { "epoch": 1.4261407579273009, "grad_norm": 0.31733333214806686, "learning_rate": 6.2879370007520746e-06, "loss": 0.0307, "step": 3688 }, { "epoch": 1.4265274555297758, "grad_norm": 0.32224981368838196, "learning_rate": 6.285763009361908e-06, "loss": 0.032, "step": 3689 }, { "epoch": 1.4269141531322505, "grad_norm": 0.3614876267378102, "learning_rate": 6.283588757655674e-06, "loss": 0.0398, "step": 3690 }, { "epoch": 1.4273008507347256, "grad_norm": 0.29987518540465397, "learning_rate": 6.281414246073571e-06, "loss": 0.0257, "step": 3691 }, { "epoch": 1.4276875483372002, "grad_norm": 0.2861115240869688, "learning_rate": 6.279239475055853e-06, "loss": 0.0266, "step": 3692 }, { "epoch": 1.4280742459396751, "grad_norm": 0.24002674170365307, "learning_rate": 6.277064445042822e-06, "loss": 0.0243, "step": 3693 }, { "epoch": 1.42846094354215, "grad_norm": 0.36024115361869286, "learning_rate": 6.274889156474839e-06, "loss": 0.0397, "step": 3694 }, { "epoch": 1.428847641144625, "grad_norm": 0.33797804915696866, "learning_rate": 6.27271360979231e-06, "loss": 0.0331, "step": 3695 }, { "epoch": 1.4292343387470998, "grad_norm": 0.2255331292941151, "learning_rate": 6.270537805435701e-06, "loss": 0.0213, "step": 3696 }, { "epoch": 1.4296210363495747, "grad_norm": 0.2681368541238646, "learning_rate": 6.268361743845523e-06, "loss": 0.0192, "step": 3697 }, { "epoch": 1.4300077339520496, "grad_norm": 0.24812921264201793, "learning_rate": 6.2661854254623455e-06, "loss": 0.0254, "step": 3698 }, { "epoch": 1.4303944315545243, "grad_norm": 0.3765780050725673, "learning_rate": 6.264008850726784e-06, "loss": 0.0307, "step": 3699 }, { "epoch": 1.4307811291569992, "grad_norm": 0.23837262929503208, "learning_rate": 6.261832020079508e-06, "loss": 0.0269, "step": 3700 }, { "epoch": 1.431167826759474, "grad_norm": 0.3223445746952975, "learning_rate": 6.259654933961241e-06, "loss": 0.0291, "step": 3701 }, { "epoch": 1.431554524361949, "grad_norm": 0.3305808305950195, "learning_rate": 6.257477592812758e-06, "loss": 0.0357, "step": 3702 }, { "epoch": 1.4319412219644239, "grad_norm": 0.42777522311272, "learning_rate": 6.255299997074882e-06, "loss": 0.0359, "step": 3703 }, { "epoch": 1.4323279195668988, "grad_norm": 0.47886280939000336, "learning_rate": 6.2531221471884905e-06, "loss": 0.0352, "step": 3704 }, { "epoch": 1.4327146171693736, "grad_norm": 0.2692062618135738, "learning_rate": 6.250944043594512e-06, "loss": 0.0301, "step": 3705 }, { "epoch": 1.4331013147718483, "grad_norm": 0.2675388193950103, "learning_rate": 6.2487656867339266e-06, "loss": 0.0219, "step": 3706 }, { "epoch": 1.4334880123743232, "grad_norm": 0.3017561853411987, "learning_rate": 6.246587077047765e-06, "loss": 0.0343, "step": 3707 }, { "epoch": 1.4338747099767981, "grad_norm": 0.4393597491338117, "learning_rate": 6.244408214977109e-06, "loss": 0.0382, "step": 3708 }, { "epoch": 1.434261407579273, "grad_norm": 0.25102891752916945, "learning_rate": 6.242229100963092e-06, "loss": 0.0241, "step": 3709 }, { "epoch": 1.434648105181748, "grad_norm": 0.26784990469532977, "learning_rate": 6.240049735446898e-06, "loss": 0.0221, "step": 3710 }, { "epoch": 1.4350348027842228, "grad_norm": 0.2797602931277366, "learning_rate": 6.237870118869763e-06, "loss": 0.0235, "step": 3711 }, { "epoch": 1.4354215003866977, "grad_norm": 0.4058850062265914, "learning_rate": 6.2356902516729725e-06, "loss": 0.0314, "step": 3712 }, { "epoch": 1.4358081979891724, "grad_norm": 0.24095583999627568, "learning_rate": 6.2335101342978645e-06, "loss": 0.0214, "step": 3713 }, { "epoch": 1.4361948955916473, "grad_norm": 0.41027653546648135, "learning_rate": 6.231329767185823e-06, "loss": 0.0249, "step": 3714 }, { "epoch": 1.4365815931941222, "grad_norm": 0.29015059492308115, "learning_rate": 6.229149150778289e-06, "loss": 0.022, "step": 3715 }, { "epoch": 1.436968290796597, "grad_norm": 0.2795900655829145, "learning_rate": 6.226968285516749e-06, "loss": 0.0315, "step": 3716 }, { "epoch": 1.437354988399072, "grad_norm": 0.34897764138285275, "learning_rate": 6.224787171842745e-06, "loss": 0.0322, "step": 3717 }, { "epoch": 1.4377416860015468, "grad_norm": 0.3876681597588374, "learning_rate": 6.222605810197862e-06, "loss": 0.0254, "step": 3718 }, { "epoch": 1.4381283836040217, "grad_norm": 0.23209904766222528, "learning_rate": 6.220424201023741e-06, "loss": 0.0217, "step": 3719 }, { "epoch": 1.4385150812064964, "grad_norm": 0.35414022334183365, "learning_rate": 6.218242344762073e-06, "loss": 0.037, "step": 3720 }, { "epoch": 1.4389017788089715, "grad_norm": 0.3766558193945928, "learning_rate": 6.216060241854596e-06, "loss": 0.0455, "step": 3721 }, { "epoch": 1.4392884764114462, "grad_norm": 0.2962435310475221, "learning_rate": 6.2138778927431e-06, "loss": 0.0257, "step": 3722 }, { "epoch": 1.439675174013921, "grad_norm": 0.2670251142018387, "learning_rate": 6.211695297869422e-06, "loss": 0.0233, "step": 3723 }, { "epoch": 1.440061871616396, "grad_norm": 0.3246239117575488, "learning_rate": 6.209512457675454e-06, "loss": 0.0381, "step": 3724 }, { "epoch": 1.4404485692188709, "grad_norm": 0.2763529850836496, "learning_rate": 6.2073293726031324e-06, "loss": 0.0249, "step": 3725 }, { "epoch": 1.4408352668213458, "grad_norm": 0.38648244531492826, "learning_rate": 6.205146043094449e-06, "loss": 0.0328, "step": 3726 }, { "epoch": 1.4412219644238204, "grad_norm": 0.316182739207157, "learning_rate": 6.2029624695914355e-06, "loss": 0.0369, "step": 3727 }, { "epoch": 1.4416086620262956, "grad_norm": 0.3200063253436197, "learning_rate": 6.200778652536185e-06, "loss": 0.0331, "step": 3728 }, { "epoch": 1.4419953596287702, "grad_norm": 0.2536570406743533, "learning_rate": 6.1985945923708294e-06, "loss": 0.0291, "step": 3729 }, { "epoch": 1.4423820572312451, "grad_norm": 0.24735804341760267, "learning_rate": 6.196410289537558e-06, "loss": 0.0278, "step": 3730 }, { "epoch": 1.44276875483372, "grad_norm": 0.37056112127972785, "learning_rate": 6.1942257444786025e-06, "loss": 0.034, "step": 3731 }, { "epoch": 1.443155452436195, "grad_norm": 0.32781730006085963, "learning_rate": 6.192040957636251e-06, "loss": 0.038, "step": 3732 }, { "epoch": 1.4435421500386698, "grad_norm": 0.33789636880715285, "learning_rate": 6.18985592945283e-06, "loss": 0.0282, "step": 3733 }, { "epoch": 1.4439288476411445, "grad_norm": 0.2507348470731272, "learning_rate": 6.187670660370726e-06, "loss": 0.0276, "step": 3734 }, { "epoch": 1.4443155452436196, "grad_norm": 0.2966081665132611, "learning_rate": 6.1854851508323674e-06, "loss": 0.0298, "step": 3735 }, { "epoch": 1.4447022428460943, "grad_norm": 0.27865701905139273, "learning_rate": 6.183299401280235e-06, "loss": 0.0285, "step": 3736 }, { "epoch": 1.4450889404485692, "grad_norm": 0.3958706833718677, "learning_rate": 6.1811134121568535e-06, "loss": 0.0298, "step": 3737 }, { "epoch": 1.445475638051044, "grad_norm": 0.2513922892773463, "learning_rate": 6.1789271839048026e-06, "loss": 0.0198, "step": 3738 }, { "epoch": 1.445862335653519, "grad_norm": 0.49095823031361246, "learning_rate": 6.1767407169667055e-06, "loss": 0.0403, "step": 3739 }, { "epoch": 1.4462490332559939, "grad_norm": 0.3503808115565053, "learning_rate": 6.174554011785233e-06, "loss": 0.0293, "step": 3740 }, { "epoch": 1.4466357308584687, "grad_norm": 0.29854852129895004, "learning_rate": 6.17236706880311e-06, "loss": 0.0213, "step": 3741 }, { "epoch": 1.4470224284609436, "grad_norm": 0.31215325807236655, "learning_rate": 6.170179888463102e-06, "loss": 0.0305, "step": 3742 }, { "epoch": 1.4474091260634183, "grad_norm": 0.6167853353417337, "learning_rate": 6.167992471208031e-06, "loss": 0.0335, "step": 3743 }, { "epoch": 1.4477958236658932, "grad_norm": 0.33906523293429647, "learning_rate": 6.165804817480758e-06, "loss": 0.0317, "step": 3744 }, { "epoch": 1.448182521268368, "grad_norm": 0.3060408873323843, "learning_rate": 6.163616927724199e-06, "loss": 0.0345, "step": 3745 }, { "epoch": 1.448569218870843, "grad_norm": 0.4706142413735641, "learning_rate": 6.161428802381313e-06, "loss": 0.0321, "step": 3746 }, { "epoch": 1.448955916473318, "grad_norm": 0.19836265761513935, "learning_rate": 6.159240441895109e-06, "loss": 0.0221, "step": 3747 }, { "epoch": 1.4493426140757928, "grad_norm": 0.37277276367467826, "learning_rate": 6.157051846708644e-06, "loss": 0.0371, "step": 3748 }, { "epoch": 1.4497293116782677, "grad_norm": 0.3332850650133452, "learning_rate": 6.154863017265022e-06, "loss": 0.0346, "step": 3749 }, { "epoch": 1.4501160092807424, "grad_norm": 0.3967827002439958, "learning_rate": 6.1526739540073935e-06, "loss": 0.04, "step": 3750 }, { "epoch": 1.4505027068832173, "grad_norm": 0.27485919261573444, "learning_rate": 6.150484657378959e-06, "loss": 0.0262, "step": 3751 }, { "epoch": 1.4508894044856921, "grad_norm": 0.4497187600143756, "learning_rate": 6.148295127822962e-06, "loss": 0.047, "step": 3752 }, { "epoch": 1.451276102088167, "grad_norm": 0.5854880933940978, "learning_rate": 6.146105365782695e-06, "loss": 0.0478, "step": 3753 }, { "epoch": 1.451662799690642, "grad_norm": 0.31188607747188624, "learning_rate": 6.143915371701501e-06, "loss": 0.0366, "step": 3754 }, { "epoch": 1.4520494972931168, "grad_norm": 0.2530277286001239, "learning_rate": 6.1417251460227635e-06, "loss": 0.0231, "step": 3755 }, { "epoch": 1.4524361948955917, "grad_norm": 0.31921349414878214, "learning_rate": 6.13953468918992e-06, "loss": 0.0247, "step": 3756 }, { "epoch": 1.4528228924980664, "grad_norm": 0.32265311710507405, "learning_rate": 6.137344001646448e-06, "loss": 0.0338, "step": 3757 }, { "epoch": 1.4532095901005415, "grad_norm": 0.48805161286668636, "learning_rate": 6.135153083835879e-06, "loss": 0.0331, "step": 3758 }, { "epoch": 1.4535962877030162, "grad_norm": 0.3648180919569892, "learning_rate": 6.132961936201782e-06, "loss": 0.0405, "step": 3759 }, { "epoch": 1.453982985305491, "grad_norm": 0.2610215653406667, "learning_rate": 6.1307705591877815e-06, "loss": 0.0304, "step": 3760 }, { "epoch": 1.454369682907966, "grad_norm": 0.29186591058665096, "learning_rate": 6.128578953237543e-06, "loss": 0.0323, "step": 3761 }, { "epoch": 1.4547563805104409, "grad_norm": 0.332164133817088, "learning_rate": 6.12638711879478e-06, "loss": 0.0295, "step": 3762 }, { "epoch": 1.4551430781129158, "grad_norm": 0.3307589744327341, "learning_rate": 6.12419505630325e-06, "loss": 0.0358, "step": 3763 }, { "epoch": 1.4555297757153904, "grad_norm": 0.36392775798632165, "learning_rate": 6.122002766206764e-06, "loss": 0.0329, "step": 3764 }, { "epoch": 1.4559164733178656, "grad_norm": 0.33297694346419154, "learning_rate": 6.119810248949169e-06, "loss": 0.0332, "step": 3765 }, { "epoch": 1.4563031709203402, "grad_norm": 0.2215115874208359, "learning_rate": 6.117617504974367e-06, "loss": 0.026, "step": 3766 }, { "epoch": 1.4566898685228151, "grad_norm": 0.24692082601983115, "learning_rate": 6.115424534726296e-06, "loss": 0.0257, "step": 3767 }, { "epoch": 1.45707656612529, "grad_norm": 0.2731334708989129, "learning_rate": 6.113231338648952e-06, "loss": 0.0268, "step": 3768 }, { "epoch": 1.457463263727765, "grad_norm": 0.3637777463611669, "learning_rate": 6.111037917186366e-06, "loss": 0.0323, "step": 3769 }, { "epoch": 1.4578499613302398, "grad_norm": 0.2830605190404462, "learning_rate": 6.108844270782621e-06, "loss": 0.0317, "step": 3770 }, { "epoch": 1.4582366589327145, "grad_norm": 0.22061588485115113, "learning_rate": 6.1066503998818414e-06, "loss": 0.0239, "step": 3771 }, { "epoch": 1.4586233565351896, "grad_norm": 0.22686926623044773, "learning_rate": 6.104456304928202e-06, "loss": 0.0284, "step": 3772 }, { "epoch": 1.4590100541376643, "grad_norm": 0.3442178626417675, "learning_rate": 6.102261986365918e-06, "loss": 0.0329, "step": 3773 }, { "epoch": 1.4593967517401392, "grad_norm": 0.3282463995959652, "learning_rate": 6.100067444639253e-06, "loss": 0.0233, "step": 3774 }, { "epoch": 1.459783449342614, "grad_norm": 0.32817597571146156, "learning_rate": 6.097872680192515e-06, "loss": 0.031, "step": 3775 }, { "epoch": 1.460170146945089, "grad_norm": 0.35273425747223297, "learning_rate": 6.095677693470054e-06, "loss": 0.0308, "step": 3776 }, { "epoch": 1.4605568445475638, "grad_norm": 0.25796392674026114, "learning_rate": 6.0934824849162726e-06, "loss": 0.0268, "step": 3777 }, { "epoch": 1.4609435421500387, "grad_norm": 0.29390778695661507, "learning_rate": 6.091287054975609e-06, "loss": 0.0326, "step": 3778 }, { "epoch": 1.4613302397525136, "grad_norm": 0.27384734926029114, "learning_rate": 6.089091404092555e-06, "loss": 0.0241, "step": 3779 }, { "epoch": 1.4617169373549883, "grad_norm": 0.29580270300300354, "learning_rate": 6.0868955327116385e-06, "loss": 0.025, "step": 3780 }, { "epoch": 1.4621036349574632, "grad_norm": 0.24520257878790994, "learning_rate": 6.084699441277441e-06, "loss": 0.0234, "step": 3781 }, { "epoch": 1.462490332559938, "grad_norm": 0.2654229035437446, "learning_rate": 6.08250313023458e-06, "loss": 0.0272, "step": 3782 }, { "epoch": 1.462877030162413, "grad_norm": 0.3034360132738818, "learning_rate": 6.080306600027724e-06, "loss": 0.0379, "step": 3783 }, { "epoch": 1.463263727764888, "grad_norm": 0.2192444658616074, "learning_rate": 6.078109851101581e-06, "loss": 0.0222, "step": 3784 }, { "epoch": 1.4636504253673628, "grad_norm": 0.3112539598639267, "learning_rate": 6.0759128839009084e-06, "loss": 0.0269, "step": 3785 }, { "epoch": 1.4640371229698377, "grad_norm": 0.4128016766763906, "learning_rate": 6.073715698870503e-06, "loss": 0.0353, "step": 3786 }, { "epoch": 1.4644238205723124, "grad_norm": 0.31672277683099265, "learning_rate": 6.07151829645521e-06, "loss": 0.0217, "step": 3787 }, { "epoch": 1.4648105181747872, "grad_norm": 0.26629452404849036, "learning_rate": 6.0693206770999135e-06, "loss": 0.0234, "step": 3788 }, { "epoch": 1.4651972157772621, "grad_norm": 0.33401089942022655, "learning_rate": 6.067122841249545e-06, "loss": 0.0326, "step": 3789 }, { "epoch": 1.465583913379737, "grad_norm": 0.30765036541991686, "learning_rate": 6.064924789349081e-06, "loss": 0.0282, "step": 3790 }, { "epoch": 1.465970610982212, "grad_norm": 0.2836891442617674, "learning_rate": 6.0627265218435385e-06, "loss": 0.0256, "step": 3791 }, { "epoch": 1.4663573085846868, "grad_norm": 0.3366466468212567, "learning_rate": 6.0605280391779815e-06, "loss": 0.0269, "step": 3792 }, { "epoch": 1.4667440061871617, "grad_norm": 0.8772211385922812, "learning_rate": 6.058329341797512e-06, "loss": 0.0385, "step": 3793 }, { "epoch": 1.4671307037896364, "grad_norm": 0.2321154673353928, "learning_rate": 6.056130430147282e-06, "loss": 0.0202, "step": 3794 }, { "epoch": 1.4675174013921113, "grad_norm": 0.3183939947190619, "learning_rate": 6.053931304672481e-06, "loss": 0.0278, "step": 3795 }, { "epoch": 1.4679040989945862, "grad_norm": 0.5675462721798663, "learning_rate": 6.05173196581835e-06, "loss": 0.028, "step": 3796 }, { "epoch": 1.468290796597061, "grad_norm": 0.3388338153340702, "learning_rate": 6.0495324140301615e-06, "loss": 0.0329, "step": 3797 }, { "epoch": 1.468677494199536, "grad_norm": 0.38483268749960525, "learning_rate": 6.047332649753243e-06, "loss": 0.027, "step": 3798 }, { "epoch": 1.4690641918020109, "grad_norm": 0.31330470412024625, "learning_rate": 6.045132673432958e-06, "loss": 0.0431, "step": 3799 }, { "epoch": 1.4694508894044858, "grad_norm": 0.32030913401288896, "learning_rate": 6.042932485514713e-06, "loss": 0.0312, "step": 3800 }, { "epoch": 1.4698375870069604, "grad_norm": 0.2067802705425949, "learning_rate": 6.04073208644396e-06, "loss": 0.0188, "step": 3801 }, { "epoch": 1.4702242846094355, "grad_norm": 0.24145110804651138, "learning_rate": 6.038531476666193e-06, "loss": 0.0254, "step": 3802 }, { "epoch": 1.4706109822119102, "grad_norm": 0.25804296320148734, "learning_rate": 6.036330656626948e-06, "loss": 0.0246, "step": 3803 }, { "epoch": 1.4709976798143851, "grad_norm": 0.28684992075736704, "learning_rate": 6.034129626771803e-06, "loss": 0.024, "step": 3804 }, { "epoch": 1.47138437741686, "grad_norm": 0.3016686819111701, "learning_rate": 6.031928387546381e-06, "loss": 0.0346, "step": 3805 }, { "epoch": 1.471771075019335, "grad_norm": 0.36456061516932825, "learning_rate": 6.029726939396343e-06, "loss": 0.039, "step": 3806 }, { "epoch": 1.4721577726218098, "grad_norm": 0.31385672692353694, "learning_rate": 6.0275252827673995e-06, "loss": 0.0362, "step": 3807 }, { "epoch": 1.4725444702242845, "grad_norm": 0.43277178671559496, "learning_rate": 6.0253234181052936e-06, "loss": 0.0331, "step": 3808 }, { "epoch": 1.4729311678267596, "grad_norm": 0.3091482263017455, "learning_rate": 6.023121345855819e-06, "loss": 0.0379, "step": 3809 }, { "epoch": 1.4733178654292343, "grad_norm": 0.2626621948984504, "learning_rate": 6.020919066464807e-06, "loss": 0.0331, "step": 3810 }, { "epoch": 1.4737045630317092, "grad_norm": 0.2259374319799097, "learning_rate": 6.018716580378133e-06, "loss": 0.0211, "step": 3811 }, { "epoch": 1.474091260634184, "grad_norm": 0.24087351086976344, "learning_rate": 6.016513888041713e-06, "loss": 0.0214, "step": 3812 }, { "epoch": 1.474477958236659, "grad_norm": 0.284812042534697, "learning_rate": 6.014310989901504e-06, "loss": 0.0312, "step": 3813 }, { "epoch": 1.4748646558391338, "grad_norm": 0.44201012604822987, "learning_rate": 6.012107886403504e-06, "loss": 0.0426, "step": 3814 }, { "epoch": 1.4752513534416087, "grad_norm": 0.9652125025577564, "learning_rate": 6.009904577993758e-06, "loss": 0.0322, "step": 3815 }, { "epoch": 1.4756380510440836, "grad_norm": 0.26093875555610163, "learning_rate": 6.007701065118346e-06, "loss": 0.0308, "step": 3816 }, { "epoch": 1.4760247486465583, "grad_norm": 0.3020342262386558, "learning_rate": 6.005497348223391e-06, "loss": 0.0177, "step": 3817 }, { "epoch": 1.4764114462490332, "grad_norm": 0.27838398505775946, "learning_rate": 6.003293427755061e-06, "loss": 0.0205, "step": 3818 }, { "epoch": 1.476798143851508, "grad_norm": 0.4620235392729423, "learning_rate": 6.001089304159564e-06, "loss": 0.0397, "step": 3819 }, { "epoch": 1.477184841453983, "grad_norm": 0.2788990652442548, "learning_rate": 5.998884977883144e-06, "loss": 0.0213, "step": 3820 }, { "epoch": 1.4775715390564579, "grad_norm": 0.3246178577224618, "learning_rate": 5.99668044937209e-06, "loss": 0.021, "step": 3821 }, { "epoch": 1.4779582366589328, "grad_norm": 0.25751712770636537, "learning_rate": 5.994475719072734e-06, "loss": 0.0245, "step": 3822 }, { "epoch": 1.4783449342614077, "grad_norm": 0.2648871822034266, "learning_rate": 5.992270787431445e-06, "loss": 0.0235, "step": 3823 }, { "epoch": 1.4787316318638823, "grad_norm": 0.3007620780662229, "learning_rate": 5.990065654894634e-06, "loss": 0.0238, "step": 3824 }, { "epoch": 1.4791183294663572, "grad_norm": 0.3604584029839664, "learning_rate": 5.987860321908755e-06, "loss": 0.0282, "step": 3825 }, { "epoch": 1.4795050270688321, "grad_norm": 0.2985243042722365, "learning_rate": 5.985654788920299e-06, "loss": 0.029, "step": 3826 }, { "epoch": 1.479891724671307, "grad_norm": 0.36198215531832395, "learning_rate": 5.983449056375798e-06, "loss": 0.03, "step": 3827 }, { "epoch": 1.480278422273782, "grad_norm": 0.3859449125168881, "learning_rate": 5.9812431247218285e-06, "loss": 0.0356, "step": 3828 }, { "epoch": 1.4806651198762568, "grad_norm": 0.7491538652084541, "learning_rate": 5.9790369944049995e-06, "loss": 0.0265, "step": 3829 }, { "epoch": 1.4810518174787317, "grad_norm": 0.24333486675374258, "learning_rate": 5.976830665871971e-06, "loss": 0.0199, "step": 3830 }, { "epoch": 1.4814385150812064, "grad_norm": 0.25341132192469745, "learning_rate": 5.974624139569431e-06, "loss": 0.0296, "step": 3831 }, { "epoch": 1.4818252126836813, "grad_norm": 0.6044033211755927, "learning_rate": 5.972417415944121e-06, "loss": 0.0295, "step": 3832 }, { "epoch": 1.4822119102861562, "grad_norm": 0.32270716805712774, "learning_rate": 5.970210495442807e-06, "loss": 0.0258, "step": 3833 }, { "epoch": 1.482598607888631, "grad_norm": 0.4067349583735926, "learning_rate": 5.968003378512308e-06, "loss": 0.0474, "step": 3834 }, { "epoch": 1.482985305491106, "grad_norm": 0.26609204804685266, "learning_rate": 5.965796065599476e-06, "loss": 0.0229, "step": 3835 }, { "epoch": 1.4833720030935809, "grad_norm": 0.2903782567182381, "learning_rate": 5.963588557151205e-06, "loss": 0.0316, "step": 3836 }, { "epoch": 1.4837587006960558, "grad_norm": 0.2431125580324654, "learning_rate": 5.961380853614428e-06, "loss": 0.0211, "step": 3837 }, { "epoch": 1.4841453982985304, "grad_norm": 0.3021268631050198, "learning_rate": 5.9591729554361185e-06, "loss": 0.0319, "step": 3838 }, { "epoch": 1.4845320959010055, "grad_norm": 0.4995883590726827, "learning_rate": 5.956964863063287e-06, "loss": 0.0399, "step": 3839 }, { "epoch": 1.4849187935034802, "grad_norm": 0.2739933834542241, "learning_rate": 5.954756576942984e-06, "loss": 0.0309, "step": 3840 }, { "epoch": 1.4853054911059551, "grad_norm": 0.3035447166163289, "learning_rate": 5.952548097522301e-06, "loss": 0.0371, "step": 3841 }, { "epoch": 1.48569218870843, "grad_norm": 0.24624759955953068, "learning_rate": 5.950339425248367e-06, "loss": 0.0243, "step": 3842 }, { "epoch": 1.486078886310905, "grad_norm": 0.2799828830655836, "learning_rate": 5.948130560568353e-06, "loss": 0.0255, "step": 3843 }, { "epoch": 1.4864655839133798, "grad_norm": 0.48095302751047136, "learning_rate": 5.9459215039294635e-06, "loss": 0.0334, "step": 3844 }, { "epoch": 1.4868522815158545, "grad_norm": 0.2969415124683762, "learning_rate": 5.943712255778948e-06, "loss": 0.0234, "step": 3845 }, { "epoch": 1.4872389791183296, "grad_norm": 0.2497385526094989, "learning_rate": 5.941502816564087e-06, "loss": 0.0155, "step": 3846 }, { "epoch": 1.4876256767208043, "grad_norm": 0.3042325674483668, "learning_rate": 5.9392931867322105e-06, "loss": 0.0277, "step": 3847 }, { "epoch": 1.4880123743232792, "grad_norm": 0.23720999881517765, "learning_rate": 5.937083366730675e-06, "loss": 0.0278, "step": 3848 }, { "epoch": 1.488399071925754, "grad_norm": 0.45236857936566766, "learning_rate": 5.934873357006886e-06, "loss": 0.0292, "step": 3849 }, { "epoch": 1.488785769528229, "grad_norm": 0.43242855024442073, "learning_rate": 5.93266315800828e-06, "loss": 0.0419, "step": 3850 }, { "epoch": 1.4891724671307038, "grad_norm": 0.2880259638585464, "learning_rate": 5.930452770182338e-06, "loss": 0.0324, "step": 3851 }, { "epoch": 1.4895591647331787, "grad_norm": 0.38959027494308246, "learning_rate": 5.928242193976573e-06, "loss": 0.0419, "step": 3852 }, { "epoch": 1.4899458623356536, "grad_norm": 0.4616488935334508, "learning_rate": 5.926031429838539e-06, "loss": 0.0311, "step": 3853 }, { "epoch": 1.4903325599381283, "grad_norm": 0.22994806227653178, "learning_rate": 5.923820478215831e-06, "loss": 0.0271, "step": 3854 }, { "epoch": 1.4907192575406032, "grad_norm": 0.36222379920194203, "learning_rate": 5.921609339556076e-06, "loss": 0.0335, "step": 3855 }, { "epoch": 1.491105955143078, "grad_norm": 0.2879221383355738, "learning_rate": 5.919398014306943e-06, "loss": 0.03, "step": 3856 }, { "epoch": 1.491492652745553, "grad_norm": 0.34711909510593714, "learning_rate": 5.9171865029161375e-06, "loss": 0.0306, "step": 3857 }, { "epoch": 1.4918793503480279, "grad_norm": 0.2861803647209031, "learning_rate": 5.914974805831406e-06, "loss": 0.0211, "step": 3858 }, { "epoch": 1.4922660479505028, "grad_norm": 0.3773416853168393, "learning_rate": 5.912762923500526e-06, "loss": 0.0321, "step": 3859 }, { "epoch": 1.4926527455529777, "grad_norm": 0.3349912508800418, "learning_rate": 5.910550856371317e-06, "loss": 0.0412, "step": 3860 }, { "epoch": 1.4930394431554523, "grad_norm": 0.2604038815961738, "learning_rate": 5.908338604891634e-06, "loss": 0.0222, "step": 3861 }, { "epoch": 1.4934261407579272, "grad_norm": 0.4214833725548914, "learning_rate": 5.906126169509372e-06, "loss": 0.041, "step": 3862 }, { "epoch": 1.4938128383604021, "grad_norm": 0.3562604264669138, "learning_rate": 5.90391355067246e-06, "loss": 0.0342, "step": 3863 }, { "epoch": 1.494199535962877, "grad_norm": 0.255193920138381, "learning_rate": 5.901700748828868e-06, "loss": 0.0225, "step": 3864 }, { "epoch": 1.494586233565352, "grad_norm": 0.2686593373201067, "learning_rate": 5.899487764426598e-06, "loss": 0.0215, "step": 3865 }, { "epoch": 1.4949729311678268, "grad_norm": 0.2932479275414112, "learning_rate": 5.897274597913694e-06, "loss": 0.0272, "step": 3866 }, { "epoch": 1.4953596287703017, "grad_norm": 0.30820895431254197, "learning_rate": 5.895061249738232e-06, "loss": 0.0318, "step": 3867 }, { "epoch": 1.4957463263727764, "grad_norm": 0.3067300718352519, "learning_rate": 5.892847720348329e-06, "loss": 0.0281, "step": 3868 }, { "epoch": 1.4961330239752513, "grad_norm": 0.29042295354972253, "learning_rate": 5.890634010192135e-06, "loss": 0.0314, "step": 3869 }, { "epoch": 1.4965197215777262, "grad_norm": 0.3470418875352763, "learning_rate": 5.8884201197178435e-06, "loss": 0.0333, "step": 3870 }, { "epoch": 1.496906419180201, "grad_norm": 0.46982221244372946, "learning_rate": 5.8862060493736736e-06, "loss": 0.0362, "step": 3871 }, { "epoch": 1.497293116782676, "grad_norm": 0.24761956653386258, "learning_rate": 5.883991799607891e-06, "loss": 0.0263, "step": 3872 }, { "epoch": 1.4976798143851509, "grad_norm": 0.3839258814794831, "learning_rate": 5.881777370868793e-06, "loss": 0.0305, "step": 3873 }, { "epoch": 1.4980665119876257, "grad_norm": 0.3941559317933134, "learning_rate": 5.8795627636047105e-06, "loss": 0.0339, "step": 3874 }, { "epoch": 1.4984532095901004, "grad_norm": 0.265641559526354, "learning_rate": 5.8773479782640174e-06, "loss": 0.0276, "step": 3875 }, { "epoch": 1.4988399071925755, "grad_norm": 0.2717489359489246, "learning_rate": 5.875133015295118e-06, "loss": 0.0237, "step": 3876 }, { "epoch": 1.4992266047950502, "grad_norm": 0.32737265009326694, "learning_rate": 5.872917875146457e-06, "loss": 0.0348, "step": 3877 }, { "epoch": 1.499613302397525, "grad_norm": 0.4000839446334012, "learning_rate": 5.870702558266508e-06, "loss": 0.0377, "step": 3878 }, { "epoch": 1.5, "grad_norm": 0.2547216695561455, "learning_rate": 5.868487065103791e-06, "loss": 0.025, "step": 3879 }, { "epoch": 1.500386697602475, "grad_norm": 0.3051414178253609, "learning_rate": 5.866271396106848e-06, "loss": 0.0305, "step": 3880 }, { "epoch": 1.5007733952049498, "grad_norm": 0.28298203603297484, "learning_rate": 5.864055551724271e-06, "loss": 0.0306, "step": 3881 }, { "epoch": 1.5011600928074245, "grad_norm": 0.4311638387740667, "learning_rate": 5.861839532404679e-06, "loss": 0.0392, "step": 3882 }, { "epoch": 1.5015467904098996, "grad_norm": 0.22241587609821203, "learning_rate": 5.859623338596725e-06, "loss": 0.0149, "step": 3883 }, { "epoch": 1.5019334880123743, "grad_norm": 0.26466833901988307, "learning_rate": 5.857406970749102e-06, "loss": 0.0259, "step": 3884 }, { "epoch": 1.5023201856148491, "grad_norm": 0.3561382342850067, "learning_rate": 5.85519042931054e-06, "loss": 0.0358, "step": 3885 }, { "epoch": 1.502706883217324, "grad_norm": 0.27917162805842183, "learning_rate": 5.8529737147297975e-06, "loss": 0.0227, "step": 3886 }, { "epoch": 1.503093580819799, "grad_norm": 0.43512152296380013, "learning_rate": 5.85075682745567e-06, "loss": 0.0345, "step": 3887 }, { "epoch": 1.5034802784222738, "grad_norm": 0.2715070699254474, "learning_rate": 5.848539767936994e-06, "loss": 0.0266, "step": 3888 }, { "epoch": 1.5038669760247485, "grad_norm": 0.2513950839033799, "learning_rate": 5.846322536622631e-06, "loss": 0.0238, "step": 3889 }, { "epoch": 1.5042536736272236, "grad_norm": 0.2726008431390729, "learning_rate": 5.844105133961486e-06, "loss": 0.0278, "step": 3890 }, { "epoch": 1.5046403712296983, "grad_norm": 0.3067225366371527, "learning_rate": 5.841887560402495e-06, "loss": 0.0222, "step": 3891 }, { "epoch": 1.5050270688321732, "grad_norm": 0.3090493550367741, "learning_rate": 5.839669816394628e-06, "loss": 0.0347, "step": 3892 }, { "epoch": 1.505413766434648, "grad_norm": 0.421293905787687, "learning_rate": 5.837451902386889e-06, "loss": 0.0315, "step": 3893 }, { "epoch": 1.505800464037123, "grad_norm": 0.24463101285434907, "learning_rate": 5.835233818828321e-06, "loss": 0.0249, "step": 3894 }, { "epoch": 1.5061871616395979, "grad_norm": 0.22832631275661328, "learning_rate": 5.833015566167994e-06, "loss": 0.028, "step": 3895 }, { "epoch": 1.5065738592420725, "grad_norm": 0.2711671105707122, "learning_rate": 5.83079714485502e-06, "loss": 0.0261, "step": 3896 }, { "epoch": 1.5069605568445477, "grad_norm": 0.306711968753884, "learning_rate": 5.8285785553385385e-06, "loss": 0.0325, "step": 3897 }, { "epoch": 1.5073472544470223, "grad_norm": 0.22636437759110803, "learning_rate": 5.826359798067729e-06, "loss": 0.0201, "step": 3898 }, { "epoch": 1.5077339520494975, "grad_norm": 0.2948846231884141, "learning_rate": 5.824140873491799e-06, "loss": 0.0254, "step": 3899 }, { "epoch": 1.5081206496519721, "grad_norm": 0.2012574895415717, "learning_rate": 5.821921782059995e-06, "loss": 0.0214, "step": 3900 }, { "epoch": 1.508507347254447, "grad_norm": 0.3712437269417801, "learning_rate": 5.819702524221592e-06, "loss": 0.0318, "step": 3901 }, { "epoch": 1.508894044856922, "grad_norm": 0.2665240139513778, "learning_rate": 5.817483100425906e-06, "loss": 0.0233, "step": 3902 }, { "epoch": 1.5092807424593968, "grad_norm": 0.21801370649470694, "learning_rate": 5.815263511122279e-06, "loss": 0.0216, "step": 3903 }, { "epoch": 1.5096674400618717, "grad_norm": 0.5339390871513646, "learning_rate": 5.813043756760092e-06, "loss": 0.0421, "step": 3904 }, { "epoch": 1.5100541376643464, "grad_norm": 0.26284050543882165, "learning_rate": 5.810823837788756e-06, "loss": 0.0323, "step": 3905 }, { "epoch": 1.5104408352668215, "grad_norm": 0.33934905421067774, "learning_rate": 5.8086037546577165e-06, "loss": 0.0387, "step": 3906 }, { "epoch": 1.5108275328692962, "grad_norm": 0.3527685322827616, "learning_rate": 5.806383507816455e-06, "loss": 0.0381, "step": 3907 }, { "epoch": 1.511214230471771, "grad_norm": 0.26029374756830737, "learning_rate": 5.804163097714479e-06, "loss": 0.0277, "step": 3908 }, { "epoch": 1.511600928074246, "grad_norm": 0.29683764035034593, "learning_rate": 5.801942524801337e-06, "loss": 0.0231, "step": 3909 }, { "epoch": 1.5119876256767208, "grad_norm": 0.4036222215697481, "learning_rate": 5.799721789526607e-06, "loss": 0.0393, "step": 3910 }, { "epoch": 1.5123743232791957, "grad_norm": 0.2975674212278139, "learning_rate": 5.797500892339899e-06, "loss": 0.0319, "step": 3911 }, { "epoch": 1.5127610208816704, "grad_norm": 0.36277929021857663, "learning_rate": 5.795279833690856e-06, "loss": 0.0262, "step": 3912 }, { "epoch": 1.5131477184841455, "grad_norm": 0.23814357733984784, "learning_rate": 5.793058614029157e-06, "loss": 0.0228, "step": 3913 }, { "epoch": 1.5135344160866202, "grad_norm": 0.2887954307306446, "learning_rate": 5.790837233804506e-06, "loss": 0.0354, "step": 3914 }, { "epoch": 1.513921113689095, "grad_norm": 0.2341137804042859, "learning_rate": 5.788615693466649e-06, "loss": 0.0191, "step": 3915 }, { "epoch": 1.51430781129157, "grad_norm": 0.2640310678119296, "learning_rate": 5.786393993465359e-06, "loss": 0.0247, "step": 3916 }, { "epoch": 1.5146945088940449, "grad_norm": 0.4593772815217929, "learning_rate": 5.784172134250442e-06, "loss": 0.0288, "step": 3917 }, { "epoch": 1.5150812064965198, "grad_norm": 0.3751142692708482, "learning_rate": 5.781950116271735e-06, "loss": 0.029, "step": 3918 }, { "epoch": 1.5154679040989945, "grad_norm": 0.3662529523761503, "learning_rate": 5.7797279399791105e-06, "loss": 0.0328, "step": 3919 }, { "epoch": 1.5158546017014696, "grad_norm": 0.23225878693084448, "learning_rate": 5.77750560582247e-06, "loss": 0.0216, "step": 3920 }, { "epoch": 1.5162412993039442, "grad_norm": 0.23676276527773463, "learning_rate": 5.775283114251748e-06, "loss": 0.0302, "step": 3921 }, { "epoch": 1.5166279969064191, "grad_norm": 0.28660870160325247, "learning_rate": 5.773060465716913e-06, "loss": 0.0223, "step": 3922 }, { "epoch": 1.517014694508894, "grad_norm": 0.3421546370793801, "learning_rate": 5.7708376606679606e-06, "loss": 0.0276, "step": 3923 }, { "epoch": 1.517401392111369, "grad_norm": 0.35047386472507464, "learning_rate": 5.768614699554923e-06, "loss": 0.0288, "step": 3924 }, { "epoch": 1.5177880897138438, "grad_norm": 0.32483999920146434, "learning_rate": 5.766391582827862e-06, "loss": 0.0304, "step": 3925 }, { "epoch": 1.5181747873163185, "grad_norm": 0.34201003763870913, "learning_rate": 5.764168310936867e-06, "loss": 0.0387, "step": 3926 }, { "epoch": 1.5185614849187936, "grad_norm": 0.2802503860039152, "learning_rate": 5.7619448843320645e-06, "loss": 0.0266, "step": 3927 }, { "epoch": 1.5189481825212683, "grad_norm": 0.3428647321776678, "learning_rate": 5.7597213034636124e-06, "loss": 0.0357, "step": 3928 }, { "epoch": 1.5193348801237432, "grad_norm": 0.20806196300381358, "learning_rate": 5.7574975687816945e-06, "loss": 0.0184, "step": 3929 }, { "epoch": 1.519721577726218, "grad_norm": 0.17814089050693535, "learning_rate": 5.7552736807365324e-06, "loss": 0.0171, "step": 3930 }, { "epoch": 1.520108275328693, "grad_norm": 0.35517567261389116, "learning_rate": 5.753049639778372e-06, "loss": 0.0277, "step": 3931 }, { "epoch": 1.5204949729311679, "grad_norm": 0.30124197874025826, "learning_rate": 5.7508254463574964e-06, "loss": 0.035, "step": 3932 }, { "epoch": 1.5208816705336425, "grad_norm": 0.2953794673399029, "learning_rate": 5.7486011009242125e-06, "loss": 0.0335, "step": 3933 }, { "epoch": 1.5212683681361177, "grad_norm": 0.19965178620303178, "learning_rate": 5.746376603928867e-06, "loss": 0.0175, "step": 3934 }, { "epoch": 1.5216550657385923, "grad_norm": 0.2590836480255252, "learning_rate": 5.744151955821828e-06, "loss": 0.0212, "step": 3935 }, { "epoch": 1.5220417633410674, "grad_norm": 0.22705900142414506, "learning_rate": 5.741927157053503e-06, "loss": 0.0247, "step": 3936 }, { "epoch": 1.5224284609435421, "grad_norm": 0.336185973883868, "learning_rate": 5.739702208074321e-06, "loss": 0.0233, "step": 3937 }, { "epoch": 1.522815158546017, "grad_norm": 0.3502342787319848, "learning_rate": 5.73747710933475e-06, "loss": 0.0262, "step": 3938 }, { "epoch": 1.523201856148492, "grad_norm": 0.2419591258450098, "learning_rate": 5.735251861285284e-06, "loss": 0.0226, "step": 3939 }, { "epoch": 1.5235885537509666, "grad_norm": 0.29684514554247254, "learning_rate": 5.733026464376443e-06, "loss": 0.0288, "step": 3940 }, { "epoch": 1.5239752513534417, "grad_norm": 0.3951712440147616, "learning_rate": 5.730800919058787e-06, "loss": 0.0338, "step": 3941 }, { "epoch": 1.5243619489559164, "grad_norm": 0.5161388065630526, "learning_rate": 5.728575225782897e-06, "loss": 0.0261, "step": 3942 }, { "epoch": 1.5247486465583915, "grad_norm": 0.2219044721405727, "learning_rate": 5.726349384999392e-06, "loss": 0.021, "step": 3943 }, { "epoch": 1.5251353441608662, "grad_norm": 0.25670644085297234, "learning_rate": 5.724123397158912e-06, "loss": 0.0285, "step": 3944 }, { "epoch": 1.525522041763341, "grad_norm": 0.7680237515173868, "learning_rate": 5.721897262712136e-06, "loss": 0.03, "step": 3945 }, { "epoch": 1.525908739365816, "grad_norm": 0.2715050334880077, "learning_rate": 5.719670982109763e-06, "loss": 0.0282, "step": 3946 }, { "epoch": 1.5262954369682908, "grad_norm": 0.2587250640069574, "learning_rate": 5.7174445558025295e-06, "loss": 0.0279, "step": 3947 }, { "epoch": 1.5266821345707657, "grad_norm": 0.2757225482903144, "learning_rate": 5.715217984241197e-06, "loss": 0.0314, "step": 3948 }, { "epoch": 1.5270688321732404, "grad_norm": 0.5283484561835657, "learning_rate": 5.7129912678765605e-06, "loss": 0.0305, "step": 3949 }, { "epoch": 1.5274555297757155, "grad_norm": 0.40568154308542614, "learning_rate": 5.71076440715944e-06, "loss": 0.0353, "step": 3950 }, { "epoch": 1.5278422273781902, "grad_norm": 0.23097354974254292, "learning_rate": 5.708537402540688e-06, "loss": 0.0204, "step": 3951 }, { "epoch": 1.528228924980665, "grad_norm": 0.3134955814479561, "learning_rate": 5.706310254471183e-06, "loss": 0.0323, "step": 3952 }, { "epoch": 1.52861562258314, "grad_norm": 0.2245134987161901, "learning_rate": 5.704082963401834e-06, "loss": 0.0182, "step": 3953 }, { "epoch": 1.5290023201856149, "grad_norm": 0.30009888426767195, "learning_rate": 5.701855529783582e-06, "loss": 0.0258, "step": 3954 }, { "epoch": 1.5293890177880898, "grad_norm": 0.3582619387786139, "learning_rate": 5.699627954067392e-06, "loss": 0.0235, "step": 3955 }, { "epoch": 1.5297757153905645, "grad_norm": 0.30714311948809736, "learning_rate": 5.697400236704259e-06, "loss": 0.0321, "step": 3956 }, { "epoch": 1.5301624129930396, "grad_norm": 0.3454958597795667, "learning_rate": 5.695172378145208e-06, "loss": 0.0256, "step": 3957 }, { "epoch": 1.5305491105955142, "grad_norm": 0.30500492750753116, "learning_rate": 5.692944378841296e-06, "loss": 0.0302, "step": 3958 }, { "epoch": 1.5309358081979891, "grad_norm": 0.30990432240840743, "learning_rate": 5.690716239243599e-06, "loss": 0.0239, "step": 3959 }, { "epoch": 1.531322505800464, "grad_norm": 0.3595420408931245, "learning_rate": 5.688487959803229e-06, "loss": 0.041, "step": 3960 }, { "epoch": 1.531709203402939, "grad_norm": 0.27340444646267703, "learning_rate": 5.686259540971325e-06, "loss": 0.0287, "step": 3961 }, { "epoch": 1.5320959010054138, "grad_norm": 0.2567890537511986, "learning_rate": 5.684030983199054e-06, "loss": 0.0232, "step": 3962 }, { "epoch": 1.5324825986078885, "grad_norm": 0.3177323155344485, "learning_rate": 5.6818022869376074e-06, "loss": 0.0289, "step": 3963 }, { "epoch": 1.5328692962103636, "grad_norm": 0.38057715282110544, "learning_rate": 5.679573452638212e-06, "loss": 0.0296, "step": 3964 }, { "epoch": 1.5332559938128383, "grad_norm": 0.37303623574966666, "learning_rate": 5.677344480752116e-06, "loss": 0.0298, "step": 3965 }, { "epoch": 1.5336426914153132, "grad_norm": 0.32584440085782884, "learning_rate": 5.6751153717306e-06, "loss": 0.0272, "step": 3966 }, { "epoch": 1.534029389017788, "grad_norm": 0.5316875105678273, "learning_rate": 5.6728861260249665e-06, "loss": 0.0382, "step": 3967 }, { "epoch": 1.534416086620263, "grad_norm": 0.3488028663112968, "learning_rate": 5.670656744086554e-06, "loss": 0.0331, "step": 3968 }, { "epoch": 1.5348027842227379, "grad_norm": 0.29553369766926124, "learning_rate": 5.66842722636672e-06, "loss": 0.0197, "step": 3969 }, { "epoch": 1.5351894818252125, "grad_norm": 0.2885562669708755, "learning_rate": 5.666197573316856e-06, "loss": 0.0307, "step": 3970 }, { "epoch": 1.5355761794276876, "grad_norm": 0.32597940041736445, "learning_rate": 5.663967785388378e-06, "loss": 0.0331, "step": 3971 }, { "epoch": 1.5359628770301623, "grad_norm": 0.24242842021863267, "learning_rate": 5.661737863032729e-06, "loss": 0.0256, "step": 3972 }, { "epoch": 1.5363495746326374, "grad_norm": 0.2695051232981639, "learning_rate": 5.659507806701381e-06, "loss": 0.0247, "step": 3973 }, { "epoch": 1.536736272235112, "grad_norm": 0.35477324755819767, "learning_rate": 5.6572776168458295e-06, "loss": 0.0254, "step": 3974 }, { "epoch": 1.537122969837587, "grad_norm": 0.3472105235182358, "learning_rate": 5.655047293917604e-06, "loss": 0.0307, "step": 3975 }, { "epoch": 1.537509667440062, "grad_norm": 0.24958953662028227, "learning_rate": 5.6528168383682524e-06, "loss": 0.0242, "step": 3976 }, { "epoch": 1.5378963650425366, "grad_norm": 0.3719497013455779, "learning_rate": 5.650586250649356e-06, "loss": 0.0258, "step": 3977 }, { "epoch": 1.5382830626450117, "grad_norm": 0.222749334685494, "learning_rate": 5.6483555312125194e-06, "loss": 0.0295, "step": 3978 }, { "epoch": 1.5386697602474864, "grad_norm": 0.49680869300745534, "learning_rate": 5.646124680509376e-06, "loss": 0.0281, "step": 3979 }, { "epoch": 1.5390564578499615, "grad_norm": 0.2542066182659863, "learning_rate": 5.643893698991583e-06, "loss": 0.024, "step": 3980 }, { "epoch": 1.5394431554524362, "grad_norm": 0.3140826720187734, "learning_rate": 5.641662587110825e-06, "loss": 0.0277, "step": 3981 }, { "epoch": 1.539829853054911, "grad_norm": 0.3546212156763922, "learning_rate": 5.6394313453188175e-06, "loss": 0.0322, "step": 3982 }, { "epoch": 1.540216550657386, "grad_norm": 0.27925103199692214, "learning_rate": 5.637199974067295e-06, "loss": 0.0263, "step": 3983 }, { "epoch": 1.5406032482598608, "grad_norm": 0.3299401014237023, "learning_rate": 5.634968473808023e-06, "loss": 0.0322, "step": 3984 }, { "epoch": 1.5409899458623357, "grad_norm": 0.3761932419069641, "learning_rate": 5.632736844992794e-06, "loss": 0.0207, "step": 3985 }, { "epoch": 1.5413766434648104, "grad_norm": 0.44635771976193034, "learning_rate": 5.630505088073422e-06, "loss": 0.0292, "step": 3986 }, { "epoch": 1.5417633410672855, "grad_norm": 0.4613708382192073, "learning_rate": 5.628273203501749e-06, "loss": 0.0349, "step": 3987 }, { "epoch": 1.5421500386697602, "grad_norm": 0.26917152459553906, "learning_rate": 5.626041191729644e-06, "loss": 0.0194, "step": 3988 }, { "epoch": 1.542536736272235, "grad_norm": 0.3082554298705649, "learning_rate": 5.623809053209002e-06, "loss": 0.0351, "step": 3989 }, { "epoch": 1.54292343387471, "grad_norm": 0.320032950515723, "learning_rate": 5.621576788391741e-06, "loss": 0.0256, "step": 3990 }, { "epoch": 1.5433101314771849, "grad_norm": 0.2974398523011758, "learning_rate": 5.619344397729806e-06, "loss": 0.0301, "step": 3991 }, { "epoch": 1.5436968290796598, "grad_norm": 0.39839318938651774, "learning_rate": 5.61711188167517e-06, "loss": 0.0441, "step": 3992 }, { "epoch": 1.5440835266821344, "grad_norm": 0.2562066928117135, "learning_rate": 5.614879240679827e-06, "loss": 0.0234, "step": 3993 }, { "epoch": 1.5444702242846096, "grad_norm": 0.3110345233788925, "learning_rate": 5.612646475195799e-06, "loss": 0.034, "step": 3994 }, { "epoch": 1.5448569218870842, "grad_norm": 0.37059223112635054, "learning_rate": 5.61041358567513e-06, "loss": 0.0351, "step": 3995 }, { "epoch": 1.5452436194895591, "grad_norm": 0.24693105901561355, "learning_rate": 5.608180572569897e-06, "loss": 0.0255, "step": 3996 }, { "epoch": 1.545630317092034, "grad_norm": 0.2435397687118648, "learning_rate": 5.605947436332192e-06, "loss": 0.0294, "step": 3997 }, { "epoch": 1.546017014694509, "grad_norm": 0.2932971657675814, "learning_rate": 5.603714177414141e-06, "loss": 0.026, "step": 3998 }, { "epoch": 1.5464037122969838, "grad_norm": 0.22536161187980594, "learning_rate": 5.601480796267885e-06, "loss": 0.0249, "step": 3999 }, { "epoch": 1.5467904098994585, "grad_norm": 0.30054114848200997, "learning_rate": 5.5992472933456e-06, "loss": 0.0278, "step": 4000 }, { "epoch": 1.5471771075019336, "grad_norm": 0.6701455860289527, "learning_rate": 5.597013669099478e-06, "loss": 0.0358, "step": 4001 }, { "epoch": 1.5475638051044083, "grad_norm": 0.2803019171608351, "learning_rate": 5.594779923981742e-06, "loss": 0.0295, "step": 4002 }, { "epoch": 1.5479505027068832, "grad_norm": 0.2879345499956234, "learning_rate": 5.592546058444637e-06, "loss": 0.0304, "step": 4003 }, { "epoch": 1.548337200309358, "grad_norm": 0.40889555525653304, "learning_rate": 5.5903120729404304e-06, "loss": 0.0389, "step": 4004 }, { "epoch": 1.548723897911833, "grad_norm": 0.27950959539944076, "learning_rate": 5.588077967921418e-06, "loss": 0.025, "step": 4005 }, { "epoch": 1.5491105955143079, "grad_norm": 0.3068888349285751, "learning_rate": 5.585843743839915e-06, "loss": 0.026, "step": 4006 }, { "epoch": 1.5494972931167825, "grad_norm": 0.32039080825724237, "learning_rate": 5.583609401148265e-06, "loss": 0.0282, "step": 4007 }, { "epoch": 1.5498839907192576, "grad_norm": 0.3413236997263547, "learning_rate": 5.581374940298834e-06, "loss": 0.0334, "step": 4008 }, { "epoch": 1.5502706883217323, "grad_norm": 0.32475078259756246, "learning_rate": 5.579140361744011e-06, "loss": 0.0422, "step": 4009 }, { "epoch": 1.5506573859242074, "grad_norm": 0.30499490721489575, "learning_rate": 5.576905665936209e-06, "loss": 0.0313, "step": 4010 }, { "epoch": 1.551044083526682, "grad_norm": 0.26883673763250027, "learning_rate": 5.574670853327868e-06, "loss": 0.0407, "step": 4011 }, { "epoch": 1.551430781129157, "grad_norm": 0.28530728199602234, "learning_rate": 5.572435924371446e-06, "loss": 0.0344, "step": 4012 }, { "epoch": 1.551817478731632, "grad_norm": 0.255071450740405, "learning_rate": 5.570200879519429e-06, "loss": 0.0225, "step": 4013 }, { "epoch": 1.5522041763341066, "grad_norm": 0.20115766071184146, "learning_rate": 5.5679657192243245e-06, "loss": 0.0217, "step": 4014 }, { "epoch": 1.5525908739365817, "grad_norm": 0.26044973097086244, "learning_rate": 5.565730443938665e-06, "loss": 0.0206, "step": 4015 }, { "epoch": 1.5529775715390564, "grad_norm": 0.29752915413829417, "learning_rate": 5.563495054115004e-06, "loss": 0.0268, "step": 4016 }, { "epoch": 1.5533642691415315, "grad_norm": 0.591107908172615, "learning_rate": 5.561259550205921e-06, "loss": 0.0282, "step": 4017 }, { "epoch": 1.5537509667440061, "grad_norm": 0.2378393945529803, "learning_rate": 5.559023932664015e-06, "loss": 0.0224, "step": 4018 }, { "epoch": 1.554137664346481, "grad_norm": 0.27880934374737687, "learning_rate": 5.55678820194191e-06, "loss": 0.0227, "step": 4019 }, { "epoch": 1.554524361948956, "grad_norm": 0.24981384820598024, "learning_rate": 5.554552358492254e-06, "loss": 0.0241, "step": 4020 }, { "epoch": 1.5549110595514308, "grad_norm": 0.3154810752957485, "learning_rate": 5.552316402767717e-06, "loss": 0.0393, "step": 4021 }, { "epoch": 1.5552977571539057, "grad_norm": 0.21092280067066083, "learning_rate": 5.550080335220991e-06, "loss": 0.0253, "step": 4022 }, { "epoch": 1.5556844547563804, "grad_norm": 0.30319667287118446, "learning_rate": 5.547844156304789e-06, "loss": 0.0333, "step": 4023 }, { "epoch": 1.5560711523588555, "grad_norm": 0.35847873917635736, "learning_rate": 5.545607866471853e-06, "loss": 0.029, "step": 4024 }, { "epoch": 1.5564578499613302, "grad_norm": 0.35351711962881305, "learning_rate": 5.543371466174939e-06, "loss": 0.0381, "step": 4025 }, { "epoch": 1.556844547563805, "grad_norm": 0.2355056645217639, "learning_rate": 5.541134955866834e-06, "loss": 0.0246, "step": 4026 }, { "epoch": 1.55723124516628, "grad_norm": 0.2905490776458517, "learning_rate": 5.538898336000336e-06, "loss": 0.0248, "step": 4027 }, { "epoch": 1.5576179427687549, "grad_norm": 0.2951948026057243, "learning_rate": 5.536661607028277e-06, "loss": 0.025, "step": 4028 }, { "epoch": 1.5580046403712298, "grad_norm": 0.35917064659544595, "learning_rate": 5.534424769403506e-06, "loss": 0.0365, "step": 4029 }, { "epoch": 1.5583913379737044, "grad_norm": 0.35901644886423967, "learning_rate": 5.532187823578892e-06, "loss": 0.0352, "step": 4030 }, { "epoch": 1.5587780355761796, "grad_norm": 0.45501903766482116, "learning_rate": 5.529950770007329e-06, "loss": 0.0398, "step": 4031 }, { "epoch": 1.5591647331786542, "grad_norm": 0.33054931214173233, "learning_rate": 5.527713609141731e-06, "loss": 0.0367, "step": 4032 }, { "epoch": 1.5595514307811291, "grad_norm": 0.3639002301964468, "learning_rate": 5.525476341435034e-06, "loss": 0.0303, "step": 4033 }, { "epoch": 1.559938128383604, "grad_norm": 0.28584784829866383, "learning_rate": 5.5232389673401976e-06, "loss": 0.0297, "step": 4034 }, { "epoch": 1.560324825986079, "grad_norm": 0.5232093305681019, "learning_rate": 5.5210014873102006e-06, "loss": 0.035, "step": 4035 }, { "epoch": 1.5607115235885538, "grad_norm": 0.30490249869963415, "learning_rate": 5.518763901798045e-06, "loss": 0.0273, "step": 4036 }, { "epoch": 1.5610982211910285, "grad_norm": 0.3331740787374244, "learning_rate": 5.51652621125675e-06, "loss": 0.0252, "step": 4037 }, { "epoch": 1.5614849187935036, "grad_norm": 0.3355856217560534, "learning_rate": 5.514288416139364e-06, "loss": 0.0312, "step": 4038 }, { "epoch": 1.5618716163959783, "grad_norm": 0.25558104780329094, "learning_rate": 5.512050516898949e-06, "loss": 0.0243, "step": 4039 }, { "epoch": 1.5622583139984532, "grad_norm": 0.873497868404465, "learning_rate": 5.509812513988591e-06, "loss": 0.04, "step": 4040 }, { "epoch": 1.562645011600928, "grad_norm": 0.25970691241756516, "learning_rate": 5.5075744078613965e-06, "loss": 0.0266, "step": 4041 }, { "epoch": 1.563031709203403, "grad_norm": 0.24498830038031486, "learning_rate": 5.505336198970495e-06, "loss": 0.0281, "step": 4042 }, { "epoch": 1.5634184068058778, "grad_norm": 0.30499619108907056, "learning_rate": 5.503097887769034e-06, "loss": 0.0243, "step": 4043 }, { "epoch": 1.5638051044083525, "grad_norm": 0.3863264610718762, "learning_rate": 5.500859474710184e-06, "loss": 0.0407, "step": 4044 }, { "epoch": 1.5641918020108276, "grad_norm": 0.23358989678915645, "learning_rate": 5.498620960247135e-06, "loss": 0.0195, "step": 4045 }, { "epoch": 1.5645784996133023, "grad_norm": 0.22174833335811286, "learning_rate": 5.496382344833094e-06, "loss": 0.0176, "step": 4046 }, { "epoch": 1.5649651972157774, "grad_norm": 0.36196122838979394, "learning_rate": 5.4941436289212965e-06, "loss": 0.0427, "step": 4047 }, { "epoch": 1.565351894818252, "grad_norm": 0.2866907741510018, "learning_rate": 5.49190481296499e-06, "loss": 0.0371, "step": 4048 }, { "epoch": 1.565738592420727, "grad_norm": 0.2797813757379502, "learning_rate": 5.489665897417449e-06, "loss": 0.0362, "step": 4049 }, { "epoch": 1.5661252900232019, "grad_norm": 0.29113091533367325, "learning_rate": 5.487426882731964e-06, "loss": 0.0219, "step": 4050 }, { "epoch": 1.5665119876256766, "grad_norm": 0.2892889953705328, "learning_rate": 5.485187769361846e-06, "loss": 0.0375, "step": 4051 }, { "epoch": 1.5668986852281517, "grad_norm": 0.23428188590726565, "learning_rate": 5.482948557760429e-06, "loss": 0.0237, "step": 4052 }, { "epoch": 1.5672853828306264, "grad_norm": 0.3174837313471544, "learning_rate": 5.480709248381061e-06, "loss": 0.0245, "step": 4053 }, { "epoch": 1.5676720804331015, "grad_norm": 0.5434714467987161, "learning_rate": 5.478469841677118e-06, "loss": 0.0244, "step": 4054 }, { "epoch": 1.5680587780355761, "grad_norm": 0.3756642411549055, "learning_rate": 5.476230338101987e-06, "loss": 0.0357, "step": 4055 }, { "epoch": 1.568445475638051, "grad_norm": 0.2665115313168904, "learning_rate": 5.4739907381090806e-06, "loss": 0.0275, "step": 4056 }, { "epoch": 1.568832173240526, "grad_norm": 0.3993459849807913, "learning_rate": 5.471751042151828e-06, "loss": 0.036, "step": 4057 }, { "epoch": 1.5692188708430008, "grad_norm": 0.8154940070852146, "learning_rate": 5.469511250683683e-06, "loss": 0.0297, "step": 4058 }, { "epoch": 1.5696055684454757, "grad_norm": 0.39781087935731724, "learning_rate": 5.467271364158109e-06, "loss": 0.0246, "step": 4059 }, { "epoch": 1.5699922660479504, "grad_norm": 0.3522848984321114, "learning_rate": 5.465031383028597e-06, "loss": 0.0337, "step": 4060 }, { "epoch": 1.5703789636504255, "grad_norm": 0.2559201890355602, "learning_rate": 5.462791307748654e-06, "loss": 0.0212, "step": 4061 }, { "epoch": 1.5707656612529002, "grad_norm": 0.3818885548732653, "learning_rate": 5.460551138771808e-06, "loss": 0.0373, "step": 4062 }, { "epoch": 1.571152358855375, "grad_norm": 0.30022651133620004, "learning_rate": 5.458310876551601e-06, "loss": 0.033, "step": 4063 }, { "epoch": 1.57153905645785, "grad_norm": 0.3281441436709687, "learning_rate": 5.456070521541601e-06, "loss": 0.0241, "step": 4064 }, { "epoch": 1.5719257540603249, "grad_norm": 1.003049290895682, "learning_rate": 5.45383007419539e-06, "loss": 0.0361, "step": 4065 }, { "epoch": 1.5723124516627998, "grad_norm": 0.8886472134359892, "learning_rate": 5.45158953496657e-06, "loss": 0.0663, "step": 4066 }, { "epoch": 1.5726991492652744, "grad_norm": 0.2903923547988275, "learning_rate": 5.449348904308758e-06, "loss": 0.0227, "step": 4067 }, { "epoch": 1.5730858468677495, "grad_norm": 0.2817301194891327, "learning_rate": 5.447108182675598e-06, "loss": 0.0325, "step": 4068 }, { "epoch": 1.5734725444702242, "grad_norm": 0.3119441666142583, "learning_rate": 5.444867370520743e-06, "loss": 0.0285, "step": 4069 }, { "epoch": 1.5738592420726991, "grad_norm": 0.6758338246580956, "learning_rate": 5.442626468297874e-06, "loss": 0.0195, "step": 4070 }, { "epoch": 1.574245939675174, "grad_norm": 0.3817858899731235, "learning_rate": 5.440385476460679e-06, "loss": 0.0329, "step": 4071 }, { "epoch": 1.574632637277649, "grad_norm": 0.2652668098086159, "learning_rate": 5.438144395462873e-06, "loss": 0.0324, "step": 4072 }, { "epoch": 1.5750193348801238, "grad_norm": 0.30497991280891257, "learning_rate": 5.435903225758188e-06, "loss": 0.0329, "step": 4073 }, { "epoch": 1.5754060324825985, "grad_norm": 0.25830651669006943, "learning_rate": 5.433661967800367e-06, "loss": 0.0261, "step": 4074 }, { "epoch": 1.5757927300850736, "grad_norm": 0.38982905507921406, "learning_rate": 5.431420622043182e-06, "loss": 0.0351, "step": 4075 }, { "epoch": 1.5761794276875483, "grad_norm": 0.48463685925448463, "learning_rate": 5.4291791889404115e-06, "loss": 0.031, "step": 4076 }, { "epoch": 1.5765661252900232, "grad_norm": 0.49582236926731865, "learning_rate": 5.42693766894586e-06, "loss": 0.0273, "step": 4077 }, { "epoch": 1.576952822892498, "grad_norm": 0.25916143711003975, "learning_rate": 5.4246960625133446e-06, "loss": 0.027, "step": 4078 }, { "epoch": 1.577339520494973, "grad_norm": 0.32320701314163225, "learning_rate": 5.4224543700967045e-06, "loss": 0.0284, "step": 4079 }, { "epoch": 1.5777262180974478, "grad_norm": 0.30359364598128113, "learning_rate": 5.4202125921497895e-06, "loss": 0.0277, "step": 4080 }, { "epoch": 1.5781129156999225, "grad_norm": 0.5520098442137965, "learning_rate": 5.417970729126475e-06, "loss": 0.0273, "step": 4081 }, { "epoch": 1.5784996133023976, "grad_norm": 0.27673201216178855, "learning_rate": 5.415728781480646e-06, "loss": 0.0263, "step": 4082 }, { "epoch": 1.5788863109048723, "grad_norm": 0.2608378055137791, "learning_rate": 5.413486749666212e-06, "loss": 0.038, "step": 4083 }, { "epoch": 1.5792730085073474, "grad_norm": 0.6603401290175401, "learning_rate": 5.41124463413709e-06, "loss": 0.031, "step": 4084 }, { "epoch": 1.579659706109822, "grad_norm": 0.40152807077725844, "learning_rate": 5.409002435347225e-06, "loss": 0.0447, "step": 4085 }, { "epoch": 1.580046403712297, "grad_norm": 0.38551613480977476, "learning_rate": 5.4067601537505705e-06, "loss": 0.0279, "step": 4086 }, { "epoch": 1.5804331013147719, "grad_norm": 0.2520061980956944, "learning_rate": 5.404517789801101e-06, "loss": 0.0253, "step": 4087 }, { "epoch": 1.5808197989172466, "grad_norm": 1.441196928761519, "learning_rate": 5.402275343952805e-06, "loss": 0.0412, "step": 4088 }, { "epoch": 1.5812064965197217, "grad_norm": 0.300472625976636, "learning_rate": 5.400032816659691e-06, "loss": 0.0295, "step": 4089 }, { "epoch": 1.5815931941221963, "grad_norm": 0.23716390526699582, "learning_rate": 5.39779020837578e-06, "loss": 0.0219, "step": 4090 }, { "epoch": 1.5819798917246715, "grad_norm": 0.27307058004661466, "learning_rate": 5.395547519555111e-06, "loss": 0.0238, "step": 4091 }, { "epoch": 1.5823665893271461, "grad_norm": 0.3112779800221705, "learning_rate": 5.393304750651742e-06, "loss": 0.0273, "step": 4092 }, { "epoch": 1.582753286929621, "grad_norm": 0.25916585404946424, "learning_rate": 5.391061902119743e-06, "loss": 0.024, "step": 4093 }, { "epoch": 1.583139984532096, "grad_norm": 0.4556227513605, "learning_rate": 5.3888189744132026e-06, "loss": 0.0348, "step": 4094 }, { "epoch": 1.5835266821345708, "grad_norm": 0.3058355484970247, "learning_rate": 5.386575967986224e-06, "loss": 0.0297, "step": 4095 }, { "epoch": 1.5839133797370457, "grad_norm": 0.29748462728958847, "learning_rate": 5.384332883292929e-06, "loss": 0.0336, "step": 4096 }, { "epoch": 1.5843000773395204, "grad_norm": 0.2858521181964836, "learning_rate": 5.38208972078745e-06, "loss": 0.0204, "step": 4097 }, { "epoch": 1.5846867749419955, "grad_norm": 0.31279304336325153, "learning_rate": 5.379846480923943e-06, "loss": 0.03, "step": 4098 }, { "epoch": 1.5850734725444702, "grad_norm": 0.3415712876510773, "learning_rate": 5.3776031641565715e-06, "loss": 0.0424, "step": 4099 }, { "epoch": 1.585460170146945, "grad_norm": 0.2845786391752355, "learning_rate": 5.37535977093952e-06, "loss": 0.0293, "step": 4100 }, { "epoch": 1.58584686774942, "grad_norm": 0.22400873952773565, "learning_rate": 5.373116301726986e-06, "loss": 0.0224, "step": 4101 }, { "epoch": 1.5862335653518949, "grad_norm": 0.28801565647671123, "learning_rate": 5.3708727569731845e-06, "loss": 0.0253, "step": 4102 }, { "epoch": 1.5866202629543698, "grad_norm": 0.33402245371920253, "learning_rate": 5.368629137132342e-06, "loss": 0.028, "step": 4103 }, { "epoch": 1.5870069605568444, "grad_norm": 0.35139020587444814, "learning_rate": 5.366385442658705e-06, "loss": 0.0247, "step": 4104 }, { "epoch": 1.5873936581593195, "grad_norm": 0.3371281651563803, "learning_rate": 5.3641416740065325e-06, "loss": 0.0319, "step": 4105 }, { "epoch": 1.5877803557617942, "grad_norm": 0.28743738807189195, "learning_rate": 5.3618978316300965e-06, "loss": 0.0279, "step": 4106 }, { "epoch": 1.588167053364269, "grad_norm": 0.22846232412637285, "learning_rate": 5.359653915983689e-06, "loss": 0.0199, "step": 4107 }, { "epoch": 1.588553750966744, "grad_norm": 0.35179015227034577, "learning_rate": 5.35740992752161e-06, "loss": 0.0406, "step": 4108 }, { "epoch": 1.588940448569219, "grad_norm": 0.23578267713915804, "learning_rate": 5.355165866698184e-06, "loss": 0.0222, "step": 4109 }, { "epoch": 1.5893271461716938, "grad_norm": 0.5157158409042764, "learning_rate": 5.352921733967741e-06, "loss": 0.0252, "step": 4110 }, { "epoch": 1.5897138437741685, "grad_norm": 0.2060165245693378, "learning_rate": 5.35067752978463e-06, "loss": 0.0193, "step": 4111 }, { "epoch": 1.5901005413766436, "grad_norm": 0.20276473880684412, "learning_rate": 5.348433254603211e-06, "loss": 0.0323, "step": 4112 }, { "epoch": 1.5904872389791183, "grad_norm": 0.4016808994371202, "learning_rate": 5.346188908877864e-06, "loss": 0.0359, "step": 4113 }, { "epoch": 1.5908739365815932, "grad_norm": 0.2760739082220597, "learning_rate": 5.343944493062978e-06, "loss": 0.0237, "step": 4114 }, { "epoch": 1.591260634184068, "grad_norm": 0.2517049485200727, "learning_rate": 5.34170000761296e-06, "loss": 0.024, "step": 4115 }, { "epoch": 1.591647331786543, "grad_norm": 0.24585613834303513, "learning_rate": 5.339455452982226e-06, "loss": 0.0238, "step": 4116 }, { "epoch": 1.5920340293890178, "grad_norm": 0.32621996374532036, "learning_rate": 5.337210829625214e-06, "loss": 0.0263, "step": 4117 }, { "epoch": 1.5924207269914925, "grad_norm": 0.30793412205788206, "learning_rate": 5.334966137996369e-06, "loss": 0.0274, "step": 4118 }, { "epoch": 1.5928074245939676, "grad_norm": 0.3720266202341405, "learning_rate": 5.3327213785501505e-06, "loss": 0.0353, "step": 4119 }, { "epoch": 1.5931941221964423, "grad_norm": 0.30335900311682507, "learning_rate": 5.330476551741037e-06, "loss": 0.0271, "step": 4120 }, { "epoch": 1.5935808197989172, "grad_norm": 0.19379961080979602, "learning_rate": 5.328231658023514e-06, "loss": 0.0155, "step": 4121 }, { "epoch": 1.593967517401392, "grad_norm": 0.29684736170367193, "learning_rate": 5.3259866978520845e-06, "loss": 0.0272, "step": 4122 }, { "epoch": 1.594354215003867, "grad_norm": 0.24622042581547, "learning_rate": 5.323741671681264e-06, "loss": 0.0242, "step": 4123 }, { "epoch": 1.5947409126063419, "grad_norm": 0.3331837761918014, "learning_rate": 5.321496579965581e-06, "loss": 0.038, "step": 4124 }, { "epoch": 1.5951276102088165, "grad_norm": 0.32142406070950574, "learning_rate": 5.319251423159581e-06, "loss": 0.0238, "step": 4125 }, { "epoch": 1.5955143078112917, "grad_norm": 0.3466469683088945, "learning_rate": 5.317006201717814e-06, "loss": 0.0268, "step": 4126 }, { "epoch": 1.5959010054137663, "grad_norm": 0.3741479453512154, "learning_rate": 5.3147609160948495e-06, "loss": 0.0361, "step": 4127 }, { "epoch": 1.5962877030162415, "grad_norm": 0.2785957829342107, "learning_rate": 5.312515566745272e-06, "loss": 0.0212, "step": 4128 }, { "epoch": 1.5966744006187161, "grad_norm": 0.3887415876171458, "learning_rate": 5.310270154123673e-06, "loss": 0.0261, "step": 4129 }, { "epoch": 1.597061098221191, "grad_norm": 0.27482177892982584, "learning_rate": 5.308024678684662e-06, "loss": 0.03, "step": 4130 }, { "epoch": 1.597447795823666, "grad_norm": 1.0700047718670866, "learning_rate": 5.305779140882855e-06, "loss": 0.0343, "step": 4131 }, { "epoch": 1.5978344934261408, "grad_norm": 0.4002637903594127, "learning_rate": 5.3035335411728885e-06, "loss": 0.0258, "step": 4132 }, { "epoch": 1.5982211910286157, "grad_norm": 0.37961485624669694, "learning_rate": 5.301287880009405e-06, "loss": 0.0327, "step": 4133 }, { "epoch": 1.5986078886310904, "grad_norm": 0.2714788265822519, "learning_rate": 5.299042157847063e-06, "loss": 0.0226, "step": 4134 }, { "epoch": 1.5989945862335655, "grad_norm": 0.2501882526954321, "learning_rate": 5.29679637514053e-06, "loss": 0.0245, "step": 4135 }, { "epoch": 1.5993812838360402, "grad_norm": 0.2991329012239144, "learning_rate": 5.294550532344492e-06, "loss": 0.0334, "step": 4136 }, { "epoch": 1.599767981438515, "grad_norm": 0.28317115344011934, "learning_rate": 5.29230462991364e-06, "loss": 0.0236, "step": 4137 }, { "epoch": 1.60015467904099, "grad_norm": 0.3742104986579626, "learning_rate": 5.290058668302682e-06, "loss": 0.0273, "step": 4138 }, { "epoch": 1.6005413766434649, "grad_norm": 0.24942459001527145, "learning_rate": 5.287812647966336e-06, "loss": 0.0266, "step": 4139 }, { "epoch": 1.6009280742459397, "grad_norm": 0.2973580976432199, "learning_rate": 5.285566569359331e-06, "loss": 0.0222, "step": 4140 }, { "epoch": 1.6013147718484144, "grad_norm": 0.2968991534716913, "learning_rate": 5.2833204329364105e-06, "loss": 0.0201, "step": 4141 }, { "epoch": 1.6017014694508895, "grad_norm": 0.31118077087866675, "learning_rate": 5.2810742391523266e-06, "loss": 0.0295, "step": 4142 }, { "epoch": 1.6020881670533642, "grad_norm": 0.31407104284991344, "learning_rate": 5.278827988461844e-06, "loss": 0.0291, "step": 4143 }, { "epoch": 1.602474864655839, "grad_norm": 0.31293291748809965, "learning_rate": 5.2765816813197435e-06, "loss": 0.0386, "step": 4144 }, { "epoch": 1.602861562258314, "grad_norm": 0.29161840376170495, "learning_rate": 5.2743353181808086e-06, "loss": 0.0345, "step": 4145 }, { "epoch": 1.603248259860789, "grad_norm": 0.21102218691192032, "learning_rate": 5.272088899499841e-06, "loss": 0.0174, "step": 4146 }, { "epoch": 1.6036349574632638, "grad_norm": 0.4711881073144181, "learning_rate": 5.269842425731652e-06, "loss": 0.0336, "step": 4147 }, { "epoch": 1.6040216550657385, "grad_norm": 0.3001210340179311, "learning_rate": 5.267595897331061e-06, "loss": 0.03, "step": 4148 }, { "epoch": 1.6044083526682136, "grad_norm": 0.23548393531152836, "learning_rate": 5.265349314752904e-06, "loss": 0.0208, "step": 4149 }, { "epoch": 1.6047950502706883, "grad_norm": 0.286535366700273, "learning_rate": 5.263102678452024e-06, "loss": 0.0232, "step": 4150 }, { "epoch": 1.6051817478731631, "grad_norm": 0.2886030719225612, "learning_rate": 5.260855988883276e-06, "loss": 0.027, "step": 4151 }, { "epoch": 1.605568445475638, "grad_norm": 0.2632818703865649, "learning_rate": 5.258609246501524e-06, "loss": 0.0302, "step": 4152 }, { "epoch": 1.605955143078113, "grad_norm": 0.3197916565463362, "learning_rate": 5.256362451761645e-06, "loss": 0.03, "step": 4153 }, { "epoch": 1.6063418406805878, "grad_norm": 0.21394055742749737, "learning_rate": 5.254115605118528e-06, "loss": 0.0201, "step": 4154 }, { "epoch": 1.6067285382830625, "grad_norm": 0.342320275668845, "learning_rate": 5.251868707027067e-06, "loss": 0.0394, "step": 4155 }, { "epoch": 1.6071152358855376, "grad_norm": 0.29438180165634364, "learning_rate": 5.249621757942174e-06, "loss": 0.0286, "step": 4156 }, { "epoch": 1.6075019334880123, "grad_norm": 0.3029502405770584, "learning_rate": 5.247374758318763e-06, "loss": 0.019, "step": 4157 }, { "epoch": 1.6078886310904872, "grad_norm": 0.2593461369336941, "learning_rate": 5.2451277086117654e-06, "loss": 0.0325, "step": 4158 }, { "epoch": 1.608275328692962, "grad_norm": 0.2611405350799906, "learning_rate": 5.242880609276119e-06, "loss": 0.0215, "step": 4159 }, { "epoch": 1.608662026295437, "grad_norm": 0.2043355869881812, "learning_rate": 5.240633460766771e-06, "loss": 0.0194, "step": 4160 }, { "epoch": 1.6090487238979119, "grad_norm": 0.2729086104500502, "learning_rate": 5.23838626353868e-06, "loss": 0.0311, "step": 4161 }, { "epoch": 1.6094354215003865, "grad_norm": 0.27561202897917686, "learning_rate": 5.236139018046818e-06, "loss": 0.024, "step": 4162 }, { "epoch": 1.6098221191028617, "grad_norm": 0.29618963706885976, "learning_rate": 5.233891724746157e-06, "loss": 0.0278, "step": 4163 }, { "epoch": 1.6102088167053363, "grad_norm": 0.2626748658786833, "learning_rate": 5.231644384091692e-06, "loss": 0.0253, "step": 4164 }, { "epoch": 1.6105955143078114, "grad_norm": 0.31874828430458185, "learning_rate": 5.229396996538415e-06, "loss": 0.0241, "step": 4165 }, { "epoch": 1.6109822119102861, "grad_norm": 0.32004511340193875, "learning_rate": 5.227149562541335e-06, "loss": 0.0278, "step": 4166 }, { "epoch": 1.611368909512761, "grad_norm": 0.24598426436506704, "learning_rate": 5.2249020825554676e-06, "loss": 0.0249, "step": 4167 }, { "epoch": 1.611755607115236, "grad_norm": 0.32083461181030193, "learning_rate": 5.222654557035839e-06, "loss": 0.0415, "step": 4168 }, { "epoch": 1.6121423047177108, "grad_norm": 0.3268583743977594, "learning_rate": 5.220406986437484e-06, "loss": 0.0299, "step": 4169 }, { "epoch": 1.6125290023201857, "grad_norm": 0.29785502994589097, "learning_rate": 5.2181593712154475e-06, "loss": 0.0233, "step": 4170 }, { "epoch": 1.6129156999226604, "grad_norm": 0.35427648407841583, "learning_rate": 5.2159117118247815e-06, "loss": 0.0379, "step": 4171 }, { "epoch": 1.6133023975251355, "grad_norm": 0.4195967370540191, "learning_rate": 5.213664008720547e-06, "loss": 0.0271, "step": 4172 }, { "epoch": 1.6136890951276102, "grad_norm": 0.2698725109481711, "learning_rate": 5.211416262357817e-06, "loss": 0.0242, "step": 4173 }, { "epoch": 1.614075792730085, "grad_norm": 0.31615137572800356, "learning_rate": 5.2091684731916695e-06, "loss": 0.0262, "step": 4174 }, { "epoch": 1.61446249033256, "grad_norm": 0.5305552926705267, "learning_rate": 5.206920641677193e-06, "loss": 0.0294, "step": 4175 }, { "epoch": 1.6148491879350348, "grad_norm": 0.3396772025822714, "learning_rate": 5.204672768269485e-06, "loss": 0.0278, "step": 4176 }, { "epoch": 1.6152358855375097, "grad_norm": 0.2951818765808152, "learning_rate": 5.202424853423651e-06, "loss": 0.0254, "step": 4177 }, { "epoch": 1.6156225831399844, "grad_norm": 0.3800058158011108, "learning_rate": 5.2001768975948045e-06, "loss": 0.0364, "step": 4178 }, { "epoch": 1.6160092807424595, "grad_norm": 0.41301785730650337, "learning_rate": 5.197928901238067e-06, "loss": 0.0309, "step": 4179 }, { "epoch": 1.6163959783449342, "grad_norm": 0.2969992178006589, "learning_rate": 5.195680864808567e-06, "loss": 0.0295, "step": 4180 }, { "epoch": 1.616782675947409, "grad_norm": 0.2927324491470188, "learning_rate": 5.193432788761445e-06, "loss": 0.0193, "step": 4181 }, { "epoch": 1.617169373549884, "grad_norm": 0.31348262553458683, "learning_rate": 5.191184673551848e-06, "loss": 0.0245, "step": 4182 }, { "epoch": 1.6175560711523589, "grad_norm": 0.31690106616521674, "learning_rate": 5.1889365196349296e-06, "loss": 0.0313, "step": 4183 }, { "epoch": 1.6179427687548338, "grad_norm": 0.31025655805724295, "learning_rate": 5.186688327465851e-06, "loss": 0.02, "step": 4184 }, { "epoch": 1.6183294663573085, "grad_norm": 0.2736226151246406, "learning_rate": 5.184440097499781e-06, "loss": 0.0249, "step": 4185 }, { "epoch": 1.6187161639597836, "grad_norm": 0.32760314238074995, "learning_rate": 5.182191830191901e-06, "loss": 0.0235, "step": 4186 }, { "epoch": 1.6191028615622582, "grad_norm": 0.35097422167541176, "learning_rate": 5.179943525997392e-06, "loss": 0.0319, "step": 4187 }, { "epoch": 1.6194895591647331, "grad_norm": 0.2939858325672492, "learning_rate": 5.1776951853714496e-06, "loss": 0.0207, "step": 4188 }, { "epoch": 1.619876256767208, "grad_norm": 0.43808887219613407, "learning_rate": 5.175446808769271e-06, "loss": 0.0286, "step": 4189 }, { "epoch": 1.620262954369683, "grad_norm": 0.27923406973632037, "learning_rate": 5.1731983966460655e-06, "loss": 0.0283, "step": 4190 }, { "epoch": 1.6206496519721578, "grad_norm": 0.3590834920768025, "learning_rate": 5.170949949457047e-06, "loss": 0.0297, "step": 4191 }, { "epoch": 1.6210363495746325, "grad_norm": 0.4564779096811386, "learning_rate": 5.168701467657436e-06, "loss": 0.0327, "step": 4192 }, { "epoch": 1.6214230471771076, "grad_norm": 0.3354302217211758, "learning_rate": 5.166452951702462e-06, "loss": 0.0417, "step": 4193 }, { "epoch": 1.6218097447795823, "grad_norm": 0.23382216948343776, "learning_rate": 5.16420440204736e-06, "loss": 0.0246, "step": 4194 }, { "epoch": 1.6221964423820572, "grad_norm": 0.35953563626522966, "learning_rate": 5.161955819147372e-06, "loss": 0.0349, "step": 4195 }, { "epoch": 1.622583139984532, "grad_norm": 0.46263725782257226, "learning_rate": 5.159707203457747e-06, "loss": 0.0326, "step": 4196 }, { "epoch": 1.622969837587007, "grad_norm": 0.36437289198044104, "learning_rate": 5.157458555433741e-06, "loss": 0.0265, "step": 4197 }, { "epoch": 1.6233565351894819, "grad_norm": 0.3775104786516661, "learning_rate": 5.155209875530617e-06, "loss": 0.0314, "step": 4198 }, { "epoch": 1.6237432327919565, "grad_norm": 0.2778227773776491, "learning_rate": 5.152961164203642e-06, "loss": 0.0279, "step": 4199 }, { "epoch": 1.6241299303944317, "grad_norm": 0.36530238548531435, "learning_rate": 5.150712421908091e-06, "loss": 0.0332, "step": 4200 }, { "epoch": 1.6245166279969063, "grad_norm": 0.26739354455405945, "learning_rate": 5.148463649099245e-06, "loss": 0.0273, "step": 4201 }, { "epoch": 1.6249033255993814, "grad_norm": 0.4116453619553405, "learning_rate": 5.146214846232393e-06, "loss": 0.0358, "step": 4202 }, { "epoch": 1.6252900232018561, "grad_norm": 0.3379205979732895, "learning_rate": 5.1439660137628265e-06, "loss": 0.0329, "step": 4203 }, { "epoch": 1.625676720804331, "grad_norm": 0.28513667380285873, "learning_rate": 5.141717152145848e-06, "loss": 0.026, "step": 4204 }, { "epoch": 1.626063418406806, "grad_norm": 0.26597820788621923, "learning_rate": 5.13946826183676e-06, "loss": 0.0309, "step": 4205 }, { "epoch": 1.6264501160092806, "grad_norm": 0.35908655239052967, "learning_rate": 5.137219343290874e-06, "loss": 0.0336, "step": 4206 }, { "epoch": 1.6268368136117557, "grad_norm": 1.102379310554413, "learning_rate": 5.134970396963508e-06, "loss": 0.0299, "step": 4207 }, { "epoch": 1.6272235112142304, "grad_norm": 0.3565670194072269, "learning_rate": 5.132721423309983e-06, "loss": 0.0283, "step": 4208 }, { "epoch": 1.6276102088167055, "grad_norm": 0.2952598585704345, "learning_rate": 5.130472422785629e-06, "loss": 0.0215, "step": 4209 }, { "epoch": 1.6279969064191802, "grad_norm": 0.4887307860839854, "learning_rate": 5.128223395845778e-06, "loss": 0.0463, "step": 4210 }, { "epoch": 1.628383604021655, "grad_norm": 0.33602334379844806, "learning_rate": 5.125974342945772e-06, "loss": 0.0243, "step": 4211 }, { "epoch": 1.62877030162413, "grad_norm": 0.46185966767870934, "learning_rate": 5.123725264540951e-06, "loss": 0.024, "step": 4212 }, { "epoch": 1.6291569992266048, "grad_norm": 0.3238658827316715, "learning_rate": 5.121476161086666e-06, "loss": 0.0296, "step": 4213 }, { "epoch": 1.6295436968290797, "grad_norm": 1.6822691481607155, "learning_rate": 5.11922703303827e-06, "loss": 0.0283, "step": 4214 }, { "epoch": 1.6299303944315544, "grad_norm": 0.3142576103634287, "learning_rate": 5.116977880851124e-06, "loss": 0.0299, "step": 4215 }, { "epoch": 1.6303170920340295, "grad_norm": 0.43403866134067554, "learning_rate": 5.114728704980592e-06, "loss": 0.0333, "step": 4216 }, { "epoch": 1.6307037896365042, "grad_norm": 0.3317205429363384, "learning_rate": 5.112479505882043e-06, "loss": 0.0315, "step": 4217 }, { "epoch": 1.631090487238979, "grad_norm": 0.3249316752003611, "learning_rate": 5.1102302840108505e-06, "loss": 0.0317, "step": 4218 }, { "epoch": 1.631477184841454, "grad_norm": 0.2987868799303305, "learning_rate": 5.1079810398223915e-06, "loss": 0.0266, "step": 4219 }, { "epoch": 1.6318638824439289, "grad_norm": 0.28060949997421875, "learning_rate": 5.105731773772051e-06, "loss": 0.0392, "step": 4220 }, { "epoch": 1.6322505800464038, "grad_norm": 0.2862494684620262, "learning_rate": 5.103482486315215e-06, "loss": 0.0323, "step": 4221 }, { "epoch": 1.6326372776488784, "grad_norm": 0.2549833476694772, "learning_rate": 5.101233177907276e-06, "loss": 0.0251, "step": 4222 }, { "epoch": 1.6330239752513536, "grad_norm": 0.35680917751411406, "learning_rate": 5.098983849003628e-06, "loss": 0.031, "step": 4223 }, { "epoch": 1.6334106728538282, "grad_norm": 0.37794471261582935, "learning_rate": 5.096734500059676e-06, "loss": 0.0384, "step": 4224 }, { "epoch": 1.6337973704563031, "grad_norm": 0.46662782551573667, "learning_rate": 5.0944851315308165e-06, "loss": 0.0316, "step": 4225 }, { "epoch": 1.634184068058778, "grad_norm": 0.3706931537802239, "learning_rate": 5.0922357438724625e-06, "loss": 0.0273, "step": 4226 }, { "epoch": 1.634570765661253, "grad_norm": 0.37330635555512814, "learning_rate": 5.089986337540024e-06, "loss": 0.0361, "step": 4227 }, { "epoch": 1.6349574632637278, "grad_norm": 0.4137508022618581, "learning_rate": 5.087736912988917e-06, "loss": 0.0451, "step": 4228 }, { "epoch": 1.6353441608662025, "grad_norm": 0.25032082879495693, "learning_rate": 5.085487470674563e-06, "loss": 0.0222, "step": 4229 }, { "epoch": 1.6357308584686776, "grad_norm": 0.30027131166971505, "learning_rate": 5.083238011052381e-06, "loss": 0.0407, "step": 4230 }, { "epoch": 1.6361175560711523, "grad_norm": 0.3001656059319389, "learning_rate": 5.080988534577802e-06, "loss": 0.0215, "step": 4231 }, { "epoch": 1.6365042536736272, "grad_norm": 0.28346257111850176, "learning_rate": 5.078739041706253e-06, "loss": 0.0311, "step": 4232 }, { "epoch": 1.636890951276102, "grad_norm": 0.3988934369617034, "learning_rate": 5.076489532893166e-06, "loss": 0.0393, "step": 4233 }, { "epoch": 1.637277648878577, "grad_norm": 0.30518517302811604, "learning_rate": 5.07424000859398e-06, "loss": 0.026, "step": 4234 }, { "epoch": 1.6376643464810519, "grad_norm": 0.3097090095310176, "learning_rate": 5.071990469264133e-06, "loss": 0.0294, "step": 4235 }, { "epoch": 1.6380510440835265, "grad_norm": 0.3581903015740015, "learning_rate": 5.069740915359069e-06, "loss": 0.0378, "step": 4236 }, { "epoch": 1.6384377416860016, "grad_norm": 0.2834100065420424, "learning_rate": 5.067491347334233e-06, "loss": 0.0383, "step": 4237 }, { "epoch": 1.6388244392884763, "grad_norm": 0.3126374794314893, "learning_rate": 5.065241765645072e-06, "loss": 0.0252, "step": 4238 }, { "epoch": 1.6392111368909514, "grad_norm": 0.40055537848223893, "learning_rate": 5.06299217074704e-06, "loss": 0.0362, "step": 4239 }, { "epoch": 1.639597834493426, "grad_norm": 0.30041893324403374, "learning_rate": 5.060742563095587e-06, "loss": 0.0283, "step": 4240 }, { "epoch": 1.639984532095901, "grad_norm": 0.32037786636858157, "learning_rate": 5.058492943146173e-06, "loss": 0.0266, "step": 4241 }, { "epoch": 1.640371229698376, "grad_norm": 0.20798981743640618, "learning_rate": 5.056243311354254e-06, "loss": 0.0214, "step": 4242 }, { "epoch": 1.6407579273008506, "grad_norm": 0.3058695198515805, "learning_rate": 5.053993668175294e-06, "loss": 0.0262, "step": 4243 }, { "epoch": 1.6411446249033257, "grad_norm": 0.32090771712454164, "learning_rate": 5.051744014064756e-06, "loss": 0.0308, "step": 4244 }, { "epoch": 1.6415313225058004, "grad_norm": 0.2829604149656678, "learning_rate": 5.049494349478105e-06, "loss": 0.0235, "step": 4245 }, { "epoch": 1.6419180201082755, "grad_norm": 0.3143474898329577, "learning_rate": 5.047244674870808e-06, "loss": 0.0289, "step": 4246 }, { "epoch": 1.6423047177107502, "grad_norm": 0.3312742543848184, "learning_rate": 5.044994990698336e-06, "loss": 0.0312, "step": 4247 }, { "epoch": 1.642691415313225, "grad_norm": 0.2998859937575704, "learning_rate": 5.042745297416162e-06, "loss": 0.0345, "step": 4248 }, { "epoch": 1.6430781129157, "grad_norm": 0.22619762914177385, "learning_rate": 5.040495595479758e-06, "loss": 0.0193, "step": 4249 }, { "epoch": 1.6434648105181748, "grad_norm": 0.2671469361466281, "learning_rate": 5.038245885344602e-06, "loss": 0.0278, "step": 4250 }, { "epoch": 1.6438515081206497, "grad_norm": 0.2855944882920992, "learning_rate": 5.035996167466169e-06, "loss": 0.0292, "step": 4251 }, { "epoch": 1.6442382057231244, "grad_norm": 0.3787448999832857, "learning_rate": 5.033746442299937e-06, "loss": 0.0211, "step": 4252 }, { "epoch": 1.6446249033255995, "grad_norm": 0.2577429032053894, "learning_rate": 5.031496710301388e-06, "loss": 0.0226, "step": 4253 }, { "epoch": 1.6450116009280742, "grad_norm": 0.4811611681852843, "learning_rate": 5.029246971926003e-06, "loss": 0.0276, "step": 4254 }, { "epoch": 1.645398298530549, "grad_norm": 0.36227763345595726, "learning_rate": 5.026997227629265e-06, "loss": 0.0296, "step": 4255 }, { "epoch": 1.645784996133024, "grad_norm": 0.27910447050174564, "learning_rate": 5.024747477866658e-06, "loss": 0.028, "step": 4256 }, { "epoch": 1.6461716937354989, "grad_norm": 0.5064574735536912, "learning_rate": 5.022497723093667e-06, "loss": 0.0276, "step": 4257 }, { "epoch": 1.6465583913379738, "grad_norm": 0.28076704426113347, "learning_rate": 5.02024796376578e-06, "loss": 0.0274, "step": 4258 }, { "epoch": 1.6469450889404484, "grad_norm": 0.3032811075261957, "learning_rate": 5.01799820033848e-06, "loss": 0.0232, "step": 4259 }, { "epoch": 1.6473317865429236, "grad_norm": 0.38706355663073644, "learning_rate": 5.015748433267259e-06, "loss": 0.0388, "step": 4260 }, { "epoch": 1.6477184841453982, "grad_norm": 0.26448709428752537, "learning_rate": 5.013498663007603e-06, "loss": 0.0188, "step": 4261 }, { "epoch": 1.6481051817478731, "grad_norm": 0.3081584359857234, "learning_rate": 5.011248890015005e-06, "loss": 0.0304, "step": 4262 }, { "epoch": 1.648491879350348, "grad_norm": 0.3337468740935769, "learning_rate": 5.00899911474495e-06, "loss": 0.0424, "step": 4263 }, { "epoch": 1.648878576952823, "grad_norm": 0.3182258799495605, "learning_rate": 5.0067493376529315e-06, "loss": 0.0275, "step": 4264 }, { "epoch": 1.6492652745552978, "grad_norm": 0.2705415041636461, "learning_rate": 5.004499559194441e-06, "loss": 0.0313, "step": 4265 }, { "epoch": 1.6496519721577725, "grad_norm": 0.29068635134624726, "learning_rate": 5.002249779824966e-06, "loss": 0.0263, "step": 4266 }, { "epoch": 1.6500386697602476, "grad_norm": 0.25964648558291936, "learning_rate": 5e-06, "loss": 0.0315, "step": 4267 }, { "epoch": 1.6504253673627223, "grad_norm": 0.37088122647559135, "learning_rate": 4.997750220175035e-06, "loss": 0.0339, "step": 4268 }, { "epoch": 1.6508120649651972, "grad_norm": 0.2769401623179313, "learning_rate": 4.995500440805561e-06, "loss": 0.0317, "step": 4269 }, { "epoch": 1.651198762567672, "grad_norm": 0.30967254524760957, "learning_rate": 4.993250662347069e-06, "loss": 0.0236, "step": 4270 }, { "epoch": 1.651585460170147, "grad_norm": 0.24639364254664076, "learning_rate": 4.991000885255051e-06, "loss": 0.0187, "step": 4271 }, { "epoch": 1.6519721577726219, "grad_norm": 0.3252970233896982, "learning_rate": 4.988751109984997e-06, "loss": 0.0293, "step": 4272 }, { "epoch": 1.6523588553750965, "grad_norm": 0.36438417536245443, "learning_rate": 4.986501336992397e-06, "loss": 0.0286, "step": 4273 }, { "epoch": 1.6527455529775716, "grad_norm": 0.3681916981334097, "learning_rate": 4.9842515667327425e-06, "loss": 0.0244, "step": 4274 }, { "epoch": 1.6531322505800463, "grad_norm": 0.2432129155976237, "learning_rate": 4.982001799661522e-06, "loss": 0.0269, "step": 4275 }, { "epoch": 1.6535189481825214, "grad_norm": 0.38784587691304573, "learning_rate": 4.979752036234222e-06, "loss": 0.0468, "step": 4276 }, { "epoch": 1.653905645784996, "grad_norm": 0.2980552600477279, "learning_rate": 4.977502276906335e-06, "loss": 0.0299, "step": 4277 }, { "epoch": 1.654292343387471, "grad_norm": 0.2456443826350322, "learning_rate": 4.9752525221333424e-06, "loss": 0.0199, "step": 4278 }, { "epoch": 1.654679040989946, "grad_norm": 0.28830950511123493, "learning_rate": 4.973002772370737e-06, "loss": 0.0231, "step": 4279 }, { "epoch": 1.6550657385924206, "grad_norm": 0.22202844080818232, "learning_rate": 4.970753028073997e-06, "loss": 0.0226, "step": 4280 }, { "epoch": 1.6554524361948957, "grad_norm": 0.29687470496538554, "learning_rate": 4.9685032896986145e-06, "loss": 0.0234, "step": 4281 }, { "epoch": 1.6558391337973704, "grad_norm": 0.2059417224913672, "learning_rate": 4.966253557700065e-06, "loss": 0.0237, "step": 4282 }, { "epoch": 1.6562258313998455, "grad_norm": 0.2159491673740092, "learning_rate": 4.964003832533834e-06, "loss": 0.0205, "step": 4283 }, { "epoch": 1.6566125290023201, "grad_norm": 0.23156175230442555, "learning_rate": 4.9617541146554e-06, "loss": 0.0205, "step": 4284 }, { "epoch": 1.656999226604795, "grad_norm": 0.3172395205263842, "learning_rate": 4.9595044045202426e-06, "loss": 0.0314, "step": 4285 }, { "epoch": 1.65738592420727, "grad_norm": 0.2681803867385043, "learning_rate": 4.95725470258384e-06, "loss": 0.0355, "step": 4286 }, { "epoch": 1.6577726218097448, "grad_norm": 0.4202533344445624, "learning_rate": 4.9550050093016655e-06, "loss": 0.0364, "step": 4287 }, { "epoch": 1.6581593194122197, "grad_norm": 0.2842965873511716, "learning_rate": 4.9527553251291936e-06, "loss": 0.0396, "step": 4288 }, { "epoch": 1.6585460170146944, "grad_norm": 0.418294551209941, "learning_rate": 4.950505650521897e-06, "loss": 0.0295, "step": 4289 }, { "epoch": 1.6589327146171695, "grad_norm": 0.2451941481947044, "learning_rate": 4.948255985935245e-06, "loss": 0.0349, "step": 4290 }, { "epoch": 1.6593194122196442, "grad_norm": 0.25075865891845356, "learning_rate": 4.946006331824707e-06, "loss": 0.0213, "step": 4291 }, { "epoch": 1.659706109822119, "grad_norm": 0.3451582417338743, "learning_rate": 4.943756688645746e-06, "loss": 0.0344, "step": 4292 }, { "epoch": 1.660092807424594, "grad_norm": 0.2866463997961387, "learning_rate": 4.941507056853829e-06, "loss": 0.0283, "step": 4293 }, { "epoch": 1.6604795050270689, "grad_norm": 0.2661026448404382, "learning_rate": 4.939257436904414e-06, "loss": 0.029, "step": 4294 }, { "epoch": 1.6608662026295438, "grad_norm": 0.30632762447937145, "learning_rate": 4.937007829252962e-06, "loss": 0.0284, "step": 4295 }, { "epoch": 1.6612529002320184, "grad_norm": 0.3547847494094056, "learning_rate": 4.934758234354929e-06, "loss": 0.0398, "step": 4296 }, { "epoch": 1.6616395978344936, "grad_norm": 0.4022260749084985, "learning_rate": 4.932508652665768e-06, "loss": 0.0232, "step": 4297 }, { "epoch": 1.6620262954369682, "grad_norm": 0.39514838556257054, "learning_rate": 4.930259084640932e-06, "loss": 0.0316, "step": 4298 }, { "epoch": 1.6624129930394431, "grad_norm": 0.3514022186997082, "learning_rate": 4.928009530735867e-06, "loss": 0.0364, "step": 4299 }, { "epoch": 1.662799690641918, "grad_norm": 0.2717581930681761, "learning_rate": 4.925759991406021e-06, "loss": 0.0261, "step": 4300 }, { "epoch": 1.663186388244393, "grad_norm": 0.29550365544743, "learning_rate": 4.923510467106837e-06, "loss": 0.0346, "step": 4301 }, { "epoch": 1.6635730858468678, "grad_norm": 0.3619727755703586, "learning_rate": 4.921260958293748e-06, "loss": 0.0244, "step": 4302 }, { "epoch": 1.6639597834493425, "grad_norm": 0.3463247408391, "learning_rate": 4.9190114654222e-06, "loss": 0.0342, "step": 4303 }, { "epoch": 1.6643464810518176, "grad_norm": 0.265171792380576, "learning_rate": 4.91676198894762e-06, "loss": 0.0249, "step": 4304 }, { "epoch": 1.6647331786542923, "grad_norm": 0.24546371448335788, "learning_rate": 4.9145125293254395e-06, "loss": 0.0239, "step": 4305 }, { "epoch": 1.6651198762567672, "grad_norm": 0.28751668573085754, "learning_rate": 4.912263087011083e-06, "loss": 0.0231, "step": 4306 }, { "epoch": 1.665506573859242, "grad_norm": 1.64633416177342, "learning_rate": 4.910013662459977e-06, "loss": 0.042, "step": 4307 }, { "epoch": 1.665893271461717, "grad_norm": 0.2959689841248582, "learning_rate": 4.90776425612754e-06, "loss": 0.0251, "step": 4308 }, { "epoch": 1.6662799690641918, "grad_norm": 0.2730530874402176, "learning_rate": 4.905514868469186e-06, "loss": 0.0289, "step": 4309 }, { "epoch": 1.6666666666666665, "grad_norm": 0.2575853933398049, "learning_rate": 4.903265499940327e-06, "loss": 0.0237, "step": 4310 }, { "epoch": 1.6670533642691416, "grad_norm": 0.24827734883831137, "learning_rate": 4.901016150996371e-06, "loss": 0.0244, "step": 4311 }, { "epoch": 1.6674400618716163, "grad_norm": 0.3518761667583459, "learning_rate": 4.898766822092725e-06, "loss": 0.0326, "step": 4312 }, { "epoch": 1.6678267594740914, "grad_norm": 0.23070471714012655, "learning_rate": 4.896517513684785e-06, "loss": 0.026, "step": 4313 }, { "epoch": 1.668213457076566, "grad_norm": 0.23152691355832566, "learning_rate": 4.89426822622795e-06, "loss": 0.0234, "step": 4314 }, { "epoch": 1.668600154679041, "grad_norm": 0.35269032327663036, "learning_rate": 4.892018960177609e-06, "loss": 0.0231, "step": 4315 }, { "epoch": 1.6689868522815159, "grad_norm": 0.25383328959658535, "learning_rate": 4.889769715989151e-06, "loss": 0.0266, "step": 4316 }, { "epoch": 1.6693735498839906, "grad_norm": 0.2723550101101396, "learning_rate": 4.887520494117959e-06, "loss": 0.0237, "step": 4317 }, { "epoch": 1.6697602474864657, "grad_norm": 0.3601599861690881, "learning_rate": 4.885271295019408e-06, "loss": 0.0238, "step": 4318 }, { "epoch": 1.6701469450889403, "grad_norm": 0.5447782517496977, "learning_rate": 4.883022119148877e-06, "loss": 0.0307, "step": 4319 }, { "epoch": 1.6705336426914155, "grad_norm": 0.5509140333534984, "learning_rate": 4.880772966961731e-06, "loss": 0.0318, "step": 4320 }, { "epoch": 1.6709203402938901, "grad_norm": 0.31258234125457185, "learning_rate": 4.878523838913336e-06, "loss": 0.0285, "step": 4321 }, { "epoch": 1.671307037896365, "grad_norm": 0.29231764012894856, "learning_rate": 4.876274735459053e-06, "loss": 0.0246, "step": 4322 }, { "epoch": 1.67169373549884, "grad_norm": 0.2986094896704104, "learning_rate": 4.874025657054229e-06, "loss": 0.0284, "step": 4323 }, { "epoch": 1.6720804331013148, "grad_norm": 0.3923490338729159, "learning_rate": 4.8717766041542235e-06, "loss": 0.0244, "step": 4324 }, { "epoch": 1.6724671307037897, "grad_norm": 0.44624766595061627, "learning_rate": 4.869527577214371e-06, "loss": 0.0243, "step": 4325 }, { "epoch": 1.6728538283062644, "grad_norm": 0.28213466810061644, "learning_rate": 4.867278576690019e-06, "loss": 0.026, "step": 4326 }, { "epoch": 1.6732405259087395, "grad_norm": 0.30177789069500666, "learning_rate": 4.865029603036495e-06, "loss": 0.0293, "step": 4327 }, { "epoch": 1.6736272235112142, "grad_norm": 0.2331638795800324, "learning_rate": 4.862780656709129e-06, "loss": 0.0232, "step": 4328 }, { "epoch": 1.674013921113689, "grad_norm": 0.2698885333714115, "learning_rate": 4.860531738163244e-06, "loss": 0.0292, "step": 4329 }, { "epoch": 1.674400618716164, "grad_norm": 0.2409564233965396, "learning_rate": 4.858282847854155e-06, "loss": 0.0244, "step": 4330 }, { "epoch": 1.6747873163186389, "grad_norm": 0.20448191762931886, "learning_rate": 4.856033986237175e-06, "loss": 0.0177, "step": 4331 }, { "epoch": 1.6751740139211138, "grad_norm": 0.25473410566298377, "learning_rate": 4.853785153767609e-06, "loss": 0.0273, "step": 4332 }, { "epoch": 1.6755607115235884, "grad_norm": 0.27195240690289274, "learning_rate": 4.851536350900757e-06, "loss": 0.0241, "step": 4333 }, { "epoch": 1.6759474091260635, "grad_norm": 0.2740347978172209, "learning_rate": 4.849287578091912e-06, "loss": 0.0314, "step": 4334 }, { "epoch": 1.6763341067285382, "grad_norm": 1.2246407654063975, "learning_rate": 4.847038835796361e-06, "loss": 0.043, "step": 4335 }, { "epoch": 1.6767208043310131, "grad_norm": 0.3064754537237513, "learning_rate": 4.844790124469386e-06, "loss": 0.0278, "step": 4336 }, { "epoch": 1.677107501933488, "grad_norm": 0.3517315329792987, "learning_rate": 4.84254144456626e-06, "loss": 0.0236, "step": 4337 }, { "epoch": 1.677494199535963, "grad_norm": 0.5477335828412644, "learning_rate": 4.840292796542254e-06, "loss": 0.0274, "step": 4338 }, { "epoch": 1.6778808971384378, "grad_norm": 0.43969956672790766, "learning_rate": 4.838044180852629e-06, "loss": 0.0342, "step": 4339 }, { "epoch": 1.6782675947409125, "grad_norm": 0.31691035673691426, "learning_rate": 4.835795597952641e-06, "loss": 0.0218, "step": 4340 }, { "epoch": 1.6786542923433876, "grad_norm": 0.34641322608306735, "learning_rate": 4.8335470482975415e-06, "loss": 0.033, "step": 4341 }, { "epoch": 1.6790409899458623, "grad_norm": 0.2954840971249251, "learning_rate": 4.831298532342565e-06, "loss": 0.0233, "step": 4342 }, { "epoch": 1.6794276875483372, "grad_norm": 0.3847433359127481, "learning_rate": 4.829050050542956e-06, "loss": 0.0313, "step": 4343 }, { "epoch": 1.679814385150812, "grad_norm": 0.2707214227455262, "learning_rate": 4.826801603353935e-06, "loss": 0.0254, "step": 4344 }, { "epoch": 1.680201082753287, "grad_norm": 0.29368383941609627, "learning_rate": 4.824553191230731e-06, "loss": 0.0344, "step": 4345 }, { "epoch": 1.6805877803557618, "grad_norm": 0.2598647159598509, "learning_rate": 4.822304814628551e-06, "loss": 0.0209, "step": 4346 }, { "epoch": 1.6809744779582365, "grad_norm": 0.3620785293788714, "learning_rate": 4.82005647400261e-06, "loss": 0.0337, "step": 4347 }, { "epoch": 1.6813611755607116, "grad_norm": 0.3337171232060677, "learning_rate": 4.817808169808102e-06, "loss": 0.05, "step": 4348 }, { "epoch": 1.6817478731631863, "grad_norm": 0.29820494057170627, "learning_rate": 4.81555990250022e-06, "loss": 0.0344, "step": 4349 }, { "epoch": 1.6821345707656614, "grad_norm": 0.24394295287202622, "learning_rate": 4.813311672534152e-06, "loss": 0.0239, "step": 4350 }, { "epoch": 1.682521268368136, "grad_norm": 0.3152443056311994, "learning_rate": 4.811063480365072e-06, "loss": 0.0309, "step": 4351 }, { "epoch": 1.682907965970611, "grad_norm": 0.2790196163238752, "learning_rate": 4.808815326448154e-06, "loss": 0.0218, "step": 4352 }, { "epoch": 1.6832946635730859, "grad_norm": 0.3152167867199068, "learning_rate": 4.806567211238555e-06, "loss": 0.0209, "step": 4353 }, { "epoch": 1.6836813611755606, "grad_norm": 0.2573006038324596, "learning_rate": 4.804319135191435e-06, "loss": 0.0184, "step": 4354 }, { "epoch": 1.6840680587780357, "grad_norm": 0.25391477717759736, "learning_rate": 4.802071098761936e-06, "loss": 0.0206, "step": 4355 }, { "epoch": 1.6844547563805103, "grad_norm": 0.30433564633638577, "learning_rate": 4.799823102405197e-06, "loss": 0.0256, "step": 4356 }, { "epoch": 1.6848414539829855, "grad_norm": 0.33458870490533615, "learning_rate": 4.797575146576351e-06, "loss": 0.0387, "step": 4357 }, { "epoch": 1.6852281515854601, "grad_norm": 0.2810003821661574, "learning_rate": 4.795327231730516e-06, "loss": 0.0188, "step": 4358 }, { "epoch": 1.685614849187935, "grad_norm": 0.3653465476912794, "learning_rate": 4.793079358322808e-06, "loss": 0.0332, "step": 4359 }, { "epoch": 1.68600154679041, "grad_norm": 0.3602313124915901, "learning_rate": 4.790831526808331e-06, "loss": 0.0279, "step": 4360 }, { "epoch": 1.6863882443928848, "grad_norm": 0.26987800447217297, "learning_rate": 4.788583737642185e-06, "loss": 0.0288, "step": 4361 }, { "epoch": 1.6867749419953597, "grad_norm": 0.2859804274835121, "learning_rate": 4.786335991279454e-06, "loss": 0.0256, "step": 4362 }, { "epoch": 1.6871616395978344, "grad_norm": 0.6315904705319726, "learning_rate": 4.78408828817522e-06, "loss": 0.0297, "step": 4363 }, { "epoch": 1.6875483372003095, "grad_norm": 0.5924515455337979, "learning_rate": 4.781840628784554e-06, "loss": 0.0261, "step": 4364 }, { "epoch": 1.6879350348027842, "grad_norm": 0.257789629251975, "learning_rate": 4.779593013562516e-06, "loss": 0.0203, "step": 4365 }, { "epoch": 1.688321732405259, "grad_norm": 0.314677639449451, "learning_rate": 4.777345442964162e-06, "loss": 0.0326, "step": 4366 }, { "epoch": 1.688708430007734, "grad_norm": 0.2752127610101965, "learning_rate": 4.775097917444536e-06, "loss": 0.0197, "step": 4367 }, { "epoch": 1.6890951276102089, "grad_norm": 0.25507303133991605, "learning_rate": 4.772850437458667e-06, "loss": 0.0273, "step": 4368 }, { "epoch": 1.6894818252126838, "grad_norm": 0.26748560611308975, "learning_rate": 4.770603003461588e-06, "loss": 0.026, "step": 4369 }, { "epoch": 1.6898685228151584, "grad_norm": 0.35018074907131946, "learning_rate": 4.768355615908309e-06, "loss": 0.0231, "step": 4370 }, { "epoch": 1.6902552204176335, "grad_norm": 0.3015849782787575, "learning_rate": 4.766108275253844e-06, "loss": 0.0315, "step": 4371 }, { "epoch": 1.6906419180201082, "grad_norm": 0.46601148068484216, "learning_rate": 4.763860981953183e-06, "loss": 0.0274, "step": 4372 }, { "epoch": 1.691028615622583, "grad_norm": 0.27097576003629475, "learning_rate": 4.761613736461321e-06, "loss": 0.022, "step": 4373 }, { "epoch": 1.691415313225058, "grad_norm": 0.3160882153199262, "learning_rate": 4.759366539233232e-06, "loss": 0.0255, "step": 4374 }, { "epoch": 1.691802010827533, "grad_norm": 0.19125399416600475, "learning_rate": 4.757119390723884e-06, "loss": 0.0187, "step": 4375 }, { "epoch": 1.6921887084300078, "grad_norm": 0.25966676752473683, "learning_rate": 4.754872291388237e-06, "loss": 0.032, "step": 4376 }, { "epoch": 1.6925754060324825, "grad_norm": 0.44959080990964745, "learning_rate": 4.752625241681238e-06, "loss": 0.0295, "step": 4377 }, { "epoch": 1.6929621036349576, "grad_norm": 0.24668699249777282, "learning_rate": 4.750378242057828e-06, "loss": 0.0286, "step": 4378 }, { "epoch": 1.6933488012374323, "grad_norm": 0.2081876907090612, "learning_rate": 4.7481312929729325e-06, "loss": 0.0183, "step": 4379 }, { "epoch": 1.6937354988399071, "grad_norm": 0.4007342859543293, "learning_rate": 4.745884394881474e-06, "loss": 0.0395, "step": 4380 }, { "epoch": 1.694122196442382, "grad_norm": 0.31678915928913715, "learning_rate": 4.743637548238356e-06, "loss": 0.0217, "step": 4381 }, { "epoch": 1.694508894044857, "grad_norm": 0.35007728476463873, "learning_rate": 4.741390753498478e-06, "loss": 0.0392, "step": 4382 }, { "epoch": 1.6948955916473318, "grad_norm": 0.2548363571813687, "learning_rate": 4.739144011116727e-06, "loss": 0.023, "step": 4383 }, { "epoch": 1.6952822892498065, "grad_norm": 0.3363467160504555, "learning_rate": 4.736897321547977e-06, "loss": 0.0285, "step": 4384 }, { "epoch": 1.6956689868522816, "grad_norm": 0.35352812449928384, "learning_rate": 4.734650685247097e-06, "loss": 0.0363, "step": 4385 }, { "epoch": 1.6960556844547563, "grad_norm": 0.3010530837425505, "learning_rate": 4.73240410266894e-06, "loss": 0.0256, "step": 4386 }, { "epoch": 1.6964423820572312, "grad_norm": 0.2824074069306447, "learning_rate": 4.73015757426835e-06, "loss": 0.0192, "step": 4387 }, { "epoch": 1.696829079659706, "grad_norm": 0.22956847717413664, "learning_rate": 4.727911100500161e-06, "loss": 0.0221, "step": 4388 }, { "epoch": 1.697215777262181, "grad_norm": 0.21125604839951123, "learning_rate": 4.725664681819192e-06, "loss": 0.0178, "step": 4389 }, { "epoch": 1.6976024748646559, "grad_norm": 0.2219518779432822, "learning_rate": 4.72341831868026e-06, "loss": 0.0322, "step": 4390 }, { "epoch": 1.6979891724671305, "grad_norm": 0.3679083059672947, "learning_rate": 4.721172011538156e-06, "loss": 0.0328, "step": 4391 }, { "epoch": 1.6983758700696057, "grad_norm": 0.24954179278326902, "learning_rate": 4.718925760847676e-06, "loss": 0.0245, "step": 4392 }, { "epoch": 1.6987625676720803, "grad_norm": 0.3413110506584718, "learning_rate": 4.71667956706359e-06, "loss": 0.0293, "step": 4393 }, { "epoch": 1.6991492652745555, "grad_norm": 0.26347685162532963, "learning_rate": 4.71443343064067e-06, "loss": 0.0306, "step": 4394 }, { "epoch": 1.6995359628770301, "grad_norm": 0.24512693991598708, "learning_rate": 4.7121873520336656e-06, "loss": 0.0231, "step": 4395 }, { "epoch": 1.699922660479505, "grad_norm": 0.2945398600449714, "learning_rate": 4.7099413316973196e-06, "loss": 0.0277, "step": 4396 }, { "epoch": 1.70030935808198, "grad_norm": 0.3014360468649797, "learning_rate": 4.7076953700863616e-06, "loss": 0.0236, "step": 4397 }, { "epoch": 1.7006960556844548, "grad_norm": 0.3422221108147975, "learning_rate": 4.705449467655509e-06, "loss": 0.0401, "step": 4398 }, { "epoch": 1.7010827532869297, "grad_norm": 0.2591082686903481, "learning_rate": 4.7032036248594705e-06, "loss": 0.0262, "step": 4399 }, { "epoch": 1.7014694508894044, "grad_norm": 0.2956016146261482, "learning_rate": 4.70095784215294e-06, "loss": 0.0241, "step": 4400 }, { "epoch": 1.7018561484918795, "grad_norm": 0.20705593341436532, "learning_rate": 4.698712119990597e-06, "loss": 0.0197, "step": 4401 }, { "epoch": 1.7022428460943542, "grad_norm": 0.2783044467427669, "learning_rate": 4.696466458827113e-06, "loss": 0.0316, "step": 4402 }, { "epoch": 1.702629543696829, "grad_norm": 0.22480616015868182, "learning_rate": 4.694220859117146e-06, "loss": 0.0228, "step": 4403 }, { "epoch": 1.703016241299304, "grad_norm": 0.2579778624889504, "learning_rate": 4.69197532131534e-06, "loss": 0.0251, "step": 4404 }, { "epoch": 1.7034029389017789, "grad_norm": 0.19885988292319162, "learning_rate": 4.6897298458763275e-06, "loss": 0.0245, "step": 4405 }, { "epoch": 1.7037896365042537, "grad_norm": 0.2909716107245386, "learning_rate": 4.687484433254729e-06, "loss": 0.035, "step": 4406 }, { "epoch": 1.7041763341067284, "grad_norm": 0.31593462131224853, "learning_rate": 4.685239083905152e-06, "loss": 0.021, "step": 4407 }, { "epoch": 1.7045630317092035, "grad_norm": 0.27604715901337296, "learning_rate": 4.682993798282188e-06, "loss": 0.0363, "step": 4408 }, { "epoch": 1.7049497293116782, "grad_norm": 0.24000467943586207, "learning_rate": 4.680748576840422e-06, "loss": 0.025, "step": 4409 }, { "epoch": 1.705336426914153, "grad_norm": 0.27136058125541734, "learning_rate": 4.678503420034418e-06, "loss": 0.0218, "step": 4410 }, { "epoch": 1.705723124516628, "grad_norm": 0.2591305422746398, "learning_rate": 4.676258328318738e-06, "loss": 0.0224, "step": 4411 }, { "epoch": 1.706109822119103, "grad_norm": 0.7458668017173542, "learning_rate": 4.6740133021479155e-06, "loss": 0.0269, "step": 4412 }, { "epoch": 1.7064965197215778, "grad_norm": 0.2388834229758083, "learning_rate": 4.671768341976488e-06, "loss": 0.0228, "step": 4413 }, { "epoch": 1.7068832173240525, "grad_norm": 0.29637479806348016, "learning_rate": 4.669523448258965e-06, "loss": 0.0341, "step": 4414 }, { "epoch": 1.7072699149265276, "grad_norm": 0.2987996919739265, "learning_rate": 4.66727862144985e-06, "loss": 0.0345, "step": 4415 }, { "epoch": 1.7076566125290022, "grad_norm": 0.5551765035059271, "learning_rate": 4.665033862003633e-06, "loss": 0.0305, "step": 4416 }, { "epoch": 1.7080433101314771, "grad_norm": 0.2550452558856584, "learning_rate": 4.6627891703747875e-06, "loss": 0.0217, "step": 4417 }, { "epoch": 1.708430007733952, "grad_norm": 0.3157427283886707, "learning_rate": 4.6605445470177754e-06, "loss": 0.0349, "step": 4418 }, { "epoch": 1.708816705336427, "grad_norm": 0.4100016282215613, "learning_rate": 4.658299992387042e-06, "loss": 0.0286, "step": 4419 }, { "epoch": 1.7092034029389018, "grad_norm": 0.32239052197671025, "learning_rate": 4.656055506937023e-06, "loss": 0.024, "step": 4420 }, { "epoch": 1.7095901005413765, "grad_norm": 0.24385384937977272, "learning_rate": 4.653811091122138e-06, "loss": 0.0235, "step": 4421 }, { "epoch": 1.7099767981438516, "grad_norm": 0.3705372611160879, "learning_rate": 4.65156674539679e-06, "loss": 0.0286, "step": 4422 }, { "epoch": 1.7103634957463263, "grad_norm": 0.27558231371924746, "learning_rate": 4.649322470215373e-06, "loss": 0.02, "step": 4423 }, { "epoch": 1.7107501933488012, "grad_norm": 0.2463408048502852, "learning_rate": 4.64707826603226e-06, "loss": 0.0203, "step": 4424 }, { "epoch": 1.711136890951276, "grad_norm": 0.3023237152756365, "learning_rate": 4.6448341333018165e-06, "loss": 0.0274, "step": 4425 }, { "epoch": 1.711523588553751, "grad_norm": 0.44844695247417643, "learning_rate": 4.642590072478389e-06, "loss": 0.0404, "step": 4426 }, { "epoch": 1.7119102861562259, "grad_norm": 0.35926851071082444, "learning_rate": 4.640346084016313e-06, "loss": 0.0324, "step": 4427 }, { "epoch": 1.7122969837587005, "grad_norm": 0.3604651653769751, "learning_rate": 4.638102168369905e-06, "loss": 0.019, "step": 4428 }, { "epoch": 1.7126836813611757, "grad_norm": 0.2811303010469347, "learning_rate": 4.635858325993469e-06, "loss": 0.0215, "step": 4429 }, { "epoch": 1.7130703789636503, "grad_norm": 0.3562224762595162, "learning_rate": 4.633614557341297e-06, "loss": 0.0368, "step": 4430 }, { "epoch": 1.7134570765661254, "grad_norm": 0.39075565819363306, "learning_rate": 4.631370862867659e-06, "loss": 0.0449, "step": 4431 }, { "epoch": 1.7138437741686001, "grad_norm": 0.36642279535509903, "learning_rate": 4.629127243026817e-06, "loss": 0.0495, "step": 4432 }, { "epoch": 1.714230471771075, "grad_norm": 0.33922412224328746, "learning_rate": 4.6268836982730145e-06, "loss": 0.0312, "step": 4433 }, { "epoch": 1.71461716937355, "grad_norm": 0.3655985734174419, "learning_rate": 4.624640229060481e-06, "loss": 0.0309, "step": 4434 }, { "epoch": 1.7150038669760246, "grad_norm": 0.3118477651512686, "learning_rate": 4.622396835843431e-06, "loss": 0.0277, "step": 4435 }, { "epoch": 1.7153905645784997, "grad_norm": 0.24017407551410314, "learning_rate": 4.620153519076058e-06, "loss": 0.0313, "step": 4436 }, { "epoch": 1.7157772621809744, "grad_norm": 0.30998733578536, "learning_rate": 4.617910279212552e-06, "loss": 0.0259, "step": 4437 }, { "epoch": 1.7161639597834495, "grad_norm": 0.28950988060718746, "learning_rate": 4.615667116707072e-06, "loss": 0.0288, "step": 4438 }, { "epoch": 1.7165506573859242, "grad_norm": 0.3449798071381972, "learning_rate": 4.613424032013778e-06, "loss": 0.0358, "step": 4439 }, { "epoch": 1.716937354988399, "grad_norm": 0.3175485982607637, "learning_rate": 4.6111810255868e-06, "loss": 0.0339, "step": 4440 }, { "epoch": 1.717324052590874, "grad_norm": 0.3056439430180715, "learning_rate": 4.608938097880259e-06, "loss": 0.0232, "step": 4441 }, { "epoch": 1.7177107501933488, "grad_norm": 0.348408784398709, "learning_rate": 4.6066952493482605e-06, "loss": 0.028, "step": 4442 }, { "epoch": 1.7180974477958237, "grad_norm": 0.24300243698476054, "learning_rate": 4.60445248044489e-06, "loss": 0.0249, "step": 4443 }, { "epoch": 1.7184841453982984, "grad_norm": 0.2854400567016271, "learning_rate": 4.602209791624222e-06, "loss": 0.0257, "step": 4444 }, { "epoch": 1.7188708430007735, "grad_norm": 0.26307977194368704, "learning_rate": 4.59996718334031e-06, "loss": 0.0185, "step": 4445 }, { "epoch": 1.7192575406032482, "grad_norm": 0.24843287200811498, "learning_rate": 4.597724656047196e-06, "loss": 0.0328, "step": 4446 }, { "epoch": 1.719644238205723, "grad_norm": 0.225461587557164, "learning_rate": 4.5954822101989e-06, "loss": 0.0227, "step": 4447 }, { "epoch": 1.720030935808198, "grad_norm": 0.4787554777659047, "learning_rate": 4.59323984624943e-06, "loss": 0.0521, "step": 4448 }, { "epoch": 1.7204176334106729, "grad_norm": 0.26396345147677114, "learning_rate": 4.590997564652777e-06, "loss": 0.022, "step": 4449 }, { "epoch": 1.7208043310131478, "grad_norm": 0.26434729431234294, "learning_rate": 4.58875536586291e-06, "loss": 0.0234, "step": 4450 }, { "epoch": 1.7211910286156225, "grad_norm": 0.235601033153113, "learning_rate": 4.586513250333791e-06, "loss": 0.0226, "step": 4451 }, { "epoch": 1.7215777262180976, "grad_norm": 0.41657378080195345, "learning_rate": 4.584271218519355e-06, "loss": 0.0343, "step": 4452 }, { "epoch": 1.7219644238205722, "grad_norm": 0.345995102830017, "learning_rate": 4.5820292708735265e-06, "loss": 0.0235, "step": 4453 }, { "epoch": 1.7223511214230471, "grad_norm": 0.2686243542857051, "learning_rate": 4.579787407850213e-06, "loss": 0.0293, "step": 4454 }, { "epoch": 1.722737819025522, "grad_norm": 0.23855242850461708, "learning_rate": 4.577545629903297e-06, "loss": 0.0265, "step": 4455 }, { "epoch": 1.723124516627997, "grad_norm": 0.27987957194078883, "learning_rate": 4.575303937486658e-06, "loss": 0.0217, "step": 4456 }, { "epoch": 1.7235112142304718, "grad_norm": 0.32616480816187887, "learning_rate": 4.573062331054141e-06, "loss": 0.03, "step": 4457 }, { "epoch": 1.7238979118329465, "grad_norm": 0.22654579518499723, "learning_rate": 4.57082081105959e-06, "loss": 0.0209, "step": 4458 }, { "epoch": 1.7242846094354216, "grad_norm": 0.3553430856740011, "learning_rate": 4.568579377956819e-06, "loss": 0.0285, "step": 4459 }, { "epoch": 1.7246713070378963, "grad_norm": 0.2103419659880217, "learning_rate": 4.5663380321996335e-06, "loss": 0.017, "step": 4460 }, { "epoch": 1.7250580046403712, "grad_norm": 0.22237057933996532, "learning_rate": 4.564096774241815e-06, "loss": 0.0247, "step": 4461 }, { "epoch": 1.725444702242846, "grad_norm": 0.2824839735159477, "learning_rate": 4.561855604537128e-06, "loss": 0.0227, "step": 4462 }, { "epoch": 1.725831399845321, "grad_norm": 0.3499302817292599, "learning_rate": 4.559614523539323e-06, "loss": 0.0369, "step": 4463 }, { "epoch": 1.7262180974477959, "grad_norm": 0.2333846930860808, "learning_rate": 4.557373531702128e-06, "loss": 0.0247, "step": 4464 }, { "epoch": 1.7266047950502705, "grad_norm": 0.2896560730733553, "learning_rate": 4.555132629479258e-06, "loss": 0.0298, "step": 4465 }, { "epoch": 1.7269914926527457, "grad_norm": 0.22134732858645, "learning_rate": 4.552891817324404e-06, "loss": 0.0225, "step": 4466 }, { "epoch": 1.7273781902552203, "grad_norm": 0.21075182591764346, "learning_rate": 4.550651095691244e-06, "loss": 0.0204, "step": 4467 }, { "epoch": 1.7277648878576954, "grad_norm": 0.4756369045584627, "learning_rate": 4.548410465033433e-06, "loss": 0.0372, "step": 4468 }, { "epoch": 1.7281515854601701, "grad_norm": 0.2749772122820138, "learning_rate": 4.546169925804611e-06, "loss": 0.0189, "step": 4469 }, { "epoch": 1.728538283062645, "grad_norm": 0.24731249082811632, "learning_rate": 4.5439294784584e-06, "loss": 0.0225, "step": 4470 }, { "epoch": 1.72892498066512, "grad_norm": 0.3724707226647827, "learning_rate": 4.541689123448399e-06, "loss": 0.0378, "step": 4471 }, { "epoch": 1.7293116782675946, "grad_norm": 0.2967434852466295, "learning_rate": 4.5394488612281934e-06, "loss": 0.0295, "step": 4472 }, { "epoch": 1.7296983758700697, "grad_norm": 0.2649786824577742, "learning_rate": 4.537208692251346e-06, "loss": 0.0303, "step": 4473 }, { "epoch": 1.7300850734725444, "grad_norm": 0.2550812428635498, "learning_rate": 4.534968616971404e-06, "loss": 0.0229, "step": 4474 }, { "epoch": 1.7304717710750195, "grad_norm": 0.2241306843882758, "learning_rate": 4.532728635841894e-06, "loss": 0.0182, "step": 4475 }, { "epoch": 1.7308584686774942, "grad_norm": 0.3037846177605807, "learning_rate": 4.530488749316318e-06, "loss": 0.037, "step": 4476 }, { "epoch": 1.731245166279969, "grad_norm": 0.3136953951649053, "learning_rate": 4.528248957848173e-06, "loss": 0.0292, "step": 4477 }, { "epoch": 1.731631863882444, "grad_norm": 0.5047997920563049, "learning_rate": 4.526009261890919e-06, "loss": 0.0267, "step": 4478 }, { "epoch": 1.7320185614849188, "grad_norm": 0.2278391114458246, "learning_rate": 4.523769661898016e-06, "loss": 0.0203, "step": 4479 }, { "epoch": 1.7324052590873937, "grad_norm": 0.4309931297427997, "learning_rate": 4.521530158322885e-06, "loss": 0.0341, "step": 4480 }, { "epoch": 1.7327919566898684, "grad_norm": 0.33464886629040236, "learning_rate": 4.51929075161894e-06, "loss": 0.0251, "step": 4481 }, { "epoch": 1.7331786542923435, "grad_norm": 0.28641893721657197, "learning_rate": 4.517051442239574e-06, "loss": 0.0254, "step": 4482 }, { "epoch": 1.7335653518948182, "grad_norm": 0.23224604646107985, "learning_rate": 4.5148122306381555e-06, "loss": 0.0178, "step": 4483 }, { "epoch": 1.733952049497293, "grad_norm": 0.24674579149305081, "learning_rate": 4.512573117268038e-06, "loss": 0.0218, "step": 4484 }, { "epoch": 1.734338747099768, "grad_norm": 0.2861732280434414, "learning_rate": 4.510334102582552e-06, "loss": 0.027, "step": 4485 }, { "epoch": 1.7347254447022429, "grad_norm": 0.31723891150255606, "learning_rate": 4.508095187035012e-06, "loss": 0.036, "step": 4486 }, { "epoch": 1.7351121423047178, "grad_norm": 0.23137311956703618, "learning_rate": 4.505856371078706e-06, "loss": 0.0215, "step": 4487 }, { "epoch": 1.7354988399071924, "grad_norm": 0.28970464144336866, "learning_rate": 4.503617655166907e-06, "loss": 0.0258, "step": 4488 }, { "epoch": 1.7358855375096676, "grad_norm": 0.19977540071521724, "learning_rate": 4.501379039752868e-06, "loss": 0.0229, "step": 4489 }, { "epoch": 1.7362722351121422, "grad_norm": 0.2478475721743367, "learning_rate": 4.499140525289817e-06, "loss": 0.0192, "step": 4490 }, { "epoch": 1.7366589327146171, "grad_norm": 0.27840888960353943, "learning_rate": 4.4969021122309665e-06, "loss": 0.0205, "step": 4491 }, { "epoch": 1.737045630317092, "grad_norm": 0.26944949154471826, "learning_rate": 4.494663801029505e-06, "loss": 0.0225, "step": 4492 }, { "epoch": 1.737432327919567, "grad_norm": 0.22491649945918496, "learning_rate": 4.492425592138604e-06, "loss": 0.0204, "step": 4493 }, { "epoch": 1.7378190255220418, "grad_norm": 0.2906548699475943, "learning_rate": 4.490187486011412e-06, "loss": 0.0243, "step": 4494 }, { "epoch": 1.7382057231245165, "grad_norm": 0.27889584458502675, "learning_rate": 4.487949483101053e-06, "loss": 0.0282, "step": 4495 }, { "epoch": 1.7385924207269916, "grad_norm": 0.2925663358518216, "learning_rate": 4.485711583860638e-06, "loss": 0.0293, "step": 4496 }, { "epoch": 1.7389791183294663, "grad_norm": 0.2963851627181647, "learning_rate": 4.48347378874325e-06, "loss": 0.0228, "step": 4497 }, { "epoch": 1.7393658159319412, "grad_norm": 0.2533318295439095, "learning_rate": 4.481236098201957e-06, "loss": 0.0287, "step": 4498 }, { "epoch": 1.739752513534416, "grad_norm": 0.3480540891331587, "learning_rate": 4.4789985126898e-06, "loss": 0.0359, "step": 4499 }, { "epoch": 1.740139211136891, "grad_norm": 0.2400168507162096, "learning_rate": 4.476761032659804e-06, "loss": 0.0251, "step": 4500 }, { "epoch": 1.7405259087393659, "grad_norm": 0.24984951014012702, "learning_rate": 4.474523658564969e-06, "loss": 0.0241, "step": 4501 }, { "epoch": 1.7409126063418405, "grad_norm": 0.2808529368506185, "learning_rate": 4.47228639085827e-06, "loss": 0.0283, "step": 4502 }, { "epoch": 1.7412993039443156, "grad_norm": 0.3048178670434134, "learning_rate": 4.470049229992674e-06, "loss": 0.0332, "step": 4503 }, { "epoch": 1.7416860015467903, "grad_norm": 0.18696259418611427, "learning_rate": 4.467812176421108e-06, "loss": 0.0207, "step": 4504 }, { "epoch": 1.7420726991492654, "grad_norm": 0.3042640660482357, "learning_rate": 4.465575230596496e-06, "loss": 0.0375, "step": 4505 }, { "epoch": 1.74245939675174, "grad_norm": 0.19916808976766, "learning_rate": 4.463338392971723e-06, "loss": 0.017, "step": 4506 }, { "epoch": 1.742846094354215, "grad_norm": 0.23293518031479854, "learning_rate": 4.461101663999665e-06, "loss": 0.0275, "step": 4507 }, { "epoch": 1.74323279195669, "grad_norm": 0.37252562318031407, "learning_rate": 4.458865044133169e-06, "loss": 0.0196, "step": 4508 }, { "epoch": 1.7436194895591646, "grad_norm": 0.3468554074713195, "learning_rate": 4.456628533825062e-06, "loss": 0.0264, "step": 4509 }, { "epoch": 1.7440061871616397, "grad_norm": 0.34571189742038333, "learning_rate": 4.454392133528149e-06, "loss": 0.0262, "step": 4510 }, { "epoch": 1.7443928847641144, "grad_norm": 0.2582232372976987, "learning_rate": 4.452155843695211e-06, "loss": 0.028, "step": 4511 }, { "epoch": 1.7447795823665895, "grad_norm": 0.23158470555037586, "learning_rate": 4.449919664779011e-06, "loss": 0.0248, "step": 4512 }, { "epoch": 1.7451662799690641, "grad_norm": 0.3085661538090357, "learning_rate": 4.447683597232284e-06, "loss": 0.0353, "step": 4513 }, { "epoch": 1.745552977571539, "grad_norm": 0.6296637888407837, "learning_rate": 4.445447641507747e-06, "loss": 0.0324, "step": 4514 }, { "epoch": 1.745939675174014, "grad_norm": 0.19782786434763336, "learning_rate": 4.443211798058092e-06, "loss": 0.0166, "step": 4515 }, { "epoch": 1.7463263727764888, "grad_norm": 0.25282658962748883, "learning_rate": 4.440976067335987e-06, "loss": 0.0259, "step": 4516 }, { "epoch": 1.7467130703789637, "grad_norm": 0.27125896998051047, "learning_rate": 4.438740449794081e-06, "loss": 0.0248, "step": 4517 }, { "epoch": 1.7470997679814384, "grad_norm": 0.3755989640419376, "learning_rate": 4.436504945884996e-06, "loss": 0.0261, "step": 4518 }, { "epoch": 1.7474864655839135, "grad_norm": 0.3841599088617491, "learning_rate": 4.4342695560613355e-06, "loss": 0.0268, "step": 4519 }, { "epoch": 1.7478731631863882, "grad_norm": 0.22785902339788408, "learning_rate": 4.432034280775678e-06, "loss": 0.0277, "step": 4520 }, { "epoch": 1.748259860788863, "grad_norm": 0.4602240837119651, "learning_rate": 4.4297991204805715e-06, "loss": 0.0217, "step": 4521 }, { "epoch": 1.748646558391338, "grad_norm": 0.260071533361686, "learning_rate": 4.427564075628557e-06, "loss": 0.0278, "step": 4522 }, { "epoch": 1.7490332559938129, "grad_norm": 0.24303293631885425, "learning_rate": 4.425329146672133e-06, "loss": 0.0212, "step": 4523 }, { "epoch": 1.7494199535962878, "grad_norm": 0.5168894815421097, "learning_rate": 4.423094334063793e-06, "loss": 0.0397, "step": 4524 }, { "epoch": 1.7498066511987624, "grad_norm": 0.4022389010960765, "learning_rate": 4.42085963825599e-06, "loss": 0.0362, "step": 4525 }, { "epoch": 1.7501933488012376, "grad_norm": 0.25172303984692723, "learning_rate": 4.418625059701168e-06, "loss": 0.02, "step": 4526 }, { "epoch": 1.7505800464037122, "grad_norm": 0.31943065436403545, "learning_rate": 4.416390598851737e-06, "loss": 0.0312, "step": 4527 }, { "epoch": 1.7509667440061871, "grad_norm": 0.3786617864573975, "learning_rate": 4.414156256160086e-06, "loss": 0.0202, "step": 4528 }, { "epoch": 1.751353441608662, "grad_norm": 0.27890538605010395, "learning_rate": 4.411922032078585e-06, "loss": 0.0317, "step": 4529 }, { "epoch": 1.751740139211137, "grad_norm": 0.4996418745783153, "learning_rate": 4.40968792705957e-06, "loss": 0.0274, "step": 4530 }, { "epoch": 1.7521268368136118, "grad_norm": 0.4007382337592019, "learning_rate": 4.407453941555365e-06, "loss": 0.0311, "step": 4531 }, { "epoch": 1.7525135344160865, "grad_norm": 0.43082754735527307, "learning_rate": 4.405220076018259e-06, "loss": 0.0495, "step": 4532 }, { "epoch": 1.7529002320185616, "grad_norm": 0.23787226765378594, "learning_rate": 4.402986330900523e-06, "loss": 0.0191, "step": 4533 }, { "epoch": 1.7532869296210363, "grad_norm": 0.3787766951622099, "learning_rate": 4.400752706654403e-06, "loss": 0.0256, "step": 4534 }, { "epoch": 1.7536736272235112, "grad_norm": 0.23452147125695447, "learning_rate": 4.398519203732116e-06, "loss": 0.014, "step": 4535 }, { "epoch": 1.754060324825986, "grad_norm": 0.2614206423713848, "learning_rate": 4.3962858225858615e-06, "loss": 0.0296, "step": 4536 }, { "epoch": 1.754447022428461, "grad_norm": 0.29067245891303295, "learning_rate": 4.3940525636678075e-06, "loss": 0.0214, "step": 4537 }, { "epoch": 1.7548337200309359, "grad_norm": 0.7481567243212918, "learning_rate": 4.391819427430104e-06, "loss": 0.0313, "step": 4538 }, { "epoch": 1.7552204176334105, "grad_norm": 0.379825668615119, "learning_rate": 4.38958641432487e-06, "loss": 0.0318, "step": 4539 }, { "epoch": 1.7556071152358856, "grad_norm": 0.39624382357179005, "learning_rate": 4.387353524804203e-06, "loss": 0.0576, "step": 4540 }, { "epoch": 1.7559938128383603, "grad_norm": 0.3924947007047011, "learning_rate": 4.385120759320176e-06, "loss": 0.0268, "step": 4541 }, { "epoch": 1.7563805104408354, "grad_norm": 0.28141871203429714, "learning_rate": 4.3828881183248315e-06, "loss": 0.025, "step": 4542 }, { "epoch": 1.75676720804331, "grad_norm": 0.4545823554771914, "learning_rate": 4.380655602270196e-06, "loss": 0.0337, "step": 4543 }, { "epoch": 1.757153905645785, "grad_norm": 0.3142824891164504, "learning_rate": 4.37842321160826e-06, "loss": 0.0371, "step": 4544 }, { "epoch": 1.75754060324826, "grad_norm": 0.27614641584053135, "learning_rate": 4.3761909467910006e-06, "loss": 0.0266, "step": 4545 }, { "epoch": 1.7579273008507346, "grad_norm": 0.3165402912603495, "learning_rate": 4.373958808270356e-06, "loss": 0.0273, "step": 4546 }, { "epoch": 1.7583139984532097, "grad_norm": 0.27153408814993835, "learning_rate": 4.371726796498252e-06, "loss": 0.0264, "step": 4547 }, { "epoch": 1.7587006960556844, "grad_norm": 0.29323862001304973, "learning_rate": 4.3694949119265805e-06, "loss": 0.0249, "step": 4548 }, { "epoch": 1.7590873936581595, "grad_norm": 0.4054311143664056, "learning_rate": 4.367263155007207e-06, "loss": 0.032, "step": 4549 }, { "epoch": 1.7594740912606341, "grad_norm": 0.3132489162360198, "learning_rate": 4.365031526191977e-06, "loss": 0.025, "step": 4550 }, { "epoch": 1.759860788863109, "grad_norm": 0.2966515134809475, "learning_rate": 4.362800025932706e-06, "loss": 0.0353, "step": 4551 }, { "epoch": 1.760247486465584, "grad_norm": 0.2805504675280267, "learning_rate": 4.360568654681184e-06, "loss": 0.0213, "step": 4552 }, { "epoch": 1.7606341840680588, "grad_norm": 0.45616537949134434, "learning_rate": 4.358337412889176e-06, "loss": 0.0313, "step": 4553 }, { "epoch": 1.7610208816705337, "grad_norm": 1.6981294507023605, "learning_rate": 4.356106301008419e-06, "loss": 0.0345, "step": 4554 }, { "epoch": 1.7614075792730084, "grad_norm": 0.2329184816321937, "learning_rate": 4.353875319490627e-06, "loss": 0.0228, "step": 4555 }, { "epoch": 1.7617942768754835, "grad_norm": 0.2473187446902442, "learning_rate": 4.351644468787481e-06, "loss": 0.0224, "step": 4556 }, { "epoch": 1.7621809744779582, "grad_norm": 0.31987918834929974, "learning_rate": 4.349413749350645e-06, "loss": 0.0308, "step": 4557 }, { "epoch": 1.762567672080433, "grad_norm": 0.34106107072421393, "learning_rate": 4.3471831616317475e-06, "loss": 0.0283, "step": 4558 }, { "epoch": 1.762954369682908, "grad_norm": 0.2760553578709058, "learning_rate": 4.344952706082397e-06, "loss": 0.0246, "step": 4559 }, { "epoch": 1.7633410672853829, "grad_norm": 0.44191136001263304, "learning_rate": 4.342722383154171e-06, "loss": 0.0359, "step": 4560 }, { "epoch": 1.7637277648878578, "grad_norm": 0.46518027155667285, "learning_rate": 4.340492193298621e-06, "loss": 0.032, "step": 4561 }, { "epoch": 1.7641144624903324, "grad_norm": 0.3837166064066875, "learning_rate": 4.338262136967272e-06, "loss": 0.0377, "step": 4562 }, { "epoch": 1.7645011600928076, "grad_norm": 0.3995949514663197, "learning_rate": 4.336032214611623e-06, "loss": 0.0317, "step": 4563 }, { "epoch": 1.7648878576952822, "grad_norm": 0.7100374968900774, "learning_rate": 4.333802426683145e-06, "loss": 0.0313, "step": 4564 }, { "epoch": 1.7652745552977571, "grad_norm": 0.2520523738450635, "learning_rate": 4.33157277363328e-06, "loss": 0.0281, "step": 4565 }, { "epoch": 1.765661252900232, "grad_norm": 0.35749292596993304, "learning_rate": 4.329343255913447e-06, "loss": 0.0319, "step": 4566 }, { "epoch": 1.766047950502707, "grad_norm": 0.303306027603626, "learning_rate": 4.327113873975036e-06, "loss": 0.0268, "step": 4567 }, { "epoch": 1.7664346481051818, "grad_norm": 0.3357817521147219, "learning_rate": 4.324884628269401e-06, "loss": 0.0301, "step": 4568 }, { "epoch": 1.7668213457076565, "grad_norm": 0.41799593994787454, "learning_rate": 4.322655519247886e-06, "loss": 0.028, "step": 4569 }, { "epoch": 1.7672080433101316, "grad_norm": 0.37315905963234997, "learning_rate": 4.320426547361788e-06, "loss": 0.0283, "step": 4570 }, { "epoch": 1.7675947409126063, "grad_norm": 0.3681345331870292, "learning_rate": 4.318197713062394e-06, "loss": 0.0347, "step": 4571 }, { "epoch": 1.7679814385150812, "grad_norm": 0.3135707607421746, "learning_rate": 4.315969016800947e-06, "loss": 0.0305, "step": 4572 }, { "epoch": 1.768368136117556, "grad_norm": 0.246006511972865, "learning_rate": 4.3137404590286765e-06, "loss": 0.0238, "step": 4573 }, { "epoch": 1.768754833720031, "grad_norm": 0.2673135415864864, "learning_rate": 4.311512040196773e-06, "loss": 0.0309, "step": 4574 }, { "epoch": 1.7691415313225058, "grad_norm": 0.37403461070455507, "learning_rate": 4.309283760756403e-06, "loss": 0.0406, "step": 4575 }, { "epoch": 1.7695282289249805, "grad_norm": 0.3268159155735855, "learning_rate": 4.3070556211587065e-06, "loss": 0.0299, "step": 4576 }, { "epoch": 1.7699149265274556, "grad_norm": 0.3541983863392653, "learning_rate": 4.304827621854791e-06, "loss": 0.0309, "step": 4577 }, { "epoch": 1.7703016241299303, "grad_norm": 0.3506684335822493, "learning_rate": 4.302599763295743e-06, "loss": 0.0309, "step": 4578 }, { "epoch": 1.7706883217324054, "grad_norm": 0.4383924078828177, "learning_rate": 4.300372045932609e-06, "loss": 0.029, "step": 4579 }, { "epoch": 1.77107501933488, "grad_norm": 0.30491090611397453, "learning_rate": 4.298144470216419e-06, "loss": 0.0199, "step": 4580 }, { "epoch": 1.771461716937355, "grad_norm": 0.24395996689600744, "learning_rate": 4.295917036598166e-06, "loss": 0.0224, "step": 4581 }, { "epoch": 1.7718484145398299, "grad_norm": 0.2697978094530277, "learning_rate": 4.2936897455288175e-06, "loss": 0.0226, "step": 4582 }, { "epoch": 1.7722351121423046, "grad_norm": 0.27248127058635896, "learning_rate": 4.291462597459313e-06, "loss": 0.0245, "step": 4583 }, { "epoch": 1.7726218097447797, "grad_norm": 0.2657417596342614, "learning_rate": 4.28923559284056e-06, "loss": 0.0229, "step": 4584 }, { "epoch": 1.7730085073472543, "grad_norm": 0.3215548861543192, "learning_rate": 4.28700873212344e-06, "loss": 0.0326, "step": 4585 }, { "epoch": 1.7733952049497295, "grad_norm": 0.3298069707451675, "learning_rate": 4.284782015758802e-06, "loss": 0.0264, "step": 4586 }, { "epoch": 1.7737819025522041, "grad_norm": 0.2890203448588601, "learning_rate": 4.282555444197471e-06, "loss": 0.0192, "step": 4587 }, { "epoch": 1.774168600154679, "grad_norm": 0.2369750965977397, "learning_rate": 4.28032901789024e-06, "loss": 0.0216, "step": 4588 }, { "epoch": 1.774555297757154, "grad_norm": 0.24328249297559337, "learning_rate": 4.278102737287866e-06, "loss": 0.0247, "step": 4589 }, { "epoch": 1.7749419953596288, "grad_norm": 0.2827519899463772, "learning_rate": 4.275876602841089e-06, "loss": 0.0231, "step": 4590 }, { "epoch": 1.7753286929621037, "grad_norm": 0.29269484461180245, "learning_rate": 4.2736506150006085e-06, "loss": 0.0258, "step": 4591 }, { "epoch": 1.7757153905645784, "grad_norm": 0.33753622308423986, "learning_rate": 4.271424774217104e-06, "loss": 0.0339, "step": 4592 }, { "epoch": 1.7761020881670535, "grad_norm": 0.18514686557461052, "learning_rate": 4.269199080941215e-06, "loss": 0.0153, "step": 4593 }, { "epoch": 1.7764887857695282, "grad_norm": 0.42467712729970825, "learning_rate": 4.266973535623558e-06, "loss": 0.0254, "step": 4594 }, { "epoch": 1.776875483372003, "grad_norm": 0.6077603450529195, "learning_rate": 4.2647481387147196e-06, "loss": 0.0368, "step": 4595 }, { "epoch": 1.777262180974478, "grad_norm": 0.23823560479699396, "learning_rate": 4.262522890665251e-06, "loss": 0.0201, "step": 4596 }, { "epoch": 1.7776488785769529, "grad_norm": 0.2945680694038262, "learning_rate": 4.26029779192568e-06, "loss": 0.0264, "step": 4597 }, { "epoch": 1.7780355761794278, "grad_norm": 0.2694646834137199, "learning_rate": 4.258072842946499e-06, "loss": 0.0226, "step": 4598 }, { "epoch": 1.7784222737819024, "grad_norm": 0.34194398131650733, "learning_rate": 4.255848044178173e-06, "loss": 0.0221, "step": 4599 }, { "epoch": 1.7788089713843775, "grad_norm": 0.2956546878871849, "learning_rate": 4.253623396071136e-06, "loss": 0.0312, "step": 4600 }, { "epoch": 1.7791956689868522, "grad_norm": 0.2730358500170644, "learning_rate": 4.251398899075788e-06, "loss": 0.023, "step": 4601 }, { "epoch": 1.7795823665893271, "grad_norm": 0.22227547340074738, "learning_rate": 4.249174553642507e-06, "loss": 0.0254, "step": 4602 }, { "epoch": 1.779969064191802, "grad_norm": 0.19787014751221263, "learning_rate": 4.246950360221629e-06, "loss": 0.0167, "step": 4603 }, { "epoch": 1.780355761794277, "grad_norm": 0.39724760336561676, "learning_rate": 4.244726319263469e-06, "loss": 0.034, "step": 4604 }, { "epoch": 1.7807424593967518, "grad_norm": 0.42320152607590306, "learning_rate": 4.242502431218305e-06, "loss": 0.0341, "step": 4605 }, { "epoch": 1.7811291569992265, "grad_norm": 0.25657729963827197, "learning_rate": 4.240278696536388e-06, "loss": 0.021, "step": 4606 }, { "epoch": 1.7815158546017016, "grad_norm": 0.3687447478881092, "learning_rate": 4.238055115667937e-06, "loss": 0.0316, "step": 4607 }, { "epoch": 1.7819025522041763, "grad_norm": 0.36823979088872316, "learning_rate": 4.235831689063134e-06, "loss": 0.0251, "step": 4608 }, { "epoch": 1.7822892498066512, "grad_norm": 0.21523157652848088, "learning_rate": 4.233608417172142e-06, "loss": 0.0188, "step": 4609 }, { "epoch": 1.782675947409126, "grad_norm": 0.44992997463843637, "learning_rate": 4.231385300445076e-06, "loss": 0.0378, "step": 4610 }, { "epoch": 1.783062645011601, "grad_norm": 0.2504360469948534, "learning_rate": 4.22916233933204e-06, "loss": 0.0191, "step": 4611 }, { "epoch": 1.7834493426140758, "grad_norm": 0.3176497549540469, "learning_rate": 4.2269395342830875e-06, "loss": 0.0263, "step": 4612 }, { "epoch": 1.7838360402165505, "grad_norm": 0.31244213214853245, "learning_rate": 4.2247168857482525e-06, "loss": 0.0245, "step": 4613 }, { "epoch": 1.7842227378190256, "grad_norm": 0.2643571604413881, "learning_rate": 4.222494394177532e-06, "loss": 0.0207, "step": 4614 }, { "epoch": 1.7846094354215003, "grad_norm": 0.5164481295670084, "learning_rate": 4.220272060020891e-06, "loss": 0.0236, "step": 4615 }, { "epoch": 1.7849961330239754, "grad_norm": 0.2741599533680092, "learning_rate": 4.218049883728267e-06, "loss": 0.021, "step": 4616 }, { "epoch": 1.78538283062645, "grad_norm": 0.34602665262114735, "learning_rate": 4.21582786574956e-06, "loss": 0.0266, "step": 4617 }, { "epoch": 1.785769528228925, "grad_norm": 0.2422855142605583, "learning_rate": 4.213606006534643e-06, "loss": 0.0198, "step": 4618 }, { "epoch": 1.7861562258313999, "grad_norm": 0.3298960412285013, "learning_rate": 4.211384306533352e-06, "loss": 0.0299, "step": 4619 }, { "epoch": 1.7865429234338746, "grad_norm": 0.35711892608682066, "learning_rate": 4.209162766195495e-06, "loss": 0.0314, "step": 4620 }, { "epoch": 1.7869296210363497, "grad_norm": 0.29699515242530405, "learning_rate": 4.206941385970846e-06, "loss": 0.0289, "step": 4621 }, { "epoch": 1.7873163186388243, "grad_norm": 0.3383934498003281, "learning_rate": 4.204720166309145e-06, "loss": 0.0302, "step": 4622 }, { "epoch": 1.7877030162412995, "grad_norm": 0.36285118441009995, "learning_rate": 4.202499107660103e-06, "loss": 0.0408, "step": 4623 }, { "epoch": 1.7880897138437741, "grad_norm": 0.33626962235610813, "learning_rate": 4.200278210473394e-06, "loss": 0.0323, "step": 4624 }, { "epoch": 1.788476411446249, "grad_norm": 0.3051231093979111, "learning_rate": 4.198057475198664e-06, "loss": 0.0272, "step": 4625 }, { "epoch": 1.788863109048724, "grad_norm": 0.30190940190208243, "learning_rate": 4.195836902285522e-06, "loss": 0.0306, "step": 4626 }, { "epoch": 1.7892498066511988, "grad_norm": 0.285629406759103, "learning_rate": 4.193616492183547e-06, "loss": 0.0233, "step": 4627 }, { "epoch": 1.7896365042536737, "grad_norm": 0.3523786932136656, "learning_rate": 4.191396245342285e-06, "loss": 0.0364, "step": 4628 }, { "epoch": 1.7900232018561484, "grad_norm": 0.28644238453634885, "learning_rate": 4.189176162211245e-06, "loss": 0.038, "step": 4629 }, { "epoch": 1.7904098994586235, "grad_norm": 0.2650958106724323, "learning_rate": 4.18695624323991e-06, "loss": 0.0291, "step": 4630 }, { "epoch": 1.7907965970610982, "grad_norm": 0.4203430525544028, "learning_rate": 4.184736488877722e-06, "loss": 0.0239, "step": 4631 }, { "epoch": 1.791183294663573, "grad_norm": 0.3284676543915865, "learning_rate": 4.182516899574096e-06, "loss": 0.0315, "step": 4632 }, { "epoch": 1.791569992266048, "grad_norm": 0.30489829134983326, "learning_rate": 4.1802974757784104e-06, "loss": 0.0331, "step": 4633 }, { "epoch": 1.7919566898685229, "grad_norm": 0.208735714447132, "learning_rate": 4.178078217940007e-06, "loss": 0.0241, "step": 4634 }, { "epoch": 1.7923433874709978, "grad_norm": 0.3129647824780298, "learning_rate": 4.175859126508204e-06, "loss": 0.0328, "step": 4635 }, { "epoch": 1.7927300850734724, "grad_norm": 0.6552305933222358, "learning_rate": 4.173640201932272e-06, "loss": 0.0231, "step": 4636 }, { "epoch": 1.7931167826759475, "grad_norm": 0.24629684896874857, "learning_rate": 4.171421444661463e-06, "loss": 0.0245, "step": 4637 }, { "epoch": 1.7935034802784222, "grad_norm": 0.24774375965542125, "learning_rate": 4.16920285514498e-06, "loss": 0.0254, "step": 4638 }, { "epoch": 1.793890177880897, "grad_norm": 0.3679067840578863, "learning_rate": 4.166984433832007e-06, "loss": 0.0271, "step": 4639 }, { "epoch": 1.794276875483372, "grad_norm": 0.5442091053437496, "learning_rate": 4.164766181171681e-06, "loss": 0.0345, "step": 4640 }, { "epoch": 1.794663573085847, "grad_norm": 0.27969501587415496, "learning_rate": 4.162548097613112e-06, "loss": 0.0242, "step": 4641 }, { "epoch": 1.7950502706883218, "grad_norm": 0.25921019112062255, "learning_rate": 4.160330183605374e-06, "loss": 0.026, "step": 4642 }, { "epoch": 1.7954369682907965, "grad_norm": 0.2728956293934482, "learning_rate": 4.1581124395975055e-06, "loss": 0.0238, "step": 4643 }, { "epoch": 1.7958236658932716, "grad_norm": 0.3496307791739454, "learning_rate": 4.155894866038515e-06, "loss": 0.0334, "step": 4644 }, { "epoch": 1.7962103634957463, "grad_norm": 0.35856992431508267, "learning_rate": 4.15367746337737e-06, "loss": 0.0259, "step": 4645 }, { "epoch": 1.7965970610982211, "grad_norm": 0.24817208125354762, "learning_rate": 4.151460232063008e-06, "loss": 0.0278, "step": 4646 }, { "epoch": 1.796983758700696, "grad_norm": 0.3070666483608851, "learning_rate": 4.149243172544331e-06, "loss": 0.02, "step": 4647 }, { "epoch": 1.797370456303171, "grad_norm": 0.31251580219681, "learning_rate": 4.147026285270204e-06, "loss": 0.0259, "step": 4648 }, { "epoch": 1.7977571539056458, "grad_norm": 0.2988696432331726, "learning_rate": 4.144809570689461e-06, "loss": 0.0292, "step": 4649 }, { "epoch": 1.7981438515081205, "grad_norm": 0.33933043268422675, "learning_rate": 4.142593029250898e-06, "loss": 0.0236, "step": 4650 }, { "epoch": 1.7985305491105956, "grad_norm": 0.2736528779505772, "learning_rate": 4.140376661403277e-06, "loss": 0.0246, "step": 4651 }, { "epoch": 1.7989172467130703, "grad_norm": 0.3562796844984365, "learning_rate": 4.138160467595322e-06, "loss": 0.0393, "step": 4652 }, { "epoch": 1.7993039443155452, "grad_norm": 0.3111652087153986, "learning_rate": 4.1359444482757295e-06, "loss": 0.0329, "step": 4653 }, { "epoch": 1.79969064191802, "grad_norm": 0.34504522338448784, "learning_rate": 4.1337286038931535e-06, "loss": 0.0223, "step": 4654 }, { "epoch": 1.800077339520495, "grad_norm": 0.26455279878409493, "learning_rate": 4.131512934896211e-06, "loss": 0.0206, "step": 4655 }, { "epoch": 1.8004640371229699, "grad_norm": 0.22535755196903068, "learning_rate": 4.129297441733493e-06, "loss": 0.0221, "step": 4656 }, { "epoch": 1.8008507347254445, "grad_norm": 0.2588424556542948, "learning_rate": 4.127082124853544e-06, "loss": 0.0245, "step": 4657 }, { "epoch": 1.8012374323279197, "grad_norm": 0.33812061579888025, "learning_rate": 4.124866984704883e-06, "loss": 0.0333, "step": 4658 }, { "epoch": 1.8016241299303943, "grad_norm": 0.33671483941597263, "learning_rate": 4.1226520217359825e-06, "loss": 0.0371, "step": 4659 }, { "epoch": 1.8020108275328695, "grad_norm": 0.2696179141082581, "learning_rate": 4.12043723639529e-06, "loss": 0.0225, "step": 4660 }, { "epoch": 1.8023975251353441, "grad_norm": 0.22000352371395043, "learning_rate": 4.11822262913121e-06, "loss": 0.0234, "step": 4661 }, { "epoch": 1.802784222737819, "grad_norm": 0.2039267297330506, "learning_rate": 4.1160082003921095e-06, "loss": 0.0157, "step": 4662 }, { "epoch": 1.803170920340294, "grad_norm": 0.36599618365923353, "learning_rate": 4.113793950626327e-06, "loss": 0.0301, "step": 4663 }, { "epoch": 1.8035576179427688, "grad_norm": 0.23135114820618272, "learning_rate": 4.111579880282158e-06, "loss": 0.0229, "step": 4664 }, { "epoch": 1.8039443155452437, "grad_norm": 0.2881968478940684, "learning_rate": 4.1093659898078655e-06, "loss": 0.0219, "step": 4665 }, { "epoch": 1.8043310131477184, "grad_norm": 0.18297090923584233, "learning_rate": 4.107152279651674e-06, "loss": 0.0182, "step": 4666 }, { "epoch": 1.8047177107501935, "grad_norm": 0.3054347168994886, "learning_rate": 4.10493875026177e-06, "loss": 0.0384, "step": 4667 }, { "epoch": 1.8051044083526682, "grad_norm": 0.5845167290252093, "learning_rate": 4.1027254020863086e-06, "loss": 0.0497, "step": 4668 }, { "epoch": 1.805491105955143, "grad_norm": 0.228177940925182, "learning_rate": 4.100512235573403e-06, "loss": 0.0183, "step": 4669 }, { "epoch": 1.805877803557618, "grad_norm": 0.2560562796997449, "learning_rate": 4.098299251171134e-06, "loss": 0.0231, "step": 4670 }, { "epoch": 1.8062645011600929, "grad_norm": 0.5438320851958138, "learning_rate": 4.09608644932754e-06, "loss": 0.0282, "step": 4671 }, { "epoch": 1.8066511987625677, "grad_norm": 0.4460244707383743, "learning_rate": 4.093873830490629e-06, "loss": 0.0302, "step": 4672 }, { "epoch": 1.8070378963650424, "grad_norm": 0.29471203111585786, "learning_rate": 4.0916613951083685e-06, "loss": 0.0299, "step": 4673 }, { "epoch": 1.8074245939675175, "grad_norm": 0.34227633093279897, "learning_rate": 4.089449143628684e-06, "loss": 0.0294, "step": 4674 }, { "epoch": 1.8078112915699922, "grad_norm": 0.5801084725590503, "learning_rate": 4.0872370764994765e-06, "loss": 0.0401, "step": 4675 }, { "epoch": 1.808197989172467, "grad_norm": 0.2894993052656687, "learning_rate": 4.085025194168595e-06, "loss": 0.0238, "step": 4676 }, { "epoch": 1.808584686774942, "grad_norm": 0.25665767775347964, "learning_rate": 4.082813497083863e-06, "loss": 0.0282, "step": 4677 }, { "epoch": 1.808971384377417, "grad_norm": 0.28589932484117064, "learning_rate": 4.080601985693057e-06, "loss": 0.0307, "step": 4678 }, { "epoch": 1.8093580819798918, "grad_norm": 0.4073286840865984, "learning_rate": 4.078390660443926e-06, "loss": 0.0341, "step": 4679 }, { "epoch": 1.8097447795823665, "grad_norm": 0.4023432811256415, "learning_rate": 4.076179521784172e-06, "loss": 0.029, "step": 4680 }, { "epoch": 1.8101314771848416, "grad_norm": 0.3952564507275352, "learning_rate": 4.073968570161463e-06, "loss": 0.0313, "step": 4681 }, { "epoch": 1.8105181747873162, "grad_norm": 0.3997369196585217, "learning_rate": 4.07175780602343e-06, "loss": 0.0427, "step": 4682 }, { "epoch": 1.8109048723897911, "grad_norm": 0.31646100919849623, "learning_rate": 4.069547229817664e-06, "loss": 0.0296, "step": 4683 }, { "epoch": 1.811291569992266, "grad_norm": 0.2584897389279577, "learning_rate": 4.067336841991721e-06, "loss": 0.0254, "step": 4684 }, { "epoch": 1.811678267594741, "grad_norm": 0.2134958142166988, "learning_rate": 4.065126642993115e-06, "loss": 0.0215, "step": 4685 }, { "epoch": 1.8120649651972158, "grad_norm": 0.4131590217823517, "learning_rate": 4.062916633269326e-06, "loss": 0.0248, "step": 4686 }, { "epoch": 1.8124516627996905, "grad_norm": 0.8870540212549141, "learning_rate": 4.060706813267792e-06, "loss": 0.0254, "step": 4687 }, { "epoch": 1.8128383604021656, "grad_norm": 0.3816766148880196, "learning_rate": 4.0584971834359135e-06, "loss": 0.0253, "step": 4688 }, { "epoch": 1.8132250580046403, "grad_norm": 0.2335909686686464, "learning_rate": 4.0562877442210545e-06, "loss": 0.0225, "step": 4689 }, { "epoch": 1.8136117556071152, "grad_norm": 0.3545842514906585, "learning_rate": 4.054078496070537e-06, "loss": 0.0361, "step": 4690 }, { "epoch": 1.81399845320959, "grad_norm": 0.20833632624805476, "learning_rate": 4.051869439431648e-06, "loss": 0.0236, "step": 4691 }, { "epoch": 1.814385150812065, "grad_norm": 0.33026402434259194, "learning_rate": 4.049660574751632e-06, "loss": 0.0287, "step": 4692 }, { "epoch": 1.8147718484145399, "grad_norm": 0.29853869849175363, "learning_rate": 4.0474519024777e-06, "loss": 0.0371, "step": 4693 }, { "epoch": 1.8151585460170145, "grad_norm": 0.2613285643787391, "learning_rate": 4.045243423057018e-06, "loss": 0.0268, "step": 4694 }, { "epoch": 1.8155452436194897, "grad_norm": 0.22960226210586002, "learning_rate": 4.043035136936715e-06, "loss": 0.0238, "step": 4695 }, { "epoch": 1.8159319412219643, "grad_norm": 0.2493523976845017, "learning_rate": 4.040827044563883e-06, "loss": 0.0294, "step": 4696 }, { "epoch": 1.8163186388244394, "grad_norm": 0.17452562712055283, "learning_rate": 4.0386191463855715e-06, "loss": 0.0189, "step": 4697 }, { "epoch": 1.8167053364269141, "grad_norm": 0.20994491737901197, "learning_rate": 4.0364114428487954e-06, "loss": 0.0219, "step": 4698 }, { "epoch": 1.817092034029389, "grad_norm": 0.3435863430919129, "learning_rate": 4.034203934400525e-06, "loss": 0.0275, "step": 4699 }, { "epoch": 1.817478731631864, "grad_norm": 0.21908396671140276, "learning_rate": 4.031996621487693e-06, "loss": 0.0201, "step": 4700 }, { "epoch": 1.8178654292343386, "grad_norm": 0.24869007735610768, "learning_rate": 4.029789504557195e-06, "loss": 0.0226, "step": 4701 }, { "epoch": 1.8182521268368137, "grad_norm": 0.27187005879381243, "learning_rate": 4.027582584055881e-06, "loss": 0.0299, "step": 4702 }, { "epoch": 1.8186388244392884, "grad_norm": 0.20462943634228584, "learning_rate": 4.0253758604305694e-06, "loss": 0.0185, "step": 4703 }, { "epoch": 1.8190255220417635, "grad_norm": 0.40591697879365146, "learning_rate": 4.02316933412803e-06, "loss": 0.0314, "step": 4704 }, { "epoch": 1.8194122196442382, "grad_norm": 0.24831981576274645, "learning_rate": 4.020963005595001e-06, "loss": 0.0267, "step": 4705 }, { "epoch": 1.819798917246713, "grad_norm": 0.29814001360397174, "learning_rate": 4.018756875278175e-06, "loss": 0.0334, "step": 4706 }, { "epoch": 1.820185614849188, "grad_norm": 0.19523822123682505, "learning_rate": 4.016550943624203e-06, "loss": 0.0219, "step": 4707 }, { "epoch": 1.8205723124516628, "grad_norm": 0.2385052122982582, "learning_rate": 4.014345211079703e-06, "loss": 0.0188, "step": 4708 }, { "epoch": 1.8209590100541377, "grad_norm": 0.22840025310836054, "learning_rate": 4.012139678091246e-06, "loss": 0.0231, "step": 4709 }, { "epoch": 1.8213457076566124, "grad_norm": 0.4330613822695142, "learning_rate": 4.0099343451053666e-06, "loss": 0.0275, "step": 4710 }, { "epoch": 1.8217324052590875, "grad_norm": 0.34768422228613266, "learning_rate": 4.007729212568556e-06, "loss": 0.0224, "step": 4711 }, { "epoch": 1.8221191028615622, "grad_norm": 0.30281445611067354, "learning_rate": 4.005524280927267e-06, "loss": 0.0292, "step": 4712 }, { "epoch": 1.822505800464037, "grad_norm": 0.25859846367274486, "learning_rate": 4.003319550627912e-06, "loss": 0.024, "step": 4713 }, { "epoch": 1.822892498066512, "grad_norm": 0.20332450033060154, "learning_rate": 4.001115022116858e-06, "loss": 0.0185, "step": 4714 }, { "epoch": 1.8232791956689869, "grad_norm": 0.2852238650446646, "learning_rate": 3.998910695840437e-06, "loss": 0.0338, "step": 4715 }, { "epoch": 1.8236658932714618, "grad_norm": 0.3234924484756794, "learning_rate": 3.996706572244938e-06, "loss": 0.0322, "step": 4716 }, { "epoch": 1.8240525908739365, "grad_norm": 0.2636726017029993, "learning_rate": 3.99450265177661e-06, "loss": 0.0239, "step": 4717 }, { "epoch": 1.8244392884764116, "grad_norm": 0.22766429362292806, "learning_rate": 3.992298934881655e-06, "loss": 0.0265, "step": 4718 }, { "epoch": 1.8248259860788862, "grad_norm": 0.2814404142176881, "learning_rate": 3.990095422006244e-06, "loss": 0.0326, "step": 4719 }, { "epoch": 1.8252126836813611, "grad_norm": 0.3259555823778973, "learning_rate": 3.9878921135964984e-06, "loss": 0.0241, "step": 4720 }, { "epoch": 1.825599381283836, "grad_norm": 0.30639104950741536, "learning_rate": 3.985689010098498e-06, "loss": 0.0227, "step": 4721 }, { "epoch": 1.825986078886311, "grad_norm": 0.33344908791948213, "learning_rate": 3.983486111958289e-06, "loss": 0.0239, "step": 4722 }, { "epoch": 1.8263727764887858, "grad_norm": 0.2746852694463452, "learning_rate": 3.981283419621867e-06, "loss": 0.0229, "step": 4723 }, { "epoch": 1.8267594740912605, "grad_norm": 0.31258177581423796, "learning_rate": 3.979080933535194e-06, "loss": 0.0228, "step": 4724 }, { "epoch": 1.8271461716937356, "grad_norm": 0.8437048673141683, "learning_rate": 3.976878654144181e-06, "loss": 0.043, "step": 4725 }, { "epoch": 1.8275328692962103, "grad_norm": 0.6925508451154547, "learning_rate": 3.974676581894708e-06, "loss": 0.033, "step": 4726 }, { "epoch": 1.8279195668986852, "grad_norm": 0.26394219660789836, "learning_rate": 3.972474717232604e-06, "loss": 0.0205, "step": 4727 }, { "epoch": 1.82830626450116, "grad_norm": 0.20329477084936057, "learning_rate": 3.970273060603658e-06, "loss": 0.0176, "step": 4728 }, { "epoch": 1.828692962103635, "grad_norm": 0.2464698600061185, "learning_rate": 3.968071612453622e-06, "loss": 0.0231, "step": 4729 }, { "epoch": 1.8290796597061099, "grad_norm": 0.4169575480712342, "learning_rate": 3.965870373228199e-06, "loss": 0.0205, "step": 4730 }, { "epoch": 1.8294663573085845, "grad_norm": 0.31497020318982744, "learning_rate": 3.963669343373055e-06, "loss": 0.0349, "step": 4731 }, { "epoch": 1.8298530549110597, "grad_norm": 0.637428279777258, "learning_rate": 3.9614685233338084e-06, "loss": 0.0286, "step": 4732 }, { "epoch": 1.8302397525135343, "grad_norm": 0.256457753609932, "learning_rate": 3.959267913556041e-06, "loss": 0.028, "step": 4733 }, { "epoch": 1.8306264501160094, "grad_norm": 0.3087908627397226, "learning_rate": 3.957067514485289e-06, "loss": 0.0255, "step": 4734 }, { "epoch": 1.8310131477184841, "grad_norm": 0.30970354520722543, "learning_rate": 3.954867326567043e-06, "loss": 0.0353, "step": 4735 }, { "epoch": 1.831399845320959, "grad_norm": 0.2622958550497605, "learning_rate": 3.952667350246758e-06, "loss": 0.0203, "step": 4736 }, { "epoch": 1.831786542923434, "grad_norm": 0.2897036833461201, "learning_rate": 3.9504675859698385e-06, "loss": 0.0234, "step": 4737 }, { "epoch": 1.8321732405259086, "grad_norm": 0.27643543150038796, "learning_rate": 3.948268034181652e-06, "loss": 0.0229, "step": 4738 }, { "epoch": 1.8325599381283837, "grad_norm": 0.2703579891390442, "learning_rate": 3.946068695327519e-06, "loss": 0.0231, "step": 4739 }, { "epoch": 1.8329466357308584, "grad_norm": 0.20935239125776867, "learning_rate": 3.94386956985272e-06, "loss": 0.0193, "step": 4740 }, { "epoch": 1.8333333333333335, "grad_norm": 0.32298176554227986, "learning_rate": 3.941670658202491e-06, "loss": 0.0306, "step": 4741 }, { "epoch": 1.8337200309358082, "grad_norm": 0.22223214330124233, "learning_rate": 3.93947196082202e-06, "loss": 0.0149, "step": 4742 }, { "epoch": 1.834106728538283, "grad_norm": 0.3385263708614642, "learning_rate": 3.937273478156462e-06, "loss": 0.029, "step": 4743 }, { "epoch": 1.834493426140758, "grad_norm": 0.35055888535361124, "learning_rate": 3.935075210650918e-06, "loss": 0.0285, "step": 4744 }, { "epoch": 1.8348801237432328, "grad_norm": 0.33913382164003536, "learning_rate": 3.9328771587504555e-06, "loss": 0.0303, "step": 4745 }, { "epoch": 1.8352668213457077, "grad_norm": 0.25059138295856764, "learning_rate": 3.930679322900088e-06, "loss": 0.0253, "step": 4746 }, { "epoch": 1.8356535189481824, "grad_norm": 0.21705584818752446, "learning_rate": 3.928481703544792e-06, "loss": 0.0212, "step": 4747 }, { "epoch": 1.8360402165506575, "grad_norm": 0.34202663223868884, "learning_rate": 3.926284301129498e-06, "loss": 0.0256, "step": 4748 }, { "epoch": 1.8364269141531322, "grad_norm": 0.3043645228205641, "learning_rate": 3.924087116099093e-06, "loss": 0.0211, "step": 4749 }, { "epoch": 1.836813611755607, "grad_norm": 0.2982516789744802, "learning_rate": 3.921890148898421e-06, "loss": 0.0214, "step": 4750 }, { "epoch": 1.837200309358082, "grad_norm": 0.26711985559577783, "learning_rate": 3.919693399972277e-06, "loss": 0.0281, "step": 4751 }, { "epoch": 1.8375870069605569, "grad_norm": 0.31877983810558713, "learning_rate": 3.917496869765422e-06, "loss": 0.031, "step": 4752 }, { "epoch": 1.8379737045630318, "grad_norm": 0.31432733000137536, "learning_rate": 3.915300558722562e-06, "loss": 0.0274, "step": 4753 }, { "epoch": 1.8383604021655064, "grad_norm": 0.27792860524466695, "learning_rate": 3.913104467288362e-06, "loss": 0.0261, "step": 4754 }, { "epoch": 1.8387470997679816, "grad_norm": 0.2684112263248608, "learning_rate": 3.910908595907447e-06, "loss": 0.0241, "step": 4755 }, { "epoch": 1.8391337973704562, "grad_norm": 0.29177551950477426, "learning_rate": 3.908712945024391e-06, "loss": 0.0272, "step": 4756 }, { "epoch": 1.8395204949729311, "grad_norm": 0.3781669568773716, "learning_rate": 3.906517515083729e-06, "loss": 0.0336, "step": 4757 }, { "epoch": 1.839907192575406, "grad_norm": 0.25255149314695663, "learning_rate": 3.9043223065299464e-06, "loss": 0.024, "step": 4758 }, { "epoch": 1.840293890177881, "grad_norm": 0.46371344101292344, "learning_rate": 3.9021273198074875e-06, "loss": 0.0284, "step": 4759 }, { "epoch": 1.8406805877803558, "grad_norm": 0.28684608531899974, "learning_rate": 3.899932555360749e-06, "loss": 0.024, "step": 4760 }, { "epoch": 1.8410672853828305, "grad_norm": 0.32936509931720487, "learning_rate": 3.897738013634083e-06, "loss": 0.0308, "step": 4761 }, { "epoch": 1.8414539829853056, "grad_norm": 0.29001103805917744, "learning_rate": 3.8955436950718e-06, "loss": 0.0236, "step": 4762 }, { "epoch": 1.8418406805877803, "grad_norm": 0.49177824055301467, "learning_rate": 3.893349600118159e-06, "loss": 0.0292, "step": 4763 }, { "epoch": 1.8422273781902552, "grad_norm": 0.41644065831392785, "learning_rate": 3.891155729217381e-06, "loss": 0.0314, "step": 4764 }, { "epoch": 1.84261407579273, "grad_norm": 0.42916371116022384, "learning_rate": 3.888962082813635e-06, "loss": 0.0213, "step": 4765 }, { "epoch": 1.843000773395205, "grad_norm": 0.3094721307730446, "learning_rate": 3.886768661351049e-06, "loss": 0.0195, "step": 4766 }, { "epoch": 1.8433874709976799, "grad_norm": 0.3718094509633933, "learning_rate": 3.8845754652737054e-06, "loss": 0.0184, "step": 4767 }, { "epoch": 1.8437741686001545, "grad_norm": 0.3072541028498652, "learning_rate": 3.882382495025635e-06, "loss": 0.0233, "step": 4768 }, { "epoch": 1.8441608662026296, "grad_norm": 0.26688683223899995, "learning_rate": 3.8801897510508325e-06, "loss": 0.0207, "step": 4769 }, { "epoch": 1.8445475638051043, "grad_norm": 0.25609199576978803, "learning_rate": 3.877997233793237e-06, "loss": 0.0247, "step": 4770 }, { "epoch": 1.8449342614075794, "grad_norm": 0.2703487603728657, "learning_rate": 3.875804943696751e-06, "loss": 0.0268, "step": 4771 }, { "epoch": 1.845320959010054, "grad_norm": 0.2884369971620011, "learning_rate": 3.873612881205221e-06, "loss": 0.0247, "step": 4772 }, { "epoch": 1.845707656612529, "grad_norm": 0.2873268408858321, "learning_rate": 3.8714210467624595e-06, "loss": 0.0357, "step": 4773 }, { "epoch": 1.846094354215004, "grad_norm": 0.26415613387569475, "learning_rate": 3.86922944081222e-06, "loss": 0.026, "step": 4774 }, { "epoch": 1.8464810518174786, "grad_norm": 0.22741224632846996, "learning_rate": 3.867038063798219e-06, "loss": 0.0186, "step": 4775 }, { "epoch": 1.8468677494199537, "grad_norm": 0.29639802807232596, "learning_rate": 3.864846916164123e-06, "loss": 0.0223, "step": 4776 }, { "epoch": 1.8472544470224284, "grad_norm": 0.4776055212562411, "learning_rate": 3.8626559983535525e-06, "loss": 0.0301, "step": 4777 }, { "epoch": 1.8476411446249035, "grad_norm": 0.38867388913021383, "learning_rate": 3.8604653108100814e-06, "loss": 0.0361, "step": 4778 }, { "epoch": 1.8480278422273781, "grad_norm": 0.29042936246745193, "learning_rate": 3.858274853977237e-06, "loss": 0.0276, "step": 4779 }, { "epoch": 1.848414539829853, "grad_norm": 0.34626034351043344, "learning_rate": 3.856084628298501e-06, "loss": 0.0441, "step": 4780 }, { "epoch": 1.848801237432328, "grad_norm": 0.22551017573168375, "learning_rate": 3.853894634217307e-06, "loss": 0.0252, "step": 4781 }, { "epoch": 1.8491879350348028, "grad_norm": 0.5067924717579745, "learning_rate": 3.85170487217704e-06, "loss": 0.0383, "step": 4782 }, { "epoch": 1.8495746326372777, "grad_norm": 0.37708685707964185, "learning_rate": 3.849515342621043e-06, "loss": 0.0266, "step": 4783 }, { "epoch": 1.8499613302397524, "grad_norm": 0.25422085979210773, "learning_rate": 3.8473260459926065e-06, "loss": 0.0237, "step": 4784 }, { "epoch": 1.8503480278422275, "grad_norm": 0.2854534206778196, "learning_rate": 3.8451369827349785e-06, "loss": 0.0284, "step": 4785 }, { "epoch": 1.8507347254447022, "grad_norm": 0.30636699316260957, "learning_rate": 3.842948153291358e-06, "loss": 0.0321, "step": 4786 }, { "epoch": 1.851121423047177, "grad_norm": 0.8718506517331751, "learning_rate": 3.840759558104892e-06, "loss": 0.0297, "step": 4787 }, { "epoch": 1.851508120649652, "grad_norm": 0.36542508649735606, "learning_rate": 3.83857119761869e-06, "loss": 0.0244, "step": 4788 }, { "epoch": 1.8518948182521269, "grad_norm": 0.2655952235084073, "learning_rate": 3.836383072275802e-06, "loss": 0.025, "step": 4789 }, { "epoch": 1.8522815158546018, "grad_norm": 0.3548180522007978, "learning_rate": 3.834195182519243e-06, "loss": 0.0284, "step": 4790 }, { "epoch": 1.8526682134570764, "grad_norm": 0.2848248620970612, "learning_rate": 3.832007528791969e-06, "loss": 0.0229, "step": 4791 }, { "epoch": 1.8530549110595516, "grad_norm": 0.30716797116248407, "learning_rate": 3.8298201115368986e-06, "loss": 0.0256, "step": 4792 }, { "epoch": 1.8534416086620262, "grad_norm": 0.26601664742421033, "learning_rate": 3.827632931196892e-06, "loss": 0.0293, "step": 4793 }, { "epoch": 1.8538283062645011, "grad_norm": 0.4153342954484787, "learning_rate": 3.825445988214768e-06, "loss": 0.0395, "step": 4794 }, { "epoch": 1.854215003866976, "grad_norm": 0.2527897001408714, "learning_rate": 3.823259283033297e-06, "loss": 0.0277, "step": 4795 }, { "epoch": 1.854601701469451, "grad_norm": 0.41284442855480985, "learning_rate": 3.821072816095198e-06, "loss": 0.0319, "step": 4796 }, { "epoch": 1.8549883990719258, "grad_norm": 0.24936324089557232, "learning_rate": 3.818886587843147e-06, "loss": 0.022, "step": 4797 }, { "epoch": 1.8553750966744005, "grad_norm": 0.2751519552224818, "learning_rate": 3.816700598719766e-06, "loss": 0.0337, "step": 4798 }, { "epoch": 1.8557617942768756, "grad_norm": 0.3772287419315525, "learning_rate": 3.8145148491676342e-06, "loss": 0.0253, "step": 4799 }, { "epoch": 1.8561484918793503, "grad_norm": 0.25105260061025464, "learning_rate": 3.812329339629276e-06, "loss": 0.0267, "step": 4800 }, { "epoch": 1.8565351894818252, "grad_norm": 0.41074390560438234, "learning_rate": 3.8101440705471716e-06, "loss": 0.0294, "step": 4801 }, { "epoch": 1.8569218870843, "grad_norm": 0.2908983313725067, "learning_rate": 3.8079590423637525e-06, "loss": 0.0252, "step": 4802 }, { "epoch": 1.857308584686775, "grad_norm": 0.3066772317736524, "learning_rate": 3.8057742555213983e-06, "loss": 0.0344, "step": 4803 }, { "epoch": 1.8576952822892498, "grad_norm": 0.3608170688458364, "learning_rate": 3.8035897104624436e-06, "loss": 0.0279, "step": 4804 }, { "epoch": 1.8580819798917245, "grad_norm": 0.4834547069182079, "learning_rate": 3.8014054076291705e-06, "loss": 0.0284, "step": 4805 }, { "epoch": 1.8584686774941996, "grad_norm": 0.25863537705808254, "learning_rate": 3.7992213474638166e-06, "loss": 0.0213, "step": 4806 }, { "epoch": 1.8588553750966743, "grad_norm": 0.3401043120776392, "learning_rate": 3.7970375304085666e-06, "loss": 0.0248, "step": 4807 }, { "epoch": 1.8592420726991494, "grad_norm": 0.5106900040176883, "learning_rate": 3.7948539569055533e-06, "loss": 0.0297, "step": 4808 }, { "epoch": 1.859628770301624, "grad_norm": 0.34639570015853394, "learning_rate": 3.7926706273968692e-06, "loss": 0.0319, "step": 4809 }, { "epoch": 1.860015467904099, "grad_norm": 0.2467433792076407, "learning_rate": 3.7904875423245465e-06, "loss": 0.0193, "step": 4810 }, { "epoch": 1.860402165506574, "grad_norm": 0.27956059867934974, "learning_rate": 3.7883047021305796e-06, "loss": 0.0211, "step": 4811 }, { "epoch": 1.8607888631090486, "grad_norm": 0.2731124918541051, "learning_rate": 3.786122107256901e-06, "loss": 0.02, "step": 4812 }, { "epoch": 1.8611755607115237, "grad_norm": 0.2931165905014588, "learning_rate": 3.783939758145406e-06, "loss": 0.0285, "step": 4813 }, { "epoch": 1.8615622583139984, "grad_norm": 0.2469983913736181, "learning_rate": 3.7817576552379288e-06, "loss": 0.0176, "step": 4814 }, { "epoch": 1.8619489559164735, "grad_norm": 0.28931256849860304, "learning_rate": 3.7795757989762595e-06, "loss": 0.0208, "step": 4815 }, { "epoch": 1.8623356535189481, "grad_norm": 0.25685507597711354, "learning_rate": 3.77739418980214e-06, "loss": 0.0203, "step": 4816 }, { "epoch": 1.862722351121423, "grad_norm": 0.3314108192992911, "learning_rate": 3.775212828157257e-06, "loss": 0.0299, "step": 4817 }, { "epoch": 1.863109048723898, "grad_norm": 0.30196924622151256, "learning_rate": 3.7730317144832524e-06, "loss": 0.0317, "step": 4818 }, { "epoch": 1.8634957463263728, "grad_norm": 0.23812030921674338, "learning_rate": 3.7708508492217135e-06, "loss": 0.0189, "step": 4819 }, { "epoch": 1.8638824439288477, "grad_norm": 0.37812322985078795, "learning_rate": 3.7686702328141787e-06, "loss": 0.0419, "step": 4820 }, { "epoch": 1.8642691415313224, "grad_norm": 0.24597278848041626, "learning_rate": 3.766489865702139e-06, "loss": 0.0202, "step": 4821 }, { "epoch": 1.8646558391337975, "grad_norm": 0.21969719343105837, "learning_rate": 3.7643097483270287e-06, "loss": 0.0266, "step": 4822 }, { "epoch": 1.8650425367362722, "grad_norm": 0.3423914347244921, "learning_rate": 3.7621298811302388e-06, "loss": 0.0247, "step": 4823 }, { "epoch": 1.865429234338747, "grad_norm": 0.4003643497115828, "learning_rate": 3.7599502645531028e-06, "loss": 0.0494, "step": 4824 }, { "epoch": 1.865815931941222, "grad_norm": 0.3623169699012008, "learning_rate": 3.7577708990369093e-06, "loss": 0.0329, "step": 4825 }, { "epoch": 1.8662026295436969, "grad_norm": 0.25472060486827836, "learning_rate": 3.755591785022893e-06, "loss": 0.0206, "step": 4826 }, { "epoch": 1.8665893271461718, "grad_norm": 0.36028548630987556, "learning_rate": 3.7534129229522366e-06, "loss": 0.0262, "step": 4827 }, { "epoch": 1.8669760247486464, "grad_norm": 0.3553328914397371, "learning_rate": 3.751234313266075e-06, "loss": 0.0391, "step": 4828 }, { "epoch": 1.8673627223511216, "grad_norm": 0.27614372308765506, "learning_rate": 3.7490559564054884e-06, "loss": 0.0368, "step": 4829 }, { "epoch": 1.8677494199535962, "grad_norm": 0.26849613651810844, "learning_rate": 3.746877852811511e-06, "loss": 0.0245, "step": 4830 }, { "epoch": 1.8681361175560711, "grad_norm": 0.2832855809243205, "learning_rate": 3.744700002925119e-06, "loss": 0.0221, "step": 4831 }, { "epoch": 1.868522815158546, "grad_norm": 0.2790297160020414, "learning_rate": 3.7425224071872436e-06, "loss": 0.033, "step": 4832 }, { "epoch": 1.868909512761021, "grad_norm": 0.2728115934401318, "learning_rate": 3.7403450660387613e-06, "loss": 0.0358, "step": 4833 }, { "epoch": 1.8692962103634958, "grad_norm": 0.3113506499520934, "learning_rate": 3.7381679799204928e-06, "loss": 0.0183, "step": 4834 }, { "epoch": 1.8696829079659705, "grad_norm": 0.22714989536382887, "learning_rate": 3.7359911492732192e-06, "loss": 0.0214, "step": 4835 }, { "epoch": 1.8700696055684456, "grad_norm": 0.34578116652162666, "learning_rate": 3.7338145745376553e-06, "loss": 0.0319, "step": 4836 }, { "epoch": 1.8704563031709203, "grad_norm": 0.2570505793643324, "learning_rate": 3.7316382561544783e-06, "loss": 0.0345, "step": 4837 }, { "epoch": 1.8708430007733952, "grad_norm": 0.24314031015474794, "learning_rate": 3.7294621945642988e-06, "loss": 0.024, "step": 4838 }, { "epoch": 1.87122969837587, "grad_norm": 0.2838833060150782, "learning_rate": 3.7272863902076906e-06, "loss": 0.0255, "step": 4839 }, { "epoch": 1.871616395978345, "grad_norm": 0.2570320866361453, "learning_rate": 3.7251108435251636e-06, "loss": 0.0204, "step": 4840 }, { "epoch": 1.8720030935808198, "grad_norm": 0.4424239504951086, "learning_rate": 3.722935554957179e-06, "loss": 0.0412, "step": 4841 }, { "epoch": 1.8723897911832945, "grad_norm": 0.19540227848065367, "learning_rate": 3.720760524944149e-06, "loss": 0.0146, "step": 4842 }, { "epoch": 1.8727764887857696, "grad_norm": 0.2400152815450509, "learning_rate": 3.7185857539264297e-06, "loss": 0.0285, "step": 4843 }, { "epoch": 1.8731631863882443, "grad_norm": 0.23782375290570973, "learning_rate": 3.716411242344328e-06, "loss": 0.0213, "step": 4844 }, { "epoch": 1.8735498839907194, "grad_norm": 0.2818205856781151, "learning_rate": 3.7142369906380925e-06, "loss": 0.0249, "step": 4845 }, { "epoch": 1.873936581593194, "grad_norm": 0.28428163354800107, "learning_rate": 3.7120629992479267e-06, "loss": 0.0281, "step": 4846 }, { "epoch": 1.874323279195669, "grad_norm": 0.2728509338187203, "learning_rate": 3.7098892686139764e-06, "loss": 0.0298, "step": 4847 }, { "epoch": 1.8747099767981439, "grad_norm": 0.3746980254476846, "learning_rate": 3.7077157991763352e-06, "loss": 0.0245, "step": 4848 }, { "epoch": 1.8750966744006186, "grad_norm": 0.24900389478132806, "learning_rate": 3.7055425913750467e-06, "loss": 0.0253, "step": 4849 }, { "epoch": 1.8754833720030937, "grad_norm": 0.20781269576765038, "learning_rate": 3.7033696456500955e-06, "loss": 0.023, "step": 4850 }, { "epoch": 1.8758700696055683, "grad_norm": 0.23336128542670748, "learning_rate": 3.701196962441421e-06, "loss": 0.0207, "step": 4851 }, { "epoch": 1.8762567672080435, "grad_norm": 0.29932318165413035, "learning_rate": 3.6990245421889035e-06, "loss": 0.0244, "step": 4852 }, { "epoch": 1.8766434648105181, "grad_norm": 0.335689595027374, "learning_rate": 3.6968523853323733e-06, "loss": 0.0349, "step": 4853 }, { "epoch": 1.877030162412993, "grad_norm": 0.3461760781904064, "learning_rate": 3.694680492311606e-06, "loss": 0.0472, "step": 4854 }, { "epoch": 1.877416860015468, "grad_norm": 0.2647095028044157, "learning_rate": 3.6925088635663197e-06, "loss": 0.0277, "step": 4855 }, { "epoch": 1.8778035576179428, "grad_norm": 0.22652762040421576, "learning_rate": 3.6903374995361907e-06, "loss": 0.0187, "step": 4856 }, { "epoch": 1.8781902552204177, "grad_norm": 0.27920995919271707, "learning_rate": 3.6881664006608265e-06, "loss": 0.0335, "step": 4857 }, { "epoch": 1.8785769528228924, "grad_norm": 0.27643125013492786, "learning_rate": 3.685995567379795e-06, "loss": 0.0406, "step": 4858 }, { "epoch": 1.8789636504253675, "grad_norm": 0.3016399768981266, "learning_rate": 3.6838250001326003e-06, "loss": 0.0333, "step": 4859 }, { "epoch": 1.8793503480278422, "grad_norm": 0.28152308812550414, "learning_rate": 3.681654699358696e-06, "loss": 0.0234, "step": 4860 }, { "epoch": 1.879737045630317, "grad_norm": 0.22586853588679115, "learning_rate": 3.6794846654974847e-06, "loss": 0.0224, "step": 4861 }, { "epoch": 1.880123743232792, "grad_norm": 0.31133249211030195, "learning_rate": 3.677314898988309e-06, "loss": 0.0236, "step": 4862 }, { "epoch": 1.8805104408352669, "grad_norm": 0.22088234152081782, "learning_rate": 3.6751454002704633e-06, "loss": 0.0203, "step": 4863 }, { "epoch": 1.8808971384377418, "grad_norm": 1.240733284788999, "learning_rate": 3.672976169783184e-06, "loss": 0.0217, "step": 4864 }, { "epoch": 1.8812838360402164, "grad_norm": 0.3564144912811748, "learning_rate": 3.6708072079656553e-06, "loss": 0.031, "step": 4865 }, { "epoch": 1.8816705336426915, "grad_norm": 0.3325478628839708, "learning_rate": 3.6686385152570046e-06, "loss": 0.0254, "step": 4866 }, { "epoch": 1.8820572312451662, "grad_norm": 0.44177754662110646, "learning_rate": 3.666470092096307e-06, "loss": 0.0413, "step": 4867 }, { "epoch": 1.8824439288476411, "grad_norm": 0.2773272088133475, "learning_rate": 3.6643019389225826e-06, "loss": 0.0208, "step": 4868 }, { "epoch": 1.882830626450116, "grad_norm": 0.29260661647853126, "learning_rate": 3.6621340561747953e-06, "loss": 0.0318, "step": 4869 }, { "epoch": 1.883217324052591, "grad_norm": 0.2885213938787049, "learning_rate": 3.659966444291857e-06, "loss": 0.0292, "step": 4870 }, { "epoch": 1.8836040216550658, "grad_norm": 0.4251583695918771, "learning_rate": 3.6577991037126213e-06, "loss": 0.0318, "step": 4871 }, { "epoch": 1.8839907192575405, "grad_norm": 0.8842448234998134, "learning_rate": 3.6556320348758915e-06, "loss": 0.0205, "step": 4872 }, { "epoch": 1.8843774168600156, "grad_norm": 0.27485358692554085, "learning_rate": 3.653465238220413e-06, "loss": 0.0321, "step": 4873 }, { "epoch": 1.8847641144624903, "grad_norm": 0.24468561071885025, "learning_rate": 3.651298714184871e-06, "loss": 0.0216, "step": 4874 }, { "epoch": 1.8851508120649652, "grad_norm": 0.2479582324269409, "learning_rate": 3.6491324632079085e-06, "loss": 0.0245, "step": 4875 }, { "epoch": 1.88553750966744, "grad_norm": 0.36619132796429893, "learning_rate": 3.6469664857280984e-06, "loss": 0.0284, "step": 4876 }, { "epoch": 1.885924207269915, "grad_norm": 0.27935800599660265, "learning_rate": 3.644800782183972e-06, "loss": 0.0308, "step": 4877 }, { "epoch": 1.8863109048723898, "grad_norm": 0.24414212170069, "learning_rate": 3.642635353013992e-06, "loss": 0.0227, "step": 4878 }, { "epoch": 1.8866976024748645, "grad_norm": 0.3626897404919003, "learning_rate": 3.6404701986565782e-06, "loss": 0.0328, "step": 4879 }, { "epoch": 1.8870843000773396, "grad_norm": 0.27407504679161043, "learning_rate": 3.638305319550084e-06, "loss": 0.0291, "step": 4880 }, { "epoch": 1.8874709976798143, "grad_norm": 0.42042502267617127, "learning_rate": 3.6361407161328128e-06, "loss": 0.0266, "step": 4881 }, { "epoch": 1.8878576952822892, "grad_norm": 0.27557588721163256, "learning_rate": 3.6339763888430123e-06, "loss": 0.0256, "step": 4882 }, { "epoch": 1.888244392884764, "grad_norm": 0.2297064292591118, "learning_rate": 3.6318123381188707e-06, "loss": 0.0228, "step": 4883 }, { "epoch": 1.888631090487239, "grad_norm": 0.3308299856075331, "learning_rate": 3.629648564398526e-06, "loss": 0.0218, "step": 4884 }, { "epoch": 1.8890177880897139, "grad_norm": 0.312988912540579, "learning_rate": 3.6274850681200526e-06, "loss": 0.0266, "step": 4885 }, { "epoch": 1.8894044856921886, "grad_norm": 0.2671574724127383, "learning_rate": 3.6253218497214758e-06, "loss": 0.0252, "step": 4886 }, { "epoch": 1.8897911832946637, "grad_norm": 0.28877067917612365, "learning_rate": 3.623158909640761e-06, "loss": 0.0261, "step": 4887 }, { "epoch": 1.8901778808971383, "grad_norm": 0.2792234172832205, "learning_rate": 3.6209962483158167e-06, "loss": 0.0258, "step": 4888 }, { "epoch": 1.8905645784996135, "grad_norm": 0.2524604356426204, "learning_rate": 3.6188338661844975e-06, "loss": 0.0192, "step": 4889 }, { "epoch": 1.8909512761020881, "grad_norm": 0.2759238731622538, "learning_rate": 3.616671763684598e-06, "loss": 0.0225, "step": 4890 }, { "epoch": 1.891337973704563, "grad_norm": 0.2735202094818874, "learning_rate": 3.614509941253862e-06, "loss": 0.0292, "step": 4891 }, { "epoch": 1.891724671307038, "grad_norm": 0.34303597199759284, "learning_rate": 3.612348399329969e-06, "loss": 0.0393, "step": 4892 }, { "epoch": 1.8921113689095128, "grad_norm": 0.3330239416362149, "learning_rate": 3.6101871383505487e-06, "loss": 0.0298, "step": 4893 }, { "epoch": 1.8924980665119877, "grad_norm": 0.2616878033396317, "learning_rate": 3.6080261587531697e-06, "loss": 0.0268, "step": 4894 }, { "epoch": 1.8928847641144624, "grad_norm": 0.3445987639735293, "learning_rate": 3.605865460975343e-06, "loss": 0.0308, "step": 4895 }, { "epoch": 1.8932714617169375, "grad_norm": 0.4445949359592012, "learning_rate": 3.603705045454527e-06, "loss": 0.0308, "step": 4896 }, { "epoch": 1.8936581593194122, "grad_norm": 0.3301070744397786, "learning_rate": 3.6015449126281177e-06, "loss": 0.0358, "step": 4897 }, { "epoch": 1.894044856921887, "grad_norm": 0.2860898438983867, "learning_rate": 3.599385062933459e-06, "loss": 0.0188, "step": 4898 }, { "epoch": 1.894431554524362, "grad_norm": 0.2769884127420434, "learning_rate": 3.5972254968078346e-06, "loss": 0.024, "step": 4899 }, { "epoch": 1.8948182521268369, "grad_norm": 0.26256625414301754, "learning_rate": 3.5950662146884673e-06, "loss": 0.0208, "step": 4900 }, { "epoch": 1.8952049497293117, "grad_norm": 0.2932164337166673, "learning_rate": 3.5929072170125316e-06, "loss": 0.0277, "step": 4901 }, { "epoch": 1.8955916473317864, "grad_norm": 0.2619343992214239, "learning_rate": 3.5907485042171343e-06, "loss": 0.0217, "step": 4902 }, { "epoch": 1.8959783449342615, "grad_norm": 0.27847890538064535, "learning_rate": 3.5885900767393334e-06, "loss": 0.0335, "step": 4903 }, { "epoch": 1.8963650425367362, "grad_norm": 0.3123014620097982, "learning_rate": 3.58643193501612e-06, "loss": 0.0195, "step": 4904 }, { "epoch": 1.896751740139211, "grad_norm": 0.30989875152780816, "learning_rate": 3.5842740794844383e-06, "loss": 0.0279, "step": 4905 }, { "epoch": 1.897138437741686, "grad_norm": 0.3165546428889481, "learning_rate": 3.582116510581164e-06, "loss": 0.0282, "step": 4906 }, { "epoch": 1.897525135344161, "grad_norm": 0.26190922622597196, "learning_rate": 3.5799592287431206e-06, "loss": 0.0255, "step": 4907 }, { "epoch": 1.8979118329466358, "grad_norm": 0.23986732614529738, "learning_rate": 3.5778022344070727e-06, "loss": 0.0177, "step": 4908 }, { "epoch": 1.8982985305491105, "grad_norm": 0.4187053573613224, "learning_rate": 3.5756455280097245e-06, "loss": 0.0363, "step": 4909 }, { "epoch": 1.8986852281515856, "grad_norm": 0.5278316076439377, "learning_rate": 3.5734891099877268e-06, "loss": 0.0406, "step": 4910 }, { "epoch": 1.8990719257540603, "grad_norm": 0.27707652923830906, "learning_rate": 3.5713329807776654e-06, "loss": 0.0305, "step": 4911 }, { "epoch": 1.8994586233565351, "grad_norm": 0.22639463142796024, "learning_rate": 3.5691771408160736e-06, "loss": 0.0187, "step": 4912 }, { "epoch": 1.89984532095901, "grad_norm": 0.21846644103767154, "learning_rate": 3.567021590539423e-06, "loss": 0.0199, "step": 4913 }, { "epoch": 1.900232018561485, "grad_norm": 0.3831117652963353, "learning_rate": 3.564866330384125e-06, "loss": 0.0403, "step": 4914 }, { "epoch": 1.9006187161639598, "grad_norm": 0.49416704121031046, "learning_rate": 3.5627113607865378e-06, "loss": 0.0268, "step": 4915 }, { "epoch": 1.9010054137664345, "grad_norm": 0.37387895763361256, "learning_rate": 3.560556682182954e-06, "loss": 0.0213, "step": 4916 }, { "epoch": 1.9013921113689096, "grad_norm": 0.2666566138776922, "learning_rate": 3.5584022950096135e-06, "loss": 0.0252, "step": 4917 }, { "epoch": 1.9017788089713843, "grad_norm": 0.3014946604491024, "learning_rate": 3.556248199702692e-06, "loss": 0.0256, "step": 4918 }, { "epoch": 1.9021655065738592, "grad_norm": 0.3608872181298916, "learning_rate": 3.554094396698311e-06, "loss": 0.0356, "step": 4919 }, { "epoch": 1.902552204176334, "grad_norm": 0.3058852085979234, "learning_rate": 3.55194088643253e-06, "loss": 0.0276, "step": 4920 }, { "epoch": 1.902938901778809, "grad_norm": 0.26889732056422033, "learning_rate": 3.5497876693413445e-06, "loss": 0.0203, "step": 4921 }, { "epoch": 1.9033255993812839, "grad_norm": 0.2867849447759126, "learning_rate": 3.5476347458607035e-06, "loss": 0.0223, "step": 4922 }, { "epoch": 1.9037122969837585, "grad_norm": 0.3871991098011749, "learning_rate": 3.545482116426481e-06, "loss": 0.0242, "step": 4923 }, { "epoch": 1.9040989945862337, "grad_norm": 0.4465252888177779, "learning_rate": 3.5433297814745065e-06, "loss": 0.0324, "step": 4924 }, { "epoch": 1.9044856921887083, "grad_norm": 0.30732348221835076, "learning_rate": 3.5411777414405356e-06, "loss": 0.0239, "step": 4925 }, { "epoch": 1.9048723897911835, "grad_norm": 0.3378296064542228, "learning_rate": 3.539025996760278e-06, "loss": 0.0251, "step": 4926 }, { "epoch": 1.9052590873936581, "grad_norm": 0.2177669675416582, "learning_rate": 3.536874547869372e-06, "loss": 0.0228, "step": 4927 }, { "epoch": 1.905645784996133, "grad_norm": 0.2638634505600508, "learning_rate": 3.534723395203401e-06, "loss": 0.0208, "step": 4928 }, { "epoch": 1.906032482598608, "grad_norm": 0.299037096649157, "learning_rate": 3.53257253919789e-06, "loss": 0.0315, "step": 4929 }, { "epoch": 1.9064191802010828, "grad_norm": 0.2617306967819463, "learning_rate": 3.5304219802883e-06, "loss": 0.0213, "step": 4930 }, { "epoch": 1.9068058778035577, "grad_norm": 0.392207155787647, "learning_rate": 3.528271718910037e-06, "loss": 0.0346, "step": 4931 }, { "epoch": 1.9071925754060324, "grad_norm": 0.34344728797837193, "learning_rate": 3.526121755498442e-06, "loss": 0.0316, "step": 4932 }, { "epoch": 1.9075792730085075, "grad_norm": 0.2600288434744331, "learning_rate": 3.5239720904887963e-06, "loss": 0.0262, "step": 4933 }, { "epoch": 1.9079659706109822, "grad_norm": 0.36453712212000483, "learning_rate": 3.5218227243163238e-06, "loss": 0.0385, "step": 4934 }, { "epoch": 1.908352668213457, "grad_norm": 0.3624687641360695, "learning_rate": 3.5196736574161846e-06, "loss": 0.0301, "step": 4935 }, { "epoch": 1.908739365815932, "grad_norm": 0.29913894656389894, "learning_rate": 3.517524890223481e-06, "loss": 0.0259, "step": 4936 }, { "epoch": 1.9091260634184068, "grad_norm": 0.23679887363079427, "learning_rate": 3.5153764231732512e-06, "loss": 0.0236, "step": 4937 }, { "epoch": 1.9095127610208817, "grad_norm": 0.2890498064610795, "learning_rate": 3.5132282567004777e-06, "loss": 0.0221, "step": 4938 }, { "epoch": 1.9098994586233564, "grad_norm": 0.30485266119574833, "learning_rate": 3.511080391240077e-06, "loss": 0.0234, "step": 4939 }, { "epoch": 1.9102861562258315, "grad_norm": 0.33191326875107957, "learning_rate": 3.5089328272269063e-06, "loss": 0.0192, "step": 4940 }, { "epoch": 1.9106728538283062, "grad_norm": 0.2779134787817982, "learning_rate": 3.506785565095763e-06, "loss": 0.0344, "step": 4941 }, { "epoch": 1.911059551430781, "grad_norm": 0.27733666052170547, "learning_rate": 3.504638605281383e-06, "loss": 0.0307, "step": 4942 }, { "epoch": 1.911446249033256, "grad_norm": 0.2988037610844797, "learning_rate": 3.5024919482184404e-06, "loss": 0.0319, "step": 4943 }, { "epoch": 1.911832946635731, "grad_norm": 0.237532774169759, "learning_rate": 3.5003455943415476e-06, "loss": 0.028, "step": 4944 }, { "epoch": 1.9122196442382058, "grad_norm": 0.2801641098860185, "learning_rate": 3.498199544085258e-06, "loss": 0.0261, "step": 4945 }, { "epoch": 1.9126063418406805, "grad_norm": 0.30982851847728893, "learning_rate": 3.4960537978840596e-06, "loss": 0.0284, "step": 4946 }, { "epoch": 1.9129930394431556, "grad_norm": 0.33916254365944015, "learning_rate": 3.493908356172381e-06, "loss": 0.0241, "step": 4947 }, { "epoch": 1.9133797370456302, "grad_norm": 0.2970285930673233, "learning_rate": 3.491763219384591e-06, "loss": 0.0295, "step": 4948 }, { "epoch": 1.9137664346481051, "grad_norm": 0.24877203646575163, "learning_rate": 3.4896183879549927e-06, "loss": 0.0274, "step": 4949 }, { "epoch": 1.91415313225058, "grad_norm": 0.2798390145454241, "learning_rate": 3.4874738623178317e-06, "loss": 0.0235, "step": 4950 }, { "epoch": 1.914539829853055, "grad_norm": 0.2124640333035081, "learning_rate": 3.485329642907287e-06, "loss": 0.02, "step": 4951 }, { "epoch": 1.9149265274555298, "grad_norm": 0.25903891085252345, "learning_rate": 3.48318573015748e-06, "loss": 0.0254, "step": 4952 }, { "epoch": 1.9153132250580045, "grad_norm": 0.23845918501838492, "learning_rate": 3.4810421245024685e-06, "loss": 0.0239, "step": 4953 }, { "epoch": 1.9156999226604796, "grad_norm": 0.3171998869227755, "learning_rate": 3.4788988263762444e-06, "loss": 0.036, "step": 4954 }, { "epoch": 1.9160866202629543, "grad_norm": 0.24506354007830392, "learning_rate": 3.4767558362127452e-06, "loss": 0.0208, "step": 4955 }, { "epoch": 1.9164733178654292, "grad_norm": 0.33914503569162463, "learning_rate": 3.4746131544458386e-06, "loss": 0.0308, "step": 4956 }, { "epoch": 1.916860015467904, "grad_norm": 0.30061536840064834, "learning_rate": 3.4724707815093328e-06, "loss": 0.0248, "step": 4957 }, { "epoch": 1.917246713070379, "grad_norm": 0.2810902809565448, "learning_rate": 3.4703287178369744e-06, "loss": 0.0278, "step": 4958 }, { "epoch": 1.9176334106728539, "grad_norm": 0.30869956761929757, "learning_rate": 3.4681869638624468e-06, "loss": 0.0236, "step": 4959 }, { "epoch": 1.9180201082753285, "grad_norm": 0.30784681489837995, "learning_rate": 3.4660455200193694e-06, "loss": 0.0332, "step": 4960 }, { "epoch": 1.9184068058778037, "grad_norm": 0.2953184657396303, "learning_rate": 3.463904386741299e-06, "loss": 0.0406, "step": 4961 }, { "epoch": 1.9187935034802783, "grad_norm": 0.20922718019296999, "learning_rate": 3.461763564461732e-06, "loss": 0.0159, "step": 4962 }, { "epoch": 1.9191802010827534, "grad_norm": 0.32742632861588883, "learning_rate": 3.4596230536140984e-06, "loss": 0.0279, "step": 4963 }, { "epoch": 1.9195668986852281, "grad_norm": 0.24333771929411865, "learning_rate": 3.4574828546317695e-06, "loss": 0.0312, "step": 4964 }, { "epoch": 1.919953596287703, "grad_norm": 0.282289703414491, "learning_rate": 3.455342967948048e-06, "loss": 0.0262, "step": 4965 }, { "epoch": 1.920340293890178, "grad_norm": 0.20303696612894442, "learning_rate": 3.4532033939961774e-06, "loss": 0.0156, "step": 4966 }, { "epoch": 1.9207269914926526, "grad_norm": 0.2838062896415757, "learning_rate": 3.4510641332093375e-06, "loss": 0.0269, "step": 4967 }, { "epoch": 1.9211136890951277, "grad_norm": 0.27352405180482403, "learning_rate": 3.4489251860206403e-06, "loss": 0.0259, "step": 4968 }, { "epoch": 1.9215003866976024, "grad_norm": 0.26980281784236415, "learning_rate": 3.446786552863143e-06, "loss": 0.022, "step": 4969 }, { "epoch": 1.9218870843000775, "grad_norm": 0.27714320960497396, "learning_rate": 3.444648234169829e-06, "loss": 0.0249, "step": 4970 }, { "epoch": 1.9222737819025522, "grad_norm": 0.241347740066537, "learning_rate": 3.4425102303736285e-06, "loss": 0.0194, "step": 4971 }, { "epoch": 1.922660479505027, "grad_norm": 0.3556272944938002, "learning_rate": 3.4403725419073984e-06, "loss": 0.0298, "step": 4972 }, { "epoch": 1.923047177107502, "grad_norm": 0.2839227969561036, "learning_rate": 3.4382351692039363e-06, "loss": 0.0249, "step": 4973 }, { "epoch": 1.9234338747099768, "grad_norm": 0.49016836254823887, "learning_rate": 3.4360981126959777e-06, "loss": 0.0315, "step": 4974 }, { "epoch": 1.9238205723124517, "grad_norm": 0.307311975987943, "learning_rate": 3.4339613728161892e-06, "loss": 0.0278, "step": 4975 }, { "epoch": 1.9242072699149264, "grad_norm": 0.2630533009681627, "learning_rate": 3.4318249499971777e-06, "loss": 0.0249, "step": 4976 }, { "epoch": 1.9245939675174015, "grad_norm": 0.39911406863471593, "learning_rate": 3.429688844671484e-06, "loss": 0.0345, "step": 4977 }, { "epoch": 1.9249806651198762, "grad_norm": 0.25776241213152445, "learning_rate": 3.427553057271585e-06, "loss": 0.0189, "step": 4978 }, { "epoch": 1.925367362722351, "grad_norm": 0.34898105244639327, "learning_rate": 3.4254175882298925e-06, "loss": 0.0305, "step": 4979 }, { "epoch": 1.925754060324826, "grad_norm": 0.3325098219563953, "learning_rate": 3.423282437978754e-06, "loss": 0.0269, "step": 4980 }, { "epoch": 1.9261407579273009, "grad_norm": 0.28685784133380837, "learning_rate": 3.4211476069504534e-06, "loss": 0.0262, "step": 4981 }, { "epoch": 1.9265274555297758, "grad_norm": 0.2940254040188883, "learning_rate": 3.4190130955772093e-06, "loss": 0.0306, "step": 4982 }, { "epoch": 1.9269141531322505, "grad_norm": 0.3935846946550293, "learning_rate": 3.4168789042911764e-06, "loss": 0.0271, "step": 4983 }, { "epoch": 1.9273008507347256, "grad_norm": 0.3074867450421778, "learning_rate": 3.414745033524442e-06, "loss": 0.0258, "step": 4984 }, { "epoch": 1.9276875483372002, "grad_norm": 0.2537304680854942, "learning_rate": 3.412611483709033e-06, "loss": 0.0216, "step": 4985 }, { "epoch": 1.9280742459396751, "grad_norm": 0.2695056117785922, "learning_rate": 3.410478255276908e-06, "loss": 0.029, "step": 4986 }, { "epoch": 1.92846094354215, "grad_norm": 0.20481916996316601, "learning_rate": 3.408345348659957e-06, "loss": 0.02, "step": 4987 }, { "epoch": 1.928847641144625, "grad_norm": 0.2751836644782838, "learning_rate": 3.4062127642900167e-06, "loss": 0.0196, "step": 4988 }, { "epoch": 1.9292343387470998, "grad_norm": 0.3516541626628836, "learning_rate": 3.4040805025988433e-06, "loss": 0.0226, "step": 4989 }, { "epoch": 1.9296210363495745, "grad_norm": 0.2644479343051273, "learning_rate": 3.401948564018142e-06, "loss": 0.0251, "step": 4990 }, { "epoch": 1.9300077339520496, "grad_norm": 0.3239112428052199, "learning_rate": 3.3998169489795395e-06, "loss": 0.0159, "step": 4991 }, { "epoch": 1.9303944315545243, "grad_norm": 0.27751932640518945, "learning_rate": 3.39768565791461e-06, "loss": 0.0263, "step": 4992 }, { "epoch": 1.9307811291569992, "grad_norm": 0.30026303673808746, "learning_rate": 3.395554691254851e-06, "loss": 0.0224, "step": 4993 }, { "epoch": 1.931167826759474, "grad_norm": 0.2962351171849623, "learning_rate": 3.3934240494316995e-06, "loss": 0.023, "step": 4994 }, { "epoch": 1.931554524361949, "grad_norm": 0.24399167889194065, "learning_rate": 3.3912937328765272e-06, "loss": 0.0217, "step": 4995 }, { "epoch": 1.9319412219644239, "grad_norm": 0.22729708624278258, "learning_rate": 3.3891637420206368e-06, "loss": 0.0249, "step": 4996 }, { "epoch": 1.9323279195668985, "grad_norm": 0.25419011555090754, "learning_rate": 3.387034077295269e-06, "loss": 0.0285, "step": 4997 }, { "epoch": 1.9327146171693736, "grad_norm": 0.2739406780857596, "learning_rate": 3.384904739131596e-06, "loss": 0.0282, "step": 4998 }, { "epoch": 1.9331013147718483, "grad_norm": 0.29921897117494917, "learning_rate": 3.3827757279607253e-06, "loss": 0.0265, "step": 4999 }, { "epoch": 1.9334880123743234, "grad_norm": 0.30801769638750043, "learning_rate": 3.3806470442136957e-06, "loss": 0.0271, "step": 5000 }, { "epoch": 1.9338747099767981, "grad_norm": 0.40997726100803134, "learning_rate": 3.3785186883214816e-06, "loss": 0.0231, "step": 5001 }, { "epoch": 1.934261407579273, "grad_norm": 0.2747651385059117, "learning_rate": 3.3763906607149915e-06, "loss": 0.0212, "step": 5002 }, { "epoch": 1.934648105181748, "grad_norm": 0.20495530850913107, "learning_rate": 3.3742629618250654e-06, "loss": 0.0178, "step": 5003 }, { "epoch": 1.9350348027842226, "grad_norm": 0.27063638842725973, "learning_rate": 3.3721355920824805e-06, "loss": 0.0298, "step": 5004 }, { "epoch": 1.9354215003866977, "grad_norm": 0.4300987934648665, "learning_rate": 3.3700085519179416e-06, "loss": 0.0344, "step": 5005 }, { "epoch": 1.9358081979891724, "grad_norm": 0.26399146404486545, "learning_rate": 3.3678818417620934e-06, "loss": 0.02, "step": 5006 }, { "epoch": 1.9361948955916475, "grad_norm": 0.26381460121818406, "learning_rate": 3.3657554620455086e-06, "loss": 0.0238, "step": 5007 }, { "epoch": 1.9365815931941222, "grad_norm": 0.2357087395329057, "learning_rate": 3.363629413198694e-06, "loss": 0.0197, "step": 5008 }, { "epoch": 1.936968290796597, "grad_norm": 0.31699751560229933, "learning_rate": 3.3615036956520924e-06, "loss": 0.02, "step": 5009 }, { "epoch": 1.937354988399072, "grad_norm": 0.4973882876524193, "learning_rate": 3.3593783098360757e-06, "loss": 0.0226, "step": 5010 }, { "epoch": 1.9377416860015468, "grad_norm": 0.28623421947203675, "learning_rate": 3.357253256180951e-06, "loss": 0.0256, "step": 5011 }, { "epoch": 1.9381283836040217, "grad_norm": 0.3367833307022547, "learning_rate": 3.3551285351169593e-06, "loss": 0.0286, "step": 5012 }, { "epoch": 1.9385150812064964, "grad_norm": 0.2596477511319996, "learning_rate": 3.353004147074267e-06, "loss": 0.0228, "step": 5013 }, { "epoch": 1.9389017788089715, "grad_norm": 0.8291642802892055, "learning_rate": 3.3508800924829856e-06, "loss": 0.042, "step": 5014 }, { "epoch": 1.9392884764114462, "grad_norm": 0.3257827171823463, "learning_rate": 3.3487563717731457e-06, "loss": 0.0234, "step": 5015 }, { "epoch": 1.939675174013921, "grad_norm": 0.3943652926525129, "learning_rate": 3.3466329853747214e-06, "loss": 0.026, "step": 5016 }, { "epoch": 1.940061871616396, "grad_norm": 0.3498591186982406, "learning_rate": 3.34450993371761e-06, "loss": 0.0269, "step": 5017 }, { "epoch": 1.9404485692188709, "grad_norm": 0.24332209602601484, "learning_rate": 3.3423872172316496e-06, "loss": 0.0205, "step": 5018 }, { "epoch": 1.9408352668213458, "grad_norm": 0.1856017308400394, "learning_rate": 3.340264836346604e-06, "loss": 0.0134, "step": 5019 }, { "epoch": 1.9412219644238204, "grad_norm": 0.2912522490904565, "learning_rate": 3.3381427914921704e-06, "loss": 0.022, "step": 5020 }, { "epoch": 1.9416086620262956, "grad_norm": 0.2626374686573969, "learning_rate": 3.3360210830979812e-06, "loss": 0.0202, "step": 5021 }, { "epoch": 1.9419953596287702, "grad_norm": 0.2631570922731478, "learning_rate": 3.3338997115935955e-06, "loss": 0.0174, "step": 5022 }, { "epoch": 1.9423820572312451, "grad_norm": 0.27781012353981355, "learning_rate": 3.3317786774085092e-06, "loss": 0.0226, "step": 5023 }, { "epoch": 1.94276875483372, "grad_norm": 0.4323401021942277, "learning_rate": 3.329657980972146e-06, "loss": 0.0326, "step": 5024 }, { "epoch": 1.943155452436195, "grad_norm": 0.3862330488405056, "learning_rate": 3.3275376227138656e-06, "loss": 0.0373, "step": 5025 }, { "epoch": 1.9435421500386698, "grad_norm": 0.947245020927608, "learning_rate": 3.325417603062955e-06, "loss": 0.0258, "step": 5026 }, { "epoch": 1.9439288476411445, "grad_norm": 0.2514189315532018, "learning_rate": 3.3232979224486317e-06, "loss": 0.0174, "step": 5027 }, { "epoch": 1.9443155452436196, "grad_norm": 0.3176226891194705, "learning_rate": 3.3211785813000506e-06, "loss": 0.0203, "step": 5028 }, { "epoch": 1.9447022428460943, "grad_norm": 0.30706166903588655, "learning_rate": 3.3190595800462933e-06, "loss": 0.0271, "step": 5029 }, { "epoch": 1.9450889404485692, "grad_norm": 0.28659663260221696, "learning_rate": 3.3169409191163724e-06, "loss": 0.0224, "step": 5030 }, { "epoch": 1.945475638051044, "grad_norm": 0.27301688564686893, "learning_rate": 3.3148225989392346e-06, "loss": 0.025, "step": 5031 }, { "epoch": 1.945862335653519, "grad_norm": 0.30755792010783056, "learning_rate": 3.3127046199437548e-06, "loss": 0.0249, "step": 5032 }, { "epoch": 1.9462490332559939, "grad_norm": 0.46101965816170337, "learning_rate": 3.310586982558741e-06, "loss": 0.0637, "step": 5033 }, { "epoch": 1.9466357308584685, "grad_norm": 0.2522570029899115, "learning_rate": 3.3084696872129276e-06, "loss": 0.0281, "step": 5034 }, { "epoch": 1.9470224284609436, "grad_norm": 0.3368627903407711, "learning_rate": 3.3063527343349872e-06, "loss": 0.0253, "step": 5035 }, { "epoch": 1.9474091260634183, "grad_norm": 0.2888292973124829, "learning_rate": 3.304236124353515e-06, "loss": 0.0254, "step": 5036 }, { "epoch": 1.9477958236658934, "grad_norm": 0.2403149856439762, "learning_rate": 3.3021198576970446e-06, "loss": 0.0249, "step": 5037 }, { "epoch": 1.948182521268368, "grad_norm": 0.5346713314941686, "learning_rate": 3.3000039347940314e-06, "loss": 0.0271, "step": 5038 }, { "epoch": 1.948569218870843, "grad_norm": 0.2078061676026146, "learning_rate": 3.297888356072872e-06, "loss": 0.023, "step": 5039 }, { "epoch": 1.948955916473318, "grad_norm": 0.24016835524832356, "learning_rate": 3.2957731219618826e-06, "loss": 0.0273, "step": 5040 }, { "epoch": 1.9493426140757926, "grad_norm": 0.3070528048495348, "learning_rate": 3.293658232889315e-06, "loss": 0.0316, "step": 5041 }, { "epoch": 1.9497293116782677, "grad_norm": 0.24614559382103135, "learning_rate": 3.2915436892833526e-06, "loss": 0.0193, "step": 5042 }, { "epoch": 1.9501160092807424, "grad_norm": 0.26159671424439346, "learning_rate": 3.2894294915721036e-06, "loss": 0.026, "step": 5043 }, { "epoch": 1.9505027068832175, "grad_norm": 0.2791319116871227, "learning_rate": 3.2873156401836124e-06, "loss": 0.0349, "step": 5044 }, { "epoch": 1.9508894044856921, "grad_norm": 0.2870335542021223, "learning_rate": 3.2852021355458485e-06, "loss": 0.0221, "step": 5045 }, { "epoch": 1.951276102088167, "grad_norm": 0.23023769121153614, "learning_rate": 3.2830889780867116e-06, "loss": 0.0236, "step": 5046 }, { "epoch": 1.951662799690642, "grad_norm": 0.30645153348544407, "learning_rate": 3.280976168234035e-06, "loss": 0.0254, "step": 5047 }, { "epoch": 1.9520494972931168, "grad_norm": 0.28719163023705835, "learning_rate": 3.278863706415577e-06, "loss": 0.0279, "step": 5048 }, { "epoch": 1.9524361948955917, "grad_norm": 0.25676132103956006, "learning_rate": 3.2767515930590286e-06, "loss": 0.024, "step": 5049 }, { "epoch": 1.9528228924980664, "grad_norm": 0.3322216258342865, "learning_rate": 3.2746398285920087e-06, "loss": 0.028, "step": 5050 }, { "epoch": 1.9532095901005415, "grad_norm": 0.2755609707595179, "learning_rate": 3.2725284134420666e-06, "loss": 0.022, "step": 5051 }, { "epoch": 1.9535962877030162, "grad_norm": 0.36135356393716933, "learning_rate": 3.27041734803668e-06, "loss": 0.0269, "step": 5052 }, { "epoch": 1.953982985305491, "grad_norm": 0.4318425182037591, "learning_rate": 3.2683066328032532e-06, "loss": 0.0264, "step": 5053 }, { "epoch": 1.954369682907966, "grad_norm": 0.2458737266956619, "learning_rate": 3.2661962681691273e-06, "loss": 0.0198, "step": 5054 }, { "epoch": 1.9547563805104409, "grad_norm": 0.23333084276028165, "learning_rate": 3.2640862545615614e-06, "loss": 0.0176, "step": 5055 }, { "epoch": 1.9551430781129158, "grad_norm": 0.32740986308998915, "learning_rate": 3.2619765924077575e-06, "loss": 0.0221, "step": 5056 }, { "epoch": 1.9555297757153904, "grad_norm": 0.3067786247262959, "learning_rate": 3.2598672821348303e-06, "loss": 0.0203, "step": 5057 }, { "epoch": 1.9559164733178656, "grad_norm": 0.2937837513736631, "learning_rate": 3.257758324169839e-06, "loss": 0.0215, "step": 5058 }, { "epoch": 1.9563031709203402, "grad_norm": 0.28222863526161585, "learning_rate": 3.2556497189397597e-06, "loss": 0.0202, "step": 5059 }, { "epoch": 1.9566898685228151, "grad_norm": 0.3530559056767875, "learning_rate": 3.2535414668715013e-06, "loss": 0.0284, "step": 5060 }, { "epoch": 1.95707656612529, "grad_norm": 0.4456493237470436, "learning_rate": 3.2514335683919028e-06, "loss": 0.0266, "step": 5061 }, { "epoch": 1.957463263727765, "grad_norm": 0.24539190736896022, "learning_rate": 3.24932602392773e-06, "loss": 0.0289, "step": 5062 }, { "epoch": 1.9578499613302398, "grad_norm": 0.25690249520684155, "learning_rate": 3.2472188339056765e-06, "loss": 0.0182, "step": 5063 }, { "epoch": 1.9582366589327145, "grad_norm": 0.27439620028334677, "learning_rate": 3.245111998752365e-06, "loss": 0.0335, "step": 5064 }, { "epoch": 1.9586233565351896, "grad_norm": 0.40298720939914673, "learning_rate": 3.243005518894347e-06, "loss": 0.043, "step": 5065 }, { "epoch": 1.9590100541376643, "grad_norm": 0.29078051579135816, "learning_rate": 3.2408993947581003e-06, "loss": 0.0218, "step": 5066 }, { "epoch": 1.9593967517401392, "grad_norm": 0.2315807605684924, "learning_rate": 3.2387936267700315e-06, "loss": 0.0179, "step": 5067 }, { "epoch": 1.959783449342614, "grad_norm": 1.1708062952587213, "learning_rate": 3.2366882153564755e-06, "loss": 0.0432, "step": 5068 }, { "epoch": 1.960170146945089, "grad_norm": 0.4479533816574468, "learning_rate": 3.234583160943694e-06, "loss": 0.0264, "step": 5069 }, { "epoch": 1.9605568445475638, "grad_norm": 0.258356288819928, "learning_rate": 3.232478463957879e-06, "loss": 0.0239, "step": 5070 }, { "epoch": 1.9609435421500385, "grad_norm": 0.2525572434334769, "learning_rate": 3.2303741248251462e-06, "loss": 0.0223, "step": 5071 }, { "epoch": 1.9613302397525136, "grad_norm": 0.5045595453705883, "learning_rate": 3.2282701439715424e-06, "loss": 0.0289, "step": 5072 }, { "epoch": 1.9617169373549883, "grad_norm": 0.27245021739848185, "learning_rate": 3.2261665218230396e-06, "loss": 0.026, "step": 5073 }, { "epoch": 1.9621036349574634, "grad_norm": 0.2524356643071184, "learning_rate": 3.2240632588055377e-06, "loss": 0.0285, "step": 5074 }, { "epoch": 1.962490332559938, "grad_norm": 0.4709447930621872, "learning_rate": 3.2219603553448653e-06, "loss": 0.0275, "step": 5075 }, { "epoch": 1.962877030162413, "grad_norm": 0.34820198951065745, "learning_rate": 3.2198578118667746e-06, "loss": 0.0259, "step": 5076 }, { "epoch": 1.963263727764888, "grad_norm": 0.23713157873399382, "learning_rate": 3.2177556287969503e-06, "loss": 0.023, "step": 5077 }, { "epoch": 1.9636504253673626, "grad_norm": 0.3082943911551527, "learning_rate": 3.2156538065609987e-06, "loss": 0.0318, "step": 5078 }, { "epoch": 1.9640371229698377, "grad_norm": 0.2586370619670418, "learning_rate": 3.2135523455844585e-06, "loss": 0.0292, "step": 5079 }, { "epoch": 1.9644238205723124, "grad_norm": 0.26188200119933197, "learning_rate": 3.2114512462927905e-06, "loss": 0.0222, "step": 5080 }, { "epoch": 1.9648105181747875, "grad_norm": 0.2552558774826217, "learning_rate": 3.209350509111382e-06, "loss": 0.0225, "step": 5081 }, { "epoch": 1.9651972157772621, "grad_norm": 0.296316269421393, "learning_rate": 3.2072501344655534e-06, "loss": 0.0324, "step": 5082 }, { "epoch": 1.965583913379737, "grad_norm": 0.19891941091423554, "learning_rate": 3.205150122780542e-06, "loss": 0.0154, "step": 5083 }, { "epoch": 1.965970610982212, "grad_norm": 0.24322678546812554, "learning_rate": 3.2030504744815234e-06, "loss": 0.0232, "step": 5084 }, { "epoch": 1.9663573085846868, "grad_norm": 0.35107767459292694, "learning_rate": 3.2009511899935885e-06, "loss": 0.0181, "step": 5085 }, { "epoch": 1.9667440061871617, "grad_norm": 0.29934054133866855, "learning_rate": 3.1988522697417594e-06, "loss": 0.0262, "step": 5086 }, { "epoch": 1.9671307037896364, "grad_norm": 0.29587398067875925, "learning_rate": 3.1967537141509862e-06, "loss": 0.0229, "step": 5087 }, { "epoch": 1.9675174013921115, "grad_norm": 0.3405010357381095, "learning_rate": 3.1946555236461424e-06, "loss": 0.0277, "step": 5088 }, { "epoch": 1.9679040989945862, "grad_norm": 0.22335668913983298, "learning_rate": 3.192557698652029e-06, "loss": 0.0201, "step": 5089 }, { "epoch": 1.968290796597061, "grad_norm": 0.2583418692792707, "learning_rate": 3.1904602395933716e-06, "loss": 0.0301, "step": 5090 }, { "epoch": 1.968677494199536, "grad_norm": 0.2709817044345082, "learning_rate": 3.1883631468948227e-06, "loss": 0.025, "step": 5091 }, { "epoch": 1.9690641918020109, "grad_norm": 0.32146270324088916, "learning_rate": 3.186266420980962e-06, "loss": 0.0303, "step": 5092 }, { "epoch": 1.9694508894044858, "grad_norm": 0.22201021155016287, "learning_rate": 3.1841700622762907e-06, "loss": 0.0277, "step": 5093 }, { "epoch": 1.9698375870069604, "grad_norm": 0.2398053644020358, "learning_rate": 3.1820740712052404e-06, "loss": 0.0282, "step": 5094 }, { "epoch": 1.9702242846094355, "grad_norm": 0.22061771444039405, "learning_rate": 3.1799784481921653e-06, "loss": 0.0183, "step": 5095 }, { "epoch": 1.9706109822119102, "grad_norm": 0.2916604246235834, "learning_rate": 3.177883193661347e-06, "loss": 0.0235, "step": 5096 }, { "epoch": 1.9709976798143851, "grad_norm": 0.4225831164220383, "learning_rate": 3.17578830803699e-06, "loss": 0.0379, "step": 5097 }, { "epoch": 1.97138437741686, "grad_norm": 0.2624677687459755, "learning_rate": 3.1736937917432276e-06, "loss": 0.0262, "step": 5098 }, { "epoch": 1.971771075019335, "grad_norm": 0.23167648850676423, "learning_rate": 3.1715996452041164e-06, "loss": 0.0214, "step": 5099 }, { "epoch": 1.9721577726218098, "grad_norm": 0.3005068438004753, "learning_rate": 3.1695058688436342e-06, "loss": 0.0201, "step": 5100 }, { "epoch": 1.9725444702242845, "grad_norm": 0.2304572801029187, "learning_rate": 3.167412463085694e-06, "loss": 0.0184, "step": 5101 }, { "epoch": 1.9729311678267596, "grad_norm": 0.4484125684784599, "learning_rate": 3.1653194283541205e-06, "loss": 0.04, "step": 5102 }, { "epoch": 1.9733178654292343, "grad_norm": 0.2838569633157693, "learning_rate": 3.1632267650726776e-06, "loss": 0.0308, "step": 5103 }, { "epoch": 1.9737045630317092, "grad_norm": 0.31641604311808674, "learning_rate": 3.1611344736650396e-06, "loss": 0.0267, "step": 5104 }, { "epoch": 1.974091260634184, "grad_norm": 0.22666086072267058, "learning_rate": 3.1590425545548186e-06, "loss": 0.0186, "step": 5105 }, { "epoch": 1.974477958236659, "grad_norm": 0.3279762836241718, "learning_rate": 3.156951008165543e-06, "loss": 0.031, "step": 5106 }, { "epoch": 1.9748646558391338, "grad_norm": 0.2781164398818435, "learning_rate": 3.154859834920666e-06, "loss": 0.0283, "step": 5107 }, { "epoch": 1.9752513534416085, "grad_norm": 0.24942458669095474, "learning_rate": 3.1527690352435704e-06, "loss": 0.0182, "step": 5108 }, { "epoch": 1.9756380510440836, "grad_norm": 0.28104879290781143, "learning_rate": 3.150678609557558e-06, "loss": 0.0285, "step": 5109 }, { "epoch": 1.9760247486465583, "grad_norm": 0.26085234534948, "learning_rate": 3.14858855828586e-06, "loss": 0.0237, "step": 5110 }, { "epoch": 1.9764114462490334, "grad_norm": 0.2730222208450581, "learning_rate": 3.1464988818516255e-06, "loss": 0.021, "step": 5111 }, { "epoch": 1.976798143851508, "grad_norm": 0.4530213881601491, "learning_rate": 3.1444095806779346e-06, "loss": 0.0237, "step": 5112 }, { "epoch": 1.977184841453983, "grad_norm": 0.3167704044605817, "learning_rate": 3.142320655187786e-06, "loss": 0.0302, "step": 5113 }, { "epoch": 1.9775715390564579, "grad_norm": 0.1890043152372148, "learning_rate": 3.140232105804104e-06, "loss": 0.0192, "step": 5114 }, { "epoch": 1.9779582366589326, "grad_norm": 0.3755895609463767, "learning_rate": 3.138143932949739e-06, "loss": 0.0246, "step": 5115 }, { "epoch": 1.9783449342614077, "grad_norm": 0.22283349590379883, "learning_rate": 3.136056137047461e-06, "loss": 0.0205, "step": 5116 }, { "epoch": 1.9787316318638823, "grad_norm": 0.278365545923232, "learning_rate": 3.1339687185199675e-06, "loss": 0.0205, "step": 5117 }, { "epoch": 1.9791183294663575, "grad_norm": 0.2657263240098706, "learning_rate": 3.1318816777898764e-06, "loss": 0.0211, "step": 5118 }, { "epoch": 1.9795050270688321, "grad_norm": 0.3533241745136344, "learning_rate": 3.1297950152797342e-06, "loss": 0.0238, "step": 5119 }, { "epoch": 1.979891724671307, "grad_norm": 0.23759182760453293, "learning_rate": 3.1277087314120046e-06, "loss": 0.0195, "step": 5120 }, { "epoch": 1.980278422273782, "grad_norm": 0.47246870540751607, "learning_rate": 3.125622826609076e-06, "loss": 0.034, "step": 5121 }, { "epoch": 1.9806651198762568, "grad_norm": 0.22672763287546133, "learning_rate": 3.123537301293267e-06, "loss": 0.0207, "step": 5122 }, { "epoch": 1.9810518174787317, "grad_norm": 0.3404592960231921, "learning_rate": 3.121452155886806e-06, "loss": 0.0224, "step": 5123 }, { "epoch": 1.9814385150812064, "grad_norm": 0.30256531742421666, "learning_rate": 3.1193673908118606e-06, "loss": 0.0291, "step": 5124 }, { "epoch": 1.9818252126836815, "grad_norm": 0.22915680462373483, "learning_rate": 3.117283006490508e-06, "loss": 0.0217, "step": 5125 }, { "epoch": 1.9822119102861562, "grad_norm": 0.3552387112301804, "learning_rate": 3.115199003344753e-06, "loss": 0.0217, "step": 5126 }, { "epoch": 1.982598607888631, "grad_norm": 0.3414681213927457, "learning_rate": 3.1131153817965266e-06, "loss": 0.0239, "step": 5127 }, { "epoch": 1.982985305491106, "grad_norm": 0.40067597341780364, "learning_rate": 3.1110321422676775e-06, "loss": 0.0333, "step": 5128 }, { "epoch": 1.9833720030935809, "grad_norm": 0.27218307113113116, "learning_rate": 3.108949285179981e-06, "loss": 0.0214, "step": 5129 }, { "epoch": 1.9837587006960558, "grad_norm": 0.40873426567780385, "learning_rate": 3.1068668109551312e-06, "loss": 0.0248, "step": 5130 }, { "epoch": 1.9841453982985304, "grad_norm": 0.34291652457623306, "learning_rate": 3.1047847200147485e-06, "loss": 0.0257, "step": 5131 }, { "epoch": 1.9845320959010055, "grad_norm": 0.2377724962164112, "learning_rate": 3.1027030127803726e-06, "loss": 0.0197, "step": 5132 }, { "epoch": 1.9849187935034802, "grad_norm": 0.22639152563381346, "learning_rate": 3.100621689673466e-06, "loss": 0.017, "step": 5133 }, { "epoch": 1.9853054911059551, "grad_norm": 0.4701751842653701, "learning_rate": 3.0985407511154165e-06, "loss": 0.0324, "step": 5134 }, { "epoch": 1.98569218870843, "grad_norm": 0.3315149800206972, "learning_rate": 3.096460197527529e-06, "loss": 0.0273, "step": 5135 }, { "epoch": 1.986078886310905, "grad_norm": 0.6858326697653149, "learning_rate": 3.0943800293310356e-06, "loss": 0.0296, "step": 5136 }, { "epoch": 1.9864655839133798, "grad_norm": 0.3055106121782156, "learning_rate": 3.0923002469470865e-06, "loss": 0.0224, "step": 5137 }, { "epoch": 1.9868522815158545, "grad_norm": 0.2558251696839949, "learning_rate": 3.0902208507967557e-06, "loss": 0.0168, "step": 5138 }, { "epoch": 1.9872389791183296, "grad_norm": 0.30202237628621925, "learning_rate": 3.088141841301039e-06, "loss": 0.0278, "step": 5139 }, { "epoch": 1.9876256767208043, "grad_norm": 0.2071050213839865, "learning_rate": 3.0860632188808513e-06, "loss": 0.0226, "step": 5140 }, { "epoch": 1.9880123743232792, "grad_norm": 0.19484119053457305, "learning_rate": 3.0839849839570345e-06, "loss": 0.0174, "step": 5141 }, { "epoch": 1.988399071925754, "grad_norm": 0.27580508405757403, "learning_rate": 3.081907136950346e-06, "loss": 0.0233, "step": 5142 }, { "epoch": 1.988785769528229, "grad_norm": 0.26484819068988513, "learning_rate": 3.0798296782814703e-06, "loss": 0.0249, "step": 5143 }, { "epoch": 1.9891724671307038, "grad_norm": 0.25754271620414987, "learning_rate": 3.0777526083710072e-06, "loss": 0.0224, "step": 5144 }, { "epoch": 1.9895591647331785, "grad_norm": 0.7813821617118721, "learning_rate": 3.075675927639484e-06, "loss": 0.0338, "step": 5145 }, { "epoch": 1.9899458623356536, "grad_norm": 0.2651661575471223, "learning_rate": 3.073599636507346e-06, "loss": 0.0269, "step": 5146 }, { "epoch": 1.9903325599381283, "grad_norm": 0.4050580337459264, "learning_rate": 3.0715237353949566e-06, "loss": 0.0428, "step": 5147 }, { "epoch": 1.9907192575406032, "grad_norm": 0.20520860227941554, "learning_rate": 3.0694482247226083e-06, "loss": 0.0166, "step": 5148 }, { "epoch": 1.991105955143078, "grad_norm": 0.2797771900390122, "learning_rate": 3.0673731049105044e-06, "loss": 0.0183, "step": 5149 }, { "epoch": 1.991492652745553, "grad_norm": 0.4340713566243784, "learning_rate": 3.06529837637878e-06, "loss": 0.0231, "step": 5150 }, { "epoch": 1.9918793503480279, "grad_norm": 0.5158270016662416, "learning_rate": 3.06322403954748e-06, "loss": 0.0399, "step": 5151 }, { "epoch": 1.9922660479505025, "grad_norm": 0.47594928805132586, "learning_rate": 3.061150094836582e-06, "loss": 0.0322, "step": 5152 }, { "epoch": 1.9926527455529777, "grad_norm": 0.271003234922563, "learning_rate": 3.059076542665972e-06, "loss": 0.0224, "step": 5153 }, { "epoch": 1.9930394431554523, "grad_norm": 0.2933053198581355, "learning_rate": 3.0570033834554634e-06, "loss": 0.0266, "step": 5154 }, { "epoch": 1.9934261407579275, "grad_norm": 0.2873722492134692, "learning_rate": 3.0549306176247907e-06, "loss": 0.0184, "step": 5155 }, { "epoch": 1.9938128383604021, "grad_norm": 0.2284175046554195, "learning_rate": 3.052858245593604e-06, "loss": 0.0202, "step": 5156 }, { "epoch": 1.994199535962877, "grad_norm": 0.2775045485910534, "learning_rate": 3.0507862677814793e-06, "loss": 0.02, "step": 5157 }, { "epoch": 1.994586233565352, "grad_norm": 0.4093881790098151, "learning_rate": 3.048714684607908e-06, "loss": 0.0219, "step": 5158 }, { "epoch": 1.9949729311678268, "grad_norm": 0.24020934034683417, "learning_rate": 3.0466434964923053e-06, "loss": 0.0193, "step": 5159 }, { "epoch": 1.9953596287703017, "grad_norm": 0.2750730580227321, "learning_rate": 3.0445727038540043e-06, "loss": 0.0267, "step": 5160 }, { "epoch": 1.9957463263727764, "grad_norm": 0.276048363219775, "learning_rate": 3.0425023071122565e-06, "loss": 0.0285, "step": 5161 }, { "epoch": 1.9961330239752515, "grad_norm": 0.25288968973140724, "learning_rate": 3.0404323066862384e-06, "loss": 0.0242, "step": 5162 }, { "epoch": 1.9965197215777262, "grad_norm": 0.450975440775767, "learning_rate": 3.03836270299504e-06, "loss": 0.0269, "step": 5163 }, { "epoch": 1.996906419180201, "grad_norm": 0.37994105373154025, "learning_rate": 3.0362934964576766e-06, "loss": 0.0266, "step": 5164 }, { "epoch": 1.997293116782676, "grad_norm": 0.3050974907798716, "learning_rate": 3.0342246874930803e-06, "loss": 0.0187, "step": 5165 }, { "epoch": 1.9976798143851509, "grad_norm": 0.26260751253346765, "learning_rate": 3.0321562765200996e-06, "loss": 0.028, "step": 5166 }, { "epoch": 1.9980665119876257, "grad_norm": 0.3604250719322749, "learning_rate": 3.0300882639575113e-06, "loss": 0.0347, "step": 5167 }, { "epoch": 1.9984532095901004, "grad_norm": 0.23231585065883892, "learning_rate": 3.0280206502239994e-06, "loss": 0.0237, "step": 5168 }, { "epoch": 1.9988399071925755, "grad_norm": 0.2826163387675735, "learning_rate": 3.0259534357381804e-06, "loss": 0.0266, "step": 5169 }, { "epoch": 1.9992266047950502, "grad_norm": 0.3242318110569419, "learning_rate": 3.023886620918578e-06, "loss": 0.0217, "step": 5170 }, { "epoch": 1.999613302397525, "grad_norm": 0.3126793277303383, "learning_rate": 3.021820206183645e-06, "loss": 0.0194, "step": 5171 }, { "epoch": 2.0, "grad_norm": 0.23321003032024118, "learning_rate": 3.0197541919517447e-06, "loss": 0.0199, "step": 5172 }, { "epoch": 2.0, "eval_loss": 0.02753594145178795, "eval_runtime": 787.2024, "eval_samples_per_second": 24.992, "eval_steps_per_second": 0.781, "step": 5172 }, { "epoch": 2.0003866976024747, "grad_norm": 0.2626169700957055, "learning_rate": 3.0176885786411648e-06, "loss": 0.0189, "step": 5173 }, { "epoch": 2.00077339520495, "grad_norm": 0.46444394383866955, "learning_rate": 3.0156233666701096e-06, "loss": 0.0249, "step": 5174 }, { "epoch": 2.0011600928074245, "grad_norm": 0.3690193417395462, "learning_rate": 3.0135585564567026e-06, "loss": 0.0244, "step": 5175 }, { "epoch": 2.0015467904098996, "grad_norm": 0.34703993555008705, "learning_rate": 3.0114941484189876e-06, "loss": 0.0337, "step": 5176 }, { "epoch": 2.0019334880123743, "grad_norm": 0.2822361837752235, "learning_rate": 3.009430142974923e-06, "loss": 0.0264, "step": 5177 }, { "epoch": 2.0023201856148494, "grad_norm": 0.2917018922005416, "learning_rate": 3.0073665405423907e-06, "loss": 0.0209, "step": 5178 }, { "epoch": 2.002706883217324, "grad_norm": 0.39988024397772504, "learning_rate": 3.005303341539187e-06, "loss": 0.0243, "step": 5179 }, { "epoch": 2.0030935808197987, "grad_norm": 0.2505333958494766, "learning_rate": 3.0032405463830273e-06, "loss": 0.0192, "step": 5180 }, { "epoch": 2.003480278422274, "grad_norm": 0.24457464721369473, "learning_rate": 3.0011781554915474e-06, "loss": 0.0225, "step": 5181 }, { "epoch": 2.0038669760247485, "grad_norm": 0.33834372418661207, "learning_rate": 2.9991161692822978e-06, "loss": 0.0243, "step": 5182 }, { "epoch": 2.0042536736272236, "grad_norm": 0.21230936513230483, "learning_rate": 2.997054588172752e-06, "loss": 0.0182, "step": 5183 }, { "epoch": 2.0046403712296983, "grad_norm": 0.4222007777586146, "learning_rate": 2.994993412580295e-06, "loss": 0.0294, "step": 5184 }, { "epoch": 2.0050270688321734, "grad_norm": 0.42692683175111085, "learning_rate": 2.9929326429222366e-06, "loss": 0.0219, "step": 5185 }, { "epoch": 2.005413766434648, "grad_norm": 0.2522770735220561, "learning_rate": 2.9908722796158e-06, "loss": 0.018, "step": 5186 }, { "epoch": 2.0058004640371228, "grad_norm": 0.36971212051701197, "learning_rate": 2.988812323078124e-06, "loss": 0.0191, "step": 5187 }, { "epoch": 2.006187161639598, "grad_norm": 0.23534286474903252, "learning_rate": 2.986752773726273e-06, "loss": 0.0196, "step": 5188 }, { "epoch": 2.0065738592420725, "grad_norm": 0.45913173661744333, "learning_rate": 2.9846936319772192e-06, "loss": 0.0439, "step": 5189 }, { "epoch": 2.0069605568445477, "grad_norm": 0.30686544416763967, "learning_rate": 2.9826348982478624e-06, "loss": 0.025, "step": 5190 }, { "epoch": 2.0073472544470223, "grad_norm": 0.36612608958695564, "learning_rate": 2.9805765729550098e-06, "loss": 0.026, "step": 5191 }, { "epoch": 2.0077339520494975, "grad_norm": 0.2481300738542359, "learning_rate": 2.9785186565153957e-06, "loss": 0.0157, "step": 5192 }, { "epoch": 2.008120649651972, "grad_norm": 0.3606660469188605, "learning_rate": 2.976461149345663e-06, "loss": 0.0178, "step": 5193 }, { "epoch": 2.0085073472544472, "grad_norm": 0.4068919189181153, "learning_rate": 2.974404051862375e-06, "loss": 0.0305, "step": 5194 }, { "epoch": 2.008894044856922, "grad_norm": 0.31142880199891937, "learning_rate": 2.972347364482015e-06, "loss": 0.024, "step": 5195 }, { "epoch": 2.0092807424593966, "grad_norm": 0.27420790056806016, "learning_rate": 2.970291087620979e-06, "loss": 0.027, "step": 5196 }, { "epoch": 2.0096674400618717, "grad_norm": 0.30766598802187806, "learning_rate": 2.968235221695582e-06, "loss": 0.0139, "step": 5197 }, { "epoch": 2.0100541376643464, "grad_norm": 0.3600030966985551, "learning_rate": 2.966179767122056e-06, "loss": 0.0222, "step": 5198 }, { "epoch": 2.0104408352668215, "grad_norm": 0.2958401542184258, "learning_rate": 2.9641247243165477e-06, "loss": 0.023, "step": 5199 }, { "epoch": 2.010827532869296, "grad_norm": 0.22541554095814867, "learning_rate": 2.9620700936951237e-06, "loss": 0.011, "step": 5200 }, { "epoch": 2.0112142304717713, "grad_norm": 0.2924921676084037, "learning_rate": 2.9600158756737627e-06, "loss": 0.0191, "step": 5201 }, { "epoch": 2.011600928074246, "grad_norm": 0.2938048176865945, "learning_rate": 2.9579620706683663e-06, "loss": 0.0222, "step": 5202 }, { "epoch": 2.0119876256767206, "grad_norm": 0.248735236042722, "learning_rate": 2.9559086790947446e-06, "loss": 0.0143, "step": 5203 }, { "epoch": 2.0123743232791957, "grad_norm": 0.23254738126782912, "learning_rate": 2.9538557013686308e-06, "loss": 0.0163, "step": 5204 }, { "epoch": 2.0127610208816704, "grad_norm": 0.3460494500609735, "learning_rate": 2.951803137905671e-06, "loss": 0.0168, "step": 5205 }, { "epoch": 2.0131477184841455, "grad_norm": 0.26951689722922384, "learning_rate": 2.9497509891214265e-06, "loss": 0.0201, "step": 5206 }, { "epoch": 2.01353441608662, "grad_norm": 0.2588220087521858, "learning_rate": 2.9476992554313787e-06, "loss": 0.0154, "step": 5207 }, { "epoch": 2.0139211136890953, "grad_norm": 0.3191353845029397, "learning_rate": 2.9456479372509195e-06, "loss": 0.0243, "step": 5208 }, { "epoch": 2.01430781129157, "grad_norm": 0.32367359864697465, "learning_rate": 2.9435970349953612e-06, "loss": 0.0215, "step": 5209 }, { "epoch": 2.0146945088940447, "grad_norm": 0.36427355783920534, "learning_rate": 2.94154654907993e-06, "loss": 0.0139, "step": 5210 }, { "epoch": 2.01508120649652, "grad_norm": 0.2943982386720044, "learning_rate": 2.939496479919769e-06, "loss": 0.0232, "step": 5211 }, { "epoch": 2.0154679040989945, "grad_norm": 0.28208851436076393, "learning_rate": 2.937446827929936e-06, "loss": 0.0206, "step": 5212 }, { "epoch": 2.0158546017014696, "grad_norm": 0.22973901177513203, "learning_rate": 2.9353975935253997e-06, "loss": 0.0126, "step": 5213 }, { "epoch": 2.0162412993039442, "grad_norm": 0.22561029551448053, "learning_rate": 2.933348777121057e-06, "loss": 0.0127, "step": 5214 }, { "epoch": 2.0166279969064194, "grad_norm": 0.25343105669635, "learning_rate": 2.931300379131703e-06, "loss": 0.0173, "step": 5215 }, { "epoch": 2.017014694508894, "grad_norm": 0.4028098043713796, "learning_rate": 2.929252399972066e-06, "loss": 0.0191, "step": 5216 }, { "epoch": 2.0174013921113687, "grad_norm": 0.2827038943116871, "learning_rate": 2.927204840056773e-06, "loss": 0.0221, "step": 5217 }, { "epoch": 2.017788089713844, "grad_norm": 0.5995558405034073, "learning_rate": 2.92515769980038e-06, "loss": 0.0192, "step": 5218 }, { "epoch": 2.0181747873163185, "grad_norm": 0.22032930298704587, "learning_rate": 2.923110979617348e-06, "loss": 0.0171, "step": 5219 }, { "epoch": 2.0185614849187936, "grad_norm": 0.24447531302713335, "learning_rate": 2.9210646799220565e-06, "loss": 0.022, "step": 5220 }, { "epoch": 2.0189481825212683, "grad_norm": 0.31510021820992395, "learning_rate": 2.919018801128802e-06, "loss": 0.0186, "step": 5221 }, { "epoch": 2.0193348801237434, "grad_norm": 0.20869299126440963, "learning_rate": 2.916973343651792e-06, "loss": 0.0183, "step": 5222 }, { "epoch": 2.019721577726218, "grad_norm": 0.23203237494192083, "learning_rate": 2.914928307905156e-06, "loss": 0.0138, "step": 5223 }, { "epoch": 2.0201082753286927, "grad_norm": 0.31825477823727466, "learning_rate": 2.9128836943029236e-06, "loss": 0.0209, "step": 5224 }, { "epoch": 2.020494972931168, "grad_norm": 0.2913269288585493, "learning_rate": 2.9108395032590565e-06, "loss": 0.0261, "step": 5225 }, { "epoch": 2.0208816705336425, "grad_norm": 0.3733265725010495, "learning_rate": 2.9087957351874174e-06, "loss": 0.0258, "step": 5226 }, { "epoch": 2.0212683681361177, "grad_norm": 0.3202965561944607, "learning_rate": 2.90675239050179e-06, "loss": 0.0266, "step": 5227 }, { "epoch": 2.0216550657385923, "grad_norm": 0.20584174255369328, "learning_rate": 2.9047094696158707e-06, "loss": 0.0137, "step": 5228 }, { "epoch": 2.0220417633410674, "grad_norm": 0.35129539299635393, "learning_rate": 2.9026669729432675e-06, "loss": 0.034, "step": 5229 }, { "epoch": 2.022428460943542, "grad_norm": 0.25094402470108784, "learning_rate": 2.9006249008975105e-06, "loss": 0.0241, "step": 5230 }, { "epoch": 2.0228151585460172, "grad_norm": 0.24141390078828945, "learning_rate": 2.8985832538920343e-06, "loss": 0.0181, "step": 5231 }, { "epoch": 2.023201856148492, "grad_norm": 0.2891756297574712, "learning_rate": 2.8965420323401927e-06, "loss": 0.0219, "step": 5232 }, { "epoch": 2.0235885537509666, "grad_norm": 0.18827904817858307, "learning_rate": 2.894501236655252e-06, "loss": 0.0165, "step": 5233 }, { "epoch": 2.0239752513534417, "grad_norm": 0.33020957831862613, "learning_rate": 2.892460867250391e-06, "loss": 0.0214, "step": 5234 }, { "epoch": 2.0243619489559164, "grad_norm": 0.32441327577682716, "learning_rate": 2.890420924538706e-06, "loss": 0.0275, "step": 5235 }, { "epoch": 2.0247486465583915, "grad_norm": 0.2866683447514598, "learning_rate": 2.888381408933204e-06, "loss": 0.0225, "step": 5236 }, { "epoch": 2.025135344160866, "grad_norm": 0.2565819036681513, "learning_rate": 2.886342320846806e-06, "loss": 0.0166, "step": 5237 }, { "epoch": 2.0255220417633413, "grad_norm": 0.4356380180436268, "learning_rate": 2.8843036606923467e-06, "loss": 0.0194, "step": 5238 }, { "epoch": 2.025908739365816, "grad_norm": 0.23149560829920743, "learning_rate": 2.8822654288825714e-06, "loss": 0.0161, "step": 5239 }, { "epoch": 2.0262954369682906, "grad_norm": 0.22151496286764613, "learning_rate": 2.8802276258301454e-06, "loss": 0.0197, "step": 5240 }, { "epoch": 2.0266821345707657, "grad_norm": 0.28930660625925403, "learning_rate": 2.878190251947641e-06, "loss": 0.0268, "step": 5241 }, { "epoch": 2.0270688321732404, "grad_norm": 0.2472674377785183, "learning_rate": 2.8761533076475468e-06, "loss": 0.0131, "step": 5242 }, { "epoch": 2.0274555297757155, "grad_norm": 0.31914425262517976, "learning_rate": 2.8741167933422597e-06, "loss": 0.0144, "step": 5243 }, { "epoch": 2.02784222737819, "grad_norm": 0.2531603639183595, "learning_rate": 2.872080709444098e-06, "loss": 0.0161, "step": 5244 }, { "epoch": 2.0282289249806653, "grad_norm": 0.3300298751442462, "learning_rate": 2.870045056365288e-06, "loss": 0.017, "step": 5245 }, { "epoch": 2.02861562258314, "grad_norm": 0.23420365611391658, "learning_rate": 2.8680098345179626e-06, "loss": 0.0139, "step": 5246 }, { "epoch": 2.0290023201856147, "grad_norm": 0.21608264605819333, "learning_rate": 2.8659750443141796e-06, "loss": 0.0175, "step": 5247 }, { "epoch": 2.0293890177880898, "grad_norm": 0.30333288451850215, "learning_rate": 2.8639406861659e-06, "loss": 0.0216, "step": 5248 }, { "epoch": 2.0297757153905645, "grad_norm": 0.2580166924934847, "learning_rate": 2.861906760485006e-06, "loss": 0.0175, "step": 5249 }, { "epoch": 2.0301624129930396, "grad_norm": 0.16582879663710795, "learning_rate": 2.859873267683279e-06, "loss": 0.0116, "step": 5250 }, { "epoch": 2.0305491105955142, "grad_norm": 0.2740184964032042, "learning_rate": 2.8578402081724273e-06, "loss": 0.0151, "step": 5251 }, { "epoch": 2.0309358081979894, "grad_norm": 0.26275171540252334, "learning_rate": 2.855807582364062e-06, "loss": 0.0214, "step": 5252 }, { "epoch": 2.031322505800464, "grad_norm": 0.25641161840748794, "learning_rate": 2.853775390669711e-06, "loss": 0.0237, "step": 5253 }, { "epoch": 2.0317092034029387, "grad_norm": 0.3087553126020504, "learning_rate": 2.85174363350081e-06, "loss": 0.0172, "step": 5254 }, { "epoch": 2.032095901005414, "grad_norm": 0.2032526304349401, "learning_rate": 2.8497123112687092e-06, "loss": 0.0189, "step": 5255 }, { "epoch": 2.0324825986078885, "grad_norm": 0.30207120323283126, "learning_rate": 2.8476814243846747e-06, "loss": 0.0176, "step": 5256 }, { "epoch": 2.0328692962103636, "grad_norm": 0.2308665389166466, "learning_rate": 2.8456509732598782e-06, "loss": 0.0137, "step": 5257 }, { "epoch": 2.0332559938128383, "grad_norm": 0.4381522698525051, "learning_rate": 2.843620958305406e-06, "loss": 0.0184, "step": 5258 }, { "epoch": 2.0336426914153134, "grad_norm": 0.26251158312122785, "learning_rate": 2.841591379932255e-06, "loss": 0.0192, "step": 5259 }, { "epoch": 2.034029389017788, "grad_norm": 0.3621736629468489, "learning_rate": 2.8395622385513332e-06, "loss": 0.0318, "step": 5260 }, { "epoch": 2.0344160866202627, "grad_norm": 0.2088429814810804, "learning_rate": 2.837533534573465e-06, "loss": 0.0169, "step": 5261 }, { "epoch": 2.034802784222738, "grad_norm": 0.2801107677868053, "learning_rate": 2.835505268409381e-06, "loss": 0.0191, "step": 5262 }, { "epoch": 2.0351894818252125, "grad_norm": 0.24625089771206998, "learning_rate": 2.8334774404697234e-06, "loss": 0.0182, "step": 5263 }, { "epoch": 2.0355761794276876, "grad_norm": 0.2488100460253395, "learning_rate": 2.8314500511650466e-06, "loss": 0.0167, "step": 5264 }, { "epoch": 2.0359628770301623, "grad_norm": 0.2223370290284412, "learning_rate": 2.8294231009058203e-06, "loss": 0.017, "step": 5265 }, { "epoch": 2.0363495746326374, "grad_norm": 0.22730699511851385, "learning_rate": 2.82739659010242e-06, "loss": 0.0163, "step": 5266 }, { "epoch": 2.036736272235112, "grad_norm": 0.23354787166220806, "learning_rate": 2.8253705191651303e-06, "loss": 0.0187, "step": 5267 }, { "epoch": 2.0371229698375872, "grad_norm": 0.22237761562955044, "learning_rate": 2.8233448885041548e-06, "loss": 0.0165, "step": 5268 }, { "epoch": 2.037509667440062, "grad_norm": 0.3089891051287465, "learning_rate": 2.8213196985296e-06, "loss": 0.0211, "step": 5269 }, { "epoch": 2.0378963650425366, "grad_norm": 0.29256987184057137, "learning_rate": 2.819294949651492e-06, "loss": 0.0271, "step": 5270 }, { "epoch": 2.0382830626450117, "grad_norm": 0.29188413600142893, "learning_rate": 2.8172706422797548e-06, "loss": 0.0216, "step": 5271 }, { "epoch": 2.0386697602474864, "grad_norm": 0.28964037226570005, "learning_rate": 2.8152467768242363e-06, "loss": 0.0224, "step": 5272 }, { "epoch": 2.0390564578499615, "grad_norm": 0.4527467453597204, "learning_rate": 2.8132233536946872e-06, "loss": 0.0258, "step": 5273 }, { "epoch": 2.039443155452436, "grad_norm": 0.3066817977459527, "learning_rate": 2.8112003733007702e-06, "loss": 0.0177, "step": 5274 }, { "epoch": 2.0398298530549113, "grad_norm": 0.25304892156143943, "learning_rate": 2.8091778360520593e-06, "loss": 0.0211, "step": 5275 }, { "epoch": 2.040216550657386, "grad_norm": 0.2725089729195152, "learning_rate": 2.8071557423580355e-06, "loss": 0.0167, "step": 5276 }, { "epoch": 2.0406032482598606, "grad_norm": 0.27738008577678047, "learning_rate": 2.8051340926280966e-06, "loss": 0.0183, "step": 5277 }, { "epoch": 2.0409899458623357, "grad_norm": 0.26579163697446107, "learning_rate": 2.8031128872715454e-06, "loss": 0.0189, "step": 5278 }, { "epoch": 2.0413766434648104, "grad_norm": 0.4627557182717628, "learning_rate": 2.8010921266975945e-06, "loss": 0.0379, "step": 5279 }, { "epoch": 2.0417633410672855, "grad_norm": 0.30912114793486667, "learning_rate": 2.7990718113153694e-06, "loss": 0.0253, "step": 5280 }, { "epoch": 2.04215003866976, "grad_norm": 0.21966739611414648, "learning_rate": 2.7970519415339005e-06, "loss": 0.0191, "step": 5281 }, { "epoch": 2.0425367362722353, "grad_norm": 0.2638843009217461, "learning_rate": 2.795032517762136e-06, "loss": 0.0143, "step": 5282 }, { "epoch": 2.04292343387471, "grad_norm": 0.27270621919872895, "learning_rate": 2.7930135404089286e-06, "loss": 0.0156, "step": 5283 }, { "epoch": 2.0433101314771847, "grad_norm": 0.31169587135182886, "learning_rate": 2.7909950098830395e-06, "loss": 0.0189, "step": 5284 }, { "epoch": 2.0436968290796598, "grad_norm": 0.3716709548346217, "learning_rate": 2.788976926593141e-06, "loss": 0.0226, "step": 5285 }, { "epoch": 2.0440835266821344, "grad_norm": 0.23579809386744757, "learning_rate": 2.786959290947814e-06, "loss": 0.0196, "step": 5286 }, { "epoch": 2.0444702242846096, "grad_norm": 0.28607920666076014, "learning_rate": 2.784942103355556e-06, "loss": 0.0185, "step": 5287 }, { "epoch": 2.0448569218870842, "grad_norm": 0.31488954103064215, "learning_rate": 2.782925364224758e-06, "loss": 0.0218, "step": 5288 }, { "epoch": 2.0452436194895594, "grad_norm": 0.2652522817871518, "learning_rate": 2.7809090739637373e-06, "loss": 0.017, "step": 5289 }, { "epoch": 2.045630317092034, "grad_norm": 0.30650971759817547, "learning_rate": 2.7788932329807085e-06, "loss": 0.0311, "step": 5290 }, { "epoch": 2.0460170146945087, "grad_norm": 0.3111050836092335, "learning_rate": 2.776877841683805e-06, "loss": 0.0225, "step": 5291 }, { "epoch": 2.046403712296984, "grad_norm": 0.2769584565895511, "learning_rate": 2.774862900481058e-06, "loss": 0.0202, "step": 5292 }, { "epoch": 2.0467904098994585, "grad_norm": 0.3861640367522358, "learning_rate": 2.772848409780413e-06, "loss": 0.0217, "step": 5293 }, { "epoch": 2.0471771075019336, "grad_norm": 0.3023155702150787, "learning_rate": 2.770834369989729e-06, "loss": 0.0143, "step": 5294 }, { "epoch": 2.0475638051044083, "grad_norm": 0.2751198485850258, "learning_rate": 2.7688207815167667e-06, "loss": 0.0184, "step": 5295 }, { "epoch": 2.0479505027068834, "grad_norm": 0.2553891686579063, "learning_rate": 2.7668076447691984e-06, "loss": 0.0155, "step": 5296 }, { "epoch": 2.048337200309358, "grad_norm": 0.29815214130392, "learning_rate": 2.764794960154603e-06, "loss": 0.0176, "step": 5297 }, { "epoch": 2.0487238979118327, "grad_norm": 0.27193472440991556, "learning_rate": 2.7627827280804724e-06, "loss": 0.0187, "step": 5298 }, { "epoch": 2.049110595514308, "grad_norm": 0.23581755286091677, "learning_rate": 2.7607709489542025e-06, "loss": 0.0169, "step": 5299 }, { "epoch": 2.0494972931167825, "grad_norm": 0.30902412470547175, "learning_rate": 2.758759623183099e-06, "loss": 0.0199, "step": 5300 }, { "epoch": 2.0498839907192576, "grad_norm": 0.24379270287881988, "learning_rate": 2.756748751174375e-06, "loss": 0.0162, "step": 5301 }, { "epoch": 2.0502706883217323, "grad_norm": 0.27168087268476826, "learning_rate": 2.7547383333351507e-06, "loss": 0.0205, "step": 5302 }, { "epoch": 2.0506573859242074, "grad_norm": 0.5011379586827117, "learning_rate": 2.7527283700724604e-06, "loss": 0.0211, "step": 5303 }, { "epoch": 2.051044083526682, "grad_norm": 0.2889746484008473, "learning_rate": 2.75071886179324e-06, "loss": 0.0213, "step": 5304 }, { "epoch": 2.0514307811291568, "grad_norm": 0.4158700031663791, "learning_rate": 2.7487098089043348e-06, "loss": 0.0294, "step": 5305 }, { "epoch": 2.051817478731632, "grad_norm": 0.2792975541584348, "learning_rate": 2.7467012118124987e-06, "loss": 0.0203, "step": 5306 }, { "epoch": 2.0522041763341066, "grad_norm": 0.31655436654296093, "learning_rate": 2.7446930709243914e-06, "loss": 0.0161, "step": 5307 }, { "epoch": 2.0525908739365817, "grad_norm": 0.2605850591588904, "learning_rate": 2.7426853866465853e-06, "loss": 0.0183, "step": 5308 }, { "epoch": 2.0529775715390564, "grad_norm": 0.28930582239391206, "learning_rate": 2.7406781593855545e-06, "loss": 0.0197, "step": 5309 }, { "epoch": 2.0533642691415315, "grad_norm": 0.2950515333659069, "learning_rate": 2.7386713895476847e-06, "loss": 0.0225, "step": 5310 }, { "epoch": 2.053750966744006, "grad_norm": 0.3080845759780095, "learning_rate": 2.7366650775392655e-06, "loss": 0.0277, "step": 5311 }, { "epoch": 2.0541376643464813, "grad_norm": 0.3434702530833834, "learning_rate": 2.7346592237664943e-06, "loss": 0.0189, "step": 5312 }, { "epoch": 2.054524361948956, "grad_norm": 0.24951697329195172, "learning_rate": 2.732653828635483e-06, "loss": 0.0187, "step": 5313 }, { "epoch": 2.0549110595514306, "grad_norm": 0.4580270119699704, "learning_rate": 2.730648892552237e-06, "loss": 0.0201, "step": 5314 }, { "epoch": 2.0552977571539057, "grad_norm": 0.21477409976937442, "learning_rate": 2.728644415922682e-06, "loss": 0.0194, "step": 5315 }, { "epoch": 2.0556844547563804, "grad_norm": 0.3053076473088914, "learning_rate": 2.7266403991526412e-06, "loss": 0.0291, "step": 5316 }, { "epoch": 2.0560711523588555, "grad_norm": 0.3512164634217934, "learning_rate": 2.7246368426478554e-06, "loss": 0.0164, "step": 5317 }, { "epoch": 2.05645784996133, "grad_norm": 0.24449341087936882, "learning_rate": 2.7226337468139574e-06, "loss": 0.0155, "step": 5318 }, { "epoch": 2.0568445475638053, "grad_norm": 0.19147271055841916, "learning_rate": 2.7206311120564967e-06, "loss": 0.0152, "step": 5319 }, { "epoch": 2.05723124516628, "grad_norm": 0.2988089521838308, "learning_rate": 2.7186289387809308e-06, "loss": 0.0255, "step": 5320 }, { "epoch": 2.0576179427687546, "grad_norm": 0.33703994653749597, "learning_rate": 2.7166272273926177e-06, "loss": 0.0236, "step": 5321 }, { "epoch": 2.0580046403712298, "grad_norm": 0.25187941298135347, "learning_rate": 2.7146259782968256e-06, "loss": 0.0229, "step": 5322 }, { "epoch": 2.0583913379737044, "grad_norm": 0.3090763681910955, "learning_rate": 2.712625191898726e-06, "loss": 0.0273, "step": 5323 }, { "epoch": 2.0587780355761796, "grad_norm": 0.3580162706154533, "learning_rate": 2.7106248686034033e-06, "loss": 0.0187, "step": 5324 }, { "epoch": 2.0591647331786542, "grad_norm": 0.34235922949267256, "learning_rate": 2.7086250088158404e-06, "loss": 0.0306, "step": 5325 }, { "epoch": 2.0595514307811293, "grad_norm": 0.32921895196790696, "learning_rate": 2.7066256129409314e-06, "loss": 0.0249, "step": 5326 }, { "epoch": 2.059938128383604, "grad_norm": 0.3044897865040958, "learning_rate": 2.704626681383473e-06, "loss": 0.0218, "step": 5327 }, { "epoch": 2.0603248259860787, "grad_norm": 0.2368929327807475, "learning_rate": 2.70262821454817e-06, "loss": 0.0144, "step": 5328 }, { "epoch": 2.060711523588554, "grad_norm": 0.22168249230573958, "learning_rate": 2.700630212839633e-06, "loss": 0.0181, "step": 5329 }, { "epoch": 2.0610982211910285, "grad_norm": 0.5269342720921114, "learning_rate": 2.6986326766623804e-06, "loss": 0.0207, "step": 5330 }, { "epoch": 2.0614849187935036, "grad_norm": 0.27389651058615294, "learning_rate": 2.696635606420831e-06, "loss": 0.0154, "step": 5331 }, { "epoch": 2.0618716163959783, "grad_norm": 0.2772242178033075, "learning_rate": 2.6946390025193136e-06, "loss": 0.0147, "step": 5332 }, { "epoch": 2.0622583139984534, "grad_norm": 0.23440551950361013, "learning_rate": 2.6926428653620602e-06, "loss": 0.0204, "step": 5333 }, { "epoch": 2.062645011600928, "grad_norm": 0.38015386200160123, "learning_rate": 2.6906471953532143e-06, "loss": 0.0303, "step": 5334 }, { "epoch": 2.0630317092034027, "grad_norm": 0.23063352089866604, "learning_rate": 2.688651992896812e-06, "loss": 0.0175, "step": 5335 }, { "epoch": 2.063418406805878, "grad_norm": 0.3097195017581493, "learning_rate": 2.6866572583968093e-06, "loss": 0.0177, "step": 5336 }, { "epoch": 2.0638051044083525, "grad_norm": 0.2586893051336498, "learning_rate": 2.6846629922570566e-06, "loss": 0.0198, "step": 5337 }, { "epoch": 2.0641918020108276, "grad_norm": 0.18990276171025186, "learning_rate": 2.68266919488132e-06, "loss": 0.0128, "step": 5338 }, { "epoch": 2.0645784996133023, "grad_norm": 0.2986148617881932, "learning_rate": 2.680675866673258e-06, "loss": 0.0237, "step": 5339 }, { "epoch": 2.0649651972157774, "grad_norm": 0.24501492930374824, "learning_rate": 2.678683008036442e-06, "loss": 0.0182, "step": 5340 }, { "epoch": 2.065351894818252, "grad_norm": 0.22558062290740055, "learning_rate": 2.6766906193743493e-06, "loss": 0.0184, "step": 5341 }, { "epoch": 2.065738592420727, "grad_norm": 0.24002996851878047, "learning_rate": 2.674698701090358e-06, "loss": 0.017, "step": 5342 }, { "epoch": 2.066125290023202, "grad_norm": 0.2524232253670015, "learning_rate": 2.672707253587753e-06, "loss": 0.0222, "step": 5343 }, { "epoch": 2.0665119876256766, "grad_norm": 0.26596579181338403, "learning_rate": 2.670716277269721e-06, "loss": 0.0201, "step": 5344 }, { "epoch": 2.0668986852281517, "grad_norm": 0.30220675820504544, "learning_rate": 2.6687257725393608e-06, "loss": 0.0265, "step": 5345 }, { "epoch": 2.0672853828306264, "grad_norm": 0.21509124129780147, "learning_rate": 2.666735739799668e-06, "loss": 0.0208, "step": 5346 }, { "epoch": 2.0676720804331015, "grad_norm": 0.2949304610432963, "learning_rate": 2.6647461794535455e-06, "loss": 0.0206, "step": 5347 }, { "epoch": 2.068058778035576, "grad_norm": 0.29894021872474774, "learning_rate": 2.6627570919038003e-06, "loss": 0.0129, "step": 5348 }, { "epoch": 2.0684454756380513, "grad_norm": 0.3240697276428335, "learning_rate": 2.6607684775531425e-06, "loss": 0.0295, "step": 5349 }, { "epoch": 2.068832173240526, "grad_norm": 0.5908287060783236, "learning_rate": 2.658780336804191e-06, "loss": 0.0228, "step": 5350 }, { "epoch": 2.0692188708430006, "grad_norm": 0.33619935484795893, "learning_rate": 2.6567926700594655e-06, "loss": 0.0407, "step": 5351 }, { "epoch": 2.0696055684454757, "grad_norm": 0.31561915895984854, "learning_rate": 2.654805477721384e-06, "loss": 0.0202, "step": 5352 }, { "epoch": 2.0699922660479504, "grad_norm": 0.302071537854745, "learning_rate": 2.6528187601922804e-06, "loss": 0.0215, "step": 5353 }, { "epoch": 2.0703789636504255, "grad_norm": 0.2885034121271702, "learning_rate": 2.6508325178743826e-06, "loss": 0.0186, "step": 5354 }, { "epoch": 2.0707656612529, "grad_norm": 0.3002374978167991, "learning_rate": 2.648846751169831e-06, "loss": 0.0221, "step": 5355 }, { "epoch": 2.0711523588553753, "grad_norm": 0.23943990949609953, "learning_rate": 2.6468614604806577e-06, "loss": 0.0163, "step": 5356 }, { "epoch": 2.07153905645785, "grad_norm": 0.3278520576059983, "learning_rate": 2.6448766462088106e-06, "loss": 0.0186, "step": 5357 }, { "epoch": 2.0719257540603246, "grad_norm": 0.23455298059756494, "learning_rate": 2.6428923087561342e-06, "loss": 0.0147, "step": 5358 }, { "epoch": 2.0723124516627998, "grad_norm": 0.3256429611375834, "learning_rate": 2.640908448524378e-06, "loss": 0.0254, "step": 5359 }, { "epoch": 2.0726991492652744, "grad_norm": 0.3337877641442039, "learning_rate": 2.6389250659151954e-06, "loss": 0.0175, "step": 5360 }, { "epoch": 2.0730858468677495, "grad_norm": 0.3402212393014706, "learning_rate": 2.6369421613301417e-06, "loss": 0.0186, "step": 5361 }, { "epoch": 2.073472544470224, "grad_norm": 0.3047969024288215, "learning_rate": 2.634959735170679e-06, "loss": 0.0203, "step": 5362 }, { "epoch": 2.0738592420726993, "grad_norm": 0.23798552973805917, "learning_rate": 2.632977787838168e-06, "loss": 0.0205, "step": 5363 }, { "epoch": 2.074245939675174, "grad_norm": 0.24126768955324826, "learning_rate": 2.6309963197338763e-06, "loss": 0.0151, "step": 5364 }, { "epoch": 2.0746326372776487, "grad_norm": 0.3240003857664378, "learning_rate": 2.6290153312589706e-06, "loss": 0.0241, "step": 5365 }, { "epoch": 2.075019334880124, "grad_norm": 0.34149611910690064, "learning_rate": 2.6270348228145216e-06, "loss": 0.0232, "step": 5366 }, { "epoch": 2.0754060324825985, "grad_norm": 0.5003642408497414, "learning_rate": 2.625054794801508e-06, "loss": 0.0195, "step": 5367 }, { "epoch": 2.0757927300850736, "grad_norm": 0.3660563099101123, "learning_rate": 2.623075247620805e-06, "loss": 0.0269, "step": 5368 }, { "epoch": 2.0761794276875483, "grad_norm": 0.2480173460137287, "learning_rate": 2.6210961816731915e-06, "loss": 0.0167, "step": 5369 }, { "epoch": 2.0765661252900234, "grad_norm": 0.45781244955565076, "learning_rate": 2.6191175973593497e-06, "loss": 0.0227, "step": 5370 }, { "epoch": 2.076952822892498, "grad_norm": 0.24479356956901707, "learning_rate": 2.617139495079867e-06, "loss": 0.0155, "step": 5371 }, { "epoch": 2.0773395204949727, "grad_norm": 0.3766126109551183, "learning_rate": 2.6151618752352294e-06, "loss": 0.0332, "step": 5372 }, { "epoch": 2.077726218097448, "grad_norm": 0.25176585709974336, "learning_rate": 2.613184738225826e-06, "loss": 0.0137, "step": 5373 }, { "epoch": 2.0781129156999225, "grad_norm": 0.29053666169212417, "learning_rate": 2.611208084451949e-06, "loss": 0.0201, "step": 5374 }, { "epoch": 2.0784996133023976, "grad_norm": 0.3207824268101955, "learning_rate": 2.6092319143137904e-06, "loss": 0.0244, "step": 5375 }, { "epoch": 2.0788863109048723, "grad_norm": 0.22748006357026077, "learning_rate": 2.6072562282114512e-06, "loss": 0.0209, "step": 5376 }, { "epoch": 2.0792730085073474, "grad_norm": 0.4413123511814255, "learning_rate": 2.605281026544927e-06, "loss": 0.0224, "step": 5377 }, { "epoch": 2.079659706109822, "grad_norm": 0.3977939095965566, "learning_rate": 2.6033063097141174e-06, "loss": 0.0187, "step": 5378 }, { "epoch": 2.0800464037122968, "grad_norm": 0.3112159565892516, "learning_rate": 2.6013320781188244e-06, "loss": 0.0203, "step": 5379 }, { "epoch": 2.080433101314772, "grad_norm": 0.2564646781334262, "learning_rate": 2.5993583321587506e-06, "loss": 0.0146, "step": 5380 }, { "epoch": 2.0808197989172466, "grad_norm": 0.27774198616690887, "learning_rate": 2.5973850722335066e-06, "loss": 0.0153, "step": 5381 }, { "epoch": 2.0812064965197217, "grad_norm": 0.3751640713584095, "learning_rate": 2.5954122987425913e-06, "loss": 0.0261, "step": 5382 }, { "epoch": 2.0815931941221963, "grad_norm": 0.34056027757878904, "learning_rate": 2.5934400120854198e-06, "loss": 0.0262, "step": 5383 }, { "epoch": 2.0819798917246715, "grad_norm": 0.3224574789055472, "learning_rate": 2.5914682126612972e-06, "loss": 0.0285, "step": 5384 }, { "epoch": 2.082366589327146, "grad_norm": 0.2622650225473402, "learning_rate": 2.5894969008694413e-06, "loss": 0.02, "step": 5385 }, { "epoch": 2.0827532869296213, "grad_norm": 0.3408821472649468, "learning_rate": 2.5875260771089583e-06, "loss": 0.0211, "step": 5386 }, { "epoch": 2.083139984532096, "grad_norm": 0.23488072627105036, "learning_rate": 2.585555741778862e-06, "loss": 0.02, "step": 5387 }, { "epoch": 2.0835266821345706, "grad_norm": 0.25400477824578654, "learning_rate": 2.583585895278072e-06, "loss": 0.0193, "step": 5388 }, { "epoch": 2.0839133797370457, "grad_norm": 0.47670736976139455, "learning_rate": 2.5816165380054007e-06, "loss": 0.0207, "step": 5389 }, { "epoch": 2.0843000773395204, "grad_norm": 0.31345394924369896, "learning_rate": 2.5796476703595665e-06, "loss": 0.0156, "step": 5390 }, { "epoch": 2.0846867749419955, "grad_norm": 0.25882885991022764, "learning_rate": 2.577679292739186e-06, "loss": 0.0147, "step": 5391 }, { "epoch": 2.08507347254447, "grad_norm": 0.2824308255420566, "learning_rate": 2.5757114055427757e-06, "loss": 0.019, "step": 5392 }, { "epoch": 2.0854601701469453, "grad_norm": 0.40406744447160475, "learning_rate": 2.5737440091687594e-06, "loss": 0.0388, "step": 5393 }, { "epoch": 2.08584686774942, "grad_norm": 0.34120742852857844, "learning_rate": 2.571777104015455e-06, "loss": 0.0309, "step": 5394 }, { "epoch": 2.0862335653518946, "grad_norm": 0.26691660112765164, "learning_rate": 2.569810690481082e-06, "loss": 0.0171, "step": 5395 }, { "epoch": 2.0866202629543698, "grad_norm": 0.22778915302662778, "learning_rate": 2.5678447689637596e-06, "loss": 0.0159, "step": 5396 }, { "epoch": 2.0870069605568444, "grad_norm": 0.30765773517624495, "learning_rate": 2.5658793398615134e-06, "loss": 0.0253, "step": 5397 }, { "epoch": 2.0873936581593195, "grad_norm": 0.281473778901191, "learning_rate": 2.563914403572265e-06, "loss": 0.0191, "step": 5398 }, { "epoch": 2.087780355761794, "grad_norm": 0.33990059321654753, "learning_rate": 2.56194996049383e-06, "loss": 0.0179, "step": 5399 }, { "epoch": 2.0881670533642693, "grad_norm": 0.2122517215681955, "learning_rate": 2.559986011023935e-06, "loss": 0.0151, "step": 5400 }, { "epoch": 2.088553750966744, "grad_norm": 0.2901014261012021, "learning_rate": 2.5580225555602e-06, "loss": 0.0187, "step": 5401 }, { "epoch": 2.0889404485692187, "grad_norm": 0.29209967466638936, "learning_rate": 2.556059594500152e-06, "loss": 0.0162, "step": 5402 }, { "epoch": 2.089327146171694, "grad_norm": 0.5950041289566228, "learning_rate": 2.554097128241204e-06, "loss": 0.0153, "step": 5403 }, { "epoch": 2.0897138437741685, "grad_norm": 0.24862712591346503, "learning_rate": 2.5521351571806853e-06, "loss": 0.0212, "step": 5404 }, { "epoch": 2.0901005413766436, "grad_norm": 0.26776640766050547, "learning_rate": 2.5501736817158147e-06, "loss": 0.0186, "step": 5405 }, { "epoch": 2.0904872389791183, "grad_norm": 0.22878484513025024, "learning_rate": 2.5482127022437127e-06, "loss": 0.015, "step": 5406 }, { "epoch": 2.0908739365815934, "grad_norm": 0.29970930180180605, "learning_rate": 2.5462522191614005e-06, "loss": 0.0137, "step": 5407 }, { "epoch": 2.091260634184068, "grad_norm": 0.32187599939584355, "learning_rate": 2.544292232865796e-06, "loss": 0.0249, "step": 5408 }, { "epoch": 2.0916473317865427, "grad_norm": 0.2824846749139934, "learning_rate": 2.5423327437537226e-06, "loss": 0.022, "step": 5409 }, { "epoch": 2.092034029389018, "grad_norm": 0.31870695894385753, "learning_rate": 2.5403737522218976e-06, "loss": 0.0194, "step": 5410 }, { "epoch": 2.0924207269914925, "grad_norm": 0.3072943943254528, "learning_rate": 2.5384152586669395e-06, "loss": 0.0203, "step": 5411 }, { "epoch": 2.0928074245939676, "grad_norm": 0.24520171081307457, "learning_rate": 2.5364572634853646e-06, "loss": 0.0187, "step": 5412 }, { "epoch": 2.0931941221964423, "grad_norm": 0.2670288486613197, "learning_rate": 2.5344997670735883e-06, "loss": 0.0258, "step": 5413 }, { "epoch": 2.0935808197989174, "grad_norm": 0.3604804867528823, "learning_rate": 2.53254276982793e-06, "loss": 0.0261, "step": 5414 }, { "epoch": 2.093967517401392, "grad_norm": 0.2608597768040157, "learning_rate": 2.530586272144602e-06, "loss": 0.017, "step": 5415 }, { "epoch": 2.094354215003867, "grad_norm": 0.3669424236300309, "learning_rate": 2.528630274419717e-06, "loss": 0.0228, "step": 5416 }, { "epoch": 2.094740912606342, "grad_norm": 0.2506982509199585, "learning_rate": 2.526674777049285e-06, "loss": 0.0207, "step": 5417 }, { "epoch": 2.0951276102088165, "grad_norm": 0.3986598180025309, "learning_rate": 2.524719780429221e-06, "loss": 0.0209, "step": 5418 }, { "epoch": 2.0955143078112917, "grad_norm": 0.3043523965830727, "learning_rate": 2.522765284955335e-06, "loss": 0.0225, "step": 5419 }, { "epoch": 2.0959010054137663, "grad_norm": 0.4073609196373294, "learning_rate": 2.520811291023329e-06, "loss": 0.0238, "step": 5420 }, { "epoch": 2.0962877030162415, "grad_norm": 0.3293416874767295, "learning_rate": 2.5188577990288144e-06, "loss": 0.0221, "step": 5421 }, { "epoch": 2.096674400618716, "grad_norm": 0.2571252641622299, "learning_rate": 2.516904809367292e-06, "loss": 0.0162, "step": 5422 }, { "epoch": 2.0970610982211912, "grad_norm": 0.23276657398085973, "learning_rate": 2.5149523224341723e-06, "loss": 0.0143, "step": 5423 }, { "epoch": 2.097447795823666, "grad_norm": 0.23323525516236837, "learning_rate": 2.5130003386247457e-06, "loss": 0.0141, "step": 5424 }, { "epoch": 2.0978344934261406, "grad_norm": 0.2580325851266247, "learning_rate": 2.5110488583342207e-06, "loss": 0.0206, "step": 5425 }, { "epoch": 2.0982211910286157, "grad_norm": 0.3043910144093354, "learning_rate": 2.5090978819576907e-06, "loss": 0.0192, "step": 5426 }, { "epoch": 2.0986078886310904, "grad_norm": 0.4474758025022345, "learning_rate": 2.5071474098901516e-06, "loss": 0.0232, "step": 5427 }, { "epoch": 2.0989945862335655, "grad_norm": 0.25318048569201407, "learning_rate": 2.505197442526497e-06, "loss": 0.0158, "step": 5428 }, { "epoch": 2.09938128383604, "grad_norm": 0.2586973348955052, "learning_rate": 2.5032479802615163e-06, "loss": 0.021, "step": 5429 }, { "epoch": 2.0997679814385153, "grad_norm": 0.27892285905970765, "learning_rate": 2.5012990234899015e-06, "loss": 0.0213, "step": 5430 }, { "epoch": 2.10015467904099, "grad_norm": 0.28852101585250617, "learning_rate": 2.499350572606238e-06, "loss": 0.0287, "step": 5431 }, { "epoch": 2.1005413766434646, "grad_norm": 0.2530058272569247, "learning_rate": 2.4974026280050096e-06, "loss": 0.0126, "step": 5432 }, { "epoch": 2.1009280742459397, "grad_norm": 0.28833348827168387, "learning_rate": 2.4954551900805973e-06, "loss": 0.0157, "step": 5433 }, { "epoch": 2.1013147718484144, "grad_norm": 0.25806233389686806, "learning_rate": 2.493508259227279e-06, "loss": 0.0154, "step": 5434 }, { "epoch": 2.1017014694508895, "grad_norm": 0.39923436071529783, "learning_rate": 2.4915618358392345e-06, "loss": 0.0202, "step": 5435 }, { "epoch": 2.102088167053364, "grad_norm": 0.36207912022479644, "learning_rate": 2.489615920310536e-06, "loss": 0.0219, "step": 5436 }, { "epoch": 2.1024748646558393, "grad_norm": 0.27243075146437945, "learning_rate": 2.4876705130351536e-06, "loss": 0.018, "step": 5437 }, { "epoch": 2.102861562258314, "grad_norm": 0.33701916711654073, "learning_rate": 2.4857256144069554e-06, "loss": 0.0294, "step": 5438 }, { "epoch": 2.1032482598607887, "grad_norm": 0.42645260500134935, "learning_rate": 2.4837812248197046e-06, "loss": 0.0275, "step": 5439 }, { "epoch": 2.103634957463264, "grad_norm": 0.27815931623373497, "learning_rate": 2.481837344667067e-06, "loss": 0.0159, "step": 5440 }, { "epoch": 2.1040216550657385, "grad_norm": 0.29302846935556304, "learning_rate": 2.479893974342599e-06, "loss": 0.0196, "step": 5441 }, { "epoch": 2.1044083526682136, "grad_norm": 0.31624756472165894, "learning_rate": 2.4779511142397566e-06, "loss": 0.0199, "step": 5442 }, { "epoch": 2.1047950502706883, "grad_norm": 0.2768338042211442, "learning_rate": 2.47600876475189e-06, "loss": 0.0274, "step": 5443 }, { "epoch": 2.1051817478731634, "grad_norm": 0.2805432707507984, "learning_rate": 2.474066926272251e-06, "loss": 0.0219, "step": 5444 }, { "epoch": 2.105568445475638, "grad_norm": 0.35472279988019656, "learning_rate": 2.4721255991939856e-06, "loss": 0.0169, "step": 5445 }, { "epoch": 2.1059551430781127, "grad_norm": 0.3176801456699407, "learning_rate": 2.4701847839101306e-06, "loss": 0.0227, "step": 5446 }, { "epoch": 2.106341840680588, "grad_norm": 0.2844101645884148, "learning_rate": 2.4682444808136284e-06, "loss": 0.0266, "step": 5447 }, { "epoch": 2.1067285382830625, "grad_norm": 0.414636230001969, "learning_rate": 2.466304690297311e-06, "loss": 0.0242, "step": 5448 }, { "epoch": 2.1071152358855376, "grad_norm": 0.2879887811376733, "learning_rate": 2.464365412753914e-06, "loss": 0.0199, "step": 5449 }, { "epoch": 2.1075019334880123, "grad_norm": 0.24263585742630944, "learning_rate": 2.4624266485760574e-06, "loss": 0.0182, "step": 5450 }, { "epoch": 2.1078886310904874, "grad_norm": 0.27498792276500256, "learning_rate": 2.4604883981562696e-06, "loss": 0.0198, "step": 5451 }, { "epoch": 2.108275328692962, "grad_norm": 0.3070363099624175, "learning_rate": 2.4585506618869664e-06, "loss": 0.0215, "step": 5452 }, { "epoch": 2.1086620262954368, "grad_norm": 0.2663684346094763, "learning_rate": 2.456613440160465e-06, "loss": 0.0145, "step": 5453 }, { "epoch": 2.109048723897912, "grad_norm": 0.2283651319182616, "learning_rate": 2.4546767333689737e-06, "loss": 0.0183, "step": 5454 }, { "epoch": 2.1094354215003865, "grad_norm": 0.2807185942502614, "learning_rate": 2.4527405419045984e-06, "loss": 0.0152, "step": 5455 }, { "epoch": 2.1098221191028617, "grad_norm": 0.3245784149415432, "learning_rate": 2.4508048661593447e-06, "loss": 0.0177, "step": 5456 }, { "epoch": 2.1102088167053363, "grad_norm": 0.26479220983259794, "learning_rate": 2.448869706525108e-06, "loss": 0.019, "step": 5457 }, { "epoch": 2.1105955143078114, "grad_norm": 0.30525865888583753, "learning_rate": 2.4469350633936815e-06, "loss": 0.0205, "step": 5458 }, { "epoch": 2.110982211910286, "grad_norm": 0.2242751430316754, "learning_rate": 2.4450009371567547e-06, "loss": 0.017, "step": 5459 }, { "epoch": 2.111368909512761, "grad_norm": 0.26606564894680657, "learning_rate": 2.4430673282059082e-06, "loss": 0.0189, "step": 5460 }, { "epoch": 2.111755607115236, "grad_norm": 0.23533184433441198, "learning_rate": 2.441134236932626e-06, "loss": 0.0146, "step": 5461 }, { "epoch": 2.1121423047177106, "grad_norm": 0.29278651223722857, "learning_rate": 2.43920166372828e-06, "loss": 0.0279, "step": 5462 }, { "epoch": 2.1125290023201857, "grad_norm": 0.4850914613541874, "learning_rate": 2.4372696089841403e-06, "loss": 0.0273, "step": 5463 }, { "epoch": 2.1129156999226604, "grad_norm": 0.31963595626005437, "learning_rate": 2.4353380730913713e-06, "loss": 0.02, "step": 5464 }, { "epoch": 2.1133023975251355, "grad_norm": 0.27421809632272326, "learning_rate": 2.43340705644103e-06, "loss": 0.0188, "step": 5465 }, { "epoch": 2.11368909512761, "grad_norm": 0.325568759835259, "learning_rate": 2.4314765594240773e-06, "loss": 0.0235, "step": 5466 }, { "epoch": 2.1140757927300853, "grad_norm": 0.2510434607126261, "learning_rate": 2.429546582431354e-06, "loss": 0.0138, "step": 5467 }, { "epoch": 2.11446249033256, "grad_norm": 0.2981881763604929, "learning_rate": 2.4276171258536106e-06, "loss": 0.0223, "step": 5468 }, { "epoch": 2.1148491879350346, "grad_norm": 0.2484112377750013, "learning_rate": 2.4256881900814812e-06, "loss": 0.0187, "step": 5469 }, { "epoch": 2.1152358855375097, "grad_norm": 0.2342101614202839, "learning_rate": 2.4237597755055036e-06, "loss": 0.0133, "step": 5470 }, { "epoch": 2.1156225831399844, "grad_norm": 0.30599799864202704, "learning_rate": 2.421831882516102e-06, "loss": 0.0216, "step": 5471 }, { "epoch": 2.1160092807424595, "grad_norm": 0.30112849672945774, "learning_rate": 2.419904511503596e-06, "loss": 0.0215, "step": 5472 }, { "epoch": 2.116395978344934, "grad_norm": 0.3200077292362353, "learning_rate": 2.4179776628582075e-06, "loss": 0.0295, "step": 5473 }, { "epoch": 2.1167826759474093, "grad_norm": 0.162157248502624, "learning_rate": 2.416051336970044e-06, "loss": 0.0113, "step": 5474 }, { "epoch": 2.117169373549884, "grad_norm": 0.22774365330787089, "learning_rate": 2.4141255342291105e-06, "loss": 0.0131, "step": 5475 }, { "epoch": 2.1175560711523587, "grad_norm": 0.2896862885198182, "learning_rate": 2.412200255025303e-06, "loss": 0.0183, "step": 5476 }, { "epoch": 2.117942768754834, "grad_norm": 0.33141224944242914, "learning_rate": 2.4102754997484203e-06, "loss": 0.0412, "step": 5477 }, { "epoch": 2.1183294663573085, "grad_norm": 0.2982183984202617, "learning_rate": 2.408351268788145e-06, "loss": 0.0219, "step": 5478 }, { "epoch": 2.1187161639597836, "grad_norm": 0.2795130155449387, "learning_rate": 2.4064275625340583e-06, "loss": 0.0249, "step": 5479 }, { "epoch": 2.1191028615622582, "grad_norm": 0.4061381919277994, "learning_rate": 2.4045043813756347e-06, "loss": 0.0303, "step": 5480 }, { "epoch": 2.1194895591647334, "grad_norm": 0.2734769219316921, "learning_rate": 2.4025817257022405e-06, "loss": 0.0231, "step": 5481 }, { "epoch": 2.119876256767208, "grad_norm": 0.32180798353200557, "learning_rate": 2.400659595903141e-06, "loss": 0.0235, "step": 5482 }, { "epoch": 2.1202629543696827, "grad_norm": 0.4082546940041547, "learning_rate": 2.398737992367489e-06, "loss": 0.0342, "step": 5483 }, { "epoch": 2.120649651972158, "grad_norm": 0.2910790283568161, "learning_rate": 2.3968169154843327e-06, "loss": 0.0183, "step": 5484 }, { "epoch": 2.1210363495746325, "grad_norm": 0.21258752710118534, "learning_rate": 2.394896365642615e-06, "loss": 0.0148, "step": 5485 }, { "epoch": 2.1214230471771076, "grad_norm": 0.22868690692904195, "learning_rate": 2.3929763432311693e-06, "loss": 0.0186, "step": 5486 }, { "epoch": 2.1218097447795823, "grad_norm": 0.2718617362664035, "learning_rate": 2.3910568486387294e-06, "loss": 0.025, "step": 5487 }, { "epoch": 2.1221964423820574, "grad_norm": 0.20144829302458733, "learning_rate": 2.3891378822539084e-06, "loss": 0.0124, "step": 5488 }, { "epoch": 2.122583139984532, "grad_norm": 0.3407181503675884, "learning_rate": 2.3872194444652286e-06, "loss": 0.0168, "step": 5489 }, { "epoch": 2.1229698375870067, "grad_norm": 0.7350102094436153, "learning_rate": 2.3853015356610927e-06, "loss": 0.0294, "step": 5490 }, { "epoch": 2.123356535189482, "grad_norm": 0.33618406502760956, "learning_rate": 2.3833841562298064e-06, "loss": 0.0293, "step": 5491 }, { "epoch": 2.1237432327919565, "grad_norm": 0.322297795042359, "learning_rate": 2.381467306559558e-06, "loss": 0.0283, "step": 5492 }, { "epoch": 2.1241299303944317, "grad_norm": 0.2737027094838066, "learning_rate": 2.3795509870384348e-06, "loss": 0.0217, "step": 5493 }, { "epoch": 2.1245166279969063, "grad_norm": 0.3817386195578453, "learning_rate": 2.377635198054417e-06, "loss": 0.0257, "step": 5494 }, { "epoch": 2.1249033255993814, "grad_norm": 0.2527057222333551, "learning_rate": 2.3757199399953764e-06, "loss": 0.0188, "step": 5495 }, { "epoch": 2.125290023201856, "grad_norm": 0.565445689785127, "learning_rate": 2.3738052132490745e-06, "loss": 0.0267, "step": 5496 }, { "epoch": 2.1256767208043312, "grad_norm": 0.28931297630017094, "learning_rate": 2.371891018203168e-06, "loss": 0.025, "step": 5497 }, { "epoch": 2.126063418406806, "grad_norm": 0.26828129654120175, "learning_rate": 2.369977355245208e-06, "loss": 0.0226, "step": 5498 }, { "epoch": 2.1264501160092806, "grad_norm": 0.3177277184298452, "learning_rate": 2.3680642247626334e-06, "loss": 0.0194, "step": 5499 }, { "epoch": 2.1268368136117557, "grad_norm": 0.33115677107435965, "learning_rate": 2.3661516271427783e-06, "loss": 0.0179, "step": 5500 }, { "epoch": 2.1272235112142304, "grad_norm": 0.3327680153209643, "learning_rate": 2.364239562772867e-06, "loss": 0.0209, "step": 5501 }, { "epoch": 2.1276102088167055, "grad_norm": 0.7372225576644016, "learning_rate": 2.3623280320400162e-06, "loss": 0.0208, "step": 5502 }, { "epoch": 2.12799690641918, "grad_norm": 0.26672817525684583, "learning_rate": 2.3604170353312373e-06, "loss": 0.0196, "step": 5503 }, { "epoch": 2.1283836040216553, "grad_norm": 0.2900093643176145, "learning_rate": 2.35850657303343e-06, "loss": 0.0189, "step": 5504 }, { "epoch": 2.12877030162413, "grad_norm": 0.2806568442336254, "learning_rate": 2.3565966455333877e-06, "loss": 0.0164, "step": 5505 }, { "epoch": 2.1291569992266046, "grad_norm": 0.23091543040526374, "learning_rate": 2.354687253217795e-06, "loss": 0.0172, "step": 5506 }, { "epoch": 2.1295436968290797, "grad_norm": 0.2475184933095914, "learning_rate": 2.3527783964732264e-06, "loss": 0.0161, "step": 5507 }, { "epoch": 2.1299303944315544, "grad_norm": 0.3549396453037442, "learning_rate": 2.350870075686153e-06, "loss": 0.0202, "step": 5508 }, { "epoch": 2.1303170920340295, "grad_norm": 0.22640252050724408, "learning_rate": 2.348962291242933e-06, "loss": 0.0159, "step": 5509 }, { "epoch": 2.130703789636504, "grad_norm": 0.27472329842640564, "learning_rate": 2.3470550435298157e-06, "loss": 0.0223, "step": 5510 }, { "epoch": 2.1310904872389793, "grad_norm": 0.3087125877040483, "learning_rate": 2.345148332932945e-06, "loss": 0.0279, "step": 5511 }, { "epoch": 2.131477184841454, "grad_norm": 0.21520352949784338, "learning_rate": 2.3432421598383513e-06, "loss": 0.0117, "step": 5512 }, { "epoch": 2.1318638824439287, "grad_norm": 0.4098141676222249, "learning_rate": 2.341336524631965e-06, "loss": 0.0369, "step": 5513 }, { "epoch": 2.1322505800464038, "grad_norm": 0.2994900541374728, "learning_rate": 2.3394314276995943e-06, "loss": 0.0167, "step": 5514 }, { "epoch": 2.1326372776488784, "grad_norm": 0.6434190062551625, "learning_rate": 2.337526869426952e-06, "loss": 0.0256, "step": 5515 }, { "epoch": 2.1330239752513536, "grad_norm": 0.3695018465599936, "learning_rate": 2.335622850199631e-06, "loss": 0.0224, "step": 5516 }, { "epoch": 2.1334106728538282, "grad_norm": 0.34687135033862937, "learning_rate": 2.333719370403126e-06, "loss": 0.0209, "step": 5517 }, { "epoch": 2.1337973704563034, "grad_norm": 0.3103474104415665, "learning_rate": 2.3318164304228096e-06, "loss": 0.021, "step": 5518 }, { "epoch": 2.134184068058778, "grad_norm": 0.231495856348425, "learning_rate": 2.329914030643953e-06, "loss": 0.0131, "step": 5519 }, { "epoch": 2.1345707656612527, "grad_norm": 0.2579856078062935, "learning_rate": 2.32801217145172e-06, "loss": 0.0197, "step": 5520 }, { "epoch": 2.134957463263728, "grad_norm": 0.3730167693042328, "learning_rate": 2.3261108532311603e-06, "loss": 0.0271, "step": 5521 }, { "epoch": 2.1353441608662025, "grad_norm": 0.26635541239413096, "learning_rate": 2.324210076367215e-06, "loss": 0.0191, "step": 5522 }, { "epoch": 2.1357308584686776, "grad_norm": 0.18074229393333355, "learning_rate": 2.3223098412447144e-06, "loss": 0.0118, "step": 5523 }, { "epoch": 2.1361175560711523, "grad_norm": 0.24465818865691946, "learning_rate": 2.320410148248384e-06, "loss": 0.0199, "step": 5524 }, { "epoch": 2.1365042536736274, "grad_norm": 0.21406160385549636, "learning_rate": 2.318510997762836e-06, "loss": 0.0141, "step": 5525 }, { "epoch": 2.136890951276102, "grad_norm": 0.295402265584286, "learning_rate": 2.3166123901725718e-06, "loss": 0.0258, "step": 5526 }, { "epoch": 2.1372776488785767, "grad_norm": 0.20434205916514375, "learning_rate": 2.314714325861984e-06, "loss": 0.0113, "step": 5527 }, { "epoch": 2.137664346481052, "grad_norm": 0.30316729227640027, "learning_rate": 2.3128168052153547e-06, "loss": 0.0257, "step": 5528 }, { "epoch": 2.1380510440835265, "grad_norm": 0.39420222921264064, "learning_rate": 2.31091982861686e-06, "loss": 0.0201, "step": 5529 }, { "epoch": 2.1384377416860016, "grad_norm": 0.2661123763019593, "learning_rate": 2.3090233964505605e-06, "loss": 0.0171, "step": 5530 }, { "epoch": 2.1388244392884763, "grad_norm": 0.2732923523255615, "learning_rate": 2.307127509100408e-06, "loss": 0.0173, "step": 5531 }, { "epoch": 2.1392111368909514, "grad_norm": 0.27590093699951257, "learning_rate": 2.305232166950246e-06, "loss": 0.0203, "step": 5532 }, { "epoch": 2.139597834493426, "grad_norm": 0.2906203864424876, "learning_rate": 2.3033373703838023e-06, "loss": 0.0206, "step": 5533 }, { "epoch": 2.139984532095901, "grad_norm": 0.40520834336307604, "learning_rate": 2.301443119784706e-06, "loss": 0.0286, "step": 5534 }, { "epoch": 2.140371229698376, "grad_norm": 0.2696850325563892, "learning_rate": 2.299549415536459e-06, "loss": 0.0176, "step": 5535 }, { "epoch": 2.1407579273008506, "grad_norm": 0.29467124200695377, "learning_rate": 2.2976562580224667e-06, "loss": 0.0212, "step": 5536 }, { "epoch": 2.1411446249033257, "grad_norm": 0.24255328402551768, "learning_rate": 2.2957636476260154e-06, "loss": 0.0208, "step": 5537 }, { "epoch": 2.1415313225058004, "grad_norm": 0.6738038071018794, "learning_rate": 2.2938715847302894e-06, "loss": 0.0248, "step": 5538 }, { "epoch": 2.1419180201082755, "grad_norm": 0.2336252728824754, "learning_rate": 2.2919800697183503e-06, "loss": 0.0166, "step": 5539 }, { "epoch": 2.14230471771075, "grad_norm": 0.2433603255552502, "learning_rate": 2.2900891029731557e-06, "loss": 0.0193, "step": 5540 }, { "epoch": 2.142691415313225, "grad_norm": 0.2686421130217084, "learning_rate": 2.288198684877555e-06, "loss": 0.0214, "step": 5541 }, { "epoch": 2.1430781129157, "grad_norm": 0.3197464530022398, "learning_rate": 2.2863088158142802e-06, "loss": 0.0175, "step": 5542 }, { "epoch": 2.1434648105181746, "grad_norm": 0.2408817129637925, "learning_rate": 2.284419496165956e-06, "loss": 0.016, "step": 5543 }, { "epoch": 2.1438515081206497, "grad_norm": 0.2892834779805419, "learning_rate": 2.282530726315094e-06, "loss": 0.0206, "step": 5544 }, { "epoch": 2.1442382057231244, "grad_norm": 0.24726764073563431, "learning_rate": 2.2806425066440945e-06, "loss": 0.0206, "step": 5545 }, { "epoch": 2.1446249033255995, "grad_norm": 0.4248268318843476, "learning_rate": 2.27875483753525e-06, "loss": 0.0219, "step": 5546 }, { "epoch": 2.145011600928074, "grad_norm": 0.30003350104532955, "learning_rate": 2.2768677193707366e-06, "loss": 0.0143, "step": 5547 }, { "epoch": 2.1453982985305493, "grad_norm": 0.36107894005379737, "learning_rate": 2.2749811525326216e-06, "loss": 0.0416, "step": 5548 }, { "epoch": 2.145784996133024, "grad_norm": 0.20963065508467082, "learning_rate": 2.273095137402858e-06, "loss": 0.0151, "step": 5549 }, { "epoch": 2.1461716937354987, "grad_norm": 0.2941758687971202, "learning_rate": 2.271209674363293e-06, "loss": 0.0203, "step": 5550 }, { "epoch": 2.1465583913379738, "grad_norm": 0.29512177701185927, "learning_rate": 2.269324763795657e-06, "loss": 0.0242, "step": 5551 }, { "epoch": 2.1469450889404484, "grad_norm": 0.22154503140599002, "learning_rate": 2.2674404060815662e-06, "loss": 0.0146, "step": 5552 }, { "epoch": 2.1473317865429236, "grad_norm": 0.2036213502037634, "learning_rate": 2.2655566016025326e-06, "loss": 0.017, "step": 5553 }, { "epoch": 2.1477184841453982, "grad_norm": 0.25569214476998253, "learning_rate": 2.263673350739948e-06, "loss": 0.0147, "step": 5554 }, { "epoch": 2.1481051817478733, "grad_norm": 0.24619174241738714, "learning_rate": 2.261790653875103e-06, "loss": 0.0153, "step": 5555 }, { "epoch": 2.148491879350348, "grad_norm": 0.2497812141166436, "learning_rate": 2.2599085113891606e-06, "loss": 0.0171, "step": 5556 }, { "epoch": 2.1488785769528227, "grad_norm": 0.2598705789827968, "learning_rate": 2.258026923663186e-06, "loss": 0.0211, "step": 5557 }, { "epoch": 2.149265274555298, "grad_norm": 0.25877491267936864, "learning_rate": 2.2561458910781244e-06, "loss": 0.0223, "step": 5558 }, { "epoch": 2.1496519721577725, "grad_norm": 0.2932654209076981, "learning_rate": 2.2542654140148094e-06, "loss": 0.0207, "step": 5559 }, { "epoch": 2.1500386697602476, "grad_norm": 0.2857294690995296, "learning_rate": 2.2523854928539648e-06, "loss": 0.0177, "step": 5560 }, { "epoch": 2.1504253673627223, "grad_norm": 0.2503564571122219, "learning_rate": 2.2505061279761968e-06, "loss": 0.0251, "step": 5561 }, { "epoch": 2.1508120649651974, "grad_norm": 0.24992132073760906, "learning_rate": 2.2486273197620074e-06, "loss": 0.0196, "step": 5562 }, { "epoch": 2.151198762567672, "grad_norm": 0.2966557112959344, "learning_rate": 2.2467490685917776e-06, "loss": 0.0222, "step": 5563 }, { "epoch": 2.151585460170147, "grad_norm": 0.33309351554689876, "learning_rate": 2.2448713748457794e-06, "loss": 0.0181, "step": 5564 }, { "epoch": 2.151972157772622, "grad_norm": 0.25773643641157434, "learning_rate": 2.242994238904172e-06, "loss": 0.023, "step": 5565 }, { "epoch": 2.1523588553750965, "grad_norm": 0.21083328809641946, "learning_rate": 2.241117661146998e-06, "loss": 0.0172, "step": 5566 }, { "epoch": 2.1527455529775716, "grad_norm": 0.29214279948727884, "learning_rate": 2.239241641954195e-06, "loss": 0.0226, "step": 5567 }, { "epoch": 2.1531322505800463, "grad_norm": 0.3204374364354827, "learning_rate": 2.23736618170558e-06, "loss": 0.0276, "step": 5568 }, { "epoch": 2.1535189481825214, "grad_norm": 0.28141432431653246, "learning_rate": 2.23549128078086e-06, "loss": 0.0214, "step": 5569 }, { "epoch": 2.153905645784996, "grad_norm": 0.23186199612000344, "learning_rate": 2.233616939559625e-06, "loss": 0.0173, "step": 5570 }, { "epoch": 2.154292343387471, "grad_norm": 0.23565969763168002, "learning_rate": 2.23174315842136e-06, "loss": 0.0158, "step": 5571 }, { "epoch": 2.154679040989946, "grad_norm": 0.29869162181551534, "learning_rate": 2.2298699377454285e-06, "loss": 0.0197, "step": 5572 }, { "epoch": 2.1550657385924206, "grad_norm": 0.25432781946261507, "learning_rate": 2.227997277911084e-06, "loss": 0.018, "step": 5573 }, { "epoch": 2.1554524361948957, "grad_norm": 0.22148014031705895, "learning_rate": 2.2261251792974662e-06, "loss": 0.0155, "step": 5574 }, { "epoch": 2.1558391337973704, "grad_norm": 0.3766216551499557, "learning_rate": 2.2242536422835987e-06, "loss": 0.0192, "step": 5575 }, { "epoch": 2.1562258313998455, "grad_norm": 0.25947260923228804, "learning_rate": 2.2223826672483967e-06, "loss": 0.0249, "step": 5576 }, { "epoch": 2.15661252900232, "grad_norm": 0.31053692680533485, "learning_rate": 2.2205122545706596e-06, "loss": 0.0218, "step": 5577 }, { "epoch": 2.1569992266047953, "grad_norm": 0.27775412998228993, "learning_rate": 2.2186424046290663e-06, "loss": 0.0202, "step": 5578 }, { "epoch": 2.15738592420727, "grad_norm": 0.281112529571805, "learning_rate": 2.2167731178021918e-06, "loss": 0.0164, "step": 5579 }, { "epoch": 2.1577726218097446, "grad_norm": 0.2923689841473786, "learning_rate": 2.2149043944684895e-06, "loss": 0.0186, "step": 5580 }, { "epoch": 2.1581593194122197, "grad_norm": 0.21464585713755938, "learning_rate": 2.213036235006307e-06, "loss": 0.0181, "step": 5581 }, { "epoch": 2.1585460170146944, "grad_norm": 0.2057313360145273, "learning_rate": 2.2111686397938663e-06, "loss": 0.0132, "step": 5582 }, { "epoch": 2.1589327146171695, "grad_norm": 0.23772087376832995, "learning_rate": 2.2093016092092866e-06, "loss": 0.0138, "step": 5583 }, { "epoch": 2.159319412219644, "grad_norm": 0.3233367447850515, "learning_rate": 2.2074351436305647e-06, "loss": 0.02, "step": 5584 }, { "epoch": 2.1597061098221193, "grad_norm": 0.2687148915024554, "learning_rate": 2.2055692434355876e-06, "loss": 0.0189, "step": 5585 }, { "epoch": 2.160092807424594, "grad_norm": 0.30355156105127606, "learning_rate": 2.2037039090021246e-06, "loss": 0.0207, "step": 5586 }, { "epoch": 2.1604795050270686, "grad_norm": 0.22316146474174103, "learning_rate": 2.2018391407078306e-06, "loss": 0.017, "step": 5587 }, { "epoch": 2.1608662026295438, "grad_norm": 0.44157600683347326, "learning_rate": 2.1999749389302514e-06, "loss": 0.0199, "step": 5588 }, { "epoch": 2.1612529002320184, "grad_norm": 0.27136811296966146, "learning_rate": 2.1981113040468122e-06, "loss": 0.0202, "step": 5589 }, { "epoch": 2.1616395978344936, "grad_norm": 0.23242062588700826, "learning_rate": 2.1962482364348247e-06, "loss": 0.0137, "step": 5590 }, { "epoch": 2.1620262954369682, "grad_norm": 0.3040533725581531, "learning_rate": 2.194385736471487e-06, "loss": 0.0243, "step": 5591 }, { "epoch": 2.1624129930394433, "grad_norm": 0.38653814798359193, "learning_rate": 2.192523804533879e-06, "loss": 0.0166, "step": 5592 }, { "epoch": 2.162799690641918, "grad_norm": 0.24604893746756218, "learning_rate": 2.190662440998972e-06, "loss": 0.0177, "step": 5593 }, { "epoch": 2.1631863882443927, "grad_norm": 0.31121122213394387, "learning_rate": 2.188801646243617e-06, "loss": 0.017, "step": 5594 }, { "epoch": 2.163573085846868, "grad_norm": 0.3943141306938805, "learning_rate": 2.186941420644551e-06, "loss": 0.0241, "step": 5595 }, { "epoch": 2.1639597834493425, "grad_norm": 0.22332488279222268, "learning_rate": 2.185081764578394e-06, "loss": 0.0119, "step": 5596 }, { "epoch": 2.1643464810518176, "grad_norm": 0.3037050543414954, "learning_rate": 2.183222678421656e-06, "loss": 0.0233, "step": 5597 }, { "epoch": 2.1647331786542923, "grad_norm": 0.2623923635530505, "learning_rate": 2.1813641625507288e-06, "loss": 0.0188, "step": 5598 }, { "epoch": 2.1651198762567674, "grad_norm": 0.25336853524373604, "learning_rate": 2.1795062173418825e-06, "loss": 0.0171, "step": 5599 }, { "epoch": 2.165506573859242, "grad_norm": 0.2200344157859348, "learning_rate": 2.1776488431712827e-06, "loss": 0.0104, "step": 5600 }, { "epoch": 2.1658932714617167, "grad_norm": 0.1743369629901826, "learning_rate": 2.175792040414971e-06, "loss": 0.0077, "step": 5601 }, { "epoch": 2.166279969064192, "grad_norm": 0.4660965099672376, "learning_rate": 2.1739358094488816e-06, "loss": 0.0403, "step": 5602 }, { "epoch": 2.1666666666666665, "grad_norm": 0.25941008043301067, "learning_rate": 2.172080150648821e-06, "loss": 0.0217, "step": 5603 }, { "epoch": 2.1670533642691416, "grad_norm": 0.25415814303107664, "learning_rate": 2.1702250643904905e-06, "loss": 0.0221, "step": 5604 }, { "epoch": 2.1674400618716163, "grad_norm": 0.3313888161232865, "learning_rate": 2.1683705510494712e-06, "loss": 0.0287, "step": 5605 }, { "epoch": 2.1678267594740914, "grad_norm": 0.36967014220163674, "learning_rate": 2.1665166110012275e-06, "loss": 0.0239, "step": 5606 }, { "epoch": 2.168213457076566, "grad_norm": 0.314502039692919, "learning_rate": 2.16466324462111e-06, "loss": 0.0204, "step": 5607 }, { "epoch": 2.1686001546790408, "grad_norm": 0.4570046596605372, "learning_rate": 2.1628104522843495e-06, "loss": 0.0237, "step": 5608 }, { "epoch": 2.168986852281516, "grad_norm": 0.29210118607798025, "learning_rate": 2.160958234366067e-06, "loss": 0.0205, "step": 5609 }, { "epoch": 2.1693735498839906, "grad_norm": 0.29022518585691487, "learning_rate": 2.15910659124126e-06, "loss": 0.0166, "step": 5610 }, { "epoch": 2.1697602474864657, "grad_norm": 0.2528450580329501, "learning_rate": 2.157255523284815e-06, "loss": 0.0194, "step": 5611 }, { "epoch": 2.1701469450889403, "grad_norm": 0.24301097756699686, "learning_rate": 2.1554050308714986e-06, "loss": 0.0167, "step": 5612 }, { "epoch": 2.1705336426914155, "grad_norm": 0.2588546458779273, "learning_rate": 2.1535551143759607e-06, "loss": 0.0191, "step": 5613 }, { "epoch": 2.17092034029389, "grad_norm": 0.28430896650144316, "learning_rate": 2.1517057741727397e-06, "loss": 0.0239, "step": 5614 }, { "epoch": 2.171307037896365, "grad_norm": 0.28122547977509166, "learning_rate": 2.1498570106362523e-06, "loss": 0.0182, "step": 5615 }, { "epoch": 2.17169373549884, "grad_norm": 0.27420074460356425, "learning_rate": 2.148008824140799e-06, "loss": 0.0218, "step": 5616 }, { "epoch": 2.1720804331013146, "grad_norm": 0.3008921412527602, "learning_rate": 2.1461612150605653e-06, "loss": 0.0217, "step": 5617 }, { "epoch": 2.1724671307037897, "grad_norm": 0.23323385914645334, "learning_rate": 2.144314183769617e-06, "loss": 0.0154, "step": 5618 }, { "epoch": 2.1728538283062644, "grad_norm": 0.25083377547833696, "learning_rate": 2.1424677306419073e-06, "loss": 0.024, "step": 5619 }, { "epoch": 2.1732405259087395, "grad_norm": 0.2745465009711697, "learning_rate": 2.1406218560512688e-06, "loss": 0.0132, "step": 5620 }, { "epoch": 2.173627223511214, "grad_norm": 0.2482283427611297, "learning_rate": 2.1387765603714174e-06, "loss": 0.0197, "step": 5621 }, { "epoch": 2.1740139211136893, "grad_norm": 0.28865613719475425, "learning_rate": 2.136931843975951e-06, "loss": 0.0198, "step": 5622 }, { "epoch": 2.174400618716164, "grad_norm": 0.29274351467594467, "learning_rate": 2.135087707238358e-06, "loss": 0.0182, "step": 5623 }, { "epoch": 2.1747873163186386, "grad_norm": 0.23444152272997698, "learning_rate": 2.1332441505319956e-06, "loss": 0.0166, "step": 5624 }, { "epoch": 2.1751740139211138, "grad_norm": 0.32207808077452604, "learning_rate": 2.131401174230113e-06, "loss": 0.0205, "step": 5625 }, { "epoch": 2.1755607115235884, "grad_norm": 0.2619668597770223, "learning_rate": 2.1295587787058415e-06, "loss": 0.018, "step": 5626 }, { "epoch": 2.1759474091260635, "grad_norm": 0.2429767530982541, "learning_rate": 2.1277169643321924e-06, "loss": 0.012, "step": 5627 }, { "epoch": 2.176334106728538, "grad_norm": 0.3334092911594734, "learning_rate": 2.125875731482061e-06, "loss": 0.0189, "step": 5628 }, { "epoch": 2.1767208043310133, "grad_norm": 0.23455935006030784, "learning_rate": 2.1240350805282204e-06, "loss": 0.0274, "step": 5629 }, { "epoch": 2.177107501933488, "grad_norm": 0.2588458577111247, "learning_rate": 2.122195011843335e-06, "loss": 0.0144, "step": 5630 }, { "epoch": 2.1774941995359627, "grad_norm": 0.3493903760769609, "learning_rate": 2.120355525799942e-06, "loss": 0.0289, "step": 5631 }, { "epoch": 2.177880897138438, "grad_norm": 0.272783423390379, "learning_rate": 2.1185166227704663e-06, "loss": 0.0191, "step": 5632 }, { "epoch": 2.1782675947409125, "grad_norm": 0.2774848006585916, "learning_rate": 2.1166783031272114e-06, "loss": 0.0208, "step": 5633 }, { "epoch": 2.1786542923433876, "grad_norm": 0.23377363134996548, "learning_rate": 2.114840567242363e-06, "loss": 0.0122, "step": 5634 }, { "epoch": 2.1790409899458623, "grad_norm": 0.263169658504804, "learning_rate": 2.113003415487994e-06, "loss": 0.031, "step": 5635 }, { "epoch": 2.1794276875483374, "grad_norm": 0.23497212733189435, "learning_rate": 2.1111668482360526e-06, "loss": 0.0171, "step": 5636 }, { "epoch": 2.179814385150812, "grad_norm": 0.22321560806393856, "learning_rate": 2.1093308658583704e-06, "loss": 0.0139, "step": 5637 }, { "epoch": 2.1802010827532867, "grad_norm": 0.2739517650841435, "learning_rate": 2.1074954687266617e-06, "loss": 0.0241, "step": 5638 }, { "epoch": 2.180587780355762, "grad_norm": 0.2876978332054289, "learning_rate": 2.10566065721252e-06, "loss": 0.0159, "step": 5639 }, { "epoch": 2.1809744779582365, "grad_norm": 0.30372322358028625, "learning_rate": 2.103826431687425e-06, "loss": 0.0281, "step": 5640 }, { "epoch": 2.1813611755607116, "grad_norm": 0.2582398649054049, "learning_rate": 2.101992792522733e-06, "loss": 0.0234, "step": 5641 }, { "epoch": 2.1817478731631863, "grad_norm": 0.2930647706925985, "learning_rate": 2.1001597400896837e-06, "loss": 0.0226, "step": 5642 }, { "epoch": 2.1821345707656614, "grad_norm": 0.2460570259353019, "learning_rate": 2.098327274759395e-06, "loss": 0.0153, "step": 5643 }, { "epoch": 2.182521268368136, "grad_norm": 0.24882700292639057, "learning_rate": 2.0964953969028736e-06, "loss": 0.0205, "step": 5644 }, { "epoch": 2.182907965970611, "grad_norm": 0.22948188081085313, "learning_rate": 2.094664106891001e-06, "loss": 0.0122, "step": 5645 }, { "epoch": 2.183294663573086, "grad_norm": 0.2332362814425822, "learning_rate": 2.0928334050945353e-06, "loss": 0.0165, "step": 5646 }, { "epoch": 2.1836813611755606, "grad_norm": 0.2678063827088697, "learning_rate": 2.0910032918841272e-06, "loss": 0.0204, "step": 5647 }, { "epoch": 2.1840680587780357, "grad_norm": 0.3221361370220454, "learning_rate": 2.089173767630298e-06, "loss": 0.02, "step": 5648 }, { "epoch": 2.1844547563805103, "grad_norm": 0.2482404956094941, "learning_rate": 2.0873448327034595e-06, "loss": 0.0184, "step": 5649 }, { "epoch": 2.1848414539829855, "grad_norm": 0.22584493494607955, "learning_rate": 2.0855164874738916e-06, "loss": 0.02, "step": 5650 }, { "epoch": 2.18522815158546, "grad_norm": 0.2521073384696538, "learning_rate": 2.0836887323117664e-06, "loss": 0.0167, "step": 5651 }, { "epoch": 2.1856148491879352, "grad_norm": 0.28583452611595866, "learning_rate": 2.081861567587131e-06, "loss": 0.0239, "step": 5652 }, { "epoch": 2.18600154679041, "grad_norm": 0.4527987311168051, "learning_rate": 2.080034993669914e-06, "loss": 0.0228, "step": 5653 }, { "epoch": 2.1863882443928846, "grad_norm": 0.2980900737195314, "learning_rate": 2.078209010929923e-06, "loss": 0.0136, "step": 5654 }, { "epoch": 2.1867749419953597, "grad_norm": 0.3054503497894516, "learning_rate": 2.0763836197368464e-06, "loss": 0.0211, "step": 5655 }, { "epoch": 2.1871616395978344, "grad_norm": 0.3081610991108833, "learning_rate": 2.0745588204602567e-06, "loss": 0.0221, "step": 5656 }, { "epoch": 2.1875483372003095, "grad_norm": 0.26653704482451557, "learning_rate": 2.0727346134696024e-06, "loss": 0.0219, "step": 5657 }, { "epoch": 2.187935034802784, "grad_norm": 0.2569698603593562, "learning_rate": 2.0709109991342125e-06, "loss": 0.0152, "step": 5658 }, { "epoch": 2.1883217324052593, "grad_norm": 0.32476276369450136, "learning_rate": 2.0690879778232963e-06, "loss": 0.0158, "step": 5659 }, { "epoch": 2.188708430007734, "grad_norm": 0.3185305166078096, "learning_rate": 2.0672655499059424e-06, "loss": 0.02, "step": 5660 }, { "epoch": 2.1890951276102086, "grad_norm": 0.20989509293227626, "learning_rate": 2.065443715751124e-06, "loss": 0.0182, "step": 5661 }, { "epoch": 2.1894818252126838, "grad_norm": 0.22714067513817146, "learning_rate": 2.0636224757276868e-06, "loss": 0.0134, "step": 5662 }, { "epoch": 2.1898685228151584, "grad_norm": 0.27237175753311377, "learning_rate": 2.061801830204362e-06, "loss": 0.0231, "step": 5663 }, { "epoch": 2.1902552204176335, "grad_norm": 0.29776076435340265, "learning_rate": 2.059981779549757e-06, "loss": 0.019, "step": 5664 }, { "epoch": 2.190641918020108, "grad_norm": 0.27570812590383564, "learning_rate": 2.058162324132358e-06, "loss": 0.0272, "step": 5665 }, { "epoch": 2.1910286156225833, "grad_norm": 0.4055004143509855, "learning_rate": 2.0563434643205383e-06, "loss": 0.0343, "step": 5666 }, { "epoch": 2.191415313225058, "grad_norm": 0.30135969878924024, "learning_rate": 2.0545252004825376e-06, "loss": 0.0204, "step": 5667 }, { "epoch": 2.1918020108275327, "grad_norm": 0.2213976688504251, "learning_rate": 2.052707532986489e-06, "loss": 0.0131, "step": 5668 }, { "epoch": 2.192188708430008, "grad_norm": 0.2571748812595965, "learning_rate": 2.0508904622003924e-06, "loss": 0.0157, "step": 5669 }, { "epoch": 2.1925754060324825, "grad_norm": 0.2849156676532009, "learning_rate": 2.049073988492139e-06, "loss": 0.0233, "step": 5670 }, { "epoch": 2.1929621036349576, "grad_norm": 0.24507781020199168, "learning_rate": 2.0472581122294883e-06, "loss": 0.0135, "step": 5671 }, { "epoch": 2.1933488012374323, "grad_norm": 0.2561583018294419, "learning_rate": 2.045442833780082e-06, "loss": 0.0193, "step": 5672 }, { "epoch": 2.1937354988399074, "grad_norm": 0.28471581779399635, "learning_rate": 2.043628153511446e-06, "loss": 0.0172, "step": 5673 }, { "epoch": 2.194122196442382, "grad_norm": 0.2483367815338595, "learning_rate": 2.0418140717909797e-06, "loss": 0.0232, "step": 5674 }, { "epoch": 2.1945088940448567, "grad_norm": 0.260285824278518, "learning_rate": 2.0400005889859624e-06, "loss": 0.019, "step": 5675 }, { "epoch": 2.194895591647332, "grad_norm": 0.253577252969207, "learning_rate": 2.0381877054635506e-06, "loss": 0.0208, "step": 5676 }, { "epoch": 2.1952822892498065, "grad_norm": 0.2611884301229439, "learning_rate": 2.036375421590786e-06, "loss": 0.0207, "step": 5677 }, { "epoch": 2.1956689868522816, "grad_norm": 0.42533230378426445, "learning_rate": 2.0345637377345813e-06, "loss": 0.0195, "step": 5678 }, { "epoch": 2.1960556844547563, "grad_norm": 0.3011191095891308, "learning_rate": 2.032752654261731e-06, "loss": 0.0206, "step": 5679 }, { "epoch": 2.1964423820572314, "grad_norm": 0.4169715516084138, "learning_rate": 2.0309421715389083e-06, "loss": 0.0353, "step": 5680 }, { "epoch": 2.196829079659706, "grad_norm": 0.23781850816614572, "learning_rate": 2.0291322899326615e-06, "loss": 0.0149, "step": 5681 }, { "epoch": 2.1972157772621808, "grad_norm": 0.23584005965942614, "learning_rate": 2.0273230098094242e-06, "loss": 0.0159, "step": 5682 }, { "epoch": 2.197602474864656, "grad_norm": 0.28593882767854684, "learning_rate": 2.0255143315355026e-06, "loss": 0.0181, "step": 5683 }, { "epoch": 2.1979891724671305, "grad_norm": 0.3757570045067115, "learning_rate": 2.023706255477081e-06, "loss": 0.0261, "step": 5684 }, { "epoch": 2.1983758700696057, "grad_norm": 0.2980700785417389, "learning_rate": 2.021898782000224e-06, "loss": 0.0157, "step": 5685 }, { "epoch": 2.1987625676720803, "grad_norm": 0.24764410600492054, "learning_rate": 2.020091911470872e-06, "loss": 0.0196, "step": 5686 }, { "epoch": 2.1991492652745555, "grad_norm": 0.23334255680619034, "learning_rate": 2.018285644254847e-06, "loss": 0.0139, "step": 5687 }, { "epoch": 2.19953596287703, "grad_norm": 0.2649836934591459, "learning_rate": 2.0164799807178463e-06, "loss": 0.02, "step": 5688 }, { "epoch": 2.199922660479505, "grad_norm": 0.33844560696947446, "learning_rate": 2.014674921225444e-06, "loss": 0.0179, "step": 5689 }, { "epoch": 2.20030935808198, "grad_norm": 0.3159465631451391, "learning_rate": 2.0128704661430936e-06, "loss": 0.0193, "step": 5690 }, { "epoch": 2.2006960556844546, "grad_norm": 0.37368044181674476, "learning_rate": 2.0110666158361242e-06, "loss": 0.0221, "step": 5691 }, { "epoch": 2.2010827532869297, "grad_norm": 0.3927588500281335, "learning_rate": 2.0092633706697497e-06, "loss": 0.0247, "step": 5692 }, { "epoch": 2.2014694508894044, "grad_norm": 0.34164271540073754, "learning_rate": 2.007460731009047e-06, "loss": 0.0168, "step": 5693 }, { "epoch": 2.2018561484918795, "grad_norm": 0.23474120003132656, "learning_rate": 2.005658697218986e-06, "loss": 0.0152, "step": 5694 }, { "epoch": 2.202242846094354, "grad_norm": 0.21977483460005415, "learning_rate": 2.003857269664406e-06, "loss": 0.0166, "step": 5695 }, { "epoch": 2.2026295436968293, "grad_norm": 0.3364944779013163, "learning_rate": 2.0020564487100227e-06, "loss": 0.0222, "step": 5696 }, { "epoch": 2.203016241299304, "grad_norm": 0.24176122448666693, "learning_rate": 2.0002562347204324e-06, "loss": 0.0198, "step": 5697 }, { "epoch": 2.2034029389017786, "grad_norm": 0.2536627900477685, "learning_rate": 1.998456628060105e-06, "loss": 0.0197, "step": 5698 }, { "epoch": 2.2037896365042537, "grad_norm": 0.3110076430565348, "learning_rate": 1.9966576290933932e-06, "loss": 0.0213, "step": 5699 }, { "epoch": 2.2041763341067284, "grad_norm": 0.28929575444932343, "learning_rate": 1.9948592381845205e-06, "loss": 0.0268, "step": 5700 }, { "epoch": 2.2045630317092035, "grad_norm": 0.25843684758382063, "learning_rate": 1.9930614556975914e-06, "loss": 0.0157, "step": 5701 }, { "epoch": 2.204949729311678, "grad_norm": 0.23141874433881343, "learning_rate": 1.9912642819965824e-06, "loss": 0.0131, "step": 5702 }, { "epoch": 2.2053364269141533, "grad_norm": 0.3284163151874794, "learning_rate": 1.9894677174453537e-06, "loss": 0.0227, "step": 5703 }, { "epoch": 2.205723124516628, "grad_norm": 0.24682889523370144, "learning_rate": 1.9876717624076375e-06, "loss": 0.0176, "step": 5704 }, { "epoch": 2.2061098221191027, "grad_norm": 0.2778859603100362, "learning_rate": 1.985876417247043e-06, "loss": 0.0183, "step": 5705 }, { "epoch": 2.206496519721578, "grad_norm": 0.3093455582591721, "learning_rate": 1.984081682327056e-06, "loss": 0.0213, "step": 5706 }, { "epoch": 2.2068832173240525, "grad_norm": 0.2980015471208426, "learning_rate": 1.9822875580110386e-06, "loss": 0.0191, "step": 5707 }, { "epoch": 2.2072699149265276, "grad_norm": 0.24950984753625668, "learning_rate": 1.9804940446622323e-06, "loss": 0.0183, "step": 5708 }, { "epoch": 2.2076566125290022, "grad_norm": 0.3123843530996817, "learning_rate": 1.978701142643751e-06, "loss": 0.0201, "step": 5709 }, { "epoch": 2.2080433101314774, "grad_norm": 0.2510688515301665, "learning_rate": 1.976908852318587e-06, "loss": 0.0168, "step": 5710 }, { "epoch": 2.208430007733952, "grad_norm": 0.3978062732334352, "learning_rate": 1.975117174049607e-06, "loss": 0.0275, "step": 5711 }, { "epoch": 2.2088167053364267, "grad_norm": 0.30610236285168035, "learning_rate": 1.9733261081995543e-06, "loss": 0.0219, "step": 5712 }, { "epoch": 2.209203402938902, "grad_norm": 0.4874576150751252, "learning_rate": 1.9715356551310533e-06, "loss": 0.0202, "step": 5713 }, { "epoch": 2.2095901005413765, "grad_norm": 0.3069277615923745, "learning_rate": 1.9697458152065924e-06, "loss": 0.0182, "step": 5714 }, { "epoch": 2.2099767981438516, "grad_norm": 0.30418588167967703, "learning_rate": 1.96795658878855e-06, "loss": 0.0237, "step": 5715 }, { "epoch": 2.2103634957463263, "grad_norm": 0.24111691905022228, "learning_rate": 1.9661679762391684e-06, "loss": 0.0228, "step": 5716 }, { "epoch": 2.2107501933488014, "grad_norm": 0.24469917778572317, "learning_rate": 1.964379977920577e-06, "loss": 0.0179, "step": 5717 }, { "epoch": 2.211136890951276, "grad_norm": 0.27318089869702417, "learning_rate": 1.962592594194769e-06, "loss": 0.0208, "step": 5718 }, { "epoch": 2.211523588553751, "grad_norm": 0.24526818177162282, "learning_rate": 1.960805825423619e-06, "loss": 0.0169, "step": 5719 }, { "epoch": 2.211910286156226, "grad_norm": 0.46491070444761945, "learning_rate": 1.9590196719688804e-06, "loss": 0.0265, "step": 5720 }, { "epoch": 2.2122969837587005, "grad_norm": 0.29856926957239954, "learning_rate": 1.957234134192176e-06, "loss": 0.022, "step": 5721 }, { "epoch": 2.2126836813611757, "grad_norm": 0.2611076684155604, "learning_rate": 1.955449212455008e-06, "loss": 0.0148, "step": 5722 }, { "epoch": 2.2130703789636503, "grad_norm": 0.23328865803354043, "learning_rate": 1.9536649071187497e-06, "loss": 0.0185, "step": 5723 }, { "epoch": 2.2134570765661254, "grad_norm": 0.3079001877771197, "learning_rate": 1.951881218544655e-06, "loss": 0.0247, "step": 5724 }, { "epoch": 2.2138437741686, "grad_norm": 0.2388776926677693, "learning_rate": 1.9500981470938495e-06, "loss": 0.0211, "step": 5725 }, { "epoch": 2.2142304717710752, "grad_norm": 0.2723808603251801, "learning_rate": 1.9483156931273346e-06, "loss": 0.0282, "step": 5726 }, { "epoch": 2.21461716937355, "grad_norm": 0.3376339277863955, "learning_rate": 1.9465338570059854e-06, "loss": 0.0236, "step": 5727 }, { "epoch": 2.2150038669760246, "grad_norm": 0.29185353029527233, "learning_rate": 1.944752639090553e-06, "loss": 0.0203, "step": 5728 }, { "epoch": 2.2153905645784997, "grad_norm": 0.2994295021218634, "learning_rate": 1.9429720397416656e-06, "loss": 0.0175, "step": 5729 }, { "epoch": 2.2157772621809744, "grad_norm": 0.2825245679941026, "learning_rate": 1.9411920593198246e-06, "loss": 0.0196, "step": 5730 }, { "epoch": 2.2161639597834495, "grad_norm": 0.2530648347876913, "learning_rate": 1.9394126981854e-06, "loss": 0.0173, "step": 5731 }, { "epoch": 2.216550657385924, "grad_norm": 0.3631802845291508, "learning_rate": 1.937633956698648e-06, "loss": 0.0261, "step": 5732 }, { "epoch": 2.2169373549883993, "grad_norm": 0.3273238952890786, "learning_rate": 1.935855835219688e-06, "loss": 0.0271, "step": 5733 }, { "epoch": 2.217324052590874, "grad_norm": 0.2374436059911325, "learning_rate": 1.9340783341085267e-06, "loss": 0.0163, "step": 5734 }, { "epoch": 2.2177107501933486, "grad_norm": 0.35186931145967437, "learning_rate": 1.932301453725029e-06, "loss": 0.0298, "step": 5735 }, { "epoch": 2.2180974477958237, "grad_norm": 0.25314037688859087, "learning_rate": 1.9305251944289486e-06, "loss": 0.0191, "step": 5736 }, { "epoch": 2.2184841453982984, "grad_norm": 0.2625393152997333, "learning_rate": 1.9287495565799063e-06, "loss": 0.0216, "step": 5737 }, { "epoch": 2.2188708430007735, "grad_norm": 0.24069921213727824, "learning_rate": 1.9269745405373974e-06, "loss": 0.0165, "step": 5738 }, { "epoch": 2.219257540603248, "grad_norm": 0.27819743754061615, "learning_rate": 1.925200146660793e-06, "loss": 0.0151, "step": 5739 }, { "epoch": 2.2196442382057233, "grad_norm": 0.24135893914997125, "learning_rate": 1.9234263753093357e-06, "loss": 0.0162, "step": 5740 }, { "epoch": 2.220030935808198, "grad_norm": 0.35380447886026317, "learning_rate": 1.9216532268421474e-06, "loss": 0.0239, "step": 5741 }, { "epoch": 2.2204176334106727, "grad_norm": 0.32302656028163296, "learning_rate": 1.9198807016182187e-06, "loss": 0.0238, "step": 5742 }, { "epoch": 2.220804331013148, "grad_norm": 0.2493070993287722, "learning_rate": 1.9181087999964154e-06, "loss": 0.0169, "step": 5743 }, { "epoch": 2.2211910286156225, "grad_norm": 0.3132221619984221, "learning_rate": 1.916337522335478e-06, "loss": 0.0174, "step": 5744 }, { "epoch": 2.2215777262180976, "grad_norm": 0.23618382625860662, "learning_rate": 1.9145668689940177e-06, "loss": 0.0163, "step": 5745 }, { "epoch": 2.2219644238205722, "grad_norm": 0.266112534680366, "learning_rate": 1.912796840330526e-06, "loss": 0.0193, "step": 5746 }, { "epoch": 2.2223511214230474, "grad_norm": 0.3164351202701275, "learning_rate": 1.9110274367033605e-06, "loss": 0.0319, "step": 5747 }, { "epoch": 2.222737819025522, "grad_norm": 0.3788463593287549, "learning_rate": 1.909258658470756e-06, "loss": 0.0177, "step": 5748 }, { "epoch": 2.2231245166279967, "grad_norm": 0.23950536203150294, "learning_rate": 1.907490505990819e-06, "loss": 0.0156, "step": 5749 }, { "epoch": 2.223511214230472, "grad_norm": 0.2910695998368224, "learning_rate": 1.9057229796215331e-06, "loss": 0.0207, "step": 5750 }, { "epoch": 2.2238979118329465, "grad_norm": 0.28914352981676017, "learning_rate": 1.9039560797207502e-06, "loss": 0.0147, "step": 5751 }, { "epoch": 2.2242846094354216, "grad_norm": 0.2963117411437154, "learning_rate": 1.9021898066461985e-06, "loss": 0.0187, "step": 5752 }, { "epoch": 2.2246713070378963, "grad_norm": 0.26256366016918115, "learning_rate": 1.9004241607554774e-06, "loss": 0.0162, "step": 5753 }, { "epoch": 2.2250580046403714, "grad_norm": 0.5588025232551111, "learning_rate": 1.898659142406059e-06, "loss": 0.0209, "step": 5754 }, { "epoch": 2.225444702242846, "grad_norm": 0.18319186299421533, "learning_rate": 1.8968947519552932e-06, "loss": 0.0165, "step": 5755 }, { "epoch": 2.2258313998453207, "grad_norm": 0.2765461288041542, "learning_rate": 1.8951309897603975e-06, "loss": 0.0184, "step": 5756 }, { "epoch": 2.226218097447796, "grad_norm": 0.24907162392965404, "learning_rate": 1.8933678561784629e-06, "loss": 0.0188, "step": 5757 }, { "epoch": 2.2266047950502705, "grad_norm": 0.2194240082233168, "learning_rate": 1.891605351566455e-06, "loss": 0.0157, "step": 5758 }, { "epoch": 2.2269914926527457, "grad_norm": 0.2254732860136083, "learning_rate": 1.8898434762812095e-06, "loss": 0.0205, "step": 5759 }, { "epoch": 2.2273781902552203, "grad_norm": 0.2113084076449776, "learning_rate": 1.888082230679441e-06, "loss": 0.0169, "step": 5760 }, { "epoch": 2.2277648878576954, "grad_norm": 0.29013787491210463, "learning_rate": 1.8863216151177249e-06, "loss": 0.0222, "step": 5761 }, { "epoch": 2.22815158546017, "grad_norm": 0.24346726485082157, "learning_rate": 1.8845616299525222e-06, "loss": 0.0211, "step": 5762 }, { "epoch": 2.228538283062645, "grad_norm": 0.8362629131719977, "learning_rate": 1.8828022755401577e-06, "loss": 0.0276, "step": 5763 }, { "epoch": 2.22892498066512, "grad_norm": 0.33894520649783294, "learning_rate": 1.881043552236831e-06, "loss": 0.0201, "step": 5764 }, { "epoch": 2.2293116782675946, "grad_norm": 0.24497308661803543, "learning_rate": 1.8792854603986143e-06, "loss": 0.0225, "step": 5765 }, { "epoch": 2.2296983758700697, "grad_norm": 0.28815547669911934, "learning_rate": 1.8775280003814494e-06, "loss": 0.0228, "step": 5766 }, { "epoch": 2.2300850734725444, "grad_norm": 0.4047512044205394, "learning_rate": 1.875771172541156e-06, "loss": 0.0274, "step": 5767 }, { "epoch": 2.2304717710750195, "grad_norm": 0.20006994693739497, "learning_rate": 1.8740149772334205e-06, "loss": 0.0126, "step": 5768 }, { "epoch": 2.230858468677494, "grad_norm": 0.22424557232716225, "learning_rate": 1.8722594148138024e-06, "loss": 0.0117, "step": 5769 }, { "epoch": 2.2312451662799693, "grad_norm": 0.35785262646495775, "learning_rate": 1.870504485637734e-06, "loss": 0.0226, "step": 5770 }, { "epoch": 2.231631863882444, "grad_norm": 0.3972352826811145, "learning_rate": 1.868750190060517e-06, "loss": 0.0221, "step": 5771 }, { "epoch": 2.2320185614849186, "grad_norm": 0.27072607168959745, "learning_rate": 1.8669965284373303e-06, "loss": 0.0206, "step": 5772 }, { "epoch": 2.2324052590873937, "grad_norm": 0.29149133467036237, "learning_rate": 1.865243501123219e-06, "loss": 0.0167, "step": 5773 }, { "epoch": 2.2327919566898684, "grad_norm": 0.2609972830240908, "learning_rate": 1.8634911084731017e-06, "loss": 0.0222, "step": 5774 }, { "epoch": 2.2331786542923435, "grad_norm": 0.2887390676094001, "learning_rate": 1.8617393508417675e-06, "loss": 0.0142, "step": 5775 }, { "epoch": 2.233565351894818, "grad_norm": 0.29780493105841727, "learning_rate": 1.85998822858388e-06, "loss": 0.0229, "step": 5776 }, { "epoch": 2.2339520494972933, "grad_norm": 0.3727485169652481, "learning_rate": 1.8582377420539733e-06, "loss": 0.0283, "step": 5777 }, { "epoch": 2.234338747099768, "grad_norm": 0.2228699538899036, "learning_rate": 1.8564878916064455e-06, "loss": 0.0165, "step": 5778 }, { "epoch": 2.2347254447022427, "grad_norm": 0.2825404669728387, "learning_rate": 1.8547386775955779e-06, "loss": 0.018, "step": 5779 }, { "epoch": 2.2351121423047178, "grad_norm": 0.21829058733457923, "learning_rate": 1.852990100375513e-06, "loss": 0.0172, "step": 5780 }, { "epoch": 2.2354988399071924, "grad_norm": 0.23473238451434866, "learning_rate": 1.851242160300274e-06, "loss": 0.0151, "step": 5781 }, { "epoch": 2.2358855375096676, "grad_norm": 0.2779516236679501, "learning_rate": 1.8494948577237421e-06, "loss": 0.0247, "step": 5782 }, { "epoch": 2.2362722351121422, "grad_norm": 0.2649612131115908, "learning_rate": 1.8477481929996827e-06, "loss": 0.0164, "step": 5783 }, { "epoch": 2.2366589327146174, "grad_norm": 0.24779671457631183, "learning_rate": 1.846002166481724e-06, "loss": 0.0275, "step": 5784 }, { "epoch": 2.237045630317092, "grad_norm": 0.2695430995364103, "learning_rate": 1.8442567785233679e-06, "loss": 0.0183, "step": 5785 }, { "epoch": 2.2374323279195667, "grad_norm": 0.2544884173309681, "learning_rate": 1.8425120294779852e-06, "loss": 0.0211, "step": 5786 }, { "epoch": 2.237819025522042, "grad_norm": 0.2965300563981387, "learning_rate": 1.840767919698817e-06, "loss": 0.0205, "step": 5787 }, { "epoch": 2.2382057231245165, "grad_norm": 0.3324187374931934, "learning_rate": 1.8390244495389808e-06, "loss": 0.0239, "step": 5788 }, { "epoch": 2.2385924207269916, "grad_norm": 0.26766748987478467, "learning_rate": 1.8372816193514576e-06, "loss": 0.0259, "step": 5789 }, { "epoch": 2.2389791183294663, "grad_norm": 0.34469645487189154, "learning_rate": 1.8355394294891016e-06, "loss": 0.0207, "step": 5790 }, { "epoch": 2.2393658159319414, "grad_norm": 0.23886598580270157, "learning_rate": 1.8337978803046374e-06, "loss": 0.0188, "step": 5791 }, { "epoch": 2.239752513534416, "grad_norm": 0.27375855767853374, "learning_rate": 1.8320569721506581e-06, "loss": 0.0157, "step": 5792 }, { "epoch": 2.240139211136891, "grad_norm": 0.30375656790782013, "learning_rate": 1.8303167053796317e-06, "loss": 0.0242, "step": 5793 }, { "epoch": 2.240525908739366, "grad_norm": 0.32163434199271385, "learning_rate": 1.8285770803438918e-06, "loss": 0.0172, "step": 5794 }, { "epoch": 2.2409126063418405, "grad_norm": 0.21275212840633762, "learning_rate": 1.826838097395644e-06, "loss": 0.0133, "step": 5795 }, { "epoch": 2.2412993039443156, "grad_norm": 0.3206093817244247, "learning_rate": 1.8250997568869605e-06, "loss": 0.017, "step": 5796 }, { "epoch": 2.2416860015467903, "grad_norm": 0.34295459046323107, "learning_rate": 1.8233620591697908e-06, "loss": 0.0272, "step": 5797 }, { "epoch": 2.2420726991492654, "grad_norm": 0.22540020253262757, "learning_rate": 1.8216250045959499e-06, "loss": 0.0101, "step": 5798 }, { "epoch": 2.24245939675174, "grad_norm": 0.2507893015680734, "learning_rate": 1.8198885935171167e-06, "loss": 0.0179, "step": 5799 }, { "epoch": 2.2428460943542152, "grad_norm": 0.31838815610465876, "learning_rate": 1.8181528262848519e-06, "loss": 0.0257, "step": 5800 }, { "epoch": 2.24323279195669, "grad_norm": 0.3082729615567283, "learning_rate": 1.8164177032505748e-06, "loss": 0.0223, "step": 5801 }, { "epoch": 2.2436194895591646, "grad_norm": 0.6252439314079191, "learning_rate": 1.814683224765585e-06, "loss": 0.0194, "step": 5802 }, { "epoch": 2.2440061871616397, "grad_norm": 0.40468076972442607, "learning_rate": 1.8129493911810386e-06, "loss": 0.0195, "step": 5803 }, { "epoch": 2.2443928847641144, "grad_norm": 0.23329990461647798, "learning_rate": 1.8112162028479736e-06, "loss": 0.0146, "step": 5804 }, { "epoch": 2.2447795823665895, "grad_norm": 0.3618816420326041, "learning_rate": 1.80948366011729e-06, "loss": 0.0259, "step": 5805 }, { "epoch": 2.245166279969064, "grad_norm": 0.2293322261524675, "learning_rate": 1.807751763339759e-06, "loss": 0.0143, "step": 5806 }, { "epoch": 2.2455529775715393, "grad_norm": 0.21712741822776643, "learning_rate": 1.8060205128660213e-06, "loss": 0.0119, "step": 5807 }, { "epoch": 2.245939675174014, "grad_norm": 0.2541989605965101, "learning_rate": 1.8042899090465842e-06, "loss": 0.0206, "step": 5808 }, { "epoch": 2.2463263727764886, "grad_norm": 0.2859291210721773, "learning_rate": 1.802559952231831e-06, "loss": 0.0222, "step": 5809 }, { "epoch": 2.2467130703789637, "grad_norm": 0.2220313405932998, "learning_rate": 1.8008306427720074e-06, "loss": 0.0197, "step": 5810 }, { "epoch": 2.2470997679814384, "grad_norm": 0.26920289101049716, "learning_rate": 1.7991019810172294e-06, "loss": 0.0171, "step": 5811 }, { "epoch": 2.2474864655839135, "grad_norm": 0.3990921734133395, "learning_rate": 1.7973739673174828e-06, "loss": 0.0216, "step": 5812 }, { "epoch": 2.247873163186388, "grad_norm": 0.2670079451094265, "learning_rate": 1.7956466020226199e-06, "loss": 0.0145, "step": 5813 }, { "epoch": 2.2482598607888633, "grad_norm": 0.38066756221126086, "learning_rate": 1.793919885482368e-06, "loss": 0.0215, "step": 5814 }, { "epoch": 2.248646558391338, "grad_norm": 0.4935997015110904, "learning_rate": 1.7921938180463162e-06, "loss": 0.022, "step": 5815 }, { "epoch": 2.2490332559938127, "grad_norm": 0.29199606644150633, "learning_rate": 1.7904684000639256e-06, "loss": 0.0238, "step": 5816 }, { "epoch": 2.2494199535962878, "grad_norm": 0.2688147874829973, "learning_rate": 1.7887436318845242e-06, "loss": 0.017, "step": 5817 }, { "epoch": 2.2498066511987624, "grad_norm": 0.30596921318313813, "learning_rate": 1.7870195138573076e-06, "loss": 0.0255, "step": 5818 }, { "epoch": 2.2501933488012376, "grad_norm": 0.21258988458955783, "learning_rate": 1.7852960463313457e-06, "loss": 0.0172, "step": 5819 }, { "epoch": 2.2505800464037122, "grad_norm": 0.3437258597023526, "learning_rate": 1.7835732296555698e-06, "loss": 0.0203, "step": 5820 }, { "epoch": 2.2509667440061873, "grad_norm": 0.3634494622708203, "learning_rate": 1.7818510641787817e-06, "loss": 0.0234, "step": 5821 }, { "epoch": 2.251353441608662, "grad_norm": 0.21184456669302687, "learning_rate": 1.7801295502496513e-06, "loss": 0.0107, "step": 5822 }, { "epoch": 2.2517401392111367, "grad_norm": 0.2714741923237112, "learning_rate": 1.778408688216719e-06, "loss": 0.0215, "step": 5823 }, { "epoch": 2.252126836813612, "grad_norm": 0.2659938032248652, "learning_rate": 1.7766884784283922e-06, "loss": 0.0176, "step": 5824 }, { "epoch": 2.2525135344160865, "grad_norm": 0.3208639904601813, "learning_rate": 1.7749689212329395e-06, "loss": 0.0206, "step": 5825 }, { "epoch": 2.2529002320185616, "grad_norm": 0.23646212493339708, "learning_rate": 1.7732500169785083e-06, "loss": 0.0151, "step": 5826 }, { "epoch": 2.2532869296210363, "grad_norm": 0.22735161326395198, "learning_rate": 1.7715317660131054e-06, "loss": 0.0146, "step": 5827 }, { "epoch": 2.2536736272235114, "grad_norm": 0.2720260283321801, "learning_rate": 1.7698141686846137e-06, "loss": 0.014, "step": 5828 }, { "epoch": 2.254060324825986, "grad_norm": 0.28557448001353425, "learning_rate": 1.7680972253407718e-06, "loss": 0.0167, "step": 5829 }, { "epoch": 2.2544470224284607, "grad_norm": 0.24780984598131564, "learning_rate": 1.7663809363291973e-06, "loss": 0.0183, "step": 5830 }, { "epoch": 2.254833720030936, "grad_norm": 0.20600157690618445, "learning_rate": 1.7646653019973697e-06, "loss": 0.0202, "step": 5831 }, { "epoch": 2.2552204176334105, "grad_norm": 0.4061138522955897, "learning_rate": 1.762950322692637e-06, "loss": 0.0282, "step": 5832 }, { "epoch": 2.2556071152358856, "grad_norm": 0.26415622302455893, "learning_rate": 1.7612359987622135e-06, "loss": 0.0163, "step": 5833 }, { "epoch": 2.2559938128383603, "grad_norm": 0.2613047723027424, "learning_rate": 1.759522330553181e-06, "loss": 0.0176, "step": 5834 }, { "epoch": 2.2563805104408354, "grad_norm": 0.24149211448106592, "learning_rate": 1.7578093184124935e-06, "loss": 0.0165, "step": 5835 }, { "epoch": 2.25676720804331, "grad_norm": 0.24937607534979853, "learning_rate": 1.7560969626869651e-06, "loss": 0.0181, "step": 5836 }, { "epoch": 2.2571539056457848, "grad_norm": 0.23213038438313086, "learning_rate": 1.7543852637232805e-06, "loss": 0.0157, "step": 5837 }, { "epoch": 2.25754060324826, "grad_norm": 0.27764979318095995, "learning_rate": 1.752674221867991e-06, "loss": 0.022, "step": 5838 }, { "epoch": 2.2579273008507346, "grad_norm": 0.23807644933742172, "learning_rate": 1.7509638374675131e-06, "loss": 0.015, "step": 5839 }, { "epoch": 2.2583139984532097, "grad_norm": 0.2495450054429785, "learning_rate": 1.7492541108681354e-06, "loss": 0.0164, "step": 5840 }, { "epoch": 2.2587006960556844, "grad_norm": 0.25291726814414495, "learning_rate": 1.7475450424160072e-06, "loss": 0.0222, "step": 5841 }, { "epoch": 2.2590873936581595, "grad_norm": 0.33949875920755657, "learning_rate": 1.7458366324571485e-06, "loss": 0.0249, "step": 5842 }, { "epoch": 2.259474091260634, "grad_norm": 0.2572626107043677, "learning_rate": 1.7441288813374442e-06, "loss": 0.0193, "step": 5843 }, { "epoch": 2.259860788863109, "grad_norm": 0.4329012156436384, "learning_rate": 1.7424217894026435e-06, "loss": 0.0187, "step": 5844 }, { "epoch": 2.260247486465584, "grad_norm": 0.2599580640532226, "learning_rate": 1.7407153569983714e-06, "loss": 0.0175, "step": 5845 }, { "epoch": 2.2606341840680586, "grad_norm": 0.27949011059146545, "learning_rate": 1.7390095844701054e-06, "loss": 0.0142, "step": 5846 }, { "epoch": 2.2610208816705337, "grad_norm": 0.24317909978322588, "learning_rate": 1.737304472163202e-06, "loss": 0.0179, "step": 5847 }, { "epoch": 2.2614075792730084, "grad_norm": 0.27146114296092966, "learning_rate": 1.7356000204228757e-06, "loss": 0.0215, "step": 5848 }, { "epoch": 2.2617942768754835, "grad_norm": 0.24793947265762892, "learning_rate": 1.7338962295942152e-06, "loss": 0.0169, "step": 5849 }, { "epoch": 2.262180974477958, "grad_norm": 0.21571481178937693, "learning_rate": 1.7321931000221654e-06, "loss": 0.0118, "step": 5850 }, { "epoch": 2.262567672080433, "grad_norm": 0.2865116679884154, "learning_rate": 1.730490632051543e-06, "loss": 0.0178, "step": 5851 }, { "epoch": 2.262954369682908, "grad_norm": 0.26213802059125935, "learning_rate": 1.7287888260270335e-06, "loss": 0.0158, "step": 5852 }, { "epoch": 2.2633410672853826, "grad_norm": 0.2340017431158562, "learning_rate": 1.7270876822931843e-06, "loss": 0.0149, "step": 5853 }, { "epoch": 2.2637277648878578, "grad_norm": 0.3184937962824089, "learning_rate": 1.725387201194408e-06, "loss": 0.0181, "step": 5854 }, { "epoch": 2.2641144624903324, "grad_norm": 0.2778048357730845, "learning_rate": 1.723687383074984e-06, "loss": 0.0148, "step": 5855 }, { "epoch": 2.2645011600928076, "grad_norm": 0.24624970038357305, "learning_rate": 1.7219882282790618e-06, "loss": 0.018, "step": 5856 }, { "epoch": 2.2648878576952822, "grad_norm": 0.3074791310540488, "learning_rate": 1.7202897371506505e-06, "loss": 0.0224, "step": 5857 }, { "epoch": 2.2652745552977573, "grad_norm": 0.2630713101877071, "learning_rate": 1.7185919100336275e-06, "loss": 0.0158, "step": 5858 }, { "epoch": 2.265661252900232, "grad_norm": 0.2697932343380198, "learning_rate": 1.716894747271735e-06, "loss": 0.0259, "step": 5859 }, { "epoch": 2.266047950502707, "grad_norm": 0.2727722338873404, "learning_rate": 1.7151982492085812e-06, "loss": 0.0218, "step": 5860 }, { "epoch": 2.266434648105182, "grad_norm": 0.2696895588967917, "learning_rate": 1.7135024161876412e-06, "loss": 0.0183, "step": 5861 }, { "epoch": 2.2668213457076565, "grad_norm": 0.23261243313071608, "learning_rate": 1.7118072485522535e-06, "loss": 0.0203, "step": 5862 }, { "epoch": 2.2672080433101316, "grad_norm": 0.2915669810927872, "learning_rate": 1.7101127466456219e-06, "loss": 0.0249, "step": 5863 }, { "epoch": 2.2675947409126063, "grad_norm": 0.2588435212343667, "learning_rate": 1.7084189108108162e-06, "loss": 0.0244, "step": 5864 }, { "epoch": 2.2679814385150814, "grad_norm": 0.2692992715798755, "learning_rate": 1.706725741390769e-06, "loss": 0.0173, "step": 5865 }, { "epoch": 2.268368136117556, "grad_norm": 0.29101660996929707, "learning_rate": 1.7050332387282853e-06, "loss": 0.0166, "step": 5866 }, { "epoch": 2.268754833720031, "grad_norm": 0.2658764851174439, "learning_rate": 1.703341403166023e-06, "loss": 0.0216, "step": 5867 }, { "epoch": 2.269141531322506, "grad_norm": 0.31324130156859026, "learning_rate": 1.7016502350465165e-06, "loss": 0.0172, "step": 5868 }, { "epoch": 2.2695282289249805, "grad_norm": 0.26254603678800575, "learning_rate": 1.6999597347121576e-06, "loss": 0.0182, "step": 5869 }, { "epoch": 2.2699149265274556, "grad_norm": 0.27211955563548607, "learning_rate": 1.6982699025052107e-06, "loss": 0.0208, "step": 5870 }, { "epoch": 2.2703016241299303, "grad_norm": 0.42027852585797265, "learning_rate": 1.6965807387677947e-06, "loss": 0.0172, "step": 5871 }, { "epoch": 2.2706883217324054, "grad_norm": 0.30560503724587734, "learning_rate": 1.6948922438418974e-06, "loss": 0.0149, "step": 5872 }, { "epoch": 2.27107501933488, "grad_norm": 0.4152809071037006, "learning_rate": 1.693204418069377e-06, "loss": 0.0238, "step": 5873 }, { "epoch": 2.271461716937355, "grad_norm": 0.291654615269304, "learning_rate": 1.691517261791949e-06, "loss": 0.0179, "step": 5874 }, { "epoch": 2.27184841453983, "grad_norm": 0.3391829671911838, "learning_rate": 1.689830775351195e-06, "loss": 0.0237, "step": 5875 }, { "epoch": 2.2722351121423046, "grad_norm": 0.29898259218476664, "learning_rate": 1.6881449590885612e-06, "loss": 0.024, "step": 5876 }, { "epoch": 2.2726218097447797, "grad_norm": 0.22516458079486254, "learning_rate": 1.6864598133453613e-06, "loss": 0.0116, "step": 5877 }, { "epoch": 2.2730085073472543, "grad_norm": 0.5423684549502713, "learning_rate": 1.684775338462769e-06, "loss": 0.0208, "step": 5878 }, { "epoch": 2.2733952049497295, "grad_norm": 0.26082650197212776, "learning_rate": 1.6830915347818233e-06, "loss": 0.0236, "step": 5879 }, { "epoch": 2.273781902552204, "grad_norm": 0.20561355115376673, "learning_rate": 1.6814084026434286e-06, "loss": 0.0119, "step": 5880 }, { "epoch": 2.2741686001546793, "grad_norm": 0.19987048870040378, "learning_rate": 1.6797259423883505e-06, "loss": 0.013, "step": 5881 }, { "epoch": 2.274555297757154, "grad_norm": 0.618136676139077, "learning_rate": 1.6780441543572234e-06, "loss": 0.0221, "step": 5882 }, { "epoch": 2.2749419953596286, "grad_norm": 0.24446789056300214, "learning_rate": 1.6763630388905416e-06, "loss": 0.018, "step": 5883 }, { "epoch": 2.2753286929621037, "grad_norm": 0.26214652099829144, "learning_rate": 1.6746825963286638e-06, "loss": 0.018, "step": 5884 }, { "epoch": 2.2757153905645784, "grad_norm": 0.6476321390078196, "learning_rate": 1.6730028270118143e-06, "loss": 0.0263, "step": 5885 }, { "epoch": 2.2761020881670535, "grad_norm": 0.3061607946669517, "learning_rate": 1.6713237312800767e-06, "loss": 0.0202, "step": 5886 }, { "epoch": 2.276488785769528, "grad_norm": 0.2878703603981016, "learning_rate": 1.669645309473405e-06, "loss": 0.0209, "step": 5887 }, { "epoch": 2.2768754833720033, "grad_norm": 0.2373044272048141, "learning_rate": 1.667967561931612e-06, "loss": 0.0158, "step": 5888 }, { "epoch": 2.277262180974478, "grad_norm": 0.30796715912959444, "learning_rate": 1.6662904889943748e-06, "loss": 0.0217, "step": 5889 }, { "epoch": 2.2776488785769526, "grad_norm": 0.30700576861430684, "learning_rate": 1.6646140910012349e-06, "loss": 0.0321, "step": 5890 }, { "epoch": 2.2780355761794278, "grad_norm": 0.32027068074790643, "learning_rate": 1.6629383682915935e-06, "loss": 0.0191, "step": 5891 }, { "epoch": 2.2784222737819024, "grad_norm": 0.29815143521992715, "learning_rate": 1.6612633212047242e-06, "loss": 0.0251, "step": 5892 }, { "epoch": 2.2788089713843775, "grad_norm": 0.31294714310030747, "learning_rate": 1.6595889500797502e-06, "loss": 0.0226, "step": 5893 }, { "epoch": 2.279195668986852, "grad_norm": 0.35754016097820884, "learning_rate": 1.6579152552556703e-06, "loss": 0.0214, "step": 5894 }, { "epoch": 2.2795823665893273, "grad_norm": 0.36071554960365676, "learning_rate": 1.6562422370713394e-06, "loss": 0.0315, "step": 5895 }, { "epoch": 2.279969064191802, "grad_norm": 0.24459558410746002, "learning_rate": 1.6545698958654804e-06, "loss": 0.0152, "step": 5896 }, { "epoch": 2.2803557617942767, "grad_norm": 0.22862704761359892, "learning_rate": 1.652898231976673e-06, "loss": 0.0148, "step": 5897 }, { "epoch": 2.280742459396752, "grad_norm": 0.32403793211982596, "learning_rate": 1.6512272457433614e-06, "loss": 0.0255, "step": 5898 }, { "epoch": 2.2811291569992265, "grad_norm": 0.34090480292117853, "learning_rate": 1.6495569375038584e-06, "loss": 0.0183, "step": 5899 }, { "epoch": 2.2815158546017016, "grad_norm": 0.29186064255076094, "learning_rate": 1.6478873075963336e-06, "loss": 0.0202, "step": 5900 }, { "epoch": 2.2819025522041763, "grad_norm": 0.21886419860632372, "learning_rate": 1.6462183563588202e-06, "loss": 0.0188, "step": 5901 }, { "epoch": 2.2822892498066514, "grad_norm": 0.21744437772385294, "learning_rate": 1.6445500841292133e-06, "loss": 0.0133, "step": 5902 }, { "epoch": 2.282675947409126, "grad_norm": 0.30934191937426114, "learning_rate": 1.6428824912452756e-06, "loss": 0.0196, "step": 5903 }, { "epoch": 2.2830626450116007, "grad_norm": 0.2679242019853718, "learning_rate": 1.641215578044627e-06, "loss": 0.0166, "step": 5904 }, { "epoch": 2.283449342614076, "grad_norm": 0.3201568155354032, "learning_rate": 1.6395493448647503e-06, "loss": 0.0198, "step": 5905 }, { "epoch": 2.2838360402165505, "grad_norm": 0.29284427403936814, "learning_rate": 1.6378837920429936e-06, "loss": 0.0169, "step": 5906 }, { "epoch": 2.2842227378190256, "grad_norm": 0.2239742106257109, "learning_rate": 1.6362189199165619e-06, "loss": 0.0128, "step": 5907 }, { "epoch": 2.2846094354215003, "grad_norm": 0.35131791270111434, "learning_rate": 1.63455472882253e-06, "loss": 0.0226, "step": 5908 }, { "epoch": 2.2849961330239754, "grad_norm": 0.2308067254891865, "learning_rate": 1.6328912190978286e-06, "loss": 0.0141, "step": 5909 }, { "epoch": 2.28538283062645, "grad_norm": 0.2028250404468029, "learning_rate": 1.6312283910792532e-06, "loss": 0.0142, "step": 5910 }, { "epoch": 2.2857695282289248, "grad_norm": 0.43870349027359273, "learning_rate": 1.6295662451034594e-06, "loss": 0.0289, "step": 5911 }, { "epoch": 2.2861562258314, "grad_norm": 0.3305504138269114, "learning_rate": 1.627904781506966e-06, "loss": 0.0213, "step": 5912 }, { "epoch": 2.2865429234338746, "grad_norm": 0.26640247455871213, "learning_rate": 1.626244000626157e-06, "loss": 0.0198, "step": 5913 }, { "epoch": 2.2869296210363497, "grad_norm": 0.2700383568445777, "learning_rate": 1.6245839027972688e-06, "loss": 0.0222, "step": 5914 }, { "epoch": 2.2873163186388243, "grad_norm": 0.23520878822780206, "learning_rate": 1.62292448835641e-06, "loss": 0.017, "step": 5915 }, { "epoch": 2.2877030162412995, "grad_norm": 0.23971239011764983, "learning_rate": 1.6212657576395436e-06, "loss": 0.0189, "step": 5916 }, { "epoch": 2.288089713843774, "grad_norm": 0.27727348340836694, "learning_rate": 1.619607710982501e-06, "loss": 0.015, "step": 5917 }, { "epoch": 2.288476411446249, "grad_norm": 0.2684866744637338, "learning_rate": 1.6179503487209668e-06, "loss": 0.0136, "step": 5918 }, { "epoch": 2.288863109048724, "grad_norm": 0.29584125013199525, "learning_rate": 1.6162936711904908e-06, "loss": 0.019, "step": 5919 }, { "epoch": 2.2892498066511986, "grad_norm": 0.22933464156906205, "learning_rate": 1.6146376787264877e-06, "loss": 0.0158, "step": 5920 }, { "epoch": 2.2896365042536737, "grad_norm": 0.3174768448640342, "learning_rate": 1.612982371664229e-06, "loss": 0.0197, "step": 5921 }, { "epoch": 2.2900232018561484, "grad_norm": 0.17906127095992674, "learning_rate": 1.6113277503388485e-06, "loss": 0.0154, "step": 5922 }, { "epoch": 2.2904098994586235, "grad_norm": 0.2740423284587045, "learning_rate": 1.609673815085342e-06, "loss": 0.0212, "step": 5923 }, { "epoch": 2.290796597061098, "grad_norm": 0.25317632791796013, "learning_rate": 1.608020566238564e-06, "loss": 0.0169, "step": 5924 }, { "epoch": 2.291183294663573, "grad_norm": 0.3653065074521622, "learning_rate": 1.6063680041332346e-06, "loss": 0.0279, "step": 5925 }, { "epoch": 2.291569992266048, "grad_norm": 0.2485955059880498, "learning_rate": 1.6047161291039314e-06, "loss": 0.0203, "step": 5926 }, { "epoch": 2.2919566898685226, "grad_norm": 0.30192140245146637, "learning_rate": 1.603064941485093e-06, "loss": 0.0174, "step": 5927 }, { "epoch": 2.2923433874709978, "grad_norm": 0.3386728191151528, "learning_rate": 1.6014144416110178e-06, "loss": 0.017, "step": 5928 }, { "epoch": 2.2927300850734724, "grad_norm": 0.32352074314410617, "learning_rate": 1.5997646298158704e-06, "loss": 0.024, "step": 5929 }, { "epoch": 2.2931167826759475, "grad_norm": 0.22500075238084857, "learning_rate": 1.5981155064336723e-06, "loss": 0.016, "step": 5930 }, { "epoch": 2.293503480278422, "grad_norm": 0.25233134392381396, "learning_rate": 1.5964670717983006e-06, "loss": 0.0122, "step": 5931 }, { "epoch": 2.2938901778808973, "grad_norm": 0.33773144634161134, "learning_rate": 1.5948193262435025e-06, "loss": 0.0252, "step": 5932 }, { "epoch": 2.294276875483372, "grad_norm": 0.32963429439358904, "learning_rate": 1.593172270102878e-06, "loss": 0.0217, "step": 5933 }, { "epoch": 2.2946635730858467, "grad_norm": 0.3455482352392099, "learning_rate": 1.5915259037098962e-06, "loss": 0.0205, "step": 5934 }, { "epoch": 2.295050270688322, "grad_norm": 0.3442776662650289, "learning_rate": 1.5898802273978742e-06, "loss": 0.0313, "step": 5935 }, { "epoch": 2.2954369682907965, "grad_norm": 0.23186223784174498, "learning_rate": 1.5882352415000014e-06, "loss": 0.02, "step": 5936 }, { "epoch": 2.2958236658932716, "grad_norm": 0.2709968197711466, "learning_rate": 1.5865909463493202e-06, "loss": 0.0166, "step": 5937 }, { "epoch": 2.2962103634957463, "grad_norm": 0.4748131742437371, "learning_rate": 1.5849473422787353e-06, "loss": 0.0238, "step": 5938 }, { "epoch": 2.2965970610982214, "grad_norm": 0.2801236930188171, "learning_rate": 1.5833044296210115e-06, "loss": 0.0217, "step": 5939 }, { "epoch": 2.296983758700696, "grad_norm": 0.2325393697330556, "learning_rate": 1.5816622087087723e-06, "loss": 0.0158, "step": 5940 }, { "epoch": 2.297370456303171, "grad_norm": 0.24358188113106607, "learning_rate": 1.5800206798745048e-06, "loss": 0.0224, "step": 5941 }, { "epoch": 2.297757153905646, "grad_norm": 0.18780285600662372, "learning_rate": 1.5783798434505525e-06, "loss": 0.0133, "step": 5942 }, { "epoch": 2.2981438515081205, "grad_norm": 0.2638560887531838, "learning_rate": 1.5767396997691199e-06, "loss": 0.0206, "step": 5943 }, { "epoch": 2.2985305491105956, "grad_norm": 0.24452736971398245, "learning_rate": 1.5751002491622707e-06, "loss": 0.015, "step": 5944 }, { "epoch": 2.2989172467130703, "grad_norm": 0.312698090185964, "learning_rate": 1.5734614919619273e-06, "loss": 0.017, "step": 5945 }, { "epoch": 2.2993039443155454, "grad_norm": 0.41892493521180224, "learning_rate": 1.5718234284998763e-06, "loss": 0.0316, "step": 5946 }, { "epoch": 2.29969064191802, "grad_norm": 0.22239763063540696, "learning_rate": 1.5701860591077584e-06, "loss": 0.016, "step": 5947 }, { "epoch": 2.300077339520495, "grad_norm": 0.2220304478061116, "learning_rate": 1.5685493841170774e-06, "loss": 0.0181, "step": 5948 }, { "epoch": 2.30046403712297, "grad_norm": 0.22342933051810435, "learning_rate": 1.5669134038591921e-06, "loss": 0.0192, "step": 5949 }, { "epoch": 2.3008507347254445, "grad_norm": 0.2569591433786363, "learning_rate": 1.5652781186653282e-06, "loss": 0.0148, "step": 5950 }, { "epoch": 2.3012374323279197, "grad_norm": 0.27733526145757603, "learning_rate": 1.5636435288665635e-06, "loss": 0.0205, "step": 5951 }, { "epoch": 2.3016241299303943, "grad_norm": 0.31649722790219176, "learning_rate": 1.5620096347938385e-06, "loss": 0.0443, "step": 5952 }, { "epoch": 2.3020108275328695, "grad_norm": 0.8947305170986087, "learning_rate": 1.560376436777951e-06, "loss": 0.0241, "step": 5953 }, { "epoch": 2.302397525135344, "grad_norm": 0.3819683583797005, "learning_rate": 1.5587439351495587e-06, "loss": 0.0178, "step": 5954 }, { "epoch": 2.3027842227378192, "grad_norm": 0.43774953956027635, "learning_rate": 1.5571121302391806e-06, "loss": 0.0195, "step": 5955 }, { "epoch": 2.303170920340294, "grad_norm": 0.24916810455629226, "learning_rate": 1.5554810223771926e-06, "loss": 0.0178, "step": 5956 }, { "epoch": 2.3035576179427686, "grad_norm": 0.3738207832332674, "learning_rate": 1.5538506118938251e-06, "loss": 0.0296, "step": 5957 }, { "epoch": 2.3039443155452437, "grad_norm": 0.2998407334955723, "learning_rate": 1.552220899119176e-06, "loss": 0.027, "step": 5958 }, { "epoch": 2.3043310131477184, "grad_norm": 0.3404761944349263, "learning_rate": 1.5505918843831946e-06, "loss": 0.0232, "step": 5959 }, { "epoch": 2.3047177107501935, "grad_norm": 0.2761015257507722, "learning_rate": 1.5489635680156967e-06, "loss": 0.0183, "step": 5960 }, { "epoch": 2.305104408352668, "grad_norm": 0.26038972073681105, "learning_rate": 1.5473359503463454e-06, "loss": 0.017, "step": 5961 }, { "epoch": 2.3054911059551433, "grad_norm": 0.3038060961643893, "learning_rate": 1.5457090317046735e-06, "loss": 0.0205, "step": 5962 }, { "epoch": 2.305877803557618, "grad_norm": 0.1988178870673261, "learning_rate": 1.5440828124200657e-06, "loss": 0.0148, "step": 5963 }, { "epoch": 2.3062645011600926, "grad_norm": 0.27030768605729083, "learning_rate": 1.542457292821768e-06, "loss": 0.0188, "step": 5964 }, { "epoch": 2.3066511987625677, "grad_norm": 0.31856238335865456, "learning_rate": 1.5408324732388824e-06, "loss": 0.0247, "step": 5965 }, { "epoch": 2.3070378963650424, "grad_norm": 0.33779863640221536, "learning_rate": 1.53920835400037e-06, "loss": 0.0171, "step": 5966 }, { "epoch": 2.3074245939675175, "grad_norm": 0.27128577886647415, "learning_rate": 1.537584935435053e-06, "loss": 0.0233, "step": 5967 }, { "epoch": 2.307811291569992, "grad_norm": 0.3192717431386833, "learning_rate": 1.5359622178716082e-06, "loss": 0.0225, "step": 5968 }, { "epoch": 2.3081979891724673, "grad_norm": 0.33785378381243375, "learning_rate": 1.534340201638571e-06, "loss": 0.0228, "step": 5969 }, { "epoch": 2.308584686774942, "grad_norm": 0.32228985886406714, "learning_rate": 1.5327188870643368e-06, "loss": 0.0191, "step": 5970 }, { "epoch": 2.3089713843774167, "grad_norm": 0.2746541663039907, "learning_rate": 1.5310982744771541e-06, "loss": 0.0154, "step": 5971 }, { "epoch": 2.309358081979892, "grad_norm": 0.3444508621222745, "learning_rate": 1.5294783642051374e-06, "loss": 0.0272, "step": 5972 }, { "epoch": 2.3097447795823665, "grad_norm": 0.2500169138051058, "learning_rate": 1.5278591565762518e-06, "loss": 0.0168, "step": 5973 }, { "epoch": 2.3101314771848416, "grad_norm": 0.24659932773974028, "learning_rate": 1.5262406519183238e-06, "loss": 0.0184, "step": 5974 }, { "epoch": 2.3105181747873162, "grad_norm": 0.27008456746642273, "learning_rate": 1.5246228505590338e-06, "loss": 0.0239, "step": 5975 }, { "epoch": 2.3109048723897914, "grad_norm": 0.25861134539416797, "learning_rate": 1.5230057528259261e-06, "loss": 0.0123, "step": 5976 }, { "epoch": 2.311291569992266, "grad_norm": 0.3164223199645151, "learning_rate": 1.521389359046399e-06, "loss": 0.0307, "step": 5977 }, { "epoch": 2.3116782675947407, "grad_norm": 0.22607280780313568, "learning_rate": 1.5197736695477033e-06, "loss": 0.0124, "step": 5978 }, { "epoch": 2.312064965197216, "grad_norm": 0.32435947336846693, "learning_rate": 1.5181586846569568e-06, "loss": 0.017, "step": 5979 }, { "epoch": 2.3124516627996905, "grad_norm": 0.2514086163916773, "learning_rate": 1.5165444047011268e-06, "loss": 0.0163, "step": 5980 }, { "epoch": 2.3128383604021656, "grad_norm": 0.28431506887807273, "learning_rate": 1.5149308300070464e-06, "loss": 0.0181, "step": 5981 }, { "epoch": 2.3132250580046403, "grad_norm": 0.26644003158939183, "learning_rate": 1.513317960901393e-06, "loss": 0.0256, "step": 5982 }, { "epoch": 2.3136117556071154, "grad_norm": 0.22141196381084108, "learning_rate": 1.5117057977107147e-06, "loss": 0.0141, "step": 5983 }, { "epoch": 2.31399845320959, "grad_norm": 0.21660249745867804, "learning_rate": 1.5100943407614083e-06, "loss": 0.0164, "step": 5984 }, { "epoch": 2.3143851508120648, "grad_norm": 0.4805758291212928, "learning_rate": 1.5084835903797302e-06, "loss": 0.0157, "step": 5985 }, { "epoch": 2.31477184841454, "grad_norm": 0.34310791976449506, "learning_rate": 1.5068735468917932e-06, "loss": 0.0228, "step": 5986 }, { "epoch": 2.3151585460170145, "grad_norm": 0.25088551995825287, "learning_rate": 1.5052642106235666e-06, "loss": 0.0141, "step": 5987 }, { "epoch": 2.3155452436194897, "grad_norm": 0.2310527542910046, "learning_rate": 1.5036555819008797e-06, "loss": 0.0137, "step": 5988 }, { "epoch": 2.3159319412219643, "grad_norm": 0.2612660235404181, "learning_rate": 1.502047661049415e-06, "loss": 0.0185, "step": 5989 }, { "epoch": 2.3163186388244394, "grad_norm": 0.2852633792574955, "learning_rate": 1.5004404483947117e-06, "loss": 0.0142, "step": 5990 }, { "epoch": 2.316705336426914, "grad_norm": 0.24804348568759182, "learning_rate": 1.498833944262168e-06, "loss": 0.0178, "step": 5991 }, { "epoch": 2.317092034029389, "grad_norm": 0.2889748120447778, "learning_rate": 1.4972281489770352e-06, "loss": 0.0236, "step": 5992 }, { "epoch": 2.317478731631864, "grad_norm": 0.23961299367992925, "learning_rate": 1.4956230628644259e-06, "loss": 0.0102, "step": 5993 }, { "epoch": 2.3178654292343386, "grad_norm": 0.3870363959825784, "learning_rate": 1.4940186862493055e-06, "loss": 0.0215, "step": 5994 }, { "epoch": 2.3182521268368137, "grad_norm": 0.33543721991234354, "learning_rate": 1.4924150194564968e-06, "loss": 0.0192, "step": 5995 }, { "epoch": 2.3186388244392884, "grad_norm": 0.24417805818841049, "learning_rate": 1.4908120628106786e-06, "loss": 0.0247, "step": 5996 }, { "epoch": 2.3190255220417635, "grad_norm": 0.21548132478081625, "learning_rate": 1.4892098166363844e-06, "loss": 0.013, "step": 5997 }, { "epoch": 2.319412219644238, "grad_norm": 0.29446524125231793, "learning_rate": 1.48760828125801e-06, "loss": 0.0149, "step": 5998 }, { "epoch": 2.319798917246713, "grad_norm": 0.2463448032161159, "learning_rate": 1.4860074569997972e-06, "loss": 0.0206, "step": 5999 }, { "epoch": 2.320185614849188, "grad_norm": 0.3800821174178936, "learning_rate": 1.4844073441858542e-06, "loss": 0.0317, "step": 6000 }, { "epoch": 2.3205723124516626, "grad_norm": 0.2924886413488623, "learning_rate": 1.4828079431401364e-06, "loss": 0.0132, "step": 6001 }, { "epoch": 2.3209590100541377, "grad_norm": 0.2786365473151962, "learning_rate": 1.4812092541864648e-06, "loss": 0.0204, "step": 6002 }, { "epoch": 2.3213457076566124, "grad_norm": 0.28672210653323865, "learning_rate": 1.4796112776485061e-06, "loss": 0.0202, "step": 6003 }, { "epoch": 2.3217324052590875, "grad_norm": 0.2451777362451255, "learning_rate": 1.4780140138497862e-06, "loss": 0.0158, "step": 6004 }, { "epoch": 2.322119102861562, "grad_norm": 0.2793520942732273, "learning_rate": 1.4764174631136912e-06, "loss": 0.0202, "step": 6005 }, { "epoch": 2.322505800464037, "grad_norm": 0.3003065936553508, "learning_rate": 1.474821625763458e-06, "loss": 0.0179, "step": 6006 }, { "epoch": 2.322892498066512, "grad_norm": 0.261413170607602, "learning_rate": 1.4732265021221808e-06, "loss": 0.0164, "step": 6007 }, { "epoch": 2.3232791956689867, "grad_norm": 0.2855210656051005, "learning_rate": 1.4716320925128068e-06, "loss": 0.0163, "step": 6008 }, { "epoch": 2.323665893271462, "grad_norm": 0.29808130911657515, "learning_rate": 1.4700383972581438e-06, "loss": 0.0199, "step": 6009 }, { "epoch": 2.3240525908739365, "grad_norm": 0.27010289571650664, "learning_rate": 1.468445416680851e-06, "loss": 0.0234, "step": 6010 }, { "epoch": 2.3244392884764116, "grad_norm": 0.286691102811317, "learning_rate": 1.4668531511034433e-06, "loss": 0.0211, "step": 6011 }, { "epoch": 2.3248259860788862, "grad_norm": 0.3838714283641476, "learning_rate": 1.4652616008482923e-06, "loss": 0.0228, "step": 6012 }, { "epoch": 2.3252126836813614, "grad_norm": 0.2514579258930699, "learning_rate": 1.463670766237621e-06, "loss": 0.0187, "step": 6013 }, { "epoch": 2.325599381283836, "grad_norm": 0.3151495077670835, "learning_rate": 1.462080647593514e-06, "loss": 0.022, "step": 6014 }, { "epoch": 2.325986078886311, "grad_norm": 0.3453166862342851, "learning_rate": 1.4604912452379067e-06, "loss": 0.0147, "step": 6015 }, { "epoch": 2.326372776488786, "grad_norm": 0.24077357980577485, "learning_rate": 1.4589025594925888e-06, "loss": 0.0157, "step": 6016 }, { "epoch": 2.3267594740912605, "grad_norm": 0.28160687150606256, "learning_rate": 1.4573145906792063e-06, "loss": 0.0157, "step": 6017 }, { "epoch": 2.3271461716937356, "grad_norm": 0.22906309856283372, "learning_rate": 1.4557273391192595e-06, "loss": 0.0153, "step": 6018 }, { "epoch": 2.3275328692962103, "grad_norm": 0.34113216623600734, "learning_rate": 1.4541408051341062e-06, "loss": 0.0228, "step": 6019 }, { "epoch": 2.3279195668986854, "grad_norm": 0.2602905576209649, "learning_rate": 1.4525549890449554e-06, "loss": 0.0199, "step": 6020 }, { "epoch": 2.32830626450116, "grad_norm": 0.2624269994021272, "learning_rate": 1.4509698911728715e-06, "loss": 0.0195, "step": 6021 }, { "epoch": 2.328692962103635, "grad_norm": 0.2926490765995933, "learning_rate": 1.4493855118387734e-06, "loss": 0.0216, "step": 6022 }, { "epoch": 2.32907965970611, "grad_norm": 0.3167947852074012, "learning_rate": 1.4478018513634379e-06, "loss": 0.0183, "step": 6023 }, { "epoch": 2.3294663573085845, "grad_norm": 0.20614761720297026, "learning_rate": 1.4462189100674929e-06, "loss": 0.0169, "step": 6024 }, { "epoch": 2.3298530549110597, "grad_norm": 0.2283004569084134, "learning_rate": 1.4446366882714175e-06, "loss": 0.0147, "step": 6025 }, { "epoch": 2.3302397525135343, "grad_norm": 0.20597319800274805, "learning_rate": 1.4430551862955534e-06, "loss": 0.0129, "step": 6026 }, { "epoch": 2.3306264501160094, "grad_norm": 0.2670898027894835, "learning_rate": 1.4414744044600882e-06, "loss": 0.0198, "step": 6027 }, { "epoch": 2.331013147718484, "grad_norm": 0.28626577391121444, "learning_rate": 1.4398943430850731e-06, "loss": 0.0154, "step": 6028 }, { "epoch": 2.3313998453209592, "grad_norm": 0.29610476334055813, "learning_rate": 1.4383150024904014e-06, "loss": 0.0255, "step": 6029 }, { "epoch": 2.331786542923434, "grad_norm": 0.31541304893975286, "learning_rate": 1.436736382995832e-06, "loss": 0.0275, "step": 6030 }, { "epoch": 2.3321732405259086, "grad_norm": 0.28879318239206836, "learning_rate": 1.4351584849209705e-06, "loss": 0.0207, "step": 6031 }, { "epoch": 2.3325599381283837, "grad_norm": 0.23439591603260626, "learning_rate": 1.4335813085852795e-06, "loss": 0.0156, "step": 6032 }, { "epoch": 2.3329466357308584, "grad_norm": 0.22642483307848452, "learning_rate": 1.4320048543080744e-06, "loss": 0.017, "step": 6033 }, { "epoch": 2.3333333333333335, "grad_norm": 0.33936568674854956, "learning_rate": 1.4304291224085232e-06, "loss": 0.0233, "step": 6034 }, { "epoch": 2.333720030935808, "grad_norm": 0.2690817425252845, "learning_rate": 1.428854113205652e-06, "loss": 0.0154, "step": 6035 }, { "epoch": 2.3341067285382833, "grad_norm": 0.22837697515168512, "learning_rate": 1.4272798270183368e-06, "loss": 0.0199, "step": 6036 }, { "epoch": 2.334493426140758, "grad_norm": 0.24615275824867414, "learning_rate": 1.425706264165308e-06, "loss": 0.015, "step": 6037 }, { "epoch": 2.3348801237432326, "grad_norm": 0.22503767331166794, "learning_rate": 1.4241334249651495e-06, "loss": 0.0138, "step": 6038 }, { "epoch": 2.3352668213457077, "grad_norm": 0.2482380766886516, "learning_rate": 1.4225613097362979e-06, "loss": 0.0184, "step": 6039 }, { "epoch": 2.3356535189481824, "grad_norm": 0.28801716081193457, "learning_rate": 1.420989918797046e-06, "loss": 0.0219, "step": 6040 }, { "epoch": 2.3360402165506575, "grad_norm": 0.20914701954700982, "learning_rate": 1.419419252465538e-06, "loss": 0.0164, "step": 6041 }, { "epoch": 2.336426914153132, "grad_norm": 0.41070065503490777, "learning_rate": 1.4178493110597718e-06, "loss": 0.0283, "step": 6042 }, { "epoch": 2.3368136117556073, "grad_norm": 0.27467026028690017, "learning_rate": 1.4162800948975975e-06, "loss": 0.0179, "step": 6043 }, { "epoch": 2.337200309358082, "grad_norm": 0.34415023680993606, "learning_rate": 1.4147116042967175e-06, "loss": 0.0165, "step": 6044 }, { "epoch": 2.3375870069605567, "grad_norm": 0.23998962719111844, "learning_rate": 1.4131438395746938e-06, "loss": 0.0178, "step": 6045 }, { "epoch": 2.3379737045630318, "grad_norm": 0.2789196108683749, "learning_rate": 1.4115768010489312e-06, "loss": 0.0276, "step": 6046 }, { "epoch": 2.3383604021655064, "grad_norm": 0.28155401802455804, "learning_rate": 1.4100104890366972e-06, "loss": 0.0161, "step": 6047 }, { "epoch": 2.3387470997679816, "grad_norm": 0.33084211966026444, "learning_rate": 1.408444903855104e-06, "loss": 0.0159, "step": 6048 }, { "epoch": 2.3391337973704562, "grad_norm": 0.2667843988605441, "learning_rate": 1.4068800458211268e-06, "loss": 0.0205, "step": 6049 }, { "epoch": 2.3395204949729314, "grad_norm": 0.35161743918952715, "learning_rate": 1.405315915251581e-06, "loss": 0.0136, "step": 6050 }, { "epoch": 2.339907192575406, "grad_norm": 0.24223459013363938, "learning_rate": 1.4037525124631423e-06, "loss": 0.0155, "step": 6051 }, { "epoch": 2.3402938901778807, "grad_norm": 0.23347689125445112, "learning_rate": 1.4021898377723403e-06, "loss": 0.0149, "step": 6052 }, { "epoch": 2.340680587780356, "grad_norm": 0.2332812823981555, "learning_rate": 1.4006278914955535e-06, "loss": 0.0193, "step": 6053 }, { "epoch": 2.3410672853828305, "grad_norm": 0.4498552265612094, "learning_rate": 1.3990666739490138e-06, "loss": 0.0344, "step": 6054 }, { "epoch": 2.3414539829853056, "grad_norm": 0.24211268584353482, "learning_rate": 1.3975061854488047e-06, "loss": 0.0211, "step": 6055 }, { "epoch": 2.3418406805877803, "grad_norm": 0.22554597349773614, "learning_rate": 1.3959464263108663e-06, "loss": 0.0146, "step": 6056 }, { "epoch": 2.3422273781902554, "grad_norm": 0.3196987559951371, "learning_rate": 1.3943873968509864e-06, "loss": 0.0259, "step": 6057 }, { "epoch": 2.34261407579273, "grad_norm": 0.24304645123050606, "learning_rate": 1.3928290973848069e-06, "loss": 0.0134, "step": 6058 }, { "epoch": 2.3430007733952047, "grad_norm": 0.23532148453337406, "learning_rate": 1.3912715282278211e-06, "loss": 0.0154, "step": 6059 }, { "epoch": 2.34338747099768, "grad_norm": 0.2903722184518933, "learning_rate": 1.3897146896953745e-06, "loss": 0.0164, "step": 6060 }, { "epoch": 2.3437741686001545, "grad_norm": 0.2553407331061743, "learning_rate": 1.3881585821026672e-06, "loss": 0.0226, "step": 6061 }, { "epoch": 2.3441608662026296, "grad_norm": 0.2850219001165943, "learning_rate": 1.3866032057647482e-06, "loss": 0.0229, "step": 6062 }, { "epoch": 2.3445475638051043, "grad_norm": 0.4033789883592279, "learning_rate": 1.3850485609965203e-06, "loss": 0.0185, "step": 6063 }, { "epoch": 2.3449342614075794, "grad_norm": 0.27314487719243485, "learning_rate": 1.3834946481127365e-06, "loss": 0.0247, "step": 6064 }, { "epoch": 2.345320959010054, "grad_norm": 0.23656403656265731, "learning_rate": 1.3819414674280013e-06, "loss": 0.0196, "step": 6065 }, { "epoch": 2.345707656612529, "grad_norm": 0.40912547717347275, "learning_rate": 1.3803890192567776e-06, "loss": 0.0267, "step": 6066 }, { "epoch": 2.346094354215004, "grad_norm": 0.2886729515971641, "learning_rate": 1.3788373039133674e-06, "loss": 0.0179, "step": 6067 }, { "epoch": 2.3464810518174786, "grad_norm": 0.2993393993817129, "learning_rate": 1.377286321711937e-06, "loss": 0.0209, "step": 6068 }, { "epoch": 2.3468677494199537, "grad_norm": 0.27618114231499263, "learning_rate": 1.3757360729664949e-06, "loss": 0.0188, "step": 6069 }, { "epoch": 2.3472544470224284, "grad_norm": 0.35069771965605934, "learning_rate": 1.3741865579909108e-06, "loss": 0.0204, "step": 6070 }, { "epoch": 2.3476411446249035, "grad_norm": 0.3068706273819812, "learning_rate": 1.3726377770988946e-06, "loss": 0.0177, "step": 6071 }, { "epoch": 2.348027842227378, "grad_norm": 0.3096560095311317, "learning_rate": 1.3710897306040134e-06, "loss": 0.0178, "step": 6072 }, { "epoch": 2.348414539829853, "grad_norm": 0.31647932958452824, "learning_rate": 1.3695424188196887e-06, "loss": 0.0272, "step": 6073 }, { "epoch": 2.348801237432328, "grad_norm": 0.2538998155040216, "learning_rate": 1.3679958420591877e-06, "loss": 0.0153, "step": 6074 }, { "epoch": 2.3491879350348026, "grad_norm": 0.29324387346193426, "learning_rate": 1.3664500006356307e-06, "loss": 0.0182, "step": 6075 }, { "epoch": 2.3495746326372777, "grad_norm": 0.23403703270821147, "learning_rate": 1.3649048948619903e-06, "loss": 0.0164, "step": 6076 }, { "epoch": 2.3499613302397524, "grad_norm": 0.23966201962308992, "learning_rate": 1.3633605250510873e-06, "loss": 0.0171, "step": 6077 }, { "epoch": 2.3503480278422275, "grad_norm": 0.2567841489985654, "learning_rate": 1.3618168915155983e-06, "loss": 0.018, "step": 6078 }, { "epoch": 2.350734725444702, "grad_norm": 0.3467161846693151, "learning_rate": 1.360273994568046e-06, "loss": 0.0203, "step": 6079 }, { "epoch": 2.351121423047177, "grad_norm": 0.2671047652755823, "learning_rate": 1.3587318345208068e-06, "loss": 0.0237, "step": 6080 }, { "epoch": 2.351508120649652, "grad_norm": 0.2322934507468739, "learning_rate": 1.3571904116861045e-06, "loss": 0.015, "step": 6081 }, { "epoch": 2.3518948182521267, "grad_norm": 0.26552839620400037, "learning_rate": 1.35564972637602e-06, "loss": 0.0202, "step": 6082 }, { "epoch": 2.3522815158546018, "grad_norm": 0.29270124207833087, "learning_rate": 1.3541097789024793e-06, "loss": 0.0192, "step": 6083 }, { "epoch": 2.3526682134570764, "grad_norm": 0.2980751102304541, "learning_rate": 1.3525705695772606e-06, "loss": 0.02, "step": 6084 }, { "epoch": 2.3530549110595516, "grad_norm": 0.30697198383878205, "learning_rate": 1.3510320987119928e-06, "loss": 0.0221, "step": 6085 }, { "epoch": 2.3534416086620262, "grad_norm": 0.20454241253325658, "learning_rate": 1.3494943666181537e-06, "loss": 0.0126, "step": 6086 }, { "epoch": 2.3538283062645013, "grad_norm": 0.245599574449869, "learning_rate": 1.3479573736070767e-06, "loss": 0.0163, "step": 6087 }, { "epoch": 2.354215003866976, "grad_norm": 0.2830324716273415, "learning_rate": 1.34642111998994e-06, "loss": 0.0199, "step": 6088 }, { "epoch": 2.354601701469451, "grad_norm": 0.28717457994166706, "learning_rate": 1.3448856060777743e-06, "loss": 0.0205, "step": 6089 }, { "epoch": 2.354988399071926, "grad_norm": 0.2698852845116881, "learning_rate": 1.3433508321814597e-06, "loss": 0.0218, "step": 6090 }, { "epoch": 2.3553750966744005, "grad_norm": 0.287019171083398, "learning_rate": 1.341816798611727e-06, "loss": 0.0173, "step": 6091 }, { "epoch": 2.3557617942768756, "grad_norm": 0.24546809711681003, "learning_rate": 1.3402835056791602e-06, "loss": 0.0172, "step": 6092 }, { "epoch": 2.3561484918793503, "grad_norm": 0.2831972635945296, "learning_rate": 1.338750953694185e-06, "loss": 0.018, "step": 6093 }, { "epoch": 2.3565351894818254, "grad_norm": 0.22911081428150545, "learning_rate": 1.3372191429670872e-06, "loss": 0.0182, "step": 6094 }, { "epoch": 2.3569218870843, "grad_norm": 0.2549234191766632, "learning_rate": 1.3356880738079947e-06, "loss": 0.0189, "step": 6095 }, { "epoch": 2.357308584686775, "grad_norm": 0.2580645534555229, "learning_rate": 1.334157746526893e-06, "loss": 0.0192, "step": 6096 }, { "epoch": 2.35769528228925, "grad_norm": 0.3370025679355699, "learning_rate": 1.3326281614336073e-06, "loss": 0.0212, "step": 6097 }, { "epoch": 2.3580819798917245, "grad_norm": 0.2223644720650028, "learning_rate": 1.3310993188378184e-06, "loss": 0.016, "step": 6098 }, { "epoch": 2.3584686774941996, "grad_norm": 0.7458964317935528, "learning_rate": 1.32957121904906e-06, "loss": 0.014, "step": 6099 }, { "epoch": 2.3588553750966743, "grad_norm": 0.2972323667118688, "learning_rate": 1.32804386237671e-06, "loss": 0.0172, "step": 6100 }, { "epoch": 2.3592420726991494, "grad_norm": 0.27989328376089945, "learning_rate": 1.3265172491299977e-06, "loss": 0.018, "step": 6101 }, { "epoch": 2.359628770301624, "grad_norm": 0.32564623757126737, "learning_rate": 1.3249913796179997e-06, "loss": 0.0261, "step": 6102 }, { "epoch": 2.360015467904099, "grad_norm": 0.34515724138068116, "learning_rate": 1.3234662541496473e-06, "loss": 0.0224, "step": 6103 }, { "epoch": 2.360402165506574, "grad_norm": 0.3288237969564502, "learning_rate": 1.321941873033717e-06, "loss": 0.026, "step": 6104 }, { "epoch": 2.3607888631090486, "grad_norm": 0.25176200339075316, "learning_rate": 1.3204182365788348e-06, "loss": 0.0176, "step": 6105 }, { "epoch": 2.3611755607115237, "grad_norm": 0.3134327074525968, "learning_rate": 1.3188953450934776e-06, "loss": 0.0152, "step": 6106 }, { "epoch": 2.3615622583139984, "grad_norm": 0.3984178697872661, "learning_rate": 1.3173731988859684e-06, "loss": 0.0184, "step": 6107 }, { "epoch": 2.3619489559164735, "grad_norm": 0.25874931173385773, "learning_rate": 1.3158517982644852e-06, "loss": 0.0171, "step": 6108 }, { "epoch": 2.362335653518948, "grad_norm": 0.249612650055156, "learning_rate": 1.3143311435370503e-06, "loss": 0.0173, "step": 6109 }, { "epoch": 2.3627223511214233, "grad_norm": 0.2715982114605633, "learning_rate": 1.312811235011533e-06, "loss": 0.0236, "step": 6110 }, { "epoch": 2.363109048723898, "grad_norm": 0.22960292662688692, "learning_rate": 1.3112920729956575e-06, "loss": 0.0163, "step": 6111 }, { "epoch": 2.3634957463263726, "grad_norm": 0.26255046768692586, "learning_rate": 1.3097736577969917e-06, "loss": 0.0233, "step": 6112 }, { "epoch": 2.3638824439288477, "grad_norm": 0.18252979426273824, "learning_rate": 1.308255989722959e-06, "loss": 0.0136, "step": 6113 }, { "epoch": 2.3642691415313224, "grad_norm": 0.4098272065938083, "learning_rate": 1.3067390690808208e-06, "loss": 0.0248, "step": 6114 }, { "epoch": 2.3646558391337975, "grad_norm": 0.4454100414498876, "learning_rate": 1.3052228961776987e-06, "loss": 0.0218, "step": 6115 }, { "epoch": 2.365042536736272, "grad_norm": 0.2162875780266285, "learning_rate": 1.303707471320555e-06, "loss": 0.0145, "step": 6116 }, { "epoch": 2.3654292343387473, "grad_norm": 0.27616791362248805, "learning_rate": 1.3021927948162038e-06, "loss": 0.0223, "step": 6117 }, { "epoch": 2.365815931941222, "grad_norm": 0.4360092718025009, "learning_rate": 1.3006788669713072e-06, "loss": 0.0186, "step": 6118 }, { "epoch": 2.3662026295436966, "grad_norm": 0.2454529911826001, "learning_rate": 1.2991656880923743e-06, "loss": 0.0158, "step": 6119 }, { "epoch": 2.3665893271461718, "grad_norm": 0.2105103154304339, "learning_rate": 1.297653258485766e-06, "loss": 0.014, "step": 6120 }, { "epoch": 2.3669760247486464, "grad_norm": 0.3091154532181554, "learning_rate": 1.2961415784576886e-06, "loss": 0.0179, "step": 6121 }, { "epoch": 2.3673627223511216, "grad_norm": 0.2897984728539654, "learning_rate": 1.2946306483141968e-06, "loss": 0.0213, "step": 6122 }, { "epoch": 2.3677494199535962, "grad_norm": 0.2322175712138168, "learning_rate": 1.2931204683611953e-06, "loss": 0.015, "step": 6123 }, { "epoch": 2.3681361175560713, "grad_norm": 0.20678423951215422, "learning_rate": 1.2916110389044328e-06, "loss": 0.0188, "step": 6124 }, { "epoch": 2.368522815158546, "grad_norm": 0.23870691311250714, "learning_rate": 1.2901023602495126e-06, "loss": 0.0161, "step": 6125 }, { "epoch": 2.3689095127610207, "grad_norm": 0.2646963956286224, "learning_rate": 1.2885944327018813e-06, "loss": 0.0197, "step": 6126 }, { "epoch": 2.369296210363496, "grad_norm": 0.271953313377826, "learning_rate": 1.2870872565668335e-06, "loss": 0.0266, "step": 6127 }, { "epoch": 2.3696829079659705, "grad_norm": 0.2820785403284308, "learning_rate": 1.2855808321495123e-06, "loss": 0.0152, "step": 6128 }, { "epoch": 2.3700696055684456, "grad_norm": 0.2648242897763518, "learning_rate": 1.2840751597549106e-06, "loss": 0.0158, "step": 6129 }, { "epoch": 2.3704563031709203, "grad_norm": 0.340307447013666, "learning_rate": 1.2825702396878686e-06, "loss": 0.0217, "step": 6130 }, { "epoch": 2.3708430007733954, "grad_norm": 0.31600570074233814, "learning_rate": 1.2810660722530677e-06, "loss": 0.0223, "step": 6131 }, { "epoch": 2.37122969837587, "grad_norm": 0.3115357751979125, "learning_rate": 1.2795626577550475e-06, "loss": 0.021, "step": 6132 }, { "epoch": 2.3716163959783447, "grad_norm": 0.3826992647050658, "learning_rate": 1.278059996498186e-06, "loss": 0.0238, "step": 6133 }, { "epoch": 2.37200309358082, "grad_norm": 0.2672278261621382, "learning_rate": 1.2765580887867174e-06, "loss": 0.0199, "step": 6134 }, { "epoch": 2.3723897911832945, "grad_norm": 0.324985044151062, "learning_rate": 1.2750569349247128e-06, "loss": 0.0201, "step": 6135 }, { "epoch": 2.3727764887857696, "grad_norm": 0.3222238487775443, "learning_rate": 1.2735565352161005e-06, "loss": 0.0217, "step": 6136 }, { "epoch": 2.3731631863882443, "grad_norm": 0.32281615327082064, "learning_rate": 1.2720568899646512e-06, "loss": 0.0126, "step": 6137 }, { "epoch": 2.3735498839907194, "grad_norm": 0.35194431748525856, "learning_rate": 1.2705579994739824e-06, "loss": 0.013, "step": 6138 }, { "epoch": 2.373936581593194, "grad_norm": 0.3162772645882558, "learning_rate": 1.2690598640475615e-06, "loss": 0.0224, "step": 6139 }, { "epoch": 2.3743232791956688, "grad_norm": 0.2577334720542324, "learning_rate": 1.2675624839886985e-06, "loss": 0.0213, "step": 6140 }, { "epoch": 2.374709976798144, "grad_norm": 0.28931143768029377, "learning_rate": 1.2660658596005582e-06, "loss": 0.016, "step": 6141 }, { "epoch": 2.3750966744006186, "grad_norm": 0.2297445798607936, "learning_rate": 1.2645699911861448e-06, "loss": 0.0166, "step": 6142 }, { "epoch": 2.3754833720030937, "grad_norm": 0.3933750718525388, "learning_rate": 1.2630748790483133e-06, "loss": 0.0291, "step": 6143 }, { "epoch": 2.3758700696055683, "grad_norm": 0.2387356906161886, "learning_rate": 1.2615805234897643e-06, "loss": 0.0156, "step": 6144 }, { "epoch": 2.3762567672080435, "grad_norm": 0.2183623530041554, "learning_rate": 1.2600869248130436e-06, "loss": 0.015, "step": 6145 }, { "epoch": 2.376643464810518, "grad_norm": 0.2729297748225767, "learning_rate": 1.25859408332055e-06, "loss": 0.0242, "step": 6146 }, { "epoch": 2.377030162412993, "grad_norm": 0.2664539508802344, "learning_rate": 1.2571019993145223e-06, "loss": 0.0138, "step": 6147 }, { "epoch": 2.377416860015468, "grad_norm": 0.2918557485371479, "learning_rate": 1.2556106730970492e-06, "loss": 0.0272, "step": 6148 }, { "epoch": 2.3778035576179426, "grad_norm": 0.26255830933322566, "learning_rate": 1.2541201049700635e-06, "loss": 0.0147, "step": 6149 }, { "epoch": 2.3781902552204177, "grad_norm": 0.27322977094176654, "learning_rate": 1.252630295235347e-06, "loss": 0.0213, "step": 6150 }, { "epoch": 2.3785769528228924, "grad_norm": 0.2543167406383352, "learning_rate": 1.2511412441945282e-06, "loss": 0.0142, "step": 6151 }, { "epoch": 2.3789636504253675, "grad_norm": 0.2991016274936895, "learning_rate": 1.2496529521490809e-06, "loss": 0.0217, "step": 6152 }, { "epoch": 2.379350348027842, "grad_norm": 0.44578481130074216, "learning_rate": 1.248165419400324e-06, "loss": 0.0156, "step": 6153 }, { "epoch": 2.379737045630317, "grad_norm": 0.27055910126369287, "learning_rate": 1.246678646249423e-06, "loss": 0.0185, "step": 6154 }, { "epoch": 2.380123743232792, "grad_norm": 0.2143252386786859, "learning_rate": 1.245192632997394e-06, "loss": 0.014, "step": 6155 }, { "epoch": 2.3805104408352666, "grad_norm": 0.25158972436995486, "learning_rate": 1.243707379945095e-06, "loss": 0.0157, "step": 6156 }, { "epoch": 2.3808971384377418, "grad_norm": 0.352168430518058, "learning_rate": 1.2422228873932275e-06, "loss": 0.0137, "step": 6157 }, { "epoch": 2.3812838360402164, "grad_norm": 0.22762420272832187, "learning_rate": 1.2407391556423454e-06, "loss": 0.0147, "step": 6158 }, { "epoch": 2.3816705336426915, "grad_norm": 0.4691373897336156, "learning_rate": 1.2392561849928435e-06, "loss": 0.0195, "step": 6159 }, { "epoch": 2.382057231245166, "grad_norm": 0.2561745543106194, "learning_rate": 1.2377739757449697e-06, "loss": 0.0187, "step": 6160 }, { "epoch": 2.3824439288476413, "grad_norm": 0.3065798170094304, "learning_rate": 1.2362925281988053e-06, "loss": 0.0193, "step": 6161 }, { "epoch": 2.382830626450116, "grad_norm": 0.27547157078655327, "learning_rate": 1.2348118426542904e-06, "loss": 0.0207, "step": 6162 }, { "epoch": 2.3832173240525907, "grad_norm": 0.22497259979252526, "learning_rate": 1.2333319194112031e-06, "loss": 0.0158, "step": 6163 }, { "epoch": 2.383604021655066, "grad_norm": 0.22406621029714013, "learning_rate": 1.2318527587691692e-06, "loss": 0.0137, "step": 6164 }, { "epoch": 2.3839907192575405, "grad_norm": 0.24622485458021692, "learning_rate": 1.2303743610276602e-06, "loss": 0.0156, "step": 6165 }, { "epoch": 2.3843774168600156, "grad_norm": 0.21144536876674028, "learning_rate": 1.2288967264859913e-06, "loss": 0.0146, "step": 6166 }, { "epoch": 2.3847641144624903, "grad_norm": 0.24432561413163606, "learning_rate": 1.2274198554433286e-06, "loss": 0.0133, "step": 6167 }, { "epoch": 2.3851508120649654, "grad_norm": 0.2665733561869094, "learning_rate": 1.225943748198678e-06, "loss": 0.0157, "step": 6168 }, { "epoch": 2.38553750966744, "grad_norm": 0.2913278007120739, "learning_rate": 1.224468405050893e-06, "loss": 0.0176, "step": 6169 }, { "epoch": 2.385924207269915, "grad_norm": 0.29655942300314453, "learning_rate": 1.2229938262986719e-06, "loss": 0.0243, "step": 6170 }, { "epoch": 2.38631090487239, "grad_norm": 0.3054505974362176, "learning_rate": 1.2215200122405568e-06, "loss": 0.0208, "step": 6171 }, { "epoch": 2.3866976024748645, "grad_norm": 0.337751099655269, "learning_rate": 1.22004696317494e-06, "loss": 0.0256, "step": 6172 }, { "epoch": 2.3870843000773396, "grad_norm": 0.2098509844118684, "learning_rate": 1.218574679400054e-06, "loss": 0.0143, "step": 6173 }, { "epoch": 2.3874709976798143, "grad_norm": 0.34442910732510384, "learning_rate": 1.2171031612139784e-06, "loss": 0.0283, "step": 6174 }, { "epoch": 2.3878576952822894, "grad_norm": 0.27779125927107884, "learning_rate": 1.215632408914635e-06, "loss": 0.0153, "step": 6175 }, { "epoch": 2.388244392884764, "grad_norm": 0.2656279496898169, "learning_rate": 1.214162422799796e-06, "loss": 0.021, "step": 6176 }, { "epoch": 2.388631090487239, "grad_norm": 0.2709645484552655, "learning_rate": 1.2126932031670758e-06, "loss": 0.0153, "step": 6177 }, { "epoch": 2.389017788089714, "grad_norm": 0.3085615927409442, "learning_rate": 1.2112247503139286e-06, "loss": 0.0171, "step": 6178 }, { "epoch": 2.3894044856921886, "grad_norm": 0.26686141201923624, "learning_rate": 1.2097570645376621e-06, "loss": 0.0203, "step": 6179 }, { "epoch": 2.3897911832946637, "grad_norm": 0.27302873781918563, "learning_rate": 1.208290146135422e-06, "loss": 0.0186, "step": 6180 }, { "epoch": 2.3901778808971383, "grad_norm": 0.661848383109408, "learning_rate": 1.206823995404205e-06, "loss": 0.0335, "step": 6181 }, { "epoch": 2.3905645784996135, "grad_norm": 0.35417062954968886, "learning_rate": 1.2053586126408428e-06, "loss": 0.0239, "step": 6182 }, { "epoch": 2.390951276102088, "grad_norm": 0.27635182582393036, "learning_rate": 1.2038939981420223e-06, "loss": 0.0159, "step": 6183 }, { "epoch": 2.3913379737045632, "grad_norm": 0.2537293260318534, "learning_rate": 1.2024301522042675e-06, "loss": 0.0168, "step": 6184 }, { "epoch": 2.391724671307038, "grad_norm": 0.3961732599262001, "learning_rate": 1.2009670751239498e-06, "loss": 0.0167, "step": 6185 }, { "epoch": 2.3921113689095126, "grad_norm": 0.21759273141900726, "learning_rate": 1.1995047671972847e-06, "loss": 0.0141, "step": 6186 }, { "epoch": 2.3924980665119877, "grad_norm": 0.25176688210060283, "learning_rate": 1.1980432287203296e-06, "loss": 0.0153, "step": 6187 }, { "epoch": 2.3928847641144624, "grad_norm": 0.3864071916510346, "learning_rate": 1.1965824599889907e-06, "loss": 0.0228, "step": 6188 }, { "epoch": 2.3932714617169375, "grad_norm": 0.3246426881967687, "learning_rate": 1.1951224612990158e-06, "loss": 0.0239, "step": 6189 }, { "epoch": 2.393658159319412, "grad_norm": 0.24285472128611696, "learning_rate": 1.1936632329459951e-06, "loss": 0.0187, "step": 6190 }, { "epoch": 2.3940448569218873, "grad_norm": 0.23199476279685813, "learning_rate": 1.1922047752253657e-06, "loss": 0.0138, "step": 6191 }, { "epoch": 2.394431554524362, "grad_norm": 0.28329256586349744, "learning_rate": 1.1907470884324053e-06, "loss": 0.0184, "step": 6192 }, { "epoch": 2.3948182521268366, "grad_norm": 0.28334764686773123, "learning_rate": 1.1892901728622407e-06, "loss": 0.0193, "step": 6193 }, { "epoch": 2.3952049497293117, "grad_norm": 0.2652424497423092, "learning_rate": 1.187834028809839e-06, "loss": 0.0141, "step": 6194 }, { "epoch": 2.3955916473317864, "grad_norm": 0.3074867708390365, "learning_rate": 1.1863786565700109e-06, "loss": 0.0475, "step": 6195 }, { "epoch": 2.3959783449342615, "grad_norm": 0.27630565060997087, "learning_rate": 1.1849240564374115e-06, "loss": 0.0216, "step": 6196 }, { "epoch": 2.396365042536736, "grad_norm": 0.2007661706723802, "learning_rate": 1.1834702287065385e-06, "loss": 0.0106, "step": 6197 }, { "epoch": 2.3967517401392113, "grad_norm": 0.26739147753466536, "learning_rate": 1.182017173671739e-06, "loss": 0.02, "step": 6198 }, { "epoch": 2.397138437741686, "grad_norm": 0.22050350473933844, "learning_rate": 1.1805648916271927e-06, "loss": 0.0136, "step": 6199 }, { "epoch": 2.3975251353441607, "grad_norm": 0.2982580982862624, "learning_rate": 1.1791133828669343e-06, "loss": 0.0213, "step": 6200 }, { "epoch": 2.397911832946636, "grad_norm": 0.23359832314245563, "learning_rate": 1.177662647684833e-06, "loss": 0.0223, "step": 6201 }, { "epoch": 2.3982985305491105, "grad_norm": 0.2629405882079705, "learning_rate": 1.1762126863746104e-06, "loss": 0.016, "step": 6202 }, { "epoch": 2.3986852281515856, "grad_norm": 0.30894134911424226, "learning_rate": 1.1747634992298219e-06, "loss": 0.0211, "step": 6203 }, { "epoch": 2.3990719257540603, "grad_norm": 0.27934839527817573, "learning_rate": 1.1733150865438692e-06, "loss": 0.0233, "step": 6204 }, { "epoch": 2.3994586233565354, "grad_norm": 0.2647344986275104, "learning_rate": 1.1718674486100029e-06, "loss": 0.0177, "step": 6205 }, { "epoch": 2.39984532095901, "grad_norm": 0.21264961539966654, "learning_rate": 1.1704205857213096e-06, "loss": 0.0169, "step": 6206 }, { "epoch": 2.4002320185614847, "grad_norm": 0.3332303911143398, "learning_rate": 1.1689744981707229e-06, "loss": 0.0191, "step": 6207 }, { "epoch": 2.40061871616396, "grad_norm": 0.32457400080832677, "learning_rate": 1.1675291862510163e-06, "loss": 0.0167, "step": 6208 }, { "epoch": 2.4010054137664345, "grad_norm": 0.34226620005111114, "learning_rate": 1.1660846502548112e-06, "loss": 0.0277, "step": 6209 }, { "epoch": 2.4013921113689096, "grad_norm": 0.27772599054591945, "learning_rate": 1.164640890474567e-06, "loss": 0.0221, "step": 6210 }, { "epoch": 2.4017788089713843, "grad_norm": 0.31192747146378463, "learning_rate": 1.163197907202588e-06, "loss": 0.0145, "step": 6211 }, { "epoch": 2.4021655065738594, "grad_norm": 0.21217316884264295, "learning_rate": 1.1617557007310215e-06, "loss": 0.0119, "step": 6212 }, { "epoch": 2.402552204176334, "grad_norm": 0.2072237933122646, "learning_rate": 1.1603142713518555e-06, "loss": 0.0126, "step": 6213 }, { "epoch": 2.4029389017788088, "grad_norm": 0.24168887635379827, "learning_rate": 1.158873619356925e-06, "loss": 0.0145, "step": 6214 }, { "epoch": 2.403325599381284, "grad_norm": 0.23120360401746076, "learning_rate": 1.1574337450379041e-06, "loss": 0.0139, "step": 6215 }, { "epoch": 2.4037122969837585, "grad_norm": 0.2428876008378731, "learning_rate": 1.15599464868631e-06, "loss": 0.0153, "step": 6216 }, { "epoch": 2.4040989945862337, "grad_norm": 0.2306154757892645, "learning_rate": 1.1545563305935026e-06, "loss": 0.0158, "step": 6217 }, { "epoch": 2.4044856921887083, "grad_norm": 0.2772132181134098, "learning_rate": 1.153118791050683e-06, "loss": 0.0176, "step": 6218 }, { "epoch": 2.4048723897911835, "grad_norm": 0.2972595166664899, "learning_rate": 1.1516820303488985e-06, "loss": 0.0206, "step": 6219 }, { "epoch": 2.405259087393658, "grad_norm": 0.287745967384458, "learning_rate": 1.1502460487790357e-06, "loss": 0.0268, "step": 6220 }, { "epoch": 2.405645784996133, "grad_norm": 0.3489709941347012, "learning_rate": 1.1488108466318231e-06, "loss": 0.0204, "step": 6221 }, { "epoch": 2.406032482598608, "grad_norm": 0.2607372238490731, "learning_rate": 1.1473764241978324e-06, "loss": 0.0222, "step": 6222 }, { "epoch": 2.4064191802010826, "grad_norm": 0.3206745802432405, "learning_rate": 1.1459427817674762e-06, "loss": 0.0255, "step": 6223 }, { "epoch": 2.4068058778035577, "grad_norm": 0.30096358833977643, "learning_rate": 1.1445099196310144e-06, "loss": 0.0285, "step": 6224 }, { "epoch": 2.4071925754060324, "grad_norm": 0.2544329568511881, "learning_rate": 1.143077838078539e-06, "loss": 0.0166, "step": 6225 }, { "epoch": 2.4075792730085075, "grad_norm": 0.4745265302517763, "learning_rate": 1.141646537399994e-06, "loss": 0.0303, "step": 6226 }, { "epoch": 2.407965970610982, "grad_norm": 0.2708993536244965, "learning_rate": 1.140216017885159e-06, "loss": 0.0247, "step": 6227 }, { "epoch": 2.408352668213457, "grad_norm": 0.2596818842194758, "learning_rate": 1.1387862798236609e-06, "loss": 0.0162, "step": 6228 }, { "epoch": 2.408739365815932, "grad_norm": 0.2879804888128722, "learning_rate": 1.1373573235049612e-06, "loss": 0.0181, "step": 6229 }, { "epoch": 2.4091260634184066, "grad_norm": 0.29877856290891075, "learning_rate": 1.1359291492183665e-06, "loss": 0.0241, "step": 6230 }, { "epoch": 2.4095127610208817, "grad_norm": 0.27635685629757334, "learning_rate": 1.1345017572530287e-06, "loss": 0.0174, "step": 6231 }, { "epoch": 2.4098994586233564, "grad_norm": 0.2768145640669899, "learning_rate": 1.1330751478979368e-06, "loss": 0.0196, "step": 6232 }, { "epoch": 2.4102861562258315, "grad_norm": 0.22408785548802213, "learning_rate": 1.1316493214419227e-06, "loss": 0.0141, "step": 6233 }, { "epoch": 2.410672853828306, "grad_norm": 0.3827476313150076, "learning_rate": 1.1302242781736578e-06, "loss": 0.0263, "step": 6234 }, { "epoch": 2.4110595514307813, "grad_norm": 0.2705966736558675, "learning_rate": 1.1288000183816605e-06, "loss": 0.0217, "step": 6235 }, { "epoch": 2.411446249033256, "grad_norm": 0.4552574867552009, "learning_rate": 1.127376542354285e-06, "loss": 0.0254, "step": 6236 }, { "epoch": 2.4118329466357307, "grad_norm": 0.2897981487333659, "learning_rate": 1.1259538503797285e-06, "loss": 0.0223, "step": 6237 }, { "epoch": 2.412219644238206, "grad_norm": 0.30489963832497013, "learning_rate": 1.124531942746031e-06, "loss": 0.0195, "step": 6238 }, { "epoch": 2.4126063418406805, "grad_norm": 0.2681630742928126, "learning_rate": 1.1231108197410694e-06, "loss": 0.0219, "step": 6239 }, { "epoch": 2.4129930394431556, "grad_norm": 0.22032920107436885, "learning_rate": 1.1216904816525693e-06, "loss": 0.0126, "step": 6240 }, { "epoch": 2.4133797370456302, "grad_norm": 0.31105398162554554, "learning_rate": 1.1202709287680902e-06, "loss": 0.0248, "step": 6241 }, { "epoch": 2.4137664346481054, "grad_norm": 0.23460169633135133, "learning_rate": 1.1188521613750363e-06, "loss": 0.0159, "step": 6242 }, { "epoch": 2.41415313225058, "grad_norm": 0.23469938621255365, "learning_rate": 1.117434179760651e-06, "loss": 0.0119, "step": 6243 }, { "epoch": 2.414539829853055, "grad_norm": 0.2811737706711765, "learning_rate": 1.1160169842120178e-06, "loss": 0.0238, "step": 6244 }, { "epoch": 2.41492652745553, "grad_norm": 0.23849357170758811, "learning_rate": 1.1146005750160682e-06, "loss": 0.0168, "step": 6245 }, { "epoch": 2.4153132250580045, "grad_norm": 0.20973110625342808, "learning_rate": 1.1131849524595618e-06, "loss": 0.0127, "step": 6246 }, { "epoch": 2.4156999226604796, "grad_norm": 0.28013286315512587, "learning_rate": 1.1117701168291111e-06, "loss": 0.0169, "step": 6247 }, { "epoch": 2.4160866202629543, "grad_norm": 0.29091112490551996, "learning_rate": 1.1103560684111613e-06, "loss": 0.0202, "step": 6248 }, { "epoch": 2.4164733178654294, "grad_norm": 0.2835236988822846, "learning_rate": 1.1089428074920055e-06, "loss": 0.0151, "step": 6249 }, { "epoch": 2.416860015467904, "grad_norm": 0.29769741247958625, "learning_rate": 1.1075303343577693e-06, "loss": 0.0142, "step": 6250 }, { "epoch": 2.417246713070379, "grad_norm": 0.23118533062493563, "learning_rate": 1.1061186492944215e-06, "loss": 0.0169, "step": 6251 }, { "epoch": 2.417633410672854, "grad_norm": 0.27126213582136216, "learning_rate": 1.1047077525877754e-06, "loss": 0.0172, "step": 6252 }, { "epoch": 2.4180201082753285, "grad_norm": 0.29004269491153994, "learning_rate": 1.1032976445234806e-06, "loss": 0.0201, "step": 6253 }, { "epoch": 2.4184068058778037, "grad_norm": 0.2499867068168117, "learning_rate": 1.1018883253870283e-06, "loss": 0.017, "step": 6254 }, { "epoch": 2.4187935034802783, "grad_norm": 0.19735528246737732, "learning_rate": 1.1004797954637486e-06, "loss": 0.0134, "step": 6255 }, { "epoch": 2.4191802010827534, "grad_norm": 0.35078914697880426, "learning_rate": 1.0990720550388146e-06, "loss": 0.0254, "step": 6256 }, { "epoch": 2.419566898685228, "grad_norm": 0.2635134627905387, "learning_rate": 1.097665104397238e-06, "loss": 0.0192, "step": 6257 }, { "epoch": 2.4199535962877032, "grad_norm": 0.25503292316069687, "learning_rate": 1.0962589438238696e-06, "loss": 0.0168, "step": 6258 }, { "epoch": 2.420340293890178, "grad_norm": 0.24632556300688668, "learning_rate": 1.0948535736034016e-06, "loss": 0.0166, "step": 6259 }, { "epoch": 2.4207269914926526, "grad_norm": 0.3806444290265964, "learning_rate": 1.093448994020364e-06, "loss": 0.0419, "step": 6260 }, { "epoch": 2.4211136890951277, "grad_norm": 0.27431619872533597, "learning_rate": 1.0920452053591319e-06, "loss": 0.0146, "step": 6261 }, { "epoch": 2.4215003866976024, "grad_norm": 0.2994609397399044, "learning_rate": 1.0906422079039153e-06, "loss": 0.0233, "step": 6262 }, { "epoch": 2.4218870843000775, "grad_norm": 0.28690703681952734, "learning_rate": 1.089240001938765e-06, "loss": 0.0226, "step": 6263 }, { "epoch": 2.422273781902552, "grad_norm": 0.32667877339033047, "learning_rate": 1.0878385877475734e-06, "loss": 0.0208, "step": 6264 }, { "epoch": 2.4226604795050273, "grad_norm": 0.25632758787337573, "learning_rate": 1.0864379656140689e-06, "loss": 0.0254, "step": 6265 }, { "epoch": 2.423047177107502, "grad_norm": 0.30505387100694514, "learning_rate": 1.085038135821826e-06, "loss": 0.0211, "step": 6266 }, { "epoch": 2.4234338747099766, "grad_norm": 0.3294917059027911, "learning_rate": 1.0836390986542506e-06, "loss": 0.0248, "step": 6267 }, { "epoch": 2.4238205723124517, "grad_norm": 0.2727297391907719, "learning_rate": 1.0822408543945951e-06, "loss": 0.02, "step": 6268 }, { "epoch": 2.4242072699149264, "grad_norm": 0.3343428640621957, "learning_rate": 1.0808434033259475e-06, "loss": 0.0201, "step": 6269 }, { "epoch": 2.4245939675174015, "grad_norm": 0.21530474501224614, "learning_rate": 1.0794467457312364e-06, "loss": 0.0131, "step": 6270 }, { "epoch": 2.424980665119876, "grad_norm": 0.36745615961796224, "learning_rate": 1.0780508818932294e-06, "loss": 0.0207, "step": 6271 }, { "epoch": 2.4253673627223513, "grad_norm": 0.24630896344094716, "learning_rate": 1.0766558120945331e-06, "loss": 0.0207, "step": 6272 }, { "epoch": 2.425754060324826, "grad_norm": 0.307477574177931, "learning_rate": 1.0752615366175956e-06, "loss": 0.021, "step": 6273 }, { "epoch": 2.4261407579273007, "grad_norm": 0.27647190920863707, "learning_rate": 1.0738680557447017e-06, "loss": 0.0282, "step": 6274 }, { "epoch": 2.426527455529776, "grad_norm": 0.19094962591576584, "learning_rate": 1.0724753697579755e-06, "loss": 0.0112, "step": 6275 }, { "epoch": 2.4269141531322505, "grad_norm": 0.27897494080137536, "learning_rate": 1.0710834789393816e-06, "loss": 0.0175, "step": 6276 }, { "epoch": 2.4273008507347256, "grad_norm": 0.39741922606525215, "learning_rate": 1.0696923835707201e-06, "loss": 0.0153, "step": 6277 }, { "epoch": 2.4276875483372002, "grad_norm": 0.38374232998141555, "learning_rate": 1.0683020839336366e-06, "loss": 0.0175, "step": 6278 }, { "epoch": 2.4280742459396754, "grad_norm": 0.26407248721789556, "learning_rate": 1.0669125803096102e-06, "loss": 0.0246, "step": 6279 }, { "epoch": 2.42846094354215, "grad_norm": 0.3335909809627517, "learning_rate": 1.06552387297996e-06, "loss": 0.0236, "step": 6280 }, { "epoch": 2.4288476411446247, "grad_norm": 0.20772642117398857, "learning_rate": 1.0641359622258424e-06, "loss": 0.0126, "step": 6281 }, { "epoch": 2.4292343387471, "grad_norm": 0.3189540325079515, "learning_rate": 1.0627488483282578e-06, "loss": 0.0168, "step": 6282 }, { "epoch": 2.4296210363495745, "grad_norm": 0.2614130520455643, "learning_rate": 1.0613625315680405e-06, "loss": 0.0166, "step": 6283 }, { "epoch": 2.4300077339520496, "grad_norm": 0.22892253675154614, "learning_rate": 1.059977012225864e-06, "loss": 0.0186, "step": 6284 }, { "epoch": 2.4303944315545243, "grad_norm": 0.23455344885411222, "learning_rate": 1.0585922905822417e-06, "loss": 0.0199, "step": 6285 }, { "epoch": 2.4307811291569994, "grad_norm": 0.3039627285091294, "learning_rate": 1.0572083669175231e-06, "loss": 0.0271, "step": 6286 }, { "epoch": 2.431167826759474, "grad_norm": 0.20864055489587388, "learning_rate": 1.0558252415119009e-06, "loss": 0.0128, "step": 6287 }, { "epoch": 2.4315545243619487, "grad_norm": 0.18977923097313232, "learning_rate": 1.054442914645402e-06, "loss": 0.0134, "step": 6288 }, { "epoch": 2.431941221964424, "grad_norm": 0.21653059103590339, "learning_rate": 1.0530613865978922e-06, "loss": 0.0177, "step": 6289 }, { "epoch": 2.4323279195668985, "grad_norm": 0.23892762142136534, "learning_rate": 1.0516806576490774e-06, "loss": 0.017, "step": 6290 }, { "epoch": 2.4327146171693736, "grad_norm": 0.24773181860226526, "learning_rate": 1.0503007280784977e-06, "loss": 0.0153, "step": 6291 }, { "epoch": 2.4331013147718483, "grad_norm": 0.2552265176767849, "learning_rate": 1.0489215981655386e-06, "loss": 0.0177, "step": 6292 }, { "epoch": 2.4334880123743234, "grad_norm": 0.34941254422816204, "learning_rate": 1.0475432681894143e-06, "loss": 0.0226, "step": 6293 }, { "epoch": 2.433874709976798, "grad_norm": 0.2910620606143121, "learning_rate": 1.0461657384291861e-06, "loss": 0.0199, "step": 6294 }, { "epoch": 2.434261407579273, "grad_norm": 0.33310816736163734, "learning_rate": 1.0447890091637463e-06, "loss": 0.0304, "step": 6295 }, { "epoch": 2.434648105181748, "grad_norm": 0.41745875576061137, "learning_rate": 1.0434130806718323e-06, "loss": 0.0242, "step": 6296 }, { "epoch": 2.4350348027842226, "grad_norm": 0.2855443823348092, "learning_rate": 1.0420379532320107e-06, "loss": 0.0186, "step": 6297 }, { "epoch": 2.4354215003866977, "grad_norm": 0.2881273457030403, "learning_rate": 1.0406636271226899e-06, "loss": 0.0237, "step": 6298 }, { "epoch": 2.4358081979891724, "grad_norm": 0.2889118742228192, "learning_rate": 1.0392901026221203e-06, "loss": 0.0156, "step": 6299 }, { "epoch": 2.4361948955916475, "grad_norm": 0.23447830339886322, "learning_rate": 1.0379173800083841e-06, "loss": 0.0117, "step": 6300 }, { "epoch": 2.436581593194122, "grad_norm": 0.2817564031850495, "learning_rate": 1.0365454595594038e-06, "loss": 0.0232, "step": 6301 }, { "epoch": 2.436968290796597, "grad_norm": 0.32338333806573166, "learning_rate": 1.035174341552938e-06, "loss": 0.0214, "step": 6302 }, { "epoch": 2.437354988399072, "grad_norm": 0.23644536626591156, "learning_rate": 1.0338040262665839e-06, "loss": 0.0177, "step": 6303 }, { "epoch": 2.4377416860015466, "grad_norm": 0.32314609268267974, "learning_rate": 1.032434513977777e-06, "loss": 0.0256, "step": 6304 }, { "epoch": 2.4381283836040217, "grad_norm": 0.36764836805625173, "learning_rate": 1.031065804963789e-06, "loss": 0.0178, "step": 6305 }, { "epoch": 2.4385150812064964, "grad_norm": 0.23650015770391783, "learning_rate": 1.0296978995017287e-06, "loss": 0.0186, "step": 6306 }, { "epoch": 2.4389017788089715, "grad_norm": 0.24082268177954877, "learning_rate": 1.028330797868542e-06, "loss": 0.0153, "step": 6307 }, { "epoch": 2.439288476411446, "grad_norm": 0.32009727136462307, "learning_rate": 1.0269645003410155e-06, "loss": 0.025, "step": 6308 }, { "epoch": 2.439675174013921, "grad_norm": 0.3214266721249286, "learning_rate": 1.02559900719577e-06, "loss": 0.0244, "step": 6309 }, { "epoch": 2.440061871616396, "grad_norm": 0.21400342177747006, "learning_rate": 1.0242343187092597e-06, "loss": 0.0145, "step": 6310 }, { "epoch": 2.4404485692188707, "grad_norm": 0.2872601951017198, "learning_rate": 1.022870435157784e-06, "loss": 0.0241, "step": 6311 }, { "epoch": 2.4408352668213458, "grad_norm": 0.3528987275808406, "learning_rate": 1.0215073568174726e-06, "loss": 0.0316, "step": 6312 }, { "epoch": 2.4412219644238204, "grad_norm": 0.2582517660280186, "learning_rate": 1.020145083964299e-06, "loss": 0.0224, "step": 6313 }, { "epoch": 2.4416086620262956, "grad_norm": 0.2333747679038025, "learning_rate": 1.0187836168740638e-06, "loss": 0.0165, "step": 6314 }, { "epoch": 2.4419953596287702, "grad_norm": 0.26906980463510805, "learning_rate": 1.0174229558224151e-06, "loss": 0.0193, "step": 6315 }, { "epoch": 2.4423820572312454, "grad_norm": 0.3131130534320002, "learning_rate": 1.0160631010848303e-06, "loss": 0.0248, "step": 6316 }, { "epoch": 2.44276875483372, "grad_norm": 0.46056387436145474, "learning_rate": 1.014704052936627e-06, "loss": 0.0172, "step": 6317 }, { "epoch": 2.443155452436195, "grad_norm": 0.28932609737571063, "learning_rate": 1.0133458116529582e-06, "loss": 0.0193, "step": 6318 }, { "epoch": 2.44354215003867, "grad_norm": 0.2133416485798993, "learning_rate": 1.0119883775088124e-06, "loss": 0.0222, "step": 6319 }, { "epoch": 2.4439288476411445, "grad_norm": 0.27198927220365415, "learning_rate": 1.0106317507790197e-06, "loss": 0.0166, "step": 6320 }, { "epoch": 2.4443155452436196, "grad_norm": 0.2445067472316342, "learning_rate": 1.0092759317382412e-06, "loss": 0.0191, "step": 6321 }, { "epoch": 2.4447022428460943, "grad_norm": 0.23590716479566146, "learning_rate": 1.0079209206609764e-06, "loss": 0.0189, "step": 6322 }, { "epoch": 2.4450889404485694, "grad_norm": 0.20582143937066497, "learning_rate": 1.0065667178215621e-06, "loss": 0.0144, "step": 6323 }, { "epoch": 2.445475638051044, "grad_norm": 0.24730996147573497, "learning_rate": 1.0052133234941686e-06, "loss": 0.0196, "step": 6324 }, { "epoch": 2.445862335653519, "grad_norm": 0.2244334287491697, "learning_rate": 1.0038607379528081e-06, "loss": 0.0175, "step": 6325 }, { "epoch": 2.446249033255994, "grad_norm": 0.44578409535184654, "learning_rate": 1.0025089614713236e-06, "loss": 0.0185, "step": 6326 }, { "epoch": 2.4466357308584685, "grad_norm": 0.2696562399184923, "learning_rate": 1.0011579943233967e-06, "loss": 0.0153, "step": 6327 }, { "epoch": 2.4470224284609436, "grad_norm": 0.28248350758451146, "learning_rate": 9.99807836782543e-07, "loss": 0.0278, "step": 6328 }, { "epoch": 2.4474091260634183, "grad_norm": 0.345575719833794, "learning_rate": 9.984584891221194e-07, "loss": 0.0252, "step": 6329 }, { "epoch": 2.4477958236658934, "grad_norm": 0.32478638378618324, "learning_rate": 9.971099516153126e-07, "loss": 0.0223, "step": 6330 }, { "epoch": 2.448182521268368, "grad_norm": 0.22837962374607854, "learning_rate": 9.957622245351495e-07, "loss": 0.0149, "step": 6331 }, { "epoch": 2.4485692188708432, "grad_norm": 0.2771159584689809, "learning_rate": 9.9441530815449e-07, "loss": 0.0219, "step": 6332 }, { "epoch": 2.448955916473318, "grad_norm": 0.258359413081953, "learning_rate": 9.930692027460314e-07, "loss": 0.0175, "step": 6333 }, { "epoch": 2.4493426140757926, "grad_norm": 0.28078018905368, "learning_rate": 9.9172390858231e-07, "loss": 0.0173, "step": 6334 }, { "epoch": 2.4497293116782677, "grad_norm": 0.3353185067720457, "learning_rate": 9.903794259356886e-07, "loss": 0.0168, "step": 6335 }, { "epoch": 2.4501160092807424, "grad_norm": 0.26157580270446906, "learning_rate": 9.89035755078377e-07, "loss": 0.0161, "step": 6336 }, { "epoch": 2.4505027068832175, "grad_norm": 0.3010753113529805, "learning_rate": 9.876928962824128e-07, "loss": 0.0259, "step": 6337 }, { "epoch": 2.450889404485692, "grad_norm": 0.3771844871465141, "learning_rate": 9.863508498196728e-07, "loss": 0.0255, "step": 6338 }, { "epoch": 2.4512761020881673, "grad_norm": 0.28420025406664684, "learning_rate": 9.850096159618677e-07, "loss": 0.0173, "step": 6339 }, { "epoch": 2.451662799690642, "grad_norm": 0.39627695842841026, "learning_rate": 9.836691949805421e-07, "loss": 0.0145, "step": 6340 }, { "epoch": 2.4520494972931166, "grad_norm": 0.25812330102880693, "learning_rate": 9.823295871470824e-07, "loss": 0.0231, "step": 6341 }, { "epoch": 2.4524361948955917, "grad_norm": 0.2145326357050024, "learning_rate": 9.809907927327035e-07, "loss": 0.0124, "step": 6342 }, { "epoch": 2.4528228924980664, "grad_norm": 0.2752209914161608, "learning_rate": 9.796528120084587e-07, "loss": 0.0204, "step": 6343 }, { "epoch": 2.4532095901005415, "grad_norm": 0.2699271184753934, "learning_rate": 9.783156452452359e-07, "loss": 0.0215, "step": 6344 }, { "epoch": 2.453596287703016, "grad_norm": 0.2724781493293756, "learning_rate": 9.769792927137572e-07, "loss": 0.0198, "step": 6345 }, { "epoch": 2.4539829853054913, "grad_norm": 0.25557475107912025, "learning_rate": 9.756437546845842e-07, "loss": 0.0112, "step": 6346 }, { "epoch": 2.454369682907966, "grad_norm": 0.3213256671406853, "learning_rate": 9.743090314281084e-07, "loss": 0.0214, "step": 6347 }, { "epoch": 2.4547563805104406, "grad_norm": 0.43359134876221994, "learning_rate": 9.729751232145584e-07, "loss": 0.0267, "step": 6348 }, { "epoch": 2.4551430781129158, "grad_norm": 0.2634840836701825, "learning_rate": 9.716420303139978e-07, "loss": 0.0221, "step": 6349 }, { "epoch": 2.4555297757153904, "grad_norm": 0.22288279670034053, "learning_rate": 9.703097529963245e-07, "loss": 0.016, "step": 6350 }, { "epoch": 2.4559164733178656, "grad_norm": 0.27646125965125434, "learning_rate": 9.689782915312733e-07, "loss": 0.018, "step": 6351 }, { "epoch": 2.4563031709203402, "grad_norm": 0.33848270537328834, "learning_rate": 9.676476461884122e-07, "loss": 0.0301, "step": 6352 }, { "epoch": 2.4566898685228153, "grad_norm": 0.27773464332469805, "learning_rate": 9.663178172371435e-07, "loss": 0.0159, "step": 6353 }, { "epoch": 2.45707656612529, "grad_norm": 0.2476975642458953, "learning_rate": 9.649888049467039e-07, "loss": 0.0184, "step": 6354 }, { "epoch": 2.4574632637277647, "grad_norm": 0.31596994493324027, "learning_rate": 9.636606095861683e-07, "loss": 0.0248, "step": 6355 }, { "epoch": 2.45784996133024, "grad_norm": 0.22951248312933237, "learning_rate": 9.623332314244432e-07, "loss": 0.0122, "step": 6356 }, { "epoch": 2.4582366589327145, "grad_norm": 0.27625271989315625, "learning_rate": 9.610066707302667e-07, "loss": 0.0191, "step": 6357 }, { "epoch": 2.4586233565351896, "grad_norm": 0.3006502640571688, "learning_rate": 9.596809277722186e-07, "loss": 0.0191, "step": 6358 }, { "epoch": 2.4590100541376643, "grad_norm": 0.2531643217107604, "learning_rate": 9.583560028187068e-07, "loss": 0.021, "step": 6359 }, { "epoch": 2.4593967517401394, "grad_norm": 0.26527962318570586, "learning_rate": 9.570318961379793e-07, "loss": 0.0206, "step": 6360 }, { "epoch": 2.459783449342614, "grad_norm": 0.2609929820190006, "learning_rate": 9.55708607998111e-07, "loss": 0.0156, "step": 6361 }, { "epoch": 2.4601701469450887, "grad_norm": 0.28645806458956724, "learning_rate": 9.543861386670184e-07, "loss": 0.0201, "step": 6362 }, { "epoch": 2.460556844547564, "grad_norm": 0.2380141424651419, "learning_rate": 9.530644884124485e-07, "loss": 0.0166, "step": 6363 }, { "epoch": 2.4609435421500385, "grad_norm": 0.24528933411974954, "learning_rate": 9.517436575019834e-07, "loss": 0.0167, "step": 6364 }, { "epoch": 2.4613302397525136, "grad_norm": 0.2903339293944282, "learning_rate": 9.504236462030376e-07, "loss": 0.0185, "step": 6365 }, { "epoch": 2.4617169373549883, "grad_norm": 0.2239803240400058, "learning_rate": 9.49104454782861e-07, "loss": 0.0174, "step": 6366 }, { "epoch": 2.4621036349574634, "grad_norm": 0.3749426362042917, "learning_rate": 9.4778608350854e-07, "loss": 0.0206, "step": 6367 }, { "epoch": 2.462490332559938, "grad_norm": 0.19769955060198566, "learning_rate": 9.464685326469913e-07, "loss": 0.0142, "step": 6368 }, { "epoch": 2.4628770301624128, "grad_norm": 0.23763113843046643, "learning_rate": 9.451518024649664e-07, "loss": 0.0142, "step": 6369 }, { "epoch": 2.463263727764888, "grad_norm": 0.3163459974384166, "learning_rate": 9.438358932290509e-07, "loss": 0.0218, "step": 6370 }, { "epoch": 2.4636504253673626, "grad_norm": 0.2408811396524928, "learning_rate": 9.425208052056639e-07, "loss": 0.0163, "step": 6371 }, { "epoch": 2.4640371229698377, "grad_norm": 0.24784398798788837, "learning_rate": 9.412065386610598e-07, "loss": 0.0235, "step": 6372 }, { "epoch": 2.4644238205723124, "grad_norm": 0.3056850531929148, "learning_rate": 9.398930938613254e-07, "loss": 0.0226, "step": 6373 }, { "epoch": 2.4648105181747875, "grad_norm": 0.3258094305554858, "learning_rate": 9.385804710723812e-07, "loss": 0.0217, "step": 6374 }, { "epoch": 2.465197215777262, "grad_norm": 0.29868595516634805, "learning_rate": 9.372686705599804e-07, "loss": 0.0212, "step": 6375 }, { "epoch": 2.465583913379737, "grad_norm": 0.24756677926005963, "learning_rate": 9.359576925897096e-07, "loss": 0.0174, "step": 6376 }, { "epoch": 2.465970610982212, "grad_norm": 0.2975463580542829, "learning_rate": 9.346475374269948e-07, "loss": 0.0179, "step": 6377 }, { "epoch": 2.4663573085846866, "grad_norm": 0.2923478329767711, "learning_rate": 9.333382053370843e-07, "loss": 0.014, "step": 6378 }, { "epoch": 2.4667440061871617, "grad_norm": 0.23857565334989012, "learning_rate": 9.3202969658507e-07, "loss": 0.0139, "step": 6379 }, { "epoch": 2.4671307037896364, "grad_norm": 0.27696742541014713, "learning_rate": 9.307220114358706e-07, "loss": 0.0174, "step": 6380 }, { "epoch": 2.4675174013921115, "grad_norm": 0.33533773469541506, "learning_rate": 9.294151501542442e-07, "loss": 0.0417, "step": 6381 }, { "epoch": 2.467904098994586, "grad_norm": 0.2962172279733939, "learning_rate": 9.281091130047753e-07, "loss": 0.022, "step": 6382 }, { "epoch": 2.468290796597061, "grad_norm": 0.2906763052479689, "learning_rate": 9.268039002518841e-07, "loss": 0.023, "step": 6383 }, { "epoch": 2.468677494199536, "grad_norm": 0.23038812795008592, "learning_rate": 9.254995121598271e-07, "loss": 0.0159, "step": 6384 }, { "epoch": 2.4690641918020106, "grad_norm": 0.24611832107854484, "learning_rate": 9.241959489926899e-07, "loss": 0.0138, "step": 6385 }, { "epoch": 2.4694508894044858, "grad_norm": 0.3140686015033842, "learning_rate": 9.228932110143923e-07, "loss": 0.0291, "step": 6386 }, { "epoch": 2.4698375870069604, "grad_norm": 0.2632097223021515, "learning_rate": 9.21591298488686e-07, "loss": 0.0217, "step": 6387 }, { "epoch": 2.4702242846094355, "grad_norm": 0.2638033612763533, "learning_rate": 9.202902116791596e-07, "loss": 0.0175, "step": 6388 }, { "epoch": 2.4706109822119102, "grad_norm": 0.2581488159575018, "learning_rate": 9.189899508492295e-07, "loss": 0.0195, "step": 6389 }, { "epoch": 2.4709976798143853, "grad_norm": 0.2393168638658289, "learning_rate": 9.17690516262148e-07, "loss": 0.0152, "step": 6390 }, { "epoch": 2.47138437741686, "grad_norm": 0.2018942576127493, "learning_rate": 9.163919081809986e-07, "loss": 0.0165, "step": 6391 }, { "epoch": 2.471771075019335, "grad_norm": 0.21513076907167078, "learning_rate": 9.150941268686964e-07, "loss": 0.0106, "step": 6392 }, { "epoch": 2.47215777262181, "grad_norm": 0.2539942010949675, "learning_rate": 9.13797172587994e-07, "loss": 0.0224, "step": 6393 }, { "epoch": 2.4725444702242845, "grad_norm": 0.32898725230278514, "learning_rate": 9.12501045601471e-07, "loss": 0.0258, "step": 6394 }, { "epoch": 2.4729311678267596, "grad_norm": 0.25345946638630384, "learning_rate": 9.112057461715429e-07, "loss": 0.013, "step": 6395 }, { "epoch": 2.4733178654292343, "grad_norm": 0.2530582539591371, "learning_rate": 9.099112745604554e-07, "loss": 0.0132, "step": 6396 }, { "epoch": 2.4737045630317094, "grad_norm": 0.33522459355746115, "learning_rate": 9.086176310302874e-07, "loss": 0.0192, "step": 6397 }, { "epoch": 2.474091260634184, "grad_norm": 0.2503400493241477, "learning_rate": 9.073248158429526e-07, "loss": 0.02, "step": 6398 }, { "epoch": 2.474477958236659, "grad_norm": 0.21606558469991036, "learning_rate": 9.060328292601939e-07, "loss": 0.0145, "step": 6399 }, { "epoch": 2.474864655839134, "grad_norm": 0.24254709723251003, "learning_rate": 9.047416715435864e-07, "loss": 0.0146, "step": 6400 }, { "epoch": 2.4752513534416085, "grad_norm": 0.2320572203661794, "learning_rate": 9.034513429545383e-07, "loss": 0.0116, "step": 6401 }, { "epoch": 2.4756380510440836, "grad_norm": 0.28513834187122367, "learning_rate": 9.021618437542917e-07, "loss": 0.0178, "step": 6402 }, { "epoch": 2.4760247486465583, "grad_norm": 0.24062484819908134, "learning_rate": 9.008731742039195e-07, "loss": 0.0123, "step": 6403 }, { "epoch": 2.4764114462490334, "grad_norm": 0.40177342202146454, "learning_rate": 8.99585334564323e-07, "loss": 0.0311, "step": 6404 }, { "epoch": 2.476798143851508, "grad_norm": 0.270605232600921, "learning_rate": 8.982983250962413e-07, "loss": 0.016, "step": 6405 }, { "epoch": 2.477184841453983, "grad_norm": 0.33487910809440624, "learning_rate": 8.970121460602421e-07, "loss": 0.0271, "step": 6406 }, { "epoch": 2.477571539056458, "grad_norm": 0.18316292340871007, "learning_rate": 8.957267977167256e-07, "loss": 0.0113, "step": 6407 }, { "epoch": 2.4779582366589326, "grad_norm": 0.30578545118355843, "learning_rate": 8.944422803259228e-07, "loss": 0.021, "step": 6408 }, { "epoch": 2.4783449342614077, "grad_norm": 0.35302862214001385, "learning_rate": 8.931585941479004e-07, "loss": 0.0181, "step": 6409 }, { "epoch": 2.4787316318638823, "grad_norm": 0.4554734691864372, "learning_rate": 8.918757394425526e-07, "loss": 0.0318, "step": 6410 }, { "epoch": 2.4791183294663575, "grad_norm": 0.2683819653444124, "learning_rate": 8.90593716469606e-07, "loss": 0.0134, "step": 6411 }, { "epoch": 2.479505027068832, "grad_norm": 0.2760643603001147, "learning_rate": 8.893125254886204e-07, "loss": 0.0208, "step": 6412 }, { "epoch": 2.4798917246713073, "grad_norm": 0.33635000649721664, "learning_rate": 8.880321667589842e-07, "loss": 0.0176, "step": 6413 }, { "epoch": 2.480278422273782, "grad_norm": 0.24649911866730523, "learning_rate": 8.867526405399218e-07, "loss": 0.0142, "step": 6414 }, { "epoch": 2.4806651198762566, "grad_norm": 0.3207866599599188, "learning_rate": 8.854739470904861e-07, "loss": 0.0253, "step": 6415 }, { "epoch": 2.4810518174787317, "grad_norm": 0.19363991602316757, "learning_rate": 8.841960866695615e-07, "loss": 0.0105, "step": 6416 }, { "epoch": 2.4814385150812064, "grad_norm": 0.3842720219324367, "learning_rate": 8.829190595358639e-07, "loss": 0.03, "step": 6417 }, { "epoch": 2.4818252126836815, "grad_norm": 0.3049126214666256, "learning_rate": 8.816428659479393e-07, "loss": 0.0173, "step": 6418 }, { "epoch": 2.482211910286156, "grad_norm": 0.3518710226803727, "learning_rate": 8.803675061641692e-07, "loss": 0.0186, "step": 6419 }, { "epoch": 2.4825986078886313, "grad_norm": 0.19540945830417086, "learning_rate": 8.790929804427623e-07, "loss": 0.0127, "step": 6420 }, { "epoch": 2.482985305491106, "grad_norm": 0.24046991980648177, "learning_rate": 8.778192890417586e-07, "loss": 0.0153, "step": 6421 }, { "epoch": 2.4833720030935806, "grad_norm": 0.3454607158793431, "learning_rate": 8.765464322190315e-07, "loss": 0.0267, "step": 6422 }, { "epoch": 2.4837587006960558, "grad_norm": 0.30579469942963855, "learning_rate": 8.752744102322824e-07, "loss": 0.0154, "step": 6423 }, { "epoch": 2.4841453982985304, "grad_norm": 0.244037270763043, "learning_rate": 8.740032233390483e-07, "loss": 0.0188, "step": 6424 }, { "epoch": 2.4845320959010055, "grad_norm": 0.25212047901234075, "learning_rate": 8.727328717966899e-07, "loss": 0.0208, "step": 6425 }, { "epoch": 2.48491879350348, "grad_norm": 0.2500738764202834, "learning_rate": 8.714633558624069e-07, "loss": 0.0209, "step": 6426 }, { "epoch": 2.4853054911059553, "grad_norm": 0.2509464341641821, "learning_rate": 8.701946757932234e-07, "loss": 0.0129, "step": 6427 }, { "epoch": 2.48569218870843, "grad_norm": 0.3000805718220735, "learning_rate": 8.689268318460004e-07, "loss": 0.0215, "step": 6428 }, { "epoch": 2.4860788863109047, "grad_norm": 0.20877818828042968, "learning_rate": 8.676598242774221e-07, "loss": 0.0165, "step": 6429 }, { "epoch": 2.48646558391338, "grad_norm": 0.24678427096771793, "learning_rate": 8.663936533440082e-07, "loss": 0.0158, "step": 6430 }, { "epoch": 2.4868522815158545, "grad_norm": 0.32581565466300577, "learning_rate": 8.651283193021092e-07, "loss": 0.0171, "step": 6431 }, { "epoch": 2.4872389791183296, "grad_norm": 0.26964213884471033, "learning_rate": 8.638638224079055e-07, "loss": 0.0201, "step": 6432 }, { "epoch": 2.4876256767208043, "grad_norm": 0.35065213741087664, "learning_rate": 8.626001629174068e-07, "loss": 0.0337, "step": 6433 }, { "epoch": 2.4880123743232794, "grad_norm": 0.2518528277712195, "learning_rate": 8.613373410864528e-07, "loss": 0.0153, "step": 6434 }, { "epoch": 2.488399071925754, "grad_norm": 0.24673380061741934, "learning_rate": 8.600753571707171e-07, "loss": 0.0151, "step": 6435 }, { "epoch": 2.4887857695282287, "grad_norm": 0.251900509304579, "learning_rate": 8.588142114257009e-07, "loss": 0.0194, "step": 6436 }, { "epoch": 2.489172467130704, "grad_norm": 0.20917815169245182, "learning_rate": 8.57553904106736e-07, "loss": 0.0152, "step": 6437 }, { "epoch": 2.4895591647331785, "grad_norm": 0.21220442524238342, "learning_rate": 8.562944354689845e-07, "loss": 0.0213, "step": 6438 }, { "epoch": 2.4899458623356536, "grad_norm": 0.2228974010710126, "learning_rate": 8.550358057674379e-07, "loss": 0.0172, "step": 6439 }, { "epoch": 2.4903325599381283, "grad_norm": 0.3049512355887204, "learning_rate": 8.537780152569208e-07, "loss": 0.0245, "step": 6440 }, { "epoch": 2.4907192575406034, "grad_norm": 0.24406292413123393, "learning_rate": 8.525210641920855e-07, "loss": 0.0212, "step": 6441 }, { "epoch": 2.491105955143078, "grad_norm": 0.3344979793597817, "learning_rate": 8.512649528274142e-07, "loss": 0.0211, "step": 6442 }, { "epoch": 2.4914926527455528, "grad_norm": 0.2083980064718557, "learning_rate": 8.500096814172204e-07, "loss": 0.0118, "step": 6443 }, { "epoch": 2.491879350348028, "grad_norm": 0.28053135287514946, "learning_rate": 8.487552502156443e-07, "loss": 0.0225, "step": 6444 }, { "epoch": 2.4922660479505025, "grad_norm": 0.2799308817870498, "learning_rate": 8.475016594766639e-07, "loss": 0.0156, "step": 6445 }, { "epoch": 2.4926527455529777, "grad_norm": 0.2418252485498337, "learning_rate": 8.462489094540749e-07, "loss": 0.0155, "step": 6446 }, { "epoch": 2.4930394431554523, "grad_norm": 0.22955648935351608, "learning_rate": 8.449970004015146e-07, "loss": 0.0176, "step": 6447 }, { "epoch": 2.4934261407579275, "grad_norm": 0.3302270172718116, "learning_rate": 8.437459325724423e-07, "loss": 0.0267, "step": 6448 }, { "epoch": 2.493812838360402, "grad_norm": 0.31758141226605063, "learning_rate": 8.424957062201527e-07, "loss": 0.0203, "step": 6449 }, { "epoch": 2.494199535962877, "grad_norm": 0.38272409156233256, "learning_rate": 8.412463215977646e-07, "loss": 0.0331, "step": 6450 }, { "epoch": 2.494586233565352, "grad_norm": 0.27140310011420676, "learning_rate": 8.399977789582275e-07, "loss": 0.0216, "step": 6451 }, { "epoch": 2.4949729311678266, "grad_norm": 0.2707831951196283, "learning_rate": 8.387500785543251e-07, "loss": 0.0225, "step": 6452 }, { "epoch": 2.4953596287703017, "grad_norm": 0.24611953584677515, "learning_rate": 8.375032206386658e-07, "loss": 0.0178, "step": 6453 }, { "epoch": 2.4957463263727764, "grad_norm": 0.297084392704454, "learning_rate": 8.362572054636892e-07, "loss": 0.0159, "step": 6454 }, { "epoch": 2.4961330239752515, "grad_norm": 0.31664503311584746, "learning_rate": 8.350120332816642e-07, "loss": 0.0194, "step": 6455 }, { "epoch": 2.496519721577726, "grad_norm": 0.2532156582500114, "learning_rate": 8.337677043446868e-07, "loss": 0.0152, "step": 6456 }, { "epoch": 2.496906419180201, "grad_norm": 0.2981209833398677, "learning_rate": 8.325242189046872e-07, "loss": 0.0222, "step": 6457 }, { "epoch": 2.497293116782676, "grad_norm": 0.292027737284995, "learning_rate": 8.312815772134208e-07, "loss": 0.0242, "step": 6458 }, { "epoch": 2.4976798143851506, "grad_norm": 0.2658017535477079, "learning_rate": 8.30039779522473e-07, "loss": 0.0181, "step": 6459 }, { "epoch": 2.4980665119876257, "grad_norm": 0.2604415224134365, "learning_rate": 8.28798826083258e-07, "loss": 0.0171, "step": 6460 }, { "epoch": 2.4984532095901004, "grad_norm": 0.31306356569526017, "learning_rate": 8.275587171470212e-07, "loss": 0.0265, "step": 6461 }, { "epoch": 2.4988399071925755, "grad_norm": 0.27922728329570223, "learning_rate": 8.263194529648355e-07, "loss": 0.0198, "step": 6462 }, { "epoch": 2.49922660479505, "grad_norm": 0.25478402325839067, "learning_rate": 8.25081033787602e-07, "loss": 0.0234, "step": 6463 }, { "epoch": 2.4996133023975253, "grad_norm": 0.1951657747331994, "learning_rate": 8.238434598660516e-07, "loss": 0.0125, "step": 6464 }, { "epoch": 2.5, "grad_norm": 0.31157308006331935, "learning_rate": 8.22606731450743e-07, "loss": 0.0233, "step": 6465 }, { "epoch": 2.500386697602475, "grad_norm": 0.24640060635512082, "learning_rate": 8.213708487920674e-07, "loss": 0.0218, "step": 6466 }, { "epoch": 2.50077339520495, "grad_norm": 0.2626095233907258, "learning_rate": 8.201358121402397e-07, "loss": 0.0117, "step": 6467 }, { "epoch": 2.5011600928074245, "grad_norm": 0.30398383645916793, "learning_rate": 8.189016217453072e-07, "loss": 0.0254, "step": 6468 }, { "epoch": 2.5015467904098996, "grad_norm": 0.22682915780624785, "learning_rate": 8.176682778571438e-07, "loss": 0.0143, "step": 6469 }, { "epoch": 2.5019334880123743, "grad_norm": 0.23951448180651835, "learning_rate": 8.16435780725452e-07, "loss": 0.0187, "step": 6470 }, { "epoch": 2.502320185614849, "grad_norm": 0.19484912670061097, "learning_rate": 8.152041305997671e-07, "loss": 0.0104, "step": 6471 }, { "epoch": 2.502706883217324, "grad_norm": 0.22050812713963083, "learning_rate": 8.139733277294443e-07, "loss": 0.0112, "step": 6472 }, { "epoch": 2.503093580819799, "grad_norm": 0.26355996913488816, "learning_rate": 8.127433723636763e-07, "loss": 0.0194, "step": 6473 }, { "epoch": 2.503480278422274, "grad_norm": 0.2812404363275622, "learning_rate": 8.115142647514778e-07, "loss": 0.0142, "step": 6474 }, { "epoch": 2.5038669760247485, "grad_norm": 0.3300575277905311, "learning_rate": 8.102860051416988e-07, "loss": 0.0199, "step": 6475 }, { "epoch": 2.5042536736272236, "grad_norm": 0.2672109815594007, "learning_rate": 8.090585937830081e-07, "loss": 0.016, "step": 6476 }, { "epoch": 2.5046403712296983, "grad_norm": 0.23310259751615808, "learning_rate": 8.078320309239096e-07, "loss": 0.0154, "step": 6477 }, { "epoch": 2.505027068832173, "grad_norm": 0.2746151033005804, "learning_rate": 8.066063168127348e-07, "loss": 0.0118, "step": 6478 }, { "epoch": 2.505413766434648, "grad_norm": 0.46654458397280485, "learning_rate": 8.05381451697641e-07, "loss": 0.0221, "step": 6479 }, { "epoch": 2.505800464037123, "grad_norm": 0.29125240530656893, "learning_rate": 8.041574358266158e-07, "loss": 0.02, "step": 6480 }, { "epoch": 2.506187161639598, "grad_norm": 0.23524276880339756, "learning_rate": 8.029342694474718e-07, "loss": 0.0133, "step": 6481 }, { "epoch": 2.5065738592420725, "grad_norm": 0.28118277078984005, "learning_rate": 8.017119528078549e-07, "loss": 0.0181, "step": 6482 }, { "epoch": 2.5069605568445477, "grad_norm": 0.3742664037018786, "learning_rate": 8.004904861552343e-07, "loss": 0.0245, "step": 6483 }, { "epoch": 2.5073472544470223, "grad_norm": 0.2593599371046639, "learning_rate": 7.992698697369084e-07, "loss": 0.0137, "step": 6484 }, { "epoch": 2.5077339520494975, "grad_norm": 0.30683888936069975, "learning_rate": 7.980501038000039e-07, "loss": 0.0171, "step": 6485 }, { "epoch": 2.508120649651972, "grad_norm": 0.4309467253523246, "learning_rate": 7.968311885914737e-07, "loss": 0.019, "step": 6486 }, { "epoch": 2.5085073472544472, "grad_norm": 0.28352836852886204, "learning_rate": 7.956131243581023e-07, "loss": 0.0156, "step": 6487 }, { "epoch": 2.508894044856922, "grad_norm": 0.2723086574597286, "learning_rate": 7.943959113464994e-07, "loss": 0.0197, "step": 6488 }, { "epoch": 2.5092807424593966, "grad_norm": 0.20459939948445283, "learning_rate": 7.931795498030992e-07, "loss": 0.0127, "step": 6489 }, { "epoch": 2.5096674400618717, "grad_norm": 0.21158817089820692, "learning_rate": 7.919640399741696e-07, "loss": 0.0171, "step": 6490 }, { "epoch": 2.5100541376643464, "grad_norm": 0.504174044644194, "learning_rate": 7.907493821058015e-07, "loss": 0.033, "step": 6491 }, { "epoch": 2.5104408352668215, "grad_norm": 0.29508706402687124, "learning_rate": 7.895355764439178e-07, "loss": 0.0151, "step": 6492 }, { "epoch": 2.510827532869296, "grad_norm": 0.3176347797626717, "learning_rate": 7.883226232342623e-07, "loss": 0.0192, "step": 6493 }, { "epoch": 2.5112142304717713, "grad_norm": 0.3107276960211019, "learning_rate": 7.871105227224129e-07, "loss": 0.0208, "step": 6494 }, { "epoch": 2.511600928074246, "grad_norm": 0.25195106194337014, "learning_rate": 7.858992751537703e-07, "loss": 0.0151, "step": 6495 }, { "epoch": 2.5119876256767206, "grad_norm": 0.28188430468454595, "learning_rate": 7.846888807735653e-07, "loss": 0.0159, "step": 6496 }, { "epoch": 2.5123743232791957, "grad_norm": 0.2306351209764461, "learning_rate": 7.834793398268542e-07, "loss": 0.0209, "step": 6497 }, { "epoch": 2.5127610208816704, "grad_norm": 0.21314799288151773, "learning_rate": 7.822706525585194e-07, "loss": 0.0186, "step": 6498 }, { "epoch": 2.5131477184841455, "grad_norm": 0.3352885577764858, "learning_rate": 7.810628192132752e-07, "loss": 0.0158, "step": 6499 }, { "epoch": 2.51353441608662, "grad_norm": 0.3169052269993867, "learning_rate": 7.798558400356593e-07, "loss": 0.0252, "step": 6500 }, { "epoch": 2.5139211136890953, "grad_norm": 0.28501397861612204, "learning_rate": 7.786497152700357e-07, "loss": 0.0207, "step": 6501 }, { "epoch": 2.51430781129157, "grad_norm": 0.324289091964346, "learning_rate": 7.774444451605983e-07, "loss": 0.0165, "step": 6502 }, { "epoch": 2.5146945088940447, "grad_norm": 0.3135917646088473, "learning_rate": 7.762400299513645e-07, "loss": 0.0176, "step": 6503 }, { "epoch": 2.51508120649652, "grad_norm": 0.3828155179785435, "learning_rate": 7.750364698861834e-07, "loss": 0.0203, "step": 6504 }, { "epoch": 2.5154679040989945, "grad_norm": 0.3160773747000635, "learning_rate": 7.738337652087274e-07, "loss": 0.0242, "step": 6505 }, { "epoch": 2.5158546017014696, "grad_norm": 0.23860356321241796, "learning_rate": 7.726319161624956e-07, "loss": 0.0152, "step": 6506 }, { "epoch": 2.5162412993039442, "grad_norm": 0.21382540912306464, "learning_rate": 7.71430922990814e-07, "loss": 0.0152, "step": 6507 }, { "epoch": 2.5166279969064194, "grad_norm": 0.32954329089306333, "learning_rate": 7.702307859368391e-07, "loss": 0.0305, "step": 6508 }, { "epoch": 2.517014694508894, "grad_norm": 0.27757747358064516, "learning_rate": 7.690315052435505e-07, "loss": 0.0222, "step": 6509 }, { "epoch": 2.5174013921113687, "grad_norm": 0.32750686051492833, "learning_rate": 7.678330811537515e-07, "loss": 0.0231, "step": 6510 }, { "epoch": 2.517788089713844, "grad_norm": 0.19646591389259724, "learning_rate": 7.666355139100795e-07, "loss": 0.0127, "step": 6511 }, { "epoch": 2.5181747873163185, "grad_norm": 0.2720425091847505, "learning_rate": 7.654388037549914e-07, "loss": 0.0175, "step": 6512 }, { "epoch": 2.5185614849187936, "grad_norm": 0.252337574948169, "learning_rate": 7.642429509307775e-07, "loss": 0.0178, "step": 6513 }, { "epoch": 2.5189481825212683, "grad_norm": 0.23236025536531751, "learning_rate": 7.630479556795462e-07, "loss": 0.018, "step": 6514 }, { "epoch": 2.5193348801237434, "grad_norm": 0.2536242241063786, "learning_rate": 7.618538182432394e-07, "loss": 0.0196, "step": 6515 }, { "epoch": 2.519721577726218, "grad_norm": 0.21172874602922295, "learning_rate": 7.606605388636224e-07, "loss": 0.0153, "step": 6516 }, { "epoch": 2.5201082753286927, "grad_norm": 0.2894527201455786, "learning_rate": 7.594681177822866e-07, "loss": 0.019, "step": 6517 }, { "epoch": 2.520494972931168, "grad_norm": 0.20976084880456825, "learning_rate": 7.582765552406496e-07, "loss": 0.0134, "step": 6518 }, { "epoch": 2.5208816705336425, "grad_norm": 0.23559118590497585, "learning_rate": 7.570858514799545e-07, "loss": 0.0213, "step": 6519 }, { "epoch": 2.5212683681361177, "grad_norm": 0.2438564806203088, "learning_rate": 7.558960067412746e-07, "loss": 0.0199, "step": 6520 }, { "epoch": 2.5216550657385923, "grad_norm": 0.3904132659365734, "learning_rate": 7.54707021265505e-07, "loss": 0.0264, "step": 6521 }, { "epoch": 2.5220417633410674, "grad_norm": 0.24662980392339126, "learning_rate": 7.535188952933681e-07, "loss": 0.0217, "step": 6522 }, { "epoch": 2.522428460943542, "grad_norm": 0.23669994244487763, "learning_rate": 7.523316290654121e-07, "loss": 0.0107, "step": 6523 }, { "epoch": 2.522815158546017, "grad_norm": 0.23267665061385895, "learning_rate": 7.5114522282201e-07, "loss": 0.0156, "step": 6524 }, { "epoch": 2.523201856148492, "grad_norm": 0.21864470720927764, "learning_rate": 7.499596768033646e-07, "loss": 0.0143, "step": 6525 }, { "epoch": 2.5235885537509666, "grad_norm": 0.2300310392753397, "learning_rate": 7.487749912495012e-07, "loss": 0.0153, "step": 6526 }, { "epoch": 2.5239752513534417, "grad_norm": 0.2168853553616679, "learning_rate": 7.475911664002716e-07, "loss": 0.0156, "step": 6527 }, { "epoch": 2.5243619489559164, "grad_norm": 0.26680674555388434, "learning_rate": 7.464082024953534e-07, "loss": 0.0202, "step": 6528 }, { "epoch": 2.5247486465583915, "grad_norm": 0.3356204911942597, "learning_rate": 7.452260997742484e-07, "loss": 0.0213, "step": 6529 }, { "epoch": 2.525135344160866, "grad_norm": 0.26761633597125395, "learning_rate": 7.440448584762877e-07, "loss": 0.0203, "step": 6530 }, { "epoch": 2.525522041763341, "grad_norm": 0.43119828757339695, "learning_rate": 7.428644788406253e-07, "loss": 0.0138, "step": 6531 }, { "epoch": 2.525908739365816, "grad_norm": 0.27874850973674636, "learning_rate": 7.416849611062415e-07, "loss": 0.0192, "step": 6532 }, { "epoch": 2.526295436968291, "grad_norm": 0.2955786402650893, "learning_rate": 7.405063055119399e-07, "loss": 0.0206, "step": 6533 }, { "epoch": 2.5266821345707657, "grad_norm": 0.2328349428032751, "learning_rate": 7.393285122963551e-07, "loss": 0.0151, "step": 6534 }, { "epoch": 2.5270688321732404, "grad_norm": 0.24144674172256367, "learning_rate": 7.381515816979423e-07, "loss": 0.0175, "step": 6535 }, { "epoch": 2.5274555297757155, "grad_norm": 0.30371674972257645, "learning_rate": 7.3697551395498e-07, "loss": 0.0156, "step": 6536 }, { "epoch": 2.52784222737819, "grad_norm": 0.33435741387611895, "learning_rate": 7.358003093055799e-07, "loss": 0.021, "step": 6537 }, { "epoch": 2.528228924980665, "grad_norm": 0.6002154414866435, "learning_rate": 7.346259679876705e-07, "loss": 0.024, "step": 6538 }, { "epoch": 2.52861562258314, "grad_norm": 0.2962434934310736, "learning_rate": 7.334524902390145e-07, "loss": 0.0263, "step": 6539 }, { "epoch": 2.529002320185615, "grad_norm": 0.23516251788168094, "learning_rate": 7.322798762971889e-07, "loss": 0.02, "step": 6540 }, { "epoch": 2.5293890177880898, "grad_norm": 0.6348623735984615, "learning_rate": 7.311081263996051e-07, "loss": 0.0298, "step": 6541 }, { "epoch": 2.5297757153905645, "grad_norm": 0.25437661747044454, "learning_rate": 7.299372407834954e-07, "loss": 0.0176, "step": 6542 }, { "epoch": 2.5301624129930396, "grad_norm": 0.4187296326338917, "learning_rate": 7.287672196859169e-07, "loss": 0.0268, "step": 6543 }, { "epoch": 2.5305491105955142, "grad_norm": 0.4455100669096287, "learning_rate": 7.275980633437535e-07, "loss": 0.0445, "step": 6544 }, { "epoch": 2.530935808197989, "grad_norm": 0.17353057885330606, "learning_rate": 7.264297719937119e-07, "loss": 0.0133, "step": 6545 }, { "epoch": 2.531322505800464, "grad_norm": 0.265656974333914, "learning_rate": 7.252623458723263e-07, "loss": 0.0233, "step": 6546 }, { "epoch": 2.531709203402939, "grad_norm": 0.304607040257736, "learning_rate": 7.24095785215953e-07, "loss": 0.0206, "step": 6547 }, { "epoch": 2.532095901005414, "grad_norm": 0.24200717934204277, "learning_rate": 7.229300902607755e-07, "loss": 0.0198, "step": 6548 }, { "epoch": 2.5324825986078885, "grad_norm": 0.38579890197455863, "learning_rate": 7.217652612428e-07, "loss": 0.0295, "step": 6549 }, { "epoch": 2.5328692962103636, "grad_norm": 0.4543038742564152, "learning_rate": 7.206012983978566e-07, "loss": 0.0195, "step": 6550 }, { "epoch": 2.5332559938128383, "grad_norm": 0.3369699606974158, "learning_rate": 7.194382019616053e-07, "loss": 0.0252, "step": 6551 }, { "epoch": 2.533642691415313, "grad_norm": 0.2664690072321662, "learning_rate": 7.182759721695248e-07, "loss": 0.0172, "step": 6552 }, { "epoch": 2.534029389017788, "grad_norm": 0.24472103638615156, "learning_rate": 7.171146092569209e-07, "loss": 0.0183, "step": 6553 }, { "epoch": 2.534416086620263, "grad_norm": 0.291362233756338, "learning_rate": 7.159541134589227e-07, "loss": 0.0179, "step": 6554 }, { "epoch": 2.534802784222738, "grad_norm": 0.33541039754586904, "learning_rate": 7.147944850104865e-07, "loss": 0.0209, "step": 6555 }, { "epoch": 2.5351894818252125, "grad_norm": 0.2532195432705478, "learning_rate": 7.136357241463909e-07, "loss": 0.0153, "step": 6556 }, { "epoch": 2.5355761794276876, "grad_norm": 0.2372309212329382, "learning_rate": 7.124778311012371e-07, "loss": 0.0167, "step": 6557 }, { "epoch": 2.5359628770301623, "grad_norm": 0.27066540470340894, "learning_rate": 7.11320806109454e-07, "loss": 0.0158, "step": 6558 }, { "epoch": 2.5363495746326374, "grad_norm": 0.2317285969425788, "learning_rate": 7.101646494052922e-07, "loss": 0.0172, "step": 6559 }, { "epoch": 2.536736272235112, "grad_norm": 0.2865447922736939, "learning_rate": 7.090093612228311e-07, "loss": 0.0233, "step": 6560 }, { "epoch": 2.5371229698375872, "grad_norm": 0.30310007927976446, "learning_rate": 7.078549417959657e-07, "loss": 0.0189, "step": 6561 }, { "epoch": 2.537509667440062, "grad_norm": 0.32842597595060896, "learning_rate": 7.067013913584236e-07, "loss": 0.0252, "step": 6562 }, { "epoch": 2.5378963650425366, "grad_norm": 0.2347874894666668, "learning_rate": 7.055487101437524e-07, "loss": 0.0171, "step": 6563 }, { "epoch": 2.5382830626450117, "grad_norm": 0.26090179372431366, "learning_rate": 7.043968983853245e-07, "loss": 0.0235, "step": 6564 }, { "epoch": 2.5386697602474864, "grad_norm": 0.32170971224209954, "learning_rate": 7.032459563163357e-07, "loss": 0.0161, "step": 6565 }, { "epoch": 2.5390564578499615, "grad_norm": 0.5901903598583983, "learning_rate": 7.02095884169805e-07, "loss": 0.0207, "step": 6566 }, { "epoch": 2.539443155452436, "grad_norm": 0.2556229476618158, "learning_rate": 7.009466821785788e-07, "loss": 0.0161, "step": 6567 }, { "epoch": 2.5398298530549113, "grad_norm": 0.3308960032521955, "learning_rate": 6.997983505753236e-07, "loss": 0.0197, "step": 6568 }, { "epoch": 2.540216550657386, "grad_norm": 0.29500505431881946, "learning_rate": 6.986508895925321e-07, "loss": 0.0164, "step": 6569 }, { "epoch": 2.5406032482598606, "grad_norm": 0.2986733293562936, "learning_rate": 6.97504299462518e-07, "loss": 0.0224, "step": 6570 }, { "epoch": 2.5409899458623357, "grad_norm": 0.22783954074962404, "learning_rate": 6.963585804174211e-07, "loss": 0.0152, "step": 6571 }, { "epoch": 2.5413766434648104, "grad_norm": 0.23096364647301407, "learning_rate": 6.95213732689205e-07, "loss": 0.0178, "step": 6572 }, { "epoch": 2.5417633410672855, "grad_norm": 0.25064385598787, "learning_rate": 6.940697565096554e-07, "loss": 0.014, "step": 6573 }, { "epoch": 2.54215003866976, "grad_norm": 0.26592528358255013, "learning_rate": 6.929266521103817e-07, "loss": 0.017, "step": 6574 }, { "epoch": 2.5425367362722353, "grad_norm": 0.31704214240672757, "learning_rate": 6.91784419722818e-07, "loss": 0.0139, "step": 6575 }, { "epoch": 2.54292343387471, "grad_norm": 0.1956767915991591, "learning_rate": 6.906430595782199e-07, "loss": 0.0124, "step": 6576 }, { "epoch": 2.5433101314771847, "grad_norm": 0.20601182177128288, "learning_rate": 6.895025719076704e-07, "loss": 0.0152, "step": 6577 }, { "epoch": 2.5436968290796598, "grad_norm": 0.2262175385073028, "learning_rate": 6.883629569420686e-07, "loss": 0.0193, "step": 6578 }, { "epoch": 2.5440835266821344, "grad_norm": 0.27586777388620026, "learning_rate": 6.872242149121456e-07, "loss": 0.0153, "step": 6579 }, { "epoch": 2.5444702242846096, "grad_norm": 0.2999130233277652, "learning_rate": 6.86086346048448e-07, "loss": 0.0226, "step": 6580 }, { "epoch": 2.5448569218870842, "grad_norm": 0.30564039086580824, "learning_rate": 6.849493505813537e-07, "loss": 0.0198, "step": 6581 }, { "epoch": 2.5452436194895594, "grad_norm": 0.2515089094831796, "learning_rate": 6.838132287410554e-07, "loss": 0.0135, "step": 6582 }, { "epoch": 2.545630317092034, "grad_norm": 0.3095790933994073, "learning_rate": 6.826779807575728e-07, "loss": 0.0225, "step": 6583 }, { "epoch": 2.5460170146945087, "grad_norm": 0.3298091264827261, "learning_rate": 6.815436068607506e-07, "loss": 0.0203, "step": 6584 }, { "epoch": 2.546403712296984, "grad_norm": 0.319436184785225, "learning_rate": 6.804101072802538e-07, "loss": 0.0193, "step": 6585 }, { "epoch": 2.5467904098994585, "grad_norm": 0.20508358367096316, "learning_rate": 6.792774822455711e-07, "loss": 0.0109, "step": 6586 }, { "epoch": 2.5471771075019336, "grad_norm": 0.25586517442165857, "learning_rate": 6.781457319860135e-07, "loss": 0.0182, "step": 6587 }, { "epoch": 2.5475638051044083, "grad_norm": 0.23461392221609484, "learning_rate": 6.770148567307166e-07, "loss": 0.0181, "step": 6588 }, { "epoch": 2.5479505027068834, "grad_norm": 0.23178430540531111, "learning_rate": 6.758848567086379e-07, "loss": 0.0153, "step": 6589 }, { "epoch": 2.548337200309358, "grad_norm": 0.25157398107730655, "learning_rate": 6.747557321485571e-07, "loss": 0.0115, "step": 6590 }, { "epoch": 2.5487238979118327, "grad_norm": 0.2988537596871584, "learning_rate": 6.736274832790773e-07, "loss": 0.017, "step": 6591 }, { "epoch": 2.549110595514308, "grad_norm": 0.24269366883916219, "learning_rate": 6.725001103286233e-07, "loss": 0.0202, "step": 6592 }, { "epoch": 2.5494972931167825, "grad_norm": 0.2593223791867935, "learning_rate": 6.713736135254456e-07, "loss": 0.0171, "step": 6593 }, { "epoch": 2.5498839907192576, "grad_norm": 0.2601204197353235, "learning_rate": 6.702479930976135e-07, "loss": 0.0199, "step": 6594 }, { "epoch": 2.5502706883217323, "grad_norm": 0.28297111998708646, "learning_rate": 6.69123249273021e-07, "loss": 0.0183, "step": 6595 }, { "epoch": 2.5506573859242074, "grad_norm": 0.3251524992039758, "learning_rate": 6.67999382279384e-07, "loss": 0.0193, "step": 6596 }, { "epoch": 2.551044083526682, "grad_norm": 0.2908965197512948, "learning_rate": 6.668763923442406e-07, "loss": 0.0203, "step": 6597 }, { "epoch": 2.5514307811291568, "grad_norm": 0.23009510793168272, "learning_rate": 6.657542796949528e-07, "loss": 0.0159, "step": 6598 }, { "epoch": 2.551817478731632, "grad_norm": 0.2407961986718522, "learning_rate": 6.646330445587041e-07, "loss": 0.0184, "step": 6599 }, { "epoch": 2.5522041763341066, "grad_norm": 0.37065363122341627, "learning_rate": 6.635126871624992e-07, "loss": 0.0305, "step": 6600 }, { "epoch": 2.5525908739365817, "grad_norm": 0.5308534149146189, "learning_rate": 6.623932077331669e-07, "loss": 0.0217, "step": 6601 }, { "epoch": 2.5529775715390564, "grad_norm": 0.3157523348449327, "learning_rate": 6.612746064973552e-07, "loss": 0.0162, "step": 6602 }, { "epoch": 2.5533642691415315, "grad_norm": 0.253577409078358, "learning_rate": 6.601568836815414e-07, "loss": 0.0186, "step": 6603 }, { "epoch": 2.553750966744006, "grad_norm": 0.3438975999312267, "learning_rate": 6.590400395120144e-07, "loss": 0.0308, "step": 6604 }, { "epoch": 2.554137664346481, "grad_norm": 0.38815679710927853, "learning_rate": 6.579240742148945e-07, "loss": 0.0273, "step": 6605 }, { "epoch": 2.554524361948956, "grad_norm": 0.2860437022496927, "learning_rate": 6.568089880161183e-07, "loss": 0.0365, "step": 6606 }, { "epoch": 2.554911059551431, "grad_norm": 0.23670944713944206, "learning_rate": 6.556947811414499e-07, "loss": 0.0157, "step": 6607 }, { "epoch": 2.5552977571539057, "grad_norm": 0.35181287576100534, "learning_rate": 6.54581453816469e-07, "loss": 0.0271, "step": 6608 }, { "epoch": 2.5556844547563804, "grad_norm": 0.29210618998942733, "learning_rate": 6.534690062665805e-07, "loss": 0.0159, "step": 6609 }, { "epoch": 2.5560711523588555, "grad_norm": 0.3405288378754863, "learning_rate": 6.523574387170123e-07, "loss": 0.0179, "step": 6610 }, { "epoch": 2.55645784996133, "grad_norm": 0.2277265474987057, "learning_rate": 6.512467513928122e-07, "loss": 0.0136, "step": 6611 }, { "epoch": 2.556844547563805, "grad_norm": 0.30097470207096844, "learning_rate": 6.501369445188499e-07, "loss": 0.0212, "step": 6612 }, { "epoch": 2.55723124516628, "grad_norm": 0.2570456847293349, "learning_rate": 6.490280183198172e-07, "loss": 0.0183, "step": 6613 }, { "epoch": 2.557617942768755, "grad_norm": 0.2311158485992105, "learning_rate": 6.479199730202291e-07, "loss": 0.0104, "step": 6614 }, { "epoch": 2.5580046403712298, "grad_norm": 0.26438320742271887, "learning_rate": 6.468128088444197e-07, "loss": 0.0144, "step": 6615 }, { "epoch": 2.5583913379737044, "grad_norm": 0.2474866210222249, "learning_rate": 6.457065260165469e-07, "loss": 0.0221, "step": 6616 }, { "epoch": 2.5587780355761796, "grad_norm": 0.25430360147517833, "learning_rate": 6.446011247605882e-07, "loss": 0.0137, "step": 6617 }, { "epoch": 2.5591647331786542, "grad_norm": 0.29486273357303955, "learning_rate": 6.434966053003422e-07, "loss": 0.0193, "step": 6618 }, { "epoch": 2.559551430781129, "grad_norm": 0.34930201666919997, "learning_rate": 6.423929678594337e-07, "loss": 0.0353, "step": 6619 }, { "epoch": 2.559938128383604, "grad_norm": 0.20598956745641903, "learning_rate": 6.412902126613029e-07, "loss": 0.0108, "step": 6620 }, { "epoch": 2.560324825986079, "grad_norm": 0.2540029731683998, "learning_rate": 6.401883399292158e-07, "loss": 0.0126, "step": 6621 }, { "epoch": 2.560711523588554, "grad_norm": 0.37854065193954595, "learning_rate": 6.390873498862565e-07, "loss": 0.0182, "step": 6622 }, { "epoch": 2.5610982211910285, "grad_norm": 0.3203020346721458, "learning_rate": 6.379872427553318e-07, "loss": 0.025, "step": 6623 }, { "epoch": 2.5614849187935036, "grad_norm": 0.21471471296402003, "learning_rate": 6.368880187591725e-07, "loss": 0.0167, "step": 6624 }, { "epoch": 2.5618716163959783, "grad_norm": 0.23019350527827798, "learning_rate": 6.357896781203237e-07, "loss": 0.0158, "step": 6625 }, { "epoch": 2.562258313998453, "grad_norm": 0.5715218074769053, "learning_rate": 6.346922210611589e-07, "loss": 0.0237, "step": 6626 }, { "epoch": 2.562645011600928, "grad_norm": 0.2354386272176208, "learning_rate": 6.33595647803868e-07, "loss": 0.0138, "step": 6627 }, { "epoch": 2.563031709203403, "grad_norm": 0.3092607517577321, "learning_rate": 6.324999585704667e-07, "loss": 0.019, "step": 6628 }, { "epoch": 2.563418406805878, "grad_norm": 0.271375466222753, "learning_rate": 6.314051535827853e-07, "loss": 0.0178, "step": 6629 }, { "epoch": 2.5638051044083525, "grad_norm": 0.1902985316584679, "learning_rate": 6.303112330624783e-07, "loss": 0.0187, "step": 6630 }, { "epoch": 2.5641918020108276, "grad_norm": 0.5760534989805828, "learning_rate": 6.292181972310235e-07, "loss": 0.034, "step": 6631 }, { "epoch": 2.5645784996133023, "grad_norm": 0.24388428433975384, "learning_rate": 6.281260463097166e-07, "loss": 0.018, "step": 6632 }, { "epoch": 2.5649651972157774, "grad_norm": 0.3072986466496338, "learning_rate": 6.270347805196747e-07, "loss": 0.0242, "step": 6633 }, { "epoch": 2.565351894818252, "grad_norm": 0.34780418704056554, "learning_rate": 6.259444000818343e-07, "loss": 0.0209, "step": 6634 }, { "epoch": 2.565738592420727, "grad_norm": 0.22433655457139892, "learning_rate": 6.248549052169573e-07, "loss": 0.0134, "step": 6635 }, { "epoch": 2.566125290023202, "grad_norm": 0.2697184393873274, "learning_rate": 6.237662961456215e-07, "loss": 0.0148, "step": 6636 }, { "epoch": 2.5665119876256766, "grad_norm": 0.2176096396733496, "learning_rate": 6.226785730882273e-07, "loss": 0.0151, "step": 6637 }, { "epoch": 2.5668986852281517, "grad_norm": 0.2672236129556349, "learning_rate": 6.215917362649959e-07, "loss": 0.0133, "step": 6638 }, { "epoch": 2.5672853828306264, "grad_norm": 0.26915156747424773, "learning_rate": 6.205057858959667e-07, "loss": 0.0188, "step": 6639 }, { "epoch": 2.5676720804331015, "grad_norm": 0.3131965952176974, "learning_rate": 6.194207222010046e-07, "loss": 0.0213, "step": 6640 }, { "epoch": 2.568058778035576, "grad_norm": 0.2656946396244437, "learning_rate": 6.183365453997919e-07, "loss": 0.0145, "step": 6641 }, { "epoch": 2.5684454756380513, "grad_norm": 0.3097262064600956, "learning_rate": 6.172532557118282e-07, "loss": 0.0183, "step": 6642 }, { "epoch": 2.568832173240526, "grad_norm": 0.2956547350885377, "learning_rate": 6.161708533564398e-07, "loss": 0.0155, "step": 6643 }, { "epoch": 2.5692188708430006, "grad_norm": 0.2756073838325232, "learning_rate": 6.150893385527679e-07, "loss": 0.0182, "step": 6644 }, { "epoch": 2.5696055684454757, "grad_norm": 0.26696611737352954, "learning_rate": 6.140087115197802e-07, "loss": 0.0191, "step": 6645 }, { "epoch": 2.5699922660479504, "grad_norm": 1.0846555301297813, "learning_rate": 6.129289724762561e-07, "loss": 0.0242, "step": 6646 }, { "epoch": 2.5703789636504255, "grad_norm": 0.28524415784489154, "learning_rate": 6.118501216408035e-07, "loss": 0.022, "step": 6647 }, { "epoch": 2.5707656612529, "grad_norm": 0.3261849942346285, "learning_rate": 6.107721592318455e-07, "loss": 0.0188, "step": 6648 }, { "epoch": 2.5711523588553753, "grad_norm": 0.26712304269023224, "learning_rate": 6.09695085467627e-07, "loss": 0.0182, "step": 6649 }, { "epoch": 2.57153905645785, "grad_norm": 0.38743955723878776, "learning_rate": 6.086189005662135e-07, "loss": 0.0185, "step": 6650 }, { "epoch": 2.5719257540603246, "grad_norm": 0.3155759934938697, "learning_rate": 6.075436047454875e-07, "loss": 0.0224, "step": 6651 }, { "epoch": 2.5723124516627998, "grad_norm": 0.2681306223624347, "learning_rate": 6.064691982231563e-07, "loss": 0.0186, "step": 6652 }, { "epoch": 2.5726991492652744, "grad_norm": 0.3024861487599477, "learning_rate": 6.053956812167438e-07, "loss": 0.0179, "step": 6653 }, { "epoch": 2.5730858468677495, "grad_norm": 0.21102340714498755, "learning_rate": 6.043230539435952e-07, "loss": 0.0109, "step": 6654 }, { "epoch": 2.573472544470224, "grad_norm": 0.37464553810908113, "learning_rate": 6.032513166208737e-07, "loss": 0.0255, "step": 6655 }, { "epoch": 2.5738592420726993, "grad_norm": 0.2828655166698672, "learning_rate": 6.021804694655642e-07, "loss": 0.0152, "step": 6656 }, { "epoch": 2.574245939675174, "grad_norm": 0.28372750310775713, "learning_rate": 6.011105126944711e-07, "loss": 0.0164, "step": 6657 }, { "epoch": 2.5746326372776487, "grad_norm": 0.19805755586149204, "learning_rate": 6.000414465242188e-07, "loss": 0.0099, "step": 6658 }, { "epoch": 2.575019334880124, "grad_norm": 0.2585164947461087, "learning_rate": 5.989732711712498e-07, "loss": 0.0174, "step": 6659 }, { "epoch": 2.5754060324825985, "grad_norm": 0.3740268863128294, "learning_rate": 5.979059868518272e-07, "loss": 0.0289, "step": 6660 }, { "epoch": 2.5757927300850736, "grad_norm": 0.27278883754903793, "learning_rate": 5.968395937820348e-07, "loss": 0.0174, "step": 6661 }, { "epoch": 2.5761794276875483, "grad_norm": 0.4010562898539188, "learning_rate": 5.95774092177775e-07, "loss": 0.0211, "step": 6662 }, { "epoch": 2.5765661252900234, "grad_norm": 0.24231530916371533, "learning_rate": 5.947094822547689e-07, "loss": 0.0184, "step": 6663 }, { "epoch": 2.576952822892498, "grad_norm": 0.23314232310459027, "learning_rate": 5.936457642285576e-07, "loss": 0.0135, "step": 6664 }, { "epoch": 2.5773395204949727, "grad_norm": 0.30186472484496135, "learning_rate": 5.925829383145015e-07, "loss": 0.0229, "step": 6665 }, { "epoch": 2.577726218097448, "grad_norm": 1.435056943909763, "learning_rate": 5.915210047277825e-07, "loss": 0.0153, "step": 6666 }, { "epoch": 2.5781129156999225, "grad_norm": 0.19361063515155336, "learning_rate": 5.904599636833997e-07, "loss": 0.0094, "step": 6667 }, { "epoch": 2.5784996133023976, "grad_norm": 0.3059719808711998, "learning_rate": 5.893998153961705e-07, "loss": 0.0137, "step": 6668 }, { "epoch": 2.5788863109048723, "grad_norm": 0.2454071888786641, "learning_rate": 5.88340560080734e-07, "loss": 0.0157, "step": 6669 }, { "epoch": 2.5792730085073474, "grad_norm": 0.21808229356002806, "learning_rate": 5.872821979515464e-07, "loss": 0.0149, "step": 6670 }, { "epoch": 2.579659706109822, "grad_norm": 0.18152808691187333, "learning_rate": 5.862247292228862e-07, "loss": 0.0148, "step": 6671 }, { "epoch": 2.5800464037122968, "grad_norm": 0.674226898413356, "learning_rate": 5.851681541088461e-07, "loss": 0.022, "step": 6672 }, { "epoch": 2.580433101314772, "grad_norm": 0.26030722010805085, "learning_rate": 5.841124728233433e-07, "loss": 0.0234, "step": 6673 }, { "epoch": 2.5808197989172466, "grad_norm": 0.2359344003517858, "learning_rate": 5.83057685580109e-07, "loss": 0.0151, "step": 6674 }, { "epoch": 2.5812064965197217, "grad_norm": 0.23216471462566562, "learning_rate": 5.820037925926991e-07, "loss": 0.0108, "step": 6675 }, { "epoch": 2.5815931941221963, "grad_norm": 0.27776659670478177, "learning_rate": 5.809507940744819e-07, "loss": 0.0244, "step": 6676 }, { "epoch": 2.5819798917246715, "grad_norm": 0.26303555005469975, "learning_rate": 5.798986902386483e-07, "loss": 0.0228, "step": 6677 }, { "epoch": 2.582366589327146, "grad_norm": 0.2989943787120511, "learning_rate": 5.788474812982097e-07, "loss": 0.0218, "step": 6678 }, { "epoch": 2.582753286929621, "grad_norm": 0.2764424137785353, "learning_rate": 5.777971674659933e-07, "loss": 0.0158, "step": 6679 }, { "epoch": 2.583139984532096, "grad_norm": 0.2505415980985481, "learning_rate": 5.767477489546458e-07, "loss": 0.0207, "step": 6680 }, { "epoch": 2.583526682134571, "grad_norm": 0.23777018126028715, "learning_rate": 5.756992259766325e-07, "loss": 0.0134, "step": 6681 }, { "epoch": 2.5839133797370457, "grad_norm": 0.23533528271253146, "learning_rate": 5.746515987442375e-07, "loss": 0.0156, "step": 6682 }, { "epoch": 2.5843000773395204, "grad_norm": 0.2493196928156538, "learning_rate": 5.736048674695655e-07, "loss": 0.0235, "step": 6683 }, { "epoch": 2.5846867749419955, "grad_norm": 0.19557323434145035, "learning_rate": 5.725590323645369e-07, "loss": 0.0116, "step": 6684 }, { "epoch": 2.58507347254447, "grad_norm": 0.26746050775506325, "learning_rate": 5.71514093640892e-07, "loss": 0.0175, "step": 6685 }, { "epoch": 2.585460170146945, "grad_norm": 0.30717387611018976, "learning_rate": 5.704700515101886e-07, "loss": 0.0172, "step": 6686 }, { "epoch": 2.58584686774942, "grad_norm": 0.2993912634420093, "learning_rate": 5.694269061838054e-07, "loss": 0.0177, "step": 6687 }, { "epoch": 2.586233565351895, "grad_norm": 0.27412528051611285, "learning_rate": 5.683846578729391e-07, "loss": 0.0145, "step": 6688 }, { "epoch": 2.5866202629543698, "grad_norm": 0.22192845406383155, "learning_rate": 5.673433067885998e-07, "loss": 0.0147, "step": 6689 }, { "epoch": 2.5870069605568444, "grad_norm": 0.27606459798836547, "learning_rate": 5.663028531416226e-07, "loss": 0.0156, "step": 6690 }, { "epoch": 2.5873936581593195, "grad_norm": 0.2714864667464679, "learning_rate": 5.652632971426569e-07, "loss": 0.0219, "step": 6691 }, { "epoch": 2.587780355761794, "grad_norm": 0.2668598499074003, "learning_rate": 5.642246390021744e-07, "loss": 0.0148, "step": 6692 }, { "epoch": 2.588167053364269, "grad_norm": 0.2658356692214322, "learning_rate": 5.631868789304579e-07, "loss": 0.0183, "step": 6693 }, { "epoch": 2.588553750966744, "grad_norm": 0.26448239723949274, "learning_rate": 5.62150017137616e-07, "loss": 0.0184, "step": 6694 }, { "epoch": 2.588940448569219, "grad_norm": 0.23294434533323127, "learning_rate": 5.611140538335708e-07, "loss": 0.0165, "step": 6695 }, { "epoch": 2.589327146171694, "grad_norm": 0.17399042303867765, "learning_rate": 5.600789892280644e-07, "loss": 0.0094, "step": 6696 }, { "epoch": 2.5897138437741685, "grad_norm": 0.2628385372877009, "learning_rate": 5.590448235306561e-07, "loss": 0.0141, "step": 6697 }, { "epoch": 2.5901005413766436, "grad_norm": 0.24712882530428837, "learning_rate": 5.580115569507222e-07, "loss": 0.0167, "step": 6698 }, { "epoch": 2.5904872389791183, "grad_norm": 0.2742743310795481, "learning_rate": 5.569791896974602e-07, "loss": 0.0175, "step": 6699 }, { "epoch": 2.590873936581593, "grad_norm": 0.2839036192578962, "learning_rate": 5.559477219798831e-07, "loss": 0.0317, "step": 6700 }, { "epoch": 2.591260634184068, "grad_norm": 0.27202689253028506, "learning_rate": 5.549171540068222e-07, "loss": 0.0173, "step": 6701 }, { "epoch": 2.591647331786543, "grad_norm": 0.23616304391896745, "learning_rate": 5.538874859869259e-07, "loss": 0.0184, "step": 6702 }, { "epoch": 2.592034029389018, "grad_norm": 0.2130250157341296, "learning_rate": 5.52858718128661e-07, "loss": 0.016, "step": 6703 }, { "epoch": 2.5924207269914925, "grad_norm": 0.2449189119536611, "learning_rate": 5.518308506403142e-07, "loss": 0.0144, "step": 6704 }, { "epoch": 2.5928074245939676, "grad_norm": 0.2505680609544331, "learning_rate": 5.508038837299867e-07, "loss": 0.0166, "step": 6705 }, { "epoch": 2.5931941221964423, "grad_norm": 0.21839823565415808, "learning_rate": 5.497778176055985e-07, "loss": 0.013, "step": 6706 }, { "epoch": 2.593580819798917, "grad_norm": 0.39905917867999463, "learning_rate": 5.487526524748865e-07, "loss": 0.026, "step": 6707 }, { "epoch": 2.593967517401392, "grad_norm": 0.42421352914066157, "learning_rate": 5.477283885454077e-07, "loss": 0.0169, "step": 6708 }, { "epoch": 2.594354215003867, "grad_norm": 0.3363447332827542, "learning_rate": 5.467050260245355e-07, "loss": 0.0256, "step": 6709 }, { "epoch": 2.594740912606342, "grad_norm": 0.38638173019659827, "learning_rate": 5.456825651194569e-07, "loss": 0.0232, "step": 6710 }, { "epoch": 2.5951276102088165, "grad_norm": 0.26545981600024626, "learning_rate": 5.446610060371827e-07, "loss": 0.0206, "step": 6711 }, { "epoch": 2.5955143078112917, "grad_norm": 0.28760204416722346, "learning_rate": 5.436403489845366e-07, "loss": 0.0185, "step": 6712 }, { "epoch": 2.5959010054137663, "grad_norm": 0.2505318747389367, "learning_rate": 5.42620594168164e-07, "loss": 0.0136, "step": 6713 }, { "epoch": 2.5962877030162415, "grad_norm": 0.2155248469409335, "learning_rate": 5.416017417945202e-07, "loss": 0.0158, "step": 6714 }, { "epoch": 2.596674400618716, "grad_norm": 0.24268316136028356, "learning_rate": 5.405837920698864e-07, "loss": 0.0205, "step": 6715 }, { "epoch": 2.5970610982211912, "grad_norm": 0.28415973880111517, "learning_rate": 5.395667452003555e-07, "loss": 0.0238, "step": 6716 }, { "epoch": 2.597447795823666, "grad_norm": 0.24719667483802482, "learning_rate": 5.385506013918395e-07, "loss": 0.0113, "step": 6717 }, { "epoch": 2.5978344934261406, "grad_norm": 0.28511823289425575, "learning_rate": 5.375353608500672e-07, "loss": 0.0177, "step": 6718 }, { "epoch": 2.5982211910286157, "grad_norm": 0.23849918761888872, "learning_rate": 5.365210237805834e-07, "loss": 0.0167, "step": 6719 }, { "epoch": 2.5986078886310904, "grad_norm": 0.3065493438984574, "learning_rate": 5.35507590388753e-07, "loss": 0.0211, "step": 6720 }, { "epoch": 2.5989945862335655, "grad_norm": 0.356560734487989, "learning_rate": 5.344950608797555e-07, "loss": 0.0179, "step": 6721 }, { "epoch": 2.59938128383604, "grad_norm": 0.2415080921279333, "learning_rate": 5.334834354585883e-07, "loss": 0.0144, "step": 6722 }, { "epoch": 2.5997679814385153, "grad_norm": 0.28056991832465716, "learning_rate": 5.324727143300645e-07, "loss": 0.0163, "step": 6723 }, { "epoch": 2.60015467904099, "grad_norm": 0.25304468923684653, "learning_rate": 5.314628976988151e-07, "loss": 0.0162, "step": 6724 }, { "epoch": 2.6005413766434646, "grad_norm": 0.281709928866316, "learning_rate": 5.304539857692897e-07, "loss": 0.0188, "step": 6725 }, { "epoch": 2.6009280742459397, "grad_norm": 0.24087500349448498, "learning_rate": 5.294459787457512e-07, "loss": 0.0176, "step": 6726 }, { "epoch": 2.6013147718484144, "grad_norm": 0.2897779557201371, "learning_rate": 5.284388768322824e-07, "loss": 0.0217, "step": 6727 }, { "epoch": 2.6017014694508895, "grad_norm": 0.24457877501195202, "learning_rate": 5.274326802327801e-07, "loss": 0.0163, "step": 6728 }, { "epoch": 2.602088167053364, "grad_norm": 0.2538857108908113, "learning_rate": 5.264273891509597e-07, "loss": 0.0202, "step": 6729 }, { "epoch": 2.6024748646558393, "grad_norm": 0.23384346282629379, "learning_rate": 5.254230037903535e-07, "loss": 0.0182, "step": 6730 }, { "epoch": 2.602861562258314, "grad_norm": 0.27000861553031846, "learning_rate": 5.244195243543093e-07, "loss": 0.0155, "step": 6731 }, { "epoch": 2.6032482598607887, "grad_norm": 0.3155168689422197, "learning_rate": 5.234169510459924e-07, "loss": 0.0221, "step": 6732 }, { "epoch": 2.603634957463264, "grad_norm": 0.25340952945727374, "learning_rate": 5.224152840683827e-07, "loss": 0.0223, "step": 6733 }, { "epoch": 2.6040216550657385, "grad_norm": 0.24162086038185215, "learning_rate": 5.214145236242795e-07, "loss": 0.015, "step": 6734 }, { "epoch": 2.6044083526682136, "grad_norm": 0.2723746785549484, "learning_rate": 5.204146699162987e-07, "loss": 0.0215, "step": 6735 }, { "epoch": 2.6047950502706883, "grad_norm": 0.35575370197338574, "learning_rate": 5.194157231468666e-07, "loss": 0.0271, "step": 6736 }, { "epoch": 2.6051817478731634, "grad_norm": 0.3085226570731262, "learning_rate": 5.184176835182337e-07, "loss": 0.0156, "step": 6737 }, { "epoch": 2.605568445475638, "grad_norm": 0.30440062113141936, "learning_rate": 5.174205512324621e-07, "loss": 0.0191, "step": 6738 }, { "epoch": 2.6059551430781127, "grad_norm": 0.34723591264490067, "learning_rate": 5.164243264914337e-07, "loss": 0.023, "step": 6739 }, { "epoch": 2.606341840680588, "grad_norm": 0.2613082650579631, "learning_rate": 5.154290094968411e-07, "loss": 0.0203, "step": 6740 }, { "epoch": 2.6067285382830625, "grad_norm": 0.2143623044534145, "learning_rate": 5.144346004501993e-07, "loss": 0.0153, "step": 6741 }, { "epoch": 2.6071152358855376, "grad_norm": 0.21034310237125162, "learning_rate": 5.134410995528361e-07, "loss": 0.0104, "step": 6742 }, { "epoch": 2.6075019334880123, "grad_norm": 0.20847396624893025, "learning_rate": 5.12448507005896e-07, "loss": 0.0163, "step": 6743 }, { "epoch": 2.6078886310904874, "grad_norm": 0.4330256060579283, "learning_rate": 5.11456823010339e-07, "loss": 0.0303, "step": 6744 }, { "epoch": 2.608275328692962, "grad_norm": 0.2062035364207861, "learning_rate": 5.104660477669415e-07, "loss": 0.0111, "step": 6745 }, { "epoch": 2.6086620262954368, "grad_norm": 0.22752518402819505, "learning_rate": 5.094761814762983e-07, "loss": 0.0183, "step": 6746 }, { "epoch": 2.609048723897912, "grad_norm": 0.26137783118511, "learning_rate": 5.084872243388167e-07, "loss": 0.0213, "step": 6747 }, { "epoch": 2.6094354215003865, "grad_norm": 0.25657948164380834, "learning_rate": 5.074991765547215e-07, "loss": 0.0168, "step": 6748 }, { "epoch": 2.6098221191028617, "grad_norm": 0.34429326811320493, "learning_rate": 5.06512038324053e-07, "loss": 0.019, "step": 6749 }, { "epoch": 2.6102088167053363, "grad_norm": 0.21857142105057845, "learning_rate": 5.05525809846667e-07, "loss": 0.0112, "step": 6750 }, { "epoch": 2.6105955143078114, "grad_norm": 0.21183065020195782, "learning_rate": 5.045404913222379e-07, "loss": 0.0133, "step": 6751 }, { "epoch": 2.610982211910286, "grad_norm": 0.2522479004613218, "learning_rate": 5.035560829502517e-07, "loss": 0.016, "step": 6752 }, { "epoch": 2.611368909512761, "grad_norm": 0.32665478134142767, "learning_rate": 5.025725849300128e-07, "loss": 0.0213, "step": 6753 }, { "epoch": 2.611755607115236, "grad_norm": 0.21829224087314458, "learning_rate": 5.015899974606403e-07, "loss": 0.0119, "step": 6754 }, { "epoch": 2.612142304717711, "grad_norm": 0.3464859513402996, "learning_rate": 5.006083207410683e-07, "loss": 0.0241, "step": 6755 }, { "epoch": 2.6125290023201857, "grad_norm": 0.3430046541123561, "learning_rate": 4.996275549700508e-07, "loss": 0.0241, "step": 6756 }, { "epoch": 2.6129156999226604, "grad_norm": 0.24736481072802516, "learning_rate": 4.986477003461499e-07, "loss": 0.0218, "step": 6757 }, { "epoch": 2.6133023975251355, "grad_norm": 0.3273599259352192, "learning_rate": 4.976687570677497e-07, "loss": 0.0345, "step": 6758 }, { "epoch": 2.61368909512761, "grad_norm": 0.3386720365042712, "learning_rate": 4.966907253330461e-07, "loss": 0.0186, "step": 6759 }, { "epoch": 2.614075792730085, "grad_norm": 0.4058583459035851, "learning_rate": 4.957136053400546e-07, "loss": 0.0274, "step": 6760 }, { "epoch": 2.61446249033256, "grad_norm": 0.2672384749104275, "learning_rate": 4.947373972866004e-07, "loss": 0.0164, "step": 6761 }, { "epoch": 2.614849187935035, "grad_norm": 0.2372967083829702, "learning_rate": 4.937621013703264e-07, "loss": 0.0154, "step": 6762 }, { "epoch": 2.6152358855375097, "grad_norm": 0.3353097918007221, "learning_rate": 4.927877177886942e-07, "loss": 0.019, "step": 6763 }, { "epoch": 2.6156225831399844, "grad_norm": 0.30425161675429596, "learning_rate": 4.918142467389763e-07, "loss": 0.0232, "step": 6764 }, { "epoch": 2.6160092807424595, "grad_norm": 0.30388355591113086, "learning_rate": 4.908416884182621e-07, "loss": 0.0228, "step": 6765 }, { "epoch": 2.616395978344934, "grad_norm": 0.20965225793691872, "learning_rate": 4.898700430234554e-07, "loss": 0.0122, "step": 6766 }, { "epoch": 2.616782675947409, "grad_norm": 0.29943337329739006, "learning_rate": 4.888993107512774e-07, "loss": 0.0183, "step": 6767 }, { "epoch": 2.617169373549884, "grad_norm": 0.2780355718806216, "learning_rate": 4.879294917982624e-07, "loss": 0.0197, "step": 6768 }, { "epoch": 2.617556071152359, "grad_norm": 0.2527298107184915, "learning_rate": 4.869605863607601e-07, "loss": 0.0166, "step": 6769 }, { "epoch": 2.617942768754834, "grad_norm": 0.22910525390608724, "learning_rate": 4.859925946349359e-07, "loss": 0.0201, "step": 6770 }, { "epoch": 2.6183294663573085, "grad_norm": 0.23548835485295805, "learning_rate": 4.850255168167678e-07, "loss": 0.0147, "step": 6771 }, { "epoch": 2.6187161639597836, "grad_norm": 0.269732354349577, "learning_rate": 4.840593531020538e-07, "loss": 0.0218, "step": 6772 }, { "epoch": 2.6191028615622582, "grad_norm": 0.2504879628352538, "learning_rate": 4.830941036864023e-07, "loss": 0.0188, "step": 6773 }, { "epoch": 2.619489559164733, "grad_norm": 0.23361282322749308, "learning_rate": 4.821297687652376e-07, "loss": 0.0144, "step": 6774 }, { "epoch": 2.619876256767208, "grad_norm": 0.27613198727290295, "learning_rate": 4.811663485338003e-07, "loss": 0.0265, "step": 6775 }, { "epoch": 2.620262954369683, "grad_norm": 0.24941724477416288, "learning_rate": 4.802038431871431e-07, "loss": 0.0272, "step": 6776 }, { "epoch": 2.620649651972158, "grad_norm": 0.2639908838565183, "learning_rate": 4.792422529201379e-07, "loss": 0.0183, "step": 6777 }, { "epoch": 2.6210363495746325, "grad_norm": 0.24929976412247332, "learning_rate": 4.782815779274657e-07, "loss": 0.0136, "step": 6778 }, { "epoch": 2.6214230471771076, "grad_norm": 0.2827206264584039, "learning_rate": 4.773218184036276e-07, "loss": 0.0199, "step": 6779 }, { "epoch": 2.6218097447795823, "grad_norm": 0.2941229934830303, "learning_rate": 4.763629745429349e-07, "loss": 0.0293, "step": 6780 }, { "epoch": 2.622196442382057, "grad_norm": 0.245640473697995, "learning_rate": 4.7540504653951824e-07, "loss": 0.0152, "step": 6781 }, { "epoch": 2.622583139984532, "grad_norm": 0.2785442227015803, "learning_rate": 4.74448034587317e-07, "loss": 0.0169, "step": 6782 }, { "epoch": 2.622969837587007, "grad_norm": 0.2510649535814358, "learning_rate": 4.73491938880089e-07, "loss": 0.0189, "step": 6783 }, { "epoch": 2.623356535189482, "grad_norm": 0.26424294669534215, "learning_rate": 4.7253675961140666e-07, "loss": 0.0176, "step": 6784 }, { "epoch": 2.6237432327919565, "grad_norm": 0.2505404764687806, "learning_rate": 4.7158249697465577e-07, "loss": 0.0153, "step": 6785 }, { "epoch": 2.6241299303944317, "grad_norm": 0.3656519602715101, "learning_rate": 4.7062915116303676e-07, "loss": 0.027, "step": 6786 }, { "epoch": 2.6245166279969063, "grad_norm": 0.2430355975498676, "learning_rate": 4.696767223695625e-07, "loss": 0.0176, "step": 6787 }, { "epoch": 2.6249033255993814, "grad_norm": 0.2798916770761027, "learning_rate": 4.687252107870649e-07, "loss": 0.0172, "step": 6788 }, { "epoch": 2.625290023201856, "grad_norm": 0.2789602832312918, "learning_rate": 4.677746166081859e-07, "loss": 0.0181, "step": 6789 }, { "epoch": 2.6256767208043312, "grad_norm": 0.3012308419676823, "learning_rate": 4.668249400253838e-07, "loss": 0.0291, "step": 6790 }, { "epoch": 2.626063418406806, "grad_norm": 0.21580820569295006, "learning_rate": 4.6587618123092924e-07, "loss": 0.0141, "step": 6791 }, { "epoch": 2.6264501160092806, "grad_norm": 0.25848144447169885, "learning_rate": 4.649283404169092e-07, "loss": 0.02, "step": 6792 }, { "epoch": 2.6268368136117557, "grad_norm": 0.24801865026726233, "learning_rate": 4.639814177752239e-07, "loss": 0.014, "step": 6793 }, { "epoch": 2.6272235112142304, "grad_norm": 0.19486123354810111, "learning_rate": 4.6303541349758784e-07, "loss": 0.0088, "step": 6794 }, { "epoch": 2.6276102088167055, "grad_norm": 0.3093071299771013, "learning_rate": 4.6209032777552897e-07, "loss": 0.0206, "step": 6795 }, { "epoch": 2.62799690641918, "grad_norm": 0.3178349768926756, "learning_rate": 4.6114616080038977e-07, "loss": 0.022, "step": 6796 }, { "epoch": 2.6283836040216553, "grad_norm": 0.29190338304726465, "learning_rate": 4.6020291276332564e-07, "loss": 0.0151, "step": 6797 }, { "epoch": 2.62877030162413, "grad_norm": 0.23664192613088533, "learning_rate": 4.5926058385530935e-07, "loss": 0.016, "step": 6798 }, { "epoch": 2.6291569992266046, "grad_norm": 0.2681829607250863, "learning_rate": 4.583191742671239e-07, "loss": 0.0231, "step": 6799 }, { "epoch": 2.6295436968290797, "grad_norm": 0.2852839214956871, "learning_rate": 4.5737868418936684e-07, "loss": 0.021, "step": 6800 }, { "epoch": 2.6299303944315544, "grad_norm": 0.24534498633177745, "learning_rate": 4.5643911381245075e-07, "loss": 0.0182, "step": 6801 }, { "epoch": 2.6303170920340295, "grad_norm": 0.20602051070216124, "learning_rate": 4.555004633266008e-07, "loss": 0.016, "step": 6802 }, { "epoch": 2.630703789636504, "grad_norm": 0.23616079050804287, "learning_rate": 4.545627329218588e-07, "loss": 0.0148, "step": 6803 }, { "epoch": 2.6310904872389793, "grad_norm": 0.21162169748411397, "learning_rate": 4.5362592278807405e-07, "loss": 0.0179, "step": 6804 }, { "epoch": 2.631477184841454, "grad_norm": 0.21932977557338743, "learning_rate": 4.52690033114917e-07, "loss": 0.0144, "step": 6805 }, { "epoch": 2.6318638824439287, "grad_norm": 0.18457645887684745, "learning_rate": 4.517550640918661e-07, "loss": 0.0133, "step": 6806 }, { "epoch": 2.6322505800464038, "grad_norm": 0.27300117274635877, "learning_rate": 4.508210159082177e-07, "loss": 0.0172, "step": 6807 }, { "epoch": 2.6326372776488784, "grad_norm": 0.2960377141680043, "learning_rate": 4.4988788875307775e-07, "loss": 0.0145, "step": 6808 }, { "epoch": 2.6330239752513536, "grad_norm": 0.2619655748671402, "learning_rate": 4.4895568281536686e-07, "loss": 0.0158, "step": 6809 }, { "epoch": 2.6334106728538282, "grad_norm": 0.2626517328743487, "learning_rate": 4.480243982838217e-07, "loss": 0.0161, "step": 6810 }, { "epoch": 2.6337973704563034, "grad_norm": 0.3108320301683049, "learning_rate": 4.470940353469899e-07, "loss": 0.0173, "step": 6811 }, { "epoch": 2.634184068058778, "grad_norm": 0.21585723617786762, "learning_rate": 4.461645941932324e-07, "loss": 0.0179, "step": 6812 }, { "epoch": 2.6345707656612527, "grad_norm": 0.25585714874067167, "learning_rate": 4.452360750107243e-07, "loss": 0.0161, "step": 6813 }, { "epoch": 2.634957463263728, "grad_norm": 0.27905598876610854, "learning_rate": 4.4430847798745515e-07, "loss": 0.0163, "step": 6814 }, { "epoch": 2.6353441608662025, "grad_norm": 0.26501858408633633, "learning_rate": 4.433818033112258e-07, "loss": 0.0157, "step": 6815 }, { "epoch": 2.6357308584686776, "grad_norm": 0.23161800225315718, "learning_rate": 4.424560511696513e-07, "loss": 0.0129, "step": 6816 }, { "epoch": 2.6361175560711523, "grad_norm": 0.2682644482935839, "learning_rate": 4.4153122175015995e-07, "loss": 0.0182, "step": 6817 }, { "epoch": 2.6365042536736274, "grad_norm": 0.24874112304449564, "learning_rate": 4.4060731523999146e-07, "loss": 0.0197, "step": 6818 }, { "epoch": 2.636890951276102, "grad_norm": 0.3077466182791392, "learning_rate": 4.396843318262023e-07, "loss": 0.0244, "step": 6819 }, { "epoch": 2.6372776488785767, "grad_norm": 0.2608062121015188, "learning_rate": 4.3876227169565966e-07, "loss": 0.0181, "step": 6820 }, { "epoch": 2.637664346481052, "grad_norm": 0.24771684536672206, "learning_rate": 4.3784113503504366e-07, "loss": 0.0176, "step": 6821 }, { "epoch": 2.6380510440835265, "grad_norm": 0.3947827861708407, "learning_rate": 4.3692092203084844e-07, "loss": 0.0333, "step": 6822 }, { "epoch": 2.6384377416860016, "grad_norm": 0.26135777519511855, "learning_rate": 4.3600163286937945e-07, "loss": 0.0209, "step": 6823 }, { "epoch": 2.6388244392884763, "grad_norm": 0.22856018133535977, "learning_rate": 4.350832677367589e-07, "loss": 0.0139, "step": 6824 }, { "epoch": 2.6392111368909514, "grad_norm": 0.3323907968317309, "learning_rate": 4.341658268189158e-07, "loss": 0.0264, "step": 6825 }, { "epoch": 2.639597834493426, "grad_norm": 0.36378594491172034, "learning_rate": 4.3324931030159887e-07, "loss": 0.0301, "step": 6826 }, { "epoch": 2.639984532095901, "grad_norm": 0.33090768023058476, "learning_rate": 4.323337183703641e-07, "loss": 0.0168, "step": 6827 }, { "epoch": 2.640371229698376, "grad_norm": 0.284087634344307, "learning_rate": 4.314190512105848e-07, "loss": 0.0191, "step": 6828 }, { "epoch": 2.6407579273008506, "grad_norm": 0.21217826741697093, "learning_rate": 4.305053090074435e-07, "loss": 0.0176, "step": 6829 }, { "epoch": 2.6411446249033257, "grad_norm": 0.20682760224787758, "learning_rate": 4.2959249194593543e-07, "loss": 0.0153, "step": 6830 }, { "epoch": 2.6415313225058004, "grad_norm": 0.2650326573323984, "learning_rate": 4.286806002108723e-07, "loss": 0.0167, "step": 6831 }, { "epoch": 2.6419180201082755, "grad_norm": 0.3045987524561647, "learning_rate": 4.2776963398687477e-07, "loss": 0.0192, "step": 6832 }, { "epoch": 2.64230471771075, "grad_norm": 0.1813616591165011, "learning_rate": 4.26859593458378e-07, "loss": 0.0108, "step": 6833 }, { "epoch": 2.642691415313225, "grad_norm": 0.26424721208060525, "learning_rate": 4.2595047880962915e-07, "loss": 0.0161, "step": 6834 }, { "epoch": 2.6430781129157, "grad_norm": 0.30872390526725224, "learning_rate": 4.2504229022468644e-07, "loss": 0.0247, "step": 6835 }, { "epoch": 2.643464810518175, "grad_norm": 0.2964810218038572, "learning_rate": 4.24135027887424e-07, "loss": 0.0201, "step": 6836 }, { "epoch": 2.6438515081206497, "grad_norm": 0.2957885071168595, "learning_rate": 4.232286919815259e-07, "loss": 0.0201, "step": 6837 }, { "epoch": 2.6442382057231244, "grad_norm": 0.29658734286512495, "learning_rate": 4.2232328269048885e-07, "loss": 0.0193, "step": 6838 }, { "epoch": 2.6446249033255995, "grad_norm": 0.22720777037567783, "learning_rate": 4.2141880019762217e-07, "loss": 0.0174, "step": 6839 }, { "epoch": 2.645011600928074, "grad_norm": 0.2966987453794315, "learning_rate": 4.205152446860489e-07, "loss": 0.019, "step": 6840 }, { "epoch": 2.645398298530549, "grad_norm": 0.2704932098924, "learning_rate": 4.196126163387032e-07, "loss": 0.0137, "step": 6841 }, { "epoch": 2.645784996133024, "grad_norm": 0.23333813157068706, "learning_rate": 4.1871091533832895e-07, "loss": 0.0167, "step": 6842 }, { "epoch": 2.646171693735499, "grad_norm": 0.3226623984382989, "learning_rate": 4.1781014186748794e-07, "loss": 0.0185, "step": 6843 }, { "epoch": 2.6465583913379738, "grad_norm": 0.2552013285839668, "learning_rate": 4.169102961085492e-07, "loss": 0.0192, "step": 6844 }, { "epoch": 2.6469450889404484, "grad_norm": 0.27174187696627533, "learning_rate": 4.160113782436981e-07, "loss": 0.021, "step": 6845 }, { "epoch": 2.6473317865429236, "grad_norm": 0.19862787288852232, "learning_rate": 4.1511338845492686e-07, "loss": 0.0102, "step": 6846 }, { "epoch": 2.6477184841453982, "grad_norm": 0.34150318941519503, "learning_rate": 4.142163269240451e-07, "loss": 0.0116, "step": 6847 }, { "epoch": 2.648105181747873, "grad_norm": 0.26622744355965616, "learning_rate": 4.133201938326714e-07, "loss": 0.0198, "step": 6848 }, { "epoch": 2.648491879350348, "grad_norm": 0.24576027769662387, "learning_rate": 4.1242498936223785e-07, "loss": 0.0232, "step": 6849 }, { "epoch": 2.648878576952823, "grad_norm": 0.26011596305605156, "learning_rate": 4.1153071369398667e-07, "loss": 0.0182, "step": 6850 }, { "epoch": 2.649265274555298, "grad_norm": 0.21109010171545592, "learning_rate": 4.1063736700897363e-07, "loss": 0.0141, "step": 6851 }, { "epoch": 2.6496519721577725, "grad_norm": 0.3132995873355688, "learning_rate": 4.0974494948806676e-07, "loss": 0.0326, "step": 6852 }, { "epoch": 2.6500386697602476, "grad_norm": 0.3559491958943128, "learning_rate": 4.0885346131194547e-07, "loss": 0.0335, "step": 6853 }, { "epoch": 2.6504253673627223, "grad_norm": 0.3128107174818629, "learning_rate": 4.0796290266109926e-07, "loss": 0.0209, "step": 6854 }, { "epoch": 2.650812064965197, "grad_norm": 0.23631546037405193, "learning_rate": 4.070732737158328e-07, "loss": 0.017, "step": 6855 }, { "epoch": 2.651198762567672, "grad_norm": 0.32916217748545473, "learning_rate": 4.061845746562582e-07, "loss": 0.0197, "step": 6856 }, { "epoch": 2.651585460170147, "grad_norm": 0.3257109469097562, "learning_rate": 4.0529680566230466e-07, "loss": 0.0174, "step": 6857 }, { "epoch": 2.651972157772622, "grad_norm": 0.340655874431736, "learning_rate": 4.044099669137086e-07, "loss": 0.0167, "step": 6858 }, { "epoch": 2.6523588553750965, "grad_norm": 0.2585613967210729, "learning_rate": 4.0352405859002076e-07, "loss": 0.0149, "step": 6859 }, { "epoch": 2.6527455529775716, "grad_norm": 0.2003838339578643, "learning_rate": 4.0263908087060043e-07, "loss": 0.0142, "step": 6860 }, { "epoch": 2.6531322505800463, "grad_norm": 0.27043855469007483, "learning_rate": 4.0175503393462267e-07, "loss": 0.0186, "step": 6861 }, { "epoch": 2.6535189481825214, "grad_norm": 0.21841074346077935, "learning_rate": 4.008719179610715e-07, "loss": 0.0138, "step": 6862 }, { "epoch": 2.653905645784996, "grad_norm": 0.24176548599423311, "learning_rate": 3.9998973312874224e-07, "loss": 0.0131, "step": 6863 }, { "epoch": 2.654292343387471, "grad_norm": 0.21814412561714586, "learning_rate": 3.991084796162431e-07, "loss": 0.0114, "step": 6864 }, { "epoch": 2.654679040989946, "grad_norm": 0.592851226651067, "learning_rate": 3.9822815760199195e-07, "loss": 0.0306, "step": 6865 }, { "epoch": 2.6550657385924206, "grad_norm": 0.25138274191193344, "learning_rate": 3.973487672642201e-07, "loss": 0.0192, "step": 6866 }, { "epoch": 2.6554524361948957, "grad_norm": 0.2628561596483362, "learning_rate": 3.9647030878097016e-07, "loss": 0.0218, "step": 6867 }, { "epoch": 2.6558391337973704, "grad_norm": 0.21996245552517657, "learning_rate": 3.955927823300926e-07, "loss": 0.0214, "step": 6868 }, { "epoch": 2.6562258313998455, "grad_norm": 0.2563810434912044, "learning_rate": 3.9471618808925363e-07, "loss": 0.0172, "step": 6869 }, { "epoch": 2.65661252900232, "grad_norm": 0.3081974908720677, "learning_rate": 3.938405262359274e-07, "loss": 0.0185, "step": 6870 }, { "epoch": 2.6569992266047953, "grad_norm": 0.29099458549157176, "learning_rate": 3.9296579694740323e-07, "loss": 0.0208, "step": 6871 }, { "epoch": 2.65738592420727, "grad_norm": 0.3117934155574541, "learning_rate": 3.920920004007761e-07, "loss": 0.0217, "step": 6872 }, { "epoch": 2.6577726218097446, "grad_norm": 0.3270188368688422, "learning_rate": 3.9121913677295787e-07, "loss": 0.0216, "step": 6873 }, { "epoch": 2.6581593194122197, "grad_norm": 0.255036655054513, "learning_rate": 3.903472062406677e-07, "loss": 0.0187, "step": 6874 }, { "epoch": 2.6585460170146944, "grad_norm": 0.2217212179398478, "learning_rate": 3.894762089804366e-07, "loss": 0.0169, "step": 6875 }, { "epoch": 2.6589327146171695, "grad_norm": 0.21705700775506412, "learning_rate": 3.886061451686074e-07, "loss": 0.0112, "step": 6876 }, { "epoch": 2.659319412219644, "grad_norm": 0.2071510161072359, "learning_rate": 3.8773701498133253e-07, "loss": 0.0089, "step": 6877 }, { "epoch": 2.6597061098221193, "grad_norm": 0.3615090937436096, "learning_rate": 3.8686881859457904e-07, "loss": 0.0318, "step": 6878 }, { "epoch": 2.660092807424594, "grad_norm": 0.4348326672927617, "learning_rate": 3.860015561841196e-07, "loss": 0.0395, "step": 6879 }, { "epoch": 2.6604795050270686, "grad_norm": 0.2553993811642135, "learning_rate": 3.851352279255421e-07, "loss": 0.018, "step": 6880 }, { "epoch": 2.6608662026295438, "grad_norm": 0.26149183446233326, "learning_rate": 3.842698339942436e-07, "loss": 0.0184, "step": 6881 }, { "epoch": 2.6612529002320184, "grad_norm": 0.2521359349143553, "learning_rate": 3.834053745654298e-07, "loss": 0.0164, "step": 6882 }, { "epoch": 2.6616395978344936, "grad_norm": 0.23003645192203345, "learning_rate": 3.8254184981412314e-07, "loss": 0.0175, "step": 6883 }, { "epoch": 2.6620262954369682, "grad_norm": 0.2766190939324577, "learning_rate": 3.816792599151509e-07, "loss": 0.0168, "step": 6884 }, { "epoch": 2.6624129930394433, "grad_norm": 0.239835705930815, "learning_rate": 3.8081760504315335e-07, "loss": 0.015, "step": 6885 }, { "epoch": 2.662799690641918, "grad_norm": 0.3725600080936133, "learning_rate": 3.799568853725816e-07, "loss": 0.0235, "step": 6886 }, { "epoch": 2.6631863882443927, "grad_norm": 0.21931087470575725, "learning_rate": 3.7909710107769835e-07, "loss": 0.0128, "step": 6887 }, { "epoch": 2.663573085846868, "grad_norm": 0.2721430220774661, "learning_rate": 3.782382523325756e-07, "loss": 0.0227, "step": 6888 }, { "epoch": 2.6639597834493425, "grad_norm": 0.27552350218633614, "learning_rate": 3.7738033931109473e-07, "loss": 0.0159, "step": 6889 }, { "epoch": 2.6643464810518176, "grad_norm": 0.37699767643420407, "learning_rate": 3.765233621869502e-07, "loss": 0.0199, "step": 6890 }, { "epoch": 2.6647331786542923, "grad_norm": 0.24153765351387696, "learning_rate": 3.7566732113364533e-07, "loss": 0.0197, "step": 6891 }, { "epoch": 2.6651198762567674, "grad_norm": 0.2270921918873071, "learning_rate": 3.7481221632449715e-07, "loss": 0.0095, "step": 6892 }, { "epoch": 2.665506573859242, "grad_norm": 0.8194500320023105, "learning_rate": 3.73958047932626e-07, "loss": 0.0172, "step": 6893 }, { "epoch": 2.6658932714617167, "grad_norm": 0.23281342201358984, "learning_rate": 3.731048161309708e-07, "loss": 0.0191, "step": 6894 }, { "epoch": 2.666279969064192, "grad_norm": 0.28253448930862674, "learning_rate": 3.722525210922756e-07, "loss": 0.0212, "step": 6895 }, { "epoch": 2.6666666666666665, "grad_norm": 0.18224346329932026, "learning_rate": 3.7140116298909677e-07, "loss": 0.0117, "step": 6896 }, { "epoch": 2.6670533642691416, "grad_norm": 0.21385913596819772, "learning_rate": 3.705507419938004e-07, "loss": 0.0156, "step": 6897 }, { "epoch": 2.6674400618716163, "grad_norm": 0.4127500120999198, "learning_rate": 3.697012582785625e-07, "loss": 0.0218, "step": 6898 }, { "epoch": 2.6678267594740914, "grad_norm": 0.19111036867657302, "learning_rate": 3.6885271201537123e-07, "loss": 0.0101, "step": 6899 }, { "epoch": 2.668213457076566, "grad_norm": 0.25332599598948274, "learning_rate": 3.680051033760229e-07, "loss": 0.0207, "step": 6900 }, { "epoch": 2.6686001546790408, "grad_norm": 0.38966992799320516, "learning_rate": 3.6715843253212425e-07, "loss": 0.0299, "step": 6901 }, { "epoch": 2.668986852281516, "grad_norm": 0.40006949256521, "learning_rate": 3.6631269965509364e-07, "loss": 0.0306, "step": 6902 }, { "epoch": 2.6693735498839906, "grad_norm": 0.2652235640791629, "learning_rate": 3.654679049161569e-07, "loss": 0.0157, "step": 6903 }, { "epoch": 2.6697602474864657, "grad_norm": 0.29039721772074323, "learning_rate": 3.646240484863528e-07, "loss": 0.0203, "step": 6904 }, { "epoch": 2.6701469450889403, "grad_norm": 0.347290330757072, "learning_rate": 3.637811305365285e-07, "loss": 0.0203, "step": 6905 }, { "epoch": 2.6705336426914155, "grad_norm": 0.2394310541979219, "learning_rate": 3.62939151237342e-07, "loss": 0.0231, "step": 6906 }, { "epoch": 2.67092034029389, "grad_norm": 0.32791794745874636, "learning_rate": 3.620981107592597e-07, "loss": 0.0206, "step": 6907 }, { "epoch": 2.671307037896365, "grad_norm": 0.2704352580387085, "learning_rate": 3.612580092725587e-07, "loss": 0.0167, "step": 6908 }, { "epoch": 2.67169373549884, "grad_norm": 0.21413926248616671, "learning_rate": 3.604188469473285e-07, "loss": 0.0153, "step": 6909 }, { "epoch": 2.672080433101315, "grad_norm": 0.22879566565471213, "learning_rate": 3.5958062395346325e-07, "loss": 0.0156, "step": 6910 }, { "epoch": 2.6724671307037897, "grad_norm": 0.21476448347051197, "learning_rate": 3.5874334046067214e-07, "loss": 0.0121, "step": 6911 }, { "epoch": 2.6728538283062644, "grad_norm": 0.36590809323003315, "learning_rate": 3.5790699663847074e-07, "loss": 0.0273, "step": 6912 }, { "epoch": 2.6732405259087395, "grad_norm": 0.2829430065690627, "learning_rate": 3.570715926561874e-07, "loss": 0.0181, "step": 6913 }, { "epoch": 2.673627223511214, "grad_norm": 0.272634765598642, "learning_rate": 3.562371286829558e-07, "loss": 0.017, "step": 6914 }, { "epoch": 2.674013921113689, "grad_norm": 0.29410694681789945, "learning_rate": 3.554036048877224e-07, "loss": 0.0275, "step": 6915 }, { "epoch": 2.674400618716164, "grad_norm": 0.23625280225925915, "learning_rate": 3.5457102143924395e-07, "loss": 0.0154, "step": 6916 }, { "epoch": 2.674787316318639, "grad_norm": 0.23031123342036844, "learning_rate": 3.53739378506085e-07, "loss": 0.0137, "step": 6917 }, { "epoch": 2.6751740139211138, "grad_norm": 0.3108216592645888, "learning_rate": 3.529086762566197e-07, "loss": 0.0145, "step": 6918 }, { "epoch": 2.6755607115235884, "grad_norm": 0.231426632815214, "learning_rate": 3.5207891485903254e-07, "loss": 0.0141, "step": 6919 }, { "epoch": 2.6759474091260635, "grad_norm": 0.25734633329570133, "learning_rate": 3.5125009448131853e-07, "loss": 0.0239, "step": 6920 }, { "epoch": 2.676334106728538, "grad_norm": 0.26337594249408913, "learning_rate": 3.5042221529128004e-07, "loss": 0.0174, "step": 6921 }, { "epoch": 2.676720804331013, "grad_norm": 0.21794395277582207, "learning_rate": 3.4959527745652977e-07, "loss": 0.0126, "step": 6922 }, { "epoch": 2.677107501933488, "grad_norm": 0.3741530102102967, "learning_rate": 3.4876928114448985e-07, "loss": 0.024, "step": 6923 }, { "epoch": 2.677494199535963, "grad_norm": 0.22325824377591663, "learning_rate": 3.479442265223915e-07, "loss": 0.0114, "step": 6924 }, { "epoch": 2.677880897138438, "grad_norm": 0.2685068148113639, "learning_rate": 3.471201137572761e-07, "loss": 0.0217, "step": 6925 }, { "epoch": 2.6782675947409125, "grad_norm": 0.3085225120253817, "learning_rate": 3.4629694301599405e-07, "loss": 0.0165, "step": 6926 }, { "epoch": 2.6786542923433876, "grad_norm": 0.25900666022383717, "learning_rate": 3.454747144652049e-07, "loss": 0.0164, "step": 6927 }, { "epoch": 2.6790409899458623, "grad_norm": 0.24267040578386948, "learning_rate": 3.446534282713765e-07, "loss": 0.0181, "step": 6928 }, { "epoch": 2.679427687548337, "grad_norm": 0.2922729726047943, "learning_rate": 3.4383308460078643e-07, "loss": 0.0235, "step": 6929 }, { "epoch": 2.679814385150812, "grad_norm": 0.25887890935181956, "learning_rate": 3.4301368361952357e-07, "loss": 0.0184, "step": 6930 }, { "epoch": 2.680201082753287, "grad_norm": 0.2530583046390743, "learning_rate": 3.4219522549348295e-07, "loss": 0.0233, "step": 6931 }, { "epoch": 2.680587780355762, "grad_norm": 0.26142601286895223, "learning_rate": 3.4137771038837033e-07, "loss": 0.0225, "step": 6932 }, { "epoch": 2.6809744779582365, "grad_norm": 0.3052825791955554, "learning_rate": 3.4056113846969885e-07, "loss": 0.018, "step": 6933 }, { "epoch": 2.6813611755607116, "grad_norm": 0.24176824981555983, "learning_rate": 3.397455099027941e-07, "loss": 0.0128, "step": 6934 }, { "epoch": 2.6817478731631863, "grad_norm": 0.35834016144920944, "learning_rate": 3.3893082485278785e-07, "loss": 0.0183, "step": 6935 }, { "epoch": 2.6821345707656614, "grad_norm": 0.23228608392584332, "learning_rate": 3.381170834846198e-07, "loss": 0.0172, "step": 6936 }, { "epoch": 2.682521268368136, "grad_norm": 0.2925339297644954, "learning_rate": 3.3730428596304265e-07, "loss": 0.0219, "step": 6937 }, { "epoch": 2.682907965970611, "grad_norm": 0.3186274898577006, "learning_rate": 3.3649243245261364e-07, "loss": 0.0262, "step": 6938 }, { "epoch": 2.683294663573086, "grad_norm": 0.19538463617857518, "learning_rate": 3.356815231177041e-07, "loss": 0.0127, "step": 6939 }, { "epoch": 2.6836813611755606, "grad_norm": 0.32410863708434146, "learning_rate": 3.3487155812248707e-07, "loss": 0.0149, "step": 6940 }, { "epoch": 2.6840680587780357, "grad_norm": 0.2646927500989584, "learning_rate": 3.340625376309514e-07, "loss": 0.0148, "step": 6941 }, { "epoch": 2.6844547563805103, "grad_norm": 0.20118975490409538, "learning_rate": 3.332544618068906e-07, "loss": 0.0122, "step": 6942 }, { "epoch": 2.6848414539829855, "grad_norm": 0.1771951581959058, "learning_rate": 3.3244733081390754e-07, "loss": 0.0102, "step": 6943 }, { "epoch": 2.68522815158546, "grad_norm": 0.28662445054337593, "learning_rate": 3.31641144815415e-07, "loss": 0.018, "step": 6944 }, { "epoch": 2.6856148491879352, "grad_norm": 0.2923828348821418, "learning_rate": 3.3083590397463283e-07, "loss": 0.0299, "step": 6945 }, { "epoch": 2.68600154679041, "grad_norm": 0.267251204182977, "learning_rate": 3.300316084545918e-07, "loss": 0.0216, "step": 6946 }, { "epoch": 2.6863882443928846, "grad_norm": 0.29099399439188056, "learning_rate": 3.292282584181289e-07, "loss": 0.0169, "step": 6947 }, { "epoch": 2.6867749419953597, "grad_norm": 0.22562972019231547, "learning_rate": 3.284258540278906e-07, "loss": 0.0154, "step": 6948 }, { "epoch": 2.6871616395978344, "grad_norm": 0.25609662589060433, "learning_rate": 3.276243954463326e-07, "loss": 0.0156, "step": 6949 }, { "epoch": 2.6875483372003095, "grad_norm": 0.2815710774376325, "learning_rate": 3.268238828357173e-07, "loss": 0.0173, "step": 6950 }, { "epoch": 2.687935034802784, "grad_norm": 0.6521347291669172, "learning_rate": 3.2602431635811837e-07, "loss": 0.0215, "step": 6951 }, { "epoch": 2.6883217324052593, "grad_norm": 0.28574259359961135, "learning_rate": 3.2522569617541577e-07, "loss": 0.0262, "step": 6952 }, { "epoch": 2.688708430007734, "grad_norm": 0.23200719143703155, "learning_rate": 3.244280224492985e-07, "loss": 0.014, "step": 6953 }, { "epoch": 2.6890951276102086, "grad_norm": 0.26545622398077845, "learning_rate": 3.2363129534126347e-07, "loss": 0.0187, "step": 6954 }, { "epoch": 2.6894818252126838, "grad_norm": 0.25954486765882234, "learning_rate": 3.228355150126161e-07, "loss": 0.0206, "step": 6955 }, { "epoch": 2.6898685228151584, "grad_norm": 0.21828293290207315, "learning_rate": 3.22040681624472e-07, "loss": 0.0127, "step": 6956 }, { "epoch": 2.6902552204176335, "grad_norm": 0.2697635992278145, "learning_rate": 3.212467953377513e-07, "loss": 0.0172, "step": 6957 }, { "epoch": 2.690641918020108, "grad_norm": 0.2685447857002627, "learning_rate": 3.2045385631318535e-07, "loss": 0.0198, "step": 6958 }, { "epoch": 2.6910286156225833, "grad_norm": 0.2735096891312043, "learning_rate": 3.196618647113131e-07, "loss": 0.0199, "step": 6959 }, { "epoch": 2.691415313225058, "grad_norm": 0.29171853057463215, "learning_rate": 3.188708206924829e-07, "loss": 0.0184, "step": 6960 }, { "epoch": 2.6918020108275327, "grad_norm": 0.3080408348503539, "learning_rate": 3.1808072441684713e-07, "loss": 0.0241, "step": 6961 }, { "epoch": 2.692188708430008, "grad_norm": 0.2559776789154028, "learning_rate": 3.1729157604437013e-07, "loss": 0.0149, "step": 6962 }, { "epoch": 2.6925754060324825, "grad_norm": 0.5644607813225923, "learning_rate": 3.165033757348235e-07, "loss": 0.0303, "step": 6963 }, { "epoch": 2.6929621036349576, "grad_norm": 0.2519008739786905, "learning_rate": 3.157161236477863e-07, "loss": 0.0181, "step": 6964 }, { "epoch": 2.6933488012374323, "grad_norm": 0.19960635067842905, "learning_rate": 3.149298199426465e-07, "loss": 0.016, "step": 6965 }, { "epoch": 2.6937354988399074, "grad_norm": 0.19973652277829668, "learning_rate": 3.1414446477859737e-07, "loss": 0.0206, "step": 6966 }, { "epoch": 2.694122196442382, "grad_norm": 0.18224077984988246, "learning_rate": 3.133600583146451e-07, "loss": 0.01, "step": 6967 }, { "epoch": 2.6945088940448567, "grad_norm": 0.27087084675663525, "learning_rate": 3.1257660070959927e-07, "loss": 0.0178, "step": 6968 }, { "epoch": 2.694895591647332, "grad_norm": 0.2944971081800529, "learning_rate": 3.1179409212207914e-07, "loss": 0.0242, "step": 6969 }, { "epoch": 2.6952822892498065, "grad_norm": 0.30313089206048044, "learning_rate": 3.110125327105118e-07, "loss": 0.0131, "step": 6970 }, { "epoch": 2.6956689868522816, "grad_norm": 0.25196474808541625, "learning_rate": 3.1023192263313127e-07, "loss": 0.018, "step": 6971 }, { "epoch": 2.6960556844547563, "grad_norm": 0.2960746537619575, "learning_rate": 3.094522620479823e-07, "loss": 0.0268, "step": 6972 }, { "epoch": 2.6964423820572314, "grad_norm": 0.49949489951217757, "learning_rate": 3.086735511129135e-07, "loss": 0.0213, "step": 6973 }, { "epoch": 2.696829079659706, "grad_norm": 0.20580308745455073, "learning_rate": 3.078957899855833e-07, "loss": 0.0133, "step": 6974 }, { "epoch": 2.6972157772621808, "grad_norm": 0.2605719053151453, "learning_rate": 3.0711897882345797e-07, "loss": 0.0154, "step": 6975 }, { "epoch": 2.697602474864656, "grad_norm": 0.2209607024838484, "learning_rate": 3.0634311778381e-07, "loss": 0.0123, "step": 6976 }, { "epoch": 2.6979891724671305, "grad_norm": 0.27003992516173836, "learning_rate": 3.055682070237226e-07, "loss": 0.0221, "step": 6977 }, { "epoch": 2.6983758700696057, "grad_norm": 0.4889278590166775, "learning_rate": 3.047942467000814e-07, "loss": 0.0145, "step": 6978 }, { "epoch": 2.6987625676720803, "grad_norm": 0.30165334618646056, "learning_rate": 3.040212369695855e-07, "loss": 0.025, "step": 6979 }, { "epoch": 2.6991492652745555, "grad_norm": 0.26757516167203915, "learning_rate": 3.032491779887364e-07, "loss": 0.0212, "step": 6980 }, { "epoch": 2.69953596287703, "grad_norm": 0.22313157409351056, "learning_rate": 3.024780699138485e-07, "loss": 0.0135, "step": 6981 }, { "epoch": 2.699922660479505, "grad_norm": 0.2055478456937605, "learning_rate": 3.017079129010381e-07, "loss": 0.0131, "step": 6982 }, { "epoch": 2.70030935808198, "grad_norm": 0.2698138252739117, "learning_rate": 3.009387071062314e-07, "loss": 0.0143, "step": 6983 }, { "epoch": 2.700696055684455, "grad_norm": 0.24989019932729167, "learning_rate": 3.0017045268516343e-07, "loss": 0.0154, "step": 6984 }, { "epoch": 2.7010827532869297, "grad_norm": 0.348718793681794, "learning_rate": 2.994031497933747e-07, "loss": 0.0256, "step": 6985 }, { "epoch": 2.7014694508894044, "grad_norm": 0.3016696092802387, "learning_rate": 2.986367985862143e-07, "loss": 0.016, "step": 6986 }, { "epoch": 2.7018561484918795, "grad_norm": 0.2768352993450647, "learning_rate": 2.9787139921883647e-07, "loss": 0.0169, "step": 6987 }, { "epoch": 2.702242846094354, "grad_norm": 0.27001294269463605, "learning_rate": 2.9710695184620497e-07, "loss": 0.0197, "step": 6988 }, { "epoch": 2.702629543696829, "grad_norm": 0.2775801575888946, "learning_rate": 2.9634345662309105e-07, "loss": 0.0159, "step": 6989 }, { "epoch": 2.703016241299304, "grad_norm": 0.2097031213997207, "learning_rate": 2.955809137040716e-07, "loss": 0.0182, "step": 6990 }, { "epoch": 2.703402938901779, "grad_norm": 0.23757945421831939, "learning_rate": 2.948193232435309e-07, "loss": 0.0169, "step": 6991 }, { "epoch": 2.7037896365042537, "grad_norm": 0.22713141462570813, "learning_rate": 2.9405868539566065e-07, "loss": 0.016, "step": 6992 }, { "epoch": 2.7041763341067284, "grad_norm": 0.26126889823824306, "learning_rate": 2.9329900031446144e-07, "loss": 0.0139, "step": 6993 }, { "epoch": 2.7045630317092035, "grad_norm": 0.21130490755196474, "learning_rate": 2.925402681537376e-07, "loss": 0.0145, "step": 6994 }, { "epoch": 2.704949729311678, "grad_norm": 0.26454339740914473, "learning_rate": 2.917824890671039e-07, "loss": 0.0183, "step": 6995 }, { "epoch": 2.705336426914153, "grad_norm": 0.24005304579602615, "learning_rate": 2.9102566320797943e-07, "loss": 0.0178, "step": 6996 }, { "epoch": 2.705723124516628, "grad_norm": 0.31944634533166316, "learning_rate": 2.902697907295909e-07, "loss": 0.0337, "step": 6997 }, { "epoch": 2.706109822119103, "grad_norm": 0.2792478199085034, "learning_rate": 2.895148717849744e-07, "loss": 0.0133, "step": 6998 }, { "epoch": 2.706496519721578, "grad_norm": 0.2722105722635391, "learning_rate": 2.8876090652697033e-07, "loss": 0.0176, "step": 6999 }, { "epoch": 2.7068832173240525, "grad_norm": 0.3424952903573023, "learning_rate": 2.88007895108226e-07, "loss": 0.018, "step": 7000 }, { "epoch": 2.7072699149265276, "grad_norm": 0.30324594269375, "learning_rate": 2.872558376811974e-07, "loss": 0.0169, "step": 7001 }, { "epoch": 2.7076566125290022, "grad_norm": 0.2626042297624014, "learning_rate": 2.865047343981453e-07, "loss": 0.0181, "step": 7002 }, { "epoch": 2.708043310131477, "grad_norm": 0.27116230909020184, "learning_rate": 2.857545854111404e-07, "loss": 0.0097, "step": 7003 }, { "epoch": 2.708430007733952, "grad_norm": 0.21529395083826924, "learning_rate": 2.8500539087205514e-07, "loss": 0.0137, "step": 7004 }, { "epoch": 2.708816705336427, "grad_norm": 0.22184666581439444, "learning_rate": 2.8425715093257424e-07, "loss": 0.0153, "step": 7005 }, { "epoch": 2.709203402938902, "grad_norm": 0.2800568182176543, "learning_rate": 2.835098657441854e-07, "loss": 0.0218, "step": 7006 }, { "epoch": 2.7095901005413765, "grad_norm": 0.22832968352217012, "learning_rate": 2.8276353545818536e-07, "loss": 0.0162, "step": 7007 }, { "epoch": 2.7099767981438516, "grad_norm": 0.3415590248712895, "learning_rate": 2.820181602256755e-07, "loss": 0.0221, "step": 7008 }, { "epoch": 2.7103634957463263, "grad_norm": 0.1827392970691581, "learning_rate": 2.81273740197564e-07, "loss": 0.009, "step": 7009 }, { "epoch": 2.710750193348801, "grad_norm": 0.2251404863745338, "learning_rate": 2.805302755245681e-07, "loss": 0.0151, "step": 7010 }, { "epoch": 2.711136890951276, "grad_norm": 0.23524401588421137, "learning_rate": 2.797877663572091e-07, "loss": 0.013, "step": 7011 }, { "epoch": 2.711523588553751, "grad_norm": 0.22389318576493208, "learning_rate": 2.79046212845816e-07, "loss": 0.0195, "step": 7012 }, { "epoch": 2.711910286156226, "grad_norm": 0.2813342924259227, "learning_rate": 2.783056151405228e-07, "loss": 0.0179, "step": 7013 }, { "epoch": 2.7122969837587005, "grad_norm": 0.2638586677853868, "learning_rate": 2.7756597339127334e-07, "loss": 0.0207, "step": 7014 }, { "epoch": 2.7126836813611757, "grad_norm": 0.27723125011522903, "learning_rate": 2.7682728774781456e-07, "loss": 0.0168, "step": 7015 }, { "epoch": 2.7130703789636503, "grad_norm": 0.3235029675241881, "learning_rate": 2.760895583597012e-07, "loss": 0.0211, "step": 7016 }, { "epoch": 2.7134570765661254, "grad_norm": 0.23316287188488638, "learning_rate": 2.7535278537629384e-07, "loss": 0.0173, "step": 7017 }, { "epoch": 2.7138437741686, "grad_norm": 0.3234983850565134, "learning_rate": 2.746169689467593e-07, "loss": 0.0286, "step": 7018 }, { "epoch": 2.7142304717710752, "grad_norm": 0.2373217742078167, "learning_rate": 2.738821092200733e-07, "loss": 0.0132, "step": 7019 }, { "epoch": 2.71461716937355, "grad_norm": 0.22393159233717866, "learning_rate": 2.7314820634501524e-07, "loss": 0.0152, "step": 7020 }, { "epoch": 2.7150038669760246, "grad_norm": 0.27177210651491196, "learning_rate": 2.724152604701691e-07, "loss": 0.0286, "step": 7021 }, { "epoch": 2.7153905645784997, "grad_norm": 0.30848341539956525, "learning_rate": 2.7168327174392994e-07, "loss": 0.0218, "step": 7022 }, { "epoch": 2.7157772621809744, "grad_norm": 0.20642587154994962, "learning_rate": 2.7095224031449486e-07, "loss": 0.0227, "step": 7023 }, { "epoch": 2.7161639597834495, "grad_norm": 0.24382083848299402, "learning_rate": 2.7022216632987043e-07, "loss": 0.0184, "step": 7024 }, { "epoch": 2.716550657385924, "grad_norm": 0.3701123295430709, "learning_rate": 2.6949304993786565e-07, "loss": 0.0233, "step": 7025 }, { "epoch": 2.7169373549883993, "grad_norm": 0.20947685424348522, "learning_rate": 2.687648912860996e-07, "loss": 0.0113, "step": 7026 }, { "epoch": 2.717324052590874, "grad_norm": 0.2642940051147024, "learning_rate": 2.680376905219945e-07, "loss": 0.0196, "step": 7027 }, { "epoch": 2.7177107501933486, "grad_norm": 0.29162413691629835, "learning_rate": 2.6731144779278015e-07, "loss": 0.0206, "step": 7028 }, { "epoch": 2.7180974477958237, "grad_norm": 0.3301480735114923, "learning_rate": 2.665861632454919e-07, "loss": 0.018, "step": 7029 }, { "epoch": 2.7184841453982984, "grad_norm": 0.2503532434350446, "learning_rate": 2.6586183702696997e-07, "loss": 0.0177, "step": 7030 }, { "epoch": 2.7188708430007735, "grad_norm": 0.3101032848458141, "learning_rate": 2.651384692838632e-07, "loss": 0.0168, "step": 7031 }, { "epoch": 2.719257540603248, "grad_norm": 0.36067373151377113, "learning_rate": 2.64416060162625e-07, "loss": 0.0232, "step": 7032 }, { "epoch": 2.7196442382057233, "grad_norm": 0.25706970456852685, "learning_rate": 2.6369460980951335e-07, "loss": 0.0168, "step": 7033 }, { "epoch": 2.720030935808198, "grad_norm": 0.3105215781724887, "learning_rate": 2.6297411837059474e-07, "loss": 0.022, "step": 7034 }, { "epoch": 2.7204176334106727, "grad_norm": 0.2521179706604075, "learning_rate": 2.62254585991738e-07, "loss": 0.0145, "step": 7035 }, { "epoch": 2.720804331013148, "grad_norm": 0.3458782573725918, "learning_rate": 2.615360128186223e-07, "loss": 0.0197, "step": 7036 }, { "epoch": 2.7211910286156225, "grad_norm": 0.36349058065464707, "learning_rate": 2.6081839899672944e-07, "loss": 0.014, "step": 7037 }, { "epoch": 2.7215777262180976, "grad_norm": 0.30137949494051136, "learning_rate": 2.6010174467134776e-07, "loss": 0.0189, "step": 7038 }, { "epoch": 2.7219644238205722, "grad_norm": 0.35539246042802236, "learning_rate": 2.5938604998757e-07, "loss": 0.0217, "step": 7039 }, { "epoch": 2.7223511214230474, "grad_norm": 0.2189279875271406, "learning_rate": 2.586713150902981e-07, "loss": 0.017, "step": 7040 }, { "epoch": 2.722737819025522, "grad_norm": 0.20762509541080706, "learning_rate": 2.579575401242368e-07, "loss": 0.0141, "step": 7041 }, { "epoch": 2.7231245166279967, "grad_norm": 0.3170056787517989, "learning_rate": 2.572447252338972e-07, "loss": 0.0175, "step": 7042 }, { "epoch": 2.723511214230472, "grad_norm": 0.2455023781682499, "learning_rate": 2.565328705635961e-07, "loss": 0.0274, "step": 7043 }, { "epoch": 2.7238979118329465, "grad_norm": 0.23206813980093197, "learning_rate": 2.5582197625745486e-07, "loss": 0.0165, "step": 7044 }, { "epoch": 2.7242846094354216, "grad_norm": 0.22056499203077712, "learning_rate": 2.5511204245940336e-07, "loss": 0.0124, "step": 7045 }, { "epoch": 2.7246713070378963, "grad_norm": 0.2883395731540859, "learning_rate": 2.5440306931317324e-07, "loss": 0.017, "step": 7046 }, { "epoch": 2.7250580046403714, "grad_norm": 0.3440945404713343, "learning_rate": 2.536950569623048e-07, "loss": 0.0228, "step": 7047 }, { "epoch": 2.725444702242846, "grad_norm": 0.3029088660530426, "learning_rate": 2.5298800555014214e-07, "loss": 0.0193, "step": 7048 }, { "epoch": 2.7258313998453207, "grad_norm": 0.23425504578068107, "learning_rate": 2.522819152198347e-07, "loss": 0.0169, "step": 7049 }, { "epoch": 2.726218097447796, "grad_norm": 0.3251805420146074, "learning_rate": 2.515767861143381e-07, "loss": 0.0164, "step": 7050 }, { "epoch": 2.7266047950502705, "grad_norm": 0.22998534243134647, "learning_rate": 2.5087261837641264e-07, "loss": 0.0166, "step": 7051 }, { "epoch": 2.7269914926527457, "grad_norm": 0.3434329119497584, "learning_rate": 2.501694121486248e-07, "loss": 0.0181, "step": 7052 }, { "epoch": 2.7273781902552203, "grad_norm": 0.2955429652602655, "learning_rate": 2.494671675733462e-07, "loss": 0.0229, "step": 7053 }, { "epoch": 2.7277648878576954, "grad_norm": 0.24048377364252474, "learning_rate": 2.487658847927538e-07, "loss": 0.0148, "step": 7054 }, { "epoch": 2.72815158546017, "grad_norm": 0.22184666125963792, "learning_rate": 2.480655639488283e-07, "loss": 0.0174, "step": 7055 }, { "epoch": 2.728538283062645, "grad_norm": 0.26772674736959534, "learning_rate": 2.473662051833575e-07, "loss": 0.0168, "step": 7056 }, { "epoch": 2.72892498066512, "grad_norm": 0.24192638103953717, "learning_rate": 2.4666780863793483e-07, "loss": 0.0129, "step": 7057 }, { "epoch": 2.7293116782675946, "grad_norm": 0.27495487843126315, "learning_rate": 2.4597037445395657e-07, "loss": 0.0184, "step": 7058 }, { "epoch": 2.7296983758700697, "grad_norm": 0.38984271513336627, "learning_rate": 2.4527390277262644e-07, "loss": 0.0265, "step": 7059 }, { "epoch": 2.7300850734725444, "grad_norm": 0.2940410369298387, "learning_rate": 2.445783937349522e-07, "loss": 0.02, "step": 7060 }, { "epoch": 2.7304717710750195, "grad_norm": 0.21175369189286594, "learning_rate": 2.4388384748174566e-07, "loss": 0.0111, "step": 7061 }, { "epoch": 2.730858468677494, "grad_norm": 0.2931086791956513, "learning_rate": 2.431902641536266e-07, "loss": 0.0151, "step": 7062 }, { "epoch": 2.731245166279969, "grad_norm": 0.2816926629890432, "learning_rate": 2.424976438910176e-07, "loss": 0.0208, "step": 7063 }, { "epoch": 2.731631863882444, "grad_norm": 0.2542821907829915, "learning_rate": 2.418059868341466e-07, "loss": 0.0154, "step": 7064 }, { "epoch": 2.732018561484919, "grad_norm": 0.21480071309654467, "learning_rate": 2.4111529312304594e-07, "loss": 0.0108, "step": 7065 }, { "epoch": 2.7324052590873937, "grad_norm": 0.3262991011280757, "learning_rate": 2.404255628975555e-07, "loss": 0.0222, "step": 7066 }, { "epoch": 2.7327919566898684, "grad_norm": 0.27478763323198635, "learning_rate": 2.397367962973185e-07, "loss": 0.0171, "step": 7067 }, { "epoch": 2.7331786542923435, "grad_norm": 0.2636659436130475, "learning_rate": 2.3904899346178014e-07, "loss": 0.0192, "step": 7068 }, { "epoch": 2.733565351894818, "grad_norm": 0.24794497035546523, "learning_rate": 2.3836215453019562e-07, "loss": 0.0169, "step": 7069 }, { "epoch": 2.733952049497293, "grad_norm": 0.32169490554362323, "learning_rate": 2.3767627964162155e-07, "loss": 0.0215, "step": 7070 }, { "epoch": 2.734338747099768, "grad_norm": 0.3518387304174399, "learning_rate": 2.369913689349218e-07, "loss": 0.0249, "step": 7071 }, { "epoch": 2.734725444702243, "grad_norm": 0.2800967165252869, "learning_rate": 2.363074225487616e-07, "loss": 0.0185, "step": 7072 }, { "epoch": 2.7351121423047178, "grad_norm": 0.2121331005128204, "learning_rate": 2.356244406216146e-07, "loss": 0.0112, "step": 7073 }, { "epoch": 2.7354988399071924, "grad_norm": 0.2686978268907189, "learning_rate": 2.3494242329175741e-07, "loss": 0.023, "step": 7074 }, { "epoch": 2.7358855375096676, "grad_norm": 0.37560263914673553, "learning_rate": 2.3426137069727072e-07, "loss": 0.0241, "step": 7075 }, { "epoch": 2.7362722351121422, "grad_norm": 0.3095174774890971, "learning_rate": 2.3358128297604143e-07, "loss": 0.0284, "step": 7076 }, { "epoch": 2.736658932714617, "grad_norm": 0.2677046253526925, "learning_rate": 2.329021602657594e-07, "loss": 0.0145, "step": 7077 }, { "epoch": 2.737045630317092, "grad_norm": 0.3184440435046986, "learning_rate": 2.3222400270392131e-07, "loss": 0.0195, "step": 7078 }, { "epoch": 2.737432327919567, "grad_norm": 0.18331117274353276, "learning_rate": 2.315468104278268e-07, "loss": 0.0135, "step": 7079 }, { "epoch": 2.737819025522042, "grad_norm": 0.2930144045514897, "learning_rate": 2.3087058357458003e-07, "loss": 0.0226, "step": 7080 }, { "epoch": 2.7382057231245165, "grad_norm": 0.5761820784552644, "learning_rate": 2.3019532228109043e-07, "loss": 0.0326, "step": 7081 }, { "epoch": 2.7385924207269916, "grad_norm": 0.22430646313117086, "learning_rate": 2.295210266840714e-07, "loss": 0.0126, "step": 7082 }, { "epoch": 2.7389791183294663, "grad_norm": 0.2738121956775299, "learning_rate": 2.288476969200415e-07, "loss": 0.0115, "step": 7083 }, { "epoch": 2.739365815931941, "grad_norm": 0.26088706676252915, "learning_rate": 2.2817533312532335e-07, "loss": 0.0151, "step": 7084 }, { "epoch": 2.739752513534416, "grad_norm": 0.274879371261748, "learning_rate": 2.2750393543604367e-07, "loss": 0.0152, "step": 7085 }, { "epoch": 2.740139211136891, "grad_norm": 0.36977484861382, "learning_rate": 2.2683350398813363e-07, "loss": 0.0229, "step": 7086 }, { "epoch": 2.740525908739366, "grad_norm": 0.26199105167133174, "learning_rate": 2.261640389173303e-07, "loss": 0.0211, "step": 7087 }, { "epoch": 2.7409126063418405, "grad_norm": 0.22140350419120994, "learning_rate": 2.25495540359173e-07, "loss": 0.0131, "step": 7088 }, { "epoch": 2.7412993039443156, "grad_norm": 0.24309138226945756, "learning_rate": 2.248280084490051e-07, "loss": 0.0198, "step": 7089 }, { "epoch": 2.7416860015467903, "grad_norm": 0.2490748661500928, "learning_rate": 2.2416144332197743e-07, "loss": 0.0126, "step": 7090 }, { "epoch": 2.7420726991492654, "grad_norm": 0.2788982603579442, "learning_rate": 2.2349584511304089e-07, "loss": 0.0221, "step": 7091 }, { "epoch": 2.74245939675174, "grad_norm": 0.232136211430918, "learning_rate": 2.2283121395695495e-07, "loss": 0.0169, "step": 7092 }, { "epoch": 2.7428460943542152, "grad_norm": 0.2641885558430749, "learning_rate": 2.2216754998827917e-07, "loss": 0.0126, "step": 7093 }, { "epoch": 2.74323279195669, "grad_norm": 0.30606815178327534, "learning_rate": 2.215048533413805e-07, "loss": 0.0148, "step": 7094 }, { "epoch": 2.7436194895591646, "grad_norm": 0.29594446903022864, "learning_rate": 2.2084312415042886e-07, "loss": 0.0186, "step": 7095 }, { "epoch": 2.7440061871616397, "grad_norm": 0.19555359130020145, "learning_rate": 2.2018236254939708e-07, "loss": 0.0129, "step": 7096 }, { "epoch": 2.7443928847641144, "grad_norm": 0.3238177630046587, "learning_rate": 2.1952256867206368e-07, "loss": 0.0221, "step": 7097 }, { "epoch": 2.7447795823665895, "grad_norm": 0.2668136323468576, "learning_rate": 2.1886374265201072e-07, "loss": 0.0246, "step": 7098 }, { "epoch": 2.745166279969064, "grad_norm": 0.32169280976192316, "learning_rate": 2.1820588462262426e-07, "loss": 0.0203, "step": 7099 }, { "epoch": 2.7455529775715393, "grad_norm": 0.2178130458068604, "learning_rate": 2.1754899471709557e-07, "loss": 0.011, "step": 7100 }, { "epoch": 2.745939675174014, "grad_norm": 0.3045366828258106, "learning_rate": 2.1689307306841767e-07, "loss": 0.0185, "step": 7101 }, { "epoch": 2.7463263727764886, "grad_norm": 0.25974781498796606, "learning_rate": 2.1623811980938936e-07, "loss": 0.0185, "step": 7102 }, { "epoch": 2.7467130703789637, "grad_norm": 0.21483750521919143, "learning_rate": 2.1558413507261123e-07, "loss": 0.0197, "step": 7103 }, { "epoch": 2.7470997679814384, "grad_norm": 0.3428999395443835, "learning_rate": 2.1493111899049125e-07, "loss": 0.027, "step": 7104 }, { "epoch": 2.7474864655839135, "grad_norm": 0.2871701699393676, "learning_rate": 2.1427907169523864e-07, "loss": 0.0212, "step": 7105 }, { "epoch": 2.747873163186388, "grad_norm": 0.2467151764855831, "learning_rate": 2.1362799331886676e-07, "loss": 0.0201, "step": 7106 }, { "epoch": 2.7482598607888633, "grad_norm": 0.24423886976881648, "learning_rate": 2.1297788399319398e-07, "loss": 0.0186, "step": 7107 }, { "epoch": 2.748646558391338, "grad_norm": 0.23177737199576567, "learning_rate": 2.1232874384984003e-07, "loss": 0.0189, "step": 7108 }, { "epoch": 2.7490332559938127, "grad_norm": 0.2370731119000633, "learning_rate": 2.1168057302023203e-07, "loss": 0.0152, "step": 7109 }, { "epoch": 2.7494199535962878, "grad_norm": 0.3094690478347199, "learning_rate": 2.1103337163559778e-07, "loss": 0.0148, "step": 7110 }, { "epoch": 2.7498066511987624, "grad_norm": 0.2541524560001621, "learning_rate": 2.1038713982697022e-07, "loss": 0.012, "step": 7111 }, { "epoch": 2.7501933488012376, "grad_norm": 0.4255750185254184, "learning_rate": 2.0974187772518528e-07, "loss": 0.0156, "step": 7112 }, { "epoch": 2.7505800464037122, "grad_norm": 0.3007060390090155, "learning_rate": 2.0909758546088398e-07, "loss": 0.0168, "step": 7113 }, { "epoch": 2.7509667440061873, "grad_norm": 0.24179895588010866, "learning_rate": 2.0845426316450978e-07, "loss": 0.0167, "step": 7114 }, { "epoch": 2.751353441608662, "grad_norm": 2.2272976122737407, "learning_rate": 2.0781191096630792e-07, "loss": 0.0334, "step": 7115 }, { "epoch": 2.7517401392111367, "grad_norm": 0.34791038983078293, "learning_rate": 2.0717052899633161e-07, "loss": 0.0229, "step": 7116 }, { "epoch": 2.752126836813612, "grad_norm": 0.23071298515535327, "learning_rate": 2.0653011738443473e-07, "loss": 0.0129, "step": 7117 }, { "epoch": 2.7525135344160865, "grad_norm": 0.29055383979201327, "learning_rate": 2.0589067626027527e-07, "loss": 0.0228, "step": 7118 }, { "epoch": 2.7529002320185616, "grad_norm": 0.24570708424140883, "learning_rate": 2.052522057533135e-07, "loss": 0.0197, "step": 7119 }, { "epoch": 2.7532869296210363, "grad_norm": 0.1800360836689852, "learning_rate": 2.0461470599281663e-07, "loss": 0.0101, "step": 7120 }, { "epoch": 2.7536736272235114, "grad_norm": 0.46206825727883843, "learning_rate": 2.039781771078514e-07, "loss": 0.019, "step": 7121 }, { "epoch": 2.754060324825986, "grad_norm": 0.2880859218550195, "learning_rate": 2.0334261922729027e-07, "loss": 0.022, "step": 7122 }, { "epoch": 2.7544470224284607, "grad_norm": 0.27132131905258017, "learning_rate": 2.0270803247980864e-07, "loss": 0.0188, "step": 7123 }, { "epoch": 2.754833720030936, "grad_norm": 0.23119504230090301, "learning_rate": 2.0207441699388375e-07, "loss": 0.0134, "step": 7124 }, { "epoch": 2.7552204176334105, "grad_norm": 0.24526541448965203, "learning_rate": 2.0144177289780022e-07, "loss": 0.0185, "step": 7125 }, { "epoch": 2.7556071152358856, "grad_norm": 0.28297750074235384, "learning_rate": 2.0081010031964165e-07, "loss": 0.0159, "step": 7126 }, { "epoch": 2.7559938128383603, "grad_norm": 0.20480272303897398, "learning_rate": 2.0017939938729748e-07, "loss": 0.0191, "step": 7127 }, { "epoch": 2.7563805104408354, "grad_norm": 0.2617852657993775, "learning_rate": 1.9954967022845885e-07, "loss": 0.0193, "step": 7128 }, { "epoch": 2.75676720804331, "grad_norm": 0.26621816672566373, "learning_rate": 1.98920912970621e-07, "loss": 0.0135, "step": 7129 }, { "epoch": 2.7571539056457848, "grad_norm": 0.24686408090472056, "learning_rate": 1.982931277410838e-07, "loss": 0.0139, "step": 7130 }, { "epoch": 2.75754060324826, "grad_norm": 0.2944138219575255, "learning_rate": 1.976663146669472e-07, "loss": 0.0269, "step": 7131 }, { "epoch": 2.7579273008507346, "grad_norm": 0.25390778611881537, "learning_rate": 1.9704047387511692e-07, "loss": 0.0158, "step": 7132 }, { "epoch": 2.7583139984532097, "grad_norm": 0.2585824113264119, "learning_rate": 1.964156054923011e-07, "loss": 0.0186, "step": 7133 }, { "epoch": 2.7587006960556844, "grad_norm": 0.2678982104620923, "learning_rate": 1.9579170964500904e-07, "loss": 0.0176, "step": 7134 }, { "epoch": 2.7590873936581595, "grad_norm": 0.27144880798734383, "learning_rate": 1.9516878645955805e-07, "loss": 0.0266, "step": 7135 }, { "epoch": 2.759474091260634, "grad_norm": 0.24710063069543112, "learning_rate": 1.9454683606206225e-07, "loss": 0.0136, "step": 7136 }, { "epoch": 2.759860788863109, "grad_norm": 0.24841074827596718, "learning_rate": 1.9392585857844425e-07, "loss": 0.0185, "step": 7137 }, { "epoch": 2.760247486465584, "grad_norm": 0.2302978000065851, "learning_rate": 1.9330585413442571e-07, "loss": 0.0147, "step": 7138 }, { "epoch": 2.760634184068059, "grad_norm": 0.20301699255995073, "learning_rate": 1.926868228555351e-07, "loss": 0.0107, "step": 7139 }, { "epoch": 2.7610208816705337, "grad_norm": 0.26395276332708834, "learning_rate": 1.9206876486709991e-07, "loss": 0.0197, "step": 7140 }, { "epoch": 2.7614075792730084, "grad_norm": 0.26391793853176254, "learning_rate": 1.9145168029425177e-07, "loss": 0.0177, "step": 7141 }, { "epoch": 2.7617942768754835, "grad_norm": 0.34161828650964976, "learning_rate": 1.9083556926192792e-07, "loss": 0.0147, "step": 7142 }, { "epoch": 2.762180974477958, "grad_norm": 0.3473769678000978, "learning_rate": 1.902204318948653e-07, "loss": 0.0286, "step": 7143 }, { "epoch": 2.762567672080433, "grad_norm": 0.3421766529888227, "learning_rate": 1.896062683176053e-07, "loss": 0.031, "step": 7144 }, { "epoch": 2.762954369682908, "grad_norm": 0.29515897095647065, "learning_rate": 1.8899307865449023e-07, "loss": 0.0191, "step": 7145 }, { "epoch": 2.763341067285383, "grad_norm": 0.27207535064548055, "learning_rate": 1.8838086302966908e-07, "loss": 0.0136, "step": 7146 }, { "epoch": 2.7637277648878578, "grad_norm": 0.23395557549233598, "learning_rate": 1.8776962156708988e-07, "loss": 0.0157, "step": 7147 }, { "epoch": 2.7641144624903324, "grad_norm": 0.21393310565636978, "learning_rate": 1.8715935439050538e-07, "loss": 0.0139, "step": 7148 }, { "epoch": 2.7645011600928076, "grad_norm": 0.2723421376542955, "learning_rate": 1.8655006162346945e-07, "loss": 0.0168, "step": 7149 }, { "epoch": 2.7648878576952822, "grad_norm": 0.31030322347093253, "learning_rate": 1.8594174338934013e-07, "loss": 0.0198, "step": 7150 }, { "epoch": 2.765274555297757, "grad_norm": 0.2920714887227501, "learning_rate": 1.8533439981127887e-07, "loss": 0.0156, "step": 7151 }, { "epoch": 2.765661252900232, "grad_norm": 0.39133282527083185, "learning_rate": 1.847280310122479e-07, "loss": 0.0137, "step": 7152 }, { "epoch": 2.766047950502707, "grad_norm": 0.2531528129939521, "learning_rate": 1.8412263711501232e-07, "loss": 0.0218, "step": 7153 }, { "epoch": 2.766434648105182, "grad_norm": 0.2792566254435932, "learning_rate": 1.8351821824214132e-07, "loss": 0.0196, "step": 7154 }, { "epoch": 2.7668213457076565, "grad_norm": 0.29578558417771666, "learning_rate": 1.8291477451600482e-07, "loss": 0.0197, "step": 7155 }, { "epoch": 2.7672080433101316, "grad_norm": 0.2567766618182643, "learning_rate": 1.8231230605877837e-07, "loss": 0.0191, "step": 7156 }, { "epoch": 2.7675947409126063, "grad_norm": 0.2707031169163284, "learning_rate": 1.8171081299243498e-07, "loss": 0.0182, "step": 7157 }, { "epoch": 2.767981438515081, "grad_norm": 0.21277289664411556, "learning_rate": 1.8111029543875502e-07, "loss": 0.012, "step": 7158 }, { "epoch": 2.768368136117556, "grad_norm": 0.27453943526125263, "learning_rate": 1.8051075351931847e-07, "loss": 0.0194, "step": 7159 }, { "epoch": 2.768754833720031, "grad_norm": 0.2656107573809142, "learning_rate": 1.7991218735551097e-07, "loss": 0.0164, "step": 7160 }, { "epoch": 2.769141531322506, "grad_norm": 0.3263588908301449, "learning_rate": 1.7931459706851616e-07, "loss": 0.0264, "step": 7161 }, { "epoch": 2.7695282289249805, "grad_norm": 0.23279625222363512, "learning_rate": 1.7871798277932229e-07, "loss": 0.0153, "step": 7162 }, { "epoch": 2.7699149265274556, "grad_norm": 0.26070236025081384, "learning_rate": 1.7812234460872212e-07, "loss": 0.0144, "step": 7163 }, { "epoch": 2.7703016241299303, "grad_norm": 0.5850809070686982, "learning_rate": 1.77527682677307e-07, "loss": 0.0221, "step": 7164 }, { "epoch": 2.7706883217324054, "grad_norm": 0.27181110961854693, "learning_rate": 1.7693399710547342e-07, "loss": 0.0175, "step": 7165 }, { "epoch": 2.77107501933488, "grad_norm": 0.2830776951264371, "learning_rate": 1.7634128801341855e-07, "loss": 0.018, "step": 7166 }, { "epoch": 2.771461716937355, "grad_norm": 0.21954489432536284, "learning_rate": 1.7574955552114303e-07, "loss": 0.0162, "step": 7167 }, { "epoch": 2.77184841453983, "grad_norm": 0.2788031541829579, "learning_rate": 1.7515879974844885e-07, "loss": 0.0165, "step": 7168 }, { "epoch": 2.7722351121423046, "grad_norm": 0.28484400974120705, "learning_rate": 1.7456902081494088e-07, "loss": 0.0151, "step": 7169 }, { "epoch": 2.7726218097447797, "grad_norm": 0.24630670937139168, "learning_rate": 1.7398021884002637e-07, "loss": 0.0132, "step": 7170 }, { "epoch": 2.7730085073472543, "grad_norm": 0.2924987257832343, "learning_rate": 1.7339239394291273e-07, "loss": 0.0245, "step": 7171 }, { "epoch": 2.7733952049497295, "grad_norm": 0.3139766980380609, "learning_rate": 1.728055462426137e-07, "loss": 0.0244, "step": 7172 }, { "epoch": 2.773781902552204, "grad_norm": 0.232870710536494, "learning_rate": 1.7221967585794085e-07, "loss": 0.0213, "step": 7173 }, { "epoch": 2.7741686001546793, "grad_norm": 0.27499140276523176, "learning_rate": 1.7163478290751046e-07, "loss": 0.0162, "step": 7174 }, { "epoch": 2.774555297757154, "grad_norm": 0.2784036851482537, "learning_rate": 1.7105086750973943e-07, "loss": 0.0188, "step": 7175 }, { "epoch": 2.7749419953596286, "grad_norm": 0.3876731987970812, "learning_rate": 1.704679297828482e-07, "loss": 0.0363, "step": 7176 }, { "epoch": 2.7753286929621037, "grad_norm": 0.21006464616440412, "learning_rate": 1.698859698448585e-07, "loss": 0.0138, "step": 7177 }, { "epoch": 2.7757153905645784, "grad_norm": 0.26727865137283113, "learning_rate": 1.693049878135944e-07, "loss": 0.0153, "step": 7178 }, { "epoch": 2.7761020881670535, "grad_norm": 0.2814809115342595, "learning_rate": 1.6872498380668124e-07, "loss": 0.0195, "step": 7179 }, { "epoch": 2.776488785769528, "grad_norm": 0.2819903044392319, "learning_rate": 1.6814595794154732e-07, "loss": 0.0235, "step": 7180 }, { "epoch": 2.7768754833720033, "grad_norm": 0.312195543594607, "learning_rate": 1.6756791033542108e-07, "loss": 0.0151, "step": 7181 }, { "epoch": 2.777262180974478, "grad_norm": 0.2727211779278698, "learning_rate": 1.6699084110533725e-07, "loss": 0.0145, "step": 7182 }, { "epoch": 2.7776488785769526, "grad_norm": 0.25488509023641354, "learning_rate": 1.664147503681257e-07, "loss": 0.0143, "step": 7183 }, { "epoch": 2.7780355761794278, "grad_norm": 0.38523568284135984, "learning_rate": 1.6583963824042481e-07, "loss": 0.0203, "step": 7184 }, { "epoch": 2.7784222737819024, "grad_norm": 0.23796020148341004, "learning_rate": 1.6526550483867032e-07, "loss": 0.0174, "step": 7185 }, { "epoch": 2.7788089713843775, "grad_norm": 0.25405548850111637, "learning_rate": 1.6469235027910313e-07, "loss": 0.0136, "step": 7186 }, { "epoch": 2.779195668986852, "grad_norm": 0.2773576438704098, "learning_rate": 1.641201746777632e-07, "loss": 0.0135, "step": 7187 }, { "epoch": 2.7795823665893273, "grad_norm": 0.23337035720461585, "learning_rate": 1.6354897815049288e-07, "loss": 0.0203, "step": 7188 }, { "epoch": 2.779969064191802, "grad_norm": 0.23809343686570042, "learning_rate": 1.6297876081293796e-07, "loss": 0.014, "step": 7189 }, { "epoch": 2.7803557617942767, "grad_norm": 0.32506844037933114, "learning_rate": 1.6240952278054446e-07, "loss": 0.0234, "step": 7190 }, { "epoch": 2.780742459396752, "grad_norm": 0.2602575872481886, "learning_rate": 1.6184126416856128e-07, "loss": 0.0211, "step": 7191 }, { "epoch": 2.7811291569992265, "grad_norm": 0.32051120161657287, "learning_rate": 1.612739850920364e-07, "loss": 0.0251, "step": 7192 }, { "epoch": 2.7815158546017016, "grad_norm": 0.2301425069952596, "learning_rate": 1.6070768566582296e-07, "loss": 0.014, "step": 7193 }, { "epoch": 2.7819025522041763, "grad_norm": 0.3290307979696232, "learning_rate": 1.6014236600457367e-07, "loss": 0.0232, "step": 7194 }, { "epoch": 2.7822892498066514, "grad_norm": 0.2881622063025656, "learning_rate": 1.5957802622274366e-07, "loss": 0.0197, "step": 7195 }, { "epoch": 2.782675947409126, "grad_norm": 0.20317934604316384, "learning_rate": 1.590146664345893e-07, "loss": 0.0118, "step": 7196 }, { "epoch": 2.7830626450116007, "grad_norm": 0.25479618003042176, "learning_rate": 1.584522867541677e-07, "loss": 0.0142, "step": 7197 }, { "epoch": 2.783449342614076, "grad_norm": 0.30244754932069096, "learning_rate": 1.5789088729533996e-07, "loss": 0.0164, "step": 7198 }, { "epoch": 2.7838360402165505, "grad_norm": 0.27864361598745774, "learning_rate": 1.5733046817176578e-07, "loss": 0.0228, "step": 7199 }, { "epoch": 2.7842227378190256, "grad_norm": 0.29618466855784237, "learning_rate": 1.5677102949690936e-07, "loss": 0.0261, "step": 7200 }, { "epoch": 2.7846094354215003, "grad_norm": 0.4178154833736528, "learning_rate": 1.5621257138403344e-07, "loss": 0.0275, "step": 7201 }, { "epoch": 2.7849961330239754, "grad_norm": 0.7776799931999911, "learning_rate": 1.5565509394620372e-07, "loss": 0.0176, "step": 7202 }, { "epoch": 2.78538283062645, "grad_norm": 0.22634215789129808, "learning_rate": 1.550985972962893e-07, "loss": 0.0161, "step": 7203 }, { "epoch": 2.7857695282289248, "grad_norm": 0.2405447675567445, "learning_rate": 1.5454308154695619e-07, "loss": 0.0177, "step": 7204 }, { "epoch": 2.7861562258314, "grad_norm": 0.27469625482295057, "learning_rate": 1.5398854681067555e-07, "loss": 0.0164, "step": 7205 }, { "epoch": 2.7865429234338746, "grad_norm": 0.31396134750788013, "learning_rate": 1.534349931997181e-07, "loss": 0.0159, "step": 7206 }, { "epoch": 2.7869296210363497, "grad_norm": 0.19285228163358267, "learning_rate": 1.528824208261581e-07, "loss": 0.0132, "step": 7207 }, { "epoch": 2.7873163186388243, "grad_norm": 0.18408536217009333, "learning_rate": 1.5233082980186775e-07, "loss": 0.0163, "step": 7208 }, { "epoch": 2.7877030162412995, "grad_norm": 0.2277151813211753, "learning_rate": 1.5178022023852213e-07, "loss": 0.0207, "step": 7209 }, { "epoch": 2.788089713843774, "grad_norm": 0.26801955964020213, "learning_rate": 1.5123059224759984e-07, "loss": 0.0228, "step": 7210 }, { "epoch": 2.788476411446249, "grad_norm": 0.2809090442997923, "learning_rate": 1.5068194594037745e-07, "loss": 0.0207, "step": 7211 }, { "epoch": 2.788863109048724, "grad_norm": 0.23783722411707564, "learning_rate": 1.501342814279344e-07, "loss": 0.0107, "step": 7212 }, { "epoch": 2.789249806651199, "grad_norm": 0.25732245380834845, "learning_rate": 1.4958759882115093e-07, "loss": 0.0205, "step": 7213 }, { "epoch": 2.7896365042536737, "grad_norm": 0.2470243492088019, "learning_rate": 1.4904189823070792e-07, "loss": 0.0204, "step": 7214 }, { "epoch": 2.7900232018561484, "grad_norm": 0.26318409781572843, "learning_rate": 1.4849717976708976e-07, "loss": 0.0132, "step": 7215 }, { "epoch": 2.7904098994586235, "grad_norm": 0.2700269102686877, "learning_rate": 1.4795344354057883e-07, "loss": 0.0184, "step": 7216 }, { "epoch": 2.790796597061098, "grad_norm": 0.2729901271081971, "learning_rate": 1.4741068966126148e-07, "loss": 0.0221, "step": 7217 }, { "epoch": 2.791183294663573, "grad_norm": 0.28651966723723665, "learning_rate": 1.468689182390226e-07, "loss": 0.0186, "step": 7218 }, { "epoch": 2.791569992266048, "grad_norm": 0.2575252281882917, "learning_rate": 1.4632812938355002e-07, "loss": 0.0157, "step": 7219 }, { "epoch": 2.791956689868523, "grad_norm": 0.2780134644102377, "learning_rate": 1.457883232043328e-07, "loss": 0.0186, "step": 7220 }, { "epoch": 2.7923433874709978, "grad_norm": 0.25249252969638175, "learning_rate": 1.4524949981065907e-07, "loss": 0.0122, "step": 7221 }, { "epoch": 2.7927300850734724, "grad_norm": 0.3082855897780723, "learning_rate": 1.4471165931161991e-07, "loss": 0.0141, "step": 7222 }, { "epoch": 2.7931167826759475, "grad_norm": 0.25225848031937725, "learning_rate": 1.4417480181610544e-07, "loss": 0.0222, "step": 7223 }, { "epoch": 2.793503480278422, "grad_norm": 0.22518400438886724, "learning_rate": 1.4363892743281095e-07, "loss": 0.0145, "step": 7224 }, { "epoch": 2.793890177880897, "grad_norm": 0.2962253683030752, "learning_rate": 1.4310403627022628e-07, "loss": 0.018, "step": 7225 }, { "epoch": 2.794276875483372, "grad_norm": 0.24739030328588055, "learning_rate": 1.4257012843664818e-07, "loss": 0.0138, "step": 7226 }, { "epoch": 2.794663573085847, "grad_norm": 0.23380457081340314, "learning_rate": 1.420372040401713e-07, "loss": 0.0161, "step": 7227 }, { "epoch": 2.795050270688322, "grad_norm": 0.21044494699297062, "learning_rate": 1.4150526318869096e-07, "loss": 0.0153, "step": 7228 }, { "epoch": 2.7954369682907965, "grad_norm": 0.19866352182288605, "learning_rate": 1.409743059899049e-07, "loss": 0.0117, "step": 7229 }, { "epoch": 2.7958236658932716, "grad_norm": 0.29498800896479377, "learning_rate": 1.4044433255130942e-07, "loss": 0.0207, "step": 7230 }, { "epoch": 2.7962103634957463, "grad_norm": 0.27315858664535214, "learning_rate": 1.3991534298020526e-07, "loss": 0.0211, "step": 7231 }, { "epoch": 2.796597061098221, "grad_norm": 0.24236194077676118, "learning_rate": 1.3938733738369127e-07, "loss": 0.0181, "step": 7232 }, { "epoch": 2.796983758700696, "grad_norm": 0.23911475254565986, "learning_rate": 1.3886031586866688e-07, "loss": 0.0164, "step": 7233 }, { "epoch": 2.797370456303171, "grad_norm": 0.26765992467313665, "learning_rate": 1.3833427854183345e-07, "loss": 0.02, "step": 7234 }, { "epoch": 2.797757153905646, "grad_norm": 0.23074802527724222, "learning_rate": 1.3780922550969245e-07, "loss": 0.013, "step": 7235 }, { "epoch": 2.7981438515081205, "grad_norm": 0.26579154021246787, "learning_rate": 1.372851568785466e-07, "loss": 0.0256, "step": 7236 }, { "epoch": 2.7985305491105956, "grad_norm": 0.2954247346088978, "learning_rate": 1.3676207275449993e-07, "loss": 0.0182, "step": 7237 }, { "epoch": 2.7989172467130703, "grad_norm": 0.2959017696418239, "learning_rate": 1.3623997324345438e-07, "loss": 0.0242, "step": 7238 }, { "epoch": 2.799303944315545, "grad_norm": 0.2614109593254945, "learning_rate": 1.3571885845111544e-07, "loss": 0.0192, "step": 7239 }, { "epoch": 2.79969064191802, "grad_norm": 0.21941518390932946, "learning_rate": 1.3519872848298865e-07, "loss": 0.0196, "step": 7240 }, { "epoch": 2.800077339520495, "grad_norm": 0.25253784079372, "learning_rate": 1.3467958344437925e-07, "loss": 0.0179, "step": 7241 }, { "epoch": 2.80046403712297, "grad_norm": 0.24525571670377833, "learning_rate": 1.341614234403932e-07, "loss": 0.0147, "step": 7242 }, { "epoch": 2.8008507347254445, "grad_norm": 0.2475515937836189, "learning_rate": 1.3364424857593762e-07, "loss": 0.0118, "step": 7243 }, { "epoch": 2.8012374323279197, "grad_norm": 0.3005053341051803, "learning_rate": 1.331280589557199e-07, "loss": 0.023, "step": 7244 }, { "epoch": 2.8016241299303943, "grad_norm": 0.19960420504573645, "learning_rate": 1.3261285468424867e-07, "loss": 0.0158, "step": 7245 }, { "epoch": 2.8020108275328695, "grad_norm": 0.22715865466964275, "learning_rate": 1.3209863586583215e-07, "loss": 0.0162, "step": 7246 }, { "epoch": 2.802397525135344, "grad_norm": 0.26598940090649864, "learning_rate": 1.3158540260457763e-07, "loss": 0.0161, "step": 7247 }, { "epoch": 2.8027842227378192, "grad_norm": 0.2463120571501944, "learning_rate": 1.3107315500439642e-07, "loss": 0.0189, "step": 7248 }, { "epoch": 2.803170920340294, "grad_norm": 0.30669324753059773, "learning_rate": 1.3056189316899724e-07, "loss": 0.0209, "step": 7249 }, { "epoch": 2.8035576179427686, "grad_norm": 0.22089063259121428, "learning_rate": 1.300516172018923e-07, "loss": 0.0124, "step": 7250 }, { "epoch": 2.8039443155452437, "grad_norm": 0.3201460869959806, "learning_rate": 1.2954232720638948e-07, "loss": 0.0271, "step": 7251 }, { "epoch": 2.8043310131477184, "grad_norm": 0.3261429297510175, "learning_rate": 1.2903402328560132e-07, "loss": 0.0237, "step": 7252 }, { "epoch": 2.8047177107501935, "grad_norm": 0.2568582880442241, "learning_rate": 1.2852670554243996e-07, "loss": 0.0138, "step": 7253 }, { "epoch": 2.805104408352668, "grad_norm": 0.2762942409259895, "learning_rate": 1.2802037407961542e-07, "loss": 0.016, "step": 7254 }, { "epoch": 2.8054911059551433, "grad_norm": 0.5214999599913134, "learning_rate": 1.275150289996413e-07, "loss": 0.02, "step": 7255 }, { "epoch": 2.805877803557618, "grad_norm": 0.39249393174026126, "learning_rate": 1.270106704048285e-07, "loss": 0.0248, "step": 7256 }, { "epoch": 2.8062645011600926, "grad_norm": 0.26973610504516354, "learning_rate": 1.2650729839729094e-07, "loss": 0.0126, "step": 7257 }, { "epoch": 2.8066511987625677, "grad_norm": 0.22341876350241707, "learning_rate": 1.2600491307894093e-07, "loss": 0.0177, "step": 7258 }, { "epoch": 2.8070378963650424, "grad_norm": 0.2212133740139875, "learning_rate": 1.2550351455149158e-07, "loss": 0.0119, "step": 7259 }, { "epoch": 2.8074245939675175, "grad_norm": 0.2639141788928629, "learning_rate": 1.250031029164561e-07, "loss": 0.0177, "step": 7260 }, { "epoch": 2.807811291569992, "grad_norm": 0.36062024092702233, "learning_rate": 1.2450367827514797e-07, "loss": 0.0283, "step": 7261 }, { "epoch": 2.8081979891724673, "grad_norm": 0.3001216027402409, "learning_rate": 1.240052407286818e-07, "loss": 0.0174, "step": 7262 }, { "epoch": 2.808584686774942, "grad_norm": 0.4101719907942861, "learning_rate": 1.2350779037797077e-07, "loss": 0.0202, "step": 7263 }, { "epoch": 2.8089713843774167, "grad_norm": 0.2534521065923077, "learning_rate": 1.2301132732372878e-07, "loss": 0.0177, "step": 7264 }, { "epoch": 2.809358081979892, "grad_norm": 0.2182460276394026, "learning_rate": 1.225158516664693e-07, "loss": 0.0137, "step": 7265 }, { "epoch": 2.8097447795823665, "grad_norm": 0.2677223836471821, "learning_rate": 1.2202136350650818e-07, "loss": 0.0187, "step": 7266 }, { "epoch": 2.8101314771848416, "grad_norm": 0.26255323114578805, "learning_rate": 1.215278629439587e-07, "loss": 0.0219, "step": 7267 }, { "epoch": 2.8105181747873162, "grad_norm": 0.36012957815593744, "learning_rate": 1.2103535007873478e-07, "loss": 0.0363, "step": 7268 }, { "epoch": 2.8109048723897914, "grad_norm": 0.22722758471892157, "learning_rate": 1.2054382501055117e-07, "loss": 0.0173, "step": 7269 }, { "epoch": 2.811291569992266, "grad_norm": 0.34976752543366313, "learning_rate": 1.200532878389221e-07, "loss": 0.0202, "step": 7270 }, { "epoch": 2.8116782675947407, "grad_norm": 0.17497037294136536, "learning_rate": 1.1956373866316316e-07, "loss": 0.0097, "step": 7271 }, { "epoch": 2.812064965197216, "grad_norm": 0.19193585685522768, "learning_rate": 1.1907517758238673e-07, "loss": 0.0127, "step": 7272 }, { "epoch": 2.8124516627996905, "grad_norm": 0.24142231537436804, "learning_rate": 1.1858760469550812e-07, "loss": 0.0152, "step": 7273 }, { "epoch": 2.8128383604021656, "grad_norm": 0.24163860097486337, "learning_rate": 1.1810102010124114e-07, "loss": 0.0197, "step": 7274 }, { "epoch": 2.8132250580046403, "grad_norm": 0.2351863853303676, "learning_rate": 1.1761542389810032e-07, "loss": 0.014, "step": 7275 }, { "epoch": 2.8136117556071154, "grad_norm": 0.2598565973598723, "learning_rate": 1.1713081618439981e-07, "loss": 0.0231, "step": 7276 }, { "epoch": 2.81399845320959, "grad_norm": 0.33197136758208334, "learning_rate": 1.1664719705825167e-07, "loss": 0.0184, "step": 7277 }, { "epoch": 2.8143851508120648, "grad_norm": 0.2396028994468798, "learning_rate": 1.1616456661757202e-07, "loss": 0.0184, "step": 7278 }, { "epoch": 2.81477184841454, "grad_norm": 0.2691767341925774, "learning_rate": 1.1568292496007383e-07, "loss": 0.0154, "step": 7279 }, { "epoch": 2.8151585460170145, "grad_norm": 0.32888734573851436, "learning_rate": 1.1520227218326907e-07, "loss": 0.0224, "step": 7280 }, { "epoch": 2.8155452436194897, "grad_norm": 0.24227555003470203, "learning_rate": 1.1472260838447268e-07, "loss": 0.0165, "step": 7281 }, { "epoch": 2.8159319412219643, "grad_norm": 0.25376197664840794, "learning_rate": 1.1424393366079589e-07, "loss": 0.0151, "step": 7282 }, { "epoch": 2.8163186388244394, "grad_norm": 0.41916664093184175, "learning_rate": 1.1376624810915227e-07, "loss": 0.0162, "step": 7283 }, { "epoch": 2.816705336426914, "grad_norm": 0.2946891234234371, "learning_rate": 1.1328955182625501e-07, "loss": 0.0159, "step": 7284 }, { "epoch": 2.817092034029389, "grad_norm": 0.26535039741948047, "learning_rate": 1.1281384490861469e-07, "loss": 0.0212, "step": 7285 }, { "epoch": 2.817478731631864, "grad_norm": 0.22531347374097516, "learning_rate": 1.1233912745254427e-07, "loss": 0.0185, "step": 7286 }, { "epoch": 2.8178654292343386, "grad_norm": 0.3164415045021424, "learning_rate": 1.1186539955415354e-07, "loss": 0.0195, "step": 7287 }, { "epoch": 2.8182521268368137, "grad_norm": 0.16953724773668843, "learning_rate": 1.1139266130935634e-07, "loss": 0.0092, "step": 7288 }, { "epoch": 2.8186388244392884, "grad_norm": 0.3841902791268925, "learning_rate": 1.1092091281386108e-07, "loss": 0.0146, "step": 7289 }, { "epoch": 2.8190255220417635, "grad_norm": 0.2671298601099069, "learning_rate": 1.1045015416317917e-07, "loss": 0.0229, "step": 7290 }, { "epoch": 2.819412219644238, "grad_norm": 0.2831695760689637, "learning_rate": 1.0998038545261991e-07, "loss": 0.0222, "step": 7291 }, { "epoch": 2.819798917246713, "grad_norm": 0.3027500144037677, "learning_rate": 1.0951160677729444e-07, "loss": 0.02, "step": 7292 }, { "epoch": 2.820185614849188, "grad_norm": 0.2093137325130658, "learning_rate": 1.0904381823211018e-07, "loss": 0.0109, "step": 7293 }, { "epoch": 2.820572312451663, "grad_norm": 0.24278509994354577, "learning_rate": 1.0857701991177638e-07, "loss": 0.0203, "step": 7294 }, { "epoch": 2.8209590100541377, "grad_norm": 0.2900412453113828, "learning_rate": 1.0811121191080076e-07, "loss": 0.0236, "step": 7295 }, { "epoch": 2.8213457076566124, "grad_norm": 0.20541892280097174, "learning_rate": 1.0764639432349233e-07, "loss": 0.0143, "step": 7296 }, { "epoch": 2.8217324052590875, "grad_norm": 0.25659804120071505, "learning_rate": 1.0718256724395693e-07, "loss": 0.0153, "step": 7297 }, { "epoch": 2.822119102861562, "grad_norm": 0.277202600568814, "learning_rate": 1.067197307661011e-07, "loss": 0.0163, "step": 7298 }, { "epoch": 2.822505800464037, "grad_norm": 0.2898408413057896, "learning_rate": 1.0625788498363154e-07, "loss": 0.0207, "step": 7299 }, { "epoch": 2.822892498066512, "grad_norm": 0.2644954321057248, "learning_rate": 1.0579702999005404e-07, "loss": 0.015, "step": 7300 }, { "epoch": 2.823279195668987, "grad_norm": 0.23171470423973967, "learning_rate": 1.0533716587867282e-07, "loss": 0.0179, "step": 7301 }, { "epoch": 2.823665893271462, "grad_norm": 0.26482340449148134, "learning_rate": 1.0487829274259231e-07, "loss": 0.0178, "step": 7302 }, { "epoch": 2.8240525908739365, "grad_norm": 0.29032407261089277, "learning_rate": 1.0442041067471542e-07, "loss": 0.025, "step": 7303 }, { "epoch": 2.8244392884764116, "grad_norm": 0.24678513346029712, "learning_rate": 1.0396351976774633e-07, "loss": 0.0186, "step": 7304 }, { "epoch": 2.8248259860788862, "grad_norm": 0.2957129299224266, "learning_rate": 1.0350762011418714e-07, "loss": 0.0228, "step": 7305 }, { "epoch": 2.825212683681361, "grad_norm": 0.3304477833637613, "learning_rate": 1.0305271180633902e-07, "loss": 0.0217, "step": 7306 }, { "epoch": 2.825599381283836, "grad_norm": 0.20999902072205343, "learning_rate": 1.0259879493630275e-07, "loss": 0.0137, "step": 7307 }, { "epoch": 2.825986078886311, "grad_norm": 0.3682209674611798, "learning_rate": 1.0214586959597872e-07, "loss": 0.0193, "step": 7308 }, { "epoch": 2.826372776488786, "grad_norm": 0.3263980435073142, "learning_rate": 1.0169393587706688e-07, "loss": 0.0163, "step": 7309 }, { "epoch": 2.8267594740912605, "grad_norm": 0.22929747896738573, "learning_rate": 1.0124299387106574e-07, "loss": 0.0133, "step": 7310 }, { "epoch": 2.8271461716937356, "grad_norm": 0.26224133021815554, "learning_rate": 1.0079304366927222e-07, "loss": 0.0158, "step": 7311 }, { "epoch": 2.8275328692962103, "grad_norm": 0.2168373568429159, "learning_rate": 1.0034408536278406e-07, "loss": 0.0164, "step": 7312 }, { "epoch": 2.827919566898685, "grad_norm": 0.2827761864292659, "learning_rate": 9.989611904249796e-08, "loss": 0.0206, "step": 7313 }, { "epoch": 2.82830626450116, "grad_norm": 0.22472096838665684, "learning_rate": 9.944914479910971e-08, "loss": 0.0127, "step": 7314 }, { "epoch": 2.828692962103635, "grad_norm": 0.3039170405855271, "learning_rate": 9.900316272311194e-08, "loss": 0.0241, "step": 7315 }, { "epoch": 2.82907965970611, "grad_norm": 0.24683543529747398, "learning_rate": 9.855817290480019e-08, "loss": 0.0144, "step": 7316 }, { "epoch": 2.8294663573085845, "grad_norm": 0.27293737829127096, "learning_rate": 9.811417543426626e-08, "loss": 0.0265, "step": 7317 }, { "epoch": 2.8298530549110597, "grad_norm": 0.2674349297288932, "learning_rate": 9.767117040140273e-08, "loss": 0.0164, "step": 7318 }, { "epoch": 2.8302397525135343, "grad_norm": 0.24126496527161015, "learning_rate": 9.722915789590004e-08, "loss": 0.0187, "step": 7319 }, { "epoch": 2.8306264501160094, "grad_norm": 0.22334994748505127, "learning_rate": 9.678813800724885e-08, "loss": 0.0161, "step": 7320 }, { "epoch": 2.831013147718484, "grad_norm": 0.21317885175408594, "learning_rate": 9.634811082473716e-08, "loss": 0.0158, "step": 7321 }, { "epoch": 2.8313998453209592, "grad_norm": 0.20042299216845658, "learning_rate": 9.590907643745428e-08, "loss": 0.0119, "step": 7322 }, { "epoch": 2.831786542923434, "grad_norm": 0.21107016346241156, "learning_rate": 9.547103493428633e-08, "loss": 0.0118, "step": 7323 }, { "epoch": 2.8321732405259086, "grad_norm": 0.23304638120406126, "learning_rate": 9.503398640391959e-08, "loss": 0.0118, "step": 7324 }, { "epoch": 2.8325599381283837, "grad_norm": 0.2604387093971901, "learning_rate": 9.459793093483938e-08, "loss": 0.0185, "step": 7325 }, { "epoch": 2.8329466357308584, "grad_norm": 0.23360492053743112, "learning_rate": 9.416286861532952e-08, "loss": 0.0163, "step": 7326 }, { "epoch": 2.8333333333333335, "grad_norm": 0.36148627151815743, "learning_rate": 9.372879953347291e-08, "loss": 0.0186, "step": 7327 }, { "epoch": 2.833720030935808, "grad_norm": 0.27819075736350535, "learning_rate": 9.329572377715091e-08, "loss": 0.0184, "step": 7328 }, { "epoch": 2.8341067285382833, "grad_norm": 0.25950228372997197, "learning_rate": 9.286364143404447e-08, "loss": 0.0212, "step": 7329 }, { "epoch": 2.834493426140758, "grad_norm": 0.282639448795754, "learning_rate": 9.243255259163363e-08, "loss": 0.0213, "step": 7330 }, { "epoch": 2.8348801237432326, "grad_norm": 0.23703965332775426, "learning_rate": 9.200245733719637e-08, "loss": 0.0147, "step": 7331 }, { "epoch": 2.8352668213457077, "grad_norm": 0.29678963427389277, "learning_rate": 9.157335575781023e-08, "loss": 0.0229, "step": 7332 }, { "epoch": 2.8356535189481824, "grad_norm": 0.25546743826486884, "learning_rate": 9.114524794035074e-08, "loss": 0.0152, "step": 7333 }, { "epoch": 2.8360402165506575, "grad_norm": 0.22730363034153486, "learning_rate": 9.071813397149243e-08, "loss": 0.015, "step": 7334 }, { "epoch": 2.836426914153132, "grad_norm": 0.23204012045003505, "learning_rate": 9.029201393771114e-08, "loss": 0.0167, "step": 7335 }, { "epoch": 2.8368136117556073, "grad_norm": 0.2525184993045604, "learning_rate": 8.98668879252762e-08, "loss": 0.0148, "step": 7336 }, { "epoch": 2.837200309358082, "grad_norm": 0.27626553204400706, "learning_rate": 8.944275602026098e-08, "loss": 0.02, "step": 7337 }, { "epoch": 2.8375870069605567, "grad_norm": 0.26709718038195956, "learning_rate": 8.901961830853456e-08, "loss": 0.0206, "step": 7338 }, { "epoch": 2.8379737045630318, "grad_norm": 0.2594769452101678, "learning_rate": 8.859747487576619e-08, "loss": 0.0213, "step": 7339 }, { "epoch": 2.8383604021655064, "grad_norm": 0.27442845222112794, "learning_rate": 8.817632580742252e-08, "loss": 0.0224, "step": 7340 }, { "epoch": 2.8387470997679816, "grad_norm": 0.3551934983946224, "learning_rate": 8.775617118876923e-08, "loss": 0.0168, "step": 7341 }, { "epoch": 2.8391337973704562, "grad_norm": 0.2618373762914255, "learning_rate": 8.733701110487214e-08, "loss": 0.0252, "step": 7342 }, { "epoch": 2.8395204949729314, "grad_norm": 0.2817449919013379, "learning_rate": 8.691884564059338e-08, "loss": 0.0192, "step": 7343 }, { "epoch": 2.839907192575406, "grad_norm": 0.2510847609806304, "learning_rate": 8.650167488059579e-08, "loss": 0.0116, "step": 7344 }, { "epoch": 2.8402938901778807, "grad_norm": 0.2859646843490237, "learning_rate": 8.608549890933959e-08, "loss": 0.0232, "step": 7345 }, { "epoch": 2.840680587780356, "grad_norm": 0.22522313557369056, "learning_rate": 8.567031781108348e-08, "loss": 0.0174, "step": 7346 }, { "epoch": 2.8410672853828305, "grad_norm": 0.22747036803321175, "learning_rate": 8.525613166988633e-08, "loss": 0.0252, "step": 7347 }, { "epoch": 2.8414539829853056, "grad_norm": 0.2210311845422131, "learning_rate": 8.484294056960329e-08, "loss": 0.0117, "step": 7348 }, { "epoch": 2.8418406805877803, "grad_norm": 0.45083892951229465, "learning_rate": 8.443074459388967e-08, "loss": 0.0198, "step": 7349 }, { "epoch": 2.8422273781902554, "grad_norm": 0.19713785334203404, "learning_rate": 8.401954382619871e-08, "loss": 0.011, "step": 7350 }, { "epoch": 2.84261407579273, "grad_norm": 0.29911127956107625, "learning_rate": 8.360933834978269e-08, "loss": 0.0194, "step": 7351 }, { "epoch": 2.8430007733952047, "grad_norm": 0.21563244717986663, "learning_rate": 8.32001282476913e-08, "loss": 0.018, "step": 7352 }, { "epoch": 2.84338747099768, "grad_norm": 0.3290294601886272, "learning_rate": 8.279191360277384e-08, "loss": 0.0149, "step": 7353 }, { "epoch": 2.8437741686001545, "grad_norm": 0.2795760056393693, "learning_rate": 8.238469449767694e-08, "loss": 0.018, "step": 7354 }, { "epoch": 2.8441608662026296, "grad_norm": 0.3516488997409393, "learning_rate": 8.197847101484691e-08, "loss": 0.0181, "step": 7355 }, { "epoch": 2.8445475638051043, "grad_norm": 0.19396636254349872, "learning_rate": 8.157324323652849e-08, "loss": 0.0106, "step": 7356 }, { "epoch": 2.8449342614075794, "grad_norm": 0.30114901511058334, "learning_rate": 8.116901124476329e-08, "loss": 0.0219, "step": 7357 }, { "epoch": 2.845320959010054, "grad_norm": 0.2409058973209165, "learning_rate": 8.07657751213925e-08, "loss": 0.0164, "step": 7358 }, { "epoch": 2.845707656612529, "grad_norm": 0.2931890505363018, "learning_rate": 8.036353494805527e-08, "loss": 0.0164, "step": 7359 }, { "epoch": 2.846094354215004, "grad_norm": 0.2261059734061271, "learning_rate": 7.996229080619033e-08, "loss": 0.0168, "step": 7360 }, { "epoch": 2.8464810518174786, "grad_norm": 0.39590151957296976, "learning_rate": 7.956204277703216e-08, "loss": 0.0292, "step": 7361 }, { "epoch": 2.8468677494199537, "grad_norm": 0.2695274587946133, "learning_rate": 7.916279094161594e-08, "loss": 0.0147, "step": 7362 }, { "epoch": 2.8472544470224284, "grad_norm": 0.2112878325000143, "learning_rate": 7.876453538077478e-08, "loss": 0.0142, "step": 7363 }, { "epoch": 2.8476411446249035, "grad_norm": 0.290635110489018, "learning_rate": 7.836727617513917e-08, "loss": 0.0249, "step": 7364 }, { "epoch": 2.848027842227378, "grad_norm": 0.2447571117646085, "learning_rate": 7.797101340513813e-08, "loss": 0.0147, "step": 7365 }, { "epoch": 2.848414539829853, "grad_norm": 0.25132117220021327, "learning_rate": 7.757574715099969e-08, "loss": 0.0224, "step": 7366 }, { "epoch": 2.848801237432328, "grad_norm": 0.24183086319838631, "learning_rate": 7.718147749274873e-08, "loss": 0.0145, "step": 7367 }, { "epoch": 2.849187935034803, "grad_norm": 0.23113782684362078, "learning_rate": 7.678820451021085e-08, "loss": 0.0162, "step": 7368 }, { "epoch": 2.8495746326372777, "grad_norm": 0.9302032680143149, "learning_rate": 7.639592828300679e-08, "loss": 0.0192, "step": 7369 }, { "epoch": 2.8499613302397524, "grad_norm": 0.2997942764350811, "learning_rate": 7.600464889055748e-08, "loss": 0.0227, "step": 7370 }, { "epoch": 2.8503480278422275, "grad_norm": 0.19819668141943647, "learning_rate": 7.561436641208064e-08, "loss": 0.0099, "step": 7371 }, { "epoch": 2.850734725444702, "grad_norm": 0.34474142783832584, "learning_rate": 7.522508092659475e-08, "loss": 0.0225, "step": 7372 }, { "epoch": 2.851121423047177, "grad_norm": 0.2771242287280219, "learning_rate": 7.483679251291343e-08, "loss": 0.0197, "step": 7373 }, { "epoch": 2.851508120649652, "grad_norm": 0.2708726607574833, "learning_rate": 7.44495012496499e-08, "loss": 0.0164, "step": 7374 }, { "epoch": 2.851894818252127, "grad_norm": 0.20607312755845983, "learning_rate": 7.406320721521587e-08, "loss": 0.0132, "step": 7375 }, { "epoch": 2.8522815158546018, "grad_norm": 0.2327891553913066, "learning_rate": 7.367791048781936e-08, "loss": 0.0131, "step": 7376 }, { "epoch": 2.8526682134570764, "grad_norm": 0.28274922950116854, "learning_rate": 7.329361114546907e-08, "loss": 0.0136, "step": 7377 }, { "epoch": 2.8530549110595516, "grad_norm": 0.3134415900448523, "learning_rate": 7.291030926596942e-08, "loss": 0.0165, "step": 7378 }, { "epoch": 2.8534416086620262, "grad_norm": 0.22873378748714632, "learning_rate": 7.252800492692447e-08, "loss": 0.0149, "step": 7379 }, { "epoch": 2.853828306264501, "grad_norm": 0.2145567840738607, "learning_rate": 7.214669820573561e-08, "loss": 0.0121, "step": 7380 }, { "epoch": 2.854215003866976, "grad_norm": 0.2816447178033944, "learning_rate": 7.176638917960165e-08, "loss": 0.0252, "step": 7381 }, { "epoch": 2.854601701469451, "grad_norm": 0.20117994415573606, "learning_rate": 7.138707792552102e-08, "loss": 0.0126, "step": 7382 }, { "epoch": 2.854988399071926, "grad_norm": 0.2679542882642256, "learning_rate": 7.100876452028837e-08, "loss": 0.0207, "step": 7383 }, { "epoch": 2.8553750966744005, "grad_norm": 0.3680551878031254, "learning_rate": 7.063144904049801e-08, "loss": 0.018, "step": 7384 }, { "epoch": 2.8557617942768756, "grad_norm": 0.3109320488448833, "learning_rate": 7.02551315625405e-08, "loss": 0.0325, "step": 7385 }, { "epoch": 2.8561484918793503, "grad_norm": 0.21117596002020947, "learning_rate": 6.987981216260654e-08, "loss": 0.0147, "step": 7386 }, { "epoch": 2.856535189481825, "grad_norm": 0.2731528214658974, "learning_rate": 6.950549091668258e-08, "loss": 0.0185, "step": 7387 }, { "epoch": 2.8569218870843, "grad_norm": 0.24212421490596736, "learning_rate": 6.9132167900553e-08, "loss": 0.0167, "step": 7388 }, { "epoch": 2.857308584686775, "grad_norm": 0.26129639295769813, "learning_rate": 6.875984318980234e-08, "loss": 0.0255, "step": 7389 }, { "epoch": 2.85769528228925, "grad_norm": 0.20603085028284976, "learning_rate": 6.838851685981085e-08, "loss": 0.0116, "step": 7390 }, { "epoch": 2.8580819798917245, "grad_norm": 0.2396854514909069, "learning_rate": 6.801818898575785e-08, "loss": 0.0173, "step": 7391 }, { "epoch": 2.8584686774941996, "grad_norm": 0.2097901283536701, "learning_rate": 6.764885964261947e-08, "loss": 0.0136, "step": 7392 }, { "epoch": 2.8588553750966743, "grad_norm": 0.30033647809662145, "learning_rate": 6.72805289051709e-08, "loss": 0.0151, "step": 7393 }, { "epoch": 2.8592420726991494, "grad_norm": 0.2551295197038048, "learning_rate": 6.691319684798415e-08, "loss": 0.0237, "step": 7394 }, { "epoch": 2.859628770301624, "grad_norm": 0.1523870332254651, "learning_rate": 6.654686354542971e-08, "loss": 0.0092, "step": 7395 }, { "epoch": 2.860015467904099, "grad_norm": 0.23149029148916406, "learning_rate": 6.618152907167496e-08, "loss": 0.0125, "step": 7396 }, { "epoch": 2.860402165506574, "grad_norm": 0.2259106787546039, "learning_rate": 6.581719350068627e-08, "loss": 0.0166, "step": 7397 }, { "epoch": 2.8607888631090486, "grad_norm": 0.23437571168393284, "learning_rate": 6.545385690622686e-08, "loss": 0.0122, "step": 7398 }, { "epoch": 2.8611755607115237, "grad_norm": 0.3150822652175665, "learning_rate": 6.509151936185842e-08, "loss": 0.0199, "step": 7399 }, { "epoch": 2.8615622583139984, "grad_norm": 0.3005986734133844, "learning_rate": 6.473018094093897e-08, "loss": 0.0158, "step": 7400 }, { "epoch": 2.8619489559164735, "grad_norm": 0.2902823976489097, "learning_rate": 6.436984171662608e-08, "loss": 0.022, "step": 7401 }, { "epoch": 2.862335653518948, "grad_norm": 0.29434785117253975, "learning_rate": 6.401050176187418e-08, "loss": 0.0203, "step": 7402 }, { "epoch": 2.8627223511214233, "grad_norm": 0.2451981103196898, "learning_rate": 6.365216114943507e-08, "loss": 0.0155, "step": 7403 }, { "epoch": 2.863109048723898, "grad_norm": 0.2486840313658623, "learning_rate": 6.329481995185849e-08, "loss": 0.0174, "step": 7404 }, { "epoch": 2.8634957463263726, "grad_norm": 0.2942517565715705, "learning_rate": 6.293847824149157e-08, "loss": 0.02, "step": 7405 }, { "epoch": 2.8638824439288477, "grad_norm": 0.2684772598067711, "learning_rate": 6.25831360904805e-08, "loss": 0.0201, "step": 7406 }, { "epoch": 2.8642691415313224, "grad_norm": 0.2773674970579117, "learning_rate": 6.222879357076717e-08, "loss": 0.0173, "step": 7407 }, { "epoch": 2.8646558391337975, "grad_norm": 0.2692966285183755, "learning_rate": 6.18754507540914e-08, "loss": 0.0197, "step": 7408 }, { "epoch": 2.865042536736272, "grad_norm": 0.29809897953734665, "learning_rate": 6.15231077119921e-08, "loss": 0.02, "step": 7409 }, { "epoch": 2.8654292343387473, "grad_norm": 0.3367584638566903, "learning_rate": 6.117176451580442e-08, "loss": 0.0262, "step": 7410 }, { "epoch": 2.865815931941222, "grad_norm": 0.29912271822908687, "learning_rate": 6.082142123666146e-08, "loss": 0.0199, "step": 7411 }, { "epoch": 2.8662026295436966, "grad_norm": 0.40119006325655887, "learning_rate": 6.04720779454937e-08, "loss": 0.0179, "step": 7412 }, { "epoch": 2.8665893271461718, "grad_norm": 0.43076439618150275, "learning_rate": 6.012373471302957e-08, "loss": 0.0248, "step": 7413 }, { "epoch": 2.8669760247486464, "grad_norm": 0.3057406496763107, "learning_rate": 5.977639160979431e-08, "loss": 0.0225, "step": 7414 }, { "epoch": 2.8673627223511216, "grad_norm": 0.20241400831179054, "learning_rate": 5.9430048706111684e-08, "loss": 0.0133, "step": 7415 }, { "epoch": 2.8677494199535962, "grad_norm": 0.24638922434772173, "learning_rate": 5.908470607210226e-08, "loss": 0.0159, "step": 7416 }, { "epoch": 2.8681361175560713, "grad_norm": 0.22967898376587803, "learning_rate": 5.874036377768399e-08, "loss": 0.0202, "step": 7417 }, { "epoch": 2.868522815158546, "grad_norm": 0.37973208756268534, "learning_rate": 5.839702189257224e-08, "loss": 0.0286, "step": 7418 }, { "epoch": 2.8689095127610207, "grad_norm": 0.2406177493518227, "learning_rate": 5.805468048628138e-08, "loss": 0.0159, "step": 7419 }, { "epoch": 2.869296210363496, "grad_norm": 0.3061588522664258, "learning_rate": 5.771333962812153e-08, "loss": 0.0166, "step": 7420 }, { "epoch": 2.8696829079659705, "grad_norm": 0.27389352590487914, "learning_rate": 5.737299938719909e-08, "loss": 0.0179, "step": 7421 }, { "epoch": 2.8700696055684456, "grad_norm": 0.27688183197436744, "learning_rate": 5.703365983242171e-08, "loss": 0.0203, "step": 7422 }, { "epoch": 2.8704563031709203, "grad_norm": 0.32475766253751687, "learning_rate": 5.6695321032490535e-08, "loss": 0.0219, "step": 7423 }, { "epoch": 2.8708430007733954, "grad_norm": 0.2826615009245278, "learning_rate": 5.635798305590745e-08, "loss": 0.0127, "step": 7424 }, { "epoch": 2.87122969837587, "grad_norm": 0.3520354770173403, "learning_rate": 5.602164597096837e-08, "loss": 0.022, "step": 7425 }, { "epoch": 2.8716163959783447, "grad_norm": 0.3044037042830433, "learning_rate": 5.5686309845769395e-08, "loss": 0.018, "step": 7426 }, { "epoch": 2.87200309358082, "grad_norm": 0.24365407832831223, "learning_rate": 5.5351974748201755e-08, "loss": 0.0158, "step": 7427 }, { "epoch": 2.8723897911832945, "grad_norm": 0.20754609110820865, "learning_rate": 5.501864074595631e-08, "loss": 0.0147, "step": 7428 }, { "epoch": 2.8727764887857696, "grad_norm": 0.3636238035988395, "learning_rate": 5.468630790651852e-08, "loss": 0.0299, "step": 7429 }, { "epoch": 2.8731631863882443, "grad_norm": 0.22979328679287636, "learning_rate": 5.435497629717401e-08, "loss": 0.014, "step": 7430 }, { "epoch": 2.8735498839907194, "grad_norm": 0.31129624338482714, "learning_rate": 5.402464598500357e-08, "loss": 0.0175, "step": 7431 }, { "epoch": 2.873936581593194, "grad_norm": 0.21032777879323655, "learning_rate": 5.3695317036885906e-08, "loss": 0.0119, "step": 7432 }, { "epoch": 2.8743232791956688, "grad_norm": 0.22989746135670477, "learning_rate": 5.3366989519497705e-08, "loss": 0.0136, "step": 7433 }, { "epoch": 2.874709976798144, "grad_norm": 0.29133597953456425, "learning_rate": 5.3039663499311886e-08, "loss": 0.0213, "step": 7434 }, { "epoch": 2.8750966744006186, "grad_norm": 0.3086572870681923, "learning_rate": 5.2713339042598236e-08, "loss": 0.0227, "step": 7435 }, { "epoch": 2.8754833720030937, "grad_norm": 0.26321121955420274, "learning_rate": 5.2388016215426105e-08, "loss": 0.0189, "step": 7436 }, { "epoch": 2.8758700696055683, "grad_norm": 0.252043994586249, "learning_rate": 5.206369508365949e-08, "loss": 0.019, "step": 7437 }, { "epoch": 2.8762567672080435, "grad_norm": 0.2911159821651362, "learning_rate": 5.174037571296087e-08, "loss": 0.024, "step": 7438 }, { "epoch": 2.876643464810518, "grad_norm": 0.31975813618437565, "learning_rate": 5.141805816878897e-08, "loss": 0.0279, "step": 7439 }, { "epoch": 2.877030162412993, "grad_norm": 0.289016275691005, "learning_rate": 5.109674251640106e-08, "loss": 0.0171, "step": 7440 }, { "epoch": 2.877416860015468, "grad_norm": 0.2780480972650692, "learning_rate": 5.077642882085121e-08, "loss": 0.0179, "step": 7441 }, { "epoch": 2.877803557617943, "grad_norm": 0.29949222781728, "learning_rate": 5.04571171469892e-08, "loss": 0.0196, "step": 7442 }, { "epoch": 2.8781902552204177, "grad_norm": 0.24950195716619583, "learning_rate": 5.013880755946332e-08, "loss": 0.0159, "step": 7443 }, { "epoch": 2.8785769528228924, "grad_norm": 0.2271534236196968, "learning_rate": 4.982150012271869e-08, "loss": 0.0179, "step": 7444 }, { "epoch": 2.8789636504253675, "grad_norm": 0.2590524017659453, "learning_rate": 4.950519490099781e-08, "loss": 0.0176, "step": 7445 }, { "epoch": 2.879350348027842, "grad_norm": 0.2827095613523019, "learning_rate": 4.9189891958340006e-08, "loss": 0.0189, "step": 7446 }, { "epoch": 2.879737045630317, "grad_norm": 0.2802234937523041, "learning_rate": 4.887559135858089e-08, "loss": 0.0139, "step": 7447 }, { "epoch": 2.880123743232792, "grad_norm": 0.27367289021875424, "learning_rate": 4.856229316535455e-08, "loss": 0.0235, "step": 7448 }, { "epoch": 2.880510440835267, "grad_norm": 0.24541484945819947, "learning_rate": 4.82499974420908e-08, "loss": 0.0154, "step": 7449 }, { "epoch": 2.8808971384377418, "grad_norm": 0.2791892175148738, "learning_rate": 4.793870425201852e-08, "loss": 0.0258, "step": 7450 }, { "epoch": 2.8812838360402164, "grad_norm": 0.3154777542293288, "learning_rate": 4.7628413658160625e-08, "loss": 0.0176, "step": 7451 }, { "epoch": 2.8816705336426915, "grad_norm": 0.2793358551129709, "learning_rate": 4.731912572333963e-08, "loss": 0.0171, "step": 7452 }, { "epoch": 2.882057231245166, "grad_norm": 0.24586002815249222, "learning_rate": 4.701084051017379e-08, "loss": 0.0161, "step": 7453 }, { "epoch": 2.882443928847641, "grad_norm": 0.2502625354731991, "learning_rate": 4.670355808107874e-08, "loss": 0.018, "step": 7454 }, { "epoch": 2.882830626450116, "grad_norm": 0.2439025387250581, "learning_rate": 4.639727849826692e-08, "loss": 0.0204, "step": 7455 }, { "epoch": 2.883217324052591, "grad_norm": 0.25040695865337037, "learning_rate": 4.609200182374707e-08, "loss": 0.0201, "step": 7456 }, { "epoch": 2.883604021655066, "grad_norm": 0.2894911525973048, "learning_rate": 4.5787728119327525e-08, "loss": 0.0201, "step": 7457 }, { "epoch": 2.8839907192575405, "grad_norm": 0.3175234224493231, "learning_rate": 4.548445744661012e-08, "loss": 0.0211, "step": 7458 }, { "epoch": 2.8843774168600156, "grad_norm": 0.25889786339212234, "learning_rate": 4.518218986699518e-08, "loss": 0.0155, "step": 7459 }, { "epoch": 2.8847641144624903, "grad_norm": 0.22688718347469242, "learning_rate": 4.488092544168099e-08, "loss": 0.0143, "step": 7460 }, { "epoch": 2.885150812064965, "grad_norm": 0.25902219671075105, "learning_rate": 4.458066423166041e-08, "loss": 0.0178, "step": 7461 }, { "epoch": 2.88553750966744, "grad_norm": 0.24636790107467907, "learning_rate": 4.4281406297724817e-08, "loss": 0.0175, "step": 7462 }, { "epoch": 2.885924207269915, "grad_norm": 0.462472032117445, "learning_rate": 4.3983151700462964e-08, "loss": 0.0277, "step": 7463 }, { "epoch": 2.88631090487239, "grad_norm": 0.15355648258564972, "learning_rate": 4.368590050025823e-08, "loss": 0.0094, "step": 7464 }, { "epoch": 2.8866976024748645, "grad_norm": 0.26044336181237343, "learning_rate": 4.338965275729301e-08, "loss": 0.0181, "step": 7465 }, { "epoch": 2.8870843000773396, "grad_norm": 0.23998307066417782, "learning_rate": 4.3094408531546e-08, "loss": 0.0167, "step": 7466 }, { "epoch": 2.8874709976798143, "grad_norm": 0.2165895935067025, "learning_rate": 4.2800167882791624e-08, "loss": 0.011, "step": 7467 }, { "epoch": 2.887857695282289, "grad_norm": 0.2701900096680423, "learning_rate": 4.2506930870602204e-08, "loss": 0.015, "step": 7468 }, { "epoch": 2.888244392884764, "grad_norm": 0.28400150480400244, "learning_rate": 4.2214697554347486e-08, "loss": 0.0184, "step": 7469 }, { "epoch": 2.888631090487239, "grad_norm": 0.28508811320917615, "learning_rate": 4.19234679931918e-08, "loss": 0.0287, "step": 7470 }, { "epoch": 2.889017788089714, "grad_norm": 0.2302305285760141, "learning_rate": 4.163324224609855e-08, "loss": 0.0179, "step": 7471 }, { "epoch": 2.8894044856921886, "grad_norm": 0.2827498980121577, "learning_rate": 4.134402037182628e-08, "loss": 0.0198, "step": 7472 }, { "epoch": 2.8897911832946637, "grad_norm": 0.37594108618436406, "learning_rate": 4.105580242893148e-08, "loss": 0.0226, "step": 7473 }, { "epoch": 2.8901778808971383, "grad_norm": 0.26947302256571964, "learning_rate": 4.0768588475766926e-08, "loss": 0.0213, "step": 7474 }, { "epoch": 2.8905645784996135, "grad_norm": 0.29411884641531116, "learning_rate": 4.048237857048165e-08, "loss": 0.0194, "step": 7475 }, { "epoch": 2.890951276102088, "grad_norm": 0.23354106156478724, "learning_rate": 4.0197172771022084e-08, "loss": 0.0111, "step": 7476 }, { "epoch": 2.8913379737045632, "grad_norm": 0.2554611851460455, "learning_rate": 3.991297113513037e-08, "loss": 0.016, "step": 7477 }, { "epoch": 2.891724671307038, "grad_norm": 0.24344478933353925, "learning_rate": 3.962977372034771e-08, "loss": 0.0179, "step": 7478 }, { "epoch": 2.8921113689095126, "grad_norm": 0.27931772444294195, "learning_rate": 3.9347580584008784e-08, "loss": 0.0191, "step": 7479 }, { "epoch": 2.8924980665119877, "grad_norm": 0.20782814909566405, "learning_rate": 3.906639178324734e-08, "loss": 0.0139, "step": 7480 }, { "epoch": 2.8928847641144624, "grad_norm": 0.25629782343926905, "learning_rate": 3.878620737499283e-08, "loss": 0.0151, "step": 7481 }, { "epoch": 2.8932714617169375, "grad_norm": 0.23946103625321075, "learning_rate": 3.850702741597101e-08, "loss": 0.0234, "step": 7482 }, { "epoch": 2.893658159319412, "grad_norm": 0.24754318581431228, "learning_rate": 3.822885196270554e-08, "loss": 0.0161, "step": 7483 }, { "epoch": 2.8940448569218873, "grad_norm": 0.2785985233613569, "learning_rate": 3.795168107151526e-08, "loss": 0.0267, "step": 7484 }, { "epoch": 2.894431554524362, "grad_norm": 0.34458872419734854, "learning_rate": 3.767551479851694e-08, "loss": 0.0181, "step": 7485 }, { "epoch": 2.8948182521268366, "grad_norm": 0.30503629483364414, "learning_rate": 3.740035319962254e-08, "loss": 0.0241, "step": 7486 }, { "epoch": 2.8952049497293117, "grad_norm": 0.22949389598097464, "learning_rate": 3.712619633054193e-08, "loss": 0.0184, "step": 7487 }, { "epoch": 2.8955916473317864, "grad_norm": 0.18452725339254084, "learning_rate": 3.685304424678127e-08, "loss": 0.0127, "step": 7488 }, { "epoch": 2.8959783449342615, "grad_norm": 0.21192940690965958, "learning_rate": 3.6580897003642426e-08, "loss": 0.0118, "step": 7489 }, { "epoch": 2.896365042536736, "grad_norm": 0.273015360906838, "learning_rate": 3.630975465622466e-08, "loss": 0.0228, "step": 7490 }, { "epoch": 2.8967517401392113, "grad_norm": 0.2839711490478351, "learning_rate": 3.6039617259422955e-08, "loss": 0.0162, "step": 7491 }, { "epoch": 2.897138437741686, "grad_norm": 0.27837694793451095, "learning_rate": 3.5770484867930776e-08, "loss": 0.0174, "step": 7492 }, { "epoch": 2.8975251353441607, "grad_norm": 0.2600019985597687, "learning_rate": 3.550235753623621e-08, "loss": 0.0175, "step": 7493 }, { "epoch": 2.897911832946636, "grad_norm": 0.21786112916265318, "learning_rate": 3.523523531862305e-08, "loss": 0.0153, "step": 7494 }, { "epoch": 2.8982985305491105, "grad_norm": 0.27829449709478693, "learning_rate": 3.4969118269175263e-08, "loss": 0.0196, "step": 7495 }, { "epoch": 2.8986852281515856, "grad_norm": 0.2923588292678503, "learning_rate": 3.4704006441769186e-08, "loss": 0.0188, "step": 7496 }, { "epoch": 2.8990719257540603, "grad_norm": 0.2408596725299025, "learning_rate": 3.443989989008078e-08, "loss": 0.0164, "step": 7497 }, { "epoch": 2.8994586233565354, "grad_norm": 0.3100577514408833, "learning_rate": 3.4176798667580055e-08, "loss": 0.021, "step": 7498 }, { "epoch": 2.89984532095901, "grad_norm": 0.23322090632752684, "learning_rate": 3.391470282753495e-08, "loss": 0.015, "step": 7499 }, { "epoch": 2.9002320185614847, "grad_norm": 0.2182609073525675, "learning_rate": 3.365361242301024e-08, "loss": 0.0125, "step": 7500 }, { "epoch": 2.90061871616396, "grad_norm": 0.394762627741312, "learning_rate": 3.339352750686531e-08, "loss": 0.0263, "step": 7501 }, { "epoch": 2.9010054137664345, "grad_norm": 0.24269428104733962, "learning_rate": 3.313444813175748e-08, "loss": 0.0156, "step": 7502 }, { "epoch": 2.9013921113689096, "grad_norm": 0.2696573474106562, "learning_rate": 3.287637435014035e-08, "loss": 0.0209, "step": 7503 }, { "epoch": 2.9017788089713843, "grad_norm": 0.32234400928939244, "learning_rate": 3.261930621426268e-08, "loss": 0.0342, "step": 7504 }, { "epoch": 2.9021655065738594, "grad_norm": 0.2561540585958968, "learning_rate": 3.236324377617228e-08, "loss": 0.0173, "step": 7505 }, { "epoch": 2.902552204176334, "grad_norm": 0.4791253073624147, "learning_rate": 3.2108187087709906e-08, "loss": 0.0277, "step": 7506 }, { "epoch": 2.9029389017788088, "grad_norm": 0.32396296606478264, "learning_rate": 3.185413620051536e-08, "loss": 0.0336, "step": 7507 }, { "epoch": 2.903325599381284, "grad_norm": 0.2651539040751018, "learning_rate": 3.160109116602361e-08, "loss": 0.0211, "step": 7508 }, { "epoch": 2.9037122969837585, "grad_norm": 0.30431963599083484, "learning_rate": 3.1349052035466456e-08, "loss": 0.0193, "step": 7509 }, { "epoch": 2.9040989945862337, "grad_norm": 0.2635581836806067, "learning_rate": 3.10980188598714e-08, "loss": 0.02, "step": 7510 }, { "epoch": 2.9044856921887083, "grad_norm": 0.2149366520754775, "learning_rate": 3.084799169006336e-08, "loss": 0.0163, "step": 7511 }, { "epoch": 2.9048723897911835, "grad_norm": 0.2884702394929832, "learning_rate": 3.059897057666239e-08, "loss": 0.0229, "step": 7512 }, { "epoch": 2.905259087393658, "grad_norm": 0.20892980563744956, "learning_rate": 3.035095557008538e-08, "loss": 0.0112, "step": 7513 }, { "epoch": 2.905645784996133, "grad_norm": 0.22700873898751722, "learning_rate": 3.0103946720546065e-08, "loss": 0.0135, "step": 7514 }, { "epoch": 2.906032482598608, "grad_norm": 0.2675009965294625, "learning_rate": 2.9857944078053316e-08, "loss": 0.0164, "step": 7515 }, { "epoch": 2.906419180201083, "grad_norm": 0.20399781008614543, "learning_rate": 2.9612947692413963e-08, "loss": 0.0159, "step": 7516 }, { "epoch": 2.9068058778035577, "grad_norm": 0.2685273918729466, "learning_rate": 2.936895761322833e-08, "loss": 0.0205, "step": 7517 }, { "epoch": 2.9071925754060324, "grad_norm": 0.21601846248311438, "learning_rate": 2.9125973889896907e-08, "loss": 0.0161, "step": 7518 }, { "epoch": 2.9075792730085075, "grad_norm": 0.3217172545917879, "learning_rate": 2.888399657161256e-08, "loss": 0.0181, "step": 7519 }, { "epoch": 2.907965970610982, "grad_norm": 0.24634465977727651, "learning_rate": 2.86430257073661e-08, "loss": 0.0224, "step": 7520 }, { "epoch": 2.908352668213457, "grad_norm": 0.21271631061422402, "learning_rate": 2.8403061345945725e-08, "loss": 0.0131, "step": 7521 }, { "epoch": 2.908739365815932, "grad_norm": 0.2770532399793178, "learning_rate": 2.816410353593424e-08, "loss": 0.0175, "step": 7522 }, { "epoch": 2.909126063418407, "grad_norm": 0.31758415003960955, "learning_rate": 2.7926152325711277e-08, "loss": 0.0161, "step": 7523 }, { "epoch": 2.9095127610208817, "grad_norm": 0.3294146333684728, "learning_rate": 2.768920776345163e-08, "loss": 0.0258, "step": 7524 }, { "epoch": 2.9098994586233564, "grad_norm": 0.31530057037630727, "learning_rate": 2.745326989712804e-08, "loss": 0.0152, "step": 7525 }, { "epoch": 2.9102861562258315, "grad_norm": 0.22608098831117462, "learning_rate": 2.7218338774508412e-08, "loss": 0.0114, "step": 7526 }, { "epoch": 2.910672853828306, "grad_norm": 0.21987380728798953, "learning_rate": 2.6984414443156916e-08, "loss": 0.0173, "step": 7527 }, { "epoch": 2.911059551430781, "grad_norm": 0.22599397152773135, "learning_rate": 2.6751496950434552e-08, "loss": 0.0151, "step": 7528 }, { "epoch": 2.911446249033256, "grad_norm": 0.28357946118891764, "learning_rate": 2.6519586343496383e-08, "loss": 0.0199, "step": 7529 }, { "epoch": 2.911832946635731, "grad_norm": 0.2145343231730131, "learning_rate": 2.6288682669296516e-08, "loss": 0.0132, "step": 7530 }, { "epoch": 2.912219644238206, "grad_norm": 0.2539456409809296, "learning_rate": 2.6058785974583668e-08, "loss": 0.0169, "step": 7531 }, { "epoch": 2.9126063418406805, "grad_norm": 0.29009759836471183, "learning_rate": 2.582989630590227e-08, "loss": 0.0231, "step": 7532 }, { "epoch": 2.9129930394431556, "grad_norm": 0.23116388164460583, "learning_rate": 2.5602013709593587e-08, "loss": 0.0123, "step": 7533 }, { "epoch": 2.9133797370456302, "grad_norm": 0.2956183994067039, "learning_rate": 2.5375138231794604e-08, "loss": 0.0199, "step": 7534 }, { "epoch": 2.913766434648105, "grad_norm": 0.2502316838343999, "learning_rate": 2.5149269918439134e-08, "loss": 0.0136, "step": 7535 }, { "epoch": 2.91415313225058, "grad_norm": 0.2908192845177913, "learning_rate": 2.4924408815256152e-08, "loss": 0.0239, "step": 7536 }, { "epoch": 2.914539829853055, "grad_norm": 0.26061888452814846, "learning_rate": 2.4700554967771463e-08, "loss": 0.0176, "step": 7537 }, { "epoch": 2.91492652745553, "grad_norm": 0.23946331558416994, "learning_rate": 2.447770842130659e-08, "loss": 0.0177, "step": 7538 }, { "epoch": 2.9153132250580045, "grad_norm": 0.264221848360767, "learning_rate": 2.425586922097878e-08, "loss": 0.0188, "step": 7539 }, { "epoch": 2.9156999226604796, "grad_norm": 0.24223497562856333, "learning_rate": 2.40350374117021e-08, "loss": 0.0223, "step": 7540 }, { "epoch": 2.9160866202629543, "grad_norm": 0.2130169146052434, "learning_rate": 2.3815213038185235e-08, "loss": 0.0134, "step": 7541 }, { "epoch": 2.916473317865429, "grad_norm": 0.24079951054093932, "learning_rate": 2.3596396144935363e-08, "loss": 0.0145, "step": 7542 }, { "epoch": 2.916860015467904, "grad_norm": 0.2230538463425754, "learning_rate": 2.3378586776253154e-08, "loss": 0.0152, "step": 7543 }, { "epoch": 2.917246713070379, "grad_norm": 0.24416024416451446, "learning_rate": 2.3161784976237222e-08, "loss": 0.0219, "step": 7544 }, { "epoch": 2.917633410672854, "grad_norm": 0.26793004757878514, "learning_rate": 2.2945990788780237e-08, "loss": 0.0165, "step": 7545 }, { "epoch": 2.9180201082753285, "grad_norm": 0.29099887738743085, "learning_rate": 2.2731204257573358e-08, "loss": 0.0206, "step": 7546 }, { "epoch": 2.9184068058778037, "grad_norm": 0.27108210343880923, "learning_rate": 2.2517425426101246e-08, "loss": 0.0189, "step": 7547 }, { "epoch": 2.9187935034802783, "grad_norm": 0.2485370172690081, "learning_rate": 2.2304654337645947e-08, "loss": 0.0184, "step": 7548 }, { "epoch": 2.9191802010827534, "grad_norm": 0.6363681083554527, "learning_rate": 2.2092891035285225e-08, "loss": 0.02, "step": 7549 }, { "epoch": 2.919566898685228, "grad_norm": 0.2196669815186893, "learning_rate": 2.188213556189256e-08, "loss": 0.0158, "step": 7550 }, { "epoch": 2.9199535962877032, "grad_norm": 0.3045428615921702, "learning_rate": 2.1672387960138264e-08, "loss": 0.0136, "step": 7551 }, { "epoch": 2.920340293890178, "grad_norm": 0.34603378529695794, "learning_rate": 2.1463648272487812e-08, "loss": 0.0294, "step": 7552 }, { "epoch": 2.9207269914926526, "grad_norm": 0.33424308359858373, "learning_rate": 2.1255916541201294e-08, "loss": 0.0171, "step": 7553 }, { "epoch": 2.9211136890951277, "grad_norm": 0.2285549272731695, "learning_rate": 2.1049192808337836e-08, "loss": 0.0178, "step": 7554 }, { "epoch": 2.9215003866976024, "grad_norm": 0.20735343185771882, "learning_rate": 2.0843477115750076e-08, "loss": 0.0123, "step": 7555 }, { "epoch": 2.9218870843000775, "grad_norm": 0.3700534440652775, "learning_rate": 2.063876950508803e-08, "loss": 0.0138, "step": 7556 }, { "epoch": 2.922273781902552, "grad_norm": 0.2556379146287434, "learning_rate": 2.0435070017795212e-08, "loss": 0.0161, "step": 7557 }, { "epoch": 2.9226604795050273, "grad_norm": 0.36757980078327196, "learning_rate": 2.0232378695114187e-08, "loss": 0.027, "step": 7558 }, { "epoch": 2.923047177107502, "grad_norm": 0.24567432072097117, "learning_rate": 2.003069557808157e-08, "loss": 0.0198, "step": 7559 }, { "epoch": 2.9234338747099766, "grad_norm": 0.2746696837194233, "learning_rate": 1.9830020707530263e-08, "loss": 0.0235, "step": 7560 }, { "epoch": 2.9238205723124517, "grad_norm": 0.36279675979551196, "learning_rate": 1.963035412408887e-08, "loss": 0.0261, "step": 7561 }, { "epoch": 2.9242072699149264, "grad_norm": 0.2947470564106355, "learning_rate": 1.9431695868181167e-08, "loss": 0.0273, "step": 7562 }, { "epoch": 2.9245939675174015, "grad_norm": 0.26503805573275385, "learning_rate": 1.923404598002887e-08, "loss": 0.0131, "step": 7563 }, { "epoch": 2.924980665119876, "grad_norm": 0.2471366966005144, "learning_rate": 1.903740449964775e-08, "loss": 0.019, "step": 7564 }, { "epoch": 2.9253673627223513, "grad_norm": 0.30514167755324634, "learning_rate": 1.8841771466849846e-08, "loss": 0.0204, "step": 7565 }, { "epoch": 2.925754060324826, "grad_norm": 0.24549291352056757, "learning_rate": 1.864714692124292e-08, "loss": 0.017, "step": 7566 }, { "epoch": 2.9261407579273007, "grad_norm": 0.2586939160748329, "learning_rate": 1.845353090223101e-08, "loss": 0.0155, "step": 7567 }, { "epoch": 2.926527455529776, "grad_norm": 0.288711705988567, "learning_rate": 1.8260923449013313e-08, "loss": 0.0185, "step": 7568 }, { "epoch": 2.9269141531322505, "grad_norm": 0.26156670316670627, "learning_rate": 1.8069324600585858e-08, "loss": 0.0206, "step": 7569 }, { "epoch": 2.9273008507347256, "grad_norm": 0.2701968316228852, "learning_rate": 1.787873439573984e-08, "loss": 0.0171, "step": 7570 }, { "epoch": 2.9276875483372002, "grad_norm": 0.2939579701434402, "learning_rate": 1.7689152873061055e-08, "loss": 0.0338, "step": 7571 }, { "epoch": 2.9280742459396754, "grad_norm": 0.23335375614364423, "learning_rate": 1.750058007093325e-08, "loss": 0.0145, "step": 7572 }, { "epoch": 2.92846094354215, "grad_norm": 0.246620685387117, "learning_rate": 1.7313016027535323e-08, "loss": 0.0172, "step": 7573 }, { "epoch": 2.9288476411446247, "grad_norm": 0.3231386559899201, "learning_rate": 1.712646078084079e-08, "loss": 0.0299, "step": 7574 }, { "epoch": 2.9292343387471, "grad_norm": 0.24309584338240334, "learning_rate": 1.6940914368619443e-08, "loss": 0.0194, "step": 7575 }, { "epoch": 2.9296210363495745, "grad_norm": 0.3255474017566754, "learning_rate": 1.6756376828437893e-08, "loss": 0.0212, "step": 7576 }, { "epoch": 2.9300077339520496, "grad_norm": 0.27389348651391415, "learning_rate": 1.657284819765681e-08, "loss": 0.0263, "step": 7577 }, { "epoch": 2.9303944315545243, "grad_norm": 0.3012835680046356, "learning_rate": 1.6390328513434806e-08, "loss": 0.0206, "step": 7578 }, { "epoch": 2.9307811291569994, "grad_norm": 0.31629206060967235, "learning_rate": 1.6208817812723432e-08, "loss": 0.0243, "step": 7579 }, { "epoch": 2.931167826759474, "grad_norm": 0.2741199090328041, "learning_rate": 1.602831613227218e-08, "loss": 0.0231, "step": 7580 }, { "epoch": 2.9315545243619487, "grad_norm": 0.29884271975585713, "learning_rate": 1.5848823508625155e-08, "loss": 0.017, "step": 7581 }, { "epoch": 2.931941221964424, "grad_norm": 0.25227602397367255, "learning_rate": 1.5670339978123283e-08, "loss": 0.0176, "step": 7582 }, { "epoch": 2.9323279195668985, "grad_norm": 0.2569641720568242, "learning_rate": 1.5492865576900997e-08, "loss": 0.0169, "step": 7583 }, { "epoch": 2.9327146171693736, "grad_norm": 0.3414869118273151, "learning_rate": 1.5316400340891214e-08, "loss": 0.0126, "step": 7584 }, { "epoch": 2.9331013147718483, "grad_norm": 0.34033207851837466, "learning_rate": 1.514094430582036e-08, "loss": 0.0375, "step": 7585 }, { "epoch": 2.9334880123743234, "grad_norm": 0.2462890063248976, "learning_rate": 1.4966497507212242e-08, "loss": 0.016, "step": 7586 }, { "epoch": 2.933874709976798, "grad_norm": 0.23212138920850742, "learning_rate": 1.4793059980384161e-08, "loss": 0.0196, "step": 7587 }, { "epoch": 2.934261407579273, "grad_norm": 0.2920631651883102, "learning_rate": 1.462063176045081e-08, "loss": 0.0187, "step": 7588 }, { "epoch": 2.934648105181748, "grad_norm": 0.33431715475831714, "learning_rate": 1.444921288232204e-08, "loss": 0.0199, "step": 7589 }, { "epoch": 2.9350348027842226, "grad_norm": 0.30196172237426094, "learning_rate": 1.4278803380703976e-08, "loss": 0.0166, "step": 7590 }, { "epoch": 2.9354215003866977, "grad_norm": 0.2510439173291046, "learning_rate": 1.4109403290097356e-08, "loss": 0.0132, "step": 7591 }, { "epoch": 2.9358081979891724, "grad_norm": 0.27591409138948325, "learning_rate": 1.3941012644798634e-08, "loss": 0.0168, "step": 7592 }, { "epoch": 2.9361948955916475, "grad_norm": 0.3148940158383751, "learning_rate": 1.377363147890054e-08, "loss": 0.0272, "step": 7593 }, { "epoch": 2.936581593194122, "grad_norm": 0.23164292488388263, "learning_rate": 1.360725982629152e-08, "loss": 0.0188, "step": 7594 }, { "epoch": 2.936968290796597, "grad_norm": 0.3136678070402079, "learning_rate": 1.3441897720654628e-08, "loss": 0.0211, "step": 7595 }, { "epoch": 2.937354988399072, "grad_norm": 0.32647651592548804, "learning_rate": 1.3277545195469754e-08, "loss": 0.0155, "step": 7596 }, { "epoch": 2.937741686001547, "grad_norm": 0.34210063234212845, "learning_rate": 1.3114202284010834e-08, "loss": 0.0173, "step": 7597 }, { "epoch": 2.9381283836040217, "grad_norm": 0.21100108699631476, "learning_rate": 1.295186901934975e-08, "loss": 0.0156, "step": 7598 }, { "epoch": 2.9385150812064964, "grad_norm": 0.284868607602646, "learning_rate": 1.279054543435132e-08, "loss": 0.014, "step": 7599 }, { "epoch": 2.9389017788089715, "grad_norm": 0.2731848362989901, "learning_rate": 1.2630231561677198e-08, "loss": 0.0206, "step": 7600 }, { "epoch": 2.939288476411446, "grad_norm": 0.26256689952322776, "learning_rate": 1.2470927433785863e-08, "loss": 0.0182, "step": 7601 }, { "epoch": 2.939675174013921, "grad_norm": 0.49837780964563183, "learning_rate": 1.2312633082928183e-08, "loss": 0.022, "step": 7602 }, { "epoch": 2.940061871616396, "grad_norm": 0.26308468258478823, "learning_rate": 1.2155348541154632e-08, "loss": 0.0193, "step": 7603 }, { "epoch": 2.940448569218871, "grad_norm": 0.2231787278874319, "learning_rate": 1.1999073840306964e-08, "loss": 0.0154, "step": 7604 }, { "epoch": 2.9408352668213458, "grad_norm": 0.2830378925586416, "learning_rate": 1.1843809012025976e-08, "loss": 0.014, "step": 7605 }, { "epoch": 2.9412219644238204, "grad_norm": 0.23431519119558905, "learning_rate": 1.1689554087746525e-08, "loss": 0.0169, "step": 7606 }, { "epoch": 2.9416086620262956, "grad_norm": 0.3210342375106101, "learning_rate": 1.1536309098699183e-08, "loss": 0.0187, "step": 7607 }, { "epoch": 2.9419953596287702, "grad_norm": 0.22660042858646345, "learning_rate": 1.1384074075909136e-08, "loss": 0.0156, "step": 7608 }, { "epoch": 2.942382057231245, "grad_norm": 0.22925388810147593, "learning_rate": 1.123284905019839e-08, "loss": 0.0144, "step": 7609 }, { "epoch": 2.94276875483372, "grad_norm": 0.29094416270301304, "learning_rate": 1.1082634052184682e-08, "loss": 0.0321, "step": 7610 }, { "epoch": 2.943155452436195, "grad_norm": 0.2789626403941114, "learning_rate": 1.0933429112279791e-08, "loss": 0.0184, "step": 7611 }, { "epoch": 2.94354215003867, "grad_norm": 0.2443988906504191, "learning_rate": 1.0785234260692335e-08, "loss": 0.0144, "step": 7612 }, { "epoch": 2.9439288476411445, "grad_norm": 0.2670593052634397, "learning_rate": 1.0638049527425532e-08, "loss": 0.0203, "step": 7613 }, { "epoch": 2.9443155452436196, "grad_norm": 0.27760072126964574, "learning_rate": 1.0491874942278323e-08, "loss": 0.013, "step": 7614 }, { "epoch": 2.9447022428460943, "grad_norm": 0.27490744678196005, "learning_rate": 1.0346710534845927e-08, "loss": 0.0199, "step": 7615 }, { "epoch": 2.945088940448569, "grad_norm": 0.28103224930010773, "learning_rate": 1.020255633451761e-08, "loss": 0.016, "step": 7616 }, { "epoch": 2.945475638051044, "grad_norm": 0.37406576368760286, "learning_rate": 1.005941237047947e-08, "loss": 0.0202, "step": 7617 }, { "epoch": 2.945862335653519, "grad_norm": 0.27359880578473933, "learning_rate": 9.917278671712216e-09, "loss": 0.0181, "step": 7618 }, { "epoch": 2.946249033255994, "grad_norm": 0.20564494489518406, "learning_rate": 9.776155266992272e-09, "loss": 0.0133, "step": 7619 }, { "epoch": 2.9466357308584685, "grad_norm": 0.24264996843354636, "learning_rate": 9.63604218489178e-09, "loss": 0.0198, "step": 7620 }, { "epoch": 2.9470224284609436, "grad_norm": 0.343362250990421, "learning_rate": 9.496939453777498e-09, "loss": 0.0226, "step": 7621 }, { "epoch": 2.9474091260634183, "grad_norm": 0.39042923497735993, "learning_rate": 9.358847101813007e-09, "loss": 0.0131, "step": 7622 }, { "epoch": 2.9477958236658934, "grad_norm": 0.2697123784919473, "learning_rate": 9.221765156956497e-09, "loss": 0.0138, "step": 7623 }, { "epoch": 2.948182521268368, "grad_norm": 0.329622395879545, "learning_rate": 9.08569364696077e-09, "loss": 0.0234, "step": 7624 }, { "epoch": 2.9485692188708432, "grad_norm": 0.3312145885803891, "learning_rate": 8.950632599375453e-09, "loss": 0.024, "step": 7625 }, { "epoch": 2.948955916473318, "grad_norm": 0.23032173350937493, "learning_rate": 8.816582041545341e-09, "loss": 0.0131, "step": 7626 }, { "epoch": 2.9493426140757926, "grad_norm": 0.33610312339598847, "learning_rate": 8.683542000609835e-09, "loss": 0.0185, "step": 7627 }, { "epoch": 2.9497293116782677, "grad_norm": 0.21552287727494662, "learning_rate": 8.551512503504612e-09, "loss": 0.0132, "step": 7628 }, { "epoch": 2.9501160092807424, "grad_norm": 0.29008026183454283, "learning_rate": 8.420493576960509e-09, "loss": 0.0202, "step": 7629 }, { "epoch": 2.9505027068832175, "grad_norm": 0.35459067183813064, "learning_rate": 8.290485247503533e-09, "loss": 0.0301, "step": 7630 }, { "epoch": 2.950889404485692, "grad_norm": 0.30506753378253487, "learning_rate": 8.161487541454849e-09, "loss": 0.0247, "step": 7631 }, { "epoch": 2.9512761020881673, "grad_norm": 0.26126008593919603, "learning_rate": 8.0335004849319e-09, "loss": 0.015, "step": 7632 }, { "epoch": 2.951662799690642, "grad_norm": 0.25791979892477546, "learning_rate": 7.906524103846181e-09, "loss": 0.0146, "step": 7633 }, { "epoch": 2.9520494972931166, "grad_norm": 0.2388198434962796, "learning_rate": 7.78055842390657e-09, "loss": 0.0201, "step": 7634 }, { "epoch": 2.9524361948955917, "grad_norm": 0.21933243708223207, "learning_rate": 7.655603470615446e-09, "loss": 0.0138, "step": 7635 }, { "epoch": 2.9528228924980664, "grad_norm": 0.34912544055862516, "learning_rate": 7.531659269271463e-09, "loss": 0.0219, "step": 7636 }, { "epoch": 2.9532095901005415, "grad_norm": 0.2801102715182204, "learning_rate": 7.408725844967879e-09, "loss": 0.0166, "step": 7637 }, { "epoch": 2.953596287703016, "grad_norm": 0.38606569405588953, "learning_rate": 7.2868032225947845e-09, "loss": 0.0372, "step": 7638 }, { "epoch": 2.9539829853054913, "grad_norm": 0.3252406135378013, "learning_rate": 7.165891426835769e-09, "loss": 0.0172, "step": 7639 }, { "epoch": 2.954369682907966, "grad_norm": 0.3075007865998963, "learning_rate": 7.04599048217125e-09, "loss": 0.0207, "step": 7640 }, { "epoch": 2.9547563805104406, "grad_norm": 0.35023532838807264, "learning_rate": 6.927100412875698e-09, "loss": 0.0193, "step": 7641 }, { "epoch": 2.9551430781129158, "grad_norm": 0.23122933571136792, "learning_rate": 6.809221243020969e-09, "loss": 0.0148, "step": 7642 }, { "epoch": 2.9555297757153904, "grad_norm": 0.2519328069400623, "learning_rate": 6.6923529964713076e-09, "loss": 0.021, "step": 7643 }, { "epoch": 2.9559164733178656, "grad_norm": 0.33025286099514056, "learning_rate": 6.5764956968894515e-09, "loss": 0.0259, "step": 7644 }, { "epoch": 2.9563031709203402, "grad_norm": 0.23914220290825733, "learning_rate": 6.461649367730527e-09, "loss": 0.0173, "step": 7645 }, { "epoch": 2.9566898685228153, "grad_norm": 0.29721031812048226, "learning_rate": 6.347814032247601e-09, "loss": 0.0202, "step": 7646 }, { "epoch": 2.95707656612529, "grad_norm": 0.2788351020780349, "learning_rate": 6.234989713486683e-09, "loss": 0.018, "step": 7647 }, { "epoch": 2.9574632637277647, "grad_norm": 0.270487494629462, "learning_rate": 6.123176434291167e-09, "loss": 0.0139, "step": 7648 }, { "epoch": 2.95784996133024, "grad_norm": 0.2790127189257436, "learning_rate": 6.0123742172985e-09, "loss": 0.0179, "step": 7649 }, { "epoch": 2.9582366589327145, "grad_norm": 0.2200252479964238, "learning_rate": 5.902583084941294e-09, "loss": 0.0201, "step": 7650 }, { "epoch": 2.9586233565351896, "grad_norm": 0.27451331767788, "learning_rate": 5.793803059448433e-09, "loss": 0.0169, "step": 7651 }, { "epoch": 2.9590100541376643, "grad_norm": 0.22515680343724231, "learning_rate": 5.686034162843412e-09, "loss": 0.0181, "step": 7652 }, { "epoch": 2.9593967517401394, "grad_norm": 0.30392646977339494, "learning_rate": 5.57927641694489e-09, "loss": 0.0188, "step": 7653 }, { "epoch": 2.959783449342614, "grad_norm": 0.2620631287599091, "learning_rate": 5.473529843367797e-09, "loss": 0.0185, "step": 7654 }, { "epoch": 2.9601701469450887, "grad_norm": 0.204190525918476, "learning_rate": 5.3687944635211206e-09, "loss": 0.0161, "step": 7655 }, { "epoch": 2.960556844547564, "grad_norm": 0.22883722324413627, "learning_rate": 5.265070298609564e-09, "loss": 0.0189, "step": 7656 }, { "epoch": 2.9609435421500385, "grad_norm": 0.23380942832445128, "learning_rate": 5.162357369632998e-09, "loss": 0.0207, "step": 7657 }, { "epoch": 2.9613302397525136, "grad_norm": 0.35555309308875127, "learning_rate": 5.060655697387562e-09, "loss": 0.0254, "step": 7658 }, { "epoch": 2.9617169373549883, "grad_norm": 0.3580899285713402, "learning_rate": 4.959965302462899e-09, "loss": 0.03, "step": 7659 }, { "epoch": 2.9621036349574634, "grad_norm": 0.25131000194456005, "learning_rate": 4.860286205244924e-09, "loss": 0.0193, "step": 7660 }, { "epoch": 2.962490332559938, "grad_norm": 0.33165266782222025, "learning_rate": 4.761618425915271e-09, "loss": 0.0235, "step": 7661 }, { "epoch": 2.9628770301624128, "grad_norm": 0.20383899122596424, "learning_rate": 4.663961984450182e-09, "loss": 0.0119, "step": 7662 }, { "epoch": 2.963263727764888, "grad_norm": 0.3052308070233327, "learning_rate": 4.567316900620511e-09, "loss": 0.019, "step": 7663 }, { "epoch": 2.9636504253673626, "grad_norm": 0.2194435515778963, "learning_rate": 4.471683193993936e-09, "loss": 0.0158, "step": 7664 }, { "epoch": 2.9640371229698377, "grad_norm": 0.21833804445013566, "learning_rate": 4.377060883931638e-09, "loss": 0.0193, "step": 7665 }, { "epoch": 2.9644238205723124, "grad_norm": 0.32643784664172515, "learning_rate": 4.283449989591626e-09, "loss": 0.0309, "step": 7666 }, { "epoch": 2.9648105181747875, "grad_norm": 0.2454195108442397, "learning_rate": 4.190850529925961e-09, "loss": 0.0178, "step": 7667 }, { "epoch": 2.965197215777262, "grad_norm": 0.24625763543376716, "learning_rate": 4.0992625236824234e-09, "loss": 0.0121, "step": 7668 }, { "epoch": 2.965583913379737, "grad_norm": 0.284708798421791, "learning_rate": 4.00868598940396e-09, "loss": 0.0237, "step": 7669 }, { "epoch": 2.965970610982212, "grad_norm": 0.26040921059996525, "learning_rate": 3.919120945429233e-09, "loss": 0.0143, "step": 7670 }, { "epoch": 2.966357308584687, "grad_norm": 0.24059615570866733, "learning_rate": 3.830567409890962e-09, "loss": 0.0203, "step": 7671 }, { "epoch": 2.9667440061871617, "grad_norm": 0.2611832127601772, "learning_rate": 3.7430254007181365e-09, "loss": 0.0181, "step": 7672 }, { "epoch": 2.9671307037896364, "grad_norm": 0.2321087505757096, "learning_rate": 3.656494935634358e-09, "loss": 0.0154, "step": 7673 }, { "epoch": 2.9675174013921115, "grad_norm": 0.25823729145653884, "learning_rate": 3.570976032158391e-09, "loss": 0.0207, "step": 7674 }, { "epoch": 2.967904098994586, "grad_norm": 0.2194802431284541, "learning_rate": 3.4864687076047175e-09, "loss": 0.0125, "step": 7675 }, { "epoch": 2.968290796597061, "grad_norm": 0.2209780506932813, "learning_rate": 3.402972979082431e-09, "loss": 0.0133, "step": 7676 }, { "epoch": 2.968677494199536, "grad_norm": 0.325002427521162, "learning_rate": 3.3204888634968956e-09, "loss": 0.0216, "step": 7677 }, { "epoch": 2.969064191802011, "grad_norm": 0.21569469808678263, "learning_rate": 3.2390163775469775e-09, "loss": 0.0166, "step": 7678 }, { "epoch": 2.9694508894044858, "grad_norm": 0.3064486470698069, "learning_rate": 3.158555537727814e-09, "loss": 0.0165, "step": 7679 }, { "epoch": 2.9698375870069604, "grad_norm": 0.21915365765021197, "learning_rate": 3.079106360329709e-09, "loss": 0.0158, "step": 7680 }, { "epoch": 2.9702242846094355, "grad_norm": 0.3718967418031255, "learning_rate": 3.0006688614386826e-09, "loss": 0.0173, "step": 7681 }, { "epoch": 2.9706109822119102, "grad_norm": 0.2649612154406003, "learning_rate": 2.9232430569337e-09, "loss": 0.016, "step": 7682 }, { "epoch": 2.970997679814385, "grad_norm": 0.3024287746683226, "learning_rate": 2.846828962491666e-09, "loss": 0.0199, "step": 7683 }, { "epoch": 2.97138437741686, "grad_norm": 0.24773872965383192, "learning_rate": 2.771426593582427e-09, "loss": 0.0125, "step": 7684 }, { "epoch": 2.971771075019335, "grad_norm": 0.3109461625575248, "learning_rate": 2.6970359654732157e-09, "loss": 0.0246, "step": 7685 }, { "epoch": 2.97215777262181, "grad_norm": 0.281656498995332, "learning_rate": 2.623657093223653e-09, "loss": 0.0142, "step": 7686 }, { "epoch": 2.9725444702242845, "grad_norm": 0.24334638922491966, "learning_rate": 2.551289991691297e-09, "loss": 0.0158, "step": 7687 }, { "epoch": 2.9729311678267596, "grad_norm": 0.226393231743414, "learning_rate": 2.479934675527762e-09, "loss": 0.0158, "step": 7688 }, { "epoch": 2.9733178654292343, "grad_norm": 0.28292240450832623, "learning_rate": 2.4095911591781596e-09, "loss": 0.0289, "step": 7689 }, { "epoch": 2.973704563031709, "grad_norm": 0.20537446701192752, "learning_rate": 2.340259456886096e-09, "loss": 0.0238, "step": 7690 }, { "epoch": 2.974091260634184, "grad_norm": 0.2575597001691337, "learning_rate": 2.2719395826870107e-09, "loss": 0.0192, "step": 7691 }, { "epoch": 2.974477958236659, "grad_norm": 0.21736006322050214, "learning_rate": 2.2046315504137274e-09, "loss": 0.0131, "step": 7692 }, { "epoch": 2.974864655839134, "grad_norm": 0.1940595765328098, "learning_rate": 2.1383353736931233e-09, "loss": 0.0107, "step": 7693 }, { "epoch": 2.9752513534416085, "grad_norm": 0.3152966604478872, "learning_rate": 2.0730510659477952e-09, "loss": 0.0166, "step": 7694 }, { "epoch": 2.9756380510440836, "grad_norm": 0.3128026046497086, "learning_rate": 2.0087786403955035e-09, "loss": 0.0283, "step": 7695 }, { "epoch": 2.9760247486465583, "grad_norm": 0.9310017921856887, "learning_rate": 1.9455181100480614e-09, "loss": 0.0195, "step": 7696 }, { "epoch": 2.9764114462490334, "grad_norm": 0.27128717069000535, "learning_rate": 1.8832694877141123e-09, "loss": 0.0118, "step": 7697 }, { "epoch": 2.976798143851508, "grad_norm": 0.2958312157953728, "learning_rate": 1.8220327859957975e-09, "loss": 0.0173, "step": 7698 }, { "epoch": 2.977184841453983, "grad_norm": 0.2337237808418791, "learning_rate": 1.761808017291533e-09, "loss": 0.0149, "step": 7699 }, { "epoch": 2.977571539056458, "grad_norm": 0.27136106584538977, "learning_rate": 1.702595193794343e-09, "loss": 0.0184, "step": 7700 }, { "epoch": 2.9779582366589326, "grad_norm": 0.2847265970735418, "learning_rate": 1.6443943274929709e-09, "loss": 0.024, "step": 7701 }, { "epoch": 2.9783449342614077, "grad_norm": 0.1955790462142066, "learning_rate": 1.5872054301696583e-09, "loss": 0.0116, "step": 7702 }, { "epoch": 2.9787316318638823, "grad_norm": 0.2258835218815629, "learning_rate": 1.5310285134034764e-09, "loss": 0.0204, "step": 7703 }, { "epoch": 2.9791183294663575, "grad_norm": 0.2475855970571635, "learning_rate": 1.47586358856866e-09, "loss": 0.0229, "step": 7704 }, { "epoch": 2.979505027068832, "grad_norm": 0.2625789013960461, "learning_rate": 1.4217106668329427e-09, "loss": 0.0149, "step": 7705 }, { "epoch": 2.9798917246713073, "grad_norm": 0.28030381300472973, "learning_rate": 1.368569759160332e-09, "loss": 0.0208, "step": 7706 }, { "epoch": 2.980278422273782, "grad_norm": 0.21344802447243466, "learning_rate": 1.316440876310554e-09, "loss": 0.015, "step": 7707 }, { "epoch": 2.9806651198762566, "grad_norm": 0.1923273492432618, "learning_rate": 1.265324028836834e-09, "loss": 0.011, "step": 7708 }, { "epoch": 2.9810518174787317, "grad_norm": 0.26928348399282903, "learning_rate": 1.2152192270881157e-09, "loss": 0.013, "step": 7709 }, { "epoch": 2.9814385150812064, "grad_norm": 0.2766353752271138, "learning_rate": 1.1661264812096175e-09, "loss": 0.0243, "step": 7710 }, { "epoch": 2.9818252126836815, "grad_norm": 0.2989290306468233, "learning_rate": 1.1180458011395002e-09, "loss": 0.0256, "step": 7711 }, { "epoch": 2.982211910286156, "grad_norm": 0.18918197968524483, "learning_rate": 1.0709771966133099e-09, "loss": 0.0112, "step": 7712 }, { "epoch": 2.9825986078886313, "grad_norm": 0.24686599093148134, "learning_rate": 1.0249206771600906e-09, "loss": 0.0139, "step": 7713 }, { "epoch": 2.982985305491106, "grad_norm": 0.25834846641060955, "learning_rate": 9.798762521040505e-10, "loss": 0.0202, "step": 7714 }, { "epoch": 2.9833720030935806, "grad_norm": 0.2796109030305868, "learning_rate": 9.358439305656719e-10, "loss": 0.0218, "step": 7715 }, { "epoch": 2.9837587006960558, "grad_norm": 0.2187334457721638, "learning_rate": 8.928237214594903e-10, "loss": 0.017, "step": 7716 }, { "epoch": 2.9841453982985304, "grad_norm": 0.26949526103266397, "learning_rate": 8.508156334946505e-10, "loss": 0.0181, "step": 7717 }, { "epoch": 2.9845320959010055, "grad_norm": 0.23827647010153544, "learning_rate": 8.098196751771259e-10, "loss": 0.0128, "step": 7718 }, { "epoch": 2.98491879350348, "grad_norm": 0.23930012202757514, "learning_rate": 7.69835854806944e-10, "loss": 0.013, "step": 7719 }, { "epoch": 2.9853054911059553, "grad_norm": 0.25321965987752043, "learning_rate": 7.308641804781858e-10, "loss": 0.0235, "step": 7720 }, { "epoch": 2.98569218870843, "grad_norm": 0.29120253263242973, "learning_rate": 6.929046600823164e-10, "loss": 0.0187, "step": 7721 }, { "epoch": 2.9860788863109047, "grad_norm": 0.2545042057267935, "learning_rate": 6.559573013042997e-10, "loss": 0.0176, "step": 7722 }, { "epoch": 2.98646558391338, "grad_norm": 0.2829726056686017, "learning_rate": 6.200221116237082e-10, "loss": 0.0181, "step": 7723 }, { "epoch": 2.9868522815158545, "grad_norm": 0.30370966615081263, "learning_rate": 5.850990983169436e-10, "loss": 0.0194, "step": 7724 }, { "epoch": 2.9872389791183296, "grad_norm": 0.3077588935372271, "learning_rate": 5.51188268453906e-10, "loss": 0.0245, "step": 7725 }, { "epoch": 2.9876256767208043, "grad_norm": 0.28064710988655245, "learning_rate": 5.182896289007699e-10, "loss": 0.0179, "step": 7726 }, { "epoch": 2.9880123743232794, "grad_norm": 0.20645715690075728, "learning_rate": 4.864031863183183e-10, "loss": 0.0111, "step": 7727 }, { "epoch": 2.988399071925754, "grad_norm": 0.2336777592655324, "learning_rate": 4.555289471613877e-10, "loss": 0.0141, "step": 7728 }, { "epoch": 2.9887857695282287, "grad_norm": 0.22740758341434722, "learning_rate": 4.25666917681089e-10, "loss": 0.017, "step": 7729 }, { "epoch": 2.989172467130704, "grad_norm": 0.2858197147666055, "learning_rate": 3.968171039236968e-10, "loss": 0.0204, "step": 7730 }, { "epoch": 2.9895591647331785, "grad_norm": 0.30159936339299104, "learning_rate": 3.6897951173009425e-10, "loss": 0.0184, "step": 7731 }, { "epoch": 2.9899458623356536, "grad_norm": 0.2731792694103445, "learning_rate": 3.4215414673632873e-10, "loss": 0.0199, "step": 7732 }, { "epoch": 2.9903325599381283, "grad_norm": 0.29233745630872904, "learning_rate": 3.1634101437305607e-10, "loss": 0.0245, "step": 7733 }, { "epoch": 2.9907192575406034, "grad_norm": 0.2492411003449426, "learning_rate": 2.915401198666512e-10, "loss": 0.0238, "step": 7734 }, { "epoch": 2.991105955143078, "grad_norm": 0.3040102559471703, "learning_rate": 2.677514682380977e-10, "loss": 0.0222, "step": 7735 }, { "epoch": 2.9914926527455528, "grad_norm": 0.244327620934352, "learning_rate": 2.4497506430409823e-10, "loss": 0.0149, "step": 7736 }, { "epoch": 2.991879350348028, "grad_norm": 0.19833385248616156, "learning_rate": 2.2321091267596407e-10, "loss": 0.01, "step": 7737 }, { "epoch": 2.9922660479505025, "grad_norm": 0.3816526318151247, "learning_rate": 2.024590177596153e-10, "loss": 0.022, "step": 7738 }, { "epoch": 2.9926527455529777, "grad_norm": 0.2940615350945476, "learning_rate": 1.8271938375669097e-10, "loss": 0.0198, "step": 7739 }, { "epoch": 2.9930394431554523, "grad_norm": 0.21256900963953793, "learning_rate": 1.6399201466399394e-10, "loss": 0.0131, "step": 7740 }, { "epoch": 2.9934261407579275, "grad_norm": 0.3230777313837747, "learning_rate": 1.462769142729359e-10, "loss": 0.0201, "step": 7741 }, { "epoch": 2.993812838360402, "grad_norm": 0.274283179284252, "learning_rate": 1.2957408616953715e-10, "loss": 0.0132, "step": 7742 }, { "epoch": 2.994199535962877, "grad_norm": 0.20445554595845652, "learning_rate": 1.1388353373609218e-10, "loss": 0.016, "step": 7743 }, { "epoch": 2.994586233565352, "grad_norm": 0.26935723222917146, "learning_rate": 9.920526014950416e-11, "loss": 0.0215, "step": 7744 }, { "epoch": 2.994972931167827, "grad_norm": 0.2706381318459455, "learning_rate": 8.55392683807299e-11, "loss": 0.0163, "step": 7745 }, { "epoch": 2.9953596287703017, "grad_norm": 0.2696695732742662, "learning_rate": 7.28855611970003e-11, "loss": 0.0186, "step": 7746 }, { "epoch": 2.9957463263727764, "grad_norm": 0.21049919368196884, "learning_rate": 6.12441411607101e-11, "loss": 0.0166, "step": 7747 }, { "epoch": 2.9961330239752515, "grad_norm": 0.22470572161786773, "learning_rate": 5.0615010627752534e-11, "loss": 0.0185, "step": 7748 }, { "epoch": 2.996519721577726, "grad_norm": 0.31875723327015487, "learning_rate": 4.099817175085008e-11, "loss": 0.0228, "step": 7749 }, { "epoch": 2.996906419180201, "grad_norm": 0.6835317990854345, "learning_rate": 3.239362647733391e-11, "loss": 0.0251, "step": 7750 }, { "epoch": 2.997293116782676, "grad_norm": 0.2829543246520429, "learning_rate": 2.4801376548033716e-11, "loss": 0.0189, "step": 7751 }, { "epoch": 2.997679814385151, "grad_norm": 0.35796551898557744, "learning_rate": 1.8221423501163514e-11, "loss": 0.0257, "step": 7752 }, { "epoch": 2.9980665119876257, "grad_norm": 0.3426175742710674, "learning_rate": 1.2653768668990928e-11, "loss": 0.0201, "step": 7753 }, { "epoch": 2.9984532095901004, "grad_norm": 0.2812311601060425, "learning_rate": 8.098413177837217e-12, "loss": 0.012, "step": 7754 }, { "epoch": 2.9988399071925755, "grad_norm": 0.3165068616682353, "learning_rate": 4.555357950297712e-12, "loss": 0.0242, "step": 7755 }, { "epoch": 2.99922660479505, "grad_norm": 0.3182691445104684, "learning_rate": 2.0246037041316e-12, "loss": 0.0173, "step": 7756 }, { "epoch": 2.999613302397525, "grad_norm": 0.2597649044769413, "learning_rate": 5.061509517068075e-13, "loss": 0.0243, "step": 7757 }, { "epoch": 3.0, "grad_norm": 0.29465208594296266, "learning_rate": 0.0, "loss": 0.0238, "step": 7758 }, { "epoch": 3.0, "eval_loss": 0.025541018694639206, "eval_runtime": 787.078, "eval_samples_per_second": 24.996, "eval_steps_per_second": 0.781, "step": 7758 }, { "epoch": 3.0, "step": 7758, "total_flos": 2332096332890112.0, "train_loss": 0.03801635660235435, "train_runtime": 128961.4313, "train_samples_per_second": 7.699, "train_steps_per_second": 0.06 } ], "logging_steps": 1, "max_steps": 7758, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2332096332890112.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }