{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 27584, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "crossentropy": 5.657482624053955, "epoch": 3.625290023201856e-05, "grad_norm": 2.765242338180542, "learning_rate": 1.8115942028985507e-05, "loss": 5.76, "step": 1 }, { "crossentropy": 5.648288249969482, "epoch": 7.250580046403712e-05, "grad_norm": 1.7149956226348877, "learning_rate": 3.6231884057971014e-05, "loss": 5.7901, "step": 2 }, { "crossentropy": 5.585716247558594, "epoch": 0.00010875870069605568, "grad_norm": 2.098741054534912, "learning_rate": 5.4347826086956524e-05, "loss": 5.6953, "step": 3 }, { "crossentropy": 5.75126314163208, "epoch": 0.00014501160092807424, "grad_norm": 1.5369895696640015, "learning_rate": 7.246376811594203e-05, "loss": 5.7137, "step": 4 }, { "crossentropy": 6.088944435119629, "epoch": 0.0001812645011600928, "grad_norm": 1.6631064414978027, "learning_rate": 9.057971014492754e-05, "loss": 5.8801, "step": 5 }, { "crossentropy": 5.657492637634277, "epoch": 0.00021751740139211136, "grad_norm": 1.6087040901184082, "learning_rate": 0.00010869565217391305, "loss": 5.7009, "step": 6 }, { "crossentropy": 5.623044967651367, "epoch": 0.0002537703016241299, "grad_norm": 1.5264256000518799, "learning_rate": 0.00012681159420289856, "loss": 5.6792, "step": 7 }, { "crossentropy": 5.719192981719971, "epoch": 0.0002900232018561485, "grad_norm": 1.5831079483032227, "learning_rate": 0.00014492753623188405, "loss": 5.6712, "step": 8 }, { "crossentropy": 5.581594944000244, "epoch": 0.00032627610208816704, "grad_norm": 1.3904093503952026, "learning_rate": 0.00016304347826086955, "loss": 5.6012, "step": 9 }, { "crossentropy": 5.570854187011719, "epoch": 0.0003625290023201856, "grad_norm": 1.3816750049591064, "learning_rate": 0.00018115942028985507, "loss": 5.6472, "step": 10 }, { "crossentropy": 5.646821975708008, "epoch": 0.00039878190255220416, "grad_norm": 1.5102839469909668, "learning_rate": 0.00019927536231884057, "loss": 5.5323, "step": 11 }, { "crossentropy": 5.667036056518555, "epoch": 0.0004350348027842227, "grad_norm": 1.306938886642456, "learning_rate": 0.0002173913043478261, "loss": 5.7507, "step": 12 }, { "crossentropy": 5.61738920211792, "epoch": 0.0004712877030162413, "grad_norm": 1.147897720336914, "learning_rate": 0.0002355072463768116, "loss": 5.6385, "step": 13 }, { "crossentropy": 5.52905797958374, "epoch": 0.0005075406032482598, "grad_norm": 1.0561491250991821, "learning_rate": 0.0002536231884057971, "loss": 5.4965, "step": 14 }, { "crossentropy": 5.8374714851379395, "epoch": 0.0005437935034802784, "grad_norm": 1.356590986251831, "learning_rate": 0.0002717391304347826, "loss": 5.6086, "step": 15 }, { "crossentropy": 5.541373252868652, "epoch": 0.000580046403712297, "grad_norm": 0.932308554649353, "grad_norm_var": 0.1839440630167869, "learning_rate": 0.0002898550724637681, "loss": 5.4879, "step": 16 }, { "crossentropy": 5.455516338348389, "epoch": 0.0006162993039443155, "grad_norm": 0.8412453532218933, "grad_norm_var": 0.10002037620423712, "learning_rate": 0.00030797101449275366, "loss": 5.4014, "step": 17 }, { "crossentropy": 5.68115234375, "epoch": 0.0006525522041763341, "grad_norm": 0.8789597749710083, "grad_norm_var": 0.11037264687339125, "learning_rate": 0.0003260869565217391, "loss": 5.6867, "step": 18 }, { "crossentropy": 5.409543037414551, "epoch": 0.0006888051044083526, "grad_norm": 0.8186715841293335, "grad_norm_var": 0.08733350386095891, "learning_rate": 0.00034420289855072465, "loss": 5.4018, "step": 19 }, { "crossentropy": 5.346035480499268, "epoch": 0.0007250580046403712, "grad_norm": 0.7042343020439148, "grad_norm_var": 0.10255415585550172, "learning_rate": 0.00036231884057971015, "loss": 5.4144, "step": 20 }, { "crossentropy": 5.512085914611816, "epoch": 0.0007613109048723898, "grad_norm": 0.7121629118919373, "grad_norm_var": 0.10436952351864393, "learning_rate": 0.0003804347826086957, "loss": 5.4159, "step": 21 }, { "crossentropy": 5.337124347686768, "epoch": 0.0007975638051044083, "grad_norm": 0.6628484129905701, "grad_norm_var": 0.10523984691818426, "learning_rate": 0.00039855072463768114, "loss": 5.2749, "step": 22 }, { "crossentropy": 5.3433098793029785, "epoch": 0.0008338167053364269, "grad_norm": 0.6867004036903381, "grad_norm_var": 0.10303584003247411, "learning_rate": 0.00041666666666666664, "loss": 5.3434, "step": 23 }, { "crossentropy": 5.461874008178711, "epoch": 0.0008700696055684454, "grad_norm": 0.7407732009887695, "grad_norm_var": 0.08870188367722774, "learning_rate": 0.0004347826086956522, "loss": 5.407, "step": 24 }, { "crossentropy": 5.085108280181885, "epoch": 0.000906322505800464, "grad_norm": 0.5986258387565613, "grad_norm_var": 0.08751207224528462, "learning_rate": 0.0004528985507246377, "loss": 5.2454, "step": 25 }, { "crossentropy": 5.332317352294922, "epoch": 0.0009425754060324826, "grad_norm": 0.662661075592041, "grad_norm_var": 0.0792546190791498, "learning_rate": 0.0004710144927536232, "loss": 5.2573, "step": 26 }, { "crossentropy": 5.124617576599121, "epoch": 0.0009788283062645011, "grad_norm": 0.7089167237281799, "grad_norm_var": 0.05563273814677907, "learning_rate": 0.0004891304347826087, "loss": 5.2116, "step": 27 }, { "crossentropy": 5.228032112121582, "epoch": 0.0010150812064965197, "grad_norm": 0.5875171422958374, "grad_norm_var": 0.04544288303996695, "learning_rate": 0.0005072463768115942, "loss": 5.2242, "step": 28 }, { "crossentropy": 5.314358234405518, "epoch": 0.0010513341067285382, "grad_norm": 0.6402634978294373, "grad_norm_var": 0.03925463680976969, "learning_rate": 0.0005253623188405797, "loss": 5.3168, "step": 29 }, { "crossentropy": 5.200428485870361, "epoch": 0.0010875870069605568, "grad_norm": 0.5870943665504456, "grad_norm_var": 0.036159475698294335, "learning_rate": 0.0005434782608695652, "loss": 5.1799, "step": 30 }, { "crossentropy": 5.201673984527588, "epoch": 0.0011238399071925754, "grad_norm": 0.5389686822891235, "grad_norm_var": 0.012627526523099642, "learning_rate": 0.0005615942028985507, "loss": 5.217, "step": 31 }, { "crossentropy": 5.276546478271484, "epoch": 0.001160092807424594, "grad_norm": 0.566356897354126, "grad_norm_var": 0.009973316909626684, "learning_rate": 0.0005797101449275362, "loss": 5.2742, "step": 32 }, { "crossentropy": 5.180262565612793, "epoch": 0.0011963457076566125, "grad_norm": 0.5723462104797363, "grad_norm_var": 0.008836810396886996, "learning_rate": 0.0005978260869565218, "loss": 5.2394, "step": 33 }, { "crossentropy": 4.984674453735352, "epoch": 0.001232598607888631, "grad_norm": 0.5306499600410461, "grad_norm_var": 0.006561384339112057, "learning_rate": 0.0006159420289855073, "loss": 5.0582, "step": 34 }, { "crossentropy": 5.164369106292725, "epoch": 0.0012688515081206496, "grad_norm": 0.931186854839325, "grad_norm_var": 0.009959175490611887, "learning_rate": 0.0006340579710144928, "loss": 5.1135, "step": 35 }, { "crossentropy": 5.25167179107666, "epoch": 0.0013051044083526682, "grad_norm": 1.1481037139892578, "grad_norm_var": 0.02536685537453942, "learning_rate": 0.0006521739130434782, "loss": 5.102, "step": 36 }, { "crossentropy": 5.262511730194092, "epoch": 0.0013413573085846867, "grad_norm": 0.48443806171417236, "grad_norm_var": 0.027622291022620437, "learning_rate": 0.0006702898550724638, "loss": 5.0879, "step": 37 }, { "crossentropy": 5.022038459777832, "epoch": 0.0013776102088167053, "grad_norm": 0.44529759883880615, "grad_norm_var": 0.030656232090325596, "learning_rate": 0.0006884057971014493, "loss": 5.0531, "step": 38 }, { "crossentropy": 5.010119438171387, "epoch": 0.0014138631090487238, "grad_norm": 0.47453922033309937, "grad_norm_var": 0.03248418216251682, "learning_rate": 0.0007065217391304347, "loss": 5.0236, "step": 39 }, { "crossentropy": 4.852511405944824, "epoch": 0.0014501160092807424, "grad_norm": 0.49381205439567566, "grad_norm_var": 0.03293195653283917, "learning_rate": 0.0007246376811594203, "loss": 4.9401, "step": 40 }, { "crossentropy": 5.013350009918213, "epoch": 0.001486368909512761, "grad_norm": 0.4619635045528412, "grad_norm_var": 0.03454654455289619, "learning_rate": 0.0007427536231884058, "loss": 5.0354, "step": 41 }, { "crossentropy": 5.139444828033447, "epoch": 0.0015226218097447795, "grad_norm": 0.43311601877212524, "grad_norm_var": 0.03636975643992487, "learning_rate": 0.0007608695652173914, "loss": 5.0998, "step": 42 }, { "crossentropy": 4.782350540161133, "epoch": 0.001558874709976798, "grad_norm": 0.4879867434501648, "grad_norm_var": 0.036220403686857715, "learning_rate": 0.0007789855072463768, "loss": 4.8591, "step": 43 }, { "crossentropy": 5.116602420806885, "epoch": 0.0015951276102088166, "grad_norm": 0.4727453291416168, "grad_norm_var": 0.03702778020916674, "learning_rate": 0.0007971014492753623, "loss": 5.0033, "step": 44 }, { "crossentropy": 4.874434947967529, "epoch": 0.0016313805104408352, "grad_norm": 0.41835930943489075, "grad_norm_var": 0.0383017583436116, "learning_rate": 0.0008152173913043479, "loss": 4.8764, "step": 45 }, { "crossentropy": 4.914855480194092, "epoch": 0.0016676334106728538, "grad_norm": 0.4212419092655182, "grad_norm_var": 0.039541986559722686, "learning_rate": 0.0008333333333333333, "loss": 4.9152, "step": 46 }, { "crossentropy": 4.933865547180176, "epoch": 0.0017038863109048723, "grad_norm": 0.8342291116714478, "grad_norm_var": 0.044356798549225175, "learning_rate": 0.0008514492753623189, "loss": 4.8482, "step": 47 }, { "crossentropy": 4.870279788970947, "epoch": 0.0017401392111368909, "grad_norm": 0.4034121036529541, "grad_norm_var": 0.046171933068425666, "learning_rate": 0.0008695652173913044, "loss": 4.8999, "step": 48 }, { "crossentropy": 4.6477532386779785, "epoch": 0.0017763921113689094, "grad_norm": 0.41017699241638184, "grad_norm_var": 0.047620857559553656, "learning_rate": 0.0008876811594202899, "loss": 4.8162, "step": 49 }, { "crossentropy": 4.915560722351074, "epoch": 0.001812645011600928, "grad_norm": 0.4022304117679596, "grad_norm_var": 0.04903775938265521, "learning_rate": 0.0009057971014492754, "loss": 4.8029, "step": 50 }, { "crossentropy": 4.866330146789551, "epoch": 0.0018488979118329466, "grad_norm": 0.4288548529148102, "grad_norm_var": 0.038954864684111946, "learning_rate": 0.0009239130434782609, "loss": 4.9629, "step": 51 }, { "crossentropy": 4.704883575439453, "epoch": 0.0018851508120649651, "grad_norm": 0.37232303619384766, "grad_norm_var": 0.010956956804869, "learning_rate": 0.0009420289855072464, "loss": 4.837, "step": 52 }, { "crossentropy": 4.94840669631958, "epoch": 0.0019214037122969837, "grad_norm": 0.4246358275413513, "grad_norm_var": 0.011027839409977865, "learning_rate": 0.0009601449275362319, "loss": 4.9738, "step": 53 }, { "crossentropy": 4.867421627044678, "epoch": 0.0019576566125290022, "grad_norm": 0.8696596026420593, "grad_norm_var": 0.02136300833869754, "learning_rate": 0.0009782608695652175, "loss": 4.8052, "step": 54 }, { "crossentropy": 4.814804553985596, "epoch": 0.001993909512761021, "grad_norm": 0.37308114767074585, "grad_norm_var": 0.022189548404817835, "learning_rate": 0.0009963768115942029, "loss": 4.7406, "step": 55 }, { "crossentropy": 4.728457927703857, "epoch": 0.0020301624129930394, "grad_norm": 0.3679368495941162, "grad_norm_var": 0.022977211577001087, "learning_rate": 0.0010144927536231885, "loss": 4.7217, "step": 56 }, { "crossentropy": 4.72299861907959, "epoch": 0.002066415313225058, "grad_norm": 0.3430425822734833, "grad_norm_var": 0.024049921498874114, "learning_rate": 0.0010326086956521738, "loss": 4.732, "step": 57 }, { "crossentropy": 4.778980731964111, "epoch": 0.0021026682134570765, "grad_norm": 0.43546098470687866, "grad_norm_var": 0.02403984619086364, "learning_rate": 0.0010507246376811595, "loss": 4.7826, "step": 58 }, { "crossentropy": 4.741180419921875, "epoch": 0.0021389211136890953, "grad_norm": 0.41067400574684143, "grad_norm_var": 0.02419281874049727, "learning_rate": 0.001068840579710145, "loss": 4.7425, "step": 59 }, { "crossentropy": 4.787547588348389, "epoch": 0.0021751740139211136, "grad_norm": 0.34844252467155457, "grad_norm_var": 0.024976350927487203, "learning_rate": 0.0010869565217391304, "loss": 4.7763, "step": 60 }, { "crossentropy": 4.620233058929443, "epoch": 0.0022114269141531324, "grad_norm": 0.41425156593322754, "grad_norm_var": 0.024996917727497334, "learning_rate": 0.001105072463768116, "loss": 4.7038, "step": 61 }, { "crossentropy": 4.615128517150879, "epoch": 0.0022476798143851507, "grad_norm": 0.3117712438106537, "grad_norm_var": 0.026220081777765653, "learning_rate": 0.0011231884057971014, "loss": 4.6061, "step": 62 }, { "crossentropy": 4.709559917449951, "epoch": 0.0022839327146171695, "grad_norm": 0.37151968479156494, "grad_norm_var": 0.01570438446131117, "learning_rate": 0.0011413043478260868, "loss": 4.7261, "step": 63 }, { "crossentropy": 4.699501037597656, "epoch": 0.002320185614849188, "grad_norm": 0.3867760896682739, "grad_norm_var": 0.015753966695632827, "learning_rate": 0.0011594202898550724, "loss": 4.6484, "step": 64 }, { "crossentropy": 4.508828639984131, "epoch": 0.0023564385150812066, "grad_norm": 0.28546223044395447, "grad_norm_var": 0.016838326462978302, "learning_rate": 0.001177536231884058, "loss": 4.5461, "step": 65 }, { "crossentropy": 4.70086669921875, "epoch": 0.002392691415313225, "grad_norm": 0.3223285377025604, "grad_norm_var": 0.017310879534808428, "learning_rate": 0.0011956521739130436, "loss": 4.6153, "step": 66 }, { "crossentropy": 4.7410454750061035, "epoch": 0.0024289443155452437, "grad_norm": 0.6461515426635742, "grad_norm_var": 0.020978090837746658, "learning_rate": 0.001213768115942029, "loss": 4.6881, "step": 67 }, { "crossentropy": 4.575645446777344, "epoch": 0.002465197215777262, "grad_norm": 0.30999574065208435, "grad_norm_var": 0.021598145416012595, "learning_rate": 0.0012318840579710146, "loss": 4.5273, "step": 68 }, { "crossentropy": 4.624166488647461, "epoch": 0.002501450116009281, "grad_norm": 0.3118176758289337, "grad_norm_var": 0.022231011114251336, "learning_rate": 0.00125, "loss": 4.6172, "step": 69 }, { "crossentropy": 4.545130729675293, "epoch": 0.002537703016241299, "grad_norm": 0.49287647008895874, "grad_norm_var": 0.007849487374309409, "learning_rate": 0.0012681159420289856, "loss": 4.5511, "step": 70 }, { "crossentropy": 4.563446998596191, "epoch": 0.002573955916473318, "grad_norm": 0.28058046102523804, "grad_norm_var": 0.008509360804812838, "learning_rate": 0.001286231884057971, "loss": 4.5476, "step": 71 }, { "crossentropy": 4.612325191497803, "epoch": 0.0026102088167053363, "grad_norm": 0.33336859941482544, "grad_norm_var": 0.008627860902522854, "learning_rate": 0.0013043478260869564, "loss": 4.629, "step": 72 }, { "crossentropy": 4.553452014923096, "epoch": 0.002646461716937355, "grad_norm": 0.2727714776992798, "grad_norm_var": 0.009238558909682271, "learning_rate": 0.001322463768115942, "loss": 4.5679, "step": 73 }, { "crossentropy": 4.61199426651001, "epoch": 0.0026827146171693734, "grad_norm": 0.28093552589416504, "grad_norm_var": 0.009400571153544724, "learning_rate": 0.0013405797101449276, "loss": 4.5005, "step": 74 }, { "crossentropy": 4.514711380004883, "epoch": 0.002718967517401392, "grad_norm": 0.26186808943748474, "grad_norm_var": 0.009803566910020722, "learning_rate": 0.001358695652173913, "loss": 4.5691, "step": 75 }, { "crossentropy": 4.558838844299316, "epoch": 0.0027552204176334106, "grad_norm": 0.26831796765327454, "grad_norm_var": 0.010242096117330369, "learning_rate": 0.0013768115942028986, "loss": 4.5192, "step": 76 }, { "crossentropy": 4.57639217376709, "epoch": 0.0027914733178654293, "grad_norm": 0.3416396379470825, "grad_norm_var": 0.009919794343850112, "learning_rate": 0.0013949275362318842, "loss": 4.5074, "step": 77 }, { "crossentropy": 4.589596748352051, "epoch": 0.0028277262180974477, "grad_norm": 0.24692387878894806, "grad_norm_var": 0.010447325665009461, "learning_rate": 0.0014130434782608694, "loss": 4.5675, "step": 78 }, { "crossentropy": 4.6283087730407715, "epoch": 0.0028639791183294665, "grad_norm": 0.2800174355506897, "grad_norm_var": 0.01056573378641623, "learning_rate": 0.001431159420289855, "loss": 4.5657, "step": 79 }, { "crossentropy": 4.511780261993408, "epoch": 0.002900232018561485, "grad_norm": 0.2488143891096115, "grad_norm_var": 0.010759025808717378, "learning_rate": 0.0014492753623188406, "loss": 4.513, "step": 80 }, { "crossentropy": 4.384176731109619, "epoch": 0.0029364849187935036, "grad_norm": 0.2607593834400177, "grad_norm_var": 0.010924070742947982, "learning_rate": 0.0014673913043478262, "loss": 4.4559, "step": 81 }, { "crossentropy": 4.475325584411621, "epoch": 0.002972737819025522, "grad_norm": 0.2332693189382553, "grad_norm_var": 0.011421209952376572, "learning_rate": 0.0014855072463768116, "loss": 4.4161, "step": 82 }, { "crossentropy": 4.538520336151123, "epoch": 0.0030089907192575407, "grad_norm": 0.3231326937675476, "grad_norm_var": 0.0037611524859119108, "learning_rate": 0.0015036231884057972, "loss": 4.4987, "step": 83 }, { "crossentropy": 4.480780601501465, "epoch": 0.003045243619489559, "grad_norm": 0.23043230175971985, "grad_norm_var": 0.004015677708152276, "learning_rate": 0.0015217391304347828, "loss": 4.4246, "step": 84 }, { "crossentropy": 4.498739719390869, "epoch": 0.003081496519721578, "grad_norm": 0.25715479254722595, "grad_norm_var": 0.004055952463118583, "learning_rate": 0.001539855072463768, "loss": 4.5057, "step": 85 }, { "crossentropy": 4.4514031410217285, "epoch": 0.003117749419953596, "grad_norm": 0.22328197956085205, "grad_norm_var": 0.001244975139737454, "learning_rate": 0.0015579710144927536, "loss": 4.453, "step": 86 }, { "crossentropy": 4.105385780334473, "epoch": 0.003154002320185615, "grad_norm": 0.24489901959896088, "grad_norm_var": 0.0012811297666824207, "learning_rate": 0.0015760869565217392, "loss": 4.2532, "step": 87 }, { "crossentropy": 4.228404998779297, "epoch": 0.0031902552204176333, "grad_norm": 0.25829124450683594, "grad_norm_var": 0.0009913118207197868, "learning_rate": 0.0015942028985507246, "loss": 4.2855, "step": 88 }, { "crossentropy": 4.336317539215088, "epoch": 0.003226508120649652, "grad_norm": 0.2277096062898636, "grad_norm_var": 0.001068716673308537, "learning_rate": 0.0016123188405797102, "loss": 4.4177, "step": 89 }, { "crossentropy": 4.317057132720947, "epoch": 0.0032627610208816704, "grad_norm": 0.24074344336986542, "grad_norm_var": 0.0010666800967500537, "learning_rate": 0.0016304347826086958, "loss": 4.3553, "step": 90 }, { "crossentropy": 4.478431224822998, "epoch": 0.003299013921113689, "grad_norm": 0.22058692574501038, "grad_norm_var": 0.0011585219064805734, "learning_rate": 0.0016485507246376814, "loss": 4.435, "step": 91 }, { "crossentropy": 4.424564361572266, "epoch": 0.0033352668213457075, "grad_norm": 0.22510817646980286, "grad_norm_var": 0.0012078386706812605, "learning_rate": 0.0016666666666666666, "loss": 4.408, "step": 92 }, { "crossentropy": 4.286017417907715, "epoch": 0.0033715197215777263, "grad_norm": 0.2152361124753952, "grad_norm_var": 0.0007280914294463565, "learning_rate": 0.0016847826086956522, "loss": 4.4138, "step": 93 }, { "crossentropy": 4.3135294914245605, "epoch": 0.0034077726218097446, "grad_norm": 0.21083571016788483, "grad_norm_var": 0.0008051516705587893, "learning_rate": 0.0017028985507246378, "loss": 4.2995, "step": 94 }, { "crossentropy": 4.436722278594971, "epoch": 0.0034440255220417634, "grad_norm": 0.27359771728515625, "grad_norm_var": 0.0007766984869035663, "learning_rate": 0.0017210144927536232, "loss": 4.4508, "step": 95 }, { "crossentropy": 4.270875930786133, "epoch": 0.0034802784222737818, "grad_norm": 0.23538550734519958, "grad_norm_var": 0.0007782136231511398, "learning_rate": 0.0017391304347826088, "loss": 4.2968, "step": 96 }, { "crossentropy": 4.365535736083984, "epoch": 0.0035165313225058005, "grad_norm": 0.5062203407287598, "grad_norm_var": 0.005140634493164686, "learning_rate": 0.0017572463768115944, "loss": 4.3759, "step": 97 }, { "crossentropy": 4.289638042449951, "epoch": 0.003552784222737819, "grad_norm": 0.20142437517642975, "grad_norm_var": 0.005308460761033991, "learning_rate": 0.0017753623188405798, "loss": 4.2977, "step": 98 }, { "crossentropy": 4.286318778991699, "epoch": 0.0035890371229698377, "grad_norm": 0.20671236515045166, "grad_norm_var": 0.005111583615339444, "learning_rate": 0.0017934782608695651, "loss": 4.2471, "step": 99 }, { "crossentropy": 4.2075958251953125, "epoch": 0.003625290023201856, "grad_norm": 0.19432991743087769, "grad_norm_var": 0.005280503865402301, "learning_rate": 0.0018115942028985507, "loss": 4.2503, "step": 100 }, { "crossentropy": 4.220251560211182, "epoch": 0.0036615429234338748, "grad_norm": 0.1790691763162613, "grad_norm_var": 0.005549042040165366, "learning_rate": 0.0018297101449275364, "loss": 4.2577, "step": 101 }, { "crossentropy": 4.321506977081299, "epoch": 0.003697795823665893, "grad_norm": 0.2657252848148346, "grad_norm_var": 0.005558734975095572, "learning_rate": 0.0018478260869565217, "loss": 4.3715, "step": 102 }, { "crossentropy": 4.392655849456787, "epoch": 0.003734048723897912, "grad_norm": 0.18766900897026062, "grad_norm_var": 0.005757473669352361, "learning_rate": 0.0018659420289855073, "loss": 4.3948, "step": 103 }, { "crossentropy": 4.2992730140686035, "epoch": 0.0037703016241299302, "grad_norm": 0.19151321053504944, "grad_norm_var": 0.005878130588926918, "learning_rate": 0.0018840579710144927, "loss": 4.2985, "step": 104 }, { "crossentropy": 4.404228687286377, "epoch": 0.003806554524361949, "grad_norm": 0.21028070151805878, "grad_norm_var": 0.005917233783878547, "learning_rate": 0.0019021739130434783, "loss": 4.315, "step": 105 }, { "crossentropy": 4.210457801818848, "epoch": 0.0038428074245939674, "grad_norm": 0.1978939026594162, "grad_norm_var": 0.0060007598347237975, "learning_rate": 0.0019202898550724637, "loss": 4.2217, "step": 106 }, { "crossentropy": 4.285305976867676, "epoch": 0.003879060324825986, "grad_norm": 0.17414973676204681, "grad_norm_var": 0.006209911594283389, "learning_rate": 0.0019384057971014493, "loss": 4.2723, "step": 107 }, { "crossentropy": 4.346632957458496, "epoch": 0.0039153132250580045, "grad_norm": 0.18539611995220184, "grad_norm_var": 0.006332774357837497, "learning_rate": 0.001956521739130435, "loss": 4.269, "step": 108 }, { "crossentropy": 4.093925952911377, "epoch": 0.003951566125290023, "grad_norm": 0.2343578338623047, "grad_norm_var": 0.006325086075005051, "learning_rate": 0.00197463768115942, "loss": 4.1679, "step": 109 }, { "crossentropy": 4.199519157409668, "epoch": 0.003987819025522042, "grad_norm": 0.18168866634368896, "grad_norm_var": 0.006446481660401053, "learning_rate": 0.0019927536231884057, "loss": 4.2237, "step": 110 }, { "crossentropy": 4.276994705200195, "epoch": 0.00402407192575406, "grad_norm": 0.18867121636867523, "grad_norm_var": 0.006364951614534637, "learning_rate": 0.0020108695652173913, "loss": 4.1955, "step": 111 }, { "crossentropy": 4.197327613830566, "epoch": 0.004060324825986079, "grad_norm": 0.18344587087631226, "grad_norm_var": 0.006435878010155311, "learning_rate": 0.002028985507246377, "loss": 4.2488, "step": 112 }, { "crossentropy": 4.165685176849365, "epoch": 0.0040965777262180975, "grad_norm": 0.22770081460475922, "grad_norm_var": 0.0005821373229182413, "learning_rate": 0.002047101449275362, "loss": 4.213, "step": 113 }, { "crossentropy": 4.2665886878967285, "epoch": 0.004132830626450116, "grad_norm": 0.1630914807319641, "grad_norm_var": 0.0006698988577897182, "learning_rate": 0.0020652173913043477, "loss": 4.2736, "step": 114 }, { "crossentropy": 4.189716339111328, "epoch": 0.004169083526682134, "grad_norm": 0.19285452365875244, "grad_norm_var": 0.0006662301415726167, "learning_rate": 0.0020833333333333333, "loss": 4.1975, "step": 115 }, { "crossentropy": 4.314552307128906, "epoch": 0.004205336426914153, "grad_norm": 0.15810039639472961, "grad_norm_var": 0.0007629267918744548, "learning_rate": 0.002101449275362319, "loss": 4.248, "step": 116 }, { "crossentropy": 4.162571430206299, "epoch": 0.004241589327146172, "grad_norm": 0.19287006556987762, "grad_norm_var": 0.0007453312959178865, "learning_rate": 0.0021195652173913045, "loss": 4.1789, "step": 117 }, { "crossentropy": 4.277385234832764, "epoch": 0.0042778422273781905, "grad_norm": 0.26867061853408813, "grad_norm_var": 0.0007732698903499064, "learning_rate": 0.00213768115942029, "loss": 4.2515, "step": 118 }, { "crossentropy": 4.203953266143799, "epoch": 0.004314095127610208, "grad_norm": 0.20968346297740936, "grad_norm_var": 0.0007786741418402077, "learning_rate": 0.0021557971014492757, "loss": 4.2001, "step": 119 }, { "crossentropy": 4.1310343742370605, "epoch": 0.004350348027842227, "grad_norm": 0.15753516554832458, "grad_norm_var": 0.0008780578253356098, "learning_rate": 0.002173913043478261, "loss": 4.1411, "step": 120 }, { "crossentropy": 4.260190963745117, "epoch": 0.004386600928074246, "grad_norm": 0.23268643021583557, "grad_norm_var": 0.0009538906887421937, "learning_rate": 0.0021920289855072465, "loss": 4.1954, "step": 121 }, { "crossentropy": 4.1391119956970215, "epoch": 0.004422853828306265, "grad_norm": 0.1646864116191864, "grad_norm_var": 0.0010179673225342337, "learning_rate": 0.002210144927536232, "loss": 4.1318, "step": 122 }, { "crossentropy": 4.171885013580322, "epoch": 0.004459106728538283, "grad_norm": 0.19414694607257843, "grad_norm_var": 0.0009881024976219936, "learning_rate": 0.0022282608695652173, "loss": 4.1291, "step": 123 }, { "crossentropy": 4.103010654449463, "epoch": 0.0044953596287703014, "grad_norm": 0.15248098969459534, "grad_norm_var": 0.0011022388883821519, "learning_rate": 0.002246376811594203, "loss": 4.1408, "step": 124 }, { "crossentropy": 4.070254325866699, "epoch": 0.00453161252900232, "grad_norm": 0.16964617371559143, "grad_norm_var": 0.0010150307601308592, "learning_rate": 0.0022644927536231885, "loss": 4.1274, "step": 125 }, { "crossentropy": 4.333624362945557, "epoch": 0.004567865429234339, "grad_norm": 0.15654295682907104, "grad_norm_var": 0.0010819882090912067, "learning_rate": 0.0022826086956521737, "loss": 4.2867, "step": 126 }, { "crossentropy": 4.128780841827393, "epoch": 0.004604118329466357, "grad_norm": 0.2005322128534317, "grad_norm_var": 0.0010913666409049514, "learning_rate": 0.0023007246376811593, "loss": 4.1239, "step": 127 }, { "crossentropy": 4.141488552093506, "epoch": 0.004640371229698376, "grad_norm": 0.1619352549314499, "grad_norm_var": 0.0011363364129191292, "learning_rate": 0.002318840579710145, "loss": 4.0772, "step": 128 }, { "crossentropy": 4.153632164001465, "epoch": 0.0046766241299303945, "grad_norm": 0.19959695637226105, "grad_norm_var": 0.0010358018408610577, "learning_rate": 0.0023369565217391305, "loss": 4.1771, "step": 129 }, { "crossentropy": 4.025748252868652, "epoch": 0.004712877030162413, "grad_norm": 0.15153394639492035, "grad_norm_var": 0.0010793619818159064, "learning_rate": 0.002355072463768116, "loss": 4.0351, "step": 130 }, { "crossentropy": 4.08782434463501, "epoch": 0.004749129930394431, "grad_norm": 0.1605081707239151, "grad_norm_var": 0.001111823644068366, "learning_rate": 0.0023731884057971017, "loss": 4.0329, "step": 131 }, { "crossentropy": 4.075578212738037, "epoch": 0.00478538283062645, "grad_norm": 0.15904808044433594, "grad_norm_var": 0.0011087085893355644, "learning_rate": 0.0023913043478260873, "loss": 4.1456, "step": 132 }, { "crossentropy": 4.16206693649292, "epoch": 0.004821635730858469, "grad_norm": 0.16340163350105286, "grad_norm_var": 0.0011252099373366454, "learning_rate": 0.0024094202898550725, "loss": 4.1756, "step": 133 }, { "crossentropy": 3.83982515335083, "epoch": 0.0048578886310904875, "grad_norm": 0.17666038870811462, "grad_norm_var": 0.0005838694658223167, "learning_rate": 0.002427536231884058, "loss": 4.047, "step": 134 }, { "crossentropy": 4.030238628387451, "epoch": 0.004894141531322505, "grad_norm": 0.14967688918113708, "grad_norm_var": 0.0005367338402725334, "learning_rate": 0.0024456521739130437, "loss": 4.0523, "step": 135 }, { "crossentropy": 3.964215040206909, "epoch": 0.004930394431554524, "grad_norm": 0.2575192451477051, "grad_norm_var": 0.0009698520742706199, "learning_rate": 0.0024637681159420293, "loss": 3.937, "step": 136 }, { "crossentropy": 3.954915761947632, "epoch": 0.004966647331786543, "grad_norm": 0.30805814266204834, "grad_norm_var": 0.0018728479253851682, "learning_rate": 0.0024818840579710144, "loss": 4.0309, "step": 137 }, { "crossentropy": 4.20035982131958, "epoch": 0.005002900232018562, "grad_norm": 0.4298364520072937, "grad_norm_var": 0.005623911162291773, "learning_rate": 0.0025, "loss": 4.1447, "step": 138 }, { "crossentropy": 4.079083442687988, "epoch": 0.00503915313225058, "grad_norm": 0.16683976352214813, "grad_norm_var": 0.0056898073012632095, "learning_rate": 0.0025181159420289857, "loss": 4.0465, "step": 139 }, { "crossentropy": 4.118255138397217, "epoch": 0.005075406032482598, "grad_norm": 0.16905726492404938, "grad_norm_var": 0.005606953641013541, "learning_rate": 0.0025362318840579713, "loss": 4.0784, "step": 140 }, { "crossentropy": 4.099374771118164, "epoch": 0.005111658932714617, "grad_norm": 0.1552768498659134, "grad_norm_var": 0.005675665913339678, "learning_rate": 0.002554347826086957, "loss": 4.1105, "step": 141 }, { "crossentropy": 4.033939838409424, "epoch": 0.005147911832946636, "grad_norm": 0.14152942597866058, "grad_norm_var": 0.005772495477899478, "learning_rate": 0.002572463768115942, "loss": 4.1069, "step": 142 }, { "crossentropy": 4.175137996673584, "epoch": 0.005184164733178654, "grad_norm": 0.14986905455589294, "grad_norm_var": 0.005908639610445121, "learning_rate": 0.0025905797101449276, "loss": 4.0626, "step": 143 }, { "crossentropy": 4.11448860168457, "epoch": 0.005220417633410673, "grad_norm": 0.23142489790916443, "grad_norm_var": 0.0059154663206383434, "learning_rate": 0.002608695652173913, "loss": 4.0851, "step": 144 }, { "crossentropy": 4.185144424438477, "epoch": 0.005256670533642691, "grad_norm": 0.14102445542812347, "grad_norm_var": 0.006118312466558939, "learning_rate": 0.0026268115942028984, "loss": 4.0771, "step": 145 }, { "crossentropy": 4.049999713897705, "epoch": 0.00529292343387471, "grad_norm": 0.15550795197486877, "grad_norm_var": 0.006096557552023037, "learning_rate": 0.002644927536231884, "loss": 4.1425, "step": 146 }, { "crossentropy": 4.095916271209717, "epoch": 0.005329176334106729, "grad_norm": 0.23985247313976288, "grad_norm_var": 0.006128278302014763, "learning_rate": 0.0026630434782608696, "loss": 4.1428, "step": 147 }, { "crossentropy": 4.132740020751953, "epoch": 0.005365429234338747, "grad_norm": 0.1335650533437729, "grad_norm_var": 0.006306858341108334, "learning_rate": 0.0026811594202898552, "loss": 4.0429, "step": 148 }, { "crossentropy": 4.0970354080200195, "epoch": 0.005401682134570766, "grad_norm": 0.12883082032203674, "grad_norm_var": 0.006541350698989001, "learning_rate": 0.0026992753623188404, "loss": 4.0221, "step": 149 }, { "crossentropy": 3.9817965030670166, "epoch": 0.005437935034802784, "grad_norm": 0.14436551928520203, "grad_norm_var": 0.006689415793798047, "learning_rate": 0.002717391304347826, "loss": 4.013, "step": 150 }, { "crossentropy": 3.9725284576416016, "epoch": 0.005474187935034803, "grad_norm": 0.14335018396377563, "grad_norm_var": 0.0067292136335818315, "learning_rate": 0.0027355072463768116, "loss": 4.0117, "step": 151 }, { "crossentropy": 4.087752819061279, "epoch": 0.005510440835266821, "grad_norm": 0.15806379914283752, "grad_norm_var": 0.006498406946332731, "learning_rate": 0.0027536231884057972, "loss": 3.9513, "step": 152 }, { "crossentropy": 4.029160022735596, "epoch": 0.00554669373549884, "grad_norm": 0.1340571939945221, "grad_norm_var": 0.0055885689494577905, "learning_rate": 0.002771739130434783, "loss": 4.0412, "step": 153 }, { "crossentropy": 4.172536849975586, "epoch": 0.005582946635730859, "grad_norm": 0.3825332522392273, "grad_norm_var": 0.004129991311362697, "learning_rate": 0.0027898550724637684, "loss": 4.0982, "step": 154 }, { "crossentropy": 3.966825485229492, "epoch": 0.0056191995359628774, "grad_norm": 0.1529775857925415, "grad_norm_var": 0.004154212934349299, "learning_rate": 0.002807971014492754, "loss": 3.9159, "step": 155 }, { "crossentropy": 3.956963539123535, "epoch": 0.005655452436194895, "grad_norm": 0.14481453597545624, "grad_norm_var": 0.004202332733143458, "learning_rate": 0.0028260869565217388, "loss": 3.9576, "step": 156 }, { "crossentropy": 4.0330328941345215, "epoch": 0.005691705336426914, "grad_norm": 0.127924844622612, "grad_norm_var": 0.00430667003296407, "learning_rate": 0.0028442028985507244, "loss": 3.9864, "step": 157 }, { "crossentropy": 3.9622409343719482, "epoch": 0.005727958236658933, "grad_norm": 0.16339094936847687, "grad_norm_var": 0.004255430483140924, "learning_rate": 0.00286231884057971, "loss": 3.9356, "step": 158 }, { "crossentropy": 3.9727988243103027, "epoch": 0.005764211136890952, "grad_norm": 0.1517779529094696, "grad_norm_var": 0.00425035073026786, "learning_rate": 0.0028804347826086956, "loss": 3.9401, "step": 159 }, { "crossentropy": 3.9741997718811035, "epoch": 0.00580046403712297, "grad_norm": 0.14338764548301697, "grad_norm_var": 0.0040236126178641585, "learning_rate": 0.002898550724637681, "loss": 3.9446, "step": 160 }, { "crossentropy": 3.8807437419891357, "epoch": 0.005836716937354988, "grad_norm": 0.15216918289661407, "grad_norm_var": 0.003995244877405583, "learning_rate": 0.002916666666666667, "loss": 3.8691, "step": 161 }, { "crossentropy": 3.7002599239349365, "epoch": 0.005872969837587007, "grad_norm": 0.12238529324531555, "grad_norm_var": 0.004110307929744072, "learning_rate": 0.0029347826086956524, "loss": 3.897, "step": 162 }, { "crossentropy": 4.028830528259277, "epoch": 0.005909222737819026, "grad_norm": 0.12791574001312256, "grad_norm_var": 0.003760815529914532, "learning_rate": 0.0029528985507246376, "loss": 3.9638, "step": 163 }, { "crossentropy": 3.893089532852173, "epoch": 0.005945475638051044, "grad_norm": 0.13388660550117493, "grad_norm_var": 0.0037598185653341166, "learning_rate": 0.002971014492753623, "loss": 3.8896, "step": 164 }, { "crossentropy": 3.9267306327819824, "epoch": 0.005981728538283063, "grad_norm": 0.12726785242557526, "grad_norm_var": 0.003765839381276584, "learning_rate": 0.002989130434782609, "loss": 3.9594, "step": 165 }, { "crossentropy": 3.7026567459106445, "epoch": 0.006017981438515081, "grad_norm": 0.14319948852062225, "grad_norm_var": 0.003767871822057452, "learning_rate": 0.0030072463768115944, "loss": 3.8717, "step": 166 }, { "crossentropy": 3.989921808242798, "epoch": 0.0060542343387471, "grad_norm": 0.19230422377586365, "grad_norm_var": 0.003829739993019236, "learning_rate": 0.00302536231884058, "loss": 4.0329, "step": 167 }, { "crossentropy": 3.971086025238037, "epoch": 0.006090487238979118, "grad_norm": 0.12941381335258484, "grad_norm_var": 0.0038879735370519067, "learning_rate": 0.0030434782608695656, "loss": 4.0157, "step": 168 }, { "crossentropy": 3.920429229736328, "epoch": 0.006126740139211137, "grad_norm": 0.12549394369125366, "grad_norm_var": 0.003919994058483697, "learning_rate": 0.003061594202898551, "loss": 3.9246, "step": 169 }, { "crossentropy": 4.003246307373047, "epoch": 0.006162993039443156, "grad_norm": 0.13404743373394012, "grad_norm_var": 0.0003251383596876295, "learning_rate": 0.003079710144927536, "loss": 3.9434, "step": 170 }, { "crossentropy": 4.018300533294678, "epoch": 0.006199245939675174, "grad_norm": 0.13986100256443024, "grad_norm_var": 0.0003167317330346364, "learning_rate": 0.0030978260869565215, "loss": 4.0356, "step": 171 }, { "crossentropy": 4.130532741546631, "epoch": 0.006235498839907192, "grad_norm": 0.19664044678211212, "grad_norm_var": 0.0005095614352769063, "learning_rate": 0.003115942028985507, "loss": 4.0396, "step": 172 }, { "crossentropy": 3.969748020172119, "epoch": 0.006271751740139211, "grad_norm": 0.12060961127281189, "grad_norm_var": 0.0005290158811005169, "learning_rate": 0.0031340579710144928, "loss": 3.9643, "step": 173 }, { "crossentropy": 4.009191989898682, "epoch": 0.00630800464037123, "grad_norm": 0.1350429803133011, "grad_norm_var": 0.000505890024555225, "learning_rate": 0.0031521739130434784, "loss": 3.9589, "step": 174 }, { "crossentropy": 4.046260833740234, "epoch": 0.006344257540603249, "grad_norm": 0.1254049390554428, "grad_norm_var": 0.0005157257464115994, "learning_rate": 0.003170289855072464, "loss": 4.0222, "step": 175 }, { "crossentropy": 3.9005017280578613, "epoch": 0.0063805104408352666, "grad_norm": 0.11997130513191223, "grad_norm_var": 0.0005411813431619413, "learning_rate": 0.003188405797101449, "loss": 3.8992, "step": 176 }, { "crossentropy": 3.891305446624756, "epoch": 0.006416763341067285, "grad_norm": 0.12412162870168686, "grad_norm_var": 0.0005414766821294472, "learning_rate": 0.0032065217391304347, "loss": 3.8391, "step": 177 }, { "crossentropy": 3.8428845405578613, "epoch": 0.006453016241299304, "grad_norm": 0.12762437760829926, "grad_norm_var": 0.0005327401385756304, "learning_rate": 0.0032246376811594204, "loss": 3.9244, "step": 178 }, { "crossentropy": 3.8995001316070557, "epoch": 0.006489269141531323, "grad_norm": 0.12499173730611801, "grad_norm_var": 0.0005370794456437169, "learning_rate": 0.003242753623188406, "loss": 3.8446, "step": 179 }, { "crossentropy": 3.8773326873779297, "epoch": 0.006525522041763341, "grad_norm": 0.10947594046592712, "grad_norm_var": 0.0005860585681819898, "learning_rate": 0.0032608695652173916, "loss": 3.9424, "step": 180 }, { "crossentropy": 3.9526400566101074, "epoch": 0.00656177494199536, "grad_norm": 2.0911166667938232, "grad_norm_var": 0.23935212337492134, "learning_rate": 0.003278985507246377, "loss": 3.8799, "step": 181 }, { "crossentropy": 3.9272005558013916, "epoch": 0.006598027842227378, "grad_norm": 0.12492311745882034, "grad_norm_var": 0.23965447555054065, "learning_rate": 0.0032971014492753628, "loss": 3.998, "step": 182 }, { "crossentropy": 3.8371901512145996, "epoch": 0.006634280742459397, "grad_norm": 0.11276186257600784, "grad_norm_var": 0.24074204718332295, "learning_rate": 0.0033152173913043475, "loss": 3.8767, "step": 183 }, { "crossentropy": 4.049253940582275, "epoch": 0.006670533642691415, "grad_norm": 0.15798459947109222, "grad_norm_var": 0.24032381875507175, "learning_rate": 0.003333333333333333, "loss": 3.9585, "step": 184 }, { "crossentropy": 3.964411735534668, "epoch": 0.006706786542923434, "grad_norm": 0.1152317076921463, "grad_norm_var": 0.24050675467933633, "learning_rate": 0.0033514492753623187, "loss": 3.9529, "step": 185 }, { "crossentropy": 3.870490312576294, "epoch": 0.006743039443155453, "grad_norm": 0.13026714324951172, "grad_norm_var": 0.2405679765655062, "learning_rate": 0.0033695652173913043, "loss": 3.9013, "step": 186 }, { "crossentropy": 3.914052963256836, "epoch": 0.006779292343387471, "grad_norm": 0.1108328253030777, "grad_norm_var": 0.2410604793504315, "learning_rate": 0.00338768115942029, "loss": 3.8871, "step": 187 }, { "crossentropy": 3.8270764350891113, "epoch": 0.006815545243619489, "grad_norm": 0.10842662304639816, "grad_norm_var": 0.24219428972934362, "learning_rate": 0.0034057971014492755, "loss": 3.861, "step": 188 }, { "crossentropy": 3.9902610778808594, "epoch": 0.006851798143851508, "grad_norm": 0.10389725863933563, "grad_norm_var": 0.2424915434226083, "learning_rate": 0.003423913043478261, "loss": 3.9037, "step": 189 }, { "crossentropy": 3.950812816619873, "epoch": 0.006888051044083527, "grad_norm": 0.10747572779655457, "grad_norm_var": 0.242943678852275, "learning_rate": 0.0034420289855072463, "loss": 3.8941, "step": 190 }, { "crossentropy": 3.849165916442871, "epoch": 0.006924303944315546, "grad_norm": 0.1339205801486969, "grad_norm_var": 0.24281422967347846, "learning_rate": 0.003460144927536232, "loss": 3.8772, "step": 191 }, { "crossentropy": 3.9105582237243652, "epoch": 0.0069605568445475635, "grad_norm": 0.15055333077907562, "grad_norm_var": 0.24236719257426556, "learning_rate": 0.0034782608695652175, "loss": 3.9604, "step": 192 }, { "crossentropy": 3.7582380771636963, "epoch": 0.006996809744779582, "grad_norm": 0.11019210517406464, "grad_norm_var": 0.2426054025742792, "learning_rate": 0.003496376811594203, "loss": 3.8364, "step": 193 }, { "crossentropy": 3.8108556270599365, "epoch": 0.007033062645011601, "grad_norm": 0.10770846903324127, "grad_norm_var": 0.24294182457085267, "learning_rate": 0.0035144927536231887, "loss": 3.8776, "step": 194 }, { "crossentropy": 3.8759782314300537, "epoch": 0.00706931554524362, "grad_norm": 0.1188572347164154, "grad_norm_var": 0.24304130067655128, "learning_rate": 0.0035326086956521743, "loss": 3.8853, "step": 195 }, { "crossentropy": 3.863194227218628, "epoch": 0.007105568445475638, "grad_norm": 0.16394786536693573, "grad_norm_var": 0.2422544216246225, "learning_rate": 0.0035507246376811595, "loss": 3.8228, "step": 196 }, { "crossentropy": 3.807340383529663, "epoch": 0.0071418213457076565, "grad_norm": 0.17098696529865265, "grad_norm_var": 0.0004973427012980108, "learning_rate": 0.0035688405797101447, "loss": 3.7953, "step": 197 }, { "crossentropy": 3.9336514472961426, "epoch": 0.007178074245939675, "grad_norm": 0.12109386920928955, "grad_norm_var": 0.0004991908528093308, "learning_rate": 0.0035869565217391303, "loss": 3.8595, "step": 198 }, { "crossentropy": 3.9519968032836914, "epoch": 0.007214327146171694, "grad_norm": 0.11440156400203705, "grad_norm_var": 0.0004963534778584945, "learning_rate": 0.003605072463768116, "loss": 3.9504, "step": 199 }, { "crossentropy": 3.8658361434936523, "epoch": 0.007250580046403712, "grad_norm": 0.10948830097913742, "grad_norm_var": 0.00044048029266016513, "learning_rate": 0.0036231884057971015, "loss": 3.8712, "step": 200 }, { "crossentropy": 3.8783185482025146, "epoch": 0.007286832946635731, "grad_norm": 0.10578801482915878, "grad_norm_var": 0.00045656620247810714, "learning_rate": 0.003641304347826087, "loss": 3.8811, "step": 201 }, { "crossentropy": 3.9218430519104004, "epoch": 0.0073230858468677495, "grad_norm": 0.10838513821363449, "grad_norm_var": 0.00046526040144656555, "learning_rate": 0.0036594202898550727, "loss": 3.923, "step": 202 }, { "crossentropy": 3.6917262077331543, "epoch": 0.007359338747099768, "grad_norm": 0.1460334062576294, "grad_norm_var": 0.0004920637927248903, "learning_rate": 0.0036775362318840583, "loss": 3.7772, "step": 203 }, { "crossentropy": 3.8585119247436523, "epoch": 0.007395591647331786, "grad_norm": 0.16977597773075104, "grad_norm_var": 0.0006013626145834929, "learning_rate": 0.0036956521739130435, "loss": 3.8158, "step": 204 }, { "crossentropy": 3.767824649810791, "epoch": 0.007431844547563805, "grad_norm": 0.14241966605186462, "grad_norm_var": 0.0005720754361089724, "learning_rate": 0.003713768115942029, "loss": 3.7524, "step": 205 }, { "crossentropy": 4.037430286407471, "epoch": 0.007468097447795824, "grad_norm": 0.1540265828371048, "grad_norm_var": 0.0005673097310174418, "learning_rate": 0.0037318840579710147, "loss": 3.9262, "step": 206 }, { "crossentropy": 3.8997836112976074, "epoch": 0.0075043503480278426, "grad_norm": 0.11046750098466873, "grad_norm_var": 0.0005987266710299209, "learning_rate": 0.00375, "loss": 3.7878, "step": 207 }, { "crossentropy": 3.8929333686828613, "epoch": 0.0075406032482598605, "grad_norm": 0.3815559148788452, "grad_norm_var": 0.004520470403804, "learning_rate": 0.0037681159420289855, "loss": 3.9187, "step": 208 }, { "crossentropy": 3.9014816284179688, "epoch": 0.007576856148491879, "grad_norm": 0.12198270112276077, "grad_norm_var": 0.004472951804178715, "learning_rate": 0.003786231884057971, "loss": 3.822, "step": 209 }, { "crossentropy": 3.7720913887023926, "epoch": 0.007613109048723898, "grad_norm": 0.1224663108587265, "grad_norm_var": 0.004409874346619086, "learning_rate": 0.0038043478260869567, "loss": 3.7495, "step": 210 }, { "crossentropy": 3.826660633087158, "epoch": 0.007649361948955917, "grad_norm": 0.10170865803956985, "grad_norm_var": 0.0044939846259054155, "learning_rate": 0.003822463768115942, "loss": 3.7999, "step": 211 }, { "crossentropy": 3.710343599319458, "epoch": 0.007685614849187935, "grad_norm": 0.3600614666938782, "grad_norm_var": 0.007353140213199534, "learning_rate": 0.0038405797101449275, "loss": 3.8202, "step": 212 }, { "crossentropy": 3.7846574783325195, "epoch": 0.0077218677494199535, "grad_norm": 0.14725592732429504, "grad_norm_var": 0.007349745365214774, "learning_rate": 0.003858695652173913, "loss": 3.7774, "step": 213 }, { "crossentropy": 3.8702969551086426, "epoch": 0.007758120649651972, "grad_norm": 0.12480543553829193, "grad_norm_var": 0.007332685387344416, "learning_rate": 0.0038768115942028987, "loss": 3.9214, "step": 214 }, { "crossentropy": 3.8021974563598633, "epoch": 0.007794373549883991, "grad_norm": 0.0962606891989708, "grad_norm_var": 0.007457593489378411, "learning_rate": 0.0038949275362318843, "loss": 3.8312, "step": 215 }, { "crossentropy": 3.727283477783203, "epoch": 0.007830626450116009, "grad_norm": 0.10178136825561523, "grad_norm_var": 0.007509517077828683, "learning_rate": 0.00391304347826087, "loss": 3.7961, "step": 216 }, { "crossentropy": 3.7752161026000977, "epoch": 0.007866879350348029, "grad_norm": 0.15109629929065704, "grad_norm_var": 0.007334946376682745, "learning_rate": 0.003931159420289855, "loss": 3.7955, "step": 217 }, { "crossentropy": 3.7960970401763916, "epoch": 0.007903132250580047, "grad_norm": 0.10729534178972244, "grad_norm_var": 0.0073423396855357296, "learning_rate": 0.00394927536231884, "loss": 3.8119, "step": 218 }, { "crossentropy": 3.781132221221924, "epoch": 0.007939385150812064, "grad_norm": 0.11219000071287155, "grad_norm_var": 0.007471024803979165, "learning_rate": 0.003967391304347826, "loss": 3.7771, "step": 219 }, { "crossentropy": 3.8018200397491455, "epoch": 0.007975638051044084, "grad_norm": 0.098166324198246, "grad_norm_var": 0.007665448951319973, "learning_rate": 0.003985507246376811, "loss": 3.7839, "step": 220 }, { "crossentropy": 3.722001314163208, "epoch": 0.008011890951276102, "grad_norm": 0.10663153976202011, "grad_norm_var": 0.007791672622398563, "learning_rate": 0.0040036231884057975, "loss": 3.8125, "step": 221 }, { "crossentropy": 3.7224504947662354, "epoch": 0.00804814385150812, "grad_norm": 0.1491197794675827, "grad_norm_var": 0.007790451145721943, "learning_rate": 0.004021739130434783, "loss": 3.7954, "step": 222 }, { "crossentropy": 3.9419665336608887, "epoch": 0.00808439675174014, "grad_norm": 0.11602369695901871, "grad_norm_var": 0.0077634251708109435, "learning_rate": 0.004039855072463769, "loss": 3.9233, "step": 223 }, { "crossentropy": 3.8566808700561523, "epoch": 0.008120649651972157, "grad_norm": 0.11103367060422897, "grad_norm_var": 0.003981577506969769, "learning_rate": 0.004057971014492754, "loss": 3.8335, "step": 224 }, { "crossentropy": 3.8045542240142822, "epoch": 0.008156902552204177, "grad_norm": 0.09679228812456131, "grad_norm_var": 0.004058215998549064, "learning_rate": 0.004076086956521739, "loss": 3.8051, "step": 225 }, { "crossentropy": 3.771785020828247, "epoch": 0.008193155452436195, "grad_norm": 0.09342307597398758, "grad_norm_var": 0.0041456003360655505, "learning_rate": 0.004094202898550724, "loss": 3.7861, "step": 226 }, { "crossentropy": 3.7887086868286133, "epoch": 0.008229408352668213, "grad_norm": 0.09782678633928299, "grad_norm_var": 0.004160979699410875, "learning_rate": 0.00411231884057971, "loss": 3.7878, "step": 227 }, { "crossentropy": 3.8231449127197266, "epoch": 0.008265661252900233, "grad_norm": 0.10287096351385117, "grad_norm_var": 0.00038394211761093566, "learning_rate": 0.004130434782608695, "loss": 3.7895, "step": 228 }, { "crossentropy": 3.840770721435547, "epoch": 0.00830191415313225, "grad_norm": 0.09465685486793518, "grad_norm_var": 0.00031861906358961997, "learning_rate": 0.0041485507246376814, "loss": 3.7587, "step": 229 }, { "crossentropy": 3.7733914852142334, "epoch": 0.008338167053364268, "grad_norm": 0.0985942929983139, "grad_norm_var": 0.0003098100916164616, "learning_rate": 0.004166666666666667, "loss": 3.8135, "step": 230 }, { "crossentropy": 3.9017512798309326, "epoch": 0.008374419953596288, "grad_norm": 0.12222007662057877, "grad_norm_var": 0.00031004880133412085, "learning_rate": 0.004184782608695653, "loss": 3.8855, "step": 231 }, { "crossentropy": 3.7618207931518555, "epoch": 0.008410672853828306, "grad_norm": 0.11357433348894119, "grad_norm_var": 0.0003058452747311489, "learning_rate": 0.004202898550724638, "loss": 3.7027, "step": 232 }, { "crossentropy": 3.554384708404541, "epoch": 0.008446925754060326, "grad_norm": 0.11928927153348923, "grad_norm_var": 0.0001978411309582611, "learning_rate": 0.004221014492753623, "loss": 3.6496, "step": 233 }, { "crossentropy": 3.608700752258301, "epoch": 0.008483178654292343, "grad_norm": 0.10009418427944183, "grad_norm_var": 0.0002024613641908979, "learning_rate": 0.004239130434782609, "loss": 3.7201, "step": 234 }, { "crossentropy": 3.7638583183288574, "epoch": 0.008519431554524361, "grad_norm": 0.10031793266534805, "grad_norm_var": 0.00020508386824758622, "learning_rate": 0.004257246376811594, "loss": 3.7368, "step": 235 }, { "crossentropy": 4.0352277755737305, "epoch": 0.008555684454756381, "grad_norm": 0.09876122325658798, "grad_norm_var": 0.0002043624929490099, "learning_rate": 0.00427536231884058, "loss": 3.8823, "step": 236 }, { "crossentropy": 3.6788406372070312, "epoch": 0.008591937354988399, "grad_norm": 0.09182939678430557, "grad_norm_var": 0.0002199221857476739, "learning_rate": 0.004293478260869565, "loss": 3.763, "step": 237 }, { "crossentropy": 3.7194344997406006, "epoch": 0.008628190255220417, "grad_norm": 0.09673842787742615, "grad_norm_var": 9.48056140393542e-05, "learning_rate": 0.0043115942028985514, "loss": 3.7924, "step": 238 }, { "crossentropy": 3.7707536220550537, "epoch": 0.008664443155452436, "grad_norm": 0.11895768344402313, "grad_norm_var": 0.00010029064245995456, "learning_rate": 0.004329710144927536, "loss": 3.7447, "step": 239 }, { "crossentropy": 3.697831392288208, "epoch": 0.008700696055684454, "grad_norm": 0.10440103709697723, "grad_norm_var": 9.643191358144805e-05, "learning_rate": 0.004347826086956522, "loss": 3.6677, "step": 240 }, { "crossentropy": 3.721008539199829, "epoch": 0.008736948955916474, "grad_norm": 0.12074580788612366, "grad_norm_var": 0.00011199774497163487, "learning_rate": 0.004365942028985507, "loss": 3.7566, "step": 241 }, { "crossentropy": 3.7749555110931396, "epoch": 0.008773201856148492, "grad_norm": 0.15432053804397583, "grad_norm_var": 0.0002526702898626656, "learning_rate": 0.004384057971014493, "loss": 3.802, "step": 242 }, { "crossentropy": 3.9055933952331543, "epoch": 0.00880945475638051, "grad_norm": 0.09381218999624252, "grad_norm_var": 0.0002593639501903907, "learning_rate": 0.004402173913043478, "loss": 3.8249, "step": 243 }, { "crossentropy": 3.726924419403076, "epoch": 0.00884570765661253, "grad_norm": 0.4013107419013977, "grad_norm_var": 0.005613994179421703, "learning_rate": 0.004420289855072464, "loss": 3.7562, "step": 244 }, { "crossentropy": 3.5893056392669678, "epoch": 0.008881960556844547, "grad_norm": 0.0954953134059906, "grad_norm_var": 0.005610438934422006, "learning_rate": 0.004438405797101449, "loss": 3.6457, "step": 245 }, { "crossentropy": 3.881089925765991, "epoch": 0.008918213457076565, "grad_norm": 0.12080498784780502, "grad_norm_var": 0.005557434304630403, "learning_rate": 0.0044565217391304346, "loss": 3.8036, "step": 246 }, { "crossentropy": 3.67484974861145, "epoch": 0.008954466357308585, "grad_norm": 0.10654780268669128, "grad_norm_var": 0.005585473827826313, "learning_rate": 0.004474637681159421, "loss": 3.6924, "step": 247 }, { "crossentropy": 3.7291243076324463, "epoch": 0.008990719257540603, "grad_norm": 0.2659108638763428, "grad_norm_var": 0.006756830593564212, "learning_rate": 0.004492753623188406, "loss": 3.6662, "step": 248 }, { "crossentropy": 3.677609920501709, "epoch": 0.009026972157772623, "grad_norm": 0.08992792665958405, "grad_norm_var": 0.00687939442488555, "learning_rate": 0.004510869565217392, "loss": 3.6923, "step": 249 }, { "crossentropy": 3.7748990058898926, "epoch": 0.00906322505800464, "grad_norm": 0.0984487533569336, "grad_norm_var": 0.006887221326248134, "learning_rate": 0.004528985507246377, "loss": 3.7545, "step": 250 }, { "crossentropy": 3.7218031883239746, "epoch": 0.009099477958236658, "grad_norm": 0.09538257122039795, "grad_norm_var": 0.0069114975020107775, "learning_rate": 0.004547101449275363, "loss": 3.7485, "step": 251 }, { "crossentropy": 3.8567285537719727, "epoch": 0.009135730858468678, "grad_norm": 0.09093115478754044, "grad_norm_var": 0.006952732026321531, "learning_rate": 0.004565217391304347, "loss": 3.8006, "step": 252 }, { "crossentropy": 3.6038882732391357, "epoch": 0.009171983758700696, "grad_norm": 0.10982728004455566, "grad_norm_var": 0.006871544966190268, "learning_rate": 0.004583333333333333, "loss": 3.6505, "step": 253 }, { "crossentropy": 3.7393958568573, "epoch": 0.009208236658932714, "grad_norm": 0.09961426258087158, "grad_norm_var": 0.006857305283266278, "learning_rate": 0.0046014492753623185, "loss": 3.7755, "step": 254 }, { "crossentropy": 4.047731876373291, "epoch": 0.009244489559164733, "grad_norm": 0.09465339779853821, "grad_norm_var": 0.006947514309806326, "learning_rate": 0.004619565217391305, "loss": 3.9071, "step": 255 }, { "crossentropy": 3.87233567237854, "epoch": 0.009280742459396751, "grad_norm": 0.09261798113584518, "grad_norm_var": 0.007002510835391122, "learning_rate": 0.00463768115942029, "loss": 3.8259, "step": 256 }, { "crossentropy": 3.6928374767303467, "epoch": 0.009316995359628771, "grad_norm": 0.095506951212883, "grad_norm_var": 0.007084055493228687, "learning_rate": 0.004655797101449276, "loss": 3.7259, "step": 257 }, { "crossentropy": 3.7626352310180664, "epoch": 0.009353248259860789, "grad_norm": 0.10615887492895126, "grad_norm_var": 0.0070829303489694405, "learning_rate": 0.004673913043478261, "loss": 3.7606, "step": 258 }, { "crossentropy": 3.846984624862671, "epoch": 0.009389501160092807, "grad_norm": 0.09178503602743149, "grad_norm_var": 0.007092578920539039, "learning_rate": 0.004692028985507246, "loss": 3.7915, "step": 259 }, { "crossentropy": 3.7018728256225586, "epoch": 0.009425754060324826, "grad_norm": 0.08825083822011948, "grad_norm_var": 0.001827697399233604, "learning_rate": 0.004710144927536232, "loss": 3.7686, "step": 260 }, { "crossentropy": 3.785767078399658, "epoch": 0.009462006960556844, "grad_norm": 0.09740439057350159, "grad_norm_var": 0.001824521635006989, "learning_rate": 0.004728260869565217, "loss": 3.7972, "step": 261 }, { "crossentropy": 3.653217077255249, "epoch": 0.009498259860788862, "grad_norm": 0.09755682945251465, "grad_norm_var": 0.001821664912028098, "learning_rate": 0.004746376811594203, "loss": 3.7113, "step": 262 }, { "crossentropy": 3.6510825157165527, "epoch": 0.009534512761020882, "grad_norm": 0.15645483136177063, "grad_norm_var": 0.001970779895276157, "learning_rate": 0.0047644927536231885, "loss": 3.7648, "step": 263 }, { "crossentropy": 3.75962495803833, "epoch": 0.0095707656612529, "grad_norm": 0.11550380289554596, "grad_norm_var": 0.00027106865277146466, "learning_rate": 0.004782608695652175, "loss": 3.7658, "step": 264 }, { "crossentropy": 3.798236131668091, "epoch": 0.00960701856148492, "grad_norm": 0.1771736592054367, "grad_norm_var": 0.00061508216664539, "learning_rate": 0.00480072463768116, "loss": 3.7051, "step": 265 }, { "crossentropy": 3.693713665008545, "epoch": 0.009643271461716937, "grad_norm": 0.30801525712013245, "grad_norm_var": 0.0031292833264648283, "learning_rate": 0.004818840579710145, "loss": 3.6766, "step": 266 }, { "crossentropy": 3.8539373874664307, "epoch": 0.009679524361948955, "grad_norm": 0.09204233437776566, "grad_norm_var": 0.003140856349585079, "learning_rate": 0.00483695652173913, "loss": 3.8822, "step": 267 }, { "crossentropy": 3.7352824211120605, "epoch": 0.009715777262180975, "grad_norm": 0.18971386551856995, "grad_norm_var": 0.0033732195658377796, "learning_rate": 0.004855072463768116, "loss": 3.827, "step": 268 }, { "crossentropy": 3.699345350265503, "epoch": 0.009752030162412993, "grad_norm": 0.097272127866745, "grad_norm_var": 0.003409755765224257, "learning_rate": 0.004873188405797101, "loss": 3.7415, "step": 269 }, { "crossentropy": 3.8160760402679443, "epoch": 0.00978828306264501, "grad_norm": 0.09786469489336014, "grad_norm_var": 0.003415864934785969, "learning_rate": 0.004891304347826087, "loss": 3.8085, "step": 270 }, { "crossentropy": 3.7390012741088867, "epoch": 0.00982453596287703, "grad_norm": 0.08684591948986053, "grad_norm_var": 0.0034511336952701395, "learning_rate": 0.0049094202898550725, "loss": 3.6551, "step": 271 }, { "crossentropy": 3.7313177585601807, "epoch": 0.009860788863109048, "grad_norm": 0.1039125993847847, "grad_norm_var": 0.0034112665109301814, "learning_rate": 0.0049275362318840586, "loss": 3.7243, "step": 272 }, { "crossentropy": 3.7668914794921875, "epoch": 0.009897041763341068, "grad_norm": 0.08746562153100967, "grad_norm_var": 0.0034470276955599695, "learning_rate": 0.004945652173913044, "loss": 3.757, "step": 273 }, { "crossentropy": 3.6201181411743164, "epoch": 0.009933294663573086, "grad_norm": 0.09287498891353607, "grad_norm_var": 0.0034906993375095684, "learning_rate": 0.004963768115942029, "loss": 3.6538, "step": 274 }, { "crossentropy": 3.5869853496551514, "epoch": 0.009969547563805104, "grad_norm": 0.0874985083937645, "grad_norm_var": 0.0035101217791687245, "learning_rate": 0.004981884057971015, "loss": 3.5946, "step": 275 }, { "crossentropy": 3.7347805500030518, "epoch": 0.010005800464037123, "grad_norm": 0.09630752354860306, "grad_norm_var": 0.0034763231974996906, "learning_rate": 0.005, "loss": 3.6537, "step": 276 }, { "crossentropy": 3.692316770553589, "epoch": 0.010042053364269141, "grad_norm": 0.09478503465652466, "grad_norm_var": 0.003486038429517261, "learning_rate": 0.005018115942028985, "loss": 3.7821, "step": 277 }, { "crossentropy": 3.586575508117676, "epoch": 0.01007830626450116, "grad_norm": 0.10935340076684952, "grad_norm_var": 0.0034534106819588367, "learning_rate": 0.005036231884057971, "loss": 3.5884, "step": 278 }, { "crossentropy": 3.5880093574523926, "epoch": 0.010114559164733179, "grad_norm": 0.1617310643196106, "grad_norm_var": 0.003477583078407964, "learning_rate": 0.0050543478260869565, "loss": 3.6001, "step": 279 }, { "crossentropy": 3.724146842956543, "epoch": 0.010150812064965197, "grad_norm": 0.136312797665596, "grad_norm_var": 0.0034785832808145473, "learning_rate": 0.0050724637681159425, "loss": 3.7107, "step": 280 }, { "crossentropy": 3.6408567428588867, "epoch": 0.010187064965197216, "grad_norm": 0.09634851664304733, "grad_norm_var": 0.003337529545250416, "learning_rate": 0.005090579710144928, "loss": 3.6622, "step": 281 }, { "crossentropy": 3.6463589668273926, "epoch": 0.010223317865429234, "grad_norm": 0.15904048085212708, "grad_norm_var": 0.0010127917718960441, "learning_rate": 0.005108695652173914, "loss": 3.6768, "step": 282 }, { "crossentropy": 3.758164644241333, "epoch": 0.010259570765661252, "grad_norm": 0.09502509981393814, "grad_norm_var": 0.0010054760080354505, "learning_rate": 0.005126811594202898, "loss": 3.7579, "step": 283 }, { "crossentropy": 3.599891424179077, "epoch": 0.010295823665893272, "grad_norm": 0.08049993216991425, "grad_norm_var": 0.0006196185363509372, "learning_rate": 0.005144927536231884, "loss": 3.685, "step": 284 }, { "crossentropy": 3.6804182529449463, "epoch": 0.01033207656612529, "grad_norm": 0.08257720619440079, "grad_norm_var": 0.0006486405385696112, "learning_rate": 0.005163043478260869, "loss": 3.6952, "step": 285 }, { "crossentropy": 3.5879037380218506, "epoch": 0.010368329466357308, "grad_norm": 0.08105617016553879, "grad_norm_var": 0.0006806708922018657, "learning_rate": 0.005181159420289855, "loss": 3.5749, "step": 286 }, { "crossentropy": 3.7273125648498535, "epoch": 0.010404582366589327, "grad_norm": 0.08560299128293991, "grad_norm_var": 0.0006834822105497832, "learning_rate": 0.0051992753623188405, "loss": 3.7184, "step": 287 }, { "crossentropy": 3.4162917137145996, "epoch": 0.010440835266821345, "grad_norm": 0.08285696804523468, "grad_norm_var": 0.0007090485864911305, "learning_rate": 0.005217391304347826, "loss": 3.5747, "step": 288 }, { "crossentropy": 3.6779584884643555, "epoch": 0.010477088167053365, "grad_norm": 0.09379025548696518, "grad_norm_var": 0.0006994324229799544, "learning_rate": 0.005235507246376812, "loss": 3.8064, "step": 289 }, { "crossentropy": 3.7528462409973145, "epoch": 0.010513341067285383, "grad_norm": 0.13249264657497406, "grad_norm_var": 0.0007481196573004535, "learning_rate": 0.005253623188405797, "loss": 3.7703, "step": 290 }, { "crossentropy": 3.806748867034912, "epoch": 0.0105495939675174, "grad_norm": 0.10220588743686676, "grad_norm_var": 0.0007278973641598935, "learning_rate": 0.005271739130434783, "loss": 3.7082, "step": 291 }, { "crossentropy": 3.7747087478637695, "epoch": 0.01058584686774942, "grad_norm": 0.0876772329211235, "grad_norm_var": 0.000743273145934531, "learning_rate": 0.005289855072463768, "loss": 3.6841, "step": 292 }, { "crossentropy": 3.5968334674835205, "epoch": 0.010622099767981438, "grad_norm": 0.08927058428525925, "grad_norm_var": 0.0007527466733532715, "learning_rate": 0.005307971014492754, "loss": 3.6586, "step": 293 }, { "crossentropy": 3.520001173019409, "epoch": 0.010658352668213458, "grad_norm": 0.08280796557664871, "grad_norm_var": 0.0007804595737483967, "learning_rate": 0.005326086956521739, "loss": 3.5776, "step": 294 }, { "crossentropy": 3.652935266494751, "epoch": 0.010694605568445476, "grad_norm": 0.08602146804332733, "grad_norm_var": 0.000546656190566517, "learning_rate": 0.005344202898550725, "loss": 3.6058, "step": 295 }, { "crossentropy": 3.6589183807373047, "epoch": 0.010730858468677494, "grad_norm": 0.08920534700155258, "grad_norm_var": 0.00044690118628995984, "learning_rate": 0.0053623188405797105, "loss": 3.6772, "step": 296 }, { "crossentropy": 3.647825241088867, "epoch": 0.010767111368909513, "grad_norm": 0.09780922532081604, "grad_norm_var": 0.00044721831623348285, "learning_rate": 0.005380434782608696, "loss": 3.6729, "step": 297 }, { "crossentropy": 3.7091314792633057, "epoch": 0.010803364269141531, "grad_norm": 0.0831354632973671, "grad_norm_var": 0.00016420607824835972, "learning_rate": 0.005398550724637681, "loss": 3.6214, "step": 298 }, { "crossentropy": 3.6160733699798584, "epoch": 0.01083961716937355, "grad_norm": 0.09303738176822662, "grad_norm_var": 0.00016332056198302669, "learning_rate": 0.005416666666666666, "loss": 3.6238, "step": 299 }, { "crossentropy": 3.6471571922302246, "epoch": 0.010875870069605569, "grad_norm": 0.08128952234983444, "grad_norm_var": 0.00016229326643940148, "learning_rate": 0.005434782608695652, "loss": 3.686, "step": 300 }, { "crossentropy": 3.6119513511657715, "epoch": 0.010912122969837587, "grad_norm": 0.08999276161193848, "grad_norm_var": 0.00015772130764824509, "learning_rate": 0.005452898550724637, "loss": 3.6168, "step": 301 }, { "crossentropy": 3.594571352005005, "epoch": 0.010948375870069606, "grad_norm": 0.10055975615978241, "grad_norm_var": 0.00015527096591624323, "learning_rate": 0.005471014492753623, "loss": 3.5855, "step": 302 }, { "crossentropy": 3.701869487762451, "epoch": 0.010984628770301624, "grad_norm": 0.08124972879886627, "grad_norm_var": 0.00016037723634617986, "learning_rate": 0.005489130434782608, "loss": 3.6783, "step": 303 }, { "crossentropy": 3.5412774085998535, "epoch": 0.011020881670533642, "grad_norm": 0.09082558006048203, "grad_norm_var": 0.00015453849381999636, "learning_rate": 0.0055072463768115944, "loss": 3.4667, "step": 304 }, { "crossentropy": 3.62601375579834, "epoch": 0.011057134570765662, "grad_norm": 0.08573874831199646, "grad_norm_var": 0.00015729700879810714, "learning_rate": 0.00552536231884058, "loss": 3.659, "step": 305 }, { "crossentropy": 3.7195749282836914, "epoch": 0.01109338747099768, "grad_norm": 0.08978822082281113, "grad_norm_var": 4.118372765484675e-05, "learning_rate": 0.005543478260869566, "loss": 3.6424, "step": 306 }, { "crossentropy": 3.6707708835601807, "epoch": 0.011129640371229698, "grad_norm": 0.15931512415409088, "grad_norm_var": 0.000342434285350345, "learning_rate": 0.005561594202898551, "loss": 3.5749, "step": 307 }, { "crossentropy": 3.6112711429595947, "epoch": 0.011165893271461717, "grad_norm": 0.07927972823381424, "grad_norm_var": 0.00035278208733123447, "learning_rate": 0.005579710144927537, "loss": 3.5923, "step": 308 }, { "crossentropy": 3.6725125312805176, "epoch": 0.011202146171693735, "grad_norm": 0.16594186425209045, "grad_norm_var": 0.0006876038690088623, "learning_rate": 0.005597826086956522, "loss": 3.6086, "step": 309 }, { "crossentropy": 3.7567732334136963, "epoch": 0.011238399071925755, "grad_norm": 0.08022478967905045, "grad_norm_var": 0.0006929950487188213, "learning_rate": 0.005615942028985508, "loss": 3.6779, "step": 310 }, { "crossentropy": 3.578857898712158, "epoch": 0.011274651972157773, "grad_norm": 0.07549439370632172, "grad_norm_var": 0.0007154549371684617, "learning_rate": 0.005634057971014492, "loss": 3.5818, "step": 311 }, { "crossentropy": 3.706106185913086, "epoch": 0.01131090487238979, "grad_norm": 0.0867394283413887, "grad_norm_var": 0.0007182105288030942, "learning_rate": 0.0056521739130434775, "loss": 3.66, "step": 312 }, { "crossentropy": 3.6831212043762207, "epoch": 0.01134715777262181, "grad_norm": 0.09091100841760635, "grad_norm_var": 0.0007197747419392423, "learning_rate": 0.005670289855072464, "loss": 3.6803, "step": 313 }, { "crossentropy": 3.5825576782226562, "epoch": 0.011383410672853828, "grad_norm": 0.07975761592388153, "grad_norm_var": 0.0007262120729529621, "learning_rate": 0.005688405797101449, "loss": 3.6171, "step": 314 }, { "crossentropy": 3.718885660171509, "epoch": 0.011419663573085846, "grad_norm": 0.09458688646554947, "grad_norm_var": 0.0007258256489612683, "learning_rate": 0.005706521739130435, "loss": 3.6624, "step": 315 }, { "crossentropy": 3.6223480701446533, "epoch": 0.011455916473317866, "grad_norm": 0.0804867297410965, "grad_norm_var": 0.0007274117246074516, "learning_rate": 0.00572463768115942, "loss": 3.633, "step": 316 }, { "crossentropy": 3.7660703659057617, "epoch": 0.011492169373549884, "grad_norm": 0.10181817412376404, "grad_norm_var": 0.0007271833379469514, "learning_rate": 0.005742753623188406, "loss": 3.7297, "step": 317 }, { "crossentropy": 3.6294260025024414, "epoch": 0.011528422273781903, "grad_norm": 0.0766710638999939, "grad_norm_var": 0.0007496639651073997, "learning_rate": 0.005760869565217391, "loss": 3.6151, "step": 318 }, { "crossentropy": 3.6058335304260254, "epoch": 0.011564675174013921, "grad_norm": 0.08670145273208618, "grad_norm_var": 0.0007415797175073844, "learning_rate": 0.005778985507246377, "loss": 3.6058, "step": 319 }, { "crossentropy": 3.5273187160491943, "epoch": 0.01160092807424594, "grad_norm": 0.07360902428627014, "grad_norm_var": 0.0007703020539196248, "learning_rate": 0.005797101449275362, "loss": 3.4997, "step": 320 }, { "crossentropy": 3.5800018310546875, "epoch": 0.011637180974477959, "grad_norm": 0.08406592905521393, "grad_norm_var": 0.0007723622761539142, "learning_rate": 0.005815217391304348, "loss": 3.5417, "step": 321 }, { "crossentropy": 3.6124157905578613, "epoch": 0.011673433874709977, "grad_norm": 0.08863270282745361, "grad_norm_var": 0.0007731080309599567, "learning_rate": 0.005833333333333334, "loss": 3.6199, "step": 322 }, { "crossentropy": 3.6198415756225586, "epoch": 0.011709686774941995, "grad_norm": 0.08412732183933258, "grad_norm_var": 0.0004717944462220659, "learning_rate": 0.00585144927536232, "loss": 3.6084, "step": 323 }, { "crossentropy": 3.561368465423584, "epoch": 0.011745939675174014, "grad_norm": 0.07847727090120316, "grad_norm_var": 0.0004729084635775147, "learning_rate": 0.005869565217391305, "loss": 3.5889, "step": 324 }, { "crossentropy": 3.4300484657287598, "epoch": 0.011782192575406032, "grad_norm": 0.07994164526462555, "grad_norm_var": 5.593458905031208e-05, "learning_rate": 0.005887681159420289, "loss": 3.5066, "step": 325 }, { "crossentropy": 3.5097317695617676, "epoch": 0.011818445475638052, "grad_norm": 0.08196486532688141, "grad_norm_var": 5.527338589802015e-05, "learning_rate": 0.005905797101449275, "loss": 3.4697, "step": 326 }, { "crossentropy": 3.684917688369751, "epoch": 0.01185469837587007, "grad_norm": 0.0800597295165062, "grad_norm_var": 5.139912037837753e-05, "learning_rate": 0.00592391304347826, "loss": 3.7157, "step": 327 }, { "crossentropy": 3.566286563873291, "epoch": 0.011890951276102088, "grad_norm": 0.1149894967675209, "grad_norm_var": 0.00011052545845312358, "learning_rate": 0.005942028985507246, "loss": 3.5288, "step": 328 }, { "crossentropy": 3.5792319774627686, "epoch": 0.011927204176334107, "grad_norm": 0.07905799895524979, "grad_norm_var": 0.00011162406978476568, "learning_rate": 0.0059601449275362315, "loss": 3.5817, "step": 329 }, { "crossentropy": 3.570779323577881, "epoch": 0.011963457076566125, "grad_norm": 0.0924992710351944, "grad_norm_var": 0.00011233933963537533, "learning_rate": 0.005978260869565218, "loss": 3.525, "step": 330 }, { "crossentropy": 3.5518035888671875, "epoch": 0.011999709976798143, "grad_norm": 0.08576760441064835, "grad_norm_var": 0.00010722738939083463, "learning_rate": 0.005996376811594203, "loss": 3.5735, "step": 331 }, { "crossentropy": 3.78454327583313, "epoch": 0.012035962877030163, "grad_norm": 0.07692056894302368, "grad_norm_var": 0.00011043184670764422, "learning_rate": 0.006014492753623189, "loss": 3.6581, "step": 332 }, { "crossentropy": 3.550543785095215, "epoch": 0.01207221577726218, "grad_norm": 0.09338723123073578, "grad_norm_var": 9.634131060514301e-05, "learning_rate": 0.006032608695652174, "loss": 3.5271, "step": 333 }, { "crossentropy": 3.718745231628418, "epoch": 0.0121084686774942, "grad_norm": 0.07919477671384811, "grad_norm_var": 9.400249532898845e-05, "learning_rate": 0.00605072463768116, "loss": 3.6485, "step": 334 }, { "crossentropy": 3.539457082748413, "epoch": 0.012144721577726218, "grad_norm": 0.07498329877853394, "grad_norm_var": 9.986741192514285e-05, "learning_rate": 0.006068840579710145, "loss": 3.5823, "step": 335 }, { "crossentropy": 3.498491048812866, "epoch": 0.012180974477958236, "grad_norm": 0.08305423706769943, "grad_norm_var": 9.206761413174276e-05, "learning_rate": 0.006086956521739131, "loss": 3.5363, "step": 336 }, { "crossentropy": 3.550910711288452, "epoch": 0.012217227378190256, "grad_norm": 0.0794009193778038, "grad_norm_var": 9.389694544149156e-05, "learning_rate": 0.006105072463768116, "loss": 3.5774, "step": 337 }, { "crossentropy": 3.490596055984497, "epoch": 0.012253480278422274, "grad_norm": 0.0736975446343422, "grad_norm_var": 9.966557314917482e-05, "learning_rate": 0.006123188405797102, "loss": 3.5716, "step": 338 }, { "crossentropy": 3.601818799972534, "epoch": 0.012289733178654292, "grad_norm": 0.07918187975883484, "grad_norm_var": 0.00010084330738782653, "learning_rate": 0.006141304347826087, "loss": 3.6164, "step": 339 }, { "crossentropy": 3.6879959106445312, "epoch": 0.012325986078886311, "grad_norm": 0.10042684525251389, "grad_norm_var": 0.00011688109374790566, "learning_rate": 0.006159420289855072, "loss": 3.6356, "step": 340 }, { "crossentropy": 3.523080587387085, "epoch": 0.01236223897911833, "grad_norm": 0.0798918604850769, "grad_norm_var": 0.00011691255564369613, "learning_rate": 0.006177536231884058, "loss": 3.5618, "step": 341 }, { "crossentropy": 3.5473625659942627, "epoch": 0.012398491879350349, "grad_norm": 0.07507070899009705, "grad_norm_var": 0.00012235586295135636, "learning_rate": 0.006195652173913043, "loss": 3.5683, "step": 342 }, { "crossentropy": 3.461395740509033, "epoch": 0.012434744779582367, "grad_norm": 0.07260739803314209, "grad_norm_var": 0.00012996474219490278, "learning_rate": 0.006213768115942029, "loss": 3.509, "step": 343 }, { "crossentropy": 3.541700839996338, "epoch": 0.012470997679814385, "grad_norm": 0.07864222675561905, "grad_norm_var": 6.117881107468767e-05, "learning_rate": 0.006231884057971014, "loss": 3.5191, "step": 344 }, { "crossentropy": 3.6261472702026367, "epoch": 0.012507250580046404, "grad_norm": 0.11656654626131058, "grad_norm_var": 0.00013696411751185818, "learning_rate": 0.00625, "loss": 3.5413, "step": 345 }, { "crossentropy": 3.585214853286743, "epoch": 0.012543503480278422, "grad_norm": 0.07826150208711624, "grad_norm_var": 0.00013317780215449357, "learning_rate": 0.0062681159420289855, "loss": 3.5752, "step": 346 }, { "crossentropy": 3.3134448528289795, "epoch": 0.01257975638051044, "grad_norm": 0.07962746173143387, "grad_norm_var": 0.0001332199925231016, "learning_rate": 0.0062862318840579716, "loss": 3.4474, "step": 347 }, { "crossentropy": 3.589108467102051, "epoch": 0.01261600928074246, "grad_norm": 0.08822091668844223, "grad_norm_var": 0.00013270834150624078, "learning_rate": 0.006304347826086957, "loss": 3.5737, "step": 348 }, { "crossentropy": 3.534435749053955, "epoch": 0.012652262180974478, "grad_norm": 0.0861198902130127, "grad_norm_var": 0.000126199511752767, "learning_rate": 0.006322463768115943, "loss": 3.4917, "step": 349 }, { "crossentropy": 3.6580746173858643, "epoch": 0.012688515081206497, "grad_norm": 0.08473580330610275, "grad_norm_var": 0.00012544806146198942, "learning_rate": 0.006340579710144928, "loss": 3.5861, "step": 350 }, { "crossentropy": 3.603139877319336, "epoch": 0.012724767981438515, "grad_norm": 0.07643251866102219, "grad_norm_var": 0.00012400020493496695, "learning_rate": 0.006358695652173914, "loss": 3.5674, "step": 351 }, { "crossentropy": 3.492473602294922, "epoch": 0.012761020881670533, "grad_norm": 0.08763831108808517, "grad_norm_var": 0.00012519626964539615, "learning_rate": 0.006376811594202898, "loss": 3.5617, "step": 352 }, { "crossentropy": 3.4340548515319824, "epoch": 0.012797273781902553, "grad_norm": 0.0743059441447258, "grad_norm_var": 0.0001296254987955397, "learning_rate": 0.0063949275362318835, "loss": 3.4409, "step": 353 }, { "crossentropy": 3.4237043857574463, "epoch": 0.01283352668213457, "grad_norm": 0.07387466728687286, "grad_norm_var": 0.00012940271064044347, "learning_rate": 0.0064130434782608695, "loss": 3.3944, "step": 354 }, { "crossentropy": 3.6178019046783447, "epoch": 0.012869779582366589, "grad_norm": 0.07535757124423981, "grad_norm_var": 0.00013237855546942106, "learning_rate": 0.006431159420289855, "loss": 3.5765, "step": 355 }, { "crossentropy": 3.508892059326172, "epoch": 0.012906032482598608, "grad_norm": 0.0737202987074852, "grad_norm_var": 0.00011485232805443686, "learning_rate": 0.006449275362318841, "loss": 3.5468, "step": 356 }, { "crossentropy": 3.531202554702759, "epoch": 0.012942285382830626, "grad_norm": 0.07732763886451721, "grad_norm_var": 0.00011575056463287252, "learning_rate": 0.006467391304347826, "loss": 3.5908, "step": 357 }, { "crossentropy": 3.6475818157196045, "epoch": 0.012978538283062646, "grad_norm": 0.07402508705854416, "grad_norm_var": 0.00011666740275452202, "learning_rate": 0.006485507246376812, "loss": 3.616, "step": 358 }, { "crossentropy": 3.5336480140686035, "epoch": 0.013014791183294664, "grad_norm": 0.07586218416690826, "grad_norm_var": 0.00011364765238712192, "learning_rate": 0.006503623188405797, "loss": 3.5648, "step": 359 }, { "crossentropy": 3.5177767276763916, "epoch": 0.013051044083526682, "grad_norm": 0.07289610803127289, "grad_norm_var": 0.00011774362135558898, "learning_rate": 0.006521739130434783, "loss": 3.521, "step": 360 }, { "crossentropy": 3.4844284057617188, "epoch": 0.013087296983758701, "grad_norm": 0.07567230612039566, "grad_norm_var": 2.798570918358667e-05, "learning_rate": 0.006539855072463768, "loss": 3.5316, "step": 361 }, { "crossentropy": 3.275210380554199, "epoch": 0.01312354988399072, "grad_norm": 0.07434966415166855, "grad_norm_var": 2.9003861515311537e-05, "learning_rate": 0.006557971014492754, "loss": 3.3925, "step": 362 }, { "crossentropy": 3.4880683422088623, "epoch": 0.013159802784222737, "grad_norm": 0.08340936154127121, "grad_norm_var": 3.0650162364135686e-05, "learning_rate": 0.0065760869565217395, "loss": 3.5208, "step": 363 }, { "crossentropy": 3.6337907314300537, "epoch": 0.013196055684454757, "grad_norm": 0.07496842741966248, "grad_norm_var": 2.4223508365691047e-05, "learning_rate": 0.0065942028985507255, "loss": 3.6091, "step": 364 }, { "crossentropy": 3.563347101211548, "epoch": 0.013232308584686775, "grad_norm": 0.07195571064949036, "grad_norm_var": 2.056547742281973e-05, "learning_rate": 0.006612318840579711, "loss": 3.5324, "step": 365 }, { "crossentropy": 3.6122117042541504, "epoch": 0.013268561484918794, "grad_norm": 0.07661737501621246, "grad_norm_var": 1.5941150425469175e-05, "learning_rate": 0.006630434782608695, "loss": 3.6086, "step": 366 }, { "crossentropy": 3.4908673763275146, "epoch": 0.013304814385150812, "grad_norm": 0.07294797897338867, "grad_norm_var": 1.6569149361533633e-05, "learning_rate": 0.006648550724637681, "loss": 3.4441, "step": 367 }, { "crossentropy": 3.5683953762054443, "epoch": 0.01334106728538283, "grad_norm": 0.08917582035064697, "grad_norm_var": 1.911649038239723e-05, "learning_rate": 0.006666666666666666, "loss": 3.5135, "step": 368 }, { "crossentropy": 3.4920120239257812, "epoch": 0.01337732018561485, "grad_norm": 0.15842483937740326, "grad_norm_var": 0.0004420387304345313, "learning_rate": 0.006684782608695652, "loss": 3.5007, "step": 369 }, { "crossentropy": 3.577091932296753, "epoch": 0.013413573085846868, "grad_norm": 0.07735821604728699, "grad_norm_var": 0.0004393545473504917, "learning_rate": 0.0067028985507246374, "loss": 3.4549, "step": 370 }, { "crossentropy": 3.4644813537597656, "epoch": 0.013449825986078886, "grad_norm": 0.0757811889052391, "grad_norm_var": 0.0004390185821305977, "learning_rate": 0.0067210144927536235, "loss": 3.3846, "step": 371 }, { "crossentropy": 3.3571813106536865, "epoch": 0.013486078886310905, "grad_norm": 0.0782000869512558, "grad_norm_var": 0.0004356076334988688, "learning_rate": 0.006739130434782609, "loss": 3.4066, "step": 372 }, { "crossentropy": 3.500845193862915, "epoch": 0.013522331786542923, "grad_norm": 0.08100726455450058, "grad_norm_var": 0.0004342543720664699, "learning_rate": 0.006757246376811595, "loss": 3.5457, "step": 373 }, { "crossentropy": 3.3769571781158447, "epoch": 0.013558584686774943, "grad_norm": 0.07336889207363129, "grad_norm_var": 0.00043498259368563786, "learning_rate": 0.00677536231884058, "loss": 3.3786, "step": 374 }, { "crossentropy": 3.487448215484619, "epoch": 0.01359483758700696, "grad_norm": 0.07530085742473602, "grad_norm_var": 0.00043546164128597587, "learning_rate": 0.006793478260869566, "loss": 3.4838, "step": 375 }, { "crossentropy": 3.5025100708007812, "epoch": 0.013631090487238979, "grad_norm": 0.07432354241609573, "grad_norm_var": 0.00043386302645742245, "learning_rate": 0.006811594202898551, "loss": 3.5498, "step": 376 }, { "crossentropy": 3.4972891807556152, "epoch": 0.013667343387470998, "grad_norm": 0.07668522745370865, "grad_norm_var": 0.0004330652857086744, "learning_rate": 0.006829710144927537, "loss": 3.5184, "step": 377 }, { "crossentropy": 3.5986597537994385, "epoch": 0.013703596287703016, "grad_norm": 0.07577864080667496, "grad_norm_var": 0.00043171296767371693, "learning_rate": 0.006847826086956522, "loss": 3.5377, "step": 378 }, { "crossentropy": 3.4588418006896973, "epoch": 0.013739849187935034, "grad_norm": 0.08404802531003952, "grad_norm_var": 0.00043184089379870634, "learning_rate": 0.006865942028985508, "loss": 3.425, "step": 379 }, { "crossentropy": 3.3981330394744873, "epoch": 0.013776102088167054, "grad_norm": 0.0765705332159996, "grad_norm_var": 0.00043044664171655844, "learning_rate": 0.006884057971014493, "loss": 3.4465, "step": 380 }, { "crossentropy": 3.571713924407959, "epoch": 0.013812354988399072, "grad_norm": 0.08365282416343689, "grad_norm_var": 0.0004227923939647713, "learning_rate": 0.006902173913043478, "loss": 3.5258, "step": 381 }, { "crossentropy": 3.3678994178771973, "epoch": 0.013848607888631091, "grad_norm": 0.07547866553068161, "grad_norm_var": 0.0004238542750432108, "learning_rate": 0.006920289855072464, "loss": 3.4441, "step": 382 }, { "crossentropy": 3.4121851921081543, "epoch": 0.013884860788863109, "grad_norm": 0.06851305067539215, "grad_norm_var": 0.00043103135285169997, "learning_rate": 0.006938405797101449, "loss": 3.4584, "step": 383 }, { "crossentropy": 3.4365222454071045, "epoch": 0.013921113689095127, "grad_norm": 0.07532430440187454, "grad_norm_var": 0.00043111687664317255, "learning_rate": 0.006956521739130435, "loss": 3.532, "step": 384 }, { "crossentropy": 3.5355842113494873, "epoch": 0.013957366589327147, "grad_norm": 0.06749235093593597, "grad_norm_var": 1.9656757180399007e-05, "learning_rate": 0.00697463768115942, "loss": 3.5244, "step": 385 }, { "crossentropy": 3.513319253921509, "epoch": 0.013993619489559165, "grad_norm": 0.07463652640581131, "grad_norm_var": 1.96922499734159e-05, "learning_rate": 0.006992753623188406, "loss": 3.459, "step": 386 }, { "crossentropy": 3.3792009353637695, "epoch": 0.014029872389791182, "grad_norm": 0.08664506673812866, "grad_norm_var": 2.6737123935819535e-05, "learning_rate": 0.007010869565217391, "loss": 3.4436, "step": 387 }, { "crossentropy": 3.396704912185669, "epoch": 0.014066125290023202, "grad_norm": 0.07597548514604568, "grad_norm_var": 2.6598252822887525e-05, "learning_rate": 0.0070289855072463775, "loss": 3.3949, "step": 388 }, { "crossentropy": 3.4755380153656006, "epoch": 0.01410237819025522, "grad_norm": 0.07447408139705658, "grad_norm_var": 2.5383292978518186e-05, "learning_rate": 0.007047101449275363, "loss": 3.4611, "step": 389 }, { "crossentropy": 3.3847038745880127, "epoch": 0.01413863109048724, "grad_norm": 0.07569452375173569, "grad_norm_var": 2.4861507213760902e-05, "learning_rate": 0.007065217391304349, "loss": 3.4916, "step": 390 }, { "crossentropy": 3.5034289360046387, "epoch": 0.014174883990719258, "grad_norm": 0.10068751871585846, "grad_norm_var": 6.18033247283295e-05, "learning_rate": 0.007083333333333334, "loss": 3.496, "step": 391 }, { "crossentropy": 3.3285679817199707, "epoch": 0.014211136890951276, "grad_norm": 0.068727046251297, "grad_norm_var": 6.641005301362482e-05, "learning_rate": 0.007101449275362319, "loss": 3.3829, "step": 392 }, { "crossentropy": 3.6101181507110596, "epoch": 0.014247389791183295, "grad_norm": 0.07583058625459671, "grad_norm_var": 6.655128276117153e-05, "learning_rate": 0.007119565217391305, "loss": 3.5737, "step": 393 }, { "crossentropy": 3.358393430709839, "epoch": 0.014283642691415313, "grad_norm": 0.07620744407176971, "grad_norm_var": 6.646604041586068e-05, "learning_rate": 0.007137681159420289, "loss": 3.4624, "step": 394 }, { "crossentropy": 3.522677421569824, "epoch": 0.014319895591647331, "grad_norm": 0.07162865996360779, "grad_norm_var": 6.525876095886187e-05, "learning_rate": 0.007155797101449275, "loss": 3.4217, "step": 395 }, { "crossentropy": 3.4273650646209717, "epoch": 0.01435614849187935, "grad_norm": 0.07451940327882767, "grad_norm_var": 6.556290268870183e-05, "learning_rate": 0.007173913043478261, "loss": 3.5058, "step": 396 }, { "crossentropy": 3.442660093307495, "epoch": 0.014392401392111369, "grad_norm": 0.07478702068328857, "grad_norm_var": 6.213005450616514e-05, "learning_rate": 0.007192028985507247, "loss": 3.4513, "step": 397 }, { "crossentropy": 3.4532575607299805, "epoch": 0.014428654292343388, "grad_norm": 0.07107475399971008, "grad_norm_var": 6.367114544208556e-05, "learning_rate": 0.007210144927536232, "loss": 3.4254, "step": 398 }, { "crossentropy": 3.5147106647491455, "epoch": 0.014464907192575406, "grad_norm": 0.08266875147819519, "grad_norm_var": 6.251022983169893e-05, "learning_rate": 0.007228260869565218, "loss": 3.5094, "step": 399 }, { "crossentropy": 3.562474489212036, "epoch": 0.014501160092807424, "grad_norm": 0.07738645374774933, "grad_norm_var": 6.24119592568526e-05, "learning_rate": 0.007246376811594203, "loss": 3.512, "step": 400 }, { "crossentropy": 3.3842575550079346, "epoch": 0.014537412993039444, "grad_norm": 0.08255626261234283, "grad_norm_var": 5.794567009242945e-05, "learning_rate": 0.007264492753623189, "loss": 3.3935, "step": 401 }, { "crossentropy": 3.440669059753418, "epoch": 0.014573665893271462, "grad_norm": 0.23188205063343048, "grad_norm_var": 0.0015387087798474522, "learning_rate": 0.007282608695652174, "loss": 3.4705, "step": 402 }, { "crossentropy": 3.3488667011260986, "epoch": 0.01460991879350348, "grad_norm": 0.07735315710306168, "grad_norm_var": 0.001545221894595272, "learning_rate": 0.007300724637681159, "loss": 3.446, "step": 403 }, { "crossentropy": 3.5141355991363525, "epoch": 0.014646171693735499, "grad_norm": 0.081379234790802, "grad_norm_var": 0.0015391283871810575, "learning_rate": 0.007318840579710145, "loss": 3.4209, "step": 404 }, { "crossentropy": 3.4379451274871826, "epoch": 0.014682424593967517, "grad_norm": 0.07398152351379395, "grad_norm_var": 0.001539986118573955, "learning_rate": 0.007336956521739131, "loss": 3.5236, "step": 405 }, { "crossentropy": 3.568312883377075, "epoch": 0.014718677494199537, "grad_norm": 0.07001513987779617, "grad_norm_var": 0.0015507697251721695, "learning_rate": 0.007355072463768117, "loss": 3.4873, "step": 406 }, { "crossentropy": 3.496826171875, "epoch": 0.014754930394431555, "grad_norm": 0.07851694524288177, "grad_norm_var": 0.0015407863175205844, "learning_rate": 0.007373188405797102, "loss": 3.4109, "step": 407 }, { "crossentropy": 3.4009974002838135, "epoch": 0.014791183294663572, "grad_norm": 0.06920502334833145, "grad_norm_var": 0.001539729602308028, "learning_rate": 0.007391304347826087, "loss": 3.3739, "step": 408 }, { "crossentropy": 3.423499345779419, "epoch": 0.014827436194895592, "grad_norm": 0.06867057830095291, "grad_norm_var": 0.0015522240005680196, "learning_rate": 0.007409420289855072, "loss": 3.4182, "step": 409 }, { "crossentropy": 3.3889882564544678, "epoch": 0.01486368909512761, "grad_norm": 0.06797011941671371, "grad_norm_var": 0.0015662475812363648, "learning_rate": 0.007427536231884058, "loss": 3.4419, "step": 410 }, { "crossentropy": 3.411120653152466, "epoch": 0.014899941995359628, "grad_norm": 0.09694691747426987, "grad_norm_var": 0.0015625237746130706, "learning_rate": 0.007445652173913043, "loss": 3.3989, "step": 411 }, { "crossentropy": 3.467514991760254, "epoch": 0.014936194895591648, "grad_norm": 0.09981544315814972, "grad_norm_var": 0.0015631809269532595, "learning_rate": 0.007463768115942029, "loss": 3.493, "step": 412 }, { "crossentropy": 3.399747371673584, "epoch": 0.014972447795823665, "grad_norm": 0.07527470588684082, "grad_norm_var": 0.0015623520270853452, "learning_rate": 0.0074818840579710146, "loss": 3.3255, "step": 413 }, { "crossentropy": 3.46065616607666, "epoch": 0.015008700696055685, "grad_norm": 0.07265092432498932, "grad_norm_var": 0.001558993737201178, "learning_rate": 0.0075, "loss": 3.3967, "step": 414 }, { "crossentropy": 3.2030246257781982, "epoch": 0.015044953596287703, "grad_norm": 0.07259818911552429, "grad_norm_var": 0.0015723458279745556, "learning_rate": 0.007518115942028986, "loss": 3.3664, "step": 415 }, { "crossentropy": 3.3476223945617676, "epoch": 0.015081206496519721, "grad_norm": 0.07432989776134491, "grad_norm_var": 0.0015769546961246356, "learning_rate": 0.007536231884057971, "loss": 3.3843, "step": 416 }, { "crossentropy": 3.358029365539551, "epoch": 0.01511745939675174, "grad_norm": 0.07036937028169632, "grad_norm_var": 0.0015935743271321189, "learning_rate": 0.007554347826086957, "loss": 3.3752, "step": 417 }, { "crossentropy": 3.384023666381836, "epoch": 0.015153712296983759, "grad_norm": 0.18037578463554382, "grad_norm_var": 0.0007596635890179021, "learning_rate": 0.007572463768115942, "loss": 3.3816, "step": 418 }, { "crossentropy": 3.438408374786377, "epoch": 0.015189965197215776, "grad_norm": 0.0703258365392685, "grad_norm_var": 0.0007681260843163822, "learning_rate": 0.007590579710144928, "loss": 3.4852, "step": 419 }, { "crossentropy": 3.346719741821289, "epoch": 0.015226218097447796, "grad_norm": 0.06911547482013702, "grad_norm_var": 0.0007796066066653238, "learning_rate": 0.007608695652173913, "loss": 3.355, "step": 420 }, { "crossentropy": 3.4767746925354004, "epoch": 0.015262470997679814, "grad_norm": 0.06616703420877457, "grad_norm_var": 0.0007916582532054459, "learning_rate": 0.0076268115942028985, "loss": 3.4625, "step": 421 }, { "crossentropy": 3.5249104499816895, "epoch": 0.015298723897911834, "grad_norm": 0.0667826384305954, "grad_norm_var": 0.0007972167793817819, "learning_rate": 0.007644927536231884, "loss": 3.4615, "step": 422 }, { "crossentropy": 3.5629310607910156, "epoch": 0.015334976798143852, "grad_norm": 0.07284633815288544, "grad_norm_var": 0.0008012511002955784, "learning_rate": 0.00766304347826087, "loss": 3.4708, "step": 423 }, { "crossentropy": 3.5377635955810547, "epoch": 0.01537122969837587, "grad_norm": 0.07768989354372025, "grad_norm_var": 0.0007925875263023167, "learning_rate": 0.007681159420289855, "loss": 3.5014, "step": 424 }, { "crossentropy": 3.335219621658325, "epoch": 0.015407482598607889, "grad_norm": 0.0734563022851944, "grad_norm_var": 0.000785915151741854, "learning_rate": 0.007699275362318841, "loss": 3.3731, "step": 425 }, { "crossentropy": 3.506666898727417, "epoch": 0.015443735498839907, "grad_norm": 0.07088012248277664, "grad_norm_var": 0.0007811289742419604, "learning_rate": 0.007717391304347826, "loss": 3.4836, "step": 426 }, { "crossentropy": 3.5011754035949707, "epoch": 0.015479988399071925, "grad_norm": 0.06614641845226288, "grad_norm_var": 0.0007784282674382498, "learning_rate": 0.007735507246376811, "loss": 3.4835, "step": 427 }, { "crossentropy": 3.4074149131774902, "epoch": 0.015516241299303945, "grad_norm": 0.07307995855808258, "grad_norm_var": 0.0007522037465294515, "learning_rate": 0.007753623188405797, "loss": 3.3839, "step": 428 }, { "crossentropy": 3.418712615966797, "epoch": 0.015552494199535962, "grad_norm": 0.06872078031301498, "grad_norm_var": 0.0007574932034218215, "learning_rate": 0.0077717391304347825, "loss": 3.469, "step": 429 }, { "crossentropy": 3.3389992713928223, "epoch": 0.015588747099767982, "grad_norm": 0.08971322327852249, "grad_norm_var": 0.0007638698206880271, "learning_rate": 0.0077898550724637685, "loss": 3.3295, "step": 430 }, { "crossentropy": 3.516091823577881, "epoch": 0.015625, "grad_norm": 0.1918448507785797, "grad_norm_var": 0.0015522132030166688, "learning_rate": 0.007807971014492754, "loss": 3.4215, "step": 431 }, { "crossentropy": 3.3540711402893066, "epoch": 0.015661252900232018, "grad_norm": 0.07159462571144104, "grad_norm_var": 0.0015570701367156427, "learning_rate": 0.00782608695652174, "loss": 3.361, "step": 432 }, { "crossentropy": 3.4438283443450928, "epoch": 0.015697505800464036, "grad_norm": 0.06877025961875916, "grad_norm_var": 0.0015606040654454255, "learning_rate": 0.007844202898550725, "loss": 3.3047, "step": 433 }, { "crossentropy": 3.377261161804199, "epoch": 0.015733758700696057, "grad_norm": 0.07261960953474045, "grad_norm_var": 0.0009317285221354369, "learning_rate": 0.00786231884057971, "loss": 3.4102, "step": 434 }, { "crossentropy": 3.5245068073272705, "epoch": 0.015770011600928075, "grad_norm": 0.08787623047828674, "grad_norm_var": 0.0009298400957201016, "learning_rate": 0.007880434782608695, "loss": 3.4456, "step": 435 }, { "crossentropy": 3.139965772628784, "epoch": 0.015806264501160093, "grad_norm": 0.09740269929170609, "grad_norm_var": 0.0009370764439108445, "learning_rate": 0.00789855072463768, "loss": 3.2954, "step": 436 }, { "crossentropy": 3.446301221847534, "epoch": 0.01584251740139211, "grad_norm": 0.07024470716714859, "grad_norm_var": 0.0009293854127394028, "learning_rate": 0.007916666666666666, "loss": 3.472, "step": 437 }, { "crossentropy": 3.4654734134674072, "epoch": 0.01587877030162413, "grad_norm": 0.14728708565235138, "grad_norm_var": 0.0011659590759729719, "learning_rate": 0.007934782608695653, "loss": 3.4561, "step": 438 }, { "crossentropy": 3.3144712448120117, "epoch": 0.01591502320185615, "grad_norm": 0.07457926869392395, "grad_norm_var": 0.0011627584295733349, "learning_rate": 0.007952898550724638, "loss": 3.2965, "step": 439 }, { "crossentropy": 3.4712400436401367, "epoch": 0.015951276102088168, "grad_norm": 0.06986482441425323, "grad_norm_var": 0.001176945003263563, "learning_rate": 0.007971014492753623, "loss": 3.365, "step": 440 }, { "crossentropy": 3.3170347213745117, "epoch": 0.015987529002320186, "grad_norm": 0.06862474977970123, "grad_norm_var": 0.0011872127277057997, "learning_rate": 0.007989130434782608, "loss": 3.376, "step": 441 }, { "crossentropy": 3.4261491298675537, "epoch": 0.016023781902552204, "grad_norm": 0.06622549146413803, "grad_norm_var": 0.0011984644133139035, "learning_rate": 0.008007246376811595, "loss": 3.4387, "step": 442 }, { "crossentropy": 3.346585750579834, "epoch": 0.016060034802784222, "grad_norm": 0.06803715229034424, "grad_norm_var": 0.0011935473774465318, "learning_rate": 0.00802536231884058, "loss": 3.3482, "step": 443 }, { "crossentropy": 3.383690357208252, "epoch": 0.01609628770301624, "grad_norm": 0.06622589379549026, "grad_norm_var": 0.0012088897253922368, "learning_rate": 0.008043478260869565, "loss": 3.3679, "step": 444 }, { "crossentropy": 3.3036460876464844, "epoch": 0.01613254060324826, "grad_norm": 0.08091865479946136, "grad_norm_var": 0.0011897172172135179, "learning_rate": 0.00806159420289855, "loss": 3.3252, "step": 445 }, { "crossentropy": 3.389893054962158, "epoch": 0.01616879350348028, "grad_norm": 0.06859023123979568, "grad_norm_var": 0.0012099319548460284, "learning_rate": 0.008079710144927537, "loss": 3.3777, "step": 446 }, { "crossentropy": 3.248444080352783, "epoch": 0.016205046403712297, "grad_norm": 0.06781932711601257, "grad_norm_var": 0.00041552795961135686, "learning_rate": 0.008097826086956523, "loss": 3.3615, "step": 447 }, { "crossentropy": 3.4388556480407715, "epoch": 0.016241299303944315, "grad_norm": 0.07484560459852219, "grad_norm_var": 0.0004134477541792652, "learning_rate": 0.008115942028985508, "loss": 3.4111, "step": 448 }, { "crossentropy": 3.3554844856262207, "epoch": 0.016277552204176333, "grad_norm": 0.06653818488121033, "grad_norm_var": 0.0004165419342391986, "learning_rate": 0.008134057971014493, "loss": 3.3891, "step": 449 }, { "crossentropy": 3.318077325820923, "epoch": 0.016313805104408354, "grad_norm": 0.0698573887348175, "grad_norm_var": 0.0004189934654763278, "learning_rate": 0.008152173913043478, "loss": 3.3003, "step": 450 }, { "crossentropy": 3.3242828845977783, "epoch": 0.016350058004640372, "grad_norm": 0.0704098716378212, "grad_norm_var": 0.0004146145762544182, "learning_rate": 0.008170289855072463, "loss": 3.3357, "step": 451 }, { "crossentropy": 3.4115045070648193, "epoch": 0.01638631090487239, "grad_norm": 0.0716426819562912, "grad_norm_var": 0.0003850395168605821, "learning_rate": 0.008188405797101448, "loss": 3.3683, "step": 452 }, { "crossentropy": 3.530386209487915, "epoch": 0.016422563805104408, "grad_norm": 0.07210268825292587, "grad_norm_var": 0.0003840507466215846, "learning_rate": 0.008206521739130435, "loss": 3.3782, "step": 453 }, { "crossentropy": 3.385469675064087, "epoch": 0.016458816705336426, "grad_norm": 0.07155074179172516, "grad_norm_var": 1.483500230758846e-05, "learning_rate": 0.00822463768115942, "loss": 3.3892, "step": 454 }, { "crossentropy": 3.2552683353424072, "epoch": 0.016495069605568447, "grad_norm": 0.07418261468410492, "grad_norm_var": 1.4628541784674395e-05, "learning_rate": 0.008242753623188406, "loss": 3.2916, "step": 455 }, { "crossentropy": 3.404574394226074, "epoch": 0.016531322505800465, "grad_norm": 0.07184728980064392, "grad_norm_var": 1.4715598110813985e-05, "learning_rate": 0.00826086956521739, "loss": 3.4775, "step": 456 }, { "crossentropy": 3.495000123977661, "epoch": 0.016567575406032483, "grad_norm": 0.08239766210317612, "grad_norm_var": 2.2964915174544568e-05, "learning_rate": 0.008278985507246378, "loss": 3.4872, "step": 457 }, { "crossentropy": 3.447628974914551, "epoch": 0.0166038283062645, "grad_norm": 0.0694870874285698, "grad_norm_var": 2.135799082486229e-05, "learning_rate": 0.008297101449275363, "loss": 3.425, "step": 458 }, { "crossentropy": 3.4203603267669678, "epoch": 0.01664008120649652, "grad_norm": 0.06738352030515671, "grad_norm_var": 2.1699845137452636e-05, "learning_rate": 0.008315217391304348, "loss": 3.3175, "step": 459 }, { "crossentropy": 3.369736433029175, "epoch": 0.016676334106728537, "grad_norm": 0.06763142347335815, "grad_norm_var": 2.0813849946900674e-05, "learning_rate": 0.008333333333333333, "loss": 3.4282, "step": 460 }, { "crossentropy": 3.3039541244506836, "epoch": 0.016712587006960558, "grad_norm": 0.06767123937606812, "grad_norm_var": 1.5499661579580075e-05, "learning_rate": 0.00835144927536232, "loss": 3.3516, "step": 461 }, { "crossentropy": 3.311474084854126, "epoch": 0.016748839907192576, "grad_norm": 0.06572616845369339, "grad_norm_var": 1.6883823228645152e-05, "learning_rate": 0.008369565217391305, "loss": 3.2953, "step": 462 }, { "crossentropy": 3.292489528656006, "epoch": 0.016785092807424594, "grad_norm": 0.07628379017114639, "grad_norm_var": 1.8118168519931643e-05, "learning_rate": 0.008387681159420289, "loss": 3.4185, "step": 463 }, { "crossentropy": 3.257793426513672, "epoch": 0.016821345707656612, "grad_norm": 0.06259351223707199, "grad_norm_var": 2.1581322156268475e-05, "learning_rate": 0.008405797101449276, "loss": 3.2899, "step": 464 }, { "crossentropy": 3.3756330013275146, "epoch": 0.01685759860788863, "grad_norm": 0.06412434577941895, "grad_norm_var": 2.320661434397353e-05, "learning_rate": 0.00842391304347826, "loss": 3.3427, "step": 465 }, { "crossentropy": 3.353093147277832, "epoch": 0.01689385150812065, "grad_norm": 0.06420785933732986, "grad_norm_var": 2.5539176830724255e-05, "learning_rate": 0.008442028985507246, "loss": 3.3828, "step": 466 }, { "crossentropy": 3.2794957160949707, "epoch": 0.01693010440835267, "grad_norm": 0.06773433834314346, "grad_norm_var": 2.5823475660742788e-05, "learning_rate": 0.008460144927536231, "loss": 3.3, "step": 467 }, { "crossentropy": 3.1897547245025635, "epoch": 0.016966357308584687, "grad_norm": 0.07134365290403366, "grad_norm_var": 2.575501487821023e-05, "learning_rate": 0.008478260869565218, "loss": 3.1886, "step": 468 }, { "crossentropy": 3.3260445594787598, "epoch": 0.017002610208816705, "grad_norm": 0.07491853833198547, "grad_norm_var": 2.7127599901023933e-05, "learning_rate": 0.008496376811594203, "loss": 3.3433, "step": 469 }, { "crossentropy": 3.31577205657959, "epoch": 0.017038863109048723, "grad_norm": 0.06565466523170471, "grad_norm_var": 2.803620937718183e-05, "learning_rate": 0.008514492753623188, "loss": 3.2819, "step": 470 }, { "crossentropy": 3.367560386657715, "epoch": 0.017075116009280744, "grad_norm": 0.06933457404375076, "grad_norm_var": 2.6526294303634106e-05, "learning_rate": 0.008532608695652174, "loss": 3.3261, "step": 471 }, { "crossentropy": 3.3931355476379395, "epoch": 0.017111368909512762, "grad_norm": 0.0663369670510292, "grad_norm_var": 2.6531366148514988e-05, "learning_rate": 0.00855072463768116, "loss": 3.3709, "step": 472 }, { "crossentropy": 3.2654671669006348, "epoch": 0.01714762180974478, "grad_norm": 0.07279624789953232, "grad_norm_var": 1.5047863325586228e-05, "learning_rate": 0.008568840579710146, "loss": 3.3078, "step": 473 }, { "crossentropy": 3.3388569355010986, "epoch": 0.017183874709976798, "grad_norm": 0.06353197246789932, "grad_norm_var": 1.6342996303116373e-05, "learning_rate": 0.00858695652173913, "loss": 3.2872, "step": 474 }, { "crossentropy": 3.21557879447937, "epoch": 0.017220127610208816, "grad_norm": 0.068290114402771, "grad_norm_var": 1.6325340120215694e-05, "learning_rate": 0.008605072463768116, "loss": 3.302, "step": 475 }, { "crossentropy": 3.147590398788452, "epoch": 0.017256380510440834, "grad_norm": 0.0748642235994339, "grad_norm_var": 1.9228668485396506e-05, "learning_rate": 0.008623188405797103, "loss": 3.2391, "step": 476 }, { "crossentropy": 3.5093719959259033, "epoch": 0.017292633410672855, "grad_norm": 0.06930918246507645, "grad_norm_var": 1.922337511133775e-05, "learning_rate": 0.008641304347826086, "loss": 3.3981, "step": 477 }, { "crossentropy": 3.25329852104187, "epoch": 0.017328886310904873, "grad_norm": 0.07338903844356537, "grad_norm_var": 1.999222070839431e-05, "learning_rate": 0.008659420289855072, "loss": 3.251, "step": 478 }, { "crossentropy": 3.3681952953338623, "epoch": 0.01736513921113689, "grad_norm": 0.07234054058790207, "grad_norm_var": 1.715790333471819e-05, "learning_rate": 0.008677536231884058, "loss": 3.3512, "step": 479 }, { "crossentropy": 3.3327219486236572, "epoch": 0.01740139211136891, "grad_norm": 0.06852211058139801, "grad_norm_var": 1.4450061011833427e-05, "learning_rate": 0.008695652173913044, "loss": 3.3333, "step": 480 }, { "crossentropy": 3.3745031356811523, "epoch": 0.017437645011600927, "grad_norm": 0.07047855108976364, "grad_norm_var": 1.2699885220868179e-05, "learning_rate": 0.008713768115942029, "loss": 3.3413, "step": 481 }, { "crossentropy": 3.160841226577759, "epoch": 0.017473897911832948, "grad_norm": 0.07084584981203079, "grad_norm_var": 1.071170203729895e-05, "learning_rate": 0.008731884057971014, "loss": 3.2279, "step": 482 }, { "crossentropy": 3.274975538253784, "epoch": 0.017510150812064966, "grad_norm": 0.0736788883805275, "grad_norm_var": 1.1139856842686443e-05, "learning_rate": 0.00875, "loss": 3.44, "step": 483 }, { "crossentropy": 3.2495856285095215, "epoch": 0.017546403712296984, "grad_norm": 0.06573770940303802, "grad_norm_var": 1.2362945108902966e-05, "learning_rate": 0.008768115942028986, "loss": 3.2872, "step": 484 }, { "crossentropy": 3.359928846359253, "epoch": 0.017582656612529002, "grad_norm": 0.06295441836118698, "grad_norm_var": 1.3465983157655755e-05, "learning_rate": 0.008786231884057971, "loss": 3.2844, "step": 485 }, { "crossentropy": 3.2792112827301025, "epoch": 0.01761890951276102, "grad_norm": 0.07066478580236435, "grad_norm_var": 1.2630357429427077e-05, "learning_rate": 0.008804347826086956, "loss": 3.3196, "step": 486 }, { "crossentropy": 3.1801726818084717, "epoch": 0.01765516241299304, "grad_norm": 0.06852620840072632, "grad_norm_var": 1.2696271105979336e-05, "learning_rate": 0.008822463768115943, "loss": 3.2376, "step": 487 }, { "crossentropy": 3.296704053878784, "epoch": 0.01769141531322506, "grad_norm": 0.06995571404695511, "grad_norm_var": 1.1980521144179986e-05, "learning_rate": 0.008840579710144928, "loss": 3.2694, "step": 488 }, { "crossentropy": 3.4606738090515137, "epoch": 0.017727668213457077, "grad_norm": 0.0687551274895668, "grad_norm_var": 1.1355965729771082e-05, "learning_rate": 0.008858695652173914, "loss": 3.4604, "step": 489 }, { "crossentropy": 3.11507248878479, "epoch": 0.017763921113689095, "grad_norm": 0.06948113441467285, "grad_norm_var": 8.841742744327096e-06, "learning_rate": 0.008876811594202899, "loss": 3.1751, "step": 490 }, { "crossentropy": 3.3195109367370605, "epoch": 0.017800174013921113, "grad_norm": 0.07127152383327484, "grad_norm_var": 8.772395257673991e-06, "learning_rate": 0.008894927536231884, "loss": 3.3017, "step": 491 }, { "crossentropy": 3.221052646636963, "epoch": 0.01783642691415313, "grad_norm": 0.07178385555744171, "grad_norm_var": 7.3875179882200435e-06, "learning_rate": 0.008913043478260869, "loss": 3.2976, "step": 492 }, { "crossentropy": 3.252774238586426, "epoch": 0.017872679814385152, "grad_norm": 0.06267053633928299, "grad_norm_var": 1.0625936156891808e-05, "learning_rate": 0.008931159420289854, "loss": 3.2836, "step": 493 }, { "crossentropy": 3.21404767036438, "epoch": 0.01790893271461717, "grad_norm": 0.06359795480966568, "grad_norm_var": 1.1463441356783972e-05, "learning_rate": 0.008949275362318841, "loss": 3.1965, "step": 494 }, { "crossentropy": 3.264695644378662, "epoch": 0.017945185614849188, "grad_norm": 0.0695631206035614, "grad_norm_var": 1.0645188186952694e-05, "learning_rate": 0.008967391304347826, "loss": 3.2709, "step": 495 }, { "crossentropy": 3.1985690593719482, "epoch": 0.017981438515081206, "grad_norm": 0.06443258374929428, "grad_norm_var": 1.176316840761545e-05, "learning_rate": 0.008985507246376812, "loss": 3.2684, "step": 496 }, { "crossentropy": 3.302816152572632, "epoch": 0.018017691415313224, "grad_norm": 0.06724197417497635, "grad_norm_var": 1.1520842428657437e-05, "learning_rate": 0.009003623188405797, "loss": 3.4009, "step": 497 }, { "crossentropy": 3.221519708633423, "epoch": 0.018053944315545245, "grad_norm": 0.06415873765945435, "grad_norm_var": 1.1954453132874641e-05, "learning_rate": 0.009021739130434784, "loss": 3.2223, "step": 498 }, { "crossentropy": 3.284346580505371, "epoch": 0.018090197215777263, "grad_norm": 0.07087349146604538, "grad_norm_var": 1.0239713551520267e-05, "learning_rate": 0.009039855072463769, "loss": 3.307, "step": 499 }, { "crossentropy": 3.118818521499634, "epoch": 0.01812645011600928, "grad_norm": 0.06846999377012253, "grad_norm_var": 1.0026290427237832e-05, "learning_rate": 0.009057971014492754, "loss": 3.2857, "step": 500 }, { "crossentropy": 3.228172540664673, "epoch": 0.0181627030162413, "grad_norm": 0.06925363093614578, "grad_norm_var": 8.457451987548918e-06, "learning_rate": 0.009076086956521739, "loss": 3.2512, "step": 501 }, { "crossentropy": 3.3282158374786377, "epoch": 0.018198955916473317, "grad_norm": 0.06447337567806244, "grad_norm_var": 8.79278784825128e-06, "learning_rate": 0.009094202898550726, "loss": 3.2695, "step": 502 }, { "crossentropy": 3.4156723022460938, "epoch": 0.018235208816705338, "grad_norm": 0.06376152485609055, "grad_norm_var": 9.738766336512642e-06, "learning_rate": 0.009112318840579711, "loss": 3.3381, "step": 503 }, { "crossentropy": 3.416674852371216, "epoch": 0.018271461716937356, "grad_norm": 0.08103004842996597, "grad_norm_var": 2.105347406055058e-05, "learning_rate": 0.009130434782608695, "loss": 3.2972, "step": 504 }, { "crossentropy": 3.3074402809143066, "epoch": 0.018307714617169374, "grad_norm": 0.0643564760684967, "grad_norm_var": 2.1923177603916146e-05, "learning_rate": 0.009148550724637682, "loss": 3.3838, "step": 505 }, { "crossentropy": 3.2107787132263184, "epoch": 0.018343967517401392, "grad_norm": 0.06739254295825958, "grad_norm_var": 2.1755850928467426e-05, "learning_rate": 0.009166666666666667, "loss": 3.2584, "step": 506 }, { "crossentropy": 3.321382999420166, "epoch": 0.01838022041763341, "grad_norm": 0.07023452967405319, "grad_norm_var": 2.1339017694949984e-05, "learning_rate": 0.009184782608695652, "loss": 3.2644, "step": 507 }, { "crossentropy": 3.1726431846618652, "epoch": 0.018416473317865428, "grad_norm": 0.061338648200035095, "grad_norm_var": 2.247856744682919e-05, "learning_rate": 0.009202898550724637, "loss": 3.2634, "step": 508 }, { "crossentropy": 3.330549478530884, "epoch": 0.01845272621809745, "grad_norm": 0.06534148007631302, "grad_norm_var": 2.1363704181323605e-05, "learning_rate": 0.009221014492753624, "loss": 3.3079, "step": 509 }, { "crossentropy": 3.223475456237793, "epoch": 0.018488979118329467, "grad_norm": 0.06857087463140488, "grad_norm_var": 2.050770150636705e-05, "learning_rate": 0.00923913043478261, "loss": 3.2416, "step": 510 }, { "crossentropy": 3.269338846206665, "epoch": 0.018525232018561485, "grad_norm": 0.06584212929010391, "grad_norm_var": 2.036477010976696e-05, "learning_rate": 0.009257246376811594, "loss": 3.3002, "step": 511 }, { "crossentropy": 3.1996097564697266, "epoch": 0.018561484918793503, "grad_norm": 0.06748079508543015, "grad_norm_var": 1.9780806060970314e-05, "learning_rate": 0.00927536231884058, "loss": 3.2902, "step": 512 }, { "crossentropy": 3.366356611251831, "epoch": 0.01859773781902552, "grad_norm": 0.06544312834739685, "grad_norm_var": 2.0042238469083116e-05, "learning_rate": 0.009293478260869566, "loss": 3.2585, "step": 513 }, { "crossentropy": 3.328038454055786, "epoch": 0.018633990719257542, "grad_norm": 0.06443372368812561, "grad_norm_var": 1.9928991868064146e-05, "learning_rate": 0.009311594202898552, "loss": 3.3171, "step": 514 }, { "crossentropy": 3.340986967086792, "epoch": 0.01867024361948956, "grad_norm": 0.06613671779632568, "grad_norm_var": 1.9133463844080388e-05, "learning_rate": 0.009329710144927537, "loss": 3.3622, "step": 515 }, { "crossentropy": 3.1906371116638184, "epoch": 0.018706496519721578, "grad_norm": 0.06716199219226837, "grad_norm_var": 1.9001025752200343e-05, "learning_rate": 0.009347826086956522, "loss": 3.2482, "step": 516 }, { "crossentropy": 3.2572977542877197, "epoch": 0.018742749419953596, "grad_norm": 0.06109975278377533, "grad_norm_var": 2.0723370050821506e-05, "learning_rate": 0.009365942028985509, "loss": 3.2832, "step": 517 }, { "crossentropy": 3.161670207977295, "epoch": 0.018779002320185614, "grad_norm": 0.0670626312494278, "grad_norm_var": 2.0440616607734358e-05, "learning_rate": 0.009384057971014492, "loss": 3.2946, "step": 518 }, { "crossentropy": 3.1452040672302246, "epoch": 0.018815255220417635, "grad_norm": 0.14444293081760406, "grad_norm_var": 0.0003960179045393697, "learning_rate": 0.009402173913043477, "loss": 3.243, "step": 519 }, { "crossentropy": 3.360076904296875, "epoch": 0.018851508120649653, "grad_norm": 0.06677348166704178, "grad_norm_var": 0.00039100575628705505, "learning_rate": 0.009420289855072464, "loss": 3.3062, "step": 520 }, { "crossentropy": 3.3733069896698, "epoch": 0.01888776102088167, "grad_norm": 0.07933101803064346, "grad_norm_var": 0.0003921164738802438, "learning_rate": 0.00943840579710145, "loss": 3.247, "step": 521 }, { "crossentropy": 3.371295213699341, "epoch": 0.01892401392111369, "grad_norm": 0.06605290621519089, "grad_norm_var": 0.0003930079235935274, "learning_rate": 0.009456521739130435, "loss": 3.3046, "step": 522 }, { "crossentropy": 3.101473331451416, "epoch": 0.018960266821345707, "grad_norm": 0.06431251764297485, "grad_norm_var": 0.0003963345820307142, "learning_rate": 0.00947463768115942, "loss": 3.137, "step": 523 }, { "crossentropy": 3.2955784797668457, "epoch": 0.018996519721577725, "grad_norm": 0.06500940024852753, "grad_norm_var": 0.00039230055657068656, "learning_rate": 0.009492753623188407, "loss": 3.2403, "step": 524 }, { "crossentropy": 3.3083763122558594, "epoch": 0.019032772621809746, "grad_norm": 0.0657099187374115, "grad_norm_var": 0.0003920049812339501, "learning_rate": 0.009510869565217392, "loss": 3.2357, "step": 525 }, { "crossentropy": 3.128986120223999, "epoch": 0.019069025522041764, "grad_norm": 0.06273305416107178, "grad_norm_var": 0.0003964569798846034, "learning_rate": 0.009528985507246377, "loss": 3.2086, "step": 526 }, { "crossentropy": 3.25179386138916, "epoch": 0.019105278422273782, "grad_norm": 0.06196725368499756, "grad_norm_var": 0.0004001579251221863, "learning_rate": 0.009547101449275362, "loss": 3.2934, "step": 527 }, { "crossentropy": 3.3827857971191406, "epoch": 0.0191415313225058, "grad_norm": 0.06557908654212952, "grad_norm_var": 0.0004012628386262889, "learning_rate": 0.00956521739130435, "loss": 3.2601, "step": 528 }, { "crossentropy": 3.2741410732269287, "epoch": 0.019177784222737818, "grad_norm": 0.06397060304880142, "grad_norm_var": 0.0004024556258395299, "learning_rate": 0.009583333333333334, "loss": 3.251, "step": 529 }, { "crossentropy": 3.1797749996185303, "epoch": 0.01921403712296984, "grad_norm": 0.06505992263555527, "grad_norm_var": 0.0004019539313289057, "learning_rate": 0.00960144927536232, "loss": 3.2191, "step": 530 }, { "crossentropy": 3.2498879432678223, "epoch": 0.019250290023201857, "grad_norm": 0.07533573359251022, "grad_norm_var": 0.0004015535377085794, "learning_rate": 0.009619565217391305, "loss": 3.2264, "step": 531 }, { "crossentropy": 3.2679860591888428, "epoch": 0.019286542923433875, "grad_norm": 0.06650101393461227, "grad_norm_var": 0.00040194994655765753, "learning_rate": 0.00963768115942029, "loss": 3.2316, "step": 532 }, { "crossentropy": 3.3192503452301025, "epoch": 0.019322795823665893, "grad_norm": 0.0646582841873169, "grad_norm_var": 0.0003978974852149058, "learning_rate": 0.009655797101449275, "loss": 3.3397, "step": 533 }, { "crossentropy": 3.145738363265991, "epoch": 0.01935904872389791, "grad_norm": 0.06489859521389008, "grad_norm_var": 0.0003994795385035737, "learning_rate": 0.00967391304347826, "loss": 3.2713, "step": 534 }, { "crossentropy": 3.103012800216675, "epoch": 0.019395301624129932, "grad_norm": 0.06664381921291351, "grad_norm_var": 2.0041707869649975e-05, "learning_rate": 0.009692028985507247, "loss": 3.1691, "step": 535 }, { "crossentropy": 3.316683530807495, "epoch": 0.01943155452436195, "grad_norm": 0.06906645745038986, "grad_norm_var": 2.0443674479036825e-05, "learning_rate": 0.009710144927536232, "loss": 3.3451, "step": 536 }, { "crossentropy": 3.2674198150634766, "epoch": 0.019467807424593968, "grad_norm": 0.06499864906072617, "grad_norm_var": 9.100328620869948e-06, "learning_rate": 0.009728260869565217, "loss": 3.2814, "step": 537 }, { "crossentropy": 3.1434688568115234, "epoch": 0.019504060324825986, "grad_norm": 0.06006074696779251, "grad_norm_var": 1.1127271885544993e-05, "learning_rate": 0.009746376811594203, "loss": 3.1857, "step": 538 }, { "crossentropy": 3.1755878925323486, "epoch": 0.019540313225058004, "grad_norm": 0.06455808877944946, "grad_norm_var": 1.109521873237628e-05, "learning_rate": 0.00976449275362319, "loss": 3.1659, "step": 539 }, { "crossentropy": 3.2023766040802, "epoch": 0.01957656612529002, "grad_norm": 0.06897122412919998, "grad_norm_var": 1.1858314069489636e-05, "learning_rate": 0.009782608695652175, "loss": 3.2368, "step": 540 }, { "crossentropy": 3.2213315963745117, "epoch": 0.019612819025522043, "grad_norm": 0.06480810046195984, "grad_norm_var": 1.1904287176514593e-05, "learning_rate": 0.00980072463768116, "loss": 3.2669, "step": 541 }, { "crossentropy": 3.197927474975586, "epoch": 0.01964907192575406, "grad_norm": 0.06653331965208054, "grad_norm_var": 1.1347555397964864e-05, "learning_rate": 0.009818840579710145, "loss": 3.255, "step": 542 }, { "crossentropy": 3.2378149032592773, "epoch": 0.01968532482598608, "grad_norm": 0.06701715290546417, "grad_norm_var": 1.0326609047924216e-05, "learning_rate": 0.00983695652173913, "loss": 3.2401, "step": 543 }, { "crossentropy": 3.3792037963867188, "epoch": 0.019721577726218097, "grad_norm": 0.09368102252483368, "grad_norm_var": 5.748379027569542e-05, "learning_rate": 0.009855072463768117, "loss": 3.2691, "step": 544 }, { "crossentropy": 3.12640643119812, "epoch": 0.019757830626450115, "grad_norm": 0.0768749862909317, "grad_norm_var": 6.109161810342138e-05, "learning_rate": 0.009873188405797102, "loss": 3.2386, "step": 545 }, { "crossentropy": 3.23068904876709, "epoch": 0.019794083526682136, "grad_norm": 0.0686834305524826, "grad_norm_var": 6.0139479633932134e-05, "learning_rate": 0.009891304347826087, "loss": 3.253, "step": 546 }, { "crossentropy": 3.2185661792755127, "epoch": 0.019830336426914154, "grad_norm": 0.06345617026090622, "grad_norm_var": 5.885407251278151e-05, "learning_rate": 0.009909420289855073, "loss": 3.2, "step": 547 }, { "crossentropy": 3.2029902935028076, "epoch": 0.01986658932714617, "grad_norm": 0.06171520799398422, "grad_norm_var": 6.137812184797859e-05, "learning_rate": 0.009927536231884058, "loss": 3.2479, "step": 548 }, { "crossentropy": 3.3220746517181396, "epoch": 0.01990284222737819, "grad_norm": 0.06571347266435623, "grad_norm_var": 6.0989647218922155e-05, "learning_rate": 0.009945652173913043, "loss": 3.2803, "step": 549 }, { "crossentropy": 3.1647305488586426, "epoch": 0.019939095127610208, "grad_norm": 0.06275252252817154, "grad_norm_var": 6.215922939585076e-05, "learning_rate": 0.00996376811594203, "loss": 3.142, "step": 550 }, { "crossentropy": 3.4042861461639404, "epoch": 0.01997534802784223, "grad_norm": 0.06480416655540466, "grad_norm_var": 6.266560389198127e-05, "learning_rate": 0.009981884057971015, "loss": 3.2981, "step": 551 }, { "crossentropy": 3.143096923828125, "epoch": 0.020011600928074247, "grad_norm": 0.06189797818660736, "grad_norm_var": 6.460079425114525e-05, "learning_rate": 0.01, "loss": 3.2229, "step": 552 }, { "crossentropy": 3.1436960697174072, "epoch": 0.020047853828306265, "grad_norm": 0.06078370660543442, "grad_norm_var": 6.69948788596673e-05, "learning_rate": 0.009999999966233707, "loss": 3.2414, "step": 553 }, { "crossentropy": 3.183739423751831, "epoch": 0.020084106728538283, "grad_norm": 0.06474097073078156, "grad_norm_var": 6.402146767320477e-05, "learning_rate": 0.009999999864934826, "loss": 3.2577, "step": 554 }, { "crossentropy": 3.205310583114624, "epoch": 0.0201203596287703, "grad_norm": 0.07047619670629501, "grad_norm_var": 6.40374322181512e-05, "learning_rate": 0.009999999696103358, "loss": 3.1407, "step": 555 }, { "crossentropy": 3.2447471618652344, "epoch": 0.02015661252900232, "grad_norm": 0.06368948519229889, "grad_norm_var": 6.487296303187724e-05, "learning_rate": 0.009999999459739308, "loss": 3.1925, "step": 556 }, { "crossentropy": 3.22067928314209, "epoch": 0.02019286542923434, "grad_norm": 0.06835933774709702, "grad_norm_var": 6.445675793972649e-05, "learning_rate": 0.009999999155842676, "loss": 3.1898, "step": 557 }, { "crossentropy": 3.100111722946167, "epoch": 0.020229118329466358, "grad_norm": 0.07434767484664917, "grad_norm_var": 6.718928500450208e-05, "learning_rate": 0.00999999878441347, "loss": 3.1584, "step": 558 }, { "crossentropy": 3.3143484592437744, "epoch": 0.020265371229698376, "grad_norm": 0.05939435958862305, "grad_norm_var": 7.188301907973051e-05, "learning_rate": 0.00999999834545169, "loss": 3.2867, "step": 559 }, { "crossentropy": 3.0279414653778076, "epoch": 0.020301624129930394, "grad_norm": 0.06921882182359695, "grad_norm_var": 2.4169666590993326e-05, "learning_rate": 0.009999997838957345, "loss": 3.1109, "step": 560 }, { "crossentropy": 3.2441179752349854, "epoch": 0.02033787703016241, "grad_norm": 0.07192805409431458, "grad_norm_var": 1.8563584498904977e-05, "learning_rate": 0.009999997264930442, "loss": 3.189, "step": 561 }, { "crossentropy": 3.2813987731933594, "epoch": 0.020374129930394433, "grad_norm": 0.0675252303481102, "grad_norm_var": 1.8194052700944383e-05, "learning_rate": 0.009999996623370989, "loss": 3.3038, "step": 562 }, { "crossentropy": 3.3759374618530273, "epoch": 0.02041038283062645, "grad_norm": 0.06749460846185684, "grad_norm_var": 1.801850363687199e-05, "learning_rate": 0.009999995914278992, "loss": 3.3302, "step": 563 }, { "crossentropy": 3.1776320934295654, "epoch": 0.02044663573085847, "grad_norm": 0.07357696443796158, "grad_norm_var": 2.0150131171185054e-05, "learning_rate": 0.009999995137654464, "loss": 3.1149, "step": 564 }, { "crossentropy": 3.348745346069336, "epoch": 0.020482888631090487, "grad_norm": 0.06661145389080048, "grad_norm_var": 2.0086126596263743e-05, "learning_rate": 0.009999994293497412, "loss": 3.3004, "step": 565 }, { "crossentropy": 3.125425100326538, "epoch": 0.020519141531322505, "grad_norm": 0.08438596874475479, "grad_norm_var": 3.787777479132837e-05, "learning_rate": 0.009999993381807851, "loss": 3.2522, "step": 566 }, { "crossentropy": 3.3379316329956055, "epoch": 0.020555394431554526, "grad_norm": 0.06555455923080444, "grad_norm_var": 3.7585494543002083e-05, "learning_rate": 0.009999992402585792, "loss": 3.2967, "step": 567 }, { "crossentropy": 3.282196521759033, "epoch": 0.020591647331786544, "grad_norm": 0.07484105229377747, "grad_norm_var": 3.731102977014382e-05, "learning_rate": 0.009999991355831246, "loss": 3.2157, "step": 568 }, { "crossentropy": 3.189117193222046, "epoch": 0.02062790023201856, "grad_norm": 0.06200915202498436, "grad_norm_var": 3.607334727311658e-05, "learning_rate": 0.00999999024154423, "loss": 3.1823, "step": 569 }, { "crossentropy": 3.3101375102996826, "epoch": 0.02066415313225058, "grad_norm": 0.0638836920261383, "grad_norm_var": 3.660720293236121e-05, "learning_rate": 0.009999989059724759, "loss": 3.2783, "step": 570 }, { "crossentropy": 3.312185525894165, "epoch": 0.020700406032482598, "grad_norm": 0.07171119004487991, "grad_norm_var": 3.695284654770114e-05, "learning_rate": 0.009999987810372847, "loss": 3.2027, "step": 571 }, { "crossentropy": 3.2208151817321777, "epoch": 0.020736658932714615, "grad_norm": 0.06412351876497269, "grad_norm_var": 3.665537227104983e-05, "learning_rate": 0.009999986493488511, "loss": 3.203, "step": 572 }, { "crossentropy": 3.3395769596099854, "epoch": 0.020772911832946637, "grad_norm": 0.06317106634378433, "grad_norm_var": 3.882269784963675e-05, "learning_rate": 0.009999985109071772, "loss": 3.2867, "step": 573 }, { "crossentropy": 3.0388002395629883, "epoch": 0.020809164733178655, "grad_norm": 0.05915345624089241, "grad_norm_var": 4.188321979085493e-05, "learning_rate": 0.009999983657122645, "loss": 3.133, "step": 574 }, { "crossentropy": 3.100248336791992, "epoch": 0.020845417633410673, "grad_norm": 0.06577987968921661, "grad_norm_var": 3.728660345572362e-05, "learning_rate": 0.009999982137641151, "loss": 3.2408, "step": 575 }, { "crossentropy": 3.2795300483703613, "epoch": 0.02088167053364269, "grad_norm": 0.058618493378162384, "grad_norm_var": 4.284912427254262e-05, "learning_rate": 0.00999998055062731, "loss": 3.2508, "step": 576 }, { "crossentropy": 3.1818044185638428, "epoch": 0.02091792343387471, "grad_norm": 0.06367861479520798, "grad_norm_var": 4.2257245512488986e-05, "learning_rate": 0.009999978896081143, "loss": 3.0998, "step": 577 }, { "crossentropy": 3.2379446029663086, "epoch": 0.02095417633410673, "grad_norm": 0.06263900548219681, "grad_norm_var": 4.341210075450829e-05, "learning_rate": 0.009999977174002675, "loss": 3.1763, "step": 578 }, { "crossentropy": 3.223158359527588, "epoch": 0.020990429234338748, "grad_norm": 0.06306331604719162, "grad_norm_var": 4.417109375897098e-05, "learning_rate": 0.009999975384391927, "loss": 3.3255, "step": 579 }, { "crossentropy": 3.2791662216186523, "epoch": 0.021026682134570766, "grad_norm": 0.06120477616786957, "grad_norm_var": 4.1940116484949035e-05, "learning_rate": 0.009999973527248924, "loss": 3.2229, "step": 580 }, { "crossentropy": 3.2758147716522217, "epoch": 0.021062935034802784, "grad_norm": 0.05874580144882202, "grad_norm_var": 4.4800482378753076e-05, "learning_rate": 0.009999971602573692, "loss": 3.2286, "step": 581 }, { "crossentropy": 3.020049810409546, "epoch": 0.0210991879350348, "grad_norm": 0.06639175117015839, "grad_norm_var": 1.8910503292473893e-05, "learning_rate": 0.009999969610366253, "loss": 3.1183, "step": 582 }, { "crossentropy": 3.306903123855591, "epoch": 0.021135440835266823, "grad_norm": 0.06612138450145721, "grad_norm_var": 1.9045383201491546e-05, "learning_rate": 0.009999967550626638, "loss": 3.264, "step": 583 }, { "crossentropy": 3.220360040664673, "epoch": 0.02117169373549884, "grad_norm": 0.0766371563076973, "grad_norm_var": 2.1826223189211345e-05, "learning_rate": 0.009999965423354875, "loss": 3.12, "step": 584 }, { "crossentropy": 3.2629613876342773, "epoch": 0.02120794663573086, "grad_norm": 0.06966302543878555, "grad_norm_var": 2.3268865341847557e-05, "learning_rate": 0.00999996322855099, "loss": 3.2366, "step": 585 }, { "crossentropy": 3.3443071842193604, "epoch": 0.021244199535962877, "grad_norm": 0.06599593907594681, "grad_norm_var": 2.3328620765901825e-05, "learning_rate": 0.009999960966215013, "loss": 3.2072, "step": 586 }, { "crossentropy": 3.171685218811035, "epoch": 0.021280452436194894, "grad_norm": 0.06221204251050949, "grad_norm_var": 2.0206800393097966e-05, "learning_rate": 0.009999958636346975, "loss": 3.1677, "step": 587 }, { "crossentropy": 3.2444639205932617, "epoch": 0.021316705336426916, "grad_norm": 0.0570712685585022, "grad_norm_var": 2.3387059854496476e-05, "learning_rate": 0.00999995623894691, "loss": 3.1953, "step": 588 }, { "crossentropy": 3.1750705242156982, "epoch": 0.021352958236658934, "grad_norm": 0.06160087138414383, "grad_norm_var": 2.366428271881367e-05, "learning_rate": 0.009999953774014848, "loss": 3.1709, "step": 589 }, { "crossentropy": 3.2770228385925293, "epoch": 0.02138921113689095, "grad_norm": 0.0646369680762291, "grad_norm_var": 2.2247930746772486e-05, "learning_rate": 0.009999951241550823, "loss": 3.1867, "step": 590 }, { "crossentropy": 2.9304018020629883, "epoch": 0.02142546403712297, "grad_norm": 0.0631120428442955, "grad_norm_var": 2.2060981976297865e-05, "learning_rate": 0.009999948641554866, "loss": 3.0733, "step": 591 }, { "crossentropy": 3.041409492492676, "epoch": 0.021461716937354988, "grad_norm": 0.06178765371441841, "grad_norm_var": 2.0483588919386192e-05, "learning_rate": 0.009999945974027017, "loss": 3.1224, "step": 592 }, { "crossentropy": 3.2787115573883057, "epoch": 0.021497969837587005, "grad_norm": 0.06760119646787643, "grad_norm_var": 2.1258808138629958e-05, "learning_rate": 0.009999943238967308, "loss": 3.2183, "step": 593 }, { "crossentropy": 3.1323509216308594, "epoch": 0.021534222737819027, "grad_norm": 0.06765486299991608, "grad_norm_var": 2.17335933236098e-05, "learning_rate": 0.00999994043637578, "loss": 3.1609, "step": 594 }, { "crossentropy": 3.2594974040985107, "epoch": 0.021570475638051045, "grad_norm": 0.06472139805555344, "grad_norm_var": 2.156707511967674e-05, "learning_rate": 0.009999937566252469, "loss": 3.2692, "step": 595 }, { "crossentropy": 3.105459213256836, "epoch": 0.021606728538283063, "grad_norm": 0.06459522992372513, "grad_norm_var": 2.0706653765083642e-05, "learning_rate": 0.009999934628597412, "loss": 3.1363, "step": 596 }, { "crossentropy": 3.1545872688293457, "epoch": 0.02164298143851508, "grad_norm": 0.059345897287130356, "grad_norm_var": 2.023600334730887e-05, "learning_rate": 0.009999931623410651, "loss": 3.1082, "step": 597 }, { "crossentropy": 3.2904937267303467, "epoch": 0.0216792343387471, "grad_norm": 0.06638670712709427, "grad_norm_var": 2.0235033146638834e-05, "learning_rate": 0.009999928550692226, "loss": 3.2492, "step": 598 }, { "crossentropy": 3.2726681232452393, "epoch": 0.02171548723897912, "grad_norm": 0.06524449586868286, "grad_norm_var": 2.014572318662251e-05, "learning_rate": 0.009999925410442178, "loss": 3.1391, "step": 599 }, { "crossentropy": 3.2451179027557373, "epoch": 0.021751740139211138, "grad_norm": 0.11377185583114624, "grad_norm_var": 0.0001644876787567621, "learning_rate": 0.00999992220266055, "loss": 3.2922, "step": 600 }, { "crossentropy": 3.1897428035736084, "epoch": 0.021787993039443156, "grad_norm": 0.06323596090078354, "grad_norm_var": 0.00016496949608506655, "learning_rate": 0.009999918927347383, "loss": 3.2327, "step": 601 }, { "crossentropy": 2.9466238021850586, "epoch": 0.021824245939675174, "grad_norm": 0.07216349989175797, "grad_norm_var": 0.0001666767458263561, "learning_rate": 0.009999915584502726, "loss": 3.0971, "step": 602 }, { "crossentropy": 3.2007923126220703, "epoch": 0.02186049883990719, "grad_norm": 0.06604227423667908, "grad_norm_var": 0.00016504817811052337, "learning_rate": 0.009999912174126619, "loss": 3.2481, "step": 603 }, { "crossentropy": 3.2061853408813477, "epoch": 0.021896751740139213, "grad_norm": 0.0635044202208519, "grad_norm_var": 0.00015874458750842017, "learning_rate": 0.009999908696219112, "loss": 3.2065, "step": 604 }, { "crossentropy": 3.0696165561676025, "epoch": 0.02193300464037123, "grad_norm": 0.06804117560386658, "grad_norm_var": 0.0001559812080161258, "learning_rate": 0.009999905150780251, "loss": 3.1425, "step": 605 }, { "crossentropy": 3.301729679107666, "epoch": 0.02196925754060325, "grad_norm": 0.07407893240451813, "grad_norm_var": 0.00015701672261403583, "learning_rate": 0.009999901537810081, "loss": 3.2374, "step": 606 }, { "crossentropy": 3.0625476837158203, "epoch": 0.022005510440835267, "grad_norm": 0.07073163241147995, "grad_norm_var": 0.0001548357423402295, "learning_rate": 0.009999897857308655, "loss": 3.1894, "step": 607 }, { "crossentropy": 3.199037551879883, "epoch": 0.022041763341067284, "grad_norm": 0.06260344386100769, "grad_norm_var": 0.00015405947513635556, "learning_rate": 0.00999989410927602, "loss": 3.1202, "step": 608 }, { "crossentropy": 3.0741114616394043, "epoch": 0.022078016241299302, "grad_norm": 0.05928171053528786, "grad_norm_var": 0.0001603337530082162, "learning_rate": 0.009999890293712229, "loss": 3.0701, "step": 609 }, { "crossentropy": 3.260331869125366, "epoch": 0.022114269141531324, "grad_norm": 0.0639534592628479, "grad_norm_var": 0.00016177379083131895, "learning_rate": 0.00999988641061733, "loss": 3.2184, "step": 610 }, { "crossentropy": 3.2131407260894775, "epoch": 0.02215052204176334, "grad_norm": 0.06702038645744324, "grad_norm_var": 0.00016091325434609282, "learning_rate": 0.009999882459991378, "loss": 3.1976, "step": 611 }, { "crossentropy": 3.155208110809326, "epoch": 0.02218677494199536, "grad_norm": 0.06783188134431839, "grad_norm_var": 0.0001597749640305614, "learning_rate": 0.009999878441834425, "loss": 3.0068, "step": 612 }, { "crossentropy": 3.0319271087646484, "epoch": 0.022223027842227377, "grad_norm": 0.0716361403465271, "grad_norm_var": 0.00015347349486684818, "learning_rate": 0.009999874356146528, "loss": 3.11, "step": 613 }, { "crossentropy": 3.199005603790283, "epoch": 0.022259280742459395, "grad_norm": 0.061219073832035065, "grad_norm_var": 0.00015743956350328607, "learning_rate": 0.009999870202927738, "loss": 3.1856, "step": 614 }, { "crossentropy": 3.112236499786377, "epoch": 0.022295533642691417, "grad_norm": 0.06181376054883003, "grad_norm_var": 0.0001600749092561561, "learning_rate": 0.009999865982178115, "loss": 3.1842, "step": 615 }, { "crossentropy": 3.2334656715393066, "epoch": 0.022331786542923435, "grad_norm": 0.06184909865260124, "grad_norm_var": 1.9883721558617132e-05, "learning_rate": 0.009999861693897714, "loss": 3.2036, "step": 616 }, { "crossentropy": 3.3997602462768555, "epoch": 0.022368039443155453, "grad_norm": 0.0947858914732933, "grad_norm_var": 7.07298534255167e-05, "learning_rate": 0.009999857338086593, "loss": 3.2439, "step": 617 }, { "crossentropy": 3.3054099082946777, "epoch": 0.02240429234338747, "grad_norm": 0.05867037922143936, "grad_norm_var": 7.445611226282219e-05, "learning_rate": 0.009999852914744811, "loss": 3.3337, "step": 618 }, { "crossentropy": 3.164717197418213, "epoch": 0.02244054524361949, "grad_norm": 0.06618062406778336, "grad_norm_var": 7.443841541839019e-05, "learning_rate": 0.009999848423872429, "loss": 3.069, "step": 619 }, { "crossentropy": 3.1502885818481445, "epoch": 0.02247679814385151, "grad_norm": 0.07730723172426224, "grad_norm_var": 7.977432924403804e-05, "learning_rate": 0.009999843865469505, "loss": 3.1629, "step": 620 }, { "crossentropy": 3.159416913986206, "epoch": 0.022513051044083528, "grad_norm": 0.07104211300611496, "grad_norm_var": 8.037854344309271e-05, "learning_rate": 0.009999839239536103, "loss": 3.2456, "step": 621 }, { "crossentropy": 3.24289608001709, "epoch": 0.022549303944315546, "grad_norm": 0.06560228765010834, "grad_norm_var": 7.814054508727957e-05, "learning_rate": 0.009999834546072284, "loss": 3.2215, "step": 622 }, { "crossentropy": 3.3628780841827393, "epoch": 0.022585556844547564, "grad_norm": 0.07119865715503693, "grad_norm_var": 7.834945961516879e-05, "learning_rate": 0.009999829785078112, "loss": 3.2309, "step": 623 }, { "crossentropy": 3.2017149925231934, "epoch": 0.02262180974477958, "grad_norm": 0.06897466629743576, "grad_norm_var": 7.662090082513936e-05, "learning_rate": 0.009999824956553651, "loss": 3.1353, "step": 624 }, { "crossentropy": 3.1081323623657227, "epoch": 0.0226580626450116, "grad_norm": 0.06227541342377663, "grad_norm_var": 7.369188134450406e-05, "learning_rate": 0.009999820060498968, "loss": 3.166, "step": 625 }, { "crossentropy": 3.154702663421631, "epoch": 0.02269431554524362, "grad_norm": 0.06569401174783707, "grad_norm_var": 7.289338008177228e-05, "learning_rate": 0.009999815096914125, "loss": 3.1272, "step": 626 }, { "crossentropy": 3.059826612472534, "epoch": 0.02273056844547564, "grad_norm": 0.0607682503759861, "grad_norm_var": 7.641887908229246e-05, "learning_rate": 0.009999810065799194, "loss": 3.0971, "step": 627 }, { "crossentropy": 3.1246166229248047, "epoch": 0.022766821345707657, "grad_norm": 0.059311170130968094, "grad_norm_var": 8.106584161412285e-05, "learning_rate": 0.00999980496715424, "loss": 3.1943, "step": 628 }, { "crossentropy": 3.1599910259246826, "epoch": 0.022803074245939674, "grad_norm": 0.06402644515037537, "grad_norm_var": 8.038244266906398e-05, "learning_rate": 0.009999799800979332, "loss": 3.1551, "step": 629 }, { "crossentropy": 3.0566160678863525, "epoch": 0.022839327146171692, "grad_norm": 0.06048544496297836, "grad_norm_var": 8.097372375366243e-05, "learning_rate": 0.00999979456727454, "loss": 3.1181, "step": 630 }, { "crossentropy": 3.0840604305267334, "epoch": 0.022875580046403714, "grad_norm": 0.06216483190655708, "grad_norm_var": 8.074455536682117e-05, "learning_rate": 0.009999789266039933, "loss": 3.1291, "step": 631 }, { "crossentropy": 3.2559714317321777, "epoch": 0.02291183294663573, "grad_norm": 0.07008416950702667, "grad_norm_var": 7.944149903083009e-05, "learning_rate": 0.009999783897275589, "loss": 3.2187, "step": 632 }, { "crossentropy": 3.1294286251068115, "epoch": 0.02294808584686775, "grad_norm": 0.06463313102722168, "grad_norm_var": 2.6207561210859348e-05, "learning_rate": 0.009999778460981571, "loss": 3.1221, "step": 633 }, { "crossentropy": 3.0751686096191406, "epoch": 0.022984338747099767, "grad_norm": 0.06258301436901093, "grad_norm_var": 2.358779126095112e-05, "learning_rate": 0.009999772957157963, "loss": 3.0877, "step": 634 }, { "crossentropy": 3.217547655105591, "epoch": 0.023020591647331785, "grad_norm": 0.08360175043344498, "grad_norm_var": 4.3508409423756444e-05, "learning_rate": 0.00999976738580483, "loss": 3.142, "step": 635 }, { "crossentropy": 3.2825427055358887, "epoch": 0.023056844547563807, "grad_norm": 0.06320024281740189, "grad_norm_var": 3.629495324639909e-05, "learning_rate": 0.00999976174692225, "loss": 3.2167, "step": 636 }, { "crossentropy": 3.074920654296875, "epoch": 0.023093097447795825, "grad_norm": 0.07085117697715759, "grad_norm_var": 3.616830507938683e-05, "learning_rate": 0.009999756040510304, "loss": 3.0797, "step": 637 }, { "crossentropy": 3.3367505073547363, "epoch": 0.023129350348027843, "grad_norm": 0.0712052658200264, "grad_norm_var": 3.785873642144684e-05, "learning_rate": 0.009999750266569064, "loss": 3.1915, "step": 638 }, { "crossentropy": 2.8972177505493164, "epoch": 0.02316560324825986, "grad_norm": 0.0647938996553421, "grad_norm_var": 3.625300014208262e-05, "learning_rate": 0.009999744425098609, "loss": 2.989, "step": 639 }, { "crossentropy": 3.1634280681610107, "epoch": 0.02320185614849188, "grad_norm": 0.061346545815467834, "grad_norm_var": 3.677864931319029e-05, "learning_rate": 0.009999738516099017, "loss": 3.1533, "step": 640 }, { "crossentropy": 3.2236125469207764, "epoch": 0.023238109048723896, "grad_norm": 0.06148490309715271, "grad_norm_var": 3.715115738895467e-05, "learning_rate": 0.00999973253957037, "loss": 3.1956, "step": 641 }, { "crossentropy": 3.093712568283081, "epoch": 0.023274361948955918, "grad_norm": 0.06514480710029602, "grad_norm_var": 3.7147720744008274e-05, "learning_rate": 0.009999726495512748, "loss": 3.0994, "step": 642 }, { "crossentropy": 3.0960161685943604, "epoch": 0.023310614849187936, "grad_norm": 0.06026138365268707, "grad_norm_var": 3.7473781955245596e-05, "learning_rate": 0.009999720383926231, "loss": 3.0874, "step": 643 }, { "crossentropy": 3.045644998550415, "epoch": 0.023346867749419953, "grad_norm": 0.05860079824924469, "grad_norm_var": 3.807479948217298e-05, "learning_rate": 0.009999714204810904, "loss": 3.1257, "step": 644 }, { "crossentropy": 3.1303770542144775, "epoch": 0.02338312064965197, "grad_norm": 0.06247532367706299, "grad_norm_var": 3.848427098117237e-05, "learning_rate": 0.009999707958166848, "loss": 3.1481, "step": 645 }, { "crossentropy": 3.135613441467285, "epoch": 0.02341937354988399, "grad_norm": 0.06896071881055832, "grad_norm_var": 3.7666052802307e-05, "learning_rate": 0.00999970164399415, "loss": 3.0808, "step": 646 }, { "crossentropy": 3.0568687915802, "epoch": 0.02345562645011601, "grad_norm": 0.06818880885839462, "grad_norm_var": 3.7084998796808186e-05, "learning_rate": 0.009999695262292893, "loss": 3.0904, "step": 647 }, { "crossentropy": 3.1415648460388184, "epoch": 0.02349187935034803, "grad_norm": 0.06115707382559776, "grad_norm_var": 3.7309845954910416e-05, "learning_rate": 0.009999688813063165, "loss": 3.1602, "step": 648 }, { "crossentropy": 3.0467634201049805, "epoch": 0.023528132250580047, "grad_norm": 0.06393153965473175, "grad_norm_var": 3.742456013779514e-05, "learning_rate": 0.009999682296305053, "loss": 3.0626, "step": 649 }, { "crossentropy": 3.2190897464752197, "epoch": 0.023564385150812064, "grad_norm": 0.06433241814374924, "grad_norm_var": 3.693853940200057e-05, "learning_rate": 0.009999675712018643, "loss": 3.0746, "step": 650 }, { "crossentropy": 3.207329511642456, "epoch": 0.023600638051044082, "grad_norm": 0.06482156366109848, "grad_norm_var": 1.3895258127176863e-05, "learning_rate": 0.009999669060204027, "loss": 3.1656, "step": 651 }, { "crossentropy": 3.2331981658935547, "epoch": 0.023636890951276104, "grad_norm": 0.06209374964237213, "grad_norm_var": 1.4152068586878394e-05, "learning_rate": 0.00999966234086129, "loss": 3.1155, "step": 652 }, { "crossentropy": 3.3104031085968018, "epoch": 0.02367314385150812, "grad_norm": 0.05920703709125519, "grad_norm_var": 1.2537627431012607e-05, "learning_rate": 0.009999655553990528, "loss": 3.1756, "step": 653 }, { "crossentropy": 3.1252493858337402, "epoch": 0.02370939675174014, "grad_norm": 0.0611281655728817, "grad_norm_var": 8.699917938542514e-06, "learning_rate": 0.00999964869959183, "loss": 3.1137, "step": 654 }, { "crossentropy": 3.123387575149536, "epoch": 0.023745649651972157, "grad_norm": 0.06217436119914055, "grad_norm_var": 8.500677599613572e-06, "learning_rate": 0.00999964177766529, "loss": 3.0817, "step": 655 }, { "crossentropy": 3.17832088470459, "epoch": 0.023781902552204175, "grad_norm": 0.06364920735359192, "grad_norm_var": 8.376055571733485e-06, "learning_rate": 0.009999634788211, "loss": 3.2114, "step": 656 }, { "crossentropy": 3.0600855350494385, "epoch": 0.023818155452436193, "grad_norm": 0.06364669650793076, "grad_norm_var": 8.238422121671583e-06, "learning_rate": 0.009999627731229055, "loss": 3.0398, "step": 657 }, { "crossentropy": 3.034425735473633, "epoch": 0.023854408352668215, "grad_norm": 0.09745422005653381, "grad_norm_var": 8.224417118523012e-05, "learning_rate": 0.009999620606719549, "loss": 3.1516, "step": 658 }, { "crossentropy": 3.1766512393951416, "epoch": 0.023890661252900233, "grad_norm": 0.06201084703207016, "grad_norm_var": 8.129975323492248e-05, "learning_rate": 0.009999613414682581, "loss": 3.1257, "step": 659 }, { "crossentropy": 2.983360528945923, "epoch": 0.02392691415313225, "grad_norm": 0.05948062241077423, "grad_norm_var": 8.056934465339952e-05, "learning_rate": 0.009999606155118246, "loss": 3.0455, "step": 660 }, { "crossentropy": 3.072589635848999, "epoch": 0.02396316705336427, "grad_norm": 0.05929773300886154, "grad_norm_var": 8.239484678417789e-05, "learning_rate": 0.009999598828026644, "loss": 3.0439, "step": 661 }, { "crossentropy": 3.159270763397217, "epoch": 0.023999419953596286, "grad_norm": 0.06381157785654068, "grad_norm_var": 8.139856606264589e-05, "learning_rate": 0.00999959143340787, "loss": 3.0425, "step": 662 }, { "crossentropy": 3.1216025352478027, "epoch": 0.024035672853828308, "grad_norm": 0.06818268448114395, "grad_norm_var": 8.139578001264547e-05, "learning_rate": 0.009999583971262029, "loss": 3.1155, "step": 663 }, { "crossentropy": 3.069392681121826, "epoch": 0.024071925754060326, "grad_norm": 0.05595078319311142, "grad_norm_var": 8.560044523393516e-05, "learning_rate": 0.009999576441589218, "loss": 3.1477, "step": 664 }, { "crossentropy": 3.0295047760009766, "epoch": 0.024108178654292343, "grad_norm": 0.06480176001787186, "grad_norm_var": 8.55878131175621e-05, "learning_rate": 0.009999568844389541, "loss": 3.0877, "step": 665 }, { "crossentropy": 3.191793918609619, "epoch": 0.02414443155452436, "grad_norm": 0.06597647070884705, "grad_norm_var": 8.57194148303854e-05, "learning_rate": 0.0099995611796631, "loss": 3.1963, "step": 666 }, { "crossentropy": 3.0811479091644287, "epoch": 0.02418068445475638, "grad_norm": 0.06546717137098312, "grad_norm_var": 8.576406719337584e-05, "learning_rate": 0.009999553447409996, "loss": 3.0699, "step": 667 }, { "crossentropy": 3.1462812423706055, "epoch": 0.0242169373549884, "grad_norm": 0.06292705982923508, "grad_norm_var": 8.552391228887716e-05, "learning_rate": 0.009999545647630337, "loss": 3.1701, "step": 668 }, { "crossentropy": 3.08982515335083, "epoch": 0.02425319025522042, "grad_norm": 0.06261551380157471, "grad_norm_var": 8.37546223347326e-05, "learning_rate": 0.009999537780324226, "loss": 3.0551, "step": 669 }, { "crossentropy": 3.0519418716430664, "epoch": 0.024289443155452436, "grad_norm": 0.062133174389600754, "grad_norm_var": 8.331085517161904e-05, "learning_rate": 0.009999529845491772, "loss": 3.0499, "step": 670 }, { "crossentropy": 3.0909056663513184, "epoch": 0.024325696055684454, "grad_norm": 0.06614769995212555, "grad_norm_var": 8.281451684033532e-05, "learning_rate": 0.00999952184313308, "loss": 3.0649, "step": 671 }, { "crossentropy": 2.958310127258301, "epoch": 0.024361948955916472, "grad_norm": 0.06175444647669792, "grad_norm_var": 8.343626068849456e-05, "learning_rate": 0.009999513773248257, "loss": 3.0129, "step": 672 }, { "crossentropy": 3.163679361343384, "epoch": 0.02439820185614849, "grad_norm": 0.061699673533439636, "grad_norm_var": 8.405142242328111e-05, "learning_rate": 0.009999505635837415, "loss": 3.0779, "step": 673 }, { "crossentropy": 3.1070029735565186, "epoch": 0.02443445475638051, "grad_norm": 0.06682096421718597, "grad_norm_var": 1.007040840699594e-05, "learning_rate": 0.00999949743090066, "loss": 3.1876, "step": 674 }, { "crossentropy": 3.073202610015869, "epoch": 0.02447070765661253, "grad_norm": 0.06375990808010101, "grad_norm_var": 1.0015215737548451e-05, "learning_rate": 0.009999489158438107, "loss": 3.1245, "step": 675 }, { "crossentropy": 3.0082340240478516, "epoch": 0.024506960556844547, "grad_norm": 0.06356590241193771, "grad_norm_var": 9.045040557745937e-06, "learning_rate": 0.009999480818449867, "loss": 2.9715, "step": 676 }, { "crossentropy": 3.143594264984131, "epoch": 0.024543213457076565, "grad_norm": 0.06104160100221634, "grad_norm_var": 8.273818058666255e-06, "learning_rate": 0.00999947241093605, "loss": 3.1632, "step": 677 }, { "crossentropy": 2.93363094329834, "epoch": 0.024579466357308583, "grad_norm": 0.05930423364043236, "grad_norm_var": 9.38098057831112e-06, "learning_rate": 0.009999463935896772, "loss": 3.0632, "step": 678 }, { "crossentropy": 3.0316028594970703, "epoch": 0.024615719257540605, "grad_norm": 0.060423508286476135, "grad_norm_var": 8.05027648564053e-06, "learning_rate": 0.009999455393332144, "loss": 3.0746, "step": 679 }, { "crossentropy": 3.0982120037078857, "epoch": 0.024651972157772623, "grad_norm": 0.06574347615242004, "grad_norm_var": 5.13432730625072e-06, "learning_rate": 0.009999446783242287, "loss": 3.15, "step": 680 }, { "crossentropy": 3.195587635040283, "epoch": 0.02468822505800464, "grad_norm": 0.06256560981273651, "grad_norm_var": 5.024858997759671e-06, "learning_rate": 0.009999438105627314, "loss": 3.1235, "step": 681 }, { "crossentropy": 3.1539394855499268, "epoch": 0.02472447795823666, "grad_norm": 0.06045912951231003, "grad_norm_var": 4.919245192720224e-06, "learning_rate": 0.00999942936048734, "loss": 3.114, "step": 682 }, { "crossentropy": 2.960416793823242, "epoch": 0.024760730858468676, "grad_norm": 0.0620671808719635, "grad_norm_var": 4.478783769716089e-06, "learning_rate": 0.009999420547822489, "loss": 2.9874, "step": 683 }, { "crossentropy": 3.1188156604766846, "epoch": 0.024796983758700698, "grad_norm": 0.06190263479948044, "grad_norm_var": 4.511900975943429e-06, "learning_rate": 0.009999411667632874, "loss": 3.0584, "step": 684 }, { "crossentropy": 3.1330506801605225, "epoch": 0.024833236658932716, "grad_norm": 0.06356937438249588, "grad_norm_var": 4.5675231160663755e-06, "learning_rate": 0.009999402719918617, "loss": 3.1072, "step": 685 }, { "crossentropy": 3.121992826461792, "epoch": 0.024869489559164733, "grad_norm": 0.06827576458454132, "grad_norm_var": 6.473860584311579e-06, "learning_rate": 0.009999393704679843, "loss": 3.1942, "step": 686 }, { "crossentropy": 3.2036025524139404, "epoch": 0.02490574245939675, "grad_norm": 0.05782178044319153, "grad_norm_var": 7.388483990172778e-06, "learning_rate": 0.009999384621916668, "loss": 3.1506, "step": 687 }, { "crossentropy": 3.0748789310455322, "epoch": 0.02494199535962877, "grad_norm": 0.05964336916804314, "grad_norm_var": 7.890517954883824e-06, "learning_rate": 0.009999375471629216, "loss": 3.0968, "step": 688 }, { "crossentropy": 2.9978983402252197, "epoch": 0.024978248259860787, "grad_norm": 0.07436281442642212, "grad_norm_var": 1.6702399059610278e-05, "learning_rate": 0.009999366253817612, "loss": 3.0334, "step": 689 }, { "crossentropy": 2.9775469303131104, "epoch": 0.02501450116009281, "grad_norm": 0.05806899443268776, "grad_norm_var": 1.727358161328531e-05, "learning_rate": 0.009999356968481981, "loss": 3.0529, "step": 690 }, { "crossentropy": 3.2193102836608887, "epoch": 0.025050754060324826, "grad_norm": 0.057532936334609985, "grad_norm_var": 1.8784610316813626e-05, "learning_rate": 0.009999347615622446, "loss": 3.1196, "step": 691 }, { "crossentropy": 3.136455774307251, "epoch": 0.025087006960556844, "grad_norm": 0.09357401728630066, "grad_norm_var": 8.024297752388289e-05, "learning_rate": 0.009999338195239136, "loss": 3.192, "step": 692 }, { "crossentropy": 3.218092918395996, "epoch": 0.025123259860788862, "grad_norm": 0.07325505465269089, "grad_norm_var": 8.450853570500095e-05, "learning_rate": 0.009999328707332176, "loss": 3.1688, "step": 693 }, { "crossentropy": 3.236100196838379, "epoch": 0.02515951276102088, "grad_norm": 0.06066981330513954, "grad_norm_var": 8.360429103544254e-05, "learning_rate": 0.009999319151901694, "loss": 3.0777, "step": 694 }, { "crossentropy": 3.1052982807159424, "epoch": 0.0251957656612529, "grad_norm": 0.06107566878199577, "grad_norm_var": 8.323327626568826e-05, "learning_rate": 0.009999309528947823, "loss": 3.0511, "step": 695 }, { "crossentropy": 3.1213912963867188, "epoch": 0.02523201856148492, "grad_norm": 0.061442781239748, "grad_norm_var": 8.398400601237749e-05, "learning_rate": 0.009999299838470688, "loss": 3.0716, "step": 696 }, { "crossentropy": 3.1745145320892334, "epoch": 0.025268271461716937, "grad_norm": 0.061686720699071884, "grad_norm_var": 8.429036357950615e-05, "learning_rate": 0.009999290080470423, "loss": 3.1916, "step": 697 }, { "crossentropy": 3.242690086364746, "epoch": 0.025304524361948955, "grad_norm": 0.06345046311616898, "grad_norm_var": 8.315298473977051e-05, "learning_rate": 0.009999280254947158, "loss": 3.2012, "step": 698 }, { "crossentropy": 2.9860401153564453, "epoch": 0.025340777262180973, "grad_norm": 0.05723074451088905, "grad_norm_var": 8.644167067254159e-05, "learning_rate": 0.009999270361901026, "loss": 3.0111, "step": 699 }, { "crossentropy": 2.981092691421509, "epoch": 0.025377030162412995, "grad_norm": 0.058808136731386185, "grad_norm_var": 8.815214186657095e-05, "learning_rate": 0.009999260401332161, "loss": 3.0288, "step": 700 }, { "crossentropy": 3.009434223175049, "epoch": 0.025413283062645012, "grad_norm": 0.05980432778596878, "grad_norm_var": 8.945724172556938e-05, "learning_rate": 0.0099992503732407, "loss": 3.0376, "step": 701 }, { "crossentropy": 2.8777763843536377, "epoch": 0.02544953596287703, "grad_norm": 0.06469990313053131, "grad_norm_var": 8.829836805441064e-05, "learning_rate": 0.009999240277626774, "loss": 3.0028, "step": 702 }, { "crossentropy": 3.01361083984375, "epoch": 0.02548578886310905, "grad_norm": 0.06301222741603851, "grad_norm_var": 8.574420590603075e-05, "learning_rate": 0.00999923011449052, "loss": 3.0354, "step": 703 }, { "crossentropy": 2.9836790561676025, "epoch": 0.025522041763341066, "grad_norm": 0.06027335301041603, "grad_norm_var": 8.538039450251717e-05, "learning_rate": 0.00999921988383208, "loss": 3.0729, "step": 704 }, { "crossentropy": 3.0363194942474365, "epoch": 0.025558294663573084, "grad_norm": 0.059005849063396454, "grad_norm_var": 7.953453096473476e-05, "learning_rate": 0.009999209585651588, "loss": 3.0433, "step": 705 }, { "crossentropy": 2.9643685817718506, "epoch": 0.025594547563805106, "grad_norm": 0.05819394811987877, "grad_norm_var": 7.944753203508343e-05, "learning_rate": 0.009999199219949182, "loss": 2.9787, "step": 706 }, { "crossentropy": 3.107799768447876, "epoch": 0.025630800464037123, "grad_norm": 0.06269856542348862, "grad_norm_var": 7.710376792511575e-05, "learning_rate": 0.009999188786725007, "loss": 3.1536, "step": 707 }, { "crossentropy": 3.0847420692443848, "epoch": 0.02566705336426914, "grad_norm": 0.057833656668663025, "grad_norm_var": 1.4483678451974551e-05, "learning_rate": 0.009999178285979198, "loss": 3.1891, "step": 708 }, { "crossentropy": 3.047227382659912, "epoch": 0.02570330626450116, "grad_norm": 0.07117146253585815, "grad_norm_var": 1.1474403310690375e-05, "learning_rate": 0.009999167717711902, "loss": 3.0485, "step": 709 }, { "crossentropy": 3.0658700466156006, "epoch": 0.025739559164733177, "grad_norm": 0.06124090775847435, "grad_norm_var": 1.1445575413289907e-05, "learning_rate": 0.009999157081923258, "loss": 3.1438, "step": 710 }, { "crossentropy": 3.013218879699707, "epoch": 0.0257758120649652, "grad_norm": 0.05680357292294502, "grad_norm_var": 1.2743535482909878e-05, "learning_rate": 0.009999146378613412, "loss": 3.0598, "step": 711 }, { "crossentropy": 3.0660245418548584, "epoch": 0.025812064965197216, "grad_norm": 0.059306517243385315, "grad_norm_var": 1.2926793114418668e-05, "learning_rate": 0.009999135607782508, "loss": 3.1429, "step": 712 }, { "crossentropy": 3.0770816802978516, "epoch": 0.025848317865429234, "grad_norm": 0.06119045987725258, "grad_norm_var": 1.2893522056622343e-05, "learning_rate": 0.00999912476943069, "loss": 3.0825, "step": 713 }, { "crossentropy": 3.1234633922576904, "epoch": 0.025884570765661252, "grad_norm": 0.06340350210666656, "grad_norm_var": 1.2877817079156618e-05, "learning_rate": 0.009999113863558104, "loss": 3.07, "step": 714 }, { "crossentropy": 2.944788932800293, "epoch": 0.02592082366589327, "grad_norm": 0.06096170097589493, "grad_norm_var": 1.1913891872729574e-05, "learning_rate": 0.009999102890164901, "loss": 3.0832, "step": 715 }, { "crossentropy": 3.021115779876709, "epoch": 0.02595707656612529, "grad_norm": 0.059511277824640274, "grad_norm_var": 1.1725190220575564e-05, "learning_rate": 0.009999091849251228, "loss": 3.0021, "step": 716 }, { "crossentropy": 3.2153847217559814, "epoch": 0.02599332946635731, "grad_norm": 0.061616018414497375, "grad_norm_var": 1.159453248783989e-05, "learning_rate": 0.009999080740817233, "loss": 3.0753, "step": 717 }, { "crossentropy": 3.055194616317749, "epoch": 0.026029582366589327, "grad_norm": 0.059627238661050797, "grad_norm_var": 1.0908431619077672e-05, "learning_rate": 0.009999069564863065, "loss": 3.167, "step": 718 }, { "crossentropy": 3.3513810634613037, "epoch": 0.026065835266821345, "grad_norm": 0.06064898148179054, "grad_norm_var": 1.0620489084226663e-05, "learning_rate": 0.009999058321388877, "loss": 3.204, "step": 719 }, { "crossentropy": 3.0986733436584473, "epoch": 0.026102088167053363, "grad_norm": 0.06470068544149399, "grad_norm_var": 1.1509336182293806e-05, "learning_rate": 0.00999904701039482, "loss": 3.1486, "step": 720 }, { "crossentropy": 3.101132392883301, "epoch": 0.026138341067285385, "grad_norm": 0.05979052186012268, "grad_norm_var": 1.1326666228428777e-05, "learning_rate": 0.009999035631881046, "loss": 3.1099, "step": 721 }, { "crossentropy": 3.0574779510498047, "epoch": 0.026174593967517402, "grad_norm": 0.06340288370847702, "grad_norm_var": 1.0956448489804483e-05, "learning_rate": 0.00999902418584771, "loss": 3.0326, "step": 722 }, { "crossentropy": 2.9886322021484375, "epoch": 0.02621084686774942, "grad_norm": 0.06217657029628754, "grad_norm_var": 1.0889658645969977e-05, "learning_rate": 0.009999012672294964, "loss": 3.0294, "step": 723 }, { "crossentropy": 3.147138833999634, "epoch": 0.02624709976798144, "grad_norm": 0.06280377507209778, "grad_norm_var": 1.0029349247649376e-05, "learning_rate": 0.009999001091222967, "loss": 3.1628, "step": 724 }, { "crossentropy": 3.0884761810302734, "epoch": 0.026283352668213456, "grad_norm": 0.05942694470286369, "grad_norm_var": 3.9316503333487626e-06, "learning_rate": 0.009998989442631875, "loss": 3.1262, "step": 725 }, { "crossentropy": 3.1720495223999023, "epoch": 0.026319605568445474, "grad_norm": 0.05918183922767639, "grad_norm_var": 4.1409897942580595e-06, "learning_rate": 0.009998977726521845, "loss": 3.1386, "step": 726 }, { "crossentropy": 2.9733726978302, "epoch": 0.026355858468677495, "grad_norm": 0.0625188872218132, "grad_norm_var": 3.053629063548037e-06, "learning_rate": 0.009998965942893033, "loss": 3.0022, "step": 727 }, { "crossentropy": 3.1214206218719482, "epoch": 0.026392111368909513, "grad_norm": 0.06108885258436203, "grad_norm_var": 2.7863379780496482e-06, "learning_rate": 0.0099989540917456, "loss": 3.1037, "step": 728 }, { "crossentropy": 3.1085829734802246, "epoch": 0.02642836426914153, "grad_norm": 0.06061914563179016, "grad_norm_var": 2.8210340743397194e-06, "learning_rate": 0.009998942173079705, "loss": 3.1348, "step": 729 }, { "crossentropy": 3.008488178253174, "epoch": 0.02646461716937355, "grad_norm": 0.0660037100315094, "grad_norm_var": 3.958164976517834e-06, "learning_rate": 0.009998930186895513, "loss": 3.0508, "step": 730 }, { "crossentropy": 2.821611166000366, "epoch": 0.026500870069605567, "grad_norm": 0.08123109489679337, "grad_norm_var": 2.8168036652861012e-05, "learning_rate": 0.00999891813319318, "loss": 2.9927, "step": 731 }, { "crossentropy": 3.205559015274048, "epoch": 0.02653712296983759, "grad_norm": 0.06555937230587006, "grad_norm_var": 2.7824944721206666e-05, "learning_rate": 0.00999890601197287, "loss": 3.1337, "step": 732 }, { "crossentropy": 3.1119801998138428, "epoch": 0.026573375870069606, "grad_norm": 0.06338720768690109, "grad_norm_var": 2.7658802630797266e-05, "learning_rate": 0.009998893823234751, "loss": 2.9495, "step": 733 }, { "crossentropy": 3.1255218982696533, "epoch": 0.026609628770301624, "grad_norm": 0.06144316866993904, "grad_norm_var": 2.6985207346609934e-05, "learning_rate": 0.009998881566978984, "loss": 3.0614, "step": 734 }, { "crossentropy": 3.0158417224884033, "epoch": 0.026645881670533642, "grad_norm": 0.0672612115740776, "grad_norm_var": 2.7315366386293275e-05, "learning_rate": 0.009998869243205736, "loss": 3.0071, "step": 735 }, { "crossentropy": 3.029970645904541, "epoch": 0.02668213457076566, "grad_norm": 0.0633026733994484, "grad_norm_var": 2.726725140385055e-05, "learning_rate": 0.00999885685191517, "loss": 3.046, "step": 736 }, { "crossentropy": 3.12333345413208, "epoch": 0.02671838747099768, "grad_norm": 0.05790822580456734, "grad_norm_var": 2.8469830425323305e-05, "learning_rate": 0.00999884439310746, "loss": 3.1086, "step": 737 }, { "crossentropy": 3.060638427734375, "epoch": 0.0267546403712297, "grad_norm": 0.06353557854890823, "grad_norm_var": 2.8467757939610167e-05, "learning_rate": 0.009998831866782769, "loss": 3.0472, "step": 738 }, { "crossentropy": 2.936352491378784, "epoch": 0.026790893271461717, "grad_norm": 0.0640573501586914, "grad_norm_var": 2.8334265138539074e-05, "learning_rate": 0.009998819272941266, "loss": 2.9208, "step": 739 }, { "crossentropy": 2.996821641921997, "epoch": 0.026827146171693735, "grad_norm": 0.06672947108745575, "grad_norm_var": 2.8824129312844612e-05, "learning_rate": 0.009998806611583125, "loss": 3.0404, "step": 740 }, { "crossentropy": 3.082564115524292, "epoch": 0.026863399071925753, "grad_norm": 0.060067206621170044, "grad_norm_var": 2.8463332897471493e-05, "learning_rate": 0.009998793882708513, "loss": 3.1199, "step": 741 }, { "crossentropy": 3.0745227336883545, "epoch": 0.02689965197215777, "grad_norm": 0.061467546969652176, "grad_norm_var": 2.7323474070417053e-05, "learning_rate": 0.009998781086317606, "loss": 3.0926, "step": 742 }, { "crossentropy": 3.110471487045288, "epoch": 0.026935904872389792, "grad_norm": 0.06603724509477615, "grad_norm_var": 2.7338402943810566e-05, "learning_rate": 0.009998768222410572, "loss": 3.0752, "step": 743 }, { "crossentropy": 3.11826753616333, "epoch": 0.02697215777262181, "grad_norm": 0.05574227124452591, "grad_norm_var": 3.145423600151986e-05, "learning_rate": 0.00999875529098759, "loss": 3.0772, "step": 744 }, { "crossentropy": 3.058439016342163, "epoch": 0.02700841067285383, "grad_norm": 0.05802452564239502, "grad_norm_var": 3.3052214874963676e-05, "learning_rate": 0.009998742292048828, "loss": 3.0582, "step": 745 }, { "crossentropy": 3.060356378555298, "epoch": 0.027044663573085846, "grad_norm": 0.05853678658604622, "grad_norm_var": 3.440250998127924e-05, "learning_rate": 0.009998729225594468, "loss": 3.0765, "step": 746 }, { "crossentropy": 3.1048996448516846, "epoch": 0.027080916473317864, "grad_norm": 0.061466749757528305, "grad_norm_var": 1.1809558715262549e-05, "learning_rate": 0.009998716091624685, "loss": 2.9714, "step": 747 }, { "crossentropy": 3.076960563659668, "epoch": 0.027117169373549885, "grad_norm": 0.059854328632354736, "grad_norm_var": 1.125638160241049e-05, "learning_rate": 0.009998702890139653, "loss": 3.1036, "step": 748 }, { "crossentropy": 3.0027482509613037, "epoch": 0.027153422273781903, "grad_norm": 0.059876274317502975, "grad_norm_var": 1.1284417129744002e-05, "learning_rate": 0.009998689621139555, "loss": 3.0043, "step": 749 }, { "crossentropy": 3.1255850791931152, "epoch": 0.02718967517401392, "grad_norm": 0.08675537258386612, "grad_norm_var": 5.086038839081045e-05, "learning_rate": 0.009998676284624567, "loss": 3.142, "step": 750 }, { "crossentropy": 2.980839490890503, "epoch": 0.02722592807424594, "grad_norm": 0.05653032660484314, "grad_norm_var": 5.219504847810876e-05, "learning_rate": 0.009998662880594868, "loss": 3.0377, "step": 751 }, { "crossentropy": 3.0628106594085693, "epoch": 0.027262180974477957, "grad_norm": 0.05823187902569771, "grad_norm_var": 5.325484942366894e-05, "learning_rate": 0.009998649409050642, "loss": 3.044, "step": 752 }, { "crossentropy": 2.940164804458618, "epoch": 0.02729843387470998, "grad_norm": 0.07832581549882889, "grad_norm_var": 6.769049316186117e-05, "learning_rate": 0.00999863586999207, "loss": 2.9815, "step": 753 }, { "crossentropy": 3.1055946350097656, "epoch": 0.027334686774941996, "grad_norm": 0.05795340612530708, "grad_norm_var": 6.957614006404059e-05, "learning_rate": 0.009998622263419337, "loss": 3.0931, "step": 754 }, { "crossentropy": 3.048733711242676, "epoch": 0.027370939675174014, "grad_norm": 0.06456825137138367, "grad_norm_var": 6.965742787718605e-05, "learning_rate": 0.009998608589332623, "loss": 3.1034, "step": 755 }, { "crossentropy": 3.167865753173828, "epoch": 0.027407192575406032, "grad_norm": 0.058875929564237595, "grad_norm_var": 6.974888768167999e-05, "learning_rate": 0.009998594847732115, "loss": 3.1288, "step": 756 }, { "crossentropy": 2.9428515434265137, "epoch": 0.02744344547563805, "grad_norm": 0.06542471051216125, "grad_norm_var": 6.970168224526835e-05, "learning_rate": 0.009998581038617997, "loss": 2.9622, "step": 757 }, { "crossentropy": 3.0264594554901123, "epoch": 0.027479698375870068, "grad_norm": 0.05959624797105789, "grad_norm_var": 7.02977753486098e-05, "learning_rate": 0.009998567161990457, "loss": 3.0636, "step": 758 }, { "crossentropy": 3.086542844772339, "epoch": 0.02751595127610209, "grad_norm": 0.057463567703962326, "grad_norm_var": 7.126279842835271e-05, "learning_rate": 0.009998553217849681, "loss": 3.1195, "step": 759 }, { "crossentropy": 3.135301351547241, "epoch": 0.027552204176334107, "grad_norm": 0.056254349648952484, "grad_norm_var": 7.082962484022166e-05, "learning_rate": 0.00999853920619586, "loss": 3.0561, "step": 760 }, { "crossentropy": 3.06524920463562, "epoch": 0.027588457076566125, "grad_norm": 0.058231983333826065, "grad_norm_var": 7.071242822396887e-05, "learning_rate": 0.00999852512702918, "loss": 3.0429, "step": 761 }, { "crossentropy": 2.8224973678588867, "epoch": 0.027624709976798143, "grad_norm": 0.058367110788822174, "grad_norm_var": 7.080098479722013e-05, "learning_rate": 0.009998510980349833, "loss": 2.9689, "step": 762 }, { "crossentropy": 3.0874831676483154, "epoch": 0.02766096287703016, "grad_norm": 0.05590413138270378, "grad_norm_var": 7.339816877296775e-05, "learning_rate": 0.00999849676615801, "loss": 3.0107, "step": 763 }, { "crossentropy": 2.9512240886688232, "epoch": 0.027697215777262182, "grad_norm": 0.06508902460336685, "grad_norm_var": 7.360388328756975e-05, "learning_rate": 0.009998482484453902, "loss": 3.0091, "step": 764 }, { "crossentropy": 2.979914665222168, "epoch": 0.0277334686774942, "grad_norm": 0.059425126761198044, "grad_norm_var": 7.3764836188427e-05, "learning_rate": 0.009998468135237705, "loss": 3.0999, "step": 765 }, { "crossentropy": 3.0868093967437744, "epoch": 0.027769721577726218, "grad_norm": 0.0630580484867096, "grad_norm_var": 3.163123148309839e-05, "learning_rate": 0.009998453718509607, "loss": 3.0703, "step": 766 }, { "crossentropy": 2.9243147373199463, "epoch": 0.027805974477958236, "grad_norm": 0.07035291194915771, "grad_norm_var": 3.564608276263355e-05, "learning_rate": 0.009998439234269808, "loss": 3.0421, "step": 767 }, { "crossentropy": 2.9669668674468994, "epoch": 0.027842227378190254, "grad_norm": 0.05932963266968727, "grad_norm_var": 3.521448922654818e-05, "learning_rate": 0.0099984246825185, "loss": 3.0144, "step": 768 }, { "crossentropy": 3.1019253730773926, "epoch": 0.027878480278422275, "grad_norm": 0.06487329304218292, "grad_norm_var": 1.6818290191111288e-05, "learning_rate": 0.009998410063255882, "loss": 3.0443, "step": 769 }, { "crossentropy": 2.857086658477783, "epoch": 0.027914733178654293, "grad_norm": 0.05733237788081169, "grad_norm_var": 1.7088287083799556e-05, "learning_rate": 0.009998395376482153, "loss": 2.9746, "step": 770 }, { "crossentropy": 3.074039936065674, "epoch": 0.02795098607888631, "grad_norm": 0.05773630365729332, "grad_norm_var": 1.664957832000491e-05, "learning_rate": 0.009998380622197506, "loss": 3.0715, "step": 771 }, { "crossentropy": 2.9978129863739014, "epoch": 0.02798723897911833, "grad_norm": 0.06221301108598709, "grad_norm_var": 1.6642020789346157e-05, "learning_rate": 0.009998365800402146, "loss": 3.0134, "step": 772 }, { "crossentropy": 2.987618923187256, "epoch": 0.028023491879350347, "grad_norm": 0.06026940047740936, "grad_norm_var": 1.50319009788545e-05, "learning_rate": 0.00999835091109627, "loss": 2.9556, "step": 773 }, { "crossentropy": 3.053683280944824, "epoch": 0.028059744779582365, "grad_norm": 0.05851570889353752, "grad_norm_var": 1.5212536470216956e-05, "learning_rate": 0.009998335954280077, "loss": 3.0202, "step": 774 }, { "crossentropy": 2.9492926597595215, "epoch": 0.028095997679814386, "grad_norm": 0.05833932012319565, "grad_norm_var": 1.4932071241024041e-05, "learning_rate": 0.009998320929953775, "loss": 2.9932, "step": 775 }, { "crossentropy": 3.1242713928222656, "epoch": 0.028132250580046404, "grad_norm": 0.07064994424581528, "grad_norm_var": 2.0059880295341242e-05, "learning_rate": 0.009998305838117562, "loss": 3.0933, "step": 776 }, { "crossentropy": 2.944582939147949, "epoch": 0.028168503480278422, "grad_norm": 0.056754469871520996, "grad_norm_var": 2.0787025580729784e-05, "learning_rate": 0.009998290678771645, "loss": 3.0389, "step": 777 }, { "crossentropy": 3.0559747219085693, "epoch": 0.02820475638051044, "grad_norm": 0.05695182830095291, "grad_norm_var": 2.143511482710428e-05, "learning_rate": 0.009998275451916226, "loss": 3.1043, "step": 778 }, { "crossentropy": 3.2383463382720947, "epoch": 0.028241009280742458, "grad_norm": 0.056304994970560074, "grad_norm_var": 2.1170137466801816e-05, "learning_rate": 0.00999826015755151, "loss": 3.0759, "step": 779 }, { "crossentropy": 3.1132519245147705, "epoch": 0.02827726218097448, "grad_norm": 0.05799183249473572, "grad_norm_var": 2.051955822214823e-05, "learning_rate": 0.00999824479567771, "loss": 3.0997, "step": 780 }, { "crossentropy": 3.0877130031585693, "epoch": 0.028313515081206497, "grad_norm": 0.05832294747233391, "grad_norm_var": 2.0772715224096765e-05, "learning_rate": 0.009998229366295025, "loss": 3.0503, "step": 781 }, { "crossentropy": 2.902621269226074, "epoch": 0.028349767981438515, "grad_norm": 0.06372907012701035, "grad_norm_var": 2.1024154931020833e-05, "learning_rate": 0.00999821386940367, "loss": 2.9559, "step": 782 }, { "crossentropy": 3.055917978286743, "epoch": 0.028386020881670533, "grad_norm": 0.06072783097624779, "grad_norm_var": 1.4303326911219104e-05, "learning_rate": 0.00999819830500385, "loss": 3.0449, "step": 783 }, { "crossentropy": 3.073991060256958, "epoch": 0.02842227378190255, "grad_norm": 0.05719171091914177, "grad_norm_var": 1.4780836307634546e-05, "learning_rate": 0.009998182673095777, "loss": 3.0675, "step": 784 }, { "crossentropy": 3.042163372039795, "epoch": 0.028458526682134572, "grad_norm": 0.057607006281614304, "grad_norm_var": 1.32324212728648e-05, "learning_rate": 0.00999816697367966, "loss": 3.0459, "step": 785 }, { "crossentropy": 2.984884023666382, "epoch": 0.02849477958236659, "grad_norm": 0.05655231326818466, "grad_norm_var": 1.3487048639428065e-05, "learning_rate": 0.009998151206755717, "loss": 3.0552, "step": 786 }, { "crossentropy": 3.0398900508880615, "epoch": 0.028531032482598608, "grad_norm": 0.05980037525296211, "grad_norm_var": 1.3304785997234e-05, "learning_rate": 0.009998135372324154, "loss": 3.0233, "step": 787 }, { "crossentropy": 3.0362298488616943, "epoch": 0.028567285382830626, "grad_norm": 0.05514860898256302, "grad_norm_var": 1.3863851191789286e-05, "learning_rate": 0.009998119470385189, "loss": 3.1215, "step": 788 }, { "crossentropy": 2.894001007080078, "epoch": 0.028603538283062644, "grad_norm": 0.06411325931549072, "grad_norm_var": 1.5410427355373677e-05, "learning_rate": 0.009998103500939035, "loss": 2.9919, "step": 789 }, { "crossentropy": 3.099130868911743, "epoch": 0.028639791183294662, "grad_norm": 0.05886482819914818, "grad_norm_var": 1.5381824346962352e-05, "learning_rate": 0.00999808746398591, "loss": 3.0036, "step": 790 }, { "crossentropy": 3.2743353843688965, "epoch": 0.028676044083526683, "grad_norm": 0.07472538203001022, "grad_norm_var": 3.0030177907718463e-05, "learning_rate": 0.009998071359526028, "loss": 3.1868, "step": 791 }, { "crossentropy": 3.103536605834961, "epoch": 0.0287122969837587, "grad_norm": 0.05909156799316406, "grad_norm_var": 2.2490756635560175e-05, "learning_rate": 0.009998055187559607, "loss": 3.0481, "step": 792 }, { "crossentropy": 3.086066246032715, "epoch": 0.02874854988399072, "grad_norm": 0.05920225381851196, "grad_norm_var": 2.1930864224576202e-05, "learning_rate": 0.009998038948086867, "loss": 3.0224, "step": 793 }, { "crossentropy": 3.1531362533569336, "epoch": 0.028784802784222737, "grad_norm": 0.07039190083742142, "grad_norm_var": 2.8169744376843087e-05, "learning_rate": 0.009998022641108025, "loss": 3.1627, "step": 794 }, { "crossentropy": 2.9533486366271973, "epoch": 0.028821055684454755, "grad_norm": 0.06335605680942535, "grad_norm_var": 2.7229426544189465e-05, "learning_rate": 0.009998006266623304, "loss": 3.0755, "step": 795 }, { "crossentropy": 3.077626943588257, "epoch": 0.028857308584686776, "grad_norm": 0.057416707277297974, "grad_norm_var": 2.748469138593885e-05, "learning_rate": 0.00999798982463292, "loss": 3.0341, "step": 796 }, { "crossentropy": 3.0166141986846924, "epoch": 0.028893561484918794, "grad_norm": 0.05669049173593521, "grad_norm_var": 2.8237227334275358e-05, "learning_rate": 0.009997973315137102, "loss": 3.0925, "step": 797 }, { "crossentropy": 3.0392632484436035, "epoch": 0.028929814385150812, "grad_norm": 0.05795282498002052, "grad_norm_var": 2.8153764461715433e-05, "learning_rate": 0.009997956738136068, "loss": 3.0506, "step": 798 }, { "crossentropy": 3.0604610443115234, "epoch": 0.02896606728538283, "grad_norm": 0.058579374104738235, "grad_norm_var": 2.8391907470788855e-05, "learning_rate": 0.009997940093630045, "loss": 3.081, "step": 799 }, { "crossentropy": 3.0102386474609375, "epoch": 0.029002320185614848, "grad_norm": 0.06607726961374283, "grad_norm_var": 2.9504409268176804e-05, "learning_rate": 0.009997923381619256, "loss": 3.0966, "step": 800 }, { "crossentropy": 2.9966323375701904, "epoch": 0.02903857308584687, "grad_norm": 0.07399126142263412, "grad_norm_var": 3.892860463935444e-05, "learning_rate": 0.009997906602103928, "loss": 3.0042, "step": 801 }, { "crossentropy": 2.9498300552368164, "epoch": 0.029074825986078887, "grad_norm": 0.060815174132585526, "grad_norm_var": 3.696960668516652e-05, "learning_rate": 0.009997889755084284, "loss": 3.0421, "step": 802 }, { "crossentropy": 3.0307087898254395, "epoch": 0.029111078886310905, "grad_norm": 0.06557436287403107, "grad_norm_var": 3.7156952119484174e-05, "learning_rate": 0.009997872840560557, "loss": 2.9809, "step": 803 }, { "crossentropy": 3.0976853370666504, "epoch": 0.029147331786542923, "grad_norm": 0.0661657378077507, "grad_norm_var": 3.376137071997738e-05, "learning_rate": 0.009997855858532972, "loss": 3.0835, "step": 804 }, { "crossentropy": 3.0416414737701416, "epoch": 0.02918358468677494, "grad_norm": 0.05834650993347168, "grad_norm_var": 3.522453556759226e-05, "learning_rate": 0.009997838809001758, "loss": 3.048, "step": 805 }, { "crossentropy": 3.062837839126587, "epoch": 0.02921983758700696, "grad_norm": 0.05900814011693001, "grad_norm_var": 3.5147708899182134e-05, "learning_rate": 0.009997821691967147, "loss": 3.0792, "step": 806 }, { "crossentropy": 2.9421684741973877, "epoch": 0.02925609048723898, "grad_norm": 0.06178198382258415, "grad_norm_var": 2.5316592809662178e-05, "learning_rate": 0.00999780450742937, "loss": 2.9646, "step": 807 }, { "crossentropy": 3.0209970474243164, "epoch": 0.029292343387470998, "grad_norm": 0.07264702767133713, "grad_norm_var": 3.126850350825775e-05, "learning_rate": 0.00999778725538866, "loss": 3.041, "step": 808 }, { "crossentropy": 3.169128179550171, "epoch": 0.029328596287703016, "grad_norm": 0.06918805837631226, "grad_norm_var": 3.2444541557139346e-05, "learning_rate": 0.009997769935845245, "loss": 3.2035, "step": 809 }, { "crossentropy": 3.1037826538085938, "epoch": 0.029364849187935034, "grad_norm": 0.05770374834537506, "grad_norm_var": 3.105662809837154e-05, "learning_rate": 0.009997752548799366, "loss": 3.1129, "step": 810 }, { "crossentropy": 3.0945639610290527, "epoch": 0.029401102088167052, "grad_norm": 0.06039104983210564, "grad_norm_var": 3.139847789595066e-05, "learning_rate": 0.009997735094251252, "loss": 3.0518, "step": 811 }, { "crossentropy": 2.950979709625244, "epoch": 0.029437354988399073, "grad_norm": 0.059306323528289795, "grad_norm_var": 3.0304228150802395e-05, "learning_rate": 0.009997717572201144, "loss": 3.0775, "step": 812 }, { "crossentropy": 3.0794568061828613, "epoch": 0.02947360788863109, "grad_norm": 0.06509862095117569, "grad_norm_var": 2.791418228880275e-05, "learning_rate": 0.009997699982649273, "loss": 3.0272, "step": 813 }, { "crossentropy": 3.0202524662017822, "epoch": 0.02950986078886311, "grad_norm": 0.06319686025381088, "grad_norm_var": 2.590169620859019e-05, "learning_rate": 0.009997682325595881, "loss": 2.9888, "step": 814 }, { "crossentropy": 3.0657472610473633, "epoch": 0.029546113689095127, "grad_norm": 0.059411339461803436, "grad_norm_var": 2.5386142689439126e-05, "learning_rate": 0.009997664601041205, "loss": 3.0092, "step": 815 }, { "crossentropy": 3.0177571773529053, "epoch": 0.029582366589327145, "grad_norm": 0.057182878255844116, "grad_norm_var": 2.7474478294694146e-05, "learning_rate": 0.009997646808985485, "loss": 3.0026, "step": 816 }, { "crossentropy": 3.0408692359924316, "epoch": 0.029618619489559166, "grad_norm": 0.058333661407232285, "grad_norm_var": 2.0086817354737255e-05, "learning_rate": 0.009997628949428959, "loss": 3.0135, "step": 817 }, { "crossentropy": 3.088428497314453, "epoch": 0.029654872389791184, "grad_norm": 0.0637252926826477, "grad_norm_var": 2.010421012631769e-05, "learning_rate": 0.00999761102237187, "loss": 3.0043, "step": 818 }, { "crossentropy": 3.0925073623657227, "epoch": 0.029691125290023202, "grad_norm": 0.05949314311146736, "grad_norm_var": 1.977384469822359e-05, "learning_rate": 0.009997593027814462, "loss": 3.0629, "step": 819 }, { "crossentropy": 3.1498944759368896, "epoch": 0.02972737819025522, "grad_norm": 0.06279916316270828, "grad_norm_var": 1.8583701098289346e-05, "learning_rate": 0.009997574965756975, "loss": 3.036, "step": 820 }, { "crossentropy": 3.068241596221924, "epoch": 0.029763631090487238, "grad_norm": 0.06813716888427734, "grad_norm_var": 2.016328523152342e-05, "learning_rate": 0.009997556836199652, "loss": 3.0735, "step": 821 }, { "crossentropy": 3.0395760536193848, "epoch": 0.029799883990719256, "grad_norm": 0.060793548822402954, "grad_norm_var": 1.9569880058688723e-05, "learning_rate": 0.009997538639142742, "loss": 3.0195, "step": 822 }, { "crossentropy": 3.1406641006469727, "epoch": 0.029836136890951277, "grad_norm": 0.05921267345547676, "grad_norm_var": 2.0211093322602728e-05, "learning_rate": 0.009997520374586488, "loss": 3.0919, "step": 823 }, { "crossentropy": 3.0524415969848633, "epoch": 0.029872389791183295, "grad_norm": 0.05615696683526039, "grad_norm_var": 1.4431817816798417e-05, "learning_rate": 0.009997502042531138, "loss": 3.0011, "step": 824 }, { "crossentropy": 3.0523221492767334, "epoch": 0.029908642691415313, "grad_norm": 0.06186412647366524, "grad_norm_var": 1.0040574557510646e-05, "learning_rate": 0.009997483642976939, "loss": 3.0928, "step": 825 }, { "crossentropy": 2.81217622756958, "epoch": 0.02994489559164733, "grad_norm": 0.06031085178256035, "grad_norm_var": 9.388943910428917e-06, "learning_rate": 0.009997465175924139, "loss": 2.871, "step": 826 }, { "crossentropy": 3.107961416244507, "epoch": 0.02998114849187935, "grad_norm": 0.0629018023610115, "grad_norm_var": 9.591347647650017e-06, "learning_rate": 0.009997446641372988, "loss": 3.0788, "step": 827 }, { "crossentropy": 2.9122018814086914, "epoch": 0.03001740139211137, "grad_norm": 0.05818412825465202, "grad_norm_var": 9.9414698566511e-06, "learning_rate": 0.009997428039323734, "loss": 2.9416, "step": 828 }, { "crossentropy": 3.0901763439178467, "epoch": 0.030053654292343388, "grad_norm": 0.05762484297156334, "grad_norm_var": 9.398227453789414e-06, "learning_rate": 0.009997409369776633, "loss": 3.0592, "step": 829 }, { "crossentropy": 3.1117184162139893, "epoch": 0.030089907192575406, "grad_norm": 0.06030873581767082, "grad_norm_var": 8.913013339888604e-06, "learning_rate": 0.009997390632731935, "loss": 3.0128, "step": 830 }, { "crossentropy": 3.006903886795044, "epoch": 0.030126160092807424, "grad_norm": 0.07372355461120605, "grad_norm_var": 1.982401658369385e-05, "learning_rate": 0.009997371828189891, "loss": 3.0796, "step": 831 }, { "crossentropy": 3.034677505493164, "epoch": 0.030162412993039442, "grad_norm": 0.056196048855781555, "grad_norm_var": 2.04262103708561e-05, "learning_rate": 0.009997352956150759, "loss": 3.0131, "step": 832 }, { "crossentropy": 3.0775322914123535, "epoch": 0.030198665893271463, "grad_norm": 0.05760009214282036, "grad_norm_var": 2.0743655723986544e-05, "learning_rate": 0.00999733401661479, "loss": 3.0523, "step": 833 }, { "crossentropy": 2.853123664855957, "epoch": 0.03023491879350348, "grad_norm": 0.06606628000736237, "grad_norm_var": 2.1877667978937623e-05, "learning_rate": 0.00999731500958224, "loss": 2.9048, "step": 834 }, { "crossentropy": 3.0705361366271973, "epoch": 0.0302711716937355, "grad_norm": 0.06012656167149544, "grad_norm_var": 2.174711936558654e-05, "learning_rate": 0.00999729593505337, "loss": 3.0303, "step": 835 }, { "crossentropy": 2.8889708518981934, "epoch": 0.030307424593967517, "grad_norm": 0.05772967264056206, "grad_norm_var": 2.2390991741525847e-05, "learning_rate": 0.009997276793028432, "loss": 2.9625, "step": 836 }, { "crossentropy": 2.9827682971954346, "epoch": 0.030343677494199535, "grad_norm": 0.05880379676818848, "grad_norm_var": 1.902651676308579e-05, "learning_rate": 0.009997257583507689, "loss": 3.01, "step": 837 }, { "crossentropy": 3.0798394680023193, "epoch": 0.030379930394431553, "grad_norm": 0.0593048632144928, "grad_norm_var": 1.910184480531929e-05, "learning_rate": 0.009997238306491397, "loss": 3.058, "step": 838 }, { "crossentropy": 2.710145950317383, "epoch": 0.030416183294663574, "grad_norm": 0.16552603244781494, "grad_norm_var": 0.000708931994934248, "learning_rate": 0.00999721896197982, "loss": 2.8621, "step": 839 }, { "crossentropy": 3.1440515518188477, "epoch": 0.030452436194895592, "grad_norm": 0.06863439083099365, "grad_norm_var": 0.0007005787539052027, "learning_rate": 0.009997199549973216, "loss": 3.1529, "step": 840 }, { "crossentropy": 3.133981227874756, "epoch": 0.03048868909512761, "grad_norm": 0.06238800287246704, "grad_norm_var": 0.0007001808231366188, "learning_rate": 0.00999718007047185, "loss": 3.1286, "step": 841 }, { "crossentropy": 3.036158800125122, "epoch": 0.030524941995359628, "grad_norm": 0.05884895846247673, "grad_norm_var": 0.0007017818427562729, "learning_rate": 0.009997160523475983, "loss": 2.9949, "step": 842 }, { "crossentropy": 2.928697347640991, "epoch": 0.030561194895591646, "grad_norm": 0.056694626808166504, "grad_norm_var": 0.0007082007214179235, "learning_rate": 0.00999714090898588, "loss": 3.0026, "step": 843 }, { "crossentropy": 2.8912322521209717, "epoch": 0.030597447795823667, "grad_norm": 0.06224499270319939, "grad_norm_var": 0.0007042631024138994, "learning_rate": 0.009997121227001804, "loss": 2.9648, "step": 844 }, { "crossentropy": 3.043175220489502, "epoch": 0.030633700696055685, "grad_norm": 0.059896115213632584, "grad_norm_var": 0.0007015604886307429, "learning_rate": 0.009997101477524025, "loss": 2.9965, "step": 845 }, { "crossentropy": 3.086089611053467, "epoch": 0.030669953596287703, "grad_norm": 0.05763562396168709, "grad_norm_var": 0.0007046613271398068, "learning_rate": 0.009997081660552805, "loss": 3.0827, "step": 846 }, { "crossentropy": 3.083588123321533, "epoch": 0.03070620649651972, "grad_norm": 0.06471537053585052, "grad_norm_var": 0.0007023645494814439, "learning_rate": 0.009997061776088414, "loss": 3.0629, "step": 847 }, { "crossentropy": 3.0349535942077637, "epoch": 0.03074245939675174, "grad_norm": 0.05693119764328003, "grad_norm_var": 0.0007013368051570702, "learning_rate": 0.009997041824131121, "loss": 3.0367, "step": 848 }, { "crossentropy": 2.913498878479004, "epoch": 0.03077871229698376, "grad_norm": 0.05511502921581268, "grad_norm_var": 0.0007048611022567171, "learning_rate": 0.009997021804681195, "loss": 2.9559, "step": 849 }, { "crossentropy": 3.2563700675964355, "epoch": 0.030814965197215778, "grad_norm": 0.062195051461458206, "grad_norm_var": 0.0007062365255307983, "learning_rate": 0.009997001717738907, "loss": 3.0811, "step": 850 }, { "crossentropy": 3.085444927215576, "epoch": 0.030851218097447796, "grad_norm": 0.057180047035217285, "grad_norm_var": 0.0007093515847201831, "learning_rate": 0.009996981563304527, "loss": 3.0379, "step": 851 }, { "crossentropy": 2.964756727218628, "epoch": 0.030887470997679814, "grad_norm": 0.07537350058555603, "grad_norm_var": 0.0007081988105213163, "learning_rate": 0.009996961341378327, "loss": 2.9813, "step": 852 }, { "crossentropy": 2.982938051223755, "epoch": 0.030923723897911832, "grad_norm": 0.06721853464841843, "grad_norm_var": 0.000702763148087244, "learning_rate": 0.009996941051960582, "loss": 3.0071, "step": 853 }, { "crossentropy": 2.9678893089294434, "epoch": 0.03095997679814385, "grad_norm": 0.05628621578216553, "grad_norm_var": 0.0007068801903067449, "learning_rate": 0.009996920695051563, "loss": 2.9515, "step": 854 }, { "crossentropy": 3.0805749893188477, "epoch": 0.03099622969837587, "grad_norm": 0.06026478111743927, "grad_norm_var": 2.9635050038537685e-05, "learning_rate": 0.00999690027065155, "loss": 3.0672, "step": 855 }, { "crossentropy": 3.1277074813842773, "epoch": 0.03103248259860789, "grad_norm": 0.058365609496831894, "grad_norm_var": 2.6253886539031e-05, "learning_rate": 0.009996879778760813, "loss": 3.0583, "step": 856 }, { "crossentropy": 2.996070146560669, "epoch": 0.031068735498839907, "grad_norm": 0.06393631547689438, "grad_norm_var": 2.675020760133188e-05, "learning_rate": 0.009996859219379633, "loss": 3.0114, "step": 857 }, { "crossentropy": 3.0506954193115234, "epoch": 0.031104988399071925, "grad_norm": 0.0628434494137764, "grad_norm_var": 2.6704938281251327e-05, "learning_rate": 0.009996838592508286, "loss": 2.9948, "step": 858 }, { "crossentropy": 2.9101765155792236, "epoch": 0.031141241299303943, "grad_norm": 0.06054609641432762, "grad_norm_var": 2.5392344276351646e-05, "learning_rate": 0.00999681789814705, "loss": 2.9259, "step": 859 }, { "crossentropy": 3.0297584533691406, "epoch": 0.031177494199535964, "grad_norm": 0.05882558226585388, "grad_norm_var": 2.569079115262851e-05, "learning_rate": 0.009996797136296207, "loss": 3.0403, "step": 860 }, { "crossentropy": 3.0766260623931885, "epoch": 0.031213747099767982, "grad_norm": 0.05694207549095154, "grad_norm_var": 2.6703681518458246e-05, "learning_rate": 0.009996776306956034, "loss": 3.0044, "step": 861 }, { "crossentropy": 2.9979617595672607, "epoch": 0.03125, "grad_norm": 0.05888771265745163, "grad_norm_var": 2.6256958893501545e-05, "learning_rate": 0.009996755410126814, "loss": 2.9855, "step": 862 }, { "crossentropy": 2.971400022506714, "epoch": 0.03128625290023202, "grad_norm": 0.15074659883975983, "grad_norm_var": 0.0005317288321085675, "learning_rate": 0.009996734445808831, "loss": 2.9617, "step": 863 }, { "crossentropy": 2.9093189239501953, "epoch": 0.031322505800464036, "grad_norm": 0.058851055800914764, "grad_norm_var": 0.0005295472381032922, "learning_rate": 0.009996713414002366, "loss": 2.9842, "step": 864 }, { "crossentropy": 2.950122356414795, "epoch": 0.03135875870069606, "grad_norm": 0.06026972085237503, "grad_norm_var": 0.0005234012562673472, "learning_rate": 0.009996692314707702, "loss": 3.0406, "step": 865 }, { "crossentropy": 3.108513832092285, "epoch": 0.03139501160092807, "grad_norm": 0.05793042853474617, "grad_norm_var": 0.0005271539892509808, "learning_rate": 0.009996671147925127, "loss": 3.0275, "step": 866 }, { "crossentropy": 2.981942653656006, "epoch": 0.03143126450116009, "grad_norm": 0.0583430752158165, "grad_norm_var": 0.0005257887467126922, "learning_rate": 0.009996649913654928, "loss": 2.9549, "step": 867 }, { "crossentropy": 2.9440975189208984, "epoch": 0.031467517401392114, "grad_norm": 0.05947504937648773, "grad_norm_var": 0.00052299235849671, "learning_rate": 0.009996628611897386, "loss": 3.019, "step": 868 }, { "crossentropy": 3.1449856758117676, "epoch": 0.03150377030162413, "grad_norm": 0.06090269610285759, "grad_norm_var": 0.0005241294481789354, "learning_rate": 0.009996607242652791, "loss": 3.0931, "step": 869 }, { "crossentropy": 3.01990008354187, "epoch": 0.03154002320185615, "grad_norm": 0.05941033363342285, "grad_norm_var": 0.0005210207914192742, "learning_rate": 0.009996585805921434, "loss": 2.9787, "step": 870 }, { "crossentropy": 3.035792350769043, "epoch": 0.031576276102088165, "grad_norm": 0.05356639623641968, "grad_norm_var": 0.0005284192674530543, "learning_rate": 0.009996564301703604, "loss": 3.0392, "step": 871 }, { "crossentropy": 3.018162488937378, "epoch": 0.031612529002320186, "grad_norm": 0.05607851222157478, "grad_norm_var": 0.0005307663185709522, "learning_rate": 0.00999654272999959, "loss": 2.9573, "step": 872 }, { "crossentropy": 3.07381534576416, "epoch": 0.03164878190255221, "grad_norm": 0.07701469212770462, "grad_norm_var": 0.0005398681901677166, "learning_rate": 0.009996521090809682, "loss": 3.0225, "step": 873 }, { "crossentropy": 2.9435105323791504, "epoch": 0.03168503480278422, "grad_norm": 0.058779872953891754, "grad_norm_var": 0.0005424287549421656, "learning_rate": 0.009996499384134177, "loss": 2.9854, "step": 874 }, { "crossentropy": 3.0903873443603516, "epoch": 0.03172128770301624, "grad_norm": 0.05637519806623459, "grad_norm_var": 0.0005462212865520996, "learning_rate": 0.009996477609973365, "loss": 3.034, "step": 875 }, { "crossentropy": 3.0869054794311523, "epoch": 0.03175754060324826, "grad_norm": 0.05764101818203926, "grad_norm_var": 0.0005473078666019278, "learning_rate": 0.00999645576832754, "loss": 3.0741, "step": 876 }, { "crossentropy": 2.904763698577881, "epoch": 0.03179379350348028, "grad_norm": 0.0550025999546051, "grad_norm_var": 0.0005496463455102204, "learning_rate": 0.009996433859196996, "loss": 2.9584, "step": 877 }, { "crossentropy": 3.1866402626037598, "epoch": 0.0318300464037123, "grad_norm": 0.06559019535779953, "grad_norm_var": 0.0005470322131588341, "learning_rate": 0.00999641188258203, "loss": 3.2014, "step": 878 }, { "crossentropy": 2.9792935848236084, "epoch": 0.031866299303944315, "grad_norm": 0.05991651862859726, "grad_norm_var": 2.873871728529797e-05, "learning_rate": 0.009996389838482942, "loss": 2.951, "step": 879 }, { "crossentropy": 2.941742420196533, "epoch": 0.031902552204176336, "grad_norm": 0.05947592854499817, "grad_norm_var": 2.86926645553844e-05, "learning_rate": 0.009996367726900025, "loss": 2.9683, "step": 880 }, { "crossentropy": 3.098891019821167, "epoch": 0.03193880510440835, "grad_norm": 0.06303567439317703, "grad_norm_var": 2.9367740445424933e-05, "learning_rate": 0.00999634554783358, "loss": 2.9577, "step": 881 }, { "crossentropy": 2.891829252243042, "epoch": 0.03197505800464037, "grad_norm": 0.06894974410533905, "grad_norm_var": 3.405035578756769e-05, "learning_rate": 0.009996323301283907, "loss": 3.0274, "step": 882 }, { "crossentropy": 3.076956272125244, "epoch": 0.032011310904872387, "grad_norm": 0.06194799393415451, "grad_norm_var": 3.377904325835516e-05, "learning_rate": 0.009996300987251305, "loss": 3.0125, "step": 883 }, { "crossentropy": 2.9961163997650146, "epoch": 0.03204756380510441, "grad_norm": 0.06550730764865875, "grad_norm_var": 3.4969424134537594e-05, "learning_rate": 0.009996278605736075, "loss": 2.9959, "step": 884 }, { "crossentropy": 3.0829317569732666, "epoch": 0.03208381670533643, "grad_norm": 0.14440107345581055, "grad_norm_var": 0.0004674118941610387, "learning_rate": 0.009996256156738522, "loss": 3.0775, "step": 885 }, { "crossentropy": 2.881577968597412, "epoch": 0.032120069605568444, "grad_norm": 0.062133897095918655, "grad_norm_var": 0.00046533061515782964, "learning_rate": 0.009996233640258947, "loss": 2.9468, "step": 886 }, { "crossentropy": 3.0112500190734863, "epoch": 0.032156322505800465, "grad_norm": 0.059668172150850296, "grad_norm_var": 0.000457063168269402, "learning_rate": 0.009996211056297654, "loss": 2.9997, "step": 887 }, { "crossentropy": 2.9521117210388184, "epoch": 0.03219257540603248, "grad_norm": 0.06964720040559769, "grad_norm_var": 0.0004488657536381464, "learning_rate": 0.00999618840485495, "loss": 3.0078, "step": 888 }, { "crossentropy": 3.000326633453369, "epoch": 0.0322288283062645, "grad_norm": 0.05817452073097229, "grad_norm_var": 0.00044794781314196937, "learning_rate": 0.009996165685931138, "loss": 3.0018, "step": 889 }, { "crossentropy": 2.7673168182373047, "epoch": 0.03226508120649652, "grad_norm": 0.059955909848213196, "grad_norm_var": 0.00044680168020372514, "learning_rate": 0.009996142899526526, "loss": 2.8973, "step": 890 }, { "crossentropy": 3.062640428543091, "epoch": 0.03230133410672854, "grad_norm": 0.05799656733870506, "grad_norm_var": 0.00044473092798246925, "learning_rate": 0.009996120045641424, "loss": 3.0815, "step": 891 }, { "crossentropy": 3.0182530879974365, "epoch": 0.03233758700696056, "grad_norm": 0.05500761419534683, "grad_norm_var": 0.0004483856222312301, "learning_rate": 0.009996097124276139, "loss": 3.0085, "step": 892 }, { "crossentropy": 2.933666467666626, "epoch": 0.03237383990719257, "grad_norm": 0.06690708547830582, "grad_norm_var": 0.0004387543302525257, "learning_rate": 0.00999607413543098, "loss": 2.9857, "step": 893 }, { "crossentropy": 3.013545036315918, "epoch": 0.032410092807424594, "grad_norm": 0.056700874119997025, "grad_norm_var": 0.00044583187350969236, "learning_rate": 0.009996051079106257, "loss": 2.9795, "step": 894 }, { "crossentropy": 2.937061071395874, "epoch": 0.032446345707656615, "grad_norm": 0.052645936608314514, "grad_norm_var": 0.00045584656398687157, "learning_rate": 0.009996027955302283, "loss": 2.946, "step": 895 }, { "crossentropy": 3.1391866207122803, "epoch": 0.03248259860788863, "grad_norm": 0.060049690306186676, "grad_norm_var": 0.0004553386058617638, "learning_rate": 0.00999600476401937, "loss": 3.0398, "step": 896 }, { "crossentropy": 2.9006576538085938, "epoch": 0.03251885150812065, "grad_norm": 0.05639304220676422, "grad_norm_var": 0.0004610943477271534, "learning_rate": 0.00999598150525783, "loss": 2.9217, "step": 897 }, { "crossentropy": 3.007207155227661, "epoch": 0.032555104408352666, "grad_norm": 0.06147104129195213, "grad_norm_var": 0.0004616540660013485, "learning_rate": 0.009995958179017978, "loss": 2.9253, "step": 898 }, { "crossentropy": 2.9628939628601074, "epoch": 0.03259135730858469, "grad_norm": 0.05817538499832153, "grad_norm_var": 0.00046434942503213305, "learning_rate": 0.009995934785300132, "loss": 2.915, "step": 899 }, { "crossentropy": 3.0199787616729736, "epoch": 0.03262761020881671, "grad_norm": 0.057977356016635895, "grad_norm_var": 0.00046768726639633684, "learning_rate": 0.009995911324104602, "loss": 2.9959, "step": 900 }, { "crossentropy": 2.964082956314087, "epoch": 0.03266386310904872, "grad_norm": 0.060639768838882446, "grad_norm_var": 1.7538642101675287e-05, "learning_rate": 0.00999588779543171, "loss": 2.9273, "step": 901 }, { "crossentropy": 3.0782504081726074, "epoch": 0.032700116009280744, "grad_norm": 0.06388824433088303, "grad_norm_var": 1.8324529666690254e-05, "learning_rate": 0.00999586419928177, "loss": 3.024, "step": 902 }, { "crossentropy": 3.071545362472534, "epoch": 0.03273636890951276, "grad_norm": 0.060763970017433167, "grad_norm_var": 1.8394029089861762e-05, "learning_rate": 0.009995840535655104, "loss": 3.0383, "step": 903 }, { "crossentropy": 2.773643970489502, "epoch": 0.03277262180974478, "grad_norm": 0.06256736814975739, "grad_norm_var": 1.2207302460999435e-05, "learning_rate": 0.009995816804552029, "loss": 2.8737, "step": 904 }, { "crossentropy": 2.9668495655059814, "epoch": 0.0328088747099768, "grad_norm": 0.060086190700531006, "grad_norm_var": 1.2140640649201985e-05, "learning_rate": 0.009995793005972867, "loss": 2.9943, "step": 905 }, { "crossentropy": 3.0639734268188477, "epoch": 0.032845127610208816, "grad_norm": 0.0553167425096035, "grad_norm_var": 1.3173831227582719e-05, "learning_rate": 0.00999576913991794, "loss": 3.0122, "step": 906 }, { "crossentropy": 3.0830583572387695, "epoch": 0.03288138051044084, "grad_norm": 0.0532623715698719, "grad_norm_var": 1.5310068736251504e-05, "learning_rate": 0.009995745206387569, "loss": 3.0414, "step": 907 }, { "crossentropy": 2.8962340354919434, "epoch": 0.03291763341067285, "grad_norm": 0.06320207566022873, "grad_norm_var": 1.5291467771260992e-05, "learning_rate": 0.009995721205382077, "loss": 2.9232, "step": 908 }, { "crossentropy": 3.0220978260040283, "epoch": 0.03295388631090487, "grad_norm": 0.06487412005662918, "grad_norm_var": 1.3508913133299336e-05, "learning_rate": 0.00999569713690179, "loss": 2.928, "step": 909 }, { "crossentropy": 2.9108190536499023, "epoch": 0.032990139211136894, "grad_norm": 0.07515747845172882, "grad_norm_var": 2.8524028428815867e-05, "learning_rate": 0.009995673000947029, "loss": 2.8796, "step": 910 }, { "crossentropy": 2.9664533138275146, "epoch": 0.03302639211136891, "grad_norm": 0.055449478328228, "grad_norm_var": 2.6115103123384015e-05, "learning_rate": 0.009995648797518127, "loss": 2.9984, "step": 911 }, { "crossentropy": 2.9096102714538574, "epoch": 0.03306264501160093, "grad_norm": 0.058625079691410065, "grad_norm_var": 2.634261175450166e-05, "learning_rate": 0.009995624526615404, "loss": 2.9098, "step": 912 }, { "crossentropy": 3.019153594970703, "epoch": 0.033098897911832945, "grad_norm": 0.06865235418081284, "grad_norm_var": 2.9038005616107733e-05, "learning_rate": 0.009995600188239192, "loss": 2.9128, "step": 913 }, { "crossentropy": 2.965471029281616, "epoch": 0.033135150812064966, "grad_norm": 0.06322067230939865, "grad_norm_var": 2.9279306976760325e-05, "learning_rate": 0.009995575782389817, "loss": 2.9711, "step": 914 }, { "crossentropy": 2.9602198600769043, "epoch": 0.03317140371229698, "grad_norm": 0.06039263308048248, "grad_norm_var": 2.8643268388837498e-05, "learning_rate": 0.009995551309067612, "loss": 2.9865, "step": 915 }, { "crossentropy": 2.989624500274658, "epoch": 0.033207656612529, "grad_norm": 0.06429435312747955, "grad_norm_var": 2.8166296715681262e-05, "learning_rate": 0.009995526768272905, "loss": 3.0161, "step": 916 }, { "crossentropy": 2.884016990661621, "epoch": 0.03324390951276102, "grad_norm": 0.057602278888225555, "grad_norm_var": 2.9253155529597914e-05, "learning_rate": 0.009995502160006027, "loss": 2.9215, "step": 917 }, { "crossentropy": 3.0165183544158936, "epoch": 0.03328016241299304, "grad_norm": 0.0597626194357872, "grad_norm_var": 2.9118580687198526e-05, "learning_rate": 0.00999547748426731, "loss": 3.0261, "step": 918 }, { "crossentropy": 2.933485746383667, "epoch": 0.03331641531322506, "grad_norm": 0.05599352717399597, "grad_norm_var": 3.0978440680637374e-05, "learning_rate": 0.00999545274105709, "loss": 2.9926, "step": 919 }, { "crossentropy": 3.0170040130615234, "epoch": 0.03335266821345707, "grad_norm": 0.06011001020669937, "grad_norm_var": 3.0892671464971314e-05, "learning_rate": 0.009995427930375703, "loss": 3.023, "step": 920 }, { "crossentropy": 2.9800050258636475, "epoch": 0.033388921113689095, "grad_norm": 0.05897995084524155, "grad_norm_var": 3.110396081666041e-05, "learning_rate": 0.009995403052223477, "loss": 3.0018, "step": 921 }, { "crossentropy": 3.233323574066162, "epoch": 0.033425174013921116, "grad_norm": 0.059299830347299576, "grad_norm_var": 2.911392029508348e-05, "learning_rate": 0.009995378106600753, "loss": 3.0975, "step": 922 }, { "crossentropy": 3.0894007682800293, "epoch": 0.03346142691415313, "grad_norm": 0.05848586931824684, "grad_norm_var": 2.5304917220033094e-05, "learning_rate": 0.009995353093507867, "loss": 3.0643, "step": 923 }, { "crossentropy": 3.00400710105896, "epoch": 0.03349767981438515, "grad_norm": 0.07115796208381653, "grad_norm_var": 3.105967689884618e-05, "learning_rate": 0.009995328012945157, "loss": 2.9967, "step": 924 }, { "crossentropy": 3.029707431793213, "epoch": 0.033533932714617166, "grad_norm": 0.06281105428934097, "grad_norm_var": 3.0536092982900534e-05, "learning_rate": 0.009995302864912962, "loss": 3.0436, "step": 925 }, { "crossentropy": 2.9213669300079346, "epoch": 0.03357018561484919, "grad_norm": 0.06271608173847198, "grad_norm_var": 1.8176184502677327e-05, "learning_rate": 0.00999527764941162, "loss": 2.884, "step": 926 }, { "crossentropy": 2.8927741050720215, "epoch": 0.03360643851508121, "grad_norm": 0.05519765987992287, "grad_norm_var": 1.8369771488018807e-05, "learning_rate": 0.009995252366441474, "loss": 2.9885, "step": 927 }, { "crossentropy": 3.006495952606201, "epoch": 0.033642691415313224, "grad_norm": 0.06742114573717117, "grad_norm_var": 2.032468498480751e-05, "learning_rate": 0.009995227016002863, "loss": 3.0157, "step": 928 }, { "crossentropy": 3.01570987701416, "epoch": 0.033678944315545245, "grad_norm": 0.06577841937541962, "grad_norm_var": 1.8150429887943813e-05, "learning_rate": 0.009995201598096132, "loss": 2.9467, "step": 929 }, { "crossentropy": 2.878938674926758, "epoch": 0.03371519721577726, "grad_norm": 0.05466398596763611, "grad_norm_var": 2.070805612559238e-05, "learning_rate": 0.009995176112721622, "loss": 2.8624, "step": 930 }, { "crossentropy": 2.979588270187378, "epoch": 0.03375145011600928, "grad_norm": 0.05971275269985199, "grad_norm_var": 2.078445402936135e-05, "learning_rate": 0.009995150559879677, "loss": 2.9096, "step": 931 }, { "crossentropy": 2.9722323417663574, "epoch": 0.0337877030162413, "grad_norm": 0.05821536108851433, "grad_norm_var": 2.0321958811886133e-05, "learning_rate": 0.009995124939570646, "loss": 2.9739, "step": 932 }, { "crossentropy": 2.992114305496216, "epoch": 0.03382395591647332, "grad_norm": 0.05355155095458031, "grad_norm_var": 2.2909445884571e-05, "learning_rate": 0.00999509925179487, "loss": 2.9603, "step": 933 }, { "crossentropy": 2.8104429244995117, "epoch": 0.03386020881670534, "grad_norm": 0.053990140557289124, "grad_norm_var": 2.5360318316818414e-05, "learning_rate": 0.0099950734965527, "loss": 2.9179, "step": 934 }, { "crossentropy": 2.7842299938201904, "epoch": 0.03389646171693735, "grad_norm": 0.05448063835501671, "grad_norm_var": 2.6287410705270558e-05, "learning_rate": 0.009995047673844482, "loss": 2.8584, "step": 935 }, { "crossentropy": 2.9830410480499268, "epoch": 0.033932714617169374, "grad_norm": 0.06095065176486969, "grad_norm_var": 2.6367920079002858e-05, "learning_rate": 0.009995021783670564, "loss": 3.03, "step": 936 }, { "crossentropy": 2.862534761428833, "epoch": 0.033968967517401395, "grad_norm": 0.06100334972143173, "grad_norm_var": 2.6392228688813106e-05, "learning_rate": 0.009994995826031297, "loss": 2.8423, "step": 937 }, { "crossentropy": 3.0104618072509766, "epoch": 0.03400522041763341, "grad_norm": 0.056468166410923004, "grad_norm_var": 2.7144428306438184e-05, "learning_rate": 0.009994969800927029, "loss": 3.0579, "step": 938 }, { "crossentropy": 3.0488245487213135, "epoch": 0.03404147331786543, "grad_norm": 0.05591230466961861, "grad_norm_var": 2.8005127315286423e-05, "learning_rate": 0.009994943708358118, "loss": 3.0287, "step": 939 }, { "crossentropy": 2.9212381839752197, "epoch": 0.034077726218097446, "grad_norm": 0.05518101528286934, "grad_norm_var": 1.939500861723361e-05, "learning_rate": 0.009994917548324909, "loss": 3.005, "step": 940 }, { "crossentropy": 2.952828884124756, "epoch": 0.03411397911832947, "grad_norm": 0.05998263135552406, "grad_norm_var": 1.831762889607924e-05, "learning_rate": 0.009994891320827759, "loss": 2.9472, "step": 941 }, { "crossentropy": 3.0180492401123047, "epoch": 0.03415023201856149, "grad_norm": 0.06273234635591507, "grad_norm_var": 1.8326893416928333e-05, "learning_rate": 0.00999486502586702, "loss": 3.0318, "step": 942 }, { "crossentropy": 2.90012264251709, "epoch": 0.0341864849187935, "grad_norm": 0.06720420718193054, "grad_norm_var": 2.212591933417977e-05, "learning_rate": 0.009994838663443053, "loss": 2.9171, "step": 943 }, { "crossentropy": 2.9893550872802734, "epoch": 0.034222737819025524, "grad_norm": 0.06373432278633118, "grad_norm_var": 1.893563486611456e-05, "learning_rate": 0.009994812233556208, "loss": 2.99, "step": 944 }, { "crossentropy": 2.8590431213378906, "epoch": 0.03425899071925754, "grad_norm": 0.06517757475376129, "grad_norm_var": 1.8412967485992073e-05, "learning_rate": 0.009994785736206843, "loss": 2.9613, "step": 945 }, { "crossentropy": 2.834561347961426, "epoch": 0.03429524361948956, "grad_norm": 0.06261087208986282, "grad_norm_var": 1.7834462089255652e-05, "learning_rate": 0.009994759171395319, "loss": 2.9008, "step": 946 }, { "crossentropy": 2.9118764400482178, "epoch": 0.034331496519721574, "grad_norm": 0.11454721540212631, "grad_norm_var": 0.000207815142184749, "learning_rate": 0.00999473253912199, "loss": 2.9852, "step": 947 }, { "crossentropy": 2.8710708618164062, "epoch": 0.034367749419953596, "grad_norm": 0.05337929725646973, "grad_norm_var": 0.00021227105311629276, "learning_rate": 0.009994705839387219, "loss": 2.8487, "step": 948 }, { "crossentropy": 2.9036617279052734, "epoch": 0.03440400232018562, "grad_norm": 0.06568218767642975, "grad_norm_var": 0.00020690307510135285, "learning_rate": 0.009994679072191367, "loss": 3.0363, "step": 949 }, { "crossentropy": 2.952289581298828, "epoch": 0.03444025522041763, "grad_norm": 0.06355520337820053, "grad_norm_var": 0.0002007290899663799, "learning_rate": 0.009994652237534792, "loss": 2.9878, "step": 950 }, { "crossentropy": 2.6094038486480713, "epoch": 0.03447650812064965, "grad_norm": 0.06345998495817184, "grad_norm_var": 0.00019447597242044592, "learning_rate": 0.00999462533541786, "loss": 2.8213, "step": 951 }, { "crossentropy": 2.9773318767547607, "epoch": 0.03451276102088167, "grad_norm": 0.058694224804639816, "grad_norm_var": 0.00019585416237516694, "learning_rate": 0.009994598365840934, "loss": 2.9816, "step": 952 }, { "crossentropy": 3.0823774337768555, "epoch": 0.03454901392111369, "grad_norm": 0.055621832609176636, "grad_norm_var": 0.0002000532115979038, "learning_rate": 0.009994571328804379, "loss": 3.0031, "step": 953 }, { "crossentropy": 3.1010220050811768, "epoch": 0.03458526682134571, "grad_norm": 0.1095011904835701, "grad_norm_var": 0.00032260146230292917, "learning_rate": 0.009994544224308555, "loss": 3.0456, "step": 954 }, { "crossentropy": 2.9434444904327393, "epoch": 0.034621519721577725, "grad_norm": 0.06720379739999771, "grad_norm_var": 0.00031340893190254183, "learning_rate": 0.009994517052353834, "loss": 3.0461, "step": 955 }, { "crossentropy": 3.017050266265869, "epoch": 0.034657772621809746, "grad_norm": 0.07076240330934525, "grad_norm_var": 0.00030191619760117636, "learning_rate": 0.00999448981294058, "loss": 2.9825, "step": 956 }, { "crossentropy": 2.8895785808563232, "epoch": 0.03469402552204176, "grad_norm": 0.0643143281340599, "grad_norm_var": 0.00029788629503656503, "learning_rate": 0.009994462506069161, "loss": 2.9304, "step": 957 }, { "crossentropy": 2.930098056793213, "epoch": 0.03473027842227378, "grad_norm": 0.067986860871315, "grad_norm_var": 0.00029503770939453454, "learning_rate": 0.00999443513173995, "loss": 2.948, "step": 958 }, { "crossentropy": 2.839083671569824, "epoch": 0.0347665313225058, "grad_norm": 0.06627162545919418, "grad_norm_var": 0.00029538869080880697, "learning_rate": 0.00999440768995331, "loss": 2.9245, "step": 959 }, { "crossentropy": 2.9551830291748047, "epoch": 0.03480278422273782, "grad_norm": 0.06247350201010704, "grad_norm_var": 0.0002964625939867951, "learning_rate": 0.009994380180709615, "loss": 2.9161, "step": 960 }, { "crossentropy": 2.860232353210449, "epoch": 0.03483903712296984, "grad_norm": 0.06318455189466476, "grad_norm_var": 0.00029784689079679694, "learning_rate": 0.009994352604009238, "loss": 2.927, "step": 961 }, { "crossentropy": 2.968109607696533, "epoch": 0.03487529002320185, "grad_norm": 0.05627082288265228, "grad_norm_var": 0.00030603746793207985, "learning_rate": 0.009994324959852547, "loss": 2.9471, "step": 962 }, { "crossentropy": 2.840799570083618, "epoch": 0.034911542923433875, "grad_norm": 0.05985596030950546, "grad_norm_var": 0.0001603481659073147, "learning_rate": 0.009994297248239921, "loss": 2.8812, "step": 963 }, { "crossentropy": 2.874081611633301, "epoch": 0.034947795823665896, "grad_norm": 0.057504598051309586, "grad_norm_var": 0.00015473743844343166, "learning_rate": 0.009994269469171731, "loss": 2.942, "step": 964 }, { "crossentropy": 2.9944379329681396, "epoch": 0.03498404872389791, "grad_norm": 0.06040211766958237, "grad_norm_var": 0.0001565427206233968, "learning_rate": 0.009994241622648352, "loss": 2.9873, "step": 965 }, { "crossentropy": 2.8273205757141113, "epoch": 0.03502030162412993, "grad_norm": 0.07435360550880432, "grad_norm_var": 0.0001611147880044271, "learning_rate": 0.00999421370867016, "loss": 2.8259, "step": 966 }, { "crossentropy": 2.9801862239837646, "epoch": 0.035056554524361946, "grad_norm": 0.05787266790866852, "grad_norm_var": 0.0001650448381347956, "learning_rate": 0.009994185727237535, "loss": 2.9126, "step": 967 }, { "crossentropy": 2.714871644973755, "epoch": 0.03509280742459397, "grad_norm": 0.06075387820601463, "grad_norm_var": 0.00016336760930099145, "learning_rate": 0.009994157678350853, "loss": 2.8071, "step": 968 }, { "crossentropy": 2.994896173477173, "epoch": 0.03512906032482599, "grad_norm": 0.10220716148614883, "grad_norm_var": 0.000235188810855925, "learning_rate": 0.009994129562010493, "loss": 2.9689, "step": 969 }, { "crossentropy": 2.903658628463745, "epoch": 0.035165313225058004, "grad_norm": 0.06155078485608101, "grad_norm_var": 0.0001187204986747579, "learning_rate": 0.009994101378216831, "loss": 2.978, "step": 970 }, { "crossentropy": 3.129281759262085, "epoch": 0.035201566125290025, "grad_norm": 0.06677264720201492, "grad_norm_var": 0.00011865202316333386, "learning_rate": 0.009994073126970253, "loss": 3.0089, "step": 971 }, { "crossentropy": 3.028912305831909, "epoch": 0.03523781902552204, "grad_norm": 0.07136520743370056, "grad_norm_var": 0.00011907490014317322, "learning_rate": 0.009994044808271138, "loss": 2.9431, "step": 972 }, { "crossentropy": 2.948263168334961, "epoch": 0.03527407192575406, "grad_norm": 0.06305084377527237, "grad_norm_var": 0.00011942854101755195, "learning_rate": 0.009994016422119872, "loss": 2.9623, "step": 973 }, { "crossentropy": 2.921992063522339, "epoch": 0.03531032482598608, "grad_norm": 0.05923829972743988, "grad_norm_var": 0.00012159390250148759, "learning_rate": 0.009993987968516834, "loss": 2.9206, "step": 974 }, { "crossentropy": 2.891427516937256, "epoch": 0.0353465777262181, "grad_norm": 0.05949410796165466, "grad_norm_var": 0.0001234923782454563, "learning_rate": 0.00999395944746241, "loss": 2.8548, "step": 975 }, { "crossentropy": 2.7223684787750244, "epoch": 0.03538283062645012, "grad_norm": 0.05713564157485962, "grad_norm_var": 0.00012690899493967745, "learning_rate": 0.009993930858956982, "loss": 2.85, "step": 976 }, { "crossentropy": 3.0388665199279785, "epoch": 0.03541908352668213, "grad_norm": 0.058730173856019974, "grad_norm_var": 0.00012889371365529072, "learning_rate": 0.009993902203000941, "loss": 3.028, "step": 977 }, { "crossentropy": 2.8506112098693848, "epoch": 0.035455336426914154, "grad_norm": 0.06576342135667801, "grad_norm_var": 0.00012454049787956033, "learning_rate": 0.009993873479594673, "loss": 2.8185, "step": 978 }, { "crossentropy": 2.961294651031494, "epoch": 0.03549158932714617, "grad_norm": 0.06216941773891449, "grad_norm_var": 0.0001233643974057221, "learning_rate": 0.009993844688738564, "loss": 2.9654, "step": 979 }, { "crossentropy": 2.959746837615967, "epoch": 0.03552784222737819, "grad_norm": 0.0610094889998436, "grad_norm_var": 0.00012067718807043654, "learning_rate": 0.009993815830433007, "loss": 2.9665, "step": 980 }, { "crossentropy": 3.062130928039551, "epoch": 0.03556409512761021, "grad_norm": 0.05660371482372284, "grad_norm_var": 0.00012396671901024147, "learning_rate": 0.009993786904678385, "loss": 2.9748, "step": 981 }, { "crossentropy": 2.9138240814208984, "epoch": 0.035600348027842225, "grad_norm": 0.0653587132692337, "grad_norm_var": 0.00011766092971569464, "learning_rate": 0.009993757911475096, "loss": 3.0146, "step": 982 }, { "crossentropy": 3.015881299972534, "epoch": 0.03563660092807425, "grad_norm": 0.07094347476959229, "grad_norm_var": 0.00011710733397055357, "learning_rate": 0.009993728850823527, "loss": 3.0181, "step": 983 }, { "crossentropy": 3.20676851272583, "epoch": 0.03567285382830626, "grad_norm": 0.07044597715139389, "grad_norm_var": 0.00011731779921965145, "learning_rate": 0.00999369972272407, "loss": 3.0651, "step": 984 }, { "crossentropy": 2.879263162612915, "epoch": 0.03570910672853828, "grad_norm": 0.06234925240278244, "grad_norm_var": 2.2807658649966433e-05, "learning_rate": 0.009993670527177121, "loss": 2.9716, "step": 985 }, { "crossentropy": 2.9375534057617188, "epoch": 0.035745359628770304, "grad_norm": 0.05542083829641342, "grad_norm_var": 2.6544024972391795e-05, "learning_rate": 0.009993641264183074, "loss": 3.0363, "step": 986 }, { "crossentropy": 3.076068162918091, "epoch": 0.03578161252900232, "grad_norm": 0.053001925349235535, "grad_norm_var": 3.122254498314717e-05, "learning_rate": 0.009993611933742323, "loss": 2.965, "step": 987 }, { "crossentropy": 2.889291524887085, "epoch": 0.03581786542923434, "grad_norm": 0.061823491007089615, "grad_norm_var": 2.5004530639290378e-05, "learning_rate": 0.009993582535855264, "loss": 2.8824, "step": 988 }, { "crossentropy": 2.962131977081299, "epoch": 0.035854118329466354, "grad_norm": 0.0620260015130043, "grad_norm_var": 2.4845779144962395e-05, "learning_rate": 0.009993553070522297, "loss": 2.9807, "step": 989 }, { "crossentropy": 2.8242697715759277, "epoch": 0.035890371229698376, "grad_norm": 0.05584389343857765, "grad_norm_var": 2.651919857061761e-05, "learning_rate": 0.009993523537743815, "loss": 2.9017, "step": 990 }, { "crossentropy": 3.0170347690582275, "epoch": 0.0359266241299304, "grad_norm": 0.058479420840740204, "grad_norm_var": 2.680520470489106e-05, "learning_rate": 0.009993493937520222, "loss": 2.9638, "step": 991 }, { "crossentropy": 3.0077526569366455, "epoch": 0.03596287703016241, "grad_norm": 0.06044355779886246, "grad_norm_var": 2.575424637920176e-05, "learning_rate": 0.009993464269851914, "loss": 2.9859, "step": 992 }, { "crossentropy": 2.8976995944976807, "epoch": 0.03599912993039443, "grad_norm": 0.05846073105931282, "grad_norm_var": 2.5850237176060162e-05, "learning_rate": 0.009993434534739293, "loss": 2.87, "step": 993 }, { "crossentropy": 2.871591806411743, "epoch": 0.03603538283062645, "grad_norm": 0.057252366095781326, "grad_norm_var": 2.5265917242358e-05, "learning_rate": 0.009993404732182763, "loss": 2.907, "step": 994 }, { "crossentropy": 2.9942150115966797, "epoch": 0.03607163573085847, "grad_norm": 0.061451248824596405, "grad_norm_var": 2.5160034293284876e-05, "learning_rate": 0.009993374862182722, "loss": 3.0186, "step": 995 }, { "crossentropy": 3.016728639602661, "epoch": 0.03610788863109049, "grad_norm": 0.05739809200167656, "grad_norm_var": 2.5817541756792812e-05, "learning_rate": 0.009993344924739576, "loss": 3.0277, "step": 996 }, { "crossentropy": 3.0242488384246826, "epoch": 0.036144141531322505, "grad_norm": 0.06212999299168587, "grad_norm_var": 2.4887461072632908e-05, "learning_rate": 0.009993314919853731, "loss": 3.01, "step": 997 }, { "crossentropy": 2.8328349590301514, "epoch": 0.036180394431554526, "grad_norm": 0.059981707483530045, "grad_norm_var": 2.3427474411270797e-05, "learning_rate": 0.009993284847525588, "loss": 2.8749, "step": 998 }, { "crossentropy": 3.0400357246398926, "epoch": 0.03621664733178654, "grad_norm": 0.05967167764902115, "grad_norm_var": 1.5621271449920276e-05, "learning_rate": 0.009993254707755557, "loss": 3.0145, "step": 999 }, { "crossentropy": 3.0118556022644043, "epoch": 0.03625290023201856, "grad_norm": 0.05767536535859108, "grad_norm_var": 7.620922463051164e-06, "learning_rate": 0.009993224500544044, "loss": 2.8805, "step": 1000 }, { "crossentropy": 2.9082415103912354, "epoch": 0.03628915313225058, "grad_norm": 0.06020934879779816, "grad_norm_var": 6.940982371141333e-06, "learning_rate": 0.009993194225891455, "loss": 2.9404, "step": 1001 }, { "crossentropy": 2.942258358001709, "epoch": 0.0363254060324826, "grad_norm": 0.05881434306502342, "grad_norm_var": 6.118482784053744e-06, "learning_rate": 0.009993163883798202, "loss": 3.0038, "step": 1002 }, { "crossentropy": 2.800978660583496, "epoch": 0.03636165893271462, "grad_norm": 0.05259254574775696, "grad_norm_var": 6.458618230016586e-06, "learning_rate": 0.009993133474264693, "loss": 2.8988, "step": 1003 }, { "crossentropy": 2.9574012756347656, "epoch": 0.03639791183294663, "grad_norm": 0.057553086429834366, "grad_norm_var": 5.999761901946619e-06, "learning_rate": 0.009993102997291338, "loss": 2.8434, "step": 1004 }, { "crossentropy": 2.9503400325775146, "epoch": 0.036434164733178655, "grad_norm": 0.05963631719350815, "grad_norm_var": 5.312528266503541e-06, "learning_rate": 0.009993072452878551, "loss": 3.01, "step": 1005 }, { "crossentropy": 2.955038547515869, "epoch": 0.036470417633410676, "grad_norm": 0.06491678208112717, "grad_norm_var": 7.12372370227643e-06, "learning_rate": 0.009993041841026742, "loss": 2.9643, "step": 1006 }, { "crossentropy": 2.996734142303467, "epoch": 0.03650667053364269, "grad_norm": 0.07369448244571686, "grad_norm_var": 2.0198167141253358e-05, "learning_rate": 0.009993011161736327, "loss": 3.0174, "step": 1007 }, { "crossentropy": 3.026055097579956, "epoch": 0.03654292343387471, "grad_norm": 0.06351834535598755, "grad_norm_var": 2.092269435973367e-05, "learning_rate": 0.00999298041500772, "loss": 2.9916, "step": 1008 }, { "crossentropy": 3.1293702125549316, "epoch": 0.036579176334106726, "grad_norm": 0.057706478983163834, "grad_norm_var": 2.1144203274154707e-05, "learning_rate": 0.009992949600841334, "loss": 3.0819, "step": 1009 }, { "crossentropy": 2.966123342514038, "epoch": 0.03661542923433875, "grad_norm": 0.05835391581058502, "grad_norm_var": 2.0777913208092483e-05, "learning_rate": 0.009992918719237588, "loss": 2.9951, "step": 1010 }, { "crossentropy": 3.024228572845459, "epoch": 0.03665168213457077, "grad_norm": 0.05401235818862915, "grad_norm_var": 2.312583964644606e-05, "learning_rate": 0.009992887770196895, "loss": 2.9644, "step": 1011 }, { "crossentropy": 3.110471487045288, "epoch": 0.036687935034802784, "grad_norm": 0.059769608080387115, "grad_norm_var": 2.2696812685123655e-05, "learning_rate": 0.009992856753719677, "loss": 3.0416, "step": 1012 }, { "crossentropy": 2.913721799850464, "epoch": 0.036724187935034805, "grad_norm": 0.059221409261226654, "grad_norm_var": 2.2405247666178127e-05, "learning_rate": 0.009992825669806351, "loss": 2.9166, "step": 1013 }, { "crossentropy": 2.9359912872314453, "epoch": 0.03676044083526682, "grad_norm": 0.054856449365615845, "grad_norm_var": 2.3945382950443795e-05, "learning_rate": 0.009992794518457339, "loss": 2.9388, "step": 1014 }, { "crossentropy": 3.040257453918457, "epoch": 0.03679669373549884, "grad_norm": 0.056351348757743835, "grad_norm_var": 2.456401938324207e-05, "learning_rate": 0.009992763299673059, "loss": 2.9725, "step": 1015 }, { "crossentropy": 2.8741791248321533, "epoch": 0.036832946635730855, "grad_norm": 0.05595576390624046, "grad_norm_var": 2.512250796154932e-05, "learning_rate": 0.009992732013453933, "loss": 2.9311, "step": 1016 }, { "crossentropy": 2.973700761795044, "epoch": 0.03686919953596288, "grad_norm": 0.05260361731052399, "grad_norm_var": 2.7712005302526117e-05, "learning_rate": 0.009992700659800386, "loss": 2.9204, "step": 1017 }, { "crossentropy": 2.8421223163604736, "epoch": 0.0369054524361949, "grad_norm": 0.05811430886387825, "grad_norm_var": 2.7734042492649538e-05, "learning_rate": 0.00999266923871284, "loss": 2.881, "step": 1018 }, { "crossentropy": 3.015760898590088, "epoch": 0.03694170533642691, "grad_norm": 0.05695939064025879, "grad_norm_var": 2.538232369563645e-05, "learning_rate": 0.009992637750191718, "loss": 2.9976, "step": 1019 }, { "crossentropy": 3.0832417011260986, "epoch": 0.036977958236658934, "grad_norm": 0.05507667735219002, "grad_norm_var": 2.622734356225338e-05, "learning_rate": 0.009992606194237448, "loss": 3.0501, "step": 1020 }, { "crossentropy": 2.9758784770965576, "epoch": 0.03701421113689095, "grad_norm": 0.055020738393068314, "grad_norm_var": 2.7042109360184546e-05, "learning_rate": 0.009992574570850454, "loss": 3.0302, "step": 1021 }, { "crossentropy": 3.053823471069336, "epoch": 0.03705046403712297, "grad_norm": 0.05306261032819748, "grad_norm_var": 2.5695618651919566e-05, "learning_rate": 0.009992542880031164, "loss": 3.0203, "step": 1022 }, { "crossentropy": 2.9287915229797363, "epoch": 0.03708671693735499, "grad_norm": 0.05191310495138168, "grad_norm_var": 9.09205850644906e-06, "learning_rate": 0.009992511121780005, "loss": 2.9049, "step": 1023 }, { "crossentropy": 2.9767189025878906, "epoch": 0.037122969837587005, "grad_norm": 0.06110186502337456, "grad_norm_var": 7.165443086676095e-06, "learning_rate": 0.009992479296097409, "loss": 2.9314, "step": 1024 }, { "crossentropy": 3.0706419944763184, "epoch": 0.03715922273781903, "grad_norm": 0.06358874589204788, "grad_norm_var": 1.0466425215861717e-05, "learning_rate": 0.009992447402983802, "loss": 2.9556, "step": 1025 }, { "crossentropy": 3.012380361557007, "epoch": 0.03719547563805104, "grad_norm": 0.05789479613304138, "grad_norm_var": 1.0373616685722433e-05, "learning_rate": 0.009992415442439617, "loss": 2.9673, "step": 1026 }, { "crossentropy": 2.849834442138672, "epoch": 0.03723172853828306, "grad_norm": 0.059418946504592896, "grad_norm_var": 1.03395709908989e-05, "learning_rate": 0.009992383414465287, "loss": 2.9674, "step": 1027 }, { "crossentropy": 2.881862163543701, "epoch": 0.037267981438515084, "grad_norm": 0.05995497852563858, "grad_norm_var": 1.0411857164371816e-05, "learning_rate": 0.009992351319061241, "loss": 2.8648, "step": 1028 }, { "crossentropy": 2.9012928009033203, "epoch": 0.0373042343387471, "grad_norm": 0.06002028286457062, "grad_norm_var": 1.0694387794836444e-05, "learning_rate": 0.009992319156227914, "loss": 2.9098, "step": 1029 }, { "crossentropy": 2.973731756210327, "epoch": 0.03734048723897912, "grad_norm": 0.05456991493701935, "grad_norm_var": 1.0781158641388379e-05, "learning_rate": 0.009992286925965741, "loss": 3.0396, "step": 1030 }, { "crossentropy": 3.1689579486846924, "epoch": 0.037376740139211134, "grad_norm": 0.05641657114028931, "grad_norm_var": 1.0775997190753429e-05, "learning_rate": 0.009992254628275157, "loss": 3.0626, "step": 1031 }, { "crossentropy": 2.887955665588379, "epoch": 0.037412993039443156, "grad_norm": 0.053898971527814865, "grad_norm_var": 1.1321150578066872e-05, "learning_rate": 0.0099922222631566, "loss": 2.9042, "step": 1032 }, { "crossentropy": 2.9569153785705566, "epoch": 0.03744924593967518, "grad_norm": 0.05935292690992355, "grad_norm_var": 1.0345997873319133e-05, "learning_rate": 0.009992189830610503, "loss": 3.0274, "step": 1033 }, { "crossentropy": 2.920689582824707, "epoch": 0.03748549883990719, "grad_norm": 0.061697665601968765, "grad_norm_var": 1.1550581683391197e-05, "learning_rate": 0.009992157330637306, "loss": 2.926, "step": 1034 }, { "crossentropy": 2.9162139892578125, "epoch": 0.03752175174013921, "grad_norm": 0.059648461639881134, "grad_norm_var": 1.1809855097218563e-05, "learning_rate": 0.009992124763237449, "loss": 2.885, "step": 1035 }, { "crossentropy": 2.922572612762451, "epoch": 0.03755800464037123, "grad_norm": 0.060973890125751495, "grad_norm_var": 1.194837451731744e-05, "learning_rate": 0.009992092128411371, "loss": 2.9688, "step": 1036 }, { "crossentropy": 3.0637097358703613, "epoch": 0.03759425754060325, "grad_norm": 0.05741005390882492, "grad_norm_var": 1.1345415020415767e-05, "learning_rate": 0.009992059426159515, "loss": 2.9107, "step": 1037 }, { "crossentropy": 2.956623077392578, "epoch": 0.03763051044083527, "grad_norm": 0.05542083829641342, "grad_norm_var": 1.008306912915844e-05, "learning_rate": 0.009992026656482321, "loss": 2.9204, "step": 1038 }, { "crossentropy": 2.8960325717926025, "epoch": 0.037666763341067284, "grad_norm": 0.05127057060599327, "grad_norm_var": 1.0658626456594433e-05, "learning_rate": 0.009991993819380228, "loss": 2.9042, "step": 1039 }, { "crossentropy": 2.87739896774292, "epoch": 0.037703016241299306, "grad_norm": 0.0563557893037796, "grad_norm_var": 1.028705647169442e-05, "learning_rate": 0.009991960914853685, "loss": 2.8876, "step": 1040 }, { "crossentropy": 3.00119948387146, "epoch": 0.03773926914153132, "grad_norm": 0.05578198656439781, "grad_norm_var": 8.27188237232911e-06, "learning_rate": 0.009991927942903133, "loss": 2.9786, "step": 1041 }, { "crossentropy": 2.955353260040283, "epoch": 0.03777552204176334, "grad_norm": 0.06375160813331604, "grad_norm_var": 1.071984351941127e-05, "learning_rate": 0.00999189490352902, "loss": 2.9751, "step": 1042 }, { "crossentropy": 3.050157070159912, "epoch": 0.03781177494199536, "grad_norm": 0.06113818287849426, "grad_norm_var": 1.1259310670643184e-05, "learning_rate": 0.009991861796731791, "loss": 3.0355, "step": 1043 }, { "crossentropy": 2.892045259475708, "epoch": 0.03784802784222738, "grad_norm": 0.05646098777651787, "grad_norm_var": 1.1101730732026154e-05, "learning_rate": 0.009991828622511892, "loss": 2.8994, "step": 1044 }, { "crossentropy": 2.8477823734283447, "epoch": 0.0378842807424594, "grad_norm": 0.06447753310203552, "grad_norm_var": 1.3686386157052826e-05, "learning_rate": 0.009991795380869773, "loss": 2.8662, "step": 1045 }, { "crossentropy": 3.008091688156128, "epoch": 0.03792053364269141, "grad_norm": 0.059962280094623566, "grad_norm_var": 1.3009438755824145e-05, "learning_rate": 0.00999176207180588, "loss": 2.9275, "step": 1046 }, { "crossentropy": 2.716219902038574, "epoch": 0.037956786542923435, "grad_norm": 0.0555126890540123, "grad_norm_var": 1.3296664575577523e-05, "learning_rate": 0.009991728695320667, "loss": 2.7631, "step": 1047 }, { "crossentropy": 2.982461929321289, "epoch": 0.03799303944315545, "grad_norm": 0.05951879918575287, "grad_norm_var": 1.1958106690845848e-05, "learning_rate": 0.009991695251414584, "loss": 2.9498, "step": 1048 }, { "crossentropy": 2.8186678886413574, "epoch": 0.03802929234338747, "grad_norm": 0.05476095527410507, "grad_norm_var": 1.285840941094296e-05, "learning_rate": 0.009991661740088078, "loss": 2.8977, "step": 1049 }, { "crossentropy": 2.98272705078125, "epoch": 0.03806554524361949, "grad_norm": 0.05446164309978485, "grad_norm_var": 1.2933773229354256e-05, "learning_rate": 0.009991628161341607, "loss": 3.0472, "step": 1050 }, { "crossentropy": 2.9121527671813965, "epoch": 0.038101798143851506, "grad_norm": 0.05447417497634888, "grad_norm_var": 1.3422658494479228e-05, "learning_rate": 0.009991594515175624, "loss": 2.9565, "step": 1051 }, { "crossentropy": 2.990868330001831, "epoch": 0.03813805104408353, "grad_norm": 0.05577827990055084, "grad_norm_var": 1.2778265015466785e-05, "learning_rate": 0.00999156080159058, "loss": 2.9102, "step": 1052 }, { "crossentropy": 2.847437858581543, "epoch": 0.03817430394431554, "grad_norm": 0.056824371218681335, "grad_norm_var": 1.2789823120682754e-05, "learning_rate": 0.009991527020586933, "loss": 2.9468, "step": 1053 }, { "crossentropy": 2.8976166248321533, "epoch": 0.038210556844547564, "grad_norm": 0.05627831816673279, "grad_norm_var": 1.262700071489268e-05, "learning_rate": 0.009991493172165139, "loss": 2.9988, "step": 1054 }, { "crossentropy": 2.9681055545806885, "epoch": 0.038246809744779585, "grad_norm": 0.06517865508794785, "grad_norm_var": 1.353468797021413e-05, "learning_rate": 0.009991459256325653, "loss": 2.9123, "step": 1055 }, { "crossentropy": 2.8167757987976074, "epoch": 0.0382830626450116, "grad_norm": 0.06993228197097778, "grad_norm_var": 2.17711014466287e-05, "learning_rate": 0.009991425273068937, "loss": 2.7338, "step": 1056 }, { "crossentropy": 3.0360240936279297, "epoch": 0.03831931554524362, "grad_norm": 0.056422896683216095, "grad_norm_var": 2.1520216456587653e-05, "learning_rate": 0.009991391222395448, "loss": 2.9339, "step": 1057 }, { "crossentropy": 2.985656976699829, "epoch": 0.038355568445475635, "grad_norm": 0.2068956196308136, "grad_norm_var": 0.0013917330568208475, "learning_rate": 0.009991357104305643, "loss": 2.9724, "step": 1058 }, { "crossentropy": 3.0761914253234863, "epoch": 0.03839182134570766, "grad_norm": 0.05878248065710068, "grad_norm_var": 0.0013942366678663563, "learning_rate": 0.00999132291879999, "loss": 3.0741, "step": 1059 }, { "crossentropy": 2.9215216636657715, "epoch": 0.03842807424593968, "grad_norm": 0.06340748071670532, "grad_norm_var": 0.001386696975433387, "learning_rate": 0.009991288665878943, "loss": 2.9226, "step": 1060 }, { "crossentropy": 2.892768621444702, "epoch": 0.03846432714617169, "grad_norm": 0.07058333605527878, "grad_norm_var": 0.0013859218230377166, "learning_rate": 0.009991254345542969, "loss": 2.9513, "step": 1061 }, { "crossentropy": 3.052351236343384, "epoch": 0.038500580046403714, "grad_norm": 0.0693354606628418, "grad_norm_var": 0.0013805260793141446, "learning_rate": 0.009991219957792532, "loss": 3.0357, "step": 1062 }, { "crossentropy": 2.971945285797119, "epoch": 0.03853683294663573, "grad_norm": 0.058926258236169815, "grad_norm_var": 0.0013749977286801815, "learning_rate": 0.009991185502628093, "loss": 2.9541, "step": 1063 }, { "crossentropy": 2.8585803508758545, "epoch": 0.03857308584686775, "grad_norm": 0.06255356222391129, "grad_norm_var": 0.0013715456983244916, "learning_rate": 0.00999115098005012, "loss": 2.9133, "step": 1064 }, { "crossentropy": 2.8061721324920654, "epoch": 0.03860933874709977, "grad_norm": 0.06405504047870636, "grad_norm_var": 0.0013584786133096095, "learning_rate": 0.009991116390059078, "loss": 2.9145, "step": 1065 }, { "crossentropy": 2.874053716659546, "epoch": 0.038645591647331785, "grad_norm": 0.05913647264242172, "grad_norm_var": 0.001350007730684168, "learning_rate": 0.009991081732655435, "loss": 2.9119, "step": 1066 }, { "crossentropy": 2.829465866088867, "epoch": 0.03868184454756381, "grad_norm": 0.05623326450586319, "grad_norm_var": 0.0013464340711402646, "learning_rate": 0.00999104700783966, "loss": 2.9147, "step": 1067 }, { "crossentropy": 2.917759656906128, "epoch": 0.03871809744779582, "grad_norm": 0.08093726634979248, "grad_norm_var": 0.0013361233100526735, "learning_rate": 0.009991012215612221, "loss": 2.8819, "step": 1068 }, { "crossentropy": 2.965245246887207, "epoch": 0.03875435034802784, "grad_norm": 0.06475797295570374, "grad_norm_var": 0.0013237739440811963, "learning_rate": 0.009990977355973589, "loss": 2.9363, "step": 1069 }, { "crossentropy": 3.0858962535858154, "epoch": 0.038790603248259864, "grad_norm": 0.05748013034462929, "grad_norm_var": 0.0013212306122142572, "learning_rate": 0.00999094242892423, "loss": 3.0141, "step": 1070 }, { "crossentropy": 2.8845038414001465, "epoch": 0.03882685614849188, "grad_norm": 0.05656438320875168, "grad_norm_var": 0.0013346090600542529, "learning_rate": 0.009990907434464623, "loss": 2.9234, "step": 1071 }, { "crossentropy": 3.0554263591766357, "epoch": 0.0388631090487239, "grad_norm": 0.06385012716054916, "grad_norm_var": 0.0013388008586894926, "learning_rate": 0.009990872372595237, "loss": 3.0864, "step": 1072 }, { "crossentropy": 3.02563738822937, "epoch": 0.038899361948955914, "grad_norm": 0.05899902060627937, "grad_norm_var": 0.0013339097766517486, "learning_rate": 0.009990837243316545, "loss": 2.9727, "step": 1073 }, { "crossentropy": 3.0445950031280518, "epoch": 0.038935614849187936, "grad_norm": 0.053516935557127, "grad_norm_var": 4.6178252263158934e-05, "learning_rate": 0.00999080204662902, "loss": 2.9441, "step": 1074 }, { "crossentropy": 2.754816770553589, "epoch": 0.03897186774941996, "grad_norm": 0.059115830808877945, "grad_norm_var": 4.6022412801429745e-05, "learning_rate": 0.00999076678253314, "loss": 2.8037, "step": 1075 }, { "crossentropy": 2.991581916809082, "epoch": 0.03900812064965197, "grad_norm": 0.06261463463306427, "grad_norm_var": 4.5962151191454984e-05, "learning_rate": 0.009990731451029383, "loss": 2.9755, "step": 1076 }, { "crossentropy": 2.9280569553375244, "epoch": 0.03904437354988399, "grad_norm": 0.054344043135643005, "grad_norm_var": 4.47605810680017e-05, "learning_rate": 0.009990696052118223, "loss": 2.917, "step": 1077 }, { "crossentropy": 2.8641324043273926, "epoch": 0.03908062645011601, "grad_norm": 0.13197897374629974, "grad_norm_var": 0.0003562937225672496, "learning_rate": 0.00999066058580014, "loss": 2.9343, "step": 1078 }, { "crossentropy": 3.0008656978607178, "epoch": 0.03911687935034803, "grad_norm": 0.05676545575261116, "grad_norm_var": 0.00035842661114603294, "learning_rate": 0.00999062505207561, "loss": 3.0036, "step": 1079 }, { "crossentropy": 2.936464548110962, "epoch": 0.03915313225058004, "grad_norm": 0.06606167554855347, "grad_norm_var": 0.0003579666019342217, "learning_rate": 0.009990589450945117, "loss": 2.9583, "step": 1080 }, { "crossentropy": 2.936250925064087, "epoch": 0.039189385150812064, "grad_norm": 0.05634692683815956, "grad_norm_var": 0.0003630630417248371, "learning_rate": 0.00999055378240914, "loss": 2.9327, "step": 1081 }, { "crossentropy": 3.0010640621185303, "epoch": 0.039225638051044086, "grad_norm": 0.05787204951047897, "grad_norm_var": 0.0003641378300999846, "learning_rate": 0.009990518046468161, "loss": 2.9555, "step": 1082 }, { "crossentropy": 2.892986536026001, "epoch": 0.0392618909512761, "grad_norm": 0.06020532548427582, "grad_norm_var": 0.00036056575562136867, "learning_rate": 0.009990482243122661, "loss": 2.9639, "step": 1083 }, { "crossentropy": 2.961540460586548, "epoch": 0.03929814385150812, "grad_norm": 0.0584711916744709, "grad_norm_var": 0.0003446354458849526, "learning_rate": 0.009990446372373127, "loss": 2.9463, "step": 1084 }, { "crossentropy": 2.888306140899658, "epoch": 0.039334396751740136, "grad_norm": 0.06487250328063965, "grad_norm_var": 0.000344652665393097, "learning_rate": 0.00999041043422004, "loss": 2.9592, "step": 1085 }, { "crossentropy": 2.8860676288604736, "epoch": 0.03937064965197216, "grad_norm": 0.0642077848315239, "grad_norm_var": 0.0003419100409972592, "learning_rate": 0.009990374428663887, "loss": 2.9874, "step": 1086 }, { "crossentropy": 3.0480918884277344, "epoch": 0.03940690255220418, "grad_norm": 0.0591641403734684, "grad_norm_var": 0.00033971631061996036, "learning_rate": 0.009990338355705152, "loss": 3.0242, "step": 1087 }, { "crossentropy": 3.0192196369171143, "epoch": 0.03944315545243619, "grad_norm": 0.05526174604892731, "grad_norm_var": 0.0003448119003880418, "learning_rate": 0.009990302215344329, "loss": 2.9688, "step": 1088 }, { "crossentropy": 2.995063304901123, "epoch": 0.039479408352668215, "grad_norm": 0.052814509719610214, "grad_norm_var": 0.00035110967748800396, "learning_rate": 0.009990266007581898, "loss": 2.9602, "step": 1089 }, { "crossentropy": 2.9678807258605957, "epoch": 0.03951566125290023, "grad_norm": 0.058095019310712814, "grad_norm_var": 0.00034641686963076394, "learning_rate": 0.009990229732418353, "loss": 2.9567, "step": 1090 }, { "crossentropy": 3.020216464996338, "epoch": 0.03955191415313225, "grad_norm": 0.05123753100633621, "grad_norm_var": 0.0003550452995348304, "learning_rate": 0.009990193389854183, "loss": 3.0147, "step": 1091 }, { "crossentropy": 2.9286022186279297, "epoch": 0.03958816705336427, "grad_norm": 0.05452517792582512, "grad_norm_var": 0.0003597068679773218, "learning_rate": 0.009990156979889878, "loss": 2.9575, "step": 1092 }, { "crossentropy": 2.7767932415008545, "epoch": 0.039624419953596286, "grad_norm": 0.05356564000248909, "grad_norm_var": 0.0003606056471820218, "learning_rate": 0.009990120502525929, "loss": 2.8479, "step": 1093 }, { "crossentropy": 3.13061261177063, "epoch": 0.03966067285382831, "grad_norm": 0.05702202767133713, "grad_norm_var": 1.827675392902864e-05, "learning_rate": 0.00999008395776283, "loss": 3.0206, "step": 1094 }, { "crossentropy": 2.8203535079956055, "epoch": 0.03969692575406032, "grad_norm": 0.05198553204536438, "grad_norm_var": 2.0431338029091617e-05, "learning_rate": 0.009990047345601076, "loss": 2.8464, "step": 1095 }, { "crossentropy": 2.967087984085083, "epoch": 0.03973317865429234, "grad_norm": 0.05597798526287079, "grad_norm_var": 1.5418874208395805e-05, "learning_rate": 0.009990010666041159, "loss": 2.9876, "step": 1096 }, { "crossentropy": 2.906562566757202, "epoch": 0.039769431554524365, "grad_norm": 0.05396462604403496, "grad_norm_var": 1.5973583370106014e-05, "learning_rate": 0.009989973919083574, "loss": 2.9894, "step": 1097 }, { "crossentropy": 2.877403736114502, "epoch": 0.03980568445475638, "grad_norm": 0.05425731837749481, "grad_norm_var": 1.6286874573071854e-05, "learning_rate": 0.009989937104728822, "loss": 2.8861, "step": 1098 }, { "crossentropy": 3.0573461055755615, "epoch": 0.0398419373549884, "grad_norm": 0.05272733047604561, "grad_norm_var": 1.6188901239638556e-05, "learning_rate": 0.009989900222977393, "loss": 2.9556, "step": 1099 }, { "crossentropy": 3.002443313598633, "epoch": 0.039878190255220415, "grad_norm": 0.05118293687701225, "grad_norm_var": 1.7237979209224395e-05, "learning_rate": 0.009989863273829792, "loss": 2.9147, "step": 1100 }, { "crossentropy": 2.89210844039917, "epoch": 0.039914443155452436, "grad_norm": 0.05443362891674042, "grad_norm_var": 1.1252442997499352e-05, "learning_rate": 0.009989826257286514, "loss": 2.9405, "step": 1101 }, { "crossentropy": 2.9013113975524902, "epoch": 0.03995069605568446, "grad_norm": 0.05435929074883461, "grad_norm_var": 5.258163902817763e-06, "learning_rate": 0.009989789173348059, "loss": 2.9713, "step": 1102 }, { "crossentropy": 2.843703269958496, "epoch": 0.03998694895591647, "grad_norm": 0.05697063356637955, "grad_norm_var": 4.168712973795508e-06, "learning_rate": 0.00998975202201493, "loss": 2.9099, "step": 1103 }, { "crossentropy": 2.856498956680298, "epoch": 0.040023201856148494, "grad_norm": 0.05654578283429146, "grad_norm_var": 4.440899653260164e-06, "learning_rate": 0.009989714803287628, "loss": 2.8286, "step": 1104 }, { "crossentropy": 2.8985204696655273, "epoch": 0.04005945475638051, "grad_norm": 0.060379013419151306, "grad_norm_var": 6.464465064931248e-06, "learning_rate": 0.009989677517166656, "loss": 2.9822, "step": 1105 }, { "crossentropy": 3.014573574066162, "epoch": 0.04009570765661253, "grad_norm": 0.0578666590154171, "grad_norm_var": 6.368214754755528e-06, "learning_rate": 0.009989640163652516, "loss": 2.996, "step": 1106 }, { "crossentropy": 2.9601521492004395, "epoch": 0.04013196055684455, "grad_norm": 0.0608530230820179, "grad_norm_var": 7.5633861629189595e-06, "learning_rate": 0.009989602742745714, "loss": 2.8758, "step": 1107 }, { "crossentropy": 2.813703775405884, "epoch": 0.040168213457076565, "grad_norm": 0.06257050484418869, "grad_norm_var": 1.0655888643285038e-05, "learning_rate": 0.009989565254446754, "loss": 2.8939, "step": 1108 }, { "crossentropy": 2.8938894271850586, "epoch": 0.04020446635730859, "grad_norm": 0.054584257304668427, "grad_norm_var": 1.0401471457132232e-05, "learning_rate": 0.009989527698756145, "loss": 2.844, "step": 1109 }, { "crossentropy": 2.8507518768310547, "epoch": 0.0402407192575406, "grad_norm": 0.05765366181731224, "grad_norm_var": 1.0514161033251673e-05, "learning_rate": 0.00998949007567439, "loss": 2.9452, "step": 1110 }, { "crossentropy": 3.0166754722595215, "epoch": 0.04027697215777262, "grad_norm": 0.06494962424039841, "grad_norm_var": 1.4045473825874399e-05, "learning_rate": 0.009989452385202003, "loss": 2.9128, "step": 1111 }, { "crossentropy": 2.8756229877471924, "epoch": 0.04031322505800464, "grad_norm": 0.06163807958364487, "grad_norm_var": 1.540494333432558e-05, "learning_rate": 0.009989414627339486, "loss": 2.8703, "step": 1112 }, { "crossentropy": 3.058265447616577, "epoch": 0.04034947795823666, "grad_norm": 0.06108400970697403, "grad_norm_var": 1.5517252977875706e-05, "learning_rate": 0.009989376802087356, "loss": 3.0328, "step": 1113 }, { "crossentropy": 2.8840978145599365, "epoch": 0.04038573085846868, "grad_norm": 0.060853321105241776, "grad_norm_var": 1.5271626613784738e-05, "learning_rate": 0.00998933890944612, "loss": 2.8794, "step": 1114 }, { "crossentropy": 2.8105556964874268, "epoch": 0.040421983758700694, "grad_norm": 0.05440807342529297, "grad_norm_var": 1.4257453743419097e-05, "learning_rate": 0.009989300949416288, "loss": 2.8812, "step": 1115 }, { "crossentropy": 2.747284173965454, "epoch": 0.040458236658932716, "grad_norm": 0.06247404217720032, "grad_norm_var": 1.1743092011558496e-05, "learning_rate": 0.009989262921998377, "loss": 2.8756, "step": 1116 }, { "crossentropy": 2.999858856201172, "epoch": 0.04049448955916473, "grad_norm": 0.05566103756427765, "grad_norm_var": 1.1114249870134568e-05, "learning_rate": 0.0099892248271929, "loss": 2.9362, "step": 1117 }, { "crossentropy": 2.9290504455566406, "epoch": 0.04053074245939675, "grad_norm": 0.051280323415994644, "grad_norm_var": 1.3582417242837218e-05, "learning_rate": 0.00998918666500037, "loss": 2.9255, "step": 1118 }, { "crossentropy": 2.8759095668792725, "epoch": 0.04056699535962877, "grad_norm": 0.05941528454422951, "grad_norm_var": 1.3380590368084789e-05, "learning_rate": 0.0099891484354213, "loss": 2.9241, "step": 1119 }, { "crossentropy": 2.9794399738311768, "epoch": 0.04060324825986079, "grad_norm": 0.059651006013154984, "grad_norm_var": 1.3013268371604193e-05, "learning_rate": 0.00998911013845621, "loss": 2.932, "step": 1120 }, { "crossentropy": 2.880331516265869, "epoch": 0.04063950116009281, "grad_norm": 0.06016841530799866, "grad_norm_var": 1.297963794587246e-05, "learning_rate": 0.009989071774105619, "loss": 2.9421, "step": 1121 }, { "crossentropy": 2.933192253112793, "epoch": 0.04067575406032482, "grad_norm": 0.05490773916244507, "grad_norm_var": 1.4001369684035527e-05, "learning_rate": 0.00998903334237004, "loss": 2.9612, "step": 1122 }, { "crossentropy": 2.892385482788086, "epoch": 0.040712006960556844, "grad_norm": 0.051961757242679596, "grad_norm_var": 1.6608624308762863e-05, "learning_rate": 0.009988994843249995, "loss": 2.9712, "step": 1123 }, { "crossentropy": 2.9136524200439453, "epoch": 0.040748259860788866, "grad_norm": 0.05498221144080162, "grad_norm_var": 1.5915892828093544e-05, "learning_rate": 0.009988956276746005, "loss": 2.9445, "step": 1124 }, { "crossentropy": 3.0138232707977295, "epoch": 0.04078451276102088, "grad_norm": 0.05721759796142578, "grad_norm_var": 1.5201057754580389e-05, "learning_rate": 0.009988917642858587, "loss": 2.9962, "step": 1125 }, { "crossentropy": 2.9883124828338623, "epoch": 0.0408207656612529, "grad_norm": 0.05192161351442337, "grad_norm_var": 1.7533903868902945e-05, "learning_rate": 0.009988878941588267, "loss": 2.9485, "step": 1126 }, { "crossentropy": 3.0104994773864746, "epoch": 0.040857018561484916, "grad_norm": 0.05621252954006195, "grad_norm_var": 1.3813966306800742e-05, "learning_rate": 0.009988840172935566, "loss": 2.9656, "step": 1127 }, { "crossentropy": 2.974032163619995, "epoch": 0.04089327146171694, "grad_norm": 0.054864007979631424, "grad_norm_var": 1.2596513527647856e-05, "learning_rate": 0.009988801336901007, "loss": 2.8617, "step": 1128 }, { "crossentropy": 2.9221351146698, "epoch": 0.04092952436194896, "grad_norm": 0.05335924029350281, "grad_norm_var": 1.1801801184597541e-05, "learning_rate": 0.009988762433485116, "loss": 2.9514, "step": 1129 }, { "crossentropy": 3.005882978439331, "epoch": 0.04096577726218097, "grad_norm": 0.05953366309404373, "grad_norm_var": 1.1093392247440009e-05, "learning_rate": 0.009988723462688417, "loss": 2.9609, "step": 1130 }, { "crossentropy": 2.901883840560913, "epoch": 0.041002030162412995, "grad_norm": 0.05853910744190216, "grad_norm_var": 1.1213653093869862e-05, "learning_rate": 0.009988684424511436, "loss": 2.916, "step": 1131 }, { "crossentropy": 2.9293580055236816, "epoch": 0.04103828306264501, "grad_norm": 0.054168086498975754, "grad_norm_var": 8.781362604691015e-06, "learning_rate": 0.009988645318954704, "loss": 2.9866, "step": 1132 }, { "crossentropy": 2.7579448223114014, "epoch": 0.04107453596287703, "grad_norm": 0.05489306524395943, "grad_norm_var": 8.839132121822985e-06, "learning_rate": 0.009988606146018745, "loss": 2.8077, "step": 1133 }, { "crossentropy": 2.839462995529175, "epoch": 0.04111078886310905, "grad_norm": 0.05828847363591194, "grad_norm_var": 7.669393160859792e-06, "learning_rate": 0.009988566905704089, "loss": 2.8515, "step": 1134 }, { "crossentropy": 2.7832770347595215, "epoch": 0.041147041763341066, "grad_norm": 0.06568261981010437, "grad_norm_var": 1.2765037996898914e-05, "learning_rate": 0.009988527598011266, "loss": 2.874, "step": 1135 }, { "crossentropy": 2.9708971977233887, "epoch": 0.04118329466357309, "grad_norm": 0.055424656718969345, "grad_norm_var": 1.218858711627575e-05, "learning_rate": 0.009988488222940808, "loss": 3.0328, "step": 1136 }, { "crossentropy": 2.9701340198516846, "epoch": 0.0412195475638051, "grad_norm": 0.050518982112407684, "grad_norm_var": 1.3137519527425811e-05, "learning_rate": 0.009988448780493245, "loss": 2.9113, "step": 1137 }, { "crossentropy": 2.9855198860168457, "epoch": 0.04125580046403712, "grad_norm": 0.06095551699399948, "grad_norm_var": 1.4720364536428138e-05, "learning_rate": 0.009988409270669114, "loss": 2.9156, "step": 1138 }, { "crossentropy": 3.0876877307891846, "epoch": 0.041292053364269145, "grad_norm": 0.06145065277814865, "grad_norm_var": 1.5039167931605451e-05, "learning_rate": 0.009988369693468944, "loss": 2.923, "step": 1139 }, { "crossentropy": 2.839892864227295, "epoch": 0.04132830626450116, "grad_norm": 0.06004701927304268, "grad_norm_var": 1.5448126546407002e-05, "learning_rate": 0.00998833004889327, "loss": 2.8082, "step": 1140 }, { "crossentropy": 2.9926700592041016, "epoch": 0.04136455916473318, "grad_norm": 0.05442432686686516, "grad_norm_var": 1.587979863149341e-05, "learning_rate": 0.00998829033694263, "loss": 2.8599, "step": 1141 }, { "crossentropy": 2.8689193725585938, "epoch": 0.041400812064965195, "grad_norm": 0.06240992248058319, "grad_norm_var": 1.580328394243374e-05, "learning_rate": 0.009988250557617559, "loss": 2.836, "step": 1142 }, { "crossentropy": 2.9137117862701416, "epoch": 0.041437064965197216, "grad_norm": 0.05703837051987648, "grad_norm_var": 1.5698831638810464e-05, "learning_rate": 0.009988210710918594, "loss": 2.9385, "step": 1143 }, { "crossentropy": 2.9308183193206787, "epoch": 0.04147331786542923, "grad_norm": 0.05392833054065704, "grad_norm_var": 1.6094866197634224e-05, "learning_rate": 0.009988170796846274, "loss": 2.94, "step": 1144 }, { "crossentropy": 2.916253089904785, "epoch": 0.04150957076566125, "grad_norm": 0.057855091989040375, "grad_norm_var": 1.485118991190326e-05, "learning_rate": 0.009988130815401137, "loss": 2.8304, "step": 1145 }, { "crossentropy": 3.016124963760376, "epoch": 0.041545823665893274, "grad_norm": 0.0547703392803669, "grad_norm_var": 1.5182407835584212e-05, "learning_rate": 0.009988090766583723, "loss": 2.951, "step": 1146 }, { "crossentropy": 2.9870588779449463, "epoch": 0.04158207656612529, "grad_norm": 0.05655821040272713, "grad_norm_var": 1.5159719481263172e-05, "learning_rate": 0.009988050650394575, "loss": 2.9909, "step": 1147 }, { "crossentropy": 2.7873291969299316, "epoch": 0.04161832946635731, "grad_norm": 0.061157334595918655, "grad_norm_var": 1.5200203510656815e-05, "learning_rate": 0.009988010466834231, "loss": 2.7968, "step": 1148 }, { "crossentropy": 2.8394486904144287, "epoch": 0.041654582366589324, "grad_norm": 0.05699891597032547, "grad_norm_var": 1.4650576666015944e-05, "learning_rate": 0.009987970215903239, "loss": 2.8551, "step": 1149 }, { "crossentropy": 2.8889269828796387, "epoch": 0.041690835266821345, "grad_norm": 0.055470772087574005, "grad_norm_var": 1.50268794728657e-05, "learning_rate": 0.009987929897602136, "loss": 2.8963, "step": 1150 }, { "crossentropy": 2.768990993499756, "epoch": 0.04172708816705337, "grad_norm": 0.05300021544098854, "grad_norm_var": 1.17386679163195e-05, "learning_rate": 0.009987889511931474, "loss": 2.7529, "step": 1151 }, { "crossentropy": 2.9056336879730225, "epoch": 0.04176334106728538, "grad_norm": 0.05839546397328377, "grad_norm_var": 1.166605405555146e-05, "learning_rate": 0.009987849058891794, "loss": 2.9199, "step": 1152 }, { "crossentropy": 2.757187843322754, "epoch": 0.0417995939675174, "grad_norm": 0.04818135127425194, "grad_norm_var": 1.4085657479910527e-05, "learning_rate": 0.009987808538483642, "loss": 2.7964, "step": 1153 }, { "crossentropy": 2.8780996799468994, "epoch": 0.04183584686774942, "grad_norm": 0.057022713124752045, "grad_norm_var": 1.2999207025831718e-05, "learning_rate": 0.009987767950707566, "loss": 2.8985, "step": 1154 }, { "crossentropy": 2.9527013301849365, "epoch": 0.04187209976798144, "grad_norm": 0.054454006254673004, "grad_norm_var": 1.1714939618223616e-05, "learning_rate": 0.009987727295564117, "loss": 2.8867, "step": 1155 }, { "crossentropy": 2.9219048023223877, "epoch": 0.04190835266821346, "grad_norm": 0.10176730155944824, "grad_norm_var": 0.0001410276643110051, "learning_rate": 0.00998768657305384, "loss": 2.9503, "step": 1156 }, { "crossentropy": 2.9237232208251953, "epoch": 0.041944605568445474, "grad_norm": 0.052696116268634796, "grad_norm_var": 0.00014226052677663206, "learning_rate": 0.009987645783177287, "loss": 2.9602, "step": 1157 }, { "crossentropy": 2.829944610595703, "epoch": 0.041980858468677495, "grad_norm": 0.05551503598690033, "grad_norm_var": 0.00014196504324294102, "learning_rate": 0.009987604925935008, "loss": 2.8586, "step": 1158 }, { "crossentropy": 2.8848063945770264, "epoch": 0.04201711136890951, "grad_norm": 0.05654885992407799, "grad_norm_var": 0.00014207056121045552, "learning_rate": 0.009987564001327555, "loss": 2.8984, "step": 1159 }, { "crossentropy": 2.8160932064056396, "epoch": 0.04205336426914153, "grad_norm": 0.06833651661872864, "grad_norm_var": 0.00014646441266512919, "learning_rate": 0.009987523009355483, "loss": 2.8222, "step": 1160 }, { "crossentropy": 2.836780309677124, "epoch": 0.04208961716937355, "grad_norm": 0.056827012449502945, "grad_norm_var": 0.000146727921183844, "learning_rate": 0.009987481950019344, "loss": 2.8931, "step": 1161 }, { "crossentropy": 2.867375135421753, "epoch": 0.04212587006960557, "grad_norm": 0.05921230837702751, "grad_norm_var": 0.00014531907781798747, "learning_rate": 0.00998744082331969, "loss": 2.8767, "step": 1162 }, { "crossentropy": 2.999027729034424, "epoch": 0.04216212296983759, "grad_norm": 0.0592101514339447, "grad_norm_var": 0.00014471529249991208, "learning_rate": 0.009987399629257083, "loss": 2.8678, "step": 1163 }, { "crossentropy": 2.8616533279418945, "epoch": 0.0421983758700696, "grad_norm": 0.05879746004939079, "grad_norm_var": 0.00014459682248902321, "learning_rate": 0.009987358367832074, "loss": 2.891, "step": 1164 }, { "crossentropy": 2.826003074645996, "epoch": 0.042234628770301624, "grad_norm": 0.05453408136963844, "grad_norm_var": 0.000145807421993141, "learning_rate": 0.00998731703904522, "loss": 2.8801, "step": 1165 }, { "crossentropy": 2.968885898590088, "epoch": 0.042270881670533646, "grad_norm": 0.060906924307346344, "grad_norm_var": 0.0001448259305472312, "learning_rate": 0.009987275642897081, "loss": 3.013, "step": 1166 }, { "crossentropy": 2.7548482418060303, "epoch": 0.04230713457076566, "grad_norm": 0.05500271916389465, "grad_norm_var": 0.00014328428147191563, "learning_rate": 0.009987234179388217, "loss": 2.753, "step": 1167 }, { "crossentropy": 2.8974876403808594, "epoch": 0.04234338747099768, "grad_norm": 0.05214027687907219, "grad_norm_var": 0.00014693285400637073, "learning_rate": 0.009987192648519187, "loss": 2.8898, "step": 1168 }, { "crossentropy": 2.7908706665039062, "epoch": 0.042379640371229696, "grad_norm": 0.05508797988295555, "grad_norm_var": 0.00013953979729521647, "learning_rate": 0.009987151050290551, "loss": 2.8639, "step": 1169 }, { "crossentropy": 2.768899917602539, "epoch": 0.04241589327146172, "grad_norm": 0.060908444225788116, "grad_norm_var": 0.00013900379079141568, "learning_rate": 0.009987109384702873, "loss": 2.8048, "step": 1170 }, { "crossentropy": 2.840951442718506, "epoch": 0.04245214617169374, "grad_norm": 0.06201393902301788, "grad_norm_var": 0.00013686296909851397, "learning_rate": 0.009987067651756713, "loss": 2.8732, "step": 1171 }, { "crossentropy": 2.8319129943847656, "epoch": 0.04248839907192575, "grad_norm": 0.055298324674367905, "grad_norm_var": 1.6719609224283977e-05, "learning_rate": 0.009987025851452638, "loss": 2.9335, "step": 1172 }, { "crossentropy": 2.9706859588623047, "epoch": 0.042524651972157775, "grad_norm": 0.06044822186231613, "grad_norm_var": 1.531405568325553e-05, "learning_rate": 0.00998698398379121, "loss": 2.9967, "step": 1173 }, { "crossentropy": 2.751816749572754, "epoch": 0.04256090487238979, "grad_norm": 0.0627799853682518, "grad_norm_var": 1.603688423891525e-05, "learning_rate": 0.009986942048772996, "loss": 2.8142, "step": 1174 }, { "crossentropy": 2.822906970977783, "epoch": 0.04259715777262181, "grad_norm": 0.0576271153986454, "grad_norm_var": 1.5810589587485444e-05, "learning_rate": 0.009986900046398559, "loss": 2.797, "step": 1175 }, { "crossentropy": 2.919542074203491, "epoch": 0.04263341067285383, "grad_norm": 0.053354259580373764, "grad_norm_var": 1.058104812155184e-05, "learning_rate": 0.009986857976668472, "loss": 2.9149, "step": 1176 }, { "crossentropy": 2.8596224784851074, "epoch": 0.042669663573085846, "grad_norm": 0.06434943526983261, "grad_norm_var": 1.3182625791866714e-05, "learning_rate": 0.009986815839583298, "loss": 2.8749, "step": 1177 }, { "crossentropy": 2.832634210586548, "epoch": 0.04270591647331787, "grad_norm": 0.05191702023148537, "grad_norm_var": 1.5552947220961084e-05, "learning_rate": 0.00998677363514361, "loss": 2.9078, "step": 1178 }, { "crossentropy": 2.972543239593506, "epoch": 0.04274216937354988, "grad_norm": 0.052675314247608185, "grad_norm_var": 1.697020063714504e-05, "learning_rate": 0.009986731363349977, "loss": 2.9389, "step": 1179 }, { "crossentropy": 2.712378740310669, "epoch": 0.0427784222737819, "grad_norm": 0.070809006690979, "grad_norm_var": 2.8281520060519035e-05, "learning_rate": 0.009986689024202968, "loss": 2.768, "step": 1180 }, { "crossentropy": 2.976285934448242, "epoch": 0.04281467517401392, "grad_norm": 0.052213191986083984, "grad_norm_var": 2.972655253029935e-05, "learning_rate": 0.009986646617703156, "loss": 2.9146, "step": 1181 }, { "crossentropy": 2.910865545272827, "epoch": 0.04285092807424594, "grad_norm": 0.05602560192346573, "grad_norm_var": 2.930477730379554e-05, "learning_rate": 0.009986604143851113, "loss": 2.9101, "step": 1182 }, { "crossentropy": 2.950162410736084, "epoch": 0.04288718097447796, "grad_norm": 0.056597061455249786, "grad_norm_var": 2.8897558864343642e-05, "learning_rate": 0.009986561602647414, "loss": 2.9683, "step": 1183 }, { "crossentropy": 2.8900160789489746, "epoch": 0.042923433874709975, "grad_norm": 0.05792435258626938, "grad_norm_var": 2.6650436735960124e-05, "learning_rate": 0.009986518994092634, "loss": 2.8291, "step": 1184 }, { "crossentropy": 2.881138324737549, "epoch": 0.042959686774941996, "grad_norm": 0.0558072067797184, "grad_norm_var": 2.6391350982164717e-05, "learning_rate": 0.009986476318187349, "loss": 2.944, "step": 1185 }, { "crossentropy": 2.878627300262451, "epoch": 0.04299593967517401, "grad_norm": 0.05507408082485199, "grad_norm_var": 2.638994583397941e-05, "learning_rate": 0.009986433574932133, "loss": 2.8625, "step": 1186 }, { "crossentropy": 2.8905229568481445, "epoch": 0.04303219257540603, "grad_norm": 0.054207075387239456, "grad_norm_var": 2.5820212037776697e-05, "learning_rate": 0.009986390764327564, "loss": 2.9276, "step": 1187 }, { "crossentropy": 2.7621569633483887, "epoch": 0.043068445475638054, "grad_norm": 0.05350214242935181, "grad_norm_var": 2.650583615003878e-05, "learning_rate": 0.00998634788637422, "loss": 2.8632, "step": 1188 }, { "crossentropy": 2.9218616485595703, "epoch": 0.04310469837587007, "grad_norm": 0.05335371941328049, "grad_norm_var": 2.6585548240955618e-05, "learning_rate": 0.009986304941072681, "loss": 2.9175, "step": 1189 }, { "crossentropy": 3.0201284885406494, "epoch": 0.04314095127610209, "grad_norm": 0.05284619331359863, "grad_norm_var": 2.478424082125215e-05, "learning_rate": 0.009986261928423527, "loss": 3.0049, "step": 1190 }, { "crossentropy": 2.9039058685302734, "epoch": 0.043177204176334104, "grad_norm": 0.05399677902460098, "grad_norm_var": 2.4889413297996912e-05, "learning_rate": 0.00998621884842734, "loss": 2.9487, "step": 1191 }, { "crossentropy": 2.7874696254730225, "epoch": 0.043213457076566125, "grad_norm": 0.05303391069173813, "grad_norm_var": 2.500523785337368e-05, "learning_rate": 0.0099861757010847, "loss": 2.7853, "step": 1192 }, { "crossentropy": 2.8297197818756104, "epoch": 0.04324970997679815, "grad_norm": 0.056688714772462845, "grad_norm_var": 2.003831598158141e-05, "learning_rate": 0.009986132486396189, "loss": 2.8863, "step": 1193 }, { "crossentropy": 2.8987114429473877, "epoch": 0.04328596287703016, "grad_norm": 0.05282575637102127, "grad_norm_var": 1.966585891106694e-05, "learning_rate": 0.009986089204362393, "loss": 2.8568, "step": 1194 }, { "crossentropy": 3.1022746562957764, "epoch": 0.04332221577726218, "grad_norm": 0.05275965854525566, "grad_norm_var": 1.963483251135693e-05, "learning_rate": 0.009986045854983897, "loss": 2.9952, "step": 1195 }, { "crossentropy": 2.8713934421539307, "epoch": 0.0433584686774942, "grad_norm": 0.06445924937725067, "grad_norm_var": 9.175910446483174e-06, "learning_rate": 0.009986002438261284, "loss": 2.8669, "step": 1196 }, { "crossentropy": 3.0037403106689453, "epoch": 0.04339472157772622, "grad_norm": 0.05174782872200012, "grad_norm_var": 9.367461130963909e-06, "learning_rate": 0.009985958954195141, "loss": 2.942, "step": 1197 }, { "crossentropy": 2.9952309131622314, "epoch": 0.04343097447795824, "grad_norm": 0.05547742173075676, "grad_norm_var": 9.315160411268026e-06, "learning_rate": 0.009985915402786054, "loss": 2.9951, "step": 1198 }, { "crossentropy": 2.9712367057800293, "epoch": 0.043467227378190254, "grad_norm": 0.055935557931661606, "grad_norm_var": 9.203308128650362e-06, "learning_rate": 0.009985871784034617, "loss": 2.9483, "step": 1199 }, { "crossentropy": 2.7964541912078857, "epoch": 0.043503480278422275, "grad_norm": 0.05213291943073273, "grad_norm_var": 9.024051457348636e-06, "learning_rate": 0.009985828097941412, "loss": 2.8275, "step": 1200 }, { "crossentropy": 2.839080810546875, "epoch": 0.04353973317865429, "grad_norm": 0.052003972232341766, "grad_norm_var": 9.323783274646907e-06, "learning_rate": 0.009985784344507033, "loss": 2.8532, "step": 1201 }, { "crossentropy": 2.8394601345062256, "epoch": 0.04357598607888631, "grad_norm": 0.0507194958627224, "grad_norm_var": 1.0104671916164043e-05, "learning_rate": 0.009985740523732071, "loss": 2.8319, "step": 1202 }, { "crossentropy": 2.8313045501708984, "epoch": 0.04361223897911833, "grad_norm": 0.05719850957393646, "grad_norm_var": 1.0704418756862268e-05, "learning_rate": 0.009985696635617118, "loss": 2.802, "step": 1203 }, { "crossentropy": 2.942594528198242, "epoch": 0.04364849187935035, "grad_norm": 0.06281400471925735, "grad_norm_var": 1.5142407054706484e-05, "learning_rate": 0.009985652680162766, "loss": 2.9833, "step": 1204 }, { "crossentropy": 2.8545591831207275, "epoch": 0.04368474477958237, "grad_norm": 0.054000124335289, "grad_norm_var": 1.5037440892104863e-05, "learning_rate": 0.009985608657369607, "loss": 2.9082, "step": 1205 }, { "crossentropy": 2.722557306289673, "epoch": 0.04372099767981438, "grad_norm": 0.053632184863090515, "grad_norm_var": 1.4859243069157736e-05, "learning_rate": 0.009985564567238236, "loss": 2.8603, "step": 1206 }, { "crossentropy": 2.7235028743743896, "epoch": 0.043757250580046404, "grad_norm": 0.05378083512187004, "grad_norm_var": 1.4890010042114174e-05, "learning_rate": 0.009985520409769253, "loss": 2.7856, "step": 1207 }, { "crossentropy": 2.9208364486694336, "epoch": 0.043793503480278426, "grad_norm": 0.05714304745197296, "grad_norm_var": 1.4895179078932886e-05, "learning_rate": 0.00998547618496325, "loss": 2.8658, "step": 1208 }, { "crossentropy": 2.788606643676758, "epoch": 0.04382975638051044, "grad_norm": 0.05362454056739807, "grad_norm_var": 1.4876823429862904e-05, "learning_rate": 0.009985431892820824, "loss": 2.7928, "step": 1209 }, { "crossentropy": 2.9641311168670654, "epoch": 0.04386600928074246, "grad_norm": 0.05278218165040016, "grad_norm_var": 1.488966701169899e-05, "learning_rate": 0.009985387533342575, "loss": 2.8933, "step": 1210 }, { "crossentropy": 2.806209087371826, "epoch": 0.043902262180974476, "grad_norm": 0.05437735468149185, "grad_norm_var": 1.4567148626609626e-05, "learning_rate": 0.009985343106529102, "loss": 2.8377, "step": 1211 }, { "crossentropy": 2.9423789978027344, "epoch": 0.0439385150812065, "grad_norm": 0.06081034988164902, "grad_norm_var": 1.0852811726705323e-05, "learning_rate": 0.009985298612381005, "loss": 2.929, "step": 1212 }, { "crossentropy": 3.014434576034546, "epoch": 0.04397476798143851, "grad_norm": 0.0652114525437355, "grad_norm_var": 1.6548161277924316e-05, "learning_rate": 0.009985254050898885, "loss": 2.9948, "step": 1213 }, { "crossentropy": 2.7298226356506348, "epoch": 0.04401102088167053, "grad_norm": 0.07018118351697922, "grad_norm_var": 2.9569936289950553e-05, "learning_rate": 0.009985209422083344, "loss": 2.6886, "step": 1214 }, { "crossentropy": 2.914000988006592, "epoch": 0.044047273781902554, "grad_norm": 0.06863563507795334, "grad_norm_var": 3.844642143883455e-05, "learning_rate": 0.009985164725934983, "loss": 2.9082, "step": 1215 }, { "crossentropy": 2.96396541595459, "epoch": 0.04408352668213457, "grad_norm": 0.05818355828523636, "grad_norm_var": 3.645267093172518e-05, "learning_rate": 0.009985119962454407, "loss": 2.9051, "step": 1216 }, { "crossentropy": 2.8272228240966797, "epoch": 0.04411977958236659, "grad_norm": 0.054493099451065063, "grad_norm_var": 3.491010858635794e-05, "learning_rate": 0.009985075131642223, "loss": 2.7362, "step": 1217 }, { "crossentropy": 2.954188108444214, "epoch": 0.044156032482598605, "grad_norm": 0.05422961339354515, "grad_norm_var": 3.228484072109409e-05, "learning_rate": 0.009985030233499033, "loss": 3.0017, "step": 1218 }, { "crossentropy": 2.88002610206604, "epoch": 0.044192285382830626, "grad_norm": 0.05320289731025696, "grad_norm_var": 3.381278329192729e-05, "learning_rate": 0.009984985268025444, "loss": 2.9052, "step": 1219 }, { "crossentropy": 2.7948594093322754, "epoch": 0.04422853828306265, "grad_norm": 0.05649898201227188, "grad_norm_var": 3.220459211793197e-05, "learning_rate": 0.009984940235222063, "loss": 2.7925, "step": 1220 }, { "crossentropy": 2.820466995239258, "epoch": 0.04426479118329466, "grad_norm": 0.05579555034637451, "grad_norm_var": 3.155645296792486e-05, "learning_rate": 0.009984895135089503, "loss": 2.8846, "step": 1221 }, { "crossentropy": 2.8658244609832764, "epoch": 0.04430104408352668, "grad_norm": 0.05679238960146904, "grad_norm_var": 3.0482879464321007e-05, "learning_rate": 0.009984849967628367, "loss": 2.8795, "step": 1222 }, { "crossentropy": 2.891575574874878, "epoch": 0.0443372969837587, "grad_norm": 0.06863709539175034, "grad_norm_var": 3.619915282223786e-05, "learning_rate": 0.009984804732839267, "loss": 2.9463, "step": 1223 }, { "crossentropy": 2.842547655105591, "epoch": 0.04437354988399072, "grad_norm": 0.06100790202617645, "grad_norm_var": 3.628534672439337e-05, "learning_rate": 0.009984759430722814, "loss": 2.8708, "step": 1224 }, { "crossentropy": 2.832509756088257, "epoch": 0.04440980278422274, "grad_norm": 0.058239854872226715, "grad_norm_var": 3.4290904824508564e-05, "learning_rate": 0.009984714061279622, "loss": 2.8042, "step": 1225 }, { "crossentropy": 3.0015110969543457, "epoch": 0.044446055684454755, "grad_norm": 0.06709156930446625, "grad_norm_var": 3.461956925186379e-05, "learning_rate": 0.0099846686245103, "loss": 2.8911, "step": 1226 }, { "crossentropy": 2.895642042160034, "epoch": 0.044482308584686776, "grad_norm": 0.06805714219808578, "grad_norm_var": 3.5673778545506725e-05, "learning_rate": 0.009984623120415466, "loss": 2.9081, "step": 1227 }, { "crossentropy": 2.9638750553131104, "epoch": 0.04451856148491879, "grad_norm": 0.06741197407245636, "grad_norm_var": 3.817191589750001e-05, "learning_rate": 0.00998457754899573, "loss": 2.9283, "step": 1228 }, { "crossentropy": 3.0775604248046875, "epoch": 0.04455481438515081, "grad_norm": 0.05658677592873573, "grad_norm_var": 3.852924583144368e-05, "learning_rate": 0.009984531910251712, "loss": 3.0453, "step": 1229 }, { "crossentropy": 2.7941858768463135, "epoch": 0.044591067285382834, "grad_norm": 0.05629894509911537, "grad_norm_var": 3.3469525620159296e-05, "learning_rate": 0.009984486204184025, "loss": 2.8249, "step": 1230 }, { "crossentropy": 2.9221861362457275, "epoch": 0.04462732018561485, "grad_norm": 0.05071529373526573, "grad_norm_var": 3.308056210158574e-05, "learning_rate": 0.009984440430793287, "loss": 2.9695, "step": 1231 }, { "crossentropy": 2.671085834503174, "epoch": 0.04466357308584687, "grad_norm": 0.06384836137294769, "grad_norm_var": 3.450527483935302e-05, "learning_rate": 0.009984394590080117, "loss": 2.8477, "step": 1232 }, { "crossentropy": 2.664674758911133, "epoch": 0.044699825986078884, "grad_norm": 0.06833293288946152, "grad_norm_var": 3.759396765962055e-05, "learning_rate": 0.009984348682045136, "loss": 2.7881, "step": 1233 }, { "crossentropy": 2.8122947216033936, "epoch": 0.044736078886310905, "grad_norm": 0.05030784755945206, "grad_norm_var": 4.166236534468848e-05, "learning_rate": 0.009984302706688961, "loss": 2.8004, "step": 1234 }, { "crossentropy": 2.8715925216674805, "epoch": 0.04477233178654293, "grad_norm": 0.05062533915042877, "grad_norm_var": 4.438836600160201e-05, "learning_rate": 0.009984256664012214, "loss": 2.8843, "step": 1235 }, { "crossentropy": 2.787626266479492, "epoch": 0.04480858468677494, "grad_norm": 0.06782639771699905, "grad_norm_var": 4.74742725684929e-05, "learning_rate": 0.009984210554015516, "loss": 2.7724, "step": 1236 }, { "crossentropy": 2.9379630088806152, "epoch": 0.04484483758700696, "grad_norm": 0.06006539240479469, "grad_norm_var": 4.5950552733282395e-05, "learning_rate": 0.009984164376699491, "loss": 2.8512, "step": 1237 }, { "crossentropy": 2.794067621231079, "epoch": 0.04488109048723898, "grad_norm": 0.06289453059434891, "grad_norm_var": 4.506569539162915e-05, "learning_rate": 0.009984118132064762, "loss": 2.8844, "step": 1238 }, { "crossentropy": 2.892777919769287, "epoch": 0.044917343387471, "grad_norm": 0.0675365999341011, "grad_norm_var": 4.403863548213075e-05, "learning_rate": 0.009984071820111955, "loss": 2.8736, "step": 1239 }, { "crossentropy": 2.936134099960327, "epoch": 0.04495359628770302, "grad_norm": 0.05415048077702522, "grad_norm_var": 4.7018818515398055e-05, "learning_rate": 0.009984025440841694, "loss": 2.8665, "step": 1240 }, { "crossentropy": 3.0963311195373535, "epoch": 0.044989849187935034, "grad_norm": 0.05404161289334297, "grad_norm_var": 4.9455148405384376e-05, "learning_rate": 0.009983978994254605, "loss": 3.0131, "step": 1241 }, { "crossentropy": 2.911607265472412, "epoch": 0.045026102088167055, "grad_norm": 0.05538380146026611, "grad_norm_var": 4.7516961156277016e-05, "learning_rate": 0.009983932480351317, "loss": 2.8568, "step": 1242 }, { "crossentropy": 2.9690372943878174, "epoch": 0.04506235498839907, "grad_norm": 0.05827159434556961, "grad_norm_var": 4.250682275416806e-05, "learning_rate": 0.009983885899132459, "loss": 2.9121, "step": 1243 }, { "crossentropy": 2.714402675628662, "epoch": 0.04509860788863109, "grad_norm": 0.05364951118826866, "grad_norm_var": 3.894289151292514e-05, "learning_rate": 0.009983839250598656, "loss": 2.7913, "step": 1244 }, { "crossentropy": 2.8188514709472656, "epoch": 0.045134860788863106, "grad_norm": 0.05784996971487999, "grad_norm_var": 3.877790736420133e-05, "learning_rate": 0.009983792534750542, "loss": 2.9602, "step": 1245 }, { "crossentropy": 2.9588122367858887, "epoch": 0.04517111368909513, "grad_norm": 0.06015508994460106, "grad_norm_var": 3.871060468934312e-05, "learning_rate": 0.009983745751588747, "loss": 2.8923, "step": 1246 }, { "crossentropy": 2.8572638034820557, "epoch": 0.04520736658932715, "grad_norm": 0.06981157511472702, "grad_norm_var": 4.173610086407205e-05, "learning_rate": 0.009983698901113903, "loss": 2.8432, "step": 1247 }, { "crossentropy": 2.7601640224456787, "epoch": 0.04524361948955916, "grad_norm": 0.05642852559685707, "grad_norm_var": 4.104519193716723e-05, "learning_rate": 0.00998365198332664, "loss": 2.7775, "step": 1248 }, { "crossentropy": 2.872962474822998, "epoch": 0.045279872389791184, "grad_norm": 0.05470874160528183, "grad_norm_var": 3.607073966857595e-05, "learning_rate": 0.009983604998227596, "loss": 2.8001, "step": 1249 }, { "crossentropy": 3.0595662593841553, "epoch": 0.0453161252900232, "grad_norm": 0.05788363516330719, "grad_norm_var": 3.152760032126874e-05, "learning_rate": 0.009983557945817402, "loss": 3.0498, "step": 1250 }, { "crossentropy": 2.8059306144714355, "epoch": 0.04535237819025522, "grad_norm": 0.05565055459737778, "grad_norm_var": 2.7608424101128494e-05, "learning_rate": 0.009983510826096696, "loss": 2.8617, "step": 1251 }, { "crossentropy": 2.824244976043701, "epoch": 0.04538863109048724, "grad_norm": 0.057874053716659546, "grad_norm_var": 2.2277967695231008e-05, "learning_rate": 0.009983463639066114, "loss": 2.8372, "step": 1252 }, { "crossentropy": 2.909034013748169, "epoch": 0.045424883990719256, "grad_norm": 0.05482453480362892, "grad_norm_var": 2.291629640259514e-05, "learning_rate": 0.009983416384726291, "loss": 2.8643, "step": 1253 }, { "crossentropy": 2.995067834854126, "epoch": 0.04546113689095128, "grad_norm": 0.054829131811857224, "grad_norm_var": 2.1927801988750466e-05, "learning_rate": 0.00998336906307787, "loss": 2.8818, "step": 1254 }, { "crossentropy": 2.707899570465088, "epoch": 0.04549738979118329, "grad_norm": 0.052914734929800034, "grad_norm_var": 1.6094628426105833e-05, "learning_rate": 0.009983321674121486, "loss": 2.7912, "step": 1255 }, { "crossentropy": 2.91428279876709, "epoch": 0.04553364269141531, "grad_norm": 0.0616690032184124, "grad_norm_var": 1.699491278630462e-05, "learning_rate": 0.009983274217857779, "loss": 2.9408, "step": 1256 }, { "crossentropy": 2.8760621547698975, "epoch": 0.045569895591647334, "grad_norm": 0.05656775087118149, "grad_norm_var": 1.631424008793509e-05, "learning_rate": 0.009983226694287393, "loss": 2.8332, "step": 1257 }, { "crossentropy": 2.7298052310943604, "epoch": 0.04560614849187935, "grad_norm": 0.05634253844618797, "grad_norm_var": 1.6113377839815865e-05, "learning_rate": 0.009983179103410967, "loss": 2.7895, "step": 1258 }, { "crossentropy": 2.867295980453491, "epoch": 0.04564240139211137, "grad_norm": 0.05355926975607872, "grad_norm_var": 1.6994106199028775e-05, "learning_rate": 0.009983131445229146, "loss": 2.8381, "step": 1259 }, { "crossentropy": 2.9712185859680176, "epoch": 0.045678654292343385, "grad_norm": 0.05756937712430954, "grad_norm_var": 1.6114506348777606e-05, "learning_rate": 0.009983083719742573, "loss": 2.9359, "step": 1260 }, { "crossentropy": 2.894237518310547, "epoch": 0.045714907192575406, "grad_norm": 0.05270418897271156, "grad_norm_var": 1.747094815508308e-05, "learning_rate": 0.009983035926951891, "loss": 2.9104, "step": 1261 }, { "crossentropy": 2.768577814102173, "epoch": 0.04575116009280743, "grad_norm": 0.056175220757722855, "grad_norm_var": 1.683616848725296e-05, "learning_rate": 0.009982988066857748, "loss": 2.8121, "step": 1262 }, { "crossentropy": 2.9125704765319824, "epoch": 0.04578741299303944, "grad_norm": 0.05904598906636238, "grad_norm_var": 5.466770323981721e-06, "learning_rate": 0.009982940139460788, "loss": 2.8214, "step": 1263 }, { "crossentropy": 2.8690710067749023, "epoch": 0.04582366589327146, "grad_norm": 0.056451037526130676, "grad_norm_var": 5.467572873994221e-06, "learning_rate": 0.00998289214476166, "loss": 2.7887, "step": 1264 }, { "crossentropy": 2.885760545730591, "epoch": 0.04585991879350348, "grad_norm": 0.051356974989175797, "grad_norm_var": 6.8241486355404e-06, "learning_rate": 0.009982844082761011, "loss": 2.9709, "step": 1265 }, { "crossentropy": 2.9611001014709473, "epoch": 0.0458961716937355, "grad_norm": 0.06035269796848297, "grad_norm_var": 7.837249092932514e-06, "learning_rate": 0.009982795953459493, "loss": 2.87, "step": 1266 }, { "crossentropy": 2.914900779724121, "epoch": 0.04593242459396752, "grad_norm": 0.05183212831616402, "grad_norm_var": 8.986480281434652e-06, "learning_rate": 0.009982747756857754, "loss": 2.8653, "step": 1267 }, { "crossentropy": 2.9094150066375732, "epoch": 0.045968677494199535, "grad_norm": 0.053154610097408295, "grad_norm_var": 9.123328413053871e-06, "learning_rate": 0.009982699492956442, "loss": 2.8677, "step": 1268 }, { "crossentropy": 2.8459348678588867, "epoch": 0.046004930394431556, "grad_norm": 0.05390338972210884, "grad_norm_var": 9.26967702594746e-06, "learning_rate": 0.009982651161756213, "loss": 2.9326, "step": 1269 }, { "crossentropy": 2.942772626876831, "epoch": 0.04604118329466357, "grad_norm": 0.06205759570002556, "grad_norm_var": 1.1862981486813045e-05, "learning_rate": 0.00998260276325772, "loss": 2.9292, "step": 1270 }, { "crossentropy": 2.8502466678619385, "epoch": 0.04607743619489559, "grad_norm": 0.07369332015514374, "grad_norm_var": 3.0359150048966018e-05, "learning_rate": 0.009982554297461614, "loss": 2.8498, "step": 1271 }, { "crossentropy": 2.8982579708099365, "epoch": 0.046113689095127613, "grad_norm": 0.053083378821611404, "grad_norm_var": 2.993868513272412e-05, "learning_rate": 0.009982505764368552, "loss": 2.8795, "step": 1272 }, { "crossentropy": 2.845799207687378, "epoch": 0.04614994199535963, "grad_norm": 0.051310524344444275, "grad_norm_var": 3.1787242221290964e-05, "learning_rate": 0.009982457163979188, "loss": 2.8142, "step": 1273 }, { "crossentropy": 2.909034490585327, "epoch": 0.04618619489559165, "grad_norm": 0.061183396726846695, "grad_norm_var": 3.320701796918878e-05, "learning_rate": 0.009982408496294178, "loss": 2.9006, "step": 1274 }, { "crossentropy": 2.7777621746063232, "epoch": 0.046222447795823664, "grad_norm": 0.05884173512458801, "grad_norm_var": 3.272867780195609e-05, "learning_rate": 0.009982359761314182, "loss": 2.8422, "step": 1275 }, { "crossentropy": 2.9754421710968018, "epoch": 0.046258700696055685, "grad_norm": 0.05946463346481323, "grad_norm_var": 3.308575817083052e-05, "learning_rate": 0.009982310959039854, "loss": 2.8695, "step": 1276 }, { "crossentropy": 2.7925970554351807, "epoch": 0.0462949535962877, "grad_norm": 0.06128448247909546, "grad_norm_var": 3.25858418923658e-05, "learning_rate": 0.009982262089471856, "loss": 2.8393, "step": 1277 }, { "crossentropy": 2.943992853164673, "epoch": 0.04633120649651972, "grad_norm": 0.060614731162786484, "grad_norm_var": 3.291542910641501e-05, "learning_rate": 0.00998221315261085, "loss": 2.9392, "step": 1278 }, { "crossentropy": 2.8114638328552246, "epoch": 0.04636745939675174, "grad_norm": 0.0534709133207798, "grad_norm_var": 3.406333110214816e-05, "learning_rate": 0.009982164148457493, "loss": 2.8046, "step": 1279 }, { "crossentropy": 2.762702465057373, "epoch": 0.04640371229698376, "grad_norm": 0.053102243691682816, "grad_norm_var": 3.528996367740163e-05, "learning_rate": 0.009982115077012448, "loss": 2.8528, "step": 1280 }, { "crossentropy": 2.880038022994995, "epoch": 0.04643996519721578, "grad_norm": 0.05528977885842323, "grad_norm_var": 3.3077790104265625e-05, "learning_rate": 0.009982065938276379, "loss": 2.9223, "step": 1281 }, { "crossentropy": 2.8834404945373535, "epoch": 0.04647621809744779, "grad_norm": 0.06315144896507263, "grad_norm_var": 3.4570322962766166e-05, "learning_rate": 0.00998201673224995, "loss": 2.9289, "step": 1282 }, { "crossentropy": 2.936967372894287, "epoch": 0.046512470997679814, "grad_norm": 0.060625217854976654, "grad_norm_var": 3.2359147719285276e-05, "learning_rate": 0.009981967458933823, "loss": 2.8802, "step": 1283 }, { "crossentropy": 2.8543694019317627, "epoch": 0.046548723897911835, "grad_norm": 0.0558047890663147, "grad_norm_var": 3.0948340478229726e-05, "learning_rate": 0.009981918118328664, "loss": 2.8896, "step": 1284 }, { "crossentropy": 2.8298497200012207, "epoch": 0.04658497679814385, "grad_norm": 0.051643241196870804, "grad_norm_var": 3.26694145343566e-05, "learning_rate": 0.009981868710435143, "loss": 2.8358, "step": 1285 }, { "crossentropy": 2.9249353408813477, "epoch": 0.04662122969837587, "grad_norm": 0.05216941982507706, "grad_norm_var": 3.397640220967144e-05, "learning_rate": 0.009981819235253923, "loss": 2.9181, "step": 1286 }, { "crossentropy": 2.7892308235168457, "epoch": 0.046657482598607886, "grad_norm": 0.056085824966430664, "grad_norm_var": 1.6030893953741277e-05, "learning_rate": 0.009981769692785674, "loss": 2.8594, "step": 1287 }, { "crossentropy": 2.857928514480591, "epoch": 0.04669373549883991, "grad_norm": 0.05131244659423828, "grad_norm_var": 1.7079783027904954e-05, "learning_rate": 0.009981720083031066, "loss": 2.8407, "step": 1288 }, { "crossentropy": 3.0025827884674072, "epoch": 0.04672998839907193, "grad_norm": 0.1340680569410324, "grad_norm_var": 0.0003869335797504166, "learning_rate": 0.009981670405990769, "loss": 2.9145, "step": 1289 }, { "crossentropy": 2.8851540088653564, "epoch": 0.04676624129930394, "grad_norm": 0.0546879842877388, "grad_norm_var": 0.0003900672701497983, "learning_rate": 0.009981620661665452, "loss": 2.8705, "step": 1290 }, { "crossentropy": 3.017929792404175, "epoch": 0.046802494199535964, "grad_norm": 0.06140216812491417, "grad_norm_var": 0.00038962034788599386, "learning_rate": 0.009981570850055788, "loss": 2.8801, "step": 1291 }, { "crossentropy": 2.813943862915039, "epoch": 0.04683874709976798, "grad_norm": 0.061927374452352524, "grad_norm_var": 0.000389327431772383, "learning_rate": 0.00998152097116245, "loss": 2.8908, "step": 1292 }, { "crossentropy": 3.0133583545684814, "epoch": 0.046875, "grad_norm": 0.05541573837399483, "grad_norm_var": 0.0003917778272252842, "learning_rate": 0.009981471024986113, "loss": 2.9205, "step": 1293 }, { "crossentropy": 3.0073490142822266, "epoch": 0.04691125290023202, "grad_norm": 0.0672013908624649, "grad_norm_var": 0.0003938890859613084, "learning_rate": 0.009981421011527448, "loss": 2.9824, "step": 1294 }, { "crossentropy": 2.8840835094451904, "epoch": 0.046947505800464036, "grad_norm": 0.05445793271064758, "grad_norm_var": 0.0003928657049512785, "learning_rate": 0.009981370930787135, "loss": 2.8484, "step": 1295 }, { "crossentropy": 2.94525408744812, "epoch": 0.04698375870069606, "grad_norm": 0.058077096939086914, "grad_norm_var": 0.00038866204688956283, "learning_rate": 0.009981320782765845, "loss": 2.8725, "step": 1296 }, { "crossentropy": 2.861881971359253, "epoch": 0.04702001160092807, "grad_norm": 0.06105613708496094, "grad_norm_var": 0.00038551766320514433, "learning_rate": 0.009981270567464262, "loss": 2.8714, "step": 1297 }, { "crossentropy": 2.672353744506836, "epoch": 0.04705626450116009, "grad_norm": 0.060425303876399994, "grad_norm_var": 0.00038572460426476257, "learning_rate": 0.009981220284883059, "loss": 2.7545, "step": 1298 }, { "crossentropy": 2.7238872051239014, "epoch": 0.047092517401392114, "grad_norm": 0.05952686443924904, "grad_norm_var": 0.0003860412438855519, "learning_rate": 0.009981169935022917, "loss": 2.8174, "step": 1299 }, { "crossentropy": 2.8243536949157715, "epoch": 0.04712877030162413, "grad_norm": 0.06628069281578064, "grad_norm_var": 0.0003839621378373019, "learning_rate": 0.009981119517884513, "loss": 2.8179, "step": 1300 }, { "crossentropy": 3.0073323249816895, "epoch": 0.04716502320185615, "grad_norm": 0.06029698997735977, "grad_norm_var": 0.0003757019400260408, "learning_rate": 0.009981069033468534, "loss": 2.9343, "step": 1301 }, { "crossentropy": 2.890002727508545, "epoch": 0.047201276102088165, "grad_norm": 0.06364323198795319, "grad_norm_var": 0.00036674977404558434, "learning_rate": 0.009981018481775657, "loss": 2.8397, "step": 1302 }, { "crossentropy": 2.8486926555633545, "epoch": 0.047237529002320186, "grad_norm": 0.05844702199101448, "grad_norm_var": 0.00036456993550190806, "learning_rate": 0.009980967862806568, "loss": 2.8356, "step": 1303 }, { "crossentropy": 3.0373053550720215, "epoch": 0.04727378190255221, "grad_norm": 0.05743655934929848, "grad_norm_var": 0.0003563382889409402, "learning_rate": 0.009980917176561948, "loss": 2.9627, "step": 1304 }, { "crossentropy": 2.8877031803131104, "epoch": 0.04731003480278422, "grad_norm": 0.05394254997372627, "grad_norm_var": 1.594038685330806e-05, "learning_rate": 0.009980866423042484, "loss": 2.875, "step": 1305 }, { "crossentropy": 2.81734037399292, "epoch": 0.04734628770301624, "grad_norm": 0.057624295353889465, "grad_norm_var": 1.4540868676689242e-05, "learning_rate": 0.009980815602248859, "loss": 2.8099, "step": 1306 }, { "crossentropy": 2.9324588775634766, "epoch": 0.04738254060324826, "grad_norm": 0.054673779755830765, "grad_norm_var": 1.5953245485808508e-05, "learning_rate": 0.00998076471418176, "loss": 2.8754, "step": 1307 }, { "crossentropy": 2.930391311645508, "epoch": 0.04741879350348028, "grad_norm": 0.05475599318742752, "grad_norm_var": 1.6752874110175418e-05, "learning_rate": 0.009980713758841877, "loss": 2.8941, "step": 1308 }, { "crossentropy": 2.9383187294006348, "epoch": 0.0474550464037123, "grad_norm": 0.05551009997725487, "grad_norm_var": 1.6708915715879283e-05, "learning_rate": 0.009980662736229896, "loss": 2.8451, "step": 1309 }, { "crossentropy": 2.740119457244873, "epoch": 0.047491299303944315, "grad_norm": 0.05905621126294136, "grad_norm_var": 1.190478924579513e-05, "learning_rate": 0.009980611646346504, "loss": 2.838, "step": 1310 }, { "crossentropy": 2.8720412254333496, "epoch": 0.047527552204176336, "grad_norm": 0.0677107721567154, "grad_norm_var": 1.5826796839742082e-05, "learning_rate": 0.009980560489192396, "loss": 2.8879, "step": 1311 }, { "crossentropy": 2.730499029159546, "epoch": 0.04756380510440835, "grad_norm": 0.06595606356859207, "grad_norm_var": 1.844407141012716e-05, "learning_rate": 0.009980509264768258, "loss": 2.7707, "step": 1312 }, { "crossentropy": 2.781433582305908, "epoch": 0.04760005800464037, "grad_norm": 0.05877519026398659, "grad_norm_var": 1.8378522217228864e-05, "learning_rate": 0.009980457973074786, "loss": 2.9278, "step": 1313 }, { "crossentropy": 2.8510658740997314, "epoch": 0.047636310904872387, "grad_norm": 0.05340952053666115, "grad_norm_var": 2.070981632982984e-05, "learning_rate": 0.009980406614112671, "loss": 2.8429, "step": 1314 }, { "crossentropy": 2.73989200592041, "epoch": 0.04767256380510441, "grad_norm": 0.055996935814619064, "grad_norm_var": 2.133021511864641e-05, "learning_rate": 0.009980355187882605, "loss": 2.8059, "step": 1315 }, { "crossentropy": 3.0055911540985107, "epoch": 0.04770881670533643, "grad_norm": 0.05682901293039322, "grad_norm_var": 1.7700173173874844e-05, "learning_rate": 0.009980303694385285, "loss": 2.986, "step": 1316 }, { "crossentropy": 2.900726795196533, "epoch": 0.047745069605568444, "grad_norm": 0.056265294551849365, "grad_norm_var": 1.7685057814910365e-05, "learning_rate": 0.009980252133621406, "loss": 2.8529, "step": 1317 }, { "crossentropy": 2.7029573917388916, "epoch": 0.047781322505800465, "grad_norm": 0.05521516874432564, "grad_norm_var": 1.592579057844249e-05, "learning_rate": 0.009980200505591664, "loss": 2.7979, "step": 1318 }, { "crossentropy": 2.8889310359954834, "epoch": 0.04781757540603248, "grad_norm": 0.0525866262614727, "grad_norm_var": 1.7410672523600578e-05, "learning_rate": 0.009980148810296756, "loss": 2.8217, "step": 1319 }, { "crossentropy": 2.861650228500366, "epoch": 0.0478538283062645, "grad_norm": 0.055668532848358154, "grad_norm_var": 1.755829275970986e-05, "learning_rate": 0.009980097047737382, "loss": 2.9162, "step": 1320 }, { "crossentropy": 3.057961940765381, "epoch": 0.04789008120649652, "grad_norm": 0.056822918355464935, "grad_norm_var": 1.685518320235016e-05, "learning_rate": 0.009980045217914239, "loss": 2.9383, "step": 1321 }, { "crossentropy": 2.8716070652008057, "epoch": 0.04792633410672854, "grad_norm": 0.05475273355841637, "grad_norm_var": 1.7247735352393136e-05, "learning_rate": 0.009979993320828027, "loss": 2.7764, "step": 1322 }, { "crossentropy": 2.9112770557403564, "epoch": 0.04796258700696056, "grad_norm": 0.10735594481229782, "grad_norm_var": 0.00017349946340946716, "learning_rate": 0.00997994135647945, "loss": 2.8658, "step": 1323 }, { "crossentropy": 2.803697347640991, "epoch": 0.04799883990719257, "grad_norm": 0.058843862265348434, "grad_norm_var": 0.00017145852290255108, "learning_rate": 0.009979889324869205, "loss": 2.7929, "step": 1324 }, { "crossentropy": 2.820261001586914, "epoch": 0.048035092807424594, "grad_norm": 0.05434583127498627, "grad_norm_var": 0.00017234458284118756, "learning_rate": 0.009979837225997998, "loss": 2.8241, "step": 1325 }, { "crossentropy": 2.8146755695343018, "epoch": 0.048071345707656615, "grad_norm": 0.055679041892290115, "grad_norm_var": 0.00017375229986833402, "learning_rate": 0.009979785059866533, "loss": 2.8363, "step": 1326 }, { "crossentropy": 2.90596866607666, "epoch": 0.04810759860788863, "grad_norm": 0.05521926283836365, "grad_norm_var": 0.000171308899772038, "learning_rate": 0.009979732826475515, "loss": 2.8966, "step": 1327 }, { "crossentropy": 2.8411192893981934, "epoch": 0.04814385150812065, "grad_norm": 0.05199432000517845, "grad_norm_var": 0.00017167399938315624, "learning_rate": 0.009979680525825645, "loss": 2.8462, "step": 1328 }, { "crossentropy": 2.813337564468384, "epoch": 0.048180104408352666, "grad_norm": 0.05037800595164299, "grad_norm_var": 0.00017603605918154827, "learning_rate": 0.009979628157917635, "loss": 2.8412, "step": 1329 }, { "crossentropy": 3.0552310943603516, "epoch": 0.04821635730858469, "grad_norm": 0.05097188428044319, "grad_norm_var": 0.00017796774282487996, "learning_rate": 0.009979575722752189, "loss": 2.9525, "step": 1330 }, { "crossentropy": 2.9098455905914307, "epoch": 0.04825261020881671, "grad_norm": 0.05100809782743454, "grad_norm_var": 0.00018089414065415287, "learning_rate": 0.009979523220330015, "loss": 2.9404, "step": 1331 }, { "crossentropy": 2.9363291263580322, "epoch": 0.04828886310904872, "grad_norm": 0.05093889683485031, "grad_norm_var": 0.0001837826635136539, "learning_rate": 0.009979470650651825, "loss": 2.8885, "step": 1332 }, { "crossentropy": 2.7939529418945312, "epoch": 0.048325116009280744, "grad_norm": 0.06075603514909744, "grad_norm_var": 0.00018437689496287785, "learning_rate": 0.009979418013718327, "loss": 2.8142, "step": 1333 }, { "crossentropy": 2.8535592555999756, "epoch": 0.04836136890951276, "grad_norm": 0.061397287994623184, "grad_norm_var": 0.00018475150209580719, "learning_rate": 0.009979365309530232, "loss": 2.8635, "step": 1334 }, { "crossentropy": 2.7951061725616455, "epoch": 0.04839762180974478, "grad_norm": 0.05553188547492027, "grad_norm_var": 0.0001831501691317248, "learning_rate": 0.009979312538088252, "loss": 2.8004, "step": 1335 }, { "crossentropy": 2.7684664726257324, "epoch": 0.0484338747099768, "grad_norm": 0.07093252241611481, "grad_norm_var": 0.0001925008771800944, "learning_rate": 0.0099792596993931, "loss": 2.7802, "step": 1336 }, { "crossentropy": 2.8639354705810547, "epoch": 0.048470127610208816, "grad_norm": 0.060188982635736465, "grad_norm_var": 0.0001921497867137388, "learning_rate": 0.009979206793445487, "loss": 2.9246, "step": 1337 }, { "crossentropy": 2.921879768371582, "epoch": 0.04850638051044084, "grad_norm": 0.05828065052628517, "grad_norm_var": 0.00019074475024037865, "learning_rate": 0.009979153820246132, "loss": 2.9547, "step": 1338 }, { "crossentropy": 2.8396060466766357, "epoch": 0.04854263341067285, "grad_norm": 0.05646011605858803, "grad_norm_var": 2.8661092736446495e-05, "learning_rate": 0.009979100779795748, "loss": 2.8097, "step": 1339 }, { "crossentropy": 2.9632318019866943, "epoch": 0.04857888631090487, "grad_norm": 0.07724034041166306, "grad_norm_var": 5.572671174800855e-05, "learning_rate": 0.009979047672095055, "loss": 2.9192, "step": 1340 }, { "crossentropy": 2.8954479694366455, "epoch": 0.048615139211136894, "grad_norm": 0.06956000626087189, "grad_norm_var": 6.36274900262287e-05, "learning_rate": 0.009978994497144764, "loss": 2.8539, "step": 1341 }, { "crossentropy": 3.0479953289031982, "epoch": 0.04865139211136891, "grad_norm": 0.058407507836818695, "grad_norm_var": 6.305430363900874e-05, "learning_rate": 0.009978941254945597, "loss": 2.9506, "step": 1342 }, { "crossentropy": 2.77490234375, "epoch": 0.04868764501160093, "grad_norm": 0.05550272762775421, "grad_norm_var": 6.292761469430437e-05, "learning_rate": 0.009978887945498272, "loss": 2.8383, "step": 1343 }, { "crossentropy": 2.8914639949798584, "epoch": 0.048723897911832945, "grad_norm": 0.05330285057425499, "grad_norm_var": 6.186087693424991e-05, "learning_rate": 0.00997883456880351, "loss": 2.9089, "step": 1344 }, { "crossentropy": 2.739142894744873, "epoch": 0.048760150812064966, "grad_norm": 0.05274046212434769, "grad_norm_var": 5.955568509782936e-05, "learning_rate": 0.009978781124862033, "loss": 2.79, "step": 1345 }, { "crossentropy": 2.9771735668182373, "epoch": 0.04879640371229698, "grad_norm": 0.05267205461859703, "grad_norm_var": 5.79275052421714e-05, "learning_rate": 0.00997872761367456, "loss": 2.9241, "step": 1346 }, { "crossentropy": 3.006352186203003, "epoch": 0.048832656612529, "grad_norm": 0.05580109730362892, "grad_norm_var": 5.421918698782039e-05, "learning_rate": 0.009978674035241815, "loss": 2.9565, "step": 1347 }, { "crossentropy": 2.883561372756958, "epoch": 0.04886890951276102, "grad_norm": 0.055826809257268906, "grad_norm_var": 5.0226099343470404e-05, "learning_rate": 0.009978620389564522, "loss": 2.869, "step": 1348 }, { "crossentropy": 2.7447829246520996, "epoch": 0.04890516241299304, "grad_norm": 0.05304872617125511, "grad_norm_var": 5.281508662925404e-05, "learning_rate": 0.009978566676643406, "loss": 2.821, "step": 1349 }, { "crossentropy": 2.701580286026001, "epoch": 0.04894141531322506, "grad_norm": 0.051592033356428146, "grad_norm_var": 5.5926361677421004e-05, "learning_rate": 0.00997851289647919, "loss": 2.7322, "step": 1350 }, { "crossentropy": 2.7459917068481445, "epoch": 0.04897766821345707, "grad_norm": 0.05010829493403435, "grad_norm_var": 5.9960407265204534e-05, "learning_rate": 0.009978459049072604, "loss": 2.8118, "step": 1351 }, { "crossentropy": 2.9091265201568604, "epoch": 0.049013921113689095, "grad_norm": 0.05466047674417496, "grad_norm_var": 4.894764486385016e-05, "learning_rate": 0.009978405134424372, "loss": 2.8807, "step": 1352 }, { "crossentropy": 2.868378162384033, "epoch": 0.049050174013921116, "grad_norm": 0.05435984581708908, "grad_norm_var": 4.8757612108899355e-05, "learning_rate": 0.009978351152535224, "loss": 2.8141, "step": 1353 }, { "crossentropy": 2.841867208480835, "epoch": 0.04908642691415313, "grad_norm": 0.05945044010877609, "grad_norm_var": 4.9066629872899593e-05, "learning_rate": 0.009978297103405889, "loss": 2.8677, "step": 1354 }, { "crossentropy": 2.817500352859497, "epoch": 0.04912267981438515, "grad_norm": 0.05231430009007454, "grad_norm_var": 5.0395555704998986e-05, "learning_rate": 0.009978242987037098, "loss": 2.8077, "step": 1355 }, { "crossentropy": 2.8138883113861084, "epoch": 0.049158932714617166, "grad_norm": 0.0517692007124424, "grad_norm_var": 2.1056214714413637e-05, "learning_rate": 0.00997818880342958, "loss": 2.8632, "step": 1356 }, { "crossentropy": 2.726343870162964, "epoch": 0.04919518561484919, "grad_norm": 0.05116744711995125, "grad_norm_var": 6.6641792570191575e-06, "learning_rate": 0.009978134552584067, "loss": 2.8318, "step": 1357 }, { "crossentropy": 2.8256149291992188, "epoch": 0.04923143851508121, "grad_norm": 0.050940100103616714, "grad_norm_var": 5.681574623628687e-06, "learning_rate": 0.009978080234501292, "loss": 2.8598, "step": 1358 }, { "crossentropy": 3.1189138889312744, "epoch": 0.049267691415313224, "grad_norm": 0.060616686940193176, "grad_norm_var": 8.71336282488629e-06, "learning_rate": 0.009978025849181988, "loss": 2.9651, "step": 1359 }, { "crossentropy": 2.8665058612823486, "epoch": 0.049303944315545245, "grad_norm": 0.062578946352005, "grad_norm_var": 1.3509529615808129e-05, "learning_rate": 0.009977971396626893, "loss": 2.8603, "step": 1360 }, { "crossentropy": 2.755146026611328, "epoch": 0.04934019721577726, "grad_norm": 0.04939519241452217, "grad_norm_var": 1.4928176278256911e-05, "learning_rate": 0.009977916876836739, "loss": 2.7816, "step": 1361 }, { "crossentropy": 2.9694647789001465, "epoch": 0.04937645011600928, "grad_norm": 0.053778573870658875, "grad_norm_var": 1.4787557195192203e-05, "learning_rate": 0.009977862289812263, "loss": 2.8763, "step": 1362 }, { "crossentropy": 2.8452565670013428, "epoch": 0.0494127030162413, "grad_norm": 0.0502944178879261, "grad_norm_var": 1.5516765911569954e-05, "learning_rate": 0.009977807635554203, "loss": 2.9245, "step": 1363 }, { "crossentropy": 2.9277400970458984, "epoch": 0.04944895591647332, "grad_norm": 0.05619419366121292, "grad_norm_var": 1.5621111774805477e-05, "learning_rate": 0.009977752914063297, "loss": 2.8604, "step": 1364 }, { "crossentropy": 2.7154130935668945, "epoch": 0.04948520881670534, "grad_norm": 0.05944351851940155, "grad_norm_var": 1.745810567391389e-05, "learning_rate": 0.009977698125340282, "loss": 2.8303, "step": 1365 }, { "crossentropy": 2.83673357963562, "epoch": 0.04952146171693735, "grad_norm": 0.0538797602057457, "grad_norm_var": 1.6961798898889638e-05, "learning_rate": 0.0099776432693859, "loss": 2.8376, "step": 1366 }, { "crossentropy": 2.8336610794067383, "epoch": 0.049557714617169374, "grad_norm": 0.05843455344438553, "grad_norm_var": 1.6491938576099902e-05, "learning_rate": 0.009977588346200895, "loss": 2.9278, "step": 1367 }, { "crossentropy": 2.9368066787719727, "epoch": 0.049593967517401395, "grad_norm": 0.0573304258286953, "grad_norm_var": 1.68326817512646e-05, "learning_rate": 0.009977533355786003, "loss": 2.87, "step": 1368 }, { "crossentropy": 2.8088886737823486, "epoch": 0.04963022041763341, "grad_norm": 0.5440422296524048, "grad_norm_var": 0.014953891142180096, "learning_rate": 0.00997747829814197, "loss": 2.8, "step": 1369 }, { "crossentropy": 2.883659839630127, "epoch": 0.04966647331786543, "grad_norm": 0.05865461379289627, "grad_norm_var": 0.01495671892293014, "learning_rate": 0.00997742317326954, "loss": 2.9461, "step": 1370 }, { "crossentropy": 2.8654134273529053, "epoch": 0.049702726218097446, "grad_norm": 0.0564429946243763, "grad_norm_var": 0.014939418311433954, "learning_rate": 0.009977367981169454, "loss": 2.7712, "step": 1371 }, { "crossentropy": 2.8038017749786377, "epoch": 0.04973897911832947, "grad_norm": 0.05223368853330612, "grad_norm_var": 0.01493731583833644, "learning_rate": 0.009977312721842461, "loss": 2.8061, "step": 1372 }, { "crossentropy": 2.9283034801483154, "epoch": 0.04977523201856149, "grad_norm": 0.05542606860399246, "grad_norm_var": 0.014918691164403686, "learning_rate": 0.009977257395289308, "loss": 2.865, "step": 1373 }, { "crossentropy": 3.0239555835723877, "epoch": 0.0498114849187935, "grad_norm": 0.06075230985879898, "grad_norm_var": 0.014878538556299052, "learning_rate": 0.009977202001510739, "loss": 2.9973, "step": 1374 }, { "crossentropy": 2.847540855407715, "epoch": 0.049847737819025524, "grad_norm": 0.054511137306690216, "grad_norm_var": 0.014902219073290101, "learning_rate": 0.009977146540507504, "loss": 2.8256, "step": 1375 }, { "crossentropy": 2.99908447265625, "epoch": 0.04988399071925754, "grad_norm": 0.05803452432155609, "grad_norm_var": 0.014917981122686929, "learning_rate": 0.009977091012280353, "loss": 2.9522, "step": 1376 }, { "crossentropy": 2.8464112281799316, "epoch": 0.04992024361948956, "grad_norm": 0.10701567679643631, "grad_norm_var": 0.014842896113881407, "learning_rate": 0.009977035416830034, "loss": 2.8782, "step": 1377 }, { "crossentropy": 2.8492491245269775, "epoch": 0.049956496519721574, "grad_norm": 0.054568469524383545, "grad_norm_var": 0.014839143534958915, "learning_rate": 0.009976979754157298, "loss": 2.8006, "step": 1378 }, { "crossentropy": 2.9410400390625, "epoch": 0.049992749419953596, "grad_norm": 0.1495012491941452, "grad_norm_var": 0.014931325975682428, "learning_rate": 0.009976924024262899, "loss": 2.9539, "step": 1379 }, { "crossentropy": 2.8799500465393066, "epoch": 0.05002900232018562, "grad_norm": 0.050449129194021225, "grad_norm_var": 0.01496390270779099, "learning_rate": 0.009976868227147589, "loss": 2.8254, "step": 1380 }, { "crossentropy": 2.830671548843384, "epoch": 0.05006525522041763, "grad_norm": 0.05676817148923874, "grad_norm_var": 0.014977272512807957, "learning_rate": 0.009976812362812119, "loss": 2.8074, "step": 1381 }, { "crossentropy": 2.8567683696746826, "epoch": 0.05010150812064965, "grad_norm": 0.06358209997415543, "grad_norm_var": 0.014929310508853318, "learning_rate": 0.009976756431257245, "loss": 2.8006, "step": 1382 }, { "crossentropy": 2.9382126331329346, "epoch": 0.05013776102088167, "grad_norm": 0.05757995694875717, "grad_norm_var": 0.014933649038603124, "learning_rate": 0.009976700432483726, "loss": 2.9827, "step": 1383 }, { "crossentropy": 2.8775508403778076, "epoch": 0.05017401392111369, "grad_norm": 0.05542274937033653, "grad_norm_var": 0.014943726554200568, "learning_rate": 0.009976644366492312, "loss": 2.8786, "step": 1384 }, { "crossentropy": 2.892151117324829, "epoch": 0.05021026682134571, "grad_norm": 0.055896904319524765, "grad_norm_var": 0.0006711681108277726, "learning_rate": 0.009976588233283765, "loss": 2.9118, "step": 1385 }, { "crossentropy": 2.9853246212005615, "epoch": 0.050246519721577725, "grad_norm": 0.06486585736274719, "grad_norm_var": 0.0006679702729667111, "learning_rate": 0.009976532032858842, "loss": 2.9435, "step": 1386 }, { "crossentropy": 3.0283358097076416, "epoch": 0.050282772621809746, "grad_norm": 0.06745002418756485, "grad_norm_var": 0.0006617870418490383, "learning_rate": 0.0099764757652183, "loss": 2.9665, "step": 1387 }, { "crossentropy": 3.0100255012512207, "epoch": 0.05031902552204176, "grad_norm": 0.05931282415986061, "grad_norm_var": 0.0006514500005058652, "learning_rate": 0.009976419430362902, "loss": 2.9522, "step": 1388 }, { "crossentropy": 3.084855794906616, "epoch": 0.05035527842227378, "grad_norm": 0.055959053337574005, "grad_norm_var": 0.0006506490902564182, "learning_rate": 0.009976363028293408, "loss": 2.943, "step": 1389 }, { "crossentropy": 2.779642105102539, "epoch": 0.0503915313225058, "grad_norm": 0.05826914682984352, "grad_norm_var": 0.0006530961833211284, "learning_rate": 0.00997630655901058, "loss": 2.8881, "step": 1390 }, { "crossentropy": 2.9277079105377197, "epoch": 0.05042778422273782, "grad_norm": 0.0525548942387104, "grad_norm_var": 0.0006565470058811011, "learning_rate": 0.009976250022515178, "loss": 2.8395, "step": 1391 }, { "crossentropy": 2.8591063022613525, "epoch": 0.05046403712296984, "grad_norm": 0.05412998050451279, "grad_norm_var": 0.0006620121444758004, "learning_rate": 0.009976193418807969, "loss": 2.8873, "step": 1392 }, { "crossentropy": 2.9051430225372314, "epoch": 0.05050029002320185, "grad_norm": 0.054498493671417236, "grad_norm_var": 0.0005503931003368415, "learning_rate": 0.009976136747889715, "loss": 2.8703, "step": 1393 }, { "crossentropy": 2.957796573638916, "epoch": 0.050536542923433875, "grad_norm": 0.05344133451581001, "grad_norm_var": 0.0005517660165689149, "learning_rate": 0.009976080009761183, "loss": 2.9673, "step": 1394 }, { "crossentropy": 2.914504289627075, "epoch": 0.050572795823665896, "grad_norm": 0.05209086090326309, "grad_norm_var": 2.2697560713737183e-05, "learning_rate": 0.009976023204423142, "loss": 2.9037, "step": 1395 }, { "crossentropy": 2.9092421531677246, "epoch": 0.05060904872389791, "grad_norm": 0.057599980384111404, "grad_norm_var": 1.96313931179183e-05, "learning_rate": 0.009975966331876352, "loss": 2.8374, "step": 1396 }, { "crossentropy": 2.9052417278289795, "epoch": 0.05064530162412993, "grad_norm": 0.05636207386851311, "grad_norm_var": 1.9679371252364028e-05, "learning_rate": 0.009975909392121587, "loss": 2.9292, "step": 1397 }, { "crossentropy": 2.7174668312072754, "epoch": 0.050681554524361946, "grad_norm": 0.05579852685332298, "grad_norm_var": 1.708999917583298e-05, "learning_rate": 0.009975852385159616, "loss": 2.7608, "step": 1398 }, { "crossentropy": 2.6611461639404297, "epoch": 0.05071780742459397, "grad_norm": 0.05719911679625511, "grad_norm_var": 1.706717939708354e-05, "learning_rate": 0.009975795310991204, "loss": 2.8046, "step": 1399 }, { "crossentropy": 2.615424633026123, "epoch": 0.05075406032482599, "grad_norm": 0.05919685959815979, "grad_norm_var": 1.719983922473748e-05, "learning_rate": 0.009975738169617129, "loss": 2.7827, "step": 1400 }, { "crossentropy": 2.814906597137451, "epoch": 0.050790313225058004, "grad_norm": 0.05786755308508873, "grad_norm_var": 1.7109590134673352e-05, "learning_rate": 0.009975680961038158, "loss": 2.8431, "step": 1401 }, { "crossentropy": 2.7793824672698975, "epoch": 0.050826566125290025, "grad_norm": 0.05259976163506508, "grad_norm_var": 1.411856202364776e-05, "learning_rate": 0.009975623685255064, "loss": 2.8885, "step": 1402 }, { "crossentropy": 2.7709193229675293, "epoch": 0.05086281902552204, "grad_norm": 0.05352729558944702, "grad_norm_var": 5.944825320204165e-06, "learning_rate": 0.00997556634226862, "loss": 2.7852, "step": 1403 }, { "crossentropy": 2.9025940895080566, "epoch": 0.05089907192575406, "grad_norm": 0.06664872169494629, "grad_norm_var": 1.2890493582235334e-05, "learning_rate": 0.009975508932079604, "loss": 2.8648, "step": 1404 }, { "crossentropy": 2.7094075679779053, "epoch": 0.05093532482598608, "grad_norm": 0.06274436414241791, "grad_norm_var": 1.5632382883213184e-05, "learning_rate": 0.009975451454688788, "loss": 2.8248, "step": 1405 }, { "crossentropy": 2.9237821102142334, "epoch": 0.0509715777262181, "grad_norm": 0.05800911784172058, "grad_norm_var": 1.557641778116662e-05, "learning_rate": 0.009975393910096952, "loss": 2.8736, "step": 1406 }, { "crossentropy": 2.9978268146514893, "epoch": 0.05100783062645012, "grad_norm": 0.05489606037735939, "grad_norm_var": 1.4682250706558413e-05, "learning_rate": 0.009975336298304868, "loss": 2.9614, "step": 1407 }, { "crossentropy": 2.8171985149383545, "epoch": 0.05104408352668213, "grad_norm": 0.05125775933265686, "grad_norm_var": 1.616795670564992e-05, "learning_rate": 0.00997527861931332, "loss": 2.7876, "step": 1408 }, { "crossentropy": 2.680380344390869, "epoch": 0.051080336426914154, "grad_norm": 0.05117477849125862, "grad_norm_var": 1.7738130869903295e-05, "learning_rate": 0.00997522087312308, "loss": 2.7998, "step": 1409 }, { "crossentropy": 2.8742644786834717, "epoch": 0.05111658932714617, "grad_norm": 0.08002983778715134, "grad_norm_var": 5.187355073305403e-05, "learning_rate": 0.009975163059734934, "loss": 2.8647, "step": 1410 }, { "crossentropy": 2.8819949626922607, "epoch": 0.05115284222737819, "grad_norm": 0.060920801013708115, "grad_norm_var": 4.986294812362128e-05, "learning_rate": 0.009975105179149662, "loss": 2.9119, "step": 1411 }, { "crossentropy": 2.9412055015563965, "epoch": 0.05118909512761021, "grad_norm": 0.05209735035896301, "grad_norm_var": 5.2408035913587e-05, "learning_rate": 0.009975047231368042, "loss": 2.8003, "step": 1412 }, { "crossentropy": 2.9896607398986816, "epoch": 0.051225348027842225, "grad_norm": 0.05375918000936508, "grad_norm_var": 5.345046237913934e-05, "learning_rate": 0.009974989216390862, "loss": 2.8757, "step": 1413 }, { "crossentropy": 2.7393558025360107, "epoch": 0.05126160092807425, "grad_norm": 0.05553032085299492, "grad_norm_var": 5.353307474425245e-05, "learning_rate": 0.009974931134218901, "loss": 2.7976, "step": 1414 }, { "crossentropy": 2.949650526046753, "epoch": 0.05129785382830626, "grad_norm": 0.05276602506637573, "grad_norm_var": 5.5214738458262784e-05, "learning_rate": 0.009974872984852945, "loss": 2.8874, "step": 1415 }, { "crossentropy": 2.833252429962158, "epoch": 0.05133410672853828, "grad_norm": 0.05274861305952072, "grad_norm_var": 5.6517170466104334e-05, "learning_rate": 0.009974814768293781, "loss": 2.8366, "step": 1416 }, { "crossentropy": 2.8916831016540527, "epoch": 0.051370359628770304, "grad_norm": 0.04743964597582817, "grad_norm_var": 6.250504803102681e-05, "learning_rate": 0.009974756484542194, "loss": 2.8626, "step": 1417 }, { "crossentropy": 2.9048378467559814, "epoch": 0.05140661252900232, "grad_norm": 0.0525028370320797, "grad_norm_var": 6.255777532326312e-05, "learning_rate": 0.00997469813359897, "loss": 2.8732, "step": 1418 }, { "crossentropy": 2.7908244132995605, "epoch": 0.05144286542923434, "grad_norm": 0.04911266267299652, "grad_norm_var": 6.560113960107268e-05, "learning_rate": 0.0099746397154649, "loss": 2.8265, "step": 1419 }, { "crossentropy": 2.872426986694336, "epoch": 0.051479118329466354, "grad_norm": 0.05534900724887848, "grad_norm_var": 5.806859805827513e-05, "learning_rate": 0.009974581230140769, "loss": 2.8829, "step": 1420 }, { "crossentropy": 2.8507633209228516, "epoch": 0.051515371229698376, "grad_norm": 0.05922609567642212, "grad_norm_var": 5.551244550402919e-05, "learning_rate": 0.00997452267762737, "loss": 2.9277, "step": 1421 }, { "crossentropy": 2.9314184188842773, "epoch": 0.0515516241299304, "grad_norm": 0.054422877728939056, "grad_norm_var": 5.508123026013237e-05, "learning_rate": 0.009974464057925495, "loss": 2.8212, "step": 1422 }, { "crossentropy": 2.789599895477295, "epoch": 0.05158787703016241, "grad_norm": 0.058309927582740784, "grad_norm_var": 5.567032483503268e-05, "learning_rate": 0.009974405371035932, "loss": 2.7734, "step": 1423 }, { "crossentropy": 2.9888782501220703, "epoch": 0.05162412993039443, "grad_norm": 0.05830707028508186, "grad_norm_var": 5.4868246103239916e-05, "learning_rate": 0.009974346616959475, "loss": 2.9292, "step": 1424 }, { "crossentropy": 2.7336528301239014, "epoch": 0.05166038283062645, "grad_norm": 0.06053389608860016, "grad_norm_var": 5.4501119662739236e-05, "learning_rate": 0.00997428779569692, "loss": 2.7962, "step": 1425 }, { "crossentropy": 2.9083383083343506, "epoch": 0.05169663573085847, "grad_norm": 0.05935864895582199, "grad_norm_var": 1.6192697186244846e-05, "learning_rate": 0.009974228907249059, "loss": 2.8834, "step": 1426 }, { "crossentropy": 2.9524283409118652, "epoch": 0.05173288863109049, "grad_norm": 0.058230552822351456, "grad_norm_var": 1.4574714775909767e-05, "learning_rate": 0.009974169951616688, "loss": 2.9447, "step": 1427 }, { "crossentropy": 2.787797451019287, "epoch": 0.051769141531322505, "grad_norm": 0.07255986332893372, "grad_norm_var": 3.287702439761549e-05, "learning_rate": 0.009974110928800602, "loss": 2.8764, "step": 1428 }, { "crossentropy": 2.8748691082000732, "epoch": 0.051805394431554526, "grad_norm": 0.057988040149211884, "grad_norm_var": 3.258474344504976e-05, "learning_rate": 0.0099740518388016, "loss": 2.9005, "step": 1429 }, { "crossentropy": 2.988588809967041, "epoch": 0.05184164733178654, "grad_norm": 0.06190837174654007, "grad_norm_var": 3.4282071987081174e-05, "learning_rate": 0.009973992681620482, "loss": 2.9384, "step": 1430 }, { "crossentropy": 2.9180736541748047, "epoch": 0.05187790023201856, "grad_norm": 0.060807034373283386, "grad_norm_var": 3.386660881288782e-05, "learning_rate": 0.009973933457258041, "loss": 2.879, "step": 1431 }, { "crossentropy": 3.1012003421783447, "epoch": 0.05191415313225058, "grad_norm": 0.05993267148733139, "grad_norm_var": 3.2612577679572843e-05, "learning_rate": 0.009973874165715083, "loss": 3.055, "step": 1432 }, { "crossentropy": 2.625701427459717, "epoch": 0.0519504060324826, "grad_norm": 0.05793371424078941, "grad_norm_var": 2.4895121333501126e-05, "learning_rate": 0.009973814806992404, "loss": 2.7744, "step": 1433 }, { "crossentropy": 2.9987099170684814, "epoch": 0.05198665893271462, "grad_norm": 0.05603315308690071, "grad_norm_var": 2.2836932113956542e-05, "learning_rate": 0.00997375538109081, "loss": 2.8997, "step": 1434 }, { "crossentropy": 2.675551176071167, "epoch": 0.05202291183294663, "grad_norm": 0.05043406784534454, "grad_norm_var": 2.1247937483929538e-05, "learning_rate": 0.0099736958880111, "loss": 2.7316, "step": 1435 }, { "crossentropy": 2.8193678855895996, "epoch": 0.052059164733178655, "grad_norm": 0.048327866941690445, "grad_norm_var": 2.759091924571007e-05, "learning_rate": 0.00997363632775408, "loss": 2.8324, "step": 1436 }, { "crossentropy": 2.720282554626465, "epoch": 0.052095417633410676, "grad_norm": 0.052487220615148544, "grad_norm_var": 2.9682098004336446e-05, "learning_rate": 0.009973576700320555, "loss": 2.7686, "step": 1437 }, { "crossentropy": 2.9121620655059814, "epoch": 0.05213167053364269, "grad_norm": 0.0568271204829216, "grad_norm_var": 2.8905184730533088e-05, "learning_rate": 0.009973517005711327, "loss": 2.8562, "step": 1438 }, { "crossentropy": 2.699889659881592, "epoch": 0.05216792343387471, "grad_norm": 0.054490745067596436, "grad_norm_var": 2.9721988443807505e-05, "learning_rate": 0.009973457243927205, "loss": 2.7803, "step": 1439 }, { "crossentropy": 2.761583089828491, "epoch": 0.052204176334106726, "grad_norm": 0.0546770803630352, "grad_norm_var": 3.0341259805012854e-05, "learning_rate": 0.009973397414968995, "loss": 2.9262, "step": 1440 }, { "crossentropy": 2.99218487739563, "epoch": 0.05224042923433875, "grad_norm": 0.058862607926130295, "grad_norm_var": 2.9875003435073506e-05, "learning_rate": 0.009973337518837506, "loss": 2.921, "step": 1441 }, { "crossentropy": 2.8595752716064453, "epoch": 0.05227668213457077, "grad_norm": 0.050266366451978683, "grad_norm_var": 3.285367268283818e-05, "learning_rate": 0.009973277555533547, "loss": 2.8527, "step": 1442 }, { "crossentropy": 2.7294349670410156, "epoch": 0.052312935034802784, "grad_norm": 0.05205545946955681, "grad_norm_var": 3.421172143907673e-05, "learning_rate": 0.009973217525057925, "loss": 2.7869, "step": 1443 }, { "crossentropy": 2.9529454708099365, "epoch": 0.052349187935034805, "grad_norm": 0.05583490803837776, "grad_norm_var": 1.6102878537550305e-05, "learning_rate": 0.009973157427411455, "loss": 2.8566, "step": 1444 }, { "crossentropy": 2.7848892211914062, "epoch": 0.05238544083526682, "grad_norm": 0.05609011650085449, "grad_norm_var": 1.571209943129799e-05, "learning_rate": 0.009973097262594948, "loss": 2.7783, "step": 1445 }, { "crossentropy": 2.843754529953003, "epoch": 0.05242169373549884, "grad_norm": 0.052870966494083405, "grad_norm_var": 1.3017077109725844e-05, "learning_rate": 0.009973037030609214, "loss": 2.8768, "step": 1446 }, { "crossentropy": 2.8717148303985596, "epoch": 0.052457946635730855, "grad_norm": 0.05001659691333771, "grad_norm_var": 1.1753411708959455e-05, "learning_rate": 0.009972976731455068, "loss": 2.9064, "step": 1447 }, { "crossentropy": 2.635178327560425, "epoch": 0.05249419953596288, "grad_norm": 0.04973367601633072, "grad_norm_var": 1.0453922353104104e-05, "learning_rate": 0.009972916365133325, "loss": 2.6711, "step": 1448 }, { "crossentropy": 2.632815361022949, "epoch": 0.0525304524361949, "grad_norm": 0.05018383264541626, "grad_norm_var": 9.687094209664959e-06, "learning_rate": 0.0099728559316448, "loss": 2.7221, "step": 1449 }, { "crossentropy": 2.765789031982422, "epoch": 0.05256670533642691, "grad_norm": 0.05465102568268776, "grad_norm_var": 9.261252590356261e-06, "learning_rate": 0.009972795430990309, "loss": 2.8717, "step": 1450 }, { "crossentropy": 2.8928449153900146, "epoch": 0.052602958236658934, "grad_norm": 0.05090288072824478, "grad_norm_var": 9.115340537183686e-06, "learning_rate": 0.009972734863170669, "loss": 2.889, "step": 1451 }, { "crossentropy": 2.973820686340332, "epoch": 0.05263921113689095, "grad_norm": 0.04963652789592743, "grad_norm_var": 8.404109022505136e-06, "learning_rate": 0.009972674228186698, "loss": 2.8867, "step": 1452 }, { "crossentropy": 2.864670753479004, "epoch": 0.05267546403712297, "grad_norm": 0.050403546541929245, "grad_norm_var": 8.845486009177208e-06, "learning_rate": 0.009972613526039216, "loss": 2.9165, "step": 1453 }, { "crossentropy": 2.864306688308716, "epoch": 0.05271171693735499, "grad_norm": 0.049522943794727325, "grad_norm_var": 8.422504463797588e-06, "learning_rate": 0.009972552756729041, "loss": 2.8243, "step": 1454 }, { "crossentropy": 2.798454523086548, "epoch": 0.052747969837587005, "grad_norm": 0.05777154117822647, "grad_norm_var": 9.960613075534324e-06, "learning_rate": 0.009972491920256996, "loss": 2.8536, "step": 1455 }, { "crossentropy": 2.7043232917785645, "epoch": 0.05278422273781903, "grad_norm": 0.054564058780670166, "grad_norm_var": 9.93188153197803e-06, "learning_rate": 0.009972431016623899, "loss": 2.8297, "step": 1456 }, { "crossentropy": 2.985116720199585, "epoch": 0.05282047563805104, "grad_norm": 0.05409533530473709, "grad_norm_var": 7.441771083434243e-06, "learning_rate": 0.009972370045830578, "loss": 3.0456, "step": 1457 }, { "crossentropy": 2.8902456760406494, "epoch": 0.05285672853828306, "grad_norm": 0.052380215376615524, "grad_norm_var": 7.1161669666405225e-06, "learning_rate": 0.009972309007877853, "loss": 2.8859, "step": 1458 }, { "crossentropy": 2.741957426071167, "epoch": 0.052892981438515084, "grad_norm": 0.05419386178255081, "grad_norm_var": 7.262500287147135e-06, "learning_rate": 0.009972247902766549, "loss": 2.7987, "step": 1459 }, { "crossentropy": 3.008786916732788, "epoch": 0.0529292343387471, "grad_norm": 0.055905260145664215, "grad_norm_var": 7.2924199454232315e-06, "learning_rate": 0.009972186730497492, "loss": 2.8902, "step": 1460 }, { "crossentropy": 2.7604455947875977, "epoch": 0.05296548723897912, "grad_norm": 0.04870016872882843, "grad_norm_var": 7.348160477584943e-06, "learning_rate": 0.009972125491071506, "loss": 2.756, "step": 1461 }, { "crossentropy": 2.8159666061401367, "epoch": 0.053001740139211134, "grad_norm": 0.05050842836499214, "grad_norm_var": 7.492196750500529e-06, "learning_rate": 0.009972064184489423, "loss": 2.824, "step": 1462 }, { "crossentropy": 2.9371237754821777, "epoch": 0.053037993039443156, "grad_norm": 0.051216237246990204, "grad_norm_var": 7.253197937517886e-06, "learning_rate": 0.009972002810752065, "loss": 2.9092, "step": 1463 }, { "crossentropy": 2.8604419231414795, "epoch": 0.05307424593967518, "grad_norm": 0.05147930234670639, "grad_norm_var": 6.881691902063669e-06, "learning_rate": 0.009971941369860264, "loss": 2.7874, "step": 1464 }, { "crossentropy": 2.7560460567474365, "epoch": 0.05311049883990719, "grad_norm": 0.05141125246882439, "grad_norm_var": 6.636533249226804e-06, "learning_rate": 0.00997187986181485, "loss": 2.8467, "step": 1465 }, { "crossentropy": 2.9082839488983154, "epoch": 0.05314675174013921, "grad_norm": 0.06063452735543251, "grad_norm_var": 1.072277060961037e-05, "learning_rate": 0.009971818286616654, "loss": 2.9223, "step": 1466 }, { "crossentropy": 2.7236149311065674, "epoch": 0.05318300464037123, "grad_norm": 0.06213130056858063, "grad_norm_var": 1.590030282949863e-05, "learning_rate": 0.009971756644266506, "loss": 2.7603, "step": 1467 }, { "crossentropy": 2.821955442428589, "epoch": 0.05321925754060325, "grad_norm": 0.056974686682224274, "grad_norm_var": 1.557412958141189e-05, "learning_rate": 0.00997169493476524, "loss": 2.8513, "step": 1468 }, { "crossentropy": 2.7433321475982666, "epoch": 0.05325551044083527, "grad_norm": 0.057982638478279114, "grad_norm_var": 1.566301133105371e-05, "learning_rate": 0.00997163315811369, "loss": 2.6977, "step": 1469 }, { "crossentropy": 2.6460578441619873, "epoch": 0.053291763341067284, "grad_norm": 0.056356627494096756, "grad_norm_var": 1.4190806356180417e-05, "learning_rate": 0.009971571314312689, "loss": 2.7722, "step": 1470 }, { "crossentropy": 2.747607707977295, "epoch": 0.053328016241299306, "grad_norm": 0.05108533054590225, "grad_norm_var": 1.4308225307808839e-05, "learning_rate": 0.009971509403363072, "loss": 2.7665, "step": 1471 }, { "crossentropy": 2.7557833194732666, "epoch": 0.05336426914153132, "grad_norm": 0.04925964027643204, "grad_norm_var": 1.5916234579707992e-05, "learning_rate": 0.009971447425265676, "loss": 2.8093, "step": 1472 }, { "crossentropy": 2.936298370361328, "epoch": 0.05340052204176334, "grad_norm": 0.05253548547625542, "grad_norm_var": 1.605256964037511e-05, "learning_rate": 0.009971385380021338, "loss": 2.8552, "step": 1473 }, { "crossentropy": 2.9022860527038574, "epoch": 0.05343677494199536, "grad_norm": 0.06653493642807007, "grad_norm_var": 2.5664673930864538e-05, "learning_rate": 0.009971323267630896, "loss": 2.8455, "step": 1474 }, { "crossentropy": 2.892286777496338, "epoch": 0.05347302784222738, "grad_norm": 0.08205777406692505, "grad_norm_var": 7.191213816624959e-05, "learning_rate": 0.009971261088095191, "loss": 2.928, "step": 1475 }, { "crossentropy": 2.9148571491241455, "epoch": 0.0535092807424594, "grad_norm": 0.07455160468816757, "grad_norm_var": 9.204368731679369e-05, "learning_rate": 0.00997119884141506, "loss": 2.8441, "step": 1476 }, { "crossentropy": 2.8396360874176025, "epoch": 0.05354553364269141, "grad_norm": 0.05461309850215912, "grad_norm_var": 8.712263829480782e-05, "learning_rate": 0.009971136527591342, "loss": 2.7961, "step": 1477 }, { "crossentropy": 2.8146142959594727, "epoch": 0.053581786542923435, "grad_norm": 0.05892155319452286, "grad_norm_var": 8.304931307067087e-05, "learning_rate": 0.009971074146624884, "loss": 2.858, "step": 1478 }, { "crossentropy": 2.678715467453003, "epoch": 0.05361803944315545, "grad_norm": 0.05696796998381615, "grad_norm_var": 7.94473765121303e-05, "learning_rate": 0.009971011698516526, "loss": 2.7415, "step": 1479 }, { "crossentropy": 2.9726955890655518, "epoch": 0.05365429234338747, "grad_norm": 0.051889568567276, "grad_norm_var": 7.904821519178618e-05, "learning_rate": 0.00997094918326711, "loss": 2.8875, "step": 1480 }, { "crossentropy": 2.8411662578582764, "epoch": 0.05369054524361949, "grad_norm": 0.05779115855693817, "grad_norm_var": 7.514165729325557e-05, "learning_rate": 0.009970886600877483, "loss": 2.8313, "step": 1481 }, { "crossentropy": 2.748932361602783, "epoch": 0.053726798143851506, "grad_norm": 0.05543334409594536, "grad_norm_var": 7.597143402963506e-05, "learning_rate": 0.009970823951348486, "loss": 2.8231, "step": 1482 }, { "crossentropy": 2.9378857612609863, "epoch": 0.05376305104408353, "grad_norm": 0.05792783945798874, "grad_norm_var": 7.535884496593162e-05, "learning_rate": 0.009970761234680969, "loss": 2.9268, "step": 1483 }, { "crossentropy": 2.6384315490722656, "epoch": 0.05379930394431554, "grad_norm": 0.060094818472862244, "grad_norm_var": 7.520576919030353e-05, "learning_rate": 0.00997069845087578, "loss": 2.7355, "step": 1484 }, { "crossentropy": 2.819826364517212, "epoch": 0.053835556844547564, "grad_norm": 0.05206628516316414, "grad_norm_var": 7.819618122198487e-05, "learning_rate": 0.009970635599933764, "loss": 2.8677, "step": 1485 }, { "crossentropy": 2.807738780975342, "epoch": 0.053871809744779585, "grad_norm": 0.050769221037626266, "grad_norm_var": 8.1841337376505e-05, "learning_rate": 0.009970572681855772, "loss": 2.8047, "step": 1486 }, { "crossentropy": 2.775806427001953, "epoch": 0.0539080626450116, "grad_norm": 0.05518389493227005, "grad_norm_var": 7.895884754775766e-05, "learning_rate": 0.00997050969664265, "loss": 2.7919, "step": 1487 }, { "crossentropy": 2.863308906555176, "epoch": 0.05394431554524362, "grad_norm": 0.0634244754910469, "grad_norm_var": 7.397664009891006e-05, "learning_rate": 0.009970446644295255, "loss": 2.902, "step": 1488 }, { "crossentropy": 2.8998003005981445, "epoch": 0.053980568445475635, "grad_norm": 0.05432151257991791, "grad_norm_var": 7.253591071702258e-05, "learning_rate": 0.009970383524814433, "loss": 2.7995, "step": 1489 }, { "crossentropy": 2.8473129272460938, "epoch": 0.05401682134570766, "grad_norm": 0.0516357496380806, "grad_norm_var": 7.250288087055241e-05, "learning_rate": 0.00997032033820104, "loss": 2.8754, "step": 1490 }, { "crossentropy": 2.829521894454956, "epoch": 0.05405307424593968, "grad_norm": 0.05191561579704285, "grad_norm_var": 3.5024038482803914e-05, "learning_rate": 0.009970257084455926, "loss": 2.7983, "step": 1491 }, { "crossentropy": 2.910909414291382, "epoch": 0.05408932714617169, "grad_norm": 0.052694689482450485, "grad_norm_var": 1.291374684021651e-05, "learning_rate": 0.00997019376357995, "loss": 2.9144, "step": 1492 }, { "crossentropy": 2.963209867477417, "epoch": 0.054125580046403714, "grad_norm": 0.051140282303094864, "grad_norm_var": 1.4010211586222103e-05, "learning_rate": 0.009970130375573963, "loss": 2.9853, "step": 1493 }, { "crossentropy": 2.923062562942505, "epoch": 0.05416183294663573, "grad_norm": 0.0539436936378479, "grad_norm_var": 1.3046459576705981e-05, "learning_rate": 0.009970066920438822, "loss": 2.8558, "step": 1494 }, { "crossentropy": 2.880051612854004, "epoch": 0.05419808584686775, "grad_norm": 0.05351411923766136, "grad_norm_var": 1.280516435482118e-05, "learning_rate": 0.009970003398175387, "loss": 2.8521, "step": 1495 }, { "crossentropy": 2.8137948513031006, "epoch": 0.05423433874709977, "grad_norm": 0.05120949074625969, "grad_norm_var": 1.3080673826436756e-05, "learning_rate": 0.009969939808784512, "loss": 2.7978, "step": 1496 }, { "crossentropy": 3.0412471294403076, "epoch": 0.054270591647331785, "grad_norm": 0.05836424604058266, "grad_norm_var": 1.3347591721309347e-05, "learning_rate": 0.009969876152267059, "loss": 2.8925, "step": 1497 }, { "crossentropy": 2.7991576194763184, "epoch": 0.05430684454756381, "grad_norm": 0.05132029578089714, "grad_norm_var": 1.394924962647125e-05, "learning_rate": 0.009969812428623886, "loss": 2.8375, "step": 1498 }, { "crossentropy": 2.760923147201538, "epoch": 0.05434309744779582, "grad_norm": 0.059978652745485306, "grad_norm_var": 1.5191705823768865e-05, "learning_rate": 0.009969748637855855, "loss": 2.826, "step": 1499 }, { "crossentropy": 2.762451410293579, "epoch": 0.05437935034802784, "grad_norm": 0.0624321885406971, "grad_norm_var": 1.7285021888936943e-05, "learning_rate": 0.009969684779963826, "loss": 2.8294, "step": 1500 }, { "crossentropy": 2.8184051513671875, "epoch": 0.054415603248259864, "grad_norm": 0.05218669772148132, "grad_norm_var": 1.724493378244072e-05, "learning_rate": 0.009969620854948662, "loss": 2.8342, "step": 1501 }, { "crossentropy": 2.942683458328247, "epoch": 0.05445185614849188, "grad_norm": 0.06102383881807327, "grad_norm_var": 1.8542343259294232e-05, "learning_rate": 0.009969556862811228, "loss": 2.8823, "step": 1502 }, { "crossentropy": 2.8834786415100098, "epoch": 0.0544881090487239, "grad_norm": 0.05586370825767517, "grad_norm_var": 1.8563595783813863e-05, "learning_rate": 0.009969492803552386, "loss": 2.8338, "step": 1503 }, { "crossentropy": 2.674447774887085, "epoch": 0.054524361948955914, "grad_norm": 0.055692609399557114, "grad_norm_var": 1.3935213413095113e-05, "learning_rate": 0.009969428677173002, "loss": 2.8303, "step": 1504 }, { "crossentropy": 2.887359142303467, "epoch": 0.054560614849187936, "grad_norm": 0.05996624752879143, "grad_norm_var": 1.5545953708904352e-05, "learning_rate": 0.009969364483673944, "loss": 2.8623, "step": 1505 }, { "crossentropy": 2.7974441051483154, "epoch": 0.05459686774941996, "grad_norm": 0.05693890154361725, "grad_norm_var": 1.479748045265239e-05, "learning_rate": 0.009969300223056076, "loss": 2.8474, "step": 1506 }, { "crossentropy": 2.7607502937316895, "epoch": 0.05463312064965197, "grad_norm": 0.056769758462905884, "grad_norm_var": 1.3942772929280885e-05, "learning_rate": 0.009969235895320267, "loss": 2.7803, "step": 1507 }, { "crossentropy": 2.9118053913116455, "epoch": 0.05466937354988399, "grad_norm": 0.0594710037112236, "grad_norm_var": 1.3993480551021247e-05, "learning_rate": 0.009969171500467387, "loss": 2.8975, "step": 1508 }, { "crossentropy": 2.7340543270111084, "epoch": 0.05470562645011601, "grad_norm": 0.0598909966647625, "grad_norm_var": 1.2831031286144173e-05, "learning_rate": 0.009969107038498305, "loss": 2.9289, "step": 1509 }, { "crossentropy": 2.9760303497314453, "epoch": 0.05474187935034803, "grad_norm": 0.0912477970123291, "grad_norm_var": 8.567146642086681e-05, "learning_rate": 0.00996904250941389, "loss": 2.8816, "step": 1510 }, { "crossentropy": 2.7096738815307617, "epoch": 0.05477813225058004, "grad_norm": 0.06301754713058472, "grad_norm_var": 8.421673432701926e-05, "learning_rate": 0.009968977913215016, "loss": 2.757, "step": 1511 }, { "crossentropy": 2.8474745750427246, "epoch": 0.054814385150812064, "grad_norm": 0.056433942168951035, "grad_norm_var": 8.000065677704037e-05, "learning_rate": 0.009968913249902555, "loss": 2.8952, "step": 1512 }, { "crossentropy": 2.922882080078125, "epoch": 0.054850638051044086, "grad_norm": 0.0554632768034935, "grad_norm_var": 8.117380303506213e-05, "learning_rate": 0.00996884851947738, "loss": 2.8415, "step": 1513 }, { "crossentropy": 2.9205520153045654, "epoch": 0.0548868909512761, "grad_norm": 0.04939021170139313, "grad_norm_var": 8.360326999863424e-05, "learning_rate": 0.009968783721940364, "loss": 2.8496, "step": 1514 }, { "crossentropy": 2.874635696411133, "epoch": 0.05492314385150812, "grad_norm": 0.054852262139320374, "grad_norm_var": 8.507953651625122e-05, "learning_rate": 0.009968718857292384, "loss": 2.9017, "step": 1515 }, { "crossentropy": 2.9087910652160645, "epoch": 0.054959396751740136, "grad_norm": 0.06826084852218628, "grad_norm_var": 8.954764051705121e-05, "learning_rate": 0.009968653925534317, "loss": 2.9205, "step": 1516 }, { "crossentropy": 2.7769477367401123, "epoch": 0.05499564965197216, "grad_norm": 0.05859686806797981, "grad_norm_var": 8.562642145208728e-05, "learning_rate": 0.009968588926667037, "loss": 2.7398, "step": 1517 }, { "crossentropy": 2.772388458251953, "epoch": 0.05503190255220418, "grad_norm": 0.06544806808233261, "grad_norm_var": 8.734756913682986e-05, "learning_rate": 0.009968523860691424, "loss": 2.8133, "step": 1518 }, { "crossentropy": 2.828303575515747, "epoch": 0.05506815545243619, "grad_norm": 0.05520660802721977, "grad_norm_var": 8.777694562480282e-05, "learning_rate": 0.009968458727608357, "loss": 2.8683, "step": 1519 }, { "crossentropy": 2.8364994525909424, "epoch": 0.055104408352668215, "grad_norm": 0.0538766011595726, "grad_norm_var": 8.912662199223315e-05, "learning_rate": 0.009968393527418714, "loss": 2.8826, "step": 1520 }, { "crossentropy": 2.8633909225463867, "epoch": 0.05514066125290023, "grad_norm": 0.050089314579963684, "grad_norm_var": 9.566580804031809e-05, "learning_rate": 0.009968328260123376, "loss": 2.8112, "step": 1521 }, { "crossentropy": 2.83091139793396, "epoch": 0.05517691415313225, "grad_norm": 0.052509061992168427, "grad_norm_var": 9.851402423875591e-05, "learning_rate": 0.009968262925723226, "loss": 2.851, "step": 1522 }, { "crossentropy": 2.805391311645508, "epoch": 0.05521316705336427, "grad_norm": 0.053705453872680664, "grad_norm_var": 0.000100178715757492, "learning_rate": 0.009968197524219145, "loss": 2.8738, "step": 1523 }, { "crossentropy": 2.8237144947052, "epoch": 0.055249419953596286, "grad_norm": 0.05113782733678818, "grad_norm_var": 0.00010423576608184955, "learning_rate": 0.009968132055612018, "loss": 2.8433, "step": 1524 }, { "crossentropy": 2.685746192932129, "epoch": 0.05528567285382831, "grad_norm": 0.06507644802331924, "grad_norm_var": 0.00010674293814703402, "learning_rate": 0.009968066519902728, "loss": 2.6992, "step": 1525 }, { "crossentropy": 2.754725933074951, "epoch": 0.05532192575406032, "grad_norm": 0.053902436047792435, "grad_norm_var": 3.3433245070006264e-05, "learning_rate": 0.009968000917092159, "loss": 2.8756, "step": 1526 }, { "crossentropy": 2.8131918907165527, "epoch": 0.05535817865429234, "grad_norm": 0.05403832346200943, "grad_norm_var": 3.0891391399745276e-05, "learning_rate": 0.0099679352471812, "loss": 2.8192, "step": 1527 }, { "crossentropy": 2.9055001735687256, "epoch": 0.055394431554524365, "grad_norm": 0.048225659877061844, "grad_norm_var": 3.476341562489438e-05, "learning_rate": 0.009967869510170735, "loss": 2.8023, "step": 1528 }, { "crossentropy": 2.8277029991149902, "epoch": 0.05543068445475638, "grad_norm": 0.05092617869377136, "grad_norm_var": 3.613948257095873e-05, "learning_rate": 0.009967803706061654, "loss": 2.8039, "step": 1529 }, { "crossentropy": 2.864807367324829, "epoch": 0.0554669373549884, "grad_norm": 0.05086524039506912, "grad_norm_var": 3.5107748317238695e-05, "learning_rate": 0.009967737834854846, "loss": 2.8219, "step": 1530 }, { "crossentropy": 2.9026989936828613, "epoch": 0.055503190255220415, "grad_norm": 0.049421507865190506, "grad_norm_var": 3.736203916986271e-05, "learning_rate": 0.009967671896551199, "loss": 2.7796, "step": 1531 }, { "crossentropy": 2.9825968742370605, "epoch": 0.055539443155452436, "grad_norm": 0.052736908197402954, "grad_norm_var": 2.514242414685854e-05, "learning_rate": 0.009967605891151603, "loss": 2.8618, "step": 1532 }, { "crossentropy": 2.801103115081787, "epoch": 0.05557569605568446, "grad_norm": 0.05491020902991295, "grad_norm_var": 2.3786426693644426e-05, "learning_rate": 0.009967539818656952, "loss": 2.8381, "step": 1533 }, { "crossentropy": 2.877410650253296, "epoch": 0.05561194895591647, "grad_norm": 0.05447278544306755, "grad_norm_var": 1.4386223065404602e-05, "learning_rate": 0.009967473679068138, "loss": 2.8933, "step": 1534 }, { "crossentropy": 2.741288185119629, "epoch": 0.055648201856148494, "grad_norm": 0.049340344965457916, "grad_norm_var": 1.496267182219581e-05, "learning_rate": 0.009967407472386051, "loss": 2.7062, "step": 1535 }, { "crossentropy": 2.968871831893921, "epoch": 0.05568445475638051, "grad_norm": 0.0492333322763443, "grad_norm_var": 1.566044675418565e-05, "learning_rate": 0.00996734119861159, "loss": 2.9286, "step": 1536 }, { "crossentropy": 2.8451104164123535, "epoch": 0.05572070765661253, "grad_norm": 0.05214967206120491, "grad_norm_var": 1.5253366170560566e-05, "learning_rate": 0.009967274857745647, "loss": 2.8469, "step": 1537 }, { "crossentropy": 2.8521125316619873, "epoch": 0.05575696055684455, "grad_norm": 0.0583011731505394, "grad_norm_var": 1.7229172867658932e-05, "learning_rate": 0.009967208449789119, "loss": 2.8407, "step": 1538 }, { "crossentropy": 2.9486584663391113, "epoch": 0.055793213457076565, "grad_norm": 0.050875477492809296, "grad_norm_var": 1.747399085999246e-05, "learning_rate": 0.009967141974742903, "loss": 2.9389, "step": 1539 }, { "crossentropy": 2.919334650039673, "epoch": 0.05582946635730859, "grad_norm": 0.0492018386721611, "grad_norm_var": 1.815042855693727e-05, "learning_rate": 0.009967075432607898, "loss": 2.8451, "step": 1540 }, { "crossentropy": 2.850132465362549, "epoch": 0.0558657192575406, "grad_norm": 0.04984137415885925, "grad_norm_var": 7.57695990132104e-06, "learning_rate": 0.009967008823385, "loss": 2.8744, "step": 1541 }, { "crossentropy": 2.979681968688965, "epoch": 0.05590197215777262, "grad_norm": 0.049206994473934174, "grad_norm_var": 7.624669376459815e-06, "learning_rate": 0.00996694214707511, "loss": 2.9057, "step": 1542 }, { "crossentropy": 2.7640609741210938, "epoch": 0.05593822505800464, "grad_norm": 0.05059802532196045, "grad_norm_var": 7.192799601839622e-06, "learning_rate": 0.00996687540367913, "loss": 2.7548, "step": 1543 }, { "crossentropy": 2.8249242305755615, "epoch": 0.05597447795823666, "grad_norm": 0.05203748494386673, "grad_norm_var": 6.554081425154632e-06, "learning_rate": 0.009966808593197958, "loss": 2.8523, "step": 1544 }, { "crossentropy": 2.786722421646118, "epoch": 0.05601073085846868, "grad_norm": 0.05128967761993408, "grad_norm_var": 6.534169420181988e-06, "learning_rate": 0.0099667417156325, "loss": 2.7617, "step": 1545 }, { "crossentropy": 2.7113661766052246, "epoch": 0.056046983758700694, "grad_norm": 0.05538543313741684, "grad_norm_var": 7.410455711211415e-06, "learning_rate": 0.009966674770983659, "loss": 2.7699, "step": 1546 }, { "crossentropy": 2.683079957962036, "epoch": 0.056083236658932716, "grad_norm": 0.052056003361940384, "grad_norm_var": 7.004317566146565e-06, "learning_rate": 0.009966607759252337, "loss": 2.7352, "step": 1547 }, { "crossentropy": 2.915294885635376, "epoch": 0.05611948955916473, "grad_norm": 0.05569211021065712, "grad_norm_var": 7.84945145754819e-06, "learning_rate": 0.009966540680439439, "loss": 2.7919, "step": 1548 }, { "crossentropy": 2.9371795654296875, "epoch": 0.05615574245939675, "grad_norm": 0.06088916212320328, "grad_norm_var": 1.227455218683222e-05, "learning_rate": 0.009966473534545873, "loss": 2.8611, "step": 1549 }, { "crossentropy": 2.907644271850586, "epoch": 0.05619199535962877, "grad_norm": 0.05216985568404198, "grad_norm_var": 1.2011217840177807e-05, "learning_rate": 0.009966406321572545, "loss": 2.8971, "step": 1550 }, { "crossentropy": 2.909512758255005, "epoch": 0.05622824825986079, "grad_norm": 0.05089854076504707, "grad_norm_var": 1.1529008547408494e-05, "learning_rate": 0.009966339041520363, "loss": 2.8976, "step": 1551 }, { "crossentropy": 2.8217241764068604, "epoch": 0.05626450116009281, "grad_norm": 0.04985124617815018, "grad_norm_var": 1.1284631401597446e-05, "learning_rate": 0.009966271694390235, "loss": 2.8013, "step": 1552 }, { "crossentropy": 2.9601690769195557, "epoch": 0.05630075406032482, "grad_norm": 0.05072799697518349, "grad_norm_var": 1.1482621924567965e-05, "learning_rate": 0.00996620428018307, "loss": 2.8356, "step": 1553 }, { "crossentropy": 2.925957441329956, "epoch": 0.056337006960556844, "grad_norm": 0.057870492339134216, "grad_norm_var": 1.1157578968963519e-05, "learning_rate": 0.009966136798899784, "loss": 2.8555, "step": 1554 }, { "crossentropy": 2.7670910358428955, "epoch": 0.056373259860788866, "grad_norm": 0.06214733049273491, "grad_norm_var": 1.6789261985057876e-05, "learning_rate": 0.009966069250541281, "loss": 2.8305, "step": 1555 }, { "crossentropy": 2.9264822006225586, "epoch": 0.05640951276102088, "grad_norm": 0.052505578845739365, "grad_norm_var": 1.5747039430911932e-05, "learning_rate": 0.009966001635108477, "loss": 2.821, "step": 1556 }, { "crossentropy": 2.865114688873291, "epoch": 0.0564457656612529, "grad_norm": 0.05299770459532738, "grad_norm_var": 1.4904487484310863e-05, "learning_rate": 0.009965933952602283, "loss": 2.8349, "step": 1557 }, { "crossentropy": 2.7811508178710938, "epoch": 0.056482018561484916, "grad_norm": 0.05308510735630989, "grad_norm_var": 1.3614178607842102e-05, "learning_rate": 0.009965866203023617, "loss": 2.8135, "step": 1558 }, { "crossentropy": 2.852108955383301, "epoch": 0.05651827146171694, "grad_norm": 0.05037655681371689, "grad_norm_var": 1.3710691554435066e-05, "learning_rate": 0.009965798386373393, "loss": 2.8796, "step": 1559 }, { "crossentropy": 2.7336177825927734, "epoch": 0.05655452436194896, "grad_norm": 0.05181380733847618, "grad_norm_var": 1.3764855281144424e-05, "learning_rate": 0.009965730502652523, "loss": 2.742, "step": 1560 }, { "crossentropy": 3.039026975631714, "epoch": 0.05659077726218097, "grad_norm": 0.05036398023366928, "grad_norm_var": 1.4120203448941595e-05, "learning_rate": 0.00996566255186193, "loss": 2.8994, "step": 1561 }, { "crossentropy": 2.856505870819092, "epoch": 0.056627030162412995, "grad_norm": 0.04952233284711838, "grad_norm_var": 1.4933084605889037e-05, "learning_rate": 0.009965594534002526, "loss": 2.8301, "step": 1562 }, { "crossentropy": 2.904428482055664, "epoch": 0.05666328306264501, "grad_norm": 0.051342904567718506, "grad_norm_var": 1.5084142660805264e-05, "learning_rate": 0.009965526449075232, "loss": 2.8818, "step": 1563 }, { "crossentropy": 2.921726942062378, "epoch": 0.05669953596287703, "grad_norm": 0.050791725516319275, "grad_norm_var": 1.4999767396236171e-05, "learning_rate": 0.00996545829708097, "loss": 2.8203, "step": 1564 }, { "crossentropy": 2.881057024002075, "epoch": 0.05673578886310905, "grad_norm": 0.05335037037730217, "grad_norm_var": 1.0581323444195691e-05, "learning_rate": 0.009965390078020657, "loss": 2.8315, "step": 1565 }, { "crossentropy": 2.800327777862549, "epoch": 0.056772041763341066, "grad_norm": 0.05651964619755745, "grad_norm_var": 1.1579077988744559e-05, "learning_rate": 0.009965321791895217, "loss": 2.8592, "step": 1566 }, { "crossentropy": 2.8451626300811768, "epoch": 0.05680829466357309, "grad_norm": 0.0574449747800827, "grad_norm_var": 1.2632485739109273e-05, "learning_rate": 0.009965253438705569, "loss": 2.8201, "step": 1567 }, { "crossentropy": 2.9001266956329346, "epoch": 0.0568445475638051, "grad_norm": 0.06757784634828568, "grad_norm_var": 2.4429196128202223e-05, "learning_rate": 0.00996518501845264, "loss": 2.8754, "step": 1568 }, { "crossentropy": 2.775674819946289, "epoch": 0.05688080046403712, "grad_norm": 0.062119536101818085, "grad_norm_var": 2.714855961254651e-05, "learning_rate": 0.00996511653113735, "loss": 2.8337, "step": 1569 }, { "crossentropy": 2.707669734954834, "epoch": 0.056917053364269145, "grad_norm": 0.05440250039100647, "grad_norm_var": 2.656801651588671e-05, "learning_rate": 0.009965047976760628, "loss": 2.7868, "step": 1570 }, { "crossentropy": 2.8390445709228516, "epoch": 0.05695330626450116, "grad_norm": 0.05214314162731171, "grad_norm_var": 2.2986186047707635e-05, "learning_rate": 0.0099649793553234, "loss": 2.8638, "step": 1571 }, { "crossentropy": 2.825791358947754, "epoch": 0.05698955916473318, "grad_norm": 0.05336413532495499, "grad_norm_var": 2.284431474935718e-05, "learning_rate": 0.00996491066682659, "loss": 2.7871, "step": 1572 }, { "crossentropy": 2.943040370941162, "epoch": 0.057025812064965195, "grad_norm": 0.05383538827300072, "grad_norm_var": 2.275377252715967e-05, "learning_rate": 0.009964841911271125, "loss": 2.9616, "step": 1573 }, { "crossentropy": 2.810002565383911, "epoch": 0.057062064965197216, "grad_norm": 0.05132429674267769, "grad_norm_var": 2.3221829971848257e-05, "learning_rate": 0.009964773088657938, "loss": 2.7304, "step": 1574 }, { "crossentropy": 2.7823967933654785, "epoch": 0.05709831786542923, "grad_norm": 0.04875557869672775, "grad_norm_var": 2.4200165510430914e-05, "learning_rate": 0.009964704198987954, "loss": 2.7651, "step": 1575 }, { "crossentropy": 2.9095098972320557, "epoch": 0.05713457076566125, "grad_norm": 0.05566804111003876, "grad_norm_var": 2.3983541684620875e-05, "learning_rate": 0.009964635242262107, "loss": 2.8803, "step": 1576 }, { "crossentropy": 2.995256185531616, "epoch": 0.057170823665893274, "grad_norm": 0.05448927357792854, "grad_norm_var": 2.28916103873516e-05, "learning_rate": 0.009964566218481328, "loss": 2.9782, "step": 1577 }, { "crossentropy": 2.9738526344299316, "epoch": 0.05720707656612529, "grad_norm": 0.05363227054476738, "grad_norm_var": 2.119729425027935e-05, "learning_rate": 0.009964497127646547, "loss": 2.9536, "step": 1578 }, { "crossentropy": 2.709338665008545, "epoch": 0.05724332946635731, "grad_norm": 0.05647753179073334, "grad_norm_var": 2.0479924658809805e-05, "learning_rate": 0.009964427969758698, "loss": 2.796, "step": 1579 }, { "crossentropy": 2.755878448486328, "epoch": 0.057279582366589324, "grad_norm": 0.05827171728014946, "grad_norm_var": 1.9661569246463318e-05, "learning_rate": 0.009964358744818715, "loss": 2.7839, "step": 1580 }, { "crossentropy": 2.896472930908203, "epoch": 0.057315835266821345, "grad_norm": 0.053867291659116745, "grad_norm_var": 1.9524182711504595e-05, "learning_rate": 0.009964289452827535, "loss": 2.8354, "step": 1581 }, { "crossentropy": 2.8610122203826904, "epoch": 0.05735208816705337, "grad_norm": 0.04923809692263603, "grad_norm_var": 2.1962922280044776e-05, "learning_rate": 0.009964220093786093, "loss": 2.8862, "step": 1582 }, { "crossentropy": 2.913170576095581, "epoch": 0.05738834106728538, "grad_norm": 0.0530567541718483, "grad_norm_var": 2.183141020653112e-05, "learning_rate": 0.009964150667695324, "loss": 2.8727, "step": 1583 }, { "crossentropy": 2.813631534576416, "epoch": 0.0574245939675174, "grad_norm": 0.05587901547551155, "grad_norm_var": 1.0592644684229852e-05, "learning_rate": 0.009964081174556166, "loss": 2.8256, "step": 1584 }, { "crossentropy": 2.832481861114502, "epoch": 0.05746084686774942, "grad_norm": 0.05384361743927002, "grad_norm_var": 6.087881605332246e-06, "learning_rate": 0.009964011614369559, "loss": 2.7807, "step": 1585 }, { "crossentropy": 2.8859877586364746, "epoch": 0.05749709976798144, "grad_norm": 0.05245070904493332, "grad_norm_var": 6.1276831828972755e-06, "learning_rate": 0.009963941987136442, "loss": 2.911, "step": 1586 }, { "crossentropy": 2.756622076034546, "epoch": 0.05753335266821346, "grad_norm": 0.051596373319625854, "grad_norm_var": 6.2466388064987e-06, "learning_rate": 0.009963872292857756, "loss": 2.8473, "step": 1587 }, { "crossentropy": 2.91902756690979, "epoch": 0.057569605568445474, "grad_norm": 0.0534910149872303, "grad_norm_var": 6.245610733681718e-06, "learning_rate": 0.009963802531534442, "loss": 2.8672, "step": 1588 }, { "crossentropy": 2.879546642303467, "epoch": 0.057605858468677495, "grad_norm": 0.05412451922893524, "grad_norm_var": 6.264061441316887e-06, "learning_rate": 0.00996373270316744, "loss": 2.9611, "step": 1589 }, { "crossentropy": 2.891731023788452, "epoch": 0.05764211136890951, "grad_norm": 0.054564252495765686, "grad_norm_var": 5.9757678574883765e-06, "learning_rate": 0.0099636628077577, "loss": 2.882, "step": 1590 }, { "crossentropy": 2.735013723373413, "epoch": 0.05767836426914153, "grad_norm": 0.05393480136990547, "grad_norm_var": 4.2289615622988686e-06, "learning_rate": 0.009963592845306155, "loss": 2.8109, "step": 1591 }, { "crossentropy": 2.7841241359710693, "epoch": 0.05771461716937355, "grad_norm": 0.051806945353746414, "grad_norm_var": 4.3208183440778794e-06, "learning_rate": 0.00996352281581376, "loss": 2.7906, "step": 1592 }, { "crossentropy": 2.709374189376831, "epoch": 0.05775087006960557, "grad_norm": 0.05095397308468819, "grad_norm_var": 4.774826379989362e-06, "learning_rate": 0.009963452719281455, "loss": 2.7386, "step": 1593 }, { "crossentropy": 2.8296053409576416, "epoch": 0.05778712296983759, "grad_norm": 0.05581948533654213, "grad_norm_var": 5.090725001013058e-06, "learning_rate": 0.00996338255571019, "loss": 2.8353, "step": 1594 }, { "crossentropy": 2.8501176834106445, "epoch": 0.0578233758700696, "grad_norm": 0.059041827917099, "grad_norm_var": 6.447593034216749e-06, "learning_rate": 0.00996331232510091, "loss": 2.901, "step": 1595 }, { "crossentropy": 2.9756767749786377, "epoch": 0.057859628770301624, "grad_norm": 0.06996358186006546, "grad_norm_var": 2.1851241295093442e-05, "learning_rate": 0.009963242027454567, "loss": 2.9753, "step": 1596 }, { "crossentropy": 2.7322707176208496, "epoch": 0.057895881670533646, "grad_norm": 0.0668538510799408, "grad_norm_var": 3.111970573955915e-05, "learning_rate": 0.009963171662772106, "loss": 2.7892, "step": 1597 }, { "crossentropy": 2.8716683387756348, "epoch": 0.05793213457076566, "grad_norm": 0.05446505919098854, "grad_norm_var": 2.852334067172246e-05, "learning_rate": 0.009963101231054481, "loss": 2.8131, "step": 1598 }, { "crossentropy": 2.851775646209717, "epoch": 0.05796838747099768, "grad_norm": 0.06050573289394379, "grad_norm_var": 2.9325944901200234e-05, "learning_rate": 0.009963030732302643, "loss": 2.7811, "step": 1599 }, { "crossentropy": 2.776174306869507, "epoch": 0.058004640371229696, "grad_norm": 0.053926385939121246, "grad_norm_var": 2.9649352975677502e-05, "learning_rate": 0.009962960166517541, "loss": 2.8546, "step": 1600 }, { "crossentropy": 2.7619853019714355, "epoch": 0.05804089327146172, "grad_norm": 0.05693696439266205, "grad_norm_var": 2.9323413468090864e-05, "learning_rate": 0.009962889533700132, "loss": 2.7962, "step": 1601 }, { "crossentropy": 2.785717725753784, "epoch": 0.05807714617169374, "grad_norm": 0.05582616850733757, "grad_norm_var": 2.8313358226927412e-05, "learning_rate": 0.009962818833851368, "loss": 2.8363, "step": 1602 }, { "crossentropy": 2.718827962875366, "epoch": 0.05811339907192575, "grad_norm": 0.05049153044819832, "grad_norm_var": 2.911027481055812e-05, "learning_rate": 0.009962748066972204, "loss": 2.8173, "step": 1603 }, { "crossentropy": 2.8226046562194824, "epoch": 0.058149651972157775, "grad_norm": 0.05314037203788757, "grad_norm_var": 2.925485564113413e-05, "learning_rate": 0.009962677233063598, "loss": 2.8258, "step": 1604 }, { "crossentropy": 2.8388054370880127, "epoch": 0.05818590487238979, "grad_norm": 0.05204897001385689, "grad_norm_var": 3.0153045457605396e-05, "learning_rate": 0.009962606332126502, "loss": 2.8428, "step": 1605 }, { "crossentropy": 2.7416884899139404, "epoch": 0.05822215777262181, "grad_norm": 0.05397060513496399, "grad_norm_var": 3.030988816480872e-05, "learning_rate": 0.00996253536416188, "loss": 2.8573, "step": 1606 }, { "crossentropy": 3.1691043376922607, "epoch": 0.05825841067285383, "grad_norm": 0.15927205979824066, "grad_norm_var": 0.0006915645328381347, "learning_rate": 0.009962464329170685, "loss": 3.0241, "step": 1607 }, { "crossentropy": 2.9314539432525635, "epoch": 0.058294663573085846, "grad_norm": 0.07650832831859589, "grad_norm_var": 0.0006934475984926784, "learning_rate": 0.009962393227153877, "loss": 2.9304, "step": 1608 }, { "crossentropy": 2.7468416690826416, "epoch": 0.05833091647331787, "grad_norm": 0.07252824306488037, "grad_norm_var": 0.0006839811141466538, "learning_rate": 0.009962322058112419, "loss": 2.7876, "step": 1609 }, { "crossentropy": 2.897923707962036, "epoch": 0.05836716937354988, "grad_norm": 0.06805692613124847, "grad_norm_var": 0.0006772090577327766, "learning_rate": 0.009962250822047271, "loss": 2.8596, "step": 1610 }, { "crossentropy": 2.9351892471313477, "epoch": 0.0584034222737819, "grad_norm": 0.08801339566707611, "grad_norm_var": 0.0007009704265584354, "learning_rate": 0.009962179518959396, "loss": 2.7832, "step": 1611 }, { "crossentropy": 2.789367914199829, "epoch": 0.05843967517401392, "grad_norm": 0.061107832938432693, "grad_norm_var": 0.000703886106737323, "learning_rate": 0.009962108148849756, "loss": 2.7794, "step": 1612 }, { "crossentropy": 2.6985254287719727, "epoch": 0.05847592807424594, "grad_norm": 0.060849983245134354, "grad_norm_var": 0.0007068389999134799, "learning_rate": 0.009962036711719314, "loss": 2.7599, "step": 1613 }, { "crossentropy": 2.779519557952881, "epoch": 0.05851218097447796, "grad_norm": 0.050731148570775986, "grad_norm_var": 0.0007141267203412738, "learning_rate": 0.009961965207569036, "loss": 2.8178, "step": 1614 }, { "crossentropy": 3.0166752338409424, "epoch": 0.058548433874709975, "grad_norm": 0.05787710100412369, "grad_norm_var": 0.0007168766561444252, "learning_rate": 0.009961893636399889, "loss": 2.9102, "step": 1615 }, { "crossentropy": 2.8045120239257812, "epoch": 0.058584686774941996, "grad_norm": 0.07424033433198929, "grad_norm_var": 0.0007073783232567685, "learning_rate": 0.009961821998212838, "loss": 2.8315, "step": 1616 }, { "crossentropy": 2.695925712585449, "epoch": 0.05862093967517401, "grad_norm": 0.05145394802093506, "grad_norm_var": 0.0007175096195824125, "learning_rate": 0.009961750293008852, "loss": 2.8258, "step": 1617 }, { "crossentropy": 2.65385103225708, "epoch": 0.05865719257540603, "grad_norm": 0.052566416561603546, "grad_norm_var": 0.0007234137472491861, "learning_rate": 0.009961678520788898, "loss": 2.7975, "step": 1618 }, { "crossentropy": 2.8124423027038574, "epoch": 0.058693445475638054, "grad_norm": 0.049864452332258224, "grad_norm_var": 0.0007248753398452637, "learning_rate": 0.009961606681553945, "loss": 2.8577, "step": 1619 }, { "crossentropy": 2.8508434295654297, "epoch": 0.05872969837587007, "grad_norm": 0.05490894988179207, "grad_norm_var": 0.0007216518144564041, "learning_rate": 0.009961534775304964, "loss": 2.8574, "step": 1620 }, { "crossentropy": 2.925318956375122, "epoch": 0.05876595127610209, "grad_norm": 0.05251513421535492, "grad_norm_var": 0.0007206895002614755, "learning_rate": 0.009961462802042928, "loss": 2.8728, "step": 1621 }, { "crossentropy": 2.920893907546997, "epoch": 0.058802204176334104, "grad_norm": 0.052012305706739426, "grad_norm_var": 0.0007245346607263649, "learning_rate": 0.009961390761768807, "loss": 2.8457, "step": 1622 }, { "crossentropy": 2.9236981868743896, "epoch": 0.058838457076566125, "grad_norm": 0.058384139090776443, "grad_norm_var": 0.00012829798586767483, "learning_rate": 0.009961318654483573, "loss": 2.8529, "step": 1623 }, { "crossentropy": 2.857003927230835, "epoch": 0.05887470997679815, "grad_norm": 0.05418425798416138, "grad_norm_var": 0.00011432979896607893, "learning_rate": 0.009961246480188203, "loss": 2.8482, "step": 1624 }, { "crossentropy": 2.782912015914917, "epoch": 0.05891096287703016, "grad_norm": 0.05145278945565224, "grad_norm_var": 0.00010676170575203703, "learning_rate": 0.009961174238883671, "loss": 2.8496, "step": 1625 }, { "crossentropy": 2.924509048461914, "epoch": 0.05894721577726218, "grad_norm": 0.049618203192949295, "grad_norm_var": 0.00010485617422661912, "learning_rate": 0.009961101930570952, "loss": 2.8891, "step": 1626 }, { "crossentropy": 2.8382325172424316, "epoch": 0.0589834686774942, "grad_norm": 0.05232589319348335, "grad_norm_var": 3.9197810547967474e-05, "learning_rate": 0.009961029555251022, "loss": 2.8178, "step": 1627 }, { "crossentropy": 2.6896393299102783, "epoch": 0.05901972157772622, "grad_norm": 0.05484047532081604, "grad_norm_var": 3.6762563191346645e-05, "learning_rate": 0.009960957112924858, "loss": 2.714, "step": 1628 }, { "crossentropy": 2.904934883117676, "epoch": 0.05905597447795824, "grad_norm": 0.05488830804824829, "grad_norm_var": 3.422578912566643e-05, "learning_rate": 0.009960884603593442, "loss": 2.8538, "step": 1629 }, { "crossentropy": 2.8194291591644287, "epoch": 0.059092227378190254, "grad_norm": 0.08884970843791962, "grad_norm_var": 0.00010592798166472548, "learning_rate": 0.009960812027257751, "loss": 2.8665, "step": 1630 }, { "crossentropy": 2.9219958782196045, "epoch": 0.059128480278422275, "grad_norm": 0.05363742262125015, "grad_norm_var": 0.00010648431185880467, "learning_rate": 0.009960739383918763, "loss": 2.8314, "step": 1631 }, { "crossentropy": 2.9222497940063477, "epoch": 0.05916473317865429, "grad_norm": 0.056771449744701385, "grad_norm_var": 8.449011613853348e-05, "learning_rate": 0.009960666673577463, "loss": 2.7881, "step": 1632 }, { "crossentropy": 2.781033754348755, "epoch": 0.05920098607888631, "grad_norm": 0.06415402144193649, "grad_norm_var": 8.769052037249872e-05, "learning_rate": 0.009960593896234831, "loss": 2.7389, "step": 1633 }, { "crossentropy": 2.775279998779297, "epoch": 0.05923723897911833, "grad_norm": 0.053289834409952164, "grad_norm_var": 8.736205473875228e-05, "learning_rate": 0.00996052105189185, "loss": 2.7748, "step": 1634 }, { "crossentropy": 2.698249340057373, "epoch": 0.05927349187935035, "grad_norm": 0.054258424788713455, "grad_norm_var": 8.476553505496753e-05, "learning_rate": 0.009960448140549505, "loss": 2.7448, "step": 1635 }, { "crossentropy": 2.9462125301361084, "epoch": 0.05930974477958237, "grad_norm": 0.059100598096847534, "grad_norm_var": 8.490138784723132e-05, "learning_rate": 0.009960375162208782, "loss": 2.9036, "step": 1636 }, { "crossentropy": 2.7800981998443604, "epoch": 0.05934599767981438, "grad_norm": 0.06546468287706375, "grad_norm_var": 8.782375529834355e-05, "learning_rate": 0.009960302116870661, "loss": 2.77, "step": 1637 }, { "crossentropy": 2.736316680908203, "epoch": 0.059382250580046404, "grad_norm": 0.059940923005342484, "grad_norm_var": 8.573780259735388e-05, "learning_rate": 0.009960229004536135, "loss": 2.7946, "step": 1638 }, { "crossentropy": 2.9289615154266357, "epoch": 0.059418503480278426, "grad_norm": 0.0585554763674736, "grad_norm_var": 8.574389952485754e-05, "learning_rate": 0.009960155825206188, "loss": 2.9197, "step": 1639 }, { "crossentropy": 2.769068956375122, "epoch": 0.05945475638051044, "grad_norm": 0.05783127620816231, "grad_norm_var": 8.461843867989209e-05, "learning_rate": 0.009960082578881809, "loss": 2.762, "step": 1640 }, { "crossentropy": 2.732512950897217, "epoch": 0.05949100928074246, "grad_norm": 0.05228857323527336, "grad_norm_var": 8.388387891948192e-05, "learning_rate": 0.009960009265563988, "loss": 2.8118, "step": 1641 }, { "crossentropy": 2.817357063293457, "epoch": 0.059527262180974476, "grad_norm": 0.05288931354880333, "grad_norm_var": 8.068389634408492e-05, "learning_rate": 0.009959935885253715, "loss": 2.8568, "step": 1642 }, { "crossentropy": 2.54719877243042, "epoch": 0.0595635150812065, "grad_norm": 0.053872015327215195, "grad_norm_var": 7.952074641781552e-05, "learning_rate": 0.00995986243795198, "loss": 2.6664, "step": 1643 }, { "crossentropy": 2.7513859272003174, "epoch": 0.05959976798143851, "grad_norm": 0.051944270730018616, "grad_norm_var": 8.15699664211392e-05, "learning_rate": 0.009959788923659778, "loss": 2.7759, "step": 1644 }, { "crossentropy": 2.7554008960723877, "epoch": 0.05963602088167053, "grad_norm": 0.05189180374145508, "grad_norm_var": 8.361750659084764e-05, "learning_rate": 0.009959715342378097, "loss": 2.7343, "step": 1645 }, { "crossentropy": 2.7629995346069336, "epoch": 0.059672273781902554, "grad_norm": 0.04982190579175949, "grad_norm_var": 2.0474743228571177e-05, "learning_rate": 0.009959641694107936, "loss": 2.6762, "step": 1646 }, { "crossentropy": 2.65854549407959, "epoch": 0.05970852668213457, "grad_norm": 0.06317607313394547, "grad_norm_var": 2.317947915766444e-05, "learning_rate": 0.009959567978850286, "loss": 2.6986, "step": 1647 }, { "crossentropy": 2.789534091949463, "epoch": 0.05974477958236659, "grad_norm": 0.06019242852926254, "grad_norm_var": 2.3999085646267346e-05, "learning_rate": 0.009959494196606146, "loss": 2.8017, "step": 1648 }, { "crossentropy": 2.9337148666381836, "epoch": 0.059781032482598605, "grad_norm": 0.04731852561235428, "grad_norm_var": 2.5187881658387107e-05, "learning_rate": 0.00995942034737651, "loss": 2.8466, "step": 1649 }, { "crossentropy": 2.862002372741699, "epoch": 0.059817285382830626, "grad_norm": 0.04790749400854111, "grad_norm_var": 2.8756657166769072e-05, "learning_rate": 0.009959346431162372, "loss": 2.8249, "step": 1650 }, { "crossentropy": 2.811253786087036, "epoch": 0.05985353828306265, "grad_norm": 0.04778282344341278, "grad_norm_var": 3.236604934752542e-05, "learning_rate": 0.009959272447964738, "loss": 2.7994, "step": 1651 }, { "crossentropy": 2.679260730743408, "epoch": 0.05988979118329466, "grad_norm": 0.04609405994415283, "grad_norm_var": 3.582553381214672e-05, "learning_rate": 0.009959198397784603, "loss": 2.7061, "step": 1652 }, { "crossentropy": 2.951784372329712, "epoch": 0.05992604408352668, "grad_norm": 0.04725532606244087, "grad_norm_var": 2.9164990478237433e-05, "learning_rate": 0.009959124280622969, "loss": 2.8261, "step": 1653 }, { "crossentropy": 2.9675228595733643, "epoch": 0.0599622969837587, "grad_norm": 0.04681790992617607, "grad_norm_var": 2.7866918748438673e-05, "learning_rate": 0.009959050096480834, "loss": 2.8872, "step": 1654 }, { "crossentropy": 2.801482915878296, "epoch": 0.05999854988399072, "grad_norm": 0.04847428947687149, "grad_norm_var": 2.57129518094458e-05, "learning_rate": 0.009958975845359205, "loss": 2.756, "step": 1655 }, { "crossentropy": 2.7666819095611572, "epoch": 0.06003480278422274, "grad_norm": 0.049314770847558975, "grad_norm_var": 2.3167330004057047e-05, "learning_rate": 0.009958901527259078, "loss": 2.7753, "step": 1656 }, { "crossentropy": 2.8666582107543945, "epoch": 0.060071055684454755, "grad_norm": 0.05452077463269234, "grad_norm_var": 2.3842888994748956e-05, "learning_rate": 0.009958827142181461, "loss": 2.8419, "step": 1657 }, { "crossentropy": 2.889173746109009, "epoch": 0.060107308584686776, "grad_norm": 0.05299699306488037, "grad_norm_var": 2.386780139413628e-05, "learning_rate": 0.00995875269012736, "loss": 2.8323, "step": 1658 }, { "crossentropy": 2.7365386486053467, "epoch": 0.06014356148491879, "grad_norm": 0.04947089031338692, "grad_norm_var": 2.3517092489568937e-05, "learning_rate": 0.009958678171097777, "loss": 2.8216, "step": 1659 }, { "crossentropy": 2.9469127655029297, "epoch": 0.06017981438515081, "grad_norm": 0.04856264218688011, "grad_norm_var": 2.3777315052173758e-05, "learning_rate": 0.00995860358509372, "loss": 2.9041, "step": 1660 }, { "crossentropy": 2.703817367553711, "epoch": 0.060216067285382834, "grad_norm": 0.05166219174861908, "grad_norm_var": 2.37448860717746e-05, "learning_rate": 0.009958528932116198, "loss": 2.7349, "step": 1661 }, { "crossentropy": 2.7696008682250977, "epoch": 0.06025232018561485, "grad_norm": 0.05736355483531952, "grad_norm_var": 2.6406067659007216e-05, "learning_rate": 0.009958454212166217, "loss": 2.754, "step": 1662 }, { "crossentropy": 2.8655412197113037, "epoch": 0.06028857308584687, "grad_norm": 0.04705493524670601, "grad_norm_var": 1.6868011444548005e-05, "learning_rate": 0.009958379425244787, "loss": 2.8324, "step": 1663 }, { "crossentropy": 2.907595634460449, "epoch": 0.060324825986078884, "grad_norm": 0.04874119162559509, "grad_norm_var": 9.767770310206182e-06, "learning_rate": 0.009958304571352921, "loss": 2.8593, "step": 1664 }, { "crossentropy": 2.913017749786377, "epoch": 0.060361078886310905, "grad_norm": 0.053378619253635406, "grad_norm_var": 1.0333820383028914e-05, "learning_rate": 0.009958229650491623, "loss": 2.8225, "step": 1665 }, { "crossentropy": 2.867636203765869, "epoch": 0.06039733178654293, "grad_norm": 0.06433100998401642, "grad_norm_var": 2.296594132704336e-05, "learning_rate": 0.009958154662661912, "loss": 2.901, "step": 1666 }, { "crossentropy": 2.726292610168457, "epoch": 0.06043358468677494, "grad_norm": 0.05882781744003296, "grad_norm_var": 2.6053076743920375e-05, "learning_rate": 0.009958079607864798, "loss": 2.739, "step": 1667 }, { "crossentropy": 2.774498462677002, "epoch": 0.06046983758700696, "grad_norm": 0.05238303542137146, "grad_norm_var": 2.3946547648587223e-05, "learning_rate": 0.009958004486101292, "loss": 2.8241, "step": 1668 }, { "crossentropy": 2.7639787197113037, "epoch": 0.06050609048723898, "grad_norm": 0.05026763305068016, "grad_norm_var": 2.262920473675798e-05, "learning_rate": 0.009957929297372414, "loss": 2.7772, "step": 1669 }, { "crossentropy": 2.7962825298309326, "epoch": 0.060542343387471, "grad_norm": 0.05332828685641289, "grad_norm_var": 2.0662318235957893e-05, "learning_rate": 0.009957854041679178, "loss": 2.825, "step": 1670 }, { "crossentropy": 2.779085159301758, "epoch": 0.06057859628770302, "grad_norm": 0.08478192239999771, "grad_norm_var": 8.33587147236904e-05, "learning_rate": 0.009957778719022597, "loss": 2.8068, "step": 1671 }, { "crossentropy": 2.7277474403381348, "epoch": 0.060614849187935034, "grad_norm": 0.06336750090122223, "grad_norm_var": 8.540169359904185e-05, "learning_rate": 0.009957703329403692, "loss": 2.6944, "step": 1672 }, { "crossentropy": 2.84596848487854, "epoch": 0.060651102088167055, "grad_norm": 0.06294731050729752, "grad_norm_var": 8.852600130389571e-05, "learning_rate": 0.009957627872823479, "loss": 2.8645, "step": 1673 }, { "crossentropy": 2.8675692081451416, "epoch": 0.06068735498839907, "grad_norm": 0.06092670559883118, "grad_norm_var": 8.905195258539217e-05, "learning_rate": 0.009957552349282979, "loss": 2.8566, "step": 1674 }, { "crossentropy": 2.8793883323669434, "epoch": 0.06072360788863109, "grad_norm": 0.05955531448125839, "grad_norm_var": 8.567133249131585e-05, "learning_rate": 0.00995747675878321, "loss": 2.8258, "step": 1675 }, { "crossentropy": 2.814040184020996, "epoch": 0.060759860788863106, "grad_norm": 0.05778161436319351, "grad_norm_var": 8.01910295724179e-05, "learning_rate": 0.009957401101325194, "loss": 2.7627, "step": 1676 }, { "crossentropy": 2.7630481719970703, "epoch": 0.06079611368909513, "grad_norm": 0.051095083355903625, "grad_norm_var": 8.068421013415986e-05, "learning_rate": 0.009957325376909954, "loss": 2.7682, "step": 1677 }, { "crossentropy": 2.6876602172851562, "epoch": 0.06083236658932715, "grad_norm": 0.05107767507433891, "grad_norm_var": 8.358926907531539e-05, "learning_rate": 0.009957249585538513, "loss": 2.7526, "step": 1678 }, { "crossentropy": 2.7220137119293213, "epoch": 0.06086861948955916, "grad_norm": 0.05112702399492264, "grad_norm_var": 7.895977836327444e-05, "learning_rate": 0.009957173727211891, "loss": 2.7467, "step": 1679 }, { "crossentropy": 2.803746461868286, "epoch": 0.060904872389791184, "grad_norm": 0.05238756909966469, "grad_norm_var": 7.541334666020041e-05, "learning_rate": 0.009957097801931117, "loss": 2.8211, "step": 1680 }, { "crossentropy": 2.8290207386016846, "epoch": 0.0609411252900232, "grad_norm": 0.05568999797105789, "grad_norm_var": 7.433141198364852e-05, "learning_rate": 0.009957021809697214, "loss": 2.8416, "step": 1681 }, { "crossentropy": 2.7901718616485596, "epoch": 0.06097737819025522, "grad_norm": 0.05364492908120155, "grad_norm_var": 7.26149555658824e-05, "learning_rate": 0.009956945750511207, "loss": 2.8153, "step": 1682 }, { "crossentropy": 2.847810983657837, "epoch": 0.06101363109048724, "grad_norm": 0.05529514700174332, "grad_norm_var": 7.274564579154887e-05, "learning_rate": 0.009956869624374127, "loss": 2.7713, "step": 1683 }, { "crossentropy": 2.810818910598755, "epoch": 0.061049883990719256, "grad_norm": 0.05042193830013275, "grad_norm_var": 7.42530171227583e-05, "learning_rate": 0.009956793431287, "loss": 2.7711, "step": 1684 }, { "crossentropy": 2.719700574874878, "epoch": 0.06108613689095128, "grad_norm": 0.04854493960738182, "grad_norm_var": 7.600921299207626e-05, "learning_rate": 0.009956717171250856, "loss": 2.7379, "step": 1685 }, { "crossentropy": 2.8653604984283447, "epoch": 0.06112238979118329, "grad_norm": 0.050883252173662186, "grad_norm_var": 7.757929464227333e-05, "learning_rate": 0.009956640844266725, "loss": 2.7906, "step": 1686 }, { "crossentropy": 2.8767945766448975, "epoch": 0.06115864269141531, "grad_norm": 0.05160606652498245, "grad_norm_var": 2.2793815494053365e-05, "learning_rate": 0.009956564450335634, "loss": 2.9095, "step": 1687 }, { "crossentropy": 2.9318103790283203, "epoch": 0.061194895591647334, "grad_norm": 0.04892916604876518, "grad_norm_var": 1.9275622230657587e-05, "learning_rate": 0.009956487989458623, "loss": 2.8662, "step": 1688 }, { "crossentropy": 2.7866005897521973, "epoch": 0.06123114849187935, "grad_norm": 0.05032357946038246, "grad_norm_var": 1.3956271042453855e-05, "learning_rate": 0.009956411461636717, "loss": 2.8386, "step": 1689 }, { "crossentropy": 2.726386308670044, "epoch": 0.06126740139211137, "grad_norm": 0.09559206664562225, "grad_norm_var": 0.00012532668619128398, "learning_rate": 0.009956334866870952, "loss": 2.794, "step": 1690 }, { "crossentropy": 2.701107978820801, "epoch": 0.061303654292343385, "grad_norm": 0.05083281546831131, "grad_norm_var": 0.00012507148579243884, "learning_rate": 0.009956258205162365, "loss": 2.7829, "step": 1691 }, { "crossentropy": 2.9306576251983643, "epoch": 0.061339907192575406, "grad_norm": 0.05014241114258766, "grad_norm_var": 0.00012558210659275776, "learning_rate": 0.00995618147651199, "loss": 2.84, "step": 1692 }, { "crossentropy": 2.7380151748657227, "epoch": 0.06137616009280743, "grad_norm": 0.04960675165057182, "grad_norm_var": 0.0001263415875652859, "learning_rate": 0.009956104680920862, "loss": 2.7616, "step": 1693 }, { "crossentropy": 2.7576959133148193, "epoch": 0.06141241299303944, "grad_norm": 0.05133215710520744, "grad_norm_var": 0.0001262420131728461, "learning_rate": 0.009956027818390018, "loss": 2.7682, "step": 1694 }, { "crossentropy": 2.8097221851348877, "epoch": 0.06144866589327146, "grad_norm": 0.07581955194473267, "grad_norm_var": 0.00015440518545097308, "learning_rate": 0.009955950888920498, "loss": 2.8544, "step": 1695 }, { "crossentropy": 2.968122959136963, "epoch": 0.06148491879350348, "grad_norm": 0.05570794641971588, "grad_norm_var": 0.00015363185840197303, "learning_rate": 0.009955873892513342, "loss": 2.9153, "step": 1696 }, { "crossentropy": 2.7168610095977783, "epoch": 0.0615211716937355, "grad_norm": 0.056089576333761215, "grad_norm_var": 0.00015363073987794955, "learning_rate": 0.009955796829169585, "loss": 2.7884, "step": 1697 }, { "crossentropy": 2.6718978881835938, "epoch": 0.06155742459396752, "grad_norm": 0.05428461357951164, "grad_norm_var": 0.0001534619921954402, "learning_rate": 0.009955719698890274, "loss": 2.7452, "step": 1698 }, { "crossentropy": 2.842849016189575, "epoch": 0.061593677494199535, "grad_norm": 0.06250185519456863, "grad_norm_var": 0.00015606605717137552, "learning_rate": 0.009955642501676448, "loss": 2.8208, "step": 1699 }, { "crossentropy": 2.829820394515991, "epoch": 0.061629930394431556, "grad_norm": 0.06867648661136627, "grad_norm_var": 0.00016230933137364312, "learning_rate": 0.009955565237529147, "loss": 2.8193, "step": 1700 }, { "crossentropy": 2.753232479095459, "epoch": 0.06166618329466357, "grad_norm": 0.06715915352106094, "grad_norm_var": 0.0001616039150486838, "learning_rate": 0.009955487906449418, "loss": 2.7986, "step": 1701 }, { "crossentropy": 2.73870587348938, "epoch": 0.06170243619489559, "grad_norm": 0.05378776416182518, "grad_norm_var": 0.00015909704103524622, "learning_rate": 0.009955410508438307, "loss": 2.7499, "step": 1702 }, { "crossentropy": 2.748292922973633, "epoch": 0.061738689095127613, "grad_norm": 0.06472522765398026, "grad_norm_var": 0.00015709623897998127, "learning_rate": 0.009955333043496854, "loss": 2.7421, "step": 1703 }, { "crossentropy": 2.7887749671936035, "epoch": 0.06177494199535963, "grad_norm": 0.05111086368560791, "grad_norm_var": 0.00015425490990081896, "learning_rate": 0.00995525551162611, "loss": 2.6948, "step": 1704 }, { "crossentropy": 2.678922653198242, "epoch": 0.06181119489559165, "grad_norm": 0.05095735937356949, "grad_norm_var": 0.00015347450394913564, "learning_rate": 0.00995517791282712, "loss": 2.7751, "step": 1705 }, { "crossentropy": 2.8192617893218994, "epoch": 0.061847447795823664, "grad_norm": 0.05229119211435318, "grad_norm_var": 6.45670169288709e-05, "learning_rate": 0.009955100247100931, "loss": 2.8668, "step": 1706 }, { "crossentropy": 2.751765251159668, "epoch": 0.061883700696055685, "grad_norm": 0.0504077784717083, "grad_norm_var": 6.493852921463783e-05, "learning_rate": 0.009955022514448596, "loss": 2.7572, "step": 1707 }, { "crossentropy": 2.918368101119995, "epoch": 0.0619199535962877, "grad_norm": 0.05362781882286072, "grad_norm_var": 6.243538048959404e-05, "learning_rate": 0.00995494471487116, "loss": 2.7195, "step": 1708 }, { "crossentropy": 2.786059617996216, "epoch": 0.06195620649651972, "grad_norm": 0.06163530796766281, "grad_norm_var": 5.9010860744718115e-05, "learning_rate": 0.00995486684836968, "loss": 2.8241, "step": 1709 }, { "crossentropy": 2.7322194576263428, "epoch": 0.06199245939675174, "grad_norm": 0.05622323229908943, "grad_norm_var": 5.60714437599446e-05, "learning_rate": 0.0099547889149452, "loss": 2.7744, "step": 1710 }, { "crossentropy": 2.752769947052002, "epoch": 0.06202871229698376, "grad_norm": 0.0616946704685688, "grad_norm_var": 3.580571216305594e-05, "learning_rate": 0.009954710914598779, "loss": 2.7891, "step": 1711 }, { "crossentropy": 2.754908323287964, "epoch": 0.06206496519721578, "grad_norm": 0.06639014184474945, "grad_norm_var": 4.03067230127653e-05, "learning_rate": 0.009954632847331467, "loss": 2.7829, "step": 1712 }, { "crossentropy": 2.9657537937164307, "epoch": 0.06210121809744779, "grad_norm": 0.051720574498176575, "grad_norm_var": 4.274234449077729e-05, "learning_rate": 0.009954554713144319, "loss": 2.9693, "step": 1713 }, { "crossentropy": 2.811952829360962, "epoch": 0.062137470997679814, "grad_norm": 0.05589492619037628, "grad_norm_var": 4.211750457152885e-05, "learning_rate": 0.009954476512038391, "loss": 2.8737, "step": 1714 }, { "crossentropy": 2.7833409309387207, "epoch": 0.062173723897911835, "grad_norm": 0.048255834728479385, "grad_norm_var": 4.634617733144797e-05, "learning_rate": 0.00995439824401474, "loss": 2.8042, "step": 1715 }, { "crossentropy": 2.7168357372283936, "epoch": 0.06220997679814385, "grad_norm": 0.048995863646268845, "grad_norm_var": 4.033361890620364e-05, "learning_rate": 0.009954319909074421, "loss": 2.7511, "step": 1716 }, { "crossentropy": 2.988476276397705, "epoch": 0.06224622969837587, "grad_norm": 0.053062450140714645, "grad_norm_var": 3.164729236323164e-05, "learning_rate": 0.009954241507218494, "loss": 2.8954, "step": 1717 }, { "crossentropy": 2.829782724380493, "epoch": 0.062282482598607886, "grad_norm": 0.05201921612024307, "grad_norm_var": 3.214014086446518e-05, "learning_rate": 0.009954163038448016, "loss": 2.8029, "step": 1718 }, { "crossentropy": 2.7803423404693604, "epoch": 0.06231873549883991, "grad_norm": 0.05318750441074371, "grad_norm_var": 2.5404200730411164e-05, "learning_rate": 0.00995408450276405, "loss": 2.7274, "step": 1719 }, { "crossentropy": 2.7521920204162598, "epoch": 0.06235498839907193, "grad_norm": 0.05822649970650673, "grad_norm_var": 2.5621604422332243e-05, "learning_rate": 0.009954005900167652, "loss": 2.9057, "step": 1720 }, { "crossentropy": 2.9402782917022705, "epoch": 0.06239124129930394, "grad_norm": 0.06473607569932938, "grad_norm_var": 3.0681579827617336e-05, "learning_rate": 0.00995392723065989, "loss": 2.8527, "step": 1721 }, { "crossentropy": 2.79398512840271, "epoch": 0.062427494199535964, "grad_norm": 0.0576503612101078, "grad_norm_var": 3.0167267288094823e-05, "learning_rate": 0.009953848494241819, "loss": 2.7028, "step": 1722 }, { "crossentropy": 2.8700971603393555, "epoch": 0.06246374709976798, "grad_norm": 0.05318591743707657, "grad_norm_var": 2.8630776823422058e-05, "learning_rate": 0.009953769690914509, "loss": 2.8479, "step": 1723 }, { "crossentropy": 2.757821798324585, "epoch": 0.0625, "grad_norm": 0.05459347367286682, "grad_norm_var": 2.8379554611398088e-05, "learning_rate": 0.00995369082067902, "loss": 2.7665, "step": 1724 }, { "crossentropy": 2.9303176403045654, "epoch": 0.06253625290023201, "grad_norm": 0.05235707759857178, "grad_norm_var": 2.690229412692548e-05, "learning_rate": 0.00995361188353642, "loss": 2.817, "step": 1725 }, { "crossentropy": 2.7889087200164795, "epoch": 0.06257250580046404, "grad_norm": 0.046857912093400955, "grad_norm_var": 3.1496140122152086e-05, "learning_rate": 0.009953532879487774, "loss": 2.775, "step": 1726 }, { "crossentropy": 2.7127561569213867, "epoch": 0.06260875870069606, "grad_norm": 0.049007005989551544, "grad_norm_var": 3.010803091828713e-05, "learning_rate": 0.00995345380853415, "loss": 2.8185, "step": 1727 }, { "crossentropy": 2.8849196434020996, "epoch": 0.06264501160092807, "grad_norm": 0.05060530826449394, "grad_norm_var": 1.9885354041502116e-05, "learning_rate": 0.009953374670676614, "loss": 2.9229, "step": 1728 }, { "crossentropy": 2.8625121116638184, "epoch": 0.06268126450116009, "grad_norm": 0.04884390905499458, "grad_norm_var": 2.0949763411692625e-05, "learning_rate": 0.009953295465916237, "loss": 2.8234, "step": 1729 }, { "crossentropy": 2.769411087036133, "epoch": 0.06271751740139211, "grad_norm": 0.04669143259525299, "grad_norm_var": 2.2651391121019985e-05, "learning_rate": 0.009953216194254086, "loss": 2.836, "step": 1730 }, { "crossentropy": 2.7635905742645264, "epoch": 0.06275377030162413, "grad_norm": 0.04446142539381981, "grad_norm_var": 2.564393288657559e-05, "learning_rate": 0.009953136855691236, "loss": 2.7595, "step": 1731 }, { "crossentropy": 2.825526237487793, "epoch": 0.06279002320185614, "grad_norm": 0.060362283140420914, "grad_norm_var": 2.89307728685848e-05, "learning_rate": 0.009953057450228755, "loss": 2.8514, "step": 1732 }, { "crossentropy": 2.9274802207946777, "epoch": 0.06282627610208817, "grad_norm": 0.057454634457826614, "grad_norm_var": 3.025182230107411e-05, "learning_rate": 0.009952977977867716, "loss": 2.8583, "step": 1733 }, { "crossentropy": 2.7359282970428467, "epoch": 0.06286252900232019, "grad_norm": 0.05603962019085884, "grad_norm_var": 3.066124857933056e-05, "learning_rate": 0.009952898438609192, "loss": 2.7571, "step": 1734 }, { "crossentropy": 2.6679041385650635, "epoch": 0.0628987819025522, "grad_norm": 0.04850968345999718, "grad_norm_var": 3.2155969371466574e-05, "learning_rate": 0.00995281883245426, "loss": 2.7712, "step": 1735 }, { "crossentropy": 2.6616110801696777, "epoch": 0.06293503480278423, "grad_norm": 0.05485205352306366, "grad_norm_var": 3.056061476263272e-05, "learning_rate": 0.009952739159403994, "loss": 2.7635, "step": 1736 }, { "crossentropy": 2.845876932144165, "epoch": 0.06297128770301624, "grad_norm": 0.05026246979832649, "grad_norm_var": 2.0788881178140927e-05, "learning_rate": 0.009952659419459468, "loss": 2.7624, "step": 1737 }, { "crossentropy": 2.774428367614746, "epoch": 0.06300754060324826, "grad_norm": 0.04830045625567436, "grad_norm_var": 1.9187949598769895e-05, "learning_rate": 0.009952579612621762, "loss": 2.8181, "step": 1738 }, { "crossentropy": 2.8468081951141357, "epoch": 0.06304379350348027, "grad_norm": 0.10722361505031586, "grad_norm_var": 0.00021456698599907714, "learning_rate": 0.009952499738891953, "loss": 2.8732, "step": 1739 }, { "crossentropy": 2.7539310455322266, "epoch": 0.0630800464037123, "grad_norm": 0.05171601474285126, "grad_norm_var": 0.00021515465240370217, "learning_rate": 0.009952419798271118, "loss": 2.8711, "step": 1740 }, { "crossentropy": 2.800266742706299, "epoch": 0.06311629930394431, "grad_norm": 0.05045456439256668, "grad_norm_var": 0.00021594895973826186, "learning_rate": 0.009952339790760338, "loss": 2.8038, "step": 1741 }, { "crossentropy": 2.883857250213623, "epoch": 0.06315255220417633, "grad_norm": 0.0530906580388546, "grad_norm_var": 0.00021204465339449882, "learning_rate": 0.009952259716360694, "loss": 2.7826, "step": 1742 }, { "crossentropy": 2.787116050720215, "epoch": 0.06318880510440836, "grad_norm": 0.05142649635672569, "grad_norm_var": 0.00021052003450941395, "learning_rate": 0.009952179575073269, "loss": 2.8025, "step": 1743 }, { "crossentropy": 2.9180240631103516, "epoch": 0.06322505800464037, "grad_norm": 0.05481230095028877, "grad_norm_var": 0.00020915075489366796, "learning_rate": 0.00995209936689914, "loss": 2.8252, "step": 1744 }, { "crossentropy": 2.7732393741607666, "epoch": 0.06326131090487239, "grad_norm": 0.054696835577487946, "grad_norm_var": 0.00020626808500389301, "learning_rate": 0.009952019091839397, "loss": 2.7231, "step": 1745 }, { "crossentropy": 2.7616729736328125, "epoch": 0.06329756380510441, "grad_norm": 0.053471639752388, "grad_norm_var": 0.00020104506154094797, "learning_rate": 0.00995193874989512, "loss": 2.7861, "step": 1746 }, { "crossentropy": 2.80940580368042, "epoch": 0.06333381670533643, "grad_norm": 0.055771779268980026, "grad_norm_var": 0.00019153264951924097, "learning_rate": 0.009951858341067394, "loss": 2.8384, "step": 1747 }, { "crossentropy": 2.9046332836151123, "epoch": 0.06337006960556844, "grad_norm": 0.05166763812303543, "grad_norm_var": 0.00019210203373339936, "learning_rate": 0.009951777865357307, "loss": 2.8124, "step": 1748 }, { "crossentropy": 2.8525683879852295, "epoch": 0.06340632250580046, "grad_norm": 0.049230169504880905, "grad_norm_var": 0.00019499154842223757, "learning_rate": 0.009951697322765947, "loss": 2.8425, "step": 1749 }, { "crossentropy": 2.8774049282073975, "epoch": 0.06344257540603249, "grad_norm": 0.04633321613073349, "grad_norm_var": 0.00020046677746061631, "learning_rate": 0.009951616713294401, "loss": 2.8339, "step": 1750 }, { "crossentropy": 2.9340195655822754, "epoch": 0.0634788283062645, "grad_norm": 0.04821059852838516, "grad_norm_var": 0.00020073572408358474, "learning_rate": 0.009951536036943754, "loss": 2.8274, "step": 1751 }, { "crossentropy": 2.739346981048584, "epoch": 0.06351508120649652, "grad_norm": 0.05890791118144989, "grad_norm_var": 0.0002016324499717623, "learning_rate": 0.009951455293715099, "loss": 2.7874, "step": 1752 }, { "crossentropy": 2.7946829795837402, "epoch": 0.06355133410672854, "grad_norm": 0.06594004482030869, "grad_norm_var": 0.00020636250053230332, "learning_rate": 0.009951374483609528, "loss": 2.8118, "step": 1753 }, { "crossentropy": 2.9415314197540283, "epoch": 0.06358758700696056, "grad_norm": 0.0622415766119957, "grad_norm_var": 0.00020358726076609012, "learning_rate": 0.00995129360662813, "loss": 2.8507, "step": 1754 }, { "crossentropy": 2.913971185684204, "epoch": 0.06362383990719257, "grad_norm": 0.0644824430346489, "grad_norm_var": 3.268526272754002e-05, "learning_rate": 0.009951212662771998, "loss": 2.9076, "step": 1755 }, { "crossentropy": 2.720388889312744, "epoch": 0.0636600928074246, "grad_norm": 0.05836372822523117, "grad_norm_var": 3.295450620242792e-05, "learning_rate": 0.009951131652042224, "loss": 2.7227, "step": 1756 }, { "crossentropy": 2.897993564605713, "epoch": 0.06369634570765662, "grad_norm": 0.05310815945267677, "grad_norm_var": 3.180623796468195e-05, "learning_rate": 0.009951050574439903, "loss": 2.8242, "step": 1757 }, { "crossentropy": 2.7470602989196777, "epoch": 0.06373259860788863, "grad_norm": 0.051214415580034256, "grad_norm_var": 3.253135072265046e-05, "learning_rate": 0.009950969429966134, "loss": 2.8268, "step": 1758 }, { "crossentropy": 2.7673802375793457, "epoch": 0.06376885150812064, "grad_norm": 0.04988210275769234, "grad_norm_var": 3.341471765016327e-05, "learning_rate": 0.009950888218622007, "loss": 2.7856, "step": 1759 }, { "crossentropy": 2.833186388015747, "epoch": 0.06380510440835267, "grad_norm": 0.04903369024395943, "grad_norm_var": 3.556615820202701e-05, "learning_rate": 0.00995080694040862, "loss": 2.8154, "step": 1760 }, { "crossentropy": 2.837430000305176, "epoch": 0.06384135730858469, "grad_norm": 0.05090303346514702, "grad_norm_var": 3.6383725573202866e-05, "learning_rate": 0.009950725595327075, "loss": 2.7814, "step": 1761 }, { "crossentropy": 2.965571641921997, "epoch": 0.0638776102088167, "grad_norm": 0.04890932887792587, "grad_norm_var": 3.818710221243863e-05, "learning_rate": 0.009950644183378467, "loss": 2.9401, "step": 1762 }, { "crossentropy": 2.8564765453338623, "epoch": 0.06391386310904873, "grad_norm": 0.047075189650058746, "grad_norm_var": 4.08740432736282e-05, "learning_rate": 0.009950562704563896, "loss": 2.7815, "step": 1763 }, { "crossentropy": 2.7633919715881348, "epoch": 0.06395011600928074, "grad_norm": 0.04806822910904884, "grad_norm_var": 4.2548266498094954e-05, "learning_rate": 0.009950481158884463, "loss": 2.805, "step": 1764 }, { "crossentropy": 2.765120506286621, "epoch": 0.06398636890951276, "grad_norm": 0.04866718500852585, "grad_norm_var": 4.286937178427074e-05, "learning_rate": 0.00995039954634127, "loss": 2.6947, "step": 1765 }, { "crossentropy": 2.7700068950653076, "epoch": 0.06402262180974477, "grad_norm": 0.048817120492458344, "grad_norm_var": 4.097787629713222e-05, "learning_rate": 0.009950317866935416, "loss": 2.7509, "step": 1766 }, { "crossentropy": 2.8592920303344727, "epoch": 0.0640588747099768, "grad_norm": 0.04905533045530319, "grad_norm_var": 4.0442036852402176e-05, "learning_rate": 0.00995023612066801, "loss": 2.8792, "step": 1767 }, { "crossentropy": 2.553219795227051, "epoch": 0.06409512761020882, "grad_norm": 0.050328463315963745, "grad_norm_var": 3.876109201780993e-05, "learning_rate": 0.00995015430754015, "loss": 2.6831, "step": 1768 }, { "crossentropy": 2.612241268157959, "epoch": 0.06413138051044083, "grad_norm": 0.04705767333507538, "grad_norm_var": 2.8166053351331526e-05, "learning_rate": 0.009950072427552947, "loss": 2.6884, "step": 1769 }, { "crossentropy": 2.7468714714050293, "epoch": 0.06416763341067286, "grad_norm": 0.05574467405676842, "grad_norm_var": 2.1672898046426874e-05, "learning_rate": 0.009949990480707502, "loss": 2.7898, "step": 1770 }, { "crossentropy": 2.586934804916382, "epoch": 0.06420388631090487, "grad_norm": 0.0488184429705143, "grad_norm_var": 9.464334493562248e-06, "learning_rate": 0.009949908467004926, "loss": 2.6935, "step": 1771 }, { "crossentropy": 2.7297439575195312, "epoch": 0.06424013921113689, "grad_norm": 0.049431849271059036, "grad_norm_var": 4.865623197710239e-06, "learning_rate": 0.009949826386446323, "loss": 2.713, "step": 1772 }, { "crossentropy": 2.8762545585632324, "epoch": 0.06427639211136892, "grad_norm": 0.052960075438022614, "grad_norm_var": 4.800830230296723e-06, "learning_rate": 0.009949744239032802, "loss": 2.8229, "step": 1773 }, { "crossentropy": 2.732227325439453, "epoch": 0.06431264501160093, "grad_norm": 0.051084402948617935, "grad_norm_var": 4.7764650500298215e-06, "learning_rate": 0.009949662024765474, "loss": 2.7468, "step": 1774 }, { "crossentropy": 2.8652889728546143, "epoch": 0.06434889791183294, "grad_norm": 0.05185997858643532, "grad_norm_var": 5.0584923661019705e-06, "learning_rate": 0.009949579743645449, "loss": 2.7403, "step": 1775 }, { "crossentropy": 2.8521175384521484, "epoch": 0.06438515081206496, "grad_norm": 0.05204534903168678, "grad_norm_var": 5.292192509967486e-06, "learning_rate": 0.009949497395673839, "loss": 2.7683, "step": 1776 }, { "crossentropy": 2.7269692420959473, "epoch": 0.06442140371229699, "grad_norm": 0.05211332067847252, "grad_norm_var": 5.521132088555832e-06, "learning_rate": 0.009949414980851754, "loss": 2.7591, "step": 1777 }, { "crossentropy": 2.752333641052246, "epoch": 0.064457656612529, "grad_norm": 0.05578725039958954, "grad_norm_var": 7.360807599898535e-06, "learning_rate": 0.00994933249918031, "loss": 2.7345, "step": 1778 }, { "crossentropy": 2.8395888805389404, "epoch": 0.06449390951276102, "grad_norm": 0.05440576747059822, "grad_norm_var": 7.316080892250052e-06, "learning_rate": 0.00994924995066062, "loss": 2.8259, "step": 1779 }, { "crossentropy": 2.8185038566589355, "epoch": 0.06453016241299304, "grad_norm": 0.0543074756860733, "grad_norm_var": 7.297410077077478e-06, "learning_rate": 0.009949167335293798, "loss": 2.8898, "step": 1780 }, { "crossentropy": 2.7159929275512695, "epoch": 0.06456641531322506, "grad_norm": 0.048894718289375305, "grad_norm_var": 7.2175783201249935e-06, "learning_rate": 0.00994908465308096, "loss": 2.6949, "step": 1781 }, { "crossentropy": 2.793741464614868, "epoch": 0.06460266821345707, "grad_norm": 0.05246083065867424, "grad_norm_var": 6.783061683425922e-06, "learning_rate": 0.009949001904023225, "loss": 2.7826, "step": 1782 }, { "crossentropy": 2.7266907691955566, "epoch": 0.0646389211136891, "grad_norm": 0.05180177837610245, "grad_norm_var": 6.305363851022832e-06, "learning_rate": 0.009948919088121707, "loss": 2.8356, "step": 1783 }, { "crossentropy": 2.7834606170654297, "epoch": 0.06467517401392112, "grad_norm": 0.04976921156048775, "grad_norm_var": 6.436047124401082e-06, "learning_rate": 0.009948836205377527, "loss": 2.7325, "step": 1784 }, { "crossentropy": 2.948991298675537, "epoch": 0.06471142691415313, "grad_norm": 0.05303463712334633, "grad_norm_var": 4.902318520110839e-06, "learning_rate": 0.009948753255791806, "loss": 2.9872, "step": 1785 }, { "crossentropy": 2.8457438945770264, "epoch": 0.06474767981438515, "grad_norm": 0.05764234438538551, "grad_norm_var": 6.035030820264407e-06, "learning_rate": 0.009948670239365659, "loss": 2.834, "step": 1786 }, { "crossentropy": 2.914262056350708, "epoch": 0.06478393271461717, "grad_norm": 0.050889354199171066, "grad_norm_var": 5.34834235113296e-06, "learning_rate": 0.009948587156100211, "loss": 2.896, "step": 1787 }, { "crossentropy": 2.868595600128174, "epoch": 0.06482018561484919, "grad_norm": 0.0512971356511116, "grad_norm_var": 4.8262314718547824e-06, "learning_rate": 0.009948504005996586, "loss": 2.8321, "step": 1788 }, { "crossentropy": 2.8154468536376953, "epoch": 0.0648564385150812, "grad_norm": 0.04917095974087715, "grad_norm_var": 5.502297847163363e-06, "learning_rate": 0.009948420789055903, "loss": 2.7371, "step": 1789 }, { "crossentropy": 2.8630359172821045, "epoch": 0.06489269141531323, "grad_norm": 0.05370299890637398, "grad_norm_var": 5.511580815114638e-06, "learning_rate": 0.009948337505279289, "loss": 2.828, "step": 1790 }, { "crossentropy": 2.7991132736206055, "epoch": 0.06492894431554525, "grad_norm": 0.05259546637535095, "grad_norm_var": 5.487632750790043e-06, "learning_rate": 0.009948254154667865, "loss": 2.7333, "step": 1791 }, { "crossentropy": 2.791489362716675, "epoch": 0.06496519721577726, "grad_norm": 0.04873353987932205, "grad_norm_var": 6.371653489506818e-06, "learning_rate": 0.009948170737222763, "loss": 2.7986, "step": 1792 }, { "crossentropy": 2.8461549282073975, "epoch": 0.06500145011600927, "grad_norm": 0.05723801255226135, "grad_norm_var": 7.893752338420517e-06, "learning_rate": 0.009948087252945104, "loss": 2.7507, "step": 1793 }, { "crossentropy": 2.6997110843658447, "epoch": 0.0650377030162413, "grad_norm": 0.06227550283074379, "grad_norm_var": 1.3275023388192344e-05, "learning_rate": 0.009948003701836017, "loss": 2.7061, "step": 1794 }, { "crossentropy": 2.992666006088257, "epoch": 0.06507395591647332, "grad_norm": 0.05656956136226654, "grad_norm_var": 1.3969258627292286e-05, "learning_rate": 0.009947920083896633, "loss": 2.9396, "step": 1795 }, { "crossentropy": 2.8629395961761475, "epoch": 0.06511020881670533, "grad_norm": 0.05358332023024559, "grad_norm_var": 1.3890175314855367e-05, "learning_rate": 0.009947836399128078, "loss": 2.8969, "step": 1796 }, { "crossentropy": 2.7756927013397217, "epoch": 0.06514646171693736, "grad_norm": 0.05038424953818321, "grad_norm_var": 1.3192920832696039e-05, "learning_rate": 0.009947752647531486, "loss": 2.7319, "step": 1797 }, { "crossentropy": 2.7349071502685547, "epoch": 0.06518271461716937, "grad_norm": 0.05109427869319916, "grad_norm_var": 1.3443737240334483e-05, "learning_rate": 0.009947668829107983, "loss": 2.8667, "step": 1798 }, { "crossentropy": 2.8513360023498535, "epoch": 0.06521896751740139, "grad_norm": 0.05184294655919075, "grad_norm_var": 1.3436654550840385e-05, "learning_rate": 0.009947584943858705, "loss": 2.7905, "step": 1799 }, { "crossentropy": 2.7342944145202637, "epoch": 0.06525522041763342, "grad_norm": 0.051439423114061356, "grad_norm_var": 1.2866144412575682e-05, "learning_rate": 0.009947500991784786, "loss": 2.8081, "step": 1800 }, { "crossentropy": 2.7037975788116455, "epoch": 0.06529147331786543, "grad_norm": 0.04816339164972305, "grad_norm_var": 1.4468535667610562e-05, "learning_rate": 0.009947416972887355, "loss": 2.7671, "step": 1801 }, { "crossentropy": 2.828169584274292, "epoch": 0.06532772621809745, "grad_norm": 0.046279825270175934, "grad_norm_var": 1.537411602426167e-05, "learning_rate": 0.009947332887167553, "loss": 2.8386, "step": 1802 }, { "crossentropy": 2.6546716690063477, "epoch": 0.06536397911832946, "grad_norm": 0.05134029686450958, "grad_norm_var": 1.5307796527652627e-05, "learning_rate": 0.009947248734626511, "loss": 2.7709, "step": 1803 }, { "crossentropy": 2.8577353954315186, "epoch": 0.06540023201856149, "grad_norm": 0.05385071039199829, "grad_norm_var": 1.539706677160657e-05, "learning_rate": 0.009947164515265366, "loss": 2.7444, "step": 1804 }, { "crossentropy": 2.7991206645965576, "epoch": 0.0654364849187935, "grad_norm": 0.05576664209365845, "grad_norm_var": 1.5283757870341678e-05, "learning_rate": 0.00994708022908526, "loss": 2.7254, "step": 1805 }, { "crossentropy": 2.8256499767303467, "epoch": 0.06547273781902552, "grad_norm": 0.05607886239886284, "grad_norm_var": 1.592141572231027e-05, "learning_rate": 0.009946995876087324, "loss": 2.8359, "step": 1806 }, { "crossentropy": 2.7938101291656494, "epoch": 0.06550899071925755, "grad_norm": 0.05529727414250374, "grad_norm_var": 1.6249122274506326e-05, "learning_rate": 0.009946911456272703, "loss": 2.8083, "step": 1807 }, { "crossentropy": 2.7731847763061523, "epoch": 0.06554524361948956, "grad_norm": 0.04817267879843712, "grad_norm_var": 1.6596891935109306e-05, "learning_rate": 0.009946826969642537, "loss": 2.8058, "step": 1808 }, { "crossentropy": 2.697294235229492, "epoch": 0.06558149651972157, "grad_norm": 0.05158711597323418, "grad_norm_var": 1.5464381624302227e-05, "learning_rate": 0.009946742416197963, "loss": 2.7624, "step": 1809 }, { "crossentropy": 2.6697380542755127, "epoch": 0.0656177494199536, "grad_norm": 0.08091334998607635, "grad_norm_var": 6.088882523710773e-05, "learning_rate": 0.00994665779594013, "loss": 2.7341, "step": 1810 }, { "crossentropy": 2.874109983444214, "epoch": 0.06565400232018562, "grad_norm": 0.04849274829030037, "grad_norm_var": 6.208870604076741e-05, "learning_rate": 0.009946573108870172, "loss": 2.8111, "step": 1811 }, { "crossentropy": 2.717113971710205, "epoch": 0.06569025522041763, "grad_norm": 0.053793374449014664, "grad_norm_var": 6.209679560740805e-05, "learning_rate": 0.009946488354989238, "loss": 2.7799, "step": 1812 }, { "crossentropy": 2.8391332626342773, "epoch": 0.06572650812064965, "grad_norm": 0.060148436576128006, "grad_norm_var": 6.412141827515367e-05, "learning_rate": 0.009946403534298473, "loss": 2.789, "step": 1813 }, { "crossentropy": 2.833066701889038, "epoch": 0.06576276102088167, "grad_norm": 0.06236288696527481, "grad_norm_var": 6.766743011274024e-05, "learning_rate": 0.009946318646799025, "loss": 2.8256, "step": 1814 }, { "crossentropy": 2.722160816192627, "epoch": 0.06579901392111369, "grad_norm": 0.057243768125772476, "grad_norm_var": 6.741824264943532e-05, "learning_rate": 0.009946233692492033, "loss": 2.831, "step": 1815 }, { "crossentropy": 2.6746697425842285, "epoch": 0.0658352668213457, "grad_norm": 0.052817825227975845, "grad_norm_var": 6.687191297835796e-05, "learning_rate": 0.00994614867137865, "loss": 2.6988, "step": 1816 }, { "crossentropy": 2.912391424179077, "epoch": 0.06587151972157773, "grad_norm": 0.051517557352781296, "grad_norm_var": 6.445303742957908e-05, "learning_rate": 0.009946063583460025, "loss": 2.89, "step": 1817 }, { "crossentropy": 2.823227643966675, "epoch": 0.06590777262180975, "grad_norm": 0.0677361860871315, "grad_norm_var": 6.726678123512367e-05, "learning_rate": 0.009945978428737304, "loss": 2.8378, "step": 1818 }, { "crossentropy": 2.7664520740509033, "epoch": 0.06594402552204176, "grad_norm": 0.048940256237983704, "grad_norm_var": 6.934032170664919e-05, "learning_rate": 0.009945893207211638, "loss": 2.8019, "step": 1819 }, { "crossentropy": 2.875180721282959, "epoch": 0.06598027842227379, "grad_norm": 0.0497179739177227, "grad_norm_var": 7.189241839585057e-05, "learning_rate": 0.00994580791888418, "loss": 2.7968, "step": 1820 }, { "crossentropy": 2.8064470291137695, "epoch": 0.0660165313225058, "grad_norm": 0.052953675389289856, "grad_norm_var": 7.258201515356836e-05, "learning_rate": 0.009945722563756079, "loss": 2.8202, "step": 1821 }, { "crossentropy": 2.87994647026062, "epoch": 0.06605278422273782, "grad_norm": 0.05363913252949715, "grad_norm_var": 7.296444576992128e-05, "learning_rate": 0.00994563714182849, "loss": 2.8417, "step": 1822 }, { "crossentropy": 2.8876824378967285, "epoch": 0.06608903712296983, "grad_norm": 0.05732276663184166, "grad_norm_var": 7.304231481425611e-05, "learning_rate": 0.009945551653102565, "loss": 2.8943, "step": 1823 }, { "crossentropy": 2.8757882118225098, "epoch": 0.06612529002320186, "grad_norm": 0.051603082567453384, "grad_norm_var": 7.015880764005226e-05, "learning_rate": 0.009945466097579462, "loss": 2.8468, "step": 1824 }, { "crossentropy": 2.9006435871124268, "epoch": 0.06616154292343387, "grad_norm": 0.0528857447206974, "grad_norm_var": 6.944827844890898e-05, "learning_rate": 0.009945380475260333, "loss": 2.8455, "step": 1825 }, { "crossentropy": 2.723085641860962, "epoch": 0.06619779582366589, "grad_norm": 0.050470247864723206, "grad_norm_var": 2.7791569050291248e-05, "learning_rate": 0.009945294786146336, "loss": 2.795, "step": 1826 }, { "crossentropy": 2.8841075897216797, "epoch": 0.06623404872389792, "grad_norm": 0.04733697324991226, "grad_norm_var": 2.879738230896651e-05, "learning_rate": 0.00994520903023863, "loss": 2.8575, "step": 1827 }, { "crossentropy": 2.6435248851776123, "epoch": 0.06627030162412993, "grad_norm": 0.050252921879291534, "grad_norm_var": 2.9869823481170194e-05, "learning_rate": 0.009945123207538369, "loss": 2.7144, "step": 1828 }, { "crossentropy": 2.8368470668792725, "epoch": 0.06630655452436195, "grad_norm": 0.06067387014627457, "grad_norm_var": 3.03049100698292e-05, "learning_rate": 0.009945037318046714, "loss": 2.7744, "step": 1829 }, { "crossentropy": 2.8467156887054443, "epoch": 0.06634280742459396, "grad_norm": 0.06261584162712097, "grad_norm_var": 3.058364182883684e-05, "learning_rate": 0.009944951361764827, "loss": 2.8665, "step": 1830 }, { "crossentropy": 2.6925315856933594, "epoch": 0.06637906032482599, "grad_norm": 0.058602239936590195, "grad_norm_var": 3.1244323330463045e-05, "learning_rate": 0.009944865338693867, "loss": 2.7155, "step": 1831 }, { "crossentropy": 2.96073317527771, "epoch": 0.066415313225058, "grad_norm": 0.050105445086956024, "grad_norm_var": 3.224663691180485e-05, "learning_rate": 0.009944779248834997, "loss": 2.8814, "step": 1832 }, { "crossentropy": 2.7176101207733154, "epoch": 0.06645156612529002, "grad_norm": 0.05743587017059326, "grad_norm_var": 3.235979251554402e-05, "learning_rate": 0.00994469309218938, "loss": 2.7629, "step": 1833 }, { "crossentropy": 2.7921721935272217, "epoch": 0.06648781902552205, "grad_norm": 0.05770375207066536, "grad_norm_var": 2.0969343632123474e-05, "learning_rate": 0.009944606868758178, "loss": 2.8053, "step": 1834 }, { "crossentropy": 2.6417102813720703, "epoch": 0.06652407192575406, "grad_norm": 0.051628317683935165, "grad_norm_var": 1.9646476165332178e-05, "learning_rate": 0.009944520578542558, "loss": 2.7088, "step": 1835 }, { "crossentropy": 2.7716386318206787, "epoch": 0.06656032482598608, "grad_norm": 0.050847165286540985, "grad_norm_var": 1.9072552066136336e-05, "learning_rate": 0.009944434221543681, "loss": 2.7814, "step": 1836 }, { "crossentropy": 2.75921630859375, "epoch": 0.0665965777262181, "grad_norm": 0.047593239694833755, "grad_norm_var": 2.1709060399014805e-05, "learning_rate": 0.009944347797762718, "loss": 2.8161, "step": 1837 }, { "crossentropy": 2.725820541381836, "epoch": 0.06663283062645012, "grad_norm": 0.04543311893939972, "grad_norm_var": 2.608803492510908e-05, "learning_rate": 0.009944261307200835, "loss": 2.7429, "step": 1838 }, { "crossentropy": 2.8185558319091797, "epoch": 0.06666908352668213, "grad_norm": 0.05779609829187393, "grad_norm_var": 2.6357059506076687e-05, "learning_rate": 0.0099441747498592, "loss": 2.8324, "step": 1839 }, { "crossentropy": 2.9064531326293945, "epoch": 0.06670533642691415, "grad_norm": 0.05633537471294403, "grad_norm_var": 2.6678760053888175e-05, "learning_rate": 0.009944088125738982, "loss": 2.8704, "step": 1840 }, { "crossentropy": 2.874577522277832, "epoch": 0.06674158932714618, "grad_norm": 0.05200249329209328, "grad_norm_var": 2.6812489410251752e-05, "learning_rate": 0.009944001434841351, "loss": 2.7889, "step": 1841 }, { "crossentropy": 2.8117146492004395, "epoch": 0.06677784222737819, "grad_norm": 0.04956836253404617, "grad_norm_var": 2.72339189609944e-05, "learning_rate": 0.009943914677167476, "loss": 2.7762, "step": 1842 }, { "crossentropy": 2.827619791030884, "epoch": 0.0668140951276102, "grad_norm": 0.05004426836967468, "grad_norm_var": 2.546887987693863e-05, "learning_rate": 0.009943827852718534, "loss": 2.8194, "step": 1843 }, { "crossentropy": 2.75083065032959, "epoch": 0.06685034802784223, "grad_norm": 0.047011103481054306, "grad_norm_var": 2.7600517779907454e-05, "learning_rate": 0.009943740961495692, "loss": 2.704, "step": 1844 }, { "crossentropy": 2.7665319442749023, "epoch": 0.06688660092807425, "grad_norm": 0.046499818563461304, "grad_norm_var": 2.652801759742788e-05, "learning_rate": 0.009943654003500127, "loss": 2.7553, "step": 1845 }, { "crossentropy": 2.849487066268921, "epoch": 0.06692285382830626, "grad_norm": 0.05022476986050606, "grad_norm_var": 1.953760292313176e-05, "learning_rate": 0.00994356697873301, "loss": 2.8145, "step": 1846 }, { "crossentropy": 2.739912986755371, "epoch": 0.06695910672853829, "grad_norm": 0.04507222771644592, "grad_norm_var": 1.871122254650595e-05, "learning_rate": 0.009943479887195522, "loss": 2.8423, "step": 1847 }, { "crossentropy": 2.715284585952759, "epoch": 0.0669953596287703, "grad_norm": 0.0492466576397419, "grad_norm_var": 1.8854748903058585e-05, "learning_rate": 0.009943392728888834, "loss": 2.7145, "step": 1848 }, { "crossentropy": 2.7947866916656494, "epoch": 0.06703161252900232, "grad_norm": 0.4469831883907318, "grad_norm_var": 0.009842381665655868, "learning_rate": 0.009943305503814127, "loss": 2.7399, "step": 1849 }, { "crossentropy": 2.8850460052490234, "epoch": 0.06706786542923433, "grad_norm": 0.052534595131874084, "grad_norm_var": 0.009856144485717446, "learning_rate": 0.009943218211972575, "loss": 2.7716, "step": 1850 }, { "crossentropy": 2.800236701965332, "epoch": 0.06710411832946636, "grad_norm": 0.05769893899559975, "grad_norm_var": 0.00983958999970369, "learning_rate": 0.00994313085336536, "loss": 2.8251, "step": 1851 }, { "crossentropy": 2.754197597503662, "epoch": 0.06714037122969838, "grad_norm": 0.06867548823356628, "grad_norm_var": 0.009801314914820884, "learning_rate": 0.009943043427993663, "loss": 2.7359, "step": 1852 }, { "crossentropy": 2.6994805335998535, "epoch": 0.06717662412993039, "grad_norm": 0.06558840721845627, "grad_norm_var": 0.009752388430840458, "learning_rate": 0.009942955935858663, "loss": 2.6934, "step": 1853 }, { "crossentropy": 2.6854567527770996, "epoch": 0.06721287703016242, "grad_norm": 0.055443648248910904, "grad_norm_var": 0.009715791100366612, "learning_rate": 0.00994286837696154, "loss": 2.7634, "step": 1854 }, { "crossentropy": 2.7170915603637695, "epoch": 0.06724912993039443, "grad_norm": 0.053317613899707794, "grad_norm_var": 0.009729210748260815, "learning_rate": 0.009942780751303481, "loss": 2.7744, "step": 1855 }, { "crossentropy": 2.773857593536377, "epoch": 0.06728538283062645, "grad_norm": 0.04641419276595116, "grad_norm_var": 0.009763876170320408, "learning_rate": 0.009942693058885666, "loss": 2.7268, "step": 1856 }, { "crossentropy": 2.802151679992676, "epoch": 0.06732163573085846, "grad_norm": 0.05026129633188248, "grad_norm_var": 0.009769931833650608, "learning_rate": 0.00994260529970928, "loss": 2.7845, "step": 1857 }, { "crossentropy": 2.8549020290374756, "epoch": 0.06735788863109049, "grad_norm": 0.057343918830156326, "grad_norm_var": 0.009745103568470161, "learning_rate": 0.00994251747377551, "loss": 2.839, "step": 1858 }, { "crossentropy": 2.871413230895996, "epoch": 0.0673941415313225, "grad_norm": 0.06168120726943016, "grad_norm_var": 0.009710738252869236, "learning_rate": 0.00994242958108554, "loss": 2.8974, "step": 1859 }, { "crossentropy": 2.81520676612854, "epoch": 0.06743039443155452, "grad_norm": 0.056686870753765106, "grad_norm_var": 0.009676127133300096, "learning_rate": 0.009942341621640557, "loss": 2.7825, "step": 1860 }, { "crossentropy": 2.834693431854248, "epoch": 0.06746664733178655, "grad_norm": 0.05370532348752022, "grad_norm_var": 0.009648167705259843, "learning_rate": 0.009942253595441751, "loss": 2.7576, "step": 1861 }, { "crossentropy": 2.8183794021606445, "epoch": 0.06750290023201856, "grad_norm": 0.06067708879709244, "grad_norm_var": 0.00961429438446016, "learning_rate": 0.00994216550249031, "loss": 2.8121, "step": 1862 }, { "crossentropy": 2.821840524673462, "epoch": 0.06753915313225058, "grad_norm": 0.05297967046499252, "grad_norm_var": 0.00958128943012172, "learning_rate": 0.009942077342787426, "loss": 2.8424, "step": 1863 }, { "crossentropy": 2.7504851818084717, "epoch": 0.0675754060324826, "grad_norm": 0.0527673102915287, "grad_norm_var": 0.009567356837770836, "learning_rate": 0.009941989116334284, "loss": 2.8207, "step": 1864 }, { "crossentropy": 2.8723244667053223, "epoch": 0.06761165893271462, "grad_norm": 0.08183641731739044, "grad_norm_var": 7.241044124409682e-05, "learning_rate": 0.009941900823132082, "loss": 2.8541, "step": 1865 }, { "crossentropy": 2.7666432857513428, "epoch": 0.06764791183294663, "grad_norm": 0.05189526826143265, "grad_norm_var": 7.289981089344342e-05, "learning_rate": 0.009941812463182008, "loss": 2.83, "step": 1866 }, { "crossentropy": 2.851454734802246, "epoch": 0.06768416473317865, "grad_norm": 0.04959055408835411, "grad_norm_var": 7.726499531771988e-05, "learning_rate": 0.009941724036485258, "loss": 2.8711, "step": 1867 }, { "crossentropy": 2.7515883445739746, "epoch": 0.06772041763341068, "grad_norm": 0.04764719679951668, "grad_norm_var": 7.336926781007779e-05, "learning_rate": 0.009941635543043026, "loss": 2.8155, "step": 1868 }, { "crossentropy": 2.9147942066192627, "epoch": 0.06775667053364269, "grad_norm": 0.05754590407013893, "grad_norm_var": 6.725296044804239e-05, "learning_rate": 0.009941546982856507, "loss": 2.8034, "step": 1869 }, { "crossentropy": 2.905043363571167, "epoch": 0.0677929234338747, "grad_norm": 0.05265054851770401, "grad_norm_var": 6.780327924786519e-05, "learning_rate": 0.009941458355926896, "loss": 2.8425, "step": 1870 }, { "crossentropy": 2.866396903991699, "epoch": 0.06782917633410673, "grad_norm": 0.05051201581954956, "grad_norm_var": 6.908825591423265e-05, "learning_rate": 0.009941369662255391, "loss": 2.8696, "step": 1871 }, { "crossentropy": 2.783132314682007, "epoch": 0.06786542923433875, "grad_norm": 0.05695345252752304, "grad_norm_var": 6.359701601726334e-05, "learning_rate": 0.00994128090184319, "loss": 2.8107, "step": 1872 }, { "crossentropy": 2.918153762817383, "epoch": 0.06790168213457076, "grad_norm": 0.0540611632168293, "grad_norm_var": 6.163203199871884e-05, "learning_rate": 0.009941192074691492, "loss": 2.894, "step": 1873 }, { "crossentropy": 2.786611795425415, "epoch": 0.06793793503480279, "grad_norm": 0.055847764015197754, "grad_norm_var": 6.153543488691538e-05, "learning_rate": 0.009941103180801497, "loss": 2.754, "step": 1874 }, { "crossentropy": 2.826822519302368, "epoch": 0.0679741879350348, "grad_norm": 0.0570729561150074, "grad_norm_var": 5.9411811552199416e-05, "learning_rate": 0.009941014220174405, "loss": 2.8442, "step": 1875 }, { "crossentropy": 2.7056007385253906, "epoch": 0.06801044083526682, "grad_norm": 0.04977404326200485, "grad_norm_var": 6.15597295176332e-05, "learning_rate": 0.009940925192811416, "loss": 2.7244, "step": 1876 }, { "crossentropy": 2.763902187347412, "epoch": 0.06804669373549883, "grad_norm": 0.05364535003900528, "grad_norm_var": 6.157306426512258e-05, "learning_rate": 0.009940836098713737, "loss": 2.8018, "step": 1877 }, { "crossentropy": 2.749725103378296, "epoch": 0.06808294663573086, "grad_norm": 0.058373257517814636, "grad_norm_var": 6.026567874622783e-05, "learning_rate": 0.009940746937882565, "loss": 2.8382, "step": 1878 }, { "crossentropy": 2.8325605392456055, "epoch": 0.06811919953596288, "grad_norm": 0.05887988954782486, "grad_norm_var": 6.06970584272439e-05, "learning_rate": 0.00994065771031911, "loss": 2.769, "step": 1879 }, { "crossentropy": 2.744225025177002, "epoch": 0.06815545243619489, "grad_norm": 0.04871797561645508, "grad_norm_var": 6.323282390611051e-05, "learning_rate": 0.009940568416024576, "loss": 2.8184, "step": 1880 }, { "crossentropy": 2.7002482414245605, "epoch": 0.06819170533642692, "grad_norm": 0.05688851699233055, "grad_norm_var": 1.3904657029922598e-05, "learning_rate": 0.009940479055000166, "loss": 2.7121, "step": 1881 }, { "crossentropy": 2.7891573905944824, "epoch": 0.06822795823665893, "grad_norm": 0.049630094319581985, "grad_norm_var": 1.478657171268451e-05, "learning_rate": 0.009940389627247088, "loss": 2.8506, "step": 1882 }, { "crossentropy": 2.6250529289245605, "epoch": 0.06826421113689095, "grad_norm": 0.055768825113773346, "grad_norm_var": 1.385958473323808e-05, "learning_rate": 0.009940300132766554, "loss": 2.7547, "step": 1883 }, { "crossentropy": 2.790436029434204, "epoch": 0.06830046403712298, "grad_norm": 0.05014628544449806, "grad_norm_var": 1.2133742440921061e-05, "learning_rate": 0.009940210571559766, "loss": 2.8333, "step": 1884 }, { "crossentropy": 2.7673869132995605, "epoch": 0.06833671693735499, "grad_norm": 0.06162147596478462, "grad_norm_var": 1.5014941235603578e-05, "learning_rate": 0.00994012094362794, "loss": 2.7815, "step": 1885 }, { "crossentropy": 2.917602062225342, "epoch": 0.068372969837587, "grad_norm": 0.0513615682721138, "grad_norm_var": 1.542099353243435e-05, "learning_rate": 0.009940031248972284, "loss": 2.8532, "step": 1886 }, { "crossentropy": 2.7229957580566406, "epoch": 0.06840922273781902, "grad_norm": 0.050922051072120667, "grad_norm_var": 1.5222853850196023e-05, "learning_rate": 0.009939941487594006, "loss": 2.792, "step": 1887 }, { "crossentropy": 2.7312002182006836, "epoch": 0.06844547563805105, "grad_norm": 0.05284552276134491, "grad_norm_var": 1.4853787233263388e-05, "learning_rate": 0.009939851659494325, "loss": 2.7748, "step": 1888 }, { "crossentropy": 2.7185516357421875, "epoch": 0.06848172853828306, "grad_norm": 0.053448330610990524, "grad_norm_var": 1.4880212433995843e-05, "learning_rate": 0.009939761764674449, "loss": 2.8116, "step": 1889 }, { "crossentropy": 2.7871670722961426, "epoch": 0.06851798143851508, "grad_norm": 0.048257552087306976, "grad_norm_var": 1.6670634121198023e-05, "learning_rate": 0.009939671803135595, "loss": 2.7642, "step": 1890 }, { "crossentropy": 2.876168727874756, "epoch": 0.0685542343387471, "grad_norm": 0.058957211673259735, "grad_norm_var": 1.7768927741525164e-05, "learning_rate": 0.009939581774878976, "loss": 2.7957, "step": 1891 }, { "crossentropy": 2.8236043453216553, "epoch": 0.06859048723897912, "grad_norm": 0.05323978513479233, "grad_norm_var": 1.6704361758331822e-05, "learning_rate": 0.00993949167990581, "loss": 2.8024, "step": 1892 }, { "crossentropy": 2.960049629211426, "epoch": 0.06862674013921113, "grad_norm": 0.058804843574762344, "grad_norm_var": 1.8179895646428662e-05, "learning_rate": 0.009939401518217313, "loss": 2.8805, "step": 1893 }, { "crossentropy": 2.6556146144866943, "epoch": 0.06866299303944315, "grad_norm": 0.04826712608337402, "grad_norm_var": 1.8995717416488375e-05, "learning_rate": 0.009939311289814701, "loss": 2.643, "step": 1894 }, { "crossentropy": 2.7311697006225586, "epoch": 0.06869924593967518, "grad_norm": 0.04880813881754875, "grad_norm_var": 1.8258545164008948e-05, "learning_rate": 0.009939220994699195, "loss": 2.7924, "step": 1895 }, { "crossentropy": 2.7713232040405273, "epoch": 0.06873549883990719, "grad_norm": 0.052362531423568726, "grad_norm_var": 1.7017466849142345e-05, "learning_rate": 0.009939130632872016, "loss": 2.7688, "step": 1896 }, { "crossentropy": 2.8294661045074463, "epoch": 0.0687717517401392, "grad_norm": 0.05457890033721924, "grad_norm_var": 1.621748709130973e-05, "learning_rate": 0.009939040204334379, "loss": 2.8884, "step": 1897 }, { "crossentropy": 2.8007194995880127, "epoch": 0.06880800464037123, "grad_norm": 0.05294477567076683, "grad_norm_var": 1.5386644989654613e-05, "learning_rate": 0.009938949709087512, "loss": 2.6994, "step": 1898 }, { "crossentropy": 2.7534122467041016, "epoch": 0.06884425754060325, "grad_norm": 0.045142851769924164, "grad_norm_var": 1.890459681430437e-05, "learning_rate": 0.009938859147132632, "loss": 2.7726, "step": 1899 }, { "crossentropy": 2.7377769947052, "epoch": 0.06888051044083526, "grad_norm": 0.05103204399347305, "grad_norm_var": 1.866304164123879e-05, "learning_rate": 0.009938768518470965, "loss": 2.7781, "step": 1900 }, { "crossentropy": 2.8002848625183105, "epoch": 0.06891676334106729, "grad_norm": 0.053092826157808304, "grad_norm_var": 1.3021053186025994e-05, "learning_rate": 0.009938677823103734, "loss": 2.8758, "step": 1901 }, { "crossentropy": 3.0363643169403076, "epoch": 0.0689530162412993, "grad_norm": 0.060298580676317215, "grad_norm_var": 1.7098313812346824e-05, "learning_rate": 0.009938587061032164, "loss": 2.9296, "step": 1902 }, { "crossentropy": 2.5981411933898926, "epoch": 0.06898926914153132, "grad_norm": 0.05199650675058365, "grad_norm_var": 1.6917520195404897e-05, "learning_rate": 0.009938496232257482, "loss": 2.6819, "step": 1903 }, { "crossentropy": 2.86602520942688, "epoch": 0.06902552204176333, "grad_norm": 0.0518575944006443, "grad_norm_var": 1.696657599259173e-05, "learning_rate": 0.009938405336780913, "loss": 2.7737, "step": 1904 }, { "crossentropy": 2.7478127479553223, "epoch": 0.06906177494199536, "grad_norm": 0.05239572376012802, "grad_norm_var": 1.6929830020467884e-05, "learning_rate": 0.009938314374603687, "loss": 2.8145, "step": 1905 }, { "crossentropy": 2.947749376296997, "epoch": 0.06909802784222738, "grad_norm": 0.0541929192841053, "grad_norm_var": 1.5673465870955952e-05, "learning_rate": 0.009938223345727032, "loss": 2.813, "step": 1906 }, { "crossentropy": 2.761981964111328, "epoch": 0.06913428074245939, "grad_norm": 0.04951147362589836, "grad_norm_var": 1.3744961656524575e-05, "learning_rate": 0.009938132250152174, "loss": 2.8039, "step": 1907 }, { "crossentropy": 2.7097442150115967, "epoch": 0.06917053364269142, "grad_norm": 0.05544750764966011, "grad_norm_var": 1.429446120665371e-05, "learning_rate": 0.009938041087880347, "loss": 2.7266, "step": 1908 }, { "crossentropy": 2.7407352924346924, "epoch": 0.06920678654292343, "grad_norm": 0.047772638499736786, "grad_norm_var": 1.2694642981181392e-05, "learning_rate": 0.009937949858912781, "loss": 2.779, "step": 1909 }, { "crossentropy": 2.830962896347046, "epoch": 0.06924303944315545, "grad_norm": 0.04827186092734337, "grad_norm_var": 1.2692378439060845e-05, "learning_rate": 0.00993785856325071, "loss": 2.757, "step": 1910 }, { "crossentropy": 2.6742565631866455, "epoch": 0.06927929234338748, "grad_norm": 0.04788253828883171, "grad_norm_var": 1.312215525602339e-05, "learning_rate": 0.009937767200895365, "loss": 2.775, "step": 1911 }, { "crossentropy": 2.75799822807312, "epoch": 0.06931554524361949, "grad_norm": 0.049865204840898514, "grad_norm_var": 1.3324245578654378e-05, "learning_rate": 0.00993767577184798, "loss": 2.7426, "step": 1912 }, { "crossentropy": 2.9053916931152344, "epoch": 0.0693517981438515, "grad_norm": 0.049477770924568176, "grad_norm_var": 1.2953564011003448e-05, "learning_rate": 0.00993758427610979, "loss": 2.8399, "step": 1913 }, { "crossentropy": 2.7601754665374756, "epoch": 0.06938805104408352, "grad_norm": 0.04808203876018524, "grad_norm_var": 1.338055013240935e-05, "learning_rate": 0.009937492713682033, "loss": 2.8509, "step": 1914 }, { "crossentropy": 2.7667219638824463, "epoch": 0.06942430394431555, "grad_norm": 0.05329331010580063, "grad_norm_var": 1.1145557632777764e-05, "learning_rate": 0.009937401084565943, "loss": 2.732, "step": 1915 }, { "crossentropy": 2.8546884059906006, "epoch": 0.06946055684454756, "grad_norm": 0.054566144943237305, "grad_norm_var": 1.1691809550060511e-05, "learning_rate": 0.009937309388762758, "loss": 2.8025, "step": 1916 }, { "crossentropy": 2.8358895778656006, "epoch": 0.06949680974477958, "grad_norm": 0.046538349241018295, "grad_norm_var": 1.3203599538767706e-05, "learning_rate": 0.009937217626273719, "loss": 2.8408, "step": 1917 }, { "crossentropy": 2.9226393699645996, "epoch": 0.0695330626450116, "grad_norm": 0.047576483339071274, "grad_norm_var": 8.12415395248228e-06, "learning_rate": 0.009937125797100061, "loss": 2.8485, "step": 1918 }, { "crossentropy": 2.879143238067627, "epoch": 0.06956931554524362, "grad_norm": 0.050735317170619965, "grad_norm_var": 7.979567807266975e-06, "learning_rate": 0.00993703390124303, "loss": 2.8291, "step": 1919 }, { "crossentropy": 2.7276601791381836, "epoch": 0.06960556844547564, "grad_norm": 0.051888011395931244, "grad_norm_var": 7.9852666245106e-06, "learning_rate": 0.00993694193870386, "loss": 2.7198, "step": 1920 }, { "crossentropy": 2.84263014793396, "epoch": 0.06964182134570766, "grad_norm": 0.05904214456677437, "grad_norm_var": 1.2454012215422766e-05, "learning_rate": 0.0099368499094838, "loss": 2.8626, "step": 1921 }, { "crossentropy": 2.8762125968933105, "epoch": 0.06967807424593968, "grad_norm": 0.0588693767786026, "grad_norm_var": 1.5884054514577147e-05, "learning_rate": 0.00993675781358409, "loss": 2.7891, "step": 1922 }, { "crossentropy": 2.9530723094940186, "epoch": 0.06971432714617169, "grad_norm": 0.05781828239560127, "grad_norm_var": 1.8352870804842307e-05, "learning_rate": 0.009936665651005972, "loss": 2.9055, "step": 1923 }, { "crossentropy": 2.9087183475494385, "epoch": 0.0697505800464037, "grad_norm": 0.056244827806949615, "grad_norm_var": 1.8791483558367312e-05, "learning_rate": 0.009936573421750694, "loss": 2.8518, "step": 1924 }, { "crossentropy": 2.7025063037872314, "epoch": 0.06978683294663574, "grad_norm": 0.05193911865353584, "grad_norm_var": 1.76695378505685e-05, "learning_rate": 0.0099364811258195, "loss": 2.7705, "step": 1925 }, { "crossentropy": 2.622291326522827, "epoch": 0.06982308584686775, "grad_norm": 0.04984423890709877, "grad_norm_var": 1.7041265788982306e-05, "learning_rate": 0.009936388763213637, "loss": 2.7279, "step": 1926 }, { "crossentropy": 2.8253607749938965, "epoch": 0.06985933874709976, "grad_norm": 0.04857255890965462, "grad_norm_var": 1.6682642574420794e-05, "learning_rate": 0.009936296333934353, "loss": 2.7508, "step": 1927 }, { "crossentropy": 2.5908830165863037, "epoch": 0.06989559164733179, "grad_norm": 0.04582679644227028, "grad_norm_var": 1.893062140724544e-05, "learning_rate": 0.009936203837982896, "loss": 2.6687, "step": 1928 }, { "crossentropy": 2.843251943588257, "epoch": 0.0699318445475638, "grad_norm": 0.04879101365804672, "grad_norm_var": 1.9181408650457282e-05, "learning_rate": 0.009936111275360515, "loss": 2.8583, "step": 1929 }, { "crossentropy": 2.79872465133667, "epoch": 0.06996809744779582, "grad_norm": 0.05548166111111641, "grad_norm_var": 1.888429992154359e-05, "learning_rate": 0.009936018646068459, "loss": 2.7889, "step": 1930 }, { "crossentropy": 2.774129629135132, "epoch": 0.07000435034802784, "grad_norm": 0.054747626185417175, "grad_norm_var": 1.9206342424285018e-05, "learning_rate": 0.009935925950107982, "loss": 2.7041, "step": 1931 }, { "crossentropy": 2.7644989490509033, "epoch": 0.07004060324825986, "grad_norm": 0.054492779076099396, "grad_norm_var": 1.918553945710142e-05, "learning_rate": 0.009935833187480335, "loss": 2.6969, "step": 1932 }, { "crossentropy": 2.8098537921905518, "epoch": 0.07007685614849188, "grad_norm": 0.054229941219091415, "grad_norm_var": 1.687113571675978e-05, "learning_rate": 0.009935740358186769, "loss": 2.8194, "step": 1933 }, { "crossentropy": 2.9611728191375732, "epoch": 0.07011310904872389, "grad_norm": 0.06015348061919212, "grad_norm_var": 1.786168277215962e-05, "learning_rate": 0.009935647462228539, "loss": 2.8741, "step": 1934 }, { "crossentropy": 2.771028757095337, "epoch": 0.07014936194895592, "grad_norm": 0.06154982000589371, "grad_norm_var": 2.0943516076754078e-05, "learning_rate": 0.009935554499606902, "loss": 2.7871, "step": 1935 }, { "crossentropy": 2.6928176879882812, "epoch": 0.07018561484918794, "grad_norm": 0.058309346437454224, "grad_norm_var": 2.1418508512786814e-05, "learning_rate": 0.00993546147032311, "loss": 2.7945, "step": 1936 }, { "crossentropy": 2.7205400466918945, "epoch": 0.07022186774941995, "grad_norm": 0.05033339560031891, "grad_norm_var": 2.116844527382163e-05, "learning_rate": 0.009935368374378422, "loss": 2.8266, "step": 1937 }, { "crossentropy": 2.7767746448516846, "epoch": 0.07025812064965198, "grad_norm": 0.0501214936375618, "grad_norm_var": 2.0505308996639447e-05, "learning_rate": 0.009935275211774093, "loss": 2.7592, "step": 1938 }, { "crossentropy": 2.747687339782715, "epoch": 0.07029437354988399, "grad_norm": 0.04560794681310654, "grad_norm_var": 2.304316407977184e-05, "learning_rate": 0.009935181982511383, "loss": 2.7778, "step": 1939 }, { "crossentropy": 2.6831092834472656, "epoch": 0.07033062645011601, "grad_norm": 0.0502648651599884, "grad_norm_var": 2.2603563073534705e-05, "learning_rate": 0.009935088686591551, "loss": 2.7292, "step": 1940 }, { "crossentropy": 2.885814666748047, "epoch": 0.07036687935034802, "grad_norm": 0.05636049807071686, "grad_norm_var": 2.3484897326644014e-05, "learning_rate": 0.009934995324015858, "loss": 2.8838, "step": 1941 }, { "crossentropy": 2.8050897121429443, "epoch": 0.07040313225058005, "grad_norm": 0.07015823572874069, "grad_norm_var": 4.128932614276774e-05, "learning_rate": 0.009934901894785565, "loss": 2.8163, "step": 1942 }, { "crossentropy": 2.8145480155944824, "epoch": 0.07043938515081206, "grad_norm": 0.05239808186888695, "grad_norm_var": 3.940369100250505e-05, "learning_rate": 0.00993480839890193, "loss": 2.8259, "step": 1943 }, { "crossentropy": 2.820460796356201, "epoch": 0.07047563805104408, "grad_norm": 0.0495808981359005, "grad_norm_var": 3.604244116436842e-05, "learning_rate": 0.00993471483636622, "loss": 2.8402, "step": 1944 }, { "crossentropy": 2.635537624359131, "epoch": 0.07051189095127611, "grad_norm": 0.050998542457818985, "grad_norm_var": 3.465595857316169e-05, "learning_rate": 0.009934621207179696, "loss": 2.668, "step": 1945 }, { "crossentropy": 2.842597484588623, "epoch": 0.07054814385150812, "grad_norm": 0.05079182982444763, "grad_norm_var": 3.552575692118276e-05, "learning_rate": 0.009934527511343627, "loss": 2.7885, "step": 1946 }, { "crossentropy": 2.8392350673675537, "epoch": 0.07058439675174014, "grad_norm": 0.051067981868982315, "grad_norm_var": 3.619220472426317e-05, "learning_rate": 0.009934433748859274, "loss": 2.7453, "step": 1947 }, { "crossentropy": 2.7101101875305176, "epoch": 0.07062064965197216, "grad_norm": 0.05021768435835838, "grad_norm_var": 3.71397752535515e-05, "learning_rate": 0.009934339919727905, "loss": 2.6893, "step": 1948 }, { "crossentropy": 2.7239606380462646, "epoch": 0.07065690255220418, "grad_norm": 0.051970988512039185, "grad_norm_var": 3.7354509935055616e-05, "learning_rate": 0.009934246023950788, "loss": 2.6885, "step": 1949 }, { "crossentropy": 2.816451072692871, "epoch": 0.0706931554524362, "grad_norm": 0.054262425750494, "grad_norm_var": 3.448813425773722e-05, "learning_rate": 0.00993415206152919, "loss": 2.828, "step": 1950 }, { "crossentropy": 2.890608549118042, "epoch": 0.07072940835266821, "grad_norm": 0.06910550594329834, "grad_norm_var": 4.6292050602019424e-05, "learning_rate": 0.00993405803246438, "loss": 2.8053, "step": 1951 }, { "crossentropy": 2.80033802986145, "epoch": 0.07076566125290024, "grad_norm": 0.0812276229262352, "grad_norm_var": 9.275635360215228e-05, "learning_rate": 0.009933963936757629, "loss": 2.8435, "step": 1952 }, { "crossentropy": 2.742828607559204, "epoch": 0.07080191415313225, "grad_norm": 0.06247442588210106, "grad_norm_var": 9.396277338789693e-05, "learning_rate": 0.009933869774410208, "loss": 2.7836, "step": 1953 }, { "crossentropy": 2.79049015045166, "epoch": 0.07083816705336426, "grad_norm": 0.04753601923584938, "grad_norm_var": 9.642018464790051e-05, "learning_rate": 0.009933775545423388, "loss": 2.7735, "step": 1954 }, { "crossentropy": 2.93510365486145, "epoch": 0.0708744199535963, "grad_norm": 0.04789373651146889, "grad_norm_var": 9.361717825146678e-05, "learning_rate": 0.00993368124979844, "loss": 2.8736, "step": 1955 }, { "crossentropy": 2.81553316116333, "epoch": 0.07091067285382831, "grad_norm": 0.04787131026387215, "grad_norm_var": 9.581173222352099e-05, "learning_rate": 0.00993358688753664, "loss": 2.7736, "step": 1956 }, { "crossentropy": 2.759462833404541, "epoch": 0.07094692575406032, "grad_norm": 0.04569721966981888, "grad_norm_var": 0.00010222057636025105, "learning_rate": 0.009933492458639266, "loss": 2.7418, "step": 1957 }, { "crossentropy": 2.767953634262085, "epoch": 0.07098317865429234, "grad_norm": 0.04962427541613579, "grad_norm_var": 8.762870603978284e-05, "learning_rate": 0.009933397963107585, "loss": 2.8016, "step": 1958 }, { "crossentropy": 2.792165994644165, "epoch": 0.07101943155452436, "grad_norm": 0.05227125808596611, "grad_norm_var": 8.765544516149548e-05, "learning_rate": 0.00993330340094288, "loss": 2.7592, "step": 1959 }, { "crossentropy": 2.967656135559082, "epoch": 0.07105568445475638, "grad_norm": 0.05589893460273743, "grad_norm_var": 8.650176742917848e-05, "learning_rate": 0.009933208772146426, "loss": 2.7887, "step": 1960 }, { "crossentropy": 2.8370347023010254, "epoch": 0.0710919373549884, "grad_norm": 0.04902854934334755, "grad_norm_var": 8.76133037089305e-05, "learning_rate": 0.0099331140767195, "loss": 2.7657, "step": 1961 }, { "crossentropy": 2.7090604305267334, "epoch": 0.07112819025522042, "grad_norm": 0.0529395267367363, "grad_norm_var": 8.693028661930756e-05, "learning_rate": 0.009933019314663383, "loss": 2.7623, "step": 1962 }, { "crossentropy": 2.627962112426758, "epoch": 0.07116444315545244, "grad_norm": 0.0487947054207325, "grad_norm_var": 8.823835505593786e-05, "learning_rate": 0.009932924485979355, "loss": 2.6557, "step": 1963 }, { "crossentropy": 2.852266788482666, "epoch": 0.07120069605568445, "grad_norm": 0.047594159841537476, "grad_norm_var": 9.005312726159854e-05, "learning_rate": 0.009932829590668695, "loss": 2.7677, "step": 1964 }, { "crossentropy": 2.826446294784546, "epoch": 0.07123694895591648, "grad_norm": 0.047291964292526245, "grad_norm_var": 9.269473002653418e-05, "learning_rate": 0.009932734628732684, "loss": 2.7964, "step": 1965 }, { "crossentropy": 2.9384102821350098, "epoch": 0.0712732018561485, "grad_norm": 0.050489600747823715, "grad_norm_var": 9.331124155745786e-05, "learning_rate": 0.009932639600172608, "loss": 2.866, "step": 1966 }, { "crossentropy": 2.7717909812927246, "epoch": 0.07130945475638051, "grad_norm": 0.05077775940299034, "grad_norm_var": 7.613032609418863e-05, "learning_rate": 0.00993254450498975, "loss": 2.7871, "step": 1967 }, { "crossentropy": 2.7668964862823486, "epoch": 0.07134570765661252, "grad_norm": 0.050466038286685944, "grad_norm_var": 1.678122831208657e-05, "learning_rate": 0.009932449343185391, "loss": 2.7392, "step": 1968 }, { "crossentropy": 2.8433046340942383, "epoch": 0.07138196055684455, "grad_norm": 0.05285113304853439, "grad_norm_var": 7.096455939788598e-06, "learning_rate": 0.009932354114760818, "loss": 2.8194, "step": 1969 }, { "crossentropy": 2.754732847213745, "epoch": 0.07141821345707657, "grad_norm": 0.05412512645125389, "grad_norm_var": 7.808541908682574e-06, "learning_rate": 0.009932258819717318, "loss": 2.7569, "step": 1970 }, { "crossentropy": 2.7906601428985596, "epoch": 0.07145446635730858, "grad_norm": 0.04929640516638756, "grad_norm_var": 7.495331853867894e-06, "learning_rate": 0.009932163458056177, "loss": 2.7655, "step": 1971 }, { "crossentropy": 2.7103652954101562, "epoch": 0.07149071925754061, "grad_norm": 0.05827387422323227, "grad_norm_var": 1.0871156925072039e-05, "learning_rate": 0.009932068029778685, "loss": 2.6589, "step": 1972 }, { "crossentropy": 2.8035995960235596, "epoch": 0.07152697215777262, "grad_norm": 0.04703420028090477, "grad_norm_var": 1.0044037642134053e-05, "learning_rate": 0.00993197253488613, "loss": 2.8221, "step": 1973 }, { "crossentropy": 2.885878086090088, "epoch": 0.07156322505800464, "grad_norm": 0.049490198493003845, "grad_norm_var": 1.0070601276793156e-05, "learning_rate": 0.009931876973379798, "loss": 2.8921, "step": 1974 }, { "crossentropy": 2.660935163497925, "epoch": 0.07159947795823667, "grad_norm": 0.05089378356933594, "grad_norm_var": 9.962863989147232e-06, "learning_rate": 0.009931781345260985, "loss": 2.7262, "step": 1975 }, { "crossentropy": 2.848978281021118, "epoch": 0.07163573085846868, "grad_norm": 0.05442788824439049, "grad_norm_var": 9.127994355540014e-06, "learning_rate": 0.009931685650530982, "loss": 2.8511, "step": 1976 }, { "crossentropy": 2.803598642349243, "epoch": 0.0716719837587007, "grad_norm": 0.05387163534760475, "grad_norm_var": 9.41071034761265e-06, "learning_rate": 0.00993158988919108, "loss": 2.8216, "step": 1977 }, { "crossentropy": 2.8263232707977295, "epoch": 0.07170823665893271, "grad_norm": 0.04674188420176506, "grad_norm_var": 1.0343863084313378e-05, "learning_rate": 0.009931494061242573, "loss": 2.7965, "step": 1978 }, { "crossentropy": 2.9269449710845947, "epoch": 0.07174448955916474, "grad_norm": 0.05132643133401871, "grad_norm_var": 1.007556077366259e-05, "learning_rate": 0.009931398166686752, "loss": 2.902, "step": 1979 }, { "crossentropy": 2.884197235107422, "epoch": 0.07178074245939675, "grad_norm": 0.058702558279037476, "grad_norm_var": 1.284039141744182e-05, "learning_rate": 0.009931302205524918, "loss": 2.8442, "step": 1980 }, { "crossentropy": 2.782546281814575, "epoch": 0.07181699535962877, "grad_norm": 0.061444006860256195, "grad_norm_var": 1.7174603824950583e-05, "learning_rate": 0.009931206177758363, "loss": 2.7717, "step": 1981 }, { "crossentropy": 2.828416109085083, "epoch": 0.0718532482598608, "grad_norm": 0.05300399661064148, "grad_norm_var": 1.68912954654265e-05, "learning_rate": 0.009931110083388385, "loss": 2.7779, "step": 1982 }, { "crossentropy": 2.906168222427368, "epoch": 0.07188950116009281, "grad_norm": 0.04551725089550018, "grad_norm_var": 1.994837787365917e-05, "learning_rate": 0.009931013922416283, "loss": 2.8326, "step": 1983 }, { "crossentropy": 2.801685094833374, "epoch": 0.07192575406032482, "grad_norm": 0.04732893407344818, "grad_norm_var": 2.1347999017767848e-05, "learning_rate": 0.009930917694843354, "loss": 2.7839, "step": 1984 }, { "crossentropy": 2.7281153202056885, "epoch": 0.07196200696055685, "grad_norm": 0.04925956204533577, "grad_norm_var": 2.181633870413918e-05, "learning_rate": 0.009930821400670899, "loss": 2.8202, "step": 1985 }, { "crossentropy": 2.7601728439331055, "epoch": 0.07199825986078887, "grad_norm": 0.05034736171364784, "grad_norm_var": 2.1598139684356142e-05, "learning_rate": 0.009930725039900218, "loss": 2.7411, "step": 1986 }, { "crossentropy": 2.90266489982605, "epoch": 0.07203451276102088, "grad_norm": 0.04785352945327759, "grad_norm_var": 2.2187783548132992e-05, "learning_rate": 0.009930628612532614, "loss": 2.8918, "step": 1987 }, { "crossentropy": 2.951864719390869, "epoch": 0.0720707656612529, "grad_norm": 0.06710554659366608, "grad_norm_var": 3.4927650131564665e-05, "learning_rate": 0.009930532118569386, "loss": 2.8618, "step": 1988 }, { "crossentropy": 2.6958136558532715, "epoch": 0.07210701856148492, "grad_norm": 0.05701984092593193, "grad_norm_var": 3.4352705347074726e-05, "learning_rate": 0.00993043555801184, "loss": 2.6966, "step": 1989 }, { "crossentropy": 2.8822264671325684, "epoch": 0.07214327146171694, "grad_norm": 0.05782020837068558, "grad_norm_var": 3.5045751154529305e-05, "learning_rate": 0.00993033893086128, "loss": 2.8575, "step": 1990 }, { "crossentropy": 2.8542675971984863, "epoch": 0.07217952436194895, "grad_norm": 0.05871773511171341, "grad_norm_var": 3.637033018709749e-05, "learning_rate": 0.00993024223711901, "loss": 2.7671, "step": 1991 }, { "crossentropy": 2.640528678894043, "epoch": 0.07221577726218098, "grad_norm": 0.05549455061554909, "grad_norm_var": 3.653351006003442e-05, "learning_rate": 0.009930145476786337, "loss": 2.7452, "step": 1992 }, { "crossentropy": 2.729565382003784, "epoch": 0.072252030162413, "grad_norm": 0.0526210255920887, "grad_norm_var": 3.662718531667492e-05, "learning_rate": 0.009930048649864569, "loss": 2.853, "step": 1993 }, { "crossentropy": 2.771787643432617, "epoch": 0.07228828306264501, "grad_norm": 0.04497451335191727, "grad_norm_var": 3.847835247382334e-05, "learning_rate": 0.009929951756355011, "loss": 2.7483, "step": 1994 }, { "crossentropy": 2.661485195159912, "epoch": 0.07232453596287702, "grad_norm": 0.04773047938942909, "grad_norm_var": 4.040469763962699e-05, "learning_rate": 0.009929854796258972, "loss": 2.7283, "step": 1995 }, { "crossentropy": 2.7230310440063477, "epoch": 0.07236078886310905, "grad_norm": 0.051764100790023804, "grad_norm_var": 3.853932795418458e-05, "learning_rate": 0.009929757769577765, "loss": 2.7695, "step": 1996 }, { "crossentropy": 2.7123324871063232, "epoch": 0.07239704176334107, "grad_norm": 0.05613846704363823, "grad_norm_var": 3.432540654728962e-05, "learning_rate": 0.009929660676312697, "loss": 2.6999, "step": 1997 }, { "crossentropy": 2.9381392002105713, "epoch": 0.07243329466357308, "grad_norm": 0.05792966112494469, "grad_norm_var": 3.606208613795107e-05, "learning_rate": 0.009929563516465081, "loss": 2.8679, "step": 1998 }, { "crossentropy": 2.8228015899658203, "epoch": 0.07246954756380511, "grad_norm": 0.054756853729486465, "grad_norm_var": 3.2208422462279224e-05, "learning_rate": 0.009929466290036229, "loss": 2.8004, "step": 1999 }, { "crossentropy": 2.8637232780456543, "epoch": 0.07250580046403712, "grad_norm": 0.05266121029853821, "grad_norm_var": 2.9559731865862136e-05, "learning_rate": 0.009929368997027453, "loss": 2.8077, "step": 2000 }, { "crossentropy": 2.9510908126831055, "epoch": 0.07254205336426914, "grad_norm": 0.05368742719292641, "grad_norm_var": 2.805305242062067e-05, "learning_rate": 0.009929271637440068, "loss": 2.8444, "step": 2001 }, { "crossentropy": 2.7672033309936523, "epoch": 0.07257830626450117, "grad_norm": 0.04714066907763481, "grad_norm_var": 3.032753061101423e-05, "learning_rate": 0.009929174211275388, "loss": 2.657, "step": 2002 }, { "crossentropy": 2.770689010620117, "epoch": 0.07261455916473318, "grad_norm": 0.048270758241415024, "grad_norm_var": 2.9998510485295995e-05, "learning_rate": 0.009929076718534734, "loss": 2.8, "step": 2003 }, { "crossentropy": 2.7895615100860596, "epoch": 0.0726508120649652, "grad_norm": 0.04885687306523323, "grad_norm_var": 1.889865648384692e-05, "learning_rate": 0.009928979159219416, "loss": 2.8627, "step": 2004 }, { "crossentropy": 2.961071252822876, "epoch": 0.07268706496519721, "grad_norm": 0.05421260744333267, "grad_norm_var": 1.783006364727812e-05, "learning_rate": 0.009928881533330755, "loss": 2.8083, "step": 2005 }, { "crossentropy": 2.881855010986328, "epoch": 0.07272331786542924, "grad_norm": 0.053952984511852264, "grad_norm_var": 1.6111017719357366e-05, "learning_rate": 0.009928783840870068, "loss": 2.8318, "step": 2006 }, { "crossentropy": 2.755237340927124, "epoch": 0.07275957076566125, "grad_norm": 0.052251674234867096, "grad_norm_var": 1.3304834125537946e-05, "learning_rate": 0.009928686081838677, "loss": 2.7927, "step": 2007 }, { "crossentropy": 2.8540802001953125, "epoch": 0.07279582366589327, "grad_norm": 0.046781010925769806, "grad_norm_var": 1.4022437216698544e-05, "learning_rate": 0.009928588256237901, "loss": 2.8031, "step": 2008 }, { "crossentropy": 2.9194045066833496, "epoch": 0.0728320765661253, "grad_norm": 0.04826320707798004, "grad_norm_var": 1.4548191511392962e-05, "learning_rate": 0.009928490364069061, "loss": 2.8769, "step": 2009 }, { "crossentropy": 2.7998785972595215, "epoch": 0.07286832946635731, "grad_norm": 0.05287284776568413, "grad_norm_var": 1.1879688169269485e-05, "learning_rate": 0.009928392405333478, "loss": 2.889, "step": 2010 }, { "crossentropy": 2.793210029602051, "epoch": 0.07290458236658932, "grad_norm": 0.05292699486017227, "grad_norm_var": 1.081399995613482e-05, "learning_rate": 0.009928294380032477, "loss": 2.7703, "step": 2011 }, { "crossentropy": 2.7523796558380127, "epoch": 0.07294083526682135, "grad_norm": 0.051737017929553986, "grad_norm_var": 1.081500311796967e-05, "learning_rate": 0.00992819628816738, "loss": 2.8762, "step": 2012 }, { "crossentropy": 2.859361410140991, "epoch": 0.07297708816705337, "grad_norm": 0.05125017836689949, "grad_norm_var": 9.629061793558122e-06, "learning_rate": 0.009928098129739518, "loss": 2.9001, "step": 2013 }, { "crossentropy": 2.7260689735412598, "epoch": 0.07301334106728538, "grad_norm": 0.049711138010025024, "grad_norm_var": 7.048193583571114e-06, "learning_rate": 0.009927999904750209, "loss": 2.6929, "step": 2014 }, { "crossentropy": 2.834216356277466, "epoch": 0.0730495939675174, "grad_norm": 0.04902620613574982, "grad_norm_var": 6.38934341809135e-06, "learning_rate": 0.009927901613200783, "loss": 2.805, "step": 2015 }, { "crossentropy": 2.714998245239258, "epoch": 0.07308584686774942, "grad_norm": 0.04869122430682182, "grad_norm_var": 6.415754917702259e-06, "learning_rate": 0.009927803255092568, "loss": 2.7554, "step": 2016 }, { "crossentropy": 2.7419958114624023, "epoch": 0.07312209976798144, "grad_norm": 0.04894194006919861, "grad_norm_var": 5.871017978649705e-06, "learning_rate": 0.00992770483042689, "loss": 2.7913, "step": 2017 }, { "crossentropy": 2.691923141479492, "epoch": 0.07315835266821345, "grad_norm": 0.05483722686767578, "grad_norm_var": 6.32559955623755e-06, "learning_rate": 0.009927606339205083, "loss": 2.6587, "step": 2018 }, { "crossentropy": 2.9149582386016846, "epoch": 0.07319460556844548, "grad_norm": 0.049059174954891205, "grad_norm_var": 6.099989957415962e-06, "learning_rate": 0.009927507781428473, "loss": 2.8651, "step": 2019 }, { "crossentropy": 2.8811686038970947, "epoch": 0.0732308584686775, "grad_norm": 0.05489003658294678, "grad_norm_var": 6.7830641728679085e-06, "learning_rate": 0.009927409157098391, "loss": 2.8306, "step": 2020 }, { "crossentropy": 2.7396047115325928, "epoch": 0.07326711136890951, "grad_norm": 0.06128318980336189, "grad_norm_var": 1.2735647037071899e-05, "learning_rate": 0.009927310466216173, "loss": 2.8627, "step": 2021 }, { "crossentropy": 2.8082971572875977, "epoch": 0.07330336426914154, "grad_norm": 0.0651969313621521, "grad_norm_var": 2.4082785318603703e-05, "learning_rate": 0.009927211708783149, "loss": 2.7941, "step": 2022 }, { "crossentropy": 2.77307391166687, "epoch": 0.07333961716937355, "grad_norm": 0.051890380680561066, "grad_norm_var": 2.4096041519790455e-05, "learning_rate": 0.009927112884800653, "loss": 2.7644, "step": 2023 }, { "crossentropy": 2.5904645919799805, "epoch": 0.07337587006960557, "grad_norm": 0.047589436173439026, "grad_norm_var": 2.3538232534932777e-05, "learning_rate": 0.00992701399427002, "loss": 2.6653, "step": 2024 }, { "crossentropy": 2.704700231552124, "epoch": 0.07341212296983758, "grad_norm": 0.0473657101392746, "grad_norm_var": 2.4081869202744836e-05, "learning_rate": 0.009926915037192587, "loss": 2.7827, "step": 2025 }, { "crossentropy": 2.800082206726074, "epoch": 0.07344837587006961, "grad_norm": 0.04839244857430458, "grad_norm_var": 2.5011815769652308e-05, "learning_rate": 0.00992681601356969, "loss": 2.7571, "step": 2026 }, { "crossentropy": 2.7783501148223877, "epoch": 0.07348462877030162, "grad_norm": 0.04924025759100914, "grad_norm_var": 2.54298870622666e-05, "learning_rate": 0.009926716923402665, "loss": 2.7717, "step": 2027 }, { "crossentropy": 2.803814172744751, "epoch": 0.07352088167053364, "grad_norm": 0.05377192422747612, "grad_norm_var": 2.566647182356459e-05, "learning_rate": 0.009926617766692854, "loss": 2.8334, "step": 2028 }, { "crossentropy": 2.604264736175537, "epoch": 0.07355713457076567, "grad_norm": 0.050140321254730225, "grad_norm_var": 2.584643957816244e-05, "learning_rate": 0.00992651854344159, "loss": 2.7218, "step": 2029 }, { "crossentropy": 2.761869430541992, "epoch": 0.07359338747099768, "grad_norm": 0.0484679713845253, "grad_norm_var": 2.630198854275025e-05, "learning_rate": 0.009926419253650218, "loss": 2.7321, "step": 2030 }, { "crossentropy": 2.7704944610595703, "epoch": 0.0736296403712297, "grad_norm": 0.04578019678592682, "grad_norm_var": 2.8160603509532328e-05, "learning_rate": 0.00992631989732008, "loss": 2.7522, "step": 2031 }, { "crossentropy": 2.6599605083465576, "epoch": 0.07366589327146171, "grad_norm": 0.04576724022626877, "grad_norm_var": 2.9827485509410315e-05, "learning_rate": 0.009926220474452513, "loss": 2.6388, "step": 2032 }, { "crossentropy": 2.7795801162719727, "epoch": 0.07370214617169374, "grad_norm": 0.04740096628665924, "grad_norm_var": 3.0483691845132703e-05, "learning_rate": 0.009926120985048864, "loss": 2.7755, "step": 2033 }, { "crossentropy": 2.729508876800537, "epoch": 0.07373839907192575, "grad_norm": 0.04906540736556053, "grad_norm_var": 2.985679669784963e-05, "learning_rate": 0.009926021429110475, "loss": 2.8068, "step": 2034 }, { "crossentropy": 2.7301087379455566, "epoch": 0.07377465197215777, "grad_norm": 0.04572197422385216, "grad_norm_var": 3.1397020502057135e-05, "learning_rate": 0.00992592180663869, "loss": 2.8202, "step": 2035 }, { "crossentropy": 2.8092257976531982, "epoch": 0.0738109048723898, "grad_norm": 0.0580693744122982, "grad_norm_var": 3.3784735704341504e-05, "learning_rate": 0.00992582211763486, "loss": 2.7532, "step": 2036 }, { "crossentropy": 2.7865664958953857, "epoch": 0.07384715777262181, "grad_norm": 0.05719595402479172, "grad_norm_var": 2.9195688257978764e-05, "learning_rate": 0.009925722362100322, "loss": 2.8679, "step": 2037 }, { "crossentropy": 2.722369432449341, "epoch": 0.07388341067285382, "grad_norm": 0.05839730426669121, "grad_norm_var": 1.893407613866993e-05, "learning_rate": 0.00992562254003643, "loss": 2.7594, "step": 2038 }, { "crossentropy": 2.9380877017974854, "epoch": 0.07391966357308585, "grad_norm": 0.05366268754005432, "grad_norm_var": 1.9514233769575966e-05, "learning_rate": 0.009925522651444531, "loss": 2.8139, "step": 2039 }, { "crossentropy": 2.859806537628174, "epoch": 0.07395591647331787, "grad_norm": 0.0541263185441494, "grad_norm_var": 1.9755467721367197e-05, "learning_rate": 0.009925422696325974, "loss": 2.8504, "step": 2040 }, { "crossentropy": 2.769193649291992, "epoch": 0.07399216937354988, "grad_norm": 0.053492579609155655, "grad_norm_var": 1.930804421504297e-05, "learning_rate": 0.00992532267468211, "loss": 2.6832, "step": 2041 }, { "crossentropy": 2.8380050659179688, "epoch": 0.0740284222737819, "grad_norm": 0.04837430641055107, "grad_norm_var": 1.9314779465421222e-05, "learning_rate": 0.009925222586514288, "loss": 2.8136, "step": 2042 }, { "crossentropy": 2.7863800525665283, "epoch": 0.07406467517401392, "grad_norm": 0.04919418320059776, "grad_norm_var": 1.932674967676003e-05, "learning_rate": 0.009925122431823861, "loss": 2.7204, "step": 2043 }, { "crossentropy": 2.803860664367676, "epoch": 0.07410092807424594, "grad_norm": 0.04982874169945717, "grad_norm_var": 1.8927561300551415e-05, "learning_rate": 0.00992502221061218, "loss": 2.8106, "step": 2044 }, { "crossentropy": 2.6687660217285156, "epoch": 0.07413718097447795, "grad_norm": 0.05260353907942772, "grad_norm_var": 1.9051414900482048e-05, "learning_rate": 0.009924921922880602, "loss": 2.745, "step": 2045 }, { "crossentropy": 2.697920322418213, "epoch": 0.07417343387470998, "grad_norm": 0.05383492633700371, "grad_norm_var": 1.8988396081261588e-05, "learning_rate": 0.009924821568630478, "loss": 2.7033, "step": 2046 }, { "crossentropy": 2.710675001144409, "epoch": 0.074209686774942, "grad_norm": 0.059481944888830185, "grad_norm_var": 2.044198705797584e-05, "learning_rate": 0.009924721147863165, "loss": 2.7664, "step": 2047 }, { "crossentropy": 2.7602972984313965, "epoch": 0.07424593967517401, "grad_norm": 0.0545029416680336, "grad_norm_var": 1.764482683589789e-05, "learning_rate": 0.00992462066058002, "loss": 2.7624, "step": 2048 }, { "crossentropy": 2.953299045562744, "epoch": 0.07428219257540604, "grad_norm": 0.058502938598394394, "grad_norm_var": 1.734203010428318e-05, "learning_rate": 0.0099245201067824, "loss": 2.9067, "step": 2049 }, { "crossentropy": 2.8309249877929688, "epoch": 0.07431844547563805, "grad_norm": 0.057368792593479156, "grad_norm_var": 1.6737736380535947e-05, "learning_rate": 0.00992441948647166, "loss": 2.7941, "step": 2050 }, { "crossentropy": 2.7351484298706055, "epoch": 0.07435469837587007, "grad_norm": 0.056666430085897446, "grad_norm_var": 1.2111560439405599e-05, "learning_rate": 0.009924318799649166, "loss": 2.7981, "step": 2051 }, { "crossentropy": 2.808483600616455, "epoch": 0.07439095127610208, "grad_norm": 0.04780031368136406, "grad_norm_var": 1.4097847336504387e-05, "learning_rate": 0.00992421804631627, "loss": 2.8267, "step": 2052 }, { "crossentropy": 2.8274800777435303, "epoch": 0.07442720417633411, "grad_norm": 0.052008118480443954, "grad_norm_var": 1.361396945970857e-05, "learning_rate": 0.009924117226474339, "loss": 2.8426, "step": 2053 }, { "crossentropy": 2.924898147583008, "epoch": 0.07446345707656613, "grad_norm": 0.0556655079126358, "grad_norm_var": 1.2384152783013561e-05, "learning_rate": 0.009924016340124731, "loss": 2.868, "step": 2054 }, { "crossentropy": 2.782668352127075, "epoch": 0.07449970997679814, "grad_norm": 0.050515566021203995, "grad_norm_var": 1.2964132688924637e-05, "learning_rate": 0.009923915387268811, "loss": 2.7469, "step": 2055 }, { "crossentropy": 2.855886936187744, "epoch": 0.07453596287703017, "grad_norm": 0.05278913304209709, "grad_norm_var": 1.2941567052533222e-05, "learning_rate": 0.009923814367907942, "loss": 2.8813, "step": 2056 }, { "crossentropy": 2.8283021450042725, "epoch": 0.07457221577726218, "grad_norm": 0.050010956823825836, "grad_norm_var": 1.3604841157884249e-05, "learning_rate": 0.009923713282043485, "loss": 2.8276, "step": 2057 }, { "crossentropy": 2.8474907875061035, "epoch": 0.0746084686774942, "grad_norm": 0.048518773168325424, "grad_norm_var": 1.3515661903828792e-05, "learning_rate": 0.00992361212967681, "loss": 2.772, "step": 2058 }, { "crossentropy": 2.7855327129364014, "epoch": 0.07464472157772621, "grad_norm": 0.0519280731678009, "grad_norm_var": 1.2566052028431471e-05, "learning_rate": 0.009923510910809282, "loss": 2.7704, "step": 2059 }, { "crossentropy": 2.5985639095306396, "epoch": 0.07468097447795824, "grad_norm": 0.05672670528292656, "grad_norm_var": 1.2391757550739233e-05, "learning_rate": 0.009923409625442265, "loss": 2.6339, "step": 2060 }, { "crossentropy": 2.936232805252075, "epoch": 0.07471722737819025, "grad_norm": 0.052988920360803604, "grad_norm_var": 1.2345583490897388e-05, "learning_rate": 0.00992330827357713, "loss": 2.8461, "step": 2061 }, { "crossentropy": 2.909606695175171, "epoch": 0.07475348027842227, "grad_norm": 0.05194473639130592, "grad_norm_var": 1.2536613101205885e-05, "learning_rate": 0.009923206855215248, "loss": 2.9368, "step": 2062 }, { "crossentropy": 2.746263027191162, "epoch": 0.0747897331786543, "grad_norm": 0.05350637808442116, "grad_norm_var": 1.0072960854349127e-05, "learning_rate": 0.009923105370357984, "loss": 2.8237, "step": 2063 }, { "crossentropy": 2.816760301589966, "epoch": 0.07482598607888631, "grad_norm": 0.05732126906514168, "grad_norm_var": 1.1053274628900629e-05, "learning_rate": 0.009923003819006711, "loss": 2.7453, "step": 2064 }, { "crossentropy": 2.7018609046936035, "epoch": 0.07486223897911833, "grad_norm": 0.0568852499127388, "grad_norm_var": 1.0114317702206108e-05, "learning_rate": 0.0099229022011628, "loss": 2.6827, "step": 2065 }, { "crossentropy": 2.6068334579467773, "epoch": 0.07489849187935035, "grad_norm": 0.05083124712109566, "grad_norm_var": 9.23043249012138e-06, "learning_rate": 0.009922800516827627, "loss": 2.725, "step": 2066 }, { "crossentropy": 2.771027088165283, "epoch": 0.07493474477958237, "grad_norm": 0.08389013260602951, "grad_norm_var": 6.928893111691985e-05, "learning_rate": 0.009922698766002559, "loss": 2.7563, "step": 2067 }, { "crossentropy": 2.7445285320281982, "epoch": 0.07497099767981438, "grad_norm": 0.05561444163322449, "grad_norm_var": 6.603824720907888e-05, "learning_rate": 0.009922596948688975, "loss": 2.6807, "step": 2068 }, { "crossentropy": 2.8797483444213867, "epoch": 0.0750072505800464, "grad_norm": 0.05213935673236847, "grad_norm_var": 6.598571797611496e-05, "learning_rate": 0.00992249506488825, "loss": 2.7886, "step": 2069 }, { "crossentropy": 2.5396058559417725, "epoch": 0.07504350348027843, "grad_norm": 0.047835055738687515, "grad_norm_var": 6.920642941779497e-05, "learning_rate": 0.009922393114601757, "loss": 2.6941, "step": 2070 }, { "crossentropy": 2.670664072036743, "epoch": 0.07507975638051044, "grad_norm": 0.04615216702222824, "grad_norm_var": 7.276705140724272e-05, "learning_rate": 0.009922291097830875, "loss": 2.6986, "step": 2071 }, { "crossentropy": 2.7447221279144287, "epoch": 0.07511600928074245, "grad_norm": 0.050402380526065826, "grad_norm_var": 7.360951759129038e-05, "learning_rate": 0.009922189014576983, "loss": 2.8026, "step": 2072 }, { "crossentropy": 2.7020561695098877, "epoch": 0.07515226218097448, "grad_norm": 0.046748362481594086, "grad_norm_var": 7.608337953439344e-05, "learning_rate": 0.009922086864841459, "loss": 2.6974, "step": 2073 }, { "crossentropy": 2.8342602252960205, "epoch": 0.0751885150812065, "grad_norm": 0.051980145275592804, "grad_norm_var": 7.431887045830961e-05, "learning_rate": 0.009921984648625681, "loss": 2.7623, "step": 2074 }, { "crossentropy": 2.707148790359497, "epoch": 0.07522476798143851, "grad_norm": 0.05082252621650696, "grad_norm_var": 7.47273429050579e-05, "learning_rate": 0.009921882365931032, "loss": 2.6627, "step": 2075 }, { "crossentropy": 2.783849000930786, "epoch": 0.07526102088167054, "grad_norm": 0.04553103446960449, "grad_norm_var": 7.865789298648687e-05, "learning_rate": 0.009921780016758893, "loss": 2.8107, "step": 2076 }, { "crossentropy": 2.603991746902466, "epoch": 0.07529727378190255, "grad_norm": 0.047385696321725845, "grad_norm_var": 8.09362973382556e-05, "learning_rate": 0.009921677601110644, "loss": 2.6882, "step": 2077 }, { "crossentropy": 2.7389578819274902, "epoch": 0.07533352668213457, "grad_norm": 0.0544666089117527, "grad_norm_var": 8.095814613913524e-05, "learning_rate": 0.00992157511898767, "loss": 2.7296, "step": 2078 }, { "crossentropy": 2.8289270401000977, "epoch": 0.07536977958236658, "grad_norm": 0.05700371786952019, "grad_norm_var": 8.185638090351943e-05, "learning_rate": 0.009921472570391356, "loss": 2.7736, "step": 2079 }, { "crossentropy": 2.6078736782073975, "epoch": 0.07540603248259861, "grad_norm": 0.059005171060562134, "grad_norm_var": 8.290545434591432e-05, "learning_rate": 0.009921369955323087, "loss": 2.7583, "step": 2080 }, { "crossentropy": 2.822566032409668, "epoch": 0.07544228538283063, "grad_norm": 0.05647945776581764, "grad_norm_var": 8.273492944957545e-05, "learning_rate": 0.00992126727378425, "loss": 2.78, "step": 2081 }, { "crossentropy": 2.5386877059936523, "epoch": 0.07547853828306264, "grad_norm": 0.0537431575357914, "grad_norm_var": 8.222174843949062e-05, "learning_rate": 0.009921164525776228, "loss": 2.6769, "step": 2082 }, { "crossentropy": 2.710564613342285, "epoch": 0.07551479118329467, "grad_norm": 0.05432868003845215, "grad_norm_var": 1.7843847926332645e-05, "learning_rate": 0.00992106171130041, "loss": 2.774, "step": 2083 }, { "crossentropy": 2.6983556747436523, "epoch": 0.07555104408352668, "grad_norm": 0.048404958099126816, "grad_norm_var": 1.747604530803141e-05, "learning_rate": 0.009920958830358187, "loss": 2.7245, "step": 2084 }, { "crossentropy": 2.8448386192321777, "epoch": 0.0755872969837587, "grad_norm": 0.04793370142579079, "grad_norm_var": 1.8167917071861458e-05, "learning_rate": 0.009920855882950946, "loss": 2.7717, "step": 2085 }, { "crossentropy": 2.7691879272460938, "epoch": 0.07562354988399073, "grad_norm": 0.04646808281540871, "grad_norm_var": 1.888687905378106e-05, "learning_rate": 0.00992075286908008, "loss": 2.7564, "step": 2086 }, { "crossentropy": 2.796769857406616, "epoch": 0.07565980278422274, "grad_norm": 0.04798479750752449, "grad_norm_var": 1.789914548578162e-05, "learning_rate": 0.009920649788746978, "loss": 2.8162, "step": 2087 }, { "crossentropy": 2.812124252319336, "epoch": 0.07569605568445475, "grad_norm": 0.04743465036153793, "grad_norm_var": 1.875257481174456e-05, "learning_rate": 0.009920546641953035, "loss": 2.7768, "step": 2088 }, { "crossentropy": 2.69940447807312, "epoch": 0.07573230858468677, "grad_norm": 0.047442302107810974, "grad_norm_var": 1.8390902723835975e-05, "learning_rate": 0.00992044342869964, "loss": 2.6662, "step": 2089 }, { "crossentropy": 2.7037672996520996, "epoch": 0.0757685614849188, "grad_norm": 0.0463123694062233, "grad_norm_var": 1.96775200766191e-05, "learning_rate": 0.009920340148988191, "loss": 2.7264, "step": 2090 }, { "crossentropy": 2.7794806957244873, "epoch": 0.07580481438515081, "grad_norm": 0.0553862527012825, "grad_norm_var": 2.107103327695503e-05, "learning_rate": 0.00992023680282008, "loss": 2.7685, "step": 2091 }, { "crossentropy": 2.711880922317505, "epoch": 0.07584106728538283, "grad_norm": 0.04707668721675873, "grad_norm_var": 2.0102144828842924e-05, "learning_rate": 0.009920133390196706, "loss": 2.7355, "step": 2092 }, { "crossentropy": 2.8495066165924072, "epoch": 0.07587732018561485, "grad_norm": 0.05331236496567726, "grad_norm_var": 1.939908708516267e-05, "learning_rate": 0.009920029911119463, "loss": 2.8198, "step": 2093 }, { "crossentropy": 2.7604029178619385, "epoch": 0.07591357308584687, "grad_norm": 0.05398482084274292, "grad_norm_var": 1.9218138036471388e-05, "learning_rate": 0.00991992636558975, "loss": 2.7853, "step": 2094 }, { "crossentropy": 2.840726852416992, "epoch": 0.07594982598607888, "grad_norm": 0.0487307645380497, "grad_norm_var": 1.7307694967811742e-05, "learning_rate": 0.009919822753608965, "loss": 2.8612, "step": 2095 }, { "crossentropy": 2.798649549484253, "epoch": 0.0759860788863109, "grad_norm": 0.0457671582698822, "grad_norm_var": 1.3913311546086733e-05, "learning_rate": 0.009919719075178508, "loss": 2.7966, "step": 2096 }, { "crossentropy": 2.82726788520813, "epoch": 0.07602233178654293, "grad_norm": 0.04879046604037285, "grad_norm_var": 1.1016248072006028e-05, "learning_rate": 0.009919615330299779, "loss": 2.8128, "step": 2097 }, { "crossentropy": 2.8796117305755615, "epoch": 0.07605858468677494, "grad_norm": 0.04882320761680603, "grad_norm_var": 9.790783571598088e-06, "learning_rate": 0.009919511518974179, "loss": 2.8667, "step": 2098 }, { "crossentropy": 2.7599010467529297, "epoch": 0.07609483758700696, "grad_norm": 0.04672246053814888, "grad_norm_var": 8.267576607097714e-06, "learning_rate": 0.009919407641203108, "loss": 2.7839, "step": 2099 }, { "crossentropy": 2.7470192909240723, "epoch": 0.07613109048723898, "grad_norm": 0.04330229014158249, "grad_norm_var": 1.015410628266335e-05, "learning_rate": 0.009919303696987973, "loss": 2.7399, "step": 2100 }, { "crossentropy": 2.8107309341430664, "epoch": 0.076167343387471, "grad_norm": 0.049314793199300766, "grad_norm_var": 1.017511078514512e-05, "learning_rate": 0.009919199686330179, "loss": 2.7829, "step": 2101 }, { "crossentropy": 2.8237123489379883, "epoch": 0.07620359628770301, "grad_norm": 0.045600779354572296, "grad_norm_var": 1.0463264548341937e-05, "learning_rate": 0.009919095609231125, "loss": 2.6752, "step": 2102 }, { "crossentropy": 2.6946356296539307, "epoch": 0.07623984918793504, "grad_norm": 0.04478510469198227, "grad_norm_var": 1.1322571405923167e-05, "learning_rate": 0.00991899146569222, "loss": 2.7593, "step": 2103 }, { "crossentropy": 2.785280704498291, "epoch": 0.07627610208816706, "grad_norm": 0.04399905353784561, "grad_norm_var": 1.2456290941272909e-05, "learning_rate": 0.00991888725571487, "loss": 2.7799, "step": 2104 }, { "crossentropy": 2.738211154937744, "epoch": 0.07631235498839907, "grad_norm": 0.04421928897500038, "grad_norm_var": 1.338147404884189e-05, "learning_rate": 0.009918782979300485, "loss": 2.767, "step": 2105 }, { "crossentropy": 2.7583720684051514, "epoch": 0.07634860788863108, "grad_norm": 0.04816862940788269, "grad_norm_var": 1.3208099341109152e-05, "learning_rate": 0.00991867863645047, "loss": 2.7957, "step": 2106 }, { "crossentropy": 2.8471999168395996, "epoch": 0.07638486078886311, "grad_norm": 0.049527496099472046, "grad_norm_var": 9.582737535764284e-06, "learning_rate": 0.009918574227166237, "loss": 2.7302, "step": 2107 }, { "crossentropy": 2.7577884197235107, "epoch": 0.07642111368909513, "grad_norm": 0.048706524074077606, "grad_norm_var": 9.627903119241444e-06, "learning_rate": 0.009918469751449192, "loss": 2.7595, "step": 2108 }, { "crossentropy": 2.82611346244812, "epoch": 0.07645736658932714, "grad_norm": 0.05108053982257843, "grad_norm_var": 8.27943532404011e-06, "learning_rate": 0.009918365209300752, "loss": 2.7877, "step": 2109 }, { "crossentropy": 2.7418031692504883, "epoch": 0.07649361948955917, "grad_norm": 0.05354434996843338, "grad_norm_var": 7.916302969016425e-06, "learning_rate": 0.009918260600722324, "loss": 2.8054, "step": 2110 }, { "crossentropy": 2.7597568035125732, "epoch": 0.07652987238979118, "grad_norm": 0.0510750412940979, "grad_norm_var": 8.623325156389971e-06, "learning_rate": 0.009918155925715324, "loss": 2.7924, "step": 2111 }, { "crossentropy": 2.7305026054382324, "epoch": 0.0765661252900232, "grad_norm": 0.04781293496489525, "grad_norm_var": 8.353805575769292e-06, "learning_rate": 0.009918051184281166, "loss": 2.8127, "step": 2112 }, { "crossentropy": 2.654738187789917, "epoch": 0.07660237819025523, "grad_norm": 0.05201118066906929, "grad_norm_var": 9.409390947486863e-06, "learning_rate": 0.009917946376421261, "loss": 2.7089, "step": 2113 }, { "crossentropy": 2.689913272857666, "epoch": 0.07663863109048724, "grad_norm": 0.04739860072731972, "grad_norm_var": 9.388103808403297e-06, "learning_rate": 0.009917841502137027, "loss": 2.7232, "step": 2114 }, { "crossentropy": 2.7479214668273926, "epoch": 0.07667488399071926, "grad_norm": 0.04527657851576805, "grad_norm_var": 9.756247228503988e-06, "learning_rate": 0.00991773656142988, "loss": 2.7473, "step": 2115 }, { "crossentropy": 2.7332661151885986, "epoch": 0.07671113689095127, "grad_norm": 0.047332968562841415, "grad_norm_var": 8.320101223378319e-06, "learning_rate": 0.009917631554301238, "loss": 2.8028, "step": 2116 }, { "crossentropy": 2.8079071044921875, "epoch": 0.0767473897911833, "grad_norm": 0.04512076452374458, "grad_norm_var": 8.749024107642239e-06, "learning_rate": 0.00991752648075252, "loss": 2.7653, "step": 2117 }, { "crossentropy": 2.772214651107788, "epoch": 0.07678364269141531, "grad_norm": 0.0458076037466526, "grad_norm_var": 8.689568674389441e-06, "learning_rate": 0.009917421340785144, "loss": 2.8001, "step": 2118 }, { "crossentropy": 2.623283624649048, "epoch": 0.07681989559164733, "grad_norm": 0.045596979558467865, "grad_norm_var": 8.397185993348997e-06, "learning_rate": 0.009917316134400531, "loss": 2.7095, "step": 2119 }, { "crossentropy": 2.8519160747528076, "epoch": 0.07685614849187936, "grad_norm": 0.045657020062208176, "grad_norm_var": 7.70278914599126e-06, "learning_rate": 0.009917210861600101, "loss": 2.8014, "step": 2120 }, { "crossentropy": 2.9165000915527344, "epoch": 0.07689240139211137, "grad_norm": 0.05006633326411247, "grad_norm_var": 6.875673803337772e-06, "learning_rate": 0.009917105522385276, "loss": 2.8152, "step": 2121 }, { "crossentropy": 2.8733139038085938, "epoch": 0.07692865429234338, "grad_norm": 0.049689099192619324, "grad_norm_var": 6.9760000843914365e-06, "learning_rate": 0.00991700011675748, "loss": 2.8294, "step": 2122 }, { "crossentropy": 2.783639430999756, "epoch": 0.0769649071925754, "grad_norm": 0.04601652920246124, "grad_norm_var": 7.256770035159581e-06, "learning_rate": 0.009916894644718134, "loss": 2.8182, "step": 2123 }, { "crossentropy": 2.83742618560791, "epoch": 0.07700116009280743, "grad_norm": 0.052321601659059525, "grad_norm_var": 8.287802782730447e-06, "learning_rate": 0.009916789106268665, "loss": 2.785, "step": 2124 }, { "crossentropy": 2.6536600589752197, "epoch": 0.07703741299303944, "grad_norm": 0.05963245406746864, "grad_norm_var": 1.5814903248954847e-05, "learning_rate": 0.009916683501410497, "loss": 2.7428, "step": 2125 }, { "crossentropy": 2.778773307800293, "epoch": 0.07707366589327146, "grad_norm": 0.05511189624667168, "grad_norm_var": 1.691357235528134e-05, "learning_rate": 0.009916577830145058, "loss": 2.7494, "step": 2126 }, { "crossentropy": 2.8011600971221924, "epoch": 0.07710991879350348, "grad_norm": 0.07091549783945084, "grad_norm_var": 4.668690522267778e-05, "learning_rate": 0.009916472092473773, "loss": 2.8315, "step": 2127 }, { "crossentropy": 2.741143226623535, "epoch": 0.0771461716937355, "grad_norm": 0.07675857841968536, "grad_norm_var": 8.922041586175567e-05, "learning_rate": 0.00991636628839807, "loss": 2.747, "step": 2128 }, { "crossentropy": 2.7569057941436768, "epoch": 0.07718242459396751, "grad_norm": 0.044471222907304764, "grad_norm_var": 9.293286981400106e-05, "learning_rate": 0.009916260417919383, "loss": 2.7659, "step": 2129 }, { "crossentropy": 2.81646728515625, "epoch": 0.07721867749419954, "grad_norm": 0.046711117029190063, "grad_norm_var": 9.335654450232563e-05, "learning_rate": 0.009916154481039136, "loss": 2.7465, "step": 2130 }, { "crossentropy": 2.65028977394104, "epoch": 0.07725493039443156, "grad_norm": 0.04851669818162918, "grad_norm_var": 9.125694450036742e-05, "learning_rate": 0.009916048477758763, "loss": 2.6529, "step": 2131 }, { "crossentropy": 2.794302463531494, "epoch": 0.07729118329466357, "grad_norm": 0.05009210482239723, "grad_norm_var": 9.006809372025521e-05, "learning_rate": 0.009915942408079694, "loss": 2.7381, "step": 2132 }, { "crossentropy": 2.7248308658599854, "epoch": 0.07732743619489559, "grad_norm": 0.0513441301882267, "grad_norm_var": 8.675529111317604e-05, "learning_rate": 0.009915836272003365, "loss": 2.7425, "step": 2133 }, { "crossentropy": 2.753697395324707, "epoch": 0.07736368909512761, "grad_norm": 0.062190745025873184, "grad_norm_var": 8.908802559361017e-05, "learning_rate": 0.009915730069531203, "loss": 2.7074, "step": 2134 }, { "crossentropy": 2.822585105895996, "epoch": 0.07739994199535963, "grad_norm": 0.056633830070495605, "grad_norm_var": 8.515486402299871e-05, "learning_rate": 0.009915623800664651, "loss": 2.823, "step": 2135 }, { "crossentropy": 2.682453155517578, "epoch": 0.07743619489559164, "grad_norm": 0.05529533326625824, "grad_norm_var": 8.006830943651326e-05, "learning_rate": 0.009915517465405137, "loss": 2.7021, "step": 2136 }, { "crossentropy": 2.9596519470214844, "epoch": 0.07747244779582367, "grad_norm": 0.050490740686655045, "grad_norm_var": 7.981535276823636e-05, "learning_rate": 0.0099154110637541, "loss": 2.8786, "step": 2137 }, { "crossentropy": 2.6580862998962402, "epoch": 0.07750870069605569, "grad_norm": 0.045031387358903885, "grad_norm_var": 8.43216435731569e-05, "learning_rate": 0.009915304595712979, "loss": 2.7105, "step": 2138 }, { "crossentropy": 2.78312611579895, "epoch": 0.0775449535962877, "grad_norm": 0.04668654128909111, "grad_norm_var": 8.35944330623226e-05, "learning_rate": 0.00991519806128321, "loss": 2.7988, "step": 2139 }, { "crossentropy": 2.750471830368042, "epoch": 0.07758120649651973, "grad_norm": 0.046857018023729324, "grad_norm_var": 8.705727731460958e-05, "learning_rate": 0.00991509146046623, "loss": 2.8052, "step": 2140 }, { "crossentropy": 2.7424731254577637, "epoch": 0.07761745939675174, "grad_norm": 0.04811940714716911, "grad_norm_var": 8.695825381351275e-05, "learning_rate": 0.009914984793263483, "loss": 2.7096, "step": 2141 }, { "crossentropy": 2.698888063430786, "epoch": 0.07765371229698376, "grad_norm": 0.04659384489059448, "grad_norm_var": 8.960745961883032e-05, "learning_rate": 0.009914878059676406, "loss": 2.6305, "step": 2142 }, { "crossentropy": 2.6707186698913574, "epoch": 0.07768996519721577, "grad_norm": 0.04838840663433075, "grad_norm_var": 6.727061265633933e-05, "learning_rate": 0.009914771259706445, "loss": 2.7892, "step": 2143 }, { "crossentropy": 2.762049674987793, "epoch": 0.0777262180974478, "grad_norm": 0.05330384150147438, "grad_norm_var": 2.269769543379041e-05, "learning_rate": 0.009914664393355038, "loss": 2.8304, "step": 2144 }, { "crossentropy": 2.7510390281677246, "epoch": 0.07776247099767981, "grad_norm": 0.05336882546544075, "grad_norm_var": 2.103274762117191e-05, "learning_rate": 0.009914557460623629, "loss": 2.7623, "step": 2145 }, { "crossentropy": 2.8023898601531982, "epoch": 0.07779872389791183, "grad_norm": 0.049695465713739395, "grad_norm_var": 2.0041359821353946e-05, "learning_rate": 0.009914450461513664, "loss": 2.8293, "step": 2146 }, { "crossentropy": 2.7075741291046143, "epoch": 0.07783497679814386, "grad_norm": 0.05721602216362953, "grad_norm_var": 2.2136721079005218e-05, "learning_rate": 0.009914343396026589, "loss": 2.736, "step": 2147 }, { "crossentropy": 2.704517364501953, "epoch": 0.07787122969837587, "grad_norm": 0.05164794623851776, "grad_norm_var": 2.2030857007312014e-05, "learning_rate": 0.009914236264163847, "loss": 2.7523, "step": 2148 }, { "crossentropy": 2.979910373687744, "epoch": 0.07790748259860789, "grad_norm": 0.0487651526927948, "grad_norm_var": 2.247572490570888e-05, "learning_rate": 0.009914129065926886, "loss": 2.8354, "step": 2149 }, { "crossentropy": 2.6249210834503174, "epoch": 0.07794373549883991, "grad_norm": 0.052039407193660736, "grad_norm_var": 1.413196994416299e-05, "learning_rate": 0.009914021801317157, "loss": 2.7679, "step": 2150 }, { "crossentropy": 2.820723056793213, "epoch": 0.07797998839907193, "grad_norm": 0.0526898168027401, "grad_norm_var": 1.1948695295397466e-05, "learning_rate": 0.009913914470336104, "loss": 2.8322, "step": 2151 }, { "crossentropy": 2.7635557651519775, "epoch": 0.07801624129930394, "grad_norm": 0.05409902334213257, "grad_norm_var": 1.1255195920188005e-05, "learning_rate": 0.00991380707298518, "loss": 2.7387, "step": 2152 }, { "crossentropy": 2.903564929962158, "epoch": 0.07805249419953596, "grad_norm": 0.05206955596804619, "grad_norm_var": 1.144860235061107e-05, "learning_rate": 0.009913699609265834, "loss": 2.8577, "step": 2153 }, { "crossentropy": 2.625230550765991, "epoch": 0.07808874709976799, "grad_norm": 0.051452700048685074, "grad_norm_var": 9.420023010398941e-06, "learning_rate": 0.00991359207917952, "loss": 2.6959, "step": 2154 }, { "crossentropy": 2.7589025497436523, "epoch": 0.078125, "grad_norm": 0.05103462189435959, "grad_norm_var": 8.20989021958664e-06, "learning_rate": 0.009913484482727686, "loss": 2.7702, "step": 2155 }, { "crossentropy": 2.851317882537842, "epoch": 0.07816125290023201, "grad_norm": 0.048406414687633514, "grad_norm_var": 7.486731375113317e-06, "learning_rate": 0.009913376819911788, "loss": 2.8376, "step": 2156 }, { "crossentropy": 2.705206871032715, "epoch": 0.07819750580046404, "grad_norm": 0.04870903864502907, "grad_norm_var": 7.26779283801877e-06, "learning_rate": 0.00991326909073328, "loss": 2.8261, "step": 2157 }, { "crossentropy": 2.8579330444335938, "epoch": 0.07823375870069606, "grad_norm": 0.06228760629892349, "grad_norm_var": 1.298616071453799e-05, "learning_rate": 0.009913161295193616, "loss": 2.873, "step": 2158 }, { "crossentropy": 2.7019619941711426, "epoch": 0.07827001160092807, "grad_norm": 0.06005597114562988, "grad_norm_var": 1.5567356049423867e-05, "learning_rate": 0.009913053433294254, "loss": 2.7057, "step": 2159 }, { "crossentropy": 2.785447835922241, "epoch": 0.07830626450116009, "grad_norm": 0.0468013733625412, "grad_norm_var": 1.7883776703308113e-05, "learning_rate": 0.00991294550503665, "loss": 2.7781, "step": 2160 }, { "crossentropy": 2.643899917602539, "epoch": 0.07834251740139211, "grad_norm": 0.04631583020091057, "grad_norm_var": 2.0195701454670383e-05, "learning_rate": 0.009912837510422259, "loss": 2.6547, "step": 2161 }, { "crossentropy": 2.7512614727020264, "epoch": 0.07837877030162413, "grad_norm": 0.04725802689790726, "grad_norm_var": 2.1342095676944035e-05, "learning_rate": 0.009912729449452545, "loss": 2.761, "step": 2162 }, { "crossentropy": 2.911954641342163, "epoch": 0.07841502320185614, "grad_norm": 0.053260888904333115, "grad_norm_var": 1.9531160538907576e-05, "learning_rate": 0.009912621322128962, "loss": 2.8565, "step": 2163 }, { "crossentropy": 2.6972408294677734, "epoch": 0.07845127610208817, "grad_norm": 0.05630664527416229, "grad_norm_var": 2.0867198098011484e-05, "learning_rate": 0.009912513128452973, "loss": 2.703, "step": 2164 }, { "crossentropy": 2.7130203247070312, "epoch": 0.07848752900232019, "grad_norm": 0.05908309295773506, "grad_norm_var": 2.3109193803347246e-05, "learning_rate": 0.00991240486842604, "loss": 2.7536, "step": 2165 }, { "crossentropy": 2.790349245071411, "epoch": 0.0785237819025522, "grad_norm": 0.04998442158102989, "grad_norm_var": 2.353135447459275e-05, "learning_rate": 0.009912296542049623, "loss": 2.6985, "step": 2166 }, { "crossentropy": 2.554532289505005, "epoch": 0.07856003480278423, "grad_norm": 0.05099363252520561, "grad_norm_var": 2.3665626418545075e-05, "learning_rate": 0.00991218814932519, "loss": 2.7235, "step": 2167 }, { "crossentropy": 2.846658706665039, "epoch": 0.07859628770301624, "grad_norm": 0.046707089990377426, "grad_norm_var": 2.5388807458280534e-05, "learning_rate": 0.009912079690254198, "loss": 2.8619, "step": 2168 }, { "crossentropy": 2.755220890045166, "epoch": 0.07863254060324826, "grad_norm": 0.048611029982566833, "grad_norm_var": 2.6067628487038094e-05, "learning_rate": 0.009911971164838117, "loss": 2.685, "step": 2169 }, { "crossentropy": 2.764695167541504, "epoch": 0.07866879350348027, "grad_norm": 0.04685620591044426, "grad_norm_var": 2.754229449226653e-05, "learning_rate": 0.009911862573078412, "loss": 2.817, "step": 2170 }, { "crossentropy": 2.803288221359253, "epoch": 0.0787050464037123, "grad_norm": 0.04510413855314255, "grad_norm_var": 3.0042811874797515e-05, "learning_rate": 0.009911753914976546, "loss": 2.7426, "step": 2171 }, { "crossentropy": 2.867279529571533, "epoch": 0.07874129930394431, "grad_norm": 0.048355747014284134, "grad_norm_var": 3.0060806828101766e-05, "learning_rate": 0.009911645190533992, "loss": 2.7295, "step": 2172 }, { "crossentropy": 2.577233076095581, "epoch": 0.07877755220417633, "grad_norm": 0.05428203567862511, "grad_norm_var": 3.0267535148094326e-05, "learning_rate": 0.009911536399752216, "loss": 2.6396, "step": 2173 }, { "crossentropy": 2.6894993782043457, "epoch": 0.07881380510440836, "grad_norm": 0.05161505192518234, "grad_norm_var": 2.188123698865224e-05, "learning_rate": 0.009911427542632687, "loss": 2.7143, "step": 2174 }, { "crossentropy": 2.5723845958709717, "epoch": 0.07885005800464037, "grad_norm": 0.052396923303604126, "grad_norm_var": 1.6018140279974667e-05, "learning_rate": 0.009911318619176875, "loss": 2.6836, "step": 2175 }, { "crossentropy": 2.7844038009643555, "epoch": 0.07888631090487239, "grad_norm": 0.05400007963180542, "grad_norm_var": 1.595096055880581e-05, "learning_rate": 0.009911209629386254, "loss": 2.8176, "step": 2176 }, { "crossentropy": 2.7218523025512695, "epoch": 0.07892256380510441, "grad_norm": 0.052455171942710876, "grad_norm_var": 1.4721429776300375e-05, "learning_rate": 0.009911100573262291, "loss": 2.8472, "step": 2177 }, { "crossentropy": 2.7946858406066895, "epoch": 0.07895881670533643, "grad_norm": 0.04884560406208038, "grad_norm_var": 1.4070061114671943e-05, "learning_rate": 0.009910991450806464, "loss": 2.73, "step": 2178 }, { "crossentropy": 2.7108469009399414, "epoch": 0.07899506960556844, "grad_norm": 0.059693191200494766, "grad_norm_var": 1.844181452439267e-05, "learning_rate": 0.009910882262020242, "loss": 2.7841, "step": 2179 }, { "crossentropy": 2.766331672668457, "epoch": 0.07903132250580046, "grad_norm": 0.05594077333807945, "grad_norm_var": 1.8219632012414903e-05, "learning_rate": 0.009910773006905104, "loss": 2.7836, "step": 2180 }, { "crossentropy": 2.6907031536102295, "epoch": 0.07906757540603249, "grad_norm": 0.0647846981883049, "grad_norm_var": 2.597226319445307e-05, "learning_rate": 0.009910663685462527, "loss": 2.7885, "step": 2181 }, { "crossentropy": 2.7178921699523926, "epoch": 0.0791038283062645, "grad_norm": 0.06643133610486984, "grad_norm_var": 3.864691469108213e-05, "learning_rate": 0.00991055429769398, "loss": 2.7826, "step": 2182 }, { "crossentropy": 2.8108768463134766, "epoch": 0.07914008120649652, "grad_norm": 0.056326206773519516, "grad_norm_var": 3.903884636295609e-05, "learning_rate": 0.00991044484360095, "loss": 2.8023, "step": 2183 }, { "crossentropy": 2.6244120597839355, "epoch": 0.07917633410672854, "grad_norm": 0.050095733255147934, "grad_norm_var": 3.6788871407119186e-05, "learning_rate": 0.009910335323184907, "loss": 2.7173, "step": 2184 }, { "crossentropy": 2.7965147495269775, "epoch": 0.07921258700696056, "grad_norm": 0.048490315675735474, "grad_norm_var": 3.6868264004050595e-05, "learning_rate": 0.009910225736447335, "loss": 2.7747, "step": 2185 }, { "crossentropy": 2.8873167037963867, "epoch": 0.07924883990719257, "grad_norm": 0.04773464798927307, "grad_norm_var": 3.6140726544236824e-05, "learning_rate": 0.009910116083389714, "loss": 2.8643, "step": 2186 }, { "crossentropy": 2.797049045562744, "epoch": 0.0792850928074246, "grad_norm": 0.05145428329706192, "grad_norm_var": 3.152314723776801e-05, "learning_rate": 0.009910006364013521, "loss": 2.8087, "step": 2187 }, { "crossentropy": 2.735541820526123, "epoch": 0.07932134570765662, "grad_norm": 0.056499678641557693, "grad_norm_var": 2.9614049503897077e-05, "learning_rate": 0.009909896578320244, "loss": 2.7414, "step": 2188 }, { "crossentropy": 2.8620169162750244, "epoch": 0.07935759860788863, "grad_norm": 0.06465304642915726, "grad_norm_var": 3.6117487410217086e-05, "learning_rate": 0.00990978672631136, "loss": 2.8115, "step": 2189 }, { "crossentropy": 2.8502309322357178, "epoch": 0.07939385150812064, "grad_norm": 0.07607972621917725, "grad_norm_var": 6.219461742864995e-05, "learning_rate": 0.009909676807988358, "loss": 2.7902, "step": 2190 }, { "crossentropy": 2.8227272033691406, "epoch": 0.07943010440835267, "grad_norm": 0.052393216639757156, "grad_norm_var": 6.21967042323072e-05, "learning_rate": 0.009909566823352717, "loss": 2.7701, "step": 2191 }, { "crossentropy": 2.796086072921753, "epoch": 0.07946635730858469, "grad_norm": 0.051095183938741684, "grad_norm_var": 6.373782772307503e-05, "learning_rate": 0.009909456772405926, "loss": 2.7891, "step": 2192 }, { "crossentropy": 2.6842916011810303, "epoch": 0.0795026102088167, "grad_norm": 0.04663239046931267, "grad_norm_var": 6.894732103366308e-05, "learning_rate": 0.009909346655149474, "loss": 2.7483, "step": 2193 }, { "crossentropy": 2.8274810314178467, "epoch": 0.07953886310904873, "grad_norm": 0.04512741044163704, "grad_norm_var": 7.33938722029749e-05, "learning_rate": 0.009909236471584842, "loss": 2.7922, "step": 2194 }, { "crossentropy": 2.6111462116241455, "epoch": 0.07957511600928074, "grad_norm": 0.04514937102794647, "grad_norm_var": 7.914103614140184e-05, "learning_rate": 0.009909126221713523, "loss": 2.6754, "step": 2195 }, { "crossentropy": 2.797502279281616, "epoch": 0.07961136890951276, "grad_norm": 0.04440996050834656, "grad_norm_var": 8.589777932531725e-05, "learning_rate": 0.009909015905537005, "loss": 2.747, "step": 2196 }, { "crossentropy": 2.7625069618225098, "epoch": 0.07964762180974477, "grad_norm": 0.04553046077489853, "grad_norm_var": 8.191998467410907e-05, "learning_rate": 0.009908905523056776, "loss": 2.7625, "step": 2197 }, { "crossentropy": 2.739946126937866, "epoch": 0.0796838747099768, "grad_norm": 0.04833174869418144, "grad_norm_var": 6.999665463248705e-05, "learning_rate": 0.00990879507427433, "loss": 2.6754, "step": 2198 }, { "crossentropy": 2.6109564304351807, "epoch": 0.07972012761020882, "grad_norm": 0.05215208977460861, "grad_norm_var": 6.860841114425803e-05, "learning_rate": 0.009908684559191154, "loss": 2.6274, "step": 2199 }, { "crossentropy": 2.7106661796569824, "epoch": 0.07975638051044083, "grad_norm": 0.04934851452708244, "grad_norm_var": 6.87946035562117e-05, "learning_rate": 0.009908573977808748, "loss": 2.6945, "step": 2200 }, { "crossentropy": 2.813372850418091, "epoch": 0.07979263341067286, "grad_norm": 0.04907135292887688, "grad_norm_var": 6.857729942527253e-05, "learning_rate": 0.009908463330128599, "loss": 2.7936, "step": 2201 }, { "crossentropy": 2.7758023738861084, "epoch": 0.07982888631090487, "grad_norm": 0.05680260434746742, "grad_norm_var": 6.90383262821105e-05, "learning_rate": 0.009908352616152204, "loss": 2.6463, "step": 2202 }, { "crossentropy": 2.8193180561065674, "epoch": 0.07986513921113689, "grad_norm": 0.06302308291196823, "grad_norm_var": 7.629808461588749e-05, "learning_rate": 0.009908241835881059, "loss": 2.7268, "step": 2203 }, { "crossentropy": 2.667539119720459, "epoch": 0.07990139211136892, "grad_norm": 0.053136248141527176, "grad_norm_var": 7.538801624529656e-05, "learning_rate": 0.00990813098931666, "loss": 2.7452, "step": 2204 }, { "crossentropy": 2.707580327987671, "epoch": 0.07993764501160093, "grad_norm": 0.04585370793938637, "grad_norm_var": 6.747391828729349e-05, "learning_rate": 0.009908020076460503, "loss": 2.7187, "step": 2205 }, { "crossentropy": 2.759612798690796, "epoch": 0.07997389791183294, "grad_norm": 0.04786133021116257, "grad_norm_var": 2.479345789820712e-05, "learning_rate": 0.009907909097314085, "loss": 2.8054, "step": 2206 }, { "crossentropy": 2.866403341293335, "epoch": 0.08001015081206496, "grad_norm": 0.04722389206290245, "grad_norm_var": 2.463825501756144e-05, "learning_rate": 0.009907798051878908, "loss": 2.7753, "step": 2207 }, { "crossentropy": 2.5997939109802246, "epoch": 0.08004640371229699, "grad_norm": 0.051583804190158844, "grad_norm_var": 2.4762194546841474e-05, "learning_rate": 0.00990768694015647, "loss": 2.5904, "step": 2208 }, { "crossentropy": 2.802506446838379, "epoch": 0.080082656612529, "grad_norm": 0.05327446758747101, "grad_norm_var": 2.5022113246475398e-05, "learning_rate": 0.009907575762148273, "loss": 2.8031, "step": 2209 }, { "crossentropy": 2.6309142112731934, "epoch": 0.08011890951276102, "grad_norm": 0.05454077944159508, "grad_norm_var": 2.4610967764045662e-05, "learning_rate": 0.009907464517855817, "loss": 2.7376, "step": 2210 }, { "crossentropy": 2.6773555278778076, "epoch": 0.08015516241299304, "grad_norm": 0.05871395394206047, "grad_norm_var": 2.651350148723987e-05, "learning_rate": 0.009907353207280607, "loss": 2.728, "step": 2211 }, { "crossentropy": 2.9519362449645996, "epoch": 0.08019141531322506, "grad_norm": 0.04760744050145149, "grad_norm_var": 2.4213513362944528e-05, "learning_rate": 0.009907241830424143, "loss": 2.9262, "step": 2212 }, { "crossentropy": 2.711111545562744, "epoch": 0.08022766821345707, "grad_norm": 0.047632697969675064, "grad_norm_var": 2.281550238787764e-05, "learning_rate": 0.009907130387287931, "loss": 2.7896, "step": 2213 }, { "crossentropy": 2.773214817047119, "epoch": 0.0802639211136891, "grad_norm": 0.04765806347131729, "grad_norm_var": 2.314056884258874e-05, "learning_rate": 0.009907018877873477, "loss": 2.7706, "step": 2214 }, { "crossentropy": 2.691188335418701, "epoch": 0.08030017401392112, "grad_norm": 0.046661995351314545, "grad_norm_var": 2.46149475043346e-05, "learning_rate": 0.009906907302182286, "loss": 2.792, "step": 2215 }, { "crossentropy": 2.747451066970825, "epoch": 0.08033642691415313, "grad_norm": 0.04743387550115585, "grad_norm_var": 2.5329387011969497e-05, "learning_rate": 0.009906795660215866, "loss": 2.6985, "step": 2216 }, { "crossentropy": 2.783952236175537, "epoch": 0.08037267981438515, "grad_norm": 0.0449879914522171, "grad_norm_var": 2.7492304818464015e-05, "learning_rate": 0.009906683951975723, "loss": 2.7124, "step": 2217 }, { "crossentropy": 2.8111824989318848, "epoch": 0.08040893271461717, "grad_norm": 0.04705936089158058, "grad_norm_var": 2.572460369731517e-05, "learning_rate": 0.00990657217746337, "loss": 2.7827, "step": 2218 }, { "crossentropy": 2.675962448120117, "epoch": 0.08044518561484919, "grad_norm": 0.06531664729118347, "grad_norm_var": 2.9954669739061988e-05, "learning_rate": 0.009906460336680311, "loss": 2.7197, "step": 2219 }, { "crossentropy": 2.6446354389190674, "epoch": 0.0804814385150812, "grad_norm": 0.04703211411833763, "grad_norm_var": 3.0063897665475596e-05, "learning_rate": 0.009906348429628061, "loss": 2.6838, "step": 2220 }, { "crossentropy": 2.724642515182495, "epoch": 0.08051769141531323, "grad_norm": 0.04527537152171135, "grad_norm_var": 3.0406659915064675e-05, "learning_rate": 0.009906236456308129, "loss": 2.6109, "step": 2221 }, { "crossentropy": 2.7585690021514893, "epoch": 0.08055394431554525, "grad_norm": 0.044416289776563644, "grad_norm_var": 3.212689220597238e-05, "learning_rate": 0.009906124416722027, "loss": 2.7543, "step": 2222 }, { "crossentropy": 2.7516355514526367, "epoch": 0.08059019721577726, "grad_norm": 0.07340454310178757, "grad_norm_var": 6.605666921743562e-05, "learning_rate": 0.00990601231087127, "loss": 2.7628, "step": 2223 }, { "crossentropy": 2.670119047164917, "epoch": 0.08062645011600927, "grad_norm": 0.0554637610912323, "grad_norm_var": 6.708618822345095e-05, "learning_rate": 0.009905900138757374, "loss": 2.7399, "step": 2224 }, { "crossentropy": 2.742485284805298, "epoch": 0.0806627030162413, "grad_norm": 0.058623313903808594, "grad_norm_var": 7.00293229680328e-05, "learning_rate": 0.009905787900381848, "loss": 2.7896, "step": 2225 }, { "crossentropy": 2.8932337760925293, "epoch": 0.08069895591647332, "grad_norm": 0.0566326305270195, "grad_norm_var": 7.10144655044898e-05, "learning_rate": 0.009905675595746214, "loss": 2.7977, "step": 2226 }, { "crossentropy": 2.704925298690796, "epoch": 0.08073520881670533, "grad_norm": 0.05717046558856964, "grad_norm_var": 6.980633793976301e-05, "learning_rate": 0.009905563224851985, "loss": 2.8193, "step": 2227 }, { "crossentropy": 2.8655762672424316, "epoch": 0.08077146171693736, "grad_norm": 0.05530824884772301, "grad_norm_var": 6.897840789878059e-05, "learning_rate": 0.009905450787700682, "loss": 2.7896, "step": 2228 }, { "crossentropy": 2.8656582832336426, "epoch": 0.08080771461716937, "grad_norm": 0.05479985103011131, "grad_norm_var": 6.753299888726505e-05, "learning_rate": 0.00990533828429382, "loss": 2.8012, "step": 2229 }, { "crossentropy": 2.907651424407959, "epoch": 0.08084396751740139, "grad_norm": 0.05658673867583275, "grad_norm_var": 6.621226598529876e-05, "learning_rate": 0.009905225714632922, "loss": 2.8553, "step": 2230 }, { "crossentropy": 2.7356951236724854, "epoch": 0.08088022041763342, "grad_norm": 0.05292833223938942, "grad_norm_var": 6.294417566372774e-05, "learning_rate": 0.009905113078719505, "loss": 2.6614, "step": 2231 }, { "crossentropy": 2.632802963256836, "epoch": 0.08091647331786543, "grad_norm": 0.05073351040482521, "grad_norm_var": 6.077878295054253e-05, "learning_rate": 0.009905000376555095, "loss": 2.7109, "step": 2232 }, { "crossentropy": 2.8839516639709473, "epoch": 0.08095272621809745, "grad_norm": 0.05239925906062126, "grad_norm_var": 5.5198913670760166e-05, "learning_rate": 0.009904887608141209, "loss": 2.826, "step": 2233 }, { "crossentropy": 2.70497989654541, "epoch": 0.08098897911832946, "grad_norm": 0.04336022585630417, "grad_norm_var": 5.975945939420731e-05, "learning_rate": 0.009904774773479372, "loss": 2.724, "step": 2234 }, { "crossentropy": 2.8878257274627686, "epoch": 0.08102523201856149, "grad_norm": 0.047611210495233536, "grad_norm_var": 5.3440938216371926e-05, "learning_rate": 0.00990466187257111, "loss": 2.86, "step": 2235 }, { "crossentropy": 2.721708297729492, "epoch": 0.0810614849187935, "grad_norm": 0.04445204883813858, "grad_norm_var": 5.599052711932889e-05, "learning_rate": 0.009904548905417945, "loss": 2.7879, "step": 2236 }, { "crossentropy": 2.7844903469085693, "epoch": 0.08109773781902552, "grad_norm": 0.046165499836206436, "grad_norm_var": 5.5114611971326886e-05, "learning_rate": 0.009904435872021404, "loss": 2.7179, "step": 2237 }, { "crossentropy": 2.7533440589904785, "epoch": 0.08113399071925755, "grad_norm": 0.045726727694272995, "grad_norm_var": 5.369969929818191e-05, "learning_rate": 0.009904322772383015, "loss": 2.7654, "step": 2238 }, { "crossentropy": 2.8329896926879883, "epoch": 0.08117024361948956, "grad_norm": 0.048668161034584045, "grad_norm_var": 2.533872465790143e-05, "learning_rate": 0.009904209606504304, "loss": 2.7783, "step": 2239 }, { "crossentropy": 2.8540778160095215, "epoch": 0.08120649651972157, "grad_norm": 0.05705297738313675, "grad_norm_var": 2.6301648218039244e-05, "learning_rate": 0.009904096374386799, "loss": 2.8011, "step": 2240 }, { "crossentropy": 2.743757724761963, "epoch": 0.0812427494199536, "grad_norm": 0.06092303618788719, "grad_norm_var": 2.8735554279701637e-05, "learning_rate": 0.009903983076032033, "loss": 2.7628, "step": 2241 }, { "crossentropy": 2.799818754196167, "epoch": 0.08127900232018562, "grad_norm": 0.04651034250855446, "grad_norm_var": 2.8762040545837694e-05, "learning_rate": 0.009903869711441532, "loss": 2.8519, "step": 2242 }, { "crossentropy": 2.786656141281128, "epoch": 0.08131525522041763, "grad_norm": 0.047717463225126266, "grad_norm_var": 2.691608219113073e-05, "learning_rate": 0.009903756280616826, "loss": 2.8182, "step": 2243 }, { "crossentropy": 2.793452739715576, "epoch": 0.08135150812064965, "grad_norm": 0.04778261482715607, "grad_norm_var": 2.581570323199374e-05, "learning_rate": 0.009903642783559452, "loss": 2.8164, "step": 2244 }, { "crossentropy": 2.6948323249816895, "epoch": 0.08138776102088167, "grad_norm": 0.047109734266996384, "grad_norm_var": 2.4809339853217443e-05, "learning_rate": 0.009903529220270942, "loss": 2.7017, "step": 2245 }, { "crossentropy": 2.7086305618286133, "epoch": 0.08142401392111369, "grad_norm": 0.04993642494082451, "grad_norm_var": 2.149623206478864e-05, "learning_rate": 0.009903415590752827, "loss": 2.7485, "step": 2246 }, { "crossentropy": 2.661850690841675, "epoch": 0.0814602668213457, "grad_norm": 0.052543431520462036, "grad_norm_var": 2.1320175295263164e-05, "learning_rate": 0.009903301895006641, "loss": 2.6308, "step": 2247 }, { "crossentropy": 2.6856844425201416, "epoch": 0.08149651972157773, "grad_norm": 0.0445554219186306, "grad_norm_var": 2.2519350778154915e-05, "learning_rate": 0.009903188133033926, "loss": 2.6991, "step": 2248 }, { "crossentropy": 2.661999464035034, "epoch": 0.08153277262180975, "grad_norm": 0.05341017246246338, "grad_norm_var": 2.3053916864340605e-05, "learning_rate": 0.009903074304836213, "loss": 2.802, "step": 2249 }, { "crossentropy": 2.7934176921844482, "epoch": 0.08156902552204176, "grad_norm": 0.04678485542535782, "grad_norm_var": 2.1225245794602385e-05, "learning_rate": 0.00990296041041504, "loss": 2.7722, "step": 2250 }, { "crossentropy": 2.6250312328338623, "epoch": 0.08160527842227379, "grad_norm": 0.04756617173552513, "grad_norm_var": 2.123481973830271e-05, "learning_rate": 0.009902846449771947, "loss": 2.6951, "step": 2251 }, { "crossentropy": 2.717778444290161, "epoch": 0.0816415313225058, "grad_norm": 0.04864633083343506, "grad_norm_var": 1.9689395033653464e-05, "learning_rate": 0.009902732422908471, "loss": 2.6592, "step": 2252 }, { "crossentropy": 2.7951629161834717, "epoch": 0.08167778422273782, "grad_norm": 0.0802820473909378, "grad_norm_var": 7.75234077240725e-05, "learning_rate": 0.009902618329826154, "loss": 2.7768, "step": 2253 }, { "crossentropy": 2.6553127765655518, "epoch": 0.08171403712296983, "grad_norm": 0.053720828145742416, "grad_norm_var": 7.52828934055536e-05, "learning_rate": 0.009902504170526537, "loss": 2.7719, "step": 2254 }, { "crossentropy": 2.7533602714538574, "epoch": 0.08175029002320186, "grad_norm": 0.058720458298921585, "grad_norm_var": 7.703138934074717e-05, "learning_rate": 0.00990238994501116, "loss": 2.7702, "step": 2255 }, { "crossentropy": 2.816162109375, "epoch": 0.08178654292343387, "grad_norm": 0.04866946116089821, "grad_norm_var": 7.656268403770441e-05, "learning_rate": 0.00990227565328157, "loss": 2.8083, "step": 2256 }, { "crossentropy": 2.6930999755859375, "epoch": 0.08182279582366589, "grad_norm": 0.0896776020526886, "grad_norm_var": 0.00016175983360363998, "learning_rate": 0.009902161295339306, "loss": 2.6934, "step": 2257 }, { "crossentropy": 2.763338565826416, "epoch": 0.08185904872389792, "grad_norm": 0.05177617073059082, "grad_norm_var": 0.00015825041497290264, "learning_rate": 0.009902046871185914, "loss": 2.7421, "step": 2258 }, { "crossentropy": 2.6853017807006836, "epoch": 0.08189530162412993, "grad_norm": 0.04670342803001404, "grad_norm_var": 0.00015920550972127137, "learning_rate": 0.009901932380822941, "loss": 2.6451, "step": 2259 }, { "crossentropy": 2.817859411239624, "epoch": 0.08193155452436195, "grad_norm": 0.049583058804273605, "grad_norm_var": 0.00015785727745628932, "learning_rate": 0.009901817824251933, "loss": 2.7015, "step": 2260 }, { "crossentropy": 2.683969497680664, "epoch": 0.08196780742459396, "grad_norm": 0.047534260898828506, "grad_norm_var": 0.00015745841381415994, "learning_rate": 0.009901703201474434, "loss": 2.7829, "step": 2261 }, { "crossentropy": 2.908827304840088, "epoch": 0.08200406032482599, "grad_norm": 0.04599883407354355, "grad_norm_var": 0.00016076137189988444, "learning_rate": 0.009901588512491996, "loss": 2.6739, "step": 2262 }, { "crossentropy": 2.7970871925354004, "epoch": 0.082040313225058, "grad_norm": 0.04914811998605728, "grad_norm_var": 0.00016220275141035488, "learning_rate": 0.009901473757306167, "loss": 2.8156, "step": 2263 }, { "crossentropy": 2.7128443717956543, "epoch": 0.08207656612529002, "grad_norm": 0.07031416893005371, "grad_norm_var": 0.0001714973941490355, "learning_rate": 0.009901358935918496, "loss": 2.7124, "step": 2264 }, { "crossentropy": 2.833726406097412, "epoch": 0.08211281902552205, "grad_norm": 0.05220489203929901, "grad_norm_var": 0.00017192941499181134, "learning_rate": 0.009901244048330536, "loss": 2.8224, "step": 2265 }, { "crossentropy": 2.7464170455932617, "epoch": 0.08214907192575406, "grad_norm": 0.04870130121707916, "grad_norm_var": 0.00016994270497825887, "learning_rate": 0.009901129094543834, "loss": 2.795, "step": 2266 }, { "crossentropy": 2.880704879760742, "epoch": 0.08218532482598608, "grad_norm": 0.05342362821102142, "grad_norm_var": 0.00016582991812643568, "learning_rate": 0.009901014074559948, "loss": 2.8557, "step": 2267 }, { "crossentropy": 2.7056074142456055, "epoch": 0.0822215777262181, "grad_norm": 0.05752045661211014, "grad_norm_var": 0.00016211703166352272, "learning_rate": 0.00990089898838043, "loss": 2.6249, "step": 2268 }, { "crossentropy": 2.6970582008361816, "epoch": 0.08225783062645012, "grad_norm": 0.055689360946416855, "grad_norm_var": 0.00012193075942068879, "learning_rate": 0.009900783836006833, "loss": 2.8295, "step": 2269 }, { "crossentropy": 2.8435401916503906, "epoch": 0.08229408352668213, "grad_norm": 0.044560208916664124, "grad_norm_var": 0.00012869109976560195, "learning_rate": 0.009900668617440713, "loss": 2.7068, "step": 2270 }, { "crossentropy": 2.707291603088379, "epoch": 0.08233033642691415, "grad_norm": 0.04663959890604019, "grad_norm_var": 0.00013083590771318193, "learning_rate": 0.009900553332683627, "loss": 2.7548, "step": 2271 }, { "crossentropy": 2.704033613204956, "epoch": 0.08236658932714618, "grad_norm": 0.050307296216487885, "grad_norm_var": 0.00012991941056568243, "learning_rate": 0.009900437981737129, "loss": 2.8001, "step": 2272 }, { "crossentropy": 2.9417216777801514, "epoch": 0.08240284222737819, "grad_norm": 0.05276009067893028, "grad_norm_var": 3.818619040244035e-05, "learning_rate": 0.009900322564602783, "loss": 2.89, "step": 2273 }, { "crossentropy": 2.8792450428009033, "epoch": 0.0824390951276102, "grad_norm": 0.04624456539750099, "grad_norm_var": 3.984259198175211e-05, "learning_rate": 0.009900207081282142, "loss": 2.7899, "step": 2274 }, { "crossentropy": 2.782724380493164, "epoch": 0.08247534802784223, "grad_norm": 0.05195773392915726, "grad_norm_var": 3.849962961285363e-05, "learning_rate": 0.009900091531776768, "loss": 2.7868, "step": 2275 }, { "crossentropy": 2.666691303253174, "epoch": 0.08251160092807425, "grad_norm": 0.05459056794643402, "grad_norm_var": 3.884588566986924e-05, "learning_rate": 0.009899975916088223, "loss": 2.7883, "step": 2276 }, { "crossentropy": 2.761441707611084, "epoch": 0.08254785382830626, "grad_norm": 0.05799656733870506, "grad_norm_var": 3.984158230787918e-05, "learning_rate": 0.009899860234218065, "loss": 2.7276, "step": 2277 }, { "crossentropy": 2.8844332695007324, "epoch": 0.08258410672853829, "grad_norm": 0.05171952396631241, "grad_norm_var": 3.7020763897525206e-05, "learning_rate": 0.009899744486167861, "loss": 2.8347, "step": 2278 }, { "crossentropy": 2.806002140045166, "epoch": 0.0826203596287703, "grad_norm": 0.04927174374461174, "grad_norm_var": 3.696257730220053e-05, "learning_rate": 0.009899628671939172, "loss": 2.7606, "step": 2279 }, { "crossentropy": 2.719343662261963, "epoch": 0.08265661252900232, "grad_norm": 0.05101571977138519, "grad_norm_var": 1.5028823723675586e-05, "learning_rate": 0.00989951279153356, "loss": 2.7378, "step": 2280 }, { "crossentropy": 2.8023178577423096, "epoch": 0.08269286542923433, "grad_norm": 0.052142269909381866, "grad_norm_var": 1.5023498050303376e-05, "learning_rate": 0.009899396844952595, "loss": 2.8028, "step": 2281 }, { "crossentropy": 2.662050485610962, "epoch": 0.08272911832946636, "grad_norm": 0.04707254469394684, "grad_norm_var": 1.5804425564050887e-05, "learning_rate": 0.009899280832197838, "loss": 2.7621, "step": 2282 }, { "crossentropy": 2.8121068477630615, "epoch": 0.08276537122969838, "grad_norm": 0.04852825030684471, "grad_norm_var": 1.6002246157041192e-05, "learning_rate": 0.009899164753270863, "loss": 2.7121, "step": 2283 }, { "crossentropy": 2.7975857257843018, "epoch": 0.08280162412993039, "grad_norm": 0.057061005383729935, "grad_norm_var": 1.5623716073754196e-05, "learning_rate": 0.00989904860817323, "loss": 2.8, "step": 2284 }, { "crossentropy": 2.6525161266326904, "epoch": 0.08283787703016242, "grad_norm": 0.04420854151248932, "grad_norm_var": 1.6832399394741313e-05, "learning_rate": 0.009898932396906509, "loss": 2.6985, "step": 2285 }, { "crossentropy": 2.756038188934326, "epoch": 0.08287412993039443, "grad_norm": 0.044148292392492294, "grad_norm_var": 1.7162626898979626e-05, "learning_rate": 0.009898816119472274, "loss": 2.7417, "step": 2286 }, { "crossentropy": 2.818448781967163, "epoch": 0.08291038283062645, "grad_norm": 0.04752086475491524, "grad_norm_var": 1.6774713972768594e-05, "learning_rate": 0.009898699775872093, "loss": 2.7427, "step": 2287 }, { "crossentropy": 2.6506927013397217, "epoch": 0.08294663573085846, "grad_norm": 0.04709773510694504, "grad_norm_var": 1.7462109601991636e-05, "learning_rate": 0.009898583366107537, "loss": 2.7648, "step": 2288 }, { "crossentropy": 2.5858144760131836, "epoch": 0.08298288863109049, "grad_norm": 0.05241410806775093, "grad_norm_var": 1.7351883681044714e-05, "learning_rate": 0.00989846689018018, "loss": 2.645, "step": 2289 }, { "crossentropy": 2.696089744567871, "epoch": 0.0830191415313225, "grad_norm": 0.04771946743130684, "grad_norm_var": 1.6712572367586868e-05, "learning_rate": 0.009898350348091592, "loss": 2.7385, "step": 2290 }, { "crossentropy": 2.7774150371551514, "epoch": 0.08305539443155452, "grad_norm": 0.04457170516252518, "grad_norm_var": 1.846899519093448e-05, "learning_rate": 0.00989823373984335, "loss": 2.7283, "step": 2291 }, { "crossentropy": 2.740838050842285, "epoch": 0.08309164733178655, "grad_norm": 0.04484839737415314, "grad_norm_var": 1.82007688503405e-05, "learning_rate": 0.009898117065437029, "loss": 2.7939, "step": 2292 }, { "crossentropy": 2.6753129959106445, "epoch": 0.08312790023201856, "grad_norm": 0.044174958020448685, "grad_norm_var": 1.3945294531963795e-05, "learning_rate": 0.009898000324874203, "loss": 2.7432, "step": 2293 }, { "crossentropy": 2.8608996868133545, "epoch": 0.08316415313225058, "grad_norm": 0.045664940029382706, "grad_norm_var": 1.351199438105599e-05, "learning_rate": 0.00989788351815645, "loss": 2.7259, "step": 2294 }, { "crossentropy": 2.626701593399048, "epoch": 0.0832004060324826, "grad_norm": 0.046268098056316376, "grad_norm_var": 1.3553043865318267e-05, "learning_rate": 0.009897766645285347, "loss": 2.687, "step": 2295 }, { "crossentropy": 2.7361013889312744, "epoch": 0.08323665893271462, "grad_norm": 0.04736119508743286, "grad_norm_var": 1.28103931875669e-05, "learning_rate": 0.009897649706262472, "loss": 2.7198, "step": 2296 }, { "crossentropy": 2.7751402854919434, "epoch": 0.08327291183294663, "grad_norm": 0.04526575282216072, "grad_norm_var": 1.155542497369242e-05, "learning_rate": 0.009897532701089408, "loss": 2.7198, "step": 2297 }, { "crossentropy": 2.814744472503662, "epoch": 0.08330916473317865, "grad_norm": 0.05022790655493736, "grad_norm_var": 1.2157575111269164e-05, "learning_rate": 0.009897415629767729, "loss": 2.7593, "step": 2298 }, { "crossentropy": 2.6011736392974854, "epoch": 0.08334541763341068, "grad_norm": 0.05070545896887779, "grad_norm_var": 1.2805291999569655e-05, "learning_rate": 0.009897298492299022, "loss": 2.6438, "step": 2299 }, { "crossentropy": 2.671825408935547, "epoch": 0.08338167053364269, "grad_norm": 0.04942614212632179, "grad_norm_var": 6.66837714391628e-06, "learning_rate": 0.009897181288684867, "loss": 2.6853, "step": 2300 }, { "crossentropy": 2.793334722518921, "epoch": 0.0834179234338747, "grad_norm": 0.047833871096372604, "grad_norm_var": 6.1518605050940475e-06, "learning_rate": 0.009897064018926845, "loss": 2.8132, "step": 2301 }, { "crossentropy": 2.929166078567505, "epoch": 0.08345417633410673, "grad_norm": 0.047957584261894226, "grad_norm_var": 5.507248143071565e-06, "learning_rate": 0.009896946683026544, "loss": 2.7976, "step": 2302 }, { "crossentropy": 2.742283582687378, "epoch": 0.08349042923433875, "grad_norm": 0.048823609948158264, "grad_norm_var": 5.627168444753174e-06, "learning_rate": 0.009896829280985547, "loss": 2.802, "step": 2303 }, { "crossentropy": 2.5307397842407227, "epoch": 0.08352668213457076, "grad_norm": 0.04922885075211525, "grad_norm_var": 5.79030888033516e-06, "learning_rate": 0.009896711812805437, "loss": 2.6431, "step": 2304 }, { "crossentropy": 2.807511329650879, "epoch": 0.08356293503480279, "grad_norm": 0.04209734499454498, "grad_norm_var": 5.897090793437141e-06, "learning_rate": 0.009896594278487806, "loss": 2.7742, "step": 2305 }, { "crossentropy": 2.791043758392334, "epoch": 0.0835991879350348, "grad_norm": 0.04516008868813515, "grad_norm_var": 6.0647118518876325e-06, "learning_rate": 0.009896476678034234, "loss": 2.7583, "step": 2306 }, { "crossentropy": 2.7337911128997803, "epoch": 0.08363544083526682, "grad_norm": 0.04715581238269806, "grad_norm_var": 5.696738920362282e-06, "learning_rate": 0.009896359011446318, "loss": 2.7183, "step": 2307 }, { "crossentropy": 2.6071066856384277, "epoch": 0.08367169373549883, "grad_norm": 0.05873402953147888, "grad_norm_var": 1.374075366898138e-05, "learning_rate": 0.00989624127872564, "loss": 2.6584, "step": 2308 }, { "crossentropy": 2.624505043029785, "epoch": 0.08370794663573086, "grad_norm": 0.05086047947406769, "grad_norm_var": 1.3231266564830978e-05, "learning_rate": 0.009896123479873795, "loss": 2.7392, "step": 2309 }, { "crossentropy": 2.7670061588287354, "epoch": 0.08374419953596288, "grad_norm": 0.04727109894156456, "grad_norm_var": 1.2828576660185952e-05, "learning_rate": 0.009896005614892373, "loss": 2.7847, "step": 2310 }, { "crossentropy": 2.896800994873047, "epoch": 0.08378045243619489, "grad_norm": 0.04551319032907486, "grad_norm_var": 1.3078637094078814e-05, "learning_rate": 0.009895887683782963, "loss": 2.8284, "step": 2311 }, { "crossentropy": 2.7705931663513184, "epoch": 0.08381670533642692, "grad_norm": 0.04481786862015724, "grad_norm_var": 1.3818707858905393e-05, "learning_rate": 0.009895769686547163, "loss": 2.6668, "step": 2312 }, { "crossentropy": 2.734783887863159, "epoch": 0.08385295823665893, "grad_norm": 0.05871303007006645, "grad_norm_var": 1.9873068212675608e-05, "learning_rate": 0.009895651623186561, "loss": 2.7723, "step": 2313 }, { "crossentropy": 2.777596950531006, "epoch": 0.08388921113689095, "grad_norm": 0.050296735018491745, "grad_norm_var": 1.9884331045271336e-05, "learning_rate": 0.009895533493702756, "loss": 2.745, "step": 2314 }, { "crossentropy": 2.9428982734680176, "epoch": 0.08392546403712298, "grad_norm": 0.05237540975213051, "grad_norm_var": 2.0430081804401607e-05, "learning_rate": 0.009895415298097341, "loss": 2.7557, "step": 2315 }, { "crossentropy": 2.7727057933807373, "epoch": 0.08396171693735499, "grad_norm": 0.05200918763875961, "grad_norm_var": 2.094509736965492e-05, "learning_rate": 0.009895297036371913, "loss": 2.7261, "step": 2316 }, { "crossentropy": 2.832995891571045, "epoch": 0.083997969837587, "grad_norm": 0.046786822378635406, "grad_norm_var": 2.1218718417114327e-05, "learning_rate": 0.00989517870852807, "loss": 2.8115, "step": 2317 }, { "crossentropy": 2.7114224433898926, "epoch": 0.08403422273781902, "grad_norm": 0.04575507342815399, "grad_norm_var": 2.1897800702585084e-05, "learning_rate": 0.009895060314567411, "loss": 2.6649, "step": 2318 }, { "crossentropy": 2.6698014736175537, "epoch": 0.08407047563805105, "grad_norm": 0.04748726263642311, "grad_norm_var": 2.2058646556617305e-05, "learning_rate": 0.009894941854491532, "loss": 2.7159, "step": 2319 }, { "crossentropy": 2.6625378131866455, "epoch": 0.08410672853828306, "grad_norm": 0.045308344066143036, "grad_norm_var": 2.290823580054681e-05, "learning_rate": 0.009894823328302037, "loss": 2.7432, "step": 2320 }, { "crossentropy": 2.8064963817596436, "epoch": 0.08414298143851508, "grad_norm": 0.047837428748607635, "grad_norm_var": 1.9859599354921502e-05, "learning_rate": 0.009894704736000523, "loss": 2.768, "step": 2321 }, { "crossentropy": 2.9328014850616455, "epoch": 0.0841792343387471, "grad_norm": 0.0514356754720211, "grad_norm_var": 1.8999135775220737e-05, "learning_rate": 0.009894586077588597, "loss": 2.8768, "step": 2322 }, { "crossentropy": 2.9270129203796387, "epoch": 0.08421548723897912, "grad_norm": 0.05038558319211006, "grad_norm_var": 1.8631987021934757e-05, "learning_rate": 0.009894467353067855, "loss": 2.8726, "step": 2323 }, { "crossentropy": 2.7054855823516846, "epoch": 0.08425174013921113, "grad_norm": 0.059470128268003464, "grad_norm_var": 1.955013520585121e-05, "learning_rate": 0.009894348562439903, "loss": 2.7669, "step": 2324 }, { "crossentropy": 2.7318313121795654, "epoch": 0.08428799303944315, "grad_norm": 0.05514359101653099, "grad_norm_var": 2.131933475320591e-05, "learning_rate": 0.009894229705706348, "loss": 2.7117, "step": 2325 }, { "crossentropy": 2.9271697998046875, "epoch": 0.08432424593967518, "grad_norm": 0.053931042551994324, "grad_norm_var": 2.1634612630190508e-05, "learning_rate": 0.009894110782868793, "loss": 2.8172, "step": 2326 }, { "crossentropy": 2.667534112930298, "epoch": 0.08436049883990719, "grad_norm": 0.04744139686226845, "grad_norm_var": 2.0596694742496443e-05, "learning_rate": 0.009893991793928844, "loss": 2.7182, "step": 2327 }, { "crossentropy": 2.6731817722320557, "epoch": 0.0843967517401392, "grad_norm": 0.06219680979847908, "grad_norm_var": 2.6133824830970596e-05, "learning_rate": 0.009893872738888112, "loss": 2.7492, "step": 2328 }, { "crossentropy": 2.797550678253174, "epoch": 0.08443300464037123, "grad_norm": 0.05180172249674797, "grad_norm_var": 2.26205679782913e-05, "learning_rate": 0.0098937536177482, "loss": 2.7571, "step": 2329 }, { "crossentropy": 2.7185118198394775, "epoch": 0.08446925754060325, "grad_norm": 0.04780741035938263, "grad_norm_var": 2.3317255021142584e-05, "learning_rate": 0.00989363443051072, "loss": 2.7599, "step": 2330 }, { "crossentropy": 2.5871071815490723, "epoch": 0.08450551044083526, "grad_norm": 0.04549674689769745, "grad_norm_var": 2.5080273739004158e-05, "learning_rate": 0.009893515177177277, "loss": 2.5875, "step": 2331 }, { "crossentropy": 2.82824444770813, "epoch": 0.08454176334106729, "grad_norm": 0.04341127350926399, "grad_norm_var": 2.8134795908067665e-05, "learning_rate": 0.009893395857749487, "loss": 2.803, "step": 2332 }, { "crossentropy": 2.7485995292663574, "epoch": 0.0845780162412993, "grad_norm": 0.0452730655670166, "grad_norm_var": 2.8947939767900017e-05, "learning_rate": 0.009893276472228962, "loss": 2.7016, "step": 2333 }, { "crossentropy": 2.809645175933838, "epoch": 0.08461426914153132, "grad_norm": 0.059334225952625275, "grad_norm_var": 3.276620100288076e-05, "learning_rate": 0.009893157020617311, "loss": 2.8707, "step": 2334 }, { "crossentropy": 2.710418224334717, "epoch": 0.08465052204176333, "grad_norm": 0.04803905636072159, "grad_norm_var": 3.2537082213309386e-05, "learning_rate": 0.009893037502916148, "loss": 2.7279, "step": 2335 }, { "crossentropy": 2.751460552215576, "epoch": 0.08468677494199536, "grad_norm": 0.05179622396826744, "grad_norm_var": 3.0335479814015166e-05, "learning_rate": 0.00989291791912709, "loss": 2.7865, "step": 2336 }, { "crossentropy": 2.7866291999816895, "epoch": 0.08472302784222738, "grad_norm": 0.057444967329502106, "grad_norm_var": 3.1668847509566143e-05, "learning_rate": 0.009892798269251749, "loss": 2.7432, "step": 2337 }, { "crossentropy": 2.734830617904663, "epoch": 0.08475928074245939, "grad_norm": 0.04738197848200798, "grad_norm_var": 3.294714165190758e-05, "learning_rate": 0.009892678553291743, "loss": 2.7175, "step": 2338 }, { "crossentropy": 2.827225923538208, "epoch": 0.08479553364269142, "grad_norm": 0.044239215552806854, "grad_norm_var": 3.634217227029071e-05, "learning_rate": 0.009892558771248687, "loss": 2.7654, "step": 2339 }, { "crossentropy": 2.7519102096557617, "epoch": 0.08483178654292343, "grad_norm": 0.04430915787816048, "grad_norm_var": 3.411781541552694e-05, "learning_rate": 0.0098924389231242, "loss": 2.7553, "step": 2340 }, { "crossentropy": 2.8063220977783203, "epoch": 0.08486803944315545, "grad_norm": 0.0487961620092392, "grad_norm_var": 3.2549796585781776e-05, "learning_rate": 0.009892319008919903, "loss": 2.8071, "step": 2341 }, { "crossentropy": 2.636403799057007, "epoch": 0.08490429234338748, "grad_norm": 0.046121977269649506, "grad_norm_var": 3.2183536089759546e-05, "learning_rate": 0.009892199028637413, "loss": 2.7166, "step": 2342 }, { "crossentropy": 2.709817886352539, "epoch": 0.08494054524361949, "grad_norm": 0.050529733300209045, "grad_norm_var": 3.196049341560199e-05, "learning_rate": 0.00989207898227835, "loss": 2.7126, "step": 2343 }, { "crossentropy": 2.7905678749084473, "epoch": 0.0849767981438515, "grad_norm": 0.0523817278444767, "grad_norm_var": 2.1527379029969106e-05, "learning_rate": 0.009891958869844335, "loss": 2.6999, "step": 2344 }, { "crossentropy": 2.604454278945923, "epoch": 0.08501305104408352, "grad_norm": 0.05180135741829872, "grad_norm_var": 2.1527243159400837e-05, "learning_rate": 0.009891838691336995, "loss": 2.6503, "step": 2345 }, { "crossentropy": 2.7900006771087646, "epoch": 0.08504930394431555, "grad_norm": 0.048752766102552414, "grad_norm_var": 2.143148220019499e-05, "learning_rate": 0.009891718446757947, "loss": 2.7441, "step": 2346 }, { "crossentropy": 2.7018203735351562, "epoch": 0.08508555684454756, "grad_norm": 0.04654207080602646, "grad_norm_var": 2.100183877932939e-05, "learning_rate": 0.009891598136108818, "loss": 2.6797, "step": 2347 }, { "crossentropy": 2.7595791816711426, "epoch": 0.08512180974477958, "grad_norm": 0.04430655390024185, "grad_norm_var": 2.0368726476706576e-05, "learning_rate": 0.009891477759391235, "loss": 2.8284, "step": 2348 }, { "crossentropy": 2.8483824729919434, "epoch": 0.0851580626450116, "grad_norm": 0.04473021626472473, "grad_norm_var": 2.067069798386718e-05, "learning_rate": 0.009891357316606819, "loss": 2.8281, "step": 2349 }, { "crossentropy": 2.932541608810425, "epoch": 0.08519431554524362, "grad_norm": 0.049234624952077866, "grad_norm_var": 1.334064177913637e-05, "learning_rate": 0.0098912368077572, "loss": 2.8221, "step": 2350 }, { "crossentropy": 2.547691822052002, "epoch": 0.08523056844547564, "grad_norm": 0.05540714040398598, "grad_norm_var": 1.6255808429817507e-05, "learning_rate": 0.009891116232844008, "loss": 2.662, "step": 2351 }, { "crossentropy": 2.810698986053467, "epoch": 0.08526682134570766, "grad_norm": 0.053916964679956436, "grad_norm_var": 1.7331541172430077e-05, "learning_rate": 0.009890995591868866, "loss": 2.8053, "step": 2352 }, { "crossentropy": 2.7802717685699463, "epoch": 0.08530307424593968, "grad_norm": 0.05066726729273796, "grad_norm_var": 1.267807891768228e-05, "learning_rate": 0.009890874884833407, "loss": 2.8142, "step": 2353 }, { "crossentropy": 2.7735869884490967, "epoch": 0.08533932714617169, "grad_norm": 0.04702760651707649, "grad_norm_var": 1.2747964164552759e-05, "learning_rate": 0.009890754111739262, "loss": 2.8307, "step": 2354 }, { "crossentropy": 2.751495599746704, "epoch": 0.0853755800464037, "grad_norm": 0.04573655501008034, "grad_norm_var": 1.2002949848086836e-05, "learning_rate": 0.00989063327258806, "loss": 2.7576, "step": 2355 }, { "crossentropy": 2.777484893798828, "epoch": 0.08541183294663574, "grad_norm": 0.045431386679410934, "grad_norm_var": 1.1414727640529643e-05, "learning_rate": 0.009890512367381433, "loss": 2.8148, "step": 2356 }, { "crossentropy": 2.7897584438323975, "epoch": 0.08544808584686775, "grad_norm": 0.05155814439058304, "grad_norm_var": 1.1876654222878897e-05, "learning_rate": 0.009890391396121015, "loss": 2.682, "step": 2357 }, { "crossentropy": 2.6223936080932617, "epoch": 0.08548433874709976, "grad_norm": 0.04430237039923668, "grad_norm_var": 1.2784054382678802e-05, "learning_rate": 0.00989027035880844, "loss": 2.6392, "step": 2358 }, { "crossentropy": 2.8599116802215576, "epoch": 0.08552059164733179, "grad_norm": 0.046359024941921234, "grad_norm_var": 1.2962389209418037e-05, "learning_rate": 0.009890149255445341, "loss": 2.7698, "step": 2359 }, { "crossentropy": 2.688291549682617, "epoch": 0.0855568445475638, "grad_norm": 0.046038784086704254, "grad_norm_var": 1.2308019959764854e-05, "learning_rate": 0.009890028086033356, "loss": 2.7408, "step": 2360 }, { "crossentropy": 2.6609609127044678, "epoch": 0.08559309744779582, "grad_norm": 0.05565812438726425, "grad_norm_var": 1.506993554237218e-05, "learning_rate": 0.009889906850574122, "loss": 2.6837, "step": 2361 }, { "crossentropy": 2.8499343395233154, "epoch": 0.08562935034802784, "grad_norm": 0.053485624492168427, "grad_norm_var": 1.6642470919730803e-05, "learning_rate": 0.009889785549069274, "loss": 2.8453, "step": 2362 }, { "crossentropy": 2.764822244644165, "epoch": 0.08566560324825986, "grad_norm": 0.04728315770626068, "grad_norm_var": 1.64561420678296e-05, "learning_rate": 0.009889664181520453, "loss": 2.7608, "step": 2363 }, { "crossentropy": 2.6352410316467285, "epoch": 0.08570185614849188, "grad_norm": 0.04934392124414444, "grad_norm_var": 1.5009644042337141e-05, "learning_rate": 0.009889542747929296, "loss": 2.6689, "step": 2364 }, { "crossentropy": 2.7084457874298096, "epoch": 0.08573810904872389, "grad_norm": 0.0517406091094017, "grad_norm_var": 1.3962787519043036e-05, "learning_rate": 0.009889421248297446, "loss": 2.6841, "step": 2365 }, { "crossentropy": 2.8642799854278564, "epoch": 0.08577436194895592, "grad_norm": 0.05827080085873604, "grad_norm_var": 1.8656630015459475e-05, "learning_rate": 0.00988929968262654, "loss": 2.8342, "step": 2366 }, { "crossentropy": 2.8614776134490967, "epoch": 0.08581061484918794, "grad_norm": 0.046882372349500656, "grad_norm_var": 1.7210900119261214e-05, "learning_rate": 0.009889178050918221, "loss": 2.7688, "step": 2367 }, { "crossentropy": 2.692725658416748, "epoch": 0.08584686774941995, "grad_norm": 0.04408695548772812, "grad_norm_var": 1.7600524674278525e-05, "learning_rate": 0.009889056353174134, "loss": 2.7022, "step": 2368 }, { "crossentropy": 2.6649229526519775, "epoch": 0.08588312064965198, "grad_norm": 0.04397617653012276, "grad_norm_var": 1.8904150388764027e-05, "learning_rate": 0.009888934589395923, "loss": 2.7618, "step": 2369 }, { "crossentropy": 2.902202606201172, "epoch": 0.08591937354988399, "grad_norm": 0.04574467986822128, "grad_norm_var": 1.927151493560167e-05, "learning_rate": 0.00988881275958523, "loss": 2.8448, "step": 2370 }, { "crossentropy": 2.687145233154297, "epoch": 0.08595562645011601, "grad_norm": 0.047280702739953995, "grad_norm_var": 1.8852887486146135e-05, "learning_rate": 0.009888690863743702, "loss": 2.7384, "step": 2371 }, { "crossentropy": 2.779836893081665, "epoch": 0.08599187935034802, "grad_norm": 0.045942068099975586, "grad_norm_var": 1.8654102454714917e-05, "learning_rate": 0.009888568901872983, "loss": 2.7455, "step": 2372 }, { "crossentropy": 2.957484006881714, "epoch": 0.08602813225058005, "grad_norm": 0.05064047500491142, "grad_norm_var": 1.834749172951363e-05, "learning_rate": 0.009888446873974725, "loss": 2.8548, "step": 2373 }, { "crossentropy": 2.775259494781494, "epoch": 0.08606438515081206, "grad_norm": 0.05030646175146103, "grad_norm_var": 1.71883400629943e-05, "learning_rate": 0.009888324780050573, "loss": 2.7577, "step": 2374 }, { "crossentropy": 2.5864458084106445, "epoch": 0.08610063805104408, "grad_norm": 0.043537817895412445, "grad_norm_var": 1.8656651198567924e-05, "learning_rate": 0.009888202620102175, "loss": 2.5909, "step": 2375 }, { "crossentropy": 2.7616608142852783, "epoch": 0.08613689095127611, "grad_norm": 0.04885765165090561, "grad_norm_var": 1.8129131153376125e-05, "learning_rate": 0.009888080394131183, "loss": 2.663, "step": 2376 }, { "crossentropy": 2.822800874710083, "epoch": 0.08617314385150812, "grad_norm": 0.06027139350771904, "grad_norm_var": 2.3591699636790754e-05, "learning_rate": 0.009887958102139249, "loss": 2.8398, "step": 2377 }, { "crossentropy": 2.756823778152466, "epoch": 0.08620939675174014, "grad_norm": 0.04681064933538437, "grad_norm_var": 2.2587293324224934e-05, "learning_rate": 0.009887835744128022, "loss": 2.7445, "step": 2378 }, { "crossentropy": 2.8252146244049072, "epoch": 0.08624564965197216, "grad_norm": 0.04787209630012512, "grad_norm_var": 2.2488997851434953e-05, "learning_rate": 0.009887713320099155, "loss": 2.8107, "step": 2379 }, { "crossentropy": 2.622612237930298, "epoch": 0.08628190255220418, "grad_norm": 0.04732893779873848, "grad_norm_var": 2.26094681312724e-05, "learning_rate": 0.009887590830054303, "loss": 2.7297, "step": 2380 }, { "crossentropy": 2.750981092453003, "epoch": 0.0863181554524362, "grad_norm": 0.04522167146205902, "grad_norm_var": 2.264163555165486e-05, "learning_rate": 0.00988746827399512, "loss": 2.7501, "step": 2381 }, { "crossentropy": 2.6743359565734863, "epoch": 0.08635440835266821, "grad_norm": 0.04625578969717026, "grad_norm_var": 1.571404851251415e-05, "learning_rate": 0.009887345651923259, "loss": 2.7908, "step": 2382 }, { "crossentropy": 2.7056758403778076, "epoch": 0.08639066125290024, "grad_norm": 0.048160068690776825, "grad_norm_var": 1.5700044754821525e-05, "learning_rate": 0.009887222963840382, "loss": 2.7501, "step": 2383 }, { "crossentropy": 2.7825260162353516, "epoch": 0.08642691415313225, "grad_norm": 0.05377129092812538, "grad_norm_var": 1.696951649437506e-05, "learning_rate": 0.009887100209748138, "loss": 2.7525, "step": 2384 }, { "crossentropy": 2.5653374195098877, "epoch": 0.08646316705336426, "grad_norm": 0.056757234036922455, "grad_norm_var": 1.989838364090662e-05, "learning_rate": 0.00988697738964819, "loss": 2.6983, "step": 2385 }, { "crossentropy": 2.5936732292175293, "epoch": 0.0864994199535963, "grad_norm": 0.05489387735724449, "grad_norm_var": 2.1101112004734182e-05, "learning_rate": 0.009886854503542198, "loss": 2.7192, "step": 2386 }, { "crossentropy": 2.997675895690918, "epoch": 0.08653567285382831, "grad_norm": 0.0550273098051548, "grad_norm_var": 2.243627914789321e-05, "learning_rate": 0.00988673155143182, "loss": 2.928, "step": 2387 }, { "crossentropy": 2.7127842903137207, "epoch": 0.08657192575406032, "grad_norm": 0.05430193617939949, "grad_norm_var": 2.2165789128452255e-05, "learning_rate": 0.009886608533318715, "loss": 2.7843, "step": 2388 }, { "crossentropy": 2.7982230186462402, "epoch": 0.08660817865429234, "grad_norm": 0.04905817657709122, "grad_norm_var": 2.231919691059898e-05, "learning_rate": 0.009886485449204547, "loss": 2.8122, "step": 2389 }, { "crossentropy": 2.6317508220672607, "epoch": 0.08664443155452436, "grad_norm": 0.04952959716320038, "grad_norm_var": 2.2379762952977797e-05, "learning_rate": 0.009886362299090977, "loss": 2.7499, "step": 2390 }, { "crossentropy": 2.8545565605163574, "epoch": 0.08668068445475638, "grad_norm": 0.057951971888542175, "grad_norm_var": 2.202610482477732e-05, "learning_rate": 0.009886239082979668, "loss": 2.8658, "step": 2391 }, { "crossentropy": 2.8608896732330322, "epoch": 0.0867169373549884, "grad_norm": 0.07159021496772766, "grad_norm_var": 4.668089560197393e-05, "learning_rate": 0.009886115800872286, "loss": 2.8558, "step": 2392 }, { "crossentropy": 2.756075620651245, "epoch": 0.08675319025522042, "grad_norm": 0.05595868080854416, "grad_norm_var": 4.354718025421916e-05, "learning_rate": 0.009885992452770496, "loss": 2.7229, "step": 2393 }, { "crossentropy": 2.893331289291382, "epoch": 0.08678944315545244, "grad_norm": 0.04288475215435028, "grad_norm_var": 4.750459396489718e-05, "learning_rate": 0.009885869038675964, "loss": 2.8088, "step": 2394 }, { "crossentropy": 2.802199363708496, "epoch": 0.08682569605568445, "grad_norm": 0.04717263579368591, "grad_norm_var": 4.7946746376016586e-05, "learning_rate": 0.009885745558590354, "loss": 2.7555, "step": 2395 }, { "crossentropy": 2.722806453704834, "epoch": 0.08686194895591648, "grad_norm": 0.0466252826154232, "grad_norm_var": 4.8438592872476704e-05, "learning_rate": 0.009885622012515334, "loss": 2.7968, "step": 2396 }, { "crossentropy": 2.765072822570801, "epoch": 0.0868982018561485, "grad_norm": 0.0555366687476635, "grad_norm_var": 4.549441181359003e-05, "learning_rate": 0.009885498400452579, "loss": 2.7067, "step": 2397 }, { "crossentropy": 2.8349227905273438, "epoch": 0.08693445475638051, "grad_norm": 0.04852870851755142, "grad_norm_var": 4.382124124268615e-05, "learning_rate": 0.00988537472240375, "loss": 2.7828, "step": 2398 }, { "crossentropy": 2.719635486602783, "epoch": 0.08697070765661252, "grad_norm": 0.04282856732606888, "grad_norm_var": 4.902716653082742e-05, "learning_rate": 0.009885250978370525, "loss": 2.7086, "step": 2399 }, { "crossentropy": 2.745685577392578, "epoch": 0.08700696055684455, "grad_norm": 0.04665743187069893, "grad_norm_var": 5.112754485587026e-05, "learning_rate": 0.00988512716835457, "loss": 2.6933, "step": 2400 }, { "crossentropy": 2.7675600051879883, "epoch": 0.08704321345707657, "grad_norm": 0.04557354003190994, "grad_norm_var": 5.215877522363535e-05, "learning_rate": 0.00988500329235756, "loss": 2.6909, "step": 2401 }, { "crossentropy": 2.776318311691284, "epoch": 0.08707946635730858, "grad_norm": 0.04546554386615753, "grad_norm_var": 5.345751364801647e-05, "learning_rate": 0.009884879350381166, "loss": 2.751, "step": 2402 }, { "crossentropy": 2.6217620372772217, "epoch": 0.08711571925754061, "grad_norm": 0.045311808586120605, "grad_norm_var": 5.403399267477696e-05, "learning_rate": 0.009884755342427064, "loss": 2.6681, "step": 2403 }, { "crossentropy": 2.6642239093780518, "epoch": 0.08715197215777262, "grad_norm": 0.05053732544183731, "grad_norm_var": 5.29165030746551e-05, "learning_rate": 0.00988463126849693, "loss": 2.6352, "step": 2404 }, { "crossentropy": 2.7452824115753174, "epoch": 0.08718822505800464, "grad_norm": 0.04373893886804581, "grad_norm_var": 5.54065430326248e-05, "learning_rate": 0.009884507128592434, "loss": 2.7289, "step": 2405 }, { "crossentropy": 2.755275011062622, "epoch": 0.08722447795823667, "grad_norm": 0.045674461871385574, "grad_norm_var": 5.644523305711931e-05, "learning_rate": 0.00988438292271526, "loss": 2.7226, "step": 2406 }, { "crossentropy": 2.996882915496826, "epoch": 0.08726073085846868, "grad_norm": 0.04501444101333618, "grad_norm_var": 5.233071708869534e-05, "learning_rate": 0.009884258650867082, "loss": 2.8578, "step": 2407 }, { "crossentropy": 2.7708346843719482, "epoch": 0.0872969837587007, "grad_norm": 0.048637986183166504, "grad_norm_var": 1.518583970781251e-05, "learning_rate": 0.009884134313049577, "loss": 2.7652, "step": 2408 }, { "crossentropy": 2.7695560455322266, "epoch": 0.08733323665893271, "grad_norm": 0.04772007092833519, "grad_norm_var": 9.871761598336838e-06, "learning_rate": 0.009884009909264429, "loss": 2.71, "step": 2409 }, { "crossentropy": 2.6569533348083496, "epoch": 0.08736948955916474, "grad_norm": 0.05143333598971367, "grad_norm_var": 1.0040043991663138e-05, "learning_rate": 0.009883885439513313, "loss": 2.8008, "step": 2410 }, { "crossentropy": 2.839914560317993, "epoch": 0.08740574245939675, "grad_norm": 0.05034792050719261, "grad_norm_var": 1.0625356405391008e-05, "learning_rate": 0.009883760903797913, "loss": 2.8557, "step": 2411 }, { "crossentropy": 2.9629461765289307, "epoch": 0.08744199535962877, "grad_norm": 0.048051584511995316, "grad_norm_var": 1.0590527940008456e-05, "learning_rate": 0.009883636302119912, "loss": 2.8767, "step": 2412 }, { "crossentropy": 2.729440450668335, "epoch": 0.0874782482598608, "grad_norm": 0.046112775802612305, "grad_norm_var": 6.12602394809772e-06, "learning_rate": 0.00988351163448099, "loss": 2.7753, "step": 2413 }, { "crossentropy": 2.715579032897949, "epoch": 0.08751450116009281, "grad_norm": 0.045564405620098114, "grad_norm_var": 6.061979610064855e-06, "learning_rate": 0.009883386900882833, "loss": 2.624, "step": 2414 }, { "crossentropy": 2.7694783210754395, "epoch": 0.08755075406032482, "grad_norm": 0.08631570637226105, "grad_norm_var": 0.00010127724509634746, "learning_rate": 0.009883262101327124, "loss": 2.7303, "step": 2415 }, { "crossentropy": 2.8505728244781494, "epoch": 0.08758700696055685, "grad_norm": 0.04865118861198425, "grad_norm_var": 0.00010076742169338176, "learning_rate": 0.009883137235815551, "loss": 2.8739, "step": 2416 }, { "crossentropy": 2.7461743354797363, "epoch": 0.08762325986078887, "grad_norm": 0.04731384664773941, "grad_norm_var": 0.00010001441866146759, "learning_rate": 0.009883012304349799, "loss": 2.7361, "step": 2417 }, { "crossentropy": 2.737614870071411, "epoch": 0.08765951276102088, "grad_norm": 0.04769390448927879, "grad_norm_var": 9.905381075639591e-05, "learning_rate": 0.009882887306931556, "loss": 2.8161, "step": 2418 }, { "crossentropy": 2.8124654293060303, "epoch": 0.0876957656612529, "grad_norm": 0.04662858322262764, "grad_norm_var": 9.835970642767157e-05, "learning_rate": 0.009882762243562509, "loss": 2.7894, "step": 2419 }, { "crossentropy": 2.698051929473877, "epoch": 0.08773201856148492, "grad_norm": 0.052101120352745056, "grad_norm_var": 9.863192653954035e-05, "learning_rate": 0.009882637114244348, "loss": 2.7413, "step": 2420 }, { "crossentropy": 2.756173849105835, "epoch": 0.08776827146171694, "grad_norm": 0.0481523834168911, "grad_norm_var": 9.612816418427978e-05, "learning_rate": 0.009882511918978763, "loss": 2.7678, "step": 2421 }, { "crossentropy": 2.873375177383423, "epoch": 0.08780452436194895, "grad_norm": 0.04571206495165825, "grad_norm_var": 9.610486898126066e-05, "learning_rate": 0.009882386657767446, "loss": 2.7457, "step": 2422 }, { "crossentropy": 2.8234691619873047, "epoch": 0.08784077726218098, "grad_norm": 0.04555344954133034, "grad_norm_var": 9.574024005003923e-05, "learning_rate": 0.009882261330612088, "loss": 2.8116, "step": 2423 }, { "crossentropy": 2.883312225341797, "epoch": 0.087877030162413, "grad_norm": 0.04445366933941841, "grad_norm_var": 9.780327996955103e-05, "learning_rate": 0.00988213593751438, "loss": 2.8214, "step": 2424 }, { "crossentropy": 2.6255412101745605, "epoch": 0.08791328306264501, "grad_norm": 0.047354210168123245, "grad_norm_var": 9.79283702975231e-05, "learning_rate": 0.00988201047847602, "loss": 2.7108, "step": 2425 }, { "crossentropy": 2.7002711296081543, "epoch": 0.08794953596287702, "grad_norm": 0.04627292603254318, "grad_norm_var": 9.866845257242429e-05, "learning_rate": 0.009881884953498697, "loss": 2.7877, "step": 2426 }, { "crossentropy": 2.7563273906707764, "epoch": 0.08798578886310905, "grad_norm": 0.0516897588968277, "grad_norm_var": 9.888483269499352e-05, "learning_rate": 0.00988175936258411, "loss": 2.7431, "step": 2427 }, { "crossentropy": 2.726428985595703, "epoch": 0.08802204176334107, "grad_norm": 0.05828342214226723, "grad_norm_var": 0.00010297266980401621, "learning_rate": 0.009881633705733956, "loss": 2.8067, "step": 2428 }, { "crossentropy": 2.79010272026062, "epoch": 0.08805829466357308, "grad_norm": 0.059072453528642654, "grad_norm_var": 0.00010590464355263446, "learning_rate": 0.00988150798294993, "loss": 2.7398, "step": 2429 }, { "crossentropy": 2.815228223800659, "epoch": 0.08809454756380511, "grad_norm": 0.05078306421637535, "grad_norm_var": 0.00010361527613008843, "learning_rate": 0.00988138219423373, "loss": 2.73, "step": 2430 }, { "crossentropy": 2.7525885105133057, "epoch": 0.08813080046403712, "grad_norm": 0.043153684586286545, "grad_norm_var": 2.0418901338890328e-05, "learning_rate": 0.009881256339587054, "loss": 2.7521, "step": 2431 }, { "crossentropy": 2.785667896270752, "epoch": 0.08816705336426914, "grad_norm": 0.04463834688067436, "grad_norm_var": 2.157416588029204e-05, "learning_rate": 0.009881130419011607, "loss": 2.7587, "step": 2432 }, { "crossentropy": 2.886186122894287, "epoch": 0.08820330626450117, "grad_norm": 0.049312371760606766, "grad_norm_var": 2.146014329917987e-05, "learning_rate": 0.009881004432509085, "loss": 2.8111, "step": 2433 }, { "crossentropy": 2.711560010910034, "epoch": 0.08823955916473318, "grad_norm": 0.044709060341119766, "grad_norm_var": 2.24585555788081e-05, "learning_rate": 0.009880878380081191, "loss": 2.7884, "step": 2434 }, { "crossentropy": 3.0080316066741943, "epoch": 0.0882758120649652, "grad_norm": 0.04736946150660515, "grad_norm_var": 2.2296447392946134e-05, "learning_rate": 0.009880752261729627, "loss": 2.8563, "step": 2435 }, { "crossentropy": 2.8817899227142334, "epoch": 0.08831206496519721, "grad_norm": 0.04798220470547676, "grad_norm_var": 2.146873033427495e-05, "learning_rate": 0.009880626077456097, "loss": 2.8269, "step": 2436 }, { "crossentropy": 2.739628314971924, "epoch": 0.08834831786542924, "grad_norm": 0.04687211290001869, "grad_norm_var": 2.161442966641461e-05, "learning_rate": 0.009880499827262305, "loss": 2.7534, "step": 2437 }, { "crossentropy": 2.804823398590088, "epoch": 0.08838457076566125, "grad_norm": 0.05145973712205887, "grad_norm_var": 2.1676136627253074e-05, "learning_rate": 0.009880373511149955, "loss": 2.676, "step": 2438 }, { "crossentropy": 2.887044906616211, "epoch": 0.08842082366589327, "grad_norm": 0.04549383744597435, "grad_norm_var": 2.1701249132361802e-05, "learning_rate": 0.00988024712912076, "loss": 2.8028, "step": 2439 }, { "crossentropy": 2.7728655338287354, "epoch": 0.0884570765661253, "grad_norm": 0.044914502650499344, "grad_norm_var": 2.1454759523729417e-05, "learning_rate": 0.009880120681176415, "loss": 2.814, "step": 2440 }, { "crossentropy": 2.8403565883636475, "epoch": 0.08849332946635731, "grad_norm": 0.04763532057404518, "grad_norm_var": 2.140887887558529e-05, "learning_rate": 0.009879994167318638, "loss": 2.8082, "step": 2441 }, { "crossentropy": 2.7976226806640625, "epoch": 0.08852958236658932, "grad_norm": 0.054816920310258865, "grad_norm_var": 2.317495868468869e-05, "learning_rate": 0.009879867587549133, "loss": 2.7119, "step": 2442 }, { "crossentropy": 2.831101655960083, "epoch": 0.08856583526682135, "grad_norm": 0.05829795077443123, "grad_norm_var": 2.8043616843245223e-05, "learning_rate": 0.00987974094186961, "loss": 2.8408, "step": 2443 }, { "crossentropy": 2.8076677322387695, "epoch": 0.08860208816705337, "grad_norm": 0.05226277559995651, "grad_norm_var": 2.3398414409327626e-05, "learning_rate": 0.009879614230281782, "loss": 2.7046, "step": 2444 }, { "crossentropy": 2.562523126602173, "epoch": 0.08863834106728538, "grad_norm": 0.04575212672352791, "grad_norm_var": 1.7128647527507128e-05, "learning_rate": 0.009879487452787357, "loss": 2.6026, "step": 2445 }, { "crossentropy": 2.745783805847168, "epoch": 0.0886745939675174, "grad_norm": 0.04969201236963272, "grad_norm_var": 1.686595260575226e-05, "learning_rate": 0.00987936060938805, "loss": 2.7959, "step": 2446 }, { "crossentropy": 2.782505750656128, "epoch": 0.08871084686774942, "grad_norm": 0.046078745275735855, "grad_norm_var": 1.535551177304541e-05, "learning_rate": 0.009879233700085574, "loss": 2.7517, "step": 2447 }, { "crossentropy": 2.5781779289245605, "epoch": 0.08874709976798144, "grad_norm": 0.048961225897073746, "grad_norm_var": 1.4251291975865035e-05, "learning_rate": 0.009879106724881643, "loss": 2.6939, "step": 2448 }, { "crossentropy": 2.729496955871582, "epoch": 0.08878335266821345, "grad_norm": 0.052687570452690125, "grad_norm_var": 1.5171077845904425e-05, "learning_rate": 0.00987897968377797, "loss": 2.7006, "step": 2449 }, { "crossentropy": 2.828514814376831, "epoch": 0.08881960556844548, "grad_norm": 0.05531662702560425, "grad_norm_var": 1.6047629782366066e-05, "learning_rate": 0.009878852576776271, "loss": 2.7019, "step": 2450 }, { "crossentropy": 2.867232084274292, "epoch": 0.0888558584686775, "grad_norm": 0.059328801929950714, "grad_norm_var": 2.1231336718749438e-05, "learning_rate": 0.009878725403878266, "loss": 2.7979, "step": 2451 }, { "crossentropy": 2.666956901550293, "epoch": 0.08889211136890951, "grad_norm": 0.04805063456296921, "grad_norm_var": 2.120891227010927e-05, "learning_rate": 0.00987859816508567, "loss": 2.7529, "step": 2452 }, { "crossentropy": 2.51580548286438, "epoch": 0.08892836426914154, "grad_norm": 0.043616004288196564, "grad_norm_var": 2.3436305187070582e-05, "learning_rate": 0.009878470860400201, "loss": 2.6329, "step": 2453 }, { "crossentropy": 2.665778636932373, "epoch": 0.08896461716937355, "grad_norm": 0.04251626133918762, "grad_norm_var": 2.7020035487064034e-05, "learning_rate": 0.009878343489823582, "loss": 2.6759, "step": 2454 }, { "crossentropy": 2.8116469383239746, "epoch": 0.08900087006960557, "grad_norm": 0.045817308127880096, "grad_norm_var": 2.68445691176515e-05, "learning_rate": 0.00987821605335753, "loss": 2.8128, "step": 2455 }, { "crossentropy": 2.552163600921631, "epoch": 0.08903712296983758, "grad_norm": 0.04999344050884247, "grad_norm_var": 2.5193037707804446e-05, "learning_rate": 0.009878088551003767, "loss": 2.6231, "step": 2456 }, { "crossentropy": 2.8378753662109375, "epoch": 0.08907337587006961, "grad_norm": 0.049967046827077866, "grad_norm_var": 2.4781669710747615e-05, "learning_rate": 0.009877960982764016, "loss": 2.8077, "step": 2457 }, { "crossentropy": 2.719089984893799, "epoch": 0.08910962877030162, "grad_norm": 0.05389334633946419, "grad_norm_var": 2.426609633120284e-05, "learning_rate": 0.00987783334864, "loss": 2.7214, "step": 2458 }, { "crossentropy": 2.779067277908325, "epoch": 0.08914588167053364, "grad_norm": 0.04717858135700226, "grad_norm_var": 1.9898031474193238e-05, "learning_rate": 0.009877705648633441, "loss": 2.8032, "step": 2459 }, { "crossentropy": 2.8824234008789062, "epoch": 0.08918213457076567, "grad_norm": 0.0455084890127182, "grad_norm_var": 2.021127549772557e-05, "learning_rate": 0.009877577882746066, "loss": 2.7559, "step": 2460 }, { "crossentropy": 2.806574821472168, "epoch": 0.08921838747099768, "grad_norm": 0.04924764111638069, "grad_norm_var": 1.9450772845197065e-05, "learning_rate": 0.009877450050979597, "loss": 2.7529, "step": 2461 }, { "crossentropy": 2.7745234966278076, "epoch": 0.0892546403712297, "grad_norm": 0.050821516662836075, "grad_norm_var": 1.9598453118938042e-05, "learning_rate": 0.009877322153335767, "loss": 2.7637, "step": 2462 }, { "crossentropy": 2.7108747959136963, "epoch": 0.08929089327146171, "grad_norm": 0.04442603513598442, "grad_norm_var": 2.048153253873322e-05, "learning_rate": 0.009877194189816297, "loss": 2.7114, "step": 2463 }, { "crossentropy": 2.8353614807128906, "epoch": 0.08932714617169374, "grad_norm": 0.04406028985977173, "grad_norm_var": 2.2144090814135187e-05, "learning_rate": 0.009877066160422918, "loss": 2.8511, "step": 2464 }, { "crossentropy": 2.8407187461853027, "epoch": 0.08936339907192575, "grad_norm": 0.04588136449456215, "grad_norm_var": 2.1603848896232374e-05, "learning_rate": 0.009876938065157362, "loss": 2.8106, "step": 2465 }, { "crossentropy": 2.7930190563201904, "epoch": 0.08939965197215777, "grad_norm": 0.04665956646203995, "grad_norm_var": 1.8392462531333088e-05, "learning_rate": 0.009876809904021353, "loss": 2.7152, "step": 2466 }, { "crossentropy": 2.696847438812256, "epoch": 0.0894359048723898, "grad_norm": 0.048641856759786606, "grad_norm_var": 9.295876041324514e-06, "learning_rate": 0.009876681677016629, "loss": 2.6695, "step": 2467 }, { "crossentropy": 2.7068216800689697, "epoch": 0.08947215777262181, "grad_norm": 0.045231547206640244, "grad_norm_var": 9.498201588964185e-06, "learning_rate": 0.009876553384144915, "loss": 2.7109, "step": 2468 }, { "crossentropy": 2.7743899822235107, "epoch": 0.08950841067285382, "grad_norm": 0.046354781836271286, "grad_norm_var": 8.69794459110584e-06, "learning_rate": 0.00987642502540795, "loss": 2.8201, "step": 2469 }, { "crossentropy": 2.751743793487549, "epoch": 0.08954466357308585, "grad_norm": 0.04328249394893646, "grad_norm_var": 8.249748657645992e-06, "learning_rate": 0.009876296600807462, "loss": 2.7892, "step": 2470 }, { "crossentropy": 2.7518486976623535, "epoch": 0.08958091647331787, "grad_norm": 0.04464217647910118, "grad_norm_var": 8.569990275621626e-06, "learning_rate": 0.009876168110345189, "loss": 2.7894, "step": 2471 }, { "crossentropy": 2.6159989833831787, "epoch": 0.08961716937354988, "grad_norm": 0.04864843189716339, "grad_norm_var": 8.18871048990366e-06, "learning_rate": 0.009876039554022867, "loss": 2.5991, "step": 2472 }, { "crossentropy": 2.6435437202453613, "epoch": 0.0896534222737819, "grad_norm": 0.05043249949812889, "grad_norm_var": 8.3769026197833e-06, "learning_rate": 0.009875910931842232, "loss": 2.6559, "step": 2473 }, { "crossentropy": 2.7596449851989746, "epoch": 0.08968967517401392, "grad_norm": 0.04844473674893379, "grad_norm_var": 5.35663155662817e-06, "learning_rate": 0.009875782243805017, "loss": 2.7206, "step": 2474 }, { "crossentropy": 2.6588237285614014, "epoch": 0.08972592807424594, "grad_norm": 0.046671297401189804, "grad_norm_var": 5.3499072390720985e-06, "learning_rate": 0.009875653489912968, "loss": 2.6408, "step": 2475 }, { "crossentropy": 2.759401798248291, "epoch": 0.08976218097447795, "grad_norm": 0.0463036447763443, "grad_norm_var": 5.251472043937312e-06, "learning_rate": 0.009875524670167817, "loss": 2.7083, "step": 2476 }, { "crossentropy": 2.68678617477417, "epoch": 0.08979843387470998, "grad_norm": 0.04463553801178932, "grad_norm_var": 5.112278607220279e-06, "learning_rate": 0.009875395784571306, "loss": 2.7666, "step": 2477 }, { "crossentropy": 2.7932701110839844, "epoch": 0.089834686774942, "grad_norm": 0.04391040652990341, "grad_norm_var": 4.180824326761405e-06, "learning_rate": 0.009875266833125175, "loss": 2.8446, "step": 2478 }, { "crossentropy": 2.799778461456299, "epoch": 0.08987093967517401, "grad_norm": 0.048022061586380005, "grad_norm_var": 4.167641698562113e-06, "learning_rate": 0.00987513781583117, "loss": 2.8198, "step": 2479 }, { "crossentropy": 2.8555870056152344, "epoch": 0.08990719257540604, "grad_norm": 0.04936697706580162, "grad_norm_var": 4.2977483271626016e-06, "learning_rate": 0.00987500873269103, "loss": 2.7555, "step": 2480 }, { "crossentropy": 2.575535535812378, "epoch": 0.08994344547563805, "grad_norm": 0.05180629342794418, "grad_norm_var": 5.848569749957768e-06, "learning_rate": 0.0098748795837065, "loss": 2.6109, "step": 2481 }, { "crossentropy": 2.94877290725708, "epoch": 0.08997969837587007, "grad_norm": 0.046002473682165146, "grad_norm_var": 5.911154782267438e-06, "learning_rate": 0.00987475036887932, "loss": 2.8589, "step": 2482 }, { "crossentropy": 2.750195264816284, "epoch": 0.09001595127610208, "grad_norm": 0.044598378241062164, "grad_norm_var": 6.061221667163531e-06, "learning_rate": 0.00987462108821124, "loss": 2.672, "step": 2483 }, { "crossentropy": 2.684295177459717, "epoch": 0.09005220417633411, "grad_norm": 0.04665711149573326, "grad_norm_var": 5.895413680458092e-06, "learning_rate": 0.009874491741704006, "loss": 2.7813, "step": 2484 }, { "crossentropy": 2.6186485290527344, "epoch": 0.09008845707656613, "grad_norm": 0.04311781004071236, "grad_norm_var": 6.768858774154098e-06, "learning_rate": 0.009874362329359362, "loss": 2.7447, "step": 2485 }, { "crossentropy": 2.692932367324829, "epoch": 0.09012470997679814, "grad_norm": 0.04492144659161568, "grad_norm_var": 6.198909129812025e-06, "learning_rate": 0.009874232851179058, "loss": 2.718, "step": 2486 }, { "crossentropy": 2.729050874710083, "epoch": 0.09016096287703017, "grad_norm": 0.054080940783023834, "grad_norm_var": 9.100092082946151e-06, "learning_rate": 0.009874103307164843, "loss": 2.741, "step": 2487 }, { "crossentropy": 2.8208119869232178, "epoch": 0.09019721577726218, "grad_norm": 0.050259172916412354, "grad_norm_var": 9.540836734899377e-06, "learning_rate": 0.009873973697318466, "loss": 2.7535, "step": 2488 }, { "crossentropy": 2.6298229694366455, "epoch": 0.0902334686774942, "grad_norm": 0.043662238866090775, "grad_norm_var": 9.71504409221635e-06, "learning_rate": 0.009873844021641678, "loss": 2.6423, "step": 2489 }, { "crossentropy": 2.6684389114379883, "epoch": 0.09026972157772621, "grad_norm": 0.050281498581171036, "grad_norm_var": 1.027266927342927e-05, "learning_rate": 0.009873714280136229, "loss": 2.7282, "step": 2490 }, { "crossentropy": 2.655829906463623, "epoch": 0.09030597447795824, "grad_norm": 0.059850189834833145, "grad_norm_var": 2.029798032750999e-05, "learning_rate": 0.009873584472803874, "loss": 2.6536, "step": 2491 }, { "crossentropy": 2.825962781906128, "epoch": 0.09034222737819025, "grad_norm": 0.07671396434307098, "grad_norm_var": 7.135171957414239e-05, "learning_rate": 0.009873454599646362, "loss": 2.7637, "step": 2492 }, { "crossentropy": 2.6572744846343994, "epoch": 0.09037848027842227, "grad_norm": 0.04593241214752197, "grad_norm_var": 7.055207413872977e-05, "learning_rate": 0.009873324660665452, "loss": 2.7692, "step": 2493 }, { "crossentropy": 2.640338659286499, "epoch": 0.0904147331786543, "grad_norm": 0.042804744094610214, "grad_norm_var": 7.151869344090145e-05, "learning_rate": 0.009873194655862899, "loss": 2.7182, "step": 2494 }, { "crossentropy": 2.8398261070251465, "epoch": 0.09045098607888631, "grad_norm": 0.046866558492183685, "grad_norm_var": 7.188836778297305e-05, "learning_rate": 0.009873064585240454, "loss": 2.7579, "step": 2495 }, { "crossentropy": 2.7614803314208984, "epoch": 0.09048723897911833, "grad_norm": 0.060244787484407425, "grad_norm_var": 7.864466653572835e-05, "learning_rate": 0.009872934448799877, "loss": 2.7851, "step": 2496 }, { "crossentropy": 2.6846141815185547, "epoch": 0.09052349187935035, "grad_norm": 0.04878835380077362, "grad_norm_var": 7.868324267951368e-05, "learning_rate": 0.009872804246542926, "loss": 2.694, "step": 2497 }, { "crossentropy": 2.766813039779663, "epoch": 0.09055974477958237, "grad_norm": 0.07402332872152328, "grad_norm_var": 0.00011170439835842815, "learning_rate": 0.009872673978471358, "loss": 2.8115, "step": 2498 }, { "crossentropy": 2.8232431411743164, "epoch": 0.09059599767981438, "grad_norm": 0.046704743057489395, "grad_norm_var": 0.00010988886725820049, "learning_rate": 0.009872543644586933, "loss": 2.7829, "step": 2499 }, { "crossentropy": 2.645751476287842, "epoch": 0.0906322505800464, "grad_norm": 0.05425739660859108, "grad_norm_var": 0.00010790054533444341, "learning_rate": 0.009872413244891414, "loss": 2.7141, "step": 2500 }, { "crossentropy": 2.6648902893066406, "epoch": 0.09066850348027843, "grad_norm": 0.05424986034631729, "grad_norm_var": 0.00010148716257395788, "learning_rate": 0.009872282779386558, "loss": 2.7398, "step": 2501 }, { "crossentropy": 2.7141380310058594, "epoch": 0.09070475638051044, "grad_norm": 0.05159127712249756, "grad_norm_var": 9.676966023753475e-05, "learning_rate": 0.00987215224807413, "loss": 2.7989, "step": 2502 }, { "crossentropy": 2.7768003940582275, "epoch": 0.09074100928074245, "grad_norm": 0.05353159457445145, "grad_norm_var": 9.676570728521465e-05, "learning_rate": 0.009872021650955892, "loss": 2.7972, "step": 2503 }, { "crossentropy": 2.8410494327545166, "epoch": 0.09077726218097448, "grad_norm": 0.05997234210371971, "grad_norm_var": 9.816063288060154e-05, "learning_rate": 0.009871890988033608, "loss": 2.7773, "step": 2504 }, { "crossentropy": 2.891775131225586, "epoch": 0.0908135150812065, "grad_norm": 0.06115105003118515, "grad_norm_var": 9.237280051635807e-05, "learning_rate": 0.009871760259309043, "loss": 2.8532, "step": 2505 }, { "crossentropy": 2.766498327255249, "epoch": 0.09084976798143851, "grad_norm": 0.05341417342424393, "grad_norm_var": 9.083348077793147e-05, "learning_rate": 0.009871629464783963, "loss": 2.7208, "step": 2506 }, { "crossentropy": 2.7690787315368652, "epoch": 0.09088602088167054, "grad_norm": 0.04628067463636398, "grad_norm_var": 9.470815439021658e-05, "learning_rate": 0.009871498604460132, "loss": 2.8, "step": 2507 }, { "crossentropy": 2.7692768573760986, "epoch": 0.09092227378190255, "grad_norm": 0.05162924900650978, "grad_norm_var": 6.06847508640096e-05, "learning_rate": 0.009871367678339321, "loss": 2.82, "step": 2508 }, { "crossentropy": 2.8429086208343506, "epoch": 0.09095852668213457, "grad_norm": 0.050552982836961746, "grad_norm_var": 5.7532379174535646e-05, "learning_rate": 0.009871236686423297, "loss": 2.8172, "step": 2509 }, { "crossentropy": 2.7256412506103516, "epoch": 0.09099477958236658, "grad_norm": 0.046512652188539505, "grad_norm_var": 5.310211187154508e-05, "learning_rate": 0.009871105628713829, "loss": 2.743, "step": 2510 }, { "crossentropy": 2.6483988761901855, "epoch": 0.09103103248259861, "grad_norm": 0.04611608013510704, "grad_norm_var": 5.3824664154954643e-05, "learning_rate": 0.009870974505212687, "loss": 2.7211, "step": 2511 }, { "crossentropy": 2.6137702465057373, "epoch": 0.09106728538283063, "grad_norm": 0.046950094401836395, "grad_norm_var": 5.325012783877336e-05, "learning_rate": 0.009870843315921642, "loss": 2.7106, "step": 2512 }, { "crossentropy": 2.721022129058838, "epoch": 0.09110353828306264, "grad_norm": 0.043889835476875305, "grad_norm_var": 5.740778933704804e-05, "learning_rate": 0.009870712060842468, "loss": 2.7462, "step": 2513 }, { "crossentropy": 2.7396535873413086, "epoch": 0.09113979118329467, "grad_norm": 0.04761295020580292, "grad_norm_var": 2.5392229225966398e-05, "learning_rate": 0.009870580739976936, "loss": 2.8065, "step": 2514 }, { "crossentropy": 2.744631767272949, "epoch": 0.09117604408352668, "grad_norm": 0.05081099271774292, "grad_norm_var": 2.414857641357381e-05, "learning_rate": 0.009870449353326817, "loss": 2.7596, "step": 2515 }, { "crossentropy": 2.7827980518341064, "epoch": 0.0912122969837587, "grad_norm": 0.046022187918424606, "grad_norm_var": 2.498369007202702e-05, "learning_rate": 0.00987031790089389, "loss": 2.7632, "step": 2516 }, { "crossentropy": 2.6238584518432617, "epoch": 0.09124854988399073, "grad_norm": 0.0487741120159626, "grad_norm_var": 2.4224310741106404e-05, "learning_rate": 0.00987018638267993, "loss": 2.7418, "step": 2517 }, { "crossentropy": 2.8045928478240967, "epoch": 0.09128480278422274, "grad_norm": 0.04959208145737648, "grad_norm_var": 2.4130111678888343e-05, "learning_rate": 0.009870054798686712, "loss": 2.7791, "step": 2518 }, { "crossentropy": 2.5158627033233643, "epoch": 0.09132105568445475, "grad_norm": 0.08258046954870224, "grad_norm_var": 8.986747908612814e-05, "learning_rate": 0.00986992314891601, "loss": 2.6151, "step": 2519 }, { "crossentropy": 2.7197723388671875, "epoch": 0.09135730858468677, "grad_norm": 0.06467639654874802, "grad_norm_var": 9.625621042227643e-05, "learning_rate": 0.00986979143336961, "loss": 2.7675, "step": 2520 }, { "crossentropy": 2.732663869857788, "epoch": 0.0913935614849188, "grad_norm": 0.05456479266285896, "grad_norm_var": 9.118183459724977e-05, "learning_rate": 0.009869659652049284, "loss": 2.6876, "step": 2521 }, { "crossentropy": 2.9322640895843506, "epoch": 0.09142981438515081, "grad_norm": 0.05022100731730461, "grad_norm_var": 9.116325293026107e-05, "learning_rate": 0.009869527804956814, "loss": 2.8556, "step": 2522 }, { "crossentropy": 2.588759660720825, "epoch": 0.09146606728538283, "grad_norm": 0.05171845108270645, "grad_norm_var": 8.910086557482038e-05, "learning_rate": 0.009869395892093983, "loss": 2.663, "step": 2523 }, { "crossentropy": 2.9451024532318115, "epoch": 0.09150232018561485, "grad_norm": 0.053280603140592575, "grad_norm_var": 8.918658192654799e-05, "learning_rate": 0.009869263913462569, "loss": 2.8264, "step": 2524 }, { "crossentropy": 2.766327381134033, "epoch": 0.09153857308584687, "grad_norm": 0.05128185451030731, "grad_norm_var": 8.906776717803045e-05, "learning_rate": 0.009869131869064358, "loss": 2.6931, "step": 2525 }, { "crossentropy": 2.7711758613586426, "epoch": 0.09157482598607888, "grad_norm": 0.046383339911699295, "grad_norm_var": 8.916622982196125e-05, "learning_rate": 0.009868999758901131, "loss": 2.7479, "step": 2526 }, { "crossentropy": 2.8061227798461914, "epoch": 0.0916110788863109, "grad_norm": 0.0495302677154541, "grad_norm_var": 8.714583993577879e-05, "learning_rate": 0.009868867582974674, "loss": 2.7787, "step": 2527 }, { "crossentropy": 2.637688159942627, "epoch": 0.09164733178654293, "grad_norm": 0.05203630402684212, "grad_norm_var": 8.508841075240969e-05, "learning_rate": 0.009868735341286771, "loss": 2.7337, "step": 2528 }, { "crossentropy": 2.7901220321655273, "epoch": 0.09168358468677494, "grad_norm": 0.055118028074502945, "grad_norm_var": 7.979929257977361e-05, "learning_rate": 0.00986860303383921, "loss": 2.8118, "step": 2529 }, { "crossentropy": 2.851553440093994, "epoch": 0.09171983758700696, "grad_norm": 0.05488845705986023, "grad_norm_var": 7.750566878436758e-05, "learning_rate": 0.009868470660633774, "loss": 2.8232, "step": 2530 }, { "crossentropy": 2.518203020095825, "epoch": 0.09175609048723898, "grad_norm": 0.04657165706157684, "grad_norm_var": 8.034243718354339e-05, "learning_rate": 0.009868338221672256, "loss": 2.6415, "step": 2531 }, { "crossentropy": 2.7731683254241943, "epoch": 0.091792343387471, "grad_norm": 0.0419749990105629, "grad_norm_var": 8.544320785135359e-05, "learning_rate": 0.009868205716956442, "loss": 2.6976, "step": 2532 }, { "crossentropy": 2.8401339054107666, "epoch": 0.09182859628770301, "grad_norm": 0.04225786402821541, "grad_norm_var": 9.205062297035124e-05, "learning_rate": 0.009868073146488122, "loss": 2.7794, "step": 2533 }, { "crossentropy": 2.7495503425598145, "epoch": 0.09186484918793504, "grad_norm": 0.044483501464128494, "grad_norm_var": 9.59466653160342e-05, "learning_rate": 0.009867940510269084, "loss": 2.7345, "step": 2534 }, { "crossentropy": 2.752624034881592, "epoch": 0.09190110208816706, "grad_norm": 0.04311336576938629, "grad_norm_var": 3.55237656480265e-05, "learning_rate": 0.009867807808301125, "loss": 2.7647, "step": 2535 }, { "crossentropy": 2.7842564582824707, "epoch": 0.09193735498839907, "grad_norm": 0.048023395240306854, "grad_norm_var": 2.0560496424219114e-05, "learning_rate": 0.009867675040586034, "loss": 2.6673, "step": 2536 }, { "crossentropy": 2.718090295791626, "epoch": 0.09197360788863108, "grad_norm": 0.05179516226053238, "grad_norm_var": 1.901835314096345e-05, "learning_rate": 0.009867542207125603, "loss": 2.7351, "step": 2537 }, { "crossentropy": 2.797214984893799, "epoch": 0.09200986078886311, "grad_norm": 0.06234259530901909, "grad_norm_var": 3.0308578923276575e-05, "learning_rate": 0.009867409307921628, "loss": 2.8483, "step": 2538 }, { "crossentropy": 2.7972567081451416, "epoch": 0.09204611368909513, "grad_norm": 0.047800738364458084, "grad_norm_var": 3.0200432870772927e-05, "learning_rate": 0.009867276342975906, "loss": 2.7141, "step": 2539 }, { "crossentropy": 2.7213995456695557, "epoch": 0.09208236658932714, "grad_norm": 0.04891573265194893, "grad_norm_var": 2.9150281818435264e-05, "learning_rate": 0.00986714331229023, "loss": 2.766, "step": 2540 }, { "crossentropy": 2.8078081607818604, "epoch": 0.09211861948955917, "grad_norm": 0.04962858185172081, "grad_norm_var": 2.8852791043940523e-05, "learning_rate": 0.009867010215866397, "loss": 2.8143, "step": 2541 }, { "crossentropy": 2.680845022201538, "epoch": 0.09215487238979118, "grad_norm": 0.04119909554719925, "grad_norm_var": 3.23786122235196e-05, "learning_rate": 0.009866877053706204, "loss": 2.7412, "step": 2542 }, { "crossentropy": 2.6231932640075684, "epoch": 0.0921911252900232, "grad_norm": 0.047510914504528046, "grad_norm_var": 3.2417999894190684e-05, "learning_rate": 0.009866743825811451, "loss": 2.7526, "step": 2543 }, { "crossentropy": 2.8822202682495117, "epoch": 0.09222737819025523, "grad_norm": 0.04813244566321373, "grad_norm_var": 3.158382557063947e-05, "learning_rate": 0.009866610532183937, "loss": 2.9011, "step": 2544 }, { "crossentropy": 2.6996426582336426, "epoch": 0.09226363109048724, "grad_norm": 0.051646843552589417, "grad_norm_var": 2.9209013771200737e-05, "learning_rate": 0.009866477172825463, "loss": 2.6892, "step": 2545 }, { "crossentropy": 2.5766730308532715, "epoch": 0.09229988399071926, "grad_norm": 0.04352087900042534, "grad_norm_var": 2.7061192484906006e-05, "learning_rate": 0.00986634374773783, "loss": 2.6347, "step": 2546 }, { "crossentropy": 2.974790334701538, "epoch": 0.09233613689095127, "grad_norm": 0.049776382744312286, "grad_norm_var": 2.733530827758642e-05, "learning_rate": 0.00986621025692284, "loss": 2.9601, "step": 2547 }, { "crossentropy": 2.7589190006256104, "epoch": 0.0923723897911833, "grad_norm": 0.056677863001823425, "grad_norm_var": 2.975502809292573e-05, "learning_rate": 0.009866076700382293, "loss": 2.6545, "step": 2548 }, { "crossentropy": 2.9073381423950195, "epoch": 0.09240864269141531, "grad_norm": 0.05507124215364456, "grad_norm_var": 2.9263933960667286e-05, "learning_rate": 0.009865943078117998, "loss": 2.8088, "step": 2549 }, { "crossentropy": 2.8893632888793945, "epoch": 0.09244489559164733, "grad_norm": 0.04537411406636238, "grad_norm_var": 2.8735332216879513e-05, "learning_rate": 0.009865809390131756, "loss": 2.9294, "step": 2550 }, { "crossentropy": 2.6671719551086426, "epoch": 0.09248114849187936, "grad_norm": 0.04509762302041054, "grad_norm_var": 2.731603318661957e-05, "learning_rate": 0.009865675636425377, "loss": 2.6431, "step": 2551 }, { "crossentropy": 2.8419296741485596, "epoch": 0.09251740139211137, "grad_norm": 0.04990188777446747, "grad_norm_var": 2.7158700184033945e-05, "learning_rate": 0.009865541817000664, "loss": 2.7576, "step": 2552 }, { "crossentropy": 2.835564374923706, "epoch": 0.09255365429234338, "grad_norm": 0.05020727962255478, "grad_norm_var": 2.6862012584447788e-05, "learning_rate": 0.009865407931859421, "loss": 2.8246, "step": 2553 }, { "crossentropy": 2.6853737831115723, "epoch": 0.0925899071925754, "grad_norm": 0.047941580414772034, "grad_norm_var": 1.526083167608886e-05, "learning_rate": 0.009865273981003465, "loss": 2.6667, "step": 2554 }, { "crossentropy": 2.6068177223205566, "epoch": 0.09262616009280743, "grad_norm": 0.04519353434443474, "grad_norm_var": 1.5980972283836416e-05, "learning_rate": 0.009865139964434597, "loss": 2.6871, "step": 2555 }, { "crossentropy": 2.7623672485351562, "epoch": 0.09266241299303944, "grad_norm": 0.04715641960501671, "grad_norm_var": 1.6073909833072957e-05, "learning_rate": 0.009865005882154632, "loss": 2.7345, "step": 2556 }, { "crossentropy": 2.7926242351531982, "epoch": 0.09269866589327146, "grad_norm": 0.0531013198196888, "grad_norm_var": 1.740704061855182e-05, "learning_rate": 0.00986487173416538, "loss": 2.7959, "step": 2557 }, { "crossentropy": 2.764688491821289, "epoch": 0.09273491879350348, "grad_norm": 0.0505245067179203, "grad_norm_var": 1.364708827690612e-05, "learning_rate": 0.009864737520468653, "loss": 2.7523, "step": 2558 }, { "crossentropy": 2.7915146350860596, "epoch": 0.0927711716937355, "grad_norm": 0.04817454516887665, "grad_norm_var": 1.352717589017093e-05, "learning_rate": 0.009864603241066264, "loss": 2.8029, "step": 2559 }, { "crossentropy": 2.698310136795044, "epoch": 0.09280742459396751, "grad_norm": 0.0549909733235836, "grad_norm_var": 1.5473833018756628e-05, "learning_rate": 0.009864468895960024, "loss": 2.6766, "step": 2560 }, { "crossentropy": 2.7743375301361084, "epoch": 0.09284367749419954, "grad_norm": 0.047537703067064285, "grad_norm_var": 1.543363367854002e-05, "learning_rate": 0.00986433448515175, "loss": 2.8355, "step": 2561 }, { "crossentropy": 2.7142953872680664, "epoch": 0.09287993039443156, "grad_norm": 0.04671251028776169, "grad_norm_var": 1.3572472321008583e-05, "learning_rate": 0.009864200008643256, "loss": 2.6749, "step": 2562 }, { "crossentropy": 3.031684637069702, "epoch": 0.09291618329466357, "grad_norm": 0.8668185472488403, "grad_norm_var": 0.041756248980475655, "learning_rate": 0.00986406546643636, "loss": 2.9379, "step": 2563 }, { "crossentropy": 2.745804786682129, "epoch": 0.09295243619489559, "grad_norm": 0.10725457966327667, "grad_norm_var": 0.04161956099229268, "learning_rate": 0.009863930858532878, "loss": 2.775, "step": 2564 }, { "crossentropy": 2.9178998470306396, "epoch": 0.09298868909512761, "grad_norm": 0.12884031236171722, "grad_norm_var": 0.04148022937491002, "learning_rate": 0.00986379618493463, "loss": 2.8805, "step": 2565 }, { "crossentropy": 2.8179049491882324, "epoch": 0.09302494199535963, "grad_norm": 0.16355277597904205, "grad_norm_var": 0.041359587035440355, "learning_rate": 0.009863661445643433, "loss": 2.8289, "step": 2566 }, { "crossentropy": 2.824371099472046, "epoch": 0.09306119489559164, "grad_norm": 0.11610788851976395, "grad_norm_var": 0.04100520608103691, "learning_rate": 0.009863526640661107, "loss": 2.8521, "step": 2567 }, { "crossentropy": 2.9823191165924072, "epoch": 0.09309744779582367, "grad_norm": 0.08912822604179382, "grad_norm_var": 0.04073343665321467, "learning_rate": 0.009863391769989472, "loss": 2.8975, "step": 2568 }, { "crossentropy": 2.867903232574463, "epoch": 0.09313370069605569, "grad_norm": 0.08046626299619675, "grad_norm_var": 0.04049817712910241, "learning_rate": 0.009863256833630351, "loss": 2.8882, "step": 2569 }, { "crossentropy": 2.6872549057006836, "epoch": 0.0931699535962877, "grad_norm": 0.07475398480892181, "grad_norm_var": 0.040269077787675966, "learning_rate": 0.009863121831585569, "loss": 2.8035, "step": 2570 }, { "crossentropy": 2.8050873279571533, "epoch": 0.09320620649651973, "grad_norm": 0.11810137331485748, "grad_norm_var": 0.03981315545289935, "learning_rate": 0.009862986763856944, "loss": 2.764, "step": 2571 }, { "crossentropy": 2.8151347637176514, "epoch": 0.09324245939675174, "grad_norm": 0.06886420398950577, "grad_norm_var": 0.03960043529619715, "learning_rate": 0.009862851630446303, "loss": 2.7929, "step": 2572 }, { "crossentropy": 2.8164560794830322, "epoch": 0.09327871229698376, "grad_norm": 0.07448630034923553, "grad_norm_var": 0.039403529302433786, "learning_rate": 0.009862716431355471, "loss": 2.7822, "step": 2573 }, { "crossentropy": 2.952854633331299, "epoch": 0.09331496519721577, "grad_norm": 0.06645944714546204, "grad_norm_var": 0.03924306305361978, "learning_rate": 0.009862581166586275, "loss": 2.8942, "step": 2574 }, { "crossentropy": 2.841041326522827, "epoch": 0.0933512180974478, "grad_norm": 0.05095584690570831, "grad_norm_var": 0.03921152779351681, "learning_rate": 0.009862445836140541, "loss": 2.7057, "step": 2575 }, { "crossentropy": 2.8779382705688477, "epoch": 0.09338747099767981, "grad_norm": 0.047799885272979736, "grad_norm_var": 0.03929117559700993, "learning_rate": 0.009862310440020096, "loss": 2.7974, "step": 2576 }, { "crossentropy": 2.804730176925659, "epoch": 0.09342372389791183, "grad_norm": 0.05159708857536316, "grad_norm_var": 0.0392452777756284, "learning_rate": 0.00986217497822677, "loss": 2.8261, "step": 2577 }, { "crossentropy": 2.8890087604522705, "epoch": 0.09345997679814386, "grad_norm": 0.049536846578121185, "grad_norm_var": 0.03921271984959641, "learning_rate": 0.009862039450762392, "loss": 2.841, "step": 2578 }, { "crossentropy": 2.6900110244750977, "epoch": 0.09349622969837587, "grad_norm": 0.04436919093132019, "grad_norm_var": 0.0012018314423018288, "learning_rate": 0.009861903857628794, "loss": 2.7386, "step": 2579 }, { "crossentropy": 2.7789535522460938, "epoch": 0.09353248259860789, "grad_norm": 0.10733263939619064, "grad_norm_var": 0.001202081483569517, "learning_rate": 0.009861768198827805, "loss": 2.7838, "step": 2580 }, { "crossentropy": 2.8853824138641357, "epoch": 0.09356873549883991, "grad_norm": 0.045433320105075836, "grad_norm_var": 0.0011301149539268815, "learning_rate": 0.009861632474361259, "loss": 2.8415, "step": 2581 }, { "crossentropy": 2.7731988430023193, "epoch": 0.09360498839907193, "grad_norm": 0.05592833459377289, "grad_norm_var": 0.0006272255591499641, "learning_rate": 0.00986149668423099, "loss": 2.7964, "step": 2582 }, { "crossentropy": 2.7826132774353027, "epoch": 0.09364124129930394, "grad_norm": 0.061713989824056625, "grad_norm_var": 0.0004874100601883803, "learning_rate": 0.00986136082843883, "loss": 2.7206, "step": 2583 }, { "crossentropy": 2.918231248855591, "epoch": 0.09367749419953596, "grad_norm": 0.05597107857465744, "grad_norm_var": 0.00046241894622675025, "learning_rate": 0.009861224906986613, "loss": 2.8975, "step": 2584 }, { "crossentropy": 2.928128242492676, "epoch": 0.09371374709976799, "grad_norm": 0.07946471869945526, "grad_norm_var": 0.00046053121189686727, "learning_rate": 0.009861088919876177, "loss": 2.9347, "step": 2585 }, { "crossentropy": 2.8447232246398926, "epoch": 0.09375, "grad_norm": 0.22895869612693787, "grad_norm_var": 0.0021308648774847885, "learning_rate": 0.00986095286710936, "loss": 2.8016, "step": 2586 }, { "crossentropy": 2.8509624004364014, "epoch": 0.09378625290023201, "grad_norm": 0.06630738079547882, "grad_norm_var": 0.0020038858325132514, "learning_rate": 0.009860816748687997, "loss": 2.8247, "step": 2587 }, { "crossentropy": 2.920975923538208, "epoch": 0.09382250580046404, "grad_norm": 0.0668497309088707, "grad_norm_var": 0.0020050350936384304, "learning_rate": 0.009860680564613927, "loss": 2.8641, "step": 2588 }, { "crossentropy": 2.726677417755127, "epoch": 0.09385875870069606, "grad_norm": 0.05951033532619476, "grad_norm_var": 0.002014233259828841, "learning_rate": 0.009860544314888989, "loss": 2.7231, "step": 2589 }, { "crossentropy": 2.894882917404175, "epoch": 0.09389501160092807, "grad_norm": 0.31466853618621826, "grad_norm_var": 0.0057099234459627975, "learning_rate": 0.009860407999515024, "loss": 2.9129, "step": 2590 }, { "crossentropy": 2.88838791847229, "epoch": 0.09393126450116009, "grad_norm": 0.06915333122015, "grad_norm_var": 0.005644014745465627, "learning_rate": 0.009860271618493873, "loss": 2.7974, "step": 2591 }, { "crossentropy": 2.6969194412231445, "epoch": 0.09396751740139211, "grad_norm": 0.054925575852394104, "grad_norm_var": 0.005609196586945484, "learning_rate": 0.009860135171827378, "loss": 2.7347, "step": 2592 }, { "crossentropy": 2.9491782188415527, "epoch": 0.09400377030162413, "grad_norm": 0.05077367275953293, "grad_norm_var": 0.005613261125200213, "learning_rate": 0.009859998659517382, "loss": 2.8891, "step": 2593 }, { "crossentropy": 2.66664719581604, "epoch": 0.09404002320185614, "grad_norm": 0.05362382158637047, "grad_norm_var": 0.005593246680000446, "learning_rate": 0.009859862081565729, "loss": 2.7468, "step": 2594 }, { "crossentropy": 2.7630531787872314, "epoch": 0.09407627610208817, "grad_norm": 0.04996217042207718, "grad_norm_var": 0.005562339411499185, "learning_rate": 0.009859725437974264, "loss": 2.8267, "step": 2595 }, { "crossentropy": 2.7720298767089844, "epoch": 0.09411252900232019, "grad_norm": 3.7913591861724854, "grad_norm_var": 0.8629256982602871, "learning_rate": 0.009859588728744833, "loss": 2.761, "step": 2596 }, { "crossentropy": 3.0424208641052246, "epoch": 0.0941487819025522, "grad_norm": 0.07284369319677353, "grad_norm_var": 0.8619727097220601, "learning_rate": 0.00985945195387928, "loss": 2.9736, "step": 2597 }, { "crossentropy": 2.729368209838867, "epoch": 0.09418503480278423, "grad_norm": 0.058526866137981415, "grad_norm_var": 0.8618813784428361, "learning_rate": 0.009859315113379454, "loss": 2.7316, "step": 2598 }, { "crossentropy": 2.851381301879883, "epoch": 0.09422128770301624, "grad_norm": 0.13595342636108398, "grad_norm_var": 0.8596601387255152, "learning_rate": 0.009859178207247202, "loss": 2.8959, "step": 2599 }, { "crossentropy": 2.8655612468719482, "epoch": 0.09425754060324826, "grad_norm": 0.5958749651908875, "grad_norm_var": 0.8584721890428473, "learning_rate": 0.009859041235484376, "loss": 2.9342, "step": 2600 }, { "crossentropy": 2.808393716812134, "epoch": 0.09429379350348027, "grad_norm": 0.11105421185493469, "grad_norm_var": 0.8573559218085989, "learning_rate": 0.009858904198092823, "loss": 2.8533, "step": 2601 }, { "crossentropy": 2.7529006004333496, "epoch": 0.0943300464037123, "grad_norm": 0.8846508264541626, "grad_norm_var": 0.872659145110836, "learning_rate": 0.009858767095074397, "loss": 2.8527, "step": 2602 }, { "crossentropy": 2.8973028659820557, "epoch": 0.09436629930394431, "grad_norm": 0.24259543418884277, "grad_norm_var": 0.8667050759906939, "learning_rate": 0.009858629926430947, "loss": 2.9019, "step": 2603 }, { "crossentropy": 3.119637966156006, "epoch": 0.09440255220417633, "grad_norm": 1.2492122650146484, "grad_norm_var": 0.8994662620007425, "learning_rate": 0.009858492692164326, "loss": 3.0632, "step": 2604 }, { "crossentropy": 2.9613547325134277, "epoch": 0.09443880510440836, "grad_norm": 0.31333351135253906, "grad_norm_var": 0.889019642228008, "learning_rate": 0.009858355392276388, "loss": 3.0207, "step": 2605 }, { "crossentropy": 2.894721269607544, "epoch": 0.09447505800464037, "grad_norm": 0.15836727619171143, "grad_norm_var": 0.894472048041919, "learning_rate": 0.009858218026768988, "loss": 3.0293, "step": 2606 }, { "crossentropy": 3.218173027038574, "epoch": 0.09451131090487239, "grad_norm": 0.19536110758781433, "grad_norm_var": 0.8883307791040118, "learning_rate": 0.009858080595643982, "loss": 3.1021, "step": 2607 }, { "crossentropy": 2.961987257003784, "epoch": 0.09454756380510441, "grad_norm": 0.13506312668323517, "grad_norm_var": 0.8839642327710285, "learning_rate": 0.009857943098903223, "loss": 3.0075, "step": 2608 }, { "crossentropy": 2.925492525100708, "epoch": 0.09458381670533643, "grad_norm": 0.11967568844556808, "grad_norm_var": 0.8800773485233927, "learning_rate": 0.009857805536548572, "loss": 2.9854, "step": 2609 }, { "crossentropy": 2.874687433242798, "epoch": 0.09462006960556844, "grad_norm": 0.1083160936832428, "grad_norm_var": 0.8769328690302955, "learning_rate": 0.009857667908581886, "loss": 2.8808, "step": 2610 }, { "crossentropy": 2.9334466457366943, "epoch": 0.09465632250580046, "grad_norm": 0.1341431438922882, "grad_norm_var": 0.8721686483652241, "learning_rate": 0.009857530215005022, "loss": 2.8398, "step": 2611 }, { "crossentropy": 3.0414981842041016, "epoch": 0.09469257540603249, "grad_norm": 0.11838553845882416, "grad_norm_var": 0.1128390797704835, "learning_rate": 0.00985739245581984, "loss": 2.9633, "step": 2612 }, { "crossentropy": 2.9958014488220215, "epoch": 0.0947288283062645, "grad_norm": 0.09977994859218597, "grad_norm_var": 0.1121060014612919, "learning_rate": 0.009857254631028203, "loss": 3.0175, "step": 2613 }, { "crossentropy": 2.8319380283355713, "epoch": 0.09476508120649652, "grad_norm": 0.0738370344042778, "grad_norm_var": 0.11164554339051122, "learning_rate": 0.00985711674063197, "loss": 2.883, "step": 2614 }, { "crossentropy": 2.8047540187835693, "epoch": 0.09480133410672854, "grad_norm": 0.06018945947289467, "grad_norm_var": 0.11358294074057296, "learning_rate": 0.009856978784633003, "loss": 2.8707, "step": 2615 }, { "crossentropy": 2.9791781902313232, "epoch": 0.09483758700696056, "grad_norm": 0.06083037331700325, "grad_norm_var": 0.1094750227578276, "learning_rate": 0.00985684076303317, "loss": 2.9108, "step": 2616 }, { "crossentropy": 2.955392837524414, "epoch": 0.09487383990719257, "grad_norm": 0.06140465661883354, "grad_norm_var": 0.11057571173902958, "learning_rate": 0.00985670267583433, "loss": 2.9485, "step": 2617 }, { "crossentropy": 2.908406972885132, "epoch": 0.0949100928074246, "grad_norm": 0.07591035217046738, "grad_norm_var": 0.08312089891011598, "learning_rate": 0.009856564523038352, "loss": 2.971, "step": 2618 }, { "crossentropy": 2.9416966438293457, "epoch": 0.09494634570765662, "grad_norm": 0.07111143320798874, "grad_norm_var": 0.0839940498146608, "learning_rate": 0.009856426304647099, "loss": 2.9659, "step": 2619 }, { "crossentropy": 3.0232856273651123, "epoch": 0.09498259860788863, "grad_norm": 0.12124770879745483, "grad_norm_var": 0.004164792312495145, "learning_rate": 0.009856288020662438, "loss": 2.9465, "step": 2620 }, { "crossentropy": 2.920886516571045, "epoch": 0.09501885150812064, "grad_norm": 0.06479818373918533, "grad_norm_var": 0.0015916961829881688, "learning_rate": 0.009856149671086238, "loss": 2.8882, "step": 2621 }, { "crossentropy": 2.828701972961426, "epoch": 0.09505510440835267, "grad_norm": 0.06871624290943146, "grad_norm_var": 0.0014399823211743892, "learning_rate": 0.009856011255920368, "loss": 2.8794, "step": 2622 }, { "crossentropy": 3.090254306793213, "epoch": 0.09509135730858469, "grad_norm": 0.06803715229034424, "grad_norm_var": 0.000801157755656035, "learning_rate": 0.009855872775166695, "loss": 2.9157, "step": 2623 }, { "crossentropy": 2.8822438716888428, "epoch": 0.0951276102088167, "grad_norm": 0.06174676492810249, "grad_norm_var": 0.0006974815912741914, "learning_rate": 0.009855734228827094, "loss": 2.9447, "step": 2624 }, { "crossentropy": 3.0700223445892334, "epoch": 0.09516386310904873, "grad_norm": 0.05846373364329338, "grad_norm_var": 0.0006528011597236335, "learning_rate": 0.00985559561690343, "loss": 2.9373, "step": 2625 }, { "crossentropy": 3.010016441345215, "epoch": 0.09520011600928074, "grad_norm": 0.0644092783331871, "grad_norm_var": 0.0006173688852712096, "learning_rate": 0.00985545693939758, "loss": 2.9199, "step": 2626 }, { "crossentropy": 3.0700631141662598, "epoch": 0.09523636890951276, "grad_norm": 0.06235861778259277, "grad_norm_var": 0.0004110509737907802, "learning_rate": 0.009855318196311415, "loss": 3.012, "step": 2627 }, { "crossentropy": 2.7315571308135986, "epoch": 0.09527262180974477, "grad_norm": 0.05168313533067703, "grad_norm_var": 0.000298393887097105, "learning_rate": 0.00985517938764681, "loss": 2.8386, "step": 2628 }, { "crossentropy": 2.9287447929382324, "epoch": 0.0953088747099768, "grad_norm": 0.5243420600891113, "grad_norm_var": 0.013233990999602109, "learning_rate": 0.009855040513405642, "loss": 2.8403, "step": 2629 }, { "crossentropy": 2.832367181777954, "epoch": 0.09534512761020882, "grad_norm": 0.05295364931225777, "grad_norm_var": 0.013325237300477638, "learning_rate": 0.00985490157358978, "loss": 2.8456, "step": 2630 }, { "crossentropy": 2.8842618465423584, "epoch": 0.09538138051044083, "grad_norm": 0.16049030423164368, "grad_norm_var": 0.013481610400799293, "learning_rate": 0.009854762568201107, "loss": 2.9136, "step": 2631 }, { "crossentropy": 2.919538736343384, "epoch": 0.09541763341067286, "grad_norm": 0.10880604386329651, "grad_norm_var": 0.013363510234020876, "learning_rate": 0.009854623497241498, "loss": 2.9425, "step": 2632 }, { "crossentropy": 3.1173038482666016, "epoch": 0.09545388631090487, "grad_norm": 1.312811255455017, "grad_norm_var": 0.10400231328849985, "learning_rate": 0.009854484360712831, "loss": 3.061, "step": 2633 }, { "crossentropy": 2.8526077270507812, "epoch": 0.09549013921113689, "grad_norm": 0.23086854815483093, "grad_norm_var": 0.10329062411206673, "learning_rate": 0.009854345158616986, "loss": 2.9732, "step": 2634 }, { "crossentropy": 2.855013608932495, "epoch": 0.09552639211136892, "grad_norm": 0.09961186349391937, "grad_norm_var": 0.10287943226510887, "learning_rate": 0.009854205890955843, "loss": 2.8862, "step": 2635 }, { "crossentropy": 2.883256435394287, "epoch": 0.09556264501160093, "grad_norm": 0.20527008175849915, "grad_norm_var": 0.10250048197748436, "learning_rate": 0.009854066557731283, "loss": 2.9036, "step": 2636 }, { "crossentropy": 2.965642213821411, "epoch": 0.09559889791183294, "grad_norm": 0.23167234659194946, "grad_norm_var": 0.10123913614716441, "learning_rate": 0.009853927158945189, "loss": 2.9994, "step": 2637 }, { "crossentropy": 2.759445905685425, "epoch": 0.09563515081206496, "grad_norm": 0.07065204530954361, "grad_norm_var": 0.10120286787816717, "learning_rate": 0.00985378769459944, "loss": 2.8249, "step": 2638 }, { "crossentropy": 2.9255824089050293, "epoch": 0.09567140371229699, "grad_norm": 0.06664393097162247, "grad_norm_var": 0.1012294091112147, "learning_rate": 0.009853648164695923, "loss": 2.9044, "step": 2639 }, { "crossentropy": 2.942274570465088, "epoch": 0.095707656612529, "grad_norm": 0.06100481376051903, "grad_norm_var": 0.10124412694954385, "learning_rate": 0.009853508569236523, "loss": 2.9633, "step": 2640 }, { "crossentropy": 2.712785005569458, "epoch": 0.09574390951276102, "grad_norm": 0.17122742533683777, "grad_norm_var": 0.09975856465433199, "learning_rate": 0.009853368908223123, "loss": 2.8342, "step": 2641 }, { "crossentropy": 2.924745798110962, "epoch": 0.09578016241299304, "grad_norm": 0.06065914407372475, "grad_norm_var": 0.09983582938779882, "learning_rate": 0.009853229181657612, "loss": 2.8452, "step": 2642 }, { "crossentropy": 2.8222625255584717, "epoch": 0.09581641531322506, "grad_norm": 0.059828873723745346, "grad_norm_var": 0.09988836986269665, "learning_rate": 0.009853089389541875, "loss": 3.0039, "step": 2643 }, { "crossentropy": 2.795248508453369, "epoch": 0.09585266821345707, "grad_norm": 0.1150304302573204, "grad_norm_var": 0.09874469218107167, "learning_rate": 0.009852949531877802, "loss": 2.8944, "step": 2644 }, { "crossentropy": 2.8382015228271484, "epoch": 0.0958889211136891, "grad_norm": 0.049925241619348526, "grad_norm_var": 0.09360725614893194, "learning_rate": 0.009852809608667279, "loss": 2.8418, "step": 2645 }, { "crossentropy": 2.916269302368164, "epoch": 0.09592517401392112, "grad_norm": 0.048852380365133286, "grad_norm_var": 0.09368384588120518, "learning_rate": 0.0098526696199122, "loss": 2.8019, "step": 2646 }, { "crossentropy": 2.8537697792053223, "epoch": 0.09596142691415313, "grad_norm": 0.05862189829349518, "grad_norm_var": 0.09474457010284451, "learning_rate": 0.009852529565614452, "loss": 2.8711, "step": 2647 }, { "crossentropy": 2.966169595718384, "epoch": 0.09599767981438515, "grad_norm": 0.054086342453956604, "grad_norm_var": 0.0954837363155082, "learning_rate": 0.00985238944577593, "loss": 2.9656, "step": 2648 }, { "crossentropy": 2.808410406112671, "epoch": 0.09603393271461717, "grad_norm": 0.05947093293070793, "grad_norm_var": 0.00453132485800559, "learning_rate": 0.009852249260398523, "loss": 2.8292, "step": 2649 }, { "crossentropy": 2.7669825553894043, "epoch": 0.09607018561484919, "grad_norm": 0.0825849324464798, "grad_norm_var": 0.0033718169809790377, "learning_rate": 0.009852109009484127, "loss": 2.7929, "step": 2650 }, { "crossentropy": 2.8734664916992188, "epoch": 0.0961064385150812, "grad_norm": 0.13986548781394958, "grad_norm_var": 0.003506179994681367, "learning_rate": 0.009851968693034634, "loss": 2.87, "step": 2651 }, { "crossentropy": 2.850986957550049, "epoch": 0.09614269141531323, "grad_norm": 0.06047289818525314, "grad_norm_var": 0.0027062405649947237, "learning_rate": 0.009851828311051943, "loss": 2.8907, "step": 2652 }, { "crossentropy": 2.8102567195892334, "epoch": 0.09617894431554525, "grad_norm": 0.06754519045352936, "grad_norm_var": 0.0012219774858734432, "learning_rate": 0.009851687863537946, "loss": 2.8977, "step": 2653 }, { "crossentropy": 2.920877695083618, "epoch": 0.09621519721577726, "grad_norm": 0.04924876242876053, "grad_norm_var": 0.001267738393973905, "learning_rate": 0.009851547350494542, "loss": 2.9395, "step": 2654 }, { "crossentropy": 2.797672986984253, "epoch": 0.09625145011600927, "grad_norm": 0.05369342118501663, "grad_norm_var": 0.0012931963564226487, "learning_rate": 0.00985140677192363, "loss": 2.8592, "step": 2655 }, { "crossentropy": 2.972376585006714, "epoch": 0.0962877030162413, "grad_norm": 0.06795436143875122, "grad_norm_var": 0.0012837033004955574, "learning_rate": 0.009851266127827106, "loss": 2.8928, "step": 2656 }, { "crossentropy": 2.935931921005249, "epoch": 0.09632395591647332, "grad_norm": 0.11065857857465744, "grad_norm_var": 0.0007354014269174535, "learning_rate": 0.00985112541820687, "loss": 2.9265, "step": 2657 }, { "crossentropy": 2.804272413253784, "epoch": 0.09636020881670533, "grad_norm": 0.055916253477334976, "grad_norm_var": 0.0007434455373675366, "learning_rate": 0.009850984643064824, "loss": 2.7947, "step": 2658 }, { "crossentropy": 2.8197808265686035, "epoch": 0.09639646171693736, "grad_norm": 0.048877615481615067, "grad_norm_var": 0.0007670480945906795, "learning_rate": 0.00985084380240287, "loss": 2.9004, "step": 2659 }, { "crossentropy": 3.1359241008758545, "epoch": 0.09643271461716937, "grad_norm": 0.05864795297384262, "grad_norm_var": 0.0006285287628165362, "learning_rate": 0.009850702896222908, "loss": 2.9924, "step": 2660 }, { "crossentropy": 2.8359358310699463, "epoch": 0.09646896751740139, "grad_norm": 0.05334731936454773, "grad_norm_var": 0.0006216289186822652, "learning_rate": 0.009850561924526841, "loss": 2.7925, "step": 2661 }, { "crossentropy": 2.660195827484131, "epoch": 0.09650522041763342, "grad_norm": 0.14298874139785767, "grad_norm_var": 0.0009493932658696413, "learning_rate": 0.009850420887316576, "loss": 2.7608, "step": 2662 }, { "crossentropy": 2.8936145305633545, "epoch": 0.09654147331786543, "grad_norm": 0.048351164907217026, "grad_norm_var": 0.0009753320729291061, "learning_rate": 0.009850279784594016, "loss": 2.84, "step": 2663 }, { "crossentropy": 3.0675997734069824, "epoch": 0.09657772621809745, "grad_norm": 0.14430932700634003, "grad_norm_var": 0.001267311626285903, "learning_rate": 0.009850138616361068, "loss": 2.9923, "step": 2664 }, { "crossentropy": 2.8741631507873535, "epoch": 0.09661397911832946, "grad_norm": 0.270356684923172, "grad_norm_var": 0.003533006860129978, "learning_rate": 0.009849997382619635, "loss": 2.8856, "step": 2665 }, { "crossentropy": 2.9124631881713867, "epoch": 0.09665023201856149, "grad_norm": 0.05628642439842224, "grad_norm_var": 0.0036054808547444856, "learning_rate": 0.00984985608337163, "loss": 2.9221, "step": 2666 }, { "crossentropy": 2.861560106277466, "epoch": 0.0966864849187935, "grad_norm": 0.07171175628900528, "grad_norm_var": 0.0034361332381408986, "learning_rate": 0.009849714718618957, "loss": 2.8811, "step": 2667 }, { "crossentropy": 2.8840930461883545, "epoch": 0.09672273781902552, "grad_norm": 0.05243871361017227, "grad_norm_var": 0.003466466065386292, "learning_rate": 0.009849573288363526, "loss": 2.8777, "step": 2668 }, { "crossentropy": 2.8596675395965576, "epoch": 0.09675899071925755, "grad_norm": 0.05478464066982269, "grad_norm_var": 0.003505525400492414, "learning_rate": 0.009849431792607251, "loss": 2.7826, "step": 2669 }, { "crossentropy": 2.8695456981658936, "epoch": 0.09679524361948956, "grad_norm": 0.052850741893053055, "grad_norm_var": 0.00348977944712061, "learning_rate": 0.009849290231352038, "loss": 2.8735, "step": 2670 }, { "crossentropy": 2.7073073387145996, "epoch": 0.09683149651972157, "grad_norm": 0.047270268201828, "grad_norm_var": 0.0035182689464045066, "learning_rate": 0.009849148604599803, "loss": 2.7423, "step": 2671 }, { "crossentropy": 2.8007211685180664, "epoch": 0.0968677494199536, "grad_norm": 0.050431206822395325, "grad_norm_var": 0.003573891005295519, "learning_rate": 0.009849006912352456, "loss": 2.83, "step": 2672 }, { "crossentropy": 2.854269027709961, "epoch": 0.09690400232018562, "grad_norm": 0.08125434815883636, "grad_norm_var": 0.003517342225490708, "learning_rate": 0.009848865154611912, "loss": 2.8372, "step": 2673 }, { "crossentropy": 2.83096981048584, "epoch": 0.09694025522041763, "grad_norm": 0.08580205589532852, "grad_norm_var": 0.0034747500792330407, "learning_rate": 0.009848723331380087, "loss": 2.9121, "step": 2674 }, { "crossentropy": 2.660881757736206, "epoch": 0.09697650812064965, "grad_norm": 0.04767775535583496, "grad_norm_var": 0.003480216102681272, "learning_rate": 0.009848581442658894, "loss": 2.691, "step": 2675 }, { "crossentropy": 2.8498950004577637, "epoch": 0.09701276102088167, "grad_norm": 0.04672439768910408, "grad_norm_var": 0.003526873820975739, "learning_rate": 0.009848439488450251, "loss": 2.818, "step": 2676 }, { "crossentropy": 2.639181613922119, "epoch": 0.09704901392111369, "grad_norm": 0.0558202788233757, "grad_norm_var": 0.003517920033524557, "learning_rate": 0.009848297468756076, "loss": 2.7611, "step": 2677 }, { "crossentropy": 2.6389565467834473, "epoch": 0.0970852668213457, "grad_norm": 0.047727927565574646, "grad_norm_var": 0.0033081039213487843, "learning_rate": 0.009848155383578283, "loss": 2.7571, "step": 2678 }, { "crossentropy": 2.736511468887329, "epoch": 0.09712151972157773, "grad_norm": 0.04554983973503113, "grad_norm_var": 0.003318870090796988, "learning_rate": 0.009848013232918798, "loss": 2.8015, "step": 2679 }, { "crossentropy": 2.978120803833008, "epoch": 0.09715777262180975, "grad_norm": 0.047803398221731186, "grad_norm_var": 0.0030179658922520913, "learning_rate": 0.009847871016779535, "loss": 2.9184, "step": 2680 }, { "crossentropy": 2.8004531860351562, "epoch": 0.09719402552204176, "grad_norm": 0.05741800740361214, "grad_norm_var": 0.00015362759033762547, "learning_rate": 0.009847728735162417, "loss": 2.8485, "step": 2681 }, { "crossentropy": 2.812868595123291, "epoch": 0.09723027842227379, "grad_norm": 0.05237552523612976, "grad_norm_var": 0.00015461511564841707, "learning_rate": 0.009847586388069367, "loss": 2.8065, "step": 2682 }, { "crossentropy": 2.841041088104248, "epoch": 0.0972665313225058, "grad_norm": 0.04896192252635956, "grad_norm_var": 0.000139614731069825, "learning_rate": 0.009847443975502305, "loss": 2.8029, "step": 2683 }, { "crossentropy": 2.7711803913116455, "epoch": 0.09730278422273782, "grad_norm": 0.05012571066617966, "grad_norm_var": 0.00014064053114718107, "learning_rate": 0.009847301497463157, "loss": 2.8266, "step": 2684 }, { "crossentropy": 2.700584650039673, "epoch": 0.09733903712296983, "grad_norm": 0.04204072058200836, "grad_norm_var": 0.0001503687273097909, "learning_rate": 0.009847158953953845, "loss": 2.7291, "step": 2685 }, { "crossentropy": 3.051398515701294, "epoch": 0.09737529002320186, "grad_norm": 0.06005614995956421, "grad_norm_var": 0.00015275962137176416, "learning_rate": 0.009847016344976296, "loss": 2.9409, "step": 2686 }, { "crossentropy": 2.906297206878662, "epoch": 0.09741154292343387, "grad_norm": 0.0577937550842762, "grad_norm_var": 0.0001499718558625925, "learning_rate": 0.009846873670532438, "loss": 2.9516, "step": 2687 }, { "crossentropy": 2.6855950355529785, "epoch": 0.09744779582366589, "grad_norm": 0.04427836462855339, "grad_norm_var": 0.00015596113536143203, "learning_rate": 0.009846730930624192, "loss": 2.7494, "step": 2688 }, { "crossentropy": 2.5822575092315674, "epoch": 0.09748404872389792, "grad_norm": 0.04422275722026825, "grad_norm_var": 0.00010938697220528309, "learning_rate": 0.00984658812525349, "loss": 2.6461, "step": 2689 }, { "crossentropy": 2.709714889526367, "epoch": 0.09752030162412993, "grad_norm": 0.04459523409605026, "grad_norm_var": 3.0612177964157934e-05, "learning_rate": 0.009846445254422263, "loss": 2.7775, "step": 2690 }, { "crossentropy": 2.704359531402588, "epoch": 0.09755655452436195, "grad_norm": 0.04466864839196205, "grad_norm_var": 3.193859134413187e-05, "learning_rate": 0.009846302318132437, "loss": 2.6999, "step": 2691 }, { "crossentropy": 2.7983431816101074, "epoch": 0.09759280742459396, "grad_norm": 0.20616231858730316, "grad_norm_var": 0.0015641531302597817, "learning_rate": 0.009846159316385942, "loss": 2.7945, "step": 2692 }, { "crossentropy": 2.821295738220215, "epoch": 0.09762906032482599, "grad_norm": 0.04558052495121956, "grad_norm_var": 0.0015755255930416988, "learning_rate": 0.009846016249184712, "loss": 2.7647, "step": 2693 }, { "crossentropy": 2.8243908882141113, "epoch": 0.097665313225058, "grad_norm": 0.08223211765289307, "grad_norm_var": 0.0015994103888223937, "learning_rate": 0.009845873116530679, "loss": 2.8263, "step": 2694 }, { "crossentropy": 2.6230387687683105, "epoch": 0.09770156612529002, "grad_norm": 0.04751306027173996, "grad_norm_var": 0.0015956419312932245, "learning_rate": 0.009845729918425776, "loss": 2.7178, "step": 2695 }, { "crossentropy": 2.8646035194396973, "epoch": 0.09773781902552205, "grad_norm": 0.044745638966560364, "grad_norm_var": 0.00160160219305449, "learning_rate": 0.009845586654871936, "loss": 2.8619, "step": 2696 }, { "crossentropy": 2.7992372512817383, "epoch": 0.09777407192575406, "grad_norm": 0.043967295438051224, "grad_norm_var": 0.0016189718458926583, "learning_rate": 0.009845443325871095, "loss": 2.8316, "step": 2697 }, { "crossentropy": 2.781268835067749, "epoch": 0.09781032482598608, "grad_norm": 0.049298133701086044, "grad_norm_var": 0.0016226747628280821, "learning_rate": 0.009845299931425189, "loss": 2.7868, "step": 2698 }, { "crossentropy": 2.895667552947998, "epoch": 0.0978465777262181, "grad_norm": 0.13664378225803375, "grad_norm_var": 0.001976881970114654, "learning_rate": 0.009845156471536155, "loss": 2.836, "step": 2699 }, { "crossentropy": 2.8471596240997314, "epoch": 0.09788283062645012, "grad_norm": 0.05799508094787598, "grad_norm_var": 0.001964888225837415, "learning_rate": 0.00984501294620593, "loss": 2.7997, "step": 2700 }, { "crossentropy": 2.6435751914978027, "epoch": 0.09791908352668213, "grad_norm": 0.08182884752750397, "grad_norm_var": 0.001938120398267993, "learning_rate": 0.00984486935543645, "loss": 2.7374, "step": 2701 }, { "crossentropy": 2.9309256076812744, "epoch": 0.09795533642691415, "grad_norm": 0.04713074490427971, "grad_norm_var": 0.0019626381519246877, "learning_rate": 0.009844725699229659, "loss": 2.9379, "step": 2702 }, { "crossentropy": 2.765491247177124, "epoch": 0.09799158932714618, "grad_norm": 0.06241033226251602, "grad_norm_var": 0.0019580472777536956, "learning_rate": 0.009844581977587495, "loss": 2.8122, "step": 2703 }, { "crossentropy": 2.827014446258545, "epoch": 0.09802784222737819, "grad_norm": 0.11931081861257553, "grad_norm_var": 0.0020755508250780118, "learning_rate": 0.0098444381905119, "loss": 2.8061, "step": 2704 }, { "crossentropy": 2.774564743041992, "epoch": 0.0980640951276102, "grad_norm": 0.054595038294792175, "grad_norm_var": 0.0020433147165921615, "learning_rate": 0.009844294338004815, "loss": 2.791, "step": 2705 }, { "crossentropy": 2.781716823577881, "epoch": 0.09810034802784223, "grad_norm": 0.04465625062584877, "grad_norm_var": 0.0020430835166994456, "learning_rate": 0.009844150420068185, "loss": 2.7769, "step": 2706 }, { "crossentropy": 2.756274938583374, "epoch": 0.09813660092807425, "grad_norm": 0.0610186941921711, "grad_norm_var": 0.0019979281079554423, "learning_rate": 0.00984400643670395, "loss": 2.8068, "step": 2707 }, { "crossentropy": 2.8343353271484375, "epoch": 0.09817285382830626, "grad_norm": 0.06451935321092606, "grad_norm_var": 0.0007571520463094225, "learning_rate": 0.009843862387914057, "loss": 2.832, "step": 2708 }, { "crossentropy": 2.6122114658355713, "epoch": 0.09820910672853829, "grad_norm": 0.04665111005306244, "grad_norm_var": 0.0007544209129994891, "learning_rate": 0.009843718273700454, "loss": 2.7045, "step": 2709 }, { "crossentropy": 2.711665630340576, "epoch": 0.0982453596287703, "grad_norm": 0.046684034168720245, "grad_norm_var": 0.0007530621001293346, "learning_rate": 0.009843574094065083, "loss": 2.8041, "step": 2710 }, { "crossentropy": 2.790569305419922, "epoch": 0.09828161252900232, "grad_norm": 0.0477580651640892, "grad_norm_var": 0.0007525579582288837, "learning_rate": 0.009843429849009894, "loss": 2.8539, "step": 2711 }, { "crossentropy": 2.918860673904419, "epoch": 0.09831786542923433, "grad_norm": 0.04488568753004074, "grad_norm_var": 0.0007522169018921928, "learning_rate": 0.009843285538536835, "loss": 2.9134, "step": 2712 }, { "crossentropy": 2.7505195140838623, "epoch": 0.09835411832946636, "grad_norm": 0.04105174168944359, "grad_norm_var": 0.0007601798427318183, "learning_rate": 0.009843141162647855, "loss": 2.75, "step": 2713 }, { "crossentropy": 2.9090731143951416, "epoch": 0.09839037122969838, "grad_norm": 0.04309578239917755, "grad_norm_var": 0.0007738345884440473, "learning_rate": 0.009842996721344904, "loss": 2.8317, "step": 2714 }, { "crossentropy": 2.8954975605010986, "epoch": 0.09842662412993039, "grad_norm": 0.044730380177497864, "grad_norm_var": 0.00039337848743759284, "learning_rate": 0.009842852214629933, "loss": 2.8693, "step": 2715 }, { "crossentropy": 2.904491424560547, "epoch": 0.09846287703016242, "grad_norm": 0.05456558242440224, "grad_norm_var": 0.00039355344632045617, "learning_rate": 0.009842707642504892, "loss": 2.8307, "step": 2716 }, { "crossentropy": 2.858431577682495, "epoch": 0.09849912993039443, "grad_norm": 0.04216447472572327, "grad_norm_var": 0.0003582236288388718, "learning_rate": 0.009842563004971737, "loss": 2.7887, "step": 2717 }, { "crossentropy": 2.8039169311523438, "epoch": 0.09853538283062645, "grad_norm": 0.042348939925432205, "grad_norm_var": 0.0003640813283543621, "learning_rate": 0.009842418302032419, "loss": 2.8012, "step": 2718 }, { "crossentropy": 2.803234815597534, "epoch": 0.09857163573085846, "grad_norm": 0.04266241565346718, "grad_norm_var": 0.0003657253989938008, "learning_rate": 0.009842273533688892, "loss": 2.7954, "step": 2719 }, { "crossentropy": 2.8162965774536133, "epoch": 0.09860788863109049, "grad_norm": 0.04924614727497101, "grad_norm_var": 4.8805553076549285e-05, "learning_rate": 0.009842128699943114, "loss": 2.8432, "step": 2720 }, { "crossentropy": 2.6542229652404785, "epoch": 0.0986441415313225, "grad_norm": 0.04227708652615547, "grad_norm_var": 4.77274985695987e-05, "learning_rate": 0.00984198380079704, "loss": 2.7338, "step": 2721 }, { "crossentropy": 2.7710580825805664, "epoch": 0.09868039443155452, "grad_norm": 0.04367785528302193, "grad_norm_var": 4.814456975818082e-05, "learning_rate": 0.009841838836252626, "loss": 2.7959, "step": 2722 }, { "crossentropy": 2.661463737487793, "epoch": 0.09871664733178655, "grad_norm": 0.04390029236674309, "grad_norm_var": 3.522392203099174e-05, "learning_rate": 0.009841693806311832, "loss": 2.7221, "step": 2723 }, { "crossentropy": 2.9590437412261963, "epoch": 0.09875290023201856, "grad_norm": 0.045835189521312714, "grad_norm_var": 1.1563624468377496e-05, "learning_rate": 0.009841548710976613, "loss": 2.8289, "step": 2724 }, { "crossentropy": 2.7711150646209717, "epoch": 0.09878915313225058, "grad_norm": 0.04376066103577614, "grad_norm_var": 1.1486434826578813e-05, "learning_rate": 0.009841403550248934, "loss": 2.7622, "step": 2725 }, { "crossentropy": 2.6172914505004883, "epoch": 0.0988254060324826, "grad_norm": 0.04373082518577576, "grad_norm_var": 1.1335054725530637e-05, "learning_rate": 0.00984125832413075, "loss": 2.6782, "step": 2726 }, { "crossentropy": 2.7831666469573975, "epoch": 0.09886165893271462, "grad_norm": 0.04499201849102974, "grad_norm_var": 1.0696730315467156e-05, "learning_rate": 0.009841113032624029, "loss": 2.7532, "step": 2727 }, { "crossentropy": 2.942209005355835, "epoch": 0.09889791183294663, "grad_norm": 0.04730084165930748, "grad_norm_var": 1.1166871790587128e-05, "learning_rate": 0.009840967675730727, "loss": 2.8683, "step": 2728 }, { "crossentropy": 2.8683693408966064, "epoch": 0.09893416473317865, "grad_norm": 0.046950846910476685, "grad_norm_var": 1.0465415178784592e-05, "learning_rate": 0.009840822253452812, "loss": 2.8707, "step": 2729 }, { "crossentropy": 2.841892719268799, "epoch": 0.09897041763341068, "grad_norm": 0.05384386330842972, "grad_norm_var": 1.4845597219547273e-05, "learning_rate": 0.009840676765792245, "loss": 2.8545, "step": 2730 }, { "crossentropy": 2.8013975620269775, "epoch": 0.09900667053364269, "grad_norm": 0.04544874653220177, "grad_norm_var": 1.4780264254189249e-05, "learning_rate": 0.009840531212750993, "loss": 2.8148, "step": 2731 }, { "crossentropy": 2.9242281913757324, "epoch": 0.0990429234338747, "grad_norm": 0.04258325323462486, "grad_norm_var": 9.740090890991473e-06, "learning_rate": 0.009840385594331022, "loss": 2.8418, "step": 2732 }, { "crossentropy": 2.783345937728882, "epoch": 0.09907917633410673, "grad_norm": 0.051645427942276, "grad_norm_var": 1.1716497280206175e-05, "learning_rate": 0.009840239910534296, "loss": 2.8178, "step": 2733 }, { "crossentropy": 2.832395076751709, "epoch": 0.09911542923433875, "grad_norm": 0.05859242007136345, "grad_norm_var": 2.1084210966328584e-05, "learning_rate": 0.009840094161362787, "loss": 2.7557, "step": 2734 }, { "crossentropy": 2.8173725605010986, "epoch": 0.09915168213457076, "grad_norm": 0.05018714815378189, "grad_norm_var": 2.0619323787886683e-05, "learning_rate": 0.009839948346818459, "loss": 2.8106, "step": 2735 }, { "crossentropy": 2.8853352069854736, "epoch": 0.09918793503480279, "grad_norm": 0.04780663549900055, "grad_norm_var": 2.034138531835644e-05, "learning_rate": 0.009839802466903285, "loss": 2.7003, "step": 2736 }, { "crossentropy": 2.9932327270507812, "epoch": 0.0992241879350348, "grad_norm": 0.04370036721229553, "grad_norm_var": 1.956539937873943e-05, "learning_rate": 0.009839656521619234, "loss": 2.9222, "step": 2737 }, { "crossentropy": 2.7623019218444824, "epoch": 0.09926044083526682, "grad_norm": 0.046087149530649185, "grad_norm_var": 1.88217104563367e-05, "learning_rate": 0.009839510510968277, "loss": 2.8057, "step": 2738 }, { "crossentropy": 2.718104362487793, "epoch": 0.09929669373549883, "grad_norm": 0.050016745924949646, "grad_norm_var": 1.840948151277105e-05, "learning_rate": 0.009839364434952386, "loss": 2.6943, "step": 2739 }, { "crossentropy": 2.598681688308716, "epoch": 0.09933294663573086, "grad_norm": 0.046113185584545135, "grad_norm_var": 1.8346853314691636e-05, "learning_rate": 0.009839218293573536, "loss": 2.6462, "step": 2740 }, { "crossentropy": 2.8858141899108887, "epoch": 0.09936919953596288, "grad_norm": 0.043631404638290405, "grad_norm_var": 1.8415315024106206e-05, "learning_rate": 0.009839072086833696, "loss": 2.7784, "step": 2741 }, { "crossentropy": 2.765559673309326, "epoch": 0.09940545243619489, "grad_norm": 0.04234877973794937, "grad_norm_var": 1.925954920755993e-05, "learning_rate": 0.009838925814734846, "loss": 2.7259, "step": 2742 }, { "crossentropy": 2.747652530670166, "epoch": 0.09944170533642692, "grad_norm": 0.041328754276037216, "grad_norm_var": 2.1361378194370563e-05, "learning_rate": 0.009838779477278958, "loss": 2.6952, "step": 2743 }, { "crossentropy": 2.957989454269409, "epoch": 0.09947795823665893, "grad_norm": 0.044760193675756454, "grad_norm_var": 2.1781155985601633e-05, "learning_rate": 0.009838633074468013, "loss": 2.9316, "step": 2744 }, { "crossentropy": 2.712306261062622, "epoch": 0.09951421113689095, "grad_norm": 0.04081099480390549, "grad_norm_var": 2.43333012736655e-05, "learning_rate": 0.009838486606303983, "loss": 2.6618, "step": 2745 }, { "crossentropy": 2.813807964324951, "epoch": 0.09955046403712298, "grad_norm": 0.10347356647253036, "grad_norm_var": 0.00022484537372384738, "learning_rate": 0.009838340072788849, "loss": 2.7105, "step": 2746 }, { "crossentropy": 2.6409595012664795, "epoch": 0.09958671693735499, "grad_norm": 0.044013820588588715, "grad_norm_var": 0.00022582730279458585, "learning_rate": 0.00983819347392459, "loss": 2.7134, "step": 2747 }, { "crossentropy": 2.6050195693969727, "epoch": 0.099622969837587, "grad_norm": 0.0433153472840786, "grad_norm_var": 0.00022515452611506637, "learning_rate": 0.009838046809713187, "loss": 2.7394, "step": 2748 }, { "crossentropy": 2.710235118865967, "epoch": 0.09965922273781902, "grad_norm": 0.03984808176755905, "grad_norm_var": 0.0002310517432752165, "learning_rate": 0.00983790008015662, "loss": 2.7684, "step": 2749 }, { "crossentropy": 2.8952584266662598, "epoch": 0.09969547563805105, "grad_norm": 0.044053006917238235, "grad_norm_var": 0.00022591466239999658, "learning_rate": 0.009837753285256868, "loss": 2.7804, "step": 2750 }, { "crossentropy": 2.981821060180664, "epoch": 0.09973172853828306, "grad_norm": 0.040991466492414474, "grad_norm_var": 0.00022878589323393274, "learning_rate": 0.009837606425015919, "loss": 2.9282, "step": 2751 }, { "crossentropy": 2.748004674911499, "epoch": 0.09976798143851508, "grad_norm": 0.04629148170351982, "grad_norm_var": 0.00022889646140397443, "learning_rate": 0.009837459499435753, "loss": 2.8133, "step": 2752 }, { "crossentropy": 2.745981216430664, "epoch": 0.0998042343387471, "grad_norm": 0.04492815583944321, "grad_norm_var": 0.00022836063340575755, "learning_rate": 0.009837312508518355, "loss": 2.7231, "step": 2753 }, { "crossentropy": 3.0138373374938965, "epoch": 0.09984048723897912, "grad_norm": 0.041319433599710464, "grad_norm_var": 0.00023075941480796505, "learning_rate": 0.009837165452265711, "loss": 2.8986, "step": 2754 }, { "crossentropy": 2.84169602394104, "epoch": 0.09987674013921113, "grad_norm": 0.0398620143532753, "grad_norm_var": 0.00023356355687961325, "learning_rate": 0.009837018330679806, "loss": 2.7807, "step": 2755 }, { "crossentropy": 2.7600927352905273, "epoch": 0.09991299303944315, "grad_norm": 0.04432624578475952, "grad_norm_var": 0.00023390129992151697, "learning_rate": 0.009836871143762629, "loss": 2.7285, "step": 2756 }, { "crossentropy": 2.906977415084839, "epoch": 0.09994924593967518, "grad_norm": 0.045957159250974655, "grad_norm_var": 0.0002333245687095029, "learning_rate": 0.009836723891516167, "loss": 2.7686, "step": 2757 }, { "crossentropy": 2.7798125743865967, "epoch": 0.09998549883990719, "grad_norm": 0.0444333478808403, "grad_norm_var": 0.0002323793253557691, "learning_rate": 0.009836576573942409, "loss": 2.7687, "step": 2758 }, { "crossentropy": 2.8946664333343506, "epoch": 0.1000217517401392, "grad_norm": 0.04338553547859192, "grad_norm_var": 0.00023112765155974493, "learning_rate": 0.009836429191043343, "loss": 2.9127, "step": 2759 }, { "crossentropy": 2.8123250007629395, "epoch": 0.10005800464037123, "grad_norm": 0.06663741171360016, "grad_norm_var": 0.0002545494708705002, "learning_rate": 0.009836281742820963, "loss": 2.8022, "step": 2760 }, { "crossentropy": 2.983687162399292, "epoch": 0.10009425754060325, "grad_norm": 0.04297683760523796, "grad_norm_var": 0.00025266469432548886, "learning_rate": 0.009836134229277258, "loss": 2.8183, "step": 2761 }, { "crossentropy": 2.8060154914855957, "epoch": 0.10013051044083526, "grad_norm": 0.056758999824523926, "grad_norm_var": 4.6573689908369016e-05, "learning_rate": 0.009835986650414222, "loss": 2.7792, "step": 2762 }, { "crossentropy": 2.720247507095337, "epoch": 0.10016676334106729, "grad_norm": 0.07358037680387497, "grad_norm_var": 9.508057093482545e-05, "learning_rate": 0.009835839006233846, "loss": 2.7462, "step": 2763 }, { "crossentropy": 2.7704639434814453, "epoch": 0.1002030162412993, "grad_norm": 0.054957564920186996, "grad_norm_var": 9.718560869930258e-05, "learning_rate": 0.009835691296738128, "loss": 2.7612, "step": 2764 }, { "crossentropy": 2.673055648803711, "epoch": 0.10023926914153132, "grad_norm": 3.656028985977173, "grad_norm_var": 0.8133949235254015, "learning_rate": 0.009835543521929058, "loss": 2.7637, "step": 2765 }, { "crossentropy": 2.6867990493774414, "epoch": 0.10027552204176333, "grad_norm": 0.05058072507381439, "grad_norm_var": 0.8131973141551795, "learning_rate": 0.009835395681808636, "loss": 2.7121, "step": 2766 }, { "crossentropy": 2.781404733657837, "epoch": 0.10031177494199536, "grad_norm": 0.13049617409706116, "grad_norm_var": 0.8109105680982723, "learning_rate": 0.009835247776378857, "loss": 2.7688, "step": 2767 }, { "crossentropy": 2.9017844200134277, "epoch": 0.10034802784222738, "grad_norm": 0.06648435443639755, "grad_norm_var": 0.8103063956301894, "learning_rate": 0.009835099805641718, "loss": 2.8011, "step": 2768 }, { "crossentropy": 2.744807720184326, "epoch": 0.10038428074245939, "grad_norm": 0.04959871619939804, "grad_norm_var": 0.8101604860157557, "learning_rate": 0.009834951769599221, "loss": 2.7889, "step": 2769 }, { "crossentropy": 2.6768314838409424, "epoch": 0.10042053364269142, "grad_norm": 0.05408298224210739, "grad_norm_var": 0.8097615670113724, "learning_rate": 0.00983480366825336, "loss": 2.7524, "step": 2770 }, { "crossentropy": 2.821357250213623, "epoch": 0.10045678654292343, "grad_norm": 0.061428602784872055, "grad_norm_var": 0.809092893896687, "learning_rate": 0.009834655501606141, "loss": 2.7669, "step": 2771 }, { "crossentropy": 2.80722713470459, "epoch": 0.10049303944315545, "grad_norm": 0.0657881423830986, "grad_norm_var": 0.8084362439443699, "learning_rate": 0.009834507269659561, "loss": 2.7803, "step": 2772 }, { "crossentropy": 2.451810836791992, "epoch": 0.10052929234338748, "grad_norm": 0.06174592301249504, "grad_norm_var": 0.8079481809368434, "learning_rate": 0.009834358972415625, "loss": 2.553, "step": 2773 }, { "crossentropy": 2.830296277999878, "epoch": 0.10056554524361949, "grad_norm": 0.051832932978868484, "grad_norm_var": 0.8077130878414667, "learning_rate": 0.009834210609876335, "loss": 2.825, "step": 2774 }, { "crossentropy": 2.780677080154419, "epoch": 0.1006017981438515, "grad_norm": 0.045558150857686996, "grad_norm_var": 0.8076429141556354, "learning_rate": 0.009834062182043692, "loss": 2.8191, "step": 2775 }, { "crossentropy": 2.706183433532715, "epoch": 0.10063805104408352, "grad_norm": 0.050308384001255035, "grad_norm_var": 0.8081388819739848, "learning_rate": 0.009833913688919704, "loss": 2.7464, "step": 2776 }, { "crossentropy": 2.7741808891296387, "epoch": 0.10067430394431555, "grad_norm": 0.04635798931121826, "grad_norm_var": 0.8080301435768438, "learning_rate": 0.009833765130506378, "loss": 2.8307, "step": 2777 }, { "crossentropy": 2.7463648319244385, "epoch": 0.10071055684454756, "grad_norm": 0.047243911772966385, "grad_norm_var": 0.8083266026512383, "learning_rate": 0.009833616506805717, "loss": 2.7829, "step": 2778 }, { "crossentropy": 2.845959186553955, "epoch": 0.10074680974477958, "grad_norm": 0.05611741915345192, "grad_norm_var": 0.8088388144933091, "learning_rate": 0.00983346781781973, "loss": 2.7964, "step": 2779 }, { "crossentropy": 2.523970127105713, "epoch": 0.1007830626450116, "grad_norm": 0.05315594747662544, "grad_norm_var": 0.8088941061634493, "learning_rate": 0.009833319063550424, "loss": 2.7299, "step": 2780 }, { "crossentropy": 2.7152061462402344, "epoch": 0.10081931554524362, "grad_norm": 0.061348769813776016, "grad_norm_var": 0.0004040100073409936, "learning_rate": 0.00983317024399981, "loss": 2.7329, "step": 2781 }, { "crossentropy": 2.82492995262146, "epoch": 0.10085556844547564, "grad_norm": 0.048517856746912, "grad_norm_var": 0.0004067314298244348, "learning_rate": 0.009833021359169898, "loss": 2.8873, "step": 2782 }, { "crossentropy": 2.899949789047241, "epoch": 0.10089182134570766, "grad_norm": 0.04596574977040291, "grad_norm_var": 5.177807258090264e-05, "learning_rate": 0.009832872409062698, "loss": 2.8516, "step": 2783 }, { "crossentropy": 2.7408905029296875, "epoch": 0.10092807424593968, "grad_norm": 0.06842858344316483, "grad_norm_var": 5.5225766678338587e-05, "learning_rate": 0.00983272339368022, "loss": 2.7877, "step": 2784 }, { "crossentropy": 2.7688851356506348, "epoch": 0.10096432714617169, "grad_norm": 0.04697131738066673, "grad_norm_var": 5.727527118279483e-05, "learning_rate": 0.009832574313024482, "loss": 2.8106, "step": 2785 }, { "crossentropy": 2.8003110885620117, "epoch": 0.1010005800464037, "grad_norm": 0.052817452698946, "grad_norm_var": 5.737035905281421e-05, "learning_rate": 0.009832425167097493, "loss": 2.7971, "step": 2786 }, { "crossentropy": 2.804638147354126, "epoch": 0.10103683294663574, "grad_norm": 0.04906392842531204, "grad_norm_var": 5.463617373009094e-05, "learning_rate": 0.009832275955901266, "loss": 2.7778, "step": 2787 }, { "crossentropy": 2.7835896015167236, "epoch": 0.10107308584686775, "grad_norm": 0.047881364822387695, "grad_norm_var": 4.462524242207355e-05, "learning_rate": 0.009832126679437822, "loss": 2.8202, "step": 2788 }, { "crossentropy": 2.8539154529571533, "epoch": 0.10110933874709976, "grad_norm": 0.05021444335579872, "grad_norm_var": 3.8077957606612714e-05, "learning_rate": 0.009831977337709172, "loss": 2.869, "step": 2789 }, { "crossentropy": 2.8447937965393066, "epoch": 0.10114559164733179, "grad_norm": 0.0462467223405838, "grad_norm_var": 3.9677189818556245e-05, "learning_rate": 0.009831827930717338, "loss": 2.8165, "step": 2790 }, { "crossentropy": 2.789548873901367, "epoch": 0.1011818445475638, "grad_norm": 0.048361886292696, "grad_norm_var": 3.8129538094738615e-05, "learning_rate": 0.009831678458464334, "loss": 2.7516, "step": 2791 }, { "crossentropy": 2.8492069244384766, "epoch": 0.10121809744779582, "grad_norm": 0.04978431016206741, "grad_norm_var": 3.8208141032536104e-05, "learning_rate": 0.00983152892095218, "loss": 2.7814, "step": 2792 }, { "crossentropy": 2.7367210388183594, "epoch": 0.10125435034802784, "grad_norm": 0.04526596888899803, "grad_norm_var": 3.898110928016307e-05, "learning_rate": 0.009831379318182894, "loss": 2.7924, "step": 2793 }, { "crossentropy": 2.7471072673797607, "epoch": 0.10129060324825986, "grad_norm": 0.05623091757297516, "grad_norm_var": 3.942443073638724e-05, "learning_rate": 0.0098312296501585, "loss": 2.6806, "step": 2794 }, { "crossentropy": 2.8974106311798096, "epoch": 0.10132685614849188, "grad_norm": 0.04738877713680267, "grad_norm_var": 3.898499794769318e-05, "learning_rate": 0.009831079916881019, "loss": 2.8961, "step": 2795 }, { "crossentropy": 2.7002196311950684, "epoch": 0.10136310904872389, "grad_norm": 0.05150826647877693, "grad_norm_var": 3.8703607593583836e-05, "learning_rate": 0.009830930118352472, "loss": 2.754, "step": 2796 }, { "crossentropy": 2.9414916038513184, "epoch": 0.10139936194895592, "grad_norm": 0.04312445595860481, "grad_norm_var": 3.431433509351641e-05, "learning_rate": 0.009830780254574881, "loss": 2.855, "step": 2797 }, { "crossentropy": 2.799345016479492, "epoch": 0.10143561484918794, "grad_norm": 0.044722314924001694, "grad_norm_var": 3.5894319749037576e-05, "learning_rate": 0.009830630325550272, "loss": 2.8643, "step": 2798 }, { "crossentropy": 2.775853157043457, "epoch": 0.10147186774941995, "grad_norm": 0.044502582401037216, "grad_norm_var": 3.674171579635995e-05, "learning_rate": 0.00983048033128067, "loss": 2.8347, "step": 2799 }, { "crossentropy": 2.8107070922851562, "epoch": 0.10150812064965198, "grad_norm": 0.04373829811811447, "grad_norm_var": 1.2634346942099322e-05, "learning_rate": 0.009830330271768103, "loss": 2.7313, "step": 2800 }, { "crossentropy": 2.4503848552703857, "epoch": 0.10154437354988399, "grad_norm": 0.04390037804841995, "grad_norm_var": 1.3640437180685814e-05, "learning_rate": 0.00983018014701459, "loss": 2.6341, "step": 2801 }, { "crossentropy": 2.7688851356506348, "epoch": 0.10158062645011601, "grad_norm": 0.05627133697271347, "grad_norm_var": 1.669802587017695e-05, "learning_rate": 0.009830029957022169, "loss": 2.7235, "step": 2802 }, { "crossentropy": 2.597235679626465, "epoch": 0.10161687935034802, "grad_norm": 0.04078215733170509, "grad_norm_var": 1.9824144645038076e-05, "learning_rate": 0.009829879701792861, "loss": 2.6702, "step": 2803 }, { "crossentropy": 2.876559257507324, "epoch": 0.10165313225058005, "grad_norm": 0.04297657310962677, "grad_norm_var": 2.1075205023451718e-05, "learning_rate": 0.0098297293813287, "loss": 2.9009, "step": 2804 }, { "crossentropy": 2.744718551635742, "epoch": 0.10168938515081206, "grad_norm": 0.04817599803209305, "grad_norm_var": 2.0512536958275315e-05, "learning_rate": 0.009829578995631714, "loss": 2.7451, "step": 2805 }, { "crossentropy": 2.7516727447509766, "epoch": 0.10172563805104408, "grad_norm": 0.06882578134536743, "grad_norm_var": 4.992355939838067e-05, "learning_rate": 0.009829428544703934, "loss": 2.6428, "step": 2806 }, { "crossentropy": 2.829620122909546, "epoch": 0.10176189095127611, "grad_norm": 0.04594641551375389, "grad_norm_var": 5.0323840203995936e-05, "learning_rate": 0.009829278028547392, "loss": 2.8139, "step": 2807 }, { "crossentropy": 2.71112322807312, "epoch": 0.10179814385150812, "grad_norm": 0.04509901627898216, "grad_norm_var": 5.078203358119237e-05, "learning_rate": 0.009829127447164122, "loss": 2.7808, "step": 2808 }, { "crossentropy": 2.696422815322876, "epoch": 0.10183439675174014, "grad_norm": 0.044995471835136414, "grad_norm_var": 5.088624812492979e-05, "learning_rate": 0.009828976800556156, "loss": 2.6612, "step": 2809 }, { "crossentropy": 2.8133671283721924, "epoch": 0.10187064965197216, "grad_norm": 0.04321419447660446, "grad_norm_var": 4.721113723596121e-05, "learning_rate": 0.009828826088725531, "loss": 2.8137, "step": 2810 }, { "crossentropy": 2.7440929412841797, "epoch": 0.10190690255220418, "grad_norm": 0.08260762691497803, "grad_norm_var": 0.0001256287794100969, "learning_rate": 0.009828675311674282, "loss": 2.7776, "step": 2811 }, { "crossentropy": 2.847595691680908, "epoch": 0.1019431554524362, "grad_norm": 0.044626712799072266, "grad_norm_var": 0.00012665357216505303, "learning_rate": 0.009828524469404446, "loss": 2.8418, "step": 2812 }, { "crossentropy": 2.7345666885375977, "epoch": 0.10197940835266821, "grad_norm": 0.07422897964715958, "grad_norm_var": 0.00016288150571111312, "learning_rate": 0.009828373561918059, "loss": 2.7742, "step": 2813 }, { "crossentropy": 2.6695640087127686, "epoch": 0.10201566125290024, "grad_norm": 0.048630911856889725, "grad_norm_var": 0.0001606098838299721, "learning_rate": 0.009828222589217159, "loss": 2.7293, "step": 2814 }, { "crossentropy": 2.822613477706909, "epoch": 0.10205191415313225, "grad_norm": 0.047693781554698944, "grad_norm_var": 0.0001584146812648177, "learning_rate": 0.009828071551303786, "loss": 2.8252, "step": 2815 }, { "crossentropy": 2.93569278717041, "epoch": 0.10208816705336426, "grad_norm": 0.05273224785923958, "grad_norm_var": 0.0001543339583137237, "learning_rate": 0.00982792044817998, "loss": 2.8124, "step": 2816 }, { "crossentropy": 2.8320541381835938, "epoch": 0.1021244199535963, "grad_norm": 0.044081300497055054, "grad_norm_var": 0.0001541425655566669, "learning_rate": 0.009827769279847782, "loss": 2.795, "step": 2817 }, { "crossentropy": 2.800241470336914, "epoch": 0.10216067285382831, "grad_norm": 0.05093300715088844, "grad_norm_var": 0.00015283398913640254, "learning_rate": 0.009827618046309232, "loss": 2.8198, "step": 2818 }, { "crossentropy": 2.732579231262207, "epoch": 0.10219692575406032, "grad_norm": 0.0519140362739563, "grad_norm_var": 0.000144527142956106, "learning_rate": 0.009827466747566375, "loss": 2.8026, "step": 2819 }, { "crossentropy": 2.9120774269104004, "epoch": 0.10223317865429234, "grad_norm": 0.0572199672460556, "grad_norm_var": 0.00013951448551079894, "learning_rate": 0.009827315383621256, "loss": 2.8119, "step": 2820 }, { "crossentropy": 2.9055676460266113, "epoch": 0.10226943155452436, "grad_norm": 0.04952308163046837, "grad_norm_var": 0.00013872861541946608, "learning_rate": 0.009827163954475913, "loss": 2.838, "step": 2821 }, { "crossentropy": 2.816213369369507, "epoch": 0.10230568445475638, "grad_norm": 0.04588263854384422, "grad_norm_var": 0.0001240323092731416, "learning_rate": 0.009827012460132399, "loss": 2.8219, "step": 2822 }, { "crossentropy": 2.966238498687744, "epoch": 0.1023419373549884, "grad_norm": 0.12245016545057297, "grad_norm_var": 0.0004297868027018231, "learning_rate": 0.009826860900592753, "loss": 2.7717, "step": 2823 }, { "crossentropy": 2.8403115272521973, "epoch": 0.10237819025522042, "grad_norm": 0.04976339265704155, "grad_norm_var": 0.0004239848604602044, "learning_rate": 0.009826709275859028, "loss": 2.8173, "step": 2824 }, { "crossentropy": 2.7708826065063477, "epoch": 0.10241444315545244, "grad_norm": 0.0753774419426918, "grad_norm_var": 0.00043342727517166037, "learning_rate": 0.009826557585933269, "loss": 2.801, "step": 2825 }, { "crossentropy": 2.7235019207000732, "epoch": 0.10245069605568445, "grad_norm": 0.0441504567861557, "grad_norm_var": 0.00043153578806097805, "learning_rate": 0.009826405830817526, "loss": 2.7943, "step": 2826 }, { "crossentropy": 2.8633439540863037, "epoch": 0.10248694895591648, "grad_norm": 0.04253912717103958, "grad_norm_var": 0.0004050262904092603, "learning_rate": 0.009826254010513846, "loss": 2.7134, "step": 2827 }, { "crossentropy": 2.7277684211730957, "epoch": 0.1025232018561485, "grad_norm": 0.0687323659658432, "grad_norm_var": 0.0004036347020817656, "learning_rate": 0.009826102125024282, "loss": 2.6609, "step": 2828 }, { "crossentropy": 2.936413049697876, "epoch": 0.10255945475638051, "grad_norm": 0.04639800265431404, "grad_norm_var": 0.00039132449397713954, "learning_rate": 0.009825950174350886, "loss": 2.7866, "step": 2829 }, { "crossentropy": 2.639741897583008, "epoch": 0.10259570765661252, "grad_norm": 0.04293258488178253, "grad_norm_var": 0.0003990488035477274, "learning_rate": 0.00982579815849571, "loss": 2.7067, "step": 2830 }, { "crossentropy": 2.736928701400757, "epoch": 0.10263196055684455, "grad_norm": 0.04270734265446663, "grad_norm_var": 0.00040597253151842486, "learning_rate": 0.009825646077460804, "loss": 2.7721, "step": 2831 }, { "crossentropy": 2.6742451190948486, "epoch": 0.10266821345707657, "grad_norm": 0.041197046637535095, "grad_norm_var": 0.00041848199610258183, "learning_rate": 0.009825493931248225, "loss": 2.7347, "step": 2832 }, { "crossentropy": 2.817559242248535, "epoch": 0.10270446635730858, "grad_norm": 0.04281633719801903, "grad_norm_var": 0.0004203793185071164, "learning_rate": 0.009825341719860029, "loss": 2.832, "step": 2833 }, { "crossentropy": 2.806425094604492, "epoch": 0.10274071925754061, "grad_norm": 0.045651666820049286, "grad_norm_var": 0.00042474605977353024, "learning_rate": 0.009825189443298268, "loss": 2.7884, "step": 2834 }, { "crossentropy": 2.839984178543091, "epoch": 0.10277697215777262, "grad_norm": 0.04850802570581436, "grad_norm_var": 0.00042656759857915096, "learning_rate": 0.009825037101565003, "loss": 2.7443, "step": 2835 }, { "crossentropy": 2.6992270946502686, "epoch": 0.10281322505800464, "grad_norm": 0.04443592578172684, "grad_norm_var": 0.0004314905709084084, "learning_rate": 0.009824884694662289, "loss": 2.7821, "step": 2836 }, { "crossentropy": 2.692091226577759, "epoch": 0.10284947795823667, "grad_norm": 0.04375550150871277, "grad_norm_var": 0.0004364868884584251, "learning_rate": 0.009824732222592185, "loss": 2.7913, "step": 2837 }, { "crossentropy": 2.761460065841675, "epoch": 0.10288573085846868, "grad_norm": 0.04684516414999962, "grad_norm_var": 0.0004356370035053051, "learning_rate": 0.009824579685356752, "loss": 2.7162, "step": 2838 }, { "crossentropy": 2.7755277156829834, "epoch": 0.1029219837587007, "grad_norm": 0.043947912752628326, "grad_norm_var": 9.403758835779674e-05, "learning_rate": 0.009824427082958047, "loss": 2.7039, "step": 2839 }, { "crossentropy": 2.7759952545166016, "epoch": 0.10295823665893271, "grad_norm": 0.04914882034063339, "grad_norm_var": 9.392570190885328e-05, "learning_rate": 0.009824274415398133, "loss": 2.8128, "step": 2840 }, { "crossentropy": 2.7235031127929688, "epoch": 0.10299448955916474, "grad_norm": 0.04272300750017166, "grad_norm_var": 4.1682119924679506e-05, "learning_rate": 0.009824121682679073, "loss": 2.7987, "step": 2841 }, { "crossentropy": 2.78230881690979, "epoch": 0.10303074245939675, "grad_norm": 0.044379446655511856, "grad_norm_var": 4.1627993291744266e-05, "learning_rate": 0.009823968884802928, "loss": 2.8335, "step": 2842 }, { "crossentropy": 2.752535343170166, "epoch": 0.10306699535962877, "grad_norm": 0.04049699380993843, "grad_norm_var": 4.284320293360654e-05, "learning_rate": 0.009823816021771762, "loss": 2.7896, "step": 2843 }, { "crossentropy": 2.863199234008789, "epoch": 0.1031032482598608, "grad_norm": 0.04463754594326019, "grad_norm_var": 5.831438981151784e-06, "learning_rate": 0.009823663093587643, "loss": 2.8451, "step": 2844 }, { "crossentropy": 2.7940895557403564, "epoch": 0.10313950116009281, "grad_norm": 0.048184338957071304, "grad_norm_var": 6.5040577125620445e-06, "learning_rate": 0.009823510100252631, "loss": 2.7844, "step": 2845 }, { "crossentropy": 2.8753793239593506, "epoch": 0.10317575406032482, "grad_norm": 0.047674790024757385, "grad_norm_var": 6.903992981593712e-06, "learning_rate": 0.009823357041768796, "loss": 2.825, "step": 2846 }, { "crossentropy": 2.7609877586364746, "epoch": 0.10321200696055685, "grad_norm": 0.046201154589653015, "grad_norm_var": 6.683044476257724e-06, "learning_rate": 0.009823203918138205, "loss": 2.8268, "step": 2847 }, { "crossentropy": 2.7873780727386475, "epoch": 0.10324825986078887, "grad_norm": 0.047725994139909744, "grad_norm_var": 6.003825776455113e-06, "learning_rate": 0.009823050729362926, "loss": 2.8055, "step": 2848 }, { "crossentropy": 2.8897597789764404, "epoch": 0.10328451276102088, "grad_norm": 0.06707213819026947, "grad_norm_var": 3.4271390057517316e-05, "learning_rate": 0.009822897475445026, "loss": 2.8475, "step": 2849 }, { "crossentropy": 2.7679171562194824, "epoch": 0.1033207656612529, "grad_norm": 0.045315299183130264, "grad_norm_var": 3.4337218647713616e-05, "learning_rate": 0.009822744156386577, "loss": 2.7913, "step": 2850 }, { "crossentropy": 2.6768946647644043, "epoch": 0.10335701856148492, "grad_norm": 0.043709419667720795, "grad_norm_var": 3.477361960528442e-05, "learning_rate": 0.00982259077218965, "loss": 2.738, "step": 2851 }, { "crossentropy": 2.793975591659546, "epoch": 0.10339327146171694, "grad_norm": 0.044560614973306656, "grad_norm_var": 3.473793410754585e-05, "learning_rate": 0.009822437322856316, "loss": 2.7693, "step": 2852 }, { "crossentropy": 2.608473539352417, "epoch": 0.10342952436194895, "grad_norm": 0.05337978154420853, "grad_norm_var": 3.681453148774895e-05, "learning_rate": 0.009822283808388647, "loss": 2.7047, "step": 2853 }, { "crossentropy": 2.8505616188049316, "epoch": 0.10346577726218098, "grad_norm": 0.06525897979736328, "grad_norm_var": 5.701200466131836e-05, "learning_rate": 0.009822130228788719, "loss": 2.7038, "step": 2854 }, { "crossentropy": 2.6664505004882812, "epoch": 0.103502030162413, "grad_norm": 0.06726578623056412, "grad_norm_var": 7.714978986878193e-05, "learning_rate": 0.009821976584058601, "loss": 2.7627, "step": 2855 }, { "crossentropy": 2.865973472595215, "epoch": 0.10353828306264501, "grad_norm": 0.047439686954021454, "grad_norm_var": 7.749405899024804e-05, "learning_rate": 0.009821822874200374, "loss": 2.7489, "step": 2856 }, { "crossentropy": 2.6752254962921143, "epoch": 0.10357453596287702, "grad_norm": 0.047939226031303406, "grad_norm_var": 7.430628791322072e-05, "learning_rate": 0.009821669099216111, "loss": 2.7241, "step": 2857 }, { "crossentropy": 2.809119939804077, "epoch": 0.10361078886310905, "grad_norm": 0.04953885078430176, "grad_norm_var": 7.205014415311936e-05, "learning_rate": 0.00982151525910789, "loss": 2.7873, "step": 2858 }, { "crossentropy": 2.833568572998047, "epoch": 0.10364704176334107, "grad_norm": 0.049674078822135925, "grad_norm_var": 6.519634816315826e-05, "learning_rate": 0.009821361353877789, "loss": 2.7988, "step": 2859 }, { "crossentropy": 2.8294174671173096, "epoch": 0.10368329466357308, "grad_norm": 0.04173947498202324, "grad_norm_var": 6.816958742818692e-05, "learning_rate": 0.009821207383527887, "loss": 2.8036, "step": 2860 }, { "crossentropy": 2.9254467487335205, "epoch": 0.10371954756380511, "grad_norm": 0.041996072977781296, "grad_norm_var": 7.271498154035044e-05, "learning_rate": 0.009821053348060261, "loss": 2.8256, "step": 2861 }, { "crossentropy": 2.6777291297912598, "epoch": 0.10375580046403712, "grad_norm": 0.042454272508621216, "grad_norm_var": 7.63192524952646e-05, "learning_rate": 0.009820899247476993, "loss": 2.68, "step": 2862 }, { "crossentropy": 2.7800097465515137, "epoch": 0.10379205336426914, "grad_norm": 0.04605358466506004, "grad_norm_var": 7.639692239861569e-05, "learning_rate": 0.009820745081780166, "loss": 2.8362, "step": 2863 }, { "crossentropy": 2.8434579372406006, "epoch": 0.10382830626450117, "grad_norm": 0.05237068980932236, "grad_norm_var": 7.629349517887605e-05, "learning_rate": 0.00982059085097186, "loss": 2.9019, "step": 2864 }, { "crossentropy": 2.9581193923950195, "epoch": 0.10386455916473318, "grad_norm": 0.07861165702342987, "grad_norm_var": 0.00011032859871746934, "learning_rate": 0.009820436555054159, "loss": 2.9625, "step": 2865 }, { "crossentropy": 2.9211196899414062, "epoch": 0.1039008120649652, "grad_norm": 0.0532059445977211, "grad_norm_var": 0.0001081532234995887, "learning_rate": 0.009820282194029148, "loss": 2.8755, "step": 2866 }, { "crossentropy": 2.9045987129211426, "epoch": 0.10393706496519721, "grad_norm": 0.045902762562036514, "grad_norm_var": 0.00010615367460516337, "learning_rate": 0.009820127767898913, "loss": 2.863, "step": 2867 }, { "crossentropy": 2.759711265563965, "epoch": 0.10397331786542924, "grad_norm": 0.04809049516916275, "grad_norm_var": 0.00010356663943438237, "learning_rate": 0.009819973276665533, "loss": 2.7023, "step": 2868 }, { "crossentropy": 2.6832709312438965, "epoch": 0.10400957076566125, "grad_norm": 0.0472918264567852, "grad_norm_var": 0.00010470835945909079, "learning_rate": 0.009819818720331101, "loss": 2.7595, "step": 2869 }, { "crossentropy": 2.8125319480895996, "epoch": 0.10404582366589327, "grad_norm": 0.048662636429071426, "grad_norm_var": 9.159203250348284e-05, "learning_rate": 0.009819664098897705, "loss": 2.7054, "step": 2870 }, { "crossentropy": 2.7191615104675293, "epoch": 0.1040820765661253, "grad_norm": 0.04603579267859459, "grad_norm_var": 7.234517120890167e-05, "learning_rate": 0.009819509412367429, "loss": 2.7516, "step": 2871 }, { "crossentropy": 2.762213706970215, "epoch": 0.10411832946635731, "grad_norm": 0.05161125585436821, "grad_norm_var": 7.246040056432789e-05, "learning_rate": 0.009819354660742366, "loss": 2.7969, "step": 2872 }, { "crossentropy": 2.851093053817749, "epoch": 0.10415458236658932, "grad_norm": 0.05845686048269272, "grad_norm_var": 7.725742821218425e-05, "learning_rate": 0.009819199844024603, "loss": 2.8025, "step": 2873 }, { "crossentropy": 2.6920247077941895, "epoch": 0.10419083526682135, "grad_norm": 0.07339175045490265, "grad_norm_var": 0.00011101367500491178, "learning_rate": 0.009819044962216235, "loss": 2.7733, "step": 2874 }, { "crossentropy": 2.730698823928833, "epoch": 0.10422708816705337, "grad_norm": 0.06697384268045425, "grad_norm_var": 0.0001252837217929253, "learning_rate": 0.00981889001531935, "loss": 2.8253, "step": 2875 }, { "crossentropy": 2.8903679847717285, "epoch": 0.10426334106728538, "grad_norm": 0.04731031134724617, "grad_norm_var": 0.00011909842000025576, "learning_rate": 0.009818735003336042, "loss": 2.826, "step": 2876 }, { "crossentropy": 2.7678990364074707, "epoch": 0.1042995939675174, "grad_norm": 0.09773942083120346, "grad_norm_var": 0.00023132488607034376, "learning_rate": 0.009818579926268405, "loss": 2.6679, "step": 2877 }, { "crossentropy": 2.744133234024048, "epoch": 0.10433584686774942, "grad_norm": 0.04863615706562996, "grad_norm_var": 0.00022212775593547507, "learning_rate": 0.009818424784118535, "loss": 2.7429, "step": 2878 }, { "crossentropy": 2.823092460632324, "epoch": 0.10437209976798144, "grad_norm": 0.05339444428682327, "grad_norm_var": 0.00021488286665636793, "learning_rate": 0.009818269576888526, "loss": 2.804, "step": 2879 }, { "crossentropy": 2.7247893810272217, "epoch": 0.10440835266821345, "grad_norm": 0.041156187653541565, "grad_norm_var": 0.0002301966036309055, "learning_rate": 0.009818114304580473, "loss": 2.7163, "step": 2880 }, { "crossentropy": 2.7553179264068604, "epoch": 0.10444460556844548, "grad_norm": 0.04894036799669266, "grad_norm_var": 0.000198354241882373, "learning_rate": 0.009817958967196476, "loss": 2.7272, "step": 2881 }, { "crossentropy": 2.650158405303955, "epoch": 0.1044808584686775, "grad_norm": 0.05722113326191902, "grad_norm_var": 0.00019850845765004264, "learning_rate": 0.009817803564738629, "loss": 2.6645, "step": 2882 }, { "crossentropy": 2.4727892875671387, "epoch": 0.10451711136890951, "grad_norm": 0.05783938616514206, "grad_norm_var": 0.00019285384357992924, "learning_rate": 0.009817648097209038, "loss": 2.6383, "step": 2883 }, { "crossentropy": 2.7109415531158447, "epoch": 0.10455336426914154, "grad_norm": 0.04983852058649063, "grad_norm_var": 0.00019124866453488113, "learning_rate": 0.009817492564609795, "loss": 2.6997, "step": 2884 }, { "crossentropy": 2.7719168663024902, "epoch": 0.10458961716937355, "grad_norm": 0.04357413202524185, "grad_norm_var": 0.00019638259542599583, "learning_rate": 0.009817336966943004, "loss": 2.7862, "step": 2885 }, { "crossentropy": 2.8016624450683594, "epoch": 0.10462587006960557, "grad_norm": 0.04763470217585564, "grad_norm_var": 0.00019740958332725155, "learning_rate": 0.009817181304210769, "loss": 2.8022, "step": 2886 }, { "crossentropy": 2.905891180038452, "epoch": 0.10466212296983758, "grad_norm": 0.04137806594371796, "grad_norm_var": 0.0002047111340814889, "learning_rate": 0.009817025576415188, "loss": 2.7717, "step": 2887 }, { "crossentropy": 2.854328155517578, "epoch": 0.10469837587006961, "grad_norm": 0.0483282133936882, "grad_norm_var": 0.00020700760213571885, "learning_rate": 0.009816869783558368, "loss": 2.8154, "step": 2888 }, { "crossentropy": 2.8375911712646484, "epoch": 0.10473462877030162, "grad_norm": 0.04906788468360901, "grad_norm_var": 0.00020833152965169313, "learning_rate": 0.009816713925642411, "loss": 2.8328, "step": 2889 }, { "crossentropy": 2.8170199394226074, "epoch": 0.10477088167053364, "grad_norm": 0.04416891932487488, "grad_norm_var": 0.00018819888551923416, "learning_rate": 0.009816558002669422, "loss": 2.7821, "step": 2890 }, { "crossentropy": 2.7390379905700684, "epoch": 0.10480713457076567, "grad_norm": 0.042892564088106155, "grad_norm_var": 0.00017861248892356939, "learning_rate": 0.00981640201464151, "loss": 2.7834, "step": 2891 }, { "crossentropy": 2.7964625358581543, "epoch": 0.10484338747099768, "grad_norm": 0.038903843611478806, "grad_norm_var": 0.00018738351275054496, "learning_rate": 0.009816245961560778, "loss": 2.6461, "step": 2892 }, { "crossentropy": 2.8188095092773438, "epoch": 0.1048796403712297, "grad_norm": 0.05386398732662201, "grad_norm_var": 3.2338319744530604e-05, "learning_rate": 0.009816089843429337, "loss": 2.7295, "step": 2893 }, { "crossentropy": 2.787662982940674, "epoch": 0.10491589327146171, "grad_norm": 0.047363754361867905, "grad_norm_var": 3.2319265692364445e-05, "learning_rate": 0.009815933660249292, "loss": 2.819, "step": 2894 }, { "crossentropy": 2.725740432739258, "epoch": 0.10495214617169374, "grad_norm": 0.05053756758570671, "grad_norm_var": 3.0716595325140186e-05, "learning_rate": 0.009815777412022756, "loss": 2.7529, "step": 2895 }, { "crossentropy": 2.688425302505493, "epoch": 0.10498839907192575, "grad_norm": 0.04420386627316475, "grad_norm_var": 2.865045619406901e-05, "learning_rate": 0.009815621098751837, "loss": 2.7187, "step": 2896 }, { "crossentropy": 2.7995851039886475, "epoch": 0.10502465197215777, "grad_norm": 0.04322732985019684, "grad_norm_var": 2.9867276520048094e-05, "learning_rate": 0.009815464720438646, "loss": 2.8059, "step": 2897 }, { "crossentropy": 2.7875285148620605, "epoch": 0.1050609048723898, "grad_norm": 0.03979140892624855, "grad_norm_var": 2.6269297885312715e-05, "learning_rate": 0.009815308277085297, "loss": 2.8042, "step": 2898 }, { "crossentropy": 2.832693338394165, "epoch": 0.10509715777262181, "grad_norm": 0.04170074686408043, "grad_norm_var": 1.796109442369076e-05, "learning_rate": 0.009815151768693903, "loss": 2.7413, "step": 2899 }, { "crossentropy": 2.7459089756011963, "epoch": 0.10513341067285382, "grad_norm": 0.0444474034011364, "grad_norm_var": 1.659051773379803e-05, "learning_rate": 0.009814995195266575, "loss": 2.783, "step": 2900 }, { "crossentropy": 2.6335127353668213, "epoch": 0.10516966357308585, "grad_norm": 0.04857906326651573, "grad_norm_var": 1.7159357651659834e-05, "learning_rate": 0.009814838556805433, "loss": 2.6118, "step": 2901 }, { "crossentropy": 2.7540409564971924, "epoch": 0.10520591647331787, "grad_norm": 0.042710088193416595, "grad_norm_var": 1.719500738909709e-05, "learning_rate": 0.009814681853312588, "loss": 2.7643, "step": 2902 }, { "crossentropy": 2.716484308242798, "epoch": 0.10524216937354988, "grad_norm": 0.045336637645959854, "grad_norm_var": 1.6224287616319286e-05, "learning_rate": 0.009814525084790159, "loss": 2.7781, "step": 2903 }, { "crossentropy": 2.840708017349243, "epoch": 0.1052784222737819, "grad_norm": 0.04990792274475098, "grad_norm_var": 1.701382576595282e-05, "learning_rate": 0.009814368251240262, "loss": 2.7569, "step": 2904 }, { "crossentropy": 2.668790102005005, "epoch": 0.10531467517401392, "grad_norm": 0.050556447356939316, "grad_norm_var": 1.787653946660004e-05, "learning_rate": 0.009814211352665015, "loss": 2.7165, "step": 2905 }, { "crossentropy": 2.619995594024658, "epoch": 0.10535092807424594, "grad_norm": 0.04995075613260269, "grad_norm_var": 1.8930516975619072e-05, "learning_rate": 0.009814054389066539, "loss": 2.6821, "step": 2906 }, { "crossentropy": 2.758716106414795, "epoch": 0.10538718097447795, "grad_norm": 0.0441238097846508, "grad_norm_var": 1.8535923050611892e-05, "learning_rate": 0.00981389736044695, "loss": 2.7512, "step": 2907 }, { "crossentropy": 2.84596848487854, "epoch": 0.10542343387470998, "grad_norm": 0.04351704567670822, "grad_norm_var": 1.5531801250515814e-05, "learning_rate": 0.009813740266808374, "loss": 2.7987, "step": 2908 }, { "crossentropy": 2.7969789505004883, "epoch": 0.105459686774942, "grad_norm": 0.05081101506948471, "grad_norm_var": 1.3010334419954102e-05, "learning_rate": 0.00981358310815293, "loss": 2.7985, "step": 2909 }, { "crossentropy": 2.8219707012176514, "epoch": 0.10549593967517401, "grad_norm": 0.04495716094970703, "grad_norm_var": 1.2950054147077502e-05, "learning_rate": 0.009813425884482743, "loss": 2.7851, "step": 2910 }, { "crossentropy": 2.7854573726654053, "epoch": 0.10553219257540604, "grad_norm": 0.04261245205998421, "grad_norm_var": 1.1972329659072113e-05, "learning_rate": 0.009813268595799934, "loss": 2.7381, "step": 2911 }, { "crossentropy": 2.7429206371307373, "epoch": 0.10556844547563805, "grad_norm": 0.04712897166609764, "grad_norm_var": 1.203977763191786e-05, "learning_rate": 0.009813111242106628, "loss": 2.7439, "step": 2912 }, { "crossentropy": 2.6681578159332275, "epoch": 0.10560469837587007, "grad_norm": 0.050624165683984756, "grad_norm_var": 1.3134218810295408e-05, "learning_rate": 0.00981295382340495, "loss": 2.6931, "step": 2913 }, { "crossentropy": 2.7412912845611572, "epoch": 0.10564095127610208, "grad_norm": 0.048631779849529266, "grad_norm_var": 1.0644934591934272e-05, "learning_rate": 0.009812796339697026, "loss": 2.8079, "step": 2914 }, { "crossentropy": 2.9316630363464355, "epoch": 0.10567720417633411, "grad_norm": 0.04571322724223137, "grad_norm_var": 9.030248499266163e-06, "learning_rate": 0.009812638790984984, "loss": 2.8068, "step": 2915 }, { "crossentropy": 2.8599436283111572, "epoch": 0.10571345707656613, "grad_norm": 0.042122889310121536, "grad_norm_var": 1.0112762081361931e-05, "learning_rate": 0.009812481177270953, "loss": 2.7812, "step": 2916 }, { "crossentropy": 2.735982894897461, "epoch": 0.10574970997679814, "grad_norm": 0.045246727764606476, "grad_norm_var": 9.974218473195647e-06, "learning_rate": 0.009812323498557058, "loss": 2.5732, "step": 2917 }, { "crossentropy": 2.7049152851104736, "epoch": 0.10578596287703017, "grad_norm": 0.04360780119895935, "grad_norm_var": 9.571318603579283e-06, "learning_rate": 0.009812165754845434, "loss": 2.7439, "step": 2918 }, { "crossentropy": 2.8513543605804443, "epoch": 0.10582221577726218, "grad_norm": 0.04073595628142357, "grad_norm_var": 1.1640387558996731e-05, "learning_rate": 0.009812007946138209, "loss": 2.7995, "step": 2919 }, { "crossentropy": 2.735322952270508, "epoch": 0.1058584686774942, "grad_norm": 0.057305194437503815, "grad_norm_var": 1.865288849039447e-05, "learning_rate": 0.009811850072437511, "loss": 2.7878, "step": 2920 }, { "crossentropy": 2.8579037189483643, "epoch": 0.10589472157772621, "grad_norm": 0.04075634852051735, "grad_norm_var": 1.965274226967869e-05, "learning_rate": 0.009811692133745479, "loss": 2.7898, "step": 2921 }, { "crossentropy": 2.772751808166504, "epoch": 0.10593097447795824, "grad_norm": 0.040477246046066284, "grad_norm_var": 2.0417296308327993e-05, "learning_rate": 0.00981153413006424, "loss": 2.8179, "step": 2922 }, { "crossentropy": 2.7007641792297363, "epoch": 0.10596722737819025, "grad_norm": 0.042155906558036804, "grad_norm_var": 2.1026528139475528e-05, "learning_rate": 0.009811376061395934, "loss": 2.7455, "step": 2923 }, { "crossentropy": 2.7514336109161377, "epoch": 0.10600348027842227, "grad_norm": 0.04179231449961662, "grad_norm_var": 2.1645514630892297e-05, "learning_rate": 0.009811217927742689, "loss": 2.7622, "step": 2924 }, { "crossentropy": 2.7860538959503174, "epoch": 0.1060397331786543, "grad_norm": 0.04560169205069542, "grad_norm_var": 1.9508513091880063e-05, "learning_rate": 0.009811059729106645, "loss": 2.8557, "step": 2925 }, { "crossentropy": 2.770379066467285, "epoch": 0.10607598607888631, "grad_norm": 0.04506323114037514, "grad_norm_var": 1.9509079036017276e-05, "learning_rate": 0.00981090146548994, "loss": 2.6981, "step": 2926 }, { "crossentropy": 2.710115432739258, "epoch": 0.10611223897911833, "grad_norm": 0.040786806493997574, "grad_norm_var": 2.0292113832204074e-05, "learning_rate": 0.009810743136894711, "loss": 2.6699, "step": 2927 }, { "crossentropy": 2.7837088108062744, "epoch": 0.10614849187935035, "grad_norm": 0.06414908170700073, "grad_norm_var": 4.354784091364772e-05, "learning_rate": 0.009810584743323095, "loss": 2.883, "step": 2928 }, { "crossentropy": 2.787266254425049, "epoch": 0.10618474477958237, "grad_norm": 0.061022475361824036, "grad_norm_var": 5.682332862929606e-05, "learning_rate": 0.00981042628477723, "loss": 2.7975, "step": 2929 }, { "crossentropy": 2.807570695877075, "epoch": 0.10622099767981438, "grad_norm": 0.04088858515024185, "grad_norm_var": 5.844515115524876e-05, "learning_rate": 0.009810267761259258, "loss": 2.7739, "step": 2930 }, { "crossentropy": 2.9664289951324463, "epoch": 0.1062572505800464, "grad_norm": 0.041859909892082214, "grad_norm_var": 5.9566265104526485e-05, "learning_rate": 0.00981010917277132, "loss": 2.866, "step": 2931 }, { "crossentropy": 2.7493033409118652, "epoch": 0.10629350348027843, "grad_norm": 0.05137953534722328, "grad_norm_var": 6.032368305641609e-05, "learning_rate": 0.00980995051931556, "loss": 2.8043, "step": 2932 }, { "crossentropy": 2.7931365966796875, "epoch": 0.10632975638051044, "grad_norm": 0.040988337248563766, "grad_norm_var": 6.212707939865133e-05, "learning_rate": 0.009809791800894117, "loss": 2.7388, "step": 2933 }, { "crossentropy": 2.7247819900512695, "epoch": 0.10636600928074245, "grad_norm": 0.04040256515145302, "grad_norm_var": 6.386017385049027e-05, "learning_rate": 0.009809633017509138, "loss": 2.8156, "step": 2934 }, { "crossentropy": 2.672098398208618, "epoch": 0.10640226218097448, "grad_norm": 0.052536800503730774, "grad_norm_var": 6.434365898981068e-05, "learning_rate": 0.009809474169162766, "loss": 2.7007, "step": 2935 }, { "crossentropy": 2.6957015991210938, "epoch": 0.1064385150812065, "grad_norm": 0.05932119861245155, "grad_norm_var": 6.74489289055358e-05, "learning_rate": 0.009809315255857145, "loss": 2.7282, "step": 2936 }, { "crossentropy": 2.9730546474456787, "epoch": 0.10647476798143851, "grad_norm": 0.06949710845947266, "grad_norm_var": 9.582449954844603e-05, "learning_rate": 0.009809156277594424, "loss": 2.8746, "step": 2937 }, { "crossentropy": 2.7604713439941406, "epoch": 0.10651102088167054, "grad_norm": 0.04231026768684387, "grad_norm_var": 9.404434232541577e-05, "learning_rate": 0.00980899723437675, "loss": 2.7988, "step": 2938 }, { "crossentropy": 2.637773036956787, "epoch": 0.10654727378190255, "grad_norm": 0.04281677305698395, "grad_norm_var": 9.34919415749034e-05, "learning_rate": 0.00980883812620627, "loss": 2.689, "step": 2939 }, { "crossentropy": 2.7558419704437256, "epoch": 0.10658352668213457, "grad_norm": 0.040590256452560425, "grad_norm_var": 9.470156342919195e-05, "learning_rate": 0.009808678953085133, "loss": 2.7526, "step": 2940 }, { "crossentropy": 2.6052024364471436, "epoch": 0.10661977958236658, "grad_norm": 0.04508029296994209, "grad_norm_var": 9.493401202320667e-05, "learning_rate": 0.009808519715015491, "loss": 2.6163, "step": 2941 }, { "crossentropy": 2.6690831184387207, "epoch": 0.10665603248259861, "grad_norm": 0.0443115271627903, "grad_norm_var": 9.533065681316486e-05, "learning_rate": 0.00980836041199949, "loss": 2.7012, "step": 2942 }, { "crossentropy": 2.7760231494903564, "epoch": 0.10669228538283063, "grad_norm": 0.042386189103126526, "grad_norm_var": 9.381981005711799e-05, "learning_rate": 0.009808201044039287, "loss": 2.7641, "step": 2943 }, { "crossentropy": 2.8205184936523438, "epoch": 0.10672853828306264, "grad_norm": 0.048794519156217575, "grad_norm_var": 7.697007339400359e-05, "learning_rate": 0.009808041611137032, "loss": 2.8008, "step": 2944 }, { "crossentropy": 2.8014185428619385, "epoch": 0.10676479118329467, "grad_norm": 0.07356345653533936, "grad_norm_var": 0.00010897367740438825, "learning_rate": 0.009807882113294877, "loss": 2.7894, "step": 2945 }, { "crossentropy": 2.850480318069458, "epoch": 0.10680104408352668, "grad_norm": 0.05097383260726929, "grad_norm_var": 0.00010503449705948625, "learning_rate": 0.00980772255051498, "loss": 2.8425, "step": 2946 }, { "crossentropy": 2.732658863067627, "epoch": 0.1068372969837587, "grad_norm": 0.05445930361747742, "grad_norm_var": 0.00010266596236494177, "learning_rate": 0.009807562922799494, "loss": 2.7264, "step": 2947 }, { "crossentropy": 2.579848289489746, "epoch": 0.10687354988399073, "grad_norm": 0.08795932680368423, "grad_norm_var": 0.00019320370122898796, "learning_rate": 0.009807403230150573, "loss": 2.702, "step": 2948 }, { "crossentropy": 2.817155122756958, "epoch": 0.10690980278422274, "grad_norm": 0.053015097975730896, "grad_norm_var": 0.00018418587029726027, "learning_rate": 0.009807243472570377, "loss": 2.8344, "step": 2949 }, { "crossentropy": 2.780555009841919, "epoch": 0.10694605568445475, "grad_norm": 0.05136848986148834, "grad_norm_var": 0.00017328089436979092, "learning_rate": 0.009807083650061063, "loss": 2.7983, "step": 2950 }, { "crossentropy": 2.779350757598877, "epoch": 0.10698230858468677, "grad_norm": 0.048776719719171524, "grad_norm_var": 0.00017474094120813693, "learning_rate": 0.009806923762624788, "loss": 2.7013, "step": 2951 }, { "crossentropy": 3.074309825897217, "epoch": 0.1070185614849188, "grad_norm": 0.05827444791793823, "grad_norm_var": 0.00017399020997178142, "learning_rate": 0.009806763810263715, "loss": 2.9148, "step": 2952 }, { "crossentropy": 2.75555682182312, "epoch": 0.10705481438515081, "grad_norm": 0.051674675196409225, "grad_norm_var": 0.00015555767370881707, "learning_rate": 0.00980660379298, "loss": 2.807, "step": 2953 }, { "crossentropy": 2.730502128601074, "epoch": 0.10709106728538283, "grad_norm": 0.053548336029052734, "grad_norm_var": 0.00014852401207292488, "learning_rate": 0.009806443710775808, "loss": 2.8027, "step": 2954 }, { "crossentropy": 2.594966173171997, "epoch": 0.10712732018561485, "grad_norm": 0.06084338203072548, "grad_norm_var": 0.0001444191567270205, "learning_rate": 0.009806283563653298, "loss": 2.6883, "step": 2955 }, { "crossentropy": 2.5862104892730713, "epoch": 0.10716357308584687, "grad_norm": 0.04601629450917244, "grad_norm_var": 0.00013648445940681684, "learning_rate": 0.009806123351614637, "loss": 2.6306, "step": 2956 }, { "crossentropy": 2.8374645709991455, "epoch": 0.10719982598607888, "grad_norm": 0.0402841754257679, "grad_norm_var": 0.0001439077336786572, "learning_rate": 0.009805963074661984, "loss": 2.8458, "step": 2957 }, { "crossentropy": 2.7702629566192627, "epoch": 0.1072360788863109, "grad_norm": 0.04056781902909279, "grad_norm_var": 0.00014968998916503417, "learning_rate": 0.009805802732797509, "loss": 2.804, "step": 2958 }, { "crossentropy": 2.5318825244903564, "epoch": 0.10727233178654293, "grad_norm": 0.042836058884859085, "grad_norm_var": 0.00014901161171471087, "learning_rate": 0.009805642326023373, "loss": 2.6307, "step": 2959 }, { "crossentropy": 2.689791202545166, "epoch": 0.10730858468677494, "grad_norm": 0.045559342950582504, "grad_norm_var": 0.00015188303145695982, "learning_rate": 0.009805481854341744, "loss": 2.7365, "step": 2960 }, { "crossentropy": 2.824890375137329, "epoch": 0.10734483758700696, "grad_norm": 0.04328077286481857, "grad_norm_var": 0.00012912700334250977, "learning_rate": 0.009805321317754792, "loss": 2.8337, "step": 2961 }, { "crossentropy": 2.897275686264038, "epoch": 0.10738109048723898, "grad_norm": 0.04091843590140343, "grad_norm_var": 0.0001366075673210199, "learning_rate": 0.009805160716264683, "loss": 2.8639, "step": 2962 }, { "crossentropy": 2.916869640350342, "epoch": 0.107417343387471, "grad_norm": 0.040155380964279175, "grad_norm_var": 0.0001432008699643184, "learning_rate": 0.009805000049873586, "loss": 2.7161, "step": 2963 }, { "crossentropy": 2.510967254638672, "epoch": 0.10745359628770301, "grad_norm": 0.05228894576430321, "grad_norm_var": 4.369756473808588e-05, "learning_rate": 0.00980483931858367, "loss": 2.6209, "step": 2964 }, { "crossentropy": 2.8100979328155518, "epoch": 0.10748984918793504, "grad_norm": 0.04701915383338928, "grad_norm_var": 4.200552871202005e-05, "learning_rate": 0.00980467852239711, "loss": 2.7495, "step": 2965 }, { "crossentropy": 2.6785731315612793, "epoch": 0.10752610208816706, "grad_norm": 0.04635454714298248, "grad_norm_var": 4.1133151292827944e-05, "learning_rate": 0.009804517661316075, "loss": 2.7314, "step": 2966 }, { "crossentropy": 2.736433267593384, "epoch": 0.10756235498839907, "grad_norm": 0.039617862552404404, "grad_norm_var": 4.469460353910988e-05, "learning_rate": 0.009804356735342737, "loss": 2.743, "step": 2967 }, { "crossentropy": 2.757880926132202, "epoch": 0.10759860788863108, "grad_norm": 0.05444110929965973, "grad_norm_var": 3.976232672628528e-05, "learning_rate": 0.00980419574447927, "loss": 2.7479, "step": 2968 }, { "crossentropy": 2.827230453491211, "epoch": 0.10763486078886311, "grad_norm": 0.043493740260601044, "grad_norm_var": 3.839669016044983e-05, "learning_rate": 0.009804034688727847, "loss": 2.797, "step": 2969 }, { "crossentropy": 2.756744861602783, "epoch": 0.10767111368909513, "grad_norm": 0.04158300533890724, "grad_norm_var": 3.542449814367479e-05, "learning_rate": 0.009803873568090649, "loss": 2.7339, "step": 2970 }, { "crossentropy": 2.8094468116760254, "epoch": 0.10770736658932714, "grad_norm": 0.04399099200963974, "grad_norm_var": 1.831354182096987e-05, "learning_rate": 0.009803712382569846, "loss": 2.7157, "step": 2971 }, { "crossentropy": 2.604196548461914, "epoch": 0.10774361948955917, "grad_norm": 0.04282284528017044, "grad_norm_var": 1.8209696009532927e-05, "learning_rate": 0.009803551132167618, "loss": 2.7149, "step": 2972 }, { "crossentropy": 2.655864715576172, "epoch": 0.10777987238979118, "grad_norm": 0.03986842557787895, "grad_norm_var": 1.8430686124650217e-05, "learning_rate": 0.009803389816886143, "loss": 2.7538, "step": 2973 }, { "crossentropy": 2.7571988105773926, "epoch": 0.1078161252900232, "grad_norm": 0.04447898268699646, "grad_norm_var": 1.757089483015352e-05, "learning_rate": 0.009803228436727597, "loss": 2.7778, "step": 2974 }, { "crossentropy": 2.665046215057373, "epoch": 0.10785237819025523, "grad_norm": 0.04848862811923027, "grad_norm_var": 1.846878708701884e-05, "learning_rate": 0.009803066991694163, "loss": 2.6989, "step": 2975 }, { "crossentropy": 2.79135799407959, "epoch": 0.10788863109048724, "grad_norm": 0.04058851674199104, "grad_norm_var": 1.9408848195296274e-05, "learning_rate": 0.009802905481788022, "loss": 2.7527, "step": 2976 }, { "crossentropy": 2.768918991088867, "epoch": 0.10792488399071926, "grad_norm": 0.04290478676557541, "grad_norm_var": 1.9470631717039405e-05, "learning_rate": 0.009802743907011352, "loss": 2.7738, "step": 2977 }, { "crossentropy": 2.7745532989501953, "epoch": 0.10796113689095127, "grad_norm": 0.04655170813202858, "grad_norm_var": 1.89039788876188e-05, "learning_rate": 0.009802582267366338, "loss": 2.8055, "step": 2978 }, { "crossentropy": 2.6723473072052, "epoch": 0.1079973897911833, "grad_norm": 0.038384851068258286, "grad_norm_var": 2.0164618441860747e-05, "learning_rate": 0.009802420562855161, "loss": 2.7005, "step": 2979 }, { "crossentropy": 2.7703166007995605, "epoch": 0.10803364269141531, "grad_norm": 0.04379013925790787, "grad_norm_var": 1.5914933109247426e-05, "learning_rate": 0.009802258793480009, "loss": 2.7775, "step": 2980 }, { "crossentropy": 2.7681453227996826, "epoch": 0.10806989559164733, "grad_norm": 0.042534179985523224, "grad_norm_var": 1.5380852578641446e-05, "learning_rate": 0.009802096959243061, "loss": 2.7932, "step": 2981 }, { "crossentropy": 2.753300428390503, "epoch": 0.10810614849187936, "grad_norm": 0.04362474009394646, "grad_norm_var": 1.4896200787981733e-05, "learning_rate": 0.009801935060146507, "loss": 2.6644, "step": 2982 }, { "crossentropy": 2.7919211387634277, "epoch": 0.10814240139211137, "grad_norm": 0.0444243922829628, "grad_norm_var": 1.3805529523349714e-05, "learning_rate": 0.009801773096192535, "loss": 2.8035, "step": 2983 }, { "crossentropy": 2.5790677070617676, "epoch": 0.10817865429234338, "grad_norm": 0.05548720434308052, "grad_norm_var": 1.5347930575369123e-05, "learning_rate": 0.009801611067383326, "loss": 2.7719, "step": 2984 }, { "crossentropy": 2.7623443603515625, "epoch": 0.1082149071925754, "grad_norm": 0.04339639097452164, "grad_norm_var": 1.535429674281973e-05, "learning_rate": 0.009801448973721077, "loss": 2.7675, "step": 2985 }, { "crossentropy": 2.8353281021118164, "epoch": 0.10825116009280743, "grad_norm": 0.04935769736766815, "grad_norm_var": 1.6696628872555665e-05, "learning_rate": 0.009801286815207973, "loss": 2.8016, "step": 2986 }, { "crossentropy": 2.8275938034057617, "epoch": 0.10828741299303944, "grad_norm": 0.04949015751481056, "grad_norm_var": 1.827329153297752e-05, "learning_rate": 0.009801124591846202, "loss": 2.8328, "step": 2987 }, { "crossentropy": 2.7426931858062744, "epoch": 0.10832366589327146, "grad_norm": 0.050829704850912094, "grad_norm_var": 2.0209838933955778e-05, "learning_rate": 0.009800962303637958, "loss": 2.8053, "step": 2988 }, { "crossentropy": 2.7523815631866455, "epoch": 0.10835991879350348, "grad_norm": 0.0440983921289444, "grad_norm_var": 1.8285875752615857e-05, "learning_rate": 0.009800799950585432, "loss": 2.774, "step": 2989 }, { "crossentropy": 2.6839888095855713, "epoch": 0.1083961716937355, "grad_norm": 0.043924666941165924, "grad_norm_var": 1.8382530483877316e-05, "learning_rate": 0.00980063753269082, "loss": 2.6953, "step": 2990 }, { "crossentropy": 2.76486873626709, "epoch": 0.10843242459396751, "grad_norm": 0.04113324359059334, "grad_norm_var": 1.8825293410464304e-05, "learning_rate": 0.00980047504995631, "loss": 2.7782, "step": 2991 }, { "crossentropy": 2.934413433074951, "epoch": 0.10846867749419954, "grad_norm": 0.04300158843398094, "grad_norm_var": 1.7759390045874827e-05, "learning_rate": 0.009800312502384102, "loss": 2.8169, "step": 2992 }, { "crossentropy": 2.8574628829956055, "epoch": 0.10850493039443156, "grad_norm": 0.04740404710173607, "grad_norm_var": 1.7657676656171867e-05, "learning_rate": 0.009800149889976387, "loss": 2.7277, "step": 2993 }, { "crossentropy": 2.712007999420166, "epoch": 0.10854118329466357, "grad_norm": 0.04888274893164635, "grad_norm_var": 1.833517486165877e-05, "learning_rate": 0.009799987212735364, "loss": 2.676, "step": 2994 }, { "crossentropy": 2.7423346042633057, "epoch": 0.10857743619489559, "grad_norm": 0.04459882900118828, "grad_norm_var": 1.4762049866927767e-05, "learning_rate": 0.009799824470663228, "loss": 2.7623, "step": 2995 }, { "crossentropy": 2.4771063327789307, "epoch": 0.10861368909512761, "grad_norm": 0.04416557028889656, "grad_norm_var": 1.4660307550880501e-05, "learning_rate": 0.00979966166376218, "loss": 2.6573, "step": 2996 }, { "crossentropy": 2.726175546646118, "epoch": 0.10864994199535963, "grad_norm": 0.0456765741109848, "grad_norm_var": 1.3816084563882577e-05, "learning_rate": 0.009799498792034417, "loss": 2.6425, "step": 2997 }, { "crossentropy": 2.7873826026916504, "epoch": 0.10868619489559164, "grad_norm": 0.04536512494087219, "grad_norm_var": 1.3403508640527582e-05, "learning_rate": 0.00979933585548214, "loss": 2.78, "step": 2998 }, { "crossentropy": 2.798384428024292, "epoch": 0.10872244779582367, "grad_norm": 0.04379778355360031, "grad_norm_var": 1.3587029918739277e-05, "learning_rate": 0.009799172854107547, "loss": 2.7962, "step": 2999 }, { "crossentropy": 2.7010037899017334, "epoch": 0.10875870069605569, "grad_norm": 0.04219282045960426, "grad_norm_var": 8.32714390596913e-06, "learning_rate": 0.009799009787912843, "loss": 2.7163, "step": 3000 }, { "crossentropy": 2.6716065406799316, "epoch": 0.1087949535962877, "grad_norm": 0.04176737368106842, "grad_norm_var": 8.940614365116466e-06, "learning_rate": 0.00979884665690023, "loss": 2.6979, "step": 3001 }, { "crossentropy": 2.6695191860198975, "epoch": 0.10883120649651973, "grad_norm": 0.04106684401631355, "grad_norm_var": 8.812421546423404e-06, "learning_rate": 0.009798683461071909, "loss": 2.745, "step": 3002 }, { "crossentropy": 2.67763352394104, "epoch": 0.10886745939675174, "grad_norm": 0.03983927145600319, "grad_norm_var": 8.646313290347485e-06, "learning_rate": 0.009798520200430087, "loss": 2.6742, "step": 3003 }, { "crossentropy": 2.6872243881225586, "epoch": 0.10890371229698376, "grad_norm": 0.04007020220160484, "grad_norm_var": 6.419596177095754e-06, "learning_rate": 0.009798356874976965, "loss": 2.7007, "step": 3004 }, { "crossentropy": 2.7704694271087646, "epoch": 0.10893996519721577, "grad_norm": 0.047706812620162964, "grad_norm_var": 7.4916683366510474e-06, "learning_rate": 0.009798193484714755, "loss": 2.8402, "step": 3005 }, { "crossentropy": 2.7626657485961914, "epoch": 0.1089762180974478, "grad_norm": 0.0618351474404335, "grad_norm_var": 2.7869283714741327e-05, "learning_rate": 0.009798030029645658, "loss": 2.7264, "step": 3006 }, { "crossentropy": 2.6842613220214844, "epoch": 0.10901247099767981, "grad_norm": 0.04576033726334572, "grad_norm_var": 2.6879514375419135e-05, "learning_rate": 0.009797866509771885, "loss": 2.7531, "step": 3007 }, { "crossentropy": 2.732379913330078, "epoch": 0.10904872389791183, "grad_norm": 0.039800822734832764, "grad_norm_var": 2.8456195643048315e-05, "learning_rate": 0.009797702925095644, "loss": 2.7835, "step": 3008 }, { "crossentropy": 2.9001810550689697, "epoch": 0.10908497679814386, "grad_norm": 0.0632711723446846, "grad_norm_var": 4.9286806663147624e-05, "learning_rate": 0.009797539275619145, "loss": 2.8279, "step": 3009 }, { "crossentropy": 2.762809991836548, "epoch": 0.10912122969837587, "grad_norm": 0.051548462361097336, "grad_norm_var": 5.076004425842347e-05, "learning_rate": 0.009797375561344596, "loss": 2.7899, "step": 3010 }, { "crossentropy": 2.714916229248047, "epoch": 0.10915748259860789, "grad_norm": 0.04016520082950592, "grad_norm_var": 5.290791896361321e-05, "learning_rate": 0.00979721178227421, "loss": 2.7224, "step": 3011 }, { "crossentropy": 2.827744245529175, "epoch": 0.10919373549883991, "grad_norm": 0.040412452071905136, "grad_norm_var": 5.4644636334012264e-05, "learning_rate": 0.0097970479384102, "loss": 2.7214, "step": 3012 }, { "crossentropy": 2.8051135540008545, "epoch": 0.10922998839907193, "grad_norm": 0.039277881383895874, "grad_norm_var": 5.7174328137458e-05, "learning_rate": 0.009796884029754776, "loss": 2.7983, "step": 3013 }, { "crossentropy": 2.9605636596679688, "epoch": 0.10926624129930394, "grad_norm": 0.0384494885802269, "grad_norm_var": 6.005025216918586e-05, "learning_rate": 0.009796720056310156, "loss": 2.8657, "step": 3014 }, { "crossentropy": 2.5845813751220703, "epoch": 0.10930249419953596, "grad_norm": 0.04039901867508888, "grad_norm_var": 6.12309908125099e-05, "learning_rate": 0.00979655601807855, "loss": 2.607, "step": 3015 }, { "crossentropy": 2.752268075942993, "epoch": 0.10933874709976799, "grad_norm": 0.08344222605228424, "grad_norm_var": 0.0001543488981311023, "learning_rate": 0.009796391915062177, "loss": 2.8162, "step": 3016 }, { "crossentropy": 2.6112523078918457, "epoch": 0.109375, "grad_norm": 0.04131171107292175, "grad_norm_var": 0.00015469046359822494, "learning_rate": 0.009796227747263254, "loss": 2.7091, "step": 3017 }, { "crossentropy": 2.765984296798706, "epoch": 0.10941125290023201, "grad_norm": 0.04156181588768959, "grad_norm_var": 0.00015430448759429406, "learning_rate": 0.009796063514683994, "loss": 2.6957, "step": 3018 }, { "crossentropy": 2.6867504119873047, "epoch": 0.10944750580046404, "grad_norm": 0.03946588188409805, "grad_norm_var": 0.0001546785744757954, "learning_rate": 0.009795899217326619, "loss": 2.7525, "step": 3019 }, { "crossentropy": 2.790559768676758, "epoch": 0.10948375870069606, "grad_norm": 0.04003546014428139, "grad_norm_var": 0.00015471146824513315, "learning_rate": 0.009795734855193347, "loss": 2.7963, "step": 3020 }, { "crossentropy": 2.8236796855926514, "epoch": 0.10952001160092807, "grad_norm": 0.043522030115127563, "grad_norm_var": 0.0001554968388698339, "learning_rate": 0.009795570428286398, "loss": 2.7524, "step": 3021 }, { "crossentropy": 2.686737537384033, "epoch": 0.10955626450116009, "grad_norm": 0.08666856586933136, "grad_norm_var": 0.00024352176463273792, "learning_rate": 0.009795405936607993, "loss": 2.6838, "step": 3022 }, { "crossentropy": 2.797147750854492, "epoch": 0.10959251740139211, "grad_norm": 0.04142266884446144, "grad_norm_var": 0.0002462494222622356, "learning_rate": 0.009795241380160354, "loss": 2.7262, "step": 3023 }, { "crossentropy": 2.6294634342193604, "epoch": 0.10962877030162413, "grad_norm": 0.04842057451605797, "grad_norm_var": 0.0002412719784364585, "learning_rate": 0.009795076758945703, "loss": 2.7571, "step": 3024 }, { "crossentropy": 2.6445207595825195, "epoch": 0.10966502320185614, "grad_norm": 0.04676741361618042, "grad_norm_var": 0.0002262554894258574, "learning_rate": 0.009794912072966263, "loss": 2.7094, "step": 3025 }, { "crossentropy": 2.9907948970794678, "epoch": 0.10970127610208817, "grad_norm": 0.059622813016176224, "grad_norm_var": 0.00023449551066989134, "learning_rate": 0.009794747322224257, "loss": 2.9844, "step": 3026 }, { "crossentropy": 2.7110962867736816, "epoch": 0.10973752900232019, "grad_norm": 0.044688619673252106, "grad_norm_var": 0.0002309379794135424, "learning_rate": 0.009794582506721916, "loss": 2.727, "step": 3027 }, { "crossentropy": 2.5681440830230713, "epoch": 0.1097737819025522, "grad_norm": 0.04391255974769592, "grad_norm_var": 0.00022794484539745026, "learning_rate": 0.009794417626461459, "loss": 2.5975, "step": 3028 }, { "crossentropy": 2.598137378692627, "epoch": 0.10981003480278423, "grad_norm": 0.04502764344215393, "grad_norm_var": 0.00022279883655312467, "learning_rate": 0.009794252681445119, "loss": 2.6321, "step": 3029 }, { "crossentropy": 2.788966178894043, "epoch": 0.10984628770301624, "grad_norm": 0.0496295765042305, "grad_norm_var": 0.00021481662510119566, "learning_rate": 0.009794087671675118, "loss": 2.6896, "step": 3030 }, { "crossentropy": 2.8559038639068604, "epoch": 0.10988254060324826, "grad_norm": 0.04955557733774185, "grad_norm_var": 0.00020864814985926307, "learning_rate": 0.009793922597153691, "loss": 2.7444, "step": 3031 }, { "crossentropy": 2.759044647216797, "epoch": 0.10991879350348027, "grad_norm": 0.04677136242389679, "grad_norm_var": 0.00013072592556416328, "learning_rate": 0.009793757457883061, "loss": 2.761, "step": 3032 }, { "crossentropy": 2.762392520904541, "epoch": 0.1099550464037123, "grad_norm": 0.04409405216574669, "grad_norm_var": 0.00012871964110674645, "learning_rate": 0.009793592253865464, "loss": 2.7182, "step": 3033 }, { "crossentropy": 2.763887405395508, "epoch": 0.10999129930394431, "grad_norm": 0.09875355660915375, "grad_norm_var": 0.0002825465953762235, "learning_rate": 0.009793426985103128, "loss": 2.8735, "step": 3034 }, { "crossentropy": 2.8037729263305664, "epoch": 0.11002755220417633, "grad_norm": 0.054786693304777145, "grad_norm_var": 0.00027207760914396115, "learning_rate": 0.009793261651598287, "loss": 2.7533, "step": 3035 }, { "crossentropy": 2.512193202972412, "epoch": 0.11006380510440836, "grad_norm": 0.046340975910425186, "grad_norm_var": 0.0002638898739708237, "learning_rate": 0.009793096253353173, "loss": 2.5487, "step": 3036 }, { "crossentropy": 2.6675543785095215, "epoch": 0.11010005800464037, "grad_norm": 0.04446558281779289, "grad_norm_var": 0.0002627375165463264, "learning_rate": 0.009792930790370021, "loss": 2.6948, "step": 3037 }, { "crossentropy": 2.709214210510254, "epoch": 0.11013631090487239, "grad_norm": 0.040904249995946884, "grad_norm_var": 0.0001893100259470389, "learning_rate": 0.009792765262651064, "loss": 2.7162, "step": 3038 }, { "crossentropy": 2.796372890472412, "epoch": 0.11017256380510441, "grad_norm": 0.04423074051737785, "grad_norm_var": 0.0001864705815410703, "learning_rate": 0.009792599670198538, "loss": 2.7861, "step": 3039 }, { "crossentropy": 2.7786450386047363, "epoch": 0.11020881670533643, "grad_norm": 0.04563290253281593, "grad_norm_var": 0.0001877285264036993, "learning_rate": 0.009792434013014682, "loss": 2.7138, "step": 3040 }, { "crossentropy": 2.7276549339294434, "epoch": 0.11024506960556844, "grad_norm": 0.042443569749593735, "grad_norm_var": 0.00019094743109439265, "learning_rate": 0.009792268291101733, "loss": 2.7425, "step": 3041 }, { "crossentropy": 2.627185344696045, "epoch": 0.11028132250580046, "grad_norm": 0.04277242347598076, "grad_norm_var": 0.0001871944827132154, "learning_rate": 0.009792102504461926, "loss": 2.6849, "step": 3042 }, { "crossentropy": 2.8131191730499268, "epoch": 0.11031757540603249, "grad_norm": 0.04084349423646927, "grad_norm_var": 0.00019032924084249884, "learning_rate": 0.009791936653097504, "loss": 2.818, "step": 3043 }, { "crossentropy": 2.582956075668335, "epoch": 0.1103538283062645, "grad_norm": 0.0618695393204689, "grad_norm_var": 0.00019887576680413006, "learning_rate": 0.009791770737010704, "loss": 2.7006, "step": 3044 }, { "crossentropy": 2.6707956790924072, "epoch": 0.11039008120649652, "grad_norm": 0.047038733959198, "grad_norm_var": 0.00019782670713778023, "learning_rate": 0.009791604756203769, "loss": 2.7465, "step": 3045 }, { "crossentropy": 2.595756769180298, "epoch": 0.11042633410672854, "grad_norm": 0.041782088577747345, "grad_norm_var": 0.0002020719343154837, "learning_rate": 0.00979143871067894, "loss": 2.6462, "step": 3046 }, { "crossentropy": 2.8463735580444336, "epoch": 0.11046258700696056, "grad_norm": 0.03956054151058197, "grad_norm_var": 0.00020826544774092445, "learning_rate": 0.009791272600438461, "loss": 2.8026, "step": 3047 }, { "crossentropy": 2.63525390625, "epoch": 0.11049883990719257, "grad_norm": 0.04448259249329567, "grad_norm_var": 0.00020924035850144624, "learning_rate": 0.009791106425484574, "loss": 2.6926, "step": 3048 }, { "crossentropy": 2.7324206829071045, "epoch": 0.1105350928074246, "grad_norm": 0.041431933641433716, "grad_norm_var": 0.00021133595139309665, "learning_rate": 0.009790940185819523, "loss": 2.644, "step": 3049 }, { "crossentropy": 2.7248547077178955, "epoch": 0.11057134570765662, "grad_norm": 0.04142328351736069, "grad_norm_var": 3.325845398708712e-05, "learning_rate": 0.009790773881445556, "loss": 2.6815, "step": 3050 }, { "crossentropy": 2.7331814765930176, "epoch": 0.11060759860788863, "grad_norm": 0.04319186881184578, "grad_norm_var": 2.6531855345805246e-05, "learning_rate": 0.009790607512364916, "loss": 2.779, "step": 3051 }, { "crossentropy": 2.888073444366455, "epoch": 0.11064385150812064, "grad_norm": 0.052766527980566025, "grad_norm_var": 3.088156503142595e-05, "learning_rate": 0.009790441078579853, "loss": 2.8284, "step": 3052 }, { "crossentropy": 2.5411689281463623, "epoch": 0.11068010440835267, "grad_norm": 0.04300723597407341, "grad_norm_var": 3.1055695890563374e-05, "learning_rate": 0.009790274580092613, "loss": 2.6437, "step": 3053 }, { "crossentropy": 2.5318000316619873, "epoch": 0.11071635730858469, "grad_norm": 0.04770341515541077, "grad_norm_var": 3.060695168901409e-05, "learning_rate": 0.009790108016905443, "loss": 2.5684, "step": 3054 }, { "crossentropy": 2.609668016433716, "epoch": 0.1107526102088167, "grad_norm": 0.04240797460079193, "grad_norm_var": 3.10043114317833e-05, "learning_rate": 0.009789941389020599, "loss": 2.7096, "step": 3055 }, { "crossentropy": 2.647202253341675, "epoch": 0.11078886310904873, "grad_norm": 0.04050059989094734, "grad_norm_var": 3.214727329809749e-05, "learning_rate": 0.009789774696440323, "loss": 2.7155, "step": 3056 }, { "crossentropy": 2.7394931316375732, "epoch": 0.11082511600928074, "grad_norm": 0.04233100637793541, "grad_norm_var": 3.2180078891131245e-05, "learning_rate": 0.009789607939166873, "loss": 2.7039, "step": 3057 }, { "crossentropy": 2.7143383026123047, "epoch": 0.11086136890951276, "grad_norm": 0.08225994557142258, "grad_norm_var": 0.00012017207631286862, "learning_rate": 0.0097894411172025, "loss": 2.7237, "step": 3058 }, { "crossentropy": 2.6791210174560547, "epoch": 0.11089762180974477, "grad_norm": 0.042332153767347336, "grad_norm_var": 0.00011908113788786085, "learning_rate": 0.009789274230549457, "loss": 2.7207, "step": 3059 }, { "crossentropy": 2.8670356273651123, "epoch": 0.1109338747099768, "grad_norm": 0.04251738637685776, "grad_norm_var": 0.00010445703184839879, "learning_rate": 0.009789107279209997, "loss": 2.8273, "step": 3060 }, { "crossentropy": 2.583526372909546, "epoch": 0.11097012761020882, "grad_norm": 0.04183942824602127, "grad_norm_var": 0.00010537177766561697, "learning_rate": 0.009788940263186375, "loss": 2.6858, "step": 3061 }, { "crossentropy": 2.785621166229248, "epoch": 0.11100638051044083, "grad_norm": 0.040835313498973846, "grad_norm_var": 0.00010590927275265684, "learning_rate": 0.009788773182480849, "loss": 2.7674, "step": 3062 }, { "crossentropy": 2.8196969032287598, "epoch": 0.11104263341067286, "grad_norm": 0.059869296848773956, "grad_norm_var": 0.00011550399558440204, "learning_rate": 0.00978860603709567, "loss": 2.8104, "step": 3063 }, { "crossentropy": 2.7975659370422363, "epoch": 0.11107888631090487, "grad_norm": 0.04236678034067154, "grad_norm_var": 0.0001164393092830766, "learning_rate": 0.009788438827033103, "loss": 2.7332, "step": 3064 }, { "crossentropy": 2.763715982437134, "epoch": 0.11111513921113689, "grad_norm": 0.04855608940124512, "grad_norm_var": 0.00011463202708230462, "learning_rate": 0.009788271552295402, "loss": 2.7942, "step": 3065 }, { "crossentropy": 2.817700147628784, "epoch": 0.11115139211136892, "grad_norm": 0.057261399924755096, "grad_norm_var": 0.00011828140666199268, "learning_rate": 0.009788104212884827, "loss": 2.7701, "step": 3066 }, { "crossentropy": 2.7161953449249268, "epoch": 0.11118764501160093, "grad_norm": 0.05780026316642761, "grad_norm_var": 0.00012204141695833359, "learning_rate": 0.009787936808803637, "loss": 2.7657, "step": 3067 }, { "crossentropy": 2.86688494682312, "epoch": 0.11122389791183294, "grad_norm": 0.04956492409110069, "grad_norm_var": 0.0001210836677895718, "learning_rate": 0.009787769340054096, "loss": 2.8426, "step": 3068 }, { "crossentropy": 2.7735540866851807, "epoch": 0.11126015081206496, "grad_norm": 0.046257905662059784, "grad_norm_var": 0.00011922381297493877, "learning_rate": 0.009787601806638464, "loss": 2.8177, "step": 3069 }, { "crossentropy": 2.628558874130249, "epoch": 0.11129640371229699, "grad_norm": 0.0417424738407135, "grad_norm_var": 0.00012249519248542205, "learning_rate": 0.009787434208559005, "loss": 2.6584, "step": 3070 }, { "crossentropy": 2.5842649936676025, "epoch": 0.111332656612529, "grad_norm": 0.045886341482400894, "grad_norm_var": 0.00012035519693374264, "learning_rate": 0.00978726654581798, "loss": 2.7311, "step": 3071 }, { "crossentropy": 2.719740390777588, "epoch": 0.11136890951276102, "grad_norm": 0.04422853887081146, "grad_norm_var": 0.0001170636701282921, "learning_rate": 0.009787098818417658, "loss": 2.656, "step": 3072 }, { "crossentropy": 2.764524459838867, "epoch": 0.11140516241299304, "grad_norm": 0.05464655160903931, "grad_norm_var": 0.00011542297716317598, "learning_rate": 0.0097869310263603, "loss": 2.7764, "step": 3073 }, { "crossentropy": 2.732872486114502, "epoch": 0.11144141531322506, "grad_norm": 0.04223817214369774, "grad_norm_var": 4.2706404559455144e-05, "learning_rate": 0.009786763169648174, "loss": 2.6859, "step": 3074 }, { "crossentropy": 2.7984931468963623, "epoch": 0.11147766821345707, "grad_norm": 0.04452578350901604, "grad_norm_var": 4.1533245232711996e-05, "learning_rate": 0.009786595248283549, "loss": 2.7612, "step": 3075 }, { "crossentropy": 2.732388734817505, "epoch": 0.1115139211136891, "grad_norm": 0.04114999622106552, "grad_norm_var": 4.256008565267181e-05, "learning_rate": 0.00978642726226869, "loss": 2.6607, "step": 3076 }, { "crossentropy": 2.7031331062316895, "epoch": 0.11155017401392112, "grad_norm": 0.036948312073946, "grad_norm_var": 4.769664529801759e-05, "learning_rate": 0.009786259211605865, "loss": 2.7141, "step": 3077 }, { "crossentropy": 2.68261456489563, "epoch": 0.11158642691415313, "grad_norm": 0.043264515697956085, "grad_norm_var": 4.603073668566441e-05, "learning_rate": 0.009786091096297349, "loss": 2.7786, "step": 3078 }, { "crossentropy": 2.6841976642608643, "epoch": 0.11162267981438515, "grad_norm": 0.04095761477947235, "grad_norm_var": 3.661212185090545e-05, "learning_rate": 0.009785922916345408, "loss": 2.6885, "step": 3079 }, { "crossentropy": 2.6936545372009277, "epoch": 0.11165893271461717, "grad_norm": 0.04305035620927811, "grad_norm_var": 3.6302232082480586e-05, "learning_rate": 0.009785754671752316, "loss": 2.7116, "step": 3080 }, { "crossentropy": 2.516148567199707, "epoch": 0.11169518561484919, "grad_norm": 0.03925004601478577, "grad_norm_var": 3.870451999783183e-05, "learning_rate": 0.009785586362520344, "loss": 2.5828, "step": 3081 }, { "crossentropy": 2.8124518394470215, "epoch": 0.1117314385150812, "grad_norm": 0.0381426177918911, "grad_norm_var": 3.169137829306367e-05, "learning_rate": 0.009785417988651766, "loss": 2.7852, "step": 3082 }, { "crossentropy": 2.706012487411499, "epoch": 0.11176769141531323, "grad_norm": 0.03967321291565895, "grad_norm_var": 1.9727990295095807e-05, "learning_rate": 0.009785249550148856, "loss": 2.6848, "step": 3083 }, { "crossentropy": 2.702235221862793, "epoch": 0.11180394431554525, "grad_norm": 0.03951443359255791, "grad_norm_var": 1.7539266184099914e-05, "learning_rate": 0.00978508104701389, "loss": 2.7528, "step": 3084 }, { "crossentropy": 2.6959874629974365, "epoch": 0.11184019721577726, "grad_norm": 0.04109723120927811, "grad_norm_var": 1.668153801624341e-05, "learning_rate": 0.009784912479249142, "loss": 2.7089, "step": 3085 }, { "crossentropy": 2.7519116401672363, "epoch": 0.11187645011600927, "grad_norm": 0.04244641959667206, "grad_norm_var": 1.6663018239726894e-05, "learning_rate": 0.009784743846856889, "loss": 2.761, "step": 3086 }, { "crossentropy": 2.8087244033813477, "epoch": 0.1119127030162413, "grad_norm": 0.04209885746240616, "grad_norm_var": 1.5755436389291074e-05, "learning_rate": 0.00978457514983941, "loss": 2.8039, "step": 3087 }, { "crossentropy": 2.622591257095337, "epoch": 0.11194895591647332, "grad_norm": 0.04410627484321594, "grad_norm_var": 1.5721297234496297e-05, "learning_rate": 0.00978440638819898, "loss": 2.6555, "step": 3088 }, { "crossentropy": 2.888023853302002, "epoch": 0.11198520881670533, "grad_norm": 0.04877482354640961, "grad_norm_var": 8.029506218447854e-06, "learning_rate": 0.009784237561937885, "loss": 2.8233, "step": 3089 }, { "crossentropy": 2.690950393676758, "epoch": 0.11202146171693736, "grad_norm": 0.21543844044208527, "grad_norm_var": 0.0018952977125072583, "learning_rate": 0.0097840686710584, "loss": 2.717, "step": 3090 }, { "crossentropy": 2.6961381435394287, "epoch": 0.11205771461716937, "grad_norm": 0.04535505920648575, "grad_norm_var": 0.0018944559504606865, "learning_rate": 0.009783899715562807, "loss": 2.7323, "step": 3091 }, { "crossentropy": 2.6476902961730957, "epoch": 0.11209396751740139, "grad_norm": 0.042949479073286057, "grad_norm_var": 0.001891916098120833, "learning_rate": 0.009783730695453388, "loss": 2.6783, "step": 3092 }, { "crossentropy": 2.8202719688415527, "epoch": 0.11213022041763342, "grad_norm": 0.04476187005639076, "grad_norm_var": 0.001879330213193553, "learning_rate": 0.009783561610732428, "loss": 2.7883, "step": 3093 }, { "crossentropy": 2.693129539489746, "epoch": 0.11216647331786543, "grad_norm": 0.0398891419172287, "grad_norm_var": 0.001884504782280422, "learning_rate": 0.009783392461402208, "loss": 2.6467, "step": 3094 }, { "crossentropy": 2.8718507289886475, "epoch": 0.11220272621809745, "grad_norm": 0.07287689298391342, "grad_norm_var": 0.00189706250310725, "learning_rate": 0.009783223247465014, "loss": 2.7196, "step": 3095 }, { "crossentropy": 2.7716071605682373, "epoch": 0.11223897911832946, "grad_norm": 0.04182973504066467, "grad_norm_var": 0.00189909457406783, "learning_rate": 0.009783053968923133, "loss": 2.7959, "step": 3096 }, { "crossentropy": 2.6238479614257812, "epoch": 0.11227523201856149, "grad_norm": 0.043395429849624634, "grad_norm_var": 0.001891525330466617, "learning_rate": 0.009782884625778848, "loss": 2.6795, "step": 3097 }, { "crossentropy": 2.841825485229492, "epoch": 0.1123114849187935, "grad_norm": 0.051039233803749084, "grad_norm_var": 0.0018726808580459646, "learning_rate": 0.009782715218034446, "loss": 2.9163, "step": 3098 }, { "crossentropy": 2.6150293350219727, "epoch": 0.11234773781902552, "grad_norm": 0.05274566635489464, "grad_norm_var": 0.0018549860146911807, "learning_rate": 0.00978254574569222, "loss": 2.6055, "step": 3099 }, { "crossentropy": 2.8232054710388184, "epoch": 0.11238399071925755, "grad_norm": 0.05475530028343201, "grad_norm_var": 0.0018344385892190493, "learning_rate": 0.009782376208754454, "loss": 2.7364, "step": 3100 }, { "crossentropy": 2.8698956966400146, "epoch": 0.11242024361948956, "grad_norm": 0.04587698355317116, "grad_norm_var": 0.0018252711830011757, "learning_rate": 0.00978220660722344, "loss": 2.7546, "step": 3101 }, { "crossentropy": 2.804619550704956, "epoch": 0.11245649651972157, "grad_norm": 0.04954903945326805, "grad_norm_var": 0.0018136745434625284, "learning_rate": 0.009782036941101468, "loss": 2.7843, "step": 3102 }, { "crossentropy": 2.6298019886016846, "epoch": 0.1124927494199536, "grad_norm": 0.04530801624059677, "grad_norm_var": 0.0018073152793046433, "learning_rate": 0.009781867210390833, "loss": 2.7059, "step": 3103 }, { "crossentropy": 2.663149356842041, "epoch": 0.11252900232018562, "grad_norm": 0.04268956184387207, "grad_norm_var": 0.001810190927325206, "learning_rate": 0.009781697415093823, "loss": 2.7309, "step": 3104 }, { "crossentropy": 2.828111410140991, "epoch": 0.11256525522041763, "grad_norm": 0.04052691534161568, "grad_norm_var": 0.0018252225206644968, "learning_rate": 0.009781527555212733, "loss": 2.7765, "step": 3105 }, { "crossentropy": 2.805739164352417, "epoch": 0.11260150812064965, "grad_norm": 0.050714731216430664, "grad_norm_var": 6.459997377192064e-05, "learning_rate": 0.009781357630749856, "loss": 2.7561, "step": 3106 }, { "crossentropy": 2.7076239585876465, "epoch": 0.11263776102088167, "grad_norm": 0.041006602346897125, "grad_norm_var": 6.717989638955364e-05, "learning_rate": 0.009781187641707488, "loss": 2.7094, "step": 3107 }, { "crossentropy": 2.7446467876434326, "epoch": 0.11267401392111369, "grad_norm": 0.04091961309313774, "grad_norm_var": 6.866756705048599e-05, "learning_rate": 0.009781017588087927, "loss": 2.7882, "step": 3108 }, { "crossentropy": 2.7134413719177246, "epoch": 0.1127102668213457, "grad_norm": 0.04625134915113449, "grad_norm_var": 6.828869670264751e-05, "learning_rate": 0.009780847469893467, "loss": 2.7707, "step": 3109 }, { "crossentropy": 2.658730983734131, "epoch": 0.11274651972157773, "grad_norm": 0.04952356219291687, "grad_norm_var": 6.43634901905489e-05, "learning_rate": 0.009780677287126407, "loss": 2.7154, "step": 3110 }, { "crossentropy": 2.6482412815093994, "epoch": 0.11278277262180975, "grad_norm": 0.04977412149310112, "grad_norm_var": 2.1286280248556758e-05, "learning_rate": 0.009780507039789044, "loss": 2.644, "step": 3111 }, { "crossentropy": 2.763028383255005, "epoch": 0.11281902552204176, "grad_norm": 0.04187896102666855, "grad_norm_var": 2.1254996762912897e-05, "learning_rate": 0.009780336727883681, "loss": 2.757, "step": 3112 }, { "crossentropy": 2.8771958351135254, "epoch": 0.11285527842227379, "grad_norm": 0.040540583431720734, "grad_norm_var": 2.2992636095472972e-05, "learning_rate": 0.009780166351412614, "loss": 2.7476, "step": 3113 }, { "crossentropy": 2.6100025177001953, "epoch": 0.1128915313225058, "grad_norm": 0.042241085320711136, "grad_norm_var": 2.2439717450403218e-05, "learning_rate": 0.009779995910378148, "loss": 2.7409, "step": 3114 }, { "crossentropy": 2.76757550239563, "epoch": 0.11292778422273782, "grad_norm": 0.03859090059995651, "grad_norm_var": 2.2030665224123668e-05, "learning_rate": 0.009779825404782583, "loss": 2.7562, "step": 3115 }, { "crossentropy": 2.7029621601104736, "epoch": 0.11296403712296983, "grad_norm": 0.041336167603731155, "grad_norm_var": 1.5847354455511494e-05, "learning_rate": 0.009779654834628223, "loss": 2.7707, "step": 3116 }, { "crossentropy": 2.8512187004089355, "epoch": 0.11300029002320186, "grad_norm": 0.041401561349630356, "grad_norm_var": 1.6080901513282284e-05, "learning_rate": 0.009779484199917369, "loss": 2.8286, "step": 3117 }, { "crossentropy": 2.784391403198242, "epoch": 0.11303654292343387, "grad_norm": 0.04718204215168953, "grad_norm_var": 1.4645329883600638e-05, "learning_rate": 0.00977931350065233, "loss": 2.7749, "step": 3118 }, { "crossentropy": 2.631373167037964, "epoch": 0.11307279582366589, "grad_norm": 0.042880669236183167, "grad_norm_var": 1.4507024042508844e-05, "learning_rate": 0.009779142736835406, "loss": 2.6853, "step": 3119 }, { "crossentropy": 2.64233136177063, "epoch": 0.11310904872389792, "grad_norm": 0.06315700709819794, "grad_norm_var": 3.822886157463876e-05, "learning_rate": 0.00977897190846891, "loss": 2.7626, "step": 3120 }, { "crossentropy": 2.7736828327178955, "epoch": 0.11314530162412993, "grad_norm": 0.043186649680137634, "grad_norm_var": 3.713067456231692e-05, "learning_rate": 0.009778801015555142, "loss": 2.7344, "step": 3121 }, { "crossentropy": 2.722500801086426, "epoch": 0.11318155452436195, "grad_norm": 0.053791675716638565, "grad_norm_var": 4.0051904548159675e-05, "learning_rate": 0.009778630058096418, "loss": 2.8992, "step": 3122 }, { "crossentropy": 2.8070621490478516, "epoch": 0.11321780742459396, "grad_norm": 0.05835174396634102, "grad_norm_var": 4.90904069964176e-05, "learning_rate": 0.009778459036095043, "loss": 2.8072, "step": 3123 }, { "crossentropy": 2.6731467247009277, "epoch": 0.11325406032482599, "grad_norm": 0.05699263885617256, "grad_norm_var": 5.3678426698084794e-05, "learning_rate": 0.009778287949553327, "loss": 2.7129, "step": 3124 }, { "crossentropy": 2.8890268802642822, "epoch": 0.113290313225058, "grad_norm": 0.069704569876194, "grad_norm_var": 8.472267768934172e-05, "learning_rate": 0.00977811679847358, "loss": 2.8283, "step": 3125 }, { "crossentropy": 2.7773001194000244, "epoch": 0.11332656612529002, "grad_norm": 0.05557093769311905, "grad_norm_var": 8.760517795518355e-05, "learning_rate": 0.009777945582858114, "loss": 2.7314, "step": 3126 }, { "crossentropy": 2.5936152935028076, "epoch": 0.11336281902552205, "grad_norm": 0.04962867125868797, "grad_norm_var": 8.759461614455377e-05, "learning_rate": 0.009777774302709244, "loss": 2.6892, "step": 3127 }, { "crossentropy": 2.571117401123047, "epoch": 0.11339907192575406, "grad_norm": 0.04584747552871704, "grad_norm_var": 8.473038612069791e-05, "learning_rate": 0.00977760295802928, "loss": 2.6665, "step": 3128 }, { "crossentropy": 2.6858527660369873, "epoch": 0.11343532482598608, "grad_norm": 0.048558998852968216, "grad_norm_var": 7.927673244959524e-05, "learning_rate": 0.009777431548820539, "loss": 2.686, "step": 3129 }, { "crossentropy": 2.8676822185516357, "epoch": 0.1134715777262181, "grad_norm": 0.040904268622398376, "grad_norm_var": 8.075382082158647e-05, "learning_rate": 0.009777260075085333, "loss": 2.9674, "step": 3130 }, { "crossentropy": 2.832324266433716, "epoch": 0.11350783062645012, "grad_norm": 0.04149521887302399, "grad_norm_var": 7.693345197112844e-05, "learning_rate": 0.009777088536825983, "loss": 2.8474, "step": 3131 }, { "crossentropy": 2.7009365558624268, "epoch": 0.11354408352668213, "grad_norm": 0.046421125531196594, "grad_norm_var": 7.267588321614627e-05, "learning_rate": 0.0097769169340448, "loss": 2.6654, "step": 3132 }, { "crossentropy": 2.8784892559051514, "epoch": 0.11358033642691415, "grad_norm": 0.04373135790228844, "grad_norm_var": 7.024557926831408e-05, "learning_rate": 0.009776745266744104, "loss": 2.8551, "step": 3133 }, { "crossentropy": 2.8096094131469727, "epoch": 0.11361658932714618, "grad_norm": 0.045234739780426025, "grad_norm_var": 7.133439950171733e-05, "learning_rate": 0.009776573534926216, "loss": 2.8059, "step": 3134 }, { "crossentropy": 2.5949137210845947, "epoch": 0.11365284222737819, "grad_norm": 0.05157068744301796, "grad_norm_var": 6.740999422120599e-05, "learning_rate": 0.009776401738593454, "loss": 2.6688, "step": 3135 }, { "crossentropy": 2.783647060394287, "epoch": 0.1136890951276102, "grad_norm": 0.04803658649325371, "grad_norm_var": 5.6956593202134076e-05, "learning_rate": 0.009776229877748137, "loss": 2.7005, "step": 3136 }, { "crossentropy": 2.602985382080078, "epoch": 0.11372534802784223, "grad_norm": 0.08608628809452057, "grad_norm_var": 0.00013335596470660864, "learning_rate": 0.009776057952392588, "loss": 2.6331, "step": 3137 }, { "crossentropy": 2.633321762084961, "epoch": 0.11376160092807425, "grad_norm": 0.04913511127233505, "grad_norm_var": 0.00013398399579981664, "learning_rate": 0.009775885962529129, "loss": 2.6757, "step": 3138 }, { "crossentropy": 2.8290958404541016, "epoch": 0.11379785382830626, "grad_norm": 0.04125262796878815, "grad_norm_var": 0.00013852749974195994, "learning_rate": 0.009775713908160082, "loss": 2.8173, "step": 3139 }, { "crossentropy": 2.751917600631714, "epoch": 0.11383410672853829, "grad_norm": 0.042350295931100845, "grad_norm_var": 0.00014073686508601796, "learning_rate": 0.009775541789287771, "loss": 2.7283, "step": 3140 }, { "crossentropy": 2.6167333126068115, "epoch": 0.1138703596287703, "grad_norm": 0.043488696217536926, "grad_norm_var": 0.00011602291970209493, "learning_rate": 0.00977536960591452, "loss": 2.6866, "step": 3141 }, { "crossentropy": 2.7225534915924072, "epoch": 0.11390661252900232, "grad_norm": 0.048350341618061066, "grad_norm_var": 0.000112673318574306, "learning_rate": 0.009775197358042657, "loss": 2.7376, "step": 3142 }, { "crossentropy": 2.7837812900543213, "epoch": 0.11394286542923433, "grad_norm": 0.03802987188100815, "grad_norm_var": 0.00011895839349253991, "learning_rate": 0.009775025045674506, "loss": 2.7186, "step": 3143 }, { "crossentropy": 2.659027576446533, "epoch": 0.11397911832946636, "grad_norm": 0.03845369070768356, "grad_norm_var": 0.0001240346870173496, "learning_rate": 0.009774852668812397, "loss": 2.6483, "step": 3144 }, { "crossentropy": 2.7442362308502197, "epoch": 0.11401537122969838, "grad_norm": 0.0432090237736702, "grad_norm_var": 0.00012476053323384726, "learning_rate": 0.009774680227458655, "loss": 2.7175, "step": 3145 }, { "crossentropy": 2.7342658042907715, "epoch": 0.11405162412993039, "grad_norm": 0.07848463207483292, "grad_norm_var": 0.00018381528460269155, "learning_rate": 0.009774507721615611, "loss": 2.7588, "step": 3146 }, { "crossentropy": 2.728670835494995, "epoch": 0.11408787703016242, "grad_norm": 0.04672034829854965, "grad_norm_var": 0.00018023527311671316, "learning_rate": 0.009774335151285595, "loss": 2.7532, "step": 3147 }, { "crossentropy": 2.858022928237915, "epoch": 0.11412412993039443, "grad_norm": 0.04626499116420746, "grad_norm_var": 0.00018029901292178607, "learning_rate": 0.009774162516470937, "loss": 2.7786, "step": 3148 }, { "crossentropy": 2.758179187774658, "epoch": 0.11416038283062645, "grad_norm": 0.04493335261940956, "grad_norm_var": 0.00017948082910298758, "learning_rate": 0.00977398981717397, "loss": 2.745, "step": 3149 }, { "crossentropy": 2.76672625541687, "epoch": 0.11419663573085846, "grad_norm": 0.04452868178486824, "grad_norm_var": 0.00017991117665150467, "learning_rate": 0.009773817053397025, "loss": 2.718, "step": 3150 }, { "crossentropy": 2.8928797245025635, "epoch": 0.11423288863109049, "grad_norm": 0.058600373566150665, "grad_norm_var": 0.00018500526316797897, "learning_rate": 0.009773644225142437, "loss": 2.7625, "step": 3151 }, { "crossentropy": 2.6640803813934326, "epoch": 0.1142691415313225, "grad_norm": 0.05398409068584442, "grad_norm_var": 0.00018576192209095494, "learning_rate": 0.009773471332412539, "loss": 2.6992, "step": 3152 }, { "crossentropy": 2.8056159019470215, "epoch": 0.11430539443155452, "grad_norm": 0.06112063303589821, "grad_norm_var": 0.00010540043711329364, "learning_rate": 0.009773298375209667, "loss": 2.7501, "step": 3153 }, { "crossentropy": 2.731678009033203, "epoch": 0.11434164733178655, "grad_norm": 0.041689444333314896, "grad_norm_var": 0.00010841515565823314, "learning_rate": 0.009773125353536156, "loss": 2.757, "step": 3154 }, { "crossentropy": 2.9062416553497314, "epoch": 0.11437790023201856, "grad_norm": 0.037423983216285706, "grad_norm_var": 0.00011288617948218555, "learning_rate": 0.009772952267394344, "loss": 2.8385, "step": 3155 }, { "crossentropy": 2.6289327144622803, "epoch": 0.11441415313225058, "grad_norm": 0.04133385047316551, "grad_norm_var": 0.0001137133209263077, "learning_rate": 0.009772779116786567, "loss": 2.735, "step": 3156 }, { "crossentropy": 2.6974406242370605, "epoch": 0.1144504060324826, "grad_norm": 0.038101207464933395, "grad_norm_var": 0.00011870586327695868, "learning_rate": 0.009772605901715168, "loss": 2.7361, "step": 3157 }, { "crossentropy": 2.9713633060455322, "epoch": 0.11448665893271462, "grad_norm": 0.03758692741394043, "grad_norm_var": 0.0001248364041873245, "learning_rate": 0.009772432622182482, "loss": 2.885, "step": 3158 }, { "crossentropy": 2.6486763954162598, "epoch": 0.11452291183294663, "grad_norm": 0.06110306829214096, "grad_norm_var": 0.00013080886660042716, "learning_rate": 0.00977225927819085, "loss": 2.7083, "step": 3159 }, { "crossentropy": 2.874222755432129, "epoch": 0.11455916473317865, "grad_norm": 0.03973995894193649, "grad_norm_var": 0.00012921569223769576, "learning_rate": 0.009772085869742615, "loss": 2.806, "step": 3160 }, { "crossentropy": 2.8920183181762695, "epoch": 0.11459541763341068, "grad_norm": 0.042612701654434204, "grad_norm_var": 0.0001296527595910905, "learning_rate": 0.009771912396840119, "loss": 2.8104, "step": 3161 }, { "crossentropy": 2.8574116230010986, "epoch": 0.11463167053364269, "grad_norm": 0.048799388110637665, "grad_norm_var": 6.561017581764028e-05, "learning_rate": 0.009771738859485703, "loss": 2.8764, "step": 3162 }, { "crossentropy": 2.641542911529541, "epoch": 0.1146679234338747, "grad_norm": 0.046021755784749985, "grad_norm_var": 6.562331443410955e-05, "learning_rate": 0.009771565257681716, "loss": 2.7526, "step": 3163 }, { "crossentropy": 2.7786905765533447, "epoch": 0.11470417633410673, "grad_norm": 0.04378020763397217, "grad_norm_var": 6.608383647746513e-05, "learning_rate": 0.009771391591430496, "loss": 2.7866, "step": 3164 }, { "crossentropy": 2.743307590484619, "epoch": 0.11474042923433875, "grad_norm": 0.04167090728878975, "grad_norm_var": 6.735875452219398e-05, "learning_rate": 0.009771217860734392, "loss": 2.8125, "step": 3165 }, { "crossentropy": 2.7619197368621826, "epoch": 0.11477668213457076, "grad_norm": 0.04701247438788414, "grad_norm_var": 6.721366322670336e-05, "learning_rate": 0.00977104406559575, "loss": 2.8304, "step": 3166 }, { "crossentropy": 2.8209452629089355, "epoch": 0.11481293503480279, "grad_norm": 0.04699326679110527, "grad_norm_var": 5.657655230908073e-05, "learning_rate": 0.009770870206016918, "loss": 2.7596, "step": 3167 }, { "crossentropy": 2.7081189155578613, "epoch": 0.1148491879350348, "grad_norm": 0.04718617722392082, "grad_norm_var": 5.183006589787037e-05, "learning_rate": 0.009770696282000245, "loss": 2.7562, "step": 3168 }, { "crossentropy": 2.618964433670044, "epoch": 0.11488544083526682, "grad_norm": 0.046533841639757156, "grad_norm_var": 3.403986425257044e-05, "learning_rate": 0.009770522293548077, "loss": 2.6236, "step": 3169 }, { "crossentropy": 2.53263521194458, "epoch": 0.11492169373549883, "grad_norm": 0.046174466609954834, "grad_norm_var": 3.3781213606792526e-05, "learning_rate": 0.009770348240662767, "loss": 2.64, "step": 3170 }, { "crossentropy": 2.8727211952209473, "epoch": 0.11495794663573086, "grad_norm": 0.05868682265281677, "grad_norm_var": 4.196401258630943e-05, "learning_rate": 0.009770174123346664, "loss": 2.794, "step": 3171 }, { "crossentropy": 2.803043842315674, "epoch": 0.11499419953596288, "grad_norm": 0.0887465551495552, "grad_norm_var": 0.00015401598671815198, "learning_rate": 0.00976999994160212, "loss": 2.7538, "step": 3172 }, { "crossentropy": 2.8001341819763184, "epoch": 0.11503045243619489, "grad_norm": 0.05914928764104843, "grad_norm_var": 0.00015168845383049893, "learning_rate": 0.009769825695431491, "loss": 2.7487, "step": 3173 }, { "crossentropy": 2.7502031326293945, "epoch": 0.11506670533642692, "grad_norm": 0.05857182294130325, "grad_norm_var": 0.00014416532469146032, "learning_rate": 0.009769651384837125, "loss": 2.696, "step": 3174 }, { "crossentropy": 2.7545366287231445, "epoch": 0.11510295823665893, "grad_norm": 0.05640920624136925, "grad_norm_var": 0.00013948466707899328, "learning_rate": 0.009769477009821378, "loss": 2.7018, "step": 3175 }, { "crossentropy": 2.6433231830596924, "epoch": 0.11513921113689095, "grad_norm": 0.048516254872083664, "grad_norm_var": 0.00013096966501772536, "learning_rate": 0.009769302570386608, "loss": 2.6719, "step": 3176 }, { "crossentropy": 2.8477931022644043, "epoch": 0.11517546403712298, "grad_norm": 0.04702683538198471, "grad_norm_var": 0.00012685142880146266, "learning_rate": 0.009769128066535168, "loss": 2.7845, "step": 3177 }, { "crossentropy": 2.693939447402954, "epoch": 0.11521171693735499, "grad_norm": 0.05522121489048004, "grad_norm_var": 0.00012672698665646965, "learning_rate": 0.009768953498269417, "loss": 2.7022, "step": 3178 }, { "crossentropy": 2.82698130607605, "epoch": 0.115247969837587, "grad_norm": 0.05189359560608864, "grad_norm_var": 0.00012392248801798174, "learning_rate": 0.009768778865591712, "loss": 2.8383, "step": 3179 }, { "crossentropy": 2.689786434173584, "epoch": 0.11528422273781902, "grad_norm": 0.04775283858180046, "grad_norm_var": 0.00012017183170761516, "learning_rate": 0.009768604168504412, "loss": 2.7078, "step": 3180 }, { "crossentropy": 2.7914087772369385, "epoch": 0.11532047563805105, "grad_norm": 0.03991830348968506, "grad_norm_var": 0.00012300455909067346, "learning_rate": 0.009768429407009874, "loss": 2.7951, "step": 3181 }, { "crossentropy": 2.826056480407715, "epoch": 0.11535672853828306, "grad_norm": 0.04119907319545746, "grad_norm_var": 0.0001296509181080355, "learning_rate": 0.009768254581110462, "loss": 2.8094, "step": 3182 }, { "crossentropy": 2.7375130653381348, "epoch": 0.11539298143851508, "grad_norm": 0.05787238851189613, "grad_norm_var": 0.00012906218706144884, "learning_rate": 0.009768079690808536, "loss": 2.6679, "step": 3183 }, { "crossentropy": 2.733351230621338, "epoch": 0.1154292343387471, "grad_norm": 0.04027979448437691, "grad_norm_var": 0.0001375615108415541, "learning_rate": 0.009767904736106456, "loss": 2.6423, "step": 3184 }, { "crossentropy": 2.706073522567749, "epoch": 0.11546548723897912, "grad_norm": 0.039563022553920746, "grad_norm_var": 0.00014637332201629266, "learning_rate": 0.009767729717006589, "loss": 2.7262, "step": 3185 }, { "crossentropy": 2.7748422622680664, "epoch": 0.11550174013921113, "grad_norm": 0.03993148356676102, "grad_norm_var": 0.00015391757110797636, "learning_rate": 0.009767554633511295, "loss": 2.7839, "step": 3186 }, { "crossentropy": 2.818641185760498, "epoch": 0.11553799303944315, "grad_norm": 0.03977949917316437, "grad_norm_var": 0.000159204415378561, "learning_rate": 0.009767379485622943, "loss": 2.7416, "step": 3187 }, { "crossentropy": 2.7564449310302734, "epoch": 0.11557424593967518, "grad_norm": 0.04376230761408806, "grad_norm_var": 5.771551089558581e-05, "learning_rate": 0.009767204273343894, "loss": 2.7143, "step": 3188 }, { "crossentropy": 2.7576041221618652, "epoch": 0.11561049883990719, "grad_norm": 0.10653668642044067, "grad_norm_var": 0.0002689634659228894, "learning_rate": 0.00976702899667652, "loss": 2.7539, "step": 3189 }, { "crossentropy": 2.7058968544006348, "epoch": 0.1156467517401392, "grad_norm": 0.04841533303260803, "grad_norm_var": 0.00026500741448218045, "learning_rate": 0.009766853655623183, "loss": 2.7075, "step": 3190 }, { "crossentropy": 2.7322793006896973, "epoch": 0.11568300464037123, "grad_norm": 0.048809003084897995, "grad_norm_var": 0.0002623810415019425, "learning_rate": 0.009766678250186251, "loss": 2.7033, "step": 3191 }, { "crossentropy": 2.7271604537963867, "epoch": 0.11571925754060325, "grad_norm": 0.05881381034851074, "grad_norm_var": 0.00026727359113697, "learning_rate": 0.0097665027803681, "loss": 2.7351, "step": 3192 }, { "crossentropy": 2.7167232036590576, "epoch": 0.11575551044083526, "grad_norm": 0.04381807520985603, "grad_norm_var": 0.0002693702892938539, "learning_rate": 0.009766327246171094, "loss": 2.8094, "step": 3193 }, { "crossentropy": 2.641526699066162, "epoch": 0.11579176334106729, "grad_norm": 0.05721695348620415, "grad_norm_var": 0.00027094926857481607, "learning_rate": 0.009766151647597605, "loss": 2.7368, "step": 3194 }, { "crossentropy": 2.6843628883361816, "epoch": 0.1158280162412993, "grad_norm": 0.07180872559547424, "grad_norm_var": 0.00029984260986714857, "learning_rate": 0.009765975984650004, "loss": 2.7898, "step": 3195 }, { "crossentropy": 2.94879150390625, "epoch": 0.11586426914153132, "grad_norm": 0.04171447828412056, "grad_norm_var": 0.00030521270398898377, "learning_rate": 0.009765800257330665, "loss": 2.818, "step": 3196 }, { "crossentropy": 2.874239683151245, "epoch": 0.11590052204176333, "grad_norm": 0.04075242578983307, "grad_norm_var": 0.00030399981950769933, "learning_rate": 0.009765624465641963, "loss": 2.762, "step": 3197 }, { "crossentropy": 2.594146490097046, "epoch": 0.11593677494199536, "grad_norm": 0.04015371948480606, "grad_norm_var": 0.0003054713991376527, "learning_rate": 0.00976544860958627, "loss": 2.7314, "step": 3198 }, { "crossentropy": 2.7622222900390625, "epoch": 0.11597302784222738, "grad_norm": 0.04170859977602959, "grad_norm_var": 0.00030742424154769763, "learning_rate": 0.00976527268916596, "loss": 2.7265, "step": 3199 }, { "crossentropy": 2.6819050312042236, "epoch": 0.11600928074245939, "grad_norm": 0.04440091922879219, "grad_norm_var": 0.0003030394070415262, "learning_rate": 0.009765096704383412, "loss": 2.7253, "step": 3200 }, { "crossentropy": 2.7343573570251465, "epoch": 0.11604553364269142, "grad_norm": 0.05683889240026474, "grad_norm_var": 0.0002966174396022947, "learning_rate": 0.009764920655241002, "loss": 2.8043, "step": 3201 }, { "crossentropy": 2.7130136489868164, "epoch": 0.11608178654292343, "grad_norm": 0.04525616019964218, "grad_norm_var": 0.00029015585129623186, "learning_rate": 0.009764744541741107, "loss": 2.6807, "step": 3202 }, { "crossentropy": 2.675589084625244, "epoch": 0.11611803944315545, "grad_norm": 0.06671706587076187, "grad_norm_var": 0.00029211289871149956, "learning_rate": 0.009764568363886107, "loss": 2.7106, "step": 3203 }, { "crossentropy": 2.736767292022705, "epoch": 0.11615429234338748, "grad_norm": 0.05451798066496849, "grad_norm_var": 0.0002853136384950933, "learning_rate": 0.009764392121678381, "loss": 2.7256, "step": 3204 }, { "crossentropy": 2.797147035598755, "epoch": 0.11619054524361949, "grad_norm": 0.041055355221033096, "grad_norm_var": 9.651013220661661e-05, "learning_rate": 0.009764215815120308, "loss": 2.8486, "step": 3205 }, { "crossentropy": 2.6700682640075684, "epoch": 0.1162267981438515, "grad_norm": 0.04377920553088188, "grad_norm_var": 9.891022162535247e-05, "learning_rate": 0.00976403944421427, "loss": 2.7135, "step": 3206 }, { "crossentropy": 2.69612979888916, "epoch": 0.11626305104408352, "grad_norm": 0.04335695505142212, "grad_norm_var": 0.00010151392354096992, "learning_rate": 0.00976386300896265, "loss": 2.7014, "step": 3207 }, { "crossentropy": 2.907148599624634, "epoch": 0.11629930394431555, "grad_norm": 0.04357319697737694, "grad_norm_var": 9.709325114728202e-05, "learning_rate": 0.009763686509367831, "loss": 2.7402, "step": 3208 }, { "crossentropy": 2.583190441131592, "epoch": 0.11633555684454756, "grad_norm": 0.048895180225372314, "grad_norm_var": 9.550660430588919e-05, "learning_rate": 0.009763509945432196, "loss": 2.7678, "step": 3209 }, { "crossentropy": 2.6706268787384033, "epoch": 0.11637180974477958, "grad_norm": 0.048689886927604675, "grad_norm_var": 9.054865210809387e-05, "learning_rate": 0.009763333317158132, "loss": 2.6457, "step": 3210 }, { "crossentropy": 2.718926191329956, "epoch": 0.1164080626450116, "grad_norm": 0.048015858978033066, "grad_norm_var": 5.1434297124678535e-05, "learning_rate": 0.009763156624548021, "loss": 2.7578, "step": 3211 }, { "crossentropy": 2.8667614459991455, "epoch": 0.11644431554524362, "grad_norm": 0.047487422823905945, "grad_norm_var": 4.9572659905878076e-05, "learning_rate": 0.009762979867604252, "loss": 2.7561, "step": 3212 }, { "crossentropy": 2.773489475250244, "epoch": 0.11648056844547564, "grad_norm": 0.045631833374500275, "grad_norm_var": 4.6866033992024454e-05, "learning_rate": 0.009762803046329212, "loss": 2.742, "step": 3213 }, { "crossentropy": 2.79829478263855, "epoch": 0.11651682134570766, "grad_norm": 0.041223715990781784, "grad_norm_var": 4.588882627066341e-05, "learning_rate": 0.009762626160725289, "loss": 2.7309, "step": 3214 }, { "crossentropy": 2.7119059562683105, "epoch": 0.11655307424593968, "grad_norm": 0.042681820690631866, "grad_norm_var": 4.518720312264157e-05, "learning_rate": 0.009762449210794872, "loss": 2.7627, "step": 3215 }, { "crossentropy": 2.7113704681396484, "epoch": 0.11658932714617169, "grad_norm": 0.03913791850209236, "grad_norm_var": 4.918617352428622e-05, "learning_rate": 0.009762272196540352, "loss": 2.7347, "step": 3216 }, { "crossentropy": 2.8918163776397705, "epoch": 0.1166255800464037, "grad_norm": 0.044976141303777695, "grad_norm_var": 4.289958808571988e-05, "learning_rate": 0.009762095117964118, "loss": 2.8044, "step": 3217 }, { "crossentropy": 2.651822566986084, "epoch": 0.11666183294663574, "grad_norm": 0.03873498737812042, "grad_norm_var": 4.6693059268837256e-05, "learning_rate": 0.009761917975068564, "loss": 2.6979, "step": 3218 }, { "crossentropy": 2.836287498474121, "epoch": 0.11669808584686775, "grad_norm": 0.03751298040151596, "grad_norm_var": 1.9930462747602454e-05, "learning_rate": 0.00976174076785608, "loss": 2.7854, "step": 3219 }, { "crossentropy": 2.6762001514434814, "epoch": 0.11673433874709976, "grad_norm": 0.04055899381637573, "grad_norm_var": 1.3145831052685461e-05, "learning_rate": 0.00976156349632906, "loss": 2.7588, "step": 3220 }, { "crossentropy": 2.7706246376037598, "epoch": 0.11677059164733179, "grad_norm": 0.05126801133155823, "grad_norm_var": 1.6394234510608055e-05, "learning_rate": 0.009761386160489901, "loss": 2.7363, "step": 3221 }, { "crossentropy": 2.7192561626434326, "epoch": 0.1168068445475638, "grad_norm": 0.05675659701228142, "grad_norm_var": 2.637315808694691e-05, "learning_rate": 0.009761208760340995, "loss": 2.7386, "step": 3222 }, { "crossentropy": 2.640479803085327, "epoch": 0.11684309744779582, "grad_norm": 0.05048712342977524, "grad_norm_var": 2.8077627385345026e-05, "learning_rate": 0.00976103129588474, "loss": 2.7441, "step": 3223 }, { "crossentropy": 2.745922565460205, "epoch": 0.11687935034802784, "grad_norm": 0.042230598628520966, "grad_norm_var": 2.850871339605491e-05, "learning_rate": 0.009760853767123531, "loss": 2.8509, "step": 3224 }, { "crossentropy": 2.722334146499634, "epoch": 0.11691560324825986, "grad_norm": 0.05057443305850029, "grad_norm_var": 2.9497068563792e-05, "learning_rate": 0.009760676174059768, "loss": 2.7559, "step": 3225 }, { "crossentropy": 2.6679251194000244, "epoch": 0.11695185614849188, "grad_norm": 0.047544896602630615, "grad_norm_var": 2.9072635537522733e-05, "learning_rate": 0.009760498516695849, "loss": 2.7566, "step": 3226 }, { "crossentropy": 2.6585233211517334, "epoch": 0.11698810904872389, "grad_norm": 0.04424504563212395, "grad_norm_var": 2.859659202751307e-05, "learning_rate": 0.009760320795034173, "loss": 2.6936, "step": 3227 }, { "crossentropy": 2.715824842453003, "epoch": 0.11702436194895592, "grad_norm": 0.052595220506191254, "grad_norm_var": 3.187642502902229e-05, "learning_rate": 0.00976014300907714, "loss": 2.7254, "step": 3228 }, { "crossentropy": 2.7399206161499023, "epoch": 0.11706061484918794, "grad_norm": 0.0661354511976242, "grad_norm_var": 5.8826064199329665e-05, "learning_rate": 0.009759965158827153, "loss": 2.7881, "step": 3229 }, { "crossentropy": 2.836549758911133, "epoch": 0.11709686774941995, "grad_norm": 0.041918784379959106, "grad_norm_var": 5.835184529527539e-05, "learning_rate": 0.009759787244286612, "loss": 2.853, "step": 3230 }, { "crossentropy": 2.832155704498291, "epoch": 0.11713312064965198, "grad_norm": 0.04406632110476494, "grad_norm_var": 5.772805725246891e-05, "learning_rate": 0.009759609265457922, "loss": 2.776, "step": 3231 }, { "crossentropy": 2.613285541534424, "epoch": 0.11716937354988399, "grad_norm": 0.0399126335978508, "grad_norm_var": 5.697447609378416e-05, "learning_rate": 0.009759431222343485, "loss": 2.7835, "step": 3232 }, { "crossentropy": 2.6815803050994873, "epoch": 0.11720562645011601, "grad_norm": 0.04471400007605553, "grad_norm_var": 5.704408773553506e-05, "learning_rate": 0.009759253114945708, "loss": 2.7481, "step": 3233 }, { "crossentropy": 2.836332082748413, "epoch": 0.11724187935034802, "grad_norm": 0.0416920930147171, "grad_norm_var": 5.439949917954613e-05, "learning_rate": 0.009759074943266992, "loss": 2.7283, "step": 3234 }, { "crossentropy": 2.7962589263916016, "epoch": 0.11727813225058005, "grad_norm": 0.0432545468211174, "grad_norm_var": 4.918693441169347e-05, "learning_rate": 0.00975889670730975, "loss": 2.7521, "step": 3235 }, { "crossentropy": 2.70113468170166, "epoch": 0.11731438515081206, "grad_norm": 0.04218343645334244, "grad_norm_var": 4.787617801884306e-05, "learning_rate": 0.009758718407076384, "loss": 2.7859, "step": 3236 }, { "crossentropy": 2.755568265914917, "epoch": 0.11735063805104408, "grad_norm": 0.05098705366253853, "grad_norm_var": 4.773897279553204e-05, "learning_rate": 0.009758540042569305, "loss": 2.8367, "step": 3237 }, { "crossentropy": 2.629169225692749, "epoch": 0.11738689095127611, "grad_norm": 0.04142550006508827, "grad_norm_var": 4.3417636115276184e-05, "learning_rate": 0.009758361613790923, "loss": 2.5771, "step": 3238 }, { "crossentropy": 2.6504244804382324, "epoch": 0.11742314385150812, "grad_norm": 0.04254751652479172, "grad_norm_var": 4.313447103684101e-05, "learning_rate": 0.009758183120743644, "loss": 2.6726, "step": 3239 }, { "crossentropy": 3.0089986324310303, "epoch": 0.11745939675174014, "grad_norm": 0.04334811866283417, "grad_norm_var": 4.265061696263339e-05, "learning_rate": 0.009758004563429883, "loss": 2.8936, "step": 3240 }, { "crossentropy": 2.633770227432251, "epoch": 0.11749564965197216, "grad_norm": 0.05787350982427597, "grad_norm_var": 5.036263615704282e-05, "learning_rate": 0.009757825941852049, "loss": 2.6954, "step": 3241 }, { "crossentropy": 2.674543619155884, "epoch": 0.11753190255220418, "grad_norm": 0.04286197945475578, "grad_norm_var": 5.109815311811923e-05, "learning_rate": 0.009757647256012556, "loss": 2.7081, "step": 3242 }, { "crossentropy": 2.608259916305542, "epoch": 0.1175681554524362, "grad_norm": 0.0413987897336483, "grad_norm_var": 5.235969437867467e-05, "learning_rate": 0.009757468505913816, "loss": 2.6882, "step": 3243 }, { "crossentropy": 2.7088890075683594, "epoch": 0.11760440835266821, "grad_norm": 0.03979407623410225, "grad_norm_var": 5.144228000151836e-05, "learning_rate": 0.009757289691558244, "loss": 2.7005, "step": 3244 }, { "crossentropy": 2.81080961227417, "epoch": 0.11764066125290024, "grad_norm": 0.04158572107553482, "grad_norm_var": 2.0769349556002254e-05, "learning_rate": 0.009757110812948255, "loss": 2.8107, "step": 3245 }, { "crossentropy": 2.8361878395080566, "epoch": 0.11767691415313225, "grad_norm": 0.043810028582811356, "grad_norm_var": 2.05379999712411e-05, "learning_rate": 0.009756931870086267, "loss": 2.8233, "step": 3246 }, { "crossentropy": 2.7914016246795654, "epoch": 0.11771316705336426, "grad_norm": 0.04589811712503433, "grad_norm_var": 2.0802759875107544e-05, "learning_rate": 0.009756752862974694, "loss": 2.7227, "step": 3247 }, { "crossentropy": 2.6599268913269043, "epoch": 0.1177494199535963, "grad_norm": 0.04377833381295204, "grad_norm_var": 1.965296430948537e-05, "learning_rate": 0.009756573791615954, "loss": 2.6911, "step": 3248 }, { "crossentropy": 2.7374022006988525, "epoch": 0.11778567285382831, "grad_norm": 0.04104086756706238, "grad_norm_var": 2.0243032040537638e-05, "learning_rate": 0.009756394656012468, "loss": 2.7843, "step": 3249 }, { "crossentropy": 2.7786056995391846, "epoch": 0.11782192575406032, "grad_norm": 0.03865385428071022, "grad_norm_var": 2.1741719021254693e-05, "learning_rate": 0.009756215456166653, "loss": 2.7149, "step": 3250 }, { "crossentropy": 2.7030277252197266, "epoch": 0.11785817865429234, "grad_norm": 0.040999654680490494, "grad_norm_var": 2.221675699438785e-05, "learning_rate": 0.009756036192080932, "loss": 2.7233, "step": 3251 }, { "crossentropy": 2.7699766159057617, "epoch": 0.11789443155452436, "grad_norm": 0.04796280711889267, "grad_norm_var": 2.3184498429042594e-05, "learning_rate": 0.009755856863757724, "loss": 2.7932, "step": 3252 }, { "crossentropy": 2.6621758937835693, "epoch": 0.11793068445475638, "grad_norm": 0.05248251557350159, "grad_norm_var": 2.4717881447211313e-05, "learning_rate": 0.009755677471199452, "loss": 2.7579, "step": 3253 }, { "crossentropy": 2.698169708251953, "epoch": 0.1179669373549884, "grad_norm": 0.05214552581310272, "grad_norm_var": 2.8089937355684803e-05, "learning_rate": 0.009755498014408538, "loss": 2.6545, "step": 3254 }, { "crossentropy": 2.612401247024536, "epoch": 0.11800319025522042, "grad_norm": 0.04114682599902153, "grad_norm_var": 2.8626008823771226e-05, "learning_rate": 0.009755318493387408, "loss": 2.6483, "step": 3255 }, { "crossentropy": 2.6684396266937256, "epoch": 0.11803944315545244, "grad_norm": 0.03704225644469261, "grad_norm_var": 3.2225857069864174e-05, "learning_rate": 0.009755138908138483, "loss": 2.6394, "step": 3256 }, { "crossentropy": 2.8235061168670654, "epoch": 0.11807569605568445, "grad_norm": 0.08863939344882965, "grad_norm_var": 0.00014714807756423294, "learning_rate": 0.009754959258664193, "loss": 2.814, "step": 3257 }, { "crossentropy": 2.7125790119171143, "epoch": 0.11811194895591648, "grad_norm": 0.043650154024362564, "grad_norm_var": 0.0001468358437428161, "learning_rate": 0.00975477954496696, "loss": 2.7337, "step": 3258 }, { "crossentropy": 2.7409865856170654, "epoch": 0.1181482018561485, "grad_norm": 0.06002599000930786, "grad_norm_var": 0.00015646854587284334, "learning_rate": 0.009754599767049217, "loss": 2.6975, "step": 3259 }, { "crossentropy": 2.727975368499756, "epoch": 0.11818445475638051, "grad_norm": 0.04866819083690643, "grad_norm_var": 0.00015237202957465255, "learning_rate": 0.009754419924913386, "loss": 2.6877, "step": 3260 }, { "crossentropy": 2.9242513179779053, "epoch": 0.11822070765661252, "grad_norm": 0.0425606295466423, "grad_norm_var": 0.000151601470972322, "learning_rate": 0.0097542400185619, "loss": 2.8363, "step": 3261 }, { "crossentropy": 2.75565242767334, "epoch": 0.11825696055684455, "grad_norm": 0.09555929154157639, "grad_norm_var": 0.0002898473778796112, "learning_rate": 0.009754060047997186, "loss": 2.8006, "step": 3262 }, { "crossentropy": 2.7212753295898438, "epoch": 0.11829321345707657, "grad_norm": 0.0413956418633461, "grad_norm_var": 0.00029433683717138996, "learning_rate": 0.009753880013221678, "loss": 2.7551, "step": 3263 }, { "crossentropy": 2.685203790664673, "epoch": 0.11832946635730858, "grad_norm": 0.04804915562272072, "grad_norm_var": 0.00029137333427585525, "learning_rate": 0.009753699914237806, "loss": 2.7278, "step": 3264 }, { "crossentropy": 2.9352972507476807, "epoch": 0.11836571925754061, "grad_norm": 0.04814760759472847, "grad_norm_var": 0.0002848547762968005, "learning_rate": 0.009753519751048004, "loss": 2.7955, "step": 3265 }, { "crossentropy": 2.7550277709960938, "epoch": 0.11840197215777262, "grad_norm": 0.054355520755052567, "grad_norm_var": 0.00027296006687355833, "learning_rate": 0.009753339523654703, "loss": 2.7027, "step": 3266 }, { "crossentropy": 2.704052209854126, "epoch": 0.11843822505800464, "grad_norm": 0.052232492715120316, "grad_norm_var": 0.0002633568889130711, "learning_rate": 0.009753159232060339, "loss": 2.6752, "step": 3267 }, { "crossentropy": 2.6100339889526367, "epoch": 0.11847447795823667, "grad_norm": 0.0431668795645237, "grad_norm_var": 0.00026825786892902985, "learning_rate": 0.009752978876267344, "loss": 2.6988, "step": 3268 }, { "crossentropy": 2.94079852104187, "epoch": 0.11851073085846868, "grad_norm": 0.040439408272504807, "grad_norm_var": 0.00027828085807859906, "learning_rate": 0.009752798456278158, "loss": 2.7718, "step": 3269 }, { "crossentropy": 2.7536613941192627, "epoch": 0.1185469837587007, "grad_norm": 0.04819157347083092, "grad_norm_var": 0.00027935340698041024, "learning_rate": 0.009752617972095215, "loss": 2.7277, "step": 3270 }, { "crossentropy": 2.733222484588623, "epoch": 0.11858323665893271, "grad_norm": 1.6272908449172974, "grad_norm_var": 0.1552080634532945, "learning_rate": 0.009752437423720955, "loss": 2.7112, "step": 3271 }, { "crossentropy": 2.6279397010803223, "epoch": 0.11861948955916474, "grad_norm": 0.04185837507247925, "grad_norm_var": 0.15513619821256847, "learning_rate": 0.009752256811157815, "loss": 2.7069, "step": 3272 }, { "crossentropy": 2.8150529861450195, "epoch": 0.11865574245939675, "grad_norm": 0.04776156321167946, "grad_norm_var": 0.15558332829313543, "learning_rate": 0.009752076134408236, "loss": 2.7567, "step": 3273 }, { "crossentropy": 2.872624158859253, "epoch": 0.11869199535962877, "grad_norm": 0.057662781327962875, "grad_norm_var": 0.1553988454979589, "learning_rate": 0.009751895393474657, "loss": 2.7676, "step": 3274 }, { "crossentropy": 2.705026388168335, "epoch": 0.1187282482598608, "grad_norm": 0.055401820689439774, "grad_norm_var": 0.15545555443678716, "learning_rate": 0.009751714588359519, "loss": 2.7254, "step": 3275 }, { "crossentropy": 2.7426083087921143, "epoch": 0.11876450116009281, "grad_norm": 0.05391703173518181, "grad_norm_var": 0.15538667720371074, "learning_rate": 0.009751533719065264, "loss": 2.6873, "step": 3276 }, { "crossentropy": 2.6769275665283203, "epoch": 0.11880075406032482, "grad_norm": 0.044106513261795044, "grad_norm_var": 0.15536470727932838, "learning_rate": 0.009751352785594336, "loss": 2.7021, "step": 3277 }, { "crossentropy": 2.7388856410980225, "epoch": 0.11883700696055685, "grad_norm": 0.04584671929478645, "grad_norm_var": 0.1558798256921371, "learning_rate": 0.00975117178794918, "loss": 2.7335, "step": 3278 }, { "crossentropy": 2.7737631797790527, "epoch": 0.11887325986078887, "grad_norm": 0.04340009763836861, "grad_norm_var": 0.15585188925389498, "learning_rate": 0.009750990726132237, "loss": 2.7123, "step": 3279 }, { "crossentropy": 2.7883989810943604, "epoch": 0.11890951276102088, "grad_norm": 0.043749019503593445, "grad_norm_var": 0.15590977241565648, "learning_rate": 0.009750809600145953, "loss": 2.7961, "step": 3280 }, { "crossentropy": 2.74068021774292, "epoch": 0.1189457656612529, "grad_norm": 0.04485359787940979, "grad_norm_var": 0.15595374392077588, "learning_rate": 0.009750628409992778, "loss": 2.7515, "step": 3281 }, { "crossentropy": 2.805908441543579, "epoch": 0.11898201856148492, "grad_norm": 0.048772454261779785, "grad_norm_var": 0.156024296150638, "learning_rate": 0.009750447155675156, "loss": 2.6739, "step": 3282 }, { "crossentropy": 2.7498955726623535, "epoch": 0.11901827146171694, "grad_norm": 0.05796815827488899, "grad_norm_var": 0.15595451633989166, "learning_rate": 0.009750265837195536, "loss": 2.7482, "step": 3283 }, { "crossentropy": 2.7433359622955322, "epoch": 0.11905452436194895, "grad_norm": 0.058263614773750305, "grad_norm_var": 0.15576071310228642, "learning_rate": 0.009750084454556367, "loss": 2.7934, "step": 3284 }, { "crossentropy": 2.802582025527954, "epoch": 0.11909077726218098, "grad_norm": 0.05538922920823097, "grad_norm_var": 0.1555613411211857, "learning_rate": 0.0097499030077601, "loss": 2.785, "step": 3285 }, { "crossentropy": 2.785980224609375, "epoch": 0.119127030162413, "grad_norm": 0.051853153854608536, "grad_norm_var": 0.15551325522254555, "learning_rate": 0.009749721496809183, "loss": 2.8185, "step": 3286 }, { "crossentropy": 2.7416739463806152, "epoch": 0.11916328306264501, "grad_norm": 0.04529601335525513, "grad_norm_var": 3.483867848796298e-05, "learning_rate": 0.009749539921706072, "loss": 2.7905, "step": 3287 }, { "crossentropy": 2.7324163913726807, "epoch": 0.11919953596287702, "grad_norm": 0.0490863211452961, "grad_norm_var": 3.0492481588830177e-05, "learning_rate": 0.009749358282453213, "loss": 2.8449, "step": 3288 }, { "crossentropy": 2.7700419425964355, "epoch": 0.11923578886310905, "grad_norm": 0.04388187453150749, "grad_norm_var": 3.269875524424445e-05, "learning_rate": 0.009749176579053066, "loss": 2.8091, "step": 3289 }, { "crossentropy": 2.719669818878174, "epoch": 0.11927204176334107, "grad_norm": 0.04620622843503952, "grad_norm_var": 2.9144173176043283e-05, "learning_rate": 0.00974899481150808, "loss": 2.7487, "step": 3290 }, { "crossentropy": 2.5713682174682617, "epoch": 0.11930829466357308, "grad_norm": 0.046289101243019104, "grad_norm_var": 2.685901545152446e-05, "learning_rate": 0.009748812979820713, "loss": 2.7051, "step": 3291 }, { "crossentropy": 2.742663621902466, "epoch": 0.11934454756380511, "grad_norm": 0.05072197690606117, "grad_norm_var": 2.5266001899984515e-05, "learning_rate": 0.009748631083993422, "loss": 2.7342, "step": 3292 }, { "crossentropy": 2.700068235397339, "epoch": 0.11938080046403712, "grad_norm": 0.11640401929616928, "grad_norm_var": 0.00030978767598847446, "learning_rate": 0.00974844912402866, "loss": 2.7369, "step": 3293 }, { "crossentropy": 2.637727975845337, "epoch": 0.11941705336426914, "grad_norm": 0.041810981929302216, "grad_norm_var": 0.0003146541732916373, "learning_rate": 0.009748267099928888, "loss": 2.7174, "step": 3294 }, { "crossentropy": 2.7115769386291504, "epoch": 0.11945330626450117, "grad_norm": 0.050600629299879074, "grad_norm_var": 0.0003089213324455053, "learning_rate": 0.00974808501169656, "loss": 2.7298, "step": 3295 }, { "crossentropy": 2.6616151332855225, "epoch": 0.11948955916473318, "grad_norm": 0.09217670559883118, "grad_norm_var": 0.00039449530722659575, "learning_rate": 0.009747902859334141, "loss": 2.6603, "step": 3296 }, { "crossentropy": 2.8461272716522217, "epoch": 0.1195258120649652, "grad_norm": 0.04667432978749275, "grad_norm_var": 0.000391942322393405, "learning_rate": 0.009747720642844088, "loss": 2.8515, "step": 3297 }, { "crossentropy": 2.685025453567505, "epoch": 0.11956206496519721, "grad_norm": 0.045920200645923615, "grad_norm_var": 0.00039532764856278123, "learning_rate": 0.009747538362228863, "loss": 2.713, "step": 3298 }, { "crossentropy": 2.750471830368042, "epoch": 0.11959831786542924, "grad_norm": 0.04531897231936455, "grad_norm_var": 0.000402276362974918, "learning_rate": 0.009747356017490928, "loss": 2.7828, "step": 3299 }, { "crossentropy": 2.705716848373413, "epoch": 0.11963457076566125, "grad_norm": 0.04884710907936096, "grad_norm_var": 0.00040418315112778836, "learning_rate": 0.009747173608632744, "loss": 2.6537, "step": 3300 }, { "crossentropy": 2.6684672832489014, "epoch": 0.11967082366589327, "grad_norm": 0.04564646631479263, "grad_norm_var": 0.0004093240743104733, "learning_rate": 0.009746991135656777, "loss": 2.7347, "step": 3301 }, { "crossentropy": 2.7162320613861084, "epoch": 0.1197070765661253, "grad_norm": 0.07526375353336334, "grad_norm_var": 0.00043634300346893803, "learning_rate": 0.009746808598565491, "loss": 2.6232, "step": 3302 }, { "crossentropy": 2.7777278423309326, "epoch": 0.11974332946635731, "grad_norm": 0.04402675852179527, "grad_norm_var": 0.00043819323728935724, "learning_rate": 0.009746625997361353, "loss": 2.7046, "step": 3303 }, { "crossentropy": 2.6959216594696045, "epoch": 0.11977958236658932, "grad_norm": 0.04567129537463188, "grad_norm_var": 0.0004418674345891086, "learning_rate": 0.009746443332046826, "loss": 2.7159, "step": 3304 }, { "crossentropy": 2.8143820762634277, "epoch": 0.11981583526682135, "grad_norm": 0.064462810754776, "grad_norm_var": 0.000436894842235333, "learning_rate": 0.009746260602624379, "loss": 2.8105, "step": 3305 }, { "crossentropy": 2.705199956893921, "epoch": 0.11985208816705337, "grad_norm": 0.04584171622991562, "grad_norm_var": 0.0004374096413816447, "learning_rate": 0.009746077809096481, "loss": 2.7176, "step": 3306 }, { "crossentropy": 2.734239101409912, "epoch": 0.11988834106728538, "grad_norm": 0.07579144090414047, "grad_norm_var": 0.00045123058781740624, "learning_rate": 0.009745894951465599, "loss": 2.7532, "step": 3307 }, { "crossentropy": 2.736711025238037, "epoch": 0.1199245939675174, "grad_norm": 0.05174071714282036, "grad_norm_var": 0.00045024591608600246, "learning_rate": 0.009745712029734203, "loss": 2.6938, "step": 3308 }, { "crossentropy": 2.5719850063323975, "epoch": 0.11996084686774942, "grad_norm": 0.04568875953555107, "grad_norm_var": 0.00021694333689591713, "learning_rate": 0.009745529043904765, "loss": 2.6546, "step": 3309 }, { "crossentropy": 2.749587059020996, "epoch": 0.11999709976798144, "grad_norm": 0.04226025566458702, "grad_norm_var": 0.00021622024061655637, "learning_rate": 0.009745345993979753, "loss": 2.774, "step": 3310 }, { "crossentropy": 2.8688089847564697, "epoch": 0.12003335266821345, "grad_norm": 0.05889920890331268, "grad_norm_var": 0.00021662946398553942, "learning_rate": 0.009745162879961645, "loss": 2.8255, "step": 3311 }, { "crossentropy": 2.7037951946258545, "epoch": 0.12006960556844548, "grad_norm": 0.04950304701924324, "grad_norm_var": 0.00011686401560989194, "learning_rate": 0.00974497970185291, "loss": 2.7374, "step": 3312 }, { "crossentropy": 2.54811692237854, "epoch": 0.1201058584686775, "grad_norm": 0.0402103029191494, "grad_norm_var": 0.0001240416584743255, "learning_rate": 0.009744796459656025, "loss": 2.6587, "step": 3313 }, { "crossentropy": 2.819711208343506, "epoch": 0.12014211136890951, "grad_norm": 0.048450320959091187, "grad_norm_var": 0.00012253636967336038, "learning_rate": 0.009744613153373462, "loss": 2.75, "step": 3314 }, { "crossentropy": 2.725738286972046, "epoch": 0.12017836426914154, "grad_norm": 0.039676733314991, "grad_norm_var": 0.0001293463720983355, "learning_rate": 0.009744429783007698, "loss": 2.7107, "step": 3315 }, { "crossentropy": 2.7667791843414307, "epoch": 0.12021461716937355, "grad_norm": 0.04089352488517761, "grad_norm_var": 0.00013597958392513453, "learning_rate": 0.00974424634856121, "loss": 2.7942, "step": 3316 }, { "crossentropy": 2.590949773788452, "epoch": 0.12025087006960557, "grad_norm": 0.04156037047505379, "grad_norm_var": 0.00013987259027554503, "learning_rate": 0.009744062850036476, "loss": 2.6298, "step": 3317 }, { "crossentropy": 2.7193830013275146, "epoch": 0.12028712296983758, "grad_norm": 0.043767299503088, "grad_norm_var": 9.838765654778624e-05, "learning_rate": 0.009743879287435973, "loss": 2.7225, "step": 3318 }, { "crossentropy": 2.84232497215271, "epoch": 0.12032337587006961, "grad_norm": 0.039804618805646896, "grad_norm_var": 0.00010210604129122635, "learning_rate": 0.009743695660762181, "loss": 2.7875, "step": 3319 }, { "crossentropy": 2.6234779357910156, "epoch": 0.12035962877030162, "grad_norm": 0.03956440091133118, "grad_norm_var": 0.00010664974404852936, "learning_rate": 0.009743511970017582, "loss": 2.676, "step": 3320 }, { "crossentropy": 2.5220816135406494, "epoch": 0.12039588167053364, "grad_norm": 0.044266462326049805, "grad_norm_var": 8.78306467341899e-05, "learning_rate": 0.009743328215204653, "loss": 2.5775, "step": 3321 }, { "crossentropy": 2.6238365173339844, "epoch": 0.12043213457076567, "grad_norm": 0.03986075520515442, "grad_norm_var": 9.07866832661484e-05, "learning_rate": 0.00974314439632588, "loss": 2.6737, "step": 3322 }, { "crossentropy": 2.75134539604187, "epoch": 0.12046838747099768, "grad_norm": 0.04131404310464859, "grad_norm_var": 2.9835139666907947e-05, "learning_rate": 0.009742960513383744, "loss": 2.7838, "step": 3323 }, { "crossentropy": 2.8927643299102783, "epoch": 0.1205046403712297, "grad_norm": 0.04299324378371239, "grad_norm_var": 2.5841582509584607e-05, "learning_rate": 0.009742776566380727, "loss": 2.853, "step": 3324 }, { "crossentropy": 2.6531875133514404, "epoch": 0.12054089327146171, "grad_norm": 0.040697094053030014, "grad_norm_var": 2.6055004739071e-05, "learning_rate": 0.009742592555319316, "loss": 2.7539, "step": 3325 }, { "crossentropy": 2.6192736625671387, "epoch": 0.12057714617169374, "grad_norm": 0.04063909500837326, "grad_norm_var": 2.645646215674215e-05, "learning_rate": 0.009742408480201995, "loss": 2.6937, "step": 3326 }, { "crossentropy": 2.728226900100708, "epoch": 0.12061339907192575, "grad_norm": 0.043937910348176956, "grad_norm_var": 9.241358414191492e-06, "learning_rate": 0.00974222434103125, "loss": 2.7439, "step": 3327 }, { "crossentropy": 2.755561351776123, "epoch": 0.12064965197215777, "grad_norm": 0.059268780052661896, "grad_norm_var": 2.455341968509942e-05, "learning_rate": 0.009742040137809571, "loss": 2.781, "step": 3328 }, { "crossentropy": 2.673048496246338, "epoch": 0.1206859048723898, "grad_norm": 0.04560177028179169, "grad_norm_var": 2.4413955735550635e-05, "learning_rate": 0.009741855870539442, "loss": 2.7572, "step": 3329 }, { "crossentropy": 2.6393754482269287, "epoch": 0.12072215777262181, "grad_norm": 0.042865999042987823, "grad_norm_var": 2.2504755278748845e-05, "learning_rate": 0.009741671539223354, "loss": 2.7335, "step": 3330 }, { "crossentropy": 2.6918814182281494, "epoch": 0.12075841067285382, "grad_norm": 0.04352448135614395, "grad_norm_var": 2.1766428294035337e-05, "learning_rate": 0.009741487143863796, "loss": 2.6725, "step": 3331 }, { "crossentropy": 2.906733512878418, "epoch": 0.12079466357308585, "grad_norm": 0.041783656924963, "grad_norm_var": 2.154695540462903e-05, "learning_rate": 0.009741302684463257, "loss": 2.6908, "step": 3332 }, { "crossentropy": 2.7277631759643555, "epoch": 0.12083091647331787, "grad_norm": 0.05651390179991722, "grad_norm_var": 3.222221055696605e-05, "learning_rate": 0.009741118161024232, "loss": 2.7423, "step": 3333 }, { "crossentropy": 2.657602071762085, "epoch": 0.12086716937354988, "grad_norm": 0.04253879189491272, "grad_norm_var": 3.237926018005964e-05, "learning_rate": 0.009740933573549212, "loss": 2.7069, "step": 3334 }, { "crossentropy": 2.7203428745269775, "epoch": 0.1209034222737819, "grad_norm": 0.04374292865395546, "grad_norm_var": 3.110706203747667e-05, "learning_rate": 0.009740748922040687, "loss": 2.7454, "step": 3335 }, { "crossentropy": 2.7360410690307617, "epoch": 0.12093967517401392, "grad_norm": 0.049971360713243484, "grad_norm_var": 3.127784877700873e-05, "learning_rate": 0.009740564206501155, "loss": 2.6667, "step": 3336 }, { "crossentropy": 2.67842435836792, "epoch": 0.12097592807424594, "grad_norm": 0.046661894768476486, "grad_norm_var": 3.1411770737283885e-05, "learning_rate": 0.009740379426933109, "loss": 2.6575, "step": 3337 }, { "crossentropy": 2.623880386352539, "epoch": 0.12101218097447795, "grad_norm": 0.03934387490153313, "grad_norm_var": 3.179090341339132e-05, "learning_rate": 0.009740194583339046, "loss": 2.6873, "step": 3338 }, { "crossentropy": 2.6432104110717773, "epoch": 0.12104843387470998, "grad_norm": 0.037063296884298325, "grad_norm_var": 3.505883225042028e-05, "learning_rate": 0.009740009675721459, "loss": 2.6057, "step": 3339 }, { "crossentropy": 2.701547622680664, "epoch": 0.121084686774942, "grad_norm": 0.04223455861210823, "grad_norm_var": 3.52797760321856e-05, "learning_rate": 0.009739824704082852, "loss": 2.7393, "step": 3340 }, { "crossentropy": 2.805293321609497, "epoch": 0.12112093967517401, "grad_norm": 0.04784242808818817, "grad_norm_var": 3.458632820070486e-05, "learning_rate": 0.009739639668425717, "loss": 2.8236, "step": 3341 }, { "crossentropy": 2.8881185054779053, "epoch": 0.12115719257540604, "grad_norm": 0.06582947075366974, "grad_norm_var": 5.885696321018026e-05, "learning_rate": 0.009739454568752555, "loss": 2.731, "step": 3342 }, { "crossentropy": 2.659437894821167, "epoch": 0.12119344547563805, "grad_norm": 0.046770885586738586, "grad_norm_var": 5.827924341770541e-05, "learning_rate": 0.009739269405065868, "loss": 2.6971, "step": 3343 }, { "crossentropy": 2.784104824066162, "epoch": 0.12122969837587007, "grad_norm": 0.04635991156101227, "grad_norm_var": 4.752982565000355e-05, "learning_rate": 0.009739084177368155, "loss": 2.7277, "step": 3344 }, { "crossentropy": 2.6332662105560303, "epoch": 0.12126595127610208, "grad_norm": 0.042073220014572144, "grad_norm_var": 4.857324778966253e-05, "learning_rate": 0.009738898885661918, "loss": 2.7485, "step": 3345 }, { "crossentropy": 2.772963285446167, "epoch": 0.12130220417633411, "grad_norm": 0.039724014699459076, "grad_norm_var": 5.0480158919024897e-05, "learning_rate": 0.009738713529949662, "loss": 2.7502, "step": 3346 }, { "crossentropy": 2.8237671852111816, "epoch": 0.12133845707656613, "grad_norm": 0.04020782560110092, "grad_norm_var": 5.215125298258308e-05, "learning_rate": 0.009738528110233886, "loss": 2.6779, "step": 3347 }, { "crossentropy": 2.7959635257720947, "epoch": 0.12137470997679814, "grad_norm": 0.040580201894044876, "grad_norm_var": 5.284473816145422e-05, "learning_rate": 0.009738342626517097, "loss": 2.7567, "step": 3348 }, { "crossentropy": 2.945577383041382, "epoch": 0.12141096287703017, "grad_norm": 0.046551626175642014, "grad_norm_var": 4.437291899269946e-05, "learning_rate": 0.009738157078801804, "loss": 2.8693, "step": 3349 }, { "crossentropy": 2.809008836746216, "epoch": 0.12144721577726218, "grad_norm": 0.04663150757551193, "grad_norm_var": 4.4162135510437264e-05, "learning_rate": 0.009737971467090505, "loss": 2.7355, "step": 3350 }, { "crossentropy": 2.7606887817382812, "epoch": 0.1214834686774942, "grad_norm": 0.05539751797914505, "grad_norm_var": 5.054372942619126e-05, "learning_rate": 0.009737785791385714, "loss": 2.7837, "step": 3351 }, { "crossentropy": 2.7482614517211914, "epoch": 0.12151972157772621, "grad_norm": 0.059079382568597794, "grad_norm_var": 6.076052691157217e-05, "learning_rate": 0.009737600051689934, "loss": 2.7618, "step": 3352 }, { "crossentropy": 2.6898412704467773, "epoch": 0.12155597447795824, "grad_norm": 0.038567349314689636, "grad_norm_var": 6.456971155355448e-05, "learning_rate": 0.009737414248005677, "loss": 2.6965, "step": 3353 }, { "crossentropy": 2.744901418685913, "epoch": 0.12159222737819025, "grad_norm": 0.0367458201944828, "grad_norm_var": 6.725957467432741e-05, "learning_rate": 0.00973722838033545, "loss": 2.7974, "step": 3354 }, { "crossentropy": 2.6806881427764893, "epoch": 0.12162848027842227, "grad_norm": 0.04252590984106064, "grad_norm_var": 6.281315930689445e-05, "learning_rate": 0.009737042448681765, "loss": 2.7755, "step": 3355 }, { "crossentropy": 2.7327795028686523, "epoch": 0.1216647331786543, "grad_norm": 0.04020191356539726, "grad_norm_var": 6.411089358452493e-05, "learning_rate": 0.009736856453047133, "loss": 2.7404, "step": 3356 }, { "crossentropy": 2.781097173690796, "epoch": 0.12170098607888631, "grad_norm": 0.04397464171051979, "grad_norm_var": 6.406636678440066e-05, "learning_rate": 0.009736670393434068, "loss": 2.7293, "step": 3357 }, { "crossentropy": 2.7988927364349365, "epoch": 0.12173723897911833, "grad_norm": 0.04344876483082771, "grad_norm_var": 3.5308085003507517e-05, "learning_rate": 0.009736484269845079, "loss": 2.791, "step": 3358 }, { "crossentropy": 2.740485906600952, "epoch": 0.12177349187935035, "grad_norm": 0.04055874049663544, "grad_norm_var": 3.5675502940938665e-05, "learning_rate": 0.009736298082282683, "loss": 2.7398, "step": 3359 }, { "crossentropy": 2.6287448406219482, "epoch": 0.12180974477958237, "grad_norm": 0.041848719120025635, "grad_norm_var": 3.547639794209267e-05, "learning_rate": 0.009736111830749393, "loss": 2.6638, "step": 3360 }, { "crossentropy": 2.684713125228882, "epoch": 0.12184599767981438, "grad_norm": 0.04139203205704689, "grad_norm_var": 3.56470045599538e-05, "learning_rate": 0.009735925515247726, "loss": 2.7152, "step": 3361 }, { "crossentropy": 2.7387499809265137, "epoch": 0.1218822505800464, "grad_norm": 0.039841242134571075, "grad_norm_var": 3.558744078567243e-05, "learning_rate": 0.009735739135780198, "loss": 2.7354, "step": 3362 }, { "crossentropy": 2.794102668762207, "epoch": 0.12191850348027843, "grad_norm": 0.0491039864718914, "grad_norm_var": 3.651362165737465e-05, "learning_rate": 0.009735552692349327, "loss": 2.7086, "step": 3363 }, { "crossentropy": 2.7148547172546387, "epoch": 0.12195475638051044, "grad_norm": 0.05570696294307709, "grad_norm_var": 4.360865021008238e-05, "learning_rate": 0.00973536618495763, "loss": 2.6974, "step": 3364 }, { "crossentropy": 2.5812325477600098, "epoch": 0.12199100928074245, "grad_norm": 0.0460461862385273, "grad_norm_var": 4.352668848138045e-05, "learning_rate": 0.009735179613607625, "loss": 2.6789, "step": 3365 }, { "crossentropy": 2.7275164127349854, "epoch": 0.12202726218097448, "grad_norm": 0.04755161330103874, "grad_norm_var": 4.377154576807266e-05, "learning_rate": 0.009734992978301835, "loss": 2.7496, "step": 3366 }, { "crossentropy": 2.727987766265869, "epoch": 0.1220635150812065, "grad_norm": 0.045407336205244064, "grad_norm_var": 3.6325268743421744e-05, "learning_rate": 0.009734806279042779, "loss": 2.6425, "step": 3367 }, { "crossentropy": 2.548630952835083, "epoch": 0.12209976798143851, "grad_norm": 0.04300866648554802, "grad_norm_var": 2.1226944887133468e-05, "learning_rate": 0.009734619515832979, "loss": 2.5937, "step": 3368 }, { "crossentropy": 2.5822696685791016, "epoch": 0.12213602088167054, "grad_norm": 0.038019660860300064, "grad_norm_var": 2.1605579986101684e-05, "learning_rate": 0.009734432688674957, "loss": 2.6635, "step": 3369 }, { "crossentropy": 2.844738006591797, "epoch": 0.12217227378190255, "grad_norm": 0.0418708436191082, "grad_norm_var": 1.8658204739664153e-05, "learning_rate": 0.009734245797571235, "loss": 2.7352, "step": 3370 }, { "crossentropy": 2.8462960720062256, "epoch": 0.12220852668213457, "grad_norm": 0.046571284532547, "grad_norm_var": 1.90036678638714e-05, "learning_rate": 0.009734058842524342, "loss": 2.8199, "step": 3371 }, { "crossentropy": 2.6151139736175537, "epoch": 0.12224477958236658, "grad_norm": 0.043563902378082275, "grad_norm_var": 1.7992071725627065e-05, "learning_rate": 0.0097338718235368, "loss": 2.763, "step": 3372 }, { "crossentropy": 2.7491140365600586, "epoch": 0.12228103248259861, "grad_norm": 0.04785458743572235, "grad_norm_var": 1.8793257138697597e-05, "learning_rate": 0.009733684740611133, "loss": 2.7527, "step": 3373 }, { "crossentropy": 2.7741363048553467, "epoch": 0.12231728538283063, "grad_norm": 0.048047248274087906, "grad_norm_var": 1.947821404697092e-05, "learning_rate": 0.009733497593749871, "loss": 2.7912, "step": 3374 }, { "crossentropy": 2.666705846786499, "epoch": 0.12235353828306264, "grad_norm": 0.0398624911904335, "grad_norm_var": 1.9899880232652985e-05, "learning_rate": 0.00973331038295554, "loss": 2.7106, "step": 3375 }, { "crossentropy": 2.8750391006469727, "epoch": 0.12238979118329467, "grad_norm": 0.039638977497816086, "grad_norm_var": 2.1054291968438062e-05, "learning_rate": 0.00973312310823067, "loss": 2.8352, "step": 3376 }, { "crossentropy": 2.784520149230957, "epoch": 0.12242604408352668, "grad_norm": 0.041793592274188995, "grad_norm_var": 2.089298922110973e-05, "learning_rate": 0.00973293576957779, "loss": 2.8588, "step": 3377 }, { "crossentropy": 2.764920234680176, "epoch": 0.1224622969837587, "grad_norm": 0.041159145534038544, "grad_norm_var": 2.0162163109419388e-05, "learning_rate": 0.00973274836699943, "loss": 2.7847, "step": 3378 }, { "crossentropy": 2.7591452598571777, "epoch": 0.12249854988399073, "grad_norm": 0.040380001068115234, "grad_norm_var": 1.9796671175302075e-05, "learning_rate": 0.00973256090049812, "loss": 2.7844, "step": 3379 }, { "crossentropy": 2.5370631217956543, "epoch": 0.12253480278422274, "grad_norm": 0.03806958347558975, "grad_norm_var": 1.2073180215700872e-05, "learning_rate": 0.009732373370076396, "loss": 2.5971, "step": 3380 }, { "crossentropy": 2.832425117492676, "epoch": 0.12257105568445475, "grad_norm": 0.039831798523664474, "grad_norm_var": 1.2006585201291646e-05, "learning_rate": 0.009732185775736783, "loss": 2.7618, "step": 3381 }, { "crossentropy": 2.725691556930542, "epoch": 0.12260730858468677, "grad_norm": 0.04670051485300064, "grad_norm_var": 1.1497260642063095e-05, "learning_rate": 0.009731998117481823, "loss": 2.7257, "step": 3382 }, { "crossentropy": 2.7605743408203125, "epoch": 0.1226435614849188, "grad_norm": 0.04886048659682274, "grad_norm_var": 1.3529910801483337e-05, "learning_rate": 0.009731810395314048, "loss": 2.7809, "step": 3383 }, { "crossentropy": 2.7093544006347656, "epoch": 0.12267981438515081, "grad_norm": 0.04517652094364166, "grad_norm_var": 1.387613138688996e-05, "learning_rate": 0.00973162260923599, "loss": 2.7428, "step": 3384 }, { "crossentropy": 2.705533742904663, "epoch": 0.12271606728538283, "grad_norm": 0.04250645264983177, "grad_norm_var": 1.2177315101885347e-05, "learning_rate": 0.009731434759250191, "loss": 2.688, "step": 3385 }, { "crossentropy": 2.7632999420166016, "epoch": 0.12275232018561485, "grad_norm": 0.03733719512820244, "grad_norm_var": 1.4291366599722178e-05, "learning_rate": 0.009731246845359184, "loss": 2.7341, "step": 3386 }, { "crossentropy": 2.8046162128448486, "epoch": 0.12278857308584687, "grad_norm": 0.038119424134492874, "grad_norm_var": 1.46859404924116e-05, "learning_rate": 0.00973105886756551, "loss": 2.7646, "step": 3387 }, { "crossentropy": 2.7029426097869873, "epoch": 0.12282482598607888, "grad_norm": 0.04198423773050308, "grad_norm_var": 1.4603363111789259e-05, "learning_rate": 0.009730870825871705, "loss": 2.6454, "step": 3388 }, { "crossentropy": 2.641913414001465, "epoch": 0.1228610788863109, "grad_norm": 0.04309076443314552, "grad_norm_var": 1.2514328703860701e-05, "learning_rate": 0.00973068272028031, "loss": 2.6252, "step": 3389 }, { "crossentropy": 2.852001905441284, "epoch": 0.12289733178654293, "grad_norm": 0.04279496520757675, "grad_norm_var": 1.002801103964773e-05, "learning_rate": 0.009730494550793865, "loss": 2.7914, "step": 3390 }, { "crossentropy": 2.6679723262786865, "epoch": 0.12293358468677494, "grad_norm": 0.04456032067537308, "grad_norm_var": 1.0252231744322231e-05, "learning_rate": 0.009730306317414915, "loss": 2.7069, "step": 3391 }, { "crossentropy": 2.8412773609161377, "epoch": 0.12296983758700696, "grad_norm": 0.04802794009447098, "grad_norm_var": 1.2009501205389784e-05, "learning_rate": 0.009730118020145997, "loss": 2.7894, "step": 3392 }, { "crossentropy": 2.642420768737793, "epoch": 0.12300609048723898, "grad_norm": 0.04729607328772545, "grad_norm_var": 1.336554824726225e-05, "learning_rate": 0.009729929658989658, "loss": 2.6887, "step": 3393 }, { "crossentropy": 2.69748592376709, "epoch": 0.123042343387471, "grad_norm": 0.04248826578259468, "grad_norm_var": 1.3173039645321637e-05, "learning_rate": 0.00972974123394844, "loss": 2.6979, "step": 3394 }, { "crossentropy": 2.6268327236175537, "epoch": 0.12307859628770301, "grad_norm": 0.04056615009903908, "grad_norm_var": 1.3111380245670451e-05, "learning_rate": 0.00972955274502489, "loss": 2.7691, "step": 3395 }, { "crossentropy": 2.7356019020080566, "epoch": 0.12311484918793504, "grad_norm": 0.04774124547839165, "grad_norm_var": 1.2647149094625424e-05, "learning_rate": 0.009729364192221552, "loss": 2.777, "step": 3396 }, { "crossentropy": 2.6142587661743164, "epoch": 0.12315110208816706, "grad_norm": 0.05007994547486305, "grad_norm_var": 1.4106444475051743e-05, "learning_rate": 0.009729175575540976, "loss": 2.6684, "step": 3397 }, { "crossentropy": 2.7763383388519287, "epoch": 0.12318735498839907, "grad_norm": 0.051723942160606384, "grad_norm_var": 1.735297837270981e-05, "learning_rate": 0.009728986894985706, "loss": 2.6564, "step": 3398 }, { "crossentropy": 2.8244588375091553, "epoch": 0.12322360788863108, "grad_norm": 0.04587726667523384, "grad_norm_var": 1.6183563465872974e-05, "learning_rate": 0.009728798150558288, "loss": 2.7196, "step": 3399 }, { "crossentropy": 2.902059316635132, "epoch": 0.12325986078886311, "grad_norm": 0.04270021244883537, "grad_norm_var": 1.628919223240945e-05, "learning_rate": 0.00972860934226128, "loss": 2.7594, "step": 3400 }, { "crossentropy": 2.631239414215088, "epoch": 0.12329611368909513, "grad_norm": 0.04810790717601776, "grad_norm_var": 1.699963170631383e-05, "learning_rate": 0.009728420470097223, "loss": 2.6407, "step": 3401 }, { "crossentropy": 2.8989696502685547, "epoch": 0.12333236658932714, "grad_norm": 0.04587722197175026, "grad_norm_var": 1.3366524049145586e-05, "learning_rate": 0.009728231534068675, "loss": 2.8598, "step": 3402 }, { "crossentropy": 2.824068546295166, "epoch": 0.12336861948955917, "grad_norm": 0.04113896191120148, "grad_norm_var": 1.114015434519096e-05, "learning_rate": 0.009728042534178184, "loss": 2.71, "step": 3403 }, { "crossentropy": 2.708935022354126, "epoch": 0.12340487238979118, "grad_norm": 0.040035128593444824, "grad_norm_var": 1.2227203947577127e-05, "learning_rate": 0.0097278534704283, "loss": 2.7329, "step": 3404 }, { "crossentropy": 2.7577431201934814, "epoch": 0.1234411252900232, "grad_norm": 0.04076094925403595, "grad_norm_var": 1.320043947480957e-05, "learning_rate": 0.009727664342821584, "loss": 2.6163, "step": 3405 }, { "crossentropy": 2.733292818069458, "epoch": 0.12347737819025523, "grad_norm": 0.03929036855697632, "grad_norm_var": 1.4991917083508286e-05, "learning_rate": 0.009727475151360585, "loss": 2.6892, "step": 3406 }, { "crossentropy": 2.787000894546509, "epoch": 0.12351363109048724, "grad_norm": 0.04317604377865791, "grad_norm_var": 1.514982668893618e-05, "learning_rate": 0.009727285896047861, "loss": 2.7302, "step": 3407 }, { "crossentropy": 2.649129867553711, "epoch": 0.12354988399071926, "grad_norm": 0.04318065941333771, "grad_norm_var": 1.4454855348330388e-05, "learning_rate": 0.009727096576885965, "loss": 2.73, "step": 3408 }, { "crossentropy": 2.931295156478882, "epoch": 0.12358613689095127, "grad_norm": 0.03942592814564705, "grad_norm_var": 1.5263464099567597e-05, "learning_rate": 0.00972690719387746, "loss": 2.833, "step": 3409 }, { "crossentropy": 2.4546186923980713, "epoch": 0.1236223897911833, "grad_norm": 0.03940480947494507, "grad_norm_var": 1.643219339700445e-05, "learning_rate": 0.009726717747024896, "loss": 2.618, "step": 3410 }, { "crossentropy": 2.8205487728118896, "epoch": 0.12365864269141531, "grad_norm": 0.04004629701375961, "grad_norm_var": 1.6665812068857438e-05, "learning_rate": 0.009726528236330837, "loss": 2.8013, "step": 3411 }, { "crossentropy": 2.738278388977051, "epoch": 0.12369489559164733, "grad_norm": 0.03616747260093689, "grad_norm_var": 1.8740435591677735e-05, "learning_rate": 0.009726338661797842, "loss": 2.748, "step": 3412 }, { "crossentropy": 2.8366241455078125, "epoch": 0.12373114849187936, "grad_norm": 0.037028007209300995, "grad_norm_var": 1.695705399210734e-05, "learning_rate": 0.00972614902342847, "loss": 2.7281, "step": 3413 }, { "crossentropy": 2.705089807510376, "epoch": 0.12376740139211137, "grad_norm": 0.04269276559352875, "grad_norm_var": 1.0491629123633658e-05, "learning_rate": 0.009725959321225284, "loss": 2.6976, "step": 3414 }, { "crossentropy": 2.6890082359313965, "epoch": 0.12380365429234338, "grad_norm": 0.44520190358161926, "grad_norm_var": 0.010206783827190759, "learning_rate": 0.009725769555190845, "loss": 2.7178, "step": 3415 }, { "crossentropy": 2.717378854751587, "epoch": 0.1238399071925754, "grad_norm": 0.04706711322069168, "grad_norm_var": 0.01019410964437666, "learning_rate": 0.009725579725327718, "loss": 2.7269, "step": 3416 }, { "crossentropy": 2.6783876419067383, "epoch": 0.12387616009280743, "grad_norm": 0.05087197944521904, "grad_norm_var": 0.010187702882328115, "learning_rate": 0.009725389831638464, "loss": 2.7396, "step": 3417 }, { "crossentropy": 2.6930816173553467, "epoch": 0.12391241299303944, "grad_norm": 0.05642713978886604, "grad_norm_var": 0.010165002476937919, "learning_rate": 0.009725199874125649, "loss": 2.7743, "step": 3418 }, { "crossentropy": 2.5312600135803223, "epoch": 0.12394866589327146, "grad_norm": 0.05032786726951599, "grad_norm_var": 0.01013783583455999, "learning_rate": 0.00972500985279184, "loss": 2.6993, "step": 3419 }, { "crossentropy": 2.6529462337493896, "epoch": 0.12398491879350348, "grad_norm": 0.08865204453468323, "grad_norm_var": 0.01010302794349572, "learning_rate": 0.0097248197676396, "loss": 2.7612, "step": 3420 }, { "crossentropy": 2.615908622741699, "epoch": 0.1240211716937355, "grad_norm": 0.04841846227645874, "grad_norm_var": 0.01007558119530551, "learning_rate": 0.009724629618671503, "loss": 2.6701, "step": 3421 }, { "crossentropy": 2.915178060531616, "epoch": 0.12405742459396751, "grad_norm": 0.05632524564862251, "grad_norm_var": 0.010020079949300703, "learning_rate": 0.009724439405890111, "loss": 2.7775, "step": 3422 }, { "crossentropy": 2.7659122943878174, "epoch": 0.12409367749419954, "grad_norm": 0.04057091847062111, "grad_norm_var": 0.01003078561359182, "learning_rate": 0.009724249129297995, "loss": 2.7364, "step": 3423 }, { "crossentropy": 2.736380100250244, "epoch": 0.12412993039443156, "grad_norm": 0.04178189858794212, "grad_norm_var": 0.010036397077987234, "learning_rate": 0.009724058788897726, "loss": 2.7258, "step": 3424 }, { "crossentropy": 2.758633852005005, "epoch": 0.12416618329466357, "grad_norm": 0.04700728505849838, "grad_norm_var": 0.010006530649331756, "learning_rate": 0.009723868384691872, "loss": 2.7527, "step": 3425 }, { "crossentropy": 2.813366413116455, "epoch": 0.12420243619489559, "grad_norm": 0.044328365474939346, "grad_norm_var": 0.009985991724137696, "learning_rate": 0.009723677916683009, "loss": 2.7859, "step": 3426 }, { "crossentropy": 2.652791976928711, "epoch": 0.12423868909512761, "grad_norm": 0.042686644941568375, "grad_norm_var": 0.00997471806767874, "learning_rate": 0.009723487384873705, "loss": 2.6503, "step": 3427 }, { "crossentropy": 2.615388870239258, "epoch": 0.12427494199535963, "grad_norm": 0.04330558702349663, "grad_norm_var": 0.009942397893466147, "learning_rate": 0.009723296789266539, "loss": 2.6777, "step": 3428 }, { "crossentropy": 2.683361530303955, "epoch": 0.12431119489559164, "grad_norm": 0.048008766025304794, "grad_norm_var": 0.009895922799762712, "learning_rate": 0.00972310612986408, "loss": 2.6935, "step": 3429 }, { "crossentropy": 2.789468288421631, "epoch": 0.12434744779582367, "grad_norm": 0.04908684268593788, "grad_norm_var": 0.009871271812430717, "learning_rate": 0.009722915406668906, "loss": 2.7681, "step": 3430 }, { "crossentropy": 2.813819408416748, "epoch": 0.12438370069605569, "grad_norm": 0.040754593908786774, "grad_norm_var": 0.0001314810003291783, "learning_rate": 0.009722724619683591, "loss": 2.8025, "step": 3431 }, { "crossentropy": 2.9658238887786865, "epoch": 0.1244199535962877, "grad_norm": 0.042161908000707626, "grad_norm_var": 0.00013472399427035073, "learning_rate": 0.009722533768910713, "loss": 2.8868, "step": 3432 }, { "crossentropy": 2.7042670249938965, "epoch": 0.12445620649651973, "grad_norm": 0.03966880217194557, "grad_norm_var": 0.0001403991231362792, "learning_rate": 0.00972234285435285, "loss": 2.721, "step": 3433 }, { "crossentropy": 2.780952215194702, "epoch": 0.12449245939675174, "grad_norm": 0.044116560369729996, "grad_norm_var": 0.0001372196575703777, "learning_rate": 0.009722151876012582, "loss": 2.7283, "step": 3434 }, { "crossentropy": 2.7105941772460938, "epoch": 0.12452871229698376, "grad_norm": 0.05576789379119873, "grad_norm_var": 0.00014079394914858715, "learning_rate": 0.009721960833892485, "loss": 2.7879, "step": 3435 }, { "crossentropy": 2.7895028591156006, "epoch": 0.12456496519721577, "grad_norm": 0.06383731216192245, "grad_norm_var": 4.5736898127111436e-05, "learning_rate": 0.009721769727995141, "loss": 2.683, "step": 3436 }, { "crossentropy": 2.7718307971954346, "epoch": 0.1246012180974478, "grad_norm": 0.04776114970445633, "grad_norm_var": 4.561672788371962e-05, "learning_rate": 0.009721578558323132, "loss": 2.8311, "step": 3437 }, { "crossentropy": 2.833686113357544, "epoch": 0.12463747099767981, "grad_norm": 0.04340969771146774, "grad_norm_var": 3.946380962163616e-05, "learning_rate": 0.009721387324879038, "loss": 2.8109, "step": 3438 }, { "crossentropy": 2.686636447906494, "epoch": 0.12467372389791183, "grad_norm": 0.050202250480651855, "grad_norm_var": 3.8429682228370504e-05, "learning_rate": 0.009721196027665446, "loss": 2.6809, "step": 3439 }, { "crossentropy": 2.6695377826690674, "epoch": 0.12470997679814386, "grad_norm": 0.04236578941345215, "grad_norm_var": 3.8084232955403384e-05, "learning_rate": 0.009721004666684935, "loss": 2.7134, "step": 3440 }, { "crossentropy": 2.7086217403411865, "epoch": 0.12474622969837587, "grad_norm": 0.041009724140167236, "grad_norm_var": 3.995020384813229e-05, "learning_rate": 0.009720813241940091, "loss": 2.7626, "step": 3441 }, { "crossentropy": 2.8618862628936768, "epoch": 0.12478248259860789, "grad_norm": 0.04149078577756882, "grad_norm_var": 4.1144349325607175e-05, "learning_rate": 0.009720621753433502, "loss": 2.7559, "step": 3442 }, { "crossentropy": 2.6917316913604736, "epoch": 0.12481873549883991, "grad_norm": 0.040945157408714294, "grad_norm_var": 4.2097946476441635e-05, "learning_rate": 0.009720430201167751, "loss": 2.7458, "step": 3443 }, { "crossentropy": 2.909105062484741, "epoch": 0.12485498839907193, "grad_norm": 0.06106634810566902, "grad_norm_var": 5.574446865377891e-05, "learning_rate": 0.009720238585145426, "loss": 2.8532, "step": 3444 }, { "crossentropy": 2.6385185718536377, "epoch": 0.12489124129930394, "grad_norm": 0.05556904524564743, "grad_norm_var": 6.035553138013973e-05, "learning_rate": 0.009720046905369118, "loss": 2.7251, "step": 3445 }, { "crossentropy": 2.668325185775757, "epoch": 0.12492749419953596, "grad_norm": 0.04852326959371567, "grad_norm_var": 6.025244999308769e-05, "learning_rate": 0.009719855161841413, "loss": 2.6728, "step": 3446 }, { "crossentropy": 2.900653123855591, "epoch": 0.12496374709976799, "grad_norm": 0.04960032179951668, "grad_norm_var": 5.72866376669145e-05, "learning_rate": 0.0097196633545649, "loss": 2.8671, "step": 3447 }, { "crossentropy": 2.626075029373169, "epoch": 0.125, "grad_norm": 0.04169032350182533, "grad_norm_var": 5.766564372929137e-05, "learning_rate": 0.009719471483542172, "loss": 2.753, "step": 3448 }, { "crossentropy": 2.6281936168670654, "epoch": 0.12503625290023201, "grad_norm": 0.04461035132408142, "grad_norm_var": 5.374279584141606e-05, "learning_rate": 0.009719279548775819, "loss": 2.6904, "step": 3449 }, { "crossentropy": 2.7652013301849365, "epoch": 0.12507250580046403, "grad_norm": 0.046019311994314194, "grad_norm_var": 5.29209597176059e-05, "learning_rate": 0.009719087550268435, "loss": 2.7153, "step": 3450 }, { "crossentropy": 2.7485132217407227, "epoch": 0.12510875870069604, "grad_norm": 0.04715503752231598, "grad_norm_var": 4.9058011393195385e-05, "learning_rate": 0.009718895488022612, "loss": 2.7171, "step": 3451 }, { "crossentropy": 2.5847885608673096, "epoch": 0.12514501160092809, "grad_norm": 0.04450683295726776, "grad_norm_var": 3.115113746421742e-05, "learning_rate": 0.009718703362040945, "loss": 2.6871, "step": 3452 }, { "crossentropy": 2.738473653793335, "epoch": 0.1251812645011601, "grad_norm": 0.04150193929672241, "grad_norm_var": 3.2647666657653385e-05, "learning_rate": 0.009718511172326026, "loss": 2.7016, "step": 3453 }, { "crossentropy": 2.7310662269592285, "epoch": 0.12521751740139211, "grad_norm": 0.042397793382406235, "grad_norm_var": 3.3092063883771666e-05, "learning_rate": 0.009718318918880453, "loss": 2.716, "step": 3454 }, { "crossentropy": 2.6833877563476562, "epoch": 0.12525377030162413, "grad_norm": 0.06765905022621155, "grad_norm_var": 6.15332237483695e-05, "learning_rate": 0.009718126601706824, "loss": 2.6996, "step": 3455 }, { "crossentropy": 2.7721781730651855, "epoch": 0.12529002320185614, "grad_norm": 0.046279046684503555, "grad_norm_var": 5.9938277213642904e-05, "learning_rate": 0.009717934220807734, "loss": 2.6749, "step": 3456 }, { "crossentropy": 2.7835114002227783, "epoch": 0.12532627610208816, "grad_norm": 0.04515393078327179, "grad_norm_var": 5.742456713223321e-05, "learning_rate": 0.009717741776185784, "loss": 2.7328, "step": 3457 }, { "crossentropy": 2.8969674110412598, "epoch": 0.12536252900232017, "grad_norm": 0.04133511334657669, "grad_norm_var": 5.755621867975698e-05, "learning_rate": 0.009717549267843571, "loss": 2.8177, "step": 3458 }, { "crossentropy": 2.6781423091888428, "epoch": 0.12539878190255221, "grad_norm": 0.03849315270781517, "grad_norm_var": 6.015698629927979e-05, "learning_rate": 0.009717356695783696, "loss": 2.6532, "step": 3459 }, { "crossentropy": 2.646226406097412, "epoch": 0.12543503480278423, "grad_norm": 0.039721280336380005, "grad_norm_var": 5.030042224695511e-05, "learning_rate": 0.00971716406000876, "loss": 2.6239, "step": 3460 }, { "crossentropy": 2.698657512664795, "epoch": 0.12547128770301624, "grad_norm": 0.04444260522723198, "grad_norm_var": 4.423274585049711e-05, "learning_rate": 0.009716971360521368, "loss": 2.6612, "step": 3461 }, { "crossentropy": 2.8993287086486816, "epoch": 0.12550754060324826, "grad_norm": 0.04197156801819801, "grad_norm_var": 4.4334013809286396e-05, "learning_rate": 0.009716778597324114, "loss": 2.721, "step": 3462 }, { "crossentropy": 2.824519395828247, "epoch": 0.12554379350348027, "grad_norm": 0.043410953134298325, "grad_norm_var": 4.30627574742141e-05, "learning_rate": 0.00971658577041961, "loss": 2.6948, "step": 3463 }, { "crossentropy": 2.8361334800720215, "epoch": 0.1255800464037123, "grad_norm": 0.042175646871328354, "grad_norm_var": 4.287807903407155e-05, "learning_rate": 0.009716392879810458, "loss": 2.7816, "step": 3464 }, { "crossentropy": 2.7474541664123535, "epoch": 0.12561629930394433, "grad_norm": 0.0500224269926548, "grad_norm_var": 4.4570370822125056e-05, "learning_rate": 0.009716199925499262, "loss": 2.6745, "step": 3465 }, { "crossentropy": 2.7259695529937744, "epoch": 0.12565255220417634, "grad_norm": 0.04383651167154312, "grad_norm_var": 4.4612347891209485e-05, "learning_rate": 0.009716006907488629, "loss": 2.7244, "step": 3466 }, { "crossentropy": 2.757570743560791, "epoch": 0.12568880510440836, "grad_norm": 0.04844177886843681, "grad_norm_var": 4.508488511392918e-05, "learning_rate": 0.009715813825781166, "loss": 2.7295, "step": 3467 }, { "crossentropy": 2.768432378768921, "epoch": 0.12572505800464037, "grad_norm": 0.04662221670150757, "grad_norm_var": 4.520167324504187e-05, "learning_rate": 0.00971562068037948, "loss": 2.7946, "step": 3468 }, { "crossentropy": 2.618706464767456, "epoch": 0.1257613109048724, "grad_norm": 0.04433344677090645, "grad_norm_var": 4.43003647879967e-05, "learning_rate": 0.009715427471286182, "loss": 2.6992, "step": 3469 }, { "crossentropy": 2.7553303241729736, "epoch": 0.1257975638051044, "grad_norm": 0.10511355847120285, "grad_norm_var": 0.0002650788872022543, "learning_rate": 0.00971523419850388, "loss": 2.8775, "step": 3470 }, { "crossentropy": 2.8090784549713135, "epoch": 0.12583381670533642, "grad_norm": 0.04121226817369461, "grad_norm_var": 0.00024410181728283837, "learning_rate": 0.009715040862035182, "loss": 2.8358, "step": 3471 }, { "crossentropy": 2.7345364093780518, "epoch": 0.12587006960556846, "grad_norm": 0.0413779690861702, "grad_norm_var": 0.00024650574845936447, "learning_rate": 0.009714847461882706, "loss": 2.7706, "step": 3472 }, { "crossentropy": 2.767732858657837, "epoch": 0.12590632250580047, "grad_norm": 0.04539882019162178, "grad_norm_var": 0.00024643765927752397, "learning_rate": 0.009714653998049057, "loss": 2.7362, "step": 3473 }, { "crossentropy": 2.896852493286133, "epoch": 0.1259425754060325, "grad_norm": 0.04707108065485954, "grad_norm_var": 0.00024387904749559666, "learning_rate": 0.009714460470536852, "loss": 2.8302, "step": 3474 }, { "crossentropy": 2.7751104831695557, "epoch": 0.1259788283062645, "grad_norm": 0.04079257324337959, "grad_norm_var": 0.0002413782516322562, "learning_rate": 0.009714266879348702, "loss": 2.8205, "step": 3475 }, { "crossentropy": 2.806955337524414, "epoch": 0.12601508120649652, "grad_norm": 0.04153070226311684, "grad_norm_var": 0.00023961657500361355, "learning_rate": 0.009714073224487226, "loss": 2.8051, "step": 3476 }, { "crossentropy": 2.6108179092407227, "epoch": 0.12605133410672853, "grad_norm": 0.05331474542617798, "grad_norm_var": 0.0002403462056763855, "learning_rate": 0.009713879505955036, "loss": 2.7009, "step": 3477 }, { "crossentropy": 2.6810319423675537, "epoch": 0.12608758700696054, "grad_norm": 0.05898190289735794, "grad_norm_var": 0.0002435351237077586, "learning_rate": 0.00971368572375475, "loss": 2.7034, "step": 3478 }, { "crossentropy": 2.7813801765441895, "epoch": 0.1261238399071926, "grad_norm": 0.048671312630176544, "grad_norm_var": 0.00024092209908583458, "learning_rate": 0.009713491877888985, "loss": 2.7134, "step": 3479 }, { "crossentropy": 2.680354356765747, "epoch": 0.1261600928074246, "grad_norm": 0.04058504104614258, "grad_norm_var": 0.00024272499981853782, "learning_rate": 0.00971329796836036, "loss": 2.6446, "step": 3480 }, { "crossentropy": 2.837867021560669, "epoch": 0.12619634570765662, "grad_norm": 0.061009686440229416, "grad_norm_var": 0.0002505494779914592, "learning_rate": 0.009713103995171493, "loss": 2.7626, "step": 3481 }, { "crossentropy": 2.6281685829162598, "epoch": 0.12623259860788863, "grad_norm": 0.04137463867664337, "grad_norm_var": 0.00025312159109982195, "learning_rate": 0.009712909958325003, "loss": 2.6947, "step": 3482 }, { "crossentropy": 2.538285970687866, "epoch": 0.12626885150812064, "grad_norm": 0.037782516330480576, "grad_norm_var": 0.00026295544908465634, "learning_rate": 0.009712715857823514, "loss": 2.7063, "step": 3483 }, { "crossentropy": 2.7496581077575684, "epoch": 0.12630510440835266, "grad_norm": 0.04008220508694649, "grad_norm_var": 0.00026831101548517036, "learning_rate": 0.009712521693669643, "loss": 2.6884, "step": 3484 }, { "crossentropy": 2.7995543479919434, "epoch": 0.12634135730858467, "grad_norm": 0.03970150649547577, "grad_norm_var": 0.00027271278195372344, "learning_rate": 0.009712327465866016, "loss": 2.7596, "step": 3485 }, { "crossentropy": 2.6025493144989014, "epoch": 0.12637761020881672, "grad_norm": 0.03829675540328026, "grad_norm_var": 5.1832920609646656e-05, "learning_rate": 0.009712133174415256, "loss": 2.7701, "step": 3486 }, { "crossentropy": 2.6589508056640625, "epoch": 0.12641386310904873, "grad_norm": 0.03737560287117958, "grad_norm_var": 5.460051265099615e-05, "learning_rate": 0.009711938819319985, "loss": 2.6327, "step": 3487 }, { "crossentropy": 2.7134456634521484, "epoch": 0.12645011600928074, "grad_norm": 0.04300568625330925, "grad_norm_var": 5.4070261068858706e-05, "learning_rate": 0.009711744400582832, "loss": 2.7091, "step": 3488 }, { "crossentropy": 2.707982301712036, "epoch": 0.12648636890951276, "grad_norm": 0.042305994778871536, "grad_norm_var": 5.4374127130556636e-05, "learning_rate": 0.009711549918206419, "loss": 2.7242, "step": 3489 }, { "crossentropy": 2.6275529861450195, "epoch": 0.12652262180974477, "grad_norm": 0.03846503049135208, "grad_norm_var": 5.604442063259444e-05, "learning_rate": 0.009711355372193375, "loss": 2.6897, "step": 3490 }, { "crossentropy": 2.815488338470459, "epoch": 0.1265588747099768, "grad_norm": 0.040773656219244, "grad_norm_var": 5.6052418845949024e-05, "learning_rate": 0.009711160762546326, "loss": 2.7727, "step": 3491 }, { "crossentropy": 2.737724781036377, "epoch": 0.12659512761020883, "grad_norm": 0.03996105492115021, "grad_norm_var": 5.6713477160375005e-05, "learning_rate": 0.009710966089267902, "loss": 2.7067, "step": 3492 }, { "crossentropy": 2.820523500442505, "epoch": 0.12663138051044084, "grad_norm": 0.04048995301127434, "grad_norm_var": 5.081806460779329e-05, "learning_rate": 0.009710771352360733, "loss": 2.7278, "step": 3493 }, { "crossentropy": 2.6186656951904297, "epoch": 0.12666763341067286, "grad_norm": 0.04225146397948265, "grad_norm_var": 3.2781311496663895e-05, "learning_rate": 0.009710576551827445, "loss": 2.7033, "step": 3494 }, { "crossentropy": 2.5853123664855957, "epoch": 0.12670388631090487, "grad_norm": 0.04789237305521965, "grad_norm_var": 3.2127217431633136e-05, "learning_rate": 0.009710381687670674, "loss": 2.6611, "step": 3495 }, { "crossentropy": 2.8830618858337402, "epoch": 0.1267401392111369, "grad_norm": 0.039176128804683685, "grad_norm_var": 3.250949465643521e-05, "learning_rate": 0.009710186759893052, "loss": 2.8615, "step": 3496 }, { "crossentropy": 2.775557041168213, "epoch": 0.1267763921113689, "grad_norm": 0.047454364597797394, "grad_norm_var": 9.40379154881372e-06, "learning_rate": 0.009709991768497208, "loss": 2.7194, "step": 3497 }, { "crossentropy": 2.648617744445801, "epoch": 0.12681264501160092, "grad_norm": 0.04788614809513092, "grad_norm_var": 1.235793360240429e-05, "learning_rate": 0.009709796713485777, "loss": 2.6469, "step": 3498 }, { "crossentropy": 2.819324254989624, "epoch": 0.12684889791183296, "grad_norm": 0.04589267075061798, "grad_norm_var": 1.2523244167020736e-05, "learning_rate": 0.009709601594861395, "loss": 2.8421, "step": 3499 }, { "crossentropy": 2.7679264545440674, "epoch": 0.12688515081206497, "grad_norm": 0.04367978498339653, "grad_norm_var": 1.2441894964875371e-05, "learning_rate": 0.009709406412626695, "loss": 2.7729, "step": 3500 }, { "crossentropy": 2.701570510864258, "epoch": 0.126921403712297, "grad_norm": 0.044011812657117844, "grad_norm_var": 1.2188421140566879e-05, "learning_rate": 0.009709211166784316, "loss": 2.751, "step": 3501 }, { "crossentropy": 2.7161731719970703, "epoch": 0.126957656612529, "grad_norm": 0.04396459460258484, "grad_norm_var": 1.1070836587134756e-05, "learning_rate": 0.009709015857336892, "loss": 2.6174, "step": 3502 }, { "crossentropy": 2.610438346862793, "epoch": 0.12699390951276102, "grad_norm": 0.045189566910266876, "grad_norm_var": 9.249405226533641e-06, "learning_rate": 0.009708820484287064, "loss": 2.6429, "step": 3503 }, { "crossentropy": 2.7484285831451416, "epoch": 0.12703016241299303, "grad_norm": 0.043184924870729446, "grad_norm_var": 9.24497651747791e-06, "learning_rate": 0.009708625047637469, "loss": 2.7829, "step": 3504 }, { "crossentropy": 2.649519205093384, "epoch": 0.12706641531322505, "grad_norm": 0.04155050218105316, "grad_norm_var": 9.379389987744454e-06, "learning_rate": 0.009708429547390748, "loss": 2.755, "step": 3505 }, { "crossentropy": 2.8279221057891846, "epoch": 0.1271026682134571, "grad_norm": 0.04920274019241333, "grad_norm_var": 9.750671399025243e-06, "learning_rate": 0.009708233983549539, "loss": 2.8075, "step": 3506 }, { "crossentropy": 2.640167474746704, "epoch": 0.1271389211136891, "grad_norm": 0.043128613382577896, "grad_norm_var": 9.112457174255349e-06, "learning_rate": 0.009708038356116486, "loss": 2.7155, "step": 3507 }, { "crossentropy": 2.8080179691314697, "epoch": 0.12717517401392112, "grad_norm": 0.0402945801615715, "grad_norm_var": 8.937249753452187e-06, "learning_rate": 0.009707842665094231, "loss": 2.7951, "step": 3508 }, { "crossentropy": 2.728118419647217, "epoch": 0.12721142691415313, "grad_norm": 0.03765896335244179, "grad_norm_var": 1.0792571771934116e-05, "learning_rate": 0.009707646910485417, "loss": 2.6459, "step": 3509 }, { "crossentropy": 2.76187801361084, "epoch": 0.12724767981438515, "grad_norm": 0.04089181125164032, "grad_norm_var": 1.12071888563862e-05, "learning_rate": 0.009707451092292685, "loss": 2.7267, "step": 3510 }, { "crossentropy": 2.9226932525634766, "epoch": 0.12728393271461716, "grad_norm": 0.03771049156785011, "grad_norm_var": 1.215289237077935e-05, "learning_rate": 0.009707255210518684, "loss": 2.7279, "step": 3511 }, { "crossentropy": 2.700709342956543, "epoch": 0.1273201856148492, "grad_norm": 0.04098411649465561, "grad_norm_var": 1.1392034955385461e-05, "learning_rate": 0.009707059265166058, "loss": 2.7793, "step": 3512 }, { "crossentropy": 2.6986804008483887, "epoch": 0.12735643851508122, "grad_norm": 0.04362071678042412, "grad_norm_var": 1.0183420331409155e-05, "learning_rate": 0.009706863256237451, "loss": 2.7935, "step": 3513 }, { "crossentropy": 2.6498055458068848, "epoch": 0.12739269141531323, "grad_norm": 0.0482807494699955, "grad_norm_var": 1.0447427860385549e-05, "learning_rate": 0.009706667183735515, "loss": 2.7129, "step": 3514 }, { "crossentropy": 2.7655394077301025, "epoch": 0.12742894431554525, "grad_norm": 0.0379721000790596, "grad_norm_var": 1.1395796573670837e-05, "learning_rate": 0.009706471047662896, "loss": 2.7119, "step": 3515 }, { "crossentropy": 2.8260931968688965, "epoch": 0.12746519721577726, "grad_norm": 0.04673021659255028, "grad_norm_var": 1.2423505485320332e-05, "learning_rate": 0.009706274848022244, "loss": 2.7548, "step": 3516 }, { "crossentropy": 2.720421314239502, "epoch": 0.12750145011600927, "grad_norm": 0.04549221321940422, "grad_norm_var": 1.2804899924806258e-05, "learning_rate": 0.009706078584816204, "loss": 2.6361, "step": 3517 }, { "crossentropy": 2.8303256034851074, "epoch": 0.1275377030162413, "grad_norm": 0.04160193353891373, "grad_norm_var": 1.2807722230660327e-05, "learning_rate": 0.009705882258047437, "loss": 2.7957, "step": 3518 }, { "crossentropy": 2.8150272369384766, "epoch": 0.12757395591647333, "grad_norm": 0.0415186807513237, "grad_norm_var": 1.2440413841055542e-05, "learning_rate": 0.009705685867718583, "loss": 2.781, "step": 3519 }, { "crossentropy": 2.8256595134735107, "epoch": 0.12761020881670534, "grad_norm": 0.04528629779815674, "grad_norm_var": 1.291139702440652e-05, "learning_rate": 0.009705489413832303, "loss": 2.7538, "step": 3520 }, { "crossentropy": 2.758570432662964, "epoch": 0.12764646171693736, "grad_norm": 0.04623651131987572, "grad_norm_var": 1.3615406504557716e-05, "learning_rate": 0.009705292896391248, "loss": 2.7349, "step": 3521 }, { "crossentropy": 2.696209669113159, "epoch": 0.12768271461716937, "grad_norm": 0.04192844405770302, "grad_norm_var": 1.0822326168950732e-05, "learning_rate": 0.00970509631539807, "loss": 2.6682, "step": 3522 }, { "crossentropy": 2.615520477294922, "epoch": 0.1277189675174014, "grad_norm": 0.039644692093133926, "grad_norm_var": 1.1269662689335156e-05, "learning_rate": 0.009704899670855428, "loss": 2.6912, "step": 3523 }, { "crossentropy": 2.720273733139038, "epoch": 0.1277552204176334, "grad_norm": 0.03853153809905052, "grad_norm_var": 1.192143071116758e-05, "learning_rate": 0.009704702962765975, "loss": 2.7427, "step": 3524 }, { "crossentropy": 2.556011915206909, "epoch": 0.12779147331786542, "grad_norm": 0.035164885222911835, "grad_norm_var": 1.3797219573022321e-05, "learning_rate": 0.00970450619113237, "loss": 2.667, "step": 3525 }, { "crossentropy": 2.673146963119507, "epoch": 0.12782772621809746, "grad_norm": 0.05375201627612114, "grad_norm_var": 2.22769300540715e-05, "learning_rate": 0.009704309355957268, "loss": 2.675, "step": 3526 }, { "crossentropy": 2.6742427349090576, "epoch": 0.12786397911832947, "grad_norm": 0.038793765008449554, "grad_norm_var": 2.1618271027997773e-05, "learning_rate": 0.00970411245724333, "loss": 2.8138, "step": 3527 }, { "crossentropy": 2.7796084880828857, "epoch": 0.1279002320185615, "grad_norm": 0.04151173308491707, "grad_norm_var": 2.1504675666532875e-05, "learning_rate": 0.009703915494993214, "loss": 2.6662, "step": 3528 }, { "crossentropy": 2.7516987323760986, "epoch": 0.1279364849187935, "grad_norm": 0.04768772050738335, "grad_norm_var": 2.2940582273150063e-05, "learning_rate": 0.009703718469209581, "loss": 2.8382, "step": 3529 }, { "crossentropy": 2.7575843334198, "epoch": 0.12797273781902552, "grad_norm": 0.042163584381341934, "grad_norm_var": 2.1080976512400745e-05, "learning_rate": 0.009703521379895092, "loss": 2.6559, "step": 3530 }, { "crossentropy": 2.7664623260498047, "epoch": 0.12800899071925753, "grad_norm": 0.038901135325431824, "grad_norm_var": 2.054294923533942e-05, "learning_rate": 0.009703324227052411, "loss": 2.7574, "step": 3531 }, { "crossentropy": 2.704981803894043, "epoch": 0.12804524361948955, "grad_norm": 0.03729882463812828, "grad_norm_var": 2.1171499750725216e-05, "learning_rate": 0.009703127010684195, "loss": 2.7235, "step": 3532 }, { "crossentropy": 2.7301347255706787, "epoch": 0.1280814965197216, "grad_norm": 0.03648107126355171, "grad_norm_var": 2.2314572755804952e-05, "learning_rate": 0.009702929730793113, "loss": 2.7248, "step": 3533 }, { "crossentropy": 2.6601340770721436, "epoch": 0.1281177494199536, "grad_norm": 0.03914603963494301, "grad_norm_var": 2.270938023417688e-05, "learning_rate": 0.009702732387381828, "loss": 2.7752, "step": 3534 }, { "crossentropy": 2.672349452972412, "epoch": 0.12815400232018562, "grad_norm": 0.040793146938085556, "grad_norm_var": 2.2740756857138796e-05, "learning_rate": 0.009702534980453006, "loss": 2.6772, "step": 3535 }, { "crossentropy": 2.610250949859619, "epoch": 0.12819025522041763, "grad_norm": 0.0438348650932312, "grad_norm_var": 2.2131474268934545e-05, "learning_rate": 0.009702337510009313, "loss": 2.6822, "step": 3536 }, { "crossentropy": 2.607504367828369, "epoch": 0.12822650812064965, "grad_norm": 0.04490479454398155, "grad_norm_var": 2.137765229958862e-05, "learning_rate": 0.009702139976053415, "loss": 2.7208, "step": 3537 }, { "crossentropy": 2.936530828475952, "epoch": 0.12826276102088166, "grad_norm": 0.04222261160612106, "grad_norm_var": 2.1408351392961043e-05, "learning_rate": 0.009701942378587982, "loss": 2.8728, "step": 3538 }, { "crossentropy": 2.8272714614868164, "epoch": 0.1282990139211137, "grad_norm": 0.0399821475148201, "grad_norm_var": 2.1340898456373206e-05, "learning_rate": 0.009701744717615679, "loss": 2.8524, "step": 3539 }, { "crossentropy": 2.6776299476623535, "epoch": 0.12833526682134572, "grad_norm": 0.04163151606917381, "grad_norm_var": 2.0787670322776574e-05, "learning_rate": 0.00970154699313918, "loss": 2.7493, "step": 3540 }, { "crossentropy": 2.6922221183776855, "epoch": 0.12837151972157773, "grad_norm": 0.04290522262454033, "grad_norm_var": 1.7976691776683668e-05, "learning_rate": 0.009701349205161157, "loss": 2.7859, "step": 3541 }, { "crossentropy": 2.862983465194702, "epoch": 0.12840777262180975, "grad_norm": 0.05105297639966011, "grad_norm_var": 1.4203000679774829e-05, "learning_rate": 0.009701151353684274, "loss": 2.7835, "step": 3542 }, { "crossentropy": 2.8457484245300293, "epoch": 0.12844402552204176, "grad_norm": 0.051566168665885925, "grad_norm_var": 1.9224908944615444e-05, "learning_rate": 0.00970095343871121, "loss": 2.7866, "step": 3543 }, { "crossentropy": 2.8121824264526367, "epoch": 0.12848027842227377, "grad_norm": 0.05289426073431969, "grad_norm_var": 2.5625031861591964e-05, "learning_rate": 0.009700755460244634, "loss": 2.8101, "step": 3544 }, { "crossentropy": 2.620388984680176, "epoch": 0.1285165313225058, "grad_norm": 0.038740042597055435, "grad_norm_var": 2.5443851983172373e-05, "learning_rate": 0.009700557418287223, "loss": 2.6504, "step": 3545 }, { "crossentropy": 2.631330728530884, "epoch": 0.12855278422273783, "grad_norm": 0.038733214139938354, "grad_norm_var": 2.6462352770053e-05, "learning_rate": 0.009700359312841652, "loss": 2.6605, "step": 3546 }, { "crossentropy": 2.767338514328003, "epoch": 0.12858903712296985, "grad_norm": 0.03861600533127785, "grad_norm_var": 2.6606838468203457e-05, "learning_rate": 0.009700161143910596, "loss": 2.7315, "step": 3547 }, { "crossentropy": 2.823350191116333, "epoch": 0.12862529002320186, "grad_norm": 0.042516350746154785, "grad_norm_var": 2.4655037267290457e-05, "learning_rate": 0.009699962911496728, "loss": 2.7214, "step": 3548 }, { "crossentropy": 2.6918013095855713, "epoch": 0.12866154292343387, "grad_norm": 0.049007900059223175, "grad_norm_var": 2.378107406527241e-05, "learning_rate": 0.00969976461560273, "loss": 2.6799, "step": 3549 }, { "crossentropy": 2.770895481109619, "epoch": 0.1286977958236659, "grad_norm": 0.05018448457121849, "grad_norm_var": 2.4754086170595118e-05, "learning_rate": 0.009699566256231277, "loss": 2.7618, "step": 3550 }, { "crossentropy": 2.698488473892212, "epoch": 0.1287340487238979, "grad_norm": 0.04461776465177536, "grad_norm_var": 2.3854959221787885e-05, "learning_rate": 0.009699367833385054, "loss": 2.6635, "step": 3551 }, { "crossentropy": 2.5613224506378174, "epoch": 0.12877030162412992, "grad_norm": 0.03691413626074791, "grad_norm_var": 2.7543589466168653e-05, "learning_rate": 0.009699169347066732, "loss": 2.5909, "step": 3552 }, { "crossentropy": 2.6515257358551025, "epoch": 0.12880655452436196, "grad_norm": 0.03763919696211815, "grad_norm_var": 3.011711654345341e-05, "learning_rate": 0.009698970797279, "loss": 2.6768, "step": 3553 }, { "crossentropy": 2.7477245330810547, "epoch": 0.12884280742459397, "grad_norm": 0.03898308053612709, "grad_norm_var": 3.1411813931903994e-05, "learning_rate": 0.009698772184024537, "loss": 2.7417, "step": 3554 }, { "crossentropy": 2.8861007690429688, "epoch": 0.128879060324826, "grad_norm": 0.04374954476952553, "grad_norm_var": 3.053229531994646e-05, "learning_rate": 0.00969857350730602, "loss": 2.8297, "step": 3555 }, { "crossentropy": 2.6684346199035645, "epoch": 0.128915313225058, "grad_norm": 0.040044091641902924, "grad_norm_var": 3.1134898713443505e-05, "learning_rate": 0.009698374767126143, "loss": 2.6266, "step": 3556 }, { "crossentropy": 2.7118923664093018, "epoch": 0.12895156612529002, "grad_norm": 0.04028040170669556, "grad_norm_var": 3.18210058061781e-05, "learning_rate": 0.009698175963487581, "loss": 2.7522, "step": 3557 }, { "crossentropy": 2.685441493988037, "epoch": 0.12898781902552203, "grad_norm": 0.0508393719792366, "grad_norm_var": 3.160792477093314e-05, "learning_rate": 0.009697977096393025, "loss": 2.7003, "step": 3558 }, { "crossentropy": 2.673011064529419, "epoch": 0.12902407192575405, "grad_norm": 0.04846101999282837, "grad_norm_var": 2.885355262684334e-05, "learning_rate": 0.00969777816584516, "loss": 2.7453, "step": 3559 }, { "crossentropy": 2.6413471698760986, "epoch": 0.1290603248259861, "grad_norm": 0.04363434389233589, "grad_norm_var": 2.232238444400472e-05, "learning_rate": 0.009697579171846668, "loss": 2.6723, "step": 3560 }, { "crossentropy": 2.835669994354248, "epoch": 0.1290965777262181, "grad_norm": 0.03980895131826401, "grad_norm_var": 2.1831546432770666e-05, "learning_rate": 0.009697380114400243, "loss": 2.762, "step": 3561 }, { "crossentropy": 2.7147014141082764, "epoch": 0.12913283062645012, "grad_norm": 0.03699547052383423, "grad_norm_var": 2.2951399239895935e-05, "learning_rate": 0.00969718099350857, "loss": 2.7664, "step": 3562 }, { "crossentropy": 2.7126376628875732, "epoch": 0.12916908352668213, "grad_norm": 0.061611663550138474, "grad_norm_var": 4.365351017525613e-05, "learning_rate": 0.00969698180917434, "loss": 2.6984, "step": 3563 }, { "crossentropy": 3.0270564556121826, "epoch": 0.12920533642691415, "grad_norm": 0.044021736830472946, "grad_norm_var": 4.3481196588768253e-05, "learning_rate": 0.009696782561400243, "loss": 2.8889, "step": 3564 }, { "crossentropy": 2.593135118484497, "epoch": 0.12924158932714616, "grad_norm": 0.04796865954995155, "grad_norm_var": 4.287896591194934e-05, "learning_rate": 0.00969658325018897, "loss": 2.6502, "step": 3565 }, { "crossentropy": 2.755171537399292, "epoch": 0.1292778422273782, "grad_norm": 0.04980998486280441, "grad_norm_var": 4.258439353357323e-05, "learning_rate": 0.009696383875543214, "loss": 2.7622, "step": 3566 }, { "crossentropy": 2.7480995655059814, "epoch": 0.12931409512761022, "grad_norm": 0.04985422268509865, "grad_norm_var": 4.466930023717391e-05, "learning_rate": 0.009696184437465664, "loss": 2.7175, "step": 3567 }, { "crossentropy": 2.743391275405884, "epoch": 0.12935034802784223, "grad_norm": 0.04432925209403038, "grad_norm_var": 4.0691317486480676e-05, "learning_rate": 0.009695984935959016, "loss": 2.7708, "step": 3568 }, { "crossentropy": 2.6467912197113037, "epoch": 0.12938660092807425, "grad_norm": 0.04311037063598633, "grad_norm_var": 3.7282318580785186e-05, "learning_rate": 0.009695785371025969, "loss": 2.5768, "step": 3569 }, { "crossentropy": 2.9059667587280273, "epoch": 0.12942285382830626, "grad_norm": 0.04359474405646324, "grad_norm_var": 3.477720909274377e-05, "learning_rate": 0.00969558574266921, "loss": 2.7577, "step": 3570 }, { "crossentropy": 2.7494711875915527, "epoch": 0.12945910672853828, "grad_norm": 0.04262855276465416, "grad_norm_var": 3.511844423253266e-05, "learning_rate": 0.009695386050891444, "loss": 2.7483, "step": 3571 }, { "crossentropy": 2.7043373584747314, "epoch": 0.1294953596287703, "grad_norm": 0.0376279279589653, "grad_norm_var": 3.72206797837354e-05, "learning_rate": 0.009695186295695362, "loss": 2.6939, "step": 3572 }, { "crossentropy": 2.84897518157959, "epoch": 0.12953161252900233, "grad_norm": 0.03880952298641205, "grad_norm_var": 3.8337589548128004e-05, "learning_rate": 0.009694986477083663, "loss": 2.8687, "step": 3573 }, { "crossentropy": 2.800436496734619, "epoch": 0.12956786542923435, "grad_norm": 0.042383067309856415, "grad_norm_var": 3.644183607259058e-05, "learning_rate": 0.009694786595059048, "loss": 2.8061, "step": 3574 }, { "crossentropy": 2.7073450088500977, "epoch": 0.12960411832946636, "grad_norm": 0.04802464321255684, "grad_norm_var": 3.623290613215668e-05, "learning_rate": 0.009694586649624214, "loss": 2.724, "step": 3575 }, { "crossentropy": 2.695690870285034, "epoch": 0.12964037122969838, "grad_norm": 0.044054064899683, "grad_norm_var": 3.618773119498092e-05, "learning_rate": 0.009694386640781864, "loss": 2.6472, "step": 3576 }, { "crossentropy": 2.4853601455688477, "epoch": 0.1296766241299304, "grad_norm": 0.04340081289410591, "grad_norm_var": 3.46686535445948e-05, "learning_rate": 0.009694186568534699, "loss": 2.5829, "step": 3577 }, { "crossentropy": 2.7575485706329346, "epoch": 0.1297128770301624, "grad_norm": 0.045396365225315094, "grad_norm_var": 3.0237849588626852e-05, "learning_rate": 0.00969398643288542, "loss": 2.7332, "step": 3578 }, { "crossentropy": 2.718672037124634, "epoch": 0.12974912993039442, "grad_norm": 0.042921341955661774, "grad_norm_var": 1.170583294231673e-05, "learning_rate": 0.00969378623383673, "loss": 2.7331, "step": 3579 }, { "crossentropy": 2.680652141571045, "epoch": 0.12978538283062646, "grad_norm": 0.039441946893930435, "grad_norm_var": 1.315365358130523e-05, "learning_rate": 0.009693585971391335, "loss": 2.7258, "step": 3580 }, { "crossentropy": 2.833749294281006, "epoch": 0.12982163573085848, "grad_norm": 0.03906688094139099, "grad_norm_var": 1.3348028868871245e-05, "learning_rate": 0.009693385645551937, "loss": 2.764, "step": 3581 }, { "crossentropy": 2.7194483280181885, "epoch": 0.1298578886310905, "grad_norm": 0.03800058737397194, "grad_norm_var": 1.1976605973028758e-05, "learning_rate": 0.009693185256321247, "loss": 2.6755, "step": 3582 }, { "crossentropy": 2.7937629222869873, "epoch": 0.1298941415313225, "grad_norm": 0.03780941292643547, "grad_norm_var": 9.498668827663942e-06, "learning_rate": 0.009692984803701966, "loss": 2.826, "step": 3583 }, { "crossentropy": 2.650789976119995, "epoch": 0.12993039443155452, "grad_norm": 0.03814000263810158, "grad_norm_var": 9.89843407699035e-06, "learning_rate": 0.009692784287696802, "loss": 2.7361, "step": 3584 }, { "crossentropy": 2.5038211345672607, "epoch": 0.12996664733178653, "grad_norm": 0.04479774832725525, "grad_norm_var": 1.0432925277913986e-05, "learning_rate": 0.009692583708308465, "loss": 2.6279, "step": 3585 }, { "crossentropy": 2.706958293914795, "epoch": 0.13000290023201855, "grad_norm": 0.03900093957781792, "grad_norm_var": 1.054911986018697e-05, "learning_rate": 0.009692383065539665, "loss": 2.8152, "step": 3586 }, { "crossentropy": 2.5627148151397705, "epoch": 0.1300391531322506, "grad_norm": 0.039877381175756454, "grad_norm_var": 1.0550971411697232e-05, "learning_rate": 0.00969218235939311, "loss": 2.6691, "step": 3587 }, { "crossentropy": 2.689260482788086, "epoch": 0.1300754060324826, "grad_norm": 0.041841957718133926, "grad_norm_var": 9.669516634822767e-06, "learning_rate": 0.009691981589871513, "loss": 2.7351, "step": 3588 }, { "crossentropy": 2.9220030307769775, "epoch": 0.13011165893271462, "grad_norm": 0.04942826181650162, "grad_norm_var": 1.2999042808790421e-05, "learning_rate": 0.009691780756977582, "loss": 2.7947, "step": 3589 }, { "crossentropy": 2.7916600704193115, "epoch": 0.13014791183294663, "grad_norm": 0.047205086797475815, "grad_norm_var": 1.4634865030415239e-05, "learning_rate": 0.009691579860714032, "loss": 2.7651, "step": 3590 }, { "crossentropy": 2.7518351078033447, "epoch": 0.13018416473317865, "grad_norm": 0.03927817568182945, "grad_norm_var": 1.2857265728900778e-05, "learning_rate": 0.009691378901083578, "loss": 2.7319, "step": 3591 }, { "crossentropy": 2.6955785751342773, "epoch": 0.13022041763341066, "grad_norm": 0.04171024635434151, "grad_norm_var": 1.2513008933293043e-05, "learning_rate": 0.009691177878088932, "loss": 2.6692, "step": 3592 }, { "crossentropy": 2.7492856979370117, "epoch": 0.1302566705336427, "grad_norm": 0.03676228225231171, "grad_norm_var": 1.3768417137173759e-05, "learning_rate": 0.00969097679173281, "loss": 2.7304, "step": 3593 }, { "crossentropy": 2.6168181896209717, "epoch": 0.13029292343387472, "grad_norm": 0.048270341008901596, "grad_norm_var": 1.5857271697872616e-05, "learning_rate": 0.009690775642017925, "loss": 2.7013, "step": 3594 }, { "crossentropy": 2.7194204330444336, "epoch": 0.13032917633410673, "grad_norm": 0.038540273904800415, "grad_norm_var": 1.6210281094620507e-05, "learning_rate": 0.009690574428947002, "loss": 2.6917, "step": 3595 }, { "crossentropy": 2.807724714279175, "epoch": 0.13036542923433875, "grad_norm": 0.041716959327459335, "grad_norm_var": 1.6001022003684673e-05, "learning_rate": 0.009690373152522748, "loss": 2.7733, "step": 3596 }, { "crossentropy": 2.672431468963623, "epoch": 0.13040168213457076, "grad_norm": 0.05244191363453865, "grad_norm_var": 2.312727313195373e-05, "learning_rate": 0.009690171812747887, "loss": 2.7779, "step": 3597 }, { "crossentropy": 2.715920925140381, "epoch": 0.13043793503480278, "grad_norm": 0.04775401949882507, "grad_norm_var": 2.364246310445576e-05, "learning_rate": 0.00968997040962514, "loss": 2.7029, "step": 3598 }, { "crossentropy": 2.693370819091797, "epoch": 0.1304741879350348, "grad_norm": 0.040470633655786514, "grad_norm_var": 2.231927665083764e-05, "learning_rate": 0.009689768943157224, "loss": 2.67, "step": 3599 }, { "crossentropy": 2.635615587234497, "epoch": 0.13051044083526683, "grad_norm": 0.045767322182655334, "grad_norm_var": 2.1061322887014344e-05, "learning_rate": 0.009689567413346863, "loss": 2.7086, "step": 3600 }, { "crossentropy": 2.6221070289611816, "epoch": 0.13054669373549885, "grad_norm": 0.043942779302597046, "grad_norm_var": 2.0950973726153545e-05, "learning_rate": 0.009689365820196777, "loss": 2.7226, "step": 3601 }, { "crossentropy": 2.584912061691284, "epoch": 0.13058294663573086, "grad_norm": 0.0420035645365715, "grad_norm_var": 1.9763088915577114e-05, "learning_rate": 0.00968916416370969, "loss": 2.7041, "step": 3602 }, { "crossentropy": 2.947011947631836, "epoch": 0.13061919953596288, "grad_norm": 0.05015341192483902, "grad_norm_var": 2.1312811168311176e-05, "learning_rate": 0.009688962443888323, "loss": 2.8422, "step": 3603 }, { "crossentropy": 2.725321054458618, "epoch": 0.1306554524361949, "grad_norm": 0.05592288449406624, "grad_norm_var": 2.9267483973765875e-05, "learning_rate": 0.009688760660735403, "loss": 2.6381, "step": 3604 }, { "crossentropy": 2.844222068786621, "epoch": 0.1306917053364269, "grad_norm": 0.04324338212609291, "grad_norm_var": 2.80770268345234e-05, "learning_rate": 0.009688558814253654, "loss": 2.793, "step": 3605 }, { "crossentropy": 2.759838342666626, "epoch": 0.13072795823665892, "grad_norm": 0.04197230562567711, "grad_norm_var": 2.8039863043196645e-05, "learning_rate": 0.009688356904445805, "loss": 2.7073, "step": 3606 }, { "crossentropy": 2.7219398021698, "epoch": 0.13076421113689096, "grad_norm": 0.03929705545306206, "grad_norm_var": 2.8027062859383295e-05, "learning_rate": 0.009688154931314578, "loss": 2.7238, "step": 3607 }, { "crossentropy": 2.75382924079895, "epoch": 0.13080046403712298, "grad_norm": 0.03907852992415428, "grad_norm_var": 2.9394312067063515e-05, "learning_rate": 0.009687952894862708, "loss": 2.7446, "step": 3608 }, { "crossentropy": 2.8213608264923096, "epoch": 0.130836716937355, "grad_norm": 0.04568915069103241, "grad_norm_var": 2.5511895761773777e-05, "learning_rate": 0.009687750795092917, "loss": 2.8285, "step": 3609 }, { "crossentropy": 2.9377732276916504, "epoch": 0.130872969837587, "grad_norm": 0.04314803704619408, "grad_norm_var": 2.4758761312935046e-05, "learning_rate": 0.009687548632007941, "loss": 2.8599, "step": 3610 }, { "crossentropy": 2.9496963024139404, "epoch": 0.13090922273781902, "grad_norm": 0.04618334025144577, "grad_norm_var": 2.239101303461068e-05, "learning_rate": 0.009687346405610505, "loss": 2.7788, "step": 3611 }, { "crossentropy": 2.7284157276153564, "epoch": 0.13094547563805103, "grad_norm": 0.04469646140933037, "grad_norm_var": 2.1671769305960965e-05, "learning_rate": 0.00968714411590334, "loss": 2.8889, "step": 3612 }, { "crossentropy": 2.6801955699920654, "epoch": 0.13098172853828308, "grad_norm": 0.04277965426445007, "grad_norm_var": 1.8061395145570665e-05, "learning_rate": 0.009686941762889185, "loss": 2.7399, "step": 3613 }, { "crossentropy": 2.848005771636963, "epoch": 0.1310179814385151, "grad_norm": 0.0436069592833519, "grad_norm_var": 1.7340538348483102e-05, "learning_rate": 0.009686739346570767, "loss": 2.8178, "step": 3614 }, { "crossentropy": 2.7212698459625244, "epoch": 0.1310542343387471, "grad_norm": 0.04366449639201164, "grad_norm_var": 1.636983395076156e-05, "learning_rate": 0.009686536866950821, "loss": 2.7317, "step": 3615 }, { "crossentropy": 2.7361583709716797, "epoch": 0.13109048723897912, "grad_norm": 0.04977227374911308, "grad_norm_var": 1.8077443731905467e-05, "learning_rate": 0.009686334324032084, "loss": 2.7426, "step": 3616 }, { "crossentropy": 2.7849180698394775, "epoch": 0.13112674013921113, "grad_norm": 0.05136401951313019, "grad_norm_var": 2.0773177323743914e-05, "learning_rate": 0.009686131717817289, "loss": 2.7451, "step": 3617 }, { "crossentropy": 2.755542278289795, "epoch": 0.13116299303944315, "grad_norm": 0.0406317301094532, "grad_norm_var": 2.1468323008138935e-05, "learning_rate": 0.009685929048309174, "loss": 2.7122, "step": 3618 }, { "crossentropy": 2.7738308906555176, "epoch": 0.13119924593967516, "grad_norm": 0.04023288935422897, "grad_norm_var": 2.0902276239024437e-05, "learning_rate": 0.009685726315510478, "loss": 2.7778, "step": 3619 }, { "crossentropy": 2.6915242671966553, "epoch": 0.1312354988399072, "grad_norm": 0.04427294805645943, "grad_norm_var": 1.1571797591330748e-05, "learning_rate": 0.009685523519423934, "loss": 2.7084, "step": 3620 }, { "crossentropy": 2.64591383934021, "epoch": 0.13127175174013922, "grad_norm": 0.042543765157461166, "grad_norm_var": 1.1647509244728608e-05, "learning_rate": 0.009685320660052286, "loss": 2.7104, "step": 3621 }, { "crossentropy": 2.6842949390411377, "epoch": 0.13130800464037123, "grad_norm": 0.039498452097177505, "grad_norm_var": 1.2594389623767308e-05, "learning_rate": 0.009685117737398273, "loss": 2.6776, "step": 3622 }, { "crossentropy": 2.717644691467285, "epoch": 0.13134425754060325, "grad_norm": 0.049937743693590164, "grad_norm_var": 1.3667173730204837e-05, "learning_rate": 0.009684914751464633, "loss": 2.7081, "step": 3623 }, { "crossentropy": 2.627084493637085, "epoch": 0.13138051044083526, "grad_norm": 0.03994035720825195, "grad_norm_var": 1.3125800636212768e-05, "learning_rate": 0.009684711702254111, "loss": 2.7526, "step": 3624 }, { "crossentropy": 2.816007614135742, "epoch": 0.13141676334106728, "grad_norm": 0.03911958262324333, "grad_norm_var": 1.4560573921697702e-05, "learning_rate": 0.009684508589769448, "loss": 2.7048, "step": 3625 }, { "crossentropy": 2.6341588497161865, "epoch": 0.1314530162412993, "grad_norm": 0.04397403821349144, "grad_norm_var": 1.4527333509734008e-05, "learning_rate": 0.009684305414013387, "loss": 2.8109, "step": 3626 }, { "crossentropy": 2.6263389587402344, "epoch": 0.13148926914153133, "grad_norm": 0.03942607343196869, "grad_norm_var": 1.5313697399908094e-05, "learning_rate": 0.009684102174988675, "loss": 2.6695, "step": 3627 }, { "crossentropy": 2.7393407821655273, "epoch": 0.13152552204176335, "grad_norm": 0.04229603335261345, "grad_norm_var": 1.5280116860309084e-05, "learning_rate": 0.00968389887269805, "loss": 2.7677, "step": 3628 }, { "crossentropy": 2.6010074615478516, "epoch": 0.13156177494199536, "grad_norm": 0.04139311611652374, "grad_norm_var": 1.549948550198498e-05, "learning_rate": 0.009683695507144267, "loss": 2.6778, "step": 3629 }, { "crossentropy": 2.8067541122436523, "epoch": 0.13159802784222738, "grad_norm": 0.041370660066604614, "grad_norm_var": 1.5699548071506273e-05, "learning_rate": 0.009683492078330067, "loss": 2.847, "step": 3630 }, { "crossentropy": 2.674889326095581, "epoch": 0.1316342807424594, "grad_norm": 0.03839809447526932, "grad_norm_var": 1.7029501128247523e-05, "learning_rate": 0.009683288586258198, "loss": 2.7066, "step": 3631 }, { "crossentropy": 2.869736671447754, "epoch": 0.1316705336426914, "grad_norm": 0.04179136082530022, "grad_norm_var": 1.3549307271191965e-05, "learning_rate": 0.009683085030931411, "loss": 2.8094, "step": 3632 }, { "crossentropy": 2.720717191696167, "epoch": 0.13170678654292342, "grad_norm": 0.045786548405885696, "grad_norm_var": 8.724682534900588e-06, "learning_rate": 0.009682881412352453, "loss": 2.7417, "step": 3633 }, { "crossentropy": 2.6207776069641113, "epoch": 0.13174303944315546, "grad_norm": 0.04345704987645149, "grad_norm_var": 8.740791256035366e-06, "learning_rate": 0.009682677730524076, "loss": 2.6408, "step": 3634 }, { "crossentropy": 2.7857067584991455, "epoch": 0.13177929234338748, "grad_norm": 0.03927932679653168, "grad_norm_var": 9.033727282655122e-06, "learning_rate": 0.009682473985449028, "loss": 2.7827, "step": 3635 }, { "crossentropy": 2.7160372734069824, "epoch": 0.1318155452436195, "grad_norm": 0.040512267500162125, "grad_norm_var": 8.79314042874386e-06, "learning_rate": 0.009682270177130066, "loss": 2.7802, "step": 3636 }, { "crossentropy": 2.683316230773926, "epoch": 0.1318517981438515, "grad_norm": 0.04424852132797241, "grad_norm_var": 9.144908965224147e-06, "learning_rate": 0.009682066305569937, "loss": 2.6836, "step": 3637 }, { "crossentropy": 2.736582040786743, "epoch": 0.13188805104408352, "grad_norm": 0.03788226470351219, "grad_norm_var": 9.826069969637713e-06, "learning_rate": 0.009681862370771397, "loss": 2.6684, "step": 3638 }, { "crossentropy": 2.7231392860412598, "epoch": 0.13192430394431554, "grad_norm": 0.037553876638412476, "grad_norm_var": 5.97552760562096e-06, "learning_rate": 0.009681658372737203, "loss": 2.7485, "step": 3639 }, { "crossentropy": 2.7127344608306885, "epoch": 0.13196055684454758, "grad_norm": 0.04130874201655388, "grad_norm_var": 5.894330271919284e-06, "learning_rate": 0.009681454311470106, "loss": 2.7474, "step": 3640 }, { "crossentropy": 2.7316222190856934, "epoch": 0.1319968097447796, "grad_norm": 0.03757881000638008, "grad_norm_var": 6.452090331899157e-06, "learning_rate": 0.009681250186972865, "loss": 2.7062, "step": 3641 }, { "crossentropy": 2.7154700756073, "epoch": 0.1320330626450116, "grad_norm": 0.039146170020103455, "grad_norm_var": 6.004755505756067e-06, "learning_rate": 0.009681045999248237, "loss": 2.7347, "step": 3642 }, { "crossentropy": 2.615983247756958, "epoch": 0.13206931554524362, "grad_norm": 0.03752356022596359, "grad_norm_var": 6.557762026803323e-06, "learning_rate": 0.009680841748298979, "loss": 2.6283, "step": 3643 }, { "crossentropy": 2.7834620475769043, "epoch": 0.13210556844547564, "grad_norm": 0.03703823685646057, "grad_norm_var": 7.093327418624245e-06, "learning_rate": 0.009680637434127849, "loss": 2.7618, "step": 3644 }, { "crossentropy": 2.6539254188537598, "epoch": 0.13214182134570765, "grad_norm": 0.03922588750720024, "grad_norm_var": 7.06141430124226e-06, "learning_rate": 0.009680433056737606, "loss": 2.7025, "step": 3645 }, { "crossentropy": 2.828761577606201, "epoch": 0.13217807424593966, "grad_norm": 0.0395069383084774, "grad_norm_var": 6.970538131345569e-06, "learning_rate": 0.009680228616131013, "loss": 2.7867, "step": 3646 }, { "crossentropy": 2.663602113723755, "epoch": 0.1322143271461717, "grad_norm": 0.038054030388593674, "grad_norm_var": 7.052106048164678e-06, "learning_rate": 0.00968002411231083, "loss": 2.7264, "step": 3647 }, { "crossentropy": 2.504439353942871, "epoch": 0.13225058004640372, "grad_norm": 0.03824036940932274, "grad_norm_var": 6.988905911869033e-06, "learning_rate": 0.00967981954527982, "loss": 2.5621, "step": 3648 }, { "crossentropy": 2.5256481170654297, "epoch": 0.13228683294663574, "grad_norm": 0.03589963912963867, "grad_norm_var": 5.16886145257017e-06, "learning_rate": 0.009679614915040744, "loss": 2.6424, "step": 3649 }, { "crossentropy": 2.5845227241516113, "epoch": 0.13232308584686775, "grad_norm": 0.03986874595284462, "grad_norm_var": 3.914604865297375e-06, "learning_rate": 0.009679410221596367, "loss": 2.6414, "step": 3650 }, { "crossentropy": 2.5468051433563232, "epoch": 0.13235933874709976, "grad_norm": 0.04311696067452431, "grad_norm_var": 5.014217963900114e-06, "learning_rate": 0.009679205464949454, "loss": 2.6715, "step": 3651 }, { "crossentropy": 2.5734455585479736, "epoch": 0.13239559164733178, "grad_norm": 0.042264118790626526, "grad_norm_var": 5.519775151246499e-06, "learning_rate": 0.009679000645102772, "loss": 2.6695, "step": 3652 }, { "crossentropy": 2.583651542663574, "epoch": 0.1324318445475638, "grad_norm": 0.04125697538256645, "grad_norm_var": 4.09672479822834e-06, "learning_rate": 0.009678795762059085, "loss": 2.6956, "step": 3653 }, { "crossentropy": 2.7481436729431152, "epoch": 0.13246809744779584, "grad_norm": 0.04093435779213905, "grad_norm_var": 4.186802427376864e-06, "learning_rate": 0.009678590815821158, "loss": 2.7503, "step": 3654 }, { "crossentropy": 2.706387519836426, "epoch": 0.13250435034802785, "grad_norm": 0.040743615478277206, "grad_norm_var": 4.087592203888488e-06, "learning_rate": 0.009678385806391765, "loss": 2.7716, "step": 3655 }, { "crossentropy": 2.753178119659424, "epoch": 0.13254060324825986, "grad_norm": 0.04165208712220192, "grad_norm_var": 4.178600981806444e-06, "learning_rate": 0.00967818073377367, "loss": 2.669, "step": 3656 }, { "crossentropy": 2.8765006065368652, "epoch": 0.13257685614849188, "grad_norm": 0.04099765047430992, "grad_norm_var": 4.031925904183532e-06, "learning_rate": 0.009677975597969646, "loss": 2.7788, "step": 3657 }, { "crossentropy": 2.7064640522003174, "epoch": 0.1326131090487239, "grad_norm": 0.06717133522033691, "grad_norm_var": 5.098765015774428e-05, "learning_rate": 0.009677770398982462, "loss": 2.7688, "step": 3658 }, { "crossentropy": 2.7008094787597656, "epoch": 0.1326493619489559, "grad_norm": 0.0394819974899292, "grad_norm_var": 5.0197269503965816e-05, "learning_rate": 0.00967756513681489, "loss": 2.7553, "step": 3659 }, { "crossentropy": 2.8583288192749023, "epoch": 0.13268561484918792, "grad_norm": 0.04020405560731888, "grad_norm_var": 4.8901987540282166e-05, "learning_rate": 0.009677359811469704, "loss": 2.7991, "step": 3660 }, { "crossentropy": 2.7719345092773438, "epoch": 0.13272186774941996, "grad_norm": 0.04256412759423256, "grad_norm_var": 4.845778562567593e-05, "learning_rate": 0.009677154422949674, "loss": 2.5802, "step": 3661 }, { "crossentropy": 2.7283403873443604, "epoch": 0.13275812064965198, "grad_norm": 0.1434343308210373, "grad_norm_var": 0.0006890051341282897, "learning_rate": 0.009676948971257576, "loss": 2.7141, "step": 3662 }, { "crossentropy": 2.810544967651367, "epoch": 0.132794373549884, "grad_norm": 0.04029323533177376, "grad_norm_var": 0.0006862019131304881, "learning_rate": 0.009676743456396185, "loss": 2.7462, "step": 3663 }, { "crossentropy": 2.8045547008514404, "epoch": 0.132830626450116, "grad_norm": 0.04600170999765396, "grad_norm_var": 0.0006792123319660318, "learning_rate": 0.009676537878368275, "loss": 2.786, "step": 3664 }, { "crossentropy": 2.6807076930999756, "epoch": 0.13286687935034802, "grad_norm": 0.04554755613207817, "grad_norm_var": 0.0006680262685186545, "learning_rate": 0.009676332237176625, "loss": 2.6897, "step": 3665 }, { "crossentropy": 2.573115587234497, "epoch": 0.13290313225058004, "grad_norm": 0.04609367996454239, "grad_norm_var": 0.0006622710099525489, "learning_rate": 0.00967612653282401, "loss": 2.6777, "step": 3666 }, { "crossentropy": 2.8499562740325928, "epoch": 0.13293938515081208, "grad_norm": 0.04335479810833931, "grad_norm_var": 0.0006620527888699869, "learning_rate": 0.009675920765313213, "loss": 2.7599, "step": 3667 }, { "crossentropy": 2.7586288452148438, "epoch": 0.1329756380510441, "grad_norm": 0.04339178279042244, "grad_norm_var": 0.0006609503821882949, "learning_rate": 0.00967571493464701, "loss": 2.739, "step": 3668 }, { "crossentropy": 2.6230456829071045, "epoch": 0.1330118909512761, "grad_norm": 0.03685819357633591, "grad_norm_var": 0.0006674020224117222, "learning_rate": 0.009675509040828178, "loss": 2.6307, "step": 3669 }, { "crossentropy": 2.641782760620117, "epoch": 0.13304814385150812, "grad_norm": 0.03590918332338333, "grad_norm_var": 0.000675001074381501, "learning_rate": 0.009675303083859502, "loss": 2.7076, "step": 3670 }, { "crossentropy": 2.6023473739624023, "epoch": 0.13308439675174014, "grad_norm": 0.036909472197294235, "grad_norm_var": 0.0006804505922619201, "learning_rate": 0.009675097063743764, "loss": 2.6993, "step": 3671 }, { "crossentropy": 2.602895975112915, "epoch": 0.13312064965197215, "grad_norm": 0.036322660744190216, "grad_norm_var": 0.000687707605526643, "learning_rate": 0.009674890980483745, "loss": 2.6453, "step": 3672 }, { "crossentropy": 2.7018895149230957, "epoch": 0.13315690255220416, "grad_norm": 0.041976891458034515, "grad_norm_var": 0.0006867183350392029, "learning_rate": 0.00967468483408223, "loss": 2.7065, "step": 3673 }, { "crossentropy": 2.818880081176758, "epoch": 0.1331931554524362, "grad_norm": 0.03929513320326805, "grad_norm_var": 0.0006680982312236895, "learning_rate": 0.009674478624542, "loss": 2.7793, "step": 3674 }, { "crossentropy": 2.7528672218322754, "epoch": 0.13322940835266822, "grad_norm": 0.0364706814289093, "grad_norm_var": 0.0006718250289282708, "learning_rate": 0.009674272351865843, "loss": 2.7612, "step": 3675 }, { "crossentropy": 2.6175217628479004, "epoch": 0.13326566125290024, "grad_norm": 0.046135760843753815, "grad_norm_var": 0.0006685193482601983, "learning_rate": 0.009674066016056547, "loss": 2.653, "step": 3676 }, { "crossentropy": 2.837315797805786, "epoch": 0.13330191415313225, "grad_norm": 0.04431076720356941, "grad_norm_var": 0.0006675523887926854, "learning_rate": 0.009673859617116894, "loss": 2.8196, "step": 3677 }, { "crossentropy": 2.7844159603118896, "epoch": 0.13333816705336426, "grad_norm": 0.04385066032409668, "grad_norm_var": 1.5473595538785003e-05, "learning_rate": 0.009673653155049674, "loss": 2.7029, "step": 3678 }, { "crossentropy": 2.791259527206421, "epoch": 0.13337441995359628, "grad_norm": 0.04376582056283951, "grad_norm_var": 1.570550600604459e-05, "learning_rate": 0.009673446629857675, "loss": 2.757, "step": 3679 }, { "crossentropy": 2.8260955810546875, "epoch": 0.1334106728538283, "grad_norm": 0.04766462370753288, "grad_norm_var": 1.6846049476553806e-05, "learning_rate": 0.009673240041543688, "loss": 2.7929, "step": 3680 }, { "crossentropy": 2.7505033016204834, "epoch": 0.13344692575406034, "grad_norm": 0.04385596886277199, "grad_norm_var": 1.6166365087724213e-05, "learning_rate": 0.009673033390110502, "loss": 2.7535, "step": 3681 }, { "crossentropy": 2.7308595180511475, "epoch": 0.13348317865429235, "grad_norm": 0.03860335797071457, "grad_norm_var": 1.5220375734168769e-05, "learning_rate": 0.009672826675560908, "loss": 2.7192, "step": 3682 }, { "crossentropy": 2.5662190914154053, "epoch": 0.13351943155452436, "grad_norm": 0.03822435066103935, "grad_norm_var": 1.536904508897581e-05, "learning_rate": 0.009672619897897701, "loss": 2.5862, "step": 3683 }, { "crossentropy": 2.874704360961914, "epoch": 0.13355568445475638, "grad_norm": 0.037876468151807785, "grad_norm_var": 1.539853566023859e-05, "learning_rate": 0.00967241305712367, "loss": 2.8054, "step": 3684 }, { "crossentropy": 2.6676816940307617, "epoch": 0.1335919373549884, "grad_norm": 0.03603179007768631, "grad_norm_var": 1.584270635203901e-05, "learning_rate": 0.00967220615324161, "loss": 2.727, "step": 3685 }, { "crossentropy": 2.738694667816162, "epoch": 0.1336281902552204, "grad_norm": 0.038894448429346085, "grad_norm_var": 1.459219946584207e-05, "learning_rate": 0.009671999186254315, "loss": 2.7156, "step": 3686 }, { "crossentropy": 2.6204781532287598, "epoch": 0.13366444315545242, "grad_norm": 0.04278227686882019, "grad_norm_var": 1.3829162318770329e-05, "learning_rate": 0.00967179215616458, "loss": 2.7037, "step": 3687 }, { "crossentropy": 2.790966272354126, "epoch": 0.13370069605568446, "grad_norm": 0.04268666356801987, "grad_norm_var": 1.2388295470981172e-05, "learning_rate": 0.009671585062975203, "loss": 2.7261, "step": 3688 }, { "crossentropy": 2.760863780975342, "epoch": 0.13373694895591648, "grad_norm": 0.04320539906620979, "grad_norm_var": 1.2576855081090216e-05, "learning_rate": 0.009671377906688981, "loss": 2.6739, "step": 3689 }, { "crossentropy": 2.7464542388916016, "epoch": 0.1337732018561485, "grad_norm": 0.04661822319030762, "grad_norm_var": 1.3796829099369835e-05, "learning_rate": 0.009671170687308711, "loss": 2.7291, "step": 3690 }, { "crossentropy": 2.8017849922180176, "epoch": 0.1338094547563805, "grad_norm": 0.0457458533346653, "grad_norm_var": 1.2414629937858765e-05, "learning_rate": 0.009670963404837193, "loss": 2.7483, "step": 3691 }, { "crossentropy": 2.711627244949341, "epoch": 0.13384570765661252, "grad_norm": 0.048033151775598526, "grad_norm_var": 1.3555438965750083e-05, "learning_rate": 0.009670756059277223, "loss": 2.7006, "step": 3692 }, { "crossentropy": 2.622174024581909, "epoch": 0.13388196055684454, "grad_norm": 0.041560813784599304, "grad_norm_var": 1.3413408435212353e-05, "learning_rate": 0.009670548650631607, "loss": 2.6398, "step": 3693 }, { "crossentropy": 2.8337819576263428, "epoch": 0.13391821345707658, "grad_norm": 0.03857606276869774, "grad_norm_var": 1.417597400628582e-05, "learning_rate": 0.009670341178903143, "loss": 2.7021, "step": 3694 }, { "crossentropy": 2.626847982406616, "epoch": 0.1339544663573086, "grad_norm": 0.040384940803050995, "grad_norm_var": 1.4154244584194988e-05, "learning_rate": 0.009670133644094634, "loss": 2.7144, "step": 3695 }, { "crossentropy": 2.7074780464172363, "epoch": 0.1339907192575406, "grad_norm": 0.03781939297914505, "grad_norm_var": 1.2673328601342516e-05, "learning_rate": 0.009669926046208883, "loss": 2.6736, "step": 3696 }, { "crossentropy": 2.7766222953796387, "epoch": 0.13402697215777262, "grad_norm": 0.039716776460409164, "grad_norm_var": 1.2336936555730714e-05, "learning_rate": 0.009669718385248693, "loss": 2.7194, "step": 3697 }, { "crossentropy": 2.669257640838623, "epoch": 0.13406322505800464, "grad_norm": 0.04608992859721184, "grad_norm_var": 1.340021907853302e-05, "learning_rate": 0.009669510661216872, "loss": 2.7196, "step": 3698 }, { "crossentropy": 2.7726938724517822, "epoch": 0.13409947795823665, "grad_norm": 0.038590721786022186, "grad_norm_var": 1.3247841827273889e-05, "learning_rate": 0.009669302874116222, "loss": 2.6735, "step": 3699 }, { "crossentropy": 2.5711746215820312, "epoch": 0.13413573085846867, "grad_norm": 0.035212088376283646, "grad_norm_var": 1.499239487602703e-05, "learning_rate": 0.00966909502394955, "loss": 2.66, "step": 3700 }, { "crossentropy": 2.6079514026641846, "epoch": 0.1341719837587007, "grad_norm": 0.03543078154325485, "grad_norm_var": 1.5442888108392155e-05, "learning_rate": 0.009668887110719664, "loss": 2.6794, "step": 3701 }, { "crossentropy": 2.710974931716919, "epoch": 0.13420823665893272, "grad_norm": 0.04205768182873726, "grad_norm_var": 1.5039256944201126e-05, "learning_rate": 0.009668679134429372, "loss": 2.678, "step": 3702 }, { "crossentropy": 2.813565492630005, "epoch": 0.13424448955916474, "grad_norm": 0.04308902844786644, "grad_norm_var": 1.5096277742694155e-05, "learning_rate": 0.009668471095081484, "loss": 2.735, "step": 3703 }, { "crossentropy": 2.7455413341522217, "epoch": 0.13428074245939675, "grad_norm": 0.03972238302230835, "grad_norm_var": 1.519664322576399e-05, "learning_rate": 0.00966826299267881, "loss": 2.7531, "step": 3704 }, { "crossentropy": 2.7511708736419678, "epoch": 0.13431699535962877, "grad_norm": 0.040988121181726456, "grad_norm_var": 1.4960067562321856e-05, "learning_rate": 0.009668054827224158, "loss": 2.7489, "step": 3705 }, { "crossentropy": 2.615736484527588, "epoch": 0.13435324825986078, "grad_norm": 0.041055407375097275, "grad_norm_var": 1.2895590603046828e-05, "learning_rate": 0.009667846598720343, "loss": 2.6305, "step": 3706 }, { "crossentropy": 2.5323052406311035, "epoch": 0.1343895011600928, "grad_norm": 0.037561967968940735, "grad_norm_var": 1.1771576296463505e-05, "learning_rate": 0.009667638307170176, "loss": 2.6716, "step": 3707 }, { "crossentropy": 2.82295823097229, "epoch": 0.13442575406032484, "grad_norm": 0.03627874329686165, "grad_norm_var": 8.393837692335405e-06, "learning_rate": 0.009667429952576469, "loss": 2.7694, "step": 3708 }, { "crossentropy": 2.5014846324920654, "epoch": 0.13446200696055685, "grad_norm": 0.036196738481521606, "grad_norm_var": 8.813683170752427e-06, "learning_rate": 0.009667221534942039, "loss": 2.6309, "step": 3709 }, { "crossentropy": 2.7756998538970947, "epoch": 0.13449825986078887, "grad_norm": 0.038513414561748505, "grad_norm_var": 8.819960323835687e-06, "learning_rate": 0.009667013054269698, "loss": 2.6947, "step": 3710 }, { "crossentropy": 2.628856897354126, "epoch": 0.13453451276102088, "grad_norm": 0.04507804289460182, "grad_norm_var": 1.0879027650540808e-05, "learning_rate": 0.009666804510562265, "loss": 2.6893, "step": 3711 }, { "crossentropy": 2.8021836280822754, "epoch": 0.1345707656612529, "grad_norm": 0.038446780294179916, "grad_norm_var": 1.07557171505251e-05, "learning_rate": 0.009666595903822555, "loss": 2.7286, "step": 3712 }, { "crossentropy": 2.7160186767578125, "epoch": 0.1346070185614849, "grad_norm": 0.035870831459760666, "grad_norm_var": 1.1634027476829095e-05, "learning_rate": 0.009666387234053385, "loss": 2.6905, "step": 3713 }, { "crossentropy": 2.699218511581421, "epoch": 0.13464327146171692, "grad_norm": 0.046791862696409225, "grad_norm_var": 1.2292211791229552e-05, "learning_rate": 0.009666178501257573, "loss": 2.7154, "step": 3714 }, { "crossentropy": 2.7248430252075195, "epoch": 0.13467952436194897, "grad_norm": 0.038897570222616196, "grad_norm_var": 1.2263747294254055e-05, "learning_rate": 0.009665969705437939, "loss": 2.6794, "step": 3715 }, { "crossentropy": 2.7015304565429688, "epoch": 0.13471577726218098, "grad_norm": 0.03706469386816025, "grad_norm_var": 1.1431564804924762e-05, "learning_rate": 0.009665760846597303, "loss": 2.7042, "step": 3716 }, { "crossentropy": 2.568937301635742, "epoch": 0.134752030162413, "grad_norm": 0.036675479263067245, "grad_norm_var": 1.0842238692938764e-05, "learning_rate": 0.009665551924738487, "loss": 2.6054, "step": 3717 }, { "crossentropy": 2.6305785179138184, "epoch": 0.134788283062645, "grad_norm": 0.04121283069252968, "grad_norm_var": 1.0614848566445288e-05, "learning_rate": 0.009665342939864312, "loss": 2.7728, "step": 3718 }, { "crossentropy": 2.7158520221710205, "epoch": 0.13482453596287702, "grad_norm": 0.03739919140934944, "grad_norm_var": 9.983903407334513e-06, "learning_rate": 0.0096651338919776, "loss": 2.7114, "step": 3719 }, { "crossentropy": 2.717940330505371, "epoch": 0.13486078886310904, "grad_norm": 0.042724255472421646, "grad_norm_var": 1.0742329309717155e-05, "learning_rate": 0.009664924781081177, "loss": 2.6739, "step": 3720 }, { "crossentropy": 2.5532548427581787, "epoch": 0.13489704176334108, "grad_norm": 0.040108393877744675, "grad_norm_var": 1.0607026861900876e-05, "learning_rate": 0.009664715607177866, "loss": 2.6507, "step": 3721 }, { "crossentropy": 2.7893075942993164, "epoch": 0.1349332946635731, "grad_norm": 0.039090704172849655, "grad_norm_var": 1.0406053439485779e-05, "learning_rate": 0.00966450637027049, "loss": 2.7789, "step": 3722 }, { "crossentropy": 2.632817268371582, "epoch": 0.1349695475638051, "grad_norm": 0.03835131973028183, "grad_norm_var": 1.0267917704021023e-05, "learning_rate": 0.009664297070361877, "loss": 2.6526, "step": 3723 }, { "crossentropy": 2.6014060974121094, "epoch": 0.13500580046403712, "grad_norm": 0.040160246193408966, "grad_norm_var": 9.649151545084756e-06, "learning_rate": 0.009664087707454853, "loss": 2.7114, "step": 3724 }, { "crossentropy": 2.738065719604492, "epoch": 0.13504205336426914, "grad_norm": 0.039750803261995316, "grad_norm_var": 8.856030780701509e-06, "learning_rate": 0.009663878281552248, "loss": 2.6166, "step": 3725 }, { "crossentropy": 2.652898073196411, "epoch": 0.13507830626450115, "grad_norm": 0.04225696250796318, "grad_norm_var": 9.11043052296828e-06, "learning_rate": 0.009663668792656888, "loss": 2.5891, "step": 3726 }, { "crossentropy": 2.7741200923919678, "epoch": 0.13511455916473317, "grad_norm": 0.036156829446554184, "grad_norm_var": 8.03545274360943e-06, "learning_rate": 0.009663459240771603, "loss": 2.7584, "step": 3727 }, { "crossentropy": 2.7060999870300293, "epoch": 0.1351508120649652, "grad_norm": 0.03769955039024353, "grad_norm_var": 8.16879899694714e-06, "learning_rate": 0.009663249625899227, "loss": 2.6694, "step": 3728 }, { "crossentropy": 2.7795746326446533, "epoch": 0.13518706496519722, "grad_norm": 0.04040615260601044, "grad_norm_var": 7.327371722532818e-06, "learning_rate": 0.009663039948042585, "loss": 2.7845, "step": 3729 }, { "crossentropy": 2.771728992462158, "epoch": 0.13522331786542924, "grad_norm": 0.04683773219585419, "grad_norm_var": 7.371049798452859e-06, "learning_rate": 0.009662830207204513, "loss": 2.597, "step": 3730 }, { "crossentropy": 2.7737748622894287, "epoch": 0.13525957076566125, "grad_norm": 0.04439276456832886, "grad_norm_var": 8.689088915455185e-06, "learning_rate": 0.009662620403387842, "loss": 2.7024, "step": 3731 }, { "crossentropy": 2.632498264312744, "epoch": 0.13529582366589327, "grad_norm": 0.041128288954496384, "grad_norm_var": 8.121003584108527e-06, "learning_rate": 0.00966241053659541, "loss": 2.699, "step": 3732 }, { "crossentropy": 2.81516432762146, "epoch": 0.13533207656612528, "grad_norm": 0.038492973893880844, "grad_norm_var": 7.455912222114937e-06, "learning_rate": 0.009662200606830046, "loss": 2.8047, "step": 3733 }, { "crossentropy": 2.7006430625915527, "epoch": 0.1353683294663573, "grad_norm": 0.041892342269420624, "grad_norm_var": 7.5597224998081095e-06, "learning_rate": 0.009661990614094586, "loss": 2.6715, "step": 3734 }, { "crossentropy": 2.6505143642425537, "epoch": 0.13540458236658934, "grad_norm": 0.0476812943816185, "grad_norm_var": 1.001494507352485e-05, "learning_rate": 0.009661780558391871, "loss": 2.6784, "step": 3735 }, { "crossentropy": 2.792672872543335, "epoch": 0.13544083526682135, "grad_norm": 0.044952668249607086, "grad_norm_var": 1.0816627145134906e-05, "learning_rate": 0.009661570439724734, "loss": 2.782, "step": 3736 }, { "crossentropy": 2.7170703411102295, "epoch": 0.13547708816705337, "grad_norm": 0.045052774250507355, "grad_norm_var": 1.1618363702326866e-05, "learning_rate": 0.009661360258096012, "loss": 2.7379, "step": 3737 }, { "crossentropy": 2.6942837238311768, "epoch": 0.13551334106728538, "grad_norm": 0.03982185199856758, "grad_norm_var": 1.1415052624515429e-05, "learning_rate": 0.009661150013508549, "loss": 2.6866, "step": 3738 }, { "crossentropy": 2.87748384475708, "epoch": 0.1355495939675174, "grad_norm": 0.03802630305290222, "grad_norm_var": 1.156090674110192e-05, "learning_rate": 0.009660939705965179, "loss": 2.794, "step": 3739 }, { "crossentropy": 2.738771438598633, "epoch": 0.1355858468677494, "grad_norm": 0.0396905355155468, "grad_norm_var": 1.1661379535137367e-05, "learning_rate": 0.009660729335468745, "loss": 2.7391, "step": 3740 }, { "crossentropy": 2.6235291957855225, "epoch": 0.13562209976798145, "grad_norm": 0.03909420967102051, "grad_norm_var": 1.1842771324189575e-05, "learning_rate": 0.00966051890202209, "loss": 2.6772, "step": 3741 }, { "crossentropy": 2.7976303100585938, "epoch": 0.13565835266821347, "grad_norm": 0.03913849964737892, "grad_norm_var": 1.2125000146428801e-05, "learning_rate": 0.009660308405628055, "loss": 2.7142, "step": 3742 }, { "crossentropy": 2.7298359870910645, "epoch": 0.13569460556844548, "grad_norm": 0.04143006354570389, "grad_norm_var": 1.0261516371841359e-05, "learning_rate": 0.009660097846289482, "loss": 2.6971, "step": 3743 }, { "crossentropy": 2.619962453842163, "epoch": 0.1357308584686775, "grad_norm": 0.03763045743107796, "grad_norm_var": 1.0297826677294757e-05, "learning_rate": 0.009659887224009215, "loss": 2.644, "step": 3744 }, { "crossentropy": 2.734165906906128, "epoch": 0.1357671113689095, "grad_norm": 0.03895203396677971, "grad_norm_var": 1.0662281645261857e-05, "learning_rate": 0.0096596765387901, "loss": 2.7143, "step": 3745 }, { "crossentropy": 2.716416597366333, "epoch": 0.13580336426914152, "grad_norm": 0.03778800740838051, "grad_norm_var": 9.356407441762683e-06, "learning_rate": 0.009659465790634982, "loss": 2.7357, "step": 3746 }, { "crossentropy": 2.8198561668395996, "epoch": 0.13583961716937354, "grad_norm": 0.036600127816200256, "grad_norm_var": 9.572368118863223e-06, "learning_rate": 0.009659254979546708, "loss": 2.7952, "step": 3747 }, { "crossentropy": 2.4818737506866455, "epoch": 0.13587587006960558, "grad_norm": 0.03720199316740036, "grad_norm_var": 1.0186408950282367e-05, "learning_rate": 0.009659044105528123, "loss": 2.6218, "step": 3748 }, { "crossentropy": 2.624587059020996, "epoch": 0.1359121229698376, "grad_norm": 0.0384223535656929, "grad_norm_var": 1.0202938936582084e-05, "learning_rate": 0.00965883316858208, "loss": 2.6593, "step": 3749 }, { "crossentropy": 2.7054383754730225, "epoch": 0.1359483758700696, "grad_norm": 0.04215239733457565, "grad_norm_var": 1.0265465651868958e-05, "learning_rate": 0.009658622168711422, "loss": 2.7706, "step": 3750 }, { "crossentropy": 2.7872939109802246, "epoch": 0.13598462877030162, "grad_norm": 0.038669783622026443, "grad_norm_var": 6.384597835256261e-06, "learning_rate": 0.009658411105919003, "loss": 2.7705, "step": 3751 }, { "crossentropy": 2.6989524364471436, "epoch": 0.13602088167053364, "grad_norm": 0.041116632521152496, "grad_norm_var": 4.599295178069467e-06, "learning_rate": 0.009658199980207673, "loss": 2.6562, "step": 3752 }, { "crossentropy": 2.771838426589966, "epoch": 0.13605713457076565, "grad_norm": 0.038932908326387405, "grad_norm_var": 2.3473186928762526e-06, "learning_rate": 0.009657988791580283, "loss": 2.7598, "step": 3753 }, { "crossentropy": 2.759182929992676, "epoch": 0.13609338747099767, "grad_norm": 0.040153633803129196, "grad_norm_var": 2.3887080221052237e-06, "learning_rate": 0.009657777540039684, "loss": 2.7885, "step": 3754 }, { "crossentropy": 2.7522833347320557, "epoch": 0.1361296403712297, "grad_norm": 0.03920091316103935, "grad_norm_var": 2.3126567577630776e-06, "learning_rate": 0.009657566225588735, "loss": 2.749, "step": 3755 }, { "crossentropy": 2.7834460735321045, "epoch": 0.13616589327146172, "grad_norm": 0.04147827625274658, "grad_norm_var": 2.64461150861021e-06, "learning_rate": 0.009657354848230283, "loss": 2.7514, "step": 3756 }, { "crossentropy": 2.548104763031006, "epoch": 0.13620214617169374, "grad_norm": 0.03867309167981148, "grad_norm_var": 2.6643104306782345e-06, "learning_rate": 0.009657143407967186, "loss": 2.6245, "step": 3757 }, { "crossentropy": 2.832838773727417, "epoch": 0.13623839907192575, "grad_norm": 0.042246539145708084, "grad_norm_var": 3.233732206402567e-06, "learning_rate": 0.0096569319048023, "loss": 2.7601, "step": 3758 }, { "crossentropy": 2.9046742916107178, "epoch": 0.13627465197215777, "grad_norm": 0.04258120805025101, "grad_norm_var": 3.6257485943592844e-06, "learning_rate": 0.009656720338738482, "loss": 2.843, "step": 3759 }, { "crossentropy": 2.7142794132232666, "epoch": 0.13631090487238978, "grad_norm": 0.044173017144203186, "grad_norm_var": 4.681072171874856e-06, "learning_rate": 0.00965650870977859, "loss": 2.7699, "step": 3760 }, { "crossentropy": 2.7161459922790527, "epoch": 0.1363471577726218, "grad_norm": 0.03955481946468353, "grad_norm_var": 4.6278789523891575e-06, "learning_rate": 0.00965629701792548, "loss": 2.7284, "step": 3761 }, { "crossentropy": 2.7834339141845703, "epoch": 0.13638341067285384, "grad_norm": 0.038656849414110184, "grad_norm_var": 4.426443226879644e-06, "learning_rate": 0.009656085263182011, "loss": 2.7928, "step": 3762 }, { "crossentropy": 2.6131949424743652, "epoch": 0.13641966357308585, "grad_norm": 0.0385514535009861, "grad_norm_var": 3.782870701261182e-06, "learning_rate": 0.009655873445551046, "loss": 2.6546, "step": 3763 }, { "crossentropy": 2.6042230129241943, "epoch": 0.13645591647331787, "grad_norm": 0.036681853234767914, "grad_norm_var": 4.001481306018656e-06, "learning_rate": 0.009655661565035447, "loss": 2.6021, "step": 3764 }, { "crossentropy": 2.7845406532287598, "epoch": 0.13649216937354988, "grad_norm": 0.03787429258227348, "grad_norm_var": 4.141230145536466e-06, "learning_rate": 0.00965544962163807, "loss": 2.7068, "step": 3765 }, { "crossentropy": 2.7717607021331787, "epoch": 0.1365284222737819, "grad_norm": 0.03960235044360161, "grad_norm_var": 3.830648535272744e-06, "learning_rate": 0.009655237615361783, "loss": 2.6478, "step": 3766 }, { "crossentropy": 2.6964311599731445, "epoch": 0.1365646751740139, "grad_norm": 0.040447741746902466, "grad_norm_var": 3.7403223588504283e-06, "learning_rate": 0.009655025546209447, "loss": 2.7683, "step": 3767 }, { "crossentropy": 2.6678059101104736, "epoch": 0.13660092807424595, "grad_norm": 0.039945922791957855, "grad_norm_var": 3.650956066819586e-06, "learning_rate": 0.009654813414183927, "loss": 2.7484, "step": 3768 }, { "crossentropy": 2.723658323287964, "epoch": 0.13663718097447797, "grad_norm": 0.044085849076509476, "grad_norm_var": 4.63081858675009e-06, "learning_rate": 0.009654601219288086, "loss": 2.7854, "step": 3769 }, { "crossentropy": 2.7739789485931396, "epoch": 0.13667343387470998, "grad_norm": 0.04445786774158478, "grad_norm_var": 5.736722769230281e-06, "learning_rate": 0.009654388961524794, "loss": 2.6077, "step": 3770 }, { "crossentropy": 2.6380252838134766, "epoch": 0.136709686774942, "grad_norm": 0.048610761761665344, "grad_norm_var": 9.624278578962361e-06, "learning_rate": 0.009654176640896914, "loss": 2.7008, "step": 3771 }, { "crossentropy": 2.8290774822235107, "epoch": 0.136745939675174, "grad_norm": 0.041653115302324295, "grad_norm_var": 9.634975550927803e-06, "learning_rate": 0.009653964257407317, "loss": 2.759, "step": 3772 }, { "crossentropy": 2.5877556800842285, "epoch": 0.13678219257540603, "grad_norm": 0.04078993946313858, "grad_norm_var": 9.226584318255439e-06, "learning_rate": 0.009653751811058871, "loss": 2.5325, "step": 3773 }, { "crossentropy": 2.653859853744507, "epoch": 0.13681844547563804, "grad_norm": 0.03999338671565056, "grad_norm_var": 9.242874568242815e-06, "learning_rate": 0.009653539301854443, "loss": 2.7023, "step": 3774 }, { "crossentropy": 2.7649037837982178, "epoch": 0.13685469837587008, "grad_norm": 0.038427893072366714, "grad_norm_var": 9.502836158462961e-06, "learning_rate": 0.009653326729796905, "loss": 2.7013, "step": 3775 }, { "crossentropy": 2.567410469055176, "epoch": 0.1368909512761021, "grad_norm": 0.03860990330576897, "grad_norm_var": 8.967951707352222e-06, "learning_rate": 0.009653114094889127, "loss": 2.6505, "step": 3776 }, { "crossentropy": 2.7632033824920654, "epoch": 0.1369272041763341, "grad_norm": 0.03998364880681038, "grad_norm_var": 8.92560242224764e-06, "learning_rate": 0.009652901397133982, "loss": 2.7531, "step": 3777 }, { "crossentropy": 2.7207608222961426, "epoch": 0.13696345707656613, "grad_norm": 0.03789953887462616, "grad_norm_var": 9.149911918356473e-06, "learning_rate": 0.009652688636534344, "loss": 2.6762, "step": 3778 }, { "crossentropy": 2.7470011711120605, "epoch": 0.13699970997679814, "grad_norm": 0.043017830699682236, "grad_norm_var": 9.250612592565504e-06, "learning_rate": 0.009652475813093085, "loss": 2.7567, "step": 3779 }, { "crossentropy": 2.6697494983673096, "epoch": 0.13703596287703015, "grad_norm": 0.052813153713941574, "grad_norm_var": 1.6753348967222388e-05, "learning_rate": 0.009652262926813078, "loss": 2.6262, "step": 3780 }, { "crossentropy": 2.6815438270568848, "epoch": 0.13707221577726217, "grad_norm": 0.042073726654052734, "grad_norm_var": 1.567798731261314e-05, "learning_rate": 0.009652049977697202, "loss": 2.7048, "step": 3781 }, { "crossentropy": 2.704901933670044, "epoch": 0.1371084686774942, "grad_norm": 0.04003365710377693, "grad_norm_var": 1.5550247854809127e-05, "learning_rate": 0.00965183696574833, "loss": 2.7579, "step": 3782 }, { "crossentropy": 2.7203123569488525, "epoch": 0.13714472157772623, "grad_norm": 0.03616538271307945, "grad_norm_var": 1.7612837577043092e-05, "learning_rate": 0.009651623890969342, "loss": 2.7862, "step": 3783 }, { "crossentropy": 2.7431533336639404, "epoch": 0.13718097447795824, "grad_norm": 0.036930862814188004, "grad_norm_var": 1.8920362631626108e-05, "learning_rate": 0.009651410753363112, "loss": 2.7197, "step": 3784 }, { "crossentropy": 2.642188310623169, "epoch": 0.13721722737819025, "grad_norm": 0.03659175708889961, "grad_norm_var": 1.9943220162134344e-05, "learning_rate": 0.009651197552932523, "loss": 2.6911, "step": 3785 }, { "crossentropy": 2.7262654304504395, "epoch": 0.13725348027842227, "grad_norm": 0.04061770811676979, "grad_norm_var": 1.9160075261840752e-05, "learning_rate": 0.009650984289680451, "loss": 2.7226, "step": 3786 }, { "crossentropy": 2.6523470878601074, "epoch": 0.13728973317865428, "grad_norm": 0.04557402431964874, "grad_norm_var": 1.660961080999964e-05, "learning_rate": 0.009650770963609778, "loss": 2.6714, "step": 3787 }, { "crossentropy": 2.774305582046509, "epoch": 0.1373259860788863, "grad_norm": 0.044855598360300064, "grad_norm_var": 1.7658235572240445e-05, "learning_rate": 0.009650557574723386, "loss": 2.784, "step": 3788 }, { "crossentropy": 2.729907989501953, "epoch": 0.13736223897911834, "grad_norm": 0.0441901832818985, "grad_norm_var": 1.8331564553941462e-05, "learning_rate": 0.009650344123024157, "loss": 2.7257, "step": 3789 }, { "crossentropy": 2.668619394302368, "epoch": 0.13739849187935035, "grad_norm": 0.0417025052011013, "grad_norm_var": 1.8259415832603403e-05, "learning_rate": 0.009650130608514973, "loss": 2.7321, "step": 3790 }, { "crossentropy": 2.7574992179870605, "epoch": 0.13743474477958237, "grad_norm": 0.04650940001010895, "grad_norm_var": 1.9334944332604665e-05, "learning_rate": 0.00964991703119872, "loss": 2.7414, "step": 3791 }, { "crossentropy": 2.7345452308654785, "epoch": 0.13747099767981438, "grad_norm": 0.046380553394556046, "grad_norm_var": 1.9883386940252635e-05, "learning_rate": 0.00964970339107828, "loss": 2.7182, "step": 3792 }, { "crossentropy": 2.8429372310638428, "epoch": 0.1375072505800464, "grad_norm": 0.04704347625374794, "grad_norm_var": 2.090397651704854e-05, "learning_rate": 0.009649489688156538, "loss": 2.7159, "step": 3793 }, { "crossentropy": 2.798635721206665, "epoch": 0.1375435034802784, "grad_norm": 0.048652730882167816, "grad_norm_var": 2.1319964059032746e-05, "learning_rate": 0.009649275922436383, "loss": 2.7764, "step": 3794 }, { "crossentropy": 2.6016082763671875, "epoch": 0.13757975638051045, "grad_norm": 0.04112323001027107, "grad_norm_var": 2.1621154486797742e-05, "learning_rate": 0.009649062093920701, "loss": 2.6043, "step": 3795 }, { "crossentropy": 2.647582530975342, "epoch": 0.13761600928074247, "grad_norm": 0.03768174722790718, "grad_norm_var": 1.6543689880803696e-05, "learning_rate": 0.00964884820261238, "loss": 2.6372, "step": 3796 }, { "crossentropy": 2.7283377647399902, "epoch": 0.13765226218097448, "grad_norm": 0.0382632352411747, "grad_norm_var": 1.7544756862943277e-05, "learning_rate": 0.00964863424851431, "loss": 2.8015, "step": 3797 }, { "crossentropy": 2.808640956878662, "epoch": 0.1376885150812065, "grad_norm": 0.038028158247470856, "grad_norm_var": 1.8327215273821324e-05, "learning_rate": 0.009648420231629378, "loss": 2.7628, "step": 3798 }, { "crossentropy": 2.850539445877075, "epoch": 0.1377247679814385, "grad_norm": 0.037037111818790436, "grad_norm_var": 1.7708821822805484e-05, "learning_rate": 0.009648206151960478, "loss": 2.7356, "step": 3799 }, { "crossentropy": 2.755620241165161, "epoch": 0.13776102088167053, "grad_norm": 0.037359509617090225, "grad_norm_var": 1.743351046057731e-05, "learning_rate": 0.0096479920095105, "loss": 2.7209, "step": 3800 }, { "crossentropy": 2.6945042610168457, "epoch": 0.13779727378190254, "grad_norm": 0.041964851319789886, "grad_norm_var": 1.5380782096462095e-05, "learning_rate": 0.009647777804282337, "loss": 2.6194, "step": 3801 }, { "crossentropy": 2.741523265838623, "epoch": 0.13783352668213458, "grad_norm": 0.044806018471717834, "grad_norm_var": 1.553126928214831e-05, "learning_rate": 0.00964756353627888, "loss": 2.6703, "step": 3802 }, { "crossentropy": 2.6601810455322266, "epoch": 0.1378697795823666, "grad_norm": 0.04616102576255798, "grad_norm_var": 1.5787664499105947e-05, "learning_rate": 0.009647349205503024, "loss": 2.7373, "step": 3803 }, { "crossentropy": 2.7164952754974365, "epoch": 0.1379060324825986, "grad_norm": 0.06476505845785141, "grad_norm_var": 4.652309085993357e-05, "learning_rate": 0.009647134811957666, "loss": 2.6825, "step": 3804 }, { "crossentropy": 2.659330129623413, "epoch": 0.13794228538283063, "grad_norm": 0.043744344264268875, "grad_norm_var": 4.651554746928725e-05, "learning_rate": 0.0096469203556457, "loss": 2.7421, "step": 3805 }, { "crossentropy": 2.6654677391052246, "epoch": 0.13797853828306264, "grad_norm": 0.043584082275629044, "grad_norm_var": 4.620397328146837e-05, "learning_rate": 0.009646705836570022, "loss": 2.7135, "step": 3806 }, { "crossentropy": 2.572021484375, "epoch": 0.13801479118329466, "grad_norm": 0.04403926059603691, "grad_norm_var": 4.574041412985409e-05, "learning_rate": 0.009646491254733531, "loss": 2.5936, "step": 3807 }, { "crossentropy": 2.694330930709839, "epoch": 0.13805104408352667, "grad_norm": 0.051124878227710724, "grad_norm_var": 4.878614798135723e-05, "learning_rate": 0.009646276610139124, "loss": 2.6974, "step": 3808 }, { "crossentropy": 2.6851375102996826, "epoch": 0.1380872969837587, "grad_norm": 0.051262542605400085, "grad_norm_var": 5.156229009051164e-05, "learning_rate": 0.0096460619027897, "loss": 2.6625, "step": 3809 }, { "crossentropy": 2.8080267906188965, "epoch": 0.13812354988399073, "grad_norm": 0.04614652320742607, "grad_norm_var": 5.051700618287798e-05, "learning_rate": 0.00964584713268816, "loss": 2.762, "step": 3810 }, { "crossentropy": 2.825482130050659, "epoch": 0.13815980278422274, "grad_norm": 0.04290017858147621, "grad_norm_var": 4.9986990062557646e-05, "learning_rate": 0.009645632299837403, "loss": 2.702, "step": 3811 }, { "crossentropy": 2.6111643314361572, "epoch": 0.13819605568445475, "grad_norm": 0.038527678698301315, "grad_norm_var": 4.9284753557368724e-05, "learning_rate": 0.009645417404240332, "loss": 2.6334, "step": 3812 }, { "crossentropy": 2.803802013397217, "epoch": 0.13823230858468677, "grad_norm": 0.039493102580308914, "grad_norm_var": 4.837999459306926e-05, "learning_rate": 0.00964520244589985, "loss": 2.7921, "step": 3813 }, { "crossentropy": 2.8257901668548584, "epoch": 0.13826856148491878, "grad_norm": 0.037431661039590836, "grad_norm_var": 4.891170982728819e-05, "learning_rate": 0.009644987424818858, "loss": 2.7284, "step": 3814 }, { "crossentropy": 2.643108367919922, "epoch": 0.1383048143851508, "grad_norm": 0.04315586015582085, "grad_norm_var": 4.5247424679856205e-05, "learning_rate": 0.009644772341000264, "loss": 2.709, "step": 3815 }, { "crossentropy": 2.6524884700775146, "epoch": 0.13834106728538284, "grad_norm": 0.03733262047171593, "grad_norm_var": 4.527407094691363e-05, "learning_rate": 0.00964455719444697, "loss": 2.6626, "step": 3816 }, { "crossentropy": 2.7336196899414062, "epoch": 0.13837732018561485, "grad_norm": 0.0430707186460495, "grad_norm_var": 4.493578557286844e-05, "learning_rate": 0.009644341985161882, "loss": 2.6904, "step": 3817 }, { "crossentropy": 2.5609140396118164, "epoch": 0.13841357308584687, "grad_norm": 0.04480825737118721, "grad_norm_var": 4.493577377262288e-05, "learning_rate": 0.009644126713147908, "loss": 2.6649, "step": 3818 }, { "crossentropy": 2.943312406539917, "epoch": 0.13844982598607888, "grad_norm": 0.04498879984021187, "grad_norm_var": 4.481623676837627e-05, "learning_rate": 0.009643911378407954, "loss": 2.8599, "step": 3819 }, { "crossentropy": 2.719391107559204, "epoch": 0.1384860788863109, "grad_norm": 0.04451802372932434, "grad_norm_var": 1.6468260886532784e-05, "learning_rate": 0.009643695980944932, "loss": 2.7601, "step": 3820 }, { "crossentropy": 2.6306774616241455, "epoch": 0.1385223317865429, "grad_norm": 0.06673462688922882, "grad_norm_var": 5.022721021341366e-05, "learning_rate": 0.009643480520761747, "loss": 2.634, "step": 3821 }, { "crossentropy": 2.7814230918884277, "epoch": 0.13855858468677495, "grad_norm": 0.0364927314221859, "grad_norm_var": 5.4656859748064496e-05, "learning_rate": 0.009643264997861311, "loss": 2.7328, "step": 3822 }, { "crossentropy": 2.647494316101074, "epoch": 0.13859483758700697, "grad_norm": 0.03570931404829025, "grad_norm_var": 5.950724140681337e-05, "learning_rate": 0.009643049412246535, "loss": 2.6925, "step": 3823 }, { "crossentropy": 2.746175527572632, "epoch": 0.13863109048723898, "grad_norm": 0.04195677116513252, "grad_norm_var": 5.60279653161956e-05, "learning_rate": 0.009642833763920331, "loss": 2.6762, "step": 3824 }, { "crossentropy": 2.740445852279663, "epoch": 0.138667343387471, "grad_norm": 0.04986119642853737, "grad_norm_var": 5.468312638135612e-05, "learning_rate": 0.00964261805288561, "loss": 2.7047, "step": 3825 }, { "crossentropy": 2.769097328186035, "epoch": 0.138703596287703, "grad_norm": 0.044708169996738434, "grad_norm_var": 5.427045495867277e-05, "learning_rate": 0.009642402279145288, "loss": 2.7953, "step": 3826 }, { "crossentropy": 2.738145351409912, "epoch": 0.13873984918793503, "grad_norm": 0.045110031962394714, "grad_norm_var": 5.447831094268658e-05, "learning_rate": 0.009642186442702278, "loss": 2.6854, "step": 3827 }, { "crossentropy": 2.619734048843384, "epoch": 0.13877610208816704, "grad_norm": 0.0383308082818985, "grad_norm_var": 5.4607807763741055e-05, "learning_rate": 0.009641970543559496, "loss": 2.5931, "step": 3828 }, { "crossentropy": 2.8051631450653076, "epoch": 0.13881235498839908, "grad_norm": 0.03499313443899155, "grad_norm_var": 5.8191387910496346e-05, "learning_rate": 0.009641754581719855, "loss": 2.7033, "step": 3829 }, { "crossentropy": 2.6975505352020264, "epoch": 0.1388486078886311, "grad_norm": 0.03874578699469566, "grad_norm_var": 5.731048323810971e-05, "learning_rate": 0.009641538557186276, "loss": 2.7373, "step": 3830 }, { "crossentropy": 2.6070661544799805, "epoch": 0.1388848607888631, "grad_norm": 0.03632884845137596, "grad_norm_var": 6.022480238271321e-05, "learning_rate": 0.009641322469961675, "loss": 2.629, "step": 3831 }, { "crossentropy": 2.795854091644287, "epoch": 0.13892111368909513, "grad_norm": 0.03939031809568405, "grad_norm_var": 5.900844278115541e-05, "learning_rate": 0.009641106320048968, "loss": 2.7575, "step": 3832 }, { "crossentropy": 2.6142306327819824, "epoch": 0.13895736658932714, "grad_norm": 0.04183259606361389, "grad_norm_var": 5.906933735167804e-05, "learning_rate": 0.00964089010745108, "loss": 2.7034, "step": 3833 }, { "crossentropy": 2.821343421936035, "epoch": 0.13899361948955916, "grad_norm": 0.04312320053577423, "grad_norm_var": 5.8791516744443235e-05, "learning_rate": 0.009640673832170928, "loss": 2.7733, "step": 3834 }, { "crossentropy": 2.7901201248168945, "epoch": 0.13902987238979117, "grad_norm": 0.04756193235516548, "grad_norm_var": 5.9998636098028875e-05, "learning_rate": 0.009640457494211432, "loss": 2.7077, "step": 3835 }, { "crossentropy": 2.749300479888916, "epoch": 0.1390661252900232, "grad_norm": 0.038603562861680984, "grad_norm_var": 6.0859563144677274e-05, "learning_rate": 0.009640241093575516, "loss": 2.6929, "step": 3836 }, { "crossentropy": 2.5719969272613525, "epoch": 0.13910237819025523, "grad_norm": 0.038344960659742355, "grad_norm_var": 1.9375517406057994e-05, "learning_rate": 0.009640024630266104, "loss": 2.6056, "step": 3837 }, { "crossentropy": 2.698268175125122, "epoch": 0.13913863109048724, "grad_norm": 0.04001322016119957, "grad_norm_var": 1.8178375343421333e-05, "learning_rate": 0.009639808104286116, "loss": 2.7072, "step": 3838 }, { "crossentropy": 2.604020118713379, "epoch": 0.13917488399071926, "grad_norm": 0.04045857489109039, "grad_norm_var": 1.6292706203711177e-05, "learning_rate": 0.00963959151563848, "loss": 2.6274, "step": 3839 }, { "crossentropy": 2.5943496227264404, "epoch": 0.13921113689095127, "grad_norm": 0.04100625589489937, "grad_norm_var": 1.6254555999874417e-05, "learning_rate": 0.009639374864326119, "loss": 2.6534, "step": 3840 }, { "crossentropy": 2.4423768520355225, "epoch": 0.13924738979118328, "grad_norm": 0.038088008761405945, "grad_norm_var": 1.124431535479044e-05, "learning_rate": 0.009639158150351958, "loss": 2.5822, "step": 3841 }, { "crossentropy": 2.6647281646728516, "epoch": 0.13928364269141533, "grad_norm": 0.04047759994864464, "grad_norm_var": 9.941228065442574e-06, "learning_rate": 0.009638941373718927, "loss": 2.6976, "step": 3842 }, { "crossentropy": 2.7455358505249023, "epoch": 0.13931989559164734, "grad_norm": 0.03974420949816704, "grad_norm_var": 8.19250640995365e-06, "learning_rate": 0.009638724534429955, "loss": 2.819, "step": 3843 }, { "crossentropy": 2.6632914543151855, "epoch": 0.13935614849187936, "grad_norm": 0.03861960023641586, "grad_norm_var": 8.140562013033275e-06, "learning_rate": 0.009638507632487966, "loss": 2.6752, "step": 3844 }, { "crossentropy": 2.6203296184539795, "epoch": 0.13939240139211137, "grad_norm": 0.037736281752586365, "grad_norm_var": 6.840583267327984e-06, "learning_rate": 0.009638290667895892, "loss": 2.6467, "step": 3845 }, { "crossentropy": 2.7774441242218018, "epoch": 0.13942865429234338, "grad_norm": 0.04050092399120331, "grad_norm_var": 6.738509797591524e-06, "learning_rate": 0.009638073640656665, "loss": 2.7699, "step": 3846 }, { "crossentropy": 2.7834951877593994, "epoch": 0.1394649071925754, "grad_norm": 0.05096762627363205, "grad_norm_var": 1.2743130874540095e-05, "learning_rate": 0.009637856550773217, "loss": 2.723, "step": 3847 }, { "crossentropy": 2.609048843383789, "epoch": 0.1395011600928074, "grad_norm": 0.045682113617658615, "grad_norm_var": 1.3842343413620478e-05, "learning_rate": 0.009637639398248474, "loss": 2.6685, "step": 3848 }, { "crossentropy": 2.6523969173431396, "epoch": 0.13953741299303946, "grad_norm": 0.043216951191425323, "grad_norm_var": 1.4037808969913357e-05, "learning_rate": 0.009637422183085376, "loss": 2.6628, "step": 3849 }, { "crossentropy": 2.666867971420288, "epoch": 0.13957366589327147, "grad_norm": 0.04600990563631058, "grad_norm_var": 1.517989713838222e-05, "learning_rate": 0.009637204905286854, "loss": 2.7685, "step": 3850 }, { "crossentropy": 2.7622621059417725, "epoch": 0.13960991879350348, "grad_norm": 0.04840485751628876, "grad_norm_var": 1.588430952615908e-05, "learning_rate": 0.00963698756485584, "loss": 2.7032, "step": 3851 }, { "crossentropy": 2.751265287399292, "epoch": 0.1396461716937355, "grad_norm": 0.053299617022275925, "grad_norm_var": 2.3232674505360186e-05, "learning_rate": 0.009636770161795274, "loss": 2.8616, "step": 3852 }, { "crossentropy": 2.6531026363372803, "epoch": 0.1396824245939675, "grad_norm": 0.0475764237344265, "grad_norm_var": 2.32468783854781e-05, "learning_rate": 0.00963655269610809, "loss": 2.7255, "step": 3853 }, { "crossentropy": 2.574059009552002, "epoch": 0.13971867749419953, "grad_norm": 0.04240436479449272, "grad_norm_var": 2.2576220843114648e-05, "learning_rate": 0.009636335167797225, "loss": 2.638, "step": 3854 }, { "crossentropy": 2.8310728073120117, "epoch": 0.13975493039443154, "grad_norm": 0.045480743050575256, "grad_norm_var": 2.2191612520491323e-05, "learning_rate": 0.009636117576865619, "loss": 2.843, "step": 3855 }, { "crossentropy": 2.70576548576355, "epoch": 0.13979118329466358, "grad_norm": 0.04820660501718521, "grad_norm_var": 2.284487798418685e-05, "learning_rate": 0.00963589992331621, "loss": 2.7327, "step": 3856 }, { "crossentropy": 2.784364700317383, "epoch": 0.1398274361948956, "grad_norm": 0.04831547290086746, "grad_norm_var": 2.111458615650074e-05, "learning_rate": 0.009635682207151934, "loss": 2.7243, "step": 3857 }, { "crossentropy": 2.6701128482818604, "epoch": 0.1398636890951276, "grad_norm": 0.04741010442376137, "grad_norm_var": 2.013202418951384e-05, "learning_rate": 0.009635464428375737, "loss": 2.6792, "step": 3858 }, { "crossentropy": 2.780184268951416, "epoch": 0.13989994199535963, "grad_norm": 0.041517142206430435, "grad_norm_var": 1.9033227670772526e-05, "learning_rate": 0.009635246586990559, "loss": 2.8225, "step": 3859 }, { "crossentropy": 2.5971248149871826, "epoch": 0.13993619489559164, "grad_norm": 0.039835814386606216, "grad_norm_var": 1.8036808517582435e-05, "learning_rate": 0.00963502868299934, "loss": 2.6237, "step": 3860 }, { "crossentropy": 2.7439136505126953, "epoch": 0.13997244779582366, "grad_norm": 0.046181656420230865, "grad_norm_var": 1.3853242308075181e-05, "learning_rate": 0.009634810716405026, "loss": 2.7807, "step": 3861 }, { "crossentropy": 2.754786252975464, "epoch": 0.14000870069605567, "grad_norm": 0.04318440705537796, "grad_norm_var": 1.2357884444877803e-05, "learning_rate": 0.00963459268721056, "loss": 2.7283, "step": 3862 }, { "crossentropy": 2.7396700382232666, "epoch": 0.1400449535962877, "grad_norm": 0.043006639927625656, "grad_norm_var": 1.1158374932170265e-05, "learning_rate": 0.009634374595418887, "loss": 2.7503, "step": 3863 }, { "crossentropy": 2.730910301208496, "epoch": 0.14008120649651973, "grad_norm": 0.04208528622984886, "grad_norm_var": 1.193154914011114e-05, "learning_rate": 0.009634156441032951, "loss": 2.7556, "step": 3864 }, { "crossentropy": 2.706662893295288, "epoch": 0.14011745939675174, "grad_norm": 0.04007643833756447, "grad_norm_var": 1.3455185147175954e-05, "learning_rate": 0.009633938224055697, "loss": 2.6467, "step": 3865 }, { "crossentropy": 2.784412145614624, "epoch": 0.14015371229698376, "grad_norm": 0.03838438540697098, "grad_norm_var": 1.625301331490362e-05, "learning_rate": 0.00963371994449008, "loss": 2.6803, "step": 3866 }, { "crossentropy": 2.7365384101867676, "epoch": 0.14018996519721577, "grad_norm": 0.03797938674688339, "grad_norm_var": 1.791094716926487e-05, "learning_rate": 0.00963350160233904, "loss": 2.6727, "step": 3867 }, { "crossentropy": 2.605541229248047, "epoch": 0.14022621809744779, "grad_norm": 0.03881466016173363, "grad_norm_var": 1.3177720962758745e-05, "learning_rate": 0.009633283197605532, "loss": 2.6085, "step": 3868 }, { "crossentropy": 2.5523009300231934, "epoch": 0.14026247099767983, "grad_norm": 0.036025770008563995, "grad_norm_var": 1.4704972846672076e-05, "learning_rate": 0.0096330647302925, "loss": 2.6364, "step": 3869 }, { "crossentropy": 2.7082135677337646, "epoch": 0.14029872389791184, "grad_norm": 0.05511165037751198, "grad_norm_var": 2.4750675266589175e-05, "learning_rate": 0.009632846200402901, "loss": 2.6467, "step": 3870 }, { "crossentropy": 2.9444596767425537, "epoch": 0.14033497679814386, "grad_norm": 0.040391016751527786, "grad_norm_var": 2.4839627611120535e-05, "learning_rate": 0.00963262760793968, "loss": 2.8547, "step": 3871 }, { "crossentropy": 2.601008892059326, "epoch": 0.14037122969837587, "grad_norm": 0.03544680401682854, "grad_norm_var": 2.6000691198061614e-05, "learning_rate": 0.009632408952905796, "loss": 2.6273, "step": 3872 }, { "crossentropy": 2.675217390060425, "epoch": 0.14040748259860789, "grad_norm": 0.04155348613858223, "grad_norm_var": 2.3264000746019548e-05, "learning_rate": 0.009632190235304197, "loss": 2.7632, "step": 3873 }, { "crossentropy": 2.586639881134033, "epoch": 0.1404437354988399, "grad_norm": 0.043340664356946945, "grad_norm_var": 2.1194140362320624e-05, "learning_rate": 0.00963197145513784, "loss": 2.6902, "step": 3874 }, { "crossentropy": 2.7779643535614014, "epoch": 0.14047998839907191, "grad_norm": 0.044352222234010696, "grad_norm_var": 2.1728131630541156e-05, "learning_rate": 0.009631752612409679, "loss": 2.7815, "step": 3875 }, { "crossentropy": 2.6421451568603516, "epoch": 0.14051624129930396, "grad_norm": 0.042184341698884964, "grad_norm_var": 2.1517090940086947e-05, "learning_rate": 0.00963153370712267, "loss": 2.7013, "step": 3876 }, { "crossentropy": 2.6630806922912598, "epoch": 0.14055249419953597, "grad_norm": 0.040973879396915436, "grad_norm_var": 2.0140095709247385e-05, "learning_rate": 0.009631314739279771, "loss": 2.6842, "step": 3877 }, { "crossentropy": 2.7215189933776855, "epoch": 0.14058874709976799, "grad_norm": 0.040872011333703995, "grad_norm_var": 1.993397441752582e-05, "learning_rate": 0.009631095708883937, "loss": 2.7574, "step": 3878 }, { "crossentropy": 2.53806471824646, "epoch": 0.140625, "grad_norm": 0.04261403903365135, "grad_norm_var": 1.9853611994981545e-05, "learning_rate": 0.009630876615938129, "loss": 2.5355, "step": 3879 }, { "crossentropy": 2.6656851768493652, "epoch": 0.14066125290023201, "grad_norm": 0.044980984181165695, "grad_norm_var": 2.0695204908467944e-05, "learning_rate": 0.009630657460445302, "loss": 2.6971, "step": 3880 }, { "crossentropy": 2.615068197250366, "epoch": 0.14069750580046403, "grad_norm": 0.04651989787817001, "grad_norm_var": 2.2115301496602028e-05, "learning_rate": 0.009630438242408421, "loss": 2.6927, "step": 3881 }, { "crossentropy": 2.8302834033966064, "epoch": 0.14073375870069604, "grad_norm": 0.05144418403506279, "grad_norm_var": 2.674646460344802e-05, "learning_rate": 0.009630218961830443, "loss": 2.8251, "step": 3882 }, { "crossentropy": 2.9207632541656494, "epoch": 0.14077001160092809, "grad_norm": 0.05502944812178612, "grad_norm_var": 3.4268475688637756e-05, "learning_rate": 0.009629999618714332, "loss": 2.8298, "step": 3883 }, { "crossentropy": 2.783942461013794, "epoch": 0.1408062645011601, "grad_norm": 0.044289056211709976, "grad_norm_var": 3.2554874454173745e-05, "learning_rate": 0.00962978021306305, "loss": 2.7095, "step": 3884 }, { "crossentropy": 2.578517436981201, "epoch": 0.14084251740139211, "grad_norm": 0.038190245628356934, "grad_norm_var": 3.052597494064922e-05, "learning_rate": 0.009629560744879561, "loss": 2.7187, "step": 3885 }, { "crossentropy": 2.6776211261749268, "epoch": 0.14087877030162413, "grad_norm": 0.04325830563902855, "grad_norm_var": 2.2071340815043155e-05, "learning_rate": 0.009629341214166826, "loss": 2.6605, "step": 3886 }, { "crossentropy": 2.625288963317871, "epoch": 0.14091502320185614, "grad_norm": 0.04154813289642334, "grad_norm_var": 2.168075676613641e-05, "learning_rate": 0.009629121620927817, "loss": 2.675, "step": 3887 }, { "crossentropy": 2.627020835876465, "epoch": 0.14095127610208816, "grad_norm": 0.03744468465447426, "grad_norm_var": 1.9775032805705953e-05, "learning_rate": 0.009628901965165491, "loss": 2.6467, "step": 3888 }, { "crossentropy": 2.8235814571380615, "epoch": 0.14098752900232017, "grad_norm": 0.03666234016418457, "grad_norm_var": 2.264545888535496e-05, "learning_rate": 0.009628682246882823, "loss": 2.7818, "step": 3889 }, { "crossentropy": 2.704927682876587, "epoch": 0.14102378190255221, "grad_norm": 0.03807001933455467, "grad_norm_var": 2.4392837824623443e-05, "learning_rate": 0.009628462466082774, "loss": 2.7786, "step": 3890 }, { "crossentropy": 2.5589351654052734, "epoch": 0.14106003480278423, "grad_norm": 0.0419682040810585, "grad_norm_var": 2.432684766266735e-05, "learning_rate": 0.009628242622768316, "loss": 2.6196, "step": 3891 }, { "crossentropy": 2.5534160137176514, "epoch": 0.14109628770301624, "grad_norm": 0.04704504832625389, "grad_norm_var": 2.5353874167329214e-05, "learning_rate": 0.00962802271694242, "loss": 2.7258, "step": 3892 }, { "crossentropy": 2.678269863128662, "epoch": 0.14113254060324826, "grad_norm": 0.04527152329683304, "grad_norm_var": 2.5242992042467708e-05, "learning_rate": 0.009627802748608052, "loss": 2.6973, "step": 3893 }, { "crossentropy": 2.7315657138824463, "epoch": 0.14116879350348027, "grad_norm": 0.03766952455043793, "grad_norm_var": 2.6985000589919464e-05, "learning_rate": 0.009627582717768186, "loss": 2.6923, "step": 3894 }, { "crossentropy": 2.7047629356384277, "epoch": 0.1412050464037123, "grad_norm": 0.04119709134101868, "grad_norm_var": 2.723070075081486e-05, "learning_rate": 0.009627362624425791, "loss": 2.7235, "step": 3895 }, { "crossentropy": 2.8636910915374756, "epoch": 0.14124129930394433, "grad_norm": 0.041557520627975464, "grad_norm_var": 2.7132815942337794e-05, "learning_rate": 0.009627142468583842, "loss": 2.7422, "step": 3896 }, { "crossentropy": 2.7044856548309326, "epoch": 0.14127755220417634, "grad_norm": 0.04576026275753975, "grad_norm_var": 2.6807085188462522e-05, "learning_rate": 0.009626922250245314, "loss": 2.7526, "step": 3897 }, { "crossentropy": 2.668210029602051, "epoch": 0.14131380510440836, "grad_norm": 0.0480988584458828, "grad_norm_var": 2.3695614278623535e-05, "learning_rate": 0.009626701969413178, "loss": 2.6674, "step": 3898 }, { "crossentropy": 2.7157130241394043, "epoch": 0.14135005800464037, "grad_norm": 0.04346252977848053, "grad_norm_var": 1.3029082566544796e-05, "learning_rate": 0.009626481626090411, "loss": 2.6694, "step": 3899 }, { "crossentropy": 2.6887166500091553, "epoch": 0.1413863109048724, "grad_norm": 0.038975536823272705, "grad_norm_var": 1.314951542115301e-05, "learning_rate": 0.009626261220279987, "loss": 2.6818, "step": 3900 }, { "crossentropy": 2.756377935409546, "epoch": 0.1414225638051044, "grad_norm": 0.03987356647849083, "grad_norm_var": 1.2553185095718193e-05, "learning_rate": 0.009626040751984888, "loss": 2.7264, "step": 3901 }, { "crossentropy": 2.645491600036621, "epoch": 0.14145881670533642, "grad_norm": 0.03942965716123581, "grad_norm_var": 1.2695008526579973e-05, "learning_rate": 0.009625820221208085, "loss": 2.6473, "step": 3902 }, { "crossentropy": 2.9342997074127197, "epoch": 0.14149506960556846, "grad_norm": 0.042055074125528336, "grad_norm_var": 1.2714178022676416e-05, "learning_rate": 0.009625599627952562, "loss": 2.8133, "step": 3903 }, { "crossentropy": 2.792609453201294, "epoch": 0.14153132250580047, "grad_norm": 0.04288110136985779, "grad_norm_var": 1.1597295107121375e-05, "learning_rate": 0.009625378972221295, "loss": 2.7353, "step": 3904 }, { "crossentropy": 2.884276866912842, "epoch": 0.1415675754060325, "grad_norm": 0.04471362754702568, "grad_norm_var": 1.0054416126324163e-05, "learning_rate": 0.009625158254017267, "loss": 2.7731, "step": 3905 }, { "crossentropy": 2.754005193710327, "epoch": 0.1416038283062645, "grad_norm": 0.048249151557683945, "grad_norm_var": 1.0685068927023382e-05, "learning_rate": 0.009624937473343458, "loss": 2.7624, "step": 3906 }, { "crossentropy": 2.6472339630126953, "epoch": 0.14164008120649652, "grad_norm": 0.041756708174943924, "grad_norm_var": 1.0717327749467053e-05, "learning_rate": 0.009624716630202848, "loss": 2.66, "step": 3907 }, { "crossentropy": 2.775373697280884, "epoch": 0.14167633410672853, "grad_norm": 0.040455956012010574, "grad_norm_var": 9.876900088926869e-06, "learning_rate": 0.009624495724598423, "loss": 2.8309, "step": 3908 }, { "crossentropy": 2.6009111404418945, "epoch": 0.14171258700696054, "grad_norm": 0.03801124170422554, "grad_norm_var": 1.0573610580319876e-05, "learning_rate": 0.009624274756533166, "loss": 2.6805, "step": 3909 }, { "crossentropy": 2.801525592803955, "epoch": 0.1417488399071926, "grad_norm": 0.03647011145949364, "grad_norm_var": 1.1377523339829953e-05, "learning_rate": 0.00962405372601006, "loss": 2.7072, "step": 3910 }, { "crossentropy": 2.7469475269317627, "epoch": 0.1417850928074246, "grad_norm": 0.038009658455848694, "grad_norm_var": 1.2378915956464075e-05, "learning_rate": 0.00962383263303209, "loss": 2.7277, "step": 3911 }, { "crossentropy": 2.6844980716705322, "epoch": 0.14182134570765662, "grad_norm": 0.036642611026763916, "grad_norm_var": 1.4086931187815704e-05, "learning_rate": 0.009623611477602245, "loss": 2.6718, "step": 3912 }, { "crossentropy": 2.8224871158599854, "epoch": 0.14185759860788863, "grad_norm": 0.03690690919756889, "grad_norm_var": 1.4019173555651632e-05, "learning_rate": 0.00962339025972351, "loss": 2.7675, "step": 3913 }, { "crossentropy": 2.7162609100341797, "epoch": 0.14189385150812064, "grad_norm": 0.0371890515089035, "grad_norm_var": 1.1131176394695887e-05, "learning_rate": 0.009623168979398874, "loss": 2.6226, "step": 3914 }, { "crossentropy": 2.5898196697235107, "epoch": 0.14193010440835266, "grad_norm": 0.03673495724797249, "grad_norm_var": 1.1138958508864067e-05, "learning_rate": 0.009622947636631326, "loss": 2.7077, "step": 3915 }, { "crossentropy": 2.803788900375366, "epoch": 0.14196635730858467, "grad_norm": 0.040538448840379715, "grad_norm_var": 1.1099566736976798e-05, "learning_rate": 0.00962272623142385, "loss": 2.8078, "step": 3916 }, { "crossentropy": 2.6746749877929688, "epoch": 0.14200261020881672, "grad_norm": 0.03847738355398178, "grad_norm_var": 1.124398022137943e-05, "learning_rate": 0.009622504763779445, "loss": 2.6397, "step": 3917 }, { "crossentropy": 2.642634391784668, "epoch": 0.14203886310904873, "grad_norm": 0.037406664341688156, "grad_norm_var": 1.162867895069062e-05, "learning_rate": 0.009622283233701097, "loss": 2.6548, "step": 3918 }, { "crossentropy": 2.795154094696045, "epoch": 0.14207511600928074, "grad_norm": 0.03788265958428383, "grad_norm_var": 1.1451719171875983e-05, "learning_rate": 0.009622061641191799, "loss": 2.6764, "step": 3919 }, { "crossentropy": 2.74595308303833, "epoch": 0.14211136890951276, "grad_norm": 0.03738453611731529, "grad_norm_var": 1.0877001014249441e-05, "learning_rate": 0.009621839986254545, "loss": 2.7341, "step": 3920 }, { "crossentropy": 2.6205782890319824, "epoch": 0.14214762180974477, "grad_norm": 0.03759021311998367, "grad_norm_var": 8.789676973011174e-06, "learning_rate": 0.009621618268892329, "loss": 2.5796, "step": 3921 }, { "crossentropy": 2.6094157695770264, "epoch": 0.1421838747099768, "grad_norm": 0.03681378811597824, "grad_norm_var": 2.451156295799062e-06, "learning_rate": 0.009621396489108144, "loss": 2.6216, "step": 3922 }, { "crossentropy": 2.751600980758667, "epoch": 0.14222012761020883, "grad_norm": 0.04347416013479233, "grad_norm_var": 3.4918938561884565e-06, "learning_rate": 0.009621174646904986, "loss": 2.6997, "step": 3923 }, { "crossentropy": 2.6727309226989746, "epoch": 0.14225638051044084, "grad_norm": 0.035197339951992035, "grad_norm_var": 3.585351500510342e-06, "learning_rate": 0.009620952742285851, "loss": 2.6496, "step": 3924 }, { "crossentropy": 2.6771810054779053, "epoch": 0.14229263341067286, "grad_norm": 0.036974236369132996, "grad_norm_var": 3.622747702650547e-06, "learning_rate": 0.009620730775253737, "loss": 2.6832, "step": 3925 }, { "crossentropy": 2.7084813117980957, "epoch": 0.14232888631090487, "grad_norm": 0.038114216178655624, "grad_norm_var": 3.5153306650163746e-06, "learning_rate": 0.009620508745811641, "loss": 2.7206, "step": 3926 }, { "crossentropy": 2.6227669715881348, "epoch": 0.1423651392111369, "grad_norm": 0.03920236974954605, "grad_norm_var": 3.632246549889878e-06, "learning_rate": 0.009620286653962563, "loss": 2.7142, "step": 3927 }, { "crossentropy": 2.9557600021362305, "epoch": 0.1424013921113689, "grad_norm": 0.04299397021532059, "grad_norm_var": 5.081808011156835e-06, "learning_rate": 0.009620064499709504, "loss": 2.7849, "step": 3928 }, { "crossentropy": 2.674044609069824, "epoch": 0.14243764501160092, "grad_norm": 0.0486111119389534, "grad_norm_var": 1.1461687336852821e-05, "learning_rate": 0.00961984228305546, "loss": 2.6774, "step": 3929 }, { "crossentropy": 2.6618669033050537, "epoch": 0.14247389791183296, "grad_norm": 0.04220517352223396, "grad_norm_var": 1.179862996175899e-05, "learning_rate": 0.009619620004003437, "loss": 2.7699, "step": 3930 }, { "crossentropy": 2.61692214012146, "epoch": 0.14251015081206497, "grad_norm": 0.0352545827627182, "grad_norm_var": 1.2451780087305297e-05, "learning_rate": 0.009619397662556433, "loss": 2.6108, "step": 3931 }, { "crossentropy": 2.539673328399658, "epoch": 0.142546403712297, "grad_norm": 0.03660232201218605, "grad_norm_var": 1.2747862903363567e-05, "learning_rate": 0.009619175258717456, "loss": 2.6365, "step": 3932 }, { "crossentropy": 2.81592059135437, "epoch": 0.142582656612529, "grad_norm": 0.037746362388134, "grad_norm_var": 1.2833326889707027e-05, "learning_rate": 0.009618952792489506, "loss": 2.7626, "step": 3933 }, { "crossentropy": 2.7527682781219482, "epoch": 0.14261890951276102, "grad_norm": 0.039452582597732544, "grad_norm_var": 1.2669607564585495e-05, "learning_rate": 0.009618730263875589, "loss": 2.8009, "step": 3934 }, { "crossentropy": 2.7271978855133057, "epoch": 0.14265516241299303, "grad_norm": 0.04070857912302017, "grad_norm_var": 1.2712404352863832e-05, "learning_rate": 0.009618507672878711, "loss": 2.778, "step": 3935 }, { "crossentropy": 2.7988812923431396, "epoch": 0.14269141531322505, "grad_norm": 0.039390239864587784, "grad_norm_var": 1.2459515391175882e-05, "learning_rate": 0.009618285019501878, "loss": 2.6599, "step": 3936 }, { "crossentropy": 2.7753407955169678, "epoch": 0.1427276682134571, "grad_norm": 0.04342871159315109, "grad_norm_var": 1.3184506094272389e-05, "learning_rate": 0.009618062303748098, "loss": 2.7498, "step": 3937 }, { "crossentropy": 2.6396336555480957, "epoch": 0.1427639211136891, "grad_norm": 0.041825585067272186, "grad_norm_var": 1.278520539199172e-05, "learning_rate": 0.009617839525620378, "loss": 2.6093, "step": 3938 }, { "crossentropy": 2.82810640335083, "epoch": 0.14280017401392112, "grad_norm": 0.04910995066165924, "grad_norm_var": 1.732546622867532e-05, "learning_rate": 0.009617616685121726, "loss": 2.7692, "step": 3939 }, { "crossentropy": 2.780346155166626, "epoch": 0.14283642691415313, "grad_norm": 0.03881023824214935, "grad_norm_var": 1.5622491767538123e-05, "learning_rate": 0.009617393782255155, "loss": 2.7651, "step": 3940 }, { "crossentropy": 2.637366533279419, "epoch": 0.14287267981438515, "grad_norm": 0.03820454701781273, "grad_norm_var": 1.5113808341659687e-05, "learning_rate": 0.009617170817023672, "loss": 2.6628, "step": 3941 }, { "crossentropy": 2.7524726390838623, "epoch": 0.14290893271461716, "grad_norm": 0.04009837284684181, "grad_norm_var": 1.466816825755838e-05, "learning_rate": 0.009616947789430293, "loss": 2.7023, "step": 3942 }, { "crossentropy": 2.7495839595794678, "epoch": 0.1429451856148492, "grad_norm": 0.03696460649371147, "grad_norm_var": 1.5473576699118892e-05, "learning_rate": 0.009616724699478026, "loss": 2.6606, "step": 3943 }, { "crossentropy": 2.8292527198791504, "epoch": 0.14298143851508122, "grad_norm": 0.038476862013339996, "grad_norm_var": 1.5375018081534992e-05, "learning_rate": 0.009616501547169885, "loss": 2.7235, "step": 3944 }, { "crossentropy": 2.657580852508545, "epoch": 0.14301769141531323, "grad_norm": 0.039789192378520966, "grad_norm_var": 1.0616800627736429e-05, "learning_rate": 0.009616278332508886, "loss": 2.6291, "step": 3945 }, { "crossentropy": 2.6782333850860596, "epoch": 0.14305394431554525, "grad_norm": 0.04133501276373863, "grad_norm_var": 1.0394266715449408e-05, "learning_rate": 0.009616055055498042, "loss": 2.6867, "step": 3946 }, { "crossentropy": 2.7876627445220947, "epoch": 0.14309019721577726, "grad_norm": 0.036934901028871536, "grad_norm_var": 9.54679768504074e-06, "learning_rate": 0.00961583171614037, "loss": 2.7059, "step": 3947 }, { "crossentropy": 2.563232660293579, "epoch": 0.14312645011600927, "grad_norm": 0.0390351302921772, "grad_norm_var": 8.83733297495076e-06, "learning_rate": 0.009615608314438886, "loss": 2.6551, "step": 3948 }, { "crossentropy": 2.6413462162017822, "epoch": 0.1431627030162413, "grad_norm": 0.03791913017630577, "grad_norm_var": 8.785397080618198e-06, "learning_rate": 0.009615384850396605, "loss": 2.667, "step": 3949 }, { "crossentropy": 2.8222219944000244, "epoch": 0.14319895591647333, "grad_norm": 0.036182571202516556, "grad_norm_var": 9.732812197305909e-06, "learning_rate": 0.00961516132401655, "loss": 2.8212, "step": 3950 }, { "crossentropy": 2.6373329162597656, "epoch": 0.14323520881670534, "grad_norm": 0.03689221292734146, "grad_norm_var": 1.0225731233094205e-05, "learning_rate": 0.009614937735301737, "loss": 2.7051, "step": 3951 }, { "crossentropy": 2.4933924674987793, "epoch": 0.14327146171693736, "grad_norm": 0.040289461612701416, "grad_norm_var": 1.0245144953855562e-05, "learning_rate": 0.009614714084255186, "loss": 2.5115, "step": 3952 }, { "crossentropy": 2.788029193878174, "epoch": 0.14330771461716937, "grad_norm": 0.044261384755373, "grad_norm_var": 1.070178255442259e-05, "learning_rate": 0.009614490370879919, "loss": 2.7797, "step": 3953 }, { "crossentropy": 2.9125561714172363, "epoch": 0.1433439675174014, "grad_norm": 0.04516514018177986, "grad_norm_var": 1.2319431464007565e-05, "learning_rate": 0.009614266595178956, "loss": 2.7495, "step": 3954 }, { "crossentropy": 2.8331756591796875, "epoch": 0.1433802204176334, "grad_norm": 0.0412139855325222, "grad_norm_var": 6.590200920919496e-06, "learning_rate": 0.00961404275715532, "loss": 2.8027, "step": 3955 }, { "crossentropy": 2.834782361984253, "epoch": 0.14341647331786542, "grad_norm": 0.03681311756372452, "grad_norm_var": 7.01604268067927e-06, "learning_rate": 0.009613818856812036, "loss": 2.7773, "step": 3956 }, { "crossentropy": 2.6887753009796143, "epoch": 0.14345272621809746, "grad_norm": 0.04263629764318466, "grad_norm_var": 7.567620327813181e-06, "learning_rate": 0.009613594894152125, "loss": 2.5858, "step": 3957 }, { "crossentropy": 2.7451083660125732, "epoch": 0.14348897911832947, "grad_norm": 0.04007063806056976, "grad_norm_var": 7.565919590216998e-06, "learning_rate": 0.009613370869178614, "loss": 2.7744, "step": 3958 }, { "crossentropy": 2.5817694664001465, "epoch": 0.1435252320185615, "grad_norm": 0.04061206802725792, "grad_norm_var": 7.104212382550284e-06, "learning_rate": 0.00961314678189453, "loss": 2.7341, "step": 3959 }, { "crossentropy": 2.678190231323242, "epoch": 0.1435614849187935, "grad_norm": 0.04388010874390602, "grad_norm_var": 7.938430295656209e-06, "learning_rate": 0.009612922632302897, "loss": 2.6569, "step": 3960 }, { "crossentropy": 2.7591309547424316, "epoch": 0.14359773781902552, "grad_norm": 0.04563993960618973, "grad_norm_var": 9.765683470086318e-06, "learning_rate": 0.009612698420406742, "loss": 2.751, "step": 3961 }, { "crossentropy": 2.6701583862304688, "epoch": 0.14363399071925753, "grad_norm": 0.03932667151093483, "grad_norm_var": 9.808920635440144e-06, "learning_rate": 0.009612474146209096, "loss": 2.6623, "step": 3962 }, { "crossentropy": 2.6140599250793457, "epoch": 0.14367024361948955, "grad_norm": 0.038269102573394775, "grad_norm_var": 9.298501478897878e-06, "learning_rate": 0.009612249809712987, "loss": 2.6529, "step": 3963 }, { "crossentropy": 2.665402412414551, "epoch": 0.1437064965197216, "grad_norm": 0.04409204050898552, "grad_norm_var": 9.900356007037404e-06, "learning_rate": 0.009612025410921446, "loss": 2.7229, "step": 3964 }, { "crossentropy": 2.8799798488616943, "epoch": 0.1437427494199536, "grad_norm": 0.038265909999608994, "grad_norm_var": 9.773327843830687e-06, "learning_rate": 0.009611800949837502, "loss": 2.8027, "step": 3965 }, { "crossentropy": 2.843454122543335, "epoch": 0.14377900232018562, "grad_norm": 0.039439406245946884, "grad_norm_var": 8.4091686043836e-06, "learning_rate": 0.009611576426464187, "loss": 2.7113, "step": 3966 }, { "crossentropy": 2.6503114700317383, "epoch": 0.14381525522041763, "grad_norm": 0.03990502655506134, "grad_norm_var": 7.304571292653973e-06, "learning_rate": 0.009611351840804534, "loss": 2.6751, "step": 3967 }, { "crossentropy": 2.6546714305877686, "epoch": 0.14385150812064965, "grad_norm": 0.051107607781887054, "grad_norm_var": 1.3244381091316534e-05, "learning_rate": 0.009611127192861578, "loss": 2.6786, "step": 3968 }, { "crossentropy": 2.781959056854248, "epoch": 0.14388776102088166, "grad_norm": 0.05172746255993843, "grad_norm_var": 1.906041179617294e-05, "learning_rate": 0.009610902482638349, "loss": 2.7981, "step": 3969 }, { "crossentropy": 2.6131365299224854, "epoch": 0.1439240139211137, "grad_norm": 0.043510060757398605, "grad_norm_var": 1.861816596576225e-05, "learning_rate": 0.009610677710137886, "loss": 2.6837, "step": 3970 }, { "crossentropy": 2.673224449157715, "epoch": 0.14396026682134572, "grad_norm": 0.042128536850214005, "grad_norm_var": 1.8540226833453576e-05, "learning_rate": 0.009610452875363223, "loss": 2.7056, "step": 3971 }, { "crossentropy": 2.744168519973755, "epoch": 0.14399651972157773, "grad_norm": 0.04231613129377365, "grad_norm_var": 1.6378390314694622e-05, "learning_rate": 0.009610227978317398, "loss": 2.7335, "step": 3972 }, { "crossentropy": 2.741888999938965, "epoch": 0.14403277262180975, "grad_norm": 0.041200656443834305, "grad_norm_var": 1.6516134768989933e-05, "learning_rate": 0.009610003019003449, "loss": 2.7145, "step": 3973 }, { "crossentropy": 2.731846332550049, "epoch": 0.14406902552204176, "grad_norm": 0.03861221298575401, "grad_norm_var": 1.7139603547481652e-05, "learning_rate": 0.009609777997424412, "loss": 2.7241, "step": 3974 }, { "crossentropy": 2.8777596950531006, "epoch": 0.14410527842227377, "grad_norm": 0.03935796022415161, "grad_norm_var": 1.7553936336979722e-05, "learning_rate": 0.009609552913583327, "loss": 2.8002, "step": 3975 }, { "crossentropy": 2.7750797271728516, "epoch": 0.1441415313225058, "grad_norm": 0.042225636541843414, "grad_norm_var": 1.7403732820504778e-05, "learning_rate": 0.009609327767483236, "loss": 2.7218, "step": 3976 }, { "crossentropy": 2.7740888595581055, "epoch": 0.14417778422273783, "grad_norm": 0.03832784667611122, "grad_norm_var": 1.7508906657431646e-05, "learning_rate": 0.009609102559127178, "loss": 2.7569, "step": 3977 }, { "crossentropy": 2.75974702835083, "epoch": 0.14421403712296985, "grad_norm": 0.0374237596988678, "grad_norm_var": 1.8378812705686398e-05, "learning_rate": 0.009608877288518194, "loss": 2.7501, "step": 3978 }, { "crossentropy": 2.8672127723693848, "epoch": 0.14425029002320186, "grad_norm": 0.04171222448348999, "grad_norm_var": 1.7524335947495934e-05, "learning_rate": 0.00960865195565933, "loss": 2.782, "step": 3979 }, { "crossentropy": 2.5663039684295654, "epoch": 0.14428654292343387, "grad_norm": 0.0415356419980526, "grad_norm_var": 1.720591138132706e-05, "learning_rate": 0.009608426560553628, "loss": 2.604, "step": 3980 }, { "crossentropy": 2.703253746032715, "epoch": 0.1443227958236659, "grad_norm": 0.04021082818508148, "grad_norm_var": 1.6525925281205694e-05, "learning_rate": 0.00960820110320413, "loss": 2.6431, "step": 3981 }, { "crossentropy": 2.6762869358062744, "epoch": 0.1443590487238979, "grad_norm": 0.03827385976910591, "grad_norm_var": 1.6996535048115677e-05, "learning_rate": 0.009607975583613882, "loss": 2.7345, "step": 3982 }, { "crossentropy": 2.631680488586426, "epoch": 0.14439530162412992, "grad_norm": 0.04030117392539978, "grad_norm_var": 1.690369156478374e-05, "learning_rate": 0.009607750001785932, "loss": 2.6948, "step": 3983 }, { "crossentropy": 2.85001277923584, "epoch": 0.14443155452436196, "grad_norm": 0.041463177651166916, "grad_norm_var": 1.0842416507622575e-05, "learning_rate": 0.009607524357723328, "loss": 2.7619, "step": 3984 }, { "crossentropy": 2.775754451751709, "epoch": 0.14446780742459397, "grad_norm": 0.03917652368545532, "grad_norm_var": 3.188415812348707e-06, "learning_rate": 0.009607298651429112, "loss": 2.7348, "step": 3985 }, { "crossentropy": 2.806067705154419, "epoch": 0.144504060324826, "grad_norm": 0.040809039026498795, "grad_norm_var": 2.555317072220481e-06, "learning_rate": 0.009607072882906338, "loss": 2.7009, "step": 3986 }, { "crossentropy": 2.7273576259613037, "epoch": 0.144540313225058, "grad_norm": 0.042741626501083374, "grad_norm_var": 2.7268777067776875e-06, "learning_rate": 0.009606847052158052, "loss": 2.7046, "step": 3987 }, { "crossentropy": 2.694012403488159, "epoch": 0.14457656612529002, "grad_norm": 0.04177265241742134, "grad_norm_var": 2.6032647495862388e-06, "learning_rate": 0.009606621159187307, "loss": 2.6399, "step": 3988 }, { "crossentropy": 2.708970069885254, "epoch": 0.14461281902552203, "grad_norm": 0.04086197912693024, "grad_norm_var": 2.5707359137233076e-06, "learning_rate": 0.009606395203997151, "loss": 2.7151, "step": 3989 }, { "crossentropy": 2.5502235889434814, "epoch": 0.14464907192575405, "grad_norm": 0.038954202085733414, "grad_norm_var": 2.5010675561302e-06, "learning_rate": 0.009606169186590638, "loss": 2.6654, "step": 3990 }, { "crossentropy": 2.7730469703674316, "epoch": 0.1446853248259861, "grad_norm": 0.03797342628240585, "grad_norm_var": 2.79879742313225e-06, "learning_rate": 0.00960594310697082, "loss": 2.7329, "step": 3991 }, { "crossentropy": 2.5153234004974365, "epoch": 0.1447215777262181, "grad_norm": 0.03745065629482269, "grad_norm_var": 2.9566011787560477e-06, "learning_rate": 0.009605716965140749, "loss": 2.6554, "step": 3992 }, { "crossentropy": 2.6908533573150635, "epoch": 0.14475783062645012, "grad_norm": 0.03704049438238144, "grad_norm_var": 3.3363509366536594e-06, "learning_rate": 0.009605490761103484, "loss": 2.7016, "step": 3993 }, { "crossentropy": 2.597292423248291, "epoch": 0.14479408352668213, "grad_norm": 0.04693096876144409, "grad_norm_var": 5.901947172674143e-06, "learning_rate": 0.009605264494862074, "loss": 2.6564, "step": 3994 }, { "crossentropy": 2.6891863346099854, "epoch": 0.14483033642691415, "grad_norm": 0.053378716111183167, "grad_norm_var": 1.6371243298255596e-05, "learning_rate": 0.00960503816641958, "loss": 2.6363, "step": 3995 }, { "crossentropy": 2.7426624298095703, "epoch": 0.14486658932714616, "grad_norm": 0.053386762738227844, "grad_norm_var": 2.5711774358722016e-05, "learning_rate": 0.009604811775779057, "loss": 2.7261, "step": 3996 }, { "crossentropy": 2.7129125595092773, "epoch": 0.1449028422273782, "grad_norm": 0.04372069984674454, "grad_norm_var": 2.5681683102281246e-05, "learning_rate": 0.009604585322943561, "loss": 2.7529, "step": 3997 }, { "crossentropy": 2.6699090003967285, "epoch": 0.14493909512761022, "grad_norm": 0.041988346725702286, "grad_norm_var": 2.4629382872908407e-05, "learning_rate": 0.009604358807916156, "loss": 2.6684, "step": 3998 }, { "crossentropy": 2.7820093631744385, "epoch": 0.14497534802784223, "grad_norm": 0.04800612851977348, "grad_norm_var": 2.6212461894471275e-05, "learning_rate": 0.009604132230699896, "loss": 2.7512, "step": 3999 }, { "crossentropy": 2.602407217025757, "epoch": 0.14501160092807425, "grad_norm": 0.04957974702119827, "grad_norm_var": 2.8825301407391032e-05, "learning_rate": 0.009603905591297843, "loss": 2.6987, "step": 4000 }, { "crossentropy": 2.804447889328003, "epoch": 0.14504785382830626, "grad_norm": 0.03916850686073303, "grad_norm_var": 2.8829777983392587e-05, "learning_rate": 0.00960367888971306, "loss": 2.7659, "step": 4001 }, { "crossentropy": 2.6794939041137695, "epoch": 0.14508410672853828, "grad_norm": 0.03802219405770302, "grad_norm_var": 3.026316070208242e-05, "learning_rate": 0.009603452125948606, "loss": 2.7451, "step": 4002 }, { "crossentropy": 2.776791572570801, "epoch": 0.1451203596287703, "grad_norm": 0.042226679623126984, "grad_norm_var": 3.0310249098014903e-05, "learning_rate": 0.009603225300007545, "loss": 2.7352, "step": 4003 }, { "crossentropy": 2.747706174850464, "epoch": 0.14515661252900233, "grad_norm": 0.037606775760650635, "grad_norm_var": 3.216211309079083e-05, "learning_rate": 0.00960299841189294, "loss": 2.6863, "step": 4004 }, { "crossentropy": 2.8018434047698975, "epoch": 0.14519286542923435, "grad_norm": 0.03778928890824318, "grad_norm_var": 3.3584507350430055e-05, "learning_rate": 0.009602771461607857, "loss": 2.7357, "step": 4005 }, { "crossentropy": 2.7022221088409424, "epoch": 0.14522911832946636, "grad_norm": 0.0367787703871727, "grad_norm_var": 3.49672135209383e-05, "learning_rate": 0.009602544449155362, "loss": 2.6538, "step": 4006 }, { "crossentropy": 2.7462539672851562, "epoch": 0.14526537122969838, "grad_norm": 0.03606698289513588, "grad_norm_var": 3.636164447478185e-05, "learning_rate": 0.009602317374538516, "loss": 2.7236, "step": 4007 }, { "crossentropy": 2.849745273590088, "epoch": 0.1453016241299304, "grad_norm": 0.036130063235759735, "grad_norm_var": 3.73502807842355e-05, "learning_rate": 0.009602090237760392, "loss": 2.8142, "step": 4008 }, { "crossentropy": 2.690622329711914, "epoch": 0.1453378770301624, "grad_norm": 0.0391739085316658, "grad_norm_var": 3.612049891708457e-05, "learning_rate": 0.009601863038824055, "loss": 2.623, "step": 4009 }, { "crossentropy": 2.7908732891082764, "epoch": 0.14537412993039442, "grad_norm": 0.03841811418533325, "grad_norm_var": 3.561720825120448e-05, "learning_rate": 0.009601635777732574, "loss": 2.6989, "step": 4010 }, { "crossentropy": 2.6917386054992676, "epoch": 0.14541038283062646, "grad_norm": 0.03782524913549423, "grad_norm_var": 2.7067108421947815e-05, "learning_rate": 0.00960140845448902, "loss": 2.632, "step": 4011 }, { "crossentropy": 2.792171001434326, "epoch": 0.14544663573085848, "grad_norm": 0.0416320376098156, "grad_norm_var": 1.627827425724667e-05, "learning_rate": 0.00960118106909646, "loss": 2.705, "step": 4012 }, { "crossentropy": 2.783539295196533, "epoch": 0.1454828886310905, "grad_norm": 0.04472243785858154, "grad_norm_var": 1.6803441581763937e-05, "learning_rate": 0.009600953621557967, "loss": 2.6856, "step": 4013 }, { "crossentropy": 2.735219955444336, "epoch": 0.1455191415313225, "grad_norm": 0.045866336673498154, "grad_norm_var": 1.8605518979103452e-05, "learning_rate": 0.009600726111876614, "loss": 2.7525, "step": 4014 }, { "crossentropy": 2.7217226028442383, "epoch": 0.14555539443155452, "grad_norm": 0.046013977378606796, "grad_norm_var": 1.687660236430281e-05, "learning_rate": 0.009600498540055473, "loss": 2.6837, "step": 4015 }, { "crossentropy": 2.69991135597229, "epoch": 0.14559164733178653, "grad_norm": 0.05013791099190712, "grad_norm_var": 1.757635910121924e-05, "learning_rate": 0.009600270906097616, "loss": 2.7424, "step": 4016 }, { "crossentropy": 2.842717170715332, "epoch": 0.14562790023201855, "grad_norm": 0.041693784296512604, "grad_norm_var": 1.753545921221311e-05, "learning_rate": 0.009600043210006123, "loss": 2.8292, "step": 4017 }, { "crossentropy": 2.82804012298584, "epoch": 0.1456641531322506, "grad_norm": 0.04001884534955025, "grad_norm_var": 1.7089964347426615e-05, "learning_rate": 0.009599815451784063, "loss": 2.7375, "step": 4018 }, { "crossentropy": 2.732491970062256, "epoch": 0.1457004060324826, "grad_norm": 0.035743098706007004, "grad_norm_var": 1.8446175259502714e-05, "learning_rate": 0.009599587631434515, "loss": 2.7245, "step": 4019 }, { "crossentropy": 2.7467753887176514, "epoch": 0.14573665893271462, "grad_norm": 0.036974020302295685, "grad_norm_var": 1.8702730366555268e-05, "learning_rate": 0.009599359748960557, "loss": 2.7602, "step": 4020 }, { "crossentropy": 2.699768304824829, "epoch": 0.14577291183294663, "grad_norm": 0.03790552169084549, "grad_norm_var": 1.866448546298845e-05, "learning_rate": 0.009599131804365264, "loss": 2.6828, "step": 4021 }, { "crossentropy": 2.7215569019317627, "epoch": 0.14580916473317865, "grad_norm": 0.037083711475133896, "grad_norm_var": 1.8526363223399466e-05, "learning_rate": 0.009598903797651718, "loss": 2.7327, "step": 4022 }, { "crossentropy": 2.6569712162017822, "epoch": 0.14584541763341066, "grad_norm": 0.03704632818698883, "grad_norm_var": 1.8028617635873433e-05, "learning_rate": 0.009598675728822996, "loss": 2.6767, "step": 4023 }, { "crossentropy": 2.774595260620117, "epoch": 0.1458816705336427, "grad_norm": 0.04051245003938675, "grad_norm_var": 1.6734483006638523e-05, "learning_rate": 0.00959844759788218, "loss": 2.7053, "step": 4024 }, { "crossentropy": 2.7773892879486084, "epoch": 0.14591792343387472, "grad_norm": 0.04862481355667114, "grad_norm_var": 2.042794329775114e-05, "learning_rate": 0.009598219404832352, "loss": 2.7154, "step": 4025 }, { "crossentropy": 2.7415740489959717, "epoch": 0.14595417633410673, "grad_norm": 0.04879782721400261, "grad_norm_var": 2.3223462730690565e-05, "learning_rate": 0.009597991149676592, "loss": 2.7067, "step": 4026 }, { "crossentropy": 2.7398037910461426, "epoch": 0.14599042923433875, "grad_norm": 0.04322206228971481, "grad_norm_var": 2.2102802532438192e-05, "learning_rate": 0.009597762832417985, "loss": 2.7899, "step": 4027 }, { "crossentropy": 2.8077895641326904, "epoch": 0.14602668213457076, "grad_norm": 0.0403008833527565, "grad_norm_var": 2.2323177546725893e-05, "learning_rate": 0.00959753445305961, "loss": 2.7837, "step": 4028 }, { "crossentropy": 2.703678607940674, "epoch": 0.14606293503480278, "grad_norm": 0.040534477680921555, "grad_norm_var": 2.199214389746832e-05, "learning_rate": 0.00959730601160456, "loss": 2.7591, "step": 4029 }, { "crossentropy": 2.6874380111694336, "epoch": 0.1460991879350348, "grad_norm": 0.04121921584010124, "grad_norm_var": 2.088721612621506e-05, "learning_rate": 0.00959707750805591, "loss": 2.7474, "step": 4030 }, { "crossentropy": 2.7227530479431152, "epoch": 0.14613544083526683, "grad_norm": 0.0378328338265419, "grad_norm_var": 2.0271166944823132e-05, "learning_rate": 0.009596848942416757, "loss": 2.7274, "step": 4031 }, { "crossentropy": 2.8105461597442627, "epoch": 0.14617169373549885, "grad_norm": 0.04193892702460289, "grad_norm_var": 1.4595665489784634e-05, "learning_rate": 0.00959662031469018, "loss": 2.7868, "step": 4032 }, { "crossentropy": 2.712435483932495, "epoch": 0.14620794663573086, "grad_norm": 0.04635896161198616, "grad_norm_var": 1.664214573415442e-05, "learning_rate": 0.009596391624879273, "loss": 2.736, "step": 4033 }, { "crossentropy": 2.7537360191345215, "epoch": 0.14624419953596288, "grad_norm": 0.05669933930039406, "grad_norm_var": 3.211208787620434e-05, "learning_rate": 0.00959616287298712, "loss": 2.6945, "step": 4034 }, { "crossentropy": 2.776169538497925, "epoch": 0.1462804524361949, "grad_norm": 0.048891760408878326, "grad_norm_var": 3.208031961044414e-05, "learning_rate": 0.009595934059016813, "loss": 2.7416, "step": 4035 }, { "crossentropy": 2.7200629711151123, "epoch": 0.1463167053364269, "grad_norm": 0.04153624549508095, "grad_norm_var": 2.9869840904520744e-05, "learning_rate": 0.009595705182971441, "loss": 2.6998, "step": 4036 }, { "crossentropy": 2.7134690284729004, "epoch": 0.14635295823665892, "grad_norm": 0.04107081890106201, "grad_norm_var": 2.8332633230621075e-05, "learning_rate": 0.009595476244854097, "loss": 2.6377, "step": 4037 }, { "crossentropy": 2.6048038005828857, "epoch": 0.14638921113689096, "grad_norm": 0.04359991103410721, "grad_norm_var": 2.5646885147535767e-05, "learning_rate": 0.009595247244667872, "loss": 2.7152, "step": 4038 }, { "crossentropy": 2.739311933517456, "epoch": 0.14642546403712298, "grad_norm": 0.04071316868066788, "grad_norm_var": 2.3265140819114736e-05, "learning_rate": 0.00959501818241586, "loss": 2.6786, "step": 4039 }, { "crossentropy": 2.6802990436553955, "epoch": 0.146461716937355, "grad_norm": 0.03727380558848381, "grad_norm_var": 2.5368757224303085e-05, "learning_rate": 0.009594789058101153, "loss": 2.6525, "step": 4040 }, { "crossentropy": 2.7679696083068848, "epoch": 0.146497969837587, "grad_norm": 0.03787167742848396, "grad_norm_var": 2.5482252620831423e-05, "learning_rate": 0.009594559871726848, "loss": 2.6746, "step": 4041 }, { "crossentropy": 2.7258148193359375, "epoch": 0.14653422273781902, "grad_norm": 0.04126233607530594, "grad_norm_var": 2.319729476713897e-05, "learning_rate": 0.00959433062329604, "loss": 2.6723, "step": 4042 }, { "crossentropy": 2.791619062423706, "epoch": 0.14657047563805103, "grad_norm": 0.04800048843026161, "grad_norm_var": 2.507142408737963e-05, "learning_rate": 0.009594101312811822, "loss": 2.7473, "step": 4043 }, { "crossentropy": 2.79752254486084, "epoch": 0.14660672853828308, "grad_norm": 0.049442797899246216, "grad_norm_var": 2.7225384251206887e-05, "learning_rate": 0.009593871940277297, "loss": 2.7903, "step": 4044 }, { "crossentropy": 2.774400472640991, "epoch": 0.1466429814385151, "grad_norm": 0.04667187109589577, "grad_norm_var": 2.7242534597733623e-05, "learning_rate": 0.00959364250569556, "loss": 2.7442, "step": 4045 }, { "crossentropy": 2.829608917236328, "epoch": 0.1466792343387471, "grad_norm": 0.041827958077192307, "grad_norm_var": 2.7058333567451093e-05, "learning_rate": 0.009593413009069709, "loss": 2.7129, "step": 4046 }, { "crossentropy": 2.642845630645752, "epoch": 0.14671548723897912, "grad_norm": 0.037682514637708664, "grad_norm_var": 2.7179584724982414e-05, "learning_rate": 0.009593183450402843, "loss": 2.6798, "step": 4047 }, { "crossentropy": 2.4595584869384766, "epoch": 0.14675174013921113, "grad_norm": 0.04239173233509064, "grad_norm_var": 2.707987809190352e-05, "learning_rate": 0.009592953829698067, "loss": 2.6252, "step": 4048 }, { "crossentropy": 2.697361707687378, "epoch": 0.14678799303944315, "grad_norm": 0.038716357201337814, "grad_norm_var": 2.815439861159798e-05, "learning_rate": 0.009592724146958479, "loss": 2.7042, "step": 4049 }, { "crossentropy": 2.5674712657928467, "epoch": 0.14682424593967516, "grad_norm": 0.03829621151089668, "grad_norm_var": 1.657374157159926e-05, "learning_rate": 0.00959249440218718, "loss": 2.573, "step": 4050 }, { "crossentropy": 2.763662576675415, "epoch": 0.1468604988399072, "grad_norm": 0.0363384373486042, "grad_norm_var": 1.5227545301479035e-05, "learning_rate": 0.009592264595387274, "loss": 2.7427, "step": 4051 }, { "crossentropy": 2.67714262008667, "epoch": 0.14689675174013922, "grad_norm": 0.036142900586128235, "grad_norm_var": 1.696089851751185e-05, "learning_rate": 0.009592034726561867, "loss": 2.7053, "step": 4052 }, { "crossentropy": 2.712381362915039, "epoch": 0.14693300464037123, "grad_norm": 0.03708244487643242, "grad_norm_var": 1.79607403323824e-05, "learning_rate": 0.009591804795714063, "loss": 2.6733, "step": 4053 }, { "crossentropy": 2.772308111190796, "epoch": 0.14696925754060325, "grad_norm": 0.040630266070365906, "grad_norm_var": 1.7416017700594422e-05, "learning_rate": 0.009591574802846968, "loss": 2.7335, "step": 4054 }, { "crossentropy": 2.5795984268188477, "epoch": 0.14700551044083526, "grad_norm": 0.04325196519494057, "grad_norm_var": 1.7841407969993763e-05, "learning_rate": 0.009591344747963684, "loss": 2.6237, "step": 4055 }, { "crossentropy": 2.802126884460449, "epoch": 0.14704176334106728, "grad_norm": 0.03856896236538887, "grad_norm_var": 1.7336413400674947e-05, "learning_rate": 0.009591114631067323, "loss": 2.7619, "step": 4056 }, { "crossentropy": 2.841392755508423, "epoch": 0.1470780162412993, "grad_norm": 0.03933081775903702, "grad_norm_var": 1.688300341987613e-05, "learning_rate": 0.009590884452160994, "loss": 2.8431, "step": 4057 }, { "crossentropy": 2.761002540588379, "epoch": 0.14711426914153133, "grad_norm": 0.04118409752845764, "grad_norm_var": 1.6880413380612748e-05, "learning_rate": 0.0095906542112478, "loss": 2.7605, "step": 4058 }, { "crossentropy": 2.742319345474243, "epoch": 0.14715052204176335, "grad_norm": 0.044985584914684296, "grad_norm_var": 1.4623350695281736e-05, "learning_rate": 0.009590423908330856, "loss": 2.7361, "step": 4059 }, { "crossentropy": 2.761927843093872, "epoch": 0.14718677494199536, "grad_norm": 0.041311196982860565, "grad_norm_var": 9.368111147478539e-06, "learning_rate": 0.00959019354341327, "loss": 2.6814, "step": 4060 }, { "crossentropy": 2.7084531784057617, "epoch": 0.14722302784222738, "grad_norm": 0.044117629528045654, "grad_norm_var": 7.597600135920285e-06, "learning_rate": 0.009589963116498155, "loss": 2.6844, "step": 4061 }, { "crossentropy": 2.782711982727051, "epoch": 0.1472592807424594, "grad_norm": 0.044468652456998825, "grad_norm_var": 8.63612934446344e-06, "learning_rate": 0.009589732627588624, "loss": 2.6854, "step": 4062 }, { "crossentropy": 2.5762362480163574, "epoch": 0.1472955336426914, "grad_norm": 0.04396647959947586, "grad_norm_var": 8.926773203483892e-06, "learning_rate": 0.009589502076687786, "loss": 2.6633, "step": 4063 }, { "crossentropy": 2.694990634918213, "epoch": 0.14733178654292342, "grad_norm": 0.042627494782209396, "grad_norm_var": 8.984244621212019e-06, "learning_rate": 0.009589271463798758, "loss": 2.6393, "step": 4064 }, { "crossentropy": 2.6467671394348145, "epoch": 0.14736803944315546, "grad_norm": 0.03867127001285553, "grad_norm_var": 8.996228772389404e-06, "learning_rate": 0.009589040788924657, "loss": 2.5813, "step": 4065 }, { "crossentropy": 2.5589771270751953, "epoch": 0.14740429234338748, "grad_norm": 0.04168408736586571, "grad_norm_var": 8.634122481685803e-06, "learning_rate": 0.009588810052068592, "loss": 2.6974, "step": 4066 }, { "crossentropy": 2.667149066925049, "epoch": 0.1474405452436195, "grad_norm": 0.04264703765511513, "grad_norm_var": 7.286564056243651e-06, "learning_rate": 0.009588579253233687, "loss": 2.6703, "step": 4067 }, { "crossentropy": 2.721615791320801, "epoch": 0.1474767981438515, "grad_norm": 0.039734628051519394, "grad_norm_var": 5.626990747555303e-06, "learning_rate": 0.009588348392423056, "loss": 2.7248, "step": 4068 }, { "crossentropy": 2.682469367980957, "epoch": 0.14751305104408352, "grad_norm": 0.04058808460831642, "grad_norm_var": 4.322565599604311e-06, "learning_rate": 0.009588117469639815, "loss": 2.6332, "step": 4069 }, { "crossentropy": 2.725397825241089, "epoch": 0.14754930394431554, "grad_norm": 0.04107257351279259, "grad_norm_var": 4.2696114765658835e-06, "learning_rate": 0.009587886484887086, "loss": 2.7115, "step": 4070 }, { "crossentropy": 2.759291887283325, "epoch": 0.14758555684454758, "grad_norm": 0.040452729910612106, "grad_norm_var": 4.203675112292709e-06, "learning_rate": 0.009587655438167987, "loss": 2.7121, "step": 4071 }, { "crossentropy": 2.6090216636657715, "epoch": 0.1476218097447796, "grad_norm": 0.04572354629635811, "grad_norm_var": 4.522736781314926e-06, "learning_rate": 0.009587424329485639, "loss": 2.6999, "step": 4072 }, { "crossentropy": 2.811616897583008, "epoch": 0.1476580626450116, "grad_norm": 0.04861951246857643, "grad_norm_var": 6.5656600562920735e-06, "learning_rate": 0.009587193158843163, "loss": 2.7603, "step": 4073 }, { "crossentropy": 2.7326548099517822, "epoch": 0.14769431554524362, "grad_norm": 0.041482552886009216, "grad_norm_var": 6.514249553969213e-06, "learning_rate": 0.009586961926243684, "loss": 2.7078, "step": 4074 }, { "crossentropy": 2.6749582290649414, "epoch": 0.14773056844547564, "grad_norm": 0.037956465035676956, "grad_norm_var": 7.398870235614657e-06, "learning_rate": 0.009586730631690321, "loss": 2.6712, "step": 4075 }, { "crossentropy": 2.634037494659424, "epoch": 0.14776682134570765, "grad_norm": 0.0375509187579155, "grad_norm_var": 8.725837203516345e-06, "learning_rate": 0.009586499275186202, "loss": 2.6689, "step": 4076 }, { "crossentropy": 2.736764907836914, "epoch": 0.14780307424593966, "grad_norm": 0.039881739765405655, "grad_norm_var": 8.628791687676866e-06, "learning_rate": 0.00958626785673445, "loss": 2.7782, "step": 4077 }, { "crossentropy": 2.690952777862549, "epoch": 0.1478393271461717, "grad_norm": 0.03580375760793686, "grad_norm_var": 1.0117423888088689e-05, "learning_rate": 0.009586036376338189, "loss": 2.7576, "step": 4078 }, { "crossentropy": 2.8540873527526855, "epoch": 0.14787558004640372, "grad_norm": 0.04830237478017807, "grad_norm_var": 1.2918412504720327e-05, "learning_rate": 0.009585804834000549, "loss": 2.8632, "step": 4079 }, { "crossentropy": 2.7176003456115723, "epoch": 0.14791183294663574, "grad_norm": 0.03675711527466774, "grad_norm_var": 1.4130973796759074e-05, "learning_rate": 0.009585573229724656, "loss": 2.6735, "step": 4080 }, { "crossentropy": 2.399702548980713, "epoch": 0.14794808584686775, "grad_norm": 0.036419980227947235, "grad_norm_var": 1.5164179739531915e-05, "learning_rate": 0.009585341563513636, "loss": 2.4659, "step": 4081 }, { "crossentropy": 2.6309573650360107, "epoch": 0.14798433874709976, "grad_norm": 0.03755227476358414, "grad_norm_var": 1.580875286491141e-05, "learning_rate": 0.00958510983537062, "loss": 2.6324, "step": 4082 }, { "crossentropy": 2.6874160766601562, "epoch": 0.14802059164733178, "grad_norm": 0.037316981703042984, "grad_norm_var": 1.6171556837581377e-05, "learning_rate": 0.009584878045298739, "loss": 2.7334, "step": 4083 }, { "crossentropy": 2.6039090156555176, "epoch": 0.1480568445475638, "grad_norm": 0.03550347313284874, "grad_norm_var": 1.7624071727328818e-05, "learning_rate": 0.00958464619330112, "loss": 2.6185, "step": 4084 }, { "crossentropy": 2.751793146133423, "epoch": 0.14809309744779584, "grad_norm": 0.060876112431287766, "grad_norm_var": 4.47737610619688e-05, "learning_rate": 0.009584414279380898, "loss": 2.7576, "step": 4085 }, { "crossentropy": 2.716042995452881, "epoch": 0.14812935034802785, "grad_norm": 0.0397331677377224, "grad_norm_var": 4.4931771605784726e-05, "learning_rate": 0.009584182303541204, "loss": 2.7758, "step": 4086 }, { "crossentropy": 2.7884318828582764, "epoch": 0.14816560324825986, "grad_norm": 0.03973067179322243, "grad_norm_var": 4.5040708875528314e-05, "learning_rate": 0.00958395026578517, "loss": 2.7459, "step": 4087 }, { "crossentropy": 2.7792394161224365, "epoch": 0.14820185614849188, "grad_norm": 0.04104117304086685, "grad_norm_var": 4.358728864129324e-05, "learning_rate": 0.009583718166115933, "loss": 2.6778, "step": 4088 }, { "crossentropy": 2.7211897373199463, "epoch": 0.1482381090487239, "grad_norm": 0.039295244961977005, "grad_norm_var": 3.943395516196318e-05, "learning_rate": 0.009583486004536625, "loss": 2.6922, "step": 4089 }, { "crossentropy": 2.6016860008239746, "epoch": 0.1482743619489559, "grad_norm": 0.039946410804986954, "grad_norm_var": 3.9344400937551166e-05, "learning_rate": 0.009583253781050386, "loss": 2.6395, "step": 4090 }, { "crossentropy": 2.7743918895721436, "epoch": 0.14831061484918792, "grad_norm": 0.03875003382563591, "grad_norm_var": 3.914327982017865e-05, "learning_rate": 0.009583021495660345, "loss": 2.7193, "step": 4091 }, { "crossentropy": 2.751330852508545, "epoch": 0.14834686774941996, "grad_norm": 0.03728030249476433, "grad_norm_var": 3.9246286185312614e-05, "learning_rate": 0.009582789148369644, "loss": 2.6246, "step": 4092 }, { "crossentropy": 2.9198408126831055, "epoch": 0.14838312064965198, "grad_norm": 0.041357047855854034, "grad_norm_var": 3.9307534001218707e-05, "learning_rate": 0.009582556739181423, "loss": 2.7728, "step": 4093 }, { "crossentropy": 2.714876413345337, "epoch": 0.148419373549884, "grad_norm": 0.03593144938349724, "grad_norm_var": 3.9231080346381564e-05, "learning_rate": 0.009582324268098819, "loss": 2.7105, "step": 4094 }, { "crossentropy": 2.7359719276428223, "epoch": 0.148455626450116, "grad_norm": 0.03702208027243614, "grad_norm_var": 3.524143067847295e-05, "learning_rate": 0.009582091735124972, "loss": 2.597, "step": 4095 }, { "crossentropy": 2.8611364364624023, "epoch": 0.14849187935034802, "grad_norm": 0.035824306309223175, "grad_norm_var": 3.565649757320353e-05, "learning_rate": 0.00958185914026302, "loss": 2.7755, "step": 4096 }, { "crossentropy": 2.807295799255371, "epoch": 0.14852813225058004, "grad_norm": 0.04179157316684723, "grad_norm_var": 3.518316712543771e-05, "learning_rate": 0.009581626483516108, "loss": 2.8025, "step": 4097 }, { "crossentropy": 2.783285140991211, "epoch": 0.14856438515081208, "grad_norm": 0.045116085559129715, "grad_norm_var": 3.635635663210198e-05, "learning_rate": 0.009581393764887378, "loss": 2.7064, "step": 4098 }, { "crossentropy": 2.8117196559906006, "epoch": 0.1486006380510441, "grad_norm": 0.04044244810938835, "grad_norm_var": 3.5679083450157234e-05, "learning_rate": 0.009581160984379973, "loss": 2.7141, "step": 4099 }, { "crossentropy": 2.676490306854248, "epoch": 0.1486368909512761, "grad_norm": 0.04436125233769417, "grad_norm_var": 3.45605919822356e-05, "learning_rate": 0.009580928141997038, "loss": 2.7192, "step": 4100 }, { "crossentropy": 2.698323965072632, "epoch": 0.14867314385150812, "grad_norm": 0.04335296154022217, "grad_norm_var": 7.677916285114307e-06, "learning_rate": 0.009580695237741714, "loss": 2.718, "step": 4101 }, { "crossentropy": 2.806541919708252, "epoch": 0.14870939675174014, "grad_norm": 0.04579399153590202, "grad_norm_var": 9.708830312007918e-06, "learning_rate": 0.009580462271617152, "loss": 2.8142, "step": 4102 }, { "crossentropy": 2.623706817626953, "epoch": 0.14874564965197215, "grad_norm": 0.04370318353176117, "grad_norm_var": 1.0319523010135501e-05, "learning_rate": 0.009580229243626495, "loss": 2.6732, "step": 4103 }, { "crossentropy": 2.8361704349517822, "epoch": 0.14878190255220416, "grad_norm": 0.04702724516391754, "grad_norm_var": 1.2840894741920212e-05, "learning_rate": 0.009579996153772889, "loss": 2.783, "step": 4104 }, { "crossentropy": 2.765606641769409, "epoch": 0.1488181554524362, "grad_norm": 0.042142629623413086, "grad_norm_var": 1.2676783057515423e-05, "learning_rate": 0.009579763002059488, "loss": 2.7126, "step": 4105 }, { "crossentropy": 2.6925265789031982, "epoch": 0.14885440835266822, "grad_norm": 0.04932950437068939, "grad_norm_var": 1.656081870534213e-05, "learning_rate": 0.009579529788489435, "loss": 2.6831, "step": 4106 }, { "crossentropy": 2.6015589237213135, "epoch": 0.14889066125290024, "grad_norm": 0.043326716870069504, "grad_norm_var": 1.599253078799582e-05, "learning_rate": 0.009579296513065884, "loss": 2.6288, "step": 4107 }, { "crossentropy": 2.628725528717041, "epoch": 0.14892691415313225, "grad_norm": 0.04201741889119148, "grad_norm_var": 1.4342847165695292e-05, "learning_rate": 0.009579063175791984, "loss": 2.618, "step": 4108 }, { "crossentropy": 2.5968289375305176, "epoch": 0.14896316705336426, "grad_norm": 0.04091251641511917, "grad_norm_var": 1.4417532576817502e-05, "learning_rate": 0.009578829776670886, "loss": 2.5919, "step": 4109 }, { "crossentropy": 2.742725133895874, "epoch": 0.14899941995359628, "grad_norm": 0.0406200997531414, "grad_norm_var": 1.1759564221358368e-05, "learning_rate": 0.009578596315705745, "loss": 2.6554, "step": 4110 }, { "crossentropy": 2.6902010440826416, "epoch": 0.1490356728538283, "grad_norm": 0.04127296805381775, "grad_norm_var": 9.685518143184799e-06, "learning_rate": 0.009578362792899711, "loss": 2.6237, "step": 4111 }, { "crossentropy": 2.6925981044769287, "epoch": 0.14907192575406034, "grad_norm": 0.0429794080555439, "grad_norm_var": 6.097071410336025e-06, "learning_rate": 0.00957812920825594, "loss": 2.7138, "step": 4112 }, { "crossentropy": 2.65447998046875, "epoch": 0.14910817865429235, "grad_norm": 0.03863964602351189, "grad_norm_var": 7.388423469988486e-06, "learning_rate": 0.009577895561777586, "loss": 2.7226, "step": 4113 }, { "crossentropy": 2.633143663406372, "epoch": 0.14914443155452436, "grad_norm": 0.04132670536637306, "grad_norm_var": 7.312669286386036e-06, "learning_rate": 0.009577661853467805, "loss": 2.6824, "step": 4114 }, { "crossentropy": 2.5830605030059814, "epoch": 0.14918068445475638, "grad_norm": 0.042750775814056396, "grad_norm_var": 6.872989256737766e-06, "learning_rate": 0.009577428083329755, "loss": 2.7156, "step": 4115 }, { "crossentropy": 2.7225708961486816, "epoch": 0.1492169373549884, "grad_norm": 0.049055688083171844, "grad_norm_var": 9.041477526172161e-06, "learning_rate": 0.009577194251366591, "loss": 2.7534, "step": 4116 }, { "crossentropy": 2.7057907581329346, "epoch": 0.1492531902552204, "grad_norm": 0.04660218209028244, "grad_norm_var": 9.684960736677736e-06, "learning_rate": 0.009576960357581474, "loss": 2.687, "step": 4117 }, { "crossentropy": 2.815387487411499, "epoch": 0.14928944315545242, "grad_norm": 0.04589195176959038, "grad_norm_var": 9.714298101493914e-06, "learning_rate": 0.00957672640197756, "loss": 2.7639, "step": 4118 }, { "crossentropy": 2.513225555419922, "epoch": 0.14932569605568446, "grad_norm": 0.04687429219484329, "grad_norm_var": 1.038645717077267e-05, "learning_rate": 0.009576492384558013, "loss": 2.5756, "step": 4119 }, { "crossentropy": 2.790668487548828, "epoch": 0.14936194895591648, "grad_norm": 0.04633085057139397, "grad_norm_var": 1.0116933807983314e-05, "learning_rate": 0.00957625830532599, "loss": 2.7613, "step": 4120 }, { "crossentropy": 2.7068984508514404, "epoch": 0.1493982018561485, "grad_norm": 0.04419034719467163, "grad_norm_var": 9.9388950346851e-06, "learning_rate": 0.00957602416428465, "loss": 2.7251, "step": 4121 }, { "crossentropy": 2.6923649311065674, "epoch": 0.1494344547563805, "grad_norm": 0.03981701284646988, "grad_norm_var": 8.685837357612821e-06, "learning_rate": 0.009575789961437165, "loss": 2.7004, "step": 4122 }, { "crossentropy": 2.54469895362854, "epoch": 0.14947070765661252, "grad_norm": 0.03846415877342224, "grad_norm_var": 1.0138538575069945e-05, "learning_rate": 0.00957555569678669, "loss": 2.572, "step": 4123 }, { "crossentropy": 2.669203996658325, "epoch": 0.14950696055684454, "grad_norm": 0.03769960254430771, "grad_norm_var": 1.1860301746853773e-05, "learning_rate": 0.009575321370336391, "loss": 2.7112, "step": 4124 }, { "crossentropy": 2.7309231758117676, "epoch": 0.14954321345707658, "grad_norm": 0.03716779500246048, "grad_norm_var": 1.3636340526253464e-05, "learning_rate": 0.009575086982089436, "loss": 2.6831, "step": 4125 }, { "crossentropy": 2.7622692584991455, "epoch": 0.1495794663573086, "grad_norm": 0.04399022087454796, "grad_norm_var": 1.351035473577747e-05, "learning_rate": 0.009574852532048986, "loss": 2.6978, "step": 4126 }, { "crossentropy": 2.710381031036377, "epoch": 0.1496157192575406, "grad_norm": 0.03645690903067589, "grad_norm_var": 1.5870486897723707e-05, "learning_rate": 0.00957461802021821, "loss": 2.65, "step": 4127 }, { "crossentropy": 2.6237528324127197, "epoch": 0.14965197215777262, "grad_norm": 0.043263718485832214, "grad_norm_var": 1.589788805199758e-05, "learning_rate": 0.009574383446600278, "loss": 2.5885, "step": 4128 }, { "crossentropy": 2.6447012424468994, "epoch": 0.14968822505800464, "grad_norm": 0.046920835971832275, "grad_norm_var": 1.602358301175845e-05, "learning_rate": 0.009574148811198353, "loss": 2.7375, "step": 4129 }, { "crossentropy": 2.550959348678589, "epoch": 0.14972447795823665, "grad_norm": 0.045467231422662735, "grad_norm_var": 1.6212604398159874e-05, "learning_rate": 0.009573914114015609, "loss": 2.6508, "step": 4130 }, { "crossentropy": 2.8671021461486816, "epoch": 0.14976073085846867, "grad_norm": 0.043928004801273346, "grad_norm_var": 1.6231224803268883e-05, "learning_rate": 0.00957367935505521, "loss": 2.7452, "step": 4131 }, { "crossentropy": 2.6435041427612305, "epoch": 0.1497969837587007, "grad_norm": 0.04258328676223755, "grad_norm_var": 1.3845756692903478e-05, "learning_rate": 0.009573444534320335, "loss": 2.6539, "step": 4132 }, { "crossentropy": 2.8030405044555664, "epoch": 0.14983323665893272, "grad_norm": 0.04450748860836029, "grad_norm_var": 1.3072879175098951e-05, "learning_rate": 0.009573209651814149, "loss": 2.7894, "step": 4133 }, { "crossentropy": 2.7277443408966064, "epoch": 0.14986948955916474, "grad_norm": 0.041681092232465744, "grad_norm_var": 1.2401384811566198e-05, "learning_rate": 0.009572974707539827, "loss": 2.7297, "step": 4134 }, { "crossentropy": 2.638237953186035, "epoch": 0.14990574245939675, "grad_norm": 0.03832944482564926, "grad_norm_var": 1.1934304048228685e-05, "learning_rate": 0.009572739701500542, "loss": 2.682, "step": 4135 }, { "crossentropy": 2.692279100418091, "epoch": 0.14994199535962877, "grad_norm": 0.03705618903040886, "grad_norm_var": 1.1861988921965603e-05, "learning_rate": 0.009572504633699465, "loss": 2.7327, "step": 4136 }, { "crossentropy": 2.6546449661254883, "epoch": 0.14997824825986078, "grad_norm": 0.03655051067471504, "grad_norm_var": 1.2611747019602067e-05, "learning_rate": 0.009572269504139778, "loss": 2.6068, "step": 4137 }, { "crossentropy": 2.6174752712249756, "epoch": 0.1500145011600928, "grad_norm": 0.03716675937175751, "grad_norm_var": 1.3422022208362666e-05, "learning_rate": 0.009572034312824652, "loss": 2.656, "step": 4138 }, { "crossentropy": 2.6611530780792236, "epoch": 0.15005075406032484, "grad_norm": 0.04196659103035927, "grad_norm_var": 1.3143623613460532e-05, "learning_rate": 0.009571799059757262, "loss": 2.7383, "step": 4139 }, { "crossentropy": 2.682889223098755, "epoch": 0.15008700696055685, "grad_norm": 0.04207737371325493, "grad_norm_var": 1.246110150120346e-05, "learning_rate": 0.00957156374494079, "loss": 2.6517, "step": 4140 }, { "crossentropy": 2.691352128982544, "epoch": 0.15012325986078887, "grad_norm": 0.04340844601392746, "grad_norm_var": 1.1544565731054964e-05, "learning_rate": 0.00957132836837841, "loss": 2.6889, "step": 4141 }, { "crossentropy": 2.8055031299591064, "epoch": 0.15015951276102088, "grad_norm": 0.04208090901374817, "grad_norm_var": 1.1160004989113567e-05, "learning_rate": 0.009571092930073305, "loss": 2.7962, "step": 4142 }, { "crossentropy": 2.94675874710083, "epoch": 0.1501957656612529, "grad_norm": 0.05071982741355896, "grad_norm_var": 1.4349864059914665e-05, "learning_rate": 0.009570857430028652, "loss": 2.8677, "step": 4143 }, { "crossentropy": 2.8020706176757812, "epoch": 0.1502320185614849, "grad_norm": 0.04339838773012161, "grad_norm_var": 1.4367283308649078e-05, "learning_rate": 0.009570621868247633, "loss": 2.7813, "step": 4144 }, { "crossentropy": 2.7495028972625732, "epoch": 0.15026827146171692, "grad_norm": 0.052245814353227615, "grad_norm_var": 1.9374020338515317e-05, "learning_rate": 0.00957038624473343, "loss": 2.7516, "step": 4145 }, { "crossentropy": 2.760558843612671, "epoch": 0.15030452436194897, "grad_norm": 0.05741145834326744, "grad_norm_var": 3.270079626996551e-05, "learning_rate": 0.009570150559489225, "loss": 2.6924, "step": 4146 }, { "crossentropy": 2.7452938556671143, "epoch": 0.15034077726218098, "grad_norm": 0.044971268624067307, "grad_norm_var": 3.2836081259045766e-05, "learning_rate": 0.0095699148125182, "loss": 2.6224, "step": 4147 }, { "crossentropy": 2.547989845275879, "epoch": 0.150377030162413, "grad_norm": 0.03977268934249878, "grad_norm_var": 3.367695907864251e-05, "learning_rate": 0.009569679003823542, "loss": 2.6807, "step": 4148 }, { "crossentropy": 2.8416786193847656, "epoch": 0.150413283062645, "grad_norm": 0.04271823167800903, "grad_norm_var": 3.3597096483702535e-05, "learning_rate": 0.009569443133408432, "loss": 2.8168, "step": 4149 }, { "crossentropy": 2.692462205886841, "epoch": 0.15044953596287702, "grad_norm": 0.040199413895606995, "grad_norm_var": 3.4038761400194814e-05, "learning_rate": 0.009569207201276062, "loss": 2.689, "step": 4150 }, { "crossentropy": 2.629281520843506, "epoch": 0.15048578886310904, "grad_norm": 0.042059220373630524, "grad_norm_var": 3.252108833968151e-05, "learning_rate": 0.009568971207429612, "loss": 2.6341, "step": 4151 }, { "crossentropy": 2.715388536453247, "epoch": 0.15052204176334108, "grad_norm": 0.038480788469314575, "grad_norm_var": 3.1450032095834554e-05, "learning_rate": 0.009568735151872272, "loss": 2.7083, "step": 4152 }, { "crossentropy": 2.7368993759155273, "epoch": 0.1505582946635731, "grad_norm": 0.038386471569538116, "grad_norm_var": 2.9971321459957603e-05, "learning_rate": 0.009568499034607231, "loss": 2.8014, "step": 4153 }, { "crossentropy": 2.7313361167907715, "epoch": 0.1505945475638051, "grad_norm": 0.03974683955311775, "grad_norm_var": 2.8185800629189813e-05, "learning_rate": 0.009568262855637678, "loss": 2.7678, "step": 4154 }, { "crossentropy": 2.347775936126709, "epoch": 0.15063080046403712, "grad_norm": 0.04862760379910469, "grad_norm_var": 2.9394736763929674e-05, "learning_rate": 0.009568026614966803, "loss": 2.5369, "step": 4155 }, { "crossentropy": 2.6322121620178223, "epoch": 0.15066705336426914, "grad_norm": 0.047241561114788055, "grad_norm_var": 2.9638513996267135e-05, "learning_rate": 0.009567790312597794, "loss": 2.5355, "step": 4156 }, { "crossentropy": 2.872633695602417, "epoch": 0.15070330626450115, "grad_norm": 0.04092059284448624, "grad_norm_var": 3.037642563099707e-05, "learning_rate": 0.009567553948533846, "loss": 2.7616, "step": 4157 }, { "crossentropy": 2.565185546875, "epoch": 0.15073955916473317, "grad_norm": 0.04225961118936539, "grad_norm_var": 3.0325277691488292e-05, "learning_rate": 0.009567317522778151, "loss": 2.6327, "step": 4158 }, { "crossentropy": 2.5515801906585693, "epoch": 0.1507758120649652, "grad_norm": 0.044034093618392944, "grad_norm_var": 2.7416178146069148e-05, "learning_rate": 0.009567081035333904, "loss": 2.6553, "step": 4159 }, { "crossentropy": 2.6877222061157227, "epoch": 0.15081206496519722, "grad_norm": 0.044974882155656815, "grad_norm_var": 2.7465100254444227e-05, "learning_rate": 0.009566844486204294, "loss": 2.7072, "step": 4160 }, { "crossentropy": 2.7141165733337402, "epoch": 0.15084831786542924, "grad_norm": 0.04381919652223587, "grad_norm_var": 2.264206549956593e-05, "learning_rate": 0.009566607875392518, "loss": 2.715, "step": 4161 }, { "crossentropy": 2.626633405685425, "epoch": 0.15088457076566125, "grad_norm": 0.040307097136974335, "grad_norm_var": 9.147194685024674e-06, "learning_rate": 0.009566371202901774, "loss": 2.6154, "step": 4162 }, { "crossentropy": 2.7464840412139893, "epoch": 0.15092082366589327, "grad_norm": 0.03635238856077194, "grad_norm_var": 1.0843739660456169e-05, "learning_rate": 0.009566134468735258, "loss": 2.7445, "step": 4163 }, { "crossentropy": 2.6069495677948, "epoch": 0.15095707656612528, "grad_norm": 0.04082569479942322, "grad_norm_var": 1.0618746516807772e-05, "learning_rate": 0.009565897672896166, "loss": 2.654, "step": 4164 }, { "crossentropy": 2.783867120742798, "epoch": 0.1509933294663573, "grad_norm": 0.03952857106924057, "grad_norm_var": 1.0921350549027095e-05, "learning_rate": 0.009565660815387695, "loss": 2.7885, "step": 4165 }, { "crossentropy": 2.6387412548065186, "epoch": 0.15102958236658934, "grad_norm": 0.03891119733452797, "grad_norm_var": 1.1288868305253432e-05, "learning_rate": 0.009565423896213048, "loss": 2.6666, "step": 4166 }, { "crossentropy": 2.7491908073425293, "epoch": 0.15106583526682135, "grad_norm": 0.03548795357346535, "grad_norm_var": 1.363332025168523e-05, "learning_rate": 0.009565186915375423, "loss": 2.717, "step": 4167 }, { "crossentropy": 2.7195322513580322, "epoch": 0.15110208816705337, "grad_norm": 0.03616475313901901, "grad_norm_var": 1.4821874703287568e-05, "learning_rate": 0.009564949872878021, "loss": 2.7599, "step": 4168 }, { "crossentropy": 2.607819080352783, "epoch": 0.15113834106728538, "grad_norm": 0.04004302993416786, "grad_norm_var": 1.43941958517133e-05, "learning_rate": 0.009564712768724042, "loss": 2.6131, "step": 4169 }, { "crossentropy": 2.4917709827423096, "epoch": 0.1511745939675174, "grad_norm": 0.04101596400141716, "grad_norm_var": 1.4248487672401574e-05, "learning_rate": 0.009564475602916692, "loss": 2.5691, "step": 4170 }, { "crossentropy": 2.6457631587982178, "epoch": 0.1512108468677494, "grad_norm": 0.04317958280444145, "grad_norm_var": 1.0767778270048592e-05, "learning_rate": 0.00956423837545917, "loss": 2.6726, "step": 4171 }, { "crossentropy": 2.5839390754699707, "epoch": 0.15124709976798145, "grad_norm": 0.03910161182284355, "grad_norm_var": 8.071475123765509e-06, "learning_rate": 0.009564001086354683, "loss": 2.7105, "step": 4172 }, { "crossentropy": 2.7345824241638184, "epoch": 0.15128335266821347, "grad_norm": 0.037640996277332306, "grad_norm_var": 8.530446717103177e-06, "learning_rate": 0.009563763735606436, "loss": 2.7862, "step": 4173 }, { "crossentropy": 2.8013508319854736, "epoch": 0.15131960556844548, "grad_norm": 0.03743143752217293, "grad_norm_var": 8.679482275441899e-06, "learning_rate": 0.009563526323217632, "loss": 2.7521, "step": 4174 }, { "crossentropy": 2.7604615688323975, "epoch": 0.1513558584686775, "grad_norm": 0.04324984923005104, "grad_norm_var": 8.288371660398972e-06, "learning_rate": 0.009563288849191482, "loss": 2.7088, "step": 4175 }, { "crossentropy": 2.777038097381592, "epoch": 0.1513921113689095, "grad_norm": 0.04334438219666481, "grad_norm_var": 7.346280212344678e-06, "learning_rate": 0.00956305131353119, "loss": 2.7657, "step": 4176 }, { "crossentropy": 2.843620538711548, "epoch": 0.15142836426914152, "grad_norm": 0.04198657348752022, "grad_norm_var": 6.568045147438649e-06, "learning_rate": 0.009562813716239965, "loss": 2.7911, "step": 4177 }, { "crossentropy": 2.721999406814575, "epoch": 0.15146461716937354, "grad_norm": 0.04047331213951111, "grad_norm_var": 6.584097477410622e-06, "learning_rate": 0.009562576057321017, "loss": 2.624, "step": 4178 }, { "crossentropy": 2.5783753395080566, "epoch": 0.15150087006960558, "grad_norm": 0.044038936495780945, "grad_norm_var": 6.875547584750166e-06, "learning_rate": 0.009562338336777556, "loss": 2.6301, "step": 4179 }, { "crossentropy": 2.5651304721832275, "epoch": 0.1515371229698376, "grad_norm": 0.04009135067462921, "grad_norm_var": 6.843238340148208e-06, "learning_rate": 0.00956210055461279, "loss": 2.6528, "step": 4180 }, { "crossentropy": 2.5721328258514404, "epoch": 0.1515733758700696, "grad_norm": 0.036498360335826874, "grad_norm_var": 7.650257827290553e-06, "learning_rate": 0.009561862710829934, "loss": 2.6679, "step": 4181 }, { "crossentropy": 2.7596869468688965, "epoch": 0.15160962877030162, "grad_norm": 0.035646386444568634, "grad_norm_var": 8.753933006028167e-06, "learning_rate": 0.0095616248054322, "loss": 2.6926, "step": 4182 }, { "crossentropy": 2.6350271701812744, "epoch": 0.15164588167053364, "grad_norm": 0.03646983578801155, "grad_norm_var": 8.261166364564344e-06, "learning_rate": 0.009561386838422798, "loss": 2.6874, "step": 4183 }, { "crossentropy": 2.819552183151245, "epoch": 0.15168213457076565, "grad_norm": 0.04478023946285248, "grad_norm_var": 8.7548219104079e-06, "learning_rate": 0.009561148809804947, "loss": 2.7473, "step": 4184 }, { "crossentropy": 2.6338350772857666, "epoch": 0.15171838747099767, "grad_norm": 0.04370423033833504, "grad_norm_var": 9.461300362215558e-06, "learning_rate": 0.009560910719581858, "loss": 2.721, "step": 4185 }, { "crossentropy": 2.7353506088256836, "epoch": 0.1517546403712297, "grad_norm": 0.04202307015657425, "grad_norm_var": 9.588495109842202e-06, "learning_rate": 0.00956067256775675, "loss": 2.7636, "step": 4186 }, { "crossentropy": 2.7138757705688477, "epoch": 0.15179089327146172, "grad_norm": 0.034931767731904984, "grad_norm_var": 1.1007493433525205e-05, "learning_rate": 0.009560434354332839, "loss": 2.7389, "step": 4187 }, { "crossentropy": 2.6895570755004883, "epoch": 0.15182714617169374, "grad_norm": 0.04038412123918533, "grad_norm_var": 1.0941575331788594e-05, "learning_rate": 0.009560196079313339, "loss": 2.6941, "step": 4188 }, { "crossentropy": 2.6632542610168457, "epoch": 0.15186339907192575, "grad_norm": 0.04258342832326889, "grad_norm_var": 1.0802747821853029e-05, "learning_rate": 0.00955995774270147, "loss": 2.7436, "step": 4189 }, { "crossentropy": 2.730189323425293, "epoch": 0.15189965197215777, "grad_norm": 0.04806128516793251, "grad_norm_var": 1.3547868429189015e-05, "learning_rate": 0.009559719344500452, "loss": 2.7211, "step": 4190 }, { "crossentropy": 2.5832064151763916, "epoch": 0.15193590487238978, "grad_norm": 0.04630538821220398, "grad_norm_var": 1.4990261089783962e-05, "learning_rate": 0.009559480884713508, "loss": 2.6473, "step": 4191 }, { "crossentropy": 2.747927665710449, "epoch": 0.1519721577726218, "grad_norm": 0.04235103726387024, "grad_norm_var": 1.4785488330093002e-05, "learning_rate": 0.009559242363343852, "loss": 2.6381, "step": 4192 }, { "crossentropy": 2.695620536804199, "epoch": 0.15200841067285384, "grad_norm": 0.036467600613832474, "grad_norm_var": 1.6162308517173447e-05, "learning_rate": 0.009559003780394709, "loss": 2.631, "step": 4193 }, { "crossentropy": 2.8045859336853027, "epoch": 0.15204466357308585, "grad_norm": 0.03753230348229408, "grad_norm_var": 1.6880280321541578e-05, "learning_rate": 0.009558765135869302, "loss": 2.684, "step": 4194 }, { "crossentropy": 2.790120840072632, "epoch": 0.15208091647331787, "grad_norm": 0.03907620534300804, "grad_norm_var": 1.6237889572666825e-05, "learning_rate": 0.009558526429770856, "loss": 2.7026, "step": 4195 }, { "crossentropy": 2.7348456382751465, "epoch": 0.15211716937354988, "grad_norm": 0.03987865895032883, "grad_norm_var": 1.6250367819873635e-05, "learning_rate": 0.00955828766210259, "loss": 2.6734, "step": 4196 }, { "crossentropy": 2.704310417175293, "epoch": 0.1521534222737819, "grad_norm": 0.03875240683555603, "grad_norm_var": 1.5389795317842455e-05, "learning_rate": 0.009558048832867731, "loss": 2.6911, "step": 4197 }, { "crossentropy": 2.631570339202881, "epoch": 0.1521896751740139, "grad_norm": 0.03715996444225311, "grad_norm_var": 1.4541511194283002e-05, "learning_rate": 0.009557809942069506, "loss": 2.6577, "step": 4198 }, { "crossentropy": 2.8261988162994385, "epoch": 0.15222592807424595, "grad_norm": 0.03706483170390129, "grad_norm_var": 1.4231708218154631e-05, "learning_rate": 0.009557570989711143, "loss": 2.7663, "step": 4199 }, { "crossentropy": 2.651188373565674, "epoch": 0.15226218097447797, "grad_norm": 0.03604676574468613, "grad_norm_var": 1.4237076447765467e-05, "learning_rate": 0.009557331975795866, "loss": 2.6581, "step": 4200 }, { "crossentropy": 2.7422642707824707, "epoch": 0.15229843387470998, "grad_norm": 0.03606821596622467, "grad_norm_var": 1.4257788368477834e-05, "learning_rate": 0.009557092900326905, "loss": 2.7142, "step": 4201 }, { "crossentropy": 2.7058887481689453, "epoch": 0.152334686774942, "grad_norm": 0.03671921789646149, "grad_norm_var": 1.4350465258466717e-05, "learning_rate": 0.009556853763307489, "loss": 2.6993, "step": 4202 }, { "crossentropy": 2.804060697555542, "epoch": 0.152370939675174, "grad_norm": 0.03653906285762787, "grad_norm_var": 1.3567977712314732e-05, "learning_rate": 0.009556614564740846, "loss": 2.7647, "step": 4203 }, { "crossentropy": 2.7486014366149902, "epoch": 0.15240719257540603, "grad_norm": 0.04384861886501312, "grad_norm_var": 1.4755699249259918e-05, "learning_rate": 0.00955637530463021, "loss": 2.7305, "step": 4204 }, { "crossentropy": 2.761078357696533, "epoch": 0.15244344547563804, "grad_norm": 0.047558557242155075, "grad_norm_var": 1.8246304773240443e-05, "learning_rate": 0.009556135982978812, "loss": 2.7584, "step": 4205 }, { "crossentropy": 2.7375168800354004, "epoch": 0.15247969837587008, "grad_norm": 0.04143568500876427, "grad_norm_var": 1.3837053950103289e-05, "learning_rate": 0.00955589659978988, "loss": 2.7333, "step": 4206 }, { "crossentropy": 2.6019203662872314, "epoch": 0.1525159512761021, "grad_norm": 0.03907908499240875, "grad_norm_var": 1.0592177827160183e-05, "learning_rate": 0.009555657155066653, "loss": 2.6233, "step": 4207 }, { "crossentropy": 2.695308208465576, "epoch": 0.1525522041763341, "grad_norm": 0.042003341019153595, "grad_norm_var": 1.0448954044870822e-05, "learning_rate": 0.009555417648812362, "loss": 2.7578, "step": 4208 }, { "crossentropy": 2.667140483856201, "epoch": 0.15258845707656613, "grad_norm": 0.04482639953494072, "grad_norm_var": 1.190770939580657e-05, "learning_rate": 0.009555178081030243, "loss": 2.6499, "step": 4209 }, { "crossentropy": 2.6957180500030518, "epoch": 0.15262470997679814, "grad_norm": 0.044872574508190155, "grad_norm_var": 1.3252175979258131e-05, "learning_rate": 0.00955493845172353, "loss": 2.7111, "step": 4210 }, { "crossentropy": 2.7877726554870605, "epoch": 0.15266096287703015, "grad_norm": 0.04238377511501312, "grad_norm_var": 1.3502902663845943e-05, "learning_rate": 0.009554698760895463, "loss": 2.7466, "step": 4211 }, { "crossentropy": 2.582141876220703, "epoch": 0.15269721577726217, "grad_norm": 0.03588084876537323, "grad_norm_var": 1.4707649221697381e-05, "learning_rate": 0.009554459008549277, "loss": 2.6353, "step": 4212 }, { "crossentropy": 2.8559112548828125, "epoch": 0.1527334686774942, "grad_norm": 0.03977840766310692, "grad_norm_var": 1.4600724246869162e-05, "learning_rate": 0.00955421919468821, "loss": 2.7904, "step": 4213 }, { "crossentropy": 2.673994302749634, "epoch": 0.15276972157772623, "grad_norm": 0.03811760991811752, "grad_norm_var": 1.4285311106936443e-05, "learning_rate": 0.0095539793193155, "loss": 2.6712, "step": 4214 }, { "crossentropy": 2.7446107864379883, "epoch": 0.15280597447795824, "grad_norm": 0.03919340670108795, "grad_norm_var": 1.369602616875613e-05, "learning_rate": 0.00955373938243439, "loss": 2.7577, "step": 4215 }, { "crossentropy": 2.698267936706543, "epoch": 0.15284222737819025, "grad_norm": 0.040299903601408005, "grad_norm_var": 1.243054802855751e-05, "learning_rate": 0.009553499384048118, "loss": 2.6996, "step": 4216 }, { "crossentropy": 2.7657995223999023, "epoch": 0.15287848027842227, "grad_norm": 0.04099598154425621, "grad_norm_var": 1.1011556358081487e-05, "learning_rate": 0.009553259324159928, "loss": 2.8056, "step": 4217 }, { "crossentropy": 2.7440571784973145, "epoch": 0.15291473317865428, "grad_norm": 0.04065672308206558, "grad_norm_var": 9.814105249892485e-06, "learning_rate": 0.009553019202773058, "loss": 2.6602, "step": 4218 }, { "crossentropy": 2.6573538780212402, "epoch": 0.1529509860788863, "grad_norm": 0.040826257318258286, "grad_norm_var": 8.36035275464473e-06, "learning_rate": 0.009552779019890758, "loss": 2.7682, "step": 4219 }, { "crossentropy": 2.635396718978882, "epoch": 0.15298723897911834, "grad_norm": 0.045125123113393784, "grad_norm_var": 8.885788590979696e-06, "learning_rate": 0.009552538775516267, "loss": 2.7044, "step": 4220 }, { "crossentropy": 2.8026785850524902, "epoch": 0.15302349187935035, "grad_norm": 0.04999851435422897, "grad_norm_var": 1.1248539666905364e-05, "learning_rate": 0.00955229846965283, "loss": 2.7042, "step": 4221 }, { "crossentropy": 2.557765007019043, "epoch": 0.15305974477958237, "grad_norm": 0.04334927722811699, "grad_norm_var": 1.1437495186514673e-05, "learning_rate": 0.009552058102303694, "loss": 2.6577, "step": 4222 }, { "crossentropy": 2.52537202835083, "epoch": 0.15309599767981438, "grad_norm": 0.041545625776052475, "grad_norm_var": 1.095193992854011e-05, "learning_rate": 0.009551817673472107, "loss": 2.6042, "step": 4223 }, { "crossentropy": 2.7268574237823486, "epoch": 0.1531322505800464, "grad_norm": 0.04602980986237526, "grad_norm_var": 1.2039026232853132e-05, "learning_rate": 0.009551577183161313, "loss": 2.705, "step": 4224 }, { "crossentropy": 2.6733481884002686, "epoch": 0.1531685034802784, "grad_norm": 0.04657372459769249, "grad_norm_var": 1.2860954727406057e-05, "learning_rate": 0.009551336631374563, "loss": 2.7534, "step": 4225 }, { "crossentropy": 2.4265847206115723, "epoch": 0.15320475638051045, "grad_norm": 0.04520430043339729, "grad_norm_var": 1.2984858710009723e-05, "learning_rate": 0.009551096018115103, "loss": 2.5876, "step": 4226 }, { "crossentropy": 2.6885907649993896, "epoch": 0.15324100928074247, "grad_norm": 0.04555037245154381, "grad_norm_var": 1.366912326054518e-05, "learning_rate": 0.009550855343386186, "loss": 2.7403, "step": 4227 }, { "crossentropy": 2.5421230792999268, "epoch": 0.15327726218097448, "grad_norm": 0.03951094672083855, "grad_norm_var": 1.1315410959557524e-05, "learning_rate": 0.009550614607191062, "loss": 2.6701, "step": 4228 }, { "crossentropy": 2.7061667442321777, "epoch": 0.1533135150812065, "grad_norm": 0.04218221828341484, "grad_norm_var": 1.0749055507323218e-05, "learning_rate": 0.009550373809532981, "loss": 2.6214, "step": 4229 }, { "crossentropy": 2.680962324142456, "epoch": 0.1533497679814385, "grad_norm": 0.036135438829660416, "grad_norm_var": 1.2238067705838627e-05, "learning_rate": 0.009550132950415197, "loss": 2.7278, "step": 4230 }, { "crossentropy": 2.6575324535369873, "epoch": 0.15338602088167053, "grad_norm": 0.03797372803092003, "grad_norm_var": 1.2901071885922562e-05, "learning_rate": 0.009549892029840964, "loss": 2.6553, "step": 4231 }, { "crossentropy": 2.6296210289001465, "epoch": 0.15342227378190254, "grad_norm": 0.03856745734810829, "grad_norm_var": 1.3625130971174858e-05, "learning_rate": 0.009549651047813532, "loss": 2.7491, "step": 4232 }, { "crossentropy": 2.7233200073242188, "epoch": 0.15345852668213458, "grad_norm": 0.040199726819992065, "grad_norm_var": 1.3825931182260125e-05, "learning_rate": 0.00954941000433616, "loss": 2.6992, "step": 4233 }, { "crossentropy": 2.7200162410736084, "epoch": 0.1534947795823666, "grad_norm": 0.03819757699966431, "grad_norm_var": 1.4796582178320562e-05, "learning_rate": 0.009549168899412099, "loss": 2.7353, "step": 4234 }, { "crossentropy": 2.835690498352051, "epoch": 0.1535310324825986, "grad_norm": 0.04098307341337204, "grad_norm_var": 1.4767082639706904e-05, "learning_rate": 0.00954892773304461, "loss": 2.7589, "step": 4235 }, { "crossentropy": 2.742110013961792, "epoch": 0.15356728538283063, "grad_norm": 0.04219610616564751, "grad_norm_var": 1.4207947255206096e-05, "learning_rate": 0.00954868650523695, "loss": 2.7342, "step": 4236 }, { "crossentropy": 2.697636127471924, "epoch": 0.15360353828306264, "grad_norm": 0.04425368458032608, "grad_norm_var": 1.0249179837473755e-05, "learning_rate": 0.009548445215992374, "loss": 2.7157, "step": 4237 }, { "crossentropy": 2.639556407928467, "epoch": 0.15363979118329466, "grad_norm": 0.03859696164727211, "grad_norm_var": 1.0665284619607724e-05, "learning_rate": 0.009548203865314145, "loss": 2.7055, "step": 4238 }, { "crossentropy": 2.6313302516937256, "epoch": 0.15367604408352667, "grad_norm": 0.03620447590947151, "grad_norm_var": 1.2402465285268894e-05, "learning_rate": 0.009547962453205518, "loss": 2.7965, "step": 4239 }, { "crossentropy": 2.5782470703125, "epoch": 0.1537122969837587, "grad_norm": 0.03514746204018593, "grad_norm_var": 1.2719890036930192e-05, "learning_rate": 0.009547720979669758, "loss": 2.5524, "step": 4240 }, { "crossentropy": 2.6533007621765137, "epoch": 0.15374854988399073, "grad_norm": 0.04074106737971306, "grad_norm_var": 1.009726438847928e-05, "learning_rate": 0.009547479444710124, "loss": 2.6773, "step": 4241 }, { "crossentropy": 2.9327313899993896, "epoch": 0.15378480278422274, "grad_norm": 0.036827847361564636, "grad_norm_var": 8.78489673758285e-06, "learning_rate": 0.009547237848329879, "loss": 2.8359, "step": 4242 }, { "crossentropy": 2.694242238998413, "epoch": 0.15382105568445475, "grad_norm": 0.03716402128338814, "grad_norm_var": 6.503796536757624e-06, "learning_rate": 0.009546996190532286, "loss": 2.6965, "step": 4243 }, { "crossentropy": 2.7107295989990234, "epoch": 0.15385730858468677, "grad_norm": 0.03613459691405296, "grad_norm_var": 7.01107252287497e-06, "learning_rate": 0.00954675447132061, "loss": 2.6867, "step": 4244 }, { "crossentropy": 2.604447841644287, "epoch": 0.15389356148491878, "grad_norm": 0.03653663769364357, "grad_norm_var": 6.490352627488924e-06, "learning_rate": 0.009546512690698115, "loss": 2.6815, "step": 4245 }, { "crossentropy": 2.761596202850342, "epoch": 0.1539298143851508, "grad_norm": 0.03500176593661308, "grad_norm_var": 6.926773118974954e-06, "learning_rate": 0.009546270848668065, "loss": 2.6743, "step": 4246 }, { "crossentropy": 2.8272597789764404, "epoch": 0.15396606728538284, "grad_norm": 0.03559138998389244, "grad_norm_var": 7.423373001274174e-06, "learning_rate": 0.00954602894523373, "loss": 2.718, "step": 4247 }, { "crossentropy": 2.671473741531372, "epoch": 0.15400232018561485, "grad_norm": 0.036223895847797394, "grad_norm_var": 7.674158399373123e-06, "learning_rate": 0.009545786980398374, "loss": 2.6674, "step": 4248 }, { "crossentropy": 2.545823335647583, "epoch": 0.15403857308584687, "grad_norm": 0.03468348830938339, "grad_norm_var": 8.050018788036128e-06, "learning_rate": 0.009545544954165267, "loss": 2.6604, "step": 4249 }, { "crossentropy": 2.7435719966888428, "epoch": 0.15407482598607888, "grad_norm": 0.03942618891596794, "grad_norm_var": 8.212725603631323e-06, "learning_rate": 0.009545302866537677, "loss": 2.6856, "step": 4250 }, { "crossentropy": 2.7653229236602783, "epoch": 0.1541110788863109, "grad_norm": 0.03690556809306145, "grad_norm_var": 7.552332117199967e-06, "learning_rate": 0.009545060717518873, "loss": 2.7556, "step": 4251 }, { "crossentropy": 2.7025492191314697, "epoch": 0.1541473317865429, "grad_norm": 0.03909990191459656, "grad_norm_var": 6.254996509690893e-06, "learning_rate": 0.00954481850711213, "loss": 2.7072, "step": 4252 }, { "crossentropy": 2.6528334617614746, "epoch": 0.15418358468677495, "grad_norm": 0.03982353210449219, "grad_norm_var": 3.4383847211008045e-06, "learning_rate": 0.009544576235320712, "loss": 2.6217, "step": 4253 }, { "crossentropy": 2.862454652786255, "epoch": 0.15421983758700697, "grad_norm": 0.043305654078722, "grad_norm_var": 5.7439872691688576e-06, "learning_rate": 0.009544333902147898, "loss": 2.813, "step": 4254 }, { "crossentropy": 2.6733553409576416, "epoch": 0.15425609048723898, "grad_norm": 0.04947127401828766, "grad_norm_var": 1.4583556077884854e-05, "learning_rate": 0.009544091507596957, "loss": 2.7457, "step": 4255 }, { "crossentropy": 2.5988776683807373, "epoch": 0.154292343387471, "grad_norm": 0.03805718198418617, "grad_norm_var": 1.3906997704001232e-05, "learning_rate": 0.009543849051671166, "loss": 2.6239, "step": 4256 }, { "crossentropy": 2.521029233932495, "epoch": 0.154328596287703, "grad_norm": 0.03819898143410683, "grad_norm_var": 1.3529976241667223e-05, "learning_rate": 0.009543606534373795, "loss": 2.5247, "step": 4257 }, { "crossentropy": 2.9021308422088623, "epoch": 0.15436484918793503, "grad_norm": 0.04328060895204544, "grad_norm_var": 1.4884483161829026e-05, "learning_rate": 0.009543363955708125, "loss": 2.7571, "step": 4258 }, { "crossentropy": 2.6897103786468506, "epoch": 0.15440110208816704, "grad_norm": 0.04880957677960396, "grad_norm_var": 2.1004350607452492e-05, "learning_rate": 0.00954312131567743, "loss": 2.6368, "step": 4259 }, { "crossentropy": 2.797683000564575, "epoch": 0.15443735498839908, "grad_norm": 0.045689672231674194, "grad_norm_var": 2.253844761394974e-05, "learning_rate": 0.009542878614284985, "loss": 2.7703, "step": 4260 }, { "crossentropy": 2.6602632999420166, "epoch": 0.1544736078886311, "grad_norm": 0.04418034106492996, "grad_norm_var": 2.265365479179234e-05, "learning_rate": 0.009542635851534071, "loss": 2.6646, "step": 4261 }, { "crossentropy": 2.7245266437530518, "epoch": 0.1545098607888631, "grad_norm": 0.046036768704652786, "grad_norm_var": 2.219770264790047e-05, "learning_rate": 0.009542393027427968, "loss": 2.7349, "step": 4262 }, { "crossentropy": 2.817291736602783, "epoch": 0.15454611368909513, "grad_norm": 0.0473063662648201, "grad_norm_var": 2.205522290315929e-05, "learning_rate": 0.009542150141969952, "loss": 2.6494, "step": 4263 }, { "crossentropy": 2.616128444671631, "epoch": 0.15458236658932714, "grad_norm": 0.04650574550032616, "grad_norm_var": 2.087257073401493e-05, "learning_rate": 0.009541907195163306, "loss": 2.7523, "step": 4264 }, { "crossentropy": 2.7849535942077637, "epoch": 0.15461861948955916, "grad_norm": 0.04879181087017059, "grad_norm_var": 1.851734874043498e-05, "learning_rate": 0.009541664187011313, "loss": 2.713, "step": 4265 }, { "crossentropy": 2.5317587852478027, "epoch": 0.15465487238979117, "grad_norm": 0.042969413101673126, "grad_norm_var": 1.7410210209242195e-05, "learning_rate": 0.009541421117517249, "loss": 2.6014, "step": 4266 }, { "crossentropy": 2.8161511421203613, "epoch": 0.1546911252900232, "grad_norm": 0.04579375684261322, "grad_norm_var": 1.4352533143270155e-05, "learning_rate": 0.009541177986684402, "loss": 2.8283, "step": 4267 }, { "crossentropy": 2.724393367767334, "epoch": 0.15472737819025523, "grad_norm": 0.04628315567970276, "grad_norm_var": 1.268555423306524e-05, "learning_rate": 0.009540934794516056, "loss": 2.7217, "step": 4268 }, { "crossentropy": 2.9437780380249023, "epoch": 0.15476363109048724, "grad_norm": 0.051673512905836105, "grad_norm_var": 1.3825870979997044e-05, "learning_rate": 0.009540691541015493, "loss": 2.8137, "step": 4269 }, { "crossentropy": 2.681095838546753, "epoch": 0.15479988399071926, "grad_norm": 0.042550258338451385, "grad_norm_var": 1.4072185533092751e-05, "learning_rate": 0.009540448226186002, "loss": 2.6034, "step": 4270 }, { "crossentropy": 2.7017030715942383, "epoch": 0.15483613689095127, "grad_norm": 0.04040051996707916, "grad_norm_var": 1.4230069246159069e-05, "learning_rate": 0.009540204850030867, "loss": 2.6312, "step": 4271 }, { "crossentropy": 2.6417465209960938, "epoch": 0.15487238979118328, "grad_norm": 0.042731016874313354, "grad_norm_var": 1.1403996140317042e-05, "learning_rate": 0.009539961412553375, "loss": 2.6446, "step": 4272 }, { "crossentropy": 2.75852632522583, "epoch": 0.15490864269141533, "grad_norm": 0.04200059548020363, "grad_norm_var": 8.821886112061969e-06, "learning_rate": 0.009539717913756813, "loss": 2.7355, "step": 4273 }, { "crossentropy": 2.6243255138397217, "epoch": 0.15494489559164734, "grad_norm": 0.03541906923055649, "grad_norm_var": 1.4814667365442974e-05, "learning_rate": 0.009539474353644472, "loss": 2.6285, "step": 4274 }, { "crossentropy": 2.808029890060425, "epoch": 0.15498114849187936, "grad_norm": 0.039261654019355774, "grad_norm_var": 1.5435105076916136e-05, "learning_rate": 0.009539230732219644, "loss": 2.7836, "step": 4275 }, { "crossentropy": 2.783721446990967, "epoch": 0.15501740139211137, "grad_norm": 0.043995533138513565, "grad_norm_var": 1.5283549434631725e-05, "learning_rate": 0.009538987049485612, "loss": 2.706, "step": 4276 }, { "crossentropy": 2.6895081996917725, "epoch": 0.15505365429234338, "grad_norm": 0.042612940073013306, "grad_norm_var": 1.5424218036024355e-05, "learning_rate": 0.009538743305445675, "loss": 2.6938, "step": 4277 }, { "crossentropy": 2.736102819442749, "epoch": 0.1550899071925754, "grad_norm": 0.03671013563871384, "grad_norm_var": 1.835383517216511e-05, "learning_rate": 0.00953849950010312, "loss": 2.7351, "step": 4278 }, { "crossentropy": 2.7890450954437256, "epoch": 0.1551261600928074, "grad_norm": 0.0385771282017231, "grad_norm_var": 1.8613741627565975e-05, "learning_rate": 0.009538255633461242, "loss": 2.6199, "step": 4279 }, { "crossentropy": 2.7127718925476074, "epoch": 0.15516241299303946, "grad_norm": 0.03649343550205231, "grad_norm_var": 2.005523415993596e-05, "learning_rate": 0.009538011705523335, "loss": 2.6726, "step": 4280 }, { "crossentropy": 2.612797498703003, "epoch": 0.15519866589327147, "grad_norm": 0.036331843584775925, "grad_norm_var": 1.891771355165082e-05, "learning_rate": 0.009537767716292693, "loss": 2.7058, "step": 4281 }, { "crossentropy": 2.7017874717712402, "epoch": 0.15523491879350348, "grad_norm": 0.03881130367517471, "grad_norm_var": 1.9176873946712542e-05, "learning_rate": 0.009537523665772613, "loss": 2.6895, "step": 4282 }, { "crossentropy": 2.6963436603546143, "epoch": 0.1552711716937355, "grad_norm": 0.03591803461313248, "grad_norm_var": 1.9260296344333847e-05, "learning_rate": 0.00953727955396639, "loss": 2.6861, "step": 4283 }, { "crossentropy": 2.6864442825317383, "epoch": 0.1553074245939675, "grad_norm": 0.03746580332517624, "grad_norm_var": 1.7450519248142446e-05, "learning_rate": 0.009537035380877319, "loss": 2.696, "step": 4284 }, { "crossentropy": 2.815127372741699, "epoch": 0.15534367749419953, "grad_norm": 0.04084208980202675, "grad_norm_var": 8.010234605911917e-06, "learning_rate": 0.0095367911465087, "loss": 2.8358, "step": 4285 }, { "crossentropy": 2.732811689376831, "epoch": 0.15537993039443154, "grad_norm": 0.03611613065004349, "grad_norm_var": 7.880114330875189e-06, "learning_rate": 0.009536546850863832, "loss": 2.699, "step": 4286 }, { "crossentropy": 2.7497501373291016, "epoch": 0.15541618329466358, "grad_norm": 0.036432743072509766, "grad_norm_var": 8.112799349944908e-06, "learning_rate": 0.009536302493946016, "loss": 2.746, "step": 4287 }, { "crossentropy": 2.6068503856658936, "epoch": 0.1554524361948956, "grad_norm": 0.03737013787031174, "grad_norm_var": 7.0508887131484965e-06, "learning_rate": 0.00953605807575855, "loss": 2.6682, "step": 4288 }, { "crossentropy": 2.625653028488159, "epoch": 0.1554886890951276, "grad_norm": 0.03784792125225067, "grad_norm_var": 6.133635886898525e-06, "learning_rate": 0.009535813596304737, "loss": 2.691, "step": 4289 }, { "crossentropy": 2.5875470638275146, "epoch": 0.15552494199535963, "grad_norm": 0.03897261992096901, "grad_norm_var": 5.634682847742838e-06, "learning_rate": 0.009535569055587875, "loss": 2.6275, "step": 4290 }, { "crossentropy": 2.8388924598693848, "epoch": 0.15556119489559164, "grad_norm": 0.04158709943294525, "grad_norm_var": 6.252240739020437e-06, "learning_rate": 0.009535324453611271, "loss": 2.8132, "step": 4291 }, { "crossentropy": 2.6798338890075684, "epoch": 0.15559744779582366, "grad_norm": 0.041506264358758926, "grad_norm_var": 4.817299373373734e-06, "learning_rate": 0.00953507979037823, "loss": 2.735, "step": 4292 }, { "crossentropy": 2.7063443660736084, "epoch": 0.15563370069605567, "grad_norm": 0.045949675142765045, "grad_norm_var": 7.4098569691778575e-06, "learning_rate": 0.00953483506589205, "loss": 2.6902, "step": 4293 }, { "crossentropy": 2.6137397289276123, "epoch": 0.1556699535962877, "grad_norm": 0.04722334071993828, "grad_norm_var": 1.1727178428431323e-05, "learning_rate": 0.009534590280156045, "loss": 2.5833, "step": 4294 }, { "crossentropy": 2.6048383712768555, "epoch": 0.15570620649651973, "grad_norm": 0.03988228738307953, "grad_norm_var": 1.1722579634397487e-05, "learning_rate": 0.009534345433173514, "loss": 2.6606, "step": 4295 }, { "crossentropy": 2.628410577774048, "epoch": 0.15574245939675174, "grad_norm": 0.03818837180733681, "grad_norm_var": 1.1268566368684517e-05, "learning_rate": 0.009534100524947768, "loss": 2.6444, "step": 4296 }, { "crossentropy": 2.6460227966308594, "epoch": 0.15577871229698376, "grad_norm": 0.03699731454253197, "grad_norm_var": 1.1023755479956886e-05, "learning_rate": 0.009533855555482114, "loss": 2.775, "step": 4297 }, { "crossentropy": 2.650174856185913, "epoch": 0.15581496519721577, "grad_norm": 0.03737357631325722, "grad_norm_var": 1.1274318219290219e-05, "learning_rate": 0.00953361052477986, "loss": 2.7011, "step": 4298 }, { "crossentropy": 2.7746171951293945, "epoch": 0.15585121809744779, "grad_norm": 0.03781796619296074, "grad_norm_var": 1.0629364760785196e-05, "learning_rate": 0.009533365432844316, "loss": 2.749, "step": 4299 }, { "crossentropy": 2.5752339363098145, "epoch": 0.15588747099767983, "grad_norm": 0.03774402290582657, "grad_norm_var": 1.055973140369181e-05, "learning_rate": 0.009533120279678791, "loss": 2.6611, "step": 4300 }, { "crossentropy": 2.8629281520843506, "epoch": 0.15592372389791184, "grad_norm": 0.04090122506022453, "grad_norm_var": 1.0570605091433927e-05, "learning_rate": 0.009532875065286599, "loss": 2.8391, "step": 4301 }, { "crossentropy": 2.679044723510742, "epoch": 0.15595997679814386, "grad_norm": 0.05180787295103073, "grad_norm_var": 1.8891865555807166e-05, "learning_rate": 0.00953262978967105, "loss": 2.7471, "step": 4302 }, { "crossentropy": 2.6422464847564697, "epoch": 0.15599622969837587, "grad_norm": 0.048706263303756714, "grad_norm_var": 2.1691542659256028e-05, "learning_rate": 0.009532384452835456, "loss": 2.6876, "step": 4303 }, { "crossentropy": 2.7008473873138428, "epoch": 0.15603248259860789, "grad_norm": 0.041922446340322495, "grad_norm_var": 2.0636490554711412e-05, "learning_rate": 0.009532139054783133, "loss": 2.6921, "step": 4304 }, { "crossentropy": 2.6474640369415283, "epoch": 0.1560687354988399, "grad_norm": 0.03691281005740166, "grad_norm_var": 2.1149826558858332e-05, "learning_rate": 0.009531893595517394, "loss": 2.6744, "step": 4305 }, { "crossentropy": 2.6673495769500732, "epoch": 0.15610498839907191, "grad_norm": 0.038179222494363785, "grad_norm_var": 2.145318020306264e-05, "learning_rate": 0.009531648075041553, "loss": 2.7077, "step": 4306 }, { "crossentropy": 2.6656336784362793, "epoch": 0.15614124129930396, "grad_norm": 0.03609535098075867, "grad_norm_var": 2.321485445877236e-05, "learning_rate": 0.00953140249335893, "loss": 2.6691, "step": 4307 }, { "crossentropy": 2.7531898021698, "epoch": 0.15617749419953597, "grad_norm": 0.036097895354032516, "grad_norm_var": 2.473237737599087e-05, "learning_rate": 0.009531156850472839, "loss": 2.7474, "step": 4308 }, { "crossentropy": 2.6659703254699707, "epoch": 0.15621374709976799, "grad_norm": 0.040624551475048065, "grad_norm_var": 2.2803940054094465e-05, "learning_rate": 0.009530911146386599, "loss": 2.6376, "step": 4309 }, { "crossentropy": 2.927095890045166, "epoch": 0.15625, "grad_norm": 0.03914586827158928, "grad_norm_var": 1.9538090704557023e-05, "learning_rate": 0.009530665381103526, "loss": 2.7696, "step": 4310 }, { "crossentropy": 2.6063263416290283, "epoch": 0.15628625290023201, "grad_norm": 0.04245489463210106, "grad_norm_var": 1.994572261843208e-05, "learning_rate": 0.009530419554626943, "loss": 2.674, "step": 4311 }, { "crossentropy": 2.6917672157287598, "epoch": 0.15632250580046403, "grad_norm": 0.039808548986911774, "grad_norm_var": 1.9705337268937962e-05, "learning_rate": 0.009530173666960169, "loss": 2.6425, "step": 4312 }, { "crossentropy": 2.6926333904266357, "epoch": 0.15635875870069604, "grad_norm": 0.0363212525844574, "grad_norm_var": 2.0019161071346767e-05, "learning_rate": 0.009529927718106524, "loss": 2.6858, "step": 4313 }, { "crossentropy": 2.6940104961395264, "epoch": 0.15639501160092809, "grad_norm": 0.0381171777844429, "grad_norm_var": 1.9781459343863143e-05, "learning_rate": 0.00952968170806933, "loss": 2.6993, "step": 4314 }, { "crossentropy": 2.7427711486816406, "epoch": 0.1564312645011601, "grad_norm": 0.03989846631884575, "grad_norm_var": 1.940062103319799e-05, "learning_rate": 0.009529435636851913, "loss": 2.7819, "step": 4315 }, { "crossentropy": 2.7965011596679688, "epoch": 0.15646751740139211, "grad_norm": 0.03812876716256142, "grad_norm_var": 1.9278952327435722e-05, "learning_rate": 0.009529189504457592, "loss": 2.7205, "step": 4316 }, { "crossentropy": 2.604235887527466, "epoch": 0.15650377030162413, "grad_norm": 0.036104824393987656, "grad_norm_var": 2.034519290664786e-05, "learning_rate": 0.009528943310889696, "loss": 2.6286, "step": 4317 }, { "crossentropy": 2.7774977684020996, "epoch": 0.15654002320185614, "grad_norm": 0.04005548730492592, "grad_norm_var": 1.0506794736436554e-05, "learning_rate": 0.009528697056151544, "loss": 2.7143, "step": 4318 }, { "crossentropy": 2.7237331867218018, "epoch": 0.15657627610208816, "grad_norm": 0.039910346269607544, "grad_norm_var": 4.294164674546972e-06, "learning_rate": 0.009528450740246467, "loss": 2.5893, "step": 4319 }, { "crossentropy": 2.6119863986968994, "epoch": 0.15661252900232017, "grad_norm": 0.038335029035806656, "grad_norm_var": 3.574421103375025e-06, "learning_rate": 0.009528204363177792, "loss": 2.6415, "step": 4320 }, { "crossentropy": 2.8763163089752197, "epoch": 0.15664878190255221, "grad_norm": 0.03815968707203865, "grad_norm_var": 3.4057398965353186e-06, "learning_rate": 0.009527957924948845, "loss": 2.7738, "step": 4321 }, { "crossentropy": 2.7738037109375, "epoch": 0.15668503480278423, "grad_norm": 0.051270127296447754, "grad_norm_var": 1.339977034637372e-05, "learning_rate": 0.009527711425562954, "loss": 2.781, "step": 4322 }, { "crossentropy": 2.663390636444092, "epoch": 0.15672128770301624, "grad_norm": 0.03602416068315506, "grad_norm_var": 1.3431531059065506e-05, "learning_rate": 0.00952746486502345, "loss": 2.6387, "step": 4323 }, { "crossentropy": 2.650167942047119, "epoch": 0.15675754060324826, "grad_norm": 0.036360472440719604, "grad_norm_var": 1.3320107717795793e-05, "learning_rate": 0.009527218243333661, "loss": 2.6809, "step": 4324 }, { "crossentropy": 2.706547498703003, "epoch": 0.15679379350348027, "grad_norm": 0.09726922959089279, "grad_norm_var": 0.00022295651390766836, "learning_rate": 0.00952697156049692, "loss": 2.7998, "step": 4325 }, { "crossentropy": 2.730888843536377, "epoch": 0.1568300464037123, "grad_norm": 0.0368812121450901, "grad_norm_var": 0.0002244288304250032, "learning_rate": 0.00952672481651656, "loss": 2.7339, "step": 4326 }, { "crossentropy": 2.669982433319092, "epoch": 0.15686629930394433, "grad_norm": 0.037632036954164505, "grad_norm_var": 0.00022611654132815017, "learning_rate": 0.00952647801139591, "loss": 2.7697, "step": 4327 }, { "crossentropy": 2.7211639881134033, "epoch": 0.15690255220417634, "grad_norm": 0.04250437393784523, "grad_norm_var": 0.0002255971152783008, "learning_rate": 0.009526231145138304, "loss": 2.6989, "step": 4328 }, { "crossentropy": 2.5482821464538574, "epoch": 0.15693880510440836, "grad_norm": 0.04426846280694008, "grad_norm_var": 0.00022280045670512727, "learning_rate": 0.009525984217747079, "loss": 2.6785, "step": 4329 }, { "crossentropy": 2.836554527282715, "epoch": 0.15697505800464037, "grad_norm": 0.0412118062376976, "grad_norm_var": 0.00022130896691320475, "learning_rate": 0.009525737229225568, "loss": 2.7343, "step": 4330 }, { "crossentropy": 2.7178964614868164, "epoch": 0.1570113109048724, "grad_norm": 0.03784378245472908, "grad_norm_var": 0.00022252549655933906, "learning_rate": 0.00952549017957711, "loss": 2.7358, "step": 4331 }, { "crossentropy": 2.4650657176971436, "epoch": 0.1570475638051044, "grad_norm": 0.03909608721733093, "grad_norm_var": 0.00022192378615659418, "learning_rate": 0.009525243068805036, "loss": 2.5704, "step": 4332 }, { "crossentropy": 2.7844650745391846, "epoch": 0.15708381670533642, "grad_norm": 0.03684544563293457, "grad_norm_var": 0.00022124676406159796, "learning_rate": 0.00952499589691269, "loss": 2.738, "step": 4333 }, { "crossentropy": 2.654585361480713, "epoch": 0.15712006960556846, "grad_norm": 0.037495698779821396, "grad_norm_var": 0.00022278217564977165, "learning_rate": 0.009524748663903406, "loss": 2.7413, "step": 4334 }, { "crossentropy": 2.7281394004821777, "epoch": 0.15715632250580047, "grad_norm": 0.036353949457407, "grad_norm_var": 0.0002251298537821309, "learning_rate": 0.009524501369780525, "loss": 2.6579, "step": 4335 }, { "crossentropy": 2.752319574356079, "epoch": 0.1571925754060325, "grad_norm": 0.03772621601819992, "grad_norm_var": 0.00022552942382734545, "learning_rate": 0.009524254014547387, "loss": 2.686, "step": 4336 }, { "crossentropy": 2.7422637939453125, "epoch": 0.1572288283062645, "grad_norm": 0.03643478825688362, "grad_norm_var": 0.00022681338814816872, "learning_rate": 0.009524006598207332, "loss": 2.6682, "step": 4337 }, { "crossentropy": 2.6843631267547607, "epoch": 0.15726508120649652, "grad_norm": 0.038439374417066574, "grad_norm_var": 0.00022265691491204676, "learning_rate": 0.009523759120763703, "loss": 2.7193, "step": 4338 }, { "crossentropy": 2.7514519691467285, "epoch": 0.15730133410672853, "grad_norm": 0.041464272886514664, "grad_norm_var": 0.00022015447759805387, "learning_rate": 0.009523511582219842, "loss": 2.7579, "step": 4339 }, { "crossentropy": 2.5831058025360107, "epoch": 0.15733758700696054, "grad_norm": 0.037380117923021317, "grad_norm_var": 0.00021940323416873338, "learning_rate": 0.009523263982579093, "loss": 2.545, "step": 4340 }, { "crossentropy": 2.913315773010254, "epoch": 0.1573738399071926, "grad_norm": 0.04247959703207016, "grad_norm_var": 6.3909252193687534e-06, "learning_rate": 0.009523016321844798, "loss": 2.8252, "step": 4341 }, { "crossentropy": 2.566516160964966, "epoch": 0.1574100928074246, "grad_norm": 0.03797536715865135, "grad_norm_var": 6.156122605835828e-06, "learning_rate": 0.009522768600020307, "loss": 2.5494, "step": 4342 }, { "crossentropy": 2.67083477973938, "epoch": 0.15744634570765662, "grad_norm": 0.037159401923418045, "grad_norm_var": 6.260825241890568e-06, "learning_rate": 0.00952252081710896, "loss": 2.6217, "step": 4343 }, { "crossentropy": 2.770890474319458, "epoch": 0.15748259860788863, "grad_norm": 0.03717765212059021, "grad_norm_var": 5.575416980955344e-06, "learning_rate": 0.009522272973114105, "loss": 2.7484, "step": 4344 }, { "crossentropy": 2.523204803466797, "epoch": 0.15751885150812064, "grad_norm": 0.03647434711456299, "grad_norm_var": 3.5952233655183717e-06, "learning_rate": 0.009522025068039093, "loss": 2.526, "step": 4345 }, { "crossentropy": 2.757117748260498, "epoch": 0.15755510440835266, "grad_norm": 0.03832194209098816, "grad_norm_var": 2.9653048892560573e-06, "learning_rate": 0.009521777101887268, "loss": 2.6859, "step": 4346 }, { "crossentropy": 2.649282455444336, "epoch": 0.15759135730858467, "grad_norm": 0.03777956962585449, "grad_norm_var": 2.9672575571957256e-06, "learning_rate": 0.009521529074661984, "loss": 2.6967, "step": 4347 }, { "crossentropy": 2.650714874267578, "epoch": 0.15762761020881672, "grad_norm": 0.041692666709423065, "grad_norm_var": 3.755057914488676e-06, "learning_rate": 0.009521280986366586, "loss": 2.7026, "step": 4348 }, { "crossentropy": 2.8576560020446777, "epoch": 0.15766386310904873, "grad_norm": 0.03941158205270767, "grad_norm_var": 3.7031523764772163e-06, "learning_rate": 0.009521032837004426, "loss": 2.7821, "step": 4349 }, { "crossentropy": 2.6329185962677, "epoch": 0.15770011600928074, "grad_norm": 0.04156937450170517, "grad_norm_var": 4.270656305057664e-06, "learning_rate": 0.009520784626578859, "loss": 2.6503, "step": 4350 }, { "crossentropy": 2.727160692214966, "epoch": 0.15773636890951276, "grad_norm": 0.042034074664115906, "grad_norm_var": 4.574728132139549e-06, "learning_rate": 0.009520536355093234, "loss": 2.6726, "step": 4351 }, { "crossentropy": 2.6176650524139404, "epoch": 0.15777262180974477, "grad_norm": 0.05157285928726196, "grad_norm_var": 1.4261486248378965e-05, "learning_rate": 0.009520288022550906, "loss": 2.708, "step": 4352 }, { "crossentropy": 2.704270839691162, "epoch": 0.1578088747099768, "grad_norm": 0.03881525993347168, "grad_norm_var": 1.3536298601414474e-05, "learning_rate": 0.009520039628955227, "loss": 2.6703, "step": 4353 }, { "crossentropy": 2.689100742340088, "epoch": 0.15784512761020883, "grad_norm": 0.038683414459228516, "grad_norm_var": 1.3489753721797568e-05, "learning_rate": 0.009519791174309556, "loss": 2.7003, "step": 4354 }, { "crossentropy": 2.736194372177124, "epoch": 0.15788138051044084, "grad_norm": 0.03623441234230995, "grad_norm_var": 1.4177789266807636e-05, "learning_rate": 0.009519542658617245, "loss": 2.7267, "step": 4355 }, { "crossentropy": 2.7327301502227783, "epoch": 0.15791763341067286, "grad_norm": 0.034956928342580795, "grad_norm_var": 1.528546302801322e-05, "learning_rate": 0.00951929408188165, "loss": 2.6816, "step": 4356 }, { "crossentropy": 2.598428249359131, "epoch": 0.15795388631090487, "grad_norm": 0.03973526135087013, "grad_norm_var": 1.4673645735713224e-05, "learning_rate": 0.009519045444106132, "loss": 2.625, "step": 4357 }, { "crossentropy": 2.6104800701141357, "epoch": 0.1579901392111369, "grad_norm": 0.038437020033597946, "grad_norm_var": 1.460237483165838e-05, "learning_rate": 0.009518796745294046, "loss": 2.6222, "step": 4358 }, { "crossentropy": 2.6479125022888184, "epoch": 0.1580263921113689, "grad_norm": 0.03623777627944946, "grad_norm_var": 1.4928150512140645e-05, "learning_rate": 0.009518547985448754, "loss": 2.6379, "step": 4359 }, { "crossentropy": 2.772019147872925, "epoch": 0.15806264501160092, "grad_norm": 0.03509379178285599, "grad_norm_var": 1.5795047865400463e-05, "learning_rate": 0.009518299164573614, "loss": 2.717, "step": 4360 }, { "crossentropy": 2.594381332397461, "epoch": 0.15809889791183296, "grad_norm": 0.03687407448887825, "grad_norm_var": 1.5660263887221488e-05, "learning_rate": 0.009518050282671987, "loss": 2.6498, "step": 4361 }, { "crossentropy": 2.651259422302246, "epoch": 0.15813515081206497, "grad_norm": 0.037292808294296265, "grad_norm_var": 1.584908796660496e-05, "learning_rate": 0.009517801339747235, "loss": 2.6842, "step": 4362 }, { "crossentropy": 2.7676992416381836, "epoch": 0.158171403712297, "grad_norm": 0.03718673437833786, "grad_norm_var": 1.597948220029051e-05, "learning_rate": 0.009517552335802718, "loss": 2.7992, "step": 4363 }, { "crossentropy": 2.7369778156280518, "epoch": 0.158207656612529, "grad_norm": 0.03782873973250389, "grad_norm_var": 1.5584228998896667e-05, "learning_rate": 0.009517303270841802, "loss": 2.6523, "step": 4364 }, { "crossentropy": 2.8930845260620117, "epoch": 0.15824390951276102, "grad_norm": 0.04335429519414902, "grad_norm_var": 1.6839048365785426e-05, "learning_rate": 0.00951705414486785, "loss": 2.8633, "step": 4365 }, { "crossentropy": 2.7412703037261963, "epoch": 0.15828016241299303, "grad_norm": 0.04447949305176735, "grad_norm_var": 1.8319063267551763e-05, "learning_rate": 0.009516804957884229, "loss": 2.8117, "step": 4366 }, { "crossentropy": 2.609990358352661, "epoch": 0.15831641531322505, "grad_norm": 0.04221714660525322, "grad_norm_var": 1.8387869774320192e-05, "learning_rate": 0.009516555709894299, "loss": 2.6547, "step": 4367 }, { "crossentropy": 2.730982780456543, "epoch": 0.1583526682134571, "grad_norm": 0.0428970642387867, "grad_norm_var": 8.90976104548722e-06, "learning_rate": 0.009516306400901434, "loss": 2.7404, "step": 4368 }, { "crossentropy": 2.7329115867614746, "epoch": 0.1583889211136891, "grad_norm": 0.04560619220137596, "grad_norm_var": 1.1832800721326559e-05, "learning_rate": 0.009516057030908994, "loss": 2.5716, "step": 4369 }, { "crossentropy": 2.700620174407959, "epoch": 0.15842517401392112, "grad_norm": 0.04841604456305504, "grad_norm_var": 1.7089573024772244e-05, "learning_rate": 0.009515807599920352, "loss": 2.6595, "step": 4370 }, { "crossentropy": 2.7351245880126953, "epoch": 0.15846142691415313, "grad_norm": 0.04582379385828972, "grad_norm_var": 1.8274115438301982e-05, "learning_rate": 0.009515558107938873, "loss": 2.7397, "step": 4371 }, { "crossentropy": 2.595343589782715, "epoch": 0.15849767981438515, "grad_norm": 0.04684833809733391, "grad_norm_var": 1.847817819795542e-05, "learning_rate": 0.009515308554967931, "loss": 2.6218, "step": 4372 }, { "crossentropy": 2.677661657333374, "epoch": 0.15853393271461716, "grad_norm": 0.04243455454707146, "grad_norm_var": 1.8425998755803156e-05, "learning_rate": 0.009515058941010896, "loss": 2.7258, "step": 4373 }, { "crossentropy": 2.7318003177642822, "epoch": 0.1585701856148492, "grad_norm": 0.0374474935233593, "grad_norm_var": 1.886680804196779e-05, "learning_rate": 0.009514809266071136, "loss": 2.6356, "step": 4374 }, { "crossentropy": 2.709204912185669, "epoch": 0.15860643851508122, "grad_norm": 0.03854910656809807, "grad_norm_var": 1.765530610410293e-05, "learning_rate": 0.009514559530152027, "loss": 2.6583, "step": 4375 }, { "crossentropy": 2.8341479301452637, "epoch": 0.15864269141531323, "grad_norm": 0.03995170816779137, "grad_norm_var": 1.5047632061512307e-05, "learning_rate": 0.00951430973325694, "loss": 2.8628, "step": 4376 }, { "crossentropy": 2.5803377628326416, "epoch": 0.15867894431554525, "grad_norm": 0.04121551290154457, "grad_norm_var": 1.3431835238370862e-05, "learning_rate": 0.009514059875389249, "loss": 2.7239, "step": 4377 }, { "crossentropy": 2.8390607833862305, "epoch": 0.15871519721577726, "grad_norm": 0.045600276440382004, "grad_norm_var": 1.2562453011900435e-05, "learning_rate": 0.00951380995655233, "loss": 2.8016, "step": 4378 }, { "crossentropy": 2.525893449783325, "epoch": 0.15875145011600927, "grad_norm": 0.04169631749391556, "grad_norm_var": 1.0644118795390585e-05, "learning_rate": 0.009513559976749556, "loss": 2.6055, "step": 4379 }, { "crossentropy": 2.672786235809326, "epoch": 0.1587877030162413, "grad_norm": 0.038400717079639435, "grad_norm_var": 1.0287508024619564e-05, "learning_rate": 0.009513309935984305, "loss": 2.669, "step": 4380 }, { "crossentropy": 2.7033162117004395, "epoch": 0.15882395591647333, "grad_norm": 0.05837909132242203, "grad_norm_var": 2.5489676821433862e-05, "learning_rate": 0.009513059834259955, "loss": 2.7175, "step": 4381 }, { "crossentropy": 2.670414924621582, "epoch": 0.15886020881670534, "grad_norm": 0.03766883537173271, "grad_norm_var": 2.7724190859213827e-05, "learning_rate": 0.009512809671579884, "loss": 2.6466, "step": 4382 }, { "crossentropy": 2.7783079147338867, "epoch": 0.15889646171693736, "grad_norm": 0.03814711049199104, "grad_norm_var": 2.9359094456079573e-05, "learning_rate": 0.00951255944794747, "loss": 2.6979, "step": 4383 }, { "crossentropy": 2.712472438812256, "epoch": 0.15893271461716937, "grad_norm": 0.04200535640120506, "grad_norm_var": 2.9429070766640434e-05, "learning_rate": 0.009512309163366092, "loss": 2.6966, "step": 4384 }, { "crossentropy": 2.8164942264556885, "epoch": 0.1589689675174014, "grad_norm": 0.04176098853349686, "grad_norm_var": 2.9023094205563963e-05, "learning_rate": 0.009512058817839128, "loss": 2.6767, "step": 4385 }, { "crossentropy": 2.856735944747925, "epoch": 0.1590052204176334, "grad_norm": 0.04522394388914108, "grad_norm_var": 2.7257577568923162e-05, "learning_rate": 0.009511808411369966, "loss": 2.7634, "step": 4386 }, { "crossentropy": 2.6025567054748535, "epoch": 0.15904147331786542, "grad_norm": 0.04396429657936096, "grad_norm_var": 2.666747652911213e-05, "learning_rate": 0.009511557943961984, "loss": 2.5857, "step": 4387 }, { "crossentropy": 2.6415176391601562, "epoch": 0.15907772621809746, "grad_norm": 0.03944588452577591, "grad_norm_var": 2.575689082299533e-05, "learning_rate": 0.009511307415618564, "loss": 2.6447, "step": 4388 }, { "crossentropy": 2.6937320232391357, "epoch": 0.15911397911832947, "grad_norm": 0.03713858500123024, "grad_norm_var": 2.7198193025043614e-05, "learning_rate": 0.009511056826343092, "loss": 2.7122, "step": 4389 }, { "crossentropy": 2.5854058265686035, "epoch": 0.1591502320185615, "grad_norm": 0.03763703629374504, "grad_norm_var": 2.70939227716121e-05, "learning_rate": 0.009510806176138953, "loss": 2.5673, "step": 4390 }, { "crossentropy": 2.7651965618133545, "epoch": 0.1591864849187935, "grad_norm": 0.03965328261256218, "grad_norm_var": 2.6710058347059694e-05, "learning_rate": 0.00951055546500953, "loss": 2.7676, "step": 4391 }, { "crossentropy": 2.7326624393463135, "epoch": 0.15922273781902552, "grad_norm": 0.03678835928440094, "grad_norm_var": 2.8091037520679383e-05, "learning_rate": 0.009510304692958209, "loss": 2.7194, "step": 4392 }, { "crossentropy": 2.7030043601989746, "epoch": 0.15925899071925753, "grad_norm": 0.03573830798268318, "grad_norm_var": 3.0206901165087904e-05, "learning_rate": 0.009510053859988382, "loss": 2.6471, "step": 4393 }, { "crossentropy": 2.7782700061798096, "epoch": 0.15929524361948955, "grad_norm": 0.03526825085282326, "grad_norm_var": 3.0821153704245026e-05, "learning_rate": 0.00950980296610343, "loss": 2.7556, "step": 4394 }, { "crossentropy": 2.6940860748291016, "epoch": 0.1593314965197216, "grad_norm": 0.036482878029346466, "grad_norm_var": 3.1728121856603826e-05, "learning_rate": 0.009509552011306746, "loss": 2.6739, "step": 4395 }, { "crossentropy": 2.748903512954712, "epoch": 0.1593677494199536, "grad_norm": 0.04362840577960014, "grad_norm_var": 3.216011253408807e-05, "learning_rate": 0.00950930099560172, "loss": 2.7866, "step": 4396 }, { "crossentropy": 2.7213706970214844, "epoch": 0.15940400232018562, "grad_norm": 0.0419645719230175, "grad_norm_var": 9.996961334144544e-06, "learning_rate": 0.009509049918991739, "loss": 2.6672, "step": 4397 }, { "crossentropy": 2.740363359451294, "epoch": 0.15944025522041763, "grad_norm": 0.03871899098157883, "grad_norm_var": 9.804970480702055e-06, "learning_rate": 0.009508798781480197, "loss": 2.6698, "step": 4398 }, { "crossentropy": 2.6869893074035645, "epoch": 0.15947650812064965, "grad_norm": 0.03640584647655487, "grad_norm_var": 1.0331295990413196e-05, "learning_rate": 0.009508547583070485, "loss": 2.667, "step": 4399 }, { "crossentropy": 2.7369489669799805, "epoch": 0.15951276102088166, "grad_norm": 0.036344677209854126, "grad_norm_var": 1.043480984300223e-05, "learning_rate": 0.009508296323765998, "loss": 2.7051, "step": 4400 }, { "crossentropy": 2.7696967124938965, "epoch": 0.1595490139211137, "grad_norm": 0.036538805812597275, "grad_norm_var": 1.0310994494207535e-05, "learning_rate": 0.009508045003570125, "loss": 2.7263, "step": 4401 }, { "crossentropy": 2.6748859882354736, "epoch": 0.15958526682134572, "grad_norm": 0.03892092406749725, "grad_norm_var": 7.402763797881296e-06, "learning_rate": 0.009507793622486263, "loss": 2.7156, "step": 4402 }, { "crossentropy": 2.6457884311676025, "epoch": 0.15962151972157773, "grad_norm": 0.044482577592134476, "grad_norm_var": 7.80303546093989e-06, "learning_rate": 0.009507542180517808, "loss": 2.6982, "step": 4403 }, { "crossentropy": 2.610978603363037, "epoch": 0.15965777262180975, "grad_norm": 0.04098309949040413, "grad_norm_var": 8.155389216707443e-06, "learning_rate": 0.009507290677668156, "loss": 2.7074, "step": 4404 }, { "crossentropy": 2.6896109580993652, "epoch": 0.15969402552204176, "grad_norm": 0.041006214916706085, "grad_norm_var": 8.365852261016324e-06, "learning_rate": 0.009507039113940702, "loss": 2.6864, "step": 4405 }, { "crossentropy": 2.6653220653533936, "epoch": 0.15973027842227377, "grad_norm": 0.041578635573387146, "grad_norm_var": 8.733483451837248e-06, "learning_rate": 0.009506787489338846, "loss": 2.5869, "step": 4406 }, { "crossentropy": 2.4223575592041016, "epoch": 0.1597665313225058, "grad_norm": 0.040079496800899506, "grad_norm_var": 8.780172728600246e-06, "learning_rate": 0.009506535803865986, "loss": 2.4948, "step": 4407 }, { "crossentropy": 2.5431041717529297, "epoch": 0.15980278422273783, "grad_norm": 0.04135316237807274, "grad_norm_var": 8.70103916534859e-06, "learning_rate": 0.009506284057525521, "loss": 2.6775, "step": 4408 }, { "crossentropy": 2.6524107456207275, "epoch": 0.15983903712296985, "grad_norm": 0.03587332367897034, "grad_norm_var": 8.637278789893656e-06, "learning_rate": 0.00950603225032085, "loss": 2.696, "step": 4409 }, { "crossentropy": 2.6932990550994873, "epoch": 0.15987529002320186, "grad_norm": 0.041465725749731064, "grad_norm_var": 7.66340829774405e-06, "learning_rate": 0.009505780382255377, "loss": 2.6514, "step": 4410 }, { "crossentropy": 2.6755480766296387, "epoch": 0.15991154292343387, "grad_norm": 0.04252319782972336, "grad_norm_var": 7.321179407787127e-06, "learning_rate": 0.009505528453332501, "loss": 2.6926, "step": 4411 }, { "crossentropy": 2.743779420852661, "epoch": 0.1599477958236659, "grad_norm": 0.04147721081972122, "grad_norm_var": 6.603166568561192e-06, "learning_rate": 0.009505276463555626, "loss": 2.7826, "step": 4412 }, { "crossentropy": 2.551051616668701, "epoch": 0.1599840487238979, "grad_norm": 0.037007302045822144, "grad_norm_var": 6.828839447426717e-06, "learning_rate": 0.009505024412928156, "loss": 2.6419, "step": 4413 }, { "crossentropy": 2.469345808029175, "epoch": 0.16002030162412992, "grad_norm": 0.037332259118556976, "grad_norm_var": 7.125320693853315e-06, "learning_rate": 0.009504772301453494, "loss": 2.5242, "step": 4414 }, { "crossentropy": 2.7061266899108887, "epoch": 0.16005655452436196, "grad_norm": 0.0373515710234642, "grad_norm_var": 6.78024171192933e-06, "learning_rate": 0.009504520129135045, "loss": 2.6121, "step": 4415 }, { "crossentropy": 2.684128761291504, "epoch": 0.16009280742459397, "grad_norm": 0.041767898947000504, "grad_norm_var": 6.232081181449985e-06, "learning_rate": 0.009504267895976217, "loss": 2.6442, "step": 4416 }, { "crossentropy": 2.9067940711975098, "epoch": 0.160129060324826, "grad_norm": 0.049280691891908646, "grad_norm_var": 1.052648252441423e-05, "learning_rate": 0.009504015601980416, "loss": 2.8675, "step": 4417 }, { "crossentropy": 2.5570688247680664, "epoch": 0.160165313225058, "grad_norm": 0.04658246040344238, "grad_norm_var": 1.2295851495887587e-05, "learning_rate": 0.00950376324715105, "loss": 2.6277, "step": 4418 }, { "crossentropy": 2.770132541656494, "epoch": 0.16020156612529002, "grad_norm": 0.040520746260881424, "grad_norm_var": 1.1574049465921249e-05, "learning_rate": 0.009503510831491525, "loss": 2.8236, "step": 4419 }, { "crossentropy": 2.6885130405426025, "epoch": 0.16023781902552203, "grad_norm": 0.03576492518186569, "grad_norm_var": 1.3295599633493006e-05, "learning_rate": 0.009503258355005251, "loss": 2.6891, "step": 4420 }, { "crossentropy": 2.7787041664123535, "epoch": 0.16027407192575405, "grad_norm": 0.03758715093135834, "grad_norm_var": 1.3879928035275396e-05, "learning_rate": 0.00950300581769564, "loss": 2.6299, "step": 4421 }, { "crossentropy": 2.667994976043701, "epoch": 0.1603103248259861, "grad_norm": 0.040022484958171844, "grad_norm_var": 1.3801585176704118e-05, "learning_rate": 0.009502753219566102, "loss": 2.7378, "step": 4422 }, { "crossentropy": 2.7342586517333984, "epoch": 0.1603465777262181, "grad_norm": 0.04122929275035858, "grad_norm_var": 1.383900921833227e-05, "learning_rate": 0.00950250056062005, "loss": 2.715, "step": 4423 }, { "crossentropy": 2.870731830596924, "epoch": 0.16038283062645012, "grad_norm": 0.043367449194192886, "grad_norm_var": 1.4336174570446445e-05, "learning_rate": 0.009502247840860893, "loss": 2.7708, "step": 4424 }, { "crossentropy": 2.6742286682128906, "epoch": 0.16041908352668213, "grad_norm": 0.04547196626663208, "grad_norm_var": 1.4080955261690173e-05, "learning_rate": 0.009501995060292048, "loss": 2.7158, "step": 4425 }, { "crossentropy": 2.7234935760498047, "epoch": 0.16045533642691415, "grad_norm": 0.04011303186416626, "grad_norm_var": 1.4142344179611983e-05, "learning_rate": 0.009501742218916927, "loss": 2.6823, "step": 4426 }, { "crossentropy": 2.7117226123809814, "epoch": 0.16049158932714616, "grad_norm": 0.03502466157078743, "grad_norm_var": 1.622115689755051e-05, "learning_rate": 0.009501489316738944, "loss": 2.6595, "step": 4427 }, { "crossentropy": 2.92757511138916, "epoch": 0.1605278422273782, "grad_norm": 0.03603179380297661, "grad_norm_var": 1.7451202078304823e-05, "learning_rate": 0.00950123635376152, "loss": 2.789, "step": 4428 }, { "crossentropy": 2.6685738563537598, "epoch": 0.16056409512761022, "grad_norm": 0.0379914827644825, "grad_norm_var": 1.708248288848907e-05, "learning_rate": 0.009500983329988067, "loss": 2.7198, "step": 4429 }, { "crossentropy": 2.6899943351745605, "epoch": 0.16060034802784223, "grad_norm": 0.037350550293922424, "grad_norm_var": 1.7075168470427094e-05, "learning_rate": 0.009500730245422003, "loss": 2.6822, "step": 4430 }, { "crossentropy": 2.802830696105957, "epoch": 0.16063660092807425, "grad_norm": 0.03529965505003929, "grad_norm_var": 1.8156227095929752e-05, "learning_rate": 0.009500477100066747, "loss": 2.8054, "step": 4431 }, { "crossentropy": 2.60512375831604, "epoch": 0.16067285382830626, "grad_norm": 0.03853625804185867, "grad_norm_var": 1.813891535095511e-05, "learning_rate": 0.009500223893925717, "loss": 2.6771, "step": 4432 }, { "crossentropy": 2.7479982376098633, "epoch": 0.16070910672853828, "grad_norm": 0.038871608674526215, "grad_norm_var": 1.2045408116920925e-05, "learning_rate": 0.009499970627002334, "loss": 2.7272, "step": 4433 }, { "crossentropy": 2.7435879707336426, "epoch": 0.1607453596287703, "grad_norm": 0.04063688963651657, "grad_norm_var": 8.52949148992995e-06, "learning_rate": 0.009499717299300022, "loss": 2.6822, "step": 4434 }, { "crossentropy": 2.6315979957580566, "epoch": 0.16078161252900233, "grad_norm": 0.04152810201048851, "grad_norm_var": 8.798683482102767e-06, "learning_rate": 0.009499463910822195, "loss": 2.6619, "step": 4435 }, { "crossentropy": 2.7731032371520996, "epoch": 0.16081786542923435, "grad_norm": 0.0388631708920002, "grad_norm_var": 8.040861430600767e-06, "learning_rate": 0.009499210461572283, "loss": 2.7543, "step": 4436 }, { "crossentropy": 2.6754469871520996, "epoch": 0.16085411832946636, "grad_norm": 0.03676077723503113, "grad_norm_var": 8.266247532696889e-06, "learning_rate": 0.009498956951553705, "loss": 2.709, "step": 4437 }, { "crossentropy": 2.8334803581237793, "epoch": 0.16089037122969838, "grad_norm": 0.036158688366413116, "grad_norm_var": 8.772336951451351e-06, "learning_rate": 0.009498703380769885, "loss": 2.7409, "step": 4438 }, { "crossentropy": 2.650712251663208, "epoch": 0.1609266241299304, "grad_norm": 0.04007548466324806, "grad_norm_var": 8.505232831723432e-06, "learning_rate": 0.00949844974922425, "loss": 2.628, "step": 4439 }, { "crossentropy": 2.6076433658599854, "epoch": 0.1609628770301624, "grad_norm": 0.03975173830986023, "grad_norm_var": 7.1589896195003425e-06, "learning_rate": 0.009498196056920222, "loss": 2.6701, "step": 4440 }, { "crossentropy": 2.5474231243133545, "epoch": 0.16099912993039442, "grad_norm": 0.03824947029352188, "grad_norm_var": 3.853681685686503e-06, "learning_rate": 0.009497942303861234, "loss": 2.6606, "step": 4441 }, { "crossentropy": 2.6337890625, "epoch": 0.16103538283062646, "grad_norm": 0.03904656320810318, "grad_norm_var": 3.653126643188229e-06, "learning_rate": 0.009497688490050707, "loss": 2.6824, "step": 4442 }, { "crossentropy": 2.610867500305176, "epoch": 0.16107163573085848, "grad_norm": 0.04006091505289078, "grad_norm_var": 3.149064460196647e-06, "learning_rate": 0.009497434615492071, "loss": 2.6928, "step": 4443 }, { "crossentropy": 2.7341744899749756, "epoch": 0.1611078886310905, "grad_norm": 0.04094317555427551, "grad_norm_var": 3.0725662386214708e-06, "learning_rate": 0.009497180680188757, "loss": 2.7688, "step": 4444 }, { "crossentropy": 2.7468504905700684, "epoch": 0.1611441415313225, "grad_norm": 0.03960082307457924, "grad_norm_var": 3.0700080112854833e-06, "learning_rate": 0.009496926684144192, "loss": 2.6977, "step": 4445 }, { "crossentropy": 2.681856393814087, "epoch": 0.16118039443155452, "grad_norm": 0.03884776681661606, "grad_norm_var": 2.909107869282906e-06, "learning_rate": 0.00949667262736181, "loss": 2.6976, "step": 4446 }, { "crossentropy": 2.738816499710083, "epoch": 0.16121664733178653, "grad_norm": 0.042229097336530685, "grad_norm_var": 2.5357386090195274e-06, "learning_rate": 0.009496418509845037, "loss": 2.6844, "step": 4447 }, { "crossentropy": 2.7455272674560547, "epoch": 0.16125290023201855, "grad_norm": 0.04132706671953201, "grad_norm_var": 2.7066910966261697e-06, "learning_rate": 0.009496164331597308, "loss": 2.6792, "step": 4448 }, { "crossentropy": 2.7060928344726562, "epoch": 0.1612891531322506, "grad_norm": 0.03882819786667824, "grad_norm_var": 2.7107902273134274e-06, "learning_rate": 0.009495910092622059, "loss": 2.7023, "step": 4449 }, { "crossentropy": 2.5711052417755127, "epoch": 0.1613254060324826, "grad_norm": 0.04088568687438965, "grad_norm_var": 2.750490567099186e-06, "learning_rate": 0.009495655792922721, "loss": 2.6744, "step": 4450 }, { "crossentropy": 2.733400583267212, "epoch": 0.16136165893271462, "grad_norm": 0.0393625944852829, "grad_norm_var": 2.4788709319628044e-06, "learning_rate": 0.009495401432502729, "loss": 2.7393, "step": 4451 }, { "crossentropy": 2.5782012939453125, "epoch": 0.16139791183294663, "grad_norm": 0.03857892379164696, "grad_norm_var": 2.5056667538741448e-06, "learning_rate": 0.009495147011365519, "loss": 2.7145, "step": 4452 }, { "crossentropy": 2.687037944793701, "epoch": 0.16143416473317865, "grad_norm": 0.04060783609747887, "grad_norm_var": 2.0670509924510435e-06, "learning_rate": 0.009494892529514526, "loss": 2.7308, "step": 4453 }, { "crossentropy": 2.655391216278076, "epoch": 0.16147041763341066, "grad_norm": 0.041834574192762375, "grad_norm_var": 1.4310740895121312e-06, "learning_rate": 0.00949463798695319, "loss": 2.6656, "step": 4454 }, { "crossentropy": 2.5837550163269043, "epoch": 0.1615066705336427, "grad_norm": 0.03695441409945488, "grad_norm_var": 2.0144591102143223e-06, "learning_rate": 0.009494383383684945, "loss": 2.6373, "step": 4455 }, { "crossentropy": 2.7392632961273193, "epoch": 0.16154292343387472, "grad_norm": 0.039557524025440216, "grad_norm_var": 2.01856615636749e-06, "learning_rate": 0.009494128719713233, "loss": 2.6762, "step": 4456 }, { "crossentropy": 2.8882157802581787, "epoch": 0.16157917633410673, "grad_norm": 0.038407374173402786, "grad_norm_var": 1.9873290506936905e-06, "learning_rate": 0.009493873995041492, "loss": 2.7268, "step": 4457 }, { "crossentropy": 2.806741952896118, "epoch": 0.16161542923433875, "grad_norm": 0.03781213238835335, "grad_norm_var": 2.2093800423787662e-06, "learning_rate": 0.009493619209673163, "loss": 2.7581, "step": 4458 }, { "crossentropy": 2.6982309818267822, "epoch": 0.16165168213457076, "grad_norm": 0.037250738590955734, "grad_norm_var": 2.5826601180972684e-06, "learning_rate": 0.009493364363611687, "loss": 2.7374, "step": 4459 }, { "crossentropy": 2.8269660472869873, "epoch": 0.16168793503480278, "grad_norm": 0.04733184352517128, "grad_norm_var": 6.308206125483993e-06, "learning_rate": 0.009493109456860507, "loss": 2.7911, "step": 4460 }, { "crossentropy": 2.662874937057495, "epoch": 0.1617241879350348, "grad_norm": 0.04015989229083061, "grad_norm_var": 6.300703389845209e-06, "learning_rate": 0.009492854489423067, "loss": 2.6182, "step": 4461 }, { "crossentropy": 2.6307215690612793, "epoch": 0.16176044083526683, "grad_norm": 0.04105435311794281, "grad_norm_var": 6.266464581843913e-06, "learning_rate": 0.009492599461302806, "loss": 2.7541, "step": 4462 }, { "crossentropy": 2.7599520683288574, "epoch": 0.16179669373549885, "grad_norm": 0.043568696826696396, "grad_norm_var": 6.752407690384781e-06, "learning_rate": 0.009492344372503172, "loss": 2.7237, "step": 4463 }, { "crossentropy": 2.6308608055114746, "epoch": 0.16183294663573086, "grad_norm": 0.04368727281689644, "grad_norm_var": 7.448919539838431e-06, "learning_rate": 0.00949208922302761, "loss": 2.5991, "step": 4464 }, { "crossentropy": 2.6317641735076904, "epoch": 0.16186919953596288, "grad_norm": 0.04156315326690674, "grad_norm_var": 7.355048490799374e-06, "learning_rate": 0.009491834012879568, "loss": 2.6783, "step": 4465 }, { "crossentropy": 2.8104097843170166, "epoch": 0.1619054524361949, "grad_norm": 0.04234801232814789, "grad_norm_var": 7.556379269803697e-06, "learning_rate": 0.009491578742062488, "loss": 2.7791, "step": 4466 }, { "crossentropy": 2.8504276275634766, "epoch": 0.1619417053364269, "grad_norm": 0.04302672669291496, "grad_norm_var": 7.776323914112048e-06, "learning_rate": 0.009491323410579823, "loss": 2.7665, "step": 4467 }, { "crossentropy": 2.6935818195343018, "epoch": 0.16197795823665892, "grad_norm": 0.042085662484169006, "grad_norm_var": 7.478831275282925e-06, "learning_rate": 0.00949106801843502, "loss": 2.7533, "step": 4468 }, { "crossentropy": 2.71024489402771, "epoch": 0.16201421113689096, "grad_norm": 0.03959701582789421, "grad_norm_var": 7.6060765433713994e-06, "learning_rate": 0.009490812565631526, "loss": 2.7495, "step": 4469 }, { "crossentropy": 2.8160390853881836, "epoch": 0.16205046403712298, "grad_norm": 0.04228859394788742, "grad_norm_var": 7.668575951689377e-06, "learning_rate": 0.009490557052172794, "loss": 2.7649, "step": 4470 }, { "crossentropy": 2.570037603378296, "epoch": 0.162086716937355, "grad_norm": 0.043107494711875916, "grad_norm_var": 6.680253980426559e-06, "learning_rate": 0.009490301478062275, "loss": 2.7044, "step": 4471 }, { "crossentropy": 2.6530442237854004, "epoch": 0.162122969837587, "grad_norm": 0.04089309647679329, "grad_norm_var": 6.45866793939238e-06, "learning_rate": 0.00949004584330342, "loss": 2.6886, "step": 4472 }, { "crossentropy": 2.7174646854400635, "epoch": 0.16215922273781902, "grad_norm": 0.03519986569881439, "grad_norm_var": 8.429157636497392e-06, "learning_rate": 0.009489790147899682, "loss": 2.7211, "step": 4473 }, { "crossentropy": 2.835542917251587, "epoch": 0.16219547563805103, "grad_norm": 0.03882938623428345, "grad_norm_var": 8.019280385348122e-06, "learning_rate": 0.009489534391854513, "loss": 2.7276, "step": 4474 }, { "crossentropy": 2.623491048812866, "epoch": 0.16223172853828308, "grad_norm": 0.04126925393939018, "grad_norm_var": 6.819046219134576e-06, "learning_rate": 0.009489278575171371, "loss": 2.5769, "step": 4475 }, { "crossentropy": 2.6647496223449707, "epoch": 0.1622679814385151, "grad_norm": 0.04346371069550514, "grad_norm_var": 4.811221603021979e-06, "learning_rate": 0.009489022697853709, "loss": 2.7658, "step": 4476 }, { "crossentropy": 2.779444456100464, "epoch": 0.1623042343387471, "grad_norm": 0.0442710779607296, "grad_norm_var": 5.196644636750365e-06, "learning_rate": 0.009488766759904982, "loss": 2.7597, "step": 4477 }, { "crossentropy": 2.5448246002197266, "epoch": 0.16234048723897912, "grad_norm": 0.04168163612484932, "grad_norm_var": 5.172185301030065e-06, "learning_rate": 0.009488510761328648, "loss": 2.6275, "step": 4478 }, { "crossentropy": 2.4578185081481934, "epoch": 0.16237674013921113, "grad_norm": 0.039392679929733276, "grad_norm_var": 5.21052230312638e-06, "learning_rate": 0.009488254702128164, "loss": 2.6041, "step": 4479 }, { "crossentropy": 2.699483871459961, "epoch": 0.16241299303944315, "grad_norm": 0.03664770722389221, "grad_norm_var": 6.178756826826353e-06, "learning_rate": 0.009487998582306992, "loss": 2.7137, "step": 4480 }, { "crossentropy": 2.7408785820007324, "epoch": 0.16244924593967516, "grad_norm": 0.03492686152458191, "grad_norm_var": 8.414457668462437e-06, "learning_rate": 0.009487742401868586, "loss": 2.7794, "step": 4481 }, { "crossentropy": 2.6763174533843994, "epoch": 0.1624854988399072, "grad_norm": 0.03548530489206314, "grad_norm_var": 9.725857597854028e-06, "learning_rate": 0.00948748616081641, "loss": 2.6384, "step": 4482 }, { "crossentropy": 2.6192777156829834, "epoch": 0.16252175174013922, "grad_norm": 0.03923221305012703, "grad_norm_var": 9.162919351623997e-06, "learning_rate": 0.009487229859153921, "loss": 2.6741, "step": 4483 }, { "crossentropy": 2.6632041931152344, "epoch": 0.16255800464037123, "grad_norm": 0.04283821955323219, "grad_norm_var": 9.417805520744772e-06, "learning_rate": 0.009486973496884585, "loss": 2.6678, "step": 4484 }, { "crossentropy": 2.7389986515045166, "epoch": 0.16259425754060325, "grad_norm": 0.041639942675828934, "grad_norm_var": 9.583794805097708e-06, "learning_rate": 0.009486717074011863, "loss": 2.6546, "step": 4485 }, { "crossentropy": 2.702000141143799, "epoch": 0.16263051044083526, "grad_norm": 0.038865961134433746, "grad_norm_var": 9.304829911037834e-06, "learning_rate": 0.009486460590539217, "loss": 2.6697, "step": 4486 }, { "crossentropy": 2.5979392528533936, "epoch": 0.16266676334106728, "grad_norm": 0.036982499063014984, "grad_norm_var": 8.996638699063781e-06, "learning_rate": 0.009486204046470112, "loss": 2.6978, "step": 4487 }, { "crossentropy": 2.824173927307129, "epoch": 0.1627030162412993, "grad_norm": 0.03630471229553223, "grad_norm_var": 9.445640918031145e-06, "learning_rate": 0.009485947441808014, "loss": 2.7545, "step": 4488 }, { "crossentropy": 2.801504611968994, "epoch": 0.16273926914153133, "grad_norm": 0.03664550557732582, "grad_norm_var": 8.807259819524487e-06, "learning_rate": 0.009485690776556388, "loss": 2.8073, "step": 4489 }, { "crossentropy": 2.756605386734009, "epoch": 0.16277552204176335, "grad_norm": 0.037803422659635544, "grad_norm_var": 8.934660718241428e-06, "learning_rate": 0.009485434050718703, "loss": 2.6819, "step": 4490 }, { "crossentropy": 2.6733171939849854, "epoch": 0.16281177494199536, "grad_norm": 0.03875762224197388, "grad_norm_var": 8.641215966091048e-06, "learning_rate": 0.00948517726429842, "loss": 2.6594, "step": 4491 }, { "crossentropy": 2.660634756088257, "epoch": 0.16284802784222738, "grad_norm": 0.036906275898218155, "grad_norm_var": 7.477296779088289e-06, "learning_rate": 0.009484920417299014, "loss": 2.6991, "step": 4492 }, { "crossentropy": 2.7591447830200195, "epoch": 0.1628842807424594, "grad_norm": 0.04239834100008011, "grad_norm_var": 6.292633367141418e-06, "learning_rate": 0.009484663509723951, "loss": 2.7131, "step": 4493 }, { "crossentropy": 2.782892942428589, "epoch": 0.1629205336426914, "grad_norm": 0.043280333280563354, "grad_norm_var": 7.123789381418185e-06, "learning_rate": 0.009484406541576703, "loss": 2.7566, "step": 4494 }, { "crossentropy": 2.6950507164001465, "epoch": 0.16295678654292342, "grad_norm": 0.042053066194057465, "grad_norm_var": 7.836067318330263e-06, "learning_rate": 0.009484149512860737, "loss": 2.7704, "step": 4495 }, { "crossentropy": 2.6341068744659424, "epoch": 0.16299303944315546, "grad_norm": 0.041497476398944855, "grad_norm_var": 7.915627625266579e-06, "learning_rate": 0.009483892423579528, "loss": 2.6964, "step": 4496 }, { "crossentropy": 2.536287546157837, "epoch": 0.16302929234338748, "grad_norm": 0.03827028349041939, "grad_norm_var": 6.753445560062944e-06, "learning_rate": 0.009483635273736548, "loss": 2.6013, "step": 4497 }, { "crossentropy": 2.6799583435058594, "epoch": 0.1630655452436195, "grad_norm": 0.03895670175552368, "grad_norm_var": 5.736302423152833e-06, "learning_rate": 0.009483378063335268, "loss": 2.7585, "step": 4498 }, { "crossentropy": 2.7734920978546143, "epoch": 0.1631017981438515, "grad_norm": 0.035417426377534866, "grad_norm_var": 6.795797987864798e-06, "learning_rate": 0.009483120792379165, "loss": 2.7325, "step": 4499 }, { "crossentropy": 2.6760640144348145, "epoch": 0.16313805104408352, "grad_norm": 0.03423202410340309, "grad_norm_var": 7.351811473243983e-06, "learning_rate": 0.009482863460871712, "loss": 2.6817, "step": 4500 }, { "crossentropy": 2.6493515968322754, "epoch": 0.16317430394431554, "grad_norm": 0.03584565594792366, "grad_norm_var": 7.2180436906820725e-06, "learning_rate": 0.009482606068816384, "loss": 2.6665, "step": 4501 }, { "crossentropy": 2.7119345664978027, "epoch": 0.16321055684454758, "grad_norm": 0.036079954355955124, "grad_norm_var": 7.525827327666453e-06, "learning_rate": 0.009482348616216659, "loss": 2.6841, "step": 4502 }, { "crossentropy": 2.793269634246826, "epoch": 0.1632468097447796, "grad_norm": 0.03666073828935623, "grad_norm_var": 7.585150687112556e-06, "learning_rate": 0.009482091103076015, "loss": 2.7329, "step": 4503 }, { "crossentropy": 2.6133594512939453, "epoch": 0.1632830626450116, "grad_norm": 0.03823519125580788, "grad_norm_var": 7.331686023868909e-06, "learning_rate": 0.009481833529397927, "loss": 2.5717, "step": 4504 }, { "crossentropy": 2.6457037925720215, "epoch": 0.16331931554524362, "grad_norm": 0.03970210626721382, "grad_norm_var": 7.235213989861764e-06, "learning_rate": 0.009481575895185876, "loss": 2.6817, "step": 4505 }, { "crossentropy": 2.915682315826416, "epoch": 0.16335556844547564, "grad_norm": 0.04281655326485634, "grad_norm_var": 8.336290583941566e-06, "learning_rate": 0.009481318200443341, "loss": 2.7907, "step": 4506 }, { "crossentropy": 2.6147778034210205, "epoch": 0.16339182134570765, "grad_norm": 0.03661828860640526, "grad_norm_var": 8.639947538080274e-06, "learning_rate": 0.009481060445173805, "loss": 2.6604, "step": 4507 }, { "crossentropy": 2.5909929275512695, "epoch": 0.16342807424593966, "grad_norm": 0.034528590738773346, "grad_norm_var": 9.55739005390101e-06, "learning_rate": 0.009480802629380745, "loss": 2.5829, "step": 4508 }, { "crossentropy": 2.612229347229004, "epoch": 0.1634643271461717, "grad_norm": 0.036416683346033096, "grad_norm_var": 8.714061450193258e-06, "learning_rate": 0.009480544753067649, "loss": 2.6565, "step": 4509 }, { "crossentropy": 2.7127785682678223, "epoch": 0.16350058004640372, "grad_norm": 0.03881237655878067, "grad_norm_var": 6.913304237600911e-06, "learning_rate": 0.009480286816237993, "loss": 2.6608, "step": 4510 }, { "crossentropy": 2.6246914863586426, "epoch": 0.16353683294663574, "grad_norm": 0.042039722204208374, "grad_norm_var": 6.905897671355179e-06, "learning_rate": 0.009480028818895267, "loss": 2.7382, "step": 4511 }, { "crossentropy": 2.5974814891815186, "epoch": 0.16357308584686775, "grad_norm": 0.04204980656504631, "grad_norm_var": 7.1911408791031434e-06, "learning_rate": 0.009479770761042951, "loss": 2.6645, "step": 4512 }, { "crossentropy": 2.6725516319274902, "epoch": 0.16360933874709976, "grad_norm": 0.036256711930036545, "grad_norm_var": 7.3498665971079175e-06, "learning_rate": 0.009479512642684535, "loss": 2.6807, "step": 4513 }, { "crossentropy": 2.7817623615264893, "epoch": 0.16364559164733178, "grad_norm": 0.03768908232450485, "grad_norm_var": 7.253405503646927e-06, "learning_rate": 0.009479254463823502, "loss": 2.6866, "step": 4514 }, { "crossentropy": 2.772472620010376, "epoch": 0.1636818445475638, "grad_norm": 0.04198424145579338, "grad_norm_var": 7.939036979459115e-06, "learning_rate": 0.009478996224463338, "loss": 2.772, "step": 4515 }, { "crossentropy": 2.699472427368164, "epoch": 0.16371809744779584, "grad_norm": 0.04295962676405907, "grad_norm_var": 8.171895120135235e-06, "learning_rate": 0.009478737924607534, "loss": 2.7241, "step": 4516 }, { "crossentropy": 2.6900198459625244, "epoch": 0.16375435034802785, "grad_norm": 0.04316312447190285, "grad_norm_var": 8.764377083713265e-06, "learning_rate": 0.00947847956425958, "loss": 2.701, "step": 4517 }, { "crossentropy": 2.62585711479187, "epoch": 0.16379060324825986, "grad_norm": 0.04148240014910698, "grad_norm_var": 8.394526308548068e-06, "learning_rate": 0.009478221143422961, "loss": 2.6913, "step": 4518 }, { "crossentropy": 2.8992772102355957, "epoch": 0.16382685614849188, "grad_norm": 0.04521690309047699, "grad_norm_var": 9.772625309262204e-06, "learning_rate": 0.00947796266210117, "loss": 2.7623, "step": 4519 }, { "crossentropy": 2.6751978397369385, "epoch": 0.1638631090487239, "grad_norm": 0.04086120426654816, "grad_norm_var": 9.586326069855693e-06, "learning_rate": 0.009477704120297697, "loss": 2.6808, "step": 4520 }, { "crossentropy": 2.7840590476989746, "epoch": 0.1638993619489559, "grad_norm": 0.035684604197740555, "grad_norm_var": 1.0841627643765987e-05, "learning_rate": 0.009477445518016034, "loss": 2.7291, "step": 4521 }, { "crossentropy": 2.766782283782959, "epoch": 0.16393561484918792, "grad_norm": 0.03457942605018616, "grad_norm_var": 1.1891416682776636e-05, "learning_rate": 0.009477186855259676, "loss": 2.7154, "step": 4522 }, { "crossentropy": 2.6612768173217773, "epoch": 0.16397186774941996, "grad_norm": 0.038248639553785324, "grad_norm_var": 1.1453632904750005e-05, "learning_rate": 0.009476928132032115, "loss": 2.6728, "step": 4523 }, { "crossentropy": 2.656806468963623, "epoch": 0.16400812064965198, "grad_norm": 0.03843696415424347, "grad_norm_var": 9.818535859205426e-06, "learning_rate": 0.009476669348336846, "loss": 2.7049, "step": 4524 }, { "crossentropy": 2.743239641189575, "epoch": 0.164044373549884, "grad_norm": 0.0376870222389698, "grad_norm_var": 9.356057956055353e-06, "learning_rate": 0.009476410504177362, "loss": 2.6826, "step": 4525 }, { "crossentropy": 2.78749680519104, "epoch": 0.164080626450116, "grad_norm": 0.03796940669417381, "grad_norm_var": 9.513946949431483e-06, "learning_rate": 0.009476151599557163, "loss": 2.766, "step": 4526 }, { "crossentropy": 2.645869493484497, "epoch": 0.16411687935034802, "grad_norm": 0.037656620144844055, "grad_norm_var": 9.387808441773995e-06, "learning_rate": 0.009475892634479744, "loss": 2.64, "step": 4527 }, { "crossentropy": 2.6267848014831543, "epoch": 0.16415313225058004, "grad_norm": 0.03790063410997391, "grad_norm_var": 9.050607700504474e-06, "learning_rate": 0.009475633608948602, "loss": 2.6703, "step": 4528 }, { "crossentropy": 2.540371894836426, "epoch": 0.16418938515081208, "grad_norm": 0.03566649928689003, "grad_norm_var": 9.306837775390799e-06, "learning_rate": 0.009475374522967235, "loss": 2.4865, "step": 4529 }, { "crossentropy": 2.7661209106445312, "epoch": 0.1642256380510441, "grad_norm": 0.04035235568881035, "grad_norm_var": 9.21392212614447e-06, "learning_rate": 0.009475115376539145, "loss": 2.7028, "step": 4530 }, { "crossentropy": 2.7713654041290283, "epoch": 0.1642618909512761, "grad_norm": 0.03639071807265282, "grad_norm_var": 9.216403524911488e-06, "learning_rate": 0.009474856169667829, "loss": 2.7556, "step": 4531 }, { "crossentropy": 2.550011396408081, "epoch": 0.16429814385150812, "grad_norm": 0.035582657903432846, "grad_norm_var": 8.738707117258993e-06, "learning_rate": 0.009474596902356788, "loss": 2.653, "step": 4532 }, { "crossentropy": 2.8089520931243896, "epoch": 0.16433439675174014, "grad_norm": 0.03428385779261589, "grad_norm_var": 8.210663685028162e-06, "learning_rate": 0.00947433757460953, "loss": 2.7427, "step": 4533 }, { "crossentropy": 2.638315439224243, "epoch": 0.16437064965197215, "grad_norm": 0.035786908119916916, "grad_norm_var": 7.5935429432033115e-06, "learning_rate": 0.009474078186429551, "loss": 2.6269, "step": 4534 }, { "crossentropy": 2.610217571258545, "epoch": 0.16440690255220416, "grad_norm": 0.04130266606807709, "grad_norm_var": 4.598849885114008e-06, "learning_rate": 0.009473818737820355, "loss": 2.6456, "step": 4535 }, { "crossentropy": 2.881603717803955, "epoch": 0.1644431554524362, "grad_norm": 0.04807325080037117, "grad_norm_var": 1.1178606258079443e-05, "learning_rate": 0.009473559228785448, "loss": 2.8046, "step": 4536 }, { "crossentropy": 2.7436716556549072, "epoch": 0.16447940835266822, "grad_norm": 0.053036123514175415, "grad_norm_var": 2.4985764148617814e-05, "learning_rate": 0.009473299659328338, "loss": 2.8122, "step": 4537 }, { "crossentropy": 2.6846306324005127, "epoch": 0.16451566125290024, "grad_norm": 0.06380084902048111, "grad_norm_var": 6.138536608747135e-05, "learning_rate": 0.009473040029452524, "loss": 2.6446, "step": 4538 }, { "crossentropy": 2.8183999061584473, "epoch": 0.16455191415313225, "grad_norm": 0.05699630081653595, "grad_norm_var": 7.70725527589254e-05, "learning_rate": 0.009472780339161517, "loss": 2.7803, "step": 4539 }, { "crossentropy": 2.826996326446533, "epoch": 0.16458816705336426, "grad_norm": 0.04532535746693611, "grad_norm_var": 7.682752927291198e-05, "learning_rate": 0.009472520588458825, "loss": 2.7507, "step": 4540 }, { "crossentropy": 2.784926176071167, "epoch": 0.16462441995359628, "grad_norm": 0.0498155802488327, "grad_norm_var": 7.845935733317751e-05, "learning_rate": 0.009472260777347955, "loss": 2.7978, "step": 4541 }, { "crossentropy": 2.713557720184326, "epoch": 0.1646606728538283, "grad_norm": 0.048296310007572174, "grad_norm_var": 7.803100566320757e-05, "learning_rate": 0.009472000905832416, "loss": 2.7437, "step": 4542 }, { "crossentropy": 2.794335126876831, "epoch": 0.16469692575406034, "grad_norm": 0.04029314965009689, "grad_norm_var": 7.631755160289258e-05, "learning_rate": 0.009471740973915718, "loss": 2.7762, "step": 4543 }, { "crossentropy": 2.6059577465057373, "epoch": 0.16473317865429235, "grad_norm": 0.045207828283309937, "grad_norm_var": 7.377896435925013e-05, "learning_rate": 0.009471480981601372, "loss": 2.6466, "step": 4544 }, { "crossentropy": 2.655397891998291, "epoch": 0.16476943155452436, "grad_norm": 0.048791199922561646, "grad_norm_var": 6.928253276813895e-05, "learning_rate": 0.00947122092889289, "loss": 2.693, "step": 4545 }, { "crossentropy": 2.7773354053497314, "epoch": 0.16480568445475638, "grad_norm": 0.04135584086179733, "grad_norm_var": 6.869573407963953e-05, "learning_rate": 0.009470960815793784, "loss": 2.7264, "step": 4546 }, { "crossentropy": 2.645022392272949, "epoch": 0.1648419373549884, "grad_norm": 0.03941803425550461, "grad_norm_var": 6.568400257056048e-05, "learning_rate": 0.009470700642307565, "loss": 2.7392, "step": 4547 }, { "crossentropy": 2.642442226409912, "epoch": 0.1648781902552204, "grad_norm": 0.03678692504763603, "grad_norm_var": 6.418858994804936e-05, "learning_rate": 0.009470440408437752, "loss": 2.6332, "step": 4548 }, { "crossentropy": 2.7019407749176025, "epoch": 0.16491444315545242, "grad_norm": 0.03631090000271797, "grad_norm_var": 6.140435219103748e-05, "learning_rate": 0.009470180114187857, "loss": 2.6661, "step": 4549 }, { "crossentropy": 2.72877836227417, "epoch": 0.16495069605568446, "grad_norm": 0.03947144374251366, "grad_norm_var": 5.7401329085922326e-05, "learning_rate": 0.009469919759561397, "loss": 2.7277, "step": 4550 }, { "crossentropy": 2.5471720695495605, "epoch": 0.16498694895591648, "grad_norm": 0.04122830182313919, "grad_norm_var": 5.744718507527385e-05, "learning_rate": 0.009469659344561884, "loss": 2.5755, "step": 4551 }, { "crossentropy": 2.9135549068450928, "epoch": 0.1650232018561485, "grad_norm": 0.043311599642038345, "grad_norm_var": 5.747685676266012e-05, "learning_rate": 0.00946939886919284, "loss": 2.8188, "step": 4552 }, { "crossentropy": 2.699130058288574, "epoch": 0.1650594547563805, "grad_norm": 0.042487747967243195, "grad_norm_var": 5.3959024913050914e-05, "learning_rate": 0.009469138333457782, "loss": 2.6886, "step": 4553 }, { "crossentropy": 2.5522408485412598, "epoch": 0.16509570765661252, "grad_norm": 0.04363728314638138, "grad_norm_var": 2.8638715868164964e-05, "learning_rate": 0.009468877737360229, "loss": 2.6156, "step": 4554 }, { "crossentropy": 2.568542957305908, "epoch": 0.16513196055684454, "grad_norm": 0.10896959900856018, "grad_norm_var": 0.0002898074613171694, "learning_rate": 0.0094686170809037, "loss": 2.6609, "step": 4555 }, { "crossentropy": 2.692232370376587, "epoch": 0.16516821345707658, "grad_norm": 0.04230716452002525, "grad_norm_var": 0.0002910182051095458, "learning_rate": 0.009468356364091716, "loss": 2.6723, "step": 4556 }, { "crossentropy": 2.6429803371429443, "epoch": 0.1652044663573086, "grad_norm": 0.036573730409145355, "grad_norm_var": 0.0002965305110993086, "learning_rate": 0.009468095586927799, "loss": 2.6225, "step": 4557 }, { "crossentropy": 2.7044601440429688, "epoch": 0.1652407192575406, "grad_norm": 0.03700394555926323, "grad_norm_var": 0.00030089678206287793, "learning_rate": 0.009467834749415469, "loss": 2.7635, "step": 4558 }, { "crossentropy": 2.7089338302612305, "epoch": 0.16527697215777262, "grad_norm": 0.038045164197683334, "grad_norm_var": 0.00030268251033898437, "learning_rate": 0.009467573851558251, "loss": 2.6831, "step": 4559 }, { "crossentropy": 2.6621134281158447, "epoch": 0.16531322505800464, "grad_norm": 0.03781811520457268, "grad_norm_var": 0.0003059465653232109, "learning_rate": 0.00946731289335967, "loss": 2.6567, "step": 4560 }, { "crossentropy": 2.6915998458862305, "epoch": 0.16534947795823665, "grad_norm": 0.03824852779507637, "grad_norm_var": 0.00030699449265666936, "learning_rate": 0.009467051874823247, "loss": 2.6581, "step": 4561 }, { "crossentropy": 2.7333247661590576, "epoch": 0.16538573085846867, "grad_norm": 0.03711661323904991, "grad_norm_var": 0.0003095760083492473, "learning_rate": 0.00946679079595251, "loss": 2.7532, "step": 4562 }, { "crossentropy": 2.965202569961548, "epoch": 0.1654219837587007, "grad_norm": 0.04111413285136223, "grad_norm_var": 0.0003087940248053348, "learning_rate": 0.009466529656750985, "loss": 2.8592, "step": 4563 }, { "crossentropy": 2.7029922008514404, "epoch": 0.16545823665893272, "grad_norm": 0.04307541623711586, "grad_norm_var": 0.0003054047004912819, "learning_rate": 0.009466268457222198, "loss": 2.7273, "step": 4564 }, { "crossentropy": 2.631486654281616, "epoch": 0.16549448955916474, "grad_norm": 0.04047832265496254, "grad_norm_var": 0.00030212321579408297, "learning_rate": 0.00946600719736968, "loss": 2.6824, "step": 4565 }, { "crossentropy": 2.827977418899536, "epoch": 0.16553074245939675, "grad_norm": 0.034099381417036057, "grad_norm_var": 0.0003074789146189059, "learning_rate": 0.009465745877196953, "loss": 2.7919, "step": 4566 }, { "crossentropy": 2.738954782485962, "epoch": 0.16556699535962877, "grad_norm": 0.037228573113679886, "grad_norm_var": 0.00031000741584510385, "learning_rate": 0.009465484496707555, "loss": 2.7029, "step": 4567 }, { "crossentropy": 2.765861749649048, "epoch": 0.16560324825986078, "grad_norm": 0.03380143269896507, "grad_norm_var": 0.000316336112940137, "learning_rate": 0.00946522305590501, "loss": 2.7238, "step": 4568 }, { "crossentropy": 2.707987070083618, "epoch": 0.1656395011600928, "grad_norm": 0.036422427743673325, "grad_norm_var": 0.0003192520704388482, "learning_rate": 0.009464961554792853, "loss": 2.7269, "step": 4569 }, { "crossentropy": 2.6747360229492188, "epoch": 0.16567575406032484, "grad_norm": 0.03944878280162811, "grad_norm_var": 0.00031992073143113497, "learning_rate": 0.009464699993374613, "loss": 2.7643, "step": 4570 }, { "crossentropy": 2.8789286613464355, "epoch": 0.16571200696055685, "grad_norm": 0.038915012031793594, "grad_norm_var": 6.804251882389783e-06, "learning_rate": 0.009464438371653824, "loss": 2.72, "step": 4571 }, { "crossentropy": 2.67083477973938, "epoch": 0.16574825986078887, "grad_norm": 0.03642130270600319, "grad_norm_var": 5.770600067983578e-06, "learning_rate": 0.009464176689634021, "loss": 2.6397, "step": 4572 }, { "crossentropy": 2.818016290664673, "epoch": 0.16578451276102088, "grad_norm": 0.03397762402892113, "grad_norm_var": 6.638175378013457e-06, "learning_rate": 0.009463914947318738, "loss": 2.6824, "step": 4573 }, { "crossentropy": 2.73235821723938, "epoch": 0.1658207656612529, "grad_norm": 0.0349828265607357, "grad_norm_var": 7.081306355455746e-06, "learning_rate": 0.009463653144711507, "loss": 2.673, "step": 4574 }, { "crossentropy": 2.703019142150879, "epoch": 0.1658570185614849, "grad_norm": 0.034794341772794724, "grad_norm_var": 7.537835502623563e-06, "learning_rate": 0.009463391281815869, "loss": 2.7137, "step": 4575 }, { "crossentropy": 2.6714468002319336, "epoch": 0.16589327146171692, "grad_norm": 0.03732702136039734, "grad_norm_var": 7.523660051864806e-06, "learning_rate": 0.009463129358635356, "loss": 2.6671, "step": 4576 }, { "crossentropy": 2.775608539581299, "epoch": 0.16592952436194897, "grad_norm": 0.036536674946546555, "grad_norm_var": 7.499611239134007e-06, "learning_rate": 0.00946286737517351, "loss": 2.7312, "step": 4577 }, { "crossentropy": 2.836890459060669, "epoch": 0.16596577726218098, "grad_norm": 0.037225328385829926, "grad_norm_var": 7.498652090022426e-06, "learning_rate": 0.009462605331433865, "loss": 2.7725, "step": 4578 }, { "crossentropy": 2.731898307800293, "epoch": 0.166002030162413, "grad_norm": 0.03946081921458244, "grad_norm_var": 6.815590079568776e-06, "learning_rate": 0.009462343227419965, "loss": 2.7124, "step": 4579 }, { "crossentropy": 2.6025564670562744, "epoch": 0.166038283062645, "grad_norm": 0.041400372982025146, "grad_norm_var": 5.664716048510663e-06, "learning_rate": 0.009462081063135347, "loss": 2.6119, "step": 4580 }, { "crossentropy": 2.7081995010375977, "epoch": 0.16607453596287702, "grad_norm": 0.042922377586364746, "grad_norm_var": 7.160952832465222e-06, "learning_rate": 0.009461818838583552, "loss": 2.606, "step": 4581 }, { "crossentropy": 2.7899959087371826, "epoch": 0.16611078886310904, "grad_norm": 0.040487293154001236, "grad_norm_var": 7.082974601042326e-06, "learning_rate": 0.009461556553768123, "loss": 2.6796, "step": 4582 }, { "crossentropy": 2.6815333366394043, "epoch": 0.16614704176334108, "grad_norm": 0.041502587497234344, "grad_norm_var": 8.021835466311596e-06, "learning_rate": 0.009461294208692603, "loss": 2.7325, "step": 4583 }, { "crossentropy": 2.61782169342041, "epoch": 0.1661832946635731, "grad_norm": 0.0402161069214344, "grad_norm_var": 7.129487696603404e-06, "learning_rate": 0.009461031803360534, "loss": 2.6564, "step": 4584 }, { "crossentropy": 2.707075595855713, "epoch": 0.1662195475638051, "grad_norm": 0.0521627701818943, "grad_norm_var": 1.877347323853695e-05, "learning_rate": 0.00946076933777546, "loss": 2.7107, "step": 4585 }, { "crossentropy": 2.5302131175994873, "epoch": 0.16625580046403712, "grad_norm": 0.05751896649599075, "grad_norm_var": 3.9693574932847296e-05, "learning_rate": 0.009460506811940928, "loss": 2.5588, "step": 4586 }, { "crossentropy": 2.76265549659729, "epoch": 0.16629205336426914, "grad_norm": 0.04691266641020775, "grad_norm_var": 4.214426780702515e-05, "learning_rate": 0.009460244225860481, "loss": 2.7623, "step": 4587 }, { "crossentropy": 2.5069997310638428, "epoch": 0.16632830626450115, "grad_norm": 0.03651871159672737, "grad_norm_var": 4.2087139383785904e-05, "learning_rate": 0.009459981579537667, "loss": 2.6605, "step": 4588 }, { "crossentropy": 2.7417023181915283, "epoch": 0.16636455916473317, "grad_norm": 0.03698529303073883, "grad_norm_var": 3.988785694077201e-05, "learning_rate": 0.009459718872976034, "loss": 2.6847, "step": 4589 }, { "crossentropy": 2.630781412124634, "epoch": 0.1664008120649652, "grad_norm": 0.03633257374167442, "grad_norm_var": 3.890809987828244e-05, "learning_rate": 0.009459456106179131, "loss": 2.6234, "step": 4590 }, { "crossentropy": 2.551441192626953, "epoch": 0.16643706496519722, "grad_norm": 0.03452453017234802, "grad_norm_var": 3.9141077752184035e-05, "learning_rate": 0.009459193279150504, "loss": 2.5287, "step": 4591 }, { "crossentropy": 2.563642740249634, "epoch": 0.16647331786542924, "grad_norm": 0.03498256951570511, "grad_norm_var": 4.067249598402045e-05, "learning_rate": 0.009458930391893706, "loss": 2.6697, "step": 4592 }, { "crossentropy": 2.6013519763946533, "epoch": 0.16650957076566125, "grad_norm": 0.03756887465715408, "grad_norm_var": 4.012748296879405e-05, "learning_rate": 0.009458667444412285, "loss": 2.6571, "step": 4593 }, { "crossentropy": 2.6385631561279297, "epoch": 0.16654582366589327, "grad_norm": 0.03452116623520851, "grad_norm_var": 4.196175679687085e-05, "learning_rate": 0.009458404436709795, "loss": 2.663, "step": 4594 }, { "crossentropy": 2.7360265254974365, "epoch": 0.16658207656612528, "grad_norm": 0.03889090195298195, "grad_norm_var": 4.208960325586727e-05, "learning_rate": 0.009458141368789787, "loss": 2.7328, "step": 4595 }, { "crossentropy": 2.6499857902526855, "epoch": 0.1666183294663573, "grad_norm": 0.04311385750770569, "grad_norm_var": 4.240101963743822e-05, "learning_rate": 0.009457878240655814, "loss": 2.6237, "step": 4596 }, { "crossentropy": 2.81111478805542, "epoch": 0.16665458236658934, "grad_norm": 0.03989545255899429, "grad_norm_var": 4.217665242638609e-05, "learning_rate": 0.00945761505231143, "loss": 2.7873, "step": 4597 }, { "crossentropy": 2.8098907470703125, "epoch": 0.16669083526682135, "grad_norm": 0.04096977785229683, "grad_norm_var": 4.217376155527137e-05, "learning_rate": 0.00945735180376019, "loss": 2.7315, "step": 4598 }, { "crossentropy": 2.6606199741363525, "epoch": 0.16672708816705337, "grad_norm": 0.041517775505781174, "grad_norm_var": 4.2175221946043746e-05, "learning_rate": 0.009457088495005652, "loss": 2.7764, "step": 4599 }, { "crossentropy": 2.774230718612671, "epoch": 0.16676334106728538, "grad_norm": 0.03666618466377258, "grad_norm_var": 4.323424364432611e-05, "learning_rate": 0.009456825126051368, "loss": 2.699, "step": 4600 }, { "crossentropy": 2.7507169246673584, "epoch": 0.1667995939675174, "grad_norm": 0.0368911512196064, "grad_norm_var": 3.420043088808329e-05, "learning_rate": 0.009456561696900897, "loss": 2.686, "step": 4601 }, { "crossentropy": 2.8153529167175293, "epoch": 0.1668358468677494, "grad_norm": 0.03891666978597641, "grad_norm_var": 1.1416371130216346e-05, "learning_rate": 0.009456298207557797, "loss": 2.7088, "step": 4602 }, { "crossentropy": 2.7608656883239746, "epoch": 0.16687209976798145, "grad_norm": 0.03749833256006241, "grad_norm_var": 6.333650558404807e-06, "learning_rate": 0.009456034658025627, "loss": 2.6725, "step": 4603 }, { "crossentropy": 2.923229217529297, "epoch": 0.16690835266821347, "grad_norm": 0.036007221788167953, "grad_norm_var": 6.441620134678359e-06, "learning_rate": 0.009455771048307948, "loss": 2.7711, "step": 4604 }, { "crossentropy": 2.718733787536621, "epoch": 0.16694460556844548, "grad_norm": 0.035841040313243866, "grad_norm_var": 6.652348953725627e-06, "learning_rate": 0.009455507378408319, "loss": 2.7773, "step": 4605 }, { "crossentropy": 2.4507064819335938, "epoch": 0.1669808584686775, "grad_norm": 0.04317110404372215, "grad_norm_var": 8.27490863723092e-06, "learning_rate": 0.009455243648330301, "loss": 2.5141, "step": 4606 }, { "crossentropy": 2.7370901107788086, "epoch": 0.1670171113689095, "grad_norm": 0.044668156653642654, "grad_norm_var": 9.753601792107889e-06, "learning_rate": 0.009454979858077455, "loss": 2.6678, "step": 4607 }, { "crossentropy": 2.776002883911133, "epoch": 0.16705336426914152, "grad_norm": 0.04479571059346199, "grad_norm_var": 1.0751224810480869e-05, "learning_rate": 0.009454716007653347, "loss": 2.7581, "step": 4608 }, { "crossentropy": 2.8228919506073, "epoch": 0.16708961716937354, "grad_norm": 0.04213026911020279, "grad_norm_var": 1.091768054804944e-05, "learning_rate": 0.00945445209706154, "loss": 2.7604, "step": 4609 }, { "crossentropy": 2.7230870723724365, "epoch": 0.16712587006960558, "grad_norm": 0.038408972322940826, "grad_norm_var": 9.168246636316833e-06, "learning_rate": 0.009454188126305593, "loss": 2.6787, "step": 4610 }, { "crossentropy": 2.689222812652588, "epoch": 0.1671621229698376, "grad_norm": 0.03449047729372978, "grad_norm_var": 1.1006572887172401e-05, "learning_rate": 0.009453924095389081, "loss": 2.7241, "step": 4611 }, { "crossentropy": 2.7287182807922363, "epoch": 0.1671983758700696, "grad_norm": 0.035962700843811035, "grad_norm_var": 1.0934710236238747e-05, "learning_rate": 0.009453660004315563, "loss": 2.6907, "step": 4612 }, { "crossentropy": 2.740523338317871, "epoch": 0.16723462877030162, "grad_norm": 0.035233452916145325, "grad_norm_var": 1.1885321236378375e-05, "learning_rate": 0.009453395853088608, "loss": 2.6394, "step": 4613 }, { "crossentropy": 2.7897138595581055, "epoch": 0.16727088167053364, "grad_norm": 0.035554319620132446, "grad_norm_var": 1.225846835720869e-05, "learning_rate": 0.009453131641711783, "loss": 2.6664, "step": 4614 }, { "crossentropy": 2.517725706100464, "epoch": 0.16730713457076565, "grad_norm": 0.03680700808763504, "grad_norm_var": 1.1818792286827363e-05, "learning_rate": 0.00945286737018866, "loss": 2.5799, "step": 4615 }, { "crossentropy": 2.793710708618164, "epoch": 0.16734338747099767, "grad_norm": 0.03494128957390785, "grad_norm_var": 1.2383990534876356e-05, "learning_rate": 0.009452603038522802, "loss": 2.7312, "step": 4616 }, { "crossentropy": 2.4015283584594727, "epoch": 0.1673796403712297, "grad_norm": 0.04494967311620712, "grad_norm_var": 1.5028492339926268e-05, "learning_rate": 0.009452338646717785, "loss": 2.5579, "step": 4617 }, { "crossentropy": 2.8134796619415283, "epoch": 0.16741589327146172, "grad_norm": 0.034927938133478165, "grad_norm_var": 1.5913497807450103e-05, "learning_rate": 0.009452074194777178, "loss": 2.7463, "step": 4618 }, { "crossentropy": 2.7126972675323486, "epoch": 0.16745214617169374, "grad_norm": 0.03556495159864426, "grad_norm_var": 1.6395468788052368e-05, "learning_rate": 0.009451809682704553, "loss": 2.7187, "step": 4619 }, { "crossentropy": 2.6518094539642334, "epoch": 0.16748839907192575, "grad_norm": 0.03566615656018257, "grad_norm_var": 1.6508863672690052e-05, "learning_rate": 0.009451545110503482, "loss": 2.6186, "step": 4620 }, { "crossentropy": 2.7622830867767334, "epoch": 0.16752465197215777, "grad_norm": 0.03892506659030914, "grad_norm_var": 1.6084132121453542e-05, "learning_rate": 0.009451280478177537, "loss": 2.6819, "step": 4621 }, { "crossentropy": 2.636838674545288, "epoch": 0.16756090487238978, "grad_norm": 0.037530455738306046, "grad_norm_var": 1.4568886865924147e-05, "learning_rate": 0.009451015785730297, "loss": 2.6955, "step": 4622 }, { "crossentropy": 2.7928194999694824, "epoch": 0.1675971577726218, "grad_norm": 0.03657818213105202, "grad_norm_var": 1.1639028516303937e-05, "learning_rate": 0.009450751033165332, "loss": 2.6921, "step": 4623 }, { "crossentropy": 2.7480666637420654, "epoch": 0.16763341067285384, "grad_norm": 0.03400041162967682, "grad_norm_var": 8.643332780655424e-06, "learning_rate": 0.00945048622048622, "loss": 2.8032, "step": 4624 }, { "crossentropy": 2.8199832439422607, "epoch": 0.16766966357308585, "grad_norm": 0.037347324192523956, "grad_norm_var": 6.788311692516175e-06, "learning_rate": 0.00945022134769654, "loss": 2.785, "step": 4625 }, { "crossentropy": 2.6492154598236084, "epoch": 0.16770591647331787, "grad_norm": 0.037909407168626785, "grad_norm_var": 6.68877983752188e-06, "learning_rate": 0.009449956414799865, "loss": 2.6003, "step": 4626 }, { "crossentropy": 2.7975919246673584, "epoch": 0.16774216937354988, "grad_norm": 0.03724084049463272, "grad_norm_var": 6.369887727346775e-06, "learning_rate": 0.009449691421799777, "loss": 2.69, "step": 4627 }, { "crossentropy": 2.7226099967956543, "epoch": 0.1677784222737819, "grad_norm": 0.04347911849617958, "grad_norm_var": 9.04054404761752e-06, "learning_rate": 0.009449426368699852, "loss": 2.6841, "step": 4628 }, { "crossentropy": 2.627963066101074, "epoch": 0.1678146751740139, "grad_norm": 0.037960752844810486, "grad_norm_var": 8.757232132747667e-06, "learning_rate": 0.009449161255503673, "loss": 2.773, "step": 4629 }, { "crossentropy": 2.7116611003875732, "epoch": 0.16785092807424595, "grad_norm": 0.04227053374052048, "grad_norm_var": 9.86864377385584e-06, "learning_rate": 0.00944889608221482, "loss": 2.6878, "step": 4630 }, { "crossentropy": 2.6993637084960938, "epoch": 0.16788718097447797, "grad_norm": 0.04342919588088989, "grad_norm_var": 1.1661017690608028e-05, "learning_rate": 0.009448630848836872, "loss": 2.591, "step": 4631 }, { "crossentropy": 2.85428524017334, "epoch": 0.16792343387470998, "grad_norm": 0.051325149834156036, "grad_norm_var": 2.111154057116311e-05, "learning_rate": 0.009448365555373414, "loss": 2.788, "step": 4632 }, { "crossentropy": 2.5941083431243896, "epoch": 0.167959686774942, "grad_norm": 0.04839559644460678, "grad_norm_var": 2.4440705583862973e-05, "learning_rate": 0.00944810020182803, "loss": 2.7105, "step": 4633 }, { "crossentropy": 2.660372734069824, "epoch": 0.167995939675174, "grad_norm": 0.041435178369283676, "grad_norm_var": 2.309046573022259e-05, "learning_rate": 0.009447834788204301, "loss": 2.6758, "step": 4634 }, { "crossentropy": 2.5860719680786133, "epoch": 0.16803219257540603, "grad_norm": 0.039247065782547, "grad_norm_var": 2.178935264228905e-05, "learning_rate": 0.009447569314505815, "loss": 2.6895, "step": 4635 }, { "crossentropy": 2.5755927562713623, "epoch": 0.16806844547563804, "grad_norm": 0.04158957302570343, "grad_norm_var": 2.042418741821274e-05, "learning_rate": 0.009447303780736153, "loss": 2.6091, "step": 4636 }, { "crossentropy": 2.7921016216278076, "epoch": 0.16810469837587008, "grad_norm": 0.04054563492536545, "grad_norm_var": 2.0239057413440745e-05, "learning_rate": 0.009447038186898907, "loss": 2.7212, "step": 4637 }, { "crossentropy": 2.8081307411193848, "epoch": 0.1681409512761021, "grad_norm": 0.037497323006391525, "grad_norm_var": 2.025287531557344e-05, "learning_rate": 0.00944677253299766, "loss": 2.7646, "step": 4638 }, { "crossentropy": 2.6015865802764893, "epoch": 0.1681772041763341, "grad_norm": 0.040222104638814926, "grad_norm_var": 1.910895830691487e-05, "learning_rate": 0.009446506819036004, "loss": 2.6251, "step": 4639 }, { "crossentropy": 2.711336612701416, "epoch": 0.16821345707656613, "grad_norm": 0.041997529566287994, "grad_norm_var": 1.5782807685852698e-05, "learning_rate": 0.009446241045017524, "loss": 2.6432, "step": 4640 }, { "crossentropy": 2.5287318229675293, "epoch": 0.16824970997679814, "grad_norm": 0.04786583408713341, "grad_norm_var": 1.7058499830533562e-05, "learning_rate": 0.009445975210945812, "loss": 2.6738, "step": 4641 }, { "crossentropy": 2.627021312713623, "epoch": 0.16828596287703015, "grad_norm": 0.047486018389463425, "grad_norm_var": 1.7534478201294835e-05, "learning_rate": 0.009445709316824457, "loss": 2.6655, "step": 4642 }, { "crossentropy": 2.6560261249542236, "epoch": 0.16832221577726217, "grad_norm": 0.04349394142627716, "grad_norm_var": 1.548993594070805e-05, "learning_rate": 0.009445443362657052, "loss": 2.6592, "step": 4643 }, { "crossentropy": 2.6480236053466797, "epoch": 0.1683584686774942, "grad_norm": 0.0417347177863121, "grad_norm_var": 1.5572179489435565e-05, "learning_rate": 0.009445177348447187, "loss": 2.7261, "step": 4644 }, { "crossentropy": 2.7904365062713623, "epoch": 0.16839472157772623, "grad_norm": 0.03933101147413254, "grad_norm_var": 1.4786025978798187e-05, "learning_rate": 0.009444911274198456, "loss": 2.7811, "step": 4645 }, { "crossentropy": 2.9074807167053223, "epoch": 0.16843097447795824, "grad_norm": 0.048473864793777466, "grad_norm_var": 1.6594664754085383e-05, "learning_rate": 0.009444645139914455, "loss": 2.7876, "step": 4646 }, { "crossentropy": 2.6902976036071777, "epoch": 0.16846722737819025, "grad_norm": 0.05400640144944191, "grad_norm_var": 2.3657279667138596e-05, "learning_rate": 0.009444378945598774, "loss": 2.6659, "step": 4647 }, { "crossentropy": 2.5409793853759766, "epoch": 0.16850348027842227, "grad_norm": 0.046493835747241974, "grad_norm_var": 2.042349608145829e-05, "learning_rate": 0.00944411269125501, "loss": 2.5312, "step": 4648 }, { "crossentropy": 2.811455011367798, "epoch": 0.16853973317865428, "grad_norm": 0.03960295021533966, "grad_norm_var": 1.9795623070678384e-05, "learning_rate": 0.009443846376886764, "loss": 2.7492, "step": 4649 }, { "crossentropy": 2.807323694229126, "epoch": 0.1685759860788863, "grad_norm": 0.036511048674583435, "grad_norm_var": 2.2462494925369557e-05, "learning_rate": 0.009443580002497625, "loss": 2.6601, "step": 4650 }, { "crossentropy": 2.6222336292266846, "epoch": 0.16861223897911834, "grad_norm": 0.04100111126899719, "grad_norm_var": 2.180486729527701e-05, "learning_rate": 0.009443313568091196, "loss": 2.5531, "step": 4651 }, { "crossentropy": 2.6772966384887695, "epoch": 0.16864849187935035, "grad_norm": 0.04212047904729843, "grad_norm_var": 2.172329386421797e-05, "learning_rate": 0.009443047073671076, "loss": 2.6911, "step": 4652 }, { "crossentropy": 2.7143142223358154, "epoch": 0.16868474477958237, "grad_norm": 0.036909881979227066, "grad_norm_var": 2.3750886468866992e-05, "learning_rate": 0.009442780519240862, "loss": 2.7, "step": 4653 }, { "crossentropy": 2.5067532062530518, "epoch": 0.16872099767981438, "grad_norm": 0.040280990302562714, "grad_norm_var": 2.226827323788394e-05, "learning_rate": 0.009442513904804153, "loss": 2.563, "step": 4654 }, { "crossentropy": 2.7126755714416504, "epoch": 0.1687572505800464, "grad_norm": 0.04060456529259682, "grad_norm_var": 2.2137249893493192e-05, "learning_rate": 0.009442247230364553, "loss": 2.6112, "step": 4655 }, { "crossentropy": 2.7825639247894287, "epoch": 0.1687935034802784, "grad_norm": 0.0386294461786747, "grad_norm_var": 2.3294027503178067e-05, "learning_rate": 0.009441980495925663, "loss": 2.6934, "step": 4656 }, { "crossentropy": 2.5539121627807617, "epoch": 0.16882975638051045, "grad_norm": 0.04221736267209053, "grad_norm_var": 2.146092381856308e-05, "learning_rate": 0.009441713701491085, "loss": 2.6207, "step": 4657 }, { "crossentropy": 2.695758581161499, "epoch": 0.16886600928074247, "grad_norm": 0.03404616937041283, "grad_norm_var": 2.369196116773567e-05, "learning_rate": 0.009441446847064423, "loss": 2.7421, "step": 4658 }, { "crossentropy": 2.7328853607177734, "epoch": 0.16890226218097448, "grad_norm": 0.0355069637298584, "grad_norm_var": 2.565256768471205e-05, "learning_rate": 0.009441179932649282, "loss": 2.6844, "step": 4659 }, { "crossentropy": 2.7029879093170166, "epoch": 0.1689385150812065, "grad_norm": 0.03624805808067322, "grad_norm_var": 2.706379431695882e-05, "learning_rate": 0.009440912958249266, "loss": 2.7102, "step": 4660 }, { "crossentropy": 2.6753852367401123, "epoch": 0.1689747679814385, "grad_norm": 0.03631448745727539, "grad_norm_var": 2.820283094056922e-05, "learning_rate": 0.009440645923867981, "loss": 2.6369, "step": 4661 }, { "crossentropy": 2.8493812084198, "epoch": 0.16901102088167053, "grad_norm": 0.03400411084294319, "grad_norm_var": 2.6021387015848622e-05, "learning_rate": 0.009440378829509034, "loss": 2.8209, "step": 4662 }, { "crossentropy": 2.686877727508545, "epoch": 0.16904727378190254, "grad_norm": 0.038057513535022736, "grad_norm_var": 1.1403181071316845e-05, "learning_rate": 0.00944011167517603, "loss": 2.7117, "step": 4663 }, { "crossentropy": 2.6367027759552, "epoch": 0.16908352668213458, "grad_norm": 0.03537119925022125, "grad_norm_var": 7.516503302438387e-06, "learning_rate": 0.009439844460872582, "loss": 2.6699, "step": 4664 }, { "crossentropy": 2.4826555252075195, "epoch": 0.1691197795823666, "grad_norm": 0.03679978474974632, "grad_norm_var": 7.395099968480513e-06, "learning_rate": 0.009439577186602297, "loss": 2.5702, "step": 4665 }, { "crossentropy": 2.747666835784912, "epoch": 0.1691560324825986, "grad_norm": 0.04173074662685394, "grad_norm_var": 8.208561229638919e-06, "learning_rate": 0.009439309852368785, "loss": 2.6932, "step": 4666 }, { "crossentropy": 2.7922439575195312, "epoch": 0.16919228538283063, "grad_norm": 0.04490736499428749, "grad_norm_var": 1.0665328314102042e-05, "learning_rate": 0.009439042458175657, "loss": 2.7882, "step": 4667 }, { "crossentropy": 2.6690285205841064, "epoch": 0.16922853828306264, "grad_norm": 0.04640102759003639, "grad_norm_var": 1.395716484052922e-05, "learning_rate": 0.009438775004026524, "loss": 2.6556, "step": 4668 }, { "crossentropy": 2.7761757373809814, "epoch": 0.16926479118329466, "grad_norm": 0.03969477489590645, "grad_norm_var": 1.380434699539129e-05, "learning_rate": 0.009438507489924997, "loss": 2.8153, "step": 4669 }, { "crossentropy": 2.5929250717163086, "epoch": 0.16930104408352667, "grad_norm": 0.041694093495607376, "grad_norm_var": 1.4208018212981862e-05, "learning_rate": 0.009438239915874691, "loss": 2.6585, "step": 4670 }, { "crossentropy": 2.7286901473999023, "epoch": 0.1693372969837587, "grad_norm": 0.03644075617194176, "grad_norm_var": 1.4339288974453644e-05, "learning_rate": 0.009437972281879219, "loss": 2.7099, "step": 4671 }, { "crossentropy": 2.708890914916992, "epoch": 0.16937354988399073, "grad_norm": 0.04030619189143181, "grad_norm_var": 1.45151079476676e-05, "learning_rate": 0.009437704587942198, "loss": 2.7419, "step": 4672 }, { "crossentropy": 2.8391757011413574, "epoch": 0.16940980278422274, "grad_norm": 0.041083771735429764, "grad_norm_var": 1.4068895708343094e-05, "learning_rate": 0.009437436834067241, "loss": 2.7201, "step": 4673 }, { "crossentropy": 2.705580949783325, "epoch": 0.16944605568445475, "grad_norm": 0.03901529312133789, "grad_norm_var": 1.2553318084201846e-05, "learning_rate": 0.009437169020257968, "loss": 2.7139, "step": 4674 }, { "crossentropy": 2.526106357574463, "epoch": 0.16948230858468677, "grad_norm": 0.04483717679977417, "grad_norm_var": 1.3681642484974997e-05, "learning_rate": 0.009436901146517991, "loss": 2.6413, "step": 4675 }, { "crossentropy": 2.6377334594726562, "epoch": 0.16951856148491878, "grad_norm": 0.036215219646692276, "grad_norm_var": 1.3696196399611235e-05, "learning_rate": 0.009436633212850931, "loss": 2.6094, "step": 4676 }, { "crossentropy": 2.722201347351074, "epoch": 0.1695548143851508, "grad_norm": 0.03665501996874809, "grad_norm_var": 1.3556329141628925e-05, "learning_rate": 0.009436365219260407, "loss": 2.7064, "step": 4677 }, { "crossentropy": 2.8339309692382812, "epoch": 0.16959106728538284, "grad_norm": 0.0375487245619297, "grad_norm_var": 1.1708295481442123e-05, "learning_rate": 0.00943609716575004, "loss": 2.7386, "step": 4678 }, { "crossentropy": 2.837479591369629, "epoch": 0.16962732018561485, "grad_norm": 0.03616661578416824, "grad_norm_var": 1.2370427610974785e-05, "learning_rate": 0.009435829052323446, "loss": 2.8269, "step": 4679 }, { "crossentropy": 2.6725218296051025, "epoch": 0.16966357308584687, "grad_norm": 0.037835877388715744, "grad_norm_var": 1.1334369609021048e-05, "learning_rate": 0.00943556087898425, "loss": 2.7106, "step": 4680 }, { "crossentropy": 2.61198091506958, "epoch": 0.16969982598607888, "grad_norm": 0.03912138566374779, "grad_norm_var": 1.0732226106554185e-05, "learning_rate": 0.009435292645736074, "loss": 2.6231, "step": 4681 }, { "crossentropy": 2.6992080211639404, "epoch": 0.1697360788863109, "grad_norm": 0.04206507280468941, "grad_norm_var": 1.0817327029835564e-05, "learning_rate": 0.00943502435258254, "loss": 2.7484, "step": 4682 }, { "crossentropy": 2.5425937175750732, "epoch": 0.1697723317865429, "grad_norm": 0.039155036211013794, "grad_norm_var": 9.121012847261115e-06, "learning_rate": 0.009434755999527271, "loss": 2.5765, "step": 4683 }, { "crossentropy": 2.6705429553985596, "epoch": 0.16980858468677495, "grad_norm": 0.05334929749369621, "grad_norm_var": 1.8402304956974953e-05, "learning_rate": 0.009434487586573892, "loss": 2.6511, "step": 4684 }, { "crossentropy": 2.7158069610595703, "epoch": 0.16984483758700697, "grad_norm": 0.03777019679546356, "grad_norm_var": 1.8731123055643004e-05, "learning_rate": 0.00943421911372603, "loss": 2.7604, "step": 4685 }, { "crossentropy": 2.6922147274017334, "epoch": 0.16988109048723898, "grad_norm": 0.03804505988955498, "grad_norm_var": 1.8716587302083863e-05, "learning_rate": 0.009433950580987307, "loss": 2.7389, "step": 4686 }, { "crossentropy": 2.6079049110412598, "epoch": 0.169917343387471, "grad_norm": 0.03919316828250885, "grad_norm_var": 1.7984548750524244e-05, "learning_rate": 0.009433681988361356, "loss": 2.696, "step": 4687 }, { "crossentropy": 2.789344310760498, "epoch": 0.169953596287703, "grad_norm": 0.04045875743031502, "grad_norm_var": 1.799431320407621e-05, "learning_rate": 0.0094334133358518, "loss": 2.6985, "step": 4688 }, { "crossentropy": 2.7803521156311035, "epoch": 0.16998984918793503, "grad_norm": 0.041218291968107224, "grad_norm_var": 1.8016546681155302e-05, "learning_rate": 0.009433144623462267, "loss": 2.6755, "step": 4689 }, { "crossentropy": 2.6602120399475098, "epoch": 0.17002610208816704, "grad_norm": 0.04645462706685066, "grad_norm_var": 2.058246599541489e-05, "learning_rate": 0.00943287585119639, "loss": 2.7278, "step": 4690 }, { "crossentropy": 2.8341526985168457, "epoch": 0.17006235498839908, "grad_norm": 0.045681923627853394, "grad_norm_var": 2.112902354462944e-05, "learning_rate": 0.009432607019057798, "loss": 2.7513, "step": 4691 }, { "crossentropy": 2.678135633468628, "epoch": 0.1700986078886311, "grad_norm": 0.03446222096681595, "grad_norm_var": 2.2307013097851388e-05, "learning_rate": 0.009432338127050122, "loss": 2.6169, "step": 4692 }, { "crossentropy": 2.6921839714050293, "epoch": 0.1701348607888631, "grad_norm": 0.03524569049477577, "grad_norm_var": 2.31205594291628e-05, "learning_rate": 0.009432069175176992, "loss": 2.7103, "step": 4693 }, { "crossentropy": 2.6519851684570312, "epoch": 0.17017111368909513, "grad_norm": 0.03842165693640709, "grad_norm_var": 2.2855439969212085e-05, "learning_rate": 0.009431800163442042, "loss": 2.7001, "step": 4694 }, { "crossentropy": 2.5632998943328857, "epoch": 0.17020736658932714, "grad_norm": 0.038829296827316284, "grad_norm_var": 2.1834547678120653e-05, "learning_rate": 0.009431531091848905, "loss": 2.5678, "step": 4695 }, { "crossentropy": 2.6461756229400635, "epoch": 0.17024361948955916, "grad_norm": 0.03984108939766884, "grad_norm_var": 2.1385139035282747e-05, "learning_rate": 0.009431261960401217, "loss": 2.6694, "step": 4696 }, { "crossentropy": 2.8687903881073, "epoch": 0.17027987238979117, "grad_norm": 0.03994588181376457, "grad_norm_var": 2.1267051397925312e-05, "learning_rate": 0.009430992769102609, "loss": 2.7903, "step": 4697 }, { "crossentropy": 2.723055839538574, "epoch": 0.1703161252900232, "grad_norm": 0.04048388823866844, "grad_norm_var": 2.112151636883275e-05, "learning_rate": 0.009430723517956721, "loss": 2.6974, "step": 4698 }, { "crossentropy": 2.824146270751953, "epoch": 0.17035237819025523, "grad_norm": 0.0372438058257103, "grad_norm_var": 2.1701411252431775e-05, "learning_rate": 0.009430454206967188, "loss": 2.7542, "step": 4699 }, { "crossentropy": 2.779916286468506, "epoch": 0.17038863109048724, "grad_norm": 0.038990095257759094, "grad_norm_var": 9.825167556856637e-06, "learning_rate": 0.009430184836137648, "loss": 2.7396, "step": 4700 }, { "crossentropy": 2.7141430377960205, "epoch": 0.17042488399071926, "grad_norm": 0.0397091805934906, "grad_norm_var": 9.608322538178816e-06, "learning_rate": 0.009429915405471738, "loss": 2.701, "step": 4701 }, { "crossentropy": 2.814749240875244, "epoch": 0.17046113689095127, "grad_norm": 0.03650091215968132, "grad_norm_var": 1.0085525751348835e-05, "learning_rate": 0.009429645914973098, "loss": 2.655, "step": 4702 }, { "crossentropy": 2.6832950115203857, "epoch": 0.17049738979118328, "grad_norm": 0.03380699083209038, "grad_norm_var": 1.2149604341602644e-05, "learning_rate": 0.009429376364645367, "loss": 2.7131, "step": 4703 }, { "crossentropy": 2.7856218814849854, "epoch": 0.17053364269141533, "grad_norm": 0.03665287047624588, "grad_norm_var": 1.2419135438624697e-05, "learning_rate": 0.009429106754492187, "loss": 2.7311, "step": 4704 }, { "crossentropy": 2.6730802059173584, "epoch": 0.17056989559164734, "grad_norm": 0.03890036791563034, "grad_norm_var": 1.2059474393494076e-05, "learning_rate": 0.0094288370845172, "loss": 2.7503, "step": 4705 }, { "crossentropy": 2.682020664215088, "epoch": 0.17060614849187936, "grad_norm": 0.04343036189675331, "grad_norm_var": 9.55383174090103e-06, "learning_rate": 0.009428567354724046, "loss": 2.6106, "step": 4706 }, { "crossentropy": 2.627746105194092, "epoch": 0.17064240139211137, "grad_norm": 0.03863511234521866, "grad_norm_var": 6.035508015282224e-06, "learning_rate": 0.009428297565116369, "loss": 2.6225, "step": 4707 }, { "crossentropy": 2.4723904132843018, "epoch": 0.17067865429234338, "grad_norm": 0.03483318164944649, "grad_norm_var": 5.8595438004103875e-06, "learning_rate": 0.009428027715697814, "loss": 2.5894, "step": 4708 }, { "crossentropy": 2.7178094387054443, "epoch": 0.1707149071925754, "grad_norm": 0.035426799207925797, "grad_norm_var": 5.789845592121291e-06, "learning_rate": 0.009427757806472023, "loss": 2.6761, "step": 4709 }, { "crossentropy": 2.66682505607605, "epoch": 0.1707511600928074, "grad_norm": 0.03710993006825447, "grad_norm_var": 5.863552966675565e-06, "learning_rate": 0.009427487837442645, "loss": 2.6609, "step": 4710 }, { "crossentropy": 2.6589949131011963, "epoch": 0.17078741299303946, "grad_norm": 0.038695886731147766, "grad_norm_var": 5.852515050001836e-06, "learning_rate": 0.009427217808613327, "loss": 2.6753, "step": 4711 }, { "crossentropy": 2.5346531867980957, "epoch": 0.17082366589327147, "grad_norm": 0.03700724616646767, "grad_norm_var": 5.710887790082106e-06, "learning_rate": 0.009426947719987712, "loss": 2.677, "step": 4712 }, { "crossentropy": 2.6678128242492676, "epoch": 0.17085991879350348, "grad_norm": 0.035903219133615494, "grad_norm_var": 5.66232100119342e-06, "learning_rate": 0.00942667757156945, "loss": 2.6699, "step": 4713 }, { "crossentropy": 2.800325632095337, "epoch": 0.1708961716937355, "grad_norm": 0.03716457635164261, "grad_norm_var": 5.1224480847283465e-06, "learning_rate": 0.009426407363362187, "loss": 2.7393, "step": 4714 }, { "crossentropy": 2.620131492614746, "epoch": 0.1709324245939675, "grad_norm": 0.04326726123690605, "grad_norm_var": 7.183788579777303e-06, "learning_rate": 0.009426137095369578, "loss": 2.6613, "step": 4715 }, { "crossentropy": 2.682884693145752, "epoch": 0.17096867749419953, "grad_norm": 0.04062477871775627, "grad_norm_var": 7.593381107262367e-06, "learning_rate": 0.00942586676759527, "loss": 2.7257, "step": 4716 }, { "crossentropy": 2.8402326107025146, "epoch": 0.17100493039443154, "grad_norm": 0.044029172509908676, "grad_norm_var": 9.756190582128378e-06, "learning_rate": 0.009425596380042915, "loss": 2.7215, "step": 4717 }, { "crossentropy": 2.6915977001190186, "epoch": 0.17104118329466358, "grad_norm": 0.05085345730185509, "grad_norm_var": 1.928508639158725e-05, "learning_rate": 0.009425325932716165, "loss": 2.6826, "step": 4718 }, { "crossentropy": 2.5145533084869385, "epoch": 0.1710774361948956, "grad_norm": 0.046016741544008255, "grad_norm_var": 1.9910202339155394e-05, "learning_rate": 0.009425055425618672, "loss": 2.6008, "step": 4719 }, { "crossentropy": 2.6949944496154785, "epoch": 0.1711136890951276, "grad_norm": 0.036097202450037, "grad_norm_var": 2.0170776126411893e-05, "learning_rate": 0.00942478485875409, "loss": 2.6262, "step": 4720 }, { "crossentropy": 2.68973445892334, "epoch": 0.17114994199535963, "grad_norm": 0.037412818521261215, "grad_norm_var": 2.0502326453842624e-05, "learning_rate": 0.009424514232126075, "loss": 2.6924, "step": 4721 }, { "crossentropy": 2.8635802268981934, "epoch": 0.17118619489559164, "grad_norm": 0.035219017416238785, "grad_norm_var": 2.072177730979372e-05, "learning_rate": 0.00942424354573828, "loss": 2.7455, "step": 4722 }, { "crossentropy": 2.723754405975342, "epoch": 0.17122244779582366, "grad_norm": 0.03921665996313095, "grad_norm_var": 2.069380003916436e-05, "learning_rate": 0.009423972799594362, "loss": 2.6542, "step": 4723 }, { "crossentropy": 2.5415091514587402, "epoch": 0.17125870069605567, "grad_norm": 0.0422356016933918, "grad_norm_var": 1.9705028428620466e-05, "learning_rate": 0.009423701993697978, "loss": 2.5553, "step": 4724 }, { "crossentropy": 2.729261875152588, "epoch": 0.1712949535962877, "grad_norm": 0.04502512514591217, "grad_norm_var": 1.9907862079876263e-05, "learning_rate": 0.009423431128052785, "loss": 2.7077, "step": 4725 }, { "crossentropy": 2.4698545932769775, "epoch": 0.17133120649651973, "grad_norm": 0.04555052891373634, "grad_norm_var": 2.069457354519887e-05, "learning_rate": 0.009423160202662442, "loss": 2.52, "step": 4726 }, { "crossentropy": 2.7772786617279053, "epoch": 0.17136745939675174, "grad_norm": 0.04223749786615372, "grad_norm_var": 2.044007860512916e-05, "learning_rate": 0.009422889217530608, "loss": 2.7182, "step": 4727 }, { "crossentropy": 2.7670018672943115, "epoch": 0.17140371229698376, "grad_norm": 0.03888454660773277, "grad_norm_var": 1.963181921274514e-05, "learning_rate": 0.009422618172660943, "loss": 2.7538, "step": 4728 }, { "crossentropy": 2.6676583290100098, "epoch": 0.17143996519721577, "grad_norm": 0.04064882546663284, "grad_norm_var": 1.7666558762711484e-05, "learning_rate": 0.00942234706805711, "loss": 2.6773, "step": 4729 }, { "crossentropy": 2.5555014610290527, "epoch": 0.17147621809744779, "grad_norm": 0.037350691854953766, "grad_norm_var": 1.756038805125291e-05, "learning_rate": 0.009422075903722764, "loss": 2.6436, "step": 4730 }, { "crossentropy": 2.6974477767944336, "epoch": 0.17151247099767983, "grad_norm": 0.03917303681373596, "grad_norm_var": 1.766617020671768e-05, "learning_rate": 0.009421804679661576, "loss": 2.761, "step": 4731 }, { "crossentropy": 2.660344123840332, "epoch": 0.17154872389791184, "grad_norm": 0.044880688190460205, "grad_norm_var": 1.8423015531837616e-05, "learning_rate": 0.009421533395877204, "loss": 2.6726, "step": 4732 }, { "crossentropy": 2.5440661907196045, "epoch": 0.17158497679814386, "grad_norm": 0.04354885220527649, "grad_norm_var": 1.82787883705846e-05, "learning_rate": 0.009421262052373314, "loss": 2.6572, "step": 4733 }, { "crossentropy": 2.5895819664001465, "epoch": 0.17162122969837587, "grad_norm": 0.0406603142619133, "grad_norm_var": 1.2090238001866092e-05, "learning_rate": 0.009420990649153572, "loss": 2.6288, "step": 4734 }, { "crossentropy": 2.75696063041687, "epoch": 0.17165748259860789, "grad_norm": 0.043249428272247314, "grad_norm_var": 1.0675336871328786e-05, "learning_rate": 0.00942071918622164, "loss": 2.7441, "step": 4735 }, { "crossentropy": 2.70033860206604, "epoch": 0.1716937354988399, "grad_norm": 0.04533696174621582, "grad_norm_var": 1.0325966040237017e-05, "learning_rate": 0.009420447663581187, "loss": 2.7303, "step": 4736 }, { "crossentropy": 2.7198688983917236, "epoch": 0.17172998839907191, "grad_norm": 0.04160333797335625, "grad_norm_var": 9.257502216767667e-06, "learning_rate": 0.00942017608123588, "loss": 2.7002, "step": 4737 }, { "crossentropy": 2.6425533294677734, "epoch": 0.17176624129930396, "grad_norm": 0.04750995337963104, "grad_norm_var": 8.321873562059517e-06, "learning_rate": 0.009419904439189388, "loss": 2.7106, "step": 4738 }, { "crossentropy": 2.6922202110290527, "epoch": 0.17180249419953597, "grad_norm": 0.04846158251166344, "grad_norm_var": 9.838921297876333e-06, "learning_rate": 0.009419632737445378, "loss": 2.7211, "step": 4739 }, { "crossentropy": 2.8000383377075195, "epoch": 0.17183874709976799, "grad_norm": 0.050047967582941055, "grad_norm_var": 1.2964219103579235e-05, "learning_rate": 0.009419360976007523, "loss": 2.6987, "step": 4740 }, { "crossentropy": 2.7003071308135986, "epoch": 0.171875, "grad_norm": 0.045222796499729156, "grad_norm_var": 1.3009873277308216e-05, "learning_rate": 0.00941908915487949, "loss": 2.7471, "step": 4741 }, { "crossentropy": 2.7214529514312744, "epoch": 0.17191125290023201, "grad_norm": 0.041568461805582047, "grad_norm_var": 1.2858025471911873e-05, "learning_rate": 0.00941881727406495, "loss": 2.6742, "step": 4742 }, { "crossentropy": 2.53216814994812, "epoch": 0.17194750580046403, "grad_norm": 0.03522908315062523, "grad_norm_var": 1.6779705845900114e-05, "learning_rate": 0.00941854533356758, "loss": 2.6117, "step": 4743 }, { "crossentropy": 2.639134168624878, "epoch": 0.17198375870069604, "grad_norm": 0.037722282111644745, "grad_norm_var": 1.74571197585876e-05, "learning_rate": 0.009418273333391047, "loss": 2.6428, "step": 4744 }, { "crossentropy": 2.536909341812134, "epoch": 0.17202001160092809, "grad_norm": 0.03716839477419853, "grad_norm_var": 1.913748000798183e-05, "learning_rate": 0.00941800127353903, "loss": 2.6398, "step": 4745 }, { "crossentropy": 2.587648391723633, "epoch": 0.1720562645011601, "grad_norm": 0.03498381748795509, "grad_norm_var": 2.1087672478150148e-05, "learning_rate": 0.009417729154015201, "loss": 2.6148, "step": 4746 }, { "crossentropy": 2.6860151290893555, "epoch": 0.17209251740139211, "grad_norm": 0.040290266275405884, "grad_norm_var": 2.070391208308832e-05, "learning_rate": 0.009417456974823235, "loss": 2.7062, "step": 4747 }, { "crossentropy": 2.735941171646118, "epoch": 0.17212877030162413, "grad_norm": 0.035479094833135605, "grad_norm_var": 2.304687747268034e-05, "learning_rate": 0.009417184735966809, "loss": 2.7449, "step": 4748 }, { "crossentropy": 2.521221399307251, "epoch": 0.17216502320185614, "grad_norm": 0.03567902743816376, "grad_norm_var": 2.503562451396472e-05, "learning_rate": 0.0094169124374496, "loss": 2.5694, "step": 4749 }, { "crossentropy": 2.5457723140716553, "epoch": 0.17220127610208816, "grad_norm": 0.039494242519140244, "grad_norm_var": 2.5214356881022618e-05, "learning_rate": 0.009416640079275285, "loss": 2.6206, "step": 4750 }, { "crossentropy": 2.5153462886810303, "epoch": 0.17223752900232017, "grad_norm": 0.03917267546057701, "grad_norm_var": 2.5133891703785217e-05, "learning_rate": 0.009416367661447543, "loss": 2.6506, "step": 4751 }, { "crossentropy": 2.825538396835327, "epoch": 0.17227378190255221, "grad_norm": 0.03670822083950043, "grad_norm_var": 2.472360330051937e-05, "learning_rate": 0.009416095183970056, "loss": 2.7665, "step": 4752 }, { "crossentropy": 2.7489120960235596, "epoch": 0.17231003480278423, "grad_norm": 0.03804149851202965, "grad_norm_var": 2.4943297351595618e-05, "learning_rate": 0.009415822646846499, "loss": 2.7274, "step": 4753 }, { "crossentropy": 2.7648839950561523, "epoch": 0.17234628770301624, "grad_norm": 0.03540116176009178, "grad_norm_var": 2.226281938260127e-05, "learning_rate": 0.009415550050080558, "loss": 2.7549, "step": 4754 }, { "crossentropy": 2.5345003604888916, "epoch": 0.17238254060324826, "grad_norm": 0.034215621650218964, "grad_norm_var": 1.776702679935881e-05, "learning_rate": 0.009415277393675911, "loss": 2.5868, "step": 4755 }, { "crossentropy": 2.614518642425537, "epoch": 0.17241879350348027, "grad_norm": 0.0358983650803566, "grad_norm_var": 8.543744025299391e-06, "learning_rate": 0.009415004677636243, "loss": 2.6416, "step": 4756 }, { "crossentropy": 2.6965930461883545, "epoch": 0.1724550464037123, "grad_norm": 0.03553026169538498, "grad_norm_var": 4.618612906637636e-06, "learning_rate": 0.009414731901965236, "loss": 2.6917, "step": 4757 }, { "crossentropy": 2.7955751419067383, "epoch": 0.17249129930394433, "grad_norm": 0.03690369054675102, "grad_norm_var": 3.1598172979887515e-06, "learning_rate": 0.009414459066666577, "loss": 2.6145, "step": 4758 }, { "crossentropy": 2.5587210655212402, "epoch": 0.17252755220417634, "grad_norm": 0.035461269319057465, "grad_norm_var": 3.116261151284161e-06, "learning_rate": 0.009414186171743949, "loss": 2.598, "step": 4759 }, { "crossentropy": 2.5610673427581787, "epoch": 0.17256380510440836, "grad_norm": 0.03852643817663193, "grad_norm_var": 3.2599222516258945e-06, "learning_rate": 0.009413913217201036, "loss": 2.6185, "step": 4760 }, { "crossentropy": 2.696497917175293, "epoch": 0.17260005800464037, "grad_norm": 0.0419519878923893, "grad_norm_var": 4.91892091462956e-06, "learning_rate": 0.009413640203041527, "loss": 2.6607, "step": 4761 }, { "crossentropy": 2.5639748573303223, "epoch": 0.1726363109048724, "grad_norm": 0.04451041296124458, "grad_norm_var": 7.89224331702564e-06, "learning_rate": 0.00941336712926911, "loss": 2.5798, "step": 4762 }, { "crossentropy": 2.8130218982696533, "epoch": 0.1726725638051044, "grad_norm": 0.04113498702645302, "grad_norm_var": 8.228128440174119e-06, "learning_rate": 0.009413093995887474, "loss": 2.7565, "step": 4763 }, { "crossentropy": 2.8175201416015625, "epoch": 0.17270881670533642, "grad_norm": 0.041129838675260544, "grad_norm_var": 8.507705604780755e-06, "learning_rate": 0.009412820802900304, "loss": 2.7655, "step": 4764 }, { "crossentropy": 2.606834650039673, "epoch": 0.17274506960556846, "grad_norm": 0.04390246793627739, "grad_norm_var": 1.0068825950315431e-05, "learning_rate": 0.009412547550311293, "loss": 2.6499, "step": 4765 }, { "crossentropy": 2.6167571544647217, "epoch": 0.17278132250580047, "grad_norm": 0.04233939200639725, "grad_norm_var": 1.090490540558749e-05, "learning_rate": 0.009412274238124132, "loss": 2.6201, "step": 4766 }, { "crossentropy": 2.699427604675293, "epoch": 0.1728175754060325, "grad_norm": 0.035349488258361816, "grad_norm_var": 1.1629379824358809e-05, "learning_rate": 0.00941200086634251, "loss": 2.6972, "step": 4767 }, { "crossentropy": 2.6076252460479736, "epoch": 0.1728538283062645, "grad_norm": 0.037119876593351364, "grad_norm_var": 1.1538176974948815e-05, "learning_rate": 0.00941172743497012, "loss": 2.637, "step": 4768 }, { "crossentropy": 2.725827693939209, "epoch": 0.17289008120649652, "grad_norm": 0.04621383175253868, "grad_norm_var": 1.511627763331787e-05, "learning_rate": 0.00941145394401066, "loss": 2.7368, "step": 4769 }, { "crossentropy": 2.682169198989868, "epoch": 0.17292633410672853, "grad_norm": 0.049893107265233994, "grad_norm_var": 2.1096510354057645e-05, "learning_rate": 0.009411180393467817, "loss": 2.63, "step": 4770 }, { "crossentropy": 2.7337939739227295, "epoch": 0.17296258700696054, "grad_norm": 0.03909221664071083, "grad_norm_var": 1.8818464759834396e-05, "learning_rate": 0.009410906783345288, "loss": 2.781, "step": 4771 }, { "crossentropy": 2.8095943927764893, "epoch": 0.1729988399071926, "grad_norm": 0.03539411351084709, "grad_norm_var": 1.9130956505686245e-05, "learning_rate": 0.00941063311364677, "loss": 2.7633, "step": 4772 }, { "crossentropy": 2.717174530029297, "epoch": 0.1730350928074246, "grad_norm": 0.03739308565855026, "grad_norm_var": 1.8168528296993018e-05, "learning_rate": 0.00941035938437596, "loss": 2.688, "step": 4773 }, { "crossentropy": 2.590505361557007, "epoch": 0.17307134570765662, "grad_norm": 0.04190678521990776, "grad_norm_var": 1.7404141091279835e-05, "learning_rate": 0.009410085595536551, "loss": 2.6376, "step": 4774 }, { "crossentropy": 2.764427661895752, "epoch": 0.17310759860788863, "grad_norm": 0.035057492554187775, "grad_norm_var": 1.7696769268879935e-05, "learning_rate": 0.009409811747132244, "loss": 2.6843, "step": 4775 }, { "crossentropy": 2.7216951847076416, "epoch": 0.17314385150812064, "grad_norm": 0.036201201379299164, "grad_norm_var": 1.870305012504573e-05, "learning_rate": 0.009409537839166739, "loss": 2.759, "step": 4776 }, { "crossentropy": 2.7239112854003906, "epoch": 0.17318010440835266, "grad_norm": 0.03510243073105812, "grad_norm_var": 2.0342957273796794e-05, "learning_rate": 0.009409263871643732, "loss": 2.7011, "step": 4777 }, { "crossentropy": 2.727949380874634, "epoch": 0.17321635730858467, "grad_norm": 0.03703105449676514, "grad_norm_var": 1.9449754165562495e-05, "learning_rate": 0.009408989844566926, "loss": 2.7503, "step": 4778 }, { "crossentropy": 2.732093334197998, "epoch": 0.17325261020881672, "grad_norm": 0.03959164023399353, "grad_norm_var": 1.9291261172785176e-05, "learning_rate": 0.009408715757940021, "loss": 2.6739, "step": 4779 }, { "crossentropy": 2.643836259841919, "epoch": 0.17328886310904873, "grad_norm": 0.035420890897512436, "grad_norm_var": 2.01218042448561e-05, "learning_rate": 0.009408441611766719, "loss": 2.5897, "step": 4780 }, { "crossentropy": 2.8760762214660645, "epoch": 0.17332511600928074, "grad_norm": 0.03468623012304306, "grad_norm_var": 1.9637289093254786e-05, "learning_rate": 0.009408167406050723, "loss": 2.8023, "step": 4781 }, { "crossentropy": 2.7133049964904785, "epoch": 0.17336136890951276, "grad_norm": 0.03960345312952995, "grad_norm_var": 1.874542105326961e-05, "learning_rate": 0.009407893140795737, "loss": 2.6432, "step": 4782 }, { "crossentropy": 2.772590160369873, "epoch": 0.17339762180974477, "grad_norm": 0.036684367805719376, "grad_norm_var": 1.830654058260032e-05, "learning_rate": 0.009407618816005464, "loss": 2.7487, "step": 4783 }, { "crossentropy": 2.9163036346435547, "epoch": 0.1734338747099768, "grad_norm": 0.03780810907483101, "grad_norm_var": 1.8207251525633136e-05, "learning_rate": 0.00940734443168361, "loss": 2.766, "step": 4784 }, { "crossentropy": 2.674302577972412, "epoch": 0.17347012761020883, "grad_norm": 0.03653528913855553, "grad_norm_var": 1.4194509371708251e-05, "learning_rate": 0.009407069987833883, "loss": 2.6786, "step": 4785 }, { "crossentropy": 2.67848539352417, "epoch": 0.17350638051044084, "grad_norm": 0.03491716459393501, "grad_norm_var": 4.389175849056626e-06, "learning_rate": 0.009406795484459987, "loss": 2.6525, "step": 4786 }, { "crossentropy": 2.633363723754883, "epoch": 0.17354263341067286, "grad_norm": 0.03575843572616577, "grad_norm_var": 4.165629608672965e-06, "learning_rate": 0.00940652092156563, "loss": 2.6137, "step": 4787 }, { "crossentropy": 2.691601276397705, "epoch": 0.17357888631090487, "grad_norm": 0.03744782134890556, "grad_norm_var": 4.039273191953387e-06, "learning_rate": 0.00940624629915452, "loss": 2.6185, "step": 4788 }, { "crossentropy": 2.631441116333008, "epoch": 0.1736151392111369, "grad_norm": 0.038342833518981934, "grad_norm_var": 4.152190514678834e-06, "learning_rate": 0.009405971617230368, "loss": 2.729, "step": 4789 }, { "crossentropy": 2.7407419681549072, "epoch": 0.1736513921113689, "grad_norm": 0.0371561124920845, "grad_norm_var": 2.458444327646525e-06, "learning_rate": 0.009405696875796883, "loss": 2.6963, "step": 4790 }, { "crossentropy": 2.547755718231201, "epoch": 0.17368764501160092, "grad_norm": 0.03808574751019478, "grad_norm_var": 2.3647517796112604e-06, "learning_rate": 0.009405422074857776, "loss": 2.7237, "step": 4791 }, { "crossentropy": 2.8400726318359375, "epoch": 0.17372389791183296, "grad_norm": 0.04037831351161003, "grad_norm_var": 3.067021128771996e-06, "learning_rate": 0.009405147214416758, "loss": 2.6899, "step": 4792 }, { "crossentropy": 2.655036449432373, "epoch": 0.17376015081206497, "grad_norm": 0.036476198583841324, "grad_norm_var": 2.808206198987954e-06, "learning_rate": 0.009404872294477543, "loss": 2.6609, "step": 4793 }, { "crossentropy": 2.581857681274414, "epoch": 0.173796403712297, "grad_norm": 0.03555135428905487, "grad_norm_var": 2.987305928249066e-06, "learning_rate": 0.00940459731504384, "loss": 2.5749, "step": 4794 }, { "crossentropy": 2.601078987121582, "epoch": 0.173832656612529, "grad_norm": 0.037112943828105927, "grad_norm_var": 2.5652653405943786e-06, "learning_rate": 0.009404322276119368, "loss": 2.5798, "step": 4795 }, { "crossentropy": 2.786581516265869, "epoch": 0.17386890951276102, "grad_norm": 0.03435452654957771, "grad_norm_var": 2.8605482384091998e-06, "learning_rate": 0.009404047177707842, "loss": 2.775, "step": 4796 }, { "crossentropy": 2.6666481494903564, "epoch": 0.17390516241299303, "grad_norm": 0.036157622933387756, "grad_norm_var": 2.555433197174578e-06, "learning_rate": 0.009403772019812974, "loss": 2.7154, "step": 4797 }, { "crossentropy": 2.7085680961608887, "epoch": 0.17394141531322505, "grad_norm": 0.03844611719250679, "grad_norm_var": 2.2409759909432688e-06, "learning_rate": 0.009403496802438481, "loss": 2.726, "step": 4798 }, { "crossentropy": 2.7740159034729004, "epoch": 0.1739776682134571, "grad_norm": 0.045996058732271194, "grad_norm_var": 7.329396734661228e-06, "learning_rate": 0.00940322152558808, "loss": 2.7513, "step": 4799 }, { "crossentropy": 2.6504688262939453, "epoch": 0.1740139211136891, "grad_norm": 0.05492321774363518, "grad_norm_var": 2.626561161207496e-05, "learning_rate": 0.009402946189265491, "loss": 2.6516, "step": 4800 }, { "crossentropy": 2.594921588897705, "epoch": 0.17405017401392112, "grad_norm": 0.06041610985994339, "grad_norm_var": 5.5326784131575804e-05, "learning_rate": 0.009402670793474432, "loss": 2.6545, "step": 4801 }, { "crossentropy": 2.6718311309814453, "epoch": 0.17408642691415313, "grad_norm": 0.04419352486729622, "grad_norm_var": 5.430072260845283e-05, "learning_rate": 0.009402395338218624, "loss": 2.7731, "step": 4802 }, { "crossentropy": 2.6667730808258057, "epoch": 0.17412267981438515, "grad_norm": 0.03552618622779846, "grad_norm_var": 5.4456337201065574e-05, "learning_rate": 0.009402119823501786, "loss": 2.6944, "step": 4803 }, { "crossentropy": 2.610765218734741, "epoch": 0.17415893271461716, "grad_norm": 0.03578853979706764, "grad_norm_var": 5.5339132497970955e-05, "learning_rate": 0.009401844249327638, "loss": 2.6314, "step": 4804 }, { "crossentropy": 2.5220694541931152, "epoch": 0.1741951856148492, "grad_norm": 0.03913599252700806, "grad_norm_var": 5.514433675169502e-05, "learning_rate": 0.009401568615699905, "loss": 2.5826, "step": 4805 }, { "crossentropy": 2.595651865005493, "epoch": 0.17423143851508122, "grad_norm": 0.03625417500734329, "grad_norm_var": 5.561007695949512e-05, "learning_rate": 0.009401292922622307, "loss": 2.5929, "step": 4806 }, { "crossentropy": 2.5700693130493164, "epoch": 0.17426769141531323, "grad_norm": 0.03563349321484566, "grad_norm_var": 5.6791584854160434e-05, "learning_rate": 0.00940101717009857, "loss": 2.6602, "step": 4807 }, { "crossentropy": 2.6856541633605957, "epoch": 0.17430394431554525, "grad_norm": 0.03407653421163559, "grad_norm_var": 5.928891190155582e-05, "learning_rate": 0.009400741358132415, "loss": 2.6831, "step": 4808 }, { "crossentropy": 2.660921096801758, "epoch": 0.17434019721577726, "grad_norm": 0.036514684557914734, "grad_norm_var": 5.927090855626921e-05, "learning_rate": 0.009400465486727572, "loss": 2.7599, "step": 4809 }, { "crossentropy": 2.7191286087036133, "epoch": 0.17437645011600927, "grad_norm": 0.038406189531087875, "grad_norm_var": 5.808500646150849e-05, "learning_rate": 0.009400189555887764, "loss": 2.7698, "step": 4810 }, { "crossentropy": 2.697847604751587, "epoch": 0.1744127030162413, "grad_norm": 0.03716062009334564, "grad_norm_var": 5.806562953864849e-05, "learning_rate": 0.009399913565616718, "loss": 2.7118, "step": 4811 }, { "crossentropy": 2.7354280948638916, "epoch": 0.17444895591647333, "grad_norm": 0.03739356994628906, "grad_norm_var": 5.627972700559206e-05, "learning_rate": 0.009399637515918163, "loss": 2.6303, "step": 4812 }, { "crossentropy": 2.6576743125915527, "epoch": 0.17448520881670534, "grad_norm": 0.03597966581583023, "grad_norm_var": 5.638180817306478e-05, "learning_rate": 0.009399361406795826, "loss": 2.6913, "step": 4813 }, { "crossentropy": 2.668757438659668, "epoch": 0.17452146171693736, "grad_norm": 0.036034028977155685, "grad_norm_var": 5.7362673109247604e-05, "learning_rate": 0.009399085238253436, "loss": 2.6627, "step": 4814 }, { "crossentropy": 2.6907730102539062, "epoch": 0.17455771461716937, "grad_norm": 0.03714936226606369, "grad_norm_var": 5.543452639106989e-05, "learning_rate": 0.009398809010294724, "loss": 2.6951, "step": 4815 }, { "crossentropy": 2.7581963539123535, "epoch": 0.1745939675174014, "grad_norm": 0.038815177977085114, "grad_norm_var": 3.8873408943447944e-05, "learning_rate": 0.009398532722923424, "loss": 2.7033, "step": 4816 }, { "crossentropy": 2.6876955032348633, "epoch": 0.1746302204176334, "grad_norm": 0.03638385236263275, "grad_norm_var": 5.240670790896256e-06, "learning_rate": 0.009398256376143261, "loss": 2.6698, "step": 4817 }, { "crossentropy": 2.730405807495117, "epoch": 0.17466647331786542, "grad_norm": 0.03600678592920303, "grad_norm_var": 1.7442334031706844e-06, "learning_rate": 0.009397979969957972, "loss": 2.7613, "step": 4818 }, { "crossentropy": 2.6037561893463135, "epoch": 0.17470272621809746, "grad_norm": 0.035156503319740295, "grad_norm_var": 1.8077341462298854e-06, "learning_rate": 0.009397703504371292, "loss": 2.6471, "step": 4819 }, { "crossentropy": 2.598297357559204, "epoch": 0.17473897911832947, "grad_norm": 0.037288255989551544, "grad_norm_var": 1.7824305929757418e-06, "learning_rate": 0.009397426979386949, "loss": 2.6423, "step": 4820 }, { "crossentropy": 2.5821330547332764, "epoch": 0.1747752320185615, "grad_norm": 0.03467745706439018, "grad_norm_var": 1.5837295421798162e-06, "learning_rate": 0.009397150395008682, "loss": 2.6225, "step": 4821 }, { "crossentropy": 2.7065980434417725, "epoch": 0.1748114849187935, "grad_norm": 0.038948532193899155, "grad_norm_var": 1.9731567237034226e-06, "learning_rate": 0.009396873751240226, "loss": 2.7215, "step": 4822 }, { "crossentropy": 2.7805869579315186, "epoch": 0.17484773781902552, "grad_norm": 0.03749457374215126, "grad_norm_var": 1.9494168106102065e-06, "learning_rate": 0.009396597048085318, "loss": 2.7541, "step": 4823 }, { "crossentropy": 2.7632532119750977, "epoch": 0.17488399071925753, "grad_norm": 0.038857195526361465, "grad_norm_var": 1.6941977510463261e-06, "learning_rate": 0.009396320285547696, "loss": 2.7004, "step": 4824 }, { "crossentropy": 2.635650634765625, "epoch": 0.17492024361948955, "grad_norm": 0.03601003810763359, "grad_norm_var": 1.7438900798614927e-06, "learning_rate": 0.009396043463631097, "loss": 2.7322, "step": 4825 }, { "crossentropy": 2.719343423843384, "epoch": 0.1749564965197216, "grad_norm": 0.03694571182131767, "grad_norm_var": 1.600475540072752e-06, "learning_rate": 0.009395766582339258, "loss": 2.7177, "step": 4826 }, { "crossentropy": 2.8052234649658203, "epoch": 0.1749927494199536, "grad_norm": 0.039901673793792725, "grad_norm_var": 2.167565130238522e-06, "learning_rate": 0.00939548964167592, "loss": 2.7343, "step": 4827 }, { "crossentropy": 2.5583605766296387, "epoch": 0.17502900232018562, "grad_norm": 0.0381239615380764, "grad_norm_var": 2.232890570533913e-06, "learning_rate": 0.009395212641644826, "loss": 2.5865, "step": 4828 }, { "crossentropy": 2.654820442199707, "epoch": 0.17506525522041763, "grad_norm": 0.03825404495000839, "grad_norm_var": 2.2131739803733784e-06, "learning_rate": 0.009394935582249714, "loss": 2.6968, "step": 4829 }, { "crossentropy": 2.717078685760498, "epoch": 0.17510150812064965, "grad_norm": 0.0377088338136673, "grad_norm_var": 2.1162913265629703e-06, "learning_rate": 0.009394658463494328, "loss": 2.7358, "step": 4830 }, { "crossentropy": 2.58797287940979, "epoch": 0.17513776102088166, "grad_norm": 0.04433692246675491, "grad_norm_var": 5.1455207594358964e-06, "learning_rate": 0.009394381285382409, "loss": 2.5944, "step": 4831 }, { "crossentropy": 2.7219061851501465, "epoch": 0.1751740139211137, "grad_norm": 0.052009712904691696, "grad_norm_var": 1.78004364824208e-05, "learning_rate": 0.009394104047917705, "loss": 2.7039, "step": 4832 }, { "crossentropy": 2.734511613845825, "epoch": 0.17521026682134572, "grad_norm": 0.042288441210985184, "grad_norm_var": 1.82099196521794e-05, "learning_rate": 0.009393826751103954, "loss": 2.7304, "step": 4833 }, { "crossentropy": 2.7644267082214355, "epoch": 0.17524651972157773, "grad_norm": 0.03767312318086624, "grad_norm_var": 1.771831489050194e-05, "learning_rate": 0.009393549394944908, "loss": 2.7325, "step": 4834 }, { "crossentropy": 2.8571035861968994, "epoch": 0.17528277262180975, "grad_norm": 0.036064691841602325, "grad_norm_var": 1.7291772688401463e-05, "learning_rate": 0.009393271979444307, "loss": 2.867, "step": 4835 }, { "crossentropy": 2.6809937953948975, "epoch": 0.17531902552204176, "grad_norm": 0.03642936423420906, "grad_norm_var": 1.7552394534081407e-05, "learning_rate": 0.009392994504605902, "loss": 2.7236, "step": 4836 }, { "crossentropy": 2.686413049697876, "epoch": 0.17535527842227377, "grad_norm": 0.035855162888765335, "grad_norm_var": 1.694340116741855e-05, "learning_rate": 0.00939271697043344, "loss": 2.6088, "step": 4837 }, { "crossentropy": 2.49293851852417, "epoch": 0.1753915313225058, "grad_norm": 0.03523235395550728, "grad_norm_var": 1.79218958548312e-05, "learning_rate": 0.00939243937693067, "loss": 2.5454, "step": 4838 }, { "crossentropy": 2.7725391387939453, "epoch": 0.17542778422273783, "grad_norm": 0.03695552796125412, "grad_norm_var": 1.8044598259539004e-05, "learning_rate": 0.009392161724101341, "loss": 2.6499, "step": 4839 }, { "crossentropy": 2.7051429748535156, "epoch": 0.17546403712296985, "grad_norm": 0.033729515969753265, "grad_norm_var": 1.972772610507878e-05, "learning_rate": 0.009391884011949198, "loss": 2.7046, "step": 4840 }, { "crossentropy": 2.7524430751800537, "epoch": 0.17550029002320186, "grad_norm": 0.03752676770091057, "grad_norm_var": 1.9348758589839608e-05, "learning_rate": 0.009391606240478, "loss": 2.6437, "step": 4841 }, { "crossentropy": 2.7459616661071777, "epoch": 0.17553654292343387, "grad_norm": 0.03594283387064934, "grad_norm_var": 1.9644824923322387e-05, "learning_rate": 0.009391328409691497, "loss": 2.7302, "step": 4842 }, { "crossentropy": 2.6676785945892334, "epoch": 0.1755727958236659, "grad_norm": 0.03382130712270737, "grad_norm_var": 2.092215296655685e-05, "learning_rate": 0.009391050519593436, "loss": 2.6607, "step": 4843 }, { "crossentropy": 2.567993402481079, "epoch": 0.1756090487238979, "grad_norm": 0.03790014609694481, "grad_norm_var": 2.0928956576991148e-05, "learning_rate": 0.009390772570187576, "loss": 2.6498, "step": 4844 }, { "crossentropy": 2.8588755130767822, "epoch": 0.17564530162412992, "grad_norm": 0.04435260593891144, "grad_norm_var": 2.3270558857327317e-05, "learning_rate": 0.00939049456147767, "loss": 2.7799, "step": 4845 }, { "crossentropy": 2.6023902893066406, "epoch": 0.17568155452436196, "grad_norm": 0.04208097979426384, "grad_norm_var": 2.393749783605129e-05, "learning_rate": 0.00939021649346747, "loss": 2.7113, "step": 4846 }, { "crossentropy": 2.646994113922119, "epoch": 0.17571780742459397, "grad_norm": 0.03546906262636185, "grad_norm_var": 2.2409096067374916e-05, "learning_rate": 0.009389938366160736, "loss": 2.7149, "step": 4847 }, { "crossentropy": 2.6656665802001953, "epoch": 0.175754060324826, "grad_norm": 0.03252898529171944, "grad_norm_var": 1.0604042656351247e-05, "learning_rate": 0.00938966017956122, "loss": 2.672, "step": 4848 }, { "crossentropy": 2.727536916732788, "epoch": 0.175790313225058, "grad_norm": 0.03345803543925285, "grad_norm_var": 9.387201967737898e-06, "learning_rate": 0.009389381933672683, "loss": 2.7628, "step": 4849 }, { "crossentropy": 2.5745933055877686, "epoch": 0.17582656612529002, "grad_norm": 0.03366849943995476, "grad_norm_var": 9.797181032360677e-06, "learning_rate": 0.009389103628498882, "loss": 2.6417, "step": 4850 }, { "crossentropy": 2.6600074768066406, "epoch": 0.17586281902552203, "grad_norm": 0.038100410252809525, "grad_norm_var": 9.98865930447451e-06, "learning_rate": 0.009388825264043575, "loss": 2.6298, "step": 4851 }, { "crossentropy": 2.665799856185913, "epoch": 0.17589907192575405, "grad_norm": 0.035164348781108856, "grad_norm_var": 1.0090591579816251e-05, "learning_rate": 0.009388546840310524, "loss": 2.7406, "step": 4852 }, { "crossentropy": 2.5112268924713135, "epoch": 0.1759353248259861, "grad_norm": 0.03557997941970825, "grad_norm_var": 1.0113908363436283e-05, "learning_rate": 0.009388268357303487, "loss": 2.5834, "step": 4853 }, { "crossentropy": 2.7904815673828125, "epoch": 0.1759715777262181, "grad_norm": 0.03614719212055206, "grad_norm_var": 1.003056347986785e-05, "learning_rate": 0.009387989815026226, "loss": 2.689, "step": 4854 }, { "crossentropy": 2.7250561714172363, "epoch": 0.17600783062645012, "grad_norm": 0.03440467640757561, "grad_norm_var": 1.0248855515908721e-05, "learning_rate": 0.009387711213482504, "loss": 2.7294, "step": 4855 }, { "crossentropy": 2.661102533340454, "epoch": 0.17604408352668213, "grad_norm": 0.035212382674217224, "grad_norm_var": 9.8894878306901e-06, "learning_rate": 0.009387432552676085, "loss": 2.6335, "step": 4856 }, { "crossentropy": 2.5550456047058105, "epoch": 0.17608033642691415, "grad_norm": 0.037536606192588806, "grad_norm_var": 9.891057386496084e-06, "learning_rate": 0.00938715383261073, "loss": 2.6279, "step": 4857 }, { "crossentropy": 2.6674447059631348, "epoch": 0.17611658932714616, "grad_norm": 0.03706847131252289, "grad_norm_var": 9.911314838876619e-06, "learning_rate": 0.009386875053290203, "loss": 2.6589, "step": 4858 }, { "crossentropy": 2.615567445755005, "epoch": 0.1761528422273782, "grad_norm": 0.03846421465277672, "grad_norm_var": 9.658625816385177e-06, "learning_rate": 0.009386596214718273, "loss": 2.6632, "step": 4859 }, { "crossentropy": 2.771493434906006, "epoch": 0.17618909512761022, "grad_norm": 0.03991078585386276, "grad_norm_var": 1.0234096706074278e-05, "learning_rate": 0.009386317316898705, "loss": 2.7544, "step": 4860 }, { "crossentropy": 2.6658926010131836, "epoch": 0.17622534802784223, "grad_norm": 0.03919261321425438, "grad_norm_var": 6.716937589353276e-06, "learning_rate": 0.009386038359835263, "loss": 2.7627, "step": 4861 }, { "crossentropy": 2.601654291152954, "epoch": 0.17626160092807425, "grad_norm": 0.0404648557305336, "grad_norm_var": 5.677399417483518e-06, "learning_rate": 0.00938575934353172, "loss": 2.7042, "step": 4862 }, { "crossentropy": 2.5895721912384033, "epoch": 0.17629785382830626, "grad_norm": 0.04079775884747505, "grad_norm_var": 6.791945268139199e-06, "learning_rate": 0.009385480267991839, "loss": 2.6604, "step": 4863 }, { "crossentropy": 2.775919198989868, "epoch": 0.17633410672853828, "grad_norm": 0.03844339773058891, "grad_norm_var": 5.664364622885027e-06, "learning_rate": 0.009385201133219391, "loss": 2.7732, "step": 4864 }, { "crossentropy": 2.7057435512542725, "epoch": 0.1763703596287703, "grad_norm": 0.039358485490083694, "grad_norm_var": 4.9743912842283486e-06, "learning_rate": 0.009384921939218149, "loss": 2.7471, "step": 4865 }, { "crossentropy": 2.5491912364959717, "epoch": 0.17640661252900233, "grad_norm": 0.038063909858465195, "grad_norm_var": 3.95417590727994e-06, "learning_rate": 0.00938464268599188, "loss": 2.6436, "step": 4866 }, { "crossentropy": 2.7198069095611572, "epoch": 0.17644286542923435, "grad_norm": 0.037028536200523376, "grad_norm_var": 3.9751004955761985e-06, "learning_rate": 0.009384363373544358, "loss": 2.7021, "step": 4867 }, { "crossentropy": 2.7486519813537598, "epoch": 0.17647911832946636, "grad_norm": 0.04241875186562538, "grad_norm_var": 4.833501265660613e-06, "learning_rate": 0.009384084001879357, "loss": 2.7298, "step": 4868 }, { "crossentropy": 2.661367654800415, "epoch": 0.17651537122969838, "grad_norm": 0.048764076083898544, "grad_norm_var": 1.1213261264661431e-05, "learning_rate": 0.009383804571000647, "loss": 2.6868, "step": 4869 }, { "crossentropy": 2.810269355773926, "epoch": 0.1765516241299304, "grad_norm": 0.047437284141778946, "grad_norm_var": 1.4953485273420247e-05, "learning_rate": 0.009383525080912003, "loss": 2.7822, "step": 4870 }, { "crossentropy": 2.5425803661346436, "epoch": 0.1765878770301624, "grad_norm": 0.041080377995967865, "grad_norm_var": 1.3060689009544655e-05, "learning_rate": 0.009383245531617202, "loss": 2.6118, "step": 4871 }, { "crossentropy": 2.7138001918792725, "epoch": 0.17662412993039442, "grad_norm": 0.0358613096177578, "grad_norm_var": 1.2666047155112289e-05, "learning_rate": 0.009382965923120016, "loss": 2.7673, "step": 4872 }, { "crossentropy": 2.8032264709472656, "epoch": 0.17666038283062646, "grad_norm": 0.04020564258098602, "grad_norm_var": 1.2192560927338596e-05, "learning_rate": 0.009382686255424226, "loss": 2.6799, "step": 4873 }, { "crossentropy": 2.5604660511016846, "epoch": 0.17669663573085848, "grad_norm": 0.03821774944663048, "grad_norm_var": 1.1782217450638914e-05, "learning_rate": 0.009382406528533607, "loss": 2.7026, "step": 4874 }, { "crossentropy": 2.747579574584961, "epoch": 0.1767328886310905, "grad_norm": 0.03652723506093025, "grad_norm_var": 1.2505512454264305e-05, "learning_rate": 0.009382126742451938, "loss": 2.6443, "step": 4875 }, { "crossentropy": 2.67407488822937, "epoch": 0.1767691415313225, "grad_norm": 0.03627879545092583, "grad_norm_var": 1.3487364281990442e-05, "learning_rate": 0.009381846897182996, "loss": 2.6426, "step": 4876 }, { "crossentropy": 2.749110698699951, "epoch": 0.17680539443155452, "grad_norm": 0.03462004289031029, "grad_norm_var": 1.529174798435076e-05, "learning_rate": 0.009381566992730562, "loss": 2.6687, "step": 4877 }, { "crossentropy": 2.720747232437134, "epoch": 0.17684164733178653, "grad_norm": 0.03557654097676277, "grad_norm_var": 1.6301709576906942e-05, "learning_rate": 0.009381287029098415, "loss": 2.7132, "step": 4878 }, { "crossentropy": 2.6731908321380615, "epoch": 0.17687790023201855, "grad_norm": 0.036124709993600845, "grad_norm_var": 1.6806539787729036e-05, "learning_rate": 0.00938100700629034, "loss": 2.6543, "step": 4879 }, { "crossentropy": 2.5993146896362305, "epoch": 0.1769141531322506, "grad_norm": 0.03550398722290993, "grad_norm_var": 1.76138503656895e-05, "learning_rate": 0.009380726924310117, "loss": 2.5408, "step": 4880 }, { "crossentropy": 2.6055381298065186, "epoch": 0.1769504060324826, "grad_norm": 0.035556454211473465, "grad_norm_var": 1.8306038726693433e-05, "learning_rate": 0.009380446783161528, "loss": 2.5645, "step": 4881 }, { "crossentropy": 2.815031051635742, "epoch": 0.17698665893271462, "grad_norm": 0.03771438077092171, "grad_norm_var": 1.8343509159536527e-05, "learning_rate": 0.009380166582848357, "loss": 2.7367, "step": 4882 }, { "crossentropy": 2.534496784210205, "epoch": 0.17702291183294663, "grad_norm": 0.036414340138435364, "grad_norm_var": 1.8502513084541796e-05, "learning_rate": 0.009379886323374392, "loss": 2.5573, "step": 4883 }, { "crossentropy": 2.772315263748169, "epoch": 0.17705916473317865, "grad_norm": 0.03609303757548332, "grad_norm_var": 1.7819573459916008e-05, "learning_rate": 0.009379606004743413, "loss": 2.6773, "step": 4884 }, { "crossentropy": 2.597337245941162, "epoch": 0.17709541763341066, "grad_norm": 0.034599944949150085, "grad_norm_var": 1.049928258992916e-05, "learning_rate": 0.00937932562695921, "loss": 2.6675, "step": 4885 }, { "crossentropy": 2.6705219745635986, "epoch": 0.1771316705336427, "grad_norm": 0.037630174309015274, "grad_norm_var": 3.3375270764995912e-06, "learning_rate": 0.009379045190025566, "loss": 2.7346, "step": 4886 }, { "crossentropy": 2.737614870071411, "epoch": 0.17716792343387472, "grad_norm": 0.0415753610432148, "grad_norm_var": 3.638615767059296e-06, "learning_rate": 0.009378764693946273, "loss": 2.7457, "step": 4887 }, { "crossentropy": 2.7704155445098877, "epoch": 0.17720417633410673, "grad_norm": 0.05044950172305107, "grad_norm_var": 1.5150241849234911e-05, "learning_rate": 0.00937848413872512, "loss": 2.7453, "step": 4888 }, { "crossentropy": 2.6670665740966797, "epoch": 0.17724042923433875, "grad_norm": 0.0688709169626236, "grad_norm_var": 7.61098002110956e-05, "learning_rate": 0.00937820352436589, "loss": 2.7394, "step": 4889 }, { "crossentropy": 2.6370999813079834, "epoch": 0.17727668213457076, "grad_norm": 0.03621460124850273, "grad_norm_var": 7.669893933472619e-05, "learning_rate": 0.00937792285087238, "loss": 2.6683, "step": 4890 }, { "crossentropy": 2.664896249771118, "epoch": 0.17731293503480278, "grad_norm": 0.03381930664181709, "grad_norm_var": 7.817980895545625e-05, "learning_rate": 0.009377642118248377, "loss": 2.6264, "step": 4891 }, { "crossentropy": 2.446284294128418, "epoch": 0.1773491879350348, "grad_norm": 0.03638937696814537, "grad_norm_var": 7.813764790206654e-05, "learning_rate": 0.009377361326497673, "loss": 2.5519, "step": 4892 }, { "crossentropy": 2.6280834674835205, "epoch": 0.17738544083526683, "grad_norm": 0.03555508702993393, "grad_norm_var": 7.762166592039631e-05, "learning_rate": 0.009377080475624062, "loss": 2.6589, "step": 4893 }, { "crossentropy": 2.6752564907073975, "epoch": 0.17742169373549885, "grad_norm": 0.03498855233192444, "grad_norm_var": 7.793169753987356e-05, "learning_rate": 0.009376799565631337, "loss": 2.6918, "step": 4894 }, { "crossentropy": 2.5168228149414062, "epoch": 0.17745794663573086, "grad_norm": 0.03615759313106537, "grad_norm_var": 7.79181996285877e-05, "learning_rate": 0.009376518596523293, "loss": 2.5664, "step": 4895 }, { "crossentropy": 2.6007602214813232, "epoch": 0.17749419953596288, "grad_norm": 0.03686162829399109, "grad_norm_var": 7.736058801329718e-05, "learning_rate": 0.009376237568303723, "loss": 2.632, "step": 4896 }, { "crossentropy": 2.7223458290100098, "epoch": 0.1775304524361949, "grad_norm": 0.04177779331803322, "grad_norm_var": 7.666965922713351e-05, "learning_rate": 0.009375956480976425, "loss": 2.7266, "step": 4897 }, { "crossentropy": 2.6853301525115967, "epoch": 0.1775667053364269, "grad_norm": 0.04173378646373749, "grad_norm_var": 7.661821212702549e-05, "learning_rate": 0.00937567533454519, "loss": 2.7097, "step": 4898 }, { "crossentropy": 2.700484275817871, "epoch": 0.17760295823665892, "grad_norm": 0.04521779343485832, "grad_norm_var": 7.731693779949815e-05, "learning_rate": 0.009375394129013823, "loss": 2.6677, "step": 4899 }, { "crossentropy": 2.680069923400879, "epoch": 0.17763921113689096, "grad_norm": 0.04158042371273041, "grad_norm_var": 7.597753734773575e-05, "learning_rate": 0.009375112864386116, "loss": 2.7324, "step": 4900 }, { "crossentropy": 2.6038520336151123, "epoch": 0.17767546403712298, "grad_norm": 0.03503375127911568, "grad_norm_var": 7.56284346864736e-05, "learning_rate": 0.009374831540665872, "loss": 2.6858, "step": 4901 }, { "crossentropy": 2.652622938156128, "epoch": 0.177711716937355, "grad_norm": 0.03420960530638695, "grad_norm_var": 7.783547492611013e-05, "learning_rate": 0.009374550157856888, "loss": 2.636, "step": 4902 }, { "crossentropy": 2.532714605331421, "epoch": 0.177747969837587, "grad_norm": 0.03931506350636482, "grad_norm_var": 7.787656588696904e-05, "learning_rate": 0.009374268715962966, "loss": 2.5287, "step": 4903 }, { "crossentropy": 2.589175224304199, "epoch": 0.17778422273781902, "grad_norm": 0.036174189299345016, "grad_norm_var": 7.169625934913311e-05, "learning_rate": 0.009373987214987906, "loss": 2.6661, "step": 4904 }, { "crossentropy": 2.756040573120117, "epoch": 0.17782047563805103, "grad_norm": 0.035937920212745667, "grad_norm_var": 1.1034303183058274e-05, "learning_rate": 0.009373705654935512, "loss": 2.7341, "step": 4905 }, { "crossentropy": 2.520254373550415, "epoch": 0.17785672853828308, "grad_norm": 0.03615230694413185, "grad_norm_var": 1.1045723836406302e-05, "learning_rate": 0.009373424035809586, "loss": 2.612, "step": 4906 }, { "crossentropy": 2.724957227706909, "epoch": 0.1778929814385151, "grad_norm": 0.03402079641819, "grad_norm_var": 1.0947860086997727e-05, "learning_rate": 0.009373142357613929, "loss": 2.6604, "step": 4907 }, { "crossentropy": 2.737851619720459, "epoch": 0.1779292343387471, "grad_norm": 0.035944774746894836, "grad_norm_var": 1.1030149114314193e-05, "learning_rate": 0.00937286062035235, "loss": 2.6728, "step": 4908 }, { "crossentropy": 2.6948721408843994, "epoch": 0.17796548723897912, "grad_norm": 0.03904605656862259, "grad_norm_var": 1.086731279915324e-05, "learning_rate": 0.009372578824028652, "loss": 2.697, "step": 4909 }, { "crossentropy": 2.63187837600708, "epoch": 0.17800174013921113, "grad_norm": 0.04216991737484932, "grad_norm_var": 1.1437336053073583e-05, "learning_rate": 0.00937229696864664, "loss": 2.5715, "step": 4910 }, { "crossentropy": 2.7281670570373535, "epoch": 0.17803799303944315, "grad_norm": 0.0388825461268425, "grad_norm_var": 1.1156330669753512e-05, "learning_rate": 0.009372015054210125, "loss": 2.6767, "step": 4911 }, { "crossentropy": 2.708716869354248, "epoch": 0.17807424593967516, "grad_norm": 0.036177679896354675, "grad_norm_var": 1.132390893207764e-05, "learning_rate": 0.009371733080722911, "loss": 2.6894, "step": 4912 }, { "crossentropy": 2.561722993850708, "epoch": 0.1781104988399072, "grad_norm": 0.041413094848394394, "grad_norm_var": 1.1164854660179283e-05, "learning_rate": 0.009371451048188807, "loss": 2.6169, "step": 4913 }, { "crossentropy": 2.783745765686035, "epoch": 0.17814675174013922, "grad_norm": 0.04411100596189499, "grad_norm_var": 1.2602280515929218e-05, "learning_rate": 0.009371168956611622, "loss": 2.7114, "step": 4914 }, { "crossentropy": 2.735945463180542, "epoch": 0.17818300464037123, "grad_norm": 0.040708526968955994, "grad_norm_var": 9.811109551098075e-06, "learning_rate": 0.009370886805995167, "loss": 2.7818, "step": 4915 }, { "crossentropy": 2.640377998352051, "epoch": 0.17821925754060325, "grad_norm": 0.044259972870349884, "grad_norm_var": 1.1474791133971907e-05, "learning_rate": 0.009370604596343254, "loss": 2.7052, "step": 4916 }, { "crossentropy": 2.774845600128174, "epoch": 0.17825551044083526, "grad_norm": 0.04488183557987213, "grad_norm_var": 1.3185357779733435e-05, "learning_rate": 0.009370322327659693, "loss": 2.7412, "step": 4917 }, { "crossentropy": 2.7785141468048096, "epoch": 0.17829176334106728, "grad_norm": 0.0448017418384552, "grad_norm_var": 1.3484534111537528e-05, "learning_rate": 0.009370039999948294, "loss": 2.7204, "step": 4918 }, { "crossentropy": 2.552216053009033, "epoch": 0.1783280162412993, "grad_norm": 0.04834846779704094, "grad_norm_var": 1.82115729872427e-05, "learning_rate": 0.009369757613212876, "loss": 2.6077, "step": 4919 }, { "crossentropy": 2.8653371334075928, "epoch": 0.17836426914153133, "grad_norm": 0.051945436745882034, "grad_norm_var": 2.5313963861371633e-05, "learning_rate": 0.009369475167457248, "loss": 2.7971, "step": 4920 }, { "crossentropy": 2.694918155670166, "epoch": 0.17840052204176335, "grad_norm": 0.04621654376387596, "grad_norm_var": 2.4739587451078072e-05, "learning_rate": 0.009369192662685228, "loss": 2.7222, "step": 4921 }, { "crossentropy": 2.8062102794647217, "epoch": 0.17843677494199536, "grad_norm": 0.04297582432627678, "grad_norm_var": 2.2495365963690755e-05, "learning_rate": 0.009368910098900632, "loss": 2.7004, "step": 4922 }, { "crossentropy": 2.7241737842559814, "epoch": 0.17847302784222738, "grad_norm": 0.03838988393545151, "grad_norm_var": 1.8898029595819838e-05, "learning_rate": 0.009368627476107274, "loss": 2.7943, "step": 4923 }, { "crossentropy": 2.8101377487182617, "epoch": 0.1785092807424594, "grad_norm": 0.03657536581158638, "grad_norm_var": 1.8370290664162547e-05, "learning_rate": 0.009368344794308972, "loss": 2.7796, "step": 4924 }, { "crossentropy": 2.6698153018951416, "epoch": 0.1785455336426914, "grad_norm": 0.03544458746910095, "grad_norm_var": 2.08666493775567e-05, "learning_rate": 0.009368062053509546, "loss": 2.6658, "step": 4925 }, { "crossentropy": 2.6982903480529785, "epoch": 0.17858178654292342, "grad_norm": 0.03970315679907799, "grad_norm_var": 2.1300068618037606e-05, "learning_rate": 0.00936777925371281, "loss": 2.8011, "step": 4926 }, { "crossentropy": 2.759833335876465, "epoch": 0.17861803944315546, "grad_norm": 0.03350745886564255, "grad_norm_var": 2.5467019337000312e-05, "learning_rate": 0.009367496394922587, "loss": 2.7723, "step": 4927 }, { "crossentropy": 2.858813524246216, "epoch": 0.17865429234338748, "grad_norm": 0.037129148840904236, "grad_norm_var": 2.480510072349658e-05, "learning_rate": 0.0093672134771427, "loss": 2.8156, "step": 4928 }, { "crossentropy": 2.824413537979126, "epoch": 0.1786905452436195, "grad_norm": 0.03377151116728783, "grad_norm_var": 2.8951577653782827e-05, "learning_rate": 0.009366930500376966, "loss": 2.796, "step": 4929 }, { "crossentropy": 2.744732618331909, "epoch": 0.1787267981438515, "grad_norm": 0.03597322106361389, "grad_norm_var": 3.0174127935429265e-05, "learning_rate": 0.00936664746462921, "loss": 2.6312, "step": 4930 }, { "crossentropy": 2.520303964614868, "epoch": 0.17876305104408352, "grad_norm": 0.0350312665104866, "grad_norm_var": 3.234453068893939e-05, "learning_rate": 0.00936636436990325, "loss": 2.59, "step": 4931 }, { "crossentropy": 2.593149423599243, "epoch": 0.17879930394431554, "grad_norm": 0.034270238131284714, "grad_norm_var": 3.365309170345031e-05, "learning_rate": 0.009366081216202917, "loss": 2.7354, "step": 4932 }, { "crossentropy": 2.5680932998657227, "epoch": 0.17883555684454758, "grad_norm": 0.037616949528455734, "grad_norm_var": 3.216033710818766e-05, "learning_rate": 0.009365798003532029, "loss": 2.6359, "step": 4933 }, { "crossentropy": 2.672030448913574, "epoch": 0.1788718097447796, "grad_norm": 0.04029122367501259, "grad_norm_var": 3.0232158980475256e-05, "learning_rate": 0.009365514731894415, "loss": 2.7253, "step": 4934 }, { "crossentropy": 2.7125093936920166, "epoch": 0.1789080626450116, "grad_norm": 0.037401843816041946, "grad_norm_var": 2.4367913814273463e-05, "learning_rate": 0.009365231401293899, "loss": 2.7187, "step": 4935 }, { "crossentropy": 2.803792715072632, "epoch": 0.17894431554524362, "grad_norm": 0.035444412380456924, "grad_norm_var": 1.1837359779051649e-05, "learning_rate": 0.009364948011734308, "loss": 2.7874, "step": 4936 }, { "crossentropy": 2.627476453781128, "epoch": 0.17898056844547564, "grad_norm": 0.0361071452498436, "grad_norm_var": 6.453972360698629e-06, "learning_rate": 0.00936466456321947, "loss": 2.7428, "step": 4937 }, { "crossentropy": 2.796990156173706, "epoch": 0.17901682134570765, "grad_norm": 0.03463194519281387, "grad_norm_var": 3.992468347235647e-06, "learning_rate": 0.009364381055753214, "loss": 2.7344, "step": 4938 }, { "crossentropy": 2.69303035736084, "epoch": 0.17905307424593966, "grad_norm": 0.03563571348786354, "grad_norm_var": 3.710337870325928e-06, "learning_rate": 0.009364097489339368, "loss": 2.6795, "step": 4939 }, { "crossentropy": 2.664759635925293, "epoch": 0.1790893271461717, "grad_norm": 0.03612679988145828, "grad_norm_var": 3.697978307026552e-06, "learning_rate": 0.009363813863981763, "loss": 2.6322, "step": 4940 }, { "crossentropy": 2.8064146041870117, "epoch": 0.17912558004640372, "grad_norm": 0.038109369575977325, "grad_norm_var": 3.898117727547483e-06, "learning_rate": 0.00936353017968423, "loss": 2.7628, "step": 4941 }, { "crossentropy": 2.6048645973205566, "epoch": 0.17916183294663574, "grad_norm": 0.039902571588754654, "grad_norm_var": 3.991169181070259e-06, "learning_rate": 0.009363246436450598, "loss": 2.6822, "step": 4942 }, { "crossentropy": 2.644225597381592, "epoch": 0.17919808584686775, "grad_norm": 0.04022899642586708, "grad_norm_var": 4.303723735699962e-06, "learning_rate": 0.009362962634284704, "loss": 2.6058, "step": 4943 }, { "crossentropy": 2.5783214569091797, "epoch": 0.17923433874709976, "grad_norm": 0.03906063735485077, "grad_norm_var": 4.639805775888348e-06, "learning_rate": 0.009362678773190379, "loss": 2.6134, "step": 4944 }, { "crossentropy": 2.7560133934020996, "epoch": 0.17927059164733178, "grad_norm": 0.03708447143435478, "grad_norm_var": 3.96582640576944e-06, "learning_rate": 0.009362394853171455, "loss": 2.7512, "step": 4945 }, { "crossentropy": 2.8003172874450684, "epoch": 0.1793068445475638, "grad_norm": 0.03498954698443413, "grad_norm_var": 4.168486414170743e-06, "learning_rate": 0.009362110874231768, "loss": 2.6282, "step": 4946 }, { "crossentropy": 2.623023271560669, "epoch": 0.17934309744779584, "grad_norm": 0.034573473036289215, "grad_norm_var": 4.3014995216583696e-06, "learning_rate": 0.009361826836375153, "loss": 2.619, "step": 4947 }, { "crossentropy": 2.6587464809417725, "epoch": 0.17937935034802785, "grad_norm": 0.035896752029657364, "grad_norm_var": 3.881958242610978e-06, "learning_rate": 0.00936154273960545, "loss": 2.66, "step": 4948 }, { "crossentropy": 2.621786117553711, "epoch": 0.17941560324825986, "grad_norm": 0.03830283135175705, "grad_norm_var": 3.961483122049264e-06, "learning_rate": 0.009361258583926491, "loss": 2.6795, "step": 4949 }, { "crossentropy": 2.6290760040283203, "epoch": 0.17945185614849188, "grad_norm": 0.0365699864923954, "grad_norm_var": 3.24940682089597e-06, "learning_rate": 0.009360974369342119, "loss": 2.6773, "step": 4950 }, { "crossentropy": 2.794203996658325, "epoch": 0.1794881090487239, "grad_norm": 0.040510546416044235, "grad_norm_var": 4.0700596008857426e-06, "learning_rate": 0.009360690095856168, "loss": 2.7399, "step": 4951 }, { "crossentropy": 2.7907681465148926, "epoch": 0.1795243619489559, "grad_norm": 0.038836244493722916, "grad_norm_var": 4.052369502024975e-06, "learning_rate": 0.00936040576347248, "loss": 2.7575, "step": 4952 }, { "crossentropy": 2.6834590435028076, "epoch": 0.17956061484918792, "grad_norm": 0.03490713983774185, "grad_norm_var": 4.330898234101004e-06, "learning_rate": 0.009360121372194895, "loss": 2.6085, "step": 4953 }, { "crossentropy": 2.6426477432250977, "epoch": 0.17959686774941996, "grad_norm": 0.03438348323106766, "grad_norm_var": 4.420177588824031e-06, "learning_rate": 0.009359836922027255, "loss": 2.7488, "step": 4954 }, { "crossentropy": 2.6966259479522705, "epoch": 0.17963312064965198, "grad_norm": 0.034282997250556946, "grad_norm_var": 4.815762764372409e-06, "learning_rate": 0.0093595524129734, "loss": 2.6611, "step": 4955 }, { "crossentropy": 2.7042407989501953, "epoch": 0.179669373549884, "grad_norm": 0.035374559462070465, "grad_norm_var": 4.949779738670085e-06, "learning_rate": 0.009359267845037175, "loss": 2.7731, "step": 4956 }, { "crossentropy": 2.4658591747283936, "epoch": 0.179705626450116, "grad_norm": 0.03763217106461525, "grad_norm_var": 4.89745763630426e-06, "learning_rate": 0.00935898321822242, "loss": 2.6095, "step": 4957 }, { "crossentropy": 2.707054853439331, "epoch": 0.17974187935034802, "grad_norm": 0.04266398400068283, "grad_norm_var": 6.430394296633863e-06, "learning_rate": 0.009358698532532985, "loss": 2.7406, "step": 4958 }, { "crossentropy": 2.641845464706421, "epoch": 0.17977813225058004, "grad_norm": 0.03682071715593338, "grad_norm_var": 5.782706847152983e-06, "learning_rate": 0.009358413787972709, "loss": 2.6484, "step": 4959 }, { "crossentropy": 2.778446674346924, "epoch": 0.17981438515081208, "grad_norm": 0.03646837919950485, "grad_norm_var": 5.488081156056144e-06, "learning_rate": 0.009358128984545442, "loss": 2.7658, "step": 4960 }, { "crossentropy": 2.6830642223358154, "epoch": 0.1798506380510441, "grad_norm": 0.03373327851295471, "grad_norm_var": 6.076765316104006e-06, "learning_rate": 0.00935784412225503, "loss": 2.6467, "step": 4961 }, { "crossentropy": 2.6569032669067383, "epoch": 0.1798868909512761, "grad_norm": 0.03537355363368988, "grad_norm_var": 6.002417504259669e-06, "learning_rate": 0.009357559201105318, "loss": 2.6342, "step": 4962 }, { "crossentropy": 2.7592926025390625, "epoch": 0.17992314385150812, "grad_norm": 0.03511328995227814, "grad_norm_var": 5.871485353404979e-06, "learning_rate": 0.009357274221100158, "loss": 2.7187, "step": 4963 }, { "crossentropy": 2.622582197189331, "epoch": 0.17995939675174014, "grad_norm": 0.034974586218595505, "grad_norm_var": 6.020861807440268e-06, "learning_rate": 0.009356989182243395, "loss": 2.6966, "step": 4964 }, { "crossentropy": 2.733234167098999, "epoch": 0.17999564965197215, "grad_norm": 0.034771278500556946, "grad_norm_var": 6.008768976746512e-06, "learning_rate": 0.009356704084538883, "loss": 2.7232, "step": 4965 }, { "crossentropy": 2.755289316177368, "epoch": 0.18003190255220416, "grad_norm": 0.036288198083639145, "grad_norm_var": 6.007383105212314e-06, "learning_rate": 0.009356418927990471, "loss": 2.7101, "step": 4966 }, { "crossentropy": 2.5655722618103027, "epoch": 0.1800681554524362, "grad_norm": 0.03599156439304352, "grad_norm_var": 4.796974882377107e-06, "learning_rate": 0.009356133712602008, "loss": 2.7112, "step": 4967 }, { "crossentropy": 2.8214781284332275, "epoch": 0.18010440835266822, "grad_norm": 0.03487075865268707, "grad_norm_var": 4.333563478113994e-06, "learning_rate": 0.00935584843837735, "loss": 2.7782, "step": 4968 }, { "crossentropy": 2.535761833190918, "epoch": 0.18014066125290024, "grad_norm": 0.033387359231710434, "grad_norm_var": 4.6696130127005245e-06, "learning_rate": 0.009355563105320349, "loss": 2.6316, "step": 4969 }, { "crossentropy": 2.8158316612243652, "epoch": 0.18017691415313225, "grad_norm": 0.036154307425022125, "grad_norm_var": 4.541032798136002e-06, "learning_rate": 0.009355277713434858, "loss": 2.7916, "step": 4970 }, { "crossentropy": 2.7769742012023926, "epoch": 0.18021316705336426, "grad_norm": 0.035173095762729645, "grad_norm_var": 4.4023458970511575e-06, "learning_rate": 0.009354992262724734, "loss": 2.7368, "step": 4971 }, { "crossentropy": 2.7438721656799316, "epoch": 0.18024941995359628, "grad_norm": 0.0345146544277668, "grad_norm_var": 4.511606990103591e-06, "learning_rate": 0.009354706753193828, "loss": 2.669, "step": 4972 }, { "crossentropy": 2.689872980117798, "epoch": 0.1802856728538283, "grad_norm": 0.037078339606523514, "grad_norm_var": 4.400703048832756e-06, "learning_rate": 0.009354421184846, "loss": 2.7016, "step": 4973 }, { "crossentropy": 2.8088369369506836, "epoch": 0.18032192575406034, "grad_norm": 0.03637661412358284, "grad_norm_var": 1.1474541316428925e-06, "learning_rate": 0.009354135557685104, "loss": 2.6716, "step": 4974 }, { "crossentropy": 2.6746485233306885, "epoch": 0.18035817865429235, "grad_norm": 0.041956011205911636, "grad_norm_var": 3.738903458077164e-06, "learning_rate": 0.009353849871715001, "loss": 2.6837, "step": 4975 }, { "crossentropy": 2.526909112930298, "epoch": 0.18039443155452436, "grad_norm": 0.033448126167058945, "grad_norm_var": 4.025402130806525e-06, "learning_rate": 0.009353564126939549, "loss": 2.4741, "step": 4976 }, { "crossentropy": 2.679253578186035, "epoch": 0.18043068445475638, "grad_norm": 0.03330595791339874, "grad_norm_var": 4.141766744043355e-06, "learning_rate": 0.009353278323362605, "loss": 2.5974, "step": 4977 }, { "crossentropy": 2.7719757556915283, "epoch": 0.1804669373549884, "grad_norm": 0.033644091337919235, "grad_norm_var": 4.369072919102081e-06, "learning_rate": 0.009352992460988033, "loss": 2.6773, "step": 4978 }, { "crossentropy": 2.6276276111602783, "epoch": 0.1805031902552204, "grad_norm": 0.03543520346283913, "grad_norm_var": 4.361504618627387e-06, "learning_rate": 0.009352706539819691, "loss": 2.633, "step": 4979 }, { "crossentropy": 2.7651450634002686, "epoch": 0.18053944315545242, "grad_norm": 0.040350303053855896, "grad_norm_var": 5.819269525526339e-06, "learning_rate": 0.009352420559861442, "loss": 2.742, "step": 4980 }, { "crossentropy": 2.6676948070526123, "epoch": 0.18057569605568446, "grad_norm": 0.03578849881887436, "grad_norm_var": 5.744874651727496e-06, "learning_rate": 0.009352134521117147, "loss": 2.651, "step": 4981 }, { "crossentropy": 2.6653144359588623, "epoch": 0.18061194895591648, "grad_norm": 0.03498771786689758, "grad_norm_var": 5.776362701873095e-06, "learning_rate": 0.009351848423590672, "loss": 2.6924, "step": 4982 }, { "crossentropy": 2.835010290145874, "epoch": 0.1806482018561485, "grad_norm": 0.03365887701511383, "grad_norm_var": 6.050312117558687e-06, "learning_rate": 0.00935156226728588, "loss": 2.7418, "step": 4983 }, { "crossentropy": 2.7881314754486084, "epoch": 0.1806844547563805, "grad_norm": 0.03481099009513855, "grad_norm_var": 6.0566107482062925e-06, "learning_rate": 0.009351276052206635, "loss": 2.6961, "step": 4984 }, { "crossentropy": 2.75020432472229, "epoch": 0.18072070765661252, "grad_norm": 0.03423449397087097, "grad_norm_var": 5.848223443570034e-06, "learning_rate": 0.009350989778356805, "loss": 2.7518, "step": 4985 }, { "crossentropy": 2.7069013118743896, "epoch": 0.18075696055684454, "grad_norm": 0.034862857311964035, "grad_norm_var": 5.871192306792367e-06, "learning_rate": 0.009350703445740254, "loss": 2.6942, "step": 4986 }, { "crossentropy": 2.681025743484497, "epoch": 0.18079321345707658, "grad_norm": 0.03690734878182411, "grad_norm_var": 5.960081412139233e-06, "learning_rate": 0.00935041705436085, "loss": 2.7426, "step": 4987 }, { "crossentropy": 2.5568277835845947, "epoch": 0.1808294663573086, "grad_norm": 0.04267372936010361, "grad_norm_var": 8.820343560563225e-06, "learning_rate": 0.009350130604222462, "loss": 2.677, "step": 4988 }, { "crossentropy": 2.656586170196533, "epoch": 0.1808657192575406, "grad_norm": 0.03879036381840706, "grad_norm_var": 9.199477816586542e-06, "learning_rate": 0.00934984409532896, "loss": 2.6423, "step": 4989 }, { "crossentropy": 2.8544700145721436, "epoch": 0.18090197215777262, "grad_norm": 0.036027614027261734, "grad_norm_var": 9.204779299783427e-06, "learning_rate": 0.00934955752768421, "loss": 2.746, "step": 4990 }, { "crossentropy": 2.6991045475006104, "epoch": 0.18093822505800464, "grad_norm": 0.03561316058039665, "grad_norm_var": 6.940243384268019e-06, "learning_rate": 0.009349270901292086, "loss": 2.7308, "step": 4991 }, { "crossentropy": 2.8261985778808594, "epoch": 0.18097447795823665, "grad_norm": 0.0355924516916275, "grad_norm_var": 6.524121407774398e-06, "learning_rate": 0.00934898421615646, "loss": 2.7955, "step": 4992 }, { "crossentropy": 2.721879482269287, "epoch": 0.18101073085846867, "grad_norm": 0.03454168885946274, "grad_norm_var": 6.168639197323631e-06, "learning_rate": 0.0093486974722812, "loss": 2.6335, "step": 4993 }, { "crossentropy": 2.5911478996276855, "epoch": 0.1810469837587007, "grad_norm": 0.03609498590230942, "grad_norm_var": 5.734989771081361e-06, "learning_rate": 0.009348410669670182, "loss": 2.5367, "step": 4994 }, { "crossentropy": 2.8196465969085693, "epoch": 0.18108323665893272, "grad_norm": 0.03711055591702461, "grad_norm_var": 5.723235975666624e-06, "learning_rate": 0.009348123808327278, "loss": 2.7353, "step": 4995 }, { "crossentropy": 2.6051735877990723, "epoch": 0.18111948955916474, "grad_norm": 0.04524679854512215, "grad_norm_var": 9.815193514673727e-06, "learning_rate": 0.009347836888256366, "loss": 2.7236, "step": 4996 }, { "crossentropy": 2.729738235473633, "epoch": 0.18115574245939675, "grad_norm": 0.04805944487452507, "grad_norm_var": 1.7761238902168815e-05, "learning_rate": 0.009347549909461317, "loss": 2.7733, "step": 4997 }, { "crossentropy": 2.547048807144165, "epoch": 0.18119199535962877, "grad_norm": 0.04210257902741432, "grad_norm_var": 1.8588452145572253e-05, "learning_rate": 0.009347262871946009, "loss": 2.6502, "step": 4998 }, { "crossentropy": 2.6089916229248047, "epoch": 0.18122824825986078, "grad_norm": 0.03511684760451317, "grad_norm_var": 1.7897724872508367e-05, "learning_rate": 0.00934697577571432, "loss": 2.6957, "step": 4999 }, { "crossentropy": 2.6229565143585205, "epoch": 0.1812645011600928, "grad_norm": 0.035113830119371414, "grad_norm_var": 1.7775229190188033e-05, "learning_rate": 0.009346688620770123, "loss": 2.638, "step": 5000 }, { "crossentropy": 2.5804293155670166, "epoch": 0.18130075406032484, "grad_norm": 0.035772137343883514, "grad_norm_var": 1.7149862947264615e-05, "learning_rate": 0.009346401407117302, "loss": 2.6508, "step": 5001 }, { "crossentropy": 2.607942819595337, "epoch": 0.18133700696055685, "grad_norm": 0.036942072212696075, "grad_norm_var": 1.652217283236145e-05, "learning_rate": 0.009346114134759735, "loss": 2.6803, "step": 5002 }, { "crossentropy": 2.6427512168884277, "epoch": 0.18137325986078887, "grad_norm": 0.038294486701488495, "grad_norm_var": 1.6397509663712505e-05, "learning_rate": 0.0093458268037013, "loss": 2.6779, "step": 5003 }, { "crossentropy": 2.5640065670013428, "epoch": 0.18140951276102088, "grad_norm": 0.039756231009960175, "grad_norm_var": 1.5235234591049786e-05, "learning_rate": 0.00934553941394588, "loss": 2.6345, "step": 5004 }, { "crossentropy": 2.7393670082092285, "epoch": 0.1814457656612529, "grad_norm": 0.03567010536789894, "grad_norm_var": 1.5571477945546842e-05, "learning_rate": 0.009345251965497355, "loss": 2.6704, "step": 5005 }, { "crossentropy": 2.6731441020965576, "epoch": 0.1814820185614849, "grad_norm": 0.034389983862638474, "grad_norm_var": 1.6156867838716613e-05, "learning_rate": 0.009344964458359608, "loss": 2.6546, "step": 5006 }, { "crossentropy": 2.592313528060913, "epoch": 0.18151827146171692, "grad_norm": 0.0354493074119091, "grad_norm_var": 1.620716487515054e-05, "learning_rate": 0.009344676892536522, "loss": 2.6875, "step": 5007 }, { "crossentropy": 2.7523951530456543, "epoch": 0.18155452436194897, "grad_norm": 0.03527601808309555, "grad_norm_var": 1.630775788376091e-05, "learning_rate": 0.00934438926803198, "loss": 2.6968, "step": 5008 }, { "crossentropy": 2.6530075073242188, "epoch": 0.18159077726218098, "grad_norm": 0.03881838917732239, "grad_norm_var": 1.558803203331547e-05, "learning_rate": 0.00934410158484987, "loss": 2.6205, "step": 5009 }, { "crossentropy": 2.7385077476501465, "epoch": 0.181627030162413, "grad_norm": 0.038955945521593094, "grad_norm_var": 1.534397306706681e-05, "learning_rate": 0.009343813842994075, "loss": 2.7326, "step": 5010 }, { "crossentropy": 2.730492115020752, "epoch": 0.181663283062645, "grad_norm": 0.03381621092557907, "grad_norm_var": 1.6524815258779115e-05, "learning_rate": 0.009343526042468481, "loss": 2.6234, "step": 5011 }, { "crossentropy": 2.7445430755615234, "epoch": 0.18169953596287702, "grad_norm": 0.04492328315973282, "grad_norm_var": 1.6220867098813013e-05, "learning_rate": 0.009343238183276979, "loss": 2.6614, "step": 5012 }, { "crossentropy": 2.73443865776062, "epoch": 0.18173578886310904, "grad_norm": 0.037087392061948776, "grad_norm_var": 9.07039404214674e-06, "learning_rate": 0.009342950265423453, "loss": 2.6724, "step": 5013 }, { "crossentropy": 2.6139750480651855, "epoch": 0.18177204176334108, "grad_norm": 0.03495188429951668, "grad_norm_var": 7.728075496478646e-06, "learning_rate": 0.009342662288911794, "loss": 2.6262, "step": 5014 }, { "crossentropy": 2.710395574569702, "epoch": 0.1818082946635731, "grad_norm": 0.04723108559846878, "grad_norm_var": 1.402669402495767e-05, "learning_rate": 0.00934237425374589, "loss": 2.7473, "step": 5015 }, { "crossentropy": 2.675071954727173, "epoch": 0.1818445475638051, "grad_norm": 0.04380647465586662, "grad_norm_var": 1.5806350229581543e-05, "learning_rate": 0.00934208615992963, "loss": 2.6633, "step": 5016 }, { "crossentropy": 2.7286670207977295, "epoch": 0.18188080046403712, "grad_norm": 0.042660389095544815, "grad_norm_var": 1.6545406731921277e-05, "learning_rate": 0.00934179800746691, "loss": 2.7366, "step": 5017 }, { "crossentropy": 2.783151149749756, "epoch": 0.18191705336426914, "grad_norm": 0.04273778945207596, "grad_norm_var": 1.734288658380966e-05, "learning_rate": 0.009341509796361615, "loss": 2.7743, "step": 5018 }, { "crossentropy": 2.6873793601989746, "epoch": 0.18195330626450115, "grad_norm": 0.039405614137649536, "grad_norm_var": 1.7317147933522867e-05, "learning_rate": 0.009341221526617646, "loss": 2.6603, "step": 5019 }, { "crossentropy": 2.6404359340667725, "epoch": 0.18198955916473317, "grad_norm": 0.03667198866605759, "grad_norm_var": 1.7624755492429144e-05, "learning_rate": 0.009340933198238888, "loss": 2.7303, "step": 5020 }, { "crossentropy": 2.4753456115722656, "epoch": 0.1820258120649652, "grad_norm": 0.03542378917336464, "grad_norm_var": 1.7733499057304147e-05, "learning_rate": 0.009340644811229242, "loss": 2.6031, "step": 5021 }, { "crossentropy": 2.784242630004883, "epoch": 0.18206206496519722, "grad_norm": 0.03696309030056, "grad_norm_var": 1.6617038783756954e-05, "learning_rate": 0.009340356365592602, "loss": 2.7843, "step": 5022 }, { "crossentropy": 2.691099166870117, "epoch": 0.18209831786542924, "grad_norm": 0.03784855082631111, "grad_norm_var": 1.5837376494686783e-05, "learning_rate": 0.00934006786133286, "loss": 2.7155, "step": 5023 }, { "crossentropy": 2.6825239658355713, "epoch": 0.18213457076566125, "grad_norm": 0.03549166023731232, "grad_norm_var": 1.5728577319309006e-05, "learning_rate": 0.009339779298453917, "loss": 2.6789, "step": 5024 }, { "crossentropy": 2.6690926551818848, "epoch": 0.18217082366589327, "grad_norm": 0.034727972000837326, "grad_norm_var": 1.696856817304711e-05, "learning_rate": 0.009339490676959668, "loss": 2.6833, "step": 5025 }, { "crossentropy": 2.6274735927581787, "epoch": 0.18220707656612528, "grad_norm": 0.03640969097614288, "grad_norm_var": 1.7361219739066233e-05, "learning_rate": 0.009339201996854013, "loss": 2.6319, "step": 5026 }, { "crossentropy": 2.733280658721924, "epoch": 0.1822433294663573, "grad_norm": 0.043393317610025406, "grad_norm_var": 1.6781071288074257e-05, "learning_rate": 0.009338913258140848, "loss": 2.6944, "step": 5027 }, { "crossentropy": 2.8216753005981445, "epoch": 0.18227958236658934, "grad_norm": 0.044354114681482315, "grad_norm_var": 1.6379002162566526e-05, "learning_rate": 0.009338624460824075, "loss": 2.8074, "step": 5028 }, { "crossentropy": 2.7498576641082764, "epoch": 0.18231583526682135, "grad_norm": 0.04354611784219742, "grad_norm_var": 1.7061146544664063e-05, "learning_rate": 0.009338335604907596, "loss": 2.6688, "step": 5029 }, { "crossentropy": 2.685824155807495, "epoch": 0.18235208816705337, "grad_norm": 0.04266107827425003, "grad_norm_var": 1.586786443947361e-05, "learning_rate": 0.00933804669039531, "loss": 2.7067, "step": 5030 }, { "crossentropy": 2.5794472694396973, "epoch": 0.18238834106728538, "grad_norm": 0.046123746782541275, "grad_norm_var": 1.4907620775250763e-05, "learning_rate": 0.009337757717291121, "loss": 2.6368, "step": 5031 }, { "crossentropy": 2.7215678691864014, "epoch": 0.1824245939675174, "grad_norm": 0.04567611217498779, "grad_norm_var": 1.604031715824594e-05, "learning_rate": 0.009337468685598932, "loss": 2.702, "step": 5032 }, { "crossentropy": 2.78171443939209, "epoch": 0.1824608468677494, "grad_norm": 0.041629064828157425, "grad_norm_var": 1.577615830430917e-05, "learning_rate": 0.009337179595322645, "loss": 2.7829, "step": 5033 }, { "crossentropy": 2.768699884414673, "epoch": 0.18249709976798145, "grad_norm": 0.03567487373948097, "grad_norm_var": 1.6496042137566686e-05, "learning_rate": 0.009336890446466165, "loss": 2.772, "step": 5034 }, { "crossentropy": 2.604790687561035, "epoch": 0.18253335266821347, "grad_norm": 0.03338521346449852, "grad_norm_var": 1.9037853350037325e-05, "learning_rate": 0.009336601239033399, "loss": 2.6276, "step": 5035 }, { "crossentropy": 2.7188544273376465, "epoch": 0.18256960556844548, "grad_norm": 0.035165052860975266, "grad_norm_var": 1.972263739731343e-05, "learning_rate": 0.009336311973028252, "loss": 2.7117, "step": 5036 }, { "crossentropy": 2.536421537399292, "epoch": 0.1826058584686775, "grad_norm": 0.035014308989048004, "grad_norm_var": 1.9943633581104697e-05, "learning_rate": 0.009336022648454631, "loss": 2.6574, "step": 5037 }, { "crossentropy": 2.691945791244507, "epoch": 0.1826421113689095, "grad_norm": 0.03428126126527786, "grad_norm_var": 2.1212322834524416e-05, "learning_rate": 0.009335733265316445, "loss": 2.6538, "step": 5038 }, { "crossentropy": 2.6949715614318848, "epoch": 0.18267836426914152, "grad_norm": 0.037195246666669846, "grad_norm_var": 2.1346822397472152e-05, "learning_rate": 0.0093354438236176, "loss": 2.6223, "step": 5039 }, { "crossentropy": 2.590003490447998, "epoch": 0.18271461716937354, "grad_norm": 0.03845306485891342, "grad_norm_var": 2.0491674022295125e-05, "learning_rate": 0.009335154323362007, "loss": 2.5592, "step": 5040 }, { "crossentropy": 2.6276419162750244, "epoch": 0.18275087006960558, "grad_norm": 0.03318486735224724, "grad_norm_var": 2.156690895394001e-05, "learning_rate": 0.009334864764553577, "loss": 2.5946, "step": 5041 }, { "crossentropy": 2.7088623046875, "epoch": 0.1827871229698376, "grad_norm": 0.035821158438920975, "grad_norm_var": 2.1802351742580828e-05, "learning_rate": 0.00933457514719622, "loss": 2.7369, "step": 5042 }, { "crossentropy": 2.6240527629852295, "epoch": 0.1828233758700696, "grad_norm": 0.037576377391815186, "grad_norm_var": 2.0585281567130943e-05, "learning_rate": 0.009334285471293847, "loss": 2.714, "step": 5043 }, { "crossentropy": 2.721837282180786, "epoch": 0.18285962877030162, "grad_norm": 0.03545369207859039, "grad_norm_var": 1.8866683704697047e-05, "learning_rate": 0.009333995736850372, "loss": 2.6699, "step": 5044 }, { "crossentropy": 2.466622829437256, "epoch": 0.18289588167053364, "grad_norm": 0.03667951747775078, "grad_norm_var": 1.6898421592798954e-05, "learning_rate": 0.009333705943869708, "loss": 2.531, "step": 5045 }, { "crossentropy": 2.5777742862701416, "epoch": 0.18293213457076565, "grad_norm": 0.0345270074903965, "grad_norm_var": 1.570562214378915e-05, "learning_rate": 0.009333416092355766, "loss": 2.6261, "step": 5046 }, { "crossentropy": 2.7132625579833984, "epoch": 0.18296838747099767, "grad_norm": 0.03468063846230507, "grad_norm_var": 1.033536470929422e-05, "learning_rate": 0.009333126182312466, "loss": 2.6941, "step": 5047 }, { "crossentropy": 2.676671028137207, "epoch": 0.1830046403712297, "grad_norm": 0.03506821393966675, "grad_norm_var": 4.424900126368675e-06, "learning_rate": 0.009332836213743721, "loss": 2.7372, "step": 5048 }, { "crossentropy": 2.8274238109588623, "epoch": 0.18304089327146172, "grad_norm": 0.03299944847822189, "grad_norm_var": 2.4434427465672495e-06, "learning_rate": 0.009332546186653445, "loss": 2.7143, "step": 5049 }, { "crossentropy": 2.7920947074890137, "epoch": 0.18307714617169374, "grad_norm": 0.034720875322818756, "grad_norm_var": 2.4555024889738598e-06, "learning_rate": 0.009332256101045561, "loss": 2.7476, "step": 5050 }, { "crossentropy": 2.5479884147644043, "epoch": 0.18311339907192575, "grad_norm": 0.04192953184247017, "grad_norm_var": 4.879230743296686e-06, "learning_rate": 0.009331965956923982, "loss": 2.6592, "step": 5051 }, { "crossentropy": 2.513923168182373, "epoch": 0.18314965197215777, "grad_norm": 0.04234525188803673, "grad_norm_var": 7.496537479924579e-06, "learning_rate": 0.00933167575429263, "loss": 2.5766, "step": 5052 }, { "crossentropy": 2.7287018299102783, "epoch": 0.18318590487238978, "grad_norm": 0.0387885645031929, "grad_norm_var": 7.767195577841843e-06, "learning_rate": 0.009331385493155424, "loss": 2.7863, "step": 5053 }, { "crossentropy": 2.730299711227417, "epoch": 0.1832221577726218, "grad_norm": 0.03581748157739639, "grad_norm_var": 7.464011171259601e-06, "learning_rate": 0.009331095173516282, "loss": 2.7334, "step": 5054 }, { "crossentropy": 2.758692979812622, "epoch": 0.18325841067285384, "grad_norm": 0.032916974276304245, "grad_norm_var": 8.255635385819259e-06, "learning_rate": 0.009330804795379127, "loss": 2.6938, "step": 5055 }, { "crossentropy": 2.763784885406494, "epoch": 0.18329466357308585, "grad_norm": 0.03360635042190552, "grad_norm_var": 8.338998305211972e-06, "learning_rate": 0.009330514358747882, "loss": 2.6768, "step": 5056 }, { "crossentropy": 2.642115592956543, "epoch": 0.18333091647331787, "grad_norm": 0.034368015825748444, "grad_norm_var": 7.9812491121225e-06, "learning_rate": 0.009330223863626468, "loss": 2.728, "step": 5057 }, { "crossentropy": 2.6222574710845947, "epoch": 0.18336716937354988, "grad_norm": 0.04581227898597717, "grad_norm_var": 1.3873748887132988e-05, "learning_rate": 0.00932993331001881, "loss": 2.6137, "step": 5058 }, { "crossentropy": 2.680352210998535, "epoch": 0.1834034222737819, "grad_norm": 0.045618608593940735, "grad_norm_var": 1.884978242213097e-05, "learning_rate": 0.009329642697928832, "loss": 2.7418, "step": 5059 }, { "crossentropy": 2.4320807456970215, "epoch": 0.1834396751740139, "grad_norm": 0.04946443811058998, "grad_norm_var": 2.7840853794462198e-05, "learning_rate": 0.009329352027360458, "loss": 2.5509, "step": 5060 }, { "crossentropy": 2.7269692420959473, "epoch": 0.18347592807424595, "grad_norm": 0.03945513442158699, "grad_norm_var": 2.780260138728705e-05, "learning_rate": 0.009329061298317617, "loss": 2.7002, "step": 5061 }, { "crossentropy": 2.704491376876831, "epoch": 0.18351218097447797, "grad_norm": 0.034164853394031525, "grad_norm_var": 2.7990930118607786e-05, "learning_rate": 0.00932877051080423, "loss": 2.7766, "step": 5062 }, { "crossentropy": 2.688641309738159, "epoch": 0.18354843387470998, "grad_norm": 0.03833968564867973, "grad_norm_var": 2.7093744227399406e-05, "learning_rate": 0.00932847966482423, "loss": 2.7667, "step": 5063 }, { "crossentropy": 2.735440969467163, "epoch": 0.183584686774942, "grad_norm": 0.04899817332625389, "grad_norm_var": 3.291535415477816e-05, "learning_rate": 0.009328188760381544, "loss": 2.5858, "step": 5064 }, { "crossentropy": 2.7671823501586914, "epoch": 0.183620939675174, "grad_norm": 0.03721998259425163, "grad_norm_var": 3.0463910194370818e-05, "learning_rate": 0.009327897797480101, "loss": 2.698, "step": 5065 }, { "crossentropy": 2.6663455963134766, "epoch": 0.18365719257540603, "grad_norm": 0.034181367605924606, "grad_norm_var": 3.083292674436738e-05, "learning_rate": 0.009327606776123829, "loss": 2.6437, "step": 5066 }, { "crossentropy": 2.71325945854187, "epoch": 0.18369344547563804, "grad_norm": 0.03653954342007637, "grad_norm_var": 3.094877092252307e-05, "learning_rate": 0.00932731569631666, "loss": 2.7028, "step": 5067 }, { "crossentropy": 2.7538890838623047, "epoch": 0.18372969837587008, "grad_norm": 0.03549332544207573, "grad_norm_var": 3.1034541209680405e-05, "learning_rate": 0.009327024558062526, "loss": 2.7127, "step": 5068 }, { "crossentropy": 2.4720559120178223, "epoch": 0.1837659512761021, "grad_norm": 0.04168007895350456, "grad_norm_var": 3.1553052696011116e-05, "learning_rate": 0.009326733361365358, "loss": 2.6021, "step": 5069 }, { "crossentropy": 2.801816701889038, "epoch": 0.1838022041763341, "grad_norm": 0.04714713618159294, "grad_norm_var": 3.479860429550409e-05, "learning_rate": 0.009326442106229092, "loss": 2.7306, "step": 5070 }, { "crossentropy": 2.6591475009918213, "epoch": 0.18383845707656613, "grad_norm": 0.04415171965956688, "grad_norm_var": 3.254474880825849e-05, "learning_rate": 0.009326150792657658, "loss": 2.7269, "step": 5071 }, { "crossentropy": 2.6679821014404297, "epoch": 0.18387470997679814, "grad_norm": 0.0379491001367569, "grad_norm_var": 2.9795481967280896e-05, "learning_rate": 0.009325859420654992, "loss": 2.6668, "step": 5072 }, { "crossentropy": 2.570753335952759, "epoch": 0.18391096287703015, "grad_norm": 0.040630731731653214, "grad_norm_var": 2.6991621271717458e-05, "learning_rate": 0.009325567990225032, "loss": 2.6082, "step": 5073 }, { "crossentropy": 2.619713306427002, "epoch": 0.18394721577726217, "grad_norm": 0.04643826186656952, "grad_norm_var": 2.7413352079126938e-05, "learning_rate": 0.009325276501371712, "loss": 2.5901, "step": 5074 }, { "crossentropy": 2.634450674057007, "epoch": 0.1839834686774942, "grad_norm": 0.04180871322751045, "grad_norm_var": 2.6021108963104832e-05, "learning_rate": 0.009324984954098966, "loss": 2.6874, "step": 5075 }, { "crossentropy": 2.7132835388183594, "epoch": 0.18401972157772623, "grad_norm": 0.03417516127228737, "grad_norm_var": 2.3078026659553198e-05, "learning_rate": 0.009324693348410738, "loss": 2.704, "step": 5076 }, { "crossentropy": 2.6156740188598633, "epoch": 0.18405597447795824, "grad_norm": 0.034579798579216, "grad_norm_var": 2.485166721483666e-05, "learning_rate": 0.009324401684310962, "loss": 2.7161, "step": 5077 }, { "crossentropy": 2.5902814865112305, "epoch": 0.18409222737819025, "grad_norm": 0.036228716373443604, "grad_norm_var": 2.3623995410493314e-05, "learning_rate": 0.009324109961803577, "loss": 2.6445, "step": 5078 }, { "crossentropy": 2.6375746726989746, "epoch": 0.18412848027842227, "grad_norm": 0.034576889127492905, "grad_norm_var": 2.5202723711784115e-05, "learning_rate": 0.009323818180892528, "loss": 2.6689, "step": 5079 }, { "crossentropy": 2.6546120643615723, "epoch": 0.18416473317865428, "grad_norm": 0.03453405946493149, "grad_norm_var": 1.9936433900307784e-05, "learning_rate": 0.009323526341581751, "loss": 2.7578, "step": 5080 }, { "crossentropy": 2.782325506210327, "epoch": 0.1842009860788863, "grad_norm": 0.038660913705825806, "grad_norm_var": 1.9804253917760084e-05, "learning_rate": 0.00932323444387519, "loss": 2.7099, "step": 5081 }, { "crossentropy": 2.604735851287842, "epoch": 0.18423723897911834, "grad_norm": 0.046683140099048615, "grad_norm_var": 2.2084750020504898e-05, "learning_rate": 0.009322942487776787, "loss": 2.5997, "step": 5082 }, { "crossentropy": 2.698711395263672, "epoch": 0.18427349187935035, "grad_norm": 0.0467132069170475, "grad_norm_var": 2.4599160924834198e-05, "learning_rate": 0.009322650473290485, "loss": 2.6426, "step": 5083 }, { "crossentropy": 2.764735460281372, "epoch": 0.18430974477958237, "grad_norm": 0.04061794653534889, "grad_norm_var": 2.3099223300768132e-05, "learning_rate": 0.00932235840042023, "loss": 2.7208, "step": 5084 }, { "crossentropy": 2.635019302368164, "epoch": 0.18434599767981438, "grad_norm": 0.03964397311210632, "grad_norm_var": 2.3013793464350906e-05, "learning_rate": 0.009322066269169963, "loss": 2.682, "step": 5085 }, { "crossentropy": 2.603767156600952, "epoch": 0.1843822505800464, "grad_norm": 0.03971283137798309, "grad_norm_var": 1.9664798685053014e-05, "learning_rate": 0.009321774079543635, "loss": 2.6232, "step": 5086 }, { "crossentropy": 2.657498359680176, "epoch": 0.1844185034802784, "grad_norm": 0.038976699113845825, "grad_norm_var": 1.834906276644103e-05, "learning_rate": 0.009321481831545186, "loss": 2.7137, "step": 5087 }, { "crossentropy": 2.586146354675293, "epoch": 0.18445475638051045, "grad_norm": 0.03612429276108742, "grad_norm_var": 1.893346638821887e-05, "learning_rate": 0.009321189525178571, "loss": 2.6752, "step": 5088 }, { "crossentropy": 2.649264335632324, "epoch": 0.18449100928074247, "grad_norm": 0.03445732221007347, "grad_norm_var": 2.0287202442210938e-05, "learning_rate": 0.00932089716044773, "loss": 2.6472, "step": 5089 }, { "crossentropy": 2.549297332763672, "epoch": 0.18452726218097448, "grad_norm": 0.03705740347504616, "grad_norm_var": 1.6478274827765502e-05, "learning_rate": 0.009320604737356616, "loss": 2.6707, "step": 5090 }, { "crossentropy": 2.661580801010132, "epoch": 0.1845635150812065, "grad_norm": 0.03495585173368454, "grad_norm_var": 1.6307416824585564e-05, "learning_rate": 0.00932031225590918, "loss": 2.6913, "step": 5091 }, { "crossentropy": 2.6852190494537354, "epoch": 0.1845997679814385, "grad_norm": 0.03824483975768089, "grad_norm_var": 1.5277346035827365e-05, "learning_rate": 0.009320019716109372, "loss": 2.7693, "step": 5092 }, { "crossentropy": 2.6185848712921143, "epoch": 0.18463602088167053, "grad_norm": 0.038143590092659, "grad_norm_var": 1.4334049899359693e-05, "learning_rate": 0.009319727117961139, "loss": 2.6659, "step": 5093 }, { "crossentropy": 2.7185776233673096, "epoch": 0.18467227378190254, "grad_norm": 0.040107522159814835, "grad_norm_var": 1.4121324214725709e-05, "learning_rate": 0.009319434461468436, "loss": 2.7319, "step": 5094 }, { "crossentropy": 2.612968683242798, "epoch": 0.18470852668213458, "grad_norm": 0.036465924233198166, "grad_norm_var": 1.3305694073979338e-05, "learning_rate": 0.009319141746635218, "loss": 2.6478, "step": 5095 }, { "crossentropy": 2.572922706604004, "epoch": 0.1847447795823666, "grad_norm": 0.03439503163099289, "grad_norm_var": 1.3386327059490092e-05, "learning_rate": 0.009318848973465434, "loss": 2.6736, "step": 5096 }, { "crossentropy": 2.6593706607818604, "epoch": 0.1847810324825986, "grad_norm": 0.034484587609767914, "grad_norm_var": 1.455946803465442e-05, "learning_rate": 0.009318556141963042, "loss": 2.6622, "step": 5097 }, { "crossentropy": 2.9562408924102783, "epoch": 0.18481728538283063, "grad_norm": 0.03921271860599518, "grad_norm_var": 9.945367331183156e-06, "learning_rate": 0.009318263252131994, "loss": 2.8199, "step": 5098 }, { "crossentropy": 2.683234453201294, "epoch": 0.18485353828306264, "grad_norm": 0.03827784210443497, "grad_norm_var": 4.685050408198034e-06, "learning_rate": 0.009317970303976249, "loss": 2.5825, "step": 5099 }, { "crossentropy": 2.635481595993042, "epoch": 0.18488979118329466, "grad_norm": 0.03428701311349869, "grad_norm_var": 4.6045015524980954e-06, "learning_rate": 0.00931767729749976, "loss": 2.635, "step": 5100 }, { "crossentropy": 2.7424395084381104, "epoch": 0.18492604408352667, "grad_norm": 0.036543115973472595, "grad_norm_var": 4.178141495571562e-06, "learning_rate": 0.00931738423270649, "loss": 2.6891, "step": 5101 }, { "crossentropy": 2.5848147869110107, "epoch": 0.1849622969837587, "grad_norm": 0.041296567767858505, "grad_norm_var": 4.915063791202136e-06, "learning_rate": 0.009317091109600393, "loss": 2.7388, "step": 5102 }, { "crossentropy": 2.5072038173675537, "epoch": 0.18499854988399073, "grad_norm": 0.03876989707350731, "grad_norm_var": 4.865007618984341e-06, "learning_rate": 0.009316797928185428, "loss": 2.5344, "step": 5103 }, { "crossentropy": 2.7389297485351562, "epoch": 0.18503480278422274, "grad_norm": 0.038369692862033844, "grad_norm_var": 4.902536948705495e-06, "learning_rate": 0.009316504688465557, "loss": 2.7018, "step": 5104 }, { "crossentropy": 2.7451682090759277, "epoch": 0.18507105568445475, "grad_norm": 0.03608464449644089, "grad_norm_var": 4.474729543340235e-06, "learning_rate": 0.00931621139044474, "loss": 2.6837, "step": 5105 }, { "crossentropy": 2.714357614517212, "epoch": 0.18510730858468677, "grad_norm": 0.035975851118564606, "grad_norm_var": 4.5818882164848244e-06, "learning_rate": 0.009315918034126939, "loss": 2.7223, "step": 5106 }, { "crossentropy": 2.5748960971832275, "epoch": 0.18514356148491878, "grad_norm": 0.03493306785821915, "grad_norm_var": 4.588816781937212e-06, "learning_rate": 0.009315624619516112, "loss": 2.5914, "step": 5107 }, { "crossentropy": 2.6739745140075684, "epoch": 0.1851798143851508, "grad_norm": 0.03375138342380524, "grad_norm_var": 5.239446523665435e-06, "learning_rate": 0.009315331146616227, "loss": 2.6915, "step": 5108 }, { "crossentropy": 2.684943914413452, "epoch": 0.18521606728538284, "grad_norm": 0.03403056040406227, "grad_norm_var": 5.638709671139328e-06, "learning_rate": 0.009315037615431246, "loss": 2.6868, "step": 5109 }, { "crossentropy": 2.8299179077148438, "epoch": 0.18525232018561485, "grad_norm": 0.03716137260198593, "grad_norm_var": 4.837386147545549e-06, "learning_rate": 0.009314744025965134, "loss": 2.7614, "step": 5110 }, { "crossentropy": 2.6092803478240967, "epoch": 0.18528857308584687, "grad_norm": 0.048468392342329025, "grad_norm_var": 1.3782628355896166e-05, "learning_rate": 0.009314450378221857, "loss": 2.6732, "step": 5111 }, { "crossentropy": 2.8256642818450928, "epoch": 0.18532482598607888, "grad_norm": 0.04700205847620964, "grad_norm_var": 1.8912791969361423e-05, "learning_rate": 0.009314156672205379, "loss": 2.677, "step": 5112 }, { "crossentropy": 2.763915538787842, "epoch": 0.1853610788863109, "grad_norm": 0.038098860532045364, "grad_norm_var": 1.8015599401800346e-05, "learning_rate": 0.009313862907919667, "loss": 2.745, "step": 5113 }, { "crossentropy": 2.6171844005584717, "epoch": 0.1853973317865429, "grad_norm": 0.0402659997344017, "grad_norm_var": 1.821782997008687e-05, "learning_rate": 0.009313569085368692, "loss": 2.6796, "step": 5114 }, { "crossentropy": 2.640947103500366, "epoch": 0.18543358468677495, "grad_norm": 0.037254758179187775, "grad_norm_var": 1.829067333797447e-05, "learning_rate": 0.00931327520455642, "loss": 2.5913, "step": 5115 }, { "crossentropy": 2.7798659801483154, "epoch": 0.18546983758700697, "grad_norm": 0.036714255809783936, "grad_norm_var": 1.7370410445073184e-05, "learning_rate": 0.009312981265486821, "loss": 2.8306, "step": 5116 }, { "crossentropy": 2.6511051654815674, "epoch": 0.18550609048723898, "grad_norm": 0.036246154457330704, "grad_norm_var": 1.745023824028611e-05, "learning_rate": 0.009312687268163864, "loss": 2.6241, "step": 5117 }, { "crossentropy": 2.7610363960266113, "epoch": 0.185542343387471, "grad_norm": 0.034926559776067734, "grad_norm_var": 1.752739457937981e-05, "learning_rate": 0.009312393212591521, "loss": 2.6962, "step": 5118 }, { "crossentropy": 2.7960853576660156, "epoch": 0.185578596287703, "grad_norm": 0.03445982187986374, "grad_norm_var": 1.824792134024015e-05, "learning_rate": 0.009312099098773763, "loss": 2.7362, "step": 5119 }, { "crossentropy": 2.7809548377990723, "epoch": 0.18561484918793503, "grad_norm": 0.035679228603839874, "grad_norm_var": 1.847227982950487e-05, "learning_rate": 0.009311804926714564, "loss": 2.7758, "step": 5120 }, { "crossentropy": 2.657172203063965, "epoch": 0.18565110208816704, "grad_norm": 0.03479200229048729, "grad_norm_var": 1.8831994948488987e-05, "learning_rate": 0.009311510696417896, "loss": 2.7459, "step": 5121 }, { "crossentropy": 2.6081430912017822, "epoch": 0.18568735498839908, "grad_norm": 0.03946785628795624, "grad_norm_var": 1.889145591562931e-05, "learning_rate": 0.009311216407887731, "loss": 2.6335, "step": 5122 }, { "crossentropy": 2.67117977142334, "epoch": 0.1857236078886311, "grad_norm": 0.05309886857867241, "grad_norm_var": 3.2806498396227105e-05, "learning_rate": 0.009310922061128046, "loss": 2.7196, "step": 5123 }, { "crossentropy": 2.7393147945404053, "epoch": 0.1857598607888631, "grad_norm": 0.056994255632162094, "grad_norm_var": 5.0805302729893934e-05, "learning_rate": 0.009310627656142819, "loss": 2.6948, "step": 5124 }, { "crossentropy": 2.6602847576141357, "epoch": 0.18579611368909513, "grad_norm": 0.0432259738445282, "grad_norm_var": 4.841400187708192e-05, "learning_rate": 0.009310333192936022, "loss": 2.6231, "step": 5125 }, { "crossentropy": 2.6253769397735596, "epoch": 0.18583236658932714, "grad_norm": 0.03341495245695114, "grad_norm_var": 5.114178938161809e-05, "learning_rate": 0.009310038671511636, "loss": 2.6692, "step": 5126 }, { "crossentropy": 2.783439874649048, "epoch": 0.18586861948955916, "grad_norm": 0.043651729822158813, "grad_norm_var": 4.7559022951783266e-05, "learning_rate": 0.009309744091873635, "loss": 2.7667, "step": 5127 }, { "crossentropy": 2.6246683597564697, "epoch": 0.18590487238979117, "grad_norm": 0.04714079201221466, "grad_norm_var": 4.7683628905882217e-05, "learning_rate": 0.009309449454026002, "loss": 2.6187, "step": 5128 }, { "crossentropy": 2.709261178970337, "epoch": 0.1859411252900232, "grad_norm": 0.03897136077284813, "grad_norm_var": 4.7470545809509776e-05, "learning_rate": 0.009309154757972712, "loss": 2.6491, "step": 5129 }, { "crossentropy": 2.7165706157684326, "epoch": 0.18597737819025523, "grad_norm": 0.03814650699496269, "grad_norm_var": 4.7787494222723396e-05, "learning_rate": 0.009308860003717749, "loss": 2.6707, "step": 5130 }, { "crossentropy": 2.7628567218780518, "epoch": 0.18601363109048724, "grad_norm": 0.0350281298160553, "grad_norm_var": 4.899003421334752e-05, "learning_rate": 0.009308565191265092, "loss": 2.7109, "step": 5131 }, { "crossentropy": 2.7495269775390625, "epoch": 0.18604988399071926, "grad_norm": 0.03759295865893364, "grad_norm_var": 4.863899180481597e-05, "learning_rate": 0.009308270320618723, "loss": 2.7559, "step": 5132 }, { "crossentropy": 2.726649522781372, "epoch": 0.18608613689095127, "grad_norm": 0.03674742579460144, "grad_norm_var": 4.839195215007441e-05, "learning_rate": 0.009307975391782627, "loss": 2.7228, "step": 5133 }, { "crossentropy": 2.4979801177978516, "epoch": 0.18612238979118328, "grad_norm": 0.03682013601064682, "grad_norm_var": 4.7282448316132474e-05, "learning_rate": 0.009307680404760784, "loss": 2.6932, "step": 5134 }, { "crossentropy": 2.6976194381713867, "epoch": 0.18615864269141533, "grad_norm": 0.03752218931913376, "grad_norm_var": 4.547291852774448e-05, "learning_rate": 0.00930738535955718, "loss": 2.6804, "step": 5135 }, { "crossentropy": 2.567021131515503, "epoch": 0.18619489559164734, "grad_norm": 0.0393916480243206, "grad_norm_var": 4.3938960324618226e-05, "learning_rate": 0.009307090256175802, "loss": 2.6125, "step": 5136 }, { "crossentropy": 2.730661392211914, "epoch": 0.18623114849187936, "grad_norm": 0.03824735805392265, "grad_norm_var": 4.194004906766333e-05, "learning_rate": 0.009306795094620632, "loss": 2.6882, "step": 5137 }, { "crossentropy": 2.660858392715454, "epoch": 0.18626740139211137, "grad_norm": 0.03762548416852951, "grad_norm_var": 4.2520307734253356e-05, "learning_rate": 0.00930649987489566, "loss": 2.6466, "step": 5138 }, { "crossentropy": 2.8200032711029053, "epoch": 0.18630365429234338, "grad_norm": 0.03670544922351837, "grad_norm_var": 3.254607569782367e-05, "learning_rate": 0.009306204597004871, "loss": 2.7525, "step": 5139 }, { "crossentropy": 2.6512839794158936, "epoch": 0.1863399071925754, "grad_norm": 0.038156528025865555, "grad_norm_var": 1.1604992972585881e-05, "learning_rate": 0.009305909260952254, "loss": 2.7406, "step": 5140 }, { "crossentropy": 2.6285858154296875, "epoch": 0.1863761600928074, "grad_norm": 0.03693435713648796, "grad_norm_var": 1.023972089213328e-05, "learning_rate": 0.009305613866741797, "loss": 2.6919, "step": 5141 }, { "crossentropy": 2.5744550228118896, "epoch": 0.18641241299303946, "grad_norm": 0.03448909521102905, "grad_norm_var": 9.618493148570658e-06, "learning_rate": 0.009305318414377492, "loss": 2.5763, "step": 5142 }, { "crossentropy": 2.7140538692474365, "epoch": 0.18644866589327147, "grad_norm": 0.034732431173324585, "grad_norm_var": 8.25370753968346e-06, "learning_rate": 0.00930502290386333, "loss": 2.7295, "step": 5143 }, { "crossentropy": 2.7375569343566895, "epoch": 0.18648491879350348, "grad_norm": 0.03410706669092178, "grad_norm_var": 2.578836363308021e-06, "learning_rate": 0.009304727335203298, "loss": 2.7064, "step": 5144 }, { "crossentropy": 2.623338222503662, "epoch": 0.1865211716937355, "grad_norm": 0.03299270197749138, "grad_norm_var": 3.202425107171001e-06, "learning_rate": 0.009304431708401392, "loss": 2.6127, "step": 5145 }, { "crossentropy": 2.5643744468688965, "epoch": 0.1865574245939675, "grad_norm": 0.035438913851976395, "grad_norm_var": 3.094173343158995e-06, "learning_rate": 0.009304136023461602, "loss": 2.5335, "step": 5146 }, { "crossentropy": 2.652144432067871, "epoch": 0.18659367749419953, "grad_norm": 0.03755461052060127, "grad_norm_var": 3.028207125749018e-06, "learning_rate": 0.009303840280387924, "loss": 2.6539, "step": 5147 }, { "crossentropy": 2.717287302017212, "epoch": 0.18662993039443154, "grad_norm": 0.03805035352706909, "grad_norm_var": 3.1039038655963336e-06, "learning_rate": 0.009303544479184352, "loss": 2.7523, "step": 5148 }, { "crossentropy": 2.6429760456085205, "epoch": 0.18666618329466358, "grad_norm": 0.0337497852742672, "grad_norm_var": 3.6044908448956228e-06, "learning_rate": 0.009303248619854881, "loss": 2.6353, "step": 5149 }, { "crossentropy": 2.555891275405884, "epoch": 0.1867024361948956, "grad_norm": 0.03265722095966339, "grad_norm_var": 4.458505752528523e-06, "learning_rate": 0.009302952702403505, "loss": 2.6291, "step": 5150 }, { "crossentropy": 2.714265823364258, "epoch": 0.1867386890951276, "grad_norm": 0.03359782323241234, "grad_norm_var": 4.701584574888911e-06, "learning_rate": 0.009302656726834224, "loss": 2.7028, "step": 5151 }, { "crossentropy": 2.509042501449585, "epoch": 0.18677494199535963, "grad_norm": 0.03165343403816223, "grad_norm_var": 4.843520414993021e-06, "learning_rate": 0.009302360693151036, "loss": 2.5344, "step": 5152 }, { "crossentropy": 2.71505069732666, "epoch": 0.18681119489559164, "grad_norm": 0.04374997690320015, "grad_norm_var": 8.811585297537524e-06, "learning_rate": 0.009302064601357936, "loss": 2.6481, "step": 5153 }, { "crossentropy": 2.66917085647583, "epoch": 0.18684744779582366, "grad_norm": 0.034062065184116364, "grad_norm_var": 8.719920133299734e-06, "learning_rate": 0.009301768451458926, "loss": 2.6855, "step": 5154 }, { "crossentropy": 2.6522066593170166, "epoch": 0.18688370069605567, "grad_norm": 0.034430667757987976, "grad_norm_var": 8.689693715730843e-06, "learning_rate": 0.009301472243458005, "loss": 2.6795, "step": 5155 }, { "crossentropy": 2.8411147594451904, "epoch": 0.1869199535962877, "grad_norm": 0.03559122607111931, "grad_norm_var": 8.15722997319519e-06, "learning_rate": 0.009301175977359173, "loss": 2.7799, "step": 5156 }, { "crossentropy": 2.7764923572540283, "epoch": 0.18695620649651973, "grad_norm": 0.03585395961999893, "grad_norm_var": 7.985671821175653e-06, "learning_rate": 0.009300879653166432, "loss": 2.7466, "step": 5157 }, { "crossentropy": 2.561849594116211, "epoch": 0.18699245939675174, "grad_norm": 0.03475227952003479, "grad_norm_var": 7.966126161467318e-06, "learning_rate": 0.009300583270883785, "loss": 2.5567, "step": 5158 }, { "crossentropy": 2.862920045852661, "epoch": 0.18702871229698376, "grad_norm": 0.03388224542140961, "grad_norm_var": 8.06270733847003e-06, "learning_rate": 0.009300286830515234, "loss": 2.7302, "step": 5159 }, { "crossentropy": 2.753790855407715, "epoch": 0.18706496519721577, "grad_norm": 0.03365102410316467, "grad_norm_var": 8.138074391693337e-06, "learning_rate": 0.009299990332064785, "loss": 2.7435, "step": 5160 }, { "crossentropy": 2.6051056385040283, "epoch": 0.18710121809744779, "grad_norm": 0.033736955374479294, "grad_norm_var": 7.96315527049566e-06, "learning_rate": 0.009299693775536438, "loss": 2.661, "step": 5161 }, { "crossentropy": 2.728760004043579, "epoch": 0.18713747099767983, "grad_norm": 0.03340096026659012, "grad_norm_var": 8.144440949681142e-06, "learning_rate": 0.009299397160934204, "loss": 2.6926, "step": 5162 }, { "crossentropy": 2.7706730365753174, "epoch": 0.18717372389791184, "grad_norm": 0.041457273066043854, "grad_norm_var": 1.0413486342609965e-05, "learning_rate": 0.009299100488262086, "loss": 2.7395, "step": 5163 }, { "crossentropy": 2.5683975219726562, "epoch": 0.18720997679814386, "grad_norm": 0.042909301817417145, "grad_norm_var": 1.3692082687765244e-05, "learning_rate": 0.009298803757524092, "loss": 2.6292, "step": 5164 }, { "crossentropy": 2.764483690261841, "epoch": 0.18724622969837587, "grad_norm": 0.04367299750447273, "grad_norm_var": 1.743681004234317e-05, "learning_rate": 0.009298506968724229, "loss": 2.7344, "step": 5165 }, { "crossentropy": 2.7053520679473877, "epoch": 0.18728248259860789, "grad_norm": 0.041887540370225906, "grad_norm_var": 1.8412417889958575e-05, "learning_rate": 0.009298210121866508, "loss": 2.6809, "step": 5166 }, { "crossentropy": 2.7923264503479004, "epoch": 0.1873187354988399, "grad_norm": 0.04180759936571121, "grad_norm_var": 1.91546338164714e-05, "learning_rate": 0.009297913216954935, "loss": 2.8066, "step": 5167 }, { "crossentropy": 2.723545789718628, "epoch": 0.18735498839907191, "grad_norm": 0.03792276605963707, "grad_norm_var": 1.6906839445468558e-05, "learning_rate": 0.009297616253993522, "loss": 2.7357, "step": 5168 }, { "crossentropy": 2.7512905597686768, "epoch": 0.18739124129930396, "grad_norm": 0.03540980443358421, "grad_norm_var": 1.4496564455088872e-05, "learning_rate": 0.00929731923298628, "loss": 2.7146, "step": 5169 }, { "crossentropy": 2.7838077545166016, "epoch": 0.18742749419953597, "grad_norm": 0.03285610303282738, "grad_norm_var": 1.5084273418247637e-05, "learning_rate": 0.00929702215393722, "loss": 2.7824, "step": 5170 }, { "crossentropy": 2.7534894943237305, "epoch": 0.18746374709976799, "grad_norm": 0.032916221767663956, "grad_norm_var": 1.576186639627904e-05, "learning_rate": 0.009296725016850353, "loss": 2.7703, "step": 5171 }, { "crossentropy": 2.616903781890869, "epoch": 0.1875, "grad_norm": 0.03294726461172104, "grad_norm_var": 1.6688979228867267e-05, "learning_rate": 0.009296427821729698, "loss": 2.7009, "step": 5172 }, { "crossentropy": 2.7945098876953125, "epoch": 0.18753625290023201, "grad_norm": 0.042499441653490067, "grad_norm_var": 1.8596242143977212e-05, "learning_rate": 0.009296130568579261, "loss": 2.7086, "step": 5173 }, { "crossentropy": 2.6207096576690674, "epoch": 0.18757250580046403, "grad_norm": 0.037177808582782745, "grad_norm_var": 1.8162035289162604e-05, "learning_rate": 0.009295833257403064, "loss": 2.697, "step": 5174 }, { "crossentropy": 2.7574312686920166, "epoch": 0.18760875870069604, "grad_norm": 0.03448861464858055, "grad_norm_var": 1.7901945278518164e-05, "learning_rate": 0.00929553588820512, "loss": 2.7347, "step": 5175 }, { "crossentropy": 2.7493982315063477, "epoch": 0.18764501160092809, "grad_norm": 0.03508548438549042, "grad_norm_var": 1.730943143094654e-05, "learning_rate": 0.009295238460989442, "loss": 2.7246, "step": 5176 }, { "crossentropy": 2.6744542121887207, "epoch": 0.1876812645011601, "grad_norm": 0.042882125824689865, "grad_norm_var": 1.793465071999589e-05, "learning_rate": 0.009294940975760054, "loss": 2.657, "step": 5177 }, { "crossentropy": 2.7348973751068115, "epoch": 0.18771751740139211, "grad_norm": 0.04235103726387024, "grad_norm_var": 1.7354360123348656e-05, "learning_rate": 0.009294643432520969, "loss": 2.7296, "step": 5178 }, { "crossentropy": 2.653574228286743, "epoch": 0.18775377030162413, "grad_norm": 0.04152629151940346, "grad_norm_var": 1.7380565638142054e-05, "learning_rate": 0.009294345831276205, "loss": 2.625, "step": 5179 }, { "crossentropy": 2.4647624492645264, "epoch": 0.18779002320185614, "grad_norm": 0.037306468933820724, "grad_norm_var": 1.615787904607289e-05, "learning_rate": 0.009294048172029785, "loss": 2.6402, "step": 5180 }, { "crossentropy": 2.728597402572632, "epoch": 0.18782627610208816, "grad_norm": 0.037159476429224014, "grad_norm_var": 1.413982835329012e-05, "learning_rate": 0.00929375045478573, "loss": 2.7342, "step": 5181 }, { "crossentropy": 2.7082886695861816, "epoch": 0.18786252900232017, "grad_norm": 0.035841044038534164, "grad_norm_var": 1.3201216893680458e-05, "learning_rate": 0.009293452679548057, "loss": 2.6787, "step": 5182 }, { "crossentropy": 2.6599607467651367, "epoch": 0.18789878190255221, "grad_norm": 0.03502368927001953, "grad_norm_var": 1.2191278583161858e-05, "learning_rate": 0.009293154846320791, "loss": 2.6268, "step": 5183 }, { "crossentropy": 2.550006866455078, "epoch": 0.18793503480278423, "grad_norm": 0.03779689595103264, "grad_norm_var": 1.2178244115934541e-05, "learning_rate": 0.009292856955107951, "loss": 2.6186, "step": 5184 }, { "crossentropy": 2.4100747108459473, "epoch": 0.18797128770301624, "grad_norm": 0.0342046320438385, "grad_norm_var": 1.2537281987195827e-05, "learning_rate": 0.009292559005913567, "loss": 2.5023, "step": 5185 }, { "crossentropy": 2.882966995239258, "epoch": 0.18800754060324826, "grad_norm": 0.034672342240810394, "grad_norm_var": 1.173899706743229e-05, "learning_rate": 0.009292260998741657, "loss": 2.8237, "step": 5186 }, { "crossentropy": 2.7297182083129883, "epoch": 0.18804379350348027, "grad_norm": 0.03408176079392433, "grad_norm_var": 1.1171012920116405e-05, "learning_rate": 0.00929196293359625, "loss": 2.6626, "step": 5187 }, { "crossentropy": 2.72583270072937, "epoch": 0.1880800464037123, "grad_norm": 0.03426312655210495, "grad_norm_var": 1.0534802665299916e-05, "learning_rate": 0.009291664810481368, "loss": 2.6828, "step": 5188 }, { "crossentropy": 2.6956746578216553, "epoch": 0.18811629930394433, "grad_norm": 0.03431829437613487, "grad_norm_var": 9.016366612949317e-06, "learning_rate": 0.009291366629401042, "loss": 2.7538, "step": 5189 }, { "crossentropy": 2.7477762699127197, "epoch": 0.18815255220417634, "grad_norm": 0.03307947888970375, "grad_norm_var": 9.838478852339933e-06, "learning_rate": 0.009291068390359298, "loss": 2.7278, "step": 5190 }, { "crossentropy": 2.738596200942993, "epoch": 0.18818880510440836, "grad_norm": 0.03187717869877815, "grad_norm_var": 1.0966808477103075e-05, "learning_rate": 0.009290770093360162, "loss": 2.6994, "step": 5191 }, { "crossentropy": 2.7101731300354004, "epoch": 0.18822505800464037, "grad_norm": 0.03512818366289139, "grad_norm_var": 1.0959769738751411e-05, "learning_rate": 0.009290471738407665, "loss": 2.7159, "step": 5192 }, { "crossentropy": 2.5361812114715576, "epoch": 0.1882613109048724, "grad_norm": 0.03441992774605751, "grad_norm_var": 8.058963417820198e-06, "learning_rate": 0.009290173325505835, "loss": 2.6022, "step": 5193 }, { "crossentropy": 2.6187214851379395, "epoch": 0.1882975638051044, "grad_norm": 0.03309043496847153, "grad_norm_var": 5.349291847248777e-06, "learning_rate": 0.009289874854658705, "loss": 2.6234, "step": 5194 }, { "crossentropy": 2.5517754554748535, "epoch": 0.18833381670533642, "grad_norm": 0.033123668283224106, "grad_norm_var": 2.715646168044428e-06, "learning_rate": 0.009289576325870304, "loss": 2.622, "step": 5195 }, { "crossentropy": 2.6407251358032227, "epoch": 0.18837006960556846, "grad_norm": 0.033820804208517075, "grad_norm_var": 2.2690623786875143e-06, "learning_rate": 0.009289277739144666, "loss": 2.6538, "step": 5196 }, { "crossentropy": 2.676358938217163, "epoch": 0.18840632250580047, "grad_norm": 0.0316670760512352, "grad_norm_var": 2.202344274475759e-06, "learning_rate": 0.009288979094485821, "loss": 2.6785, "step": 5197 }, { "crossentropy": 2.727715492248535, "epoch": 0.1884425754060325, "grad_norm": 0.033124443143606186, "grad_norm_var": 2.0512636821447155e-06, "learning_rate": 0.009288680391897806, "loss": 2.7306, "step": 5198 }, { "crossentropy": 2.7154531478881836, "epoch": 0.1884788283062645, "grad_norm": 0.03353869915008545, "grad_norm_var": 1.9825870280070742e-06, "learning_rate": 0.009288381631384655, "loss": 2.8167, "step": 5199 }, { "crossentropy": 2.817744016647339, "epoch": 0.18851508120649652, "grad_norm": 0.03303029388189316, "grad_norm_var": 9.182892764701227e-07, "learning_rate": 0.009288082812950401, "loss": 2.7197, "step": 5200 }, { "crossentropy": 2.685514211654663, "epoch": 0.18855133410672853, "grad_norm": 0.033520862460136414, "grad_norm_var": 8.914769837072474e-07, "learning_rate": 0.009287783936599082, "loss": 2.6675, "step": 5201 }, { "crossentropy": 2.686434268951416, "epoch": 0.18858758700696054, "grad_norm": 0.03410088270902634, "grad_norm_var": 8.261641392226516e-07, "learning_rate": 0.009287485002334733, "loss": 2.6236, "step": 5202 }, { "crossentropy": 2.7852022647857666, "epoch": 0.1886238399071926, "grad_norm": 0.03630683571100235, "grad_norm_var": 1.3047614182220535e-06, "learning_rate": 0.009287186010161392, "loss": 2.6951, "step": 5203 }, { "crossentropy": 2.643568277359009, "epoch": 0.1886600928074246, "grad_norm": 0.037440624088048935, "grad_norm_var": 2.1952833257433967e-06, "learning_rate": 0.0092868869600831, "loss": 2.7174, "step": 5204 }, { "crossentropy": 2.683900833129883, "epoch": 0.18869634570765662, "grad_norm": 0.041898466646671295, "grad_norm_var": 6.260549550376505e-06, "learning_rate": 0.009286587852103892, "loss": 2.7232, "step": 5205 }, { "crossentropy": 2.638817071914673, "epoch": 0.18873259860788863, "grad_norm": 0.042983368039131165, "grad_norm_var": 1.0748907150454069e-05, "learning_rate": 0.009286288686227811, "loss": 2.7028, "step": 5206 }, { "crossentropy": 2.762524366378784, "epoch": 0.18876885150812064, "grad_norm": 0.04233328625559807, "grad_norm_var": 1.3309252070460807e-05, "learning_rate": 0.009285989462458896, "loss": 2.721, "step": 5207 }, { "crossentropy": 2.7882771492004395, "epoch": 0.18880510440835266, "grad_norm": 0.04678535461425781, "grad_norm_var": 2.107602326587669e-05, "learning_rate": 0.00928569018080119, "loss": 2.8347, "step": 5208 }, { "crossentropy": 2.7620866298675537, "epoch": 0.18884135730858467, "grad_norm": 0.04402061551809311, "grad_norm_var": 2.4399379285368742e-05, "learning_rate": 0.009285390841258732, "loss": 2.7352, "step": 5209 }, { "crossentropy": 2.679265022277832, "epoch": 0.18887761020881672, "grad_norm": 0.03418673202395439, "grad_norm_var": 2.3914116838770712e-05, "learning_rate": 0.009285091443835571, "loss": 2.6528, "step": 5210 }, { "crossentropy": 2.7821812629699707, "epoch": 0.18891386310904873, "grad_norm": 0.03473283350467682, "grad_norm_var": 2.3245849549446452e-05, "learning_rate": 0.009284791988535745, "loss": 2.8201, "step": 5211 }, { "crossentropy": 2.782869815826416, "epoch": 0.18895011600928074, "grad_norm": 0.03481346741318703, "grad_norm_var": 2.2874317743811803e-05, "learning_rate": 0.009284492475363303, "loss": 2.7743, "step": 5212 }, { "crossentropy": 2.5760934352874756, "epoch": 0.18898636890951276, "grad_norm": 0.03676134720444679, "grad_norm_var": 2.0768533232049893e-05, "learning_rate": 0.009284192904322284, "loss": 2.6113, "step": 5213 }, { "crossentropy": 2.793919324874878, "epoch": 0.18902262180974477, "grad_norm": 0.03559562936425209, "grad_norm_var": 1.971718506507287e-05, "learning_rate": 0.009283893275416741, "loss": 2.7638, "step": 5214 }, { "crossentropy": 2.698989152908325, "epoch": 0.1890588747099768, "grad_norm": 0.041062790900468826, "grad_norm_var": 1.9152914295972264e-05, "learning_rate": 0.009283593588650718, "loss": 2.6885, "step": 5215 }, { "crossentropy": 2.7424936294555664, "epoch": 0.18909512761020883, "grad_norm": 0.04654177278280258, "grad_norm_var": 2.1432684009509092e-05, "learning_rate": 0.009283293844028264, "loss": 2.7505, "step": 5216 }, { "crossentropy": 2.719095468521118, "epoch": 0.18913138051044084, "grad_norm": 0.044767364859580994, "grad_norm_var": 2.1207538227574426e-05, "learning_rate": 0.009282994041553425, "loss": 2.7089, "step": 5217 }, { "crossentropy": 2.651953935623169, "epoch": 0.18916763341067286, "grad_norm": 0.03995804488658905, "grad_norm_var": 1.902142409941487e-05, "learning_rate": 0.00928269418123025, "loss": 2.6821, "step": 5218 }, { "crossentropy": 2.7328341007232666, "epoch": 0.18920388631090487, "grad_norm": 0.03784612566232681, "grad_norm_var": 1.8409113954373908e-05, "learning_rate": 0.009282394263062795, "loss": 2.7019, "step": 5219 }, { "crossentropy": 2.6877944469451904, "epoch": 0.1892401392111369, "grad_norm": 0.04142235219478607, "grad_norm_var": 1.7983902636505244e-05, "learning_rate": 0.009282094287055105, "loss": 2.696, "step": 5220 }, { "crossentropy": 2.4578592777252197, "epoch": 0.1892763921113689, "grad_norm": 0.040040694177150726, "grad_norm_var": 1.7817746238712787e-05, "learning_rate": 0.009281794253211233, "loss": 2.5432, "step": 5221 }, { "crossentropy": 2.6944420337677, "epoch": 0.18931264501160092, "grad_norm": 0.03801023215055466, "grad_norm_var": 1.754490383866769e-05, "learning_rate": 0.009281494161535231, "loss": 2.7142, "step": 5222 }, { "crossentropy": 2.73551344871521, "epoch": 0.18934889791183296, "grad_norm": 0.03904145210981369, "grad_norm_var": 1.7167298079181526e-05, "learning_rate": 0.009281194012031155, "loss": 2.7135, "step": 5223 }, { "crossentropy": 2.648085594177246, "epoch": 0.18938515081206497, "grad_norm": 0.03333211690187454, "grad_norm_var": 1.581305238770447e-05, "learning_rate": 0.009280893804703056, "loss": 2.6227, "step": 5224 }, { "crossentropy": 2.690467357635498, "epoch": 0.189421403712297, "grad_norm": 0.03497403487563133, "grad_norm_var": 1.4731464177839688e-05, "learning_rate": 0.00928059353955499, "loss": 2.7217, "step": 5225 }, { "crossentropy": 2.6458210945129395, "epoch": 0.189457656612529, "grad_norm": 0.0365537628531456, "grad_norm_var": 1.3777816109471613e-05, "learning_rate": 0.009280293216591011, "loss": 2.6553, "step": 5226 }, { "crossentropy": 2.7048532962799072, "epoch": 0.18949390951276102, "grad_norm": 0.03815361484885216, "grad_norm_var": 1.2806518722412636e-05, "learning_rate": 0.009279992835815177, "loss": 2.6608, "step": 5227 }, { "crossentropy": 2.6319265365600586, "epoch": 0.18953016241299303, "grad_norm": 0.03906158357858658, "grad_norm_var": 1.1744544434537315e-05, "learning_rate": 0.009279692397231545, "loss": 2.694, "step": 5228 }, { "crossentropy": 2.637559175491333, "epoch": 0.18956641531322505, "grad_norm": 0.03943254426121712, "grad_norm_var": 1.141270638438287e-05, "learning_rate": 0.009279391900844172, "loss": 2.6765, "step": 5229 }, { "crossentropy": 2.5717368125915527, "epoch": 0.1896026682134571, "grad_norm": 0.03986099362373352, "grad_norm_var": 1.0549900872891537e-05, "learning_rate": 0.009279091346657118, "loss": 2.665, "step": 5230 }, { "crossentropy": 2.6633362770080566, "epoch": 0.1896389211136891, "grad_norm": 0.04393764212727547, "grad_norm_var": 1.1711977019609095e-05, "learning_rate": 0.00927879073467444, "loss": 2.6212, "step": 5231 }, { "crossentropy": 2.6682732105255127, "epoch": 0.18967517401392112, "grad_norm": 0.03948640823364258, "grad_norm_var": 8.253743137548221e-06, "learning_rate": 0.009278490064900202, "loss": 2.6616, "step": 5232 }, { "crossentropy": 2.779388427734375, "epoch": 0.18971142691415313, "grad_norm": 0.0357515923678875, "grad_norm_var": 6.542205663009835e-06, "learning_rate": 0.00927818933733846, "loss": 2.7149, "step": 5233 }, { "crossentropy": 2.8504083156585693, "epoch": 0.18974767981438515, "grad_norm": 0.03463420271873474, "grad_norm_var": 7.316970790276154e-06, "learning_rate": 0.009277888551993282, "loss": 2.7623, "step": 5234 }, { "crossentropy": 2.4783036708831787, "epoch": 0.18978393271461716, "grad_norm": 0.03598816692829132, "grad_norm_var": 7.6256401725982e-06, "learning_rate": 0.009277587708868726, "loss": 2.6251, "step": 5235 }, { "crossentropy": 2.6107168197631836, "epoch": 0.1898201856148492, "grad_norm": 0.036613646894693375, "grad_norm_var": 6.9439680707023e-06, "learning_rate": 0.009277286807968856, "loss": 2.6665, "step": 5236 }, { "crossentropy": 2.544055938720703, "epoch": 0.18985643851508122, "grad_norm": 0.039417341351509094, "grad_norm_var": 6.782398800016474e-06, "learning_rate": 0.009276985849297736, "loss": 2.6123, "step": 5237 }, { "crossentropy": 2.745079278945923, "epoch": 0.18989269141531323, "grad_norm": 0.03645551577210426, "grad_norm_var": 6.88275566781271e-06, "learning_rate": 0.009276684832859433, "loss": 2.6696, "step": 5238 }, { "crossentropy": 2.648148775100708, "epoch": 0.18992894431554525, "grad_norm": 0.03505382314324379, "grad_norm_var": 7.146557359768065e-06, "learning_rate": 0.00927638375865801, "loss": 2.6826, "step": 5239 }, { "crossentropy": 2.6737465858459473, "epoch": 0.18996519721577726, "grad_norm": 0.035988371819257736, "grad_norm_var": 6.140031357259173e-06, "learning_rate": 0.009276082626697536, "loss": 2.7079, "step": 5240 }, { "crossentropy": 2.7023439407348633, "epoch": 0.19000145011600927, "grad_norm": 0.036611661314964294, "grad_norm_var": 5.73749610387822e-06, "learning_rate": 0.009275781436982076, "loss": 2.7207, "step": 5241 }, { "crossentropy": 2.621748685836792, "epoch": 0.1900377030162413, "grad_norm": 0.034557607024908066, "grad_norm_var": 6.288299289015399e-06, "learning_rate": 0.0092754801895157, "loss": 2.6711, "step": 5242 }, { "crossentropy": 2.6592822074890137, "epoch": 0.19007395591647333, "grad_norm": 0.03449935093522072, "grad_norm_var": 6.835033743966303e-06, "learning_rate": 0.009275178884302474, "loss": 2.6509, "step": 5243 }, { "crossentropy": 2.742436170578003, "epoch": 0.19011020881670534, "grad_norm": 0.03449108451604843, "grad_norm_var": 7.0880816262843795e-06, "learning_rate": 0.00927487752134647, "loss": 2.6929, "step": 5244 }, { "crossentropy": 2.82779860496521, "epoch": 0.19014646171693736, "grad_norm": 0.0384661965072155, "grad_norm_var": 6.8393022662052135e-06, "learning_rate": 0.009274576100651757, "loss": 2.7606, "step": 5245 }, { "crossentropy": 2.602027416229248, "epoch": 0.19018271461716937, "grad_norm": 0.03636442497372627, "grad_norm_var": 6.264174188628605e-06, "learning_rate": 0.00927427462222241, "loss": 2.6343, "step": 5246 }, { "crossentropy": 2.720066547393799, "epoch": 0.1902189675174014, "grad_norm": 0.03871430456638336, "grad_norm_var": 2.977380074722866e-06, "learning_rate": 0.009273973086062495, "loss": 2.7857, "step": 5247 }, { "crossentropy": 2.728959083557129, "epoch": 0.1902552204176334, "grad_norm": 0.03538509085774422, "grad_norm_var": 2.364610709642186e-06, "learning_rate": 0.009273671492176087, "loss": 2.7316, "step": 5248 }, { "crossentropy": 2.714139699935913, "epoch": 0.19029147331786542, "grad_norm": 0.03572451323270798, "grad_norm_var": 2.3662286873643e-06, "learning_rate": 0.009273369840567262, "loss": 2.6901, "step": 5249 }, { "crossentropy": 2.6303319931030273, "epoch": 0.19032772621809746, "grad_norm": 0.03502972051501274, "grad_norm_var": 2.2942059659282477e-06, "learning_rate": 0.00927306813124009, "loss": 2.565, "step": 5250 }, { "crossentropy": 2.7454516887664795, "epoch": 0.19036397911832947, "grad_norm": 0.0368427075445652, "grad_norm_var": 2.3145647204661847e-06, "learning_rate": 0.00927276636419865, "loss": 2.6846, "step": 5251 }, { "crossentropy": 2.6567838191986084, "epoch": 0.1904002320185615, "grad_norm": 0.03610983118414879, "grad_norm_var": 2.3069051618663032e-06, "learning_rate": 0.009272464539447016, "loss": 2.6129, "step": 5252 }, { "crossentropy": 2.689324140548706, "epoch": 0.1904364849187935, "grad_norm": 0.03916175290942192, "grad_norm_var": 2.202435509571221e-06, "learning_rate": 0.009272162656989266, "loss": 2.7633, "step": 5253 }, { "crossentropy": 2.663985252380371, "epoch": 0.19047273781902552, "grad_norm": 0.03439333662390709, "grad_norm_var": 2.402364604140589e-06, "learning_rate": 0.009271860716829474, "loss": 2.6685, "step": 5254 }, { "crossentropy": 2.6689209938049316, "epoch": 0.19050899071925753, "grad_norm": 0.035139936953783035, "grad_norm_var": 2.3909640299188867e-06, "learning_rate": 0.00927155871897172, "loss": 2.5895, "step": 5255 }, { "crossentropy": 2.601519823074341, "epoch": 0.19054524361948955, "grad_norm": 0.03523379564285278, "grad_norm_var": 2.4370262754965763e-06, "learning_rate": 0.009271256663420085, "loss": 2.6518, "step": 5256 }, { "crossentropy": 2.655978202819824, "epoch": 0.1905814965197216, "grad_norm": 0.03282184526324272, "grad_norm_var": 3.048524295961909e-06, "learning_rate": 0.009270954550178645, "loss": 2.6575, "step": 5257 }, { "crossentropy": 2.5696325302124023, "epoch": 0.1906177494199536, "grad_norm": 0.0344652384519577, "grad_norm_var": 3.064462917465258e-06, "learning_rate": 0.009270652379251483, "loss": 2.6087, "step": 5258 }, { "crossentropy": 2.617713689804077, "epoch": 0.19065400232018562, "grad_norm": 0.033826280385255814, "grad_norm_var": 3.2097426427137843e-06, "learning_rate": 0.009270350150642679, "loss": 2.6388, "step": 5259 }, { "crossentropy": 2.436765670776367, "epoch": 0.19069025522041763, "grad_norm": 0.034129731357097626, "grad_norm_var": 3.2790708256073713e-06, "learning_rate": 0.009270047864356317, "loss": 2.6015, "step": 5260 }, { "crossentropy": 2.5958197116851807, "epoch": 0.19072650812064965, "grad_norm": 0.03752601891756058, "grad_norm_var": 2.9923236711653736e-06, "learning_rate": 0.009269745520396477, "loss": 2.5994, "step": 5261 }, { "crossentropy": 2.6956257820129395, "epoch": 0.19076276102088166, "grad_norm": 0.03915517032146454, "grad_norm_var": 3.7340307672605855e-06, "learning_rate": 0.009269443118767247, "loss": 2.6207, "step": 5262 }, { "crossentropy": 2.63393497467041, "epoch": 0.1907990139211137, "grad_norm": 0.03671332076191902, "grad_norm_var": 3.2210749038250517e-06, "learning_rate": 0.009269140659472706, "loss": 2.7224, "step": 5263 }, { "crossentropy": 2.6842806339263916, "epoch": 0.19083526682134572, "grad_norm": 0.036966100335121155, "grad_norm_var": 3.3048780750739744e-06, "learning_rate": 0.009268838142516942, "loss": 2.7606, "step": 5264 }, { "crossentropy": 2.615872383117676, "epoch": 0.19087151972157773, "grad_norm": 0.037010736763477325, "grad_norm_var": 3.3906218923244907e-06, "learning_rate": 0.009268535567904041, "loss": 2.6252, "step": 5265 }, { "crossentropy": 2.6497347354888916, "epoch": 0.19090777262180975, "grad_norm": 0.03569089248776436, "grad_norm_var": 3.3405314711890467e-06, "learning_rate": 0.00926823293563809, "loss": 2.6083, "step": 5266 }, { "crossentropy": 2.8342108726501465, "epoch": 0.19094402552204176, "grad_norm": 0.033772941678762436, "grad_norm_var": 3.563770451980343e-06, "learning_rate": 0.009267930245723177, "loss": 2.7176, "step": 5267 }, { "crossentropy": 2.828765869140625, "epoch": 0.19098027842227377, "grad_norm": 0.03389069065451622, "grad_norm_var": 3.76725056806104e-06, "learning_rate": 0.009267627498163388, "loss": 2.7797, "step": 5268 }, { "crossentropy": 2.819986343383789, "epoch": 0.1910165313225058, "grad_norm": 0.03799872472882271, "grad_norm_var": 3.302353839815195e-06, "learning_rate": 0.009267324692962813, "loss": 2.7913, "step": 5269 }, { "crossentropy": 2.6960909366607666, "epoch": 0.19105278422273783, "grad_norm": 0.03638387471437454, "grad_norm_var": 3.244091790708531e-06, "learning_rate": 0.009267021830125542, "loss": 2.6507, "step": 5270 }, { "crossentropy": 2.7423007488250732, "epoch": 0.19108903712296985, "grad_norm": 0.03973236680030823, "grad_norm_var": 4.237469410253193e-06, "learning_rate": 0.009266718909655667, "loss": 2.6652, "step": 5271 }, { "crossentropy": 2.6737260818481445, "epoch": 0.19112529002320186, "grad_norm": 0.04185570403933525, "grad_norm_var": 6.339225279764863e-06, "learning_rate": 0.009266415931557278, "loss": 2.6797, "step": 5272 }, { "crossentropy": 2.7028956413269043, "epoch": 0.19116154292343387, "grad_norm": 0.0375664122402668, "grad_norm_var": 5.50078670979795e-06, "learning_rate": 0.009266112895834464, "loss": 2.7618, "step": 5273 }, { "crossentropy": 2.7985265254974365, "epoch": 0.1911977958236659, "grad_norm": 0.03351078927516937, "grad_norm_var": 5.8380155422308365e-06, "learning_rate": 0.009265809802491325, "loss": 2.7853, "step": 5274 }, { "crossentropy": 2.633850336074829, "epoch": 0.1912340487238979, "grad_norm": 0.03842485696077347, "grad_norm_var": 5.4540365065940935e-06, "learning_rate": 0.00926550665153195, "loss": 2.6765, "step": 5275 }, { "crossentropy": 2.622990131378174, "epoch": 0.19127030162412992, "grad_norm": 0.03765545412898064, "grad_norm_var": 4.930769019387271e-06, "learning_rate": 0.009265203442960435, "loss": 2.6396, "step": 5276 }, { "crossentropy": 2.696687936782837, "epoch": 0.19130655452436196, "grad_norm": 0.03480090573430061, "grad_norm_var": 5.245885234162785e-06, "learning_rate": 0.009264900176780872, "loss": 2.6976, "step": 5277 }, { "crossentropy": 2.811967134475708, "epoch": 0.19134280742459397, "grad_norm": 0.033250823616981506, "grad_norm_var": 5.685208816201027e-06, "learning_rate": 0.009264596852997364, "loss": 2.7909, "step": 5278 }, { "crossentropy": 2.685105085372925, "epoch": 0.191379060324826, "grad_norm": 0.03396359458565712, "grad_norm_var": 6.107621958822453e-06, "learning_rate": 0.009264293471614001, "loss": 2.6888, "step": 5279 }, { "crossentropy": 2.666884183883667, "epoch": 0.191415313225058, "grad_norm": 0.0331823006272316, "grad_norm_var": 6.719202588255773e-06, "learning_rate": 0.009263990032634884, "loss": 2.6395, "step": 5280 }, { "crossentropy": 2.7924563884735107, "epoch": 0.19145156612529002, "grad_norm": 0.03461459279060364, "grad_norm_var": 6.808865509338955e-06, "learning_rate": 0.009263686536064112, "loss": 2.7089, "step": 5281 }, { "crossentropy": 2.5995635986328125, "epoch": 0.19148781902552203, "grad_norm": 0.033372774720191956, "grad_norm_var": 7.24595682410877e-06, "learning_rate": 0.009263382981905781, "loss": 2.5384, "step": 5282 }, { "crossentropy": 2.8716282844543457, "epoch": 0.19152407192575405, "grad_norm": 0.03327116742730141, "grad_norm_var": 7.402230431149034e-06, "learning_rate": 0.009263079370163994, "loss": 2.7717, "step": 5283 }, { "crossentropy": 2.60103440284729, "epoch": 0.1915603248259861, "grad_norm": 0.03274419158697128, "grad_norm_var": 7.78270307901224e-06, "learning_rate": 0.00926277570084285, "loss": 2.5125, "step": 5284 }, { "crossentropy": 2.647059440612793, "epoch": 0.1915965777262181, "grad_norm": 0.0360851027071476, "grad_norm_var": 7.4430527671739384e-06, "learning_rate": 0.00926247197394645, "loss": 2.7781, "step": 5285 }, { "crossentropy": 2.7455320358276367, "epoch": 0.19163283062645012, "grad_norm": 0.035308968275785446, "grad_norm_var": 7.4102207784689416e-06, "learning_rate": 0.0092621681894789, "loss": 2.7604, "step": 5286 }, { "crossentropy": 2.7068052291870117, "epoch": 0.19166908352668213, "grad_norm": 0.03440801054239273, "grad_norm_var": 6.236857277843131e-06, "learning_rate": 0.0092618643474443, "loss": 2.7407, "step": 5287 }, { "crossentropy": 2.6649696826934814, "epoch": 0.19170533642691415, "grad_norm": 0.033391308039426804, "grad_norm_var": 3.2607301219805354e-06, "learning_rate": 0.009261560447846755, "loss": 2.6615, "step": 5288 }, { "crossentropy": 2.709583044052124, "epoch": 0.19174158932714616, "grad_norm": 0.03655382990837097, "grad_norm_var": 2.940779629805451e-06, "learning_rate": 0.00926125649069037, "loss": 2.7342, "step": 5289 }, { "crossentropy": 2.794935703277588, "epoch": 0.1917778422273782, "grad_norm": 0.03884206339716911, "grad_norm_var": 3.901231471425959e-06, "learning_rate": 0.009260952475979245, "loss": 2.7477, "step": 5290 }, { "crossentropy": 2.700683355331421, "epoch": 0.19181409512761022, "grad_norm": 0.0474381297826767, "grad_norm_var": 1.3104332267952629e-05, "learning_rate": 0.009260648403717493, "loss": 2.6222, "step": 5291 }, { "crossentropy": 2.8296563625335693, "epoch": 0.19185034802784223, "grad_norm": 0.04767141863703728, "grad_norm_var": 2.21791117823109e-05, "learning_rate": 0.009260344273909218, "loss": 2.7915, "step": 5292 }, { "crossentropy": 2.549649715423584, "epoch": 0.19188660092807425, "grad_norm": 0.03877563029527664, "grad_norm_var": 2.2435009305201274e-05, "learning_rate": 0.00926004008655853, "loss": 2.6572, "step": 5293 }, { "crossentropy": 2.5330452919006348, "epoch": 0.19192285382830626, "grad_norm": 0.03753227740526199, "grad_norm_var": 2.1766038371149172e-05, "learning_rate": 0.009259735841669534, "loss": 2.6077, "step": 5294 }, { "crossentropy": 2.5889358520507812, "epoch": 0.19195910672853828, "grad_norm": 0.034534748643636703, "grad_norm_var": 2.1578251522639107e-05, "learning_rate": 0.00925943153924634, "loss": 2.6183, "step": 5295 }, { "crossentropy": 2.6049892902374268, "epoch": 0.1919953596287703, "grad_norm": 0.03521578758955002, "grad_norm_var": 2.0874011733060192e-05, "learning_rate": 0.009259127179293064, "loss": 2.6593, "step": 5296 }, { "crossentropy": 2.720184564590454, "epoch": 0.19203161252900233, "grad_norm": 0.03629504144191742, "grad_norm_var": 2.0547400402369462e-05, "learning_rate": 0.009258822761813808, "loss": 2.6149, "step": 5297 }, { "crossentropy": 2.7312686443328857, "epoch": 0.19206786542923435, "grad_norm": 0.03859681636095047, "grad_norm_var": 1.9750919393691535e-05, "learning_rate": 0.009258518286812688, "loss": 2.6995, "step": 5298 }, { "crossentropy": 2.883997678756714, "epoch": 0.19210411832946636, "grad_norm": 0.0348549522459507, "grad_norm_var": 1.9058707404012663e-05, "learning_rate": 0.009258213754293816, "loss": 2.8006, "step": 5299 }, { "crossentropy": 2.7480874061584473, "epoch": 0.19214037122969838, "grad_norm": 0.03490922600030899, "grad_norm_var": 1.801040760338457e-05, "learning_rate": 0.009257909164261306, "loss": 2.8175, "step": 5300 }, { "crossentropy": 2.598048686981201, "epoch": 0.1921766241299304, "grad_norm": 0.03515106067061424, "grad_norm_var": 1.824436165197233e-05, "learning_rate": 0.00925760451671927, "loss": 2.6278, "step": 5301 }, { "crossentropy": 2.6885673999786377, "epoch": 0.1922128770301624, "grad_norm": 0.03550608456134796, "grad_norm_var": 1.8190060378461827e-05, "learning_rate": 0.009257299811671827, "loss": 2.6901, "step": 5302 }, { "crossentropy": 2.6542415618896484, "epoch": 0.19224912993039442, "grad_norm": 0.03551212325692177, "grad_norm_var": 1.781404217458208e-05, "learning_rate": 0.009256995049123086, "loss": 2.6787, "step": 5303 }, { "crossentropy": 2.6320576667785645, "epoch": 0.19228538283062646, "grad_norm": 0.03951505944132805, "grad_norm_var": 1.6763235313049274e-05, "learning_rate": 0.009256690229077167, "loss": 2.6745, "step": 5304 }, { "crossentropy": 2.8276379108428955, "epoch": 0.19232163573085848, "grad_norm": 0.04450839012861252, "grad_norm_var": 1.9256740072615997e-05, "learning_rate": 0.009256385351538188, "loss": 2.7491, "step": 5305 }, { "crossentropy": 2.814030170440674, "epoch": 0.1923578886310905, "grad_norm": 0.04587479308247566, "grad_norm_var": 2.2735578101817402e-05, "learning_rate": 0.009256080416510264, "loss": 2.6713, "step": 5306 }, { "crossentropy": 2.644554853439331, "epoch": 0.1923941415313225, "grad_norm": 0.044117435812950134, "grad_norm_var": 1.9630360305322206e-05, "learning_rate": 0.009255775423997517, "loss": 2.6765, "step": 5307 }, { "crossentropy": 2.4465160369873047, "epoch": 0.19243039443155452, "grad_norm": 0.038910552859306335, "grad_norm_var": 1.3901822964476618e-05, "learning_rate": 0.009255470374004062, "loss": 2.5667, "step": 5308 }, { "crossentropy": 2.769296169281006, "epoch": 0.19246664733178653, "grad_norm": 0.03521353378891945, "grad_norm_var": 1.438020121175502e-05, "learning_rate": 0.009255165266534025, "loss": 2.72, "step": 5309 }, { "crossentropy": 2.610222578048706, "epoch": 0.19250290023201855, "grad_norm": 0.0374104306101799, "grad_norm_var": 1.4386948778353538e-05, "learning_rate": 0.009254860101591522, "loss": 2.6547, "step": 5310 }, { "crossentropy": 2.565819025039673, "epoch": 0.1925391531322506, "grad_norm": 0.04058915004134178, "grad_norm_var": 1.3975146169288983e-05, "learning_rate": 0.009254554879180676, "loss": 2.6153, "step": 5311 }, { "crossentropy": 2.6419217586517334, "epoch": 0.1925754060324826, "grad_norm": 0.03591455891728401, "grad_norm_var": 1.372191696331233e-05, "learning_rate": 0.00925424959930561, "loss": 2.6692, "step": 5312 }, { "crossentropy": 2.5651657581329346, "epoch": 0.19261165893271462, "grad_norm": 0.035562459379434586, "grad_norm_var": 1.395178236041113e-05, "learning_rate": 0.009253944261970445, "loss": 2.6288, "step": 5313 }, { "crossentropy": 2.5971791744232178, "epoch": 0.19264791183294663, "grad_norm": 0.04486790671944618, "grad_norm_var": 1.66920195501942e-05, "learning_rate": 0.00925363886717931, "loss": 2.7014, "step": 5314 }, { "crossentropy": 2.7610535621643066, "epoch": 0.19268416473317865, "grad_norm": 0.03394986689090729, "grad_norm_var": 1.7201330853661562e-05, "learning_rate": 0.009253333414936327, "loss": 2.7713, "step": 5315 }, { "crossentropy": 2.5524895191192627, "epoch": 0.19272041763341066, "grad_norm": 0.03623693063855171, "grad_norm_var": 1.665910478368763e-05, "learning_rate": 0.009253027905245622, "loss": 2.5972, "step": 5316 }, { "crossentropy": 2.710078239440918, "epoch": 0.1927566705336427, "grad_norm": 0.035199761390686035, "grad_norm_var": 1.6636354197392158e-05, "learning_rate": 0.00925272233811132, "loss": 2.662, "step": 5317 }, { "crossentropy": 2.730222225189209, "epoch": 0.19279292343387472, "grad_norm": 0.036112744361162186, "grad_norm_var": 1.6402579177985516e-05, "learning_rate": 0.009252416713537548, "loss": 2.7077, "step": 5318 }, { "crossentropy": 2.807253360748291, "epoch": 0.19282917633410673, "grad_norm": 0.03913875296711922, "grad_norm_var": 1.5674170528695344e-05, "learning_rate": 0.009252111031528435, "loss": 2.728, "step": 5319 }, { "crossentropy": 2.840446949005127, "epoch": 0.19286542923433875, "grad_norm": 0.04321572929620743, "grad_norm_var": 1.681131384021709e-05, "learning_rate": 0.00925180529208811, "loss": 2.7229, "step": 5320 }, { "crossentropy": 2.5138800144195557, "epoch": 0.19290168213457076, "grad_norm": 0.04352573677897453, "grad_norm_var": 1.6173069475303863e-05, "learning_rate": 0.009251499495220703, "loss": 2.5939, "step": 5321 }, { "crossentropy": 2.5939252376556396, "epoch": 0.19293793503480278, "grad_norm": 0.03995021432638168, "grad_norm_var": 1.3027019160354903e-05, "learning_rate": 0.009251193640930345, "loss": 2.6506, "step": 5322 }, { "crossentropy": 2.6624650955200195, "epoch": 0.1929741879350348, "grad_norm": 0.0361519530415535, "grad_norm_var": 1.1286422612385935e-05, "learning_rate": 0.009250887729221162, "loss": 2.755, "step": 5323 }, { "crossentropy": 2.580122232437134, "epoch": 0.19301044083526683, "grad_norm": 0.035894084721803665, "grad_norm_var": 1.1588193787320558e-05, "learning_rate": 0.009250581760097291, "loss": 2.5895, "step": 5324 }, { "crossentropy": 2.4735212326049805, "epoch": 0.19304669373549885, "grad_norm": 0.03372927010059357, "grad_norm_var": 1.2288880671454798e-05, "learning_rate": 0.009250275733562864, "loss": 2.574, "step": 5325 }, { "crossentropy": 2.7426435947418213, "epoch": 0.19308294663573086, "grad_norm": 0.03321637958288193, "grad_norm_var": 1.3698712431465105e-05, "learning_rate": 0.009249969649622013, "loss": 2.6928, "step": 5326 }, { "crossentropy": 2.681368827819824, "epoch": 0.19311919953596288, "grad_norm": 0.03427153080701828, "grad_norm_var": 1.3762480518937174e-05, "learning_rate": 0.009249663508278869, "loss": 2.688, "step": 5327 }, { "crossentropy": 2.8321053981781006, "epoch": 0.1931554524361949, "grad_norm": 0.03712576627731323, "grad_norm_var": 1.3629036938630061e-05, "learning_rate": 0.009249357309537574, "loss": 2.7757, "step": 5328 }, { "crossentropy": 2.6980342864990234, "epoch": 0.1931917053364269, "grad_norm": 0.03521135449409485, "grad_norm_var": 1.3722030063165885e-05, "learning_rate": 0.009249051053402258, "loss": 2.6596, "step": 5329 }, { "crossentropy": 2.8178303241729736, "epoch": 0.19322795823665892, "grad_norm": 0.03832386061549187, "grad_norm_var": 9.849690068684339e-06, "learning_rate": 0.00924874473987706, "loss": 2.7593, "step": 5330 }, { "crossentropy": 2.7390365600585938, "epoch": 0.19326421113689096, "grad_norm": 0.039388999342918396, "grad_norm_var": 9.520505947311723e-06, "learning_rate": 0.009248438368966116, "loss": 2.6933, "step": 5331 }, { "crossentropy": 2.4963245391845703, "epoch": 0.19330046403712298, "grad_norm": 0.03601552173495293, "grad_norm_var": 9.55475558968924e-06, "learning_rate": 0.009248131940673565, "loss": 2.5947, "step": 5332 }, { "crossentropy": 2.7655367851257324, "epoch": 0.193336716937355, "grad_norm": 0.03810044750571251, "grad_norm_var": 9.276281704050961e-06, "learning_rate": 0.009247825455003545, "loss": 2.6674, "step": 5333 }, { "crossentropy": 2.59415602684021, "epoch": 0.193372969837587, "grad_norm": 0.037616610527038574, "grad_norm_var": 9.147332199227476e-06, "learning_rate": 0.009247518911960196, "loss": 2.5883, "step": 5334 }, { "crossentropy": 2.7195489406585693, "epoch": 0.19340922273781902, "grad_norm": 0.03627156838774681, "grad_norm_var": 9.05558346325011e-06, "learning_rate": 0.009247212311547659, "loss": 2.7347, "step": 5335 }, { "crossentropy": 2.664947748184204, "epoch": 0.19344547563805103, "grad_norm": 0.035675786435604095, "grad_norm_var": 6.737486481393079e-06, "learning_rate": 0.009246905653770074, "loss": 2.6922, "step": 5336 }, { "crossentropy": 2.723323106765747, "epoch": 0.19348172853828308, "grad_norm": 0.03591747581958771, "grad_norm_var": 3.6383408486263698e-06, "learning_rate": 0.009246598938631582, "loss": 2.6585, "step": 5337 }, { "crossentropy": 2.6166887283325195, "epoch": 0.1935179814385151, "grad_norm": 0.039965469390153885, "grad_norm_var": 3.6455179771038386e-06, "learning_rate": 0.009246292166136328, "loss": 2.6351, "step": 5338 }, { "crossentropy": 2.5854671001434326, "epoch": 0.1935542343387471, "grad_norm": 0.03712428733706474, "grad_norm_var": 3.6685920946345647e-06, "learning_rate": 0.009245985336288455, "loss": 2.6069, "step": 5339 }, { "crossentropy": 2.7200915813446045, "epoch": 0.19359048723897912, "grad_norm": 0.03626880794763565, "grad_norm_var": 3.6475681439468445e-06, "learning_rate": 0.009245678449092106, "loss": 2.7141, "step": 5340 }, { "crossentropy": 2.6352908611297607, "epoch": 0.19362674013921113, "grad_norm": 0.03746175020933151, "grad_norm_var": 3.132448132392209e-06, "learning_rate": 0.009245371504551425, "loss": 2.704, "step": 5341 }, { "crossentropy": 2.741953134536743, "epoch": 0.19366299303944315, "grad_norm": 0.03733837604522705, "grad_norm_var": 2.25382501998402e-06, "learning_rate": 0.009245064502670562, "loss": 2.7551, "step": 5342 }, { "crossentropy": 2.7682628631591797, "epoch": 0.19369924593967516, "grad_norm": 0.03579055145382881, "grad_norm_var": 1.844443068026574e-06, "learning_rate": 0.00924475744345366, "loss": 2.7642, "step": 5343 }, { "crossentropy": 2.7581114768981934, "epoch": 0.1937354988399072, "grad_norm": 0.03326534852385521, "grad_norm_var": 2.762498865686164e-06, "learning_rate": 0.009244450326904866, "loss": 2.7122, "step": 5344 }, { "crossentropy": 2.7169618606567383, "epoch": 0.19377175174013922, "grad_norm": 0.034890662878751755, "grad_norm_var": 2.839357237225705e-06, "learning_rate": 0.009244143153028328, "loss": 2.7426, "step": 5345 }, { "crossentropy": 2.594886064529419, "epoch": 0.19380800464037123, "grad_norm": 0.034094106405973434, "grad_norm_var": 3.1198220479153675e-06, "learning_rate": 0.0092438359218282, "loss": 2.6528, "step": 5346 }, { "crossentropy": 2.647660255432129, "epoch": 0.19384425754060325, "grad_norm": 0.033346425741910934, "grad_norm_var": 3.1339758936673363e-06, "learning_rate": 0.009243528633308625, "loss": 2.6727, "step": 5347 }, { "crossentropy": 2.5887722969055176, "epoch": 0.19388051044083526, "grad_norm": 0.035021498799324036, "grad_norm_var": 3.219710542525361e-06, "learning_rate": 0.009243221287473756, "loss": 2.639, "step": 5348 }, { "crossentropy": 2.529721975326538, "epoch": 0.19391676334106728, "grad_norm": 0.03539230301976204, "grad_norm_var": 2.968148675876597e-06, "learning_rate": 0.009242913884327743, "loss": 2.5855, "step": 5349 }, { "crossentropy": 2.5510666370391846, "epoch": 0.1939530162412993, "grad_norm": 0.03465826436877251, "grad_norm_var": 2.863690870284069e-06, "learning_rate": 0.009242606423874741, "loss": 2.641, "step": 5350 }, { "crossentropy": 2.7713356018066406, "epoch": 0.19398926914153133, "grad_norm": 0.03417584300041199, "grad_norm_var": 3.000882799318536e-06, "learning_rate": 0.0092422989061189, "loss": 2.6928, "step": 5351 }, { "crossentropy": 2.8814260959625244, "epoch": 0.19402552204176335, "grad_norm": 0.03485717251896858, "grad_norm_var": 3.0398623205731e-06, "learning_rate": 0.009241991331064374, "loss": 2.822, "step": 5352 }, { "crossentropy": 2.697542190551758, "epoch": 0.19406177494199536, "grad_norm": 0.03449545055627823, "grad_norm_var": 3.105677439716829e-06, "learning_rate": 0.00924168369871532, "loss": 2.6922, "step": 5353 }, { "crossentropy": 2.735398769378662, "epoch": 0.19409802784222738, "grad_norm": 0.034889694303274155, "grad_norm_var": 1.6999889501043604e-06, "learning_rate": 0.009241376009075888, "loss": 2.7203, "step": 5354 }, { "crossentropy": 2.6572391986846924, "epoch": 0.1941342807424594, "grad_norm": 0.0368877612054348, "grad_norm_var": 1.6425444234799876e-06, "learning_rate": 0.009241068262150239, "loss": 2.6895, "step": 5355 }, { "crossentropy": 2.698317527770996, "epoch": 0.1941705336426914, "grad_norm": 0.03567711263895035, "grad_norm_var": 1.5783001420186422e-06, "learning_rate": 0.009240760457942525, "loss": 2.7203, "step": 5356 }, { "crossentropy": 2.5517356395721436, "epoch": 0.19420678654292342, "grad_norm": 0.03414114937186241, "grad_norm_var": 1.2395663313605545e-06, "learning_rate": 0.009240452596456906, "loss": 2.654, "step": 5357 }, { "crossentropy": 2.6788535118103027, "epoch": 0.19424303944315546, "grad_norm": 0.03844648227095604, "grad_norm_var": 1.6717563169915221e-06, "learning_rate": 0.00924014467769754, "loss": 2.7025, "step": 5358 }, { "crossentropy": 2.7361342906951904, "epoch": 0.19427929234338748, "grad_norm": 0.039815615862607956, "grad_norm_var": 3.107596722842776e-06, "learning_rate": 0.009239836701668584, "loss": 2.6907, "step": 5359 }, { "crossentropy": 2.7477774620056152, "epoch": 0.1943155452436195, "grad_norm": 0.04079051315784454, "grad_norm_var": 4.652100371615025e-06, "learning_rate": 0.0092395286683742, "loss": 2.7121, "step": 5360 }, { "crossentropy": 2.6372110843658447, "epoch": 0.1943517981438515, "grad_norm": 0.03649438917636871, "grad_norm_var": 4.634706585939184e-06, "learning_rate": 0.00923922057781855, "loss": 2.5647, "step": 5361 }, { "crossentropy": 2.838907241821289, "epoch": 0.19438805104408352, "grad_norm": 0.038622479885816574, "grad_norm_var": 4.871869625862566e-06, "learning_rate": 0.00923891243000579, "loss": 2.7347, "step": 5362 }, { "crossentropy": 2.890634298324585, "epoch": 0.19442430394431554, "grad_norm": 0.04205317422747612, "grad_norm_var": 6.405075041545036e-06, "learning_rate": 0.009238604224940085, "loss": 2.8562, "step": 5363 }, { "crossentropy": 2.5993874073028564, "epoch": 0.19446055684454758, "grad_norm": 0.043626751750707626, "grad_norm_var": 9.163379413457534e-06, "learning_rate": 0.009238295962625598, "loss": 2.5939, "step": 5364 }, { "crossentropy": 2.5672922134399414, "epoch": 0.1944968097447796, "grad_norm": 0.053154006600379944, "grad_norm_var": 2.462575170059933e-05, "learning_rate": 0.009237987643066492, "loss": 2.5638, "step": 5365 }, { "crossentropy": 2.611546277999878, "epoch": 0.1945330626450116, "grad_norm": 0.03919080272316933, "grad_norm_var": 2.3709438890053233e-05, "learning_rate": 0.009237679266266932, "loss": 2.7242, "step": 5366 }, { "crossentropy": 2.6192641258239746, "epoch": 0.19456931554524362, "grad_norm": 0.03838218376040459, "grad_norm_var": 2.2343872952335492e-05, "learning_rate": 0.009237370832231083, "loss": 2.5998, "step": 5367 }, { "crossentropy": 2.7478396892547607, "epoch": 0.19460556844547564, "grad_norm": 0.03918115794658661, "grad_norm_var": 2.1213147556062294e-05, "learning_rate": 0.00923706234096311, "loss": 2.6891, "step": 5368 }, { "crossentropy": 2.5575387477874756, "epoch": 0.19464182134570765, "grad_norm": 0.034702129662036896, "grad_norm_var": 2.108850038074021e-05, "learning_rate": 0.00923675379246718, "loss": 2.4942, "step": 5369 }, { "crossentropy": 2.6725423336029053, "epoch": 0.19467807424593966, "grad_norm": 0.033542584627866745, "grad_norm_var": 2.1963264214880375e-05, "learning_rate": 0.00923644518674746, "loss": 2.5935, "step": 5370 }, { "crossentropy": 2.686052083969116, "epoch": 0.1947143271461717, "grad_norm": 0.036360058933496475, "grad_norm_var": 2.2132401071898316e-05, "learning_rate": 0.00923613652380812, "loss": 2.7348, "step": 5371 }, { "crossentropy": 2.6515283584594727, "epoch": 0.19475058004640372, "grad_norm": 0.036355022341012955, "grad_norm_var": 2.185975444659125e-05, "learning_rate": 0.009235827803653326, "loss": 2.6401, "step": 5372 }, { "crossentropy": 2.6560254096984863, "epoch": 0.19478683294663574, "grad_norm": 0.04006272181868553, "grad_norm_var": 2.0172682715095292e-05, "learning_rate": 0.009235519026287251, "loss": 2.6881, "step": 5373 }, { "crossentropy": 2.613097667694092, "epoch": 0.19482308584686775, "grad_norm": 0.03831915929913521, "grad_norm_var": 2.0190286474802642e-05, "learning_rate": 0.009235210191714064, "loss": 2.7001, "step": 5374 }, { "crossentropy": 2.773994207382202, "epoch": 0.19485933874709976, "grad_norm": 0.03627670183777809, "grad_norm_var": 2.0784374387118552e-05, "learning_rate": 0.009234901299937936, "loss": 2.7585, "step": 5375 }, { "crossentropy": 2.7066917419433594, "epoch": 0.19489559164733178, "grad_norm": 0.03596866503357887, "grad_norm_var": 2.121148919981252e-05, "learning_rate": 0.009234592350963038, "loss": 2.7006, "step": 5376 }, { "crossentropy": 2.643989086151123, "epoch": 0.1949318445475638, "grad_norm": 0.03905964270234108, "grad_norm_var": 2.08022807834079e-05, "learning_rate": 0.009234283344793546, "loss": 2.6804, "step": 5377 }, { "crossentropy": 2.6998167037963867, "epoch": 0.19496809744779584, "grad_norm": 0.04394920542836189, "grad_norm_var": 2.226947746670384e-05, "learning_rate": 0.00923397428143363, "loss": 2.6763, "step": 5378 }, { "crossentropy": 2.605931520462036, "epoch": 0.19500435034802785, "grad_norm": 0.046064458787441254, "grad_norm_var": 2.470136730604496e-05, "learning_rate": 0.009233665160887465, "loss": 2.6237, "step": 5379 }, { "crossentropy": 2.7647244930267334, "epoch": 0.19504060324825986, "grad_norm": 0.04231216385960579, "grad_norm_var": 2.4110094532486573e-05, "learning_rate": 0.009233355983159231, "loss": 2.6839, "step": 5380 }, { "crossentropy": 2.6585628986358643, "epoch": 0.19507685614849188, "grad_norm": 0.04239378124475479, "grad_norm_var": 1.1836107216645076e-05, "learning_rate": 0.009233046748253098, "loss": 2.6854, "step": 5381 }, { "crossentropy": 2.697066307067871, "epoch": 0.1951131090487239, "grad_norm": 0.036458779126405716, "grad_norm_var": 1.2190308917250924e-05, "learning_rate": 0.009232737456173243, "loss": 2.6491, "step": 5382 }, { "crossentropy": 2.6624062061309814, "epoch": 0.1951493619489559, "grad_norm": 0.03851402923464775, "grad_norm_var": 1.2185601335933567e-05, "learning_rate": 0.009232428106923847, "loss": 2.6399, "step": 5383 }, { "crossentropy": 2.7347023487091064, "epoch": 0.19518561484918792, "grad_norm": 0.03957301750779152, "grad_norm_var": 1.2219292153897663e-05, "learning_rate": 0.009232118700509087, "loss": 2.7016, "step": 5384 }, { "crossentropy": 2.595956325531006, "epoch": 0.19522186774941996, "grad_norm": 0.039538659155368805, "grad_norm_var": 1.1074482573702044e-05, "learning_rate": 0.00923180923693314, "loss": 2.5705, "step": 5385 }, { "crossentropy": 2.7289652824401855, "epoch": 0.19525812064965198, "grad_norm": 0.035245370119810104, "grad_norm_var": 1.0006035680149884e-05, "learning_rate": 0.009231499716200188, "loss": 2.7231, "step": 5386 }, { "crossentropy": 2.710076093673706, "epoch": 0.195294373549884, "grad_norm": 0.03484013304114342, "grad_norm_var": 1.0716473585073381e-05, "learning_rate": 0.009231190138314412, "loss": 2.7021, "step": 5387 }, { "crossentropy": 2.6243033409118652, "epoch": 0.195330626450116, "grad_norm": 0.035160716623067856, "grad_norm_var": 1.1236080631260987e-05, "learning_rate": 0.009230880503279991, "loss": 2.5967, "step": 5388 }, { "crossentropy": 2.6381282806396484, "epoch": 0.19536687935034802, "grad_norm": 0.03630911186337471, "grad_norm_var": 1.157658718081844e-05, "learning_rate": 0.009230570811101109, "loss": 2.7301, "step": 5389 }, { "crossentropy": 2.716825485229492, "epoch": 0.19540313225058004, "grad_norm": 0.03506045788526535, "grad_norm_var": 1.2427035110524713e-05, "learning_rate": 0.009230261061781946, "loss": 2.7307, "step": 5390 }, { "crossentropy": 2.694234848022461, "epoch": 0.19543938515081208, "grad_norm": 0.03519994765520096, "grad_norm_var": 1.2825194763460962e-05, "learning_rate": 0.009229951255326689, "loss": 2.7108, "step": 5391 }, { "crossentropy": 2.639690399169922, "epoch": 0.1954756380510441, "grad_norm": 0.03307957574725151, "grad_norm_var": 1.431350112527461e-05, "learning_rate": 0.009229641391739521, "loss": 2.6678, "step": 5392 }, { "crossentropy": 2.5712690353393555, "epoch": 0.1955118909512761, "grad_norm": 0.03730100765824318, "grad_norm_var": 1.4328076246634438e-05, "learning_rate": 0.009229331471024628, "loss": 2.6544, "step": 5393 }, { "crossentropy": 2.7566874027252197, "epoch": 0.19554814385150812, "grad_norm": 0.04349571466445923, "grad_norm_var": 1.3992547155635091e-05, "learning_rate": 0.009229021493186195, "loss": 2.7481, "step": 5394 }, { "crossentropy": 2.7027859687805176, "epoch": 0.19558439675174014, "grad_norm": 0.04502427205443382, "grad_norm_var": 1.2963776333957805e-05, "learning_rate": 0.009228711458228409, "loss": 2.6017, "step": 5395 }, { "crossentropy": 2.8476579189300537, "epoch": 0.19562064965197215, "grad_norm": 0.03872809186577797, "grad_norm_var": 1.1750945953446258e-05, "learning_rate": 0.009228401366155457, "loss": 2.7794, "step": 5396 }, { "crossentropy": 2.7480568885803223, "epoch": 0.19565690255220416, "grad_norm": 0.03601761534810066, "grad_norm_var": 1.044613850967116e-05, "learning_rate": 0.009228091216971528, "loss": 2.7716, "step": 5397 }, { "crossentropy": 2.6908960342407227, "epoch": 0.1956931554524362, "grad_norm": 0.03598354011774063, "grad_norm_var": 1.052443542778238e-05, "learning_rate": 0.00922778101068081, "loss": 2.5808, "step": 5398 }, { "crossentropy": 2.6867783069610596, "epoch": 0.19572940835266822, "grad_norm": 0.03520171344280243, "grad_norm_var": 1.0736676512410985e-05, "learning_rate": 0.009227470747287494, "loss": 2.6942, "step": 5399 }, { "crossentropy": 2.69488787651062, "epoch": 0.19576566125290024, "grad_norm": 0.034783001989126205, "grad_norm_var": 1.0677431245990215e-05, "learning_rate": 0.00922716042679577, "loss": 2.7066, "step": 5400 }, { "crossentropy": 2.7002220153808594, "epoch": 0.19580191415313225, "grad_norm": 0.03471025452017784, "grad_norm_var": 1.0458681078505336e-05, "learning_rate": 0.009226850049209829, "loss": 2.6615, "step": 5401 }, { "crossentropy": 2.8298864364624023, "epoch": 0.19583816705336426, "grad_norm": 0.03733593598008156, "grad_norm_var": 1.0344826096511604e-05, "learning_rate": 0.009226539614533865, "loss": 2.6428, "step": 5402 }, { "crossentropy": 2.6881203651428223, "epoch": 0.19587441995359628, "grad_norm": 0.04110679030418396, "grad_norm_var": 1.1191397582515962e-05, "learning_rate": 0.00922622912277207, "loss": 2.7279, "step": 5403 }, { "crossentropy": 2.780643939971924, "epoch": 0.1959106728538283, "grad_norm": 0.04079107567667961, "grad_norm_var": 1.167473627871595e-05, "learning_rate": 0.009225918573928635, "loss": 2.7248, "step": 5404 }, { "crossentropy": 2.76785945892334, "epoch": 0.19594692575406034, "grad_norm": 0.037137530744075775, "grad_norm_var": 1.158520369763192e-05, "learning_rate": 0.00922560796800776, "loss": 2.637, "step": 5405 }, { "crossentropy": 2.5221645832061768, "epoch": 0.19598317865429235, "grad_norm": 0.03652258217334747, "grad_norm_var": 1.1231573496958171e-05, "learning_rate": 0.009225297305013633, "loss": 2.6156, "step": 5406 }, { "crossentropy": 2.792351484298706, "epoch": 0.19601943155452436, "grad_norm": 0.038194917142391205, "grad_norm_var": 1.0813345567605287e-05, "learning_rate": 0.009224986584950454, "loss": 2.8306, "step": 5407 }, { "crossentropy": 2.667731285095215, "epoch": 0.19605568445475638, "grad_norm": 0.03952885791659355, "grad_norm_var": 9.320831663166679e-06, "learning_rate": 0.009224675807822422, "loss": 2.6583, "step": 5408 }, { "crossentropy": 2.7071690559387207, "epoch": 0.1960919373549884, "grad_norm": 0.0407513827085495, "grad_norm_var": 9.632257736753394e-06, "learning_rate": 0.009224364973633731, "loss": 2.6876, "step": 5409 }, { "crossentropy": 2.793308734893799, "epoch": 0.1961281902552204, "grad_norm": 0.04374276474118233, "grad_norm_var": 9.80204502982005e-06, "learning_rate": 0.00922405408238858, "loss": 2.7441, "step": 5410 }, { "crossentropy": 2.6631381511688232, "epoch": 0.19616444315545242, "grad_norm": 0.046410996466875076, "grad_norm_var": 1.1133629341776065e-05, "learning_rate": 0.00922374313409117, "loss": 2.7381, "step": 5411 }, { "crossentropy": 2.7601158618927, "epoch": 0.19620069605568446, "grad_norm": 0.0431734137237072, "grad_norm_var": 1.2468794084579254e-05, "learning_rate": 0.009223432128745697, "loss": 2.7796, "step": 5412 }, { "crossentropy": 2.604675054550171, "epoch": 0.19623694895591648, "grad_norm": 0.039059750735759735, "grad_norm_var": 1.1903603051560453e-05, "learning_rate": 0.009223121066356364, "loss": 2.6816, "step": 5413 }, { "crossentropy": 2.739851951599121, "epoch": 0.1962732018561485, "grad_norm": 0.03470814973115921, "grad_norm_var": 1.2522840097950229e-05, "learning_rate": 0.009222809946927374, "loss": 2.7339, "step": 5414 }, { "crossentropy": 2.6050167083740234, "epoch": 0.1963094547563805, "grad_norm": 0.04154731333255768, "grad_norm_var": 1.1870316575584896e-05, "learning_rate": 0.009222498770462925, "loss": 2.6581, "step": 5415 }, { "crossentropy": 2.692063570022583, "epoch": 0.19634570765661252, "grad_norm": 0.03964385762810707, "grad_norm_var": 1.0390985289098394e-05, "learning_rate": 0.009222187536967224, "loss": 2.6713, "step": 5416 }, { "crossentropy": 2.6625185012817383, "epoch": 0.19638196055684454, "grad_norm": 0.03865658864378929, "grad_norm_var": 8.766279685778107e-06, "learning_rate": 0.009221876246444471, "loss": 2.6945, "step": 5417 }, { "crossentropy": 2.629188299179077, "epoch": 0.19641821345707658, "grad_norm": 0.03880609571933746, "grad_norm_var": 8.399833379230983e-06, "learning_rate": 0.009221564898898874, "loss": 2.6678, "step": 5418 }, { "crossentropy": 2.8963782787323, "epoch": 0.1964544663573086, "grad_norm": 0.03932131081819534, "grad_norm_var": 8.332349971893783e-06, "learning_rate": 0.009221253494334636, "loss": 2.8701, "step": 5419 }, { "crossentropy": 2.6307740211486816, "epoch": 0.1964907192575406, "grad_norm": 0.03449402377009392, "grad_norm_var": 1.0041331362042825e-05, "learning_rate": 0.009220942032755964, "loss": 2.6222, "step": 5420 }, { "crossentropy": 2.538219451904297, "epoch": 0.19652697215777262, "grad_norm": 0.03279053419828415, "grad_norm_var": 1.2580756840379198e-05, "learning_rate": 0.009220630514167064, "loss": 2.5797, "step": 5421 }, { "crossentropy": 2.7193050384521484, "epoch": 0.19656322505800464, "grad_norm": 0.03302538022398949, "grad_norm_var": 1.4598066472070366e-05, "learning_rate": 0.009220318938572144, "loss": 2.6551, "step": 5422 }, { "crossentropy": 2.655470609664917, "epoch": 0.19659947795823665, "grad_norm": 0.03610983490943909, "grad_norm_var": 1.5091097718555647e-05, "learning_rate": 0.009220007305975412, "loss": 2.6606, "step": 5423 }, { "crossentropy": 2.7715210914611816, "epoch": 0.19663573085846867, "grad_norm": 0.03383580222725868, "grad_norm_var": 1.6609551727263712e-05, "learning_rate": 0.009219695616381076, "loss": 2.6691, "step": 5424 }, { "crossentropy": 2.7687976360321045, "epoch": 0.1966719837587007, "grad_norm": 0.03470546752214432, "grad_norm_var": 1.7083120209624575e-05, "learning_rate": 0.00921938386979335, "loss": 2.7355, "step": 5425 }, { "crossentropy": 2.8569114208221436, "epoch": 0.19670823665893272, "grad_norm": 0.033912837505340576, "grad_norm_var": 1.5761937260607742e-05, "learning_rate": 0.00921907206621644, "loss": 2.7568, "step": 5426 }, { "crossentropy": 2.5756421089172363, "epoch": 0.19674448955916474, "grad_norm": 0.032470326870679855, "grad_norm_var": 1.1368353504184226e-05, "learning_rate": 0.009218760205654559, "loss": 2.6276, "step": 5427 }, { "crossentropy": 2.573302984237671, "epoch": 0.19678074245939675, "grad_norm": 0.03451358899474144, "grad_norm_var": 8.513119333345732e-06, "learning_rate": 0.00921844828811192, "loss": 2.5725, "step": 5428 }, { "crossentropy": 2.6772396564483643, "epoch": 0.19681699535962877, "grad_norm": 0.034015994518995285, "grad_norm_var": 8.112688074493612e-06, "learning_rate": 0.009218136313592735, "loss": 2.657, "step": 5429 }, { "crossentropy": 2.662524700164795, "epoch": 0.19685324825986078, "grad_norm": 0.032736409455537796, "grad_norm_var": 8.638728054389567e-06, "learning_rate": 0.009217824282101217, "loss": 2.6893, "step": 5430 }, { "crossentropy": 2.5001139640808105, "epoch": 0.1968895011600928, "grad_norm": 0.03256477415561676, "grad_norm_var": 6.632432780329968e-06, "learning_rate": 0.009217512193641583, "loss": 2.6301, "step": 5431 }, { "crossentropy": 2.714660882949829, "epoch": 0.19692575406032484, "grad_norm": 0.035414792597293854, "grad_norm_var": 5.188175102088799e-06, "learning_rate": 0.009217200048218044, "loss": 2.8126, "step": 5432 }, { "crossentropy": 2.633516311645508, "epoch": 0.19696200696055685, "grad_norm": 0.03515150398015976, "grad_norm_var": 4.170429337428151e-06, "learning_rate": 0.00921688784583482, "loss": 2.6847, "step": 5433 }, { "crossentropy": 2.774122953414917, "epoch": 0.19699825986078887, "grad_norm": 0.03750484809279442, "grad_norm_var": 3.5494143637568773e-06, "learning_rate": 0.009216575586496125, "loss": 2.665, "step": 5434 }, { "crossentropy": 2.653688669204712, "epoch": 0.19703451276102088, "grad_norm": 0.03944786638021469, "grad_norm_var": 3.6311721148316507e-06, "learning_rate": 0.00921626327020618, "loss": 2.7289, "step": 5435 }, { "crossentropy": 2.8965084552764893, "epoch": 0.1970707656612529, "grad_norm": 0.038630835711956024, "grad_norm_var": 4.673527532237688e-06, "learning_rate": 0.009215950896969199, "loss": 2.8193, "step": 5436 }, { "crossentropy": 2.614293336868286, "epoch": 0.1971070185614849, "grad_norm": 0.03690755367279053, "grad_norm_var": 4.628768527736339e-06, "learning_rate": 0.009215638466789404, "loss": 2.5771, "step": 5437 }, { "crossentropy": 2.5264997482299805, "epoch": 0.19714327146171692, "grad_norm": 0.033635225147008896, "grad_norm_var": 4.48663453128887e-06, "learning_rate": 0.009215325979671014, "loss": 2.6673, "step": 5438 }, { "crossentropy": 2.620574712753296, "epoch": 0.19717952436194897, "grad_norm": 0.035082388669252396, "grad_norm_var": 4.413909745829928e-06, "learning_rate": 0.009215013435618251, "loss": 2.6797, "step": 5439 }, { "crossentropy": 2.6159632205963135, "epoch": 0.19721577726218098, "grad_norm": 0.03604838624596596, "grad_norm_var": 4.3666526438945755e-06, "learning_rate": 0.009214700834635334, "loss": 2.663, "step": 5440 }, { "crossentropy": 2.662278890609741, "epoch": 0.197252030162413, "grad_norm": 0.03857056796550751, "grad_norm_var": 5.060210588603055e-06, "learning_rate": 0.009214388176726486, "loss": 2.6229, "step": 5441 }, { "crossentropy": 2.563988208770752, "epoch": 0.197288283062645, "grad_norm": 0.03785168007016182, "grad_norm_var": 5.242015067312827e-06, "learning_rate": 0.00921407546189593, "loss": 2.5472, "step": 5442 }, { "crossentropy": 2.8564116954803467, "epoch": 0.19732453596287702, "grad_norm": 0.037322040647268295, "grad_norm_var": 4.6503623481579265e-06, "learning_rate": 0.00921376269014789, "loss": 2.7903, "step": 5443 }, { "crossentropy": 2.6453585624694824, "epoch": 0.19736078886310904, "grad_norm": 0.03745167702436447, "grad_norm_var": 4.622318979374586e-06, "learning_rate": 0.00921344986148659, "loss": 2.659, "step": 5444 }, { "crossentropy": 2.6687307357788086, "epoch": 0.19739704176334108, "grad_norm": 0.03971666842699051, "grad_norm_var": 5.034402752314831e-06, "learning_rate": 0.009213136975916256, "loss": 2.6633, "step": 5445 }, { "crossentropy": 2.6284842491149902, "epoch": 0.1974332946635731, "grad_norm": 0.03636830672621727, "grad_norm_var": 4.035163776721674e-06, "learning_rate": 0.009212824033441114, "loss": 2.633, "step": 5446 }, { "crossentropy": 2.6591873168945312, "epoch": 0.1974695475638051, "grad_norm": 0.034997086971998215, "grad_norm_var": 3.054325872394135e-06, "learning_rate": 0.009212511034065389, "loss": 2.7005, "step": 5447 }, { "crossentropy": 2.5900087356567383, "epoch": 0.19750580046403712, "grad_norm": 0.032910555601119995, "grad_norm_var": 3.93595344716872e-06, "learning_rate": 0.00921219797779331, "loss": 2.5702, "step": 5448 }, { "crossentropy": 2.7484374046325684, "epoch": 0.19754205336426914, "grad_norm": 0.03331395983695984, "grad_norm_var": 4.532461754402686e-06, "learning_rate": 0.009211884864629106, "loss": 2.7415, "step": 5449 }, { "crossentropy": 2.7785346508026123, "epoch": 0.19757830626450115, "grad_norm": 0.03406825289130211, "grad_norm_var": 4.860557556823676e-06, "learning_rate": 0.009211571694577005, "loss": 2.7249, "step": 5450 }, { "crossentropy": 2.6786727905273438, "epoch": 0.19761455916473317, "grad_norm": 0.03539186716079712, "grad_norm_var": 4.2378664975945e-06, "learning_rate": 0.009211258467641236, "loss": 2.6668, "step": 5451 }, { "crossentropy": 2.7099368572235107, "epoch": 0.1976508120649652, "grad_norm": 0.04111799970269203, "grad_norm_var": 5.449945422056288e-06, "learning_rate": 0.009210945183826032, "loss": 2.7302, "step": 5452 }, { "crossentropy": 2.6670193672180176, "epoch": 0.19768706496519722, "grad_norm": 0.043703511357307434, "grad_norm_var": 8.889624523337966e-06, "learning_rate": 0.009210631843135622, "loss": 2.6283, "step": 5453 }, { "crossentropy": 2.6880788803100586, "epoch": 0.19772331786542924, "grad_norm": 0.03551435098052025, "grad_norm_var": 8.33695594657276e-06, "learning_rate": 0.009210318445574238, "loss": 2.6307, "step": 5454 }, { "crossentropy": 2.3748772144317627, "epoch": 0.19775957076566125, "grad_norm": 0.03458426892757416, "grad_norm_var": 8.469152690472834e-06, "learning_rate": 0.009210004991146116, "loss": 2.6093, "step": 5455 }, { "crossentropy": 2.5554940700531006, "epoch": 0.19779582366589327, "grad_norm": 0.033597737550735474, "grad_norm_var": 9.092778776866444e-06, "learning_rate": 0.009209691479855486, "loss": 2.5731, "step": 5456 }, { "crossentropy": 2.7707595825195312, "epoch": 0.19783207656612528, "grad_norm": 0.034545350819826126, "grad_norm_var": 9.077367997750487e-06, "learning_rate": 0.009209377911706585, "loss": 2.7117, "step": 5457 }, { "crossentropy": 2.699347734451294, "epoch": 0.1978683294663573, "grad_norm": 0.03547971323132515, "grad_norm_var": 8.970988939132942e-06, "learning_rate": 0.009209064286703648, "loss": 2.6791, "step": 5458 }, { "crossentropy": 2.657256603240967, "epoch": 0.19790458236658934, "grad_norm": 0.03499948978424072, "grad_norm_var": 8.977759735667202e-06, "learning_rate": 0.009208750604850909, "loss": 2.6804, "step": 5459 }, { "crossentropy": 2.6980552673339844, "epoch": 0.19794083526682135, "grad_norm": 0.0343787707388401, "grad_norm_var": 9.018239174866243e-06, "learning_rate": 0.009208436866152607, "loss": 2.7416, "step": 5460 }, { "crossentropy": 2.7624075412750244, "epoch": 0.19797708816705337, "grad_norm": 0.03658890724182129, "grad_norm_var": 8.045489931680454e-06, "learning_rate": 0.009208123070612978, "loss": 2.7498, "step": 5461 }, { "crossentropy": 2.5981838703155518, "epoch": 0.19801334106728538, "grad_norm": 0.03309305012226105, "grad_norm_var": 8.433925714733798e-06, "learning_rate": 0.00920780921823626, "loss": 2.5659, "step": 5462 }, { "crossentropy": 2.6016523838043213, "epoch": 0.1980495939675174, "grad_norm": 0.03441450744867325, "grad_norm_var": 8.495586066409322e-06, "learning_rate": 0.009207495309026696, "loss": 2.6343, "step": 5463 }, { "crossentropy": 2.7056355476379395, "epoch": 0.1980858468677494, "grad_norm": 0.03760648891329765, "grad_norm_var": 8.264158578896209e-06, "learning_rate": 0.00920718134298852, "loss": 2.6322, "step": 5464 }, { "crossentropy": 2.691758155822754, "epoch": 0.19812209976798145, "grad_norm": 0.03757860139012337, "grad_norm_var": 8.001525687132786e-06, "learning_rate": 0.009206867320125975, "loss": 2.6871, "step": 5465 }, { "crossentropy": 2.7117228507995605, "epoch": 0.19815835266821347, "grad_norm": 0.03386405482888222, "grad_norm_var": 8.057854245855506e-06, "learning_rate": 0.009206553240443305, "loss": 2.5957, "step": 5466 }, { "crossentropy": 2.6271088123321533, "epoch": 0.19819460556844548, "grad_norm": 0.03504057228565216, "grad_norm_var": 8.095394515498372e-06, "learning_rate": 0.009206239103944748, "loss": 2.6708, "step": 5467 }, { "crossentropy": 2.66945481300354, "epoch": 0.1982308584686775, "grad_norm": 0.04179628565907478, "grad_norm_var": 8.586404388708641e-06, "learning_rate": 0.00920592491063455, "loss": 2.7173, "step": 5468 }, { "crossentropy": 2.655337333679199, "epoch": 0.1982671113689095, "grad_norm": 0.043565455824136734, "grad_norm_var": 8.446697821876994e-06, "learning_rate": 0.009205610660516953, "loss": 2.7633, "step": 5469 }, { "crossentropy": 2.5313620567321777, "epoch": 0.19830336426914152, "grad_norm": 0.03934718295931816, "grad_norm_var": 9.095987537199347e-06, "learning_rate": 0.009205296353596203, "loss": 2.5649, "step": 5470 }, { "crossentropy": 2.479578971862793, "epoch": 0.19833961716937354, "grad_norm": 0.03813574090600014, "grad_norm_var": 9.08130530673327e-06, "learning_rate": 0.009204981989876541, "loss": 2.5913, "step": 5471 }, { "crossentropy": 2.8033268451690674, "epoch": 0.19837587006960558, "grad_norm": 0.03478936105966568, "grad_norm_var": 8.70861579365829e-06, "learning_rate": 0.009204667569362217, "loss": 2.6776, "step": 5472 }, { "crossentropy": 2.649770975112915, "epoch": 0.1984121229698376, "grad_norm": 0.036233868449926376, "grad_norm_var": 8.429531438492825e-06, "learning_rate": 0.009204353092057478, "loss": 2.7211, "step": 5473 }, { "crossentropy": 2.699059247970581, "epoch": 0.1984483758700696, "grad_norm": 0.03413519635796547, "grad_norm_var": 8.758047505857958e-06, "learning_rate": 0.009204038557966568, "loss": 2.6546, "step": 5474 }, { "crossentropy": 2.721344470977783, "epoch": 0.19848462877030162, "grad_norm": 0.03279152885079384, "grad_norm_var": 9.53332522986381e-06, "learning_rate": 0.00920372396709374, "loss": 2.6627, "step": 5475 }, { "crossentropy": 2.691671133041382, "epoch": 0.19852088167053364, "grad_norm": 0.035681623965501785, "grad_norm_var": 9.277880855816598e-06, "learning_rate": 0.009203409319443238, "loss": 2.6836, "step": 5476 }, { "crossentropy": 2.6477441787719727, "epoch": 0.19855713457076565, "grad_norm": 0.03715280443429947, "grad_norm_var": 9.301326378504243e-06, "learning_rate": 0.009203094615019317, "loss": 2.6952, "step": 5477 }, { "crossentropy": 2.629013776779175, "epoch": 0.19859338747099767, "grad_norm": 0.03543150797486305, "grad_norm_var": 8.556935075405373e-06, "learning_rate": 0.009202779853826223, "loss": 2.6233, "step": 5478 }, { "crossentropy": 2.8257832527160645, "epoch": 0.1986296403712297, "grad_norm": 0.03566201403737068, "grad_norm_var": 8.270254290586715e-06, "learning_rate": 0.009202465035868208, "loss": 2.7974, "step": 5479 }, { "crossentropy": 2.678586483001709, "epoch": 0.19866589327146172, "grad_norm": 0.03449943661689758, "grad_norm_var": 8.539826226012607e-06, "learning_rate": 0.009202150161149527, "loss": 2.6688, "step": 5480 }, { "crossentropy": 2.5879552364349365, "epoch": 0.19870214617169374, "grad_norm": 0.034712377935647964, "grad_norm_var": 8.681806736991297e-06, "learning_rate": 0.00920183522967443, "loss": 2.6021, "step": 5481 }, { "crossentropy": 2.735539674758911, "epoch": 0.19873839907192575, "grad_norm": 0.03895656764507294, "grad_norm_var": 8.562120431423494e-06, "learning_rate": 0.009201520241447173, "loss": 2.6308, "step": 5482 }, { "crossentropy": 2.7884178161621094, "epoch": 0.19877465197215777, "grad_norm": 0.04119110852479935, "grad_norm_var": 9.528095637546288e-06, "learning_rate": 0.00920120519647201, "loss": 2.7727, "step": 5483 }, { "crossentropy": 2.767789363861084, "epoch": 0.19881090487238978, "grad_norm": 0.04049190878868103, "grad_norm_var": 8.82290946991234e-06, "learning_rate": 0.009200890094753191, "loss": 2.7231, "step": 5484 }, { "crossentropy": 2.6133522987365723, "epoch": 0.1988471577726218, "grad_norm": 0.04265223443508148, "grad_norm_var": 8.081522492876282e-06, "learning_rate": 0.00920057493629498, "loss": 2.6914, "step": 5485 }, { "crossentropy": 2.6733014583587646, "epoch": 0.19888341067285384, "grad_norm": 0.04534052684903145, "grad_norm_var": 1.2208965800443855e-05, "learning_rate": 0.009200259721101629, "loss": 2.6708, "step": 5486 }, { "crossentropy": 2.5783653259277344, "epoch": 0.19891966357308585, "grad_norm": 0.03763895854353905, "grad_norm_var": 1.2173411997753955e-05, "learning_rate": 0.009199944449177395, "loss": 2.567, "step": 5487 }, { "crossentropy": 2.671395778656006, "epoch": 0.19895591647331787, "grad_norm": 0.032983411103487015, "grad_norm_var": 1.2990241239383554e-05, "learning_rate": 0.00919962912052654, "loss": 2.7448, "step": 5488 }, { "crossentropy": 2.606968879699707, "epoch": 0.19899216937354988, "grad_norm": 0.03729303553700447, "grad_norm_var": 1.2920782596183751e-05, "learning_rate": 0.009199313735153319, "loss": 2.6816, "step": 5489 }, { "crossentropy": 2.5731678009033203, "epoch": 0.1990284222737819, "grad_norm": 0.03751615062355995, "grad_norm_var": 1.2213770319326652e-05, "learning_rate": 0.009198998293061993, "loss": 2.5318, "step": 5490 }, { "crossentropy": 2.700381278991699, "epoch": 0.1990646751740139, "grad_norm": 0.041620247066020966, "grad_norm_var": 1.1543130021525403e-05, "learning_rate": 0.009198682794256823, "loss": 2.7705, "step": 5491 }, { "crossentropy": 2.77787446975708, "epoch": 0.19910092807424595, "grad_norm": 0.04028472304344177, "grad_norm_var": 1.1412912662163895e-05, "learning_rate": 0.009198367238742072, "loss": 2.7563, "step": 5492 }, { "crossentropy": 2.75134539604187, "epoch": 0.19913718097447797, "grad_norm": 0.039370886981487274, "grad_norm_var": 1.1369539479262152e-05, "learning_rate": 0.009198051626521998, "loss": 2.694, "step": 5493 }, { "crossentropy": 2.8359429836273193, "epoch": 0.19917343387470998, "grad_norm": 0.037138450890779495, "grad_norm_var": 1.0858325776423131e-05, "learning_rate": 0.009197735957600867, "loss": 2.8355, "step": 5494 }, { "crossentropy": 2.6305837631225586, "epoch": 0.199209686774942, "grad_norm": 0.03516516089439392, "grad_norm_var": 1.1067361054801992e-05, "learning_rate": 0.009197420231982941, "loss": 2.6736, "step": 5495 }, { "crossentropy": 2.753762722015381, "epoch": 0.199245939675174, "grad_norm": 0.0363750085234642, "grad_norm_var": 1.0273409423947922e-05, "learning_rate": 0.009197104449672486, "loss": 2.7106, "step": 5496 }, { "crossentropy": 2.5570602416992188, "epoch": 0.19928219257540603, "grad_norm": 0.034800317138433456, "grad_norm_var": 1.0227480854874994e-05, "learning_rate": 0.009196788610673766, "loss": 2.6079, "step": 5497 }, { "crossentropy": 2.8369436264038086, "epoch": 0.19931844547563804, "grad_norm": 0.0367794893682003, "grad_norm_var": 1.0442316777845711e-05, "learning_rate": 0.009196472714991046, "loss": 2.7664, "step": 5498 }, { "crossentropy": 2.693455696105957, "epoch": 0.19935469837587008, "grad_norm": 0.0375983789563179, "grad_norm_var": 9.979134707147757e-06, "learning_rate": 0.009196156762628594, "loss": 2.7164, "step": 5499 }, { "crossentropy": 2.7116847038269043, "epoch": 0.1993909512761021, "grad_norm": 0.04035630822181702, "grad_norm_var": 9.940935295643429e-06, "learning_rate": 0.009195840753590676, "loss": 2.7795, "step": 5500 }, { "crossentropy": 2.774688720703125, "epoch": 0.1994272041763341, "grad_norm": 0.03757191821932793, "grad_norm_var": 8.610735306838144e-06, "learning_rate": 0.009195524687881563, "loss": 2.8276, "step": 5501 }, { "crossentropy": 2.7469403743743896, "epoch": 0.19946345707656613, "grad_norm": 0.033115435391664505, "grad_norm_var": 5.969374742684161e-06, "learning_rate": 0.00919520856550552, "loss": 2.7194, "step": 5502 }, { "crossentropy": 2.6976165771484375, "epoch": 0.19949970997679814, "grad_norm": 0.03410510718822479, "grad_norm_var": 6.555064339974539e-06, "learning_rate": 0.009194892386466822, "loss": 2.7454, "step": 5503 }, { "crossentropy": 2.66890549659729, "epoch": 0.19953596287703015, "grad_norm": 0.03368280827999115, "grad_norm_var": 6.21064635970414e-06, "learning_rate": 0.009194576150769733, "loss": 2.6931, "step": 5504 }, { "crossentropy": 2.610531806945801, "epoch": 0.19957221577726217, "grad_norm": 0.03388901799917221, "grad_norm_var": 6.8237947142111195e-06, "learning_rate": 0.00919425985841853, "loss": 2.6473, "step": 5505 }, { "crossentropy": 2.779846668243408, "epoch": 0.1996084686774942, "grad_norm": 0.037444282323122025, "grad_norm_var": 6.8175960799237805e-06, "learning_rate": 0.009193943509417482, "loss": 2.7055, "step": 5506 }, { "crossentropy": 2.8586647510528564, "epoch": 0.19964472157772623, "grad_norm": 0.03886707127094269, "grad_norm_var": 5.533294770230605e-06, "learning_rate": 0.009193627103770861, "loss": 2.7668, "step": 5507 }, { "crossentropy": 2.6342570781707764, "epoch": 0.19968097447795824, "grad_norm": 0.036863651126623154, "grad_norm_var": 4.610940579150881e-06, "learning_rate": 0.009193310641482945, "loss": 2.6746, "step": 5508 }, { "crossentropy": 2.9213805198669434, "epoch": 0.19971722737819025, "grad_norm": 0.044606808573007584, "grad_norm_var": 8.366855231395122e-06, "learning_rate": 0.009192994122558001, "loss": 2.871, "step": 5509 }, { "crossentropy": 2.5092201232910156, "epoch": 0.19975348027842227, "grad_norm": 0.04323449730873108, "grad_norm_var": 1.0986953661463958e-05, "learning_rate": 0.009192677547000311, "loss": 2.5676, "step": 5510 }, { "crossentropy": 2.6988096237182617, "epoch": 0.19978973317865428, "grad_norm": 0.034334562718868256, "grad_norm_var": 1.1250268322397725e-05, "learning_rate": 0.009192360914814148, "loss": 2.6367, "step": 5511 }, { "crossentropy": 2.709110975265503, "epoch": 0.1998259860788863, "grad_norm": 0.035703808069229126, "grad_norm_var": 1.1343445094570634e-05, "learning_rate": 0.009192044226003789, "loss": 2.6813, "step": 5512 }, { "crossentropy": 2.5529584884643555, "epoch": 0.19986223897911834, "grad_norm": 0.0351334884762764, "grad_norm_var": 1.125001940108968e-05, "learning_rate": 0.00919172748057351, "loss": 2.5595, "step": 5513 }, { "crossentropy": 2.7382638454437256, "epoch": 0.19989849187935035, "grad_norm": 0.034860771149396896, "grad_norm_var": 1.155709730452e-05, "learning_rate": 0.00919141067852759, "loss": 2.6761, "step": 5514 }, { "crossentropy": 2.685227870941162, "epoch": 0.19993474477958237, "grad_norm": 0.033269427716732025, "grad_norm_var": 1.2360153338699294e-05, "learning_rate": 0.009191093819870308, "loss": 2.7009, "step": 5515 }, { "crossentropy": 2.684375762939453, "epoch": 0.19997099767981438, "grad_norm": 0.03342674672603607, "grad_norm_var": 1.1973815485060342e-05, "learning_rate": 0.009190776904605943, "loss": 2.6565, "step": 5516 }, { "crossentropy": 2.4941232204437256, "epoch": 0.2000072505800464, "grad_norm": 0.03354540094733238, "grad_norm_var": 1.2281092123328507e-05, "learning_rate": 0.00919045993273878, "loss": 2.5876, "step": 5517 }, { "crossentropy": 2.6646690368652344, "epoch": 0.2000435034802784, "grad_norm": 0.033776771277189255, "grad_norm_var": 1.2053615175137283e-05, "learning_rate": 0.009190142904273094, "loss": 2.6289, "step": 5518 }, { "crossentropy": 2.516474962234497, "epoch": 0.20007975638051045, "grad_norm": 0.035118043422698975, "grad_norm_var": 1.1855539846897939e-05, "learning_rate": 0.009189825819213168, "loss": 2.5739, "step": 5519 }, { "crossentropy": 2.6207635402679443, "epoch": 0.20011600928074247, "grad_norm": 0.038765110075473785, "grad_norm_var": 1.1825259431839466e-05, "learning_rate": 0.009189508677563287, "loss": 2.7026, "step": 5520 }, { "crossentropy": 2.582475423812866, "epoch": 0.20015226218097448, "grad_norm": 0.04207460954785347, "grad_norm_var": 1.3242510393833967e-05, "learning_rate": 0.009189191479327735, "loss": 2.6028, "step": 5521 }, { "crossentropy": 2.439952850341797, "epoch": 0.2001885150812065, "grad_norm": 0.03925261273980141, "grad_norm_var": 1.35687022267974e-05, "learning_rate": 0.009188874224510793, "loss": 2.6144, "step": 5522 }, { "crossentropy": 2.6744492053985596, "epoch": 0.2002247679814385, "grad_norm": 0.03789301589131355, "grad_norm_var": 1.3392281772283316e-05, "learning_rate": 0.00918855691311675, "loss": 2.6722, "step": 5523 }, { "crossentropy": 2.7455241680145264, "epoch": 0.20026102088167053, "grad_norm": 0.03451576456427574, "grad_norm_var": 1.3776749311295947e-05, "learning_rate": 0.009188239545149886, "loss": 2.6285, "step": 5524 }, { "crossentropy": 2.6374030113220215, "epoch": 0.20029727378190254, "grad_norm": 0.03464976325631142, "grad_norm_var": 9.667836980826554e-06, "learning_rate": 0.009187922120614493, "loss": 2.6999, "step": 5525 }, { "crossentropy": 2.6979053020477295, "epoch": 0.20033352668213458, "grad_norm": 0.0344838872551918, "grad_norm_var": 6.272017806587214e-06, "learning_rate": 0.009187604639514858, "loss": 2.7291, "step": 5526 }, { "crossentropy": 2.6179709434509277, "epoch": 0.2003697795823666, "grad_norm": 0.04725649207830429, "grad_norm_var": 1.4398155053427925e-05, "learning_rate": 0.009187287101855264, "loss": 2.6518, "step": 5527 }, { "crossentropy": 2.5560050010681152, "epoch": 0.2004060324825986, "grad_norm": 0.041824616491794586, "grad_norm_var": 1.610388557572451e-05, "learning_rate": 0.009186969507640006, "loss": 2.7115, "step": 5528 }, { "crossentropy": 2.599755048751831, "epoch": 0.20044228538283063, "grad_norm": 0.040050994604825974, "grad_norm_var": 1.6479689178831336e-05, "learning_rate": 0.00918665185687337, "loss": 2.6105, "step": 5529 }, { "crossentropy": 2.6720616817474365, "epoch": 0.20047853828306264, "grad_norm": 0.03567725047469139, "grad_norm_var": 1.626966283799254e-05, "learning_rate": 0.009186334149559647, "loss": 2.6408, "step": 5530 }, { "crossentropy": 2.770899772644043, "epoch": 0.20051479118329466, "grad_norm": 0.033640749752521515, "grad_norm_var": 1.6082501838946862e-05, "learning_rate": 0.009186016385703127, "loss": 2.6808, "step": 5531 }, { "crossentropy": 2.7348344326019287, "epoch": 0.20055104408352667, "grad_norm": 0.03654284402728081, "grad_norm_var": 1.5102147720460108e-05, "learning_rate": 0.009185698565308105, "loss": 2.6749, "step": 5532 }, { "crossentropy": 2.567629337310791, "epoch": 0.2005872969837587, "grad_norm": 0.03620728477835655, "grad_norm_var": 1.4162117142770455e-05, "learning_rate": 0.009185380688378871, "loss": 2.5955, "step": 5533 }, { "crossentropy": 2.714749574661255, "epoch": 0.20062354988399073, "grad_norm": 0.03713598847389221, "grad_norm_var": 1.3151347140810157e-05, "learning_rate": 0.00918506275491972, "loss": 2.6463, "step": 5534 }, { "crossentropy": 2.7260935306549072, "epoch": 0.20065980278422274, "grad_norm": 0.032665275037288666, "grad_norm_var": 1.4410355106302988e-05, "learning_rate": 0.009184744764934945, "loss": 2.7421, "step": 5535 }, { "crossentropy": 2.6738698482513428, "epoch": 0.20069605568445475, "grad_norm": 0.032501913607120514, "grad_norm_var": 1.594319265686006e-05, "learning_rate": 0.009184426718428841, "loss": 2.6316, "step": 5536 }, { "crossentropy": 2.551079511642456, "epoch": 0.20073230858468677, "grad_norm": 0.03252614289522171, "grad_norm_var": 1.5528853633202852e-05, "learning_rate": 0.009184108615405705, "loss": 2.5612, "step": 5537 }, { "crossentropy": 2.713700771331787, "epoch": 0.20076856148491878, "grad_norm": 0.03778819739818573, "grad_norm_var": 1.5159893054833967e-05, "learning_rate": 0.009183790455869832, "loss": 2.695, "step": 5538 }, { "crossentropy": 2.8008289337158203, "epoch": 0.2008048143851508, "grad_norm": 0.036887384951114655, "grad_norm_var": 1.5047716250695305e-05, "learning_rate": 0.00918347223982552, "loss": 2.7035, "step": 5539 }, { "crossentropy": 2.619494676589966, "epoch": 0.20084106728538284, "grad_norm": 0.033730827271938324, "grad_norm_var": 1.5296210039283885e-05, "learning_rate": 0.009183153967277067, "loss": 2.6481, "step": 5540 }, { "crossentropy": 2.5853865146636963, "epoch": 0.20087732018561485, "grad_norm": 0.037829168140888214, "grad_norm_var": 1.5155048075835935e-05, "learning_rate": 0.00918283563822877, "loss": 2.5323, "step": 5541 }, { "crossentropy": 2.6963279247283936, "epoch": 0.20091357308584687, "grad_norm": 0.03605493903160095, "grad_norm_var": 1.4850998068071988e-05, "learning_rate": 0.009182517252684931, "loss": 2.7142, "step": 5542 }, { "crossentropy": 2.7815582752227783, "epoch": 0.20094982598607888, "grad_norm": 0.037504199892282486, "grad_norm_var": 7.159559223087325e-06, "learning_rate": 0.009182198810649852, "loss": 2.6766, "step": 5543 }, { "crossentropy": 2.7122325897216797, "epoch": 0.2009860788863109, "grad_norm": 0.0410061739385128, "grad_norm_var": 6.583322685811919e-06, "learning_rate": 0.009181880312127827, "loss": 2.7167, "step": 5544 }, { "crossentropy": 2.8111565113067627, "epoch": 0.2010223317865429, "grad_norm": 0.04231239855289459, "grad_norm_var": 8.091436188080852e-06, "learning_rate": 0.009181561757123164, "loss": 2.7114, "step": 5545 }, { "crossentropy": 2.6985998153686523, "epoch": 0.20105858468677495, "grad_norm": 0.03766513988375664, "grad_norm_var": 8.186431458265658e-06, "learning_rate": 0.009181243145640164, "loss": 2.7034, "step": 5546 }, { "crossentropy": 2.7391819953918457, "epoch": 0.20109483758700697, "grad_norm": 0.0334903858602047, "grad_norm_var": 8.242660487872699e-06, "learning_rate": 0.009180924477683131, "loss": 2.614, "step": 5547 }, { "crossentropy": 2.543215751647949, "epoch": 0.20113109048723898, "grad_norm": 0.03371984884142876, "grad_norm_var": 8.673995878033525e-06, "learning_rate": 0.009180605753256365, "loss": 2.6806, "step": 5548 }, { "crossentropy": 2.679995059967041, "epoch": 0.201167343387471, "grad_norm": 0.03497348353266716, "grad_norm_var": 8.766142544544553e-06, "learning_rate": 0.009180286972364176, "loss": 2.6335, "step": 5549 }, { "crossentropy": 2.6659626960754395, "epoch": 0.201203596287703, "grad_norm": 0.03617405146360397, "grad_norm_var": 8.692635962000381e-06, "learning_rate": 0.009179968135010867, "loss": 2.6769, "step": 5550 }, { "crossentropy": 2.7745554447174072, "epoch": 0.20123984918793503, "grad_norm": 0.037109922617673874, "grad_norm_var": 7.920368338920566e-06, "learning_rate": 0.009179649241200747, "loss": 2.6853, "step": 5551 }, { "crossentropy": 2.6934242248535156, "epoch": 0.20127610208816704, "grad_norm": 0.04295094683766365, "grad_norm_var": 9.411461843711102e-06, "learning_rate": 0.009179330290938119, "loss": 2.7106, "step": 5552 }, { "crossentropy": 2.7843260765075684, "epoch": 0.20131235498839908, "grad_norm": 0.04681987687945366, "grad_norm_var": 1.3687441905643229e-05, "learning_rate": 0.009179011284227294, "loss": 2.7271, "step": 5553 }, { "crossentropy": 2.69006609916687, "epoch": 0.2013486078886311, "grad_norm": 0.03737425059080124, "grad_norm_var": 1.3703000744150612e-05, "learning_rate": 0.009178692221072579, "loss": 2.6728, "step": 5554 }, { "crossentropy": 2.7456517219543457, "epoch": 0.2013848607888631, "grad_norm": 0.03441969305276871, "grad_norm_var": 1.4400381338765198e-05, "learning_rate": 0.009178373101478284, "loss": 2.7537, "step": 5555 }, { "crossentropy": 2.6328914165496826, "epoch": 0.20142111368909513, "grad_norm": 0.03414316102862358, "grad_norm_var": 1.419301330861077e-05, "learning_rate": 0.00917805392544872, "loss": 2.5927, "step": 5556 }, { "crossentropy": 2.5478477478027344, "epoch": 0.20145736658932714, "grad_norm": 0.03432019427418709, "grad_norm_var": 1.491230189252368e-05, "learning_rate": 0.009177734692988198, "loss": 2.6342, "step": 5557 }, { "crossentropy": 2.611607074737549, "epoch": 0.20149361948955916, "grad_norm": 0.031527649611234665, "grad_norm_var": 1.7067077079143308e-05, "learning_rate": 0.009177415404101028, "loss": 2.6459, "step": 5558 }, { "crossentropy": 2.5769286155700684, "epoch": 0.20152987238979117, "grad_norm": 0.03419090434908867, "grad_norm_var": 1.7627407682241128e-05, "learning_rate": 0.009177096058791523, "loss": 2.6652, "step": 5559 }, { "crossentropy": 2.6236188411712646, "epoch": 0.2015661252900232, "grad_norm": 0.03408677503466606, "grad_norm_var": 1.6935167379072736e-05, "learning_rate": 0.009176776657063998, "loss": 2.692, "step": 5560 }, { "crossentropy": 2.6373767852783203, "epoch": 0.20160237819025523, "grad_norm": 0.036079276353120804, "grad_norm_var": 1.459923845521325e-05, "learning_rate": 0.009176457198922764, "loss": 2.6152, "step": 5561 }, { "crossentropy": 2.5294268131256104, "epoch": 0.20163863109048724, "grad_norm": 0.03270585462450981, "grad_norm_var": 1.5161206539363904e-05, "learning_rate": 0.009176137684372141, "loss": 2.5539, "step": 5562 }, { "crossentropy": 2.5548224449157715, "epoch": 0.20167488399071926, "grad_norm": 0.0352756604552269, "grad_norm_var": 1.479149786590116e-05, "learning_rate": 0.009175818113416438, "loss": 2.6192, "step": 5563 }, { "crossentropy": 2.7622570991516113, "epoch": 0.20171113689095127, "grad_norm": 0.03443446010351181, "grad_norm_var": 1.4606923402610215e-05, "learning_rate": 0.009175498486059976, "loss": 2.6582, "step": 5564 }, { "crossentropy": 2.683391571044922, "epoch": 0.20174738979118328, "grad_norm": 0.03561247140169144, "grad_norm_var": 1.4541863706442799e-05, "learning_rate": 0.009175178802307069, "loss": 2.6875, "step": 5565 }, { "crossentropy": 2.6613104343414307, "epoch": 0.20178364269141533, "grad_norm": 0.03355177864432335, "grad_norm_var": 1.4937550938637298e-05, "learning_rate": 0.009174859062162038, "loss": 2.6958, "step": 5566 }, { "crossentropy": 2.869626998901367, "epoch": 0.20181989559164734, "grad_norm": 0.03381342440843582, "grad_norm_var": 1.5090504349896117e-05, "learning_rate": 0.009174539265629198, "loss": 2.8259, "step": 5567 }, { "crossentropy": 2.686495065689087, "epoch": 0.20185614849187936, "grad_norm": 0.03365813568234444, "grad_norm_var": 1.1511789488351705e-05, "learning_rate": 0.00917421941271287, "loss": 2.5988, "step": 5568 }, { "crossentropy": 2.7138476371765137, "epoch": 0.20189240139211137, "grad_norm": 0.042034246027469635, "grad_norm_var": 5.481406792073815e-06, "learning_rate": 0.009173899503417375, "loss": 2.7168, "step": 5569 }, { "crossentropy": 2.653718948364258, "epoch": 0.20192865429234338, "grad_norm": 0.03654418885707855, "grad_norm_var": 5.242524634401814e-06, "learning_rate": 0.009173579537747034, "loss": 2.6717, "step": 5570 }, { "crossentropy": 2.6619935035705566, "epoch": 0.2019649071925754, "grad_norm": 0.035693954676389694, "grad_norm_var": 5.283663927888732e-06, "learning_rate": 0.009173259515706167, "loss": 2.7089, "step": 5571 }, { "crossentropy": 2.6667017936706543, "epoch": 0.2020011600928074, "grad_norm": 0.03503008559346199, "grad_norm_var": 5.248707090797315e-06, "learning_rate": 0.009172939437299098, "loss": 2.7287, "step": 5572 }, { "crossentropy": 2.711214065551758, "epoch": 0.20203741299303946, "grad_norm": 0.0375317707657814, "grad_norm_var": 5.640810390992029e-06, "learning_rate": 0.009172619302530147, "loss": 2.7044, "step": 5573 }, { "crossentropy": 2.6592578887939453, "epoch": 0.20207366589327147, "grad_norm": 0.03418882563710213, "grad_norm_var": 4.812088627971905e-06, "learning_rate": 0.009172299111403642, "loss": 2.5945, "step": 5574 }, { "crossentropy": 2.636958599090576, "epoch": 0.20210991879350348, "grad_norm": 0.03315950557589531, "grad_norm_var": 5.0279331779830175e-06, "learning_rate": 0.009171978863923904, "loss": 2.5876, "step": 5575 }, { "crossentropy": 2.779064655303955, "epoch": 0.2021461716937355, "grad_norm": 0.03440561890602112, "grad_norm_var": 4.98642851598415e-06, "learning_rate": 0.009171658560095261, "loss": 2.7305, "step": 5576 }, { "crossentropy": 2.556605577468872, "epoch": 0.2021824245939675, "grad_norm": 0.03817211091518402, "grad_norm_var": 5.496477118957915e-06, "learning_rate": 0.009171338199922038, "loss": 2.6216, "step": 5577 }, { "crossentropy": 2.6752102375030518, "epoch": 0.20221867749419953, "grad_norm": 0.03826988488435745, "grad_norm_var": 5.459931115377413e-06, "learning_rate": 0.009171017783408564, "loss": 2.6568, "step": 5578 }, { "crossentropy": 2.486548900604248, "epoch": 0.20225493039443154, "grad_norm": 0.03885534778237343, "grad_norm_var": 6.053028618241823e-06, "learning_rate": 0.009170697310559164, "loss": 2.5655, "step": 5579 }, { "crossentropy": 2.600186586380005, "epoch": 0.20229118329466358, "grad_norm": 0.032942995429039, "grad_norm_var": 6.490406019011147e-06, "learning_rate": 0.00917037678137817, "loss": 2.5607, "step": 5580 }, { "crossentropy": 2.8311197757720947, "epoch": 0.2023274361948956, "grad_norm": 0.03393542394042015, "grad_norm_var": 6.717403598758646e-06, "learning_rate": 0.009170056195869905, "loss": 2.7441, "step": 5581 }, { "crossentropy": 2.6750195026397705, "epoch": 0.2023636890951276, "grad_norm": 0.034270934760570526, "grad_norm_var": 6.540220500535614e-06, "learning_rate": 0.009169735554038706, "loss": 2.6062, "step": 5582 }, { "crossentropy": 2.715836524963379, "epoch": 0.20239994199535963, "grad_norm": 0.03403617441654205, "grad_norm_var": 6.484865196276721e-06, "learning_rate": 0.009169414855888899, "loss": 2.6297, "step": 5583 }, { "crossentropy": 2.7460546493530273, "epoch": 0.20243619489559164, "grad_norm": 0.03608212247490883, "grad_norm_var": 6.161280493210443e-06, "learning_rate": 0.009169094101424816, "loss": 2.6971, "step": 5584 }, { "crossentropy": 2.5089845657348633, "epoch": 0.20247244779582366, "grad_norm": 0.037185050547122955, "grad_norm_var": 3.6952310342244196e-06, "learning_rate": 0.009168773290650791, "loss": 2.6403, "step": 5585 }, { "crossentropy": 2.728640079498291, "epoch": 0.20250870069605567, "grad_norm": 0.03765537217259407, "grad_norm_var": 3.905771580070318e-06, "learning_rate": 0.009168452423571156, "loss": 2.7307, "step": 5586 }, { "crossentropy": 2.762820243835449, "epoch": 0.2025449535962877, "grad_norm": 0.03614658862352371, "grad_norm_var": 3.917399938345387e-06, "learning_rate": 0.009168131500190246, "loss": 2.6223, "step": 5587 }, { "crossentropy": 2.5531742572784424, "epoch": 0.20258120649651973, "grad_norm": 0.034108031541109085, "grad_norm_var": 4.058027383436966e-06, "learning_rate": 0.009167810520512394, "loss": 2.6418, "step": 5588 }, { "crossentropy": 2.6911935806274414, "epoch": 0.20261745939675174, "grad_norm": 0.03272978961467743, "grad_norm_var": 4.3162252981620555e-06, "learning_rate": 0.009167489484541937, "loss": 2.6385, "step": 5589 }, { "crossentropy": 2.6121160984039307, "epoch": 0.20265371229698376, "grad_norm": 0.033527959138154984, "grad_norm_var": 4.448834030606047e-06, "learning_rate": 0.00916716839228321, "loss": 2.5671, "step": 5590 }, { "crossentropy": 2.672327995300293, "epoch": 0.20268996519721577, "grad_norm": 0.035178087651729584, "grad_norm_var": 4.115911712156371e-06, "learning_rate": 0.00916684724374055, "loss": 2.7409, "step": 5591 }, { "crossentropy": 2.8033626079559326, "epoch": 0.20272621809744779, "grad_norm": 0.045576438307762146, "grad_norm_var": 1.0331500402134773e-05, "learning_rate": 0.009166526038918295, "loss": 2.7363, "step": 5592 }, { "crossentropy": 2.700674057006836, "epoch": 0.20276247099767983, "grad_norm": 0.036334577947854996, "grad_norm_var": 1.005127715768683e-05, "learning_rate": 0.00916620477782078, "loss": 2.8161, "step": 5593 }, { "crossentropy": 2.737863779067993, "epoch": 0.20279872389791184, "grad_norm": 0.03403511643409729, "grad_norm_var": 9.919907034008854e-06, "learning_rate": 0.009165883460452351, "loss": 2.684, "step": 5594 }, { "crossentropy": 2.6845710277557373, "epoch": 0.20283497679814386, "grad_norm": 0.03238590434193611, "grad_norm_var": 9.889461405536369e-06, "learning_rate": 0.009165562086817342, "loss": 2.6404, "step": 5595 }, { "crossentropy": 2.8036980628967285, "epoch": 0.20287122969837587, "grad_norm": 0.03235933929681778, "grad_norm_var": 1.0100647948351784e-05, "learning_rate": 0.009165240656920095, "loss": 2.7523, "step": 5596 }, { "crossentropy": 2.737105369567871, "epoch": 0.20290748259860789, "grad_norm": 0.04380769655108452, "grad_norm_var": 1.433436518114093e-05, "learning_rate": 0.009164919170764953, "loss": 2.7495, "step": 5597 }, { "crossentropy": 2.4983651638031006, "epoch": 0.2029437354988399, "grad_norm": 0.031387969851493835, "grad_norm_var": 1.550452374706231e-05, "learning_rate": 0.009164597628356258, "loss": 2.5835, "step": 5598 }, { "crossentropy": 2.7532050609588623, "epoch": 0.20297998839907191, "grad_norm": 0.03700581565499306, "grad_norm_var": 1.536383404952708e-05, "learning_rate": 0.00916427602969835, "loss": 2.7781, "step": 5599 }, { "crossentropy": 2.8238158226013184, "epoch": 0.20301624129930396, "grad_norm": 0.03780824691057205, "grad_norm_var": 1.5576061519493162e-05, "learning_rate": 0.009163954374795575, "loss": 2.7446, "step": 5600 }, { "crossentropy": 2.637051582336426, "epoch": 0.20305249419953597, "grad_norm": 0.04110553115606308, "grad_norm_var": 1.7115909627074487e-05, "learning_rate": 0.009163632663652277, "loss": 2.6518, "step": 5601 }, { "crossentropy": 2.7475459575653076, "epoch": 0.20308874709976799, "grad_norm": 0.04099403694272041, "grad_norm_var": 1.840612203037415e-05, "learning_rate": 0.009163310896272803, "loss": 2.6809, "step": 5602 }, { "crossentropy": 2.7257895469665527, "epoch": 0.203125, "grad_norm": 0.03828105702996254, "grad_norm_var": 1.8581554007555262e-05, "learning_rate": 0.009162989072661496, "loss": 2.7179, "step": 5603 }, { "crossentropy": 2.597461223602295, "epoch": 0.20316125290023201, "grad_norm": 0.03428163751959801, "grad_norm_var": 1.852427119752864e-05, "learning_rate": 0.009162667192822705, "loss": 2.6788, "step": 5604 }, { "crossentropy": 2.5362839698791504, "epoch": 0.20319750580046403, "grad_norm": 0.034599702805280685, "grad_norm_var": 1.775919276271829e-05, "learning_rate": 0.009162345256760776, "loss": 2.6371, "step": 5605 }, { "crossentropy": 2.640185594558716, "epoch": 0.20323375870069604, "grad_norm": 0.03479127958416939, "grad_norm_var": 1.730916782665213e-05, "learning_rate": 0.009162023264480056, "loss": 2.7752, "step": 5606 }, { "crossentropy": 2.6749043464660645, "epoch": 0.20327001160092809, "grad_norm": 0.033611852675676346, "grad_norm_var": 1.781597272291005e-05, "learning_rate": 0.009161701215984898, "loss": 2.7526, "step": 5607 }, { "crossentropy": 2.6992998123168945, "epoch": 0.2033062645011601, "grad_norm": 0.03614791855216026, "grad_norm_var": 1.2304774146885445e-05, "learning_rate": 0.009161379111279648, "loss": 2.7035, "step": 5608 }, { "crossentropy": 2.682051420211792, "epoch": 0.20334251740139211, "grad_norm": 0.036833882331848145, "grad_norm_var": 1.23304065482166e-05, "learning_rate": 0.00916105695036866, "loss": 2.6364, "step": 5609 }, { "crossentropy": 2.582914352416992, "epoch": 0.20337877030162413, "grad_norm": 0.03675062209367752, "grad_norm_var": 1.2002083080899182e-05, "learning_rate": 0.00916073473325628, "loss": 2.5839, "step": 5610 }, { "crossentropy": 2.6367576122283936, "epoch": 0.20341502320185614, "grad_norm": 0.035935111343860626, "grad_norm_var": 1.089712705950313e-05, "learning_rate": 0.009160412459946865, "loss": 2.6775, "step": 5611 }, { "crossentropy": 2.735062599182129, "epoch": 0.20345127610208816, "grad_norm": 0.03556399792432785, "grad_norm_var": 9.724293256228808e-06, "learning_rate": 0.009160090130444767, "loss": 2.6685, "step": 5612 }, { "crossentropy": 2.729100227355957, "epoch": 0.20348752900232017, "grad_norm": 0.03475967422127724, "grad_norm_var": 6.394876010899255e-06, "learning_rate": 0.009159767744754339, "loss": 2.7432, "step": 5613 }, { "crossentropy": 2.693263053894043, "epoch": 0.20352378190255221, "grad_norm": 0.03287461772561073, "grad_norm_var": 5.571013421826892e-06, "learning_rate": 0.009159445302879933, "loss": 2.6774, "step": 5614 }, { "crossentropy": 2.5988123416900635, "epoch": 0.20356003480278423, "grad_norm": 0.03497713804244995, "grad_norm_var": 5.646531217897745e-06, "learning_rate": 0.009159122804825906, "loss": 2.6234, "step": 5615 }, { "crossentropy": 2.695211410522461, "epoch": 0.20359628770301624, "grad_norm": 0.03938419744372368, "grad_norm_var": 6.138165701095183e-06, "learning_rate": 0.009158800250596615, "loss": 2.6381, "step": 5616 }, { "crossentropy": 2.6701009273529053, "epoch": 0.20363254060324826, "grad_norm": 0.03930835425853729, "grad_norm_var": 5.189894081740939e-06, "learning_rate": 0.009158477640196415, "loss": 2.7237, "step": 5617 }, { "crossentropy": 2.752426862716675, "epoch": 0.20366879350348027, "grad_norm": 0.04074244201183319, "grad_norm_var": 5.0328096364186834e-06, "learning_rate": 0.009158154973629665, "loss": 2.7494, "step": 5618 }, { "crossentropy": 2.690842628479004, "epoch": 0.2037050464037123, "grad_norm": 0.04072149842977524, "grad_norm_var": 6.08945441128809e-06, "learning_rate": 0.00915783225090072, "loss": 2.6815, "step": 5619 }, { "crossentropy": 2.5485424995422363, "epoch": 0.20374129930394433, "grad_norm": 0.03841675445437431, "grad_norm_var": 6.0286561440146364e-06, "learning_rate": 0.009157509472013941, "loss": 2.5624, "step": 5620 }, { "crossentropy": 2.7300896644592285, "epoch": 0.20377755220417634, "grad_norm": 0.03758043423295021, "grad_norm_var": 5.793468652015346e-06, "learning_rate": 0.009157186636973689, "loss": 2.6951, "step": 5621 }, { "crossentropy": 2.9491162300109863, "epoch": 0.20381380510440836, "grad_norm": 0.038847628980875015, "grad_norm_var": 5.74896098347021e-06, "learning_rate": 0.009156863745784321, "loss": 2.8782, "step": 5622 }, { "crossentropy": 2.741363048553467, "epoch": 0.20385005800464037, "grad_norm": 0.03908202797174454, "grad_norm_var": 5.1271767700880025e-06, "learning_rate": 0.0091565407984502, "loss": 2.6934, "step": 5623 }, { "crossentropy": 2.6247763633728027, "epoch": 0.2038863109048724, "grad_norm": 0.03525719419121742, "grad_norm_var": 5.321948763426359e-06, "learning_rate": 0.009156217794975687, "loss": 2.6585, "step": 5624 }, { "crossentropy": 2.6494874954223633, "epoch": 0.2039225638051044, "grad_norm": 0.03292834386229515, "grad_norm_var": 6.5256681640322535e-06, "learning_rate": 0.009155894735365147, "loss": 2.6992, "step": 5625 }, { "crossentropy": 2.7741470336914062, "epoch": 0.20395881670533642, "grad_norm": 0.03333781287074089, "grad_norm_var": 7.399237912195505e-06, "learning_rate": 0.009155571619622943, "loss": 2.6719, "step": 5626 }, { "crossentropy": 2.607670783996582, "epoch": 0.20399506960556846, "grad_norm": 0.03394310176372528, "grad_norm_var": 7.892185886947528e-06, "learning_rate": 0.009155248447753435, "loss": 2.6426, "step": 5627 }, { "crossentropy": 2.6436123847961426, "epoch": 0.20403132250580047, "grad_norm": 0.031711090356111526, "grad_norm_var": 9.420443547608394e-06, "learning_rate": 0.009154925219760992, "loss": 2.6382, "step": 5628 }, { "crossentropy": 2.6556460857391357, "epoch": 0.2040675754060325, "grad_norm": 0.03246653079986572, "grad_norm_var": 1.0278769020889643e-05, "learning_rate": 0.009154601935649978, "loss": 2.6688, "step": 5629 }, { "crossentropy": 2.682324171066284, "epoch": 0.2041038283062645, "grad_norm": 0.03509673848748207, "grad_norm_var": 9.558072703781615e-06, "learning_rate": 0.009154278595424762, "loss": 2.6962, "step": 5630 }, { "crossentropy": 2.800874710083008, "epoch": 0.20414008120649652, "grad_norm": 0.04015255346894264, "grad_norm_var": 1.0189841539860976e-05, "learning_rate": 0.009153955199089707, "loss": 2.7552, "step": 5631 }, { "crossentropy": 2.55863618850708, "epoch": 0.20417633410672853, "grad_norm": 0.03795802593231201, "grad_norm_var": 9.827663273876258e-06, "learning_rate": 0.009153631746649184, "loss": 2.6941, "step": 5632 }, { "crossentropy": 2.718684196472168, "epoch": 0.20421258700696054, "grad_norm": 0.032776378095149994, "grad_norm_var": 1.024171899185877e-05, "learning_rate": 0.00915330823810756, "loss": 2.6562, "step": 5633 }, { "crossentropy": 2.808056116104126, "epoch": 0.2042488399071926, "grad_norm": 0.03368256986141205, "grad_norm_var": 9.187946286883832e-06, "learning_rate": 0.009152984673469207, "loss": 2.8409, "step": 5634 }, { "crossentropy": 2.819925308227539, "epoch": 0.2042850928074246, "grad_norm": 0.033701445907354355, "grad_norm_var": 7.729243887049977e-06, "learning_rate": 0.009152661052738494, "loss": 2.7464, "step": 5635 }, { "crossentropy": 2.5629184246063232, "epoch": 0.20432134570765662, "grad_norm": 0.0348433218896389, "grad_norm_var": 7.106016527868899e-06, "learning_rate": 0.00915233737591979, "loss": 2.6423, "step": 5636 }, { "crossentropy": 2.738262176513672, "epoch": 0.20435759860788863, "grad_norm": 0.03864182531833649, "grad_norm_var": 7.511841020874504e-06, "learning_rate": 0.00915201364301747, "loss": 2.7287, "step": 5637 }, { "crossentropy": 2.66575026512146, "epoch": 0.20439385150812064, "grad_norm": 0.03612309321761131, "grad_norm_var": 6.678554115440751e-06, "learning_rate": 0.009151689854035902, "loss": 2.6074, "step": 5638 }, { "crossentropy": 2.913691997528076, "epoch": 0.20443010440835266, "grad_norm": 0.034556567668914795, "grad_norm_var": 5.559655126005589e-06, "learning_rate": 0.009151366008979464, "loss": 2.7832, "step": 5639 }, { "crossentropy": 2.7417221069335938, "epoch": 0.20446635730858467, "grad_norm": 0.03521103039383888, "grad_norm_var": 5.557119085140318e-06, "learning_rate": 0.009151042107852529, "loss": 2.7136, "step": 5640 }, { "crossentropy": 2.4933784008026123, "epoch": 0.20450261020881672, "grad_norm": 0.042812280356884, "grad_norm_var": 9.169088003106267e-06, "learning_rate": 0.00915071815065947, "loss": 2.5329, "step": 5641 }, { "crossentropy": 2.6197102069854736, "epoch": 0.20453886310904873, "grad_norm": 0.04466015473008156, "grad_norm_var": 1.4010163170493039e-05, "learning_rate": 0.009150394137404663, "loss": 2.6864, "step": 5642 }, { "crossentropy": 2.6681365966796875, "epoch": 0.20457511600928074, "grad_norm": 0.04073452204465866, "grad_norm_var": 1.4898060558791541e-05, "learning_rate": 0.009150070068092485, "loss": 2.6351, "step": 5643 }, { "crossentropy": 2.7680892944335938, "epoch": 0.20461136890951276, "grad_norm": 0.03924427181482315, "grad_norm_var": 1.3563945303548575e-05, "learning_rate": 0.009149745942727312, "loss": 2.7889, "step": 5644 }, { "crossentropy": 2.6523399353027344, "epoch": 0.20464762180974477, "grad_norm": 0.03835372254252434, "grad_norm_var": 1.213910365241988e-05, "learning_rate": 0.009149421761313524, "loss": 2.7233, "step": 5645 }, { "crossentropy": 2.709404706954956, "epoch": 0.2046838747099768, "grad_norm": 0.03670462220907211, "grad_norm_var": 1.1804910943013537e-05, "learning_rate": 0.009149097523855497, "loss": 2.6357, "step": 5646 }, { "crossentropy": 2.748809337615967, "epoch": 0.20472012761020883, "grad_norm": 0.03937206417322159, "grad_norm_var": 1.1567962195555697e-05, "learning_rate": 0.009148773230357614, "loss": 2.681, "step": 5647 }, { "crossentropy": 2.697425365447998, "epoch": 0.20475638051044084, "grad_norm": 0.03593151271343231, "grad_norm_var": 1.1690335398091787e-05, "learning_rate": 0.009148448880824249, "loss": 2.6591, "step": 5648 }, { "crossentropy": 2.689323663711548, "epoch": 0.20479263341067286, "grad_norm": 0.035824745893478394, "grad_norm_var": 1.041854174771891e-05, "learning_rate": 0.009148124475259789, "loss": 2.6276, "step": 5649 }, { "crossentropy": 2.7038180828094482, "epoch": 0.20482888631090487, "grad_norm": 0.03643298149108887, "grad_norm_var": 9.482289120789818e-06, "learning_rate": 0.00914780001366861, "loss": 2.7064, "step": 5650 }, { "crossentropy": 2.6936798095703125, "epoch": 0.2048651392111369, "grad_norm": 0.035529620945453644, "grad_norm_var": 8.717293617075255e-06, "learning_rate": 0.0091474754960551, "loss": 2.6564, "step": 5651 }, { "crossentropy": 2.6085290908813477, "epoch": 0.2049013921113689, "grad_norm": 0.03783940523862839, "grad_norm_var": 8.092795944760154e-06, "learning_rate": 0.009147150922423639, "loss": 2.6411, "step": 5652 }, { "crossentropy": 2.7356693744659424, "epoch": 0.20493764501160092, "grad_norm": 0.03869647532701492, "grad_norm_var": 8.097671936672472e-06, "learning_rate": 0.00914682629277861, "loss": 2.6398, "step": 5653 }, { "crossentropy": 2.6636154651641846, "epoch": 0.20497389791183296, "grad_norm": 0.03454483672976494, "grad_norm_var": 8.648674195561405e-06, "learning_rate": 0.0091465016071244, "loss": 2.7312, "step": 5654 }, { "crossentropy": 2.81756591796875, "epoch": 0.20501015081206497, "grad_norm": 0.03490680456161499, "grad_norm_var": 8.500065895051627e-06, "learning_rate": 0.009146176865465394, "loss": 2.7291, "step": 5655 }, { "crossentropy": 2.744086265563965, "epoch": 0.205046403712297, "grad_norm": 0.0346943736076355, "grad_norm_var": 8.703703974817453e-06, "learning_rate": 0.009145852067805976, "loss": 2.7706, "step": 5656 }, { "crossentropy": 2.56378173828125, "epoch": 0.205082656612529, "grad_norm": 0.031902603805065155, "grad_norm_var": 8.986308491490808e-06, "learning_rate": 0.009145527214150535, "loss": 2.5604, "step": 5657 }, { "crossentropy": 2.5786666870117188, "epoch": 0.20511890951276102, "grad_norm": 0.03317994624376297, "grad_norm_var": 5.82081375878433e-06, "learning_rate": 0.009145202304503457, "loss": 2.5288, "step": 5658 }, { "crossentropy": 2.464630126953125, "epoch": 0.20515516241299303, "grad_norm": 0.03555133193731308, "grad_norm_var": 4.568817748852131e-06, "learning_rate": 0.009144877338869132, "loss": 2.5594, "step": 5659 }, { "crossentropy": 2.5351197719573975, "epoch": 0.20519141531322505, "grad_norm": 0.0552811399102211, "grad_norm_var": 2.7217625154649094e-05, "learning_rate": 0.009144552317251949, "loss": 2.6192, "step": 5660 }, { "crossentropy": 2.709886074066162, "epoch": 0.2052276682134571, "grad_norm": 0.04144583269953728, "grad_norm_var": 2.8302548679778882e-05, "learning_rate": 0.009144227239656296, "loss": 2.6592, "step": 5661 }, { "crossentropy": 2.710822105407715, "epoch": 0.2052639211136891, "grad_norm": 0.043086789548397064, "grad_norm_var": 3.0286440765019593e-05, "learning_rate": 0.009143902106086566, "loss": 2.7875, "step": 5662 }, { "crossentropy": 2.6693367958068848, "epoch": 0.20530017401392112, "grad_norm": 0.05019703879952431, "grad_norm_var": 3.9931481578228156e-05, "learning_rate": 0.00914357691654715, "loss": 2.6486, "step": 5663 }, { "crossentropy": 2.7730560302734375, "epoch": 0.20533642691415313, "grad_norm": 0.046004243195056915, "grad_norm_var": 4.290329318960926e-05, "learning_rate": 0.009143251671042438, "loss": 2.7409, "step": 5664 }, { "crossentropy": 2.749800205230713, "epoch": 0.20537267981438515, "grad_norm": 0.038005389273166656, "grad_norm_var": 4.2256961208092006e-05, "learning_rate": 0.009142926369576825, "loss": 2.7638, "step": 5665 }, { "crossentropy": 2.7027719020843506, "epoch": 0.20540893271461716, "grad_norm": 0.03399152681231499, "grad_norm_var": 4.353225545723459e-05, "learning_rate": 0.009142601012154706, "loss": 2.6686, "step": 5666 }, { "crossentropy": 2.6903390884399414, "epoch": 0.2054451856148492, "grad_norm": 0.03710911050438881, "grad_norm_var": 4.294603775535314e-05, "learning_rate": 0.009142275598780471, "loss": 2.7225, "step": 5667 }, { "crossentropy": 2.91762113571167, "epoch": 0.20548143851508122, "grad_norm": 0.033288463950157166, "grad_norm_var": 4.503713550985152e-05, "learning_rate": 0.009141950129458521, "loss": 2.7262, "step": 5668 }, { "crossentropy": 2.577678918838501, "epoch": 0.20551769141531323, "grad_norm": 0.032109420746564865, "grad_norm_var": 4.7899496769333395e-05, "learning_rate": 0.009141624604193247, "loss": 2.6823, "step": 5669 }, { "crossentropy": 2.7407290935516357, "epoch": 0.20555394431554525, "grad_norm": 0.03391367197036743, "grad_norm_var": 4.825355495457293e-05, "learning_rate": 0.009141299022989047, "loss": 2.6519, "step": 5670 }, { "crossentropy": 2.6103806495666504, "epoch": 0.20559019721577726, "grad_norm": 0.03563892841339111, "grad_norm_var": 4.794442855762798e-05, "learning_rate": 0.009140973385850319, "loss": 2.6224, "step": 5671 }, { "crossentropy": 2.6618690490722656, "epoch": 0.20562645011600927, "grad_norm": 0.03647400811314583, "grad_norm_var": 4.724825667366516e-05, "learning_rate": 0.009140647692781462, "loss": 2.6127, "step": 5672 }, { "crossentropy": 2.6943414211273193, "epoch": 0.2056627030162413, "grad_norm": 0.03594855219125748, "grad_norm_var": 4.467256641070075e-05, "learning_rate": 0.009140321943786875, "loss": 2.6962, "step": 5673 }, { "crossentropy": 2.6145741939544678, "epoch": 0.20569895591647333, "grad_norm": 0.03767341375350952, "grad_norm_var": 4.2551453228595326e-05, "learning_rate": 0.009139996138870956, "loss": 2.5756, "step": 5674 }, { "crossentropy": 2.609811544418335, "epoch": 0.20573520881670534, "grad_norm": 0.03745100647211075, "grad_norm_var": 4.187627736733368e-05, "learning_rate": 0.009139670278038107, "loss": 2.6407, "step": 5675 }, { "crossentropy": 2.6141245365142822, "epoch": 0.20577146171693736, "grad_norm": 0.03325090557336807, "grad_norm_var": 2.5050145291902193e-05, "learning_rate": 0.009139344361292729, "loss": 2.5536, "step": 5676 }, { "crossentropy": 2.7671713829040527, "epoch": 0.20580771461716937, "grad_norm": 0.03261105716228485, "grad_norm_var": 2.5691829421676098e-05, "learning_rate": 0.009139018388639224, "loss": 2.6739, "step": 5677 }, { "crossentropy": 2.625922918319702, "epoch": 0.2058439675174014, "grad_norm": 0.03663512319326401, "grad_norm_var": 2.3312906022466124e-05, "learning_rate": 0.009138692360081992, "loss": 2.6183, "step": 5678 }, { "crossentropy": 2.6051199436187744, "epoch": 0.2058802204176334, "grad_norm": 0.043202925473451614, "grad_norm_var": 1.3964404501451364e-05, "learning_rate": 0.009138366275625441, "loss": 2.6496, "step": 5679 }, { "crossentropy": 2.7904481887817383, "epoch": 0.20591647331786542, "grad_norm": 0.03750643506646156, "grad_norm_var": 7.659980970399957e-06, "learning_rate": 0.009138040135273973, "loss": 2.6374, "step": 5680 }, { "crossentropy": 2.770355224609375, "epoch": 0.20595272621809746, "grad_norm": 0.03500474989414215, "grad_norm_var": 7.390636219200877e-06, "learning_rate": 0.009137713939031993, "loss": 2.7832, "step": 5681 }, { "crossentropy": 2.7960617542266846, "epoch": 0.20598897911832947, "grad_norm": 0.03392589092254639, "grad_norm_var": 7.406190360462137e-06, "learning_rate": 0.009137387686903907, "loss": 2.6873, "step": 5682 }, { "crossentropy": 2.8506860733032227, "epoch": 0.2060252320185615, "grad_norm": 0.04024563729763031, "grad_norm_var": 8.59613780412859e-06, "learning_rate": 0.009137061378894123, "loss": 2.7726, "step": 5683 }, { "crossentropy": 2.714996814727783, "epoch": 0.2060614849187935, "grad_norm": 0.035048626363277435, "grad_norm_var": 8.169833084521275e-06, "learning_rate": 0.009136735015007044, "loss": 2.8065, "step": 5684 }, { "crossentropy": 2.514050006866455, "epoch": 0.20609773781902552, "grad_norm": 0.03450506553053856, "grad_norm_var": 7.27301777433779e-06, "learning_rate": 0.009136408595247085, "loss": 2.5658, "step": 5685 }, { "crossentropy": 2.634424924850464, "epoch": 0.20613399071925753, "grad_norm": 0.034139204770326614, "grad_norm_var": 7.2077528110229985e-06, "learning_rate": 0.009136082119618647, "loss": 2.6852, "step": 5686 }, { "crossentropy": 2.603689670562744, "epoch": 0.20617024361948955, "grad_norm": 0.040965110063552856, "grad_norm_var": 8.579585763371771e-06, "learning_rate": 0.009135755588126144, "loss": 2.6608, "step": 5687 }, { "crossentropy": 2.7427549362182617, "epoch": 0.2062064965197216, "grad_norm": 0.03908512741327286, "grad_norm_var": 8.983869993263681e-06, "learning_rate": 0.009135429000773987, "loss": 2.6956, "step": 5688 }, { "crossentropy": 2.898383140563965, "epoch": 0.2062427494199536, "grad_norm": 0.0400468148291111, "grad_norm_var": 9.623027305726412e-06, "learning_rate": 0.009135102357566585, "loss": 2.8119, "step": 5689 }, { "crossentropy": 2.7551777362823486, "epoch": 0.20627900232018562, "grad_norm": 0.03680345043540001, "grad_norm_var": 9.587121009622483e-06, "learning_rate": 0.00913477565850835, "loss": 2.7313, "step": 5690 }, { "crossentropy": 2.563812017440796, "epoch": 0.20631525522041763, "grad_norm": 0.03611478954553604, "grad_norm_var": 9.600846784183862e-06, "learning_rate": 0.009134448903603695, "loss": 2.7142, "step": 5691 }, { "crossentropy": 2.6330466270446777, "epoch": 0.20635150812064965, "grad_norm": 0.03820106014609337, "grad_norm_var": 8.777872652202384e-06, "learning_rate": 0.009134122092857034, "loss": 2.6602, "step": 5692 }, { "crossentropy": 2.830550193786621, "epoch": 0.20638776102088166, "grad_norm": 0.03683927655220032, "grad_norm_var": 7.348998477212503e-06, "learning_rate": 0.00913379522627278, "loss": 2.8158, "step": 5693 }, { "crossentropy": 2.5981979370117188, "epoch": 0.2064240139211137, "grad_norm": 0.03519066050648689, "grad_norm_var": 7.625141062084228e-06, "learning_rate": 0.009133468303855348, "loss": 2.6052, "step": 5694 }, { "crossentropy": 2.7357847690582275, "epoch": 0.20646026682134572, "grad_norm": 0.03404410183429718, "grad_norm_var": 5.661275150357394e-06, "learning_rate": 0.009133141325609153, "loss": 2.6693, "step": 5695 }, { "crossentropy": 2.915651321411133, "epoch": 0.20649651972157773, "grad_norm": 0.0823441743850708, "grad_norm_var": 0.00013595974739704988, "learning_rate": 0.009132814291538614, "loss": 2.7744, "step": 5696 }, { "crossentropy": 2.8700766563415527, "epoch": 0.20653277262180975, "grad_norm": 0.044359419494867325, "grad_norm_var": 0.00013578296595530046, "learning_rate": 0.009132487201648144, "loss": 2.7725, "step": 5697 }, { "crossentropy": 2.647162675857544, "epoch": 0.20656902552204176, "grad_norm": 0.042312752455472946, "grad_norm_var": 0.00013325693455029815, "learning_rate": 0.009132160055942164, "loss": 2.6499, "step": 5698 }, { "crossentropy": 2.7287685871124268, "epoch": 0.20660527842227377, "grad_norm": 0.044845398515462875, "grad_norm_var": 0.00013433723195050237, "learning_rate": 0.009131832854425093, "loss": 2.7453, "step": 5699 }, { "crossentropy": 2.693969249725342, "epoch": 0.2066415313225058, "grad_norm": 0.04535533860325813, "grad_norm_var": 0.00013289715499247844, "learning_rate": 0.009131505597101347, "loss": 2.6427, "step": 5700 }, { "crossentropy": 2.600287437438965, "epoch": 0.20667778422273783, "grad_norm": 0.03976835310459137, "grad_norm_var": 0.00012966917897822786, "learning_rate": 0.00913117828397535, "loss": 2.6571, "step": 5701 }, { "crossentropy": 2.712891101837158, "epoch": 0.20671403712296985, "grad_norm": 0.060466036200523376, "grad_norm_var": 0.0001457424729121165, "learning_rate": 0.00913085091505152, "loss": 2.7323, "step": 5702 }, { "crossentropy": 2.6717276573181152, "epoch": 0.20675029002320186, "grad_norm": 0.03674941137433052, "grad_norm_var": 0.0001483041367134276, "learning_rate": 0.00913052349033428, "loss": 2.6138, "step": 5703 }, { "crossentropy": 2.699488401412964, "epoch": 0.20678654292343387, "grad_norm": 0.0352744534611702, "grad_norm_var": 0.0001513445521985882, "learning_rate": 0.009130196009828052, "loss": 2.7137, "step": 5704 }, { "crossentropy": 2.6381661891937256, "epoch": 0.2068227958236659, "grad_norm": 0.04431664198637009, "grad_norm_var": 0.0001507772789565836, "learning_rate": 0.009129868473537257, "loss": 2.6552, "step": 5705 }, { "crossentropy": 2.606962203979492, "epoch": 0.2068590487238979, "grad_norm": 0.04572504013776779, "grad_norm_var": 0.00014801023386878058, "learning_rate": 0.009129540881466322, "loss": 2.6554, "step": 5706 }, { "crossentropy": 2.618075132369995, "epoch": 0.20689530162412992, "grad_norm": 0.03689195588231087, "grad_norm_var": 0.0001472444560753757, "learning_rate": 0.009129213233619673, "loss": 2.6748, "step": 5707 }, { "crossentropy": 2.716750383377075, "epoch": 0.20693155452436196, "grad_norm": 0.03488267585635185, "grad_norm_var": 0.00015046204428955564, "learning_rate": 0.009128885530001732, "loss": 2.7746, "step": 5708 }, { "crossentropy": 2.858675479888916, "epoch": 0.20696780742459397, "grad_norm": 0.03551856428384781, "grad_norm_var": 0.00015178102430650185, "learning_rate": 0.009128557770616925, "loss": 2.772, "step": 5709 }, { "crossentropy": 2.6646981239318848, "epoch": 0.207004060324826, "grad_norm": 0.03687601536512375, "grad_norm_var": 0.00015006260491275675, "learning_rate": 0.00912822995546968, "loss": 2.6874, "step": 5710 }, { "crossentropy": 2.7745368480682373, "epoch": 0.207040313225058, "grad_norm": 0.035529375076293945, "grad_norm_var": 0.00014828169849968732, "learning_rate": 0.009127902084564425, "loss": 2.7753, "step": 5711 }, { "crossentropy": 2.6420845985412598, "epoch": 0.20707656612529002, "grad_norm": 0.03507942706346512, "grad_norm_var": 4.516357731673548e-05, "learning_rate": 0.009127574157905587, "loss": 2.6993, "step": 5712 }, { "crossentropy": 2.6483473777770996, "epoch": 0.20711281902552203, "grad_norm": 0.036278191953897476, "grad_norm_var": 4.548745631643003e-05, "learning_rate": 0.009127246175497598, "loss": 2.6641, "step": 5713 }, { "crossentropy": 2.4835381507873535, "epoch": 0.20714907192575405, "grad_norm": 0.033836837857961655, "grad_norm_var": 4.777842197482279e-05, "learning_rate": 0.009126918137344883, "loss": 2.5838, "step": 5714 }, { "crossentropy": 2.7189948558807373, "epoch": 0.2071853248259861, "grad_norm": 0.03489575535058975, "grad_norm_var": 4.7321539684735035e-05, "learning_rate": 0.00912659004345188, "loss": 2.6978, "step": 5715 }, { "crossentropy": 2.62962007522583, "epoch": 0.2072215777262181, "grad_norm": 0.03783411532640457, "grad_norm_var": 4.469963062557567e-05, "learning_rate": 0.009126261893823014, "loss": 2.6873, "step": 5716 }, { "crossentropy": 2.565721035003662, "epoch": 0.20725783062645012, "grad_norm": 0.03472163900732994, "grad_norm_var": 4.560297379031612e-05, "learning_rate": 0.00912593368846272, "loss": 2.5909, "step": 5717 }, { "crossentropy": 2.7883405685424805, "epoch": 0.20729408352668213, "grad_norm": 0.0349331870675087, "grad_norm_var": 1.1328512351734219e-05, "learning_rate": 0.00912560542737543, "loss": 2.7843, "step": 5718 }, { "crossentropy": 2.629340171813965, "epoch": 0.20733033642691415, "grad_norm": 0.03700479120016098, "grad_norm_var": 1.1329709750341049e-05, "learning_rate": 0.009125277110565577, "loss": 2.6497, "step": 5719 }, { "crossentropy": 2.6399643421173096, "epoch": 0.20736658932714616, "grad_norm": 0.03598689287900925, "grad_norm_var": 1.1211776590823857e-05, "learning_rate": 0.009124948738037595, "loss": 2.685, "step": 5720 }, { "crossentropy": 2.5587270259857178, "epoch": 0.2074028422273782, "grad_norm": 0.03401137888431549, "grad_norm_var": 7.650819357009285e-06, "learning_rate": 0.009124620309795923, "loss": 2.6041, "step": 5721 }, { "crossentropy": 2.5575766563415527, "epoch": 0.20743909512761022, "grad_norm": 0.030758006498217583, "grad_norm_var": 2.7438715517014707e-06, "learning_rate": 0.009124291825844994, "loss": 2.6357, "step": 5722 }, { "crossentropy": 2.7816720008850098, "epoch": 0.20747534802784223, "grad_norm": 0.03196243196725845, "grad_norm_var": 3.226099961057396e-06, "learning_rate": 0.009123963286189245, "loss": 2.7192, "step": 5723 }, { "crossentropy": 2.6536500453948975, "epoch": 0.20751160092807425, "grad_norm": 0.03405505046248436, "grad_norm_var": 3.2826106532476827e-06, "learning_rate": 0.009123634690833113, "loss": 2.672, "step": 5724 }, { "crossentropy": 2.753394842147827, "epoch": 0.20754785382830626, "grad_norm": 0.03470749780535698, "grad_norm_var": 3.2627910968016504e-06, "learning_rate": 0.009123306039781037, "loss": 2.7719, "step": 5725 }, { "crossentropy": 2.7177653312683105, "epoch": 0.20758410672853828, "grad_norm": 0.03480144590139389, "grad_norm_var": 2.9864172796119678e-06, "learning_rate": 0.009122977333037456, "loss": 2.7018, "step": 5726 }, { "crossentropy": 2.683804512023926, "epoch": 0.2076203596287703, "grad_norm": 0.03300405293703079, "grad_norm_var": 3.130906513614075e-06, "learning_rate": 0.009122648570606809, "loss": 2.6514, "step": 5727 }, { "crossentropy": 2.4363341331481934, "epoch": 0.20765661252900233, "grad_norm": 0.048201873898506165, "grad_norm_var": 1.470255156136555e-05, "learning_rate": 0.009122319752493535, "loss": 2.4869, "step": 5728 }, { "crossentropy": 2.5061326026916504, "epoch": 0.20769286542923435, "grad_norm": 0.03642063960433006, "grad_norm_var": 1.4719795181404695e-05, "learning_rate": 0.00912199087870208, "loss": 2.5068, "step": 5729 }, { "crossentropy": 2.5136380195617676, "epoch": 0.20772911832946636, "grad_norm": 0.032406035810709, "grad_norm_var": 1.5154725693205837e-05, "learning_rate": 0.00912166194923688, "loss": 2.6087, "step": 5730 }, { "crossentropy": 2.816812515258789, "epoch": 0.20776537122969838, "grad_norm": 0.035726554691791534, "grad_norm_var": 1.51468212333502e-05, "learning_rate": 0.009121332964102382, "loss": 2.7142, "step": 5731 }, { "crossentropy": 2.6477859020233154, "epoch": 0.2078016241299304, "grad_norm": 0.0355236791074276, "grad_norm_var": 1.4733215034919015e-05, "learning_rate": 0.009121003923303029, "loss": 2.6571, "step": 5732 }, { "crossentropy": 2.7776761054992676, "epoch": 0.2078378770301624, "grad_norm": 0.0340157151222229, "grad_norm_var": 1.4815416122553613e-05, "learning_rate": 0.009120674826843263, "loss": 2.7057, "step": 5733 }, { "crossentropy": 2.811223268508911, "epoch": 0.20787412993039442, "grad_norm": 0.034801654517650604, "grad_norm_var": 1.4821526615001522e-05, "learning_rate": 0.009120345674727528, "loss": 2.7451, "step": 5734 }, { "crossentropy": 2.6892547607421875, "epoch": 0.20791038283062646, "grad_norm": 0.035477519035339355, "grad_norm_var": 1.4602179574417492e-05, "learning_rate": 0.009120016466960274, "loss": 2.6982, "step": 5735 }, { "crossentropy": 2.490750789642334, "epoch": 0.20794663573085848, "grad_norm": 0.043602194637060165, "grad_norm_var": 1.911073141232822e-05, "learning_rate": 0.009119687203545945, "loss": 2.6265, "step": 5736 }, { "crossentropy": 2.755777359008789, "epoch": 0.2079828886310905, "grad_norm": 0.04068829491734505, "grad_norm_var": 2.0489692708576636e-05, "learning_rate": 0.009119357884488989, "loss": 2.7132, "step": 5737 }, { "crossentropy": 2.565265655517578, "epoch": 0.2080191415313225, "grad_norm": 0.04022737964987755, "grad_norm_var": 1.946350902623309e-05, "learning_rate": 0.009119028509793852, "loss": 2.582, "step": 5738 }, { "crossentropy": 2.7695250511169434, "epoch": 0.20805539443155452, "grad_norm": 0.037515319883823395, "grad_norm_var": 1.7956064266125975e-05, "learning_rate": 0.009118699079464986, "loss": 2.7212, "step": 5739 }, { "crossentropy": 2.6641721725463867, "epoch": 0.20809164733178653, "grad_norm": 0.03690886124968529, "grad_norm_var": 1.7364123986477604e-05, "learning_rate": 0.009118369593506839, "loss": 2.7471, "step": 5740 }, { "crossentropy": 2.555070400238037, "epoch": 0.20812790023201855, "grad_norm": 0.03471462428569794, "grad_norm_var": 1.736182835086793e-05, "learning_rate": 0.009118040051923858, "loss": 2.6567, "step": 5741 }, { "crossentropy": 2.6979846954345703, "epoch": 0.2081641531322506, "grad_norm": 0.03714236244559288, "grad_norm_var": 1.6978389405626162e-05, "learning_rate": 0.009117710454720498, "loss": 2.7045, "step": 5742 }, { "crossentropy": 2.5817856788635254, "epoch": 0.2082004060324826, "grad_norm": 0.03992729261517525, "grad_norm_var": 1.603292772448707e-05, "learning_rate": 0.00911738080190121, "loss": 2.5745, "step": 5743 }, { "crossentropy": 2.7181830406188965, "epoch": 0.20823665893271462, "grad_norm": 0.04027891904115677, "grad_norm_var": 8.868739763154682e-06, "learning_rate": 0.009117051093470446, "loss": 2.6762, "step": 5744 }, { "crossentropy": 2.6129918098449707, "epoch": 0.20827291183294663, "grad_norm": 0.037772227078676224, "grad_norm_var": 8.840470107224624e-06, "learning_rate": 0.009116721329432658, "loss": 2.6827, "step": 5745 }, { "crossentropy": 2.739471435546875, "epoch": 0.20830916473317865, "grad_norm": 0.03407491371035576, "grad_norm_var": 7.926544215441402e-06, "learning_rate": 0.009116391509792302, "loss": 2.7011, "step": 5746 }, { "crossentropy": 2.7262158393859863, "epoch": 0.20834541763341066, "grad_norm": 0.032252922654151917, "grad_norm_var": 9.455662430733238e-06, "learning_rate": 0.009116061634553831, "loss": 2.6372, "step": 5747 }, { "crossentropy": 2.5708091259002686, "epoch": 0.2083816705336427, "grad_norm": 0.032325271517038345, "grad_norm_var": 1.0802540417106873e-05, "learning_rate": 0.009115731703721704, "loss": 2.5494, "step": 5748 }, { "crossentropy": 2.5603902339935303, "epoch": 0.20841792343387472, "grad_norm": 0.03418656066060066, "grad_norm_var": 1.0736775293784574e-05, "learning_rate": 0.00911540171730037, "loss": 2.5692, "step": 5749 }, { "crossentropy": 2.6330955028533936, "epoch": 0.20845417633410673, "grad_norm": 0.03666289895772934, "grad_norm_var": 1.0409343429819649e-05, "learning_rate": 0.009115071675294293, "loss": 2.6953, "step": 5750 }, { "crossentropy": 2.668233871459961, "epoch": 0.20849042923433875, "grad_norm": 0.03857908025383949, "grad_norm_var": 1.0335537938950939e-05, "learning_rate": 0.009114741577707931, "loss": 2.6585, "step": 5751 }, { "crossentropy": 2.6933367252349854, "epoch": 0.20852668213457076, "grad_norm": 0.04142636060714722, "grad_norm_var": 8.804163548290871e-06, "learning_rate": 0.009114411424545735, "loss": 2.6631, "step": 5752 }, { "crossentropy": 2.575650215148926, "epoch": 0.20856293503480278, "grad_norm": 0.04071057215332985, "grad_norm_var": 8.814651765377362e-06, "learning_rate": 0.009114081215812172, "loss": 2.6092, "step": 5753 }, { "crossentropy": 2.7836499214172363, "epoch": 0.2085991879350348, "grad_norm": 0.03582945466041565, "grad_norm_var": 8.230164961008033e-06, "learning_rate": 0.009113750951511696, "loss": 2.7061, "step": 5754 }, { "crossentropy": 2.85624098777771, "epoch": 0.20863544083526683, "grad_norm": 0.03672975301742554, "grad_norm_var": 8.203680065945775e-06, "learning_rate": 0.009113420631648772, "loss": 2.8187, "step": 5755 }, { "crossentropy": 2.657461404800415, "epoch": 0.20867169373549885, "grad_norm": 0.034090861678123474, "grad_norm_var": 8.676054094210646e-06, "learning_rate": 0.00911309025622786, "loss": 2.7287, "step": 5756 }, { "crossentropy": 2.5098190307617188, "epoch": 0.20870794663573086, "grad_norm": 0.03255629166960716, "grad_norm_var": 9.529631142428327e-06, "learning_rate": 0.009112759825253423, "loss": 2.6022, "step": 5757 }, { "crossentropy": 2.6815078258514404, "epoch": 0.20874419953596288, "grad_norm": 0.03499937057495117, "grad_norm_var": 9.642859379202507e-06, "learning_rate": 0.009112429338729923, "loss": 2.5984, "step": 5758 }, { "crossentropy": 2.611128330230713, "epoch": 0.2087804524361949, "grad_norm": 0.040053967386484146, "grad_norm_var": 9.70343524616298e-06, "learning_rate": 0.009112098796661822, "loss": 2.6276, "step": 5759 }, { "crossentropy": 2.6236674785614014, "epoch": 0.2088167053364269, "grad_norm": 0.04439985752105713, "grad_norm_var": 1.2891678891457693e-05, "learning_rate": 0.009111768199053587, "loss": 2.6672, "step": 5760 }, { "crossentropy": 2.4469149112701416, "epoch": 0.20885295823665892, "grad_norm": 0.04076549410820007, "grad_norm_var": 1.3893295180994411e-05, "learning_rate": 0.009111437545909684, "loss": 2.5681, "step": 5761 }, { "crossentropy": 2.5274248123168945, "epoch": 0.20888921113689096, "grad_norm": 0.036399081349372864, "grad_norm_var": 1.3370091064723634e-05, "learning_rate": 0.009111106837234578, "loss": 2.58, "step": 5762 }, { "crossentropy": 2.9507064819335938, "epoch": 0.20892546403712298, "grad_norm": 0.037250082939863205, "grad_norm_var": 1.1769236851761657e-05, "learning_rate": 0.009110776073032734, "loss": 2.7739, "step": 5763 }, { "crossentropy": 2.738840341567993, "epoch": 0.208961716937355, "grad_norm": 0.03270144760608673, "grad_norm_var": 1.1528047495476534e-05, "learning_rate": 0.00911044525330862, "loss": 2.72, "step": 5764 }, { "crossentropy": 2.564751148223877, "epoch": 0.208997969837587, "grad_norm": 0.03413798660039902, "grad_norm_var": 1.1548578321847412e-05, "learning_rate": 0.009110114378066707, "loss": 2.5923, "step": 5765 }, { "crossentropy": 2.7048583030700684, "epoch": 0.20903422273781902, "grad_norm": 0.034334778785705566, "grad_norm_var": 1.209465984285668e-05, "learning_rate": 0.009109783447311462, "loss": 2.6914, "step": 5766 }, { "crossentropy": 2.461143970489502, "epoch": 0.20907047563805103, "grad_norm": 0.03277536481618881, "grad_norm_var": 1.3121289834108762e-05, "learning_rate": 0.009109452461047354, "loss": 2.5744, "step": 5767 }, { "crossentropy": 2.687422513961792, "epoch": 0.20910672853828308, "grad_norm": 0.032365329563617706, "grad_norm_var": 1.2690640819368095e-05, "learning_rate": 0.009109121419278854, "loss": 2.6515, "step": 5768 }, { "crossentropy": 2.7390384674072266, "epoch": 0.2091429814385151, "grad_norm": 0.03266157582402229, "grad_norm_var": 1.1959390337756606e-05, "learning_rate": 0.009108790322010434, "loss": 2.6861, "step": 5769 }, { "crossentropy": 2.5013365745544434, "epoch": 0.2091792343387471, "grad_norm": 0.03810862824320793, "grad_norm_var": 1.2307237398496925e-05, "learning_rate": 0.009108459169246564, "loss": 2.5717, "step": 5770 }, { "crossentropy": 2.7351009845733643, "epoch": 0.20921548723897912, "grad_norm": 0.04008040949702263, "grad_norm_var": 1.3381572351965169e-05, "learning_rate": 0.00910812796099172, "loss": 2.6922, "step": 5771 }, { "crossentropy": 2.6073620319366455, "epoch": 0.20925174013921113, "grad_norm": 0.03456796333193779, "grad_norm_var": 1.326767037973699e-05, "learning_rate": 0.009107796697250372, "loss": 2.5825, "step": 5772 }, { "crossentropy": 2.650667428970337, "epoch": 0.20928799303944315, "grad_norm": 0.03667358681559563, "grad_norm_var": 1.2362646035000921e-05, "learning_rate": 0.009107465378026997, "loss": 2.6641, "step": 5773 }, { "crossentropy": 2.6977946758270264, "epoch": 0.20932424593967516, "grad_norm": 0.03365597873926163, "grad_norm_var": 1.2724918897916564e-05, "learning_rate": 0.009107134003326068, "loss": 2.676, "step": 5774 }, { "crossentropy": 2.7906908988952637, "epoch": 0.2093604988399072, "grad_norm": 0.033678337931632996, "grad_norm_var": 1.2081260630482833e-05, "learning_rate": 0.009106802573152063, "loss": 2.6886, "step": 5775 }, { "crossentropy": 2.774451971054077, "epoch": 0.20939675174013922, "grad_norm": 0.038646191358566284, "grad_norm_var": 7.637065574907693e-06, "learning_rate": 0.009106471087509454, "loss": 2.7411, "step": 5776 }, { "crossentropy": 2.6780409812927246, "epoch": 0.20943300464037123, "grad_norm": 0.03895192965865135, "grad_norm_var": 6.58151159775294e-06, "learning_rate": 0.009106139546402723, "loss": 2.7421, "step": 5777 }, { "crossentropy": 2.7036609649658203, "epoch": 0.20946925754060325, "grad_norm": 0.039554744958877563, "grad_norm_var": 7.6087880684207994e-06, "learning_rate": 0.009105807949836347, "loss": 2.7353, "step": 5778 }, { "crossentropy": 2.665384531021118, "epoch": 0.20950551044083526, "grad_norm": 0.03833615779876709, "grad_norm_var": 7.916532373906474e-06, "learning_rate": 0.009105476297814802, "loss": 2.6632, "step": 5779 }, { "crossentropy": 2.7004804611206055, "epoch": 0.20954176334106728, "grad_norm": 0.037552908062934875, "grad_norm_var": 7.44669687415637e-06, "learning_rate": 0.00910514459034257, "loss": 2.6662, "step": 5780 }, { "crossentropy": 2.5395476818084717, "epoch": 0.2095780162412993, "grad_norm": 0.035372331738471985, "grad_norm_var": 7.234631258503053e-06, "learning_rate": 0.009104812827424131, "loss": 2.6071, "step": 5781 }, { "crossentropy": 2.7323355674743652, "epoch": 0.20961426914153133, "grad_norm": 0.03490687161684036, "grad_norm_var": 7.121790427431588e-06, "learning_rate": 0.009104481009063966, "loss": 2.7297, "step": 5782 }, { "crossentropy": 2.7682247161865234, "epoch": 0.20965052204176335, "grad_norm": 0.030959444120526314, "grad_norm_var": 8.137221170640889e-06, "learning_rate": 0.009104149135266556, "loss": 2.6574, "step": 5783 }, { "crossentropy": 2.618924140930176, "epoch": 0.20968677494199536, "grad_norm": 0.03215128183364868, "grad_norm_var": 8.243946214678725e-06, "learning_rate": 0.009103817206036383, "loss": 2.6543, "step": 5784 }, { "crossentropy": 2.639503240585327, "epoch": 0.20972302784222738, "grad_norm": 0.03356705978512764, "grad_norm_var": 7.893207012626954e-06, "learning_rate": 0.00910348522137793, "loss": 2.6892, "step": 5785 }, { "crossentropy": 2.7754030227661133, "epoch": 0.2097592807424594, "grad_norm": 0.03583543375134468, "grad_norm_var": 7.591530088937509e-06, "learning_rate": 0.009103153181295683, "loss": 2.7453, "step": 5786 }, { "crossentropy": 2.6785707473754883, "epoch": 0.2097955336426914, "grad_norm": 0.04230528324842453, "grad_norm_var": 9.139346466453914e-06, "learning_rate": 0.009102821085794124, "loss": 2.6332, "step": 5787 }, { "crossentropy": 2.6453752517700195, "epoch": 0.20983178654292342, "grad_norm": 0.04749751091003418, "grad_norm_var": 1.7041833731945205e-05, "learning_rate": 0.009102488934877742, "loss": 2.6877, "step": 5788 }, { "crossentropy": 2.621901512145996, "epoch": 0.20986803944315546, "grad_norm": 0.039713531732559204, "grad_norm_var": 1.7546766647171884e-05, "learning_rate": 0.009102156728551022, "loss": 2.6789, "step": 5789 }, { "crossentropy": 2.7709522247314453, "epoch": 0.20990429234338748, "grad_norm": 0.035065263509750366, "grad_norm_var": 1.7034495137064816e-05, "learning_rate": 0.009101824466818446, "loss": 2.708, "step": 5790 }, { "crossentropy": 2.671391010284424, "epoch": 0.2099405452436195, "grad_norm": 0.03308931365609169, "grad_norm_var": 1.732733129057132e-05, "learning_rate": 0.009101492149684509, "loss": 2.5825, "step": 5791 }, { "crossentropy": 2.5494449138641357, "epoch": 0.2099767981438515, "grad_norm": 0.0334356464445591, "grad_norm_var": 1.7945878675134025e-05, "learning_rate": 0.009101159777153697, "loss": 2.6335, "step": 5792 }, { "crossentropy": 2.6378304958343506, "epoch": 0.21001305104408352, "grad_norm": 0.03633727878332138, "grad_norm_var": 1.7611938076788355e-05, "learning_rate": 0.009100827349230496, "loss": 2.6415, "step": 5793 }, { "crossentropy": 2.728809118270874, "epoch": 0.21004930394431554, "grad_norm": 0.03986146301031113, "grad_norm_var": 1.773844966960248e-05, "learning_rate": 0.0091004948659194, "loss": 2.6785, "step": 5794 }, { "crossentropy": 2.595892906188965, "epoch": 0.21008555684454758, "grad_norm": 0.03303467854857445, "grad_norm_var": 1.828491508824302e-05, "learning_rate": 0.009100162327224898, "loss": 2.5818, "step": 5795 }, { "crossentropy": 2.6544249057769775, "epoch": 0.2101218097447796, "grad_norm": 0.03303580731153488, "grad_norm_var": 1.880125837732612e-05, "learning_rate": 0.00909982973315148, "loss": 2.6412, "step": 5796 }, { "crossentropy": 2.7294273376464844, "epoch": 0.2101580626450116, "grad_norm": 0.03764323145151138, "grad_norm_var": 1.89303373703764e-05, "learning_rate": 0.009099497083703643, "loss": 2.6173, "step": 5797 }, { "crossentropy": 2.6648449897766113, "epoch": 0.21019431554524362, "grad_norm": 0.03684775158762932, "grad_norm_var": 1.8843441682256364e-05, "learning_rate": 0.009099164378885874, "loss": 2.6567, "step": 5798 }, { "crossentropy": 2.524325370788574, "epoch": 0.21023056844547564, "grad_norm": 0.038913194090127945, "grad_norm_var": 1.7161505326304615e-05, "learning_rate": 0.00909883161870267, "loss": 2.5511, "step": 5799 }, { "crossentropy": 2.623656749725342, "epoch": 0.21026682134570765, "grad_norm": 0.04062304645776749, "grad_norm_var": 1.642905169063202e-05, "learning_rate": 0.009098498803158524, "loss": 2.622, "step": 5800 }, { "crossentropy": 2.7950258255004883, "epoch": 0.21030307424593966, "grad_norm": 0.03787548467516899, "grad_norm_var": 1.5444599903172094e-05, "learning_rate": 0.009098165932257934, "loss": 2.7185, "step": 5801 }, { "crossentropy": 2.785738468170166, "epoch": 0.2103393271461717, "grad_norm": 0.03665851056575775, "grad_norm_var": 1.5296625074939783e-05, "learning_rate": 0.009097833006005392, "loss": 2.7317, "step": 5802 }, { "crossentropy": 2.8127732276916504, "epoch": 0.21037558004640372, "grad_norm": 0.0376957431435585, "grad_norm_var": 1.3745669049300412e-05, "learning_rate": 0.009097500024405398, "loss": 2.7181, "step": 5803 }, { "crossentropy": 2.7017109394073486, "epoch": 0.21041183294663574, "grad_norm": 0.037935443222522736, "grad_norm_var": 6.5010313343843535e-06, "learning_rate": 0.009097166987462447, "loss": 2.6966, "step": 5804 }, { "crossentropy": 2.6506073474884033, "epoch": 0.21044808584686775, "grad_norm": 0.033600304275751114, "grad_norm_var": 6.4092354367208e-06, "learning_rate": 0.009096833895181037, "loss": 2.6906, "step": 5805 }, { "crossentropy": 2.6360020637512207, "epoch": 0.21048433874709976, "grad_norm": 0.03994546830654144, "grad_norm_var": 7.059668782207797e-06, "learning_rate": 0.00909650074756567, "loss": 2.6968, "step": 5806 }, { "crossentropy": 2.7368202209472656, "epoch": 0.21052059164733178, "grad_norm": 0.03760495409369469, "grad_norm_var": 6.18528862576604e-06, "learning_rate": 0.009096167544620844, "loss": 2.6359, "step": 5807 }, { "crossentropy": 2.6179490089416504, "epoch": 0.2105568445475638, "grad_norm": 0.04282989725470543, "grad_norm_var": 7.3109718125925646e-06, "learning_rate": 0.009095834286351058, "loss": 2.5855, "step": 5808 }, { "crossentropy": 2.9316470623016357, "epoch": 0.21059309744779584, "grad_norm": 0.03688153624534607, "grad_norm_var": 7.24310351809711e-06, "learning_rate": 0.009095500972760816, "loss": 2.7825, "step": 5809 }, { "crossentropy": 2.6766932010650635, "epoch": 0.21062935034802785, "grad_norm": 0.031927842646837234, "grad_norm_var": 8.744227714648408e-06, "learning_rate": 0.009095167603854615, "loss": 2.6811, "step": 5810 }, { "crossentropy": 2.673597574234009, "epoch": 0.21066560324825986, "grad_norm": 0.03280767425894737, "grad_norm_var": 8.869459491319306e-06, "learning_rate": 0.009094834179636965, "loss": 2.7154, "step": 5811 }, { "crossentropy": 2.664161205291748, "epoch": 0.21070185614849188, "grad_norm": 0.031388696283102036, "grad_norm_var": 9.920951910766193e-06, "learning_rate": 0.00909450070011236, "loss": 2.6365, "step": 5812 }, { "crossentropy": 2.622236490249634, "epoch": 0.2107381090487239, "grad_norm": 0.03627443686127663, "grad_norm_var": 9.911290898795277e-06, "learning_rate": 0.009094167165285313, "loss": 2.6436, "step": 5813 }, { "crossentropy": 2.5336461067199707, "epoch": 0.2107743619489559, "grad_norm": 0.040373895317316055, "grad_norm_var": 1.068116912179909e-05, "learning_rate": 0.009093833575160326, "loss": 2.6419, "step": 5814 }, { "crossentropy": 2.652097225189209, "epoch": 0.21081061484918792, "grad_norm": 0.04293722286820412, "grad_norm_var": 1.267491420487224e-05, "learning_rate": 0.009093499929741903, "loss": 2.6735, "step": 5815 }, { "crossentropy": 2.676265001296997, "epoch": 0.21084686774941996, "grad_norm": 0.03906615078449249, "grad_norm_var": 1.2143858786460108e-05, "learning_rate": 0.009093166229034551, "loss": 2.6625, "step": 5816 }, { "crossentropy": 2.631638526916504, "epoch": 0.21088312064965198, "grad_norm": 0.0363311693072319, "grad_norm_var": 1.216159101780952e-05, "learning_rate": 0.009092832473042779, "loss": 2.7053, "step": 5817 }, { "crossentropy": 2.6136229038238525, "epoch": 0.210919373549884, "grad_norm": 0.03696021810173988, "grad_norm_var": 1.2147863400692923e-05, "learning_rate": 0.00909249866177109, "loss": 2.6193, "step": 5818 }, { "crossentropy": 2.7839643955230713, "epoch": 0.210955626450116, "grad_norm": 0.03755759820342064, "grad_norm_var": 1.2139188876662762e-05, "learning_rate": 0.009092164795223998, "loss": 2.6664, "step": 5819 }, { "crossentropy": 2.733377695083618, "epoch": 0.21099187935034802, "grad_norm": 0.03538079559803009, "grad_norm_var": 1.2280019652718908e-05, "learning_rate": 0.009091830873406011, "loss": 2.7101, "step": 5820 }, { "crossentropy": 2.677398443222046, "epoch": 0.21102813225058004, "grad_norm": 0.034792620688676834, "grad_norm_var": 1.182971533808848e-05, "learning_rate": 0.009091496896321638, "loss": 2.7126, "step": 5821 }, { "crossentropy": 2.633310317993164, "epoch": 0.21106438515081208, "grad_norm": 0.03374738246202469, "grad_norm_var": 1.1851322265572262e-05, "learning_rate": 0.00909116286397539, "loss": 2.7141, "step": 5822 }, { "crossentropy": 2.78354811668396, "epoch": 0.2111006380510441, "grad_norm": 0.036076921969652176, "grad_norm_var": 1.1808576412492867e-05, "learning_rate": 0.009090828776371778, "loss": 2.719, "step": 5823 }, { "crossentropy": 2.6378402709960938, "epoch": 0.2111368909512761, "grad_norm": 0.04138459265232086, "grad_norm_var": 1.0735383393261262e-05, "learning_rate": 0.009090494633515316, "loss": 2.7222, "step": 5824 }, { "crossentropy": 2.5850367546081543, "epoch": 0.21117314385150812, "grad_norm": 0.039208658039569855, "grad_norm_var": 1.1194393411931985e-05, "learning_rate": 0.009090160435410517, "loss": 2.5956, "step": 5825 }, { "crossentropy": 2.718972682952881, "epoch": 0.21120939675174014, "grad_norm": 0.037453822791576385, "grad_norm_var": 9.632127788175512e-06, "learning_rate": 0.009089826182061894, "loss": 2.6707, "step": 5826 }, { "crossentropy": 2.584808826446533, "epoch": 0.21124564965197215, "grad_norm": 0.03499801456928253, "grad_norm_var": 8.712339720061875e-06, "learning_rate": 0.009089491873473963, "loss": 2.5892, "step": 5827 }, { "crossentropy": 2.723626136779785, "epoch": 0.21128190255220416, "grad_norm": 0.03616165369749069, "grad_norm_var": 6.488305621034305e-06, "learning_rate": 0.009089157509651237, "loss": 2.6883, "step": 5828 }, { "crossentropy": 2.7138450145721436, "epoch": 0.2113181554524362, "grad_norm": 0.034100744873285294, "grad_norm_var": 7.1153587733689565e-06, "learning_rate": 0.009088823090598233, "loss": 2.6671, "step": 5829 }, { "crossentropy": 2.7298240661621094, "epoch": 0.21135440835266822, "grad_norm": 0.03160259500145912, "grad_norm_var": 8.309270837044726e-06, "learning_rate": 0.00908848861631947, "loss": 2.6341, "step": 5830 }, { "crossentropy": 2.670992136001587, "epoch": 0.21139066125290024, "grad_norm": 0.035214949399232864, "grad_norm_var": 5.6503406436785844e-06, "learning_rate": 0.00908815408681946, "loss": 2.6654, "step": 5831 }, { "crossentropy": 2.6822409629821777, "epoch": 0.21142691415313225, "grad_norm": 0.03364953026175499, "grad_norm_var": 5.451917624332605e-06, "learning_rate": 0.00908781950210273, "loss": 2.7473, "step": 5832 }, { "crossentropy": 2.6598570346832275, "epoch": 0.21146316705336426, "grad_norm": 0.03544985502958298, "grad_norm_var": 5.451421266540699e-06, "learning_rate": 0.00908748486217379, "loss": 2.7179, "step": 5833 }, { "crossentropy": 2.6139895915985107, "epoch": 0.21149941995359628, "grad_norm": 0.04046355187892914, "grad_norm_var": 6.733014904972539e-06, "learning_rate": 0.009087150167037166, "loss": 2.5969, "step": 5834 }, { "crossentropy": 2.7157933712005615, "epoch": 0.2115356728538283, "grad_norm": 0.04629266634583473, "grad_norm_var": 1.322544858280052e-05, "learning_rate": 0.009086815416697376, "loss": 2.7234, "step": 5835 }, { "crossentropy": 2.5554938316345215, "epoch": 0.21157192575406034, "grad_norm": 0.04410925135016441, "grad_norm_var": 1.6540646455189222e-05, "learning_rate": 0.009086480611158942, "loss": 2.5951, "step": 5836 }, { "crossentropy": 2.6895217895507812, "epoch": 0.21160817865429235, "grad_norm": 0.03345344215631485, "grad_norm_var": 1.7077084759709904e-05, "learning_rate": 0.009086145750426386, "loss": 2.7359, "step": 5837 }, { "crossentropy": 2.599395513534546, "epoch": 0.21164443155452436, "grad_norm": 0.03226907551288605, "grad_norm_var": 1.7871635510662414e-05, "learning_rate": 0.00908581083450423, "loss": 2.6363, "step": 5838 }, { "crossentropy": 2.7275614738464355, "epoch": 0.21168068445475638, "grad_norm": 0.03426719084382057, "grad_norm_var": 1.8297398246192437e-05, "learning_rate": 0.009085475863396999, "loss": 2.7548, "step": 5839 }, { "crossentropy": 2.7824347019195557, "epoch": 0.2117169373549884, "grad_norm": 0.033726658672094345, "grad_norm_var": 1.7363169663538643e-05, "learning_rate": 0.009085140837109216, "loss": 2.7484, "step": 5840 }, { "crossentropy": 2.7380239963531494, "epoch": 0.2117531902552204, "grad_norm": 0.033631354570388794, "grad_norm_var": 1.721968924698498e-05, "learning_rate": 0.009084805755645407, "loss": 2.6655, "step": 5841 }, { "crossentropy": 2.5834262371063232, "epoch": 0.21178944315545242, "grad_norm": 0.03527040034532547, "grad_norm_var": 1.7109769586244837e-05, "learning_rate": 0.009084470619010097, "loss": 2.6111, "step": 5842 }, { "crossentropy": 2.636622428894043, "epoch": 0.21182569605568446, "grad_norm": 0.032933615148067474, "grad_norm_var": 1.7628892015763193e-05, "learning_rate": 0.009084135427207814, "loss": 2.6151, "step": 5843 }, { "crossentropy": 2.684128522872925, "epoch": 0.21186194895591648, "grad_norm": 0.034325968474149704, "grad_norm_var": 1.7747870654794864e-05, "learning_rate": 0.009083800180243082, "loss": 2.6574, "step": 5844 }, { "crossentropy": 2.7143425941467285, "epoch": 0.2118982018561485, "grad_norm": 0.038497209548950195, "grad_norm_var": 1.8034540418498946e-05, "learning_rate": 0.009083464878120433, "loss": 2.6201, "step": 5845 }, { "crossentropy": 2.5292322635650635, "epoch": 0.2119344547563805, "grad_norm": 0.04359612986445427, "grad_norm_var": 2.007701111660168e-05, "learning_rate": 0.009083129520844392, "loss": 2.6249, "step": 5846 }, { "crossentropy": 2.7845041751861572, "epoch": 0.21197070765661252, "grad_norm": 0.04106370732188225, "grad_norm_var": 2.105931124797289e-05, "learning_rate": 0.009082794108419494, "loss": 2.748, "step": 5847 }, { "crossentropy": 2.797694206237793, "epoch": 0.21200696055684454, "grad_norm": 0.038622815161943436, "grad_norm_var": 2.0342019075478972e-05, "learning_rate": 0.009082458640850262, "loss": 2.7144, "step": 5848 }, { "crossentropy": 2.37870454788208, "epoch": 0.21204321345707658, "grad_norm": 0.038927771151065826, "grad_norm_var": 2.0206066150629577e-05, "learning_rate": 0.009082123118141232, "loss": 2.5534, "step": 5849 }, { "crossentropy": 2.5961270332336426, "epoch": 0.2120794663573086, "grad_norm": 0.0368998758494854, "grad_norm_var": 1.9634736080303913e-05, "learning_rate": 0.009081787540296935, "loss": 2.6474, "step": 5850 }, { "crossentropy": 2.6348750591278076, "epoch": 0.2121157192575406, "grad_norm": 0.03453000262379646, "grad_norm_var": 1.4285120601959373e-05, "learning_rate": 0.009081451907321902, "loss": 2.6865, "step": 5851 }, { "crossentropy": 2.812196969985962, "epoch": 0.21215197215777262, "grad_norm": 0.035052090883255005, "grad_norm_var": 1.0383382707886366e-05, "learning_rate": 0.009081116219220668, "loss": 2.7591, "step": 5852 }, { "crossentropy": 2.5528147220611572, "epoch": 0.21218822505800464, "grad_norm": 0.034155551344156265, "grad_norm_var": 1.0169552928251425e-05, "learning_rate": 0.009080780475997767, "loss": 2.5579, "step": 5853 }, { "crossentropy": 2.7615957260131836, "epoch": 0.21222447795823665, "grad_norm": 0.03429167717695236, "grad_norm_var": 9.389255174590121e-06, "learning_rate": 0.00908044467765773, "loss": 2.7326, "step": 5854 }, { "crossentropy": 2.5985805988311768, "epoch": 0.21226073085846867, "grad_norm": 0.03366731479763985, "grad_norm_var": 9.569298151005228e-06, "learning_rate": 0.009080108824205098, "loss": 2.5826, "step": 5855 }, { "crossentropy": 2.715437412261963, "epoch": 0.2122969837587007, "grad_norm": 0.033053744584321976, "grad_norm_var": 9.819467756967161e-06, "learning_rate": 0.009079772915644404, "loss": 2.7565, "step": 5856 }, { "crossentropy": 2.549717903137207, "epoch": 0.21233323665893272, "grad_norm": 0.03474438935518265, "grad_norm_var": 9.522011113708278e-06, "learning_rate": 0.009079436951980185, "loss": 2.6454, "step": 5857 }, { "crossentropy": 2.6902060508728027, "epoch": 0.21236948955916474, "grad_norm": 0.038811247795820236, "grad_norm_var": 9.853980250019526e-06, "learning_rate": 0.009079100933216979, "loss": 2.6643, "step": 5858 }, { "crossentropy": 2.726590633392334, "epoch": 0.21240574245939675, "grad_norm": 0.03961850330233574, "grad_norm_var": 9.51424267166817e-06, "learning_rate": 0.009078764859359325, "loss": 2.6926, "step": 5859 }, { "crossentropy": 2.790983200073242, "epoch": 0.21244199535962877, "grad_norm": 0.03787993639707565, "grad_norm_var": 9.099976052526216e-06, "learning_rate": 0.009078428730411764, "loss": 2.7273, "step": 5860 }, { "crossentropy": 2.755248546600342, "epoch": 0.21247824825986078, "grad_norm": 0.03338466212153435, "grad_norm_var": 9.773158809193473e-06, "learning_rate": 0.00907809254637883, "loss": 2.6709, "step": 5861 }, { "crossentropy": 2.704472780227661, "epoch": 0.2125145011600928, "grad_norm": 0.03399871662259102, "grad_norm_var": 6.793317252838139e-06, "learning_rate": 0.009077756307265069, "loss": 2.7635, "step": 5862 }, { "crossentropy": 2.776517152786255, "epoch": 0.21255075406032484, "grad_norm": 0.03816525265574455, "grad_norm_var": 5.426722430858638e-06, "learning_rate": 0.009077420013075021, "loss": 2.6565, "step": 5863 }, { "crossentropy": 2.5755491256713867, "epoch": 0.21258700696055685, "grad_norm": 0.03944341465830803, "grad_norm_var": 5.7571230524512565e-06, "learning_rate": 0.009077083663813228, "loss": 2.627, "step": 5864 }, { "crossentropy": 2.5099456310272217, "epoch": 0.21262325986078887, "grad_norm": 0.03747496008872986, "grad_norm_var": 5.329462642054914e-06, "learning_rate": 0.009076747259484232, "loss": 2.6047, "step": 5865 }, { "crossentropy": 2.776674270629883, "epoch": 0.21265951276102088, "grad_norm": 0.037785060703754425, "grad_norm_var": 5.490754828001617e-06, "learning_rate": 0.009076410800092579, "loss": 2.7643, "step": 5866 }, { "crossentropy": 2.6389338970184326, "epoch": 0.2126957656612529, "grad_norm": 0.03882809728384018, "grad_norm_var": 5.800906299289974e-06, "learning_rate": 0.00907607428564281, "loss": 2.6702, "step": 5867 }, { "crossentropy": 2.68644642829895, "epoch": 0.2127320185614849, "grad_norm": 0.03454681485891342, "grad_norm_var": 5.899059265293954e-06, "learning_rate": 0.009075737716139474, "loss": 2.6772, "step": 5868 }, { "crossentropy": 2.5676121711730957, "epoch": 0.21276827146171692, "grad_norm": 0.033384162932634354, "grad_norm_var": 6.150698605871379e-06, "learning_rate": 0.009075401091587115, "loss": 2.5985, "step": 5869 }, { "crossentropy": 2.8618886470794678, "epoch": 0.21280452436194897, "grad_norm": 0.03417075052857399, "grad_norm_var": 6.182258516564352e-06, "learning_rate": 0.00907506441199028, "loss": 2.7648, "step": 5870 }, { "crossentropy": 2.8179054260253906, "epoch": 0.21284077726218098, "grad_norm": 0.03685314208269119, "grad_norm_var": 5.747226173737479e-06, "learning_rate": 0.009074727677353513, "loss": 2.7254, "step": 5871 }, { "crossentropy": 2.706003427505493, "epoch": 0.212877030162413, "grad_norm": 0.037462420761585236, "grad_norm_var": 5.004442444351646e-06, "learning_rate": 0.009074390887681368, "loss": 2.7403, "step": 5872 }, { "crossentropy": 2.695753812789917, "epoch": 0.212913283062645, "grad_norm": 0.04002629965543747, "grad_norm_var": 5.3993983780105856e-06, "learning_rate": 0.00907405404297839, "loss": 2.6107, "step": 5873 }, { "crossentropy": 2.8113760948181152, "epoch": 0.21294953596287702, "grad_norm": 0.04685455560684204, "grad_norm_var": 1.1396443831500288e-05, "learning_rate": 0.00907371714324913, "loss": 2.7238, "step": 5874 }, { "crossentropy": 2.7882585525512695, "epoch": 0.21298578886310904, "grad_norm": 0.03135843947529793, "grad_norm_var": 1.3319054644542137e-05, "learning_rate": 0.009073380188498138, "loss": 2.7026, "step": 5875 }, { "crossentropy": 2.8555595874786377, "epoch": 0.21302204176334108, "grad_norm": 0.03321634232997894, "grad_norm_var": 1.4116321683895796e-05, "learning_rate": 0.009073043178729964, "loss": 2.7881, "step": 5876 }, { "crossentropy": 3.0067434310913086, "epoch": 0.2130582946635731, "grad_norm": 0.0472981296479702, "grad_norm_var": 2.0093606236654995e-05, "learning_rate": 0.00907270611394916, "loss": 2.8806, "step": 5877 }, { "crossentropy": 2.808084726333618, "epoch": 0.2130945475638051, "grad_norm": 0.03560250625014305, "grad_norm_var": 1.9494073966505077e-05, "learning_rate": 0.009072368994160283, "loss": 2.7524, "step": 5878 }, { "crossentropy": 2.749206066131592, "epoch": 0.21313080046403712, "grad_norm": 0.036686819046735764, "grad_norm_var": 1.9529982150864356e-05, "learning_rate": 0.00907203181936788, "loss": 2.674, "step": 5879 }, { "crossentropy": 2.6457831859588623, "epoch": 0.21316705336426914, "grad_norm": 0.0380748026072979, "grad_norm_var": 1.930372630605381e-05, "learning_rate": 0.009071694589576507, "loss": 2.6982, "step": 5880 }, { "crossentropy": 2.7380077838897705, "epoch": 0.21320330626450115, "grad_norm": 0.04021875560283661, "grad_norm_var": 1.977370472699315e-05, "learning_rate": 0.00907135730479072, "loss": 2.8179, "step": 5881 }, { "crossentropy": 2.6072375774383545, "epoch": 0.21323955916473317, "grad_norm": 0.03857141360640526, "grad_norm_var": 1.982672797017267e-05, "learning_rate": 0.009071019965015076, "loss": 2.6036, "step": 5882 }, { "crossentropy": 2.6496894359588623, "epoch": 0.2132758120649652, "grad_norm": 0.03888092190027237, "grad_norm_var": 1.9834868370734633e-05, "learning_rate": 0.009070682570254129, "loss": 2.6986, "step": 5883 }, { "crossentropy": 2.6168646812438965, "epoch": 0.21331206496519722, "grad_norm": 0.036991022527217865, "grad_norm_var": 1.9180519725644126e-05, "learning_rate": 0.009070345120512436, "loss": 2.6048, "step": 5884 }, { "crossentropy": 2.8319735527038574, "epoch": 0.21334831786542924, "grad_norm": 0.03890679404139519, "grad_norm_var": 1.7795989464606395e-05, "learning_rate": 0.009070007615794554, "loss": 2.7071, "step": 5885 }, { "crossentropy": 2.810610771179199, "epoch": 0.21338457076566125, "grad_norm": 0.03807665407657623, "grad_norm_var": 1.665198837831333e-05, "learning_rate": 0.009069670056105044, "loss": 2.7517, "step": 5886 }, { "crossentropy": 2.759809970855713, "epoch": 0.21342082366589327, "grad_norm": 0.03581051528453827, "grad_norm_var": 1.6940869392612484e-05, "learning_rate": 0.009069332441448462, "loss": 2.7134, "step": 5887 }, { "crossentropy": 2.7123398780822754, "epoch": 0.21345707656612528, "grad_norm": 0.03500683978199959, "grad_norm_var": 1.761726976481151e-05, "learning_rate": 0.009068994771829372, "loss": 2.7771, "step": 5888 }, { "crossentropy": 2.5813939571380615, "epoch": 0.2134933294663573, "grad_norm": 0.03233732655644417, "grad_norm_var": 1.9464373523655523e-05, "learning_rate": 0.009068657047252333, "loss": 2.6318, "step": 5889 }, { "crossentropy": 2.680919647216797, "epoch": 0.21352958236658934, "grad_norm": 0.0323096364736557, "grad_norm_var": 1.5016760533159643e-05, "learning_rate": 0.009068319267721904, "loss": 2.6607, "step": 5890 }, { "crossentropy": 2.789212703704834, "epoch": 0.21356583526682135, "grad_norm": 0.03229571133852005, "grad_norm_var": 1.4387364132800497e-05, "learning_rate": 0.009067981433242651, "loss": 2.6636, "step": 5891 }, { "crossentropy": 2.6196937561035156, "epoch": 0.21360208816705337, "grad_norm": 0.0333213172852993, "grad_norm_var": 1.4336595268693517e-05, "learning_rate": 0.009067643543819136, "loss": 2.6046, "step": 5892 }, { "crossentropy": 2.778636932373047, "epoch": 0.21363834106728538, "grad_norm": 0.0363902822136879, "grad_norm_var": 6.649103212189034e-06, "learning_rate": 0.00906730559945592, "loss": 2.7707, "step": 5893 }, { "crossentropy": 2.5164332389831543, "epoch": 0.2136745939675174, "grad_norm": 0.0356103740632534, "grad_norm_var": 6.648461840522146e-06, "learning_rate": 0.00906696760015757, "loss": 2.5085, "step": 5894 }, { "crossentropy": 2.7325708866119385, "epoch": 0.2137108468677494, "grad_norm": 0.032872095704078674, "grad_norm_var": 7.3195513637865896e-06, "learning_rate": 0.009066629545928653, "loss": 2.6829, "step": 5895 }, { "crossentropy": 2.8117167949676514, "epoch": 0.21374709976798145, "grad_norm": 0.03346487134695053, "grad_norm_var": 7.3599691588988044e-06, "learning_rate": 0.00906629143677373, "loss": 2.7292, "step": 5896 }, { "crossentropy": 2.879000186920166, "epoch": 0.21378335266821347, "grad_norm": 0.03465183824300766, "grad_norm_var": 5.936523344422101e-06, "learning_rate": 0.009065953272697371, "loss": 2.7639, "step": 5897 }, { "crossentropy": 2.732769250869751, "epoch": 0.21381960556844548, "grad_norm": 0.032937563955783844, "grad_norm_var": 5.495621515884339e-06, "learning_rate": 0.009065615053704143, "loss": 2.6757, "step": 5898 }, { "crossentropy": 2.60219407081604, "epoch": 0.2138558584686775, "grad_norm": 0.03165678307414055, "grad_norm_var": 5.011005549204519e-06, "learning_rate": 0.009065276779798613, "loss": 2.6758, "step": 5899 }, { "crossentropy": 2.7885775566101074, "epoch": 0.2138921113689095, "grad_norm": 0.03622191771864891, "grad_norm_var": 4.796627528311168e-06, "learning_rate": 0.009064938450985353, "loss": 2.6924, "step": 5900 }, { "crossentropy": 2.6639769077301025, "epoch": 0.21392836426914152, "grad_norm": 0.03495961055159569, "grad_norm_var": 3.446878024880526e-06, "learning_rate": 0.009064600067268929, "loss": 2.6249, "step": 5901 }, { "crossentropy": 2.74399995803833, "epoch": 0.21396461716937354, "grad_norm": 0.0324588418006897, "grad_norm_var": 2.5494542599895895e-06, "learning_rate": 0.009064261628653912, "loss": 2.6708, "step": 5902 }, { "crossentropy": 2.644028425216675, "epoch": 0.21400087006960558, "grad_norm": 0.031682200729846954, "grad_norm_var": 2.559762661428061e-06, "learning_rate": 0.009063923135144876, "loss": 2.6234, "step": 5903 }, { "crossentropy": 2.6858553886413574, "epoch": 0.2140371229698376, "grad_norm": 0.03398830443620682, "grad_norm_var": 2.4384448265817964e-06, "learning_rate": 0.009063584586746389, "loss": 2.6732, "step": 5904 }, { "crossentropy": 2.641683340072632, "epoch": 0.2140733758700696, "grad_norm": 0.03553806245326996, "grad_norm_var": 2.55164601688686e-06, "learning_rate": 0.009063245983463027, "loss": 2.5015, "step": 5905 }, { "crossentropy": 2.6664416790008545, "epoch": 0.21410962877030162, "grad_norm": 0.03649128973484039, "grad_norm_var": 2.8289304566765532e-06, "learning_rate": 0.00906290732529936, "loss": 2.6737, "step": 5906 }, { "crossentropy": 2.818694591522217, "epoch": 0.21414588167053364, "grad_norm": 0.033820051699876785, "grad_norm_var": 2.620894447843969e-06, "learning_rate": 0.009062568612259965, "loss": 2.7404, "step": 5907 }, { "crossentropy": 2.709731340408325, "epoch": 0.21418213457076565, "grad_norm": 0.03128057345747948, "grad_norm_var": 3.1009778543987047e-06, "learning_rate": 0.009062229844349417, "loss": 2.6837, "step": 5908 }, { "crossentropy": 2.5352916717529297, "epoch": 0.21421838747099767, "grad_norm": 0.03424443304538727, "grad_norm_var": 2.705319291682745e-06, "learning_rate": 0.009061891021572288, "loss": 2.6098, "step": 5909 }, { "crossentropy": 2.7746617794036865, "epoch": 0.2142546403712297, "grad_norm": 0.03421564772725105, "grad_norm_var": 2.50277335024085e-06, "learning_rate": 0.009061552143933158, "loss": 2.7193, "step": 5910 }, { "crossentropy": 2.560218095779419, "epoch": 0.21429089327146172, "grad_norm": 0.037718210369348526, "grad_norm_var": 3.383768952556522e-06, "learning_rate": 0.009061213211436601, "loss": 2.6332, "step": 5911 }, { "crossentropy": 2.7794461250305176, "epoch": 0.21432714617169374, "grad_norm": 0.03744536638259888, "grad_norm_var": 4.045906189629964e-06, "learning_rate": 0.009060874224087198, "loss": 2.8218, "step": 5912 }, { "crossentropy": 2.644510507583618, "epoch": 0.21436339907192575, "grad_norm": 0.03327278792858124, "grad_norm_var": 4.105942695153905e-06, "learning_rate": 0.009060535181889526, "loss": 2.6482, "step": 5913 }, { "crossentropy": 2.7130322456359863, "epoch": 0.21439965197215777, "grad_norm": 0.04004045948386192, "grad_norm_var": 6.020237835400316e-06, "learning_rate": 0.009060196084848164, "loss": 2.7976, "step": 5914 }, { "crossentropy": 2.727459669113159, "epoch": 0.21443590487238978, "grad_norm": 0.034882672131061554, "grad_norm_var": 5.366139271914018e-06, "learning_rate": 0.009059856932967692, "loss": 2.6407, "step": 5915 }, { "crossentropy": 2.621814250946045, "epoch": 0.2144721577726218, "grad_norm": 0.039267655462026596, "grad_norm_var": 6.486292752924151e-06, "learning_rate": 0.009059517726252693, "loss": 2.6119, "step": 5916 }, { "crossentropy": 2.528623342514038, "epoch": 0.21450841067285384, "grad_norm": 0.040081221610307693, "grad_norm_var": 8.042395441895146e-06, "learning_rate": 0.009059178464707744, "loss": 2.4538, "step": 5917 }, { "crossentropy": 2.6068601608276367, "epoch": 0.21454466357308585, "grad_norm": 0.04723597690463066, "grad_norm_var": 1.5891788400070283e-05, "learning_rate": 0.009058839148337432, "loss": 2.6091, "step": 5918 }, { "crossentropy": 2.619974374771118, "epoch": 0.21458091647331787, "grad_norm": 0.04358818382024765, "grad_norm_var": 1.7380549364688868e-05, "learning_rate": 0.009058499777146336, "loss": 2.6774, "step": 5919 }, { "crossentropy": 2.8177618980407715, "epoch": 0.21461716937354988, "grad_norm": 0.03646909072995186, "grad_norm_var": 1.674604426784894e-05, "learning_rate": 0.009058160351139042, "loss": 2.6383, "step": 5920 }, { "crossentropy": 2.616562604904175, "epoch": 0.2146534222737819, "grad_norm": 0.03403336554765701, "grad_norm_var": 1.7225890988551278e-05, "learning_rate": 0.009057820870320133, "loss": 2.6166, "step": 5921 }, { "crossentropy": 2.706578254699707, "epoch": 0.2146896751740139, "grad_norm": 0.03533000126481056, "grad_norm_var": 1.7409142442698088e-05, "learning_rate": 0.009057481334694197, "loss": 2.6785, "step": 5922 }, { "crossentropy": 2.7609245777130127, "epoch": 0.21472592807424595, "grad_norm": 0.03280245140194893, "grad_norm_var": 1.791316726992472e-05, "learning_rate": 0.009057141744265819, "loss": 2.6118, "step": 5923 }, { "crossentropy": 2.637622594833374, "epoch": 0.21476218097447797, "grad_norm": 0.03429890051484108, "grad_norm_var": 1.6183125861758916e-05, "learning_rate": 0.009056802099039581, "loss": 2.7371, "step": 5924 }, { "crossentropy": 2.8103129863739014, "epoch": 0.21479843387470998, "grad_norm": 0.039351243525743484, "grad_norm_var": 1.5812268561395982e-05, "learning_rate": 0.009056462399020077, "loss": 2.6801, "step": 5925 }, { "crossentropy": 2.5765676498413086, "epoch": 0.214834686774942, "grad_norm": 0.03901853412389755, "grad_norm_var": 1.5149421281476782e-05, "learning_rate": 0.009056122644211892, "loss": 2.5713, "step": 5926 }, { "crossentropy": 2.6671361923217773, "epoch": 0.214870939675174, "grad_norm": 0.03550513833761215, "grad_norm_var": 1.5480327106131136e-05, "learning_rate": 0.009055782834619615, "loss": 2.6645, "step": 5927 }, { "crossentropy": 2.7575690746307373, "epoch": 0.21490719257540603, "grad_norm": 0.039406854659318924, "grad_norm_var": 1.5663627777391895e-05, "learning_rate": 0.009055442970247836, "loss": 2.7416, "step": 5928 }, { "crossentropy": 2.6911394596099854, "epoch": 0.21494344547563804, "grad_norm": 0.04256419464945793, "grad_norm_var": 1.5467394801874494e-05, "learning_rate": 0.009055103051101146, "loss": 2.7336, "step": 5929 }, { "crossentropy": 2.4875376224517822, "epoch": 0.21497969837587008, "grad_norm": 0.0332205630838871, "grad_norm_var": 1.6852846265698733e-05, "learning_rate": 0.009054763077184134, "loss": 2.6227, "step": 5930 }, { "crossentropy": 2.7052695751190186, "epoch": 0.2150159512761021, "grad_norm": 0.03315722569823265, "grad_norm_var": 1.7742517244485426e-05, "learning_rate": 0.009054423048501395, "loss": 2.7285, "step": 5931 }, { "crossentropy": 2.743027925491333, "epoch": 0.2150522041763341, "grad_norm": 0.03247249871492386, "grad_norm_var": 1.9328721417218436e-05, "learning_rate": 0.009054082965057518, "loss": 2.7045, "step": 5932 }, { "crossentropy": 2.594728946685791, "epoch": 0.21508845707656613, "grad_norm": 0.034003544598817825, "grad_norm_var": 1.947146966119097e-05, "learning_rate": 0.0090537428268571, "loss": 2.6086, "step": 5933 }, { "crossentropy": 2.802260637283325, "epoch": 0.21512470997679814, "grad_norm": 0.038233984261751175, "grad_norm_var": 1.2284659963764505e-05, "learning_rate": 0.009053402633904733, "loss": 2.7213, "step": 5934 }, { "crossentropy": 2.5368335247039795, "epoch": 0.21516096287703015, "grad_norm": 0.033746734261512756, "grad_norm_var": 8.992342885462817e-06, "learning_rate": 0.009053062386205011, "loss": 2.647, "step": 5935 }, { "crossentropy": 2.6746346950531006, "epoch": 0.21519721577726217, "grad_norm": 0.032802656292915344, "grad_norm_var": 9.530304510487152e-06, "learning_rate": 0.009052722083762533, "loss": 2.7385, "step": 5936 }, { "crossentropy": 2.80208158493042, "epoch": 0.2152334686774942, "grad_norm": 0.03530845046043396, "grad_norm_var": 9.361877425163405e-06, "learning_rate": 0.00905238172658189, "loss": 2.7425, "step": 5937 }, { "crossentropy": 2.535447835922241, "epoch": 0.21526972157772623, "grad_norm": 0.035242825746536255, "grad_norm_var": 9.36666973364719e-06, "learning_rate": 0.009052041314667685, "loss": 2.6023, "step": 5938 }, { "crossentropy": 2.654540538787842, "epoch": 0.21530597447795824, "grad_norm": 0.035895153880119324, "grad_norm_var": 8.771290762267746e-06, "learning_rate": 0.009051700848024511, "loss": 2.7069, "step": 5939 }, { "crossentropy": 2.601886749267578, "epoch": 0.21534222737819025, "grad_norm": 0.032645851373672485, "grad_norm_var": 9.292606851872959e-06, "learning_rate": 0.00905136032665697, "loss": 2.5666, "step": 5940 }, { "crossentropy": 2.7406861782073975, "epoch": 0.21537848027842227, "grad_norm": 0.03135041892528534, "grad_norm_var": 9.490076654677937e-06, "learning_rate": 0.009051019750569657, "loss": 2.6697, "step": 5941 }, { "crossentropy": 2.6918601989746094, "epoch": 0.21541473317865428, "grad_norm": 0.03337807580828667, "grad_norm_var": 8.671341589630971e-06, "learning_rate": 0.009050679119767178, "loss": 2.6643, "step": 5942 }, { "crossentropy": 2.8131799697875977, "epoch": 0.2154509860788863, "grad_norm": 0.03533894568681717, "grad_norm_var": 8.660398360642713e-06, "learning_rate": 0.009050338434254127, "loss": 2.7745, "step": 5943 }, { "crossentropy": 2.44573712348938, "epoch": 0.21548723897911834, "grad_norm": 0.03373629227280617, "grad_norm_var": 7.279971843369923e-06, "learning_rate": 0.00904999769403511, "loss": 2.5416, "step": 5944 }, { "crossentropy": 2.687924861907959, "epoch": 0.21552349187935035, "grad_norm": 0.032502494752407074, "grad_norm_var": 2.880749344541522e-06, "learning_rate": 0.009049656899114729, "loss": 2.6277, "step": 5945 }, { "crossentropy": 2.520395040512085, "epoch": 0.21555974477958237, "grad_norm": 0.035329122096300125, "grad_norm_var": 2.956437583479891e-06, "learning_rate": 0.009049316049497587, "loss": 2.6582, "step": 5946 }, { "crossentropy": 2.7271199226379395, "epoch": 0.21559599767981438, "grad_norm": 0.03361176326870918, "grad_norm_var": 2.9139397150081124e-06, "learning_rate": 0.009048975145188284, "loss": 2.7337, "step": 5947 }, { "crossentropy": 2.731426954269409, "epoch": 0.2156322505800464, "grad_norm": 0.03504639118909836, "grad_norm_var": 2.7694877343088787e-06, "learning_rate": 0.009048634186191428, "loss": 2.7123, "step": 5948 }, { "crossentropy": 2.722468852996826, "epoch": 0.2156685034802784, "grad_norm": 0.0415109358727932, "grad_norm_var": 6.034542492033834e-06, "learning_rate": 0.009048293172511624, "loss": 2.7133, "step": 5949 }, { "crossentropy": 2.6918447017669678, "epoch": 0.21570475638051045, "grad_norm": 0.04581726714968681, "grad_norm_var": 1.3171566478098241e-05, "learning_rate": 0.009047952104153476, "loss": 2.6473, "step": 5950 }, { "crossentropy": 2.710904121398926, "epoch": 0.21574100928074247, "grad_norm": 0.04265141114592552, "grad_norm_var": 1.6397244553220613e-05, "learning_rate": 0.009047610981121593, "loss": 2.6913, "step": 5951 }, { "crossentropy": 2.757952928543091, "epoch": 0.21577726218097448, "grad_norm": 0.03730969876050949, "grad_norm_var": 1.58893483139535e-05, "learning_rate": 0.009047269803420582, "loss": 2.7966, "step": 5952 }, { "crossentropy": 2.568662643432617, "epoch": 0.2158135150812065, "grad_norm": 0.03419437259435654, "grad_norm_var": 1.6075914346669272e-05, "learning_rate": 0.009046928571055048, "loss": 2.6043, "step": 5953 }, { "crossentropy": 2.5778932571411133, "epoch": 0.2158497679814385, "grad_norm": 0.03259029984474182, "grad_norm_var": 1.6773744239395535e-05, "learning_rate": 0.009046587284029604, "loss": 2.5728, "step": 5954 }, { "crossentropy": 2.7110018730163574, "epoch": 0.21588602088167053, "grad_norm": 0.03233874589204788, "grad_norm_var": 1.7522341224071533e-05, "learning_rate": 0.009046245942348857, "loss": 2.5812, "step": 5955 }, { "crossentropy": 2.490557909011841, "epoch": 0.21592227378190254, "grad_norm": 0.0340241864323616, "grad_norm_var": 1.710101920961987e-05, "learning_rate": 0.009045904546017417, "loss": 2.5265, "step": 5956 }, { "crossentropy": 2.5569114685058594, "epoch": 0.21595852668213458, "grad_norm": 0.03223095089197159, "grad_norm_var": 1.6642264042502666e-05, "learning_rate": 0.009045563095039899, "loss": 2.5979, "step": 5957 }, { "crossentropy": 2.7032763957977295, "epoch": 0.2159947795823666, "grad_norm": 0.03458112105727196, "grad_norm_var": 1.6356150795077533e-05, "learning_rate": 0.00904522158942091, "loss": 2.7132, "step": 5958 }, { "crossentropy": 2.7885680198669434, "epoch": 0.2160310324825986, "grad_norm": 0.031549230217933655, "grad_norm_var": 1.7487182974188346e-05, "learning_rate": 0.009044880029165066, "loss": 2.7317, "step": 5959 }, { "crossentropy": 2.6267776489257812, "epoch": 0.21606728538283063, "grad_norm": 0.03155939280986786, "grad_norm_var": 1.8313866949787315e-05, "learning_rate": 0.009044538414276979, "loss": 2.6733, "step": 5960 }, { "crossentropy": 2.6380743980407715, "epoch": 0.21610353828306264, "grad_norm": 0.031362589448690414, "grad_norm_var": 1.8839712464089645e-05, "learning_rate": 0.009044196744761262, "loss": 2.6765, "step": 5961 }, { "crossentropy": 2.683967113494873, "epoch": 0.21613979118329466, "grad_norm": 0.03502073884010315, "grad_norm_var": 1.884679088661467e-05, "learning_rate": 0.00904385502062253, "loss": 2.6683, "step": 5962 }, { "crossentropy": 2.625901699066162, "epoch": 0.21617604408352667, "grad_norm": 0.032485250383615494, "grad_norm_var": 1.9185305471630663e-05, "learning_rate": 0.0090435132418654, "loss": 2.6638, "step": 5963 }, { "crossentropy": 2.7252888679504395, "epoch": 0.2162122969837587, "grad_norm": 0.03139821067452431, "grad_norm_var": 2.0124458933500935e-05, "learning_rate": 0.009043171408494487, "loss": 2.7094, "step": 5964 }, { "crossentropy": 2.670311689376831, "epoch": 0.21624854988399073, "grad_norm": 0.03409375250339508, "grad_norm_var": 1.7162425503234693e-05, "learning_rate": 0.009042829520514408, "loss": 2.6484, "step": 5965 }, { "crossentropy": 2.683533191680908, "epoch": 0.21628480278422274, "grad_norm": 0.033158931881189346, "grad_norm_var": 8.203326138122904e-06, "learning_rate": 0.009042487577929784, "loss": 2.6459, "step": 5966 }, { "crossentropy": 2.6640613079071045, "epoch": 0.21632105568445475, "grad_norm": 0.03298525884747505, "grad_norm_var": 2.614876215912646e-06, "learning_rate": 0.009042145580745228, "loss": 2.6069, "step": 5967 }, { "crossentropy": 2.6772100925445557, "epoch": 0.21635730858468677, "grad_norm": 0.03540320321917534, "grad_norm_var": 1.792322964054051e-06, "learning_rate": 0.00904180352896536, "loss": 2.6861, "step": 5968 }, { "crossentropy": 2.726735830307007, "epoch": 0.21639356148491878, "grad_norm": 0.031110258772969246, "grad_norm_var": 1.920753885505559e-06, "learning_rate": 0.009041461422594805, "loss": 2.6143, "step": 5969 }, { "crossentropy": 2.5968003273010254, "epoch": 0.2164298143851508, "grad_norm": 0.031859349459409714, "grad_norm_var": 1.9812366924421096e-06, "learning_rate": 0.00904111926163818, "loss": 2.6558, "step": 5970 }, { "crossentropy": 2.6988983154296875, "epoch": 0.21646606728538284, "grad_norm": 0.030982404947280884, "grad_norm_var": 2.1837134630668984e-06, "learning_rate": 0.009040777046100106, "loss": 2.726, "step": 5971 }, { "crossentropy": 2.6413543224334717, "epoch": 0.21650232018561485, "grad_norm": 0.03175123780965805, "grad_norm_var": 2.11675546776927e-06, "learning_rate": 0.009040434775985205, "loss": 2.629, "step": 5972 }, { "crossentropy": 2.7546820640563965, "epoch": 0.21653857308584687, "grad_norm": 0.03214423358440399, "grad_norm_var": 2.121443295068414e-06, "learning_rate": 0.009040092451298103, "loss": 2.758, "step": 5973 }, { "crossentropy": 2.732962131500244, "epoch": 0.21657482598607888, "grad_norm": 0.030783761292696, "grad_norm_var": 2.014719314524085e-06, "learning_rate": 0.009039750072043419, "loss": 2.5809, "step": 5974 }, { "crossentropy": 2.7182774543762207, "epoch": 0.2166110788863109, "grad_norm": 0.03288842737674713, "grad_norm_var": 1.983291230740512e-06, "learning_rate": 0.00903940763822578, "loss": 2.6666, "step": 5975 }, { "crossentropy": 2.653019428253174, "epoch": 0.2166473317865429, "grad_norm": 0.032177671790122986, "grad_norm_var": 1.934861311771854e-06, "learning_rate": 0.009039065149849812, "loss": 2.6695, "step": 5976 }, { "crossentropy": 2.635694742202759, "epoch": 0.21668358468677495, "grad_norm": 0.03231648728251457, "grad_norm_var": 1.8502059322980512e-06, "learning_rate": 0.00903872260692014, "loss": 2.6656, "step": 5977 }, { "crossentropy": 2.807363510131836, "epoch": 0.21671983758700697, "grad_norm": 0.032988544553518295, "grad_norm_var": 1.4347714716788126e-06, "learning_rate": 0.009038380009441389, "loss": 2.7656, "step": 5978 }, { "crossentropy": 2.6947691440582275, "epoch": 0.21675609048723898, "grad_norm": 0.0365152433514595, "grad_norm_var": 2.4913674208204416e-06, "learning_rate": 0.009038037357418188, "loss": 2.8266, "step": 5979 }, { "crossentropy": 2.6881957054138184, "epoch": 0.216792343387471, "grad_norm": 0.03672944754362106, "grad_norm_var": 3.370962414750072e-06, "learning_rate": 0.009037694650855164, "loss": 2.6808, "step": 5980 }, { "crossentropy": 2.771212339401245, "epoch": 0.216828596287703, "grad_norm": 0.03406341373920441, "grad_norm_var": 3.366567267029302e-06, "learning_rate": 0.009037351889756946, "loss": 2.7898, "step": 5981 }, { "crossentropy": 2.6724462509155273, "epoch": 0.21686484918793503, "grad_norm": 0.03216918557882309, "grad_norm_var": 3.405646266685019e-06, "learning_rate": 0.009037009074128166, "loss": 2.6649, "step": 5982 }, { "crossentropy": 2.700222969055176, "epoch": 0.21690110208816704, "grad_norm": 0.045646555721759796, "grad_norm_var": 1.3519462668130445e-05, "learning_rate": 0.00903666620397345, "loss": 2.6546, "step": 5983 }, { "crossentropy": 2.631836414337158, "epoch": 0.21693735498839908, "grad_norm": 0.03402012586593628, "grad_norm_var": 1.3328727710662268e-05, "learning_rate": 0.009036323279297431, "loss": 2.6179, "step": 5984 }, { "crossentropy": 2.486698865890503, "epoch": 0.2169736078886311, "grad_norm": 0.032806798815727234, "grad_norm_var": 1.2937701272951648e-05, "learning_rate": 0.009035980300104742, "loss": 2.5757, "step": 5985 }, { "crossentropy": 2.6333718299865723, "epoch": 0.2170098607888631, "grad_norm": 0.03442481905221939, "grad_norm_var": 1.270569141800833e-05, "learning_rate": 0.009035637266400014, "loss": 2.6057, "step": 5986 }, { "crossentropy": 2.6341073513031006, "epoch": 0.21704611368909513, "grad_norm": 0.03610946983098984, "grad_norm_var": 1.2353765731364503e-05, "learning_rate": 0.00903529417818788, "loss": 2.6696, "step": 5987 }, { "crossentropy": 2.717228651046753, "epoch": 0.21708236658932714, "grad_norm": 0.04150080680847168, "grad_norm_var": 1.5084144656014817e-05, "learning_rate": 0.009034951035472976, "loss": 2.586, "step": 5988 }, { "crossentropy": 2.659590244293213, "epoch": 0.21711861948955916, "grad_norm": 0.03847265616059303, "grad_norm_var": 1.5320717676792217e-05, "learning_rate": 0.009034607838259932, "loss": 2.6805, "step": 5989 }, { "crossentropy": 2.783470869064331, "epoch": 0.21715487238979117, "grad_norm": 0.03993912786245346, "grad_norm_var": 1.5137001146688494e-05, "learning_rate": 0.00903426458655339, "loss": 2.7339, "step": 5990 }, { "crossentropy": 2.652245044708252, "epoch": 0.2171911252900232, "grad_norm": 0.034432824701070786, "grad_norm_var": 1.4686925627826104e-05, "learning_rate": 0.00903392128035798, "loss": 2.6188, "step": 5991 }, { "crossentropy": 2.6414904594421387, "epoch": 0.21722737819025523, "grad_norm": 0.034637514501810074, "grad_norm_var": 1.3846036397907543e-05, "learning_rate": 0.009033577919678341, "loss": 2.6733, "step": 5992 }, { "crossentropy": 2.6573128700256348, "epoch": 0.21726363109048724, "grad_norm": 0.0358484722673893, "grad_norm_var": 1.2868284721839704e-05, "learning_rate": 0.009033234504519112, "loss": 2.6627, "step": 5993 }, { "crossentropy": 2.6806206703186035, "epoch": 0.21729988399071926, "grad_norm": 0.03764166682958603, "grad_norm_var": 1.2186219339826766e-05, "learning_rate": 0.00903289103488493, "loss": 2.7116, "step": 5994 }, { "crossentropy": 2.7262701988220215, "epoch": 0.21733613689095127, "grad_norm": 0.04851602762937546, "grad_norm_var": 2.111596762637812e-05, "learning_rate": 0.009032547510780435, "loss": 2.6331, "step": 5995 }, { "crossentropy": 2.6262121200561523, "epoch": 0.21737238979118328, "grad_norm": 0.04111500084400177, "grad_norm_var": 2.1978602216095786e-05, "learning_rate": 0.009032203932210266, "loss": 2.6752, "step": 5996 }, { "crossentropy": 2.65299391746521, "epoch": 0.21740864269141533, "grad_norm": 0.03434472158551216, "grad_norm_var": 2.185149786621537e-05, "learning_rate": 0.009031860299179064, "loss": 2.6441, "step": 5997 }, { "crossentropy": 2.833751678466797, "epoch": 0.21744489559164734, "grad_norm": 0.032210227102041245, "grad_norm_var": 2.182187580708738e-05, "learning_rate": 0.00903151661169147, "loss": 2.7671, "step": 5998 }, { "crossentropy": 2.782806873321533, "epoch": 0.21748114849187936, "grad_norm": 0.03611131012439728, "grad_norm_var": 1.7279623783358898e-05, "learning_rate": 0.009031172869752125, "loss": 2.7704, "step": 5999 }, { "crossentropy": 2.629631757736206, "epoch": 0.21751740139211137, "grad_norm": 0.03468848392367363, "grad_norm_var": 1.7041260171909006e-05, "learning_rate": 0.009030829073365674, "loss": 2.6922, "step": 6000 }, { "crossentropy": 2.7468605041503906, "epoch": 0.21755365429234338, "grad_norm": 0.0387459434568882, "grad_norm_var": 1.588572226524735e-05, "learning_rate": 0.00903048522253676, "loss": 2.714, "step": 6001 }, { "crossentropy": 2.8451104164123535, "epoch": 0.2175899071925754, "grad_norm": 0.04240059107542038, "grad_norm_var": 1.6675079260001593e-05, "learning_rate": 0.009030141317270026, "loss": 2.7518, "step": 6002 }, { "crossentropy": 2.5782506465911865, "epoch": 0.2176261600928074, "grad_norm": 0.03226150944828987, "grad_norm_var": 1.8529252007649413e-05, "learning_rate": 0.009029797357570117, "loss": 2.6213, "step": 6003 }, { "crossentropy": 2.546947956085205, "epoch": 0.21766241299303946, "grad_norm": 0.032824963331222534, "grad_norm_var": 1.8812865436823532e-05, "learning_rate": 0.009029453343441681, "loss": 2.6212, "step": 6004 }, { "crossentropy": 2.8182101249694824, "epoch": 0.21769866589327147, "grad_norm": 0.036449506878852844, "grad_norm_var": 1.8708372505772906e-05, "learning_rate": 0.00902910927488936, "loss": 2.6653, "step": 6005 }, { "crossentropy": 2.8497869968414307, "epoch": 0.21773491879350348, "grad_norm": 0.03753005713224411, "grad_norm_var": 1.8130393801847842e-05, "learning_rate": 0.009028765151917804, "loss": 2.7994, "step": 6006 }, { "crossentropy": 2.6618051528930664, "epoch": 0.2177711716937355, "grad_norm": 0.0384674072265625, "grad_norm_var": 1.784211425064948e-05, "learning_rate": 0.009028420974531664, "loss": 2.6885, "step": 6007 }, { "crossentropy": 2.7454776763916016, "epoch": 0.2178074245939675, "grad_norm": 0.032334841787815094, "grad_norm_var": 1.893325906303737e-05, "learning_rate": 0.009028076742735583, "loss": 2.7532, "step": 6008 }, { "crossentropy": 2.569990873336792, "epoch": 0.21784367749419953, "grad_norm": 0.03279339522123337, "grad_norm_var": 1.9972704531080222e-05, "learning_rate": 0.009027732456534212, "loss": 2.6254, "step": 6009 }, { "crossentropy": 2.8513519763946533, "epoch": 0.21787993039443154, "grad_norm": 0.03296033665537834, "grad_norm_var": 2.0802819868366806e-05, "learning_rate": 0.009027388115932204, "loss": 2.8134, "step": 6010 }, { "crossentropy": 2.6937835216522217, "epoch": 0.21791618329466358, "grad_norm": 0.03466831147670746, "grad_norm_var": 1.0573483385235375e-05, "learning_rate": 0.009027043720934207, "loss": 2.6551, "step": 6011 }, { "crossentropy": 2.8074710369110107, "epoch": 0.2179524361948956, "grad_norm": 0.03385646268725395, "grad_norm_var": 8.54748202603935e-06, "learning_rate": 0.00902669927154487, "loss": 2.7282, "step": 6012 }, { "crossentropy": 2.562922716140747, "epoch": 0.2179886890951276, "grad_norm": 0.035709138959646225, "grad_norm_var": 8.514515502847503e-06, "learning_rate": 0.009026354767768854, "loss": 2.526, "step": 6013 }, { "crossentropy": 2.7338860034942627, "epoch": 0.21802494199535963, "grad_norm": 0.03583502396941185, "grad_norm_var": 7.86619411153825e-06, "learning_rate": 0.009026010209610804, "loss": 2.7582, "step": 6014 }, { "crossentropy": 2.57717227935791, "epoch": 0.21806119489559164, "grad_norm": 0.03151073679327965, "grad_norm_var": 8.800134319064679e-06, "learning_rate": 0.009025665597075376, "loss": 2.674, "step": 6015 }, { "crossentropy": 2.6912291049957275, "epoch": 0.21809744779582366, "grad_norm": 0.03179148957133293, "grad_norm_var": 9.518309476486131e-06, "learning_rate": 0.009025320930167226, "loss": 2.5184, "step": 6016 }, { "crossentropy": 2.7991793155670166, "epoch": 0.21813370069605567, "grad_norm": 0.034369684755802155, "grad_norm_var": 8.534619903259079e-06, "learning_rate": 0.009024976208891007, "loss": 2.7517, "step": 6017 }, { "crossentropy": 2.7558817863464355, "epoch": 0.2181699535962877, "grad_norm": 0.0339767150580883, "grad_norm_var": 4.3601028503701305e-06, "learning_rate": 0.009024631433251377, "loss": 2.7371, "step": 6018 }, { "crossentropy": 2.847905397415161, "epoch": 0.21820620649651973, "grad_norm": 0.03450682386755943, "grad_norm_var": 4.092244855250874e-06, "learning_rate": 0.009024286603252992, "loss": 2.9023, "step": 6019 }, { "crossentropy": 2.5744357109069824, "epoch": 0.21824245939675174, "grad_norm": 0.032571811228990555, "grad_norm_var": 4.147693863185547e-06, "learning_rate": 0.009023941718900511, "loss": 2.5639, "step": 6020 }, { "crossentropy": 2.658802032470703, "epoch": 0.21827871229698376, "grad_norm": 0.034313347190618515, "grad_norm_var": 3.830132913464531e-06, "learning_rate": 0.009023596780198588, "loss": 2.6349, "step": 6021 }, { "crossentropy": 2.6549072265625, "epoch": 0.21831496519721577, "grad_norm": 0.033851854503154755, "grad_norm_var": 3.0424209675713096e-06, "learning_rate": 0.009023251787151885, "loss": 2.6854, "step": 6022 }, { "crossentropy": 2.694451093673706, "epoch": 0.21835121809744779, "grad_norm": 0.038249626755714417, "grad_norm_var": 2.9147874901500986e-06, "learning_rate": 0.00902290673976506, "loss": 2.73, "step": 6023 }, { "crossentropy": 2.629909038543701, "epoch": 0.21838747099767983, "grad_norm": 0.03387095034122467, "grad_norm_var": 2.7301815775591007e-06, "learning_rate": 0.009022561638042777, "loss": 2.6484, "step": 6024 }, { "crossentropy": 2.471187114715576, "epoch": 0.21842372389791184, "grad_norm": 0.03150252252817154, "grad_norm_var": 3.050994968054749e-06, "learning_rate": 0.009022216481989694, "loss": 2.6384, "step": 6025 }, { "crossentropy": 2.7532031536102295, "epoch": 0.21845997679814386, "grad_norm": 0.03335453197360039, "grad_norm_var": 3.00755797832189e-06, "learning_rate": 0.009021871271610473, "loss": 2.7633, "step": 6026 }, { "crossentropy": 2.6666884422302246, "epoch": 0.21849622969837587, "grad_norm": 0.0320899523794651, "grad_norm_var": 3.1919910397609197e-06, "learning_rate": 0.009021526006909777, "loss": 2.6292, "step": 6027 }, { "crossentropy": 2.628248691558838, "epoch": 0.21853248259860789, "grad_norm": 0.03047012723982334, "grad_norm_var": 3.899023606474316e-06, "learning_rate": 0.00902118068789227, "loss": 2.6651, "step": 6028 }, { "crossentropy": 2.6534736156463623, "epoch": 0.2185687354988399, "grad_norm": 0.03168188035488129, "grad_norm_var": 3.7927225482354965e-06, "learning_rate": 0.009020835314562613, "loss": 2.5855, "step": 6029 }, { "crossentropy": 2.787696123123169, "epoch": 0.21860498839907191, "grad_norm": 0.0343775749206543, "grad_norm_var": 3.4467917073443492e-06, "learning_rate": 0.009020489886925475, "loss": 2.6699, "step": 6030 }, { "crossentropy": 2.6458492279052734, "epoch": 0.21864124129930396, "grad_norm": 0.034650299698114395, "grad_norm_var": 3.3219648058814642e-06, "learning_rate": 0.00902014440498552, "loss": 2.7218, "step": 6031 }, { "crossentropy": 2.4649744033813477, "epoch": 0.21867749419953597, "grad_norm": 0.03250640258193016, "grad_norm_var": 3.1932595429598564e-06, "learning_rate": 0.009019798868747415, "loss": 2.5845, "step": 6032 }, { "crossentropy": 2.6051504611968994, "epoch": 0.21871374709976799, "grad_norm": 0.033311545848846436, "grad_norm_var": 3.1435727884267558e-06, "learning_rate": 0.009019453278215824, "loss": 2.6271, "step": 6033 }, { "crossentropy": 2.755462646484375, "epoch": 0.21875, "grad_norm": 0.036710914224386215, "grad_norm_var": 3.8008735435743664e-06, "learning_rate": 0.009019107633395418, "loss": 2.7998, "step": 6034 }, { "crossentropy": 2.7165260314941406, "epoch": 0.21878625290023201, "grad_norm": 0.03217079117894173, "grad_norm_var": 3.8676690679173654e-06, "learning_rate": 0.009018761934290864, "loss": 2.695, "step": 6035 }, { "crossentropy": 2.694776773452759, "epoch": 0.21882250580046403, "grad_norm": 0.035705529153347015, "grad_norm_var": 4.101855233244961e-06, "learning_rate": 0.00901841618090683, "loss": 2.6558, "step": 6036 }, { "crossentropy": 2.5695903301239014, "epoch": 0.21885875870069604, "grad_norm": 0.03202304244041443, "grad_norm_var": 4.235104809559515e-06, "learning_rate": 0.00901807037324799, "loss": 2.5907, "step": 6037 }, { "crossentropy": 2.6420063972473145, "epoch": 0.21889501160092809, "grad_norm": 0.03423423692584038, "grad_norm_var": 4.2605013566592105e-06, "learning_rate": 0.00901772451131901, "loss": 2.6465, "step": 6038 }, { "crossentropy": 2.6095945835113525, "epoch": 0.2189312645011601, "grad_norm": 0.031510140746831894, "grad_norm_var": 2.8823913491517976e-06, "learning_rate": 0.009017378595124564, "loss": 2.5867, "step": 6039 }, { "crossentropy": 2.5935707092285156, "epoch": 0.21896751740139211, "grad_norm": 0.03174316883087158, "grad_norm_var": 2.956750192043449e-06, "learning_rate": 0.009017032624669323, "loss": 2.6601, "step": 6040 }, { "crossentropy": 2.6618480682373047, "epoch": 0.21900377030162413, "grad_norm": 0.03126084804534912, "grad_norm_var": 3.008740131588934e-06, "learning_rate": 0.009016686599957961, "loss": 2.6628, "step": 6041 }, { "crossentropy": 2.6423003673553467, "epoch": 0.21904002320185614, "grad_norm": 0.03382113203406334, "grad_norm_var": 3.045177807082253e-06, "learning_rate": 0.00901634052099515, "loss": 2.5892, "step": 6042 }, { "crossentropy": 2.6502456665039062, "epoch": 0.21907627610208816, "grad_norm": 0.030723892152309418, "grad_norm_var": 3.330613817465822e-06, "learning_rate": 0.009015994387785567, "loss": 2.66, "step": 6043 }, { "crossentropy": 2.6687607765197754, "epoch": 0.21911252900232017, "grad_norm": 0.031142037361860275, "grad_norm_var": 3.1383346157709545e-06, "learning_rate": 0.009015648200333884, "loss": 2.677, "step": 6044 }, { "crossentropy": 2.8152267932891846, "epoch": 0.21914878190255221, "grad_norm": 0.032708149403333664, "grad_norm_var": 3.0274433975555416e-06, "learning_rate": 0.009015301958644776, "loss": 2.7555, "step": 6045 }, { "crossentropy": 2.4825637340545654, "epoch": 0.21918503480278423, "grad_norm": 0.03147364407777786, "grad_norm_var": 3.035622515714225e-06, "learning_rate": 0.009014955662722925, "loss": 2.6118, "step": 6046 }, { "crossentropy": 2.652599334716797, "epoch": 0.21922128770301624, "grad_norm": 0.03312554210424423, "grad_norm_var": 2.816142065155634e-06, "learning_rate": 0.009014609312573004, "loss": 2.6125, "step": 6047 }, { "crossentropy": 2.7237157821655273, "epoch": 0.21925754060324826, "grad_norm": 0.03365069255232811, "grad_norm_var": 2.859182615501619e-06, "learning_rate": 0.009014262908199691, "loss": 2.6788, "step": 6048 }, { "crossentropy": 2.6297342777252197, "epoch": 0.21929379350348027, "grad_norm": 0.03926975280046463, "grad_norm_var": 5.458747247829206e-06, "learning_rate": 0.009013916449607666, "loss": 2.6415, "step": 6049 }, { "crossentropy": 2.583860158920288, "epoch": 0.2193300464037123, "grad_norm": 0.03498537093400955, "grad_norm_var": 4.838133475078219e-06, "learning_rate": 0.00901356993680161, "loss": 2.6618, "step": 6050 }, { "crossentropy": 2.6303510665893555, "epoch": 0.21936629930394433, "grad_norm": 0.03310016170144081, "grad_norm_var": 4.777375621901371e-06, "learning_rate": 0.009013223369786199, "loss": 2.6167, "step": 6051 }, { "crossentropy": 2.5896546840667725, "epoch": 0.21940255220417634, "grad_norm": 0.03597600385546684, "grad_norm_var": 4.873934381587527e-06, "learning_rate": 0.009012876748566115, "loss": 2.6276, "step": 6052 }, { "crossentropy": 2.685102701187134, "epoch": 0.21943880510440836, "grad_norm": 0.03751033544540405, "grad_norm_var": 5.915402502094472e-06, "learning_rate": 0.009012530073146044, "loss": 2.6542, "step": 6053 }, { "crossentropy": 2.5488414764404297, "epoch": 0.21947505800464037, "grad_norm": 0.038368914276361465, "grad_norm_var": 7.380551675445259e-06, "learning_rate": 0.009012183343530662, "loss": 2.5523, "step": 6054 }, { "crossentropy": 2.522819995880127, "epoch": 0.2195113109048724, "grad_norm": 0.04131554439663887, "grad_norm_var": 1.043109389067635e-05, "learning_rate": 0.009011836559724658, "loss": 2.7279, "step": 6055 }, { "crossentropy": 2.7518322467803955, "epoch": 0.2195475638051044, "grad_norm": 0.039321769028902054, "grad_norm_var": 1.1350315748537693e-05, "learning_rate": 0.009011489721732711, "loss": 2.6839, "step": 6056 }, { "crossentropy": 2.628978729248047, "epoch": 0.21958381670533642, "grad_norm": 0.03371500223875046, "grad_norm_var": 1.0549155782893909e-05, "learning_rate": 0.009011142829559508, "loss": 2.6309, "step": 6057 }, { "crossentropy": 2.7479279041290283, "epoch": 0.21962006960556846, "grad_norm": 0.032706957310438156, "grad_norm_var": 1.0803801736698238e-05, "learning_rate": 0.009010795883209735, "loss": 2.7142, "step": 6058 }, { "crossentropy": 2.683917760848999, "epoch": 0.21965632250580047, "grad_norm": 0.03131937235593796, "grad_norm_var": 1.0490949371166845e-05, "learning_rate": 0.009010448882688076, "loss": 2.6371, "step": 6059 }, { "crossentropy": 2.550133466720581, "epoch": 0.2196925754060325, "grad_norm": 0.03270827233791351, "grad_norm_var": 9.84266004985986e-06, "learning_rate": 0.009010101827999219, "loss": 2.6017, "step": 6060 }, { "crossentropy": 2.574343681335449, "epoch": 0.2197288283062645, "grad_norm": 0.034505341202020645, "grad_norm_var": 9.476539792779616e-06, "learning_rate": 0.00900975471914785, "loss": 2.6373, "step": 6061 }, { "crossentropy": 2.7836735248565674, "epoch": 0.21976508120649652, "grad_norm": 0.03599213436245918, "grad_norm_var": 8.513133869082237e-06, "learning_rate": 0.00900940755613866, "loss": 2.745, "step": 6062 }, { "crossentropy": 2.62492036819458, "epoch": 0.21980133410672853, "grad_norm": 0.037184081971645355, "grad_norm_var": 8.272210682431538e-06, "learning_rate": 0.009009060338976336, "loss": 2.6926, "step": 6063 }, { "crossentropy": 2.7236266136169434, "epoch": 0.21983758700696054, "grad_norm": 0.03578038513660431, "grad_norm_var": 7.966139577991517e-06, "learning_rate": 0.009008713067665569, "loss": 2.8399, "step": 6064 }, { "crossentropy": 2.61543607711792, "epoch": 0.2198738399071926, "grad_norm": 0.034638382494449615, "grad_norm_var": 7.201138778733842e-06, "learning_rate": 0.009008365742211049, "loss": 2.6424, "step": 6065 }, { "crossentropy": 2.673041582107544, "epoch": 0.2199100928074246, "grad_norm": 0.03155947104096413, "grad_norm_var": 8.201968135144952e-06, "learning_rate": 0.009008018362617465, "loss": 2.6604, "step": 6066 }, { "crossentropy": 2.6389448642730713, "epoch": 0.21994634570765662, "grad_norm": 0.035498861223459244, "grad_norm_var": 7.839978480968177e-06, "learning_rate": 0.009007670928889511, "loss": 2.6955, "step": 6067 }, { "crossentropy": 2.7264010906219482, "epoch": 0.21998259860788863, "grad_norm": 0.033689577132463455, "grad_norm_var": 8.023520765248979e-06, "learning_rate": 0.00900732344103188, "loss": 2.8107, "step": 6068 }, { "crossentropy": 2.653653860092163, "epoch": 0.22001885150812064, "grad_norm": 0.03469310328364372, "grad_norm_var": 7.713115215492404e-06, "learning_rate": 0.009006975899049264, "loss": 2.6633, "step": 6069 }, { "crossentropy": 2.5988574028015137, "epoch": 0.22005510440835266, "grad_norm": 0.036541346460580826, "grad_norm_var": 7.1465889910876195e-06, "learning_rate": 0.009006628302946358, "loss": 2.6251, "step": 6070 }, { "crossentropy": 2.766818046569824, "epoch": 0.22009135730858467, "grad_norm": 0.03606770187616348, "grad_norm_var": 4.499914421004075e-06, "learning_rate": 0.009006280652727856, "loss": 2.7385, "step": 6071 }, { "crossentropy": 2.6934046745300293, "epoch": 0.22012761020881672, "grad_norm": 0.031510673463344574, "grad_norm_var": 3.5467441170814834e-06, "learning_rate": 0.009005932948398455, "loss": 2.6453, "step": 6072 }, { "crossentropy": 2.695401906967163, "epoch": 0.22016386310904873, "grad_norm": 0.03446069732308388, "grad_norm_var": 3.5276175667522367e-06, "learning_rate": 0.00900558518996285, "loss": 2.6462, "step": 6073 }, { "crossentropy": 2.576840400695801, "epoch": 0.22020011600928074, "grad_norm": 0.03892020136117935, "grad_norm_var": 4.6177461665221355e-06, "learning_rate": 0.009005237377425738, "loss": 2.6541, "step": 6074 }, { "crossentropy": 2.7340996265411377, "epoch": 0.22023636890951276, "grad_norm": 0.033294569700956345, "grad_norm_var": 3.973409382709789e-06, "learning_rate": 0.009004889510791815, "loss": 2.6786, "step": 6075 }, { "crossentropy": 2.704655647277832, "epoch": 0.22027262180974477, "grad_norm": 0.03849974647164345, "grad_norm_var": 4.4426931610414e-06, "learning_rate": 0.009004541590065784, "loss": 2.6853, "step": 6076 }, { "crossentropy": 2.605949640274048, "epoch": 0.2203088747099768, "grad_norm": 0.034393277019262314, "grad_norm_var": 4.453517904253144e-06, "learning_rate": 0.009004193615252343, "loss": 2.6932, "step": 6077 }, { "crossentropy": 2.7820584774017334, "epoch": 0.22034512761020883, "grad_norm": 0.04461158066987991, "grad_norm_var": 1.00414896302571e-05, "learning_rate": 0.00900384558635619, "loss": 2.7146, "step": 6078 }, { "crossentropy": 2.7453768253326416, "epoch": 0.22038138051044084, "grad_norm": 0.04496894031763077, "grad_norm_var": 1.536037034911234e-05, "learning_rate": 0.009003497503382023, "loss": 2.6872, "step": 6079 }, { "crossentropy": 2.813199520111084, "epoch": 0.22041763341067286, "grad_norm": 0.04866824299097061, "grad_norm_var": 2.502804453233063e-05, "learning_rate": 0.00900314936633455, "loss": 2.6871, "step": 6080 }, { "crossentropy": 2.7406961917877197, "epoch": 0.22045388631090487, "grad_norm": 0.045666832476854324, "grad_norm_var": 2.9155545049616705e-05, "learning_rate": 0.00900280117521847, "loss": 2.6648, "step": 6081 }, { "crossentropy": 2.7492148876190186, "epoch": 0.2204901392111369, "grad_norm": 0.04310018569231033, "grad_norm_var": 2.8045912426547376e-05, "learning_rate": 0.009002452930038483, "loss": 2.744, "step": 6082 }, { "crossentropy": 2.7502665519714355, "epoch": 0.2205263921113689, "grad_norm": 0.04094981029629707, "grad_norm_var": 2.778600936495784e-05, "learning_rate": 0.009002104630799297, "loss": 2.6889, "step": 6083 }, { "crossentropy": 2.5818238258361816, "epoch": 0.22056264501160092, "grad_norm": 0.03846483305096626, "grad_norm_var": 2.5987774011894797e-05, "learning_rate": 0.009001756277505614, "loss": 2.5485, "step": 6084 }, { "crossentropy": 2.7215311527252197, "epoch": 0.22059889791183296, "grad_norm": 0.03305148333311081, "grad_norm_var": 2.7110016047433388e-05, "learning_rate": 0.00900140787016214, "loss": 2.7373, "step": 6085 }, { "crossentropy": 2.6104533672332764, "epoch": 0.22063515081206497, "grad_norm": 0.03323205187916756, "grad_norm_var": 2.8856449063837293e-05, "learning_rate": 0.00900105940877358, "loss": 2.5364, "step": 6086 }, { "crossentropy": 2.770472764968872, "epoch": 0.220671403712297, "grad_norm": 0.03295545279979706, "grad_norm_var": 3.057128445480908e-05, "learning_rate": 0.009000710893344642, "loss": 2.8041, "step": 6087 }, { "crossentropy": 2.6390862464904785, "epoch": 0.220707656612529, "grad_norm": 0.032785724848508835, "grad_norm_var": 2.9476706838601983e-05, "learning_rate": 0.00900036232388003, "loss": 2.6472, "step": 6088 }, { "crossentropy": 2.7721798419952393, "epoch": 0.22074390951276102, "grad_norm": 0.03163272142410278, "grad_norm_var": 3.1547310547491485e-05, "learning_rate": 0.009000013700384455, "loss": 2.7481, "step": 6089 }, { "crossentropy": 2.681168556213379, "epoch": 0.22078016241299303, "grad_norm": 0.0307126697152853, "grad_norm_var": 3.5242677796292256e-05, "learning_rate": 0.008999665022862625, "loss": 2.6957, "step": 6090 }, { "crossentropy": 2.6980197429656982, "epoch": 0.22081641531322505, "grad_norm": 0.03404550999403, "grad_norm_var": 3.481312145503858e-05, "learning_rate": 0.00899931629131925, "loss": 2.6938, "step": 6091 }, { "crossentropy": 2.6645820140838623, "epoch": 0.2208526682134571, "grad_norm": 0.044834837317466736, "grad_norm_var": 3.7757358194021954e-05, "learning_rate": 0.008998967505759037, "loss": 2.6546, "step": 6092 }, { "crossentropy": 2.658907890319824, "epoch": 0.2208889211136891, "grad_norm": 0.05261993035674095, "grad_norm_var": 4.883281120876038e-05, "learning_rate": 0.008998618666186701, "loss": 2.7603, "step": 6093 }, { "crossentropy": 2.5461604595184326, "epoch": 0.22092517401392112, "grad_norm": 0.04015690088272095, "grad_norm_var": 4.704817794364834e-05, "learning_rate": 0.008998269772606952, "loss": 2.5986, "step": 6094 }, { "crossentropy": 2.6564037799835205, "epoch": 0.22096142691415313, "grad_norm": 0.03251856938004494, "grad_norm_var": 4.722672293688995e-05, "learning_rate": 0.008997920825024501, "loss": 2.642, "step": 6095 }, { "crossentropy": 2.603757619857788, "epoch": 0.22099767981438515, "grad_norm": 0.03103758580982685, "grad_norm_var": 4.266240987663792e-05, "learning_rate": 0.008997571823444062, "loss": 2.6192, "step": 6096 }, { "crossentropy": 2.642428398132324, "epoch": 0.22103393271461716, "grad_norm": 0.031195219606161118, "grad_norm_var": 3.9723813351495196e-05, "learning_rate": 0.00899722276787035, "loss": 2.6065, "step": 6097 }, { "crossentropy": 2.6231982707977295, "epoch": 0.2210701856148492, "grad_norm": 0.03672891482710838, "grad_norm_var": 3.661649394133294e-05, "learning_rate": 0.008996873658308079, "loss": 2.5973, "step": 6098 }, { "crossentropy": 2.660506010055542, "epoch": 0.22110643851508122, "grad_norm": 0.03524624556303024, "grad_norm_var": 3.49292837817168e-05, "learning_rate": 0.008996524494761963, "loss": 2.7097, "step": 6099 }, { "crossentropy": 2.6554207801818848, "epoch": 0.22114269141531323, "grad_norm": 0.0446915365755558, "grad_norm_var": 3.9646995233378185e-05, "learning_rate": 0.008996175277236718, "loss": 2.6848, "step": 6100 }, { "crossentropy": 2.5606958866119385, "epoch": 0.22117894431554525, "grad_norm": 0.06468147784471512, "grad_norm_var": 8.935968307668587e-05, "learning_rate": 0.008995826005737062, "loss": 2.5382, "step": 6101 }, { "crossentropy": 2.5476012229919434, "epoch": 0.22121519721577726, "grad_norm": 0.043265387415885925, "grad_norm_var": 8.918305437903863e-05, "learning_rate": 0.008995476680267714, "loss": 2.5681, "step": 6102 }, { "crossentropy": 2.7505033016204834, "epoch": 0.22125145011600927, "grad_norm": 0.037488531321287155, "grad_norm_var": 8.699873947165578e-05, "learning_rate": 0.008995127300833388, "loss": 2.6679, "step": 6103 }, { "crossentropy": 2.7489025592803955, "epoch": 0.2212877030162413, "grad_norm": 0.0333600677549839, "grad_norm_var": 8.654518761015273e-05, "learning_rate": 0.008994777867438804, "loss": 2.6863, "step": 6104 }, { "crossentropy": 2.8113579750061035, "epoch": 0.22132395591647333, "grad_norm": 0.03204404562711716, "grad_norm_var": 8.6150975757158e-05, "learning_rate": 0.008994428380088683, "loss": 2.7211, "step": 6105 }, { "crossentropy": 2.7433552742004395, "epoch": 0.22136020881670534, "grad_norm": 0.12888723611831665, "grad_norm_var": 0.0005795473305764571, "learning_rate": 0.008994078838787747, "loss": 2.7128, "step": 6106 }, { "crossentropy": 2.5844335556030273, "epoch": 0.22139646171693736, "grad_norm": 0.03563638776540756, "grad_norm_var": 0.0005773447304305346, "learning_rate": 0.008993729243540714, "loss": 2.6145, "step": 6107 }, { "crossentropy": 2.7318243980407715, "epoch": 0.22143271461716937, "grad_norm": 0.03395283222198486, "grad_norm_var": 0.0005853838584242388, "learning_rate": 0.008993379594352306, "loss": 2.7764, "step": 6108 }, { "crossentropy": 2.507253646850586, "epoch": 0.2214689675174014, "grad_norm": 0.03479794040322304, "grad_norm_var": 0.0005861645964394014, "learning_rate": 0.008993029891227246, "loss": 2.5481, "step": 6109 }, { "crossentropy": 2.684504747390747, "epoch": 0.2215052204176334, "grad_norm": 0.0324668288230896, "grad_norm_var": 0.0005932685571889062, "learning_rate": 0.00899268013417026, "loss": 2.6607, "step": 6110 }, { "crossentropy": 2.700507164001465, "epoch": 0.22154147331786542, "grad_norm": 0.03507230058312416, "grad_norm_var": 0.0005901072781601754, "learning_rate": 0.008992330323186068, "loss": 2.7089, "step": 6111 }, { "crossentropy": 2.742511034011841, "epoch": 0.22157772621809746, "grad_norm": 0.03255710378289223, "grad_norm_var": 0.0005877956509097816, "learning_rate": 0.008991980458279397, "loss": 2.7033, "step": 6112 }, { "crossentropy": 2.669081687927246, "epoch": 0.22161397911832947, "grad_norm": 0.03394069895148277, "grad_norm_var": 0.0005838522858100009, "learning_rate": 0.00899163053945497, "loss": 2.7191, "step": 6113 }, { "crossentropy": 2.6337265968322754, "epoch": 0.2216502320185615, "grad_norm": 0.03365488350391388, "grad_norm_var": 0.0005871878693788972, "learning_rate": 0.00899128056671752, "loss": 2.6503, "step": 6114 }, { "crossentropy": 2.6015608310699463, "epoch": 0.2216864849187935, "grad_norm": 0.034311458468437195, "grad_norm_var": 0.0005882380596674462, "learning_rate": 0.008990930540071764, "loss": 2.6243, "step": 6115 }, { "crossentropy": 2.6233670711517334, "epoch": 0.22172273781902552, "grad_norm": 0.037280451506376266, "grad_norm_var": 0.0005901728018569709, "learning_rate": 0.008990580459522437, "loss": 2.6819, "step": 6116 }, { "crossentropy": 2.7544264793395996, "epoch": 0.22175899071925753, "grad_norm": 0.04296550154685974, "grad_norm_var": 0.0005560359763837683, "learning_rate": 0.008990230325074264, "loss": 2.7284, "step": 6117 }, { "crossentropy": 2.6782405376434326, "epoch": 0.22179524361948955, "grad_norm": 0.04481429234147072, "grad_norm_var": 0.0005565804336295811, "learning_rate": 0.008989880136731977, "loss": 2.6255, "step": 6118 }, { "crossentropy": 2.676862955093384, "epoch": 0.2218314965197216, "grad_norm": 0.03557903692126274, "grad_norm_var": 0.0005578173924473108, "learning_rate": 0.008989529894500302, "loss": 2.6747, "step": 6119 }, { "crossentropy": 2.6610069274902344, "epoch": 0.2218677494199536, "grad_norm": 0.036566413938999176, "grad_norm_var": 0.0005550515879189708, "learning_rate": 0.00898917959838397, "loss": 2.6904, "step": 6120 }, { "crossentropy": 2.745495319366455, "epoch": 0.22190400232018562, "grad_norm": 0.035197533667087555, "grad_norm_var": 0.000551683359900397, "learning_rate": 0.008988829248387715, "loss": 2.6508, "step": 6121 }, { "crossentropy": 2.647745132446289, "epoch": 0.22194025522041763, "grad_norm": 0.0376761257648468, "grad_norm_var": 1.1689605916219367e-05, "learning_rate": 0.008988478844516269, "loss": 2.56, "step": 6122 }, { "crossentropy": 2.7248566150665283, "epoch": 0.22197650812064965, "grad_norm": 0.03471703827381134, "grad_norm_var": 1.1790601875374893e-05, "learning_rate": 0.008988128386774362, "loss": 2.7315, "step": 6123 }, { "crossentropy": 2.481370449066162, "epoch": 0.22201276102088166, "grad_norm": 0.03321129456162453, "grad_norm_var": 1.2024598147726782e-05, "learning_rate": 0.008987777875166731, "loss": 2.5463, "step": 6124 }, { "crossentropy": 2.777454376220703, "epoch": 0.2220490139211137, "grad_norm": 0.03334109112620354, "grad_norm_var": 1.2376284308930156e-05, "learning_rate": 0.008987427309698107, "loss": 2.7542, "step": 6125 }, { "crossentropy": 2.6859583854675293, "epoch": 0.22208526682134572, "grad_norm": 0.03770458325743675, "grad_norm_var": 1.1739040146637771e-05, "learning_rate": 0.008987076690373226, "loss": 2.6967, "step": 6126 }, { "crossentropy": 2.7033028602600098, "epoch": 0.22212151972157773, "grad_norm": 0.03745920956134796, "grad_norm_var": 1.1748365371781062e-05, "learning_rate": 0.008986726017196823, "loss": 2.6953, "step": 6127 }, { "crossentropy": 2.779022693634033, "epoch": 0.22215777262180975, "grad_norm": 0.035068582743406296, "grad_norm_var": 1.0885526590057963e-05, "learning_rate": 0.008986375290173635, "loss": 2.6605, "step": 6128 }, { "crossentropy": 2.7956457138061523, "epoch": 0.22219402552204176, "grad_norm": 0.03595173358917236, "grad_norm_var": 1.0460624246278783e-05, "learning_rate": 0.0089860245093084, "loss": 2.7414, "step": 6129 }, { "crossentropy": 2.7739930152893066, "epoch": 0.22223027842227377, "grad_norm": 0.03454221040010452, "grad_norm_var": 1.0162141209871187e-05, "learning_rate": 0.008985673674605855, "loss": 2.6964, "step": 6130 }, { "crossentropy": 2.8094418048858643, "epoch": 0.2222665313225058, "grad_norm": 0.036856573075056076, "grad_norm_var": 9.773696006687195e-06, "learning_rate": 0.008985322786070738, "loss": 2.7825, "step": 6131 }, { "crossentropy": 2.666531562805176, "epoch": 0.22230278422273783, "grad_norm": 0.03740452602505684, "grad_norm_var": 9.782470257542937e-06, "learning_rate": 0.008984971843707787, "loss": 2.6933, "step": 6132 }, { "crossentropy": 2.6344966888427734, "epoch": 0.22233903712296985, "grad_norm": 0.0372178815305233, "grad_norm_var": 7.134487696364098e-06, "learning_rate": 0.008984620847521748, "loss": 2.7141, "step": 6133 }, { "crossentropy": 2.5866119861602783, "epoch": 0.22237529002320186, "grad_norm": 0.03829082474112511, "grad_norm_var": 2.524867209650836e-06, "learning_rate": 0.008984269797517356, "loss": 2.5737, "step": 6134 }, { "crossentropy": 2.755603551864624, "epoch": 0.22241154292343387, "grad_norm": 0.037051666527986526, "grad_norm_var": 2.568121453524047e-06, "learning_rate": 0.008983918693699352, "loss": 2.6744, "step": 6135 }, { "crossentropy": 2.714291572570801, "epoch": 0.2224477958236659, "grad_norm": 0.03495495393872261, "grad_norm_var": 2.639033962499824e-06, "learning_rate": 0.008983567536072481, "loss": 2.7294, "step": 6136 }, { "crossentropy": 2.657456398010254, "epoch": 0.2224840487238979, "grad_norm": 0.0384836308658123, "grad_norm_var": 2.9446530203429295e-06, "learning_rate": 0.008983216324641486, "loss": 2.6947, "step": 6137 }, { "crossentropy": 2.715346097946167, "epoch": 0.22252030162412992, "grad_norm": 0.034565269947052, "grad_norm_var": 2.956197727171407e-06, "learning_rate": 0.008982865059411111, "loss": 2.6645, "step": 6138 }, { "crossentropy": 2.7808773517608643, "epoch": 0.22255655452436196, "grad_norm": 0.03422923758625984, "grad_norm_var": 3.0578511748026913e-06, "learning_rate": 0.008982513740386097, "loss": 2.7058, "step": 6139 }, { "crossentropy": 2.588871717453003, "epoch": 0.22259280742459397, "grad_norm": 0.03287062793970108, "grad_norm_var": 3.19271983039663e-06, "learning_rate": 0.008982162367571192, "loss": 2.6002, "step": 6140 }, { "crossentropy": 2.699143171310425, "epoch": 0.222629060324826, "grad_norm": 0.03415153548121452, "grad_norm_var": 2.9465013501682344e-06, "learning_rate": 0.008981810940971143, "loss": 2.6368, "step": 6141 }, { "crossentropy": 2.738917827606201, "epoch": 0.222665313225058, "grad_norm": 0.0323343351483345, "grad_norm_var": 3.564374168250432e-06, "learning_rate": 0.008981459460590691, "loss": 2.7294, "step": 6142 }, { "crossentropy": 2.710871458053589, "epoch": 0.22270156612529002, "grad_norm": 0.03358132764697075, "grad_norm_var": 3.602169313434283e-06, "learning_rate": 0.008981107926434589, "loss": 2.7746, "step": 6143 }, { "crossentropy": 2.4447226524353027, "epoch": 0.22273781902552203, "grad_norm": 0.03258466720581055, "grad_norm_var": 4.1214517242491955e-06, "learning_rate": 0.008980756338507583, "loss": 2.5765, "step": 6144 }, { "crossentropy": 2.5826382637023926, "epoch": 0.22277407192575405, "grad_norm": 0.031375735998153687, "grad_norm_var": 5.042876359211597e-06, "learning_rate": 0.008980404696814421, "loss": 2.5978, "step": 6145 }, { "crossentropy": 2.648852825164795, "epoch": 0.2228103248259861, "grad_norm": 0.0342567153275013, "grad_norm_var": 5.066574474161568e-06, "learning_rate": 0.008980053001359853, "loss": 2.6951, "step": 6146 }, { "crossentropy": 2.6842703819274902, "epoch": 0.2228465777262181, "grad_norm": 0.03298790007829666, "grad_norm_var": 5.0510800280879896e-06, "learning_rate": 0.00897970125214863, "loss": 2.5856, "step": 6147 }, { "crossentropy": 2.914929151535034, "epoch": 0.22288283062645012, "grad_norm": 0.03870833292603493, "grad_norm_var": 5.615086599367741e-06, "learning_rate": 0.0089793494491855, "loss": 2.7534, "step": 6148 }, { "crossentropy": 2.6053638458251953, "epoch": 0.22291908352668213, "grad_norm": 0.03324470296502113, "grad_norm_var": 5.3487968096871395e-06, "learning_rate": 0.00897899759247522, "loss": 2.6293, "step": 6149 }, { "crossentropy": 2.6881935596466064, "epoch": 0.22295533642691415, "grad_norm": 0.035589635372161865, "grad_norm_var": 4.477149758602174e-06, "learning_rate": 0.008978645682022538, "loss": 2.707, "step": 6150 }, { "crossentropy": 2.555995464324951, "epoch": 0.22299158932714616, "grad_norm": 0.03697476163506508, "grad_norm_var": 4.450694729995058e-06, "learning_rate": 0.008978293717832207, "loss": 2.586, "step": 6151 }, { "crossentropy": 2.672106981277466, "epoch": 0.2230278422273782, "grad_norm": 0.03562050312757492, "grad_norm_var": 4.524889664799201e-06, "learning_rate": 0.008977941699908985, "loss": 2.625, "step": 6152 }, { "crossentropy": 2.6090047359466553, "epoch": 0.22306409512761022, "grad_norm": 0.04188014194369316, "grad_norm_var": 7.062451612893419e-06, "learning_rate": 0.00897758962825762, "loss": 2.6289, "step": 6153 }, { "crossentropy": 2.664567232131958, "epoch": 0.22310034802784223, "grad_norm": 0.0376630537211895, "grad_norm_var": 7.612883055753359e-06, "learning_rate": 0.008977237502882873, "loss": 2.6619, "step": 6154 }, { "crossentropy": 2.6404175758361816, "epoch": 0.22313660092807425, "grad_norm": 0.03654274716973305, "grad_norm_var": 7.74718053561657e-06, "learning_rate": 0.008976885323789497, "loss": 2.6636, "step": 6155 }, { "crossentropy": 2.4025795459747314, "epoch": 0.22317285382830626, "grad_norm": 0.038908444344997406, "grad_norm_var": 8.292946126047841e-06, "learning_rate": 0.008976533090982251, "loss": 2.521, "step": 6156 }, { "crossentropy": 2.8200974464416504, "epoch": 0.22320910672853828, "grad_norm": 0.03274454176425934, "grad_norm_var": 8.650937213694244e-06, "learning_rate": 0.00897618080446589, "loss": 2.7299, "step": 6157 }, { "crossentropy": 2.664759397506714, "epoch": 0.2232453596287703, "grad_norm": 0.03464943915605545, "grad_norm_var": 8.066664692918159e-06, "learning_rate": 0.008975828464245173, "loss": 2.6423, "step": 6158 }, { "crossentropy": 2.638176202774048, "epoch": 0.22328161252900233, "grad_norm": 0.035810768604278564, "grad_norm_var": 7.81974294387777e-06, "learning_rate": 0.00897547607032486, "loss": 2.5715, "step": 6159 }, { "crossentropy": 2.6939754486083984, "epoch": 0.22331786542923435, "grad_norm": 0.03493776172399521, "grad_norm_var": 7.220895935885983e-06, "learning_rate": 0.008975123622709708, "loss": 2.6401, "step": 6160 }, { "crossentropy": 2.825157642364502, "epoch": 0.22335411832946636, "grad_norm": 0.03784569352865219, "grad_norm_var": 6.069312054965357e-06, "learning_rate": 0.00897477112140448, "loss": 2.8748, "step": 6161 }, { "crossentropy": 2.7023050785064697, "epoch": 0.22339037122969838, "grad_norm": 0.0383165068924427, "grad_norm_var": 6.075765045906953e-06, "learning_rate": 0.008974418566413936, "loss": 2.634, "step": 6162 }, { "crossentropy": 2.641360282897949, "epoch": 0.2234266241299304, "grad_norm": 0.03487507998943329, "grad_norm_var": 5.4393972171717354e-06, "learning_rate": 0.008974065957742836, "loss": 2.6628, "step": 6163 }, { "crossentropy": 2.5277442932128906, "epoch": 0.2234628770301624, "grad_norm": 0.0345500111579895, "grad_norm_var": 5.30654581255748e-06, "learning_rate": 0.008973713295395949, "loss": 2.5801, "step": 6164 }, { "crossentropy": 2.7745308876037598, "epoch": 0.22349912993039442, "grad_norm": 0.033204156905412674, "grad_norm_var": 5.32294758605519e-06, "learning_rate": 0.00897336057937803, "loss": 2.742, "step": 6165 }, { "crossentropy": 2.7462828159332275, "epoch": 0.22353538283062646, "grad_norm": 0.03238658979535103, "grad_norm_var": 6.249212925044043e-06, "learning_rate": 0.008973007809693847, "loss": 2.6676, "step": 6166 }, { "crossentropy": 2.764706611633301, "epoch": 0.22357163573085848, "grad_norm": 0.03083157353103161, "grad_norm_var": 7.85606237588437e-06, "learning_rate": 0.008972654986348165, "loss": 2.7232, "step": 6167 }, { "crossentropy": 2.592630624771118, "epoch": 0.2236078886310905, "grad_norm": 0.03377317264676094, "grad_norm_var": 8.08226709916742e-06, "learning_rate": 0.008972302109345747, "loss": 2.6549, "step": 6168 }, { "crossentropy": 2.671046495437622, "epoch": 0.2236441415313225, "grad_norm": 0.032454732805490494, "grad_norm_var": 5.688839992333073e-06, "learning_rate": 0.008971949178691364, "loss": 2.7134, "step": 6169 }, { "crossentropy": 2.6339104175567627, "epoch": 0.22368039443155452, "grad_norm": 0.03430279344320297, "grad_norm_var": 5.187247396053413e-06, "learning_rate": 0.008971596194389777, "loss": 2.6424, "step": 6170 }, { "crossentropy": 2.7613816261291504, "epoch": 0.22371664733178653, "grad_norm": 0.033993273973464966, "grad_norm_var": 4.98692482813535e-06, "learning_rate": 0.008971243156445756, "loss": 2.7603, "step": 6171 }, { "crossentropy": 2.7536988258361816, "epoch": 0.22375290023201855, "grad_norm": 0.033499229699373245, "grad_norm_var": 3.707579778705486e-06, "learning_rate": 0.008970890064864071, "loss": 2.7356, "step": 6172 }, { "crossentropy": 2.6074090003967285, "epoch": 0.2237891531322506, "grad_norm": 0.03223760426044464, "grad_norm_var": 3.826138472193133e-06, "learning_rate": 0.00897053691964949, "loss": 2.6647, "step": 6173 }, { "crossentropy": 2.7526044845581055, "epoch": 0.2238254060324826, "grad_norm": 0.033292125910520554, "grad_norm_var": 3.86524279074236e-06, "learning_rate": 0.008970183720806782, "loss": 2.7197, "step": 6174 }, { "crossentropy": 2.6540403366088867, "epoch": 0.22386165893271462, "grad_norm": 0.03804725781083107, "grad_norm_var": 4.674756680823653e-06, "learning_rate": 0.008969830468340717, "loss": 2.6516, "step": 6175 }, { "crossentropy": 2.6398167610168457, "epoch": 0.22389791183294663, "grad_norm": 0.03825198858976364, "grad_norm_var": 5.650059767855796e-06, "learning_rate": 0.008969477162256068, "loss": 2.6595, "step": 6176 }, { "crossentropy": 2.752239227294922, "epoch": 0.22393416473317865, "grad_norm": 0.03509225696325302, "grad_norm_var": 4.892439506555763e-06, "learning_rate": 0.008969123802557606, "loss": 2.7322, "step": 6177 }, { "crossentropy": 2.5439395904541016, "epoch": 0.22397041763341066, "grad_norm": 0.03705469146370888, "grad_norm_var": 4.319447646264337e-06, "learning_rate": 0.008968770389250104, "loss": 2.6309, "step": 6178 }, { "crossentropy": 2.6227235794067383, "epoch": 0.2240066705336427, "grad_norm": 0.03923161327838898, "grad_norm_var": 5.87432126242375e-06, "learning_rate": 0.008968416922338334, "loss": 2.5999, "step": 6179 }, { "crossentropy": 2.7757885456085205, "epoch": 0.22404292343387472, "grad_norm": 0.03509344160556793, "grad_norm_var": 5.895482600966556e-06, "learning_rate": 0.008968063401827072, "loss": 2.6947, "step": 6180 }, { "crossentropy": 2.5438919067382812, "epoch": 0.22407917633410673, "grad_norm": 0.03266644850373268, "grad_norm_var": 6.009803015225472e-06, "learning_rate": 0.00896770982772109, "loss": 2.5717, "step": 6181 }, { "crossentropy": 2.7685091495513916, "epoch": 0.22411542923433875, "grad_norm": 0.04724438488483429, "grad_norm_var": 1.559433207435564e-05, "learning_rate": 0.00896735620002517, "loss": 2.6883, "step": 6182 }, { "crossentropy": 2.6635303497314453, "epoch": 0.22415168213457076, "grad_norm": 0.03453606367111206, "grad_norm_var": 1.4174964801647444e-05, "learning_rate": 0.008967002518744079, "loss": 2.6433, "step": 6183 }, { "crossentropy": 2.7358880043029785, "epoch": 0.22418793503480278, "grad_norm": 0.03449664264917374, "grad_norm_var": 1.402439687499905e-05, "learning_rate": 0.008966648783882602, "loss": 2.712, "step": 6184 }, { "crossentropy": 2.671940565109253, "epoch": 0.2242241879350348, "grad_norm": 0.033351436257362366, "grad_norm_var": 1.368444502342029e-05, "learning_rate": 0.008966294995445512, "loss": 2.6316, "step": 6185 }, { "crossentropy": 2.6547296047210693, "epoch": 0.22426044083526683, "grad_norm": 0.03755560144782066, "grad_norm_var": 1.3707472252001233e-05, "learning_rate": 0.00896594115343759, "loss": 2.6973, "step": 6186 }, { "crossentropy": 2.8128225803375244, "epoch": 0.22429669373549885, "grad_norm": 0.034716345369815826, "grad_norm_var": 1.3548826519684208e-05, "learning_rate": 0.008965587257863615, "loss": 2.7196, "step": 6187 }, { "crossentropy": 2.721510171890259, "epoch": 0.22433294663573086, "grad_norm": 0.046587083488702774, "grad_norm_var": 1.9850567723626172e-05, "learning_rate": 0.008965233308728364, "loss": 2.7026, "step": 6188 }, { "crossentropy": 2.6207308769226074, "epoch": 0.22436919953596288, "grad_norm": 0.036910682916641235, "grad_norm_var": 1.8347190496168646e-05, "learning_rate": 0.00896487930603662, "loss": 2.6513, "step": 6189 }, { "crossentropy": 2.6779000759124756, "epoch": 0.2244054524361949, "grad_norm": 0.034025464206933975, "grad_norm_var": 1.8005246991290025e-05, "learning_rate": 0.008964525249793165, "loss": 2.6939, "step": 6190 }, { "crossentropy": 2.6280677318573, "epoch": 0.2244417053364269, "grad_norm": 0.032409247010946274, "grad_norm_var": 1.933912325301652e-05, "learning_rate": 0.008964171140002778, "loss": 2.6544, "step": 6191 }, { "crossentropy": 2.6529743671417236, "epoch": 0.22447795823665892, "grad_norm": 0.032431796193122864, "grad_norm_var": 2.0350043036286106e-05, "learning_rate": 0.008963816976670248, "loss": 2.6898, "step": 6192 }, { "crossentropy": 2.5547738075256348, "epoch": 0.22451421113689096, "grad_norm": 0.03418593108654022, "grad_norm_var": 2.0566991262691707e-05, "learning_rate": 0.008963462759800351, "loss": 2.6432, "step": 6193 }, { "crossentropy": 2.5218939781188965, "epoch": 0.22455046403712298, "grad_norm": 0.034911200404167175, "grad_norm_var": 2.0668771248163962e-05, "learning_rate": 0.008963108489397876, "loss": 2.5675, "step": 6194 }, { "crossentropy": 2.5858519077301025, "epoch": 0.224586716937355, "grad_norm": 0.040840018540620804, "grad_norm_var": 2.1465139189087944e-05, "learning_rate": 0.008962754165467606, "loss": 2.6929, "step": 6195 }, { "crossentropy": 2.631559133529663, "epoch": 0.224622969837587, "grad_norm": 0.03605665639042854, "grad_norm_var": 2.135884352789981e-05, "learning_rate": 0.008962399788014329, "loss": 2.6062, "step": 6196 }, { "crossentropy": 2.7170345783233643, "epoch": 0.22465922273781902, "grad_norm": 0.03270924463868141, "grad_norm_var": 2.1337466553231615e-05, "learning_rate": 0.008962045357042828, "loss": 2.6999, "step": 6197 }, { "crossentropy": 2.670499563217163, "epoch": 0.22469547563805103, "grad_norm": 0.03312455490231514, "grad_norm_var": 1.3448760592000656e-05, "learning_rate": 0.00896169087255789, "loss": 2.6398, "step": 6198 }, { "crossentropy": 2.712292432785034, "epoch": 0.22473172853828308, "grad_norm": 0.03283214196562767, "grad_norm_var": 1.3861256795800041e-05, "learning_rate": 0.008961336334564307, "loss": 2.7237, "step": 6199 }, { "crossentropy": 2.8322038650512695, "epoch": 0.2247679814385151, "grad_norm": 0.037185825407505035, "grad_norm_var": 1.3972658558164947e-05, "learning_rate": 0.008960981743066867, "loss": 2.732, "step": 6200 }, { "crossentropy": 2.765847682952881, "epoch": 0.2248042343387471, "grad_norm": 0.040833234786987305, "grad_norm_var": 1.5213592085315936e-05, "learning_rate": 0.008960627098070354, "loss": 2.6569, "step": 6201 }, { "crossentropy": 2.7906241416931152, "epoch": 0.22484048723897912, "grad_norm": 0.04886358976364136, "grad_norm_var": 2.5427014887395108e-05, "learning_rate": 0.008960272399579564, "loss": 2.7975, "step": 6202 }, { "crossentropy": 2.6440768241882324, "epoch": 0.22487674013921113, "grad_norm": 0.04906490072607994, "grad_norm_var": 3.4329417508608255e-05, "learning_rate": 0.008959917647599284, "loss": 2.5729, "step": 6203 }, { "crossentropy": 2.4672272205352783, "epoch": 0.22491299303944315, "grad_norm": 0.04208528250455856, "grad_norm_var": 3.0253102143294047e-05, "learning_rate": 0.008959562842134308, "loss": 2.6183, "step": 6204 }, { "crossentropy": 2.719693183898926, "epoch": 0.22494924593967516, "grad_norm": 0.035842977464199066, "grad_norm_var": 3.0394632156934463e-05, "learning_rate": 0.008959207983189426, "loss": 2.6954, "step": 6205 }, { "crossentropy": 2.6358370780944824, "epoch": 0.2249854988399072, "grad_norm": 0.033507272601127625, "grad_norm_var": 3.064025962335037e-05, "learning_rate": 0.008958853070769433, "loss": 2.6196, "step": 6206 }, { "crossentropy": 2.698305130004883, "epoch": 0.22502175174013922, "grad_norm": 0.03324773162603378, "grad_norm_var": 3.0136838438267708e-05, "learning_rate": 0.00895849810487912, "loss": 2.6722, "step": 6207 }, { "crossentropy": 2.8461620807647705, "epoch": 0.22505800464037123, "grad_norm": 0.034883759915828705, "grad_norm_var": 2.8902195178704774e-05, "learning_rate": 0.008958143085523283, "loss": 2.8869, "step": 6208 }, { "crossentropy": 2.724217414855957, "epoch": 0.22509425754060325, "grad_norm": 0.03505851700901985, "grad_norm_var": 2.856294081874973e-05, "learning_rate": 0.008957788012706717, "loss": 2.6666, "step": 6209 }, { "crossentropy": 2.680084705352783, "epoch": 0.22513051044083526, "grad_norm": 0.03713410347700119, "grad_norm_var": 2.8085092067596924e-05, "learning_rate": 0.00895743288643422, "loss": 2.7298, "step": 6210 }, { "crossentropy": 2.7000198364257812, "epoch": 0.22516676334106728, "grad_norm": 0.037182558327913284, "grad_norm_var": 2.739201758069533e-05, "learning_rate": 0.008957077706710583, "loss": 2.7161, "step": 6211 }, { "crossentropy": 2.566260814666748, "epoch": 0.2252030162412993, "grad_norm": 0.03533541038632393, "grad_norm_var": 2.7561000676836326e-05, "learning_rate": 0.008956722473540608, "loss": 2.6364, "step": 6212 }, { "crossentropy": 2.7339162826538086, "epoch": 0.22523926914153133, "grad_norm": 0.039171576499938965, "grad_norm_var": 2.6102899228522965e-05, "learning_rate": 0.008956367186929092, "loss": 2.6118, "step": 6213 }, { "crossentropy": 2.7562808990478516, "epoch": 0.22527552204176335, "grad_norm": 0.03384122997522354, "grad_norm_var": 2.5684925392168747e-05, "learning_rate": 0.008956011846880833, "loss": 2.7418, "step": 6214 }, { "crossentropy": 2.6120657920837402, "epoch": 0.22531177494199536, "grad_norm": 0.03445132449269295, "grad_norm_var": 2.4759131173294514e-05, "learning_rate": 0.008955656453400629, "loss": 2.6625, "step": 6215 }, { "crossentropy": 2.6747241020202637, "epoch": 0.22534802784222738, "grad_norm": 0.03600998967885971, "grad_norm_var": 2.4970143286434133e-05, "learning_rate": 0.008955301006493284, "loss": 2.685, "step": 6216 }, { "crossentropy": 2.8257148265838623, "epoch": 0.2253842807424594, "grad_norm": 0.035517048090696335, "grad_norm_var": 2.4662384935390338e-05, "learning_rate": 0.008954945506163594, "loss": 2.8378, "step": 6217 }, { "crossentropy": 2.6557202339172363, "epoch": 0.2254205336426914, "grad_norm": 0.037096258252859116, "grad_norm_var": 1.560495441948097e-05, "learning_rate": 0.008954589952416365, "loss": 2.6265, "step": 6218 }, { "crossentropy": 2.5934977531433105, "epoch": 0.22545678654292342, "grad_norm": 0.03243520110845566, "grad_norm_var": 5.781552017284503e-06, "learning_rate": 0.008954234345256397, "loss": 2.5818, "step": 6219 }, { "crossentropy": 2.6117053031921387, "epoch": 0.22549303944315546, "grad_norm": 0.035988129675388336, "grad_norm_var": 2.9953750881862863e-06, "learning_rate": 0.008953878684688492, "loss": 2.5931, "step": 6220 }, { "crossentropy": 2.8410205841064453, "epoch": 0.22552929234338748, "grad_norm": 0.0388868972659111, "grad_norm_var": 3.746562478678985e-06, "learning_rate": 0.008953522970717458, "loss": 2.7763, "step": 6221 }, { "crossentropy": 2.784851551055908, "epoch": 0.2255655452436195, "grad_norm": 0.038112103939056396, "grad_norm_var": 3.781313155690885e-06, "learning_rate": 0.008953167203348093, "loss": 2.7803, "step": 6222 }, { "crossentropy": 2.6757874488830566, "epoch": 0.2256017981438515, "grad_norm": 0.03373980149626732, "grad_norm_var": 3.6226304242529055e-06, "learning_rate": 0.008952811382585208, "loss": 2.6743, "step": 6223 }, { "crossentropy": 2.678720474243164, "epoch": 0.22563805104408352, "grad_norm": 0.032639436423778534, "grad_norm_var": 4.249847342279305e-06, "learning_rate": 0.008952455508433608, "loss": 2.5903, "step": 6224 }, { "crossentropy": 2.590390205383301, "epoch": 0.22567430394431554, "grad_norm": 0.034400615841150284, "grad_norm_var": 4.340843696215973e-06, "learning_rate": 0.008952099580898097, "loss": 2.6326, "step": 6225 }, { "crossentropy": 2.690465211868286, "epoch": 0.22571055684454758, "grad_norm": 0.03569513559341431, "grad_norm_var": 4.204001309122905e-06, "learning_rate": 0.008951743599983481, "loss": 2.6934, "step": 6226 }, { "crossentropy": 2.5132977962493896, "epoch": 0.2257468097447796, "grad_norm": 0.03857099264860153, "grad_norm_var": 4.60701142072061e-06, "learning_rate": 0.008951387565694574, "loss": 2.5868, "step": 6227 }, { "crossentropy": 2.7055299282073975, "epoch": 0.2257830626450116, "grad_norm": 0.03510957583785057, "grad_norm_var": 4.622477971340707e-06, "learning_rate": 0.00895103147803618, "loss": 2.7172, "step": 6228 }, { "crossentropy": 2.667949676513672, "epoch": 0.22581931554524362, "grad_norm": 0.03995496407151222, "grad_norm_var": 5.020408266191191e-06, "learning_rate": 0.008950675337013112, "loss": 2.7385, "step": 6229 }, { "crossentropy": 2.670079469680786, "epoch": 0.22585556844547564, "grad_norm": 0.031078273430466652, "grad_norm_var": 6.211039894238194e-06, "learning_rate": 0.008950319142630177, "loss": 2.6722, "step": 6230 }, { "crossentropy": 2.642935037612915, "epoch": 0.22589182134570765, "grad_norm": 0.03236272186040878, "grad_norm_var": 6.805057205546425e-06, "learning_rate": 0.008949962894892185, "loss": 2.6101, "step": 6231 }, { "crossentropy": 2.7916972637176514, "epoch": 0.22592807424593966, "grad_norm": 0.03393600136041641, "grad_norm_var": 6.925905441257244e-06, "learning_rate": 0.008949606593803951, "loss": 2.711, "step": 6232 }, { "crossentropy": 2.529507637023926, "epoch": 0.2259643271461717, "grad_norm": 0.03315332904458046, "grad_norm_var": 7.220942494605637e-06, "learning_rate": 0.008949250239370288, "loss": 2.5348, "step": 6233 }, { "crossentropy": 2.6370012760162354, "epoch": 0.22600058004640372, "grad_norm": 0.04255153611302376, "grad_norm_var": 1.0462072098012818e-05, "learning_rate": 0.008948893831596007, "loss": 2.7099, "step": 6234 }, { "crossentropy": 2.6053667068481445, "epoch": 0.22603683294663574, "grad_norm": 0.041018225252628326, "grad_norm_var": 1.1515007681614379e-05, "learning_rate": 0.008948537370485921, "loss": 2.6476, "step": 6235 }, { "crossentropy": 2.6265146732330322, "epoch": 0.22607308584686775, "grad_norm": 0.04143152013421059, "grad_norm_var": 1.3303967223033868e-05, "learning_rate": 0.008948180856044844, "loss": 2.6402, "step": 6236 }, { "crossentropy": 2.7122318744659424, "epoch": 0.22610933874709976, "grad_norm": 0.03894787281751633, "grad_norm_var": 1.332429573137997e-05, "learning_rate": 0.008947824288277596, "loss": 2.6527, "step": 6237 }, { "crossentropy": 2.676023244857788, "epoch": 0.22614559164733178, "grad_norm": 0.03682278096675873, "grad_norm_var": 1.3137111447629801e-05, "learning_rate": 0.00894746766718899, "loss": 2.7034, "step": 6238 }, { "crossentropy": 2.556614398956299, "epoch": 0.2261818445475638, "grad_norm": 0.03494391590356827, "grad_norm_var": 1.2810544573648102e-05, "learning_rate": 0.00894711099278384, "loss": 2.5886, "step": 6239 }, { "crossentropy": 2.591839551925659, "epoch": 0.22621809744779584, "grad_norm": 0.03355344012379646, "grad_norm_var": 1.2402816002211635e-05, "learning_rate": 0.008946754265066968, "loss": 2.627, "step": 6240 }, { "crossentropy": 2.5767929553985596, "epoch": 0.22625435034802785, "grad_norm": 0.032585278153419495, "grad_norm_var": 1.3109830733569341e-05, "learning_rate": 0.008946397484043191, "loss": 2.6102, "step": 6241 }, { "crossentropy": 2.6497480869293213, "epoch": 0.22629060324825986, "grad_norm": 0.03270895034074783, "grad_norm_var": 1.393077743455233e-05, "learning_rate": 0.008946040649717326, "loss": 2.6766, "step": 6242 }, { "crossentropy": 2.6863174438476562, "epoch": 0.22632685614849188, "grad_norm": 0.03983275219798088, "grad_norm_var": 1.4434111203594326e-05, "learning_rate": 0.008945683762094193, "loss": 2.7483, "step": 6243 }, { "crossentropy": 2.894270420074463, "epoch": 0.2263631090487239, "grad_norm": 0.043056536465883255, "grad_norm_var": 1.7173447108559147e-05, "learning_rate": 0.008945326821178615, "loss": 2.8178, "step": 6244 }, { "crossentropy": 2.698896884918213, "epoch": 0.2263993619489559, "grad_norm": 0.03632881119847298, "grad_norm_var": 1.6443829482470905e-05, "learning_rate": 0.008944969826975409, "loss": 2.6662, "step": 6245 }, { "crossentropy": 2.6049892902374268, "epoch": 0.22643561484918792, "grad_norm": 0.03425271809101105, "grad_norm_var": 1.4770599925165104e-05, "learning_rate": 0.0089446127794894, "loss": 2.6487, "step": 6246 }, { "crossentropy": 2.759312152862549, "epoch": 0.22647186774941996, "grad_norm": 0.03568846359848976, "grad_norm_var": 1.3530658721327045e-05, "learning_rate": 0.008944255678725407, "loss": 2.7477, "step": 6247 }, { "crossentropy": 2.6856515407562256, "epoch": 0.22650812064965198, "grad_norm": 0.03798222169280052, "grad_norm_var": 1.2940940400069968e-05, "learning_rate": 0.008943898524688259, "loss": 2.7557, "step": 6248 }, { "crossentropy": 2.638141393661499, "epoch": 0.226544373549884, "grad_norm": 0.04026392102241516, "grad_norm_var": 1.228465364504494e-05, "learning_rate": 0.008943541317382772, "loss": 2.6976, "step": 6249 }, { "crossentropy": 2.8407907485961914, "epoch": 0.226580626450116, "grad_norm": 0.041783999651670456, "grad_norm_var": 1.1817101708137128e-05, "learning_rate": 0.008943184056813781, "loss": 2.7613, "step": 6250 }, { "crossentropy": 2.640866994857788, "epoch": 0.22661687935034802, "grad_norm": 0.03511964902281761, "grad_norm_var": 1.1283729111304039e-05, "learning_rate": 0.0089428267429861, "loss": 2.6741, "step": 6251 }, { "crossentropy": 2.7345221042633057, "epoch": 0.22665313225058004, "grad_norm": 0.0330791249871254, "grad_norm_var": 1.0938599292885839e-05, "learning_rate": 0.008942469375904562, "loss": 2.7627, "step": 6252 }, { "crossentropy": 2.6703033447265625, "epoch": 0.22668938515081208, "grad_norm": 0.03514402359724045, "grad_norm_var": 1.0694941838865212e-05, "learning_rate": 0.008942111955573992, "loss": 2.6046, "step": 6253 }, { "crossentropy": 2.79311466217041, "epoch": 0.2267256380510441, "grad_norm": 0.03474269434809685, "grad_norm_var": 1.0861049603124762e-05, "learning_rate": 0.008941754481999218, "loss": 2.6755, "step": 6254 }, { "crossentropy": 2.553471803665161, "epoch": 0.2267618909512761, "grad_norm": 0.034340277314186096, "grad_norm_var": 1.09943085329508e-05, "learning_rate": 0.008941396955185069, "loss": 2.6331, "step": 6255 }, { "crossentropy": 2.8529975414276123, "epoch": 0.22679814385150812, "grad_norm": 0.1408032327890396, "grad_norm_var": 0.0006909272715802319, "learning_rate": 0.008941039375136371, "loss": 2.6446, "step": 6256 }, { "crossentropy": 2.879267454147339, "epoch": 0.22683439675174014, "grad_norm": 0.03525540232658386, "grad_norm_var": 0.0006876714496014273, "learning_rate": 0.008940681741857956, "loss": 2.7659, "step": 6257 }, { "crossentropy": 2.63969087600708, "epoch": 0.22687064965197215, "grad_norm": 0.03930263593792915, "grad_norm_var": 0.0006812103558117542, "learning_rate": 0.008940324055354654, "loss": 2.6021, "step": 6258 }, { "crossentropy": 2.664247989654541, "epoch": 0.22690690255220416, "grad_norm": 0.03237399831414223, "grad_norm_var": 0.000688395192130788, "learning_rate": 0.008939966315631298, "loss": 2.6708, "step": 6259 }, { "crossentropy": 2.7164390087127686, "epoch": 0.2269431554524362, "grad_norm": 0.03938239812850952, "grad_norm_var": 0.0006892576705222799, "learning_rate": 0.008939608522692715, "loss": 2.8178, "step": 6260 }, { "crossentropy": 2.863837480545044, "epoch": 0.22697940835266822, "grad_norm": 0.04308020696043968, "grad_norm_var": 0.0006862225172839573, "learning_rate": 0.00893925067654374, "loss": 2.7659, "step": 6261 }, { "crossentropy": 2.471811056137085, "epoch": 0.22701566125290024, "grad_norm": 0.04446793347597122, "grad_norm_var": 0.000680439227324713, "learning_rate": 0.00893889277718921, "loss": 2.5721, "step": 6262 }, { "crossentropy": 2.676687002182007, "epoch": 0.22705191415313225, "grad_norm": 0.04143089801073074, "grad_norm_var": 0.000676193342741839, "learning_rate": 0.008938534824633953, "loss": 2.6206, "step": 6263 }, { "crossentropy": 2.7179906368255615, "epoch": 0.22708816705336426, "grad_norm": 0.03738822788000107, "grad_norm_var": 0.0006767145328322316, "learning_rate": 0.008938176818882806, "loss": 2.7432, "step": 6264 }, { "crossentropy": 2.673635482788086, "epoch": 0.22712441995359628, "grad_norm": 0.0339682437479496, "grad_norm_var": 0.0006825355930031126, "learning_rate": 0.008937818759940605, "loss": 2.6724, "step": 6265 }, { "crossentropy": 2.6211884021759033, "epoch": 0.2271606728538283, "grad_norm": 0.034059975296258926, "grad_norm_var": 0.000688396140788143, "learning_rate": 0.008937460647812187, "loss": 2.6802, "step": 6266 }, { "crossentropy": 2.7344985008239746, "epoch": 0.22719692575406034, "grad_norm": 0.03332676738500595, "grad_norm_var": 0.0006905695787361923, "learning_rate": 0.008937102482502386, "loss": 2.6853, "step": 6267 }, { "crossentropy": 2.5241732597351074, "epoch": 0.22723317865429235, "grad_norm": 0.030833451077342033, "grad_norm_var": 0.0006939328982583109, "learning_rate": 0.008936744264016042, "loss": 2.6515, "step": 6268 }, { "crossentropy": 2.8116366863250732, "epoch": 0.22726943155452436, "grad_norm": 0.03080214001238346, "grad_norm_var": 0.0006997278697438265, "learning_rate": 0.008936385992357992, "loss": 2.7242, "step": 6269 }, { "crossentropy": 2.831676959991455, "epoch": 0.22730568445475638, "grad_norm": 0.032816819846630096, "grad_norm_var": 0.000702040835907428, "learning_rate": 0.008936027667533077, "loss": 2.7837, "step": 6270 }, { "crossentropy": 2.5720365047454834, "epoch": 0.2273419373549884, "grad_norm": 0.032580725848674774, "grad_norm_var": 0.000704201928857207, "learning_rate": 0.008935669289546134, "loss": 2.675, "step": 6271 }, { "crossentropy": 2.7377736568450928, "epoch": 0.2273781902552204, "grad_norm": 0.033296458423137665, "grad_norm_var": 1.913484435122157e-05, "learning_rate": 0.008935310858402004, "loss": 2.7145, "step": 6272 }, { "crossentropy": 2.848200798034668, "epoch": 0.22741444315545242, "grad_norm": 0.03323270380496979, "grad_norm_var": 1.9563826417944886e-05, "learning_rate": 0.00893495237410553, "loss": 2.7546, "step": 6273 }, { "crossentropy": 2.7605302333831787, "epoch": 0.22745069605568446, "grad_norm": 0.035928256809711456, "grad_norm_var": 1.868674802624713e-05, "learning_rate": 0.008934593836661554, "loss": 2.7052, "step": 6274 }, { "crossentropy": 2.7252354621887207, "epoch": 0.22748694895591648, "grad_norm": 0.0372365266084671, "grad_norm_var": 1.8098533254806182e-05, "learning_rate": 0.008934235246074914, "loss": 2.596, "step": 6275 }, { "crossentropy": 2.6960811614990234, "epoch": 0.2275232018561485, "grad_norm": 0.032234784215688705, "grad_norm_var": 1.7938931062990578e-05, "learning_rate": 0.00893387660235046, "loss": 2.7001, "step": 6276 }, { "crossentropy": 2.696493148803711, "epoch": 0.2275594547563805, "grad_norm": 0.03356806933879852, "grad_norm_var": 1.3875808810801838e-05, "learning_rate": 0.00893351790549303, "loss": 2.5892, "step": 6277 }, { "crossentropy": 2.5972659587860107, "epoch": 0.22759570765661252, "grad_norm": 0.03369415923953056, "grad_norm_var": 7.275826400977847e-06, "learning_rate": 0.008933159155507471, "loss": 2.6848, "step": 6278 }, { "crossentropy": 2.7060084342956543, "epoch": 0.22763196055684454, "grad_norm": 0.03403269872069359, "grad_norm_var": 3.5144791732688933e-06, "learning_rate": 0.00893280035239863, "loss": 2.6756, "step": 6279 }, { "crossentropy": 2.650111198425293, "epoch": 0.22766821345707658, "grad_norm": 0.032893046736717224, "grad_norm_var": 2.5593363692616793e-06, "learning_rate": 0.008932441496171353, "loss": 2.5605, "step": 6280 }, { "crossentropy": 2.656968116760254, "epoch": 0.2277044663573086, "grad_norm": 0.036609407514333725, "grad_norm_var": 3.1931232567412754e-06, "learning_rate": 0.008932082586830484, "loss": 2.6387, "step": 6281 }, { "crossentropy": 2.7245235443115234, "epoch": 0.2277407192575406, "grad_norm": 0.0435294546186924, "grad_norm_var": 9.41415362909753e-06, "learning_rate": 0.008931723624380874, "loss": 2.7568, "step": 6282 }, { "crossentropy": 2.702707052230835, "epoch": 0.22777697215777262, "grad_norm": 0.04859781637787819, "grad_norm_var": 2.228582500026453e-05, "learning_rate": 0.008931364608827368, "loss": 2.6476, "step": 6283 }, { "crossentropy": 2.5619990825653076, "epoch": 0.22781322505800464, "grad_norm": 0.043841201812028885, "grad_norm_var": 2.5430104962213687e-05, "learning_rate": 0.008931005540174819, "loss": 2.6372, "step": 6284 }, { "crossentropy": 2.733602523803711, "epoch": 0.22784947795823665, "grad_norm": 0.040533196181058884, "grad_norm_var": 2.469402285951878e-05, "learning_rate": 0.008930646418428074, "loss": 2.7209, "step": 6285 }, { "crossentropy": 2.7973639965057373, "epoch": 0.22788573085846867, "grad_norm": 0.0446297787129879, "grad_norm_var": 2.7552855667757094e-05, "learning_rate": 0.008930287243591984, "loss": 2.7267, "step": 6286 }, { "crossentropy": 2.6389787197113037, "epoch": 0.2279219837587007, "grad_norm": 0.04146210476756096, "grad_norm_var": 2.6921069194397555e-05, "learning_rate": 0.0089299280156714, "loss": 2.7016, "step": 6287 }, { "crossentropy": 2.6082913875579834, "epoch": 0.22795823665893272, "grad_norm": 0.038298726081848145, "grad_norm_var": 2.5459601603274272e-05, "learning_rate": 0.008929568734671175, "loss": 2.6338, "step": 6288 }, { "crossentropy": 2.767451524734497, "epoch": 0.22799448955916474, "grad_norm": 0.03375796228647232, "grad_norm_var": 2.513280664613408e-05, "learning_rate": 0.008929209400596159, "loss": 2.7144, "step": 6289 }, { "crossentropy": 2.674394369125366, "epoch": 0.22803074245939675, "grad_norm": 0.03335488960146904, "grad_norm_var": 2.6318599997713203e-05, "learning_rate": 0.008928850013451211, "loss": 2.6474, "step": 6290 }, { "crossentropy": 2.6753342151641846, "epoch": 0.22806699535962877, "grad_norm": 0.03253715857863426, "grad_norm_var": 2.8187956018066452e-05, "learning_rate": 0.008928490573241177, "loss": 2.6182, "step": 6291 }, { "crossentropy": 2.6422340869903564, "epoch": 0.22810324825986078, "grad_norm": 0.031808316707611084, "grad_norm_var": 2.8511418877407283e-05, "learning_rate": 0.00892813107997092, "loss": 2.6683, "step": 6292 }, { "crossentropy": 2.6739654541015625, "epoch": 0.2281395011600928, "grad_norm": 0.030599595978856087, "grad_norm_var": 3.0696275333799114e-05, "learning_rate": 0.008927771533645289, "loss": 2.6228, "step": 6293 }, { "crossentropy": 2.6502468585968018, "epoch": 0.22817575406032484, "grad_norm": 0.03227326273918152, "grad_norm_var": 3.1545612544297515e-05, "learning_rate": 0.008927411934269144, "loss": 2.6535, "step": 6294 }, { "crossentropy": 2.7064428329467773, "epoch": 0.22821200696055685, "grad_norm": 0.03706604242324829, "grad_norm_var": 3.0749729887382284e-05, "learning_rate": 0.00892705228184734, "loss": 2.741, "step": 6295 }, { "crossentropy": 2.7936174869537354, "epoch": 0.22824825986078887, "grad_norm": 0.03735477849841118, "grad_norm_var": 2.918662815671959e-05, "learning_rate": 0.008926692576384737, "loss": 2.7785, "step": 6296 }, { "crossentropy": 2.7269647121429443, "epoch": 0.22828451276102088, "grad_norm": 0.03554344177246094, "grad_norm_var": 2.943977650561081e-05, "learning_rate": 0.00892633281788619, "loss": 2.683, "step": 6297 }, { "crossentropy": 2.4592933654785156, "epoch": 0.2283207656612529, "grad_norm": 0.03389010205864906, "grad_norm_var": 2.791447728201685e-05, "learning_rate": 0.00892597300635656, "loss": 2.476, "step": 6298 }, { "crossentropy": 2.817837715148926, "epoch": 0.2283570185614849, "grad_norm": 0.03589438647031784, "grad_norm_var": 1.873191273510075e-05, "learning_rate": 0.008925613141800707, "loss": 2.7278, "step": 6299 }, { "crossentropy": 2.6023166179656982, "epoch": 0.22839327146171692, "grad_norm": 0.038586344569921494, "grad_norm_var": 1.526358193324136e-05, "learning_rate": 0.008925253224223492, "loss": 2.6157, "step": 6300 }, { "crossentropy": 2.734956741333008, "epoch": 0.22842952436194897, "grad_norm": 0.03849354758858681, "grad_norm_var": 1.4317802200575431e-05, "learning_rate": 0.008924893253629775, "loss": 2.6963, "step": 6301 }, { "crossentropy": 2.7005693912506104, "epoch": 0.22846577726218098, "grad_norm": 0.036275845021009445, "grad_norm_var": 9.03592195479288e-06, "learning_rate": 0.008924533230024418, "loss": 2.6988, "step": 6302 }, { "crossentropy": 2.595181703567505, "epoch": 0.228502030162413, "grad_norm": 0.0349595844745636, "grad_norm_var": 6.46589461113244e-06, "learning_rate": 0.008924173153412283, "loss": 2.5914, "step": 6303 }, { "crossentropy": 2.654820203781128, "epoch": 0.228538283062645, "grad_norm": 0.03515240177512169, "grad_norm_var": 5.7189519727022e-06, "learning_rate": 0.008923813023798237, "loss": 2.6491, "step": 6304 }, { "crossentropy": 2.6393280029296875, "epoch": 0.22857453596287702, "grad_norm": 0.03664631024003029, "grad_norm_var": 5.821063393486346e-06, "learning_rate": 0.00892345284118714, "loss": 2.65, "step": 6305 }, { "crossentropy": 2.6922030448913574, "epoch": 0.22861078886310904, "grad_norm": 0.032886575907468796, "grad_norm_var": 5.939196020691738e-06, "learning_rate": 0.008923092605583858, "loss": 2.6523, "step": 6306 }, { "crossentropy": 2.6292831897735596, "epoch": 0.22864704176334108, "grad_norm": 0.035499949008226395, "grad_norm_var": 5.51570884663118e-06, "learning_rate": 0.008922732316993255, "loss": 2.6401, "step": 6307 }, { "crossentropy": 2.6739449501037598, "epoch": 0.2286832946635731, "grad_norm": 0.032223835587501526, "grad_norm_var": 5.339525287788406e-06, "learning_rate": 0.008922371975420203, "loss": 2.6894, "step": 6308 }, { "crossentropy": 2.4710779190063477, "epoch": 0.2287195475638051, "grad_norm": 0.03253896161913872, "grad_norm_var": 4.382654787361365e-06, "learning_rate": 0.008922011580869565, "loss": 2.5879, "step": 6309 }, { "crossentropy": 2.7073190212249756, "epoch": 0.22875580046403712, "grad_norm": 0.032228488475084305, "grad_norm_var": 4.401030508357068e-06, "learning_rate": 0.008921651133346207, "loss": 2.6826, "step": 6310 }, { "crossentropy": 2.6854121685028076, "epoch": 0.22879205336426914, "grad_norm": 0.03755318745970726, "grad_norm_var": 4.528782958805137e-06, "learning_rate": 0.008921290632854998, "loss": 2.6681, "step": 6311 }, { "crossentropy": 2.5523531436920166, "epoch": 0.22882830626450115, "grad_norm": 0.04090997204184532, "grad_norm_var": 6.265277753535655e-06, "learning_rate": 0.00892093007940081, "loss": 2.6082, "step": 6312 }, { "crossentropy": 2.5720345973968506, "epoch": 0.22886455916473317, "grad_norm": 0.036282576620578766, "grad_norm_var": 6.29580184016316e-06, "learning_rate": 0.008920569472988511, "loss": 2.6519, "step": 6313 }, { "crossentropy": 2.6726717948913574, "epoch": 0.2289008120649652, "grad_norm": 0.03470085561275482, "grad_norm_var": 6.149192029632616e-06, "learning_rate": 0.008920208813622972, "loss": 2.6807, "step": 6314 }, { "crossentropy": 2.5971860885620117, "epoch": 0.22893706496519722, "grad_norm": 0.03300296887755394, "grad_norm_var": 6.587922986235848e-06, "learning_rate": 0.008919848101309061, "loss": 2.599, "step": 6315 }, { "crossentropy": 2.7790310382843018, "epoch": 0.22897331786542924, "grad_norm": 0.039625681936740875, "grad_norm_var": 7.0836448039739035e-06, "learning_rate": 0.008919487336051656, "loss": 2.7073, "step": 6316 }, { "crossentropy": 2.9500346183776855, "epoch": 0.22900957076566125, "grad_norm": 0.04343597963452339, "grad_norm_var": 1.0542698933858376e-05, "learning_rate": 0.008919126517855624, "loss": 2.8706, "step": 6317 }, { "crossentropy": 2.6021625995635986, "epoch": 0.22904582366589327, "grad_norm": 0.043946173042058945, "grad_norm_var": 1.463467870684709e-05, "learning_rate": 0.008918765646725843, "loss": 2.6644, "step": 6318 }, { "crossentropy": 2.649505376815796, "epoch": 0.22908207656612528, "grad_norm": 0.04360917955636978, "grad_norm_var": 1.7707577996919852e-05, "learning_rate": 0.008918404722667184, "loss": 2.6516, "step": 6319 }, { "crossentropy": 2.647665500640869, "epoch": 0.2291183294663573, "grad_norm": 0.03797699883580208, "grad_norm_var": 1.755174989570233e-05, "learning_rate": 0.008918043745684523, "loss": 2.6354, "step": 6320 }, { "crossentropy": 2.8325679302215576, "epoch": 0.22915458236658934, "grad_norm": 0.03630360588431358, "grad_norm_var": 1.7578300953595358e-05, "learning_rate": 0.008917682715782735, "loss": 2.7429, "step": 6321 }, { "crossentropy": 2.6312148571014404, "epoch": 0.22919083526682135, "grad_norm": 0.03907310962677002, "grad_norm_var": 1.6539954542336512e-05, "learning_rate": 0.008917321632966697, "loss": 2.6583, "step": 6322 }, { "crossentropy": 2.6416451930999756, "epoch": 0.22922708816705337, "grad_norm": 0.03722655028104782, "grad_norm_var": 1.6281499164126112e-05, "learning_rate": 0.008916960497241286, "loss": 2.6267, "step": 6323 }, { "crossentropy": 2.615231513977051, "epoch": 0.22926334106728538, "grad_norm": 0.03523499146103859, "grad_norm_var": 1.4713864149336574e-05, "learning_rate": 0.00891659930861138, "loss": 2.6316, "step": 6324 }, { "crossentropy": 2.715313673019409, "epoch": 0.2292995939675174, "grad_norm": 0.03746504336595535, "grad_norm_var": 1.2822237183991667e-05, "learning_rate": 0.008916238067081854, "loss": 2.6992, "step": 6325 }, { "crossentropy": 2.8344180583953857, "epoch": 0.2293358468677494, "grad_norm": 0.039023347198963165, "grad_norm_var": 1.0446408790461501e-05, "learning_rate": 0.008915876772657592, "loss": 2.7694, "step": 6326 }, { "crossentropy": 2.637035608291626, "epoch": 0.22937209976798145, "grad_norm": 0.03411298245191574, "grad_norm_var": 1.1602339450699344e-05, "learning_rate": 0.008915515425343473, "loss": 2.6226, "step": 6327 }, { "crossentropy": 2.6617441177368164, "epoch": 0.22940835266821347, "grad_norm": 0.034075941890478134, "grad_norm_var": 1.2093574657463282e-05, "learning_rate": 0.008915154025144372, "loss": 2.6711, "step": 6328 }, { "crossentropy": 2.636997699737549, "epoch": 0.22944460556844548, "grad_norm": 0.03204616904258728, "grad_norm_var": 1.4082843988424907e-05, "learning_rate": 0.008914792572065178, "loss": 2.5538, "step": 6329 }, { "crossentropy": 2.689134120941162, "epoch": 0.2294808584686775, "grad_norm": 0.03143547475337982, "grad_norm_var": 1.5991356930775077e-05, "learning_rate": 0.008914431066110768, "loss": 2.7276, "step": 6330 }, { "crossentropy": 2.753582000732422, "epoch": 0.2295171113689095, "grad_norm": 0.032210543751716614, "grad_norm_var": 1.6489857605943754e-05, "learning_rate": 0.008914069507286026, "loss": 2.6978, "step": 6331 }, { "crossentropy": 2.463233709335327, "epoch": 0.22955336426914152, "grad_norm": 0.03471232205629349, "grad_norm_var": 1.6475160961597807e-05, "learning_rate": 0.008913707895595835, "loss": 2.5764, "step": 6332 }, { "crossentropy": 2.83394455909729, "epoch": 0.22958961716937354, "grad_norm": 0.0340723842382431, "grad_norm_var": 1.3911073337315114e-05, "learning_rate": 0.008913346231045081, "loss": 2.826, "step": 6333 }, { "crossentropy": 2.6165030002593994, "epoch": 0.22962587006960558, "grad_norm": 0.03387788310647011, "grad_norm_var": 1.0126925298831595e-05, "learning_rate": 0.008912984513638646, "loss": 2.7199, "step": 6334 }, { "crossentropy": 2.713719606399536, "epoch": 0.2296621229698376, "grad_norm": 0.033670566976070404, "grad_norm_var": 5.9236581919271e-06, "learning_rate": 0.008912622743381417, "loss": 2.6235, "step": 6335 }, { "crossentropy": 2.6729109287261963, "epoch": 0.2296983758700696, "grad_norm": 0.03214505687355995, "grad_norm_var": 5.856858072382452e-06, "learning_rate": 0.00891226092027828, "loss": 2.6812, "step": 6336 }, { "crossentropy": 2.4935801029205322, "epoch": 0.22973462877030162, "grad_norm": 0.03285611793398857, "grad_norm_var": 5.9052504158440525e-06, "learning_rate": 0.008911899044334124, "loss": 2.5965, "step": 6337 }, { "crossentropy": 2.684952974319458, "epoch": 0.22977088167053364, "grad_norm": 0.0336003340780735, "grad_norm_var": 4.496674176423844e-06, "learning_rate": 0.008911537115553834, "loss": 2.6502, "step": 6338 }, { "crossentropy": 2.824524402618408, "epoch": 0.22980713457076565, "grad_norm": 0.049288928508758545, "grad_norm_var": 1.840127225796277e-05, "learning_rate": 0.0089111751339423, "loss": 2.7643, "step": 6339 }, { "crossentropy": 2.7272098064422607, "epoch": 0.22984338747099767, "grad_norm": 0.033328622579574585, "grad_norm_var": 1.8565950612287457e-05, "learning_rate": 0.008910813099504408, "loss": 2.6825, "step": 6340 }, { "crossentropy": 2.450331211090088, "epoch": 0.2298796403712297, "grad_norm": 0.03346012160181999, "grad_norm_var": 1.8182744285012246e-05, "learning_rate": 0.008910451012245054, "loss": 2.5932, "step": 6341 }, { "crossentropy": 2.6164987087249756, "epoch": 0.22991589327146172, "grad_norm": 0.03220488876104355, "grad_norm_var": 1.70850679430438e-05, "learning_rate": 0.008910088872169122, "loss": 2.6865, "step": 6342 }, { "crossentropy": 2.627650022506714, "epoch": 0.22995214617169374, "grad_norm": 0.030340250581502914, "grad_norm_var": 1.8015238393773112e-05, "learning_rate": 0.008909726679281508, "loss": 2.7251, "step": 6343 }, { "crossentropy": 2.686929702758789, "epoch": 0.22998839907192575, "grad_norm": 0.033044371753931046, "grad_norm_var": 1.8065504335143015e-05, "learning_rate": 0.0089093644335871, "loss": 2.7387, "step": 6344 }, { "crossentropy": 2.733046770095825, "epoch": 0.23002465197215777, "grad_norm": 0.033030882477760315, "grad_norm_var": 1.7883578682661937e-05, "learning_rate": 0.008909002135090794, "loss": 2.6239, "step": 6345 }, { "crossentropy": 2.7479043006896973, "epoch": 0.23006090487238978, "grad_norm": 0.033618759363889694, "grad_norm_var": 1.7448076729048347e-05, "learning_rate": 0.008908639783797481, "loss": 2.7686, "step": 6346 }, { "crossentropy": 2.617668628692627, "epoch": 0.2300971577726218, "grad_norm": 0.03767262399196625, "grad_norm_var": 1.7942953360438932e-05, "learning_rate": 0.008908277379712056, "loss": 2.6492, "step": 6347 }, { "crossentropy": 2.680756092071533, "epoch": 0.23013341067285384, "grad_norm": 0.038938332349061966, "grad_norm_var": 1.9216676920125127e-05, "learning_rate": 0.008907914922839414, "loss": 2.703, "step": 6348 }, { "crossentropy": 2.520472288131714, "epoch": 0.23016966357308585, "grad_norm": 0.03443632647395134, "grad_norm_var": 1.9194651103883926e-05, "learning_rate": 0.008907552413184451, "loss": 2.6659, "step": 6349 }, { "crossentropy": 2.5975594520568848, "epoch": 0.23020591647331787, "grad_norm": 0.03107505850493908, "grad_norm_var": 2.0000209105898085e-05, "learning_rate": 0.008907189850752062, "loss": 2.646, "step": 6350 }, { "crossentropy": 2.9114418029785156, "epoch": 0.23024216937354988, "grad_norm": 0.032245881855487823, "grad_norm_var": 2.0293068680463433e-05, "learning_rate": 0.008906827235547145, "loss": 2.8282, "step": 6351 }, { "crossentropy": 2.6738779544830322, "epoch": 0.2302784222737819, "grad_norm": 0.03171008080244064, "grad_norm_var": 2.0438887043602152e-05, "learning_rate": 0.0089064645675746, "loss": 2.6726, "step": 6352 }, { "crossentropy": 2.542820453643799, "epoch": 0.2303146751740139, "grad_norm": 0.033227551728487015, "grad_norm_var": 2.0369651966261808e-05, "learning_rate": 0.008906101846839319, "loss": 2.5842, "step": 6353 }, { "crossentropy": 2.795196771621704, "epoch": 0.23035092807424595, "grad_norm": 0.0351356603205204, "grad_norm_var": 2.0342748913491554e-05, "learning_rate": 0.008905739073346207, "loss": 2.7215, "step": 6354 }, { "crossentropy": 2.6789615154266357, "epoch": 0.23038718097447797, "grad_norm": 0.03366338461637497, "grad_norm_var": 4.890005919782461e-06, "learning_rate": 0.008905376247100161, "loss": 2.6703, "step": 6355 }, { "crossentropy": 2.8060340881347656, "epoch": 0.23042343387470998, "grad_norm": 0.035837672650814056, "grad_norm_var": 5.202446196280241e-06, "learning_rate": 0.008905013368106082, "loss": 2.7802, "step": 6356 }, { "crossentropy": 2.712338924407959, "epoch": 0.230459686774942, "grad_norm": 0.03272475302219391, "grad_norm_var": 5.262471677235194e-06, "learning_rate": 0.00890465043636887, "loss": 2.6704, "step": 6357 }, { "crossentropy": 2.5549302101135254, "epoch": 0.230495939675174, "grad_norm": 0.0335656963288784, "grad_norm_var": 5.110263056463703e-06, "learning_rate": 0.008904287451893431, "loss": 2.5964, "step": 6358 }, { "crossentropy": 2.4864959716796875, "epoch": 0.23053219257540603, "grad_norm": 0.03218141198158264, "grad_norm_var": 4.480976076182821e-06, "learning_rate": 0.008903924414684664, "loss": 2.5869, "step": 6359 }, { "crossentropy": 2.6918575763702393, "epoch": 0.23056844547563804, "grad_norm": 0.03887192904949188, "grad_norm_var": 5.952831516969889e-06, "learning_rate": 0.008903561324747473, "loss": 2.6708, "step": 6360 }, { "crossentropy": 2.6376636028289795, "epoch": 0.23060469837587008, "grad_norm": 0.03319356590509415, "grad_norm_var": 5.928128363667434e-06, "learning_rate": 0.008903198182086762, "loss": 2.6389, "step": 6361 }, { "crossentropy": 2.6967220306396484, "epoch": 0.2306409512761021, "grad_norm": 0.03642549738287926, "grad_norm_var": 6.181950945415811e-06, "learning_rate": 0.008902834986707438, "loss": 2.6616, "step": 6362 }, { "crossentropy": 2.728823661804199, "epoch": 0.2306772041763341, "grad_norm": 0.040311794728040695, "grad_norm_var": 7.757763226827923e-06, "learning_rate": 0.008902471738614404, "loss": 2.7237, "step": 6363 }, { "crossentropy": 2.633118152618408, "epoch": 0.23071345707656613, "grad_norm": 0.04013452306389809, "grad_norm_var": 8.539674721783291e-06, "learning_rate": 0.008902108437812566, "loss": 2.7228, "step": 6364 }, { "crossentropy": 2.8228933811187744, "epoch": 0.23074970997679814, "grad_norm": 0.033422283828258514, "grad_norm_var": 8.635712036144738e-06, "learning_rate": 0.008901745084306833, "loss": 2.6774, "step": 6365 }, { "crossentropy": 2.7037291526794434, "epoch": 0.23078596287703015, "grad_norm": 0.03209025785326958, "grad_norm_var": 8.221918350435376e-06, "learning_rate": 0.00890138167810211, "loss": 2.6332, "step": 6366 }, { "crossentropy": 2.582763671875, "epoch": 0.23082221577726217, "grad_norm": 0.0340256430208683, "grad_norm_var": 7.844317893209116e-06, "learning_rate": 0.008901018219203308, "loss": 2.7581, "step": 6367 }, { "crossentropy": 2.727860689163208, "epoch": 0.2308584686774942, "grad_norm": 0.036746796220541, "grad_norm_var": 7.3664575112772706e-06, "learning_rate": 0.008900654707615335, "loss": 2.6713, "step": 6368 }, { "crossentropy": 2.7378289699554443, "epoch": 0.23089472157772623, "grad_norm": 0.03790263086557388, "grad_norm_var": 7.5669209886247705e-06, "learning_rate": 0.0089002911433431, "loss": 2.6849, "step": 6369 }, { "crossentropy": 2.7017550468444824, "epoch": 0.23093097447795824, "grad_norm": 0.04703100770711899, "grad_norm_var": 1.6007876315121624e-05, "learning_rate": 0.008899927526391516, "loss": 2.738, "step": 6370 }, { "crossentropy": 2.6708755493164062, "epoch": 0.23096722737819025, "grad_norm": 0.04332863911986351, "grad_norm_var": 1.8663784706657072e-05, "learning_rate": 0.008899563856765491, "loss": 2.6985, "step": 6371 }, { "crossentropy": 2.5411391258239746, "epoch": 0.23100348027842227, "grad_norm": 0.04329444095492363, "grad_norm_var": 2.1244722717004595e-05, "learning_rate": 0.008899200134469938, "loss": 2.5546, "step": 6372 }, { "crossentropy": 2.5920379161834717, "epoch": 0.23103973317865428, "grad_norm": 0.037657663226127625, "grad_norm_var": 1.982001619329019e-05, "learning_rate": 0.00889883635950977, "loss": 2.5608, "step": 6373 }, { "crossentropy": 2.6818063259124756, "epoch": 0.2310759860788863, "grad_norm": 0.03361058607697487, "grad_norm_var": 1.9796525400507004e-05, "learning_rate": 0.008898472531889901, "loss": 2.741, "step": 6374 }, { "crossentropy": 2.6115474700927734, "epoch": 0.23111223897911834, "grad_norm": 0.032633885741233826, "grad_norm_var": 1.9487589416224082e-05, "learning_rate": 0.008898108651615244, "loss": 2.55, "step": 6375 }, { "crossentropy": 2.6758663654327393, "epoch": 0.23114849187935035, "grad_norm": 0.0328841507434845, "grad_norm_var": 2.066711274570452e-05, "learning_rate": 0.008897744718690715, "loss": 2.7168, "step": 6376 }, { "crossentropy": 2.6550803184509277, "epoch": 0.23118474477958237, "grad_norm": 0.032463788986206055, "grad_norm_var": 2.1087157980933097e-05, "learning_rate": 0.008897380733121227, "loss": 2.782, "step": 6377 }, { "crossentropy": 2.576754093170166, "epoch": 0.23122099767981438, "grad_norm": 0.03205756098031998, "grad_norm_var": 2.2685646327448803e-05, "learning_rate": 0.008897016694911698, "loss": 2.6685, "step": 6378 }, { "crossentropy": 2.7102625370025635, "epoch": 0.2312572505800464, "grad_norm": 0.03666727617383003, "grad_norm_var": 2.1833461510929278e-05, "learning_rate": 0.008896652604067046, "loss": 2.6455, "step": 6379 }, { "crossentropy": 2.828338623046875, "epoch": 0.2312935034802784, "grad_norm": 0.03990037366747856, "grad_norm_var": 2.1727225756112735e-05, "learning_rate": 0.008896288460592186, "loss": 2.8187, "step": 6380 }, { "crossentropy": 2.7554993629455566, "epoch": 0.23132975638051045, "grad_norm": 0.040790919214487076, "grad_norm_var": 2.199153388604356e-05, "learning_rate": 0.008895924264492037, "loss": 2.6748, "step": 6381 }, { "crossentropy": 2.6833536624908447, "epoch": 0.23136600928074247, "grad_norm": 0.03299807384610176, "grad_norm_var": 2.144054348031891e-05, "learning_rate": 0.008895560015771518, "loss": 2.6838, "step": 6382 }, { "crossentropy": 2.75334095954895, "epoch": 0.23140226218097448, "grad_norm": 0.03248412534594536, "grad_norm_var": 2.2226004937916412e-05, "learning_rate": 0.00889519571443555, "loss": 2.6484, "step": 6383 }, { "crossentropy": 2.5115625858306885, "epoch": 0.2314385150812065, "grad_norm": 0.030263541266322136, "grad_norm_var": 2.5096335944469713e-05, "learning_rate": 0.008894831360489054, "loss": 2.5931, "step": 6384 }, { "crossentropy": 2.741511583328247, "epoch": 0.2314747679814385, "grad_norm": 0.03108282759785652, "grad_norm_var": 2.6839653529619786e-05, "learning_rate": 0.008894466953936949, "loss": 2.7107, "step": 6385 }, { "crossentropy": 2.682140350341797, "epoch": 0.23151102088167053, "grad_norm": 0.031093625351786613, "grad_norm_var": 1.9692149922937116e-05, "learning_rate": 0.008894102494784157, "loss": 2.7774, "step": 6386 }, { "crossentropy": 2.6001241207122803, "epoch": 0.23154727378190254, "grad_norm": 0.03201109170913696, "grad_norm_var": 1.543249461960396e-05, "learning_rate": 0.008893737983035603, "loss": 2.6789, "step": 6387 }, { "crossentropy": 2.4457876682281494, "epoch": 0.23158352668213458, "grad_norm": 0.03158371523022652, "grad_norm_var": 1.0261557172474139e-05, "learning_rate": 0.008893373418696207, "loss": 2.5465, "step": 6388 }, { "crossentropy": 2.542992115020752, "epoch": 0.2316197795823666, "grad_norm": 0.03209336847066879, "grad_norm_var": 9.306019576769907e-06, "learning_rate": 0.008893008801770897, "loss": 2.6005, "step": 6389 }, { "crossentropy": 2.5871171951293945, "epoch": 0.2316560324825986, "grad_norm": 0.03458039462566376, "grad_norm_var": 9.390263862786343e-06, "learning_rate": 0.008892644132264592, "loss": 2.6176, "step": 6390 }, { "crossentropy": 2.6808793544769287, "epoch": 0.23169228538283063, "grad_norm": 0.0352945514023304, "grad_norm_var": 9.534570505250323e-06, "learning_rate": 0.008892279410182223, "loss": 2.7766, "step": 6391 }, { "crossentropy": 2.542741060256958, "epoch": 0.23172853828306264, "grad_norm": 0.03762061148881912, "grad_norm_var": 1.0458988819992721e-05, "learning_rate": 0.008891914635528712, "loss": 2.6149, "step": 6392 }, { "crossentropy": 2.632991313934326, "epoch": 0.23176479118329466, "grad_norm": 0.03169971704483032, "grad_norm_var": 1.0645522729050596e-05, "learning_rate": 0.00889154980830899, "loss": 2.6634, "step": 6393 }, { "crossentropy": 2.5956037044525146, "epoch": 0.23180104408352667, "grad_norm": 0.03170888125896454, "grad_norm_var": 1.0738259609041102e-05, "learning_rate": 0.00889118492852798, "loss": 2.6807, "step": 6394 }, { "crossentropy": 2.8031041622161865, "epoch": 0.2318372969837587, "grad_norm": 0.03272026777267456, "grad_norm_var": 1.0238280145487047e-05, "learning_rate": 0.008890819996190614, "loss": 2.7204, "step": 6395 }, { "crossentropy": 2.68196702003479, "epoch": 0.23187354988399073, "grad_norm": 0.03210299462080002, "grad_norm_var": 7.5092266844748245e-06, "learning_rate": 0.008890455011301817, "loss": 2.774, "step": 6396 }, { "crossentropy": 2.6307930946350098, "epoch": 0.23190980278422274, "grad_norm": 0.03252682462334633, "grad_norm_var": 3.33962653377793e-06, "learning_rate": 0.008890089973866524, "loss": 2.6052, "step": 6397 }, { "crossentropy": 2.630887746810913, "epoch": 0.23194605568445475, "grad_norm": 0.033727265894412994, "grad_norm_var": 3.4099541307890114e-06, "learning_rate": 0.008889724883889661, "loss": 2.6509, "step": 6398 }, { "crossentropy": 2.651268482208252, "epoch": 0.23198230858468677, "grad_norm": 0.03444400802254677, "grad_norm_var": 3.6035141349398852e-06, "learning_rate": 0.008889359741376161, "loss": 2.6815, "step": 6399 }, { "crossentropy": 2.6337363719940186, "epoch": 0.23201856148491878, "grad_norm": 0.03604893013834953, "grad_norm_var": 3.7507228929686685e-06, "learning_rate": 0.008888994546330955, "loss": 2.604, "step": 6400 }, { "crossentropy": 2.736090660095215, "epoch": 0.2320548143851508, "grad_norm": 0.03609725087881088, "grad_norm_var": 3.9427059769031455e-06, "learning_rate": 0.008888629298758976, "loss": 2.7113, "step": 6401 }, { "crossentropy": 2.629019260406494, "epoch": 0.23209106728538284, "grad_norm": 0.03816506266593933, "grad_norm_var": 4.837259706798267e-06, "learning_rate": 0.008888263998665156, "loss": 2.7326, "step": 6402 }, { "crossentropy": 2.677569627761841, "epoch": 0.23212732018561485, "grad_norm": 0.037940844893455505, "grad_norm_var": 5.540216152953985e-06, "learning_rate": 0.008887898646054434, "loss": 2.7231, "step": 6403 }, { "crossentropy": 2.6273233890533447, "epoch": 0.23216357308584687, "grad_norm": 0.035977836698293686, "grad_norm_var": 5.171866678222297e-06, "learning_rate": 0.008887533240931736, "loss": 2.6868, "step": 6404 }, { "crossentropy": 2.7619192600250244, "epoch": 0.23219982598607888, "grad_norm": 0.03802524507045746, "grad_norm_var": 5.430603248479602e-06, "learning_rate": 0.008887167783302004, "loss": 2.7797, "step": 6405 }, { "crossentropy": 2.8296685218811035, "epoch": 0.2322360788863109, "grad_norm": 0.04008977860212326, "grad_norm_var": 7.080021297150609e-06, "learning_rate": 0.008886802273170173, "loss": 2.761, "step": 6406 }, { "crossentropy": 2.6486151218414307, "epoch": 0.2322723317865429, "grad_norm": 0.03950139135122299, "grad_norm_var": 8.20444128862008e-06, "learning_rate": 0.008886436710541177, "loss": 2.6404, "step": 6407 }, { "crossentropy": 2.513195276260376, "epoch": 0.23230858468677495, "grad_norm": 0.03640373423695564, "grad_norm_var": 7.956945761541876e-06, "learning_rate": 0.008886071095419957, "loss": 2.5765, "step": 6408 }, { "crossentropy": 2.698587656021118, "epoch": 0.23234483758700697, "grad_norm": 0.033596839755773544, "grad_norm_var": 7.233570647106196e-06, "learning_rate": 0.008885705427811448, "loss": 2.6648, "step": 6409 }, { "crossentropy": 2.645709753036499, "epoch": 0.23238109048723898, "grad_norm": 0.031360071152448654, "grad_norm_var": 7.420623357573021e-06, "learning_rate": 0.008885339707720593, "loss": 2.6431, "step": 6410 }, { "crossentropy": 2.4505021572113037, "epoch": 0.232417343387471, "grad_norm": 0.03075195476412773, "grad_norm_var": 8.404228891323307e-06, "learning_rate": 0.008884973935152326, "loss": 2.582, "step": 6411 }, { "crossentropy": 2.628106117248535, "epoch": 0.232453596287703, "grad_norm": 0.03224693983793259, "grad_norm_var": 8.3418136090474e-06, "learning_rate": 0.008884608110111592, "loss": 2.5658, "step": 6412 }, { "crossentropy": 2.7410848140716553, "epoch": 0.23248984918793503, "grad_norm": 0.03425562381744385, "grad_norm_var": 7.859063856690124e-06, "learning_rate": 0.00888424223260333, "loss": 2.7729, "step": 6413 }, { "crossentropy": 2.6221001148223877, "epoch": 0.23252610208816704, "grad_norm": 0.03678640350699425, "grad_norm_var": 7.704756076816014e-06, "learning_rate": 0.008883876302632482, "loss": 2.5972, "step": 6414 }, { "crossentropy": 2.6433064937591553, "epoch": 0.23256235498839908, "grad_norm": 0.03924514725804329, "grad_norm_var": 8.321732765130078e-06, "learning_rate": 0.00888351032020399, "loss": 2.6757, "step": 6415 }, { "crossentropy": 2.756669521331787, "epoch": 0.2325986078886311, "grad_norm": 0.037255238741636276, "grad_norm_var": 8.415595055607146e-06, "learning_rate": 0.0088831442853228, "loss": 2.6167, "step": 6416 }, { "crossentropy": 2.6403956413269043, "epoch": 0.2326348607888631, "grad_norm": 0.032235536724328995, "grad_norm_var": 9.352260431468896e-06, "learning_rate": 0.00888277819799385, "loss": 2.6658, "step": 6417 }, { "crossentropy": 2.5613338947296143, "epoch": 0.23267111368909513, "grad_norm": 0.03410690650343895, "grad_norm_var": 9.136935262066039e-06, "learning_rate": 0.008882412058222091, "loss": 2.6225, "step": 6418 }, { "crossentropy": 2.7349507808685303, "epoch": 0.23270736658932714, "grad_norm": 0.03416499122977257, "grad_norm_var": 8.855158315216177e-06, "learning_rate": 0.008882045866012465, "loss": 2.7598, "step": 6419 }, { "crossentropy": 2.8396494388580322, "epoch": 0.23274361948955916, "grad_norm": 0.03171825036406517, "grad_norm_var": 9.646914234843057e-06, "learning_rate": 0.008881679621369917, "loss": 2.741, "step": 6420 }, { "crossentropy": 2.6741771697998047, "epoch": 0.23277987238979117, "grad_norm": 0.031059054657816887, "grad_norm_var": 9.971222900720606e-06, "learning_rate": 0.008881313324299396, "loss": 2.7294, "step": 6421 }, { "crossentropy": 2.7732129096984863, "epoch": 0.2328161252900232, "grad_norm": 0.03520103916525841, "grad_norm_var": 7.934531301196376e-06, "learning_rate": 0.008880946974805848, "loss": 2.7779, "step": 6422 }, { "crossentropy": 2.6252593994140625, "epoch": 0.23285237819025523, "grad_norm": 0.03780466690659523, "grad_norm_var": 6.953150050461214e-06, "learning_rate": 0.008880580572894222, "loss": 2.6687, "step": 6423 }, { "crossentropy": 2.824155569076538, "epoch": 0.23288863109048724, "grad_norm": 0.03900009021162987, "grad_norm_var": 8.115885211748654e-06, "learning_rate": 0.008880214118569466, "loss": 2.7613, "step": 6424 }, { "crossentropy": 2.700749635696411, "epoch": 0.23292488399071926, "grad_norm": 0.03459464758634567, "grad_norm_var": 8.068025693623452e-06, "learning_rate": 0.00887984761183653, "loss": 2.6948, "step": 6425 }, { "crossentropy": 2.7827372550964355, "epoch": 0.23296113689095127, "grad_norm": 0.03134516626596451, "grad_norm_var": 8.074253105566627e-06, "learning_rate": 0.008879481052700365, "loss": 2.7176, "step": 6426 }, { "crossentropy": 2.592766046524048, "epoch": 0.23299738979118328, "grad_norm": 0.03239447996020317, "grad_norm_var": 7.425162156520397e-06, "learning_rate": 0.00887911444116592, "loss": 2.5988, "step": 6427 }, { "crossentropy": 2.7115180492401123, "epoch": 0.23303364269141533, "grad_norm": 0.040191713720560074, "grad_norm_var": 8.889824837873572e-06, "learning_rate": 0.008878747777238148, "loss": 2.632, "step": 6428 }, { "crossentropy": 2.7334132194519043, "epoch": 0.23306989559164734, "grad_norm": 0.04072308912873268, "grad_norm_var": 1.0788942901354435e-05, "learning_rate": 0.008878381060922001, "loss": 2.6925, "step": 6429 }, { "crossentropy": 2.573981761932373, "epoch": 0.23310614849187936, "grad_norm": 0.04201642423868179, "grad_norm_var": 1.3403133446986575e-05, "learning_rate": 0.008878014292222432, "loss": 2.656, "step": 6430 }, { "crossentropy": 2.7691304683685303, "epoch": 0.23314240139211137, "grad_norm": 0.042833346873521805, "grad_norm_var": 1.5848414057020852e-05, "learning_rate": 0.008877647471144395, "loss": 2.7057, "step": 6431 }, { "crossentropy": 2.5840823650360107, "epoch": 0.23317865429234338, "grad_norm": 0.038274750113487244, "grad_norm_var": 1.607853062693767e-05, "learning_rate": 0.008877280597692844, "loss": 2.6124, "step": 6432 }, { "crossentropy": 2.6208622455596924, "epoch": 0.2332149071925754, "grad_norm": 0.033355213701725006, "grad_norm_var": 1.5579360074281658e-05, "learning_rate": 0.008876913671872735, "loss": 2.6631, "step": 6433 }, { "crossentropy": 2.628056526184082, "epoch": 0.2332511600928074, "grad_norm": 0.03325597941875458, "grad_norm_var": 1.5859139792092136e-05, "learning_rate": 0.008876546693689024, "loss": 2.6527, "step": 6434 }, { "crossentropy": 2.59535551071167, "epoch": 0.23328741299303946, "grad_norm": 0.03416109085083008, "grad_norm_var": 1.5860157865646415e-05, "learning_rate": 0.008876179663146666, "loss": 2.6412, "step": 6435 }, { "crossentropy": 2.5331308841705322, "epoch": 0.23332366589327147, "grad_norm": 0.03499689698219299, "grad_norm_var": 1.460751956758135e-05, "learning_rate": 0.00887581258025062, "loss": 2.574, "step": 6436 }, { "crossentropy": 2.6225428581237793, "epoch": 0.23335991879350348, "grad_norm": 0.03404577448964119, "grad_norm_var": 1.3067806344316952e-05, "learning_rate": 0.008875445445005844, "loss": 2.6537, "step": 6437 }, { "crossentropy": 2.7041842937469482, "epoch": 0.2333961716937355, "grad_norm": 0.03519511967897415, "grad_norm_var": 1.3068843346518363e-05, "learning_rate": 0.008875078257417294, "loss": 2.7161, "step": 6438 }, { "crossentropy": 2.541555404663086, "epoch": 0.2334324245939675, "grad_norm": 0.03791842609643936, "grad_norm_var": 1.3089262565857175e-05, "learning_rate": 0.008874711017489932, "loss": 2.6602, "step": 6439 }, { "crossentropy": 2.6257503032684326, "epoch": 0.23346867749419953, "grad_norm": 0.03923574090003967, "grad_norm_var": 1.3170692867437751e-05, "learning_rate": 0.008874343725228719, "loss": 2.7363, "step": 6440 }, { "crossentropy": 2.6902337074279785, "epoch": 0.23350493039443154, "grad_norm": 0.03573448210954666, "grad_norm_var": 1.2957213832349007e-05, "learning_rate": 0.008873976380638613, "loss": 2.7299, "step": 6441 }, { "crossentropy": 2.5778896808624268, "epoch": 0.23354118329466358, "grad_norm": 0.03501730039715767, "grad_norm_var": 1.12247612857846e-05, "learning_rate": 0.008873608983724579, "loss": 2.6849, "step": 6442 }, { "crossentropy": 2.70173978805542, "epoch": 0.2335774361948956, "grad_norm": 0.03219108656048775, "grad_norm_var": 1.1347752596267227e-05, "learning_rate": 0.008873241534491576, "loss": 2.668, "step": 6443 }, { "crossentropy": 2.6016182899475098, "epoch": 0.2336136890951276, "grad_norm": 0.03788388893008232, "grad_norm_var": 1.064362951506299e-05, "learning_rate": 0.008872874032944568, "loss": 2.6319, "step": 6444 }, { "crossentropy": 2.4864799976348877, "epoch": 0.23364994199535963, "grad_norm": 0.035996802151203156, "grad_norm_var": 9.490271212475992e-06, "learning_rate": 0.00887250647908852, "loss": 2.5601, "step": 6445 }, { "crossentropy": 2.8188400268554688, "epoch": 0.23368619489559164, "grad_norm": 0.032456304877996445, "grad_norm_var": 8.02043718789262e-06, "learning_rate": 0.008872138872928394, "loss": 2.6716, "step": 6446 }, { "crossentropy": 2.6253364086151123, "epoch": 0.23372244779582366, "grad_norm": 0.034630149602890015, "grad_norm_var": 4.5164847924665404e-06, "learning_rate": 0.008871771214469155, "loss": 2.6548, "step": 6447 }, { "crossentropy": 2.4218063354492188, "epoch": 0.23375870069605567, "grad_norm": 0.03295174613595009, "grad_norm_var": 4.156096791098814e-06, "learning_rate": 0.008871403503715774, "loss": 2.5194, "step": 6448 }, { "crossentropy": 2.5668418407440186, "epoch": 0.2337949535962877, "grad_norm": 0.03337651118636131, "grad_norm_var": 4.151627362643182e-06, "learning_rate": 0.00887103574067321, "loss": 2.5759, "step": 6449 }, { "crossentropy": 2.6582751274108887, "epoch": 0.23383120649651973, "grad_norm": 0.03282567113637924, "grad_norm_var": 4.259846101009543e-06, "learning_rate": 0.008870667925346435, "loss": 2.5723, "step": 6450 }, { "crossentropy": 2.8065011501312256, "epoch": 0.23386745939675174, "grad_norm": 0.03770696371793747, "grad_norm_var": 4.689916393669074e-06, "learning_rate": 0.008870300057740418, "loss": 2.6587, "step": 6451 }, { "crossentropy": 2.6331281661987305, "epoch": 0.23390371229698376, "grad_norm": 0.03710751608014107, "grad_norm_var": 4.929421182249769e-06, "learning_rate": 0.008869932137860121, "loss": 2.63, "step": 6452 }, { "crossentropy": 2.611464262008667, "epoch": 0.23393996519721577, "grad_norm": 0.034103866666555405, "grad_norm_var": 4.9201722295120575e-06, "learning_rate": 0.008869564165710522, "loss": 2.6896, "step": 6453 }, { "crossentropy": 2.7555201053619385, "epoch": 0.23397621809744779, "grad_norm": 0.03429991751909256, "grad_norm_var": 4.979283006755394e-06, "learning_rate": 0.008869196141296583, "loss": 2.6732, "step": 6454 }, { "crossentropy": 2.604526996612549, "epoch": 0.23401247099767983, "grad_norm": 0.03623684123158455, "grad_norm_var": 4.549826453492609e-06, "learning_rate": 0.008868828064623281, "loss": 2.6099, "step": 6455 }, { "crossentropy": 2.591163158416748, "epoch": 0.23404872389791184, "grad_norm": 0.03867882117629051, "grad_norm_var": 4.262826379558318e-06, "learning_rate": 0.008868459935695584, "loss": 2.6709, "step": 6456 }, { "crossentropy": 2.5982542037963867, "epoch": 0.23408497679814386, "grad_norm": 0.03946373239159584, "grad_norm_var": 5.460015831461271e-06, "learning_rate": 0.008868091754518465, "loss": 2.5758, "step": 6457 }, { "crossentropy": 2.7021713256835938, "epoch": 0.23412122969837587, "grad_norm": 0.03662530705332756, "grad_norm_var": 5.559306711753973e-06, "learning_rate": 0.008867723521096896, "loss": 2.7, "step": 6458 }, { "crossentropy": 2.696969509124756, "epoch": 0.23415748259860789, "grad_norm": 0.03804835304617882, "grad_norm_var": 5.190872798240854e-06, "learning_rate": 0.008867355235435852, "loss": 2.6466, "step": 6459 }, { "crossentropy": 2.7931923866271973, "epoch": 0.2341937354988399, "grad_norm": 0.03716762363910675, "grad_norm_var": 5.021488947936961e-06, "learning_rate": 0.008866986897540308, "loss": 2.7452, "step": 6460 }, { "crossentropy": 2.749697208404541, "epoch": 0.23422998839907191, "grad_norm": 0.040314532816410065, "grad_norm_var": 6.340400472370824e-06, "learning_rate": 0.008866618507415236, "loss": 2.6629, "step": 6461 }, { "crossentropy": 2.591867446899414, "epoch": 0.23426624129930396, "grad_norm": 0.03717534616589546, "grad_norm_var": 5.502763936020745e-06, "learning_rate": 0.008866250065065613, "loss": 2.6535, "step": 6462 }, { "crossentropy": 2.9352481365203857, "epoch": 0.23430249419953597, "grad_norm": 0.03435242921113968, "grad_norm_var": 5.569216428990877e-06, "learning_rate": 0.008865881570496417, "loss": 2.7826, "step": 6463 }, { "crossentropy": 2.7380144596099854, "epoch": 0.23433874709976799, "grad_norm": 0.03340641409158707, "grad_norm_var": 5.38053971760871e-06, "learning_rate": 0.00886551302371262, "loss": 2.76, "step": 6464 }, { "crossentropy": 2.6180782318115234, "epoch": 0.234375, "grad_norm": 0.03508874028921127, "grad_norm_var": 4.895066407839312e-06, "learning_rate": 0.008865144424719208, "loss": 2.6535, "step": 6465 }, { "crossentropy": 2.7155821323394775, "epoch": 0.23441125290023201, "grad_norm": 0.03498588129878044, "grad_norm_var": 4.1535785570298e-06, "learning_rate": 0.008864775773521153, "loss": 2.6711, "step": 6466 }, { "crossentropy": 2.60300612449646, "epoch": 0.23444750580046403, "grad_norm": 0.03184627741575241, "grad_norm_var": 5.394384217563032e-06, "learning_rate": 0.008864407070123438, "loss": 2.5701, "step": 6467 }, { "crossentropy": 2.770231246948242, "epoch": 0.23448375870069604, "grad_norm": 0.030266014859080315, "grad_norm_var": 7.474919058614174e-06, "learning_rate": 0.008864038314531038, "loss": 2.6735, "step": 6468 }, { "crossentropy": 2.7552542686462402, "epoch": 0.23452001160092809, "grad_norm": 0.03176838904619217, "grad_norm_var": 8.329593211947811e-06, "learning_rate": 0.008863669506748939, "loss": 2.7433, "step": 6469 }, { "crossentropy": 2.795161247253418, "epoch": 0.2345562645011601, "grad_norm": 0.03458276018500328, "grad_norm_var": 8.285270302603323e-06, "learning_rate": 0.008863300646782118, "loss": 2.7514, "step": 6470 }, { "crossentropy": 2.746039390563965, "epoch": 0.23459251740139211, "grad_norm": 0.040695078670978546, "grad_norm_var": 9.890933368021066e-06, "learning_rate": 0.008862931734635562, "loss": 2.7565, "step": 6471 }, { "crossentropy": 2.717465877532959, "epoch": 0.23462877030162413, "grad_norm": 0.03997818008065224, "grad_norm_var": 1.0477167607184538e-05, "learning_rate": 0.008862562770314249, "loss": 2.6331, "step": 6472 }, { "crossentropy": 2.747084856033325, "epoch": 0.23466502320185614, "grad_norm": 0.03388578072190285, "grad_norm_var": 9.834772554497443e-06, "learning_rate": 0.008862193753823163, "loss": 2.7096, "step": 6473 }, { "crossentropy": 2.5168819427490234, "epoch": 0.23470127610208816, "grad_norm": 0.03162600100040436, "grad_norm_var": 1.0737855162745595e-05, "learning_rate": 0.00886182468516729, "loss": 2.5418, "step": 6474 }, { "crossentropy": 2.6048176288604736, "epoch": 0.23473752900232017, "grad_norm": 0.035237688571214676, "grad_norm_var": 1.0210718257778551e-05, "learning_rate": 0.008861455564351615, "loss": 2.7172, "step": 6475 }, { "crossentropy": 2.592407464981079, "epoch": 0.23477378190255221, "grad_norm": 0.037570368498563766, "grad_norm_var": 1.0329277707567123e-05, "learning_rate": 0.008861086391381121, "loss": 2.7209, "step": 6476 }, { "crossentropy": 2.6915929317474365, "epoch": 0.23481003480278423, "grad_norm": 0.03644309937953949, "grad_norm_var": 8.612397268983408e-06, "learning_rate": 0.008860717166260799, "loss": 2.7643, "step": 6477 }, { "crossentropy": 2.690849542617798, "epoch": 0.23484628770301624, "grad_norm": 0.0358998142182827, "grad_norm_var": 8.33251792781441e-06, "learning_rate": 0.00886034788899563, "loss": 2.633, "step": 6478 }, { "crossentropy": 2.590466260910034, "epoch": 0.23488254060324826, "grad_norm": 0.03546036034822464, "grad_norm_var": 8.335430248232568e-06, "learning_rate": 0.008859978559590604, "loss": 2.5262, "step": 6479 }, { "crossentropy": 2.541550874710083, "epoch": 0.23491879350348027, "grad_norm": 0.03342505916953087, "grad_norm_var": 8.331685945991045e-06, "learning_rate": 0.008859609178050712, "loss": 2.5657, "step": 6480 }, { "crossentropy": 2.658013343811035, "epoch": 0.2349550464037123, "grad_norm": 0.03164495527744293, "grad_norm_var": 8.99656709256144e-06, "learning_rate": 0.00885923974438094, "loss": 2.7133, "step": 6481 }, { "crossentropy": 2.7885897159576416, "epoch": 0.23499129930394433, "grad_norm": 0.03282717987895012, "grad_norm_var": 9.20761377110484e-06, "learning_rate": 0.008858870258586279, "loss": 2.6973, "step": 6482 }, { "crossentropy": 2.6944780349731445, "epoch": 0.23502755220417634, "grad_norm": 0.03227364644408226, "grad_norm_var": 9.063692617574633e-06, "learning_rate": 0.008858500720671719, "loss": 2.6824, "step": 6483 }, { "crossentropy": 2.67136812210083, "epoch": 0.23506380510440836, "grad_norm": 0.0332370288670063, "grad_norm_var": 7.89891812446227e-06, "learning_rate": 0.00885813113064225, "loss": 2.6632, "step": 6484 }, { "crossentropy": 2.7017405033111572, "epoch": 0.23510005800464037, "grad_norm": 0.04094916209578514, "grad_norm_var": 9.474540164087995e-06, "learning_rate": 0.008857761488502867, "loss": 2.6884, "step": 6485 }, { "crossentropy": 2.702406644821167, "epoch": 0.2351363109048724, "grad_norm": 0.045459501445293427, "grad_norm_var": 1.5743491296875874e-05, "learning_rate": 0.00885739179425856, "loss": 2.7065, "step": 6486 }, { "crossentropy": 2.677225351333618, "epoch": 0.2351725638051044, "grad_norm": 0.036654870957136154, "grad_norm_var": 1.425511930464255e-05, "learning_rate": 0.008857022047914323, "loss": 2.7243, "step": 6487 }, { "crossentropy": 2.7631659507751465, "epoch": 0.23520881670533642, "grad_norm": 0.031430356204509735, "grad_norm_var": 1.4043595904579773e-05, "learning_rate": 0.008856652249475148, "loss": 2.6972, "step": 6488 }, { "crossentropy": 2.823082447052002, "epoch": 0.23524506960556846, "grad_norm": 0.03136434033513069, "grad_norm_var": 1.49001120671693e-05, "learning_rate": 0.008856282398946034, "loss": 2.837, "step": 6489 }, { "crossentropy": 2.7494258880615234, "epoch": 0.23528132250580047, "grad_norm": 0.03383460268378258, "grad_norm_var": 1.4183735471941644e-05, "learning_rate": 0.008855912496331974, "loss": 2.7064, "step": 6490 }, { "crossentropy": 2.663731575012207, "epoch": 0.2353175754060325, "grad_norm": 0.033072978258132935, "grad_norm_var": 1.4474967382084129e-05, "learning_rate": 0.008855542541637965, "loss": 2.7632, "step": 6491 }, { "crossentropy": 2.6284875869750977, "epoch": 0.2353538283062645, "grad_norm": 0.035397522151470184, "grad_norm_var": 1.4053394915005251e-05, "learning_rate": 0.008855172534869001, "loss": 2.608, "step": 6492 }, { "crossentropy": 2.7451155185699463, "epoch": 0.23539008120649652, "grad_norm": 0.033856350928545, "grad_norm_var": 1.3960390552149777e-05, "learning_rate": 0.008854802476030084, "loss": 2.6948, "step": 6493 }, { "crossentropy": 2.6678967475891113, "epoch": 0.23542633410672853, "grad_norm": 0.039560213685035706, "grad_norm_var": 1.5334940533563655e-05, "learning_rate": 0.008854432365126206, "loss": 2.7287, "step": 6494 }, { "crossentropy": 2.735236883163452, "epoch": 0.23546258700696054, "grad_norm": 0.04350612312555313, "grad_norm_var": 1.9844648075434807e-05, "learning_rate": 0.008854062202162371, "loss": 2.6079, "step": 6495 }, { "crossentropy": 2.6140758991241455, "epoch": 0.2354988399071926, "grad_norm": 0.0403134860098362, "grad_norm_var": 2.0876204719207496e-05, "learning_rate": 0.008853691987143579, "loss": 2.6124, "step": 6496 }, { "crossentropy": 2.627772092819214, "epoch": 0.2355350928074246, "grad_norm": 0.03576714172959328, "grad_norm_var": 1.9565808518163624e-05, "learning_rate": 0.008853321720074827, "loss": 2.6214, "step": 6497 }, { "crossentropy": 2.575263738632202, "epoch": 0.23557134570765662, "grad_norm": 0.03362452983856201, "grad_norm_var": 1.924494491398933e-05, "learning_rate": 0.008852951400961118, "loss": 2.6115, "step": 6498 }, { "crossentropy": 2.6771280765533447, "epoch": 0.23560759860788863, "grad_norm": 0.0332770012319088, "grad_norm_var": 1.8773381953606355e-05, "learning_rate": 0.008852581029807453, "loss": 2.6639, "step": 6499 }, { "crossentropy": 2.6590731143951416, "epoch": 0.23564385150812064, "grad_norm": 0.03527574613690376, "grad_norm_var": 1.8191967512221942e-05, "learning_rate": 0.008852210606618836, "loss": 2.6118, "step": 6500 }, { "crossentropy": 2.7045130729675293, "epoch": 0.23568010440835266, "grad_norm": 0.03474893420934677, "grad_norm_var": 1.6882636580750622e-05, "learning_rate": 0.008851840131400267, "loss": 2.6629, "step": 6501 }, { "crossentropy": 2.810664415359497, "epoch": 0.23571635730858467, "grad_norm": 0.031406842172145844, "grad_norm_var": 1.1634743622131646e-05, "learning_rate": 0.008851469604156751, "loss": 2.796, "step": 6502 }, { "crossentropy": 2.7173333168029785, "epoch": 0.23575261020881672, "grad_norm": 0.03149714693427086, "grad_norm_var": 1.2292182994399123e-05, "learning_rate": 0.008851099024893297, "loss": 2.736, "step": 6503 }, { "crossentropy": 2.6362485885620117, "epoch": 0.23578886310904873, "grad_norm": 0.031146593391895294, "grad_norm_var": 1.2427386131486084e-05, "learning_rate": 0.008850728393614902, "loss": 2.7068, "step": 6504 }, { "crossentropy": 2.769693374633789, "epoch": 0.23582511600928074, "grad_norm": 0.03182321786880493, "grad_norm_var": 1.2227091724745636e-05, "learning_rate": 0.00885035771032658, "loss": 2.6831, "step": 6505 }, { "crossentropy": 2.7299890518188477, "epoch": 0.23586136890951276, "grad_norm": 0.03155818581581116, "grad_norm_var": 1.2868812006972911e-05, "learning_rate": 0.00884998697503333, "loss": 2.6279, "step": 6506 }, { "crossentropy": 2.720449209213257, "epoch": 0.23589762180974477, "grad_norm": 0.0325605683028698, "grad_norm_var": 1.2999081289700488e-05, "learning_rate": 0.008849616187740167, "loss": 2.7365, "step": 6507 }, { "crossentropy": 2.6422207355499268, "epoch": 0.2359338747099768, "grad_norm": 0.04048125445842743, "grad_norm_var": 1.5082087358562977e-05, "learning_rate": 0.008849245348452092, "loss": 2.6193, "step": 6508 }, { "crossentropy": 2.564298391342163, "epoch": 0.23597012761020883, "grad_norm": 0.03541278839111328, "grad_norm_var": 1.4990926297392404e-05, "learning_rate": 0.008848874457174121, "loss": 2.5543, "step": 6509 }, { "crossentropy": 2.7004566192626953, "epoch": 0.23600638051044084, "grad_norm": 0.032104846090078354, "grad_norm_var": 1.405351392950397e-05, "learning_rate": 0.008848503513911256, "loss": 2.6835, "step": 6510 }, { "crossentropy": 2.765814781188965, "epoch": 0.23604263341067286, "grad_norm": 0.05679567903280258, "grad_norm_var": 4.077274568353619e-05, "learning_rate": 0.008848132518668514, "loss": 2.7452, "step": 6511 }, { "crossentropy": 2.543546438217163, "epoch": 0.23607888631090487, "grad_norm": 0.03235452622175217, "grad_norm_var": 3.961009976771827e-05, "learning_rate": 0.008847761471450902, "loss": 2.6144, "step": 6512 }, { "crossentropy": 2.689915895462036, "epoch": 0.2361151392111369, "grad_norm": 0.0342564582824707, "grad_norm_var": 3.959613677247531e-05, "learning_rate": 0.00884739037226343, "loss": 2.6658, "step": 6513 }, { "crossentropy": 2.675412178039551, "epoch": 0.2361513921113689, "grad_norm": 0.033608391880989075, "grad_norm_var": 3.959888733613686e-05, "learning_rate": 0.008847019221111115, "loss": 2.7242, "step": 6514 }, { "crossentropy": 2.6974565982818604, "epoch": 0.23618764501160092, "grad_norm": 0.034524526447057724, "grad_norm_var": 3.942714759246988e-05, "learning_rate": 0.008846648017998966, "loss": 2.7431, "step": 6515 }, { "crossentropy": 2.716465473175049, "epoch": 0.23622389791183296, "grad_norm": 0.035078685730695724, "grad_norm_var": 3.9421599883036834e-05, "learning_rate": 0.008846276762931997, "loss": 2.6908, "step": 6516 }, { "crossentropy": 2.8101375102996826, "epoch": 0.23626015081206497, "grad_norm": 0.032531607896089554, "grad_norm_var": 3.9791258579562526e-05, "learning_rate": 0.008845905455915223, "loss": 2.7446, "step": 6517 }, { "crossentropy": 2.683401107788086, "epoch": 0.236296403712297, "grad_norm": 0.03276428207755089, "grad_norm_var": 3.928842839500627e-05, "learning_rate": 0.008845534096953662, "loss": 2.7061, "step": 6518 }, { "crossentropy": 2.551327705383301, "epoch": 0.236332656612529, "grad_norm": 0.03206324949860573, "grad_norm_var": 3.905114349235881e-05, "learning_rate": 0.008845162686052326, "loss": 2.6114, "step": 6519 }, { "crossentropy": 2.6595654487609863, "epoch": 0.23636890951276102, "grad_norm": 0.033224642276763916, "grad_norm_var": 3.8269554664296415e-05, "learning_rate": 0.008844791223216233, "loss": 2.6997, "step": 6520 }, { "crossentropy": 2.642547130584717, "epoch": 0.23640516241299303, "grad_norm": 0.033289823681116104, "grad_norm_var": 3.77688080102021e-05, "learning_rate": 0.0088444197084504, "loss": 2.6156, "step": 6521 }, { "crossentropy": 2.419670343399048, "epoch": 0.23644141531322505, "grad_norm": 0.03407183289527893, "grad_norm_var": 3.6955513519056306e-05, "learning_rate": 0.008844048141759844, "loss": 2.5736, "step": 6522 }, { "crossentropy": 2.815092086791992, "epoch": 0.2364776682134571, "grad_norm": 0.031118452548980713, "grad_norm_var": 3.761612197122847e-05, "learning_rate": 0.008843676523149584, "loss": 2.7114, "step": 6523 }, { "crossentropy": 2.6462202072143555, "epoch": 0.2365139211136891, "grad_norm": 0.03374810144305229, "grad_norm_var": 3.5735306781007474e-05, "learning_rate": 0.00884330485262464, "loss": 2.6219, "step": 6524 }, { "crossentropy": 2.855604648590088, "epoch": 0.23655017401392112, "grad_norm": 0.032158441841602325, "grad_norm_var": 3.613534405613461e-05, "learning_rate": 0.008842933130190031, "loss": 2.7294, "step": 6525 }, { "crossentropy": 2.759343147277832, "epoch": 0.23658642691415313, "grad_norm": 0.03339798375964165, "grad_norm_var": 3.580863845034088e-05, "learning_rate": 0.00884256135585078, "loss": 2.7192, "step": 6526 }, { "crossentropy": 2.85068678855896, "epoch": 0.23662267981438515, "grad_norm": 0.03364414721727371, "grad_norm_var": 1.0605640805377848e-06, "learning_rate": 0.008842189529611907, "loss": 2.7901, "step": 6527 }, { "crossentropy": 2.9042112827301025, "epoch": 0.23665893271461716, "grad_norm": 0.04419982060790062, "grad_norm_var": 8.43198689619521e-06, "learning_rate": 0.00884181765147843, "loss": 2.8287, "step": 6528 }, { "crossentropy": 2.6289992332458496, "epoch": 0.2366951856148492, "grad_norm": 0.035956934094429016, "grad_norm_var": 8.675388087051906e-06, "learning_rate": 0.008841445721455378, "loss": 2.595, "step": 6529 }, { "crossentropy": 2.6213016510009766, "epoch": 0.23673143851508122, "grad_norm": 0.03537202626466751, "grad_norm_var": 8.75740596397532e-06, "learning_rate": 0.008841073739547772, "loss": 2.591, "step": 6530 }, { "crossentropy": 2.6736268997192383, "epoch": 0.23676769141531323, "grad_norm": 0.03466946631669998, "grad_norm_var": 8.765057477259224e-06, "learning_rate": 0.008840701705760635, "loss": 2.6531, "step": 6531 }, { "crossentropy": 2.6500682830810547, "epoch": 0.23680394431554525, "grad_norm": 0.03242871165275574, "grad_norm_var": 8.89546561757265e-06, "learning_rate": 0.008840329620098994, "loss": 2.7257, "step": 6532 }, { "crossentropy": 2.5547597408294678, "epoch": 0.23684019721577726, "grad_norm": 0.03405902162194252, "grad_norm_var": 8.734091878048224e-06, "learning_rate": 0.008839957482567874, "loss": 2.6209, "step": 6533 }, { "crossentropy": 2.8059139251708984, "epoch": 0.23687645011600927, "grad_norm": 0.03534485399723053, "grad_norm_var": 8.678520656130172e-06, "learning_rate": 0.0088395852931723, "loss": 2.7483, "step": 6534 }, { "crossentropy": 2.685795307159424, "epoch": 0.2369127030162413, "grad_norm": 0.04010533168911934, "grad_norm_var": 1.0325813772002478e-05, "learning_rate": 0.008839213051917299, "loss": 2.6613, "step": 6535 }, { "crossentropy": 2.6429100036621094, "epoch": 0.23694895591647333, "grad_norm": 0.03753558546304703, "grad_norm_var": 1.0582198439247857e-05, "learning_rate": 0.0088388407588079, "loss": 2.6118, "step": 6536 }, { "crossentropy": 2.5415992736816406, "epoch": 0.23698520881670534, "grad_norm": 0.032892804592847824, "grad_norm_var": 1.0686220746745448e-05, "learning_rate": 0.008838468413849132, "loss": 2.6172, "step": 6537 }, { "crossentropy": 2.5625717639923096, "epoch": 0.23702146171693736, "grad_norm": 0.03485129028558731, "grad_norm_var": 1.0623160959886275e-05, "learning_rate": 0.008838096017046023, "loss": 2.6491, "step": 6538 }, { "crossentropy": 2.727794647216797, "epoch": 0.23705771461716937, "grad_norm": 0.03156968206167221, "grad_norm_var": 1.0396780951121842e-05, "learning_rate": 0.008837723568403602, "loss": 2.6942, "step": 6539 }, { "crossentropy": 2.681267261505127, "epoch": 0.2370939675174014, "grad_norm": 0.03377322107553482, "grad_norm_var": 1.039222253692845e-05, "learning_rate": 0.0088373510679269, "loss": 2.7215, "step": 6540 }, { "crossentropy": 2.8134677410125732, "epoch": 0.2371302204176334, "grad_norm": 0.038600049912929535, "grad_norm_var": 1.043988010102307e-05, "learning_rate": 0.008836978515620948, "loss": 2.7538, "step": 6541 }, { "crossentropy": 2.6326780319213867, "epoch": 0.23716647331786542, "grad_norm": 0.037819910794496536, "grad_norm_var": 1.040786764509623e-05, "learning_rate": 0.00883660591149078, "loss": 2.6782, "step": 6542 }, { "crossentropy": 2.4383158683776855, "epoch": 0.23720272621809746, "grad_norm": 0.037043094635009766, "grad_norm_var": 1.0152255567496584e-05, "learning_rate": 0.008836233255541425, "loss": 2.5262, "step": 6543 }, { "crossentropy": 2.593188762664795, "epoch": 0.23723897911832947, "grad_norm": 0.0483037605881691, "grad_norm_var": 1.5684191371791397e-05, "learning_rate": 0.00883586054777792, "loss": 2.6306, "step": 6544 }, { "crossentropy": 2.6103122234344482, "epoch": 0.2372752320185615, "grad_norm": 0.03363525867462158, "grad_norm_var": 1.6118100225777595e-05, "learning_rate": 0.008835487788205296, "loss": 2.7288, "step": 6545 }, { "crossentropy": 2.775510787963867, "epoch": 0.2373114849187935, "grad_norm": 0.035943496972322464, "grad_norm_var": 1.6081118360498856e-05, "learning_rate": 0.008835114976828588, "loss": 2.7652, "step": 6546 }, { "crossentropy": 2.61922287940979, "epoch": 0.23734773781902552, "grad_norm": 0.03620150685310364, "grad_norm_var": 1.592314231665063e-05, "learning_rate": 0.008834742113652833, "loss": 2.6233, "step": 6547 }, { "crossentropy": 2.8011457920074463, "epoch": 0.23738399071925753, "grad_norm": 0.03777706250548363, "grad_norm_var": 1.4981138928660034e-05, "learning_rate": 0.008834369198683068, "loss": 2.7503, "step": 6548 }, { "crossentropy": 2.714332103729248, "epoch": 0.23742024361948955, "grad_norm": 0.03769910708069801, "grad_norm_var": 1.4580397527184235e-05, "learning_rate": 0.008833996231924327, "loss": 2.7197, "step": 6549 }, { "crossentropy": 2.579589605331421, "epoch": 0.2374564965197216, "grad_norm": 0.03461179509758949, "grad_norm_var": 1.4758019500508459e-05, "learning_rate": 0.008833623213381648, "loss": 2.5855, "step": 6550 }, { "crossentropy": 2.6336464881896973, "epoch": 0.2374927494199536, "grad_norm": 0.03215890750288963, "grad_norm_var": 1.5173606403699226e-05, "learning_rate": 0.008833250143060072, "loss": 2.665, "step": 6551 }, { "crossentropy": 2.4493751525878906, "epoch": 0.23752900232018562, "grad_norm": 0.032257482409477234, "grad_norm_var": 1.602834855190475e-05, "learning_rate": 0.008832877020964632, "loss": 2.5006, "step": 6552 }, { "crossentropy": 2.6581122875213623, "epoch": 0.23756525522041763, "grad_norm": 0.03646041080355644, "grad_norm_var": 1.5371418136946904e-05, "learning_rate": 0.008832503847100374, "loss": 2.5928, "step": 6553 }, { "crossentropy": 2.544191360473633, "epoch": 0.23760150812064965, "grad_norm": 0.03466128930449486, "grad_norm_var": 1.5407059787719836e-05, "learning_rate": 0.008832130621472334, "loss": 2.6291, "step": 6554 }, { "crossentropy": 2.6358721256256104, "epoch": 0.23763776102088166, "grad_norm": 0.04016704857349396, "grad_norm_var": 1.476792622752294e-05, "learning_rate": 0.008831757344085556, "loss": 2.7028, "step": 6555 }, { "crossentropy": 2.5527358055114746, "epoch": 0.2376740139211137, "grad_norm": 0.03928053006529808, "grad_norm_var": 1.4518397026141473e-05, "learning_rate": 0.00883138401494508, "loss": 2.5222, "step": 6556 }, { "crossentropy": 2.6151347160339355, "epoch": 0.23771026682134572, "grad_norm": 0.03639901801943779, "grad_norm_var": 1.436299779821737e-05, "learning_rate": 0.008831010634055947, "loss": 2.5874, "step": 6557 }, { "crossentropy": 2.6324055194854736, "epoch": 0.23774651972157773, "grad_norm": 0.0342678427696228, "grad_norm_var": 1.471647644130976e-05, "learning_rate": 0.008830637201423202, "loss": 2.6293, "step": 6558 }, { "crossentropy": 2.5558626651763916, "epoch": 0.23778277262180975, "grad_norm": 0.03412480652332306, "grad_norm_var": 1.5107168549537547e-05, "learning_rate": 0.00883026371705189, "loss": 2.5759, "step": 6559 }, { "crossentropy": 2.7683849334716797, "epoch": 0.23781902552204176, "grad_norm": 0.037865061312913895, "grad_norm_var": 5.484375311544619e-06, "learning_rate": 0.008829890180947052, "loss": 2.7198, "step": 6560 }, { "crossentropy": 2.4978976249694824, "epoch": 0.23785527842227377, "grad_norm": 0.03432784229516983, "grad_norm_var": 5.31035150749352e-06, "learning_rate": 0.008829516593113736, "loss": 2.5792, "step": 6561 }, { "crossentropy": 2.6560230255126953, "epoch": 0.2378915313225058, "grad_norm": 0.03314872086048126, "grad_norm_var": 5.777733034541208e-06, "learning_rate": 0.008829142953556987, "loss": 2.6573, "step": 6562 }, { "crossentropy": 2.7550368309020996, "epoch": 0.23792778422273783, "grad_norm": 0.03168376535177231, "grad_norm_var": 6.759113873405718e-06, "learning_rate": 0.008828769262281851, "loss": 2.6591, "step": 6563 }, { "crossentropy": 2.7608256340026855, "epoch": 0.23796403712296985, "grad_norm": 0.032820750027894974, "grad_norm_var": 6.743833312977619e-06, "learning_rate": 0.008828395519293375, "loss": 2.7153, "step": 6564 }, { "crossentropy": 2.619945526123047, "epoch": 0.23800029002320186, "grad_norm": 0.03155442327260971, "grad_norm_var": 6.9913510731702455e-06, "learning_rate": 0.008828021724596609, "loss": 2.6408, "step": 6565 }, { "crossentropy": 2.5749642848968506, "epoch": 0.23803654292343387, "grad_norm": 0.03220757842063904, "grad_norm_var": 7.392706952226377e-06, "learning_rate": 0.008827647878196601, "loss": 2.657, "step": 6566 }, { "crossentropy": 2.5801937580108643, "epoch": 0.2380727958236659, "grad_norm": 0.03135312348604202, "grad_norm_var": 7.694112730621585e-06, "learning_rate": 0.008827273980098398, "loss": 2.5973, "step": 6567 }, { "crossentropy": 2.6142661571502686, "epoch": 0.2381090487238979, "grad_norm": 0.03216410428285599, "grad_norm_var": 7.723029065754654e-06, "learning_rate": 0.008826900030307052, "loss": 2.6105, "step": 6568 }, { "crossentropy": 2.7625160217285156, "epoch": 0.23814530162412992, "grad_norm": 0.0324680395424366, "grad_norm_var": 7.691839562599251e-06, "learning_rate": 0.008826526028827615, "loss": 2.706, "step": 6569 }, { "crossentropy": 2.692744016647339, "epoch": 0.23818155452436196, "grad_norm": 0.039699047803878784, "grad_norm_var": 9.53355479706517e-06, "learning_rate": 0.008826151975665135, "loss": 2.5917, "step": 6570 }, { "crossentropy": 2.8658788204193115, "epoch": 0.23821780742459397, "grad_norm": 0.03300425037741661, "grad_norm_var": 7.419330270987354e-06, "learning_rate": 0.008825777870824668, "loss": 2.7572, "step": 6571 }, { "crossentropy": 2.5486643314361572, "epoch": 0.238254060324826, "grad_norm": 0.03824468329548836, "grad_norm_var": 6.777530610866759e-06, "learning_rate": 0.008825403714311264, "loss": 2.6457, "step": 6572 }, { "crossentropy": 2.5952999591827393, "epoch": 0.238290313225058, "grad_norm": 0.037566348910331726, "grad_norm_var": 7.223122333132175e-06, "learning_rate": 0.00882502950612998, "loss": 2.6275, "step": 6573 }, { "crossentropy": 2.6010940074920654, "epoch": 0.23832656612529002, "grad_norm": 0.03899485245347023, "grad_norm_var": 8.689979159613031e-06, "learning_rate": 0.008824655246285864, "loss": 2.6521, "step": 6574 }, { "crossentropy": 2.730771541595459, "epoch": 0.23836281902552203, "grad_norm": 0.03340739384293556, "grad_norm_var": 8.753416907637291e-06, "learning_rate": 0.008824280934783976, "loss": 2.6323, "step": 6575 }, { "crossentropy": 2.6626944541931152, "epoch": 0.23839907192575405, "grad_norm": 0.03801819682121277, "grad_norm_var": 8.825492063848038e-06, "learning_rate": 0.008823906571629372, "loss": 2.6687, "step": 6576 }, { "crossentropy": 2.704793930053711, "epoch": 0.2384353248259861, "grad_norm": 0.04338590428233147, "grad_norm_var": 1.3846513247854572e-05, "learning_rate": 0.008823532156827105, "loss": 2.7364, "step": 6577 }, { "crossentropy": 2.743253707885742, "epoch": 0.2384715777262181, "grad_norm": 0.04563862830400467, "grad_norm_var": 2.0542420965016735e-05, "learning_rate": 0.008823157690382236, "loss": 2.6536, "step": 6578 }, { "crossentropy": 2.811781406402588, "epoch": 0.23850783062645012, "grad_norm": 0.04119402542710304, "grad_norm_var": 2.10223804685337e-05, "learning_rate": 0.00882278317229982, "loss": 2.7262, "step": 6579 }, { "crossentropy": 2.6664764881134033, "epoch": 0.23854408352668213, "grad_norm": 0.03230109438300133, "grad_norm_var": 2.128431621971789e-05, "learning_rate": 0.008822408602584915, "loss": 2.7196, "step": 6580 }, { "crossentropy": 2.6406898498535156, "epoch": 0.23858033642691415, "grad_norm": 0.03160245344042778, "grad_norm_var": 2.1253908842285875e-05, "learning_rate": 0.00882203398124258, "loss": 2.6469, "step": 6581 }, { "crossentropy": 2.6870007514953613, "epoch": 0.23861658932714616, "grad_norm": 0.030638031661510468, "grad_norm_var": 2.2270191297951657e-05, "learning_rate": 0.008821659308277876, "loss": 2.7485, "step": 6582 }, { "crossentropy": 2.689530611038208, "epoch": 0.2386528422273782, "grad_norm": 0.03364620730280876, "grad_norm_var": 2.1107749262718143e-05, "learning_rate": 0.008821284583695866, "loss": 2.7064, "step": 6583 }, { "crossentropy": 2.764772653579712, "epoch": 0.23868909512761022, "grad_norm": 0.03228498622775078, "grad_norm_var": 2.1040819973013915e-05, "learning_rate": 0.008820909807501607, "loss": 2.6889, "step": 6584 }, { "crossentropy": 2.532945156097412, "epoch": 0.23872534802784223, "grad_norm": 0.03383314982056618, "grad_norm_var": 2.0445095126848423e-05, "learning_rate": 0.008820534979700163, "loss": 2.5951, "step": 6585 }, { "crossentropy": 2.736896514892578, "epoch": 0.23876160092807425, "grad_norm": 0.03319626301527023, "grad_norm_var": 2.0284984394672704e-05, "learning_rate": 0.008820160100296594, "loss": 2.7164, "step": 6586 }, { "crossentropy": 2.779933214187622, "epoch": 0.23879785382830626, "grad_norm": 0.03398273140192032, "grad_norm_var": 1.9946186515394185e-05, "learning_rate": 0.00881978516929597, "loss": 2.7062, "step": 6587 }, { "crossentropy": 2.540233612060547, "epoch": 0.23883410672853828, "grad_norm": 0.035957712680101395, "grad_norm_var": 1.962548267211983e-05, "learning_rate": 0.008819410186703347, "loss": 2.5706, "step": 6588 }, { "crossentropy": 2.5687572956085205, "epoch": 0.2388703596287703, "grad_norm": 0.03424692153930664, "grad_norm_var": 1.9611156651293722e-05, "learning_rate": 0.008819035152523795, "loss": 2.6209, "step": 6589 }, { "crossentropy": 2.447526693344116, "epoch": 0.23890661252900233, "grad_norm": 0.033068180084228516, "grad_norm_var": 1.925856686695078e-05, "learning_rate": 0.008818660066762378, "loss": 2.5154, "step": 6590 }, { "crossentropy": 2.7828638553619385, "epoch": 0.23894286542923435, "grad_norm": 0.03234316036105156, "grad_norm_var": 1.961211702366002e-05, "learning_rate": 0.00881828492942416, "loss": 2.7294, "step": 6591 }, { "crossentropy": 2.562955141067505, "epoch": 0.23897911832946636, "grad_norm": 0.03333481028676033, "grad_norm_var": 1.9306600074053596e-05, "learning_rate": 0.008817909740514211, "loss": 2.5606, "step": 6592 }, { "crossentropy": 2.6194348335266113, "epoch": 0.23901537122969838, "grad_norm": 0.03432183340191841, "grad_norm_var": 1.4356131115283074e-05, "learning_rate": 0.008817534500037597, "loss": 2.708, "step": 6593 }, { "crossentropy": 2.6652543544769287, "epoch": 0.2390516241299304, "grad_norm": 0.03423091769218445, "grad_norm_var": 5.508497667004036e-06, "learning_rate": 0.008817159207999385, "loss": 2.6592, "step": 6594 }, { "crossentropy": 2.6508796215057373, "epoch": 0.2390878770301624, "grad_norm": 0.0334940105676651, "grad_norm_var": 1.5832985230385468e-06, "learning_rate": 0.008816783864404647, "loss": 2.5735, "step": 6595 }, { "crossentropy": 2.663630723953247, "epoch": 0.23912412993039442, "grad_norm": 0.036718666553497314, "grad_norm_var": 2.2263069849091007e-06, "learning_rate": 0.00881640846925845, "loss": 2.6556, "step": 6596 }, { "crossentropy": 2.7167398929595947, "epoch": 0.23916038283062646, "grad_norm": 0.03777977079153061, "grad_norm_var": 3.0020287149295475e-06, "learning_rate": 0.008816033022565863, "loss": 2.6208, "step": 6597 }, { "crossentropy": 2.5705041885375977, "epoch": 0.23919663573085848, "grad_norm": 0.04152429476380348, "grad_norm_var": 5.612747517183547e-06, "learning_rate": 0.00881565752433196, "loss": 2.5986, "step": 6598 }, { "crossentropy": 2.651458501815796, "epoch": 0.2392328886310905, "grad_norm": 0.037554241716861725, "grad_norm_var": 6.058457489274173e-06, "learning_rate": 0.008815281974561814, "loss": 2.5856, "step": 6599 }, { "crossentropy": 2.6186363697052, "epoch": 0.2392691415313225, "grad_norm": 0.03279075026512146, "grad_norm_var": 5.900327661451099e-06, "learning_rate": 0.008814906373260493, "loss": 2.5784, "step": 6600 }, { "crossentropy": 2.673994541168213, "epoch": 0.23930539443155452, "grad_norm": 0.034280452877283096, "grad_norm_var": 5.849289469855856e-06, "learning_rate": 0.008814530720433074, "loss": 2.6996, "step": 6601 }, { "crossentropy": 2.6134841442108154, "epoch": 0.23934164733178653, "grad_norm": 0.035452231764793396, "grad_norm_var": 5.646915083521581e-06, "learning_rate": 0.008814155016084626, "loss": 2.6159, "step": 6602 }, { "crossentropy": 2.7303662300109863, "epoch": 0.23937790023201855, "grad_norm": 0.04019268602132797, "grad_norm_var": 7.1589187324636505e-06, "learning_rate": 0.008813779260220228, "loss": 2.7321, "step": 6603 }, { "crossentropy": 2.662658929824829, "epoch": 0.2394141531322506, "grad_norm": 0.03771887347102165, "grad_norm_var": 7.470665729194155e-06, "learning_rate": 0.008813403452844952, "loss": 2.5514, "step": 6604 }, { "crossentropy": 2.6420223712921143, "epoch": 0.2394504060324826, "grad_norm": 0.03441489115357399, "grad_norm_var": 7.4428929537026855e-06, "learning_rate": 0.008813027593963877, "loss": 2.7552, "step": 6605 }, { "crossentropy": 2.5400800704956055, "epoch": 0.23948665893271462, "grad_norm": 0.03388824686408043, "grad_norm_var": 7.210688382667622e-06, "learning_rate": 0.008812651683582078, "loss": 2.6241, "step": 6606 }, { "crossentropy": 2.7470672130584717, "epoch": 0.23952291183294663, "grad_norm": 0.03201936557888985, "grad_norm_var": 7.359034243403553e-06, "learning_rate": 0.008812275721704631, "loss": 2.7179, "step": 6607 }, { "crossentropy": 2.7490413188934326, "epoch": 0.23955916473317865, "grad_norm": 0.03475986793637276, "grad_norm_var": 7.054177042270284e-06, "learning_rate": 0.008811899708336617, "loss": 2.7233, "step": 6608 }, { "crossentropy": 2.535808801651001, "epoch": 0.23959541763341066, "grad_norm": 0.0366593562066555, "grad_norm_var": 6.967292372405305e-06, "learning_rate": 0.00881152364348311, "loss": 2.5099, "step": 6609 }, { "crossentropy": 2.604576826095581, "epoch": 0.2396316705336427, "grad_norm": 0.03495270013809204, "grad_norm_var": 6.844766356871775e-06, "learning_rate": 0.008811147527149194, "loss": 2.6144, "step": 6610 }, { "crossentropy": 2.7263660430908203, "epoch": 0.23966792343387472, "grad_norm": 0.034198056906461716, "grad_norm_var": 6.651060385957861e-06, "learning_rate": 0.008810771359339947, "loss": 2.7602, "step": 6611 }, { "crossentropy": 2.6279869079589844, "epoch": 0.23970417633410673, "grad_norm": 0.03146437555551529, "grad_norm_var": 7.82508663807324e-06, "learning_rate": 0.008810395140060448, "loss": 2.659, "step": 6612 }, { "crossentropy": 2.842097043991089, "epoch": 0.23974042923433875, "grad_norm": 0.03296816349029541, "grad_norm_var": 7.875643964499965e-06, "learning_rate": 0.008810018869315782, "loss": 2.6854, "step": 6613 }, { "crossentropy": 2.5778985023498535, "epoch": 0.23977668213457076, "grad_norm": 0.033207330852746964, "grad_norm_var": 5.299261169576924e-06, "learning_rate": 0.008809642547111028, "loss": 2.6568, "step": 6614 }, { "crossentropy": 2.6376373767852783, "epoch": 0.23981293503480278, "grad_norm": 0.03650112822651863, "grad_norm_var": 4.9793961526450416e-06, "learning_rate": 0.008809266173451269, "loss": 2.6544, "step": 6615 }, { "crossentropy": 2.5406992435455322, "epoch": 0.2398491879350348, "grad_norm": 0.04019441455602646, "grad_norm_var": 6.503996284118463e-06, "learning_rate": 0.008808889748341592, "loss": 2.5809, "step": 6616 }, { "crossentropy": 2.7158830165863037, "epoch": 0.23988544083526683, "grad_norm": 0.036888424307107925, "grad_norm_var": 6.6164626648786605e-06, "learning_rate": 0.008808513271787078, "loss": 2.6862, "step": 6617 }, { "crossentropy": 2.6823537349700928, "epoch": 0.23992169373549885, "grad_norm": 0.03266638517379761, "grad_norm_var": 7.06076466730395e-06, "learning_rate": 0.008808136743792811, "loss": 2.7755, "step": 6618 }, { "crossentropy": 2.7083330154418945, "epoch": 0.23995794663573086, "grad_norm": 0.031125597655773163, "grad_norm_var": 6.124924245925108e-06, "learning_rate": 0.00880776016436388, "loss": 2.7743, "step": 6619 }, { "crossentropy": 2.5202558040618896, "epoch": 0.23999419953596288, "grad_norm": 0.03354303166270256, "grad_norm_var": 5.47920046785518e-06, "learning_rate": 0.00880738353350537, "loss": 2.6469, "step": 6620 }, { "crossentropy": 2.633805274963379, "epoch": 0.2400304524361949, "grad_norm": 0.03652152419090271, "grad_norm_var": 5.777406135841432e-06, "learning_rate": 0.008807006851222366, "loss": 2.6106, "step": 6621 }, { "crossentropy": 2.6400413513183594, "epoch": 0.2400667053364269, "grad_norm": 0.037234943360090256, "grad_norm_var": 6.216777303665449e-06, "learning_rate": 0.008806630117519958, "loss": 2.6059, "step": 6622 }, { "crossentropy": 2.6317713260650635, "epoch": 0.24010295823665892, "grad_norm": 0.036066651344299316, "grad_norm_var": 5.803948310802397e-06, "learning_rate": 0.008806253332403233, "loss": 2.5982, "step": 6623 }, { "crossentropy": 2.6091079711914062, "epoch": 0.24013921113689096, "grad_norm": 0.031060675159096718, "grad_norm_var": 6.745331540340899e-06, "learning_rate": 0.008805876495877281, "loss": 2.6025, "step": 6624 }, { "crossentropy": 2.613344192504883, "epoch": 0.24017546403712298, "grad_norm": 0.03231855854392052, "grad_norm_var": 6.790875259960134e-06, "learning_rate": 0.008805499607947192, "loss": 2.6082, "step": 6625 }, { "crossentropy": 2.5840046405792236, "epoch": 0.240211716937355, "grad_norm": 0.03373512998223305, "grad_norm_var": 6.798997823506435e-06, "learning_rate": 0.008805122668618054, "loss": 2.6453, "step": 6626 }, { "crossentropy": 2.653066635131836, "epoch": 0.240247969837587, "grad_norm": 0.03481319174170494, "grad_norm_var": 6.809701327525419e-06, "learning_rate": 0.008804745677894962, "loss": 2.6402, "step": 6627 }, { "crossentropy": 2.629084587097168, "epoch": 0.24028422273781902, "grad_norm": 0.03658456355333328, "grad_norm_var": 6.447955482929039e-06, "learning_rate": 0.008804368635783003, "loss": 2.6511, "step": 6628 }, { "crossentropy": 2.620681047439575, "epoch": 0.24032047563805103, "grad_norm": 0.03409199044108391, "grad_norm_var": 6.265236244190836e-06, "learning_rate": 0.008803991542287275, "loss": 2.713, "step": 6629 }, { "crossentropy": 2.757866859436035, "epoch": 0.24035672853828308, "grad_norm": 0.03811375051736832, "grad_norm_var": 6.73796570561957e-06, "learning_rate": 0.008803614397412866, "loss": 2.7239, "step": 6630 }, { "crossentropy": 2.708996057510376, "epoch": 0.2403929814385151, "grad_norm": 0.032908957451581955, "grad_norm_var": 6.869175418914306e-06, "learning_rate": 0.008803237201164873, "loss": 2.7119, "step": 6631 }, { "crossentropy": 2.601686477661133, "epoch": 0.2404292343387471, "grad_norm": 0.033576399087905884, "grad_norm_var": 4.905404671508512e-06, "learning_rate": 0.00880285995354839, "loss": 2.6101, "step": 6632 }, { "crossentropy": 2.7177298069000244, "epoch": 0.24046548723897912, "grad_norm": 0.030680730938911438, "grad_norm_var": 5.298180201119629e-06, "learning_rate": 0.008802482654568513, "loss": 2.715, "step": 6633 }, { "crossentropy": 2.638029098510742, "epoch": 0.24050174013921113, "grad_norm": 0.034239716827869415, "grad_norm_var": 5.1594657032877906e-06, "learning_rate": 0.008802105304230336, "loss": 2.664, "step": 6634 }, { "crossentropy": 2.7365221977233887, "epoch": 0.24053799303944315, "grad_norm": 0.03253554925322533, "grad_norm_var": 4.712614290771364e-06, "learning_rate": 0.008801727902538959, "loss": 2.6694, "step": 6635 }, { "crossentropy": 2.5830368995666504, "epoch": 0.24057424593967516, "grad_norm": 0.03183717653155327, "grad_norm_var": 5.05564428594471e-06, "learning_rate": 0.008801350449499477, "loss": 2.671, "step": 6636 }, { "crossentropy": 2.6378467082977295, "epoch": 0.2406104988399072, "grad_norm": 0.03408784791827202, "grad_norm_var": 4.65464935116669e-06, "learning_rate": 0.008800972945116987, "loss": 2.7047, "step": 6637 }, { "crossentropy": 2.6105079650878906, "epoch": 0.24064675174013922, "grad_norm": 0.03914955258369446, "grad_norm_var": 5.7113992492586654e-06, "learning_rate": 0.008800595389396591, "loss": 2.6415, "step": 6638 }, { "crossentropy": 2.4415838718414307, "epoch": 0.24068300464037123, "grad_norm": 0.03910444676876068, "grad_norm_var": 7.079658896560444e-06, "learning_rate": 0.008800217782343385, "loss": 2.5953, "step": 6639 }, { "crossentropy": 2.686282157897949, "epoch": 0.24071925754060325, "grad_norm": 0.03258568048477173, "grad_norm_var": 6.565860522348791e-06, "learning_rate": 0.008799840123962472, "loss": 2.6697, "step": 6640 }, { "crossentropy": 2.6461963653564453, "epoch": 0.24075551044083526, "grad_norm": 0.03204135224223137, "grad_norm_var": 6.647510144337325e-06, "learning_rate": 0.008799462414258951, "loss": 2.66, "step": 6641 }, { "crossentropy": 2.7667324542999268, "epoch": 0.24079176334106728, "grad_norm": 0.043293289840221405, "grad_norm_var": 1.1535094489319569e-05, "learning_rate": 0.008799084653237923, "loss": 2.7463, "step": 6642 }, { "crossentropy": 2.860924243927002, "epoch": 0.2408280162412993, "grad_norm": 0.043726421892642975, "grad_norm_var": 1.6304868316290484e-05, "learning_rate": 0.008798706840904493, "loss": 2.7194, "step": 6643 }, { "crossentropy": 2.614626407623291, "epoch": 0.24086426914153133, "grad_norm": 0.038553766906261444, "grad_norm_var": 1.6822844541332835e-05, "learning_rate": 0.008798328977263762, "loss": 2.5658, "step": 6644 }, { "crossentropy": 2.53102970123291, "epoch": 0.24090052204176335, "grad_norm": 0.033654723316431046, "grad_norm_var": 1.6926091639634978e-05, "learning_rate": 0.008797951062320834, "loss": 2.6245, "step": 6645 }, { "crossentropy": 2.57585072517395, "epoch": 0.24093677494199536, "grad_norm": 0.03311316668987274, "grad_norm_var": 1.683331968292031e-05, "learning_rate": 0.008797573096080815, "loss": 2.5955, "step": 6646 }, { "crossentropy": 2.8907148838043213, "epoch": 0.24097302784222738, "grad_norm": 0.03440818935632706, "grad_norm_var": 1.649222917960985e-05, "learning_rate": 0.008797195078548805, "loss": 2.8089, "step": 6647 }, { "crossentropy": 2.7032389640808105, "epoch": 0.2410092807424594, "grad_norm": 0.03383716195821762, "grad_norm_var": 1.6432666803233886e-05, "learning_rate": 0.008796817009729917, "loss": 2.6514, "step": 6648 }, { "crossentropy": 2.841489315032959, "epoch": 0.2410455336426914, "grad_norm": 0.0364871509373188, "grad_norm_var": 1.4864501381889096e-05, "learning_rate": 0.00879643888962925, "loss": 2.8463, "step": 6649 }, { "crossentropy": 2.6476917266845703, "epoch": 0.24108178654292342, "grad_norm": 0.04268864169716835, "grad_norm_var": 1.7578522217627455e-05, "learning_rate": 0.008796060718251914, "loss": 2.6177, "step": 6650 }, { "crossentropy": 2.6861627101898193, "epoch": 0.24111803944315546, "grad_norm": 0.031486205756664276, "grad_norm_var": 1.817669528839753e-05, "learning_rate": 0.00879568249560302, "loss": 2.7041, "step": 6651 }, { "crossentropy": 2.6497058868408203, "epoch": 0.24115429234338748, "grad_norm": 0.0320623405277729, "grad_norm_var": 1.804727999164512e-05, "learning_rate": 0.008795304221687674, "loss": 2.6616, "step": 6652 }, { "crossentropy": 2.590791702270508, "epoch": 0.2411905452436195, "grad_norm": 0.03284059092402458, "grad_norm_var": 1.850698565580821e-05, "learning_rate": 0.008794925896510982, "loss": 2.6694, "step": 6653 }, { "crossentropy": 2.89989972114563, "epoch": 0.2412267981438515, "grad_norm": 0.031931404024362564, "grad_norm_var": 1.8914567721908745e-05, "learning_rate": 0.008794547520078059, "loss": 2.8408, "step": 6654 }, { "crossentropy": 2.6712400913238525, "epoch": 0.24126305104408352, "grad_norm": 0.0388110913336277, "grad_norm_var": 1.8788286887438017e-05, "learning_rate": 0.008794169092394014, "loss": 2.6635, "step": 6655 }, { "crossentropy": 2.7066597938537598, "epoch": 0.24129930394431554, "grad_norm": 0.034263890236616135, "grad_norm_var": 1.8262955252665244e-05, "learning_rate": 0.008793790613463954, "loss": 2.6874, "step": 6656 }, { "crossentropy": 2.617318630218506, "epoch": 0.24133555684454758, "grad_norm": 0.03666353598237038, "grad_norm_var": 1.7266436561737207e-05, "learning_rate": 0.008793412083292998, "loss": 2.6855, "step": 6657 }, { "crossentropy": 2.6321098804473877, "epoch": 0.2413718097447796, "grad_norm": 0.03254679590463638, "grad_norm_var": 1.4197205515734054e-05, "learning_rate": 0.008793033501886253, "loss": 2.6694, "step": 6658 }, { "crossentropy": 2.7580442428588867, "epoch": 0.2414080626450116, "grad_norm": 0.030746646225452423, "grad_norm_var": 1.0389874383520622e-05, "learning_rate": 0.008792654869248835, "loss": 2.7488, "step": 6659 }, { "crossentropy": 2.5070526599884033, "epoch": 0.24144431554524362, "grad_norm": 0.030863279476761818, "grad_norm_var": 1.0063905831829525e-05, "learning_rate": 0.008792276185385858, "loss": 2.6694, "step": 6660 }, { "crossentropy": 2.664390802383423, "epoch": 0.24148056844547564, "grad_norm": 0.033810291439294815, "grad_norm_var": 1.0055138945362743e-05, "learning_rate": 0.008791897450302436, "loss": 2.706, "step": 6661 }, { "crossentropy": 2.607840061187744, "epoch": 0.24151682134570765, "grad_norm": 0.03346462547779083, "grad_norm_var": 1.0013802194638979e-05, "learning_rate": 0.008791518664003684, "loss": 2.6015, "step": 6662 }, { "crossentropy": 2.674194097518921, "epoch": 0.24155307424593966, "grad_norm": 0.033931899815797806, "grad_norm_var": 1.0013615580869055e-05, "learning_rate": 0.008791139826494718, "loss": 2.7248, "step": 6663 }, { "crossentropy": 2.56889271736145, "epoch": 0.2415893271461717, "grad_norm": 0.0363556444644928, "grad_norm_var": 1.030424129303691e-05, "learning_rate": 0.008790760937780657, "loss": 2.633, "step": 6664 }, { "crossentropy": 2.634023427963257, "epoch": 0.24162558004640372, "grad_norm": 0.03444433584809303, "grad_norm_var": 9.97195583977369e-06, "learning_rate": 0.008790381997866617, "loss": 2.6793, "step": 6665 }, { "crossentropy": 2.597158432006836, "epoch": 0.24166183294663574, "grad_norm": 0.03494151309132576, "grad_norm_var": 4.936090898900837e-06, "learning_rate": 0.008790003006757714, "loss": 2.6614, "step": 6666 }, { "crossentropy": 2.5279576778411865, "epoch": 0.24169808584686775, "grad_norm": 0.03680244833230972, "grad_norm_var": 5.13487499489416e-06, "learning_rate": 0.00878962396445907, "loss": 2.6997, "step": 6667 }, { "crossentropy": 2.5512425899505615, "epoch": 0.24173433874709976, "grad_norm": 0.03685667738318443, "grad_norm_var": 5.313649493909412e-06, "learning_rate": 0.008789244870975804, "loss": 2.5867, "step": 6668 }, { "crossentropy": 2.769580125808716, "epoch": 0.24177059164733178, "grad_norm": 0.03220550715923309, "grad_norm_var": 5.464949433601447e-06, "learning_rate": 0.008788865726313036, "loss": 2.6932, "step": 6669 }, { "crossentropy": 2.5896317958831787, "epoch": 0.2418068445475638, "grad_norm": 0.03473236411809921, "grad_norm_var": 5.074450755438682e-06, "learning_rate": 0.008788486530475884, "loss": 2.5714, "step": 6670 }, { "crossentropy": 2.668628215789795, "epoch": 0.24184309744779584, "grad_norm": 0.03582844138145447, "grad_norm_var": 3.902094331971574e-06, "learning_rate": 0.008788107283469473, "loss": 2.6559, "step": 6671 }, { "crossentropy": 2.6495089530944824, "epoch": 0.24187935034802785, "grad_norm": 0.0386890172958374, "grad_norm_var": 5.117263737776942e-06, "learning_rate": 0.008787727985298926, "loss": 2.6677, "step": 6672 }, { "crossentropy": 2.569711208343506, "epoch": 0.24191560324825986, "grad_norm": 0.03787797689437866, "grad_norm_var": 5.550837958715711e-06, "learning_rate": 0.008787348635969364, "loss": 2.5723, "step": 6673 }, { "crossentropy": 2.6668944358825684, "epoch": 0.24195185614849188, "grad_norm": 0.03321349620819092, "grad_norm_var": 5.3933384704426955e-06, "learning_rate": 0.00878696923548591, "loss": 2.7172, "step": 6674 }, { "crossentropy": 2.5693249702453613, "epoch": 0.2419881090487239, "grad_norm": 0.033311519771814346, "grad_norm_var": 4.461834676461662e-06, "learning_rate": 0.008786589783853691, "loss": 2.5512, "step": 6675 }, { "crossentropy": 2.6093578338623047, "epoch": 0.2420243619489559, "grad_norm": 0.03869229182600975, "grad_norm_var": 4.148741126385781e-06, "learning_rate": 0.00878621028107783, "loss": 2.6663, "step": 6676 }, { "crossentropy": 2.599972724914551, "epoch": 0.24206061484918792, "grad_norm": 0.038939956575632095, "grad_norm_var": 4.759132849746899e-06, "learning_rate": 0.008785830727163454, "loss": 2.723, "step": 6677 }, { "crossentropy": 2.7663424015045166, "epoch": 0.24209686774941996, "grad_norm": 0.04093613848090172, "grad_norm_var": 6.078019238466246e-06, "learning_rate": 0.008785451122115689, "loss": 2.7354, "step": 6678 }, { "crossentropy": 2.6657416820526123, "epoch": 0.24213312064965198, "grad_norm": 0.03618151322007179, "grad_norm_var": 5.741013445449925e-06, "learning_rate": 0.00878507146593966, "loss": 2.671, "step": 6679 }, { "crossentropy": 2.7995731830596924, "epoch": 0.242169373549884, "grad_norm": 0.03709147498011589, "grad_norm_var": 5.785164492028555e-06, "learning_rate": 0.008784691758640497, "loss": 2.7364, "step": 6680 }, { "crossentropy": 2.5861239433288574, "epoch": 0.242205626450116, "grad_norm": 0.037238214164972305, "grad_norm_var": 5.583045751103982e-06, "learning_rate": 0.00878431200022333, "loss": 2.6291, "step": 6681 }, { "crossentropy": 2.7277719974517822, "epoch": 0.24224187935034802, "grad_norm": 0.03498276323080063, "grad_norm_var": 5.5747390161691815e-06, "learning_rate": 0.008783932190693287, "loss": 2.6947, "step": 6682 }, { "crossentropy": 2.6853528022766113, "epoch": 0.24227813225058004, "grad_norm": 0.03232325240969658, "grad_norm_var": 6.632374115453308e-06, "learning_rate": 0.008783552330055495, "loss": 2.779, "step": 6683 }, { "crossentropy": 2.67983341217041, "epoch": 0.24231438515081208, "grad_norm": 0.03219177573919296, "grad_norm_var": 7.580147208870412e-06, "learning_rate": 0.008783172418315088, "loss": 2.6587, "step": 6684 }, { "crossentropy": 2.6647653579711914, "epoch": 0.2423506380510441, "grad_norm": 0.036512281745672226, "grad_norm_var": 6.616621977845125e-06, "learning_rate": 0.008782792455477197, "loss": 2.7107, "step": 6685 }, { "crossentropy": 2.6407418251037598, "epoch": 0.2423868909512761, "grad_norm": 0.034348487854003906, "grad_norm_var": 6.699487184900985e-06, "learning_rate": 0.008782412441546953, "loss": 2.5916, "step": 6686 }, { "crossentropy": 2.6941256523132324, "epoch": 0.24242314385150812, "grad_norm": 0.034019116312265396, "grad_norm_var": 6.981040445912708e-06, "learning_rate": 0.00878203237652949, "loss": 2.6777, "step": 6687 }, { "crossentropy": 2.4936611652374268, "epoch": 0.24245939675174014, "grad_norm": 0.03450285643339157, "grad_norm_var": 6.594560346036513e-06, "learning_rate": 0.008781652260429939, "loss": 2.5315, "step": 6688 }, { "crossentropy": 2.738046169281006, "epoch": 0.24249564965197215, "grad_norm": 0.032662563025951385, "grad_norm_var": 6.830605177378842e-06, "learning_rate": 0.008781272093253436, "loss": 2.7252, "step": 6689 }, { "crossentropy": 2.6966311931610107, "epoch": 0.24253190255220416, "grad_norm": 0.034055668860673904, "grad_norm_var": 6.624164323192014e-06, "learning_rate": 0.008780891875005114, "loss": 2.7035, "step": 6690 }, { "crossentropy": 2.750936985015869, "epoch": 0.2425681554524362, "grad_norm": 0.03556981682777405, "grad_norm_var": 6.2841338459953175e-06, "learning_rate": 0.00878051160569011, "loss": 2.6829, "step": 6691 }, { "crossentropy": 2.6348063945770264, "epoch": 0.24260440835266822, "grad_norm": 0.03702668473124504, "grad_norm_var": 5.779781850536366e-06, "learning_rate": 0.00878013128531356, "loss": 2.6621, "step": 6692 }, { "crossentropy": 2.6379451751708984, "epoch": 0.24264066125290024, "grad_norm": 0.034154895693063736, "grad_norm_var": 5.0393421676360925e-06, "learning_rate": 0.008779750913880599, "loss": 2.7236, "step": 6693 }, { "crossentropy": 2.6744351387023926, "epoch": 0.24267691415313225, "grad_norm": 0.032565414905548096, "grad_norm_var": 3.0582510291781693e-06, "learning_rate": 0.00877937049139637, "loss": 2.6255, "step": 6694 }, { "crossentropy": 2.747939109802246, "epoch": 0.24271316705336426, "grad_norm": 0.03439195826649666, "grad_norm_var": 2.908289765458869e-06, "learning_rate": 0.008778990017866002, "loss": 2.7502, "step": 6695 }, { "crossentropy": 2.599893808364868, "epoch": 0.24274941995359628, "grad_norm": 0.03349755331873894, "grad_norm_var": 2.522782905439204e-06, "learning_rate": 0.008778609493294643, "loss": 2.6303, "step": 6696 }, { "crossentropy": 2.671309471130371, "epoch": 0.2427856728538283, "grad_norm": 0.03220078721642494, "grad_norm_var": 2.1874824902322226e-06, "learning_rate": 0.008778228917687427, "loss": 2.6526, "step": 6697 }, { "crossentropy": 2.8055078983306885, "epoch": 0.24282192575406034, "grad_norm": 0.03118053637444973, "grad_norm_var": 2.6246868145865837e-06, "learning_rate": 0.008777848291049495, "loss": 2.6698, "step": 6698 }, { "crossentropy": 2.4586093425750732, "epoch": 0.24285817865429235, "grad_norm": 0.02994794212281704, "grad_norm_var": 3.45300577616562e-06, "learning_rate": 0.00877746761338599, "loss": 2.4973, "step": 6699 }, { "crossentropy": 2.6349828243255615, "epoch": 0.24289443155452436, "grad_norm": 0.033637501299381256, "grad_norm_var": 3.297385637206623e-06, "learning_rate": 0.008777086884702051, "loss": 2.6968, "step": 6700 }, { "crossentropy": 2.5673940181732178, "epoch": 0.24293068445475638, "grad_norm": 0.03688662126660347, "grad_norm_var": 3.44315965242614e-06, "learning_rate": 0.008776706105002823, "loss": 2.5558, "step": 6701 }, { "crossentropy": 2.727581024169922, "epoch": 0.2429669373549884, "grad_norm": 0.03721871227025986, "grad_norm_var": 4.171576787984482e-06, "learning_rate": 0.008776325274293447, "loss": 2.6849, "step": 6702 }, { "crossentropy": 2.5293991565704346, "epoch": 0.2430031902552204, "grad_norm": 0.03350827842950821, "grad_norm_var": 4.1845352638509675e-06, "learning_rate": 0.008775944392579067, "loss": 2.6194, "step": 6703 }, { "crossentropy": 2.6702184677124023, "epoch": 0.24303944315545242, "grad_norm": 0.030913295224308968, "grad_norm_var": 4.7194934689736744e-06, "learning_rate": 0.008775563459864829, "loss": 2.7004, "step": 6704 }, { "crossentropy": 2.498870611190796, "epoch": 0.24307569605568446, "grad_norm": 0.030023405328392982, "grad_norm_var": 5.524676381484366e-06, "learning_rate": 0.008775182476155876, "loss": 2.5853, "step": 6705 }, { "crossentropy": 2.6720831394195557, "epoch": 0.24311194895591648, "grad_norm": 0.03255721554160118, "grad_norm_var": 5.563720708742723e-06, "learning_rate": 0.008774801441457354, "loss": 2.6742, "step": 6706 }, { "crossentropy": 2.5040884017944336, "epoch": 0.2431482018561485, "grad_norm": 0.034734196960926056, "grad_norm_var": 5.371741908850811e-06, "learning_rate": 0.008774420355774411, "loss": 2.5404, "step": 6707 }, { "crossentropy": 2.713414430618286, "epoch": 0.2431844547563805, "grad_norm": 0.033724408596754074, "grad_norm_var": 4.457702538068244e-06, "learning_rate": 0.008774039219112193, "loss": 2.6478, "step": 6708 }, { "crossentropy": 2.8390727043151855, "epoch": 0.24322070765661252, "grad_norm": 0.05118832364678383, "grad_norm_var": 2.476812326778991e-05, "learning_rate": 0.008773658031475848, "loss": 2.7863, "step": 6709 }, { "crossentropy": 2.821531057357788, "epoch": 0.24325696055684454, "grad_norm": 0.03878936544060707, "grad_norm_var": 2.5782114596948716e-05, "learning_rate": 0.008773276792870526, "loss": 2.7597, "step": 6710 }, { "crossentropy": 2.782043218612671, "epoch": 0.24329321345707658, "grad_norm": 0.04418356716632843, "grad_norm_var": 3.143744580258219e-05, "learning_rate": 0.008772895503301372, "loss": 2.7681, "step": 6711 }, { "crossentropy": 2.620560646057129, "epoch": 0.2433294663573086, "grad_norm": 0.04299542307853699, "grad_norm_var": 3.484109972611375e-05, "learning_rate": 0.00877251416277354, "loss": 2.603, "step": 6712 }, { "crossentropy": 2.740060806274414, "epoch": 0.2433657192575406, "grad_norm": 0.04340461641550064, "grad_norm_var": 3.72267436343683e-05, "learning_rate": 0.008772132771292181, "loss": 2.7223, "step": 6713 }, { "crossentropy": 2.572435140609741, "epoch": 0.24340197215777262, "grad_norm": 0.03268909454345703, "grad_norm_var": 3.628778383033385e-05, "learning_rate": 0.008771751328862442, "loss": 2.6827, "step": 6714 }, { "crossentropy": 2.740980386734009, "epoch": 0.24343822505800464, "grad_norm": 0.0312766470015049, "grad_norm_var": 3.521076215473358e-05, "learning_rate": 0.008771369835489477, "loss": 2.7712, "step": 6715 }, { "crossentropy": 2.845414638519287, "epoch": 0.24347447795823665, "grad_norm": 0.03393733501434326, "grad_norm_var": 3.509262292727405e-05, "learning_rate": 0.008770988291178441, "loss": 2.7337, "step": 6716 }, { "crossentropy": 2.670316219329834, "epoch": 0.24351073085846867, "grad_norm": 0.0355704128742218, "grad_norm_var": 3.517725653746963e-05, "learning_rate": 0.008770606695934487, "loss": 2.6209, "step": 6717 }, { "crossentropy": 2.6203057765960693, "epoch": 0.2435469837587007, "grad_norm": 0.038612037897109985, "grad_norm_var": 3.540059550037264e-05, "learning_rate": 0.008770225049762763, "loss": 2.6512, "step": 6718 }, { "crossentropy": 2.675083637237549, "epoch": 0.24358323665893272, "grad_norm": 0.03864879906177521, "grad_norm_var": 3.482565981563906e-05, "learning_rate": 0.008769843352668432, "loss": 2.6925, "step": 6719 }, { "crossentropy": 2.900285482406616, "epoch": 0.24361948955916474, "grad_norm": 0.03908592090010643, "grad_norm_var": 3.2282561570467916e-05, "learning_rate": 0.008769461604656643, "loss": 2.7716, "step": 6720 }, { "crossentropy": 2.553251028060913, "epoch": 0.24365574245939675, "grad_norm": 0.03504645824432373, "grad_norm_var": 2.8792653767049463e-05, "learning_rate": 0.008769079805732556, "loss": 2.5304, "step": 6721 }, { "crossentropy": 2.70994234085083, "epoch": 0.24369199535962877, "grad_norm": 0.037676531821489334, "grad_norm_var": 2.6781892915659496e-05, "learning_rate": 0.008768697955901326, "loss": 2.6624, "step": 6722 }, { "crossentropy": 2.6565728187561035, "epoch": 0.24372824825986078, "grad_norm": 0.039181288331747055, "grad_norm_var": 2.594944180754847e-05, "learning_rate": 0.008768316055168111, "loss": 2.627, "step": 6723 }, { "crossentropy": 2.6749801635742188, "epoch": 0.2437645011600928, "grad_norm": 0.0391492024064064, "grad_norm_var": 2.4334040427072498e-05, "learning_rate": 0.008767934103538069, "loss": 2.6477, "step": 6724 }, { "crossentropy": 2.619328737258911, "epoch": 0.24380075406032484, "grad_norm": 0.03826674818992615, "grad_norm_var": 1.3494309677910116e-05, "learning_rate": 0.00876755210101636, "loss": 2.5941, "step": 6725 }, { "crossentropy": 2.7519986629486084, "epoch": 0.24383700696055685, "grad_norm": 0.03213744983077049, "grad_norm_var": 1.5588164662959775e-05, "learning_rate": 0.008767170047608143, "loss": 2.7161, "step": 6726 }, { "crossentropy": 2.7408039569854736, "epoch": 0.24387325986078887, "grad_norm": 0.03891722112894058, "grad_norm_var": 1.2710196669340514e-05, "learning_rate": 0.008766787943318574, "loss": 2.8341, "step": 6727 }, { "crossentropy": 2.696612596511841, "epoch": 0.24390951276102088, "grad_norm": 0.033624179661273956, "grad_norm_var": 1.1066538606395853e-05, "learning_rate": 0.00876640578815282, "loss": 2.7334, "step": 6728 }, { "crossentropy": 2.6067280769348145, "epoch": 0.2439457656612529, "grad_norm": 0.03755732998251915, "grad_norm_var": 7.977452702328154e-06, "learning_rate": 0.00876602358211604, "loss": 2.6828, "step": 6729 }, { "crossentropy": 2.8426573276519775, "epoch": 0.2439820185614849, "grad_norm": 0.03652027249336243, "grad_norm_var": 7.031876174317294e-06, "learning_rate": 0.008765641325213396, "loss": 2.7007, "step": 6730 }, { "crossentropy": 2.6322879791259766, "epoch": 0.24401827146171692, "grad_norm": 0.032799556851387024, "grad_norm_var": 6.100874947765657e-06, "learning_rate": 0.008765259017450051, "loss": 2.6824, "step": 6731 }, { "crossentropy": 2.7461235523223877, "epoch": 0.24405452436194897, "grad_norm": 0.047428905963897705, "grad_norm_var": 1.2560346538118862e-05, "learning_rate": 0.00876487665883117, "loss": 2.7345, "step": 6732 }, { "crossentropy": 2.6332638263702393, "epoch": 0.24409077726218098, "grad_norm": 0.03454840928316116, "grad_norm_var": 1.289045997129954e-05, "learning_rate": 0.008764494249361916, "loss": 2.6037, "step": 6733 }, { "crossentropy": 2.608297824859619, "epoch": 0.244127030162413, "grad_norm": 0.04205632582306862, "grad_norm_var": 1.4165548385210401e-05, "learning_rate": 0.008764111789047454, "loss": 2.6505, "step": 6734 }, { "crossentropy": 2.687762975692749, "epoch": 0.244163283062645, "grad_norm": 0.039921581745147705, "grad_norm_var": 1.4433703068391045e-05, "learning_rate": 0.00876372927789295, "loss": 2.6382, "step": 6735 }, { "crossentropy": 2.716811180114746, "epoch": 0.24419953596287702, "grad_norm": 0.03473743423819542, "grad_norm_var": 1.4837978185113186e-05, "learning_rate": 0.00876334671590357, "loss": 2.6517, "step": 6736 }, { "crossentropy": 2.4677231311798096, "epoch": 0.24423578886310904, "grad_norm": 0.0343983955681324, "grad_norm_var": 1.507390558685099e-05, "learning_rate": 0.008762964103084482, "loss": 2.5439, "step": 6737 }, { "crossentropy": 2.570859909057617, "epoch": 0.24427204176334108, "grad_norm": 0.031641919165849686, "grad_norm_var": 1.7153630193047533e-05, "learning_rate": 0.008762581439440854, "loss": 2.6517, "step": 6738 }, { "crossentropy": 2.548318862915039, "epoch": 0.2443082946635731, "grad_norm": 0.032746851444244385, "grad_norm_var": 1.7917391632287413e-05, "learning_rate": 0.008762198724977853, "loss": 2.6881, "step": 6739 }, { "crossentropy": 2.781623125076294, "epoch": 0.2443445475638051, "grad_norm": 0.035322338342666626, "grad_norm_var": 1.755913419609322e-05, "learning_rate": 0.008761815959700648, "loss": 2.7064, "step": 6740 }, { "crossentropy": 2.517364740371704, "epoch": 0.24438080046403712, "grad_norm": 0.034288108348846436, "grad_norm_var": 1.7565657424174994e-05, "learning_rate": 0.008761433143614411, "loss": 2.6542, "step": 6741 }, { "crossentropy": 2.619393825531006, "epoch": 0.24441705336426914, "grad_norm": 0.032666485756635666, "grad_norm_var": 1.729902634567381e-05, "learning_rate": 0.008761050276724311, "loss": 2.6124, "step": 6742 }, { "crossentropy": 2.7571024894714355, "epoch": 0.24445330626450115, "grad_norm": 0.03303980454802513, "grad_norm_var": 1.732745355295795e-05, "learning_rate": 0.00876066735903552, "loss": 2.8072, "step": 6743 }, { "crossentropy": 2.5869290828704834, "epoch": 0.24448955916473317, "grad_norm": 0.03480073809623718, "grad_norm_var": 1.7067759296698847e-05, "learning_rate": 0.008760284390553208, "loss": 2.579, "step": 6744 }, { "crossentropy": 2.683021306991577, "epoch": 0.2445258120649652, "grad_norm": 0.034353628754615784, "grad_norm_var": 1.7003283131291022e-05, "learning_rate": 0.008759901371282549, "loss": 2.652, "step": 6745 }, { "crossentropy": 2.6080238819122314, "epoch": 0.24456206496519722, "grad_norm": 0.03916275128722191, "grad_norm_var": 1.772715043690572e-05, "learning_rate": 0.008759518301228716, "loss": 2.6611, "step": 6746 }, { "crossentropy": 2.569430112838745, "epoch": 0.24459831786542924, "grad_norm": 0.030688898637890816, "grad_norm_var": 1.886954880668475e-05, "learning_rate": 0.008759135180396884, "loss": 2.6439, "step": 6747 }, { "crossentropy": 2.540950059890747, "epoch": 0.24463457076566125, "grad_norm": 0.04095621407032013, "grad_norm_var": 1.1398189098834803e-05, "learning_rate": 0.008758752008792227, "loss": 2.558, "step": 6748 }, { "crossentropy": 2.6023709774017334, "epoch": 0.24467082366589327, "grad_norm": 0.03598989173769951, "grad_norm_var": 1.1377236933085898e-05, "learning_rate": 0.00875836878641992, "loss": 2.6572, "step": 6749 }, { "crossentropy": 2.732876777648926, "epoch": 0.24470707656612528, "grad_norm": 0.03600148856639862, "grad_norm_var": 8.313561627470396e-06, "learning_rate": 0.008757985513285138, "loss": 2.7891, "step": 6750 }, { "crossentropy": 2.686202049255371, "epoch": 0.2447433294663573, "grad_norm": 0.033245861530303955, "grad_norm_var": 6.758070084492473e-06, "learning_rate": 0.008757602189393058, "loss": 2.7326, "step": 6751 }, { "crossentropy": 2.7203853130340576, "epoch": 0.24477958236658934, "grad_norm": 0.030374089255928993, "grad_norm_var": 7.884065787556957e-06, "learning_rate": 0.00875721881474886, "loss": 2.6758, "step": 6752 }, { "crossentropy": 2.580873727798462, "epoch": 0.24481583526682135, "grad_norm": 0.03152928501367569, "grad_norm_var": 8.381891500100723e-06, "learning_rate": 0.00875683538935772, "loss": 2.584, "step": 6753 }, { "crossentropy": 2.5835154056549072, "epoch": 0.24485208816705337, "grad_norm": 0.032542239874601364, "grad_norm_var": 8.128411887170969e-06, "learning_rate": 0.008756451913224815, "loss": 2.6246, "step": 6754 }, { "crossentropy": 2.7645249366760254, "epoch": 0.24488834106728538, "grad_norm": 0.03138863667845726, "grad_norm_var": 8.512624392056853e-06, "learning_rate": 0.008756068386355328, "loss": 2.7323, "step": 6755 }, { "crossentropy": 2.8173716068267822, "epoch": 0.2449245939675174, "grad_norm": 0.03429460898041725, "grad_norm_var": 8.417568138056885e-06, "learning_rate": 0.008755684808754436, "loss": 2.7427, "step": 6756 }, { "crossentropy": 2.7300305366516113, "epoch": 0.2449608468677494, "grad_norm": 0.029947908595204353, "grad_norm_var": 9.47601611357798e-06, "learning_rate": 0.008755301180427322, "loss": 2.6647, "step": 6757 }, { "crossentropy": 2.7639315128326416, "epoch": 0.24499709976798145, "grad_norm": 0.03217455744743347, "grad_norm_var": 9.56623667389884e-06, "learning_rate": 0.008754917501379167, "loss": 2.6428, "step": 6758 }, { "crossentropy": 2.5202109813690186, "epoch": 0.24503335266821347, "grad_norm": 0.03108137845993042, "grad_norm_var": 9.999406675911685e-06, "learning_rate": 0.008754533771615154, "loss": 2.6059, "step": 6759 }, { "crossentropy": 2.5867702960968018, "epoch": 0.24506960556844548, "grad_norm": 0.03741152212023735, "grad_norm_var": 1.0823120190786443e-05, "learning_rate": 0.008754149991140462, "loss": 2.6556, "step": 6760 }, { "crossentropy": 2.6549549102783203, "epoch": 0.2451058584686775, "grad_norm": 0.031276751309633255, "grad_norm_var": 1.119648598108222e-05, "learning_rate": 0.008753766159960279, "loss": 2.7201, "step": 6761 }, { "crossentropy": 2.644183397293091, "epoch": 0.2451421113689095, "grad_norm": 0.029651034623384476, "grad_norm_var": 9.833135007469836e-06, "learning_rate": 0.008753382278079787, "loss": 2.654, "step": 6762 }, { "crossentropy": 2.704486131668091, "epoch": 0.24517836426914152, "grad_norm": 0.030991198495030403, "grad_norm_var": 9.744297292523642e-06, "learning_rate": 0.008752998345504172, "loss": 2.6854, "step": 6763 }, { "crossentropy": 2.668344020843506, "epoch": 0.24521461716937354, "grad_norm": 0.03125074505805969, "grad_norm_var": 5.4050030630164855e-06, "learning_rate": 0.008752614362238618, "loss": 2.6821, "step": 6764 }, { "crossentropy": 2.833876848220825, "epoch": 0.24525087006960558, "grad_norm": 0.031637825071811676, "grad_norm_var": 4.532900763028651e-06, "learning_rate": 0.008752230328288313, "loss": 2.7321, "step": 6765 }, { "crossentropy": 2.6096351146698, "epoch": 0.2452871229698376, "grad_norm": 0.04928938299417496, "grad_norm_var": 2.234796910576876e-05, "learning_rate": 0.008751846243658443, "loss": 2.5638, "step": 6766 }, { "crossentropy": 2.713125467300415, "epoch": 0.2453233758700696, "grad_norm": 0.03062506765127182, "grad_norm_var": 2.2693241120709283e-05, "learning_rate": 0.008751462108354195, "loss": 2.7586, "step": 6767 }, { "crossentropy": 2.4933810234069824, "epoch": 0.24535962877030162, "grad_norm": 0.032736074179410934, "grad_norm_var": 2.2264818063421714e-05, "learning_rate": 0.00875107792238076, "loss": 2.6614, "step": 6768 }, { "crossentropy": 2.6675260066986084, "epoch": 0.24539588167053364, "grad_norm": 0.03500475361943245, "grad_norm_var": 2.2343200255023287e-05, "learning_rate": 0.008750693685743324, "loss": 2.6937, "step": 6769 }, { "crossentropy": 2.79329252243042, "epoch": 0.24543213457076565, "grad_norm": 0.03252873942255974, "grad_norm_var": 2.234440731925241e-05, "learning_rate": 0.008750309398447077, "loss": 2.7278, "step": 6770 }, { "crossentropy": 2.638746738433838, "epoch": 0.24546838747099767, "grad_norm": 0.034894585609436035, "grad_norm_var": 2.2263262690186877e-05, "learning_rate": 0.00874992506049721, "loss": 2.7591, "step": 6771 }, { "crossentropy": 2.6019179821014404, "epoch": 0.2455046403712297, "grad_norm": 0.03316076099872589, "grad_norm_var": 2.221210960916675e-05, "learning_rate": 0.008749540671898917, "loss": 2.5722, "step": 6772 }, { "crossentropy": 2.6008665561676025, "epoch": 0.24554089327146172, "grad_norm": 0.03592998906970024, "grad_norm_var": 2.173204044399016e-05, "learning_rate": 0.008749156232657386, "loss": 2.5816, "step": 6773 }, { "crossentropy": 2.746506690979004, "epoch": 0.24557714617169374, "grad_norm": 0.04841138422489166, "grad_norm_var": 3.484662743454627e-05, "learning_rate": 0.008748771742777809, "loss": 2.6929, "step": 6774 }, { "crossentropy": 2.6833841800689697, "epoch": 0.24561339907192575, "grad_norm": 0.03305741026997566, "grad_norm_var": 3.412605263167858e-05, "learning_rate": 0.008748387202265381, "loss": 2.7364, "step": 6775 }, { "crossentropy": 2.7830514907836914, "epoch": 0.24564965197215777, "grad_norm": 0.05188468471169472, "grad_norm_var": 5.213016620145629e-05, "learning_rate": 0.008748002611125296, "loss": 2.7146, "step": 6776 }, { "crossentropy": 2.6778006553649902, "epoch": 0.24568590487238978, "grad_norm": 0.03795037791132927, "grad_norm_var": 5.0915000745701735e-05, "learning_rate": 0.008747617969362749, "loss": 2.6926, "step": 6777 }, { "crossentropy": 2.7181224822998047, "epoch": 0.2457221577726218, "grad_norm": 0.03727778047323227, "grad_norm_var": 4.7903270878348735e-05, "learning_rate": 0.008747233276982933, "loss": 2.6561, "step": 6778 }, { "crossentropy": 2.6386730670928955, "epoch": 0.24575841067285384, "grad_norm": 0.0349668450653553, "grad_norm_var": 4.588383343179514e-05, "learning_rate": 0.008746848533991046, "loss": 2.5014, "step": 6779 }, { "crossentropy": 2.7379071712493896, "epoch": 0.24579466357308585, "grad_norm": 0.03452076390385628, "grad_norm_var": 4.408343343011891e-05, "learning_rate": 0.008746463740392283, "loss": 2.6434, "step": 6780 }, { "crossentropy": 2.7846333980560303, "epoch": 0.24583091647331787, "grad_norm": 0.03319011628627777, "grad_norm_var": 4.309994000752607e-05, "learning_rate": 0.008746078896191842, "loss": 2.6636, "step": 6781 }, { "crossentropy": 2.624321460723877, "epoch": 0.24586716937354988, "grad_norm": 0.03482706844806671, "grad_norm_var": 3.288785235036288e-05, "learning_rate": 0.008745694001394919, "loss": 2.5917, "step": 6782 }, { "crossentropy": 2.6827340126037598, "epoch": 0.2459034222737819, "grad_norm": 0.03269316628575325, "grad_norm_var": 3.1587456407253246e-05, "learning_rate": 0.008745309056006716, "loss": 2.6303, "step": 6783 }, { "crossentropy": 2.6788201332092285, "epoch": 0.2459396751740139, "grad_norm": 0.030873306095600128, "grad_norm_var": 3.272418073666094e-05, "learning_rate": 0.008744924060032428, "loss": 2.5949, "step": 6784 }, { "crossentropy": 2.7688443660736084, "epoch": 0.24597592807424595, "grad_norm": 0.031961873173713684, "grad_norm_var": 3.383780595275753e-05, "learning_rate": 0.00874453901347726, "loss": 2.7659, "step": 6785 }, { "crossentropy": 2.604145050048828, "epoch": 0.24601218097447797, "grad_norm": 0.030773090198636055, "grad_norm_var": 3.4874171400893016e-05, "learning_rate": 0.00874415391634641, "loss": 2.6425, "step": 6786 }, { "crossentropy": 2.6849942207336426, "epoch": 0.24604843387470998, "grad_norm": 0.03337901458144188, "grad_norm_var": 3.524582245243006e-05, "learning_rate": 0.008743768768645079, "loss": 2.6617, "step": 6787 }, { "crossentropy": 2.6807405948638916, "epoch": 0.246084686774942, "grad_norm": 0.03587065264582634, "grad_norm_var": 3.470471877745269e-05, "learning_rate": 0.008743383570378469, "loss": 2.6142, "step": 6788 }, { "crossentropy": 2.8605966567993164, "epoch": 0.246120939675174, "grad_norm": 0.033345822244882584, "grad_norm_var": 3.5179967501953766e-05, "learning_rate": 0.008742998321551784, "loss": 2.7441, "step": 6789 }, { "crossentropy": 2.6165242195129395, "epoch": 0.24615719257540603, "grad_norm": 0.033855266869068146, "grad_norm_var": 2.421097347333558e-05, "learning_rate": 0.008742613022170225, "loss": 2.6497, "step": 6790 }, { "crossentropy": 2.6638946533203125, "epoch": 0.24619344547563804, "grad_norm": 0.03947925940155983, "grad_norm_var": 2.5102282980209267e-05, "learning_rate": 0.008742227672238999, "loss": 2.7263, "step": 6791 }, { "crossentropy": 2.6326522827148438, "epoch": 0.24622969837587008, "grad_norm": 0.043049801141023636, "grad_norm_var": 1.0595091968399712e-05, "learning_rate": 0.008741842271763308, "loss": 2.7213, "step": 6792 }, { "crossentropy": 2.6436171531677246, "epoch": 0.2462659512761021, "grad_norm": 0.034436583518981934, "grad_norm_var": 9.926347090700412e-06, "learning_rate": 0.00874145682074836, "loss": 2.6116, "step": 6793 }, { "crossentropy": 2.4877126216888428, "epoch": 0.2463022041763341, "grad_norm": 0.03259735926985741, "grad_norm_var": 9.659527268573058e-06, "learning_rate": 0.00874107131919936, "loss": 2.4706, "step": 6794 }, { "crossentropy": 2.568739652633667, "epoch": 0.24633845707656613, "grad_norm": 0.03174157813191414, "grad_norm_var": 1.0050321307976843e-05, "learning_rate": 0.008740685767121512, "loss": 2.573, "step": 6795 }, { "crossentropy": 2.6061227321624756, "epoch": 0.24637470997679814, "grad_norm": 0.030903156846761703, "grad_norm_var": 1.069529701378465e-05, "learning_rate": 0.008740300164520027, "loss": 2.6874, "step": 6796 }, { "crossentropy": 2.761392831802368, "epoch": 0.24641096287703015, "grad_norm": 0.03692803159356117, "grad_norm_var": 1.1196772935626838e-05, "learning_rate": 0.008739914511400113, "loss": 2.8306, "step": 6797 }, { "crossentropy": 2.581606864929199, "epoch": 0.24644721577726217, "grad_norm": 0.03497563302516937, "grad_norm_var": 1.1211174165536815e-05, "learning_rate": 0.008739528807766978, "loss": 2.5878, "step": 6798 }, { "crossentropy": 2.567629098892212, "epoch": 0.2464834686774942, "grad_norm": 0.03383157029747963, "grad_norm_var": 1.1066645196820462e-05, "learning_rate": 0.00873914305362583, "loss": 2.5924, "step": 6799 }, { "crossentropy": 2.670640468597412, "epoch": 0.24651972157772623, "grad_norm": 0.03483006730675697, "grad_norm_var": 1.0263640542685136e-05, "learning_rate": 0.008738757248981882, "loss": 2.6572, "step": 6800 }, { "crossentropy": 2.7419490814208984, "epoch": 0.24655597447795824, "grad_norm": 0.0386979803442955, "grad_norm_var": 1.0822289167433417e-05, "learning_rate": 0.008738371393840344, "loss": 2.7374, "step": 6801 }, { "crossentropy": 2.7052712440490723, "epoch": 0.24659222737819025, "grad_norm": 0.03933723270893097, "grad_norm_var": 1.0672819515649261e-05, "learning_rate": 0.008737985488206427, "loss": 2.7864, "step": 6802 }, { "crossentropy": 2.7284278869628906, "epoch": 0.24662848027842227, "grad_norm": 0.037810828536748886, "grad_norm_var": 1.067443816332349e-05, "learning_rate": 0.008737599532085343, "loss": 2.6371, "step": 6803 }, { "crossentropy": 2.7437679767608643, "epoch": 0.24666473317865428, "grad_norm": 0.034306082874536514, "grad_norm_var": 1.079823024617791e-05, "learning_rate": 0.008737213525482304, "loss": 2.6586, "step": 6804 }, { "crossentropy": 2.6251144409179688, "epoch": 0.2467009860788863, "grad_norm": 0.035836074501276016, "grad_norm_var": 1.0426431303752793e-05, "learning_rate": 0.008736827468402528, "loss": 2.6616, "step": 6805 }, { "crossentropy": 2.626746416091919, "epoch": 0.24673723897911834, "grad_norm": 0.03405009210109711, "grad_norm_var": 1.0378583773642107e-05, "learning_rate": 0.008736441360851223, "loss": 2.621, "step": 6806 }, { "crossentropy": 2.6150529384613037, "epoch": 0.24677349187935035, "grad_norm": 0.04067869484424591, "grad_norm_var": 1.1056790397819215e-05, "learning_rate": 0.008736055202833609, "loss": 2.5939, "step": 6807 }, { "crossentropy": 2.642989158630371, "epoch": 0.24680974477958237, "grad_norm": 0.042319316416978836, "grad_norm_var": 1.0391395419030956e-05, "learning_rate": 0.008735668994354899, "loss": 2.6356, "step": 6808 }, { "crossentropy": 2.685471773147583, "epoch": 0.24684599767981438, "grad_norm": 0.03045026771724224, "grad_norm_var": 1.2125187486798046e-05, "learning_rate": 0.008735282735420312, "loss": 2.6687, "step": 6809 }, { "crossentropy": 2.6190359592437744, "epoch": 0.2468822505800464, "grad_norm": 0.03131750226020813, "grad_norm_var": 1.2736694055255728e-05, "learning_rate": 0.008734896426035062, "loss": 2.6293, "step": 6810 }, { "crossentropy": 2.700784921646118, "epoch": 0.2469185034802784, "grad_norm": 0.03128460794687271, "grad_norm_var": 1.2978797376369148e-05, "learning_rate": 0.008734510066204366, "loss": 2.6944, "step": 6811 }, { "crossentropy": 2.659958839416504, "epoch": 0.24695475638051045, "grad_norm": 0.03259637951850891, "grad_norm_var": 1.2126436706640546e-05, "learning_rate": 0.008734123655933446, "loss": 2.6606, "step": 6812 }, { "crossentropy": 2.493229627609253, "epoch": 0.24699100928074247, "grad_norm": 0.03030364215373993, "grad_norm_var": 1.3676807557779165e-05, "learning_rate": 0.00873373719522752, "loss": 2.5707, "step": 6813 }, { "crossentropy": 2.494828701019287, "epoch": 0.24702726218097448, "grad_norm": 0.03127329424023628, "grad_norm_var": 1.4626561894105063e-05, "learning_rate": 0.008733350684091805, "loss": 2.6426, "step": 6814 }, { "crossentropy": 2.6615986824035645, "epoch": 0.2470635150812065, "grad_norm": 0.03152323514223099, "grad_norm_var": 1.5298499457828603e-05, "learning_rate": 0.008732964122531526, "loss": 2.667, "step": 6815 }, { "crossentropy": 2.7998266220092773, "epoch": 0.2470997679814385, "grad_norm": 0.03575960174202919, "grad_norm_var": 1.5357658798231912e-05, "learning_rate": 0.0087325775105519, "loss": 2.7123, "step": 6816 }, { "crossentropy": 2.7343060970306396, "epoch": 0.24713602088167053, "grad_norm": 0.03460215404629707, "grad_norm_var": 1.4302841700847034e-05, "learning_rate": 0.00873219084815815, "loss": 2.7356, "step": 6817 }, { "crossentropy": 2.773763418197632, "epoch": 0.24717227378190254, "grad_norm": 0.03583661466836929, "grad_norm_var": 1.2853233398946275e-05, "learning_rate": 0.008731804135355499, "loss": 2.7339, "step": 6818 }, { "crossentropy": 2.6460866928100586, "epoch": 0.24720852668213458, "grad_norm": 0.04098692536354065, "grad_norm_var": 1.4940077075268005e-05, "learning_rate": 0.00873141737214917, "loss": 2.6793, "step": 6819 }, { "crossentropy": 2.761016845703125, "epoch": 0.2472447795823666, "grad_norm": 0.04282527416944504, "grad_norm_var": 1.9176016005250267e-05, "learning_rate": 0.008731030558544389, "loss": 2.6986, "step": 6820 }, { "crossentropy": 2.6928460597991943, "epoch": 0.2472810324825986, "grad_norm": 0.03961481526494026, "grad_norm_var": 2.043792871637254e-05, "learning_rate": 0.008730643694546375, "loss": 2.6611, "step": 6821 }, { "crossentropy": 2.8450701236724854, "epoch": 0.24731728538283063, "grad_norm": 0.03455304726958275, "grad_norm_var": 2.036731054060833e-05, "learning_rate": 0.008730256780160357, "loss": 2.8477, "step": 6822 }, { "crossentropy": 2.5157158374786377, "epoch": 0.24735353828306264, "grad_norm": 0.0330471470952034, "grad_norm_var": 1.86058770350218e-05, "learning_rate": 0.008729869815391563, "loss": 2.6182, "step": 6823 }, { "crossentropy": 2.6041901111602783, "epoch": 0.24738979118329466, "grad_norm": 0.034156542271375656, "grad_norm_var": 1.46881242274963e-05, "learning_rate": 0.008729482800245214, "loss": 2.6654, "step": 6824 }, { "crossentropy": 2.6565918922424316, "epoch": 0.24742604408352667, "grad_norm": 0.03128401190042496, "grad_norm_var": 1.4294362927006616e-05, "learning_rate": 0.008729095734726541, "loss": 2.6239, "step": 6825 }, { "crossentropy": 2.6276938915252686, "epoch": 0.2474622969837587, "grad_norm": 0.03525233641266823, "grad_norm_var": 1.362630997358092e-05, "learning_rate": 0.00872870861884077, "loss": 2.7494, "step": 6826 }, { "crossentropy": 2.481673002243042, "epoch": 0.24749854988399073, "grad_norm": 0.031141575425863266, "grad_norm_var": 1.3692365545606155e-05, "learning_rate": 0.008728321452593132, "loss": 2.5782, "step": 6827 }, { "crossentropy": 2.688821315765381, "epoch": 0.24753480278422274, "grad_norm": 0.029724784195423126, "grad_norm_var": 1.5002566550929158e-05, "learning_rate": 0.008727934235988855, "loss": 2.6597, "step": 6828 }, { "crossentropy": 2.6979033946990967, "epoch": 0.24757105568445475, "grad_norm": 0.03149816021323204, "grad_norm_var": 1.4424540840144752e-05, "learning_rate": 0.008727546969033167, "loss": 2.7045, "step": 6829 }, { "crossentropy": 2.5956878662109375, "epoch": 0.24760730858468677, "grad_norm": 0.033676113933324814, "grad_norm_var": 1.3730012538177144e-05, "learning_rate": 0.008727159651731303, "loss": 2.6412, "step": 6830 }, { "crossentropy": 2.6179747581481934, "epoch": 0.24764356148491878, "grad_norm": 0.03265248239040375, "grad_norm_var": 1.3328741845336905e-05, "learning_rate": 0.00872677228408849, "loss": 2.5996, "step": 6831 }, { "crossentropy": 2.708253860473633, "epoch": 0.2476798143851508, "grad_norm": 0.03213511407375336, "grad_norm_var": 1.3680365958979035e-05, "learning_rate": 0.008726384866109962, "loss": 2.7493, "step": 6832 }, { "crossentropy": 2.755765914916992, "epoch": 0.24771606728538284, "grad_norm": 0.0317925401031971, "grad_norm_var": 1.4158579543543937e-05, "learning_rate": 0.008725997397800951, "loss": 2.74, "step": 6833 }, { "crossentropy": 2.7429800033569336, "epoch": 0.24775232018561485, "grad_norm": 0.03147072717547417, "grad_norm_var": 1.4505515020686067e-05, "learning_rate": 0.008725609879166691, "loss": 2.7304, "step": 6834 }, { "crossentropy": 2.6053953170776367, "epoch": 0.24778857308584687, "grad_norm": 0.03408859297633171, "grad_norm_var": 1.1157425742455706e-05, "learning_rate": 0.008725222310212418, "loss": 2.5538, "step": 6835 }, { "crossentropy": 2.6185142993927, "epoch": 0.24782482598607888, "grad_norm": 0.043129369616508484, "grad_norm_var": 1.1533925904159275e-05, "learning_rate": 0.008724834690943362, "loss": 2.6591, "step": 6836 }, { "crossentropy": 2.594292640686035, "epoch": 0.2478610788863109, "grad_norm": 0.0329965204000473, "grad_norm_var": 9.053032018122887e-06, "learning_rate": 0.00872444702136476, "loss": 2.7186, "step": 6837 }, { "crossentropy": 2.62568736076355, "epoch": 0.2478973317865429, "grad_norm": 0.03240656480193138, "grad_norm_var": 8.978780317890101e-06, "learning_rate": 0.008724059301481851, "loss": 2.6503, "step": 6838 }, { "crossentropy": 2.520505666732788, "epoch": 0.24793358468677495, "grad_norm": 0.03303779661655426, "grad_norm_var": 8.978918109508494e-06, "learning_rate": 0.00872367153129987, "loss": 2.5987, "step": 6839 }, { "crossentropy": 2.7628860473632812, "epoch": 0.24796983758700697, "grad_norm": 0.03307041525840759, "grad_norm_var": 8.907274535718823e-06, "learning_rate": 0.008723283710824052, "loss": 2.664, "step": 6840 }, { "crossentropy": 2.5918571949005127, "epoch": 0.24800609048723898, "grad_norm": 0.03314606100320816, "grad_norm_var": 8.676884025493602e-06, "learning_rate": 0.00872289584005964, "loss": 2.5727, "step": 6841 }, { "crossentropy": 2.620969772338867, "epoch": 0.248042343387471, "grad_norm": 0.03448110446333885, "grad_norm_var": 8.503138395815392e-06, "learning_rate": 0.008722507919011866, "loss": 2.6645, "step": 6842 }, { "crossentropy": 2.683002233505249, "epoch": 0.248078596287703, "grad_norm": 0.029143663123250008, "grad_norm_var": 9.288435442995656e-06, "learning_rate": 0.008722119947685975, "loss": 2.6946, "step": 6843 }, { "crossentropy": 2.5275747776031494, "epoch": 0.24811484918793503, "grad_norm": 0.03157070279121399, "grad_norm_var": 8.68837232101068e-06, "learning_rate": 0.008721731926087207, "loss": 2.677, "step": 6844 }, { "crossentropy": 2.730020761489868, "epoch": 0.24815110208816704, "grad_norm": 0.031069930642843246, "grad_norm_var": 8.79377777274191e-06, "learning_rate": 0.008721343854220801, "loss": 2.698, "step": 6845 }, { "crossentropy": 2.6768460273742676, "epoch": 0.24818735498839908, "grad_norm": 0.029264092445373535, "grad_norm_var": 9.681330811029407e-06, "learning_rate": 0.008720955732091999, "loss": 2.6697, "step": 6846 }, { "crossentropy": 2.752633571624756, "epoch": 0.2482236078886311, "grad_norm": 0.03164398670196533, "grad_norm_var": 9.77024380220252e-06, "learning_rate": 0.008720567559706042, "loss": 2.6748, "step": 6847 }, { "crossentropy": 2.5253958702087402, "epoch": 0.2482598607888631, "grad_norm": 0.035481031984090805, "grad_norm_var": 1.0183158716579283e-05, "learning_rate": 0.008720179337068175, "loss": 2.6065, "step": 6848 }, { "crossentropy": 2.7897143363952637, "epoch": 0.24829611368909513, "grad_norm": 0.03896036744117737, "grad_norm_var": 1.2252644661669267e-05, "learning_rate": 0.00871979106418364, "loss": 2.7024, "step": 6849 }, { "crossentropy": 2.6812760829925537, "epoch": 0.24833236658932714, "grad_norm": 0.040737394243478775, "grad_norm_var": 1.5192549645802105e-05, "learning_rate": 0.008719402741057683, "loss": 2.715, "step": 6850 }, { "crossentropy": 2.7124927043914795, "epoch": 0.24836861948955916, "grad_norm": 0.042342547327280045, "grad_norm_var": 1.9532379159402817e-05, "learning_rate": 0.008719014367695547, "loss": 2.7017, "step": 6851 }, { "crossentropy": 2.598188877105713, "epoch": 0.24840487238979117, "grad_norm": 0.02956537716090679, "grad_norm_var": 1.5479184493832435e-05, "learning_rate": 0.008718625944102476, "loss": 2.6544, "step": 6852 }, { "crossentropy": 2.64119291305542, "epoch": 0.2484411252900232, "grad_norm": 0.03257455304265022, "grad_norm_var": 1.552889922922697e-05, "learning_rate": 0.00871823747028372, "loss": 2.6239, "step": 6853 }, { "crossentropy": 2.6797850131988525, "epoch": 0.24847737819025523, "grad_norm": 0.033239830285310745, "grad_norm_var": 1.5433482963289556e-05, "learning_rate": 0.008717848946244524, "loss": 2.7165, "step": 6854 }, { "crossentropy": 2.615417957305908, "epoch": 0.24851363109048724, "grad_norm": 0.03266226500272751, "grad_norm_var": 1.5475857311469886e-05, "learning_rate": 0.008717460371990136, "loss": 2.6206, "step": 6855 }, { "crossentropy": 2.7569310665130615, "epoch": 0.24854988399071926, "grad_norm": 0.031567808240652084, "grad_norm_var": 1.5740018522917194e-05, "learning_rate": 0.008717071747525805, "loss": 2.7438, "step": 6856 }, { "crossentropy": 2.683966636657715, "epoch": 0.24858613689095127, "grad_norm": 0.03580264747142792, "grad_norm_var": 1.6023623710037748e-05, "learning_rate": 0.008716683072856777, "loss": 2.6705, "step": 6857 }, { "crossentropy": 2.5359036922454834, "epoch": 0.24862238979118328, "grad_norm": 0.04511614888906479, "grad_norm_var": 2.411983507871005e-05, "learning_rate": 0.008716294347988308, "loss": 2.5515, "step": 6858 }, { "crossentropy": 2.7762224674224854, "epoch": 0.24865864269141533, "grad_norm": 0.039932023733854294, "grad_norm_var": 2.380238466910104e-05, "learning_rate": 0.00871590557292564, "loss": 2.8034, "step": 6859 }, { "crossentropy": 2.7166903018951416, "epoch": 0.24869489559164734, "grad_norm": 0.03305599093437195, "grad_norm_var": 2.3242185945905356e-05, "learning_rate": 0.00871551674767403, "loss": 2.6904, "step": 6860 }, { "crossentropy": 2.530388116836548, "epoch": 0.24873114849187936, "grad_norm": 0.03203834220767021, "grad_norm_var": 2.2769003765635734e-05, "learning_rate": 0.008715127872238729, "loss": 2.5472, "step": 6861 }, { "crossentropy": 2.612661361694336, "epoch": 0.24876740139211137, "grad_norm": 0.03574571758508682, "grad_norm_var": 2.022244133565245e-05, "learning_rate": 0.008714738946624985, "loss": 2.6865, "step": 6862 }, { "crossentropy": 2.571274518966675, "epoch": 0.24880365429234338, "grad_norm": 0.03432735800743103, "grad_norm_var": 1.92377120401593e-05, "learning_rate": 0.008714349970838055, "loss": 2.6439, "step": 6863 }, { "crossentropy": 2.7115824222564697, "epoch": 0.2488399071925754, "grad_norm": 0.030847005546092987, "grad_norm_var": 2.0790423311238987e-05, "learning_rate": 0.008713960944883194, "loss": 2.7254, "step": 6864 }, { "crossentropy": 2.668766498565674, "epoch": 0.2488761600928074, "grad_norm": 0.03182244300842285, "grad_norm_var": 2.071213998718854e-05, "learning_rate": 0.008713571868765653, "loss": 2.6522, "step": 6865 }, { "crossentropy": 2.660646438598633, "epoch": 0.24891241299303946, "grad_norm": 0.03140177205204964, "grad_norm_var": 1.912479723968004e-05, "learning_rate": 0.008713182742490689, "loss": 2.6094, "step": 6866 }, { "crossentropy": 2.6447582244873047, "epoch": 0.24894866589327147, "grad_norm": 0.03148522973060608, "grad_norm_var": 1.514296243838562e-05, "learning_rate": 0.008712793566063557, "loss": 2.6354, "step": 6867 }, { "crossentropy": 2.708798408508301, "epoch": 0.24898491879350348, "grad_norm": 0.03362656012177467, "grad_norm_var": 1.3867764318920411e-05, "learning_rate": 0.008712404339489513, "loss": 2.7179, "step": 6868 }, { "crossentropy": 2.735558271408081, "epoch": 0.2490211716937355, "grad_norm": 0.031925346702337265, "grad_norm_var": 1.4024233297520577e-05, "learning_rate": 0.008712015062773816, "loss": 2.7072, "step": 6869 }, { "crossentropy": 2.6165874004364014, "epoch": 0.2490574245939675, "grad_norm": 0.0323026143014431, "grad_norm_var": 1.4178782750248696e-05, "learning_rate": 0.00871162573592172, "loss": 2.6093, "step": 6870 }, { "crossentropy": 2.7165660858154297, "epoch": 0.24909367749419953, "grad_norm": 0.03623293340206146, "grad_norm_var": 1.434889473402584e-05, "learning_rate": 0.008711236358938487, "loss": 2.6929, "step": 6871 }, { "crossentropy": 2.702568531036377, "epoch": 0.24912993039443154, "grad_norm": 0.037706341594457626, "grad_norm_var": 1.4548089633374152e-05, "learning_rate": 0.008710846931829375, "loss": 2.6634, "step": 6872 }, { "crossentropy": 2.6830077171325684, "epoch": 0.24916618329466358, "grad_norm": 0.03231865167617798, "grad_norm_var": 1.4741337711702124e-05, "learning_rate": 0.008710457454599644, "loss": 2.6634, "step": 6873 }, { "crossentropy": 2.767086982727051, "epoch": 0.2492024361948956, "grad_norm": 0.031247401610016823, "grad_norm_var": 6.887200524970553e-06, "learning_rate": 0.008710067927254555, "loss": 2.6613, "step": 6874 }, { "crossentropy": 2.6778786182403564, "epoch": 0.2492386890951276, "grad_norm": 0.036119651049375534, "grad_norm_var": 4.526584048143235e-06, "learning_rate": 0.008709678349799367, "loss": 2.7099, "step": 6875 }, { "crossentropy": 2.5768563747406006, "epoch": 0.24927494199535963, "grad_norm": 0.03268294781446457, "grad_norm_var": 4.545563636453595e-06, "learning_rate": 0.008709288722239344, "loss": 2.576, "step": 6876 }, { "crossentropy": 2.726729393005371, "epoch": 0.24931119489559164, "grad_norm": 0.0332513190805912, "grad_norm_var": 4.443274160603504e-06, "learning_rate": 0.008708899044579747, "loss": 2.7336, "step": 6877 }, { "crossentropy": 2.7777159214019775, "epoch": 0.24934744779582366, "grad_norm": 0.03798175975680351, "grad_norm_var": 5.480397185428582e-06, "learning_rate": 0.008708509316825841, "loss": 2.6171, "step": 6878 }, { "crossentropy": 2.7224996089935303, "epoch": 0.24938370069605567, "grad_norm": 0.044169649481773376, "grad_norm_var": 1.2679670769716308e-05, "learning_rate": 0.008708119538982888, "loss": 2.7058, "step": 6879 }, { "crossentropy": 2.672750473022461, "epoch": 0.2494199535962877, "grad_norm": 0.038760118186473846, "grad_norm_var": 1.319262554411355e-05, "learning_rate": 0.008707729711056154, "loss": 2.6148, "step": 6880 }, { "crossentropy": 2.599780797958374, "epoch": 0.24945620649651973, "grad_norm": 0.03245869651436806, "grad_norm_var": 1.298529305976483e-05, "learning_rate": 0.008707339833050902, "loss": 2.5893, "step": 6881 }, { "crossentropy": 2.711665391921997, "epoch": 0.24949245939675174, "grad_norm": 0.03199778497219086, "grad_norm_var": 1.2752984391939378e-05, "learning_rate": 0.008706949904972401, "loss": 2.8225, "step": 6882 }, { "crossentropy": 2.5725626945495605, "epoch": 0.24952871229698376, "grad_norm": 0.033788710832595825, "grad_norm_var": 1.2115165408260727e-05, "learning_rate": 0.008706559926825915, "loss": 2.7075, "step": 6883 }, { "crossentropy": 2.678271770477295, "epoch": 0.24956496519721577, "grad_norm": 0.03595690801739693, "grad_norm_var": 1.2094426944566486e-05, "learning_rate": 0.008706169898616714, "loss": 2.7301, "step": 6884 }, { "crossentropy": 2.617400646209717, "epoch": 0.24960121809744779, "grad_norm": 0.03477145731449127, "grad_norm_var": 1.1459994304984288e-05, "learning_rate": 0.008705779820350062, "loss": 2.6995, "step": 6885 }, { "crossentropy": 2.7033603191375732, "epoch": 0.24963747099767983, "grad_norm": 0.031419672071933746, "grad_norm_var": 1.1839123690670948e-05, "learning_rate": 0.008705389692031232, "loss": 2.6129, "step": 6886 }, { "crossentropy": 2.733513593673706, "epoch": 0.24967372389791184, "grad_norm": 0.033356182277202606, "grad_norm_var": 1.1904155104163723e-05, "learning_rate": 0.00870499951366549, "loss": 2.6757, "step": 6887 }, { "crossentropy": 2.6410465240478516, "epoch": 0.24970997679814386, "grad_norm": 0.033033519983291626, "grad_norm_var": 1.1504315342116438e-05, "learning_rate": 0.008704609285258107, "loss": 2.6888, "step": 6888 }, { "crossentropy": 2.678687572479248, "epoch": 0.24974622969837587, "grad_norm": 0.030826205387711525, "grad_norm_var": 1.2093947924736216e-05, "learning_rate": 0.008704219006814354, "loss": 2.6347, "step": 6889 }, { "crossentropy": 2.740276336669922, "epoch": 0.24978248259860789, "grad_norm": 0.0315077044069767, "grad_norm_var": 1.1985680860022982e-05, "learning_rate": 0.008703828678339501, "loss": 2.7189, "step": 6890 }, { "crossentropy": 2.5811779499053955, "epoch": 0.2498187354988399, "grad_norm": 0.031178634613752365, "grad_norm_var": 1.2447892097507578e-05, "learning_rate": 0.008703438299838821, "loss": 2.6251, "step": 6891 }, { "crossentropy": 2.5909829139709473, "epoch": 0.24985498839907191, "grad_norm": 0.03065808303654194, "grad_norm_var": 1.3112732680741328e-05, "learning_rate": 0.00870304787131759, "loss": 2.5584, "step": 6892 }, { "crossentropy": 2.641166925430298, "epoch": 0.24989124129930396, "grad_norm": 0.03497723489999771, "grad_norm_var": 1.3110561881367332e-05, "learning_rate": 0.008702657392781074, "loss": 2.6564, "step": 6893 }, { "crossentropy": 2.683781385421753, "epoch": 0.24992749419953597, "grad_norm": 0.032342277467250824, "grad_norm_var": 1.2237865288204583e-05, "learning_rate": 0.008702266864234552, "loss": 2.6248, "step": 6894 }, { "crossentropy": 2.6125454902648926, "epoch": 0.24996374709976799, "grad_norm": 0.030779100954532623, "grad_norm_var": 4.975452386579647e-06, "learning_rate": 0.008701876285683297, "loss": 2.5433, "step": 6895 }, { "crossentropy": 2.7800040245056152, "epoch": 0.25, "grad_norm": 0.03461725637316704, "grad_norm_var": 2.859895290936489e-06, "learning_rate": 0.008701485657132586, "loss": 2.5992, "step": 6896 }, { "crossentropy": 2.5974836349487305, "epoch": 0.250036252900232, "grad_norm": 0.03048434481024742, "grad_norm_var": 3.174770209649532e-06, "learning_rate": 0.008701094978587693, "loss": 2.5669, "step": 6897 }, { "crossentropy": 2.574532985687256, "epoch": 0.25007250580046403, "grad_norm": 0.034552451223134995, "grad_norm_var": 3.375513308073672e-06, "learning_rate": 0.008700704250053896, "loss": 2.6355, "step": 6898 }, { "crossentropy": 2.5305376052856445, "epoch": 0.25010875870069604, "grad_norm": 0.03511006012558937, "grad_norm_var": 3.664886043257265e-06, "learning_rate": 0.008700313471536473, "loss": 2.5239, "step": 6899 }, { "crossentropy": 2.6864068508148193, "epoch": 0.25014501160092806, "grad_norm": 0.030705135315656662, "grad_norm_var": 3.2118707536122246e-06, "learning_rate": 0.008699922643040701, "loss": 2.6489, "step": 6900 }, { "crossentropy": 2.670753002166748, "epoch": 0.25018126450116007, "grad_norm": 0.03168630227446556, "grad_norm_var": 2.8805936449410297e-06, "learning_rate": 0.008699531764571857, "loss": 2.6638, "step": 6901 }, { "crossentropy": 2.64394211769104, "epoch": 0.2502175174013921, "grad_norm": 0.03203815594315529, "grad_norm_var": 2.829667770415841e-06, "learning_rate": 0.008699140836135225, "loss": 2.6582, "step": 6902 }, { "crossentropy": 2.69352650642395, "epoch": 0.25025377030162416, "grad_norm": 0.03008664771914482, "grad_norm_var": 3.066034462122167e-06, "learning_rate": 0.00869874985773608, "loss": 2.711, "step": 6903 }, { "crossentropy": 2.6727852821350098, "epoch": 0.25029002320185617, "grad_norm": 0.032472576946020126, "grad_norm_var": 3.0204759194514077e-06, "learning_rate": 0.008698358829379707, "loss": 2.6697, "step": 6904 }, { "crossentropy": 2.708343029022217, "epoch": 0.2503262761020882, "grad_norm": 0.031198082491755486, "grad_norm_var": 2.9646515662427758e-06, "learning_rate": 0.008697967751071385, "loss": 2.6341, "step": 6905 }, { "crossentropy": 2.700836420059204, "epoch": 0.2503625290023202, "grad_norm": 0.032977718859910965, "grad_norm_var": 2.9738922033589165e-06, "learning_rate": 0.008697576622816396, "loss": 2.8048, "step": 6906 }, { "crossentropy": 2.6676793098449707, "epoch": 0.2503987819025522, "grad_norm": 0.030178580433130264, "grad_norm_var": 3.1781225646620424e-06, "learning_rate": 0.008697185444620024, "loss": 2.7546, "step": 6907 }, { "crossentropy": 2.6256330013275146, "epoch": 0.25043503480278423, "grad_norm": 0.030627023428678513, "grad_norm_var": 3.1844814052584077e-06, "learning_rate": 0.008696794216487553, "loss": 2.6603, "step": 6908 }, { "crossentropy": 2.587709426879883, "epoch": 0.25047128770301624, "grad_norm": 0.0391613245010376, "grad_norm_var": 5.840802332066292e-06, "learning_rate": 0.008696402938424264, "loss": 2.627, "step": 6909 }, { "crossentropy": 2.740783214569092, "epoch": 0.25050754060324826, "grad_norm": 0.030499985441565514, "grad_norm_var": 6.0765817760816476e-06, "learning_rate": 0.008696011610435445, "loss": 2.6517, "step": 6910 }, { "crossentropy": 2.6918625831604004, "epoch": 0.25054379350348027, "grad_norm": 0.02980267070233822, "grad_norm_var": 6.3372264748283535e-06, "learning_rate": 0.00869562023252638, "loss": 2.6536, "step": 6911 }, { "crossentropy": 2.5877232551574707, "epoch": 0.2505800464037123, "grad_norm": 0.028982600197196007, "grad_norm_var": 6.552382812851677e-06, "learning_rate": 0.008695228804702356, "loss": 2.6184, "step": 6912 }, { "crossentropy": 2.7819714546203613, "epoch": 0.2506162993039443, "grad_norm": 0.032215461134910583, "grad_norm_var": 6.410564411297309e-06, "learning_rate": 0.008694837326968656, "loss": 2.7816, "step": 6913 }, { "crossentropy": 2.764737844467163, "epoch": 0.2506525522041763, "grad_norm": 0.033445414155721664, "grad_norm_var": 6.113125093976795e-06, "learning_rate": 0.008694445799330574, "loss": 2.7479, "step": 6914 }, { "crossentropy": 2.5730979442596436, "epoch": 0.25068880510440833, "grad_norm": 0.031276579946279526, "grad_norm_var": 5.416002858124602e-06, "learning_rate": 0.008694054221793394, "loss": 2.5953, "step": 6915 }, { "crossentropy": 2.73227858543396, "epoch": 0.25072505800464034, "grad_norm": 0.03077060543000698, "grad_norm_var": 5.407502071942784e-06, "learning_rate": 0.008693662594362408, "loss": 2.7252, "step": 6916 }, { "crossentropy": 2.6156578063964844, "epoch": 0.2507613109048724, "grad_norm": 0.032402414828538895, "grad_norm_var": 5.436934005154552e-06, "learning_rate": 0.008693270917042901, "loss": 2.6088, "step": 6917 }, { "crossentropy": 2.680598735809326, "epoch": 0.25079756380510443, "grad_norm": 0.03471057862043381, "grad_norm_var": 5.9829505569149725e-06, "learning_rate": 0.008692879189840167, "loss": 2.6966, "step": 6918 }, { "crossentropy": 2.7384791374206543, "epoch": 0.25083381670533644, "grad_norm": 0.0360097773373127, "grad_norm_var": 6.723419293244237e-06, "learning_rate": 0.008692487412759494, "loss": 2.6098, "step": 6919 }, { "crossentropy": 2.709660768508911, "epoch": 0.25087006960556846, "grad_norm": 0.03185172751545906, "grad_norm_var": 6.732869315106147e-06, "learning_rate": 0.008692095585806175, "loss": 2.5839, "step": 6920 }, { "crossentropy": 2.6533823013305664, "epoch": 0.25090632250580047, "grad_norm": 0.03022276610136032, "grad_norm_var": 6.9300140653436966e-06, "learning_rate": 0.0086917037089855, "loss": 2.6656, "step": 6921 }, { "crossentropy": 2.6538431644439697, "epoch": 0.2509425754060325, "grad_norm": 0.03176306560635567, "grad_norm_var": 6.895615351181402e-06, "learning_rate": 0.00869131178230277, "loss": 2.6354, "step": 6922 }, { "crossentropy": 2.685932159423828, "epoch": 0.2509788283062645, "grad_norm": 0.03154761716723442, "grad_norm_var": 6.658366841802201e-06, "learning_rate": 0.008690919805763267, "loss": 2.6767, "step": 6923 }, { "crossentropy": 2.626485824584961, "epoch": 0.2510150812064965, "grad_norm": 0.03140809014439583, "grad_norm_var": 6.5320993564414264e-06, "learning_rate": 0.008690527779372294, "loss": 2.7101, "step": 6924 }, { "crossentropy": 2.7435479164123535, "epoch": 0.25105133410672853, "grad_norm": 0.05275006964802742, "grad_norm_var": 3.058713402125466e-05, "learning_rate": 0.008690135703135142, "loss": 2.6047, "step": 6925 }, { "crossentropy": 2.463473081588745, "epoch": 0.25108758700696054, "grad_norm": 0.03218984976410866, "grad_norm_var": 3.0178951877999706e-05, "learning_rate": 0.008689743577057107, "loss": 2.4572, "step": 6926 }, { "crossentropy": 2.7647316455841064, "epoch": 0.25112383990719256, "grad_norm": 0.030530234798789024, "grad_norm_var": 2.9881561106488693e-05, "learning_rate": 0.008689351401143486, "loss": 2.6505, "step": 6927 }, { "crossentropy": 2.7424232959747314, "epoch": 0.2511600928074246, "grad_norm": 0.031085072085261345, "grad_norm_var": 2.8960210442904504e-05, "learning_rate": 0.008688959175399575, "loss": 2.6454, "step": 6928 }, { "crossentropy": 2.633052110671997, "epoch": 0.2511963457076566, "grad_norm": 0.031796589493751526, "grad_norm_var": 2.903656195877116e-05, "learning_rate": 0.008688566899830672, "loss": 2.7254, "step": 6929 }, { "crossentropy": 2.687669038772583, "epoch": 0.25123259860788866, "grad_norm": 0.033093761652708054, "grad_norm_var": 2.9040287189339394e-05, "learning_rate": 0.008688174574442075, "loss": 2.6679, "step": 6930 }, { "crossentropy": 2.691279888153076, "epoch": 0.25126885150812067, "grad_norm": 0.03150684013962746, "grad_norm_var": 2.8980310991440665e-05, "learning_rate": 0.008687782199239085, "loss": 2.7268, "step": 6931 }, { "crossentropy": 2.790102481842041, "epoch": 0.2513051044083527, "grad_norm": 0.030460482463240623, "grad_norm_var": 2.9093080219436116e-05, "learning_rate": 0.008687389774226999, "loss": 2.7148, "step": 6932 }, { "crossentropy": 2.583192825317383, "epoch": 0.2513413573085847, "grad_norm": 0.03000226430594921, "grad_norm_var": 2.975095005754985e-05, "learning_rate": 0.00868699729941112, "loss": 2.5998, "step": 6933 }, { "crossentropy": 2.732332229614258, "epoch": 0.2513776102088167, "grad_norm": 0.03214403986930847, "grad_norm_var": 2.9639916662363374e-05, "learning_rate": 0.008686604774796745, "loss": 2.7699, "step": 6934 }, { "crossentropy": 2.5775508880615234, "epoch": 0.25141386310904873, "grad_norm": 0.03123839758336544, "grad_norm_var": 2.9162427149620575e-05, "learning_rate": 0.008686212200389178, "loss": 2.5687, "step": 6935 }, { "crossentropy": 2.630859851837158, "epoch": 0.25145011600928074, "grad_norm": 0.03505634889006615, "grad_norm_var": 2.9431386540941965e-05, "learning_rate": 0.008685819576193724, "loss": 2.6957, "step": 6936 }, { "crossentropy": 2.7412188053131104, "epoch": 0.25148636890951276, "grad_norm": 0.03187612071633339, "grad_norm_var": 2.9006597407870897e-05, "learning_rate": 0.008685426902215678, "loss": 2.6908, "step": 6937 }, { "crossentropy": 2.616407871246338, "epoch": 0.2515226218097448, "grad_norm": 0.03224518522620201, "grad_norm_var": 2.8939808181767144e-05, "learning_rate": 0.008685034178460354, "loss": 2.609, "step": 6938 }, { "crossentropy": 2.7033112049102783, "epoch": 0.2515588747099768, "grad_norm": 0.03147771582007408, "grad_norm_var": 2.8954192334994693e-05, "learning_rate": 0.008684641404933049, "loss": 2.739, "step": 6939 }, { "crossentropy": 2.593590497970581, "epoch": 0.2515951276102088, "grad_norm": 0.03116198442876339, "grad_norm_var": 2.9011980855244272e-05, "learning_rate": 0.008684248581639069, "loss": 2.5867, "step": 6940 }, { "crossentropy": 2.75832462310791, "epoch": 0.2516313805104408, "grad_norm": 0.0318060964345932, "grad_norm_var": 1.3822797763180886e-06, "learning_rate": 0.008683855708583722, "loss": 2.711, "step": 6941 }, { "crossentropy": 2.6010355949401855, "epoch": 0.25166763341067283, "grad_norm": 0.03547108918428421, "grad_norm_var": 2.256618197133721e-06, "learning_rate": 0.008683462785772313, "loss": 2.6258, "step": 6942 }, { "crossentropy": 2.607734203338623, "epoch": 0.25170388631090485, "grad_norm": 0.02874450385570526, "grad_norm_var": 2.790275680838528e-06, "learning_rate": 0.00868306981321015, "loss": 2.551, "step": 6943 }, { "crossentropy": 2.6773736476898193, "epoch": 0.2517401392111369, "grad_norm": 0.03356996178627014, "grad_norm_var": 2.9317349427756176e-06, "learning_rate": 0.00868267679090254, "loss": 2.5721, "step": 6944 }, { "crossentropy": 2.672452211380005, "epoch": 0.25177639211136893, "grad_norm": 0.03505596145987511, "grad_norm_var": 3.516774278600141e-06, "learning_rate": 0.00868228371885479, "loss": 2.676, "step": 6945 }, { "crossentropy": 2.671844482421875, "epoch": 0.25181264501160094, "grad_norm": 0.04321730136871338, "grad_norm_var": 1.1152958641054885e-05, "learning_rate": 0.008681890597072212, "loss": 2.7654, "step": 6946 }, { "crossentropy": 2.5755183696746826, "epoch": 0.25184889791183296, "grad_norm": 0.04976811632514, "grad_norm_var": 2.8810809230447646e-05, "learning_rate": 0.008681497425560111, "loss": 2.6444, "step": 6947 }, { "crossentropy": 2.465714454650879, "epoch": 0.251885150812065, "grad_norm": 0.04177985340356827, "grad_norm_var": 3.154325177316754e-05, "learning_rate": 0.008681104204323805, "loss": 2.6412, "step": 6948 }, { "crossentropy": 2.6972360610961914, "epoch": 0.251921403712297, "grad_norm": 0.034762341529130936, "grad_norm_var": 3.0001060828021656e-05, "learning_rate": 0.008680710933368597, "loss": 2.695, "step": 6949 }, { "crossentropy": 2.780967950820923, "epoch": 0.251957656612529, "grad_norm": 0.03266547620296478, "grad_norm_var": 2.9822209871239087e-05, "learning_rate": 0.008680317612699804, "loss": 2.7784, "step": 6950 }, { "crossentropy": 2.604271411895752, "epoch": 0.251993909512761, "grad_norm": 0.03155513107776642, "grad_norm_var": 2.9669896450983907e-05, "learning_rate": 0.008679924242322735, "loss": 2.62, "step": 6951 }, { "crossentropy": 2.7550573348999023, "epoch": 0.25203016241299303, "grad_norm": 0.03271320462226868, "grad_norm_var": 2.999960003637659e-05, "learning_rate": 0.008679530822242706, "loss": 2.6715, "step": 6952 }, { "crossentropy": 2.6756696701049805, "epoch": 0.25206641531322505, "grad_norm": 0.03569520637392998, "grad_norm_var": 2.9388260868882482e-05, "learning_rate": 0.008679137352465028, "loss": 2.659, "step": 6953 }, { "crossentropy": 2.7231273651123047, "epoch": 0.25210266821345706, "grad_norm": 0.03268469497561455, "grad_norm_var": 2.9232711620774315e-05, "learning_rate": 0.008678743832995018, "loss": 2.6719, "step": 6954 }, { "crossentropy": 2.6384925842285156, "epoch": 0.2521389211136891, "grad_norm": 0.031155526638031006, "grad_norm_var": 2.939622693884819e-05, "learning_rate": 0.00867835026383799, "loss": 2.642, "step": 6955 }, { "crossentropy": 2.4731810092926025, "epoch": 0.2521751740139211, "grad_norm": 0.04420362785458565, "grad_norm_var": 3.3156309549209106e-05, "learning_rate": 0.008677956644999258, "loss": 2.6096, "step": 6956 }, { "crossentropy": 2.545300245285034, "epoch": 0.25221142691415316, "grad_norm": 0.033670078963041306, "grad_norm_var": 3.2349039201685394e-05, "learning_rate": 0.00867756297648414, "loss": 2.6254, "step": 6957 }, { "crossentropy": 2.694300413131714, "epoch": 0.2522476798143852, "grad_norm": 0.034999724477529526, "grad_norm_var": 3.239896412445056e-05, "learning_rate": 0.008677169258297954, "loss": 2.6849, "step": 6958 }, { "crossentropy": 2.7276806831359863, "epoch": 0.2522839327146172, "grad_norm": 0.03523204103112221, "grad_norm_var": 2.8740419150440934e-05, "learning_rate": 0.008676775490446017, "loss": 2.7292, "step": 6959 }, { "crossentropy": 2.7040181159973145, "epoch": 0.2523201856148492, "grad_norm": 0.036172494292259216, "grad_norm_var": 2.8174588120454384e-05, "learning_rate": 0.008676381672933647, "loss": 2.692, "step": 6960 }, { "crossentropy": 2.8029446601867676, "epoch": 0.2523564385150812, "grad_norm": 0.033750660717487335, "grad_norm_var": 2.8546872431831765e-05, "learning_rate": 0.008675987805766163, "loss": 2.8032, "step": 6961 }, { "crossentropy": 2.518026828765869, "epoch": 0.25239269141531323, "grad_norm": 0.039345189929008484, "grad_norm_var": 2.6016753946517027e-05, "learning_rate": 0.008675593888948886, "loss": 2.5955, "step": 6962 }, { "crossentropy": 2.5062341690063477, "epoch": 0.25242894431554525, "grad_norm": 0.04037525877356529, "grad_norm_var": 1.4613037354177284e-05, "learning_rate": 0.008675199922487135, "loss": 2.5853, "step": 6963 }, { "crossentropy": 2.760556221008301, "epoch": 0.25246519721577726, "grad_norm": 0.03739519044756889, "grad_norm_var": 1.2244143850800802e-05, "learning_rate": 0.008674805906386232, "loss": 2.7376, "step": 6964 }, { "crossentropy": 2.6532227993011475, "epoch": 0.2525014501160093, "grad_norm": 0.03581761196255684, "grad_norm_var": 1.2224235688745808e-05, "learning_rate": 0.0086744118406515, "loss": 2.6592, "step": 6965 }, { "crossentropy": 2.6905465126037598, "epoch": 0.2525377030162413, "grad_norm": 0.0342847965657711, "grad_norm_var": 1.1783799491106816e-05, "learning_rate": 0.008674017725288257, "loss": 2.661, "step": 6966 }, { "crossentropy": 2.765230655670166, "epoch": 0.2525739559164733, "grad_norm": 0.04142245277762413, "grad_norm_var": 1.2592637935713151e-05, "learning_rate": 0.008673623560301831, "loss": 2.7293, "step": 6967 }, { "crossentropy": 2.6426992416381836, "epoch": 0.2526102088167053, "grad_norm": 0.03702598810195923, "grad_norm_var": 1.1760248788310277e-05, "learning_rate": 0.008673229345697545, "loss": 2.7024, "step": 6968 }, { "crossentropy": 2.7347731590270996, "epoch": 0.25264646171693733, "grad_norm": 0.03401905298233032, "grad_norm_var": 1.210495521341721e-05, "learning_rate": 0.008672835081480718, "loss": 2.6775, "step": 6969 }, { "crossentropy": 2.8291287422180176, "epoch": 0.25268271461716935, "grad_norm": 0.029437072575092316, "grad_norm_var": 1.4350048456647906e-05, "learning_rate": 0.008672440767656685, "loss": 2.781, "step": 6970 }, { "crossentropy": 2.7306292057037354, "epoch": 0.2527189675174014, "grad_norm": 0.03063586913049221, "grad_norm_var": 1.4712577873617404e-05, "learning_rate": 0.008672046404230763, "loss": 2.6523, "step": 6971 }, { "crossentropy": 2.5927810668945312, "epoch": 0.25275522041763343, "grad_norm": 0.031669192016124725, "grad_norm_var": 1.1008373578328295e-05, "learning_rate": 0.008671651991208283, "loss": 2.6801, "step": 6972 }, { "crossentropy": 2.567636251449585, "epoch": 0.25279147331786544, "grad_norm": 0.032344892621040344, "grad_norm_var": 1.1411123205350596e-05, "learning_rate": 0.008671257528594569, "loss": 2.6059, "step": 6973 }, { "crossentropy": 2.598905324935913, "epoch": 0.25282772621809746, "grad_norm": 0.03455858677625656, "grad_norm_var": 1.1437740092308509e-05, "learning_rate": 0.008670863016394952, "loss": 2.6027, "step": 6974 }, { "crossentropy": 2.739575147628784, "epoch": 0.2528639791183295, "grad_norm": 0.034930091351270676, "grad_norm_var": 1.1442869001521471e-05, "learning_rate": 0.00867046845461476, "loss": 2.7263, "step": 6975 }, { "crossentropy": 2.901252508163452, "epoch": 0.2529002320185615, "grad_norm": 0.03086194023489952, "grad_norm_var": 1.2516204898141643e-05, "learning_rate": 0.00867007384325932, "loss": 2.7944, "step": 6976 }, { "crossentropy": 2.761441230773926, "epoch": 0.2529364849187935, "grad_norm": 0.03226776421070099, "grad_norm_var": 1.2874386183038877e-05, "learning_rate": 0.008669679182333965, "loss": 2.6506, "step": 6977 }, { "crossentropy": 2.698899984359741, "epoch": 0.2529727378190255, "grad_norm": 0.033276475965976715, "grad_norm_var": 1.147773570530293e-05, "learning_rate": 0.008669284471844022, "loss": 2.7146, "step": 6978 }, { "crossentropy": 2.7302968502044678, "epoch": 0.25300899071925753, "grad_norm": 0.03287429362535477, "grad_norm_var": 9.013377037078704e-06, "learning_rate": 0.008668889711794824, "loss": 2.6845, "step": 6979 }, { "crossentropy": 2.547407388687134, "epoch": 0.25304524361948955, "grad_norm": 0.03528379276394844, "grad_norm_var": 8.315449374104408e-06, "learning_rate": 0.008668494902191704, "loss": 2.6023, "step": 6980 }, { "crossentropy": 2.6930510997772217, "epoch": 0.25308149651972156, "grad_norm": 0.03567122295498848, "grad_norm_var": 8.277297958692364e-06, "learning_rate": 0.008668100043039993, "loss": 2.771, "step": 6981 }, { "crossentropy": 2.6588101387023926, "epoch": 0.2531177494199536, "grad_norm": 0.030524147674441338, "grad_norm_var": 8.910704248468969e-06, "learning_rate": 0.008667705134345022, "loss": 2.6417, "step": 6982 }, { "crossentropy": 2.620990514755249, "epoch": 0.2531540023201856, "grad_norm": 0.03167678415775299, "grad_norm_var": 4.617420971995909e-06, "learning_rate": 0.00866731017611213, "loss": 2.623, "step": 6983 }, { "crossentropy": 2.5434751510620117, "epoch": 0.25319025522041766, "grad_norm": 0.03033289685845375, "grad_norm_var": 3.7718348905419844e-06, "learning_rate": 0.008666915168346648, "loss": 2.5983, "step": 6984 }, { "crossentropy": 2.6740775108337402, "epoch": 0.2532265081206497, "grad_norm": 0.03162011876702309, "grad_norm_var": 3.652912420665638e-06, "learning_rate": 0.008666520111053911, "loss": 2.7285, "step": 6985 }, { "crossentropy": 2.626235008239746, "epoch": 0.2532627610208817, "grad_norm": 0.03395410627126694, "grad_norm_var": 3.1600201871493165e-06, "learning_rate": 0.008666125004239257, "loss": 2.6881, "step": 6986 }, { "crossentropy": 2.534900665283203, "epoch": 0.2532990139211137, "grad_norm": 0.03447647765278816, "grad_norm_var": 3.0478838477956354e-06, "learning_rate": 0.00866572984790802, "loss": 2.6484, "step": 6987 }, { "crossentropy": 2.5859804153442383, "epoch": 0.2533352668213457, "grad_norm": 0.03218493610620499, "grad_norm_var": 2.9802026177550794e-06, "learning_rate": 0.00866533464206554, "loss": 2.6336, "step": 6988 }, { "crossentropy": 2.594663143157959, "epoch": 0.25337151972157773, "grad_norm": 0.03251386433839798, "grad_norm_var": 2.968863266315661e-06, "learning_rate": 0.008664939386717153, "loss": 2.6225, "step": 6989 }, { "crossentropy": 2.7806787490844727, "epoch": 0.25340777262180975, "grad_norm": 0.03178447484970093, "grad_norm_var": 2.850407548250763e-06, "learning_rate": 0.008664544081868196, "loss": 2.6899, "step": 6990 }, { "crossentropy": 2.707803726196289, "epoch": 0.25344402552204176, "grad_norm": 0.031766533851623535, "grad_norm_var": 2.562487102966794e-06, "learning_rate": 0.008664148727524012, "loss": 2.6144, "step": 6991 }, { "crossentropy": 2.477386236190796, "epoch": 0.2534802784222738, "grad_norm": 0.034178223460912704, "grad_norm_var": 2.4959773203354532e-06, "learning_rate": 0.008663753323689937, "loss": 2.552, "step": 6992 }, { "crossentropy": 2.6504693031311035, "epoch": 0.2535165313225058, "grad_norm": 0.03997756168246269, "grad_norm_var": 5.690505875761729e-06, "learning_rate": 0.008663357870371315, "loss": 2.6299, "step": 6993 }, { "crossentropy": 2.651179552078247, "epoch": 0.2535527842227378, "grad_norm": 0.039614178240299225, "grad_norm_var": 8.218217703069329e-06, "learning_rate": 0.008662962367573485, "loss": 2.6091, "step": 6994 }, { "crossentropy": 2.6938676834106445, "epoch": 0.2535890371229698, "grad_norm": 0.042512763291597366, "grad_norm_var": 1.3024890979465561e-05, "learning_rate": 0.008662566815301791, "loss": 2.6591, "step": 6995 }, { "crossentropy": 2.669220209121704, "epoch": 0.25362529002320183, "grad_norm": 0.04554075375199318, "grad_norm_var": 2.100786736615108e-05, "learning_rate": 0.008662171213561573, "loss": 2.7315, "step": 6996 }, { "crossentropy": 2.692870616912842, "epoch": 0.25366154292343385, "grad_norm": 0.037066493183374405, "grad_norm_var": 2.127384133576251e-05, "learning_rate": 0.008661775562358174, "loss": 2.6677, "step": 6997 }, { "crossentropy": 2.5323803424835205, "epoch": 0.2536977958236659, "grad_norm": 0.03036845289170742, "grad_norm_var": 2.1367914280145587e-05, "learning_rate": 0.008661379861696942, "loss": 2.6438, "step": 6998 }, { "crossentropy": 2.762521982192993, "epoch": 0.25373404872389793, "grad_norm": 0.03134728968143463, "grad_norm_var": 2.1519512715587616e-05, "learning_rate": 0.008660984111583215, "loss": 2.7836, "step": 6999 }, { "crossentropy": 2.658372640609741, "epoch": 0.25377030162412995, "grad_norm": 0.03118518367409706, "grad_norm_var": 2.1039954901904883e-05, "learning_rate": 0.008660588312022344, "loss": 2.6659, "step": 7000 }, { "crossentropy": 2.6617066860198975, "epoch": 0.25380655452436196, "grad_norm": 0.03679559379816055, "grad_norm_var": 2.0377776649140827e-05, "learning_rate": 0.008660192463019672, "loss": 2.6458, "step": 7001 }, { "crossentropy": 2.666295051574707, "epoch": 0.253842807424594, "grad_norm": 0.034452758729457855, "grad_norm_var": 2.0301892993983143e-05, "learning_rate": 0.008659796564580546, "loss": 2.6927, "step": 7002 }, { "crossentropy": 2.5727977752685547, "epoch": 0.253879060324826, "grad_norm": 0.03142594173550606, "grad_norm_var": 2.1243006613252826e-05, "learning_rate": 0.008659400616710314, "loss": 2.593, "step": 7003 }, { "crossentropy": 2.413177013397217, "epoch": 0.253915313225058, "grad_norm": 0.030608240514993668, "grad_norm_var": 2.2025852299680986e-05, "learning_rate": 0.008659004619414323, "loss": 2.6224, "step": 7004 }, { "crossentropy": 2.5561978816986084, "epoch": 0.25395156612529, "grad_norm": 0.034727368503808975, "grad_norm_var": 2.1577337383945475e-05, "learning_rate": 0.008658608572697922, "loss": 2.6332, "step": 7005 }, { "crossentropy": 2.6508352756500244, "epoch": 0.25398781902552203, "grad_norm": 0.031355563551187515, "grad_norm_var": 2.1784705449078575e-05, "learning_rate": 0.00865821247656646, "loss": 2.6159, "step": 7006 }, { "crossentropy": 2.6977548599243164, "epoch": 0.25402407192575405, "grad_norm": 0.03258000314235687, "grad_norm_var": 2.145553957984257e-05, "learning_rate": 0.008657816331025288, "loss": 2.6371, "step": 7007 }, { "crossentropy": 2.6318235397338867, "epoch": 0.25406032482598606, "grad_norm": 0.030710019171237946, "grad_norm_var": 2.2695316446792488e-05, "learning_rate": 0.008657420136079754, "loss": 2.6341, "step": 7008 }, { "crossentropy": 2.53320574760437, "epoch": 0.2540965777262181, "grad_norm": 0.03207492455840111, "grad_norm_var": 2.1371424334410307e-05, "learning_rate": 0.008657023891735212, "loss": 2.642, "step": 7009 }, { "crossentropy": 2.6048455238342285, "epoch": 0.2541328306264501, "grad_norm": 0.03465631604194641, "grad_norm_var": 1.9542082382714115e-05, "learning_rate": 0.008656627597997011, "loss": 2.6351, "step": 7010 }, { "crossentropy": 2.754138469696045, "epoch": 0.25416908352668216, "grad_norm": 0.031101539731025696, "grad_norm_var": 1.505249149998399e-05, "learning_rate": 0.008656231254870509, "loss": 2.4906, "step": 7011 }, { "crossentropy": 2.5709805488586426, "epoch": 0.2542053364269142, "grad_norm": 0.029457110911607742, "grad_norm_var": 5.398513263615252e-06, "learning_rate": 0.008655834862361052, "loss": 2.6179, "step": 7012 }, { "crossentropy": 2.697880744934082, "epoch": 0.2542415893271462, "grad_norm": 0.030498933047056198, "grad_norm_var": 4.09078123157143e-06, "learning_rate": 0.008655438420473997, "loss": 2.694, "step": 7013 }, { "crossentropy": 2.835754871368408, "epoch": 0.2542778422273782, "grad_norm": 0.030993638560175896, "grad_norm_var": 3.972198614211293e-06, "learning_rate": 0.0086550419292147, "loss": 2.6879, "step": 7014 }, { "crossentropy": 2.6796557903289795, "epoch": 0.2543140951276102, "grad_norm": 0.029987554997205734, "grad_norm_var": 4.228415693570857e-06, "learning_rate": 0.008654645388588515, "loss": 2.6264, "step": 7015 }, { "crossentropy": 2.831799268722534, "epoch": 0.25435034802784223, "grad_norm": 0.031051993370056152, "grad_norm_var": 4.244672323995081e-06, "learning_rate": 0.008654248798600796, "loss": 2.7951, "step": 7016 }, { "crossentropy": 2.6948790550231934, "epoch": 0.25438660092807425, "grad_norm": 0.029458338394761086, "grad_norm_var": 2.9470429440667244e-06, "learning_rate": 0.008653852159256905, "loss": 2.6003, "step": 7017 }, { "crossentropy": 2.733290195465088, "epoch": 0.25442285382830626, "grad_norm": 0.030981743708252907, "grad_norm_var": 2.3664785982699708e-06, "learning_rate": 0.008653455470562193, "loss": 2.7237, "step": 7018 }, { "crossentropy": 2.7039594650268555, "epoch": 0.2544591067285383, "grad_norm": 0.03260635584592819, "grad_norm_var": 2.4648360497838412e-06, "learning_rate": 0.00865305873252202, "loss": 2.6279, "step": 7019 }, { "crossentropy": 2.578496217727661, "epoch": 0.2544953596287703, "grad_norm": 0.030666260048747063, "grad_norm_var": 2.4587040381209477e-06, "learning_rate": 0.008652661945141747, "loss": 2.5436, "step": 7020 }, { "crossentropy": 2.672018051147461, "epoch": 0.2545316125290023, "grad_norm": 0.03198697790503502, "grad_norm_var": 1.7238842164727283e-06, "learning_rate": 0.00865226510842673, "loss": 2.7425, "step": 7021 }, { "crossentropy": 2.724658727645874, "epoch": 0.2545678654292343, "grad_norm": 0.03014102578163147, "grad_norm_var": 1.8006762963942413e-06, "learning_rate": 0.008651868222382331, "loss": 2.7354, "step": 7022 }, { "crossentropy": 2.6923327445983887, "epoch": 0.25460411832946633, "grad_norm": 0.030261486768722534, "grad_norm_var": 1.705260802726573e-06, "learning_rate": 0.00865147128701391, "loss": 2.6722, "step": 7023 }, { "crossentropy": 2.757685422897339, "epoch": 0.2546403712296984, "grad_norm": 0.029646525159478188, "grad_norm_var": 1.8226893051171878e-06, "learning_rate": 0.008651074302326828, "loss": 2.6543, "step": 7024 }, { "crossentropy": 2.673835277557373, "epoch": 0.2546766241299304, "grad_norm": 0.030160406604409218, "grad_norm_var": 1.7705317183766903e-06, "learning_rate": 0.008650677268326445, "loss": 2.6829, "step": 7025 }, { "crossentropy": 2.8101935386657715, "epoch": 0.25471287703016243, "grad_norm": 0.031213024631142616, "grad_norm_var": 7.656597963495108e-07, "learning_rate": 0.008650280185018129, "loss": 2.7147, "step": 7026 }, { "crossentropy": 2.613551139831543, "epoch": 0.25474912993039445, "grad_norm": 0.03273358941078186, "grad_norm_var": 1.0329363947551243e-06, "learning_rate": 0.008649883052407237, "loss": 2.6903, "step": 7027 }, { "crossentropy": 2.534409761428833, "epoch": 0.25478538283062646, "grad_norm": 0.03242098540067673, "grad_norm_var": 1.0748719611995088e-06, "learning_rate": 0.008649485870499136, "loss": 2.5332, "step": 7028 }, { "crossentropy": 2.7053439617156982, "epoch": 0.2548216357308585, "grad_norm": 0.03442130982875824, "grad_norm_var": 1.8133219843497467e-06, "learning_rate": 0.00864908863929919, "loss": 2.7192, "step": 7029 }, { "crossentropy": 2.6765894889831543, "epoch": 0.2548578886310905, "grad_norm": 0.029417894780635834, "grad_norm_var": 2.005708190234064e-06, "learning_rate": 0.008648691358812764, "loss": 2.673, "step": 7030 }, { "crossentropy": 2.458674907684326, "epoch": 0.2548941415313225, "grad_norm": 0.028610624372959137, "grad_norm_var": 2.3233382295141544e-06, "learning_rate": 0.008648294029045224, "loss": 2.5751, "step": 7031 }, { "crossentropy": 2.7397360801696777, "epoch": 0.2549303944315545, "grad_norm": 0.030391722917556763, "grad_norm_var": 2.3447897369040124e-06, "learning_rate": 0.008647896650001938, "loss": 2.6464, "step": 7032 }, { "crossentropy": 2.5944104194641113, "epoch": 0.25496664733178653, "grad_norm": 0.033095669001340866, "grad_norm_var": 2.450730698050106e-06, "learning_rate": 0.00864749922168827, "loss": 2.6039, "step": 7033 }, { "crossentropy": 2.67161226272583, "epoch": 0.25500290023201855, "grad_norm": 0.032501667737960815, "grad_norm_var": 2.5565139576151527e-06, "learning_rate": 0.00864710174410959, "loss": 2.6365, "step": 7034 }, { "crossentropy": 2.734274387359619, "epoch": 0.25503915313225056, "grad_norm": 0.03196534514427185, "grad_norm_var": 2.467741522927241e-06, "learning_rate": 0.008646704217271267, "loss": 2.6629, "step": 7035 }, { "crossentropy": 2.7095844745635986, "epoch": 0.2550754060324826, "grad_norm": 0.03238596394658089, "grad_norm_var": 2.523967593253994e-06, "learning_rate": 0.00864630664117867, "loss": 2.7192, "step": 7036 }, { "crossentropy": 2.7494168281555176, "epoch": 0.2551116589327146, "grad_norm": 0.03352111577987671, "grad_norm_var": 2.8045033837142123e-06, "learning_rate": 0.008645909015837167, "loss": 2.6653, "step": 7037 }, { "crossentropy": 2.764396905899048, "epoch": 0.25514791183294666, "grad_norm": 0.03182119503617287, "grad_norm_var": 2.692062601889255e-06, "learning_rate": 0.00864551134125213, "loss": 2.6844, "step": 7038 }, { "crossentropy": 2.5459580421447754, "epoch": 0.2551841647331787, "grad_norm": 0.02983071282505989, "grad_norm_var": 2.7768372727505897e-06, "learning_rate": 0.00864511361742893, "loss": 2.6246, "step": 7039 }, { "crossentropy": 2.633234739303589, "epoch": 0.2552204176334107, "grad_norm": 0.03131425753235817, "grad_norm_var": 2.5366093575148833e-06, "learning_rate": 0.008644715844372942, "loss": 2.6238, "step": 7040 }, { "crossentropy": 2.5890252590179443, "epoch": 0.2552566705336427, "grad_norm": 0.03181067481637001, "grad_norm_var": 2.3872330241929737e-06, "learning_rate": 0.008644318022089533, "loss": 2.6134, "step": 7041 }, { "crossentropy": 2.6769089698791504, "epoch": 0.2552929234338747, "grad_norm": 0.03226097300648689, "grad_norm_var": 2.385593454386619e-06, "learning_rate": 0.008643920150584078, "loss": 2.6768, "step": 7042 }, { "crossentropy": 2.64327335357666, "epoch": 0.25532917633410673, "grad_norm": 0.036711517721414566, "grad_norm_var": 3.879577073865696e-06, "learning_rate": 0.008643522229861951, "loss": 2.6783, "step": 7043 }, { "crossentropy": 2.6589152812957764, "epoch": 0.25536542923433875, "grad_norm": 0.03640836104750633, "grad_norm_var": 5.081088123345898e-06, "learning_rate": 0.008643124259928529, "loss": 2.6146, "step": 7044 }, { "crossentropy": 2.6849184036254883, "epoch": 0.25540168213457076, "grad_norm": 0.03180623799562454, "grad_norm_var": 4.761637202234786e-06, "learning_rate": 0.008642726240789184, "loss": 2.6565, "step": 7045 }, { "crossentropy": 2.5733487606048584, "epoch": 0.2554379350348028, "grad_norm": 0.031701404601335526, "grad_norm_var": 4.2660909564911736e-06, "learning_rate": 0.008642328172449294, "loss": 2.5839, "step": 7046 }, { "crossentropy": 2.6978108882904053, "epoch": 0.2554741879350348, "grad_norm": 0.03368469700217247, "grad_norm_var": 3.4072234525658304e-06, "learning_rate": 0.008641930054914234, "loss": 2.729, "step": 7047 }, { "crossentropy": 2.5237412452697754, "epoch": 0.2555104408352668, "grad_norm": 0.03837878629565239, "grad_norm_var": 5.068467800524677e-06, "learning_rate": 0.008641531888189382, "loss": 2.627, "step": 7048 }, { "crossentropy": 2.66351056098938, "epoch": 0.2555466937354988, "grad_norm": 0.031627409160137177, "grad_norm_var": 5.199140776342553e-06, "learning_rate": 0.008641133672280114, "loss": 2.6411, "step": 7049 }, { "crossentropy": 2.6613476276397705, "epoch": 0.25558294663573083, "grad_norm": 0.03189505264163017, "grad_norm_var": 5.261082491854011e-06, "learning_rate": 0.008640735407191813, "loss": 2.6768, "step": 7050 }, { "crossentropy": 2.70650053024292, "epoch": 0.2556191995359629, "grad_norm": 0.02978397160768509, "grad_norm_var": 5.843481663899095e-06, "learning_rate": 0.008640337092929853, "loss": 2.6975, "step": 7051 }, { "crossentropy": 2.4610114097595215, "epoch": 0.2556554524361949, "grad_norm": 0.029666148126125336, "grad_norm_var": 6.459191903994815e-06, "learning_rate": 0.008639938729499618, "loss": 2.4707, "step": 7052 }, { "crossentropy": 2.8607709407806396, "epoch": 0.25569170533642693, "grad_norm": 0.03486118093132973, "grad_norm_var": 6.72905674707162e-06, "learning_rate": 0.008639540316906487, "loss": 2.7781, "step": 7053 }, { "crossentropy": 2.566086769104004, "epoch": 0.25572795823665895, "grad_norm": 0.04094356670975685, "grad_norm_var": 1.083369279963146e-05, "learning_rate": 0.008639141855155838, "loss": 2.6351, "step": 7054 }, { "crossentropy": 2.583101272583008, "epoch": 0.25576421113689096, "grad_norm": 0.04158104583621025, "grad_norm_var": 1.4038982985022329e-05, "learning_rate": 0.008638743344253057, "loss": 2.6515, "step": 7055 }, { "crossentropy": 2.780280113220215, "epoch": 0.255800464037123, "grad_norm": 0.033670879900455475, "grad_norm_var": 1.3533634839723158e-05, "learning_rate": 0.008638344784203526, "loss": 2.7026, "step": 7056 }, { "crossentropy": 2.6335175037384033, "epoch": 0.255836716937355, "grad_norm": 0.03179459646344185, "grad_norm_var": 1.3538718506480619e-05, "learning_rate": 0.008637946175012628, "loss": 2.7307, "step": 7057 }, { "crossentropy": 2.6924874782562256, "epoch": 0.255872969837587, "grad_norm": 0.03225180134177208, "grad_norm_var": 1.3541062558391526e-05, "learning_rate": 0.008637547516685743, "loss": 2.6345, "step": 7058 }, { "crossentropy": 2.5355303287506104, "epoch": 0.255909222737819, "grad_norm": 0.03286977484822273, "grad_norm_var": 1.3163145362143855e-05, "learning_rate": 0.008637148809228262, "loss": 2.6329, "step": 7059 }, { "crossentropy": 2.6008780002593994, "epoch": 0.25594547563805103, "grad_norm": 0.032366108149290085, "grad_norm_var": 1.2850141407720559e-05, "learning_rate": 0.008636750052645565, "loss": 2.5693, "step": 7060 }, { "crossentropy": 2.5828065872192383, "epoch": 0.25598172853828305, "grad_norm": 0.034164708107709885, "grad_norm_var": 1.2608509701392961e-05, "learning_rate": 0.00863635124694304, "loss": 2.6295, "step": 7061 }, { "crossentropy": 2.7405338287353516, "epoch": 0.25601798143851506, "grad_norm": 0.03183736279606819, "grad_norm_var": 1.2571122362956683e-05, "learning_rate": 0.008635952392126072, "loss": 2.7388, "step": 7062 }, { "crossentropy": 2.629600763320923, "epoch": 0.2560542343387471, "grad_norm": 0.030659664422273636, "grad_norm_var": 1.320410241981614e-05, "learning_rate": 0.00863555348820005, "loss": 2.5558, "step": 7063 }, { "crossentropy": 2.729151487350464, "epoch": 0.2560904872389791, "grad_norm": 0.03203272074460983, "grad_norm_var": 1.1717376233261132e-05, "learning_rate": 0.00863515453517036, "loss": 2.7036, "step": 7064 }, { "crossentropy": 2.673845052719116, "epoch": 0.25612674013921116, "grad_norm": 0.03339672088623047, "grad_norm_var": 1.1530159354496544e-05, "learning_rate": 0.008634755533042394, "loss": 2.613, "step": 7065 }, { "crossentropy": 2.6482090950012207, "epoch": 0.2561629930394432, "grad_norm": 0.03190533071756363, "grad_norm_var": 1.152815706747708e-05, "learning_rate": 0.008634356481821537, "loss": 2.6453, "step": 7066 }, { "crossentropy": 2.729206085205078, "epoch": 0.2561992459396752, "grad_norm": 0.03542293608188629, "grad_norm_var": 1.082564536649099e-05, "learning_rate": 0.00863395738151318, "loss": 2.7085, "step": 7067 }, { "crossentropy": 2.690570831298828, "epoch": 0.2562354988399072, "grad_norm": 0.03587321564555168, "grad_norm_var": 9.883558918527488e-06, "learning_rate": 0.008633558232122715, "loss": 2.7028, "step": 7068 }, { "crossentropy": 2.687516927719116, "epoch": 0.2562717517401392, "grad_norm": 0.033334892243146896, "grad_norm_var": 9.874654090430365e-06, "learning_rate": 0.008633159033655531, "loss": 2.7404, "step": 7069 }, { "crossentropy": 2.6489362716674805, "epoch": 0.25630800464037123, "grad_norm": 0.03541376441717148, "grad_norm_var": 6.671137078256608e-06, "learning_rate": 0.00863275978611702, "loss": 2.6726, "step": 7070 }, { "crossentropy": 2.6884825229644775, "epoch": 0.25634425754060325, "grad_norm": 0.03508375957608223, "grad_norm_var": 2.4483577777752636e-06, "learning_rate": 0.008632360489512577, "loss": 2.6689, "step": 7071 }, { "crossentropy": 2.6480188369750977, "epoch": 0.25638051044083526, "grad_norm": 0.03767440840601921, "grad_norm_var": 3.672179920695811e-06, "learning_rate": 0.008631961143847592, "loss": 2.7302, "step": 7072 }, { "crossentropy": 2.689448833465576, "epoch": 0.2564167633410673, "grad_norm": 0.036446213722229004, "grad_norm_var": 3.9636389395423615e-06, "learning_rate": 0.00863156174912746, "loss": 2.7298, "step": 7073 }, { "crossentropy": 2.6541972160339355, "epoch": 0.2564530162412993, "grad_norm": 0.033100757747888565, "grad_norm_var": 3.833908583981277e-06, "learning_rate": 0.008631162305357576, "loss": 2.6882, "step": 7074 }, { "crossentropy": 2.6051645278930664, "epoch": 0.2564892691415313, "grad_norm": 0.03301341086626053, "grad_norm_var": 3.816446429123708e-06, "learning_rate": 0.008630762812543337, "loss": 2.5653, "step": 7075 }, { "crossentropy": 2.5327751636505127, "epoch": 0.2565255220417633, "grad_norm": 0.033187080174684525, "grad_norm_var": 3.6952781146510143e-06, "learning_rate": 0.008630363270690132, "loss": 2.6087, "step": 7076 }, { "crossentropy": 2.6837799549102783, "epoch": 0.25656177494199534, "grad_norm": 0.033531732857227325, "grad_norm_var": 3.6987538408852124e-06, "learning_rate": 0.008629963679803364, "loss": 2.6781, "step": 7077 }, { "crossentropy": 2.4882662296295166, "epoch": 0.2565980278422274, "grad_norm": 0.030957791954278946, "grad_norm_var": 3.985442244991422e-06, "learning_rate": 0.008629564039888427, "loss": 2.4647, "step": 7078 }, { "crossentropy": 2.621382474899292, "epoch": 0.2566342807424594, "grad_norm": 0.030889185145497322, "grad_norm_var": 3.892183453046968e-06, "learning_rate": 0.00862916435095072, "loss": 2.799, "step": 7079 }, { "crossentropy": 2.707305908203125, "epoch": 0.25667053364269143, "grad_norm": 0.033583734184503555, "grad_norm_var": 3.671063370971533e-06, "learning_rate": 0.008628764612995641, "loss": 2.5996, "step": 7080 }, { "crossentropy": 2.644026279449463, "epoch": 0.25670678654292345, "grad_norm": 0.03092297911643982, "grad_norm_var": 4.228077118831432e-06, "learning_rate": 0.00862836482602859, "loss": 2.642, "step": 7081 }, { "crossentropy": 2.602893590927124, "epoch": 0.25674303944315546, "grad_norm": 0.03268915042281151, "grad_norm_var": 4.071461753510812e-06, "learning_rate": 0.008627964990054963, "loss": 2.6341, "step": 7082 }, { "crossentropy": 2.489312171936035, "epoch": 0.2567792923433875, "grad_norm": 0.03634363412857056, "grad_norm_var": 4.321179616476262e-06, "learning_rate": 0.008627565105080167, "loss": 2.5995, "step": 7083 }, { "crossentropy": 2.7170348167419434, "epoch": 0.2568155452436195, "grad_norm": 0.0380045510828495, "grad_norm_var": 5.172128708353614e-06, "learning_rate": 0.008627165171109596, "loss": 2.6433, "step": 7084 }, { "crossentropy": 2.6421048641204834, "epoch": 0.2568517981438515, "grad_norm": 0.03415800258517265, "grad_norm_var": 5.140264443026842e-06, "learning_rate": 0.008626765188148658, "loss": 2.6328, "step": 7085 }, { "crossentropy": 2.5873639583587646, "epoch": 0.2568880510440835, "grad_norm": 0.03139897435903549, "grad_norm_var": 5.424339316176585e-06, "learning_rate": 0.008626365156202749, "loss": 2.6229, "step": 7086 }, { "crossentropy": 2.5518248081207275, "epoch": 0.25692430394431554, "grad_norm": 0.03336583822965622, "grad_norm_var": 5.317393318237856e-06, "learning_rate": 0.008625965075277279, "loss": 2.5398, "step": 7087 }, { "crossentropy": 2.737260341644287, "epoch": 0.25696055684454755, "grad_norm": 0.03217775374650955, "grad_norm_var": 4.296015046448146e-06, "learning_rate": 0.008625564945377645, "loss": 2.7509, "step": 7088 }, { "crossentropy": 2.6981210708618164, "epoch": 0.25699680974477956, "grad_norm": 0.031163588166236877, "grad_norm_var": 3.866848490536784e-06, "learning_rate": 0.008625164766509255, "loss": 2.6677, "step": 7089 }, { "crossentropy": 2.7288849353790283, "epoch": 0.2570330626450116, "grad_norm": 0.03288368880748749, "grad_norm_var": 3.867760284739771e-06, "learning_rate": 0.008624764538677514, "loss": 2.7717, "step": 7090 }, { "crossentropy": 2.531015157699585, "epoch": 0.2570693155452436, "grad_norm": 0.03333975002169609, "grad_norm_var": 3.874262651421322e-06, "learning_rate": 0.008624364261887826, "loss": 2.6276, "step": 7091 }, { "crossentropy": 2.570000648498535, "epoch": 0.25710556844547566, "grad_norm": 0.030180765315890312, "grad_norm_var": 4.3791109418299855e-06, "learning_rate": 0.0086239639361456, "loss": 2.6492, "step": 7092 }, { "crossentropy": 2.691807746887207, "epoch": 0.2571418213457077, "grad_norm": 0.031562719494104385, "grad_norm_var": 4.442299774255233e-06, "learning_rate": 0.00862356356145624, "loss": 2.6515, "step": 7093 }, { "crossentropy": 2.5831100940704346, "epoch": 0.2571780742459397, "grad_norm": 0.0387631319463253, "grad_norm_var": 6.409415507718265e-06, "learning_rate": 0.008623163137825157, "loss": 2.5901, "step": 7094 }, { "crossentropy": 2.48787784576416, "epoch": 0.2572143271461717, "grad_norm": 0.04178304225206375, "grad_norm_var": 1.0449533493701738e-05, "learning_rate": 0.008622762665257758, "loss": 2.5175, "step": 7095 }, { "crossentropy": 2.6290476322174072, "epoch": 0.2572505800464037, "grad_norm": 0.03654475510120392, "grad_norm_var": 1.0874590513020828e-05, "learning_rate": 0.008622362143759449, "loss": 2.6507, "step": 7096 }, { "crossentropy": 2.5697214603424072, "epoch": 0.25728683294663574, "grad_norm": 0.03455059230327606, "grad_norm_var": 1.0170000412590983e-05, "learning_rate": 0.008621961573335642, "loss": 2.5962, "step": 7097 }, { "crossentropy": 2.828472375869751, "epoch": 0.25732308584686775, "grad_norm": 0.036289241164922714, "grad_norm_var": 1.0203515738399606e-05, "learning_rate": 0.008621560953991749, "loss": 2.8164, "step": 7098 }, { "crossentropy": 2.6283469200134277, "epoch": 0.25735933874709976, "grad_norm": 0.03818298131227493, "grad_norm_var": 1.0859292384058255e-05, "learning_rate": 0.00862116028573318, "loss": 2.5971, "step": 7099 }, { "crossentropy": 2.5543949604034424, "epoch": 0.2573955916473318, "grad_norm": 0.03670867532491684, "grad_norm_var": 1.0384090712884962e-05, "learning_rate": 0.008620759568565345, "loss": 2.5772, "step": 7100 }, { "crossentropy": 2.6646039485931396, "epoch": 0.2574318445475638, "grad_norm": 0.03255532309412956, "grad_norm_var": 1.063177888429924e-05, "learning_rate": 0.008620358802493658, "loss": 2.6399, "step": 7101 }, { "crossentropy": 2.7248847484588623, "epoch": 0.2574680974477958, "grad_norm": 0.031280022114515305, "grad_norm_var": 1.0681302046973761e-05, "learning_rate": 0.008619957987523531, "loss": 2.694, "step": 7102 }, { "crossentropy": 2.5191569328308105, "epoch": 0.2575043503480278, "grad_norm": 0.035206492990255356, "grad_norm_var": 1.0624954297281857e-05, "learning_rate": 0.008619557123660375, "loss": 2.5272, "step": 7103 }, { "crossentropy": 2.640923261642456, "epoch": 0.25754060324825984, "grad_norm": 0.031879447400569916, "grad_norm_var": 1.0725796162918187e-05, "learning_rate": 0.00861915621090961, "loss": 2.7358, "step": 7104 }, { "crossentropy": 2.6306276321411133, "epoch": 0.2575768561484919, "grad_norm": 0.0313650481402874, "grad_norm_var": 1.0637244672497557e-05, "learning_rate": 0.008618755249276648, "loss": 2.6565, "step": 7105 }, { "crossentropy": 2.61443829536438, "epoch": 0.2576131090487239, "grad_norm": 0.03202437981963158, "grad_norm_var": 1.0876286339259175e-05, "learning_rate": 0.008618354238766905, "loss": 2.6205, "step": 7106 }, { "crossentropy": 2.675842523574829, "epoch": 0.25764936194895594, "grad_norm": 0.03652540594339371, "grad_norm_var": 1.101199666101355e-05, "learning_rate": 0.008617953179385796, "loss": 2.6378, "step": 7107 }, { "crossentropy": 2.5042035579681396, "epoch": 0.25768561484918795, "grad_norm": 0.03985217958688736, "grad_norm_var": 1.1014078301481665e-05, "learning_rate": 0.00861755207113874, "loss": 2.6218, "step": 7108 }, { "crossentropy": 2.657904863357544, "epoch": 0.25772186774941996, "grad_norm": 0.03579854220151901, "grad_norm_var": 1.0015085785893402e-05, "learning_rate": 0.008617150914031152, "loss": 2.6365, "step": 7109 }, { "crossentropy": 2.7655599117279053, "epoch": 0.257758120649652, "grad_norm": 0.03070208989083767, "grad_norm_var": 1.0657078257185583e-05, "learning_rate": 0.008616749708068453, "loss": 2.7163, "step": 7110 }, { "crossentropy": 2.6067066192626953, "epoch": 0.257794373549884, "grad_norm": 0.03041703999042511, "grad_norm_var": 8.569954375818112e-06, "learning_rate": 0.00861634845325606, "loss": 2.6528, "step": 7111 }, { "crossentropy": 2.487797737121582, "epoch": 0.257830626450116, "grad_norm": 0.0317714586853981, "grad_norm_var": 8.608373758724762e-06, "learning_rate": 0.008615947149599394, "loss": 2.5914, "step": 7112 }, { "crossentropy": 2.7140069007873535, "epoch": 0.257866879350348, "grad_norm": 0.029248913750052452, "grad_norm_var": 1.0024895332104838e-05, "learning_rate": 0.008615545797103871, "loss": 2.6084, "step": 7113 }, { "crossentropy": 2.6814496517181396, "epoch": 0.25790313225058004, "grad_norm": 0.031476087868213654, "grad_norm_var": 9.835499053304173e-06, "learning_rate": 0.00861514439577492, "loss": 2.641, "step": 7114 }, { "crossentropy": 2.5937483310699463, "epoch": 0.25793938515081205, "grad_norm": 0.030937781557440758, "grad_norm_var": 8.531688242567716e-06, "learning_rate": 0.008614742945617955, "loss": 2.6206, "step": 7115 }, { "crossentropy": 2.683572292327881, "epoch": 0.25797563805104406, "grad_norm": 0.03092382103204727, "grad_norm_var": 7.750563723219712e-06, "learning_rate": 0.008614341446638403, "loss": 2.6742, "step": 7116 }, { "crossentropy": 2.7830262184143066, "epoch": 0.2580118909512761, "grad_norm": 0.031079506501555443, "grad_norm_var": 7.899959271802832e-06, "learning_rate": 0.008613939898841681, "loss": 2.7002, "step": 7117 }, { "crossentropy": 2.608804225921631, "epoch": 0.2580481438515081, "grad_norm": 0.03178581967949867, "grad_norm_var": 7.83161598109904e-06, "learning_rate": 0.00861353830223322, "loss": 2.61, "step": 7118 }, { "crossentropy": 2.634657621383667, "epoch": 0.25808439675174016, "grad_norm": 0.030780121684074402, "grad_norm_var": 7.4955036068857206e-06, "learning_rate": 0.008613136656818439, "loss": 2.5979, "step": 7119 }, { "crossentropy": 2.6202142238616943, "epoch": 0.2581206496519722, "grad_norm": 0.02977224625647068, "grad_norm_var": 7.887100492446928e-06, "learning_rate": 0.008612734962602763, "loss": 2.663, "step": 7120 }, { "crossentropy": 2.492947816848755, "epoch": 0.2581569025522042, "grad_norm": 0.03231319785118103, "grad_norm_var": 7.843576078891027e-06, "learning_rate": 0.00861233321959162, "loss": 2.5592, "step": 7121 }, { "crossentropy": 2.820183515548706, "epoch": 0.2581931554524362, "grad_norm": 0.034282345324754715, "grad_norm_var": 8.105429066425233e-06, "learning_rate": 0.008611931427790437, "loss": 2.7896, "step": 7122 }, { "crossentropy": 2.6325814723968506, "epoch": 0.2582294083526682, "grad_norm": 0.03777937963604927, "grad_norm_var": 8.90112490622417e-06, "learning_rate": 0.008611529587204637, "loss": 2.6644, "step": 7123 }, { "crossentropy": 2.7469992637634277, "epoch": 0.25826566125290024, "grad_norm": 0.04848199337720871, "grad_norm_var": 2.209308582456688e-05, "learning_rate": 0.008611127697839649, "loss": 2.7296, "step": 7124 }, { "crossentropy": 2.6861572265625, "epoch": 0.25830191415313225, "grad_norm": 0.043831486254930496, "grad_norm_var": 2.9153602527641702e-05, "learning_rate": 0.0086107257597009, "loss": 2.6502, "step": 7125 }, { "crossentropy": 2.747437000274658, "epoch": 0.25833816705336426, "grad_norm": 0.03306061029434204, "grad_norm_var": 2.8629599277830464e-05, "learning_rate": 0.008610323772793822, "loss": 2.6998, "step": 7126 }, { "crossentropy": 2.7186179161071777, "epoch": 0.2583744199535963, "grad_norm": 0.031124219298362732, "grad_norm_var": 2.835871822129404e-05, "learning_rate": 0.008609921737123842, "loss": 2.6937, "step": 7127 }, { "crossentropy": 2.6375582218170166, "epoch": 0.2584106728538283, "grad_norm": 0.048811618238687515, "grad_norm_var": 4.220321546457365e-05, "learning_rate": 0.008609519652696391, "loss": 2.615, "step": 7128 }, { "crossentropy": 2.527238607406616, "epoch": 0.2584469257540603, "grad_norm": 0.03386285528540611, "grad_norm_var": 4.016147070157066e-05, "learning_rate": 0.0086091175195169, "loss": 2.5915, "step": 7129 }, { "crossentropy": 2.7494454383850098, "epoch": 0.2584831786542923, "grad_norm": 0.03175294026732445, "grad_norm_var": 4.0035481424088845e-05, "learning_rate": 0.008608715337590798, "loss": 2.7732, "step": 7130 }, { "crossentropy": 2.6070995330810547, "epoch": 0.25851943155452434, "grad_norm": 0.029527118429541588, "grad_norm_var": 4.093072834408794e-05, "learning_rate": 0.008608313106923522, "loss": 2.6015, "step": 7131 }, { "crossentropy": 2.719395875930786, "epoch": 0.2585556844547564, "grad_norm": 0.032296665012836456, "grad_norm_var": 4.031189817787355e-05, "learning_rate": 0.0086079108275205, "loss": 2.6666, "step": 7132 }, { "crossentropy": 2.66133713722229, "epoch": 0.2585919373549884, "grad_norm": 0.0314461775124073, "grad_norm_var": 4.0126973805110674e-05, "learning_rate": 0.008607508499387167, "loss": 2.72, "step": 7133 }, { "crossentropy": 2.634533643722534, "epoch": 0.25862819025522044, "grad_norm": 0.03076661191880703, "grad_norm_var": 4.063640558998626e-05, "learning_rate": 0.008607106122528956, "loss": 2.7489, "step": 7134 }, { "crossentropy": 2.6978089809417725, "epoch": 0.25866444315545245, "grad_norm": 0.030634084716439247, "grad_norm_var": 4.0719771908871194e-05, "learning_rate": 0.008606703696951304, "loss": 2.6942, "step": 7135 }, { "crossentropy": 2.5190370082855225, "epoch": 0.25870069605568446, "grad_norm": 0.034536633640527725, "grad_norm_var": 3.8827726313558084e-05, "learning_rate": 0.008606301222659644, "loss": 2.6034, "step": 7136 }, { "crossentropy": 2.599379301071167, "epoch": 0.2587369489559165, "grad_norm": 0.03248666971921921, "grad_norm_var": 3.876094581355843e-05, "learning_rate": 0.008605898699659416, "loss": 2.6087, "step": 7137 }, { "crossentropy": 2.5410995483398438, "epoch": 0.2587732018561485, "grad_norm": 0.03128546476364136, "grad_norm_var": 3.972595354003078e-05, "learning_rate": 0.008605496127956051, "loss": 2.5889, "step": 7138 }, { "crossentropy": 2.7716641426086426, "epoch": 0.2588094547563805, "grad_norm": 0.03388860076665878, "grad_norm_var": 3.9284846073545085e-05, "learning_rate": 0.00860509350755499, "loss": 2.764, "step": 7139 }, { "crossentropy": 2.677245616912842, "epoch": 0.2588457076566125, "grad_norm": 0.03555399551987648, "grad_norm_var": 2.6253622397044345e-05, "learning_rate": 0.008604690838461672, "loss": 2.7209, "step": 7140 }, { "crossentropy": 2.782355785369873, "epoch": 0.25888196055684454, "grad_norm": 0.033713147044181824, "grad_norm_var": 1.9461646360901494e-05, "learning_rate": 0.008604288120681532, "loss": 2.6799, "step": 7141 }, { "crossentropy": 2.584078073501587, "epoch": 0.25891821345707655, "grad_norm": 0.030621729791164398, "grad_norm_var": 1.99508299291634e-05, "learning_rate": 0.008603885354220013, "loss": 2.6246, "step": 7142 }, { "crossentropy": 2.6780195236206055, "epoch": 0.25895446635730857, "grad_norm": 0.03182356804609299, "grad_norm_var": 1.978137826641733e-05, "learning_rate": 0.008603482539082552, "loss": 2.5825, "step": 7143 }, { "crossentropy": 2.7577099800109863, "epoch": 0.2589907192575406, "grad_norm": 0.03418805077672005, "grad_norm_var": 2.9275638062469515e-06, "learning_rate": 0.00860307967527459, "loss": 2.6558, "step": 7144 }, { "crossentropy": 2.655026912689209, "epoch": 0.2590269721577726, "grad_norm": 0.03479352220892906, "grad_norm_var": 3.163343410774464e-06, "learning_rate": 0.008602676762801572, "loss": 2.6841, "step": 7145 }, { "crossentropy": 2.721055269241333, "epoch": 0.25906322505800466, "grad_norm": 0.03419547900557518, "grad_norm_var": 3.3068650468381083e-06, "learning_rate": 0.008602273801668933, "loss": 2.7132, "step": 7146 }, { "crossentropy": 2.8085765838623047, "epoch": 0.2590994779582367, "grad_norm": 0.03284754604101181, "grad_norm_var": 2.6311464917996388e-06, "learning_rate": 0.008601870791882123, "loss": 2.7359, "step": 7147 }, { "crossentropy": 2.6010167598724365, "epoch": 0.2591357308584687, "grad_norm": 0.030507752671837807, "grad_norm_var": 2.9553592571908514e-06, "learning_rate": 0.00860146773344658, "loss": 2.6456, "step": 7148 }, { "crossentropy": 2.6740562915802, "epoch": 0.2591719837587007, "grad_norm": 0.030661625787615776, "grad_norm_var": 3.12556992369626e-06, "learning_rate": 0.008601064626367751, "loss": 2.6897, "step": 7149 }, { "crossentropy": 2.5734264850616455, "epoch": 0.2592082366589327, "grad_norm": 0.03373851627111435, "grad_norm_var": 2.9286958951253804e-06, "learning_rate": 0.00860066147065108, "loss": 2.6001, "step": 7150 }, { "crossentropy": 2.674520254135132, "epoch": 0.25924448955916474, "grad_norm": 0.03644070401787758, "grad_norm_var": 3.3263822243626008e-06, "learning_rate": 0.00860025826630201, "loss": 2.7728, "step": 7151 }, { "crossentropy": 2.614537477493286, "epoch": 0.25928074245939675, "grad_norm": 0.03174484148621559, "grad_norm_var": 3.3178977116479767e-06, "learning_rate": 0.008599855013325988, "loss": 2.6011, "step": 7152 }, { "crossentropy": 2.598144054412842, "epoch": 0.25931699535962877, "grad_norm": 0.03214098513126373, "grad_norm_var": 3.350441417513996e-06, "learning_rate": 0.008599451711728464, "loss": 2.6455, "step": 7153 }, { "crossentropy": 2.6166186332702637, "epoch": 0.2593532482598608, "grad_norm": 0.034042779356241226, "grad_norm_var": 3.1919364145704374e-06, "learning_rate": 0.00859904836151488, "loss": 2.5895, "step": 7154 }, { "crossentropy": 2.7613730430603027, "epoch": 0.2593895011600928, "grad_norm": 0.03796694427728653, "grad_norm_var": 4.616037710243546e-06, "learning_rate": 0.008598644962690685, "loss": 2.6531, "step": 7155 }, { "crossentropy": 2.395659923553467, "epoch": 0.2594257540603248, "grad_norm": 0.03167102113366127, "grad_norm_var": 4.461999048899617e-06, "learning_rate": 0.008598241515261331, "loss": 2.5593, "step": 7156 }, { "crossentropy": 2.672848701477051, "epoch": 0.2594620069605568, "grad_norm": 0.030658937990665436, "grad_norm_var": 4.833452672777366e-06, "learning_rate": 0.008597838019232263, "loss": 2.683, "step": 7157 }, { "crossentropy": 2.66487455368042, "epoch": 0.25949825986078884, "grad_norm": 0.031510498374700546, "grad_norm_var": 4.600665198443756e-06, "learning_rate": 0.008597434474608935, "loss": 2.7003, "step": 7158 }, { "crossentropy": 2.679941177368164, "epoch": 0.2595345127610209, "grad_norm": 0.03241436183452606, "grad_norm_var": 4.525217235567628e-06, "learning_rate": 0.008597030881396793, "loss": 2.7111, "step": 7159 }, { "crossentropy": 2.6605918407440186, "epoch": 0.2595707656612529, "grad_norm": 0.03331754729151726, "grad_norm_var": 4.445736858453333e-06, "learning_rate": 0.008596627239601291, "loss": 2.5855, "step": 7160 }, { "crossentropy": 2.6491329669952393, "epoch": 0.25960701856148494, "grad_norm": 0.034626737236976624, "grad_norm_var": 4.408498770575436e-06, "learning_rate": 0.00859622354922788, "loss": 2.7033, "step": 7161 }, { "crossentropy": 2.643160104751587, "epoch": 0.25964327146171695, "grad_norm": 0.04014058783650398, "grad_norm_var": 7.541060955517225e-06, "learning_rate": 0.008595819810282013, "loss": 2.5902, "step": 7162 }, { "crossentropy": 2.634918212890625, "epoch": 0.25967952436194897, "grad_norm": 0.03590509295463562, "grad_norm_var": 7.89932778622552e-06, "learning_rate": 0.008595416022769142, "loss": 2.6186, "step": 7163 }, { "crossentropy": 2.74280047416687, "epoch": 0.259715777262181, "grad_norm": 0.031287018209695816, "grad_norm_var": 7.616711557706249e-06, "learning_rate": 0.008595012186694721, "loss": 2.714, "step": 7164 }, { "crossentropy": 2.6036412715911865, "epoch": 0.259752030162413, "grad_norm": 0.031106578186154366, "grad_norm_var": 7.452282945098392e-06, "learning_rate": 0.008594608302064207, "loss": 2.6137, "step": 7165 }, { "crossentropy": 2.7939186096191406, "epoch": 0.259788283062645, "grad_norm": 0.030459962785243988, "grad_norm_var": 8.093951663133341e-06, "learning_rate": 0.00859420436888305, "loss": 2.7082, "step": 7166 }, { "crossentropy": 2.815277099609375, "epoch": 0.259824535962877, "grad_norm": 0.030251193791627884, "grad_norm_var": 8.032296946696214e-06, "learning_rate": 0.00859380038715671, "loss": 2.7408, "step": 7167 }, { "crossentropy": 2.5715086460113525, "epoch": 0.25986078886310904, "grad_norm": 0.031199676916003227, "grad_norm_var": 8.147764430774194e-06, "learning_rate": 0.008593396356890642, "loss": 2.6796, "step": 7168 }, { "crossentropy": 2.6887478828430176, "epoch": 0.25989704176334105, "grad_norm": 0.03198279067873955, "grad_norm_var": 8.168370075026707e-06, "learning_rate": 0.008592992278090303, "loss": 2.6655, "step": 7169 }, { "crossentropy": 2.6218369007110596, "epoch": 0.25993329466357307, "grad_norm": 0.033702313899993896, "grad_norm_var": 8.12981449023192e-06, "learning_rate": 0.008592588150761152, "loss": 2.6615, "step": 7170 }, { "crossentropy": 2.7026054859161377, "epoch": 0.2599695475638051, "grad_norm": 0.03501454368233681, "grad_norm_var": 6.724303506017391e-06, "learning_rate": 0.008592183974908643, "loss": 2.681, "step": 7171 }, { "crossentropy": 2.7418789863586426, "epoch": 0.2600058004640371, "grad_norm": 0.035726211965084076, "grad_norm_var": 7.126490796091102e-06, "learning_rate": 0.00859177975053824, "loss": 2.734, "step": 7172 }, { "crossentropy": 2.6849420070648193, "epoch": 0.26004205336426917, "grad_norm": 0.03256012871861458, "grad_norm_var": 6.7382975088461455e-06, "learning_rate": 0.008591375477655401, "loss": 2.7018, "step": 7173 }, { "crossentropy": 2.6236941814422607, "epoch": 0.2600783062645012, "grad_norm": 0.030054107308387756, "grad_norm_var": 7.199005017641663e-06, "learning_rate": 0.008590971156265585, "loss": 2.7203, "step": 7174 }, { "crossentropy": 2.71985125541687, "epoch": 0.2601145591647332, "grad_norm": 0.029551653191447258, "grad_norm_var": 7.976454153208707e-06, "learning_rate": 0.008590566786374255, "loss": 2.6516, "step": 7175 }, { "crossentropy": 2.5570247173309326, "epoch": 0.2601508120649652, "grad_norm": 0.03017321042716503, "grad_norm_var": 8.43206632376205e-06, "learning_rate": 0.008590162367986872, "loss": 2.5668, "step": 7176 }, { "crossentropy": 2.622680425643921, "epoch": 0.2601870649651972, "grad_norm": 0.03420214354991913, "grad_norm_var": 8.336173483601955e-06, "learning_rate": 0.008589757901108898, "loss": 2.6044, "step": 7177 }, { "crossentropy": 2.5441513061523438, "epoch": 0.26022331786542924, "grad_norm": 0.03093140572309494, "grad_norm_var": 4.509503111244095e-06, "learning_rate": 0.008589353385745792, "loss": 2.633, "step": 7178 }, { "crossentropy": 2.7009174823760986, "epoch": 0.26025957076566125, "grad_norm": 0.031324513256549835, "grad_norm_var": 3.5163148128474857e-06, "learning_rate": 0.008588948821903027, "loss": 2.7685, "step": 7179 }, { "crossentropy": 2.458331346511841, "epoch": 0.26029582366589327, "grad_norm": 0.033732954412698746, "grad_norm_var": 3.7081039437500868e-06, "learning_rate": 0.008588544209586059, "loss": 2.5855, "step": 7180 }, { "crossentropy": 2.6277377605438232, "epoch": 0.2603320765661253, "grad_norm": 0.03256763517856598, "grad_norm_var": 3.6678005639890667e-06, "learning_rate": 0.008588139548800356, "loss": 2.6379, "step": 7181 }, { "crossentropy": 2.6015050411224365, "epoch": 0.2603683294663573, "grad_norm": 0.03287678584456444, "grad_norm_var": 3.5077088354765745e-06, "learning_rate": 0.008587734839551383, "loss": 2.6909, "step": 7182 }, { "crossentropy": 2.733640432357788, "epoch": 0.2604045823665893, "grad_norm": 0.03362879529595375, "grad_norm_var": 3.3247510129930187e-06, "learning_rate": 0.008587330081844606, "loss": 2.6769, "step": 7183 }, { "crossentropy": 2.8312478065490723, "epoch": 0.2604408352668213, "grad_norm": 0.034832727164030075, "grad_norm_var": 3.5431529629863233e-06, "learning_rate": 0.008586925275685494, "loss": 2.8685, "step": 7184 }, { "crossentropy": 2.7369985580444336, "epoch": 0.26047708816705334, "grad_norm": 0.03808635473251343, "grad_norm_var": 5.305021036270709e-06, "learning_rate": 0.008586520421079512, "loss": 2.6882, "step": 7185 }, { "crossentropy": 2.6670470237731934, "epoch": 0.2605133410672854, "grad_norm": 0.03836246207356453, "grad_norm_var": 7.061223092737212e-06, "learning_rate": 0.008586115518032128, "loss": 2.6767, "step": 7186 }, { "crossentropy": 2.7599244117736816, "epoch": 0.2605495939675174, "grad_norm": 0.03348412737250328, "grad_norm_var": 6.868276569787363e-06, "learning_rate": 0.008585710566548813, "loss": 2.64, "step": 7187 }, { "crossentropy": 2.618511438369751, "epoch": 0.26058584686774944, "grad_norm": 0.030053218826651573, "grad_norm_var": 7.011201642721194e-06, "learning_rate": 0.008585305566635034, "loss": 2.6314, "step": 7188 }, { "crossentropy": 2.519751787185669, "epoch": 0.26062209976798145, "grad_norm": 0.031105097383260727, "grad_norm_var": 7.209727313589352e-06, "learning_rate": 0.008584900518296263, "loss": 2.5745, "step": 7189 }, { "crossentropy": 2.463263988494873, "epoch": 0.26065835266821347, "grad_norm": 0.03189411014318466, "grad_norm_var": 6.745104310068222e-06, "learning_rate": 0.00858449542153797, "loss": 2.4905, "step": 7190 }, { "crossentropy": 2.6789016723632812, "epoch": 0.2606946055684455, "grad_norm": 0.030353199690580368, "grad_norm_var": 6.4246918096014585e-06, "learning_rate": 0.008584090276365627, "loss": 2.7277, "step": 7191 }, { "crossentropy": 2.515026807785034, "epoch": 0.2607308584686775, "grad_norm": 0.030551759526133537, "grad_norm_var": 6.292205126549357e-06, "learning_rate": 0.008583685082784707, "loss": 2.5587, "step": 7192 }, { "crossentropy": 2.6171514987945557, "epoch": 0.2607671113689095, "grad_norm": 0.028754623606801033, "grad_norm_var": 7.273185118593937e-06, "learning_rate": 0.00858327984080068, "loss": 2.5206, "step": 7193 }, { "crossentropy": 2.605469226837158, "epoch": 0.2608033642691415, "grad_norm": 0.02985089272260666, "grad_norm_var": 7.5950080475368675e-06, "learning_rate": 0.00858287455041902, "loss": 2.7038, "step": 7194 }, { "crossentropy": 2.6405487060546875, "epoch": 0.26083961716937354, "grad_norm": 0.03136974945664406, "grad_norm_var": 7.587495907910889e-06, "learning_rate": 0.008582469211645204, "loss": 2.6601, "step": 7195 }, { "crossentropy": 2.8151838779449463, "epoch": 0.26087587006960555, "grad_norm": 0.031422968953847885, "grad_norm_var": 7.570211706756193e-06, "learning_rate": 0.008582063824484704, "loss": 2.7029, "step": 7196 }, { "crossentropy": 2.506915807723999, "epoch": 0.26091212296983757, "grad_norm": 0.032933808863162994, "grad_norm_var": 7.584351984251285e-06, "learning_rate": 0.008581658388942995, "loss": 2.6343, "step": 7197 }, { "crossentropy": 2.5499932765960693, "epoch": 0.2609483758700696, "grad_norm": 0.03150912746787071, "grad_norm_var": 7.627542019978288e-06, "learning_rate": 0.008581252905025556, "loss": 2.5399, "step": 7198 }, { "crossentropy": 2.828460693359375, "epoch": 0.2609846287703016, "grad_norm": 0.03314942866563797, "grad_norm_var": 7.562538100377399e-06, "learning_rate": 0.00858084737273786, "loss": 2.7201, "step": 7199 }, { "crossentropy": 2.7175543308258057, "epoch": 0.26102088167053367, "grad_norm": 0.03324352949857712, "grad_norm_var": 7.195817555559235e-06, "learning_rate": 0.008580441792085388, "loss": 2.7037, "step": 7200 }, { "crossentropy": 2.7371175289154053, "epoch": 0.2610571345707657, "grad_norm": 0.03667832911014557, "grad_norm_var": 6.225488143170758e-06, "learning_rate": 0.008580036163073615, "loss": 2.7088, "step": 7201 }, { "crossentropy": 2.683706045150757, "epoch": 0.2610933874709977, "grad_norm": 0.037438031286001205, "grad_norm_var": 5.515604422593939e-06, "learning_rate": 0.00857963048570802, "loss": 2.6251, "step": 7202 }, { "crossentropy": 2.5526769161224365, "epoch": 0.2611296403712297, "grad_norm": 0.03468863666057587, "grad_norm_var": 5.826647416106801e-06, "learning_rate": 0.008579224759994085, "loss": 2.6055, "step": 7203 }, { "crossentropy": 2.6465115547180176, "epoch": 0.2611658932714617, "grad_norm": 0.034062955528497696, "grad_norm_var": 5.690584172830746e-06, "learning_rate": 0.008578818985937287, "loss": 2.6461, "step": 7204 }, { "crossentropy": 2.673417806625366, "epoch": 0.26120214617169374, "grad_norm": 0.03287855163216591, "grad_norm_var": 5.57200239737343e-06, "learning_rate": 0.008578413163543106, "loss": 2.6494, "step": 7205 }, { "crossentropy": 2.721914291381836, "epoch": 0.26123839907192575, "grad_norm": 0.0367504246532917, "grad_norm_var": 6.6221164000869e-06, "learning_rate": 0.008578007292817028, "loss": 2.6949, "step": 7206 }, { "crossentropy": 2.51019024848938, "epoch": 0.26127465197215777, "grad_norm": 0.03312109410762787, "grad_norm_var": 6.17866248294226e-06, "learning_rate": 0.008577601373764529, "loss": 2.5576, "step": 7207 }, { "crossentropy": 2.6429619789123535, "epoch": 0.2613109048723898, "grad_norm": 0.03353377804160118, "grad_norm_var": 5.750975909102095e-06, "learning_rate": 0.008577195406391094, "loss": 2.5785, "step": 7208 }, { "crossentropy": 2.8009774684906006, "epoch": 0.2613471577726218, "grad_norm": 0.030014323070645332, "grad_norm_var": 5.1015566770678015e-06, "learning_rate": 0.008576789390702208, "loss": 2.7804, "step": 7209 }, { "crossentropy": 2.560671091079712, "epoch": 0.2613834106728538, "grad_norm": 0.030964305624365807, "grad_norm_var": 4.668432105075597e-06, "learning_rate": 0.008576383326703352, "loss": 2.6909, "step": 7210 }, { "crossentropy": 2.5316147804260254, "epoch": 0.2614196635730858, "grad_norm": 0.032411735504865646, "grad_norm_var": 4.459790411182283e-06, "learning_rate": 0.008575977214400012, "loss": 2.6077, "step": 7211 }, { "crossentropy": 2.5931408405303955, "epoch": 0.26145591647331784, "grad_norm": 0.033120982348918915, "grad_norm_var": 4.186715544022592e-06, "learning_rate": 0.008575571053797675, "loss": 2.6254, "step": 7212 }, { "crossentropy": 2.657266616821289, "epoch": 0.2614921693735499, "grad_norm": 0.031779784709215164, "grad_norm_var": 4.361870266251141e-06, "learning_rate": 0.008575164844901824, "loss": 2.6704, "step": 7213 }, { "crossentropy": 2.634430170059204, "epoch": 0.2615284222737819, "grad_norm": 0.03255147859454155, "grad_norm_var": 4.1587738389447145e-06, "learning_rate": 0.008574758587717946, "loss": 2.6359, "step": 7214 }, { "crossentropy": 2.7288436889648438, "epoch": 0.26156467517401394, "grad_norm": 0.032159291207790375, "grad_norm_var": 4.269525180117714e-06, "learning_rate": 0.008574352282251528, "loss": 2.6538, "step": 7215 }, { "crossentropy": 2.563575506210327, "epoch": 0.26160092807424595, "grad_norm": 0.031948383897542953, "grad_norm_var": 4.412146091429109e-06, "learning_rate": 0.008573945928508058, "loss": 2.647, "step": 7216 }, { "crossentropy": 2.68259859085083, "epoch": 0.26163718097447797, "grad_norm": 0.03631369769573212, "grad_norm_var": 4.260166368229232e-06, "learning_rate": 0.008573539526493024, "loss": 2.7042, "step": 7217 }, { "crossentropy": 2.502295732498169, "epoch": 0.26167343387471, "grad_norm": 0.029642045497894287, "grad_norm_var": 3.818318959673596e-06, "learning_rate": 0.008573133076211917, "loss": 2.5015, "step": 7218 }, { "crossentropy": 2.6413631439208984, "epoch": 0.261709686774942, "grad_norm": 0.029862042516469955, "grad_norm_var": 4.104807017217954e-06, "learning_rate": 0.008572726577670225, "loss": 2.6062, "step": 7219 }, { "crossentropy": 2.6194541454315186, "epoch": 0.261745939675174, "grad_norm": 0.02947296015918255, "grad_norm_var": 4.50767670355071e-06, "learning_rate": 0.008572320030873437, "loss": 2.6552, "step": 7220 }, { "crossentropy": 2.643000602722168, "epoch": 0.261782192575406, "grad_norm": 0.03105008974671364, "grad_norm_var": 4.5713912450892945e-06, "learning_rate": 0.008571913435827049, "loss": 2.6708, "step": 7221 }, { "crossentropy": 2.6987664699554443, "epoch": 0.26181844547563804, "grad_norm": 0.03171771392226219, "grad_norm_var": 3.0798196480817288e-06, "learning_rate": 0.008571506792536548, "loss": 2.6433, "step": 7222 }, { "crossentropy": 2.695530652999878, "epoch": 0.26185469837587005, "grad_norm": 0.031186437234282494, "grad_norm_var": 2.9868936847430098e-06, "learning_rate": 0.008571100101007428, "loss": 2.579, "step": 7223 }, { "crossentropy": 2.4929661750793457, "epoch": 0.26189095127610207, "grad_norm": 0.030964406207203865, "grad_norm_var": 2.7826048543152465e-06, "learning_rate": 0.00857069336124518, "loss": 2.5835, "step": 7224 }, { "crossentropy": 2.7088711261749268, "epoch": 0.2619272041763341, "grad_norm": 0.03575195372104645, "grad_norm_var": 3.6481131013177405e-06, "learning_rate": 0.0085702865732553, "loss": 2.6973, "step": 7225 }, { "crossentropy": 2.7320210933685303, "epoch": 0.26196345707656615, "grad_norm": 0.03859268128871918, "grad_norm_var": 6.301796072762664e-06, "learning_rate": 0.008569879737043283, "loss": 2.6895, "step": 7226 }, { "crossentropy": 2.6564924716949463, "epoch": 0.26199970997679817, "grad_norm": 0.03871136158704758, "grad_norm_var": 8.785385828604395e-06, "learning_rate": 0.008569472852614625, "loss": 2.5822, "step": 7227 }, { "crossentropy": 2.715804100036621, "epoch": 0.2620359628770302, "grad_norm": 0.035512328147888184, "grad_norm_var": 9.244633840110394e-06, "learning_rate": 0.008569065919974814, "loss": 2.6897, "step": 7228 }, { "crossentropy": 2.7472362518310547, "epoch": 0.2620722157772622, "grad_norm": 0.03366697579622269, "grad_norm_var": 9.172509051097671e-06, "learning_rate": 0.008568658939129354, "loss": 2.7082, "step": 7229 }, { "crossentropy": 2.6801509857177734, "epoch": 0.2621084686774942, "grad_norm": 0.03333272039890289, "grad_norm_var": 9.156748303837494e-06, "learning_rate": 0.00856825191008374, "loss": 2.6373, "step": 7230 }, { "crossentropy": 2.7192251682281494, "epoch": 0.2621447215777262, "grad_norm": 0.033442527055740356, "grad_norm_var": 9.09566455750455e-06, "learning_rate": 0.008567844832843468, "loss": 2.7079, "step": 7231 }, { "crossentropy": 2.6581547260284424, "epoch": 0.26218097447795824, "grad_norm": 0.03497454151511192, "grad_norm_var": 9.163803530267304e-06, "learning_rate": 0.00856743770741404, "loss": 2.716, "step": 7232 }, { "crossentropy": 2.7416787147521973, "epoch": 0.26221722737819025, "grad_norm": 0.033790215849876404, "grad_norm_var": 8.577124139945884e-06, "learning_rate": 0.008567030533800948, "loss": 2.7014, "step": 7233 }, { "crossentropy": 2.7242791652679443, "epoch": 0.26225348027842227, "grad_norm": 0.030452126637101173, "grad_norm_var": 8.230661454704478e-06, "learning_rate": 0.008566623312009696, "loss": 2.6628, "step": 7234 }, { "crossentropy": 2.6102495193481445, "epoch": 0.2622897331786543, "grad_norm": 0.03183287754654884, "grad_norm_var": 7.575241562315001e-06, "learning_rate": 0.008566216042045785, "loss": 2.5376, "step": 7235 }, { "crossentropy": 2.546968936920166, "epoch": 0.2623259860788863, "grad_norm": 0.031348682940006256, "grad_norm_var": 6.812187634217777e-06, "learning_rate": 0.008565808723914714, "loss": 2.6224, "step": 7236 }, { "crossentropy": 2.8223555088043213, "epoch": 0.2623622389791183, "grad_norm": 0.03193926438689232, "grad_norm_var": 6.5687212896331825e-06, "learning_rate": 0.008565401357621984, "loss": 2.8036, "step": 7237 }, { "crossentropy": 2.6003217697143555, "epoch": 0.2623984918793503, "grad_norm": 0.032883577048778534, "grad_norm_var": 6.364798051900151e-06, "learning_rate": 0.0085649939431731, "loss": 2.6959, "step": 7238 }, { "crossentropy": 2.617548942565918, "epoch": 0.26243474477958234, "grad_norm": 0.031164824962615967, "grad_norm_var": 6.371923216897992e-06, "learning_rate": 0.008564586480573561, "loss": 2.6444, "step": 7239 }, { "crossentropy": 2.6595797538757324, "epoch": 0.2624709976798144, "grad_norm": 0.03015194460749626, "grad_norm_var": 6.703841050464246e-06, "learning_rate": 0.008564178969828874, "loss": 2.6338, "step": 7240 }, { "crossentropy": 2.739506244659424, "epoch": 0.2625072505800464, "grad_norm": 0.03158947452902794, "grad_norm_var": 6.590619462129099e-06, "learning_rate": 0.008563771410944538, "loss": 2.7474, "step": 7241 }, { "crossentropy": 2.717956781387329, "epoch": 0.26254350348027844, "grad_norm": 0.03425290063023567, "grad_norm_var": 4.726379111355061e-06, "learning_rate": 0.008563363803926064, "loss": 2.7215, "step": 7242 }, { "crossentropy": 2.8211991786956787, "epoch": 0.26257975638051045, "grad_norm": 0.0376589335501194, "grad_norm_var": 4.003341479416828e-06, "learning_rate": 0.008562956148778953, "loss": 2.7965, "step": 7243 }, { "crossentropy": 2.535276174545288, "epoch": 0.26261600928074247, "grad_norm": 0.035882387310266495, "grad_norm_var": 4.135880569124334e-06, "learning_rate": 0.008562548445508713, "loss": 2.6027, "step": 7244 }, { "crossentropy": 2.6697514057159424, "epoch": 0.2626522621809745, "grad_norm": 0.0361478291451931, "grad_norm_var": 4.733642971062194e-06, "learning_rate": 0.008562140694120849, "loss": 2.6374, "step": 7245 }, { "crossentropy": 2.8809053897857666, "epoch": 0.2626885150812065, "grad_norm": 0.03960348665714264, "grad_norm_var": 7.3208276248768465e-06, "learning_rate": 0.008561732894620868, "loss": 2.8118, "step": 7246 }, { "crossentropy": 2.6224658489227295, "epoch": 0.2627247679814385, "grad_norm": 0.039104074239730835, "grad_norm_var": 9.22812938426291e-06, "learning_rate": 0.008561325047014281, "loss": 2.6141, "step": 7247 }, { "crossentropy": 2.603447437286377, "epoch": 0.2627610208816705, "grad_norm": 0.03736063465476036, "grad_norm_var": 9.918331108037119e-06, "learning_rate": 0.008560917151306593, "loss": 2.625, "step": 7248 }, { "crossentropy": 2.5607752799987793, "epoch": 0.26279727378190254, "grad_norm": 0.031181413680315018, "grad_norm_var": 1.0441956778313082e-05, "learning_rate": 0.008560509207503317, "loss": 2.5574, "step": 7249 }, { "crossentropy": 2.6443898677825928, "epoch": 0.26283352668213456, "grad_norm": 0.03498562425374985, "grad_norm_var": 9.636536566148592e-06, "learning_rate": 0.00856010121560996, "loss": 2.5893, "step": 7250 }, { "crossentropy": 2.717785120010376, "epoch": 0.26286977958236657, "grad_norm": 0.031161446124315262, "grad_norm_var": 9.876000485027857e-06, "learning_rate": 0.008559693175632033, "loss": 2.6731, "step": 7251 }, { "crossentropy": 2.7465741634368896, "epoch": 0.2629060324825986, "grad_norm": 0.0332520492374897, "grad_norm_var": 9.39123966561302e-06, "learning_rate": 0.008559285087575048, "loss": 2.6916, "step": 7252 }, { "crossentropy": 2.748277187347412, "epoch": 0.26294228538283065, "grad_norm": 0.03077602945268154, "grad_norm_var": 9.837300488336784e-06, "learning_rate": 0.008558876951444517, "loss": 2.7285, "step": 7253 }, { "crossentropy": 2.6899757385253906, "epoch": 0.26297853828306267, "grad_norm": 0.0326363630592823, "grad_norm_var": 9.884422569492743e-06, "learning_rate": 0.008558468767245952, "loss": 2.6812, "step": 7254 }, { "crossentropy": 2.5110418796539307, "epoch": 0.2630147911832947, "grad_norm": 0.030830876901745796, "grad_norm_var": 1.0025729427522572e-05, "learning_rate": 0.008558060534984866, "loss": 2.6511, "step": 7255 }, { "crossentropy": 2.822350025177002, "epoch": 0.2630510440835267, "grad_norm": 0.031314194202423096, "grad_norm_var": 9.488891353263973e-06, "learning_rate": 0.008557652254666773, "loss": 2.7307, "step": 7256 }, { "crossentropy": 2.6314690113067627, "epoch": 0.2630872969837587, "grad_norm": 0.029916493222117424, "grad_norm_var": 1.0253631811320491e-05, "learning_rate": 0.008557243926297187, "loss": 2.6574, "step": 7257 }, { "crossentropy": 2.8346219062805176, "epoch": 0.2631235498839907, "grad_norm": 0.0301169753074646, "grad_norm_var": 1.125444874775365e-05, "learning_rate": 0.008556835549881623, "loss": 2.7204, "step": 7258 }, { "crossentropy": 2.714268207550049, "epoch": 0.26315980278422274, "grad_norm": 0.03152582421898842, "grad_norm_var": 1.0507445955633859e-05, "learning_rate": 0.008556427125425597, "loss": 2.7049, "step": 7259 }, { "crossentropy": 2.607225179672241, "epoch": 0.26319605568445475, "grad_norm": 0.03471570089459419, "grad_norm_var": 1.0219932157855951e-05, "learning_rate": 0.008556018652934627, "loss": 2.6121, "step": 7260 }, { "crossentropy": 2.6185965538024902, "epoch": 0.26323230858468677, "grad_norm": 0.03282645717263222, "grad_norm_var": 9.698865323342826e-06, "learning_rate": 0.008555610132414227, "loss": 2.662, "step": 7261 }, { "crossentropy": 2.673839569091797, "epoch": 0.2632685614849188, "grad_norm": 0.03114486299455166, "grad_norm_var": 6.9562647466889576e-06, "learning_rate": 0.008555201563869918, "loss": 2.6539, "step": 7262 }, { "crossentropy": 2.5175108909606934, "epoch": 0.2633048143851508, "grad_norm": 0.03147559612989426, "grad_norm_var": 4.057278900415108e-06, "learning_rate": 0.008554792947307216, "loss": 2.6232, "step": 7263 }, { "crossentropy": 2.6095008850097656, "epoch": 0.2633410672853828, "grad_norm": 0.03014223277568817, "grad_norm_var": 2.348226077791606e-06, "learning_rate": 0.00855438428273164, "loss": 2.6789, "step": 7264 }, { "crossentropy": 2.6419403553009033, "epoch": 0.2633773201856148, "grad_norm": 0.03199245408177376, "grad_norm_var": 2.327837078121518e-06, "learning_rate": 0.008553975570148711, "loss": 2.6786, "step": 7265 }, { "crossentropy": 2.5891661643981934, "epoch": 0.26341357308584684, "grad_norm": 0.03328109160065651, "grad_norm_var": 1.7856136910830595e-06, "learning_rate": 0.00855356680956395, "loss": 2.6484, "step": 7266 }, { "crossentropy": 2.7824058532714844, "epoch": 0.2634498259860789, "grad_norm": 0.032930534332990646, "grad_norm_var": 1.8555317747965052e-06, "learning_rate": 0.008553158000982873, "loss": 2.7836, "step": 7267 }, { "crossentropy": 2.7512433528900146, "epoch": 0.2634860788863109, "grad_norm": 0.03359562158584595, "grad_norm_var": 1.929204694420955e-06, "learning_rate": 0.008552749144411008, "loss": 2.6955, "step": 7268 }, { "crossentropy": 2.845205068588257, "epoch": 0.26352233178654294, "grad_norm": 0.035385843366384506, "grad_norm_var": 2.6117939644146182e-06, "learning_rate": 0.008552340239853873, "loss": 2.7292, "step": 7269 }, { "crossentropy": 2.63496994972229, "epoch": 0.26355858468677495, "grad_norm": 0.030761446803808212, "grad_norm_var": 2.701027241471772e-06, "learning_rate": 0.008551931287316992, "loss": 2.6421, "step": 7270 }, { "crossentropy": 2.6827826499938965, "epoch": 0.26359483758700697, "grad_norm": 0.031910642981529236, "grad_norm_var": 2.605972468023611e-06, "learning_rate": 0.00855152228680589, "loss": 2.6634, "step": 7271 }, { "crossentropy": 2.6773905754089355, "epoch": 0.263631090487239, "grad_norm": 0.03018833138048649, "grad_norm_var": 2.7978648613732284e-06, "learning_rate": 0.008551113238326089, "loss": 2.5723, "step": 7272 }, { "crossentropy": 2.50679349899292, "epoch": 0.263667343387471, "grad_norm": 0.030190017074346542, "grad_norm_var": 2.726760538925878e-06, "learning_rate": 0.008550704141883114, "loss": 2.6125, "step": 7273 }, { "crossentropy": 2.7791059017181396, "epoch": 0.263703596287703, "grad_norm": 0.03235563263297081, "grad_norm_var": 2.474499344027927e-06, "learning_rate": 0.008550294997482492, "loss": 2.7175, "step": 7274 }, { "crossentropy": 2.76806378364563, "epoch": 0.263739849187935, "grad_norm": 0.031135259196162224, "grad_norm_var": 2.5166098701396526e-06, "learning_rate": 0.00854988580512975, "loss": 2.6825, "step": 7275 }, { "crossentropy": 2.7772316932678223, "epoch": 0.26377610208816704, "grad_norm": 0.031333744525909424, "grad_norm_var": 2.0641374613126895e-06, "learning_rate": 0.00854947656483041, "loss": 2.7352, "step": 7276 }, { "crossentropy": 2.672757148742676, "epoch": 0.26381235498839906, "grad_norm": 0.03547070175409317, "grad_norm_var": 2.8222728023711972e-06, "learning_rate": 0.008549067276590005, "loss": 2.7091, "step": 7277 }, { "crossentropy": 2.7111244201660156, "epoch": 0.26384860788863107, "grad_norm": 0.03849624842405319, "grad_norm_var": 5.28248655683085e-06, "learning_rate": 0.00854865794041406, "loss": 2.6463, "step": 7278 }, { "crossentropy": 2.7063114643096924, "epoch": 0.2638848607888631, "grad_norm": 0.03297384828329086, "grad_norm_var": 5.210083903349887e-06, "learning_rate": 0.008548248556308103, "loss": 2.7145, "step": 7279 }, { "crossentropy": 2.6911752223968506, "epoch": 0.26392111368909515, "grad_norm": 0.03268345072865486, "grad_norm_var": 4.769419943736336e-06, "learning_rate": 0.008547839124277666, "loss": 2.6927, "step": 7280 }, { "crossentropy": 2.681238889694214, "epoch": 0.26395736658932717, "grad_norm": 0.030736880376935005, "grad_norm_var": 5.001935518084014e-06, "learning_rate": 0.008547429644328275, "loss": 2.6476, "step": 7281 }, { "crossentropy": 2.6435964107513428, "epoch": 0.2639936194895592, "grad_norm": 0.03303949534893036, "grad_norm_var": 4.987326598071305e-06, "learning_rate": 0.008547020116465466, "loss": 2.6477, "step": 7282 }, { "crossentropy": 2.728426456451416, "epoch": 0.2640298723897912, "grad_norm": 0.03085894137620926, "grad_norm_var": 5.191656385107356e-06, "learning_rate": 0.008546610540694766, "loss": 2.6563, "step": 7283 }, { "crossentropy": 2.749843120574951, "epoch": 0.2640661252900232, "grad_norm": 0.030408233404159546, "grad_norm_var": 5.390644281016741e-06, "learning_rate": 0.008546200917021709, "loss": 2.6974, "step": 7284 }, { "crossentropy": 2.784869432449341, "epoch": 0.2641023781902552, "grad_norm": 0.07464256882667542, "grad_norm_var": 0.00011749156626707894, "learning_rate": 0.008545791245451828, "loss": 2.7, "step": 7285 }, { "crossentropy": 2.703928232192993, "epoch": 0.26413863109048724, "grad_norm": 0.03362751007080078, "grad_norm_var": 0.00011645245545072052, "learning_rate": 0.008545381525990655, "loss": 2.6637, "step": 7286 }, { "crossentropy": 2.606703281402588, "epoch": 0.26417488399071926, "grad_norm": 0.030186066403985023, "grad_norm_var": 0.00011734945864700178, "learning_rate": 0.008544971758643723, "loss": 2.6613, "step": 7287 }, { "crossentropy": 2.5893383026123047, "epoch": 0.26421113689095127, "grad_norm": 0.0338667668402195, "grad_norm_var": 0.00011588650312141616, "learning_rate": 0.008544561943416568, "loss": 2.6561, "step": 7288 }, { "crossentropy": 2.7282888889312744, "epoch": 0.2642473897911833, "grad_norm": 0.03348381444811821, "grad_norm_var": 0.0001143971135017522, "learning_rate": 0.008544152080314725, "loss": 2.6798, "step": 7289 }, { "crossentropy": 2.602344036102295, "epoch": 0.2642836426914153, "grad_norm": 0.030769210308790207, "grad_norm_var": 0.00011518380986119876, "learning_rate": 0.008543742169343728, "loss": 2.6361, "step": 7290 }, { "crossentropy": 2.6310532093048096, "epoch": 0.2643198955916473, "grad_norm": 0.03209461644291878, "grad_norm_var": 0.00011471729510762705, "learning_rate": 0.008543332210509118, "loss": 2.6636, "step": 7291 }, { "crossentropy": 2.666778802871704, "epoch": 0.26435614849187933, "grad_norm": 0.03667864203453064, "grad_norm_var": 0.00011368192370523564, "learning_rate": 0.008542922203816429, "loss": 2.5548, "step": 7292 }, { "crossentropy": 2.465113878250122, "epoch": 0.26439240139211134, "grad_norm": 0.03902953490614891, "grad_norm_var": 0.00011439978429272675, "learning_rate": 0.008542512149271198, "loss": 2.5662, "step": 7293 }, { "crossentropy": 2.63273024559021, "epoch": 0.2644286542923434, "grad_norm": 0.030447227880358696, "grad_norm_var": 0.00011560737264160284, "learning_rate": 0.008542102046878965, "loss": 2.6232, "step": 7294 }, { "crossentropy": 2.710705280303955, "epoch": 0.2644649071925754, "grad_norm": 0.030347945168614388, "grad_norm_var": 0.00011686867068260252, "learning_rate": 0.008541691896645269, "loss": 2.6747, "step": 7295 }, { "crossentropy": 2.6130900382995605, "epoch": 0.26450116009280744, "grad_norm": 0.02919185161590576, "grad_norm_var": 0.00011879349285449925, "learning_rate": 0.00854128169857565, "loss": 2.6105, "step": 7296 }, { "crossentropy": 2.7133185863494873, "epoch": 0.26453741299303946, "grad_norm": 0.033335231244564056, "grad_norm_var": 0.00011775130324433573, "learning_rate": 0.008540871452675645, "loss": 2.6811, "step": 7297 }, { "crossentropy": 2.5585520267486572, "epoch": 0.26457366589327147, "grad_norm": 0.035955414175987244, "grad_norm_var": 0.00011747170697699442, "learning_rate": 0.008540461158950798, "loss": 2.6052, "step": 7298 }, { "crossentropy": 2.637505054473877, "epoch": 0.2646099187935035, "grad_norm": 0.035850755870342255, "grad_norm_var": 0.00011606809591402285, "learning_rate": 0.008540050817406651, "loss": 2.7328, "step": 7299 }, { "crossentropy": 2.7004294395446777, "epoch": 0.2646461716937355, "grad_norm": 0.03199757635593414, "grad_norm_var": 0.00011512159473061599, "learning_rate": 0.008539640428048747, "loss": 2.7519, "step": 7300 }, { "crossentropy": 2.659536123275757, "epoch": 0.2646824245939675, "grad_norm": 0.03120187111198902, "grad_norm_var": 7.616316904851308e-06, "learning_rate": 0.008539229990882625, "loss": 2.661, "step": 7301 }, { "crossentropy": 2.6023781299591064, "epoch": 0.26471867749419953, "grad_norm": 0.03093133121728897, "grad_norm_var": 7.84650800844296e-06, "learning_rate": 0.008538819505913831, "loss": 2.6178, "step": 7302 }, { "crossentropy": 2.5784175395965576, "epoch": 0.26475493039443154, "grad_norm": 0.031195200979709625, "grad_norm_var": 7.5536715831402e-06, "learning_rate": 0.00853840897314791, "loss": 2.6413, "step": 7303 }, { "crossentropy": 2.5834503173828125, "epoch": 0.26479118329466356, "grad_norm": 0.0308853592723608, "grad_norm_var": 7.724339226198676e-06, "learning_rate": 0.008537998392590408, "loss": 2.5567, "step": 7304 }, { "crossentropy": 2.6024560928344727, "epoch": 0.26482743619489557, "grad_norm": 0.03274976834654808, "grad_norm_var": 7.68249797217883e-06, "learning_rate": 0.008537587764246866, "loss": 2.6478, "step": 7305 }, { "crossentropy": 2.568215847015381, "epoch": 0.2648636890951276, "grad_norm": 0.030070355162024498, "grad_norm_var": 7.889799290455264e-06, "learning_rate": 0.008537177088122834, "loss": 2.6418, "step": 7306 }, { "crossentropy": 2.6364502906799316, "epoch": 0.26489994199535966, "grad_norm": 0.03186849132180214, "grad_norm_var": 7.908915827367737e-06, "learning_rate": 0.008536766364223859, "loss": 2.6167, "step": 7307 }, { "crossentropy": 2.5788345336914062, "epoch": 0.26493619489559167, "grad_norm": 0.03039679303765297, "grad_norm_var": 6.966227623354558e-06, "learning_rate": 0.008536355592555485, "loss": 2.6386, "step": 7308 }, { "crossentropy": 2.6774983406066895, "epoch": 0.2649724477958237, "grad_norm": 0.03067818656563759, "grad_norm_var": 3.7382409205159352e-06, "learning_rate": 0.008535944773123262, "loss": 2.6663, "step": 7309 }, { "crossentropy": 2.7587573528289795, "epoch": 0.2650087006960557, "grad_norm": 0.0321718230843544, "grad_norm_var": 3.637449120901433e-06, "learning_rate": 0.008535533905932738, "loss": 2.687, "step": 7310 }, { "crossentropy": 2.624187469482422, "epoch": 0.2650449535962877, "grad_norm": 0.055611446499824524, "grad_norm_var": 3.863064654061192e-05, "learning_rate": 0.008535122990989465, "loss": 2.6588, "step": 7311 }, { "crossentropy": 2.7212772369384766, "epoch": 0.26508120649651973, "grad_norm": 0.030334679409861565, "grad_norm_var": 3.807398824990782e-05, "learning_rate": 0.00853471202829899, "loss": 2.6957, "step": 7312 }, { "crossentropy": 2.690291166305542, "epoch": 0.26511745939675174, "grad_norm": 0.03228635713458061, "grad_norm_var": 3.815909686322357e-05, "learning_rate": 0.008534301017866865, "loss": 2.6871, "step": 7313 }, { "crossentropy": 2.702338695526123, "epoch": 0.26515371229698376, "grad_norm": 0.03264716640114784, "grad_norm_var": 3.7710019900487274e-05, "learning_rate": 0.008533889959698641, "loss": 2.6038, "step": 7314 }, { "crossentropy": 2.5538408756256104, "epoch": 0.26518996519721577, "grad_norm": 0.03547544777393341, "grad_norm_var": 3.758516702729719e-05, "learning_rate": 0.008533478853799872, "loss": 2.6297, "step": 7315 }, { "crossentropy": 2.862334728240967, "epoch": 0.2652262180974478, "grad_norm": 0.06396462023258209, "grad_norm_var": 9.651433378221833e-05, "learning_rate": 0.008533067700176108, "loss": 2.7211, "step": 7316 }, { "crossentropy": 2.6527388095855713, "epoch": 0.2652624709976798, "grad_norm": 0.033271919935941696, "grad_norm_var": 9.569125487991432e-05, "learning_rate": 0.008532656498832903, "loss": 2.6936, "step": 7317 }, { "crossentropy": 2.611697196960449, "epoch": 0.2652987238979118, "grad_norm": 0.030721917748451233, "grad_norm_var": 9.581552126496198e-05, "learning_rate": 0.008532245249775812, "loss": 2.578, "step": 7318 }, { "crossentropy": 2.5774424076080322, "epoch": 0.26533497679814383, "grad_norm": 0.0414540134370327, "grad_norm_var": 9.681872850040202e-05, "learning_rate": 0.008531833953010387, "loss": 2.5503, "step": 7319 }, { "crossentropy": 2.766075611114502, "epoch": 0.26537122969837584, "grad_norm": 0.0312642939388752, "grad_norm_var": 9.657374538485596e-05, "learning_rate": 0.008531422608542186, "loss": 2.8148, "step": 7320 }, { "crossentropy": 2.5197126865386963, "epoch": 0.2654074825986079, "grad_norm": 0.037221916019916534, "grad_norm_var": 9.592417044712905e-05, "learning_rate": 0.008531011216376764, "loss": 2.7659, "step": 7321 }, { "crossentropy": 2.575392484664917, "epoch": 0.26544373549883993, "grad_norm": 0.0312312301248312, "grad_norm_var": 9.505731432125532e-05, "learning_rate": 0.008530599776519675, "loss": 2.526, "step": 7322 }, { "crossentropy": 2.769483804702759, "epoch": 0.26547998839907194, "grad_norm": 0.03192407637834549, "grad_norm_var": 9.502475650758207e-05, "learning_rate": 0.008530188288976477, "loss": 2.6865, "step": 7323 }, { "crossentropy": 2.625800371170044, "epoch": 0.26551624129930396, "grad_norm": 0.034675899893045425, "grad_norm_var": 9.280625720616136e-05, "learning_rate": 0.008529776753752731, "loss": 2.592, "step": 7324 }, { "crossentropy": 2.6852636337280273, "epoch": 0.26555249419953597, "grad_norm": 0.03395828604698181, "grad_norm_var": 9.090699040366351e-05, "learning_rate": 0.008529365170853993, "loss": 2.6838, "step": 7325 }, { "crossentropy": 2.7189509868621826, "epoch": 0.265588747099768, "grad_norm": 0.03361617401242256, "grad_norm_var": 9.015311994940734e-05, "learning_rate": 0.008528953540285824, "loss": 2.6984, "step": 7326 }, { "crossentropy": 2.651210308074951, "epoch": 0.265625, "grad_norm": 0.038509663194417953, "grad_norm_var": 6.566046975284053e-05, "learning_rate": 0.00852854186205378, "loss": 2.7237, "step": 7327 }, { "crossentropy": 2.6728734970092773, "epoch": 0.265661252900232, "grad_norm": 0.04369811341166496, "grad_norm_var": 6.711073240648854e-05, "learning_rate": 0.008528130136163423, "loss": 2.6912, "step": 7328 }, { "crossentropy": 2.7399353981018066, "epoch": 0.26569750580046403, "grad_norm": 0.03643477335572243, "grad_norm_var": 6.578924543542268e-05, "learning_rate": 0.008527718362620316, "loss": 2.6655, "step": 7329 }, { "crossentropy": 2.754748582839966, "epoch": 0.26573375870069604, "grad_norm": 0.03732149302959442, "grad_norm_var": 6.451715082394306e-05, "learning_rate": 0.008527306541430019, "loss": 2.6991, "step": 7330 }, { "crossentropy": 2.7002456188201904, "epoch": 0.26577001160092806, "grad_norm": 0.041073016822338104, "grad_norm_var": 6.520962113297755e-05, "learning_rate": 0.008526894672598094, "loss": 2.6736, "step": 7331 }, { "crossentropy": 2.5178091526031494, "epoch": 0.26580626450116007, "grad_norm": 0.041219402104616165, "grad_norm_var": 1.734925063106688e-05, "learning_rate": 0.008526482756130103, "loss": 2.5687, "step": 7332 }, { "crossentropy": 2.5573205947875977, "epoch": 0.2658425174013921, "grad_norm": 0.03695397824048996, "grad_norm_var": 1.6808293879940378e-05, "learning_rate": 0.008526070792031612, "loss": 2.6248, "step": 7333 }, { "crossentropy": 2.5522055625915527, "epoch": 0.26587877030162416, "grad_norm": 0.03076389618217945, "grad_norm_var": 1.6777015495164175e-05, "learning_rate": 0.008525658780308183, "loss": 2.6087, "step": 7334 }, { "crossentropy": 2.5917718410491943, "epoch": 0.26591502320185617, "grad_norm": 0.03243065997958183, "grad_norm_var": 1.57040757658262e-05, "learning_rate": 0.008525246720965383, "loss": 2.626, "step": 7335 }, { "crossentropy": 2.577082872390747, "epoch": 0.2659512761020882, "grad_norm": 0.03088744543492794, "grad_norm_var": 1.5939274873330563e-05, "learning_rate": 0.008524834614008775, "loss": 2.5851, "step": 7336 }, { "crossentropy": 2.4980709552764893, "epoch": 0.2659875290023202, "grad_norm": 0.03134538605809212, "grad_norm_var": 1.69404075021813e-05, "learning_rate": 0.008524422459443928, "loss": 2.6043, "step": 7337 }, { "crossentropy": 2.7506978511810303, "epoch": 0.2660237819025522, "grad_norm": 0.031108751893043518, "grad_norm_var": 1.7009058999652853e-05, "learning_rate": 0.008524010257276407, "loss": 2.7274, "step": 7338 }, { "crossentropy": 2.855586528778076, "epoch": 0.26606003480278423, "grad_norm": 0.031970031559467316, "grad_norm_var": 1.6988076197079622e-05, "learning_rate": 0.008523598007511776, "loss": 2.728, "step": 7339 }, { "crossentropy": 2.5395090579986572, "epoch": 0.26609628770301624, "grad_norm": 0.03198881819844246, "grad_norm_var": 1.7689083987234427e-05, "learning_rate": 0.00852318571015561, "loss": 2.6184, "step": 7340 }, { "crossentropy": 2.702831745147705, "epoch": 0.26613254060324826, "grad_norm": 0.03093007206916809, "grad_norm_var": 1.876558674200543e-05, "learning_rate": 0.008522773365213475, "loss": 2.6644, "step": 7341 }, { "crossentropy": 2.636786937713623, "epoch": 0.26616879350348027, "grad_norm": 0.02999519370496273, "grad_norm_var": 2.026075721282745e-05, "learning_rate": 0.00852236097269094, "loss": 2.7028, "step": 7342 }, { "crossentropy": 2.7554616928100586, "epoch": 0.2662050464037123, "grad_norm": 0.03187225013971329, "grad_norm_var": 1.972183699753513e-05, "learning_rate": 0.008521948532593571, "loss": 2.6773, "step": 7343 }, { "crossentropy": 2.6411828994750977, "epoch": 0.2662412993039443, "grad_norm": 0.030564337968826294, "grad_norm_var": 1.4175748267987555e-05, "learning_rate": 0.008521536044926946, "loss": 2.6881, "step": 7344 }, { "crossentropy": 2.7331106662750244, "epoch": 0.2662775522041763, "grad_norm": 0.029564635828137398, "grad_norm_var": 1.4486574158150087e-05, "learning_rate": 0.00852112350969663, "loss": 2.7197, "step": 7345 }, { "crossentropy": 2.5524532794952393, "epoch": 0.26631380510440833, "grad_norm": 0.031018812209367752, "grad_norm_var": 1.3442198248827566e-05, "learning_rate": 0.0085207109269082, "loss": 2.6559, "step": 7346 }, { "crossentropy": 2.6793384552001953, "epoch": 0.26635005800464034, "grad_norm": 0.030128642916679382, "grad_norm_var": 8.754469682171507e-06, "learning_rate": 0.008520298296567223, "loss": 2.7239, "step": 7347 }, { "crossentropy": 2.637462615966797, "epoch": 0.2663863109048724, "grad_norm": 0.030258389189839363, "grad_norm_var": 2.857397023898556e-06, "learning_rate": 0.008519885618679278, "loss": 2.7119, "step": 7348 }, { "crossentropy": 2.6700332164764404, "epoch": 0.26642256380510443, "grad_norm": 0.03012751042842865, "grad_norm_var": 6.795352490567095e-07, "learning_rate": 0.008519472893249936, "loss": 2.6994, "step": 7349 }, { "crossentropy": 2.7403969764709473, "epoch": 0.26645881670533644, "grad_norm": 0.0311137605458498, "grad_norm_var": 6.792188780294171e-07, "learning_rate": 0.00851906012028477, "loss": 2.7461, "step": 7350 }, { "crossentropy": 2.741154193878174, "epoch": 0.26649506960556846, "grad_norm": 0.030064815655350685, "grad_norm_var": 5.640411161247453e-07, "learning_rate": 0.008518647299789357, "loss": 2.7069, "step": 7351 }, { "crossentropy": 2.59487247467041, "epoch": 0.26653132250580047, "grad_norm": 0.032135043293237686, "grad_norm_var": 6.744250095742082e-07, "learning_rate": 0.008518234431769273, "loss": 2.6411, "step": 7352 }, { "crossentropy": 2.63014817237854, "epoch": 0.2665675754060325, "grad_norm": 0.031091120094060898, "grad_norm_var": 6.629136890055753e-07, "learning_rate": 0.008517821516230093, "loss": 2.6423, "step": 7353 }, { "crossentropy": 2.7355854511260986, "epoch": 0.2666038283062645, "grad_norm": 0.03279730677604675, "grad_norm_var": 8.94696076755375e-07, "learning_rate": 0.008517408553177395, "loss": 2.6658, "step": 7354 }, { "crossentropy": 2.653848171234131, "epoch": 0.2666400812064965, "grad_norm": 0.030230604112148285, "grad_norm_var": 8.533258457202638e-07, "learning_rate": 0.008516995542616758, "loss": 2.6581, "step": 7355 }, { "crossentropy": 2.67261004447937, "epoch": 0.26667633410672853, "grad_norm": 0.030808424577116966, "grad_norm_var": 7.639422514485461e-07, "learning_rate": 0.008516582484553758, "loss": 2.7007, "step": 7356 }, { "crossentropy": 2.552252769470215, "epoch": 0.26671258700696054, "grad_norm": 0.03185519948601723, "grad_norm_var": 8.342418298399115e-07, "learning_rate": 0.008516169378993977, "loss": 2.6714, "step": 7357 }, { "crossentropy": 2.5973093509674072, "epoch": 0.26674883990719256, "grad_norm": 0.03222944959998131, "grad_norm_var": 8.911031129660383e-07, "learning_rate": 0.00851575622594299, "loss": 2.5773, "step": 7358 }, { "crossentropy": 2.816474437713623, "epoch": 0.2667850928074246, "grad_norm": 0.032529521733522415, "grad_norm_var": 9.953093415519433e-07, "learning_rate": 0.00851534302540638, "loss": 2.7205, "step": 7359 }, { "crossentropy": 2.6899304389953613, "epoch": 0.2668213457076566, "grad_norm": 0.03333750367164612, "grad_norm_var": 1.3029129453312558e-06, "learning_rate": 0.008514929777389728, "loss": 2.6576, "step": 7360 }, { "crossentropy": 2.655329942703247, "epoch": 0.26685759860788866, "grad_norm": 0.03450793772935867, "grad_norm_var": 1.7485597431254306e-06, "learning_rate": 0.008514516481898616, "loss": 2.6089, "step": 7361 }, { "crossentropy": 2.6657748222351074, "epoch": 0.26689385150812067, "grad_norm": 0.03373059630393982, "grad_norm_var": 2.0288979726166184e-06, "learning_rate": 0.008514103138938625, "loss": 2.6303, "step": 7362 }, { "crossentropy": 2.709134578704834, "epoch": 0.2669301044083527, "grad_norm": 0.03366529941558838, "grad_norm_var": 2.0771551876979746e-06, "learning_rate": 0.008513689748515338, "loss": 2.7062, "step": 7363 }, { "crossentropy": 2.642850160598755, "epoch": 0.2669663573085847, "grad_norm": 0.03466842696070671, "grad_norm_var": 2.324375573303537e-06, "learning_rate": 0.00851327631063434, "loss": 2.5733, "step": 7364 }, { "crossentropy": 2.6084773540496826, "epoch": 0.2670026102088167, "grad_norm": 0.033973127603530884, "grad_norm_var": 2.1958606646406737e-06, "learning_rate": 0.008512862825301212, "loss": 2.5797, "step": 7365 }, { "crossentropy": 2.656273126602173, "epoch": 0.26703886310904873, "grad_norm": 0.03133348003029823, "grad_norm_var": 2.160577244332493e-06, "learning_rate": 0.008512449292521542, "loss": 2.6145, "step": 7366 }, { "crossentropy": 2.566443920135498, "epoch": 0.26707511600928074, "grad_norm": 0.029038893058896065, "grad_norm_var": 2.5505580037234995e-06, "learning_rate": 0.008512035712300914, "loss": 2.636, "step": 7367 }, { "crossentropy": 2.6313161849975586, "epoch": 0.26711136890951276, "grad_norm": 0.02944282256066799, "grad_norm_var": 3.088169743598725e-06, "learning_rate": 0.008511622084644913, "loss": 2.7001, "step": 7368 }, { "crossentropy": 2.7055206298828125, "epoch": 0.2671476218097448, "grad_norm": 0.03501959890127182, "grad_norm_var": 3.470600570221758e-06, "learning_rate": 0.008511208409559127, "loss": 2.6335, "step": 7369 }, { "crossentropy": 2.686901092529297, "epoch": 0.2671838747099768, "grad_norm": 0.03776759281754494, "grad_norm_var": 5.246063833681568e-06, "learning_rate": 0.008510794687049142, "loss": 2.6493, "step": 7370 }, { "crossentropy": 2.8094468116760254, "epoch": 0.2672201276102088, "grad_norm": 0.03683118149638176, "grad_norm_var": 5.744160873387864e-06, "learning_rate": 0.00851038091712055, "loss": 2.6683, "step": 7371 }, { "crossentropy": 2.7000601291656494, "epoch": 0.2672563805104408, "grad_norm": 0.033570609986782074, "grad_norm_var": 5.350828614157586e-06, "learning_rate": 0.008509967099778934, "loss": 2.6164, "step": 7372 }, { "crossentropy": 2.7409167289733887, "epoch": 0.26729263341067283, "grad_norm": 0.03542225435376167, "grad_norm_var": 5.438068726884513e-06, "learning_rate": 0.008509553235029886, "loss": 2.6945, "step": 7373 }, { "crossentropy": 2.5967657566070557, "epoch": 0.26732888631090485, "grad_norm": 0.03127579391002655, "grad_norm_var": 5.6649554975383985e-06, "learning_rate": 0.008509139322878997, "loss": 2.6806, "step": 7374 }, { "crossentropy": 2.64538836479187, "epoch": 0.2673651392111369, "grad_norm": 0.03256074711680412, "grad_norm_var": 5.660946131673395e-06, "learning_rate": 0.008508725363331852, "loss": 2.534, "step": 7375 }, { "crossentropy": 2.6456079483032227, "epoch": 0.26740139211136893, "grad_norm": 0.03528760001063347, "grad_norm_var": 5.8540042939526795e-06, "learning_rate": 0.008508311356394052, "loss": 2.651, "step": 7376 }, { "crossentropy": 2.6306653022766113, "epoch": 0.26743764501160094, "grad_norm": 0.032820794731378555, "grad_norm_var": 5.834637735136159e-06, "learning_rate": 0.00850789730207118, "loss": 2.6534, "step": 7377 }, { "crossentropy": 2.6976590156555176, "epoch": 0.26747389791183296, "grad_norm": 0.03255176916718483, "grad_norm_var": 5.889261459983824e-06, "learning_rate": 0.00850748320036883, "loss": 2.6177, "step": 7378 }, { "crossentropy": 2.716937780380249, "epoch": 0.267510150812065, "grad_norm": 0.03265969827771187, "grad_norm_var": 5.923847518970552e-06, "learning_rate": 0.0085070690512926, "loss": 2.6934, "step": 7379 }, { "crossentropy": 2.624009370803833, "epoch": 0.267546403712297, "grad_norm": 0.03466210886836052, "grad_norm_var": 5.9227722294242596e-06, "learning_rate": 0.00850665485484808, "loss": 2.644, "step": 7380 }, { "crossentropy": 2.6839771270751953, "epoch": 0.267582656612529, "grad_norm": 0.033798955380916595, "grad_norm_var": 5.911094450572705e-06, "learning_rate": 0.008506240611040864, "loss": 2.6122, "step": 7381 }, { "crossentropy": 2.5555381774902344, "epoch": 0.267618909512761, "grad_norm": 0.029755493625998497, "grad_norm_var": 6.496831408490351e-06, "learning_rate": 0.008505826319876545, "loss": 2.6146, "step": 7382 }, { "crossentropy": 2.629849910736084, "epoch": 0.26765516241299303, "grad_norm": 0.06668951362371445, "grad_norm_var": 7.380861992623786e-05, "learning_rate": 0.008505411981360723, "loss": 2.7075, "step": 7383 }, { "crossentropy": 2.6030473709106445, "epoch": 0.26769141531322505, "grad_norm": 0.03548387810587883, "grad_norm_var": 7.110406668086263e-05, "learning_rate": 0.008504997595498994, "loss": 2.6209, "step": 7384 }, { "crossentropy": 2.6122658252716064, "epoch": 0.26772766821345706, "grad_norm": 0.0395357720553875, "grad_norm_var": 7.178251973343935e-05, "learning_rate": 0.008504583162296953, "loss": 2.6077, "step": 7385 }, { "crossentropy": 2.765514850616455, "epoch": 0.2677639211136891, "grad_norm": 0.03889238089323044, "grad_norm_var": 7.208287219435693e-05, "learning_rate": 0.008504168681760196, "loss": 2.6908, "step": 7386 }, { "crossentropy": 2.7006821632385254, "epoch": 0.2678001740139211, "grad_norm": 0.03514651954174042, "grad_norm_var": 7.215495624461076e-05, "learning_rate": 0.008503754153894325, "loss": 2.7049, "step": 7387 }, { "crossentropy": 2.7047111988067627, "epoch": 0.26783642691415316, "grad_norm": 0.03352819010615349, "grad_norm_var": 7.217026355714969e-05, "learning_rate": 0.008503339578704937, "loss": 2.6983, "step": 7388 }, { "crossentropy": 2.784409999847412, "epoch": 0.2678726798143852, "grad_norm": 0.03335241228342056, "grad_norm_var": 7.2667702072146e-05, "learning_rate": 0.00850292495619763, "loss": 2.6316, "step": 7389 }, { "crossentropy": 2.859312057495117, "epoch": 0.2679089327146172, "grad_norm": 0.10321737825870514, "grad_norm_var": 0.0003496265894649643, "learning_rate": 0.008502510286378008, "loss": 2.7169, "step": 7390 }, { "crossentropy": 2.6027495861053467, "epoch": 0.2679451856148492, "grad_norm": 0.03678838536143303, "grad_norm_var": 0.0003461999487559906, "learning_rate": 0.008502095569251666, "loss": 2.6369, "step": 7391 }, { "crossentropy": 2.5927274227142334, "epoch": 0.2679814385150812, "grad_norm": 0.03129468858242035, "grad_norm_var": 0.0003501767582974833, "learning_rate": 0.008501680804824212, "loss": 2.6732, "step": 7392 }, { "crossentropy": 2.6075992584228516, "epoch": 0.26801769141531323, "grad_norm": 0.033592261373996735, "grad_norm_var": 0.0003494100540255428, "learning_rate": 0.008501265993101243, "loss": 2.6272, "step": 7393 }, { "crossentropy": 2.6989784240722656, "epoch": 0.26805394431554525, "grad_norm": 0.03299037367105484, "grad_norm_var": 0.00034894647989500274, "learning_rate": 0.008500851134088364, "loss": 2.6847, "step": 7394 }, { "crossentropy": 2.5906240940093994, "epoch": 0.26809019721577726, "grad_norm": 0.035578589886426926, "grad_norm_var": 0.0003463452330047962, "learning_rate": 0.008500436227791177, "loss": 2.6332, "step": 7395 }, { "crossentropy": 2.7073464393615723, "epoch": 0.2681264501160093, "grad_norm": 0.0340040847659111, "grad_norm_var": 0.000346919075757513, "learning_rate": 0.008500021274215286, "loss": 2.6645, "step": 7396 }, { "crossentropy": 2.466761350631714, "epoch": 0.2681627030162413, "grad_norm": 0.031989652663469315, "grad_norm_var": 0.00034882540778827, "learning_rate": 0.008499606273366298, "loss": 2.5067, "step": 7397 }, { "crossentropy": 2.6345083713531494, "epoch": 0.2681989559164733, "grad_norm": 0.03078717738389969, "grad_norm_var": 0.0003473809297462824, "learning_rate": 0.008499191225249817, "loss": 2.6806, "step": 7398 }, { "crossentropy": 2.6227409839630127, "epoch": 0.2682352088167053, "grad_norm": 0.03272335231304169, "grad_norm_var": 0.00030225837057994235, "learning_rate": 0.008498776129871446, "loss": 2.691, "step": 7399 }, { "crossentropy": 2.8462798595428467, "epoch": 0.26827146171693733, "grad_norm": 0.0313868448138237, "grad_norm_var": 0.0003050542816017386, "learning_rate": 0.008498360987236796, "loss": 2.8338, "step": 7400 }, { "crossentropy": 2.692474126815796, "epoch": 0.26830771461716935, "grad_norm": 0.03211573511362076, "grad_norm_var": 0.0003073969101976435, "learning_rate": 0.00849794579735147, "loss": 2.6739, "step": 7401 }, { "crossentropy": 2.768740653991699, "epoch": 0.2683439675174014, "grad_norm": 0.032699186354875565, "grad_norm_var": 0.0003090256631649283, "learning_rate": 0.00849753056022108, "loss": 2.7217, "step": 7402 }, { "crossentropy": 2.702951431274414, "epoch": 0.26838022041763343, "grad_norm": 0.031797975301742554, "grad_norm_var": 0.000310810565650528, "learning_rate": 0.008497115275851229, "loss": 2.6019, "step": 7403 }, { "crossentropy": 2.7580885887145996, "epoch": 0.26841647331786544, "grad_norm": 0.035945408046245575, "grad_norm_var": 0.0003099390354664629, "learning_rate": 0.00849669994424753, "loss": 2.6723, "step": 7404 }, { "crossentropy": 2.6819770336151123, "epoch": 0.26845272621809746, "grad_norm": 0.037947505712509155, "grad_norm_var": 0.00030870748498869593, "learning_rate": 0.008496284565415593, "loss": 2.6705, "step": 7405 }, { "crossentropy": 2.613546371459961, "epoch": 0.2684889791183295, "grad_norm": 0.03539082035422325, "grad_norm_var": 4.663483133511964e-06, "learning_rate": 0.008495869139361028, "loss": 2.6181, "step": 7406 }, { "crossentropy": 2.5026261806488037, "epoch": 0.2685252320185615, "grad_norm": 0.03091290593147278, "grad_norm_var": 4.295480883333379e-06, "learning_rate": 0.008495453666089444, "loss": 2.5927, "step": 7407 }, { "crossentropy": 2.7320330142974854, "epoch": 0.2685614849187935, "grad_norm": 0.030966095626354218, "grad_norm_var": 4.385586526859757e-06, "learning_rate": 0.008495038145606452, "loss": 2.7115, "step": 7408 }, { "crossentropy": 2.608290433883667, "epoch": 0.2685977378190255, "grad_norm": 0.030276283621788025, "grad_norm_var": 4.8891072609319e-06, "learning_rate": 0.008494622577917667, "loss": 2.687, "step": 7409 }, { "crossentropy": 2.6132822036743164, "epoch": 0.26863399071925753, "grad_norm": 0.03037174418568611, "grad_norm_var": 5.310395295885356e-06, "learning_rate": 0.0084942069630287, "loss": 2.6337, "step": 7410 }, { "crossentropy": 2.6591732501983643, "epoch": 0.26867024361948955, "grad_norm": 0.029673956334590912, "grad_norm_var": 5.306492113210659e-06, "learning_rate": 0.008493791300945169, "loss": 2.6707, "step": 7411 }, { "crossentropy": 2.603121757507324, "epoch": 0.26870649651972156, "grad_norm": 0.06091796234250069, "grad_norm_var": 5.620303657447676e-05, "learning_rate": 0.00849337559167268, "loss": 2.6003, "step": 7412 }, { "crossentropy": 2.532438039779663, "epoch": 0.2687427494199536, "grad_norm": 0.03167590871453285, "grad_norm_var": 5.629826111310839e-05, "learning_rate": 0.008492959835216855, "loss": 2.6334, "step": 7413 }, { "crossentropy": 2.6150307655334473, "epoch": 0.2687790023201856, "grad_norm": 0.03451329469680786, "grad_norm_var": 5.552049174963227e-05, "learning_rate": 0.008492544031583307, "loss": 2.5973, "step": 7414 }, { "crossentropy": 2.755331516265869, "epoch": 0.26881525522041766, "grad_norm": 0.03143765404820442, "grad_norm_var": 5.58996021624715e-05, "learning_rate": 0.00849212818077765, "loss": 2.6771, "step": 7415 }, { "crossentropy": 2.5549023151397705, "epoch": 0.2688515081206497, "grad_norm": 0.03064824640750885, "grad_norm_var": 5.6215840777564435e-05, "learning_rate": 0.008491712282805503, "loss": 2.5983, "step": 7416 }, { "crossentropy": 2.866678476333618, "epoch": 0.2688877610208817, "grad_norm": 0.03304668515920639, "grad_norm_var": 5.6010591151727424e-05, "learning_rate": 0.008491296337672483, "loss": 2.7108, "step": 7417 }, { "crossentropy": 2.657857894897461, "epoch": 0.2689240139211137, "grad_norm": 0.03526194393634796, "grad_norm_var": 5.588642620256105e-05, "learning_rate": 0.008490880345384208, "loss": 2.7344, "step": 7418 }, { "crossentropy": 2.6812422275543213, "epoch": 0.2689602668213457, "grad_norm": 0.03534708917140961, "grad_norm_var": 5.543100297711225e-05, "learning_rate": 0.008490464305946296, "loss": 2.6965, "step": 7419 }, { "crossentropy": 2.786886215209961, "epoch": 0.26899651972157773, "grad_norm": 0.03339380398392677, "grad_norm_var": 5.539579096899075e-05, "learning_rate": 0.008490048219364365, "loss": 2.715, "step": 7420 }, { "crossentropy": 2.772066831588745, "epoch": 0.26903277262180975, "grad_norm": 0.031089307740330696, "grad_norm_var": 5.517051594637275e-05, "learning_rate": 0.00848963208564404, "loss": 2.7939, "step": 7421 }, { "crossentropy": 2.673532724380493, "epoch": 0.26906902552204176, "grad_norm": 0.03783981502056122, "grad_norm_var": 5.598066141088823e-05, "learning_rate": 0.008489215904790934, "loss": 2.6607, "step": 7422 }, { "crossentropy": 2.5804214477539062, "epoch": 0.2691052784222738, "grad_norm": 0.04044312983751297, "grad_norm_var": 5.746662005959451e-05, "learning_rate": 0.008488799676810675, "loss": 2.5861, "step": 7423 }, { "crossentropy": 2.6693432331085205, "epoch": 0.2691415313225058, "grad_norm": 0.03558411821722984, "grad_norm_var": 5.643486832649726e-05, "learning_rate": 0.008488383401708882, "loss": 2.7472, "step": 7424 }, { "crossentropy": 2.776844024658203, "epoch": 0.2691777842227378, "grad_norm": 0.03576071187853813, "grad_norm_var": 5.4791039221595384e-05, "learning_rate": 0.008487967079491175, "loss": 2.7477, "step": 7425 }, { "crossentropy": 2.6616621017456055, "epoch": 0.2692140371229698, "grad_norm": 0.036382291465997696, "grad_norm_var": 5.298895893444016e-05, "learning_rate": 0.008487550710163182, "loss": 2.6866, "step": 7426 }, { "crossentropy": 2.698246717453003, "epoch": 0.26925029002320183, "grad_norm": 0.03323889896273613, "grad_norm_var": 5.0864979547789974e-05, "learning_rate": 0.008487134293730521, "loss": 2.7094, "step": 7427 }, { "crossentropy": 2.5723354816436768, "epoch": 0.26928654292343385, "grad_norm": 0.03359147161245346, "grad_norm_var": 6.878926667739264e-06, "learning_rate": 0.008486717830198822, "loss": 2.5895, "step": 7428 }, { "crossentropy": 2.5979864597320557, "epoch": 0.2693227958236659, "grad_norm": 0.031873203814029694, "grad_norm_var": 6.811583079483841e-06, "learning_rate": 0.008486301319573706, "loss": 2.5922, "step": 7429 }, { "crossentropy": 2.5719566345214844, "epoch": 0.26935904872389793, "grad_norm": 0.03304183483123779, "grad_norm_var": 6.913051270658133e-06, "learning_rate": 0.008485884761860801, "loss": 2.63, "step": 7430 }, { "crossentropy": 2.524698495864868, "epoch": 0.26939530162412995, "grad_norm": 0.030929170548915863, "grad_norm_var": 7.119797981593974e-06, "learning_rate": 0.008485468157065731, "loss": 2.5736, "step": 7431 }, { "crossentropy": 2.6244137287139893, "epoch": 0.26943155452436196, "grad_norm": 0.029686925932765007, "grad_norm_var": 7.634983111562815e-06, "learning_rate": 0.008485051505194127, "loss": 2.6881, "step": 7432 }, { "crossentropy": 2.625875234603882, "epoch": 0.269467807424594, "grad_norm": 0.030287979170680046, "grad_norm_var": 8.519004341485724e-06, "learning_rate": 0.00848463480625161, "loss": 2.6673, "step": 7433 }, { "crossentropy": 2.724231004714966, "epoch": 0.269504060324826, "grad_norm": 0.03502839803695679, "grad_norm_var": 8.482633824785746e-06, "learning_rate": 0.008484218060243815, "loss": 2.6455, "step": 7434 }, { "crossentropy": 2.593752384185791, "epoch": 0.269540313225058, "grad_norm": 0.03591800481081009, "grad_norm_var": 8.607841066344624e-06, "learning_rate": 0.008483801267176367, "loss": 2.6579, "step": 7435 }, { "crossentropy": 2.6452341079711914, "epoch": 0.26957656612529, "grad_norm": 0.03695693239569664, "grad_norm_var": 9.110695312262865e-06, "learning_rate": 0.008483384427054895, "loss": 2.6025, "step": 7436 }, { "crossentropy": 2.617222785949707, "epoch": 0.26961281902552203, "grad_norm": 0.03202436864376068, "grad_norm_var": 8.773993021228323e-06, "learning_rate": 0.00848296753988503, "loss": 2.6221, "step": 7437 }, { "crossentropy": 2.6664342880249023, "epoch": 0.26964907192575405, "grad_norm": 0.06042882427573204, "grad_norm_var": 5.1366954407594015e-05, "learning_rate": 0.008482550605672403, "loss": 2.6289, "step": 7438 }, { "crossentropy": 2.4862306118011475, "epoch": 0.26968532482598606, "grad_norm": 0.03049706481397152, "grad_norm_var": 5.125768649890624e-05, "learning_rate": 0.008482133624422645, "loss": 2.6048, "step": 7439 }, { "crossentropy": 2.7708563804626465, "epoch": 0.2697215777262181, "grad_norm": 0.030950389802455902, "grad_norm_var": 5.228626883007994e-05, "learning_rate": 0.008481716596141388, "loss": 2.6694, "step": 7440 }, { "crossentropy": 2.7015786170959473, "epoch": 0.2697578306264501, "grad_norm": 0.03241205960512161, "grad_norm_var": 5.25524859321117e-05, "learning_rate": 0.008481299520834264, "loss": 2.653, "step": 7441 }, { "crossentropy": 2.6646628379821777, "epoch": 0.26979408352668216, "grad_norm": 0.03314848616719246, "grad_norm_var": 5.24281109509663e-05, "learning_rate": 0.008480882398506908, "loss": 2.6836, "step": 7442 }, { "crossentropy": 2.8361268043518066, "epoch": 0.2698303364269142, "grad_norm": 0.031991757452487946, "grad_norm_var": 5.271438388044397e-05, "learning_rate": 0.008480465229164954, "loss": 2.7838, "step": 7443 }, { "crossentropy": 2.453568935394287, "epoch": 0.2698665893271462, "grad_norm": 0.030274085700511932, "grad_norm_var": 5.371467858442425e-05, "learning_rate": 0.008480048012814035, "loss": 2.5985, "step": 7444 }, { "crossentropy": 2.5788450241088867, "epoch": 0.2699028422273782, "grad_norm": 0.031124673783779144, "grad_norm_var": 5.397100146661015e-05, "learning_rate": 0.008479630749459784, "loss": 2.6082, "step": 7445 }, { "crossentropy": 2.522801399230957, "epoch": 0.2699390951276102, "grad_norm": 0.03190595284104347, "grad_norm_var": 5.420339074513693e-05, "learning_rate": 0.00847921343910784, "loss": 2.6211, "step": 7446 }, { "crossentropy": 2.9517650604248047, "epoch": 0.26997534802784223, "grad_norm": 0.03159136697649956, "grad_norm_var": 5.396206500009626e-05, "learning_rate": 0.00847879608176384, "loss": 2.7988, "step": 7447 }, { "crossentropy": 2.5444302558898926, "epoch": 0.27001160092807425, "grad_norm": 0.030940450727939606, "grad_norm_var": 5.3337025995180835e-05, "learning_rate": 0.00847837867743342, "loss": 2.5597, "step": 7448 }, { "crossentropy": 2.838568687438965, "epoch": 0.27004785382830626, "grad_norm": 0.03008442185819149, "grad_norm_var": 5.3442875472521634e-05, "learning_rate": 0.008477961226122216, "loss": 2.7783, "step": 7449 }, { "crossentropy": 2.576011896133423, "epoch": 0.2700841067285383, "grad_norm": 0.030606983229517937, "grad_norm_var": 5.410547902533918e-05, "learning_rate": 0.008477543727835867, "loss": 2.6593, "step": 7450 }, { "crossentropy": 2.650656223297119, "epoch": 0.2701203596287703, "grad_norm": 0.03253545984625816, "grad_norm_var": 5.386692034408899e-05, "learning_rate": 0.008477126182580013, "loss": 2.7315, "step": 7451 }, { "crossentropy": 2.75303316116333, "epoch": 0.2701566125290023, "grad_norm": 0.044782161712646484, "grad_norm_var": 6.120482438163725e-05, "learning_rate": 0.008476708590360293, "loss": 2.7161, "step": 7452 }, { "crossentropy": 2.605715274810791, "epoch": 0.2701928654292343, "grad_norm": 0.03221603482961655, "grad_norm_var": 6.115455814710313e-05, "learning_rate": 0.008476290951182349, "loss": 2.6514, "step": 7453 }, { "crossentropy": 2.679591417312622, "epoch": 0.27022911832946633, "grad_norm": 0.03165155276656151, "grad_norm_var": 1.1863533711048178e-05, "learning_rate": 0.008475873265051819, "loss": 2.6704, "step": 7454 }, { "crossentropy": 2.6553592681884766, "epoch": 0.2702653712296984, "grad_norm": 0.029530001804232597, "grad_norm_var": 1.2153756070880705e-05, "learning_rate": 0.008475455531974346, "loss": 2.5912, "step": 7455 }, { "crossentropy": 2.6629297733306885, "epoch": 0.2703016241299304, "grad_norm": 0.03081483580172062, "grad_norm_var": 1.2178106378496247e-05, "learning_rate": 0.008475037751955572, "loss": 2.6425, "step": 7456 }, { "crossentropy": 2.90028977394104, "epoch": 0.27033787703016243, "grad_norm": 0.0315842442214489, "grad_norm_var": 1.2200360448388383e-05, "learning_rate": 0.008474619925001139, "loss": 2.7095, "step": 7457 }, { "crossentropy": 2.7612228393554688, "epoch": 0.27037412993039445, "grad_norm": 0.030886763706803322, "grad_norm_var": 1.2226174419741965e-05, "learning_rate": 0.008474202051116693, "loss": 2.7463, "step": 7458 }, { "crossentropy": 2.651207685470581, "epoch": 0.27041038283062646, "grad_norm": 0.03084549494087696, "grad_norm_var": 1.2314528304109967e-05, "learning_rate": 0.008473784130307874, "loss": 2.6098, "step": 7459 }, { "crossentropy": 2.8206076622009277, "epoch": 0.2704466357308585, "grad_norm": 0.029567385092377663, "grad_norm_var": 1.2504685938050545e-05, "learning_rate": 0.008473366162580329, "loss": 2.7222, "step": 7460 }, { "crossentropy": 2.5995988845825195, "epoch": 0.2704828886310905, "grad_norm": 0.028965985402464867, "grad_norm_var": 1.3023907455407025e-05, "learning_rate": 0.008472948147939703, "loss": 2.6317, "step": 7461 }, { "crossentropy": 2.5716638565063477, "epoch": 0.2705191415313225, "grad_norm": 0.031444888561964035, "grad_norm_var": 1.3029562533256542e-05, "learning_rate": 0.008472530086391641, "loss": 2.6537, "step": 7462 }, { "crossentropy": 2.6869866847991943, "epoch": 0.2705553944315545, "grad_norm": 0.030086690559983253, "grad_norm_var": 1.3203493509352408e-05, "learning_rate": 0.008472111977941793, "loss": 2.6245, "step": 7463 }, { "crossentropy": 2.6320972442626953, "epoch": 0.27059164733178653, "grad_norm": 0.030679021030664444, "grad_norm_var": 1.3232810379707364e-05, "learning_rate": 0.008471693822595801, "loss": 2.6671, "step": 7464 }, { "crossentropy": 2.7385640144348145, "epoch": 0.27062790023201855, "grad_norm": 0.03207789361476898, "grad_norm_var": 1.3067017695097873e-05, "learning_rate": 0.008471275620359316, "loss": 2.7386, "step": 7465 }, { "crossentropy": 2.6910223960876465, "epoch": 0.27066415313225056, "grad_norm": 0.034719277173280716, "grad_norm_var": 1.3487792379237894e-05, "learning_rate": 0.008470857371237987, "loss": 2.6206, "step": 7466 }, { "crossentropy": 2.589686155319214, "epoch": 0.2707004060324826, "grad_norm": 0.03792726993560791, "grad_norm_var": 1.5672295116918064e-05, "learning_rate": 0.008470439075237462, "loss": 2.6017, "step": 7467 }, { "crossentropy": 2.660566806793213, "epoch": 0.2707366589327146, "grad_norm": 0.04689066857099533, "grad_norm_var": 1.9442110126374858e-05, "learning_rate": 0.008470020732363389, "loss": 2.5893, "step": 7468 }, { "crossentropy": 2.7106266021728516, "epoch": 0.27077291183294666, "grad_norm": 0.03552864491939545, "grad_norm_var": 2.0005616156539455e-05, "learning_rate": 0.00846960234262142, "loss": 2.7254, "step": 7469 }, { "crossentropy": 2.7097983360290527, "epoch": 0.2708091647331787, "grad_norm": 0.03766436502337456, "grad_norm_var": 2.1424655787505978e-05, "learning_rate": 0.008469183906017209, "loss": 2.6714, "step": 7470 }, { "crossentropy": 2.7119758129119873, "epoch": 0.2708454176334107, "grad_norm": 0.03707265108823776, "grad_norm_var": 2.1414376925144896e-05, "learning_rate": 0.008468765422556401, "loss": 2.6578, "step": 7471 }, { "crossentropy": 2.7846250534057617, "epoch": 0.2708816705336427, "grad_norm": 0.036161549389362335, "grad_norm_var": 2.1253157697165627e-05, "learning_rate": 0.008468346892244655, "loss": 2.6569, "step": 7472 }, { "crossentropy": 2.64178729057312, "epoch": 0.2709179234338747, "grad_norm": 0.03164415806531906, "grad_norm_var": 2.12350309974369e-05, "learning_rate": 0.00846792831508762, "loss": 2.671, "step": 7473 }, { "crossentropy": 2.7492189407348633, "epoch": 0.27095417633410673, "grad_norm": 0.031319353729486465, "grad_norm_var": 2.107378284082658e-05, "learning_rate": 0.008467509691090948, "loss": 2.724, "step": 7474 }, { "crossentropy": 2.730097532272339, "epoch": 0.27099042923433875, "grad_norm": 0.032437629997730255, "grad_norm_var": 2.0581197945514318e-05, "learning_rate": 0.008467091020260298, "loss": 2.651, "step": 7475 }, { "crossentropy": 2.507267951965332, "epoch": 0.27102668213457076, "grad_norm": 0.032698288559913635, "grad_norm_var": 1.9338555543732944e-05, "learning_rate": 0.008466672302601322, "loss": 2.6475, "step": 7476 }, { "crossentropy": 2.7259817123413086, "epoch": 0.2710629350348028, "grad_norm": 0.03519107773900032, "grad_norm_var": 1.7410106149932805e-05, "learning_rate": 0.008466253538119676, "loss": 2.6852, "step": 7477 }, { "crossentropy": 2.5308268070220947, "epoch": 0.2710991879350348, "grad_norm": 0.03102007322013378, "grad_norm_var": 1.7599897098216112e-05, "learning_rate": 0.008465834726821015, "loss": 2.5756, "step": 7478 }, { "crossentropy": 2.629373550415039, "epoch": 0.2711354408352668, "grad_norm": 0.03195898234844208, "grad_norm_var": 1.6699802568626005e-05, "learning_rate": 0.008465415868710996, "loss": 2.5423, "step": 7479 }, { "crossentropy": 2.6811728477478027, "epoch": 0.2711716937354988, "grad_norm": 0.030650785192847252, "grad_norm_var": 1.6714941292296447e-05, "learning_rate": 0.008464996963795277, "loss": 2.6155, "step": 7480 }, { "crossentropy": 2.6561596393585205, "epoch": 0.27120794663573083, "grad_norm": 0.03215037286281586, "grad_norm_var": 1.6690073193368197e-05, "learning_rate": 0.008464578012079517, "loss": 2.6241, "step": 7481 }, { "crossentropy": 2.5390634536743164, "epoch": 0.2712441995359629, "grad_norm": 0.03161414712667465, "grad_norm_var": 1.7280440918352133e-05, "learning_rate": 0.008464159013569372, "loss": 2.5932, "step": 7482 }, { "crossentropy": 2.730809211730957, "epoch": 0.2712804524361949, "grad_norm": 0.030942009761929512, "grad_norm_var": 1.7133933823733452e-05, "learning_rate": 0.008463739968270503, "loss": 2.7147, "step": 7483 }, { "crossentropy": 2.58793044090271, "epoch": 0.27131670533642693, "grad_norm": 0.035397592931985855, "grad_norm_var": 5.726302282531394e-06, "learning_rate": 0.008463320876188568, "loss": 2.6224, "step": 7484 }, { "crossentropy": 2.653325080871582, "epoch": 0.27135295823665895, "grad_norm": 0.03490042686462402, "grad_norm_var": 5.567703392423647e-06, "learning_rate": 0.00846290173732923, "loss": 2.5225, "step": 7485 }, { "crossentropy": 2.5601367950439453, "epoch": 0.27138921113689096, "grad_norm": 0.031034095212817192, "grad_norm_var": 4.458273998244009e-06, "learning_rate": 0.00846248255169815, "loss": 2.5609, "step": 7486 }, { "crossentropy": 2.777462959289551, "epoch": 0.271425464037123, "grad_norm": 0.029778102412819862, "grad_norm_var": 3.7130076946532136e-06, "learning_rate": 0.008462063319300986, "loss": 2.7527, "step": 7487 }, { "crossentropy": 2.6211330890655518, "epoch": 0.271461716937355, "grad_norm": 0.030415719375014305, "grad_norm_var": 2.918530834994613e-06, "learning_rate": 0.008461644040143403, "loss": 2.6364, "step": 7488 }, { "crossentropy": 2.6919076442718506, "epoch": 0.271497969837587, "grad_norm": 0.028812624514102936, "grad_norm_var": 3.5811754717195166e-06, "learning_rate": 0.008461224714231066, "loss": 2.6204, "step": 7489 }, { "crossentropy": 2.5705020427703857, "epoch": 0.271534222737819, "grad_norm": 0.029217196628451347, "grad_norm_var": 4.018735976348215e-06, "learning_rate": 0.008460805341569637, "loss": 2.5806, "step": 7490 }, { "crossentropy": 2.649141311645508, "epoch": 0.27157047563805103, "grad_norm": 0.030361268669366837, "grad_norm_var": 4.101613155779943e-06, "learning_rate": 0.008460385922164778, "loss": 2.6712, "step": 7491 }, { "crossentropy": 2.6879801750183105, "epoch": 0.27160672853828305, "grad_norm": 0.03035377711057663, "grad_norm_var": 4.112436638454295e-06, "learning_rate": 0.008459966456022157, "loss": 2.6437, "step": 7492 }, { "crossentropy": 2.6602964401245117, "epoch": 0.27164298143851506, "grad_norm": 0.031246047466993332, "grad_norm_var": 3.136986322675482e-06, "learning_rate": 0.008459546943147439, "loss": 2.6263, "step": 7493 }, { "crossentropy": 2.5715317726135254, "epoch": 0.2716792343387471, "grad_norm": 0.030690036714076996, "grad_norm_var": 3.1535082911318217e-06, "learning_rate": 0.008459127383546287, "loss": 2.6774, "step": 7494 }, { "crossentropy": 2.7807750701904297, "epoch": 0.2717154872389791, "grad_norm": 0.03262133523821831, "grad_norm_var": 3.246172448724029e-06, "learning_rate": 0.008458707777224373, "loss": 2.6621, "step": 7495 }, { "crossentropy": 2.694666862487793, "epoch": 0.27175174013921116, "grad_norm": 0.03018195368349552, "grad_norm_var": 3.2980924570595594e-06, "learning_rate": 0.00845828812418736, "loss": 2.7749, "step": 7496 }, { "crossentropy": 2.538032293319702, "epoch": 0.2717879930394432, "grad_norm": 0.031056877225637436, "grad_norm_var": 3.238970410654405e-06, "learning_rate": 0.008457868424440919, "loss": 2.6073, "step": 7497 }, { "crossentropy": 2.7792749404907227, "epoch": 0.2718242459396752, "grad_norm": 0.030193956568837166, "grad_norm_var": 3.279780605246876e-06, "learning_rate": 0.008457448677990715, "loss": 2.7187, "step": 7498 }, { "crossentropy": 2.6794700622558594, "epoch": 0.2718604988399072, "grad_norm": 0.033361293375492096, "grad_norm_var": 3.602629223650675e-06, "learning_rate": 0.008457028884842422, "loss": 2.6919, "step": 7499 }, { "crossentropy": 2.582082986831665, "epoch": 0.2718967517401392, "grad_norm": 0.03392757102847099, "grad_norm_var": 2.920122325834826e-06, "learning_rate": 0.008456609045001706, "loss": 2.7035, "step": 7500 }, { "crossentropy": 2.6542885303497314, "epoch": 0.27193300464037123, "grad_norm": 0.03155351057648659, "grad_norm_var": 1.939680138471807e-06, "learning_rate": 0.008456189158474242, "loss": 2.6462, "step": 7501 }, { "crossentropy": 2.3828318119049072, "epoch": 0.27196925754060325, "grad_norm": 0.0324929840862751, "grad_norm_var": 2.0938582397643136e-06, "learning_rate": 0.008455769225265695, "loss": 2.5974, "step": 7502 }, { "crossentropy": 2.690004587173462, "epoch": 0.27200551044083526, "grad_norm": 0.03164040297269821, "grad_norm_var": 2.0031120290600032e-06, "learning_rate": 0.008455349245381742, "loss": 2.621, "step": 7503 }, { "crossentropy": 2.675321102142334, "epoch": 0.2720417633410673, "grad_norm": 0.03169209882616997, "grad_norm_var": 1.9828792943706592e-06, "learning_rate": 0.008454929218828055, "loss": 2.7377, "step": 7504 }, { "crossentropy": 2.5701167583465576, "epoch": 0.2720780162412993, "grad_norm": 0.03166383132338524, "grad_norm_var": 1.5785569893354227e-06, "learning_rate": 0.008454509145610304, "loss": 2.5923, "step": 7505 }, { "crossentropy": 2.6026785373687744, "epoch": 0.2721142691415313, "grad_norm": 0.0326111726462841, "grad_norm_var": 1.3148400945563072e-06, "learning_rate": 0.008454089025734166, "loss": 2.5909, "step": 7506 }, { "crossentropy": 2.670344114303589, "epoch": 0.2721505220417633, "grad_norm": 0.029740866273641586, "grad_norm_var": 1.4416133102032025e-06, "learning_rate": 0.008453668859205312, "loss": 2.7124, "step": 7507 }, { "crossentropy": 2.6460888385772705, "epoch": 0.27218677494199534, "grad_norm": 0.029812578111886978, "grad_norm_var": 1.547265609736411e-06, "learning_rate": 0.008453248646029421, "loss": 2.6913, "step": 7508 }, { "crossentropy": 2.7273964881896973, "epoch": 0.2722230278422274, "grad_norm": 0.034360338002443314, "grad_norm_var": 2.035363755421617e-06, "learning_rate": 0.008452828386212165, "loss": 2.7131, "step": 7509 }, { "crossentropy": 2.7124063968658447, "epoch": 0.2722592807424594, "grad_norm": 0.03770296275615692, "grad_norm_var": 4.141387879164534e-06, "learning_rate": 0.008452408079759223, "loss": 2.6734, "step": 7510 }, { "crossentropy": 2.663879632949829, "epoch": 0.27229553364269143, "grad_norm": 0.03545791655778885, "grad_norm_var": 4.8174868242406e-06, "learning_rate": 0.00845198772667627, "loss": 2.679, "step": 7511 }, { "crossentropy": 2.615756034851074, "epoch": 0.27233178654292345, "grad_norm": 0.04536973685026169, "grad_norm_var": 1.4862853508216264e-05, "learning_rate": 0.008451567326968985, "loss": 2.7342, "step": 7512 }, { "crossentropy": 2.62158465385437, "epoch": 0.27236803944315546, "grad_norm": 0.03389713168144226, "grad_norm_var": 1.452140396010422e-05, "learning_rate": 0.008451146880643044, "loss": 2.6082, "step": 7513 }, { "crossentropy": 2.586160659790039, "epoch": 0.2724042923433875, "grad_norm": 0.03099859692156315, "grad_norm_var": 1.4210677041856162e-05, "learning_rate": 0.008450726387704128, "loss": 2.6999, "step": 7514 }, { "crossentropy": 2.727973222732544, "epoch": 0.2724405452436195, "grad_norm": 0.03384103626012802, "grad_norm_var": 1.42150577886296e-05, "learning_rate": 0.008450305848157915, "loss": 2.6974, "step": 7515 }, { "crossentropy": 2.750196695327759, "epoch": 0.2724767981438515, "grad_norm": 0.035445086658000946, "grad_norm_var": 1.4435853389003804e-05, "learning_rate": 0.008449885262010084, "loss": 2.5569, "step": 7516 }, { "crossentropy": 2.6148223876953125, "epoch": 0.2725130510440835, "grad_norm": 0.03361102193593979, "grad_norm_var": 1.4127351060191324e-05, "learning_rate": 0.00844946462926632, "loss": 2.6771, "step": 7517 }, { "crossentropy": 2.800139904022217, "epoch": 0.27254930394431554, "grad_norm": 0.03228040039539337, "grad_norm_var": 1.416640338248221e-05, "learning_rate": 0.008449043949932301, "loss": 2.6942, "step": 7518 }, { "crossentropy": 2.60526704788208, "epoch": 0.27258555684454755, "grad_norm": 0.03401026874780655, "grad_norm_var": 1.3848352834677533e-05, "learning_rate": 0.008448623224013707, "loss": 2.5734, "step": 7519 }, { "crossentropy": 2.6663098335266113, "epoch": 0.27262180974477956, "grad_norm": 0.0328885093331337, "grad_norm_var": 1.358466013115348e-05, "learning_rate": 0.008448202451516226, "loss": 2.6891, "step": 7520 }, { "crossentropy": 2.5501232147216797, "epoch": 0.2726580626450116, "grad_norm": 0.03596838191151619, "grad_norm_var": 1.3412979448946039e-05, "learning_rate": 0.008447781632445535, "loss": 2.5295, "step": 7521 }, { "crossentropy": 2.6683526039123535, "epoch": 0.2726943155452436, "grad_norm": 0.03432971239089966, "grad_norm_var": 1.322210418081778e-05, "learning_rate": 0.008447360766807323, "loss": 2.6683, "step": 7522 }, { "crossentropy": 2.750131130218506, "epoch": 0.27273056844547566, "grad_norm": 0.03226151689887047, "grad_norm_var": 1.2067734318078301e-05, "learning_rate": 0.008446939854607273, "loss": 2.7705, "step": 7523 }, { "crossentropy": 2.791191339492798, "epoch": 0.2727668213457077, "grad_norm": 0.03030247986316681, "grad_norm_var": 1.1775590869977332e-05, "learning_rate": 0.008446518895851067, "loss": 2.7158, "step": 7524 }, { "crossentropy": 2.7051095962524414, "epoch": 0.2728030742459397, "grad_norm": 0.03216542303562164, "grad_norm_var": 1.2130829651605284e-05, "learning_rate": 0.008446097890544393, "loss": 2.6943, "step": 7525 }, { "crossentropy": 2.571187973022461, "epoch": 0.2728393271461717, "grad_norm": 0.03313875570893288, "grad_norm_var": 1.1427726143810887e-05, "learning_rate": 0.008445676838692939, "loss": 2.6587, "step": 7526 }, { "crossentropy": 2.6104023456573486, "epoch": 0.2728755800464037, "grad_norm": 0.03877295181155205, "grad_norm_var": 1.2704662645050231e-05, "learning_rate": 0.00844525574030239, "loss": 2.6227, "step": 7527 }, { "crossentropy": 2.5877954959869385, "epoch": 0.27291183294663574, "grad_norm": 0.038302648812532425, "grad_norm_var": 5.423699852272303e-06, "learning_rate": 0.008444834595378434, "loss": 2.6209, "step": 7528 }, { "crossentropy": 2.586836099624634, "epoch": 0.27294808584686775, "grad_norm": 0.030392194166779518, "grad_norm_var": 6.187392054951095e-06, "learning_rate": 0.008444413403926758, "loss": 2.6056, "step": 7529 }, { "crossentropy": 2.588446617126465, "epoch": 0.27298433874709976, "grad_norm": 0.03101530112326145, "grad_norm_var": 6.181461206896758e-06, "learning_rate": 0.008443992165953053, "loss": 2.5788, "step": 7530 }, { "crossentropy": 2.6500051021575928, "epoch": 0.2730205916473318, "grad_norm": 0.02870423160493374, "grad_norm_var": 7.713733384673048e-06, "learning_rate": 0.008443570881463006, "loss": 2.6598, "step": 7531 }, { "crossentropy": 2.5586910247802734, "epoch": 0.2730568445475638, "grad_norm": 0.03181726858019829, "grad_norm_var": 7.522551406525139e-06, "learning_rate": 0.00844314955046231, "loss": 2.6075, "step": 7532 }, { "crossentropy": 2.4990427494049072, "epoch": 0.2730930974477958, "grad_norm": 0.03576432913541794, "grad_norm_var": 7.952586389253765e-06, "learning_rate": 0.008442728172956653, "loss": 2.5181, "step": 7533 }, { "crossentropy": 2.561558961868286, "epoch": 0.2731293503480278, "grad_norm": 0.03345893323421478, "grad_norm_var": 7.885911179666495e-06, "learning_rate": 0.008442306748951727, "loss": 2.6605, "step": 7534 }, { "crossentropy": 2.6235439777374268, "epoch": 0.27316560324825984, "grad_norm": 0.0339236818253994, "grad_norm_var": 7.878535422990356e-06, "learning_rate": 0.008441885278453227, "loss": 2.6458, "step": 7535 }, { "crossentropy": 2.6485037803649902, "epoch": 0.2732018561484919, "grad_norm": 0.03178749978542328, "grad_norm_var": 8.018434666392594e-06, "learning_rate": 0.00844146376146684, "loss": 2.6789, "step": 7536 }, { "crossentropy": 2.69321608543396, "epoch": 0.2732381090487239, "grad_norm": 0.03953862190246582, "grad_norm_var": 1.0106001610974884e-05, "learning_rate": 0.008441042197998265, "loss": 2.721, "step": 7537 }, { "crossentropy": 2.6111249923706055, "epoch": 0.27327436194895594, "grad_norm": 0.033705804497003555, "grad_norm_var": 1.005962165425583e-05, "learning_rate": 0.008440620588053192, "loss": 2.7268, "step": 7538 }, { "crossentropy": 2.6720190048217773, "epoch": 0.27331061484918795, "grad_norm": 0.030211513862013817, "grad_norm_var": 1.0644596758162493e-05, "learning_rate": 0.008440198931637315, "loss": 2.649, "step": 7539 }, { "crossentropy": 2.5454118251800537, "epoch": 0.27334686774941996, "grad_norm": 0.029647022485733032, "grad_norm_var": 1.0934515887679047e-05, "learning_rate": 0.008439777228756334, "loss": 2.6189, "step": 7540 }, { "crossentropy": 2.6946377754211426, "epoch": 0.273383120649652, "grad_norm": 0.031232425943017006, "grad_norm_var": 1.1126533618364949e-05, "learning_rate": 0.008439355479415938, "loss": 2.6244, "step": 7541 }, { "crossentropy": 2.6081650257110596, "epoch": 0.273419373549884, "grad_norm": 0.03165179863572121, "grad_norm_var": 1.1279507687609217e-05, "learning_rate": 0.008438933683621828, "loss": 2.7031, "step": 7542 }, { "crossentropy": 2.8093271255493164, "epoch": 0.273455626450116, "grad_norm": 0.032506201416254044, "grad_norm_var": 9.010924533274843e-06, "learning_rate": 0.008438511841379701, "loss": 2.6974, "step": 7543 }, { "crossentropy": 2.6186790466308594, "epoch": 0.273491879350348, "grad_norm": 0.02981053665280342, "grad_norm_var": 7.2069123964909785e-06, "learning_rate": 0.008438089952695254, "loss": 2.5676, "step": 7544 }, { "crossentropy": 2.6731693744659424, "epoch": 0.27352813225058004, "grad_norm": 0.029847797006368637, "grad_norm_var": 7.356509274192696e-06, "learning_rate": 0.008437668017574182, "loss": 2.6861, "step": 7545 }, { "crossentropy": 2.679744243621826, "epoch": 0.27356438515081205, "grad_norm": 0.0323190875351429, "grad_norm_var": 7.263073946531352e-06, "learning_rate": 0.00843724603602219, "loss": 2.6716, "step": 7546 }, { "crossentropy": 2.733564853668213, "epoch": 0.27360063805104406, "grad_norm": 0.034252941608428955, "grad_norm_var": 6.567463459010436e-06, "learning_rate": 0.008436824008044972, "loss": 2.724, "step": 7547 }, { "crossentropy": 2.4954216480255127, "epoch": 0.2736368909512761, "grad_norm": 0.033805329352617264, "grad_norm_var": 6.609068418101059e-06, "learning_rate": 0.00843640193364823, "loss": 2.5093, "step": 7548 }, { "crossentropy": 2.74503231048584, "epoch": 0.2736731438515081, "grad_norm": 0.031119026243686676, "grad_norm_var": 6.069982194299485e-06, "learning_rate": 0.008435979812837667, "loss": 2.7157, "step": 7549 }, { "crossentropy": 2.575515031814575, "epoch": 0.27370939675174016, "grad_norm": 0.030747810378670692, "grad_norm_var": 6.1560312377931105e-06, "learning_rate": 0.008435557645618978, "loss": 2.6185, "step": 7550 }, { "crossentropy": 2.6725308895111084, "epoch": 0.2737456496519722, "grad_norm": 0.029979145154356956, "grad_norm_var": 6.2517590602309894e-06, "learning_rate": 0.008435135431997873, "loss": 2.5991, "step": 7551 }, { "crossentropy": 2.6059184074401855, "epoch": 0.2737819025522042, "grad_norm": 0.03068404085934162, "grad_norm_var": 6.360619953113761e-06, "learning_rate": 0.00843471317198005, "loss": 2.6393, "step": 7552 }, { "crossentropy": 2.6743662357330322, "epoch": 0.2738181554524362, "grad_norm": 0.030893424525856972, "grad_norm_var": 2.274332938364404e-06, "learning_rate": 0.008434290865571214, "loss": 2.6415, "step": 7553 }, { "crossentropy": 2.6085305213928223, "epoch": 0.2738544083526682, "grad_norm": 0.030677588656544685, "grad_norm_var": 1.9168180668874357e-06, "learning_rate": 0.008433868512777068, "loss": 2.5847, "step": 7554 }, { "crossentropy": 2.7580020427703857, "epoch": 0.27389066125290024, "grad_norm": 0.030067134648561478, "grad_norm_var": 1.937373230380806e-06, "learning_rate": 0.008433446113603317, "loss": 2.6614, "step": 7555 }, { "crossentropy": 2.646322250366211, "epoch": 0.27392691415313225, "grad_norm": 0.03117894008755684, "grad_norm_var": 1.766314596258802e-06, "learning_rate": 0.008433023668055664, "loss": 2.6934, "step": 7556 }, { "crossentropy": 2.7214460372924805, "epoch": 0.27396316705336426, "grad_norm": 0.03429688140749931, "grad_norm_var": 2.3263183557094365e-06, "learning_rate": 0.008432601176139818, "loss": 2.7597, "step": 7557 }, { "crossentropy": 2.615746021270752, "epoch": 0.2739994199535963, "grad_norm": 0.0350792296230793, "grad_norm_var": 3.1345301863120912e-06, "learning_rate": 0.008432178637861483, "loss": 2.7105, "step": 7558 }, { "crossentropy": 2.5755860805511475, "epoch": 0.2740356728538283, "grad_norm": 0.033816538751125336, "grad_norm_var": 3.3819834338537676e-06, "learning_rate": 0.008431756053226366, "loss": 2.4936, "step": 7559 }, { "crossentropy": 2.677452325820923, "epoch": 0.2740719257540603, "grad_norm": 0.031735289841890335, "grad_norm_var": 3.1065638094467425e-06, "learning_rate": 0.008431333422240178, "loss": 2.6208, "step": 7560 }, { "crossentropy": 2.6337051391601562, "epoch": 0.2741081786542923, "grad_norm": 0.030872724950313568, "grad_norm_var": 2.8909147507604938e-06, "learning_rate": 0.008430910744908624, "loss": 2.5742, "step": 7561 }, { "crossentropy": 2.717593193054199, "epoch": 0.27414443155452434, "grad_norm": 0.03362531214952469, "grad_norm_var": 3.058296022450978e-06, "learning_rate": 0.008430488021237413, "loss": 2.6602, "step": 7562 }, { "crossentropy": 2.6485493183135986, "epoch": 0.2741806844547564, "grad_norm": 0.03283446654677391, "grad_norm_var": 2.767778800723365e-06, "learning_rate": 0.008430065251232256, "loss": 2.6367, "step": 7563 }, { "crossentropy": 2.7621710300445557, "epoch": 0.2742169373549884, "grad_norm": 0.0319182313978672, "grad_norm_var": 2.5268726337437555e-06, "learning_rate": 0.008429642434898863, "loss": 2.7011, "step": 7564 }, { "crossentropy": 2.6061434745788574, "epoch": 0.27425319025522044, "grad_norm": 0.030370887368917465, "grad_norm_var": 2.634307912877789e-06, "learning_rate": 0.008429219572242944, "loss": 2.6345, "step": 7565 }, { "crossentropy": 2.5952916145324707, "epoch": 0.27428944315545245, "grad_norm": 0.03193448856472969, "grad_norm_var": 2.5560603914710765e-06, "learning_rate": 0.00842879666327021, "loss": 2.5604, "step": 7566 }, { "crossentropy": 2.79899001121521, "epoch": 0.27432569605568446, "grad_norm": 0.035447340458631516, "grad_norm_var": 3.04425461252245e-06, "learning_rate": 0.008428373707986374, "loss": 2.7386, "step": 7567 }, { "crossentropy": 2.533517837524414, "epoch": 0.2743619489559165, "grad_norm": 0.032041240483522415, "grad_norm_var": 2.8824213499927438e-06, "learning_rate": 0.00842795070639715, "loss": 2.5623, "step": 7568 }, { "crossentropy": 2.71384334564209, "epoch": 0.2743982018561485, "grad_norm": 0.03028767555952072, "grad_norm_var": 3.018906911527577e-06, "learning_rate": 0.008427527658508247, "loss": 2.7135, "step": 7569 }, { "crossentropy": 2.6270313262939453, "epoch": 0.2744344547563805, "grad_norm": 0.03167722001671791, "grad_norm_var": 2.870250759064626e-06, "learning_rate": 0.008427104564325383, "loss": 2.6354, "step": 7570 }, { "crossentropy": 2.583807945251465, "epoch": 0.2744707076566125, "grad_norm": 0.035136397927999496, "grad_norm_var": 2.9509376784362292e-06, "learning_rate": 0.00842668142385427, "loss": 2.5747, "step": 7571 }, { "crossentropy": 2.76827335357666, "epoch": 0.27450696055684454, "grad_norm": 0.03130514547228813, "grad_norm_var": 2.9273338183551562e-06, "learning_rate": 0.008426258237100623, "loss": 2.6904, "step": 7572 }, { "crossentropy": 2.6727864742279053, "epoch": 0.27454321345707655, "grad_norm": 0.03259760141372681, "grad_norm_var": 2.7343744569987536e-06, "learning_rate": 0.008425835004070164, "loss": 2.586, "step": 7573 }, { "crossentropy": 2.5601930618286133, "epoch": 0.27457946635730857, "grad_norm": 0.03281629830598831, "grad_norm_var": 2.2890314866424547e-06, "learning_rate": 0.008425411724768598, "loss": 2.571, "step": 7574 }, { "crossentropy": 2.6648709774017334, "epoch": 0.2746157192575406, "grad_norm": 0.03348461911082268, "grad_norm_var": 2.2332735136085595e-06, "learning_rate": 0.008424988399201654, "loss": 2.5989, "step": 7575 }, { "crossentropy": 2.7058427333831787, "epoch": 0.2746519721577726, "grad_norm": 0.030157404020428658, "grad_norm_var": 2.524583395666644e-06, "learning_rate": 0.00842456502737504, "loss": 2.5902, "step": 7576 }, { "crossentropy": 2.602243423461914, "epoch": 0.27468822505800466, "grad_norm": 0.031034458428621292, "grad_norm_var": 2.495834658739722e-06, "learning_rate": 0.00842414160929448, "loss": 2.5332, "step": 7577 }, { "crossentropy": 2.6616973876953125, "epoch": 0.2747244779582367, "grad_norm": 0.03904273733496666, "grad_norm_var": 5.293343186769346e-06, "learning_rate": 0.008423718144965692, "loss": 2.6226, "step": 7578 }, { "crossentropy": 2.779210090637207, "epoch": 0.2747607308584687, "grad_norm": 0.0393533781170845, "grad_norm_var": 8.126738617604416e-06, "learning_rate": 0.008423294634394394, "loss": 2.7421, "step": 7579 }, { "crossentropy": 2.646357536315918, "epoch": 0.2747969837587007, "grad_norm": 0.042272672057151794, "grad_norm_var": 1.3281945693626575e-05, "learning_rate": 0.008422871077586307, "loss": 2.6609, "step": 7580 }, { "crossentropy": 2.7432289123535156, "epoch": 0.2748332366589327, "grad_norm": 0.04061006382107735, "grad_norm_var": 1.531002420748598e-05, "learning_rate": 0.008422447474547151, "loss": 2.7081, "step": 7581 }, { "crossentropy": 2.632978916168213, "epoch": 0.27486948955916474, "grad_norm": 0.03448903560638428, "grad_norm_var": 1.4903684730039807e-05, "learning_rate": 0.00842202382528265, "loss": 2.5971, "step": 7582 }, { "crossentropy": 2.5428130626678467, "epoch": 0.27490574245939675, "grad_norm": 0.03221616521477699, "grad_norm_var": 1.5141436088823599e-05, "learning_rate": 0.008421600129798522, "loss": 2.6957, "step": 7583 }, { "crossentropy": 2.5957045555114746, "epoch": 0.27494199535962877, "grad_norm": 0.031873226165771484, "grad_norm_var": 1.5193411839617292e-05, "learning_rate": 0.008421176388100493, "loss": 2.6323, "step": 7584 }, { "crossentropy": 2.7882678508758545, "epoch": 0.2749782482598608, "grad_norm": 0.030918626114726067, "grad_norm_var": 1.488309373913338e-05, "learning_rate": 0.008420752600194283, "loss": 2.787, "step": 7585 }, { "crossentropy": 2.799651622772217, "epoch": 0.2750145011600928, "grad_norm": 0.029306253418326378, "grad_norm_var": 1.6067229111197063e-05, "learning_rate": 0.008420328766085622, "loss": 2.7387, "step": 7586 }, { "crossentropy": 2.757023572921753, "epoch": 0.2750507540603248, "grad_norm": 0.030716408044099808, "grad_norm_var": 1.6714818028769905e-05, "learning_rate": 0.008419904885780228, "loss": 2.8547, "step": 7587 }, { "crossentropy": 2.590670347213745, "epoch": 0.2750870069605568, "grad_norm": 0.031116103753447533, "grad_norm_var": 1.6782131969326983e-05, "learning_rate": 0.008419480959283828, "loss": 2.5969, "step": 7588 }, { "crossentropy": 2.7232298851013184, "epoch": 0.27512325986078884, "grad_norm": 0.03199021518230438, "grad_norm_var": 1.6908664820436344e-05, "learning_rate": 0.00841905698660215, "loss": 2.7286, "step": 7589 }, { "crossentropy": 2.540435791015625, "epoch": 0.2751595127610209, "grad_norm": 0.02897847443819046, "grad_norm_var": 1.8351704850253814e-05, "learning_rate": 0.008418632967740917, "loss": 2.567, "step": 7590 }, { "crossentropy": 2.6917948722839355, "epoch": 0.2751957656612529, "grad_norm": 0.03302934393286705, "grad_norm_var": 1.83715112132623e-05, "learning_rate": 0.00841820890270586, "loss": 2.6577, "step": 7591 }, { "crossentropy": 2.6845805644989014, "epoch": 0.27523201856148494, "grad_norm": 0.03457183018326759, "grad_norm_var": 1.758140599488257e-05, "learning_rate": 0.008417784791502703, "loss": 2.6204, "step": 7592 }, { "crossentropy": 2.679114580154419, "epoch": 0.27526827146171695, "grad_norm": 0.03654037415981293, "grad_norm_var": 1.74128658321187e-05, "learning_rate": 0.008417360634137174, "loss": 2.6941, "step": 7593 }, { "crossentropy": 2.4402129650115967, "epoch": 0.27530452436194897, "grad_norm": 0.030260905623435974, "grad_norm_var": 1.6549673834709158e-05, "learning_rate": 0.008416936430615007, "loss": 2.5267, "step": 7594 }, { "crossentropy": 2.6948113441467285, "epoch": 0.275340777262181, "grad_norm": 0.03168586269021034, "grad_norm_var": 1.438330627235953e-05, "learning_rate": 0.008416512180941927, "loss": 2.7325, "step": 7595 }, { "crossentropy": 2.7541589736938477, "epoch": 0.275377030162413, "grad_norm": 0.03769092634320259, "grad_norm_var": 1.012899884719139e-05, "learning_rate": 0.008416087885123665, "loss": 2.6479, "step": 7596 }, { "crossentropy": 2.6824758052825928, "epoch": 0.275413283062645, "grad_norm": 0.03896297141909599, "grad_norm_var": 8.599755722228176e-06, "learning_rate": 0.008415663543165952, "loss": 2.691, "step": 7597 }, { "crossentropy": 2.613933801651001, "epoch": 0.275449535962877, "grad_norm": 0.039355918765068054, "grad_norm_var": 1.1194594142307811e-05, "learning_rate": 0.008415239155074519, "loss": 2.598, "step": 7598 }, { "crossentropy": 2.7380154132843018, "epoch": 0.27548578886310904, "grad_norm": 0.03220495581626892, "grad_norm_var": 1.1195886869226724e-05, "learning_rate": 0.0084148147208551, "loss": 2.6319, "step": 7599 }, { "crossentropy": 2.741194248199463, "epoch": 0.27552204176334105, "grad_norm": 0.030952515080571175, "grad_norm_var": 1.1396418591686809e-05, "learning_rate": 0.008414390240513422, "loss": 2.7226, "step": 7600 }, { "crossentropy": 2.730292558670044, "epoch": 0.27555829466357307, "grad_norm": 0.035517096519470215, "grad_norm_var": 1.1431093392503842e-05, "learning_rate": 0.008413965714055225, "loss": 2.7354, "step": 7601 }, { "crossentropy": 2.694819450378418, "epoch": 0.2755945475638051, "grad_norm": 0.03463825583457947, "grad_norm_var": 1.036513357461378e-05, "learning_rate": 0.008413541141486241, "loss": 2.7098, "step": 7602 }, { "crossentropy": 2.673752784729004, "epoch": 0.2756308004640371, "grad_norm": 0.0358361154794693, "grad_norm_var": 1.000880947524026e-05, "learning_rate": 0.008413116522812203, "loss": 2.686, "step": 7603 }, { "crossentropy": 2.702967882156372, "epoch": 0.27566705336426917, "grad_norm": 0.03162464126944542, "grad_norm_var": 9.832261464420946e-06, "learning_rate": 0.008412691858038846, "loss": 2.6941, "step": 7604 }, { "crossentropy": 2.506406784057617, "epoch": 0.2757033062645012, "grad_norm": 0.031954582780599594, "grad_norm_var": 9.84184188972125e-06, "learning_rate": 0.008412267147171908, "loss": 2.581, "step": 7605 }, { "crossentropy": 2.745393753051758, "epoch": 0.2757395591647332, "grad_norm": 0.02878064289689064, "grad_norm_var": 9.97642160280609e-06, "learning_rate": 0.008411842390217123, "loss": 2.6658, "step": 7606 }, { "crossentropy": 2.5345640182495117, "epoch": 0.2757758120649652, "grad_norm": 0.029686203226447105, "grad_norm_var": 1.1096679947925475e-05, "learning_rate": 0.008411417587180229, "loss": 2.6042, "step": 7607 }, { "crossentropy": 2.750406265258789, "epoch": 0.2758120649651972, "grad_norm": 0.029653504490852356, "grad_norm_var": 1.2080425330886908e-05, "learning_rate": 0.008410992738066963, "loss": 2.6773, "step": 7608 }, { "crossentropy": 2.7035434246063232, "epoch": 0.27584831786542924, "grad_norm": 0.03181377053260803, "grad_norm_var": 1.1534857620937542e-05, "learning_rate": 0.008410567842883063, "loss": 2.6765, "step": 7609 }, { "crossentropy": 2.5587730407714844, "epoch": 0.27588457076566125, "grad_norm": 0.0341818742454052, "grad_norm_var": 1.0978174371715533e-05, "learning_rate": 0.00841014290163427, "loss": 2.6314, "step": 7610 }, { "crossentropy": 2.7817955017089844, "epoch": 0.27592082366589327, "grad_norm": 0.030268564820289612, "grad_norm_var": 1.142929754798655e-05, "learning_rate": 0.008409717914326322, "loss": 2.696, "step": 7611 }, { "crossentropy": 2.4668188095092773, "epoch": 0.2759570765661253, "grad_norm": 0.029868120327591896, "grad_norm_var": 1.069517695295416e-05, "learning_rate": 0.00840929288096496, "loss": 2.5643, "step": 7612 }, { "crossentropy": 2.6896517276763916, "epoch": 0.2759933294663573, "grad_norm": 0.03136120364069939, "grad_norm_var": 8.091916681143017e-06, "learning_rate": 0.008408867801555921, "loss": 2.7423, "step": 7613 }, { "crossentropy": 2.5181021690368652, "epoch": 0.2760295823665893, "grad_norm": 0.03464376553893089, "grad_norm_var": 5.081809483964402e-06, "learning_rate": 0.008408442676104952, "loss": 2.5394, "step": 7614 }, { "crossentropy": 2.4033637046813965, "epoch": 0.2760658352668213, "grad_norm": 0.033086780458688736, "grad_norm_var": 5.1472641333370844e-06, "learning_rate": 0.00840801750461779, "loss": 2.5596, "step": 7615 }, { "crossentropy": 2.5993518829345703, "epoch": 0.27610208816705334, "grad_norm": 0.03242015093564987, "grad_norm_var": 5.054067704021667e-06, "learning_rate": 0.00840759228710018, "loss": 2.6827, "step": 7616 }, { "crossentropy": 2.6059887409210205, "epoch": 0.2761383410672854, "grad_norm": 0.03860827907919884, "grad_norm_var": 7.014962980350879e-06, "learning_rate": 0.008407167023557868, "loss": 2.6546, "step": 7617 }, { "crossentropy": 2.6732771396636963, "epoch": 0.2761745939675174, "grad_norm": 0.03565644845366478, "grad_norm_var": 7.383396679057295e-06, "learning_rate": 0.008406741713996592, "loss": 2.6701, "step": 7618 }, { "crossentropy": 2.4388797283172607, "epoch": 0.27621084686774944, "grad_norm": 0.034788258373737335, "grad_norm_var": 6.981069575136221e-06, "learning_rate": 0.0084063163584221, "loss": 2.5096, "step": 7619 }, { "crossentropy": 2.633396625518799, "epoch": 0.27624709976798145, "grad_norm": 0.03171520680189133, "grad_norm_var": 6.9722218590161086e-06, "learning_rate": 0.008405890956840136, "loss": 2.679, "step": 7620 }, { "crossentropy": 2.6747078895568848, "epoch": 0.27628335266821347, "grad_norm": 0.031044533476233482, "grad_norm_var": 7.078693091761461e-06, "learning_rate": 0.008405465509256445, "loss": 2.6156, "step": 7621 }, { "crossentropy": 2.68332576751709, "epoch": 0.2763196055684455, "grad_norm": 0.029950233176350594, "grad_norm_var": 6.60778589075169e-06, "learning_rate": 0.008405040015676775, "loss": 2.756, "step": 7622 }, { "crossentropy": 2.6636464595794678, "epoch": 0.2763558584686775, "grad_norm": 0.03158113732933998, "grad_norm_var": 6.141069303115564e-06, "learning_rate": 0.008404614476106874, "loss": 2.6784, "step": 7623 }, { "crossentropy": 2.92425537109375, "epoch": 0.2763921113689095, "grad_norm": 0.03166964650154114, "grad_norm_var": 5.619145668704625e-06, "learning_rate": 0.008404188890552486, "loss": 2.7827, "step": 7624 }, { "crossentropy": 2.521850824356079, "epoch": 0.2764283642691415, "grad_norm": 0.033190954476594925, "grad_norm_var": 5.581172511661065e-06, "learning_rate": 0.008403763259019361, "loss": 2.6462, "step": 7625 }, { "crossentropy": 2.623321294784546, "epoch": 0.27646461716937354, "grad_norm": 0.0358467772603035, "grad_norm_var": 6.071786172359922e-06, "learning_rate": 0.00840333758151325, "loss": 2.5468, "step": 7626 }, { "crossentropy": 2.515425205230713, "epoch": 0.27650087006960555, "grad_norm": 0.0332287922501564, "grad_norm_var": 5.5981172752992406e-06, "learning_rate": 0.008402911858039897, "loss": 2.6488, "step": 7627 }, { "crossentropy": 2.789428949356079, "epoch": 0.27653712296983757, "grad_norm": 0.03651010990142822, "grad_norm_var": 5.545233697006914e-06, "learning_rate": 0.008402486088605056, "loss": 2.6691, "step": 7628 }, { "crossentropy": 2.7547457218170166, "epoch": 0.2765733758700696, "grad_norm": 0.041972093284130096, "grad_norm_var": 9.617927724388936e-06, "learning_rate": 0.008402060273214478, "loss": 2.8122, "step": 7629 }, { "crossentropy": 2.502634286880493, "epoch": 0.2766096287703016, "grad_norm": 0.031173160299658775, "grad_norm_var": 1.0128177783243426e-05, "learning_rate": 0.008401634411873913, "loss": 2.5423, "step": 7630 }, { "crossentropy": 2.7264633178710938, "epoch": 0.27664588167053367, "grad_norm": 0.03104328364133835, "grad_norm_var": 1.061146990375809e-05, "learning_rate": 0.008401208504589112, "loss": 2.686, "step": 7631 }, { "crossentropy": 2.65600848197937, "epoch": 0.2766821345707657, "grad_norm": 0.03003709204494953, "grad_norm_var": 1.1396878293112867e-05, "learning_rate": 0.00840078255136583, "loss": 2.6712, "step": 7632 }, { "crossentropy": 2.6165614128112793, "epoch": 0.2767183874709977, "grad_norm": 0.031190108507871628, "grad_norm_var": 9.908287627310997e-06, "learning_rate": 0.008400356552209819, "loss": 2.6304, "step": 7633 }, { "crossentropy": 2.6872453689575195, "epoch": 0.2767546403712297, "grad_norm": 0.031275659799575806, "grad_norm_var": 9.650937300722101e-06, "learning_rate": 0.008399930507126832, "loss": 2.7103, "step": 7634 }, { "crossentropy": 2.6660358905792236, "epoch": 0.2767908932714617, "grad_norm": 0.029063580557703972, "grad_norm_var": 1.024916597819624e-05, "learning_rate": 0.008399504416122623, "loss": 2.6652, "step": 7635 }, { "crossentropy": 2.7391762733459473, "epoch": 0.27682714617169374, "grad_norm": 0.031222647055983543, "grad_norm_var": 1.0317891438099923e-05, "learning_rate": 0.00839907827920295, "loss": 2.637, "step": 7636 }, { "crossentropy": 2.7085700035095215, "epoch": 0.27686339907192575, "grad_norm": 0.03365342691540718, "grad_norm_var": 1.0237003280766479e-05, "learning_rate": 0.008398652096373565, "loss": 2.6771, "step": 7637 }, { "crossentropy": 2.5709002017974854, "epoch": 0.27689965197215777, "grad_norm": 0.03832840919494629, "grad_norm_var": 1.1593663726407593e-05, "learning_rate": 0.008398225867640226, "loss": 2.6438, "step": 7638 }, { "crossentropy": 2.6021759510040283, "epoch": 0.2769359048723898, "grad_norm": 0.038327883929014206, "grad_norm_var": 1.2994283695946275e-05, "learning_rate": 0.008397799593008688, "loss": 2.6012, "step": 7639 }, { "crossentropy": 2.684670925140381, "epoch": 0.2769721577726218, "grad_norm": 0.03686852008104324, "grad_norm_var": 1.3339673905721622e-05, "learning_rate": 0.008397373272484713, "loss": 2.6856, "step": 7640 }, { "crossentropy": 2.621882915496826, "epoch": 0.2770084106728538, "grad_norm": 0.03277667239308357, "grad_norm_var": 1.3391405113399347e-05, "learning_rate": 0.008396946906074055, "loss": 2.626, "step": 7641 }, { "crossentropy": 2.733680009841919, "epoch": 0.2770446635730858, "grad_norm": 0.03316999226808548, "grad_norm_var": 1.3147051859696036e-05, "learning_rate": 0.008396520493782474, "loss": 2.7118, "step": 7642 }, { "crossentropy": 2.5603444576263428, "epoch": 0.27708091647331784, "grad_norm": 0.03469487279653549, "grad_norm_var": 1.3181441808517562e-05, "learning_rate": 0.008396094035615729, "loss": 2.6412, "step": 7643 }, { "crossentropy": 2.7066502571105957, "epoch": 0.2771171693735499, "grad_norm": 0.0342484675347805, "grad_norm_var": 1.2693456130817108e-05, "learning_rate": 0.00839566753157958, "loss": 2.8312, "step": 7644 }, { "crossentropy": 2.647339344024658, "epoch": 0.2771534222737819, "grad_norm": 0.033037442713975906, "grad_norm_var": 7.816794076372497e-06, "learning_rate": 0.00839524098167979, "loss": 2.7389, "step": 7645 }, { "crossentropy": 2.605280637741089, "epoch": 0.27718967517401394, "grad_norm": 0.03555142134428024, "grad_norm_var": 7.87138744868265e-06, "learning_rate": 0.008394814385922116, "loss": 2.6354, "step": 7646 }, { "crossentropy": 2.702558755874634, "epoch": 0.27722592807424595, "grad_norm": 0.034320101141929626, "grad_norm_var": 7.5103692343683375e-06, "learning_rate": 0.008394387744312323, "loss": 2.6443, "step": 7647 }, { "crossentropy": 2.631221055984497, "epoch": 0.27726218097447797, "grad_norm": 0.0324857234954834, "grad_norm_var": 6.718479993301993e-06, "learning_rate": 0.008393961056856173, "loss": 2.6885, "step": 7648 }, { "crossentropy": 2.6412227153778076, "epoch": 0.27729843387471, "grad_norm": 0.031445473432540894, "grad_norm_var": 6.634937450407154e-06, "learning_rate": 0.008393534323559427, "loss": 2.7221, "step": 7649 }, { "crossentropy": 2.7455391883850098, "epoch": 0.277334686774942, "grad_norm": 0.030752280727028847, "grad_norm_var": 6.826778040824631e-06, "learning_rate": 0.008393107544427849, "loss": 2.7404, "step": 7650 }, { "crossentropy": 2.6419014930725098, "epoch": 0.277370939675174, "grad_norm": 0.03013833984732628, "grad_norm_var": 6.22787799216216e-06, "learning_rate": 0.008392680719467208, "loss": 2.6703, "step": 7651 }, { "crossentropy": 2.66263747215271, "epoch": 0.277407192575406, "grad_norm": 0.0346776582300663, "grad_norm_var": 5.7802600413439044e-06, "learning_rate": 0.008392253848683263, "loss": 2.6154, "step": 7652 }, { "crossentropy": 2.6656699180603027, "epoch": 0.27744344547563804, "grad_norm": 0.03066956251859665, "grad_norm_var": 6.486462128568954e-06, "learning_rate": 0.008391826932081781, "loss": 2.7136, "step": 7653 }, { "crossentropy": 2.7018628120422363, "epoch": 0.27747969837587005, "grad_norm": 0.03133322298526764, "grad_norm_var": 5.361529573711743e-06, "learning_rate": 0.00839139996966853, "loss": 2.7009, "step": 7654 }, { "crossentropy": 2.654979705810547, "epoch": 0.27751595127610207, "grad_norm": 0.036514975130558014, "grad_norm_var": 4.377245618378253e-06, "learning_rate": 0.008390972961449274, "loss": 2.7058, "step": 7655 }, { "crossentropy": 2.735076665878296, "epoch": 0.2775522041763341, "grad_norm": 0.04378930479288101, "grad_norm_var": 1.0670400512902582e-05, "learning_rate": 0.008390545907429785, "loss": 2.6386, "step": 7656 }, { "crossentropy": 2.788017988204956, "epoch": 0.27758845707656615, "grad_norm": 0.034544747322797775, "grad_norm_var": 1.0642137966427332e-05, "learning_rate": 0.008390118807615827, "loss": 2.6614, "step": 7657 }, { "crossentropy": 2.664053440093994, "epoch": 0.27762470997679817, "grad_norm": 0.03176502510905266, "grad_norm_var": 1.0890243033480684e-05, "learning_rate": 0.00838969166201317, "loss": 2.6535, "step": 7658 }, { "crossentropy": 2.725565195083618, "epoch": 0.2776609628770302, "grad_norm": 0.03139599785208702, "grad_norm_var": 1.1153939054712999e-05, "learning_rate": 0.008389264470627584, "loss": 2.6786, "step": 7659 }, { "crossentropy": 2.6442108154296875, "epoch": 0.2776972157772622, "grad_norm": 0.032684359699487686, "grad_norm_var": 1.1159479546803462e-05, "learning_rate": 0.008388837233464836, "loss": 2.6003, "step": 7660 }, { "crossentropy": 2.6706550121307373, "epoch": 0.2777334686774942, "grad_norm": 0.030314572155475616, "grad_norm_var": 1.1770493563221017e-05, "learning_rate": 0.0083884099505307, "loss": 2.6145, "step": 7661 }, { "crossentropy": 2.583188772201538, "epoch": 0.2777697215777262, "grad_norm": 0.029132409021258354, "grad_norm_var": 1.2396487236577176e-05, "learning_rate": 0.008387982621830945, "loss": 2.6735, "step": 7662 }, { "crossentropy": 2.729412317276001, "epoch": 0.27780597447795824, "grad_norm": 0.031023619696497917, "grad_norm_var": 1.2439499342758532e-05, "learning_rate": 0.008387555247371342, "loss": 2.6647, "step": 7663 }, { "crossentropy": 2.7628676891326904, "epoch": 0.27784222737819025, "grad_norm": 0.029407788068056107, "grad_norm_var": 1.3105877795208327e-05, "learning_rate": 0.008387127827157666, "loss": 2.6962, "step": 7664 }, { "crossentropy": 2.788904905319214, "epoch": 0.27787848027842227, "grad_norm": 0.02914213202893734, "grad_norm_var": 1.3753439623446356e-05, "learning_rate": 0.008386700361195688, "loss": 2.7358, "step": 7665 }, { "crossentropy": 2.713144063949585, "epoch": 0.2779147331786543, "grad_norm": 0.03262078016996384, "grad_norm_var": 1.3578489564603511e-05, "learning_rate": 0.008386272849491184, "loss": 2.743, "step": 7666 }, { "crossentropy": 2.67071533203125, "epoch": 0.2779509860788863, "grad_norm": 0.03290070593357086, "grad_norm_var": 1.3205033520562284e-05, "learning_rate": 0.008385845292049924, "loss": 2.6461, "step": 7667 }, { "crossentropy": 2.5765628814697266, "epoch": 0.2779872389791183, "grad_norm": 0.028618980199098587, "grad_norm_var": 1.3836873577081779e-05, "learning_rate": 0.008385417688877685, "loss": 2.5775, "step": 7668 }, { "crossentropy": 2.4429819583892822, "epoch": 0.2780234918793503, "grad_norm": 0.027820253744721413, "grad_norm_var": 1.4941336847580587e-05, "learning_rate": 0.008384990039980243, "loss": 2.5787, "step": 7669 }, { "crossentropy": 2.568481922149658, "epoch": 0.27805974477958234, "grad_norm": 0.029005683958530426, "grad_norm_var": 1.5506421599147517e-05, "learning_rate": 0.008384562345363375, "loss": 2.5338, "step": 7670 }, { "crossentropy": 2.6042561531066895, "epoch": 0.2780959976798144, "grad_norm": 0.03129591420292854, "grad_norm_var": 1.40096249238858e-05, "learning_rate": 0.008384134605032855, "loss": 2.5845, "step": 7671 }, { "crossentropy": 2.642944574356079, "epoch": 0.2781322505800464, "grad_norm": 0.03595227003097534, "grad_norm_var": 5.102257457060416e-06, "learning_rate": 0.00838370681899446, "loss": 2.6472, "step": 7672 }, { "crossentropy": 2.617374897003174, "epoch": 0.27816850348027844, "grad_norm": 0.03386066481471062, "grad_norm_var": 4.817450540909457e-06, "learning_rate": 0.008383278987253972, "loss": 2.6477, "step": 7673 }, { "crossentropy": 2.6182854175567627, "epoch": 0.27820475638051045, "grad_norm": 0.034900933504104614, "grad_norm_var": 5.7273489442926625e-06, "learning_rate": 0.008382851109817166, "loss": 2.6316, "step": 7674 }, { "crossentropy": 2.632479667663574, "epoch": 0.27824100928074247, "grad_norm": 0.032475054264068604, "grad_norm_var": 5.820433958271994e-06, "learning_rate": 0.008382423186689823, "loss": 2.6072, "step": 7675 }, { "crossentropy": 2.659512996673584, "epoch": 0.2782772621809745, "grad_norm": 0.03304597735404968, "grad_norm_var": 5.8942816054673976e-06, "learning_rate": 0.00838199521787772, "loss": 2.6494, "step": 7676 }, { "crossentropy": 2.743291139602661, "epoch": 0.2783135150812065, "grad_norm": 0.03247718885540962, "grad_norm_var": 5.889506558429819e-06, "learning_rate": 0.008381567203386642, "loss": 2.7356, "step": 7677 }, { "crossentropy": 2.6340856552124023, "epoch": 0.2783497679814385, "grad_norm": 0.033701542764902115, "grad_norm_var": 5.764110169487408e-06, "learning_rate": 0.008381139143222366, "loss": 2.6641, "step": 7678 }, { "crossentropy": 2.646569013595581, "epoch": 0.2783860208816705, "grad_norm": 0.03225597366690636, "grad_norm_var": 5.737112172892779e-06, "learning_rate": 0.008380711037390673, "loss": 2.6367, "step": 7679 }, { "crossentropy": 2.729372501373291, "epoch": 0.27842227378190254, "grad_norm": 0.031577255576848984, "grad_norm_var": 5.326970215545288e-06, "learning_rate": 0.008380282885897348, "loss": 2.7149, "step": 7680 }, { "crossentropy": 2.677035331726074, "epoch": 0.27845852668213456, "grad_norm": 0.03255399689078331, "grad_norm_var": 4.764347668136437e-06, "learning_rate": 0.008379854688748174, "loss": 2.5788, "step": 7681 }, { "crossentropy": 2.676563262939453, "epoch": 0.27849477958236657, "grad_norm": 0.030518976971507072, "grad_norm_var": 4.92013011946929e-06, "learning_rate": 0.008379426445948933, "loss": 2.6322, "step": 7682 }, { "crossentropy": 2.5981855392456055, "epoch": 0.2785310324825986, "grad_norm": 0.030905799940228462, "grad_norm_var": 4.945263819488181e-06, "learning_rate": 0.008378998157505408, "loss": 2.7213, "step": 7683 }, { "crossentropy": 2.550079345703125, "epoch": 0.27856728538283065, "grad_norm": 0.02986524999141693, "grad_norm_var": 4.491250225524993e-06, "learning_rate": 0.008378569823423386, "loss": 2.6771, "step": 7684 }, { "crossentropy": 2.640991449356079, "epoch": 0.27860353828306267, "grad_norm": 0.028216833248734474, "grad_norm_var": 4.279363314483664e-06, "learning_rate": 0.008378141443708652, "loss": 2.6754, "step": 7685 }, { "crossentropy": 2.7115864753723145, "epoch": 0.2786397911832947, "grad_norm": 0.03134037181735039, "grad_norm_var": 3.6760758129370744e-06, "learning_rate": 0.00837771301836699, "loss": 2.7086, "step": 7686 }, { "crossentropy": 2.757664203643799, "epoch": 0.2786760440835267, "grad_norm": 0.03035694546997547, "grad_norm_var": 3.8423643754226244e-06, "learning_rate": 0.008377284547404188, "loss": 2.6878, "step": 7687 }, { "crossentropy": 2.6897878646850586, "epoch": 0.2787122969837587, "grad_norm": 0.0306636244058609, "grad_norm_var": 2.8918870821764697e-06, "learning_rate": 0.008376856030826035, "loss": 2.6096, "step": 7688 }, { "crossentropy": 2.775081157684326, "epoch": 0.2787485498839907, "grad_norm": 0.033410754054784775, "grad_norm_var": 2.7806094661630974e-06, "learning_rate": 0.008376427468638314, "loss": 2.6853, "step": 7689 }, { "crossentropy": 2.645446538925171, "epoch": 0.27878480278422274, "grad_norm": 0.02969932183623314, "grad_norm_var": 2.297883874631014e-06, "learning_rate": 0.008375998860846818, "loss": 2.6673, "step": 7690 }, { "crossentropy": 2.6318538188934326, "epoch": 0.27882105568445475, "grad_norm": 0.02875666879117489, "grad_norm_var": 2.649639729698306e-06, "learning_rate": 0.008375570207457335, "loss": 2.7129, "step": 7691 }, { "crossentropy": 2.6470775604248047, "epoch": 0.27885730858468677, "grad_norm": 0.029514294117689133, "grad_norm_var": 2.5642456375135944e-06, "learning_rate": 0.008375141508475651, "loss": 2.6263, "step": 7692 }, { "crossentropy": 2.610563278198242, "epoch": 0.2788935614849188, "grad_norm": 0.030028190463781357, "grad_norm_var": 2.45296447000536e-06, "learning_rate": 0.00837471276390756, "loss": 2.6982, "step": 7693 }, { "crossentropy": 2.6061489582061768, "epoch": 0.2789298143851508, "grad_norm": 0.031224915757775307, "grad_norm_var": 1.8898582932709458e-06, "learning_rate": 0.008374283973758851, "loss": 2.6192, "step": 7694 }, { "crossentropy": 2.6565728187561035, "epoch": 0.2789660672853828, "grad_norm": 0.031050914898514748, "grad_norm_var": 1.7274920398790595e-06, "learning_rate": 0.008373855138035319, "loss": 2.7047, "step": 7695 }, { "crossentropy": 2.699598550796509, "epoch": 0.2790023201856148, "grad_norm": 0.03246069326996803, "grad_norm_var": 1.8907642656865956e-06, "learning_rate": 0.008373426256742752, "loss": 2.6847, "step": 7696 }, { "crossentropy": 2.6223371028900146, "epoch": 0.27903857308584684, "grad_norm": 0.03803998604416847, "grad_norm_var": 5.156816682687923e-06, "learning_rate": 0.008372997329886942, "loss": 2.615, "step": 7697 }, { "crossentropy": 2.6377599239349365, "epoch": 0.2790748259860789, "grad_norm": 0.0386882983148098, "grad_norm_var": 8.800334038543828e-06, "learning_rate": 0.008372568357473686, "loss": 2.7306, "step": 7698 }, { "crossentropy": 2.663320302963257, "epoch": 0.2791110788863109, "grad_norm": 0.03550015389919281, "grad_norm_var": 9.747061597264424e-06, "learning_rate": 0.008372139339508778, "loss": 2.6823, "step": 7699 }, { "crossentropy": 2.7974607944488525, "epoch": 0.27914733178654294, "grad_norm": 0.0355888195335865, "grad_norm_var": 1.0317203293852651e-05, "learning_rate": 0.008371710275998008, "loss": 2.7462, "step": 7700 }, { "crossentropy": 2.593950033187866, "epoch": 0.27918358468677495, "grad_norm": 0.03966003656387329, "grad_norm_var": 1.2486889085320621e-05, "learning_rate": 0.008371281166947176, "loss": 2.6366, "step": 7701 }, { "crossentropy": 2.689397096633911, "epoch": 0.27921983758700697, "grad_norm": 0.04768086224794388, "grad_norm_var": 2.583375238451955e-05, "learning_rate": 0.008370852012362075, "loss": 2.6247, "step": 7702 }, { "crossentropy": 2.678009271621704, "epoch": 0.279256090487239, "grad_norm": 0.03118450939655304, "grad_norm_var": 2.5486129862790788e-05, "learning_rate": 0.008370422812248502, "loss": 2.6523, "step": 7703 }, { "crossentropy": 2.860063314437866, "epoch": 0.279292343387471, "grad_norm": 0.031009158119559288, "grad_norm_var": 2.5342322905630432e-05, "learning_rate": 0.008369993566612253, "loss": 2.7842, "step": 7704 }, { "crossentropy": 2.7455520629882812, "epoch": 0.279328596287703, "grad_norm": 0.030601778998970985, "grad_norm_var": 2.6044398724515265e-05, "learning_rate": 0.008369564275459129, "loss": 2.7141, "step": 7705 }, { "crossentropy": 2.637946367263794, "epoch": 0.279364849187935, "grad_norm": 0.033529847860336304, "grad_norm_var": 2.487064550252663e-05, "learning_rate": 0.008369134938794924, "loss": 2.6859, "step": 7706 }, { "crossentropy": 2.6742262840270996, "epoch": 0.27940110208816704, "grad_norm": 0.030670443549752235, "grad_norm_var": 2.3753334089289868e-05, "learning_rate": 0.00836870555662544, "loss": 2.5674, "step": 7707 }, { "crossentropy": 2.6948750019073486, "epoch": 0.27943735498839906, "grad_norm": 0.03233011066913605, "grad_norm_var": 2.2507673796406585e-05, "learning_rate": 0.008368276128956475, "loss": 2.6604, "step": 7708 }, { "crossentropy": 2.641735553741455, "epoch": 0.27947360788863107, "grad_norm": 0.02995382621884346, "grad_norm_var": 2.2550653481832868e-05, "learning_rate": 0.008367846655793829, "loss": 2.5931, "step": 7709 }, { "crossentropy": 2.775001287460327, "epoch": 0.2795098607888631, "grad_norm": 0.028897175565361977, "grad_norm_var": 2.3850963189952578e-05, "learning_rate": 0.008367417137143302, "loss": 2.707, "step": 7710 }, { "crossentropy": 2.687821388244629, "epoch": 0.27954611368909515, "grad_norm": 0.0329890213906765, "grad_norm_var": 2.327766841337068e-05, "learning_rate": 0.008366987573010699, "loss": 2.6202, "step": 7711 }, { "crossentropy": 2.5498664379119873, "epoch": 0.27958236658932717, "grad_norm": 0.035858891904354095, "grad_norm_var": 2.316645841080392e-05, "learning_rate": 0.008366557963401817, "loss": 2.6322, "step": 7712 }, { "crossentropy": 2.5329766273498535, "epoch": 0.2796186194895592, "grad_norm": 0.041217293590307236, "grad_norm_var": 2.5292253579850543e-05, "learning_rate": 0.008366128308322463, "loss": 2.5708, "step": 7713 }, { "crossentropy": 2.7390289306640625, "epoch": 0.2796548723897912, "grad_norm": 0.037381596863269806, "grad_norm_var": 2.470584642902529e-05, "learning_rate": 0.008365698607778435, "loss": 2.6812, "step": 7714 }, { "crossentropy": 2.632420063018799, "epoch": 0.2796911252900232, "grad_norm": 0.029155461117625237, "grad_norm_var": 2.6484277554411267e-05, "learning_rate": 0.008365268861775542, "loss": 2.6231, "step": 7715 }, { "crossentropy": 2.583371639251709, "epoch": 0.2797273781902552, "grad_norm": 0.028990281745791435, "grad_norm_var": 2.8011660587200633e-05, "learning_rate": 0.008364839070319586, "loss": 2.5888, "step": 7716 }, { "crossentropy": 2.60088849067688, "epoch": 0.27976363109048724, "grad_norm": 0.031236784532666206, "grad_norm_var": 2.5886481190310812e-05, "learning_rate": 0.008364409233416371, "loss": 2.5953, "step": 7717 }, { "crossentropy": 2.512543201446533, "epoch": 0.27979988399071926, "grad_norm": 0.03410571813583374, "grad_norm_var": 1.1361849489004829e-05, "learning_rate": 0.008363979351071702, "loss": 2.6159, "step": 7718 }, { "crossentropy": 2.5433802604675293, "epoch": 0.27983613689095127, "grad_norm": 0.0327933095395565, "grad_norm_var": 1.1253339294599651e-05, "learning_rate": 0.00836354942329139, "loss": 2.6103, "step": 7719 }, { "crossentropy": 2.604792594909668, "epoch": 0.2798723897911833, "grad_norm": 0.03323957324028015, "grad_norm_var": 1.1107506269439291e-05, "learning_rate": 0.008363119450081237, "loss": 2.5483, "step": 7720 }, { "crossentropy": 2.733029365539551, "epoch": 0.2799086426914153, "grad_norm": 0.03656618297100067, "grad_norm_var": 1.167464051061635e-05, "learning_rate": 0.008362689431447053, "loss": 2.7127, "step": 7721 }, { "crossentropy": 2.7846899032592773, "epoch": 0.2799448955916473, "grad_norm": 0.034808024764060974, "grad_norm_var": 1.185729597006622e-05, "learning_rate": 0.008362259367394643, "loss": 2.7422, "step": 7722 }, { "crossentropy": 2.6648190021514893, "epoch": 0.27998114849187933, "grad_norm": 0.03338908776640892, "grad_norm_var": 1.1425104775870422e-05, "learning_rate": 0.008361829257929821, "loss": 2.6242, "step": 7723 }, { "crossentropy": 2.716052532196045, "epoch": 0.28001740139211134, "grad_norm": 0.03281247988343239, "grad_norm_var": 1.1376816400913518e-05, "learning_rate": 0.008361399103058392, "loss": 2.6701, "step": 7724 }, { "crossentropy": 2.604437828063965, "epoch": 0.2800536542923434, "grad_norm": 0.031056899577379227, "grad_norm_var": 1.0955254492028833e-05, "learning_rate": 0.008360968902786165, "loss": 2.6003, "step": 7725 }, { "crossentropy": 2.505580425262451, "epoch": 0.2800899071925754, "grad_norm": 0.03309902548789978, "grad_norm_var": 9.532609683666213e-06, "learning_rate": 0.008360538657118955, "loss": 2.5293, "step": 7726 }, { "crossentropy": 2.6318347454071045, "epoch": 0.28012616009280744, "grad_norm": 0.029928749427199364, "grad_norm_var": 1.0395283263289757e-05, "learning_rate": 0.008360108366062569, "loss": 2.677, "step": 7727 }, { "crossentropy": 2.7031805515289307, "epoch": 0.28016241299303946, "grad_norm": 0.031662456691265106, "grad_norm_var": 1.016344260339012e-05, "learning_rate": 0.008359678029622822, "loss": 2.6588, "step": 7728 }, { "crossentropy": 2.5759451389312744, "epoch": 0.28019866589327147, "grad_norm": 0.03551849350333214, "grad_norm_var": 6.112888825000421e-06, "learning_rate": 0.008359247647805523, "loss": 2.6437, "step": 7729 }, { "crossentropy": 2.636707305908203, "epoch": 0.2802349187935035, "grad_norm": 0.03171198070049286, "grad_norm_var": 4.703077187267803e-06, "learning_rate": 0.008358817220616488, "loss": 2.6574, "step": 7730 }, { "crossentropy": 2.485807180404663, "epoch": 0.2802711716937355, "grad_norm": 0.030570682138204575, "grad_norm_var": 4.196275058253188e-06, "learning_rate": 0.008358386748061528, "loss": 2.5331, "step": 7731 }, { "crossentropy": 2.6253225803375244, "epoch": 0.2803074245939675, "grad_norm": 0.033571790903806686, "grad_norm_var": 3.3073132494185543e-06, "learning_rate": 0.008357956230146459, "loss": 2.5074, "step": 7732 }, { "crossentropy": 2.6180126667022705, "epoch": 0.28034367749419953, "grad_norm": 0.036137860268354416, "grad_norm_var": 3.7351518910956007e-06, "learning_rate": 0.008357525666877096, "loss": 2.6865, "step": 7733 }, { "crossentropy": 2.5375657081604004, "epoch": 0.28037993039443154, "grad_norm": 0.03957775607705116, "grad_norm_var": 6.277800884812524e-06, "learning_rate": 0.008357095058259253, "loss": 2.5998, "step": 7734 }, { "crossentropy": 2.5207862854003906, "epoch": 0.28041618329466356, "grad_norm": 0.03530127555131912, "grad_norm_var": 6.425318291571275e-06, "learning_rate": 0.008356664404298745, "loss": 2.611, "step": 7735 }, { "crossentropy": 2.589775323867798, "epoch": 0.28045243619489557, "grad_norm": 0.03091984987258911, "grad_norm_var": 6.899258505502432e-06, "learning_rate": 0.008356233705001393, "loss": 2.6123, "step": 7736 }, { "crossentropy": 2.5781538486480713, "epoch": 0.2804886890951276, "grad_norm": 0.029371824115514755, "grad_norm_var": 7.230880046687445e-06, "learning_rate": 0.008355802960373011, "loss": 2.579, "step": 7737 }, { "crossentropy": 2.6457648277282715, "epoch": 0.28052494199535966, "grad_norm": 0.030061671510338783, "grad_norm_var": 7.551555139154581e-06, "learning_rate": 0.008355372170419418, "loss": 2.6103, "step": 7738 }, { "crossentropy": 2.6008927822113037, "epoch": 0.28056119489559167, "grad_norm": 0.02996252290904522, "grad_norm_var": 8.013162415104153e-06, "learning_rate": 0.008354941335146429, "loss": 2.6157, "step": 7739 }, { "crossentropy": 2.635944128036499, "epoch": 0.2805974477958237, "grad_norm": 0.029845336452126503, "grad_norm_var": 8.471072171366684e-06, "learning_rate": 0.00835451045455987, "loss": 2.6835, "step": 7740 }, { "crossentropy": 2.6624279022216797, "epoch": 0.2806337006960557, "grad_norm": 0.03058953583240509, "grad_norm_var": 8.568022924647598e-06, "learning_rate": 0.008354079528665554, "loss": 2.604, "step": 7741 }, { "crossentropy": 2.732924461364746, "epoch": 0.2806699535962877, "grad_norm": 0.03426242992281914, "grad_norm_var": 8.766568843245464e-06, "learning_rate": 0.008353648557469305, "loss": 2.6927, "step": 7742 }, { "crossentropy": 2.6918716430664062, "epoch": 0.28070620649651973, "grad_norm": 0.03352031111717224, "grad_norm_var": 8.371571511443552e-06, "learning_rate": 0.00835321754097694, "loss": 2.7129, "step": 7743 }, { "crossentropy": 2.6108522415161133, "epoch": 0.28074245939675174, "grad_norm": 0.03311450779438019, "grad_norm_var": 8.309906685717441e-06, "learning_rate": 0.008352786479194288, "loss": 2.7169, "step": 7744 }, { "crossentropy": 2.7523300647735596, "epoch": 0.28077871229698376, "grad_norm": 0.033325374126434326, "grad_norm_var": 7.80165726359445e-06, "learning_rate": 0.008352355372127165, "loss": 2.6635, "step": 7745 }, { "crossentropy": 2.6516523361206055, "epoch": 0.28081496519721577, "grad_norm": 0.03316438943147659, "grad_norm_var": 7.758569711588378e-06, "learning_rate": 0.008351924219781393, "loss": 2.6273, "step": 7746 }, { "crossentropy": 2.783424139022827, "epoch": 0.2808512180974478, "grad_norm": 0.03143918514251709, "grad_norm_var": 7.558434547899024e-06, "learning_rate": 0.0083514930221628, "loss": 2.7642, "step": 7747 }, { "crossentropy": 2.7116971015930176, "epoch": 0.2808874709976798, "grad_norm": 0.031902849674224854, "grad_norm_var": 7.5519538514505574e-06, "learning_rate": 0.008351061779277207, "loss": 2.649, "step": 7748 }, { "crossentropy": 2.750946283340454, "epoch": 0.2809237238979118, "grad_norm": 0.033574383705854416, "grad_norm_var": 6.772592669992842e-06, "learning_rate": 0.008350630491130438, "loss": 2.622, "step": 7749 }, { "crossentropy": 2.6307997703552246, "epoch": 0.28095997679814383, "grad_norm": 0.030834397301077843, "grad_norm_var": 3.2945061757725174e-06, "learning_rate": 0.008350199157728323, "loss": 2.6619, "step": 7750 }, { "crossentropy": 2.528505802154541, "epoch": 0.28099622969837584, "grad_norm": 0.030925976112484932, "grad_norm_var": 2.5355441815173476e-06, "learning_rate": 0.008349767779076681, "loss": 2.5524, "step": 7751 }, { "crossentropy": 2.654759645462036, "epoch": 0.2810324825986079, "grad_norm": 0.02846257947385311, "grad_norm_var": 3.1606425441592253e-06, "learning_rate": 0.008349336355181342, "loss": 2.656, "step": 7752 }, { "crossentropy": 2.712476968765259, "epoch": 0.28106873549883993, "grad_norm": 0.03143524006009102, "grad_norm_var": 2.835096220866508e-06, "learning_rate": 0.008348904886048131, "loss": 2.7211, "step": 7753 }, { "crossentropy": 2.566978931427002, "epoch": 0.28110498839907194, "grad_norm": 0.029992690309882164, "grad_norm_var": 2.850014155868219e-06, "learning_rate": 0.008348473371682883, "loss": 2.6696, "step": 7754 }, { "crossentropy": 2.683570146560669, "epoch": 0.28114124129930396, "grad_norm": 0.03148315101861954, "grad_norm_var": 2.653008775640184e-06, "learning_rate": 0.008348041812091415, "loss": 2.6968, "step": 7755 }, { "crossentropy": 2.655320405960083, "epoch": 0.28117749419953597, "grad_norm": 0.033299971371889114, "grad_norm_var": 2.525268112186542e-06, "learning_rate": 0.008347610207279565, "loss": 2.6483, "step": 7756 }, { "crossentropy": 2.6766457557678223, "epoch": 0.281213747099768, "grad_norm": 0.029979653656482697, "grad_norm_var": 2.659790410213375e-06, "learning_rate": 0.008347178557253157, "loss": 2.6638, "step": 7757 }, { "crossentropy": 2.645660638809204, "epoch": 0.28125, "grad_norm": 0.03297899663448334, "grad_norm_var": 2.36186234701304e-06, "learning_rate": 0.008346746862018025, "loss": 2.7449, "step": 7758 }, { "crossentropy": 2.6780929565429688, "epoch": 0.281286252900232, "grad_norm": 0.03003411553800106, "grad_norm_var": 2.340223121423734e-06, "learning_rate": 0.008346315121579998, "loss": 2.6512, "step": 7759 }, { "crossentropy": 2.6774978637695312, "epoch": 0.28132250580046403, "grad_norm": 0.029719317331910133, "grad_norm_var": 2.3849056700891064e-06, "learning_rate": 0.008345883335944904, "loss": 2.667, "step": 7760 }, { "crossentropy": 2.5668723583221436, "epoch": 0.28135875870069604, "grad_norm": 0.029336923733353615, "grad_norm_var": 2.360299018932726e-06, "learning_rate": 0.00834545150511858, "loss": 2.6484, "step": 7761 }, { "crossentropy": 2.6511030197143555, "epoch": 0.28139501160092806, "grad_norm": 0.029141299426555634, "grad_norm_var": 2.2968268831319855e-06, "learning_rate": 0.008345019629106856, "loss": 2.5821, "step": 7762 }, { "crossentropy": 2.538663625717163, "epoch": 0.28143126450116007, "grad_norm": 0.029736623167991638, "grad_norm_var": 2.3575939166928328e-06, "learning_rate": 0.008344587707915567, "loss": 2.5817, "step": 7763 }, { "crossentropy": 2.8169517517089844, "epoch": 0.2814675174013921, "grad_norm": 0.03202887624502182, "grad_norm_var": 2.377078281505966e-06, "learning_rate": 0.008344155741550544, "loss": 2.7355, "step": 7764 }, { "crossentropy": 2.852282762527466, "epoch": 0.28150377030162416, "grad_norm": 0.0355881005525589, "grad_norm_var": 3.372673660252086e-06, "learning_rate": 0.008343723730017624, "loss": 2.7737, "step": 7765 }, { "crossentropy": 2.8407912254333496, "epoch": 0.28154002320185617, "grad_norm": 0.035656657069921494, "grad_norm_var": 4.760656377120603e-06, "learning_rate": 0.008343291673322638, "loss": 2.7362, "step": 7766 }, { "crossentropy": 2.6958651542663574, "epoch": 0.2815762761020882, "grad_norm": 0.03490680828690529, "grad_norm_var": 5.585739822563925e-06, "learning_rate": 0.008342859571471427, "loss": 2.627, "step": 7767 }, { "crossentropy": 2.6130361557006836, "epoch": 0.2816125290023202, "grad_norm": 0.03325763717293739, "grad_norm_var": 5.089579236786044e-06, "learning_rate": 0.008342427424469823, "loss": 2.6763, "step": 7768 }, { "crossentropy": 2.6745450496673584, "epoch": 0.2816487819025522, "grad_norm": 0.030513258650898933, "grad_norm_var": 5.185827036479129e-06, "learning_rate": 0.008341995232323665, "loss": 2.6742, "step": 7769 }, { "crossentropy": 2.7475414276123047, "epoch": 0.28168503480278423, "grad_norm": 0.03323914855718613, "grad_norm_var": 5.093232681018835e-06, "learning_rate": 0.00834156299503879, "loss": 2.6208, "step": 7770 }, { "crossentropy": 2.5908515453338623, "epoch": 0.28172128770301624, "grad_norm": 0.0366022102534771, "grad_norm_var": 6.425161648000259e-06, "learning_rate": 0.008341130712621036, "loss": 2.5964, "step": 7771 }, { "crossentropy": 2.673037528991699, "epoch": 0.28175754060324826, "grad_norm": 0.03946428373456001, "grad_norm_var": 9.662056813681633e-06, "learning_rate": 0.00834069838507624, "loss": 2.7228, "step": 7772 }, { "crossentropy": 2.6480941772460938, "epoch": 0.28179379350348027, "grad_norm": 0.03351268917322159, "grad_norm_var": 9.190641378201136e-06, "learning_rate": 0.008340266012410242, "loss": 2.6298, "step": 7773 }, { "crossentropy": 2.6169087886810303, "epoch": 0.2818300464037123, "grad_norm": 0.03110937587916851, "grad_norm_var": 9.37877438228761e-06, "learning_rate": 0.008339833594628886, "loss": 2.6737, "step": 7774 }, { "crossentropy": 2.628457546234131, "epoch": 0.2818662993039443, "grad_norm": 0.030099110677838326, "grad_norm_var": 9.355585192812376e-06, "learning_rate": 0.008339401131738007, "loss": 2.6653, "step": 7775 }, { "crossentropy": 2.6405029296875, "epoch": 0.2819025522041763, "grad_norm": 0.031486015766859055, "grad_norm_var": 8.838045544873271e-06, "learning_rate": 0.008338968623743446, "loss": 2.6586, "step": 7776 }, { "crossentropy": 2.590052604675293, "epoch": 0.28193880510440833, "grad_norm": 0.030760223045945168, "grad_norm_var": 8.29703175880449e-06, "learning_rate": 0.00833853607065105, "loss": 2.5705, "step": 7777 }, { "crossentropy": 2.564964532852173, "epoch": 0.28197505800464034, "grad_norm": 0.02986891195178032, "grad_norm_var": 7.961211699818634e-06, "learning_rate": 0.008338103472466656, "loss": 2.6602, "step": 7778 }, { "crossentropy": 2.682201623916626, "epoch": 0.2820113109048724, "grad_norm": 0.02931143157184124, "grad_norm_var": 8.156916394901056e-06, "learning_rate": 0.00833767082919611, "loss": 2.6011, "step": 7779 }, { "crossentropy": 2.7514283657073975, "epoch": 0.28204756380510443, "grad_norm": 0.032040297985076904, "grad_norm_var": 8.155502283007154e-06, "learning_rate": 0.008337238140845254, "loss": 2.6045, "step": 7780 }, { "crossentropy": 2.740551233291626, "epoch": 0.28208381670533644, "grad_norm": 0.03493109717965126, "grad_norm_var": 7.952565311577934e-06, "learning_rate": 0.008336805407419932, "loss": 2.6577, "step": 7781 }, { "crossentropy": 2.4023361206054688, "epoch": 0.28212006960556846, "grad_norm": 0.03416718542575836, "grad_norm_var": 7.548219451837289e-06, "learning_rate": 0.008336372628925988, "loss": 2.534, "step": 7782 }, { "crossentropy": 2.5615665912628174, "epoch": 0.28215632250580047, "grad_norm": 0.030080880969762802, "grad_norm_var": 7.667066260924022e-06, "learning_rate": 0.00833593980536927, "loss": 2.6443, "step": 7783 }, { "crossentropy": 2.661323070526123, "epoch": 0.2821925754060325, "grad_norm": 0.028754165396094322, "grad_norm_var": 8.496365635425375e-06, "learning_rate": 0.008335506936755622, "loss": 2.6439, "step": 7784 }, { "crossentropy": 2.76847505569458, "epoch": 0.2822288283062645, "grad_norm": 0.02983279712498188, "grad_norm_var": 8.68253769315842e-06, "learning_rate": 0.008335074023090892, "loss": 2.6856, "step": 7785 }, { "crossentropy": 2.7125537395477295, "epoch": 0.2822650812064965, "grad_norm": 0.029340019449591637, "grad_norm_var": 9.094445464010688e-06, "learning_rate": 0.008334641064380926, "loss": 2.7296, "step": 7786 }, { "crossentropy": 2.6273303031921387, "epoch": 0.28230133410672853, "grad_norm": 0.030874358490109444, "grad_norm_var": 7.5996775715421344e-06, "learning_rate": 0.00833420806063157, "loss": 2.6952, "step": 7787 }, { "crossentropy": 2.6560707092285156, "epoch": 0.28233758700696054, "grad_norm": 0.03302484750747681, "grad_norm_var": 3.440879074313961e-06, "learning_rate": 0.008333775011848676, "loss": 2.6697, "step": 7788 }, { "crossentropy": 2.6429505348205566, "epoch": 0.28237383990719256, "grad_norm": 0.0286125335842371, "grad_norm_var": 3.4303252719685884e-06, "learning_rate": 0.008333341918038091, "loss": 2.6889, "step": 7789 }, { "crossentropy": 2.757406711578369, "epoch": 0.2824100928074246, "grad_norm": 0.02950603887438774, "grad_norm_var": 3.5448070811634758e-06, "learning_rate": 0.008332908779205665, "loss": 2.7102, "step": 7790 }, { "crossentropy": 2.6853702068328857, "epoch": 0.2824463457076566, "grad_norm": 0.035105686634778976, "grad_norm_var": 4.648138580670919e-06, "learning_rate": 0.008332475595357247, "loss": 2.7163, "step": 7791 }, { "crossentropy": 2.81581711769104, "epoch": 0.28248259860788866, "grad_norm": 0.03048376739025116, "grad_norm_var": 4.660141352825692e-06, "learning_rate": 0.00833204236649869, "loss": 2.7986, "step": 7792 }, { "crossentropy": 2.577784538269043, "epoch": 0.28251885150812067, "grad_norm": 0.029546307399868965, "grad_norm_var": 4.798072934809637e-06, "learning_rate": 0.008331609092635845, "loss": 2.643, "step": 7793 }, { "crossentropy": 2.5249080657958984, "epoch": 0.2825551044083527, "grad_norm": 0.02981995977461338, "grad_norm_var": 4.805393274808813e-06, "learning_rate": 0.008331175773774561, "loss": 2.6022, "step": 7794 }, { "crossentropy": 2.619401454925537, "epoch": 0.2825913573085847, "grad_norm": 0.031126653775572777, "grad_norm_var": 4.611250675457847e-06, "learning_rate": 0.008330742409920695, "loss": 2.5897, "step": 7795 }, { "crossentropy": 2.6860694885253906, "epoch": 0.2826276102088167, "grad_norm": 0.030541658401489258, "grad_norm_var": 4.559318144099246e-06, "learning_rate": 0.008330309001080099, "loss": 2.6381, "step": 7796 }, { "crossentropy": 2.609454870223999, "epoch": 0.28266386310904873, "grad_norm": 0.030322395265102386, "grad_norm_var": 3.461512553656667e-06, "learning_rate": 0.008329875547258624, "loss": 2.7322, "step": 7797 }, { "crossentropy": 2.616630792617798, "epoch": 0.28270011600928074, "grad_norm": 0.031549565494060516, "grad_norm_var": 2.6783302274522672e-06, "learning_rate": 0.008329442048462126, "loss": 2.6172, "step": 7798 }, { "crossentropy": 2.510375738143921, "epoch": 0.28273636890951276, "grad_norm": 0.03977295756340027, "grad_norm_var": 7.965603092507875e-06, "learning_rate": 0.008329008504696462, "loss": 2.6488, "step": 7799 }, { "crossentropy": 2.7098681926727295, "epoch": 0.2827726218097448, "grad_norm": 0.033087484538555145, "grad_norm_var": 7.761678096825199e-06, "learning_rate": 0.008328574915967486, "loss": 2.7032, "step": 7800 }, { "crossentropy": 2.663787364959717, "epoch": 0.2828088747099768, "grad_norm": 0.030857158824801445, "grad_norm_var": 7.611954279109151e-06, "learning_rate": 0.008328141282281053, "loss": 2.7347, "step": 7801 }, { "crossentropy": 2.635469913482666, "epoch": 0.2828451276102088, "grad_norm": 0.028931522741913795, "grad_norm_var": 7.738570581074809e-06, "learning_rate": 0.008327707603643025, "loss": 2.6213, "step": 7802 }, { "crossentropy": 2.6168124675750732, "epoch": 0.2828813805104408, "grad_norm": 0.030508028343319893, "grad_norm_var": 7.77496131879247e-06, "learning_rate": 0.008327273880059253, "loss": 2.6158, "step": 7803 }, { "crossentropy": 2.8044826984405518, "epoch": 0.28291763341067283, "grad_norm": 0.028686832636594772, "grad_norm_var": 8.025630526654416e-06, "learning_rate": 0.008326840111535597, "loss": 2.7017, "step": 7804 }, { "crossentropy": 2.6212971210479736, "epoch": 0.28295388631090485, "grad_norm": 0.02916908822953701, "grad_norm_var": 7.856420039546253e-06, "learning_rate": 0.008326406298077918, "loss": 2.6458, "step": 7805 }, { "crossentropy": 2.6349031925201416, "epoch": 0.2829901392111369, "grad_norm": 0.0333804190158844, "grad_norm_var": 7.925492719982917e-06, "learning_rate": 0.008325972439692074, "loss": 2.6897, "step": 7806 }, { "crossentropy": 2.5950376987457275, "epoch": 0.28302639211136893, "grad_norm": 0.05014168471097946, "grad_norm_var": 2.94233971273941e-05, "learning_rate": 0.008325538536383926, "loss": 2.7283, "step": 7807 }, { "crossentropy": 2.544898271560669, "epoch": 0.28306264501160094, "grad_norm": 0.03249873220920563, "grad_norm_var": 2.917030126320662e-05, "learning_rate": 0.008325104588159332, "loss": 2.5438, "step": 7808 }, { "crossentropy": 2.6074106693267822, "epoch": 0.28309889791183296, "grad_norm": 0.035182833671569824, "grad_norm_var": 2.893894142370201e-05, "learning_rate": 0.008324670595024154, "loss": 2.6446, "step": 7809 }, { "crossentropy": 2.741223096847534, "epoch": 0.283135150812065, "grad_norm": 0.03524443507194519, "grad_norm_var": 2.8587523388051922e-05, "learning_rate": 0.008324236556984254, "loss": 2.7481, "step": 7810 }, { "crossentropy": 2.7794289588928223, "epoch": 0.283171403712297, "grad_norm": 0.03462753817439079, "grad_norm_var": 2.8391521791928712e-05, "learning_rate": 0.008323802474045497, "loss": 2.7177, "step": 7811 }, { "crossentropy": 2.5441365242004395, "epoch": 0.283207656612529, "grad_norm": 0.05543915927410126, "grad_norm_var": 5.7624394630187194e-05, "learning_rate": 0.008323368346213741, "loss": 2.6389, "step": 7812 }, { "crossentropy": 2.9248268604278564, "epoch": 0.283243909512761, "grad_norm": 0.032934270799160004, "grad_norm_var": 5.643484976335632e-05, "learning_rate": 0.008322934173494854, "loss": 2.793, "step": 7813 }, { "crossentropy": 2.6238389015197754, "epoch": 0.28328016241299303, "grad_norm": 0.03086356818675995, "grad_norm_var": 5.679136052425722e-05, "learning_rate": 0.008322499955894696, "loss": 2.5944, "step": 7814 }, { "crossentropy": 2.6463510990142822, "epoch": 0.28331641531322505, "grad_norm": 0.033608853816986084, "grad_norm_var": 5.531141937001697e-05, "learning_rate": 0.008322065693419136, "loss": 2.7048, "step": 7815 }, { "crossentropy": 2.6722207069396973, "epoch": 0.28335266821345706, "grad_norm": 0.03098725900053978, "grad_norm_var": 5.6037984504414955e-05, "learning_rate": 0.008321631386074035, "loss": 2.6923, "step": 7816 }, { "crossentropy": 2.689094305038452, "epoch": 0.2833889211136891, "grad_norm": 0.034698475152254105, "grad_norm_var": 5.506046645854487e-05, "learning_rate": 0.008321197033865264, "loss": 2.6955, "step": 7817 }, { "crossentropy": 2.7449538707733154, "epoch": 0.2834251740139211, "grad_norm": 0.031926997005939484, "grad_norm_var": 5.327485725571795e-05, "learning_rate": 0.008320762636798687, "loss": 2.6634, "step": 7818 }, { "crossentropy": 2.565142869949341, "epoch": 0.28346142691415316, "grad_norm": 0.03008667565882206, "grad_norm_var": 5.353795642725247e-05, "learning_rate": 0.00832032819488017, "loss": 2.609, "step": 7819 }, { "crossentropy": 2.6110472679138184, "epoch": 0.2834976798143852, "grad_norm": 0.032509345561265945, "grad_norm_var": 5.125022533380453e-05, "learning_rate": 0.008319893708115582, "loss": 2.6733, "step": 7820 }, { "crossentropy": 2.608496904373169, "epoch": 0.2835339327146172, "grad_norm": 0.03383563458919525, "grad_norm_var": 4.8854932735186326e-05, "learning_rate": 0.008319459176510792, "loss": 2.6186, "step": 7821 }, { "crossentropy": 2.6885342597961426, "epoch": 0.2835701856148492, "grad_norm": 0.03444778546690941, "grad_norm_var": 4.862479136809778e-05, "learning_rate": 0.008319024600071667, "loss": 2.687, "step": 7822 }, { "crossentropy": 2.6788477897644043, "epoch": 0.2836064385150812, "grad_norm": 0.03283895179629326, "grad_norm_var": 3.370648893485025e-05, "learning_rate": 0.008318589978804079, "loss": 2.6294, "step": 7823 }, { "crossentropy": 2.7618372440338135, "epoch": 0.28364269141531323, "grad_norm": 0.03336650878190994, "grad_norm_var": 3.3523948692100095e-05, "learning_rate": 0.008318155312713897, "loss": 2.7222, "step": 7824 }, { "crossentropy": 2.67901873588562, "epoch": 0.28367894431554525, "grad_norm": 0.030996937304735184, "grad_norm_var": 3.425882382157712e-05, "learning_rate": 0.008317720601806992, "loss": 2.669, "step": 7825 }, { "crossentropy": 2.7696433067321777, "epoch": 0.28371519721577726, "grad_norm": 0.030827224254608154, "grad_norm_var": 3.490780469084704e-05, "learning_rate": 0.008317285846089236, "loss": 2.6345, "step": 7826 }, { "crossentropy": 2.6464741230010986, "epoch": 0.2837514501160093, "grad_norm": 0.03035023622214794, "grad_norm_var": 3.569320068945401e-05, "learning_rate": 0.0083168510455665, "loss": 2.6486, "step": 7827 }, { "crossentropy": 2.733889579772949, "epoch": 0.2837877030162413, "grad_norm": 0.03261176124215126, "grad_norm_var": 2.1933890993271084e-06, "learning_rate": 0.008316416200244658, "loss": 2.68, "step": 7828 }, { "crossentropy": 2.687138080596924, "epoch": 0.2838239559164733, "grad_norm": 0.02929914928972721, "grad_norm_var": 2.714591711845783e-06, "learning_rate": 0.008315981310129582, "loss": 2.6578, "step": 7829 }, { "crossentropy": 2.618943691253662, "epoch": 0.2838602088167053, "grad_norm": 0.030624380335211754, "grad_norm_var": 2.7569123769439e-06, "learning_rate": 0.008315546375227145, "loss": 2.6186, "step": 7830 }, { "crossentropy": 2.7771737575531006, "epoch": 0.28389646171693733, "grad_norm": 0.036429207772016525, "grad_norm_var": 3.835183996276498e-06, "learning_rate": 0.008315111395543223, "loss": 2.7317, "step": 7831 }, { "crossentropy": 2.7963361740112305, "epoch": 0.28393271461716935, "grad_norm": 0.030149606987833977, "grad_norm_var": 4.018928378731939e-06, "learning_rate": 0.008314676371083692, "loss": 2.7427, "step": 7832 }, { "crossentropy": 2.6407876014709473, "epoch": 0.2839689675174014, "grad_norm": 0.02963479422032833, "grad_norm_var": 3.926131445287859e-06, "learning_rate": 0.008314241301854423, "loss": 2.6606, "step": 7833 }, { "crossentropy": 2.5679409503936768, "epoch": 0.28400522041763343, "grad_norm": 0.029986532405018806, "grad_norm_var": 4.146968103593698e-06, "learning_rate": 0.0083138061878613, "loss": 2.6711, "step": 7834 }, { "crossentropy": 2.733430862426758, "epoch": 0.28404147331786544, "grad_norm": 0.031028540804982185, "grad_norm_var": 3.993570194619567e-06, "learning_rate": 0.008313371029110195, "loss": 2.7363, "step": 7835 }, { "crossentropy": 2.593221426010132, "epoch": 0.28407772621809746, "grad_norm": 0.029827775433659554, "grad_norm_var": 4.192427480206815e-06, "learning_rate": 0.008312935825606984, "loss": 2.6189, "step": 7836 }, { "crossentropy": 2.665421962738037, "epoch": 0.2841139791183295, "grad_norm": 0.030581513419747353, "grad_norm_var": 3.902018378648545e-06, "learning_rate": 0.008312500577357548, "loss": 2.6403, "step": 7837 }, { "crossentropy": 2.6182126998901367, "epoch": 0.2841502320185615, "grad_norm": 0.030289961025118828, "grad_norm_var": 3.3136869678561385e-06, "learning_rate": 0.008312065284367764, "loss": 2.5656, "step": 7838 }, { "crossentropy": 2.852933645248413, "epoch": 0.2841864849187935, "grad_norm": 0.03309949114918709, "grad_norm_var": 3.3756393017998304e-06, "learning_rate": 0.008311629946643513, "loss": 2.8031, "step": 7839 }, { "crossentropy": 2.458369731903076, "epoch": 0.2842227378190255, "grad_norm": 0.033033035695552826, "grad_norm_var": 3.2859920884924693e-06, "learning_rate": 0.008311194564190674, "loss": 2.5309, "step": 7840 }, { "crossentropy": 2.5577352046966553, "epoch": 0.28425899071925753, "grad_norm": 0.03256237879395485, "grad_norm_var": 3.4023782340977805e-06, "learning_rate": 0.008310759137015128, "loss": 2.5744, "step": 7841 }, { "crossentropy": 2.5817055702209473, "epoch": 0.28429524361948955, "grad_norm": 0.03509950637817383, "grad_norm_var": 4.290376165943614e-06, "learning_rate": 0.008310323665122753, "loss": 2.6742, "step": 7842 }, { "crossentropy": 2.6667532920837402, "epoch": 0.28433149651972156, "grad_norm": 0.03870942071080208, "grad_norm_var": 7.3338014323651255e-06, "learning_rate": 0.008309888148519438, "loss": 2.5943, "step": 7843 }, { "crossentropy": 2.574929714202881, "epoch": 0.2843677494199536, "grad_norm": 0.03155715763568878, "grad_norm_var": 7.325789985238478e-06, "learning_rate": 0.008309452587211058, "loss": 2.5695, "step": 7844 }, { "crossentropy": 2.47016978263855, "epoch": 0.2844040023201856, "grad_norm": 0.028533626347780228, "grad_norm_var": 7.637533162361577e-06, "learning_rate": 0.008309016981203496, "loss": 2.5697, "step": 7845 }, { "crossentropy": 2.659879684448242, "epoch": 0.28444025522041766, "grad_norm": 0.030583703890442848, "grad_norm_var": 7.644808116403593e-06, "learning_rate": 0.008308581330502643, "loss": 2.6096, "step": 7846 }, { "crossentropy": 2.6860921382904053, "epoch": 0.2844765081206497, "grad_norm": 0.032709550112485886, "grad_norm_var": 6.285160435926899e-06, "learning_rate": 0.008308145635114375, "loss": 2.6766, "step": 7847 }, { "crossentropy": 2.807615280151367, "epoch": 0.2845127610208817, "grad_norm": 0.03428342565894127, "grad_norm_var": 6.4922219259487405e-06, "learning_rate": 0.00830770989504458, "loss": 2.7067, "step": 7848 }, { "crossentropy": 2.587869167327881, "epoch": 0.2845490139211137, "grad_norm": 0.032026007771492004, "grad_norm_var": 6.105052479041482e-06, "learning_rate": 0.008307274110299143, "loss": 2.6532, "step": 7849 }, { "crossentropy": 2.726410388946533, "epoch": 0.2845852668213457, "grad_norm": 0.028932519257068634, "grad_norm_var": 6.474239972352223e-06, "learning_rate": 0.008306838280883952, "loss": 2.7403, "step": 7850 }, { "crossentropy": 2.6707024574279785, "epoch": 0.28462151972157773, "grad_norm": 0.02931509166955948, "grad_norm_var": 6.891919325227533e-06, "learning_rate": 0.008306402406804891, "loss": 2.6598, "step": 7851 }, { "crossentropy": 2.4614665508270264, "epoch": 0.28465777262180975, "grad_norm": 0.02842160314321518, "grad_norm_var": 7.412742693528234e-06, "learning_rate": 0.008305966488067848, "loss": 2.5843, "step": 7852 }, { "crossentropy": 2.558164596557617, "epoch": 0.28469402552204176, "grad_norm": 0.03314177319407463, "grad_norm_var": 7.3864609946502326e-06, "learning_rate": 0.008305530524678709, "loss": 2.6476, "step": 7853 }, { "crossentropy": 2.74310564994812, "epoch": 0.2847302784222738, "grad_norm": 0.034354183822870255, "grad_norm_var": 7.482064891277381e-06, "learning_rate": 0.008305094516643366, "loss": 2.7611, "step": 7854 }, { "crossentropy": 2.6432902812957764, "epoch": 0.2847665313225058, "grad_norm": 0.032291579991579056, "grad_norm_var": 7.433791866524398e-06, "learning_rate": 0.008304658463967704, "loss": 2.6448, "step": 7855 }, { "crossentropy": 2.6700146198272705, "epoch": 0.2848027842227378, "grad_norm": 0.030937576666474342, "grad_norm_var": 7.481671981874639e-06, "learning_rate": 0.008304222366657617, "loss": 2.6133, "step": 7856 }, { "crossentropy": 2.537961959838867, "epoch": 0.2848390371229698, "grad_norm": 0.02916906401515007, "grad_norm_var": 7.988149809016961e-06, "learning_rate": 0.00830378622471899, "loss": 2.5882, "step": 7857 }, { "crossentropy": 2.5928897857666016, "epoch": 0.28487529002320183, "grad_norm": 0.03231649473309517, "grad_norm_var": 7.2772358748441715e-06, "learning_rate": 0.008303350038157718, "loss": 2.6007, "step": 7858 }, { "crossentropy": 2.6800577640533447, "epoch": 0.28491154292343385, "grad_norm": 0.029057161882519722, "grad_norm_var": 4.085876436027873e-06, "learning_rate": 0.00830291380697969, "loss": 2.6133, "step": 7859 }, { "crossentropy": 2.600583791732788, "epoch": 0.2849477958236659, "grad_norm": 0.029931457713246346, "grad_norm_var": 4.1523776833760994e-06, "learning_rate": 0.008302477531190799, "loss": 2.614, "step": 7860 }, { "crossentropy": 2.6502249240875244, "epoch": 0.28498404872389793, "grad_norm": 0.029209492728114128, "grad_norm_var": 3.958641709345918e-06, "learning_rate": 0.008302041210796935, "loss": 2.6219, "step": 7861 }, { "crossentropy": 2.79048490524292, "epoch": 0.28502030162412995, "grad_norm": 0.03196025267243385, "grad_norm_var": 3.9928568830897735e-06, "learning_rate": 0.008301604845803997, "loss": 2.7155, "step": 7862 }, { "crossentropy": 2.743501901626587, "epoch": 0.28505655452436196, "grad_norm": 0.029023298993706703, "grad_norm_var": 4.065086412536227e-06, "learning_rate": 0.008301168436217874, "loss": 2.6546, "step": 7863 }, { "crossentropy": 2.6176702976226807, "epoch": 0.285092807424594, "grad_norm": 0.030719324946403503, "grad_norm_var": 3.250301171572054e-06, "learning_rate": 0.00830073198204446, "loss": 2.6308, "step": 7864 }, { "crossentropy": 2.5811338424682617, "epoch": 0.285129060324826, "grad_norm": 0.03360891342163086, "grad_norm_var": 3.6919454809851514e-06, "learning_rate": 0.008300295483289652, "loss": 2.6071, "step": 7865 }, { "crossentropy": 2.637603998184204, "epoch": 0.285165313225058, "grad_norm": 0.037147827446460724, "grad_norm_var": 5.892643801033584e-06, "learning_rate": 0.008299858939959347, "loss": 2.5819, "step": 7866 }, { "crossentropy": 2.704957962036133, "epoch": 0.28520156612529, "grad_norm": 0.034494247287511826, "grad_norm_var": 6.2068474971018e-06, "learning_rate": 0.008299422352059438, "loss": 2.6655, "step": 7867 }, { "crossentropy": 2.688817024230957, "epoch": 0.28523781902552203, "grad_norm": 0.030645910650491714, "grad_norm_var": 5.570022769985416e-06, "learning_rate": 0.008298985719595824, "loss": 2.7329, "step": 7868 }, { "crossentropy": 2.6213207244873047, "epoch": 0.28527407192575405, "grad_norm": 0.028545377776026726, "grad_norm_var": 6.037826839021034e-06, "learning_rate": 0.008298549042574401, "loss": 2.6637, "step": 7869 }, { "crossentropy": 2.6147732734680176, "epoch": 0.28531032482598606, "grad_norm": 0.03411390632390976, "grad_norm_var": 5.948818652196502e-06, "learning_rate": 0.008298112321001066, "loss": 2.6127, "step": 7870 }, { "crossentropy": 2.8797354698181152, "epoch": 0.2853465777262181, "grad_norm": 0.03987826779484749, "grad_norm_var": 1.0399267784449474e-05, "learning_rate": 0.008297675554881721, "loss": 2.7845, "step": 7871 }, { "crossentropy": 2.627793073654175, "epoch": 0.2853828306264501, "grad_norm": 0.03618805110454559, "grad_norm_var": 1.1432789107458174e-05, "learning_rate": 0.008297238744222262, "loss": 2.6933, "step": 7872 }, { "crossentropy": 2.6462087631225586, "epoch": 0.28541908352668216, "grad_norm": 0.03938896954059601, "grad_norm_var": 1.3761672749642997e-05, "learning_rate": 0.008296801889028592, "loss": 2.6836, "step": 7873 }, { "crossentropy": 2.7371487617492676, "epoch": 0.2854553364269142, "grad_norm": 0.032511401921510696, "grad_norm_var": 1.374916095007437e-05, "learning_rate": 0.008296364989306606, "loss": 2.7457, "step": 7874 }, { "crossentropy": 2.5115907192230225, "epoch": 0.2854915893271462, "grad_norm": 0.03330010548233986, "grad_norm_var": 1.269948516355382e-05, "learning_rate": 0.00829592804506221, "loss": 2.5623, "step": 7875 }, { "crossentropy": 2.6685211658477783, "epoch": 0.2855278422273782, "grad_norm": 0.031214231625199318, "grad_norm_var": 1.2248989097010666e-05, "learning_rate": 0.008295491056301306, "loss": 2.6896, "step": 7876 }, { "crossentropy": 2.6932027339935303, "epoch": 0.2855640951276102, "grad_norm": 0.10878608375787735, "grad_norm_var": 0.000365188900414387, "learning_rate": 0.008295054023029791, "loss": 2.6864, "step": 7877 }, { "crossentropy": 2.5982308387756348, "epoch": 0.28560034802784223, "grad_norm": 0.036126378923654556, "grad_norm_var": 0.00036279628782084894, "learning_rate": 0.008294616945253573, "loss": 2.6762, "step": 7878 }, { "crossentropy": 2.6380555629730225, "epoch": 0.28563660092807425, "grad_norm": 0.03161397948861122, "grad_norm_var": 0.00035994892681768387, "learning_rate": 0.008294179822978552, "loss": 2.635, "step": 7879 }, { "crossentropy": 2.7362589836120605, "epoch": 0.28567285382830626, "grad_norm": 0.03445231914520264, "grad_norm_var": 0.00035687616484705674, "learning_rate": 0.008293742656210636, "loss": 2.7828, "step": 7880 }, { "crossentropy": 2.6594200134277344, "epoch": 0.2857091067285383, "grad_norm": 0.03847767785191536, "grad_norm_var": 0.00035493849301681374, "learning_rate": 0.008293305444955726, "loss": 2.6772, "step": 7881 }, { "crossentropy": 2.717176675796509, "epoch": 0.2857453596287703, "grad_norm": 0.03436627611517906, "grad_norm_var": 0.0003561758460057642, "learning_rate": 0.008292868189219727, "loss": 2.6857, "step": 7882 }, { "crossentropy": 2.5468432903289795, "epoch": 0.2857816125290023, "grad_norm": 0.033747799694538116, "grad_norm_var": 0.0003566597529588806, "learning_rate": 0.008292430889008546, "loss": 2.5822, "step": 7883 }, { "crossentropy": 2.4902946949005127, "epoch": 0.2858178654292343, "grad_norm": 0.03681506589055061, "grad_norm_var": 0.00035219978775497934, "learning_rate": 0.008291993544328089, "loss": 2.5761, "step": 7884 }, { "crossentropy": 2.704930067062378, "epoch": 0.28585411832946633, "grad_norm": 0.032185088843107224, "grad_norm_var": 0.0003477865769688144, "learning_rate": 0.008291556155184265, "loss": 2.674, "step": 7885 }, { "crossentropy": 2.578289031982422, "epoch": 0.2858903712296984, "grad_norm": 0.029745901003479958, "grad_norm_var": 0.00035215833667078, "learning_rate": 0.00829111872158298, "loss": 2.602, "step": 7886 }, { "crossentropy": 2.8008618354797363, "epoch": 0.2859266241299304, "grad_norm": 0.031268149614334106, "grad_norm_var": 0.0003561276872402235, "learning_rate": 0.00829068124353014, "loss": 2.6955, "step": 7887 }, { "crossentropy": 2.581120491027832, "epoch": 0.28596287703016243, "grad_norm": 0.03205113112926483, "grad_norm_var": 0.0003586169258432405, "learning_rate": 0.008290243721031659, "loss": 2.5666, "step": 7888 }, { "crossentropy": 2.6637072563171387, "epoch": 0.28599912993039445, "grad_norm": 0.031090237200260162, "grad_norm_var": 0.0003619410895777432, "learning_rate": 0.008289806154093441, "loss": 2.6929, "step": 7889 }, { "crossentropy": 2.6878151893615723, "epoch": 0.28603538283062646, "grad_norm": 0.03051188215613365, "grad_norm_var": 0.0003636501090337464, "learning_rate": 0.008289368542721401, "loss": 2.661, "step": 7890 }, { "crossentropy": 2.618929624557495, "epoch": 0.2860716357308585, "grad_norm": 0.029706282541155815, "grad_norm_var": 0.0003666420951792285, "learning_rate": 0.008288930886921444, "loss": 2.65, "step": 7891 }, { "crossentropy": 2.6633496284484863, "epoch": 0.2861078886310905, "grad_norm": 0.029683437198400497, "grad_norm_var": 0.00036809905093545766, "learning_rate": 0.008288493186699487, "loss": 2.6214, "step": 7892 }, { "crossentropy": 2.7403035163879395, "epoch": 0.2861441415313225, "grad_norm": 0.02859138697385788, "grad_norm_var": 8.232417148795284e-06, "learning_rate": 0.008288055442061437, "loss": 2.6637, "step": 7893 }, { "crossentropy": 2.791750431060791, "epoch": 0.2861803944315545, "grad_norm": 0.030392853543162346, "grad_norm_var": 7.535429322666819e-06, "learning_rate": 0.008287617653013208, "loss": 2.6322, "step": 7894 }, { "crossentropy": 2.707127332687378, "epoch": 0.28621664733178653, "grad_norm": 0.036852262914180756, "grad_norm_var": 8.862955666674476e-06, "learning_rate": 0.008287179819560715, "loss": 2.6146, "step": 7895 }, { "crossentropy": 2.672624111175537, "epoch": 0.28625290023201855, "grad_norm": 0.03180604800581932, "grad_norm_var": 8.610406112976781e-06, "learning_rate": 0.00828674194170987, "loss": 2.6779, "step": 7896 }, { "crossentropy": 2.539196252822876, "epoch": 0.28628915313225056, "grad_norm": 0.038368068635463715, "grad_norm_var": 8.52132186549997e-06, "learning_rate": 0.008286304019466588, "loss": 2.5967, "step": 7897 }, { "crossentropy": 2.6117632389068604, "epoch": 0.2863254060324826, "grad_norm": 0.030865222215652466, "grad_norm_var": 8.33399674722451e-06, "learning_rate": 0.008285866052836781, "loss": 2.633, "step": 7898 }, { "crossentropy": 2.6160621643066406, "epoch": 0.2863616589327146, "grad_norm": 0.03059193678200245, "grad_norm_var": 8.265225084442587e-06, "learning_rate": 0.008285428041826368, "loss": 2.6533, "step": 7899 }, { "crossentropy": 2.7053489685058594, "epoch": 0.28639791183294666, "grad_norm": 0.03647831082344055, "grad_norm_var": 8.051973712971718e-06, "learning_rate": 0.008284989986441263, "loss": 2.7322, "step": 7900 }, { "crossentropy": 2.6777491569519043, "epoch": 0.2864341647331787, "grad_norm": 0.03396310657262802, "grad_norm_var": 8.320281846568335e-06, "learning_rate": 0.008284551886687383, "loss": 2.6369, "step": 7901 }, { "crossentropy": 2.730837106704712, "epoch": 0.2864704176334107, "grad_norm": 0.03170176222920418, "grad_norm_var": 7.972092226087935e-06, "learning_rate": 0.008284113742570645, "loss": 2.6981, "step": 7902 }, { "crossentropy": 2.685559034347534, "epoch": 0.2865066705336427, "grad_norm": 0.02953401207923889, "grad_norm_var": 8.357037755623653e-06, "learning_rate": 0.008283675554096967, "loss": 2.6228, "step": 7903 }, { "crossentropy": 2.7841830253601074, "epoch": 0.2865429234338747, "grad_norm": 0.03254026919603348, "grad_norm_var": 8.374559873731155e-06, "learning_rate": 0.008283237321272268, "loss": 2.7546, "step": 7904 }, { "crossentropy": 2.83986234664917, "epoch": 0.28657917633410673, "grad_norm": 0.03215605020523071, "grad_norm_var": 8.310258608420341e-06, "learning_rate": 0.008282799044102467, "loss": 2.7091, "step": 7905 }, { "crossentropy": 2.5778794288635254, "epoch": 0.28661542923433875, "grad_norm": 0.029767269268631935, "grad_norm_var": 8.50346935954051e-06, "learning_rate": 0.008282360722593482, "loss": 2.6436, "step": 7906 }, { "crossentropy": 2.529102325439453, "epoch": 0.28665168213457076, "grad_norm": 0.03072233870625496, "grad_norm_var": 8.248800493035552e-06, "learning_rate": 0.008281922356751235, "loss": 2.5453, "step": 7907 }, { "crossentropy": 2.588838815689087, "epoch": 0.2866879350348028, "grad_norm": 0.028948906809091568, "grad_norm_var": 8.521729448441095e-06, "learning_rate": 0.008281483946581646, "loss": 2.6562, "step": 7908 }, { "crossentropy": 2.777985095977783, "epoch": 0.2867241879350348, "grad_norm": 0.029343420639634132, "grad_norm_var": 8.207270576397694e-06, "learning_rate": 0.008281045492090636, "loss": 2.7332, "step": 7909 }, { "crossentropy": 2.6373627185821533, "epoch": 0.2867604408352668, "grad_norm": 0.029168780893087387, "grad_norm_var": 8.583945545945085e-06, "learning_rate": 0.008280606993284128, "loss": 2.6426, "step": 7910 }, { "crossentropy": 2.5613343715667725, "epoch": 0.2867966937354988, "grad_norm": 0.03087555430829525, "grad_norm_var": 6.990000885779418e-06, "learning_rate": 0.008280168450168044, "loss": 2.6243, "step": 7911 }, { "crossentropy": 2.588581085205078, "epoch": 0.28683294663573083, "grad_norm": 0.03377164155244827, "grad_norm_var": 7.265309490748067e-06, "learning_rate": 0.008279729862748306, "loss": 2.661, "step": 7912 }, { "crossentropy": 2.5269434452056885, "epoch": 0.2868691995359629, "grad_norm": 0.02875630557537079, "grad_norm_var": 4.62173629110259e-06, "learning_rate": 0.008279291231030841, "loss": 2.6384, "step": 7913 }, { "crossentropy": 2.708104133605957, "epoch": 0.2869054524361949, "grad_norm": 0.03073791228234768, "grad_norm_var": 4.628415985400317e-06, "learning_rate": 0.00827885255502157, "loss": 2.6639, "step": 7914 }, { "crossentropy": 2.3834657669067383, "epoch": 0.28694170533642693, "grad_norm": 0.03124363347887993, "grad_norm_var": 4.602897366500341e-06, "learning_rate": 0.00827841383472642, "loss": 2.5385, "step": 7915 }, { "crossentropy": 2.563162088394165, "epoch": 0.28697795823665895, "grad_norm": 0.033095963299274445, "grad_norm_var": 2.9518581984096123e-06, "learning_rate": 0.008277975070151315, "loss": 2.5409, "step": 7916 }, { "crossentropy": 2.4096739292144775, "epoch": 0.28701421113689096, "grad_norm": 0.03777593746781349, "grad_norm_var": 5.356452086803983e-06, "learning_rate": 0.008277536261302185, "loss": 2.5119, "step": 7917 }, { "crossentropy": 2.5676674842834473, "epoch": 0.287050464037123, "grad_norm": 0.03436802700161934, "grad_norm_var": 5.958259673827238e-06, "learning_rate": 0.008277097408184952, "loss": 2.6354, "step": 7918 }, { "crossentropy": 2.8056671619415283, "epoch": 0.287086716937355, "grad_norm": 0.0299021378159523, "grad_norm_var": 5.873894806393322e-06, "learning_rate": 0.008276658510805546, "loss": 2.7526, "step": 7919 }, { "crossentropy": 2.5172231197357178, "epoch": 0.287122969837587, "grad_norm": 0.03138229623436928, "grad_norm_var": 5.789118048163805e-06, "learning_rate": 0.008276219569169893, "loss": 2.5655, "step": 7920 }, { "crossentropy": 2.724982261657715, "epoch": 0.287159222737819, "grad_norm": 0.040483810007572174, "grad_norm_var": 1.0989722589724246e-05, "learning_rate": 0.008275780583283923, "loss": 2.7177, "step": 7921 }, { "crossentropy": 2.699118137359619, "epoch": 0.28719547563805103, "grad_norm": 0.03485780581831932, "grad_norm_var": 1.1164132732279632e-05, "learning_rate": 0.008275341553153567, "loss": 2.713, "step": 7922 }, { "crossentropy": 2.60711407661438, "epoch": 0.28723172853828305, "grad_norm": 0.03207635134458542, "grad_norm_var": 1.1009301863216371e-05, "learning_rate": 0.00827490247878475, "loss": 2.6597, "step": 7923 }, { "crossentropy": 2.7847516536712646, "epoch": 0.28726798143851506, "grad_norm": 0.03356751799583435, "grad_norm_var": 1.0279315289903736e-05, "learning_rate": 0.008274463360183407, "loss": 2.6926, "step": 7924 }, { "crossentropy": 2.694063425064087, "epoch": 0.2873042343387471, "grad_norm": 0.03211026266217232, "grad_norm_var": 9.560834427651405e-06, "learning_rate": 0.008274024197355467, "loss": 2.6434, "step": 7925 }, { "crossentropy": 2.715756893157959, "epoch": 0.2873404872389791, "grad_norm": 0.030393430963158607, "grad_norm_var": 9.068029443563468e-06, "learning_rate": 0.008273584990306861, "loss": 2.7684, "step": 7926 }, { "crossentropy": 2.6546742916107178, "epoch": 0.28737674013921116, "grad_norm": 0.05298621580004692, "grad_norm_var": 3.383938518230067e-05, "learning_rate": 0.008273145739043523, "loss": 2.5704, "step": 7927 }, { "crossentropy": 2.537642240524292, "epoch": 0.2874129930394432, "grad_norm": 0.03222506120800972, "grad_norm_var": 3.4081197363032324e-05, "learning_rate": 0.008272706443571384, "loss": 2.6524, "step": 7928 }, { "crossentropy": 2.656667947769165, "epoch": 0.2874492459396752, "grad_norm": 0.03189602494239807, "grad_norm_var": 3.24507964464577e-05, "learning_rate": 0.008272267103896377, "loss": 2.7064, "step": 7929 }, { "crossentropy": 2.7571403980255127, "epoch": 0.2874854988399072, "grad_norm": 0.035599153488874435, "grad_norm_var": 3.160670337502626e-05, "learning_rate": 0.008271827720024438, "loss": 2.6599, "step": 7930 }, { "crossentropy": 2.592902898788452, "epoch": 0.2875217517401392, "grad_norm": 0.042000386863946915, "grad_norm_var": 3.399202746866453e-05, "learning_rate": 0.008271388291961501, "loss": 2.6195, "step": 7931 }, { "crossentropy": 2.5232863426208496, "epoch": 0.28755800464037123, "grad_norm": 0.035485513508319855, "grad_norm_var": 3.364826391693818e-05, "learning_rate": 0.0082709488197135, "loss": 2.5939, "step": 7932 }, { "crossentropy": 2.661438465118408, "epoch": 0.28759425754060325, "grad_norm": 0.03152690455317497, "grad_norm_var": 3.414624312679409e-05, "learning_rate": 0.00827050930328637, "loss": 2.6077, "step": 7933 }, { "crossentropy": 2.5347838401794434, "epoch": 0.28763051044083526, "grad_norm": 0.029169710353016853, "grad_norm_var": 3.631046882573239e-05, "learning_rate": 0.008270069742686051, "loss": 2.6192, "step": 7934 }, { "crossentropy": 2.536409854888916, "epoch": 0.2876667633410673, "grad_norm": 0.03342408686876297, "grad_norm_var": 3.481910652536052e-05, "learning_rate": 0.008269630137918477, "loss": 2.5429, "step": 7935 }, { "crossentropy": 2.5927529335021973, "epoch": 0.2877030162412993, "grad_norm": 0.03316133841872215, "grad_norm_var": 3.41708683781291e-05, "learning_rate": 0.008269190488989586, "loss": 2.6983, "step": 7936 }, { "crossentropy": 2.512570858001709, "epoch": 0.2877392691415313, "grad_norm": 0.03504131734371185, "grad_norm_var": 3.208645308608865e-05, "learning_rate": 0.008268750795905316, "loss": 2.4692, "step": 7937 }, { "crossentropy": 2.605985164642334, "epoch": 0.2877755220417633, "grad_norm": 0.09958861023187637, "grad_norm_var": 0.00029515505522288916, "learning_rate": 0.008268311058671604, "loss": 2.5782, "step": 7938 }, { "crossentropy": 2.7243411540985107, "epoch": 0.28781177494199534, "grad_norm": 0.03504467383027077, "grad_norm_var": 0.0002930582360186862, "learning_rate": 0.008267871277294394, "loss": 2.7445, "step": 7939 }, { "crossentropy": 2.816331148147583, "epoch": 0.2878480278422274, "grad_norm": 0.03390862047672272, "grad_norm_var": 0.000292820653443504, "learning_rate": 0.008267431451779623, "loss": 2.8285, "step": 7940 }, { "crossentropy": 2.653229236602783, "epoch": 0.2878842807424594, "grad_norm": 0.032711803913116455, "grad_norm_var": 0.00029229287343371705, "learning_rate": 0.008266991582133232, "loss": 2.6744, "step": 7941 }, { "crossentropy": 2.6687304973602295, "epoch": 0.28792053364269143, "grad_norm": 0.03442377224564552, "grad_norm_var": 0.00028867764394493, "learning_rate": 0.00826655166836116, "loss": 2.6743, "step": 7942 }, { "crossentropy": 2.55305814743042, "epoch": 0.28795678654292345, "grad_norm": 0.030850635841488838, "grad_norm_var": 0.0002787960611308477, "learning_rate": 0.008266111710469353, "loss": 2.5519, "step": 7943 }, { "crossentropy": 2.578026294708252, "epoch": 0.28799303944315546, "grad_norm": 0.033722441643476486, "grad_norm_var": 0.00027780746209848315, "learning_rate": 0.008265671708463748, "loss": 2.5638, "step": 7944 }, { "crossentropy": 2.830195665359497, "epoch": 0.2880292923433875, "grad_norm": 0.034199610352516174, "grad_norm_var": 0.000276272857556843, "learning_rate": 0.008265231662350292, "loss": 2.7092, "step": 7945 }, { "crossentropy": 2.7718491554260254, "epoch": 0.2880655452436195, "grad_norm": 0.04868730902671814, "grad_norm_var": 0.00028258669748316187, "learning_rate": 0.008264791572134928, "loss": 2.666, "step": 7946 }, { "crossentropy": 2.6484780311584473, "epoch": 0.2881017981438515, "grad_norm": 0.030738959088921547, "grad_norm_var": 0.0002859089363550571, "learning_rate": 0.0082643514378236, "loss": 2.696, "step": 7947 }, { "crossentropy": 2.5850181579589844, "epoch": 0.2881380510440835, "grad_norm": 0.031197208911180496, "grad_norm_var": 0.0002886276991719565, "learning_rate": 0.00826391125942225, "loss": 2.6587, "step": 7948 }, { "crossentropy": 2.6986660957336426, "epoch": 0.28817430394431554, "grad_norm": 0.028588587418198586, "grad_norm_var": 0.0002916885418973817, "learning_rate": 0.008263471036936827, "loss": 2.6398, "step": 7949 }, { "crossentropy": 2.7251193523406982, "epoch": 0.28821055684454755, "grad_norm": 0.05723382532596588, "grad_norm_var": 0.0003086994980724288, "learning_rate": 0.008263030770373275, "loss": 2.6863, "step": 7950 }, { "crossentropy": 2.571331739425659, "epoch": 0.28824680974477956, "grad_norm": 0.03512442111968994, "grad_norm_var": 0.00030749530854453944, "learning_rate": 0.008262590459737538, "loss": 2.6531, "step": 7951 }, { "crossentropy": 2.7049174308776855, "epoch": 0.2882830626450116, "grad_norm": 0.033633336424827576, "grad_norm_var": 0.00030710157671506665, "learning_rate": 0.008262150105035568, "loss": 2.6856, "step": 7952 }, { "crossentropy": 2.6461873054504395, "epoch": 0.2883193155452436, "grad_norm": 0.03558306768536568, "grad_norm_var": 0.00030678568692988636, "learning_rate": 0.00826170970627331, "loss": 2.6816, "step": 7953 }, { "crossentropy": 2.8108677864074707, "epoch": 0.28835556844547566, "grad_norm": 0.03340175002813339, "grad_norm_var": 5.2087932856927885e-05, "learning_rate": 0.008261269263456713, "loss": 2.5847, "step": 7954 }, { "crossentropy": 2.680670738220215, "epoch": 0.2883918213457077, "grad_norm": 0.03331679850816727, "grad_norm_var": 5.239454875279324e-05, "learning_rate": 0.008260828776591725, "loss": 2.7018, "step": 7955 }, { "crossentropy": 2.8223133087158203, "epoch": 0.2884280742459397, "grad_norm": 0.03444536030292511, "grad_norm_var": 5.23016987041852e-05, "learning_rate": 0.008260388245684296, "loss": 2.7579, "step": 7956 }, { "crossentropy": 2.6983327865600586, "epoch": 0.2884643271461717, "grad_norm": 0.03417360037565231, "grad_norm_var": 5.189353404726169e-05, "learning_rate": 0.008259947670740375, "loss": 2.6807, "step": 7957 }, { "crossentropy": 2.6133837699890137, "epoch": 0.2885005800464037, "grad_norm": 0.0306893028318882, "grad_norm_var": 5.3342161153349303e-05, "learning_rate": 0.008259507051765914, "loss": 2.5509, "step": 7958 }, { "crossentropy": 2.6388328075408936, "epoch": 0.28853683294663574, "grad_norm": 0.030904807150363922, "grad_norm_var": 5.330985259175639e-05, "learning_rate": 0.008259066388766863, "loss": 2.6295, "step": 7959 }, { "crossentropy": 2.7587687969207764, "epoch": 0.28857308584686775, "grad_norm": 0.03046138398349285, "grad_norm_var": 5.4683281058423786e-05, "learning_rate": 0.008258625681749177, "loss": 2.7592, "step": 7960 }, { "crossentropy": 2.6345481872558594, "epoch": 0.28860933874709976, "grad_norm": 0.03185863047838211, "grad_norm_var": 5.53220352075481e-05, "learning_rate": 0.008258184930718806, "loss": 2.7055, "step": 7961 }, { "crossentropy": 2.789184331893921, "epoch": 0.2886455916473318, "grad_norm": 0.03447975590825081, "grad_norm_var": 4.201406322414887e-05, "learning_rate": 0.008257744135681701, "loss": 2.7011, "step": 7962 }, { "crossentropy": 2.7674014568328857, "epoch": 0.2886818445475638, "grad_norm": 0.03211848810315132, "grad_norm_var": 4.1512133289440566e-05, "learning_rate": 0.008257303296643818, "loss": 2.7382, "step": 7963 }, { "crossentropy": 2.7672557830810547, "epoch": 0.2887180974477958, "grad_norm": 0.02969127520918846, "grad_norm_var": 4.225693653334365e-05, "learning_rate": 0.008256862413611113, "loss": 2.7013, "step": 7964 }, { "crossentropy": 2.7970762252807617, "epoch": 0.2887543503480278, "grad_norm": 0.03160462900996208, "grad_norm_var": 4.0606491210432475e-05, "learning_rate": 0.008256421486589536, "loss": 2.6847, "step": 7965 }, { "crossentropy": 2.611048460006714, "epoch": 0.28879060324825984, "grad_norm": 0.03315648064017296, "grad_norm_var": 3.1981853817499566e-06, "learning_rate": 0.008255980515585045, "loss": 2.6212, "step": 7966 }, { "crossentropy": 2.6268820762634277, "epoch": 0.2888268561484919, "grad_norm": 0.035094667226076126, "grad_norm_var": 3.1889803958911096e-06, "learning_rate": 0.008255539500603598, "loss": 2.6733, "step": 7967 }, { "crossentropy": 2.5411579608917236, "epoch": 0.2888631090487239, "grad_norm": 0.03351699560880661, "grad_norm_var": 3.176718565605603e-06, "learning_rate": 0.008255098441651147, "loss": 2.6199, "step": 7968 }, { "crossentropy": 2.6848959922790527, "epoch": 0.28889936194895594, "grad_norm": 0.03188825398683548, "grad_norm_var": 2.6495614159926696e-06, "learning_rate": 0.008254657338733654, "loss": 2.6805, "step": 7969 }, { "crossentropy": 2.6000564098358154, "epoch": 0.28893561484918795, "grad_norm": 0.03492942079901695, "grad_norm_var": 2.968887278520501e-06, "learning_rate": 0.008254216191857074, "loss": 2.6547, "step": 7970 }, { "crossentropy": 2.680687189102173, "epoch": 0.28897186774941996, "grad_norm": 0.03434734418988228, "grad_norm_var": 3.127488418853139e-06, "learning_rate": 0.008253775001027365, "loss": 2.7555, "step": 7971 }, { "crossentropy": 2.6463372707366943, "epoch": 0.289008120649652, "grad_norm": 0.02997746877372265, "grad_norm_var": 3.3413446957205698e-06, "learning_rate": 0.008253333766250488, "loss": 2.6292, "step": 7972 }, { "crossentropy": 2.648798942565918, "epoch": 0.289044373549884, "grad_norm": 0.03133879229426384, "grad_norm_var": 3.1848623181113604e-06, "learning_rate": 0.008252892487532401, "loss": 2.6638, "step": 7973 }, { "crossentropy": 2.7708916664123535, "epoch": 0.289080626450116, "grad_norm": 0.03513539209961891, "grad_norm_var": 3.493006635020499e-06, "learning_rate": 0.008252451164879064, "loss": 2.737, "step": 7974 }, { "crossentropy": 2.5624523162841797, "epoch": 0.289116879350348, "grad_norm": 0.03194541484117508, "grad_norm_var": 3.3349876396734344e-06, "learning_rate": 0.008252009798296439, "loss": 2.6847, "step": 7975 }, { "crossentropy": 2.701141357421875, "epoch": 0.28915313225058004, "grad_norm": 0.031325679272413254, "grad_norm_var": 3.135623276082642e-06, "learning_rate": 0.008251568387790486, "loss": 2.6142, "step": 7976 }, { "crossentropy": 2.673178195953369, "epoch": 0.28918938515081205, "grad_norm": 0.031140338629484177, "grad_norm_var": 3.2437129687263998e-06, "learning_rate": 0.008251126933367168, "loss": 2.7036, "step": 7977 }, { "crossentropy": 2.5427377223968506, "epoch": 0.28922563805104406, "grad_norm": 0.03003603033721447, "grad_norm_var": 3.3674797556330905e-06, "learning_rate": 0.008250685435032446, "loss": 2.6105, "step": 7978 }, { "crossentropy": 2.7164065837860107, "epoch": 0.2892618909512761, "grad_norm": 0.02876340039074421, "grad_norm_var": 4.164705051424308e-06, "learning_rate": 0.008250243892792285, "loss": 2.6403, "step": 7979 }, { "crossentropy": 2.5972683429718018, "epoch": 0.2892981438515081, "grad_norm": 0.030197426676750183, "grad_norm_var": 4.016929727142339e-06, "learning_rate": 0.008249802306652647, "loss": 2.6052, "step": 7980 }, { "crossentropy": 2.748485565185547, "epoch": 0.28933439675174016, "grad_norm": 0.03183509409427643, "grad_norm_var": 4.003495182800182e-06, "learning_rate": 0.008249360676619499, "loss": 2.7482, "step": 7981 }, { "crossentropy": 2.759695291519165, "epoch": 0.2893706496519722, "grad_norm": 0.03197385370731354, "grad_norm_var": 3.934451565874838e-06, "learning_rate": 0.008248919002698803, "loss": 2.7289, "step": 7982 }, { "crossentropy": 2.6337242126464844, "epoch": 0.2894069025522042, "grad_norm": 0.03194788470864296, "grad_norm_var": 3.292816479356211e-06, "learning_rate": 0.008248477284896525, "loss": 2.5289, "step": 7983 }, { "crossentropy": 2.6297690868377686, "epoch": 0.2894431554524362, "grad_norm": 0.034465063363313675, "grad_norm_var": 3.554195974242201e-06, "learning_rate": 0.00824803552321863, "loss": 2.6103, "step": 7984 }, { "crossentropy": 2.621750593185425, "epoch": 0.2894794083526682, "grad_norm": 0.03154126927256584, "grad_norm_var": 3.5647130223269284e-06, "learning_rate": 0.008247593717671088, "loss": 2.5987, "step": 7985 }, { "crossentropy": 2.667592763900757, "epoch": 0.28951566125290024, "grad_norm": 0.03357885405421257, "grad_norm_var": 3.138816202787502e-06, "learning_rate": 0.008247151868259866, "loss": 2.6512, "step": 7986 }, { "crossentropy": 2.540961980819702, "epoch": 0.28955191415313225, "grad_norm": 0.031772129237651825, "grad_norm_var": 2.694718491689932e-06, "learning_rate": 0.008246709974990928, "loss": 2.5952, "step": 7987 }, { "crossentropy": 2.6394455432891846, "epoch": 0.28958816705336426, "grad_norm": 0.030321162194013596, "grad_norm_var": 2.623811987957578e-06, "learning_rate": 0.008246268037870246, "loss": 2.6834, "step": 7988 }, { "crossentropy": 2.6310927867889404, "epoch": 0.2896244199535963, "grad_norm": 0.032343629747629166, "grad_norm_var": 2.6375378367493944e-06, "learning_rate": 0.008245826056903787, "loss": 2.5549, "step": 7989 }, { "crossentropy": 2.6671762466430664, "epoch": 0.2896606728538283, "grad_norm": 0.032128192484378815, "grad_norm_var": 1.8534192115287248e-06, "learning_rate": 0.008245384032097522, "loss": 2.6585, "step": 7990 }, { "crossentropy": 2.666947364807129, "epoch": 0.2896969257540603, "grad_norm": 0.03378516808152199, "grad_norm_var": 2.154055799024647e-06, "learning_rate": 0.00824494196345742, "loss": 2.6352, "step": 7991 }, { "crossentropy": 2.5921895503997803, "epoch": 0.2897331786542923, "grad_norm": 0.031565628945827484, "grad_norm_var": 2.1457681665710177e-06, "learning_rate": 0.008244499850989451, "loss": 2.6255, "step": 7992 }, { "crossentropy": 2.681556463241577, "epoch": 0.28976943155452434, "grad_norm": 0.03127407655119896, "grad_norm_var": 2.136688840113689e-06, "learning_rate": 0.00824405769469959, "loss": 2.6397, "step": 7993 }, { "crossentropy": 2.784942388534546, "epoch": 0.2898056844547564, "grad_norm": 0.03076731041073799, "grad_norm_var": 2.0058641884354434e-06, "learning_rate": 0.008243615494593806, "loss": 2.7285, "step": 7994 }, { "crossentropy": 2.486642360687256, "epoch": 0.2898419373549884, "grad_norm": 0.030783018097281456, "grad_norm_var": 1.452175814139556e-06, "learning_rate": 0.008243173250678073, "loss": 2.5516, "step": 7995 }, { "crossentropy": 2.674924373626709, "epoch": 0.28987819025522044, "grad_norm": 0.03302442282438278, "grad_norm_var": 1.3127468636613676e-06, "learning_rate": 0.008242730962958363, "loss": 2.6462, "step": 7996 }, { "crossentropy": 2.743241310119629, "epoch": 0.28991444315545245, "grad_norm": 0.03649204596877098, "grad_norm_var": 2.5228514148874777e-06, "learning_rate": 0.00824228863144065, "loss": 2.6814, "step": 7997 }, { "crossentropy": 2.5726327896118164, "epoch": 0.28995069605568446, "grad_norm": 0.03265095502138138, "grad_norm_var": 2.5166232650343368e-06, "learning_rate": 0.008241846256130909, "loss": 2.6275, "step": 7998 }, { "crossentropy": 2.673015594482422, "epoch": 0.2899869489559165, "grad_norm": 0.03188677504658699, "grad_norm_var": 2.5205612623493824e-06, "learning_rate": 0.008241403837035114, "loss": 2.6726, "step": 7999 }, { "crossentropy": 2.676950454711914, "epoch": 0.2900232018561485, "grad_norm": 0.029092606157064438, "grad_norm_var": 2.8443465673205394e-06, "learning_rate": 0.008240961374159243, "loss": 2.6134, "step": 8000 }, { "crossentropy": 2.5702860355377197, "epoch": 0.2900594547563805, "grad_norm": 0.028357109054923058, "grad_norm_var": 3.699509508079309e-06, "learning_rate": 0.00824051886750927, "loss": 2.5698, "step": 8001 }, { "crossentropy": 2.674377202987671, "epoch": 0.2900957076566125, "grad_norm": 0.029371609911322594, "grad_norm_var": 3.843809219957712e-06, "learning_rate": 0.00824007631709117, "loss": 2.6975, "step": 8002 }, { "crossentropy": 2.631404399871826, "epoch": 0.29013196055684454, "grad_norm": 0.030560901388525963, "grad_norm_var": 3.907862792256143e-06, "learning_rate": 0.008239633722910924, "loss": 2.6344, "step": 8003 }, { "crossentropy": 2.8348066806793213, "epoch": 0.29016821345707655, "grad_norm": 0.03550046682357788, "grad_norm_var": 4.752899487546885e-06, "learning_rate": 0.008239191084974508, "loss": 2.7439, "step": 8004 }, { "crossentropy": 2.6368958950042725, "epoch": 0.29020446635730857, "grad_norm": 0.03276155889034271, "grad_norm_var": 4.791379017842166e-06, "learning_rate": 0.0082387484032879, "loss": 2.6483, "step": 8005 }, { "crossentropy": 2.8154518604278564, "epoch": 0.2902407192575406, "grad_norm": 0.033625487238168716, "grad_norm_var": 4.982021382934851e-06, "learning_rate": 0.00823830567785708, "loss": 2.7784, "step": 8006 }, { "crossentropy": 2.676649570465088, "epoch": 0.2902769721577726, "grad_norm": 0.02882879413664341, "grad_norm_var": 5.316958913212941e-06, "learning_rate": 0.00823786290868803, "loss": 2.5993, "step": 8007 }, { "crossentropy": 2.6071486473083496, "epoch": 0.29031322505800466, "grad_norm": 0.030822191387414932, "grad_norm_var": 5.3607503902640035e-06, "learning_rate": 0.008237420095786725, "loss": 2.6187, "step": 8008 }, { "crossentropy": 2.5906155109405518, "epoch": 0.2903494779582367, "grad_norm": 0.03244458884000778, "grad_norm_var": 5.39357095082787e-06, "learning_rate": 0.008236977239159149, "loss": 2.5589, "step": 8009 }, { "crossentropy": 2.6704320907592773, "epoch": 0.2903857308584687, "grad_norm": 0.03208552300930023, "grad_norm_var": 5.340773458930891e-06, "learning_rate": 0.008236534338811283, "loss": 2.6822, "step": 8010 }, { "crossentropy": 2.5519473552703857, "epoch": 0.2904219837587007, "grad_norm": 0.030019991099834442, "grad_norm_var": 5.477370976257149e-06, "learning_rate": 0.00823609139474911, "loss": 2.5676, "step": 8011 }, { "crossentropy": 2.692614793777466, "epoch": 0.2904582366589327, "grad_norm": 0.029734909534454346, "grad_norm_var": 5.5816927170102466e-06, "learning_rate": 0.00823564840697861, "loss": 2.6628, "step": 8012 }, { "crossentropy": 2.7014505863189697, "epoch": 0.29049448955916474, "grad_norm": 0.028311176225543022, "grad_norm_var": 4.3354259114957435e-06, "learning_rate": 0.00823520537550577, "loss": 2.7105, "step": 8013 }, { "crossentropy": 2.609403610229492, "epoch": 0.29053074245939675, "grad_norm": 0.03090791404247284, "grad_norm_var": 4.142415654181695e-06, "learning_rate": 0.008234762300336573, "loss": 2.6558, "step": 8014 }, { "crossentropy": 2.726743698120117, "epoch": 0.29056699535962877, "grad_norm": 0.02933509461581707, "grad_norm_var": 4.211753429181747e-06, "learning_rate": 0.008234319181476999, "loss": 2.7567, "step": 8015 }, { "crossentropy": 2.533416986465454, "epoch": 0.2906032482598608, "grad_norm": 0.029057083651423454, "grad_norm_var": 4.219611197662819e-06, "learning_rate": 0.008233876018933037, "loss": 2.5895, "step": 8016 }, { "crossentropy": 2.631028175354004, "epoch": 0.2906395011600928, "grad_norm": 0.02941649779677391, "grad_norm_var": 3.954188063088187e-06, "learning_rate": 0.008233432812710672, "loss": 2.6573, "step": 8017 }, { "crossentropy": 2.8423306941986084, "epoch": 0.2906757540603248, "grad_norm": 0.0299727413803339, "grad_norm_var": 3.8623675155319745e-06, "learning_rate": 0.00823298956281589, "loss": 2.726, "step": 8018 }, { "crossentropy": 2.5443596839904785, "epoch": 0.2907120069605568, "grad_norm": 0.02878146432340145, "grad_norm_var": 4.1256689768252865e-06, "learning_rate": 0.00823254626925468, "loss": 2.5897, "step": 8019 }, { "crossentropy": 2.5149900913238525, "epoch": 0.29074825986078884, "grad_norm": 0.0295390821993351, "grad_norm_var": 2.551287094331757e-06, "learning_rate": 0.008232102932033025, "loss": 2.5794, "step": 8020 }, { "crossentropy": 2.598693370819092, "epoch": 0.2907845127610209, "grad_norm": 0.027920834720134735, "grad_norm_var": 2.4611119627329067e-06, "learning_rate": 0.008231659551156914, "loss": 2.6385, "step": 8021 }, { "crossentropy": 2.619283676147461, "epoch": 0.2908207656612529, "grad_norm": 0.0305036511272192, "grad_norm_var": 1.582038014318387e-06, "learning_rate": 0.008231216126632338, "loss": 2.5953, "step": 8022 }, { "crossentropy": 2.653024196624756, "epoch": 0.29085701856148494, "grad_norm": 0.03233325481414795, "grad_norm_var": 1.8700644735057554e-06, "learning_rate": 0.008230772658465285, "loss": 2.6795, "step": 8023 }, { "crossentropy": 2.5720012187957764, "epoch": 0.29089327146171695, "grad_norm": 0.03372754529118538, "grad_norm_var": 2.6874184315054186e-06, "learning_rate": 0.008230329146661743, "loss": 2.6614, "step": 8024 }, { "crossentropy": 2.681450366973877, "epoch": 0.29092952436194897, "grad_norm": 0.03077600710093975, "grad_norm_var": 2.374452208486509e-06, "learning_rate": 0.008229885591227703, "loss": 2.6568, "step": 8025 }, { "crossentropy": 2.799220085144043, "epoch": 0.290965777262181, "grad_norm": 0.02840501070022583, "grad_norm_var": 2.271957458173365e-06, "learning_rate": 0.00822944199216916, "loss": 2.683, "step": 8026 }, { "crossentropy": 2.574207067489624, "epoch": 0.291002030162413, "grad_norm": 0.031222188845276833, "grad_norm_var": 2.378092304619085e-06, "learning_rate": 0.0082289983494921, "loss": 2.5844, "step": 8027 }, { "crossentropy": 2.6943743228912354, "epoch": 0.291038283062645, "grad_norm": 0.03143111243844032, "grad_norm_var": 2.4987434710796915e-06, "learning_rate": 0.008228554663202516, "loss": 2.6777, "step": 8028 }, { "crossentropy": 2.4610700607299805, "epoch": 0.291074535962877, "grad_norm": 0.03077695332467556, "grad_norm_var": 2.289799449472126e-06, "learning_rate": 0.008228110933306403, "loss": 2.5874, "step": 8029 }, { "crossentropy": 2.5822858810424805, "epoch": 0.29111078886310904, "grad_norm": 0.02894156612455845, "grad_norm_var": 2.3607095780046506e-06, "learning_rate": 0.008227667159809753, "loss": 2.5404, "step": 8030 }, { "crossentropy": 2.4525246620178223, "epoch": 0.29114704176334105, "grad_norm": 0.028215037658810616, "grad_norm_var": 2.5583903137072245e-06, "learning_rate": 0.00822722334271856, "loss": 2.5892, "step": 8031 }, { "crossentropy": 2.562356472015381, "epoch": 0.29118329466357307, "grad_norm": 0.03012566827237606, "grad_norm_var": 2.486329344354128e-06, "learning_rate": 0.008226779482038819, "loss": 2.6242, "step": 8032 }, { "crossentropy": 2.615006685256958, "epoch": 0.2912195475638051, "grad_norm": 0.030240746214985847, "grad_norm_var": 2.450318011787816e-06, "learning_rate": 0.008226335577776522, "loss": 2.6195, "step": 8033 }, { "crossentropy": 2.545448064804077, "epoch": 0.2912558004640371, "grad_norm": 0.035199616104364395, "grad_norm_var": 4.011958254246254e-06, "learning_rate": 0.008225891629937669, "loss": 2.6424, "step": 8034 }, { "crossentropy": 2.617711305618286, "epoch": 0.29129205336426917, "grad_norm": 0.030089346691966057, "grad_norm_var": 3.817659333261294e-06, "learning_rate": 0.008225447638528253, "loss": 2.6892, "step": 8035 }, { "crossentropy": 2.647860050201416, "epoch": 0.2913283062645012, "grad_norm": 0.0318852961063385, "grad_norm_var": 3.832798236718865e-06, "learning_rate": 0.008225003603554272, "loss": 2.6438, "step": 8036 }, { "crossentropy": 2.7717719078063965, "epoch": 0.2913645591647332, "grad_norm": 0.034920744597911835, "grad_norm_var": 4.26672522074975e-06, "learning_rate": 0.008224559525021723, "loss": 2.6947, "step": 8037 }, { "crossentropy": 2.551994800567627, "epoch": 0.2914008120649652, "grad_norm": 0.033965833485126495, "grad_norm_var": 4.706163879427718e-06, "learning_rate": 0.008224115402936605, "loss": 2.653, "step": 8038 }, { "crossentropy": 2.5956954956054688, "epoch": 0.2914370649651972, "grad_norm": 0.029701944440603256, "grad_norm_var": 4.808317455982231e-06, "learning_rate": 0.008223671237304915, "loss": 2.6737, "step": 8039 }, { "crossentropy": 2.7114410400390625, "epoch": 0.29147331786542924, "grad_norm": 0.03046923317015171, "grad_norm_var": 4.385313428981171e-06, "learning_rate": 0.008223227028132654, "loss": 2.6798, "step": 8040 }, { "crossentropy": 2.581141710281372, "epoch": 0.29150957076566125, "grad_norm": 0.030547143891453743, "grad_norm_var": 4.396120856537062e-06, "learning_rate": 0.008222782775425819, "loss": 2.5993, "step": 8041 }, { "crossentropy": 2.71673321723938, "epoch": 0.29154582366589327, "grad_norm": 0.029114119708538055, "grad_norm_var": 4.181385197048757e-06, "learning_rate": 0.008222338479190412, "loss": 2.6629, "step": 8042 }, { "crossentropy": 2.5138344764709473, "epoch": 0.2915820765661253, "grad_norm": 0.032713040709495544, "grad_norm_var": 4.353949553929463e-06, "learning_rate": 0.008221894139432433, "loss": 2.5031, "step": 8043 }, { "crossentropy": 2.587472915649414, "epoch": 0.2916183294663573, "grad_norm": 0.03433835133910179, "grad_norm_var": 4.992686604460977e-06, "learning_rate": 0.008221449756157887, "loss": 2.6782, "step": 8044 }, { "crossentropy": 2.551431894302368, "epoch": 0.2916545823665893, "grad_norm": 0.032107241451740265, "grad_norm_var": 5.005587875153112e-06, "learning_rate": 0.00822100532937277, "loss": 2.6677, "step": 8045 }, { "crossentropy": 2.6865553855895996, "epoch": 0.2916908352668213, "grad_norm": 0.030718423426151276, "grad_norm_var": 4.617885871183401e-06, "learning_rate": 0.008220560859083091, "loss": 2.5996, "step": 8046 }, { "crossentropy": 2.5868453979492188, "epoch": 0.29172708816705334, "grad_norm": 0.02970435656607151, "grad_norm_var": 4.099835064810357e-06, "learning_rate": 0.008220116345294848, "loss": 2.6638, "step": 8047 }, { "crossentropy": 2.645143508911133, "epoch": 0.2917633410672854, "grad_norm": 0.03162101283669472, "grad_norm_var": 3.942632860448126e-06, "learning_rate": 0.00821967178801405, "loss": 2.6508, "step": 8048 }, { "crossentropy": 2.596240282058716, "epoch": 0.2917995939675174, "grad_norm": 0.02955043315887451, "grad_norm_var": 4.107513324058155e-06, "learning_rate": 0.008219227187246696, "loss": 2.5906, "step": 8049 }, { "crossentropy": 2.477670192718506, "epoch": 0.29183584686774944, "grad_norm": 0.031110549345612526, "grad_norm_var": 3.2256474916130766e-06, "learning_rate": 0.008218782542998793, "loss": 2.705, "step": 8050 }, { "crossentropy": 2.5655970573425293, "epoch": 0.29187209976798145, "grad_norm": 0.03335495665669441, "grad_norm_var": 3.317208415647375e-06, "learning_rate": 0.008218337855276347, "loss": 2.6354, "step": 8051 }, { "crossentropy": 2.778064489364624, "epoch": 0.29190835266821347, "grad_norm": 0.03658033162355423, "grad_norm_var": 4.8648025802650045e-06, "learning_rate": 0.008217893124085367, "loss": 2.7271, "step": 8052 }, { "crossentropy": 2.6781108379364014, "epoch": 0.2919446055684455, "grad_norm": 0.03231942281126976, "grad_norm_var": 4.242560257663024e-06, "learning_rate": 0.008217448349431855, "loss": 2.7027, "step": 8053 }, { "crossentropy": 2.7792153358459473, "epoch": 0.2919808584686775, "grad_norm": 0.03380036726593971, "grad_norm_var": 4.19527008875172e-06, "learning_rate": 0.00821700353132182, "loss": 2.8165, "step": 8054 }, { "crossentropy": 2.7005279064178467, "epoch": 0.2920171113689095, "grad_norm": 0.03283340483903885, "grad_norm_var": 3.959526851245119e-06, "learning_rate": 0.008216558669761271, "loss": 2.6838, "step": 8055 }, { "crossentropy": 2.6701204776763916, "epoch": 0.2920533642691415, "grad_norm": 0.030369527637958527, "grad_norm_var": 3.979569698512791e-06, "learning_rate": 0.008216113764756217, "loss": 2.7052, "step": 8056 }, { "crossentropy": 2.616546392440796, "epoch": 0.29208961716937354, "grad_norm": 0.02901957742869854, "grad_norm_var": 4.405826038412228e-06, "learning_rate": 0.008215668816312664, "loss": 2.6604, "step": 8057 }, { "crossentropy": 2.8140878677368164, "epoch": 0.29212587006960555, "grad_norm": 0.033733922988176346, "grad_norm_var": 4.067784373340761e-06, "learning_rate": 0.008215223824436626, "loss": 2.698, "step": 8058 }, { "crossentropy": 2.6253201961517334, "epoch": 0.29216212296983757, "grad_norm": 0.032619018107652664, "grad_norm_var": 4.0608670037861705e-06, "learning_rate": 0.008214778789134108, "loss": 2.6912, "step": 8059 }, { "crossentropy": 2.606448173522949, "epoch": 0.2921983758700696, "grad_norm": 0.03188547492027283, "grad_norm_var": 3.7085491050228473e-06, "learning_rate": 0.008214333710411127, "loss": 2.629, "step": 8060 }, { "crossentropy": 2.806224822998047, "epoch": 0.2922346287703016, "grad_norm": 0.03604183718562126, "grad_norm_var": 4.754407609252224e-06, "learning_rate": 0.008213888588273691, "loss": 2.6848, "step": 8061 }, { "crossentropy": 2.6754894256591797, "epoch": 0.29227088167053367, "grad_norm": 0.03614863380789757, "grad_norm_var": 5.5218201397059e-06, "learning_rate": 0.00821344342272781, "loss": 2.6139, "step": 8062 }, { "crossentropy": 2.692267894744873, "epoch": 0.2923071345707657, "grad_norm": 0.03304767981171608, "grad_norm_var": 4.95489845098353e-06, "learning_rate": 0.008212998213779498, "loss": 2.6866, "step": 8063 }, { "crossentropy": 2.5827109813690186, "epoch": 0.2923433874709977, "grad_norm": 0.03579116240143776, "grad_norm_var": 5.412787013633844e-06, "learning_rate": 0.008212552961434774, "loss": 2.6449, "step": 8064 }, { "crossentropy": 2.606675386428833, "epoch": 0.2923796403712297, "grad_norm": 0.04148629307746887, "grad_norm_var": 8.806508069112742e-06, "learning_rate": 0.008212107665699644, "loss": 2.6706, "step": 8065 }, { "crossentropy": 2.6646721363067627, "epoch": 0.2924158932714617, "grad_norm": 0.040313150733709335, "grad_norm_var": 1.0849956446124876e-05, "learning_rate": 0.008211662326580126, "loss": 2.6635, "step": 8066 }, { "crossentropy": 2.6106483936309814, "epoch": 0.29245214617169374, "grad_norm": 0.03125173971056938, "grad_norm_var": 1.1400991957804723e-05, "learning_rate": 0.008211216944082233, "loss": 2.6185, "step": 8067 }, { "crossentropy": 2.7144298553466797, "epoch": 0.29248839907192575, "grad_norm": 0.030062125995755196, "grad_norm_var": 1.1989954236259203e-05, "learning_rate": 0.008210771518211984, "loss": 2.67, "step": 8068 }, { "crossentropy": 2.6330080032348633, "epoch": 0.29252465197215777, "grad_norm": 0.030124347656965256, "grad_norm_var": 1.2723029535842423e-05, "learning_rate": 0.008210326048975392, "loss": 2.6911, "step": 8069 }, { "crossentropy": 2.758772134780884, "epoch": 0.2925609048723898, "grad_norm": 0.030934251844882965, "grad_norm_var": 1.3182043962577048e-05, "learning_rate": 0.008209880536378476, "loss": 2.7198, "step": 8070 }, { "crossentropy": 2.630354881286621, "epoch": 0.2925971577726218, "grad_norm": 0.03230922669172287, "grad_norm_var": 1.324432946075791e-05, "learning_rate": 0.008209434980427251, "loss": 2.5653, "step": 8071 }, { "crossentropy": 2.6770124435424805, "epoch": 0.2926334106728538, "grad_norm": 0.028456788510084152, "grad_norm_var": 1.4257620088966444e-05, "learning_rate": 0.008208989381127737, "loss": 2.6748, "step": 8072 }, { "crossentropy": 2.77028489112854, "epoch": 0.2926696635730858, "grad_norm": 0.029890356585383415, "grad_norm_var": 1.3804951724440822e-05, "learning_rate": 0.00820854373848595, "loss": 2.7594, "step": 8073 }, { "crossentropy": 2.581422805786133, "epoch": 0.29270591647331784, "grad_norm": 0.03062443807721138, "grad_norm_var": 1.4262936849181867e-05, "learning_rate": 0.008208098052507912, "loss": 2.5715, "step": 8074 }, { "crossentropy": 2.6405022144317627, "epoch": 0.2927421693735499, "grad_norm": 0.03220159187912941, "grad_norm_var": 1.4305420167430351e-05, "learning_rate": 0.008207652323199642, "loss": 2.6353, "step": 8075 }, { "crossentropy": 2.518118381500244, "epoch": 0.2927784222737819, "grad_norm": 0.03151979297399521, "grad_norm_var": 1.4375948375190683e-05, "learning_rate": 0.008207206550567156, "loss": 2.6145, "step": 8076 }, { "crossentropy": 2.7368667125701904, "epoch": 0.29281467517401394, "grad_norm": 0.028819626197218895, "grad_norm_var": 1.4839409955344365e-05, "learning_rate": 0.008206760734616483, "loss": 2.6852, "step": 8077 }, { "crossentropy": 2.714980125427246, "epoch": 0.29285092807424595, "grad_norm": 0.029939381405711174, "grad_norm_var": 1.438263965460821e-05, "learning_rate": 0.008206314875353638, "loss": 2.6999, "step": 8078 }, { "crossentropy": 2.7734251022338867, "epoch": 0.29288718097447797, "grad_norm": 0.02996402233839035, "grad_norm_var": 1.4668816112400896e-05, "learning_rate": 0.008205868972784644, "loss": 2.6253, "step": 8079 }, { "crossentropy": 2.5960278511047363, "epoch": 0.29292343387471, "grad_norm": 0.032695695757865906, "grad_norm_var": 1.3746514022365483e-05, "learning_rate": 0.008205423026915525, "loss": 2.613, "step": 8080 }, { "crossentropy": 2.670771598815918, "epoch": 0.292959686774942, "grad_norm": 0.03232446312904358, "grad_norm_var": 7.297033508587604e-06, "learning_rate": 0.008204977037752303, "loss": 2.5463, "step": 8081 }, { "crossentropy": 2.6536664962768555, "epoch": 0.292995939675174, "grad_norm": 0.030286405235528946, "grad_norm_var": 1.5835582103657444e-06, "learning_rate": 0.008204531005301005, "loss": 2.6189, "step": 8082 }, { "crossentropy": 2.5851590633392334, "epoch": 0.293032192575406, "grad_norm": 0.0305467676371336, "grad_norm_var": 1.5639582781722203e-06, "learning_rate": 0.00820408492956765, "loss": 2.6429, "step": 8083 }, { "crossentropy": 2.6194303035736084, "epoch": 0.29306844547563804, "grad_norm": 0.04780931770801544, "grad_norm_var": 1.9813790665692588e-05, "learning_rate": 0.008203638810558267, "loss": 2.6373, "step": 8084 }, { "crossentropy": 2.643249750137329, "epoch": 0.29310469837587005, "grad_norm": 0.030337592586874962, "grad_norm_var": 1.9769617735228038e-05, "learning_rate": 0.008203192648278878, "loss": 2.6245, "step": 8085 }, { "crossentropy": 2.6432902812957764, "epoch": 0.29314095127610207, "grad_norm": 0.03389471769332886, "grad_norm_var": 1.997911521998626e-05, "learning_rate": 0.008202746442735515, "loss": 2.597, "step": 8086 }, { "crossentropy": 2.6605896949768066, "epoch": 0.2931772041763341, "grad_norm": 0.05943647772073746, "grad_norm_var": 6.717642344781515e-05, "learning_rate": 0.0082023001939342, "loss": 2.7234, "step": 8087 }, { "crossentropy": 2.766817092895508, "epoch": 0.29321345707656615, "grad_norm": 0.03199022635817528, "grad_norm_var": 6.549986503676142e-05, "learning_rate": 0.008201853901880959, "loss": 2.7027, "step": 8088 }, { "crossentropy": 2.5851895809173584, "epoch": 0.29324970997679817, "grad_norm": 0.028984149917960167, "grad_norm_var": 6.603476649548744e-05, "learning_rate": 0.008201407566581822, "loss": 2.6333, "step": 8089 }, { "crossentropy": 2.6620209217071533, "epoch": 0.2932859628770302, "grad_norm": 0.02917260304093361, "grad_norm_var": 6.678817717358716e-05, "learning_rate": 0.00820096118804282, "loss": 2.6412, "step": 8090 }, { "crossentropy": 2.6337950229644775, "epoch": 0.2933222157772622, "grad_norm": 0.030246028676629066, "grad_norm_var": 6.742966848331943e-05, "learning_rate": 0.008200514766269979, "loss": 2.5837, "step": 8091 }, { "crossentropy": 2.594085454940796, "epoch": 0.2933584686774942, "grad_norm": 0.02867330238223076, "grad_norm_var": 6.873429245668587e-05, "learning_rate": 0.008200068301269329, "loss": 2.6061, "step": 8092 }, { "crossentropy": 2.70317006111145, "epoch": 0.2933947215777262, "grad_norm": 0.04230820760130882, "grad_norm_var": 7.17869384897975e-05, "learning_rate": 0.008199621793046898, "loss": 2.7819, "step": 8093 }, { "crossentropy": 2.644620656967163, "epoch": 0.29343097447795824, "grad_norm": 0.0331544429063797, "grad_norm_var": 7.056879735597802e-05, "learning_rate": 0.008199175241608722, "loss": 2.6272, "step": 8094 }, { "crossentropy": 2.557135581970215, "epoch": 0.29346722737819025, "grad_norm": 0.036000315099954605, "grad_norm_var": 6.920419943417284e-05, "learning_rate": 0.008198728646960828, "loss": 2.6312, "step": 8095 }, { "crossentropy": 2.4621903896331787, "epoch": 0.29350348027842227, "grad_norm": 0.032429177314043045, "grad_norm_var": 6.928577288897638e-05, "learning_rate": 0.008198282009109251, "loss": 2.5933, "step": 8096 }, { "crossentropy": 2.5973784923553467, "epoch": 0.2935397331786543, "grad_norm": 0.03398057445883751, "grad_norm_var": 6.889959601918261e-05, "learning_rate": 0.008197835328060019, "loss": 2.6344, "step": 8097 }, { "crossentropy": 2.605698823928833, "epoch": 0.2935759860788863, "grad_norm": 0.03048400767147541, "grad_norm_var": 6.877908190912941e-05, "learning_rate": 0.00819738860381917, "loss": 2.6348, "step": 8098 }, { "crossentropy": 2.567943811416626, "epoch": 0.2936122389791183, "grad_norm": 0.03040512651205063, "grad_norm_var": 6.886378558591341e-05, "learning_rate": 0.008196941836392736, "loss": 2.5985, "step": 8099 }, { "crossentropy": 2.5183825492858887, "epoch": 0.2936484918793503, "grad_norm": 0.02883116528391838, "grad_norm_var": 5.885175491807578e-05, "learning_rate": 0.00819649502578675, "loss": 2.641, "step": 8100 }, { "crossentropy": 2.735898494720459, "epoch": 0.29368474477958234, "grad_norm": 0.030938560143113136, "grad_norm_var": 5.859925150240302e-05, "learning_rate": 0.008196048172007246, "loss": 2.7186, "step": 8101 }, { "crossentropy": 2.6529977321624756, "epoch": 0.2937209976798144, "grad_norm": 0.029024668037891388, "grad_norm_var": 6.0025322660615886e-05, "learning_rate": 0.008195601275060265, "loss": 2.7486, "step": 8102 }, { "crossentropy": 2.5400116443634033, "epoch": 0.2937572505800464, "grad_norm": 0.030037878081202507, "grad_norm_var": 1.2390988942463871e-05, "learning_rate": 0.008195154334951837, "loss": 2.6156, "step": 8103 }, { "crossentropy": 2.614961624145508, "epoch": 0.29379350348027844, "grad_norm": 0.029834061861038208, "grad_norm_var": 1.2588422555506841e-05, "learning_rate": 0.008194707351688002, "loss": 2.6812, "step": 8104 }, { "crossentropy": 2.393456220626831, "epoch": 0.29382975638051045, "grad_norm": 0.028780221939086914, "grad_norm_var": 1.2660285636907169e-05, "learning_rate": 0.008194260325274795, "loss": 2.5563, "step": 8105 }, { "crossentropy": 2.7487363815307617, "epoch": 0.29386600928074247, "grad_norm": 0.09712672978639603, "grad_norm_var": 0.0002800129190108788, "learning_rate": 0.008193813255718255, "loss": 2.6981, "step": 8106 }, { "crossentropy": 2.7358217239379883, "epoch": 0.2939022621809745, "grad_norm": 0.03367944434285164, "grad_norm_var": 0.000278222753494711, "learning_rate": 0.008193366143024421, "loss": 2.7463, "step": 8107 }, { "crossentropy": 2.525838851928711, "epoch": 0.2939385150812065, "grad_norm": 0.08364038914442062, "grad_norm_var": 0.00041350504736363433, "learning_rate": 0.008192918987199331, "loss": 2.5685, "step": 8108 }, { "crossentropy": 2.6041412353515625, "epoch": 0.2939747679814385, "grad_norm": 0.12880495190620422, "grad_norm_var": 0.0009144667519695153, "learning_rate": 0.008192471788249025, "loss": 2.6432, "step": 8109 }, { "crossentropy": 2.798085927963257, "epoch": 0.2940110208816705, "grad_norm": 0.07490649074316025, "grad_norm_var": 0.0009584663580472773, "learning_rate": 0.008192024546179542, "loss": 2.7262, "step": 8110 }, { "crossentropy": 2.6315486431121826, "epoch": 0.29404727378190254, "grad_norm": 0.05176475644111633, "grad_norm_var": 0.0009499712407700071, "learning_rate": 0.008191577260996922, "loss": 2.6525, "step": 8111 }, { "crossentropy": 2.5204756259918213, "epoch": 0.29408352668213456, "grad_norm": 0.04510333761572838, "grad_norm_var": 0.0009329936575944549, "learning_rate": 0.00819112993270721, "loss": 2.6375, "step": 8112 }, { "crossentropy": 2.6283376216888428, "epoch": 0.29411977958236657, "grad_norm": 0.03840626776218414, "grad_norm_var": 0.0009252317117167864, "learning_rate": 0.008190682561316444, "loss": 2.6971, "step": 8113 }, { "crossentropy": 2.726553440093994, "epoch": 0.2941560324825986, "grad_norm": 0.04016437008976936, "grad_norm_var": 0.0009065630346672233, "learning_rate": 0.00819023514683067, "loss": 2.698, "step": 8114 }, { "crossentropy": 2.700087070465088, "epoch": 0.29419228538283065, "grad_norm": 0.04067244008183479, "grad_norm_var": 0.0008862028197758599, "learning_rate": 0.008189787689255927, "loss": 2.6916, "step": 8115 }, { "crossentropy": 2.66805362701416, "epoch": 0.29422853828306267, "grad_norm": 0.037342023104429245, "grad_norm_var": 0.0008658770717455058, "learning_rate": 0.008189340188598262, "loss": 2.6832, "step": 8116 }, { "crossentropy": 2.6924667358398438, "epoch": 0.2942647911832947, "grad_norm": 0.03203070908784866, "grad_norm_var": 0.0008629918098036696, "learning_rate": 0.008188892644863718, "loss": 2.6582, "step": 8117 }, { "crossentropy": 2.6829779148101807, "epoch": 0.2943010440835267, "grad_norm": 0.030290259048342705, "grad_norm_var": 0.0008593275850841145, "learning_rate": 0.008188445058058337, "loss": 2.6338, "step": 8118 }, { "crossentropy": 2.7053065299987793, "epoch": 0.2943372969837587, "grad_norm": 0.03298485651612282, "grad_norm_var": 0.0008514720225235391, "learning_rate": 0.008187997428188168, "loss": 2.6056, "step": 8119 }, { "crossentropy": 2.589503049850464, "epoch": 0.2943735498839907, "grad_norm": 0.03854835033416748, "grad_norm_var": 0.0008309332320630123, "learning_rate": 0.008187549755259258, "loss": 2.6863, "step": 8120 }, { "crossentropy": 2.794450521469116, "epoch": 0.29440980278422274, "grad_norm": 0.032956212759017944, "grad_norm_var": 0.0008190162726312771, "learning_rate": 0.008187102039277649, "loss": 2.6811, "step": 8121 }, { "crossentropy": 2.5642216205596924, "epoch": 0.29444605568445475, "grad_norm": 0.029318371787667274, "grad_norm_var": 0.0007020223294624874, "learning_rate": 0.008186654280249391, "loss": 2.7282, "step": 8122 }, { "crossentropy": 2.6423912048339844, "epoch": 0.29448230858468677, "grad_norm": 0.03121403232216835, "grad_norm_var": 0.000707163385585308, "learning_rate": 0.00818620647818053, "loss": 2.6644, "step": 8123 }, { "crossentropy": 2.592181921005249, "epoch": 0.2945185614849188, "grad_norm": 0.02989167906343937, "grad_norm_var": 0.0006323706750798652, "learning_rate": 0.008185758633077116, "loss": 2.6386, "step": 8124 }, { "crossentropy": 2.5927321910858154, "epoch": 0.2945548143851508, "grad_norm": 0.03333219885826111, "grad_norm_var": 0.0001307930513559683, "learning_rate": 0.008185310744945199, "loss": 2.5903, "step": 8125 }, { "crossentropy": 2.6567704677581787, "epoch": 0.2945910672853828, "grad_norm": 0.03020385652780533, "grad_norm_var": 3.9783055339238564e-05, "learning_rate": 0.008184862813790824, "loss": 2.6787, "step": 8126 }, { "crossentropy": 2.7009921073913574, "epoch": 0.2946273201856148, "grad_norm": 0.034654401242733, "grad_norm_var": 2.1862137319708174e-05, "learning_rate": 0.008184414839620045, "loss": 2.6288, "step": 8127 }, { "crossentropy": 2.709979772567749, "epoch": 0.29466357308584684, "grad_norm": 0.0365169458091259, "grad_norm_var": 1.469664330270222e-05, "learning_rate": 0.00818396682243891, "loss": 2.7465, "step": 8128 }, { "crossentropy": 2.548057794570923, "epoch": 0.2946998259860789, "grad_norm": 0.03472515195608139, "grad_norm_var": 1.3519761705608442e-05, "learning_rate": 0.00818351876225347, "loss": 2.6242, "step": 8129 }, { "crossentropy": 2.6745128631591797, "epoch": 0.2947360788863109, "grad_norm": 0.02959497645497322, "grad_norm_var": 1.1889114723189264e-05, "learning_rate": 0.00818307065906978, "loss": 2.6502, "step": 8130 }, { "crossentropy": 2.713571310043335, "epoch": 0.29477233178654294, "grad_norm": 0.02973395586013794, "grad_norm_var": 8.749410176913163e-06, "learning_rate": 0.008182622512893891, "loss": 2.7541, "step": 8131 }, { "crossentropy": 2.60801362991333, "epoch": 0.29480858468677495, "grad_norm": 0.028972527012228966, "grad_norm_var": 7.956876908257748e-06, "learning_rate": 0.008182174323731854, "loss": 2.6409, "step": 8132 }, { "crossentropy": 2.563333034515381, "epoch": 0.29484483758700697, "grad_norm": 0.030248502269387245, "grad_norm_var": 8.192183016694068e-06, "learning_rate": 0.008181726091589722, "loss": 2.6189, "step": 8133 }, { "crossentropy": 2.608417272567749, "epoch": 0.294881090487239, "grad_norm": 0.03203324228525162, "grad_norm_var": 7.967486931548193e-06, "learning_rate": 0.008181277816473553, "loss": 2.5975, "step": 8134 }, { "crossentropy": 2.6266939640045166, "epoch": 0.294917343387471, "grad_norm": 0.02989121340215206, "grad_norm_var": 8.23492928911372e-06, "learning_rate": 0.008180829498389398, "loss": 2.6308, "step": 8135 }, { "crossentropy": 2.7772419452667236, "epoch": 0.294953596287703, "grad_norm": 0.030071299523115158, "grad_norm_var": 5.3131649322946206e-06, "learning_rate": 0.008180381137343316, "loss": 2.737, "step": 8136 }, { "crossentropy": 2.5741477012634277, "epoch": 0.294989849187935, "grad_norm": 0.029051214456558228, "grad_norm_var": 5.487153767129539e-06, "learning_rate": 0.008179932733341357, "loss": 2.6188, "step": 8137 }, { "crossentropy": 2.727402925491333, "epoch": 0.29502610208816704, "grad_norm": 0.03063724935054779, "grad_norm_var": 5.262196794272867e-06, "learning_rate": 0.008179484286389583, "loss": 2.7583, "step": 8138 }, { "crossentropy": 2.612464427947998, "epoch": 0.29506235498839906, "grad_norm": 0.027652379125356674, "grad_norm_var": 6.095039775038067e-06, "learning_rate": 0.008179035796494047, "loss": 2.597, "step": 8139 }, { "crossentropy": 2.6512959003448486, "epoch": 0.29509860788863107, "grad_norm": 0.028635287657380104, "grad_norm_var": 6.392038806739507e-06, "learning_rate": 0.008178587263660808, "loss": 2.6827, "step": 8140 }, { "crossentropy": 2.7255849838256836, "epoch": 0.2951348607888631, "grad_norm": 0.030956516042351723, "grad_norm_var": 6.005135918560782e-06, "learning_rate": 0.008178138687895926, "loss": 2.7067, "step": 8141 }, { "crossentropy": 2.7543435096740723, "epoch": 0.29517111368909515, "grad_norm": 0.03058469295501709, "grad_norm_var": 5.9814582346355645e-06, "learning_rate": 0.008177690069205459, "loss": 2.7332, "step": 8142 }, { "crossentropy": 2.4558541774749756, "epoch": 0.29520736658932717, "grad_norm": 0.02987242490053177, "grad_norm_var": 4.999318323255795e-06, "learning_rate": 0.008177241407595463, "loss": 2.5979, "step": 8143 }, { "crossentropy": 2.559910535812378, "epoch": 0.2952436194895592, "grad_norm": 0.04445705562829971, "grad_norm_var": 1.5231762827447185e-05, "learning_rate": 0.008176792703072, "loss": 2.6114, "step": 8144 }, { "crossentropy": 2.66914701461792, "epoch": 0.2952798723897912, "grad_norm": 0.02787718176841736, "grad_norm_var": 1.4825166537216434e-05, "learning_rate": 0.008176343955641133, "loss": 2.6684, "step": 8145 }, { "crossentropy": 2.7674708366394043, "epoch": 0.2953161252900232, "grad_norm": 0.03497816249728203, "grad_norm_var": 1.5884927453181985e-05, "learning_rate": 0.008175895165308918, "loss": 2.7454, "step": 8146 }, { "crossentropy": 2.541069507598877, "epoch": 0.2953523781902552, "grad_norm": 0.030956678092479706, "grad_norm_var": 1.5775502023669198e-05, "learning_rate": 0.008175446332081419, "loss": 2.5686, "step": 8147 }, { "crossentropy": 2.634915351867676, "epoch": 0.29538863109048724, "grad_norm": 0.047325700521469116, "grad_norm_var": 3.173260855559741e-05, "learning_rate": 0.008174997455964702, "loss": 2.6379, "step": 8148 }, { "crossentropy": 2.7757389545440674, "epoch": 0.29542488399071926, "grad_norm": 0.02997126802802086, "grad_norm_var": 3.1809615035167347e-05, "learning_rate": 0.008174548536964824, "loss": 2.7103, "step": 8149 }, { "crossentropy": 2.6519315242767334, "epoch": 0.29546113689095127, "grad_norm": 0.036911483854055405, "grad_norm_var": 3.3198577329286004e-05, "learning_rate": 0.00817409957508785, "loss": 2.6627, "step": 8150 }, { "crossentropy": 2.4829349517822266, "epoch": 0.2954973897911833, "grad_norm": 0.02919294498860836, "grad_norm_var": 3.34709451081029e-05, "learning_rate": 0.008173650570339844, "loss": 2.5319, "step": 8151 }, { "crossentropy": 2.644376277923584, "epoch": 0.2955336426914153, "grad_norm": 0.03059115633368492, "grad_norm_var": 3.332325456393141e-05, "learning_rate": 0.008173201522726872, "loss": 2.6538, "step": 8152 }, { "crossentropy": 2.649982452392578, "epoch": 0.2955698955916473, "grad_norm": 0.03304457664489746, "grad_norm_var": 3.2495239284180394e-05, "learning_rate": 0.008172752432254998, "loss": 2.7072, "step": 8153 }, { "crossentropy": 2.7141125202178955, "epoch": 0.29560614849187933, "grad_norm": 0.03201557323336601, "grad_norm_var": 3.222978168905346e-05, "learning_rate": 0.008172303298930286, "loss": 2.6588, "step": 8154 }, { "crossentropy": 2.6026766300201416, "epoch": 0.29564240139211134, "grad_norm": 0.030676301568746567, "grad_norm_var": 3.072019933825578e-05, "learning_rate": 0.008171854122758804, "loss": 2.5987, "step": 8155 }, { "crossentropy": 2.707423686981201, "epoch": 0.2956786542923434, "grad_norm": 0.031135378405451775, "grad_norm_var": 2.965491647100488e-05, "learning_rate": 0.00817140490374662, "loss": 2.6733, "step": 8156 }, { "crossentropy": 2.626819610595703, "epoch": 0.2957149071925754, "grad_norm": 0.02924557961523533, "grad_norm_var": 3.0340358409376465e-05, "learning_rate": 0.0081709556418998, "loss": 2.6795, "step": 8157 }, { "crossentropy": 2.6743671894073486, "epoch": 0.29575116009280744, "grad_norm": 0.030736232176423073, "grad_norm_var": 3.0291935910569064e-05, "learning_rate": 0.00817050633722441, "loss": 2.7088, "step": 8158 }, { "crossentropy": 2.7374203205108643, "epoch": 0.29578741299303946, "grad_norm": 0.07829317450523376, "grad_norm_var": 0.00015623701807676582, "learning_rate": 0.008170056989726521, "loss": 2.7178, "step": 8159 }, { "crossentropy": 2.6991636753082275, "epoch": 0.29582366589327147, "grad_norm": 0.030859965831041336, "grad_norm_var": 0.00015261948188873085, "learning_rate": 0.008169607599412203, "loss": 2.6874, "step": 8160 }, { "crossentropy": 2.694920063018799, "epoch": 0.2958599187935035, "grad_norm": 0.029766947031021118, "grad_norm_var": 0.00015098793398940114, "learning_rate": 0.008169158166287525, "loss": 2.6856, "step": 8161 }, { "crossentropy": 2.6308631896972656, "epoch": 0.2958961716937355, "grad_norm": 0.032230451703071594, "grad_norm_var": 0.00015159834605557124, "learning_rate": 0.008168708690358555, "loss": 2.6392, "step": 8162 }, { "crossentropy": 2.73343563079834, "epoch": 0.2959324245939675, "grad_norm": 0.0334114208817482, "grad_norm_var": 0.0001505911652500506, "learning_rate": 0.008168259171631366, "loss": 2.6572, "step": 8163 }, { "crossentropy": 2.77404522895813, "epoch": 0.29596867749419953, "grad_norm": 0.032086800783872604, "grad_norm_var": 0.00014074794020617893, "learning_rate": 0.008167809610112028, "loss": 2.7371, "step": 8164 }, { "crossentropy": 2.639934778213501, "epoch": 0.29600493039443154, "grad_norm": 0.031015222892165184, "grad_norm_var": 0.00014020160986781652, "learning_rate": 0.008167360005806616, "loss": 2.6997, "step": 8165 }, { "crossentropy": 2.634251117706299, "epoch": 0.29604118329466356, "grad_norm": 0.03443378210067749, "grad_norm_var": 0.00013977239412293514, "learning_rate": 0.0081669103587212, "loss": 2.6753, "step": 8166 }, { "crossentropy": 2.62603497505188, "epoch": 0.29607743619489557, "grad_norm": 0.028795290738344193, "grad_norm_var": 0.00014005284241989357, "learning_rate": 0.00816646066886185, "loss": 2.6213, "step": 8167 }, { "crossentropy": 2.5873401165008545, "epoch": 0.2961136890951276, "grad_norm": 0.029767611995339394, "grad_norm_var": 0.00014049931281551913, "learning_rate": 0.008166010936234647, "loss": 2.6241, "step": 8168 }, { "crossentropy": 2.5651655197143555, "epoch": 0.29614994199535966, "grad_norm": 0.02909226529300213, "grad_norm_var": 0.000142094841770117, "learning_rate": 0.00816556116084566, "loss": 2.568, "step": 8169 }, { "crossentropy": 2.818305253982544, "epoch": 0.29618619489559167, "grad_norm": 0.02981496788561344, "grad_norm_var": 0.00014297173471861354, "learning_rate": 0.008165111342700968, "loss": 2.7263, "step": 8170 }, { "crossentropy": 2.728630542755127, "epoch": 0.2962224477958237, "grad_norm": 0.031238606199622154, "grad_norm_var": 0.00014275466974468144, "learning_rate": 0.008164661481806642, "loss": 2.6803, "step": 8171 }, { "crossentropy": 2.645972967147827, "epoch": 0.2962587006960557, "grad_norm": 0.02734924666583538, "grad_norm_var": 0.00014503119603402192, "learning_rate": 0.00816421157816876, "loss": 2.6075, "step": 8172 }, { "crossentropy": 2.602579355239868, "epoch": 0.2962949535962877, "grad_norm": 0.031203513965010643, "grad_norm_var": 0.0001441252633211013, "learning_rate": 0.0081637616317934, "loss": 2.6468, "step": 8173 }, { "crossentropy": 2.6299357414245605, "epoch": 0.29633120649651973, "grad_norm": 0.03220595791935921, "grad_norm_var": 0.00014366851122925387, "learning_rate": 0.008163311642686638, "loss": 2.6983, "step": 8174 }, { "crossentropy": 2.761272430419922, "epoch": 0.29636745939675174, "grad_norm": 0.029022978618741035, "grad_norm_var": 3.413057884452987e-06, "learning_rate": 0.008162861610854551, "loss": 2.7315, "step": 8175 }, { "crossentropy": 2.7016589641571045, "epoch": 0.29640371229698376, "grad_norm": 0.02841096557676792, "grad_norm_var": 3.7580215932053366e-06, "learning_rate": 0.008162411536303218, "loss": 2.7105, "step": 8176 }, { "crossentropy": 2.708282232284546, "epoch": 0.29643996519721577, "grad_norm": 0.03267413005232811, "grad_norm_var": 3.9573815454803e-06, "learning_rate": 0.00816196141903872, "loss": 2.642, "step": 8177 }, { "crossentropy": 2.781827449798584, "epoch": 0.2964762180974478, "grad_norm": 0.03396754339337349, "grad_norm_var": 4.477961892644525e-06, "learning_rate": 0.008161511259067132, "loss": 2.685, "step": 8178 }, { "crossentropy": 2.699392080307007, "epoch": 0.2965124709976798, "grad_norm": 0.03313827887177467, "grad_norm_var": 4.391367080147747e-06, "learning_rate": 0.008161061056394537, "loss": 2.8006, "step": 8179 }, { "crossentropy": 2.7009246349334717, "epoch": 0.2965487238979118, "grad_norm": 0.03409722074866295, "grad_norm_var": 4.965171083107566e-06, "learning_rate": 0.008160610811027017, "loss": 2.6937, "step": 8180 }, { "crossentropy": 2.575516700744629, "epoch": 0.29658497679814383, "grad_norm": 0.03357217088341713, "grad_norm_var": 5.374135594273834e-06, "learning_rate": 0.008160160522970649, "loss": 2.6078, "step": 8181 }, { "crossentropy": 2.6637773513793945, "epoch": 0.29662122969837584, "grad_norm": 0.03169801086187363, "grad_norm_var": 4.652856382795998e-06, "learning_rate": 0.00815971019223152, "loss": 2.6254, "step": 8182 }, { "crossentropy": 2.517275810241699, "epoch": 0.2966574825986079, "grad_norm": 0.031603001058101654, "grad_norm_var": 4.319059858923224e-06, "learning_rate": 0.00815925981881571, "loss": 2.6991, "step": 8183 }, { "crossentropy": 2.594649314880371, "epoch": 0.29669373549883993, "grad_norm": 0.030461838468909264, "grad_norm_var": 4.218582265845018e-06, "learning_rate": 0.008158809402729299, "loss": 2.643, "step": 8184 }, { "crossentropy": 2.6215980052948, "epoch": 0.29672998839907194, "grad_norm": 0.031472593545913696, "grad_norm_var": 3.8968017589790925e-06, "learning_rate": 0.008158358943978376, "loss": 2.6814, "step": 8185 }, { "crossentropy": 2.803173303604126, "epoch": 0.29676624129930396, "grad_norm": 0.031733471900224686, "grad_norm_var": 3.7288885571142943e-06, "learning_rate": 0.008157908442569022, "loss": 2.7375, "step": 8186 }, { "crossentropy": 2.6703875064849854, "epoch": 0.29680249419953597, "grad_norm": 0.03012189269065857, "grad_norm_var": 3.844349111005206e-06, "learning_rate": 0.00815745789850732, "loss": 2.6016, "step": 8187 }, { "crossentropy": 2.6452572345733643, "epoch": 0.296838747099768, "grad_norm": 0.0617409311234951, "grad_norm_var": 5.909824856893953e-05, "learning_rate": 0.00815700731179936, "loss": 2.6816, "step": 8188 }, { "crossentropy": 2.6132473945617676, "epoch": 0.296875, "grad_norm": 0.03093719854950905, "grad_norm_var": 5.9186722193729304e-05, "learning_rate": 0.008156556682451225, "loss": 2.6454, "step": 8189 }, { "crossentropy": 2.6795222759246826, "epoch": 0.296911252900232, "grad_norm": 0.0332348458468914, "grad_norm_var": 5.906800397652855e-05, "learning_rate": 0.008156106010469, "loss": 2.6852, "step": 8190 }, { "crossentropy": 2.653140068054199, "epoch": 0.29694750580046403, "grad_norm": 0.035822201520204544, "grad_norm_var": 5.7791719435947715e-05, "learning_rate": 0.008155655295858776, "loss": 2.664, "step": 8191 }, { "crossentropy": 2.560718059539795, "epoch": 0.29698375870069604, "grad_norm": 0.040540892630815506, "grad_norm_var": 5.7879015189424506e-05, "learning_rate": 0.008155204538626637, "loss": 2.6898, "step": 8192 }, { "crossentropy": 2.461475372314453, "epoch": 0.29702001160092806, "grad_norm": 0.045643150806427, "grad_norm_var": 6.471342048943924e-05, "learning_rate": 0.008154753738778673, "loss": 2.5483, "step": 8193 }, { "crossentropy": 2.752854108810425, "epoch": 0.29705626450116007, "grad_norm": 0.04509621858596802, "grad_norm_var": 7.001442723143882e-05, "learning_rate": 0.008154302896320973, "loss": 2.6634, "step": 8194 }, { "crossentropy": 2.533245801925659, "epoch": 0.2970925174013921, "grad_norm": 0.04098673537373543, "grad_norm_var": 7.054825096342192e-05, "learning_rate": 0.008153852011259623, "loss": 2.6123, "step": 8195 }, { "crossentropy": 2.7060041427612305, "epoch": 0.29712877030162416, "grad_norm": 0.031283535063266754, "grad_norm_var": 7.205613994760438e-05, "learning_rate": 0.008153401083600718, "loss": 2.7174, "step": 8196 }, { "crossentropy": 2.5316498279571533, "epoch": 0.29716502320185617, "grad_norm": 0.029177065938711166, "grad_norm_var": 7.505057043064229e-05, "learning_rate": 0.008152950113350346, "loss": 2.6412, "step": 8197 }, { "crossentropy": 2.69966459274292, "epoch": 0.2972012761020882, "grad_norm": 0.03187503665685654, "grad_norm_var": 7.494279459413695e-05, "learning_rate": 0.008152499100514597, "loss": 2.7081, "step": 8198 }, { "crossentropy": 2.6268563270568848, "epoch": 0.2972375290023202, "grad_norm": 0.03269611671566963, "grad_norm_var": 7.432441701343064e-05, "learning_rate": 0.008152048045099563, "loss": 2.5843, "step": 8199 }, { "crossentropy": 2.767430543899536, "epoch": 0.2972737819025522, "grad_norm": 0.04712434485554695, "grad_norm_var": 7.842540530240879e-05, "learning_rate": 0.008151596947111338, "loss": 2.6578, "step": 8200 }, { "crossentropy": 2.5975537300109863, "epoch": 0.29731003480278423, "grad_norm": 0.030150678008794785, "grad_norm_var": 7.959132490938098e-05, "learning_rate": 0.008151145806556011, "loss": 2.6639, "step": 8201 }, { "crossentropy": 2.5260024070739746, "epoch": 0.29734628770301624, "grad_norm": 0.030143477022647858, "grad_norm_var": 8.09475074288681e-05, "learning_rate": 0.00815069462343968, "loss": 2.649, "step": 8202 }, { "crossentropy": 2.7427031993865967, "epoch": 0.29738254060324826, "grad_norm": 0.03404083847999573, "grad_norm_var": 7.8164012665069e-05, "learning_rate": 0.008150243397768438, "loss": 2.682, "step": 8203 }, { "crossentropy": 2.606865167617798, "epoch": 0.29741879350348027, "grad_norm": 0.030352924019098282, "grad_norm_var": 3.841853604384729e-05, "learning_rate": 0.008149792129548377, "loss": 2.6014, "step": 8204 }, { "crossentropy": 2.620746612548828, "epoch": 0.2974550464037123, "grad_norm": 0.031855519860982895, "grad_norm_var": 3.790410259685017e-05, "learning_rate": 0.008149340818785591, "loss": 2.6038, "step": 8205 }, { "crossentropy": 2.6595001220703125, "epoch": 0.2974912993039443, "grad_norm": 0.035961657762527466, "grad_norm_var": 3.749928558828917e-05, "learning_rate": 0.00814888946548618, "loss": 2.7402, "step": 8206 }, { "crossentropy": 2.550584316253662, "epoch": 0.2975275522041763, "grad_norm": 0.03400866687297821, "grad_norm_var": 3.769872420737637e-05, "learning_rate": 0.008148438069656238, "loss": 2.6097, "step": 8207 }, { "crossentropy": 2.6440646648406982, "epoch": 0.29756380510440833, "grad_norm": 0.030163956806063652, "grad_norm_var": 3.770820139771638e-05, "learning_rate": 0.008147986631301863, "loss": 2.564, "step": 8208 }, { "crossentropy": 2.739474058151245, "epoch": 0.29760005800464034, "grad_norm": 0.030172713100910187, "grad_norm_var": 3.078489596098469e-05, "learning_rate": 0.008147535150429151, "loss": 2.6427, "step": 8209 }, { "crossentropy": 2.5600876808166504, "epoch": 0.2976363109048724, "grad_norm": 0.03026288002729416, "grad_norm_var": 2.27254520073153e-05, "learning_rate": 0.008147083627044199, "loss": 2.5327, "step": 8210 }, { "crossentropy": 2.6987900733947754, "epoch": 0.29767256380510443, "grad_norm": 0.03818948566913605, "grad_norm_var": 2.0288295832302756e-05, "learning_rate": 0.008146632061153108, "loss": 2.6717, "step": 8211 }, { "crossentropy": 2.637293815612793, "epoch": 0.29770881670533644, "grad_norm": 0.043689239770174026, "grad_norm_var": 2.712388559071321e-05, "learning_rate": 0.008146180452761975, "loss": 2.6711, "step": 8212 }, { "crossentropy": 2.665353775024414, "epoch": 0.29774506960556846, "grad_norm": 0.034026794135570526, "grad_norm_var": 2.564235081005879e-05, "learning_rate": 0.0081457288018769, "loss": 2.6856, "step": 8213 }, { "crossentropy": 2.659986972808838, "epoch": 0.29778132250580047, "grad_norm": 0.031204581260681152, "grad_norm_var": 2.5864395349837207e-05, "learning_rate": 0.008145277108503983, "loss": 2.5936, "step": 8214 }, { "crossentropy": 2.6872754096984863, "epoch": 0.2978175754060325, "grad_norm": 0.029381806030869484, "grad_norm_var": 2.7128344657904347e-05, "learning_rate": 0.008144825372649328, "loss": 2.6855, "step": 8215 }, { "crossentropy": 2.575634002685547, "epoch": 0.2978538283062645, "grad_norm": 0.030604351311922073, "grad_norm_var": 1.4826455798123452e-05, "learning_rate": 0.008144373594319033, "loss": 2.6296, "step": 8216 }, { "crossentropy": 2.4163568019866943, "epoch": 0.2978900812064965, "grad_norm": 0.03255892172455788, "grad_norm_var": 1.4350087247106715e-05, "learning_rate": 0.0081439217735192, "loss": 2.5351, "step": 8217 }, { "crossentropy": 2.787177801132202, "epoch": 0.29792633410672853, "grad_norm": 0.0315730981528759, "grad_norm_var": 1.3949793056437555e-05, "learning_rate": 0.008143469910255934, "loss": 2.7323, "step": 8218 }, { "crossentropy": 2.744993209838867, "epoch": 0.29796258700696054, "grad_norm": 0.044088367372751236, "grad_norm_var": 2.164975428309701e-05, "learning_rate": 0.008143018004535334, "loss": 2.6956, "step": 8219 }, { "crossentropy": 2.5730183124542236, "epoch": 0.29799883990719256, "grad_norm": 0.031351350247859955, "grad_norm_var": 2.127567739868691e-05, "learning_rate": 0.00814256605636351, "loss": 2.6985, "step": 8220 }, { "crossentropy": 2.653553009033203, "epoch": 0.2980350928074246, "grad_norm": 0.03191037103533745, "grad_norm_var": 2.126242458315025e-05, "learning_rate": 0.008142114065746558, "loss": 2.6407, "step": 8221 }, { "crossentropy": 2.466463327407837, "epoch": 0.2980713457076566, "grad_norm": 0.031126422807574272, "grad_norm_var": 2.1263471381024545e-05, "learning_rate": 0.00814166203269059, "loss": 2.5115, "step": 8222 }, { "crossentropy": 2.633234739303589, "epoch": 0.29810759860788866, "grad_norm": 0.029921315610408783, "grad_norm_var": 2.1972949413426182e-05, "learning_rate": 0.008141209957201707, "loss": 2.6506, "step": 8223 }, { "crossentropy": 2.737377405166626, "epoch": 0.29814385150812067, "grad_norm": 0.029639797285199165, "grad_norm_var": 2.21980477216241e-05, "learning_rate": 0.008140757839286018, "loss": 2.7135, "step": 8224 }, { "crossentropy": 2.6155638694763184, "epoch": 0.2981801044083527, "grad_norm": 0.029687032103538513, "grad_norm_var": 2.2402765068478764e-05, "learning_rate": 0.008140305678949628, "loss": 2.633, "step": 8225 }, { "crossentropy": 2.6287176609039307, "epoch": 0.2982163573085847, "grad_norm": 0.02893654815852642, "grad_norm_var": 2.30101943722238e-05, "learning_rate": 0.008139853476198644, "loss": 2.5377, "step": 8226 }, { "crossentropy": 2.5957231521606445, "epoch": 0.2982526102088167, "grad_norm": 0.02877652645111084, "grad_norm_var": 2.2026140562135762e-05, "learning_rate": 0.008139401231039173, "loss": 2.6669, "step": 8227 }, { "crossentropy": 2.7222671508789062, "epoch": 0.29828886310904873, "grad_norm": 0.030710715800523758, "grad_norm_var": 1.3026357991101588e-05, "learning_rate": 0.008138948943477324, "loss": 2.6561, "step": 8228 }, { "crossentropy": 2.6530025005340576, "epoch": 0.29832511600928074, "grad_norm": 0.03442665934562683, "grad_norm_var": 1.3166076546107981e-05, "learning_rate": 0.008138496613519208, "loss": 2.6516, "step": 8229 }, { "crossentropy": 2.495460271835327, "epoch": 0.29836136890951276, "grad_norm": 0.034479279071092606, "grad_norm_var": 1.365552568081679e-05, "learning_rate": 0.008138044241170931, "loss": 2.4819, "step": 8230 }, { "crossentropy": 2.79414701461792, "epoch": 0.2983976218097448, "grad_norm": 0.02975313924252987, "grad_norm_var": 1.3543263398272757e-05, "learning_rate": 0.008137591826438605, "loss": 2.7018, "step": 8231 }, { "crossentropy": 2.7230234146118164, "epoch": 0.2984338747099768, "grad_norm": 0.028954070061445236, "grad_norm_var": 1.3986795519084725e-05, "learning_rate": 0.008137139369328339, "loss": 2.7035, "step": 8232 }, { "crossentropy": 2.6355791091918945, "epoch": 0.2984701276102088, "grad_norm": 0.030139781534671783, "grad_norm_var": 1.4089496452908817e-05, "learning_rate": 0.008136686869846246, "loss": 2.6164, "step": 8233 }, { "crossentropy": 2.6934614181518555, "epoch": 0.2985063805104408, "grad_norm": 0.03125578910112381, "grad_norm_var": 1.4096595507110538e-05, "learning_rate": 0.008136234327998435, "loss": 2.6771, "step": 8234 }, { "crossentropy": 2.763117551803589, "epoch": 0.29854263341067283, "grad_norm": 0.03599875792860985, "grad_norm_var": 4.686718528770784e-06, "learning_rate": 0.008135781743791022, "loss": 2.7962, "step": 8235 }, { "crossentropy": 2.7232961654663086, "epoch": 0.29857888631090485, "grad_norm": 0.030015649273991585, "grad_norm_var": 4.747534208746638e-06, "learning_rate": 0.008135329117230116, "loss": 2.7257, "step": 8236 }, { "crossentropy": 2.681226968765259, "epoch": 0.2986151392111369, "grad_norm": 0.02936459518969059, "grad_norm_var": 4.8378930662481645e-06, "learning_rate": 0.008134876448321835, "loss": 2.6624, "step": 8237 }, { "crossentropy": 2.57769775390625, "epoch": 0.29865139211136893, "grad_norm": 0.03115847148001194, "grad_norm_var": 4.839249005784916e-06, "learning_rate": 0.008134423737072287, "loss": 2.5547, "step": 8238 }, { "crossentropy": 2.571120262145996, "epoch": 0.29868764501160094, "grad_norm": 0.032631490379571915, "grad_norm_var": 4.971352700925321e-06, "learning_rate": 0.008133970983487592, "loss": 2.6383, "step": 8239 }, { "crossentropy": 2.5677125453948975, "epoch": 0.29872389791183296, "grad_norm": 0.031079959124326706, "grad_norm_var": 4.8406540289692155e-06, "learning_rate": 0.008133518187573862, "loss": 2.5999, "step": 8240 }, { "crossentropy": 2.7039129734039307, "epoch": 0.298760150812065, "grad_norm": 0.035669442266225815, "grad_norm_var": 5.9619633702326915e-06, "learning_rate": 0.008133065349337214, "loss": 2.7251, "step": 8241 }, { "crossentropy": 2.612240791320801, "epoch": 0.298796403712297, "grad_norm": 0.034369248896837234, "grad_norm_var": 5.979128516304423e-06, "learning_rate": 0.008132612468783764, "loss": 2.6351, "step": 8242 }, { "crossentropy": 2.693504810333252, "epoch": 0.298832656612529, "grad_norm": 0.031013507395982742, "grad_norm_var": 5.390396239064269e-06, "learning_rate": 0.008132159545919629, "loss": 2.6459, "step": 8243 }, { "crossentropy": 2.773855447769165, "epoch": 0.298868909512761, "grad_norm": 0.03007804974913597, "grad_norm_var": 5.519007223083137e-06, "learning_rate": 0.008131706580750924, "loss": 2.6855, "step": 8244 }, { "crossentropy": 2.6133596897125244, "epoch": 0.29890516241299303, "grad_norm": 0.02831045724451542, "grad_norm_var": 5.795910947488135e-06, "learning_rate": 0.008131253573283771, "loss": 2.6081, "step": 8245 }, { "crossentropy": 2.6631031036376953, "epoch": 0.29894141531322505, "grad_norm": 0.030272524803876877, "grad_norm_var": 5.240404832961111e-06, "learning_rate": 0.008130800523524287, "loss": 2.7117, "step": 8246 }, { "crossentropy": 2.71179461479187, "epoch": 0.29897766821345706, "grad_norm": 0.028642337769269943, "grad_norm_var": 5.539818748971174e-06, "learning_rate": 0.00813034743147859, "loss": 2.7897, "step": 8247 }, { "crossentropy": 2.637533187866211, "epoch": 0.2990139211136891, "grad_norm": 0.02922767400741577, "grad_norm_var": 5.463125324786597e-06, "learning_rate": 0.0081298942971528, "loss": 2.6348, "step": 8248 }, { "crossentropy": 2.599576234817505, "epoch": 0.2990501740139211, "grad_norm": 0.028356119990348816, "grad_norm_var": 5.914520909485778e-06, "learning_rate": 0.008129441120553037, "loss": 2.6189, "step": 8249 }, { "crossentropy": 2.5987725257873535, "epoch": 0.29908642691415316, "grad_norm": 0.028281882405281067, "grad_norm_var": 6.401640610843804e-06, "learning_rate": 0.008128987901685425, "loss": 2.5854, "step": 8250 }, { "crossentropy": 2.678375244140625, "epoch": 0.2991226798143852, "grad_norm": 0.029687725007534027, "grad_norm_var": 4.604194691143482e-06, "learning_rate": 0.008128534640556082, "loss": 2.6338, "step": 8251 }, { "crossentropy": 2.633485794067383, "epoch": 0.2991589327146172, "grad_norm": 0.028421731665730476, "grad_norm_var": 4.868029612224249e-06, "learning_rate": 0.00812808133717113, "loss": 2.6148, "step": 8252 }, { "crossentropy": 2.7755837440490723, "epoch": 0.2991951856148492, "grad_norm": 0.03210983797907829, "grad_norm_var": 4.956280625158224e-06, "learning_rate": 0.008127627991536693, "loss": 2.707, "step": 8253 }, { "crossentropy": 2.5800466537475586, "epoch": 0.2992314385150812, "grad_norm": 0.029101155698299408, "grad_norm_var": 5.062657306737402e-06, "learning_rate": 0.008127174603658896, "loss": 2.6417, "step": 8254 }, { "crossentropy": 2.5931155681610107, "epoch": 0.29926769141531323, "grad_norm": 0.029347939416766167, "grad_norm_var": 4.782896863840174e-06, "learning_rate": 0.008126721173543856, "loss": 2.6473, "step": 8255 }, { "crossentropy": 2.7003836631774902, "epoch": 0.29930394431554525, "grad_norm": 0.029416529461741447, "grad_norm_var": 4.771335610452508e-06, "learning_rate": 0.008126267701197705, "loss": 2.7016, "step": 8256 }, { "crossentropy": 2.553828477859497, "epoch": 0.29934019721577726, "grad_norm": 0.029000774025917053, "grad_norm_var": 2.637923014271274e-06, "learning_rate": 0.008125814186626563, "loss": 2.6784, "step": 8257 }, { "crossentropy": 2.6486756801605225, "epoch": 0.2993764501160093, "grad_norm": 0.02833421900868416, "grad_norm_var": 1.1790670583960993e-06, "learning_rate": 0.008125360629836556, "loss": 2.6512, "step": 8258 }, { "crossentropy": 2.74200177192688, "epoch": 0.2994127030162413, "grad_norm": 0.030791301280260086, "grad_norm_var": 1.132872060820763e-06, "learning_rate": 0.008124907030833812, "loss": 2.7209, "step": 8259 }, { "crossentropy": 2.739022731781006, "epoch": 0.2994489559164733, "grad_norm": 0.030333729460835457, "grad_norm_var": 1.1622456845361648e-06, "learning_rate": 0.008124453389624456, "loss": 2.6479, "step": 8260 }, { "crossentropy": 2.5417778491973877, "epoch": 0.2994852088167053, "grad_norm": 0.028479693457484245, "grad_norm_var": 1.1405279514811827e-06, "learning_rate": 0.008123999706214616, "loss": 2.5822, "step": 8261 }, { "crossentropy": 2.7355194091796875, "epoch": 0.29952146171693733, "grad_norm": 0.029505977407097816, "grad_norm_var": 1.0842754076860473e-06, "learning_rate": 0.008123545980610419, "loss": 2.6715, "step": 8262 }, { "crossentropy": 2.7628285884857178, "epoch": 0.29955771461716935, "grad_norm": 0.030160419642925262, "grad_norm_var": 1.0921743712305823e-06, "learning_rate": 0.008123092212817993, "loss": 2.6943, "step": 8263 }, { "crossentropy": 2.7469441890716553, "epoch": 0.2995939675174014, "grad_norm": 0.029626933857798576, "grad_norm_var": 1.0924422848415977e-06, "learning_rate": 0.008122638402843467, "loss": 2.7318, "step": 8264 }, { "crossentropy": 2.5981836318969727, "epoch": 0.29963022041763343, "grad_norm": 0.028185997158288956, "grad_norm_var": 1.1187177153679778e-06, "learning_rate": 0.00812218455069297, "loss": 2.6055, "step": 8265 }, { "crossentropy": 2.7157602310180664, "epoch": 0.29966647331786544, "grad_norm": 0.027610400691628456, "grad_norm_var": 1.2491633415813904e-06, "learning_rate": 0.008121730656372634, "loss": 2.696, "step": 8266 }, { "crossentropy": 2.5933752059936523, "epoch": 0.29970272621809746, "grad_norm": 0.028764870017766953, "grad_norm_var": 1.2647917179729625e-06, "learning_rate": 0.008121276719888588, "loss": 2.5876, "step": 8267 }, { "crossentropy": 2.6989898681640625, "epoch": 0.2997389791183295, "grad_norm": 0.03093673102557659, "grad_norm_var": 1.357400106274942e-06, "learning_rate": 0.008120822741246963, "loss": 2.6682, "step": 8268 }, { "crossentropy": 2.7625937461853027, "epoch": 0.2997752320185615, "grad_norm": 0.030876675620675087, "grad_norm_var": 1.0203133218841202e-06, "learning_rate": 0.00812036872045389, "loss": 2.7117, "step": 8269 }, { "crossentropy": 2.613147258758545, "epoch": 0.2998114849187935, "grad_norm": 0.033186133950948715, "grad_norm_var": 1.8979872678346095e-06, "learning_rate": 0.008119914657515503, "loss": 2.6176, "step": 8270 }, { "crossentropy": 2.6793153285980225, "epoch": 0.2998477378190255, "grad_norm": 0.03324888274073601, "grad_norm_var": 2.6868158756280605e-06, "learning_rate": 0.008119460552437934, "loss": 2.6127, "step": 8271 }, { "crossentropy": 2.643019437789917, "epoch": 0.29988399071925753, "grad_norm": 0.031540002673864365, "grad_norm_var": 2.8307033348211187e-06, "learning_rate": 0.008119006405227317, "loss": 2.6212, "step": 8272 }, { "crossentropy": 2.508010149002075, "epoch": 0.29992024361948955, "grad_norm": 0.034383222460746765, "grad_norm_var": 3.8981328205879426e-06, "learning_rate": 0.008118552215889783, "loss": 2.6103, "step": 8273 }, { "crossentropy": 2.5870697498321533, "epoch": 0.29995649651972156, "grad_norm": 0.03150850161910057, "grad_norm_var": 3.6650725807118307e-06, "learning_rate": 0.00811809798443147, "loss": 2.5873, "step": 8274 }, { "crossentropy": 2.542097568511963, "epoch": 0.2999927494199536, "grad_norm": 0.02815288119018078, "grad_norm_var": 4.022728085004146e-06, "learning_rate": 0.008117643710858512, "loss": 2.5641, "step": 8275 }, { "crossentropy": 2.6957621574401855, "epoch": 0.3000290023201856, "grad_norm": 0.02968517132103443, "grad_norm_var": 4.055294176227853e-06, "learning_rate": 0.008117189395177042, "loss": 2.7223, "step": 8276 }, { "crossentropy": 2.6666173934936523, "epoch": 0.30006525522041766, "grad_norm": 0.029719289392232895, "grad_norm_var": 3.8396000416394354e-06, "learning_rate": 0.008116735037393201, "loss": 2.7049, "step": 8277 }, { "crossentropy": 2.5929512977600098, "epoch": 0.3001015081206497, "grad_norm": 0.028947437182068825, "grad_norm_var": 3.928899005727751e-06, "learning_rate": 0.008116280637513124, "loss": 2.6518, "step": 8278 }, { "crossentropy": 2.7870755195617676, "epoch": 0.3001377610208817, "grad_norm": 0.028775937855243683, "grad_norm_var": 4.0944651407956304e-06, "learning_rate": 0.008115826195542948, "loss": 2.7239, "step": 8279 }, { "crossentropy": 2.804398775100708, "epoch": 0.3001740139211137, "grad_norm": 0.03472097963094711, "grad_norm_var": 5.2443278034925345e-06, "learning_rate": 0.00811537171148881, "loss": 2.7486, "step": 8280 }, { "crossentropy": 2.6311864852905273, "epoch": 0.3002102668213457, "grad_norm": 0.03324926644563675, "grad_norm_var": 5.189786252243698e-06, "learning_rate": 0.008114917185356849, "loss": 2.5849, "step": 8281 }, { "crossentropy": 2.7386369705200195, "epoch": 0.30024651972157773, "grad_norm": 0.03912019357085228, "grad_norm_var": 8.334211200419526e-06, "learning_rate": 0.008114462617153205, "loss": 2.6983, "step": 8282 }, { "crossentropy": 2.6466715335845947, "epoch": 0.30028277262180975, "grad_norm": 0.036566879600286484, "grad_norm_var": 9.110304105161315e-06, "learning_rate": 0.008114008006884017, "loss": 2.6874, "step": 8283 }, { "crossentropy": 2.6343886852264404, "epoch": 0.30031902552204176, "grad_norm": 0.03097735159099102, "grad_norm_var": 9.10376221874124e-06, "learning_rate": 0.008113553354555426, "loss": 2.6708, "step": 8284 }, { "crossentropy": 2.563034772872925, "epoch": 0.3003552784222738, "grad_norm": 0.02839057333767414, "grad_norm_var": 9.917500042442767e-06, "learning_rate": 0.00811309866017357, "loss": 2.5648, "step": 8285 }, { "crossentropy": 2.6044907569885254, "epoch": 0.3003915313225058, "grad_norm": 0.03171096742153168, "grad_norm_var": 9.822331029942519e-06, "learning_rate": 0.008112643923744592, "loss": 2.6096, "step": 8286 }, { "crossentropy": 2.602616548538208, "epoch": 0.3004277842227378, "grad_norm": 0.03241611272096634, "grad_norm_var": 9.717965441115176e-06, "learning_rate": 0.008112189145274635, "loss": 2.636, "step": 8287 }, { "crossentropy": 2.744154214859009, "epoch": 0.3004640371229698, "grad_norm": 0.03218873217701912, "grad_norm_var": 9.71602328576856e-06, "learning_rate": 0.008111734324769842, "loss": 2.6309, "step": 8288 }, { "crossentropy": 2.6164023876190186, "epoch": 0.30050029002320183, "grad_norm": 0.03158995136618614, "grad_norm_var": 9.281471107056118e-06, "learning_rate": 0.008111279462236354, "loss": 2.6399, "step": 8289 }, { "crossentropy": 2.50460147857666, "epoch": 0.30053654292343385, "grad_norm": 0.02953631989657879, "grad_norm_var": 9.583470693289888e-06, "learning_rate": 0.008110824557680314, "loss": 2.5355, "step": 8290 }, { "crossentropy": 2.7802774906158447, "epoch": 0.3005727958236659, "grad_norm": 0.02980908565223217, "grad_norm_var": 8.991647943243306e-06, "learning_rate": 0.008110369611107868, "loss": 2.7319, "step": 8291 }, { "crossentropy": 2.673978567123413, "epoch": 0.30060904872389793, "grad_norm": 0.033163003623485565, "grad_norm_var": 8.80738758391382e-06, "learning_rate": 0.008109914622525163, "loss": 2.6812, "step": 8292 }, { "crossentropy": 2.7420496940612793, "epoch": 0.30064530162412995, "grad_norm": 0.030534859746694565, "grad_norm_var": 8.608546953905296e-06, "learning_rate": 0.008109459591938338, "loss": 2.7324, "step": 8293 }, { "crossentropy": 2.639456272125244, "epoch": 0.30068155452436196, "grad_norm": 0.0305577851831913, "grad_norm_var": 8.119255503995168e-06, "learning_rate": 0.008109004519353544, "loss": 2.6448, "step": 8294 }, { "crossentropy": 2.6549324989318848, "epoch": 0.300717807424594, "grad_norm": 0.03364946320652962, "grad_norm_var": 7.455580689802223e-06, "learning_rate": 0.008108549404776925, "loss": 2.6172, "step": 8295 }, { "crossentropy": 2.662062644958496, "epoch": 0.300754060324826, "grad_norm": 0.043023403733968735, "grad_norm_var": 1.4348137617099278e-05, "learning_rate": 0.00810809424821463, "loss": 2.6646, "step": 8296 }, { "crossentropy": 2.5125701427459717, "epoch": 0.300790313225058, "grad_norm": 0.03526162728667259, "grad_norm_var": 1.4693542923935322e-05, "learning_rate": 0.008107639049672806, "loss": 2.5437, "step": 8297 }, { "crossentropy": 2.8283252716064453, "epoch": 0.30082656612529, "grad_norm": 0.03195680305361748, "grad_norm_var": 1.2084793803211295e-05, "learning_rate": 0.008107183809157599, "loss": 2.7484, "step": 8298 }, { "crossentropy": 2.61997389793396, "epoch": 0.30086281902552203, "grad_norm": 0.03299889713525772, "grad_norm_var": 1.0985341250402807e-05, "learning_rate": 0.00810672852667516, "loss": 2.6043, "step": 8299 }, { "crossentropy": 2.685049057006836, "epoch": 0.30089907192575405, "grad_norm": 0.030369265004992485, "grad_norm_var": 1.1120579509864223e-05, "learning_rate": 0.008106273202231638, "loss": 2.7383, "step": 8300 }, { "crossentropy": 2.614877939224243, "epoch": 0.30093532482598606, "grad_norm": 0.028957299888134003, "grad_norm_var": 1.0843557772425187e-05, "learning_rate": 0.008105817835833182, "loss": 2.5936, "step": 8301 }, { "crossentropy": 2.7733089923858643, "epoch": 0.3009715777262181, "grad_norm": 0.03369676321744919, "grad_norm_var": 1.091877591299927e-05, "learning_rate": 0.008105362427485942, "loss": 2.6845, "step": 8302 }, { "crossentropy": 2.657763719558716, "epoch": 0.3010078306264501, "grad_norm": 0.036698147654533386, "grad_norm_var": 1.2027241055234427e-05, "learning_rate": 0.00810490697719607, "loss": 2.6732, "step": 8303 }, { "crossentropy": 2.7256908416748047, "epoch": 0.30104408352668216, "grad_norm": 0.034664444625377655, "grad_norm_var": 1.2225218629596903e-05, "learning_rate": 0.008104451484969721, "loss": 2.6706, "step": 8304 }, { "crossentropy": 2.6181583404541016, "epoch": 0.3010803364269142, "grad_norm": 0.030319033190608025, "grad_norm_var": 1.2548876845085387e-05, "learning_rate": 0.00810399595081304, "loss": 2.5594, "step": 8305 }, { "crossentropy": 2.632587432861328, "epoch": 0.3011165893271462, "grad_norm": 0.03325828164815903, "grad_norm_var": 1.1782761556026093e-05, "learning_rate": 0.008103540374732184, "loss": 2.6321, "step": 8306 }, { "crossentropy": 2.7667908668518066, "epoch": 0.3011528422273782, "grad_norm": 0.03366359323263168, "grad_norm_var": 1.1041925630707733e-05, "learning_rate": 0.008103084756733306, "loss": 2.7232, "step": 8307 }, { "crossentropy": 2.590538740158081, "epoch": 0.3011890951276102, "grad_norm": 0.03765123710036278, "grad_norm_var": 1.2219979859845927e-05, "learning_rate": 0.008102629096822559, "loss": 2.5992, "step": 8308 }, { "crossentropy": 2.567789077758789, "epoch": 0.30122534802784223, "grad_norm": 0.037414032965898514, "grad_norm_var": 1.238569055421861e-05, "learning_rate": 0.008102173395006096, "loss": 2.632, "step": 8309 }, { "crossentropy": 2.7276721000671387, "epoch": 0.30126160092807425, "grad_norm": 0.034385889768600464, "grad_norm_var": 1.1540166541447403e-05, "learning_rate": 0.008101717651290075, "loss": 2.6477, "step": 8310 }, { "crossentropy": 2.706470251083374, "epoch": 0.30129785382830626, "grad_norm": 0.03810574486851692, "grad_norm_var": 1.2425679508733187e-05, "learning_rate": 0.00810126186568065, "loss": 2.6883, "step": 8311 }, { "crossentropy": 2.5094668865203857, "epoch": 0.3013341067285383, "grad_norm": 0.03433041274547577, "grad_norm_var": 7.300251240559969e-06, "learning_rate": 0.008100806038183977, "loss": 2.6011, "step": 8312 }, { "crossentropy": 2.67641019821167, "epoch": 0.3013703596287703, "grad_norm": 0.030780306085944176, "grad_norm_var": 7.791528991050071e-06, "learning_rate": 0.008100350168806215, "loss": 2.6854, "step": 8313 }, { "crossentropy": 2.6718368530273438, "epoch": 0.3014066125290023, "grad_norm": 0.03242555633187294, "grad_norm_var": 7.696115613845773e-06, "learning_rate": 0.008099894257553515, "loss": 2.6405, "step": 8314 }, { "crossentropy": 2.5931742191314697, "epoch": 0.3014428654292343, "grad_norm": 0.031084448099136353, "grad_norm_var": 8.112427219365591e-06, "learning_rate": 0.00809943830443204, "loss": 2.6713, "step": 8315 }, { "crossentropy": 2.7323660850524902, "epoch": 0.30147911832946633, "grad_norm": 0.030673004686832428, "grad_norm_var": 7.98683549615934e-06, "learning_rate": 0.00809898230944795, "loss": 2.6998, "step": 8316 }, { "crossentropy": 2.6407663822174072, "epoch": 0.3015153712296984, "grad_norm": 0.030366582795977592, "grad_norm_var": 7.232613401527777e-06, "learning_rate": 0.008098526272607398, "loss": 2.6654, "step": 8317 }, { "crossentropy": 2.6749460697174072, "epoch": 0.3015516241299304, "grad_norm": 0.03193299472332001, "grad_norm_var": 7.432470878247646e-06, "learning_rate": 0.008098070193916548, "loss": 2.6676, "step": 8318 }, { "crossentropy": 2.7212793827056885, "epoch": 0.30158787703016243, "grad_norm": 0.03002450242638588, "grad_norm_var": 7.467823805426298e-06, "learning_rate": 0.008097614073381559, "loss": 2.706, "step": 8319 }, { "crossentropy": 2.5567309856414795, "epoch": 0.30162412993039445, "grad_norm": 0.02954249083995819, "grad_norm_var": 8.102246274211188e-06, "learning_rate": 0.008097157911008589, "loss": 2.5703, "step": 8320 }, { "crossentropy": 2.5603370666503906, "epoch": 0.30166038283062646, "grad_norm": 0.03035326674580574, "grad_norm_var": 8.090664826049197e-06, "learning_rate": 0.008096701706803803, "loss": 2.6864, "step": 8321 }, { "crossentropy": 2.754567861557007, "epoch": 0.3016966357308585, "grad_norm": 0.030174871906638145, "grad_norm_var": 8.52710635997748e-06, "learning_rate": 0.00809624546077336, "loss": 2.7994, "step": 8322 }, { "crossentropy": 2.5082430839538574, "epoch": 0.3017328886310905, "grad_norm": 0.02819725312292576, "grad_norm_var": 9.679091731263205e-06, "learning_rate": 0.008095789172923425, "loss": 2.5654, "step": 8323 }, { "crossentropy": 2.770901918411255, "epoch": 0.3017691415313225, "grad_norm": 0.02799513377249241, "grad_norm_var": 8.668707380440306e-06, "learning_rate": 0.008095332843260156, "loss": 2.7187, "step": 8324 }, { "crossentropy": 2.586885452270508, "epoch": 0.3018053944315545, "grad_norm": 0.028056547045707703, "grad_norm_var": 7.0579026450098925e-06, "learning_rate": 0.008094876471789723, "loss": 2.6824, "step": 8325 }, { "crossentropy": 2.6840741634368896, "epoch": 0.30184164733178653, "grad_norm": 0.030739305540919304, "grad_norm_var": 6.316556671554357e-06, "learning_rate": 0.008094420058518285, "loss": 2.7055, "step": 8326 }, { "crossentropy": 2.6266911029815674, "epoch": 0.30187790023201855, "grad_norm": 0.03313029929995537, "grad_norm_var": 3.0993649723554463e-06, "learning_rate": 0.008093963603452008, "loss": 2.6615, "step": 8327 }, { "crossentropy": 2.952948808670044, "epoch": 0.30191415313225056, "grad_norm": 0.03837628290057182, "grad_norm_var": 6.1278219775438845e-06, "learning_rate": 0.008093507106597057, "loss": 2.823, "step": 8328 }, { "crossentropy": 2.616323471069336, "epoch": 0.3019504060324826, "grad_norm": 0.0340774767100811, "grad_norm_var": 6.769694017910244e-06, "learning_rate": 0.008093050567959599, "loss": 2.5801, "step": 8329 }, { "crossentropy": 2.7126216888427734, "epoch": 0.3019866589327146, "grad_norm": 0.035507895052433014, "grad_norm_var": 7.919828249770356e-06, "learning_rate": 0.0080925939875458, "loss": 2.7067, "step": 8330 }, { "crossentropy": 2.6290507316589355, "epoch": 0.30202291183294666, "grad_norm": 0.040248654782772064, "grad_norm_var": 1.294871446889284e-05, "learning_rate": 0.008092137365361825, "loss": 2.6849, "step": 8331 }, { "crossentropy": 2.5483508110046387, "epoch": 0.3020591647331787, "grad_norm": 0.03815457969903946, "grad_norm_var": 1.5285667047234052e-05, "learning_rate": 0.008091680701413842, "loss": 2.566, "step": 8332 }, { "crossentropy": 2.556586265563965, "epoch": 0.3020954176334107, "grad_norm": 0.032547831535339355, "grad_norm_var": 1.5019310272404382e-05, "learning_rate": 0.00809122399570802, "loss": 2.6656, "step": 8333 }, { "crossentropy": 2.5227315425872803, "epoch": 0.3021316705336427, "grad_norm": 0.02982921712100506, "grad_norm_var": 1.543848449805014e-05, "learning_rate": 0.008090767248250526, "loss": 2.6172, "step": 8334 }, { "crossentropy": 2.636559009552002, "epoch": 0.3021679234338747, "grad_norm": 0.02941511757671833, "grad_norm_var": 1.5647371244926813e-05, "learning_rate": 0.00809031045904753, "loss": 2.5993, "step": 8335 }, { "crossentropy": 2.371995687484741, "epoch": 0.30220417633410673, "grad_norm": 0.028019921854138374, "grad_norm_var": 1.6346301939793825e-05, "learning_rate": 0.0080898536281052, "loss": 2.4625, "step": 8336 }, { "crossentropy": 2.6462109088897705, "epoch": 0.30224042923433875, "grad_norm": 0.029833076521754265, "grad_norm_var": 1.6489669891625997e-05, "learning_rate": 0.008089396755429712, "loss": 2.6027, "step": 8337 }, { "crossentropy": 2.7776153087615967, "epoch": 0.30227668213457076, "grad_norm": 0.02911388874053955, "grad_norm_var": 1.6838582054261526e-05, "learning_rate": 0.00808893984102723, "loss": 2.6663, "step": 8338 }, { "crossentropy": 2.700511932373047, "epoch": 0.3023129350348028, "grad_norm": 0.03039485402405262, "grad_norm_var": 1.600341268352041e-05, "learning_rate": 0.008088482884903929, "loss": 2.5615, "step": 8339 }, { "crossentropy": 2.627152442932129, "epoch": 0.3023491879350348, "grad_norm": 0.03256240114569664, "grad_norm_var": 1.4737387614086281e-05, "learning_rate": 0.008088025887065977, "loss": 2.6283, "step": 8340 }, { "crossentropy": 2.5676767826080322, "epoch": 0.3023854408352668, "grad_norm": 0.03274862468242645, "grad_norm_var": 1.3333204611324043e-05, "learning_rate": 0.008087568847519551, "loss": 2.594, "step": 8341 }, { "crossentropy": 2.780529499053955, "epoch": 0.3024216937354988, "grad_norm": 0.03764575719833374, "grad_norm_var": 1.4422573556257496e-05, "learning_rate": 0.008087111766270822, "loss": 2.7999, "step": 8342 }, { "crossentropy": 2.625678539276123, "epoch": 0.30245794663573083, "grad_norm": 0.03930893540382385, "grad_norm_var": 1.6730226213819175e-05, "learning_rate": 0.008086654643325964, "loss": 2.6065, "step": 8343 }, { "crossentropy": 2.677788019180298, "epoch": 0.3024941995359629, "grad_norm": 0.035294994711875916, "grad_norm_var": 1.5366079589629075e-05, "learning_rate": 0.00808619747869115, "loss": 2.6244, "step": 8344 }, { "crossentropy": 2.764021396636963, "epoch": 0.3025304524361949, "grad_norm": 0.03266290947794914, "grad_norm_var": 1.5366938392724845e-05, "learning_rate": 0.008085740272372558, "loss": 2.72, "step": 8345 }, { "crossentropy": 2.6138646602630615, "epoch": 0.30256670533642693, "grad_norm": 0.0299659613519907, "grad_norm_var": 1.5677602636779362e-05, "learning_rate": 0.008085283024376359, "loss": 2.6042, "step": 8346 }, { "crossentropy": 2.8393917083740234, "epoch": 0.30260295823665895, "grad_norm": 0.03300618752837181, "grad_norm_var": 1.1940897146271492e-05, "learning_rate": 0.00808482573470873, "loss": 2.7379, "step": 8347 }, { "crossentropy": 2.6173312664031982, "epoch": 0.30263921113689096, "grad_norm": 0.03529030829668045, "grad_norm_var": 1.0306186301428887e-05, "learning_rate": 0.008084368403375848, "loss": 2.6199, "step": 8348 }, { "crossentropy": 2.6062042713165283, "epoch": 0.302675464037123, "grad_norm": 0.03387518227100372, "grad_norm_var": 1.045087248608805e-05, "learning_rate": 0.00808391103038389, "loss": 2.6291, "step": 8349 }, { "crossentropy": 2.5073816776275635, "epoch": 0.302711716937355, "grad_norm": 0.030727935954928398, "grad_norm_var": 1.0189049681473318e-05, "learning_rate": 0.008083453615739035, "loss": 2.5586, "step": 8350 }, { "crossentropy": 2.6308650970458984, "epoch": 0.302747969837587, "grad_norm": 0.028738293796777725, "grad_norm_var": 1.0495314437750192e-05, "learning_rate": 0.00808299615944746, "loss": 2.6453, "step": 8351 }, { "crossentropy": 2.7830142974853516, "epoch": 0.302784222737819, "grad_norm": 0.03050939552485943, "grad_norm_var": 9.4124052001426e-06, "learning_rate": 0.008082538661515341, "loss": 2.729, "step": 8352 }, { "crossentropy": 2.5489649772644043, "epoch": 0.30282047563805103, "grad_norm": 0.028753874823451042, "grad_norm_var": 9.88404777423287e-06, "learning_rate": 0.00808208112194886, "loss": 2.6138, "step": 8353 }, { "crossentropy": 2.658022880554199, "epoch": 0.30285672853828305, "grad_norm": 0.0302997175604105, "grad_norm_var": 9.430630620884765e-06, "learning_rate": 0.008081623540754198, "loss": 2.636, "step": 8354 }, { "crossentropy": 2.5108935832977295, "epoch": 0.30289298143851506, "grad_norm": 0.02765672281384468, "grad_norm_var": 1.0708508549065126e-05, "learning_rate": 0.008081165917937531, "loss": 2.6093, "step": 8355 }, { "crossentropy": 2.881098985671997, "epoch": 0.3029292343387471, "grad_norm": 0.03192180395126343, "grad_norm_var": 1.0723740152243283e-05, "learning_rate": 0.008080708253505044, "loss": 2.7702, "step": 8356 }, { "crossentropy": 2.709148406982422, "epoch": 0.3029654872389791, "grad_norm": 0.03006831929087639, "grad_norm_var": 1.1048300585032157e-05, "learning_rate": 0.008080250547462918, "loss": 2.6468, "step": 8357 }, { "crossentropy": 2.5358383655548096, "epoch": 0.30300174013921116, "grad_norm": 0.044152431190013885, "grad_norm_var": 1.8390316032158025e-05, "learning_rate": 0.008079792799817331, "loss": 2.616, "step": 8358 }, { "crossentropy": 2.561741828918457, "epoch": 0.3030379930394432, "grad_norm": 0.030866939574480057, "grad_norm_var": 1.533747761653611e-05, "learning_rate": 0.008079335010574472, "loss": 2.6355, "step": 8359 }, { "crossentropy": 2.634222984313965, "epoch": 0.3030742459396752, "grad_norm": 0.0333617739379406, "grad_norm_var": 1.4750587530524513e-05, "learning_rate": 0.008078877179740518, "loss": 2.676, "step": 8360 }, { "crossentropy": 2.7649574279785156, "epoch": 0.3031104988399072, "grad_norm": 0.031208151951432228, "grad_norm_var": 1.475255008506505e-05, "learning_rate": 0.008078419307321656, "loss": 2.7477, "step": 8361 }, { "crossentropy": 2.566732883453369, "epoch": 0.3031467517401392, "grad_norm": 0.030054908245801926, "grad_norm_var": 1.4730105436030445e-05, "learning_rate": 0.00807796139332407, "loss": 2.5564, "step": 8362 }, { "crossentropy": 2.7159223556518555, "epoch": 0.30318300464037123, "grad_norm": 0.029930148273706436, "grad_norm_var": 1.4870148292305434e-05, "learning_rate": 0.008077503437753944, "loss": 2.7007, "step": 8363 }, { "crossentropy": 2.521179437637329, "epoch": 0.30321925754060325, "grad_norm": 0.02929629571735859, "grad_norm_var": 1.4257064154979808e-05, "learning_rate": 0.008077045440617464, "loss": 2.553, "step": 8364 }, { "crossentropy": 2.6522610187530518, "epoch": 0.30325551044083526, "grad_norm": 0.030444281175732613, "grad_norm_var": 1.3832511208932544e-05, "learning_rate": 0.008076587401920816, "loss": 2.6161, "step": 8365 }, { "crossentropy": 2.4844870567321777, "epoch": 0.3032917633410673, "grad_norm": 0.028236890211701393, "grad_norm_var": 1.4352036683938973e-05, "learning_rate": 0.008076129321670185, "loss": 2.5836, "step": 8366 }, { "crossentropy": 2.441222906112671, "epoch": 0.3033280162412993, "grad_norm": 0.028243931010365486, "grad_norm_var": 1.4514331735978001e-05, "learning_rate": 0.008075671199871761, "loss": 2.5609, "step": 8367 }, { "crossentropy": 2.722987413406372, "epoch": 0.3033642691415313, "grad_norm": 0.029029283672571182, "grad_norm_var": 1.4735806987118992e-05, "learning_rate": 0.008075213036531729, "loss": 2.6773, "step": 8368 }, { "crossentropy": 2.684664726257324, "epoch": 0.3034005220417633, "grad_norm": 0.0285028088837862, "grad_norm_var": 1.4809759444606611e-05, "learning_rate": 0.008074754831656279, "loss": 2.6971, "step": 8369 }, { "crossentropy": 2.712463617324829, "epoch": 0.30343677494199534, "grad_norm": 0.03022894822061062, "grad_norm_var": 1.4815072863574618e-05, "learning_rate": 0.008074296585251597, "loss": 2.6655, "step": 8370 }, { "crossentropy": 2.6949539184570312, "epoch": 0.3034730278422274, "grad_norm": 0.030726440250873566, "grad_norm_var": 1.4107165521825639e-05, "learning_rate": 0.008073838297323874, "loss": 2.7144, "step": 8371 }, { "crossentropy": 2.4553956985473633, "epoch": 0.3035092807424594, "grad_norm": 0.030725346878170967, "grad_norm_var": 1.4052307171239821e-05, "learning_rate": 0.008073379967879303, "loss": 2.4565, "step": 8372 }, { "crossentropy": 2.6167795658111572, "epoch": 0.30354553364269143, "grad_norm": 0.030309131368994713, "grad_norm_var": 1.4027869362389723e-05, "learning_rate": 0.00807292159692407, "loss": 2.658, "step": 8373 }, { "crossentropy": 2.583324909210205, "epoch": 0.30358178654292345, "grad_norm": 0.03225202485918999, "grad_norm_var": 1.9421993904845803e-06, "learning_rate": 0.008072463184464368, "loss": 2.6113, "step": 8374 }, { "crossentropy": 2.7651073932647705, "epoch": 0.30361803944315546, "grad_norm": 0.03161168470978737, "grad_norm_var": 2.0417427543365734e-06, "learning_rate": 0.008072004730506385, "loss": 2.6844, "step": 8375 }, { "crossentropy": 2.702874183654785, "epoch": 0.3036542923433875, "grad_norm": 0.04178540036082268, "grad_norm_var": 9.960199513957717e-06, "learning_rate": 0.00807154623505632, "loss": 2.76, "step": 8376 }, { "crossentropy": 2.8160736560821533, "epoch": 0.3036905452436195, "grad_norm": 0.029218826442956924, "grad_norm_var": 1.0095725395713588e-05, "learning_rate": 0.008071087698120361, "loss": 2.799, "step": 8377 }, { "crossentropy": 2.6216073036193848, "epoch": 0.3037267981438515, "grad_norm": 0.031151482835412025, "grad_norm_var": 1.0082077529552933e-05, "learning_rate": 0.008070629119704702, "loss": 2.6146, "step": 8378 }, { "crossentropy": 2.283921480178833, "epoch": 0.3037630510440835, "grad_norm": 0.033948227763175964, "grad_norm_var": 1.066218921836856e-05, "learning_rate": 0.008070170499815536, "loss": 2.4777, "step": 8379 }, { "crossentropy": 2.5885720252990723, "epoch": 0.30379930394431554, "grad_norm": 0.031742800027132034, "grad_norm_var": 1.0486418269928012e-05, "learning_rate": 0.00806971183845906, "loss": 2.5525, "step": 8380 }, { "crossentropy": 2.6572768688201904, "epoch": 0.30383555684454755, "grad_norm": 0.03223897144198418, "grad_norm_var": 1.0522479088191867e-05, "learning_rate": 0.008069253135641465, "loss": 2.6149, "step": 8381 }, { "crossentropy": 2.6157944202423096, "epoch": 0.30387180974477956, "grad_norm": 0.03295066952705383, "grad_norm_var": 1.0019337720640573e-05, "learning_rate": 0.00806879439136895, "loss": 2.6746, "step": 8382 }, { "crossentropy": 2.5458645820617676, "epoch": 0.3039080626450116, "grad_norm": 0.03247462585568428, "grad_norm_var": 9.277807238538742e-06, "learning_rate": 0.008068335605647708, "loss": 2.561, "step": 8383 }, { "crossentropy": 2.658634662628174, "epoch": 0.3039443155452436, "grad_norm": 0.028037400916218758, "grad_norm_var": 9.706525870734294e-06, "learning_rate": 0.00806787677848394, "loss": 2.6463, "step": 8384 }, { "crossentropy": 2.681887149810791, "epoch": 0.30398056844547566, "grad_norm": 0.02862045168876648, "grad_norm_var": 9.656549710013275e-06, "learning_rate": 0.008067417909883839, "loss": 2.688, "step": 8385 }, { "crossentropy": 2.677872896194458, "epoch": 0.3040168213457077, "grad_norm": 0.028940290212631226, "grad_norm_var": 1.0021929322619656e-05, "learning_rate": 0.008066958999853603, "loss": 2.6774, "step": 8386 }, { "crossentropy": 2.5239310264587402, "epoch": 0.3040530742459397, "grad_norm": 0.030941715463995934, "grad_norm_var": 9.99771773734502e-06, "learning_rate": 0.008066500048399433, "loss": 2.6018, "step": 8387 }, { "crossentropy": 2.7284669876098633, "epoch": 0.3040893271461717, "grad_norm": 0.02928703837096691, "grad_norm_var": 1.0310919172576529e-05, "learning_rate": 0.008066041055527525, "loss": 2.6243, "step": 8388 }, { "crossentropy": 2.6343915462493896, "epoch": 0.3041255800464037, "grad_norm": 0.03203212842345238, "grad_norm_var": 1.0201190650313553e-05, "learning_rate": 0.008065582021244083, "loss": 2.6585, "step": 8389 }, { "crossentropy": 2.886504650115967, "epoch": 0.30416183294663574, "grad_norm": 0.03592619299888611, "grad_norm_var": 1.1314308053658219e-05, "learning_rate": 0.008065122945555298, "loss": 2.7419, "step": 8390 }, { "crossentropy": 2.574932813644409, "epoch": 0.30419808584686775, "grad_norm": 0.0320764034986496, "grad_norm_var": 1.1307974088506367e-05, "learning_rate": 0.00806466382846738, "loss": 2.5381, "step": 8391 }, { "crossentropy": 2.712204933166504, "epoch": 0.30423433874709976, "grad_norm": 0.03136611357331276, "grad_norm_var": 4.444344540500272e-06, "learning_rate": 0.008064204669986526, "loss": 2.6352, "step": 8392 }, { "crossentropy": 2.556133985519409, "epoch": 0.3042705916473318, "grad_norm": 0.031166505068540573, "grad_norm_var": 4.138485532100463e-06, "learning_rate": 0.008063745470118937, "loss": 2.5697, "step": 8393 }, { "crossentropy": 2.5245423316955566, "epoch": 0.3043068445475638, "grad_norm": 0.028460683301091194, "grad_norm_var": 4.691406465751752e-06, "learning_rate": 0.008063286228870816, "loss": 2.5922, "step": 8394 }, { "crossentropy": 2.419940233230591, "epoch": 0.3043430974477958, "grad_norm": 0.03227119892835617, "grad_norm_var": 4.266786820136814e-06, "learning_rate": 0.008062826946248366, "loss": 2.5497, "step": 8395 }, { "crossentropy": 2.565077066421509, "epoch": 0.3043793503480278, "grad_norm": 0.03980845585465431, "grad_norm_var": 8.961269372864678e-06, "learning_rate": 0.008062367622257791, "loss": 2.6019, "step": 8396 }, { "crossentropy": 2.609477996826172, "epoch": 0.30441560324825984, "grad_norm": 0.02996426820755005, "grad_norm_var": 9.10979946267907e-06, "learning_rate": 0.008061908256905292, "loss": 2.682, "step": 8397 }, { "crossentropy": 2.529878616333008, "epoch": 0.3044518561484919, "grad_norm": 0.03018178604543209, "grad_norm_var": 9.060883803373728e-06, "learning_rate": 0.008061448850197077, "loss": 2.529, "step": 8398 }, { "crossentropy": 2.752349853515625, "epoch": 0.3044881090487239, "grad_norm": 0.03175146132707596, "grad_norm_var": 8.984861003832412e-06, "learning_rate": 0.008060989402139349, "loss": 2.7403, "step": 8399 }, { "crossentropy": 2.7239065170288086, "epoch": 0.30452436194895594, "grad_norm": 0.0298838522285223, "grad_norm_var": 8.394222860690298e-06, "learning_rate": 0.008060529912738314, "loss": 2.7062, "step": 8400 }, { "crossentropy": 2.845935106277466, "epoch": 0.30456061484918795, "grad_norm": 0.03345780447125435, "grad_norm_var": 8.052739119768096e-06, "learning_rate": 0.008060070382000179, "loss": 2.7895, "step": 8401 }, { "crossentropy": 2.754664897918701, "epoch": 0.30459686774941996, "grad_norm": 0.03508531302213669, "grad_norm_var": 8.13551342190252e-06, "learning_rate": 0.008059610809931148, "loss": 2.7161, "step": 8402 }, { "crossentropy": 2.6272404193878174, "epoch": 0.304633120649652, "grad_norm": 0.03610970079898834, "grad_norm_var": 9.004011301231436e-06, "learning_rate": 0.008059151196537431, "loss": 2.6626, "step": 8403 }, { "crossentropy": 2.69500994682312, "epoch": 0.304669373549884, "grad_norm": 0.030814949423074722, "grad_norm_var": 8.510280095645736e-06, "learning_rate": 0.008058691541825232, "loss": 2.6474, "step": 8404 }, { "crossentropy": 2.601536512374878, "epoch": 0.304705626450116, "grad_norm": 0.030382853001356125, "grad_norm_var": 8.788077561438359e-06, "learning_rate": 0.008058231845800765, "loss": 2.4865, "step": 8405 }, { "crossentropy": 2.5060248374938965, "epoch": 0.304741879350348, "grad_norm": 0.03647279739379883, "grad_norm_var": 9.062341232359212e-06, "learning_rate": 0.008057772108470236, "loss": 2.4878, "step": 8406 }, { "crossentropy": 2.603099822998047, "epoch": 0.30477813225058004, "grad_norm": 0.03392275050282478, "grad_norm_var": 9.182598618873378e-06, "learning_rate": 0.008057312329839852, "loss": 2.5269, "step": 8407 }, { "crossentropy": 2.537625551223755, "epoch": 0.30481438515081205, "grad_norm": 0.030327044427394867, "grad_norm_var": 9.4166982527523e-06, "learning_rate": 0.008056852509915826, "loss": 2.5957, "step": 8408 }, { "crossentropy": 2.7184622287750244, "epoch": 0.30485063805104406, "grad_norm": 0.0301860049366951, "grad_norm_var": 9.65161868070236e-06, "learning_rate": 0.008056392648704369, "loss": 2.7238, "step": 8409 }, { "crossentropy": 2.655104875564575, "epoch": 0.3048868909512761, "grad_norm": 0.02853427268564701, "grad_norm_var": 9.612887317783748e-06, "learning_rate": 0.008055932746211691, "loss": 2.5969, "step": 8410 }, { "crossentropy": 2.8417789936065674, "epoch": 0.3049231438515081, "grad_norm": 0.03019261546432972, "grad_norm_var": 9.931684970653348e-06, "learning_rate": 0.008055472802444005, "loss": 2.738, "step": 8411 }, { "crossentropy": 2.6255884170532227, "epoch": 0.30495939675174016, "grad_norm": 0.03035431168973446, "grad_norm_var": 6.074923374909631e-06, "learning_rate": 0.008055012817407519, "loss": 2.648, "step": 8412 }, { "crossentropy": 2.712026596069336, "epoch": 0.3049956496519722, "grad_norm": 0.028674276545643806, "grad_norm_var": 6.4820063851752745e-06, "learning_rate": 0.008054552791108449, "loss": 2.7217, "step": 8413 }, { "crossentropy": 2.7205352783203125, "epoch": 0.3050319025522042, "grad_norm": 0.030432838946580887, "grad_norm_var": 6.43694171894384e-06, "learning_rate": 0.00805409272355301, "loss": 2.7309, "step": 8414 }, { "crossentropy": 2.4888954162597656, "epoch": 0.3050681554524362, "grad_norm": 0.029296165332198143, "grad_norm_var": 6.784247134809275e-06, "learning_rate": 0.008053632614747413, "loss": 2.5929, "step": 8415 }, { "crossentropy": 2.5892863273620605, "epoch": 0.3051044083526682, "grad_norm": 0.03027285635471344, "grad_norm_var": 6.70946632927113e-06, "learning_rate": 0.008053172464697873, "loss": 2.5797, "step": 8416 }, { "crossentropy": 2.6341099739074707, "epoch": 0.30514066125290024, "grad_norm": 0.029917871579527855, "grad_norm_var": 6.583833525210248e-06, "learning_rate": 0.008052712273410607, "loss": 2.5963, "step": 8417 }, { "crossentropy": 2.6646029949188232, "epoch": 0.30517691415313225, "grad_norm": 0.02835715375840664, "grad_norm_var": 6.027235797871144e-06, "learning_rate": 0.008052252040891827, "loss": 2.6274, "step": 8418 }, { "crossentropy": 2.680690288543701, "epoch": 0.30521316705336426, "grad_norm": 0.027946539223194122, "grad_norm_var": 4.511402126126116e-06, "learning_rate": 0.008051791767147754, "loss": 2.6226, "step": 8419 }, { "crossentropy": 2.6580140590667725, "epoch": 0.3052494199535963, "grad_norm": 0.030210338532924652, "grad_norm_var": 4.499212627297753e-06, "learning_rate": 0.008051331452184599, "loss": 2.6164, "step": 8420 }, { "crossentropy": 2.5796494483947754, "epoch": 0.3052856728538283, "grad_norm": 0.02994055114686489, "grad_norm_var": 4.509062344093019e-06, "learning_rate": 0.008050871096008582, "loss": 2.6203, "step": 8421 }, { "crossentropy": 2.491983413696289, "epoch": 0.3053219257540603, "grad_norm": 0.029386840760707855, "grad_norm_var": 1.8292896529783476e-06, "learning_rate": 0.008050410698625925, "loss": 2.5356, "step": 8422 }, { "crossentropy": 2.5376930236816406, "epoch": 0.3053581786542923, "grad_norm": 0.029734577983617783, "grad_norm_var": 6.635717837215223e-07, "learning_rate": 0.00804995026004284, "loss": 2.6156, "step": 8423 }, { "crossentropy": 2.6132218837738037, "epoch": 0.30539443155452434, "grad_norm": 0.028004702180624008, "grad_norm_var": 7.787041335497365e-07, "learning_rate": 0.008049489780265547, "loss": 2.5425, "step": 8424 }, { "crossentropy": 2.5561141967773438, "epoch": 0.3054306844547564, "grad_norm": 0.02888837829232216, "grad_norm_var": 7.592185173325878e-07, "learning_rate": 0.008049029259300268, "loss": 2.6173, "step": 8425 }, { "crossentropy": 2.5077154636383057, "epoch": 0.3054669373549884, "grad_norm": 0.03004850447177887, "grad_norm_var": 7.309630896111154e-07, "learning_rate": 0.008048568697153223, "loss": 2.5578, "step": 8426 }, { "crossentropy": 2.4810991287231445, "epoch": 0.30550319025522044, "grad_norm": 0.02911236323416233, "grad_norm_var": 7.010632072715205e-07, "learning_rate": 0.008048108093830629, "loss": 2.6299, "step": 8427 }, { "crossentropy": 2.7767486572265625, "epoch": 0.30553944315545245, "grad_norm": 0.030155736953020096, "grad_norm_var": 6.785557430199114e-07, "learning_rate": 0.00804764744933871, "loss": 2.6726, "step": 8428 }, { "crossentropy": 2.665299892425537, "epoch": 0.30557569605568446, "grad_norm": 0.029429038986563683, "grad_norm_var": 6.412544255276463e-07, "learning_rate": 0.008047186763683689, "loss": 2.7036, "step": 8429 }, { "crossentropy": 2.7309625148773193, "epoch": 0.3056119489559165, "grad_norm": 0.03045523166656494, "grad_norm_var": 6.442324540357573e-07, "learning_rate": 0.008046726036871784, "loss": 2.7635, "step": 8430 }, { "crossentropy": 2.5957553386688232, "epoch": 0.3056482018561485, "grad_norm": 0.028087768703699112, "grad_norm_var": 7.598476126098915e-07, "learning_rate": 0.008046265268909223, "loss": 2.599, "step": 8431 }, { "crossentropy": 2.6171603202819824, "epoch": 0.3056844547563805, "grad_norm": 0.029590671882033348, "grad_norm_var": 7.069734025342995e-07, "learning_rate": 0.008045804459802224, "loss": 2.6396, "step": 8432 }, { "crossentropy": 2.5939748287200928, "epoch": 0.3057207076566125, "grad_norm": 0.03556323051452637, "grad_norm_var": 3.141998676591664e-06, "learning_rate": 0.008045343609557016, "loss": 2.5923, "step": 8433 }, { "crossentropy": 2.5754611492156982, "epoch": 0.30575696055684454, "grad_norm": 0.03294184058904648, "grad_norm_var": 3.645855112581938e-06, "learning_rate": 0.008044882718179817, "loss": 2.6363, "step": 8434 }, { "crossentropy": 2.613508701324463, "epoch": 0.30579321345707655, "grad_norm": 0.03371484950184822, "grad_norm_var": 4.1703213975951534e-06, "learning_rate": 0.008044421785676862, "loss": 2.5879, "step": 8435 }, { "crossentropy": 2.434868812561035, "epoch": 0.30582946635730857, "grad_norm": 0.03994234651327133, "grad_norm_var": 9.935794139746302e-06, "learning_rate": 0.008043960812054367, "loss": 2.4986, "step": 8436 }, { "crossentropy": 2.5015323162078857, "epoch": 0.3058657192575406, "grad_norm": 0.03249852731823921, "grad_norm_var": 1.000479568029285e-05, "learning_rate": 0.008043499797318563, "loss": 2.5675, "step": 8437 }, { "crossentropy": 2.6469764709472656, "epoch": 0.3059019721577726, "grad_norm": 0.028911082074046135, "grad_norm_var": 1.0127435753586507e-05, "learning_rate": 0.008043038741475675, "loss": 2.6006, "step": 8438 }, { "crossentropy": 2.586735725402832, "epoch": 0.30593822505800466, "grad_norm": 0.02845417708158493, "grad_norm_var": 1.0457444253630412e-05, "learning_rate": 0.008042577644531932, "loss": 2.6141, "step": 8439 }, { "crossentropy": 2.579282522201538, "epoch": 0.3059744779582367, "grad_norm": 0.030646709725260735, "grad_norm_var": 9.842997876266047e-06, "learning_rate": 0.008042116506493557, "loss": 2.6036, "step": 8440 }, { "crossentropy": 2.4825024604797363, "epoch": 0.3060107308584687, "grad_norm": 0.030760755762457848, "grad_norm_var": 9.496864344606116e-06, "learning_rate": 0.008041655327366786, "loss": 2.5465, "step": 8441 }, { "crossentropy": 2.692559003829956, "epoch": 0.3060469837587007, "grad_norm": 0.03937305137515068, "grad_norm_var": 1.3412967221348472e-05, "learning_rate": 0.008041194107157841, "loss": 2.6759, "step": 8442 }, { "crossentropy": 2.474980115890503, "epoch": 0.3060832366589327, "grad_norm": 0.03081563673913479, "grad_norm_var": 1.297203202522713e-05, "learning_rate": 0.008040732845872959, "loss": 2.5713, "step": 8443 }, { "crossentropy": 2.6967813968658447, "epoch": 0.30611948955916474, "grad_norm": 0.02929249033331871, "grad_norm_var": 1.3226137399374264e-05, "learning_rate": 0.00804027154351836, "loss": 2.6488, "step": 8444 }, { "crossentropy": 2.7220876216888428, "epoch": 0.30615574245939675, "grad_norm": 0.03132498636841774, "grad_norm_var": 1.28249362959296e-05, "learning_rate": 0.008039810200100283, "loss": 2.7033, "step": 8445 }, { "crossentropy": 2.535479784011841, "epoch": 0.30619199535962877, "grad_norm": 0.029787497594952583, "grad_norm_var": 1.2992413210370646e-05, "learning_rate": 0.008039348815624956, "loss": 2.6796, "step": 8446 }, { "crossentropy": 2.6459977626800537, "epoch": 0.3062282482598608, "grad_norm": 0.032853979617357254, "grad_norm_var": 1.1937700742049862e-05, "learning_rate": 0.008038887390098611, "loss": 2.6196, "step": 8447 }, { "crossentropy": 2.628040075302124, "epoch": 0.3062645011600928, "grad_norm": 0.032739147543907166, "grad_norm_var": 1.1428499992561476e-05, "learning_rate": 0.008038425923527479, "loss": 2.5801, "step": 8448 }, { "crossentropy": 2.741024971008301, "epoch": 0.3063007540603248, "grad_norm": 0.029213236644864082, "grad_norm_var": 1.1335026730387107e-05, "learning_rate": 0.008037964415917795, "loss": 2.6289, "step": 8449 }, { "crossentropy": 2.672686815261841, "epoch": 0.3063370069605568, "grad_norm": 0.02891252189874649, "grad_norm_var": 1.1886397206949523e-05, "learning_rate": 0.00803750286727579, "loss": 2.6877, "step": 8450 }, { "crossentropy": 2.696016311645508, "epoch": 0.30637325986078884, "grad_norm": 0.030406320467591286, "grad_norm_var": 1.1737992360162637e-05, "learning_rate": 0.008037041277607699, "loss": 2.6528, "step": 8451 }, { "crossentropy": 2.6940128803253174, "epoch": 0.3064095127610209, "grad_norm": 0.03263837471604347, "grad_norm_var": 6.968176873807736e-06, "learning_rate": 0.00803657964691976, "loss": 2.6582, "step": 8452 }, { "crossentropy": 2.715501546859741, "epoch": 0.3064457656612529, "grad_norm": 0.030547168105840683, "grad_norm_var": 6.859018689319438e-06, "learning_rate": 0.008036117975218202, "loss": 2.658, "step": 8453 }, { "crossentropy": 2.770886182785034, "epoch": 0.30648201856148494, "grad_norm": 0.027835721150040627, "grad_norm_var": 7.236873900985869e-06, "learning_rate": 0.008035656262509263, "loss": 2.6824, "step": 8454 }, { "crossentropy": 2.735795497894287, "epoch": 0.30651827146171695, "grad_norm": 0.02865937165915966, "grad_norm_var": 7.170534524001543e-06, "learning_rate": 0.008035194508799179, "loss": 2.7006, "step": 8455 }, { "crossentropy": 2.522552967071533, "epoch": 0.30655452436194897, "grad_norm": 0.02917621098458767, "grad_norm_var": 7.372585396944742e-06, "learning_rate": 0.00803473271409419, "loss": 2.5906, "step": 8456 }, { "crossentropy": 2.6593503952026367, "epoch": 0.306590777262181, "grad_norm": 0.03074543923139572, "grad_norm_var": 7.372876315656996e-06, "learning_rate": 0.008034270878400527, "loss": 2.6506, "step": 8457 }, { "crossentropy": 2.688467025756836, "epoch": 0.306627030162413, "grad_norm": 0.028796613216400146, "grad_norm_var": 2.40861517808269e-06, "learning_rate": 0.008033809001724432, "loss": 2.6643, "step": 8458 }, { "crossentropy": 2.766859531402588, "epoch": 0.306663283062645, "grad_norm": 0.029711000621318817, "grad_norm_var": 2.3992193211958462e-06, "learning_rate": 0.008033347084072145, "loss": 2.7648, "step": 8459 }, { "crossentropy": 2.618165969848633, "epoch": 0.306699535962877, "grad_norm": 0.030408497899770737, "grad_norm_var": 2.3472303126402527e-06, "learning_rate": 0.0080328851254499, "loss": 2.6411, "step": 8460 }, { "crossentropy": 2.692934274673462, "epoch": 0.30673578886310904, "grad_norm": 0.0301528163254261, "grad_norm_var": 2.2627127611606067e-06, "learning_rate": 0.00803242312586394, "loss": 2.6364, "step": 8461 }, { "crossentropy": 2.616384983062744, "epoch": 0.30677204176334105, "grad_norm": 0.030661407858133316, "grad_norm_var": 2.2668665354077236e-06, "learning_rate": 0.008031961085320501, "loss": 2.6322, "step": 8462 }, { "crossentropy": 2.6843326091766357, "epoch": 0.30680829466357307, "grad_norm": 0.02983149141073227, "grad_norm_var": 1.7747756237267934e-06, "learning_rate": 0.008031499003825828, "loss": 2.6312, "step": 8463 }, { "crossentropy": 2.502485752105713, "epoch": 0.3068445475638051, "grad_norm": 0.031610921025276184, "grad_norm_var": 1.4463740616673608e-06, "learning_rate": 0.008031036881386162, "loss": 2.5485, "step": 8464 }, { "crossentropy": 2.4758267402648926, "epoch": 0.3068808004640371, "grad_norm": 0.030854249373078346, "grad_norm_var": 1.4520118780101855e-06, "learning_rate": 0.008030574718007742, "loss": 2.5151, "step": 8465 }, { "crossentropy": 2.679687261581421, "epoch": 0.30691705336426917, "grad_norm": 0.029468800872564316, "grad_norm_var": 1.386298256868358e-06, "learning_rate": 0.008030112513696811, "loss": 2.6668, "step": 8466 }, { "crossentropy": 2.6820309162139893, "epoch": 0.3069533062645012, "grad_norm": 0.028456637635827065, "grad_norm_var": 1.5426936684121423e-06, "learning_rate": 0.008029650268459613, "loss": 2.6711, "step": 8467 }, { "crossentropy": 2.6749074459075928, "epoch": 0.3069895591647332, "grad_norm": 0.028389014303684235, "grad_norm_var": 1.1606382673519695e-06, "learning_rate": 0.00802918798230239, "loss": 2.6528, "step": 8468 }, { "crossentropy": 2.513829469680786, "epoch": 0.3070258120649652, "grad_norm": 0.03216324374079704, "grad_norm_var": 1.5049956403040728e-06, "learning_rate": 0.008028725655231387, "loss": 2.558, "step": 8469 }, { "crossentropy": 2.5027570724487305, "epoch": 0.3070620649651972, "grad_norm": 0.029249947518110275, "grad_norm_var": 1.2581754128548894e-06, "learning_rate": 0.008028263287252846, "loss": 2.5554, "step": 8470 }, { "crossentropy": 2.6965136528015137, "epoch": 0.30709831786542924, "grad_norm": 0.03263583406805992, "grad_norm_var": 1.5907982401137337e-06, "learning_rate": 0.008027800878373015, "loss": 2.6961, "step": 8471 }, { "crossentropy": 2.6194987297058105, "epoch": 0.30713457076566125, "grad_norm": 0.030885668471455574, "grad_norm_var": 1.5527368959367677e-06, "learning_rate": 0.008027338428598138, "loss": 2.6715, "step": 8472 }, { "crossentropy": 2.5040230751037598, "epoch": 0.30717082366589327, "grad_norm": 0.031055567786097527, "grad_norm_var": 1.5791789948177426e-06, "learning_rate": 0.008026875937934464, "loss": 2.6515, "step": 8473 }, { "crossentropy": 2.681612968444824, "epoch": 0.3072070765661253, "grad_norm": 0.03281920403242111, "grad_norm_var": 1.7998694043894667e-06, "learning_rate": 0.008026413406388233, "loss": 2.7396, "step": 8474 }, { "crossentropy": 2.709881067276001, "epoch": 0.3072433294663573, "grad_norm": 0.030073851346969604, "grad_norm_var": 1.768855003013899e-06, "learning_rate": 0.008025950833965699, "loss": 2.6921, "step": 8475 }, { "crossentropy": 2.6600067615509033, "epoch": 0.3072795823665893, "grad_norm": 0.029384510591626167, "grad_norm_var": 1.8530019458272052e-06, "learning_rate": 0.008025488220673104, "loss": 2.5962, "step": 8476 }, { "crossentropy": 2.5038199424743652, "epoch": 0.3073158352668213, "grad_norm": 0.029423072934150696, "grad_norm_var": 1.9181995348336074e-06, "learning_rate": 0.008025025566516701, "loss": 2.5576, "step": 8477 }, { "crossentropy": 2.6667280197143555, "epoch": 0.30735208816705334, "grad_norm": 0.029991699382662773, "grad_norm_var": 1.92603351002642e-06, "learning_rate": 0.008024562871502737, "loss": 2.6502, "step": 8478 }, { "crossentropy": 2.690023899078369, "epoch": 0.3073883410672854, "grad_norm": 0.030558587983250618, "grad_norm_var": 1.904604594805366e-06, "learning_rate": 0.008024100135637462, "loss": 2.6956, "step": 8479 }, { "crossentropy": 2.787689685821533, "epoch": 0.3074245939675174, "grad_norm": 0.02897709794342518, "grad_norm_var": 1.926547584531433e-06, "learning_rate": 0.008023637358927124, "loss": 2.7801, "step": 8480 }, { "crossentropy": 2.8257222175598145, "epoch": 0.30746084686774944, "grad_norm": 0.029781315475702286, "grad_norm_var": 1.9155142676950993e-06, "learning_rate": 0.008023174541377978, "loss": 2.7805, "step": 8481 }, { "crossentropy": 2.6777236461639404, "epoch": 0.30749709976798145, "grad_norm": 0.030167654156684875, "grad_norm_var": 1.8772413317571228e-06, "learning_rate": 0.008022711682996269, "loss": 2.5552, "step": 8482 }, { "crossentropy": 2.4842896461486816, "epoch": 0.30753335266821347, "grad_norm": 0.03203346207737923, "grad_norm_var": 1.8211888597162902e-06, "learning_rate": 0.008022248783788252, "loss": 2.556, "step": 8483 }, { "crossentropy": 2.538761615753174, "epoch": 0.3075696055684455, "grad_norm": 0.03535939380526543, "grad_norm_var": 2.9197406107433463e-06, "learning_rate": 0.008021785843760178, "loss": 2.6, "step": 8484 }, { "crossentropy": 2.5448853969573975, "epoch": 0.3076058584686775, "grad_norm": 0.03335711359977722, "grad_norm_var": 3.2083169878796096e-06, "learning_rate": 0.008021322862918304, "loss": 2.5782, "step": 8485 }, { "crossentropy": 2.668409824371338, "epoch": 0.3076421113689095, "grad_norm": 0.031221671029925346, "grad_norm_var": 2.9952575641384867e-06, "learning_rate": 0.008020859841268877, "loss": 2.5703, "step": 8486 }, { "crossentropy": 2.4883005619049072, "epoch": 0.3076783642691415, "grad_norm": 0.03329020366072655, "grad_norm_var": 3.1553349743171915e-06, "learning_rate": 0.008020396778818153, "loss": 2.6608, "step": 8487 }, { "crossentropy": 2.726473808288574, "epoch": 0.30771461716937354, "grad_norm": 0.04008599743247032, "grad_norm_var": 8.122982710610735e-06, "learning_rate": 0.008019933675572388, "loss": 2.6943, "step": 8488 }, { "crossentropy": 2.5474979877471924, "epoch": 0.30775087006960555, "grad_norm": 0.03885648399591446, "grad_norm_var": 1.1231358725591319e-05, "learning_rate": 0.008019470531537835, "loss": 2.6073, "step": 8489 }, { "crossentropy": 2.5495798587799072, "epoch": 0.30778712296983757, "grad_norm": 0.030913220718503, "grad_norm_var": 1.130392792501438e-05, "learning_rate": 0.008019007346720753, "loss": 2.5868, "step": 8490 }, { "crossentropy": 2.6510238647460938, "epoch": 0.3078233758700696, "grad_norm": 0.027917008846998215, "grad_norm_var": 1.2175113157239456e-05, "learning_rate": 0.008018544121127392, "loss": 2.6084, "step": 8491 }, { "crossentropy": 2.4156532287597656, "epoch": 0.3078596287703016, "grad_norm": 0.029737073928117752, "grad_norm_var": 1.2061934158879998e-05, "learning_rate": 0.008018080854764012, "loss": 2.5325, "step": 8492 }, { "crossentropy": 2.604649066925049, "epoch": 0.30789588167053367, "grad_norm": 0.029375579208135605, "grad_norm_var": 1.2078263330241713e-05, "learning_rate": 0.008017617547636871, "loss": 2.6576, "step": 8493 }, { "crossentropy": 2.8053998947143555, "epoch": 0.3079321345707657, "grad_norm": 0.030819060280919075, "grad_norm_var": 1.190209636380729e-05, "learning_rate": 0.008017154199752225, "loss": 2.6946, "step": 8494 }, { "crossentropy": 2.662557363510132, "epoch": 0.3079683874709977, "grad_norm": 0.029573338106274605, "grad_norm_var": 1.215582193924065e-05, "learning_rate": 0.008016690811116334, "loss": 2.6543, "step": 8495 }, { "crossentropy": 2.508286714553833, "epoch": 0.3080046403712297, "grad_norm": 0.03133326396346092, "grad_norm_var": 1.1563621953997182e-05, "learning_rate": 0.008016227381735455, "loss": 2.537, "step": 8496 }, { "crossentropy": 2.6535756587982178, "epoch": 0.3080408932714617, "grad_norm": 0.03443941846489906, "grad_norm_var": 1.1471041370797463e-05, "learning_rate": 0.008015763911615847, "loss": 2.6272, "step": 8497 }, { "crossentropy": 2.6510393619537354, "epoch": 0.30807714617169374, "grad_norm": 0.029975980520248413, "grad_norm_var": 1.1530516149291237e-05, "learning_rate": 0.008015300400763772, "loss": 2.6148, "step": 8498 }, { "crossentropy": 2.5808029174804688, "epoch": 0.30811339907192575, "grad_norm": 0.029260415583848953, "grad_norm_var": 1.2144069442174276e-05, "learning_rate": 0.00801483684918549, "loss": 2.5171, "step": 8499 }, { "crossentropy": 2.676234483718872, "epoch": 0.30814965197215777, "grad_norm": 0.030656903982162476, "grad_norm_var": 1.1557574752120842e-05, "learning_rate": 0.00801437325688726, "loss": 2.6425, "step": 8500 }, { "crossentropy": 2.4608547687530518, "epoch": 0.3081859048723898, "grad_norm": 0.0283130444586277, "grad_norm_var": 1.2185117241867578e-05, "learning_rate": 0.008013909623875343, "loss": 2.6139, "step": 8501 }, { "crossentropy": 2.690406560897827, "epoch": 0.3082221577726218, "grad_norm": 0.028520217165350914, "grad_norm_var": 1.278130180857595e-05, "learning_rate": 0.008013445950156007, "loss": 2.6807, "step": 8502 }, { "crossentropy": 2.7038819789886475, "epoch": 0.3082584106728538, "grad_norm": 0.027654752135276794, "grad_norm_var": 1.3377243208263257e-05, "learning_rate": 0.008012982235735508, "loss": 2.6435, "step": 8503 }, { "crossentropy": 2.5645623207092285, "epoch": 0.3082946635730858, "grad_norm": 0.029169807210564613, "grad_norm_var": 7.730591518808694e-06, "learning_rate": 0.00801251848062011, "loss": 2.6181, "step": 8504 }, { "crossentropy": 2.6030361652374268, "epoch": 0.30833091647331784, "grad_norm": 0.0332254096865654, "grad_norm_var": 3.368615042584561e-06, "learning_rate": 0.00801205468481608, "loss": 2.64, "step": 8505 }, { "crossentropy": 2.6198582649230957, "epoch": 0.3083671693735499, "grad_norm": 0.031995274126529694, "grad_norm_var": 3.5655707447269923e-06, "learning_rate": 0.008011590848329682, "loss": 2.6419, "step": 8506 }, { "crossentropy": 2.5556514263153076, "epoch": 0.3084034222737819, "grad_norm": 0.03130028024315834, "grad_norm_var": 3.285890623008769e-06, "learning_rate": 0.008011126971167177, "loss": 2.6424, "step": 8507 }, { "crossentropy": 2.617156982421875, "epoch": 0.30843967517401394, "grad_norm": 0.03076191432774067, "grad_norm_var": 3.269917347661896e-06, "learning_rate": 0.008010663053334833, "loss": 2.6929, "step": 8508 }, { "crossentropy": 2.605116367340088, "epoch": 0.30847592807424595, "grad_norm": 0.030394170433282852, "grad_norm_var": 3.1958491460471464e-06, "learning_rate": 0.008010199094838915, "loss": 2.5813, "step": 8509 }, { "crossentropy": 2.5511744022369385, "epoch": 0.30851218097447797, "grad_norm": 0.02992311678826809, "grad_norm_var": 3.2033740439554033e-06, "learning_rate": 0.008009735095685692, "loss": 2.596, "step": 8510 }, { "crossentropy": 2.6481504440307617, "epoch": 0.30854843387471, "grad_norm": 0.029725883156061172, "grad_norm_var": 3.1878909634116524e-06, "learning_rate": 0.008009271055881428, "loss": 2.6337, "step": 8511 }, { "crossentropy": 2.577692985534668, "epoch": 0.308584686774942, "grad_norm": 0.030456772074103355, "grad_norm_var": 3.1286643466980524e-06, "learning_rate": 0.008008806975432391, "loss": 2.6092, "step": 8512 }, { "crossentropy": 2.632000684738159, "epoch": 0.308620939675174, "grad_norm": 0.030889656394720078, "grad_norm_var": 1.9858149115950835e-06, "learning_rate": 0.00800834285434485, "loss": 2.6524, "step": 8513 }, { "crossentropy": 2.6998558044433594, "epoch": 0.308657192575406, "grad_norm": 0.031869806349277496, "grad_norm_var": 2.1688182014567656e-06, "learning_rate": 0.008007878692625075, "loss": 2.7045, "step": 8514 }, { "crossentropy": 2.7486507892608643, "epoch": 0.30869344547563804, "grad_norm": 0.030723562464118004, "grad_norm_var": 2.1081320731311063e-06, "learning_rate": 0.008007414490279332, "loss": 2.6294, "step": 8515 }, { "crossentropy": 2.6227118968963623, "epoch": 0.30872969837587005, "grad_norm": 0.02860438823699951, "grad_norm_var": 2.2871110248292675e-06, "learning_rate": 0.008006950247313894, "loss": 2.6347, "step": 8516 }, { "crossentropy": 2.6406142711639404, "epoch": 0.30876595127610207, "grad_norm": 0.0300036258995533, "grad_norm_var": 2.0357781552888163e-06, "learning_rate": 0.008006485963735028, "loss": 2.6451, "step": 8517 }, { "crossentropy": 2.62562894821167, "epoch": 0.3088022041763341, "grad_norm": 0.03236031532287598, "grad_norm_var": 2.0327564503858405e-06, "learning_rate": 0.008006021639549006, "loss": 2.6699, "step": 8518 }, { "crossentropy": 2.600094795227051, "epoch": 0.30883845707656615, "grad_norm": 0.030097292736172676, "grad_norm_var": 1.4574639227275693e-06, "learning_rate": 0.008005557274762104, "loss": 2.6849, "step": 8519 }, { "crossentropy": 2.771974563598633, "epoch": 0.30887470997679817, "grad_norm": 0.02985568717122078, "grad_norm_var": 1.345206747610965e-06, "learning_rate": 0.008005092869380585, "loss": 2.6652, "step": 8520 }, { "crossentropy": 2.6936545372009277, "epoch": 0.3089109628770302, "grad_norm": 0.027936572209000587, "grad_norm_var": 1.3560877156819825e-06, "learning_rate": 0.00800462842341073, "loss": 2.6442, "step": 8521 }, { "crossentropy": 2.638636350631714, "epoch": 0.3089472157772622, "grad_norm": 0.028357308357954025, "grad_norm_var": 1.424562573693923e-06, "learning_rate": 0.008004163936858806, "loss": 2.6899, "step": 8522 }, { "crossentropy": 2.5188417434692383, "epoch": 0.3089834686774942, "grad_norm": 0.02914881333708763, "grad_norm_var": 1.3993163658196709e-06, "learning_rate": 0.00800369940973109, "loss": 2.5933, "step": 8523 }, { "crossentropy": 2.4992892742156982, "epoch": 0.3090197215777262, "grad_norm": 0.03035513125360012, "grad_norm_var": 1.3720928461140776e-06, "learning_rate": 0.008003234842033856, "loss": 2.5364, "step": 8524 }, { "crossentropy": 2.586120367050171, "epoch": 0.30905597447795824, "grad_norm": 0.031779348850250244, "grad_norm_var": 1.5567078426781747e-06, "learning_rate": 0.008002770233773375, "loss": 2.6572, "step": 8525 }, { "crossentropy": 2.5831458568573, "epoch": 0.30909222737819025, "grad_norm": 0.02980569191277027, "grad_norm_var": 1.5608158526796146e-06, "learning_rate": 0.008002305584955929, "loss": 2.6152, "step": 8526 }, { "crossentropy": 2.611100912094116, "epoch": 0.30912848027842227, "grad_norm": 0.027784839272499084, "grad_norm_var": 1.8991002423903702e-06, "learning_rate": 0.00800184089558779, "loss": 2.6374, "step": 8527 }, { "crossentropy": 2.7935242652893066, "epoch": 0.3091647331786543, "grad_norm": 0.028727559372782707, "grad_norm_var": 1.9810873165335487e-06, "learning_rate": 0.00800137616567523, "loss": 2.6581, "step": 8528 }, { "crossentropy": 2.747004508972168, "epoch": 0.3092009860788863, "grad_norm": 0.028099436312913895, "grad_norm_var": 2.0971545994280872e-06, "learning_rate": 0.008000911395224533, "loss": 2.6712, "step": 8529 }, { "crossentropy": 2.4961509704589844, "epoch": 0.3092372389791183, "grad_norm": 0.028943849727511406, "grad_norm_var": 1.7932733307619939e-06, "learning_rate": 0.008000446584241972, "loss": 2.5528, "step": 8530 }, { "crossentropy": 2.700913667678833, "epoch": 0.3092734918793503, "grad_norm": 0.029706206172704697, "grad_norm_var": 1.6969347360271114e-06, "learning_rate": 0.007999981732733828, "loss": 2.604, "step": 8531 }, { "crossentropy": 2.694225788116455, "epoch": 0.30930974477958234, "grad_norm": 0.02846483886241913, "grad_norm_var": 1.7143115109766183e-06, "learning_rate": 0.007999516840706378, "loss": 2.6991, "step": 8532 }, { "crossentropy": 2.6460723876953125, "epoch": 0.3093459976798144, "grad_norm": 0.028992636129260063, "grad_norm_var": 1.7054731490566035e-06, "learning_rate": 0.0079990519081659, "loss": 2.5861, "step": 8533 }, { "crossentropy": 2.738788604736328, "epoch": 0.3093822505800464, "grad_norm": 0.032252825796604156, "grad_norm_var": 1.6637821299747875e-06, "learning_rate": 0.007998586935118675, "loss": 2.7438, "step": 8534 }, { "crossentropy": 2.599137306213379, "epoch": 0.30941850348027844, "grad_norm": 0.03228390961885452, "grad_norm_var": 2.167583632915517e-06, "learning_rate": 0.007998121921570985, "loss": 2.6422, "step": 8535 }, { "crossentropy": 2.5535190105438232, "epoch": 0.30945475638051045, "grad_norm": 0.03021986410021782, "grad_norm_var": 2.1916425791611754e-06, "learning_rate": 0.007997656867529106, "loss": 2.5564, "step": 8536 }, { "crossentropy": 2.5129287242889404, "epoch": 0.30949100928074247, "grad_norm": 0.029136598110198975, "grad_norm_var": 2.0229041202985404e-06, "learning_rate": 0.007997191772999323, "loss": 2.5284, "step": 8537 }, { "crossentropy": 2.641240119934082, "epoch": 0.3095272621809745, "grad_norm": 0.02847488783299923, "grad_norm_var": 2.0038365729257065e-06, "learning_rate": 0.007996726637987917, "loss": 2.617, "step": 8538 }, { "crossentropy": 2.6840012073516846, "epoch": 0.3095635150812065, "grad_norm": 0.031126616522669792, "grad_norm_var": 2.119836388190831e-06, "learning_rate": 0.00799626146250117, "loss": 2.7086, "step": 8539 }, { "crossentropy": 2.579662799835205, "epoch": 0.3095997679814385, "grad_norm": 0.033787596970796585, "grad_norm_var": 3.128733973814061e-06, "learning_rate": 0.007995796246545365, "loss": 2.5226, "step": 8540 }, { "crossentropy": 2.6623806953430176, "epoch": 0.3096360208816705, "grad_norm": 0.030955152586102486, "grad_norm_var": 2.9728138703552827e-06, "learning_rate": 0.007995330990126786, "loss": 2.6178, "step": 8541 }, { "crossentropy": 2.5931589603424072, "epoch": 0.30967227378190254, "grad_norm": 0.029157603159546852, "grad_norm_var": 3.009172212957535e-06, "learning_rate": 0.007994865693251713, "loss": 2.6952, "step": 8542 }, { "crossentropy": 2.6955204010009766, "epoch": 0.30970852668213456, "grad_norm": 0.032831914722919464, "grad_norm_var": 3.1898606134098897e-06, "learning_rate": 0.007994400355926438, "loss": 2.6501, "step": 8543 }, { "crossentropy": 2.6407580375671387, "epoch": 0.30974477958236657, "grad_norm": 0.03195502981543541, "grad_norm_var": 3.2082970618828774e-06, "learning_rate": 0.00799393497815724, "loss": 2.6063, "step": 8544 }, { "crossentropy": 2.6448559761047363, "epoch": 0.3097810324825986, "grad_norm": 0.030660158023238182, "grad_norm_var": 2.8328830400873993e-06, "learning_rate": 0.007993469559950409, "loss": 2.6253, "step": 8545 }, { "crossentropy": 2.6420321464538574, "epoch": 0.30981728538283065, "grad_norm": 0.02984747290611267, "grad_norm_var": 2.6892753388453477e-06, "learning_rate": 0.007993004101312226, "loss": 2.6459, "step": 8546 }, { "crossentropy": 2.7161362171173096, "epoch": 0.30985353828306267, "grad_norm": 0.028342269361019135, "grad_norm_var": 2.9709685113006127e-06, "learning_rate": 0.00799253860224898, "loss": 2.6212, "step": 8547 }, { "crossentropy": 2.675537586212158, "epoch": 0.3098897911832947, "grad_norm": 0.030131801962852478, "grad_norm_var": 2.685504838430275e-06, "learning_rate": 0.007992073062766962, "loss": 2.6531, "step": 8548 }, { "crossentropy": 2.5913915634155273, "epoch": 0.3099260440835267, "grad_norm": 0.02920369803905487, "grad_norm_var": 2.6420767482655207e-06, "learning_rate": 0.007991607482872455, "loss": 2.5719, "step": 8549 }, { "crossentropy": 2.823662281036377, "epoch": 0.3099622969837587, "grad_norm": 0.03158732131123543, "grad_norm_var": 2.52735192976503e-06, "learning_rate": 0.007991141862571749, "loss": 2.7117, "step": 8550 }, { "crossentropy": 2.7842113971710205, "epoch": 0.3099985498839907, "grad_norm": 0.02952146716415882, "grad_norm_var": 2.386413486629944e-06, "learning_rate": 0.007990676201871133, "loss": 2.7162, "step": 8551 }, { "crossentropy": 2.7319247722625732, "epoch": 0.31003480278422274, "grad_norm": 0.02876126952469349, "grad_norm_var": 2.560971841446717e-06, "learning_rate": 0.007990210500776897, "loss": 2.7557, "step": 8552 }, { "crossentropy": 2.64078950881958, "epoch": 0.31007105568445475, "grad_norm": 0.03131724148988724, "grad_norm_var": 2.5075376667727517e-06, "learning_rate": 0.00798974475929533, "loss": 2.6877, "step": 8553 }, { "crossentropy": 2.5655345916748047, "epoch": 0.31010730858468677, "grad_norm": 0.03294872120022774, "grad_norm_var": 2.5631046974829037e-06, "learning_rate": 0.00798927897743272, "loss": 2.5972, "step": 8554 }, { "crossentropy": 2.5245509147644043, "epoch": 0.3101435614849188, "grad_norm": 0.03252292424440384, "grad_norm_var": 2.7535010005891633e-06, "learning_rate": 0.007988813155195366, "loss": 2.5829, "step": 8555 }, { "crossentropy": 2.6487045288085938, "epoch": 0.3101798143851508, "grad_norm": 0.02939886786043644, "grad_norm_var": 2.235834283560202e-06, "learning_rate": 0.007988347292589551, "loss": 2.7203, "step": 8556 }, { "crossentropy": 2.6030397415161133, "epoch": 0.3102160672853828, "grad_norm": 0.028551045805215836, "grad_norm_var": 2.4740666976673925e-06, "learning_rate": 0.007987881389621573, "loss": 2.5681, "step": 8557 }, { "crossentropy": 2.707369089126587, "epoch": 0.3102523201856148, "grad_norm": 0.028746675699949265, "grad_norm_var": 2.553852071823561e-06, "learning_rate": 0.007987415446297722, "loss": 2.6974, "step": 8558 }, { "crossentropy": 2.4228835105895996, "epoch": 0.31028857308584684, "grad_norm": 0.029456852003932, "grad_norm_var": 2.1693821947249415e-06, "learning_rate": 0.007986949462624291, "loss": 2.4729, "step": 8559 }, { "crossentropy": 2.6256632804870605, "epoch": 0.3103248259860789, "grad_norm": 0.029368842020630836, "grad_norm_var": 1.976899733350558e-06, "learning_rate": 0.007986483438607576, "loss": 2.6187, "step": 8560 }, { "crossentropy": 2.5626044273376465, "epoch": 0.3103610788863109, "grad_norm": 0.0278908833861351, "grad_norm_var": 2.220911137721044e-06, "learning_rate": 0.00798601737425387, "loss": 2.5194, "step": 8561 }, { "crossentropy": 2.504610061645508, "epoch": 0.31039733178654294, "grad_norm": 0.028834199532866478, "grad_norm_var": 2.285400396191626e-06, "learning_rate": 0.007985551269569466, "loss": 2.5603, "step": 8562 }, { "crossentropy": 2.7015366554260254, "epoch": 0.31043358468677495, "grad_norm": 0.05166711285710335, "grad_norm_var": 3.179687783547389e-05, "learning_rate": 0.007985085124560663, "loss": 2.6099, "step": 8563 }, { "crossentropy": 2.6308388710021973, "epoch": 0.31046983758700697, "grad_norm": 0.039285510778427124, "grad_norm_var": 3.567596969099538e-05, "learning_rate": 0.007984618939233754, "loss": 2.6125, "step": 8564 }, { "crossentropy": 2.6725099086761475, "epoch": 0.310506090487239, "grad_norm": 0.0415443517267704, "grad_norm_var": 4.08951857327315e-05, "learning_rate": 0.007984152713595039, "loss": 2.6742, "step": 8565 }, { "crossentropy": 2.631057024002075, "epoch": 0.310542343387471, "grad_norm": 0.034808579832315445, "grad_norm_var": 4.1114051278221855e-05, "learning_rate": 0.00798368644765081, "loss": 2.5726, "step": 8566 }, { "crossentropy": 2.7597618103027344, "epoch": 0.310578596287703, "grad_norm": 0.028997959569096565, "grad_norm_var": 4.1359259519894474e-05, "learning_rate": 0.007983220141407371, "loss": 2.6907, "step": 8567 }, { "crossentropy": 2.5606818199157715, "epoch": 0.310614849187935, "grad_norm": 0.0288226380944252, "grad_norm_var": 4.1326805538913314e-05, "learning_rate": 0.007982753794871015, "loss": 2.5667, "step": 8568 }, { "crossentropy": 2.67155122756958, "epoch": 0.31065110208816704, "grad_norm": 0.030695101246237755, "grad_norm_var": 4.147068891932731e-05, "learning_rate": 0.007982287408048043, "loss": 2.6377, "step": 8569 }, { "crossentropy": 2.6548609733581543, "epoch": 0.31068735498839906, "grad_norm": 0.030403204262256622, "grad_norm_var": 4.1798468849897574e-05, "learning_rate": 0.007981820980944753, "loss": 2.5732, "step": 8570 }, { "crossentropy": 2.6877896785736084, "epoch": 0.31072360788863107, "grad_norm": 0.03693975880742073, "grad_norm_var": 4.299463223978011e-05, "learning_rate": 0.007981354513567447, "loss": 2.6226, "step": 8571 }, { "crossentropy": 2.8079302310943604, "epoch": 0.3107598607888631, "grad_norm": 0.03799908980727196, "grad_norm_var": 4.367347405214958e-05, "learning_rate": 0.007980888005922422, "loss": 2.7191, "step": 8572 }, { "crossentropy": 2.7409305572509766, "epoch": 0.31079611368909515, "grad_norm": 0.03571309894323349, "grad_norm_var": 4.227211844437064e-05, "learning_rate": 0.007980421458015982, "loss": 2.7576, "step": 8573 }, { "crossentropy": 2.6665430068969727, "epoch": 0.31083236658932717, "grad_norm": 0.031545400619506836, "grad_norm_var": 4.0867237587718684e-05, "learning_rate": 0.007979954869854427, "loss": 2.676, "step": 8574 }, { "crossentropy": 2.581510543823242, "epoch": 0.3108686194895592, "grad_norm": 0.02798241190612316, "grad_norm_var": 4.1895920681226507e-05, "learning_rate": 0.007979488241444059, "loss": 2.4981, "step": 8575 }, { "crossentropy": 2.452754497528076, "epoch": 0.3109048723897912, "grad_norm": 0.029368123039603233, "grad_norm_var": 4.189635567711119e-05, "learning_rate": 0.007979021572791182, "loss": 2.4864, "step": 8576 }, { "crossentropy": 2.6396005153656006, "epoch": 0.3109411252900232, "grad_norm": 0.029643705114722252, "grad_norm_var": 4.0682568591649385e-05, "learning_rate": 0.007978554863902096, "loss": 2.6799, "step": 8577 }, { "crossentropy": 2.5983309745788574, "epoch": 0.3109773781902552, "grad_norm": 0.028581656515598297, "grad_norm_var": 4.086102627595422e-05, "learning_rate": 0.007978088114783108, "loss": 2.5783, "step": 8578 }, { "crossentropy": 2.7142248153686523, "epoch": 0.31101363109048724, "grad_norm": 0.03175089880824089, "grad_norm_var": 1.8736685438867786e-05, "learning_rate": 0.00797762132544052, "loss": 2.6785, "step": 8579 }, { "crossentropy": 2.8480255603790283, "epoch": 0.31104988399071926, "grad_norm": 0.03181574493646622, "grad_norm_var": 1.5719930893238324e-05, "learning_rate": 0.007977154495880637, "loss": 2.6864, "step": 8580 }, { "crossentropy": 2.837547779083252, "epoch": 0.31108613689095127, "grad_norm": 0.03165195509791374, "grad_norm_var": 9.627456916170294e-06, "learning_rate": 0.007976687626109766, "loss": 2.8483, "step": 8581 }, { "crossentropy": 2.620081901550293, "epoch": 0.3111223897911833, "grad_norm": 0.030786730349063873, "grad_norm_var": 8.955336169127896e-06, "learning_rate": 0.00797622071613421, "loss": 2.5954, "step": 8582 }, { "crossentropy": 2.6950416564941406, "epoch": 0.3111586426914153, "grad_norm": 0.031089216470718384, "grad_norm_var": 8.553715557247516e-06, "learning_rate": 0.007975753765960276, "loss": 2.6602, "step": 8583 }, { "crossentropy": 2.618537187576294, "epoch": 0.3111948955916473, "grad_norm": 0.032544974237680435, "grad_norm_var": 8.06643063011956e-06, "learning_rate": 0.007975286775594273, "loss": 2.6437, "step": 8584 }, { "crossentropy": 2.589768409729004, "epoch": 0.31123114849187933, "grad_norm": 0.0334555022418499, "grad_norm_var": 8.142653519433448e-06, "learning_rate": 0.007974819745042504, "loss": 2.5543, "step": 8585 }, { "crossentropy": 2.663731813430786, "epoch": 0.31126740139211134, "grad_norm": 0.031078165397047997, "grad_norm_var": 8.031521199803465e-06, "learning_rate": 0.007974352674311284, "loss": 2.6602, "step": 8586 }, { "crossentropy": 2.7209179401397705, "epoch": 0.3113036542923434, "grad_norm": 0.028693852946162224, "grad_norm_var": 6.846487317461305e-06, "learning_rate": 0.007973885563406915, "loss": 2.6866, "step": 8587 }, { "crossentropy": 2.6885876655578613, "epoch": 0.3113399071925754, "grad_norm": 0.02881804294884205, "grad_norm_var": 4.136008241076526e-06, "learning_rate": 0.007973418412335708, "loss": 2.6281, "step": 8588 }, { "crossentropy": 2.705313205718994, "epoch": 0.31137616009280744, "grad_norm": 0.028757385909557343, "grad_norm_var": 2.703000602275139e-06, "learning_rate": 0.007972951221103976, "loss": 2.7232, "step": 8589 }, { "crossentropy": 2.6042466163635254, "epoch": 0.31141241299303946, "grad_norm": 0.02860056608915329, "grad_norm_var": 2.8238275459978775e-06, "learning_rate": 0.007972483989718025, "loss": 2.5709, "step": 8590 }, { "crossentropy": 2.5586137771606445, "epoch": 0.31144866589327147, "grad_norm": 0.02758939564228058, "grad_norm_var": 2.954335027101337e-06, "learning_rate": 0.007972016718184165, "loss": 2.6057, "step": 8591 }, { "crossentropy": 2.690526247024536, "epoch": 0.3114849187935035, "grad_norm": 0.02930942364037037, "grad_norm_var": 2.9615629740087675e-06, "learning_rate": 0.007971549406508708, "loss": 2.7419, "step": 8592 }, { "crossentropy": 2.633662223815918, "epoch": 0.3115211716937355, "grad_norm": 0.027517296373844147, "grad_norm_var": 3.419024375644897e-06, "learning_rate": 0.00797108205469797, "loss": 2.6212, "step": 8593 }, { "crossentropy": 2.5648512840270996, "epoch": 0.3115574245939675, "grad_norm": 0.02941501885652542, "grad_norm_var": 3.2906582005554076e-06, "learning_rate": 0.00797061466275826, "loss": 2.6057, "step": 8594 }, { "crossentropy": 2.6059203147888184, "epoch": 0.31159367749419953, "grad_norm": 0.028649156913161278, "grad_norm_var": 3.242137993979136e-06, "learning_rate": 0.00797014723069589, "loss": 2.5969, "step": 8595 }, { "crossentropy": 2.597310781478882, "epoch": 0.31162993039443154, "grad_norm": 0.03333791345357895, "grad_norm_var": 3.7583529669266233e-06, "learning_rate": 0.007969679758517173, "loss": 2.6045, "step": 8596 }, { "crossentropy": 2.6739001274108887, "epoch": 0.31166618329466356, "grad_norm": 0.02779969945549965, "grad_norm_var": 3.878903924346223e-06, "learning_rate": 0.007969212246228426, "loss": 2.6954, "step": 8597 }, { "crossentropy": 2.7226474285125732, "epoch": 0.31170243619489557, "grad_norm": 0.02760324813425541, "grad_norm_var": 4.110522734585877e-06, "learning_rate": 0.00796874469383596, "loss": 2.6481, "step": 8598 }, { "crossentropy": 2.6008501052856445, "epoch": 0.3117386890951276, "grad_norm": 0.028766274452209473, "grad_norm_var": 3.99928208051342e-06, "learning_rate": 0.007968277101346092, "loss": 2.5774, "step": 8599 }, { "crossentropy": 2.5633716583251953, "epoch": 0.31177494199535966, "grad_norm": 0.02824285626411438, "grad_norm_var": 3.4071032024800736e-06, "learning_rate": 0.007967809468765139, "loss": 2.5809, "step": 8600 }, { "crossentropy": 2.6867337226867676, "epoch": 0.31181119489559167, "grad_norm": 0.03066822700202465, "grad_norm_var": 2.3212348223835503e-06, "learning_rate": 0.007967341796099415, "loss": 2.67, "step": 8601 }, { "crossentropy": 2.7346677780151367, "epoch": 0.3118474477958237, "grad_norm": 0.02872839756309986, "grad_norm_var": 2.031804830166641e-06, "learning_rate": 0.007966874083355235, "loss": 2.669, "step": 8602 }, { "crossentropy": 2.638845920562744, "epoch": 0.3118837006960557, "grad_norm": 0.030421797186136246, "grad_norm_var": 2.169528801302239e-06, "learning_rate": 0.00796640633053892, "loss": 2.6078, "step": 8603 }, { "crossentropy": 2.6732280254364014, "epoch": 0.3119199535962877, "grad_norm": 0.033443327993154526, "grad_norm_var": 3.385733054666916e-06, "learning_rate": 0.007965938537656784, "loss": 2.6448, "step": 8604 }, { "crossentropy": 2.6198034286499023, "epoch": 0.31195620649651973, "grad_norm": 0.036952584981918335, "grad_norm_var": 6.986989175278392e-06, "learning_rate": 0.007965470704715148, "loss": 2.6607, "step": 8605 }, { "crossentropy": 2.625983238220215, "epoch": 0.31199245939675174, "grad_norm": 0.03341297432780266, "grad_norm_var": 7.654988944105868e-06, "learning_rate": 0.007965002831720329, "loss": 2.6558, "step": 8606 }, { "crossentropy": 2.7570459842681885, "epoch": 0.31202871229698376, "grad_norm": 0.043857280164957047, "grad_norm_var": 1.8714692878551972e-05, "learning_rate": 0.00796453491867865, "loss": 2.7289, "step": 8607 }, { "crossentropy": 2.621387004852295, "epoch": 0.31206496519721577, "grad_norm": 0.03191135823726654, "grad_norm_var": 1.8505233044587415e-05, "learning_rate": 0.007964066965596426, "loss": 2.7381, "step": 8608 }, { "crossentropy": 2.7524213790893555, "epoch": 0.3121012180974478, "grad_norm": 0.03459625318646431, "grad_norm_var": 1.8071145909928413e-05, "learning_rate": 0.007963598972479979, "loss": 2.7082, "step": 8609 }, { "crossentropy": 2.636152982711792, "epoch": 0.3121374709976798, "grad_norm": 0.03846919909119606, "grad_norm_var": 2.0390546605773308e-05, "learning_rate": 0.00796313093933563, "loss": 2.6787, "step": 8610 }, { "crossentropy": 2.6570303440093994, "epoch": 0.3121737238979118, "grad_norm": 0.04313984885811806, "grad_norm_var": 2.6453228933639287e-05, "learning_rate": 0.007962662866169704, "loss": 2.5837, "step": 8611 }, { "crossentropy": 2.6330087184906006, "epoch": 0.31220997679814383, "grad_norm": 0.03567872196435928, "grad_norm_var": 2.683578412282782e-05, "learning_rate": 0.00796219475298852, "loss": 2.5842, "step": 8612 }, { "crossentropy": 2.803830146789551, "epoch": 0.31224622969837584, "grad_norm": 0.032507941126823425, "grad_norm_var": 2.473335627269899e-05, "learning_rate": 0.007961726599798397, "loss": 2.7231, "step": 8613 }, { "crossentropy": 2.715672016143799, "epoch": 0.3122824825986079, "grad_norm": 0.03025462105870247, "grad_norm_var": 2.3035085112090707e-05, "learning_rate": 0.007961258406605663, "loss": 2.624, "step": 8614 }, { "crossentropy": 2.561389207839966, "epoch": 0.31231873549883993, "grad_norm": 0.03639119863510132, "grad_norm_var": 2.153524086727111e-05, "learning_rate": 0.007960790173416641, "loss": 2.6294, "step": 8615 }, { "crossentropy": 2.5749406814575195, "epoch": 0.31235498839907194, "grad_norm": 0.032548461109399796, "grad_norm_var": 1.9221019486378014e-05, "learning_rate": 0.007960321900237655, "loss": 2.5367, "step": 8616 }, { "crossentropy": 2.6479508876800537, "epoch": 0.31239124129930396, "grad_norm": 0.030561476945877075, "grad_norm_var": 1.9277144383333003e-05, "learning_rate": 0.007959853587075027, "loss": 2.6706, "step": 8617 }, { "crossentropy": 2.7025816440582275, "epoch": 0.31242749419953597, "grad_norm": 0.02899639867246151, "grad_norm_var": 1.9073438810191185e-05, "learning_rate": 0.007959385233935086, "loss": 2.6905, "step": 8618 }, { "crossentropy": 2.8494691848754883, "epoch": 0.312463747099768, "grad_norm": 0.035674188286066055, "grad_norm_var": 1.78915739828272e-05, "learning_rate": 0.007958916840824157, "loss": 2.7317, "step": 8619 }, { "crossentropy": 2.7264626026153564, "epoch": 0.3125, "grad_norm": 0.02913724072277546, "grad_norm_var": 1.988666464131488e-05, "learning_rate": 0.007958448407748566, "loss": 2.6486, "step": 8620 }, { "crossentropy": 2.7076363563537598, "epoch": 0.312536252900232, "grad_norm": 0.03876107186079025, "grad_norm_var": 2.0650980386158666e-05, "learning_rate": 0.00795797993471464, "loss": 2.6876, "step": 8621 }, { "crossentropy": 2.785219430923462, "epoch": 0.31257250580046403, "grad_norm": 0.03209727630019188, "grad_norm_var": 2.0992605547711744e-05, "learning_rate": 0.007957511421728703, "loss": 2.7107, "step": 8622 }, { "crossentropy": 2.7084102630615234, "epoch": 0.31260875870069604, "grad_norm": 0.030273783951997757, "grad_norm_var": 1.5869621100037443e-05, "learning_rate": 0.007957042868797089, "loss": 2.6531, "step": 8623 }, { "crossentropy": 2.632514715194702, "epoch": 0.31264501160092806, "grad_norm": 0.033331844955682755, "grad_norm_var": 1.563567098799423e-05, "learning_rate": 0.007956574275926123, "loss": 2.5893, "step": 8624 }, { "crossentropy": 2.6161210536956787, "epoch": 0.31268126450116007, "grad_norm": 0.0333111397922039, "grad_norm_var": 1.561979797629753e-05, "learning_rate": 0.007956105643122133, "loss": 2.6327, "step": 8625 }, { "crossentropy": 2.541872501373291, "epoch": 0.3127175174013921, "grad_norm": 0.03162746876478195, "grad_norm_var": 1.4305057171479693e-05, "learning_rate": 0.007955636970391452, "loss": 2.6102, "step": 8626 }, { "crossentropy": 2.711378574371338, "epoch": 0.31275377030162416, "grad_norm": 0.02859213575720787, "grad_norm_var": 8.626957399001517e-06, "learning_rate": 0.007955168257740407, "loss": 2.7251, "step": 8627 }, { "crossentropy": 2.449183940887451, "epoch": 0.31279002320185617, "grad_norm": 0.030028872191905975, "grad_norm_var": 8.215426662207953e-06, "learning_rate": 0.007954699505175333, "loss": 2.5987, "step": 8628 }, { "crossentropy": 2.611480236053467, "epoch": 0.3128262761020882, "grad_norm": 0.029374951496720314, "grad_norm_var": 8.671419839592386e-06, "learning_rate": 0.007954230712702555, "loss": 2.7057, "step": 8629 }, { "crossentropy": 2.6686789989471436, "epoch": 0.3128625290023202, "grad_norm": 0.03119412437081337, "grad_norm_var": 8.516073603277705e-06, "learning_rate": 0.00795376188032841, "loss": 2.6901, "step": 8630 }, { "crossentropy": 2.6981167793273926, "epoch": 0.3128987819025522, "grad_norm": 0.03455670177936554, "grad_norm_var": 7.650820657192206e-06, "learning_rate": 0.007953293008059227, "loss": 2.7182, "step": 8631 }, { "crossentropy": 2.6256885528564453, "epoch": 0.31293503480278423, "grad_norm": 0.029804229736328125, "grad_norm_var": 7.876613602066313e-06, "learning_rate": 0.007952824095901341, "loss": 2.5909, "step": 8632 }, { "crossentropy": 2.7194385528564453, "epoch": 0.31297128770301624, "grad_norm": 0.029817869886755943, "grad_norm_var": 8.024816517723278e-06, "learning_rate": 0.007952355143861084, "loss": 2.666, "step": 8633 }, { "crossentropy": 2.6838691234588623, "epoch": 0.31300754060324826, "grad_norm": 0.030859889462590218, "grad_norm_var": 7.579741327157219e-06, "learning_rate": 0.00795188615194479, "loss": 2.6611, "step": 8634 }, { "crossentropy": 2.6094882488250732, "epoch": 0.31304379350348027, "grad_norm": 0.041129760444164276, "grad_norm_var": 1.227430728909003e-05, "learning_rate": 0.007951417120158795, "loss": 2.7043, "step": 8635 }, { "crossentropy": 2.4673495292663574, "epoch": 0.3130800464037123, "grad_norm": 0.039796166121959686, "grad_norm_var": 1.5137954668316513e-05, "learning_rate": 0.007950948048509432, "loss": 2.5855, "step": 8636 }, { "crossentropy": 2.8966968059539795, "epoch": 0.3131162993039443, "grad_norm": 0.031331248581409454, "grad_norm_var": 1.2667774429830321e-05, "learning_rate": 0.007950478937003038, "loss": 2.7783, "step": 8637 }, { "crossentropy": 2.591451406478882, "epoch": 0.3131525522041763, "grad_norm": 0.0316503643989563, "grad_norm_var": 1.2693557082989641e-05, "learning_rate": 0.007950009785645949, "loss": 2.6552, "step": 8638 }, { "crossentropy": 2.5902390480041504, "epoch": 0.31318880510440833, "grad_norm": 0.03441336750984192, "grad_norm_var": 1.2650328385488972e-05, "learning_rate": 0.0079495405944445, "loss": 2.7209, "step": 8639 }, { "crossentropy": 2.609477996826172, "epoch": 0.31322505800464034, "grad_norm": 0.029754728078842163, "grad_norm_var": 1.3077763820469471e-05, "learning_rate": 0.007949071363405029, "loss": 2.6184, "step": 8640 }, { "crossentropy": 2.6571035385131836, "epoch": 0.3132613109048724, "grad_norm": 0.028828412294387817, "grad_norm_var": 1.37458858861668e-05, "learning_rate": 0.007948602092533874, "loss": 2.6358, "step": 8641 }, { "crossentropy": 2.8154375553131104, "epoch": 0.31329756380510443, "grad_norm": 0.030632713809609413, "grad_norm_var": 1.3863444806563816e-05, "learning_rate": 0.007948132781837374, "loss": 2.7383, "step": 8642 }, { "crossentropy": 2.6508331298828125, "epoch": 0.31333381670533644, "grad_norm": 0.03534634783864021, "grad_norm_var": 1.365886123135118e-05, "learning_rate": 0.007947663431321866, "loss": 2.6723, "step": 8643 }, { "crossentropy": 2.6428771018981934, "epoch": 0.31337006960556846, "grad_norm": 0.0348324179649353, "grad_norm_var": 1.3577553227880843e-05, "learning_rate": 0.00794719404099369, "loss": 2.5912, "step": 8644 }, { "crossentropy": 2.7615227699279785, "epoch": 0.31340632250580047, "grad_norm": 0.029644710943102837, "grad_norm_var": 1.3462229099849473e-05, "learning_rate": 0.007946724610859187, "loss": 2.7018, "step": 8645 }, { "crossentropy": 2.6918296813964844, "epoch": 0.3134425754060325, "grad_norm": 0.03130047768354416, "grad_norm_var": 1.3441233703213352e-05, "learning_rate": 0.007946255140924695, "loss": 2.7165, "step": 8646 }, { "crossentropy": 2.484208822250366, "epoch": 0.3134788283062645, "grad_norm": 0.031212417408823967, "grad_norm_var": 1.3326254721852471e-05, "learning_rate": 0.007945785631196557, "loss": 2.5431, "step": 8647 }, { "crossentropy": 2.4864988327026367, "epoch": 0.3135150812064965, "grad_norm": 0.0288498867303133, "grad_norm_var": 1.3729027385212382e-05, "learning_rate": 0.007945316081681112, "loss": 2.5967, "step": 8648 }, { "crossentropy": 2.7024624347686768, "epoch": 0.31355133410672853, "grad_norm": 0.029817864298820496, "grad_norm_var": 1.3729029355653598e-05, "learning_rate": 0.007944846492384708, "loss": 2.6664, "step": 8649 }, { "crossentropy": 2.7163760662078857, "epoch": 0.31358758700696054, "grad_norm": 0.02949286438524723, "grad_norm_var": 1.4137943354876901e-05, "learning_rate": 0.007944376863313679, "loss": 2.7163, "step": 8650 }, { "crossentropy": 2.532958984375, "epoch": 0.31362383990719256, "grad_norm": 0.030276650562882423, "grad_norm_var": 8.834020008671034e-06, "learning_rate": 0.007943907194474374, "loss": 2.654, "step": 8651 }, { "crossentropy": 2.6933882236480713, "epoch": 0.3136600928074246, "grad_norm": 0.029631318524479866, "grad_norm_var": 4.317298579313552e-06, "learning_rate": 0.007943437485873135, "loss": 2.6607, "step": 8652 }, { "crossentropy": 2.6988797187805176, "epoch": 0.3136963457076566, "grad_norm": 0.034354571253061295, "grad_norm_var": 4.996515890673389e-06, "learning_rate": 0.007942967737516304, "loss": 2.7024, "step": 8653 }, { "crossentropy": 2.662997245788574, "epoch": 0.31373259860788866, "grad_norm": 0.028554927557706833, "grad_norm_var": 5.431142556190288e-06, "learning_rate": 0.007942497949410229, "loss": 2.6558, "step": 8654 }, { "crossentropy": 2.590937852859497, "epoch": 0.31376885150812067, "grad_norm": 0.029055343940854073, "grad_norm_var": 4.8290334723490844e-06, "learning_rate": 0.007942028121561253, "loss": 2.5384, "step": 8655 }, { "crossentropy": 2.7393851280212402, "epoch": 0.3138051044083527, "grad_norm": 0.03142430633306503, "grad_norm_var": 4.787458648832359e-06, "learning_rate": 0.007941558253975724, "loss": 2.6321, "step": 8656 }, { "crossentropy": 2.6788523197174072, "epoch": 0.3138413573085847, "grad_norm": 0.03014315292239189, "grad_norm_var": 4.544888125275018e-06, "learning_rate": 0.007941088346659986, "loss": 2.6306, "step": 8657 }, { "crossentropy": 2.6289939880371094, "epoch": 0.3138776102088167, "grad_norm": 0.02981940284371376, "grad_norm_var": 4.61636720284797e-06, "learning_rate": 0.007940618399620385, "loss": 2.6304, "step": 8658 }, { "crossentropy": 2.6726932525634766, "epoch": 0.31391386310904873, "grad_norm": 0.02923697978258133, "grad_norm_var": 3.2944708240424525e-06, "learning_rate": 0.007940148412863272, "loss": 2.584, "step": 8659 }, { "crossentropy": 2.596452474594116, "epoch": 0.31395011600928074, "grad_norm": 0.029705382883548737, "grad_norm_var": 1.9606456797125e-06, "learning_rate": 0.007939678386394993, "loss": 2.625, "step": 8660 }, { "crossentropy": 2.723163604736328, "epoch": 0.31398636890951276, "grad_norm": 0.02815104089677334, "grad_norm_var": 2.2022145503163786e-06, "learning_rate": 0.007939208320221896, "loss": 2.5979, "step": 8661 }, { "crossentropy": 2.4765186309814453, "epoch": 0.3140226218097448, "grad_norm": 0.028833214193582535, "grad_norm_var": 2.175967415714167e-06, "learning_rate": 0.00793873821435033, "loss": 2.5304, "step": 8662 }, { "crossentropy": 2.6895620822906494, "epoch": 0.3140588747099768, "grad_norm": 0.030306126922369003, "grad_norm_var": 2.069914999452277e-06, "learning_rate": 0.007938268068786645, "loss": 2.6921, "step": 8663 }, { "crossentropy": 2.705500602722168, "epoch": 0.3140951276102088, "grad_norm": 0.030667241662740707, "grad_norm_var": 2.0331940655454594e-06, "learning_rate": 0.00793779788353719, "loss": 2.7301, "step": 8664 }, { "crossentropy": 2.6904499530792236, "epoch": 0.3141313805104408, "grad_norm": 0.03449520468711853, "grad_norm_var": 3.307593637705492e-06, "learning_rate": 0.007937327658608315, "loss": 2.6779, "step": 8665 }, { "crossentropy": 2.4285888671875, "epoch": 0.31416763341067283, "grad_norm": 0.037324439734220505, "grad_norm_var": 6.340692020507556e-06, "learning_rate": 0.007936857394006376, "loss": 2.4711, "step": 8666 }, { "crossentropy": 2.711038827896118, "epoch": 0.31420388631090485, "grad_norm": 0.03579051047563553, "grad_norm_var": 7.893810976618273e-06, "learning_rate": 0.007936387089737717, "loss": 2.7001, "step": 8667 }, { "crossentropy": 2.668818235397339, "epoch": 0.3142401392111369, "grad_norm": 0.03203487768769264, "grad_norm_var": 7.786344351294085e-06, "learning_rate": 0.007935916745808697, "loss": 2.6374, "step": 8668 }, { "crossentropy": 2.6143529415130615, "epoch": 0.31427639211136893, "grad_norm": 0.02826848439872265, "grad_norm_var": 7.576842736798266e-06, "learning_rate": 0.007935446362225665, "loss": 2.582, "step": 8669 }, { "crossentropy": 2.670621395111084, "epoch": 0.31431264501160094, "grad_norm": 0.02966899611055851, "grad_norm_var": 7.3115432620493005e-06, "learning_rate": 0.007934975938994978, "loss": 2.7275, "step": 8670 }, { "crossentropy": 2.5299739837646484, "epoch": 0.31434889791183296, "grad_norm": 0.03370792791247368, "grad_norm_var": 7.4997859319568165e-06, "learning_rate": 0.007934505476122983, "loss": 2.5751, "step": 8671 }, { "crossentropy": 2.63728404045105, "epoch": 0.314385150812065, "grad_norm": 0.03482578322291374, "grad_norm_var": 8.313948943095108e-06, "learning_rate": 0.00793403497361604, "loss": 2.649, "step": 8672 }, { "crossentropy": 2.6766674518585205, "epoch": 0.314421403712297, "grad_norm": 0.03341277316212654, "grad_norm_var": 8.418408738259974e-06, "learning_rate": 0.007933564431480501, "loss": 2.6246, "step": 8673 }, { "crossentropy": 2.6031863689422607, "epoch": 0.314457656612529, "grad_norm": 0.031937289983034134, "grad_norm_var": 8.184491853574491e-06, "learning_rate": 0.007933093849722723, "loss": 2.6226, "step": 8674 }, { "crossentropy": 2.6186890602111816, "epoch": 0.314493909512761, "grad_norm": 0.030924523249268532, "grad_norm_var": 7.791884567992454e-06, "learning_rate": 0.007932623228349063, "loss": 2.5968, "step": 8675 }, { "crossentropy": 2.6136229038238525, "epoch": 0.31453016241299303, "grad_norm": 0.032843317836523056, "grad_norm_var": 7.498143028832292e-06, "learning_rate": 0.007932152567365873, "loss": 2.5857, "step": 8676 }, { "crossentropy": 2.6343941688537598, "epoch": 0.31456641531322505, "grad_norm": 0.03201378136873245, "grad_norm_var": 6.409991518818353e-06, "learning_rate": 0.007931681866779516, "loss": 2.6265, "step": 8677 }, { "crossentropy": 2.766730546951294, "epoch": 0.31460266821345706, "grad_norm": 0.032862208783626556, "grad_norm_var": 5.55364194438078e-06, "learning_rate": 0.007931211126596345, "loss": 2.7298, "step": 8678 }, { "crossentropy": 2.579155683517456, "epoch": 0.3146389211136891, "grad_norm": 0.03159370645880699, "grad_norm_var": 5.2689944838914576e-06, "learning_rate": 0.00793074034682272, "loss": 2.5377, "step": 8679 }, { "crossentropy": 2.595963716506958, "epoch": 0.3146751740139211, "grad_norm": 0.02775472216308117, "grad_norm_var": 6.56844166884533e-06, "learning_rate": 0.007930269527464997, "loss": 2.5836, "step": 8680 }, { "crossentropy": 2.601208209991455, "epoch": 0.31471142691415316, "grad_norm": 0.02848973125219345, "grad_norm_var": 7.1978314496870996e-06, "learning_rate": 0.007929798668529539, "loss": 2.5712, "step": 8681 }, { "crossentropy": 2.7058427333831787, "epoch": 0.3147476798143852, "grad_norm": 0.02870398387312889, "grad_norm_var": 5.8268526124197655e-06, "learning_rate": 0.007929327770022703, "loss": 2.6209, "step": 8682 }, { "crossentropy": 2.589033603668213, "epoch": 0.3147839327146172, "grad_norm": 0.028828231617808342, "grad_norm_var": 4.921845812380166e-06, "learning_rate": 0.007928856831950852, "loss": 2.5559, "step": 8683 }, { "crossentropy": 2.726139783859253, "epoch": 0.3148201856148492, "grad_norm": 0.02897607907652855, "grad_norm_var": 5.132221968367221e-06, "learning_rate": 0.007928385854320343, "loss": 2.6322, "step": 8684 }, { "crossentropy": 2.589496612548828, "epoch": 0.3148564385150812, "grad_norm": 0.029220297932624817, "grad_norm_var": 4.851617910973244e-06, "learning_rate": 0.00792791483713754, "loss": 2.5765, "step": 8685 }, { "crossentropy": 2.556946039199829, "epoch": 0.31489269141531323, "grad_norm": 0.031249677762389183, "grad_norm_var": 4.7303756259159734e-06, "learning_rate": 0.007927443780408803, "loss": 2.6036, "step": 8686 }, { "crossentropy": 2.5923807621002197, "epoch": 0.31492894431554525, "grad_norm": 0.028838664293289185, "grad_norm_var": 4.508688854966645e-06, "learning_rate": 0.007926972684140495, "loss": 2.622, "step": 8687 }, { "crossentropy": 2.5689196586608887, "epoch": 0.31496519721577726, "grad_norm": 0.030731799080967903, "grad_norm_var": 3.347604982096884e-06, "learning_rate": 0.007926501548338979, "loss": 2.5439, "step": 8688 }, { "crossentropy": 2.737391233444214, "epoch": 0.3150014501160093, "grad_norm": 0.034117333590984344, "grad_norm_var": 3.65002454655292e-06, "learning_rate": 0.00792603037301062, "loss": 2.7076, "step": 8689 }, { "crossentropy": 2.657360553741455, "epoch": 0.3150377030162413, "grad_norm": 0.03363344445824623, "grad_norm_var": 4.139541088249418e-06, "learning_rate": 0.00792555915816178, "loss": 2.678, "step": 8690 }, { "crossentropy": 2.7274508476257324, "epoch": 0.3150739559164733, "grad_norm": 0.03626817837357521, "grad_norm_var": 6.102812566224855e-06, "learning_rate": 0.007925087903798821, "loss": 2.7563, "step": 8691 }, { "crossentropy": 2.549093246459961, "epoch": 0.3151102088167053, "grad_norm": 0.04027498885989189, "grad_norm_var": 1.1373444083154621e-05, "learning_rate": 0.007924616609928114, "loss": 2.5713, "step": 8692 }, { "crossentropy": 2.5559263229370117, "epoch": 0.31514646171693733, "grad_norm": 0.031164539977908134, "grad_norm_var": 1.1357206852989617e-05, "learning_rate": 0.00792414527655602, "loss": 2.6119, "step": 8693 }, { "crossentropy": 2.7884132862091064, "epoch": 0.31518271461716935, "grad_norm": 0.03018013760447502, "grad_norm_var": 1.129077599222512e-05, "learning_rate": 0.007923673903688908, "loss": 2.6927, "step": 8694 }, { "crossentropy": 2.7109789848327637, "epoch": 0.3152189675174014, "grad_norm": 0.03026186302304268, "grad_norm_var": 1.1340887032160865e-05, "learning_rate": 0.007923202491333142, "loss": 2.6527, "step": 8695 }, { "crossentropy": 2.621086359024048, "epoch": 0.31525522041763343, "grad_norm": 0.0330360010266304, "grad_norm_var": 1.0680351901759302e-05, "learning_rate": 0.00792273103949509, "loss": 2.5912, "step": 8696 }, { "crossentropy": 2.7277865409851074, "epoch": 0.31529147331786544, "grad_norm": 0.033292461186647415, "grad_norm_var": 1.0195325115788742e-05, "learning_rate": 0.00792225954818112, "loss": 2.5814, "step": 8697 }, { "crossentropy": 2.680302858352661, "epoch": 0.31532772621809746, "grad_norm": 0.03190874308347702, "grad_norm_var": 9.514894761288491e-06, "learning_rate": 0.0079217880173976, "loss": 2.6506, "step": 8698 }, { "crossentropy": 2.7776970863342285, "epoch": 0.3153639791183295, "grad_norm": 0.02999291755259037, "grad_norm_var": 9.107297480958655e-06, "learning_rate": 0.007921316447150899, "loss": 2.6925, "step": 8699 }, { "crossentropy": 2.5825586318969727, "epoch": 0.3154002320185615, "grad_norm": 0.031764887273311615, "grad_norm_var": 8.44231081019789e-06, "learning_rate": 0.007920844837447387, "loss": 2.6177, "step": 8700 }, { "crossentropy": 2.6873793601989746, "epoch": 0.3154364849187935, "grad_norm": 0.032195817679166794, "grad_norm_var": 7.795264946931259e-06, "learning_rate": 0.007920373188293433, "loss": 2.6215, "step": 8701 }, { "crossentropy": 2.6750729084014893, "epoch": 0.3154727378190255, "grad_norm": 0.02888418734073639, "grad_norm_var": 8.51787868246059e-06, "learning_rate": 0.007919901499695405, "loss": 2.6311, "step": 8702 }, { "crossentropy": 2.5563302040100098, "epoch": 0.31550899071925753, "grad_norm": 0.031705569475889206, "grad_norm_var": 7.714534894865408e-06, "learning_rate": 0.00791942977165968, "loss": 2.7186, "step": 8703 }, { "crossentropy": 2.7095532417297363, "epoch": 0.31554524361948955, "grad_norm": 0.02963096648454666, "grad_norm_var": 8.04442074555517e-06, "learning_rate": 0.007918958004192622, "loss": 2.6393, "step": 8704 }, { "crossentropy": 2.6392130851745605, "epoch": 0.31558149651972156, "grad_norm": 0.15979985892772675, "grad_norm_var": 0.0010241711323015345, "learning_rate": 0.007918486197300607, "loss": 2.7267, "step": 8705 }, { "crossentropy": 2.6228535175323486, "epoch": 0.3156177494199536, "grad_norm": 0.034761589020490646, "grad_norm_var": 0.0010232554703194234, "learning_rate": 0.007918014350990009, "loss": 2.6008, "step": 8706 }, { "crossentropy": 2.597949981689453, "epoch": 0.3156540023201856, "grad_norm": 0.03971832990646362, "grad_norm_var": 0.001022135444280188, "learning_rate": 0.007917542465267196, "loss": 2.6045, "step": 8707 }, { "crossentropy": 2.648588180541992, "epoch": 0.31569025522041766, "grad_norm": 0.04488073289394379, "grad_norm_var": 0.001023301083005868, "learning_rate": 0.007917070540138547, "loss": 2.5606, "step": 8708 }, { "crossentropy": 2.72747540473938, "epoch": 0.3157265081206497, "grad_norm": 0.051202017813920975, "grad_norm_var": 0.0010225889378454873, "learning_rate": 0.007916598575610433, "loss": 2.6711, "step": 8709 }, { "crossentropy": 2.6752471923828125, "epoch": 0.3157627610208817, "grad_norm": 0.045009709894657135, "grad_norm_var": 0.0010128122844707819, "learning_rate": 0.007916126571689228, "loss": 2.7086, "step": 8710 }, { "crossentropy": 2.625598669052124, "epoch": 0.3157990139211137, "grad_norm": 0.06587345153093338, "grad_norm_var": 0.0010315769413984273, "learning_rate": 0.007915654528381309, "loss": 2.6387, "step": 8711 }, { "crossentropy": 2.713836669921875, "epoch": 0.3158352668213457, "grad_norm": 0.04330093786120415, "grad_norm_var": 0.0010214750290053892, "learning_rate": 0.007915182445693052, "loss": 2.7242, "step": 8712 }, { "crossentropy": 2.5597124099731445, "epoch": 0.31587151972157773, "grad_norm": 0.035485029220581055, "grad_norm_var": 0.0010180985009334057, "learning_rate": 0.007914710323630832, "loss": 2.6368, "step": 8713 }, { "crossentropy": 2.6132452487945557, "epoch": 0.31590777262180975, "grad_norm": 0.030571913346648216, "grad_norm_var": 0.001020723155344284, "learning_rate": 0.007914238162201024, "loss": 2.6285, "step": 8714 }, { "crossentropy": 2.72611927986145, "epoch": 0.31594402552204176, "grad_norm": 0.03229985386133194, "grad_norm_var": 0.0010161556288260602, "learning_rate": 0.007913765961410007, "loss": 2.7087, "step": 8715 }, { "crossentropy": 2.669694662094116, "epoch": 0.3159802784222738, "grad_norm": 0.03198591247200966, "grad_norm_var": 0.0010157371747709738, "learning_rate": 0.007913293721264161, "loss": 2.689, "step": 8716 }, { "crossentropy": 2.719055414199829, "epoch": 0.3160165313225058, "grad_norm": 0.03253045305609703, "grad_norm_var": 0.001015124616265252, "learning_rate": 0.00791282144176986, "loss": 2.7391, "step": 8717 }, { "crossentropy": 2.676187515258789, "epoch": 0.3160527842227378, "grad_norm": 0.04031536355614662, "grad_norm_var": 0.0009970481454223226, "learning_rate": 0.007912349122933487, "loss": 2.6495, "step": 8718 }, { "crossentropy": 2.5678515434265137, "epoch": 0.3160890371229698, "grad_norm": 0.032776832580566406, "grad_norm_var": 0.000994961431088515, "learning_rate": 0.007911876764761419, "loss": 2.5693, "step": 8719 }, { "crossentropy": 2.577282190322876, "epoch": 0.31612529002320183, "grad_norm": 0.03122715651988983, "grad_norm_var": 0.0009914488012469581, "learning_rate": 0.007911404367260035, "loss": 2.6402, "step": 8720 }, { "crossentropy": 2.588193416595459, "epoch": 0.31616154292343385, "grad_norm": 0.0324288047850132, "grad_norm_var": 8.947525150357718e-05, "learning_rate": 0.00791093193043572, "loss": 2.5418, "step": 8721 }, { "crossentropy": 2.6597514152526855, "epoch": 0.3161977958236659, "grad_norm": 0.028880108147859573, "grad_norm_var": 9.497903203297465e-05, "learning_rate": 0.007910459454294846, "loss": 2.6227, "step": 8722 }, { "crossentropy": 2.629533052444458, "epoch": 0.31623404872389793, "grad_norm": 0.02899845317006111, "grad_norm_var": 0.00010064202171554958, "learning_rate": 0.007909986938843806, "loss": 2.7036, "step": 8723 }, { "crossentropy": 2.763585329055786, "epoch": 0.31627030162412995, "grad_norm": 0.09291939437389374, "grad_norm_var": 0.0002890396266482298, "learning_rate": 0.007909514384088974, "loss": 2.7465, "step": 8724 }, { "crossentropy": 2.765284538269043, "epoch": 0.31630655452436196, "grad_norm": 0.03576352447271347, "grad_norm_var": 0.0002829107772989711, "learning_rate": 0.007909041790036734, "loss": 2.7311, "step": 8725 }, { "crossentropy": 2.5960514545440674, "epoch": 0.316342807424594, "grad_norm": 0.03451916202902794, "grad_norm_var": 0.00028281379651369195, "learning_rate": 0.007908569156693473, "loss": 2.6392, "step": 8726 }, { "crossentropy": 2.6111912727355957, "epoch": 0.316379060324826, "grad_norm": 0.0348505862057209, "grad_norm_var": 0.00023332524701279443, "learning_rate": 0.007908096484065569, "loss": 2.6296, "step": 8727 }, { "crossentropy": 2.594498634338379, "epoch": 0.316415313225058, "grad_norm": 0.029206497594714165, "grad_norm_var": 0.0002347049496420061, "learning_rate": 0.007907623772159408, "loss": 2.5716, "step": 8728 }, { "crossentropy": 2.5591821670532227, "epoch": 0.31645156612529, "grad_norm": 0.05900411307811737, "grad_norm_var": 0.0002659450647504682, "learning_rate": 0.00790715102098138, "loss": 2.7105, "step": 8729 }, { "crossentropy": 2.673295021057129, "epoch": 0.31648781902552203, "grad_norm": 0.03674249351024628, "grad_norm_var": 0.00026219910276152706, "learning_rate": 0.007906678230537862, "loss": 2.7285, "step": 8730 }, { "crossentropy": 2.8541626930236816, "epoch": 0.31652407192575405, "grad_norm": 0.047846339643001556, "grad_norm_var": 0.0002646538404916925, "learning_rate": 0.007906205400835243, "loss": 2.7761, "step": 8731 }, { "crossentropy": 2.6366019248962402, "epoch": 0.31656032482598606, "grad_norm": 0.0410834401845932, "grad_norm_var": 0.00026086402743793883, "learning_rate": 0.007905732531879912, "loss": 2.6446, "step": 8732 }, { "crossentropy": 2.564862012863159, "epoch": 0.3165965777262181, "grad_norm": 0.02965100295841694, "grad_norm_var": 0.00026422821735636823, "learning_rate": 0.007905259623678253, "loss": 2.5779, "step": 8733 }, { "crossentropy": 2.6583540439605713, "epoch": 0.3166328306264501, "grad_norm": 0.030064746737480164, "grad_norm_var": 0.00027004092205419876, "learning_rate": 0.007904786676236652, "loss": 2.6517, "step": 8734 }, { "crossentropy": 2.6515085697174072, "epoch": 0.31666908352668216, "grad_norm": 0.030829397961497307, "grad_norm_var": 0.00027192569949549306, "learning_rate": 0.0079043136895615, "loss": 2.617, "step": 8735 }, { "crossentropy": 2.65761661529541, "epoch": 0.3167053364269142, "grad_norm": 0.03050801530480385, "grad_norm_var": 0.0002727034164175743, "learning_rate": 0.007903840663659185, "loss": 2.5771, "step": 8736 }, { "crossentropy": 2.6598517894744873, "epoch": 0.3167415893271462, "grad_norm": 0.04296189174056053, "grad_norm_var": 0.00027047066079583643, "learning_rate": 0.007903367598536095, "loss": 2.6584, "step": 8737 }, { "crossentropy": 2.654428005218506, "epoch": 0.3167778422273782, "grad_norm": 0.029331697151064873, "grad_norm_var": 0.00026983707947037807, "learning_rate": 0.007902894494198618, "loss": 2.659, "step": 8738 }, { "crossentropy": 2.593158721923828, "epoch": 0.3168140951276102, "grad_norm": 0.030097438022494316, "grad_norm_var": 0.0002683528718627036, "learning_rate": 0.007902421350653147, "loss": 2.622, "step": 8739 }, { "crossentropy": 2.408036231994629, "epoch": 0.31685034802784223, "grad_norm": 0.030380021780729294, "grad_norm_var": 6.912052833841646e-05, "learning_rate": 0.00790194816790607, "loss": 2.5497, "step": 8740 }, { "crossentropy": 2.7398934364318848, "epoch": 0.31688660092807425, "grad_norm": 0.031147992238402367, "grad_norm_var": 7.047597432702403e-05, "learning_rate": 0.00790147494596378, "loss": 2.6995, "step": 8741 }, { "crossentropy": 2.678647994995117, "epoch": 0.31692285382830626, "grad_norm": 0.09179721772670746, "grad_norm_var": 0.00026792640295945013, "learning_rate": 0.007901001684832668, "loss": 2.7815, "step": 8742 }, { "crossentropy": 2.673614740371704, "epoch": 0.3169591067285383, "grad_norm": 0.02927246503531933, "grad_norm_var": 0.00027302710302702556, "learning_rate": 0.007900528384519125, "loss": 2.6802, "step": 8743 }, { "crossentropy": 2.676295518875122, "epoch": 0.3169953596287703, "grad_norm": 0.030557110905647278, "grad_norm_var": 0.0002714233486603628, "learning_rate": 0.007900055045029544, "loss": 2.6337, "step": 8744 }, { "crossentropy": 2.5879852771759033, "epoch": 0.3170316125290023, "grad_norm": 0.0335916243493557, "grad_norm_var": 0.00024342794308931706, "learning_rate": 0.00789958166637032, "loss": 2.6306, "step": 8745 }, { "crossentropy": 2.869123697280884, "epoch": 0.3170678654292343, "grad_norm": 0.03527885302901268, "grad_norm_var": 0.0002436592019773318, "learning_rate": 0.007899108248547846, "loss": 2.7236, "step": 8746 }, { "crossentropy": 2.733050584793091, "epoch": 0.31710411832946633, "grad_norm": 0.03169785439968109, "grad_norm_var": 0.00023692682578509654, "learning_rate": 0.007898634791568514, "loss": 2.6568, "step": 8747 }, { "crossentropy": 2.642890691757202, "epoch": 0.3171403712296984, "grad_norm": 0.030409686267375946, "grad_norm_var": 0.00023701300575884962, "learning_rate": 0.00789816129543872, "loss": 2.7046, "step": 8748 }, { "crossentropy": 2.4903504848480225, "epoch": 0.3171766241299304, "grad_norm": 0.029518943279981613, "grad_norm_var": 0.00023711661913970784, "learning_rate": 0.007897687760164859, "loss": 2.5784, "step": 8749 }, { "crossentropy": 2.3742117881774902, "epoch": 0.31721287703016243, "grad_norm": 0.030044525861740112, "grad_norm_var": 0.00023713120524341486, "learning_rate": 0.007897214185753329, "loss": 2.5301, "step": 8750 }, { "crossentropy": 2.717327356338501, "epoch": 0.31724912993039445, "grad_norm": 0.03371352329850197, "grad_norm_var": 0.00023586883733965516, "learning_rate": 0.007896740572210525, "loss": 2.6757, "step": 8751 }, { "crossentropy": 2.7227091789245605, "epoch": 0.31728538283062646, "grad_norm": 0.0342240147292614, "grad_norm_var": 0.00023418701873936977, "learning_rate": 0.007896266919542842, "loss": 2.5965, "step": 8752 }, { "crossentropy": 2.4906718730926514, "epoch": 0.3173216357308585, "grad_norm": 0.03134358301758766, "grad_norm_var": 0.00023164763476297463, "learning_rate": 0.00789579322775668, "loss": 2.5938, "step": 8753 }, { "crossentropy": 2.8367085456848145, "epoch": 0.3173578886310905, "grad_norm": 0.03399009257555008, "grad_norm_var": 0.00022938981022951205, "learning_rate": 0.007895319496858436, "loss": 2.7601, "step": 8754 }, { "crossentropy": 2.662877321243286, "epoch": 0.3173941415313225, "grad_norm": 0.0345180481672287, "grad_norm_var": 0.00022746126887729669, "learning_rate": 0.007894845726854508, "loss": 2.5773, "step": 8755 }, { "crossentropy": 2.6471164226531982, "epoch": 0.3174303944315545, "grad_norm": 0.030134061351418495, "grad_norm_var": 0.0002276401024244873, "learning_rate": 0.007894371917751296, "loss": 2.5884, "step": 8756 }, { "crossentropy": 2.7289414405822754, "epoch": 0.31746664733178653, "grad_norm": 0.02850811928510666, "grad_norm_var": 0.0002296787611983544, "learning_rate": 0.007893898069555198, "loss": 2.7229, "step": 8757 }, { "crossentropy": 2.788078784942627, "epoch": 0.31750290023201855, "grad_norm": 0.030307918787002563, "grad_norm_var": 4.737573618329947e-06, "learning_rate": 0.007893424182272615, "loss": 2.7226, "step": 8758 }, { "crossentropy": 2.5585670471191406, "epoch": 0.31753915313225056, "grad_norm": 0.03103778138756752, "grad_norm_var": 4.362280437940333e-06, "learning_rate": 0.007892950255909949, "loss": 2.5603, "step": 8759 }, { "crossentropy": 2.542109251022339, "epoch": 0.3175754060324826, "grad_norm": 0.030974527820944786, "grad_norm_var": 4.303733056720769e-06, "learning_rate": 0.007892476290473597, "loss": 2.6042, "step": 8760 }, { "crossentropy": 2.671116590499878, "epoch": 0.3176116589327146, "grad_norm": 0.030391188338398933, "grad_norm_var": 4.192529589593198e-06, "learning_rate": 0.007892002285969965, "loss": 2.6699, "step": 8761 }, { "crossentropy": 2.525705575942993, "epoch": 0.31764791183294666, "grad_norm": 0.028509413823485374, "grad_norm_var": 3.763903770895247e-06, "learning_rate": 0.007891528242405452, "loss": 2.611, "step": 8762 }, { "crossentropy": 2.6514787673950195, "epoch": 0.3176841647331787, "grad_norm": 0.0291362963616848, "grad_norm_var": 4.0065963945309e-06, "learning_rate": 0.007891054159786465, "loss": 2.5794, "step": 8763 }, { "crossentropy": 2.8342249393463135, "epoch": 0.3177204176334107, "grad_norm": 0.0863616019487381, "grad_norm_var": 0.00019491109330495836, "learning_rate": 0.007890580038119401, "loss": 2.6809, "step": 8764 }, { "crossentropy": 2.486415147781372, "epoch": 0.3177566705336427, "grad_norm": 0.029941381886601448, "grad_norm_var": 0.00019463917570626055, "learning_rate": 0.00789010587741067, "loss": 2.6514, "step": 8765 }, { "crossentropy": 2.713472843170166, "epoch": 0.3177929234338747, "grad_norm": 0.032955266535282135, "grad_norm_var": 0.00019341198051876101, "learning_rate": 0.00788963167766667, "loss": 2.6464, "step": 8766 }, { "crossentropy": 2.8127708435058594, "epoch": 0.31782917633410673, "grad_norm": 0.033450718969106674, "grad_norm_var": 0.00019345271842889988, "learning_rate": 0.007889157438893811, "loss": 2.7901, "step": 8767 }, { "crossentropy": 2.7641892433166504, "epoch": 0.31786542923433875, "grad_norm": 0.03380683436989784, "grad_norm_var": 0.00019349210245201546, "learning_rate": 0.007888683161098496, "loss": 2.7336, "step": 8768 }, { "crossentropy": 2.724475145339966, "epoch": 0.31790168213457076, "grad_norm": 0.029560379683971405, "grad_norm_var": 0.0001944913431958562, "learning_rate": 0.00788820884428713, "loss": 2.6919, "step": 8769 }, { "crossentropy": 2.7546098232269287, "epoch": 0.3179379350348028, "grad_norm": 0.030495695769786835, "grad_norm_var": 0.00019553820991171378, "learning_rate": 0.007887734488466122, "loss": 2.7096, "step": 8770 }, { "crossentropy": 2.706954002380371, "epoch": 0.3179741879350348, "grad_norm": 0.029903331771492958, "grad_norm_var": 0.00019678460007739406, "learning_rate": 0.007887260093641877, "loss": 2.6335, "step": 8771 }, { "crossentropy": 2.650049924850464, "epoch": 0.3180104408352668, "grad_norm": 0.028173008933663368, "grad_norm_var": 0.00019805989582734692, "learning_rate": 0.007886785659820802, "loss": 2.6743, "step": 8772 }, { "crossentropy": 2.7710483074188232, "epoch": 0.3180466937354988, "grad_norm": 0.030462823808193207, "grad_norm_var": 0.00019687529156873026, "learning_rate": 0.007886311187009307, "loss": 2.7036, "step": 8773 }, { "crossentropy": 2.919424295425415, "epoch": 0.31808294663573083, "grad_norm": 0.033918533474206924, "grad_norm_var": 0.00019586847589214348, "learning_rate": 0.0078858366752138, "loss": 2.7982, "step": 8774 }, { "crossentropy": 2.724201202392578, "epoch": 0.3181191995359629, "grad_norm": 0.03199974447488785, "grad_norm_var": 0.0001955056590120319, "learning_rate": 0.007885362124440689, "loss": 2.7413, "step": 8775 }, { "crossentropy": 2.7690539360046387, "epoch": 0.3181554524361949, "grad_norm": 0.03291408345103264, "grad_norm_var": 0.00019486073032872983, "learning_rate": 0.007884887534696383, "loss": 2.709, "step": 8776 }, { "crossentropy": 2.753107786178589, "epoch": 0.31819170533642693, "grad_norm": 0.03459789603948593, "grad_norm_var": 0.00019366283587095371, "learning_rate": 0.007884412905987292, "loss": 2.7331, "step": 8777 }, { "crossentropy": 2.690434455871582, "epoch": 0.31822795823665895, "grad_norm": 0.031307585537433624, "grad_norm_var": 0.0001918195378696219, "learning_rate": 0.00788393823831983, "loss": 2.7039, "step": 8778 }, { "crossentropy": 2.5354342460632324, "epoch": 0.31826421113689096, "grad_norm": 0.028878776356577873, "grad_norm_var": 0.0001920228409890122, "learning_rate": 0.007883463531700403, "loss": 2.529, "step": 8779 }, { "crossentropy": 2.6204235553741455, "epoch": 0.318300464037123, "grad_norm": 0.03022468276321888, "grad_norm_var": 3.949624932222705e-06, "learning_rate": 0.007882988786135425, "loss": 2.629, "step": 8780 }, { "crossentropy": 2.635347843170166, "epoch": 0.318336716937355, "grad_norm": 0.028626568615436554, "grad_norm_var": 4.315468801359817e-06, "learning_rate": 0.007882514001631312, "loss": 2.6077, "step": 8781 }, { "crossentropy": 2.605018138885498, "epoch": 0.318372969837587, "grad_norm": 0.030218223109841347, "grad_norm_var": 4.1904655574813056e-06, "learning_rate": 0.007882039178194468, "loss": 2.5019, "step": 8782 }, { "crossentropy": 2.6286611557006836, "epoch": 0.318409222737819, "grad_norm": 0.030178489163517952, "grad_norm_var": 3.859673314940357e-06, "learning_rate": 0.007881564315831315, "loss": 2.6247, "step": 8783 }, { "crossentropy": 2.586345672607422, "epoch": 0.31844547563805103, "grad_norm": 0.028581243008375168, "grad_norm_var": 3.5787647977513746e-06, "learning_rate": 0.007881089414548261, "loss": 2.5235, "step": 8784 }, { "crossentropy": 2.5470030307769775, "epoch": 0.31848172853828305, "grad_norm": 0.030442701652646065, "grad_norm_var": 3.5018735538623007e-06, "learning_rate": 0.007880614474351724, "loss": 2.5338, "step": 8785 }, { "crossentropy": 2.66094708442688, "epoch": 0.31851798143851506, "grad_norm": 0.03437032550573349, "grad_norm_var": 4.343555090142526e-06, "learning_rate": 0.007880139495248115, "loss": 2.6265, "step": 8786 }, { "crossentropy": 2.6549596786499023, "epoch": 0.3185542343387471, "grad_norm": 0.033544424921274185, "grad_norm_var": 4.67621413842087e-06, "learning_rate": 0.007879664477243854, "loss": 2.6369, "step": 8787 }, { "crossentropy": 2.7821691036224365, "epoch": 0.3185904872389791, "grad_norm": 0.02866574190557003, "grad_norm_var": 4.495646102473973e-06, "learning_rate": 0.007879189420345352, "loss": 2.7016, "step": 8788 }, { "crossentropy": 2.7254703044891357, "epoch": 0.31862674013921116, "grad_norm": 0.03112197294831276, "grad_norm_var": 4.459486035863121e-06, "learning_rate": 0.00787871432455903, "loss": 2.7552, "step": 8789 }, { "crossentropy": 2.673804998397827, "epoch": 0.3186629930394432, "grad_norm": 0.03195764869451523, "grad_norm_var": 3.995427834799168e-06, "learning_rate": 0.0078782391898913, "loss": 2.6718, "step": 8790 }, { "crossentropy": 2.601024866104126, "epoch": 0.3186992459396752, "grad_norm": 0.03449955955147743, "grad_norm_var": 4.6852604821415175e-06, "learning_rate": 0.007877764016348581, "loss": 2.6578, "step": 8791 }, { "crossentropy": 2.51977801322937, "epoch": 0.3187354988399072, "grad_norm": 0.02905472368001938, "grad_norm_var": 4.764048870951829e-06, "learning_rate": 0.007877288803937293, "loss": 2.5089, "step": 8792 }, { "crossentropy": 2.699838399887085, "epoch": 0.3187717517401392, "grad_norm": 0.03187864273786545, "grad_norm_var": 3.927847358482726e-06, "learning_rate": 0.007876813552663854, "loss": 2.6418, "step": 8793 }, { "crossentropy": 2.7788004875183105, "epoch": 0.31880800464037123, "grad_norm": 0.02881360985338688, "grad_norm_var": 4.163419140045194e-06, "learning_rate": 0.007876338262534682, "loss": 2.6697, "step": 8794 }, { "crossentropy": 2.7812347412109375, "epoch": 0.31884425754060325, "grad_norm": 0.03332487866282463, "grad_norm_var": 4.324547908166922e-06, "learning_rate": 0.007875862933556197, "loss": 2.7353, "step": 8795 }, { "crossentropy": 2.6899991035461426, "epoch": 0.31888051044083526, "grad_norm": 0.030532240867614746, "grad_norm_var": 4.299938577808137e-06, "learning_rate": 0.007875387565734818, "loss": 2.7225, "step": 8796 }, { "crossentropy": 2.8462018966674805, "epoch": 0.3189167633410673, "grad_norm": 0.028890252113342285, "grad_norm_var": 4.221254826993606e-06, "learning_rate": 0.007874912159076966, "loss": 2.7685, "step": 8797 }, { "crossentropy": 2.4631409645080566, "epoch": 0.3189530162412993, "grad_norm": 0.028725728392601013, "grad_norm_var": 4.516977966556359e-06, "learning_rate": 0.007874436713589063, "loss": 2.384, "step": 8798 }, { "crossentropy": 2.7021844387054443, "epoch": 0.3189892691415313, "grad_norm": 0.029126258566975594, "grad_norm_var": 4.6890008905451275e-06, "learning_rate": 0.007873961229277531, "loss": 2.6878, "step": 8799 }, { "crossentropy": 2.7918636798858643, "epoch": 0.3190255220417633, "grad_norm": 0.02802281826734543, "grad_norm_var": 4.877088816050643e-06, "learning_rate": 0.00787348570614879, "loss": 2.7024, "step": 8800 }, { "crossentropy": 2.479450225830078, "epoch": 0.31906177494199534, "grad_norm": 0.030688349157571793, "grad_norm_var": 4.868806512385683e-06, "learning_rate": 0.007873010144209266, "loss": 2.5964, "step": 8801 }, { "crossentropy": 2.642751455307007, "epoch": 0.3190980278422274, "grad_norm": 0.030376438051462173, "grad_norm_var": 3.978373419548823e-06, "learning_rate": 0.007872534543465377, "loss": 2.5915, "step": 8802 }, { "crossentropy": 2.639940023422241, "epoch": 0.3191342807424594, "grad_norm": 0.059268370270729065, "grad_norm_var": 5.5515676370024055e-05, "learning_rate": 0.00787205890392355, "loss": 2.6587, "step": 8803 }, { "crossentropy": 2.7293272018432617, "epoch": 0.31917053364269143, "grad_norm": 0.02891203574836254, "grad_norm_var": 5.540392431657031e-05, "learning_rate": 0.007871583225590211, "loss": 2.6719, "step": 8804 }, { "crossentropy": 2.634336471557617, "epoch": 0.31920678654292345, "grad_norm": 0.032829418778419495, "grad_norm_var": 5.5340804075715734e-05, "learning_rate": 0.007871107508471782, "loss": 2.6538, "step": 8805 }, { "crossentropy": 2.688260555267334, "epoch": 0.31924303944315546, "grad_norm": 0.035675048828125, "grad_norm_var": 5.6031680000882e-05, "learning_rate": 0.007870631752574688, "loss": 2.7059, "step": 8806 }, { "crossentropy": 2.4655487537384033, "epoch": 0.3192792923433875, "grad_norm": 0.03223135694861412, "grad_norm_var": 5.576019392117732e-05, "learning_rate": 0.007870155957905358, "loss": 2.6245, "step": 8807 }, { "crossentropy": 2.760721206665039, "epoch": 0.3193155452436195, "grad_norm": 0.03422745317220688, "grad_norm_var": 5.5127434576523085e-05, "learning_rate": 0.007869680124470214, "loss": 2.7346, "step": 8808 }, { "crossentropy": 2.7333757877349854, "epoch": 0.3193517981438515, "grad_norm": 0.0367719866335392, "grad_norm_var": 5.6074927152406396e-05, "learning_rate": 0.007869204252275686, "loss": 2.6951, "step": 8809 }, { "crossentropy": 2.5464165210723877, "epoch": 0.3193880510440835, "grad_norm": 0.03190375491976738, "grad_norm_var": 5.493614692702274e-05, "learning_rate": 0.0078687283413282, "loss": 2.5986, "step": 8810 }, { "crossentropy": 2.58978271484375, "epoch": 0.31942430394431554, "grad_norm": 0.029870707541704178, "grad_norm_var": 5.563315876884768e-05, "learning_rate": 0.007868252391634186, "loss": 2.6296, "step": 8811 }, { "crossentropy": 2.802919864654541, "epoch": 0.31946055684454755, "grad_norm": 0.02917367033660412, "grad_norm_var": 5.619612371637477e-05, "learning_rate": 0.007867776403200069, "loss": 2.7616, "step": 8812 }, { "crossentropy": 2.6525609493255615, "epoch": 0.31949680974477956, "grad_norm": 0.029043791815638542, "grad_norm_var": 5.611513399777403e-05, "learning_rate": 0.00786730037603228, "loss": 2.6369, "step": 8813 }, { "crossentropy": 2.5514731407165527, "epoch": 0.3195330626450116, "grad_norm": 0.029568081721663475, "grad_norm_var": 5.5687514150305864e-05, "learning_rate": 0.007866824310137249, "loss": 2.5772, "step": 8814 }, { "crossentropy": 2.6579809188842773, "epoch": 0.3195693155452436, "grad_norm": 0.02774910070002079, "grad_norm_var": 5.651378692419748e-05, "learning_rate": 0.007866348205521402, "loss": 2.629, "step": 8815 }, { "crossentropy": 2.638625144958496, "epoch": 0.31960556844547566, "grad_norm": 0.037417273968458176, "grad_norm_var": 5.592750455506386e-05, "learning_rate": 0.007865872062191175, "loss": 2.777, "step": 8816 }, { "crossentropy": 3.112926721572876, "epoch": 0.3196418213457077, "grad_norm": 0.170140340924263, "grad_norm_var": 0.0012194181048700234, "learning_rate": 0.007865395880152998, "loss": 2.9063, "step": 8817 }, { "crossentropy": 2.7989389896392822, "epoch": 0.3196780742459397, "grad_norm": 0.04062042012810707, "grad_norm_var": 0.0012098309365369524, "learning_rate": 0.007864919659413298, "loss": 2.7494, "step": 8818 }, { "crossentropy": 2.592947006225586, "epoch": 0.3197143271461717, "grad_norm": 0.041255101561546326, "grad_norm_var": 0.0011906480668043753, "learning_rate": 0.007864443399978514, "loss": 2.6509, "step": 8819 }, { "crossentropy": 2.620464324951172, "epoch": 0.3197505800464037, "grad_norm": 0.05437100678682327, "grad_norm_var": 0.001187708682428531, "learning_rate": 0.007863967101855072, "loss": 2.5933, "step": 8820 }, { "crossentropy": 2.5965025424957275, "epoch": 0.31978683294663574, "grad_norm": 0.03978084772825241, "grad_norm_var": 0.0011810212862917357, "learning_rate": 0.00786349076504941, "loss": 2.6289, "step": 8821 }, { "crossentropy": 2.701090097427368, "epoch": 0.31982308584686775, "grad_norm": 0.03706955164670944, "grad_norm_var": 0.001179643745376568, "learning_rate": 0.00786301438956796, "loss": 2.6605, "step": 8822 }, { "crossentropy": 2.7117207050323486, "epoch": 0.31985933874709976, "grad_norm": 0.036035604774951935, "grad_norm_var": 0.0011746677614278313, "learning_rate": 0.007862537975417154, "loss": 2.7449, "step": 8823 }, { "crossentropy": 2.547806978225708, "epoch": 0.3198955916473318, "grad_norm": 0.033129043877124786, "grad_norm_var": 0.0011761835434934234, "learning_rate": 0.00786206152260343, "loss": 2.5541, "step": 8824 }, { "crossentropy": 2.641256332397461, "epoch": 0.3199318445475638, "grad_norm": 0.033741604536771774, "grad_norm_var": 0.0011796754619922106, "learning_rate": 0.00786158503113322, "loss": 2.6081, "step": 8825 }, { "crossentropy": 2.605441093444824, "epoch": 0.3199680974477958, "grad_norm": 0.03725583478808403, "grad_norm_var": 0.0011729733545319337, "learning_rate": 0.007861108501012964, "loss": 2.6461, "step": 8826 }, { "crossentropy": 2.7171173095703125, "epoch": 0.3200043503480278, "grad_norm": 0.036201994866132736, "grad_norm_var": 0.0011634338980169724, "learning_rate": 0.007860631932249094, "loss": 2.6548, "step": 8827 }, { "crossentropy": 2.757481813430786, "epoch": 0.32004060324825984, "grad_norm": 0.04346194490790367, "grad_norm_var": 0.0011469294542527893, "learning_rate": 0.007860155324848048, "loss": 2.7687, "step": 8828 }, { "crossentropy": 2.706983804702759, "epoch": 0.3200768561484919, "grad_norm": 0.033738039433956146, "grad_norm_var": 0.0011380520836188775, "learning_rate": 0.007859678678816266, "loss": 2.6341, "step": 8829 }, { "crossentropy": 2.6939172744750977, "epoch": 0.3201131090487239, "grad_norm": 0.03068779595196247, "grad_norm_var": 0.0011357188918928404, "learning_rate": 0.007859201994160183, "loss": 2.7331, "step": 8830 }, { "crossentropy": 2.6920933723449707, "epoch": 0.32014936194895594, "grad_norm": 0.02802826464176178, "grad_norm_var": 0.0011350522107923513, "learning_rate": 0.007858725270886237, "loss": 2.6467, "step": 8831 }, { "crossentropy": 2.7324204444885254, "epoch": 0.32018561484918795, "grad_norm": 0.031572822481393814, "grad_norm_var": 0.0011437259453269625, "learning_rate": 0.00785824850900087, "loss": 2.6432, "step": 8832 }, { "crossentropy": 2.7040436267852783, "epoch": 0.32022186774941996, "grad_norm": 0.0330338217318058, "grad_norm_var": 3.90400487887181e-05, "learning_rate": 0.007857771708510517, "loss": 2.6506, "step": 8833 }, { "crossentropy": 2.6653668880462646, "epoch": 0.320258120649652, "grad_norm": 0.03263245150446892, "grad_norm_var": 3.9037834954557546e-05, "learning_rate": 0.007857294869421622, "loss": 2.6855, "step": 8834 }, { "crossentropy": 2.67392635345459, "epoch": 0.320294373549884, "grad_norm": 0.02873482182621956, "grad_norm_var": 4.0688029134218354e-05, "learning_rate": 0.007856817991740624, "loss": 2.6353, "step": 8835 }, { "crossentropy": 2.649613380432129, "epoch": 0.320330626450116, "grad_norm": 0.02949296310544014, "grad_norm_var": 1.7079733955922697e-05, "learning_rate": 0.007856341075473961, "loss": 2.6045, "step": 8836 }, { "crossentropy": 2.724006414413452, "epoch": 0.320366879350348, "grad_norm": 0.029318591579794884, "grad_norm_var": 1.5908899841903373e-05, "learning_rate": 0.00785586412062808, "loss": 2.6647, "step": 8837 }, { "crossentropy": 2.52893328666687, "epoch": 0.32040313225058004, "grad_norm": 0.03164304047822952, "grad_norm_var": 1.5082313721490454e-05, "learning_rate": 0.007855387127209418, "loss": 2.5152, "step": 8838 }, { "crossentropy": 2.7672860622406006, "epoch": 0.32043938515081205, "grad_norm": 0.034962013363838196, "grad_norm_var": 1.4726157819837788e-05, "learning_rate": 0.007854910095224422, "loss": 2.712, "step": 8839 }, { "crossentropy": 2.7410964965820312, "epoch": 0.32047563805104406, "grad_norm": 0.036202944815158844, "grad_norm_var": 1.537894961014154e-05, "learning_rate": 0.00785443302467953, "loss": 2.6515, "step": 8840 }, { "crossentropy": 2.6928441524505615, "epoch": 0.3205118909512761, "grad_norm": 0.028207246214151382, "grad_norm_var": 1.6870964872895262e-05, "learning_rate": 0.00785395591558119, "loss": 2.6556, "step": 8841 }, { "crossentropy": 2.5149872303009033, "epoch": 0.3205481438515081, "grad_norm": 0.0290841031819582, "grad_norm_var": 1.6215130440085433e-05, "learning_rate": 0.007853478767935844, "loss": 2.5738, "step": 8842 }, { "crossentropy": 2.8721718788146973, "epoch": 0.32058439675174016, "grad_norm": 0.05018606409430504, "grad_norm_var": 3.568906288385364e-05, "learning_rate": 0.007853001581749936, "loss": 2.8134, "step": 8843 }, { "crossentropy": 2.705608367919922, "epoch": 0.3206206496519722, "grad_norm": 0.034270502626895905, "grad_norm_var": 2.837662928486076e-05, "learning_rate": 0.007852524357029914, "loss": 2.6381, "step": 8844 }, { "crossentropy": 2.7381396293640137, "epoch": 0.3206569025522042, "grad_norm": 0.030732886865735054, "grad_norm_var": 2.8489961109847325e-05, "learning_rate": 0.007852047093782219, "loss": 2.694, "step": 8845 }, { "crossentropy": 2.590486526489258, "epoch": 0.3206931554524362, "grad_norm": 0.027961237356066704, "grad_norm_var": 2.9585919257795317e-05, "learning_rate": 0.007851569792013301, "loss": 2.6239, "step": 8846 }, { "crossentropy": 2.7786905765533447, "epoch": 0.3207294083526682, "grad_norm": 0.03160170093178749, "grad_norm_var": 2.837063022043673e-05, "learning_rate": 0.007851092451729607, "loss": 2.6698, "step": 8847 }, { "crossentropy": 2.5890283584594727, "epoch": 0.32076566125290024, "grad_norm": 0.030248451977968216, "grad_norm_var": 2.8639972208961233e-05, "learning_rate": 0.007850615072937583, "loss": 2.6379, "step": 8848 }, { "crossentropy": 2.6782822608947754, "epoch": 0.32080191415313225, "grad_norm": 0.029534444212913513, "grad_norm_var": 2.9107052362148794e-05, "learning_rate": 0.007850137655643675, "loss": 2.6144, "step": 8849 }, { "crossentropy": 2.548163890838623, "epoch": 0.32083816705336426, "grad_norm": 0.02937868796288967, "grad_norm_var": 2.9570644975137812e-05, "learning_rate": 0.007849660199854332, "loss": 2.5702, "step": 8850 }, { "crossentropy": 2.6892874240875244, "epoch": 0.3208744199535963, "grad_norm": 0.03191316872835159, "grad_norm_var": 2.882995905051793e-05, "learning_rate": 0.007849182705576005, "loss": 2.7123, "step": 8851 }, { "crossentropy": 2.6518805027008057, "epoch": 0.3209106728538283, "grad_norm": 0.028754498809576035, "grad_norm_var": 2.912773936258905e-05, "learning_rate": 0.00784870517281514, "loss": 2.7122, "step": 8852 }, { "crossentropy": 2.5982859134674072, "epoch": 0.3209469257540603, "grad_norm": 0.029730219393968582, "grad_norm_var": 2.8984304531573524e-05, "learning_rate": 0.007848227601578191, "loss": 2.6105, "step": 8853 }, { "crossentropy": 2.6471035480499268, "epoch": 0.3209831786542923, "grad_norm": 0.031106723472476006, "grad_norm_var": 2.90385840177516e-05, "learning_rate": 0.007847749991871605, "loss": 2.6152, "step": 8854 }, { "crossentropy": 2.552029848098755, "epoch": 0.32101943155452434, "grad_norm": 0.02994774654507637, "grad_norm_var": 2.8708046947367133e-05, "learning_rate": 0.007847272343701832, "loss": 2.5705, "step": 8855 }, { "crossentropy": 2.7221407890319824, "epoch": 0.3210556844547564, "grad_norm": 0.029908789321780205, "grad_norm_var": 2.7492208935214294e-05, "learning_rate": 0.007846794657075327, "loss": 2.7072, "step": 8856 }, { "crossentropy": 2.6074883937835693, "epoch": 0.3210919373549884, "grad_norm": 0.031780362129211426, "grad_norm_var": 2.6764122164889675e-05, "learning_rate": 0.00784631693199854, "loss": 2.5991, "step": 8857 }, { "crossentropy": 2.8731935024261475, "epoch": 0.32112819025522044, "grad_norm": 0.03072483092546463, "grad_norm_var": 2.6374606854493105e-05, "learning_rate": 0.007845839168477921, "loss": 2.7727, "step": 8858 }, { "crossentropy": 2.656160354614258, "epoch": 0.32116444315545245, "grad_norm": 0.030010690912604332, "grad_norm_var": 2.184097131834907e-06, "learning_rate": 0.00784536136651993, "loss": 2.5841, "step": 8859 }, { "crossentropy": 2.577155351638794, "epoch": 0.32120069605568446, "grad_norm": 0.02909890189766884, "grad_norm_var": 1.23872445070769e-06, "learning_rate": 0.007844883526131012, "loss": 2.6134, "step": 8860 }, { "crossentropy": 2.8647818565368652, "epoch": 0.3212369489559165, "grad_norm": 0.03119160421192646, "grad_norm_var": 1.2873990572752347e-06, "learning_rate": 0.007844405647317625, "loss": 2.6919, "step": 8861 }, { "crossentropy": 2.649209976196289, "epoch": 0.3212732018561485, "grad_norm": 0.03055921196937561, "grad_norm_var": 9.404081231547858e-07, "learning_rate": 0.007843927730086225, "loss": 2.6499, "step": 8862 }, { "crossentropy": 2.670165777206421, "epoch": 0.3213094547563805, "grad_norm": 0.0297005083411932, "grad_norm_var": 8.472776166977443e-07, "learning_rate": 0.007843449774443264, "loss": 2.6922, "step": 8863 }, { "crossentropy": 2.692650079727173, "epoch": 0.3213457076566125, "grad_norm": 0.028955282643437386, "grad_norm_var": 9.476316431915408e-07, "learning_rate": 0.007842971780395199, "loss": 2.6924, "step": 8864 }, { "crossentropy": 2.668365001678467, "epoch": 0.32138196055684454, "grad_norm": 0.029522772878408432, "grad_norm_var": 9.485879241686306e-07, "learning_rate": 0.007842493747948485, "loss": 2.6389, "step": 8865 }, { "crossentropy": 2.5831058025360107, "epoch": 0.32141821345707655, "grad_norm": 0.03094221092760563, "grad_norm_var": 9.420918705927885e-07, "learning_rate": 0.00784201567710958, "loss": 2.6032, "step": 8866 }, { "crossentropy": 2.7137532234191895, "epoch": 0.32145446635730857, "grad_norm": 0.03142058849334717, "grad_norm_var": 8.473981444855379e-07, "learning_rate": 0.00784153756788494, "loss": 2.704, "step": 8867 }, { "crossentropy": 2.617542266845703, "epoch": 0.3214907192575406, "grad_norm": 0.03073023445904255, "grad_norm_var": 7.080273718519659e-07, "learning_rate": 0.007841059420281023, "loss": 2.6299, "step": 8868 }, { "crossentropy": 2.5407447814941406, "epoch": 0.3215269721577726, "grad_norm": 0.028401345014572144, "grad_norm_var": 9.252288556161128e-07, "learning_rate": 0.007840581234304287, "loss": 2.5935, "step": 8869 }, { "crossentropy": 2.6528406143188477, "epoch": 0.32156322505800466, "grad_norm": 0.027460603043437004, "grad_norm_var": 1.3396752157473467e-06, "learning_rate": 0.007840103009961191, "loss": 2.6782, "step": 8870 }, { "crossentropy": 2.672126531600952, "epoch": 0.3215994779582367, "grad_norm": 0.030962802469730377, "grad_norm_var": 1.393990692838178e-06, "learning_rate": 0.007839624747258193, "loss": 2.7055, "step": 8871 }, { "crossentropy": 2.779860496520996, "epoch": 0.3216357308584687, "grad_norm": 0.03291171044111252, "grad_norm_var": 1.8867649912178285e-06, "learning_rate": 0.007839146446201754, "loss": 2.726, "step": 8872 }, { "crossentropy": 2.4498393535614014, "epoch": 0.3216719837587007, "grad_norm": 0.028561873361468315, "grad_norm_var": 1.8874766111222814e-06, "learning_rate": 0.007838668106798334, "loss": 2.5775, "step": 8873 }, { "crossentropy": 2.580836057662964, "epoch": 0.3217082366589327, "grad_norm": 0.028094086796045303, "grad_norm_var": 2.091106250780237e-06, "learning_rate": 0.007838189729054394, "loss": 2.6033, "step": 8874 }, { "crossentropy": 2.6561450958251953, "epoch": 0.32174448955916474, "grad_norm": 0.028761910274624825, "grad_norm_var": 2.1714364384609514e-06, "learning_rate": 0.007837711312976394, "loss": 2.6646, "step": 8875 }, { "crossentropy": 2.64563250541687, "epoch": 0.32178074245939675, "grad_norm": 0.02931133098900318, "grad_norm_var": 2.1535569922853746e-06, "learning_rate": 0.007837232858570796, "loss": 2.6401, "step": 8876 }, { "crossentropy": 2.6027603149414062, "epoch": 0.32181699535962877, "grad_norm": 0.029490206390619278, "grad_norm_var": 2.0285452643771516e-06, "learning_rate": 0.007836754365844063, "loss": 2.6358, "step": 8877 }, { "crossentropy": 2.5047848224639893, "epoch": 0.3218532482598608, "grad_norm": 0.033004872500896454, "grad_norm_var": 2.6705956990280555e-06, "learning_rate": 0.007836275834802658, "loss": 2.6461, "step": 8878 }, { "crossentropy": 2.5861575603485107, "epoch": 0.3218895011600928, "grad_norm": 0.03816118836402893, "grad_norm_var": 6.931316221974005e-06, "learning_rate": 0.007835797265453042, "loss": 2.6333, "step": 8879 }, { "crossentropy": 2.6761770248413086, "epoch": 0.3219257540603248, "grad_norm": 0.033041372895240784, "grad_norm_var": 7.177747648681037e-06, "learning_rate": 0.00783531865780168, "loss": 2.6909, "step": 8880 }, { "crossentropy": 2.62630033493042, "epoch": 0.3219620069605568, "grad_norm": 0.028928156942129135, "grad_norm_var": 7.291093153958863e-06, "learning_rate": 0.00783484001185504, "loss": 2.652, "step": 8881 }, { "crossentropy": 2.6212034225463867, "epoch": 0.32199825986078884, "grad_norm": 0.02947600930929184, "grad_norm_var": 7.365693866895672e-06, "learning_rate": 0.00783436132761958, "loss": 2.6549, "step": 8882 }, { "crossentropy": 2.5785794258117676, "epoch": 0.3220345127610209, "grad_norm": 0.031410202383995056, "grad_norm_var": 7.364487933286913e-06, "learning_rate": 0.007833882605101771, "loss": 2.5583, "step": 8883 }, { "crossentropy": 2.678968667984009, "epoch": 0.3220707656612529, "grad_norm": 0.030995700508356094, "grad_norm_var": 7.3754756639443175e-06, "learning_rate": 0.007833403844308078, "loss": 2.6532, "step": 8884 }, { "crossentropy": 2.6794486045837402, "epoch": 0.32210701856148494, "grad_norm": 0.03007541596889496, "grad_norm_var": 7.068614003677189e-06, "learning_rate": 0.007832925045244963, "loss": 2.6447, "step": 8885 }, { "crossentropy": 2.678945541381836, "epoch": 0.32214327146171695, "grad_norm": 0.02940266579389572, "grad_norm_var": 6.47446704039622e-06, "learning_rate": 0.007832446207918899, "loss": 2.6467, "step": 8886 }, { "crossentropy": 2.5518503189086914, "epoch": 0.32217952436194897, "grad_norm": 0.031010586768388748, "grad_norm_var": 6.4757308221520855e-06, "learning_rate": 0.00783196733233635, "loss": 2.5478, "step": 8887 }, { "crossentropy": 2.5803685188293457, "epoch": 0.322215777262181, "grad_norm": 0.04084174335002899, "grad_norm_var": 1.2649613283206952e-05, "learning_rate": 0.007831488418503783, "loss": 2.5518, "step": 8888 }, { "crossentropy": 2.6788666248321533, "epoch": 0.322252030162413, "grad_norm": 0.027821365743875504, "grad_norm_var": 1.2952796576862619e-05, "learning_rate": 0.00783100946642767, "loss": 2.6426, "step": 8889 }, { "crossentropy": 2.6866111755371094, "epoch": 0.322288283062645, "grad_norm": 0.033642057329416275, "grad_norm_var": 1.255003022521724e-05, "learning_rate": 0.007830530476114476, "loss": 2.6358, "step": 8890 }, { "crossentropy": 2.5276832580566406, "epoch": 0.322324535962877, "grad_norm": 0.028963958844542503, "grad_norm_var": 1.2476503307922807e-05, "learning_rate": 0.007830051447570673, "loss": 2.4443, "step": 8891 }, { "crossentropy": 2.5641703605651855, "epoch": 0.32236078886310904, "grad_norm": 0.027542684227228165, "grad_norm_var": 1.3211381770388908e-05, "learning_rate": 0.007829572380802731, "loss": 2.5858, "step": 8892 }, { "crossentropy": 2.5228986740112305, "epoch": 0.32239704176334105, "grad_norm": 0.027930205687880516, "grad_norm_var": 1.3779025600536322e-05, "learning_rate": 0.007829093275817121, "loss": 2.6367, "step": 8893 }, { "crossentropy": 2.638272523880005, "epoch": 0.32243329466357307, "grad_norm": 0.034951746463775635, "grad_norm_var": 1.443498143384684e-05, "learning_rate": 0.007828614132620312, "loss": 2.6905, "step": 8894 }, { "crossentropy": 2.6003856658935547, "epoch": 0.3224695475638051, "grad_norm": 0.04038383811712265, "grad_norm_var": 1.6714194407569696e-05, "learning_rate": 0.007828134951218774, "loss": 2.5874, "step": 8895 }, { "crossentropy": 2.4541330337524414, "epoch": 0.3225058004640371, "grad_norm": 0.03073522448539734, "grad_norm_var": 1.661910147492529e-05, "learning_rate": 0.007827655731618985, "loss": 2.5495, "step": 8896 }, { "crossentropy": 2.6874656677246094, "epoch": 0.32254205336426917, "grad_norm": 0.03117111511528492, "grad_norm_var": 1.616230688335529e-05, "learning_rate": 0.007827176473827413, "loss": 2.6983, "step": 8897 }, { "crossentropy": 2.7423384189605713, "epoch": 0.3225783062645012, "grad_norm": 0.03087477944791317, "grad_norm_var": 1.5879666783589346e-05, "learning_rate": 0.007826697177850532, "loss": 2.6639, "step": 8898 }, { "crossentropy": 2.6357691287994385, "epoch": 0.3226145591647332, "grad_norm": 0.02922978810966015, "grad_norm_var": 1.627110854795625e-05, "learning_rate": 0.007826217843694815, "loss": 2.7033, "step": 8899 }, { "crossentropy": 2.5656895637512207, "epoch": 0.3226508120649652, "grad_norm": 0.02737644873559475, "grad_norm_var": 1.738059187161046e-05, "learning_rate": 0.00782573847136674, "loss": 2.5242, "step": 8900 }, { "crossentropy": 2.7015421390533447, "epoch": 0.3226870649651972, "grad_norm": 0.02828478254377842, "grad_norm_var": 1.789057500555731e-05, "learning_rate": 0.007825259060872778, "loss": 2.6716, "step": 8901 }, { "crossentropy": 2.897526502609253, "epoch": 0.32272331786542924, "grad_norm": 0.030639968812465668, "grad_norm_var": 1.767981522807284e-05, "learning_rate": 0.007824779612219404, "loss": 2.729, "step": 8902 }, { "crossentropy": 2.5844287872314453, "epoch": 0.32275957076566125, "grad_norm": 0.03072439692914486, "grad_norm_var": 1.769740953375115e-05, "learning_rate": 0.007824300125413096, "loss": 2.6803, "step": 8903 }, { "crossentropy": 2.5253591537475586, "epoch": 0.32279582366589327, "grad_norm": 0.028442218899726868, "grad_norm_var": 1.1564051279582394e-05, "learning_rate": 0.007823820600460326, "loss": 2.5407, "step": 8904 }, { "crossentropy": 2.5562288761138916, "epoch": 0.3228320765661253, "grad_norm": 0.028699390590190887, "grad_norm_var": 1.1293418111116976e-05, "learning_rate": 0.007823341037367575, "loss": 2.6048, "step": 8905 }, { "crossentropy": 2.722811460494995, "epoch": 0.3228683294663573, "grad_norm": 0.030824586749076843, "grad_norm_var": 1.0646590644506661e-05, "learning_rate": 0.00782286143614132, "loss": 2.5773, "step": 8906 }, { "crossentropy": 2.6366066932678223, "epoch": 0.3229045823665893, "grad_norm": 0.032047610729932785, "grad_norm_var": 1.0640824135674113e-05, "learning_rate": 0.007822381796788035, "loss": 2.5994, "step": 8907 }, { "crossentropy": 2.671579360961914, "epoch": 0.3229408352668213, "grad_norm": 0.0291521605104208, "grad_norm_var": 1.0143163796635414e-05, "learning_rate": 0.007821902119314201, "loss": 2.7124, "step": 8908 }, { "crossentropy": 2.637347459793091, "epoch": 0.32297708816705334, "grad_norm": 0.02810320258140564, "grad_norm_var": 1.0080758779731089e-05, "learning_rate": 0.007821422403726299, "loss": 2.6526, "step": 8909 }, { "crossentropy": 2.7410941123962402, "epoch": 0.3230133410672854, "grad_norm": 0.03029434010386467, "grad_norm_var": 8.81331794898899e-06, "learning_rate": 0.007820942650030804, "loss": 2.7181, "step": 8910 }, { "crossentropy": 2.406651735305786, "epoch": 0.3230495939675174, "grad_norm": 0.030690159648656845, "grad_norm_var": 1.8294290280485564e-06, "learning_rate": 0.007820462858234197, "loss": 2.4847, "step": 8911 }, { "crossentropy": 2.621433734893799, "epoch": 0.32308584686774944, "grad_norm": 0.028124554082751274, "grad_norm_var": 1.940526329940959e-06, "learning_rate": 0.00781998302834296, "loss": 2.617, "step": 8912 }, { "crossentropy": 2.63746976852417, "epoch": 0.32312209976798145, "grad_norm": 0.0298579353839159, "grad_norm_var": 1.7850295414835344e-06, "learning_rate": 0.007819503160363572, "loss": 2.6428, "step": 8913 }, { "crossentropy": 2.6300158500671387, "epoch": 0.32315835266821347, "grad_norm": 0.032616738229990005, "grad_norm_var": 2.2741547020444734e-06, "learning_rate": 0.007819023254302516, "loss": 2.5847, "step": 8914 }, { "crossentropy": 2.5787744522094727, "epoch": 0.3231946055684455, "grad_norm": 0.028274400159716606, "grad_norm_var": 2.3903703415023887e-06, "learning_rate": 0.007818543310166274, "loss": 2.6033, "step": 8915 }, { "crossentropy": 2.667510747909546, "epoch": 0.3232308584686775, "grad_norm": 0.0478830486536026, "grad_norm_var": 2.2498763611838018e-05, "learning_rate": 0.007818063327961325, "loss": 2.6535, "step": 8916 }, { "crossentropy": 2.5069518089294434, "epoch": 0.3232671113689095, "grad_norm": 0.027983374893665314, "grad_norm_var": 2.2610192842796685e-05, "learning_rate": 0.007817583307694154, "loss": 2.569, "step": 8917 }, { "crossentropy": 2.7995874881744385, "epoch": 0.3233033642691415, "grad_norm": 0.030067184939980507, "grad_norm_var": 2.2650356758206237e-05, "learning_rate": 0.007817103249371245, "loss": 2.7602, "step": 8918 }, { "crossentropy": 2.5788373947143555, "epoch": 0.32333961716937354, "grad_norm": 0.030394328758120537, "grad_norm_var": 2.26632031860902e-05, "learning_rate": 0.007816623152999082, "loss": 2.5623, "step": 8919 }, { "crossentropy": 2.5673532485961914, "epoch": 0.32337587006960555, "grad_norm": 0.03172483667731285, "grad_norm_var": 2.2286793634305097e-05, "learning_rate": 0.00781614301858415, "loss": 2.592, "step": 8920 }, { "crossentropy": 2.5890326499938965, "epoch": 0.32341212296983757, "grad_norm": 0.030908754095435143, "grad_norm_var": 2.1900571541874623e-05, "learning_rate": 0.00781566284613293, "loss": 2.6438, "step": 8921 }, { "crossentropy": 2.6304590702056885, "epoch": 0.3234483758700696, "grad_norm": 0.032469287514686584, "grad_norm_var": 2.1990775532678774e-05, "learning_rate": 0.007815182635651913, "loss": 2.6445, "step": 8922 }, { "crossentropy": 2.7408196926116943, "epoch": 0.3234846287703016, "grad_norm": 0.02756655402481556, "grad_norm_var": 2.2791318948391497e-05, "learning_rate": 0.007814702387147581, "loss": 2.6654, "step": 8923 }, { "crossentropy": 2.6502044200897217, "epoch": 0.32352088167053367, "grad_norm": 0.02891378104686737, "grad_norm_var": 2.2853822318007185e-05, "learning_rate": 0.007814222100626423, "loss": 2.7251, "step": 8924 }, { "crossentropy": 2.577925682067871, "epoch": 0.3235571345707657, "grad_norm": 0.02976054884493351, "grad_norm_var": 2.2387125453099167e-05, "learning_rate": 0.007813741776094923, "loss": 2.628, "step": 8925 }, { "crossentropy": 2.580291509628296, "epoch": 0.3235933874709977, "grad_norm": 0.03155519440770149, "grad_norm_var": 2.235177973403483e-05, "learning_rate": 0.007813261413559572, "loss": 2.5575, "step": 8926 }, { "crossentropy": 2.467853307723999, "epoch": 0.3236296403712297, "grad_norm": 0.029551029205322266, "grad_norm_var": 2.2506431930517135e-05, "learning_rate": 0.007812781013026857, "loss": 2.5465, "step": 8927 }, { "crossentropy": 2.5220141410827637, "epoch": 0.3236658932714617, "grad_norm": 0.027246227487921715, "grad_norm_var": 2.2903480456237282e-05, "learning_rate": 0.007812300574503264, "loss": 2.5341, "step": 8928 }, { "crossentropy": 2.525136947631836, "epoch": 0.32370214617169374, "grad_norm": 0.02841981127858162, "grad_norm_var": 2.3261000373525385e-05, "learning_rate": 0.0078118200979952835, "loss": 2.6593, "step": 8929 }, { "crossentropy": 2.6693482398986816, "epoch": 0.32373839907192575, "grad_norm": 0.028203273192048073, "grad_norm_var": 2.3502574189465143e-05, "learning_rate": 0.007811339583509407, "loss": 2.5801, "step": 8930 }, { "crossentropy": 2.6498982906341553, "epoch": 0.32377465197215777, "grad_norm": 0.027708468958735466, "grad_norm_var": 2.3704308461854467e-05, "learning_rate": 0.0078108590310521225, "loss": 2.6039, "step": 8931 }, { "crossentropy": 2.5822594165802, "epoch": 0.3238109048723898, "grad_norm": 0.029036732390522957, "grad_norm_var": 2.5923976010846686e-06, "learning_rate": 0.007810378440629922, "loss": 2.6567, "step": 8932 }, { "crossentropy": 2.408987283706665, "epoch": 0.3238471577726218, "grad_norm": 0.02779443748295307, "grad_norm_var": 2.6320625227851944e-06, "learning_rate": 0.0078098978122492935, "loss": 2.4988, "step": 8933 }, { "crossentropy": 2.768460273742676, "epoch": 0.3238834106728538, "grad_norm": 0.02910119853913784, "grad_norm_var": 2.611860440834314e-06, "learning_rate": 0.007809417145916733, "loss": 2.7113, "step": 8934 }, { "crossentropy": 2.631028175354004, "epoch": 0.3239196635730858, "grad_norm": 0.028551816940307617, "grad_norm_var": 2.579064381632657e-06, "learning_rate": 0.00780893644163873, "loss": 2.6393, "step": 8935 }, { "crossentropy": 2.5764613151550293, "epoch": 0.32395591647331784, "grad_norm": 0.029868600890040398, "grad_norm_var": 2.1898168982251167e-06, "learning_rate": 0.007808455699421777, "loss": 2.5821, "step": 8936 }, { "crossentropy": 2.5936691761016846, "epoch": 0.3239921693735499, "grad_norm": 0.02993122674524784, "grad_norm_var": 2.0223917639852185e-06, "learning_rate": 0.007807974919272368, "loss": 2.6964, "step": 8937 }, { "crossentropy": 2.6799731254577637, "epoch": 0.3240284222737819, "grad_norm": 0.029754769057035446, "grad_norm_var": 1.2652328757388938e-06, "learning_rate": 0.007807494101196997, "loss": 2.6068, "step": 8938 }, { "crossentropy": 2.575077772140503, "epoch": 0.32406467517401394, "grad_norm": 0.02907717414200306, "grad_norm_var": 1.132183054978105e-06, "learning_rate": 0.007807013245202158, "loss": 2.5983, "step": 8939 }, { "crossentropy": 2.7311809062957764, "epoch": 0.32410092807424595, "grad_norm": 0.028016943484544754, "grad_norm_var": 1.1963075055192059e-06, "learning_rate": 0.007806532351294344, "loss": 2.5522, "step": 8940 }, { "crossentropy": 2.5540294647216797, "epoch": 0.32413718097447797, "grad_norm": 0.027687493711709976, "grad_norm_var": 1.2473838702492842e-06, "learning_rate": 0.007806051419480053, "loss": 2.6042, "step": 8941 }, { "crossentropy": 2.644273519515991, "epoch": 0.32417343387471, "grad_norm": 0.029808131977915764, "grad_norm_var": 8.066037406608921e-07, "learning_rate": 0.007805570449765778, "loss": 2.6692, "step": 8942 }, { "crossentropy": 2.6340763568878174, "epoch": 0.324209686774942, "grad_norm": 0.030276253819465637, "grad_norm_var": 9.183990271501944e-07, "learning_rate": 0.007805089442158018, "loss": 2.638, "step": 8943 }, { "crossentropy": 2.8223395347595215, "epoch": 0.324245939675174, "grad_norm": 0.029691658914089203, "grad_norm_var": 7.920072178175709e-07, "learning_rate": 0.007804608396663267, "loss": 2.7298, "step": 8944 }, { "crossentropy": 2.551426649093628, "epoch": 0.324282192575406, "grad_norm": 0.028178568929433823, "grad_norm_var": 8.121516147455472e-07, "learning_rate": 0.007804127313288023, "loss": 2.5741, "step": 8945 }, { "crossentropy": 2.584193468093872, "epoch": 0.32431844547563804, "grad_norm": 0.02912004664540291, "grad_norm_var": 7.773250933780391e-07, "learning_rate": 0.007803646192038785, "loss": 2.6713, "step": 8946 }, { "crossentropy": 2.5675811767578125, "epoch": 0.32435469837587005, "grad_norm": 0.031181462109088898, "grad_norm_var": 9.445910809351692e-07, "learning_rate": 0.00780316503292205, "loss": 2.5967, "step": 8947 }, { "crossentropy": 2.757162570953369, "epoch": 0.32439095127610207, "grad_norm": 0.0330878309905529, "grad_norm_var": 1.8862838710354916e-06, "learning_rate": 0.007802683835944318, "loss": 2.7141, "step": 8948 }, { "crossentropy": 2.477555990219116, "epoch": 0.3244272041763341, "grad_norm": 0.030651843175292015, "grad_norm_var": 1.7675569510483893e-06, "learning_rate": 0.0078022026011120875, "loss": 2.4364, "step": 8949 }, { "crossentropy": 2.5829787254333496, "epoch": 0.32446345707656615, "grad_norm": 0.029248664155602455, "grad_norm_var": 1.7586354639602397e-06, "learning_rate": 0.0078017213284318565, "loss": 2.5797, "step": 8950 }, { "crossentropy": 2.5258679389953613, "epoch": 0.32449970997679817, "grad_norm": 0.03252299875020981, "grad_norm_var": 2.171653228760321e-06, "learning_rate": 0.007801240017910129, "loss": 2.5622, "step": 8951 }, { "crossentropy": 2.545876979827881, "epoch": 0.3245359628770302, "grad_norm": 0.031739696860313416, "grad_norm_var": 2.3872528583934594e-06, "learning_rate": 0.0078007586695534046, "loss": 2.5505, "step": 8952 }, { "crossentropy": 2.615365505218506, "epoch": 0.3245722157772622, "grad_norm": 0.03131285682320595, "grad_norm_var": 2.4941805495350943e-06, "learning_rate": 0.007800277283368183, "loss": 2.6538, "step": 8953 }, { "crossentropy": 2.406445026397705, "epoch": 0.3246084686774942, "grad_norm": 0.032194335013628006, "grad_norm_var": 2.7588054740905363e-06, "learning_rate": 0.007799795859360968, "loss": 2.4995, "step": 8954 }, { "crossentropy": 2.5152199268341064, "epoch": 0.3246447215777262, "grad_norm": 0.028395306318998337, "grad_norm_var": 2.893333346952219e-06, "learning_rate": 0.00779931439753826, "loss": 2.5552, "step": 8955 }, { "crossentropy": 2.527301549911499, "epoch": 0.32468097447795824, "grad_norm": 0.028414826840162277, "grad_norm_var": 2.7876990602520757e-06, "learning_rate": 0.007798832897906563, "loss": 2.5691, "step": 8956 }, { "crossentropy": 2.7265589237213135, "epoch": 0.32471722737819025, "grad_norm": 0.03254607319831848, "grad_norm_var": 2.622801726967017e-06, "learning_rate": 0.00779835136047238, "loss": 2.7028, "step": 8957 }, { "crossentropy": 2.4927680492401123, "epoch": 0.32475348027842227, "grad_norm": 0.03164157643914223, "grad_norm_var": 2.6581015004654983e-06, "learning_rate": 0.0077978697852422165, "loss": 2.5923, "step": 8958 }, { "crossentropy": 2.736391305923462, "epoch": 0.3247897331786543, "grad_norm": 0.02900478057563305, "grad_norm_var": 2.8204261156397713e-06, "learning_rate": 0.0077973881722225745, "loss": 2.6472, "step": 8959 }, { "crossentropy": 2.5611209869384766, "epoch": 0.3248259860788863, "grad_norm": 0.028860967606306076, "grad_norm_var": 2.9595403824112052e-06, "learning_rate": 0.007796906521419962, "loss": 2.5934, "step": 8960 }, { "crossentropy": 2.549581289291382, "epoch": 0.3248622389791183, "grad_norm": 0.03022937849164009, "grad_norm_var": 2.5858886934737447e-06, "learning_rate": 0.00779642483284088, "loss": 2.6439, "step": 8961 }, { "crossentropy": 2.5225517749786377, "epoch": 0.3248984918793503, "grad_norm": 0.032184895128011703, "grad_norm_var": 2.55407725067816e-06, "learning_rate": 0.007795943106491838, "loss": 2.6142, "step": 8962 }, { "crossentropy": 2.6986451148986816, "epoch": 0.32493474477958234, "grad_norm": 0.030859561637043953, "grad_norm_var": 2.545301042775466e-06, "learning_rate": 0.007795461342379342, "loss": 2.688, "step": 8963 }, { "crossentropy": 2.638615369796753, "epoch": 0.3249709976798144, "grad_norm": 0.029622338712215424, "grad_norm_var": 2.2415358654890675e-06, "learning_rate": 0.007794979540509898, "loss": 2.6245, "step": 8964 }, { "crossentropy": 2.6339504718780518, "epoch": 0.3250072505800464, "grad_norm": 0.030498800799250603, "grad_norm_var": 2.2417251619987846e-06, "learning_rate": 0.007794497700890013, "loss": 2.6099, "step": 8965 }, { "crossentropy": 2.6131339073181152, "epoch": 0.32504350348027844, "grad_norm": 0.028859689831733704, "grad_norm_var": 2.32021933223186e-06, "learning_rate": 0.007794015823526196, "loss": 2.6158, "step": 8966 }, { "crossentropy": 2.474987030029297, "epoch": 0.32507975638051045, "grad_norm": 0.030404847115278244, "grad_norm_var": 2.04496976859655e-06, "learning_rate": 0.007793533908424954, "loss": 2.5724, "step": 8967 }, { "crossentropy": 2.6835687160491943, "epoch": 0.32511600928074247, "grad_norm": 0.02795267477631569, "grad_norm_var": 2.2765287252471486e-06, "learning_rate": 0.007793051955592801, "loss": 2.5895, "step": 8968 }, { "crossentropy": 2.604857921600342, "epoch": 0.3251522621809745, "grad_norm": 0.027957094833254814, "grad_norm_var": 2.476347994093131e-06, "learning_rate": 0.007792569965036239, "loss": 2.608, "step": 8969 }, { "crossentropy": 2.809812307357788, "epoch": 0.3251885150812065, "grad_norm": 0.02895110473036766, "grad_norm_var": 2.1747820160009554e-06, "learning_rate": 0.007792087936761783, "loss": 2.6703, "step": 8970 }, { "crossentropy": 2.582854986190796, "epoch": 0.3252247679814385, "grad_norm": 0.03037499450147152, "grad_norm_var": 2.0558134042981812e-06, "learning_rate": 0.007791605870775942, "loss": 2.5814, "step": 8971 }, { "crossentropy": 2.7893097400665283, "epoch": 0.3252610208816705, "grad_norm": 0.02944963239133358, "grad_norm_var": 1.918138267680408e-06, "learning_rate": 0.007791123767085228, "loss": 2.6471, "step": 8972 }, { "crossentropy": 2.632528305053711, "epoch": 0.32529727378190254, "grad_norm": 0.028743376955389977, "grad_norm_var": 1.5119298634095098e-06, "learning_rate": 0.007790641625696152, "loss": 2.5994, "step": 8973 }, { "crossentropy": 2.712949752807617, "epoch": 0.32533352668213456, "grad_norm": 0.02948497235774994, "grad_norm_var": 1.2514304594063296e-06, "learning_rate": 0.007790159446615226, "loss": 2.7037, "step": 8974 }, { "crossentropy": 2.571758270263672, "epoch": 0.32536977958236657, "grad_norm": 0.030372265726327896, "grad_norm_var": 1.2616127241792929e-06, "learning_rate": 0.007789677229848962, "loss": 2.616, "step": 8975 }, { "crossentropy": 2.601428747177124, "epoch": 0.3254060324825986, "grad_norm": 0.03360723704099655, "grad_norm_var": 2.1541450077688003e-06, "learning_rate": 0.007789194975403872, "loss": 2.6532, "step": 8976 }, { "crossentropy": 2.6172657012939453, "epoch": 0.32544228538283065, "grad_norm": 0.030307650566101074, "grad_norm_var": 2.157213424019919e-06, "learning_rate": 0.007788712683286474, "loss": 2.6312, "step": 8977 }, { "crossentropy": 2.6230785846710205, "epoch": 0.32547853828306267, "grad_norm": 0.02715027704834938, "grad_norm_var": 2.2592672576121052e-06, "learning_rate": 0.007788230353503277, "loss": 2.5885, "step": 8978 }, { "crossentropy": 2.702425956726074, "epoch": 0.3255147911832947, "grad_norm": 0.027894435450434685, "grad_norm_var": 2.335421068625406e-06, "learning_rate": 0.007787747986060798, "loss": 2.6584, "step": 8979 }, { "crossentropy": 2.532116651535034, "epoch": 0.3255510440835267, "grad_norm": 0.028278781101107597, "grad_norm_var": 2.422199837133126e-06, "learning_rate": 0.007787265580965552, "loss": 2.6142, "step": 8980 }, { "crossentropy": 2.743441343307495, "epoch": 0.3255872969837587, "grad_norm": 0.029416127130389214, "grad_norm_var": 2.335830221454948e-06, "learning_rate": 0.007786783138224057, "loss": 2.6627, "step": 8981 }, { "crossentropy": 2.5941569805145264, "epoch": 0.3256235498839907, "grad_norm": 0.03152476251125336, "grad_norm_var": 2.6142841406944965e-06, "learning_rate": 0.007786300657842823, "loss": 2.64, "step": 8982 }, { "crossentropy": 2.706362009048462, "epoch": 0.32565980278422274, "grad_norm": 0.033223990350961685, "grad_norm_var": 3.4541748960281582e-06, "learning_rate": 0.007785818139828371, "loss": 2.6706, "step": 8983 }, { "crossentropy": 2.639650583267212, "epoch": 0.32569605568445475, "grad_norm": 0.030008528381586075, "grad_norm_var": 3.2481153355426306e-06, "learning_rate": 0.0077853355841872186, "loss": 2.6276, "step": 8984 }, { "crossentropy": 2.8217947483062744, "epoch": 0.32573230858468677, "grad_norm": 0.029220476746559143, "grad_norm_var": 3.0380112448634884e-06, "learning_rate": 0.007784852990925882, "loss": 2.7492, "step": 8985 }, { "crossentropy": 2.6916027069091797, "epoch": 0.3257685614849188, "grad_norm": 0.02760309725999832, "grad_norm_var": 3.317733953459428e-06, "learning_rate": 0.007784370360050879, "loss": 2.6648, "step": 8986 }, { "crossentropy": 2.5699734687805176, "epoch": 0.3258048143851508, "grad_norm": 0.03007199801504612, "grad_norm_var": 3.299890407415055e-06, "learning_rate": 0.00778388769156873, "loss": 2.6695, "step": 8987 }, { "crossentropy": 2.705627918243408, "epoch": 0.3258410672853828, "grad_norm": 0.030893471091985703, "grad_norm_var": 3.3680552294469324e-06, "learning_rate": 0.007783404985485951, "loss": 2.6665, "step": 8988 }, { "crossentropy": 2.697924852371216, "epoch": 0.3258773201856148, "grad_norm": 0.029965369030833244, "grad_norm_var": 3.279028260715526e-06, "learning_rate": 0.007782922241809066, "loss": 2.7074, "step": 8989 }, { "crossentropy": 2.652704954147339, "epoch": 0.32591357308584684, "grad_norm": 0.02885461039841175, "grad_norm_var": 3.342020314401247e-06, "learning_rate": 0.007782439460544592, "loss": 2.6256, "step": 8990 }, { "crossentropy": 2.7646517753601074, "epoch": 0.3259498259860789, "grad_norm": 0.031425125896930695, "grad_norm_var": 3.4776605099682997e-06, "learning_rate": 0.00778195664169905, "loss": 2.8507, "step": 8991 }, { "crossentropy": 2.6532154083251953, "epoch": 0.3259860788863109, "grad_norm": 0.03054528310894966, "grad_norm_var": 2.5768030399298907e-06, "learning_rate": 0.007781473785278963, "loss": 2.6851, "step": 8992 }, { "crossentropy": 2.6450164318084717, "epoch": 0.32602233178654294, "grad_norm": 0.029648810625076294, "grad_norm_var": 2.557053624149356e-06, "learning_rate": 0.007780990891290849, "loss": 2.6773, "step": 8993 }, { "crossentropy": 2.650378942489624, "epoch": 0.32605858468677495, "grad_norm": 0.02932003140449524, "grad_norm_var": 2.1041616344969276e-06, "learning_rate": 0.007780507959741237, "loss": 2.5992, "step": 8994 }, { "crossentropy": 2.569821834564209, "epoch": 0.32609483758700697, "grad_norm": 0.029716243967413902, "grad_norm_var": 1.8320993178915756e-06, "learning_rate": 0.00778002499063664, "loss": 2.6246, "step": 8995 }, { "crossentropy": 2.5516698360443115, "epoch": 0.326131090487239, "grad_norm": 0.02942618541419506, "grad_norm_var": 1.6537670904328537e-06, "learning_rate": 0.007779541983983591, "loss": 2.6196, "step": 8996 }, { "crossentropy": 2.6431760787963867, "epoch": 0.326167343387471, "grad_norm": 0.03021053411066532, "grad_norm_var": 1.6256449185195199e-06, "learning_rate": 0.007779058939788608, "loss": 2.6917, "step": 8997 }, { "crossentropy": 2.6763927936553955, "epoch": 0.326203596287703, "grad_norm": 0.02839919552206993, "grad_norm_var": 1.6439834158452418e-06, "learning_rate": 0.007778575858058217, "loss": 2.6118, "step": 8998 }, { "crossentropy": 2.522660255432129, "epoch": 0.326239849187935, "grad_norm": 0.028555450960993767, "grad_norm_var": 9.422689208515017e-07, "learning_rate": 0.007778092738798941, "loss": 2.592, "step": 8999 }, { "crossentropy": 2.6044888496398926, "epoch": 0.32627610208816704, "grad_norm": 0.03011428192257881, "grad_norm_var": 9.484953313056241e-07, "learning_rate": 0.007777609582017308, "loss": 2.6064, "step": 9000 }, { "crossentropy": 2.713087558746338, "epoch": 0.32631235498839906, "grad_norm": 0.02946869097650051, "grad_norm_var": 9.390198962006338e-07, "learning_rate": 0.007777126387719842, "loss": 2.6882, "step": 9001 }, { "crossentropy": 2.6596500873565674, "epoch": 0.32634860788863107, "grad_norm": 0.0299694761633873, "grad_norm_var": 6.467527412854293e-07, "learning_rate": 0.00777664315591307, "loss": 2.6506, "step": 9002 }, { "crossentropy": 2.452787160873413, "epoch": 0.3263848607888631, "grad_norm": 0.03128274157643318, "grad_norm_var": 7.844524866211597e-07, "learning_rate": 0.0077761598866035175, "loss": 2.5656, "step": 9003 }, { "crossentropy": 2.8009345531463623, "epoch": 0.32642111368909515, "grad_norm": 0.03267601132392883, "grad_norm_var": 1.2281428977958895e-06, "learning_rate": 0.007775676579797714, "loss": 2.6094, "step": 9004 }, { "crossentropy": 2.6305909156799316, "epoch": 0.32645736658932717, "grad_norm": 0.0286969356238842, "grad_norm_var": 1.3300973390271976e-06, "learning_rate": 0.007775193235502185, "loss": 2.6407, "step": 9005 }, { "crossentropy": 2.613940954208374, "epoch": 0.3264936194895592, "grad_norm": 0.032512273639440536, "grad_norm_var": 1.6591843627458084e-06, "learning_rate": 0.00777470985372346, "loss": 2.6457, "step": 9006 }, { "crossentropy": 2.685763120651245, "epoch": 0.3265298723897912, "grad_norm": 0.0295120719820261, "grad_norm_var": 1.5557704431030822e-06, "learning_rate": 0.007774226434468068, "loss": 2.6659, "step": 9007 }, { "crossentropy": 2.6447818279266357, "epoch": 0.3265661252900232, "grad_norm": 0.02907412126660347, "grad_norm_var": 1.5847450049676892e-06, "learning_rate": 0.007773742977742538, "loss": 2.6793, "step": 9008 }, { "crossentropy": 2.7516560554504395, "epoch": 0.3266023781902552, "grad_norm": 0.02917679399251938, "grad_norm_var": 1.6151987755498471e-06, "learning_rate": 0.0077732594835534, "loss": 2.6425, "step": 9009 }, { "crossentropy": 2.6448192596435547, "epoch": 0.32663863109048724, "grad_norm": 0.029481034725904465, "grad_norm_var": 1.6047563517383735e-06, "learning_rate": 0.007772775951907184, "loss": 2.6276, "step": 9010 }, { "crossentropy": 2.6009676456451416, "epoch": 0.32667488399071926, "grad_norm": 0.05838223174214363, "grad_norm_var": 5.229166177911347e-05, "learning_rate": 0.007772292382810421, "loss": 2.6911, "step": 9011 }, { "crossentropy": 2.602431058883667, "epoch": 0.32671113689095127, "grad_norm": 0.04162805527448654, "grad_norm_var": 5.792434584907154e-05, "learning_rate": 0.007771808776269642, "loss": 2.567, "step": 9012 }, { "crossentropy": 2.572659969329834, "epoch": 0.3267473897911833, "grad_norm": 0.031489718705415726, "grad_norm_var": 5.764529737335813e-05, "learning_rate": 0.007771325132291378, "loss": 2.6043, "step": 9013 }, { "crossentropy": 2.6549274921417236, "epoch": 0.3267836426914153, "grad_norm": 0.027729898691177368, "grad_norm_var": 5.804158626041485e-05, "learning_rate": 0.007770841450882165, "loss": 2.6114, "step": 9014 }, { "crossentropy": 2.6307241916656494, "epoch": 0.3268198955916473, "grad_norm": 0.028536109253764153, "grad_norm_var": 5.8051741887487015e-05, "learning_rate": 0.007770357732048532, "loss": 2.5511, "step": 9015 }, { "crossentropy": 2.4485280513763428, "epoch": 0.32685614849187933, "grad_norm": 0.028665967285633087, "grad_norm_var": 5.864029224980035e-05, "learning_rate": 0.007769873975797013, "loss": 2.6373, "step": 9016 }, { "crossentropy": 2.5164613723754883, "epoch": 0.32689240139211134, "grad_norm": 0.03378590568900108, "grad_norm_var": 5.812208379116247e-05, "learning_rate": 0.007769390182134143, "loss": 2.6175, "step": 9017 }, { "crossentropy": 2.7996761798858643, "epoch": 0.3269286542923434, "grad_norm": 0.036642201244831085, "grad_norm_var": 5.8508974496331716e-05, "learning_rate": 0.007768906351066456, "loss": 2.7789, "step": 9018 }, { "crossentropy": 2.7058866024017334, "epoch": 0.3269649071925754, "grad_norm": 0.03743298724293709, "grad_norm_var": 5.9399665154093864e-05, "learning_rate": 0.007768422482600488, "loss": 2.6167, "step": 9019 }, { "crossentropy": 2.6049234867095947, "epoch": 0.32700116009280744, "grad_norm": 0.031576648354530334, "grad_norm_var": 5.95906919182175e-05, "learning_rate": 0.007767938576742771, "loss": 2.7027, "step": 9020 }, { "crossentropy": 2.6708762645721436, "epoch": 0.32703741299303946, "grad_norm": 0.035506803542375565, "grad_norm_var": 5.8223158341616973e-05, "learning_rate": 0.007767454633499844, "loss": 2.7225, "step": 9021 }, { "crossentropy": 2.5801010131835938, "epoch": 0.32707366589327147, "grad_norm": 0.03642822429537773, "grad_norm_var": 5.8498357827950485e-05, "learning_rate": 0.0077669706528782426, "loss": 2.6243, "step": 9022 }, { "crossentropy": 2.6883373260498047, "epoch": 0.3271099187935035, "grad_norm": 0.03511224314570427, "grad_norm_var": 5.7058444718827134e-05, "learning_rate": 0.007766486634884503, "loss": 2.6395, "step": 9023 }, { "crossentropy": 2.662935733795166, "epoch": 0.3271461716937355, "grad_norm": 0.030586017295718193, "grad_norm_var": 5.6124549263454096e-05, "learning_rate": 0.007766002579525163, "loss": 2.6534, "step": 9024 }, { "crossentropy": 2.607088088989258, "epoch": 0.3271824245939675, "grad_norm": 0.030181702226400375, "grad_norm_var": 5.547307290931826e-05, "learning_rate": 0.007765518486806761, "loss": 2.6914, "step": 9025 }, { "crossentropy": 2.6739165782928467, "epoch": 0.32721867749419953, "grad_norm": 0.03478429093956947, "grad_norm_var": 5.363042244243115e-05, "learning_rate": 0.007765034356735835, "loss": 2.6303, "step": 9026 }, { "crossentropy": 2.6260502338409424, "epoch": 0.32725493039443154, "grad_norm": 0.03617274761199951, "grad_norm_var": 1.4934917462017192e-05, "learning_rate": 0.0077645501893189245, "loss": 2.6255, "step": 9027 }, { "crossentropy": 2.5536348819732666, "epoch": 0.32729118329466356, "grad_norm": 0.031051617115736008, "grad_norm_var": 1.0486989572355138e-05, "learning_rate": 0.0077640659845625674, "loss": 2.6569, "step": 9028 }, { "crossentropy": 2.653601884841919, "epoch": 0.32732743619489557, "grad_norm": 0.03105231560766697, "grad_norm_var": 1.0578582169534764e-05, "learning_rate": 0.0077635817424733045, "loss": 2.6153, "step": 9029 }, { "crossentropy": 2.6800999641418457, "epoch": 0.3273636890951276, "grad_norm": 0.034170690923929214, "grad_norm_var": 8.793336327783659e-06, "learning_rate": 0.0077630974630576765, "loss": 2.6075, "step": 9030 }, { "crossentropy": 2.6387479305267334, "epoch": 0.32739994199535966, "grad_norm": 0.030298173427581787, "grad_norm_var": 7.884504118997084e-06, "learning_rate": 0.007762613146322224, "loss": 2.5843, "step": 9031 }, { "crossentropy": 2.7585132122039795, "epoch": 0.32743619489559167, "grad_norm": 0.029755322262644768, "grad_norm_var": 7.2797042482825584e-06, "learning_rate": 0.00776212879227349, "loss": 2.7258, "step": 9032 }, { "crossentropy": 2.528385877609253, "epoch": 0.3274724477958237, "grad_norm": 0.02986244112253189, "grad_norm_var": 8.044432747832915e-06, "learning_rate": 0.007761644400918013, "loss": 2.5516, "step": 9033 }, { "crossentropy": 2.700031042098999, "epoch": 0.3275087006960557, "grad_norm": 0.03004899062216282, "grad_norm_var": 7.703139931252111e-06, "learning_rate": 0.0077611599722623385, "loss": 2.6324, "step": 9034 }, { "crossentropy": 2.6256837844848633, "epoch": 0.3275449535962877, "grad_norm": 0.02802370861172676, "grad_norm_var": 7.363073670872919e-06, "learning_rate": 0.007760675506313008, "loss": 2.6147, "step": 9035 }, { "crossentropy": 2.643493413925171, "epoch": 0.32758120649651973, "grad_norm": 0.0298544280230999, "grad_norm_var": 7.683151418301909e-06, "learning_rate": 0.0077601910030765655, "loss": 2.6067, "step": 9036 }, { "crossentropy": 2.672902822494507, "epoch": 0.32761745939675174, "grad_norm": 0.031115049496293068, "grad_norm_var": 6.867713050935049e-06, "learning_rate": 0.007759706462559555, "loss": 2.6019, "step": 9037 }, { "crossentropy": 2.6519558429718018, "epoch": 0.32765371229698376, "grad_norm": 0.032858312129974365, "grad_norm_var": 5.452264155297162e-06, "learning_rate": 0.007759221884768519, "loss": 2.6285, "step": 9038 }, { "crossentropy": 2.7805097103118896, "epoch": 0.32768996519721577, "grad_norm": 0.028032956644892693, "grad_norm_var": 5.229668320302453e-06, "learning_rate": 0.007758737269710006, "loss": 2.6759, "step": 9039 }, { "crossentropy": 2.4182307720184326, "epoch": 0.3277262180974478, "grad_norm": 0.02831624075770378, "grad_norm_var": 5.711916607428743e-06, "learning_rate": 0.00775825261739056, "loss": 2.5039, "step": 9040 }, { "crossentropy": 2.7657365798950195, "epoch": 0.3277624709976798, "grad_norm": 0.02813216671347618, "grad_norm_var": 6.1908805588757e-06, "learning_rate": 0.007757767927816726, "loss": 2.6753, "step": 9041 }, { "crossentropy": 2.6013708114624023, "epoch": 0.3277987238979118, "grad_norm": 0.029756827279925346, "grad_norm_var": 5.130369544828715e-06, "learning_rate": 0.00775728320099505, "loss": 2.5977, "step": 9042 }, { "crossentropy": 2.6212549209594727, "epoch": 0.32783497679814383, "grad_norm": 0.029623867943882942, "grad_norm_var": 2.884901912632862e-06, "learning_rate": 0.00775679843693208, "loss": 2.6124, "step": 9043 }, { "crossentropy": 2.5401394367218018, "epoch": 0.32787122969837584, "grad_norm": 0.029602965340018272, "grad_norm_var": 2.8365184299947514e-06, "learning_rate": 0.007756313635634366, "loss": 2.6205, "step": 9044 }, { "crossentropy": 2.5149550437927246, "epoch": 0.3279074825986079, "grad_norm": 0.027577102184295654, "grad_norm_var": 3.1183439862043077e-06, "learning_rate": 0.007755828797108451, "loss": 2.597, "step": 9045 }, { "crossentropy": 2.7070975303649902, "epoch": 0.32794373549883993, "grad_norm": 0.03261363506317139, "grad_norm_var": 2.365456972564988e-06, "learning_rate": 0.0077553439213608865, "loss": 2.6869, "step": 9046 }, { "crossentropy": 2.5948405265808105, "epoch": 0.32797998839907194, "grad_norm": 0.037070002406835556, "grad_norm_var": 5.756298225235281e-06, "learning_rate": 0.007754859008398222, "loss": 2.6669, "step": 9047 }, { "crossentropy": 2.887151002883911, "epoch": 0.32801624129930396, "grad_norm": 0.03389991074800491, "grad_norm_var": 6.617182899812332e-06, "learning_rate": 0.007754374058227006, "loss": 2.7679, "step": 9048 }, { "crossentropy": 2.720208168029785, "epoch": 0.32805249419953597, "grad_norm": 0.033141981810331345, "grad_norm_var": 7.054646517542599e-06, "learning_rate": 0.007753889070853788, "loss": 2.6833, "step": 9049 }, { "crossentropy": 2.6923274993896484, "epoch": 0.328088747099768, "grad_norm": 0.03448425978422165, "grad_norm_var": 7.955753607267546e-06, "learning_rate": 0.007753404046285117, "loss": 2.6365, "step": 9050 }, { "crossentropy": 2.6810216903686523, "epoch": 0.328125, "grad_norm": 0.03582371771335602, "grad_norm_var": 8.786193998566598e-06, "learning_rate": 0.007752918984527548, "loss": 2.6656, "step": 9051 }, { "crossentropy": 2.53826642036438, "epoch": 0.328161252900232, "grad_norm": 0.03030230663716793, "grad_norm_var": 8.708287431822594e-06, "learning_rate": 0.0077524338855876306, "loss": 2.5716, "step": 9052 }, { "crossentropy": 2.6094772815704346, "epoch": 0.32819750580046403, "grad_norm": 0.03137624263763428, "grad_norm_var": 8.702733677372444e-06, "learning_rate": 0.007751948749471916, "loss": 2.6257, "step": 9053 }, { "crossentropy": 2.6014158725738525, "epoch": 0.32823375870069604, "grad_norm": 0.029521651566028595, "grad_norm_var": 8.755688036098325e-06, "learning_rate": 0.007751463576186957, "loss": 2.6308, "step": 9054 }, { "crossentropy": 2.5944361686706543, "epoch": 0.32827001160092806, "grad_norm": 0.029080744832754135, "grad_norm_var": 8.381190048628585e-06, "learning_rate": 0.007750978365739307, "loss": 2.5729, "step": 9055 }, { "crossentropy": 2.6980817317962646, "epoch": 0.32830626450116007, "grad_norm": 0.02921922318637371, "grad_norm_var": 8.07649816951272e-06, "learning_rate": 0.00775049311813552, "loss": 2.6863, "step": 9056 }, { "crossentropy": 2.6767160892486572, "epoch": 0.3283425174013921, "grad_norm": 0.028295917436480522, "grad_norm_var": 8.008427255948245e-06, "learning_rate": 0.007750007833382149, "loss": 2.6028, "step": 9057 }, { "crossentropy": 2.740326166152954, "epoch": 0.32837877030162416, "grad_norm": 0.028494635596871376, "grad_norm_var": 8.373911242229766e-06, "learning_rate": 0.00774952251148575, "loss": 2.6847, "step": 9058 }, { "crossentropy": 2.707308292388916, "epoch": 0.32841502320185617, "grad_norm": 0.0332583412528038, "grad_norm_var": 8.40759893398917e-06, "learning_rate": 0.007749037152452874, "loss": 2.7404, "step": 9059 }, { "crossentropy": 2.3848321437835693, "epoch": 0.3284512761020882, "grad_norm": 0.03885969519615173, "grad_norm_var": 1.1439971044927e-05, "learning_rate": 0.007748551756290082, "loss": 2.4984, "step": 9060 }, { "crossentropy": 2.482084274291992, "epoch": 0.3284875290023202, "grad_norm": 0.03523623198270798, "grad_norm_var": 1.0524560811603124e-05, "learning_rate": 0.007748066323003927, "loss": 2.5722, "step": 9061 }, { "crossentropy": 2.7201502323150635, "epoch": 0.3285237819025522, "grad_norm": 0.02840055711567402, "grad_norm_var": 1.1593925009197601e-05, "learning_rate": 0.007747580852600966, "loss": 2.7331, "step": 9062 }, { "crossentropy": 2.5340492725372314, "epoch": 0.32856003480278423, "grad_norm": 0.0292167030274868, "grad_norm_var": 1.0431972216198486e-05, "learning_rate": 0.007747095345087756, "loss": 2.5454, "step": 9063 }, { "crossentropy": 2.672791004180908, "epoch": 0.32859628770301624, "grad_norm": 0.030329564586281776, "grad_norm_var": 1.0223438554761374e-05, "learning_rate": 0.007746609800470854, "loss": 2.6424, "step": 9064 }, { "crossentropy": 2.54547381401062, "epoch": 0.32863254060324826, "grad_norm": 0.028873177245259285, "grad_norm_var": 1.0464842997271356e-05, "learning_rate": 0.007746124218756818, "loss": 2.584, "step": 9065 }, { "crossentropy": 2.6336288452148438, "epoch": 0.32866879350348027, "grad_norm": 0.029372243210673332, "grad_norm_var": 9.92658759189242e-06, "learning_rate": 0.007745638599952209, "loss": 2.6605, "step": 9066 }, { "crossentropy": 2.6558592319488525, "epoch": 0.3287050464037123, "grad_norm": 0.03243136405944824, "grad_norm_var": 8.454422639503124e-06, "learning_rate": 0.0077451529440635825, "loss": 2.7128, "step": 9067 }, { "crossentropy": 2.6861631870269775, "epoch": 0.3287412993039443, "grad_norm": 0.03173975273966789, "grad_norm_var": 8.494541189960749e-06, "learning_rate": 0.007744667251097499, "loss": 2.7388, "step": 9068 }, { "crossentropy": 2.7230074405670166, "epoch": 0.3287775522041763, "grad_norm": 0.0327584408223629, "grad_norm_var": 8.70970709349153e-06, "learning_rate": 0.00774418152106052, "loss": 2.6872, "step": 9069 }, { "crossentropy": 2.632107734680176, "epoch": 0.32881380510440833, "grad_norm": 0.031099451705813408, "grad_norm_var": 8.566280882873108e-06, "learning_rate": 0.007743695753959206, "loss": 2.6861, "step": 9070 }, { "crossentropy": 2.6098408699035645, "epoch": 0.32885005800464034, "grad_norm": 0.03178703784942627, "grad_norm_var": 8.316469085139997e-06, "learning_rate": 0.007743209949800116, "loss": 2.667, "step": 9071 }, { "crossentropy": 2.681556463241577, "epoch": 0.3288863109048724, "grad_norm": 0.027904080227017403, "grad_norm_var": 8.773791837010428e-06, "learning_rate": 0.007742724108589813, "loss": 2.6472, "step": 9072 }, { "crossentropy": 2.5469603538513184, "epoch": 0.32892256380510443, "grad_norm": 0.02819361723959446, "grad_norm_var": 8.813083438612089e-06, "learning_rate": 0.007742238230334858, "loss": 2.5418, "step": 9073 }, { "crossentropy": 2.604433298110962, "epoch": 0.32895881670533644, "grad_norm": 0.027327172458171844, "grad_norm_var": 9.307277374484312e-06, "learning_rate": 0.007741752315041816, "loss": 2.5373, "step": 9074 }, { "crossentropy": 2.6215131282806396, "epoch": 0.32899506960556846, "grad_norm": 0.0279757808893919, "grad_norm_var": 9.495388178025708e-06, "learning_rate": 0.007741266362717246, "loss": 2.6162, "step": 9075 }, { "crossentropy": 2.58798885345459, "epoch": 0.32903132250580047, "grad_norm": 0.02751382812857628, "grad_norm_var": 5.225914784407331e-06, "learning_rate": 0.007740780373367715, "loss": 2.5802, "step": 9076 }, { "crossentropy": 2.685610294342041, "epoch": 0.3290675754060325, "grad_norm": 0.028052210807800293, "grad_norm_var": 3.4454419656409475e-06, "learning_rate": 0.007740294346999786, "loss": 2.7359, "step": 9077 }, { "crossentropy": 2.6614415645599365, "epoch": 0.3291038283062645, "grad_norm": 0.031285360455513, "grad_norm_var": 3.5192438122417693e-06, "learning_rate": 0.007739808283620022, "loss": 2.7004, "step": 9078 }, { "crossentropy": 2.71560001373291, "epoch": 0.3291400812064965, "grad_norm": 0.03332996368408203, "grad_norm_var": 4.289003453198968e-06, "learning_rate": 0.007739322183234991, "loss": 2.7096, "step": 9079 }, { "crossentropy": 2.702986240386963, "epoch": 0.32917633410672853, "grad_norm": 0.03142120689153671, "grad_norm_var": 4.411697721886226e-06, "learning_rate": 0.007738836045851257, "loss": 2.6904, "step": 9080 }, { "crossentropy": 2.596418619155884, "epoch": 0.32921258700696054, "grad_norm": 0.032091736793518066, "grad_norm_var": 4.547020559363919e-06, "learning_rate": 0.0077383498714753834, "loss": 2.5663, "step": 9081 }, { "crossentropy": 2.64406156539917, "epoch": 0.32924883990719256, "grad_norm": 0.0320909209549427, "grad_norm_var": 4.684375555851402e-06, "learning_rate": 0.007737863660113942, "loss": 2.6245, "step": 9082 }, { "crossentropy": 2.6225764751434326, "epoch": 0.3292850928074246, "grad_norm": 0.029105568304657936, "grad_norm_var": 4.491578314471398e-06, "learning_rate": 0.007737377411773494, "loss": 2.5089, "step": 9083 }, { "crossentropy": 2.782870054244995, "epoch": 0.3293213457076566, "grad_norm": 0.029216209426522255, "grad_norm_var": 4.381523666765335e-06, "learning_rate": 0.007736891126460612, "loss": 2.7204, "step": 9084 }, { "crossentropy": 2.410053253173828, "epoch": 0.32935759860788866, "grad_norm": 0.03047965094447136, "grad_norm_var": 3.8898454599055335e-06, "learning_rate": 0.007736404804181861, "loss": 2.5084, "step": 9085 }, { "crossentropy": 2.709073305130005, "epoch": 0.32939385150812067, "grad_norm": 0.030835682526230812, "grad_norm_var": 3.8530515020340194e-06, "learning_rate": 0.007735918444943812, "loss": 2.727, "step": 9086 }, { "crossentropy": 2.679614305496216, "epoch": 0.3294301044083527, "grad_norm": 0.030305858701467514, "grad_norm_var": 3.6200899867264156e-06, "learning_rate": 0.0077354320487530315, "loss": 2.692, "step": 9087 }, { "crossentropy": 2.606316089630127, "epoch": 0.3294663573085847, "grad_norm": 0.029481573030352592, "grad_norm_var": 3.372523934941305e-06, "learning_rate": 0.00773494561561609, "loss": 2.7086, "step": 9088 }, { "crossentropy": 2.7707369327545166, "epoch": 0.3295026102088167, "grad_norm": 0.03099098615348339, "grad_norm_var": 3.2180112968039726e-06, "learning_rate": 0.007734459145539558, "loss": 2.6617, "step": 9089 }, { "crossentropy": 2.640314817428589, "epoch": 0.32953886310904873, "grad_norm": 0.03259478509426117, "grad_norm_var": 3.008981187666247e-06, "learning_rate": 0.0077339726385300045, "loss": 2.6099, "step": 9090 }, { "crossentropy": 2.6181514263153076, "epoch": 0.32957511600928074, "grad_norm": 0.029912851750850677, "grad_norm_var": 2.611384526429713e-06, "learning_rate": 0.007733486094594002, "loss": 2.5755, "step": 9091 }, { "crossentropy": 2.5885379314422607, "epoch": 0.32961136890951276, "grad_norm": 0.027305295690894127, "grad_norm_var": 2.698361906271612e-06, "learning_rate": 0.007732999513738122, "loss": 2.5942, "step": 9092 }, { "crossentropy": 2.59615159034729, "epoch": 0.3296476218097448, "grad_norm": 0.028727585449814796, "grad_norm_var": 2.5036335081580666e-06, "learning_rate": 0.007732512895968935, "loss": 2.5857, "step": 9093 }, { "crossentropy": 2.4543981552124023, "epoch": 0.3296838747099768, "grad_norm": 0.029570721089839935, "grad_norm_var": 2.524627321174204e-06, "learning_rate": 0.007732026241293016, "loss": 2.5489, "step": 9094 }, { "crossentropy": 2.6269192695617676, "epoch": 0.3297201276102088, "grad_norm": 0.029580334201455116, "grad_norm_var": 1.9716631701589827e-06, "learning_rate": 0.007731539549716937, "loss": 2.6501, "step": 9095 }, { "crossentropy": 2.6945292949676514, "epoch": 0.3297563805104408, "grad_norm": 0.029642688110470772, "grad_norm_var": 1.8873398538968156e-06, "learning_rate": 0.007731052821247271, "loss": 2.6434, "step": 9096 }, { "crossentropy": 2.6766648292541504, "epoch": 0.32979263341067283, "grad_norm": 0.03284655511379242, "grad_norm_var": 2.1213113784786095e-06, "learning_rate": 0.00773056605589059, "loss": 2.7187, "step": 9097 }, { "crossentropy": 2.6545822620391846, "epoch": 0.32982888631090485, "grad_norm": 0.033386338502168655, "grad_norm_var": 2.5583323663640044e-06, "learning_rate": 0.007730079253653472, "loss": 2.65, "step": 9098 }, { "crossentropy": 2.574450969696045, "epoch": 0.3298651392111369, "grad_norm": 0.02970775216817856, "grad_norm_var": 2.489195575571442e-06, "learning_rate": 0.007729592414542491, "loss": 2.5966, "step": 9099 }, { "crossentropy": 2.587435007095337, "epoch": 0.32990139211136893, "grad_norm": 0.03137458115816116, "grad_norm_var": 2.472329174960314e-06, "learning_rate": 0.007729105538564222, "loss": 2.6747, "step": 9100 }, { "crossentropy": 2.7078378200531006, "epoch": 0.32993764501160094, "grad_norm": 0.032409705221652985, "grad_norm_var": 2.720125345456838e-06, "learning_rate": 0.007728618625725241, "loss": 2.5803, "step": 9101 }, { "crossentropy": 2.683393955230713, "epoch": 0.32997389791183296, "grad_norm": 0.034069158136844635, "grad_norm_var": 3.500166133027736e-06, "learning_rate": 0.0077281316760321244, "loss": 2.5955, "step": 9102 }, { "crossentropy": 2.7698140144348145, "epoch": 0.330010150812065, "grad_norm": 0.0326092354953289, "grad_norm_var": 3.697148912572973e-06, "learning_rate": 0.00772764468949145, "loss": 2.7208, "step": 9103 }, { "crossentropy": 2.556076765060425, "epoch": 0.330046403712297, "grad_norm": 0.03085297718644142, "grad_norm_var": 3.557500579198777e-06, "learning_rate": 0.007727157666109794, "loss": 2.5931, "step": 9104 }, { "crossentropy": 2.6310884952545166, "epoch": 0.330082656612529, "grad_norm": 0.028487134724855423, "grad_norm_var": 3.943608194409362e-06, "learning_rate": 0.007726670605893735, "loss": 2.6076, "step": 9105 }, { "crossentropy": 2.729444742202759, "epoch": 0.330118909512761, "grad_norm": 0.0274948813021183, "grad_norm_var": 4.3605430102694005e-06, "learning_rate": 0.007726183508849851, "loss": 2.6404, "step": 9106 }, { "crossentropy": 2.7309651374816895, "epoch": 0.33015516241299303, "grad_norm": 0.029492974281311035, "grad_norm_var": 4.404354594071218e-06, "learning_rate": 0.007725696374984723, "loss": 2.6746, "step": 9107 }, { "crossentropy": 2.520045518875122, "epoch": 0.33019141531322505, "grad_norm": 0.028010455891489983, "grad_norm_var": 4.1376601545700204e-06, "learning_rate": 0.007725209204304929, "loss": 2.5893, "step": 9108 }, { "crossentropy": 2.5618624687194824, "epoch": 0.33022766821345706, "grad_norm": 0.029440050944685936, "grad_norm_var": 3.999452419027547e-06, "learning_rate": 0.007724721996817049, "loss": 2.6043, "step": 9109 }, { "crossentropy": 2.6123087406158447, "epoch": 0.3302639211136891, "grad_norm": 0.028894243761897087, "grad_norm_var": 4.117371358350431e-06, "learning_rate": 0.007724234752527663, "loss": 2.553, "step": 9110 }, { "crossentropy": 2.67594575881958, "epoch": 0.3303001740139211, "grad_norm": 0.028250224888324738, "grad_norm_var": 4.394361511045099e-06, "learning_rate": 0.0077237474714433535, "loss": 2.702, "step": 9111 }, { "crossentropy": 2.5894052982330322, "epoch": 0.33033642691415316, "grad_norm": 0.028567619621753693, "grad_norm_var": 4.5802494603300185e-06, "learning_rate": 0.0077232601535707, "loss": 2.6714, "step": 9112 }, { "crossentropy": 2.7519845962524414, "epoch": 0.3303726798143852, "grad_norm": 0.03056151233613491, "grad_norm_var": 4.1515531067699726e-06, "learning_rate": 0.007722772798916284, "loss": 2.668, "step": 9113 }, { "crossentropy": 2.6082606315612793, "epoch": 0.3304089327146172, "grad_norm": 0.028546184301376343, "grad_norm_var": 3.575921073643388e-06, "learning_rate": 0.0077222854074866926, "loss": 2.6205, "step": 9114 }, { "crossentropy": 2.4616758823394775, "epoch": 0.3304451856148492, "grad_norm": 0.027502283453941345, "grad_norm_var": 3.943235861307802e-06, "learning_rate": 0.007721797979288503, "loss": 2.4723, "step": 9115 }, { "crossentropy": 2.5039074420928955, "epoch": 0.3304814385150812, "grad_norm": 0.027970613911747932, "grad_norm_var": 3.946063381560445e-06, "learning_rate": 0.007721310514328303, "loss": 2.5592, "step": 9116 }, { "crossentropy": 2.4932167530059814, "epoch": 0.33051769141531323, "grad_norm": 0.028616826981306076, "grad_norm_var": 3.4103369896279576e-06, "learning_rate": 0.007720823012612673, "loss": 2.48, "step": 9117 }, { "crossentropy": 2.5879158973693848, "epoch": 0.33055394431554525, "grad_norm": 0.029486309736967087, "grad_norm_var": 1.8304462152705086e-06, "learning_rate": 0.007720335474148202, "loss": 2.6048, "step": 9118 }, { "crossentropy": 2.58388090133667, "epoch": 0.33059019721577726, "grad_norm": 0.03085251897573471, "grad_norm_var": 1.189407691218206e-06, "learning_rate": 0.007719847898941469, "loss": 2.5547, "step": 9119 }, { "crossentropy": 2.6225430965423584, "epoch": 0.3306264501160093, "grad_norm": 0.030168330296874046, "grad_norm_var": 1.0440002680478974e-06, "learning_rate": 0.007719360286999064, "loss": 2.561, "step": 9120 }, { "crossentropy": 2.6162686347961426, "epoch": 0.3306627030162413, "grad_norm": 0.028942015022039413, "grad_norm_var": 1.0321111845989124e-06, "learning_rate": 0.007718872638327572, "loss": 2.7031, "step": 9121 }, { "crossentropy": 2.6950178146362305, "epoch": 0.3306989559164733, "grad_norm": 0.02933865413069725, "grad_norm_var": 8.930500172710706e-07, "learning_rate": 0.007718384952933577, "loss": 2.7039, "step": 9122 }, { "crossentropy": 2.633518934249878, "epoch": 0.3307352088167053, "grad_norm": 0.0282491035759449, "grad_norm_var": 9.146338724368963e-07, "learning_rate": 0.00771789723082367, "loss": 2.7052, "step": 9123 }, { "crossentropy": 2.5723869800567627, "epoch": 0.33077146171693733, "grad_norm": 0.027707409113645554, "grad_norm_var": 9.588345187944245e-07, "learning_rate": 0.007717409472004436, "loss": 2.5498, "step": 9124 }, { "crossentropy": 2.7162108421325684, "epoch": 0.33080771461716935, "grad_norm": 0.030007557943463326, "grad_norm_var": 1.0165462754716982e-06, "learning_rate": 0.007716921676482463, "loss": 2.655, "step": 9125 }, { "crossentropy": 2.6752803325653076, "epoch": 0.3308439675174014, "grad_norm": 0.029507366940379143, "grad_norm_var": 1.033125718786692e-06, "learning_rate": 0.007716433844264339, "loss": 2.6145, "step": 9126 }, { "crossentropy": 2.741935968399048, "epoch": 0.33088022041763343, "grad_norm": 0.028821146115660667, "grad_norm_var": 9.951165247864523e-07, "learning_rate": 0.007715945975356655, "loss": 2.6809, "step": 9127 }, { "crossentropy": 2.6496753692626953, "epoch": 0.33091647331786544, "grad_norm": 0.028643757104873657, "grad_norm_var": 9.905530297055556e-07, "learning_rate": 0.007715458069765999, "loss": 2.5653, "step": 9128 }, { "crossentropy": 2.644958019256592, "epoch": 0.33095272621809746, "grad_norm": 0.029315782710909843, "grad_norm_var": 8.377473155291373e-07, "learning_rate": 0.007714970127498959, "loss": 2.6402, "step": 9129 }, { "crossentropy": 2.632383346557617, "epoch": 0.3309889791183295, "grad_norm": 0.028949914500117302, "grad_norm_var": 8.245960229823068e-07, "learning_rate": 0.007714482148562129, "loss": 2.5758, "step": 9130 }, { "crossentropy": 2.844031572341919, "epoch": 0.3310252320185615, "grad_norm": 0.03050961159169674, "grad_norm_var": 7.87302792868566e-07, "learning_rate": 0.007713994132962096, "loss": 2.5677, "step": 9131 }, { "crossentropy": 2.4916675090789795, "epoch": 0.3310614849187935, "grad_norm": 0.03025909885764122, "grad_norm_var": 7.41657836025158e-07, "learning_rate": 0.007713506080705457, "loss": 2.6108, "step": 9132 }, { "crossentropy": 2.698329448699951, "epoch": 0.3310977378190255, "grad_norm": 0.03190816566348076, "grad_norm_var": 1.1031254923361677e-06, "learning_rate": 0.0077130179917987984, "loss": 2.667, "step": 9133 }, { "crossentropy": 2.587088108062744, "epoch": 0.33113399071925753, "grad_norm": 0.03062506392598152, "grad_norm_var": 1.1757672856524954e-06, "learning_rate": 0.007712529866248715, "loss": 2.6464, "step": 9134 }, { "crossentropy": 2.7044517993927, "epoch": 0.33117024361948955, "grad_norm": 0.030325941741466522, "grad_norm_var": 1.1060595272355805e-06, "learning_rate": 0.007712041704061799, "loss": 2.7105, "step": 9135 }, { "crossentropy": 2.54012131690979, "epoch": 0.33120649651972156, "grad_norm": 0.030939599499106407, "grad_norm_var": 1.2037464567997975e-06, "learning_rate": 0.007711553505244644, "loss": 2.5976, "step": 9136 }, { "crossentropy": 2.7416832447052, "epoch": 0.3312427494199536, "grad_norm": 0.029860561713576317, "grad_norm_var": 1.172448142559325e-06, "learning_rate": 0.0077110652698038455, "loss": 2.712, "step": 9137 }, { "crossentropy": 2.6455891132354736, "epoch": 0.3312790023201856, "grad_norm": 0.02871946431696415, "grad_norm_var": 1.2250493112444937e-06, "learning_rate": 0.007710576997745995, "loss": 2.6164, "step": 9138 }, { "crossentropy": 2.6667613983154297, "epoch": 0.33131525522041766, "grad_norm": 0.029225079342722893, "grad_norm_var": 1.1026939145253669e-06, "learning_rate": 0.0077100886890776885, "loss": 2.6295, "step": 9139 }, { "crossentropy": 2.641300678253174, "epoch": 0.3313515081206497, "grad_norm": 0.028185421600937843, "grad_norm_var": 9.894771285146592e-07, "learning_rate": 0.007709600343805523, "loss": 2.6537, "step": 9140 }, { "crossentropy": 2.6791739463806152, "epoch": 0.3313877610208817, "grad_norm": 0.027899470180273056, "grad_norm_var": 1.191383884080408e-06, "learning_rate": 0.007709111961936093, "loss": 2.6291, "step": 9141 }, { "crossentropy": 2.734297513961792, "epoch": 0.3314240139211137, "grad_norm": 0.03418806567788124, "grad_norm_var": 2.4991580744825596e-06, "learning_rate": 0.007708623543475992, "loss": 2.6944, "step": 9142 }, { "crossentropy": 2.5564076900482178, "epoch": 0.3314602668213457, "grad_norm": 0.028142135590314865, "grad_norm_var": 2.625512798480149e-06, "learning_rate": 0.007708135088431822, "loss": 2.6318, "step": 9143 }, { "crossentropy": 2.5862720012664795, "epoch": 0.33149651972157773, "grad_norm": 0.030148383229970932, "grad_norm_var": 2.5237960048958447e-06, "learning_rate": 0.007707646596810176, "loss": 2.6538, "step": 9144 }, { "crossentropy": 2.646418571472168, "epoch": 0.33153277262180975, "grad_norm": 0.03068249113857746, "grad_norm_var": 2.5249471884046063e-06, "learning_rate": 0.007707158068617657, "loss": 2.7049, "step": 9145 }, { "crossentropy": 2.509234666824341, "epoch": 0.33156902552204176, "grad_norm": 0.03099866770207882, "grad_norm_var": 2.490729807830966e-06, "learning_rate": 0.007706669503860857, "loss": 2.5156, "step": 9146 }, { "crossentropy": 2.718794345855713, "epoch": 0.3316052784222738, "grad_norm": 0.029005927965044975, "grad_norm_var": 2.562669334436422e-06, "learning_rate": 0.007706180902546378, "loss": 2.6842, "step": 9147 }, { "crossentropy": 2.6012582778930664, "epoch": 0.3316415313225058, "grad_norm": 0.029624907299876213, "grad_norm_var": 2.5717826306924693e-06, "learning_rate": 0.00770569226468082, "loss": 2.6187, "step": 9148 }, { "crossentropy": 2.672085762023926, "epoch": 0.3316777842227378, "grad_norm": 0.030539821833372116, "grad_norm_var": 2.3461344625908768e-06, "learning_rate": 0.007705203590270782, "loss": 2.6026, "step": 9149 }, { "crossentropy": 2.5681140422821045, "epoch": 0.3317140371229698, "grad_norm": 0.031325481832027435, "grad_norm_var": 2.4403590823311814e-06, "learning_rate": 0.007704714879322865, "loss": 2.507, "step": 9150 }, { "crossentropy": 2.820906400680542, "epoch": 0.33175029002320183, "grad_norm": 0.028854940086603165, "grad_norm_var": 2.5093596660157816e-06, "learning_rate": 0.007704226131843668, "loss": 2.7518, "step": 9151 }, { "crossentropy": 2.6967108249664307, "epoch": 0.33178654292343385, "grad_norm": 0.029317863285541534, "grad_norm_var": 2.448137078205938e-06, "learning_rate": 0.007703737347839792, "loss": 2.7725, "step": 9152 }, { "crossentropy": 2.7010765075683594, "epoch": 0.3318227958236659, "grad_norm": 0.028460495173931122, "grad_norm_var": 2.5583945866127496e-06, "learning_rate": 0.007703248527317842, "loss": 2.6733, "step": 9153 }, { "crossentropy": 2.6743204593658447, "epoch": 0.33185904872389793, "grad_norm": 0.02836727164685726, "grad_norm_var": 2.6125401952630285e-06, "learning_rate": 0.007702759670284418, "loss": 2.6393, "step": 9154 }, { "crossentropy": 2.6741201877593994, "epoch": 0.33189530162412995, "grad_norm": 0.03340470418334007, "grad_norm_var": 3.4478393412691135e-06, "learning_rate": 0.007702270776746122, "loss": 2.6141, "step": 9155 }, { "crossentropy": 2.6938486099243164, "epoch": 0.33193155452436196, "grad_norm": 0.0348249226808548, "grad_norm_var": 4.6438876577488035e-06, "learning_rate": 0.007701781846709559, "loss": 2.6433, "step": 9156 }, { "crossentropy": 2.595452070236206, "epoch": 0.331967807424594, "grad_norm": 0.02798706479370594, "grad_norm_var": 4.615611337654921e-06, "learning_rate": 0.007701292880181332, "loss": 2.6057, "step": 9157 }, { "crossentropy": 2.6268956661224365, "epoch": 0.332004060324826, "grad_norm": 0.033767953515052795, "grad_norm_var": 4.412609410313119e-06, "learning_rate": 0.007700803877168046, "loss": 2.6963, "step": 9158 }, { "crossentropy": 2.550577402114868, "epoch": 0.332040313225058, "grad_norm": 0.029365595430135727, "grad_norm_var": 4.147496738256956e-06, "learning_rate": 0.007700314837676306, "loss": 2.5172, "step": 9159 }, { "crossentropy": 2.625626802444458, "epoch": 0.33207656612529, "grad_norm": 0.029417963698506355, "grad_norm_var": 4.207029013679923e-06, "learning_rate": 0.007699825761712713, "loss": 2.573, "step": 9160 }, { "crossentropy": 2.726475715637207, "epoch": 0.33211281902552203, "grad_norm": 0.02866138145327568, "grad_norm_var": 4.378562894850659e-06, "learning_rate": 0.007699336649283877, "loss": 2.7514, "step": 9161 }, { "crossentropy": 2.5659022331237793, "epoch": 0.33214907192575405, "grad_norm": 0.03024054691195488, "grad_norm_var": 4.3383331220815525e-06, "learning_rate": 0.007698847500396404, "loss": 2.5849, "step": 9162 }, { "crossentropy": 2.5914647579193115, "epoch": 0.33218532482598606, "grad_norm": 0.027309617027640343, "grad_norm_var": 4.78777531862846e-06, "learning_rate": 0.0076983583150569005, "loss": 2.6121, "step": 9163 }, { "crossentropy": 2.623797655105591, "epoch": 0.3322215777262181, "grad_norm": 0.03386003524065018, "grad_norm_var": 5.64508681499265e-06, "learning_rate": 0.007697869093271972, "loss": 2.656, "step": 9164 }, { "crossentropy": 2.5141687393188477, "epoch": 0.3322578306264501, "grad_norm": 0.02976181171834469, "grad_norm_var": 5.663911971246965e-06, "learning_rate": 0.007697379835048226, "loss": 2.4992, "step": 9165 }, { "crossentropy": 2.5389673709869385, "epoch": 0.33229408352668216, "grad_norm": 0.030408699065446854, "grad_norm_var": 5.592065311081047e-06, "learning_rate": 0.007696890540392275, "loss": 2.5811, "step": 9166 }, { "crossentropy": 2.612596273422241, "epoch": 0.3323303364269142, "grad_norm": 0.03210320323705673, "grad_norm_var": 5.6470191320697005e-06, "learning_rate": 0.007696401209310723, "loss": 2.677, "step": 9167 }, { "crossentropy": 2.704690933227539, "epoch": 0.3323665893271462, "grad_norm": 0.028997180983424187, "grad_norm_var": 5.70201196133611e-06, "learning_rate": 0.0076959118418101794, "loss": 2.6129, "step": 9168 }, { "crossentropy": 2.776482343673706, "epoch": 0.3324028422273782, "grad_norm": 0.029991911724209785, "grad_norm_var": 5.445692394325527e-06, "learning_rate": 0.007695422437897257, "loss": 2.6713, "step": 9169 }, { "crossentropy": 2.7484591007232666, "epoch": 0.3324390951276102, "grad_norm": 0.02880789525806904, "grad_norm_var": 5.330804047516549e-06, "learning_rate": 0.007694932997578564, "loss": 2.7104, "step": 9170 }, { "crossentropy": 2.6805038452148438, "epoch": 0.33247534802784223, "grad_norm": 0.03083760291337967, "grad_norm_var": 4.767934567041993e-06, "learning_rate": 0.007694443520860711, "loss": 2.6544, "step": 9171 }, { "crossentropy": 2.7099502086639404, "epoch": 0.33251160092807425, "grad_norm": 0.0302901491522789, "grad_norm_var": 3.375586176141198e-06, "learning_rate": 0.007693954007750308, "loss": 2.6849, "step": 9172 }, { "crossentropy": 2.676574468612671, "epoch": 0.33254785382830626, "grad_norm": 0.02914900705218315, "grad_norm_var": 3.1306002516319373e-06, "learning_rate": 0.007693464458253969, "loss": 2.629, "step": 9173 }, { "crossentropy": 2.6580822467803955, "epoch": 0.3325841067285383, "grad_norm": 0.030772242695093155, "grad_norm_var": 2.2606240928991125e-06, "learning_rate": 0.007692974872378306, "loss": 2.635, "step": 9174 }, { "crossentropy": 2.6940970420837402, "epoch": 0.3326203596287703, "grad_norm": 0.030306970700621605, "grad_norm_var": 2.2365797826592725e-06, "learning_rate": 0.0076924852501299296, "loss": 2.6126, "step": 9175 }, { "crossentropy": 2.598569869995117, "epoch": 0.3326566125290023, "grad_norm": 0.02719101868569851, "grad_norm_var": 2.736359824797085e-06, "learning_rate": 0.007691995591515453, "loss": 2.6923, "step": 9176 }, { "crossentropy": 2.564750909805298, "epoch": 0.3326928654292343, "grad_norm": 0.029448801651597023, "grad_norm_var": 2.6431718000466746e-06, "learning_rate": 0.007691505896541491, "loss": 2.5915, "step": 9177 }, { "crossentropy": 2.6504034996032715, "epoch": 0.33272911832946633, "grad_norm": 0.027874691411852837, "grad_norm_var": 2.9068041980778813e-06, "learning_rate": 0.007691016165214658, "loss": 2.6327, "step": 9178 }, { "crossentropy": 2.687884569168091, "epoch": 0.3327653712296984, "grad_norm": 0.030339796096086502, "grad_norm_var": 2.4666549551419574e-06, "learning_rate": 0.007690526397541569, "loss": 2.6644, "step": 9179 }, { "crossentropy": 2.6060791015625, "epoch": 0.3328016241299304, "grad_norm": 0.02922920510172844, "grad_norm_var": 1.4290279245476416e-06, "learning_rate": 0.007690036593528835, "loss": 2.6702, "step": 9180 }, { "crossentropy": 2.501706123352051, "epoch": 0.33283787703016243, "grad_norm": 0.02718886360526085, "grad_norm_var": 1.8282274882643375e-06, "learning_rate": 0.007689546753183077, "loss": 2.5271, "step": 9181 }, { "crossentropy": 2.655977487564087, "epoch": 0.33287412993039445, "grad_norm": 0.027989938855171204, "grad_norm_var": 1.919712172329076e-06, "learning_rate": 0.007689056876510907, "loss": 2.6403, "step": 9182 }, { "crossentropy": 2.5734126567840576, "epoch": 0.33291038283062646, "grad_norm": 0.02803667075932026, "grad_norm_var": 1.4915815642418373e-06, "learning_rate": 0.007688566963518946, "loss": 2.6602, "step": 9183 }, { "crossentropy": 2.702840566635132, "epoch": 0.3329466357308585, "grad_norm": 0.03011120669543743, "grad_norm_var": 1.5459659036870868e-06, "learning_rate": 0.007688077014213806, "loss": 2.632, "step": 9184 }, { "crossentropy": 2.6960251331329346, "epoch": 0.3329828886310905, "grad_norm": 0.02988901548087597, "grad_norm_var": 1.536076808636433e-06, "learning_rate": 0.007687587028602105, "loss": 2.635, "step": 9185 }, { "crossentropy": 2.7063682079315186, "epoch": 0.3330191415313225, "grad_norm": 0.03210793435573578, "grad_norm_var": 2.0369551271522903e-06, "learning_rate": 0.0076870970066904645, "loss": 2.652, "step": 9186 }, { "crossentropy": 2.589226007461548, "epoch": 0.3330553944315545, "grad_norm": 0.030808907002210617, "grad_norm_var": 2.0315929823921334e-06, "learning_rate": 0.007686606948485501, "loss": 2.6209, "step": 9187 }, { "crossentropy": 2.4780311584472656, "epoch": 0.33309164733178653, "grad_norm": 0.02996891736984253, "grad_norm_var": 2.000811672912235e-06, "learning_rate": 0.007686116853993835, "loss": 2.5653, "step": 9188 }, { "crossentropy": 2.7142560482025146, "epoch": 0.33312790023201855, "grad_norm": 0.03379252552986145, "grad_norm_var": 3.192544140845109e-06, "learning_rate": 0.007685626723222082, "loss": 2.7117, "step": 9189 }, { "crossentropy": 2.6607697010040283, "epoch": 0.33316415313225056, "grad_norm": 0.031715378165245056, "grad_norm_var": 3.3841004026289308e-06, "learning_rate": 0.007685136556176866, "loss": 2.6585, "step": 9190 }, { "crossentropy": 2.590608596801758, "epoch": 0.3332004060324826, "grad_norm": 0.03232571482658386, "grad_norm_var": 3.7887285706930097e-06, "learning_rate": 0.007684646352864806, "loss": 2.5808, "step": 9191 }, { "crossentropy": 2.431283950805664, "epoch": 0.3332366589327146, "grad_norm": 0.03141101449728012, "grad_norm_var": 3.3909124479868187e-06, "learning_rate": 0.007684156113292522, "loss": 2.512, "step": 9192 }, { "crossentropy": 2.6986026763916016, "epoch": 0.33327291183294666, "grad_norm": 0.03013257123529911, "grad_norm_var": 3.357125779725291e-06, "learning_rate": 0.007683665837466637, "loss": 2.655, "step": 9193 }, { "crossentropy": 2.5184409618377686, "epoch": 0.3333091647331787, "grad_norm": 0.03277841955423355, "grad_norm_var": 3.3510236077921583e-06, "learning_rate": 0.0076831755253937714, "loss": 2.5091, "step": 9194 }, { "crossentropy": 2.60662841796875, "epoch": 0.3333454176334107, "grad_norm": 0.03034161776304245, "grad_norm_var": 3.350987543658014e-06, "learning_rate": 0.007682685177080551, "loss": 2.61, "step": 9195 }, { "crossentropy": 2.459303617477417, "epoch": 0.3333816705336427, "grad_norm": 0.028264135122299194, "grad_norm_var": 3.5713342837924433e-06, "learning_rate": 0.007682194792533592, "loss": 2.5695, "step": 9196 }, { "crossentropy": 2.6765551567077637, "epoch": 0.3334179234338747, "grad_norm": 0.0309275034815073, "grad_norm_var": 2.8297995505085673e-06, "learning_rate": 0.007681704371759526, "loss": 2.6561, "step": 9197 }, { "crossentropy": 2.6026458740234375, "epoch": 0.33345417633410673, "grad_norm": 0.03239921107888222, "grad_norm_var": 2.47364399157481e-06, "learning_rate": 0.0076812139147649705, "loss": 2.6684, "step": 9198 }, { "crossentropy": 2.7261552810668945, "epoch": 0.33349042923433875, "grad_norm": 0.02926883101463318, "grad_norm_var": 2.0918508311547346e-06, "learning_rate": 0.007680723421556552, "loss": 2.6602, "step": 9199 }, { "crossentropy": 2.718733787536621, "epoch": 0.33352668213457076, "grad_norm": 0.02767540141940117, "grad_norm_var": 2.756260069737892e-06, "learning_rate": 0.007680232892140897, "loss": 2.6394, "step": 9200 }, { "crossentropy": 2.6155664920806885, "epoch": 0.3335629350348028, "grad_norm": 0.030472299084067345, "grad_norm_var": 2.701780296394474e-06, "learning_rate": 0.007679742326524628, "loss": 2.6631, "step": 9201 }, { "crossentropy": 2.7101001739501953, "epoch": 0.3335991879350348, "grad_norm": 0.028335560113191605, "grad_norm_var": 2.9833326622962273e-06, "learning_rate": 0.007679251724714373, "loss": 2.7127, "step": 9202 }, { "crossentropy": 2.6386618614196777, "epoch": 0.3336354408352668, "grad_norm": 0.029128916561603546, "grad_norm_var": 3.1271877731141634e-06, "learning_rate": 0.007678761086716758, "loss": 2.6588, "step": 9203 }, { "crossentropy": 2.649798631668091, "epoch": 0.3336716937354988, "grad_norm": 0.02947627194225788, "grad_norm_var": 3.1810921310012247e-06, "learning_rate": 0.007678270412538408, "loss": 2.615, "step": 9204 }, { "crossentropy": 2.5908637046813965, "epoch": 0.33370794663573083, "grad_norm": 0.03138449415564537, "grad_norm_var": 2.4953088401835054e-06, "learning_rate": 0.007677779702185954, "loss": 2.6455, "step": 9205 }, { "crossentropy": 2.5506513118743896, "epoch": 0.3337441995359629, "grad_norm": 0.03242257609963417, "grad_norm_var": 2.6527351970386435e-06, "learning_rate": 0.007677288955666019, "loss": 2.6027, "step": 9206 }, { "crossentropy": 2.5837833881378174, "epoch": 0.3337804524361949, "grad_norm": 0.03210555389523506, "grad_norm_var": 2.599867783270535e-06, "learning_rate": 0.007676798172985236, "loss": 2.594, "step": 9207 }, { "crossentropy": 2.507333755493164, "epoch": 0.33381670533642693, "grad_norm": 0.036422211676836014, "grad_norm_var": 4.839698941488945e-06, "learning_rate": 0.007676307354150229, "loss": 2.5664, "step": 9208 }, { "crossentropy": 2.5756351947784424, "epoch": 0.33385295823665895, "grad_norm": 0.03182593360543251, "grad_norm_var": 4.886065790401248e-06, "learning_rate": 0.007675816499167632, "loss": 2.6151, "step": 9209 }, { "crossentropy": 2.581514358520508, "epoch": 0.33388921113689096, "grad_norm": 0.029488811269402504, "grad_norm_var": 4.706406186992933e-06, "learning_rate": 0.00767532560804407, "loss": 2.6016, "step": 9210 }, { "crossentropy": 2.6393795013427734, "epoch": 0.333925464037123, "grad_norm": 0.029864361509680748, "grad_norm_var": 4.7384335256917995e-06, "learning_rate": 0.007674834680786178, "loss": 2.6639, "step": 9211 }, { "crossentropy": 2.499680995941162, "epoch": 0.333961716937355, "grad_norm": 0.028759026899933815, "grad_norm_var": 4.600176344933017e-06, "learning_rate": 0.007674343717400583, "loss": 2.531, "step": 9212 }, { "crossentropy": 2.666182518005371, "epoch": 0.333997969837587, "grad_norm": 0.031797681003808975, "grad_norm_var": 4.6829115425188325e-06, "learning_rate": 0.007673852717893919, "loss": 2.6857, "step": 9213 }, { "crossentropy": 2.650050640106201, "epoch": 0.334034222737819, "grad_norm": 0.02960246615111828, "grad_norm_var": 4.529448345687072e-06, "learning_rate": 0.007673361682272815, "loss": 2.691, "step": 9214 }, { "crossentropy": 2.556401014328003, "epoch": 0.33407047563805103, "grad_norm": 0.028491158038377762, "grad_norm_var": 4.695103366364415e-06, "learning_rate": 0.007672870610543905, "loss": 2.5353, "step": 9215 }, { "crossentropy": 2.5977835655212402, "epoch": 0.33410672853828305, "grad_norm": 0.028932351619005203, "grad_norm_var": 4.328292049501856e-06, "learning_rate": 0.007672379502713821, "loss": 2.625, "step": 9216 }, { "crossentropy": 2.491152763366699, "epoch": 0.33414298143851506, "grad_norm": 0.031067606061697006, "grad_norm_var": 4.3457142740021265e-06, "learning_rate": 0.007671888358789196, "loss": 2.5558, "step": 9217 }, { "crossentropy": 2.526679515838623, "epoch": 0.3341792343387471, "grad_norm": 0.03190157562494278, "grad_norm_var": 4.0785334605488204e-06, "learning_rate": 0.007671397178776664, "loss": 2.6394, "step": 9218 }, { "crossentropy": 2.592935562133789, "epoch": 0.3342154872389791, "grad_norm": 0.03520284965634346, "grad_norm_var": 5.037514721010225e-06, "learning_rate": 0.007670905962682862, "loss": 2.6414, "step": 9219 }, { "crossentropy": 2.6829419136047363, "epoch": 0.33425174013921116, "grad_norm": 0.03228231146931648, "grad_norm_var": 4.895358848585179e-06, "learning_rate": 0.0076704147105144185, "loss": 2.6679, "step": 9220 }, { "crossentropy": 2.551234006881714, "epoch": 0.3342879930394432, "grad_norm": 0.03130362927913666, "grad_norm_var": 4.895362587790479e-06, "learning_rate": 0.0076699234222779725, "loss": 2.5918, "step": 9221 }, { "crossentropy": 2.733003616333008, "epoch": 0.3343242459396752, "grad_norm": 0.028706055134534836, "grad_norm_var": 5.223122370514912e-06, "learning_rate": 0.007669432097980159, "loss": 2.6216, "step": 9222 }, { "crossentropy": 2.550475597381592, "epoch": 0.3343604988399072, "grad_norm": 0.028124528005719185, "grad_norm_var": 5.685001483299182e-06, "learning_rate": 0.007668940737627614, "loss": 2.503, "step": 9223 }, { "crossentropy": 2.6882057189941406, "epoch": 0.3343967517401392, "grad_norm": 0.029328113421797752, "grad_norm_var": 3.5699497133623765e-06, "learning_rate": 0.007668449341226975, "loss": 2.6617, "step": 9224 }, { "crossentropy": 2.7464141845703125, "epoch": 0.33443300464037123, "grad_norm": 0.028898637741804123, "grad_norm_var": 3.555758184004461e-06, "learning_rate": 0.007667957908784878, "loss": 2.6586, "step": 9225 }, { "crossentropy": 2.6560070514678955, "epoch": 0.33446925754060325, "grad_norm": 0.031087210401892662, "grad_norm_var": 3.5565282250075877e-06, "learning_rate": 0.007667466440307959, "loss": 2.6025, "step": 9226 }, { "crossentropy": 2.5755815505981445, "epoch": 0.33450551044083526, "grad_norm": 0.02964962273836136, "grad_norm_var": 3.5728668367447966e-06, "learning_rate": 0.007666974935802859, "loss": 2.6202, "step": 9227 }, { "crossentropy": 2.529360771179199, "epoch": 0.3345417633410673, "grad_norm": 0.02743188478052616, "grad_norm_var": 3.959330165649343e-06, "learning_rate": 0.007666483395276214, "loss": 2.6272, "step": 9228 }, { "crossentropy": 2.728325366973877, "epoch": 0.3345780162412993, "grad_norm": 0.027390990406274796, "grad_norm_var": 4.256596867026095e-06, "learning_rate": 0.007665991818734666, "loss": 2.6666, "step": 9229 }, { "crossentropy": 2.6195068359375, "epoch": 0.3346142691415313, "grad_norm": 0.028958041220903397, "grad_norm_var": 4.313492709608936e-06, "learning_rate": 0.007665500206184851, "loss": 2.6467, "step": 9230 }, { "crossentropy": 2.562764883041382, "epoch": 0.3346505220417633, "grad_norm": 0.029011759907007217, "grad_norm_var": 4.231092180017176e-06, "learning_rate": 0.007665008557633412, "loss": 2.6275, "step": 9231 }, { "crossentropy": 2.47160267829895, "epoch": 0.33468677494199534, "grad_norm": 0.028675241395831108, "grad_norm_var": 4.27027549571971e-06, "learning_rate": 0.007664516873086988, "loss": 2.6094, "step": 9232 }, { "crossentropy": 2.6980535984039307, "epoch": 0.3347230278422274, "grad_norm": 0.027885718271136284, "grad_norm_var": 4.424133523788879e-06, "learning_rate": 0.007664025152552219, "loss": 2.683, "step": 9233 }, { "crossentropy": 2.5880606174468994, "epoch": 0.3347592807424594, "grad_norm": 0.02886146493256092, "grad_norm_var": 4.125538656207201e-06, "learning_rate": 0.007663533396035749, "loss": 2.6375, "step": 9234 }, { "crossentropy": 2.6764485836029053, "epoch": 0.33479553364269143, "grad_norm": 0.029901087284088135, "grad_norm_var": 1.8862369082752917e-06, "learning_rate": 0.007663041603544216, "loss": 2.6043, "step": 9235 }, { "crossentropy": 2.6568310260772705, "epoch": 0.33483178654292345, "grad_norm": 0.029550826177001, "grad_norm_var": 1.2367227698047157e-06, "learning_rate": 0.007662549775084268, "loss": 2.6686, "step": 9236 }, { "crossentropy": 2.6664481163024902, "epoch": 0.33486803944315546, "grad_norm": 0.030087511986494064, "grad_norm_var": 9.633763122083098e-07, "learning_rate": 0.0076620579106625435, "loss": 2.6095, "step": 9237 }, { "crossentropy": 2.612253189086914, "epoch": 0.3349042923433875, "grad_norm": 0.03395402058959007, "grad_norm_var": 2.4987529784798247e-06, "learning_rate": 0.007661566010285685, "loss": 2.5591, "step": 9238 }, { "crossentropy": 2.6732304096221924, "epoch": 0.3349405452436195, "grad_norm": 0.03417845070362091, "grad_norm_var": 3.840716390364933e-06, "learning_rate": 0.007661074073960342, "loss": 2.6841, "step": 9239 }, { "crossentropy": 2.5914385318756104, "epoch": 0.3349767981438515, "grad_norm": 0.03077337145805359, "grad_norm_var": 3.903809949661457e-06, "learning_rate": 0.007660582101693153, "loss": 2.6417, "step": 9240 }, { "crossentropy": 2.5461044311523438, "epoch": 0.3350130510440835, "grad_norm": 0.028192507103085518, "grad_norm_var": 4.016870973655888e-06, "learning_rate": 0.007660090093490765, "loss": 2.6489, "step": 9241 }, { "crossentropy": 2.6266708374023438, "epoch": 0.33504930394431554, "grad_norm": 0.031057097017765045, "grad_norm_var": 4.011455631919697e-06, "learning_rate": 0.007659598049359825, "loss": 2.7328, "step": 9242 }, { "crossentropy": 2.5579373836517334, "epoch": 0.33508555684454755, "grad_norm": 0.029777618125081062, "grad_norm_var": 4.0112362619510214e-06, "learning_rate": 0.007659105969306976, "loss": 2.5413, "step": 9243 }, { "crossentropy": 2.6267619132995605, "epoch": 0.33512180974477956, "grad_norm": 0.03158915042877197, "grad_norm_var": 3.817301811180098e-06, "learning_rate": 0.007658613853338867, "loss": 2.6669, "step": 9244 }, { "crossentropy": 2.4446074962615967, "epoch": 0.3351580626450116, "grad_norm": 0.02886112593114376, "grad_norm_var": 3.4428706167702416e-06, "learning_rate": 0.0076581217014621405, "loss": 2.5071, "step": 9245 }, { "crossentropy": 2.585254430770874, "epoch": 0.3351943155452436, "grad_norm": 0.028220728039741516, "grad_norm_var": 3.5873605405479e-06, "learning_rate": 0.00765762951368345, "loss": 2.5947, "step": 9246 }, { "crossentropy": 2.7676198482513428, "epoch": 0.33523056844547566, "grad_norm": 0.029027357697486877, "grad_norm_var": 3.5852454103054195e-06, "learning_rate": 0.007657137290009437, "loss": 2.6691, "step": 9247 }, { "crossentropy": 2.745873212814331, "epoch": 0.3352668213457077, "grad_norm": 0.02825598604977131, "grad_norm_var": 3.6723590893456783e-06, "learning_rate": 0.007656645030446754, "loss": 2.595, "step": 9248 }, { "crossentropy": 2.6197216510772705, "epoch": 0.3353030742459397, "grad_norm": 0.031970228999853134, "grad_norm_var": 3.557696696229277e-06, "learning_rate": 0.007656152735002047, "loss": 2.6501, "step": 9249 }, { "crossentropy": 2.5104193687438965, "epoch": 0.3353393271461717, "grad_norm": 0.03842021897435188, "grad_norm_var": 7.4780254413736445e-06, "learning_rate": 0.007655660403681969, "loss": 2.6193, "step": 9250 }, { "crossentropy": 2.489877700805664, "epoch": 0.3353755800464037, "grad_norm": 0.029565975069999695, "grad_norm_var": 7.528049963971304e-06, "learning_rate": 0.007655168036493166, "loss": 2.579, "step": 9251 }, { "crossentropy": 2.769374132156372, "epoch": 0.33541183294663574, "grad_norm": 0.029330233111977577, "grad_norm_var": 7.569086530650202e-06, "learning_rate": 0.007654675633442289, "loss": 2.6607, "step": 9252 }, { "crossentropy": 2.704545497894287, "epoch": 0.33544808584686775, "grad_norm": 0.0295021440833807, "grad_norm_var": 7.648363143702522e-06, "learning_rate": 0.007654183194535987, "loss": 2.6052, "step": 9253 }, { "crossentropy": 2.60969614982605, "epoch": 0.33548433874709976, "grad_norm": 0.030797846615314484, "grad_norm_var": 6.94041198441338e-06, "learning_rate": 0.007653690719780917, "loss": 2.5831, "step": 9254 }, { "crossentropy": 2.62318754196167, "epoch": 0.3355205916473318, "grad_norm": 0.0387810654938221, "grad_norm_var": 1.0463513499006187e-05, "learning_rate": 0.007653198209183725, "loss": 2.7, "step": 9255 }, { "crossentropy": 2.6819865703582764, "epoch": 0.3355568445475638, "grad_norm": 0.03068375028669834, "grad_norm_var": 1.0465321508791694e-05, "learning_rate": 0.007652705662751064, "loss": 2.6865, "step": 9256 }, { "crossentropy": 2.681039571762085, "epoch": 0.3355930974477958, "grad_norm": 0.029850507155060768, "grad_norm_var": 1.0043665581541345e-05, "learning_rate": 0.007652213080489587, "loss": 2.6938, "step": 9257 }, { "crossentropy": 2.6551668643951416, "epoch": 0.3356293503480278, "grad_norm": 0.03225618228316307, "grad_norm_var": 1.014574429668232e-05, "learning_rate": 0.0076517204624059485, "loss": 2.6246, "step": 9258 }, { "crossentropy": 2.7207651138305664, "epoch": 0.33566560324825984, "grad_norm": 0.03408020734786987, "grad_norm_var": 1.0569592032107627e-05, "learning_rate": 0.007651227808506802, "loss": 2.8057, "step": 9259 }, { "crossentropy": 2.663379430770874, "epoch": 0.3357018561484919, "grad_norm": 0.029482033103704453, "grad_norm_var": 1.0772747653231089e-05, "learning_rate": 0.0076507351187987994, "loss": 2.5628, "step": 9260 }, { "crossentropy": 2.5814528465270996, "epoch": 0.3357381090487239, "grad_norm": 0.028245775029063225, "grad_norm_var": 1.0987724116882694e-05, "learning_rate": 0.007650242393288596, "loss": 2.5961, "step": 9261 }, { "crossentropy": 2.4565727710723877, "epoch": 0.33577436194895594, "grad_norm": 0.02791570872068405, "grad_norm_var": 1.1112848724065723e-05, "learning_rate": 0.007649749631982846, "loss": 2.5686, "step": 9262 }, { "crossentropy": 2.753406047821045, "epoch": 0.33581061484918795, "grad_norm": 0.02855333872139454, "grad_norm_var": 1.1260121044495682e-05, "learning_rate": 0.00764925683488821, "loss": 2.5986, "step": 9263 }, { "crossentropy": 2.5750300884246826, "epoch": 0.33584686774941996, "grad_norm": 0.02936244010925293, "grad_norm_var": 1.0916225728980118e-05, "learning_rate": 0.007648764002011336, "loss": 2.6454, "step": 9264 }, { "crossentropy": 2.5940301418304443, "epoch": 0.335883120649652, "grad_norm": 0.027326637879014015, "grad_norm_var": 1.1771456202618726e-05, "learning_rate": 0.007648271133358886, "loss": 2.5705, "step": 9265 }, { "crossentropy": 2.6121320724487305, "epoch": 0.335919373549884, "grad_norm": 0.02796352282166481, "grad_norm_var": 8.09904547740361e-06, "learning_rate": 0.0076477782289375166, "loss": 2.6498, "step": 9266 }, { "crossentropy": 2.6829750537872314, "epoch": 0.335955626450116, "grad_norm": 0.033123940229415894, "grad_norm_var": 8.574714938828345e-06, "learning_rate": 0.007647285288753884, "loss": 2.6049, "step": 9267 }, { "crossentropy": 2.626328706741333, "epoch": 0.335991879350348, "grad_norm": 0.0375724695622921, "grad_norm_var": 1.158623377055662e-05, "learning_rate": 0.0076467923128146435, "loss": 2.6597, "step": 9268 }, { "crossentropy": 2.648613691329956, "epoch": 0.33602813225058004, "grad_norm": 0.035298414528369904, "grad_norm_var": 1.2552702176768732e-05, "learning_rate": 0.0076462993011264585, "loss": 2.6372, "step": 9269 }, { "crossentropy": 2.6249804496765137, "epoch": 0.33606438515081205, "grad_norm": 0.033121369779109955, "grad_norm_var": 1.2724993930448185e-05, "learning_rate": 0.007645806253695985, "loss": 2.6336, "step": 9270 }, { "crossentropy": 2.713839530944824, "epoch": 0.33610063805104406, "grad_norm": 0.0323023721575737, "grad_norm_var": 9.038105344651492e-06, "learning_rate": 0.0076453131705298836, "loss": 2.7055, "step": 9271 }, { "crossentropy": 2.4614622592926025, "epoch": 0.3361368909512761, "grad_norm": 0.02956812083721161, "grad_norm_var": 9.173523106131322e-06, "learning_rate": 0.007644820051634812, "loss": 2.531, "step": 9272 }, { "crossentropy": 2.62546443939209, "epoch": 0.3361731438515081, "grad_norm": 0.028749557211995125, "grad_norm_var": 9.418228042146608e-06, "learning_rate": 0.007644326897017433, "loss": 2.6709, "step": 9273 }, { "crossentropy": 2.6354732513427734, "epoch": 0.33620939675174016, "grad_norm": 0.028901757672429085, "grad_norm_var": 9.529521096364096e-06, "learning_rate": 0.007643833706684406, "loss": 2.5212, "step": 9274 }, { "crossentropy": 2.554077386856079, "epoch": 0.3362456496519722, "grad_norm": 0.028605222702026367, "grad_norm_var": 8.95221740798712e-06, "learning_rate": 0.007643340480642393, "loss": 2.5799, "step": 9275 }, { "crossentropy": 2.5252232551574707, "epoch": 0.3362819025522042, "grad_norm": 0.027850273996591568, "grad_norm_var": 9.314173461284061e-06, "learning_rate": 0.0076428472188980546, "loss": 2.5962, "step": 9276 }, { "crossentropy": 2.444181203842163, "epoch": 0.3363181554524362, "grad_norm": 0.03059573471546173, "grad_norm_var": 9.022311917874538e-06, "learning_rate": 0.007642353921458054, "loss": 2.4833, "step": 9277 }, { "crossentropy": 2.674022674560547, "epoch": 0.3363544083526682, "grad_norm": 0.030204281210899353, "grad_norm_var": 8.58375947638771e-06, "learning_rate": 0.007641860588329055, "loss": 2.6484, "step": 9278 }, { "crossentropy": 2.628432035446167, "epoch": 0.33639066125290024, "grad_norm": 0.029756907373666763, "grad_norm_var": 8.350876266887763e-06, "learning_rate": 0.00764136721951772, "loss": 2.6529, "step": 9279 }, { "crossentropy": 2.459803581237793, "epoch": 0.33642691415313225, "grad_norm": 0.027782732620835304, "grad_norm_var": 8.77676260444433e-06, "learning_rate": 0.007640873815030709, "loss": 2.5092, "step": 9280 }, { "crossentropy": 2.698922634124756, "epoch": 0.33646316705336426, "grad_norm": 0.028554493561387062, "grad_norm_var": 8.34406430133064e-06, "learning_rate": 0.007640380374874692, "loss": 2.7108, "step": 9281 }, { "crossentropy": 2.6111950874328613, "epoch": 0.3364994199535963, "grad_norm": 0.03059789165854454, "grad_norm_var": 7.844038274604226e-06, "learning_rate": 0.0076398868990563305, "loss": 2.6734, "step": 9282 }, { "crossentropy": 2.6764628887176514, "epoch": 0.3365356728538283, "grad_norm": 0.03014412149786949, "grad_norm_var": 7.4703475667107016e-06, "learning_rate": 0.00763939338758229, "loss": 2.5358, "step": 9283 }, { "crossentropy": 2.576141834259033, "epoch": 0.3365719257540603, "grad_norm": 0.03051038458943367, "grad_norm_var": 4.0223931809870696e-06, "learning_rate": 0.007638899840459237, "loss": 2.5758, "step": 9284 }, { "crossentropy": 2.5803771018981934, "epoch": 0.3366081786542923, "grad_norm": 0.028953731060028076, "grad_norm_var": 2.190584175019936e-06, "learning_rate": 0.007638406257693837, "loss": 2.5393, "step": 9285 }, { "crossentropy": 2.6769704818725586, "epoch": 0.33664443155452434, "grad_norm": 0.028638532385230064, "grad_norm_var": 1.4388988864900137e-06, "learning_rate": 0.007637912639292757, "loss": 2.6769, "step": 9286 }, { "crossentropy": 2.5597050189971924, "epoch": 0.3366806844547564, "grad_norm": 0.027435828000307083, "grad_norm_var": 1.0892069447831214e-06, "learning_rate": 0.007637418985262662, "loss": 2.5879, "step": 9287 }, { "crossentropy": 2.7008650302886963, "epoch": 0.3367169373549884, "grad_norm": 0.0291451383382082, "grad_norm_var": 1.0783927149290532e-06, "learning_rate": 0.007636925295610222, "loss": 2.669, "step": 9288 }, { "crossentropy": 2.704556465148926, "epoch": 0.33675319025522044, "grad_norm": 0.02744150720536709, "grad_norm_var": 1.255459610134159e-06, "learning_rate": 0.007636431570342104, "loss": 2.6437, "step": 9289 }, { "crossentropy": 2.732801675796509, "epoch": 0.33678944315545245, "grad_norm": 0.028498617932200432, "grad_norm_var": 1.2746556594234644e-06, "learning_rate": 0.007635937809464977, "loss": 2.7187, "step": 9290 }, { "crossentropy": 2.7131636142730713, "epoch": 0.33682569605568446, "grad_norm": 0.029421629384160042, "grad_norm_var": 1.2684728381357474e-06, "learning_rate": 0.007635444012985511, "loss": 2.704, "step": 9291 }, { "crossentropy": 2.6085422039031982, "epoch": 0.3368619489559165, "grad_norm": 0.02768051251769066, "grad_norm_var": 1.298464927775702e-06, "learning_rate": 0.0076349501809103715, "loss": 2.5494, "step": 9292 }, { "crossentropy": 2.6367275714874268, "epoch": 0.3368982018561485, "grad_norm": 0.02787773124873638, "grad_norm_var": 1.2127416942268412e-06, "learning_rate": 0.007634456313246233, "loss": 2.5717, "step": 9293 }, { "crossentropy": 2.658590793609619, "epoch": 0.3369344547563805, "grad_norm": 0.027020694687962532, "grad_norm_var": 1.2990285656562834e-06, "learning_rate": 0.007633962409999764, "loss": 2.5588, "step": 9294 }, { "crossentropy": 2.514645576477051, "epoch": 0.3369707076566125, "grad_norm": 0.028139693662524223, "grad_norm_var": 1.2381005635585135e-06, "learning_rate": 0.007633468471177635, "loss": 2.5877, "step": 9295 }, { "crossentropy": 2.5618748664855957, "epoch": 0.33700696055684454, "grad_norm": 0.029543237760663033, "grad_norm_var": 1.2364027049904225e-06, "learning_rate": 0.007632974496786518, "loss": 2.583, "step": 9296 }, { "crossentropy": 2.6033170223236084, "epoch": 0.33704321345707655, "grad_norm": 0.03393002972006798, "grad_norm_var": 2.920050763442168e-06, "learning_rate": 0.007632480486833085, "loss": 2.5773, "step": 9297 }, { "crossentropy": 2.7338080406188965, "epoch": 0.33707946635730857, "grad_norm": 0.03033028170466423, "grad_norm_var": 2.86969569064017e-06, "learning_rate": 0.007631986441324007, "loss": 2.6861, "step": 9298 }, { "crossentropy": 2.491093397140503, "epoch": 0.3371157192575406, "grad_norm": 0.02900717407464981, "grad_norm_var": 2.7837882598237868e-06, "learning_rate": 0.0076314923602659596, "loss": 2.606, "step": 9299 }, { "crossentropy": 2.642392635345459, "epoch": 0.3371519721577726, "grad_norm": 0.0273477490991354, "grad_norm_var": 2.7608153241417387e-06, "learning_rate": 0.007630998243665612, "loss": 2.5724, "step": 9300 }, { "crossentropy": 2.4819469451904297, "epoch": 0.33718822505800466, "grad_norm": 0.02819092571735382, "grad_norm_var": 2.7790809007952402e-06, "learning_rate": 0.007630504091529642, "loss": 2.5795, "step": 9301 }, { "crossentropy": 2.591359853744507, "epoch": 0.3372244779582367, "grad_norm": 0.028164250776171684, "grad_norm_var": 2.798802626256293e-06, "learning_rate": 0.007630009903864723, "loss": 2.5744, "step": 9302 }, { "crossentropy": 2.7556533813476562, "epoch": 0.3372607308584687, "grad_norm": 0.03448085859417915, "grad_norm_var": 4.71481468265472e-06, "learning_rate": 0.007629515680677528, "loss": 2.6822, "step": 9303 }, { "crossentropy": 2.5553503036499023, "epoch": 0.3372969837587007, "grad_norm": 0.0365108847618103, "grad_norm_var": 8.111975449904744e-06, "learning_rate": 0.0076290214219747335, "loss": 2.5892, "step": 9304 }, { "crossentropy": 2.725992441177368, "epoch": 0.3373332366589327, "grad_norm": 0.030776357278227806, "grad_norm_var": 7.84768057757328e-06, "learning_rate": 0.007628527127763015, "loss": 2.7353, "step": 9305 }, { "crossentropy": 2.768026351928711, "epoch": 0.33736948955916474, "grad_norm": 0.02835896983742714, "grad_norm_var": 7.873271210474357e-06, "learning_rate": 0.0076280327980490495, "loss": 2.8034, "step": 9306 }, { "crossentropy": 2.608961820602417, "epoch": 0.33740574245939675, "grad_norm": 0.029304655268788338, "grad_norm_var": 7.880009130603604e-06, "learning_rate": 0.007627538432839511, "loss": 2.6779, "step": 9307 }, { "crossentropy": 2.4647490978240967, "epoch": 0.33744199535962877, "grad_norm": 0.029207171872258186, "grad_norm_var": 7.595975922660434e-06, "learning_rate": 0.007627044032141081, "loss": 2.4809, "step": 9308 }, { "crossentropy": 2.5985734462738037, "epoch": 0.3374782482598608, "grad_norm": 0.03166348487138748, "grad_norm_var": 7.477550800766286e-06, "learning_rate": 0.007626549595960433, "loss": 2.5909, "step": 9309 }, { "crossentropy": 2.432589530944824, "epoch": 0.3375145011600928, "grad_norm": 0.02928946726024151, "grad_norm_var": 6.860642982310592e-06, "learning_rate": 0.007626055124304248, "loss": 2.4691, "step": 9310 }, { "crossentropy": 2.5372729301452637, "epoch": 0.3375507540603248, "grad_norm": 0.02793460711836815, "grad_norm_var": 6.921396867764226e-06, "learning_rate": 0.007625560617179201, "loss": 2.6331, "step": 9311 }, { "crossentropy": 2.6691436767578125, "epoch": 0.3375870069605568, "grad_norm": 0.028357842937111855, "grad_norm_var": 7.1213212391396516e-06, "learning_rate": 0.007625066074591976, "loss": 2.7622, "step": 9312 }, { "crossentropy": 2.5197534561157227, "epoch": 0.33762325986078884, "grad_norm": 0.028246022760868073, "grad_norm_var": 6.297343317954854e-06, "learning_rate": 0.007624571496549249, "loss": 2.5485, "step": 9313 }, { "crossentropy": 2.678311586380005, "epoch": 0.3376595127610209, "grad_norm": 0.040530845522880554, "grad_norm_var": 1.3490273612701955e-05, "learning_rate": 0.0076240768830577, "loss": 2.6067, "step": 9314 }, { "crossentropy": 2.5992274284362793, "epoch": 0.3376957656612529, "grad_norm": 0.029673844575881958, "grad_norm_var": 1.338884828600433e-05, "learning_rate": 0.007623582234124011, "loss": 2.6418, "step": 9315 }, { "crossentropy": 2.740811586380005, "epoch": 0.33773201856148494, "grad_norm": 0.028391461819410324, "grad_norm_var": 1.3017929247491806e-05, "learning_rate": 0.007623087549754864, "loss": 2.6882, "step": 9316 }, { "crossentropy": 2.6375420093536377, "epoch": 0.33776827146171695, "grad_norm": 0.029354941099882126, "grad_norm_var": 1.2733747267991367e-05, "learning_rate": 0.007622592829956938, "loss": 2.6272, "step": 9317 }, { "crossentropy": 2.656681537628174, "epoch": 0.33780452436194897, "grad_norm": 0.029065262526273727, "grad_norm_var": 1.2487019722955691e-05, "learning_rate": 0.007622098074736916, "loss": 2.5556, "step": 9318 }, { "crossentropy": 2.631603956222534, "epoch": 0.337840777262181, "grad_norm": 0.028458036482334137, "grad_norm_var": 1.1715300791782312e-05, "learning_rate": 0.0076216032841014796, "loss": 2.588, "step": 9319 }, { "crossentropy": 2.627718925476074, "epoch": 0.337877030162413, "grad_norm": 0.027894925326108932, "grad_norm_var": 9.24319503262578e-06, "learning_rate": 0.007621108458057313, "loss": 2.6309, "step": 9320 }, { "crossentropy": 2.748093605041504, "epoch": 0.337913283062645, "grad_norm": 0.031143739819526672, "grad_norm_var": 9.30035114997483e-06, "learning_rate": 0.007620613596611099, "loss": 2.6737, "step": 9321 }, { "crossentropy": 2.5533018112182617, "epoch": 0.337949535962877, "grad_norm": 0.03279048204421997, "grad_norm_var": 9.673505891261147e-06, "learning_rate": 0.007620118699769523, "loss": 2.5499, "step": 9322 }, { "crossentropy": 2.6232359409332275, "epoch": 0.33798578886310904, "grad_norm": 0.03353661298751831, "grad_norm_var": 1.0354405877988299e-05, "learning_rate": 0.0076196237675392655, "loss": 2.6242, "step": 9323 }, { "crossentropy": 2.8185348510742188, "epoch": 0.33802204176334105, "grad_norm": 0.03220716863870621, "grad_norm_var": 1.0461305179772172e-05, "learning_rate": 0.007619128799927015, "loss": 2.665, "step": 9324 }, { "crossentropy": 2.649540424346924, "epoch": 0.33805829466357307, "grad_norm": 0.030887264758348465, "grad_norm_var": 1.0382031363634822e-05, "learning_rate": 0.007618633796939453, "loss": 2.5939, "step": 9325 }, { "crossentropy": 2.7624430656433105, "epoch": 0.3380945475638051, "grad_norm": 0.03041796013712883, "grad_norm_var": 1.0281714431927902e-05, "learning_rate": 0.007618138758583271, "loss": 2.6364, "step": 9326 }, { "crossentropy": 2.752711057662964, "epoch": 0.3381308004640371, "grad_norm": 0.03230144828557968, "grad_norm_var": 9.947432889538402e-06, "learning_rate": 0.00761764368486515, "loss": 2.6938, "step": 9327 }, { "crossentropy": 2.7170767784118652, "epoch": 0.33816705336426917, "grad_norm": 0.03027164749801159, "grad_norm_var": 9.54587142287081e-06, "learning_rate": 0.0076171485757917795, "loss": 2.7751, "step": 9328 }, { "crossentropy": 2.6669886112213135, "epoch": 0.3382033062645012, "grad_norm": 0.028707416728138924, "grad_norm_var": 9.392939142074259e-06, "learning_rate": 0.007616653431369844, "loss": 2.4476, "step": 9329 }, { "crossentropy": 2.749934434890747, "epoch": 0.3382395591647332, "grad_norm": 0.02857685647904873, "grad_norm_var": 3.096618479133288e-06, "learning_rate": 0.007616158251606034, "loss": 2.7008, "step": 9330 }, { "crossentropy": 2.6242027282714844, "epoch": 0.3382758120649652, "grad_norm": 0.03024233877658844, "grad_norm_var": 3.074665841132117e-06, "learning_rate": 0.007615663036507037, "loss": 2.6507, "step": 9331 }, { "crossentropy": 2.5455663204193115, "epoch": 0.3383120649651972, "grad_norm": 0.02941014990210533, "grad_norm_var": 2.8849860040374193e-06, "learning_rate": 0.007615167786079542, "loss": 2.5608, "step": 9332 }, { "crossentropy": 2.4872779846191406, "epoch": 0.33834831786542924, "grad_norm": 0.0280440766364336, "grad_norm_var": 3.162656344609232e-06, "learning_rate": 0.007614672500330235, "loss": 2.5797, "step": 9333 }, { "crossentropy": 2.7684881687164307, "epoch": 0.33838457076566125, "grad_norm": 0.029713796451687813, "grad_norm_var": 3.0867390596881477e-06, "learning_rate": 0.007614177179265811, "loss": 2.6849, "step": 9334 }, { "crossentropy": 2.4748804569244385, "epoch": 0.33842082366589327, "grad_norm": 0.028297174721956253, "grad_norm_var": 3.1276003603363055e-06, "learning_rate": 0.007613681822892955, "loss": 2.5833, "step": 9335 }, { "crossentropy": 2.647698402404785, "epoch": 0.3384570765661253, "grad_norm": 0.028606729581952095, "grad_norm_var": 2.9331252229253603e-06, "learning_rate": 0.00761318643121836, "loss": 2.6395, "step": 9336 }, { "crossentropy": 2.6923463344573975, "epoch": 0.3384933294663573, "grad_norm": 0.029404891654849052, "grad_norm_var": 2.9316238372217947e-06, "learning_rate": 0.007612691004248716, "loss": 2.6961, "step": 9337 }, { "crossentropy": 2.581517219543457, "epoch": 0.3385295823665893, "grad_norm": 0.03135356679558754, "grad_norm_var": 2.566948720395196e-06, "learning_rate": 0.007612195541990717, "loss": 2.6587, "step": 9338 }, { "crossentropy": 2.6509621143341064, "epoch": 0.3385658352668213, "grad_norm": 0.02858290821313858, "grad_norm_var": 1.846435582570369e-06, "learning_rate": 0.007611700044451051, "loss": 2.7103, "step": 9339 }, { "crossentropy": 2.4699759483337402, "epoch": 0.33860208816705334, "grad_norm": 0.028157494962215424, "grad_norm_var": 1.5792667855455406e-06, "learning_rate": 0.007611204511636414, "loss": 2.4871, "step": 9340 }, { "crossentropy": 2.671877384185791, "epoch": 0.3386383410672854, "grad_norm": 0.029850130900740623, "grad_norm_var": 1.4630904161536581e-06, "learning_rate": 0.007610708943553496, "loss": 2.6911, "step": 9341 }, { "crossentropy": 2.606884479522705, "epoch": 0.3386745939675174, "grad_norm": 0.03426642715930939, "grad_norm_var": 2.8617605021304926e-06, "learning_rate": 0.007610213340208992, "loss": 2.6401, "step": 9342 }, { "crossentropy": 2.5544490814208984, "epoch": 0.33871084686774944, "grad_norm": 0.032805006951093674, "grad_norm_var": 3.0498094829671065e-06, "learning_rate": 0.007609717701609598, "loss": 2.5744, "step": 9343 }, { "crossentropy": 2.6074328422546387, "epoch": 0.33874709976798145, "grad_norm": 0.028186354786157608, "grad_norm_var": 3.1815990923459895e-06, "learning_rate": 0.007609222027762004, "loss": 2.6503, "step": 9344 }, { "crossentropy": 2.7801952362060547, "epoch": 0.33878335266821347, "grad_norm": 0.03886670991778374, "grad_norm_var": 8.371985866457062e-06, "learning_rate": 0.007608726318672907, "loss": 2.6651, "step": 9345 }, { "crossentropy": 2.642402410507202, "epoch": 0.3388196055684455, "grad_norm": 0.02912583015859127, "grad_norm_var": 8.266685360169372e-06, "learning_rate": 0.007608230574349002, "loss": 2.6406, "step": 9346 }, { "crossentropy": 2.5587220191955566, "epoch": 0.3388558584686775, "grad_norm": 0.028320735320448875, "grad_norm_var": 8.514062870055997e-06, "learning_rate": 0.007607734794796987, "loss": 2.5095, "step": 9347 }, { "crossentropy": 2.64056134223938, "epoch": 0.3388921113689095, "grad_norm": 0.02929113246500492, "grad_norm_var": 8.527276003884344e-06, "learning_rate": 0.007607238980023554, "loss": 2.6787, "step": 9348 }, { "crossentropy": 2.6296262741088867, "epoch": 0.3389283642691415, "grad_norm": 0.028553210198879242, "grad_norm_var": 8.39851087008959e-06, "learning_rate": 0.007606743130035402, "loss": 2.6177, "step": 9349 }, { "crossentropy": 2.60146164894104, "epoch": 0.33896461716937354, "grad_norm": 0.029856836423277855, "grad_norm_var": 8.390299711507698e-06, "learning_rate": 0.007606247244839229, "loss": 2.6774, "step": 9350 }, { "crossentropy": 2.801692008972168, "epoch": 0.33900087006960555, "grad_norm": 0.02963295578956604, "grad_norm_var": 8.15929878745894e-06, "learning_rate": 0.007605751324441732, "loss": 2.7041, "step": 9351 }, { "crossentropy": 2.536918878555298, "epoch": 0.33903712296983757, "grad_norm": 0.029097000136971474, "grad_norm_var": 8.063384630599398e-06, "learning_rate": 0.007605255368849609, "loss": 2.5508, "step": 9352 }, { "crossentropy": 2.6756999492645264, "epoch": 0.3390733758700696, "grad_norm": 0.029511194676160812, "grad_norm_var": 8.050915593861563e-06, "learning_rate": 0.0076047593780695566, "loss": 2.6637, "step": 9353 }, { "crossentropy": 2.464339256286621, "epoch": 0.3391096287703016, "grad_norm": 0.031010638922452927, "grad_norm_var": 8.01197151912685e-06, "learning_rate": 0.007604263352108278, "loss": 2.5823, "step": 9354 }, { "crossentropy": 2.693066358566284, "epoch": 0.33914588167053367, "grad_norm": 0.03352891281247139, "grad_norm_var": 8.395575348137061e-06, "learning_rate": 0.007603767290972471, "loss": 2.6384, "step": 9355 }, { "crossentropy": 2.7098817825317383, "epoch": 0.3391821345707657, "grad_norm": 0.03306630253791809, "grad_norm_var": 8.284121061614449e-06, "learning_rate": 0.007603271194668835, "loss": 2.6981, "step": 9356 }, { "crossentropy": 2.6084375381469727, "epoch": 0.3392183874709977, "grad_norm": 0.02983359806239605, "grad_norm_var": 8.286530899295705e-06, "learning_rate": 0.00760277506320407, "loss": 2.6272, "step": 9357 }, { "crossentropy": 2.7595415115356445, "epoch": 0.3392546403712297, "grad_norm": 0.02971390262246132, "grad_norm_var": 7.559414798972754e-06, "learning_rate": 0.007602278896584878, "loss": 2.7056, "step": 9358 }, { "crossentropy": 2.614837169647217, "epoch": 0.3392908932714617, "grad_norm": 0.030343694612383842, "grad_norm_var": 7.230830690977185e-06, "learning_rate": 0.0076017826948179615, "loss": 2.6248, "step": 9359 }, { "crossentropy": 2.590956449508667, "epoch": 0.33932714617169374, "grad_norm": 0.030844347551465034, "grad_norm_var": 6.8537858424318195e-06, "learning_rate": 0.00760128645791002, "loss": 2.6235, "step": 9360 }, { "crossentropy": 2.576280355453491, "epoch": 0.33936339907192575, "grad_norm": 0.02954298071563244, "grad_norm_var": 2.087620764997762e-06, "learning_rate": 0.0076007901858677574, "loss": 2.6614, "step": 9361 }, { "crossentropy": 2.5638506412506104, "epoch": 0.33939965197215777, "grad_norm": 0.027256041765213013, "grad_norm_var": 2.5439021635926052e-06, "learning_rate": 0.007600293878697877, "loss": 2.5392, "step": 9362 }, { "crossentropy": 2.375556468963623, "epoch": 0.3394359048723898, "grad_norm": 0.030483433976769447, "grad_norm_var": 2.362749485716048e-06, "learning_rate": 0.007599797536407083, "loss": 2.5436, "step": 9363 }, { "crossentropy": 2.681108236312866, "epoch": 0.3394721577726218, "grad_norm": 0.029764341190457344, "grad_norm_var": 2.32584315489337e-06, "learning_rate": 0.007599301159002076, "loss": 2.7209, "step": 9364 }, { "crossentropy": 2.591557502746582, "epoch": 0.3395084106728538, "grad_norm": 0.030672404915094376, "grad_norm_var": 2.1617103154610826e-06, "learning_rate": 0.007598804746489563, "loss": 2.6081, "step": 9365 }, { "crossentropy": 2.8817498683929443, "epoch": 0.3395446635730858, "grad_norm": 0.030047999694943428, "grad_norm_var": 2.153720520034254e-06, "learning_rate": 0.007598308298876249, "loss": 2.809, "step": 9366 }, { "crossentropy": 2.6091363430023193, "epoch": 0.33958091647331784, "grad_norm": 0.03117227926850319, "grad_norm_var": 2.170684746582938e-06, "learning_rate": 0.007597811816168836, "loss": 2.6104, "step": 9367 }, { "crossentropy": 2.7349727153778076, "epoch": 0.3396171693735499, "grad_norm": 0.03626033291220665, "grad_norm_var": 4.163758198139526e-06, "learning_rate": 0.007597315298374035, "loss": 2.794, "step": 9368 }, { "crossentropy": 2.6089677810668945, "epoch": 0.3396534222737819, "grad_norm": 0.0330539308488369, "grad_norm_var": 4.3319563845537244e-06, "learning_rate": 0.007596818745498546, "loss": 2.6024, "step": 9369 }, { "crossentropy": 2.593095064163208, "epoch": 0.33968967517401394, "grad_norm": 0.028162511065602303, "grad_norm_var": 4.849031115047343e-06, "learning_rate": 0.007596322157549081, "loss": 2.6267, "step": 9370 }, { "crossentropy": 2.496595859527588, "epoch": 0.33972592807424595, "grad_norm": 0.02748849056661129, "grad_norm_var": 4.979281565296274e-06, "learning_rate": 0.007595825534532346, "loss": 2.6393, "step": 9371 }, { "crossentropy": 2.5802199840545654, "epoch": 0.33976218097447797, "grad_norm": 0.029060762375593185, "grad_norm_var": 4.601669377719585e-06, "learning_rate": 0.007595328876455047, "loss": 2.613, "step": 9372 }, { "crossentropy": 2.4196057319641113, "epoch": 0.33979843387471, "grad_norm": 0.028819620609283447, "grad_norm_var": 4.719699006876186e-06, "learning_rate": 0.007594832183323892, "loss": 2.5166, "step": 9373 }, { "crossentropy": 2.7022604942321777, "epoch": 0.339834686774942, "grad_norm": 0.028916476294398308, "grad_norm_var": 4.807717132035413e-06, "learning_rate": 0.007594335455145592, "loss": 2.6644, "step": 9374 }, { "crossentropy": 2.601405620574951, "epoch": 0.339870939675174, "grad_norm": 0.029111571609973907, "grad_norm_var": 4.865539197110071e-06, "learning_rate": 0.007593838691926855, "loss": 2.5663, "step": 9375 }, { "crossentropy": 2.6103692054748535, "epoch": 0.339907192575406, "grad_norm": 0.030755478888750076, "grad_norm_var": 4.8565149395212904e-06, "learning_rate": 0.007593341893674388, "loss": 2.7393, "step": 9376 }, { "crossentropy": 2.697308301925659, "epoch": 0.33994344547563804, "grad_norm": 0.027628982439637184, "grad_norm_var": 5.211178059531446e-06, "learning_rate": 0.007592845060394905, "loss": 2.5578, "step": 9377 }, { "crossentropy": 2.7011988162994385, "epoch": 0.33997969837587005, "grad_norm": 0.029117802157998085, "grad_norm_var": 4.767539373617032e-06, "learning_rate": 0.007592348192095115, "loss": 2.761, "step": 9378 }, { "crossentropy": 2.5335793495178223, "epoch": 0.34001595127610207, "grad_norm": 0.031173283234238625, "grad_norm_var": 4.838780073349961e-06, "learning_rate": 0.007591851288781727, "loss": 2.5311, "step": 9379 }, { "crossentropy": 2.8267173767089844, "epoch": 0.3400522041763341, "grad_norm": 0.036321818828582764, "grad_norm_var": 7.254351185178665e-06, "learning_rate": 0.007591354350461455, "loss": 2.6868, "step": 9380 }, { "crossentropy": 2.452164888381958, "epoch": 0.34008845707656615, "grad_norm": 0.03342359885573387, "grad_norm_var": 7.79607703381943e-06, "learning_rate": 0.00759085737714101, "loss": 2.5007, "step": 9381 }, { "crossentropy": 2.542961359024048, "epoch": 0.34012470997679817, "grad_norm": 0.03031633235514164, "grad_norm_var": 7.778781988560647e-06, "learning_rate": 0.007590360368827104, "loss": 2.5319, "step": 9382 }, { "crossentropy": 2.64341402053833, "epoch": 0.3401609628770302, "grad_norm": 0.028100989758968353, "grad_norm_var": 8.164266581887888e-06, "learning_rate": 0.00758986332552645, "loss": 2.6328, "step": 9383 }, { "crossentropy": 2.6246445178985596, "epoch": 0.3401972157772622, "grad_norm": 0.02694828435778618, "grad_norm_var": 6.409490212497939e-06, "learning_rate": 0.007589366247245761, "loss": 2.5871, "step": 9384 }, { "crossentropy": 2.5176095962524414, "epoch": 0.3402334686774942, "grad_norm": 0.027903130277991295, "grad_norm_var": 5.901623205491607e-06, "learning_rate": 0.007588869133991754, "loss": 2.5197, "step": 9385 }, { "crossentropy": 2.738125801086426, "epoch": 0.3402697215777262, "grad_norm": 0.02643069066107273, "grad_norm_var": 6.415939368649368e-06, "learning_rate": 0.007588371985771138, "loss": 2.7397, "step": 9386 }, { "crossentropy": 2.6234970092773438, "epoch": 0.34030597447795824, "grad_norm": 0.0280669666826725, "grad_norm_var": 6.284032866474956e-06, "learning_rate": 0.007587874802590631, "loss": 2.6143, "step": 9387 }, { "crossentropy": 2.684082508087158, "epoch": 0.34034222737819025, "grad_norm": 0.027782846242189407, "grad_norm_var": 6.461960988810355e-06, "learning_rate": 0.007587377584456948, "loss": 2.5954, "step": 9388 }, { "crossentropy": 2.6715214252471924, "epoch": 0.34037848027842227, "grad_norm": 0.028059110045433044, "grad_norm_var": 6.55960910489529e-06, "learning_rate": 0.007586880331376804, "loss": 2.5973, "step": 9389 }, { "crossentropy": 2.5919346809387207, "epoch": 0.3404147331786543, "grad_norm": 0.027827007696032524, "grad_norm_var": 6.70092005000387e-06, "learning_rate": 0.007586383043356915, "loss": 2.5376, "step": 9390 }, { "crossentropy": 2.519784927368164, "epoch": 0.3404509860788863, "grad_norm": 0.027272311970591545, "grad_norm_var": 6.961132305907669e-06, "learning_rate": 0.007585885720403999, "loss": 2.5069, "step": 9391 }, { "crossentropy": 2.687087059020996, "epoch": 0.3404872389791183, "grad_norm": 0.02689965069293976, "grad_norm_var": 7.088364473093779e-06, "learning_rate": 0.0075853883625247726, "loss": 2.6516, "step": 9392 }, { "crossentropy": 2.427934408187866, "epoch": 0.3405234918793503, "grad_norm": 0.0274808406829834, "grad_norm_var": 7.115919025499568e-06, "learning_rate": 0.00758489096972595, "loss": 2.613, "step": 9393 }, { "crossentropy": 2.560753583908081, "epoch": 0.34055974477958234, "grad_norm": 0.028175104409456253, "grad_norm_var": 7.149778081896816e-06, "learning_rate": 0.007584393542014254, "loss": 2.5419, "step": 9394 }, { "crossentropy": 2.6344258785247803, "epoch": 0.3405959976798144, "grad_norm": 0.02750755473971367, "grad_norm_var": 6.8718669463149145e-06, "learning_rate": 0.0075838960793964015, "loss": 2.6456, "step": 9395 }, { "crossentropy": 2.7179882526397705, "epoch": 0.3406322505800464, "grad_norm": 0.02928139641880989, "grad_norm_var": 2.7749457043997933e-06, "learning_rate": 0.007583398581879111, "loss": 2.8092, "step": 9396 }, { "crossentropy": 2.4597816467285156, "epoch": 0.34066850348027844, "grad_norm": 0.028845621272921562, "grad_norm_var": 9.068663208326606e-07, "learning_rate": 0.007582901049469102, "loss": 2.5443, "step": 9397 }, { "crossentropy": 2.5151286125183105, "epoch": 0.34070475638051045, "grad_norm": 0.02888481691479683, "grad_norm_var": 5.796801822999097e-07, "learning_rate": 0.007582403482173095, "loss": 2.5715, "step": 9398 }, { "crossentropy": 2.5205440521240234, "epoch": 0.34074100928074247, "grad_norm": 0.03106727823615074, "grad_norm_var": 1.2321815032870075e-06, "learning_rate": 0.007581905879997811, "loss": 2.6054, "step": 9399 }, { "crossentropy": 2.5671205520629883, "epoch": 0.3407772621809745, "grad_norm": 0.029051268473267555, "grad_norm_var": 1.20611009575509e-06, "learning_rate": 0.007581408242949968, "loss": 2.6323, "step": 9400 }, { "crossentropy": 2.598876476287842, "epoch": 0.3408135150812065, "grad_norm": 0.029009224846959114, "grad_norm_var": 1.244917408020109e-06, "learning_rate": 0.00758091057103629, "loss": 2.6092, "step": 9401 }, { "crossentropy": 2.759881019592285, "epoch": 0.3408497679814385, "grad_norm": 0.029642200097441673, "grad_norm_var": 1.1200882137372969e-06, "learning_rate": 0.0075804128642634975, "loss": 2.7294, "step": 9402 }, { "crossentropy": 2.519643545150757, "epoch": 0.3408860208816705, "grad_norm": 0.03034244477748871, "grad_norm_var": 1.3340657016288591e-06, "learning_rate": 0.007579915122638315, "loss": 2.6312, "step": 9403 }, { "crossentropy": 2.5393824577331543, "epoch": 0.34092227378190254, "grad_norm": 0.030246971175074577, "grad_norm_var": 1.4547626344613933e-06, "learning_rate": 0.007579417346167461, "loss": 2.6572, "step": 9404 }, { "crossentropy": 2.6415793895721436, "epoch": 0.34095852668213456, "grad_norm": 0.02693573199212551, "grad_norm_var": 1.6333083834973427e-06, "learning_rate": 0.007578919534857662, "loss": 2.6397, "step": 9405 }, { "crossentropy": 2.6570279598236084, "epoch": 0.34099477958236657, "grad_norm": 0.02960287034511566, "grad_norm_var": 1.6345169854169722e-06, "learning_rate": 0.0075784216887156445, "loss": 2.5963, "step": 9406 }, { "crossentropy": 2.5925710201263428, "epoch": 0.3410310324825986, "grad_norm": 0.030178053304553032, "grad_norm_var": 1.583781881828505e-06, "learning_rate": 0.0075779238077481265, "loss": 2.6107, "step": 9407 }, { "crossentropy": 2.5636146068573, "epoch": 0.34106728538283065, "grad_norm": 0.027933422476053238, "grad_norm_var": 1.3683840824885713e-06, "learning_rate": 0.007577425891961835, "loss": 2.6162, "step": 9408 }, { "crossentropy": 2.694258213043213, "epoch": 0.34110353828306267, "grad_norm": 0.02826867438852787, "grad_norm_var": 1.2463841173652897e-06, "learning_rate": 0.007576927941363494, "loss": 2.6068, "step": 9409 }, { "crossentropy": 2.6015613079071045, "epoch": 0.3411397911832947, "grad_norm": 0.027604492381215096, "grad_norm_var": 1.334118348793077e-06, "learning_rate": 0.007576429955959834, "loss": 2.6441, "step": 9410 }, { "crossentropy": 2.698758125305176, "epoch": 0.3411760440835267, "grad_norm": 0.029583802446722984, "grad_norm_var": 1.183429684963449e-06, "learning_rate": 0.007575931935757577, "loss": 2.6463, "step": 9411 }, { "crossentropy": 2.562453031539917, "epoch": 0.3412122969837587, "grad_norm": 0.02774099074304104, "grad_norm_var": 1.305750339716559e-06, "learning_rate": 0.007575433880763447, "loss": 2.5273, "step": 9412 }, { "crossentropy": 2.5244364738464355, "epoch": 0.3412485498839907, "grad_norm": 0.03065297193825245, "grad_norm_var": 1.4585800096532694e-06, "learning_rate": 0.007574935790984176, "loss": 2.6911, "step": 9413 }, { "crossentropy": 2.743690252304077, "epoch": 0.34128480278422274, "grad_norm": 0.03221410512924194, "grad_norm_var": 2.0240462321428754e-06, "learning_rate": 0.00757443766642649, "loss": 2.7158, "step": 9414 }, { "crossentropy": 2.5824766159057617, "epoch": 0.34132105568445475, "grad_norm": 0.02838532067835331, "grad_norm_var": 1.8701182429344798e-06, "learning_rate": 0.007573939507097116, "loss": 2.5359, "step": 9415 }, { "crossentropy": 2.616546869277954, "epoch": 0.34135730858468677, "grad_norm": 0.02743171714246273, "grad_norm_var": 2.068768152479796e-06, "learning_rate": 0.007573441313002784, "loss": 2.5821, "step": 9416 }, { "crossentropy": 2.5080432891845703, "epoch": 0.3413935614849188, "grad_norm": 0.029389366507530212, "grad_norm_var": 2.072650875504011e-06, "learning_rate": 0.007572943084150219, "loss": 2.5773, "step": 9417 }, { "crossentropy": 2.610898733139038, "epoch": 0.3414298143851508, "grad_norm": 0.028133397921919823, "grad_norm_var": 2.1128095349356626e-06, "learning_rate": 0.007572444820546156, "loss": 2.6162, "step": 9418 }, { "crossentropy": 2.560696840286255, "epoch": 0.3414660672853828, "grad_norm": 0.030352439731359482, "grad_norm_var": 2.114551134428872e-06, "learning_rate": 0.007571946522197321, "loss": 2.6423, "step": 9419 }, { "crossentropy": 2.724492311477661, "epoch": 0.3415023201856148, "grad_norm": 0.03182823583483696, "grad_norm_var": 2.525109312984571e-06, "learning_rate": 0.007571448189110445, "loss": 2.6491, "step": 9420 }, { "crossentropy": 2.622356414794922, "epoch": 0.34153857308584684, "grad_norm": 0.03004179708659649, "grad_norm_var": 2.215320918337362e-06, "learning_rate": 0.007570949821292257, "loss": 2.579, "step": 9421 }, { "crossentropy": 2.4626495838165283, "epoch": 0.3415748259860789, "grad_norm": 0.02885248512029648, "grad_norm_var": 2.223597803187316e-06, "learning_rate": 0.007570451418749493, "loss": 2.5397, "step": 9422 }, { "crossentropy": 2.733262538909912, "epoch": 0.3416110788863109, "grad_norm": 0.031160345301032066, "grad_norm_var": 2.4006131241198493e-06, "learning_rate": 0.007569952981488881, "loss": 2.7286, "step": 9423 }, { "crossentropy": 2.5729854106903076, "epoch": 0.34164733178654294, "grad_norm": 0.029957672581076622, "grad_norm_var": 2.2748240393907857e-06, "learning_rate": 0.007569454509517153, "loss": 2.5392, "step": 9424 }, { "crossentropy": 2.620346784591675, "epoch": 0.34168358468677495, "grad_norm": 0.030138637870550156, "grad_norm_var": 2.1926344912501575e-06, "learning_rate": 0.007568956002841043, "loss": 2.6399, "step": 9425 }, { "crossentropy": 2.737785577774048, "epoch": 0.34171983758700697, "grad_norm": 0.033719684928655624, "grad_norm_var": 2.909541129136059e-06, "learning_rate": 0.007568457461467284, "loss": 2.6384, "step": 9426 }, { "crossentropy": 2.6318578720092773, "epoch": 0.341756090487239, "grad_norm": 0.03409011289477348, "grad_norm_var": 3.944310046581114e-06, "learning_rate": 0.007567958885402609, "loss": 2.6756, "step": 9427 }, { "crossentropy": 2.637410879135132, "epoch": 0.341792343387471, "grad_norm": 0.028093896806240082, "grad_norm_var": 3.833772121159896e-06, "learning_rate": 0.007567460274653753, "loss": 2.5795, "step": 9428 }, { "crossentropy": 2.6877384185791016, "epoch": 0.341828596287703, "grad_norm": 0.028742169961333275, "grad_norm_var": 3.966344378995831e-06, "learning_rate": 0.007566961629227448, "loss": 2.605, "step": 9429 }, { "crossentropy": 2.650118589401245, "epoch": 0.341864849187935, "grad_norm": 0.030297348275780678, "grad_norm_var": 3.6705469445896233e-06, "learning_rate": 0.007566462949130432, "loss": 2.7023, "step": 9430 }, { "crossentropy": 2.7261786460876465, "epoch": 0.34190110208816704, "grad_norm": 0.030155068263411522, "grad_norm_var": 3.4762228947796588e-06, "learning_rate": 0.007565964234369439, "loss": 2.5928, "step": 9431 }, { "crossentropy": 2.5928220748901367, "epoch": 0.34193735498839906, "grad_norm": 0.028347069397568703, "grad_norm_var": 3.19695075218316e-06, "learning_rate": 0.0075654654849512054, "loss": 2.7162, "step": 9432 }, { "crossentropy": 2.7556228637695312, "epoch": 0.34197360788863107, "grad_norm": 0.028015485033392906, "grad_norm_var": 3.464559683886735e-06, "learning_rate": 0.007564966700882465, "loss": 2.7841, "step": 9433 }, { "crossentropy": 2.68669056892395, "epoch": 0.3420098607888631, "grad_norm": 0.030444612726569176, "grad_norm_var": 3.186108969011507e-06, "learning_rate": 0.007564467882169959, "loss": 2.6898, "step": 9434 }, { "crossentropy": 2.309861660003662, "epoch": 0.34204611368909515, "grad_norm": 0.030475560575723648, "grad_norm_var": 3.1884948254804936e-06, "learning_rate": 0.007563969028820422, "loss": 2.5373, "step": 9435 }, { "crossentropy": 2.631635904312134, "epoch": 0.34208236658932717, "grad_norm": 0.029881136491894722, "grad_norm_var": 3.02155790038681e-06, "learning_rate": 0.007563470140840593, "loss": 2.5212, "step": 9436 }, { "crossentropy": 2.6630642414093018, "epoch": 0.3421186194895592, "grad_norm": 0.027946939691901207, "grad_norm_var": 3.3262856398210785e-06, "learning_rate": 0.007562971218237207, "loss": 2.6933, "step": 9437 }, { "crossentropy": 2.609445095062256, "epoch": 0.3421548723897912, "grad_norm": 0.028955617919564247, "grad_norm_var": 3.3108973942315983e-06, "learning_rate": 0.007562472261017008, "loss": 2.5956, "step": 9438 }, { "crossentropy": 2.6353228092193604, "epoch": 0.3421911252900232, "grad_norm": 0.03052518144249916, "grad_norm_var": 3.240074310113574e-06, "learning_rate": 0.007561973269186732, "loss": 2.5171, "step": 9439 }, { "crossentropy": 2.559784173965454, "epoch": 0.3422273781902552, "grad_norm": 0.028110211715102196, "grad_norm_var": 3.4605285830946925e-06, "learning_rate": 0.007561474242753119, "loss": 2.6163, "step": 9440 }, { "crossentropy": 2.788022518157959, "epoch": 0.34226363109048724, "grad_norm": 0.02870284765958786, "grad_norm_var": 3.538168396123991e-06, "learning_rate": 0.007560975181722907, "loss": 2.6278, "step": 9441 }, { "crossentropy": 2.587906837463379, "epoch": 0.34229988399071926, "grad_norm": 0.02806900255382061, "grad_norm_var": 2.566632577584539e-06, "learning_rate": 0.007560476086102841, "loss": 2.6047, "step": 9442 }, { "crossentropy": 2.645561456680298, "epoch": 0.34233613689095127, "grad_norm": 0.028336117044091225, "grad_norm_var": 1.0593457440190185e-06, "learning_rate": 0.00755997695589966, "loss": 2.6667, "step": 9443 }, { "crossentropy": 2.8601362705230713, "epoch": 0.3423723897911833, "grad_norm": 0.031456686556339264, "grad_norm_var": 1.329069686454148e-06, "learning_rate": 0.007559477791120104, "loss": 2.6474, "step": 9444 }, { "crossentropy": 2.6102521419525146, "epoch": 0.3424086426914153, "grad_norm": 0.031940460205078125, "grad_norm_var": 1.7395393464426315e-06, "learning_rate": 0.007558978591770916, "loss": 2.511, "step": 9445 }, { "crossentropy": 2.5924551486968994, "epoch": 0.3424448955916473, "grad_norm": 0.029045481234788895, "grad_norm_var": 1.700843885282105e-06, "learning_rate": 0.0075584793578588405, "loss": 2.6466, "step": 9446 }, { "crossentropy": 2.553926467895508, "epoch": 0.34248114849187933, "grad_norm": 0.02838459238409996, "grad_norm_var": 1.7186217557525144e-06, "learning_rate": 0.007557980089390617, "loss": 2.6232, "step": 9447 }, { "crossentropy": 2.6570727825164795, "epoch": 0.34251740139211134, "grad_norm": 0.02916152961552143, "grad_norm_var": 1.6577039279454802e-06, "learning_rate": 0.007557480786372992, "loss": 2.6993, "step": 9448 }, { "crossentropy": 2.5883145332336426, "epoch": 0.3425536542923434, "grad_norm": 0.0287813488394022, "grad_norm_var": 1.5590369038886622e-06, "learning_rate": 0.0075569814488127066, "loss": 2.6633, "step": 9449 }, { "crossentropy": 2.5599708557128906, "epoch": 0.3425899071925754, "grad_norm": 0.029181666672229767, "grad_norm_var": 1.4808986255627927e-06, "learning_rate": 0.007556482076716506, "loss": 2.6401, "step": 9450 }, { "crossentropy": 2.6142916679382324, "epoch": 0.34262616009280744, "grad_norm": 0.029378065839409828, "grad_norm_var": 1.3855686040150587e-06, "learning_rate": 0.007555982670091137, "loss": 2.6186, "step": 9451 }, { "crossentropy": 2.482820749282837, "epoch": 0.34266241299303946, "grad_norm": 0.030763354152441025, "grad_norm_var": 1.5095049704422562e-06, "learning_rate": 0.007555483228943343, "loss": 2.5938, "step": 9452 }, { "crossentropy": 2.5571024417877197, "epoch": 0.34269866589327147, "grad_norm": 0.02900945208966732, "grad_norm_var": 1.3889166711739467e-06, "learning_rate": 0.007554983753279869, "loss": 2.5456, "step": 9453 }, { "crossentropy": 2.5359389781951904, "epoch": 0.3427349187935035, "grad_norm": 0.030088432133197784, "grad_norm_var": 1.407649428794436e-06, "learning_rate": 0.007554484243107462, "loss": 2.546, "step": 9454 }, { "crossentropy": 2.7372095584869385, "epoch": 0.3427711716937355, "grad_norm": 0.03270687535405159, "grad_norm_var": 2.022726707886233e-06, "learning_rate": 0.007553984698432869, "loss": 2.6067, "step": 9455 }, { "crossentropy": 2.714132308959961, "epoch": 0.3428074245939675, "grad_norm": 0.03386794030666351, "grad_norm_var": 2.9742022891460564e-06, "learning_rate": 0.007553485119262839, "loss": 2.6719, "step": 9456 }, { "crossentropy": 2.6062748432159424, "epoch": 0.34284367749419953, "grad_norm": 0.02782221883535385, "grad_norm_var": 3.1667151347007535e-06, "learning_rate": 0.007552985505604114, "loss": 2.6298, "step": 9457 }, { "crossentropy": 2.694355010986328, "epoch": 0.34287993039443154, "grad_norm": 0.027681749314069748, "grad_norm_var": 3.2693165324575947e-06, "learning_rate": 0.007552485857463447, "loss": 2.7103, "step": 9458 }, { "crossentropy": 2.6919336318969727, "epoch": 0.34291618329466356, "grad_norm": 0.03070652484893799, "grad_norm_var": 3.1419063359551526e-06, "learning_rate": 0.007551986174847585, "loss": 2.7139, "step": 9459 }, { "crossentropy": 2.5504674911499023, "epoch": 0.34295243619489557, "grad_norm": 0.02892681211233139, "grad_norm_var": 3.050060336536274e-06, "learning_rate": 0.0075514864577632766, "loss": 2.5753, "step": 9460 }, { "crossentropy": 2.6553640365600586, "epoch": 0.3429886890951276, "grad_norm": 0.027578964829444885, "grad_norm_var": 3.017725517297463e-06, "learning_rate": 0.007550986706217271, "loss": 2.631, "step": 9461 }, { "crossentropy": 2.510223388671875, "epoch": 0.34302494199535966, "grad_norm": 0.02743547409772873, "grad_norm_var": 3.29186092715607e-06, "learning_rate": 0.007550486920216319, "loss": 2.5831, "step": 9462 }, { "crossentropy": 2.8283088207244873, "epoch": 0.34306119489559167, "grad_norm": 0.028367722406983376, "grad_norm_var": 3.294313828591901e-06, "learning_rate": 0.007549987099767173, "loss": 2.7324, "step": 9463 }, { "crossentropy": 2.585847854614258, "epoch": 0.3430974477958237, "grad_norm": 0.027833083644509315, "grad_norm_var": 3.4585651288905937e-06, "learning_rate": 0.00754948724487658, "loss": 2.634, "step": 9464 }, { "crossentropy": 2.644801378250122, "epoch": 0.3431337006960557, "grad_norm": 0.028543105348944664, "grad_norm_var": 3.4812279023905495e-06, "learning_rate": 0.007548987355551291, "loss": 2.6779, "step": 9465 }, { "crossentropy": 2.6967697143554688, "epoch": 0.3431699535962877, "grad_norm": 0.029224639758467674, "grad_norm_var": 3.4802744452462506e-06, "learning_rate": 0.007548487431798061, "loss": 2.6876, "step": 9466 }, { "crossentropy": 2.609574794769287, "epoch": 0.34320620649651973, "grad_norm": 0.029906706884503365, "grad_norm_var": 3.498245801085836e-06, "learning_rate": 0.007547987473623641, "loss": 2.6265, "step": 9467 }, { "crossentropy": 2.629791736602783, "epoch": 0.34324245939675174, "grad_norm": 0.030645323917269707, "grad_norm_var": 3.4777229166834394e-06, "learning_rate": 0.007547487481034783, "loss": 2.6405, "step": 9468 }, { "crossentropy": 2.6739962100982666, "epoch": 0.34327871229698376, "grad_norm": 0.03061976283788681, "grad_norm_var": 3.556675622878175e-06, "learning_rate": 0.00754698745403824, "loss": 2.639, "step": 9469 }, { "crossentropy": 2.7522692680358887, "epoch": 0.34331496519721577, "grad_norm": 0.03139566257596016, "grad_norm_var": 3.766527567989654e-06, "learning_rate": 0.007546487392640766, "loss": 2.68, "step": 9470 }, { "crossentropy": 2.5329513549804688, "epoch": 0.3433512180974478, "grad_norm": 0.029287561774253845, "grad_norm_var": 3.0711934327118383e-06, "learning_rate": 0.007545987296849117, "loss": 2.6339, "step": 9471 }, { "crossentropy": 2.6604790687561035, "epoch": 0.3433874709976798, "grad_norm": 0.028095565736293793, "grad_norm_var": 1.6881814481092364e-06, "learning_rate": 0.007545487166670045, "loss": 2.6051, "step": 9472 }, { "crossentropy": 2.5498580932617188, "epoch": 0.3434237238979118, "grad_norm": 0.02939090132713318, "grad_norm_var": 1.5947107322810313e-06, "learning_rate": 0.007544987002110306, "loss": 2.6348, "step": 9473 }, { "crossentropy": 2.627363443374634, "epoch": 0.34345997679814383, "grad_norm": 0.02844402939081192, "grad_norm_var": 1.486629124946605e-06, "learning_rate": 0.007544486803176656, "loss": 2.6355, "step": 9474 }, { "crossentropy": 2.7101120948791504, "epoch": 0.34349622969837584, "grad_norm": 0.02977774851024151, "grad_norm_var": 1.3478023439801676e-06, "learning_rate": 0.0075439865698758496, "loss": 2.7224, "step": 9475 }, { "crossentropy": 2.7386903762817383, "epoch": 0.3435324825986079, "grad_norm": 0.03458336368203163, "grad_norm_var": 3.2229522729714053e-06, "learning_rate": 0.007543486302214644, "loss": 2.7433, "step": 9476 }, { "crossentropy": 2.6293492317199707, "epoch": 0.34356873549883993, "grad_norm": 0.034433621913194656, "grad_norm_var": 4.453577351953018e-06, "learning_rate": 0.0075429860001997966, "loss": 2.618, "step": 9477 }, { "crossentropy": 2.7125802040100098, "epoch": 0.34360498839907194, "grad_norm": 0.030433399602770805, "grad_norm_var": 4.04055697860913e-06, "learning_rate": 0.007542485663838063, "loss": 2.6704, "step": 9478 }, { "crossentropy": 2.362544536590576, "epoch": 0.34364124129930396, "grad_norm": 0.028739280998706818, "grad_norm_var": 3.9652793585261e-06, "learning_rate": 0.007541985293136202, "loss": 2.4644, "step": 9479 }, { "crossentropy": 2.501781463623047, "epoch": 0.34367749419953597, "grad_norm": 0.029700379818677902, "grad_norm_var": 3.6226352055001204e-06, "learning_rate": 0.007541484888100973, "loss": 2.582, "step": 9480 }, { "crossentropy": 2.4493980407714844, "epoch": 0.343713747099768, "grad_norm": 0.03139091283082962, "grad_norm_var": 3.499875423851314e-06, "learning_rate": 0.007540984448739134, "loss": 2.6632, "step": 9481 }, { "crossentropy": 2.6361899375915527, "epoch": 0.34375, "grad_norm": 0.030696256086230278, "grad_norm_var": 3.4086658338382807e-06, "learning_rate": 0.007540483975057444, "loss": 2.631, "step": 9482 }, { "crossentropy": 2.6485605239868164, "epoch": 0.343786252900232, "grad_norm": 0.028859388083219528, "grad_norm_var": 3.556059012506327e-06, "learning_rate": 0.007539983467062661, "loss": 2.6292, "step": 9483 }, { "crossentropy": 2.567986488342285, "epoch": 0.34382250580046403, "grad_norm": 0.028999049216508865, "grad_norm_var": 3.672876424485336e-06, "learning_rate": 0.007539482924761548, "loss": 2.5646, "step": 9484 }, { "crossentropy": 2.745194435119629, "epoch": 0.34385875870069604, "grad_norm": 0.028760211542248726, "grad_norm_var": 3.8104415926928345e-06, "learning_rate": 0.007538982348160864, "loss": 2.6708, "step": 9485 }, { "crossentropy": 2.5208353996276855, "epoch": 0.34389501160092806, "grad_norm": 0.02746444009244442, "grad_norm_var": 4.142659393102525e-06, "learning_rate": 0.007538481737267371, "loss": 2.5166, "step": 9486 }, { "crossentropy": 2.621030807495117, "epoch": 0.34393126450116007, "grad_norm": 0.029779205098748207, "grad_norm_var": 4.1149315401348144e-06, "learning_rate": 0.007537981092087829, "loss": 2.7173, "step": 9487 }, { "crossentropy": 2.5855090618133545, "epoch": 0.3439675174013921, "grad_norm": 0.033327866345644, "grad_norm_var": 4.517101531396374e-06, "learning_rate": 0.0075374804126290005, "loss": 2.6032, "step": 9488 }, { "crossentropy": 2.501612424850464, "epoch": 0.34400377030162416, "grad_norm": 0.030387267470359802, "grad_norm_var": 4.458541047654083e-06, "learning_rate": 0.007536979698897648, "loss": 2.5491, "step": 9489 }, { "crossentropy": 2.7093770503997803, "epoch": 0.34404002320185617, "grad_norm": 0.027763862162828445, "grad_norm_var": 4.661305726285508e-06, "learning_rate": 0.007536478950900537, "loss": 2.6755, "step": 9490 }, { "crossentropy": 2.5555758476257324, "epoch": 0.3440762761020882, "grad_norm": 0.028479520231485367, "grad_norm_var": 4.8602482765588375e-06, "learning_rate": 0.007535978168644427, "loss": 2.6453, "step": 9491 }, { "crossentropy": 2.752912998199463, "epoch": 0.3441125290023202, "grad_norm": 0.029065197333693504, "grad_norm_var": 3.565799292036542e-06, "learning_rate": 0.007535477352136084, "loss": 2.761, "step": 9492 }, { "crossentropy": 2.58630633354187, "epoch": 0.3441487819025522, "grad_norm": 0.028544936329126358, "grad_norm_var": 2.1675824308578506e-06, "learning_rate": 0.007534976501382269, "loss": 2.5633, "step": 9493 }, { "crossentropy": 2.5817556381225586, "epoch": 0.34418503480278423, "grad_norm": 0.02827005460858345, "grad_norm_var": 2.1979029198212665e-06, "learning_rate": 0.007534475616389753, "loss": 2.5081, "step": 9494 }, { "crossentropy": 2.6881589889526367, "epoch": 0.34422128770301624, "grad_norm": 0.030411038547754288, "grad_norm_var": 2.2276998993505145e-06, "learning_rate": 0.007533974697165296, "loss": 2.6665, "step": 9495 }, { "crossentropy": 2.6509196758270264, "epoch": 0.34425754060324826, "grad_norm": 0.02830962836742401, "grad_norm_var": 2.3102658528426626e-06, "learning_rate": 0.007533473743715665, "loss": 2.6329, "step": 9496 }, { "crossentropy": 2.59474515914917, "epoch": 0.34429379350348027, "grad_norm": 0.030876636505126953, "grad_norm_var": 2.1907450412181193e-06, "learning_rate": 0.007532972756047627, "loss": 2.5789, "step": 9497 }, { "crossentropy": 2.554023265838623, "epoch": 0.3443300464037123, "grad_norm": 0.028104323893785477, "grad_norm_var": 2.153895370280284e-06, "learning_rate": 0.007532471734167948, "loss": 2.5842, "step": 9498 }, { "crossentropy": 2.5876047611236572, "epoch": 0.3443662993039443, "grad_norm": 0.029952118173241615, "grad_norm_var": 2.1770526809817507e-06, "learning_rate": 0.007531970678083395, "loss": 2.5981, "step": 9499 }, { "crossentropy": 2.4144997596740723, "epoch": 0.3444025522041763, "grad_norm": 0.030802402645349503, "grad_norm_var": 2.3125233295712463e-06, "learning_rate": 0.007531469587800734, "loss": 2.5522, "step": 9500 }, { "crossentropy": 2.7419660091400146, "epoch": 0.34443880510440833, "grad_norm": 0.03388357535004616, "grad_norm_var": 3.520352243727559e-06, "learning_rate": 0.007530968463326736, "loss": 2.6213, "step": 9501 }, { "crossentropy": 2.5286736488342285, "epoch": 0.34447505800464034, "grad_norm": 0.028921272605657578, "grad_norm_var": 3.216058928362621e-06, "learning_rate": 0.007530467304668169, "loss": 2.5874, "step": 9502 }, { "crossentropy": 2.7453222274780273, "epoch": 0.3445113109048724, "grad_norm": 0.028169775381684303, "grad_norm_var": 3.3834710962557146e-06, "learning_rate": 0.0075299661118318, "loss": 2.608, "step": 9503 }, { "crossentropy": 2.707916259765625, "epoch": 0.34454756380510443, "grad_norm": 0.029706105589866638, "grad_norm_var": 2.4534880482606487e-06, "learning_rate": 0.0075294648848244, "loss": 2.6282, "step": 9504 }, { "crossentropy": 2.682485342025757, "epoch": 0.34458381670533644, "grad_norm": 0.031350042670965195, "grad_norm_var": 2.6281465282010218e-06, "learning_rate": 0.007528963623652736, "loss": 2.6862, "step": 9505 }, { "crossentropy": 2.4308831691741943, "epoch": 0.34462006960556846, "grad_norm": 0.030325839295983315, "grad_norm_var": 2.4322862096468518e-06, "learning_rate": 0.007528462328323582, "loss": 2.5419, "step": 9506 }, { "crossentropy": 2.5169737339019775, "epoch": 0.34465632250580047, "grad_norm": 0.02873755618929863, "grad_norm_var": 2.3945164669534706e-06, "learning_rate": 0.007527960998843707, "loss": 2.6369, "step": 9507 }, { "crossentropy": 2.5228004455566406, "epoch": 0.3446925754060325, "grad_norm": 0.02670118398964405, "grad_norm_var": 2.9484332843336565e-06, "learning_rate": 0.007527459635219882, "loss": 2.5915, "step": 9508 }, { "crossentropy": 2.678680658340454, "epoch": 0.3447288283062645, "grad_norm": 0.027914483100175858, "grad_norm_var": 3.0591614035947617e-06, "learning_rate": 0.007526958237458878, "loss": 2.6163, "step": 9509 }, { "crossentropy": 2.573699474334717, "epoch": 0.3447650812064965, "grad_norm": 0.0341743566095829, "grad_norm_var": 4.2482436343836576e-06, "learning_rate": 0.0075264568055674694, "loss": 2.6341, "step": 9510 }, { "crossentropy": 2.615098237991333, "epoch": 0.34480133410672853, "grad_norm": 0.038835544139146805, "grad_norm_var": 9.262233444653062e-06, "learning_rate": 0.0075259553395524295, "loss": 2.642, "step": 9511 }, { "crossentropy": 2.507941722869873, "epoch": 0.34483758700696054, "grad_norm": 0.032293397933244705, "grad_norm_var": 9.131681369242694e-06, "learning_rate": 0.007525453839420528, "loss": 2.4487, "step": 9512 }, { "crossentropy": 2.6303181648254395, "epoch": 0.34487383990719256, "grad_norm": 0.02852833829820156, "grad_norm_var": 9.412198633735334e-06, "learning_rate": 0.007524952305178538, "loss": 2.6259, "step": 9513 }, { "crossentropy": 2.831533670425415, "epoch": 0.3449100928074246, "grad_norm": 0.030091920867562294, "grad_norm_var": 9.017591792992845e-06, "learning_rate": 0.007524450736833237, "loss": 2.7249, "step": 9514 }, { "crossentropy": 2.6811132431030273, "epoch": 0.3449463457076566, "grad_norm": 0.03128417953848839, "grad_norm_var": 9.0046756616784e-06, "learning_rate": 0.007523949134391399, "loss": 2.6324, "step": 9515 }, { "crossentropy": 2.597710132598877, "epoch": 0.34498259860788866, "grad_norm": 0.030254987999796867, "grad_norm_var": 9.01830237106498e-06, "learning_rate": 0.007523447497859797, "loss": 2.5702, "step": 9516 }, { "crossentropy": 2.5487680435180664, "epoch": 0.34501885150812067, "grad_norm": 0.02783932164311409, "grad_norm_var": 8.734587961023675e-06, "learning_rate": 0.007522945827245206, "loss": 2.4961, "step": 9517 }, { "crossentropy": 2.728825807571411, "epoch": 0.3450551044083527, "grad_norm": 0.029297517612576485, "grad_norm_var": 8.673240878383212e-06, "learning_rate": 0.007522444122554404, "loss": 2.6638, "step": 9518 }, { "crossentropy": 2.719515323638916, "epoch": 0.3450913573085847, "grad_norm": 0.02923685498535633, "grad_norm_var": 8.43505939412853e-06, "learning_rate": 0.007521942383794166, "loss": 2.6425, "step": 9519 }, { "crossentropy": 2.646023988723755, "epoch": 0.3451276102088167, "grad_norm": 0.02991051785647869, "grad_norm_var": 8.418466485561547e-06, "learning_rate": 0.007521440610971268, "loss": 2.6468, "step": 9520 }, { "crossentropy": 2.748521566390991, "epoch": 0.34516386310904873, "grad_norm": 0.03285767883062363, "grad_norm_var": 8.746778270986762e-06, "learning_rate": 0.0075209388040924875, "loss": 2.6623, "step": 9521 }, { "crossentropy": 2.6946370601654053, "epoch": 0.34520011600928074, "grad_norm": 0.02964898943901062, "grad_norm_var": 8.792728615519467e-06, "learning_rate": 0.0075204369631646055, "loss": 2.6721, "step": 9522 }, { "crossentropy": 2.598067283630371, "epoch": 0.34523636890951276, "grad_norm": 0.02841212786734104, "grad_norm_var": 8.874754566244533e-06, "learning_rate": 0.007519935088194396, "loss": 2.6561, "step": 9523 }, { "crossentropy": 2.510850191116333, "epoch": 0.3452726218097448, "grad_norm": 0.030513208359479904, "grad_norm_var": 7.874978934144456e-06, "learning_rate": 0.0075194331791886395, "loss": 2.6442, "step": 9524 }, { "crossentropy": 2.4356491565704346, "epoch": 0.3453088747099768, "grad_norm": 0.029980430379509926, "grad_norm_var": 7.3762749124891535e-06, "learning_rate": 0.007518931236154111, "loss": 2.4044, "step": 9525 }, { "crossentropy": 2.596813440322876, "epoch": 0.3453451276102088, "grad_norm": 0.02939421683549881, "grad_norm_var": 6.668046014115849e-06, "learning_rate": 0.007518429259097598, "loss": 2.568, "step": 9526 }, { "crossentropy": 2.7425060272216797, "epoch": 0.3453813805104408, "grad_norm": 0.030704373493790627, "grad_norm_var": 1.7889578163563275e-06, "learning_rate": 0.007517927248025876, "loss": 2.6833, "step": 9527 }, { "crossentropy": 2.6175613403320312, "epoch": 0.34541763341067283, "grad_norm": 0.03286896273493767, "grad_norm_var": 1.9844725797911554e-06, "learning_rate": 0.007517425202945724, "loss": 2.6796, "step": 9528 }, { "crossentropy": 2.56528639793396, "epoch": 0.34545388631090485, "grad_norm": 0.029405320063233376, "grad_norm_var": 1.8544391994528096e-06, "learning_rate": 0.007516923123863923, "loss": 2.4874, "step": 9529 }, { "crossentropy": 2.7081778049468994, "epoch": 0.3454901392111369, "grad_norm": 0.027119826525449753, "grad_norm_var": 2.4122166570600747e-06, "learning_rate": 0.007516421010787256, "loss": 2.6918, "step": 9530 }, { "crossentropy": 2.646514892578125, "epoch": 0.34552639211136893, "grad_norm": 0.02791563607752323, "grad_norm_var": 2.5089420668395936e-06, "learning_rate": 0.007515918863722507, "loss": 2.6457, "step": 9531 }, { "crossentropy": 2.7552363872528076, "epoch": 0.34556264501160094, "grad_norm": 0.032445602118968964, "grad_norm_var": 2.968048111292622e-06, "learning_rate": 0.007515416682676454, "loss": 2.6823, "step": 9532 }, { "crossentropy": 2.7228715419769287, "epoch": 0.34559889791183296, "grad_norm": 0.03435072302818298, "grad_norm_var": 3.874981699846421e-06, "learning_rate": 0.00751491446765588, "loss": 2.645, "step": 9533 }, { "crossentropy": 2.664332628250122, "epoch": 0.345635150812065, "grad_norm": 0.03192193806171417, "grad_norm_var": 3.9708047352813065e-06, "learning_rate": 0.007514412218667571, "loss": 2.6234, "step": 9534 }, { "crossentropy": 2.7780263423919678, "epoch": 0.345671403712297, "grad_norm": 0.029801134020090103, "grad_norm_var": 3.9018468553470945e-06, "learning_rate": 0.00751390993571831, "loss": 2.6952, "step": 9535 }, { "crossentropy": 2.5404856204986572, "epoch": 0.345707656612529, "grad_norm": 0.029911179095506668, "grad_norm_var": 3.901799039826671e-06, "learning_rate": 0.00751340761881488, "loss": 2.5834, "step": 9536 }, { "crossentropy": 2.546828508377075, "epoch": 0.345743909512761, "grad_norm": 0.028673430904746056, "grad_norm_var": 3.654591757548155e-06, "learning_rate": 0.007512905267964065, "loss": 2.5081, "step": 9537 }, { "crossentropy": 2.6770777702331543, "epoch": 0.34578016241299303, "grad_norm": 0.02881346270442009, "grad_norm_var": 3.758682503158158e-06, "learning_rate": 0.007512402883172651, "loss": 2.6873, "step": 9538 }, { "crossentropy": 2.55690598487854, "epoch": 0.34581641531322505, "grad_norm": 0.027636971324682236, "grad_norm_var": 3.974765143979749e-06, "learning_rate": 0.007511900464447423, "loss": 2.6288, "step": 9539 }, { "crossentropy": 2.568760395050049, "epoch": 0.34585266821345706, "grad_norm": 0.028929904103279114, "grad_norm_var": 4.042317663147905e-06, "learning_rate": 0.00751139801179517, "loss": 2.6146, "step": 9540 }, { "crossentropy": 2.6829466819763184, "epoch": 0.3458889211136891, "grad_norm": 0.029021231457591057, "grad_norm_var": 4.101310133905337e-06, "learning_rate": 0.007510895525222672, "loss": 2.6717, "step": 9541 }, { "crossentropy": 2.800844669342041, "epoch": 0.3459251740139211, "grad_norm": 0.028738010674715042, "grad_norm_var": 4.175286383731194e-06, "learning_rate": 0.007510393004736722, "loss": 2.7233, "step": 9542 }, { "crossentropy": 2.707932472229004, "epoch": 0.34596142691415316, "grad_norm": 0.02796533890068531, "grad_norm_var": 4.34717198738974e-06, "learning_rate": 0.007509890450344103, "loss": 2.6288, "step": 9543 }, { "crossentropy": 2.611448049545288, "epoch": 0.3459976798143852, "grad_norm": 0.02722865156829357, "grad_norm_var": 3.967278313251702e-06, "learning_rate": 0.0075093878620516065, "loss": 2.6019, "step": 9544 }, { "crossentropy": 2.5859289169311523, "epoch": 0.3460339327146172, "grad_norm": 0.026772454380989075, "grad_norm_var": 4.387214514641015e-06, "learning_rate": 0.0075088852398660175, "loss": 2.6377, "step": 9545 }, { "crossentropy": 2.7636313438415527, "epoch": 0.3460701856148492, "grad_norm": 0.028438666835427284, "grad_norm_var": 4.129634361096403e-06, "learning_rate": 0.007508382583794127, "loss": 2.6382, "step": 9546 }, { "crossentropy": 2.5771827697753906, "epoch": 0.3461064385150812, "grad_norm": 0.028483450412750244, "grad_norm_var": 4.046092072167264e-06, "learning_rate": 0.0075078798938427215, "loss": 2.4927, "step": 9547 }, { "crossentropy": 2.3917152881622314, "epoch": 0.34614269141531323, "grad_norm": 0.028712524101138115, "grad_norm_var": 3.3617131265350117e-06, "learning_rate": 0.007507377170018595, "loss": 2.5029, "step": 9548 }, { "crossentropy": 2.5795681476593018, "epoch": 0.34617894431554525, "grad_norm": 0.027638398110866547, "grad_norm_var": 1.4671560626365598e-06, "learning_rate": 0.007506874412328532, "loss": 2.6221, "step": 9549 }, { "crossentropy": 2.670086145401001, "epoch": 0.34621519721577726, "grad_norm": 0.02731606736779213, "grad_norm_var": 7.946900317999546e-07, "learning_rate": 0.007506371620779327, "loss": 2.6853, "step": 9550 }, { "crossentropy": 2.480057716369629, "epoch": 0.3462514501160093, "grad_norm": 0.02852601185441017, "grad_norm_var": 6.547044414638321e-07, "learning_rate": 0.007505868795377769, "loss": 2.5724, "step": 9551 }, { "crossentropy": 2.6618781089782715, "epoch": 0.3462877030162413, "grad_norm": 0.02798064984381199, "grad_norm_var": 4.730071734124526e-07, "learning_rate": 0.00750536593613065, "loss": 2.6366, "step": 9552 }, { "crossentropy": 2.788347005844116, "epoch": 0.3463239559164733, "grad_norm": 0.028755834326148033, "grad_norm_var": 4.788562339053434e-07, "learning_rate": 0.007504863043044763, "loss": 2.6611, "step": 9553 }, { "crossentropy": 2.608476400375366, "epoch": 0.3463602088167053, "grad_norm": 0.02944541908800602, "grad_norm_var": 5.567840825596952e-07, "learning_rate": 0.007504360116126898, "loss": 2.6588, "step": 9554 }, { "crossentropy": 2.621903896331787, "epoch": 0.34639646171693733, "grad_norm": 0.02744458056986332, "grad_norm_var": 5.741649420229333e-07, "learning_rate": 0.00750385715538385, "loss": 2.6179, "step": 9555 }, { "crossentropy": 2.6386256217956543, "epoch": 0.34643271461716935, "grad_norm": 0.02684856951236725, "grad_norm_var": 6.457756413201828e-07, "learning_rate": 0.007503354160822411, "loss": 2.5978, "step": 9556 }, { "crossentropy": 2.653785467147827, "epoch": 0.3464689675174014, "grad_norm": 0.02757415361702442, "grad_norm_var": 5.954805153557251e-07, "learning_rate": 0.007502851132449376, "loss": 2.604, "step": 9557 }, { "crossentropy": 2.72179913520813, "epoch": 0.34650522041763343, "grad_norm": 0.027300691232085228, "grad_norm_var": 5.815924903659318e-07, "learning_rate": 0.007502348070271537, "loss": 2.6788, "step": 9558 }, { "crossentropy": 2.6182234287261963, "epoch": 0.34654147331786544, "grad_norm": 0.028289398178458214, "grad_norm_var": 5.908940860742395e-07, "learning_rate": 0.007501844974295691, "loss": 2.5472, "step": 9559 }, { "crossentropy": 2.659501791000366, "epoch": 0.34657772621809746, "grad_norm": 0.02732877805829048, "grad_norm_var": 5.822613910133467e-07, "learning_rate": 0.007501341844528631, "loss": 2.6417, "step": 9560 }, { "crossentropy": 2.618833303451538, "epoch": 0.3466139791183295, "grad_norm": 0.027355797588825226, "grad_norm_var": 5.136150044619438e-07, "learning_rate": 0.007500838680977153, "loss": 2.6135, "step": 9561 }, { "crossentropy": 2.806955575942993, "epoch": 0.3466502320185615, "grad_norm": 0.0296314787119627, "grad_norm_var": 6.778827714070858e-07, "learning_rate": 0.007500335483648055, "loss": 2.6443, "step": 9562 }, { "crossentropy": 2.674377918243408, "epoch": 0.3466864849187935, "grad_norm": 0.036563701927661896, "grad_norm_var": 5.236847565222288e-06, "learning_rate": 0.007499832252548131, "loss": 2.6858, "step": 9563 }, { "crossentropy": 2.6015677452087402, "epoch": 0.3467227378190255, "grad_norm": 0.03126506507396698, "grad_norm_var": 5.701248123658684e-06, "learning_rate": 0.00749932898768418, "loss": 2.6392, "step": 9564 }, { "crossentropy": 2.631533622741699, "epoch": 0.34675899071925753, "grad_norm": 0.03190241754055023, "grad_norm_var": 6.2317603804775665e-06, "learning_rate": 0.0074988256890629965, "loss": 2.6198, "step": 9565 }, { "crossentropy": 2.675295829772949, "epoch": 0.34679524361948955, "grad_norm": 0.02894526720046997, "grad_norm_var": 6.038258444651436e-06, "learning_rate": 0.00749832235669138, "loss": 2.6526, "step": 9566 }, { "crossentropy": 2.665466070175171, "epoch": 0.34683149651972156, "grad_norm": 0.03124498762190342, "grad_norm_var": 6.302241391156977e-06, "learning_rate": 0.0074978189905761305, "loss": 2.6426, "step": 9567 }, { "crossentropy": 2.3856780529022217, "epoch": 0.3468677494199536, "grad_norm": 0.03387552872300148, "grad_norm_var": 7.482455042608314e-06, "learning_rate": 0.0074973155907240435, "loss": 2.4957, "step": 9568 }, { "crossentropy": 2.7136378288269043, "epoch": 0.3469040023201856, "grad_norm": 0.03082544542849064, "grad_norm_var": 7.514253987953891e-06, "learning_rate": 0.00749681215714192, "loss": 2.6, "step": 9569 }, { "crossentropy": 2.6006031036376953, "epoch": 0.34694025522041766, "grad_norm": 0.029683751985430717, "grad_norm_var": 7.508440496578617e-06, "learning_rate": 0.0074963086898365595, "loss": 2.6027, "step": 9570 }, { "crossentropy": 2.65615177154541, "epoch": 0.3469765081206497, "grad_norm": 0.027850240468978882, "grad_norm_var": 7.393760866982235e-06, "learning_rate": 0.007495805188814762, "loss": 2.6266, "step": 9571 }, { "crossentropy": 2.6802542209625244, "epoch": 0.3470127610208817, "grad_norm": 0.027193890884518623, "grad_norm_var": 7.266227208376173e-06, "learning_rate": 0.007495301654083326, "loss": 2.6556, "step": 9572 }, { "crossentropy": 2.5559158325195312, "epoch": 0.3470490139211137, "grad_norm": 0.02955046482384205, "grad_norm_var": 6.923307531147884e-06, "learning_rate": 0.007494798085649057, "loss": 2.6136, "step": 9573 }, { "crossentropy": 2.7432844638824463, "epoch": 0.3470852668213457, "grad_norm": 0.02898917719721794, "grad_norm_var": 6.5105824502381065e-06, "learning_rate": 0.007494294483518753, "loss": 2.709, "step": 9574 }, { "crossentropy": 2.5577375888824463, "epoch": 0.34712151972157773, "grad_norm": 0.02743208222091198, "grad_norm_var": 6.7555954224456646e-06, "learning_rate": 0.007493790847699216, "loss": 2.6515, "step": 9575 }, { "crossentropy": 2.61474609375, "epoch": 0.34715777262180975, "grad_norm": 0.029777737334370613, "grad_norm_var": 6.265590672000348e-06, "learning_rate": 0.00749328717819725, "loss": 2.5851, "step": 9576 }, { "crossentropy": 2.6380863189697266, "epoch": 0.34719402552204176, "grad_norm": 0.026828564703464508, "grad_norm_var": 6.478015085973193e-06, "learning_rate": 0.007492783475019657, "loss": 2.6166, "step": 9577 }, { "crossentropy": 2.4878242015838623, "epoch": 0.3472302784222738, "grad_norm": 0.028807329013943672, "grad_norm_var": 6.57167465254437e-06, "learning_rate": 0.00749227973817324, "loss": 2.6396, "step": 9578 }, { "crossentropy": 2.7192413806915283, "epoch": 0.3472665313225058, "grad_norm": 0.029230473563075066, "grad_norm_var": 3.5598953993391193e-06, "learning_rate": 0.0074917759676648025, "loss": 2.6419, "step": 9579 }, { "crossentropy": 2.787907600402832, "epoch": 0.3473027842227378, "grad_norm": 0.030187999829649925, "grad_norm_var": 3.3915085622351325e-06, "learning_rate": 0.0074912721635011495, "loss": 2.6989, "step": 9580 }, { "crossentropy": 2.626267194747925, "epoch": 0.3473390371229698, "grad_norm": 0.031237628310918808, "grad_norm_var": 3.2079857235287057e-06, "learning_rate": 0.007490768325689084, "loss": 2.6202, "step": 9581 }, { "crossentropy": 2.656564474105835, "epoch": 0.34737529002320183, "grad_norm": 0.03305285423994064, "grad_norm_var": 3.9703062537784864e-06, "learning_rate": 0.007490264454235415, "loss": 2.7061, "step": 9582 }, { "crossentropy": 2.6897215843200684, "epoch": 0.34741154292343385, "grad_norm": 0.03153840824961662, "grad_norm_var": 4.034742157930565e-06, "learning_rate": 0.007489760549146943, "loss": 2.6424, "step": 9583 }, { "crossentropy": 2.5143306255340576, "epoch": 0.3474477958236659, "grad_norm": 0.029283544048666954, "grad_norm_var": 2.829078409543178e-06, "learning_rate": 0.007489256610430478, "loss": 2.5315, "step": 9584 }, { "crossentropy": 2.644824981689453, "epoch": 0.34748404872389793, "grad_norm": 0.030995061621069908, "grad_norm_var": 2.8616018282331493e-06, "learning_rate": 0.007488752638092824, "loss": 2.5366, "step": 9585 }, { "crossentropy": 2.6532840728759766, "epoch": 0.34752030162412995, "grad_norm": 0.03017282299697399, "grad_norm_var": 2.890004039763962e-06, "learning_rate": 0.0074882486321407905, "loss": 2.5853, "step": 9586 }, { "crossentropy": 2.6223607063293457, "epoch": 0.34755655452436196, "grad_norm": 0.027863549068570137, "grad_norm_var": 2.8870734176494044e-06, "learning_rate": 0.007487744592581181, "loss": 2.5742, "step": 9587 }, { "crossentropy": 2.5775229930877686, "epoch": 0.347592807424594, "grad_norm": 0.02939336933195591, "grad_norm_var": 2.5105365348602883e-06, "learning_rate": 0.007487240519420807, "loss": 2.6208, "step": 9588 }, { "crossentropy": 2.6948812007904053, "epoch": 0.347629060324826, "grad_norm": 0.0298710186034441, "grad_norm_var": 2.512861949445254e-06, "learning_rate": 0.007486736412666474, "loss": 2.6771, "step": 9589 }, { "crossentropy": 2.690167188644409, "epoch": 0.347665313225058, "grad_norm": 0.028249265626072884, "grad_norm_var": 2.6138852946340498e-06, "learning_rate": 0.007486232272324993, "loss": 2.706, "step": 9590 }, { "crossentropy": 2.6194987297058105, "epoch": 0.34770156612529, "grad_norm": 0.029148323461413383, "grad_norm_var": 2.2972876425116086e-06, "learning_rate": 0.0074857280984031715, "loss": 2.5578, "step": 9591 }, { "crossentropy": 2.460371732711792, "epoch": 0.34773781902552203, "grad_norm": 0.02939451113343239, "grad_norm_var": 2.3038930224357578e-06, "learning_rate": 0.007485223890907821, "loss": 2.6099, "step": 9592 }, { "crossentropy": 2.8138647079467773, "epoch": 0.34777407192575405, "grad_norm": 0.030239030718803406, "grad_norm_var": 1.7235683119393945e-06, "learning_rate": 0.007484719649845748, "loss": 2.7217, "step": 9593 }, { "crossentropy": 2.6301283836364746, "epoch": 0.34781032482598606, "grad_norm": 0.034050293266773224, "grad_norm_var": 2.666179284604303e-06, "learning_rate": 0.007484215375223768, "loss": 2.5578, "step": 9594 }, { "crossentropy": 2.5579793453216553, "epoch": 0.3478465777262181, "grad_norm": 0.03567797690629959, "grad_norm_var": 4.392804454401974e-06, "learning_rate": 0.007483711067048688, "loss": 2.6519, "step": 9595 }, { "crossentropy": 2.6703219413757324, "epoch": 0.3478828306264501, "grad_norm": 0.03189243748784065, "grad_norm_var": 4.470010115561103e-06, "learning_rate": 0.007483206725327322, "loss": 2.677, "step": 9596 }, { "crossentropy": 2.701507568359375, "epoch": 0.34791908352668216, "grad_norm": 0.032399289309978485, "grad_norm_var": 4.629297220791878e-06, "learning_rate": 0.007482702350066479, "loss": 2.6799, "step": 9597 }, { "crossentropy": 2.793412923812866, "epoch": 0.3479553364269142, "grad_norm": 0.03229543939232826, "grad_norm_var": 4.44030138480947e-06, "learning_rate": 0.007482197941272975, "loss": 2.7654, "step": 9598 }, { "crossentropy": 2.613938331604004, "epoch": 0.3479915893271462, "grad_norm": 0.029249217361211777, "grad_norm_var": 4.5360418508275565e-06, "learning_rate": 0.00748169349895362, "loss": 2.6028, "step": 9599 }, { "crossentropy": 2.5802900791168213, "epoch": 0.3480278422273782, "grad_norm": 0.03193109109997749, "grad_norm_var": 4.4967292482409596e-06, "learning_rate": 0.007481189023115229, "loss": 2.6485, "step": 9600 }, { "crossentropy": 2.6633238792419434, "epoch": 0.3480640951276102, "grad_norm": 0.028816934674978256, "grad_norm_var": 4.737006845491274e-06, "learning_rate": 0.007480684513764613, "loss": 2.6481, "step": 9601 }, { "crossentropy": 2.6054093837738037, "epoch": 0.34810034802784223, "grad_norm": 0.028250304982066154, "grad_norm_var": 5.094247342267375e-06, "learning_rate": 0.00748017997090859, "loss": 2.5858, "step": 9602 }, { "crossentropy": 2.388885974884033, "epoch": 0.34813660092807425, "grad_norm": 0.027732981368899345, "grad_norm_var": 5.141996521352878e-06, "learning_rate": 0.007479675394553972, "loss": 2.4937, "step": 9603 }, { "crossentropy": 2.5841450691223145, "epoch": 0.34817285382830626, "grad_norm": 0.028892014175653458, "grad_norm_var": 5.234152864914257e-06, "learning_rate": 0.007479170784707574, "loss": 2.5351, "step": 9604 }, { "crossentropy": 2.7609574794769287, "epoch": 0.3482091067285383, "grad_norm": 0.03221963718533516, "grad_norm_var": 5.380174456691365e-06, "learning_rate": 0.007478666141376212, "loss": 2.7459, "step": 9605 }, { "crossentropy": 2.5478453636169434, "epoch": 0.3482453596287703, "grad_norm": 0.030780082568526268, "grad_norm_var": 4.96956267209576e-06, "learning_rate": 0.007478161464566704, "loss": 2.5939, "step": 9606 }, { "crossentropy": 2.5083887577056885, "epoch": 0.3482816125290023, "grad_norm": 0.03015449084341526, "grad_norm_var": 4.809832473749199e-06, "learning_rate": 0.007477656754285864, "loss": 2.614, "step": 9607 }, { "crossentropy": 2.769627571105957, "epoch": 0.3483178654292343, "grad_norm": 0.030275478959083557, "grad_norm_var": 4.684615409658588e-06, "learning_rate": 0.00747715201054051, "loss": 2.7107, "step": 9608 }, { "crossentropy": 2.749648094177246, "epoch": 0.34835411832946633, "grad_norm": 0.02935405820608139, "grad_norm_var": 4.814923944283095e-06, "learning_rate": 0.0074766472333374565, "loss": 2.6811, "step": 9609 }, { "crossentropy": 2.708110809326172, "epoch": 0.3483903712296984, "grad_norm": 0.028131544589996338, "grad_norm_var": 4.4971685525261686e-06, "learning_rate": 0.007476142422683526, "loss": 2.5981, "step": 9610 }, { "crossentropy": 2.5324835777282715, "epoch": 0.3484266241299304, "grad_norm": 0.0375673770904541, "grad_norm_var": 6.023885031698992e-06, "learning_rate": 0.007475637578585536, "loss": 2.6073, "step": 9611 }, { "crossentropy": 2.506005048751831, "epoch": 0.34846287703016243, "grad_norm": 0.02936548925936222, "grad_norm_var": 5.994730181286854e-06, "learning_rate": 0.007475132701050301, "loss": 2.5765, "step": 9612 }, { "crossentropy": 2.5764317512512207, "epoch": 0.34849912993039445, "grad_norm": 0.028112785890698433, "grad_norm_var": 6.03672283174377e-06, "learning_rate": 0.007474627790084643, "loss": 2.6418, "step": 9613 }, { "crossentropy": 2.8129491806030273, "epoch": 0.34853538283062646, "grad_norm": 0.028660673648118973, "grad_norm_var": 5.844766020140363e-06, "learning_rate": 0.0074741228456953815, "loss": 2.725, "step": 9614 }, { "crossentropy": 2.6415393352508545, "epoch": 0.3485716357308585, "grad_norm": 0.02866882085800171, "grad_norm_var": 5.9214734366938925e-06, "learning_rate": 0.007473617867889335, "loss": 2.6426, "step": 9615 }, { "crossentropy": 2.6184370517730713, "epoch": 0.3486078886310905, "grad_norm": 0.027645152062177658, "grad_norm_var": 5.9272184016888215e-06, "learning_rate": 0.007473112856673328, "loss": 2.6141, "step": 9616 }, { "crossentropy": 2.562652587890625, "epoch": 0.3486441415313225, "grad_norm": 0.030196884647011757, "grad_norm_var": 5.890336400900239e-06, "learning_rate": 0.007472607812054177, "loss": 2.5383, "step": 9617 }, { "crossentropy": 2.608823776245117, "epoch": 0.3486803944315545, "grad_norm": 0.028262590989470482, "grad_norm_var": 5.887888336977019e-06, "learning_rate": 0.007472102734038706, "loss": 2.6798, "step": 9618 }, { "crossentropy": 2.7328882217407227, "epoch": 0.34871664733178653, "grad_norm": 0.03184494748711586, "grad_norm_var": 5.8381124884279715e-06, "learning_rate": 0.007471597622633735, "loss": 2.7345, "step": 9619 }, { "crossentropy": 2.5182387828826904, "epoch": 0.34875290023201855, "grad_norm": 0.030228659510612488, "grad_norm_var": 5.750841114071127e-06, "learning_rate": 0.0074710924778460894, "loss": 2.5777, "step": 9620 }, { "crossentropy": 2.6733875274658203, "epoch": 0.34878915313225056, "grad_norm": 0.029052739962935448, "grad_norm_var": 5.479179603247901e-06, "learning_rate": 0.007470587299682588, "loss": 2.6412, "step": 9621 }, { "crossentropy": 2.7045042514801025, "epoch": 0.3488254060324826, "grad_norm": 0.02993299998342991, "grad_norm_var": 5.423932699618453e-06, "learning_rate": 0.007470082088150056, "loss": 2.6391, "step": 9622 }, { "crossentropy": 2.650724172592163, "epoch": 0.3488616589327146, "grad_norm": 0.02921401336789131, "grad_norm_var": 5.439892771348596e-06, "learning_rate": 0.007469576843255317, "loss": 2.6249, "step": 9623 }, { "crossentropy": 2.711519479751587, "epoch": 0.34889791183294666, "grad_norm": 0.03119850531220436, "grad_norm_var": 5.553856869747649e-06, "learning_rate": 0.007469071565005196, "loss": 2.5871, "step": 9624 }, { "crossentropy": 2.6145217418670654, "epoch": 0.3489341647331787, "grad_norm": 0.033497873693704605, "grad_norm_var": 6.358665419037695e-06, "learning_rate": 0.007468566253406516, "loss": 2.6181, "step": 9625 }, { "crossentropy": 2.6694905757904053, "epoch": 0.3489704176334107, "grad_norm": 0.03612158074975014, "grad_norm_var": 8.252898333392656e-06, "learning_rate": 0.007468060908466102, "loss": 2.7023, "step": 9626 }, { "crossentropy": 2.9008407592773438, "epoch": 0.3490066705336427, "grad_norm": 0.02939033880829811, "grad_norm_var": 4.833591288532814e-06, "learning_rate": 0.007467555530190781, "loss": 2.7409, "step": 9627 }, { "crossentropy": 2.6849889755249023, "epoch": 0.3490429234338747, "grad_norm": 0.030796485021710396, "grad_norm_var": 4.823887230898972e-06, "learning_rate": 0.007467050118587377, "loss": 2.7219, "step": 9628 }, { "crossentropy": 2.6233444213867188, "epoch": 0.34907917633410673, "grad_norm": 0.029305871576070786, "grad_norm_var": 4.584550910704958e-06, "learning_rate": 0.007466544673662717, "loss": 2.6109, "step": 9629 }, { "crossentropy": 2.579460382461548, "epoch": 0.34911542923433875, "grad_norm": 0.02813389152288437, "grad_norm_var": 4.713604740489901e-06, "learning_rate": 0.007466039195423629, "loss": 2.5326, "step": 9630 }, { "crossentropy": 2.707669258117676, "epoch": 0.34915168213457076, "grad_norm": 0.028149625286459923, "grad_norm_var": 4.837710604660891e-06, "learning_rate": 0.007465533683876939, "loss": 2.6918, "step": 9631 }, { "crossentropy": 2.6431398391723633, "epoch": 0.3491879350348028, "grad_norm": 0.03799229860305786, "grad_norm_var": 8.024103350980024e-06, "learning_rate": 0.007465028139029475, "loss": 2.5972, "step": 9632 }, { "crossentropy": 2.6963324546813965, "epoch": 0.3492241879350348, "grad_norm": 0.03362206369638443, "grad_norm_var": 8.467084346296334e-06, "learning_rate": 0.0074645225608880655, "loss": 2.6106, "step": 9633 }, { "crossentropy": 2.6588056087493896, "epoch": 0.3492604408352668, "grad_norm": 0.031129004433751106, "grad_norm_var": 7.916615229251499e-06, "learning_rate": 0.007464016949459539, "loss": 2.6482, "step": 9634 }, { "crossentropy": 2.5697178840637207, "epoch": 0.3492966937354988, "grad_norm": 0.029288137331604958, "grad_norm_var": 8.114082266795902e-06, "learning_rate": 0.007463511304750723, "loss": 2.6882, "step": 9635 }, { "crossentropy": 2.649350881576538, "epoch": 0.34933294663573083, "grad_norm": 0.028045613318681717, "grad_norm_var": 8.655630228736102e-06, "learning_rate": 0.00746300562676845, "loss": 2.6158, "step": 9636 }, { "crossentropy": 2.722428798675537, "epoch": 0.3493691995359629, "grad_norm": 0.027530938386917114, "grad_norm_var": 9.181168108095258e-06, "learning_rate": 0.007462499915519547, "loss": 2.6538, "step": 9637 }, { "crossentropy": 2.657165288925171, "epoch": 0.3494054524361949, "grad_norm": 0.027648786082863808, "grad_norm_var": 9.781780179343016e-06, "learning_rate": 0.0074619941710108465, "loss": 2.6684, "step": 9638 }, { "crossentropy": 2.548527717590332, "epoch": 0.34944170533642693, "grad_norm": 0.028927071020007133, "grad_norm_var": 9.843455761080326e-06, "learning_rate": 0.007461488393249178, "loss": 2.5427, "step": 9639 }, { "crossentropy": 2.537214756011963, "epoch": 0.34947795823665895, "grad_norm": 0.02830461785197258, "grad_norm_var": 1.0164343406212674e-05, "learning_rate": 0.007460982582241374, "loss": 2.5511, "step": 9640 }, { "crossentropy": 2.6287741661071777, "epoch": 0.34951421113689096, "grad_norm": 0.028530118986964226, "grad_norm_var": 9.716267604326307e-06, "learning_rate": 0.0074604767379942635, "loss": 2.5525, "step": 9641 }, { "crossentropy": 2.5962681770324707, "epoch": 0.349550464037123, "grad_norm": 0.030154278501868248, "grad_norm_var": 7.2162622270693925e-06, "learning_rate": 0.007459970860514682, "loss": 2.6191, "step": 9642 }, { "crossentropy": 2.5413150787353516, "epoch": 0.349586716937355, "grad_norm": 0.031046099960803986, "grad_norm_var": 7.29511080320381e-06, "learning_rate": 0.007459464949809461, "loss": 2.4975, "step": 9643 }, { "crossentropy": 2.792755126953125, "epoch": 0.349622969837587, "grad_norm": 0.03361597657203674, "grad_norm_var": 8.124159864353086e-06, "learning_rate": 0.007458959005885434, "loss": 2.8281, "step": 9644 }, { "crossentropy": 2.5111138820648193, "epoch": 0.349659222737819, "grad_norm": 0.030961109325289726, "grad_norm_var": 8.122557528056756e-06, "learning_rate": 0.007458453028749434, "loss": 2.5654, "step": 9645 }, { "crossentropy": 2.615158796310425, "epoch": 0.34969547563805103, "grad_norm": 0.029720241203904152, "grad_norm_var": 7.844420936083451e-06, "learning_rate": 0.007457947018408294, "loss": 2.6728, "step": 9646 }, { "crossentropy": 2.5159060955047607, "epoch": 0.34973172853828305, "grad_norm": 0.02945234812796116, "grad_norm_var": 7.578431478431541e-06, "learning_rate": 0.00745744097486885, "loss": 2.6043, "step": 9647 }, { "crossentropy": 2.7077293395996094, "epoch": 0.34976798143851506, "grad_norm": 0.027306506410241127, "grad_norm_var": 3.859363252627033e-06, "learning_rate": 0.007456934898137937, "loss": 2.5927, "step": 9648 }, { "crossentropy": 2.6080195903778076, "epoch": 0.3498042343387471, "grad_norm": 0.026955971494317055, "grad_norm_var": 3.155289682356963e-06, "learning_rate": 0.0074564287882223895, "loss": 2.6481, "step": 9649 }, { "crossentropy": 2.7182204723358154, "epoch": 0.3498404872389791, "grad_norm": 0.029531752690672874, "grad_norm_var": 2.922784882401644e-06, "learning_rate": 0.007455922645129043, "loss": 2.6915, "step": 9650 }, { "crossentropy": 2.6639864444732666, "epoch": 0.34987674013921116, "grad_norm": 0.029384687542915344, "grad_norm_var": 2.924647301078461e-06, "learning_rate": 0.007455416468864733, "loss": 2.5817, "step": 9651 }, { "crossentropy": 2.6301565170288086, "epoch": 0.3499129930394432, "grad_norm": 0.029401004314422607, "grad_norm_var": 2.8317931697070695e-06, "learning_rate": 0.007454910259436299, "loss": 2.6233, "step": 9652 }, { "crossentropy": 2.6021108627319336, "epoch": 0.3499492459396752, "grad_norm": 0.030159544199705124, "grad_norm_var": 2.6508147397096005e-06, "learning_rate": 0.007454404016850575, "loss": 2.6906, "step": 9653 }, { "crossentropy": 2.603705644607544, "epoch": 0.3499854988399072, "grad_norm": 0.03452994301915169, "grad_norm_var": 3.963346113905691e-06, "learning_rate": 0.007453897741114402, "loss": 2.5899, "step": 9654 }, { "crossentropy": 2.7400553226470947, "epoch": 0.3500217517401392, "grad_norm": 0.03419622406363487, "grad_norm_var": 5.033445800189015e-06, "learning_rate": 0.0074533914322346155, "loss": 2.6879, "step": 9655 }, { "crossentropy": 2.644836902618408, "epoch": 0.35005800464037123, "grad_norm": 0.03252462297677994, "grad_norm_var": 5.078230597604406e-06, "learning_rate": 0.007452885090218054, "loss": 2.6178, "step": 9656 }, { "crossentropy": 2.4713149070739746, "epoch": 0.35009425754060325, "grad_norm": 0.030026333406567574, "grad_norm_var": 4.831767706578449e-06, "learning_rate": 0.0074523787150715575, "loss": 2.6111, "step": 9657 }, { "crossentropy": 2.650980234146118, "epoch": 0.35013051044083526, "grad_norm": 0.027939626947045326, "grad_norm_var": 5.258237148169677e-06, "learning_rate": 0.007451872306801964, "loss": 2.604, "step": 9658 }, { "crossentropy": 2.51987886428833, "epoch": 0.3501667633410673, "grad_norm": 0.027675911784172058, "grad_norm_var": 5.687677877759013e-06, "learning_rate": 0.007451365865416115, "loss": 2.6098, "step": 9659 }, { "crossentropy": 2.517693281173706, "epoch": 0.3502030162412993, "grad_norm": 0.028352444991469383, "grad_norm_var": 5.02985335868616e-06, "learning_rate": 0.00745085939092085, "loss": 2.5936, "step": 9660 }, { "crossentropy": 2.4982709884643555, "epoch": 0.3502392691415313, "grad_norm": 0.027559634298086166, "grad_norm_var": 5.263749722901149e-06, "learning_rate": 0.007450352883323008, "loss": 2.5586, "step": 9661 }, { "crossentropy": 2.5831305980682373, "epoch": 0.3502755220417633, "grad_norm": 0.02847297117114067, "grad_norm_var": 5.352591348090291e-06, "learning_rate": 0.007449846342629434, "loss": 2.6172, "step": 9662 }, { "crossentropy": 2.6043765544891357, "epoch": 0.35031177494199534, "grad_norm": 0.0335843451321125, "grad_norm_var": 6.3428251236299336e-06, "learning_rate": 0.007449339768846965, "loss": 2.6145, "step": 9663 }, { "crossentropy": 2.427056312561035, "epoch": 0.3503480278422274, "grad_norm": 0.029217705130577087, "grad_norm_var": 5.9229438194574e-06, "learning_rate": 0.007448833161982448, "loss": 2.5506, "step": 9664 }, { "crossentropy": 2.533630609512329, "epoch": 0.3503842807424594, "grad_norm": 0.027578631415963173, "grad_norm_var": 5.696984530116388e-06, "learning_rate": 0.007448326522042721, "loss": 2.5875, "step": 9665 }, { "crossentropy": 2.6083855628967285, "epoch": 0.35042053364269143, "grad_norm": 0.029084935784339905, "grad_norm_var": 5.737862568331159e-06, "learning_rate": 0.0074478198490346295, "loss": 2.5463, "step": 9666 }, { "crossentropy": 2.6213605403900146, "epoch": 0.35045678654292345, "grad_norm": 0.028039827942848206, "grad_norm_var": 5.957747252941364e-06, "learning_rate": 0.007447313142965016, "loss": 2.6616, "step": 9667 }, { "crossentropy": 2.7157106399536133, "epoch": 0.35049303944315546, "grad_norm": 0.028929444029927254, "grad_norm_var": 6.00279831362061e-06, "learning_rate": 0.007446806403840726, "loss": 2.7215, "step": 9668 }, { "crossentropy": 2.667917251586914, "epoch": 0.3505292923433875, "grad_norm": 0.029098747298121452, "grad_norm_var": 6.031752905252356e-06, "learning_rate": 0.0074462996316686, "loss": 2.6236, "step": 9669 }, { "crossentropy": 2.5777711868286133, "epoch": 0.3505655452436195, "grad_norm": 0.027378089725971222, "grad_norm_var": 4.718861019508263e-06, "learning_rate": 0.007445792826455486, "loss": 2.588, "step": 9670 }, { "crossentropy": 2.611121416091919, "epoch": 0.3506017981438515, "grad_norm": 0.03019905835390091, "grad_norm_var": 3.1366047546522538e-06, "learning_rate": 0.007445285988208229, "loss": 2.6374, "step": 9671 }, { "crossentropy": 2.6737446784973145, "epoch": 0.3506380510440835, "grad_norm": 0.02869652397930622, "grad_norm_var": 2.306516745168814e-06, "learning_rate": 0.007444779116933673, "loss": 2.6814, "step": 9672 }, { "crossentropy": 2.6466546058654785, "epoch": 0.35067430394431554, "grad_norm": 0.02868092991411686, "grad_norm_var": 2.211255709650079e-06, "learning_rate": 0.007444272212638666, "loss": 2.6251, "step": 9673 }, { "crossentropy": 2.653554677963257, "epoch": 0.35071055684454755, "grad_norm": 0.02905138209462166, "grad_norm_var": 2.1638520160571194e-06, "learning_rate": 0.007443765275330052, "loss": 2.6239, "step": 9674 }, { "crossentropy": 2.8635754585266113, "epoch": 0.35074680974477956, "grad_norm": 0.03418207913637161, "grad_norm_var": 3.790950166200566e-06, "learning_rate": 0.007443258305014681, "loss": 2.8049, "step": 9675 }, { "crossentropy": 2.7338926792144775, "epoch": 0.3507830626450116, "grad_norm": 0.03164716064929962, "grad_norm_var": 4.072174379851157e-06, "learning_rate": 0.007442751301699397, "loss": 2.5584, "step": 9676 }, { "crossentropy": 2.614901065826416, "epoch": 0.3508193155452436, "grad_norm": 0.033093757927417755, "grad_norm_var": 4.582172000983662e-06, "learning_rate": 0.007442244265391052, "loss": 2.7138, "step": 9677 }, { "crossentropy": 2.6878199577331543, "epoch": 0.35085556844547566, "grad_norm": 0.03196443244814873, "grad_norm_var": 4.722351512792183e-06, "learning_rate": 0.00744173719609649, "loss": 2.6675, "step": 9678 }, { "crossentropy": 2.6171000003814697, "epoch": 0.3508918213457077, "grad_norm": 0.030458014458417892, "grad_norm_var": 3.850235750529881e-06, "learning_rate": 0.007441230093822563, "loss": 2.6453, "step": 9679 }, { "crossentropy": 2.665287733078003, "epoch": 0.3509280742459397, "grad_norm": 0.03061571903526783, "grad_norm_var": 3.858014137784968e-06, "learning_rate": 0.007440722958576118, "loss": 2.646, "step": 9680 }, { "crossentropy": 2.592294931411743, "epoch": 0.3509643271461717, "grad_norm": 0.03144536539912224, "grad_norm_var": 3.586049766139326e-06, "learning_rate": 0.007440215790364005, "loss": 2.7231, "step": 9681 }, { "crossentropy": 2.674295663833618, "epoch": 0.3510005800464037, "grad_norm": 0.031223250553011894, "grad_norm_var": 3.5652166132766132e-06, "learning_rate": 0.007439708589193075, "loss": 2.6653, "step": 9682 }, { "crossentropy": 2.6592166423797607, "epoch": 0.35103683294663574, "grad_norm": 0.03176440671086311, "grad_norm_var": 3.312808269880646e-06, "learning_rate": 0.00743920135507018, "loss": 2.6342, "step": 9683 }, { "crossentropy": 2.3638877868652344, "epoch": 0.35107308584686775, "grad_norm": 0.029319120571017265, "grad_norm_var": 3.2393065611387046e-06, "learning_rate": 0.007438694088002167, "loss": 2.4564, "step": 9684 }, { "crossentropy": 2.5959227085113525, "epoch": 0.35110933874709976, "grad_norm": 0.028088320046663284, "grad_norm_var": 3.4987866864116437e-06, "learning_rate": 0.00743818678799589, "loss": 2.6481, "step": 9685 }, { "crossentropy": 2.6147561073303223, "epoch": 0.3511455916473318, "grad_norm": 0.027129704132676125, "grad_norm_var": 3.6056360983883e-06, "learning_rate": 0.0074376794550581994, "loss": 2.5566, "step": 9686 }, { "crossentropy": 2.581678628921509, "epoch": 0.3511818445475638, "grad_norm": 0.029331061989068985, "grad_norm_var": 3.684365281771823e-06, "learning_rate": 0.007437172089195951, "loss": 2.6119, "step": 9687 }, { "crossentropy": 2.6932759284973145, "epoch": 0.3512180974477958, "grad_norm": 0.02848091349005699, "grad_norm_var": 3.736765681204785e-06, "learning_rate": 0.0074366646904159926, "loss": 2.6027, "step": 9688 }, { "crossentropy": 2.401022434234619, "epoch": 0.3512543503480278, "grad_norm": 0.029911693185567856, "grad_norm_var": 3.5485613063883255e-06, "learning_rate": 0.00743615725872518, "loss": 2.4356, "step": 9689 }, { "crossentropy": 2.7211897373199463, "epoch": 0.35129060324825984, "grad_norm": 0.030357601121068, "grad_norm_var": 3.4061004264740775e-06, "learning_rate": 0.007435649794130367, "loss": 2.6791, "step": 9690 }, { "crossentropy": 2.5746850967407227, "epoch": 0.3513268561484919, "grad_norm": 0.028143489733338356, "grad_norm_var": 2.771482682811741e-06, "learning_rate": 0.007435142296638409, "loss": 2.6272, "step": 9691 }, { "crossentropy": 2.736400842666626, "epoch": 0.3513631090487239, "grad_norm": 0.027915049344301224, "grad_norm_var": 2.9148665379602943e-06, "learning_rate": 0.0074346347662561565, "loss": 2.6441, "step": 9692 }, { "crossentropy": 2.599889039993286, "epoch": 0.35139936194895594, "grad_norm": 0.027348801493644714, "grad_norm_var": 2.571554958014551e-06, "learning_rate": 0.007434127202990467, "loss": 2.5878, "step": 9693 }, { "crossentropy": 2.6236188411712646, "epoch": 0.35143561484918795, "grad_norm": 0.027286255732178688, "grad_norm_var": 2.460539871859202e-06, "learning_rate": 0.0074336196068481965, "loss": 2.6001, "step": 9694 }, { "crossentropy": 2.7449488639831543, "epoch": 0.35147186774941996, "grad_norm": 0.027530411258339882, "grad_norm_var": 2.5446489276876106e-06, "learning_rate": 0.0074331119778362, "loss": 2.7223, "step": 9695 }, { "crossentropy": 2.7020816802978516, "epoch": 0.351508120649652, "grad_norm": 0.02954370714724064, "grad_norm_var": 2.402426436153883e-06, "learning_rate": 0.007432604315961334, "loss": 2.7478, "step": 9696 }, { "crossentropy": 2.6281259059906006, "epoch": 0.351544373549884, "grad_norm": 0.02896987646818161, "grad_norm_var": 1.9951976252652324e-06, "learning_rate": 0.007432096621230454, "loss": 2.6396, "step": 9697 }, { "crossentropy": 2.657707691192627, "epoch": 0.351580626450116, "grad_norm": 0.027649199590086937, "grad_norm_var": 1.684762598045536e-06, "learning_rate": 0.0074315888936504195, "loss": 2.5679, "step": 9698 }, { "crossentropy": 2.5043184757232666, "epoch": 0.351616879350348, "grad_norm": 0.028645329177379608, "grad_norm_var": 1.0071998267727548e-06, "learning_rate": 0.007431081133228086, "loss": 2.595, "step": 9699 }, { "crossentropy": 2.5534024238586426, "epoch": 0.35165313225058004, "grad_norm": 0.029977722093462944, "grad_norm_var": 1.1081574363182184e-06, "learning_rate": 0.007430573339970314, "loss": 2.5778, "step": 9700 }, { "crossentropy": 2.749157428741455, "epoch": 0.35168938515081205, "grad_norm": 0.032780274748802185, "grad_norm_var": 2.214428275839018e-06, "learning_rate": 0.007430065513883959, "loss": 2.6384, "step": 9701 }, { "crossentropy": 2.8159618377685547, "epoch": 0.35172563805104406, "grad_norm": 0.030121032148599625, "grad_norm_var": 2.1024812034695403e-06, "learning_rate": 0.007429557654975882, "loss": 2.7259, "step": 9702 }, { "crossentropy": 2.663996934890747, "epoch": 0.3517618909512761, "grad_norm": 0.02825697511434555, "grad_norm_var": 2.127105347316304e-06, "learning_rate": 0.007429049763252942, "loss": 2.6655, "step": 9703 }, { "crossentropy": 2.742283344268799, "epoch": 0.3517981438515081, "grad_norm": 0.03347943723201752, "grad_norm_var": 3.387783522587025e-06, "learning_rate": 0.007428541838721999, "loss": 2.7405, "step": 9704 }, { "crossentropy": 2.367706537246704, "epoch": 0.35183439675174016, "grad_norm": 0.0329255536198616, "grad_norm_var": 4.22348152774112e-06, "learning_rate": 0.007428033881389913, "loss": 2.5774, "step": 9705 }, { "crossentropy": 2.6213810443878174, "epoch": 0.3518706496519722, "grad_norm": 0.030361590906977654, "grad_norm_var": 4.223974293740999e-06, "learning_rate": 0.007427525891263544, "loss": 2.5789, "step": 9706 }, { "crossentropy": 2.5760226249694824, "epoch": 0.3519069025522042, "grad_norm": 0.0288440752774477, "grad_norm_var": 4.134156433433289e-06, "learning_rate": 0.007427017868349754, "loss": 2.5992, "step": 9707 }, { "crossentropy": 2.5964698791503906, "epoch": 0.3519431554524362, "grad_norm": 0.028991421684622765, "grad_norm_var": 3.982372601495955e-06, "learning_rate": 0.007426509812655406, "loss": 2.6884, "step": 9708 }, { "crossentropy": 2.5604991912841797, "epoch": 0.3519794083526682, "grad_norm": 0.03034711629152298, "grad_norm_var": 3.6664631880372587e-06, "learning_rate": 0.007426001724187359, "loss": 2.5475, "step": 9709 }, { "crossentropy": 2.500364303588867, "epoch": 0.35201566125290024, "grad_norm": 0.03102678246796131, "grad_norm_var": 3.3212145572349947e-06, "learning_rate": 0.007425493602952478, "loss": 2.4965, "step": 9710 }, { "crossentropy": 2.4881739616394043, "epoch": 0.35205191415313225, "grad_norm": 0.027554228901863098, "grad_norm_var": 3.3135164382594308e-06, "learning_rate": 0.007424985448957624, "loss": 2.496, "step": 9711 }, { "crossentropy": 2.665332317352295, "epoch": 0.35208816705336426, "grad_norm": 0.027597561478614807, "grad_norm_var": 3.660110402440899e-06, "learning_rate": 0.007424477262209663, "loss": 2.6905, "step": 9712 }, { "crossentropy": 2.6559529304504395, "epoch": 0.3521244199535963, "grad_norm": 0.026918616145849228, "grad_norm_var": 4.162576979454581e-06, "learning_rate": 0.007423969042715456, "loss": 2.6352, "step": 9713 }, { "crossentropy": 2.634594678878784, "epoch": 0.3521606728538283, "grad_norm": 0.029783733189105988, "grad_norm_var": 3.858748864282478e-06, "learning_rate": 0.007423460790481868, "loss": 2.631, "step": 9714 }, { "crossentropy": 2.527031183242798, "epoch": 0.3521969257540603, "grad_norm": 0.02979237399995327, "grad_norm_var": 3.7566298775788704e-06, "learning_rate": 0.007422952505515766, "loss": 2.4702, "step": 9715 }, { "crossentropy": 2.6351449489593506, "epoch": 0.3522331786542923, "grad_norm": 0.028174923732876778, "grad_norm_var": 3.946463485954751e-06, "learning_rate": 0.0074224441878240136, "loss": 2.5489, "step": 9716 }, { "crossentropy": 2.650015115737915, "epoch": 0.35226943155452434, "grad_norm": 0.027844535186886787, "grad_norm_var": 3.5141481317279974e-06, "learning_rate": 0.007421935837413475, "loss": 2.6468, "step": 9717 }, { "crossentropy": 2.6935946941375732, "epoch": 0.3523056844547564, "grad_norm": 0.028709053993225098, "grad_norm_var": 3.5220702606877783e-06, "learning_rate": 0.007421427454291018, "loss": 2.6675, "step": 9718 }, { "crossentropy": 2.6406126022338867, "epoch": 0.3523419373549884, "grad_norm": 0.0274923425167799, "grad_norm_var": 3.6764694745334074e-06, "learning_rate": 0.007420919038463508, "loss": 2.6439, "step": 9719 }, { "crossentropy": 2.694211959838867, "epoch": 0.35237819025522044, "grad_norm": 0.02944743074476719, "grad_norm_var": 2.4807241761263018e-06, "learning_rate": 0.0074204105899378136, "loss": 2.688, "step": 9720 }, { "crossentropy": 2.641422748565674, "epoch": 0.35241444315545245, "grad_norm": 0.027543313801288605, "grad_norm_var": 1.5553951924427197e-06, "learning_rate": 0.0074199021087208005, "loss": 2.7079, "step": 9721 }, { "crossentropy": 2.5494635105133057, "epoch": 0.35245069605568446, "grad_norm": 0.028083987534046173, "grad_norm_var": 1.3983481974443972e-06, "learning_rate": 0.007419393594819337, "loss": 2.5837, "step": 9722 }, { "crossentropy": 2.6685845851898193, "epoch": 0.3524869489559165, "grad_norm": 0.03079959563910961, "grad_norm_var": 1.6920039676519722e-06, "learning_rate": 0.007418885048240291, "loss": 2.5503, "step": 9723 }, { "crossentropy": 2.6713221073150635, "epoch": 0.3525232018561485, "grad_norm": 0.030114328488707542, "grad_norm_var": 1.8059559014284182e-06, "learning_rate": 0.007418376468990533, "loss": 2.6422, "step": 9724 }, { "crossentropy": 2.655466079711914, "epoch": 0.3525594547563805, "grad_norm": 0.029782675206661224, "grad_norm_var": 1.7114561006091484e-06, "learning_rate": 0.00741786785707693, "loss": 2.6177, "step": 9725 }, { "crossentropy": 2.6786575317382812, "epoch": 0.3525957076566125, "grad_norm": 0.03436025604605675, "grad_norm_var": 3.399418457523936e-06, "learning_rate": 0.007417359212506353, "loss": 2.6818, "step": 9726 }, { "crossentropy": 2.5842039585113525, "epoch": 0.35263196055684454, "grad_norm": 0.035839926451444626, "grad_norm_var": 6.093059887948707e-06, "learning_rate": 0.007416850535285671, "loss": 2.5966, "step": 9727 }, { "crossentropy": 2.6738381385803223, "epoch": 0.35266821345707655, "grad_norm": 0.03480369225144386, "grad_norm_var": 7.493590037421999e-06, "learning_rate": 0.007416341825421754, "loss": 2.6836, "step": 9728 }, { "crossentropy": 2.4289910793304443, "epoch": 0.35270446635730857, "grad_norm": 0.0302128903567791, "grad_norm_var": 6.83237786716823e-06, "learning_rate": 0.007415833082921474, "loss": 2.5695, "step": 9729 }, { "crossentropy": 2.5629947185516357, "epoch": 0.3527407192575406, "grad_norm": 0.029424462467432022, "grad_norm_var": 6.859143112370698e-06, "learning_rate": 0.007415324307791702, "loss": 2.6035, "step": 9730 }, { "crossentropy": 2.6961493492126465, "epoch": 0.3527769721577726, "grad_norm": 0.02740536816418171, "grad_norm_var": 7.3295891051972645e-06, "learning_rate": 0.00741481550003931, "loss": 2.6794, "step": 9731 }, { "crossentropy": 2.629873037338257, "epoch": 0.35281322505800466, "grad_norm": 0.02867836132645607, "grad_norm_var": 7.222758721079035e-06, "learning_rate": 0.007414306659671171, "loss": 2.6116, "step": 9732 }, { "crossentropy": 2.744361400604248, "epoch": 0.3528494779582367, "grad_norm": 0.029653238132596016, "grad_norm_var": 6.899236249877326e-06, "learning_rate": 0.007413797786694157, "loss": 2.7178, "step": 9733 }, { "crossentropy": 2.4746670722961426, "epoch": 0.3528857308584687, "grad_norm": 0.033441416919231415, "grad_norm_var": 7.391664747420645e-06, "learning_rate": 0.007413288881115139, "loss": 2.5075, "step": 9734 }, { "crossentropy": 2.7821764945983887, "epoch": 0.3529219837587007, "grad_norm": 0.029839929193258286, "grad_norm_var": 6.812614775160647e-06, "learning_rate": 0.007412779942940992, "loss": 2.7147, "step": 9735 }, { "crossentropy": 2.796921730041504, "epoch": 0.3529582366589327, "grad_norm": 0.029553106054663658, "grad_norm_var": 6.797221919320574e-06, "learning_rate": 0.007412270972178592, "loss": 2.7157, "step": 9736 }, { "crossentropy": 2.5802576541900635, "epoch": 0.35299448955916474, "grad_norm": 0.03007030487060547, "grad_norm_var": 6.167767518674968e-06, "learning_rate": 0.007411761968834811, "loss": 2.6242, "step": 9737 }, { "crossentropy": 2.5348236560821533, "epoch": 0.35303074245939675, "grad_norm": 0.02891707792878151, "grad_norm_var": 5.914566628552317e-06, "learning_rate": 0.007411252932916525, "loss": 2.6341, "step": 9738 }, { "crossentropy": 2.64408540725708, "epoch": 0.35306699535962877, "grad_norm": 0.027991311624646187, "grad_norm_var": 6.4098830896412575e-06, "learning_rate": 0.007410743864430609, "loss": 2.6082, "step": 9739 }, { "crossentropy": 2.760934829711914, "epoch": 0.3531032482598608, "grad_norm": 0.03107040748000145, "grad_norm_var": 6.40121067853582e-06, "learning_rate": 0.0074102347633839384, "loss": 2.7015, "step": 9740 }, { "crossentropy": 2.69301700592041, "epoch": 0.3531395011600928, "grad_norm": 0.028169823810458183, "grad_norm_var": 6.758968088303715e-06, "learning_rate": 0.007409725629783389, "loss": 2.6878, "step": 9741 }, { "crossentropy": 2.640064239501953, "epoch": 0.3531757540603248, "grad_norm": 0.03153526410460472, "grad_norm_var": 5.837430195601553e-06, "learning_rate": 0.0074092164636358395, "loss": 2.649, "step": 9742 }, { "crossentropy": 2.554643392562866, "epoch": 0.3532120069605568, "grad_norm": 0.03336979076266289, "grad_norm_var": 4.4313831730396906e-06, "learning_rate": 0.007408707264948165, "loss": 2.6772, "step": 9743 }, { "crossentropy": 2.5515706539154053, "epoch": 0.35324825986078884, "grad_norm": 0.03451201692223549, "grad_norm_var": 4.259938685694413e-06, "learning_rate": 0.007408198033727243, "loss": 2.6013, "step": 9744 }, { "crossentropy": 2.528972864151001, "epoch": 0.3532845127610209, "grad_norm": 0.03143356367945671, "grad_norm_var": 4.348605603664739e-06, "learning_rate": 0.007407688769979952, "loss": 2.5988, "step": 9745 }, { "crossentropy": 2.638500928878784, "epoch": 0.3533207656612529, "grad_norm": 0.030352508649230003, "grad_norm_var": 4.292043527569234e-06, "learning_rate": 0.007407179473713172, "loss": 2.6359, "step": 9746 }, { "crossentropy": 2.644308567047119, "epoch": 0.35335701856148494, "grad_norm": 0.028824996203184128, "grad_norm_var": 3.8559765164673885e-06, "learning_rate": 0.007406670144933778, "loss": 2.674, "step": 9747 }, { "crossentropy": 2.6223576068878174, "epoch": 0.35339327146171695, "grad_norm": 0.02813011221587658, "grad_norm_var": 4.005242841098079e-06, "learning_rate": 0.007406160783648652, "loss": 2.624, "step": 9748 }, { "crossentropy": 2.5528030395507812, "epoch": 0.35342952436194897, "grad_norm": 0.029599802568554878, "grad_norm_var": 4.01094879088944e-06, "learning_rate": 0.007405651389864674, "loss": 2.5002, "step": 9749 }, { "crossentropy": 2.687830686569214, "epoch": 0.353465777262181, "grad_norm": 0.03291761875152588, "grad_norm_var": 3.817480657525238e-06, "learning_rate": 0.007405141963588723, "loss": 2.6854, "step": 9750 }, { "crossentropy": 2.588165521621704, "epoch": 0.353502030162413, "grad_norm": 0.03006494976580143, "grad_norm_var": 3.804052342151392e-06, "learning_rate": 0.00740463250482768, "loss": 2.5518, "step": 9751 }, { "crossentropy": 2.5728001594543457, "epoch": 0.353538283062645, "grad_norm": 0.02909621223807335, "grad_norm_var": 3.8691203501542424e-06, "learning_rate": 0.007404123013588425, "loss": 2.5917, "step": 9752 }, { "crossentropy": 2.6374995708465576, "epoch": 0.353574535962877, "grad_norm": 0.029559263959527016, "grad_norm_var": 3.906442053211315e-06, "learning_rate": 0.00740361348987784, "loss": 2.6406, "step": 9753 }, { "crossentropy": 2.5975427627563477, "epoch": 0.35361078886310904, "grad_norm": 0.032165903598070145, "grad_norm_var": 3.946909390973316e-06, "learning_rate": 0.007403103933702808, "loss": 2.6217, "step": 9754 }, { "crossentropy": 2.5101125240325928, "epoch": 0.35364704176334105, "grad_norm": 0.029957124963402748, "grad_norm_var": 3.5178876619421876e-06, "learning_rate": 0.007402594345070211, "loss": 2.5715, "step": 9755 }, { "crossentropy": 2.709707736968994, "epoch": 0.35368329466357307, "grad_norm": 0.030129974707961082, "grad_norm_var": 3.523264474898126e-06, "learning_rate": 0.00740208472398693, "loss": 2.7218, "step": 9756 }, { "crossentropy": 2.67179799079895, "epoch": 0.3537195475638051, "grad_norm": 0.028755590319633484, "grad_norm_var": 3.3538388482858786e-06, "learning_rate": 0.007401575070459852, "loss": 2.699, "step": 9757 }, { "crossentropy": 2.543828010559082, "epoch": 0.3537558004640371, "grad_norm": 0.029637224972248077, "grad_norm_var": 3.3550371630940243e-06, "learning_rate": 0.007401065384495855, "loss": 2.583, "step": 9758 }, { "crossentropy": 2.5979692935943604, "epoch": 0.35379205336426917, "grad_norm": 0.028332151472568512, "grad_norm_var": 3.0348236832387534e-06, "learning_rate": 0.00740055566610183, "loss": 2.5625, "step": 9759 }, { "crossentropy": 2.5139644145965576, "epoch": 0.3538283062645012, "grad_norm": 0.026694539934396744, "grad_norm_var": 2.3773622492091376e-06, "learning_rate": 0.0074000459152846546, "loss": 2.5067, "step": 9760 }, { "crossentropy": 2.8211166858673096, "epoch": 0.3538645591647332, "grad_norm": 0.02887106128036976, "grad_norm_var": 2.205104156673537e-06, "learning_rate": 0.007399536132051218, "loss": 2.7233, "step": 9761 }, { "crossentropy": 2.863333225250244, "epoch": 0.3539008120649652, "grad_norm": 0.028520913794636726, "grad_norm_var": 2.2232042612625737e-06, "learning_rate": 0.0073990263164084025, "loss": 2.7574, "step": 9762 }, { "crossentropy": 2.522202491760254, "epoch": 0.3539370649651972, "grad_norm": 0.027513468638062477, "grad_norm_var": 2.4406332162821375e-06, "learning_rate": 0.007398516468363099, "loss": 2.5245, "step": 9763 }, { "crossentropy": 2.6413936614990234, "epoch": 0.35397331786542924, "grad_norm": 0.026965122669935226, "grad_norm_var": 2.7183040021744665e-06, "learning_rate": 0.007398006587922189, "loss": 2.5913, "step": 9764 }, { "crossentropy": 2.7622129917144775, "epoch": 0.35400957076566125, "grad_norm": 0.02867099456489086, "grad_norm_var": 2.7349462490389163e-06, "learning_rate": 0.007397496675092562, "loss": 2.7269, "step": 9765 }, { "crossentropy": 2.6265437602996826, "epoch": 0.35404582366589327, "grad_norm": 0.030514350160956383, "grad_norm_var": 1.917729385419878e-06, "learning_rate": 0.007396986729881102, "loss": 2.6655, "step": 9766 }, { "crossentropy": 2.544922113418579, "epoch": 0.3540820765661253, "grad_norm": 0.02895534224808216, "grad_norm_var": 1.8505214490492032e-06, "learning_rate": 0.007396476752294702, "loss": 2.6239, "step": 9767 }, { "crossentropy": 2.7445027828216553, "epoch": 0.3541183294663573, "grad_norm": 0.029387738555669785, "grad_norm_var": 1.858748816520765e-06, "learning_rate": 0.007395966742340244, "loss": 2.6978, "step": 9768 }, { "crossentropy": 2.656172752380371, "epoch": 0.3541545823665893, "grad_norm": 0.02835344895720482, "grad_norm_var": 1.8660455548500747e-06, "learning_rate": 0.007395456700024621, "loss": 2.6674, "step": 9769 }, { "crossentropy": 2.6940908432006836, "epoch": 0.3541908352668213, "grad_norm": 0.02829051949083805, "grad_norm_var": 1.1502580434552169e-06, "learning_rate": 0.007394946625354717, "loss": 2.6339, "step": 9770 }, { "crossentropy": 2.5070371627807617, "epoch": 0.35422708816705334, "grad_norm": 0.029186192899942398, "grad_norm_var": 1.0604287682606608e-06, "learning_rate": 0.0073944365183374276, "loss": 2.563, "step": 9771 }, { "crossentropy": 2.543243408203125, "epoch": 0.3542633410672854, "grad_norm": 0.027930952608585358, "grad_norm_var": 9.356655101239031e-07, "learning_rate": 0.007393926378979639, "loss": 2.5633, "step": 9772 }, { "crossentropy": 2.564876079559326, "epoch": 0.3542995939675174, "grad_norm": 0.027624279260635376, "grad_norm_var": 9.82567784848356e-07, "learning_rate": 0.007393416207288239, "loss": 2.6123, "step": 9773 }, { "crossentropy": 2.66129732131958, "epoch": 0.35433584686774944, "grad_norm": 0.02840280719101429, "grad_norm_var": 8.849545046854881e-07, "learning_rate": 0.007392906003270122, "loss": 2.7036, "step": 9774 }, { "crossentropy": 2.4085183143615723, "epoch": 0.35437209976798145, "grad_norm": 0.028829235583543777, "grad_norm_var": 8.966718983776749e-07, "learning_rate": 0.0073923957669321805, "loss": 2.5585, "step": 9775 }, { "crossentropy": 2.6997599601745605, "epoch": 0.35440835266821347, "grad_norm": 0.02862984873354435, "grad_norm_var": 6.856665848019624e-07, "learning_rate": 0.0073918854982813025, "loss": 2.6869, "step": 9776 }, { "crossentropy": 2.5657737255096436, "epoch": 0.3544446055684455, "grad_norm": 0.029536936432123184, "grad_norm_var": 7.427363440568972e-07, "learning_rate": 0.0073913751973243794, "loss": 2.6914, "step": 9777 }, { "crossentropy": 2.4755215644836426, "epoch": 0.3544808584686775, "grad_norm": 0.027675673365592957, "grad_norm_var": 7.942737120345308e-07, "learning_rate": 0.007390864864068305, "loss": 2.5628, "step": 9778 }, { "crossentropy": 2.6517810821533203, "epoch": 0.3545171113689095, "grad_norm": 0.027690740302205086, "grad_norm_var": 7.722301642657982e-07, "learning_rate": 0.0073903544985199745, "loss": 2.6079, "step": 9779 }, { "crossentropy": 2.554448366165161, "epoch": 0.3545533642691415, "grad_norm": 0.02878832444548607, "grad_norm_var": 5.970781021693751e-07, "learning_rate": 0.007389844100686279, "loss": 2.5669, "step": 9780 }, { "crossentropy": 2.5726077556610107, "epoch": 0.35458961716937354, "grad_norm": 0.029804635792970657, "grad_norm_var": 6.79936293306443e-07, "learning_rate": 0.007389333670574111, "loss": 2.6313, "step": 9781 }, { "crossentropy": 2.6811017990112305, "epoch": 0.35462587006960555, "grad_norm": 0.031006937846541405, "grad_norm_var": 8.126188264127749e-07, "learning_rate": 0.007388823208190365, "loss": 2.5614, "step": 9782 }, { "crossentropy": 2.6756863594055176, "epoch": 0.35466212296983757, "grad_norm": 0.031018797308206558, "grad_norm_var": 1.133619786501426e-06, "learning_rate": 0.007388312713541938, "loss": 2.589, "step": 9783 }, { "crossentropy": 2.42018461227417, "epoch": 0.3546983758700696, "grad_norm": 0.030663451179862022, "grad_norm_var": 1.3208794398554546e-06, "learning_rate": 0.007387802186635724, "loss": 2.4401, "step": 9784 }, { "crossentropy": 2.4427435398101807, "epoch": 0.3547346287703016, "grad_norm": 0.03205110505223274, "grad_norm_var": 1.8741357676206143e-06, "learning_rate": 0.0073872916274786175, "loss": 2.6217, "step": 9785 }, { "crossentropy": 2.734607219696045, "epoch": 0.35477088167053367, "grad_norm": 0.02858773246407509, "grad_norm_var": 1.843787776084959e-06, "learning_rate": 0.007386781036077512, "loss": 2.7074, "step": 9786 }, { "crossentropy": 2.715597152709961, "epoch": 0.3548071345707657, "grad_norm": 0.029148345813155174, "grad_norm_var": 1.8440187749018654e-06, "learning_rate": 0.007386270412439309, "loss": 2.6351, "step": 9787 }, { "crossentropy": 2.7030282020568848, "epoch": 0.3548433874709977, "grad_norm": 0.032561250030994415, "grad_norm_var": 2.3931978640978045e-06, "learning_rate": 0.007385759756570904, "loss": 2.6861, "step": 9788 }, { "crossentropy": 2.7232918739318848, "epoch": 0.3548796403712297, "grad_norm": 0.031891755759716034, "grad_norm_var": 2.4634135394343535e-06, "learning_rate": 0.007385249068479191, "loss": 2.6755, "step": 9789 }, { "crossentropy": 2.652437448501587, "epoch": 0.3549158932714617, "grad_norm": 0.032460782676935196, "grad_norm_var": 2.7539697885586016e-06, "learning_rate": 0.007384738348171068, "loss": 2.6199, "step": 9790 }, { "crossentropy": 2.6514806747436523, "epoch": 0.35495214617169374, "grad_norm": 0.03673906251788139, "grad_norm_var": 5.406788543556061e-06, "learning_rate": 0.0073842275956534364, "loss": 2.6213, "step": 9791 }, { "crossentropy": 2.6666922569274902, "epoch": 0.35498839907192575, "grad_norm": 0.033083170652389526, "grad_norm_var": 5.526364482819473e-06, "learning_rate": 0.007383716810933195, "loss": 2.6604, "step": 9792 }, { "crossentropy": 2.509600877761841, "epoch": 0.35502465197215777, "grad_norm": 0.0279921256005764, "grad_norm_var": 5.9345009293530645e-06, "learning_rate": 0.007383205994017238, "loss": 2.5664, "step": 9793 }, { "crossentropy": 2.6004672050476074, "epoch": 0.3550609048723898, "grad_norm": 0.03267482668161392, "grad_norm_var": 5.482099755110768e-06, "learning_rate": 0.007382695144912468, "loss": 2.5988, "step": 9794 }, { "crossentropy": 2.573334217071533, "epoch": 0.3550971577726218, "grad_norm": 0.03254513442516327, "grad_norm_var": 4.806398701469765e-06, "learning_rate": 0.007382184263625784, "loss": 2.6023, "step": 9795 }, { "crossentropy": 2.631859540939331, "epoch": 0.3551334106728538, "grad_norm": 0.02978554554283619, "grad_norm_var": 4.532785424193654e-06, "learning_rate": 0.007381673350164089, "loss": 2.574, "step": 9796 }, { "crossentropy": 2.550698757171631, "epoch": 0.3551696635730858, "grad_norm": 0.03154624253511429, "grad_norm_var": 4.35748637643048e-06, "learning_rate": 0.007381162404534276, "loss": 2.5679, "step": 9797 }, { "crossentropy": 2.653529167175293, "epoch": 0.35520591647331784, "grad_norm": 0.033452730625867844, "grad_norm_var": 4.575532579466343e-06, "learning_rate": 0.007380651426743255, "loss": 2.6051, "step": 9798 }, { "crossentropy": 2.6411147117614746, "epoch": 0.3552421693735499, "grad_norm": 0.030688777565956116, "grad_norm_var": 4.609569853130504e-06, "learning_rate": 0.007380140416797923, "loss": 2.6299, "step": 9799 }, { "crossentropy": 2.6414122581481934, "epoch": 0.3552784222737819, "grad_norm": 0.028370283544063568, "grad_norm_var": 5.229787186119643e-06, "learning_rate": 0.007379629374705182, "loss": 2.6967, "step": 9800 }, { "crossentropy": 2.466796636581421, "epoch": 0.35531467517401394, "grad_norm": 0.027734534814953804, "grad_norm_var": 6.062002770656341e-06, "learning_rate": 0.007379118300471935, "loss": 2.603, "step": 9801 }, { "crossentropy": 2.6456243991851807, "epoch": 0.35535092807424595, "grad_norm": 0.026502976194024086, "grad_norm_var": 7.060848655729366e-06, "learning_rate": 0.007378607194105086, "loss": 2.6264, "step": 9802 }, { "crossentropy": 2.554593563079834, "epoch": 0.35538718097447797, "grad_norm": 0.028676249086856842, "grad_norm_var": 7.19596563990113e-06, "learning_rate": 0.007378096055611536, "loss": 2.633, "step": 9803 }, { "crossentropy": 2.523897171020508, "epoch": 0.35542343387471, "grad_norm": 0.029460787773132324, "grad_norm_var": 7.169583669430985e-06, "learning_rate": 0.007377584884998192, "loss": 2.5551, "step": 9804 }, { "crossentropy": 2.571061372756958, "epoch": 0.355459686774942, "grad_norm": 0.028637733310461044, "grad_norm_var": 7.379524038083002e-06, "learning_rate": 0.007377073682271953, "loss": 2.6089, "step": 9805 }, { "crossentropy": 2.632711887359619, "epoch": 0.355495939675174, "grad_norm": 0.03120318241417408, "grad_norm_var": 7.174225434553964e-06, "learning_rate": 0.007376562447439728, "loss": 2.6406, "step": 9806 }, { "crossentropy": 2.684356689453125, "epoch": 0.355532192575406, "grad_norm": 0.030207732692360878, "grad_norm_var": 4.466626552934481e-06, "learning_rate": 0.007376051180508422, "loss": 2.6577, "step": 9807 }, { "crossentropy": 2.795957326889038, "epoch": 0.35556844547563804, "grad_norm": 0.030184241011738777, "grad_norm_var": 3.862037269890261e-06, "learning_rate": 0.0073755398814849375, "loss": 2.6524, "step": 9808 }, { "crossentropy": 2.6492538452148438, "epoch": 0.35560469837587005, "grad_norm": 0.030391879379749298, "grad_norm_var": 3.586246749379997e-06, "learning_rate": 0.007375028550376182, "loss": 2.622, "step": 9809 }, { "crossentropy": 2.591343879699707, "epoch": 0.35564095127610207, "grad_norm": 0.027811700478196144, "grad_norm_var": 3.413568372442642e-06, "learning_rate": 0.007374517187189062, "loss": 2.5378, "step": 9810 }, { "crossentropy": 2.7270069122314453, "epoch": 0.3556772041763341, "grad_norm": 0.030202804133296013, "grad_norm_var": 2.906942977583202e-06, "learning_rate": 0.0073740057919304824, "loss": 2.6335, "step": 9811 }, { "crossentropy": 2.720348358154297, "epoch": 0.35571345707656615, "grad_norm": 0.029752394184470177, "grad_norm_var": 2.9065388922518607e-06, "learning_rate": 0.007373494364607353, "loss": 2.6861, "step": 9812 }, { "crossentropy": 2.6790611743927, "epoch": 0.35574970997679817, "grad_norm": 0.03295573219656944, "grad_norm_var": 3.382086647636615e-06, "learning_rate": 0.007372982905226581, "loss": 2.6554, "step": 9813 }, { "crossentropy": 2.5094778537750244, "epoch": 0.3557859628770302, "grad_norm": 0.03165168687701225, "grad_norm_var": 2.699159037026997e-06, "learning_rate": 0.007372471413795074, "loss": 2.6316, "step": 9814 }, { "crossentropy": 2.5302677154541016, "epoch": 0.3558222157772622, "grad_norm": 0.02989921160042286, "grad_norm_var": 2.6289797778023994e-06, "learning_rate": 0.007371959890319739, "loss": 2.6239, "step": 9815 }, { "crossentropy": 2.5985801219940186, "epoch": 0.3558584686774942, "grad_norm": 0.02818191982805729, "grad_norm_var": 2.6621495621411787e-06, "learning_rate": 0.0073714483348074865, "loss": 2.6491, "step": 9816 }, { "crossentropy": 2.7003231048583984, "epoch": 0.3558947215777262, "grad_norm": 0.029718482866883278, "grad_norm_var": 2.4170890162133105e-06, "learning_rate": 0.007370936747265225, "loss": 2.6783, "step": 9817 }, { "crossentropy": 2.6736702919006348, "epoch": 0.35593097447795824, "grad_norm": 0.028176959604024887, "grad_norm_var": 1.875329125415589e-06, "learning_rate": 0.007370425127699865, "loss": 2.7269, "step": 9818 }, { "crossentropy": 2.6756858825683594, "epoch": 0.35596722737819025, "grad_norm": 0.02723998762667179, "grad_norm_var": 2.223199706094467e-06, "learning_rate": 0.007369913476118317, "loss": 2.6453, "step": 9819 }, { "crossentropy": 2.6246297359466553, "epoch": 0.35600348027842227, "grad_norm": 0.02830948866903782, "grad_norm_var": 2.3473344533994488e-06, "learning_rate": 0.007369401792527492, "loss": 2.6317, "step": 9820 }, { "crossentropy": 2.5651352405548096, "epoch": 0.3560397331786543, "grad_norm": 0.026016894727945328, "grad_norm_var": 3.1330988420997237e-06, "learning_rate": 0.007368890076934298, "loss": 2.604, "step": 9821 }, { "crossentropy": 2.4545552730560303, "epoch": 0.3560759860788863, "grad_norm": 0.02713555470108986, "grad_norm_var": 3.2402329533769147e-06, "learning_rate": 0.007368378329345649, "loss": 2.5237, "step": 9822 }, { "crossentropy": 2.612065553665161, "epoch": 0.3561122389791183, "grad_norm": 0.027457404881715775, "grad_norm_var": 3.358047968366135e-06, "learning_rate": 0.007367866549768457, "loss": 2.5945, "step": 9823 }, { "crossentropy": 2.5679941177368164, "epoch": 0.3561484918793503, "grad_norm": 0.03008430078625679, "grad_norm_var": 3.3437965240851532e-06, "learning_rate": 0.007367354738209634, "loss": 2.6334, "step": 9824 }, { "crossentropy": 2.582770824432373, "epoch": 0.35618474477958234, "grad_norm": 0.027600331231951714, "grad_norm_var": 3.3357229847227974e-06, "learning_rate": 0.007366842894676092, "loss": 2.5677, "step": 9825 }, { "crossentropy": 2.662346363067627, "epoch": 0.3562209976798144, "grad_norm": 0.029781125485897064, "grad_norm_var": 3.2957279150469068e-06, "learning_rate": 0.007366331019174745, "loss": 2.5984, "step": 9826 }, { "crossentropy": 2.5542337894439697, "epoch": 0.3562572505800464, "grad_norm": 0.03110465593636036, "grad_norm_var": 3.4899603034423017e-06, "learning_rate": 0.007365819111712507, "loss": 2.6287, "step": 9827 }, { "crossentropy": 2.7246782779693604, "epoch": 0.35629350348027844, "grad_norm": 0.028955206274986267, "grad_norm_var": 3.4567888732451308e-06, "learning_rate": 0.007365307172296292, "loss": 2.7119, "step": 9828 }, { "crossentropy": 2.569162368774414, "epoch": 0.35632975638051045, "grad_norm": 0.030563969165086746, "grad_norm_var": 2.5581925577268834e-06, "learning_rate": 0.007364795200933014, "loss": 2.6386, "step": 9829 }, { "crossentropy": 2.703921318054199, "epoch": 0.35636600928074247, "grad_norm": 0.028878100216388702, "grad_norm_var": 2.0093018420757795e-06, "learning_rate": 0.007364283197629586, "loss": 2.6917, "step": 9830 }, { "crossentropy": 2.7727749347686768, "epoch": 0.3564022621809745, "grad_norm": 0.031008316203951836, "grad_norm_var": 2.26441509329985e-06, "learning_rate": 0.007363771162392928, "loss": 2.7339, "step": 9831 }, { "crossentropy": 2.619211196899414, "epoch": 0.3564385150812065, "grad_norm": 0.02853269875049591, "grad_norm_var": 2.244914301936738e-06, "learning_rate": 0.007363259095229953, "loss": 2.5972, "step": 9832 }, { "crossentropy": 2.3694190979003906, "epoch": 0.3564747679814385, "grad_norm": 0.02831263653934002, "grad_norm_var": 2.1935024866863904e-06, "learning_rate": 0.007362746996147577, "loss": 2.494, "step": 9833 }, { "crossentropy": 2.6279141902923584, "epoch": 0.3565110208816705, "grad_norm": 0.03041031025350094, "grad_norm_var": 2.350280961479452e-06, "learning_rate": 0.007362234865152717, "loss": 2.7648, "step": 9834 }, { "crossentropy": 2.573727607727051, "epoch": 0.35654727378190254, "grad_norm": 0.030515015125274658, "grad_norm_var": 2.323303654308088e-06, "learning_rate": 0.00736172270225229, "loss": 2.6499, "step": 9835 }, { "crossentropy": 2.5302295684814453, "epoch": 0.35658352668213456, "grad_norm": 0.029537493363022804, "grad_norm_var": 2.2976777031137135e-06, "learning_rate": 0.007361210507453216, "loss": 2.6143, "step": 9836 }, { "crossentropy": 2.5703437328338623, "epoch": 0.35661977958236657, "grad_norm": 0.02909630537033081, "grad_norm_var": 1.6169196244423972e-06, "learning_rate": 0.007360698280762408, "loss": 2.5536, "step": 9837 }, { "crossentropy": 2.509312629699707, "epoch": 0.3566560324825986, "grad_norm": 0.030651265755295753, "grad_norm_var": 1.3697441555192742e-06, "learning_rate": 0.007360186022186789, "loss": 2.4972, "step": 9838 }, { "crossentropy": 2.629499673843384, "epoch": 0.35669228538283065, "grad_norm": 0.029403431341052055, "grad_norm_var": 1.0685080363979799e-06, "learning_rate": 0.007359673731733276, "loss": 2.6756, "step": 9839 }, { "crossentropy": 2.640200614929199, "epoch": 0.35672853828306267, "grad_norm": 0.027200326323509216, "grad_norm_var": 1.4221832277882632e-06, "learning_rate": 0.007359161409408788, "loss": 2.5988, "step": 9840 }, { "crossentropy": 2.7297089099884033, "epoch": 0.3567647911832947, "grad_norm": 0.028720369562506676, "grad_norm_var": 1.2210840776349929e-06, "learning_rate": 0.007358649055220244, "loss": 2.607, "step": 9841 }, { "crossentropy": 2.586555242538452, "epoch": 0.3568010440835267, "grad_norm": 0.02765742689371109, "grad_norm_var": 1.435240628088382e-06, "learning_rate": 0.007358136669174565, "loss": 2.6206, "step": 9842 }, { "crossentropy": 2.607233762741089, "epoch": 0.3568372969837587, "grad_norm": 0.02798244170844555, "grad_norm_var": 1.3387027737398727e-06, "learning_rate": 0.007357624251278674, "loss": 2.5961, "step": 9843 }, { "crossentropy": 2.632922410964966, "epoch": 0.3568735498839907, "grad_norm": 0.027494914829730988, "grad_norm_var": 1.5223855123299689e-06, "learning_rate": 0.007357111801539489, "loss": 2.6576, "step": 9844 }, { "crossentropy": 2.690035104751587, "epoch": 0.35690980278422274, "grad_norm": 0.026883838698267937, "grad_norm_var": 1.661693577718184e-06, "learning_rate": 0.007356599319963931, "loss": 2.6659, "step": 9845 }, { "crossentropy": 2.512152910232544, "epoch": 0.35694605568445475, "grad_norm": 0.028909243643283844, "grad_norm_var": 1.6616931334647537e-06, "learning_rate": 0.0073560868065589225, "loss": 2.712, "step": 9846 }, { "crossentropy": 2.5907199382781982, "epoch": 0.35698230858468677, "grad_norm": 0.03012838028371334, "grad_norm_var": 1.4621126272273126e-06, "learning_rate": 0.007355574261331388, "loss": 2.6267, "step": 9847 }, { "crossentropy": 2.680004835128784, "epoch": 0.3570185614849188, "grad_norm": 0.028381671756505966, "grad_norm_var": 1.4697213934738852e-06, "learning_rate": 0.007355061684288248, "loss": 2.7353, "step": 9848 }, { "crossentropy": 2.573899030685425, "epoch": 0.3570548143851508, "grad_norm": 0.0301076490432024, "grad_norm_var": 1.5472017164837865e-06, "learning_rate": 0.0073545490754364255, "loss": 2.5964, "step": 9849 }, { "crossentropy": 2.7367892265319824, "epoch": 0.3570910672853828, "grad_norm": 0.03311577066779137, "grad_norm_var": 2.5341499152938793e-06, "learning_rate": 0.007354036434782843, "loss": 2.6743, "step": 9850 }, { "crossentropy": 2.4613332748413086, "epoch": 0.3571273201856148, "grad_norm": 0.03459727764129639, "grad_norm_var": 4.3395872057283916e-06, "learning_rate": 0.007353523762334429, "loss": 2.5596, "step": 9851 }, { "crossentropy": 2.5285544395446777, "epoch": 0.35716357308584684, "grad_norm": 0.03273274004459381, "grad_norm_var": 5.050434721203196e-06, "learning_rate": 0.007353011058098103, "loss": 2.5372, "step": 9852 }, { "crossentropy": 2.5186736583709717, "epoch": 0.3571998259860789, "grad_norm": 0.02834155596792698, "grad_norm_var": 5.133348900226003e-06, "learning_rate": 0.0073524983220807924, "loss": 2.5051, "step": 9853 }, { "crossentropy": 2.56728196144104, "epoch": 0.3572360788863109, "grad_norm": 0.03083329275250435, "grad_norm_var": 5.1628936269095996e-06, "learning_rate": 0.007351985554289422, "loss": 2.6354, "step": 9854 }, { "crossentropy": 2.758446455001831, "epoch": 0.35727233178654294, "grad_norm": 0.03754395991563797, "grad_norm_var": 9.166577521977267e-06, "learning_rate": 0.007351472754730918, "loss": 2.6229, "step": 9855 }, { "crossentropy": 2.6766855716705322, "epoch": 0.35730858468677495, "grad_norm": 0.041685834527015686, "grad_norm_var": 1.679750376693129e-05, "learning_rate": 0.007350959923412206, "loss": 2.5927, "step": 9856 }, { "crossentropy": 2.600269079208374, "epoch": 0.35734483758700697, "grad_norm": 0.0309746116399765, "grad_norm_var": 1.644652505031195e-05, "learning_rate": 0.007350447060340211, "loss": 2.5972, "step": 9857 }, { "crossentropy": 2.5100486278533936, "epoch": 0.357381090487239, "grad_norm": 0.028868917375802994, "grad_norm_var": 1.5984486799161174e-05, "learning_rate": 0.0073499341655218625, "loss": 2.5207, "step": 9858 }, { "crossentropy": 2.5939011573791504, "epoch": 0.357417343387471, "grad_norm": 0.02845940925180912, "grad_norm_var": 1.57965386267116e-05, "learning_rate": 0.0073494212389640864, "loss": 2.5948, "step": 9859 }, { "crossentropy": 2.554938316345215, "epoch": 0.357453596287703, "grad_norm": 0.028957590460777283, "grad_norm_var": 1.5209391819422203e-05, "learning_rate": 0.007348908280673812, "loss": 2.595, "step": 9860 }, { "crossentropy": 2.71724271774292, "epoch": 0.357489849187935, "grad_norm": 0.02764323726296425, "grad_norm_var": 1.480004539395526e-05, "learning_rate": 0.007348395290657966, "loss": 2.6799, "step": 9861 }, { "crossentropy": 2.538106918334961, "epoch": 0.35752610208816704, "grad_norm": 0.029279354959726334, "grad_norm_var": 1.4689143360568865e-05, "learning_rate": 0.007347882268923478, "loss": 2.713, "step": 9862 }, { "crossentropy": 2.5686657428741455, "epoch": 0.35756235498839906, "grad_norm": 0.030748125165700912, "grad_norm_var": 1.4611938216352995e-05, "learning_rate": 0.007347369215477276, "loss": 2.591, "step": 9863 }, { "crossentropy": 2.695194959640503, "epoch": 0.35759860788863107, "grad_norm": 0.028485765680670738, "grad_norm_var": 1.4570835389408378e-05, "learning_rate": 0.007346856130326293, "loss": 2.7085, "step": 9864 }, { "crossentropy": 2.7026665210723877, "epoch": 0.3576348607888631, "grad_norm": 0.028443841263651848, "grad_norm_var": 1.5030202540562005e-05, "learning_rate": 0.007346343013477454, "loss": 2.646, "step": 9865 }, { "crossentropy": 2.7055859565734863, "epoch": 0.35767111368909515, "grad_norm": 0.030537769198417664, "grad_norm_var": 1.4819536100047342e-05, "learning_rate": 0.007345829864937692, "loss": 2.7053, "step": 9866 }, { "crossentropy": 2.5467069149017334, "epoch": 0.35770736658932717, "grad_norm": 0.029760640114545822, "grad_norm_var": 1.4047754748646122e-05, "learning_rate": 0.0073453166847139385, "loss": 2.5263, "step": 9867 }, { "crossentropy": 2.632298707962036, "epoch": 0.3577436194895592, "grad_norm": 0.0279911570250988, "grad_norm_var": 1.4250642246254643e-05, "learning_rate": 0.007344803472813126, "loss": 2.6285, "step": 9868 }, { "crossentropy": 2.4620096683502197, "epoch": 0.3577798723897912, "grad_norm": 0.02785177156329155, "grad_norm_var": 1.4408857098856818e-05, "learning_rate": 0.007344290229242182, "loss": 2.5592, "step": 9869 }, { "crossentropy": 2.667828321456909, "epoch": 0.3578161252900232, "grad_norm": 0.028009383007884026, "grad_norm_var": 1.4783305582546925e-05, "learning_rate": 0.00734377695400804, "loss": 2.6762, "step": 9870 }, { "crossentropy": 2.7582759857177734, "epoch": 0.3578523781902552, "grad_norm": 0.03489760681986809, "grad_norm_var": 1.2674727999197923e-05, "learning_rate": 0.007343263647117635, "loss": 2.7063, "step": 9871 }, { "crossentropy": 2.5750231742858887, "epoch": 0.35788863109048724, "grad_norm": 0.025993825867772102, "grad_norm_var": 3.954120129423171e-06, "learning_rate": 0.007342750308577899, "loss": 2.6234, "step": 9872 }, { "crossentropy": 2.6725077629089355, "epoch": 0.35792488399071926, "grad_norm": 0.027430593967437744, "grad_norm_var": 3.891785376074541e-06, "learning_rate": 0.007342236938395763, "loss": 2.6817, "step": 9873 }, { "crossentropy": 2.775435447692871, "epoch": 0.35796113689095127, "grad_norm": 0.029853884130716324, "grad_norm_var": 3.940466862902336e-06, "learning_rate": 0.0073417235365781635, "loss": 2.7934, "step": 9874 }, { "crossentropy": 2.432269811630249, "epoch": 0.3579973897911833, "grad_norm": 0.02762822061777115, "grad_norm_var": 4.045940007578405e-06, "learning_rate": 0.007341210103132033, "loss": 2.6213, "step": 9875 }, { "crossentropy": 2.744281053543091, "epoch": 0.3580336426914153, "grad_norm": 0.028599128127098083, "grad_norm_var": 4.054542468022391e-06, "learning_rate": 0.0073406966380643104, "loss": 2.721, "step": 9876 }, { "crossentropy": 2.6284749507904053, "epoch": 0.3580698955916473, "grad_norm": 0.028769925236701965, "grad_norm_var": 3.9380021007210756e-06, "learning_rate": 0.0073401831413819245, "loss": 2.6243, "step": 9877 }, { "crossentropy": 2.516695976257324, "epoch": 0.35810614849187933, "grad_norm": 0.029716508463025093, "grad_norm_var": 3.965205208356992e-06, "learning_rate": 0.0073396696130918135, "loss": 2.4888, "step": 9878 }, { "crossentropy": 2.5573642253875732, "epoch": 0.35814240139211134, "grad_norm": 0.0314684621989727, "grad_norm_var": 4.1612232267130055e-06, "learning_rate": 0.007339156053200915, "loss": 2.6186, "step": 9879 }, { "crossentropy": 2.6078500747680664, "epoch": 0.3581786542923434, "grad_norm": 0.03134819492697716, "grad_norm_var": 4.442742831919863e-06, "learning_rate": 0.007338642461716165, "loss": 2.6206, "step": 9880 }, { "crossentropy": 2.628485918045044, "epoch": 0.3582149071925754, "grad_norm": 0.028395866975188255, "grad_norm_var": 4.448163630464628e-06, "learning_rate": 0.0073381288386445, "loss": 2.6231, "step": 9881 }, { "crossentropy": 2.4328582286834717, "epoch": 0.35825116009280744, "grad_norm": 0.02859198860824108, "grad_norm_var": 4.354798356574402e-06, "learning_rate": 0.007337615183992854, "loss": 2.593, "step": 9882 }, { "crossentropy": 2.675232410430908, "epoch": 0.35828741299303946, "grad_norm": 0.03288199007511139, "grad_norm_var": 5.220276100286131e-06, "learning_rate": 0.007337101497768168, "loss": 2.6483, "step": 9883 }, { "crossentropy": 2.797520875930786, "epoch": 0.35832366589327147, "grad_norm": 0.033363934606313705, "grad_norm_var": 6.058690728120509e-06, "learning_rate": 0.007336587779977382, "loss": 2.6983, "step": 9884 }, { "crossentropy": 2.5021157264709473, "epoch": 0.3583599187935035, "grad_norm": 0.028567645698785782, "grad_norm_var": 5.916685853145928e-06, "learning_rate": 0.007336074030627431, "loss": 2.5947, "step": 9885 }, { "crossentropy": 2.743234634399414, "epoch": 0.3583961716937355, "grad_norm": 0.02969622053205967, "grad_norm_var": 5.709826862236806e-06, "learning_rate": 0.007335560249725254, "loss": 2.6703, "step": 9886 }, { "crossentropy": 2.753897190093994, "epoch": 0.3584324245939675, "grad_norm": 0.03226763755083084, "grad_norm_var": 4.363437266120572e-06, "learning_rate": 0.0073350464372777925, "loss": 2.6877, "step": 9887 }, { "crossentropy": 2.703101396560669, "epoch": 0.35846867749419953, "grad_norm": 0.034471407532691956, "grad_norm_var": 4.710244449449413e-06, "learning_rate": 0.0073345325932919845, "loss": 2.6385, "step": 9888 }, { "crossentropy": 2.9290127754211426, "epoch": 0.35850493039443154, "grad_norm": 0.036641426384449005, "grad_norm_var": 6.622961101009015e-06, "learning_rate": 0.007334018717774773, "loss": 2.7747, "step": 9889 }, { "crossentropy": 2.6212799549102783, "epoch": 0.35854118329466356, "grad_norm": 0.030704490840435028, "grad_norm_var": 6.564689258767067e-06, "learning_rate": 0.007333504810733095, "loss": 2.5341, "step": 9890 }, { "crossentropy": 2.663396120071411, "epoch": 0.35857743619489557, "grad_norm": 0.02868509106338024, "grad_norm_var": 6.184788456606059e-06, "learning_rate": 0.007332990872173893, "loss": 2.6376, "step": 9891 }, { "crossentropy": 2.6162109375, "epoch": 0.3586136890951276, "grad_norm": 0.029359307140111923, "grad_norm_var": 5.989153054291096e-06, "learning_rate": 0.00733247690210411, "loss": 2.6389, "step": 9892 }, { "crossentropy": 2.4951846599578857, "epoch": 0.35864994199535966, "grad_norm": 0.03203822299838066, "grad_norm_var": 5.71409690498247e-06, "learning_rate": 0.007331962900530687, "loss": 2.5376, "step": 9893 }, { "crossentropy": 2.581376552581787, "epoch": 0.35868619489559167, "grad_norm": 0.029672294855117798, "grad_norm_var": 5.7225954465606535e-06, "learning_rate": 0.007331448867460565, "loss": 2.5658, "step": 9894 }, { "crossentropy": 2.5560507774353027, "epoch": 0.3587224477958237, "grad_norm": 0.027355734258890152, "grad_norm_var": 6.596695667987889e-06, "learning_rate": 0.0073309348029006885, "loss": 2.5795, "step": 9895 }, { "crossentropy": 2.6418774127960205, "epoch": 0.3587587006960557, "grad_norm": 0.02959999442100525, "grad_norm_var": 6.67801378283576e-06, "learning_rate": 0.007330420706858, "loss": 2.6819, "step": 9896 }, { "crossentropy": 2.5084211826324463, "epoch": 0.3587949535962877, "grad_norm": 0.02827698551118374, "grad_norm_var": 6.716502640470116e-06, "learning_rate": 0.0073299065793394425, "loss": 2.504, "step": 9897 }, { "crossentropy": 2.4147119522094727, "epoch": 0.35883120649651973, "grad_norm": 0.027688831090927124, "grad_norm_var": 7.028665757844336e-06, "learning_rate": 0.007329392420351962, "loss": 2.4549, "step": 9898 }, { "crossentropy": 2.602142810821533, "epoch": 0.35886745939675174, "grad_norm": 0.029207760468125343, "grad_norm_var": 6.805642942027806e-06, "learning_rate": 0.007328878229902503, "loss": 2.6211, "step": 9899 }, { "crossentropy": 2.722450017929077, "epoch": 0.35890371229698376, "grad_norm": 0.02734568901360035, "grad_norm_var": 6.751021003844924e-06, "learning_rate": 0.007328364007998007, "loss": 2.598, "step": 9900 }, { "crossentropy": 2.5883748531341553, "epoch": 0.35893996519721577, "grad_norm": 0.0278424471616745, "grad_norm_var": 6.931930222702391e-06, "learning_rate": 0.0073278497546454225, "loss": 2.567, "step": 9901 }, { "crossentropy": 2.605152130126953, "epoch": 0.3589762180974478, "grad_norm": 0.027043526992201805, "grad_norm_var": 7.498041857403982e-06, "learning_rate": 0.007327335469851695, "loss": 2.6253, "step": 9902 }, { "crossentropy": 2.7389230728149414, "epoch": 0.3590124709976798, "grad_norm": 0.02914164587855339, "grad_norm_var": 7.116764236644622e-06, "learning_rate": 0.00732682115362377, "loss": 2.6886, "step": 9903 }, { "crossentropy": 2.6716670989990234, "epoch": 0.3590487238979118, "grad_norm": 0.026738712564110756, "grad_norm_var": 5.926415556241118e-06, "learning_rate": 0.007326306805968593, "loss": 2.6532, "step": 9904 }, { "crossentropy": 2.7090065479278564, "epoch": 0.35908497679814383, "grad_norm": 0.026932595297694206, "grad_norm_var": 2.196247995550612e-06, "learning_rate": 0.007325792426893114, "loss": 2.6385, "step": 9905 }, { "crossentropy": 2.685685873031616, "epoch": 0.35912122969837584, "grad_norm": 0.026313573122024536, "grad_norm_var": 2.170391308027467e-06, "learning_rate": 0.007325278016404277, "loss": 2.7365, "step": 9906 }, { "crossentropy": 2.469280958175659, "epoch": 0.3591574825986079, "grad_norm": 0.02819122187793255, "grad_norm_var": 2.1620983131054405e-06, "learning_rate": 0.007324763574509034, "loss": 2.5867, "step": 9907 }, { "crossentropy": 2.629483222961426, "epoch": 0.35919373549883993, "grad_norm": 0.029396535828709602, "grad_norm_var": 2.1674591160586802e-06, "learning_rate": 0.007324249101214328, "loss": 2.5749, "step": 9908 }, { "crossentropy": 2.6613032817840576, "epoch": 0.35922998839907194, "grad_norm": 0.0298062264919281, "grad_norm_var": 1.3660640656702287e-06, "learning_rate": 0.007323734596527113, "loss": 2.6288, "step": 9909 }, { "crossentropy": 2.696943998336792, "epoch": 0.35926624129930396, "grad_norm": 0.027568165212869644, "grad_norm_var": 1.218389750327377e-06, "learning_rate": 0.007323220060454335, "loss": 2.6998, "step": 9910 }, { "crossentropy": 2.6633870601654053, "epoch": 0.35930249419953597, "grad_norm": 0.0279704499989748, "grad_norm_var": 1.1868982266770617e-06, "learning_rate": 0.007322705493002945, "loss": 2.5966, "step": 9911 }, { "crossentropy": 2.630432605743408, "epoch": 0.359338747099768, "grad_norm": 0.027857273817062378, "grad_norm_var": 1.020393685860936e-06, "learning_rate": 0.007322190894179892, "loss": 2.5901, "step": 9912 }, { "crossentropy": 2.7187390327453613, "epoch": 0.359375, "grad_norm": 0.028084808960556984, "grad_norm_var": 1.0145182021596324e-06, "learning_rate": 0.007321676263992128, "loss": 2.6463, "step": 9913 }, { "crossentropy": 2.5260090827941895, "epoch": 0.359411252900232, "grad_norm": 0.028118649497628212, "grad_norm_var": 1.0113499810555464e-06, "learning_rate": 0.007321161602446601, "loss": 2.5972, "step": 9914 }, { "crossentropy": 2.7004170417785645, "epoch": 0.35944750580046403, "grad_norm": 0.031228413805365562, "grad_norm_var": 1.5993564875073248e-06, "learning_rate": 0.007320646909550265, "loss": 2.6854, "step": 9915 }, { "crossentropy": 2.5878989696502686, "epoch": 0.35948375870069604, "grad_norm": 0.030703509226441383, "grad_norm_var": 1.9668906245601595e-06, "learning_rate": 0.007320132185310071, "loss": 2.5835, "step": 9916 }, { "crossentropy": 2.6652603149414062, "epoch": 0.35952001160092806, "grad_norm": 0.033444467931985855, "grad_norm_var": 3.580111764475933e-06, "learning_rate": 0.00731961742973297, "loss": 2.5442, "step": 9917 }, { "crossentropy": 2.6696200370788574, "epoch": 0.35955626450116007, "grad_norm": 0.039050765335559845, "grad_norm_var": 1.0005079232883287e-05, "learning_rate": 0.007319102642825917, "loss": 2.6382, "step": 9918 }, { "crossentropy": 2.485562801361084, "epoch": 0.3595925174013921, "grad_norm": 0.03808421641588211, "grad_norm_var": 1.468417480535015e-05, "learning_rate": 0.007318587824595862, "loss": 2.5622, "step": 9919 }, { "crossentropy": 2.656700372695923, "epoch": 0.35962877030162416, "grad_norm": 0.030396392568945885, "grad_norm_var": 1.3945397075828382e-05, "learning_rate": 0.007318072975049761, "loss": 2.6583, "step": 9920 }, { "crossentropy": 2.5862181186676025, "epoch": 0.35966502320185617, "grad_norm": 0.02891475521028042, "grad_norm_var": 1.3328292200145844e-05, "learning_rate": 0.007317558094194565, "loss": 2.5998, "step": 9921 }, { "crossentropy": 2.6739511489868164, "epoch": 0.3597012761020882, "grad_norm": 0.027246538549661636, "grad_norm_var": 1.2884239416309029e-05, "learning_rate": 0.00731704318203723, "loss": 2.5909, "step": 9922 }, { "crossentropy": 2.514195203781128, "epoch": 0.3597375290023202, "grad_norm": 0.028390077874064445, "grad_norm_var": 1.2828706525080238e-05, "learning_rate": 0.007316528238584713, "loss": 2.6155, "step": 9923 }, { "crossentropy": 2.479273796081543, "epoch": 0.3597737819025522, "grad_norm": 0.02819577418267727, "grad_norm_var": 1.3078088566833877e-05, "learning_rate": 0.0073160132638439655, "loss": 2.5732, "step": 9924 }, { "crossentropy": 2.6513636112213135, "epoch": 0.35981003480278423, "grad_norm": 0.02743547223508358, "grad_norm_var": 1.3590596604737501e-05, "learning_rate": 0.0073154982578219445, "loss": 2.6549, "step": 9925 }, { "crossentropy": 2.6242752075195312, "epoch": 0.35984628770301624, "grad_norm": 0.028046494349837303, "grad_norm_var": 1.3439079397584716e-05, "learning_rate": 0.007314983220525604, "loss": 2.5923, "step": 9926 }, { "crossentropy": 2.6813466548919678, "epoch": 0.35988254060324826, "grad_norm": 0.03231285139918327, "grad_norm_var": 1.3327883182066325e-05, "learning_rate": 0.007314468151961904, "loss": 2.7009, "step": 9927 }, { "crossentropy": 2.5018374919891357, "epoch": 0.35991879350348027, "grad_norm": 0.03295006603002548, "grad_norm_var": 1.317517840955327e-05, "learning_rate": 0.007313953052137798, "loss": 2.5364, "step": 9928 }, { "crossentropy": 2.439443588256836, "epoch": 0.3599550464037123, "grad_norm": 0.03061143308877945, "grad_norm_var": 1.266360799556305e-05, "learning_rate": 0.0073134379210602445, "loss": 2.5425, "step": 9929 }, { "crossentropy": 2.6530935764312744, "epoch": 0.3599912993039443, "grad_norm": 0.027974333614110947, "grad_norm_var": 1.2719306536263369e-05, "learning_rate": 0.0073129227587362, "loss": 2.5857, "step": 9930 }, { "crossentropy": 2.719190835952759, "epoch": 0.3600275522041763, "grad_norm": 0.028195664286613464, "grad_norm_var": 1.317615391696994e-05, "learning_rate": 0.0073124075651726265, "loss": 2.6443, "step": 9931 }, { "crossentropy": 2.550001621246338, "epoch": 0.36006380510440833, "grad_norm": 0.02765502780675888, "grad_norm_var": 1.377467943002719e-05, "learning_rate": 0.007311892340376478, "loss": 2.5833, "step": 9932 }, { "crossentropy": 2.605011224746704, "epoch": 0.36010005800464034, "grad_norm": 0.028500424697995186, "grad_norm_var": 1.3398650672553898e-05, "learning_rate": 0.007311377084354716, "loss": 2.5965, "step": 9933 }, { "crossentropy": 2.574404716491699, "epoch": 0.3601363109048724, "grad_norm": 0.027186384424567223, "grad_norm_var": 8.270360918535835e-06, "learning_rate": 0.0073108617971142965, "loss": 2.6232, "step": 9934 }, { "crossentropy": 2.616802453994751, "epoch": 0.36017256380510443, "grad_norm": 0.027366526424884796, "grad_norm_var": 3.191168947240993e-06, "learning_rate": 0.007310346478662184, "loss": 2.6959, "step": 9935 }, { "crossentropy": 2.643728256225586, "epoch": 0.36020881670533644, "grad_norm": 0.02713598497211933, "grad_norm_var": 3.177284844281941e-06, "learning_rate": 0.007309831129005335, "loss": 2.6515, "step": 9936 }, { "crossentropy": 2.656715154647827, "epoch": 0.36024506960556846, "grad_norm": 0.027972998097538948, "grad_norm_var": 3.1972571906446296e-06, "learning_rate": 0.007309315748150712, "loss": 2.6591, "step": 9937 }, { "crossentropy": 2.8040928840637207, "epoch": 0.36028132250580047, "grad_norm": 0.026839952915906906, "grad_norm_var": 3.279525820928941e-06, "learning_rate": 0.0073088003361052745, "loss": 2.6754, "step": 9938 }, { "crossentropy": 2.6928675174713135, "epoch": 0.3603175754060325, "grad_norm": 0.027643758803606033, "grad_norm_var": 3.3300616565297513e-06, "learning_rate": 0.007308284892875986, "loss": 2.6492, "step": 9939 }, { "crossentropy": 2.6347203254699707, "epoch": 0.3603538283062645, "grad_norm": 0.032194558531045914, "grad_norm_var": 4.166478121689296e-06, "learning_rate": 0.007307769418469806, "loss": 2.7428, "step": 9940 }, { "crossentropy": 2.6608026027679443, "epoch": 0.3603900812064965, "grad_norm": 0.03126697614789009, "grad_norm_var": 4.411755147083274e-06, "learning_rate": 0.007307253912893699, "loss": 2.6832, "step": 9941 }, { "crossentropy": 2.4426591396331787, "epoch": 0.36042633410672853, "grad_norm": 0.02754928171634674, "grad_norm_var": 4.489811815681401e-06, "learning_rate": 0.007306738376154625, "loss": 2.5369, "step": 9942 }, { "crossentropy": 2.5343964099884033, "epoch": 0.36046258700696054, "grad_norm": 0.027910834178328514, "grad_norm_var": 3.7328750399966224e-06, "learning_rate": 0.007306222808259548, "loss": 2.5955, "step": 9943 }, { "crossentropy": 2.7133731842041016, "epoch": 0.36049883990719256, "grad_norm": 0.0302075557410717, "grad_norm_var": 2.6432294548617835e-06, "learning_rate": 0.007305707209215436, "loss": 2.7213, "step": 9944 }, { "crossentropy": 2.5254712104797363, "epoch": 0.3605350928074246, "grad_norm": 0.03403757885098457, "grad_norm_var": 4.33538363725631e-06, "learning_rate": 0.0073051915790292455, "loss": 2.587, "step": 9945 }, { "crossentropy": 2.6611621379852295, "epoch": 0.3605713457076566, "grad_norm": 0.03941541165113449, "grad_norm_var": 1.1367793074121728e-05, "learning_rate": 0.007304675917707944, "loss": 2.6215, "step": 9946 }, { "crossentropy": 2.6101455688476562, "epoch": 0.36060759860788866, "grad_norm": 0.029192037880420685, "grad_norm_var": 1.126420767813909e-05, "learning_rate": 0.007304160225258498, "loss": 2.6618, "step": 9947 }, { "crossentropy": 2.859724283218384, "epoch": 0.36064385150812067, "grad_norm": 0.027098804712295532, "grad_norm_var": 1.1420722002124688e-05, "learning_rate": 0.007303644501687874, "loss": 2.7972, "step": 9948 }, { "crossentropy": 2.602010488510132, "epoch": 0.3606801044083527, "grad_norm": 0.02849794551730156, "grad_norm_var": 1.1421042867367816e-05, "learning_rate": 0.007303128747003033, "loss": 2.6345, "step": 9949 }, { "crossentropy": 2.5767602920532227, "epoch": 0.3607163573085847, "grad_norm": 0.030497482046484947, "grad_norm_var": 1.1098177528331456e-05, "learning_rate": 0.0073026129612109416, "loss": 2.589, "step": 9950 }, { "crossentropy": 2.655639171600342, "epoch": 0.3607526102088167, "grad_norm": 0.02782602235674858, "grad_norm_var": 1.0969836309671078e-05, "learning_rate": 0.007302097144318569, "loss": 2.5576, "step": 9951 }, { "crossentropy": 2.4629461765289307, "epoch": 0.36078886310904873, "grad_norm": 0.026882817968726158, "grad_norm_var": 1.1060575956750064e-05, "learning_rate": 0.007301581296332882, "loss": 2.5617, "step": 9952 }, { "crossentropy": 2.4528110027313232, "epoch": 0.36082511600928074, "grad_norm": 0.02753780595958233, "grad_norm_var": 1.117202136869851e-05, "learning_rate": 0.007301065417260846, "loss": 2.4971, "step": 9953 }, { "crossentropy": 2.5100507736206055, "epoch": 0.36086136890951276, "grad_norm": 0.02776244282722473, "grad_norm_var": 1.0878047634289787e-05, "learning_rate": 0.007300549507109428, "loss": 2.5812, "step": 9954 }, { "crossentropy": 2.506385564804077, "epoch": 0.3608976218097448, "grad_norm": 0.027584897354245186, "grad_norm_var": 1.0894559563286614e-05, "learning_rate": 0.0073000335658856, "loss": 2.5755, "step": 9955 }, { "crossentropy": 2.5203588008880615, "epoch": 0.3609338747099768, "grad_norm": 0.028548095375299454, "grad_norm_var": 1.052073603013726e-05, "learning_rate": 0.007299517593596328, "loss": 2.5538, "step": 9956 }, { "crossentropy": 2.5929250717163086, "epoch": 0.3609701276102088, "grad_norm": 0.027420392259955406, "grad_norm_var": 1.0533357668053983e-05, "learning_rate": 0.00729900159024858, "loss": 2.5443, "step": 9957 }, { "crossentropy": 2.5453293323516846, "epoch": 0.3610063805104408, "grad_norm": 0.026707584038376808, "grad_norm_var": 1.0768286933692653e-05, "learning_rate": 0.007298485555849328, "loss": 2.6224, "step": 9958 }, { "crossentropy": 2.742938756942749, "epoch": 0.36104263341067283, "grad_norm": 0.027580352500081062, "grad_norm_var": 1.083172006745811e-05, "learning_rate": 0.00729796949040554, "loss": 2.8005, "step": 9959 }, { "crossentropy": 2.4883837699890137, "epoch": 0.36107888631090485, "grad_norm": 0.030948705971240997, "grad_norm_var": 1.0968105862959097e-05, "learning_rate": 0.007297453393924187, "loss": 2.53, "step": 9960 }, { "crossentropy": 2.8248252868652344, "epoch": 0.3611151392111369, "grad_norm": 0.03129802271723747, "grad_norm_var": 9.677861371969636e-06, "learning_rate": 0.007296937266412239, "loss": 2.7375, "step": 9961 }, { "crossentropy": 2.5474369525909424, "epoch": 0.36115139211136893, "grad_norm": 0.02935907244682312, "grad_norm_var": 2.099972226057252e-06, "learning_rate": 0.007296421107876666, "loss": 2.6327, "step": 9962 }, { "crossentropy": 2.64791202545166, "epoch": 0.36118764501160094, "grad_norm": 0.028674202039837837, "grad_norm_var": 2.063523680034818e-06, "learning_rate": 0.007295904918324443, "loss": 2.7003, "step": 9963 }, { "crossentropy": 2.514784574508667, "epoch": 0.36122389791183296, "grad_norm": 0.030726514756679535, "grad_norm_var": 2.2619610712239225e-06, "learning_rate": 0.0072953886977625395, "loss": 2.6098, "step": 9964 }, { "crossentropy": 2.3741204738616943, "epoch": 0.361260150812065, "grad_norm": 0.028599737212061882, "grad_norm_var": 2.2610094968875666e-06, "learning_rate": 0.007294872446197928, "loss": 2.5217, "step": 9965 }, { "crossentropy": 2.5365631580352783, "epoch": 0.361296403712297, "grad_norm": 0.02924995869398117, "grad_norm_var": 2.0463404629591486e-06, "learning_rate": 0.007294356163637582, "loss": 2.5633, "step": 9966 }, { "crossentropy": 2.6945717334747314, "epoch": 0.361332656612529, "grad_norm": 0.027932893484830856, "grad_norm_var": 2.0368211547378347e-06, "learning_rate": 0.007293839850088474, "loss": 2.7349, "step": 9967 }, { "crossentropy": 2.5959932804107666, "epoch": 0.361368909512761, "grad_norm": 0.03092406690120697, "grad_norm_var": 2.15876445671125e-06, "learning_rate": 0.0072933235055575776, "loss": 2.5413, "step": 9968 }, { "crossentropy": 2.7352499961853027, "epoch": 0.36140516241299303, "grad_norm": 0.02883835881948471, "grad_norm_var": 2.045012654960469e-06, "learning_rate": 0.007292807130051869, "loss": 2.6924, "step": 9969 }, { "crossentropy": 2.6964797973632812, "epoch": 0.36144141531322505, "grad_norm": 0.030413363128900528, "grad_norm_var": 2.0875530797917834e-06, "learning_rate": 0.007292290723578316, "loss": 2.6561, "step": 9970 }, { "crossentropy": 2.699993848800659, "epoch": 0.36147766821345706, "grad_norm": 0.029691288247704506, "grad_norm_var": 1.953271918351575e-06, "learning_rate": 0.007291774286143902, "loss": 2.5875, "step": 9971 }, { "crossentropy": 2.455965042114258, "epoch": 0.3615139211136891, "grad_norm": 0.029659070074558258, "grad_norm_var": 1.9365075661780435e-06, "learning_rate": 0.0072912578177555965, "loss": 2.5936, "step": 9972 }, { "crossentropy": 2.6896629333496094, "epoch": 0.3615501740139211, "grad_norm": 0.03012043982744217, "grad_norm_var": 1.7329475965113197e-06, "learning_rate": 0.007290741318420379, "loss": 2.6579, "step": 9973 }, { "crossentropy": 2.743091583251953, "epoch": 0.36158642691415316, "grad_norm": 0.03046022169291973, "grad_norm_var": 1.255815234724973e-06, "learning_rate": 0.007290224788145221, "loss": 2.6868, "step": 9974 }, { "crossentropy": 2.627244234085083, "epoch": 0.3616226798143852, "grad_norm": 0.033618513494729996, "grad_norm_var": 1.8644406763956381e-06, "learning_rate": 0.007289708226937104, "loss": 2.6157, "step": 9975 }, { "crossentropy": 2.542510986328125, "epoch": 0.3616589327146172, "grad_norm": 0.03406541794538498, "grad_norm_var": 2.8524437183030197e-06, "learning_rate": 0.007289191634803002, "loss": 2.6361, "step": 9976 }, { "crossentropy": 2.6583337783813477, "epoch": 0.3616951856148492, "grad_norm": 0.02790696918964386, "grad_norm_var": 3.0868694942476696e-06, "learning_rate": 0.007288675011749893, "loss": 2.6169, "step": 9977 }, { "crossentropy": 2.506185293197632, "epoch": 0.3617314385150812, "grad_norm": 0.026894334703683853, "grad_norm_var": 3.6821131460029743e-06, "learning_rate": 0.007288158357784754, "loss": 2.5808, "step": 9978 }, { "crossentropy": 2.7257020473480225, "epoch": 0.36176769141531323, "grad_norm": 0.03058118000626564, "grad_norm_var": 3.6076491076080743e-06, "learning_rate": 0.007287641672914564, "loss": 2.6682, "step": 9979 }, { "crossentropy": 2.5765268802642822, "epoch": 0.36180394431554525, "grad_norm": 0.03156064823269844, "grad_norm_var": 3.7341448166779926e-06, "learning_rate": 0.007287124957146303, "loss": 2.6329, "step": 9980 }, { "crossentropy": 2.587153434753418, "epoch": 0.36184019721577726, "grad_norm": 0.03015010431408882, "grad_norm_var": 3.58824347670237e-06, "learning_rate": 0.007286608210486949, "loss": 2.5921, "step": 9981 }, { "crossentropy": 2.5164976119995117, "epoch": 0.3618764501160093, "grad_norm": 0.033651117235422134, "grad_norm_var": 4.282936989662145e-06, "learning_rate": 0.007286091432943478, "loss": 2.647, "step": 9982 }, { "crossentropy": 2.6210460662841797, "epoch": 0.3619127030162413, "grad_norm": 0.031870245933532715, "grad_norm_var": 3.954445375264846e-06, "learning_rate": 0.007285574624522875, "loss": 2.6005, "step": 9983 }, { "crossentropy": 2.7280421257019043, "epoch": 0.3619489559164733, "grad_norm": 0.02984200231730938, "grad_norm_var": 3.988131431774386e-06, "learning_rate": 0.007285057785232119, "loss": 2.7343, "step": 9984 }, { "crossentropy": 2.575049638748169, "epoch": 0.3619852088167053, "grad_norm": 0.02900197170674801, "grad_norm_var": 3.951751509925772e-06, "learning_rate": 0.0072845409150781885, "loss": 2.5929, "step": 9985 }, { "crossentropy": 2.696498394012451, "epoch": 0.36202146171693733, "grad_norm": 0.02785821445286274, "grad_norm_var": 4.420976735706771e-06, "learning_rate": 0.007284024014068066, "loss": 2.6317, "step": 9986 }, { "crossentropy": 2.591695785522461, "epoch": 0.36205771461716935, "grad_norm": 0.031659577041864395, "grad_norm_var": 4.468396725461575e-06, "learning_rate": 0.007283507082208734, "loss": 2.6455, "step": 9987 }, { "crossentropy": 2.670811891555786, "epoch": 0.3620939675174014, "grad_norm": 0.0314500518143177, "grad_norm_var": 4.454627897718053e-06, "learning_rate": 0.007282990119507172, "loss": 2.6882, "step": 9988 }, { "crossentropy": 2.613295555114746, "epoch": 0.36213022041763343, "grad_norm": 0.02988741733133793, "grad_norm_var": 4.475039972009681e-06, "learning_rate": 0.0072824731259703655, "loss": 2.6381, "step": 9989 }, { "crossentropy": 2.665935754776001, "epoch": 0.36216647331786544, "grad_norm": 0.029261978343129158, "grad_norm_var": 4.595675771859236e-06, "learning_rate": 0.0072819561016052945, "loss": 2.6167, "step": 9990 }, { "crossentropy": 2.4348692893981934, "epoch": 0.36220272621809746, "grad_norm": 0.030283235013484955, "grad_norm_var": 3.93912944795977e-06, "learning_rate": 0.007281439046418944, "loss": 2.506, "step": 9991 }, { "crossentropy": 2.6673219203948975, "epoch": 0.3622389791183295, "grad_norm": 0.03275686874985695, "grad_norm_var": 3.4014454649484617e-06, "learning_rate": 0.007280921960418296, "loss": 2.6389, "step": 9992 }, { "crossentropy": 2.6314945220947266, "epoch": 0.3622752320185615, "grad_norm": 0.030308466404676437, "grad_norm_var": 2.9993311617352298e-06, "learning_rate": 0.007280404843610338, "loss": 2.6458, "step": 9993 }, { "crossentropy": 2.660961866378784, "epoch": 0.3623114849187935, "grad_norm": 0.028367549180984497, "grad_norm_var": 2.438785946663732e-06, "learning_rate": 0.00727988769600205, "loss": 2.6456, "step": 9994 }, { "crossentropy": 2.726480722427368, "epoch": 0.3623477378190255, "grad_norm": 0.028522765263915062, "grad_norm_var": 2.689738584242945e-06, "learning_rate": 0.0072793705176004185, "loss": 2.6196, "step": 9995 }, { "crossentropy": 2.661590576171875, "epoch": 0.36238399071925753, "grad_norm": 0.028306253254413605, "grad_norm_var": 2.84892742173475e-06, "learning_rate": 0.007278853308412429, "loss": 2.5745, "step": 9996 }, { "crossentropy": 2.4399847984313965, "epoch": 0.36242024361948955, "grad_norm": 0.028002427890896797, "grad_norm_var": 3.151100528091819e-06, "learning_rate": 0.007278336068445068, "loss": 2.5369, "step": 9997 }, { "crossentropy": 2.66454815864563, "epoch": 0.36245649651972156, "grad_norm": 0.027761410921812057, "grad_norm_var": 2.502499737927011e-06, "learning_rate": 0.007277818797705322, "loss": 2.6562, "step": 9998 }, { "crossentropy": 2.5692877769470215, "epoch": 0.3624927494199536, "grad_norm": 0.027420101687312126, "grad_norm_var": 2.4503063150063098e-06, "learning_rate": 0.007277301496200175, "loss": 2.5537, "step": 9999 }, { "crossentropy": 2.6403610706329346, "epoch": 0.3625290023201856, "grad_norm": 0.0278491098433733, "grad_norm_var": 2.5859051743839143e-06, "learning_rate": 0.007276784163936616, "loss": 2.5573, "step": 10000 }, { "crossentropy": 2.63270902633667, "epoch": 0.36256525522041766, "grad_norm": 0.028965668752789497, "grad_norm_var": 2.587399078376034e-06, "learning_rate": 0.007276266800921631, "loss": 2.6758, "step": 10001 }, { "crossentropy": 2.6048126220703125, "epoch": 0.3626015081206497, "grad_norm": 0.030144935473799706, "grad_norm_var": 2.4772695069730233e-06, "learning_rate": 0.007275749407162209, "loss": 2.6592, "step": 10002 }, { "crossentropy": 2.59080171585083, "epoch": 0.3626377610208817, "grad_norm": 0.028514515608549118, "grad_norm_var": 2.162305889505158e-06, "learning_rate": 0.007275231982665338, "loss": 2.6189, "step": 10003 }, { "crossentropy": 2.5967602729797363, "epoch": 0.3626740139211137, "grad_norm": 0.02800672873854637, "grad_norm_var": 1.8876104747535724e-06, "learning_rate": 0.007274714527438005, "loss": 2.5595, "step": 10004 }, { "crossentropy": 2.3971681594848633, "epoch": 0.3627102668213457, "grad_norm": 0.028011398389935493, "grad_norm_var": 1.8912202043741093e-06, "learning_rate": 0.007274197041487201, "loss": 2.5039, "step": 10005 }, { "crossentropy": 2.6031811237335205, "epoch": 0.36274651972157773, "grad_norm": 0.02792278304696083, "grad_norm_var": 1.93960672261495e-06, "learning_rate": 0.007273679524819916, "loss": 2.5271, "step": 10006 }, { "crossentropy": 2.699350595474243, "epoch": 0.36278277262180975, "grad_norm": 0.02831190824508667, "grad_norm_var": 1.79828573618015e-06, "learning_rate": 0.007273161977443137, "loss": 2.6682, "step": 10007 }, { "crossentropy": 2.6163651943206787, "epoch": 0.36281902552204176, "grad_norm": 0.02714463509619236, "grad_norm_var": 7.298447749425989e-07, "learning_rate": 0.007272644399363857, "loss": 2.6097, "step": 10008 }, { "crossentropy": 2.6413745880126953, "epoch": 0.3628552784222738, "grad_norm": 0.026872195303440094, "grad_norm_var": 5.694060957368736e-07, "learning_rate": 0.007272126790589064, "loss": 2.668, "step": 10009 }, { "crossentropy": 2.430593729019165, "epoch": 0.3628915313225058, "grad_norm": 0.027864055708050728, "grad_norm_var": 5.694891757232103e-07, "learning_rate": 0.007271609151125751, "loss": 2.5291, "step": 10010 }, { "crossentropy": 2.5600745677948, "epoch": 0.3629277842227378, "grad_norm": 0.03344740346074104, "grad_norm_var": 2.3619810438024167e-06, "learning_rate": 0.00727109148098091, "loss": 2.6985, "step": 10011 }, { "crossentropy": 2.71087908744812, "epoch": 0.3629640371229698, "grad_norm": 0.028788084164261818, "grad_norm_var": 2.3698840844663664e-06, "learning_rate": 0.007270573780161532, "loss": 2.707, "step": 10012 }, { "crossentropy": 2.655484199523926, "epoch": 0.36300029002320183, "grad_norm": 0.027598151937127113, "grad_norm_var": 2.403643099913019e-06, "learning_rate": 0.007270056048674609, "loss": 2.6508, "step": 10013 }, { "crossentropy": 2.677114248275757, "epoch": 0.36303654292343385, "grad_norm": 0.029071221128106117, "grad_norm_var": 2.39690920102355e-06, "learning_rate": 0.0072695382865271335, "loss": 2.5646, "step": 10014 }, { "crossentropy": 2.579350471496582, "epoch": 0.3630727958236659, "grad_norm": 0.027220116928219795, "grad_norm_var": 2.4280920833660845e-06, "learning_rate": 0.007269020493726101, "loss": 2.5691, "step": 10015 }, { "crossentropy": 2.543041944503784, "epoch": 0.36310904872389793, "grad_norm": 0.02874627336859703, "grad_norm_var": 2.402534676546617e-06, "learning_rate": 0.007268502670278502, "loss": 2.572, "step": 10016 }, { "crossentropy": 2.6774892807006836, "epoch": 0.36314530162412995, "grad_norm": 0.03016052022576332, "grad_norm_var": 2.5596776663883505e-06, "learning_rate": 0.007267984816191332, "loss": 2.5827, "step": 10017 }, { "crossentropy": 2.505284070968628, "epoch": 0.36318155452436196, "grad_norm": 0.027285823598504066, "grad_norm_var": 2.4869918544084824e-06, "learning_rate": 0.007267466931471585, "loss": 2.6086, "step": 10018 }, { "crossentropy": 2.742523431777954, "epoch": 0.363217807424594, "grad_norm": 0.027390915900468826, "grad_norm_var": 2.554038575417611e-06, "learning_rate": 0.007266949016126257, "loss": 2.6608, "step": 10019 }, { "crossentropy": 2.6081156730651855, "epoch": 0.363254060324826, "grad_norm": 0.02890143170952797, "grad_norm_var": 2.5613133818565744e-06, "learning_rate": 0.007266431070162342, "loss": 2.6139, "step": 10020 }, { "crossentropy": 2.6451776027679443, "epoch": 0.363290313225058, "grad_norm": 0.03091820888221264, "grad_norm_var": 2.9306366076494704e-06, "learning_rate": 0.007265913093586836, "loss": 2.6237, "step": 10021 }, { "crossentropy": 2.6652956008911133, "epoch": 0.36332656612529, "grad_norm": 0.030609456822276115, "grad_norm_var": 3.1382012626920136e-06, "learning_rate": 0.007265395086406734, "loss": 2.61, "step": 10022 }, { "crossentropy": 2.648923635482788, "epoch": 0.36336281902552203, "grad_norm": 0.03265567496418953, "grad_norm_var": 4.051781549159111e-06, "learning_rate": 0.007264877048629035, "loss": 2.6906, "step": 10023 }, { "crossentropy": 2.5244433879852295, "epoch": 0.36339907192575405, "grad_norm": 0.031221147626638412, "grad_norm_var": 4.059045769466638e-06, "learning_rate": 0.007264358980260733, "loss": 2.517, "step": 10024 }, { "crossentropy": 2.5534305572509766, "epoch": 0.36343532482598606, "grad_norm": 0.03023643232882023, "grad_norm_var": 3.678781035044741e-06, "learning_rate": 0.007263840881308829, "loss": 2.5891, "step": 10025 }, { "crossentropy": 2.7797205448150635, "epoch": 0.3634715777262181, "grad_norm": 0.028549738228321075, "grad_norm_var": 3.5579442986030085e-06, "learning_rate": 0.007263322751780314, "loss": 2.6465, "step": 10026 }, { "crossentropy": 2.6315085887908936, "epoch": 0.3635078306264501, "grad_norm": 0.031078098341822624, "grad_norm_var": 2.6775881959333838e-06, "learning_rate": 0.007262804591682195, "loss": 2.5969, "step": 10027 }, { "crossentropy": 2.6226179599761963, "epoch": 0.36354408352668216, "grad_norm": 0.02779672108590603, "grad_norm_var": 2.820155893988929e-06, "learning_rate": 0.007262286401021465, "loss": 2.6332, "step": 10028 }, { "crossentropy": 2.5691375732421875, "epoch": 0.3635803364269142, "grad_norm": 0.03089824505150318, "grad_norm_var": 2.734386371515624e-06, "learning_rate": 0.007261768179805124, "loss": 2.4948, "step": 10029 }, { "crossentropy": 2.530985116958618, "epoch": 0.3636165893271462, "grad_norm": 0.03229672461748123, "grad_norm_var": 3.1803334105110863e-06, "learning_rate": 0.007261249928040169, "loss": 2.6071, "step": 10030 }, { "crossentropy": 2.820960283279419, "epoch": 0.3636528422273782, "grad_norm": 0.030666999518871307, "grad_norm_var": 2.761191355794473e-06, "learning_rate": 0.007260731645733604, "loss": 2.7538, "step": 10031 }, { "crossentropy": 2.5900604724884033, "epoch": 0.3636890951276102, "grad_norm": 0.028626909479498863, "grad_norm_var": 2.781450657995029e-06, "learning_rate": 0.007260213332892428, "loss": 2.657, "step": 10032 }, { "crossentropy": 2.5394153594970703, "epoch": 0.36372534802784223, "grad_norm": 0.027839163318276405, "grad_norm_var": 3.0548852530742072e-06, "learning_rate": 0.007259694989523641, "loss": 2.5293, "step": 10033 }, { "crossentropy": 2.591081142425537, "epoch": 0.36376160092807425, "grad_norm": 0.02783268876373768, "grad_norm_var": 2.889472097565325e-06, "learning_rate": 0.007259176615634242, "loss": 2.5464, "step": 10034 }, { "crossentropy": 2.592815637588501, "epoch": 0.36379785382830626, "grad_norm": 0.030307335779070854, "grad_norm_var": 2.4668159461590604e-06, "learning_rate": 0.007258658211231235, "loss": 2.6409, "step": 10035 }, { "crossentropy": 2.825221061706543, "epoch": 0.3638341067285383, "grad_norm": 0.02908829227089882, "grad_norm_var": 2.4409503725316787e-06, "learning_rate": 0.007258139776321623, "loss": 2.6794, "step": 10036 }, { "crossentropy": 2.6076204776763916, "epoch": 0.3638703596287703, "grad_norm": 0.028643716126680374, "grad_norm_var": 2.4976078094836637e-06, "learning_rate": 0.007257621310912406, "loss": 2.6285, "step": 10037 }, { "crossentropy": 2.6135613918304443, "epoch": 0.3639066125290023, "grad_norm": 0.029099619016051292, "grad_norm_var": 2.496598970012327e-06, "learning_rate": 0.007257102815010585, "loss": 2.6177, "step": 10038 }, { "crossentropy": 2.827166795730591, "epoch": 0.3639428654292343, "grad_norm": 0.029474303126335144, "grad_norm_var": 1.9188352234568757e-06, "learning_rate": 0.007256584288623166, "loss": 2.7126, "step": 10039 }, { "crossentropy": 2.3768811225891113, "epoch": 0.36397911832946633, "grad_norm": 0.03060683235526085, "grad_norm_var": 1.8099229474465335e-06, "learning_rate": 0.007256065731757153, "loss": 2.4539, "step": 10040 }, { "crossentropy": 2.614147186279297, "epoch": 0.3640153712296984, "grad_norm": 0.029436683282256126, "grad_norm_var": 1.7783129996502225e-06, "learning_rate": 0.007255547144419549, "loss": 2.634, "step": 10041 }, { "crossentropy": 2.5260443687438965, "epoch": 0.3640516241299304, "grad_norm": 0.03106328845024109, "grad_norm_var": 1.8496426944871463e-06, "learning_rate": 0.007255028526617356, "loss": 2.6058, "step": 10042 }, { "crossentropy": 2.504749298095703, "epoch": 0.36408787703016243, "grad_norm": 0.029495906084775925, "grad_norm_var": 1.709519648249574e-06, "learning_rate": 0.00725450987835758, "loss": 2.5574, "step": 10043 }, { "crossentropy": 2.695385456085205, "epoch": 0.36412412993039445, "grad_norm": 0.027556682005524635, "grad_norm_var": 1.769981860116667e-06, "learning_rate": 0.00725399119964723, "loss": 2.6756, "step": 10044 }, { "crossentropy": 2.5577759742736816, "epoch": 0.36416038283062646, "grad_norm": 0.02673408016562462, "grad_norm_var": 2.109802181580776e-06, "learning_rate": 0.007253472490493307, "loss": 2.5827, "step": 10045 }, { "crossentropy": 2.637117624282837, "epoch": 0.3641966357308585, "grad_norm": 0.02852300927042961, "grad_norm_var": 1.4910542401099508e-06, "learning_rate": 0.007252953750902815, "loss": 2.6412, "step": 10046 }, { "crossentropy": 2.727734088897705, "epoch": 0.3642328886310905, "grad_norm": 0.030027413740754128, "grad_norm_var": 1.3797685002219876e-06, "learning_rate": 0.007252434980882767, "loss": 2.6745, "step": 10047 }, { "crossentropy": 2.5877745151519775, "epoch": 0.3642691415313225, "grad_norm": 0.029415931552648544, "grad_norm_var": 1.3770877581799663e-06, "learning_rate": 0.007251916180440166, "loss": 2.5905, "step": 10048 }, { "crossentropy": 2.6429531574249268, "epoch": 0.3643053944315545, "grad_norm": 0.029231224209070206, "grad_norm_var": 1.2694596941617102e-06, "learning_rate": 0.007251397349582021, "loss": 2.6295, "step": 10049 }, { "crossentropy": 2.533550500869751, "epoch": 0.36434164733178653, "grad_norm": 0.028395481407642365, "grad_norm_var": 1.1897633728538549e-06, "learning_rate": 0.007250878488315335, "loss": 2.5372, "step": 10050 }, { "crossentropy": 2.478968858718872, "epoch": 0.36437790023201855, "grad_norm": 0.02896110713481903, "grad_norm_var": 1.1031463623430622e-06, "learning_rate": 0.00725035959664712, "loss": 2.5685, "step": 10051 }, { "crossentropy": 2.6279172897338867, "epoch": 0.36441415313225056, "grad_norm": 0.030869871377944946, "grad_norm_var": 1.2964617912849508e-06, "learning_rate": 0.0072498406745843845, "loss": 2.6786, "step": 10052 }, { "crossentropy": 2.584257125854492, "epoch": 0.3644504060324826, "grad_norm": 0.027868609875440598, "grad_norm_var": 1.3936664979544503e-06, "learning_rate": 0.007249321722134137, "loss": 2.5675, "step": 10053 }, { "crossentropy": 2.650928020477295, "epoch": 0.3644866589327146, "grad_norm": 0.029768560081720352, "grad_norm_var": 1.4151334812026779e-06, "learning_rate": 0.007248802739303385, "loss": 2.631, "step": 10054 }, { "crossentropy": 2.626866340637207, "epoch": 0.36452291183294666, "grad_norm": 0.03233712911605835, "grad_norm_var": 2.026610706790559e-06, "learning_rate": 0.0072482837260991395, "loss": 2.5835, "step": 10055 }, { "crossentropy": 2.545945167541504, "epoch": 0.3645591647331787, "grad_norm": 0.027203597128391266, "grad_norm_var": 2.1998001174308115e-06, "learning_rate": 0.007247764682528411, "loss": 2.6246, "step": 10056 }, { "crossentropy": 2.5676817893981934, "epoch": 0.3645954176334107, "grad_norm": 0.027534835040569305, "grad_norm_var": 2.3609105028507547e-06, "learning_rate": 0.007247245608598209, "loss": 2.5592, "step": 10057 }, { "crossentropy": 2.556736469268799, "epoch": 0.3646316705336427, "grad_norm": 0.030571071431040764, "grad_norm_var": 2.2446887907640827e-06, "learning_rate": 0.0072467265043155435, "loss": 2.5965, "step": 10058 }, { "crossentropy": 2.570262908935547, "epoch": 0.3646679234338747, "grad_norm": 0.027767153456807137, "grad_norm_var": 2.324292907582235e-06, "learning_rate": 0.007246207369687428, "loss": 2.5488, "step": 10059 }, { "crossentropy": 2.6027920246124268, "epoch": 0.36470417633410673, "grad_norm": 0.028512662276625633, "grad_norm_var": 2.207273012675796e-06, "learning_rate": 0.007245688204720874, "loss": 2.6817, "step": 10060 }, { "crossentropy": 2.53337025642395, "epoch": 0.36474042923433875, "grad_norm": 0.027437381446361542, "grad_norm_var": 2.0273351674976804e-06, "learning_rate": 0.007245169009422894, "loss": 2.5189, "step": 10061 }, { "crossentropy": 2.6734888553619385, "epoch": 0.36477668213457076, "grad_norm": 0.02747362107038498, "grad_norm_var": 2.166617854858453e-06, "learning_rate": 0.0072446497838004964, "loss": 2.5775, "step": 10062 }, { "crossentropy": 2.6708834171295166, "epoch": 0.3648129350348028, "grad_norm": 0.02870880998671055, "grad_norm_var": 2.087793465740719e-06, "learning_rate": 0.007244130527860698, "loss": 2.6792, "step": 10063 }, { "crossentropy": 2.559276580810547, "epoch": 0.3648491879350348, "grad_norm": 0.030629271641373634, "grad_norm_var": 2.266739985523032e-06, "learning_rate": 0.007243611241610512, "loss": 2.5356, "step": 10064 }, { "crossentropy": 2.5460052490234375, "epoch": 0.3648854408352668, "grad_norm": 0.028802096843719482, "grad_norm_var": 2.262410285573654e-06, "learning_rate": 0.007243091925056951, "loss": 2.5562, "step": 10065 }, { "crossentropy": 2.688000440597534, "epoch": 0.3649216937354988, "grad_norm": 0.0307428278028965, "grad_norm_var": 2.440251922200528e-06, "learning_rate": 0.007242572578207029, "loss": 2.6189, "step": 10066 }, { "crossentropy": 2.71787691116333, "epoch": 0.36495794663573083, "grad_norm": 0.029476221650838852, "grad_norm_var": 2.449062386436597e-06, "learning_rate": 0.007242053201067762, "loss": 2.5824, "step": 10067 }, { "crossentropy": 2.6963772773742676, "epoch": 0.3649941995359629, "grad_norm": 0.029671672731637955, "grad_norm_var": 2.2570736995905822e-06, "learning_rate": 0.007241533793646163, "loss": 2.569, "step": 10068 }, { "crossentropy": 2.64858341217041, "epoch": 0.3650304524361949, "grad_norm": 0.028239194303750992, "grad_norm_var": 2.2081924388636278e-06, "learning_rate": 0.007241014355949249, "loss": 2.5996, "step": 10069 }, { "crossentropy": 2.662261486053467, "epoch": 0.36506670533642693, "grad_norm": 0.02967545948922634, "grad_norm_var": 2.1998734347580355e-06, "learning_rate": 0.007240494887984036, "loss": 2.6255, "step": 10070 }, { "crossentropy": 2.7112627029418945, "epoch": 0.36510295823665895, "grad_norm": 0.02973848581314087, "grad_norm_var": 1.4826211243020951e-06, "learning_rate": 0.007239975389757538, "loss": 2.714, "step": 10071 }, { "crossentropy": 2.6674695014953613, "epoch": 0.36513921113689096, "grad_norm": 0.031610023230314255, "grad_norm_var": 1.707399812337212e-06, "learning_rate": 0.007239455861276774, "loss": 2.6544, "step": 10072 }, { "crossentropy": 2.6302459239959717, "epoch": 0.365175464037123, "grad_norm": 0.029008563607931137, "grad_norm_var": 1.5234236728884723e-06, "learning_rate": 0.007238936302548761, "loss": 2.7071, "step": 10073 }, { "crossentropy": 2.648096799850464, "epoch": 0.365211716937355, "grad_norm": 0.029691673815250397, "grad_norm_var": 1.417330623612933e-06, "learning_rate": 0.007238416713580514, "loss": 2.7097, "step": 10074 }, { "crossentropy": 2.5510528087615967, "epoch": 0.365247969837587, "grad_norm": 0.030355118215084076, "grad_norm_var": 1.3418283008816609e-06, "learning_rate": 0.007237897094379053, "loss": 2.5367, "step": 10075 }, { "crossentropy": 2.581089735031128, "epoch": 0.365284222737819, "grad_norm": 0.029261423274874687, "grad_norm_var": 1.2921930590240513e-06, "learning_rate": 0.007237377444951397, "loss": 2.5839, "step": 10076 }, { "crossentropy": 2.6972687244415283, "epoch": 0.36532047563805103, "grad_norm": 0.02783328853547573, "grad_norm_var": 1.1979855200848575e-06, "learning_rate": 0.007236857765304561, "loss": 2.6992, "step": 10077 }, { "crossentropy": 2.708693027496338, "epoch": 0.36535672853828305, "grad_norm": 0.0325983390212059, "grad_norm_var": 1.5010088004348363e-06, "learning_rate": 0.007236338055445566, "loss": 2.6979, "step": 10078 }, { "crossentropy": 2.6077136993408203, "epoch": 0.36539298143851506, "grad_norm": 0.029222939163446426, "grad_norm_var": 1.4459732371074896e-06, "learning_rate": 0.007235818315381434, "loss": 2.6115, "step": 10079 }, { "crossentropy": 2.6978683471679688, "epoch": 0.3654292343387471, "grad_norm": 0.028470739722251892, "grad_norm_var": 1.4941308445947832e-06, "learning_rate": 0.007235298545119181, "loss": 2.614, "step": 10080 }, { "crossentropy": 2.59977650642395, "epoch": 0.3654654872389791, "grad_norm": 0.02836315706372261, "grad_norm_var": 1.5557893229525455e-06, "learning_rate": 0.00723477874466583, "loss": 2.5369, "step": 10081 }, { "crossentropy": 2.578247547149658, "epoch": 0.36550174013921116, "grad_norm": 0.029795177280902863, "grad_norm_var": 1.4703527964718266e-06, "learning_rate": 0.0072342589140284, "loss": 2.5824, "step": 10082 }, { "crossentropy": 2.657735824584961, "epoch": 0.3655379930394432, "grad_norm": 0.028476040810346603, "grad_norm_var": 1.5444769221025726e-06, "learning_rate": 0.007233739053213913, "loss": 2.6738, "step": 10083 }, { "crossentropy": 2.5354621410369873, "epoch": 0.3655742459396752, "grad_norm": 0.027515776455402374, "grad_norm_var": 1.7858250986187933e-06, "learning_rate": 0.00723321916222939, "loss": 2.5347, "step": 10084 }, { "crossentropy": 2.6990296840667725, "epoch": 0.3656104988399072, "grad_norm": 0.027708813548088074, "grad_norm_var": 1.8830887375198412e-06, "learning_rate": 0.007232699241081854, "loss": 2.6066, "step": 10085 }, { "crossentropy": 2.7556705474853516, "epoch": 0.3656467517401392, "grad_norm": 0.028507044538855553, "grad_norm_var": 1.9150330011673124e-06, "learning_rate": 0.0072321792897783255, "loss": 2.7306, "step": 10086 }, { "crossentropy": 2.5792229175567627, "epoch": 0.36568300464037123, "grad_norm": 0.02941223978996277, "grad_norm_var": 1.900862168736208e-06, "learning_rate": 0.007231659308325829, "loss": 2.5991, "step": 10087 }, { "crossentropy": 2.702320098876953, "epoch": 0.36571925754060325, "grad_norm": 0.032641299068927765, "grad_norm_var": 2.293302006328351e-06, "learning_rate": 0.007231139296731386, "loss": 2.6282, "step": 10088 }, { "crossentropy": 2.7308239936828613, "epoch": 0.36575551044083526, "grad_norm": 0.031179510056972504, "grad_norm_var": 2.5023909638700044e-06, "learning_rate": 0.007230619255002022, "loss": 2.6907, "step": 10089 }, { "crossentropy": 2.5733718872070312, "epoch": 0.3657917633410673, "grad_norm": 0.029808424413204193, "grad_norm_var": 2.5071678439362766e-06, "learning_rate": 0.007230099183144759, "loss": 2.5638, "step": 10090 }, { "crossentropy": 2.5789945125579834, "epoch": 0.3658280162412993, "grad_norm": 0.028892721980810165, "grad_norm_var": 2.463727501194575e-06, "learning_rate": 0.007229579081166623, "loss": 2.622, "step": 10091 }, { "crossentropy": 2.7000763416290283, "epoch": 0.3658642691415313, "grad_norm": 0.031668566167354584, "grad_norm_var": 2.795700777301751e-06, "learning_rate": 0.0072290589490746375, "loss": 2.6712, "step": 10092 }, { "crossentropy": 2.6445348262786865, "epoch": 0.3659005220417633, "grad_norm": 0.028551828116178513, "grad_norm_var": 2.6677263978551092e-06, "learning_rate": 0.007228538786875828, "loss": 2.5722, "step": 10093 }, { "crossentropy": 2.552377462387085, "epoch": 0.36593677494199534, "grad_norm": 0.027409816160798073, "grad_norm_var": 2.2419698020289514e-06, "learning_rate": 0.00722801859457722, "loss": 2.538, "step": 10094 }, { "crossentropy": 2.4548027515411377, "epoch": 0.3659730278422274, "grad_norm": 0.028325514867901802, "grad_norm_var": 2.292732232920961e-06, "learning_rate": 0.007227498372185841, "loss": 2.4595, "step": 10095 }, { "crossentropy": 2.6299386024475098, "epoch": 0.3660092807424594, "grad_norm": 0.028144733980298042, "grad_norm_var": 2.3297878862621817e-06, "learning_rate": 0.0072269781197087145, "loss": 2.6571, "step": 10096 }, { "crossentropy": 2.6201670169830322, "epoch": 0.36604553364269143, "grad_norm": 0.02837178111076355, "grad_norm_var": 2.3288877175081195e-06, "learning_rate": 0.007226457837152871, "loss": 2.5879, "step": 10097 }, { "crossentropy": 2.5828278064727783, "epoch": 0.36608178654292345, "grad_norm": 0.031824249774217606, "grad_norm_var": 2.760599806911288e-06, "learning_rate": 0.007225937524525335, "loss": 2.6661, "step": 10098 }, { "crossentropy": 2.6535146236419678, "epoch": 0.36611803944315546, "grad_norm": 0.030737150460481644, "grad_norm_var": 2.8385442983127593e-06, "learning_rate": 0.007225417181833134, "loss": 2.7107, "step": 10099 }, { "crossentropy": 2.5466623306274414, "epoch": 0.3661542923433875, "grad_norm": 0.02823936939239502, "grad_norm_var": 2.687674566892393e-06, "learning_rate": 0.007224896809083297, "loss": 2.5613, "step": 10100 }, { "crossentropy": 2.666844606399536, "epoch": 0.3661905452436195, "grad_norm": 0.02875465527176857, "grad_norm_var": 2.5112913231537432e-06, "learning_rate": 0.007224376406282851, "loss": 2.6197, "step": 10101 }, { "crossentropy": 2.534193277359009, "epoch": 0.3662267981438515, "grad_norm": 0.027490738779306412, "grad_norm_var": 2.714370266057225e-06, "learning_rate": 0.007223855973438827, "loss": 2.5522, "step": 10102 }, { "crossentropy": 2.5703530311584473, "epoch": 0.3662630510440835, "grad_norm": 0.0311165452003479, "grad_norm_var": 2.8837431126861227e-06, "learning_rate": 0.0072233355105582545, "loss": 2.5707, "step": 10103 }, { "crossentropy": 2.6235508918762207, "epoch": 0.36629930394431554, "grad_norm": 0.028666170313954353, "grad_norm_var": 2.2447274299684333e-06, "learning_rate": 0.00722281501764816, "loss": 2.5649, "step": 10104 }, { "crossentropy": 2.753685474395752, "epoch": 0.36633555684454755, "grad_norm": 0.028828609734773636, "grad_norm_var": 2.0084887390682423e-06, "learning_rate": 0.007222294494715576, "loss": 2.7363, "step": 10105 }, { "crossentropy": 2.4068987369537354, "epoch": 0.36637180974477956, "grad_norm": 0.027320513501763344, "grad_norm_var": 2.1858646988980364e-06, "learning_rate": 0.007221773941767533, "loss": 2.5251, "step": 10106 }, { "crossentropy": 2.4411461353302, "epoch": 0.3664080626450116, "grad_norm": 0.02730523608624935, "grad_norm_var": 2.370615742814987e-06, "learning_rate": 0.007221253358811062, "loss": 2.4714, "step": 10107 }, { "crossentropy": 2.5759544372558594, "epoch": 0.3664443155452436, "grad_norm": 0.027617843821644783, "grad_norm_var": 1.912844887641526e-06, "learning_rate": 0.007220732745853191, "loss": 2.4903, "step": 10108 }, { "crossentropy": 2.4440999031066895, "epoch": 0.36648056844547566, "grad_norm": 0.029838573187589645, "grad_norm_var": 1.996216117394741e-06, "learning_rate": 0.007220212102900956, "loss": 2.5089, "step": 10109 }, { "crossentropy": 2.6407904624938965, "epoch": 0.3665168213457077, "grad_norm": 0.02950556017458439, "grad_norm_var": 1.8963825051364203e-06, "learning_rate": 0.007219691429961387, "loss": 2.7131, "step": 10110 }, { "crossentropy": 2.546736001968384, "epoch": 0.3665530742459397, "grad_norm": 0.027800587937235832, "grad_norm_var": 1.9524445252693296e-06, "learning_rate": 0.007219170727041515, "loss": 2.6478, "step": 10111 }, { "crossentropy": 2.6025421619415283, "epoch": 0.3665893271461717, "grad_norm": 0.02849181555211544, "grad_norm_var": 1.9274446351682e-06, "learning_rate": 0.007218649994148379, "loss": 2.5329, "step": 10112 }, { "crossentropy": 2.687150478363037, "epoch": 0.3666255800464037, "grad_norm": 0.028726106509566307, "grad_norm_var": 1.911785040993828e-06, "learning_rate": 0.007218129231289005, "loss": 2.6049, "step": 10113 }, { "crossentropy": 2.6166021823883057, "epoch": 0.36666183294663574, "grad_norm": 0.027657395228743553, "grad_norm_var": 1.3675639193937248e-06, "learning_rate": 0.007217608438470428, "loss": 2.6429, "step": 10114 }, { "crossentropy": 2.4433579444885254, "epoch": 0.36669808584686775, "grad_norm": 0.0274913739413023, "grad_norm_var": 1.1145498968268344e-06, "learning_rate": 0.007217087615699685, "loss": 2.5901, "step": 10115 }, { "crossentropy": 2.770803689956665, "epoch": 0.36673433874709976, "grad_norm": 0.027946706861257553, "grad_norm_var": 1.1272713354840146e-06, "learning_rate": 0.007216566762983811, "loss": 2.7126, "step": 10116 }, { "crossentropy": 2.502261161804199, "epoch": 0.3667705916473318, "grad_norm": 0.032051194459199905, "grad_norm_var": 1.9580018521189735e-06, "learning_rate": 0.007216045880329838, "loss": 2.5917, "step": 10117 }, { "crossentropy": 2.416825294494629, "epoch": 0.3668068445475638, "grad_norm": 0.028828050941228867, "grad_norm_var": 1.869145152891503e-06, "learning_rate": 0.007215524967744801, "loss": 2.5358, "step": 10118 }, { "crossentropy": 2.6168487071990967, "epoch": 0.3668430974477958, "grad_norm": 0.028099462389945984, "grad_norm_var": 1.465753168875215e-06, "learning_rate": 0.0072150040252357385, "loss": 2.6161, "step": 10119 }, { "crossentropy": 2.5732738971710205, "epoch": 0.3668793503480278, "grad_norm": 0.027500582858920097, "grad_norm_var": 1.5265422605496732e-06, "learning_rate": 0.007214483052809686, "loss": 2.557, "step": 10120 }, { "crossentropy": 2.5762827396392822, "epoch": 0.36691560324825984, "grad_norm": 0.06873868405818939, "grad_norm_var": 0.00010315545124485413, "learning_rate": 0.007213962050473678, "loss": 2.6466, "step": 10121 }, { "crossentropy": 2.586376667022705, "epoch": 0.3669518561484919, "grad_norm": 0.030689692124724388, "grad_norm_var": 0.00010224232994607326, "learning_rate": 0.007213441018234752, "loss": 2.5888, "step": 10122 }, { "crossentropy": 2.63616681098938, "epoch": 0.3669881090487239, "grad_norm": 0.030475301668047905, "grad_norm_var": 0.00010124826086311631, "learning_rate": 0.007212919956099948, "loss": 2.6011, "step": 10123 }, { "crossentropy": 2.5655534267425537, "epoch": 0.36702436194895594, "grad_norm": 0.03203612193465233, "grad_norm_var": 0.00010027490150256491, "learning_rate": 0.007212398864076302, "loss": 2.5784, "step": 10124 }, { "crossentropy": 2.877636194229126, "epoch": 0.36706061484918795, "grad_norm": 0.03551752492785454, "grad_norm_var": 0.00010094369741501286, "learning_rate": 0.00721187774217085, "loss": 2.8112, "step": 10125 }, { "crossentropy": 2.756030797958374, "epoch": 0.36709686774941996, "grad_norm": 0.030668288469314575, "grad_norm_var": 0.00010064577991023554, "learning_rate": 0.007211356590390632, "loss": 2.6988, "step": 10126 }, { "crossentropy": 2.5838868618011475, "epoch": 0.367133120649652, "grad_norm": 0.028114154934883118, "grad_norm_var": 0.00010047447373782608, "learning_rate": 0.007210835408742689, "loss": 2.623, "step": 10127 }, { "crossentropy": 2.7331130504608154, "epoch": 0.367169373549884, "grad_norm": 0.027531417086720467, "grad_norm_var": 0.000100989618641472, "learning_rate": 0.007210314197234059, "loss": 2.5982, "step": 10128 }, { "crossentropy": 2.628821849822998, "epoch": 0.367205626450116, "grad_norm": 0.02941959537565708, "grad_norm_var": 0.0001007165389782302, "learning_rate": 0.0072097929558717815, "loss": 2.6442, "step": 10129 }, { "crossentropy": 2.558958053588867, "epoch": 0.367241879350348, "grad_norm": 0.030643871054053307, "grad_norm_var": 9.952571521852139e-05, "learning_rate": 0.007209271684662895, "loss": 2.599, "step": 10130 }, { "crossentropy": 2.5241425037384033, "epoch": 0.36727813225058004, "grad_norm": 0.03126296401023865, "grad_norm_var": 9.802955326911171e-05, "learning_rate": 0.007208750383614442, "loss": 2.5853, "step": 10131 }, { "crossentropy": 2.6476945877075195, "epoch": 0.36731438515081205, "grad_norm": 0.028332822024822235, "grad_norm_var": 9.780599117478467e-05, "learning_rate": 0.0072082290527334635, "loss": 2.5908, "step": 10132 }, { "crossentropy": 2.7462985515594482, "epoch": 0.36735063805104406, "grad_norm": 0.026819149032235146, "grad_norm_var": 9.982603846830534e-05, "learning_rate": 0.007207707692027002, "loss": 2.6009, "step": 10133 }, { "crossentropy": 2.7749342918395996, "epoch": 0.3673868909512761, "grad_norm": 0.03264056518673897, "grad_norm_var": 9.903700652506632e-05, "learning_rate": 0.007207186301502097, "loss": 2.6724, "step": 10134 }, { "crossentropy": 2.6185123920440674, "epoch": 0.3674231438515081, "grad_norm": 0.03470393270254135, "grad_norm_var": 9.797119518281833e-05, "learning_rate": 0.007206664881165789, "loss": 2.5709, "step": 10135 }, { "crossentropy": 2.7303807735443115, "epoch": 0.36745939675174016, "grad_norm": 0.033789005130529404, "grad_norm_var": 9.598393978463964e-05, "learning_rate": 0.007206143431025127, "loss": 2.7092, "step": 10136 }, { "crossentropy": 2.7479331493377686, "epoch": 0.3674956496519722, "grad_norm": 0.028479166328907013, "grad_norm_var": 6.577792848624457e-06, "learning_rate": 0.007205621951087149, "loss": 2.6614, "step": 10137 }, { "crossentropy": 2.5431809425354004, "epoch": 0.3675319025522042, "grad_norm": 0.029034970328211784, "grad_norm_var": 6.750144692310953e-06, "learning_rate": 0.007205100441358897, "loss": 2.6246, "step": 10138 }, { "crossentropy": 2.5268142223358154, "epoch": 0.3675681554524362, "grad_norm": 0.02958543598651886, "grad_norm_var": 6.813458755076347e-06, "learning_rate": 0.007204578901847419, "loss": 2.6272, "step": 10139 }, { "crossentropy": 2.713942766189575, "epoch": 0.3676044083526682, "grad_norm": 0.029837699607014656, "grad_norm_var": 6.67585952021608e-06, "learning_rate": 0.007204057332559757, "loss": 2.7093, "step": 10140 }, { "crossentropy": 2.743757724761963, "epoch": 0.36764066125290024, "grad_norm": 0.031927287578582764, "grad_norm_var": 5.0311401802731356e-06, "learning_rate": 0.007203535733502957, "loss": 2.6303, "step": 10141 }, { "crossentropy": 2.7882447242736816, "epoch": 0.36767691415313225, "grad_norm": 0.03643225133419037, "grad_norm_var": 7.487165341012586e-06, "learning_rate": 0.0072030141046840615, "loss": 2.7159, "step": 10142 }, { "crossentropy": 2.703977346420288, "epoch": 0.36771316705336426, "grad_norm": 0.03313342481851578, "grad_norm_var": 7.441854525588818e-06, "learning_rate": 0.007202492446110118, "loss": 2.7708, "step": 10143 }, { "crossentropy": 2.5881316661834717, "epoch": 0.3677494199535963, "grad_norm": 0.02803008072078228, "grad_norm_var": 7.2368584478596044e-06, "learning_rate": 0.007201970757788172, "loss": 2.6509, "step": 10144 }, { "crossentropy": 2.686622381210327, "epoch": 0.3677856728538283, "grad_norm": 0.027699552476406097, "grad_norm_var": 7.756583986564036e-06, "learning_rate": 0.0072014490397252695, "loss": 2.6603, "step": 10145 }, { "crossentropy": 2.5004796981811523, "epoch": 0.3678219257540603, "grad_norm": 0.028946546837687492, "grad_norm_var": 7.965640200627968e-06, "learning_rate": 0.0072009272919284556, "loss": 2.5796, "step": 10146 }, { "crossentropy": 2.6243953704833984, "epoch": 0.3678581786542923, "grad_norm": 0.030940493568778038, "grad_norm_var": 7.94646922069314e-06, "learning_rate": 0.007200405514404778, "loss": 2.6272, "step": 10147 }, { "crossentropy": 2.7513911724090576, "epoch": 0.36789443155452434, "grad_norm": 0.029589969664812088, "grad_norm_var": 7.657549203757738e-06, "learning_rate": 0.007199883707161287, "loss": 2.718, "step": 10148 }, { "crossentropy": 2.523125410079956, "epoch": 0.3679306844547564, "grad_norm": 0.032489143311977386, "grad_norm_var": 6.714525692353548e-06, "learning_rate": 0.0071993618702050274, "loss": 2.6592, "step": 10149 }, { "crossentropy": 2.3666763305664062, "epoch": 0.3679669373549884, "grad_norm": 0.028362717479467392, "grad_norm_var": 6.967429924104142e-06, "learning_rate": 0.007198840003543047, "loss": 2.4873, "step": 10150 }, { "crossentropy": 2.593930244445801, "epoch": 0.36800319025522044, "grad_norm": 0.02918674424290657, "grad_norm_var": 6.006412387103453e-06, "learning_rate": 0.007198318107182395, "loss": 2.6252, "step": 10151 }, { "crossentropy": 2.6125054359436035, "epoch": 0.36803944315545245, "grad_norm": 0.030711008235812187, "grad_norm_var": 5.234999265909107e-06, "learning_rate": 0.007197796181130123, "loss": 2.6922, "step": 10152 }, { "crossentropy": 2.629701614379883, "epoch": 0.36807569605568446, "grad_norm": 0.029459841549396515, "grad_norm_var": 5.060400118485758e-06, "learning_rate": 0.007197274225393278, "loss": 2.6221, "step": 10153 }, { "crossentropy": 2.656914710998535, "epoch": 0.3681119489559165, "grad_norm": 0.03284090757369995, "grad_norm_var": 5.305784333754189e-06, "learning_rate": 0.007196752239978909, "loss": 2.7145, "step": 10154 }, { "crossentropy": 2.676879405975342, "epoch": 0.3681482018561485, "grad_norm": 0.03326128050684929, "grad_norm_var": 5.666099947225948e-06, "learning_rate": 0.007196230224894068, "loss": 2.6127, "step": 10155 }, { "crossentropy": 2.6187684535980225, "epoch": 0.3681844547563805, "grad_norm": 0.029384423047304153, "grad_norm_var": 5.737284500105927e-06, "learning_rate": 0.007195708180145805, "loss": 2.6047, "step": 10156 }, { "crossentropy": 2.547105073928833, "epoch": 0.3682207076566125, "grad_norm": 0.029047712683677673, "grad_norm_var": 5.813014521894161e-06, "learning_rate": 0.00719518610574117, "loss": 2.5392, "step": 10157 }, { "crossentropy": 2.730768918991089, "epoch": 0.36825696055684454, "grad_norm": 0.029435032978653908, "grad_norm_var": 3.4269172525522626e-06, "learning_rate": 0.007194664001687216, "loss": 2.6431, "step": 10158 }, { "crossentropy": 2.491382360458374, "epoch": 0.36829321345707655, "grad_norm": 0.030906036496162415, "grad_norm_var": 2.8531697705966393e-06, "learning_rate": 0.0071941418679909944, "loss": 2.5354, "step": 10159 }, { "crossentropy": 2.7505624294281006, "epoch": 0.36832946635730857, "grad_norm": 0.028674133121967316, "grad_norm_var": 2.7083663591442876e-06, "learning_rate": 0.0071936197046595565, "loss": 2.7325, "step": 10160 }, { "crossentropy": 2.5968687534332275, "epoch": 0.3683657192575406, "grad_norm": 0.02861846424639225, "grad_norm_var": 2.4721228277300394e-06, "learning_rate": 0.007193097511699956, "loss": 2.6736, "step": 10161 }, { "crossentropy": 2.6821858882904053, "epoch": 0.3684019721577726, "grad_norm": 0.03514615073800087, "grad_norm_var": 3.9077094671535285e-06, "learning_rate": 0.007192575289119246, "loss": 2.7168, "step": 10162 }, { "crossentropy": 2.6326205730438232, "epoch": 0.36843822505800466, "grad_norm": 0.028274541720747948, "grad_norm_var": 4.196538717153817e-06, "learning_rate": 0.007192053036924478, "loss": 2.6266, "step": 10163 }, { "crossentropy": 2.5345780849456787, "epoch": 0.3684744779582367, "grad_norm": 0.030268704518675804, "grad_norm_var": 4.15774855895983e-06, "learning_rate": 0.007191530755122708, "loss": 2.6069, "step": 10164 }, { "crossentropy": 2.761713981628418, "epoch": 0.3685107308584687, "grad_norm": 0.028890060260891914, "grad_norm_var": 3.954810432395666e-06, "learning_rate": 0.007191008443720989, "loss": 2.6754, "step": 10165 }, { "crossentropy": 2.4175634384155273, "epoch": 0.3685469837587007, "grad_norm": 0.028922928497195244, "grad_norm_var": 3.840608162090558e-06, "learning_rate": 0.007190486102726377, "loss": 2.4941, "step": 10166 }, { "crossentropy": 2.587599277496338, "epoch": 0.3685832366589327, "grad_norm": 0.02812466397881508, "grad_norm_var": 4.0530743496334225e-06, "learning_rate": 0.007189963732145924, "loss": 2.5697, "step": 10167 }, { "crossentropy": 2.4574408531188965, "epoch": 0.36861948955916474, "grad_norm": 0.0269621592015028, "grad_norm_var": 4.637461373885422e-06, "learning_rate": 0.007189441331986689, "loss": 2.5205, "step": 10168 }, { "crossentropy": 2.626160144805908, "epoch": 0.36865574245939675, "grad_norm": 0.028191091492772102, "grad_norm_var": 4.8105950251601614e-06, "learning_rate": 0.007188918902255727, "loss": 2.6836, "step": 10169 }, { "crossentropy": 2.69720458984375, "epoch": 0.36869199535962877, "grad_norm": 0.026831891387701035, "grad_norm_var": 4.6384062296501875e-06, "learning_rate": 0.007188396442960091, "loss": 2.6396, "step": 10170 }, { "crossentropy": 2.534262180328369, "epoch": 0.3687282482598608, "grad_norm": 0.025944644585251808, "grad_norm_var": 4.250231633413333e-06, "learning_rate": 0.007187873954106841, "loss": 2.4986, "step": 10171 }, { "crossentropy": 2.6162757873535156, "epoch": 0.3687645011600928, "grad_norm": 0.027647776529192924, "grad_norm_var": 4.34425250489297e-06, "learning_rate": 0.007187351435703034, "loss": 2.6174, "step": 10172 }, { "crossentropy": 2.6258890628814697, "epoch": 0.3688007540603248, "grad_norm": 0.02858777530491352, "grad_norm_var": 4.346445332387251e-06, "learning_rate": 0.007186828887755727, "loss": 2.6527, "step": 10173 }, { "crossentropy": 2.478935956954956, "epoch": 0.3688370069605568, "grad_norm": 0.028373753651976585, "grad_norm_var": 4.332517123291343e-06, "learning_rate": 0.007186306310271975, "loss": 2.4003, "step": 10174 }, { "crossentropy": 2.466613531112671, "epoch": 0.36887325986078884, "grad_norm": 0.028308080509305, "grad_norm_var": 4.0154119512094666e-06, "learning_rate": 0.0071857837032588395, "loss": 2.4628, "step": 10175 }, { "crossentropy": 2.6759207248687744, "epoch": 0.3689095127610209, "grad_norm": 0.026843033730983734, "grad_norm_var": 4.209415943290326e-06, "learning_rate": 0.007185261066723379, "loss": 2.6767, "step": 10176 }, { "crossentropy": 2.5702829360961914, "epoch": 0.3689457656612529, "grad_norm": 0.028146149590611458, "grad_norm_var": 4.2156451923791e-06, "learning_rate": 0.00718473840067265, "loss": 2.6758, "step": 10177 }, { "crossentropy": 2.6379573345184326, "epoch": 0.36898201856148494, "grad_norm": 0.027832208201289177, "grad_norm_var": 1.045024462645649e-06, "learning_rate": 0.007184215705113714, "loss": 2.6541, "step": 10178 }, { "crossentropy": 2.601076364517212, "epoch": 0.36901827146171695, "grad_norm": 0.02856147289276123, "grad_norm_var": 1.06031595292951e-06, "learning_rate": 0.00718369298005363, "loss": 2.6234, "step": 10179 }, { "crossentropy": 2.7489986419677734, "epoch": 0.36905452436194897, "grad_norm": 0.029190178960561752, "grad_norm_var": 8.106917852090107e-07, "learning_rate": 0.007183170225499458, "loss": 2.6956, "step": 10180 }, { "crossentropy": 2.6190268993377686, "epoch": 0.369090777262181, "grad_norm": 0.028230799362063408, "grad_norm_var": 7.560904864226432e-07, "learning_rate": 0.007182647441458261, "loss": 2.5912, "step": 10181 }, { "crossentropy": 2.827054262161255, "epoch": 0.369127030162413, "grad_norm": 0.028778256848454475, "grad_norm_var": 7.380267714457058e-07, "learning_rate": 0.007182124627937096, "loss": 2.7372, "step": 10182 }, { "crossentropy": 2.685730457305908, "epoch": 0.369163283062645, "grad_norm": 0.02889860048890114, "grad_norm_var": 7.976534925826685e-07, "learning_rate": 0.007181601784943027, "loss": 2.6964, "step": 10183 }, { "crossentropy": 2.640381336212158, "epoch": 0.369199535962877, "grad_norm": 0.030974246561527252, "grad_norm_var": 1.2709905216230315e-06, "learning_rate": 0.007181078912483116, "loss": 2.6587, "step": 10184 }, { "crossentropy": 2.449143171310425, "epoch": 0.36923578886310904, "grad_norm": 0.029630644246935844, "grad_norm_var": 1.3971211281844803e-06, "learning_rate": 0.007180556010564423, "loss": 2.5394, "step": 10185 }, { "crossentropy": 2.492041826248169, "epoch": 0.36927204176334105, "grad_norm": 0.029202990233898163, "grad_norm_var": 1.2847703773746314e-06, "learning_rate": 0.007180033079194013, "loss": 2.5852, "step": 10186 }, { "crossentropy": 2.582101821899414, "epoch": 0.36930829466357307, "grad_norm": 0.027310438454151154, "grad_norm_var": 9.456796764879903e-07, "learning_rate": 0.007179510118378948, "loss": 2.5847, "step": 10187 }, { "crossentropy": 2.5974833965301514, "epoch": 0.3693445475638051, "grad_norm": 0.028256265446543694, "grad_norm_var": 8.970598186618655e-07, "learning_rate": 0.007178987128126291, "loss": 2.6015, "step": 10188 }, { "crossentropy": 2.6279759407043457, "epoch": 0.3693808004640371, "grad_norm": 0.027276035398244858, "grad_norm_var": 1.0015457931019286e-06, "learning_rate": 0.007178464108443106, "loss": 2.6589, "step": 10189 }, { "crossentropy": 2.6487176418304443, "epoch": 0.36941705336426917, "grad_norm": 0.02840062975883484, "grad_norm_var": 1.0011803843559442e-06, "learning_rate": 0.007177941059336457, "loss": 2.6533, "step": 10190 }, { "crossentropy": 2.7045671939849854, "epoch": 0.3694533062645012, "grad_norm": 0.02780270017683506, "grad_norm_var": 1.029402064639514e-06, "learning_rate": 0.007177417980813409, "loss": 2.617, "step": 10191 }, { "crossentropy": 2.658292531967163, "epoch": 0.3694895591647332, "grad_norm": 0.028249140828847885, "grad_norm_var": 8.501198074160974e-07, "learning_rate": 0.007176894872881027, "loss": 2.6802, "step": 10192 }, { "crossentropy": 2.452317237854004, "epoch": 0.3695258120649652, "grad_norm": 0.02808385342359543, "grad_norm_var": 8.5368604766835e-07, "learning_rate": 0.007176371735546376, "loss": 2.6066, "step": 10193 }, { "crossentropy": 2.738166093826294, "epoch": 0.3695620649651972, "grad_norm": 0.031051330268383026, "grad_norm_var": 1.1965302050425391e-06, "learning_rate": 0.007175848568816522, "loss": 2.7631, "step": 10194 }, { "crossentropy": 2.698416233062744, "epoch": 0.36959831786542924, "grad_norm": 0.036214254796504974, "grad_norm_var": 4.671010991912301e-06, "learning_rate": 0.0071753253726985315, "loss": 2.5584, "step": 10195 }, { "crossentropy": 2.480304479598999, "epoch": 0.36963457076566125, "grad_norm": 0.033266790211200714, "grad_norm_var": 5.6924427150036126e-06, "learning_rate": 0.007174802147199471, "loss": 2.5562, "step": 10196 }, { "crossentropy": 2.6769697666168213, "epoch": 0.36967082366589327, "grad_norm": 0.03077089786529541, "grad_norm_var": 5.673742341802343e-06, "learning_rate": 0.0071742788923264056, "loss": 2.6158, "step": 10197 }, { "crossentropy": 2.5293807983398438, "epoch": 0.3697070765661253, "grad_norm": 0.028207208961248398, "grad_norm_var": 5.7593891726746e-06, "learning_rate": 0.007173755608086405, "loss": 2.5653, "step": 10198 }, { "crossentropy": 2.5745866298675537, "epoch": 0.3697433294663573, "grad_norm": 0.02850075624883175, "grad_norm_var": 5.806474870972735e-06, "learning_rate": 0.007173232294486536, "loss": 2.5512, "step": 10199 }, { "crossentropy": 2.5608997344970703, "epoch": 0.3697795823665893, "grad_norm": 0.03115103207528591, "grad_norm_var": 5.841413070810014e-06, "learning_rate": 0.007172708951533868, "loss": 2.6448, "step": 10200 }, { "crossentropy": 2.6382088661193848, "epoch": 0.3698158352668213, "grad_norm": 0.03254156932234764, "grad_norm_var": 6.388358369242784e-06, "learning_rate": 0.007172185579235467, "loss": 2.5765, "step": 10201 }, { "crossentropy": 2.510423183441162, "epoch": 0.36985208816705334, "grad_norm": 0.02952083759009838, "grad_norm_var": 6.370733218015095e-06, "learning_rate": 0.0071716621775984035, "loss": 2.519, "step": 10202 }, { "crossentropy": 2.586923122406006, "epoch": 0.3698883410672854, "grad_norm": 0.03216111287474632, "grad_norm_var": 6.239091274018434e-06, "learning_rate": 0.007171138746629746, "loss": 2.5858, "step": 10203 }, { "crossentropy": 2.557446002960205, "epoch": 0.3699245939675174, "grad_norm": 0.028624188154935837, "grad_norm_var": 6.1575511770550186e-06, "learning_rate": 0.007170615286336568, "loss": 2.595, "step": 10204 }, { "crossentropy": 2.5585854053497314, "epoch": 0.36996084686774944, "grad_norm": 0.027599995955824852, "grad_norm_var": 6.041529920738558e-06, "learning_rate": 0.0071700917967259335, "loss": 2.6514, "step": 10205 }, { "crossentropy": 2.6388614177703857, "epoch": 0.36999709976798145, "grad_norm": 0.030301466584205627, "grad_norm_var": 5.828003436528561e-06, "learning_rate": 0.007169568277804917, "loss": 2.6563, "step": 10206 }, { "crossentropy": 2.633676052093506, "epoch": 0.37003335266821347, "grad_norm": 0.028699005022644997, "grad_norm_var": 5.585391296046904e-06, "learning_rate": 0.007169044729580588, "loss": 2.6415, "step": 10207 }, { "crossentropy": 2.6772193908691406, "epoch": 0.3700696055684455, "grad_norm": 0.027804594486951828, "grad_norm_var": 5.719834277390654e-06, "learning_rate": 0.007168521152060021, "loss": 2.5849, "step": 10208 }, { "crossentropy": 2.696424722671509, "epoch": 0.3701058584686775, "grad_norm": 0.027620188891887665, "grad_norm_var": 5.869113867155171e-06, "learning_rate": 0.007167997545250282, "loss": 2.657, "step": 10209 }, { "crossentropy": 2.669901132583618, "epoch": 0.3701421113689095, "grad_norm": 0.027696287259459496, "grad_norm_var": 6.215152096763985e-06, "learning_rate": 0.007167473909158446, "loss": 2.6435, "step": 10210 }, { "crossentropy": 2.6355197429656982, "epoch": 0.3701783642691415, "grad_norm": 0.028050830587744713, "grad_norm_var": 3.6625711941647473e-06, "learning_rate": 0.007166950243791586, "loss": 2.6454, "step": 10211 }, { "crossentropy": 2.6844139099121094, "epoch": 0.37021461716937354, "grad_norm": 0.03048589825630188, "grad_norm_var": 2.7612101485518694e-06, "learning_rate": 0.007166426549156776, "loss": 2.6826, "step": 10212 }, { "crossentropy": 2.444228410720825, "epoch": 0.37025087006960555, "grad_norm": 0.030275652185082436, "grad_norm_var": 2.683274354802791e-06, "learning_rate": 0.007165902825261087, "loss": 2.5104, "step": 10213 }, { "crossentropy": 2.710371732711792, "epoch": 0.37028712296983757, "grad_norm": 0.027954373508691788, "grad_norm_var": 2.7250376031276795e-06, "learning_rate": 0.007165379072111592, "loss": 2.7082, "step": 10214 }, { "crossentropy": 2.770418167114258, "epoch": 0.3703233758700696, "grad_norm": 0.028788726776838303, "grad_norm_var": 2.699082075361759e-06, "learning_rate": 0.0071648552897153675, "loss": 2.6935, "step": 10215 }, { "crossentropy": 2.473297595977783, "epoch": 0.3703596287703016, "grad_norm": 0.03006555326282978, "grad_norm_var": 2.5091263353845764e-06, "learning_rate": 0.007164331478079488, "loss": 2.5873, "step": 10216 }, { "crossentropy": 2.707702159881592, "epoch": 0.37039588167053367, "grad_norm": 0.03155430778861046, "grad_norm_var": 2.1383243388404014e-06, "learning_rate": 0.007163807637211027, "loss": 2.6515, "step": 10217 }, { "crossentropy": 2.5283966064453125, "epoch": 0.3704321345707657, "grad_norm": 0.029616210609674454, "grad_norm_var": 2.142970340037777e-06, "learning_rate": 0.0071632837671170594, "loss": 2.5548, "step": 10218 }, { "crossentropy": 2.622581958770752, "epoch": 0.3704683874709977, "grad_norm": 0.027526071295142174, "grad_norm_var": 1.659512253048218e-06, "learning_rate": 0.007162759867804663, "loss": 2.5671, "step": 10219 }, { "crossentropy": 2.6539666652679443, "epoch": 0.3705046403712297, "grad_norm": 0.029052887111902237, "grad_norm_var": 1.6542924932480176e-06, "learning_rate": 0.00716223593928091, "loss": 2.6834, "step": 10220 }, { "crossentropy": 2.7318685054779053, "epoch": 0.3705408932714617, "grad_norm": 0.02753678523004055, "grad_norm_var": 1.6658633191801343e-06, "learning_rate": 0.007161711981552882, "loss": 2.7038, "step": 10221 }, { "crossentropy": 2.690657615661621, "epoch": 0.37057714617169374, "grad_norm": 0.02876954711973667, "grad_norm_var": 1.534306792533302e-06, "learning_rate": 0.007161187994627651, "loss": 2.6932, "step": 10222 }, { "crossentropy": 2.7050812244415283, "epoch": 0.37061339907192575, "grad_norm": 0.03118130750954151, "grad_norm_var": 1.8715778441533622e-06, "learning_rate": 0.0071606639785122975, "loss": 2.6252, "step": 10223 }, { "crossentropy": 2.527390480041504, "epoch": 0.37064965197215777, "grad_norm": 0.02894514799118042, "grad_norm_var": 1.7712893660453676e-06, "learning_rate": 0.007160139933213898, "loss": 2.6507, "step": 10224 }, { "crossentropy": 2.4999096393585205, "epoch": 0.3706859048723898, "grad_norm": 0.0278933048248291, "grad_norm_var": 1.7231563616964517e-06, "learning_rate": 0.007159615858739532, "loss": 2.578, "step": 10225 }, { "crossentropy": 2.655301809310913, "epoch": 0.3707221577726218, "grad_norm": 0.027768682688474655, "grad_norm_var": 1.7100592265375704e-06, "learning_rate": 0.007159091755096274, "loss": 2.6726, "step": 10226 }, { "crossentropy": 2.5202019214630127, "epoch": 0.3707584106728538, "grad_norm": 0.026320746168494225, "grad_norm_var": 2.1372117422460327e-06, "learning_rate": 0.007158567622291207, "loss": 2.5132, "step": 10227 }, { "crossentropy": 2.803528308868408, "epoch": 0.3707946635730858, "grad_norm": 0.03052835538983345, "grad_norm_var": 2.1458296905999303e-06, "learning_rate": 0.007158043460331407, "loss": 2.6408, "step": 10228 }, { "crossentropy": 2.6145153045654297, "epoch": 0.37083091647331784, "grad_norm": 0.027473118156194687, "grad_norm_var": 2.154849873289381e-06, "learning_rate": 0.007157519269223958, "loss": 2.5897, "step": 10229 }, { "crossentropy": 2.6011595726013184, "epoch": 0.3708671693735499, "grad_norm": 0.026602454483509064, "grad_norm_var": 2.423482289555928e-06, "learning_rate": 0.007156995048975934, "loss": 2.6079, "step": 10230 }, { "crossentropy": 2.6346018314361572, "epoch": 0.3709034222737819, "grad_norm": 0.028129130601882935, "grad_norm_var": 2.445197016225781e-06, "learning_rate": 0.007156470799594419, "loss": 2.5847, "step": 10231 }, { "crossentropy": 2.5604496002197266, "epoch": 0.37093967517401394, "grad_norm": 0.0292286928743124, "grad_norm_var": 2.3349491049396222e-06, "learning_rate": 0.007155946521086494, "loss": 2.5613, "step": 10232 }, { "crossentropy": 2.6234042644500732, "epoch": 0.37097592807424595, "grad_norm": 0.027402184903621674, "grad_norm_var": 1.795129770067877e-06, "learning_rate": 0.00715542221345924, "loss": 2.5991, "step": 10233 }, { "crossentropy": 2.747377395629883, "epoch": 0.37101218097447797, "grad_norm": 0.028059160336852074, "grad_norm_var": 1.6886422487724737e-06, "learning_rate": 0.007154897876719737, "loss": 2.6858, "step": 10234 }, { "crossentropy": 2.691720724105835, "epoch": 0.37104843387471, "grad_norm": 0.02712380327284336, "grad_norm_var": 1.7389842350633145e-06, "learning_rate": 0.007154373510875068, "loss": 2.6163, "step": 10235 }, { "crossentropy": 2.4752907752990723, "epoch": 0.371084686774942, "grad_norm": 0.027957625687122345, "grad_norm_var": 1.6968493049715802e-06, "learning_rate": 0.007153849115932316, "loss": 2.5047, "step": 10236 }, { "crossentropy": 2.664064407348633, "epoch": 0.371120939675174, "grad_norm": 0.02885199338197708, "grad_norm_var": 1.6917263235137364e-06, "learning_rate": 0.007153324691898563, "loss": 2.5948, "step": 10237 }, { "crossentropy": 2.5024797916412354, "epoch": 0.371157192575406, "grad_norm": 0.02724887989461422, "grad_norm_var": 1.733893230206988e-06, "learning_rate": 0.007152800238780892, "loss": 2.6175, "step": 10238 }, { "crossentropy": 2.588717460632324, "epoch": 0.37119344547563804, "grad_norm": 0.027467502281069756, "grad_norm_var": 1.10462629593257e-06, "learning_rate": 0.007152275756586387, "loss": 2.5789, "step": 10239 }, { "crossentropy": 2.5952489376068115, "epoch": 0.37122969837587005, "grad_norm": 0.02791457064449787, "grad_norm_var": 1.0325523983734416e-06, "learning_rate": 0.007151751245322131, "loss": 2.5981, "step": 10240 }, { "crossentropy": 2.572261333465576, "epoch": 0.37126595127610207, "grad_norm": 0.027688393369317055, "grad_norm_var": 1.034625699838689e-06, "learning_rate": 0.007151226704995211, "loss": 2.5563, "step": 10241 }, { "crossentropy": 2.5062429904937744, "epoch": 0.3713022041763341, "grad_norm": 0.02816842496395111, "grad_norm_var": 1.0397280633654577e-06, "learning_rate": 0.007150702135612708, "loss": 2.5911, "step": 10242 }, { "crossentropy": 2.537465810775757, "epoch": 0.37133845707656615, "grad_norm": 0.028053797781467438, "grad_norm_var": 8.659143869315603e-07, "learning_rate": 0.0071501775371817095, "loss": 2.6684, "step": 10243 }, { "crossentropy": 2.6991257667541504, "epoch": 0.37137470997679817, "grad_norm": 0.03118363581597805, "grad_norm_var": 1.1142121615737022e-06, "learning_rate": 0.0071496529097093, "loss": 2.6572, "step": 10244 }, { "crossentropy": 2.473360538482666, "epoch": 0.3714109628770302, "grad_norm": 0.02993977814912796, "grad_norm_var": 1.3098280011456312e-06, "learning_rate": 0.007149128253202566, "loss": 2.5556, "step": 10245 }, { "crossentropy": 2.592545509338379, "epoch": 0.3714472157772622, "grad_norm": 0.028466014191508293, "grad_norm_var": 1.1327267955935864e-06, "learning_rate": 0.007148603567668593, "loss": 2.6352, "step": 10246 }, { "crossentropy": 2.6774542331695557, "epoch": 0.3714834686774942, "grad_norm": 0.028738854452967644, "grad_norm_var": 1.1416461936045174e-06, "learning_rate": 0.007148078853114468, "loss": 2.6716, "step": 10247 }, { "crossentropy": 2.7602641582489014, "epoch": 0.3715197215777262, "grad_norm": 0.027739301323890686, "grad_norm_var": 1.1044692756540052e-06, "learning_rate": 0.00714755410954728, "loss": 2.7399, "step": 10248 }, { "crossentropy": 2.521239757537842, "epoch": 0.37155597447795824, "grad_norm": 0.0312558077275753, "grad_norm_var": 1.5968726497096598e-06, "learning_rate": 0.007147029336974114, "loss": 2.4856, "step": 10249 }, { "crossentropy": 2.743546485900879, "epoch": 0.37159222737819025, "grad_norm": 0.02818138152360916, "grad_norm_var": 1.5907673765423274e-06, "learning_rate": 0.0071465045354020575, "loss": 2.7161, "step": 10250 }, { "crossentropy": 2.5205447673797607, "epoch": 0.37162848027842227, "grad_norm": 0.027543257921934128, "grad_norm_var": 1.5248675479274334e-06, "learning_rate": 0.0071459797048382, "loss": 2.5594, "step": 10251 }, { "crossentropy": 2.4106369018554688, "epoch": 0.3716647331786543, "grad_norm": 0.029818465933203697, "grad_norm_var": 1.6005276643878593e-06, "learning_rate": 0.007145454845289631, "loss": 2.5223, "step": 10252 }, { "crossentropy": 2.557356357574463, "epoch": 0.3717009860788863, "grad_norm": 0.029880976304411888, "grad_norm_var": 1.6956160334718422e-06, "learning_rate": 0.007144929956763437, "loss": 2.6585, "step": 10253 }, { "crossentropy": 2.51701283454895, "epoch": 0.3717372389791183, "grad_norm": 0.02895827405154705, "grad_norm_var": 1.5462362406389128e-06, "learning_rate": 0.00714440503926671, "loss": 2.5884, "step": 10254 }, { "crossentropy": 2.5737085342407227, "epoch": 0.3717734918793503, "grad_norm": 0.030639002099633217, "grad_norm_var": 1.6061735884846054e-06, "learning_rate": 0.0071438800928065385, "loss": 2.5471, "step": 10255 }, { "crossentropy": 2.687321662902832, "epoch": 0.37180974477958234, "grad_norm": 0.03196648508310318, "grad_norm_var": 2.0401522753785612e-06, "learning_rate": 0.007143355117390013, "loss": 2.7008, "step": 10256 }, { "crossentropy": 2.5175211429595947, "epoch": 0.3718459976798144, "grad_norm": 0.02818632684648037, "grad_norm_var": 1.951050997939183e-06, "learning_rate": 0.007142830113024224, "loss": 2.5634, "step": 10257 }, { "crossentropy": 2.6843209266662598, "epoch": 0.3718822505800464, "grad_norm": 0.027824703603982925, "grad_norm_var": 2.0100647901928614e-06, "learning_rate": 0.007142305079716263, "loss": 2.527, "step": 10258 }, { "crossentropy": 2.5633983612060547, "epoch": 0.37191850348027844, "grad_norm": 0.028080029413104057, "grad_norm_var": 2.0058418120451193e-06, "learning_rate": 0.007141780017473219, "loss": 2.5705, "step": 10259 }, { "crossentropy": 2.701247215270996, "epoch": 0.37195475638051045, "grad_norm": 0.02823481895029545, "grad_norm_var": 1.7989392532185515e-06, "learning_rate": 0.007141254926302187, "loss": 2.6312, "step": 10260 }, { "crossentropy": 2.4348976612091064, "epoch": 0.37199100928074247, "grad_norm": 0.02837182953953743, "grad_norm_var": 1.7751148197714251e-06, "learning_rate": 0.007140729806210258, "loss": 2.534, "step": 10261 }, { "crossentropy": 2.7028141021728516, "epoch": 0.3720272621809745, "grad_norm": 0.028244685381650925, "grad_norm_var": 1.7937235337179998e-06, "learning_rate": 0.007140204657204523, "loss": 2.5547, "step": 10262 }, { "crossentropy": 2.4541945457458496, "epoch": 0.3720635150812065, "grad_norm": 0.02752370946109295, "grad_norm_var": 1.9249198661765973e-06, "learning_rate": 0.007139679479292077, "loss": 2.6401, "step": 10263 }, { "crossentropy": 2.7271156311035156, "epoch": 0.3720997679814385, "grad_norm": 0.027976781129837036, "grad_norm_var": 1.8915952443751896e-06, "learning_rate": 0.0071391542724800125, "loss": 2.6392, "step": 10264 }, { "crossentropy": 2.707472801208496, "epoch": 0.3721360208816705, "grad_norm": 0.02892959862947464, "grad_norm_var": 1.5046726119495591e-06, "learning_rate": 0.0071386290367754235, "loss": 2.7364, "step": 10265 }, { "crossentropy": 2.6733555793762207, "epoch": 0.37217227378190254, "grad_norm": 0.027473116293549538, "grad_norm_var": 1.5918495010666753e-06, "learning_rate": 0.007138103772185404, "loss": 2.6457, "step": 10266 }, { "crossentropy": 2.792579412460327, "epoch": 0.37220852668213456, "grad_norm": 0.027619624510407448, "grad_norm_var": 1.5801481131480036e-06, "learning_rate": 0.007137578478717048, "loss": 2.6973, "step": 10267 }, { "crossentropy": 2.5238120555877686, "epoch": 0.37224477958236657, "grad_norm": 0.027903694659471512, "grad_norm_var": 1.532179212984291e-06, "learning_rate": 0.007137053156377451, "loss": 2.5788, "step": 10268 }, { "crossentropy": 2.6076321601867676, "epoch": 0.3722810324825986, "grad_norm": 0.028989747166633606, "grad_norm_var": 1.4311899814702563e-06, "learning_rate": 0.007136527805173707, "loss": 2.606, "step": 10269 }, { "crossentropy": 2.6133925914764404, "epoch": 0.37231728538283065, "grad_norm": 0.029778193682432175, "grad_norm_var": 1.5170038307180893e-06, "learning_rate": 0.007136002425112914, "loss": 2.6702, "step": 10270 }, { "crossentropy": 2.5571680068969727, "epoch": 0.37235353828306267, "grad_norm": 0.028457166627049446, "grad_norm_var": 1.2239483973993377e-06, "learning_rate": 0.007135477016202167, "loss": 2.5643, "step": 10271 }, { "crossentropy": 2.658628463745117, "epoch": 0.3723897911832947, "grad_norm": 0.02993169240653515, "grad_norm_var": 5.347928545904393e-07, "learning_rate": 0.00713495157844856, "loss": 2.615, "step": 10272 }, { "crossentropy": 2.5272328853607178, "epoch": 0.3724260440835267, "grad_norm": 0.03132586181163788, "grad_norm_var": 1.08426441339988e-06, "learning_rate": 0.007134426111859195, "loss": 2.521, "step": 10273 }, { "crossentropy": 2.561311721801758, "epoch": 0.3724622969837587, "grad_norm": 0.027695761993527412, "grad_norm_var": 1.0976281987895921e-06, "learning_rate": 0.007133900616441165, "loss": 2.6024, "step": 10274 }, { "crossentropy": 2.462110757827759, "epoch": 0.3724985498839907, "grad_norm": 0.02898608148097992, "grad_norm_var": 1.0941515974424344e-06, "learning_rate": 0.007133375092201566, "loss": 2.5317, "step": 10275 }, { "crossentropy": 2.5784053802490234, "epoch": 0.37253480278422274, "grad_norm": 0.0272524356842041, "grad_norm_var": 1.2010114425160697e-06, "learning_rate": 0.007132849539147504, "loss": 2.5765, "step": 10276 }, { "crossentropy": 2.5581023693084717, "epoch": 0.37257105568445475, "grad_norm": 0.026873817667365074, "grad_norm_var": 1.3726061666728077e-06, "learning_rate": 0.007132323957286068, "loss": 2.6326, "step": 10277 }, { "crossentropy": 2.620194435119629, "epoch": 0.37260730858468677, "grad_norm": 0.02785254456102848, "grad_norm_var": 1.3921741852287428e-06, "learning_rate": 0.007131798346624364, "loss": 2.6473, "step": 10278 }, { "crossentropy": 2.682292938232422, "epoch": 0.3726435614849188, "grad_norm": 0.028174495324492455, "grad_norm_var": 1.3416863133284065e-06, "learning_rate": 0.007131272707169486, "loss": 2.7128, "step": 10279 }, { "crossentropy": 2.6580629348754883, "epoch": 0.3726798143851508, "grad_norm": 0.027287764474749565, "grad_norm_var": 1.4149502607830548e-06, "learning_rate": 0.007130747038928538, "loss": 2.6353, "step": 10280 }, { "crossentropy": 2.6645984649658203, "epoch": 0.3727160672853828, "grad_norm": 0.02800387144088745, "grad_norm_var": 1.4041576149373953e-06, "learning_rate": 0.007130221341908616, "loss": 2.63, "step": 10281 }, { "crossentropy": 2.592803716659546, "epoch": 0.3727523201856148, "grad_norm": 0.027524545788764954, "grad_norm_var": 1.3983073864798356e-06, "learning_rate": 0.007129695616116823, "loss": 2.5326, "step": 10282 }, { "crossentropy": 2.6245980262756348, "epoch": 0.37278857308584684, "grad_norm": 0.027004554867744446, "grad_norm_var": 1.4821430673132938e-06, "learning_rate": 0.007129169861560257, "loss": 2.6009, "step": 10283 }, { "crossentropy": 2.4841692447662354, "epoch": 0.3728248259860789, "grad_norm": 0.026810279116034508, "grad_norm_var": 1.6168494172211685e-06, "learning_rate": 0.0071286440782460225, "loss": 2.7338, "step": 10284 }, { "crossentropy": 2.620759963989258, "epoch": 0.3728610788863109, "grad_norm": 0.02737092226743698, "grad_norm_var": 1.620276552793028e-06, "learning_rate": 0.0071281182661812196, "loss": 2.6355, "step": 10285 }, { "crossentropy": 2.5537071228027344, "epoch": 0.37289733178654294, "grad_norm": 0.02834033966064453, "grad_norm_var": 1.4365043756222584e-06, "learning_rate": 0.00712759242537295, "loss": 2.5696, "step": 10286 }, { "crossentropy": 2.6477770805358887, "epoch": 0.37293358468677495, "grad_norm": 0.028702327981591225, "grad_norm_var": 1.4533821842752439e-06, "learning_rate": 0.007127066555828314, "loss": 2.6624, "step": 10287 }, { "crossentropy": 2.531162977218628, "epoch": 0.37296983758700697, "grad_norm": 0.026815613731741905, "grad_norm_var": 1.2872123053975572e-06, "learning_rate": 0.007126540657554419, "loss": 2.6007, "step": 10288 }, { "crossentropy": 2.556112289428711, "epoch": 0.373006090487239, "grad_norm": 0.027674296870827675, "grad_norm_var": 4.4108889817547935e-07, "learning_rate": 0.007126014730558364, "loss": 2.5526, "step": 10289 }, { "crossentropy": 2.7126665115356445, "epoch": 0.373042343387471, "grad_norm": 0.02920776605606079, "grad_norm_var": 5.935816830143192e-07, "learning_rate": 0.007125488774847253, "loss": 2.7139, "step": 10290 }, { "crossentropy": 2.737653970718384, "epoch": 0.373078596287703, "grad_norm": 0.02925308793783188, "grad_norm_var": 6.423063454959072e-07, "learning_rate": 0.00712496279042819, "loss": 2.7293, "step": 10291 }, { "crossentropy": 2.546708822250366, "epoch": 0.373114849187935, "grad_norm": 0.029809167608618736, "grad_norm_var": 8.780753149002408e-07, "learning_rate": 0.007124436777308283, "loss": 2.5883, "step": 10292 }, { "crossentropy": 2.689703941345215, "epoch": 0.37315110208816704, "grad_norm": 0.028622450307011604, "grad_norm_var": 8.254769131411556e-07, "learning_rate": 0.007123910735494631, "loss": 2.649, "step": 10293 }, { "crossentropy": 2.4154720306396484, "epoch": 0.37318735498839906, "grad_norm": 0.028700605034828186, "grad_norm_var": 8.505451463594026e-07, "learning_rate": 0.007123384664994341, "loss": 2.4708, "step": 10294 }, { "crossentropy": 2.466797113418579, "epoch": 0.37322360788863107, "grad_norm": 0.027297768741846085, "grad_norm_var": 8.877009113333513e-07, "learning_rate": 0.007122858565814519, "loss": 2.5849, "step": 10295 }, { "crossentropy": 2.5344719886779785, "epoch": 0.3732598607888631, "grad_norm": 0.026691699400544167, "grad_norm_var": 9.686247838252739e-07, "learning_rate": 0.007122332437962271, "loss": 2.5843, "step": 10296 }, { "crossentropy": 2.5902962684631348, "epoch": 0.37329611368909515, "grad_norm": 0.02716284990310669, "grad_norm_var": 1.0112016063466006e-06, "learning_rate": 0.007121806281444704, "loss": 2.5685, "step": 10297 }, { "crossentropy": 2.70090389251709, "epoch": 0.37333236658932717, "grad_norm": 0.02828991413116455, "grad_norm_var": 1.0057465704964847e-06, "learning_rate": 0.0071212800962689215, "loss": 2.7098, "step": 10298 }, { "crossentropy": 2.5941970348358154, "epoch": 0.3733686194895592, "grad_norm": 0.028357939794659615, "grad_norm_var": 9.433738098074222e-07, "learning_rate": 0.007120753882442031, "loss": 2.6601, "step": 10299 }, { "crossentropy": 2.693695545196533, "epoch": 0.3734048723897912, "grad_norm": 0.028925973922014236, "grad_norm_var": 8.680047980034646e-07, "learning_rate": 0.007120227639971143, "loss": 2.6658, "step": 10300 }, { "crossentropy": 2.634575128555298, "epoch": 0.3734411252900232, "grad_norm": 0.03060065768659115, "grad_norm_var": 1.162315745382482e-06, "learning_rate": 0.0071197013688633634, "loss": 2.6637, "step": 10301 }, { "crossentropy": 2.614931583404541, "epoch": 0.3734773781902552, "grad_norm": 0.02923942729830742, "grad_norm_var": 1.2052931315786886e-06, "learning_rate": 0.0071191750691258, "loss": 2.6282, "step": 10302 }, { "crossentropy": 2.6069443225860596, "epoch": 0.37351363109048724, "grad_norm": 0.03054567612707615, "grad_norm_var": 1.4773530746942098e-06, "learning_rate": 0.007118648740765558, "loss": 2.5773, "step": 10303 }, { "crossentropy": 2.631441593170166, "epoch": 0.37354988399071926, "grad_norm": 0.02913566119968891, "grad_norm_var": 1.2696175936410236e-06, "learning_rate": 0.007118122383789754, "loss": 2.5892, "step": 10304 }, { "crossentropy": 2.699575901031494, "epoch": 0.37358613689095127, "grad_norm": 0.029652263969182968, "grad_norm_var": 1.2384408952042924e-06, "learning_rate": 0.007117595998205491, "loss": 2.616, "step": 10305 }, { "crossentropy": 2.491790533065796, "epoch": 0.3736223897911833, "grad_norm": 0.03043298050761223, "grad_norm_var": 1.3918015615571685e-06, "learning_rate": 0.00711706958401988, "loss": 2.5117, "step": 10306 }, { "crossentropy": 2.6141815185546875, "epoch": 0.3736586426914153, "grad_norm": 0.028193099424242973, "grad_norm_var": 1.414932546249063e-06, "learning_rate": 0.007116543141240031, "loss": 2.5537, "step": 10307 }, { "crossentropy": 2.533268690109253, "epoch": 0.3736948955916473, "grad_norm": 0.028706151992082596, "grad_norm_var": 1.3504435473631378e-06, "learning_rate": 0.0071160166698730565, "loss": 2.5765, "step": 10308 }, { "crossentropy": 2.606090784072876, "epoch": 0.37373114849187933, "grad_norm": 0.02697567641735077, "grad_norm_var": 1.5555590981268403e-06, "learning_rate": 0.007115490169926067, "loss": 2.6561, "step": 10309 }, { "crossentropy": 2.654279947280884, "epoch": 0.37376740139211134, "grad_norm": 0.02781733311712742, "grad_norm_var": 1.6021016719187554e-06, "learning_rate": 0.0071149636414061706, "loss": 2.6599, "step": 10310 }, { "crossentropy": 2.5452661514282227, "epoch": 0.3738036542923434, "grad_norm": 0.02952774241566658, "grad_norm_var": 1.5178092020672138e-06, "learning_rate": 0.007114437084320479, "loss": 2.552, "step": 10311 }, { "crossentropy": 2.5896902084350586, "epoch": 0.3738399071925754, "grad_norm": 0.027863996103405952, "grad_norm_var": 1.2794848810698668e-06, "learning_rate": 0.0071139104986761076, "loss": 2.6072, "step": 10312 }, { "crossentropy": 2.5135371685028076, "epoch": 0.37387616009280744, "grad_norm": 0.026773132383823395, "grad_norm_var": 1.3760848996415085e-06, "learning_rate": 0.007113383884480167, "loss": 2.5665, "step": 10313 }, { "crossentropy": 2.661567449569702, "epoch": 0.37391241299303946, "grad_norm": 0.02787940576672554, "grad_norm_var": 1.4153493854176704e-06, "learning_rate": 0.00711285724173977, "loss": 2.6463, "step": 10314 }, { "crossentropy": 2.6787614822387695, "epoch": 0.37394866589327147, "grad_norm": 0.02809942327439785, "grad_norm_var": 1.4343911851698166e-06, "learning_rate": 0.0071123305704620275, "loss": 2.6589, "step": 10315 }, { "crossentropy": 2.706052780151367, "epoch": 0.3739849187935035, "grad_norm": 0.027667628601193428, "grad_norm_var": 1.507696183685308e-06, "learning_rate": 0.0071118038706540565, "loss": 2.6919, "step": 10316 }, { "crossentropy": 2.6383728981018066, "epoch": 0.3740211716937355, "grad_norm": 0.028595460578799248, "grad_norm_var": 1.249338427289764e-06, "learning_rate": 0.00711127714232297, "loss": 2.6586, "step": 10317 }, { "crossentropy": 2.6563713550567627, "epoch": 0.3740574245939675, "grad_norm": 0.03110053390264511, "grad_norm_var": 1.6321692520091893e-06, "learning_rate": 0.007110750385475881, "loss": 2.6627, "step": 10318 }, { "crossentropy": 2.6378448009490967, "epoch": 0.37409367749419953, "grad_norm": 0.03452019765973091, "grad_norm_var": 3.605306017758038e-06, "learning_rate": 0.007110223600119906, "loss": 2.5952, "step": 10319 }, { "crossentropy": 2.7073140144348145, "epoch": 0.37412993039443154, "grad_norm": 0.029922842979431152, "grad_norm_var": 3.6652220652348687e-06, "learning_rate": 0.007109696786262158, "loss": 2.6978, "step": 10320 }, { "crossentropy": 2.550199031829834, "epoch": 0.37416618329466356, "grad_norm": 0.027889840304851532, "grad_norm_var": 3.702083494985277e-06, "learning_rate": 0.0071091699439097536, "loss": 2.6329, "step": 10321 }, { "crossentropy": 2.5070881843566895, "epoch": 0.37420243619489557, "grad_norm": 0.029220057651400566, "grad_norm_var": 3.541721738394574e-06, "learning_rate": 0.0071086430730698084, "loss": 2.5706, "step": 10322 }, { "crossentropy": 2.512045383453369, "epoch": 0.3742386890951276, "grad_norm": 0.0307930801063776, "grad_norm_var": 3.754853493663718e-06, "learning_rate": 0.007108116173749438, "loss": 2.6142, "step": 10323 }, { "crossentropy": 2.615675687789917, "epoch": 0.37427494199535966, "grad_norm": 0.03297598659992218, "grad_norm_var": 4.750069681572891e-06, "learning_rate": 0.00710758924595576, "loss": 2.5577, "step": 10324 }, { "crossentropy": 2.5097334384918213, "epoch": 0.37431119489559167, "grad_norm": 0.03812946006655693, "grad_norm_var": 9.178294876935435e-06, "learning_rate": 0.007107062289695891, "loss": 2.5272, "step": 10325 }, { "crossentropy": 2.747553586959839, "epoch": 0.3743474477958237, "grad_norm": 0.0306609645485878, "grad_norm_var": 8.885127000108952e-06, "learning_rate": 0.007106535304976948, "loss": 2.64, "step": 10326 }, { "crossentropy": 2.5738155841827393, "epoch": 0.3743837006960557, "grad_norm": 0.027744030579924583, "grad_norm_var": 9.220371374497984e-06, "learning_rate": 0.0071060082918060485, "loss": 2.5297, "step": 10327 }, { "crossentropy": 2.6350257396698, "epoch": 0.3744199535962877, "grad_norm": 0.028066636994481087, "grad_norm_var": 9.165502474743033e-06, "learning_rate": 0.007105481250190311, "loss": 2.5445, "step": 10328 }, { "crossentropy": 2.5123438835144043, "epoch": 0.37445620649651973, "grad_norm": 0.027834223583340645, "grad_norm_var": 8.778996646423229e-06, "learning_rate": 0.007104954180136854, "loss": 2.4931, "step": 10329 }, { "crossentropy": 2.5585739612579346, "epoch": 0.37449245939675174, "grad_norm": 0.028470907360315323, "grad_norm_var": 8.628198150767371e-06, "learning_rate": 0.007104427081652798, "loss": 2.6322, "step": 10330 }, { "crossentropy": 2.6599764823913574, "epoch": 0.37452871229698376, "grad_norm": 0.027478253468871117, "grad_norm_var": 8.818479419841057e-06, "learning_rate": 0.007103899954745259, "loss": 2.6537, "step": 10331 }, { "crossentropy": 2.5981452465057373, "epoch": 0.37456496519721577, "grad_norm": 0.028676725924015045, "grad_norm_var": 8.559311125884529e-06, "learning_rate": 0.007103372799421359, "loss": 2.6308, "step": 10332 }, { "crossentropy": 2.7619874477386475, "epoch": 0.3746012180974478, "grad_norm": 0.03084506094455719, "grad_norm_var": 8.415339883686921e-06, "learning_rate": 0.007102845615688217, "loss": 2.6813, "step": 10333 }, { "crossentropy": 2.6359055042266846, "epoch": 0.3746374709976798, "grad_norm": 0.03131226450204849, "grad_norm_var": 8.44157280961427e-06, "learning_rate": 0.007102318403552954, "loss": 2.5879, "step": 10334 }, { "crossentropy": 2.502138137817383, "epoch": 0.3746737238979118, "grad_norm": 0.032238587737083435, "grad_norm_var": 7.47815246996524e-06, "learning_rate": 0.0071017911630226905, "loss": 2.6279, "step": 10335 }, { "crossentropy": 2.572826385498047, "epoch": 0.37470997679814383, "grad_norm": 0.028719259425997734, "grad_norm_var": 7.603729486683437e-06, "learning_rate": 0.007101263894104548, "loss": 2.5004, "step": 10336 }, { "crossentropy": 2.4782581329345703, "epoch": 0.37474622969837584, "grad_norm": 0.026839490979909897, "grad_norm_var": 7.977439530090882e-06, "learning_rate": 0.007100736596805647, "loss": 2.5093, "step": 10337 }, { "crossentropy": 2.5873687267303467, "epoch": 0.3747824825986079, "grad_norm": 0.027435271069407463, "grad_norm_var": 8.362209286095453e-06, "learning_rate": 0.007100209271133111, "loss": 2.6569, "step": 10338 }, { "crossentropy": 2.6853013038635254, "epoch": 0.37481873549883993, "grad_norm": 0.02735111676156521, "grad_norm_var": 8.687636803139303e-06, "learning_rate": 0.007099681917094062, "loss": 2.6437, "step": 10339 }, { "crossentropy": 2.583172082901001, "epoch": 0.37485498839907194, "grad_norm": 0.02764221467077732, "grad_norm_var": 8.117178535112697e-06, "learning_rate": 0.007099154534695621, "loss": 2.5795, "step": 10340 }, { "crossentropy": 2.5787951946258545, "epoch": 0.37489124129930396, "grad_norm": 0.027018561959266663, "grad_norm_var": 2.8121731392849804e-06, "learning_rate": 0.007098627123944914, "loss": 2.597, "step": 10341 }, { "crossentropy": 2.5499916076660156, "epoch": 0.37492749419953597, "grad_norm": 0.02714671939611435, "grad_norm_var": 2.639828008103341e-06, "learning_rate": 0.007098099684849062, "loss": 2.5499, "step": 10342 }, { "crossentropy": 2.6609244346618652, "epoch": 0.374963747099768, "grad_norm": 0.02774634025990963, "grad_norm_var": 2.6396182600302643e-06, "learning_rate": 0.007097572217415189, "loss": 2.6659, "step": 10343 }, { "crossentropy": 2.6756985187530518, "epoch": 0.375, "grad_norm": 0.028095882385969162, "grad_norm_var": 2.6382690476095857e-06, "learning_rate": 0.007097044721650422, "loss": 2.7316, "step": 10344 }, { "crossentropy": 2.6391870975494385, "epoch": 0.375036252900232, "grad_norm": 0.02692185528576374, "grad_norm_var": 2.762549312567544e-06, "learning_rate": 0.007096517197561883, "loss": 2.6676, "step": 10345 }, { "crossentropy": 2.542956829071045, "epoch": 0.37507250580046403, "grad_norm": 0.027774887159466743, "grad_norm_var": 2.7835699687204945e-06, "learning_rate": 0.007095989645156696, "loss": 2.5635, "step": 10346 }, { "crossentropy": 2.618204355239868, "epoch": 0.37510875870069604, "grad_norm": 0.02811465971171856, "grad_norm_var": 2.7368079537889423e-06, "learning_rate": 0.00709546206444199, "loss": 2.64, "step": 10347 }, { "crossentropy": 2.7650256156921387, "epoch": 0.37514501160092806, "grad_norm": 0.026891594752669334, "grad_norm_var": 2.8623587159719616e-06, "learning_rate": 0.0070949344554248886, "loss": 2.6165, "step": 10348 }, { "crossentropy": 2.6838059425354004, "epoch": 0.37518126450116007, "grad_norm": 0.028594879433512688, "grad_norm_var": 2.4019932034718046e-06, "learning_rate": 0.007094406818112518, "loss": 2.6682, "step": 10349 }, { "crossentropy": 2.627427816390991, "epoch": 0.3752175174013921, "grad_norm": 0.02924758568406105, "grad_norm_var": 1.7883094856585915e-06, "learning_rate": 0.007093879152512004, "loss": 2.627, "step": 10350 }, { "crossentropy": 2.7012579441070557, "epoch": 0.37525377030162416, "grad_norm": 0.028298959136009216, "grad_norm_var": 5.246315034035734e-07, "learning_rate": 0.007093351458630475, "loss": 2.6866, "step": 10351 }, { "crossentropy": 2.519794464111328, "epoch": 0.37529002320185617, "grad_norm": 0.028574414551258087, "grad_norm_var": 5.070297898464391e-07, "learning_rate": 0.007092823736475058, "loss": 2.6352, "step": 10352 }, { "crossentropy": 2.6476829051971436, "epoch": 0.3753262761020882, "grad_norm": 0.029695698991417885, "grad_norm_var": 6.774259845423435e-07, "learning_rate": 0.007092295986052881, "loss": 2.6675, "step": 10353 }, { "crossentropy": 2.587376832962036, "epoch": 0.3753625290023202, "grad_norm": 0.027141857892274857, "grad_norm_var": 7.013560368928357e-07, "learning_rate": 0.007091768207371071, "loss": 2.5971, "step": 10354 }, { "crossentropy": 2.545438289642334, "epoch": 0.3753987819025522, "grad_norm": 0.027897275984287262, "grad_norm_var": 6.806786049389756e-07, "learning_rate": 0.007091240400436757, "loss": 2.6073, "step": 10355 }, { "crossentropy": 2.603890895843506, "epoch": 0.37543503480278423, "grad_norm": 0.027502035722136497, "grad_norm_var": 6.871961020752289e-07, "learning_rate": 0.007090712565257067, "loss": 2.5841, "step": 10356 }, { "crossentropy": 2.722752571105957, "epoch": 0.37547128770301624, "grad_norm": 0.027968697249889374, "grad_norm_var": 6.298696712795741e-07, "learning_rate": 0.007090184701839132, "loss": 2.6854, "step": 10357 }, { "crossentropy": 2.7207775115966797, "epoch": 0.37550754060324826, "grad_norm": 0.029040396213531494, "grad_norm_var": 6.446520734511296e-07, "learning_rate": 0.0070896568101900806, "loss": 2.7282, "step": 10358 }, { "crossentropy": 2.6070175170898438, "epoch": 0.37554379350348027, "grad_norm": 0.02824314869940281, "grad_norm_var": 6.370363604290529e-07, "learning_rate": 0.0070891288903170415, "loss": 2.6237, "step": 10359 }, { "crossentropy": 2.690002679824829, "epoch": 0.3755800464037123, "grad_norm": 0.029063735157251358, "grad_norm_var": 6.917941239317193e-07, "learning_rate": 0.007088600942227149, "loss": 2.6232, "step": 10360 }, { "crossentropy": 2.6169538497924805, "epoch": 0.3756162993039443, "grad_norm": 0.026560284197330475, "grad_norm_var": 7.608957228856603e-07, "learning_rate": 0.00708807296592753, "loss": 2.5924, "step": 10361 }, { "crossentropy": 2.638956069946289, "epoch": 0.3756525522041763, "grad_norm": 0.028429701924324036, "grad_norm_var": 7.537975021956282e-07, "learning_rate": 0.007087544961425316, "loss": 2.6558, "step": 10362 }, { "crossentropy": 2.5695645809173584, "epoch": 0.37568880510440833, "grad_norm": 0.026539327576756477, "grad_norm_var": 9.276795247420769e-07, "learning_rate": 0.007087016928727639, "loss": 2.6144, "step": 10363 }, { "crossentropy": 2.634781837463379, "epoch": 0.37572505800464034, "grad_norm": 0.026395317167043686, "grad_norm_var": 1.0234038573484708e-06, "learning_rate": 0.007086488867841633, "loss": 2.4646, "step": 10364 }, { "crossentropy": 2.532465696334839, "epoch": 0.3757613109048724, "grad_norm": 0.027825376018881798, "grad_norm_var": 1.0070295958174244e-06, "learning_rate": 0.007085960778774427, "loss": 2.5164, "step": 10365 }, { "crossentropy": 2.4574341773986816, "epoch": 0.37579756380510443, "grad_norm": 0.02986295521259308, "grad_norm_var": 1.1308872271459003e-06, "learning_rate": 0.007085432661533156, "loss": 2.5495, "step": 10366 }, { "crossentropy": 2.511120557785034, "epoch": 0.37583381670533644, "grad_norm": 0.028333796188235283, "grad_norm_var": 1.1320500421888383e-06, "learning_rate": 0.007084904516124951, "loss": 2.5893, "step": 10367 }, { "crossentropy": 2.541492223739624, "epoch": 0.37587006960556846, "grad_norm": 0.02815164253115654, "grad_norm_var": 1.1146254090980966e-06, "learning_rate": 0.007084376342556949, "loss": 2.6007, "step": 10368 }, { "crossentropy": 2.5731923580169678, "epoch": 0.37590632250580047, "grad_norm": 0.028567560017108917, "grad_norm_var": 9.452269296780404e-07, "learning_rate": 0.0070838481408362795, "loss": 2.5146, "step": 10369 }, { "crossentropy": 2.7093381881713867, "epoch": 0.3759425754060325, "grad_norm": 0.028116922825574875, "grad_norm_var": 8.969580101482784e-07, "learning_rate": 0.007083319910970078, "loss": 2.6641, "step": 10370 }, { "crossentropy": 2.533980369567871, "epoch": 0.3759788283062645, "grad_norm": 0.026678703725337982, "grad_norm_var": 1.0115144536135585e-06, "learning_rate": 0.00708279165296548, "loss": 2.5233, "step": 10371 }, { "crossentropy": 2.680807590484619, "epoch": 0.3760150812064965, "grad_norm": 0.03408085182309151, "grad_norm_var": 3.3192585323364606e-06, "learning_rate": 0.007082263366829622, "loss": 2.681, "step": 10372 }, { "crossentropy": 2.548593282699585, "epoch": 0.37605133410672853, "grad_norm": 0.029702940955758095, "grad_norm_var": 3.4153293797381842e-06, "learning_rate": 0.007081735052569636, "loss": 2.5457, "step": 10373 }, { "crossentropy": 2.78913950920105, "epoch": 0.37608758700696054, "grad_norm": 0.027665311470627785, "grad_norm_var": 3.429761540274754e-06, "learning_rate": 0.007081206710192658, "loss": 2.7307, "step": 10374 }, { "crossentropy": 2.7299585342407227, "epoch": 0.37612383990719256, "grad_norm": 0.03380636125802994, "grad_norm_var": 5.256205849531283e-06, "learning_rate": 0.007080678339705824, "loss": 2.6939, "step": 10375 }, { "crossentropy": 2.6445400714874268, "epoch": 0.3761600928074246, "grad_norm": 0.029840463772416115, "grad_norm_var": 5.3278230699037195e-06, "learning_rate": 0.007080149941116274, "loss": 2.6251, "step": 10376 }, { "crossentropy": 2.6467411518096924, "epoch": 0.3761963457076566, "grad_norm": 0.02915761061012745, "grad_norm_var": 4.979066602397404e-06, "learning_rate": 0.007079621514431141, "loss": 2.6173, "step": 10377 }, { "crossentropy": 2.66020131111145, "epoch": 0.37623259860788866, "grad_norm": 0.02726484090089798, "grad_norm_var": 5.144244574667893e-06, "learning_rate": 0.007079093059657564, "loss": 2.6384, "step": 10378 }, { "crossentropy": 2.532102108001709, "epoch": 0.37626885150812067, "grad_norm": 0.029000908136367798, "grad_norm_var": 4.7565684753104905e-06, "learning_rate": 0.007078564576802679, "loss": 2.5944, "step": 10379 }, { "crossentropy": 2.577752113342285, "epoch": 0.3763051044083527, "grad_norm": 0.02851363830268383, "grad_norm_var": 4.293378504293755e-06, "learning_rate": 0.007078036065873626, "loss": 2.551, "step": 10380 }, { "crossentropy": 2.485261917114258, "epoch": 0.3763413573085847, "grad_norm": 0.028822680935263634, "grad_norm_var": 4.1779896558286844e-06, "learning_rate": 0.0070775075268775444, "loss": 2.5758, "step": 10381 }, { "crossentropy": 2.601856231689453, "epoch": 0.3763776102088167, "grad_norm": 0.02810066193342209, "grad_norm_var": 4.221710754844287e-06, "learning_rate": 0.00707697895982157, "loss": 2.5861, "step": 10382 }, { "crossentropy": 2.729116439819336, "epoch": 0.37641386310904873, "grad_norm": 0.028901981189846992, "grad_norm_var": 4.182871669517483e-06, "learning_rate": 0.0070764503647128395, "loss": 2.7231, "step": 10383 }, { "crossentropy": 2.6529653072357178, "epoch": 0.37645011600928074, "grad_norm": 0.027087954804301262, "grad_norm_var": 4.3949396043236125e-06, "learning_rate": 0.0070759217415584975, "loss": 2.674, "step": 10384 }, { "crossentropy": 2.653757095336914, "epoch": 0.37648636890951276, "grad_norm": 0.026090949773788452, "grad_norm_var": 4.948111330709398e-06, "learning_rate": 0.007075393090365685, "loss": 2.576, "step": 10385 }, { "crossentropy": 2.463203191757202, "epoch": 0.3765226218097448, "grad_norm": 0.026716096326708794, "grad_norm_var": 5.2220688205730495e-06, "learning_rate": 0.007074864411141536, "loss": 2.5467, "step": 10386 }, { "crossentropy": 2.7015225887298584, "epoch": 0.3765588747099768, "grad_norm": 0.027434295043349266, "grad_norm_var": 5.040060967442747e-06, "learning_rate": 0.007074335703893195, "loss": 2.645, "step": 10387 }, { "crossentropy": 2.64774227142334, "epoch": 0.3765951276102088, "grad_norm": 0.027088848873972893, "grad_norm_var": 3.2532511286086868e-06, "learning_rate": 0.007073806968627801, "loss": 2.6934, "step": 10388 }, { "crossentropy": 2.5304481983184814, "epoch": 0.3766313805104408, "grad_norm": 0.02622709982097149, "grad_norm_var": 3.427544213548345e-06, "learning_rate": 0.0070732782053525, "loss": 2.6494, "step": 10389 }, { "crossentropy": 2.670637845993042, "epoch": 0.37666763341067283, "grad_norm": 0.02788960561156273, "grad_norm_var": 3.4137267345519045e-06, "learning_rate": 0.00707274941407443, "loss": 2.546, "step": 10390 }, { "crossentropy": 2.675642490386963, "epoch": 0.37670388631090485, "grad_norm": 0.03052111528813839, "grad_norm_var": 1.6528776559831948e-06, "learning_rate": 0.007072220594800731, "loss": 2.5382, "step": 10391 }, { "crossentropy": 2.6705586910247803, "epoch": 0.3767401392111369, "grad_norm": 0.028595315292477608, "grad_norm_var": 1.4510592646500004e-06, "learning_rate": 0.00707169174753855, "loss": 2.6819, "step": 10392 }, { "crossentropy": 2.657108783721924, "epoch": 0.37677639211136893, "grad_norm": 0.028393471613526344, "grad_norm_var": 1.3658760775685087e-06, "learning_rate": 0.0070711628722950285, "loss": 2.5786, "step": 10393 }, { "crossentropy": 2.7280678749084473, "epoch": 0.37681264501160094, "grad_norm": 0.027888789772987366, "grad_norm_var": 1.3360700769061146e-06, "learning_rate": 0.0070706339690773095, "loss": 2.6749, "step": 10394 }, { "crossentropy": 2.569075345993042, "epoch": 0.37684889791183296, "grad_norm": 0.028386859223246574, "grad_norm_var": 1.2739705396185945e-06, "learning_rate": 0.007070105037892534, "loss": 2.5995, "step": 10395 }, { "crossentropy": 2.6013095378875732, "epoch": 0.376885150812065, "grad_norm": 0.027642596513032913, "grad_norm_var": 1.2520055106432823e-06, "learning_rate": 0.00706957607874785, "loss": 2.557, "step": 10396 }, { "crossentropy": 2.6381263732910156, "epoch": 0.376921403712297, "grad_norm": 0.02850966900587082, "grad_norm_var": 1.2180255003736243e-06, "learning_rate": 0.007069047091650401, "loss": 2.6013, "step": 10397 }, { "crossentropy": 2.657515525817871, "epoch": 0.376957656612529, "grad_norm": 0.028193769976496696, "grad_norm_var": 1.2217758854979418e-06, "learning_rate": 0.007068518076607331, "loss": 2.6336, "step": 10398 }, { "crossentropy": 2.644028663635254, "epoch": 0.376993909512761, "grad_norm": 0.02818579040467739, "grad_norm_var": 1.1531895764329276e-06, "learning_rate": 0.007067989033625784, "loss": 2.6593, "step": 10399 }, { "crossentropy": 2.681286096572876, "epoch": 0.37703016241299303, "grad_norm": 0.028116395696997643, "grad_norm_var": 1.1212081175877245e-06, "learning_rate": 0.0070674599627129076, "loss": 2.6534, "step": 10400 }, { "crossentropy": 2.501370429992676, "epoch": 0.37706641531322505, "grad_norm": 0.02801487036049366, "grad_norm_var": 8.968137649773348e-07, "learning_rate": 0.007066930863875847, "loss": 2.5953, "step": 10401 }, { "crossentropy": 2.592372179031372, "epoch": 0.37710266821345706, "grad_norm": 0.029134048148989677, "grad_norm_var": 8.522346132766936e-07, "learning_rate": 0.007066401737121748, "loss": 2.6585, "step": 10402 }, { "crossentropy": 2.7307426929473877, "epoch": 0.3771389211136891, "grad_norm": 0.03433748707175255, "grad_norm_var": 3.1820688410014725e-06, "learning_rate": 0.007065872582457757, "loss": 2.7341, "step": 10403 }, { "crossentropy": 2.529324769973755, "epoch": 0.3771751740139211, "grad_norm": 0.03420029953122139, "grad_norm_var": 4.938107010693989e-06, "learning_rate": 0.007065343399891023, "loss": 2.4853, "step": 10404 }, { "crossentropy": 2.5610787868499756, "epoch": 0.37721142691415316, "grad_norm": 0.02886762097477913, "grad_norm_var": 4.39240640102063e-06, "learning_rate": 0.007064814189428692, "loss": 2.5398, "step": 10405 }, { "crossentropy": 2.355997085571289, "epoch": 0.3772476798143852, "grad_norm": 0.02824871987104416, "grad_norm_var": 4.338686924629459e-06, "learning_rate": 0.007064284951077913, "loss": 2.5113, "step": 10406 }, { "crossentropy": 2.4885189533233643, "epoch": 0.3772839327146172, "grad_norm": 0.027385305613279343, "grad_norm_var": 4.401861643977406e-06, "learning_rate": 0.007063755684845831, "loss": 2.556, "step": 10407 }, { "crossentropy": 2.673907518386841, "epoch": 0.3773201856148492, "grad_norm": 0.027968231588602066, "grad_norm_var": 4.460802768098274e-06, "learning_rate": 0.0070632263907395975, "loss": 2.6677, "step": 10408 }, { "crossentropy": 2.633103370666504, "epoch": 0.3773564385150812, "grad_norm": 0.03094261698424816, "grad_norm_var": 4.671961452814644e-06, "learning_rate": 0.00706269706876636, "loss": 2.6111, "step": 10409 }, { "crossentropy": 2.581925630569458, "epoch": 0.37739269141531323, "grad_norm": 0.031888775527477264, "grad_norm_var": 5.011875535206378e-06, "learning_rate": 0.00706216771893327, "loss": 2.6926, "step": 10410 }, { "crossentropy": 2.575615406036377, "epoch": 0.37742894431554525, "grad_norm": 0.029959388077259064, "grad_norm_var": 4.958942346403502e-06, "learning_rate": 0.007061638341247474, "loss": 2.5839, "step": 10411 }, { "crossentropy": 2.6509461402893066, "epoch": 0.37746519721577726, "grad_norm": 0.03136882185935974, "grad_norm_var": 4.916483528874469e-06, "learning_rate": 0.007061108935716123, "loss": 2.683, "step": 10412 }, { "crossentropy": 2.6819980144500732, "epoch": 0.3775014501160093, "grad_norm": 0.030229050666093826, "grad_norm_var": 4.8266208466755944e-06, "learning_rate": 0.007060579502346369, "loss": 2.637, "step": 10413 }, { "crossentropy": 2.686944007873535, "epoch": 0.3775377030162413, "grad_norm": 0.02913322113454342, "grad_norm_var": 4.678696518857495e-06, "learning_rate": 0.007060050041145363, "loss": 2.5772, "step": 10414 }, { "crossentropy": 2.631990671157837, "epoch": 0.3775739559164733, "grad_norm": 0.02892780490219593, "grad_norm_var": 4.5461054022685165e-06, "learning_rate": 0.007059520552120252, "loss": 2.5934, "step": 10415 }, { "crossentropy": 2.733292818069458, "epoch": 0.3776102088167053, "grad_norm": 0.03689037635922432, "grad_norm_var": 7.247360195727553e-06, "learning_rate": 0.007058991035278192, "loss": 2.7064, "step": 10416 }, { "crossentropy": 2.54133677482605, "epoch": 0.37764646171693733, "grad_norm": 0.03906206041574478, "grad_norm_var": 1.126073184669433e-05, "learning_rate": 0.007058461490626333, "loss": 2.6285, "step": 10417 }, { "crossentropy": 2.610630989074707, "epoch": 0.37768271461716935, "grad_norm": 0.031184198334813118, "grad_norm_var": 1.0969902120885368e-05, "learning_rate": 0.00705793191817183, "loss": 2.5966, "step": 10418 }, { "crossentropy": 2.5419580936431885, "epoch": 0.3777189675174014, "grad_norm": 0.030504927039146423, "grad_norm_var": 1.0329174287762357e-05, "learning_rate": 0.00705740231792183, "loss": 2.6189, "step": 10419 }, { "crossentropy": 2.686732530593872, "epoch": 0.37775522041763343, "grad_norm": 0.02971464768052101, "grad_norm_var": 9.701146218647991e-06, "learning_rate": 0.007056872689883492, "loss": 2.6864, "step": 10420 }, { "crossentropy": 2.688615322113037, "epoch": 0.37779147331786544, "grad_norm": 0.028519203886389732, "grad_norm_var": 9.796981124969673e-06, "learning_rate": 0.007056343034063966, "loss": 2.6034, "step": 10421 }, { "crossentropy": 2.7245607376098633, "epoch": 0.37782772621809746, "grad_norm": 0.02845020964741707, "grad_norm_var": 9.732442843619311e-06, "learning_rate": 0.007055813350470407, "loss": 2.6166, "step": 10422 }, { "crossentropy": 2.600625991821289, "epoch": 0.3778639791183295, "grad_norm": 0.027831504121422768, "grad_norm_var": 9.544230880072629e-06, "learning_rate": 0.007055283639109969, "loss": 2.615, "step": 10423 }, { "crossentropy": 2.4106922149658203, "epoch": 0.3779002320185615, "grad_norm": 0.028614899143576622, "grad_norm_var": 9.327417658019559e-06, "learning_rate": 0.007054753899989805, "loss": 2.5034, "step": 10424 }, { "crossentropy": 2.701129913330078, "epoch": 0.3779364849187935, "grad_norm": 0.0285553727298975, "grad_norm_var": 9.646595515969122e-06, "learning_rate": 0.007054224133117073, "loss": 2.5948, "step": 10425 }, { "crossentropy": 2.5781402587890625, "epoch": 0.3779727378190255, "grad_norm": 0.03155093640089035, "grad_norm_var": 9.599151209237659e-06, "learning_rate": 0.007053694338498926, "loss": 2.5972, "step": 10426 }, { "crossentropy": 2.4079582691192627, "epoch": 0.37800899071925753, "grad_norm": 0.02880360744893551, "grad_norm_var": 9.789997249774288e-06, "learning_rate": 0.007053164516142521, "loss": 2.4403, "step": 10427 }, { "crossentropy": 2.464078426361084, "epoch": 0.37804524361948955, "grad_norm": 0.029201097786426544, "grad_norm_var": 9.856792461874347e-06, "learning_rate": 0.0070526346660550136, "loss": 2.5341, "step": 10428 }, { "crossentropy": 2.621774911880493, "epoch": 0.37808149651972156, "grad_norm": 0.02799072675406933, "grad_norm_var": 1.023536272896871e-05, "learning_rate": 0.007052104788243558, "loss": 2.5783, "step": 10429 }, { "crossentropy": 2.586378574371338, "epoch": 0.3781177494199536, "grad_norm": 0.02764621190726757, "grad_norm_var": 1.0606567628497126e-05, "learning_rate": 0.007051574882715315, "loss": 2.5853, "step": 10430 }, { "crossentropy": 2.5500333309173584, "epoch": 0.3781540023201856, "grad_norm": 0.029039643704891205, "grad_norm_var": 1.058814767041875e-05, "learning_rate": 0.007051044949477439, "loss": 2.6136, "step": 10431 }, { "crossentropy": 2.659862995147705, "epoch": 0.37819025522041766, "grad_norm": 0.03164416924118996, "grad_norm_var": 7.644157885625113e-06, "learning_rate": 0.007050514988537089, "loss": 2.6129, "step": 10432 }, { "crossentropy": 2.704909563064575, "epoch": 0.3782265081206497, "grad_norm": 0.029994094744324684, "grad_norm_var": 1.699365242494944e-06, "learning_rate": 0.007049984999901422, "loss": 2.6364, "step": 10433 }, { "crossentropy": 2.535464286804199, "epoch": 0.3782627610208817, "grad_norm": 0.028623973950743675, "grad_norm_var": 1.475344757500977e-06, "learning_rate": 0.007049454983577597, "loss": 2.5832, "step": 10434 }, { "crossentropy": 2.5621187686920166, "epoch": 0.3782990139211137, "grad_norm": 0.027722245082259178, "grad_norm_var": 1.463205498481758e-06, "learning_rate": 0.007048924939572772, "loss": 2.5336, "step": 10435 }, { "crossentropy": 2.6104512214660645, "epoch": 0.3783352668213457, "grad_norm": 0.034249212592840195, "grad_norm_var": 3.184112768003268e-06, "learning_rate": 0.007048394867894106, "loss": 2.6143, "step": 10436 }, { "crossentropy": 2.4487123489379883, "epoch": 0.37837151972157773, "grad_norm": 0.031800493597984314, "grad_norm_var": 3.5253621367662176e-06, "learning_rate": 0.0070478647685487585, "loss": 2.6039, "step": 10437 }, { "crossentropy": 2.670128583908081, "epoch": 0.37840777262180975, "grad_norm": 0.0305933877825737, "grad_norm_var": 3.517482233672759e-06, "learning_rate": 0.00704733464154389, "loss": 2.6853, "step": 10438 }, { "crossentropy": 2.5427417755126953, "epoch": 0.37844402552204176, "grad_norm": 0.028187474235892296, "grad_norm_var": 3.44068839985416e-06, "learning_rate": 0.007046804486886661, "loss": 2.548, "step": 10439 }, { "crossentropy": 2.674056053161621, "epoch": 0.3784802784222738, "grad_norm": 0.027010327205061913, "grad_norm_var": 3.820616942359203e-06, "learning_rate": 0.00704627430458423, "loss": 2.6574, "step": 10440 }, { "crossentropy": 2.7379302978515625, "epoch": 0.3785165313225058, "grad_norm": 0.02728458121418953, "grad_norm_var": 4.088096824342071e-06, "learning_rate": 0.00704574409464376, "loss": 2.6053, "step": 10441 }, { "crossentropy": 2.7253832817077637, "epoch": 0.3785527842227378, "grad_norm": 0.027232982218265533, "grad_norm_var": 4.0489422492931875e-06, "learning_rate": 0.007045213857072411, "loss": 2.6624, "step": 10442 }, { "crossentropy": 2.6093926429748535, "epoch": 0.3785890371229698, "grad_norm": 0.02703128568828106, "grad_norm_var": 4.3363378621115905e-06, "learning_rate": 0.007044683591877344, "loss": 2.6595, "step": 10443 }, { "crossentropy": 2.7206506729125977, "epoch": 0.37862529002320183, "grad_norm": 0.02900863066315651, "grad_norm_var": 4.335500384004762e-06, "learning_rate": 0.007044153299065725, "loss": 2.7261, "step": 10444 }, { "crossentropy": 2.5455639362335205, "epoch": 0.37866154292343385, "grad_norm": 0.028131738305091858, "grad_norm_var": 4.3165223139927786e-06, "learning_rate": 0.007043622978644712, "loss": 2.6029, "step": 10445 }, { "crossentropy": 2.5536341667175293, "epoch": 0.3786977958236659, "grad_norm": 0.028311531990766525, "grad_norm_var": 4.217438627967101e-06, "learning_rate": 0.007043092630621469, "loss": 2.5843, "step": 10446 }, { "crossentropy": 2.7492716312408447, "epoch": 0.37873404872389793, "grad_norm": 0.029233016073703766, "grad_norm_var": 4.217791242202597e-06, "learning_rate": 0.00704256225500316, "loss": 2.6498, "step": 10447 }, { "crossentropy": 2.498727560043335, "epoch": 0.37877030162412995, "grad_norm": 0.03018762916326523, "grad_norm_var": 3.8618673134024895e-06, "learning_rate": 0.007042031851796949, "loss": 2.4706, "step": 10448 }, { "crossentropy": 2.500734806060791, "epoch": 0.37880655452436196, "grad_norm": 0.03505588322877884, "grad_norm_var": 6.108724568963418e-06, "learning_rate": 0.007041501421009997, "loss": 2.5129, "step": 10449 }, { "crossentropy": 2.633054256439209, "epoch": 0.378842807424594, "grad_norm": 0.03417558595538139, "grad_norm_var": 7.494604944558759e-06, "learning_rate": 0.007040970962649472, "loss": 2.6512, "step": 10450 }, { "crossentropy": 2.6104273796081543, "epoch": 0.378879060324826, "grad_norm": 0.02802998013794422, "grad_norm_var": 7.4193327684675865e-06, "learning_rate": 0.007040440476722535, "loss": 2.6317, "step": 10451 }, { "crossentropy": 2.761289596557617, "epoch": 0.378915313225058, "grad_norm": 0.028060385957360268, "grad_norm_var": 6.075972563915792e-06, "learning_rate": 0.007039909963236355, "loss": 2.6784, "step": 10452 }, { "crossentropy": 2.4861698150634766, "epoch": 0.37895156612529, "grad_norm": 0.027847664430737495, "grad_norm_var": 5.7522765232839725e-06, "learning_rate": 0.007039379422198094, "loss": 2.5473, "step": 10453 }, { "crossentropy": 2.3861660957336426, "epoch": 0.37898781902552203, "grad_norm": 0.027508575469255447, "grad_norm_var": 5.727185998714336e-06, "learning_rate": 0.0070388488536149195, "loss": 2.5133, "step": 10454 }, { "crossentropy": 2.5749034881591797, "epoch": 0.37902407192575405, "grad_norm": 0.02947409264743328, "grad_norm_var": 5.709515945174064e-06, "learning_rate": 0.007038318257493997, "loss": 2.594, "step": 10455 }, { "crossentropy": 2.651418447494507, "epoch": 0.37906032482598606, "grad_norm": 0.027572933584451675, "grad_norm_var": 5.581996018199489e-06, "learning_rate": 0.007037787633842493, "loss": 2.6596, "step": 10456 }, { "crossentropy": 2.6720023155212402, "epoch": 0.3790965777262181, "grad_norm": 0.027505958452820778, "grad_norm_var": 5.534154795615078e-06, "learning_rate": 0.0070372569826675745, "loss": 2.6482, "step": 10457 }, { "crossentropy": 2.5833606719970703, "epoch": 0.3791328306264501, "grad_norm": 0.028818536549806595, "grad_norm_var": 5.312857643213707e-06, "learning_rate": 0.007036726303976408, "loss": 2.5971, "step": 10458 }, { "crossentropy": 2.4885387420654297, "epoch": 0.37916908352668216, "grad_norm": 0.027959084138274193, "grad_norm_var": 5.108012356472233e-06, "learning_rate": 0.007036195597776163, "loss": 2.5931, "step": 10459 }, { "crossentropy": 2.456693410873413, "epoch": 0.3792053364269142, "grad_norm": 0.0277167409658432, "grad_norm_var": 5.241855454304195e-06, "learning_rate": 0.007035664864074007, "loss": 2.4887, "step": 10460 }, { "crossentropy": 2.759033441543579, "epoch": 0.3792415893271462, "grad_norm": 0.02887328900396824, "grad_norm_var": 5.180554585345919e-06, "learning_rate": 0.007035134102877108, "loss": 2.6085, "step": 10461 }, { "crossentropy": 2.6503331661224365, "epoch": 0.3792778422273782, "grad_norm": 0.028776751831173897, "grad_norm_var": 5.1423397718813414e-06, "learning_rate": 0.007034603314192632, "loss": 2.6363, "step": 10462 }, { "crossentropy": 2.7747695446014404, "epoch": 0.3793140951276102, "grad_norm": 0.027645669877529144, "grad_norm_var": 5.2874886510010285e-06, "learning_rate": 0.0070340724980277524, "loss": 2.6911, "step": 10463 }, { "crossentropy": 2.521754503250122, "epoch": 0.37935034802784223, "grad_norm": 0.02702012099325657, "grad_norm_var": 5.444887571024931e-06, "learning_rate": 0.007033541654389637, "loss": 2.4453, "step": 10464 }, { "crossentropy": 2.649904727935791, "epoch": 0.37938660092807425, "grad_norm": 0.028919551521539688, "grad_norm_var": 2.743347561509031e-06, "learning_rate": 0.007033010783285457, "loss": 2.6261, "step": 10465 }, { "crossentropy": 2.4973976612091064, "epoch": 0.37942285382830626, "grad_norm": 0.03045336715877056, "grad_norm_var": 7.89560721992331e-07, "learning_rate": 0.007032479884722378, "loss": 2.5259, "step": 10466 }, { "crossentropy": 2.6840550899505615, "epoch": 0.3794591067285383, "grad_norm": 0.028052687644958496, "grad_norm_var": 7.888922292197207e-07, "learning_rate": 0.007031948958707575, "loss": 2.6683, "step": 10467 }, { "crossentropy": 2.5011179447174072, "epoch": 0.3794953596287703, "grad_norm": 0.028696421533823013, "grad_norm_var": 7.970071522700461e-07, "learning_rate": 0.007031418005248217, "loss": 2.5909, "step": 10468 }, { "crossentropy": 2.6865649223327637, "epoch": 0.3795316125290023, "grad_norm": 0.029515480622649193, "grad_norm_var": 8.696934523293986e-07, "learning_rate": 0.007030887024351478, "loss": 2.6908, "step": 10469 }, { "crossentropy": 2.603790044784546, "epoch": 0.3795678654292343, "grad_norm": 0.02938064932823181, "grad_norm_var": 8.645218397893932e-07, "learning_rate": 0.007030356016024526, "loss": 2.6536, "step": 10470 }, { "crossentropy": 2.712538003921509, "epoch": 0.37960411832946633, "grad_norm": 0.028543556109070778, "grad_norm_var": 8.00740364891433e-07, "learning_rate": 0.007029824980274535, "loss": 2.6792, "step": 10471 }, { "crossentropy": 2.5130860805511475, "epoch": 0.3796403712296984, "grad_norm": 0.028858348727226257, "grad_norm_var": 7.510028499124281e-07, "learning_rate": 0.007029293917108678, "loss": 2.5652, "step": 10472 }, { "crossentropy": 2.5631301403045654, "epoch": 0.3796766241299304, "grad_norm": 0.0280352383852005, "grad_norm_var": 6.951140659634668e-07, "learning_rate": 0.007028762826534128, "loss": 2.5868, "step": 10473 }, { "crossentropy": 2.574280261993408, "epoch": 0.37971287703016243, "grad_norm": 0.028546638786792755, "grad_norm_var": 6.910540496326967e-07, "learning_rate": 0.007028231708558056, "loss": 2.6556, "step": 10474 }, { "crossentropy": 2.7875232696533203, "epoch": 0.37974912993039445, "grad_norm": 0.02772536128759384, "grad_norm_var": 7.132600027432226e-07, "learning_rate": 0.007027700563187636, "loss": 2.6567, "step": 10475 }, { "crossentropy": 2.2425103187561035, "epoch": 0.37978538283062646, "grad_norm": 0.027346031740307808, "grad_norm_var": 7.629113676846464e-07, "learning_rate": 0.007027169390430044, "loss": 2.3611, "step": 10476 }, { "crossentropy": 2.7804224491119385, "epoch": 0.3798216357308585, "grad_norm": 0.02810797467827797, "grad_norm_var": 7.63908814162273e-07, "learning_rate": 0.007026638190292454, "loss": 2.6541, "step": 10477 }, { "crossentropy": 2.7550652027130127, "epoch": 0.3798578886310905, "grad_norm": 0.028277844190597534, "grad_norm_var": 7.594919358938027e-07, "learning_rate": 0.007026106962782039, "loss": 2.6293, "step": 10478 }, { "crossentropy": 2.7259416580200195, "epoch": 0.3798941415313225, "grad_norm": 0.0279031153768301, "grad_norm_var": 7.361858597444588e-07, "learning_rate": 0.007025575707905973, "loss": 2.6375, "step": 10479 }, { "crossentropy": 2.5164706707000732, "epoch": 0.3799303944315545, "grad_norm": 0.027373190969228745, "grad_norm_var": 6.7612739939279e-07, "learning_rate": 0.007025044425671434, "loss": 2.5171, "step": 10480 }, { "crossentropy": 2.4475514888763428, "epoch": 0.37996664733178653, "grad_norm": 0.028636453673243523, "grad_norm_var": 6.64675779027633e-07, "learning_rate": 0.007024513116085598, "loss": 2.6057, "step": 10481 }, { "crossentropy": 2.5019986629486084, "epoch": 0.38000290023201855, "grad_norm": 0.027542077004909515, "grad_norm_var": 4.2287279702900247e-07, "learning_rate": 0.00702398177915564, "loss": 2.6053, "step": 10482 }, { "crossentropy": 2.607815980911255, "epoch": 0.38003915313225056, "grad_norm": 0.028277946636080742, "grad_norm_var": 4.1910228944554614e-07, "learning_rate": 0.007023450414888735, "loss": 2.6098, "step": 10483 }, { "crossentropy": 2.744206190109253, "epoch": 0.3800754060324826, "grad_norm": 0.027352657169103622, "grad_norm_var": 4.6055537359206606e-07, "learning_rate": 0.007022919023292062, "loss": 2.6479, "step": 10484 }, { "crossentropy": 2.499746561050415, "epoch": 0.3801116589327146, "grad_norm": 0.027692867442965508, "grad_norm_var": 3.5187406711018053e-07, "learning_rate": 0.007022387604372797, "loss": 2.5099, "step": 10485 }, { "crossentropy": 2.67441725730896, "epoch": 0.38014791183294666, "grad_norm": 0.030814630910754204, "grad_norm_var": 7.252506020123066e-07, "learning_rate": 0.0070218561581381215, "loss": 2.5973, "step": 10486 }, { "crossentropy": 2.7386202812194824, "epoch": 0.3801841647331787, "grad_norm": 0.030731964856386185, "grad_norm_var": 1.1278454164369288e-06, "learning_rate": 0.007021324684595208, "loss": 2.7154, "step": 10487 }, { "crossentropy": 2.4719808101654053, "epoch": 0.3802204176334107, "grad_norm": 0.02785060927271843, "grad_norm_var": 1.1198406741319902e-06, "learning_rate": 0.0070207931837512364, "loss": 2.5592, "step": 10488 }, { "crossentropy": 2.5353825092315674, "epoch": 0.3802566705336427, "grad_norm": 0.027566425502300262, "grad_norm_var": 1.1478400736572927e-06, "learning_rate": 0.007020261655613387, "loss": 2.6023, "step": 10489 }, { "crossentropy": 2.7060723304748535, "epoch": 0.3802929234338747, "grad_norm": 0.030418818816542625, "grad_norm_var": 1.4449204295440086e-06, "learning_rate": 0.007019730100188839, "loss": 2.7235, "step": 10490 }, { "crossentropy": 2.4489951133728027, "epoch": 0.38032917633410673, "grad_norm": 0.02804451435804367, "grad_norm_var": 1.4246580902112313e-06, "learning_rate": 0.007019198517484769, "loss": 2.5579, "step": 10491 }, { "crossentropy": 2.698873281478882, "epoch": 0.38036542923433875, "grad_norm": 0.02962825819849968, "grad_norm_var": 1.4382769662761469e-06, "learning_rate": 0.00701866690750836, "loss": 2.7252, "step": 10492 }, { "crossentropy": 2.6242570877075195, "epoch": 0.38040168213457076, "grad_norm": 0.036205392330884933, "grad_norm_var": 5.09823404725527e-06, "learning_rate": 0.007018135270266791, "loss": 2.6835, "step": 10493 }, { "crossentropy": 2.6422739028930664, "epoch": 0.3804379350348028, "grad_norm": 0.03416905924677849, "grad_norm_var": 6.6845837776240145e-06, "learning_rate": 0.007017603605767244, "loss": 2.5749, "step": 10494 }, { "crossentropy": 2.5502281188964844, "epoch": 0.3804741879350348, "grad_norm": 0.03247903659939766, "grad_norm_var": 7.087313404629267e-06, "learning_rate": 0.007017071914016896, "loss": 2.648, "step": 10495 }, { "crossentropy": 2.8143625259399414, "epoch": 0.3805104408352668, "grad_norm": 0.02993643842637539, "grad_norm_var": 6.71161626925737e-06, "learning_rate": 0.007016540195022931, "loss": 2.754, "step": 10496 }, { "crossentropy": 2.4309630393981934, "epoch": 0.3805466937354988, "grad_norm": 0.026628365740180016, "grad_norm_var": 7.284332171100111e-06, "learning_rate": 0.0070160084487925324, "loss": 2.467, "step": 10497 }, { "crossentropy": 2.6336252689361572, "epoch": 0.38058294663573083, "grad_norm": 0.026867764070630074, "grad_norm_var": 7.507547607256632e-06, "learning_rate": 0.0070154766753328796, "loss": 2.674, "step": 10498 }, { "crossentropy": 2.6121082305908203, "epoch": 0.3806191995359629, "grad_norm": 0.026985056698322296, "grad_norm_var": 7.851394680682445e-06, "learning_rate": 0.007014944874651155, "loss": 2.6387, "step": 10499 }, { "crossentropy": 2.630523204803467, "epoch": 0.3806554524361949, "grad_norm": 0.027566032484173775, "grad_norm_var": 7.790708909478439e-06, "learning_rate": 0.007014413046754542, "loss": 2.6326, "step": 10500 }, { "crossentropy": 2.68084716796875, "epoch": 0.38069170533642693, "grad_norm": 0.02875475399196148, "grad_norm_var": 7.591293635089225e-06, "learning_rate": 0.007013881191650223, "loss": 2.5941, "step": 10501 }, { "crossentropy": 2.560142755508423, "epoch": 0.38072795823665895, "grad_norm": 0.02967180497944355, "grad_norm_var": 7.497812582581013e-06, "learning_rate": 0.007013349309345385, "loss": 2.6095, "step": 10502 }, { "crossentropy": 2.600271224975586, "epoch": 0.38076421113689096, "grad_norm": 0.027308015152812004, "grad_norm_var": 7.711024231724094e-06, "learning_rate": 0.007012817399847207, "loss": 2.7117, "step": 10503 }, { "crossentropy": 2.5369765758514404, "epoch": 0.380800464037123, "grad_norm": 0.028768323361873627, "grad_norm_var": 7.5765199110561576e-06, "learning_rate": 0.007012285463162875, "loss": 2.5405, "step": 10504 }, { "crossentropy": 2.5158822536468506, "epoch": 0.380836716937355, "grad_norm": 0.027152840048074722, "grad_norm_var": 7.690383925413463e-06, "learning_rate": 0.0070117534992995756, "loss": 2.5764, "step": 10505 }, { "crossentropy": 2.5481350421905518, "epoch": 0.380872969837587, "grad_norm": 0.027781769633293152, "grad_norm_var": 7.770841225854933e-06, "learning_rate": 0.007011221508264493, "loss": 2.5861, "step": 10506 }, { "crossentropy": 2.5950756072998047, "epoch": 0.380909222737819, "grad_norm": 0.026730820536613464, "grad_norm_var": 8.089279505244526e-06, "learning_rate": 0.0070106894900648095, "loss": 2.5888, "step": 10507 }, { "crossentropy": 2.500807046890259, "epoch": 0.38094547563805103, "grad_norm": 0.02850794419646263, "grad_norm_var": 8.098465663104606e-06, "learning_rate": 0.007010157444707715, "loss": 2.5104, "step": 10508 }, { "crossentropy": 2.6623916625976562, "epoch": 0.38098172853828305, "grad_norm": 0.028491374105215073, "grad_norm_var": 4.503879536729806e-06, "learning_rate": 0.007009625372200392, "loss": 2.7473, "step": 10509 }, { "crossentropy": 2.6041834354400635, "epoch": 0.38101798143851506, "grad_norm": 0.028660152107477188, "grad_norm_var": 2.3191963883251956e-06, "learning_rate": 0.00700909327255003, "loss": 2.6385, "step": 10510 }, { "crossentropy": 2.5544536113739014, "epoch": 0.3810542343387471, "grad_norm": 0.026390789076685905, "grad_norm_var": 1.217617443680039e-06, "learning_rate": 0.007008561145763814, "loss": 2.5641, "step": 10511 }, { "crossentropy": 2.4198977947235107, "epoch": 0.3810904872389791, "grad_norm": 0.026500077918171883, "grad_norm_var": 1.01693208852278e-06, "learning_rate": 0.0070080289918489315, "loss": 2.5041, "step": 10512 }, { "crossentropy": 2.6771249771118164, "epoch": 0.38112674013921116, "grad_norm": 0.026870407164096832, "grad_norm_var": 9.868852243335626e-07, "learning_rate": 0.00700749681081257, "loss": 2.6129, "step": 10513 }, { "crossentropy": 2.450671434402466, "epoch": 0.3811629930394432, "grad_norm": 0.02777394838631153, "grad_norm_var": 9.391042546922814e-07, "learning_rate": 0.007006964602661918, "loss": 2.5193, "step": 10514 }, { "crossentropy": 2.612414598464966, "epoch": 0.3811992459396752, "grad_norm": 0.027635877951979637, "grad_norm_var": 8.996642520702851e-07, "learning_rate": 0.007006432367404163, "loss": 2.652, "step": 10515 }, { "crossentropy": 2.542384624481201, "epoch": 0.3812354988399072, "grad_norm": 0.02578655257821083, "grad_norm_var": 1.1495997771813198e-06, "learning_rate": 0.007005900105046495, "loss": 2.4627, "step": 10516 }, { "crossentropy": 2.5946044921875, "epoch": 0.3812717517401392, "grad_norm": 0.029298951849341393, "grad_norm_var": 1.246521853719137e-06, "learning_rate": 0.007005367815596101, "loss": 2.6204, "step": 10517 }, { "crossentropy": 2.5544941425323486, "epoch": 0.38130800464037123, "grad_norm": 0.02783641219139099, "grad_norm_var": 9.765082774934597e-07, "learning_rate": 0.007004835499060172, "loss": 2.6257, "step": 10518 }, { "crossentropy": 2.523170232772827, "epoch": 0.38134425754060325, "grad_norm": 0.027314824983477592, "grad_norm_var": 9.762520610148707e-07, "learning_rate": 0.0070043031554458965, "loss": 2.5723, "step": 10519 }, { "crossentropy": 2.6860544681549072, "epoch": 0.38138051044083526, "grad_norm": 0.028222965076565742, "grad_norm_var": 9.094369427803019e-07, "learning_rate": 0.007003770784760466, "loss": 2.5982, "step": 10520 }, { "crossentropy": 2.658780097961426, "epoch": 0.3814167633410673, "grad_norm": 0.02646583318710327, "grad_norm_var": 9.762072454627767e-07, "learning_rate": 0.007003238387011069, "loss": 2.5814, "step": 10521 }, { "crossentropy": 2.723867893218994, "epoch": 0.3814530162412993, "grad_norm": 0.02732982486486435, "grad_norm_var": 9.73005866723705e-07, "learning_rate": 0.007002705962204898, "loss": 2.6795, "step": 10522 }, { "crossentropy": 2.561570882797241, "epoch": 0.3814892691415313, "grad_norm": 0.02927727811038494, "grad_norm_var": 1.1210145585503645e-06, "learning_rate": 0.0070021735103491444, "loss": 2.587, "step": 10523 }, { "crossentropy": 2.5079288482666016, "epoch": 0.3815255220417633, "grad_norm": 0.028642477467656136, "grad_norm_var": 1.1375766053071338e-06, "learning_rate": 0.007001641031450999, "loss": 2.597, "step": 10524 }, { "crossentropy": 2.6534156799316406, "epoch": 0.38156177494199534, "grad_norm": 0.02744915336370468, "grad_norm_var": 1.0893948871789112e-06, "learning_rate": 0.007001108525517654, "loss": 2.6942, "step": 10525 }, { "crossentropy": 2.5526652336120605, "epoch": 0.3815980278422274, "grad_norm": 0.029873589053750038, "grad_norm_var": 1.3544063263139112e-06, "learning_rate": 0.007000575992556302, "loss": 2.6245, "step": 10526 }, { "crossentropy": 2.596756935119629, "epoch": 0.3816342807424594, "grad_norm": 0.029461240395903587, "grad_norm_var": 1.4212410166640372e-06, "learning_rate": 0.007000043432574134, "loss": 2.6023, "step": 10527 }, { "crossentropy": 2.5321457386016846, "epoch": 0.38167053364269143, "grad_norm": 0.028408437967300415, "grad_norm_var": 1.3031538042307477e-06, "learning_rate": 0.006999510845578345, "loss": 2.646, "step": 10528 }, { "crossentropy": 2.5461478233337402, "epoch": 0.38170678654292345, "grad_norm": 0.029772665351629257, "grad_norm_var": 1.4010004506649024e-06, "learning_rate": 0.0069989782315761275, "loss": 2.6574, "step": 10529 }, { "crossentropy": 2.578723430633545, "epoch": 0.38174303944315546, "grad_norm": 0.02734275534749031, "grad_norm_var": 1.4347801362862844e-06, "learning_rate": 0.006998445590574675, "loss": 2.5787, "step": 10530 }, { "crossentropy": 2.71516752243042, "epoch": 0.3817792923433875, "grad_norm": 0.027624735608696938, "grad_norm_var": 1.4355255924866985e-06, "learning_rate": 0.006997912922581182, "loss": 2.6805, "step": 10531 }, { "crossentropy": 2.73690128326416, "epoch": 0.3818155452436195, "grad_norm": 0.02959231287240982, "grad_norm_var": 1.150738838995071e-06, "learning_rate": 0.006997380227602844, "loss": 2.6759, "step": 10532 }, { "crossentropy": 2.73356556892395, "epoch": 0.3818517981438515, "grad_norm": 0.030975796282291412, "grad_norm_var": 1.5342625774818757e-06, "learning_rate": 0.006996847505646854, "loss": 2.6801, "step": 10533 }, { "crossentropy": 2.515349864959717, "epoch": 0.3818880510440835, "grad_norm": 0.028882356360554695, "grad_norm_var": 1.5136650971080945e-06, "learning_rate": 0.0069963147567204086, "loss": 2.5908, "step": 10534 }, { "crossentropy": 2.5941038131713867, "epoch": 0.38192430394431554, "grad_norm": 0.028366930782794952, "grad_norm_var": 1.4110124225436555e-06, "learning_rate": 0.006995781980830702, "loss": 2.6416, "step": 10535 }, { "crossentropy": 2.4904229640960693, "epoch": 0.38196055684454755, "grad_norm": 0.02671644650399685, "grad_norm_var": 1.6297061931683782e-06, "learning_rate": 0.006995249177984933, "loss": 2.4703, "step": 10536 }, { "crossentropy": 2.6727066040039062, "epoch": 0.38199680974477956, "grad_norm": 0.026625605300068855, "grad_norm_var": 1.5877257886491464e-06, "learning_rate": 0.006994716348190294, "loss": 2.6708, "step": 10537 }, { "crossentropy": 2.6106019020080566, "epoch": 0.3820330626450116, "grad_norm": 0.03019772283732891, "grad_norm_var": 1.6461550752330194e-06, "learning_rate": 0.0069941834914539844, "loss": 2.5727, "step": 10538 }, { "crossentropy": 2.5047719478607178, "epoch": 0.3820693155452436, "grad_norm": 0.03333074599504471, "grad_norm_var": 2.9847437762630634e-06, "learning_rate": 0.006993650607783199, "loss": 2.5268, "step": 10539 }, { "crossentropy": 2.691859006881714, "epoch": 0.38210556844547566, "grad_norm": 0.030224580317735672, "grad_norm_var": 3.0754831674195537e-06, "learning_rate": 0.0069931176971851396, "loss": 2.6402, "step": 10540 }, { "crossentropy": 2.5870373249053955, "epoch": 0.3821418213457077, "grad_norm": 0.027408841997385025, "grad_norm_var": 3.0842041807362656e-06, "learning_rate": 0.006992584759667, "loss": 2.5698, "step": 10541 }, { "crossentropy": 2.4909632205963135, "epoch": 0.3821780742459397, "grad_norm": 0.028234712779521942, "grad_norm_var": 3.072170870602418e-06, "learning_rate": 0.006992051795235979, "loss": 2.5335, "step": 10542 }, { "crossentropy": 2.5600690841674805, "epoch": 0.3822143271461717, "grad_norm": 0.027934148907661438, "grad_norm_var": 3.11339249775785e-06, "learning_rate": 0.006991518803899274, "loss": 2.5852, "step": 10543 }, { "crossentropy": 2.6082205772399902, "epoch": 0.3822505800464037, "grad_norm": 0.028413109481334686, "grad_norm_var": 3.1131173163305525e-06, "learning_rate": 0.006990985785664087, "loss": 2.563, "step": 10544 }, { "crossentropy": 2.5535075664520264, "epoch": 0.38228683294663574, "grad_norm": 0.029558029025793076, "grad_norm_var": 3.089669359366516e-06, "learning_rate": 0.006990452740537616, "loss": 2.6285, "step": 10545 }, { "crossentropy": 2.59338641166687, "epoch": 0.38232308584686775, "grad_norm": 0.02753453515470028, "grad_norm_var": 3.0537004231935445e-06, "learning_rate": 0.006989919668527059, "loss": 2.6423, "step": 10546 }, { "crossentropy": 2.643606662750244, "epoch": 0.38235933874709976, "grad_norm": 0.027313968166708946, "grad_norm_var": 3.1105594606453216e-06, "learning_rate": 0.006989386569639618, "loss": 2.6519, "step": 10547 }, { "crossentropy": 2.6350550651550293, "epoch": 0.3823955916473318, "grad_norm": 0.028921891003847122, "grad_norm_var": 3.070674950901517e-06, "learning_rate": 0.00698885344388249, "loss": 2.6705, "step": 10548 }, { "crossentropy": 2.6630749702453613, "epoch": 0.3824318445475638, "grad_norm": 0.03198360279202461, "grad_norm_var": 3.4278740666337145e-06, "learning_rate": 0.006988320291262882, "loss": 2.6141, "step": 10549 }, { "crossentropy": 2.626535177230835, "epoch": 0.3824680974477958, "grad_norm": 0.03183817118406296, "grad_norm_var": 3.9855152718496946e-06, "learning_rate": 0.006987787111787989, "loss": 2.6526, "step": 10550 }, { "crossentropy": 2.630488157272339, "epoch": 0.3825043503480278, "grad_norm": 0.03322175517678261, "grad_norm_var": 5.024408580468518e-06, "learning_rate": 0.006987253905465013, "loss": 2.6397, "step": 10551 }, { "crossentropy": 2.6504924297332764, "epoch": 0.38254060324825984, "grad_norm": 0.02827201783657074, "grad_norm_var": 4.631264661690502e-06, "learning_rate": 0.006986720672301159, "loss": 2.6264, "step": 10552 }, { "crossentropy": 2.675365924835205, "epoch": 0.3825768561484919, "grad_norm": 0.0269808117300272, "grad_norm_var": 4.505936866407561e-06, "learning_rate": 0.0069861874123036275, "loss": 2.5722, "step": 10553 }, { "crossentropy": 2.570676326751709, "epoch": 0.3826131090487239, "grad_norm": 0.0267332810908556, "grad_norm_var": 4.91556068194394e-06, "learning_rate": 0.006985654125479621, "loss": 2.565, "step": 10554 }, { "crossentropy": 2.4696013927459717, "epoch": 0.38264936194895594, "grad_norm": 0.026892472058534622, "grad_norm_var": 3.99807022913769e-06, "learning_rate": 0.0069851208118363395, "loss": 2.5349, "step": 10555 }, { "crossentropy": 2.5314433574676514, "epoch": 0.38268561484918795, "grad_norm": 0.026931576430797577, "grad_norm_var": 4.068600136868337e-06, "learning_rate": 0.006984587471380992, "loss": 2.5671, "step": 10556 }, { "crossentropy": 2.4482181072235107, "epoch": 0.38272186774941996, "grad_norm": 0.02893710508942604, "grad_norm_var": 3.964557518051683e-06, "learning_rate": 0.006984054104120778, "loss": 2.5696, "step": 10557 }, { "crossentropy": 2.6824705600738525, "epoch": 0.382758120649652, "grad_norm": 0.030898742377758026, "grad_norm_var": 4.2317250221901115e-06, "learning_rate": 0.006983520710062903, "loss": 2.6415, "step": 10558 }, { "crossentropy": 2.6418395042419434, "epoch": 0.382794373549884, "grad_norm": 0.029828356578946114, "grad_norm_var": 4.212589145149399e-06, "learning_rate": 0.0069829872892145685, "loss": 2.6312, "step": 10559 }, { "crossentropy": 2.620337724685669, "epoch": 0.382830626450116, "grad_norm": 0.02832994982600212, "grad_norm_var": 4.2197085616699536e-06, "learning_rate": 0.0069824538415829834, "loss": 2.6121, "step": 10560 }, { "crossentropy": 2.602522373199463, "epoch": 0.382866879350348, "grad_norm": 0.027131743729114532, "grad_norm_var": 4.410676237307749e-06, "learning_rate": 0.006981920367175352, "loss": 2.6178, "step": 10561 }, { "crossentropy": 2.6104469299316406, "epoch": 0.38290313225058004, "grad_norm": 0.03206416964530945, "grad_norm_var": 4.8928875713268845e-06, "learning_rate": 0.006981386865998876, "loss": 2.6331, "step": 10562 }, { "crossentropy": 2.6556341648101807, "epoch": 0.38293938515081205, "grad_norm": 0.029861746355891228, "grad_norm_var": 4.677434971380306e-06, "learning_rate": 0.006980853338060763, "loss": 2.605, "step": 10563 }, { "crossentropy": 2.6854305267333984, "epoch": 0.38297563805104406, "grad_norm": 0.028569433838129044, "grad_norm_var": 4.703048519966573e-06, "learning_rate": 0.006980319783368221, "loss": 2.5994, "step": 10564 }, { "crossentropy": 2.5001072883605957, "epoch": 0.3830118909512761, "grad_norm": 0.02816050499677658, "grad_norm_var": 4.23824025751011e-06, "learning_rate": 0.006979786201928454, "loss": 2.5346, "step": 10565 }, { "crossentropy": 2.740123748779297, "epoch": 0.3830481438515081, "grad_norm": 0.02878130041062832, "grad_norm_var": 3.682084137702337e-06, "learning_rate": 0.006979252593748673, "loss": 2.7296, "step": 10566 }, { "crossentropy": 2.508751392364502, "epoch": 0.38308439675174016, "grad_norm": 0.02810041978955269, "grad_norm_var": 2.3358941748078625e-06, "learning_rate": 0.0069787189588360775, "loss": 2.5088, "step": 10567 }, { "crossentropy": 2.5371365547180176, "epoch": 0.3831206496519722, "grad_norm": 0.030474210157990456, "grad_norm_var": 2.563364051972004e-06, "learning_rate": 0.006978185297197882, "loss": 2.6232, "step": 10568 }, { "crossentropy": 2.643268585205078, "epoch": 0.3831569025522042, "grad_norm": 0.02926752343773842, "grad_norm_var": 2.375996637766807e-06, "learning_rate": 0.006977651608841292, "loss": 2.5181, "step": 10569 }, { "crossentropy": 2.641822338104248, "epoch": 0.3831931554524362, "grad_norm": 0.029415713623166084, "grad_norm_var": 2.0829007730320853e-06, "learning_rate": 0.006977117893773517, "loss": 2.6458, "step": 10570 }, { "crossentropy": 2.6317782402038574, "epoch": 0.3832294083526682, "grad_norm": 0.02847948856651783, "grad_norm_var": 1.7990524015692086e-06, "learning_rate": 0.006976584152001763, "loss": 2.5931, "step": 10571 }, { "crossentropy": 2.477375030517578, "epoch": 0.38326566125290024, "grad_norm": 0.02819223515689373, "grad_norm_var": 1.5377617446765314e-06, "learning_rate": 0.00697605038353324, "loss": 2.5772, "step": 10572 }, { "crossentropy": 2.6192123889923096, "epoch": 0.38330191415313225, "grad_norm": 0.02831192873418331, "grad_norm_var": 1.580418490235852e-06, "learning_rate": 0.0069755165883751595, "loss": 2.6038, "step": 10573 }, { "crossentropy": 2.6762139797210693, "epoch": 0.38333816705336426, "grad_norm": 0.02797376736998558, "grad_norm_var": 1.4201518395044628e-06, "learning_rate": 0.006974982766534729, "loss": 2.6322, "step": 10574 }, { "crossentropy": 2.558936595916748, "epoch": 0.3833744199535963, "grad_norm": 0.027675608173012733, "grad_norm_var": 1.4530601867706582e-06, "learning_rate": 0.006974448918019158, "loss": 2.595, "step": 10575 }, { "crossentropy": 2.6101651191711426, "epoch": 0.3834106728538283, "grad_norm": 0.027714425697922707, "grad_norm_var": 1.5152639141544858e-06, "learning_rate": 0.006973915042835659, "loss": 2.5738, "step": 10576 }, { "crossentropy": 2.462757110595703, "epoch": 0.3834469257540603, "grad_norm": 0.028736960142850876, "grad_norm_var": 1.3276248581128483e-06, "learning_rate": 0.0069733811409914415, "loss": 2.5429, "step": 10577 }, { "crossentropy": 2.5421650409698486, "epoch": 0.3834831786542923, "grad_norm": 0.027604583650827408, "grad_norm_var": 6.661053587265666e-07, "learning_rate": 0.006972847212493718, "loss": 2.5031, "step": 10578 }, { "crossentropy": 2.523620367050171, "epoch": 0.38351943155452434, "grad_norm": 0.027780143544077873, "grad_norm_var": 5.818686086045497e-07, "learning_rate": 0.006972313257349696, "loss": 2.5755, "step": 10579 }, { "crossentropy": 2.4805033206939697, "epoch": 0.3835556844547564, "grad_norm": 0.027279624715447426, "grad_norm_var": 6.657155989163906e-07, "learning_rate": 0.006971779275566593, "loss": 2.4508, "step": 10580 }, { "crossentropy": 2.7487356662750244, "epoch": 0.3835919373549884, "grad_norm": 0.030110465362668037, "grad_norm_var": 8.484324836185158e-07, "learning_rate": 0.006971245267151618, "loss": 2.78, "step": 10581 }, { "crossentropy": 2.398489475250244, "epoch": 0.38362819025522044, "grad_norm": 0.02670435421168804, "grad_norm_var": 1.03838112921957e-06, "learning_rate": 0.006970711232111985, "loss": 2.4986, "step": 10582 }, { "crossentropy": 2.7023842334747314, "epoch": 0.38366444315545245, "grad_norm": 0.02755821868777275, "grad_norm_var": 1.0757986259024663e-06, "learning_rate": 0.006970177170454905, "loss": 2.6896, "step": 10583 }, { "crossentropy": 2.7529027462005615, "epoch": 0.38370069605568446, "grad_norm": 0.02764335460960865, "grad_norm_var": 7.673133451048386e-07, "learning_rate": 0.006969643082187592, "loss": 2.6685, "step": 10584 }, { "crossentropy": 2.706890106201172, "epoch": 0.3837369489559165, "grad_norm": 0.026351680979132652, "grad_norm_var": 8.654032359791651e-07, "learning_rate": 0.006969108967317262, "loss": 2.6937, "step": 10585 }, { "crossentropy": 2.573479175567627, "epoch": 0.3837732018561485, "grad_norm": 0.027156461030244827, "grad_norm_var": 7.491558357237133e-07, "learning_rate": 0.006968574825851125, "loss": 2.5624, "step": 10586 }, { "crossentropy": 2.5172224044799805, "epoch": 0.3838094547563805, "grad_norm": 0.026621688157320023, "grad_norm_var": 8.038833325713553e-07, "learning_rate": 0.0069680406577964, "loss": 2.5783, "step": 10587 }, { "crossentropy": 2.667567491531372, "epoch": 0.3838457076566125, "grad_norm": 0.027498602867126465, "grad_norm_var": 7.896753149083756e-07, "learning_rate": 0.006967506463160297, "loss": 2.6911, "step": 10588 }, { "crossentropy": 2.7046823501586914, "epoch": 0.38388196055684454, "grad_norm": 0.02730891853570938, "grad_norm_var": 7.667196267439776e-07, "learning_rate": 0.006966972241950034, "loss": 2.6081, "step": 10589 }, { "crossentropy": 2.50618839263916, "epoch": 0.38391821345707655, "grad_norm": 0.02731887809932232, "grad_norm_var": 7.61536447465881e-07, "learning_rate": 0.006966437994172826, "loss": 2.5183, "step": 10590 }, { "crossentropy": 2.5250680446624756, "epoch": 0.38395446635730857, "grad_norm": 0.026774320751428604, "grad_norm_var": 7.991944392750361e-07, "learning_rate": 0.0069659037198358895, "loss": 2.5521, "step": 10591 }, { "crossentropy": 2.4321389198303223, "epoch": 0.3839907192575406, "grad_norm": 0.027104703709483147, "grad_norm_var": 8.058240724121216e-07, "learning_rate": 0.006965369418946439, "loss": 2.5866, "step": 10592 }, { "crossentropy": 2.690258264541626, "epoch": 0.3840269721577726, "grad_norm": 0.03043486177921295, "grad_norm_var": 1.2723602616939352e-06, "learning_rate": 0.006964835091511693, "loss": 2.7018, "step": 10593 }, { "crossentropy": 2.714775323867798, "epoch": 0.38406322505800466, "grad_norm": 0.03344187140464783, "grad_norm_var": 3.4225318153659087e-06, "learning_rate": 0.0069643007375388664, "loss": 2.6779, "step": 10594 }, { "crossentropy": 2.765868663787842, "epoch": 0.3840994779582367, "grad_norm": 0.030608924105763435, "grad_norm_var": 3.86122859243702e-06, "learning_rate": 0.006963766357035177, "loss": 2.7354, "step": 10595 }, { "crossentropy": 2.6487367153167725, "epoch": 0.3841357308584687, "grad_norm": 0.026530275121331215, "grad_norm_var": 3.980269366711994e-06, "learning_rate": 0.006963231950007844, "loss": 2.6598, "step": 10596 }, { "crossentropy": 2.606616973876953, "epoch": 0.3841719837587007, "grad_norm": 0.027192533016204834, "grad_norm_var": 3.7197132351169617e-06, "learning_rate": 0.006962697516464084, "loss": 2.6345, "step": 10597 }, { "crossentropy": 2.6467838287353516, "epoch": 0.3842082366589327, "grad_norm": 0.029907478019595146, "grad_norm_var": 3.854336494971119e-06, "learning_rate": 0.006962163056411115, "loss": 2.6235, "step": 10598 }, { "crossentropy": 2.5817906856536865, "epoch": 0.38424448955916474, "grad_norm": 0.03191278502345085, "grad_norm_var": 4.73025662970776e-06, "learning_rate": 0.006961628569856156, "loss": 2.6862, "step": 10599 }, { "crossentropy": 2.6855485439300537, "epoch": 0.38428074245939675, "grad_norm": 0.029002925381064415, "grad_norm_var": 4.7153366698121965e-06, "learning_rate": 0.006961094056806428, "loss": 2.6566, "step": 10600 }, { "crossentropy": 2.5364022254943848, "epoch": 0.38431699535962877, "grad_norm": 0.028203409165143967, "grad_norm_var": 4.412084531469027e-06, "learning_rate": 0.0069605595172691475, "loss": 2.5463, "step": 10601 }, { "crossentropy": 2.577606439590454, "epoch": 0.3843532482598608, "grad_norm": 0.032191067934036255, "grad_norm_var": 5.05165968335593e-06, "learning_rate": 0.0069600249512515346, "loss": 2.6694, "step": 10602 }, { "crossentropy": 2.5192272663116455, "epoch": 0.3843895011600928, "grad_norm": 0.030311129987239838, "grad_norm_var": 4.792309791007652e-06, "learning_rate": 0.0069594903587608115, "loss": 2.5381, "step": 10603 }, { "crossentropy": 2.6587018966674805, "epoch": 0.3844257540603248, "grad_norm": 0.02861778624355793, "grad_norm_var": 4.63029714395575e-06, "learning_rate": 0.006958955739804197, "loss": 2.6577, "step": 10604 }, { "crossentropy": 2.5958194732666016, "epoch": 0.3844620069605568, "grad_norm": 0.026973897591233253, "grad_norm_var": 4.720841659890061e-06, "learning_rate": 0.006958421094388913, "loss": 2.5676, "step": 10605 }, { "crossentropy": 2.5916221141815186, "epoch": 0.38449825986078884, "grad_norm": 0.03901098668575287, "grad_norm_var": 1.039794660676652e-05, "learning_rate": 0.0069578864225221795, "loss": 2.5657, "step": 10606 }, { "crossentropy": 2.5926082134246826, "epoch": 0.3845345127610209, "grad_norm": 0.029454531148076057, "grad_norm_var": 9.73396365849388e-06, "learning_rate": 0.006957351724211218, "loss": 2.6749, "step": 10607 }, { "crossentropy": 2.6008236408233643, "epoch": 0.3845707656612529, "grad_norm": 0.029641343280673027, "grad_norm_var": 9.137872168866502e-06, "learning_rate": 0.006956816999463253, "loss": 2.5863, "step": 10608 }, { "crossentropy": 2.660132884979248, "epoch": 0.38460701856148494, "grad_norm": 0.028860842809081078, "grad_norm_var": 9.246520922978604e-06, "learning_rate": 0.006956282248285503, "loss": 2.7039, "step": 10609 }, { "crossentropy": 2.6613731384277344, "epoch": 0.38464327146171695, "grad_norm": 0.033448297530412674, "grad_norm_var": 9.249372856416277e-06, "learning_rate": 0.006955747470685193, "loss": 2.7146, "step": 10610 }, { "crossentropy": 2.6529486179351807, "epoch": 0.38467952436194897, "grad_norm": 0.032197173684835434, "grad_norm_var": 9.511254624222642e-06, "learning_rate": 0.006955212666669546, "loss": 2.5947, "step": 10611 }, { "crossentropy": 2.47324538230896, "epoch": 0.384715777262181, "grad_norm": 0.03097214736044407, "grad_norm_var": 8.561507609181063e-06, "learning_rate": 0.006954677836245784, "loss": 2.4572, "step": 10612 }, { "crossentropy": 2.72322940826416, "epoch": 0.384752030162413, "grad_norm": 0.028673343360424042, "grad_norm_var": 8.046781277448502e-06, "learning_rate": 0.006954142979421132, "loss": 2.6996, "step": 10613 }, { "crossentropy": 2.619393825531006, "epoch": 0.384788283062645, "grad_norm": 0.034437183290719986, "grad_norm_var": 8.919251306725472e-06, "learning_rate": 0.006953608096202814, "loss": 2.5848, "step": 10614 }, { "crossentropy": 2.618946075439453, "epoch": 0.384824535962877, "grad_norm": 0.036357831209897995, "grad_norm_var": 1.077259687851082e-05, "learning_rate": 0.006953073186598054, "loss": 2.6151, "step": 10615 }, { "crossentropy": 2.57745099067688, "epoch": 0.38486078886310904, "grad_norm": 0.03173668310046196, "grad_norm_var": 1.0458125718919703e-05, "learning_rate": 0.006952538250614076, "loss": 2.53, "step": 10616 }, { "crossentropy": 2.480207920074463, "epoch": 0.38489704176334105, "grad_norm": 0.028176842257380486, "grad_norm_var": 1.0469202427930525e-05, "learning_rate": 0.006952003288258105, "loss": 2.5397, "step": 10617 }, { "crossentropy": 2.5343027114868164, "epoch": 0.38493329466357307, "grad_norm": 0.030605202540755272, "grad_norm_var": 1.0441423233244148e-05, "learning_rate": 0.006951468299537368, "loss": 2.5995, "step": 10618 }, { "crossentropy": 2.597168207168579, "epoch": 0.3849695475638051, "grad_norm": 0.04593171551823616, "grad_norm_var": 2.3804476078779363e-05, "learning_rate": 0.006950933284459091, "loss": 2.5362, "step": 10619 }, { "crossentropy": 2.609060764312744, "epoch": 0.3850058004640371, "grad_norm": 0.03076319396495819, "grad_norm_var": 2.3069304391214747e-05, "learning_rate": 0.006950398243030498, "loss": 2.6143, "step": 10620 }, { "crossentropy": 2.5521154403686523, "epoch": 0.38504205336426917, "grad_norm": 0.029015088453888893, "grad_norm_var": 2.187265756789015e-05, "learning_rate": 0.0069498631752588184, "loss": 2.5814, "step": 10621 }, { "crossentropy": 2.796877861022949, "epoch": 0.3850783062645012, "grad_norm": 0.02798350900411606, "grad_norm_var": 1.983374144025004e-05, "learning_rate": 0.006949328081151276, "loss": 2.6519, "step": 10622 }, { "crossentropy": 2.7467496395111084, "epoch": 0.3851145591647332, "grad_norm": 0.02814246155321598, "grad_norm_var": 2.0345699538588815e-05, "learning_rate": 0.006948792960715101, "loss": 2.709, "step": 10623 }, { "crossentropy": 2.621434211730957, "epoch": 0.3851508120649652, "grad_norm": 0.027929721400141716, "grad_norm_var": 2.0994953812167417e-05, "learning_rate": 0.006948257813957519, "loss": 2.6474, "step": 10624 }, { "crossentropy": 2.815948486328125, "epoch": 0.3851870649651972, "grad_norm": 0.028251634910702705, "grad_norm_var": 2.1238773085476453e-05, "learning_rate": 0.006947722640885759, "loss": 2.6595, "step": 10625 }, { "crossentropy": 2.5199930667877197, "epoch": 0.38522331786542924, "grad_norm": 0.028428049758076668, "grad_norm_var": 2.153585155826217e-05, "learning_rate": 0.006947187441507049, "loss": 2.4871, "step": 10626 }, { "crossentropy": 2.5280275344848633, "epoch": 0.38525957076566125, "grad_norm": 0.031736765056848526, "grad_norm_var": 2.148942727913175e-05, "learning_rate": 0.006946652215828617, "loss": 2.5585, "step": 10627 }, { "crossentropy": 2.68782377243042, "epoch": 0.38529582366589327, "grad_norm": 0.029574785381555557, "grad_norm_var": 2.1653235713227807e-05, "learning_rate": 0.006946116963857693, "loss": 2.6932, "step": 10628 }, { "crossentropy": 2.6373414993286133, "epoch": 0.3853320765661253, "grad_norm": 0.032240305095911026, "grad_norm_var": 2.1290050523530683e-05, "learning_rate": 0.006945581685601507, "loss": 2.653, "step": 10629 }, { "crossentropy": 2.6759488582611084, "epoch": 0.3853683294663573, "grad_norm": 0.03204774484038353, "grad_norm_var": 2.065758265728973e-05, "learning_rate": 0.006945046381067287, "loss": 2.6167, "step": 10630 }, { "crossentropy": 2.83148455619812, "epoch": 0.3854045823665893, "grad_norm": 0.029803549870848656, "grad_norm_var": 1.8819835617699585e-05, "learning_rate": 0.0069445110502622635, "loss": 2.6154, "step": 10631 }, { "crossentropy": 2.5790584087371826, "epoch": 0.3854408352668213, "grad_norm": 0.02747109718620777, "grad_norm_var": 1.9408920762303928e-05, "learning_rate": 0.006943975693193667, "loss": 2.5511, "step": 10632 }, { "crossentropy": 2.5139389038085938, "epoch": 0.38547708816705334, "grad_norm": 0.026533955708146095, "grad_norm_var": 2.0087896198329766e-05, "learning_rate": 0.00694344030986873, "loss": 2.5858, "step": 10633 }, { "crossentropy": 2.735853672027588, "epoch": 0.3855133410672854, "grad_norm": 0.030096866190433502, "grad_norm_var": 2.009038731001587e-05, "learning_rate": 0.006942904900294682, "loss": 2.7472, "step": 10634 }, { "crossentropy": 2.6195921897888184, "epoch": 0.3855495939675174, "grad_norm": 0.02874893881380558, "grad_norm_var": 2.8952706348558116e-06, "learning_rate": 0.006942369464478755, "loss": 2.5735, "step": 10635 }, { "crossentropy": 2.6836276054382324, "epoch": 0.38558584686774944, "grad_norm": 0.027702391147613525, "grad_norm_var": 2.882838288418065e-06, "learning_rate": 0.006941834002428179, "loss": 2.5529, "step": 10636 }, { "crossentropy": 2.688974618911743, "epoch": 0.38562209976798145, "grad_norm": 0.028491120785474777, "grad_norm_var": 2.9063959037076493e-06, "learning_rate": 0.0069412985141501906, "loss": 2.6166, "step": 10637 }, { "crossentropy": 2.6346969604492188, "epoch": 0.38565835266821347, "grad_norm": 0.02724122256040573, "grad_norm_var": 3.0487534595705425e-06, "learning_rate": 0.006940762999652018, "loss": 2.6502, "step": 10638 }, { "crossentropy": 2.6362006664276123, "epoch": 0.3856946055684455, "grad_norm": 0.02775116078555584, "grad_norm_var": 3.1045007141877744e-06, "learning_rate": 0.006940227458940895, "loss": 2.5684, "step": 10639 }, { "crossentropy": 2.758272409439087, "epoch": 0.3857308584686775, "grad_norm": 0.029437324032187462, "grad_norm_var": 3.030794682973986e-06, "learning_rate": 0.006939691892024058, "loss": 2.646, "step": 10640 }, { "crossentropy": 2.529633045196533, "epoch": 0.3857671113689095, "grad_norm": 0.028815148398280144, "grad_norm_var": 2.9871017142736486e-06, "learning_rate": 0.006939156298908739, "loss": 2.6282, "step": 10641 }, { "crossentropy": 2.660797595977783, "epoch": 0.3858033642691415, "grad_norm": 0.029006920754909515, "grad_norm_var": 2.953671446255625e-06, "learning_rate": 0.00693862067960217, "loss": 2.6422, "step": 10642 }, { "crossentropy": 2.6661152839660645, "epoch": 0.38583961716937354, "grad_norm": 0.028549958020448685, "grad_norm_var": 2.497217332402786e-06, "learning_rate": 0.006938085034111585, "loss": 2.607, "step": 10643 }, { "crossentropy": 2.7284257411956787, "epoch": 0.38587587006960555, "grad_norm": 0.027921075001358986, "grad_norm_var": 2.5346842263777405e-06, "learning_rate": 0.006937549362444222, "loss": 2.684, "step": 10644 }, { "crossentropy": 2.4570441246032715, "epoch": 0.38591212296983757, "grad_norm": 0.02919062413275242, "grad_norm_var": 1.7439656116986866e-06, "learning_rate": 0.006937013664607316, "loss": 2.5188, "step": 10645 }, { "crossentropy": 2.7923622131347656, "epoch": 0.3859483758700696, "grad_norm": 0.02947034128010273, "grad_norm_var": 1.000292488357625e-06, "learning_rate": 0.006936477940608099, "loss": 2.6889, "step": 10646 }, { "crossentropy": 2.3502066135406494, "epoch": 0.3859846287703016, "grad_norm": 0.029856380075216293, "grad_norm_var": 1.009547164601771e-06, "learning_rate": 0.006935942190453809, "loss": 2.4434, "step": 10647 }, { "crossentropy": 2.642390727996826, "epoch": 0.38602088167053367, "grad_norm": 0.031090915203094482, "grad_norm_var": 1.3233149580710313e-06, "learning_rate": 0.0069354064141516816, "loss": 2.6575, "step": 10648 }, { "crossentropy": 2.6177942752838135, "epoch": 0.3860571345707657, "grad_norm": 0.029750509187579155, "grad_norm_var": 1.0221141973663801e-06, "learning_rate": 0.006934870611708954, "loss": 2.6217, "step": 10649 }, { "crossentropy": 2.537935256958008, "epoch": 0.3860933874709977, "grad_norm": 0.027728676795959473, "grad_norm_var": 1.0089403106681878e-06, "learning_rate": 0.0069343347831328644, "loss": 2.6381, "step": 10650 }, { "crossentropy": 2.534738302230835, "epoch": 0.3861296403712297, "grad_norm": 0.02779517136514187, "grad_norm_var": 1.0719123440386854e-06, "learning_rate": 0.0069337989284306446, "loss": 2.6095, "step": 10651 }, { "crossentropy": 2.6485610008239746, "epoch": 0.3861658932714617, "grad_norm": 0.027427922934293747, "grad_norm_var": 1.1144988154673218e-06, "learning_rate": 0.006933263047609537, "loss": 2.6272, "step": 10652 }, { "crossentropy": 2.695096969604492, "epoch": 0.38620214617169374, "grad_norm": 0.029692096635699272, "grad_norm_var": 1.1679500484615963e-06, "learning_rate": 0.006932727140676779, "loss": 2.6443, "step": 10653 }, { "crossentropy": 2.5645809173583984, "epoch": 0.38623839907192575, "grad_norm": 0.03193473070859909, "grad_norm_var": 1.5721950572398218e-06, "learning_rate": 0.0069321912076396085, "loss": 2.5612, "step": 10654 }, { "crossentropy": 2.5615060329437256, "epoch": 0.38627465197215777, "grad_norm": 0.03031121753156185, "grad_norm_var": 1.525261597775509e-06, "learning_rate": 0.006931655248505261, "loss": 2.61, "step": 10655 }, { "crossentropy": 2.709977149963379, "epoch": 0.3863109048723898, "grad_norm": 0.02873590774834156, "grad_norm_var": 1.5383690204002203e-06, "learning_rate": 0.00693111926328098, "loss": 2.7144, "step": 10656 }, { "crossentropy": 2.641106367111206, "epoch": 0.3863471577726218, "grad_norm": 0.0284274872392416, "grad_norm_var": 1.5679045367487324e-06, "learning_rate": 0.006930583251974002, "loss": 2.5728, "step": 10657 }, { "crossentropy": 2.4176652431488037, "epoch": 0.3863834106728538, "grad_norm": 0.030510956421494484, "grad_norm_var": 1.6744537443259706e-06, "learning_rate": 0.006930047214591568, "loss": 2.593, "step": 10658 }, { "crossentropy": 2.610182762145996, "epoch": 0.3864196635730858, "grad_norm": 0.03822014853358269, "grad_norm_var": 6.584636974145813e-06, "learning_rate": 0.006929511151140916, "loss": 2.5781, "step": 10659 }, { "crossentropy": 2.7249300479888916, "epoch": 0.38645591647331784, "grad_norm": 0.02971513196825981, "grad_norm_var": 6.317449063965062e-06, "learning_rate": 0.006928975061629289, "loss": 2.6768, "step": 10660 }, { "crossentropy": 2.7862284183502197, "epoch": 0.3864921693735499, "grad_norm": 0.027852356433868408, "grad_norm_var": 6.5722244467445245e-06, "learning_rate": 0.006928438946063926, "loss": 2.6544, "step": 10661 }, { "crossentropy": 2.6342904567718506, "epoch": 0.3865284222737819, "grad_norm": 0.028774796053767204, "grad_norm_var": 6.643002426347707e-06, "learning_rate": 0.00692790280445207, "loss": 2.6002, "step": 10662 }, { "crossentropy": 2.5291097164154053, "epoch": 0.38656467517401394, "grad_norm": 0.026949947699904442, "grad_norm_var": 7.173924459739087e-06, "learning_rate": 0.006927366636800958, "loss": 2.5058, "step": 10663 }, { "crossentropy": 2.5766780376434326, "epoch": 0.38660092807424595, "grad_norm": 0.02801220864057541, "grad_norm_var": 7.188128288738516e-06, "learning_rate": 0.006926830443117836, "loss": 2.5864, "step": 10664 }, { "crossentropy": 2.740225315093994, "epoch": 0.38663718097447797, "grad_norm": 0.02772345393896103, "grad_norm_var": 7.374516580702038e-06, "learning_rate": 0.006926294223409945, "loss": 2.6478, "step": 10665 }, { "crossentropy": 2.556950330734253, "epoch": 0.38667343387471, "grad_norm": 0.027435140684247017, "grad_norm_var": 7.443876479883135e-06, "learning_rate": 0.006925757977684529, "loss": 2.5073, "step": 10666 }, { "crossentropy": 2.6604089736938477, "epoch": 0.386709686774942, "grad_norm": 0.028236238285899162, "grad_norm_var": 7.36489635099305e-06, "learning_rate": 0.006925221705948827, "loss": 2.6604, "step": 10667 }, { "crossentropy": 2.491745710372925, "epoch": 0.386745939675174, "grad_norm": 0.028607025742530823, "grad_norm_var": 7.146077435376966e-06, "learning_rate": 0.006924685408210084, "loss": 2.5025, "step": 10668 }, { "crossentropy": 2.760143518447876, "epoch": 0.386782192575406, "grad_norm": 0.027283642441034317, "grad_norm_var": 7.429646916639239e-06, "learning_rate": 0.006924149084475545, "loss": 2.6404, "step": 10669 }, { "crossentropy": 2.583691358566284, "epoch": 0.38681844547563804, "grad_norm": 0.026629239320755005, "grad_norm_var": 7.322028744008215e-06, "learning_rate": 0.006923612734752452, "loss": 2.5718, "step": 10670 }, { "crossentropy": 2.68717360496521, "epoch": 0.38685469837587005, "grad_norm": 0.02706941030919552, "grad_norm_var": 7.396562548009124e-06, "learning_rate": 0.006923076359048049, "loss": 2.6231, "step": 10671 }, { "crossentropy": 2.613757371902466, "epoch": 0.38689095127610207, "grad_norm": 0.03873840719461441, "grad_norm_var": 1.3615631445681993e-05, "learning_rate": 0.006922539957369582, "loss": 2.6604, "step": 10672 }, { "crossentropy": 2.6417124271392822, "epoch": 0.3869272041763341, "grad_norm": 0.02796836756169796, "grad_norm_var": 1.3687518846316318e-05, "learning_rate": 0.006922003529724297, "loss": 2.6072, "step": 10673 }, { "crossentropy": 2.6076300144195557, "epoch": 0.38696345707656615, "grad_norm": 0.02882327325642109, "grad_norm_var": 1.360607114741541e-05, "learning_rate": 0.006921467076119437, "loss": 2.6182, "step": 10674 }, { "crossentropy": 2.601450204849243, "epoch": 0.38699970997679817, "grad_norm": 0.03020511008799076, "grad_norm_var": 8.037569745063197e-06, "learning_rate": 0.006920930596562246, "loss": 2.5487, "step": 10675 }, { "crossentropy": 2.5164475440979004, "epoch": 0.3870359628770302, "grad_norm": 0.033746253699064255, "grad_norm_var": 9.571135258909918e-06, "learning_rate": 0.006920394091059974, "loss": 2.5352, "step": 10676 }, { "crossentropy": 2.459906816482544, "epoch": 0.3870722157772622, "grad_norm": 0.036927469074726105, "grad_norm_var": 1.3325673950559333e-05, "learning_rate": 0.006919857559619864, "loss": 2.5942, "step": 10677 }, { "crossentropy": 2.657411813735962, "epoch": 0.3871084686774942, "grad_norm": 0.034193865954875946, "grad_norm_var": 1.458604922703181e-05, "learning_rate": 0.006919321002249166, "loss": 2.6755, "step": 10678 }, { "crossentropy": 2.6858842372894287, "epoch": 0.3871447215777262, "grad_norm": 0.029361948370933533, "grad_norm_var": 1.3997925361620978e-05, "learning_rate": 0.006918784418955124, "loss": 2.6568, "step": 10679 }, { "crossentropy": 2.655426263809204, "epoch": 0.38718097447795824, "grad_norm": 0.027858642861247063, "grad_norm_var": 1.4041330037666101e-05, "learning_rate": 0.006918247809744988, "loss": 2.5843, "step": 10680 }, { "crossentropy": 2.531463146209717, "epoch": 0.38721722737819025, "grad_norm": 0.027994675561785698, "grad_norm_var": 1.3961776071607218e-05, "learning_rate": 0.006917711174626002, "loss": 2.5931, "step": 10681 }, { "crossentropy": 2.688439130783081, "epoch": 0.38725348027842227, "grad_norm": 0.02815331518650055, "grad_norm_var": 1.3741953924181861e-05, "learning_rate": 0.0069171745136054185, "loss": 2.7203, "step": 10682 }, { "crossentropy": 2.5357162952423096, "epoch": 0.3872897331786543, "grad_norm": 0.030422404408454895, "grad_norm_var": 1.3493809013588078e-05, "learning_rate": 0.006916637826690483, "loss": 2.535, "step": 10683 }, { "crossentropy": 2.7131221294403076, "epoch": 0.3873259860788863, "grad_norm": 0.031668249517679214, "grad_norm_var": 1.340933299153052e-05, "learning_rate": 0.006916101113888445, "loss": 2.6985, "step": 10684 }, { "crossentropy": 2.7099716663360596, "epoch": 0.3873622389791183, "grad_norm": 0.02936626225709915, "grad_norm_var": 1.2803874676172756e-05, "learning_rate": 0.006915564375206553, "loss": 2.6736, "step": 10685 }, { "crossentropy": 2.6939148902893066, "epoch": 0.3873984918793503, "grad_norm": 0.027401437982916832, "grad_norm_var": 1.2435358466127245e-05, "learning_rate": 0.006915027610652058, "loss": 2.6739, "step": 10686 }, { "crossentropy": 2.6483311653137207, "epoch": 0.38743474477958234, "grad_norm": 0.027549659833312035, "grad_norm_var": 1.2222501243885315e-05, "learning_rate": 0.006914490820232209, "loss": 2.625, "step": 10687 }, { "crossentropy": 2.485178232192993, "epoch": 0.3874709976798144, "grad_norm": 0.027694884687662125, "grad_norm_var": 7.933127757848629e-06, "learning_rate": 0.006913954003954256, "loss": 2.4914, "step": 10688 }, { "crossentropy": 2.574207305908203, "epoch": 0.3875072505800464, "grad_norm": 0.027682166546583176, "grad_norm_var": 8.01419049129137e-06, "learning_rate": 0.00691341716182545, "loss": 2.5385, "step": 10689 }, { "crossentropy": 2.363654613494873, "epoch": 0.38754350348027844, "grad_norm": 0.027265449985861778, "grad_norm_var": 8.397946255831364e-06, "learning_rate": 0.00691288029385304, "loss": 2.4224, "step": 10690 }, { "crossentropy": 2.7477304935455322, "epoch": 0.38757975638051045, "grad_norm": 0.02731511928141117, "grad_norm_var": 8.78050796178966e-06, "learning_rate": 0.006912343400044279, "loss": 2.793, "step": 10691 }, { "crossentropy": 2.6809608936309814, "epoch": 0.38761600928074247, "grad_norm": 0.027344556525349617, "grad_norm_var": 7.856234915730786e-06, "learning_rate": 0.00691180648040642, "loss": 2.659, "step": 10692 }, { "crossentropy": 2.655517339706421, "epoch": 0.3876522621809745, "grad_norm": 0.02860344387590885, "grad_norm_var": 3.679710199130705e-06, "learning_rate": 0.006911269534946712, "loss": 2.6345, "step": 10693 }, { "crossentropy": 2.9336163997650146, "epoch": 0.3876885150812065, "grad_norm": 0.033592283725738525, "grad_norm_var": 3.265050067590332e-06, "learning_rate": 0.006910732563672408, "loss": 2.7737, "step": 10694 }, { "crossentropy": 2.501624822616577, "epoch": 0.3877247679814385, "grad_norm": 0.030580179765820503, "grad_norm_var": 3.4645700642758757e-06, "learning_rate": 0.006910195566590762, "loss": 2.5122, "step": 10695 }, { "crossentropy": 2.6235575675964355, "epoch": 0.3877610208816705, "grad_norm": 0.0323614738881588, "grad_norm_var": 4.1781481991705805e-06, "learning_rate": 0.0069096585437090265, "loss": 2.6311, "step": 10696 }, { "crossentropy": 2.5015532970428467, "epoch": 0.38779727378190254, "grad_norm": 0.030715374276041985, "grad_norm_var": 4.253522579532784e-06, "learning_rate": 0.006909121495034454, "loss": 2.5353, "step": 10697 }, { "crossentropy": 2.5689144134521484, "epoch": 0.38783352668213456, "grad_norm": 0.0300702303647995, "grad_norm_var": 4.2074150998466784e-06, "learning_rate": 0.006908584420574297, "loss": 2.6002, "step": 10698 }, { "crossentropy": 2.640524387359619, "epoch": 0.38786977958236657, "grad_norm": 0.031425729393959045, "grad_norm_var": 4.413516705384368e-06, "learning_rate": 0.006908047320335813, "loss": 2.6361, "step": 10699 }, { "crossentropy": 2.566502571105957, "epoch": 0.3879060324825986, "grad_norm": 0.030469795688986778, "grad_norm_var": 4.143194591730417e-06, "learning_rate": 0.006907510194326254, "loss": 2.5577, "step": 10700 }, { "crossentropy": 2.5493807792663574, "epoch": 0.38794228538283065, "grad_norm": 0.028539050370454788, "grad_norm_var": 4.183052011888034e-06, "learning_rate": 0.0069069730425528756, "loss": 2.5655, "step": 10701 }, { "crossentropy": 2.422313690185547, "epoch": 0.38797853828306267, "grad_norm": 0.03054451383650303, "grad_norm_var": 4.009796253020175e-06, "learning_rate": 0.006906435865022931, "loss": 2.4467, "step": 10702 }, { "crossentropy": 2.699007987976074, "epoch": 0.3880147911832947, "grad_norm": 0.02784447930753231, "grad_norm_var": 3.9391668216784735e-06, "learning_rate": 0.006905898661743677, "loss": 2.6533, "step": 10703 }, { "crossentropy": 2.5678212642669678, "epoch": 0.3880510440835267, "grad_norm": 0.02718088962137699, "grad_norm_var": 4.079596867981548e-06, "learning_rate": 0.006905361432722371, "loss": 2.4854, "step": 10704 }, { "crossentropy": 2.7366983890533447, "epoch": 0.3880872969837587, "grad_norm": 0.028573481366038322, "grad_norm_var": 3.916670382624155e-06, "learning_rate": 0.0069048241779662645, "loss": 2.7301, "step": 10705 }, { "crossentropy": 2.6761348247528076, "epoch": 0.3881235498839907, "grad_norm": 0.027066614478826523, "grad_norm_var": 3.979088357402664e-06, "learning_rate": 0.0069042868974826185, "loss": 2.6764, "step": 10706 }, { "crossentropy": 2.66615891456604, "epoch": 0.38815980278422274, "grad_norm": 0.027098871767520905, "grad_norm_var": 4.045417170951447e-06, "learning_rate": 0.006903749591278687, "loss": 2.5753, "step": 10707 }, { "crossentropy": 2.682523250579834, "epoch": 0.38819605568445475, "grad_norm": 0.032396212220191956, "grad_norm_var": 4.18809924385462e-06, "learning_rate": 0.00690321225936173, "loss": 2.6863, "step": 10708 }, { "crossentropy": 2.5623786449432373, "epoch": 0.38823230858468677, "grad_norm": 0.03374074026942253, "grad_norm_var": 5.006736029904595e-06, "learning_rate": 0.006902674901739002, "loss": 2.5394, "step": 10709 }, { "crossentropy": 2.6426126956939697, "epoch": 0.3882685614849188, "grad_norm": 0.028976505622267723, "grad_norm_var": 4.2121189248312336e-06, "learning_rate": 0.006902137518417763, "loss": 2.6003, "step": 10710 }, { "crossentropy": 2.7098817825317383, "epoch": 0.3883048143851508, "grad_norm": 0.030335279181599617, "grad_norm_var": 4.191992220049269e-06, "learning_rate": 0.0069016001094052695, "loss": 2.7201, "step": 10711 }, { "crossentropy": 2.5904903411865234, "epoch": 0.3883410672853828, "grad_norm": 0.02696036733686924, "grad_norm_var": 4.194870945829519e-06, "learning_rate": 0.006901062674708779, "loss": 2.6688, "step": 10712 }, { "crossentropy": 2.6075849533081055, "epoch": 0.3883773201856148, "grad_norm": 0.027171749621629715, "grad_norm_var": 4.403629760213236e-06, "learning_rate": 0.006900525214335556, "loss": 2.5844, "step": 10713 }, { "crossentropy": 2.5877771377563477, "epoch": 0.38841357308584684, "grad_norm": 0.030446836724877357, "grad_norm_var": 4.452443352199543e-06, "learning_rate": 0.006899987728292854, "loss": 2.6088, "step": 10714 }, { "crossentropy": 2.5901594161987305, "epoch": 0.3884498259860789, "grad_norm": 0.029490552842617035, "grad_norm_var": 4.1375461123470025e-06, "learning_rate": 0.006899450216587934, "loss": 2.6016, "step": 10715 }, { "crossentropy": 2.632657289505005, "epoch": 0.3884860788863109, "grad_norm": 0.028228813782334328, "grad_norm_var": 4.065210457099506e-06, "learning_rate": 0.006898912679228054, "loss": 2.6989, "step": 10716 }, { "crossentropy": 2.5786664485931396, "epoch": 0.38852233178654294, "grad_norm": 0.027036312967538834, "grad_norm_var": 4.306157906421012e-06, "learning_rate": 0.006898375116220481, "loss": 2.6169, "step": 10717 }, { "crossentropy": 2.7443206310272217, "epoch": 0.38855858468677495, "grad_norm": 0.02743908390402794, "grad_norm_var": 4.24587954612569e-06, "learning_rate": 0.006897837527572469, "loss": 2.6568, "step": 10718 }, { "crossentropy": 2.6101648807525635, "epoch": 0.38859483758700697, "grad_norm": 0.030087139457464218, "grad_norm_var": 4.289701667998487e-06, "learning_rate": 0.006897299913291282, "loss": 2.6623, "step": 10719 }, { "crossentropy": 2.576991319656372, "epoch": 0.388631090487239, "grad_norm": 0.028829453513026237, "grad_norm_var": 4.084029748855596e-06, "learning_rate": 0.006896762273384178, "loss": 2.5294, "step": 10720 }, { "crossentropy": 2.459977626800537, "epoch": 0.388667343387471, "grad_norm": 0.02862858586013317, "grad_norm_var": 4.081141800617175e-06, "learning_rate": 0.006896224607858424, "loss": 2.5355, "step": 10721 }, { "crossentropy": 2.6610333919525146, "epoch": 0.388703596287703, "grad_norm": 0.02810933068394661, "grad_norm_var": 3.880880190975965e-06, "learning_rate": 0.006895686916721278, "loss": 2.5725, "step": 10722 }, { "crossentropy": 2.7176129817962646, "epoch": 0.388739849187935, "grad_norm": 0.02853190340101719, "grad_norm_var": 3.6343253044694595e-06, "learning_rate": 0.006895149199980003, "loss": 2.7188, "step": 10723 }, { "crossentropy": 2.594599485397339, "epoch": 0.38877610208816704, "grad_norm": 0.026126446202397346, "grad_norm_var": 3.3779293361597585e-06, "learning_rate": 0.006894611457641862, "loss": 2.6121, "step": 10724 }, { "crossentropy": 2.6014411449432373, "epoch": 0.38881235498839906, "grad_norm": 0.02769208699464798, "grad_norm_var": 1.6466110417054687e-06, "learning_rate": 0.006894073689714119, "loss": 2.5833, "step": 10725 }, { "crossentropy": 2.5611159801483154, "epoch": 0.38884860788863107, "grad_norm": 0.026971066370606422, "grad_norm_var": 1.7386465404027693e-06, "learning_rate": 0.006893535896204035, "loss": 2.6256, "step": 10726 }, { "crossentropy": 2.5794758796691895, "epoch": 0.3888848607888631, "grad_norm": 0.02885511703789234, "grad_norm_var": 1.4650849220919447e-06, "learning_rate": 0.006892998077118876, "loss": 2.6484, "step": 10727 }, { "crossentropy": 2.715017318725586, "epoch": 0.38892111368909515, "grad_norm": 0.028185972943902016, "grad_norm_var": 1.3624718355810151e-06, "learning_rate": 0.006892460232465905, "loss": 2.6842, "step": 10728 }, { "crossentropy": 2.583014726638794, "epoch": 0.38895736658932717, "grad_norm": 0.02811191976070404, "grad_norm_var": 1.2838800182821448e-06, "learning_rate": 0.006891922362252388, "loss": 2.5952, "step": 10729 }, { "crossentropy": 2.667372703552246, "epoch": 0.3889936194895592, "grad_norm": 0.029501302167773247, "grad_norm_var": 1.0688713225266334e-06, "learning_rate": 0.006891384466485587, "loss": 2.6182, "step": 10730 }, { "crossentropy": 2.5953049659729004, "epoch": 0.3890298723897912, "grad_norm": 0.02812092751264572, "grad_norm_var": 9.575713723995362e-07, "learning_rate": 0.006890846545172769, "loss": 2.6074, "step": 10731 }, { "crossentropy": 2.6279194355010986, "epoch": 0.3890661252900232, "grad_norm": 0.029299793764948845, "grad_norm_var": 1.0400181587649394e-06, "learning_rate": 0.006890308598321198, "loss": 2.6152, "step": 10732 }, { "crossentropy": 2.656115770339966, "epoch": 0.3891023781902552, "grad_norm": 0.02681293524801731, "grad_norm_var": 1.0784033264428042e-06, "learning_rate": 0.006889770625938141, "loss": 2.6527, "step": 10733 }, { "crossentropy": 2.654139280319214, "epoch": 0.38913863109048724, "grad_norm": 0.029870502650737762, "grad_norm_var": 1.199121610643133e-06, "learning_rate": 0.006889232628030865, "loss": 2.6338, "step": 10734 }, { "crossentropy": 2.7270724773406982, "epoch": 0.38917488399071926, "grad_norm": 0.02814854495227337, "grad_norm_var": 9.871639432630959e-07, "learning_rate": 0.006888694604606635, "loss": 2.6463, "step": 10735 }, { "crossentropy": 2.5915939807891846, "epoch": 0.38921113689095127, "grad_norm": 0.027681441977620125, "grad_norm_var": 9.788860087419123e-07, "learning_rate": 0.0068881565556727165, "loss": 2.5781, "step": 10736 }, { "crossentropy": 2.6997132301330566, "epoch": 0.3892473897911833, "grad_norm": 0.02755417674779892, "grad_norm_var": 9.84692940280268e-07, "learning_rate": 0.00688761848123638, "loss": 2.7052, "step": 10737 }, { "crossentropy": 2.4918718338012695, "epoch": 0.3892836426914153, "grad_norm": 0.03133762255311012, "grad_norm_var": 1.6407897684103089e-06, "learning_rate": 0.0068870803813048925, "loss": 2.4931, "step": 10738 }, { "crossentropy": 2.654170274734497, "epoch": 0.3893198955916473, "grad_norm": 0.030055947601795197, "grad_norm_var": 1.8330609702936572e-06, "learning_rate": 0.006886542255885518, "loss": 2.6373, "step": 10739 }, { "crossentropy": 2.555907726287842, "epoch": 0.38935614849187933, "grad_norm": 0.02773343026638031, "grad_norm_var": 1.5083124605415333e-06, "learning_rate": 0.006886004104985528, "loss": 2.5946, "step": 10740 }, { "crossentropy": 2.6027767658233643, "epoch": 0.38939240139211134, "grad_norm": 0.029323430731892586, "grad_norm_var": 1.4998251417930175e-06, "learning_rate": 0.006885465928612191, "loss": 2.6231, "step": 10741 }, { "crossentropy": 2.705069065093994, "epoch": 0.3894286542923434, "grad_norm": 0.028037721291184425, "grad_norm_var": 1.339585497884159e-06, "learning_rate": 0.006884927726772777, "loss": 2.6415, "step": 10742 }, { "crossentropy": 2.7009541988372803, "epoch": 0.3894649071925754, "grad_norm": 0.02989359200000763, "grad_norm_var": 1.4333913532569667e-06, "learning_rate": 0.006884389499474551, "loss": 2.7302, "step": 10743 }, { "crossentropy": 2.766890525817871, "epoch": 0.38950116009280744, "grad_norm": 0.027287693694233894, "grad_norm_var": 1.5489010029297313e-06, "learning_rate": 0.006883851246724786, "loss": 2.7544, "step": 10744 }, { "crossentropy": 2.6057217121124268, "epoch": 0.38953741299303946, "grad_norm": 0.02736753411591053, "grad_norm_var": 1.6392393909824086e-06, "learning_rate": 0.006883312968530751, "loss": 2.5465, "step": 10745 }, { "crossentropy": 2.6404542922973633, "epoch": 0.38957366589327147, "grad_norm": 0.02790731191635132, "grad_norm_var": 1.6121507152133096e-06, "learning_rate": 0.006882774664899718, "loss": 2.6009, "step": 10746 }, { "crossentropy": 2.6687233448028564, "epoch": 0.3896099187935035, "grad_norm": 0.02759919874370098, "grad_norm_var": 1.6574138710284536e-06, "learning_rate": 0.006882236335838954, "loss": 2.7538, "step": 10747 }, { "crossentropy": 2.4994678497314453, "epoch": 0.3896461716937355, "grad_norm": 0.02866492234170437, "grad_norm_var": 1.6144315609303936e-06, "learning_rate": 0.006881697981355732, "loss": 2.5712, "step": 10748 }, { "crossentropy": 2.6011462211608887, "epoch": 0.3896824245939675, "grad_norm": 0.028826920315623283, "grad_norm_var": 1.427061217803407e-06, "learning_rate": 0.006881159601457323, "loss": 2.6053, "step": 10749 }, { "crossentropy": 2.712758779525757, "epoch": 0.38971867749419953, "grad_norm": 0.029537832364439964, "grad_norm_var": 1.3767641779959793e-06, "learning_rate": 0.006880621196151, "loss": 2.6788, "step": 10750 }, { "crossentropy": 2.5854451656341553, "epoch": 0.38975493039443154, "grad_norm": 0.027595175430178642, "grad_norm_var": 1.4262486655184162e-06, "learning_rate": 0.006880082765444034, "loss": 2.5132, "step": 10751 }, { "crossentropy": 2.645850419998169, "epoch": 0.38979118329466356, "grad_norm": 0.028155773878097534, "grad_norm_var": 1.3869447684901424e-06, "learning_rate": 0.006879544309343695, "loss": 2.6519, "step": 10752 }, { "crossentropy": 2.3259379863739014, "epoch": 0.38982743619489557, "grad_norm": 0.02753688022494316, "grad_norm_var": 1.3892713208893722e-06, "learning_rate": 0.006879005827857259, "loss": 2.5124, "step": 10753 }, { "crossentropy": 2.627214193344116, "epoch": 0.3898636890951276, "grad_norm": 0.026773791760206223, "grad_norm_var": 9.970769078762684e-07, "learning_rate": 0.006878467320991998, "loss": 2.6865, "step": 10754 }, { "crossentropy": 2.624338388442993, "epoch": 0.38989994199535966, "grad_norm": 0.027919156476855278, "grad_norm_var": 7.732111509144966e-07, "learning_rate": 0.0068779287887551855, "loss": 2.6055, "step": 10755 }, { "crossentropy": 2.501278877258301, "epoch": 0.38993619489559167, "grad_norm": 0.027150940150022507, "grad_norm_var": 8.256068997599767e-07, "learning_rate": 0.006877390231154095, "loss": 2.5341, "step": 10756 }, { "crossentropy": 2.562323808670044, "epoch": 0.3899724477958237, "grad_norm": 0.027903711423277855, "grad_norm_var": 7.19729895323704e-07, "learning_rate": 0.006876851648195998, "loss": 2.5182, "step": 10757 }, { "crossentropy": 2.446226119995117, "epoch": 0.3900087006960557, "grad_norm": 0.028421230614185333, "grad_norm_var": 7.303457672123605e-07, "learning_rate": 0.006876313039888174, "loss": 2.5223, "step": 10758 }, { "crossentropy": 2.6886439323425293, "epoch": 0.3900449535962877, "grad_norm": 0.02770336903631687, "grad_norm_var": 4.870642998815062e-07, "learning_rate": 0.0068757744062378934, "loss": 2.6238, "step": 10759 }, { "crossentropy": 2.606071949005127, "epoch": 0.39008120649651973, "grad_norm": 0.027674926444888115, "grad_norm_var": 4.649788099076827e-07, "learning_rate": 0.006875235747252434, "loss": 2.6295, "step": 10760 }, { "crossentropy": 2.455522298812866, "epoch": 0.39011745939675174, "grad_norm": 0.02675129845738411, "grad_norm_var": 5.342020856944114e-07, "learning_rate": 0.0068746970629390705, "loss": 2.5299, "step": 10761 }, { "crossentropy": 2.5775187015533447, "epoch": 0.39015371229698376, "grad_norm": 0.029285894706845284, "grad_norm_var": 6.575154203648677e-07, "learning_rate": 0.006874158353305077, "loss": 2.5429, "step": 10762 }, { "crossentropy": 2.631716728210449, "epoch": 0.39018996519721577, "grad_norm": 0.029374102130532265, "grad_norm_var": 7.669372221662714e-07, "learning_rate": 0.0068736196183577306, "loss": 2.6172, "step": 10763 }, { "crossentropy": 2.5367496013641357, "epoch": 0.3902262180974478, "grad_norm": 0.026954084634780884, "grad_norm_var": 8.163868630801091e-07, "learning_rate": 0.006873080858104308, "loss": 2.6062, "step": 10764 }, { "crossentropy": 2.741572856903076, "epoch": 0.3902624709976798, "grad_norm": 0.026997249573469162, "grad_norm_var": 8.172543338372714e-07, "learning_rate": 0.006872542072552087, "loss": 2.6532, "step": 10765 }, { "crossentropy": 2.617568254470825, "epoch": 0.3902987238979118, "grad_norm": 0.03189186006784439, "grad_norm_var": 1.6906988208008895e-06, "learning_rate": 0.006872003261708343, "loss": 2.5694, "step": 10766 }, { "crossentropy": 2.6441144943237305, "epoch": 0.39033497679814383, "grad_norm": 0.03603333234786987, "grad_norm_var": 5.6791018901324855e-06, "learning_rate": 0.006871464425580354, "loss": 2.6235, "step": 10767 }, { "crossentropy": 2.482567548751831, "epoch": 0.39037122969837584, "grad_norm": 0.0333913192152977, "grad_norm_var": 7.128971432158031e-06, "learning_rate": 0.006870925564175397, "loss": 2.485, "step": 10768 }, { "crossentropy": 2.607409954071045, "epoch": 0.3904074825986079, "grad_norm": 0.02744353376328945, "grad_norm_var": 7.145986284573278e-06, "learning_rate": 0.006870386677500753, "loss": 2.6113, "step": 10769 }, { "crossentropy": 2.495267152786255, "epoch": 0.39044373549883993, "grad_norm": 0.028009258210659027, "grad_norm_var": 6.8986548144512326e-06, "learning_rate": 0.006869847765563697, "loss": 2.5917, "step": 10770 }, { "crossentropy": 2.4439594745635986, "epoch": 0.39047998839907194, "grad_norm": 0.028287626802921295, "grad_norm_var": 6.857400765656179e-06, "learning_rate": 0.00686930882837151, "loss": 2.5459, "step": 10771 }, { "crossentropy": 2.53704833984375, "epoch": 0.39051624129930396, "grad_norm": 0.02779965102672577, "grad_norm_var": 6.727694466168027e-06, "learning_rate": 0.00686876986593147, "loss": 2.6514, "step": 10772 }, { "crossentropy": 2.3810794353485107, "epoch": 0.39055249419953597, "grad_norm": 0.02710655890405178, "grad_norm_var": 6.883416281543847e-06, "learning_rate": 0.006868230878250857, "loss": 2.5026, "step": 10773 }, { "crossentropy": 2.601426839828491, "epoch": 0.390588747099768, "grad_norm": 0.026672260835766792, "grad_norm_var": 7.196815334076682e-06, "learning_rate": 0.006867691865336951, "loss": 2.582, "step": 10774 }, { "crossentropy": 2.741245985031128, "epoch": 0.390625, "grad_norm": 0.029556062072515488, "grad_norm_var": 7.131550766510154e-06, "learning_rate": 0.006867152827197033, "loss": 2.7736, "step": 10775 }, { "crossentropy": 2.353428840637207, "epoch": 0.390661252900232, "grad_norm": 0.02885836362838745, "grad_norm_var": 7.017601375271807e-06, "learning_rate": 0.006866613763838382, "loss": 2.5017, "step": 10776 }, { "crossentropy": 2.831144332885742, "epoch": 0.39069750580046403, "grad_norm": 0.029543111100792885, "grad_norm_var": 6.658083695859109e-06, "learning_rate": 0.006866074675268279, "loss": 2.768, "step": 10777 }, { "crossentropy": 2.527801990509033, "epoch": 0.39073375870069604, "grad_norm": 0.028238311409950256, "grad_norm_var": 6.714712805212898e-06, "learning_rate": 0.006865535561494005, "loss": 2.598, "step": 10778 }, { "crossentropy": 2.5483338832855225, "epoch": 0.39077001160092806, "grad_norm": 0.02679201029241085, "grad_norm_var": 7.049023560507063e-06, "learning_rate": 0.006864996422522843, "loss": 2.5307, "step": 10779 }, { "crossentropy": 2.4677937030792236, "epoch": 0.39080626450116007, "grad_norm": 0.028651708737015724, "grad_norm_var": 6.772069500946461e-06, "learning_rate": 0.006864457258362075, "loss": 2.5278, "step": 10780 }, { "crossentropy": 2.677466869354248, "epoch": 0.3908425174013921, "grad_norm": 0.028860528022050858, "grad_norm_var": 6.471745714647231e-06, "learning_rate": 0.006863918069018981, "loss": 2.5841, "step": 10781 }, { "crossentropy": 2.629469871520996, "epoch": 0.39087877030162416, "grad_norm": 0.041376497596502304, "grad_norm_var": 1.5503416407366444e-05, "learning_rate": 0.006863378854500846, "loss": 2.5793, "step": 10782 }, { "crossentropy": 2.854029893875122, "epoch": 0.39091502320185617, "grad_norm": 0.03247179463505745, "grad_norm_var": 1.3330829409040346e-05, "learning_rate": 0.006862839614814951, "loss": 2.8191, "step": 10783 }, { "crossentropy": 2.602296829223633, "epoch": 0.3909512761020882, "grad_norm": 0.03306072577834129, "grad_norm_var": 1.3169050589282618e-05, "learning_rate": 0.00686230034996858, "loss": 2.6556, "step": 10784 }, { "crossentropy": 2.503843069076538, "epoch": 0.3909875290023202, "grad_norm": 0.027246281504631042, "grad_norm_var": 1.322676471730079e-05, "learning_rate": 0.006861761059969016, "loss": 2.5301, "step": 10785 }, { "crossentropy": 2.576444387435913, "epoch": 0.3910237819025522, "grad_norm": 0.027768665924668312, "grad_norm_var": 1.3279268091878164e-05, "learning_rate": 0.006861221744823545, "loss": 2.5921, "step": 10786 }, { "crossentropy": 2.6252341270446777, "epoch": 0.39106003480278423, "grad_norm": 0.027917828410863876, "grad_norm_var": 1.3348487008417858e-05, "learning_rate": 0.006860682404539449, "loss": 2.6642, "step": 10787 }, { "crossentropy": 2.5333964824676514, "epoch": 0.39109628770301624, "grad_norm": 0.03064405731856823, "grad_norm_var": 1.321117575359015e-05, "learning_rate": 0.006860143039124013, "loss": 2.6168, "step": 10788 }, { "crossentropy": 2.4991564750671387, "epoch": 0.39113254060324826, "grad_norm": 0.032601188868284225, "grad_norm_var": 1.3218039509120306e-05, "learning_rate": 0.006859603648584523, "loss": 2.5869, "step": 10789 }, { "crossentropy": 2.7949347496032715, "epoch": 0.39116879350348027, "grad_norm": 0.03112325631082058, "grad_norm_var": 1.2471727867210438e-05, "learning_rate": 0.006859064232928264, "loss": 2.6838, "step": 10790 }, { "crossentropy": 2.640880823135376, "epoch": 0.3912050464037123, "grad_norm": 0.02842111326754093, "grad_norm_var": 1.2663964687031774e-05, "learning_rate": 0.006858524792162521, "loss": 2.5758, "step": 10791 }, { "crossentropy": 2.4562888145446777, "epoch": 0.3912412993039443, "grad_norm": 0.031593017280101776, "grad_norm_var": 1.2633616358487472e-05, "learning_rate": 0.00685798532629458, "loss": 2.523, "step": 10792 }, { "crossentropy": 2.7820727825164795, "epoch": 0.3912775522041763, "grad_norm": 0.030217444524168968, "grad_norm_var": 1.2585498071070396e-05, "learning_rate": 0.0068574458353317285, "loss": 2.7072, "step": 10793 }, { "crossentropy": 2.5792293548583984, "epoch": 0.39131380510440833, "grad_norm": 0.028511377051472664, "grad_norm_var": 1.251012409197172e-05, "learning_rate": 0.006856906319281251, "loss": 2.6165, "step": 10794 }, { "crossentropy": 2.6434807777404785, "epoch": 0.39135005800464034, "grad_norm": 0.028206253424286842, "grad_norm_var": 1.19446801377695e-05, "learning_rate": 0.006856366778150436, "loss": 2.6405, "step": 10795 }, { "crossentropy": 2.487535238265991, "epoch": 0.3913863109048724, "grad_norm": 0.028943631798028946, "grad_norm_var": 1.187643101741366e-05, "learning_rate": 0.006855827211946569, "loss": 2.655, "step": 10796 }, { "crossentropy": 2.5260791778564453, "epoch": 0.39142256380510443, "grad_norm": 0.029841283336281776, "grad_norm_var": 1.1714283162171599e-05, "learning_rate": 0.006855287620676942, "loss": 2.6059, "step": 10797 }, { "crossentropy": 2.645902156829834, "epoch": 0.39145881670533644, "grad_norm": 0.029187273234128952, "grad_norm_var": 3.521056103095108e-06, "learning_rate": 0.00685474800434884, "loss": 2.6116, "step": 10798 }, { "crossentropy": 2.7178328037261963, "epoch": 0.39149506960556846, "grad_norm": 0.028230100870132446, "grad_norm_var": 3.168259648358059e-06, "learning_rate": 0.00685420836296955, "loss": 2.6122, "step": 10799 }, { "crossentropy": 2.6139321327209473, "epoch": 0.39153132250580047, "grad_norm": 0.026894332841038704, "grad_norm_var": 2.6949804334517317e-06, "learning_rate": 0.0068536686965463625, "loss": 2.5635, "step": 10800 }, { "crossentropy": 2.648524761199951, "epoch": 0.3915675754060325, "grad_norm": 0.027820872142910957, "grad_norm_var": 2.565232254995766e-06, "learning_rate": 0.006853129005086567, "loss": 2.6474, "step": 10801 }, { "crossentropy": 2.5294976234436035, "epoch": 0.3916038283062645, "grad_norm": 0.030159030109643936, "grad_norm_var": 2.4517833712177217e-06, "learning_rate": 0.006852589288597452, "loss": 2.5399, "step": 10802 }, { "crossentropy": 2.5211570262908936, "epoch": 0.3916400812064965, "grad_norm": 0.028097543865442276, "grad_norm_var": 2.418417788847234e-06, "learning_rate": 0.006852049547086307, "loss": 2.4786, "step": 10803 }, { "crossentropy": 2.6280698776245117, "epoch": 0.39167633410672853, "grad_norm": 0.028850652277469635, "grad_norm_var": 2.3233284190371866e-06, "learning_rate": 0.0068515097805604205, "loss": 2.6206, "step": 10804 }, { "crossentropy": 2.5347578525543213, "epoch": 0.39171258700696054, "grad_norm": 0.030088897794485092, "grad_norm_var": 1.609869835254331e-06, "learning_rate": 0.006850969989027087, "loss": 2.5913, "step": 10805 }, { "crossentropy": 2.6096622943878174, "epoch": 0.39174883990719256, "grad_norm": 0.030940229073166847, "grad_norm_var": 1.5634826242678924e-06, "learning_rate": 0.006850430172493594, "loss": 2.6198, "step": 10806 }, { "crossentropy": 2.5575602054595947, "epoch": 0.3917850928074246, "grad_norm": 0.02822151593863964, "grad_norm_var": 1.5847101670837226e-06, "learning_rate": 0.0068498903309672345, "loss": 2.6323, "step": 10807 }, { "crossentropy": 2.67665696144104, "epoch": 0.3918213457076566, "grad_norm": 0.026043467223644257, "grad_norm_var": 1.6742801132511749e-06, "learning_rate": 0.006849350464455296, "loss": 2.6349, "step": 10808 }, { "crossentropy": 2.5650711059570312, "epoch": 0.39185759860788866, "grad_norm": 0.027343308553099632, "grad_norm_var": 1.6343011908933591e-06, "learning_rate": 0.006848810572965076, "loss": 2.4841, "step": 10809 }, { "crossentropy": 2.7250094413757324, "epoch": 0.39189385150812067, "grad_norm": 0.02803609147667885, "grad_norm_var": 1.653163606000563e-06, "learning_rate": 0.006848270656503862, "loss": 2.7197, "step": 10810 }, { "crossentropy": 2.556100845336914, "epoch": 0.3919301044083527, "grad_norm": 0.02704034559428692, "grad_norm_var": 1.7925744881919544e-06, "learning_rate": 0.006847730715078948, "loss": 2.6061, "step": 10811 }, { "crossentropy": 2.634977340698242, "epoch": 0.3919663573085847, "grad_norm": 0.027011960744857788, "grad_norm_var": 1.9073157529259405e-06, "learning_rate": 0.006847190748697627, "loss": 2.6766, "step": 10812 }, { "crossentropy": 2.624664545059204, "epoch": 0.3920026102088167, "grad_norm": 0.027877915650606155, "grad_norm_var": 1.7612351325628495e-06, "learning_rate": 0.0068466507573671925, "loss": 2.6972, "step": 10813 }, { "crossentropy": 2.698185682296753, "epoch": 0.39203886310904873, "grad_norm": 0.029857443645596504, "grad_norm_var": 1.8739304999644863e-06, "learning_rate": 0.006846110741094935, "loss": 2.6357, "step": 10814 }, { "crossentropy": 2.4485018253326416, "epoch": 0.39207511600928074, "grad_norm": 0.02752702496945858, "grad_norm_var": 1.90970044221152e-06, "learning_rate": 0.006845570699888153, "loss": 2.5456, "step": 10815 }, { "crossentropy": 2.4399502277374268, "epoch": 0.39211136890951276, "grad_norm": 0.028141861781477928, "grad_norm_var": 1.783441787066529e-06, "learning_rate": 0.0068450306337541345, "loss": 2.5367, "step": 10816 }, { "crossentropy": 2.466749906539917, "epoch": 0.3921476218097448, "grad_norm": 0.0291165541857481, "grad_norm_var": 1.8028058498505025e-06, "learning_rate": 0.00684449054270018, "loss": 2.5148, "step": 10817 }, { "crossentropy": 2.668938398361206, "epoch": 0.3921838747099768, "grad_norm": 0.030408870428800583, "grad_norm_var": 1.8654000965220274e-06, "learning_rate": 0.006843950426733581, "loss": 2.658, "step": 10818 }, { "crossentropy": 2.684678792953491, "epoch": 0.3922201276102088, "grad_norm": 0.02954057976603508, "grad_norm_var": 1.9349037751446746e-06, "learning_rate": 0.006843410285861634, "loss": 2.6625, "step": 10819 }, { "crossentropy": 2.4941046237945557, "epoch": 0.3922563805104408, "grad_norm": 0.026197580620646477, "grad_norm_var": 2.2518202614154784e-06, "learning_rate": 0.006842870120091632, "loss": 2.5713, "step": 10820 }, { "crossentropy": 2.550297975540161, "epoch": 0.39229263341067283, "grad_norm": 0.0277980025857687, "grad_norm_var": 2.0447423642664177e-06, "learning_rate": 0.006842329929430873, "loss": 2.6742, "step": 10821 }, { "crossentropy": 2.5773675441741943, "epoch": 0.39232888631090485, "grad_norm": 0.028322571888566017, "grad_norm_var": 1.5144817149823422e-06, "learning_rate": 0.006841789713886654, "loss": 2.6367, "step": 10822 }, { "crossentropy": 2.674028158187866, "epoch": 0.3923651392111369, "grad_norm": 0.028155436739325523, "grad_norm_var": 1.513070061760248e-06, "learning_rate": 0.0068412494734662686, "loss": 2.6438, "step": 10823 }, { "crossentropy": 2.6627659797668457, "epoch": 0.39240139211136893, "grad_norm": 0.02929094433784485, "grad_norm_var": 1.3136895286415425e-06, "learning_rate": 0.006840709208177014, "loss": 2.6596, "step": 10824 }, { "crossentropy": 2.48598575592041, "epoch": 0.39243764501160094, "grad_norm": 0.02855600416660309, "grad_norm_var": 1.2623688665505065e-06, "learning_rate": 0.006840168918026189, "loss": 2.669, "step": 10825 }, { "crossentropy": 2.6887338161468506, "epoch": 0.39247389791183296, "grad_norm": 0.027575809508562088, "grad_norm_var": 1.292110138259569e-06, "learning_rate": 0.006839628603021091, "loss": 2.6442, "step": 10826 }, { "crossentropy": 2.695927381515503, "epoch": 0.392510150812065, "grad_norm": 0.027407892048358917, "grad_norm_var": 1.239989666001797e-06, "learning_rate": 0.006839088263169016, "loss": 2.665, "step": 10827 }, { "crossentropy": 2.576777219772339, "epoch": 0.392546403712297, "grad_norm": 0.026644667610526085, "grad_norm_var": 1.3114581139730355e-06, "learning_rate": 0.0068385478984772635, "loss": 2.6089, "step": 10828 }, { "crossentropy": 2.6290576457977295, "epoch": 0.392582656612529, "grad_norm": 0.027620866894721985, "grad_norm_var": 1.3292381232799052e-06, "learning_rate": 0.00683800750895313, "loss": 2.6353, "step": 10829 }, { "crossentropy": 2.5446958541870117, "epoch": 0.392618909512761, "grad_norm": 0.025889525189995766, "grad_norm_var": 1.4681946694401152e-06, "learning_rate": 0.006837467094603918, "loss": 2.5889, "step": 10830 }, { "crossentropy": 2.638279438018799, "epoch": 0.39265516241299303, "grad_norm": 0.027184881269931793, "grad_norm_var": 1.4976414692538867e-06, "learning_rate": 0.006836926655436924, "loss": 2.6055, "step": 10831 }, { "crossentropy": 2.4678306579589844, "epoch": 0.39269141531322505, "grad_norm": 0.025923125445842743, "grad_norm_var": 1.760613181895594e-06, "learning_rate": 0.006836386191459446, "loss": 2.4911, "step": 10832 }, { "crossentropy": 2.503924608230591, "epoch": 0.39272766821345706, "grad_norm": 0.028023580089211464, "grad_norm_var": 1.6510038323281576e-06, "learning_rate": 0.006835845702678788, "loss": 2.5659, "step": 10833 }, { "crossentropy": 2.4952306747436523, "epoch": 0.3927639211136891, "grad_norm": 0.03124992363154888, "grad_norm_var": 1.989594240736116e-06, "learning_rate": 0.006835305189102248, "loss": 2.5301, "step": 10834 }, { "crossentropy": 2.6253979206085205, "epoch": 0.3928001740139211, "grad_norm": 0.0319385752081871, "grad_norm_var": 2.893895313352268e-06, "learning_rate": 0.006834764650737124, "loss": 2.5852, "step": 10835 }, { "crossentropy": 2.4672327041625977, "epoch": 0.39283642691415316, "grad_norm": 0.027922607958316803, "grad_norm_var": 2.6684861022195727e-06, "learning_rate": 0.00683422408759072, "loss": 2.5541, "step": 10836 }, { "crossentropy": 2.538058042526245, "epoch": 0.3928726798143852, "grad_norm": 0.027695346623659134, "grad_norm_var": 2.673196550528195e-06, "learning_rate": 0.006833683499670336, "loss": 2.5354, "step": 10837 }, { "crossentropy": 2.6319029331207275, "epoch": 0.3929089327146172, "grad_norm": 0.027739787474274635, "grad_norm_var": 2.6761662814985914e-06, "learning_rate": 0.0068331428869832745, "loss": 2.648, "step": 10838 }, { "crossentropy": 2.7187159061431885, "epoch": 0.3929451856148492, "grad_norm": 0.02786494791507721, "grad_norm_var": 2.677402438654612e-06, "learning_rate": 0.006832602249536837, "loss": 2.5706, "step": 10839 }, { "crossentropy": 2.690735101699829, "epoch": 0.3929814385150812, "grad_norm": 0.029454490169882774, "grad_norm_var": 2.706504353382905e-06, "learning_rate": 0.006832061587338323, "loss": 2.6594, "step": 10840 }, { "crossentropy": 2.448612928390503, "epoch": 0.39301769141531323, "grad_norm": 0.026830336079001427, "grad_norm_var": 2.774646315297395e-06, "learning_rate": 0.006831520900395037, "loss": 2.5222, "step": 10841 }, { "crossentropy": 2.7692718505859375, "epoch": 0.39305394431554525, "grad_norm": 0.027582047507166862, "grad_norm_var": 2.774349665922669e-06, "learning_rate": 0.006830980188714283, "loss": 2.6767, "step": 10842 }, { "crossentropy": 2.568727731704712, "epoch": 0.39309019721577726, "grad_norm": 0.027984675019979477, "grad_norm_var": 2.7545445918583075e-06, "learning_rate": 0.006830439452303364, "loss": 2.5657, "step": 10843 }, { "crossentropy": 2.427720785140991, "epoch": 0.3931264501160093, "grad_norm": 0.02929011918604374, "grad_norm_var": 2.7238173300488573e-06, "learning_rate": 0.006829898691169581, "loss": 2.5437, "step": 10844 }, { "crossentropy": 2.736729621887207, "epoch": 0.3931627030162413, "grad_norm": 0.030944565311074257, "grad_norm_var": 3.185445049041696e-06, "learning_rate": 0.006829357905320238, "loss": 2.6227, "step": 10845 }, { "crossentropy": 2.4791975021362305, "epoch": 0.3931989559164733, "grad_norm": 0.02826591394841671, "grad_norm_var": 2.7604038517089195e-06, "learning_rate": 0.006828817094762643, "loss": 2.5745, "step": 10846 }, { "crossentropy": 2.49066162109375, "epoch": 0.3932352088167053, "grad_norm": 0.028687985613942146, "grad_norm_var": 2.639359611206667e-06, "learning_rate": 0.006828276259504096, "loss": 2.5693, "step": 10847 }, { "crossentropy": 2.540865182876587, "epoch": 0.39327146171693733, "grad_norm": 0.028584985062479973, "grad_norm_var": 2.13662144067984e-06, "learning_rate": 0.006827735399551904, "loss": 2.5992, "step": 10848 }, { "crossentropy": 2.5411179065704346, "epoch": 0.39330771461716935, "grad_norm": 0.031775377690792084, "grad_norm_var": 2.6511140949591594e-06, "learning_rate": 0.006827194514913371, "loss": 2.5983, "step": 10849 }, { "crossentropy": 2.6280417442321777, "epoch": 0.3933439675174014, "grad_norm": 0.031238753348588943, "grad_norm_var": 2.6477533927756473e-06, "learning_rate": 0.006826653605595805, "loss": 2.5032, "step": 10850 }, { "crossentropy": 2.6308751106262207, "epoch": 0.39338022041763343, "grad_norm": 0.029043398797512054, "grad_norm_var": 2.0324592558098314e-06, "learning_rate": 0.006826112671606508, "loss": 2.6568, "step": 10851 }, { "crossentropy": 2.5126049518585205, "epoch": 0.39341647331786544, "grad_norm": 0.029421336948871613, "grad_norm_var": 1.996200718360957e-06, "learning_rate": 0.00682557171295279, "loss": 2.5618, "step": 10852 }, { "crossentropy": 2.605330228805542, "epoch": 0.39345272621809746, "grad_norm": 0.02929547242820263, "grad_norm_var": 1.8991587263034529e-06, "learning_rate": 0.006825030729641955, "loss": 2.6763, "step": 10853 }, { "crossentropy": 2.7002673149108887, "epoch": 0.3934889791183295, "grad_norm": 0.0297564510256052, "grad_norm_var": 1.814414889211337e-06, "learning_rate": 0.00682448972168131, "loss": 2.6941, "step": 10854 }, { "crossentropy": 2.63664174079895, "epoch": 0.3935252320185615, "grad_norm": 0.028995564207434654, "grad_norm_var": 1.704160323780105e-06, "learning_rate": 0.006823948689078162, "loss": 2.6445, "step": 10855 }, { "crossentropy": 2.7188680171966553, "epoch": 0.3935614849187935, "grad_norm": 0.02936532534658909, "grad_norm_var": 1.701595620835295e-06, "learning_rate": 0.00682340763183982, "loss": 2.7178, "step": 10856 }, { "crossentropy": 2.5531089305877686, "epoch": 0.3935977378190255, "grad_norm": 0.03044799156486988, "grad_norm_var": 1.3806940801877697e-06, "learning_rate": 0.006822866549973591, "loss": 2.5721, "step": 10857 }, { "crossentropy": 2.710115909576416, "epoch": 0.39363399071925753, "grad_norm": 0.03239119425415993, "grad_norm_var": 1.6492607884809515e-06, "learning_rate": 0.006822325443486782, "loss": 2.5821, "step": 10858 }, { "crossentropy": 2.6141281127929688, "epoch": 0.39367024361948955, "grad_norm": 0.029402656480669975, "grad_norm_var": 1.4472049473787518e-06, "learning_rate": 0.006821784312386703, "loss": 2.6531, "step": 10859 }, { "crossentropy": 2.464545726776123, "epoch": 0.39370649651972156, "grad_norm": 0.02748316153883934, "grad_norm_var": 1.7757304257648779e-06, "learning_rate": 0.006821243156680662, "loss": 2.5313, "step": 10860 }, { "crossentropy": 2.636728286743164, "epoch": 0.3937427494199536, "grad_norm": 0.027683118358254433, "grad_norm_var": 1.8966197821235556e-06, "learning_rate": 0.006820701976375969, "loss": 2.6297, "step": 10861 }, { "crossentropy": 2.7239603996276855, "epoch": 0.3937790023201856, "grad_norm": 0.026140836998820305, "grad_norm_var": 2.5256804663715987e-06, "learning_rate": 0.006820160771479932, "loss": 2.6378, "step": 10862 }, { "crossentropy": 2.546821355819702, "epoch": 0.39381525522041766, "grad_norm": 0.027040790766477585, "grad_norm_var": 2.842213680686563e-06, "learning_rate": 0.0068196195419998615, "loss": 2.6235, "step": 10863 }, { "crossentropy": 2.694178342819214, "epoch": 0.3938515081206497, "grad_norm": 0.02741362154483795, "grad_norm_var": 3.032480991998811e-06, "learning_rate": 0.006819078287943068, "loss": 2.6215, "step": 10864 }, { "crossentropy": 2.66294527053833, "epoch": 0.3938877610208817, "grad_norm": 0.027611669152975082, "grad_norm_var": 2.675679701478045e-06, "learning_rate": 0.006818537009316862, "loss": 2.5527, "step": 10865 }, { "crossentropy": 2.592120409011841, "epoch": 0.3939240139211137, "grad_norm": 0.02672385424375534, "grad_norm_var": 2.554267761431528e-06, "learning_rate": 0.0068179957061285535, "loss": 2.4939, "step": 10866 }, { "crossentropy": 2.7194507122039795, "epoch": 0.3939602668213457, "grad_norm": 0.0290500670671463, "grad_norm_var": 2.5546305124725387e-06, "learning_rate": 0.006817454378385455, "loss": 2.6891, "step": 10867 }, { "crossentropy": 2.676259756088257, "epoch": 0.39399651972157773, "grad_norm": 0.02855730801820755, "grad_norm_var": 2.5111550083868616e-06, "learning_rate": 0.006816913026094875, "loss": 2.6627, "step": 10868 }, { "crossentropy": 2.6217944622039795, "epoch": 0.39403277262180975, "grad_norm": 0.02852381393313408, "grad_norm_var": 2.4752662190415197e-06, "learning_rate": 0.006816371649264129, "loss": 2.6337, "step": 10869 }, { "crossentropy": 2.458552837371826, "epoch": 0.39406902552204176, "grad_norm": 0.03093935176730156, "grad_norm_var": 2.755096643691085e-06, "learning_rate": 0.0068158302479005275, "loss": 2.565, "step": 10870 }, { "crossentropy": 2.491691827774048, "epoch": 0.3941052784222738, "grad_norm": 0.028646079823374748, "grad_norm_var": 2.7447939337644594e-06, "learning_rate": 0.006815288822011383, "loss": 2.5688, "step": 10871 }, { "crossentropy": 2.6018590927124023, "epoch": 0.3941415313225058, "grad_norm": 0.02829020284116268, "grad_norm_var": 2.7057226747894657e-06, "learning_rate": 0.0068147473716040075, "loss": 2.5849, "step": 10872 }, { "crossentropy": 2.7632272243499756, "epoch": 0.3941777842227378, "grad_norm": 0.02819233201444149, "grad_norm_var": 2.444353762485104e-06, "learning_rate": 0.006814205896685715, "loss": 2.7293, "step": 10873 }, { "crossentropy": 2.4739367961883545, "epoch": 0.3942140371229698, "grad_norm": 0.02733715809881687, "grad_norm_var": 1.3382030617948288e-06, "learning_rate": 0.006813664397263818, "loss": 2.5976, "step": 10874 }, { "crossentropy": 2.577784538269043, "epoch": 0.39425029002320183, "grad_norm": 0.028683237731456757, "grad_norm_var": 1.2422155718676739e-06, "learning_rate": 0.0068131228733456315, "loss": 2.5721, "step": 10875 }, { "crossentropy": 2.6640114784240723, "epoch": 0.39428654292343385, "grad_norm": 0.02825189009308815, "grad_norm_var": 1.2241468947000085e-06, "learning_rate": 0.006812581324938468, "loss": 2.5735, "step": 10876 }, { "crossentropy": 2.6181082725524902, "epoch": 0.3943227958236659, "grad_norm": 0.02752123773097992, "grad_norm_var": 1.2340884473040532e-06, "learning_rate": 0.006812039752049645, "loss": 2.5678, "step": 10877 }, { "crossentropy": 2.7286744117736816, "epoch": 0.39435904872389793, "grad_norm": 0.027769170701503754, "grad_norm_var": 9.83629595187561e-07, "learning_rate": 0.0068114981546864755, "loss": 2.599, "step": 10878 }, { "crossentropy": 2.5242552757263184, "epoch": 0.39439530162412995, "grad_norm": 0.028343064710497856, "grad_norm_var": 8.953779770040261e-07, "learning_rate": 0.006810956532856273, "loss": 2.4755, "step": 10879 }, { "crossentropy": 2.569293975830078, "epoch": 0.39443155452436196, "grad_norm": 0.029190251603722572, "grad_norm_var": 8.966898804117658e-07, "learning_rate": 0.006810414886566354, "loss": 2.5824, "step": 10880 }, { "crossentropy": 2.6424930095672607, "epoch": 0.394467807424594, "grad_norm": 0.02823690138757229, "grad_norm_var": 8.594117561928298e-07, "learning_rate": 0.006809873215824036, "loss": 2.5952, "step": 10881 }, { "crossentropy": 2.5819201469421387, "epoch": 0.394504060324826, "grad_norm": 0.026854615658521652, "grad_norm_var": 8.314140556392775e-07, "learning_rate": 0.006809331520636633, "loss": 2.5757, "step": 10882 }, { "crossentropy": 2.667314052581787, "epoch": 0.394540313225058, "grad_norm": 0.026796704158186913, "grad_norm_var": 9.53205164729252e-07, "learning_rate": 0.006808789801011461, "loss": 2.6256, "step": 10883 }, { "crossentropy": 2.5342535972595215, "epoch": 0.39457656612529, "grad_norm": 0.02673013135790825, "grad_norm_var": 1.0890285996008216e-06, "learning_rate": 0.006808248056955838, "loss": 2.5911, "step": 10884 }, { "crossentropy": 2.456819772720337, "epoch": 0.39461281902552203, "grad_norm": 0.031713467091321945, "grad_norm_var": 1.8863695373118212e-06, "learning_rate": 0.006807706288477083, "loss": 2.5345, "step": 10885 }, { "crossentropy": 2.4965569972991943, "epoch": 0.39464907192575405, "grad_norm": 0.028824293985962868, "grad_norm_var": 1.4339075966483903e-06, "learning_rate": 0.00680716449558251, "loss": 2.5421, "step": 10886 }, { "crossentropy": 2.573725938796997, "epoch": 0.39468532482598606, "grad_norm": 0.032913174480199814, "grad_norm_var": 2.8192820383028673e-06, "learning_rate": 0.006806622678279437, "loss": 2.6189, "step": 10887 }, { "crossentropy": 2.5766048431396484, "epoch": 0.3947215777262181, "grad_norm": 0.033612340688705444, "grad_norm_var": 4.456347055027199e-06, "learning_rate": 0.006806080836575183, "loss": 2.5891, "step": 10888 }, { "crossentropy": 2.512101411819458, "epoch": 0.3947578306264501, "grad_norm": 0.02809148095548153, "grad_norm_var": 4.4652967815537405e-06, "learning_rate": 0.006805538970477068, "loss": 2.5113, "step": 10889 }, { "crossentropy": 2.6026463508605957, "epoch": 0.39479408352668216, "grad_norm": 0.029235700145363808, "grad_norm_var": 4.319179835796785e-06, "learning_rate": 0.006804997079992409, "loss": 2.5764, "step": 10890 }, { "crossentropy": 2.554938554763794, "epoch": 0.3948303364269142, "grad_norm": 0.029363088309764862, "grad_norm_var": 4.326335379615367e-06, "learning_rate": 0.006804455165128525, "loss": 2.6237, "step": 10891 }, { "crossentropy": 2.5370404720306396, "epoch": 0.3948665893271462, "grad_norm": 0.030429204925894737, "grad_norm_var": 4.4154708327030665e-06, "learning_rate": 0.006803913225892735, "loss": 2.5483, "step": 10892 }, { "crossentropy": 2.630134105682373, "epoch": 0.3949028422273782, "grad_norm": 0.028135906904935837, "grad_norm_var": 4.309568428658336e-06, "learning_rate": 0.0068033712622923605, "loss": 2.5781, "step": 10893 }, { "crossentropy": 2.828009605407715, "epoch": 0.3949390951276102, "grad_norm": 0.031851787120103836, "grad_norm_var": 4.605110980582654e-06, "learning_rate": 0.00680282927433472, "loss": 2.7439, "step": 10894 }, { "crossentropy": 2.466845989227295, "epoch": 0.39497534802784223, "grad_norm": 0.030492451041936874, "grad_norm_var": 4.592345763869048e-06, "learning_rate": 0.006802287262027133, "loss": 2.6024, "step": 10895 }, { "crossentropy": 2.6840672492980957, "epoch": 0.39501160092807425, "grad_norm": 0.0314982607960701, "grad_norm_var": 4.820888580505528e-06, "learning_rate": 0.006801745225376922, "loss": 2.674, "step": 10896 }, { "crossentropy": 2.7152392864227295, "epoch": 0.39504785382830626, "grad_norm": 0.031219888478517532, "grad_norm_var": 4.805558919741927e-06, "learning_rate": 0.006801203164391407, "loss": 2.642, "step": 10897 }, { "crossentropy": 2.589691638946533, "epoch": 0.3950841067285383, "grad_norm": 0.029503894969820976, "grad_norm_var": 4.182557692428377e-06, "learning_rate": 0.006800661079077912, "loss": 2.6675, "step": 10898 }, { "crossentropy": 2.6784486770629883, "epoch": 0.3951203596287703, "grad_norm": 0.028087450191378593, "grad_norm_var": 3.7309696092941806e-06, "learning_rate": 0.006800118969443756, "loss": 2.6388, "step": 10899 }, { "crossentropy": 2.5093653202056885, "epoch": 0.3951566125290023, "grad_norm": 0.02919001504778862, "grad_norm_var": 3.0017925999284436e-06, "learning_rate": 0.00679957683549626, "loss": 2.6005, "step": 10900 }, { "crossentropy": 2.5892910957336426, "epoch": 0.3951928654292343, "grad_norm": 0.027401020750403404, "grad_norm_var": 3.3284704449198226e-06, "learning_rate": 0.006799034677242748, "loss": 2.7098, "step": 10901 }, { "crossentropy": 2.6406500339508057, "epoch": 0.39522911832946633, "grad_norm": 0.027970967814326286, "grad_norm_var": 3.5066819362991927e-06, "learning_rate": 0.006798492494690544, "loss": 2.5914, "step": 10902 }, { "crossentropy": 2.646514892578125, "epoch": 0.3952653712296984, "grad_norm": 0.027669506147503853, "grad_norm_var": 3.144578384723223e-06, "learning_rate": 0.00679795028784697, "loss": 2.6674, "step": 10903 }, { "crossentropy": 2.794541120529175, "epoch": 0.3953016241299304, "grad_norm": 0.0270902831107378, "grad_norm_var": 2.3223069919280167e-06, "learning_rate": 0.006797408056719347, "loss": 2.7044, "step": 10904 }, { "crossentropy": 2.4828779697418213, "epoch": 0.39533787703016243, "grad_norm": 0.028738850727677345, "grad_norm_var": 2.2526503382081966e-06, "learning_rate": 0.0067968658013150005, "loss": 2.4944, "step": 10905 }, { "crossentropy": 2.5006229877471924, "epoch": 0.39537412993039445, "grad_norm": 0.028046289458870888, "grad_norm_var": 2.3421302424119488e-06, "learning_rate": 0.006796323521641257, "loss": 2.524, "step": 10906 }, { "crossentropy": 2.7536227703094482, "epoch": 0.39541038283062646, "grad_norm": 0.0283636637032032, "grad_norm_var": 2.378568742106623e-06, "learning_rate": 0.006795781217705436, "loss": 2.6073, "step": 10907 }, { "crossentropy": 2.603679895401001, "epoch": 0.3954466357308585, "grad_norm": 0.028460567817091942, "grad_norm_var": 2.2733605209658184e-06, "learning_rate": 0.006795238889514864, "loss": 2.6193, "step": 10908 }, { "crossentropy": 2.673621416091919, "epoch": 0.3954828886310905, "grad_norm": 0.029359841719269753, "grad_norm_var": 2.2288217116250263e-06, "learning_rate": 0.006794696537076868, "loss": 2.6448, "step": 10909 }, { "crossentropy": 2.4947800636291504, "epoch": 0.3955191415313225, "grad_norm": 0.0319095253944397, "grad_norm_var": 2.2505298073946522e-06, "learning_rate": 0.006794154160398771, "loss": 2.5498, "step": 10910 }, { "crossentropy": 2.5937037467956543, "epoch": 0.3955553944315545, "grad_norm": 0.030626721680164337, "grad_norm_var": 2.277253881941867e-06, "learning_rate": 0.0067936117594879, "loss": 2.6043, "step": 10911 }, { "crossentropy": 2.611654043197632, "epoch": 0.39559164733178653, "grad_norm": 0.028345918282866478, "grad_norm_var": 1.8781448309369928e-06, "learning_rate": 0.006793069334351579, "loss": 2.5503, "step": 10912 }, { "crossentropy": 2.6322457790374756, "epoch": 0.39562790023201855, "grad_norm": 0.028364790603518486, "grad_norm_var": 1.4945963047944057e-06, "learning_rate": 0.0067925268849971365, "loss": 2.6493, "step": 10913 }, { "crossentropy": 2.347759485244751, "epoch": 0.39566415313225056, "grad_norm": 0.028215117752552032, "grad_norm_var": 1.4595073800066956e-06, "learning_rate": 0.006791984411431898, "loss": 2.4766, "step": 10914 }, { "crossentropy": 2.758594512939453, "epoch": 0.3957004060324826, "grad_norm": 0.028809862211346626, "grad_norm_var": 1.4413071905002798e-06, "learning_rate": 0.006791441913663191, "loss": 2.6903, "step": 10915 }, { "crossentropy": 2.729001998901367, "epoch": 0.3957366589327146, "grad_norm": 0.02945384755730629, "grad_norm_var": 1.46429588963845e-06, "learning_rate": 0.006790899391698342, "loss": 2.7238, "step": 10916 }, { "crossentropy": 2.491197347640991, "epoch": 0.39577291183294666, "grad_norm": 0.03932754322886467, "grad_norm_var": 8.325870341858073e-06, "learning_rate": 0.0067903568455446785, "loss": 2.6842, "step": 10917 }, { "crossentropy": 2.6439311504364014, "epoch": 0.3958091647331787, "grad_norm": 0.028526343405246735, "grad_norm_var": 8.237692918464664e-06, "learning_rate": 0.0067898142752095296, "loss": 2.6398, "step": 10918 }, { "crossentropy": 2.601844072341919, "epoch": 0.3958454176334107, "grad_norm": 0.029312333092093468, "grad_norm_var": 8.014879416465035e-06, "learning_rate": 0.006789271680700223, "loss": 2.5843, "step": 10919 }, { "crossentropy": 2.6998677253723145, "epoch": 0.3958816705336427, "grad_norm": 0.02777940407395363, "grad_norm_var": 7.817684221154511e-06, "learning_rate": 0.006788729062024085, "loss": 2.6732, "step": 10920 }, { "crossentropy": 2.731879711151123, "epoch": 0.3959179234338747, "grad_norm": 0.026251452043652534, "grad_norm_var": 8.49082610918812e-06, "learning_rate": 0.006788186419188448, "loss": 2.6707, "step": 10921 }, { "crossentropy": 2.5607473850250244, "epoch": 0.39595417633410673, "grad_norm": 0.02685116045176983, "grad_norm_var": 8.803313086905238e-06, "learning_rate": 0.00678764375220064, "loss": 2.5522, "step": 10922 }, { "crossentropy": 2.6232573986053467, "epoch": 0.39599042923433875, "grad_norm": 0.028347695246338844, "grad_norm_var": 8.805476711303212e-06, "learning_rate": 0.00678710106106799, "loss": 2.617, "step": 10923 }, { "crossentropy": 2.5466437339782715, "epoch": 0.39602668213457076, "grad_norm": 0.028171595185995102, "grad_norm_var": 8.845789198178716e-06, "learning_rate": 0.006786558345797828, "loss": 2.5596, "step": 10924 }, { "crossentropy": 2.6521849632263184, "epoch": 0.3960629350348028, "grad_norm": 0.0280048456043005, "grad_norm_var": 8.959362209284318e-06, "learning_rate": 0.006786015606397485, "loss": 2.6728, "step": 10925 }, { "crossentropy": 2.552015542984009, "epoch": 0.3960991879350348, "grad_norm": 0.02974858507514, "grad_norm_var": 8.490308537758192e-06, "learning_rate": 0.006785472842874289, "loss": 2.5512, "step": 10926 }, { "crossentropy": 2.524975299835205, "epoch": 0.3961354408352668, "grad_norm": 0.0266372412443161, "grad_norm_var": 8.69080559137217e-06, "learning_rate": 0.0067849300552355754, "loss": 2.5359, "step": 10927 }, { "crossentropy": 2.605086326599121, "epoch": 0.3961716937354988, "grad_norm": 0.026881961151957512, "grad_norm_var": 8.929829788406619e-06, "learning_rate": 0.0067843872434886696, "loss": 2.6126, "step": 10928 }, { "crossentropy": 2.6643762588500977, "epoch": 0.39620794663573083, "grad_norm": 0.027806729078292847, "grad_norm_var": 8.981136987085354e-06, "learning_rate": 0.006783844407640909, "loss": 2.6277, "step": 10929 }, { "crossentropy": 2.6454925537109375, "epoch": 0.3962441995359629, "grad_norm": 0.027666622772812843, "grad_norm_var": 9.039631895640423e-06, "learning_rate": 0.00678330154769962, "loss": 2.6677, "step": 10930 }, { "crossentropy": 2.5575013160705566, "epoch": 0.3962804524361949, "grad_norm": 0.02704676240682602, "grad_norm_var": 9.213630372552915e-06, "learning_rate": 0.006782758663672141, "loss": 2.6629, "step": 10931 }, { "crossentropy": 2.6667709350585938, "epoch": 0.39631670533642693, "grad_norm": 0.02633863314986229, "grad_norm_var": 9.471068284120721e-06, "learning_rate": 0.006782215755565797, "loss": 2.6483, "step": 10932 }, { "crossentropy": 2.462296962738037, "epoch": 0.39635295823665895, "grad_norm": 0.028213098645210266, "grad_norm_var": 1.025623260930567e-06, "learning_rate": 0.006781672823387926, "loss": 2.5176, "step": 10933 }, { "crossentropy": 2.6955034732818604, "epoch": 0.39638921113689096, "grad_norm": 0.026974817737936974, "grad_norm_var": 1.010100384309503e-06, "learning_rate": 0.006781129867145861, "loss": 2.6282, "step": 10934 }, { "crossentropy": 2.7006452083587646, "epoch": 0.396425464037123, "grad_norm": 0.028493745252490044, "grad_norm_var": 8.680413973939534e-07, "learning_rate": 0.0067805868868469335, "loss": 2.7032, "step": 10935 }, { "crossentropy": 2.486992120742798, "epoch": 0.396461716937355, "grad_norm": 0.026972824707627296, "grad_norm_var": 8.868160726112669e-07, "learning_rate": 0.0067800438824984766, "loss": 2.487, "step": 10936 }, { "crossentropy": 2.5636823177337646, "epoch": 0.396497969837587, "grad_norm": 0.027675777673721313, "grad_norm_var": 7.716582198400805e-07, "learning_rate": 0.006779500854107827, "loss": 2.5858, "step": 10937 }, { "crossentropy": 2.5003533363342285, "epoch": 0.396534222737819, "grad_norm": 0.02920129895210266, "grad_norm_var": 8.77659465970297e-07, "learning_rate": 0.006778957801682317, "loss": 2.5619, "step": 10938 }, { "crossentropy": 2.649820327758789, "epoch": 0.39657047563805103, "grad_norm": 0.03014611266553402, "grad_norm_var": 1.2203931758672371e-06, "learning_rate": 0.006778414725229284, "loss": 2.6122, "step": 10939 }, { "crossentropy": 2.6445295810699463, "epoch": 0.39660672853828305, "grad_norm": 0.02956574782729149, "grad_norm_var": 1.3972300119582804e-06, "learning_rate": 0.00677787162475606, "loss": 2.6251, "step": 10940 }, { "crossentropy": 2.6142587661743164, "epoch": 0.39664298143851506, "grad_norm": 0.027410320937633514, "grad_norm_var": 1.4158396708592128e-06, "learning_rate": 0.006777328500269982, "loss": 2.7095, "step": 10941 }, { "crossentropy": 2.5537171363830566, "epoch": 0.3966792343387471, "grad_norm": 0.026979345828294754, "grad_norm_var": 1.2213516699520476e-06, "learning_rate": 0.006776785351778385, "loss": 2.5833, "step": 10942 }, { "crossentropy": 2.53790545463562, "epoch": 0.3967154872389791, "grad_norm": 0.02703355811536312, "grad_norm_var": 1.172331224315763e-06, "learning_rate": 0.006776242179288606, "loss": 2.5958, "step": 10943 }, { "crossentropy": 2.635321617126465, "epoch": 0.39675174013921116, "grad_norm": 0.02825356461107731, "grad_norm_var": 1.1265087813559405e-06, "learning_rate": 0.006775698982807981, "loss": 2.6719, "step": 10944 }, { "crossentropy": 2.6096816062927246, "epoch": 0.3967879930394432, "grad_norm": 0.030848916620016098, "grad_norm_var": 1.6828516535698311e-06, "learning_rate": 0.006775155762343846, "loss": 2.5943, "step": 10945 }, { "crossentropy": 2.5311439037323, "epoch": 0.3968242459396752, "grad_norm": 0.030572673305869102, "grad_norm_var": 2.0616116398290088e-06, "learning_rate": 0.006774612517903538, "loss": 2.6728, "step": 10946 }, { "crossentropy": 2.7023205757141113, "epoch": 0.3968604988399072, "grad_norm": 0.028115134686231613, "grad_norm_var": 1.9639783693455585e-06, "learning_rate": 0.006774069249494395, "loss": 2.7219, "step": 10947 }, { "crossentropy": 2.6972861289978027, "epoch": 0.3968967517401392, "grad_norm": 0.028828611597418785, "grad_norm_var": 1.7004016600726442e-06, "learning_rate": 0.006773525957123755, "loss": 2.7196, "step": 10948 }, { "crossentropy": 2.708066940307617, "epoch": 0.39693300464037123, "grad_norm": 0.029963551089167595, "grad_norm_var": 1.835367706395904e-06, "learning_rate": 0.006772982640798956, "loss": 2.6666, "step": 10949 }, { "crossentropy": 2.390763282775879, "epoch": 0.39696925754060325, "grad_norm": 0.02875799685716629, "grad_norm_var": 1.6560828001147924e-06, "learning_rate": 0.0067724393005273355, "loss": 2.4684, "step": 10950 }, { "crossentropy": 2.52620792388916, "epoch": 0.39700551044083526, "grad_norm": 0.027882622554898262, "grad_norm_var": 1.6942916014311393e-06, "learning_rate": 0.006771895936316233, "loss": 2.5602, "step": 10951 }, { "crossentropy": 2.580355167388916, "epoch": 0.3970417633410673, "grad_norm": 0.028772782534360886, "grad_norm_var": 1.4971485354179354e-06, "learning_rate": 0.006771352548172986, "loss": 2.6773, "step": 10952 }, { "crossentropy": 2.6061840057373047, "epoch": 0.3970780162412993, "grad_norm": 0.0282859168946743, "grad_norm_var": 1.4329846225536684e-06, "learning_rate": 0.006770809136104934, "loss": 2.5819, "step": 10953 }, { "crossentropy": 2.480201244354248, "epoch": 0.3971142691415313, "grad_norm": 0.02741219475865364, "grad_norm_var": 1.5346005716367623e-06, "learning_rate": 0.006770265700119419, "loss": 2.4252, "step": 10954 }, { "crossentropy": 2.6054306030273438, "epoch": 0.3971505220417633, "grad_norm": 0.027554582804441452, "grad_norm_var": 1.4466553254807146e-06, "learning_rate": 0.006769722240223778, "loss": 2.5827, "step": 10955 }, { "crossentropy": 2.5023818016052246, "epoch": 0.39718677494199534, "grad_norm": 0.030990906059741974, "grad_norm_var": 1.7732912713325685e-06, "learning_rate": 0.006769178756425354, "loss": 2.5569, "step": 10956 }, { "crossentropy": 2.5499074459075928, "epoch": 0.3972230278422274, "grad_norm": 0.029836734756827354, "grad_norm_var": 1.7551044786406986e-06, "learning_rate": 0.006768635248731485, "loss": 2.6525, "step": 10957 }, { "crossentropy": 2.592928647994995, "epoch": 0.3972592807424594, "grad_norm": 0.027552353218197823, "grad_norm_var": 1.6399204221500518e-06, "learning_rate": 0.006768091717149513, "loss": 2.6014, "step": 10958 }, { "crossentropy": 2.6484715938568115, "epoch": 0.39729553364269143, "grad_norm": 0.02997967042028904, "grad_norm_var": 1.491894768921448e-06, "learning_rate": 0.0067675481616867785, "loss": 2.6274, "step": 10959 }, { "crossentropy": 2.65270733833313, "epoch": 0.39733178654292345, "grad_norm": 0.032169491052627563, "grad_norm_var": 2.0733533686897404e-06, "learning_rate": 0.006767004582350624, "loss": 2.5227, "step": 10960 }, { "crossentropy": 2.524693250656128, "epoch": 0.39736803944315546, "grad_norm": 0.029662426561117172, "grad_norm_var": 1.9036867608431765e-06, "learning_rate": 0.006766460979148392, "loss": 2.5699, "step": 10961 }, { "crossentropy": 2.597947597503662, "epoch": 0.3974042923433875, "grad_norm": 0.031056776642799377, "grad_norm_var": 2.01041500147367e-06, "learning_rate": 0.006765917352087424, "loss": 2.5743, "step": 10962 }, { "crossentropy": 2.691718816757202, "epoch": 0.3974405452436195, "grad_norm": 0.0314328633248806, "grad_norm_var": 2.2289252345553586e-06, "learning_rate": 0.0067653737011750615, "loss": 2.61, "step": 10963 }, { "crossentropy": 2.712843656539917, "epoch": 0.3974767981438515, "grad_norm": 0.03007820062339306, "grad_norm_var": 2.2340300361324097e-06, "learning_rate": 0.006764830026418647, "loss": 2.6502, "step": 10964 }, { "crossentropy": 2.586214780807495, "epoch": 0.3975130510440835, "grad_norm": 0.027572661638259888, "grad_norm_var": 2.4313565901143274e-06, "learning_rate": 0.006764286327825528, "loss": 2.5631, "step": 10965 }, { "crossentropy": 2.549309730529785, "epoch": 0.39754930394431554, "grad_norm": 0.02868875488638878, "grad_norm_var": 2.436774512228632e-06, "learning_rate": 0.006763742605403042, "loss": 2.5826, "step": 10966 }, { "crossentropy": 2.6988160610198975, "epoch": 0.39758555684454755, "grad_norm": 0.02955046109855175, "grad_norm_var": 2.293643544144782e-06, "learning_rate": 0.006763198859158536, "loss": 2.611, "step": 10967 }, { "crossentropy": 2.665337085723877, "epoch": 0.39762180974477956, "grad_norm": 0.029034625738859177, "grad_norm_var": 2.2756016032100204e-06, "learning_rate": 0.006762655089099353, "loss": 2.6335, "step": 10968 }, { "crossentropy": 2.543196201324463, "epoch": 0.3976580626450116, "grad_norm": 0.027202222496271133, "grad_norm_var": 2.5141196531506717e-06, "learning_rate": 0.00676211129523284, "loss": 2.5568, "step": 10969 }, { "crossentropy": 2.6030285358428955, "epoch": 0.3976943155452436, "grad_norm": 0.026394955813884735, "grad_norm_var": 2.843104069775758e-06, "learning_rate": 0.006761567477566339, "loss": 2.6173, "step": 10970 }, { "crossentropy": 2.8121731281280518, "epoch": 0.39773056844547566, "grad_norm": 0.02715938165783882, "grad_norm_var": 2.9446983325813092e-06, "learning_rate": 0.006761023636107196, "loss": 2.6748, "step": 10971 }, { "crossentropy": 2.616117000579834, "epoch": 0.3977668213457077, "grad_norm": 0.027282482013106346, "grad_norm_var": 2.9546236014647725e-06, "learning_rate": 0.006760479770862755, "loss": 2.5562, "step": 10972 }, { "crossentropy": 2.7335424423217773, "epoch": 0.3978030742459397, "grad_norm": 0.027405627071857452, "grad_norm_var": 3.0660415395857097e-06, "learning_rate": 0.006759935881840365, "loss": 2.7373, "step": 10973 }, { "crossentropy": 2.6467907428741455, "epoch": 0.3978393271461717, "grad_norm": 0.026616455987095833, "grad_norm_var": 3.2875725633352368e-06, "learning_rate": 0.00675939196904737, "loss": 2.5879, "step": 10974 }, { "crossentropy": 2.4825172424316406, "epoch": 0.3978755800464037, "grad_norm": 0.026874300092458725, "grad_norm_var": 3.414442668926941e-06, "learning_rate": 0.006758848032491116, "loss": 2.5191, "step": 10975 }, { "crossentropy": 2.6313552856445312, "epoch": 0.39791183294663574, "grad_norm": 0.028624817728996277, "grad_norm_var": 2.5298954289226332e-06, "learning_rate": 0.006758304072178949, "loss": 2.6521, "step": 10976 }, { "crossentropy": 2.6483724117279053, "epoch": 0.39794808584686775, "grad_norm": 0.027493488043546677, "grad_norm_var": 2.463114305032571e-06, "learning_rate": 0.006757760088118219, "loss": 2.6977, "step": 10977 }, { "crossentropy": 2.6814494132995605, "epoch": 0.39798433874709976, "grad_norm": 0.026831667870283127, "grad_norm_var": 2.0141248248718597e-06, "learning_rate": 0.006757216080316271, "loss": 2.6264, "step": 10978 }, { "crossentropy": 2.6026411056518555, "epoch": 0.3980205916473318, "grad_norm": 0.02672143280506134, "grad_norm_var": 1.254519798014192e-06, "learning_rate": 0.006756672048780452, "loss": 2.6569, "step": 10979 }, { "crossentropy": 2.6199872493743896, "epoch": 0.3980568445475638, "grad_norm": 0.02701949141919613, "grad_norm_var": 8.778052119841802e-07, "learning_rate": 0.006756127993518112, "loss": 2.601, "step": 10980 }, { "crossentropy": 2.696348190307617, "epoch": 0.3980930974477958, "grad_norm": 0.027105361223220825, "grad_norm_var": 8.887672746491863e-07, "learning_rate": 0.006755583914536598, "loss": 2.6912, "step": 10981 }, { "crossentropy": 2.6059772968292236, "epoch": 0.3981293503480278, "grad_norm": 0.028094641864299774, "grad_norm_var": 8.166879749013066e-07, "learning_rate": 0.006755039811843261, "loss": 2.6965, "step": 10982 }, { "crossentropy": 2.593877077102661, "epoch": 0.39816560324825984, "grad_norm": 0.026476450264453888, "grad_norm_var": 5.51788078274754e-07, "learning_rate": 0.006754495685445446, "loss": 2.5744, "step": 10983 }, { "crossentropy": 2.5990147590637207, "epoch": 0.3982018561484919, "grad_norm": 0.02885463461279869, "grad_norm_var": 5.114900506231288e-07, "learning_rate": 0.006753951535350505, "loss": 2.6243, "step": 10984 }, { "crossentropy": 2.622621536254883, "epoch": 0.3982381090487239, "grad_norm": 0.029897421598434448, "grad_norm_var": 9.447914210395133e-07, "learning_rate": 0.0067534073615657855, "loss": 2.5775, "step": 10985 }, { "crossentropy": 2.617241859436035, "epoch": 0.39827436194895594, "grad_norm": 0.028833819553256035, "grad_norm_var": 9.805240469819288e-07, "learning_rate": 0.006752863164098641, "loss": 2.6001, "step": 10986 }, { "crossentropy": 2.5605554580688477, "epoch": 0.39831061484918795, "grad_norm": 0.02866678684949875, "grad_norm_var": 1.0378578474097451e-06, "learning_rate": 0.006752318942956418, "loss": 2.5894, "step": 10987 }, { "crossentropy": 2.4840776920318604, "epoch": 0.39834686774941996, "grad_norm": 0.027507245540618896, "grad_norm_var": 1.0292541986403238e-06, "learning_rate": 0.006751774698146468, "loss": 2.5808, "step": 10988 }, { "crossentropy": 2.554248332977295, "epoch": 0.398383120649652, "grad_norm": 0.02961568906903267, "grad_norm_var": 1.2510312606792457e-06, "learning_rate": 0.006751230429676143, "loss": 2.7233, "step": 10989 }, { "crossentropy": 2.600403070449829, "epoch": 0.398419373549884, "grad_norm": 0.03348447382450104, "grad_norm_var": 3.090499635222584e-06, "learning_rate": 0.006750686137552794, "loss": 2.5851, "step": 10990 }, { "crossentropy": 2.575598955154419, "epoch": 0.398455626450116, "grad_norm": 0.0303991436958313, "grad_norm_var": 3.2174940560134383e-06, "learning_rate": 0.006750141821783771, "loss": 2.6136, "step": 10991 }, { "crossentropy": 2.7454802989959717, "epoch": 0.398491879350348, "grad_norm": 0.029241465032100677, "grad_norm_var": 3.25344136910029e-06, "learning_rate": 0.0067495974823764265, "loss": 2.706, "step": 10992 }, { "crossentropy": 2.644516944885254, "epoch": 0.39852813225058004, "grad_norm": 0.029358109459280968, "grad_norm_var": 3.2167278443366213e-06, "learning_rate": 0.006749053119338114, "loss": 2.5686, "step": 10993 }, { "crossentropy": 2.5333003997802734, "epoch": 0.39856438515081205, "grad_norm": 0.027058318257331848, "grad_norm_var": 3.165540231597633e-06, "learning_rate": 0.006748508732676184, "loss": 2.5328, "step": 10994 }, { "crossentropy": 2.541834831237793, "epoch": 0.39860063805104406, "grad_norm": 0.02714291773736477, "grad_norm_var": 3.068491839513151e-06, "learning_rate": 0.006747964322397991, "loss": 2.6072, "step": 10995 }, { "crossentropy": 2.5964996814727783, "epoch": 0.3986368909512761, "grad_norm": 0.028587723150849342, "grad_norm_var": 2.8766138608248813e-06, "learning_rate": 0.006747419888510885, "loss": 2.5384, "step": 10996 }, { "crossentropy": 2.5944812297821045, "epoch": 0.3986731438515081, "grad_norm": 0.02749907411634922, "grad_norm_var": 2.798902890484044e-06, "learning_rate": 0.006746875431022223, "loss": 2.6409, "step": 10997 }, { "crossentropy": 2.6296534538269043, "epoch": 0.39870939675174016, "grad_norm": 0.027740228921175003, "grad_norm_var": 2.83984272992746e-06, "learning_rate": 0.006746330949939359, "loss": 2.6473, "step": 10998 }, { "crossentropy": 2.6631839275360107, "epoch": 0.3987456496519722, "grad_norm": 0.027335846796631813, "grad_norm_var": 2.6228821780548948e-06, "learning_rate": 0.006745786445269644, "loss": 2.6242, "step": 10999 }, { "crossentropy": 2.640378475189209, "epoch": 0.3987819025522042, "grad_norm": 0.03017444722354412, "grad_norm_var": 2.7367143678391163e-06, "learning_rate": 0.006745241917020433, "loss": 2.5372, "step": 11000 }, { "crossentropy": 2.613368511199951, "epoch": 0.3988181554524362, "grad_norm": 0.029629485681653023, "grad_norm_var": 2.7058871898882896e-06, "learning_rate": 0.006744697365199082, "loss": 2.6571, "step": 11001 }, { "crossentropy": 2.6663901805877686, "epoch": 0.3988544083526682, "grad_norm": 0.030948901548981667, "grad_norm_var": 2.9690289987060445e-06, "learning_rate": 0.006744152789812945, "loss": 2.6142, "step": 11002 }, { "crossentropy": 2.6925573348999023, "epoch": 0.39889066125290024, "grad_norm": 0.0281447134912014, "grad_norm_var": 3.0109550460020543e-06, "learning_rate": 0.006743608190869379, "loss": 2.6306, "step": 11003 }, { "crossentropy": 2.6860125064849854, "epoch": 0.39892691415313225, "grad_norm": 0.02746063843369484, "grad_norm_var": 3.0203158535764435e-06, "learning_rate": 0.006743063568375736, "loss": 2.6357, "step": 11004 }, { "crossentropy": 2.3305628299713135, "epoch": 0.39896316705336426, "grad_norm": 0.027946017682552338, "grad_norm_var": 3.054998917864417e-06, "learning_rate": 0.006742518922339375, "loss": 2.5443, "step": 11005 }, { "crossentropy": 2.6852335929870605, "epoch": 0.3989994199535963, "grad_norm": 0.025524912402033806, "grad_norm_var": 2.1327931558543505e-06, "learning_rate": 0.006741974252767653, "loss": 2.6657, "step": 11006 }, { "crossentropy": 2.706866979598999, "epoch": 0.3990356728538283, "grad_norm": 0.02703196555376053, "grad_norm_var": 1.9380434446582127e-06, "learning_rate": 0.006741429559667927, "loss": 2.588, "step": 11007 }, { "crossentropy": 2.6095144748687744, "epoch": 0.3990719257540603, "grad_norm": 0.026997635141015053, "grad_norm_var": 1.934117661534414e-06, "learning_rate": 0.0067408848430475475, "loss": 2.5976, "step": 11008 }, { "crossentropy": 2.7333056926727295, "epoch": 0.3991081786542923, "grad_norm": 0.029835294932127, "grad_norm_var": 2.0324485153721254e-06, "learning_rate": 0.006740340102913879, "loss": 2.5818, "step": 11009 }, { "crossentropy": 2.612344741821289, "epoch": 0.39914443155452434, "grad_norm": 0.03028109297156334, "grad_norm_var": 2.248529689241593e-06, "learning_rate": 0.006739795339274275, "loss": 2.5351, "step": 11010 }, { "crossentropy": 2.5752310752868652, "epoch": 0.3991806844547564, "grad_norm": 0.029641365632414818, "grad_norm_var": 2.2640231314994487e-06, "learning_rate": 0.006739250552136097, "loss": 2.6029, "step": 11011 }, { "crossentropy": 2.5884149074554443, "epoch": 0.3992169373549884, "grad_norm": 0.030346650630235672, "grad_norm_var": 2.4958524531469457e-06, "learning_rate": 0.006738705741506699, "loss": 2.5878, "step": 11012 }, { "crossentropy": 2.6489439010620117, "epoch": 0.39925319025522044, "grad_norm": 0.028252819553017616, "grad_norm_var": 2.427387277055953e-06, "learning_rate": 0.006738160907393442, "loss": 2.6153, "step": 11013 }, { "crossentropy": 2.596240282058716, "epoch": 0.39928944315545245, "grad_norm": 0.026327352970838547, "grad_norm_var": 2.71049139471165e-06, "learning_rate": 0.006737616049803683, "loss": 2.4799, "step": 11014 }, { "crossentropy": 2.710916757583618, "epoch": 0.39932569605568446, "grad_norm": 0.02583388239145279, "grad_norm_var": 3.083107797940719e-06, "learning_rate": 0.006737071168744784, "loss": 2.6395, "step": 11015 }, { "crossentropy": 2.6719415187835693, "epoch": 0.3993619489559165, "grad_norm": 0.028762366622686386, "grad_norm_var": 2.873374115944701e-06, "learning_rate": 0.0067365262642241006, "loss": 2.5849, "step": 11016 }, { "crossentropy": 2.5176241397857666, "epoch": 0.3993982018561485, "grad_norm": 0.029561342671513557, "grad_norm_var": 2.861678729586259e-06, "learning_rate": 0.006735981336248998, "loss": 2.5681, "step": 11017 }, { "crossentropy": 2.6269936561584473, "epoch": 0.3994344547563805, "grad_norm": 0.028313422575592995, "grad_norm_var": 2.3671007954237377e-06, "learning_rate": 0.006735436384826831, "loss": 2.6116, "step": 11018 }, { "crossentropy": 2.5856571197509766, "epoch": 0.3994707076566125, "grad_norm": 0.029862480238080025, "grad_norm_var": 2.552293125411886e-06, "learning_rate": 0.006734891409964963, "loss": 2.5663, "step": 11019 }, { "crossentropy": 2.755920171737671, "epoch": 0.39950696055684454, "grad_norm": 0.027234135195612907, "grad_norm_var": 2.5792994835201663e-06, "learning_rate": 0.006734346411670751, "loss": 2.6585, "step": 11020 }, { "crossentropy": 2.6177871227264404, "epoch": 0.39954321345707655, "grad_norm": 0.027748053893446922, "grad_norm_var": 2.589364593835033e-06, "learning_rate": 0.006733801389951562, "loss": 2.5846, "step": 11021 }, { "crossentropy": 2.5918383598327637, "epoch": 0.39957946635730857, "grad_norm": 0.02824879251420498, "grad_norm_var": 2.0734827175592027e-06, "learning_rate": 0.006733256344814752, "loss": 2.5702, "step": 11022 }, { "crossentropy": 2.636526584625244, "epoch": 0.3996157192575406, "grad_norm": 0.02880058065056801, "grad_norm_var": 1.948167621605929e-06, "learning_rate": 0.0067327112762676855, "loss": 2.7064, "step": 11023 }, { "crossentropy": 2.695603609085083, "epoch": 0.3996519721577726, "grad_norm": 0.031636327505111694, "grad_norm_var": 2.3619809762118654e-06, "learning_rate": 0.006732166184317724, "loss": 2.6821, "step": 11024 }, { "crossentropy": 2.593362331390381, "epoch": 0.39968822505800466, "grad_norm": 0.029375514015555382, "grad_norm_var": 2.311288593375551e-06, "learning_rate": 0.006731621068972229, "loss": 2.6513, "step": 11025 }, { "crossentropy": 2.595245838165283, "epoch": 0.3997244779582367, "grad_norm": 0.027591994032263756, "grad_norm_var": 2.2193423517182764e-06, "learning_rate": 0.006731075930238564, "loss": 2.538, "step": 11026 }, { "crossentropy": 2.5826570987701416, "epoch": 0.3997607308584687, "grad_norm": 0.028597449883818626, "grad_norm_var": 2.141958609204406e-06, "learning_rate": 0.0067305307681240914, "loss": 2.5733, "step": 11027 }, { "crossentropy": 2.6357240676879883, "epoch": 0.3997969837587007, "grad_norm": 0.02767355367541313, "grad_norm_var": 1.9413645669753726e-06, "learning_rate": 0.0067299855826361745, "loss": 2.635, "step": 11028 }, { "crossentropy": 2.5068843364715576, "epoch": 0.3998332366589327, "grad_norm": 0.02710529789328575, "grad_norm_var": 2.0406382721828057e-06, "learning_rate": 0.006729440373782178, "loss": 2.523, "step": 11029 }, { "crossentropy": 2.5506386756896973, "epoch": 0.39986948955916474, "grad_norm": 0.029949340969324112, "grad_norm_var": 1.9117561916593182e-06, "learning_rate": 0.006728895141569463, "loss": 2.542, "step": 11030 }, { "crossentropy": 2.566622734069824, "epoch": 0.39990574245939675, "grad_norm": 0.028329648077487946, "grad_norm_var": 1.4077326962599283e-06, "learning_rate": 0.006728349886005396, "loss": 2.6108, "step": 11031 }, { "crossentropy": 2.8554303646087646, "epoch": 0.39994199535962877, "grad_norm": 0.03573082759976387, "grad_norm_var": 4.524436267250631e-06, "learning_rate": 0.006727804607097342, "loss": 2.7596, "step": 11032 }, { "crossentropy": 2.5890324115753174, "epoch": 0.3999782482598608, "grad_norm": 0.029670385643839836, "grad_norm_var": 4.531742640402611e-06, "learning_rate": 0.006727259304852664, "loss": 2.5972, "step": 11033 }, { "crossentropy": 2.5117270946502686, "epoch": 0.4000145011600928, "grad_norm": 0.0327632911503315, "grad_norm_var": 5.2927061793452945e-06, "learning_rate": 0.006726713979278727, "loss": 2.5466, "step": 11034 }, { "crossentropy": 2.503051280975342, "epoch": 0.4000507540603248, "grad_norm": 0.03321334719657898, "grad_norm_var": 6.203402373620631e-06, "learning_rate": 0.006726168630382898, "loss": 2.6201, "step": 11035 }, { "crossentropy": 2.710820198059082, "epoch": 0.4000870069605568, "grad_norm": 0.029633907601237297, "grad_norm_var": 5.804958487539896e-06, "learning_rate": 0.006725623258172543, "loss": 2.677, "step": 11036 }, { "crossentropy": 2.5198283195495605, "epoch": 0.40012325986078884, "grad_norm": 0.027455022558569908, "grad_norm_var": 5.888709736777741e-06, "learning_rate": 0.006725077862655027, "loss": 2.5426, "step": 11037 }, { "crossentropy": 2.538951873779297, "epoch": 0.4001595127610209, "grad_norm": 0.027898937463760376, "grad_norm_var": 5.9657318235664095e-06, "learning_rate": 0.006724532443837715, "loss": 2.6912, "step": 11038 }, { "crossentropy": 2.4516308307647705, "epoch": 0.4001957656612529, "grad_norm": 0.027258435264229774, "grad_norm_var": 6.3022051475998625e-06, "learning_rate": 0.006723987001727977, "loss": 2.4983, "step": 11039 }, { "crossentropy": 2.608112096786499, "epoch": 0.40023201856148494, "grad_norm": 0.027609115466475487, "grad_norm_var": 6.231934685124729e-06, "learning_rate": 0.006723441536333177, "loss": 2.6315, "step": 11040 }, { "crossentropy": 2.675809383392334, "epoch": 0.40026827146171695, "grad_norm": 0.029222723096609116, "grad_norm_var": 6.233200018199029e-06, "learning_rate": 0.006722896047660685, "loss": 2.7027, "step": 11041 }, { "crossentropy": 2.6421005725860596, "epoch": 0.40030452436194897, "grad_norm": 0.03188705816864967, "grad_norm_var": 6.375710531369462e-06, "learning_rate": 0.006722350535717866, "loss": 2.6248, "step": 11042 }, { "crossentropy": 2.592897891998291, "epoch": 0.400340777262181, "grad_norm": 0.029477663338184357, "grad_norm_var": 6.303551052971969e-06, "learning_rate": 0.006721805000512089, "loss": 2.577, "step": 11043 }, { "crossentropy": 2.729558229446411, "epoch": 0.400377030162413, "grad_norm": 0.028064750134944916, "grad_norm_var": 6.2084651368449066e-06, "learning_rate": 0.006721259442050724, "loss": 2.6703, "step": 11044 }, { "crossentropy": 2.662869930267334, "epoch": 0.400413283062645, "grad_norm": 0.02882462553679943, "grad_norm_var": 5.797402151502759e-06, "learning_rate": 0.0067207138603411366, "loss": 2.586, "step": 11045 }, { "crossentropy": 2.7299392223358154, "epoch": 0.400449535962877, "grad_norm": 0.030784690752625465, "grad_norm_var": 5.856332595425524e-06, "learning_rate": 0.0067201682553906985, "loss": 2.6902, "step": 11046 }, { "crossentropy": 2.637216567993164, "epoch": 0.40048578886310904, "grad_norm": 0.026758985593914986, "grad_norm_var": 6.331851034536001e-06, "learning_rate": 0.006719622627206778, "loss": 2.5346, "step": 11047 }, { "crossentropy": 2.6587672233581543, "epoch": 0.40052204176334105, "grad_norm": 0.029686301946640015, "grad_norm_var": 3.807982889926046e-06, "learning_rate": 0.006719076975796743, "loss": 2.6257, "step": 11048 }, { "crossentropy": 2.562947988510132, "epoch": 0.40055829466357307, "grad_norm": 0.027924001216888428, "grad_norm_var": 3.93286325750367e-06, "learning_rate": 0.006718531301167965, "loss": 2.6146, "step": 11049 }, { "crossentropy": 2.6276187896728516, "epoch": 0.4005945475638051, "grad_norm": 0.027572065591812134, "grad_norm_var": 3.2054163774377126e-06, "learning_rate": 0.006717985603327814, "loss": 2.5818, "step": 11050 }, { "crossentropy": 2.589811086654663, "epoch": 0.4006308004640371, "grad_norm": 0.026826653629541397, "grad_norm_var": 2.128102388686256e-06, "learning_rate": 0.006717439882283661, "loss": 2.5717, "step": 11051 }, { "crossentropy": 2.5347981452941895, "epoch": 0.40066705336426917, "grad_norm": 0.02793838270008564, "grad_norm_var": 2.0639388164652435e-06, "learning_rate": 0.006716894138042875, "loss": 2.5876, "step": 11052 }, { "crossentropy": 2.686929225921631, "epoch": 0.4007033062645012, "grad_norm": 0.027071135118603706, "grad_norm_var": 2.1240434565530325e-06, "learning_rate": 0.006716348370612828, "loss": 2.6361, "step": 11053 }, { "crossentropy": 2.6252520084381104, "epoch": 0.4007395591647332, "grad_norm": 0.026889434084296227, "grad_norm_var": 2.258591757428365e-06, "learning_rate": 0.006715802580000893, "loss": 2.598, "step": 11054 }, { "crossentropy": 2.774791955947876, "epoch": 0.4007758120649652, "grad_norm": 0.02935159020125866, "grad_norm_var": 2.2243617775256792e-06, "learning_rate": 0.006715256766214439, "loss": 2.8024, "step": 11055 }, { "crossentropy": 2.5497658252716064, "epoch": 0.4008120649651972, "grad_norm": 0.028487492352724075, "grad_norm_var": 2.1690569547683313e-06, "learning_rate": 0.00671471092926084, "loss": 2.5822, "step": 11056 }, { "crossentropy": 2.5193536281585693, "epoch": 0.40084831786542924, "grad_norm": 0.02933599054813385, "grad_norm_var": 2.1800491097271562e-06, "learning_rate": 0.0067141650691474675, "loss": 2.5377, "step": 11057 }, { "crossentropy": 2.6825952529907227, "epoch": 0.40088457076566125, "grad_norm": 0.030420485883951187, "grad_norm_var": 1.662925729309358e-06, "learning_rate": 0.006713619185881694, "loss": 2.6226, "step": 11058 }, { "crossentropy": 2.62918758392334, "epoch": 0.40092082366589327, "grad_norm": 0.028469093143939972, "grad_norm_var": 1.5901062189460491e-06, "learning_rate": 0.006713073279470893, "loss": 2.5561, "step": 11059 }, { "crossentropy": 2.6394548416137695, "epoch": 0.4009570765661253, "grad_norm": 0.028773868456482887, "grad_norm_var": 1.5898031366608686e-06, "learning_rate": 0.006712527349922439, "loss": 2.5955, "step": 11060 }, { "crossentropy": 2.2811837196350098, "epoch": 0.4009933294663573, "grad_norm": 0.029359133914113045, "grad_norm_var": 1.6347375840927136e-06, "learning_rate": 0.006711981397243703, "loss": 2.4468, "step": 11061 }, { "crossentropy": 2.5814571380615234, "epoch": 0.4010295823665893, "grad_norm": 0.03965698927640915, "grad_norm_var": 9.283249350889204e-06, "learning_rate": 0.00671143542144206, "loss": 2.596, "step": 11062 }, { "crossentropy": 2.6330158710479736, "epoch": 0.4010658352668213, "grad_norm": 0.029737140983343124, "grad_norm_var": 8.934763755713964e-06, "learning_rate": 0.006710889422524886, "loss": 2.5828, "step": 11063 }, { "crossentropy": 2.717496395111084, "epoch": 0.40110208816705334, "grad_norm": 0.030166052281856537, "grad_norm_var": 8.979057503446091e-06, "learning_rate": 0.006710343400499551, "loss": 2.6615, "step": 11064 }, { "crossentropy": 2.507509469985962, "epoch": 0.4011383410672854, "grad_norm": 0.03098459355533123, "grad_norm_var": 9.023919477846526e-06, "learning_rate": 0.006709797355373435, "loss": 2.5954, "step": 11065 }, { "crossentropy": 2.505098819732666, "epoch": 0.4011745939675174, "grad_norm": 0.030848225578665733, "grad_norm_var": 8.878789621430174e-06, "learning_rate": 0.006709251287153912, "loss": 2.574, "step": 11066 }, { "crossentropy": 2.725559949874878, "epoch": 0.40121084686774944, "grad_norm": 0.026911314576864243, "grad_norm_var": 8.847426376260015e-06, "learning_rate": 0.0067087051958483555, "loss": 2.6374, "step": 11067 }, { "crossentropy": 2.639148712158203, "epoch": 0.40124709976798145, "grad_norm": 0.029780369251966476, "grad_norm_var": 8.639099227386925e-06, "learning_rate": 0.006708159081464141, "loss": 2.6624, "step": 11068 }, { "crossentropy": 2.6006813049316406, "epoch": 0.40128335266821347, "grad_norm": 0.03367531672120094, "grad_norm_var": 8.992786982632503e-06, "learning_rate": 0.006707612944008647, "loss": 2.5808, "step": 11069 }, { "crossentropy": 2.570169448852539, "epoch": 0.4013196055684455, "grad_norm": 0.033809639513492584, "grad_norm_var": 8.95157688579061e-06, "learning_rate": 0.006707066783489249, "loss": 2.5872, "step": 11070 }, { "crossentropy": 2.6607813835144043, "epoch": 0.4013558584686775, "grad_norm": 0.029585419222712517, "grad_norm_var": 8.915746221646578e-06, "learning_rate": 0.0067065205999133235, "loss": 2.5985, "step": 11071 }, { "crossentropy": 2.612858533859253, "epoch": 0.4013921113689095, "grad_norm": 0.028716696426272392, "grad_norm_var": 8.853704084401531e-06, "learning_rate": 0.0067059743932882466, "loss": 2.7058, "step": 11072 }, { "crossentropy": 2.6700520515441895, "epoch": 0.4014283642691415, "grad_norm": 0.029157215729355812, "grad_norm_var": 8.886770411504563e-06, "learning_rate": 0.006705428163621398, "loss": 2.7429, "step": 11073 }, { "crossentropy": 2.721600294113159, "epoch": 0.40146461716937354, "grad_norm": 0.028231047093868256, "grad_norm_var": 9.247016497289474e-06, "learning_rate": 0.006704881910920155, "loss": 2.72, "step": 11074 }, { "crossentropy": 2.738070011138916, "epoch": 0.40150087006960555, "grad_norm": 0.02827664092183113, "grad_norm_var": 9.301223900589707e-06, "learning_rate": 0.006704335635191892, "loss": 2.6711, "step": 11075 }, { "crossentropy": 2.6109185218811035, "epoch": 0.40153712296983757, "grad_norm": 0.027444256469607353, "grad_norm_var": 9.714066850266183e-06, "learning_rate": 0.00670378933644399, "loss": 2.5597, "step": 11076 }, { "crossentropy": 2.6610395908355713, "epoch": 0.4015733758700696, "grad_norm": 0.027563607320189476, "grad_norm_var": 1.0163851623868125e-05, "learning_rate": 0.006703243014683827, "loss": 2.6688, "step": 11077 }, { "crossentropy": 2.663780689239502, "epoch": 0.4016096287703016, "grad_norm": 0.027435975149273872, "grad_norm_var": 4.22548797837276e-06, "learning_rate": 0.0067026966699187856, "loss": 2.6081, "step": 11078 }, { "crossentropy": 2.645689010620117, "epoch": 0.40164588167053367, "grad_norm": 0.028846465051174164, "grad_norm_var": 4.249308546192913e-06, "learning_rate": 0.006702150302156239, "loss": 2.6103, "step": 11079 }, { "crossentropy": 2.4419009685516357, "epoch": 0.4016821345707657, "grad_norm": 0.028770774602890015, "grad_norm_var": 4.2404785424034195e-06, "learning_rate": 0.0067016039114035685, "loss": 2.528, "step": 11080 }, { "crossentropy": 2.6279258728027344, "epoch": 0.4017183874709977, "grad_norm": 0.02762005664408207, "grad_norm_var": 4.22696676218298e-06, "learning_rate": 0.006701057497668157, "loss": 2.6862, "step": 11081 }, { "crossentropy": 2.607011079788208, "epoch": 0.4017546403712297, "grad_norm": 0.027749154716730118, "grad_norm_var": 4.132559830545543e-06, "learning_rate": 0.0067005110609573825, "loss": 2.5986, "step": 11082 }, { "crossentropy": 2.5882604122161865, "epoch": 0.4017908932714617, "grad_norm": 0.028742501512169838, "grad_norm_var": 3.838669371134899e-06, "learning_rate": 0.006699964601278625, "loss": 2.5704, "step": 11083 }, { "crossentropy": 2.6168065071105957, "epoch": 0.40182714617169374, "grad_norm": 0.03057773783802986, "grad_norm_var": 3.952035489601426e-06, "learning_rate": 0.006699418118639265, "loss": 2.6488, "step": 11084 }, { "crossentropy": 2.606668710708618, "epoch": 0.40186339907192575, "grad_norm": 0.028857897967100143, "grad_norm_var": 2.487864482911273e-06, "learning_rate": 0.006698871613046685, "loss": 2.6286, "step": 11085 }, { "crossentropy": 2.525743246078491, "epoch": 0.40189965197215777, "grad_norm": 0.027884654700756073, "grad_norm_var": 7.532382820899889e-07, "learning_rate": 0.006698325084508266, "loss": 2.5357, "step": 11086 }, { "crossentropy": 2.47800350189209, "epoch": 0.4019359048723898, "grad_norm": 0.028209567070007324, "grad_norm_var": 6.66241831352475e-07, "learning_rate": 0.0066977785330313904, "loss": 2.4871, "step": 11087 }, { "crossentropy": 2.5550830364227295, "epoch": 0.4019721577726218, "grad_norm": 0.03048224002122879, "grad_norm_var": 9.40261120709919e-07, "learning_rate": 0.006697231958623436, "loss": 2.5843, "step": 11088 }, { "crossentropy": 2.698073148727417, "epoch": 0.4020084106728538, "grad_norm": 0.027376966550946236, "grad_norm_var": 9.80112146576652e-07, "learning_rate": 0.006696685361291791, "loss": 2.6251, "step": 11089 }, { "crossentropy": 2.663632869720459, "epoch": 0.4020446635730858, "grad_norm": 0.028096118941903114, "grad_norm_var": 9.839179651884573e-07, "learning_rate": 0.006696138741043836, "loss": 2.6532, "step": 11090 }, { "crossentropy": 2.570105791091919, "epoch": 0.40208091647331784, "grad_norm": 0.02891596406698227, "grad_norm_var": 1.0014277627251224e-06, "learning_rate": 0.006695592097886952, "loss": 2.6041, "step": 11091 }, { "crossentropy": 2.6363348960876465, "epoch": 0.4021171693735499, "grad_norm": 0.028890730813145638, "grad_norm_var": 9.457713169900058e-07, "learning_rate": 0.006695045431828524, "loss": 2.625, "step": 11092 }, { "crossentropy": 2.5426759719848633, "epoch": 0.4021534222737819, "grad_norm": 0.027786560356616974, "grad_norm_var": 9.21003932243906e-07, "learning_rate": 0.0066944987428759345, "loss": 2.568, "step": 11093 }, { "crossentropy": 2.6186914443969727, "epoch": 0.40218967517401394, "grad_norm": 0.029655111953616142, "grad_norm_var": 9.094600273017057e-07, "learning_rate": 0.006693952031036568, "loss": 2.6168, "step": 11094 }, { "crossentropy": 2.743321180343628, "epoch": 0.40222592807424595, "grad_norm": 0.03190812095999718, "grad_norm_var": 1.5739250176981708e-06, "learning_rate": 0.006693405296317809, "loss": 2.7304, "step": 11095 }, { "crossentropy": 2.603266716003418, "epoch": 0.40226218097447797, "grad_norm": 0.031253911554813385, "grad_norm_var": 1.9346371221300173e-06, "learning_rate": 0.006692858538727042, "loss": 2.6953, "step": 11096 }, { "crossentropy": 2.763105869293213, "epoch": 0.40229843387471, "grad_norm": 0.02930464781820774, "grad_norm_var": 1.8019481145522635e-06, "learning_rate": 0.00669231175827165, "loss": 2.7705, "step": 11097 }, { "crossentropy": 2.6638569831848145, "epoch": 0.402334686774942, "grad_norm": 0.028074584901332855, "grad_norm_var": 1.7497038646137215e-06, "learning_rate": 0.006691764954959021, "loss": 2.6077, "step": 11098 }, { "crossentropy": 2.5801100730895996, "epoch": 0.402370939675174, "grad_norm": 0.02776416577398777, "grad_norm_var": 1.8595612732628826e-06, "learning_rate": 0.006691218128796539, "loss": 2.6696, "step": 11099 }, { "crossentropy": 2.643576145172119, "epoch": 0.402407192575406, "grad_norm": 0.028420018032193184, "grad_norm_var": 1.715319042459963e-06, "learning_rate": 0.006690671279791589, "loss": 2.5869, "step": 11100 }, { "crossentropy": 2.5655858516693115, "epoch": 0.40244344547563804, "grad_norm": 0.028825514018535614, "grad_norm_var": 1.7156962543693565e-06, "learning_rate": 0.0066901244079515575, "loss": 2.603, "step": 11101 }, { "crossentropy": 2.6367459297180176, "epoch": 0.40247969837587005, "grad_norm": 0.02846876159310341, "grad_norm_var": 1.655759096058163e-06, "learning_rate": 0.006689577513283831, "loss": 2.6053, "step": 11102 }, { "crossentropy": 2.4184188842773438, "epoch": 0.40251595127610207, "grad_norm": 0.026866445317864418, "grad_norm_var": 1.9037141956232752e-06, "learning_rate": 0.006689030595795798, "loss": 2.5083, "step": 11103 }, { "crossentropy": 2.5889453887939453, "epoch": 0.4025522041763341, "grad_norm": 0.026219379156827927, "grad_norm_var": 2.1291299732763744e-06, "learning_rate": 0.006688483655494841, "loss": 2.5858, "step": 11104 }, { "crossentropy": 2.600839614868164, "epoch": 0.40258845707656615, "grad_norm": 0.028368568047881126, "grad_norm_var": 2.0270072011815386e-06, "learning_rate": 0.006687936692388351, "loss": 2.5807, "step": 11105 }, { "crossentropy": 2.4595348834991455, "epoch": 0.40262470997679817, "grad_norm": 0.026116400957107544, "grad_norm_var": 2.4250721504709963e-06, "learning_rate": 0.006687389706483714, "loss": 2.5418, "step": 11106 }, { "crossentropy": 2.6481027603149414, "epoch": 0.4026609628770302, "grad_norm": 0.028299914672970772, "grad_norm_var": 2.418931322396324e-06, "learning_rate": 0.006686842697788319, "loss": 2.5843, "step": 11107 }, { "crossentropy": 2.501063346862793, "epoch": 0.4026972157772622, "grad_norm": 0.0284341461956501, "grad_norm_var": 2.409021638581513e-06, "learning_rate": 0.0066862956663095545, "loss": 2.5335, "step": 11108 }, { "crossentropy": 2.6966586112976074, "epoch": 0.4027334686774942, "grad_norm": 0.028235066682100296, "grad_norm_var": 2.37980336449139e-06, "learning_rate": 0.006685748612054806, "loss": 2.6566, "step": 11109 }, { "crossentropy": 2.6124520301818848, "epoch": 0.4027697215777262, "grad_norm": 0.03006158396601677, "grad_norm_var": 2.4520048990187004e-06, "learning_rate": 0.0066852015350314644, "loss": 2.6723, "step": 11110 }, { "crossentropy": 2.6413676738739014, "epoch": 0.40280597447795824, "grad_norm": 0.027417484670877457, "grad_norm_var": 1.6949983735333832e-06, "learning_rate": 0.00668465443524692, "loss": 2.5777, "step": 11111 }, { "crossentropy": 2.738206386566162, "epoch": 0.40284222737819025, "grad_norm": 0.02816552110016346, "grad_norm_var": 1.0575272229304861e-06, "learning_rate": 0.00668410731270856, "loss": 2.6338, "step": 11112 }, { "crossentropy": 2.735987424850464, "epoch": 0.40287848027842227, "grad_norm": 0.0323789119720459, "grad_norm_var": 2.1562985225961434e-06, "learning_rate": 0.006683560167423774, "loss": 2.7132, "step": 11113 }, { "crossentropy": 2.5726559162139893, "epoch": 0.4029147331786543, "grad_norm": 0.03177974373102188, "grad_norm_var": 2.9240562965246e-06, "learning_rate": 0.0066830129993999555, "loss": 2.5313, "step": 11114 }, { "crossentropy": 2.6067440509796143, "epoch": 0.4029509860788863, "grad_norm": 0.0297089833766222, "grad_norm_var": 2.9725334442967653e-06, "learning_rate": 0.006682465808644491, "loss": 2.5152, "step": 11115 }, { "crossentropy": 2.6101527214050293, "epoch": 0.4029872389791183, "grad_norm": 0.028829528018832207, "grad_norm_var": 2.9726193387567708e-06, "learning_rate": 0.006681918595164774, "loss": 2.5825, "step": 11116 }, { "crossentropy": 2.5399672985076904, "epoch": 0.4030234918793503, "grad_norm": 0.027897227555513382, "grad_norm_var": 3.003019773984104e-06, "learning_rate": 0.006681371358968191, "loss": 2.5636, "step": 11117 }, { "crossentropy": 2.4747118949890137, "epoch": 0.40305974477958234, "grad_norm": 0.027318015694618225, "grad_norm_var": 3.102540841643369e-06, "learning_rate": 0.006680824100062138, "loss": 2.5094, "step": 11118 }, { "crossentropy": 2.521254777908325, "epoch": 0.4030959976798144, "grad_norm": 0.028817763552069664, "grad_norm_var": 2.913931148620467e-06, "learning_rate": 0.006680276818454005, "loss": 2.5684, "step": 11119 }, { "crossentropy": 2.756296157836914, "epoch": 0.4031322505800464, "grad_norm": 0.02859843336045742, "grad_norm_var": 2.503638145960477e-06, "learning_rate": 0.006679729514151182, "loss": 2.7002, "step": 11120 }, { "crossentropy": 2.7122483253479004, "epoch": 0.40316850348027844, "grad_norm": 0.02869626134634018, "grad_norm_var": 2.4925170408552147e-06, "learning_rate": 0.006679182187161065, "loss": 2.6234, "step": 11121 }, { "crossentropy": 2.6746881008148193, "epoch": 0.40320475638051045, "grad_norm": 0.028293093666434288, "grad_norm_var": 2.0106085898446833e-06, "learning_rate": 0.006678634837491044, "loss": 2.6649, "step": 11122 }, { "crossentropy": 2.7427978515625, "epoch": 0.40324100928074247, "grad_norm": 0.028271013870835304, "grad_norm_var": 2.013101236055997e-06, "learning_rate": 0.006678087465148511, "loss": 2.7035, "step": 11123 }, { "crossentropy": 2.559497117996216, "epoch": 0.4032772621809745, "grad_norm": 0.02732732519507408, "grad_norm_var": 2.1630533187750084e-06, "learning_rate": 0.006677540070140861, "loss": 2.609, "step": 11124 }, { "crossentropy": 2.534745931625366, "epoch": 0.4033135150812065, "grad_norm": 0.026961199939250946, "grad_norm_var": 2.3710002963177873e-06, "learning_rate": 0.0066769926524754865, "loss": 2.5644, "step": 11125 }, { "crossentropy": 2.5780210494995117, "epoch": 0.4033497679814385, "grad_norm": 0.027648262679576874, "grad_norm_var": 2.323471084096163e-06, "learning_rate": 0.006676445212159781, "loss": 2.5615, "step": 11126 }, { "crossentropy": 2.4897937774658203, "epoch": 0.4033860208816705, "grad_norm": 0.029363401234149933, "grad_norm_var": 2.2450726878123766e-06, "learning_rate": 0.006675897749201139, "loss": 2.5164, "step": 11127 }, { "crossentropy": 2.613435983657837, "epoch": 0.40342227378190254, "grad_norm": 0.02800777368247509, "grad_norm_var": 2.2589931800972984e-06, "learning_rate": 0.006675350263606954, "loss": 2.6309, "step": 11128 }, { "crossentropy": 2.666261672973633, "epoch": 0.40345852668213456, "grad_norm": 0.029457999393343925, "grad_norm_var": 1.3764197687578713e-06, "learning_rate": 0.0066748027553846226, "loss": 2.639, "step": 11129 }, { "crossentropy": 2.6677794456481934, "epoch": 0.40349477958236657, "grad_norm": 0.028156552463769913, "grad_norm_var": 6.419403573135544e-07, "learning_rate": 0.006674255224541537, "loss": 2.5839, "step": 11130 }, { "crossentropy": 2.5083529949188232, "epoch": 0.4035310324825986, "grad_norm": 0.027834804728627205, "grad_norm_var": 5.18017179687522e-07, "learning_rate": 0.006673707671085094, "loss": 2.5078, "step": 11131 }, { "crossentropy": 2.774603843688965, "epoch": 0.40356728538283065, "grad_norm": 0.029132632538676262, "grad_norm_var": 5.484970565372907e-07, "learning_rate": 0.00667316009502269, "loss": 2.6777, "step": 11132 }, { "crossentropy": 2.5610084533691406, "epoch": 0.40360353828306267, "grad_norm": 0.027458127588033676, "grad_norm_var": 5.804026813877386e-07, "learning_rate": 0.006672612496361719, "loss": 2.508, "step": 11133 }, { "crossentropy": 2.6032159328460693, "epoch": 0.4036397911832947, "grad_norm": 0.026608740910887718, "grad_norm_var": 6.960970466664361e-07, "learning_rate": 0.006672064875109578, "loss": 2.5586, "step": 11134 }, { "crossentropy": 2.5918657779693604, "epoch": 0.4036760440835267, "grad_norm": 0.028389615938067436, "grad_norm_var": 6.702664601997925e-07, "learning_rate": 0.0066715172312736635, "loss": 2.6556, "step": 11135 }, { "crossentropy": 2.774489402770996, "epoch": 0.4037122969837587, "grad_norm": 0.029666906222701073, "grad_norm_var": 8.072379292648242e-07, "learning_rate": 0.006670969564861372, "loss": 2.7644, "step": 11136 }, { "crossentropy": 2.6030960083007812, "epoch": 0.4037485498839907, "grad_norm": 0.02660994604229927, "grad_norm_var": 9.425162589358904e-07, "learning_rate": 0.006670421875880101, "loss": 2.6262, "step": 11137 }, { "crossentropy": 2.523259401321411, "epoch": 0.40378480278422274, "grad_norm": 0.02797907218337059, "grad_norm_var": 9.3951489015433e-07, "learning_rate": 0.006669874164337249, "loss": 2.5769, "step": 11138 }, { "crossentropy": 2.6268537044525146, "epoch": 0.40382105568445475, "grad_norm": 0.026964573189616203, "grad_norm_var": 1.0084890550457933e-06, "learning_rate": 0.00666932643024021, "loss": 2.5291, "step": 11139 }, { "crossentropy": 2.5534119606018066, "epoch": 0.40385730858468677, "grad_norm": 0.027878519147634506, "grad_norm_var": 9.800301021015608e-07, "learning_rate": 0.0066687786735963855, "loss": 2.5542, "step": 11140 }, { "crossentropy": 2.7894909381866455, "epoch": 0.4038935614849188, "grad_norm": 0.030156925320625305, "grad_norm_var": 1.172546231697121e-06, "learning_rate": 0.006668230894413172, "loss": 2.6208, "step": 11141 }, { "crossentropy": 2.5500805377960205, "epoch": 0.4039298143851508, "grad_norm": 0.026807906106114388, "grad_norm_var": 1.2793018022185978e-06, "learning_rate": 0.006667683092697969, "loss": 2.5053, "step": 11142 }, { "crossentropy": 2.6072750091552734, "epoch": 0.4039660672853828, "grad_norm": 0.028628000989556313, "grad_norm_var": 1.1945749843190907e-06, "learning_rate": 0.006667135268458174, "loss": 2.5919, "step": 11143 }, { "crossentropy": 2.5531156063079834, "epoch": 0.4040023201856148, "grad_norm": 0.03240719810128212, "grad_norm_var": 2.345096532953732e-06, "learning_rate": 0.0066665874217011895, "loss": 2.6299, "step": 11144 }, { "crossentropy": 2.575453758239746, "epoch": 0.40403857308584684, "grad_norm": 0.028759019449353218, "grad_norm_var": 2.275500731763524e-06, "learning_rate": 0.006666039552434411, "loss": 2.5443, "step": 11145 }, { "crossentropy": 2.6961658000946045, "epoch": 0.4040748259860789, "grad_norm": 0.027455924078822136, "grad_norm_var": 2.3233093613262794e-06, "learning_rate": 0.0066654916606652395, "loss": 2.5822, "step": 11146 }, { "crossentropy": 2.634061336517334, "epoch": 0.4041110788863109, "grad_norm": 0.02709117718040943, "grad_norm_var": 2.403610251619041e-06, "learning_rate": 0.006664943746401077, "loss": 2.6226, "step": 11147 }, { "crossentropy": 2.5220537185668945, "epoch": 0.40414733178654294, "grad_norm": 0.02989211678504944, "grad_norm_var": 2.529076843294402e-06, "learning_rate": 0.006664395809649322, "loss": 2.5683, "step": 11148 }, { "crossentropy": 2.5406737327575684, "epoch": 0.40418358468677495, "grad_norm": 0.02896779216825962, "grad_norm_var": 2.5026420695734947e-06, "learning_rate": 0.006663847850417375, "loss": 2.5261, "step": 11149 }, { "crossentropy": 2.5621626377105713, "epoch": 0.40421983758700697, "grad_norm": 0.028475722298026085, "grad_norm_var": 2.276718358857251e-06, "learning_rate": 0.00666329986871264, "loss": 2.579, "step": 11150 }, { "crossentropy": 2.5645456314086914, "epoch": 0.404256090487239, "grad_norm": 0.02860129438340664, "grad_norm_var": 2.2761733365130967e-06, "learning_rate": 0.006662751864542515, "loss": 2.6683, "step": 11151 }, { "crossentropy": 2.6360132694244385, "epoch": 0.404292343387471, "grad_norm": 0.027501435950398445, "grad_norm_var": 2.2385053623498348e-06, "learning_rate": 0.006662203837914402, "loss": 2.6229, "step": 11152 }, { "crossentropy": 2.716526508331299, "epoch": 0.404328596287703, "grad_norm": 0.028941966593265533, "grad_norm_var": 2.026148994772649e-06, "learning_rate": 0.006661655788835703, "loss": 2.7023, "step": 11153 }, { "crossentropy": 2.7852230072021484, "epoch": 0.404364849187935, "grad_norm": 0.027566872537136078, "grad_norm_var": 2.0671456376588603e-06, "learning_rate": 0.006661107717313823, "loss": 2.6549, "step": 11154 }, { "crossentropy": 2.403738021850586, "epoch": 0.40440110208816704, "grad_norm": 0.028822442516684532, "grad_norm_var": 1.9010327160446164e-06, "learning_rate": 0.006660559623356162, "loss": 2.5373, "step": 11155 }, { "crossentropy": 2.531991958618164, "epoch": 0.40443735498839906, "grad_norm": 0.026694756001234055, "grad_norm_var": 2.1059838526422844e-06, "learning_rate": 0.0066600115069701225, "loss": 2.6397, "step": 11156 }, { "crossentropy": 2.591017723083496, "epoch": 0.40447360788863107, "grad_norm": 0.027531063184142113, "grad_norm_var": 1.973677849110876e-06, "learning_rate": 0.006659463368163108, "loss": 2.6267, "step": 11157 }, { "crossentropy": 2.5659565925598145, "epoch": 0.4045098607888631, "grad_norm": 0.029037941247224808, "grad_norm_var": 1.8158484673587322e-06, "learning_rate": 0.006658915206942524, "loss": 2.6052, "step": 11158 }, { "crossentropy": 2.8081412315368652, "epoch": 0.40454611368909515, "grad_norm": 0.029257789254188538, "grad_norm_var": 1.849419878476134e-06, "learning_rate": 0.006658367023315772, "loss": 2.7872, "step": 11159 }, { "crossentropy": 2.5368919372558594, "epoch": 0.40458236658932717, "grad_norm": 0.029084160923957825, "grad_norm_var": 8.362291921305455e-07, "learning_rate": 0.0066578188172902565, "loss": 2.5284, "step": 11160 }, { "crossentropy": 2.7941813468933105, "epoch": 0.4046186194895592, "grad_norm": 0.030020801350474358, "grad_norm_var": 1.0036907925040865e-06, "learning_rate": 0.0066572705888733805, "loss": 2.8058, "step": 11161 }, { "crossentropy": 2.650405168533325, "epoch": 0.4046548723897912, "grad_norm": 0.02766074799001217, "grad_norm_var": 9.796030023863453e-07, "learning_rate": 0.006656722338072554, "loss": 2.6561, "step": 11162 }, { "crossentropy": 2.7030656337738037, "epoch": 0.4046911252900232, "grad_norm": 0.028054887428879738, "grad_norm_var": 8.634645246735946e-07, "learning_rate": 0.006656174064895175, "loss": 2.6428, "step": 11163 }, { "crossentropy": 2.605635166168213, "epoch": 0.4047273781902552, "grad_norm": 0.04767921194434166, "grad_norm_var": 2.3922253240837147e-05, "learning_rate": 0.0066556257693486534, "loss": 2.6434, "step": 11164 }, { "crossentropy": 2.6486995220184326, "epoch": 0.40476363109048724, "grad_norm": 0.025823554024100304, "grad_norm_var": 2.481301583268148e-05, "learning_rate": 0.006655077451440393, "loss": 2.5729, "step": 11165 }, { "crossentropy": 2.7097439765930176, "epoch": 0.40479988399071926, "grad_norm": 0.029162868857383728, "grad_norm_var": 2.475581379246013e-05, "learning_rate": 0.006654529111177798, "loss": 2.5944, "step": 11166 }, { "crossentropy": 2.544430732727051, "epoch": 0.40483613689095127, "grad_norm": 0.028803542256355286, "grad_norm_var": 2.4735076264663587e-05, "learning_rate": 0.006653980748568281, "loss": 2.6008, "step": 11167 }, { "crossentropy": 2.585871696472168, "epoch": 0.4048723897911833, "grad_norm": 0.02982638217508793, "grad_norm_var": 2.4460268198882407e-05, "learning_rate": 0.006653432363619239, "loss": 2.6383, "step": 11168 }, { "crossentropy": 2.6049773693084717, "epoch": 0.4049086426914153, "grad_norm": 0.030538512393832207, "grad_norm_var": 2.4474591485959946e-05, "learning_rate": 0.006652883956338086, "loss": 2.5128, "step": 11169 }, { "crossentropy": 2.594261884689331, "epoch": 0.4049448955916473, "grad_norm": 0.02819433994591236, "grad_norm_var": 2.4318824961005883e-05, "learning_rate": 0.006652335526732227, "loss": 2.6059, "step": 11170 }, { "crossentropy": 2.6504578590393066, "epoch": 0.40498114849187933, "grad_norm": 0.029918035492300987, "grad_norm_var": 2.425658638825199e-05, "learning_rate": 0.0066517870748090705, "loss": 2.6776, "step": 11171 }, { "crossentropy": 2.6235742568969727, "epoch": 0.40501740139211134, "grad_norm": 0.027336260303854942, "grad_norm_var": 2.4014091260321966e-05, "learning_rate": 0.00665123860057602, "loss": 2.6566, "step": 11172 }, { "crossentropy": 2.5599629878997803, "epoch": 0.4050536542923434, "grad_norm": 0.02651136741042137, "grad_norm_var": 2.4397163822699798e-05, "learning_rate": 0.006650690104040486, "loss": 2.547, "step": 11173 }, { "crossentropy": 2.601809501647949, "epoch": 0.4050899071925754, "grad_norm": 0.02627682499587536, "grad_norm_var": 2.5156740344134862e-05, "learning_rate": 0.00665014158520988, "loss": 2.5602, "step": 11174 }, { "crossentropy": 2.4963293075561523, "epoch": 0.40512616009280744, "grad_norm": 0.028280267491936684, "grad_norm_var": 2.5265539101946298e-05, "learning_rate": 0.006649593044091608, "loss": 2.5952, "step": 11175 }, { "crossentropy": 2.469240188598633, "epoch": 0.40516241299303946, "grad_norm": 0.02992122247815132, "grad_norm_var": 2.525474638919189e-05, "learning_rate": 0.006649044480693076, "loss": 2.5394, "step": 11176 }, { "crossentropy": 2.647198438644409, "epoch": 0.40519866589327147, "grad_norm": 0.030879421159625053, "grad_norm_var": 2.5346072366391272e-05, "learning_rate": 0.0066484958950216964, "loss": 2.6868, "step": 11177 }, { "crossentropy": 2.624600410461426, "epoch": 0.4052349187935035, "grad_norm": 0.031134316697716713, "grad_norm_var": 2.5165339309793584e-05, "learning_rate": 0.006647947287084879, "loss": 2.6501, "step": 11178 }, { "crossentropy": 2.641399621963501, "epoch": 0.4052711716937355, "grad_norm": 0.03132873401045799, "grad_norm_var": 2.5031412607447122e-05, "learning_rate": 0.006647398656890033, "loss": 2.5609, "step": 11179 }, { "crossentropy": 2.5734751224517822, "epoch": 0.4053074245939675, "grad_norm": 0.029637474566698074, "grad_norm_var": 3.0897283032399815e-06, "learning_rate": 0.006646850004444566, "loss": 2.5905, "step": 11180 }, { "crossentropy": 2.673164129257202, "epoch": 0.40534367749419953, "grad_norm": 0.03027292899787426, "grad_norm_var": 2.458438094687411e-06, "learning_rate": 0.006646301329755893, "loss": 2.5985, "step": 11181 }, { "crossentropy": 2.684342384338379, "epoch": 0.40537993039443154, "grad_norm": 0.028815509751439095, "grad_norm_var": 2.4700798085210226e-06, "learning_rate": 0.006645752632831421, "loss": 2.603, "step": 11182 }, { "crossentropy": 2.5800061225891113, "epoch": 0.40541618329466356, "grad_norm": 0.02780628763139248, "grad_norm_var": 2.588901647583517e-06, "learning_rate": 0.006645203913678562, "loss": 2.6937, "step": 11183 }, { "crossentropy": 2.7733423709869385, "epoch": 0.40545243619489557, "grad_norm": 0.03067374974489212, "grad_norm_var": 2.708235617950823e-06, "learning_rate": 0.006644655172304728, "loss": 2.602, "step": 11184 }, { "crossentropy": 2.6582093238830566, "epoch": 0.4054886890951276, "grad_norm": 0.028404034674167633, "grad_norm_var": 2.6178340416196354e-06, "learning_rate": 0.006644106408717329, "loss": 2.6253, "step": 11185 }, { "crossentropy": 2.668884754180908, "epoch": 0.40552494199535966, "grad_norm": 0.028122615069150925, "grad_norm_var": 2.6266916292432317e-06, "learning_rate": 0.006643557622923779, "loss": 2.6006, "step": 11186 }, { "crossentropy": 2.536512851715088, "epoch": 0.40556119489559167, "grad_norm": 0.03152339905500412, "grad_norm_var": 2.9666239404905576e-06, "learning_rate": 0.006643008814931488, "loss": 2.5816, "step": 11187 }, { "crossentropy": 2.530651807785034, "epoch": 0.4055974477958237, "grad_norm": 0.030749136582016945, "grad_norm_var": 2.8543494946829556e-06, "learning_rate": 0.006642459984747871, "loss": 2.614, "step": 11188 }, { "crossentropy": 2.594863176345825, "epoch": 0.4056337006960557, "grad_norm": 0.02807345986366272, "grad_norm_var": 2.406032626953852e-06, "learning_rate": 0.006641911132380338, "loss": 2.616, "step": 11189 }, { "crossentropy": 2.485682725906372, "epoch": 0.4056699535962877, "grad_norm": 0.030245047062635422, "grad_norm_var": 1.6881641475638506e-06, "learning_rate": 0.006641362257836306, "loss": 2.5575, "step": 11190 }, { "crossentropy": 2.764784097671509, "epoch": 0.40570620649651973, "grad_norm": 0.02773864194750786, "grad_norm_var": 1.812040754588934e-06, "learning_rate": 0.006640813361123183, "loss": 2.6989, "step": 11191 }, { "crossentropy": 2.528923988342285, "epoch": 0.40574245939675174, "grad_norm": 0.027034128084778786, "grad_norm_var": 2.250870142579209e-06, "learning_rate": 0.006640264442248387, "loss": 2.63, "step": 11192 }, { "crossentropy": 2.4298954010009766, "epoch": 0.40577871229698376, "grad_norm": 0.027186749503016472, "grad_norm_var": 2.4374480330354668e-06, "learning_rate": 0.00663971550121933, "loss": 2.5954, "step": 11193 }, { "crossentropy": 2.573254108428955, "epoch": 0.40581496519721577, "grad_norm": 0.030264277011156082, "grad_norm_var": 2.271578183968513e-06, "learning_rate": 0.006639166538043426, "loss": 2.5575, "step": 11194 }, { "crossentropy": 2.763334035873413, "epoch": 0.4058512180974478, "grad_norm": 0.03280043229460716, "grad_norm_var": 2.816367870505515e-06, "learning_rate": 0.006638617552728092, "loss": 2.6656, "step": 11195 }, { "crossentropy": 2.583143949508667, "epoch": 0.4058874709976798, "grad_norm": 0.028803948312997818, "grad_norm_var": 2.8260904379936538e-06, "learning_rate": 0.00663806854528074, "loss": 2.638, "step": 11196 }, { "crossentropy": 2.7097887992858887, "epoch": 0.4059237238979118, "grad_norm": 0.027592165395617485, "grad_norm_var": 2.9211058288895754e-06, "learning_rate": 0.006637519515708788, "loss": 2.644, "step": 11197 }, { "crossentropy": 2.6601548194885254, "epoch": 0.40595997679814383, "grad_norm": 0.029320482164621353, "grad_norm_var": 2.9169055825556025e-06, "learning_rate": 0.006636970464019648, "loss": 2.6079, "step": 11198 }, { "crossentropy": 2.793926477432251, "epoch": 0.40599622969837584, "grad_norm": 0.028036758303642273, "grad_norm_var": 2.879051884252931e-06, "learning_rate": 0.006636421390220739, "loss": 2.686, "step": 11199 }, { "crossentropy": 2.588611364364624, "epoch": 0.4060324825986079, "grad_norm": 0.030769597738981247, "grad_norm_var": 2.898964169882843e-06, "learning_rate": 0.006635872294319476, "loss": 2.5678, "step": 11200 }, { "crossentropy": 2.5856261253356934, "epoch": 0.40606873549883993, "grad_norm": 0.03136548772454262, "grad_norm_var": 3.1460130156761465e-06, "learning_rate": 0.006635323176323274, "loss": 2.5226, "step": 11201 }, { "crossentropy": 2.7429211139678955, "epoch": 0.40610498839907194, "grad_norm": 0.03125595673918724, "grad_norm_var": 3.2461644728845953e-06, "learning_rate": 0.006634774036239552, "loss": 2.62, "step": 11202 }, { "crossentropy": 2.677577495574951, "epoch": 0.40614124129930396, "grad_norm": 0.026504773646593094, "grad_norm_var": 3.4981401806050137e-06, "learning_rate": 0.006634224874075725, "loss": 2.6572, "step": 11203 }, { "crossentropy": 2.699826240539551, "epoch": 0.40617749419953597, "grad_norm": 0.027591532096266747, "grad_norm_var": 3.483322877010214e-06, "learning_rate": 0.00663367568983921, "loss": 2.5999, "step": 11204 }, { "crossentropy": 2.6291255950927734, "epoch": 0.406213747099768, "grad_norm": 0.033477939665317535, "grad_norm_var": 4.614909184689984e-06, "learning_rate": 0.006633126483537428, "loss": 2.5703, "step": 11205 }, { "crossentropy": 2.6918647289276123, "epoch": 0.40625, "grad_norm": 0.03132029250264168, "grad_norm_var": 4.812012212240755e-06, "learning_rate": 0.006632577255177793, "loss": 2.5642, "step": 11206 }, { "crossentropy": 2.492875337600708, "epoch": 0.406286252900232, "grad_norm": 0.03094620630145073, "grad_norm_var": 4.726793660747517e-06, "learning_rate": 0.006632028004767725, "loss": 2.5206, "step": 11207 }, { "crossentropy": 2.7027482986450195, "epoch": 0.40632250580046403, "grad_norm": 0.037454843521118164, "grad_norm_var": 7.890408903114113e-06, "learning_rate": 0.006631478732314642, "loss": 2.6672, "step": 11208 }, { "crossentropy": 2.4339890480041504, "epoch": 0.40635875870069604, "grad_norm": 0.03916167840361595, "grad_norm_var": 1.1892880786310474e-05, "learning_rate": 0.006630929437825962, "loss": 2.5346, "step": 11209 }, { "crossentropy": 2.654697895050049, "epoch": 0.40639501160092806, "grad_norm": 0.027965161949396133, "grad_norm_var": 1.2461553535281684e-05, "learning_rate": 0.006630380121309107, "loss": 2.6205, "step": 11210 }, { "crossentropy": 2.609199285507202, "epoch": 0.40643126450116007, "grad_norm": 0.027087708935141563, "grad_norm_var": 1.3052142742740154e-05, "learning_rate": 0.006629830782771492, "loss": 2.5984, "step": 11211 }, { "crossentropy": 2.6758811473846436, "epoch": 0.4064675174013921, "grad_norm": 0.027169257402420044, "grad_norm_var": 1.3597741842954674e-05, "learning_rate": 0.00662928142222054, "loss": 2.6158, "step": 11212 }, { "crossentropy": 2.6959128379821777, "epoch": 0.40650377030162416, "grad_norm": 0.02708529680967331, "grad_norm_var": 1.3806177654622499e-05, "learning_rate": 0.00662873203966367, "loss": 2.6206, "step": 11213 }, { "crossentropy": 2.59208345413208, "epoch": 0.40654002320185617, "grad_norm": 0.027051011100411415, "grad_norm_var": 1.445687836039929e-05, "learning_rate": 0.006628182635108302, "loss": 2.631, "step": 11214 }, { "crossentropy": 2.6659581661224365, "epoch": 0.4065762761020882, "grad_norm": 0.027569375932216644, "grad_norm_var": 1.4609403681105115e-05, "learning_rate": 0.006627633208561856, "loss": 2.6183, "step": 11215 }, { "crossentropy": 2.6160268783569336, "epoch": 0.4066125290023202, "grad_norm": 0.027871331200003624, "grad_norm_var": 1.4928202269261458e-05, "learning_rate": 0.006627083760031754, "loss": 2.6876, "step": 11216 }, { "crossentropy": 2.5435333251953125, "epoch": 0.4066487819025522, "grad_norm": 0.03071061335504055, "grad_norm_var": 1.4840567025385535e-05, "learning_rate": 0.006626534289525417, "loss": 2.6541, "step": 11217 }, { "crossentropy": 2.462456226348877, "epoch": 0.40668503480278423, "grad_norm": 0.030095256865024567, "grad_norm_var": 1.4732553463434384e-05, "learning_rate": 0.006625984797050267, "loss": 2.4821, "step": 11218 }, { "crossentropy": 2.6703503131866455, "epoch": 0.40672128770301624, "grad_norm": 0.028368273749947548, "grad_norm_var": 1.4095707729537033e-05, "learning_rate": 0.006625435282613723, "loss": 2.7011, "step": 11219 }, { "crossentropy": 2.8001129627227783, "epoch": 0.40675754060324826, "grad_norm": 0.03070516139268875, "grad_norm_var": 1.3677727736190641e-05, "learning_rate": 0.00662488574622321, "loss": 2.7359, "step": 11220 }, { "crossentropy": 2.524895429611206, "epoch": 0.40679379350348027, "grad_norm": 0.0280606746673584, "grad_norm_var": 1.3182132043419364e-05, "learning_rate": 0.006624336187886148, "loss": 2.5599, "step": 11221 }, { "crossentropy": 2.6223349571228027, "epoch": 0.4068300464037123, "grad_norm": 0.02611399069428444, "grad_norm_var": 1.3899939087312961e-05, "learning_rate": 0.006623786607609962, "loss": 2.6232, "step": 11222 }, { "crossentropy": 2.7271337509155273, "epoch": 0.4068662993039443, "grad_norm": 0.028306009247899055, "grad_norm_var": 1.3857652357084487e-05, "learning_rate": 0.006623237005402073, "loss": 2.7, "step": 11223 }, { "crossentropy": 2.6911582946777344, "epoch": 0.4069025522041763, "grad_norm": 0.027954375371336937, "grad_norm_var": 9.32526876802942e-06, "learning_rate": 0.006622687381269905, "loss": 2.6273, "step": 11224 }, { "crossentropy": 2.6300137042999268, "epoch": 0.40693880510440833, "grad_norm": 0.02875087782740593, "grad_norm_var": 1.7574259234096374e-06, "learning_rate": 0.006622137735220881, "loss": 2.639, "step": 11225 }, { "crossentropy": 2.5818727016448975, "epoch": 0.40697505800464034, "grad_norm": 0.029164662584662437, "grad_norm_var": 1.8131474205100467e-06, "learning_rate": 0.006621588067262425, "loss": 2.6331, "step": 11226 }, { "crossentropy": 2.5406484603881836, "epoch": 0.4070113109048724, "grad_norm": 0.027137957513332367, "grad_norm_var": 1.8054913503982949e-06, "learning_rate": 0.006621038377401962, "loss": 2.5685, "step": 11227 }, { "crossentropy": 2.4743871688842773, "epoch": 0.40704756380510443, "grad_norm": 0.02924147993326187, "grad_norm_var": 1.7732969852544822e-06, "learning_rate": 0.006620488665646915, "loss": 2.5242, "step": 11228 }, { "crossentropy": 2.6646437644958496, "epoch": 0.40708381670533644, "grad_norm": 0.027962472289800644, "grad_norm_var": 1.6691851504044603e-06, "learning_rate": 0.00661993893200471, "loss": 2.6821, "step": 11229 }, { "crossentropy": 2.550947904586792, "epoch": 0.40712006960556846, "grad_norm": 0.02628397010266781, "grad_norm_var": 1.848162365031255e-06, "learning_rate": 0.006619389176482772, "loss": 2.5007, "step": 11230 }, { "crossentropy": 2.689558506011963, "epoch": 0.40715632250580047, "grad_norm": 0.030008120462298393, "grad_norm_var": 1.9518926598330667e-06, "learning_rate": 0.006618839399088526, "loss": 2.6603, "step": 11231 }, { "crossentropy": 2.5135345458984375, "epoch": 0.4071925754060325, "grad_norm": 0.03115195594727993, "grad_norm_var": 2.329458601410944e-06, "learning_rate": 0.006618289599829395, "loss": 2.5242, "step": 11232 }, { "crossentropy": 2.459402084350586, "epoch": 0.4072288283062645, "grad_norm": 0.030772985890507698, "grad_norm_var": 2.3459986312905173e-06, "learning_rate": 0.006617739778712808, "loss": 2.5662, "step": 11233 }, { "crossentropy": 2.5201334953308105, "epoch": 0.4072650812064965, "grad_norm": 0.03156299516558647, "grad_norm_var": 2.742947507434051e-06, "learning_rate": 0.0066171899357461905, "loss": 2.6162, "step": 11234 }, { "crossentropy": 2.7063071727752686, "epoch": 0.40730133410672853, "grad_norm": 0.027973590418696404, "grad_norm_var": 2.7778562871133098e-06, "learning_rate": 0.00661664007093697, "loss": 2.5976, "step": 11235 }, { "crossentropy": 2.5955772399902344, "epoch": 0.40733758700696054, "grad_norm": 0.02889355458319187, "grad_norm_var": 2.528092295090905e-06, "learning_rate": 0.006616090184292571, "loss": 2.5978, "step": 11236 }, { "crossentropy": 2.6121742725372314, "epoch": 0.40737383990719256, "grad_norm": 0.027387268841266632, "grad_norm_var": 2.6146216985871294e-06, "learning_rate": 0.00661554027582042, "loss": 2.6001, "step": 11237 }, { "crossentropy": 2.5051586627960205, "epoch": 0.4074100928074246, "grad_norm": 0.026982147246599197, "grad_norm_var": 2.3662475955560224e-06, "learning_rate": 0.006614990345527948, "loss": 2.5526, "step": 11238 }, { "crossentropy": 2.6538901329040527, "epoch": 0.4074463457076566, "grad_norm": 0.031105997040867805, "grad_norm_var": 2.701350904331827e-06, "learning_rate": 0.00661444039342258, "loss": 2.6131, "step": 11239 }, { "crossentropy": 2.677302360534668, "epoch": 0.40748259860788866, "grad_norm": 0.028204461559653282, "grad_norm_var": 2.673864850722069e-06, "learning_rate": 0.006613890419511745, "loss": 2.6569, "step": 11240 }, { "crossentropy": 2.643186092376709, "epoch": 0.40751885150812067, "grad_norm": 0.026353899389505386, "grad_norm_var": 3.0843032729049957e-06, "learning_rate": 0.006613340423802868, "loss": 2.6239, "step": 11241 }, { "crossentropy": 2.568166971206665, "epoch": 0.4075551044083527, "grad_norm": 0.026945114135742188, "grad_norm_var": 3.2729562159854986e-06, "learning_rate": 0.006612790406303384, "loss": 2.6025, "step": 11242 }, { "crossentropy": 2.5709402561187744, "epoch": 0.4075913573085847, "grad_norm": 0.0264279805123806, "grad_norm_var": 3.445039718566236e-06, "learning_rate": 0.006612240367020717, "loss": 2.5548, "step": 11243 }, { "crossentropy": 2.6658380031585693, "epoch": 0.4076276102088167, "grad_norm": 0.02797657437622547, "grad_norm_var": 3.433245594372279e-06, "learning_rate": 0.006611690305962295, "loss": 2.6428, "step": 11244 }, { "crossentropy": 2.6014437675476074, "epoch": 0.40766386310904873, "grad_norm": 0.02673874981701374, "grad_norm_var": 3.6144732825522017e-06, "learning_rate": 0.006611140223135551, "loss": 2.6255, "step": 11245 }, { "crossentropy": 2.483705759048462, "epoch": 0.40770011600928074, "grad_norm": 0.028818324208259583, "grad_norm_var": 3.293070971081358e-06, "learning_rate": 0.006610590118547914, "loss": 2.5406, "step": 11246 }, { "crossentropy": 2.6153252124786377, "epoch": 0.40773636890951276, "grad_norm": 0.02881704457104206, "grad_norm_var": 3.155172783907015e-06, "learning_rate": 0.006610039992206813, "loss": 2.7771, "step": 11247 }, { "crossentropy": 2.4514853954315186, "epoch": 0.4077726218097448, "grad_norm": 0.029320798814296722, "grad_norm_var": 2.7189763078803072e-06, "learning_rate": 0.006609489844119678, "loss": 2.5486, "step": 11248 }, { "crossentropy": 2.6134939193725586, "epoch": 0.4078088747099768, "grad_norm": 0.029839444905519485, "grad_norm_var": 2.4771523934394388e-06, "learning_rate": 0.0066089396742939404, "loss": 2.5857, "step": 11249 }, { "crossentropy": 2.604745626449585, "epoch": 0.4078451276102088, "grad_norm": 0.02922808565199375, "grad_norm_var": 1.812711946417439e-06, "learning_rate": 0.00660838948273703, "loss": 2.6023, "step": 11250 }, { "crossentropy": 2.534634590148926, "epoch": 0.4078813805104408, "grad_norm": 0.02879204973578453, "grad_norm_var": 1.8311467570078596e-06, "learning_rate": 0.006607839269456382, "loss": 2.6307, "step": 11251 }, { "crossentropy": 2.6280930042266846, "epoch": 0.40791763341067283, "grad_norm": 0.028908392414450645, "grad_norm_var": 1.8324545463514608e-06, "learning_rate": 0.006607289034459422, "loss": 2.6476, "step": 11252 }, { "crossentropy": 2.6813297271728516, "epoch": 0.40795388631090485, "grad_norm": 0.02752344310283661, "grad_norm_var": 1.818123652401341e-06, "learning_rate": 0.006606738777753583, "loss": 2.6363, "step": 11253 }, { "crossentropy": 2.455291748046875, "epoch": 0.4079901392111369, "grad_norm": 0.02724243514239788, "grad_norm_var": 1.7783950599018285e-06, "learning_rate": 0.0066061884993463, "loss": 2.5473, "step": 11254 }, { "crossentropy": 2.41471004486084, "epoch": 0.40802639211136893, "grad_norm": 0.026682117953896523, "grad_norm_var": 1.325903596662882e-06, "learning_rate": 0.0066056381992450055, "loss": 2.522, "step": 11255 }, { "crossentropy": 2.6173529624938965, "epoch": 0.40806264501160094, "grad_norm": 0.02789328433573246, "grad_norm_var": 1.32300280391943e-06, "learning_rate": 0.00660508787745713, "loss": 2.5837, "step": 11256 }, { "crossentropy": 2.3789374828338623, "epoch": 0.40809889791183296, "grad_norm": 0.02733442187309265, "grad_norm_var": 1.1719089449801467e-06, "learning_rate": 0.0066045375339901045, "loss": 2.45, "step": 11257 }, { "crossentropy": 2.7185089588165283, "epoch": 0.408135150812065, "grad_norm": 0.027637463063001633, "grad_norm_var": 1.101671193923931e-06, "learning_rate": 0.0066039871688513664, "loss": 2.6716, "step": 11258 }, { "crossentropy": 2.500192642211914, "epoch": 0.408171403712297, "grad_norm": 0.02708660438656807, "grad_norm_var": 9.842536878135189e-07, "learning_rate": 0.006603436782048349, "loss": 2.5988, "step": 11259 }, { "crossentropy": 2.5848441123962402, "epoch": 0.408207656612529, "grad_norm": 0.026761552318930626, "grad_norm_var": 1.0989387061274257e-06, "learning_rate": 0.006602886373588482, "loss": 2.6438, "step": 11260 }, { "crossentropy": 2.6566596031188965, "epoch": 0.408243909512761, "grad_norm": 0.02931929938495159, "grad_norm_var": 1.067755055630948e-06, "learning_rate": 0.006602335943479202, "loss": 2.6441, "step": 11261 }, { "crossentropy": 2.6902217864990234, "epoch": 0.40828016241299303, "grad_norm": 0.02877330407500267, "grad_norm_var": 1.0641719128030741e-06, "learning_rate": 0.006601785491727944, "loss": 2.6985, "step": 11262 }, { "crossentropy": 2.5958356857299805, "epoch": 0.40831641531322505, "grad_norm": 0.027888046577572823, "grad_norm_var": 1.041368987740078e-06, "learning_rate": 0.006601235018342144, "loss": 2.6085, "step": 11263 }, { "crossentropy": 2.569786787033081, "epoch": 0.40835266821345706, "grad_norm": 0.0322955958545208, "grad_norm_var": 2.0630386154194918e-06, "learning_rate": 0.006600684523329233, "loss": 2.5976, "step": 11264 }, { "crossentropy": 2.660980701446533, "epoch": 0.4083889211136891, "grad_norm": 0.03388283774256706, "grad_norm_var": 3.901132112539794e-06, "learning_rate": 0.006600134006696647, "loss": 2.5694, "step": 11265 }, { "crossentropy": 2.5513393878936768, "epoch": 0.4084251740139211, "grad_norm": 0.02874339371919632, "grad_norm_var": 3.873806605867687e-06, "learning_rate": 0.006599583468451826, "loss": 2.5727, "step": 11266 }, { "crossentropy": 2.5667197704315186, "epoch": 0.40846142691415316, "grad_norm": 0.027296774089336395, "grad_norm_var": 3.964844136657601e-06, "learning_rate": 0.0065990329086022025, "loss": 2.5727, "step": 11267 }, { "crossentropy": 2.610175132751465, "epoch": 0.4084976798143852, "grad_norm": 0.02901596389710903, "grad_norm_var": 3.972080200217289e-06, "learning_rate": 0.006598482327155212, "loss": 2.5843, "step": 11268 }, { "crossentropy": 2.6112897396087646, "epoch": 0.4085339327146172, "grad_norm": 0.0341000109910965, "grad_norm_var": 5.853132688289848e-06, "learning_rate": 0.006597931724118291, "loss": 2.5897, "step": 11269 }, { "crossentropy": 2.534191608428955, "epoch": 0.4085701856148492, "grad_norm": 0.03533520922064781, "grad_norm_var": 8.18801012582628e-06, "learning_rate": 0.006597381099498878, "loss": 2.6485, "step": 11270 }, { "crossentropy": 2.460935592651367, "epoch": 0.4086064385150812, "grad_norm": 0.03268544003367424, "grad_norm_var": 8.28270869838627e-06, "learning_rate": 0.00659683045330441, "loss": 2.498, "step": 11271 }, { "crossentropy": 2.652095317840576, "epoch": 0.40864269141531323, "grad_norm": 0.027988461777567863, "grad_norm_var": 8.259673519699648e-06, "learning_rate": 0.006596279785542323, "loss": 2.5722, "step": 11272 }, { "crossentropy": 2.6536331176757812, "epoch": 0.40867894431554525, "grad_norm": 0.027954138815402985, "grad_norm_var": 8.083334344678713e-06, "learning_rate": 0.006595729096220054, "loss": 2.609, "step": 11273 }, { "crossentropy": 2.6073246002197266, "epoch": 0.40871519721577726, "grad_norm": 0.027104925364255905, "grad_norm_var": 8.254450776718847e-06, "learning_rate": 0.006595178385345043, "loss": 2.6287, "step": 11274 }, { "crossentropy": 2.601083517074585, "epoch": 0.4087514501160093, "grad_norm": 0.029370788484811783, "grad_norm_var": 7.7649785373078e-06, "learning_rate": 0.006594627652924727, "loss": 2.5073, "step": 11275 }, { "crossentropy": 2.759655714035034, "epoch": 0.4087877030162413, "grad_norm": 0.02663179114460945, "grad_norm_var": 7.82045588803102e-06, "learning_rate": 0.006594076898966544, "loss": 2.6256, "step": 11276 }, { "crossentropy": 2.574354887008667, "epoch": 0.4088239559164733, "grad_norm": 0.027368338778614998, "grad_norm_var": 8.20917496186513e-06, "learning_rate": 0.006593526123477935, "loss": 2.557, "step": 11277 }, { "crossentropy": 2.581878900527954, "epoch": 0.4088602088167053, "grad_norm": 0.026384631171822548, "grad_norm_var": 8.885511812508476e-06, "learning_rate": 0.006592975326466336, "loss": 2.4674, "step": 11278 }, { "crossentropy": 2.5985772609710693, "epoch": 0.40889646171693733, "grad_norm": 0.029424188658595085, "grad_norm_var": 8.676640822371018e-06, "learning_rate": 0.0065924245079391885, "loss": 2.6405, "step": 11279 }, { "crossentropy": 2.500542163848877, "epoch": 0.40893271461716935, "grad_norm": 0.029194144532084465, "grad_norm_var": 8.214365393074443e-06, "learning_rate": 0.006591873667903932, "loss": 2.5314, "step": 11280 }, { "crossentropy": 2.5596890449523926, "epoch": 0.4089689675174014, "grad_norm": 0.026922227814793587, "grad_norm_var": 7.202768873220304e-06, "learning_rate": 0.006591322806368004, "loss": 2.6116, "step": 11281 }, { "crossentropy": 2.565178394317627, "epoch": 0.40900522041763343, "grad_norm": 0.02716495841741562, "grad_norm_var": 7.432489003378094e-06, "learning_rate": 0.006590771923338848, "loss": 2.6063, "step": 11282 }, { "crossentropy": 2.4801204204559326, "epoch": 0.40904147331786544, "grad_norm": 0.027226336300373077, "grad_norm_var": 7.448761242716247e-06, "learning_rate": 0.006590221018823902, "loss": 2.5661, "step": 11283 }, { "crossentropy": 2.6737492084503174, "epoch": 0.40907772621809746, "grad_norm": 0.028474854305386543, "grad_norm_var": 7.465330266364954e-06, "learning_rate": 0.006589670092830608, "loss": 2.6625, "step": 11284 }, { "crossentropy": 2.6268179416656494, "epoch": 0.4091139791183295, "grad_norm": 0.026615312322974205, "grad_norm_var": 5.835257076693539e-06, "learning_rate": 0.006589119145366408, "loss": 2.6317, "step": 11285 }, { "crossentropy": 2.4985482692718506, "epoch": 0.4091502320185615, "grad_norm": 0.028186747804284096, "grad_norm_var": 2.505018859007728e-06, "learning_rate": 0.006588568176438742, "loss": 2.5375, "step": 11286 }, { "crossentropy": 2.530367612838745, "epoch": 0.4091864849187935, "grad_norm": 0.027990836650133133, "grad_norm_var": 9.769164817570076e-07, "learning_rate": 0.006588017186055052, "loss": 2.5517, "step": 11287 }, { "crossentropy": 2.5885438919067383, "epoch": 0.4092227378190255, "grad_norm": 0.028801895678043365, "grad_norm_var": 1.0441160210982714e-06, "learning_rate": 0.00658746617422278, "loss": 2.6296, "step": 11288 }, { "crossentropy": 2.5610623359680176, "epoch": 0.40925899071925753, "grad_norm": 0.027003895491361618, "grad_norm_var": 1.081149538427704e-06, "learning_rate": 0.006586915140949368, "loss": 2.5333, "step": 11289 }, { "crossentropy": 2.6020283699035645, "epoch": 0.40929524361948955, "grad_norm": 0.028073003515601158, "grad_norm_var": 1.0575406861427441e-06, "learning_rate": 0.006586364086242258, "loss": 2.6042, "step": 11290 }, { "crossentropy": 2.567988634109497, "epoch": 0.40933149651972156, "grad_norm": 0.028104091063141823, "grad_norm_var": 8.928865514783168e-07, "learning_rate": 0.006585813010108896, "loss": 2.5117, "step": 11291 }, { "crossentropy": 2.5462515354156494, "epoch": 0.4093677494199536, "grad_norm": 0.030133040621876717, "grad_norm_var": 1.1496674804224814e-06, "learning_rate": 0.0065852619125567205, "loss": 2.6069, "step": 11292 }, { "crossentropy": 2.7298583984375, "epoch": 0.4094040023201856, "grad_norm": 0.02774331346154213, "grad_norm_var": 1.1297851586038188e-06, "learning_rate": 0.006584710793593178, "loss": 2.699, "step": 11293 }, { "crossentropy": 2.529494285583496, "epoch": 0.40944025522041766, "grad_norm": 0.030840519815683365, "grad_norm_var": 1.4316636593014599e-06, "learning_rate": 0.0065841596532257105, "loss": 2.5693, "step": 11294 }, { "crossentropy": 2.6592838764190674, "epoch": 0.4094765081206497, "grad_norm": 0.02926148660480976, "grad_norm_var": 1.4077093251515233e-06, "learning_rate": 0.006583608491461765, "loss": 2.6695, "step": 11295 }, { "crossentropy": 2.6378862857818604, "epoch": 0.4095127610208817, "grad_norm": 0.03238547220826149, "grad_norm_var": 2.452991624520874e-06, "learning_rate": 0.006583057308308782, "loss": 2.5889, "step": 11296 }, { "crossentropy": 2.523608446121216, "epoch": 0.4095490139211137, "grad_norm": 0.027132129296660423, "grad_norm_var": 2.4134635268291405e-06, "learning_rate": 0.006582506103774207, "loss": 2.621, "step": 11297 }, { "crossentropy": 2.590707540512085, "epoch": 0.4095852668213457, "grad_norm": 0.027242625132203102, "grad_norm_var": 2.4005734031907426e-06, "learning_rate": 0.006581954877865487, "loss": 2.6532, "step": 11298 }, { "crossentropy": 2.653491973876953, "epoch": 0.40962151972157773, "grad_norm": 0.029690710827708244, "grad_norm_var": 2.377749793368994e-06, "learning_rate": 0.006581403630590065, "loss": 2.6823, "step": 11299 }, { "crossentropy": 2.468169689178467, "epoch": 0.40965777262180975, "grad_norm": 0.02778644487261772, "grad_norm_var": 2.4193144442007876e-06, "learning_rate": 0.006580852361955387, "loss": 2.5107, "step": 11300 }, { "crossentropy": 2.599205493927002, "epoch": 0.40969402552204176, "grad_norm": 0.026955731213092804, "grad_norm_var": 2.338200036658816e-06, "learning_rate": 0.0065803010719689, "loss": 2.6189, "step": 11301 }, { "crossentropy": 2.5128750801086426, "epoch": 0.4097302784222738, "grad_norm": 0.02716766484081745, "grad_norm_var": 2.456983511355387e-06, "learning_rate": 0.0065797497606380475, "loss": 2.5915, "step": 11302 }, { "crossentropy": 2.6065895557403564, "epoch": 0.4097665313225058, "grad_norm": 0.02667359821498394, "grad_norm_var": 2.6582878753630593e-06, "learning_rate": 0.006579198427970277, "loss": 2.5778, "step": 11303 }, { "crossentropy": 2.724724531173706, "epoch": 0.4098027842227378, "grad_norm": 0.0285175833851099, "grad_norm_var": 2.6495159744633253e-06, "learning_rate": 0.006578647073973036, "loss": 2.6833, "step": 11304 }, { "crossentropy": 2.670543670654297, "epoch": 0.4098390371229698, "grad_norm": 0.026642708107829094, "grad_norm_var": 2.725840551526581e-06, "learning_rate": 0.00657809569865377, "loss": 2.656, "step": 11305 }, { "crossentropy": 2.5619285106658936, "epoch": 0.40987529002320183, "grad_norm": 0.0290230643004179, "grad_norm_var": 2.7412266984820488e-06, "learning_rate": 0.006577544302019927, "loss": 2.5771, "step": 11306 }, { "crossentropy": 2.541520118713379, "epoch": 0.40991154292343385, "grad_norm": 0.02802036516368389, "grad_norm_var": 2.7455962633106486e-06, "learning_rate": 0.0065769928840789535, "loss": 2.5557, "step": 11307 }, { "crossentropy": 2.6426591873168945, "epoch": 0.4099477958236659, "grad_norm": 0.02836620807647705, "grad_norm_var": 2.5444578703079717e-06, "learning_rate": 0.0065764414448383, "loss": 2.5733, "step": 11308 }, { "crossentropy": 2.63645601272583, "epoch": 0.40998404872389793, "grad_norm": 0.030700530856847763, "grad_norm_var": 2.855520676018665e-06, "learning_rate": 0.006575889984305411, "loss": 2.6385, "step": 11309 }, { "crossentropy": 2.738184928894043, "epoch": 0.41002030162412995, "grad_norm": 0.02960122376680374, "grad_norm_var": 2.568966990078303e-06, "learning_rate": 0.0065753385024877355, "loss": 2.708, "step": 11310 }, { "crossentropy": 2.666766881942749, "epoch": 0.41005655452436196, "grad_norm": 0.02847728505730629, "grad_norm_var": 2.522341455802448e-06, "learning_rate": 0.006574786999392725, "loss": 2.6445, "step": 11311 }, { "crossentropy": 2.572293758392334, "epoch": 0.410092807424594, "grad_norm": 0.02778293937444687, "grad_norm_var": 1.399890562396921e-06, "learning_rate": 0.006574235475027825, "loss": 2.5911, "step": 11312 }, { "crossentropy": 2.625720739364624, "epoch": 0.410129060324826, "grad_norm": 0.029574137181043625, "grad_norm_var": 1.4537839546357433e-06, "learning_rate": 0.006573683929400485, "loss": 2.5848, "step": 11313 }, { "crossentropy": 2.6825878620147705, "epoch": 0.410165313225058, "grad_norm": 0.03037303499877453, "grad_norm_var": 1.6399717529573069e-06, "learning_rate": 0.006573132362518157, "loss": 2.7084, "step": 11314 }, { "crossentropy": 2.692216396331787, "epoch": 0.41020156612529, "grad_norm": 0.027778128162026405, "grad_norm_var": 1.5546423074556831e-06, "learning_rate": 0.0065725807743882885, "loss": 2.6462, "step": 11315 }, { "crossentropy": 2.529910087585449, "epoch": 0.41023781902552203, "grad_norm": 0.026813307777047157, "grad_norm_var": 1.6856594673707402e-06, "learning_rate": 0.00657202916501833, "loss": 2.5483, "step": 11316 }, { "crossentropy": 2.620704174041748, "epoch": 0.41027407192575405, "grad_norm": 0.02683454379439354, "grad_norm_var": 1.7079627157914282e-06, "learning_rate": 0.0065714775344157305, "loss": 2.5907, "step": 11317 }, { "crossentropy": 2.6587932109832764, "epoch": 0.41031032482598606, "grad_norm": 0.027577394619584084, "grad_norm_var": 1.6581439715576772e-06, "learning_rate": 0.006570925882587945, "loss": 2.6539, "step": 11318 }, { "crossentropy": 2.48118257522583, "epoch": 0.4103465777262181, "grad_norm": 0.028458774089813232, "grad_norm_var": 1.4708542926065067e-06, "learning_rate": 0.00657037420954242, "loss": 2.5278, "step": 11319 }, { "crossentropy": 2.5938854217529297, "epoch": 0.4103828306264501, "grad_norm": 0.029314301908016205, "grad_norm_var": 1.5220799366024434e-06, "learning_rate": 0.006569822515286609, "loss": 2.6352, "step": 11320 }, { "crossentropy": 2.6006035804748535, "epoch": 0.41041908352668216, "grad_norm": 0.02734067291021347, "grad_norm_var": 1.3835346126196228e-06, "learning_rate": 0.00656927079982796, "loss": 2.6937, "step": 11321 }, { "crossentropy": 2.6465189456939697, "epoch": 0.4104553364269142, "grad_norm": 0.026329267770051956, "grad_norm_var": 1.650004004794545e-06, "learning_rate": 0.006568719063173931, "loss": 2.6149, "step": 11322 }, { "crossentropy": 2.5542328357696533, "epoch": 0.4104915893271462, "grad_norm": 0.026389578357338905, "grad_norm_var": 1.8843912022963819e-06, "learning_rate": 0.006568167305331969, "loss": 2.5798, "step": 11323 }, { "crossentropy": 2.692014694213867, "epoch": 0.4105278422273782, "grad_norm": 0.027459006756544113, "grad_norm_var": 1.919590684392814e-06, "learning_rate": 0.006567615526309527, "loss": 2.6591, "step": 11324 }, { "crossentropy": 2.6234183311462402, "epoch": 0.4105640951276102, "grad_norm": 0.028068486601114273, "grad_norm_var": 1.4663519087880582e-06, "learning_rate": 0.0065670637261140575, "loss": 2.5739, "step": 11325 }, { "crossentropy": 2.6371402740478516, "epoch": 0.41060034802784223, "grad_norm": 0.029852911829948425, "grad_norm_var": 1.5236846827598132e-06, "learning_rate": 0.006566511904753018, "loss": 2.559, "step": 11326 }, { "crossentropy": 2.7501180171966553, "epoch": 0.41063660092807425, "grad_norm": 0.02735111303627491, "grad_norm_var": 1.5352607966806833e-06, "learning_rate": 0.006565960062233854, "loss": 2.7292, "step": 11327 }, { "crossentropy": 2.3763928413391113, "epoch": 0.41067285382830626, "grad_norm": 0.026581790298223495, "grad_norm_var": 1.653165462352224e-06, "learning_rate": 0.0065654081985640255, "loss": 2.3903, "step": 11328 }, { "crossentropy": 2.5871760845184326, "epoch": 0.4107091067285383, "grad_norm": 0.026775116100907326, "grad_norm_var": 1.5109498438900382e-06, "learning_rate": 0.006564856313750982, "loss": 2.6636, "step": 11329 }, { "crossentropy": 2.644473075866699, "epoch": 0.4107453596287703, "grad_norm": 0.02785257250070572, "grad_norm_var": 1.0117373328143164e-06, "learning_rate": 0.006564304407802181, "loss": 2.6304, "step": 11330 }, { "crossentropy": 2.5622527599334717, "epoch": 0.4107816125290023, "grad_norm": 0.03031272254884243, "grad_norm_var": 1.4908293626861649e-06, "learning_rate": 0.006563752480725073, "loss": 2.6647, "step": 11331 }, { "crossentropy": 2.503028154373169, "epoch": 0.4108178654292343, "grad_norm": 0.03704003617167473, "grad_norm_var": 6.808883786662056e-06, "learning_rate": 0.006563200532527116, "loss": 2.5337, "step": 11332 }, { "crossentropy": 2.6708598136901855, "epoch": 0.41085411832946633, "grad_norm": 0.02930109202861786, "grad_norm_var": 6.6920007085305985e-06, "learning_rate": 0.006562648563215763, "loss": 2.8067, "step": 11333 }, { "crossentropy": 2.610930919647217, "epoch": 0.4108903712296984, "grad_norm": 0.029605630785226822, "grad_norm_var": 6.6995262697085976e-06, "learning_rate": 0.006562096572798471, "loss": 2.6663, "step": 11334 }, { "crossentropy": 2.627584218978882, "epoch": 0.4109266241299304, "grad_norm": 0.02793188951909542, "grad_norm_var": 6.728699528271579e-06, "learning_rate": 0.006561544561282693, "loss": 2.6445, "step": 11335 }, { "crossentropy": 2.5895895957946777, "epoch": 0.41096287703016243, "grad_norm": 0.02774166688323021, "grad_norm_var": 6.73226575707655e-06, "learning_rate": 0.006560992528675887, "loss": 2.6211, "step": 11336 }, { "crossentropy": 2.556990623474121, "epoch": 0.41099912993039445, "grad_norm": 0.027714552357792854, "grad_norm_var": 6.683416252216181e-06, "learning_rate": 0.006560440474985505, "loss": 2.5659, "step": 11337 }, { "crossentropy": 2.7034690380096436, "epoch": 0.41103538283062646, "grad_norm": 0.028195686638355255, "grad_norm_var": 6.356155137193729e-06, "learning_rate": 0.00655988840021901, "loss": 2.6326, "step": 11338 }, { "crossentropy": 2.661883592605591, "epoch": 0.4110716357308585, "grad_norm": 0.02942477911710739, "grad_norm_var": 6.022875094866088e-06, "learning_rate": 0.006559336304383854, "loss": 2.6009, "step": 11339 }, { "crossentropy": 2.642976760864258, "epoch": 0.4111078886310905, "grad_norm": 0.03331160545349121, "grad_norm_var": 7.0972924889236975e-06, "learning_rate": 0.006558784187487494, "loss": 2.5845, "step": 11340 }, { "crossentropy": 2.507002830505371, "epoch": 0.4111441415313225, "grad_norm": 0.029667777940630913, "grad_norm_var": 7.017712002339629e-06, "learning_rate": 0.006558232049537387, "loss": 2.5209, "step": 11341 }, { "crossentropy": 2.7618894577026367, "epoch": 0.4111803944315545, "grad_norm": 0.027255866676568985, "grad_norm_var": 7.244784484769449e-06, "learning_rate": 0.006557679890540993, "loss": 2.6354, "step": 11342 }, { "crossentropy": 2.6058855056762695, "epoch": 0.41121664733178653, "grad_norm": 0.026797696948051453, "grad_norm_var": 7.395114016548936e-06, "learning_rate": 0.00655712771050577, "loss": 2.5849, "step": 11343 }, { "crossentropy": 2.4234421253204346, "epoch": 0.41125290023201855, "grad_norm": 0.02897818200290203, "grad_norm_var": 6.9512044319263696e-06, "learning_rate": 0.0065565755094391725, "loss": 2.4718, "step": 11344 }, { "crossentropy": 2.5255231857299805, "epoch": 0.41128915313225056, "grad_norm": 0.027897654101252556, "grad_norm_var": 6.66041114014739e-06, "learning_rate": 0.006556023287348658, "loss": 2.6155, "step": 11345 }, { "crossentropy": 2.514988422393799, "epoch": 0.4113254060324826, "grad_norm": 0.02916557714343071, "grad_norm_var": 6.512252606154639e-06, "learning_rate": 0.006555471044241691, "loss": 2.6155, "step": 11346 }, { "crossentropy": 2.6936347484588623, "epoch": 0.4113616589327146, "grad_norm": 0.027934426441788673, "grad_norm_var": 6.575199689401688e-06, "learning_rate": 0.006554918780125727, "loss": 2.6939, "step": 11347 }, { "crossentropy": 2.606718063354492, "epoch": 0.41139791183294666, "grad_norm": 0.028110062703490257, "grad_norm_var": 2.2812474329084496e-06, "learning_rate": 0.0065543664950082235, "loss": 2.6591, "step": 11348 }, { "crossentropy": 2.726663112640381, "epoch": 0.4114341647331787, "grad_norm": 0.02794656530022621, "grad_norm_var": 2.2854873808952528e-06, "learning_rate": 0.006553814188896642, "loss": 2.6722, "step": 11349 }, { "crossentropy": 2.568631172180176, "epoch": 0.4114704176334107, "grad_norm": 0.02894832007586956, "grad_norm_var": 2.2247921741880282e-06, "learning_rate": 0.006553261861798442, "loss": 2.6334, "step": 11350 }, { "crossentropy": 2.5602495670318604, "epoch": 0.4115066705336427, "grad_norm": 0.029522491618990898, "grad_norm_var": 2.248882356642214e-06, "learning_rate": 0.006552709513721084, "loss": 2.5686, "step": 11351 }, { "crossentropy": 2.6965279579162598, "epoch": 0.4115429234338747, "grad_norm": 0.030525216832756996, "grad_norm_var": 2.3910842588451086e-06, "learning_rate": 0.006552157144672028, "loss": 2.6487, "step": 11352 }, { "crossentropy": 2.628345012664795, "epoch": 0.41157917633410673, "grad_norm": 0.028096720576286316, "grad_norm_var": 2.3430031593833662e-06, "learning_rate": 0.006551604754658733, "loss": 2.53, "step": 11353 }, { "crossentropy": 2.672791004180908, "epoch": 0.41161542923433875, "grad_norm": 0.028255755081772804, "grad_norm_var": 2.337898778313853e-06, "learning_rate": 0.006551052343688662, "loss": 2.7214, "step": 11354 }, { "crossentropy": 2.524456739425659, "epoch": 0.41165168213457076, "grad_norm": 0.03051292896270752, "grad_norm_var": 2.4931314396588507e-06, "learning_rate": 0.006550499911769274, "loss": 2.5936, "step": 11355 }, { "crossentropy": 2.4890925884246826, "epoch": 0.4116879350348028, "grad_norm": 0.029465317726135254, "grad_norm_var": 1.1721982350905754e-06, "learning_rate": 0.006549947458908034, "loss": 2.5503, "step": 11356 }, { "crossentropy": 2.4301280975341797, "epoch": 0.4117241879350348, "grad_norm": 0.028286049142479897, "grad_norm_var": 1.111852140262788e-06, "learning_rate": 0.0065493949851124006, "loss": 2.4677, "step": 11357 }, { "crossentropy": 2.6652750968933105, "epoch": 0.4117604408352668, "grad_norm": 0.029095446690917015, "grad_norm_var": 9.921550468989954e-07, "learning_rate": 0.0065488424903898355, "loss": 2.6535, "step": 11358 }, { "crossentropy": 2.699202060699463, "epoch": 0.4117966937354988, "grad_norm": 0.02770168147981167, "grad_norm_var": 8.113929726376168e-07, "learning_rate": 0.006548289974747803, "loss": 2.6584, "step": 11359 }, { "crossentropy": 2.621410846710205, "epoch": 0.41183294663573083, "grad_norm": 0.02758784405887127, "grad_norm_var": 8.950335994751021e-07, "learning_rate": 0.006547737438193766, "loss": 2.6312, "step": 11360 }, { "crossentropy": 2.635051965713501, "epoch": 0.4118691995359629, "grad_norm": 0.0296214297413826, "grad_norm_var": 8.98462835412633e-07, "learning_rate": 0.006547184880735184, "loss": 2.6213, "step": 11361 }, { "crossentropy": 2.509768009185791, "epoch": 0.4119054524361949, "grad_norm": 0.029187561944127083, "grad_norm_var": 8.995690898306082e-07, "learning_rate": 0.006546632302379523, "loss": 2.5818, "step": 11362 }, { "crossentropy": 2.4884302616119385, "epoch": 0.41194170533642693, "grad_norm": 0.027745438739657402, "grad_norm_var": 9.236089603434911e-07, "learning_rate": 0.006546079703134246, "loss": 2.4913, "step": 11363 }, { "crossentropy": 2.610424757003784, "epoch": 0.41197795823665895, "grad_norm": 0.0296607818454504, "grad_norm_var": 9.337217961851434e-07, "learning_rate": 0.006545527083006817, "loss": 2.61, "step": 11364 }, { "crossentropy": 2.6043500900268555, "epoch": 0.41201421113689096, "grad_norm": 0.028608668595552444, "grad_norm_var": 8.782776508123863e-07, "learning_rate": 0.006544974442004696, "loss": 2.6003, "step": 11365 }, { "crossentropy": 2.6825554370880127, "epoch": 0.412050464037123, "grad_norm": 0.03392452001571655, "grad_norm_var": 2.4405128120937963e-06, "learning_rate": 0.006544421780135354, "loss": 2.681, "step": 11366 }, { "crossentropy": 2.6370255947113037, "epoch": 0.412086716937355, "grad_norm": 0.031034816056489944, "grad_norm_var": 2.640951833352449e-06, "learning_rate": 0.0065438690974062505, "loss": 2.5941, "step": 11367 }, { "crossentropy": 2.5497028827667236, "epoch": 0.412122969837587, "grad_norm": 0.02689671516418457, "grad_norm_var": 2.886494678461191e-06, "learning_rate": 0.006543316393824854, "loss": 2.5791, "step": 11368 }, { "crossentropy": 2.3404078483581543, "epoch": 0.412159222737819, "grad_norm": 0.02795172668993473, "grad_norm_var": 2.90730323530898e-06, "learning_rate": 0.006542763669398625, "loss": 2.4776, "step": 11369 }, { "crossentropy": 2.574483871459961, "epoch": 0.41219547563805103, "grad_norm": 0.028968097642064095, "grad_norm_var": 2.8592080535707325e-06, "learning_rate": 0.006542210924135032, "loss": 2.6302, "step": 11370 }, { "crossentropy": 2.5734405517578125, "epoch": 0.41223172853828305, "grad_norm": 0.027342699468135834, "grad_norm_var": 2.9072603395641535e-06, "learning_rate": 0.006541658158041541, "loss": 2.557, "step": 11371 }, { "crossentropy": 2.6286394596099854, "epoch": 0.41226798143851506, "grad_norm": 0.026567941531538963, "grad_norm_var": 3.2299322614045533e-06, "learning_rate": 0.0065411053711256165, "loss": 2.5698, "step": 11372 }, { "crossentropy": 2.501288652420044, "epoch": 0.4123042343387471, "grad_norm": 0.026542099192738533, "grad_norm_var": 3.530534838209973e-06, "learning_rate": 0.006540552563394725, "loss": 2.563, "step": 11373 }, { "crossentropy": 2.545708417892456, "epoch": 0.4123404872389791, "grad_norm": 0.027424238622188568, "grad_norm_var": 3.6063573020268194e-06, "learning_rate": 0.006539999734856334, "loss": 2.6051, "step": 11374 }, { "crossentropy": 2.598235607147217, "epoch": 0.41237674013921116, "grad_norm": 0.026888811960816383, "grad_norm_var": 3.7393690279772004e-06, "learning_rate": 0.006539446885517909, "loss": 2.5601, "step": 11375 }, { "crossentropy": 2.6578965187072754, "epoch": 0.4124129930394432, "grad_norm": 0.027754120528697968, "grad_norm_var": 3.720938926321006e-06, "learning_rate": 0.006538894015386918, "loss": 2.6442, "step": 11376 }, { "crossentropy": 2.5154154300689697, "epoch": 0.4124492459396752, "grad_norm": 0.025675185024738312, "grad_norm_var": 4.108119045792037e-06, "learning_rate": 0.006538341124470828, "loss": 2.5613, "step": 11377 }, { "crossentropy": 2.4895009994506836, "epoch": 0.4124854988399072, "grad_norm": 0.02749790996313095, "grad_norm_var": 4.077773202793228e-06, "learning_rate": 0.006537788212777106, "loss": 2.5998, "step": 11378 }, { "crossentropy": 2.4307339191436768, "epoch": 0.4125217517401392, "grad_norm": 0.027272064238786697, "grad_norm_var": 4.117643406670544e-06, "learning_rate": 0.00653723528031322, "loss": 2.4613, "step": 11379 }, { "crossentropy": 2.6901819705963135, "epoch": 0.41255800464037123, "grad_norm": 0.026859257370233536, "grad_norm_var": 4.034749112446001e-06, "learning_rate": 0.00653668232708664, "loss": 2.5791, "step": 11380 }, { "crossentropy": 2.657679557800293, "epoch": 0.41259425754060325, "grad_norm": 0.026243815198540688, "grad_norm_var": 4.176769890795682e-06, "learning_rate": 0.006536129353104833, "loss": 2.6092, "step": 11381 }, { "crossentropy": 2.7377898693084717, "epoch": 0.41263051044083526, "grad_norm": 0.03104308992624283, "grad_norm_var": 2.3437583225315836e-06, "learning_rate": 0.006535576358375268, "loss": 2.6056, "step": 11382 }, { "crossentropy": 2.5157623291015625, "epoch": 0.4126667633410673, "grad_norm": 0.029789550229907036, "grad_norm_var": 1.8741377109929147e-06, "learning_rate": 0.006535023342905413, "loss": 2.5864, "step": 11383 }, { "crossentropy": 2.806288242340088, "epoch": 0.4127030162412993, "grad_norm": 0.028305819258093834, "grad_norm_var": 1.8764674365650523e-06, "learning_rate": 0.006534470306702739, "loss": 2.7212, "step": 11384 }, { "crossentropy": 2.708099603652954, "epoch": 0.4127392691415313, "grad_norm": 0.02721860632300377, "grad_norm_var": 1.8788940806122583e-06, "learning_rate": 0.006533917249774715, "loss": 2.6053, "step": 11385 }, { "crossentropy": 2.5641298294067383, "epoch": 0.4127755220417633, "grad_norm": 0.026493102312088013, "grad_norm_var": 1.8060097824872922e-06, "learning_rate": 0.006533364172128811, "loss": 2.5525, "step": 11386 }, { "crossentropy": 2.6062963008880615, "epoch": 0.41281177494199534, "grad_norm": 0.027124855667352676, "grad_norm_var": 1.8115810435921179e-06, "learning_rate": 0.006532811073772495, "loss": 2.6756, "step": 11387 }, { "crossentropy": 2.6033401489257812, "epoch": 0.4128480278422274, "grad_norm": 0.028463387861847878, "grad_norm_var": 1.8210968891790279e-06, "learning_rate": 0.006532257954713241, "loss": 2.5548, "step": 11388 }, { "crossentropy": 2.657285690307617, "epoch": 0.4128842807424594, "grad_norm": 0.028318170458078384, "grad_norm_var": 1.782588820885148e-06, "learning_rate": 0.0065317048149585175, "loss": 2.6398, "step": 11389 }, { "crossentropy": 2.483668327331543, "epoch": 0.41292053364269143, "grad_norm": 0.027386020869016647, "grad_norm_var": 1.783821598253761e-06, "learning_rate": 0.006531151654515796, "loss": 2.5372, "step": 11390 }, { "crossentropy": 2.784839153289795, "epoch": 0.41295678654292345, "grad_norm": 0.027736397460103035, "grad_norm_var": 1.74316656242437e-06, "learning_rate": 0.0065305984733925486, "loss": 2.6643, "step": 11391 }, { "crossentropy": 2.5399158000946045, "epoch": 0.41299303944315546, "grad_norm": 0.029346786439418793, "grad_norm_var": 1.9134433869828164e-06, "learning_rate": 0.0065300452715962454, "loss": 2.4731, "step": 11392 }, { "crossentropy": 2.7018282413482666, "epoch": 0.4130292923433875, "grad_norm": 0.02780425362288952, "grad_norm_var": 1.5940290948787221e-06, "learning_rate": 0.006529492049134359, "loss": 2.642, "step": 11393 }, { "crossentropy": 2.6746129989624023, "epoch": 0.4130655452436195, "grad_norm": 0.027849331498146057, "grad_norm_var": 1.5814339529735915e-06, "learning_rate": 0.006528938806014362, "loss": 2.6324, "step": 11394 }, { "crossentropy": 2.4571969509124756, "epoch": 0.4131017981438515, "grad_norm": 0.030839847400784492, "grad_norm_var": 2.0528835791397733e-06, "learning_rate": 0.006528385542243724, "loss": 2.537, "step": 11395 }, { "crossentropy": 2.6396825313568115, "epoch": 0.4131380510440835, "grad_norm": 0.02741454727947712, "grad_norm_var": 1.9746362914106638e-06, "learning_rate": 0.006527832257829922, "loss": 2.6425, "step": 11396 }, { "crossentropy": 2.567673444747925, "epoch": 0.41317430394431554, "grad_norm": 0.029343195259571075, "grad_norm_var": 1.7620397872891337e-06, "learning_rate": 0.006527278952780426, "loss": 2.5951, "step": 11397 }, { "crossentropy": 2.508148431777954, "epoch": 0.41321055684454755, "grad_norm": 0.027917364612221718, "grad_norm_var": 1.2731363102734423e-06, "learning_rate": 0.006526725627102711, "loss": 2.5514, "step": 11398 }, { "crossentropy": 2.6091156005859375, "epoch": 0.41324680974477956, "grad_norm": 0.02592761628329754, "grad_norm_var": 1.391663487360489e-06, "learning_rate": 0.006526172280804249, "loss": 2.6016, "step": 11399 }, { "crossentropy": 2.598254442214966, "epoch": 0.4132830626450116, "grad_norm": 0.02846800908446312, "grad_norm_var": 1.4006112692606694e-06, "learning_rate": 0.006525618913892514, "loss": 2.5714, "step": 11400 }, { "crossentropy": 2.638352394104004, "epoch": 0.4133193155452436, "grad_norm": 0.026404917240142822, "grad_norm_var": 1.524403616705638e-06, "learning_rate": 0.006525065526374979, "loss": 2.6242, "step": 11401 }, { "crossentropy": 2.5509676933288574, "epoch": 0.41335556844547566, "grad_norm": 0.026857765391469002, "grad_norm_var": 1.4629785725526109e-06, "learning_rate": 0.006524512118259122, "loss": 2.6027, "step": 11402 }, { "crossentropy": 2.566681385040283, "epoch": 0.4133918213457077, "grad_norm": 0.027800044044852257, "grad_norm_var": 1.4171734390026979e-06, "learning_rate": 0.006523958689552414, "loss": 2.6595, "step": 11403 }, { "crossentropy": 2.602646589279175, "epoch": 0.4134280742459397, "grad_norm": 0.027383089065551758, "grad_norm_var": 1.4222660574492175e-06, "learning_rate": 0.006523405240262332, "loss": 2.5369, "step": 11404 }, { "crossentropy": 2.5100209712982178, "epoch": 0.4134643271461717, "grad_norm": 0.026729168370366096, "grad_norm_var": 1.4967391999058058e-06, "learning_rate": 0.006522851770396349, "loss": 2.5776, "step": 11405 }, { "crossentropy": 2.59281587600708, "epoch": 0.4135005800464037, "grad_norm": 0.027043433859944344, "grad_norm_var": 1.5241502219593208e-06, "learning_rate": 0.0065222982799619435, "loss": 2.5241, "step": 11406 }, { "crossentropy": 2.685389280319214, "epoch": 0.41353683294663574, "grad_norm": 0.026954621076583862, "grad_norm_var": 1.5694068035497356e-06, "learning_rate": 0.006521744768966589, "loss": 2.6346, "step": 11407 }, { "crossentropy": 2.60848069190979, "epoch": 0.41357308584686775, "grad_norm": 0.02953122742474079, "grad_norm_var": 1.6106722461314258e-06, "learning_rate": 0.006521191237417762, "loss": 2.5369, "step": 11408 }, { "crossentropy": 2.571023941040039, "epoch": 0.41360933874709976, "grad_norm": 0.028994472697377205, "grad_norm_var": 1.7051584767978819e-06, "learning_rate": 0.006520637685322937, "loss": 2.625, "step": 11409 }, { "crossentropy": 2.7924652099609375, "epoch": 0.4136455916473318, "grad_norm": 0.029457662254571915, "grad_norm_var": 1.8685800811074049e-06, "learning_rate": 0.006520084112689596, "loss": 2.791, "step": 11410 }, { "crossentropy": 2.7415266036987305, "epoch": 0.4136818445475638, "grad_norm": 0.029856886714696884, "grad_norm_var": 1.549131194272093e-06, "learning_rate": 0.006519530519525211, "loss": 2.6161, "step": 11411 }, { "crossentropy": 2.6147613525390625, "epoch": 0.4137180974477958, "grad_norm": 0.0303545743227005, "grad_norm_var": 1.906808480885552e-06, "learning_rate": 0.0065189769058372595, "loss": 2.6384, "step": 11412 }, { "crossentropy": 2.7174108028411865, "epoch": 0.4137543503480278, "grad_norm": 0.030589405447244644, "grad_norm_var": 2.216425796915343e-06, "learning_rate": 0.006518423271633218, "loss": 2.6194, "step": 11413 }, { "crossentropy": 2.7135238647460938, "epoch": 0.41379060324825984, "grad_norm": 0.032082539051771164, "grad_norm_var": 3.176025903781178e-06, "learning_rate": 0.00651786961692057, "loss": 2.592, "step": 11414 }, { "crossentropy": 2.76354718208313, "epoch": 0.4138268561484919, "grad_norm": 0.0354643277823925, "grad_norm_var": 5.713726113618321e-06, "learning_rate": 0.006517315941706787, "loss": 2.6761, "step": 11415 }, { "crossentropy": 2.6262779235839844, "epoch": 0.4138631090487239, "grad_norm": 0.032267048954963684, "grad_norm_var": 6.3471780390788575e-06, "learning_rate": 0.00651676224599935, "loss": 2.5986, "step": 11416 }, { "crossentropy": 2.6830883026123047, "epoch": 0.41389936194895594, "grad_norm": 0.027706410735845566, "grad_norm_var": 5.961813335038137e-06, "learning_rate": 0.0065162085298057365, "loss": 2.6914, "step": 11417 }, { "crossentropy": 2.6282339096069336, "epoch": 0.41393561484918795, "grad_norm": 0.02715368941426277, "grad_norm_var": 5.8702519744765196e-06, "learning_rate": 0.006515654793133425, "loss": 2.5966, "step": 11418 }, { "crossentropy": 2.597778081893921, "epoch": 0.41397186774941996, "grad_norm": 0.02681606635451317, "grad_norm_var": 6.132217414209218e-06, "learning_rate": 0.006515101035989899, "loss": 2.5, "step": 11419 }, { "crossentropy": 2.593668222427368, "epoch": 0.414008120649652, "grad_norm": 0.02869969792664051, "grad_norm_var": 5.908606408269026e-06, "learning_rate": 0.006514547258382633, "loss": 2.6049, "step": 11420 }, { "crossentropy": 2.5444412231445312, "epoch": 0.414044373549884, "grad_norm": 0.030945517122745514, "grad_norm_var": 5.542770593338955e-06, "learning_rate": 0.006513993460319105, "loss": 2.4923, "step": 11421 }, { "crossentropy": 2.488755702972412, "epoch": 0.414080626450116, "grad_norm": 0.027727724984288216, "grad_norm_var": 5.336967437675339e-06, "learning_rate": 0.0065134396418068, "loss": 2.5906, "step": 11422 }, { "crossentropy": 2.701395273208618, "epoch": 0.414116879350348, "grad_norm": 0.027324847877025604, "grad_norm_var": 5.2118578240806235e-06, "learning_rate": 0.006512885802853197, "loss": 2.7103, "step": 11423 }, { "crossentropy": 2.5403201580047607, "epoch": 0.41415313225058004, "grad_norm": 0.026166193187236786, "grad_norm_var": 5.988906399963809e-06, "learning_rate": 0.006512331943465774, "loss": 2.6165, "step": 11424 }, { "crossentropy": 2.7033650875091553, "epoch": 0.41418938515081205, "grad_norm": 0.028261899948120117, "grad_norm_var": 6.06942711715245e-06, "learning_rate": 0.006511778063652012, "loss": 2.6641, "step": 11425 }, { "crossentropy": 2.5388572216033936, "epoch": 0.41422563805104406, "grad_norm": 0.02833504229784012, "grad_norm_var": 6.144002252718208e-06, "learning_rate": 0.006511224163419394, "loss": 2.5779, "step": 11426 }, { "crossentropy": 2.7246487140655518, "epoch": 0.4142618909512761, "grad_norm": 0.029173683375120163, "grad_norm_var": 6.127865614592489e-06, "learning_rate": 0.0065106702427754, "loss": 2.6507, "step": 11427 }, { "crossentropy": 2.6048824787139893, "epoch": 0.4142981438515081, "grad_norm": 0.029177721589803696, "grad_norm_var": 6.051584718540169e-06, "learning_rate": 0.0065101163017275135, "loss": 2.5093, "step": 11428 }, { "crossentropy": 2.679673910140991, "epoch": 0.41433439675174016, "grad_norm": 0.0317775160074234, "grad_norm_var": 6.353062825768807e-06, "learning_rate": 0.006509562340283213, "loss": 2.6476, "step": 11429 }, { "crossentropy": 2.6702585220336914, "epoch": 0.4143706496519722, "grad_norm": 0.03747576102614403, "grad_norm_var": 1.015932307809562e-05, "learning_rate": 0.006509008358449982, "loss": 2.5612, "step": 11430 }, { "crossentropy": 2.6315255165100098, "epoch": 0.4144069025522042, "grad_norm": 0.034643109887838364, "grad_norm_var": 9.565329593418104e-06, "learning_rate": 0.006508454356235305, "loss": 2.6207, "step": 11431 }, { "crossentropy": 2.4863133430480957, "epoch": 0.4144431554524362, "grad_norm": 0.02964947372674942, "grad_norm_var": 9.063866781928767e-06, "learning_rate": 0.006507900333646661, "loss": 2.5064, "step": 11432 }, { "crossentropy": 2.433708667755127, "epoch": 0.4144794083526682, "grad_norm": 0.03158161789178848, "grad_norm_var": 9.106890299978518e-06, "learning_rate": 0.006507346290691534, "loss": 2.5309, "step": 11433 }, { "crossentropy": 2.594073534011841, "epoch": 0.41451566125290024, "grad_norm": 0.03222787007689476, "grad_norm_var": 9.005652910513914e-06, "learning_rate": 0.006506792227377409, "loss": 2.5749, "step": 11434 }, { "crossentropy": 2.643768072128296, "epoch": 0.41455191415313225, "grad_norm": 0.029161470010876656, "grad_norm_var": 8.354096761221013e-06, "learning_rate": 0.00650623814371177, "loss": 2.663, "step": 11435 }, { "crossentropy": 2.6781206130981445, "epoch": 0.41458816705336426, "grad_norm": 0.0268175657838583, "grad_norm_var": 8.938341495202359e-06, "learning_rate": 0.006505684039702098, "loss": 2.5954, "step": 11436 }, { "crossentropy": 2.531980514526367, "epoch": 0.4146244199535963, "grad_norm": 0.027291616424918175, "grad_norm_var": 9.32574619682726e-06, "learning_rate": 0.006505129915355876, "loss": 2.5111, "step": 11437 }, { "crossentropy": 2.7375071048736572, "epoch": 0.4146606728538283, "grad_norm": 0.028295548632740974, "grad_norm_var": 9.189038692968403e-06, "learning_rate": 0.006504575770680592, "loss": 2.6421, "step": 11438 }, { "crossentropy": 2.5828585624694824, "epoch": 0.4146969257540603, "grad_norm": 0.027661776170134544, "grad_norm_var": 9.083365599500204e-06, "learning_rate": 0.0065040216056837285, "loss": 2.5588, "step": 11439 }, { "crossentropy": 2.6423516273498535, "epoch": 0.4147331786542923, "grad_norm": 0.02725835144519806, "grad_norm_var": 8.62058548909635e-06, "learning_rate": 0.006503467420372773, "loss": 2.6271, "step": 11440 }, { "crossentropy": 2.5544943809509277, "epoch": 0.41476943155452434, "grad_norm": 0.028092965483665466, "grad_norm_var": 8.659815779303682e-06, "learning_rate": 0.006502913214755205, "loss": 2.5633, "step": 11441 }, { "crossentropy": 2.70613431930542, "epoch": 0.4148056844547564, "grad_norm": 0.029299922287464142, "grad_norm_var": 8.514892307313994e-06, "learning_rate": 0.006502358988838515, "loss": 2.723, "step": 11442 }, { "crossentropy": 2.6022629737854004, "epoch": 0.4148419373549884, "grad_norm": 0.028187651187181473, "grad_norm_var": 8.680893104119842e-06, "learning_rate": 0.006501804742630186, "loss": 2.6313, "step": 11443 }, { "crossentropy": 2.557250499725342, "epoch": 0.41487819025522044, "grad_norm": 0.02705816924571991, "grad_norm_var": 9.169326893131953e-06, "learning_rate": 0.0065012504761377075, "loss": 2.6093, "step": 11444 }, { "crossentropy": 2.6649985313415527, "epoch": 0.41491444315545245, "grad_norm": 0.02654232457280159, "grad_norm_var": 9.487978377771646e-06, "learning_rate": 0.00650069618936856, "loss": 2.5641, "step": 11445 }, { "crossentropy": 2.5414321422576904, "epoch": 0.41495069605568446, "grad_norm": 0.026674866676330566, "grad_norm_var": 5.2252006609972e-06, "learning_rate": 0.006500141882330234, "loss": 2.6321, "step": 11446 }, { "crossentropy": 2.6522936820983887, "epoch": 0.4149869489559165, "grad_norm": 0.027316337451338768, "grad_norm_var": 2.8504309728211624e-06, "learning_rate": 0.006499587555030216, "loss": 2.6658, "step": 11447 }, { "crossentropy": 2.5954856872558594, "epoch": 0.4150232018561485, "grad_norm": 0.02639753557741642, "grad_norm_var": 2.9348589714986987e-06, "learning_rate": 0.0064990332074759925, "loss": 2.6143, "step": 11448 }, { "crossentropy": 2.5035691261291504, "epoch": 0.4150594547563805, "grad_norm": 0.02615133486688137, "grad_norm_var": 2.26905311635194e-06, "learning_rate": 0.006498478839675049, "loss": 2.4827, "step": 11449 }, { "crossentropy": 2.5593197345733643, "epoch": 0.4150957076566125, "grad_norm": 0.028214899823069572, "grad_norm_var": 8.941647482860742e-07, "learning_rate": 0.006497924451634877, "loss": 2.5222, "step": 11450 }, { "crossentropy": 2.6520254611968994, "epoch": 0.41513196055684454, "grad_norm": 0.03064531832933426, "grad_norm_var": 1.3552711934668634e-06, "learning_rate": 0.0064973700433629605, "loss": 2.6136, "step": 11451 }, { "crossentropy": 2.6369736194610596, "epoch": 0.41516821345707655, "grad_norm": 0.03290659934282303, "grad_norm_var": 3.021769668670029e-06, "learning_rate": 0.0064968156148667916, "loss": 2.6083, "step": 11452 }, { "crossentropy": 2.5963993072509766, "epoch": 0.41520446635730857, "grad_norm": 0.030154641717672348, "grad_norm_var": 3.2637748736114487e-06, "learning_rate": 0.0064962611661538525, "loss": 2.5237, "step": 11453 }, { "crossentropy": 2.7045180797576904, "epoch": 0.4152407192575406, "grad_norm": 0.02694634161889553, "grad_norm_var": 3.35651618512117e-06, "learning_rate": 0.006495706697231638, "loss": 2.7297, "step": 11454 }, { "crossentropy": 2.603595018386841, "epoch": 0.4152769721577726, "grad_norm": 0.027127312496304512, "grad_norm_var": 3.405192885673249e-06, "learning_rate": 0.006495152208107634, "loss": 2.6101, "step": 11455 }, { "crossentropy": 2.5728824138641357, "epoch": 0.41531322505800466, "grad_norm": 0.026440130546689034, "grad_norm_var": 3.534591832492319e-06, "learning_rate": 0.006494597698789333, "loss": 2.5814, "step": 11456 }, { "crossentropy": 2.6305203437805176, "epoch": 0.4153494779582367, "grad_norm": 0.026111986488103867, "grad_norm_var": 3.757885241817517e-06, "learning_rate": 0.006494043169284218, "loss": 2.5669, "step": 11457 }, { "crossentropy": 2.7048826217651367, "epoch": 0.4153857308584687, "grad_norm": 0.02861885540187359, "grad_norm_var": 3.658475682105124e-06, "learning_rate": 0.006493488619599784, "loss": 2.705, "step": 11458 }, { "crossentropy": 2.537083387374878, "epoch": 0.4154219837587007, "grad_norm": 0.028044072911143303, "grad_norm_var": 3.6531737233358076e-06, "learning_rate": 0.006492934049743521, "loss": 2.5879, "step": 11459 }, { "crossentropy": 2.675053834915161, "epoch": 0.4154582366589327, "grad_norm": 0.03311445191502571, "grad_norm_var": 5.3187574870023804e-06, "learning_rate": 0.006492379459722917, "loss": 2.6435, "step": 11460 }, { "crossentropy": 2.6169891357421875, "epoch": 0.41549448955916474, "grad_norm": 0.030659586191177368, "grad_norm_var": 5.461133957190934e-06, "learning_rate": 0.0064918248495454646, "loss": 2.6748, "step": 11461 }, { "crossentropy": 2.5633795261383057, "epoch": 0.41553074245939675, "grad_norm": 0.028890414163470268, "grad_norm_var": 5.2375520124501095e-06, "learning_rate": 0.006491270219218652, "loss": 2.5992, "step": 11462 }, { "crossentropy": 2.6759274005889893, "epoch": 0.41556699535962877, "grad_norm": 0.02805010788142681, "grad_norm_var": 5.1447597622215785e-06, "learning_rate": 0.006490715568749974, "loss": 2.6854, "step": 11463 }, { "crossentropy": 2.555527448654175, "epoch": 0.4156032482598608, "grad_norm": 0.02716459520161152, "grad_norm_var": 4.950693211623556e-06, "learning_rate": 0.006490160898146919, "loss": 2.51, "step": 11464 }, { "crossentropy": 2.5426347255706787, "epoch": 0.4156395011600928, "grad_norm": 0.03173219412565231, "grad_norm_var": 4.998928271965222e-06, "learning_rate": 0.00648960620741698, "loss": 2.5095, "step": 11465 }, { "crossentropy": 2.530702829360962, "epoch": 0.4156757540603248, "grad_norm": 0.027374744415283203, "grad_norm_var": 5.136743700185492e-06, "learning_rate": 0.006489051496567648, "loss": 2.5962, "step": 11466 }, { "crossentropy": 2.489091634750366, "epoch": 0.4157120069605568, "grad_norm": 0.03047824092209339, "grad_norm_var": 5.101809680072922e-06, "learning_rate": 0.006488496765606415, "loss": 2.5425, "step": 11467 }, { "crossentropy": 2.559454917907715, "epoch": 0.41574825986078884, "grad_norm": 0.03261200711131096, "grad_norm_var": 4.95333059874653e-06, "learning_rate": 0.006487942014540777, "loss": 2.5968, "step": 11468 }, { "crossentropy": 2.488544464111328, "epoch": 0.4157845127610209, "grad_norm": 0.029972266405820847, "grad_norm_var": 4.926602327258083e-06, "learning_rate": 0.006487387243378222, "loss": 2.5502, "step": 11469 }, { "crossentropy": 2.5632143020629883, "epoch": 0.4158207656612529, "grad_norm": 0.0286149512976408, "grad_norm_var": 4.652932690183764e-06, "learning_rate": 0.006486832452126246, "loss": 2.6251, "step": 11470 }, { "crossentropy": 2.486337661743164, "epoch": 0.41585701856148494, "grad_norm": 0.02582668885588646, "grad_norm_var": 5.094316608247653e-06, "learning_rate": 0.0064862776407923406, "loss": 2.587, "step": 11471 }, { "crossentropy": 2.645594596862793, "epoch": 0.41589327146171695, "grad_norm": 0.028951799497008324, "grad_norm_var": 4.63749241851463e-06, "learning_rate": 0.006485722809384, "loss": 2.6225, "step": 11472 }, { "crossentropy": 2.5690224170684814, "epoch": 0.41592952436194897, "grad_norm": 0.028205301612615585, "grad_norm_var": 4.066622435377511e-06, "learning_rate": 0.006485167957908719, "loss": 2.6336, "step": 11473 }, { "crossentropy": 2.616623640060425, "epoch": 0.415965777262181, "grad_norm": 0.03227712959051132, "grad_norm_var": 4.58574571659807e-06, "learning_rate": 0.006484613086373991, "loss": 2.4874, "step": 11474 }, { "crossentropy": 2.549349546432495, "epoch": 0.416002030162413, "grad_norm": 0.033435165882110596, "grad_norm_var": 5.357112812495041e-06, "learning_rate": 0.00648405819478731, "loss": 2.6056, "step": 11475 }, { "crossentropy": 2.694143295288086, "epoch": 0.416038283062645, "grad_norm": 0.03283834457397461, "grad_norm_var": 5.241145933373471e-06, "learning_rate": 0.006483503283156171, "loss": 2.611, "step": 11476 }, { "crossentropy": 2.6160764694213867, "epoch": 0.416074535962877, "grad_norm": 0.03131161257624626, "grad_norm_var": 5.340906183388949e-06, "learning_rate": 0.006482948351488068, "loss": 2.6822, "step": 11477 }, { "crossentropy": 2.453197956085205, "epoch": 0.41611078886310904, "grad_norm": 0.028255093842744827, "grad_norm_var": 5.448136817245855e-06, "learning_rate": 0.006482393399790498, "loss": 2.5096, "step": 11478 }, { "crossentropy": 2.5601139068603516, "epoch": 0.41614704176334105, "grad_norm": 0.026768935844302177, "grad_norm_var": 5.8528516753349275e-06, "learning_rate": 0.006481838428070956, "loss": 2.5759, "step": 11479 }, { "crossentropy": 2.394775390625, "epoch": 0.41618329466357307, "grad_norm": 0.027360154315829277, "grad_norm_var": 5.788123474207785e-06, "learning_rate": 0.006481283436336935, "loss": 2.4645, "step": 11480 }, { "crossentropy": 2.6923561096191406, "epoch": 0.4162195475638051, "grad_norm": 0.028641028329730034, "grad_norm_var": 5.568734893895283e-06, "learning_rate": 0.006480728424595936, "loss": 2.6704, "step": 11481 }, { "crossentropy": 2.5747435092926025, "epoch": 0.4162558004640371, "grad_norm": 0.030862057581543922, "grad_norm_var": 5.313791793668194e-06, "learning_rate": 0.006480173392855451, "loss": 2.5626, "step": 11482 }, { "crossentropy": 2.492393970489502, "epoch": 0.41629205336426917, "grad_norm": 0.03071403130888939, "grad_norm_var": 5.339354427652022e-06, "learning_rate": 0.006479618341122979, "loss": 2.5082, "step": 11483 }, { "crossentropy": 2.639801263809204, "epoch": 0.4163283062645012, "grad_norm": 0.02933925949037075, "grad_norm_var": 4.7775344437046926e-06, "learning_rate": 0.006479063269406016, "loss": 2.6377, "step": 11484 }, { "crossentropy": 2.6456189155578613, "epoch": 0.4163645591647332, "grad_norm": 0.027456175535917282, "grad_norm_var": 5.043574151293454e-06, "learning_rate": 0.006478508177712057, "loss": 2.638, "step": 11485 }, { "crossentropy": 2.5573923587799072, "epoch": 0.4164008120649652, "grad_norm": 0.028117500245571136, "grad_norm_var": 5.113007507985545e-06, "learning_rate": 0.006477953066048603, "loss": 2.5628, "step": 11486 }, { "crossentropy": 2.5971591472625732, "epoch": 0.4164370649651972, "grad_norm": 0.026494547724723816, "grad_norm_var": 4.822910119337865e-06, "learning_rate": 0.00647739793442315, "loss": 2.5096, "step": 11487 }, { "crossentropy": 2.621535539627075, "epoch": 0.41647331786542924, "grad_norm": 0.028587687760591507, "grad_norm_var": 4.854861482583141e-06, "learning_rate": 0.006476842782843196, "loss": 2.5193, "step": 11488 }, { "crossentropy": 2.577071189880371, "epoch": 0.41650957076566125, "grad_norm": 0.03235620632767677, "grad_norm_var": 5.261393651579182e-06, "learning_rate": 0.006476287611316237, "loss": 2.6517, "step": 11489 }, { "crossentropy": 2.535048246383667, "epoch": 0.41654582366589327, "grad_norm": 0.031179333105683327, "grad_norm_var": 4.955971402067066e-06, "learning_rate": 0.006475732419849776, "loss": 2.5626, "step": 11490 }, { "crossentropy": 2.678408622741699, "epoch": 0.4165820765661253, "grad_norm": 0.027024047449231148, "grad_norm_var": 4.252771540643125e-06, "learning_rate": 0.006475177208451309, "loss": 2.6016, "step": 11491 }, { "crossentropy": 2.6973512172698975, "epoch": 0.4166183294663573, "grad_norm": 0.02847849763929844, "grad_norm_var": 3.3296231763325914e-06, "learning_rate": 0.006474621977128333, "loss": 2.5962, "step": 11492 }, { "crossentropy": 2.6263890266418457, "epoch": 0.4166545823665893, "grad_norm": 0.026141708716750145, "grad_norm_var": 3.36127362092115e-06, "learning_rate": 0.00647406672588835, "loss": 2.5427, "step": 11493 }, { "crossentropy": 2.6535370349884033, "epoch": 0.4166908352668213, "grad_norm": 0.026965729892253876, "grad_norm_var": 3.526365858776007e-06, "learning_rate": 0.006473511454738861, "loss": 2.6081, "step": 11494 }, { "crossentropy": 2.5356364250183105, "epoch": 0.41672708816705334, "grad_norm": 0.027357084676623344, "grad_norm_var": 3.4098495976527987e-06, "learning_rate": 0.0064729561636873625, "loss": 2.6023, "step": 11495 }, { "crossentropy": 2.5430500507354736, "epoch": 0.4167633410672854, "grad_norm": 0.02873752824962139, "grad_norm_var": 3.3067499845046047e-06, "learning_rate": 0.0064724008527413555, "loss": 2.5443, "step": 11496 }, { "crossentropy": 2.6299784183502197, "epoch": 0.4167995939675174, "grad_norm": 0.026990210637450218, "grad_norm_var": 3.479770858624354e-06, "learning_rate": 0.00647184552190834, "loss": 2.5913, "step": 11497 }, { "crossentropy": 2.6302452087402344, "epoch": 0.41683584686774944, "grad_norm": 0.027902398258447647, "grad_norm_var": 3.1148973970706686e-06, "learning_rate": 0.006471290171195817, "loss": 2.6726, "step": 11498 }, { "crossentropy": 2.516087770462036, "epoch": 0.41687209976798145, "grad_norm": 0.0280094426125288, "grad_norm_var": 2.7250278104269288e-06, "learning_rate": 0.006470734800611289, "loss": 2.6049, "step": 11499 }, { "crossentropy": 2.608891487121582, "epoch": 0.41690835266821347, "grad_norm": 0.02832944318652153, "grad_norm_var": 2.6348413583130816e-06, "learning_rate": 0.006470179410162254, "loss": 2.5984, "step": 11500 }, { "crossentropy": 2.7301740646362305, "epoch": 0.4169446055684455, "grad_norm": 0.02629823237657547, "grad_norm_var": 2.8231355292969326e-06, "learning_rate": 0.006469623999856216, "loss": 2.746, "step": 11501 }, { "crossentropy": 2.6341660022735596, "epoch": 0.4169808584686775, "grad_norm": 0.028006255626678467, "grad_norm_var": 2.823065009082265e-06, "learning_rate": 0.006469068569700674, "loss": 2.6118, "step": 11502 }, { "crossentropy": 2.523756504058838, "epoch": 0.4170171113689095, "grad_norm": 0.027425192296504974, "grad_norm_var": 2.6837338954879253e-06, "learning_rate": 0.006468513119703134, "loss": 2.6029, "step": 11503 }, { "crossentropy": 2.558303117752075, "epoch": 0.4170533642691415, "grad_norm": 0.026786206290125847, "grad_norm_var": 2.7722632837922825e-06, "learning_rate": 0.006467957649871096, "loss": 2.5941, "step": 11504 }, { "crossentropy": 2.5642037391662598, "epoch": 0.41708961716937354, "grad_norm": 0.026753468438982964, "grad_norm_var": 1.4793731664653845e-06, "learning_rate": 0.0064674021602120595, "loss": 2.5665, "step": 11505 }, { "crossentropy": 2.5986428260803223, "epoch": 0.41712587006960555, "grad_norm": 0.027267009019851685, "grad_norm_var": 5.944667645669712e-07, "learning_rate": 0.006466846650733531, "loss": 2.5529, "step": 11506 }, { "crossentropy": 2.7565789222717285, "epoch": 0.41716212296983757, "grad_norm": 0.026755578815937042, "grad_norm_var": 6.125911064524513e-07, "learning_rate": 0.006466291121443014, "loss": 2.6946, "step": 11507 }, { "crossentropy": 2.5172290802001953, "epoch": 0.4171983758700696, "grad_norm": 0.02794908545911312, "grad_norm_var": 5.531143650259104e-07, "learning_rate": 0.0064657355723480095, "loss": 2.5402, "step": 11508 }, { "crossentropy": 2.536217212677002, "epoch": 0.4172346287703016, "grad_norm": 0.027459386736154556, "grad_norm_var": 4.4852751611734683e-07, "learning_rate": 0.00646518000345602, "loss": 2.6601, "step": 11509 }, { "crossentropy": 2.5054430961608887, "epoch": 0.41727088167053367, "grad_norm": 0.026654712855815887, "grad_norm_var": 4.741169665185927e-07, "learning_rate": 0.006464624414774553, "loss": 2.559, "step": 11510 }, { "crossentropy": 2.5858001708984375, "epoch": 0.4173071345707657, "grad_norm": 0.026917342096567154, "grad_norm_var": 4.897496322384468e-07, "learning_rate": 0.006464068806311111, "loss": 2.5795, "step": 11511 }, { "crossentropy": 2.5901846885681152, "epoch": 0.4173433874709977, "grad_norm": 0.02689928561449051, "grad_norm_var": 3.706906507478045e-07, "learning_rate": 0.006463513178073197, "loss": 2.5289, "step": 11512 }, { "crossentropy": 2.5284814834594727, "epoch": 0.4173796403712297, "grad_norm": 0.025743165984749794, "grad_norm_var": 5.152721246435282e-07, "learning_rate": 0.006462957530068315, "loss": 2.5159, "step": 11513 }, { "crossentropy": 2.5194196701049805, "epoch": 0.4174158932714617, "grad_norm": 0.027871156111359596, "grad_norm_var": 5.123958032598093e-07, "learning_rate": 0.006462401862303973, "loss": 2.666, "step": 11514 }, { "crossentropy": 2.685863971710205, "epoch": 0.41745214617169374, "grad_norm": 0.026243967935442924, "grad_norm_var": 5.15558089129083e-07, "learning_rate": 0.006461846174787676, "loss": 2.6414, "step": 11515 }, { "crossentropy": 2.7484376430511475, "epoch": 0.41748839907192575, "grad_norm": 0.028339635580778122, "grad_norm_var": 5.172558060960915e-07, "learning_rate": 0.006461290467526927, "loss": 2.7338, "step": 11516 }, { "crossentropy": 2.6723520755767822, "epoch": 0.41752465197215777, "grad_norm": 0.026489945128560066, "grad_norm_var": 4.994263323245072e-07, "learning_rate": 0.006460734740529231, "loss": 2.6011, "step": 11517 }, { "crossentropy": 2.6112217903137207, "epoch": 0.4175609048723898, "grad_norm": 0.027127213776111603, "grad_norm_var": 4.412199743642712e-07, "learning_rate": 0.006460178993802096, "loss": 2.6021, "step": 11518 }, { "crossentropy": 2.6580944061279297, "epoch": 0.4175971577726218, "grad_norm": 0.028482578694820404, "grad_norm_var": 5.650321902699358e-07, "learning_rate": 0.006459623227353029, "loss": 2.678, "step": 11519 }, { "crossentropy": 2.6757266521453857, "epoch": 0.4176334106728538, "grad_norm": 0.028899027034640312, "grad_norm_var": 7.531739069294481e-07, "learning_rate": 0.006459067441189535, "loss": 2.6609, "step": 11520 }, { "crossentropy": 2.8063366413116455, "epoch": 0.4176696635730858, "grad_norm": 0.026767956092953682, "grad_norm_var": 7.52245682111953e-07, "learning_rate": 0.006458511635319121, "loss": 2.652, "step": 11521 }, { "crossentropy": 2.557023048400879, "epoch": 0.41770591647331784, "grad_norm": 0.027052879333496094, "grad_norm_var": 7.543885408149333e-07, "learning_rate": 0.006457955809749294, "loss": 2.5766, "step": 11522 }, { "crossentropy": 2.6603946685791016, "epoch": 0.4177421693735499, "grad_norm": 0.02780028060078621, "grad_norm_var": 7.567531117862499e-07, "learning_rate": 0.00645739996448756, "loss": 2.6318, "step": 11523 }, { "crossentropy": 2.58894944190979, "epoch": 0.4177784222737819, "grad_norm": 0.027904588729143143, "grad_norm_var": 7.529879383511943e-07, "learning_rate": 0.00645684409954143, "loss": 2.6353, "step": 11524 }, { "crossentropy": 2.472867012023926, "epoch": 0.41781467517401394, "grad_norm": 0.027309555560350418, "grad_norm_var": 7.510234872540322e-07, "learning_rate": 0.006456288214918409, "loss": 2.5409, "step": 11525 }, { "crossentropy": 2.6750593185424805, "epoch": 0.41785092807424595, "grad_norm": 0.026927215978503227, "grad_norm_var": 7.328926917833743e-07, "learning_rate": 0.006455732310626005, "loss": 2.5731, "step": 11526 }, { "crossentropy": 2.651094913482666, "epoch": 0.41788718097447797, "grad_norm": 0.028469005599617958, "grad_norm_var": 8.045169045187498e-07, "learning_rate": 0.006455176386671726, "loss": 2.5751, "step": 11527 }, { "crossentropy": 2.5137109756469727, "epoch": 0.41792343387471, "grad_norm": 0.02960006147623062, "grad_norm_var": 1.0817274881946259e-06, "learning_rate": 0.006454620443063084, "loss": 2.4903, "step": 11528 }, { "crossentropy": 2.5391106605529785, "epoch": 0.417959686774942, "grad_norm": 0.028260579332709312, "grad_norm_var": 8.665520490093448e-07, "learning_rate": 0.006454064479807583, "loss": 2.5385, "step": 11529 }, { "crossentropy": 2.682504415512085, "epoch": 0.417995939675174, "grad_norm": 0.027206066995859146, "grad_norm_var": 8.809363611241335e-07, "learning_rate": 0.006453508496912737, "loss": 2.6388, "step": 11530 }, { "crossentropy": 2.534438371658325, "epoch": 0.418032192575406, "grad_norm": 0.02700824849307537, "grad_norm_var": 7.711030880712718e-07, "learning_rate": 0.006452952494386051, "loss": 2.6755, "step": 11531 }, { "crossentropy": 2.5715174674987793, "epoch": 0.41806844547563804, "grad_norm": 0.02726668305695057, "grad_norm_var": 7.555257223084792e-07, "learning_rate": 0.006452396472235038, "loss": 2.5936, "step": 11532 }, { "crossentropy": 2.4943368434906006, "epoch": 0.41810469837587005, "grad_norm": 0.03186614066362381, "grad_norm_var": 1.7227347805501136e-06, "learning_rate": 0.006451840430467205, "loss": 2.5281, "step": 11533 }, { "crossentropy": 2.6017026901245117, "epoch": 0.41814095127610207, "grad_norm": 0.03066137619316578, "grad_norm_var": 2.093632083005383e-06, "learning_rate": 0.006451284369090065, "loss": 2.6406, "step": 11534 }, { "crossentropy": 2.5473201274871826, "epoch": 0.4181772041763341, "grad_norm": 0.027436086907982826, "grad_norm_var": 2.125111193616496e-06, "learning_rate": 0.006450728288111127, "loss": 2.5687, "step": 11535 }, { "crossentropy": 2.5589354038238525, "epoch": 0.41821345707656615, "grad_norm": 0.02758147194981575, "grad_norm_var": 2.1024161165755784e-06, "learning_rate": 0.006450172187537902, "loss": 2.633, "step": 11536 }, { "crossentropy": 2.6924219131469727, "epoch": 0.41824970997679817, "grad_norm": 0.027338264510035515, "grad_norm_var": 2.0237440363450485e-06, "learning_rate": 0.0064496160673779, "loss": 2.6097, "step": 11537 }, { "crossentropy": 2.3780717849731445, "epoch": 0.4182859628770302, "grad_norm": 0.026290757581591606, "grad_norm_var": 2.1670124413509697e-06, "learning_rate": 0.006449059927638634, "loss": 2.5751, "step": 11538 }, { "crossentropy": 2.5710127353668213, "epoch": 0.4183222157772622, "grad_norm": 0.02782641537487507, "grad_norm_var": 2.1661574240941043e-06, "learning_rate": 0.006448503768327615, "loss": 2.6497, "step": 11539 }, { "crossentropy": 2.640814781188965, "epoch": 0.4183584686774942, "grad_norm": 0.02988431788980961, "grad_norm_var": 2.3702158620913003e-06, "learning_rate": 0.006447947589452353, "loss": 2.6951, "step": 11540 }, { "crossentropy": 2.686573028564453, "epoch": 0.4183947215777262, "grad_norm": 0.031088106334209442, "grad_norm_var": 2.8223752652494256e-06, "learning_rate": 0.006447391391020363, "loss": 2.68, "step": 11541 }, { "crossentropy": 2.5166518688201904, "epoch": 0.41843097447795824, "grad_norm": 0.032740216702222824, "grad_norm_var": 3.777749817876023e-06, "learning_rate": 0.006446835173039155, "loss": 2.4901, "step": 11542 }, { "crossentropy": 2.5983071327209473, "epoch": 0.41846722737819025, "grad_norm": 0.030090399086475372, "grad_norm_var": 3.874232751684585e-06, "learning_rate": 0.006446278935516242, "loss": 2.6398, "step": 11543 }, { "crossentropy": 2.610603094100952, "epoch": 0.41850348027842227, "grad_norm": 0.028361456468701363, "grad_norm_var": 3.851873314595031e-06, "learning_rate": 0.006445722678459138, "loss": 2.5857, "step": 11544 }, { "crossentropy": 2.769740104675293, "epoch": 0.4185397331786543, "grad_norm": 0.027871528640389442, "grad_norm_var": 3.889660506349995e-06, "learning_rate": 0.0064451664018753555, "loss": 2.7327, "step": 11545 }, { "crossentropy": 2.6098785400390625, "epoch": 0.4185759860788863, "grad_norm": 0.027177909389138222, "grad_norm_var": 3.895627958962648e-06, "learning_rate": 0.006444610105772406, "loss": 2.5945, "step": 11546 }, { "crossentropy": 2.7259228229522705, "epoch": 0.4186122389791183, "grad_norm": 0.02735702507197857, "grad_norm_var": 3.820810793437217e-06, "learning_rate": 0.006444053790157806, "loss": 2.5768, "step": 11547 }, { "crossentropy": 2.568953275680542, "epoch": 0.4186484918793503, "grad_norm": 0.026648998260498047, "grad_norm_var": 3.971133977608935e-06, "learning_rate": 0.006443497455039068, "loss": 2.6425, "step": 11548 }, { "crossentropy": 2.625777006149292, "epoch": 0.41868474477958234, "grad_norm": 0.0277845561504364, "grad_norm_var": 3.324002299104576e-06, "learning_rate": 0.006442941100423706, "loss": 2.6629, "step": 11549 }, { "crossentropy": 2.5231077671051025, "epoch": 0.4187209976798144, "grad_norm": 0.03143174201250076, "grad_norm_var": 3.582208867995692e-06, "learning_rate": 0.006442384726319235, "loss": 2.5332, "step": 11550 }, { "crossentropy": 2.6767687797546387, "epoch": 0.4187572505800464, "grad_norm": 0.030899440869688988, "grad_norm_var": 3.814348621033432e-06, "learning_rate": 0.006441828332733169, "loss": 2.6656, "step": 11551 }, { "crossentropy": 2.448814630508423, "epoch": 0.41879350348027844, "grad_norm": 0.028726618736982346, "grad_norm_var": 3.7143348006541163e-06, "learning_rate": 0.006441271919673024, "loss": 2.4822, "step": 11552 }, { "crossentropy": 2.621633529663086, "epoch": 0.41882975638051045, "grad_norm": 0.026882875710725784, "grad_norm_var": 3.818774187462857e-06, "learning_rate": 0.006440715487146313, "loss": 2.5401, "step": 11553 }, { "crossentropy": 2.553056001663208, "epoch": 0.41886600928074247, "grad_norm": 0.026511672884225845, "grad_norm_var": 3.747430735177382e-06, "learning_rate": 0.006440159035160556, "loss": 2.5482, "step": 11554 }, { "crossentropy": 2.7069218158721924, "epoch": 0.4189022621809745, "grad_norm": 0.026399346068501472, "grad_norm_var": 4.065710637417503e-06, "learning_rate": 0.0064396025637232625, "loss": 2.6831, "step": 11555 }, { "crossentropy": 2.693659782409668, "epoch": 0.4189385150812065, "grad_norm": 0.026200389489531517, "grad_norm_var": 4.352338288786496e-06, "learning_rate": 0.006439046072841953, "loss": 2.6286, "step": 11556 }, { "crossentropy": 2.629991292953491, "epoch": 0.4189747679814385, "grad_norm": 0.029354263097047806, "grad_norm_var": 3.9443997214101774e-06, "learning_rate": 0.006438489562524142, "loss": 2.6357, "step": 11557 }, { "crossentropy": 2.5573618412017822, "epoch": 0.4190110208816705, "grad_norm": 0.02801746129989624, "grad_norm_var": 2.6069011020242535e-06, "learning_rate": 0.0064379330327773475, "loss": 2.5278, "step": 11558 }, { "crossentropy": 2.510355234146118, "epoch": 0.41904727378190254, "grad_norm": 0.028489897027611732, "grad_norm_var": 2.3437927452569023e-06, "learning_rate": 0.006437376483609085, "loss": 2.5607, "step": 11559 }, { "crossentropy": 2.6846086978912354, "epoch": 0.41908352668213456, "grad_norm": 0.02761702612042427, "grad_norm_var": 2.3432660936068777e-06, "learning_rate": 0.006436819915026871, "loss": 2.5936, "step": 11560 }, { "crossentropy": 2.5306968688964844, "epoch": 0.41911977958236657, "grad_norm": 0.026361795142292976, "grad_norm_var": 2.503666393237911e-06, "learning_rate": 0.006436263327038224, "loss": 2.5648, "step": 11561 }, { "crossentropy": 2.662611484527588, "epoch": 0.4191560324825986, "grad_norm": 0.027826931327581406, "grad_norm_var": 2.470421312261023e-06, "learning_rate": 0.006435706719650661, "loss": 2.6813, "step": 11562 }, { "crossentropy": 2.7053072452545166, "epoch": 0.41919228538283065, "grad_norm": 0.025920774787664413, "grad_norm_var": 2.7046439942826715e-06, "learning_rate": 0.0064351500928717, "loss": 2.5033, "step": 11563 }, { "crossentropy": 2.6560921669006348, "epoch": 0.41922853828306267, "grad_norm": 0.027043942362070084, "grad_norm_var": 2.6528808560631697e-06, "learning_rate": 0.00643459344670886, "loss": 2.5917, "step": 11564 }, { "crossentropy": 2.531125068664551, "epoch": 0.4192647911832947, "grad_norm": 0.02841220051050186, "grad_norm_var": 2.6727117923017964e-06, "learning_rate": 0.006434036781169657, "loss": 2.4976, "step": 11565 }, { "crossentropy": 2.4447429180145264, "epoch": 0.4193010440835267, "grad_norm": 0.027509529143571854, "grad_norm_var": 1.7773066056064271e-06, "learning_rate": 0.0064334800962616125, "loss": 2.5293, "step": 11566 }, { "crossentropy": 2.717210292816162, "epoch": 0.4193372969837587, "grad_norm": 0.02898070588707924, "grad_norm_var": 1.1724833326712606e-06, "learning_rate": 0.006432923391992243, "loss": 2.5917, "step": 11567 }, { "crossentropy": 2.5959577560424805, "epoch": 0.4193735498839907, "grad_norm": 0.03147967532277107, "grad_norm_var": 2.0905908724804776e-06, "learning_rate": 0.0064323666683690685, "loss": 2.5875, "step": 11568 }, { "crossentropy": 2.552081823348999, "epoch": 0.41940980278422274, "grad_norm": 0.03094322420656681, "grad_norm_var": 2.6850982649018283e-06, "learning_rate": 0.006431809925399609, "loss": 2.5644, "step": 11569 }, { "crossentropy": 2.6497445106506348, "epoch": 0.41944605568445475, "grad_norm": 0.02980049140751362, "grad_norm_var": 2.7339939629160427e-06, "learning_rate": 0.006431253163091383, "loss": 2.6258, "step": 11570 }, { "crossentropy": 2.42659330368042, "epoch": 0.41948230858468677, "grad_norm": 0.02719908021390438, "grad_norm_var": 2.587575236611718e-06, "learning_rate": 0.006430696381451911, "loss": 2.4915, "step": 11571 }, { "crossentropy": 2.587904691696167, "epoch": 0.4195185614849188, "grad_norm": 0.02702467516064644, "grad_norm_var": 2.410566668625694e-06, "learning_rate": 0.006430139580488714, "loss": 2.6118, "step": 11572 }, { "crossentropy": 2.4845316410064697, "epoch": 0.4195548143851508, "grad_norm": 0.027743199840188026, "grad_norm_var": 2.335335893445534e-06, "learning_rate": 0.006429582760209312, "loss": 2.5674, "step": 11573 }, { "crossentropy": 2.587735414505005, "epoch": 0.4195910672853828, "grad_norm": 0.03047606535255909, "grad_norm_var": 2.6702858947571466e-06, "learning_rate": 0.006429025920621223, "loss": 2.5342, "step": 11574 }, { "crossentropy": 2.5779316425323486, "epoch": 0.4196273201856148, "grad_norm": 0.0326976552605629, "grad_norm_var": 3.8823771276059555e-06, "learning_rate": 0.006428469061731972, "loss": 2.5759, "step": 11575 }, { "crossentropy": 2.487257242202759, "epoch": 0.41966357308584684, "grad_norm": 0.02857954055070877, "grad_norm_var": 3.818645069650293e-06, "learning_rate": 0.006427912183549079, "loss": 2.5457, "step": 11576 }, { "crossentropy": 2.578902244567871, "epoch": 0.4196998259860789, "grad_norm": 0.026876701042056084, "grad_norm_var": 3.679839440657214e-06, "learning_rate": 0.006427355286080064, "loss": 2.5597, "step": 11577 }, { "crossentropy": 2.6312575340270996, "epoch": 0.4197360788863109, "grad_norm": 0.02725786343216896, "grad_norm_var": 3.7630727353708813e-06, "learning_rate": 0.006426798369332451, "loss": 2.571, "step": 11578 }, { "crossentropy": 2.706817626953125, "epoch": 0.41977233178654294, "grad_norm": 0.026761775836348534, "grad_norm_var": 3.5044269275969713e-06, "learning_rate": 0.00642624143331376, "loss": 2.5938, "step": 11579 }, { "crossentropy": 2.7351632118225098, "epoch": 0.41980858468677495, "grad_norm": 0.034531544893980026, "grad_norm_var": 5.3809306424282366e-06, "learning_rate": 0.006425684478031513, "loss": 2.7615, "step": 11580 }, { "crossentropy": 2.641383171081543, "epoch": 0.41984483758700697, "grad_norm": 0.03259361535310745, "grad_norm_var": 6.066748583603943e-06, "learning_rate": 0.006425127503493236, "loss": 2.6526, "step": 11581 }, { "crossentropy": 2.625511646270752, "epoch": 0.419881090487239, "grad_norm": 0.02813280187547207, "grad_norm_var": 5.933636584024421e-06, "learning_rate": 0.0064245705097064456, "loss": 2.6251, "step": 11582 }, { "crossentropy": 2.6633238792419434, "epoch": 0.419917343387471, "grad_norm": 0.027854377403855324, "grad_norm_var": 6.082262996075907e-06, "learning_rate": 0.006424013496678672, "loss": 2.6441, "step": 11583 }, { "crossentropy": 2.626260757446289, "epoch": 0.419953596287703, "grad_norm": 0.028985029086470604, "grad_norm_var": 5.770168797701996e-06, "learning_rate": 0.006423456464417434, "loss": 2.6567, "step": 11584 }, { "crossentropy": 2.753333806991577, "epoch": 0.419989849187935, "grad_norm": 0.027649477124214172, "grad_norm_var": 5.689723327087605e-06, "learning_rate": 0.006422899412930256, "loss": 2.6985, "step": 11585 }, { "crossentropy": 2.6304867267608643, "epoch": 0.42002610208816704, "grad_norm": 0.030631832778453827, "grad_norm_var": 5.820514308960134e-06, "learning_rate": 0.006422342342224662, "loss": 2.7114, "step": 11586 }, { "crossentropy": 2.6315929889678955, "epoch": 0.42006235498839906, "grad_norm": 0.026023712009191513, "grad_norm_var": 6.1988380392411955e-06, "learning_rate": 0.006421785252308178, "loss": 2.5809, "step": 11587 }, { "crossentropy": 2.447802782058716, "epoch": 0.42009860788863107, "grad_norm": 0.02637510746717453, "grad_norm_var": 6.395315064506079e-06, "learning_rate": 0.006421228143188324, "loss": 2.556, "step": 11588 }, { "crossentropy": 2.420159101486206, "epoch": 0.4201348607888631, "grad_norm": 0.027759183198213577, "grad_norm_var": 6.392763158010311e-06, "learning_rate": 0.006420671014872628, "loss": 2.4699, "step": 11589 }, { "crossentropy": 2.57108998298645, "epoch": 0.42017111368909515, "grad_norm": 0.03203350305557251, "grad_norm_var": 6.8614421818736455e-06, "learning_rate": 0.006420113867368614, "loss": 2.6098, "step": 11590 }, { "crossentropy": 2.660372257232666, "epoch": 0.42020736658932717, "grad_norm": 0.03025749698281288, "grad_norm_var": 6.045665207602769e-06, "learning_rate": 0.006419556700683809, "loss": 2.5348, "step": 11591 }, { "crossentropy": 2.7061800956726074, "epoch": 0.4202436194895592, "grad_norm": 0.027275899425148964, "grad_norm_var": 6.20653693119283e-06, "learning_rate": 0.006418999514825734, "loss": 2.664, "step": 11592 }, { "crossentropy": 2.4532864093780518, "epoch": 0.4202798723897912, "grad_norm": 0.02664010226726532, "grad_norm_var": 6.271103149851432e-06, "learning_rate": 0.0064184423098019175, "loss": 2.5355, "step": 11593 }, { "crossentropy": 2.5755419731140137, "epoch": 0.4203161252900232, "grad_norm": 0.02650151401758194, "grad_norm_var": 6.46214521219505e-06, "learning_rate": 0.006417885085619885, "loss": 2.5818, "step": 11594 }, { "crossentropy": 2.635268449783325, "epoch": 0.4203523781902552, "grad_norm": 0.02661633864045143, "grad_norm_var": 6.50203056128066e-06, "learning_rate": 0.006417327842287165, "loss": 2.5904, "step": 11595 }, { "crossentropy": 2.590595245361328, "epoch": 0.42038863109048724, "grad_norm": 0.026951635256409645, "grad_norm_var": 4.241078663470135e-06, "learning_rate": 0.006416770579811279, "loss": 2.6388, "step": 11596 }, { "crossentropy": 2.521172046661377, "epoch": 0.42042488399071926, "grad_norm": 0.027892859652638435, "grad_norm_var": 2.9107431785315696e-06, "learning_rate": 0.006416213298199757, "loss": 2.6167, "step": 11597 }, { "crossentropy": 2.7537405490875244, "epoch": 0.42046113689095127, "grad_norm": 0.02720334567129612, "grad_norm_var": 2.945032070846935e-06, "learning_rate": 0.006415655997460126, "loss": 2.6979, "step": 11598 }, { "crossentropy": 2.468261480331421, "epoch": 0.4204973897911833, "grad_norm": 0.028408203274011612, "grad_norm_var": 2.9596730044019096e-06, "learning_rate": 0.006415098677599912, "loss": 2.5125, "step": 11599 }, { "crossentropy": 2.522268533706665, "epoch": 0.4205336426914153, "grad_norm": 0.028735831379890442, "grad_norm_var": 2.929174854456445e-06, "learning_rate": 0.0064145413386266425, "loss": 2.5928, "step": 11600 }, { "crossentropy": 2.5949549674987793, "epoch": 0.4205698955916473, "grad_norm": 0.028106071054935455, "grad_norm_var": 2.924837388160126e-06, "learning_rate": 0.006413983980547845, "loss": 2.6502, "step": 11601 }, { "crossentropy": 2.611483573913574, "epoch": 0.42060614849187933, "grad_norm": 0.029931236058473587, "grad_norm_var": 2.706238291796716e-06, "learning_rate": 0.006413426603371048, "loss": 2.6351, "step": 11602 }, { "crossentropy": 2.6656055450439453, "epoch": 0.42064240139211134, "grad_norm": 0.03427864983677864, "grad_norm_var": 4.878620556125757e-06, "learning_rate": 0.006412869207103782, "loss": 2.6527, "step": 11603 }, { "crossentropy": 2.6199960708618164, "epoch": 0.4206786542923434, "grad_norm": 0.03032117336988449, "grad_norm_var": 4.767809649275367e-06, "learning_rate": 0.006412311791753572, "loss": 2.6393, "step": 11604 }, { "crossentropy": 2.6150307655334473, "epoch": 0.4207149071925754, "grad_norm": 0.026336710900068283, "grad_norm_var": 5.069310407783643e-06, "learning_rate": 0.006411754357327947, "loss": 2.5975, "step": 11605 }, { "crossentropy": 2.5530662536621094, "epoch": 0.42075116009280744, "grad_norm": 0.027625013142824173, "grad_norm_var": 4.26175574214273e-06, "learning_rate": 0.006411196903834438, "loss": 2.5542, "step": 11606 }, { "crossentropy": 2.6257662773132324, "epoch": 0.42078741299303946, "grad_norm": 0.026700451970100403, "grad_norm_var": 4.132515455922368e-06, "learning_rate": 0.006410639431280573, "loss": 2.5889, "step": 11607 }, { "crossentropy": 2.5693252086639404, "epoch": 0.42082366589327147, "grad_norm": 0.027079125866293907, "grad_norm_var": 4.156434014665883e-06, "learning_rate": 0.006410081939673883, "loss": 2.5976, "step": 11608 }, { "crossentropy": 2.5503251552581787, "epoch": 0.4208599187935035, "grad_norm": 0.028644897043704987, "grad_norm_var": 4.0219345841756175e-06, "learning_rate": 0.006409524429021894, "loss": 2.5872, "step": 11609 }, { "crossentropy": 2.7241227626800537, "epoch": 0.4208961716937355, "grad_norm": 0.0267590694129467, "grad_norm_var": 3.967467697675124e-06, "learning_rate": 0.00640896689933214, "loss": 2.5879, "step": 11610 }, { "crossentropy": 2.7163100242614746, "epoch": 0.4209324245939675, "grad_norm": 0.02634883113205433, "grad_norm_var": 4.029296486497677e-06, "learning_rate": 0.00640840935061215, "loss": 2.6488, "step": 11611 }, { "crossentropy": 2.601314067840576, "epoch": 0.42096867749419953, "grad_norm": 0.027866436168551445, "grad_norm_var": 3.928394448429031e-06, "learning_rate": 0.006407851782869455, "loss": 2.5273, "step": 11612 }, { "crossentropy": 2.6560628414154053, "epoch": 0.42100493039443154, "grad_norm": 0.02832864411175251, "grad_norm_var": 3.9186482450326155e-06, "learning_rate": 0.006407294196111584, "loss": 2.6411, "step": 11613 }, { "crossentropy": 2.545366048812866, "epoch": 0.42104118329466356, "grad_norm": 0.027227291837334633, "grad_norm_var": 3.915207866749661e-06, "learning_rate": 0.0064067365903460695, "loss": 2.5463, "step": 11614 }, { "crossentropy": 2.629669666290283, "epoch": 0.42107743619489557, "grad_norm": 0.027956325560808182, "grad_norm_var": 3.921065207432743e-06, "learning_rate": 0.006406178965580442, "loss": 2.6414, "step": 11615 }, { "crossentropy": 2.620208740234375, "epoch": 0.4211136890951276, "grad_norm": 0.02805975265800953, "grad_norm_var": 3.907222761426592e-06, "learning_rate": 0.006405621321822235, "loss": 2.5968, "step": 11616 }, { "crossentropy": 2.6202473640441895, "epoch": 0.42114994199535966, "grad_norm": 0.026215502992272377, "grad_norm_var": 4.160114656865859e-06, "learning_rate": 0.006405063659078978, "loss": 2.6403, "step": 11617 }, { "crossentropy": 2.602715492248535, "epoch": 0.42118619489559167, "grad_norm": 0.02851453796029091, "grad_norm_var": 3.9405804144177485e-06, "learning_rate": 0.0064045059773582035, "loss": 2.5637, "step": 11618 }, { "crossentropy": 2.6327717304229736, "epoch": 0.4212224477958237, "grad_norm": 0.030233323574066162, "grad_norm_var": 1.5856599237665565e-06, "learning_rate": 0.0064039482766674455, "loss": 2.5273, "step": 11619 }, { "crossentropy": 2.7389848232269287, "epoch": 0.4212587006960557, "grad_norm": 0.031257688999176025, "grad_norm_var": 1.959841258338775e-06, "learning_rate": 0.006403390557014236, "loss": 2.6082, "step": 11620 }, { "crossentropy": 2.7096362113952637, "epoch": 0.4212949535962877, "grad_norm": 0.031427863985300064, "grad_norm_var": 2.571518660775942e-06, "learning_rate": 0.006402832818406106, "loss": 2.6563, "step": 11621 }, { "crossentropy": 2.6137917041778564, "epoch": 0.42133120649651973, "grad_norm": 0.029950493946671486, "grad_norm_var": 2.7497388566552944e-06, "learning_rate": 0.00640227506085059, "loss": 2.6811, "step": 11622 }, { "crossentropy": 2.6934192180633545, "epoch": 0.42136745939675174, "grad_norm": 0.027479132637381554, "grad_norm_var": 2.6230546454971177e-06, "learning_rate": 0.006401717284355221, "loss": 2.6128, "step": 11623 }, { "crossentropy": 2.6591222286224365, "epoch": 0.42140371229698376, "grad_norm": 0.027056152001023293, "grad_norm_var": 2.626932482330243e-06, "learning_rate": 0.006401159488927533, "loss": 2.6975, "step": 11624 }, { "crossentropy": 2.684737205505371, "epoch": 0.42143996519721577, "grad_norm": 0.026769742369651794, "grad_norm_var": 2.768682477496592e-06, "learning_rate": 0.00640060167457506, "loss": 2.5997, "step": 11625 }, { "crossentropy": 2.8180878162384033, "epoch": 0.4214762180974478, "grad_norm": 0.027791613712906837, "grad_norm_var": 2.6347821628758723e-06, "learning_rate": 0.006400043841305336, "loss": 2.6724, "step": 11626 }, { "crossentropy": 2.683356523513794, "epoch": 0.4215124709976798, "grad_norm": 0.027649179100990295, "grad_norm_var": 2.4056020558696113e-06, "learning_rate": 0.006399485989125896, "loss": 2.6354, "step": 11627 }, { "crossentropy": 2.5058939456939697, "epoch": 0.4215487238979118, "grad_norm": 0.02716967649757862, "grad_norm_var": 2.481934402979409e-06, "learning_rate": 0.006398928118044274, "loss": 2.5374, "step": 11628 }, { "crossentropy": 2.6285481452941895, "epoch": 0.42158497679814383, "grad_norm": 0.028255904093384743, "grad_norm_var": 2.4821612101724587e-06, "learning_rate": 0.006398370228068003, "loss": 2.6457, "step": 11629 }, { "crossentropy": 2.6539113521575928, "epoch": 0.42162122969837584, "grad_norm": 0.026252195239067078, "grad_norm_var": 2.6827933241183507e-06, "learning_rate": 0.006397812319204621, "loss": 2.6641, "step": 11630 }, { "crossentropy": 2.4662833213806152, "epoch": 0.4216574825986079, "grad_norm": 0.0279132928699255, "grad_norm_var": 2.684608092137413e-06, "learning_rate": 0.006397254391461662, "loss": 2.4511, "step": 11631 }, { "crossentropy": 2.6025123596191406, "epoch": 0.42169373549883993, "grad_norm": 0.02643374726176262, "grad_norm_var": 2.8910437126161124e-06, "learning_rate": 0.006396696444846663, "loss": 2.5622, "step": 11632 }, { "crossentropy": 2.472459554672241, "epoch": 0.42172998839907194, "grad_norm": 0.03175431489944458, "grad_norm_var": 3.3811865609144892e-06, "learning_rate": 0.006396138479367159, "loss": 2.5107, "step": 11633 }, { "crossentropy": 2.5244855880737305, "epoch": 0.42176624129930396, "grad_norm": 0.028053348883986473, "grad_norm_var": 3.393235778331669e-06, "learning_rate": 0.006395580495030685, "loss": 2.5322, "step": 11634 }, { "crossentropy": 2.6125481128692627, "epoch": 0.42180249419953597, "grad_norm": 0.02666383422911167, "grad_norm_var": 3.34819069601927e-06, "learning_rate": 0.0063950224918447795, "loss": 2.6245, "step": 11635 }, { "crossentropy": 2.6152939796447754, "epoch": 0.421838747099768, "grad_norm": 0.028709303587675095, "grad_norm_var": 2.729528669345018e-06, "learning_rate": 0.006394464469816978, "loss": 2.6379, "step": 11636 }, { "crossentropy": 2.5920112133026123, "epoch": 0.421875, "grad_norm": 0.02736881747841835, "grad_norm_var": 1.949069425593895e-06, "learning_rate": 0.0063939064289548165, "loss": 2.5287, "step": 11637 }, { "crossentropy": 2.6820790767669678, "epoch": 0.421911252900232, "grad_norm": 0.026798568665981293, "grad_norm_var": 1.6785891972690795e-06, "learning_rate": 0.006393348369265834, "loss": 2.6446, "step": 11638 }, { "crossentropy": 2.554286479949951, "epoch": 0.42194750580046403, "grad_norm": 0.02760808914899826, "grad_norm_var": 1.6769927935739489e-06, "learning_rate": 0.006392790290757569, "loss": 2.6071, "step": 11639 }, { "crossentropy": 2.7113959789276123, "epoch": 0.42198375870069604, "grad_norm": 0.027356691658496857, "grad_norm_var": 1.659222633492811e-06, "learning_rate": 0.006392232193437555, "loss": 2.6413, "step": 11640 }, { "crossentropy": 2.6054978370666504, "epoch": 0.42202001160092806, "grad_norm": 0.026947397738695145, "grad_norm_var": 1.6401246420660298e-06, "learning_rate": 0.0063916740773133345, "loss": 2.6124, "step": 11641 }, { "crossentropy": 2.523189067840576, "epoch": 0.42205626450116007, "grad_norm": 0.02767089754343033, "grad_norm_var": 1.6390839951800485e-06, "learning_rate": 0.006391115942392443, "loss": 2.5372, "step": 11642 }, { "crossentropy": 2.7077860832214355, "epoch": 0.4220925174013921, "grad_norm": 0.02849579229950905, "grad_norm_var": 1.682340325805442e-06, "learning_rate": 0.006390557788682419, "loss": 2.6217, "step": 11643 }, { "crossentropy": 2.5088367462158203, "epoch": 0.42212877030162416, "grad_norm": 0.02755504660308361, "grad_norm_var": 1.6635638985166515e-06, "learning_rate": 0.006389999616190801, "loss": 2.5241, "step": 11644 }, { "crossentropy": 2.7299649715423584, "epoch": 0.42216502320185617, "grad_norm": 0.02669578231871128, "grad_norm_var": 1.7083353633544668e-06, "learning_rate": 0.006389441424925131, "loss": 2.6807, "step": 11645 }, { "crossentropy": 2.4499943256378174, "epoch": 0.4222012761020882, "grad_norm": 0.02778085321187973, "grad_norm_var": 1.5710483512552566e-06, "learning_rate": 0.006388883214892944, "loss": 2.5541, "step": 11646 }, { "crossentropy": 2.658132314682007, "epoch": 0.4222375290023202, "grad_norm": 0.027038494125008583, "grad_norm_var": 1.5984155775509383e-06, "learning_rate": 0.0063883249861017835, "loss": 2.6259, "step": 11647 }, { "crossentropy": 2.5980565547943115, "epoch": 0.4222737819025522, "grad_norm": 0.027787182480096817, "grad_norm_var": 1.4874309647783333e-06, "learning_rate": 0.006387766738559186, "loss": 2.6312, "step": 11648 }, { "crossentropy": 2.516669273376465, "epoch": 0.42231003480278423, "grad_norm": 0.030219189822673798, "grad_norm_var": 8.187409017301143e-07, "learning_rate": 0.0063872084722726916, "loss": 2.4938, "step": 11649 }, { "crossentropy": 2.474169969558716, "epoch": 0.42234628770301624, "grad_norm": 0.02998109720647335, "grad_norm_var": 1.1490670791906246e-06, "learning_rate": 0.006386650187249843, "loss": 2.4531, "step": 11650 }, { "crossentropy": 2.5086276531219482, "epoch": 0.42238254060324826, "grad_norm": 0.02905222587287426, "grad_norm_var": 1.146225826115218e-06, "learning_rate": 0.00638609188349818, "loss": 2.5583, "step": 11651 }, { "crossentropy": 2.5420894622802734, "epoch": 0.42241879350348027, "grad_norm": 0.027791807428002357, "grad_norm_var": 1.1049216289032068e-06, "learning_rate": 0.006385533561025241, "loss": 2.5672, "step": 11652 }, { "crossentropy": 2.589548110961914, "epoch": 0.4224550464037123, "grad_norm": 0.02800840139389038, "grad_norm_var": 1.086533726810055e-06, "learning_rate": 0.006384975219838569, "loss": 2.5988, "step": 11653 }, { "crossentropy": 2.545825719833374, "epoch": 0.4224912993039443, "grad_norm": 0.02934465929865837, "grad_norm_var": 1.1095601648779726e-06, "learning_rate": 0.006384416859945705, "loss": 2.5802, "step": 11654 }, { "crossentropy": 2.504730224609375, "epoch": 0.4225275522041763, "grad_norm": 0.02797618694603443, "grad_norm_var": 1.0947029761684446e-06, "learning_rate": 0.006383858481354191, "loss": 2.5359, "step": 11655 }, { "crossentropy": 2.6091396808624268, "epoch": 0.42256380510440833, "grad_norm": 0.02827521227300167, "grad_norm_var": 1.0556219539305246e-06, "learning_rate": 0.006383300084071568, "loss": 2.5927, "step": 11656 }, { "crossentropy": 2.6595962047576904, "epoch": 0.42260005800464034, "grad_norm": 0.02990906313061714, "grad_norm_var": 1.1235089443874246e-06, "learning_rate": 0.006382741668105377, "loss": 2.617, "step": 11657 }, { "crossentropy": 2.57808518409729, "epoch": 0.4226363109048724, "grad_norm": 0.030397538095712662, "grad_norm_var": 1.3416918330293066e-06, "learning_rate": 0.006382183233463163, "loss": 2.6487, "step": 11658 }, { "crossentropy": 2.6093826293945312, "epoch": 0.42267256380510443, "grad_norm": 0.02832375094294548, "grad_norm_var": 1.3440805783600125e-06, "learning_rate": 0.006381624780152466, "loss": 2.5815, "step": 11659 }, { "crossentropy": 2.6420600414276123, "epoch": 0.42270881670533644, "grad_norm": 0.027447637170553207, "grad_norm_var": 1.358456718889113e-06, "learning_rate": 0.006381066308180829, "loss": 2.5544, "step": 11660 }, { "crossentropy": 2.648649215698242, "epoch": 0.42274506960556846, "grad_norm": 0.028991172090172768, "grad_norm_var": 1.135016949455854e-06, "learning_rate": 0.006380507817555797, "loss": 2.6141, "step": 11661 }, { "crossentropy": 2.4023947715759277, "epoch": 0.42278132250580047, "grad_norm": 0.027304796501994133, "grad_norm_var": 1.2040501131543868e-06, "learning_rate": 0.006379949308284912, "loss": 2.4348, "step": 11662 }, { "crossentropy": 2.6107687950134277, "epoch": 0.4228175754060325, "grad_norm": 0.026061594486236572, "grad_norm_var": 1.469109492261431e-06, "learning_rate": 0.006379390780375717, "loss": 2.6038, "step": 11663 }, { "crossentropy": 2.534109592437744, "epoch": 0.4228538283062645, "grad_norm": 0.028408337384462357, "grad_norm_var": 1.4296768517301026e-06, "learning_rate": 0.006378832233835756, "loss": 2.4264, "step": 11664 }, { "crossentropy": 2.629638910293579, "epoch": 0.4228900812064965, "grad_norm": 0.029828229919075966, "grad_norm_var": 1.3544751674456068e-06, "learning_rate": 0.0063782736686725724, "loss": 2.6837, "step": 11665 }, { "crossentropy": 2.520578622817993, "epoch": 0.42292633410672853, "grad_norm": 0.029312334954738617, "grad_norm_var": 1.2565007826882776e-06, "learning_rate": 0.006377715084893712, "loss": 2.5999, "step": 11666 }, { "crossentropy": 2.5316221714019775, "epoch": 0.42296258700696054, "grad_norm": 0.02775835618376732, "grad_norm_var": 1.2705323314231298e-06, "learning_rate": 0.00637715648250672, "loss": 2.6194, "step": 11667 }, { "crossentropy": 2.6753005981445312, "epoch": 0.42299883990719256, "grad_norm": 0.026881294324994087, "grad_norm_var": 1.4017904406818381e-06, "learning_rate": 0.006376597861519138, "loss": 2.6136, "step": 11668 }, { "crossentropy": 2.6078102588653564, "epoch": 0.4230350928074246, "grad_norm": 0.028638599440455437, "grad_norm_var": 1.394607981928146e-06, "learning_rate": 0.006376039221938514, "loss": 2.5239, "step": 11669 }, { "crossentropy": 2.6716246604919434, "epoch": 0.4230713457076566, "grad_norm": 0.026958128437399864, "grad_norm_var": 1.459107872960284e-06, "learning_rate": 0.00637548056377239, "loss": 2.6399, "step": 11670 }, { "crossentropy": 2.383884906768799, "epoch": 0.42310759860788866, "grad_norm": 0.02746471017599106, "grad_norm_var": 1.4961444042319808e-06, "learning_rate": 0.006374921887028317, "loss": 2.5571, "step": 11671 }, { "crossentropy": 2.5911645889282227, "epoch": 0.42314385150812067, "grad_norm": 0.026613879948854446, "grad_norm_var": 1.6625178594196003e-06, "learning_rate": 0.006374363191713834, "loss": 2.5447, "step": 11672 }, { "crossentropy": 2.410762310028076, "epoch": 0.4231801044083527, "grad_norm": 0.030772889032959938, "grad_norm_var": 1.9124823053731498e-06, "learning_rate": 0.006373804477836494, "loss": 2.5465, "step": 11673 }, { "crossentropy": 2.513533592224121, "epoch": 0.4232163573085847, "grad_norm": 0.033270154148340225, "grad_norm_var": 3.270798347096412e-06, "learning_rate": 0.0063732457454038355, "loss": 2.6243, "step": 11674 }, { "crossentropy": 2.748208999633789, "epoch": 0.4232526102088167, "grad_norm": 0.02718512900173664, "grad_norm_var": 3.3599478395085016e-06, "learning_rate": 0.006372686994423412, "loss": 2.7277, "step": 11675 }, { "crossentropy": 2.6008849143981934, "epoch": 0.42328886310904873, "grad_norm": 0.026366839185357094, "grad_norm_var": 3.556662379942593e-06, "learning_rate": 0.006372128224902767, "loss": 2.598, "step": 11676 }, { "crossentropy": 2.527452230453491, "epoch": 0.42332511600928074, "grad_norm": 0.028914639726281166, "grad_norm_var": 3.5493482358188994e-06, "learning_rate": 0.006371569436849448, "loss": 2.584, "step": 11677 }, { "crossentropy": 2.471395969390869, "epoch": 0.42336136890951276, "grad_norm": 0.026632443070411682, "grad_norm_var": 3.6608794532190515e-06, "learning_rate": 0.006371010630271001, "loss": 2.5415, "step": 11678 }, { "crossentropy": 2.5397226810455322, "epoch": 0.4233976218097448, "grad_norm": 0.02915843203663826, "grad_norm_var": 3.380724921316948e-06, "learning_rate": 0.006370451805174977, "loss": 2.5895, "step": 11679 }, { "crossentropy": 2.630251407623291, "epoch": 0.4234338747099768, "grad_norm": 0.026602301746606827, "grad_norm_var": 3.5790316392974564e-06, "learning_rate": 0.006369892961568922, "loss": 2.5168, "step": 11680 }, { "crossentropy": 2.5572617053985596, "epoch": 0.4234701276102088, "grad_norm": 0.028011605143547058, "grad_norm_var": 3.4084410120526714e-06, "learning_rate": 0.006369334099460382, "loss": 2.6151, "step": 11681 }, { "crossentropy": 2.6314291954040527, "epoch": 0.4235063805104408, "grad_norm": 0.02936537377536297, "grad_norm_var": 3.416774035690471e-06, "learning_rate": 0.006368775218856906, "loss": 2.5424, "step": 11682 }, { "crossentropy": 2.5797507762908936, "epoch": 0.42354263341067283, "grad_norm": 0.027669807896018028, "grad_norm_var": 3.4220317294414293e-06, "learning_rate": 0.006368216319766046, "loss": 2.5929, "step": 11683 }, { "crossentropy": 2.60735821723938, "epoch": 0.42357888631090485, "grad_norm": 0.026735888794064522, "grad_norm_var": 3.4480787788249927e-06, "learning_rate": 0.006367657402195347, "loss": 2.5885, "step": 11684 }, { "crossentropy": 2.53841233253479, "epoch": 0.4236151392111369, "grad_norm": 0.028702659532427788, "grad_norm_var": 3.4525294712527913e-06, "learning_rate": 0.006367098466152359, "loss": 2.5177, "step": 11685 }, { "crossentropy": 2.516486644744873, "epoch": 0.42365139211136893, "grad_norm": 0.030872870236635208, "grad_norm_var": 3.7874270471703145e-06, "learning_rate": 0.006366539511644631, "loss": 2.5384, "step": 11686 }, { "crossentropy": 2.5244390964508057, "epoch": 0.42368764501160094, "grad_norm": 0.029628921300172806, "grad_norm_var": 3.811365436840776e-06, "learning_rate": 0.006365980538679714, "loss": 2.5477, "step": 11687 }, { "crossentropy": 2.519887924194336, "epoch": 0.42372389791183296, "grad_norm": 0.027736390009522438, "grad_norm_var": 3.6031124132059835e-06, "learning_rate": 0.006365421547265159, "loss": 2.5772, "step": 11688 }, { "crossentropy": 2.6297199726104736, "epoch": 0.423760150812065, "grad_norm": 0.027511203661561012, "grad_norm_var": 3.3237697292014113e-06, "learning_rate": 0.0063648625374085116, "loss": 2.5427, "step": 11689 }, { "crossentropy": 2.572736978530884, "epoch": 0.423796403712297, "grad_norm": 0.02649880014359951, "grad_norm_var": 1.7904726755737487e-06, "learning_rate": 0.006364303509117324, "loss": 2.5994, "step": 11690 }, { "crossentropy": 2.489413022994995, "epoch": 0.423832656612529, "grad_norm": 0.027651390060782433, "grad_norm_var": 1.7549813372395861e-06, "learning_rate": 0.006363744462399147, "loss": 2.561, "step": 11691 }, { "crossentropy": 2.6288092136383057, "epoch": 0.423868909512761, "grad_norm": 0.028798453509807587, "grad_norm_var": 1.5938254510793147e-06, "learning_rate": 0.006363185397261533, "loss": 2.6019, "step": 11692 }, { "crossentropy": 2.5551278591156006, "epoch": 0.42390516241299303, "grad_norm": 0.03166963905096054, "grad_norm_var": 2.346986016526995e-06, "learning_rate": 0.00636262631371203, "loss": 2.5561, "step": 11693 }, { "crossentropy": 2.5808703899383545, "epoch": 0.42394141531322505, "grad_norm": 0.03350148722529411, "grad_norm_var": 3.743161618434872e-06, "learning_rate": 0.006362067211758192, "loss": 2.5032, "step": 11694 }, { "crossentropy": 2.504474401473999, "epoch": 0.42397766821345706, "grad_norm": 0.02735893614590168, "grad_norm_var": 3.849279851322176e-06, "learning_rate": 0.006361508091407568, "loss": 2.6497, "step": 11695 }, { "crossentropy": 2.6057450771331787, "epoch": 0.4240139211136891, "grad_norm": 0.027345729991793633, "grad_norm_var": 3.681369219433785e-06, "learning_rate": 0.006360948952667714, "loss": 2.6159, "step": 11696 }, { "crossentropy": 2.5922248363494873, "epoch": 0.4240501740139211, "grad_norm": 0.028623446822166443, "grad_norm_var": 3.649325736955246e-06, "learning_rate": 0.006360389795546175, "loss": 2.6646, "step": 11697 }, { "crossentropy": 2.5987772941589355, "epoch": 0.42408642691415316, "grad_norm": 0.02667829766869545, "grad_norm_var": 3.872758098068931e-06, "learning_rate": 0.0063598306200505095, "loss": 2.6149, "step": 11698 }, { "crossentropy": 2.4685218334198, "epoch": 0.4241226798143852, "grad_norm": 0.027467085048556328, "grad_norm_var": 3.899428682588172e-06, "learning_rate": 0.0063592714261882675, "loss": 2.5234, "step": 11699 }, { "crossentropy": 2.498948574066162, "epoch": 0.4241589327146172, "grad_norm": 0.0282612107694149, "grad_norm_var": 3.676133446953167e-06, "learning_rate": 0.006358712213967004, "loss": 2.5374, "step": 11700 }, { "crossentropy": 2.6347222328186035, "epoch": 0.4241951856148492, "grad_norm": 0.028311967849731445, "grad_norm_var": 3.682625949220442e-06, "learning_rate": 0.006358152983394267, "loss": 2.6916, "step": 11701 }, { "crossentropy": 2.4844777584075928, "epoch": 0.4242314385150812, "grad_norm": 0.030609803274273872, "grad_norm_var": 3.6079213067372975e-06, "learning_rate": 0.006357593734477613, "loss": 2.5601, "step": 11702 }, { "crossentropy": 2.587369203567505, "epoch": 0.42426769141531323, "grad_norm": 0.026645632460713387, "grad_norm_var": 3.756207866879611e-06, "learning_rate": 0.0063570344672245985, "loss": 2.5939, "step": 11703 }, { "crossentropy": 2.633652925491333, "epoch": 0.42430394431554525, "grad_norm": 0.027987880632281303, "grad_norm_var": 3.7373438615197525e-06, "learning_rate": 0.006356475181642772, "loss": 2.5888, "step": 11704 }, { "crossentropy": 2.5274555683135986, "epoch": 0.42434019721577726, "grad_norm": 0.027338365092873573, "grad_norm_var": 3.7604437290170475e-06, "learning_rate": 0.00635591587773969, "loss": 2.6252, "step": 11705 }, { "crossentropy": 2.5508840084075928, "epoch": 0.4243764501160093, "grad_norm": 0.028886374086141586, "grad_norm_var": 3.504565064301144e-06, "learning_rate": 0.006355356555522907, "loss": 2.5715, "step": 11706 }, { "crossentropy": 2.6728968620300293, "epoch": 0.4244127030162413, "grad_norm": 0.028564387932419777, "grad_norm_var": 3.4447182464609923e-06, "learning_rate": 0.006354797214999974, "loss": 2.6849, "step": 11707 }, { "crossentropy": 2.6102428436279297, "epoch": 0.4244489559164733, "grad_norm": 0.02656714990735054, "grad_norm_var": 3.7051898150460004e-06, "learning_rate": 0.006354237856178452, "loss": 2.6774, "step": 11708 }, { "crossentropy": 2.792543888092041, "epoch": 0.4244852088167053, "grad_norm": 0.0315445140004158, "grad_norm_var": 3.6530977613254136e-06, "learning_rate": 0.006353678479065892, "loss": 2.7129, "step": 11709 }, { "crossentropy": 2.806577444076538, "epoch": 0.42452146171693733, "grad_norm": 0.03084712289273739, "grad_norm_var": 2.316541393251404e-06, "learning_rate": 0.00635311908366985, "loss": 2.6755, "step": 11710 }, { "crossentropy": 2.698561191558838, "epoch": 0.42455771461716935, "grad_norm": 0.029338229447603226, "grad_norm_var": 2.3091153078121904e-06, "learning_rate": 0.006352559669997879, "loss": 2.6828, "step": 11711 }, { "crossentropy": 2.6589510440826416, "epoch": 0.4245939675174014, "grad_norm": 0.029262462630867958, "grad_norm_var": 2.259439616101681e-06, "learning_rate": 0.006352000238057539, "loss": 2.7173, "step": 11712 }, { "crossentropy": 2.612455368041992, "epoch": 0.42463022041763343, "grad_norm": 0.029995722696185112, "grad_norm_var": 2.3890429267282247e-06, "learning_rate": 0.006351440787856384, "loss": 2.6297, "step": 11713 }, { "crossentropy": 2.5800769329071045, "epoch": 0.42466647331786544, "grad_norm": 0.0345185250043869, "grad_norm_var": 4.175847810362074e-06, "learning_rate": 0.006350881319401971, "loss": 2.6082, "step": 11714 }, { "crossentropy": 2.7780673503875732, "epoch": 0.42470272621809746, "grad_norm": 0.029740940779447556, "grad_norm_var": 3.993576388288076e-06, "learning_rate": 0.0063503218327018544, "loss": 2.573, "step": 11715 }, { "crossentropy": 2.524463653564453, "epoch": 0.4247389791183295, "grad_norm": 0.026869747787714005, "grad_norm_var": 4.302908923443464e-06, "learning_rate": 0.006349762327763593, "loss": 2.5644, "step": 11716 }, { "crossentropy": 2.650951623916626, "epoch": 0.4247752320185615, "grad_norm": 0.027619443833827972, "grad_norm_var": 4.413893231968123e-06, "learning_rate": 0.006349202804594742, "loss": 2.5686, "step": 11717 }, { "crossentropy": 2.575302839279175, "epoch": 0.4248114849187935, "grad_norm": 0.02790882997214794, "grad_norm_var": 4.342694446645273e-06, "learning_rate": 0.00634864326320286, "loss": 2.582, "step": 11718 }, { "crossentropy": 2.5656685829162598, "epoch": 0.4248477378190255, "grad_norm": 0.026577653363347054, "grad_norm_var": 4.364116389735974e-06, "learning_rate": 0.006348083703595506, "loss": 2.5559, "step": 11719 }, { "crossentropy": 2.661151885986328, "epoch": 0.42488399071925753, "grad_norm": 0.03102637641131878, "grad_norm_var": 4.542057245438859e-06, "learning_rate": 0.006347524125780233, "loss": 2.6423, "step": 11720 }, { "crossentropy": 2.6492998600006104, "epoch": 0.42492024361948955, "grad_norm": 0.028015313670039177, "grad_norm_var": 4.406019408019721e-06, "learning_rate": 0.006346964529764603, "loss": 2.6009, "step": 11721 }, { "crossentropy": 2.680209159851074, "epoch": 0.42495649651972156, "grad_norm": 0.02762218378484249, "grad_norm_var": 4.559642088840229e-06, "learning_rate": 0.006346404915556173, "loss": 2.594, "step": 11722 }, { "crossentropy": 2.6459157466888428, "epoch": 0.4249927494199536, "grad_norm": 0.028742846101522446, "grad_norm_var": 4.548265438666411e-06, "learning_rate": 0.006345845283162501, "loss": 2.63, "step": 11723 }, { "crossentropy": 2.6535136699676514, "epoch": 0.4250290023201856, "grad_norm": 0.028345275670289993, "grad_norm_var": 4.136529741440888e-06, "learning_rate": 0.006345285632591147, "loss": 2.6861, "step": 11724 }, { "crossentropy": 2.6554932594299316, "epoch": 0.42506525522041766, "grad_norm": 0.026434360072016716, "grad_norm_var": 4.204201742782316e-06, "learning_rate": 0.006344725963849669, "loss": 2.6152, "step": 11725 }, { "crossentropy": 2.6350483894348145, "epoch": 0.4251015081206497, "grad_norm": 0.027523696422576904, "grad_norm_var": 4.044587715860912e-06, "learning_rate": 0.0063441662769456245, "loss": 2.5907, "step": 11726 }, { "crossentropy": 2.7500698566436768, "epoch": 0.4251377610208817, "grad_norm": 0.027211982756853104, "grad_norm_var": 4.152260612038696e-06, "learning_rate": 0.006343606571886575, "loss": 2.6684, "step": 11727 }, { "crossentropy": 2.6697192192077637, "epoch": 0.4251740139211137, "grad_norm": 0.027635548263788223, "grad_norm_var": 4.171482830567216e-06, "learning_rate": 0.006343046848680081, "loss": 2.6221, "step": 11728 }, { "crossentropy": 2.757732629776001, "epoch": 0.4252102668213457, "grad_norm": 0.029090043157339096, "grad_norm_var": 4.040532739242539e-06, "learning_rate": 0.006342487107333701, "loss": 2.6219, "step": 11729 }, { "crossentropy": 2.554389715194702, "epoch": 0.42524651972157773, "grad_norm": 0.02710949070751667, "grad_norm_var": 1.4568866648506969e-06, "learning_rate": 0.006341927347854995, "loss": 2.5761, "step": 11730 }, { "crossentropy": 2.682422161102295, "epoch": 0.42528277262180975, "grad_norm": 0.026093434542417526, "grad_norm_var": 1.4257301851628197e-06, "learning_rate": 0.006341367570251524, "loss": 2.6002, "step": 11731 }, { "crossentropy": 2.614245891571045, "epoch": 0.42531902552204176, "grad_norm": 0.027388067916035652, "grad_norm_var": 1.3824380970407663e-06, "learning_rate": 0.006340807774530848, "loss": 2.558, "step": 11732 }, { "crossentropy": 2.7293403148651123, "epoch": 0.4253552784222738, "grad_norm": 0.026197465136647224, "grad_norm_var": 1.5376504553591865e-06, "learning_rate": 0.006340247960700527, "loss": 2.6765, "step": 11733 }, { "crossentropy": 2.643327474594116, "epoch": 0.4253915313225058, "grad_norm": 0.0288999080657959, "grad_norm_var": 1.6289270737543845e-06, "learning_rate": 0.006339688128768125, "loss": 2.5503, "step": 11734 }, { "crossentropy": 2.4947071075439453, "epoch": 0.4254277842227378, "grad_norm": 0.026970699429512024, "grad_norm_var": 1.5774270772528904e-06, "learning_rate": 0.006339128278741202, "loss": 2.5169, "step": 11735 }, { "crossentropy": 2.6479671001434326, "epoch": 0.4254640371229698, "grad_norm": 0.027486341074109077, "grad_norm_var": 8.232501183518531e-07, "learning_rate": 0.006338568410627318, "loss": 2.634, "step": 11736 }, { "crossentropy": 2.6623902320861816, "epoch": 0.42550029002320183, "grad_norm": 0.026933183893561363, "grad_norm_var": 8.28999937134408e-07, "learning_rate": 0.006338008524434037, "loss": 2.6508, "step": 11737 }, { "crossentropy": 2.6278207302093506, "epoch": 0.42553654292343385, "grad_norm": 0.029002225026488304, "grad_norm_var": 9.74142587691753e-07, "learning_rate": 0.00633744862016892, "loss": 2.6594, "step": 11738 }, { "crossentropy": 2.6981520652770996, "epoch": 0.4255727958236659, "grad_norm": 0.028362834826111794, "grad_norm_var": 9.235666184354007e-07, "learning_rate": 0.0063368886978395315, "loss": 2.6966, "step": 11739 }, { "crossentropy": 2.558870315551758, "epoch": 0.42560904872389793, "grad_norm": 0.028409035876393318, "grad_norm_var": 9.306429680305672e-07, "learning_rate": 0.00633632875745343, "loss": 2.6162, "step": 11740 }, { "crossentropy": 2.535966157913208, "epoch": 0.42564530162412995, "grad_norm": 0.028376227244734764, "grad_norm_var": 8.783007115846595e-07, "learning_rate": 0.006335768799018183, "loss": 2.569, "step": 11741 }, { "crossentropy": 2.4555823802948, "epoch": 0.42568155452436196, "grad_norm": 0.02878335677087307, "grad_norm_var": 9.532128283190957e-07, "learning_rate": 0.0063352088225413486, "loss": 2.5, "step": 11742 }, { "crossentropy": 2.7761127948760986, "epoch": 0.425717807424594, "grad_norm": 0.030164211988449097, "grad_norm_var": 1.2873953035667796e-06, "learning_rate": 0.006334648828030494, "loss": 2.7118, "step": 11743 }, { "crossentropy": 2.5490784645080566, "epoch": 0.425754060324826, "grad_norm": 0.029212016612291336, "grad_norm_var": 1.3805410793259199e-06, "learning_rate": 0.0063340888154931815, "loss": 2.4773, "step": 11744 }, { "crossentropy": 2.683494806289673, "epoch": 0.425790313225058, "grad_norm": 0.02740289643406868, "grad_norm_var": 1.319964809789557e-06, "learning_rate": 0.0063335287849369745, "loss": 2.6541, "step": 11745 }, { "crossentropy": 2.514991283416748, "epoch": 0.42582656612529, "grad_norm": 0.027083605527877808, "grad_norm_var": 1.3228194453195845e-06, "learning_rate": 0.006332968736369439, "loss": 2.5258, "step": 11746 }, { "crossentropy": 2.572024345397949, "epoch": 0.42586281902552203, "grad_norm": 0.027660692110657692, "grad_norm_var": 1.0940504376360492e-06, "learning_rate": 0.006332408669798137, "loss": 2.5228, "step": 11747 }, { "crossentropy": 2.5778584480285645, "epoch": 0.42589907192575405, "grad_norm": 0.02653263695538044, "grad_norm_var": 1.2119531573322068e-06, "learning_rate": 0.006331848585230634, "loss": 2.6093, "step": 11748 }, { "crossentropy": 2.454066753387451, "epoch": 0.42593532482598606, "grad_norm": 0.028512997552752495, "grad_norm_var": 1.0006337776741667e-06, "learning_rate": 0.006331288482674495, "loss": 2.5337, "step": 11749 }, { "crossentropy": 2.5740203857421875, "epoch": 0.4259715777262181, "grad_norm": 0.026408066973090172, "grad_norm_var": 1.1269524400417715e-06, "learning_rate": 0.006330728362137284, "loss": 2.6025, "step": 11750 }, { "crossentropy": 2.5334627628326416, "epoch": 0.4260078306264501, "grad_norm": 0.027941657230257988, "grad_norm_var": 1.0582761585229209e-06, "learning_rate": 0.0063301682236265675, "loss": 2.5443, "step": 11751 }, { "crossentropy": 2.5874221324920654, "epoch": 0.42604408352668216, "grad_norm": 0.027534663677215576, "grad_norm_var": 1.0550030634898782e-06, "learning_rate": 0.006329608067149911, "loss": 2.5666, "step": 11752 }, { "crossentropy": 2.512964963912964, "epoch": 0.4260803364269142, "grad_norm": 0.030013538897037506, "grad_norm_var": 1.2016612326428676e-06, "learning_rate": 0.0063290478927148786, "loss": 2.546, "step": 11753 }, { "crossentropy": 2.5099036693573, "epoch": 0.4261165893271462, "grad_norm": 0.029991019517183304, "grad_norm_var": 1.3668796901875606e-06, "learning_rate": 0.006328487700329038, "loss": 2.5533, "step": 11754 }, { "crossentropy": 2.630934715270996, "epoch": 0.4261528422273782, "grad_norm": 0.02859172783792019, "grad_norm_var": 1.3728549357983615e-06, "learning_rate": 0.006327927489999956, "loss": 2.5331, "step": 11755 }, { "crossentropy": 2.4202780723571777, "epoch": 0.4261890951276102, "grad_norm": 0.027800116688013077, "grad_norm_var": 1.3862545633067566e-06, "learning_rate": 0.006327367261735197, "loss": 2.4693, "step": 11756 }, { "crossentropy": 2.753002166748047, "epoch": 0.42622534802784223, "grad_norm": 0.03038610704243183, "grad_norm_var": 1.6723995150790793e-06, "learning_rate": 0.006326807015542328, "loss": 2.715, "step": 11757 }, { "crossentropy": 2.412104368209839, "epoch": 0.42626160092807425, "grad_norm": 0.0320252925157547, "grad_norm_var": 2.5052775170300435e-06, "learning_rate": 0.006326246751428918, "loss": 2.555, "step": 11758 }, { "crossentropy": 2.7414257526397705, "epoch": 0.42629785382830626, "grad_norm": 0.0316271148622036, "grad_norm_var": 2.9482678625443545e-06, "learning_rate": 0.006325686469402534, "loss": 2.6668, "step": 11759 }, { "crossentropy": 2.4881584644317627, "epoch": 0.4263341067285383, "grad_norm": 0.027587099000811577, "grad_norm_var": 2.9959154127095464e-06, "learning_rate": 0.006325126169470741, "loss": 2.5232, "step": 11760 }, { "crossentropy": 2.606220245361328, "epoch": 0.4263703596287703, "grad_norm": 0.02766728401184082, "grad_norm_var": 2.959187609054744e-06, "learning_rate": 0.006324565851641109, "loss": 2.6223, "step": 11761 }, { "crossentropy": 2.72056245803833, "epoch": 0.4264066125290023, "grad_norm": 0.027167731896042824, "grad_norm_var": 2.9427864836093494e-06, "learning_rate": 0.006324005515921203, "loss": 2.6099, "step": 11762 }, { "crossentropy": 2.4335429668426514, "epoch": 0.4264428654292343, "grad_norm": 0.027100667357444763, "grad_norm_var": 3.031815759075666e-06, "learning_rate": 0.006323445162318595, "loss": 2.5085, "step": 11763 }, { "crossentropy": 2.611091375350952, "epoch": 0.42647911832946633, "grad_norm": 0.027669396251440048, "grad_norm_var": 2.80598113241239e-06, "learning_rate": 0.006322884790840852, "loss": 2.5163, "step": 11764 }, { "crossentropy": 2.5925402641296387, "epoch": 0.4265153712296984, "grad_norm": 0.027554932981729507, "grad_norm_var": 2.877851980905936e-06, "learning_rate": 0.006322324401495541, "loss": 2.5599, "step": 11765 }, { "crossentropy": 2.5488431453704834, "epoch": 0.4265516241299304, "grad_norm": 0.028398390859365463, "grad_norm_var": 2.5526012786539295e-06, "learning_rate": 0.006321763994290232, "loss": 2.5834, "step": 11766 }, { "crossentropy": 2.4887993335723877, "epoch": 0.42658787703016243, "grad_norm": 0.03652510792016983, "grad_norm_var": 6.299682016492706e-06, "learning_rate": 0.006321203569232495, "loss": 2.5577, "step": 11767 }, { "crossentropy": 2.6168899536132812, "epoch": 0.42662412993039445, "grad_norm": 0.028343813493847847, "grad_norm_var": 6.1579665138815675e-06, "learning_rate": 0.0063206431263299005, "loss": 2.6087, "step": 11768 }, { "crossentropy": 2.5469603538513184, "epoch": 0.42666038283062646, "grad_norm": 0.028626874089241028, "grad_norm_var": 6.142166679869267e-06, "learning_rate": 0.006320082665590015, "loss": 2.6365, "step": 11769 }, { "crossentropy": 2.6750001907348633, "epoch": 0.4266966357308585, "grad_norm": 0.03161943331360817, "grad_norm_var": 6.481511008865033e-06, "learning_rate": 0.006319522187020409, "loss": 2.5729, "step": 11770 }, { "crossentropy": 2.7113959789276123, "epoch": 0.4267328886310905, "grad_norm": 0.030583664774894714, "grad_norm_var": 6.5431960223520545e-06, "learning_rate": 0.006318961690628654, "loss": 2.6779, "step": 11771 }, { "crossentropy": 2.491628646850586, "epoch": 0.4267691415313225, "grad_norm": 0.026211827993392944, "grad_norm_var": 7.043418616043776e-06, "learning_rate": 0.006318401176422322, "loss": 2.4607, "step": 11772 }, { "crossentropy": 2.4899425506591797, "epoch": 0.4268053944315545, "grad_norm": 0.02595970220863819, "grad_norm_var": 7.637850183232302e-06, "learning_rate": 0.006317840644408978, "loss": 2.538, "step": 11773 }, { "crossentropy": 2.614692449569702, "epoch": 0.42684164733178653, "grad_norm": 0.029957514256238937, "grad_norm_var": 7.082513700225739e-06, "learning_rate": 0.006317280094596197, "loss": 2.6249, "step": 11774 }, { "crossentropy": 2.6452035903930664, "epoch": 0.42687790023201855, "grad_norm": 0.030316883698105812, "grad_norm_var": 6.715577470666024e-06, "learning_rate": 0.00631671952699155, "loss": 2.6207, "step": 11775 }, { "crossentropy": 2.548002243041992, "epoch": 0.42691415313225056, "grad_norm": 0.027914108708500862, "grad_norm_var": 6.668040703623279e-06, "learning_rate": 0.006316158941602608, "loss": 2.5505, "step": 11776 }, { "crossentropy": 2.6279337406158447, "epoch": 0.4269504060324826, "grad_norm": 0.026232387870550156, "grad_norm_var": 7.0232075200246284e-06, "learning_rate": 0.006315598338436942, "loss": 2.6593, "step": 11777 }, { "crossentropy": 2.55446457862854, "epoch": 0.4269866589327146, "grad_norm": 0.026546014472842216, "grad_norm_var": 7.179474163457077e-06, "learning_rate": 0.006315037717502123, "loss": 2.5334, "step": 11778 }, { "crossentropy": 2.6089704036712646, "epoch": 0.42702291183294666, "grad_norm": 0.028418274596333504, "grad_norm_var": 7.0030466833038525e-06, "learning_rate": 0.006314477078805724, "loss": 2.5918, "step": 11779 }, { "crossentropy": 2.7120494842529297, "epoch": 0.4270591647331787, "grad_norm": 0.027533233165740967, "grad_norm_var": 7.024820534375492e-06, "learning_rate": 0.006313916422355319, "loss": 2.666, "step": 11780 }, { "crossentropy": 2.719301462173462, "epoch": 0.4270954176334107, "grad_norm": 0.027858398854732513, "grad_norm_var": 6.980344468176386e-06, "learning_rate": 0.006313355748158478, "loss": 2.5605, "step": 11781 }, { "crossentropy": 2.409545421600342, "epoch": 0.4271316705336427, "grad_norm": 0.025963185355067253, "grad_norm_var": 7.486368357338549e-06, "learning_rate": 0.006312795056222773, "loss": 2.4793, "step": 11782 }, { "crossentropy": 2.5076892375946045, "epoch": 0.4271679234338747, "grad_norm": 0.02716680057346821, "grad_norm_var": 3.150040922067903e-06, "learning_rate": 0.006312234346555779, "loss": 2.5517, "step": 11783 }, { "crossentropy": 2.7623813152313232, "epoch": 0.42720417633410673, "grad_norm": 0.028178708627820015, "grad_norm_var": 3.1458986996694854e-06, "learning_rate": 0.00631167361916507, "loss": 2.7182, "step": 11784 }, { "crossentropy": 2.4881434440612793, "epoch": 0.42724042923433875, "grad_norm": 0.027956077829003334, "grad_norm_var": 3.124030736460304e-06, "learning_rate": 0.006311112874058218, "loss": 2.5098, "step": 11785 }, { "crossentropy": 2.5625171661376953, "epoch": 0.42727668213457076, "grad_norm": 0.028414065018296242, "grad_norm_var": 2.2304153380942394e-06, "learning_rate": 0.006310552111242796, "loss": 2.5758, "step": 11786 }, { "crossentropy": 2.5841217041015625, "epoch": 0.4273129350348028, "grad_norm": 0.02726111374795437, "grad_norm_var": 1.6985675156132857e-06, "learning_rate": 0.006309991330726379, "loss": 2.654, "step": 11787 }, { "crossentropy": 2.4741666316986084, "epoch": 0.4273491879350348, "grad_norm": 0.02764078974723816, "grad_norm_var": 1.5582692411800302e-06, "learning_rate": 0.006309430532516543, "loss": 2.5772, "step": 11788 }, { "crossentropy": 2.6249606609344482, "epoch": 0.4273854408352668, "grad_norm": 0.02661445178091526, "grad_norm_var": 1.4324951133621536e-06, "learning_rate": 0.00630886971662086, "loss": 2.6375, "step": 11789 }, { "crossentropy": 2.5516293048858643, "epoch": 0.4274216937354988, "grad_norm": 0.028229281306266785, "grad_norm_var": 1.1100864491472474e-06, "learning_rate": 0.006308308883046902, "loss": 2.5977, "step": 11790 }, { "crossentropy": 2.5257482528686523, "epoch": 0.42745794663573083, "grad_norm": 0.027160799130797386, "grad_norm_var": 6.062772620947365e-07, "learning_rate": 0.006307748031802249, "loss": 2.4898, "step": 11791 }, { "crossentropy": 2.584531784057617, "epoch": 0.4274941995359629, "grad_norm": 0.027785751968622208, "grad_norm_var": 5.992439834118582e-07, "learning_rate": 0.006307187162894475, "loss": 2.5808, "step": 11792 }, { "crossentropy": 2.5493967533111572, "epoch": 0.4275304524361949, "grad_norm": 0.027557600289583206, "grad_norm_var": 4.965175478289022e-07, "learning_rate": 0.006306626276331155, "loss": 2.6144, "step": 11793 }, { "crossentropy": 2.633384943008423, "epoch": 0.42756670533642693, "grad_norm": 0.027552295476198196, "grad_norm_var": 4.2942202295886445e-07, "learning_rate": 0.006306065372119863, "loss": 2.4964, "step": 11794 }, { "crossentropy": 2.5769660472869873, "epoch": 0.42760295823665895, "grad_norm": 0.02746855467557907, "grad_norm_var": 3.7973056620857484e-07, "learning_rate": 0.006305504450268177, "loss": 2.5982, "step": 11795 }, { "crossentropy": 2.481954336166382, "epoch": 0.42763921113689096, "grad_norm": 0.027068497613072395, "grad_norm_var": 3.9249101762941994e-07, "learning_rate": 0.006304943510783672, "loss": 2.5984, "step": 11796 }, { "crossentropy": 2.580301284790039, "epoch": 0.427675464037123, "grad_norm": 0.02749055065214634, "grad_norm_var": 3.8299088362247815e-07, "learning_rate": 0.006304382553673924, "loss": 2.5114, "step": 11797 }, { "crossentropy": 2.653815269470215, "epoch": 0.427711716937355, "grad_norm": 0.027728578075766563, "grad_norm_var": 2.2326531913524395e-07, "learning_rate": 0.006303821578946512, "loss": 2.6222, "step": 11798 }, { "crossentropy": 2.4972944259643555, "epoch": 0.427747969837587, "grad_norm": 0.02729140780866146, "grad_norm_var": 2.173770529645865e-07, "learning_rate": 0.0063032605866090085, "loss": 2.5585, "step": 11799 }, { "crossentropy": 2.6909682750701904, "epoch": 0.427784222737819, "grad_norm": 0.029437284916639328, "grad_norm_var": 4.1560424969970475e-07, "learning_rate": 0.006302699576668994, "loss": 2.6625, "step": 11800 }, { "crossentropy": 2.6720447540283203, "epoch": 0.42782047563805103, "grad_norm": 0.02761571668088436, "grad_norm_var": 4.0968356367145017e-07, "learning_rate": 0.006302138549134045, "loss": 2.6512, "step": 11801 }, { "crossentropy": 2.615304946899414, "epoch": 0.42785672853828305, "grad_norm": 0.027222778648138046, "grad_norm_var": 3.7619200046653165e-07, "learning_rate": 0.0063015775040117395, "loss": 2.6467, "step": 11802 }, { "crossentropy": 2.6698620319366455, "epoch": 0.42789298143851506, "grad_norm": 0.027641480788588524, "grad_norm_var": 3.6955180705289246e-07, "learning_rate": 0.006301016441309654, "loss": 2.6252, "step": 11803 }, { "crossentropy": 2.6387977600097656, "epoch": 0.4279292343387471, "grad_norm": 0.026600809767842293, "grad_norm_var": 4.306769202255204e-07, "learning_rate": 0.0063004553610353665, "loss": 2.6315, "step": 11804 }, { "crossentropy": 2.5812418460845947, "epoch": 0.4279654872389791, "grad_norm": 0.02667371928691864, "grad_norm_var": 4.2366848588525535e-07, "learning_rate": 0.006299894263196456, "loss": 2.6395, "step": 11805 }, { "crossentropy": 2.3823235034942627, "epoch": 0.42800174013921116, "grad_norm": 0.0264443289488554, "grad_norm_var": 4.570433594506257e-07, "learning_rate": 0.006299333147800501, "loss": 2.5444, "step": 11806 }, { "crossentropy": 2.5669729709625244, "epoch": 0.4280379930394432, "grad_norm": 0.026175644248723984, "grad_norm_var": 5.519140229443345e-07, "learning_rate": 0.006298772014855081, "loss": 2.5784, "step": 11807 }, { "crossentropy": 2.603377103805542, "epoch": 0.4280742459396752, "grad_norm": 0.027721764519810677, "grad_norm_var": 5.485348853458225e-07, "learning_rate": 0.006298210864367773, "loss": 2.6116, "step": 11808 }, { "crossentropy": 2.717066764831543, "epoch": 0.4281104988399072, "grad_norm": 0.029373768717050552, "grad_norm_var": 8.035832836614329e-07, "learning_rate": 0.006297649696346157, "loss": 2.6028, "step": 11809 }, { "crossentropy": 2.7354190349578857, "epoch": 0.4281467517401392, "grad_norm": 0.028207384049892426, "grad_norm_var": 8.376626895657679e-07, "learning_rate": 0.006297088510797813, "loss": 2.668, "step": 11810 }, { "crossentropy": 2.546769618988037, "epoch": 0.42818300464037123, "grad_norm": 0.028179971501231194, "grad_norm_var": 8.653500340834984e-07, "learning_rate": 0.00629652730773032, "loss": 2.6051, "step": 11811 }, { "crossentropy": 2.617438316345215, "epoch": 0.42821925754060325, "grad_norm": 0.027891624718904495, "grad_norm_var": 8.543457087079592e-07, "learning_rate": 0.0062959660871512585, "loss": 2.6533, "step": 11812 }, { "crossentropy": 2.6226541996002197, "epoch": 0.42825551044083526, "grad_norm": 0.02718215435743332, "grad_norm_var": 8.650392862504774e-07, "learning_rate": 0.006295404849068207, "loss": 2.6072, "step": 11813 }, { "crossentropy": 2.532927989959717, "epoch": 0.4282917633410673, "grad_norm": 0.02721267379820347, "grad_norm_var": 8.719199441830731e-07, "learning_rate": 0.006294843593488748, "loss": 2.5995, "step": 11814 }, { "crossentropy": 2.5925753116607666, "epoch": 0.4283280162412993, "grad_norm": 0.027617322281003, "grad_norm_var": 8.671245763993673e-07, "learning_rate": 0.006294282320420463, "loss": 2.5836, "step": 11815 }, { "crossentropy": 2.5218000411987305, "epoch": 0.4283642691415313, "grad_norm": 0.04988320916891098, "grad_norm_var": 3.2071449386184005e-05, "learning_rate": 0.006293721029870929, "loss": 2.5827, "step": 11816 }, { "crossentropy": 2.564305067062378, "epoch": 0.4284005220417633, "grad_norm": 0.02685656026005745, "grad_norm_var": 3.2232685085821256e-05, "learning_rate": 0.00629315972184773, "loss": 2.5665, "step": 11817 }, { "crossentropy": 2.554039478302002, "epoch": 0.42843677494199534, "grad_norm": 0.026512647047638893, "grad_norm_var": 3.241404514371839e-05, "learning_rate": 0.006292598396358447, "loss": 2.6288, "step": 11818 }, { "crossentropy": 2.5681328773498535, "epoch": 0.4284730278422274, "grad_norm": 0.02775699645280838, "grad_norm_var": 3.239763710301013e-05, "learning_rate": 0.00629203705341066, "loss": 2.5931, "step": 11819 }, { "crossentropy": 2.5945935249328613, "epoch": 0.4285092807424594, "grad_norm": 0.03175435960292816, "grad_norm_var": 3.256830553470252e-05, "learning_rate": 0.006291475693011952, "loss": 2.4529, "step": 11820 }, { "crossentropy": 2.6021828651428223, "epoch": 0.42854553364269143, "grad_norm": 0.02714899368584156, "grad_norm_var": 3.2429287525963004e-05, "learning_rate": 0.006290914315169905, "loss": 2.5801, "step": 11821 }, { "crossentropy": 2.6564557552337646, "epoch": 0.42858178654292345, "grad_norm": 0.02814338728785515, "grad_norm_var": 3.200357143123319e-05, "learning_rate": 0.006290352919892102, "loss": 2.5604, "step": 11822 }, { "crossentropy": 2.670116662979126, "epoch": 0.42861803944315546, "grad_norm": 0.02937925048172474, "grad_norm_var": 3.134199719161978e-05, "learning_rate": 0.006289791507186123, "loss": 2.6444, "step": 11823 }, { "crossentropy": 2.640177011489868, "epoch": 0.4286542923433875, "grad_norm": 0.02904277667403221, "grad_norm_var": 3.11508220249385e-05, "learning_rate": 0.006289230077059553, "loss": 2.6755, "step": 11824 }, { "crossentropy": 2.487250328063965, "epoch": 0.4286905452436195, "grad_norm": 0.029181862249970436, "grad_norm_var": 3.1156582543915644e-05, "learning_rate": 0.006288668629519976, "loss": 2.5902, "step": 11825 }, { "crossentropy": 2.71148681640625, "epoch": 0.4287267981438515, "grad_norm": 0.030319835990667343, "grad_norm_var": 3.10722668542525e-05, "learning_rate": 0.006288107164574971, "loss": 2.5805, "step": 11826 }, { "crossentropy": 2.5555996894836426, "epoch": 0.4287630510440835, "grad_norm": 0.028343290090560913, "grad_norm_var": 3.104238065192097e-05, "learning_rate": 0.006287545682232125, "loss": 2.5967, "step": 11827 }, { "crossentropy": 2.6102912425994873, "epoch": 0.42879930394431554, "grad_norm": 0.028059540316462517, "grad_norm_var": 3.100501721824664e-05, "learning_rate": 0.006286984182499022, "loss": 2.5986, "step": 11828 }, { "crossentropy": 2.5880606174468994, "epoch": 0.42883555684454755, "grad_norm": 0.0282893106341362, "grad_norm_var": 3.071737139351802e-05, "learning_rate": 0.006286422665383242, "loss": 2.5689, "step": 11829 }, { "crossentropy": 2.6229734420776367, "epoch": 0.42887180974477956, "grad_norm": 0.027385789901018143, "grad_norm_var": 3.0661395939812244e-05, "learning_rate": 0.006285861130892372, "loss": 2.6055, "step": 11830 }, { "crossentropy": 2.6701154708862305, "epoch": 0.4289080626450116, "grad_norm": 0.027698371559381485, "grad_norm_var": 3.0638978988612345e-05, "learning_rate": 0.006285299579033998, "loss": 2.7217, "step": 11831 }, { "crossentropy": 2.561838388442993, "epoch": 0.4289443155452436, "grad_norm": 0.026136020198464394, "grad_norm_var": 2.0886740741239232e-06, "learning_rate": 0.006284738009815703, "loss": 2.4918, "step": 11832 }, { "crossentropy": 2.5323593616485596, "epoch": 0.42898056844547566, "grad_norm": 0.027390126138925552, "grad_norm_var": 2.00729512125399e-06, "learning_rate": 0.006284176423245069, "loss": 2.5101, "step": 11833 }, { "crossentropy": 2.5729756355285645, "epoch": 0.4290168213457077, "grad_norm": 0.027222145348787308, "grad_norm_var": 1.8711957971117756e-06, "learning_rate": 0.006283614819329684, "loss": 2.5998, "step": 11834 }, { "crossentropy": 2.514430046081543, "epoch": 0.4290530742459397, "grad_norm": 0.027716828510165215, "grad_norm_var": 1.8743561348339803e-06, "learning_rate": 0.006283053198077136, "loss": 2.5227, "step": 11835 }, { "crossentropy": 2.677277088165283, "epoch": 0.4290893271461717, "grad_norm": 0.026510652154684067, "grad_norm_var": 1.1957304127650665e-06, "learning_rate": 0.006282491559495005, "loss": 2.5978, "step": 11836 }, { "crossentropy": 2.686530828475952, "epoch": 0.4291255800464037, "grad_norm": 0.031401295214891434, "grad_norm_var": 1.8444891463094733e-06, "learning_rate": 0.0062819299035908795, "loss": 2.6483, "step": 11837 }, { "crossentropy": 2.622889995574951, "epoch": 0.42916183294663574, "grad_norm": 0.0302151907235384, "grad_norm_var": 2.0795048531039096e-06, "learning_rate": 0.006281368230372344, "loss": 2.5864, "step": 11838 }, { "crossentropy": 2.699962615966797, "epoch": 0.42919808584686775, "grad_norm": 0.027642346918582916, "grad_norm_var": 2.0397160863361385e-06, "learning_rate": 0.006280806539846988, "loss": 2.7904, "step": 11839 }, { "crossentropy": 2.5976791381835938, "epoch": 0.42923433874709976, "grad_norm": 0.026724128052592278, "grad_norm_var": 2.1413660380132165e-06, "learning_rate": 0.0062802448320223945, "loss": 2.5347, "step": 11840 }, { "crossentropy": 2.6925699710845947, "epoch": 0.4292705916473318, "grad_norm": 0.028579112142324448, "grad_norm_var": 2.0803253528357677e-06, "learning_rate": 0.0062796831069061535, "loss": 2.644, "step": 11841 }, { "crossentropy": 2.5722475051879883, "epoch": 0.4293068445475638, "grad_norm": 0.02945542521774769, "grad_norm_var": 1.8714238413064868e-06, "learning_rate": 0.006279121364505848, "loss": 2.5914, "step": 11842 }, { "crossentropy": 2.5483992099761963, "epoch": 0.4293430974477958, "grad_norm": 0.026697222143411636, "grad_norm_var": 1.975982643514368e-06, "learning_rate": 0.006278559604829069, "loss": 2.5672, "step": 11843 }, { "crossentropy": 2.4847278594970703, "epoch": 0.4293793503480278, "grad_norm": 0.027041256427764893, "grad_norm_var": 2.02526749273472e-06, "learning_rate": 0.006277997827883401, "loss": 2.5415, "step": 11844 }, { "crossentropy": 2.6789159774780273, "epoch": 0.42941560324825984, "grad_norm": 0.030214156955480576, "grad_norm_var": 2.361475527735028e-06, "learning_rate": 0.006277436033676435, "loss": 2.621, "step": 11845 }, { "crossentropy": 2.6038169860839844, "epoch": 0.4294518561484919, "grad_norm": 0.029941627755761147, "grad_norm_var": 2.5597948016518763e-06, "learning_rate": 0.006276874222215754, "loss": 2.5839, "step": 11846 }, { "crossentropy": 2.688828706741333, "epoch": 0.4294881090487239, "grad_norm": 0.028815612196922302, "grad_norm_var": 2.5688011043851582e-06, "learning_rate": 0.00627631239350895, "loss": 2.6121, "step": 11847 }, { "crossentropy": 2.6676278114318848, "epoch": 0.42952436194895594, "grad_norm": 0.026376847177743912, "grad_norm_var": 2.5051412607588468e-06, "learning_rate": 0.006275750547563611, "loss": 2.5857, "step": 11848 }, { "crossentropy": 2.733910083770752, "epoch": 0.42956061484918795, "grad_norm": 0.029186086729168892, "grad_norm_var": 2.5016653222689295e-06, "learning_rate": 0.006275188684387326, "loss": 2.5904, "step": 11849 }, { "crossentropy": 2.7958643436431885, "epoch": 0.42959686774941996, "grad_norm": 0.05990483984351158, "grad_norm_var": 6.430861717554813e-05, "learning_rate": 0.006274626803987679, "loss": 2.6754, "step": 11850 }, { "crossentropy": 2.8486883640289307, "epoch": 0.429633120649652, "grad_norm": 0.03329513221979141, "grad_norm_var": 6.42567345730354e-05, "learning_rate": 0.006274064906372265, "loss": 2.7814, "step": 11851 }, { "crossentropy": 2.824500322341919, "epoch": 0.429669373549884, "grad_norm": 0.030244138091802597, "grad_norm_var": 6.301754849947828e-05, "learning_rate": 0.006273502991548671, "loss": 2.6467, "step": 11852 }, { "crossentropy": 2.615994930267334, "epoch": 0.429705626450116, "grad_norm": 0.02764955721795559, "grad_norm_var": 6.368822573365826e-05, "learning_rate": 0.006272941059524487, "loss": 2.5991, "step": 11853 }, { "crossentropy": 2.6883487701416016, "epoch": 0.429741879350348, "grad_norm": 0.02741379104554653, "grad_norm_var": 6.437807334168793e-05, "learning_rate": 0.0062723791103073, "loss": 2.6737, "step": 11854 }, { "crossentropy": 2.5417048931121826, "epoch": 0.42977813225058004, "grad_norm": 0.02628607489168644, "grad_norm_var": 6.502315892316298e-05, "learning_rate": 0.006271817143904703, "loss": 2.562, "step": 11855 }, { "crossentropy": 2.7193007469177246, "epoch": 0.42981438515081205, "grad_norm": 0.028161412104964256, "grad_norm_var": 6.443076638455481e-05, "learning_rate": 0.006271255160324288, "loss": 2.6383, "step": 11856 }, { "crossentropy": 2.5936739444732666, "epoch": 0.42985063805104406, "grad_norm": 0.027265774086117744, "grad_norm_var": 6.488875509047994e-05, "learning_rate": 0.006270693159573641, "loss": 2.5376, "step": 11857 }, { "crossentropy": 2.8734755516052246, "epoch": 0.4298868909512761, "grad_norm": 0.028785306960344315, "grad_norm_var": 6.500986800656998e-05, "learning_rate": 0.006270131141660353, "loss": 2.7149, "step": 11858 }, { "crossentropy": 2.579096794128418, "epoch": 0.4299231438515081, "grad_norm": 0.029220370575785637, "grad_norm_var": 6.414359401896852e-05, "learning_rate": 0.0062695691065920165, "loss": 2.5534, "step": 11859 }, { "crossentropy": 2.559000015258789, "epoch": 0.42995939675174016, "grad_norm": 0.028008177876472473, "grad_norm_var": 6.374159666379935e-05, "learning_rate": 0.006269007054376226, "loss": 2.506, "step": 11860 }, { "crossentropy": 2.662214994430542, "epoch": 0.4299956496519722, "grad_norm": 0.02753230184316635, "grad_norm_var": 6.435521199209145e-05, "learning_rate": 0.0062684449850205675, "loss": 2.6014, "step": 11861 }, { "crossentropy": 2.5171327590942383, "epoch": 0.4300319025522042, "grad_norm": 0.028592921793460846, "grad_norm_var": 6.457028902193486e-05, "learning_rate": 0.006267882898532633, "loss": 2.6219, "step": 11862 }, { "crossentropy": 2.4571919441223145, "epoch": 0.4300681554524362, "grad_norm": 0.028227832168340683, "grad_norm_var": 6.471770866007229e-05, "learning_rate": 0.006267320794920018, "loss": 2.5011, "step": 11863 }, { "crossentropy": 2.467390775680542, "epoch": 0.4301044083526682, "grad_norm": 0.02868039347231388, "grad_norm_var": 6.381847305171305e-05, "learning_rate": 0.006266758674190313, "loss": 2.4509, "step": 11864 }, { "crossentropy": 2.627908706665039, "epoch": 0.43014066125290024, "grad_norm": 0.027448270469903946, "grad_norm_var": 6.431824504336558e-05, "learning_rate": 0.0062661965363511075, "loss": 2.5901, "step": 11865 }, { "crossentropy": 2.6937320232391357, "epoch": 0.43017691415313225, "grad_norm": 0.028885802254080772, "grad_norm_var": 2.5080664973368854e-06, "learning_rate": 0.006265634381409997, "loss": 2.6797, "step": 11866 }, { "crossentropy": 2.548785448074341, "epoch": 0.43021316705336426, "grad_norm": 0.03161805495619774, "grad_norm_var": 1.6073812663448816e-06, "learning_rate": 0.006265072209374574, "loss": 2.5362, "step": 11867 }, { "crossentropy": 2.613095283508301, "epoch": 0.4302494199535963, "grad_norm": 0.029055511578917503, "grad_norm_var": 1.399655601210881e-06, "learning_rate": 0.006264510020252431, "loss": 2.624, "step": 11868 }, { "crossentropy": 2.7283740043640137, "epoch": 0.4302856728538283, "grad_norm": 0.030372152104973793, "grad_norm_var": 1.6261034342753087e-06, "learning_rate": 0.00626394781405116, "loss": 2.641, "step": 11869 }, { "crossentropy": 2.7123191356658936, "epoch": 0.4303219257540603, "grad_norm": 0.029099252074956894, "grad_norm_var": 1.565812614905994e-06, "learning_rate": 0.006263385590778357, "loss": 2.6505, "step": 11870 }, { "crossentropy": 2.6057636737823486, "epoch": 0.4303581786542923, "grad_norm": 0.027870692312717438, "grad_norm_var": 1.238618429095931e-06, "learning_rate": 0.006262823350441613, "loss": 2.6184, "step": 11871 }, { "crossentropy": 2.664311408996582, "epoch": 0.43039443155452434, "grad_norm": 0.027528386563062668, "grad_norm_var": 1.3071398806930682e-06, "learning_rate": 0.0062622610930485245, "loss": 2.673, "step": 11872 }, { "crossentropy": 2.5670175552368164, "epoch": 0.4304306844547564, "grad_norm": 0.02778605744242668, "grad_norm_var": 1.2289382982666087e-06, "learning_rate": 0.006261698818606685, "loss": 2.6059, "step": 11873 }, { "crossentropy": 2.573481798171997, "epoch": 0.4304669373549884, "grad_norm": 0.03120771236717701, "grad_norm_var": 1.6331058903596337e-06, "learning_rate": 0.006261136527123686, "loss": 2.5757, "step": 11874 }, { "crossentropy": 2.6513500213623047, "epoch": 0.43050319025522044, "grad_norm": 0.03096902184188366, "grad_norm_var": 1.917362616156206e-06, "learning_rate": 0.006260574218607126, "loss": 2.6324, "step": 11875 }, { "crossentropy": 2.3972272872924805, "epoch": 0.43053944315545245, "grad_norm": 0.030960720032453537, "grad_norm_var": 2.0992483430459205e-06, "learning_rate": 0.006260011893064597, "loss": 2.4451, "step": 11876 }, { "crossentropy": 2.61071515083313, "epoch": 0.43057569605568446, "grad_norm": 0.027701830491423607, "grad_norm_var": 2.065276512869931e-06, "learning_rate": 0.0062594495505036984, "loss": 2.6248, "step": 11877 }, { "crossentropy": 2.577867031097412, "epoch": 0.4306119489559165, "grad_norm": 0.02849307283759117, "grad_norm_var": 2.0729871241579316e-06, "learning_rate": 0.006258887190932019, "loss": 2.6166, "step": 11878 }, { "crossentropy": 2.535583019256592, "epoch": 0.4306482018561485, "grad_norm": 0.032337404787540436, "grad_norm_var": 2.640188500390101e-06, "learning_rate": 0.006258324814357159, "loss": 2.5558, "step": 11879 }, { "crossentropy": 2.5640692710876465, "epoch": 0.4306844547563805, "grad_norm": 0.032715510576963425, "grad_norm_var": 3.283632922832363e-06, "learning_rate": 0.006257762420786712, "loss": 2.6242, "step": 11880 }, { "crossentropy": 2.567004442214966, "epoch": 0.4307207076566125, "grad_norm": 0.031206907704472542, "grad_norm_var": 3.074172028208694e-06, "learning_rate": 0.006257200010228277, "loss": 2.6363, "step": 11881 }, { "crossentropy": 2.783754587173462, "epoch": 0.43075696055684454, "grad_norm": 0.029389141127467155, "grad_norm_var": 3.0244244790185536e-06, "learning_rate": 0.006256637582689445, "loss": 2.6794, "step": 11882 }, { "crossentropy": 2.63816499710083, "epoch": 0.43079321345707655, "grad_norm": 0.027118457481265068, "grad_norm_var": 3.255761180210627e-06, "learning_rate": 0.006256075138177817, "loss": 2.5862, "step": 11883 }, { "crossentropy": 2.6871025562286377, "epoch": 0.43082946635730857, "grad_norm": 0.02781662717461586, "grad_norm_var": 3.4438163901326457e-06, "learning_rate": 0.006255512676700987, "loss": 2.7003, "step": 11884 }, { "crossentropy": 2.4985945224761963, "epoch": 0.4308657192575406, "grad_norm": 0.027218254283070564, "grad_norm_var": 3.713809674222657e-06, "learning_rate": 0.0062549501982665555, "loss": 2.5631, "step": 11885 }, { "crossentropy": 2.5765457153320312, "epoch": 0.4309019721577726, "grad_norm": 0.02875741571187973, "grad_norm_var": 3.7320261016016934e-06, "learning_rate": 0.006254387702882114, "loss": 2.6211, "step": 11886 }, { "crossentropy": 2.5056464672088623, "epoch": 0.43093822505800466, "grad_norm": 0.02779599465429783, "grad_norm_var": 3.7467828532015504e-06, "learning_rate": 0.0062538251905552645, "loss": 2.5183, "step": 11887 }, { "crossentropy": 2.721418857574463, "epoch": 0.4309744779582367, "grad_norm": 0.029872719198465347, "grad_norm_var": 3.5325531400729377e-06, "learning_rate": 0.006253262661293603, "loss": 2.6586, "step": 11888 }, { "crossentropy": 2.497248888015747, "epoch": 0.4310107308584687, "grad_norm": 0.027706854045391083, "grad_norm_var": 3.550614124422866e-06, "learning_rate": 0.006252700115104728, "loss": 2.576, "step": 11889 }, { "crossentropy": 2.689537286758423, "epoch": 0.4310469837587007, "grad_norm": 0.03859807923436165, "grad_norm_var": 8.692061872524988e-06, "learning_rate": 0.006252137551996236, "loss": 2.7503, "step": 11890 }, { "crossentropy": 2.597721576690674, "epoch": 0.4310832366589327, "grad_norm": 0.0301998071372509, "grad_norm_var": 8.621055483642482e-06, "learning_rate": 0.006251574971975727, "loss": 2.541, "step": 11891 }, { "crossentropy": 2.5615921020507812, "epoch": 0.43111948955916474, "grad_norm": 0.03240406885743141, "grad_norm_var": 8.961539549732998e-06, "learning_rate": 0.006251012375050797, "loss": 2.53, "step": 11892 }, { "crossentropy": 2.593622922897339, "epoch": 0.43115574245939675, "grad_norm": 0.03193459287285805, "grad_norm_var": 8.807850078860674e-06, "learning_rate": 0.006250449761229048, "loss": 2.581, "step": 11893 }, { "crossentropy": 2.5980308055877686, "epoch": 0.43119199535962877, "grad_norm": 0.03334273770451546, "grad_norm_var": 9.15931920296846e-06, "learning_rate": 0.006249887130518076, "loss": 2.6559, "step": 11894 }, { "crossentropy": 2.606703519821167, "epoch": 0.4312282482598608, "grad_norm": 0.03080008551478386, "grad_norm_var": 8.935715993001524e-06, "learning_rate": 0.006249324482925482, "loss": 2.6361, "step": 11895 }, { "crossentropy": 2.6803600788116455, "epoch": 0.4312645011600928, "grad_norm": 0.029669953510165215, "grad_norm_var": 8.587272845797753e-06, "learning_rate": 0.006248761818458866, "loss": 2.5896, "step": 11896 }, { "crossentropy": 2.6145973205566406, "epoch": 0.4313007540603248, "grad_norm": 0.030582938343286514, "grad_norm_var": 8.53112050742826e-06, "learning_rate": 0.006248199137125827, "loss": 2.6294, "step": 11897 }, { "crossentropy": 2.591993808746338, "epoch": 0.4313370069605568, "grad_norm": 0.027084948495030403, "grad_norm_var": 9.112217027838982e-06, "learning_rate": 0.006247636438933962, "loss": 2.6291, "step": 11898 }, { "crossentropy": 2.709146022796631, "epoch": 0.43137325986078884, "grad_norm": 0.026139330118894577, "grad_norm_var": 9.555693753082682e-06, "learning_rate": 0.006247073723890875, "loss": 2.6553, "step": 11899 }, { "crossentropy": 2.6280124187469482, "epoch": 0.4314095127610209, "grad_norm": 0.028031861409544945, "grad_norm_var": 9.496066488727745e-06, "learning_rate": 0.006246510992004165, "loss": 2.536, "step": 11900 }, { "crossentropy": 2.5698325634002686, "epoch": 0.4314457656612529, "grad_norm": 0.028973406180739403, "grad_norm_var": 9.035574110981949e-06, "learning_rate": 0.006245948243281432, "loss": 2.5955, "step": 11901 }, { "crossentropy": 2.432431936264038, "epoch": 0.43148201856148494, "grad_norm": 0.031710147857666016, "grad_norm_var": 9.044662362029166e-06, "learning_rate": 0.006245385477730278, "loss": 2.6235, "step": 11902 }, { "crossentropy": 2.702094078063965, "epoch": 0.43151827146171695, "grad_norm": 0.03014977090060711, "grad_norm_var": 8.604147472412758e-06, "learning_rate": 0.006244822695358302, "loss": 2.6547, "step": 11903 }, { "crossentropy": 2.5307669639587402, "epoch": 0.43155452436194897, "grad_norm": 0.026893608272075653, "grad_norm_var": 9.38817812234185e-06, "learning_rate": 0.0062442598961731065, "loss": 2.6013, "step": 11904 }, { "crossentropy": 2.6459717750549316, "epoch": 0.431590777262181, "grad_norm": 0.027210423722863197, "grad_norm_var": 9.57283263343511e-06, "learning_rate": 0.006243697080182293, "loss": 2.6162, "step": 11905 }, { "crossentropy": 2.742539644241333, "epoch": 0.431627030162413, "grad_norm": 0.03270864486694336, "grad_norm_var": 5.171817842946717e-06, "learning_rate": 0.006243134247393464, "loss": 2.7346, "step": 11906 }, { "crossentropy": 2.5018577575683594, "epoch": 0.431663283062645, "grad_norm": 0.031079895794391632, "grad_norm_var": 5.259542538007105e-06, "learning_rate": 0.00624257139781422, "loss": 2.5867, "step": 11907 }, { "crossentropy": 2.4810187816619873, "epoch": 0.431699535962877, "grad_norm": 0.02817741595208645, "grad_norm_var": 4.976047173165414e-06, "learning_rate": 0.006242008531452162, "loss": 2.5833, "step": 11908 }, { "crossentropy": 2.66291880607605, "epoch": 0.43173578886310904, "grad_norm": 0.026940489187836647, "grad_norm_var": 5.017333977197444e-06, "learning_rate": 0.0062414456483148965, "loss": 2.596, "step": 11909 }, { "crossentropy": 2.664341688156128, "epoch": 0.43177204176334105, "grad_norm": 0.02908291108906269, "grad_norm_var": 3.879979943286144e-06, "learning_rate": 0.006240882748410022, "loss": 2.6715, "step": 11910 }, { "crossentropy": 2.716520071029663, "epoch": 0.43180829466357307, "grad_norm": 0.029896141961216927, "grad_norm_var": 3.7234021526187656e-06, "learning_rate": 0.0062403198317451435, "loss": 2.6883, "step": 11911 }, { "crossentropy": 2.5582830905914307, "epoch": 0.4318445475638051, "grad_norm": 0.026343325152993202, "grad_norm_var": 4.1270980450461636e-06, "learning_rate": 0.006239756898327863, "loss": 2.5793, "step": 11912 }, { "crossentropy": 2.480854034423828, "epoch": 0.4318808004640371, "grad_norm": 0.028231503441929817, "grad_norm_var": 3.917702915472151e-06, "learning_rate": 0.006239193948165784, "loss": 2.574, "step": 11913 }, { "crossentropy": 2.760291814804077, "epoch": 0.43191705336426917, "grad_norm": 0.03162984922528267, "grad_norm_var": 4.2506966945988005e-06, "learning_rate": 0.00623863098126651, "loss": 2.643, "step": 11914 }, { "crossentropy": 2.565110445022583, "epoch": 0.4319533062645012, "grad_norm": 0.030028339475393295, "grad_norm_var": 3.7385830931564606e-06, "learning_rate": 0.006238067997637646, "loss": 2.5104, "step": 11915 }, { "crossentropy": 2.6044068336486816, "epoch": 0.4319895591647332, "grad_norm": 0.026980174705386162, "grad_norm_var": 3.9705291095846916e-06, "learning_rate": 0.006237504997286794, "loss": 2.5914, "step": 11916 }, { "crossentropy": 2.6947391033172607, "epoch": 0.4320258120649652, "grad_norm": 0.026722554117441177, "grad_norm_var": 4.3333465678332125e-06, "learning_rate": 0.0062369419802215595, "loss": 2.6012, "step": 11917 }, { "crossentropy": 2.5700907707214355, "epoch": 0.4320620649651972, "grad_norm": 0.026844311505556107, "grad_norm_var": 4.0461244647983175e-06, "learning_rate": 0.006236378946449545, "loss": 2.5653, "step": 11918 }, { "crossentropy": 2.4756221771240234, "epoch": 0.43209831786542924, "grad_norm": 0.030648531392216682, "grad_norm_var": 4.1592503216459116e-06, "learning_rate": 0.0062358158959783605, "loss": 2.5831, "step": 11919 }, { "crossentropy": 2.59979510307312, "epoch": 0.43213457076566125, "grad_norm": 0.03105025924742222, "grad_norm_var": 4.2304154702932705e-06, "learning_rate": 0.006235252828815604, "loss": 2.6174, "step": 11920 }, { "crossentropy": 2.6702656745910645, "epoch": 0.43217082366589327, "grad_norm": 0.027979684993624687, "grad_norm_var": 4.0865730249475355e-06, "learning_rate": 0.006234689744968883, "loss": 2.5575, "step": 11921 }, { "crossentropy": 2.5376474857330322, "epoch": 0.4322070765661253, "grad_norm": 0.030986791476607323, "grad_norm_var": 3.4253757939750463e-06, "learning_rate": 0.006234126644445804, "loss": 2.6065, "step": 11922 }, { "crossentropy": 2.585970640182495, "epoch": 0.4322433294663573, "grad_norm": 0.02627420984208584, "grad_norm_var": 3.480901115991639e-06, "learning_rate": 0.006233563527253972, "loss": 2.5061, "step": 11923 }, { "crossentropy": 2.490778684616089, "epoch": 0.4322795823665893, "grad_norm": 0.02655925787985325, "grad_norm_var": 3.7386470387488583e-06, "learning_rate": 0.006233000393400993, "loss": 2.5451, "step": 11924 }, { "crossentropy": 2.4507875442504883, "epoch": 0.4323158352668213, "grad_norm": 0.02556559257209301, "grad_norm_var": 4.144954565291012e-06, "learning_rate": 0.006232437242894472, "loss": 2.5451, "step": 11925 }, { "crossentropy": 2.545919179916382, "epoch": 0.43235208816705334, "grad_norm": 0.027646424248814583, "grad_norm_var": 4.148192789153679e-06, "learning_rate": 0.006231874075742014, "loss": 2.5639, "step": 11926 }, { "crossentropy": 2.6214969158172607, "epoch": 0.4323883410672854, "grad_norm": 0.028797265142202377, "grad_norm_var": 3.995176536386114e-06, "learning_rate": 0.006231310891951231, "loss": 2.6395, "step": 11927 }, { "crossentropy": 2.574615716934204, "epoch": 0.4324245939675174, "grad_norm": 0.028102818876504898, "grad_norm_var": 3.737136972000593e-06, "learning_rate": 0.006230747691529723, "loss": 2.6379, "step": 11928 }, { "crossentropy": 2.480198860168457, "epoch": 0.43246084686774944, "grad_norm": 0.032664816826581955, "grad_norm_var": 4.8789492960161445e-06, "learning_rate": 0.006230184474485101, "loss": 2.4954, "step": 11929 }, { "crossentropy": 2.690537691116333, "epoch": 0.43249709976798145, "grad_norm": 0.03413194790482521, "grad_norm_var": 6.262660870543163e-06, "learning_rate": 0.006229621240824969, "loss": 2.7124, "step": 11930 }, { "crossentropy": 2.6250879764556885, "epoch": 0.43253335266821347, "grad_norm": 0.030519969761371613, "grad_norm_var": 6.357536004921805e-06, "learning_rate": 0.006229057990556938, "loss": 2.707, "step": 11931 }, { "crossentropy": 2.526312828063965, "epoch": 0.4325696055684455, "grad_norm": 0.0262538343667984, "grad_norm_var": 6.570834124089431e-06, "learning_rate": 0.006228494723688614, "loss": 2.5364, "step": 11932 }, { "crossentropy": 2.625525712966919, "epoch": 0.4326058584686775, "grad_norm": 0.027520664036273956, "grad_norm_var": 6.389918701480273e-06, "learning_rate": 0.006227931440227603, "loss": 2.5971, "step": 11933 }, { "crossentropy": 2.6900157928466797, "epoch": 0.4326421113689095, "grad_norm": 0.0279131680727005, "grad_norm_var": 6.175960589953807e-06, "learning_rate": 0.006227368140181513, "loss": 2.5241, "step": 11934 }, { "crossentropy": 2.6437413692474365, "epoch": 0.4326783642691415, "grad_norm": 0.02778085321187973, "grad_norm_var": 6.026514426398982e-06, "learning_rate": 0.006226804823557957, "loss": 2.6314, "step": 11935 }, { "crossentropy": 2.5242457389831543, "epoch": 0.43271461716937354, "grad_norm": 0.026590295135974884, "grad_norm_var": 5.8924605914555926e-06, "learning_rate": 0.006226241490364539, "loss": 2.5959, "step": 11936 }, { "crossentropy": 2.611205816268921, "epoch": 0.43275087006960555, "grad_norm": 0.02732633240520954, "grad_norm_var": 5.96058772861068e-06, "learning_rate": 0.006225678140608868, "loss": 2.604, "step": 11937 }, { "crossentropy": 2.7316982746124268, "epoch": 0.43278712296983757, "grad_norm": 0.030538497492671013, "grad_norm_var": 5.8194041993221235e-06, "learning_rate": 0.006225114774298554, "loss": 2.7073, "step": 11938 }, { "crossentropy": 2.697094678878784, "epoch": 0.4328233758700696, "grad_norm": 0.02937508560717106, "grad_norm_var": 5.546991693203216e-06, "learning_rate": 0.006224551391441206, "loss": 2.668, "step": 11939 }, { "crossentropy": 2.542799949645996, "epoch": 0.4328596287703016, "grad_norm": 0.029688548296689987, "grad_norm_var": 5.3157105497198076e-06, "learning_rate": 0.006223987992044433, "loss": 2.5938, "step": 11940 }, { "crossentropy": 2.53542160987854, "epoch": 0.43289588167053367, "grad_norm": 0.028460880741477013, "grad_norm_var": 4.600285529878173e-06, "learning_rate": 0.006223424576115846, "loss": 2.61, "step": 11941 }, { "crossentropy": 2.6880667209625244, "epoch": 0.4329321345707657, "grad_norm": 0.02794838696718216, "grad_norm_var": 4.553219874850409e-06, "learning_rate": 0.00622286114366305, "loss": 2.6886, "step": 11942 }, { "crossentropy": 2.4480998516082764, "epoch": 0.4329683874709977, "grad_norm": 0.028564106673002243, "grad_norm_var": 4.562168907042083e-06, "learning_rate": 0.006222297694693662, "loss": 2.6314, "step": 11943 }, { "crossentropy": 2.520195960998535, "epoch": 0.4330046403712297, "grad_norm": 0.027947435155510902, "grad_norm_var": 4.5814630101930485e-06, "learning_rate": 0.006221734229215288, "loss": 2.5177, "step": 11944 }, { "crossentropy": 2.522648572921753, "epoch": 0.4330408932714617, "grad_norm": 0.027997316792607307, "grad_norm_var": 3.6321714382739343e-06, "learning_rate": 0.006221170747235538, "loss": 2.592, "step": 11945 }, { "crossentropy": 2.8007500171661377, "epoch": 0.43307714617169374, "grad_norm": 0.03293650597333908, "grad_norm_var": 2.8492762247621256e-06, "learning_rate": 0.0062206072487620235, "loss": 2.7592, "step": 11946 }, { "crossentropy": 2.8897433280944824, "epoch": 0.43311339907192575, "grad_norm": 0.03381442651152611, "grad_norm_var": 4.377521481474754e-06, "learning_rate": 0.006220043733802357, "loss": 2.6652, "step": 11947 }, { "crossentropy": 2.65689754486084, "epoch": 0.43314965197215777, "grad_norm": 0.028883541002869606, "grad_norm_var": 3.920123725067757e-06, "learning_rate": 0.006219480202364149, "loss": 2.704, "step": 11948 }, { "crossentropy": 2.5275042057037354, "epoch": 0.4331859048723898, "grad_norm": 0.02626633271574974, "grad_norm_var": 4.25840543016435e-06, "learning_rate": 0.00621891665445501, "loss": 2.474, "step": 11949 }, { "crossentropy": 2.4998726844787598, "epoch": 0.4332221577726218, "grad_norm": 0.027119718492031097, "grad_norm_var": 4.3997181091712564e-06, "learning_rate": 0.00621835309008255, "loss": 2.5364, "step": 11950 }, { "crossentropy": 2.6746063232421875, "epoch": 0.4332584106728538, "grad_norm": 0.028150297701358795, "grad_norm_var": 4.356696981493606e-06, "learning_rate": 0.006217789509254385, "loss": 2.6306, "step": 11951 }, { "crossentropy": 2.527776002883911, "epoch": 0.4332946635730858, "grad_norm": 0.02628866583108902, "grad_norm_var": 4.453281714471277e-06, "learning_rate": 0.006217225911978124, "loss": 2.5621, "step": 11952 }, { "crossentropy": 2.5145065784454346, "epoch": 0.43333091647331784, "grad_norm": 0.02770448476076126, "grad_norm_var": 4.3863215923880514e-06, "learning_rate": 0.0062166622982613795, "loss": 2.5924, "step": 11953 }, { "crossentropy": 2.601440668106079, "epoch": 0.4333671693735499, "grad_norm": 0.028516460210084915, "grad_norm_var": 4.1880532618018915e-06, "learning_rate": 0.006216098668111763, "loss": 2.5534, "step": 11954 }, { "crossentropy": 2.4821267127990723, "epoch": 0.4334034222737819, "grad_norm": 0.02955312840640545, "grad_norm_var": 4.205374596591643e-06, "learning_rate": 0.006215535021536889, "loss": 2.6126, "step": 11955 }, { "crossentropy": 2.6030468940734863, "epoch": 0.43343967517401394, "grad_norm": 0.028995783999562263, "grad_norm_var": 4.147755056036796e-06, "learning_rate": 0.00621497135854437, "loss": 2.5308, "step": 11956 }, { "crossentropy": 2.62192964553833, "epoch": 0.43347592807424595, "grad_norm": 0.027379121631383896, "grad_norm_var": 4.2549084648666925e-06, "learning_rate": 0.006214407679141822, "loss": 2.6379, "step": 11957 }, { "crossentropy": 2.605722427368164, "epoch": 0.43351218097447797, "grad_norm": 0.026913069188594818, "grad_norm_var": 4.415869360358315e-06, "learning_rate": 0.006213843983336851, "loss": 2.6096, "step": 11958 }, { "crossentropy": 2.539052963256836, "epoch": 0.43354843387471, "grad_norm": 0.0276419036090374, "grad_norm_var": 4.46905904559236e-06, "learning_rate": 0.006213280271137077, "loss": 2.616, "step": 11959 }, { "crossentropy": 2.5049078464508057, "epoch": 0.433584686774942, "grad_norm": 0.026571616530418396, "grad_norm_var": 4.689968156311915e-06, "learning_rate": 0.006212716542550112, "loss": 2.5667, "step": 11960 }, { "crossentropy": 2.731001853942871, "epoch": 0.433620939675174, "grad_norm": 0.026349157094955444, "grad_norm_var": 4.9528015898289556e-06, "learning_rate": 0.00621215279758357, "loss": 2.6281, "step": 11961 }, { "crossentropy": 2.5583043098449707, "epoch": 0.433657192575406, "grad_norm": 0.031396426260471344, "grad_norm_var": 4.152611027290878e-06, "learning_rate": 0.006211589036245063, "loss": 2.5955, "step": 11962 }, { "crossentropy": 2.515428304672241, "epoch": 0.43369344547563804, "grad_norm": 0.029940590262413025, "grad_norm_var": 2.201717445453318e-06, "learning_rate": 0.006211025258542209, "loss": 2.5491, "step": 11963 }, { "crossentropy": 2.509089708328247, "epoch": 0.43372969837587005, "grad_norm": 0.028311148285865784, "grad_norm_var": 2.1531908830217663e-06, "learning_rate": 0.00621046146448262, "loss": 2.545, "step": 11964 }, { "crossentropy": 2.628437042236328, "epoch": 0.43376595127610207, "grad_norm": 0.02702953852713108, "grad_norm_var": 2.018914119263646e-06, "learning_rate": 0.006209897654073915, "loss": 2.6321, "step": 11965 }, { "crossentropy": 2.4280905723571777, "epoch": 0.4338022041763341, "grad_norm": 0.027809660881757736, "grad_norm_var": 1.9684848098894347e-06, "learning_rate": 0.006209333827323702, "loss": 2.4535, "step": 11966 }, { "crossentropy": 2.4003195762634277, "epoch": 0.43383845707656615, "grad_norm": 0.026323499158024788, "grad_norm_var": 2.148839743912739e-06, "learning_rate": 0.006208769984239602, "loss": 2.5086, "step": 11967 }, { "crossentropy": 2.5624759197235107, "epoch": 0.43387470997679817, "grad_norm": 0.026149628683924675, "grad_norm_var": 2.180295021392608e-06, "learning_rate": 0.00620820612482923, "loss": 2.5641, "step": 11968 }, { "crossentropy": 2.50862717628479, "epoch": 0.4339109628770302, "grad_norm": 0.028640538454055786, "grad_norm_var": 2.2092108228142914e-06, "learning_rate": 0.006207642249100201, "loss": 2.5535, "step": 11969 }, { "crossentropy": 2.553539991378784, "epoch": 0.4339472157772622, "grad_norm": 0.027610808610916138, "grad_norm_var": 2.194496209336376e-06, "learning_rate": 0.006207078357060128, "loss": 2.6149, "step": 11970 }, { "crossentropy": 2.6387102603912354, "epoch": 0.4339834686774942, "grad_norm": 0.028588993474841118, "grad_norm_var": 2.041814010445457e-06, "learning_rate": 0.006206514448716632, "loss": 2.5646, "step": 11971 }, { "crossentropy": 2.6860556602478027, "epoch": 0.4340197215777262, "grad_norm": 0.029331441968679428, "grad_norm_var": 2.0999905128870715e-06, "learning_rate": 0.0062059505240773276, "loss": 2.6982, "step": 11972 }, { "crossentropy": 2.4958739280700684, "epoch": 0.43405597447795824, "grad_norm": 0.029238397255539894, "grad_norm_var": 2.193316398261729e-06, "learning_rate": 0.006205386583149832, "loss": 2.5333, "step": 11973 }, { "crossentropy": 2.5399246215820312, "epoch": 0.43409222737819025, "grad_norm": 0.027587121352553368, "grad_norm_var": 2.1248893187885675e-06, "learning_rate": 0.006204822625941758, "loss": 2.5391, "step": 11974 }, { "crossentropy": 2.639707326889038, "epoch": 0.43412848027842227, "grad_norm": 0.028410544618964195, "grad_norm_var": 2.1217814143587353e-06, "learning_rate": 0.006204258652460729, "loss": 2.7173, "step": 11975 }, { "crossentropy": 2.6673693656921387, "epoch": 0.4341647331786543, "grad_norm": 0.030516933649778366, "grad_norm_var": 2.300853817254024e-06, "learning_rate": 0.006203694662714358, "loss": 2.5407, "step": 11976 }, { "crossentropy": 2.610668420791626, "epoch": 0.4342009860788863, "grad_norm": 0.027751771733164787, "grad_norm_var": 2.0538965655724864e-06, "learning_rate": 0.0062031306567102654, "loss": 2.6115, "step": 11977 }, { "crossentropy": 2.389108657836914, "epoch": 0.4342372389791183, "grad_norm": 0.029860785230994225, "grad_norm_var": 1.590792413884998e-06, "learning_rate": 0.006202566634456066, "loss": 2.5019, "step": 11978 }, { "crossentropy": 2.5411834716796875, "epoch": 0.4342734918793503, "grad_norm": 0.03018806502223015, "grad_norm_var": 1.6481325276320684e-06, "learning_rate": 0.006202002595959381, "loss": 2.5641, "step": 11979 }, { "crossentropy": 2.4734182357788086, "epoch": 0.43430974477958234, "grad_norm": 0.02983229048550129, "grad_norm_var": 1.788053045614392e-06, "learning_rate": 0.006201438541227825, "loss": 2.5457, "step": 11980 }, { "crossentropy": 2.493192672729492, "epoch": 0.4343459976798144, "grad_norm": 0.027822740375995636, "grad_norm_var": 1.6793289421732863e-06, "learning_rate": 0.006200874470269019, "loss": 2.6217, "step": 11981 }, { "crossentropy": 2.5190422534942627, "epoch": 0.4343822505800464, "grad_norm": 0.028245992958545685, "grad_norm_var": 1.6522903295910444e-06, "learning_rate": 0.006200310383090581, "loss": 2.5102, "step": 11982 }, { "crossentropy": 2.6479530334472656, "epoch": 0.43441850348027844, "grad_norm": 0.027808688580989838, "grad_norm_var": 1.3579177927556043e-06, "learning_rate": 0.006199746279700129, "loss": 2.6325, "step": 11983 }, { "crossentropy": 2.631497383117676, "epoch": 0.43445475638051045, "grad_norm": 0.029452774673700333, "grad_norm_var": 9.610698897376343e-07, "learning_rate": 0.006199182160105283, "loss": 2.6623, "step": 11984 }, { "crossentropy": 2.5643723011016846, "epoch": 0.43449100928074247, "grad_norm": 0.027624808251857758, "grad_norm_var": 1.0478915423192073e-06, "learning_rate": 0.006198618024313663, "loss": 2.5804, "step": 11985 }, { "crossentropy": 2.5770652294158936, "epoch": 0.4345272621809745, "grad_norm": 0.026173502206802368, "grad_norm_var": 1.393791527612648e-06, "learning_rate": 0.006198053872332888, "loss": 2.63, "step": 11986 }, { "crossentropy": 2.5622589588165283, "epoch": 0.4345635150812065, "grad_norm": 0.026699747890233994, "grad_norm_var": 1.6327857876951198e-06, "learning_rate": 0.006197489704170576, "loss": 2.6033, "step": 11987 }, { "crossentropy": 2.6182265281677246, "epoch": 0.4345997679814385, "grad_norm": 0.026694266125559807, "grad_norm_var": 1.787090321422865e-06, "learning_rate": 0.006196925519834349, "loss": 2.6252, "step": 11988 }, { "crossentropy": 2.5999252796173096, "epoch": 0.4346360208816705, "grad_norm": 0.026478273794054985, "grad_norm_var": 1.943382318982676e-06, "learning_rate": 0.006196361319331826, "loss": 2.5202, "step": 11989 }, { "crossentropy": 2.659311056137085, "epoch": 0.43467227378190254, "grad_norm": 0.027056798338890076, "grad_norm_var": 2.004068011889071e-06, "learning_rate": 0.006195797102670628, "loss": 2.6162, "step": 11990 }, { "crossentropy": 2.5476889610290527, "epoch": 0.43470852668213456, "grad_norm": 0.026490790769457817, "grad_norm_var": 2.1712053640499568e-06, "learning_rate": 0.006195232869858375, "loss": 2.5365, "step": 11991 }, { "crossentropy": 2.596435546875, "epoch": 0.43474477958236657, "grad_norm": 0.026640037074685097, "grad_norm_var": 1.8321066606801488e-06, "learning_rate": 0.006194668620902689, "loss": 2.5726, "step": 11992 }, { "crossentropy": 2.485480308532715, "epoch": 0.4347810324825986, "grad_norm": 0.028106356039643288, "grad_norm_var": 1.8376216183004727e-06, "learning_rate": 0.00619410435581119, "loss": 2.6217, "step": 11993 }, { "crossentropy": 2.636939764022827, "epoch": 0.43481728538283065, "grad_norm": 0.026622701436281204, "grad_norm_var": 1.6133568615198101e-06, "learning_rate": 0.0061935400745915, "loss": 2.5843, "step": 11994 }, { "crossentropy": 2.610257863998413, "epoch": 0.43485353828306267, "grad_norm": 0.026478465646505356, "grad_norm_var": 1.2037796539877503e-06, "learning_rate": 0.00619297577725124, "loss": 2.5767, "step": 11995 }, { "crossentropy": 2.610891342163086, "epoch": 0.4348897911832947, "grad_norm": 0.03930116072297096, "grad_norm_var": 9.891857890826948e-06, "learning_rate": 0.006192411463798031, "loss": 2.5191, "step": 11996 }, { "crossentropy": 2.5407497882843018, "epoch": 0.4349260440835267, "grad_norm": 0.02805711328983307, "grad_norm_var": 9.890343330057208e-06, "learning_rate": 0.006191847134239496, "loss": 2.5684, "step": 11997 }, { "crossentropy": 2.4915075302124023, "epoch": 0.4349622969837587, "grad_norm": 0.026763498783111572, "grad_norm_var": 9.97823419286747e-06, "learning_rate": 0.006191282788583257, "loss": 2.5561, "step": 11998 }, { "crossentropy": 2.552212953567505, "epoch": 0.4349985498839907, "grad_norm": 0.028526104986667633, "grad_norm_var": 1.000137479809462e-05, "learning_rate": 0.006190718426836935, "loss": 2.5828, "step": 11999 }, { "crossentropy": 2.5496184825897217, "epoch": 0.43503480278422274, "grad_norm": 0.028077401220798492, "grad_norm_var": 9.843634433388685e-06, "learning_rate": 0.006190154049008154, "loss": 2.6088, "step": 12000 }, { "crossentropy": 2.585374116897583, "epoch": 0.43507105568445475, "grad_norm": 0.03060785122215748, "grad_norm_var": 1.0305477292979943e-05, "learning_rate": 0.006189589655104537, "loss": 2.5821, "step": 12001 }, { "crossentropy": 2.531008005142212, "epoch": 0.43510730858468677, "grad_norm": 0.03050374798476696, "grad_norm_var": 1.0394926064829812e-05, "learning_rate": 0.006189025245133707, "loss": 2.4953, "step": 12002 }, { "crossentropy": 2.6649670600891113, "epoch": 0.4351435614849188, "grad_norm": 0.030567169189453125, "grad_norm_var": 1.049474777504281e-05, "learning_rate": 0.006188460819103285, "loss": 2.5616, "step": 12003 }, { "crossentropy": 2.6313416957855225, "epoch": 0.4351798143851508, "grad_norm": 0.029708093032240868, "grad_norm_var": 1.0312416852510533e-05, "learning_rate": 0.006187896377020899, "loss": 2.5296, "step": 12004 }, { "crossentropy": 2.710240602493286, "epoch": 0.4352160672853828, "grad_norm": 0.028740141540765762, "grad_norm_var": 9.947329255895199e-06, "learning_rate": 0.006187331918894166, "loss": 2.7042, "step": 12005 }, { "crossentropy": 2.449031114578247, "epoch": 0.4352523201856148, "grad_norm": 0.07247412204742432, "grad_norm_var": 0.00012776413238359586, "learning_rate": 0.006186767444730717, "loss": 2.4868, "step": 12006 }, { "crossentropy": 2.50600004196167, "epoch": 0.43528857308584684, "grad_norm": 0.028411556035280228, "grad_norm_var": 0.0001266531879533742, "learning_rate": 0.006186202954538171, "loss": 2.5875, "step": 12007 }, { "crossentropy": 2.7486648559570312, "epoch": 0.4353248259860789, "grad_norm": 0.02984979934990406, "grad_norm_var": 0.00012506778713066758, "learning_rate": 0.006185638448324153, "loss": 2.674, "step": 12008 }, { "crossentropy": 2.636875867843628, "epoch": 0.4353610788863109, "grad_norm": 0.028550241142511368, "grad_norm_var": 0.00012484671589749636, "learning_rate": 0.006185073926096291, "loss": 2.5777, "step": 12009 }, { "crossentropy": 2.5615625381469727, "epoch": 0.43539733178654294, "grad_norm": 0.026934165507555008, "grad_norm_var": 0.00012462625133885218, "learning_rate": 0.006184509387862205, "loss": 2.6061, "step": 12010 }, { "crossentropy": 2.689988851547241, "epoch": 0.43543358468677495, "grad_norm": 0.028937362134456635, "grad_norm_var": 0.000123162112537301, "learning_rate": 0.006183944833629523, "loss": 2.6085, "step": 12011 }, { "crossentropy": 2.5319442749023438, "epoch": 0.43546983758700697, "grad_norm": 0.03179972991347313, "grad_norm_var": 0.00011962716876929795, "learning_rate": 0.006183380263405869, "loss": 2.521, "step": 12012 }, { "crossentropy": 2.5978755950927734, "epoch": 0.435506090487239, "grad_norm": 0.03026065044105053, "grad_norm_var": 0.0001188363237642511, "learning_rate": 0.006182815677198869, "loss": 2.6046, "step": 12013 }, { "crossentropy": 2.6166884899139404, "epoch": 0.435542343387471, "grad_norm": 0.02642630785703659, "grad_norm_var": 0.00011907523642154064, "learning_rate": 0.0061822510750161475, "loss": 2.611, "step": 12014 }, { "crossentropy": 2.623806953430176, "epoch": 0.435578596287703, "grad_norm": 0.026651067659258842, "grad_norm_var": 0.00012013806295129367, "learning_rate": 0.006181686456865333, "loss": 2.632, "step": 12015 }, { "crossentropy": 2.561737060546875, "epoch": 0.435614849187935, "grad_norm": 0.0269455723464489, "grad_norm_var": 0.0001207770718715181, "learning_rate": 0.006181121822754049, "loss": 2.6451, "step": 12016 }, { "crossentropy": 2.444610595703125, "epoch": 0.43565110208816704, "grad_norm": 0.028306713327765465, "grad_norm_var": 0.00012144632891156609, "learning_rate": 0.006180557172689921, "loss": 2.5085, "step": 12017 }, { "crossentropy": 2.662334442138672, "epoch": 0.43568735498839906, "grad_norm": 0.030017122626304626, "grad_norm_var": 0.0001215300940133101, "learning_rate": 0.006179992506680577, "loss": 2.6257, "step": 12018 }, { "crossentropy": 2.659452199935913, "epoch": 0.43572360788863107, "grad_norm": 0.027823857963085175, "grad_norm_var": 0.00012235491491383366, "learning_rate": 0.006179427824733645, "loss": 2.5861, "step": 12019 }, { "crossentropy": 2.560856342315674, "epoch": 0.4357598607888631, "grad_norm": 0.029346901923418045, "grad_norm_var": 0.00012244285275178415, "learning_rate": 0.00617886312685675, "loss": 2.5909, "step": 12020 }, { "crossentropy": 2.419055700302124, "epoch": 0.43579611368909515, "grad_norm": 0.03435134142637253, "grad_norm_var": 0.00012246393926587193, "learning_rate": 0.006178298413057519, "loss": 2.4829, "step": 12021 }, { "crossentropy": 2.4441514015197754, "epoch": 0.43583236658932717, "grad_norm": 0.028014861047267914, "grad_norm_var": 4.256032704798238e-06, "learning_rate": 0.006177733683343578, "loss": 2.4675, "step": 12022 }, { "crossentropy": 2.670315980911255, "epoch": 0.4358686194895592, "grad_norm": 0.02788161113858223, "grad_norm_var": 4.30910200882913e-06, "learning_rate": 0.0061771689377225596, "loss": 2.6181, "step": 12023 }, { "crossentropy": 2.6047756671905518, "epoch": 0.4359048723897912, "grad_norm": 0.028170380741357803, "grad_norm_var": 4.268462258529337e-06, "learning_rate": 0.006176604176202087, "loss": 2.6757, "step": 12024 }, { "crossentropy": 2.680532693862915, "epoch": 0.4359411252900232, "grad_norm": 0.02774936892092228, "grad_norm_var": 4.332669322958521e-06, "learning_rate": 0.006176039398789789, "loss": 2.6501, "step": 12025 }, { "crossentropy": 2.654787540435791, "epoch": 0.4359773781902552, "grad_norm": 0.027756869792938232, "grad_norm_var": 4.178411688667236e-06, "learning_rate": 0.006175474605493295, "loss": 2.6501, "step": 12026 }, { "crossentropy": 2.7420613765716553, "epoch": 0.43601363109048724, "grad_norm": 0.02887750416994095, "grad_norm_var": 4.177359615027918e-06, "learning_rate": 0.00617490979632023, "loss": 2.6854, "step": 12027 }, { "crossentropy": 2.723301649093628, "epoch": 0.43604988399071926, "grad_norm": 0.02905147708952427, "grad_norm_var": 3.5405912417623876e-06, "learning_rate": 0.00617434497127823, "loss": 2.6202, "step": 12028 }, { "crossentropy": 2.5683703422546387, "epoch": 0.43608613689095127, "grad_norm": 0.029329504817724228, "grad_norm_var": 3.3888517012418845e-06, "learning_rate": 0.006173780130374915, "loss": 2.6077, "step": 12029 }, { "crossentropy": 2.6461195945739746, "epoch": 0.4361223897911833, "grad_norm": 0.02878253161907196, "grad_norm_var": 3.0706071819147985e-06, "learning_rate": 0.006173215273617919, "loss": 2.6318, "step": 12030 }, { "crossentropy": 2.5667166709899902, "epoch": 0.4361586426914153, "grad_norm": 0.027948936447501183, "grad_norm_var": 2.82287011577938e-06, "learning_rate": 0.006172650401014871, "loss": 2.6217, "step": 12031 }, { "crossentropy": 2.715221881866455, "epoch": 0.4361948955916473, "grad_norm": 0.027946652844548225, "grad_norm_var": 2.6416971189934057e-06, "learning_rate": 0.0061720855125734, "loss": 2.6593, "step": 12032 }, { "crossentropy": 2.672231912612915, "epoch": 0.43623114849187933, "grad_norm": 0.029879441484808922, "grad_norm_var": 2.685566254641613e-06, "learning_rate": 0.006171520608301136, "loss": 2.5439, "step": 12033 }, { "crossentropy": 2.7119805812835693, "epoch": 0.43626740139211134, "grad_norm": 0.031047118827700615, "grad_norm_var": 2.9007545157178533e-06, "learning_rate": 0.0061709556882057065, "loss": 2.6456, "step": 12034 }, { "crossentropy": 2.6243128776550293, "epoch": 0.4363036542923434, "grad_norm": 0.0288394782692194, "grad_norm_var": 2.8063062236188905e-06, "learning_rate": 0.006170390752294743, "loss": 2.5944, "step": 12035 }, { "crossentropy": 2.755192995071411, "epoch": 0.4363399071925754, "grad_norm": 0.028681209310889244, "grad_norm_var": 2.8086153136929725e-06, "learning_rate": 0.006169825800575879, "loss": 2.6854, "step": 12036 }, { "crossentropy": 2.5752623081207275, "epoch": 0.43637616009280744, "grad_norm": 0.028016457334160805, "grad_norm_var": 8.130453061962513e-07, "learning_rate": 0.00616926083305674, "loss": 2.5712, "step": 12037 }, { "crossentropy": 2.5393247604370117, "epoch": 0.43641241299303946, "grad_norm": 0.028169482946395874, "grad_norm_var": 8.019950423320075e-07, "learning_rate": 0.006168695849744959, "loss": 2.5782, "step": 12038 }, { "crossentropy": 2.5209767818450928, "epoch": 0.43644866589327147, "grad_norm": 0.028286319226026535, "grad_norm_var": 7.71685992019067e-07, "learning_rate": 0.006168130850648168, "loss": 2.655, "step": 12039 }, { "crossentropy": 2.6487905979156494, "epoch": 0.4364849187935035, "grad_norm": 0.027575461193919182, "grad_norm_var": 8.325092705506465e-07, "learning_rate": 0.006167565835773998, "loss": 2.6324, "step": 12040 }, { "crossentropy": 2.4651198387145996, "epoch": 0.4365211716937355, "grad_norm": 0.026523876935243607, "grad_norm_var": 1.0688157987724049e-06, "learning_rate": 0.006167000805130079, "loss": 2.463, "step": 12041 }, { "crossentropy": 2.704348087310791, "epoch": 0.4365574245939675, "grad_norm": 0.027906564995646477, "grad_norm_var": 1.0544953424271308e-06, "learning_rate": 0.0061664357587240416, "loss": 2.6011, "step": 12042 }, { "crossentropy": 2.7947285175323486, "epoch": 0.43659367749419953, "grad_norm": 0.029781222343444824, "grad_norm_var": 1.1445353107590595e-06, "learning_rate": 0.00616587069656352, "loss": 2.8092, "step": 12043 }, { "crossentropy": 2.736722230911255, "epoch": 0.43662993039443154, "grad_norm": 0.029078278690576553, "grad_norm_var": 1.1461565641200048e-06, "learning_rate": 0.006165305618656145, "loss": 2.6323, "step": 12044 }, { "crossentropy": 2.437542676925659, "epoch": 0.43666618329466356, "grad_norm": 0.046008557081222534, "grad_norm_var": 2.0128645958237045e-05, "learning_rate": 0.006164740525009552, "loss": 2.583, "step": 12045 }, { "crossentropy": 2.4460089206695557, "epoch": 0.43670243619489557, "grad_norm": 0.027716174721717834, "grad_norm_var": 2.0323689391692544e-05, "learning_rate": 0.006164175415631367, "loss": 2.5246, "step": 12046 }, { "crossentropy": 2.6224281787872314, "epoch": 0.4367386890951276, "grad_norm": 0.02982390858232975, "grad_norm_var": 2.013369286407566e-05, "learning_rate": 0.006163610290529228, "loss": 2.5734, "step": 12047 }, { "crossentropy": 2.5925519466400146, "epoch": 0.43677494199535966, "grad_norm": 0.030387453734874725, "grad_norm_var": 1.9933796303323963e-05, "learning_rate": 0.006163045149710765, "loss": 2.6729, "step": 12048 }, { "crossentropy": 2.5827407836914062, "epoch": 0.43681119489559167, "grad_norm": 0.030273044481873512, "grad_norm_var": 1.994462721002134e-05, "learning_rate": 0.006162479993183615, "loss": 2.5192, "step": 12049 }, { "crossentropy": 2.6226444244384766, "epoch": 0.4368474477958237, "grad_norm": 0.02650122530758381, "grad_norm_var": 2.0530096959333935e-05, "learning_rate": 0.006161914820955405, "loss": 2.5974, "step": 12050 }, { "crossentropy": 2.549691915512085, "epoch": 0.4368837006960557, "grad_norm": 0.031606826931238174, "grad_norm_var": 2.072883991763931e-05, "learning_rate": 0.006161349633033775, "loss": 2.5552, "step": 12051 }, { "crossentropy": 2.471818685531616, "epoch": 0.4369199535962877, "grad_norm": 0.031500037759542465, "grad_norm_var": 2.0815859466153987e-05, "learning_rate": 0.006160784429426354, "loss": 2.5397, "step": 12052 }, { "crossentropy": 2.5562736988067627, "epoch": 0.43695620649651973, "grad_norm": 0.028536781668663025, "grad_norm_var": 2.069883356979143e-05, "learning_rate": 0.0061602192101407795, "loss": 2.578, "step": 12053 }, { "crossentropy": 2.640299081802368, "epoch": 0.43699245939675174, "grad_norm": 0.028027525171637535, "grad_norm_var": 2.0734356341343906e-05, "learning_rate": 0.0061596539751846814, "loss": 2.6742, "step": 12054 }, { "crossentropy": 2.739816904067993, "epoch": 0.43702871229698376, "grad_norm": 0.02892092987895012, "grad_norm_var": 2.0616992656589892e-05, "learning_rate": 0.006159088724565698, "loss": 2.629, "step": 12055 }, { "crossentropy": 2.7402074337005615, "epoch": 0.43706496519721577, "grad_norm": 0.028437934815883636, "grad_norm_var": 2.0383463992977335e-05, "learning_rate": 0.006158523458291463, "loss": 2.6341, "step": 12056 }, { "crossentropy": 2.643023729324341, "epoch": 0.4371012180974478, "grad_norm": 0.02611476369202137, "grad_norm_var": 2.058705463501559e-05, "learning_rate": 0.006157958176369611, "loss": 2.6249, "step": 12057 }, { "crossentropy": 2.651357889175415, "epoch": 0.4371374709976798, "grad_norm": 0.026536239311099052, "grad_norm_var": 2.109400242608772e-05, "learning_rate": 0.006157392878807774, "loss": 2.6453, "step": 12058 }, { "crossentropy": 2.464019775390625, "epoch": 0.4371737238979118, "grad_norm": 0.027576597407460213, "grad_norm_var": 2.1448323016134265e-05, "learning_rate": 0.006156827565613591, "loss": 2.5106, "step": 12059 }, { "crossentropy": 2.3693783283233643, "epoch": 0.43720997679814383, "grad_norm": 0.02767259255051613, "grad_norm_var": 2.1709973524538827e-05, "learning_rate": 0.006156262236794697, "loss": 2.4902, "step": 12060 }, { "crossentropy": 2.5452427864074707, "epoch": 0.43724622969837584, "grad_norm": 0.02958969585597515, "grad_norm_var": 2.916553511244681e-06, "learning_rate": 0.0061556968923587255, "loss": 2.6152, "step": 12061 }, { "crossentropy": 2.5636258125305176, "epoch": 0.4372824825986079, "grad_norm": 0.03182833269238472, "grad_norm_var": 3.4332546743491105e-06, "learning_rate": 0.0061551315323133135, "loss": 2.6245, "step": 12062 }, { "crossentropy": 2.5447967052459717, "epoch": 0.43731873549883993, "grad_norm": 0.029158003628253937, "grad_norm_var": 3.384119987665417e-06, "learning_rate": 0.006154566156666098, "loss": 2.5849, "step": 12063 }, { "crossentropy": 2.4706780910491943, "epoch": 0.43735498839907194, "grad_norm": 0.02777482382953167, "grad_norm_var": 3.29841373958948e-06, "learning_rate": 0.0061540007654247155, "loss": 2.5182, "step": 12064 }, { "crossentropy": 2.463848829269409, "epoch": 0.43739124129930396, "grad_norm": 0.026497816666960716, "grad_norm_var": 3.424281467236575e-06, "learning_rate": 0.0061534353585967995, "loss": 2.5232, "step": 12065 }, { "crossentropy": 2.4098105430603027, "epoch": 0.43742749419953597, "grad_norm": 0.025363251566886902, "grad_norm_var": 3.8111482083731983e-06, "learning_rate": 0.00615286993618999, "loss": 2.5493, "step": 12066 }, { "crossentropy": 2.64324951171875, "epoch": 0.437463747099768, "grad_norm": 0.026359211653470993, "grad_norm_var": 3.3209351040197246e-06, "learning_rate": 0.006152304498211921, "loss": 2.5997, "step": 12067 }, { "crossentropy": 2.7318153381347656, "epoch": 0.4375, "grad_norm": 0.02752336673438549, "grad_norm_var": 2.5162877569889313e-06, "learning_rate": 0.006151739044670233, "loss": 2.5757, "step": 12068 }, { "crossentropy": 2.577733039855957, "epoch": 0.437536252900232, "grad_norm": 0.03027888759970665, "grad_norm_var": 2.8608826115984636e-06, "learning_rate": 0.0061511735755725595, "loss": 2.6397, "step": 12069 }, { "crossentropy": 2.4836220741271973, "epoch": 0.43757250580046403, "grad_norm": 0.02898879162967205, "grad_norm_var": 2.9248863608480682e-06, "learning_rate": 0.006150608090926541, "loss": 2.5743, "step": 12070 }, { "crossentropy": 2.652052879333496, "epoch": 0.43760875870069604, "grad_norm": 0.028063176199793816, "grad_norm_var": 2.869986651791523e-06, "learning_rate": 0.0061500425907398135, "loss": 2.619, "step": 12071 }, { "crossentropy": 2.659926176071167, "epoch": 0.43764501160092806, "grad_norm": 0.03273763507604599, "grad_norm_var": 4.284990172991503e-06, "learning_rate": 0.006149477075020016, "loss": 2.6919, "step": 12072 }, { "crossentropy": 2.706956386566162, "epoch": 0.43768126450116007, "grad_norm": 0.0336836501955986, "grad_norm_var": 5.7066592815781135e-06, "learning_rate": 0.0061489115437747864, "loss": 2.6919, "step": 12073 }, { "crossentropy": 2.663712739944458, "epoch": 0.4377175174013921, "grad_norm": 0.027456272393465042, "grad_norm_var": 5.490819554937797e-06, "learning_rate": 0.006148345997011763, "loss": 2.6431, "step": 12074 }, { "crossentropy": 2.6150197982788086, "epoch": 0.43775377030162416, "grad_norm": 0.027079451829195023, "grad_norm_var": 5.586334225012902e-06, "learning_rate": 0.006147780434738584, "loss": 2.5713, "step": 12075 }, { "crossentropy": 2.5145761966705322, "epoch": 0.43779002320185617, "grad_norm": 0.0269135944545269, "grad_norm_var": 5.7317200890106074e-06, "learning_rate": 0.006147214856962888, "loss": 2.5888, "step": 12076 }, { "crossentropy": 2.5964221954345703, "epoch": 0.4378262761020882, "grad_norm": 0.030881917104125023, "grad_norm_var": 5.988342641052337e-06, "learning_rate": 0.006146649263692314, "loss": 2.7222, "step": 12077 }, { "crossentropy": 2.522360324859619, "epoch": 0.4378625290023202, "grad_norm": 0.033252518624067307, "grad_norm_var": 6.692680134893526e-06, "learning_rate": 0.0061460836549345026, "loss": 2.551, "step": 12078 }, { "crossentropy": 2.6060047149658203, "epoch": 0.4378987819025522, "grad_norm": 0.028984656557440758, "grad_norm_var": 6.688035031681078e-06, "learning_rate": 0.0061455180306970916, "loss": 2.5649, "step": 12079 }, { "crossentropy": 2.584120035171509, "epoch": 0.43793503480278423, "grad_norm": 0.027886923402547836, "grad_norm_var": 6.672526902064268e-06, "learning_rate": 0.006144952390987722, "loss": 2.6149, "step": 12080 }, { "crossentropy": 2.6754329204559326, "epoch": 0.43797128770301624, "grad_norm": 0.030018657445907593, "grad_norm_var": 6.332773126203765e-06, "learning_rate": 0.006144386735814031, "loss": 2.6638, "step": 12081 }, { "crossentropy": 2.4598228931427, "epoch": 0.43800754060324826, "grad_norm": 0.03287915512919426, "grad_norm_var": 6.126670723518773e-06, "learning_rate": 0.006143821065183662, "loss": 2.4953, "step": 12082 }, { "crossentropy": 2.5749077796936035, "epoch": 0.43804379350348027, "grad_norm": 0.02756885439157486, "grad_norm_var": 5.701600662191122e-06, "learning_rate": 0.0061432553791042534, "loss": 2.6002, "step": 12083 }, { "crossentropy": 2.5885493755340576, "epoch": 0.4380800464037123, "grad_norm": 0.028143787756562233, "grad_norm_var": 5.550784153042252e-06, "learning_rate": 0.006142689677583447, "loss": 2.5584, "step": 12084 }, { "crossentropy": 2.524641752243042, "epoch": 0.4381162993039443, "grad_norm": 0.025695431977510452, "grad_norm_var": 6.495420811160205e-06, "learning_rate": 0.0061421239606288805, "loss": 2.5682, "step": 12085 }, { "crossentropy": 2.5901365280151367, "epoch": 0.4381525522041763, "grad_norm": 0.02560003288090229, "grad_norm_var": 7.394274912964635e-06, "learning_rate": 0.006141558228248197, "loss": 2.5859, "step": 12086 }, { "crossentropy": 2.6490211486816406, "epoch": 0.43818880510440833, "grad_norm": 0.026733236387372017, "grad_norm_var": 7.702482311470549e-06, "learning_rate": 0.006140992480449038, "loss": 2.5969, "step": 12087 }, { "crossentropy": 2.5518312454223633, "epoch": 0.43822505800464034, "grad_norm": 0.02677074633538723, "grad_norm_var": 7.029480905564912e-06, "learning_rate": 0.006140426717239043, "loss": 2.5381, "step": 12088 }, { "crossentropy": 2.731393337249756, "epoch": 0.4382613109048724, "grad_norm": 0.026483258232474327, "grad_norm_var": 5.506203391436694e-06, "learning_rate": 0.006139860938625856, "loss": 2.5861, "step": 12089 }, { "crossentropy": 2.5731403827667236, "epoch": 0.43829756380510443, "grad_norm": 0.03134525194764137, "grad_norm_var": 6.028597377370177e-06, "learning_rate": 0.006139295144617116, "loss": 2.5895, "step": 12090 }, { "crossentropy": 2.59702730178833, "epoch": 0.43833381670533644, "grad_norm": 0.030588369816541672, "grad_norm_var": 6.1265733771023125e-06, "learning_rate": 0.006138729335220468, "loss": 2.6722, "step": 12091 }, { "crossentropy": 2.5004520416259766, "epoch": 0.43837006960556846, "grad_norm": 0.029367711395025253, "grad_norm_var": 5.907277493285183e-06, "learning_rate": 0.006138163510443551, "loss": 2.5326, "step": 12092 }, { "crossentropy": 2.5578019618988037, "epoch": 0.43840632250580047, "grad_norm": 0.027556488290429115, "grad_norm_var": 5.714140795034942e-06, "learning_rate": 0.0061375976702940085, "loss": 2.5262, "step": 12093 }, { "crossentropy": 2.399336576461792, "epoch": 0.4384425754060325, "grad_norm": 0.029273055493831635, "grad_norm_var": 4.277579677702568e-06, "learning_rate": 0.006137031814779482, "loss": 2.4726, "step": 12094 }, { "crossentropy": 2.5169291496276855, "epoch": 0.4384788283062645, "grad_norm": 0.027947334572672844, "grad_norm_var": 4.26825266110566e-06, "learning_rate": 0.006136465943907617, "loss": 2.6023, "step": 12095 }, { "crossentropy": 2.6430323123931885, "epoch": 0.4385150812064965, "grad_norm": 0.028021350502967834, "grad_norm_var": 4.2607927212933666e-06, "learning_rate": 0.006135900057686054, "loss": 2.6051, "step": 12096 }, { "crossentropy": 2.474557399749756, "epoch": 0.43855133410672853, "grad_norm": 0.028395937755703926, "grad_norm_var": 4.069644470233724e-06, "learning_rate": 0.006135334156122437, "loss": 2.5111, "step": 12097 }, { "crossentropy": 2.532390832901001, "epoch": 0.43858758700696054, "grad_norm": 0.027486516162753105, "grad_norm_var": 2.5753582805446005e-06, "learning_rate": 0.00613476823922441, "loss": 2.5341, "step": 12098 }, { "crossentropy": 2.5849781036376953, "epoch": 0.43862383990719256, "grad_norm": 0.02781156823039055, "grad_norm_var": 2.567155888036224e-06, "learning_rate": 0.006134202306999614, "loss": 2.5674, "step": 12099 }, { "crossentropy": 2.5667600631713867, "epoch": 0.4386600928074246, "grad_norm": 0.027217691764235497, "grad_norm_var": 2.5969853903691353e-06, "learning_rate": 0.006133636359455697, "loss": 2.5916, "step": 12100 }, { "crossentropy": 2.5952839851379395, "epoch": 0.4386963457076566, "grad_norm": 0.02653285674750805, "grad_norm_var": 2.395400609084217e-06, "learning_rate": 0.0061330703966002995, "loss": 2.54, "step": 12101 }, { "crossentropy": 2.487992763519287, "epoch": 0.43873259860788866, "grad_norm": 0.0301987137645483, "grad_norm_var": 2.2788710397657804e-06, "learning_rate": 0.006132504418441067, "loss": 2.5562, "step": 12102 }, { "crossentropy": 2.6729378700256348, "epoch": 0.43876885150812067, "grad_norm": 0.02735406905412674, "grad_norm_var": 2.1788028496404758e-06, "learning_rate": 0.006131938424985642, "loss": 2.6672, "step": 12103 }, { "crossentropy": 2.5697131156921387, "epoch": 0.4388051044083527, "grad_norm": 0.026528554037213326, "grad_norm_var": 2.230945682731008e-06, "learning_rate": 0.006131372416241674, "loss": 2.4674, "step": 12104 }, { "crossentropy": 2.588128089904785, "epoch": 0.4388413573085847, "grad_norm": 0.027572089806199074, "grad_norm_var": 2.0475650388738213e-06, "learning_rate": 0.006130806392216802, "loss": 2.6351, "step": 12105 }, { "crossentropy": 2.6057960987091064, "epoch": 0.4388776102088167, "grad_norm": 0.02945568412542343, "grad_norm_var": 1.5097513142690028e-06, "learning_rate": 0.006130240352918675, "loss": 2.5829, "step": 12106 }, { "crossentropy": 2.6937320232391357, "epoch": 0.43891386310904873, "grad_norm": 0.02743726223707199, "grad_norm_var": 1.1297114936237856e-06, "learning_rate": 0.006129674298354934, "loss": 2.6466, "step": 12107 }, { "crossentropy": 2.5786030292510986, "epoch": 0.43895011600928074, "grad_norm": 0.026480192318558693, "grad_norm_var": 1.1280245735341304e-06, "learning_rate": 0.006129108228533233, "loss": 2.6481, "step": 12108 }, { "crossentropy": 2.4375710487365723, "epoch": 0.43898636890951276, "grad_norm": 0.026834458112716675, "grad_norm_var": 1.1868747174869812e-06, "learning_rate": 0.006128542143461208, "loss": 2.5259, "step": 12109 }, { "crossentropy": 2.564563751220703, "epoch": 0.4390226218097448, "grad_norm": 0.02868276834487915, "grad_norm_var": 1.0914725104495577e-06, "learning_rate": 0.006127976043146508, "loss": 2.583, "step": 12110 }, { "crossentropy": 2.5671772956848145, "epoch": 0.4390588747099768, "grad_norm": 0.026484714820981026, "grad_norm_var": 1.1861691255791631e-06, "learning_rate": 0.006127409927596781, "loss": 2.6111, "step": 12111 }, { "crossentropy": 2.5342915058135986, "epoch": 0.4390951276102088, "grad_norm": 0.028085360303521156, "grad_norm_var": 1.18954417767612e-06, "learning_rate": 0.006126843796819673, "loss": 2.5999, "step": 12112 }, { "crossentropy": 2.550872802734375, "epoch": 0.4391313805104408, "grad_norm": 0.026812657713890076, "grad_norm_var": 1.1908376357752977e-06, "learning_rate": 0.006126277650822828, "loss": 2.4198, "step": 12113 }, { "crossentropy": 2.569817304611206, "epoch": 0.43916763341067283, "grad_norm": 0.02772348001599312, "grad_norm_var": 1.191995460828911e-06, "learning_rate": 0.006125711489613894, "loss": 2.5472, "step": 12114 }, { "crossentropy": 2.5885069370269775, "epoch": 0.43920388631090485, "grad_norm": 0.026044350117444992, "grad_norm_var": 1.3316228567131564e-06, "learning_rate": 0.006125145313200519, "loss": 2.5749, "step": 12115 }, { "crossentropy": 2.4625420570373535, "epoch": 0.4392401392111369, "grad_norm": 0.02671758271753788, "grad_norm_var": 1.3637659191524654e-06, "learning_rate": 0.006124579121590349, "loss": 2.4578, "step": 12116 }, { "crossentropy": 2.5316483974456787, "epoch": 0.43927639211136893, "grad_norm": 0.027979005128145218, "grad_norm_var": 1.3207071714082478e-06, "learning_rate": 0.006124012914791032, "loss": 2.56, "step": 12117 }, { "crossentropy": 2.4806556701660156, "epoch": 0.43931264501160094, "grad_norm": 0.02666328474879265, "grad_norm_var": 8.412806059287491e-07, "learning_rate": 0.006123446692810213, "loss": 2.5622, "step": 12118 }, { "crossentropy": 2.611895799636841, "epoch": 0.43934889791183296, "grad_norm": 0.02812529355287552, "grad_norm_var": 8.836579461157211e-07, "learning_rate": 0.006122880455655542, "loss": 2.6247, "step": 12119 }, { "crossentropy": 2.6382007598876953, "epoch": 0.439385150812065, "grad_norm": 0.026674678549170494, "grad_norm_var": 8.689554582730815e-07, "learning_rate": 0.006122314203334667, "loss": 2.6088, "step": 12120 }, { "crossentropy": 2.6655097007751465, "epoch": 0.439421403712297, "grad_norm": 0.027342280372977257, "grad_norm_var": 8.657821662427211e-07, "learning_rate": 0.0061217479358552365, "loss": 2.499, "step": 12121 }, { "crossentropy": 2.718175172805786, "epoch": 0.439457656612529, "grad_norm": 0.029207147657871246, "grad_norm_var": 7.997462974699994e-07, "learning_rate": 0.006121181653224895, "loss": 2.689, "step": 12122 }, { "crossentropy": 2.6093661785125732, "epoch": 0.439493909512761, "grad_norm": 0.029901303350925446, "grad_norm_var": 1.2141566874549273e-06, "learning_rate": 0.006120615355451295, "loss": 2.637, "step": 12123 }, { "crossentropy": 2.6324360370635986, "epoch": 0.43953016241299303, "grad_norm": 0.030536329373717308, "grad_norm_var": 1.699052566712933e-06, "learning_rate": 0.006120049042542085, "loss": 2.5767, "step": 12124 }, { "crossentropy": 2.494102716445923, "epoch": 0.43956641531322505, "grad_norm": 0.02914504148066044, "grad_norm_var": 1.7542371955293988e-06, "learning_rate": 0.0061194827145049134, "loss": 2.5521, "step": 12125 }, { "crossentropy": 2.513129472732544, "epoch": 0.43960266821345706, "grad_norm": 0.027025816962122917, "grad_norm_var": 1.7491023046024846e-06, "learning_rate": 0.006118916371347427, "loss": 2.4798, "step": 12126 }, { "crossentropy": 2.4925692081451416, "epoch": 0.4396389211136891, "grad_norm": 0.026543017476797104, "grad_norm_var": 1.739251283907695e-06, "learning_rate": 0.006118350013077279, "loss": 2.5614, "step": 12127 }, { "crossentropy": 2.6527786254882812, "epoch": 0.4396751740139211, "grad_norm": 0.02817179076373577, "grad_norm_var": 1.7432035787189134e-06, "learning_rate": 0.006117783639702115, "loss": 2.598, "step": 12128 }, { "crossentropy": 2.6235828399658203, "epoch": 0.43971142691415316, "grad_norm": 0.02708819881081581, "grad_norm_var": 1.7121042233293834e-06, "learning_rate": 0.006117217251229589, "loss": 2.572, "step": 12129 }, { "crossentropy": 2.533034563064575, "epoch": 0.4397476798143852, "grad_norm": 0.02708355523645878, "grad_norm_var": 1.7446996274511628e-06, "learning_rate": 0.0061166508476673456, "loss": 2.5345, "step": 12130 }, { "crossentropy": 2.5551581382751465, "epoch": 0.4397839327146172, "grad_norm": 0.026485148817300797, "grad_norm_var": 1.6556836941468423e-06, "learning_rate": 0.006116084429023039, "loss": 2.5601, "step": 12131 }, { "crossentropy": 2.541959762573242, "epoch": 0.4398201856148492, "grad_norm": 0.026948636397719383, "grad_norm_var": 1.6258869173617236e-06, "learning_rate": 0.006115517995304319, "loss": 2.5297, "step": 12132 }, { "crossentropy": 2.548309326171875, "epoch": 0.4398564385150812, "grad_norm": 0.0269036665558815, "grad_norm_var": 1.6735735763067731e-06, "learning_rate": 0.006114951546518837, "loss": 2.581, "step": 12133 }, { "crossentropy": 2.613579273223877, "epoch": 0.43989269141531323, "grad_norm": 0.0312846414744854, "grad_norm_var": 2.3447309535645315e-06, "learning_rate": 0.00611438508267424, "loss": 2.5691, "step": 12134 }, { "crossentropy": 2.528992176055908, "epoch": 0.43992894431554525, "grad_norm": 0.027219271287322044, "grad_norm_var": 2.3844224071771125e-06, "learning_rate": 0.006113818603778182, "loss": 2.5288, "step": 12135 }, { "crossentropy": 2.5920870304107666, "epoch": 0.43996519721577726, "grad_norm": 0.027292512357234955, "grad_norm_var": 2.3013653905573177e-06, "learning_rate": 0.006113252109838313, "loss": 2.62, "step": 12136 }, { "crossentropy": 2.756413459777832, "epoch": 0.4400014501160093, "grad_norm": 0.027638118714094162, "grad_norm_var": 2.280451875977469e-06, "learning_rate": 0.006112685600862287, "loss": 2.6157, "step": 12137 }, { "crossentropy": 2.5582332611083984, "epoch": 0.4400377030162413, "grad_norm": 0.0266241617500782, "grad_norm_var": 2.291908031930171e-06, "learning_rate": 0.00611211907685775, "loss": 2.5621, "step": 12138 }, { "crossentropy": 2.4909725189208984, "epoch": 0.4400739559164733, "grad_norm": 0.02694563940167427, "grad_norm_var": 2.0366824305523827e-06, "learning_rate": 0.00611155253783236, "loss": 2.4662, "step": 12139 }, { "crossentropy": 2.825815200805664, "epoch": 0.4401102088167053, "grad_norm": 0.02801717258989811, "grad_norm_var": 1.4750774198228245e-06, "learning_rate": 0.0061109859837937645, "loss": 2.749, "step": 12140 }, { "crossentropy": 2.5293009281158447, "epoch": 0.44014646171693733, "grad_norm": 0.02653098665177822, "grad_norm_var": 1.337864322597026e-06, "learning_rate": 0.006110419414749618, "loss": 2.587, "step": 12141 }, { "crossentropy": 2.7388112545013428, "epoch": 0.44018271461716935, "grad_norm": 0.028700850903987885, "grad_norm_var": 1.4379963261123194e-06, "learning_rate": 0.006109852830707573, "loss": 2.7214, "step": 12142 }, { "crossentropy": 2.4944982528686523, "epoch": 0.4402189675174014, "grad_norm": 0.027152128517627716, "grad_norm_var": 1.3861165279066904e-06, "learning_rate": 0.00610928623167528, "loss": 2.5953, "step": 12143 }, { "crossentropy": 2.6377506256103516, "epoch": 0.44025522041763343, "grad_norm": 0.02742692083120346, "grad_norm_var": 1.3546107178427723e-06, "learning_rate": 0.006108719617660394, "loss": 2.5979, "step": 12144 }, { "crossentropy": 2.5142688751220703, "epoch": 0.44029147331786544, "grad_norm": 0.027638891711831093, "grad_norm_var": 1.3463492570530677e-06, "learning_rate": 0.006108152988670566, "loss": 2.5416, "step": 12145 }, { "crossentropy": 2.397266149520874, "epoch": 0.44032772621809746, "grad_norm": 0.026971273124217987, "grad_norm_var": 1.3532710144200372e-06, "learning_rate": 0.00610758634471345, "loss": 2.4738, "step": 12146 }, { "crossentropy": 2.504103422164917, "epoch": 0.4403639791183295, "grad_norm": 0.02777177281677723, "grad_norm_var": 1.2849945962449537e-06, "learning_rate": 0.006107019685796701, "loss": 2.5311, "step": 12147 }, { "crossentropy": 2.680729866027832, "epoch": 0.4404002320185615, "grad_norm": 0.027589360252022743, "grad_norm_var": 1.257854421098971e-06, "learning_rate": 0.00610645301192797, "loss": 2.6712, "step": 12148 }, { "crossentropy": 2.7350776195526123, "epoch": 0.4404364849187935, "grad_norm": 0.029246360063552856, "grad_norm_var": 1.3812655093767625e-06, "learning_rate": 0.006105886323114913, "loss": 2.6767, "step": 12149 }, { "crossentropy": 2.581381320953369, "epoch": 0.4404727378190255, "grad_norm": 0.025878122076392174, "grad_norm_var": 6.624099298434127e-07, "learning_rate": 0.006105319619365182, "loss": 2.5541, "step": 12150 }, { "crossentropy": 2.787189245223999, "epoch": 0.44050899071925753, "grad_norm": 0.026194551959633827, "grad_norm_var": 7.548105573714717e-07, "learning_rate": 0.006104752900686432, "loss": 2.6276, "step": 12151 }, { "crossentropy": 2.656764507293701, "epoch": 0.44054524361948955, "grad_norm": 0.02754703164100647, "grad_norm_var": 7.568684913277033e-07, "learning_rate": 0.006104186167086319, "loss": 2.6237, "step": 12152 }, { "crossentropy": 2.764190673828125, "epoch": 0.44058149651972156, "grad_norm": 0.028004448860883713, "grad_norm_var": 7.784942802706394e-07, "learning_rate": 0.006103619418572495, "loss": 2.7261, "step": 12153 }, { "crossentropy": 2.582416534423828, "epoch": 0.4406177494199536, "grad_norm": 0.027576768770813942, "grad_norm_var": 7.379407475943677e-07, "learning_rate": 0.0061030526551526165, "loss": 2.4877, "step": 12154 }, { "crossentropy": 2.614302635192871, "epoch": 0.4406540023201856, "grad_norm": 0.02848660573363304, "grad_norm_var": 7.828239313091567e-07, "learning_rate": 0.006102485876834337, "loss": 2.6239, "step": 12155 }, { "crossentropy": 2.657113552093506, "epoch": 0.44069025522041766, "grad_norm": 0.027227630838751793, "grad_norm_var": 7.721654348044962e-07, "learning_rate": 0.006101919083625313, "loss": 2.6267, "step": 12156 }, { "crossentropy": 2.784994602203369, "epoch": 0.4407265081206497, "grad_norm": 0.028065869584679604, "grad_norm_var": 7.218174843538761e-07, "learning_rate": 0.0061013522755332, "loss": 2.6868, "step": 12157 }, { "crossentropy": 2.680009365081787, "epoch": 0.4407627610208817, "grad_norm": 0.02683834731578827, "grad_norm_var": 6.633620352858305e-07, "learning_rate": 0.006100785452565653, "loss": 2.5893, "step": 12158 }, { "crossentropy": 2.437685251235962, "epoch": 0.4407990139211137, "grad_norm": 0.026740610599517822, "grad_norm_var": 6.917170334165557e-07, "learning_rate": 0.006100218614730328, "loss": 2.5319, "step": 12159 }, { "crossentropy": 2.6669933795928955, "epoch": 0.4408352668213457, "grad_norm": 0.029126521199941635, "grad_norm_var": 8.669624008953228e-07, "learning_rate": 0.006099651762034882, "loss": 2.5158, "step": 12160 }, { "crossentropy": 2.4780383110046387, "epoch": 0.44087151972157773, "grad_norm": 0.029147906228899956, "grad_norm_var": 1.0258579792097691e-06, "learning_rate": 0.006099084894486969, "loss": 2.5726, "step": 12161 }, { "crossentropy": 2.6918468475341797, "epoch": 0.44090777262180975, "grad_norm": 0.02748516947031021, "grad_norm_var": 9.958010877405184e-07, "learning_rate": 0.006098518012094246, "loss": 2.6824, "step": 12162 }, { "crossentropy": 2.662709951400757, "epoch": 0.44094402552204176, "grad_norm": 0.031566206365823746, "grad_norm_var": 1.9406004731580314e-06, "learning_rate": 0.006097951114864372, "loss": 2.6636, "step": 12163 }, { "crossentropy": 2.714083433151245, "epoch": 0.4409802784222738, "grad_norm": 0.028076378628611565, "grad_norm_var": 1.9339481710304764e-06, "learning_rate": 0.006097384202805001, "loss": 2.6526, "step": 12164 }, { "crossentropy": 2.6613357067108154, "epoch": 0.4410165313225058, "grad_norm": 0.027955006808042526, "grad_norm_var": 1.8150566960593254e-06, "learning_rate": 0.006096817275923791, "loss": 2.6561, "step": 12165 }, { "crossentropy": 2.578895092010498, "epoch": 0.4410527842227378, "grad_norm": 0.02700788900256157, "grad_norm_var": 1.594808907637533e-06, "learning_rate": 0.0060962503342283985, "loss": 2.6096, "step": 12166 }, { "crossentropy": 2.61027455329895, "epoch": 0.4410890371229698, "grad_norm": 0.030563652515411377, "grad_norm_var": 1.77081603049387e-06, "learning_rate": 0.006095683377726482, "loss": 2.5903, "step": 12167 }, { "crossentropy": 2.6019446849823, "epoch": 0.44112529002320183, "grad_norm": 0.029874777421355247, "grad_norm_var": 1.9026160235010558e-06, "learning_rate": 0.006095116406425699, "loss": 2.5533, "step": 12168 }, { "crossentropy": 2.5965964794158936, "epoch": 0.44116154292343385, "grad_norm": 0.029526865109801292, "grad_norm_var": 1.975508226189484e-06, "learning_rate": 0.006094549420333707, "loss": 2.5322, "step": 12169 }, { "crossentropy": 2.6276772022247314, "epoch": 0.4411977958236659, "grad_norm": 0.029808536171913147, "grad_norm_var": 2.0257295066355407e-06, "learning_rate": 0.006093982419458164, "loss": 2.6489, "step": 12170 }, { "crossentropy": 2.698467969894409, "epoch": 0.44123404872389793, "grad_norm": 0.030105220153927803, "grad_norm_var": 2.166378020617661e-06, "learning_rate": 0.006093415403806728, "loss": 2.6482, "step": 12171 }, { "crossentropy": 2.6934456825256348, "epoch": 0.44127030162412995, "grad_norm": 0.02785458043217659, "grad_norm_var": 2.0683002621093664e-06, "learning_rate": 0.006092848373387058, "loss": 2.5806, "step": 12172 }, { "crossentropy": 2.6431214809417725, "epoch": 0.44130655452436196, "grad_norm": 0.02864345721900463, "grad_norm_var": 2.0376991089999035e-06, "learning_rate": 0.006092281328206812, "loss": 2.6347, "step": 12173 }, { "crossentropy": 2.531233549118042, "epoch": 0.441342807424594, "grad_norm": 0.027663346379995346, "grad_norm_var": 1.8677487888374885e-06, "learning_rate": 0.00609171426827365, "loss": 2.5518, "step": 12174 }, { "crossentropy": 2.517160654067993, "epoch": 0.441379060324826, "grad_norm": 0.04510504752397537, "grad_norm_var": 1.785045946953214e-05, "learning_rate": 0.006091147193595229, "loss": 2.6382, "step": 12175 }, { "crossentropy": 2.579627275466919, "epoch": 0.441415313225058, "grad_norm": 0.02728915587067604, "grad_norm_var": 1.8267946559799807e-05, "learning_rate": 0.0060905801041792095, "loss": 2.595, "step": 12176 }, { "crossentropy": 2.5316855907440186, "epoch": 0.44145156612529, "grad_norm": 0.02915194071829319, "grad_norm_var": 1.826756743758858e-05, "learning_rate": 0.006090013000033251, "loss": 2.5961, "step": 12177 }, { "crossentropy": 2.567073106765747, "epoch": 0.44148781902552203, "grad_norm": 0.031186003237962723, "grad_norm_var": 1.795428368618855e-05, "learning_rate": 0.006089445881165012, "loss": 2.55, "step": 12178 }, { "crossentropy": 2.499149799346924, "epoch": 0.44152407192575405, "grad_norm": 0.03116997703909874, "grad_norm_var": 1.7885902698605802e-05, "learning_rate": 0.006088878747582154, "loss": 2.6055, "step": 12179 }, { "crossentropy": 2.6253204345703125, "epoch": 0.44156032482598606, "grad_norm": 0.029486294835805893, "grad_norm_var": 1.7636988987512397e-05, "learning_rate": 0.006088311599292335, "loss": 2.6231, "step": 12180 }, { "crossentropy": 2.6559972763061523, "epoch": 0.4415965777262181, "grad_norm": 0.028934767469763756, "grad_norm_var": 1.7410309629476336e-05, "learning_rate": 0.0060877444363032175, "loss": 2.6186, "step": 12181 }, { "crossentropy": 2.66530442237854, "epoch": 0.4416328306264501, "grad_norm": 0.02776484005153179, "grad_norm_var": 1.712286911183986e-05, "learning_rate": 0.00608717725862246, "loss": 2.5712, "step": 12182 }, { "crossentropy": 2.677541971206665, "epoch": 0.44166908352668216, "grad_norm": 0.026797635480761528, "grad_norm_var": 1.7855834874184554e-05, "learning_rate": 0.0060866100662577225, "loss": 2.6476, "step": 12183 }, { "crossentropy": 2.8069097995758057, "epoch": 0.4417053364269142, "grad_norm": 0.029763124883174896, "grad_norm_var": 1.785881543907342e-05, "learning_rate": 0.006086042859216671, "loss": 2.7055, "step": 12184 }, { "crossentropy": 2.675267457962036, "epoch": 0.4417415893271462, "grad_norm": 0.027987519279122353, "grad_norm_var": 1.8107240771877846e-05, "learning_rate": 0.0060854756375069595, "loss": 2.5801, "step": 12185 }, { "crossentropy": 2.5970919132232666, "epoch": 0.4417778422273782, "grad_norm": 0.028850410133600235, "grad_norm_var": 1.8178787342976048e-05, "learning_rate": 0.0060849084011362535, "loss": 2.6276, "step": 12186 }, { "crossentropy": 2.4565179347991943, "epoch": 0.4418140951276102, "grad_norm": 0.02972487546503544, "grad_norm_var": 1.8175371795095756e-05, "learning_rate": 0.00608434115011221, "loss": 2.4874, "step": 12187 }, { "crossentropy": 2.567286252975464, "epoch": 0.44185034802784223, "grad_norm": 0.029855450615286827, "grad_norm_var": 1.7897031365329772e-05, "learning_rate": 0.006083773884442498, "loss": 2.482, "step": 12188 }, { "crossentropy": 2.45383358001709, "epoch": 0.44188660092807425, "grad_norm": 0.026913972571492195, "grad_norm_var": 1.838776781392081e-05, "learning_rate": 0.006083206604134773, "loss": 2.5186, "step": 12189 }, { "crossentropy": 2.459150552749634, "epoch": 0.44192285382830626, "grad_norm": 0.034557078033685684, "grad_norm_var": 1.9345546578705685e-05, "learning_rate": 0.006082639309196699, "loss": 2.5989, "step": 12190 }, { "crossentropy": 2.650057077407837, "epoch": 0.4419591067285383, "grad_norm": 0.027225542813539505, "grad_norm_var": 3.992060998620796e-06, "learning_rate": 0.006082071999635938, "loss": 2.4479, "step": 12191 }, { "crossentropy": 2.610584259033203, "epoch": 0.4419953596287703, "grad_norm": 0.026950599625706673, "grad_norm_var": 4.083954379021379e-06, "learning_rate": 0.006081504675460154, "loss": 2.6229, "step": 12192 }, { "crossentropy": 2.613140821456909, "epoch": 0.4420316125290023, "grad_norm": 0.027387885376811028, "grad_norm_var": 4.2768155479990746e-06, "learning_rate": 0.006080937336677006, "loss": 2.5901, "step": 12193 }, { "crossentropy": 2.5830976963043213, "epoch": 0.4420678654292343, "grad_norm": 0.027558347210288048, "grad_norm_var": 4.058773629228111e-06, "learning_rate": 0.006080369983294161, "loss": 2.5832, "step": 12194 }, { "crossentropy": 2.6699790954589844, "epoch": 0.44210411832946633, "grad_norm": 0.026222312822937965, "grad_norm_var": 4.030578947452932e-06, "learning_rate": 0.006079802615319277, "loss": 2.5885, "step": 12195 }, { "crossentropy": 2.642747640609741, "epoch": 0.4421403712296984, "grad_norm": 0.02667824551463127, "grad_norm_var": 4.153672344875851e-06, "learning_rate": 0.006079235232760023, "loss": 2.6926, "step": 12196 }, { "crossentropy": 2.5744593143463135, "epoch": 0.4421766241299304, "grad_norm": 0.028194857761263847, "grad_norm_var": 4.1275637099006845e-06, "learning_rate": 0.006078667835624058, "loss": 2.5539, "step": 12197 }, { "crossentropy": 2.5350606441497803, "epoch": 0.44221287703016243, "grad_norm": 0.02693161927163601, "grad_norm_var": 4.227858583106383e-06, "learning_rate": 0.006078100423919046, "loss": 2.5917, "step": 12198 }, { "crossentropy": 2.5692298412323, "epoch": 0.44224912993039445, "grad_norm": 0.02753225713968277, "grad_norm_var": 4.121781378254446e-06, "learning_rate": 0.006077532997652652, "loss": 2.5467, "step": 12199 }, { "crossentropy": 2.7114129066467285, "epoch": 0.44228538283062646, "grad_norm": 0.0286762323230505, "grad_norm_var": 3.979360362619958e-06, "learning_rate": 0.006076965556832542, "loss": 2.6081, "step": 12200 }, { "crossentropy": 2.5278196334838867, "epoch": 0.4423216357308585, "grad_norm": 0.028788220137357712, "grad_norm_var": 3.9964310309774376e-06, "learning_rate": 0.006076398101466376, "loss": 2.5577, "step": 12201 }, { "crossentropy": 2.602137565612793, "epoch": 0.4423578886310905, "grad_norm": 0.026415923610329628, "grad_norm_var": 4.172931174134748e-06, "learning_rate": 0.006075830631561821, "loss": 2.5796, "step": 12202 }, { "crossentropy": 2.6824214458465576, "epoch": 0.4423941415313225, "grad_norm": 0.027745665982365608, "grad_norm_var": 3.989186049434138e-06, "learning_rate": 0.006075263147126538, "loss": 2.5619, "step": 12203 }, { "crossentropy": 2.5349931716918945, "epoch": 0.4424303944315545, "grad_norm": 0.026536589488387108, "grad_norm_var": 3.846432412808381e-06, "learning_rate": 0.006074695648168197, "loss": 2.5901, "step": 12204 }, { "crossentropy": 2.5649936199188232, "epoch": 0.44246664733178653, "grad_norm": 0.027012420818209648, "grad_norm_var": 3.8358053949208295e-06, "learning_rate": 0.006074128134694461, "loss": 2.5541, "step": 12205 }, { "crossentropy": 2.6104583740234375, "epoch": 0.44250290023201855, "grad_norm": 0.028019245713949203, "grad_norm_var": 5.959984861015338e-07, "learning_rate": 0.006073560606712994, "loss": 2.5785, "step": 12206 }, { "crossentropy": 2.5696260929107666, "epoch": 0.44253915313225056, "grad_norm": 0.029564334079623222, "grad_norm_var": 8.936808889969371e-07, "learning_rate": 0.006072993064231459, "loss": 2.5895, "step": 12207 }, { "crossentropy": 2.6518309116363525, "epoch": 0.4425754060324826, "grad_norm": 0.028683286160230637, "grad_norm_var": 9.512925162024107e-07, "learning_rate": 0.006072425507257527, "loss": 2.6725, "step": 12208 }, { "crossentropy": 2.6550819873809814, "epoch": 0.4426116589327146, "grad_norm": 0.026267413049936295, "grad_norm_var": 1.0646919706868684e-06, "learning_rate": 0.006071857935798861, "loss": 2.6288, "step": 12209 }, { "crossentropy": 2.632190704345703, "epoch": 0.44264791183294666, "grad_norm": 0.033723145723342896, "grad_norm_var": 3.4454638533083636e-06, "learning_rate": 0.006071290349863128, "loss": 2.6568, "step": 12210 }, { "crossentropy": 2.64312744140625, "epoch": 0.4426841647331787, "grad_norm": 0.026842782273888588, "grad_norm_var": 3.327671639341079e-06, "learning_rate": 0.006070722749457989, "loss": 2.7006, "step": 12211 }, { "crossentropy": 2.432208776473999, "epoch": 0.4427204176334107, "grad_norm": 0.027399348095059395, "grad_norm_var": 3.2354183334072525e-06, "learning_rate": 0.006070155134591118, "loss": 2.5867, "step": 12212 }, { "crossentropy": 2.583326816558838, "epoch": 0.4427566705336427, "grad_norm": 0.029121503233909607, "grad_norm_var": 3.310586451954348e-06, "learning_rate": 0.006069587505270178, "loss": 2.6263, "step": 12213 }, { "crossentropy": 2.6042850017547607, "epoch": 0.4427929234338747, "grad_norm": 0.029699455946683884, "grad_norm_var": 3.3660515072363982e-06, "learning_rate": 0.0060690198615028345, "loss": 2.6337, "step": 12214 }, { "crossentropy": 2.5455968379974365, "epoch": 0.44282917633410673, "grad_norm": 0.03255847468972206, "grad_norm_var": 4.462810798287059e-06, "learning_rate": 0.006068452203296754, "loss": 2.5617, "step": 12215 }, { "crossentropy": 2.5454792976379395, "epoch": 0.44286542923433875, "grad_norm": 0.028642524033784866, "grad_norm_var": 4.4623858312344085e-06, "learning_rate": 0.0060678845306596065, "loss": 2.4935, "step": 12216 }, { "crossentropy": 2.468343496322632, "epoch": 0.44290168213457076, "grad_norm": 0.026996217668056488, "grad_norm_var": 4.6094618856571165e-06, "learning_rate": 0.006067316843599059, "loss": 2.3614, "step": 12217 }, { "crossentropy": 2.6033341884613037, "epoch": 0.4429379350348028, "grad_norm": 0.027729105204343796, "grad_norm_var": 4.360781504406513e-06, "learning_rate": 0.006066749142122777, "loss": 2.5893, "step": 12218 }, { "crossentropy": 2.5738120079040527, "epoch": 0.4429741879350348, "grad_norm": 0.0269306730479002, "grad_norm_var": 4.487942837934355e-06, "learning_rate": 0.006066181426238428, "loss": 2.6205, "step": 12219 }, { "crossentropy": 2.6474504470825195, "epoch": 0.4430104408352668, "grad_norm": 0.031542159616947174, "grad_norm_var": 4.7549352997254865e-06, "learning_rate": 0.006065613695953681, "loss": 2.576, "step": 12220 }, { "crossentropy": 2.6721391677856445, "epoch": 0.4430466937354988, "grad_norm": 0.029970912262797356, "grad_norm_var": 4.598513189847418e-06, "learning_rate": 0.006065045951276206, "loss": 2.656, "step": 12221 }, { "crossentropy": 2.5152077674865723, "epoch": 0.44308294663573083, "grad_norm": 0.029247714206576347, "grad_norm_var": 4.5353582830771785e-06, "learning_rate": 0.006064478192213666, "loss": 2.6375, "step": 12222 }, { "crossentropy": 2.489199638366699, "epoch": 0.4431191995359629, "grad_norm": 0.02805997058749199, "grad_norm_var": 4.57512901798535e-06, "learning_rate": 0.006063910418773734, "loss": 2.5395, "step": 12223 }, { "crossentropy": 2.6985552310943604, "epoch": 0.4431554524361949, "grad_norm": 0.027927761897444725, "grad_norm_var": 4.6390245813468855e-06, "learning_rate": 0.006063342630964077, "loss": 2.6355, "step": 12224 }, { "crossentropy": 2.4547483921051025, "epoch": 0.44319170533642693, "grad_norm": 0.02845878154039383, "grad_norm_var": 4.165227173655768e-06, "learning_rate": 0.006062774828792365, "loss": 2.4999, "step": 12225 }, { "crossentropy": 2.6797733306884766, "epoch": 0.44322795823665895, "grad_norm": 0.031036648899316788, "grad_norm_var": 2.943518668225599e-06, "learning_rate": 0.006062207012266265, "loss": 2.6685, "step": 12226 }, { "crossentropy": 2.5039918422698975, "epoch": 0.44326421113689096, "grad_norm": 0.03600795194506645, "grad_norm_var": 5.697595308345885e-06, "learning_rate": 0.0060616391813934495, "loss": 2.5158, "step": 12227 }, { "crossentropy": 2.637479543685913, "epoch": 0.443300464037123, "grad_norm": 0.033330876380205154, "grad_norm_var": 6.2683481544818785e-06, "learning_rate": 0.006061071336181584, "loss": 2.6268, "step": 12228 }, { "crossentropy": 2.577087640762329, "epoch": 0.443336716937355, "grad_norm": 0.02683345042169094, "grad_norm_var": 6.811323593680163e-06, "learning_rate": 0.006060503476638339, "loss": 2.6279, "step": 12229 }, { "crossentropy": 2.5963292121887207, "epoch": 0.443372969837587, "grad_norm": 0.027679767459630966, "grad_norm_var": 7.06259046080065e-06, "learning_rate": 0.006059935602771387, "loss": 2.5709, "step": 12230 }, { "crossentropy": 2.5546715259552, "epoch": 0.443409222737819, "grad_norm": 0.026385288685560226, "grad_norm_var": 6.9759749950535814e-06, "learning_rate": 0.006059367714588395, "loss": 2.596, "step": 12231 }, { "crossentropy": 2.6678669452667236, "epoch": 0.44344547563805103, "grad_norm": 0.026028843596577644, "grad_norm_var": 7.588055884256463e-06, "learning_rate": 0.006058799812097036, "loss": 2.4956, "step": 12232 }, { "crossentropy": 2.5059664249420166, "epoch": 0.44348172853828305, "grad_norm": 0.027522757649421692, "grad_norm_var": 7.463978532838179e-06, "learning_rate": 0.006058231895304978, "loss": 2.539, "step": 12233 }, { "crossentropy": 2.6253316402435303, "epoch": 0.44351798143851506, "grad_norm": 0.03149507939815521, "grad_norm_var": 7.690496430029965e-06, "learning_rate": 0.00605766396421989, "loss": 2.5537, "step": 12234 }, { "crossentropy": 2.5437417030334473, "epoch": 0.4435542343387471, "grad_norm": 0.028005583211779594, "grad_norm_var": 7.426193554311197e-06, "learning_rate": 0.006057096018849449, "loss": 2.5029, "step": 12235 }, { "crossentropy": 2.6987552642822266, "epoch": 0.4435904872389791, "grad_norm": 0.028072472661733627, "grad_norm_var": 7.1625449397579845e-06, "learning_rate": 0.0060565280592013195, "loss": 2.7388, "step": 12236 }, { "crossentropy": 2.672900915145874, "epoch": 0.44362674013921116, "grad_norm": 0.028791412711143494, "grad_norm_var": 7.117090086231403e-06, "learning_rate": 0.0060559600852831755, "loss": 2.6125, "step": 12237 }, { "crossentropy": 2.57417631149292, "epoch": 0.4436629930394432, "grad_norm": 0.029863016679883003, "grad_norm_var": 7.156540381734198e-06, "learning_rate": 0.0060553920971026885, "loss": 2.5634, "step": 12238 }, { "crossentropy": 2.569185495376587, "epoch": 0.4436992459396752, "grad_norm": 0.027083467692136765, "grad_norm_var": 7.3507334929284825e-06, "learning_rate": 0.006054824094667529, "loss": 2.6139, "step": 12239 }, { "crossentropy": 2.8337693214416504, "epoch": 0.4437354988399072, "grad_norm": 0.02729222923517227, "grad_norm_var": 7.469607046310146e-06, "learning_rate": 0.006054256077985371, "loss": 2.776, "step": 12240 }, { "crossentropy": 2.6532537937164307, "epoch": 0.4437717517401392, "grad_norm": 0.02705674059689045, "grad_norm_var": 7.692326281150695e-06, "learning_rate": 0.006053688047063883, "loss": 2.6398, "step": 12241 }, { "crossentropy": 2.6705312728881836, "epoch": 0.44380800464037123, "grad_norm": 0.027731480076909065, "grad_norm_var": 7.435844381875091e-06, "learning_rate": 0.006053120001910739, "loss": 2.583, "step": 12242 }, { "crossentropy": 2.5466229915618896, "epoch": 0.44384425754060325, "grad_norm": 0.02772592566907406, "grad_norm_var": 3.651537035529344e-06, "learning_rate": 0.006052551942533612, "loss": 2.6161, "step": 12243 }, { "crossentropy": 2.5529732704162598, "epoch": 0.44388051044083526, "grad_norm": 0.027067121118307114, "grad_norm_var": 1.802817570151086e-06, "learning_rate": 0.006051983868940174, "loss": 2.5255, "step": 12244 }, { "crossentropy": 2.5919036865234375, "epoch": 0.4439167633410673, "grad_norm": 0.026776853948831558, "grad_norm_var": 1.8102335492088558e-06, "learning_rate": 0.006051415781138096, "loss": 2.5668, "step": 12245 }, { "crossentropy": 2.5212700366973877, "epoch": 0.4439530162412993, "grad_norm": 0.026722047477960587, "grad_norm_var": 1.8811420264917717e-06, "learning_rate": 0.0060508476791350544, "loss": 2.4761, "step": 12246 }, { "crossentropy": 2.635322093963623, "epoch": 0.4439892691415313, "grad_norm": 0.027079859748482704, "grad_norm_var": 1.787106258621301e-06, "learning_rate": 0.006050279562938719, "loss": 2.6955, "step": 12247 }, { "crossentropy": 2.5296833515167236, "epoch": 0.4440255220417633, "grad_norm": 0.02747407741844654, "grad_norm_var": 1.5821945006346532e-06, "learning_rate": 0.006049711432556764, "loss": 2.56, "step": 12248 }, { "crossentropy": 2.558248996734619, "epoch": 0.44406177494199534, "grad_norm": 0.027238452807068825, "grad_norm_var": 1.600030576303218e-06, "learning_rate": 0.006049143287996862, "loss": 2.5693, "step": 12249 }, { "crossentropy": 2.6602039337158203, "epoch": 0.4440980278422274, "grad_norm": 0.02636881172657013, "grad_norm_var": 7.457191707785707e-07, "learning_rate": 0.006048575129266689, "loss": 2.6632, "step": 12250 }, { "crossentropy": 2.4652204513549805, "epoch": 0.4441342807424594, "grad_norm": 0.028585534542798996, "grad_norm_var": 8.041464333051329e-07, "learning_rate": 0.006048006956373918, "loss": 2.5542, "step": 12251 }, { "crossentropy": 2.7002296447753906, "epoch": 0.44417053364269143, "grad_norm": 0.028285415843129158, "grad_norm_var": 8.21584940170067e-07, "learning_rate": 0.006047438769326222, "loss": 2.6182, "step": 12252 }, { "crossentropy": 2.6244852542877197, "epoch": 0.44420678654292345, "grad_norm": 0.026495186612010002, "grad_norm_var": 7.77603401627561e-07, "learning_rate": 0.006046870568131277, "loss": 2.6221, "step": 12253 }, { "crossentropy": 2.7540242671966553, "epoch": 0.44424303944315546, "grad_norm": 0.02834700420498848, "grad_norm_var": 4.29022204754409e-07, "learning_rate": 0.006046302352796755, "loss": 2.6849, "step": 12254 }, { "crossentropy": 2.68741774559021, "epoch": 0.4442792923433875, "grad_norm": 0.028709515929222107, "grad_norm_var": 5.401441208205751e-07, "learning_rate": 0.0060457341233303334, "loss": 2.7317, "step": 12255 }, { "crossentropy": 2.5351603031158447, "epoch": 0.4443155452436195, "grad_norm": 0.028805315494537354, "grad_norm_var": 6.544774294934992e-07, "learning_rate": 0.006045165879739684, "loss": 2.5333, "step": 12256 }, { "crossentropy": 2.5863828659057617, "epoch": 0.4443517981438515, "grad_norm": 0.027659859508275986, "grad_norm_var": 6.392079572590011e-07, "learning_rate": 0.006044597622032484, "loss": 2.6238, "step": 12257 }, { "crossentropy": 2.5876212120056152, "epoch": 0.4443880510440835, "grad_norm": 0.0329115204513073, "grad_norm_var": 2.4298409430525508e-06, "learning_rate": 0.0060440293502164065, "loss": 2.5816, "step": 12258 }, { "crossentropy": 2.605506658554077, "epoch": 0.44442430394431554, "grad_norm": 0.03160037100315094, "grad_norm_var": 3.28288558621551e-06, "learning_rate": 0.00604346106429913, "loss": 2.6634, "step": 12259 }, { "crossentropy": 2.4952821731567383, "epoch": 0.44446055684454755, "grad_norm": 0.027164006605744362, "grad_norm_var": 3.269704017899259e-06, "learning_rate": 0.006042892764288328, "loss": 2.5741, "step": 12260 }, { "crossentropy": 2.6094534397125244, "epoch": 0.44449680974477956, "grad_norm": 0.02640559710562229, "grad_norm_var": 3.3457454523510745e-06, "learning_rate": 0.006042324450191676, "loss": 2.5404, "step": 12261 }, { "crossentropy": 2.449047565460205, "epoch": 0.4445330626450116, "grad_norm": 0.026189429685473442, "grad_norm_var": 3.462452886188183e-06, "learning_rate": 0.00604175612201685, "loss": 2.4871, "step": 12262 }, { "crossentropy": 2.7497310638427734, "epoch": 0.4445693155452436, "grad_norm": 0.02696669101715088, "grad_norm_var": 3.4783822982025014e-06, "learning_rate": 0.0060411877797715285, "loss": 2.6529, "step": 12263 }, { "crossentropy": 2.428217649459839, "epoch": 0.44460556844547566, "grad_norm": 0.02668119966983795, "grad_norm_var": 3.5812458593126663e-06, "learning_rate": 0.006040619423463385, "loss": 2.496, "step": 12264 }, { "crossentropy": 2.6185946464538574, "epoch": 0.4446418213457077, "grad_norm": 0.027004897594451904, "grad_norm_var": 3.6091758151338686e-06, "learning_rate": 0.0060400510531000975, "loss": 2.6145, "step": 12265 }, { "crossentropy": 2.5423407554626465, "epoch": 0.4446780742459397, "grad_norm": 0.027498234063386917, "grad_norm_var": 3.441562934707029e-06, "learning_rate": 0.006039482668689341, "loss": 2.5562, "step": 12266 }, { "crossentropy": 2.7068934440612793, "epoch": 0.4447143271461717, "grad_norm": 0.02840493991971016, "grad_norm_var": 3.4314732430219904e-06, "learning_rate": 0.006038914270238793, "loss": 2.7046, "step": 12267 }, { "crossentropy": 2.507463216781616, "epoch": 0.4447505800464037, "grad_norm": 0.027148479595780373, "grad_norm_var": 3.47969405445161e-06, "learning_rate": 0.006038345857756133, "loss": 2.6388, "step": 12268 }, { "crossentropy": 2.7056310176849365, "epoch": 0.44478683294663574, "grad_norm": 0.02700651064515114, "grad_norm_var": 3.3934748839442933e-06, "learning_rate": 0.006037777431249037, "loss": 2.5243, "step": 12269 }, { "crossentropy": 2.518129348754883, "epoch": 0.44482308584686775, "grad_norm": 0.025723323225975037, "grad_norm_var": 3.7133259202321024e-06, "learning_rate": 0.00603720899072518, "loss": 2.5486, "step": 12270 }, { "crossentropy": 2.728214740753174, "epoch": 0.44485933874709976, "grad_norm": 0.030078282579779625, "grad_norm_var": 3.984092068057609e-06, "learning_rate": 0.006036640536192242, "loss": 2.7183, "step": 12271 }, { "crossentropy": 2.571843147277832, "epoch": 0.4448955916473318, "grad_norm": 0.02902918867766857, "grad_norm_var": 4.01266470134992e-06, "learning_rate": 0.006036072067657902, "loss": 2.5813, "step": 12272 }, { "crossentropy": 2.5813488960266113, "epoch": 0.4449318445475638, "grad_norm": 0.03570007160305977, "grad_norm_var": 7.723678965285083e-06, "learning_rate": 0.0060355035851298346, "loss": 2.5593, "step": 12273 }, { "crossentropy": 2.59641695022583, "epoch": 0.4449680974477958, "grad_norm": 0.03354048356413841, "grad_norm_var": 8.120915332338457e-06, "learning_rate": 0.006034935088615721, "loss": 2.5961, "step": 12274 }, { "crossentropy": 2.503882884979248, "epoch": 0.4450043503480278, "grad_norm": 0.027300050482153893, "grad_norm_var": 7.504112275880321e-06, "learning_rate": 0.006034366578123236, "loss": 2.5487, "step": 12275 }, { "crossentropy": 2.585949420928955, "epoch": 0.44504060324825984, "grad_norm": 0.028377745300531387, "grad_norm_var": 7.4220408823192155e-06, "learning_rate": 0.006033798053660063, "loss": 2.4965, "step": 12276 }, { "crossentropy": 2.42577862739563, "epoch": 0.4450768561484919, "grad_norm": 0.026997635141015053, "grad_norm_var": 7.293147858452622e-06, "learning_rate": 0.006033229515233879, "loss": 2.4832, "step": 12277 }, { "crossentropy": 2.6556766033172607, "epoch": 0.4451131090487239, "grad_norm": 0.026496635749936104, "grad_norm_var": 7.210426885158328e-06, "learning_rate": 0.006032660962852361, "loss": 2.6533, "step": 12278 }, { "crossentropy": 2.559159517288208, "epoch": 0.44514936194895594, "grad_norm": 0.028435438871383667, "grad_norm_var": 7.070018217506431e-06, "learning_rate": 0.006032092396523189, "loss": 2.5934, "step": 12279 }, { "crossentropy": 2.585808277130127, "epoch": 0.44518561484918795, "grad_norm": 0.028126094490289688, "grad_norm_var": 6.857050223404935e-06, "learning_rate": 0.006031523816254044, "loss": 2.5642, "step": 12280 }, { "crossentropy": 2.577816963195801, "epoch": 0.44522186774941996, "grad_norm": 0.026460759341716766, "grad_norm_var": 6.987963932728915e-06, "learning_rate": 0.006030955222052606, "loss": 2.5601, "step": 12281 }, { "crossentropy": 2.5694546699523926, "epoch": 0.445258120649652, "grad_norm": 0.027400633320212364, "grad_norm_var": 7.001859134032881e-06, "learning_rate": 0.006030386613926551, "loss": 2.5717, "step": 12282 }, { "crossentropy": 2.5993478298187256, "epoch": 0.445294373549884, "grad_norm": 0.026872705668210983, "grad_norm_var": 7.170902758818126e-06, "learning_rate": 0.00602981799188356, "loss": 2.5905, "step": 12283 }, { "crossentropy": 2.5347118377685547, "epoch": 0.445330626450116, "grad_norm": 0.027578093111515045, "grad_norm_var": 7.109696210430503e-06, "learning_rate": 0.006029249355931317, "loss": 2.5286, "step": 12284 }, { "crossentropy": 2.4784038066864014, "epoch": 0.445366879350348, "grad_norm": 0.027819976210594177, "grad_norm_var": 6.995007803780688e-06, "learning_rate": 0.006028680706077498, "loss": 2.4809, "step": 12285 }, { "crossentropy": 2.40712308883667, "epoch": 0.44540313225058004, "grad_norm": 0.026509739458560944, "grad_norm_var": 6.7429232339787095e-06, "learning_rate": 0.006028112042329785, "loss": 2.5172, "step": 12286 }, { "crossentropy": 2.4631094932556152, "epoch": 0.44543938515081205, "grad_norm": 0.028879081830382347, "grad_norm_var": 6.587676882658053e-06, "learning_rate": 0.006027543364695859, "loss": 2.5813, "step": 12287 }, { "crossentropy": 2.6153407096862793, "epoch": 0.44547563805104406, "grad_norm": 0.030807247385382652, "grad_norm_var": 6.917775353584381e-06, "learning_rate": 0.0060269746731834006, "loss": 2.5681, "step": 12288 }, { "crossentropy": 2.489804744720459, "epoch": 0.4455118909512761, "grad_norm": 0.027489813044667244, "grad_norm_var": 3.337978446621874e-06, "learning_rate": 0.006026405967800091, "loss": 2.5384, "step": 12289 }, { "crossentropy": 2.590421676635742, "epoch": 0.4455481438515081, "grad_norm": 0.026670070365071297, "grad_norm_var": 1.2752797273635556e-06, "learning_rate": 0.006025837248553612, "loss": 2.5972, "step": 12290 }, { "crossentropy": 2.6264100074768066, "epoch": 0.44558439675174016, "grad_norm": 0.02535409852862358, "grad_norm_var": 1.5998572436003254e-06, "learning_rate": 0.006025268515451641, "loss": 2.5048, "step": 12291 }, { "crossentropy": 2.6622133255004883, "epoch": 0.4456206496519722, "grad_norm": 0.027619322761893272, "grad_norm_var": 1.548790202153018e-06, "learning_rate": 0.006024699768501864, "loss": 2.5624, "step": 12292 }, { "crossentropy": 2.624886989593506, "epoch": 0.4456569025522042, "grad_norm": 0.028107207268476486, "grad_norm_var": 1.5558785921102365e-06, "learning_rate": 0.006024131007711965, "loss": 2.5201, "step": 12293 }, { "crossentropy": 2.58078932762146, "epoch": 0.4456931554524362, "grad_norm": 0.026383308693766594, "grad_norm_var": 1.572434445728946e-06, "learning_rate": 0.006023562233089619, "loss": 2.6171, "step": 12294 }, { "crossentropy": 2.4690330028533936, "epoch": 0.4457294083526682, "grad_norm": 0.02774396724998951, "grad_norm_var": 1.5190332769517955e-06, "learning_rate": 0.006022993444642512, "loss": 2.6045, "step": 12295 }, { "crossentropy": 2.4918017387390137, "epoch": 0.44576566125290024, "grad_norm": 0.02729911357164383, "grad_norm_var": 1.4915152372359704e-06, "learning_rate": 0.006022424642378327, "loss": 2.5431, "step": 12296 }, { "crossentropy": 2.5394272804260254, "epoch": 0.44580191415313225, "grad_norm": 0.026833143085241318, "grad_norm_var": 1.451700866477758e-06, "learning_rate": 0.006021855826304746, "loss": 2.5627, "step": 12297 }, { "crossentropy": 2.575066566467285, "epoch": 0.44583816705336426, "grad_norm": 0.028175700455904007, "grad_norm_var": 1.48306276378933e-06, "learning_rate": 0.006021286996429452, "loss": 2.5675, "step": 12298 }, { "crossentropy": 2.6507277488708496, "epoch": 0.4458744199535963, "grad_norm": 0.02958899736404419, "grad_norm_var": 1.7137866113187214e-06, "learning_rate": 0.006020718152760124, "loss": 2.5649, "step": 12299 }, { "crossentropy": 2.5874154567718506, "epoch": 0.4459106728538283, "grad_norm": 0.029560737311840057, "grad_norm_var": 1.9328760800350983e-06, "learning_rate": 0.00602014929530445, "loss": 2.5733, "step": 12300 }, { "crossentropy": 2.6910760402679443, "epoch": 0.4459469257540603, "grad_norm": 0.028560789301991463, "grad_norm_var": 1.9688931344442885e-06, "learning_rate": 0.006019580424070113, "loss": 2.6161, "step": 12301 }, { "crossentropy": 2.5293915271759033, "epoch": 0.4459831786542923, "grad_norm": 0.030148174613714218, "grad_norm_var": 2.146623333740214e-06, "learning_rate": 0.006019011539064795, "loss": 2.601, "step": 12302 }, { "crossentropy": 2.58390736579895, "epoch": 0.44601943155452434, "grad_norm": 0.034392233937978745, "grad_norm_var": 4.636416912330562e-06, "learning_rate": 0.006018442640296178, "loss": 2.52, "step": 12303 }, { "crossentropy": 2.480971336364746, "epoch": 0.4460556844547564, "grad_norm": 0.029258400201797485, "grad_norm_var": 4.2935320986661175e-06, "learning_rate": 0.006017873727771948, "loss": 2.4958, "step": 12304 }, { "crossentropy": 2.7432165145874023, "epoch": 0.4460919373549884, "grad_norm": 0.028760196641087532, "grad_norm_var": 4.253089520837803e-06, "learning_rate": 0.00601730480149979, "loss": 2.668, "step": 12305 }, { "crossentropy": 2.5699241161346436, "epoch": 0.44612819025522044, "grad_norm": 0.027831867337226868, "grad_norm_var": 4.0689363917655125e-06, "learning_rate": 0.006016735861487385, "loss": 2.4842, "step": 12306 }, { "crossentropy": 2.6994364261627197, "epoch": 0.44616444315545245, "grad_norm": 0.029211461544036865, "grad_norm_var": 3.393208063037995e-06, "learning_rate": 0.006016166907742419, "loss": 2.6885, "step": 12307 }, { "crossentropy": 2.607391834259033, "epoch": 0.44620069605568446, "grad_norm": 0.027852775529026985, "grad_norm_var": 3.3624417870455147e-06, "learning_rate": 0.0060155979402725795, "loss": 2.6409, "step": 12308 }, { "crossentropy": 2.574449300765991, "epoch": 0.4462369489559165, "grad_norm": 0.030056430026888847, "grad_norm_var": 3.437591030942301e-06, "learning_rate": 0.006015028959085547, "loss": 2.4686, "step": 12309 }, { "crossentropy": 2.5710456371307373, "epoch": 0.4462732018561485, "grad_norm": 0.031596358865499496, "grad_norm_var": 3.4190637864282e-06, "learning_rate": 0.006014459964189009, "loss": 2.5757, "step": 12310 }, { "crossentropy": 2.5310418605804443, "epoch": 0.4463094547563805, "grad_norm": 0.02787935920059681, "grad_norm_var": 3.394296727119679e-06, "learning_rate": 0.0060138909555906474, "loss": 2.5346, "step": 12311 }, { "crossentropy": 2.4770193099975586, "epoch": 0.4463457076566125, "grad_norm": 0.026754209771752357, "grad_norm_var": 3.550078821934419e-06, "learning_rate": 0.006013321933298152, "loss": 2.5823, "step": 12312 }, { "crossentropy": 2.7098798751831055, "epoch": 0.44638196055684454, "grad_norm": 0.026745472103357315, "grad_norm_var": 3.5776864712538357e-06, "learning_rate": 0.006012752897319205, "loss": 2.6542, "step": 12313 }, { "crossentropy": 2.580232858657837, "epoch": 0.44641821345707655, "grad_norm": 0.025900699198246002, "grad_norm_var": 4.196192320959219e-06, "learning_rate": 0.006012183847661494, "loss": 2.5427, "step": 12314 }, { "crossentropy": 2.5530219078063965, "epoch": 0.44645446635730857, "grad_norm": 0.02598675899207592, "grad_norm_var": 4.727252100238674e-06, "learning_rate": 0.006011614784332703, "loss": 2.5265, "step": 12315 }, { "crossentropy": 2.5867741107940674, "epoch": 0.4464907192575406, "grad_norm": 0.02708153799176216, "grad_norm_var": 4.853652476180742e-06, "learning_rate": 0.00601104570734052, "loss": 2.6714, "step": 12316 }, { "crossentropy": 2.571597099304199, "epoch": 0.4465269721577726, "grad_norm": 0.029545482248067856, "grad_norm_var": 4.9056861186849135e-06, "learning_rate": 0.006010476616692631, "loss": 2.5576, "step": 12317 }, { "crossentropy": 2.7702934741973877, "epoch": 0.44656322505800466, "grad_norm": 0.02787131629884243, "grad_norm_var": 4.786285045431946e-06, "learning_rate": 0.0060099075123967215, "loss": 2.6819, "step": 12318 }, { "crossentropy": 2.606182098388672, "epoch": 0.4465994779582367, "grad_norm": 0.03188352286815643, "grad_norm_var": 3.2238629250337283e-06, "learning_rate": 0.006009338394460476, "loss": 2.5407, "step": 12319 }, { "crossentropy": 2.4730961322784424, "epoch": 0.4466357308584687, "grad_norm": 0.03138788789510727, "grad_norm_var": 3.7542777091600775e-06, "learning_rate": 0.006008769262891587, "loss": 2.4729, "step": 12320 }, { "crossentropy": 2.6316909790039062, "epoch": 0.4466719837587007, "grad_norm": 0.02719835564494133, "grad_norm_var": 3.857046817102155e-06, "learning_rate": 0.006008200117697736, "loss": 2.6248, "step": 12321 }, { "crossentropy": 2.619779586791992, "epoch": 0.4467082366589327, "grad_norm": 0.026842977851629257, "grad_norm_var": 3.996235395934087e-06, "learning_rate": 0.006007630958886615, "loss": 2.5673, "step": 12322 }, { "crossentropy": 2.575497627258301, "epoch": 0.44674448955916474, "grad_norm": 0.02743423916399479, "grad_norm_var": 3.992390511208888e-06, "learning_rate": 0.0060070617864659065, "loss": 2.5038, "step": 12323 }, { "crossentropy": 2.596928596496582, "epoch": 0.44678074245939675, "grad_norm": 0.028670325875282288, "grad_norm_var": 3.990746229503665e-06, "learning_rate": 0.0060064926004433005, "loss": 2.5941, "step": 12324 }, { "crossentropy": 2.4818239212036133, "epoch": 0.44681699535962877, "grad_norm": 0.026641398668289185, "grad_norm_var": 3.920874367903378e-06, "learning_rate": 0.0060059234008264855, "loss": 2.5761, "step": 12325 }, { "crossentropy": 2.6297335624694824, "epoch": 0.4468532482598608, "grad_norm": 0.02706422470510006, "grad_norm_var": 3.0850418323124495e-06, "learning_rate": 0.006005354187623148, "loss": 2.6392, "step": 12326 }, { "crossentropy": 2.5953474044799805, "epoch": 0.4468895011600928, "grad_norm": 0.02630760334432125, "grad_norm_var": 3.2239613451746457e-06, "learning_rate": 0.006004784960840976, "loss": 2.5321, "step": 12327 }, { "crossentropy": 2.6133759021759033, "epoch": 0.4469257540603248, "grad_norm": 0.02649799920618534, "grad_norm_var": 3.2606213097756872e-06, "learning_rate": 0.006004215720487659, "loss": 2.5663, "step": 12328 }, { "crossentropy": 2.431302785873413, "epoch": 0.4469620069605568, "grad_norm": 0.029213622212409973, "grad_norm_var": 3.3301177819706923e-06, "learning_rate": 0.006003646466570885, "loss": 2.492, "step": 12329 }, { "crossentropy": 2.621616840362549, "epoch": 0.44699825986078884, "grad_norm": 0.029811391606926918, "grad_norm_var": 3.271895004688001e-06, "learning_rate": 0.006003077199098341, "loss": 2.615, "step": 12330 }, { "crossentropy": 2.5322177410125732, "epoch": 0.4470345127610209, "grad_norm": 0.028741799294948578, "grad_norm_var": 2.973714732924561e-06, "learning_rate": 0.006002507918077719, "loss": 2.6108, "step": 12331 }, { "crossentropy": 2.7958993911743164, "epoch": 0.4470707656612529, "grad_norm": 0.028150852769613266, "grad_norm_var": 2.8768596206045956e-06, "learning_rate": 0.006001938623516705, "loss": 2.7172, "step": 12332 }, { "crossentropy": 2.6152842044830322, "epoch": 0.44710701856148494, "grad_norm": 0.02770290896296501, "grad_norm_var": 2.7901755043353387e-06, "learning_rate": 0.006001369315422988, "loss": 2.5472, "step": 12333 }, { "crossentropy": 2.6943485736846924, "epoch": 0.44714327146171695, "grad_norm": 0.029087848961353302, "grad_norm_var": 2.827123926714169e-06, "learning_rate": 0.006000799993804262, "loss": 2.6353, "step": 12334 }, { "crossentropy": 2.671478748321533, "epoch": 0.44717952436194897, "grad_norm": 0.0266610998660326, "grad_norm_var": 2.02934509428056e-06, "learning_rate": 0.006000230658668211, "loss": 2.602, "step": 12335 }, { "crossentropy": 2.6095163822174072, "epoch": 0.447215777262181, "grad_norm": 0.027258653193712234, "grad_norm_var": 1.209609076150327e-06, "learning_rate": 0.005999661310022528, "loss": 2.5351, "step": 12336 }, { "crossentropy": 2.5186309814453125, "epoch": 0.447252030162413, "grad_norm": 0.028646614402532578, "grad_norm_var": 1.242802318113708e-06, "learning_rate": 0.005999091947874902, "loss": 2.5697, "step": 12337 }, { "crossentropy": 2.536921262741089, "epoch": 0.447288283062645, "grad_norm": 0.030278325080871582, "grad_norm_var": 1.5439445630820054e-06, "learning_rate": 0.005998522572233023, "loss": 2.5684, "step": 12338 }, { "crossentropy": 2.7157981395721436, "epoch": 0.447324535962877, "grad_norm": 0.02686847746372223, "grad_norm_var": 1.607424409341103e-06, "learning_rate": 0.00599795318310458, "loss": 2.6998, "step": 12339 }, { "crossentropy": 2.6452555656433105, "epoch": 0.44736078886310904, "grad_norm": 0.026658402755856514, "grad_norm_var": 1.6739411864946074e-06, "learning_rate": 0.005997383780497268, "loss": 2.5409, "step": 12340 }, { "crossentropy": 2.561561346054077, "epoch": 0.44739704176334105, "grad_norm": 0.02626335248351097, "grad_norm_var": 1.7437669159887e-06, "learning_rate": 0.005996814364418771, "loss": 2.6345, "step": 12341 }, { "crossentropy": 2.4802610874176025, "epoch": 0.44743329466357307, "grad_norm": 0.027861420065164566, "grad_norm_var": 1.7025345381573107e-06, "learning_rate": 0.005996244934876786, "loss": 2.5004, "step": 12342 }, { "crossentropy": 2.5396363735198975, "epoch": 0.4474695475638051, "grad_norm": 0.027791732922196388, "grad_norm_var": 1.5299086701217689e-06, "learning_rate": 0.005995675491878999, "loss": 2.6097, "step": 12343 }, { "crossentropy": 2.5826632976531982, "epoch": 0.4475058004640371, "grad_norm": 0.02913350984454155, "grad_norm_var": 1.4473252921263017e-06, "learning_rate": 0.005995106035433104, "loss": 2.5903, "step": 12344 }, { "crossentropy": 2.61454439163208, "epoch": 0.44754205336426917, "grad_norm": 0.027280889451503754, "grad_norm_var": 1.4023498308210826e-06, "learning_rate": 0.005994536565546793, "loss": 2.5373, "step": 12345 }, { "crossentropy": 2.56550669670105, "epoch": 0.4475783062645012, "grad_norm": 0.027166012674570084, "grad_norm_var": 1.2051667111297963e-06, "learning_rate": 0.005993967082227756, "loss": 2.6291, "step": 12346 }, { "crossentropy": 2.5590896606445312, "epoch": 0.4476145591647332, "grad_norm": 0.026801295578479767, "grad_norm_var": 1.2089974166221933e-06, "learning_rate": 0.005993397585483685, "loss": 2.6183, "step": 12347 }, { "crossentropy": 2.531266689300537, "epoch": 0.4476508120649652, "grad_norm": 0.027724217623472214, "grad_norm_var": 1.196189530033156e-06, "learning_rate": 0.005992828075322272, "loss": 2.4874, "step": 12348 }, { "crossentropy": 2.6221446990966797, "epoch": 0.4476870649651972, "grad_norm": 0.030444292351603508, "grad_norm_var": 1.667299863247762e-06, "learning_rate": 0.005992258551751209, "loss": 2.582, "step": 12349 }, { "crossentropy": 2.70528507232666, "epoch": 0.44772331786542924, "grad_norm": 0.029925428330898285, "grad_norm_var": 1.8471091962161494e-06, "learning_rate": 0.005991689014778189, "loss": 2.5956, "step": 12350 }, { "crossentropy": 2.700836658477783, "epoch": 0.44775957076566125, "grad_norm": 0.028802255168557167, "grad_norm_var": 1.7734630629001138e-06, "learning_rate": 0.005991119464410903, "loss": 2.6095, "step": 12351 }, { "crossentropy": 2.680804967880249, "epoch": 0.44779582366589327, "grad_norm": 0.028972284868359566, "grad_norm_var": 1.774688441897792e-06, "learning_rate": 0.005990549900657047, "loss": 2.6585, "step": 12352 }, { "crossentropy": 2.672360897064209, "epoch": 0.4478320765661253, "grad_norm": 0.03066333755850792, "grad_norm_var": 2.158752242526031e-06, "learning_rate": 0.005989980323524309, "loss": 2.7051, "step": 12353 }, { "crossentropy": 2.5116522312164307, "epoch": 0.4478683294663573, "grad_norm": 0.02912677824497223, "grad_norm_var": 1.9362986721748922e-06, "learning_rate": 0.005989410733020384, "loss": 2.5795, "step": 12354 }, { "crossentropy": 2.6039161682128906, "epoch": 0.4479045823665893, "grad_norm": 0.029334092512726784, "grad_norm_var": 1.8726871374214841e-06, "learning_rate": 0.005988841129152965, "loss": 2.6351, "step": 12355 }, { "crossentropy": 2.503941535949707, "epoch": 0.4479408352668213, "grad_norm": 0.028820626437664032, "grad_norm_var": 1.6709123744005807e-06, "learning_rate": 0.005988271511929749, "loss": 2.5335, "step": 12356 }, { "crossentropy": 2.7635786533355713, "epoch": 0.44797708816705334, "grad_norm": 0.028633346781134605, "grad_norm_var": 1.3129853849181787e-06, "learning_rate": 0.005987701881358425, "loss": 2.6833, "step": 12357 }, { "crossentropy": 2.601895332336426, "epoch": 0.4480133410672854, "grad_norm": 0.02882939949631691, "grad_norm_var": 1.2691120911794325e-06, "learning_rate": 0.005987132237446688, "loss": 2.576, "step": 12358 }, { "crossentropy": 2.718440294265747, "epoch": 0.4480495939675174, "grad_norm": 0.02717423066496849, "grad_norm_var": 1.369008724777275e-06, "learning_rate": 0.005986562580202232, "loss": 2.6727, "step": 12359 }, { "crossentropy": 2.597074031829834, "epoch": 0.44808584686774944, "grad_norm": 0.028251662850379944, "grad_norm_var": 1.3639358450617899e-06, "learning_rate": 0.005985992909632752, "loss": 2.5547, "step": 12360 }, { "crossentropy": 2.707456588745117, "epoch": 0.44812209976798145, "grad_norm": 0.0276438370347023, "grad_norm_var": 1.307274245315607e-06, "learning_rate": 0.00598542322574594, "loss": 2.6995, "step": 12361 }, { "crossentropy": 2.711125135421753, "epoch": 0.44815835266821347, "grad_norm": 0.026318540796637535, "grad_norm_var": 1.519233561452643e-06, "learning_rate": 0.005984853528549493, "loss": 2.6436, "step": 12362 }, { "crossentropy": 2.5703823566436768, "epoch": 0.4481946055684455, "grad_norm": 0.02718120999634266, "grad_norm_var": 1.4375660852163855e-06, "learning_rate": 0.005984283818051104, "loss": 2.531, "step": 12363 }, { "crossentropy": 2.705895185470581, "epoch": 0.4482308584686775, "grad_norm": 0.027102379128336906, "grad_norm_var": 1.5356188614390823e-06, "learning_rate": 0.00598371409425847, "loss": 2.6813, "step": 12364 }, { "crossentropy": 2.4515764713287354, "epoch": 0.4482671113689095, "grad_norm": 0.026548027992248535, "grad_norm_var": 1.5140922376041434e-06, "learning_rate": 0.005983144357179282, "loss": 2.5387, "step": 12365 }, { "crossentropy": 2.6534364223480225, "epoch": 0.4483033642691415, "grad_norm": 0.02809298038482666, "grad_norm_var": 1.3348779758390647e-06, "learning_rate": 0.005982574606821238, "loss": 2.648, "step": 12366 }, { "crossentropy": 2.5307281017303467, "epoch": 0.44833961716937354, "grad_norm": 0.02727080136537552, "grad_norm_var": 1.3622502972593722e-06, "learning_rate": 0.005982004843192033, "loss": 2.6145, "step": 12367 }, { "crossentropy": 2.598241090774536, "epoch": 0.44837587006960555, "grad_norm": 0.027498897165060043, "grad_norm_var": 1.331031495509791e-06, "learning_rate": 0.005981435066299363, "loss": 2.5933, "step": 12368 }, { "crossentropy": 2.727673292160034, "epoch": 0.44841212296983757, "grad_norm": 0.03243381157517433, "grad_norm_var": 2.1484270031957153e-06, "learning_rate": 0.0059808652761509234, "loss": 2.6096, "step": 12369 }, { "crossentropy": 2.652935028076172, "epoch": 0.4484483758700696, "grad_norm": 0.02771814912557602, "grad_norm_var": 2.0873498881707746e-06, "learning_rate": 0.005980295472754409, "loss": 2.5945, "step": 12370 }, { "crossentropy": 2.660172462463379, "epoch": 0.4484846287703016, "grad_norm": 0.026587801054120064, "grad_norm_var": 2.0897231198135766e-06, "learning_rate": 0.005979725656117515, "loss": 2.6369, "step": 12371 }, { "crossentropy": 2.506124496459961, "epoch": 0.44852088167053367, "grad_norm": 0.026955094188451767, "grad_norm_var": 2.073666650326252e-06, "learning_rate": 0.005979155826247941, "loss": 2.4953, "step": 12372 }, { "crossentropy": 2.6081581115722656, "epoch": 0.4485571345707657, "grad_norm": 0.02802075259387493, "grad_norm_var": 2.026196101429289e-06, "learning_rate": 0.005978585983153383, "loss": 2.5746, "step": 12373 }, { "crossentropy": 2.595834732055664, "epoch": 0.4485933874709977, "grad_norm": 0.02668783999979496, "grad_norm_var": 1.9979789080642092e-06, "learning_rate": 0.005978016126841535, "loss": 2.5865, "step": 12374 }, { "crossentropy": 2.5477967262268066, "epoch": 0.4486296403712297, "grad_norm": 0.028307385742664337, "grad_norm_var": 2.014979424121097e-06, "learning_rate": 0.005977446257320093, "loss": 2.5649, "step": 12375 }, { "crossentropy": 2.567995309829712, "epoch": 0.4486658932714617, "grad_norm": 0.027139805257320404, "grad_norm_var": 2.005079234114413e-06, "learning_rate": 0.005976876374596758, "loss": 2.6019, "step": 12376 }, { "crossentropy": 2.6205358505249023, "epoch": 0.44870214617169374, "grad_norm": 0.03247798979282379, "grad_norm_var": 3.497632955801131e-06, "learning_rate": 0.005976306478679225, "loss": 2.6621, "step": 12377 }, { "crossentropy": 2.7142109870910645, "epoch": 0.44873839907192575, "grad_norm": 0.026567088440060616, "grad_norm_var": 3.4492061274725426e-06, "learning_rate": 0.005975736569575192, "loss": 2.5823, "step": 12378 }, { "crossentropy": 2.700039863586426, "epoch": 0.44877465197215777, "grad_norm": 0.02814261056482792, "grad_norm_var": 3.413312641060984e-06, "learning_rate": 0.005975166647292354, "loss": 2.7172, "step": 12379 }, { "crossentropy": 2.4984686374664307, "epoch": 0.4488109048723898, "grad_norm": 0.028109503909945488, "grad_norm_var": 3.359935762374533e-06, "learning_rate": 0.0059745967118384116, "loss": 2.5804, "step": 12380 }, { "crossentropy": 2.5687074661254883, "epoch": 0.4488471577726218, "grad_norm": 0.026957690715789795, "grad_norm_var": 3.2892087875199817e-06, "learning_rate": 0.005974026763221062, "loss": 2.528, "step": 12381 }, { "crossentropy": 2.3960766792297363, "epoch": 0.4488834106728538, "grad_norm": 0.02701731026172638, "grad_norm_var": 3.35686881153587e-06, "learning_rate": 0.005973456801448003, "loss": 2.5133, "step": 12382 }, { "crossentropy": 2.538184642791748, "epoch": 0.4489196635730858, "grad_norm": 0.029726818203926086, "grad_norm_var": 3.497279629494319e-06, "learning_rate": 0.005972886826526931, "loss": 2.6319, "step": 12383 }, { "crossentropy": 2.542799711227417, "epoch": 0.44895591647331784, "grad_norm": 0.02988966926932335, "grad_norm_var": 3.6479898552065073e-06, "learning_rate": 0.005972316838465547, "loss": 2.5536, "step": 12384 }, { "crossentropy": 2.6516568660736084, "epoch": 0.4489921693735499, "grad_norm": 0.03067038208246231, "grad_norm_var": 2.8694953482006137e-06, "learning_rate": 0.005971746837271549, "loss": 2.5728, "step": 12385 }, { "crossentropy": 2.6523990631103516, "epoch": 0.4490284222737819, "grad_norm": 0.0299200639128685, "grad_norm_var": 3.035168439057857e-06, "learning_rate": 0.005971176822952636, "loss": 2.6678, "step": 12386 }, { "crossentropy": 2.5412983894348145, "epoch": 0.44906467517401394, "grad_norm": 0.032169878482818604, "grad_norm_var": 3.6907179176661005e-06, "learning_rate": 0.005970606795516503, "loss": 2.6587, "step": 12387 }, { "crossentropy": 2.539982318878174, "epoch": 0.44910092807424595, "grad_norm": 0.02793779969215393, "grad_norm_var": 3.5260485349834174e-06, "learning_rate": 0.005970036754970855, "loss": 2.5051, "step": 12388 }, { "crossentropy": 2.663684129714966, "epoch": 0.44913718097447797, "grad_norm": 0.02745121158659458, "grad_norm_var": 3.600478550514494e-06, "learning_rate": 0.005969466701323387, "loss": 2.7135, "step": 12389 }, { "crossentropy": 2.6498069763183594, "epoch": 0.44917343387471, "grad_norm": 0.026675518602132797, "grad_norm_var": 3.603790954808707e-06, "learning_rate": 0.005968896634581802, "loss": 2.6776, "step": 12390 }, { "crossentropy": 2.602701187133789, "epoch": 0.449209686774942, "grad_norm": 0.031536705791950226, "grad_norm_var": 4.087579313262421e-06, "learning_rate": 0.005968326554753795, "loss": 2.5922, "step": 12391 }, { "crossentropy": 2.597642183303833, "epoch": 0.449245939675174, "grad_norm": 0.029153242707252502, "grad_norm_var": 3.868578030978044e-06, "learning_rate": 0.00596775646184707, "loss": 2.5814, "step": 12392 }, { "crossentropy": 2.601095199584961, "epoch": 0.449282192575406, "grad_norm": 0.026758486405014992, "grad_norm_var": 3.2800374499675993e-06, "learning_rate": 0.005967186355869326, "loss": 2.5549, "step": 12393 }, { "crossentropy": 2.4464972019195557, "epoch": 0.44931844547563804, "grad_norm": 0.025905421003699303, "grad_norm_var": 3.4927253269654518e-06, "learning_rate": 0.005966616236828263, "loss": 2.6206, "step": 12394 }, { "crossentropy": 2.6685538291931152, "epoch": 0.44935469837587005, "grad_norm": 0.028124192729592323, "grad_norm_var": 3.4939345618970833e-06, "learning_rate": 0.0059660461047315785, "loss": 2.6841, "step": 12395 }, { "crossentropy": 2.500042676925659, "epoch": 0.44939095127610207, "grad_norm": 0.026489030569791794, "grad_norm_var": 3.769487719421094e-06, "learning_rate": 0.005965475959586977, "loss": 2.4838, "step": 12396 }, { "crossentropy": 2.522035837173462, "epoch": 0.4494272041763341, "grad_norm": 0.025312427431344986, "grad_norm_var": 4.282259320653216e-06, "learning_rate": 0.005964905801402156, "loss": 2.5093, "step": 12397 }, { "crossentropy": 2.5982282161712646, "epoch": 0.44946345707656615, "grad_norm": 0.02957891672849655, "grad_norm_var": 4.212900707999528e-06, "learning_rate": 0.0059643356301848196, "loss": 2.6058, "step": 12398 }, { "crossentropy": 2.5494513511657715, "epoch": 0.44949970997679817, "grad_norm": 0.02869241312146187, "grad_norm_var": 4.121775758179439e-06, "learning_rate": 0.005963765445942666, "loss": 2.6108, "step": 12399 }, { "crossentropy": 2.486905336380005, "epoch": 0.4495359628770302, "grad_norm": 0.026295386254787445, "grad_norm_var": 4.271171365401577e-06, "learning_rate": 0.005963195248683398, "loss": 2.5475, "step": 12400 }, { "crossentropy": 2.688107967376709, "epoch": 0.4495722157772622, "grad_norm": 0.027080323547124863, "grad_norm_var": 3.938205486820704e-06, "learning_rate": 0.005962625038414715, "loss": 2.6918, "step": 12401 }, { "crossentropy": 2.4908647537231445, "epoch": 0.4496084686774942, "grad_norm": 0.026213085278868675, "grad_norm_var": 3.881437301516072e-06, "learning_rate": 0.005962054815144322, "loss": 2.5063, "step": 12402 }, { "crossentropy": 2.64473819732666, "epoch": 0.4496447215777262, "grad_norm": 0.03076210804283619, "grad_norm_var": 3.191797105399352e-06, "learning_rate": 0.0059614845788799175, "loss": 2.5985, "step": 12403 }, { "crossentropy": 2.5891149044036865, "epoch": 0.44968097447795824, "grad_norm": 0.028038039803504944, "grad_norm_var": 3.194963294916401e-06, "learning_rate": 0.005960914329629205, "loss": 2.5928, "step": 12404 }, { "crossentropy": 2.626882553100586, "epoch": 0.44971722737819025, "grad_norm": 0.026525184512138367, "grad_norm_var": 3.2859634143304517e-06, "learning_rate": 0.005960344067399886, "loss": 2.6176, "step": 12405 }, { "crossentropy": 2.533761978149414, "epoch": 0.44975348027842227, "grad_norm": 0.027325162664055824, "grad_norm_var": 3.223923194481335e-06, "learning_rate": 0.005959773792199665, "loss": 2.6186, "step": 12406 }, { "crossentropy": 2.553178071975708, "epoch": 0.4497897331786543, "grad_norm": 0.027619773522019386, "grad_norm_var": 2.198334080463872e-06, "learning_rate": 0.0059592035040362396, "loss": 2.5611, "step": 12407 }, { "crossentropy": 2.665472984313965, "epoch": 0.4498259860788863, "grad_norm": 0.030940426513552666, "grad_norm_var": 2.7938023908315062e-06, "learning_rate": 0.005958633202917316, "loss": 2.6295, "step": 12408 }, { "crossentropy": 2.784904718399048, "epoch": 0.4498622389791183, "grad_norm": 0.027748020365834236, "grad_norm_var": 2.743475592889971e-06, "learning_rate": 0.005958062888850597, "loss": 2.706, "step": 12409 }, { "crossentropy": 2.69329571723938, "epoch": 0.4498984918793503, "grad_norm": 0.02737867459654808, "grad_norm_var": 2.5333678499249764e-06, "learning_rate": 0.005957492561843785, "loss": 2.6905, "step": 12410 }, { "crossentropy": 2.609678268432617, "epoch": 0.44993474477958234, "grad_norm": 0.028991933912038803, "grad_norm_var": 2.622831798504826e-06, "learning_rate": 0.005956922221904581, "loss": 2.6318, "step": 12411 }, { "crossentropy": 2.7200851440429688, "epoch": 0.4499709976798144, "grad_norm": 0.029258567839860916, "grad_norm_var": 2.6137179847636354e-06, "learning_rate": 0.005956351869040692, "loss": 2.6165, "step": 12412 }, { "crossentropy": 2.6168465614318848, "epoch": 0.4500072505800464, "grad_norm": 0.028431832790374756, "grad_norm_var": 2.110296259181845e-06, "learning_rate": 0.0059557815032598195, "loss": 2.5614, "step": 12413 }, { "crossentropy": 2.6662495136260986, "epoch": 0.45004350348027844, "grad_norm": 0.030857175588607788, "grad_norm_var": 2.4508431631926435e-06, "learning_rate": 0.005955211124569667, "loss": 2.6102, "step": 12414 }, { "crossentropy": 2.6092097759246826, "epoch": 0.45007975638051045, "grad_norm": 0.027209723368287086, "grad_norm_var": 2.5027332464199927e-06, "learning_rate": 0.0059546407329779385, "loss": 2.5934, "step": 12415 }, { "crossentropy": 2.575901985168457, "epoch": 0.45011600928074247, "grad_norm": 0.02849031612277031, "grad_norm_var": 2.2560365820941854e-06, "learning_rate": 0.005954070328492339, "loss": 2.6166, "step": 12416 }, { "crossentropy": 2.580606698989868, "epoch": 0.4501522621809745, "grad_norm": 0.026937566697597504, "grad_norm_var": 2.280609613736319e-06, "learning_rate": 0.0059534999111205715, "loss": 2.5955, "step": 12417 }, { "crossentropy": 2.5112192630767822, "epoch": 0.4501885150812065, "grad_norm": 0.026860862970352173, "grad_norm_var": 2.1269789075679483e-06, "learning_rate": 0.00595292948087034, "loss": 2.549, "step": 12418 }, { "crossentropy": 2.717417001724243, "epoch": 0.4502247679814385, "grad_norm": 0.02749415673315525, "grad_norm_var": 1.7373104480608117e-06, "learning_rate": 0.0059523590377493505, "loss": 2.6114, "step": 12419 }, { "crossentropy": 2.57082462310791, "epoch": 0.4502610208816705, "grad_norm": 0.02745155245065689, "grad_norm_var": 1.7661335463848664e-06, "learning_rate": 0.0059517885817653065, "loss": 2.6084, "step": 12420 }, { "crossentropy": 2.556891441345215, "epoch": 0.45029727378190254, "grad_norm": 0.027338270097970963, "grad_norm_var": 1.6372605884537017e-06, "learning_rate": 0.005951218112925914, "loss": 2.5764, "step": 12421 }, { "crossentropy": 2.6207430362701416, "epoch": 0.45033352668213456, "grad_norm": 0.026867667213082314, "grad_norm_var": 1.7004049855231494e-06, "learning_rate": 0.005950647631238877, "loss": 2.5494, "step": 12422 }, { "crossentropy": 2.761500120162964, "epoch": 0.45036977958236657, "grad_norm": 0.030041631311178207, "grad_norm_var": 1.9063393675250537e-06, "learning_rate": 0.0059500771367119, "loss": 2.6251, "step": 12423 }, { "crossentropy": 2.5549161434173584, "epoch": 0.4504060324825986, "grad_norm": 0.03285055607557297, "grad_norm_var": 2.8148354661472476e-06, "learning_rate": 0.005949506629352691, "loss": 2.6092, "step": 12424 }, { "crossentropy": 2.4309258460998535, "epoch": 0.45044228538283065, "grad_norm": 0.02948206290602684, "grad_norm_var": 2.8547926727322344e-06, "learning_rate": 0.0059489361091689535, "loss": 2.4925, "step": 12425 }, { "crossentropy": 2.6195850372314453, "epoch": 0.45047853828306267, "grad_norm": 0.028944460675120354, "grad_norm_var": 2.7746719009258748e-06, "learning_rate": 0.005948365576168393, "loss": 2.5859, "step": 12426 }, { "crossentropy": 2.6639151573181152, "epoch": 0.4505147911832947, "grad_norm": 0.027156081050634384, "grad_norm_var": 2.8879790661605227e-06, "learning_rate": 0.005947795030358717, "loss": 2.652, "step": 12427 }, { "crossentropy": 2.6201741695404053, "epoch": 0.4505510440835267, "grad_norm": 0.027015438303351402, "grad_norm_var": 2.9694583160928214e-06, "learning_rate": 0.005947224471747631, "loss": 2.6649, "step": 12428 }, { "crossentropy": 2.5386388301849365, "epoch": 0.4505872969837587, "grad_norm": 0.02702750265598297, "grad_norm_var": 3.0753975375069023e-06, "learning_rate": 0.005946653900342841, "loss": 2.5135, "step": 12429 }, { "crossentropy": 2.581782817840576, "epoch": 0.4506235498839907, "grad_norm": 0.0277729332447052, "grad_norm_var": 2.5984202913023933e-06, "learning_rate": 0.005946083316152054, "loss": 2.5985, "step": 12430 }, { "crossentropy": 2.5139482021331787, "epoch": 0.45065980278422274, "grad_norm": 0.03032623417675495, "grad_norm_var": 2.852639871533816e-06, "learning_rate": 0.005945512719182974, "loss": 2.5966, "step": 12431 }, { "crossentropy": 2.754840612411499, "epoch": 0.45069605568445475, "grad_norm": 0.029671616852283478, "grad_norm_var": 2.9771442506122075e-06, "learning_rate": 0.005944942109443312, "loss": 2.727, "step": 12432 }, { "crossentropy": 2.6198854446411133, "epoch": 0.45073230858468677, "grad_norm": 0.028268447145819664, "grad_norm_var": 2.8412178493554077e-06, "learning_rate": 0.005944371486940772, "loss": 2.5565, "step": 12433 }, { "crossentropy": 2.6184353828430176, "epoch": 0.4507685614849188, "grad_norm": 0.027374323457479477, "grad_norm_var": 2.751598829545063e-06, "learning_rate": 0.005943800851683061, "loss": 2.6368, "step": 12434 }, { "crossentropy": 2.7405967712402344, "epoch": 0.4508048143851508, "grad_norm": 0.0280932504683733, "grad_norm_var": 2.698263393430209e-06, "learning_rate": 0.005943230203677888, "loss": 2.6115, "step": 12435 }, { "crossentropy": 2.4349188804626465, "epoch": 0.4508410672853828, "grad_norm": 0.02790224179625511, "grad_norm_var": 2.6491494514513844e-06, "learning_rate": 0.00594265954293296, "loss": 2.5361, "step": 12436 }, { "crossentropy": 2.446570634841919, "epoch": 0.4508773201856148, "grad_norm": 0.027750929817557335, "grad_norm_var": 2.595416176650331e-06, "learning_rate": 0.0059420888694559845, "loss": 2.5659, "step": 12437 }, { "crossentropy": 2.5275399684906006, "epoch": 0.45091357308584684, "grad_norm": 0.048213329166173935, "grad_norm_var": 2.6329971289158238e-05, "learning_rate": 0.005941518183254669, "loss": 2.5443, "step": 12438 }, { "crossentropy": 2.5644187927246094, "epoch": 0.4509498259860789, "grad_norm": 0.030066678300499916, "grad_norm_var": 2.633058972318091e-05, "learning_rate": 0.00594094748433672, "loss": 2.528, "step": 12439 }, { "crossentropy": 2.685272216796875, "epoch": 0.4509860788863109, "grad_norm": 0.03312502056360245, "grad_norm_var": 2.6444381086928554e-05, "learning_rate": 0.00594037677270985, "loss": 2.6488, "step": 12440 }, { "crossentropy": 2.5457470417022705, "epoch": 0.45102233178654294, "grad_norm": 0.03232665732502937, "grad_norm_var": 2.6796563543711214e-05, "learning_rate": 0.005939806048381764, "loss": 2.5968, "step": 12441 }, { "crossentropy": 2.592634439468384, "epoch": 0.45105858468677495, "grad_norm": 0.02779513970017433, "grad_norm_var": 2.7050790292973644e-05, "learning_rate": 0.0059392353113601705, "loss": 2.5196, "step": 12442 }, { "crossentropy": 2.4900732040405273, "epoch": 0.45109483758700697, "grad_norm": 0.027657750993967056, "grad_norm_var": 2.687676933976248e-05, "learning_rate": 0.005938664561652777, "loss": 2.5346, "step": 12443 }, { "crossentropy": 2.729426622390747, "epoch": 0.451131090487239, "grad_norm": 0.03080953098833561, "grad_norm_var": 2.625438681908883e-05, "learning_rate": 0.005938093799267297, "loss": 2.6295, "step": 12444 }, { "crossentropy": 2.624872922897339, "epoch": 0.451167343387471, "grad_norm": 0.02894158475100994, "grad_norm_var": 2.565805568374133e-05, "learning_rate": 0.005937523024211435, "loss": 2.5604, "step": 12445 }, { "crossentropy": 2.6386353969573975, "epoch": 0.451203596287703, "grad_norm": 0.029029810801148415, "grad_norm_var": 2.531972356836955e-05, "learning_rate": 0.005936952236492903, "loss": 2.634, "step": 12446 }, { "crossentropy": 2.598660707473755, "epoch": 0.451239849187935, "grad_norm": 0.030353987589478493, "grad_norm_var": 2.531927843858004e-05, "learning_rate": 0.005936381436119407, "loss": 2.5486, "step": 12447 }, { "crossentropy": 2.506700277328491, "epoch": 0.45127610208816704, "grad_norm": 0.028732111677527428, "grad_norm_var": 2.5473362915591013e-05, "learning_rate": 0.005935810623098662, "loss": 2.5195, "step": 12448 }, { "crossentropy": 2.6701419353485107, "epoch": 0.45131235498839906, "grad_norm": 0.029958270490169525, "grad_norm_var": 2.5170997682923077e-05, "learning_rate": 0.005935239797438372, "loss": 2.6729, "step": 12449 }, { "crossentropy": 2.5416464805603027, "epoch": 0.45134860788863107, "grad_norm": 0.02890915796160698, "grad_norm_var": 2.4676906524791613e-05, "learning_rate": 0.00593466895914625, "loss": 2.5753, "step": 12450 }, { "crossentropy": 2.6082184314727783, "epoch": 0.4513848607888631, "grad_norm": 0.028812557458877563, "grad_norm_var": 2.4468435519395013e-05, "learning_rate": 0.005934098108230004, "loss": 2.5863, "step": 12451 }, { "crossentropy": 2.3976376056671143, "epoch": 0.45142111368909515, "grad_norm": 0.02721562795341015, "grad_norm_var": 2.474936640240264e-05, "learning_rate": 0.005933527244697347, "loss": 2.5052, "step": 12452 }, { "crossentropy": 2.5240817070007324, "epoch": 0.45145736658932717, "grad_norm": 0.02741786651313305, "grad_norm_var": 2.4883094769778158e-05, "learning_rate": 0.005932956368555988, "loss": 2.5653, "step": 12453 }, { "crossentropy": 2.6933951377868652, "epoch": 0.4514936194895592, "grad_norm": 0.026076214388012886, "grad_norm_var": 3.4802268421190665e-06, "learning_rate": 0.005932385479813638, "loss": 2.6376, "step": 12454 }, { "crossentropy": 2.58121919631958, "epoch": 0.4515298723897912, "grad_norm": 0.027727942913770676, "grad_norm_var": 3.5523696186432264e-06, "learning_rate": 0.005931814578478004, "loss": 2.5036, "step": 12455 }, { "crossentropy": 2.583104372024536, "epoch": 0.4515661252900232, "grad_norm": 0.02728101797401905, "grad_norm_var": 2.5159806838594924e-06, "learning_rate": 0.005931243664556802, "loss": 2.5723, "step": 12456 }, { "crossentropy": 2.651907444000244, "epoch": 0.4516023781902552, "grad_norm": 0.027546506375074387, "grad_norm_var": 1.6264680631468496e-06, "learning_rate": 0.005930672738057743, "loss": 2.6762, "step": 12457 }, { "crossentropy": 2.493905782699585, "epoch": 0.45163863109048724, "grad_norm": 0.02784062922000885, "grad_norm_var": 1.622979899499457e-06, "learning_rate": 0.005930101798988534, "loss": 2.5522, "step": 12458 }, { "crossentropy": 2.5120232105255127, "epoch": 0.45167488399071926, "grad_norm": 0.02650185488164425, "grad_norm_var": 1.8200194700991484e-06, "learning_rate": 0.005929530847356888, "loss": 2.6105, "step": 12459 }, { "crossentropy": 2.5154006481170654, "epoch": 0.45171113689095127, "grad_norm": 0.028803570196032524, "grad_norm_var": 1.4062379362441794e-06, "learning_rate": 0.0059289598831705195, "loss": 2.5666, "step": 12460 }, { "crossentropy": 2.6756436824798584, "epoch": 0.4517473897911833, "grad_norm": 0.029860341921448708, "grad_norm_var": 1.550232631742215e-06, "learning_rate": 0.005928388906437138, "loss": 2.6467, "step": 12461 }, { "crossentropy": 2.674586057662964, "epoch": 0.4517836426914153, "grad_norm": 0.027250394225120544, "grad_norm_var": 1.5641138577411241e-06, "learning_rate": 0.005927817917164454, "loss": 2.712, "step": 12462 }, { "crossentropy": 2.570725440979004, "epoch": 0.4518198955916473, "grad_norm": 0.027457496151328087, "grad_norm_var": 1.2345881392630445e-06, "learning_rate": 0.005927246915360181, "loss": 2.6411, "step": 12463 }, { "crossentropy": 2.6034390926361084, "epoch": 0.45185614849187933, "grad_norm": 0.027116557583212852, "grad_norm_var": 1.231820549270216e-06, "learning_rate": 0.005926675901032031, "loss": 2.5623, "step": 12464 }, { "crossentropy": 2.634448289871216, "epoch": 0.45189240139211134, "grad_norm": 0.027285633608698845, "grad_norm_var": 9.308917696121758e-07, "learning_rate": 0.0059261048741877185, "loss": 2.5491, "step": 12465 }, { "crossentropy": 2.3591501712799072, "epoch": 0.4519286542923434, "grad_norm": 0.02727389521896839, "grad_norm_var": 8.330664346596946e-07, "learning_rate": 0.005925533834834954, "loss": 2.4615, "step": 12466 }, { "crossentropy": 2.653263807296753, "epoch": 0.4519649071925754, "grad_norm": 0.027433257550001144, "grad_norm_var": 7.274573090823896e-07, "learning_rate": 0.005924962782981449, "loss": 2.6634, "step": 12467 }, { "crossentropy": 2.811753273010254, "epoch": 0.45200116009280744, "grad_norm": 0.027336416766047478, "grad_norm_var": 7.236999281412845e-07, "learning_rate": 0.005924391718634918, "loss": 2.6907, "step": 12468 }, { "crossentropy": 2.673915386199951, "epoch": 0.45203741299303946, "grad_norm": 0.026078330352902412, "grad_norm_var": 8.528563579533137e-07, "learning_rate": 0.005923820641803076, "loss": 2.5714, "step": 12469 }, { "crossentropy": 2.76987361907959, "epoch": 0.45207366589327147, "grad_norm": 0.029023226350545883, "grad_norm_var": 8.639558020990049e-07, "learning_rate": 0.0059232495524936335, "loss": 2.7207, "step": 12470 }, { "crossentropy": 2.7236342430114746, "epoch": 0.4521099187935035, "grad_norm": 0.029925895854830742, "grad_norm_var": 1.199412185960919e-06, "learning_rate": 0.0059226784507143025, "loss": 2.6683, "step": 12471 }, { "crossentropy": 2.579706907272339, "epoch": 0.4521461716937355, "grad_norm": 0.028735453262925148, "grad_norm_var": 1.240494262350862e-06, "learning_rate": 0.0059221073364728, "loss": 2.5502, "step": 12472 }, { "crossentropy": 2.668299913406372, "epoch": 0.4521824245939675, "grad_norm": 0.028066866099834442, "grad_norm_var": 1.236926938070791e-06, "learning_rate": 0.00592153620977684, "loss": 2.6956, "step": 12473 }, { "crossentropy": 2.6537020206451416, "epoch": 0.45221867749419953, "grad_norm": 0.027292808517813683, "grad_norm_var": 1.2581477179147982e-06, "learning_rate": 0.005920965070634136, "loss": 2.5979, "step": 12474 }, { "crossentropy": 2.5468599796295166, "epoch": 0.45225493039443154, "grad_norm": 0.027020107954740524, "grad_norm_var": 1.1824593511707785e-06, "learning_rate": 0.005920393919052397, "loss": 2.5255, "step": 12475 }, { "crossentropy": 2.6418747901916504, "epoch": 0.45229118329466356, "grad_norm": 0.027073275297880173, "grad_norm_var": 1.154779538389307e-06, "learning_rate": 0.0059198227550393435, "loss": 2.5625, "step": 12476 }, { "crossentropy": 2.5144810676574707, "epoch": 0.45232743619489557, "grad_norm": 0.02671189419925213, "grad_norm_var": 8.94451290013587e-07, "learning_rate": 0.005919251578602688, "loss": 2.5941, "step": 12477 }, { "crossentropy": 2.529721736907959, "epoch": 0.4523636890951276, "grad_norm": 0.028132745996117592, "grad_norm_var": 9.057927226953453e-07, "learning_rate": 0.005918680389750146, "loss": 2.5935, "step": 12478 }, { "crossentropy": 2.642740249633789, "epoch": 0.45239994199535966, "grad_norm": 0.026599694043397903, "grad_norm_var": 9.706814358830898e-07, "learning_rate": 0.005918109188489429, "loss": 2.6081, "step": 12479 }, { "crossentropy": 2.603787899017334, "epoch": 0.45243619489559167, "grad_norm": 0.027087045833468437, "grad_norm_var": 9.72516691782425e-07, "learning_rate": 0.005917537974828255, "loss": 2.6151, "step": 12480 }, { "crossentropy": 2.4586355686187744, "epoch": 0.4524724477958237, "grad_norm": 0.027746854349970818, "grad_norm_var": 9.68491566068208e-07, "learning_rate": 0.005916966748774339, "loss": 2.5056, "step": 12481 }, { "crossentropy": 2.482562780380249, "epoch": 0.4525087006960557, "grad_norm": 0.026699410751461983, "grad_norm_var": 1.0137996128187221e-06, "learning_rate": 0.005916395510335396, "loss": 2.4957, "step": 12482 }, { "crossentropy": 2.62150239944458, "epoch": 0.4525449535962877, "grad_norm": 0.03633899614214897, "grad_norm_var": 5.82006917809047e-06, "learning_rate": 0.005915824259519138, "loss": 2.5824, "step": 12483 }, { "crossentropy": 2.5392234325408936, "epoch": 0.45258120649651973, "grad_norm": 0.028132231906056404, "grad_norm_var": 5.7768448768905425e-06, "learning_rate": 0.005915252996333286, "loss": 2.6409, "step": 12484 }, { "crossentropy": 2.602120876312256, "epoch": 0.45261745939675174, "grad_norm": 0.028771569952368736, "grad_norm_var": 5.480313490218575e-06, "learning_rate": 0.005914681720785553, "loss": 2.4998, "step": 12485 }, { "crossentropy": 2.6386706829071045, "epoch": 0.45265371229698376, "grad_norm": 0.03164973482489586, "grad_norm_var": 6.1525324091420145e-06, "learning_rate": 0.005914110432883656, "loss": 2.5922, "step": 12486 }, { "crossentropy": 2.672985076904297, "epoch": 0.45268996519721577, "grad_norm": 0.026859737932682037, "grad_norm_var": 6.156785060973377e-06, "learning_rate": 0.005913539132635308, "loss": 2.6642, "step": 12487 }, { "crossentropy": 2.398198127746582, "epoch": 0.4527262180974478, "grad_norm": 0.02600793167948723, "grad_norm_var": 6.466076615269043e-06, "learning_rate": 0.00591296782004823, "loss": 2.4805, "step": 12488 }, { "crossentropy": 2.6500113010406494, "epoch": 0.4527624709976798, "grad_norm": 0.02823643572628498, "grad_norm_var": 6.466289600816109e-06, "learning_rate": 0.005912396495130134, "loss": 2.6518, "step": 12489 }, { "crossentropy": 2.540924549102783, "epoch": 0.4527987238979118, "grad_norm": 0.030179698020219803, "grad_norm_var": 6.6581747093609365e-06, "learning_rate": 0.005911825157888742, "loss": 2.648, "step": 12490 }, { "crossentropy": 2.6524903774261475, "epoch": 0.45283497679814383, "grad_norm": 0.026976004242897034, "grad_norm_var": 6.665987099441217e-06, "learning_rate": 0.0059112538083317635, "loss": 2.6836, "step": 12491 }, { "crossentropy": 2.6168808937072754, "epoch": 0.45287122969837584, "grad_norm": 0.028369255363941193, "grad_norm_var": 6.554629951385724e-06, "learning_rate": 0.005910682446466922, "loss": 2.597, "step": 12492 }, { "crossentropy": 2.5416955947875977, "epoch": 0.4529074825986079, "grad_norm": 0.02709115669131279, "grad_norm_var": 6.477941605576585e-06, "learning_rate": 0.005910111072301931, "loss": 2.665, "step": 12493 }, { "crossentropy": 2.636587619781494, "epoch": 0.45294373549883993, "grad_norm": 0.030345896258950233, "grad_norm_var": 6.696380632994223e-06, "learning_rate": 0.00590953968584451, "loss": 2.5458, "step": 12494 }, { "crossentropy": 2.619220733642578, "epoch": 0.45297998839907194, "grad_norm": 0.032338328659534454, "grad_norm_var": 7.2483995322165325e-06, "learning_rate": 0.0059089682871023745, "loss": 2.6608, "step": 12495 }, { "crossentropy": 2.55200457572937, "epoch": 0.45301624129930396, "grad_norm": 0.02716037631034851, "grad_norm_var": 7.2307467009244395e-06, "learning_rate": 0.005908396876083242, "loss": 2.5807, "step": 12496 }, { "crossentropy": 2.7662885189056396, "epoch": 0.45305249419953597, "grad_norm": 0.02778407745063305, "grad_norm_var": 7.22495392538001e-06, "learning_rate": 0.005907825452794833, "loss": 2.6834, "step": 12497 }, { "crossentropy": 2.516663074493408, "epoch": 0.453088747099768, "grad_norm": 0.026463700458407402, "grad_norm_var": 7.298648939208357e-06, "learning_rate": 0.005907254017244862, "loss": 2.5033, "step": 12498 }, { "crossentropy": 2.5995678901672363, "epoch": 0.453125, "grad_norm": 0.027974359691143036, "grad_norm_var": 3.396264894387167e-06, "learning_rate": 0.005906682569441049, "loss": 2.6568, "step": 12499 }, { "crossentropy": 2.523470163345337, "epoch": 0.453161252900232, "grad_norm": 0.02910235896706581, "grad_norm_var": 3.4209317353977766e-06, "learning_rate": 0.005906111109391113, "loss": 2.5614, "step": 12500 }, { "crossentropy": 2.7432165145874023, "epoch": 0.45319750580046403, "grad_norm": 0.02911658212542534, "grad_norm_var": 3.442846012477463e-06, "learning_rate": 0.00590553963710277, "loss": 2.6459, "step": 12501 }, { "crossentropy": 2.579519033432007, "epoch": 0.45323375870069604, "grad_norm": 0.027056001126766205, "grad_norm_var": 2.8193569103014567e-06, "learning_rate": 0.005904968152583741, "loss": 2.6279, "step": 12502 }, { "crossentropy": 2.4971277713775635, "epoch": 0.45327001160092806, "grad_norm": 0.026274975389242172, "grad_norm_var": 2.9445536589049285e-06, "learning_rate": 0.005904396655841743, "loss": 2.5625, "step": 12503 }, { "crossentropy": 2.489861249923706, "epoch": 0.45330626450116007, "grad_norm": 0.027702149003744125, "grad_norm_var": 2.6389789557388724e-06, "learning_rate": 0.0059038251468844965, "loss": 2.5363, "step": 12504 }, { "crossentropy": 2.5194339752197266, "epoch": 0.4533425174013921, "grad_norm": 0.026934359222650528, "grad_norm_var": 2.749155869616775e-06, "learning_rate": 0.005903253625719719, "loss": 2.5269, "step": 12505 }, { "crossentropy": 2.600706100463867, "epoch": 0.45337877030162416, "grad_norm": 0.026529841125011444, "grad_norm_var": 2.6082725301740006e-06, "learning_rate": 0.005902682092355132, "loss": 2.6191, "step": 12506 }, { "crossentropy": 2.594316244125366, "epoch": 0.45341502320185617, "grad_norm": 0.026523472741246223, "grad_norm_var": 2.679913321900161e-06, "learning_rate": 0.0059021105467984515, "loss": 2.6007, "step": 12507 }, { "crossentropy": 2.4142284393310547, "epoch": 0.4534512761020882, "grad_norm": 0.02696068212389946, "grad_norm_var": 2.720094095071529e-06, "learning_rate": 0.0059015389890574, "loss": 2.4711, "step": 12508 }, { "crossentropy": 2.560671091079712, "epoch": 0.4534875290023202, "grad_norm": 0.02749945968389511, "grad_norm_var": 2.6900241521787363e-06, "learning_rate": 0.005900967419139697, "loss": 2.6133, "step": 12509 }, { "crossentropy": 2.6021461486816406, "epoch": 0.4535237819025522, "grad_norm": 0.0273171067237854, "grad_norm_var": 2.259638250336653e-06, "learning_rate": 0.005900395837053061, "loss": 2.5997, "step": 12510 }, { "crossentropy": 2.758028745651245, "epoch": 0.45356003480278423, "grad_norm": 0.02592656761407852, "grad_norm_var": 8.390474093497435e-07, "learning_rate": 0.005899824242805211, "loss": 2.6871, "step": 12511 }, { "crossentropy": 2.501575469970703, "epoch": 0.45359628770301624, "grad_norm": 0.029369765892624855, "grad_norm_var": 1.111729775868144e-06, "learning_rate": 0.005899252636403872, "loss": 2.544, "step": 12512 }, { "crossentropy": 2.569197654724121, "epoch": 0.45363254060324826, "grad_norm": 0.02741163969039917, "grad_norm_var": 1.1017469024088838e-06, "learning_rate": 0.005898681017856761, "loss": 2.5662, "step": 12513 }, { "crossentropy": 2.5864219665527344, "epoch": 0.45366879350348027, "grad_norm": 0.030092772096395493, "grad_norm_var": 1.4789956444135702e-06, "learning_rate": 0.005898109387171597, "loss": 2.6154, "step": 12514 }, { "crossentropy": 2.6604692935943604, "epoch": 0.4537050464037123, "grad_norm": 0.0330253466963768, "grad_norm_var": 3.317557624180299e-06, "learning_rate": 0.005897537744356105, "loss": 2.633, "step": 12515 }, { "crossentropy": 2.543844223022461, "epoch": 0.4537412993039443, "grad_norm": 0.032992053776979446, "grad_norm_var": 4.8723780154832455e-06, "learning_rate": 0.005896966089418003, "loss": 2.5686, "step": 12516 }, { "crossentropy": 2.342607259750366, "epoch": 0.4537775522041763, "grad_norm": 0.02791442722082138, "grad_norm_var": 4.811104408507643e-06, "learning_rate": 0.005896394422365012, "loss": 2.4397, "step": 12517 }, { "crossentropy": 2.5576817989349365, "epoch": 0.45381380510440833, "grad_norm": 0.027949882671236992, "grad_norm_var": 4.7371320568417485e-06, "learning_rate": 0.005895822743204855, "loss": 2.6255, "step": 12518 }, { "crossentropy": 2.6855854988098145, "epoch": 0.45385005800464034, "grad_norm": 0.027890153229236603, "grad_norm_var": 4.4960524746829315e-06, "learning_rate": 0.005895251051945253, "loss": 2.6394, "step": 12519 }, { "crossentropy": 2.714745283126831, "epoch": 0.4538863109048724, "grad_norm": 0.026533588767051697, "grad_norm_var": 4.667144273340695e-06, "learning_rate": 0.005894679348593925, "loss": 2.6166, "step": 12520 }, { "crossentropy": 2.4105849266052246, "epoch": 0.45392256380510443, "grad_norm": 0.043452188372612, "grad_norm_var": 1.897741330927429e-05, "learning_rate": 0.005894107633158597, "loss": 2.4816, "step": 12521 }, { "crossentropy": 2.564866542816162, "epoch": 0.45395881670533644, "grad_norm": 0.028007080778479576, "grad_norm_var": 1.8585548494336134e-05, "learning_rate": 0.005893535905646987, "loss": 2.6145, "step": 12522 }, { "crossentropy": 2.6131832599639893, "epoch": 0.45399506960556846, "grad_norm": 0.0278412364423275, "grad_norm_var": 1.8205512070625082e-05, "learning_rate": 0.005892964166066819, "loss": 2.5602, "step": 12523 }, { "crossentropy": 2.49631667137146, "epoch": 0.45403132250580047, "grad_norm": 0.029359737411141396, "grad_norm_var": 1.7789273516388183e-05, "learning_rate": 0.005892392414425815, "loss": 2.4944, "step": 12524 }, { "crossentropy": 2.592334032058716, "epoch": 0.4540675754060325, "grad_norm": 0.027103571221232414, "grad_norm_var": 1.790659115486295e-05, "learning_rate": 0.005891820650731697, "loss": 2.6605, "step": 12525 }, { "crossentropy": 2.6467554569244385, "epoch": 0.4541038283062645, "grad_norm": 0.03291682153940201, "grad_norm_var": 1.822784913138504e-05, "learning_rate": 0.005891248874992186, "loss": 2.5619, "step": 12526 }, { "crossentropy": 2.6999995708465576, "epoch": 0.4541400812064965, "grad_norm": 0.032457347959280014, "grad_norm_var": 1.7466964294406127e-05, "learning_rate": 0.00589067708721501, "loss": 2.7032, "step": 12527 }, { "crossentropy": 2.603257179260254, "epoch": 0.45417633410672853, "grad_norm": 0.029986068606376648, "grad_norm_var": 1.7416740301330092e-05, "learning_rate": 0.005890105287407885, "loss": 2.5909, "step": 12528 }, { "crossentropy": 2.6850500106811523, "epoch": 0.45421258700696054, "grad_norm": 0.026328690350055695, "grad_norm_var": 1.7908307264558458e-05, "learning_rate": 0.005889533475578539, "loss": 2.6471, "step": 12529 }, { "crossentropy": 2.6608290672302246, "epoch": 0.45424883990719256, "grad_norm": 0.027920395135879517, "grad_norm_var": 1.8246101758651972e-05, "learning_rate": 0.005888961651734693, "loss": 2.6886, "step": 12530 }, { "crossentropy": 2.5717337131500244, "epoch": 0.4542850928074246, "grad_norm": 0.030714616179466248, "grad_norm_var": 1.7680040600279862e-05, "learning_rate": 0.005888389815884071, "loss": 2.5804, "step": 12531 }, { "crossentropy": 2.4825031757354736, "epoch": 0.4543213457076566, "grad_norm": 0.03278660029172897, "grad_norm_var": 1.7599632784023356e-05, "learning_rate": 0.005887817968034397, "loss": 2.538, "step": 12532 }, { "crossentropy": 2.596572160720825, "epoch": 0.45435759860788866, "grad_norm": 0.02772214636206627, "grad_norm_var": 1.7654070182396286e-05, "learning_rate": 0.005887246108193394, "loss": 2.6303, "step": 12533 }, { "crossentropy": 2.607928991317749, "epoch": 0.45439385150812067, "grad_norm": 0.027738790959119797, "grad_norm_var": 1.7712745217841872e-05, "learning_rate": 0.0058866742363687845, "loss": 2.5843, "step": 12534 }, { "crossentropy": 2.714467763900757, "epoch": 0.4544301044083527, "grad_norm": 0.028327960520982742, "grad_norm_var": 1.7606091613761225e-05, "learning_rate": 0.005886102352568295, "loss": 2.8169, "step": 12535 }, { "crossentropy": 2.622018337249756, "epoch": 0.4544663573085847, "grad_norm": 0.02958494983613491, "grad_norm_var": 1.679813665253216e-05, "learning_rate": 0.005885530456799646, "loss": 2.6652, "step": 12536 }, { "crossentropy": 2.6149377822875977, "epoch": 0.4545026102088167, "grad_norm": 0.030408991500735283, "grad_norm_var": 4.280707192684053e-06, "learning_rate": 0.0058849585490705686, "loss": 2.5009, "step": 12537 }, { "crossentropy": 2.737365484237671, "epoch": 0.45453886310904873, "grad_norm": 0.026740988716483116, "grad_norm_var": 4.603427761573153e-06, "learning_rate": 0.005884386629388781, "loss": 2.6155, "step": 12538 }, { "crossentropy": 2.5975253582000732, "epoch": 0.45457511600928074, "grad_norm": 0.027713593095541, "grad_norm_var": 4.628356991241223e-06, "learning_rate": 0.005883814697762008, "loss": 2.621, "step": 12539 }, { "crossentropy": 2.676006317138672, "epoch": 0.45461136890951276, "grad_norm": 0.026235828176140785, "grad_norm_var": 5.187661449978587e-06, "learning_rate": 0.005883242754197979, "loss": 2.6832, "step": 12540 }, { "crossentropy": 2.514791250228882, "epoch": 0.4546476218097448, "grad_norm": 0.026678310707211494, "grad_norm_var": 5.308930420650628e-06, "learning_rate": 0.0058826707987044155, "loss": 2.5251, "step": 12541 }, { "crossentropy": 2.641728639602661, "epoch": 0.4546838747099768, "grad_norm": 0.030235012993216515, "grad_norm_var": 4.363738606366167e-06, "learning_rate": 0.005882098831289043, "loss": 2.6352, "step": 12542 }, { "crossentropy": 2.7162137031555176, "epoch": 0.4547201276102088, "grad_norm": 0.026849966496229172, "grad_norm_var": 3.6309512780908584e-06, "learning_rate": 0.005881526851959587, "loss": 2.6537, "step": 12543 }, { "crossentropy": 2.502727508544922, "epoch": 0.4547563805104408, "grad_norm": 0.03061252273619175, "grad_norm_var": 3.779747674457915e-06, "learning_rate": 0.0058809548607237725, "loss": 2.4977, "step": 12544 }, { "crossentropy": 2.69122576713562, "epoch": 0.45479263341067283, "grad_norm": 0.03121032752096653, "grad_norm_var": 3.831491414500211e-06, "learning_rate": 0.0058803828575893285, "loss": 2.6592, "step": 12545 }, { "crossentropy": 2.531597375869751, "epoch": 0.45482888631090485, "grad_norm": 0.03324691951274872, "grad_norm_var": 4.94980647180217e-06, "learning_rate": 0.005879810842563975, "loss": 2.4849, "step": 12546 }, { "crossentropy": 2.5987486839294434, "epoch": 0.4548651392111369, "grad_norm": 0.03365958109498024, "grad_norm_var": 6.096221681132589e-06, "learning_rate": 0.005879238815655441, "loss": 2.5996, "step": 12547 }, { "crossentropy": 2.619814872741699, "epoch": 0.45490139211136893, "grad_norm": 0.028420880436897278, "grad_norm_var": 5.29255758735687e-06, "learning_rate": 0.005878666776871454, "loss": 2.6669, "step": 12548 }, { "crossentropy": 2.3494439125061035, "epoch": 0.45493764501160094, "grad_norm": 0.02633127011358738, "grad_norm_var": 5.666517856563986e-06, "learning_rate": 0.005878094726219739, "loss": 2.4628, "step": 12549 }, { "crossentropy": 2.564814329147339, "epoch": 0.45497389791183296, "grad_norm": 0.02756391651928425, "grad_norm_var": 5.697830287013499e-06, "learning_rate": 0.005877522663708021, "loss": 2.6637, "step": 12550 }, { "crossentropy": 2.652864694595337, "epoch": 0.455010150812065, "grad_norm": 0.02647978439927101, "grad_norm_var": 6.074164727095227e-06, "learning_rate": 0.005876950589344028, "loss": 2.6246, "step": 12551 }, { "crossentropy": 2.65472412109375, "epoch": 0.455046403712297, "grad_norm": 0.028142252936959267, "grad_norm_var": 6.0673584825268745e-06, "learning_rate": 0.005876378503135486, "loss": 2.6103, "step": 12552 }, { "crossentropy": 2.734639883041382, "epoch": 0.455082656612529, "grad_norm": 0.026856757700443268, "grad_norm_var": 6.0859495849800806e-06, "learning_rate": 0.005875806405090124, "loss": 2.6763, "step": 12553 }, { "crossentropy": 2.739710807800293, "epoch": 0.455118909512761, "grad_norm": 0.02623836137354374, "grad_norm_var": 6.223718896792421e-06, "learning_rate": 0.005875234295215667, "loss": 2.6475, "step": 12554 }, { "crossentropy": 2.642457962036133, "epoch": 0.45515516241299303, "grad_norm": 0.02650795504450798, "grad_norm_var": 6.445758058367746e-06, "learning_rate": 0.00587466217351984, "loss": 2.5958, "step": 12555 }, { "crossentropy": 2.6494016647338867, "epoch": 0.45519141531322505, "grad_norm": 0.02643270418047905, "grad_norm_var": 6.389943995112798e-06, "learning_rate": 0.0058740900400103754, "loss": 2.6324, "step": 12556 }, { "crossentropy": 2.6244468688964844, "epoch": 0.45522766821345706, "grad_norm": 0.03808191791176796, "grad_norm_var": 1.1798437467520198e-05, "learning_rate": 0.005873517894694999, "loss": 2.5591, "step": 12557 }, { "crossentropy": 2.466874361038208, "epoch": 0.4552639211136891, "grad_norm": 0.02874491550028324, "grad_norm_var": 1.1727479698885379e-05, "learning_rate": 0.005872945737581437, "loss": 2.5535, "step": 12558 }, { "crossentropy": 2.611562490463257, "epoch": 0.4553001740139211, "grad_norm": 0.03239603713154793, "grad_norm_var": 1.1996231200196009e-05, "learning_rate": 0.005872373568677417, "loss": 2.5744, "step": 12559 }, { "crossentropy": 2.6184277534484863, "epoch": 0.45533642691415316, "grad_norm": 0.031669970601797104, "grad_norm_var": 1.223243966279458e-05, "learning_rate": 0.0058718013879906685, "loss": 2.6339, "step": 12560 }, { "crossentropy": 2.514291286468506, "epoch": 0.4553726798143852, "grad_norm": 0.03122648037970066, "grad_norm_var": 1.2236141741171769e-05, "learning_rate": 0.00587122919552892, "loss": 2.5741, "step": 12561 }, { "crossentropy": 2.652111768722534, "epoch": 0.4554089327146172, "grad_norm": 0.030205639079213142, "grad_norm_var": 1.12948298611054e-05, "learning_rate": 0.0058706569912999, "loss": 2.5725, "step": 12562 }, { "crossentropy": 2.7322068214416504, "epoch": 0.4554451856148492, "grad_norm": 0.0274959709495306, "grad_norm_var": 1.0094579937627799e-05, "learning_rate": 0.005870084775311333, "loss": 2.6516, "step": 12563 }, { "crossentropy": 2.7212204933166504, "epoch": 0.4554814385150812, "grad_norm": 0.03093814291059971, "grad_norm_var": 1.032152669969189e-05, "learning_rate": 0.005869512547570953, "loss": 2.6247, "step": 12564 }, { "crossentropy": 2.611915111541748, "epoch": 0.45551769141531323, "grad_norm": 0.028922466561198235, "grad_norm_var": 9.790811197497385e-06, "learning_rate": 0.005868940308086486, "loss": 2.5947, "step": 12565 }, { "crossentropy": 2.606194496154785, "epoch": 0.45555394431554525, "grad_norm": 0.026749463751912117, "grad_norm_var": 1.0014711081757439e-05, "learning_rate": 0.005868368056865663, "loss": 2.5769, "step": 12566 }, { "crossentropy": 2.6120033264160156, "epoch": 0.45559019721577726, "grad_norm": 0.02639986015856266, "grad_norm_var": 1.0044024430515969e-05, "learning_rate": 0.005867795793916208, "loss": 2.6413, "step": 12567 }, { "crossentropy": 2.5146331787109375, "epoch": 0.4556264501160093, "grad_norm": 0.07799109816551208, "grad_norm_var": 0.00015839979168560137, "learning_rate": 0.005867223519245857, "loss": 2.5097, "step": 12568 }, { "crossentropy": 2.5274534225463867, "epoch": 0.4556627030162413, "grad_norm": 0.028511298820376396, "grad_norm_var": 0.00015736928060638462, "learning_rate": 0.005866651232862334, "loss": 2.5753, "step": 12569 }, { "crossentropy": 2.8176143169403076, "epoch": 0.4556989559164733, "grad_norm": 0.030092934146523476, "grad_norm_var": 0.00015512755086918707, "learning_rate": 0.005866078934773374, "loss": 2.6505, "step": 12570 }, { "crossentropy": 2.5351006984710693, "epoch": 0.4557352088167053, "grad_norm": 0.030773736536502838, "grad_norm_var": 0.00015277261861085475, "learning_rate": 0.005865506624986701, "loss": 2.5869, "step": 12571 }, { "crossentropy": 2.517199993133545, "epoch": 0.45577146171693733, "grad_norm": 0.02872251532971859, "grad_norm_var": 0.00015112136354228123, "learning_rate": 0.005864934303510049, "loss": 2.6066, "step": 12572 }, { "crossentropy": 2.6184887886047363, "epoch": 0.45580771461716935, "grad_norm": 0.040819842368364334, "grad_norm_var": 0.00015342401897650997, "learning_rate": 0.005864361970351145, "loss": 2.6401, "step": 12573 }, { "crossentropy": 2.5780727863311768, "epoch": 0.4558439675174014, "grad_norm": 0.030719423666596413, "grad_norm_var": 0.00015248723146046107, "learning_rate": 0.005863789625517724, "loss": 2.5718, "step": 12574 }, { "crossentropy": 2.4757816791534424, "epoch": 0.45588022041763343, "grad_norm": 0.0306086502969265, "grad_norm_var": 0.00015291476973981522, "learning_rate": 0.00586321726901751, "loss": 2.5246, "step": 12575 }, { "crossentropy": 2.6321473121643066, "epoch": 0.45591647331786544, "grad_norm": 0.02814517542719841, "grad_norm_var": 0.0001544293721086366, "learning_rate": 0.005862644900858238, "loss": 2.6337, "step": 12576 }, { "crossentropy": 2.6828441619873047, "epoch": 0.45595272621809746, "grad_norm": 0.030070126056671143, "grad_norm_var": 0.00015478949620899505, "learning_rate": 0.005862072521047638, "loss": 2.6825, "step": 12577 }, { "crossentropy": 2.7548892498016357, "epoch": 0.4559889791183295, "grad_norm": 0.029495058581233025, "grad_norm_var": 0.00015508086661974028, "learning_rate": 0.005861500129593439, "loss": 2.7094, "step": 12578 }, { "crossentropy": 2.5976436138153076, "epoch": 0.4560252320185615, "grad_norm": 0.03234824538230896, "grad_norm_var": 0.00015305390296094232, "learning_rate": 0.005860927726503374, "loss": 2.6254, "step": 12579 }, { "crossentropy": 2.5981717109680176, "epoch": 0.4560614849187935, "grad_norm": 0.02840082347393036, "grad_norm_var": 0.000154223768919659, "learning_rate": 0.005860355311785175, "loss": 2.574, "step": 12580 }, { "crossentropy": 2.481395721435547, "epoch": 0.4560977378190255, "grad_norm": 0.026547789573669434, "grad_norm_var": 0.00015588250713755822, "learning_rate": 0.005859782885446569, "loss": 2.5099, "step": 12581 }, { "crossentropy": 2.750596046447754, "epoch": 0.45613399071925753, "grad_norm": 0.02805987186729908, "grad_norm_var": 0.00015491524510290975, "learning_rate": 0.0058592104474952935, "loss": 2.6623, "step": 12582 }, { "crossentropy": 2.660083770751953, "epoch": 0.45617024361948955, "grad_norm": 0.026825018227100372, "grad_norm_var": 0.0001545534355802055, "learning_rate": 0.005858637997939076, "loss": 2.6974, "step": 12583 }, { "crossentropy": 2.633709192276001, "epoch": 0.45620649651972156, "grad_norm": 0.03023648075759411, "grad_norm_var": 1.0666253410964568e-05, "learning_rate": 0.005858065536785648, "loss": 2.5568, "step": 12584 }, { "crossentropy": 2.433742046356201, "epoch": 0.4562427494199536, "grad_norm": 0.02816143073141575, "grad_norm_var": 1.0744449570427199e-05, "learning_rate": 0.005857493064042743, "loss": 2.4834, "step": 12585 }, { "crossentropy": 2.6969401836395264, "epoch": 0.4562790023201856, "grad_norm": 0.04100089892745018, "grad_norm_var": 1.831362811529948e-05, "learning_rate": 0.0058569205797180945, "loss": 2.6677, "step": 12586 }, { "crossentropy": 2.5604231357574463, "epoch": 0.45631525522041766, "grad_norm": 0.027437573298811913, "grad_norm_var": 1.896908763467397e-05, "learning_rate": 0.005856348083819431, "loss": 2.5647, "step": 12587 }, { "crossentropy": 2.438170909881592, "epoch": 0.4563515081206497, "grad_norm": 0.026854485273361206, "grad_norm_var": 1.962365944068109e-05, "learning_rate": 0.00585577557635449, "loss": 2.5612, "step": 12588 }, { "crossentropy": 2.821359395980835, "epoch": 0.4563877610208817, "grad_norm": 0.029700998216867447, "grad_norm_var": 1.1840908391690802e-05, "learning_rate": 0.005855203057330999, "loss": 2.6288, "step": 12589 }, { "crossentropy": 2.517563581466675, "epoch": 0.4564240139211137, "grad_norm": 0.027796585112810135, "grad_norm_var": 1.1963242831078862e-05, "learning_rate": 0.0058546305267566925, "loss": 2.6206, "step": 12590 }, { "crossentropy": 2.5051493644714355, "epoch": 0.4564602668213457, "grad_norm": 0.027000699192285538, "grad_norm_var": 1.2234153047814656e-05, "learning_rate": 0.005854057984639305, "loss": 2.5144, "step": 12591 }, { "crossentropy": 2.542264223098755, "epoch": 0.45649651972157773, "grad_norm": 0.028430161997675896, "grad_norm_var": 1.2197054792791446e-05, "learning_rate": 0.005853485430986568, "loss": 2.5015, "step": 12592 }, { "crossentropy": 2.642002582550049, "epoch": 0.45653277262180975, "grad_norm": 0.02712130732834339, "grad_norm_var": 1.242707175135072e-05, "learning_rate": 0.005852912865806215, "loss": 2.6052, "step": 12593 }, { "crossentropy": 2.7309772968292236, "epoch": 0.45656902552204176, "grad_norm": 0.028597060590982437, "grad_norm_var": 1.2428803956317641e-05, "learning_rate": 0.00585234028910598, "loss": 2.6255, "step": 12594 }, { "crossentropy": 2.4696247577667236, "epoch": 0.4566052784222738, "grad_norm": 0.03079099766910076, "grad_norm_var": 1.1891902081060978e-05, "learning_rate": 0.005851767700893594, "loss": 2.5067, "step": 12595 }, { "crossentropy": 2.4953696727752686, "epoch": 0.4566415313225058, "grad_norm": 0.02818114683032036, "grad_norm_var": 1.1910568337698116e-05, "learning_rate": 0.005851195101176795, "loss": 2.6728, "step": 12596 }, { "crossentropy": 2.5362906455993652, "epoch": 0.4566777842227378, "grad_norm": 0.027103006839752197, "grad_norm_var": 1.1754118555194051e-05, "learning_rate": 0.005850622489963313, "loss": 2.5577, "step": 12597 }, { "crossentropy": 2.431964874267578, "epoch": 0.4567140371229698, "grad_norm": 0.027634764090180397, "grad_norm_var": 1.181621291922763e-05, "learning_rate": 0.005850049867260884, "loss": 2.5342, "step": 12598 }, { "crossentropy": 2.5973105430603027, "epoch": 0.45675029002320183, "grad_norm": 0.028055522590875626, "grad_norm_var": 1.156556388985512e-05, "learning_rate": 0.00584947723307724, "loss": 2.6639, "step": 12599 }, { "crossentropy": 2.524911403656006, "epoch": 0.45678654292343385, "grad_norm": 0.0264270082116127, "grad_norm_var": 1.1847797279853669e-05, "learning_rate": 0.005848904587420119, "loss": 2.5082, "step": 12600 }, { "crossentropy": 2.6007723808288574, "epoch": 0.4568227958236659, "grad_norm": 0.027377856895327568, "grad_norm_var": 1.1949580641330837e-05, "learning_rate": 0.0058483319302972524, "loss": 2.6614, "step": 12601 }, { "crossentropy": 2.656872272491455, "epoch": 0.45685904872389793, "grad_norm": 0.03026055544614792, "grad_norm_var": 1.5715685846609886e-06, "learning_rate": 0.005847759261716375, "loss": 2.6085, "step": 12602 }, { "crossentropy": 2.6183900833129883, "epoch": 0.45689530162412995, "grad_norm": 0.02702692709863186, "grad_norm_var": 1.6155364846235156e-06, "learning_rate": 0.005847186581685223, "loss": 2.585, "step": 12603 }, { "crossentropy": 2.5759432315826416, "epoch": 0.45693155452436196, "grad_norm": 0.027751579880714417, "grad_norm_var": 1.5261327129473604e-06, "learning_rate": 0.005846613890211531, "loss": 2.5898, "step": 12604 }, { "crossentropy": 2.57004976272583, "epoch": 0.456967807424594, "grad_norm": 0.028670787811279297, "grad_norm_var": 1.3695989773652995e-06, "learning_rate": 0.005846041187303033, "loss": 2.6162, "step": 12605 }, { "crossentropy": 2.5224928855895996, "epoch": 0.457004060324826, "grad_norm": 0.02645883709192276, "grad_norm_var": 1.5202485336197774e-06, "learning_rate": 0.005845468472967466, "loss": 2.618, "step": 12606 }, { "crossentropy": 2.5278468132019043, "epoch": 0.457040313225058, "grad_norm": 0.07848155498504639, "grad_norm_var": 0.0001607803025710677, "learning_rate": 0.005844895747212562, "loss": 2.581, "step": 12607 }, { "crossentropy": 2.5710301399230957, "epoch": 0.45707656612529, "grad_norm": 0.02833990380167961, "grad_norm_var": 0.00016081352015945142, "learning_rate": 0.00584432301004606, "loss": 2.5889, "step": 12608 }, { "crossentropy": 2.601309061050415, "epoch": 0.45711281902552203, "grad_norm": 0.03149218112230301, "grad_norm_var": 0.00015966411342822057, "learning_rate": 0.005843750261475694, "loss": 2.5708, "step": 12609 }, { "crossentropy": 2.6456298828125, "epoch": 0.45714907192575405, "grad_norm": 0.032871097326278687, "grad_norm_var": 0.0001591996166013611, "learning_rate": 0.005843177501509201, "loss": 2.5669, "step": 12610 }, { "crossentropy": 2.6267809867858887, "epoch": 0.45718532482598606, "grad_norm": 0.029533063992857933, "grad_norm_var": 0.00015944808227272566, "learning_rate": 0.005842604730154316, "loss": 2.5967, "step": 12611 }, { "crossentropy": 2.6409664154052734, "epoch": 0.4572215777262181, "grad_norm": 0.028119798749685287, "grad_norm_var": 0.00015947631647704187, "learning_rate": 0.0058420319474187755, "loss": 2.6676, "step": 12612 }, { "crossentropy": 2.556307315826416, "epoch": 0.4572578306264501, "grad_norm": 0.028425108641386032, "grad_norm_var": 0.00015879278353271524, "learning_rate": 0.005841459153310316, "loss": 2.5856, "step": 12613 }, { "crossentropy": 2.5056488513946533, "epoch": 0.45729408352668216, "grad_norm": 0.03181709721684456, "grad_norm_var": 0.00015762860239369053, "learning_rate": 0.0058408863478366735, "loss": 2.561, "step": 12614 }, { "crossentropy": 2.46400785446167, "epoch": 0.4573303364269142, "grad_norm": 0.029634132981300354, "grad_norm_var": 0.0001569658367461877, "learning_rate": 0.005840313531005585, "loss": 2.6374, "step": 12615 }, { "crossentropy": 2.6226038932800293, "epoch": 0.4573665893271462, "grad_norm": 0.027551021426916122, "grad_norm_var": 0.00015620314450331046, "learning_rate": 0.005839740702824787, "loss": 2.5349, "step": 12616 }, { "crossentropy": 2.6333975791931152, "epoch": 0.4574028422273782, "grad_norm": 0.027647102251648903, "grad_norm_var": 0.00015603767875100406, "learning_rate": 0.005839167863302015, "loss": 2.6261, "step": 12617 }, { "crossentropy": 2.6767804622650146, "epoch": 0.4574390951276102, "grad_norm": 0.028308620676398277, "grad_norm_var": 0.00015676235687513325, "learning_rate": 0.005838595012445008, "loss": 2.6774, "step": 12618 }, { "crossentropy": 2.64766526222229, "epoch": 0.45747534802784223, "grad_norm": 0.026878435164690018, "grad_norm_var": 0.00015686235588604236, "learning_rate": 0.005838022150261504, "loss": 2.6415, "step": 12619 }, { "crossentropy": 2.640385866165161, "epoch": 0.45751160092807425, "grad_norm": 0.026898004114627838, "grad_norm_var": 0.00015739126602382485, "learning_rate": 0.005837449276759237, "loss": 2.6442, "step": 12620 }, { "crossentropy": 2.5768628120422363, "epoch": 0.45754785382830626, "grad_norm": 0.028823023661971092, "grad_norm_var": 0.00015732624561816763, "learning_rate": 0.00583687639194595, "loss": 2.6239, "step": 12621 }, { "crossentropy": 2.552757978439331, "epoch": 0.4575841067285383, "grad_norm": 0.027262629941105843, "grad_norm_var": 0.00015677759575383231, "learning_rate": 0.005836303495829375, "loss": 2.5756, "step": 12622 }, { "crossentropy": 2.7084081172943115, "epoch": 0.4576203596287703, "grad_norm": 0.02725389413535595, "grad_norm_var": 3.3444986038105284e-06, "learning_rate": 0.005835730588417253, "loss": 2.6535, "step": 12623 }, { "crossentropy": 2.509530544281006, "epoch": 0.4576566125290023, "grad_norm": 0.03150840848684311, "grad_norm_var": 3.7761316071754e-06, "learning_rate": 0.0058351576697173195, "loss": 2.5729, "step": 12624 }, { "crossentropy": 2.637760877609253, "epoch": 0.4576928654292343, "grad_norm": 0.02986530400812626, "grad_norm_var": 3.401276076570716e-06, "learning_rate": 0.005834584739737317, "loss": 2.5698, "step": 12625 }, { "crossentropy": 2.5958664417266846, "epoch": 0.45772911832946633, "grad_norm": 0.02769358828663826, "grad_norm_var": 2.3351623911088316e-06, "learning_rate": 0.005834011798484979, "loss": 2.5905, "step": 12626 }, { "crossentropy": 2.5758354663848877, "epoch": 0.4577653712296984, "grad_norm": 0.02714122273027897, "grad_norm_var": 2.3875640318564976e-06, "learning_rate": 0.005833438845968046, "loss": 2.5752, "step": 12627 }, { "crossentropy": 2.6093649864196777, "epoch": 0.4578016241299304, "grad_norm": 0.027523692697286606, "grad_norm_var": 2.4341666486267134e-06, "learning_rate": 0.005832865882194256, "loss": 2.6441, "step": 12628 }, { "crossentropy": 2.635279893875122, "epoch": 0.45783787703016243, "grad_norm": 0.02730274759232998, "grad_norm_var": 2.507562102850522e-06, "learning_rate": 0.00583229290717135, "loss": 2.6436, "step": 12629 }, { "crossentropy": 2.6307597160339355, "epoch": 0.45787412993039445, "grad_norm": 0.02753322571516037, "grad_norm_var": 1.6566569265720477e-06, "learning_rate": 0.005831719920907066, "loss": 2.5792, "step": 12630 }, { "crossentropy": 2.572885513305664, "epoch": 0.45791038283062646, "grad_norm": 0.028549417853355408, "grad_norm_var": 1.501310259710333e-06, "learning_rate": 0.0058311469234091405, "loss": 2.6113, "step": 12631 }, { "crossentropy": 2.571770191192627, "epoch": 0.4579466357308585, "grad_norm": 0.028585106134414673, "grad_norm_var": 1.5084768062074866e-06, "learning_rate": 0.005830573914685314, "loss": 2.6615, "step": 12632 }, { "crossentropy": 2.476137161254883, "epoch": 0.4579828886310905, "grad_norm": 0.030030855908989906, "grad_norm_var": 1.7360729373494879e-06, "learning_rate": 0.005830000894743328, "loss": 2.6197, "step": 12633 }, { "crossentropy": 2.5076372623443604, "epoch": 0.4580191415313225, "grad_norm": 0.03263472020626068, "grad_norm_var": 2.9699306063156087e-06, "learning_rate": 0.00582942786359092, "loss": 2.6089, "step": 12634 }, { "crossentropy": 2.344669818878174, "epoch": 0.4580553944315545, "grad_norm": 0.028305981308221817, "grad_norm_var": 2.794785946336993e-06, "learning_rate": 0.00582885482123583, "loss": 2.458, "step": 12635 }, { "crossentropy": 2.5219321250915527, "epoch": 0.45809164733178653, "grad_norm": 0.02623739093542099, "grad_norm_var": 2.968187854327517e-06, "learning_rate": 0.005828281767685797, "loss": 2.5576, "step": 12636 }, { "crossentropy": 2.4986612796783447, "epoch": 0.45812790023201855, "grad_norm": 0.026951944455504417, "grad_norm_var": 3.110326336203219e-06, "learning_rate": 0.005827708702948562, "loss": 2.5714, "step": 12637 }, { "crossentropy": 2.5819826126098633, "epoch": 0.45816415313225056, "grad_norm": 0.02675669826567173, "grad_norm_var": 3.2029646980441848e-06, "learning_rate": 0.0058271356270318654, "loss": 2.6361, "step": 12638 }, { "crossentropy": 2.7211735248565674, "epoch": 0.4582004060324826, "grad_norm": 0.02875087969005108, "grad_norm_var": 3.120823842018176e-06, "learning_rate": 0.005826562539943447, "loss": 2.7027, "step": 12639 }, { "crossentropy": 2.389918565750122, "epoch": 0.4582366589327146, "grad_norm": 0.0271858312189579, "grad_norm_var": 2.532088025450916e-06, "learning_rate": 0.005825989441691047, "loss": 2.4001, "step": 12640 }, { "crossentropy": 2.5935580730438232, "epoch": 0.45827291183294666, "grad_norm": 0.027297386899590492, "grad_norm_var": 2.370804025122195e-06, "learning_rate": 0.005825416332282405, "loss": 2.5864, "step": 12641 }, { "crossentropy": 2.752267360687256, "epoch": 0.4583091647331787, "grad_norm": 0.026349157094955444, "grad_norm_var": 2.5440845263755095e-06, "learning_rate": 0.005824843211725265, "loss": 2.7001, "step": 12642 }, { "crossentropy": 2.5702309608459473, "epoch": 0.4583454176334107, "grad_norm": 0.026752891018986702, "grad_norm_var": 2.595179863030848e-06, "learning_rate": 0.005824270080027363, "loss": 2.592, "step": 12643 }, { "crossentropy": 2.607095241546631, "epoch": 0.4583816705336427, "grad_norm": 0.025876756757497787, "grad_norm_var": 2.852113719163417e-06, "learning_rate": 0.005823696937196444, "loss": 2.6115, "step": 12644 }, { "crossentropy": 2.6006875038146973, "epoch": 0.4584179234338747, "grad_norm": 0.026946131139993668, "grad_norm_var": 2.8846004466680117e-06, "learning_rate": 0.005823123783240248, "loss": 2.6406, "step": 12645 }, { "crossentropy": 2.599494457244873, "epoch": 0.45845417633410673, "grad_norm": 0.03513108566403389, "grad_norm_var": 6.2258344802448386e-06, "learning_rate": 0.005822550618166518, "loss": 2.6238, "step": 12646 }, { "crossentropy": 2.579397201538086, "epoch": 0.45849042923433875, "grad_norm": 0.03210005909204483, "grad_norm_var": 7.145399088364738e-06, "learning_rate": 0.00582197744198299, "loss": 2.5338, "step": 12647 }, { "crossentropy": 2.459655523300171, "epoch": 0.45852668213457076, "grad_norm": 0.02861693501472473, "grad_norm_var": 7.145851997141887e-06, "learning_rate": 0.005821404254697411, "loss": 2.6013, "step": 12648 }, { "crossentropy": 2.684781789779663, "epoch": 0.4585629350348028, "grad_norm": 0.027882851660251617, "grad_norm_var": 6.994436349310355e-06, "learning_rate": 0.005820831056317521, "loss": 2.6169, "step": 12649 }, { "crossentropy": 2.6492481231689453, "epoch": 0.4585991879350348, "grad_norm": 0.030294222757220268, "grad_norm_var": 6.003136364334377e-06, "learning_rate": 0.005820257846851063, "loss": 2.6106, "step": 12650 }, { "crossentropy": 2.515413999557495, "epoch": 0.4586354408352668, "grad_norm": 0.0273639727383852, "grad_norm_var": 6.047140460233126e-06, "learning_rate": 0.005819684626305776, "loss": 2.5952, "step": 12651 }, { "crossentropy": 2.4167706966400146, "epoch": 0.4586716937354988, "grad_norm": 0.027951350435614586, "grad_norm_var": 5.792314058130031e-06, "learning_rate": 0.0058191113946894045, "loss": 2.4997, "step": 12652 }, { "crossentropy": 2.4613759517669678, "epoch": 0.45870794663573083, "grad_norm": 0.025527412071824074, "grad_norm_var": 6.168165477802151e-06, "learning_rate": 0.005818538152009691, "loss": 2.4959, "step": 12653 }, { "crossentropy": 2.5917575359344482, "epoch": 0.4587441995359629, "grad_norm": 0.025858473032712936, "grad_norm_var": 6.388329004433863e-06, "learning_rate": 0.0058179648982743795, "loss": 2.5883, "step": 12654 }, { "crossentropy": 2.5768017768859863, "epoch": 0.4587804524361949, "grad_norm": 0.027026724070310593, "grad_norm_var": 6.42859505538314e-06, "learning_rate": 0.005817391633491208, "loss": 2.6401, "step": 12655 }, { "crossentropy": 2.58642840385437, "epoch": 0.45881670533642693, "grad_norm": 0.02777625247836113, "grad_norm_var": 6.3854953774315656e-06, "learning_rate": 0.0058168183576679236, "loss": 2.5548, "step": 12656 }, { "crossentropy": 2.508634567260742, "epoch": 0.45885295823665895, "grad_norm": 0.03025594726204872, "grad_norm_var": 6.6368677620260205e-06, "learning_rate": 0.005816245070812267, "loss": 2.6684, "step": 12657 }, { "crossentropy": 2.5362038612365723, "epoch": 0.45888921113689096, "grad_norm": 0.029062800109386444, "grad_norm_var": 6.415900647040044e-06, "learning_rate": 0.005815671772931983, "loss": 2.5548, "step": 12658 }, { "crossentropy": 2.652644157409668, "epoch": 0.458925464037123, "grad_norm": 0.027360135689377785, "grad_norm_var": 6.3054667569585e-06, "learning_rate": 0.005815098464034813, "loss": 2.6435, "step": 12659 }, { "crossentropy": 2.44933819770813, "epoch": 0.458961716937355, "grad_norm": 0.02906646765768528, "grad_norm_var": 5.851460050685688e-06, "learning_rate": 0.005814525144128501, "loss": 2.5582, "step": 12660 }, { "crossentropy": 2.4730453491210938, "epoch": 0.458997969837587, "grad_norm": 0.02625628001987934, "grad_norm_var": 6.036895515721405e-06, "learning_rate": 0.005813951813220791, "loss": 2.5342, "step": 12661 }, { "crossentropy": 2.541269540786743, "epoch": 0.459034222737819, "grad_norm": 0.027400538325309753, "grad_norm_var": 3.0356847173973092e-06, "learning_rate": 0.005813378471319427, "loss": 2.5589, "step": 12662 }, { "crossentropy": 2.534357786178589, "epoch": 0.45907047563805103, "grad_norm": 0.026904592290520668, "grad_norm_var": 1.96046026054326e-06, "learning_rate": 0.005812805118432152, "loss": 2.5265, "step": 12663 }, { "crossentropy": 2.6268889904022217, "epoch": 0.45910672853828305, "grad_norm": 0.02716466784477234, "grad_norm_var": 1.9317295647763393e-06, "learning_rate": 0.0058122317545667105, "loss": 2.6292, "step": 12664 }, { "crossentropy": 2.5790367126464844, "epoch": 0.45914298143851506, "grad_norm": 0.02904864028096199, "grad_norm_var": 2.045552820960766e-06, "learning_rate": 0.0058116583797308465, "loss": 2.6159, "step": 12665 }, { "crossentropy": 2.594625234603882, "epoch": 0.4591792343387471, "grad_norm": 0.027236333116889, "grad_norm_var": 1.6007594627706044e-06, "learning_rate": 0.005811084993932305, "loss": 2.6125, "step": 12666 }, { "crossentropy": 2.5438714027404785, "epoch": 0.4592154872389791, "grad_norm": 0.026165969669818878, "grad_norm_var": 1.7247732125503923e-06, "learning_rate": 0.005810511597178829, "loss": 2.4866, "step": 12667 }, { "crossentropy": 2.685014009475708, "epoch": 0.45925174013921116, "grad_norm": 0.028136342763900757, "grad_norm_var": 1.7379484664930532e-06, "learning_rate": 0.005809938189478165, "loss": 2.6258, "step": 12668 }, { "crossentropy": 2.705599784851074, "epoch": 0.4592879930394432, "grad_norm": 0.027292069047689438, "grad_norm_var": 1.4648081509018348e-06, "learning_rate": 0.005809364770838055, "loss": 2.5415, "step": 12669 }, { "crossentropy": 2.490302562713623, "epoch": 0.4593242459396752, "grad_norm": 0.02683251164853573, "grad_norm_var": 1.294583737726738e-06, "learning_rate": 0.005808791341266246, "loss": 2.4908, "step": 12670 }, { "crossentropy": 2.620455026626587, "epoch": 0.4593604988399072, "grad_norm": 0.026998551562428474, "grad_norm_var": 1.297112215839025e-06, "learning_rate": 0.0058082179007704834, "loss": 2.5456, "step": 12671 }, { "crossentropy": 2.5139334201812744, "epoch": 0.4593967517401392, "grad_norm": 0.02772391401231289, "grad_norm_var": 1.2966457922691336e-06, "learning_rate": 0.005807644449358511, "loss": 2.5897, "step": 12672 }, { "crossentropy": 2.6144425868988037, "epoch": 0.45943300464037123, "grad_norm": 0.02654866874217987, "grad_norm_var": 8.83135761038598e-07, "learning_rate": 0.005807070987038075, "loss": 2.6459, "step": 12673 }, { "crossentropy": 2.6619443893432617, "epoch": 0.45946925754060325, "grad_norm": 0.026940064504742622, "grad_norm_var": 7.082612298291476e-07, "learning_rate": 0.0058064975138169194, "loss": 2.6091, "step": 12674 }, { "crossentropy": 2.4829158782958984, "epoch": 0.45950551044083526, "grad_norm": 0.028799673542380333, "grad_norm_var": 8.460124993821347e-07, "learning_rate": 0.005805924029702793, "loss": 2.4719, "step": 12675 }, { "crossentropy": 2.529224395751953, "epoch": 0.4595417633410673, "grad_norm": 0.03865709528326988, "grad_norm_var": 8.71655344254988e-06, "learning_rate": 0.005805350534703438, "loss": 2.4813, "step": 12676 }, { "crossentropy": 2.6936936378479004, "epoch": 0.4595780162412993, "grad_norm": 0.029355047270655632, "grad_norm_var": 8.593514875777225e-06, "learning_rate": 0.005804777028826602, "loss": 2.6031, "step": 12677 }, { "crossentropy": 2.5805680751800537, "epoch": 0.4596142691415313, "grad_norm": 0.029544170945882797, "grad_norm_var": 8.652128543224814e-06, "learning_rate": 0.005804203512080031, "loss": 2.6998, "step": 12678 }, { "crossentropy": 2.5761427879333496, "epoch": 0.4596505220417633, "grad_norm": 0.027633190155029297, "grad_norm_var": 8.546418999800714e-06, "learning_rate": 0.005803629984471472, "loss": 2.5352, "step": 12679 }, { "crossentropy": 2.5542283058166504, "epoch": 0.45968677494199534, "grad_norm": 0.027004025876522064, "grad_norm_var": 8.574058842182582e-06, "learning_rate": 0.005803056446008668, "loss": 2.5928, "step": 12680 }, { "crossentropy": 2.645176887512207, "epoch": 0.4597230278422274, "grad_norm": 0.0257043968886137, "grad_norm_var": 8.970347486326042e-06, "learning_rate": 0.0058024828966993704, "loss": 2.6349, "step": 12681 }, { "crossentropy": 2.714252233505249, "epoch": 0.4597592807424594, "grad_norm": 0.02681793086230755, "grad_norm_var": 9.03285926592458e-06, "learning_rate": 0.005801909336551323, "loss": 2.6606, "step": 12682 }, { "crossentropy": 2.500950336456299, "epoch": 0.45979553364269143, "grad_norm": 0.026919813826680183, "grad_norm_var": 8.870504623288454e-06, "learning_rate": 0.0058013357655722735, "loss": 2.5405, "step": 12683 }, { "crossentropy": 2.674741268157959, "epoch": 0.45983178654292345, "grad_norm": 0.027175618335604668, "grad_norm_var": 8.934003831856327e-06, "learning_rate": 0.005800762183769967, "loss": 2.6089, "step": 12684 }, { "crossentropy": 2.5711374282836914, "epoch": 0.45986803944315546, "grad_norm": 0.02710980735719204, "grad_norm_var": 8.95624067004419e-06, "learning_rate": 0.005800188591152154, "loss": 2.5863, "step": 12685 }, { "crossentropy": 2.584836721420288, "epoch": 0.4599042923433875, "grad_norm": 0.028349852189421654, "grad_norm_var": 8.841627841689097e-06, "learning_rate": 0.005799614987726578, "loss": 2.5289, "step": 12686 }, { "crossentropy": 2.593968152999878, "epoch": 0.4599405452436195, "grad_norm": 0.027818890288472176, "grad_norm_var": 8.751715600635692e-06, "learning_rate": 0.005799041373500988, "loss": 2.5893, "step": 12687 }, { "crossentropy": 2.5091006755828857, "epoch": 0.4599767981438515, "grad_norm": 0.027676137164235115, "grad_norm_var": 8.755250236266753e-06, "learning_rate": 0.005798467748483134, "loss": 2.5611, "step": 12688 }, { "crossentropy": 2.688185453414917, "epoch": 0.4600130510440835, "grad_norm": 0.026916297152638435, "grad_norm_var": 8.680136184134543e-06, "learning_rate": 0.005797894112680759, "loss": 2.667, "step": 12689 }, { "crossentropy": 2.632938861846924, "epoch": 0.46004930394431554, "grad_norm": 0.02695491351187229, "grad_norm_var": 8.67750424563083e-06, "learning_rate": 0.005797320466101615, "loss": 2.6283, "step": 12690 }, { "crossentropy": 2.3569958209991455, "epoch": 0.46008555684454755, "grad_norm": 0.027460772544145584, "grad_norm_var": 8.69629172072192e-06, "learning_rate": 0.0057967468087534455, "loss": 2.404, "step": 12691 }, { "crossentropy": 2.578765630722046, "epoch": 0.46012180974477956, "grad_norm": 0.031744007021188736, "grad_norm_var": 2.038561249345866e-06, "learning_rate": 0.005796173140644003, "loss": 2.6294, "step": 12692 }, { "crossentropy": 2.6453511714935303, "epoch": 0.4601580626450116, "grad_norm": 0.029086247086524963, "grad_norm_var": 1.985966274145543e-06, "learning_rate": 0.005795599461781034, "loss": 2.6214, "step": 12693 }, { "crossentropy": 2.379070281982422, "epoch": 0.4601943155452436, "grad_norm": 0.027333416044712067, "grad_norm_var": 1.7610219229917863e-06, "learning_rate": 0.005795025772172287, "loss": 2.5353, "step": 12694 }, { "crossentropy": 2.601005792617798, "epoch": 0.46023056844547566, "grad_norm": 0.02786046825349331, "grad_norm_var": 1.765056699999295e-06, "learning_rate": 0.00579445207182551, "loss": 2.6519, "step": 12695 }, { "crossentropy": 2.7343311309814453, "epoch": 0.4602668213457077, "grad_norm": 0.031724460422992706, "grad_norm_var": 2.76952893110958e-06, "learning_rate": 0.0057938783607484525, "loss": 2.6632, "step": 12696 }, { "crossentropy": 2.482353687286377, "epoch": 0.4603030742459397, "grad_norm": 0.026830313727259636, "grad_norm_var": 2.5167765284056595e-06, "learning_rate": 0.005793304638948862, "loss": 2.5333, "step": 12697 }, { "crossentropy": 2.5764899253845215, "epoch": 0.4603393271461717, "grad_norm": 0.027657926082611084, "grad_norm_var": 2.4300324071156733e-06, "learning_rate": 0.00579273090643449, "loss": 2.6107, "step": 12698 }, { "crossentropy": 2.4358792304992676, "epoch": 0.4603755800464037, "grad_norm": 0.029005106538534164, "grad_norm_var": 2.390720739248161e-06, "learning_rate": 0.005792157163213083, "loss": 2.3762, "step": 12699 }, { "crossentropy": 2.56951904296875, "epoch": 0.46041183294663574, "grad_norm": 0.030582256615161896, "grad_norm_var": 2.664825859992279e-06, "learning_rate": 0.00579158340929239, "loss": 2.5275, "step": 12700 }, { "crossentropy": 2.6087677478790283, "epoch": 0.46044808584686775, "grad_norm": 0.027576880529522896, "grad_norm_var": 2.5992374776154697e-06, "learning_rate": 0.005791009644680165, "loss": 2.6259, "step": 12701 }, { "crossentropy": 2.4709601402282715, "epoch": 0.46048433874709976, "grad_norm": 0.026987163349986076, "grad_norm_var": 2.726427179600144e-06, "learning_rate": 0.005790435869384153, "loss": 2.4969, "step": 12702 }, { "crossentropy": 2.4060094356536865, "epoch": 0.4605205916473318, "grad_norm": 0.025492513552308083, "grad_norm_var": 3.2219616555147213e-06, "learning_rate": 0.005789862083412105, "loss": 2.4704, "step": 12703 }, { "crossentropy": 2.693305015563965, "epoch": 0.4605568445475638, "grad_norm": 0.026291707530617714, "grad_norm_var": 3.4348627889609534e-06, "learning_rate": 0.00578928828677177, "loss": 2.6879, "step": 12704 }, { "crossentropy": 2.5839784145355225, "epoch": 0.4605930974477958, "grad_norm": 0.027467316016554832, "grad_norm_var": 3.367312219917408e-06, "learning_rate": 0.0057887144794709, "loss": 2.6262, "step": 12705 }, { "crossentropy": 2.5039188861846924, "epoch": 0.4606293503480278, "grad_norm": 0.026466280221939087, "grad_norm_var": 3.4586931696599e-06, "learning_rate": 0.0057881406615172435, "loss": 2.5699, "step": 12706 }, { "crossentropy": 2.63569974899292, "epoch": 0.46066560324825984, "grad_norm": 0.026790494099259377, "grad_norm_var": 3.543715550400709e-06, "learning_rate": 0.005787566832918553, "loss": 2.6605, "step": 12707 }, { "crossentropy": 2.637406826019287, "epoch": 0.4607018561484919, "grad_norm": 0.02618049643933773, "grad_norm_var": 2.74251319153002e-06, "learning_rate": 0.005786992993682574, "loss": 2.6737, "step": 12708 }, { "crossentropy": 2.5771098136901855, "epoch": 0.4607381090487239, "grad_norm": 0.027038699015975, "grad_norm_var": 2.628357330100815e-06, "learning_rate": 0.005786419143817063, "loss": 2.5272, "step": 12709 }, { "crossentropy": 2.7298688888549805, "epoch": 0.46077436194895594, "grad_norm": 0.027133114635944366, "grad_norm_var": 2.637459530692212e-06, "learning_rate": 0.005785845283329767, "loss": 2.6942, "step": 12710 }, { "crossentropy": 2.520641565322876, "epoch": 0.46081061484918795, "grad_norm": 0.0278486218303442, "grad_norm_var": 2.637006064732985e-06, "learning_rate": 0.005785271412228439, "loss": 2.5735, "step": 12711 }, { "crossentropy": 2.560513973236084, "epoch": 0.46084686774941996, "grad_norm": 0.02799925208091736, "grad_norm_var": 1.4393840667597154e-06, "learning_rate": 0.005784697530520826, "loss": 2.5023, "step": 12712 }, { "crossentropy": 2.5445456504821777, "epoch": 0.460883120649652, "grad_norm": 0.026227930560708046, "grad_norm_var": 1.5025389037373026e-06, "learning_rate": 0.0057841236382146855, "loss": 2.5301, "step": 12713 }, { "crossentropy": 2.5497853755950928, "epoch": 0.460919373549884, "grad_norm": 0.02721581794321537, "grad_norm_var": 1.4934563550507052e-06, "learning_rate": 0.005783549735317763, "loss": 2.5497, "step": 12714 }, { "crossentropy": 2.6026406288146973, "epoch": 0.460955626450116, "grad_norm": 0.02977076545357704, "grad_norm_var": 1.707333573640783e-06, "learning_rate": 0.005782975821837814, "loss": 2.5435, "step": 12715 }, { "crossentropy": 2.6225247383117676, "epoch": 0.460991879350348, "grad_norm": 0.030738890171051025, "grad_norm_var": 1.7770636317023519e-06, "learning_rate": 0.005782401897782586, "loss": 2.6625, "step": 12716 }, { "crossentropy": 2.6606831550598145, "epoch": 0.46102813225058004, "grad_norm": 0.02675752528011799, "grad_norm_var": 1.7916824269243915e-06, "learning_rate": 0.0057818279631598355, "loss": 2.5198, "step": 12717 }, { "crossentropy": 2.6689987182617188, "epoch": 0.46106438515081205, "grad_norm": 0.026714248582720757, "grad_norm_var": 1.8068265477255331e-06, "learning_rate": 0.005781254017977311, "loss": 2.6761, "step": 12718 }, { "crossentropy": 2.5258798599243164, "epoch": 0.46110063805104406, "grad_norm": 0.025702079758048058, "grad_norm_var": 1.7602300082704732e-06, "learning_rate": 0.005780680062242765, "loss": 2.4793, "step": 12719 }, { "crossentropy": 2.542452812194824, "epoch": 0.4611368909512761, "grad_norm": 0.026280872523784637, "grad_norm_var": 1.761652751363733e-06, "learning_rate": 0.005780106095963949, "loss": 2.5219, "step": 12720 }, { "crossentropy": 2.4086363315582275, "epoch": 0.4611731438515081, "grad_norm": 0.026388736441731453, "grad_norm_var": 1.8060964780560718e-06, "learning_rate": 0.005779532119148618, "loss": 2.4593, "step": 12721 }, { "crossentropy": 2.4414865970611572, "epoch": 0.46120939675174016, "grad_norm": 0.026385720819234848, "grad_norm_var": 1.8144192961924248e-06, "learning_rate": 0.005778958131804523, "loss": 2.4995, "step": 12722 }, { "crossentropy": 2.5987913608551025, "epoch": 0.4612456496519722, "grad_norm": 0.028569530695676804, "grad_norm_var": 1.9154895441555643e-06, "learning_rate": 0.005778384133939415, "loss": 2.6137, "step": 12723 }, { "crossentropy": 2.613464593887329, "epoch": 0.4612819025522042, "grad_norm": 0.02928519994020462, "grad_norm_var": 2.0505678729372418e-06, "learning_rate": 0.005777810125561047, "loss": 2.6222, "step": 12724 }, { "crossentropy": 2.6649510860443115, "epoch": 0.4613181554524362, "grad_norm": 0.02818712592124939, "grad_norm_var": 2.061816526054934e-06, "learning_rate": 0.005777236106677175, "loss": 2.6867, "step": 12725 }, { "crossentropy": 2.5967116355895996, "epoch": 0.4613544083526682, "grad_norm": 0.03021121211349964, "grad_norm_var": 2.472489432951234e-06, "learning_rate": 0.00577666207729555, "loss": 2.6299, "step": 12726 }, { "crossentropy": 2.4507904052734375, "epoch": 0.46139066125290024, "grad_norm": 0.02729855105280876, "grad_norm_var": 2.4854670312283623e-06, "learning_rate": 0.0057760880374239255, "loss": 2.4807, "step": 12727 }, { "crossentropy": 2.529911756515503, "epoch": 0.46142691415313225, "grad_norm": 0.02700737863779068, "grad_norm_var": 2.5117886815195634e-06, "learning_rate": 0.005775513987070051, "loss": 2.5733, "step": 12728 }, { "crossentropy": 2.4840023517608643, "epoch": 0.46146316705336426, "grad_norm": 0.030766643583774567, "grad_norm_var": 2.925781744852971e-06, "learning_rate": 0.0057749399262416855, "loss": 2.5076, "step": 12729 }, { "crossentropy": 2.648251533508301, "epoch": 0.4614994199535963, "grad_norm": 0.029919110238552094, "grad_norm_var": 3.1160818169183864e-06, "learning_rate": 0.0057743658549465815, "loss": 2.5494, "step": 12730 }, { "crossentropy": 2.45932936668396, "epoch": 0.4615356728538283, "grad_norm": 0.026964059099555016, "grad_norm_var": 2.992157348966941e-06, "learning_rate": 0.00577379177319249, "loss": 2.549, "step": 12731 }, { "crossentropy": 2.4401254653930664, "epoch": 0.4615719257540603, "grad_norm": 0.02616708353161812, "grad_norm_var": 2.597579649389614e-06, "learning_rate": 0.005773217680987166, "loss": 2.5795, "step": 12732 }, { "crossentropy": 2.6272647380828857, "epoch": 0.4616081786542923, "grad_norm": 0.026406865566968918, "grad_norm_var": 2.6475913851928354e-06, "learning_rate": 0.005772643578338364, "loss": 2.5732, "step": 12733 }, { "crossentropy": 2.6818156242370605, "epoch": 0.46164443155452434, "grad_norm": 0.027937961742281914, "grad_norm_var": 2.589989243865229e-06, "learning_rate": 0.005772069465253839, "loss": 2.6647, "step": 12734 }, { "crossentropy": 2.7109384536743164, "epoch": 0.4616806844547564, "grad_norm": 0.0278012715280056, "grad_norm_var": 2.3013343295658072e-06, "learning_rate": 0.005771495341741344, "loss": 2.6368, "step": 12735 }, { "crossentropy": 2.618551731109619, "epoch": 0.4617169373549884, "grad_norm": 0.02770630083978176, "grad_norm_var": 2.1303702309793803e-06, "learning_rate": 0.005770921207808632, "loss": 2.5129, "step": 12736 }, { "crossentropy": 2.631443977355957, "epoch": 0.46175319025522044, "grad_norm": 0.026852326467633247, "grad_norm_var": 2.0480596538648e-06, "learning_rate": 0.0057703470634634605, "loss": 2.6745, "step": 12737 }, { "crossentropy": 2.5024631023406982, "epoch": 0.46178944315545245, "grad_norm": 0.027445359155535698, "grad_norm_var": 1.8948755001242194e-06, "learning_rate": 0.005769772908713584, "loss": 2.5816, "step": 12738 }, { "crossentropy": 2.511343479156494, "epoch": 0.46182569605568446, "grad_norm": 0.02709234319627285, "grad_norm_var": 1.9255566257641757e-06, "learning_rate": 0.005769198743566755, "loss": 2.5325, "step": 12739 }, { "crossentropy": 2.5554745197296143, "epoch": 0.4618619489559165, "grad_norm": 0.03203324228525162, "grad_norm_var": 2.890227670499752e-06, "learning_rate": 0.005768624568030728, "loss": 2.5679, "step": 12740 }, { "crossentropy": 2.6390373706817627, "epoch": 0.4618982018561485, "grad_norm": 0.032188404351472855, "grad_norm_var": 3.930785734664261e-06, "learning_rate": 0.005768050382113261, "loss": 2.6152, "step": 12741 }, { "crossentropy": 2.587963819503784, "epoch": 0.4619344547563805, "grad_norm": 0.02692030370235443, "grad_norm_var": 3.796421582292521e-06, "learning_rate": 0.005767476185822108, "loss": 2.5749, "step": 12742 }, { "crossentropy": 2.561807870864868, "epoch": 0.4619707076566125, "grad_norm": 0.027175001800060272, "grad_norm_var": 3.8115121016917e-06, "learning_rate": 0.0057669019791650265, "loss": 2.6337, "step": 12743 }, { "crossentropy": 2.7169010639190674, "epoch": 0.46200696055684454, "grad_norm": 0.027520444244146347, "grad_norm_var": 3.7498689558998836e-06, "learning_rate": 0.005766327762149767, "loss": 2.7005, "step": 12744 }, { "crossentropy": 2.4790608882904053, "epoch": 0.46204321345707655, "grad_norm": 0.026074284687638283, "grad_norm_var": 3.5083343064806155e-06, "learning_rate": 0.00576575353478409, "loss": 2.5545, "step": 12745 }, { "crossentropy": 2.4592723846435547, "epoch": 0.46207946635730857, "grad_norm": 0.02602805569767952, "grad_norm_var": 3.4007306846232556e-06, "learning_rate": 0.005765179297075747, "loss": 2.5645, "step": 12746 }, { "crossentropy": 2.6260643005371094, "epoch": 0.4621157192575406, "grad_norm": 0.026700394228100777, "grad_norm_var": 3.428999621126249e-06, "learning_rate": 0.0057646050490324985, "loss": 2.5816, "step": 12747 }, { "crossentropy": 2.7162747383117676, "epoch": 0.4621519721577726, "grad_norm": 0.0269184373319149, "grad_norm_var": 3.3179172649534647e-06, "learning_rate": 0.005764030790662099, "loss": 2.6162, "step": 12748 }, { "crossentropy": 2.672975540161133, "epoch": 0.46218822505800466, "grad_norm": 0.026642514392733574, "grad_norm_var": 3.2815413701749637e-06, "learning_rate": 0.005763456521972303, "loss": 2.6914, "step": 12749 }, { "crossentropy": 2.6927733421325684, "epoch": 0.4622244779582367, "grad_norm": 0.026599230244755745, "grad_norm_var": 3.3492560158623317e-06, "learning_rate": 0.005762882242970869, "loss": 2.6437, "step": 12750 }, { "crossentropy": 2.6619577407836914, "epoch": 0.4622607308584687, "grad_norm": 0.02662884071469307, "grad_norm_var": 3.4046611958627252e-06, "learning_rate": 0.005762307953665552, "loss": 2.6079, "step": 12751 }, { "crossentropy": 2.6697423458099365, "epoch": 0.4622969837587007, "grad_norm": 0.027419069781899452, "grad_norm_var": 3.403174542571108e-06, "learning_rate": 0.005761733654064108, "loss": 2.5791, "step": 12752 }, { "crossentropy": 2.6207468509674072, "epoch": 0.4623332366589327, "grad_norm": 0.027452541515231133, "grad_norm_var": 3.3726665323164164e-06, "learning_rate": 0.005761159344174297, "loss": 2.6571, "step": 12753 }, { "crossentropy": 2.625185489654541, "epoch": 0.46236948955916474, "grad_norm": 0.02544482611119747, "grad_norm_var": 3.651352766480207e-06, "learning_rate": 0.005760585024003874, "loss": 2.5882, "step": 12754 }, { "crossentropy": 2.502873659133911, "epoch": 0.46240574245939675, "grad_norm": 0.02737789787352085, "grad_norm_var": 3.643693278484785e-06, "learning_rate": 0.005760010693560595, "loss": 2.516, "step": 12755 }, { "crossentropy": 2.7006657123565674, "epoch": 0.46244199535962877, "grad_norm": 0.027448659762740135, "grad_norm_var": 2.1527863284003517e-06, "learning_rate": 0.0057594363528522185, "loss": 2.6628, "step": 12756 }, { "crossentropy": 2.6776680946350098, "epoch": 0.4624782482598608, "grad_norm": 0.026376625522971153, "grad_norm_var": 3.6628319768754657e-07, "learning_rate": 0.005758862001886501, "loss": 2.6786, "step": 12757 }, { "crossentropy": 2.6668975353240967, "epoch": 0.4625145011600928, "grad_norm": 0.026536567136645317, "grad_norm_var": 3.690982015674132e-07, "learning_rate": 0.005758287640671202, "loss": 2.6507, "step": 12758 }, { "crossentropy": 2.5635077953338623, "epoch": 0.4625507540603248, "grad_norm": 0.027839943766593933, "grad_norm_var": 4.325098519629278e-07, "learning_rate": 0.005757713269214077, "loss": 2.5262, "step": 12759 }, { "crossentropy": 2.6824615001678467, "epoch": 0.4625870069605568, "grad_norm": 0.02701439894735813, "grad_norm_var": 4.0078319377591607e-07, "learning_rate": 0.0057571388875228835, "loss": 2.6509, "step": 12760 }, { "crossentropy": 2.700974941253662, "epoch": 0.46262325986078884, "grad_norm": 0.027536919340491295, "grad_norm_var": 3.9659063909361357e-07, "learning_rate": 0.005756564495605381, "loss": 2.6728, "step": 12761 }, { "crossentropy": 2.6166486740112305, "epoch": 0.4626595127610209, "grad_norm": 0.02703266032040119, "grad_norm_var": 3.4651531350946877e-07, "learning_rate": 0.005755990093469328, "loss": 2.6719, "step": 12762 }, { "crossentropy": 2.6814801692962646, "epoch": 0.4626957656612529, "grad_norm": 0.028540795668959618, "grad_norm_var": 5.004923691573898e-07, "learning_rate": 0.00575541568112248, "loss": 2.6721, "step": 12763 }, { "crossentropy": 2.505540609359741, "epoch": 0.46273201856148494, "grad_norm": 0.026602886617183685, "grad_norm_var": 5.122770352302403e-07, "learning_rate": 0.005754841258572597, "loss": 2.549, "step": 12764 }, { "crossentropy": 2.637467384338379, "epoch": 0.46276827146171695, "grad_norm": 0.027927886694669724, "grad_norm_var": 5.489759657972248e-07, "learning_rate": 0.005754266825827438, "loss": 2.6358, "step": 12765 }, { "crossentropy": 2.4613037109375, "epoch": 0.46280452436194897, "grad_norm": 0.026505300775170326, "grad_norm_var": 5.5593969090813e-07, "learning_rate": 0.00575369238289476, "loss": 2.5738, "step": 12766 }, { "crossentropy": 2.5762927532196045, "epoch": 0.462840777262181, "grad_norm": 0.025758426636457443, "grad_norm_var": 6.585939677579307e-07, "learning_rate": 0.0057531179297823235, "loss": 2.5339, "step": 12767 }, { "crossentropy": 2.363773822784424, "epoch": 0.462877030162413, "grad_norm": 0.02751188725233078, "grad_norm_var": 6.636879769955495e-07, "learning_rate": 0.005752543466497886, "loss": 2.5275, "step": 12768 }, { "crossentropy": 2.521357536315918, "epoch": 0.462913283062645, "grad_norm": 0.02632051520049572, "grad_norm_var": 6.840432917563776e-07, "learning_rate": 0.005751968993049206, "loss": 2.5919, "step": 12769 }, { "crossentropy": 2.544647693634033, "epoch": 0.462949535962877, "grad_norm": 0.026326293125748634, "grad_norm_var": 5.51470819960921e-07, "learning_rate": 0.005751394509444044, "loss": 2.6239, "step": 12770 }, { "crossentropy": 2.469714403152466, "epoch": 0.46298578886310904, "grad_norm": 0.02597934752702713, "grad_norm_var": 6.109141776186947e-07, "learning_rate": 0.00575082001569016, "loss": 2.566, "step": 12771 }, { "crossentropy": 2.590876817703247, "epoch": 0.46302204176334105, "grad_norm": 0.026868386194109917, "grad_norm_var": 5.936636592063574e-07, "learning_rate": 0.005750245511795312, "loss": 2.6115, "step": 12772 }, { "crossentropy": 2.6049721240997314, "epoch": 0.46305829466357307, "grad_norm": 0.02726508304476738, "grad_norm_var": 5.789345072564385e-07, "learning_rate": 0.0057496709977672604, "loss": 2.638, "step": 12773 }, { "crossentropy": 2.499635696411133, "epoch": 0.4630945475638051, "grad_norm": 0.02987688034772873, "grad_norm_var": 1.0819334850373577e-06, "learning_rate": 0.005749096473613763, "loss": 2.5085, "step": 12774 }, { "crossentropy": 2.5767977237701416, "epoch": 0.4631308004640371, "grad_norm": 0.030816171318292618, "grad_norm_var": 1.8967550303792638e-06, "learning_rate": 0.0057485219393425815, "loss": 2.6189, "step": 12775 }, { "crossentropy": 2.3959014415740967, "epoch": 0.46316705336426917, "grad_norm": 0.02831370383501053, "grad_norm_var": 1.9410540813005143e-06, "learning_rate": 0.005747947394961475, "loss": 2.4417, "step": 12776 }, { "crossentropy": 2.617795944213867, "epoch": 0.4632033062645012, "grad_norm": 0.0266412403434515, "grad_norm_var": 1.9806880779157326e-06, "learning_rate": 0.005747372840478204, "loss": 2.5563, "step": 12777 }, { "crossentropy": 2.5979037284851074, "epoch": 0.4632395591647332, "grad_norm": 0.02726413682103157, "grad_norm_var": 1.9729165900081447e-06, "learning_rate": 0.005746798275900529, "loss": 2.6666, "step": 12778 }, { "crossentropy": 2.6578054428100586, "epoch": 0.4632758120649652, "grad_norm": 0.027768265455961227, "grad_norm_var": 1.893475938873632e-06, "learning_rate": 0.00574622370123621, "loss": 2.6144, "step": 12779 }, { "crossentropy": 2.626971960067749, "epoch": 0.4633120649651972, "grad_norm": 0.029811717569828033, "grad_norm_var": 2.213450048013062e-06, "learning_rate": 0.005745649116493006, "loss": 2.6355, "step": 12780 }, { "crossentropy": 2.5338196754455566, "epoch": 0.46334831786542924, "grad_norm": 0.030124077573418617, "grad_norm_var": 2.622717109974394e-06, "learning_rate": 0.0057450745216786805, "loss": 2.5801, "step": 12781 }, { "crossentropy": 2.5455687046051025, "epoch": 0.46338457076566125, "grad_norm": 0.026802828535437584, "grad_norm_var": 2.580976041912745e-06, "learning_rate": 0.005744499916800993, "loss": 2.553, "step": 12782 }, { "crossentropy": 2.446659803390503, "epoch": 0.46342082366589327, "grad_norm": 0.02637217566370964, "grad_norm_var": 2.444360536619135e-06, "learning_rate": 0.005743925301867704, "loss": 2.5499, "step": 12783 }, { "crossentropy": 2.6852424144744873, "epoch": 0.4634570765661253, "grad_norm": 0.027918046340346336, "grad_norm_var": 2.441563723071235e-06, "learning_rate": 0.005743350676886573, "loss": 2.6304, "step": 12784 }, { "crossentropy": 2.7159018516540527, "epoch": 0.4634933294663573, "grad_norm": 0.02853226475417614, "grad_norm_var": 2.317106658175624e-06, "learning_rate": 0.005742776041865366, "loss": 2.5127, "step": 12785 }, { "crossentropy": 2.5524139404296875, "epoch": 0.4635295823665893, "grad_norm": 0.029249895364046097, "grad_norm_var": 2.231033088091518e-06, "learning_rate": 0.00574220139681184, "loss": 2.6118, "step": 12786 }, { "crossentropy": 2.6958296298980713, "epoch": 0.4635658352668213, "grad_norm": 0.030164455994963646, "grad_norm_var": 2.1422269436698608e-06, "learning_rate": 0.005741626741733758, "loss": 2.5827, "step": 12787 }, { "crossentropy": 2.5966217517852783, "epoch": 0.46360208816705334, "grad_norm": 0.028622208163142204, "grad_norm_var": 1.9852381697617413e-06, "learning_rate": 0.005741052076638881, "loss": 2.5757, "step": 12788 }, { "crossentropy": 2.5871291160583496, "epoch": 0.4636383410672854, "grad_norm": 0.026832718402147293, "grad_norm_var": 2.06646708167181e-06, "learning_rate": 0.005740477401534973, "loss": 2.5771, "step": 12789 }, { "crossentropy": 2.5138232707977295, "epoch": 0.4636745939675174, "grad_norm": 0.028597552329301834, "grad_norm_var": 1.924415413867665e-06, "learning_rate": 0.005739902716429791, "loss": 2.5982, "step": 12790 }, { "crossentropy": 2.6027212142944336, "epoch": 0.46371084686774944, "grad_norm": 0.03077247180044651, "grad_norm_var": 1.9102496557149574e-06, "learning_rate": 0.005739328021331102, "loss": 2.6247, "step": 12791 }, { "crossentropy": 2.5031981468200684, "epoch": 0.46374709976798145, "grad_norm": 0.027988571673631668, "grad_norm_var": 1.9189387823729044e-06, "learning_rate": 0.005738753316246664, "loss": 2.5772, "step": 12792 }, { "crossentropy": 2.60994291305542, "epoch": 0.46378335266821347, "grad_norm": 0.026233812794089317, "grad_norm_var": 2.021673295511152e-06, "learning_rate": 0.005738178601184244, "loss": 2.6002, "step": 12793 }, { "crossentropy": 2.633666753768921, "epoch": 0.4638196055684455, "grad_norm": 0.027824563905596733, "grad_norm_var": 1.962707927280815e-06, "learning_rate": 0.005737603876151602, "loss": 2.6097, "step": 12794 }, { "crossentropy": 2.4905025959014893, "epoch": 0.4638558584686775, "grad_norm": 0.02846524305641651, "grad_norm_var": 1.938917485411138e-06, "learning_rate": 0.005737029141156498, "loss": 2.5296, "step": 12795 }, { "crossentropy": 2.568063735961914, "epoch": 0.4638921113689095, "grad_norm": 0.025846337899565697, "grad_norm_var": 2.172394755486787e-06, "learning_rate": 0.005736454396206697, "loss": 2.5687, "step": 12796 }, { "crossentropy": 2.6205573081970215, "epoch": 0.4639283642691415, "grad_norm": 0.025794150307774544, "grad_norm_var": 2.202575704086694e-06, "learning_rate": 0.005735879641309964, "loss": 2.6396, "step": 12797 }, { "crossentropy": 2.6916720867156982, "epoch": 0.46396461716937354, "grad_norm": 0.02614336647093296, "grad_norm_var": 2.324125586041512e-06, "learning_rate": 0.0057353048764740575, "loss": 2.646, "step": 12798 }, { "crossentropy": 2.5148236751556396, "epoch": 0.46400087006960555, "grad_norm": 0.026999065652489662, "grad_norm_var": 2.2264281801566305e-06, "learning_rate": 0.005734730101706744, "loss": 2.6347, "step": 12799 }, { "crossentropy": 2.531374454498291, "epoch": 0.46403712296983757, "grad_norm": 0.027064139023423195, "grad_norm_var": 2.266990829510642e-06, "learning_rate": 0.005734155317015785, "loss": 2.5705, "step": 12800 }, { "crossentropy": 2.449526309967041, "epoch": 0.4640733758700696, "grad_norm": 0.02738196589052677, "grad_norm_var": 2.240551430629454e-06, "learning_rate": 0.005733580522408945, "loss": 2.6007, "step": 12801 }, { "crossentropy": 2.514070510864258, "epoch": 0.4641096287703016, "grad_norm": 0.026114076375961304, "grad_norm_var": 2.2275073464685566e-06, "learning_rate": 0.005733005717893987, "loss": 2.5073, "step": 12802 }, { "crossentropy": 2.4187517166137695, "epoch": 0.46414588167053367, "grad_norm": 0.026332305744290352, "grad_norm_var": 1.8109056927672594e-06, "learning_rate": 0.005732430903478674, "loss": 2.5127, "step": 12803 }, { "crossentropy": 2.660900592803955, "epoch": 0.4641821345707657, "grad_norm": 0.027628934010863304, "grad_norm_var": 1.6992184294174646e-06, "learning_rate": 0.005731856079170768, "loss": 2.688, "step": 12804 }, { "crossentropy": 2.720737934112549, "epoch": 0.4642183874709977, "grad_norm": 0.028121130540966988, "grad_norm_var": 1.7310777494299826e-06, "learning_rate": 0.005731281244978037, "loss": 2.6851, "step": 12805 }, { "crossentropy": 2.6028380393981934, "epoch": 0.4642546403712297, "grad_norm": 0.027339819818735123, "grad_norm_var": 1.6176705667262595e-06, "learning_rate": 0.005730706400908244, "loss": 2.6046, "step": 12806 }, { "crossentropy": 2.4340832233428955, "epoch": 0.4642908932714617, "grad_norm": 0.028116799890995026, "grad_norm_var": 8.12292592414221e-07, "learning_rate": 0.005730131546969151, "loss": 2.5204, "step": 12807 }, { "crossentropy": 2.7095930576324463, "epoch": 0.46432714617169374, "grad_norm": 0.02796907350420952, "grad_norm_var": 8.099728582407206e-07, "learning_rate": 0.005729556683168522, "loss": 2.7091, "step": 12808 }, { "crossentropy": 2.520684003829956, "epoch": 0.46436339907192575, "grad_norm": 0.027054229751229286, "grad_norm_var": 7.588290701238467e-07, "learning_rate": 0.005728981809514124, "loss": 2.5022, "step": 12809 }, { "crossentropy": 2.466644048690796, "epoch": 0.46439965197215777, "grad_norm": 0.026688283309340477, "grad_norm_var": 7.353865013742222e-07, "learning_rate": 0.005728406926013722, "loss": 2.5277, "step": 12810 }, { "crossentropy": 2.646637201309204, "epoch": 0.4644359048723898, "grad_norm": 0.02617616020143032, "grad_norm_var": 6.358715572832758e-07, "learning_rate": 0.0057278320326750775, "loss": 2.6676, "step": 12811 }, { "crossentropy": 2.4811694622039795, "epoch": 0.4644721577726218, "grad_norm": 0.027301546186208725, "grad_norm_var": 5.592988628628914e-07, "learning_rate": 0.0057272571295059575, "loss": 2.5561, "step": 12812 }, { "crossentropy": 2.384915590286255, "epoch": 0.4645084106728538, "grad_norm": 0.026182878762483597, "grad_norm_var": 5.055144640197147e-07, "learning_rate": 0.0057266822165141255, "loss": 2.4745, "step": 12813 }, { "crossentropy": 2.661937713623047, "epoch": 0.4645446635730858, "grad_norm": 0.027009591460227966, "grad_norm_var": 4.49042195010601e-07, "learning_rate": 0.005726107293707349, "loss": 2.6434, "step": 12814 }, { "crossentropy": 2.7354514598846436, "epoch": 0.46458091647331784, "grad_norm": 0.026368670165538788, "grad_norm_var": 4.817330121646585e-07, "learning_rate": 0.005725532361093391, "loss": 2.6244, "step": 12815 }, { "crossentropy": 2.6350207328796387, "epoch": 0.4646171693735499, "grad_norm": 0.02590160444378853, "grad_norm_var": 5.644898731069995e-07, "learning_rate": 0.005724957418680014, "loss": 2.594, "step": 12816 }, { "crossentropy": 2.681520938873291, "epoch": 0.4646534222737819, "grad_norm": 0.02616662159562111, "grad_norm_var": 5.917409105079634e-07, "learning_rate": 0.005724382466474991, "loss": 2.6527, "step": 12817 }, { "crossentropy": 2.580343246459961, "epoch": 0.46468967517401394, "grad_norm": 0.026357147842645645, "grad_norm_var": 5.698169423436091e-07, "learning_rate": 0.005723807504486082, "loss": 2.5627, "step": 12818 }, { "crossentropy": 2.3775665760040283, "epoch": 0.46472592807424595, "grad_norm": 0.02716909907758236, "grad_norm_var": 5.48046680686846e-07, "learning_rate": 0.005723232532721056, "loss": 2.4242, "step": 12819 }, { "crossentropy": 2.5842106342315674, "epoch": 0.46476218097447797, "grad_norm": 0.026921361684799194, "grad_norm_var": 5.173583185976895e-07, "learning_rate": 0.005722657551187674, "loss": 2.5666, "step": 12820 }, { "crossentropy": 2.566460132598877, "epoch": 0.46479843387471, "grad_norm": 0.026253338903188705, "grad_norm_var": 4.382007932239469e-07, "learning_rate": 0.005722082559893707, "loss": 2.5874, "step": 12821 }, { "crossentropy": 2.4997975826263428, "epoch": 0.464834686774942, "grad_norm": 0.02850368060171604, "grad_norm_var": 6.049223603929079e-07, "learning_rate": 0.00572150755884692, "loss": 2.5132, "step": 12822 }, { "crossentropy": 2.584744691848755, "epoch": 0.464870939675174, "grad_norm": 0.02841845527291298, "grad_norm_var": 6.602035402639237e-07, "learning_rate": 0.005720932548055078, "loss": 2.6323, "step": 12823 }, { "crossentropy": 2.781795024871826, "epoch": 0.464907192575406, "grad_norm": 0.027952611446380615, "grad_norm_var": 6.578796508413618e-07, "learning_rate": 0.005720357527525945, "loss": 2.7113, "step": 12824 }, { "crossentropy": 2.633416175842285, "epoch": 0.46494344547563804, "grad_norm": 0.02635599859058857, "grad_norm_var": 6.741387686533118e-07, "learning_rate": 0.005719782497267294, "loss": 2.5291, "step": 12825 }, { "crossentropy": 2.7438206672668457, "epoch": 0.46497969837587005, "grad_norm": 0.026142587885260582, "grad_norm_var": 7.050943978487012e-07, "learning_rate": 0.005719207457286887, "loss": 2.6517, "step": 12826 }, { "crossentropy": 2.6432762145996094, "epoch": 0.46501595127610207, "grad_norm": 0.02721995860338211, "grad_norm_var": 6.830502203555277e-07, "learning_rate": 0.005718632407592492, "loss": 2.5062, "step": 12827 }, { "crossentropy": 2.5977585315704346, "epoch": 0.4650522041763341, "grad_norm": 0.02719085104763508, "grad_norm_var": 6.777282075671181e-07, "learning_rate": 0.005718057348191874, "loss": 2.5435, "step": 12828 }, { "crossentropy": 2.491466522216797, "epoch": 0.46508845707656615, "grad_norm": 0.02723085694015026, "grad_norm_var": 6.486593814385965e-07, "learning_rate": 0.005717482279092804, "loss": 2.5714, "step": 12829 }, { "crossentropy": 2.516530990600586, "epoch": 0.46512470997679817, "grad_norm": 0.02835962362587452, "grad_norm_var": 7.737203878732755e-07, "learning_rate": 0.005716907200303046, "loss": 2.6118, "step": 12830 }, { "crossentropy": 2.6237754821777344, "epoch": 0.4651609628770302, "grad_norm": 0.026503970846533775, "grad_norm_var": 7.628974722071261e-07, "learning_rate": 0.00571633211183037, "loss": 2.5782, "step": 12831 }, { "crossentropy": 2.3760788440704346, "epoch": 0.4651972157772622, "grad_norm": 0.029208088293671608, "grad_norm_var": 9.441074339860591e-07, "learning_rate": 0.005715757013682539, "loss": 2.4879, "step": 12832 }, { "crossentropy": 2.6202778816223145, "epoch": 0.4652334686774942, "grad_norm": 0.02741541527211666, "grad_norm_var": 8.616625525624417e-07, "learning_rate": 0.0057151819058673245, "loss": 2.579, "step": 12833 }, { "crossentropy": 2.6064095497131348, "epoch": 0.4652697215777262, "grad_norm": 0.026987945660948753, "grad_norm_var": 8.051131218894062e-07, "learning_rate": 0.005714606788392493, "loss": 2.5582, "step": 12834 }, { "crossentropy": 2.484551191329956, "epoch": 0.46530597447795824, "grad_norm": 0.028231272473931313, "grad_norm_var": 8.479367840693996e-07, "learning_rate": 0.005714031661265813, "loss": 2.583, "step": 12835 }, { "crossentropy": 2.671588182449341, "epoch": 0.46534222737819025, "grad_norm": 0.028063271194696426, "grad_norm_var": 8.518391754316525e-07, "learning_rate": 0.005713456524495051, "loss": 2.639, "step": 12836 }, { "crossentropy": 2.5808746814727783, "epoch": 0.46537848027842227, "grad_norm": 0.027277257293462753, "grad_norm_var": 7.468438849775877e-07, "learning_rate": 0.005712881378087977, "loss": 2.6041, "step": 12837 }, { "crossentropy": 2.5554397106170654, "epoch": 0.4654147331786543, "grad_norm": 0.02651970647275448, "grad_norm_var": 7.449055652183756e-07, "learning_rate": 0.005712306222052357, "loss": 2.6084, "step": 12838 }, { "crossentropy": 2.7445335388183594, "epoch": 0.4654509860788863, "grad_norm": 0.02681903727352619, "grad_norm_var": 6.966327515169531e-07, "learning_rate": 0.005711731056395962, "loss": 2.685, "step": 12839 }, { "crossentropy": 2.4449195861816406, "epoch": 0.4654872389791183, "grad_norm": 0.02619699202477932, "grad_norm_var": 7.46431122312568e-07, "learning_rate": 0.005711155881126557, "loss": 2.5097, "step": 12840 }, { "crossentropy": 2.7359249591827393, "epoch": 0.4655234918793503, "grad_norm": 0.02747778594493866, "grad_norm_var": 6.93955303769124e-07, "learning_rate": 0.005710580696251914, "loss": 2.6864, "step": 12841 }, { "crossentropy": 2.5262277126312256, "epoch": 0.46555974477958234, "grad_norm": 0.02770649828016758, "grad_norm_var": 6.048920830103536e-07, "learning_rate": 0.005710005501779801, "loss": 2.5317, "step": 12842 }, { "crossentropy": 2.619351863861084, "epoch": 0.4655959976798144, "grad_norm": 0.02731955796480179, "grad_norm_var": 6.031140697515073e-07, "learning_rate": 0.005709430297717985, "loss": 2.5943, "step": 12843 }, { "crossentropy": 2.448911428451538, "epoch": 0.4656322505800464, "grad_norm": 0.025680560618638992, "grad_norm_var": 7.891528035064056e-07, "learning_rate": 0.005708855084074236, "loss": 2.4803, "step": 12844 }, { "crossentropy": 2.6070263385772705, "epoch": 0.46566850348027844, "grad_norm": 0.02656361274421215, "grad_norm_var": 8.242301678903676e-07, "learning_rate": 0.005708279860856324, "loss": 2.5464, "step": 12845 }, { "crossentropy": 2.683633804321289, "epoch": 0.46570475638051045, "grad_norm": 0.027622830122709274, "grad_norm_var": 7.511805871837987e-07, "learning_rate": 0.005707704628072017, "loss": 2.608, "step": 12846 }, { "crossentropy": 2.6888256072998047, "epoch": 0.46574100928074247, "grad_norm": 0.03194747865200043, "grad_norm_var": 2.080124089281566e-06, "learning_rate": 0.005707129385729085, "loss": 2.6246, "step": 12847 }, { "crossentropy": 2.558166027069092, "epoch": 0.4657772621809745, "grad_norm": 0.028696056455373764, "grad_norm_var": 1.984323514242492e-06, "learning_rate": 0.005706554133835298, "loss": 2.54, "step": 12848 }, { "crossentropy": 2.6102771759033203, "epoch": 0.4658135150812065, "grad_norm": 0.029113424941897392, "grad_norm_var": 2.137942992829165e-06, "learning_rate": 0.005705978872398425, "loss": 2.6463, "step": 12849 }, { "crossentropy": 2.7375340461730957, "epoch": 0.4658497679814385, "grad_norm": 0.027660978958010674, "grad_norm_var": 2.1078336827277482e-06, "learning_rate": 0.0057054036014262365, "loss": 2.6504, "step": 12850 }, { "crossentropy": 2.5697243213653564, "epoch": 0.4658860208816705, "grad_norm": 0.026685180142521858, "grad_norm_var": 2.1438016438434884e-06, "learning_rate": 0.005704828320926502, "loss": 2.594, "step": 12851 }, { "crossentropy": 2.607184886932373, "epoch": 0.46592227378190254, "grad_norm": 0.026994943618774414, "grad_norm_var": 2.1469206866399245e-06, "learning_rate": 0.00570425303090699, "loss": 2.5841, "step": 12852 }, { "crossentropy": 2.597475528717041, "epoch": 0.46595852668213456, "grad_norm": 0.026418687775731087, "grad_norm_var": 2.2205076511247104e-06, "learning_rate": 0.005703677731375473, "loss": 2.47, "step": 12853 }, { "crossentropy": 2.481701374053955, "epoch": 0.46599477958236657, "grad_norm": 0.02679724618792534, "grad_norm_var": 2.1903796035819638e-06, "learning_rate": 0.005703102422339721, "loss": 2.5064, "step": 12854 }, { "crossentropy": 2.5498838424682617, "epoch": 0.4660310324825986, "grad_norm": 0.02699931152164936, "grad_norm_var": 2.176492149132588e-06, "learning_rate": 0.005702527103807502, "loss": 2.5146, "step": 12855 }, { "crossentropy": 2.69339919090271, "epoch": 0.46606728538283065, "grad_norm": 0.025828145444393158, "grad_norm_var": 2.2487111505042757e-06, "learning_rate": 0.005701951775786591, "loss": 2.5729, "step": 12856 }, { "crossentropy": 2.4645133018493652, "epoch": 0.46610353828306267, "grad_norm": 0.027356287464499474, "grad_norm_var": 2.2494998405340103e-06, "learning_rate": 0.005701376438284753, "loss": 2.5039, "step": 12857 }, { "crossentropy": 2.551140308380127, "epoch": 0.4661397911832947, "grad_norm": 0.02762432210147381, "grad_norm_var": 2.247242152573989e-06, "learning_rate": 0.005700801091309764, "loss": 2.5804, "step": 12858 }, { "crossentropy": 2.5093772411346436, "epoch": 0.4661760440835267, "grad_norm": 0.026522206142544746, "grad_norm_var": 2.3015672999822467e-06, "learning_rate": 0.005700225734869392, "loss": 2.6138, "step": 12859 }, { "crossentropy": 2.5256259441375732, "epoch": 0.4662122969837587, "grad_norm": 0.026142453774809837, "grad_norm_var": 2.208580107875696e-06, "learning_rate": 0.005699650368971409, "loss": 2.5009, "step": 12860 }, { "crossentropy": 2.599245071411133, "epoch": 0.4662485498839907, "grad_norm": 0.026656290516257286, "grad_norm_var": 2.198338998704123e-06, "learning_rate": 0.0056990749936235855, "loss": 2.6036, "step": 12861 }, { "crossentropy": 2.674659252166748, "epoch": 0.46628480278422274, "grad_norm": 0.02866455912590027, "grad_norm_var": 2.2913341960121083e-06, "learning_rate": 0.005698499608833694, "loss": 2.651, "step": 12862 }, { "crossentropy": 2.5879063606262207, "epoch": 0.46632105568445475, "grad_norm": 0.027487188577651978, "grad_norm_var": 8.937799827185913e-07, "learning_rate": 0.005697924214609505, "loss": 2.5379, "step": 12863 }, { "crossentropy": 2.5999934673309326, "epoch": 0.46635730858468677, "grad_norm": 0.026979409158229828, "grad_norm_var": 7.41931574101129e-07, "learning_rate": 0.00569734881095879, "loss": 2.6136, "step": 12864 }, { "crossentropy": 2.8528244495391846, "epoch": 0.4663935614849188, "grad_norm": 0.03134319186210632, "grad_norm_var": 1.645124967455932e-06, "learning_rate": 0.005696773397889322, "loss": 2.7711, "step": 12865 }, { "crossentropy": 2.710742712020874, "epoch": 0.4664298143851508, "grad_norm": 0.026653997600078583, "grad_norm_var": 1.6546669485019312e-06, "learning_rate": 0.005696197975408871, "loss": 2.6711, "step": 12866 }, { "crossentropy": 2.611319065093994, "epoch": 0.4664660672853828, "grad_norm": 0.027931926771998405, "grad_norm_var": 1.6667194662133936e-06, "learning_rate": 0.005695622543525209, "loss": 2.6276, "step": 12867 }, { "crossentropy": 2.614686965942383, "epoch": 0.4665023201856148, "grad_norm": 0.027663487941026688, "grad_norm_var": 1.6696889878500243e-06, "learning_rate": 0.00569504710224611, "loss": 2.5942, "step": 12868 }, { "crossentropy": 2.641413927078247, "epoch": 0.46653857308584684, "grad_norm": 0.028419457376003265, "grad_norm_var": 1.680294141269166e-06, "learning_rate": 0.005694471651579346, "loss": 2.5697, "step": 12869 }, { "crossentropy": 2.6604394912719727, "epoch": 0.4665748259860789, "grad_norm": 0.02882581204175949, "grad_norm_var": 1.7631391029883687e-06, "learning_rate": 0.0056938961915326885, "loss": 2.6514, "step": 12870 }, { "crossentropy": 2.455913543701172, "epoch": 0.4666110788863109, "grad_norm": 0.02617642842233181, "grad_norm_var": 1.8679242573031573e-06, "learning_rate": 0.005693320722113909, "loss": 2.55, "step": 12871 }, { "crossentropy": 2.5671889781951904, "epoch": 0.46664733178654294, "grad_norm": 0.0262349471449852, "grad_norm_var": 1.7866526526487258e-06, "learning_rate": 0.005692745243330779, "loss": 2.5662, "step": 12872 }, { "crossentropy": 2.4438376426696777, "epoch": 0.46668358468677495, "grad_norm": 0.027663083747029305, "grad_norm_var": 1.7849131318632087e-06, "learning_rate": 0.005692169755191077, "loss": 2.4938, "step": 12873 }, { "crossentropy": 2.5523681640625, "epoch": 0.46671983758700697, "grad_norm": 0.02640608325600624, "grad_norm_var": 1.8675137866014727e-06, "learning_rate": 0.005691594257702569, "loss": 2.6077, "step": 12874 }, { "crossentropy": 2.5914416313171387, "epoch": 0.466756090487239, "grad_norm": 0.027836553752422333, "grad_norm_var": 1.8066417693389601e-06, "learning_rate": 0.005691018750873033, "loss": 2.5571, "step": 12875 }, { "crossentropy": 2.5485291481018066, "epoch": 0.466792343387471, "grad_norm": 0.02925744839012623, "grad_norm_var": 1.8210966148366013e-06, "learning_rate": 0.005690443234710237, "loss": 2.5702, "step": 12876 }, { "crossentropy": 2.560070276260376, "epoch": 0.466828596287703, "grad_norm": 0.02753453515470028, "grad_norm_var": 1.7397683609204993e-06, "learning_rate": 0.005689867709221962, "loss": 2.4534, "step": 12877 }, { "crossentropy": 2.624586820602417, "epoch": 0.466864849187935, "grad_norm": 0.026983249932527542, "grad_norm_var": 1.7265278164555007e-06, "learning_rate": 0.005689292174415974, "loss": 2.6291, "step": 12878 }, { "crossentropy": 2.567103862762451, "epoch": 0.46690110208816704, "grad_norm": 0.027317892760038376, "grad_norm_var": 1.733400524804686e-06, "learning_rate": 0.005688716630300049, "loss": 2.623, "step": 12879 }, { "crossentropy": 2.651193380355835, "epoch": 0.46693735498839906, "grad_norm": 0.027756299823522568, "grad_norm_var": 1.6963021906257383e-06, "learning_rate": 0.005688141076881961, "loss": 2.6094, "step": 12880 }, { "crossentropy": 2.6444146633148193, "epoch": 0.46697360788863107, "grad_norm": 0.0367923378944397, "grad_norm_var": 6.162570725249635e-06, "learning_rate": 0.005687565514169485, "loss": 2.6032, "step": 12881 }, { "crossentropy": 2.6074371337890625, "epoch": 0.4670098607888631, "grad_norm": 0.027765924111008644, "grad_norm_var": 6.026821811185428e-06, "learning_rate": 0.005686989942170392, "loss": 2.5611, "step": 12882 }, { "crossentropy": 2.552820920944214, "epoch": 0.46704611368909515, "grad_norm": 0.026819558814167976, "grad_norm_var": 6.1380345042878296e-06, "learning_rate": 0.005686414360892456, "loss": 2.5495, "step": 12883 }, { "crossentropy": 2.586740255355835, "epoch": 0.46708236658932717, "grad_norm": 0.025777475908398628, "grad_norm_var": 6.4678097395081164e-06, "learning_rate": 0.0056858387703434534, "loss": 2.6723, "step": 12884 }, { "crossentropy": 2.6204733848571777, "epoch": 0.4671186194895592, "grad_norm": 0.027321744710206985, "grad_norm_var": 6.477767964758888e-06, "learning_rate": 0.005685263170531159, "loss": 2.5482, "step": 12885 }, { "crossentropy": 2.547368288040161, "epoch": 0.4671548723897912, "grad_norm": 0.02585856430232525, "grad_norm_var": 6.663486547839326e-06, "learning_rate": 0.0056846875614633445, "loss": 2.5287, "step": 12886 }, { "crossentropy": 2.563229560852051, "epoch": 0.4671911252900232, "grad_norm": 0.026868324726819992, "grad_norm_var": 6.551110761849341e-06, "learning_rate": 0.005684111943147785, "loss": 2.5964, "step": 12887 }, { "crossentropy": 2.6795737743377686, "epoch": 0.4672273781902552, "grad_norm": 0.0268208310008049, "grad_norm_var": 6.453264539165458e-06, "learning_rate": 0.005683536315592255, "loss": 2.5361, "step": 12888 }, { "crossentropy": 2.6137032508850098, "epoch": 0.46726363109048724, "grad_norm": 0.39163362979888916, "grad_norm_var": 0.008279529639610825, "learning_rate": 0.00568296067880453, "loss": 2.5495, "step": 12889 }, { "crossentropy": 2.63838529586792, "epoch": 0.46729988399071926, "grad_norm": 0.035526592284440994, "grad_norm_var": 0.008255371744180654, "learning_rate": 0.005682385032792385, "loss": 2.6192, "step": 12890 }, { "crossentropy": 2.422231674194336, "epoch": 0.46733613689095127, "grad_norm": 0.0280720554292202, "grad_norm_var": 0.008254644201312996, "learning_rate": 0.005681809377563596, "loss": 2.4933, "step": 12891 }, { "crossentropy": 2.499095916748047, "epoch": 0.4673723897911833, "grad_norm": 0.02770133502781391, "grad_norm_var": 0.008259334043528209, "learning_rate": 0.005681233713125934, "loss": 2.5872, "step": 12892 }, { "crossentropy": 2.5607614517211914, "epoch": 0.4674086426914153, "grad_norm": 0.027305474504828453, "grad_norm_var": 0.008260055041976586, "learning_rate": 0.005680658039487177, "loss": 2.552, "step": 12893 }, { "crossentropy": 2.659801721572876, "epoch": 0.4674448955916473, "grad_norm": 0.027729196473956108, "grad_norm_var": 0.008257699127139093, "learning_rate": 0.005680082356655104, "loss": 2.5583, "step": 12894 }, { "crossentropy": 2.653608798980713, "epoch": 0.46748114849187933, "grad_norm": 0.029486333951354027, "grad_norm_var": 0.008251126624453355, "learning_rate": 0.005679506664637484, "loss": 2.7269, "step": 12895 }, { "crossentropy": 2.5290374755859375, "epoch": 0.46751740139211134, "grad_norm": 0.029035180807113647, "grad_norm_var": 0.008247230904936143, "learning_rate": 0.005678930963442094, "loss": 2.4069, "step": 12896 }, { "crossentropy": 2.499776601791382, "epoch": 0.4675536542923434, "grad_norm": 0.029385169968008995, "grad_norm_var": 0.008264970511204929, "learning_rate": 0.005678355253076713, "loss": 2.5859, "step": 12897 }, { "crossentropy": 2.5435492992401123, "epoch": 0.4675899071925754, "grad_norm": 0.029751980677247047, "grad_norm_var": 0.008259112352995979, "learning_rate": 0.0056777795335491155, "loss": 2.5163, "step": 12898 }, { "crossentropy": 2.5228688716888428, "epoch": 0.46762616009280744, "grad_norm": 0.02961556985974312, "grad_norm_var": 0.008250607576506815, "learning_rate": 0.0056772038048670745, "loss": 2.544, "step": 12899 }, { "crossentropy": 2.5228567123413086, "epoch": 0.46766241299303946, "grad_norm": 0.02818242646753788, "grad_norm_var": 0.008242843339613079, "learning_rate": 0.00567662806703837, "loss": 2.6029, "step": 12900 }, { "crossentropy": 2.729595422744751, "epoch": 0.46769866589327147, "grad_norm": 0.027047617360949516, "grad_norm_var": 0.008243723294000383, "learning_rate": 0.0056760523200707754, "loss": 2.6644, "step": 12901 }, { "crossentropy": 2.5477495193481445, "epoch": 0.4677349187935035, "grad_norm": 0.028060518205165863, "grad_norm_var": 0.008236571189925837, "learning_rate": 0.005675476563972071, "loss": 2.6012, "step": 12902 }, { "crossentropy": 2.5732531547546387, "epoch": 0.4677711716937355, "grad_norm": 0.03159419447183609, "grad_norm_var": 0.008222516255690848, "learning_rate": 0.005674900798750028, "loss": 2.5716, "step": 12903 }, { "crossentropy": 2.6443798542022705, "epoch": 0.4678074245939675, "grad_norm": 0.03165268898010254, "grad_norm_var": 0.008207957228367311, "learning_rate": 0.005674325024412425, "loss": 2.628, "step": 12904 }, { "crossentropy": 2.624110460281372, "epoch": 0.46784367749419953, "grad_norm": 0.03210573270916939, "grad_norm_var": 5.0319922718699535e-06, "learning_rate": 0.005673749240967041, "loss": 2.7005, "step": 12905 }, { "crossentropy": 2.5114171504974365, "epoch": 0.46787993039443154, "grad_norm": 0.02813197858631611, "grad_norm_var": 2.523135003052928e-06, "learning_rate": 0.0056731734484216515, "loss": 2.4941, "step": 12906 }, { "crossentropy": 2.4500820636749268, "epoch": 0.46791618329466356, "grad_norm": 0.028181105852127075, "grad_norm_var": 2.5096066724681108e-06, "learning_rate": 0.005672597646784031, "loss": 2.531, "step": 12907 }, { "crossentropy": 2.6063919067382812, "epoch": 0.46795243619489557, "grad_norm": 0.028718115761876106, "grad_norm_var": 2.389971506059245e-06, "learning_rate": 0.005672021836061959, "loss": 2.6426, "step": 12908 }, { "crossentropy": 2.7265379428863525, "epoch": 0.4679886890951276, "grad_norm": 0.027147674933075905, "grad_norm_var": 2.429788531993941e-06, "learning_rate": 0.005671446016263213, "loss": 2.7347, "step": 12909 }, { "crossentropy": 2.5838019847869873, "epoch": 0.46802494199535966, "grad_norm": 0.03304673731327057, "grad_norm_var": 3.2151545200757742e-06, "learning_rate": 0.005670870187395569, "loss": 2.5867, "step": 12910 }, { "crossentropy": 2.6536738872528076, "epoch": 0.46806119489559167, "grad_norm": 0.029398048296570778, "grad_norm_var": 3.215172048526051e-06, "learning_rate": 0.005670294349466807, "loss": 2.6649, "step": 12911 }, { "crossentropy": 2.5959222316741943, "epoch": 0.4680974477958237, "grad_norm": 0.03168286755681038, "grad_norm_var": 3.51007588403057e-06, "learning_rate": 0.0056697185024846995, "loss": 2.6475, "step": 12912 }, { "crossentropy": 2.5258798599243164, "epoch": 0.4681337006960557, "grad_norm": 0.029033636674284935, "grad_norm_var": 3.528168737904474e-06, "learning_rate": 0.00566914264645703, "loss": 2.4955, "step": 12913 }, { "crossentropy": 2.477036714553833, "epoch": 0.4681699535962877, "grad_norm": 0.0339677669107914, "grad_norm_var": 4.733152649463593e-06, "learning_rate": 0.005668566781391571, "loss": 2.5376, "step": 12914 }, { "crossentropy": 2.4920506477355957, "epoch": 0.46820620649651973, "grad_norm": 0.028034187853336334, "grad_norm_var": 4.938441431309133e-06, "learning_rate": 0.005667990907296106, "loss": 2.601, "step": 12915 }, { "crossentropy": 2.641496181488037, "epoch": 0.46824245939675174, "grad_norm": 0.02839503064751625, "grad_norm_var": 4.896856154510259e-06, "learning_rate": 0.005667415024178407, "loss": 2.5613, "step": 12916 }, { "crossentropy": 2.65193247795105, "epoch": 0.46827871229698376, "grad_norm": 0.028787635266780853, "grad_norm_var": 4.456256214829798e-06, "learning_rate": 0.0056668391320462575, "loss": 2.5994, "step": 12917 }, { "crossentropy": 2.7384607791900635, "epoch": 0.46831496519721577, "grad_norm": 0.030469560995697975, "grad_norm_var": 4.237398535645217e-06, "learning_rate": 0.0056662632309074324, "loss": 2.7133, "step": 12918 }, { "crossentropy": 2.6832687854766846, "epoch": 0.4683512180974478, "grad_norm": 0.02798069827258587, "grad_norm_var": 4.29584902748753e-06, "learning_rate": 0.005665687320769714, "loss": 2.6205, "step": 12919 }, { "crossentropy": 2.523420810699463, "epoch": 0.4683874709976798, "grad_norm": 0.026482008397579193, "grad_norm_var": 4.686689884025397e-06, "learning_rate": 0.005665111401640873, "loss": 2.5795, "step": 12920 }, { "crossentropy": 2.509186029434204, "epoch": 0.4684237238979118, "grad_norm": 0.03020607866346836, "grad_norm_var": 4.245312675534886e-06, "learning_rate": 0.005664535473528697, "loss": 2.5862, "step": 12921 }, { "crossentropy": 2.5241198539733887, "epoch": 0.46845997679814383, "grad_norm": 0.027912365272641182, "grad_norm_var": 4.28410841870937e-06, "learning_rate": 0.00566395953644096, "loss": 2.5243, "step": 12922 }, { "crossentropy": 2.587141990661621, "epoch": 0.46849622969837584, "grad_norm": 0.033049896359443665, "grad_norm_var": 5.0132140074866825e-06, "learning_rate": 0.005663383590385442, "loss": 2.6106, "step": 12923 }, { "crossentropy": 2.5154147148132324, "epoch": 0.4685324825986079, "grad_norm": 0.027376368641853333, "grad_norm_var": 5.291465082302609e-06, "learning_rate": 0.005662807635369921, "loss": 2.5907, "step": 12924 }, { "crossentropy": 2.591553211212158, "epoch": 0.46856873549883993, "grad_norm": 0.028971850872039795, "grad_norm_var": 4.9125465937163255e-06, "learning_rate": 0.005662231671402178, "loss": 2.5551, "step": 12925 }, { "crossentropy": 2.6711955070495605, "epoch": 0.46860498839907194, "grad_norm": 0.030159447342157364, "grad_norm_var": 4.135423253025644e-06, "learning_rate": 0.005661655698489991, "loss": 2.5863, "step": 12926 }, { "crossentropy": 2.474423885345459, "epoch": 0.46864124129930396, "grad_norm": 0.028654543682932854, "grad_norm_var": 4.179506632168614e-06, "learning_rate": 0.005661079716641141, "loss": 2.4498, "step": 12927 }, { "crossentropy": 2.4295172691345215, "epoch": 0.46867749419953597, "grad_norm": 0.03182702139019966, "grad_norm_var": 4.223765572481294e-06, "learning_rate": 0.005660503725863404, "loss": 2.5931, "step": 12928 }, { "crossentropy": 2.581578254699707, "epoch": 0.468713747099768, "grad_norm": 0.028461871668696404, "grad_norm_var": 4.276454422907222e-06, "learning_rate": 0.005659927726164563, "loss": 2.5604, "step": 12929 }, { "crossentropy": 2.467844247817993, "epoch": 0.46875, "grad_norm": 0.026534100994467735, "grad_norm_var": 3.2236339852661215e-06, "learning_rate": 0.005659351717552397, "loss": 2.4423, "step": 12930 }, { "crossentropy": 2.624178409576416, "epoch": 0.468786252900232, "grad_norm": 0.06352225691080093, "grad_norm_var": 7.757257511801842e-05, "learning_rate": 0.005658775700034684, "loss": 2.6559, "step": 12931 }, { "crossentropy": 2.729771137237549, "epoch": 0.46882250580046403, "grad_norm": 0.028770409524440765, "grad_norm_var": 7.744227202741564e-05, "learning_rate": 0.0056581996736192065, "loss": 2.6932, "step": 12932 }, { "crossentropy": 2.5831546783447266, "epoch": 0.46885875870069604, "grad_norm": 0.028377428650856018, "grad_norm_var": 7.758461543099144e-05, "learning_rate": 0.0056576236383137435, "loss": 2.577, "step": 12933 }, { "crossentropy": 2.6919827461242676, "epoch": 0.46889501160092806, "grad_norm": 0.026999063789844513, "grad_norm_var": 7.866254205181976e-05, "learning_rate": 0.005657047594126075, "loss": 2.6169, "step": 12934 }, { "crossentropy": 2.4787182807922363, "epoch": 0.46893126450116007, "grad_norm": 0.02732655592262745, "grad_norm_var": 7.89487309963396e-05, "learning_rate": 0.005656471541063981, "loss": 2.5621, "step": 12935 }, { "crossentropy": 2.500124216079712, "epoch": 0.4689675174013921, "grad_norm": 0.028754858300089836, "grad_norm_var": 7.792835852906548e-05, "learning_rate": 0.005655895479135245, "loss": 2.5116, "step": 12936 }, { "crossentropy": 2.552875280380249, "epoch": 0.46900377030162416, "grad_norm": 0.027220644056797028, "grad_norm_var": 7.882392967310566e-05, "learning_rate": 0.005655319408347642, "loss": 2.5215, "step": 12937 }, { "crossentropy": 2.595736026763916, "epoch": 0.46904002320185617, "grad_norm": 0.03229100629687309, "grad_norm_var": 7.829553602410135e-05, "learning_rate": 0.005654743328708958, "loss": 2.5581, "step": 12938 }, { "crossentropy": 2.602764368057251, "epoch": 0.4690762761020882, "grad_norm": 0.02721918560564518, "grad_norm_var": 7.893833851878169e-05, "learning_rate": 0.005654167240226972, "loss": 2.5594, "step": 12939 }, { "crossentropy": 2.565850019454956, "epoch": 0.4691125290023202, "grad_norm": 0.027882201597094536, "grad_norm_var": 7.872483077762577e-05, "learning_rate": 0.005653591142909463, "loss": 2.5582, "step": 12940 }, { "crossentropy": 2.6190779209136963, "epoch": 0.4691487819025522, "grad_norm": 0.02773924544453621, "grad_norm_var": 7.912201089654128e-05, "learning_rate": 0.005653015036764215, "loss": 2.5916, "step": 12941 }, { "crossentropy": 2.589597702026367, "epoch": 0.46918503480278423, "grad_norm": 0.0289910975843668, "grad_norm_var": 7.929678927198643e-05, "learning_rate": 0.005652438921799008, "loss": 2.5369, "step": 12942 }, { "crossentropy": 2.534604549407959, "epoch": 0.46922128770301624, "grad_norm": 0.02717066928744316, "grad_norm_var": 7.983132844256087e-05, "learning_rate": 0.0056518627980216216, "loss": 2.6067, "step": 12943 }, { "crossentropy": 2.6073431968688965, "epoch": 0.46925754060324826, "grad_norm": 0.027906080707907677, "grad_norm_var": 8.013397049350182e-05, "learning_rate": 0.0056512866654398405, "loss": 2.6156, "step": 12944 }, { "crossentropy": 2.6352274417877197, "epoch": 0.46929379350348027, "grad_norm": 0.027814986184239388, "grad_norm_var": 8.032064208021757e-05, "learning_rate": 0.005650710524061444, "loss": 2.5621, "step": 12945 }, { "crossentropy": 2.513340473175049, "epoch": 0.4693300464037123, "grad_norm": 0.02925802953541279, "grad_norm_var": 7.942300069159076e-05, "learning_rate": 0.005650134373894215, "loss": 2.4887, "step": 12946 }, { "crossentropy": 2.63946795463562, "epoch": 0.4693662993039443, "grad_norm": 0.03154737874865532, "grad_norm_var": 2.3366841021355347e-06, "learning_rate": 0.005649558214945934, "loss": 2.6113, "step": 12947 }, { "crossentropy": 2.510969638824463, "epoch": 0.4694025522041763, "grad_norm": 0.02779674157500267, "grad_norm_var": 2.354898171467538e-06, "learning_rate": 0.0056489820472243844, "loss": 2.5814, "step": 12948 }, { "crossentropy": 2.714602470397949, "epoch": 0.46943880510440833, "grad_norm": 0.028648212552070618, "grad_norm_var": 2.358902533694529e-06, "learning_rate": 0.005648405870737346, "loss": 2.5921, "step": 12949 }, { "crossentropy": 2.531569242477417, "epoch": 0.46947505800464034, "grad_norm": 0.02693282999098301, "grad_norm_var": 2.3716402260317518e-06, "learning_rate": 0.005647829685492604, "loss": 2.5639, "step": 12950 }, { "crossentropy": 2.59142804145813, "epoch": 0.4695113109048724, "grad_norm": 0.026838459074497223, "grad_norm_var": 2.456795046063692e-06, "learning_rate": 0.0056472534914979375, "loss": 2.6062, "step": 12951 }, { "crossentropy": 2.4861862659454346, "epoch": 0.46954756380510443, "grad_norm": 0.028649767860770226, "grad_norm_var": 2.4521728813259313e-06, "learning_rate": 0.0056466772887611314, "loss": 2.6164, "step": 12952 }, { "crossentropy": 2.603541374206543, "epoch": 0.46958381670533644, "grad_norm": 0.0317700058221817, "grad_norm_var": 3.049048464316001e-06, "learning_rate": 0.005646101077289967, "loss": 2.6046, "step": 12953 }, { "crossentropy": 2.6342968940734863, "epoch": 0.46962006960556846, "grad_norm": 0.029544513672590256, "grad_norm_var": 2.188446256535387e-06, "learning_rate": 0.005645524857092227, "loss": 2.6444, "step": 12954 }, { "crossentropy": 2.554222583770752, "epoch": 0.46965632250580047, "grad_norm": 0.029716776683926582, "grad_norm_var": 2.157840307872213e-06, "learning_rate": 0.005644948628175693, "loss": 2.5638, "step": 12955 }, { "crossentropy": 2.496927499771118, "epoch": 0.4696925754060325, "grad_norm": 0.028507770970463753, "grad_norm_var": 2.119263527408597e-06, "learning_rate": 0.005644372390548151, "loss": 2.5196, "step": 12956 }, { "crossentropy": 2.8067023754119873, "epoch": 0.4697288283062645, "grad_norm": 0.026829618960618973, "grad_norm_var": 2.284715770642318e-06, "learning_rate": 0.0056437961442173815, "loss": 2.7645, "step": 12957 }, { "crossentropy": 2.5790181159973145, "epoch": 0.4697650812064965, "grad_norm": 0.0276617631316185, "grad_norm_var": 2.329418920487394e-06, "learning_rate": 0.0056432198891911665, "loss": 2.5485, "step": 12958 }, { "crossentropy": 2.338052272796631, "epoch": 0.46980133410672853, "grad_norm": 0.027589160948991776, "grad_norm_var": 2.2641195437451128e-06, "learning_rate": 0.0056426436254772925, "loss": 2.406, "step": 12959 }, { "crossentropy": 2.524484395980835, "epoch": 0.46983758700696054, "grad_norm": 0.027916908264160156, "grad_norm_var": 2.26317812396324e-06, "learning_rate": 0.00564206735308354, "loss": 2.574, "step": 12960 }, { "crossentropy": 2.6451609134674072, "epoch": 0.46987383990719256, "grad_norm": 0.02742612548172474, "grad_norm_var": 2.3114603659529196e-06, "learning_rate": 0.005641491072017695, "loss": 2.6225, "step": 12961 }, { "crossentropy": 2.595773458480835, "epoch": 0.4699100928074246, "grad_norm": 0.027296295389533043, "grad_norm_var": 2.364077293008406e-06, "learning_rate": 0.005640914782287538, "loss": 2.6991, "step": 12962 }, { "crossentropy": 2.7530901432037354, "epoch": 0.4699463457076566, "grad_norm": 0.029326127842068672, "grad_norm_var": 1.7453413918911182e-06, "learning_rate": 0.005640338483900855, "loss": 2.633, "step": 12963 }, { "crossentropy": 2.5738086700439453, "epoch": 0.46998259860788866, "grad_norm": 0.027177680283784866, "grad_norm_var": 1.8090333720289225e-06, "learning_rate": 0.005639762176865427, "loss": 2.5953, "step": 12964 }, { "crossentropy": 2.589210033416748, "epoch": 0.47001885150812067, "grad_norm": 0.029010316357016563, "grad_norm_var": 1.8369611181539545e-06, "learning_rate": 0.005639185861189043, "loss": 2.5381, "step": 12965 }, { "crossentropy": 2.409769296646118, "epoch": 0.4700551044083527, "grad_norm": 0.03156648203730583, "grad_norm_var": 2.3576117563631984e-06, "learning_rate": 0.005638609536879483, "loss": 2.4869, "step": 12966 }, { "crossentropy": 2.448939085006714, "epoch": 0.4700913573085847, "grad_norm": 0.028779231011867523, "grad_norm_var": 2.149680070080923e-06, "learning_rate": 0.005638033203944531, "loss": 2.5046, "step": 12967 }, { "crossentropy": 2.518254518508911, "epoch": 0.4701276102088167, "grad_norm": 0.03161922097206116, "grad_norm_var": 2.691571593954843e-06, "learning_rate": 0.005637456862391973, "loss": 2.5715, "step": 12968 }, { "crossentropy": 2.4057252407073975, "epoch": 0.47016386310904873, "grad_norm": 0.02666698396205902, "grad_norm_var": 2.338211511062477e-06, "learning_rate": 0.005636880512229594, "loss": 2.4692, "step": 12969 }, { "crossentropy": 2.6076972484588623, "epoch": 0.47020011600928074, "grad_norm": 0.027791282162070274, "grad_norm_var": 2.2954325677832736e-06, "learning_rate": 0.005636304153465176, "loss": 2.6149, "step": 12970 }, { "crossentropy": 2.4790945053100586, "epoch": 0.47023636890951276, "grad_norm": 0.027456557378172874, "grad_norm_var": 2.2269660480750493e-06, "learning_rate": 0.005635727786106505, "loss": 2.5692, "step": 12971 }, { "crossentropy": 2.668638229370117, "epoch": 0.4702726218097448, "grad_norm": 0.028569970279932022, "grad_norm_var": 2.2290234484266536e-06, "learning_rate": 0.005635151410161363, "loss": 2.6441, "step": 12972 }, { "crossentropy": 2.756709575653076, "epoch": 0.4703088747099768, "grad_norm": 0.02896178513765335, "grad_norm_var": 2.0972098301231047e-06, "learning_rate": 0.005634575025637541, "loss": 2.7306, "step": 12973 }, { "crossentropy": 2.638333559036255, "epoch": 0.4703451276102088, "grad_norm": 0.031644225120544434, "grad_norm_var": 2.6826577739515998e-06, "learning_rate": 0.00563399863254282, "loss": 2.6139, "step": 12974 }, { "crossentropy": 2.57953143119812, "epoch": 0.4703813805104408, "grad_norm": 0.028117910027503967, "grad_norm_var": 2.623586987234953e-06, "learning_rate": 0.005633422230884984, "loss": 2.5935, "step": 12975 }, { "crossentropy": 2.630176067352295, "epoch": 0.47041763341067283, "grad_norm": 0.02734946459531784, "grad_norm_var": 2.703560592243424e-06, "learning_rate": 0.005632845820671818, "loss": 2.5732, "step": 12976 }, { "crossentropy": 2.6204726696014404, "epoch": 0.47045388631090485, "grad_norm": 0.030222168192267418, "grad_norm_var": 2.7275289693234316e-06, "learning_rate": 0.005632269401911111, "loss": 2.5877, "step": 12977 }, { "crossentropy": 2.573590040206909, "epoch": 0.4704901392111369, "grad_norm": 0.03371889516711235, "grad_norm_var": 3.977501964832333e-06, "learning_rate": 0.005631692974610647, "loss": 2.5965, "step": 12978 }, { "crossentropy": 2.650887966156006, "epoch": 0.47052639211136893, "grad_norm": 0.030004553496837616, "grad_norm_var": 4.013277256872718e-06, "learning_rate": 0.005631116538778209, "loss": 2.7638, "step": 12979 }, { "crossentropy": 2.570244550704956, "epoch": 0.47056264501160094, "grad_norm": 0.026666073128581047, "grad_norm_var": 4.173797818854856e-06, "learning_rate": 0.005630540094421584, "loss": 2.5578, "step": 12980 }, { "crossentropy": 2.484450340270996, "epoch": 0.47059889791183296, "grad_norm": 0.02717936411499977, "grad_norm_var": 4.444049408076449e-06, "learning_rate": 0.005629963641548559, "loss": 2.5942, "step": 12981 }, { "crossentropy": 2.773655891418457, "epoch": 0.470635150812065, "grad_norm": 0.026659924536943436, "grad_norm_var": 4.364302942578418e-06, "learning_rate": 0.005629387180166921, "loss": 2.6356, "step": 12982 }, { "crossentropy": 2.61035418510437, "epoch": 0.470671403712297, "grad_norm": 0.02771841734647751, "grad_norm_var": 4.442944733407811e-06, "learning_rate": 0.005628810710284451, "loss": 2.5372, "step": 12983 }, { "crossentropy": 2.724665403366089, "epoch": 0.470707656612529, "grad_norm": 0.03103790432214737, "grad_norm_var": 4.24335515965546e-06, "learning_rate": 0.005628234231908938, "loss": 2.6607, "step": 12984 }, { "crossentropy": 2.490297794342041, "epoch": 0.470743909512761, "grad_norm": 0.027520066127181053, "grad_norm_var": 4.05357552012626e-06, "learning_rate": 0.00562765774504817, "loss": 2.5503, "step": 12985 }, { "crossentropy": 2.630918264389038, "epoch": 0.47078016241299303, "grad_norm": 0.027055704966187477, "grad_norm_var": 4.185212426037809e-06, "learning_rate": 0.005627081249709932, "loss": 2.6033, "step": 12986 }, { "crossentropy": 2.4941275119781494, "epoch": 0.47081641531322505, "grad_norm": 0.02647755853831768, "grad_norm_var": 4.412997368481242e-06, "learning_rate": 0.00562650474590201, "loss": 2.5704, "step": 12987 }, { "crossentropy": 2.607745885848999, "epoch": 0.47085266821345706, "grad_norm": 0.028575127944350243, "grad_norm_var": 4.412922333999793e-06, "learning_rate": 0.005625928233632188, "loss": 2.6194, "step": 12988 }, { "crossentropy": 2.668942928314209, "epoch": 0.4708889211136891, "grad_norm": 0.027820443734526634, "grad_norm_var": 4.451734039164846e-06, "learning_rate": 0.005625351712908257, "loss": 2.6293, "step": 12989 }, { "crossentropy": 2.733381748199463, "epoch": 0.4709251740139211, "grad_norm": 0.027481449767947197, "grad_norm_var": 3.8509419920072325e-06, "learning_rate": 0.005624775183738004, "loss": 2.7313, "step": 12990 }, { "crossentropy": 2.4916255474090576, "epoch": 0.47096142691415316, "grad_norm": 0.02898986265063286, "grad_norm_var": 3.871441446361551e-06, "learning_rate": 0.005624198646129213, "loss": 2.4685, "step": 12991 }, { "crossentropy": 2.5929388999938965, "epoch": 0.4709976798143852, "grad_norm": 0.02800825424492359, "grad_norm_var": 3.8058664978219872e-06, "learning_rate": 0.00562362210008967, "loss": 2.63, "step": 12992 }, { "crossentropy": 2.6101152896881104, "epoch": 0.4710339327146172, "grad_norm": 0.027629626914858818, "grad_norm_var": 3.6119689699277834e-06, "learning_rate": 0.005623045545627166, "loss": 2.6824, "step": 12993 }, { "crossentropy": 2.6033966541290283, "epoch": 0.4710701856148492, "grad_norm": 0.027501123026013374, "grad_norm_var": 1.5224967658160223e-06, "learning_rate": 0.005622468982749485, "loss": 2.5984, "step": 12994 }, { "crossentropy": 2.376856803894043, "epoch": 0.4711064385150812, "grad_norm": 0.026629731059074402, "grad_norm_var": 1.285240201138032e-06, "learning_rate": 0.005621892411464418, "loss": 2.4268, "step": 12995 }, { "crossentropy": 2.4299957752227783, "epoch": 0.47114269141531323, "grad_norm": 0.026732448488473892, "grad_norm_var": 1.2765031864261316e-06, "learning_rate": 0.0056213158317797475, "loss": 2.4376, "step": 12996 }, { "crossentropy": 2.626176595687866, "epoch": 0.47117894431554525, "grad_norm": 0.026920530945062637, "grad_norm_var": 1.2982633572788342e-06, "learning_rate": 0.0056207392437032655, "loss": 2.537, "step": 12997 }, { "crossentropy": 2.656111478805542, "epoch": 0.47121519721577726, "grad_norm": 0.029761921614408493, "grad_norm_var": 1.480908843324311e-06, "learning_rate": 0.005620162647242758, "loss": 2.4687, "step": 12998 }, { "crossentropy": 2.6158828735351562, "epoch": 0.4712514501160093, "grad_norm": 0.027474161237478256, "grad_norm_var": 1.489452545383217e-06, "learning_rate": 0.005619586042406014, "loss": 2.5884, "step": 12999 }, { "crossentropy": 2.5948104858398438, "epoch": 0.4712877030162413, "grad_norm": 0.02738393284380436, "grad_norm_var": 7.712715313352529e-07, "learning_rate": 0.005619009429200818, "loss": 2.6258, "step": 13000 }, { "crossentropy": 2.7495651245117188, "epoch": 0.4713239559164733, "grad_norm": 0.029325293377041817, "grad_norm_var": 9.502646016086465e-07, "learning_rate": 0.00561843280763496, "loss": 2.661, "step": 13001 }, { "crossentropy": 2.3831114768981934, "epoch": 0.4713602088167053, "grad_norm": 0.03146844357252121, "grad_norm_var": 1.7673437434361038e-06, "learning_rate": 0.00561785617771623, "loss": 2.4059, "step": 13002 }, { "crossentropy": 2.4742393493652344, "epoch": 0.47139646171693733, "grad_norm": 0.02976522408425808, "grad_norm_var": 1.7705908131125515e-06, "learning_rate": 0.005617279539452415, "loss": 2.5213, "step": 13003 }, { "crossentropy": 2.505584478378296, "epoch": 0.47143271461716935, "grad_norm": 0.02600574679672718, "grad_norm_var": 2.060414587126908e-06, "learning_rate": 0.005616702892851302, "loss": 2.5845, "step": 13004 }, { "crossentropy": 2.646773338317871, "epoch": 0.4714689675174014, "grad_norm": 0.02824617177248001, "grad_norm_var": 2.058363520543449e-06, "learning_rate": 0.0056161262379206805, "loss": 2.6392, "step": 13005 }, { "crossentropy": 2.7164440155029297, "epoch": 0.47150522041763343, "grad_norm": 0.026684045791625977, "grad_norm_var": 2.162034383168055e-06, "learning_rate": 0.005615549574668339, "loss": 2.6091, "step": 13006 }, { "crossentropy": 2.5447633266448975, "epoch": 0.47154147331786544, "grad_norm": 0.026870518922805786, "grad_norm_var": 2.1723448838709225e-06, "learning_rate": 0.005614972903102068, "loss": 2.5417, "step": 13007 }, { "crossentropy": 2.503352165222168, "epoch": 0.47157772621809746, "grad_norm": 0.02888568304479122, "grad_norm_var": 2.2330747233337366e-06, "learning_rate": 0.005614396223229653, "loss": 2.5968, "step": 13008 }, { "crossentropy": 2.5551273822784424, "epoch": 0.4716139791183295, "grad_norm": 0.03060191124677658, "grad_norm_var": 2.656168490582957e-06, "learning_rate": 0.005613819535058885, "loss": 2.4949, "step": 13009 }, { "crossentropy": 2.6237144470214844, "epoch": 0.4716502320185615, "grad_norm": 0.026302063837647438, "grad_norm_var": 2.8483363441087277e-06, "learning_rate": 0.0056132428385975534, "loss": 2.6502, "step": 13010 }, { "crossentropy": 2.5871846675872803, "epoch": 0.4716864849187935, "grad_norm": 0.025889797136187553, "grad_norm_var": 3.0242657227280324e-06, "learning_rate": 0.005612666133853445, "loss": 2.6295, "step": 13011 }, { "crossentropy": 2.6438357830047607, "epoch": 0.4717227378190255, "grad_norm": 0.026557540521025658, "grad_norm_var": 3.056201772997219e-06, "learning_rate": 0.005612089420834352, "loss": 2.6753, "step": 13012 }, { "crossentropy": 2.5066754817962646, "epoch": 0.47175899071925753, "grad_norm": 0.025963328778743744, "grad_norm_var": 3.252376431426655e-06, "learning_rate": 0.005611512699548063, "loss": 2.5316, "step": 13013 }, { "crossentropy": 2.5733397006988525, "epoch": 0.47179524361948955, "grad_norm": 0.028688708320260048, "grad_norm_var": 3.0649588547465896e-06, "learning_rate": 0.005610935970002366, "loss": 2.6015, "step": 13014 }, { "crossentropy": 2.5823557376861572, "epoch": 0.47183149651972156, "grad_norm": 0.02799861878156662, "grad_norm_var": 3.0536281247459195e-06, "learning_rate": 0.005610359232205051, "loss": 2.5951, "step": 13015 }, { "crossentropy": 2.5880649089813232, "epoch": 0.4718677494199536, "grad_norm": 0.02808091789484024, "grad_norm_var": 3.0346543594884924e-06, "learning_rate": 0.00560978248616391, "loss": 2.5507, "step": 13016 }, { "crossentropy": 2.5084946155548096, "epoch": 0.4719040023201856, "grad_norm": 0.02622552216053009, "grad_norm_var": 3.0702398356169018e-06, "learning_rate": 0.00560920573188673, "loss": 2.5625, "step": 13017 }, { "crossentropy": 2.6428983211517334, "epoch": 0.47194025522041766, "grad_norm": 0.027683230116963387, "grad_norm_var": 2.096438360192169e-06, "learning_rate": 0.005608628969381302, "loss": 2.6019, "step": 13018 }, { "crossentropy": 2.6228296756744385, "epoch": 0.4719765081206497, "grad_norm": 0.026798684149980545, "grad_norm_var": 1.7615776405318124e-06, "learning_rate": 0.005608052198655416, "loss": 2.5381, "step": 13019 }, { "crossentropy": 2.7649943828582764, "epoch": 0.4720127610208817, "grad_norm": 0.028810862451791763, "grad_norm_var": 1.7533452546093323e-06, "learning_rate": 0.005607475419716864, "loss": 2.7159, "step": 13020 }, { "crossentropy": 2.4908883571624756, "epoch": 0.4720490139211137, "grad_norm": 0.027710797265172005, "grad_norm_var": 1.7192783279323565e-06, "learning_rate": 0.005606898632573433, "loss": 2.5699, "step": 13021 }, { "crossentropy": 2.631256103515625, "epoch": 0.4720852668213457, "grad_norm": 0.029353955760598183, "grad_norm_var": 1.8798473144763956e-06, "learning_rate": 0.005606321837232916, "loss": 2.5785, "step": 13022 }, { "crossentropy": 2.729674816131592, "epoch": 0.47212151972157773, "grad_norm": 0.027605239301919937, "grad_norm_var": 1.8370900490586225e-06, "learning_rate": 0.005605745033703102, "loss": 2.7031, "step": 13023 }, { "crossentropy": 2.648859739303589, "epoch": 0.47215777262180975, "grad_norm": 0.026528431102633476, "grad_norm_var": 1.810871938997168e-06, "learning_rate": 0.005605168221991781, "loss": 2.6279, "step": 13024 }, { "crossentropy": 2.5788519382476807, "epoch": 0.47219402552204176, "grad_norm": 0.02862853743135929, "grad_norm_var": 1.251245037515294e-06, "learning_rate": 0.005604591402106745, "loss": 2.5873, "step": 13025 }, { "crossentropy": 2.666527271270752, "epoch": 0.4722302784222738, "grad_norm": 0.029854390770196915, "grad_norm_var": 1.5072860580319624e-06, "learning_rate": 0.0056040145740557865, "loss": 2.6016, "step": 13026 }, { "crossentropy": 2.489025354385376, "epoch": 0.4722665313225058, "grad_norm": 0.025991789996623993, "grad_norm_var": 1.4840173547408698e-06, "learning_rate": 0.005603437737846693, "loss": 2.5463, "step": 13027 }, { "crossentropy": 2.526423931121826, "epoch": 0.4723027842227378, "grad_norm": 0.03117757849395275, "grad_norm_var": 2.1420023285108654e-06, "learning_rate": 0.0056028608934872584, "loss": 2.5946, "step": 13028 }, { "crossentropy": 2.5293831825256348, "epoch": 0.4723390371229698, "grad_norm": 0.028469620272517204, "grad_norm_var": 1.8727817988538227e-06, "learning_rate": 0.0056022840409852715, "loss": 2.5638, "step": 13029 }, { "crossentropy": 2.6082608699798584, "epoch": 0.47237529002320183, "grad_norm": 0.027705011889338493, "grad_norm_var": 1.8561021960903276e-06, "learning_rate": 0.005601707180348524, "loss": 2.6021, "step": 13030 }, { "crossentropy": 2.5040676593780518, "epoch": 0.47241154292343385, "grad_norm": 0.02985503152012825, "grad_norm_var": 2.0615112985122285e-06, "learning_rate": 0.0056011303115848085, "loss": 2.5269, "step": 13031 }, { "crossentropy": 2.6055703163146973, "epoch": 0.4724477958236659, "grad_norm": 0.027479318901896477, "grad_norm_var": 2.0900717430000303e-06, "learning_rate": 0.005600553434701916, "loss": 2.611, "step": 13032 }, { "crossentropy": 2.440516948699951, "epoch": 0.47248404872389793, "grad_norm": 0.026471685618162155, "grad_norm_var": 2.0317650129001435e-06, "learning_rate": 0.005599976549707638, "loss": 2.4753, "step": 13033 }, { "crossentropy": 2.538811206817627, "epoch": 0.47252030162412995, "grad_norm": 0.02601373754441738, "grad_norm_var": 2.3060303304577584e-06, "learning_rate": 0.005599399656609767, "loss": 2.5846, "step": 13034 }, { "crossentropy": 2.536686658859253, "epoch": 0.47255655452436196, "grad_norm": 0.02901843562722206, "grad_norm_var": 2.250026185819986e-06, "learning_rate": 0.005598822755416093, "loss": 2.6211, "step": 13035 }, { "crossentropy": 2.5732052326202393, "epoch": 0.472592807424594, "grad_norm": 0.029884565621614456, "grad_norm_var": 2.41423252425979e-06, "learning_rate": 0.005598245846134409, "loss": 2.5629, "step": 13036 }, { "crossentropy": 2.4359171390533447, "epoch": 0.472629060324826, "grad_norm": 0.029893914237618446, "grad_norm_var": 2.5597373872400373e-06, "learning_rate": 0.005597668928772508, "loss": 2.5202, "step": 13037 }, { "crossentropy": 2.7410101890563965, "epoch": 0.472665313225058, "grad_norm": 0.02685510739684105, "grad_norm_var": 2.6224025621250533e-06, "learning_rate": 0.00559709200333818, "loss": 2.6459, "step": 13038 }, { "crossentropy": 2.5277509689331055, "epoch": 0.47270156612529, "grad_norm": 0.027695851400494576, "grad_norm_var": 2.6155545707795864e-06, "learning_rate": 0.005596515069839218, "loss": 2.47, "step": 13039 }, { "crossentropy": 2.572763204574585, "epoch": 0.47273781902552203, "grad_norm": 0.02887486293911934, "grad_norm_var": 2.430384506095738e-06, "learning_rate": 0.005595938128283417, "loss": 2.5728, "step": 13040 }, { "crossentropy": 2.5470340251922607, "epoch": 0.47277407192575405, "grad_norm": 0.02725573442876339, "grad_norm_var": 2.500270054863786e-06, "learning_rate": 0.005595361178678564, "loss": 2.5312, "step": 13041 }, { "crossentropy": 2.687305450439453, "epoch": 0.47281032482598606, "grad_norm": 0.026684079319238663, "grad_norm_var": 2.4633813733488176e-06, "learning_rate": 0.005594784221032457, "loss": 2.6469, "step": 13042 }, { "crossentropy": 2.672793388366699, "epoch": 0.4728465777262181, "grad_norm": 0.02714451029896736, "grad_norm_var": 2.2250345203894684e-06, "learning_rate": 0.005594207255352886, "loss": 2.6491, "step": 13043 }, { "crossentropy": 2.7386512756347656, "epoch": 0.4728828306264501, "grad_norm": 0.02753518708050251, "grad_norm_var": 1.5862721386816192e-06, "learning_rate": 0.005593630281647644, "loss": 2.6443, "step": 13044 }, { "crossentropy": 2.441134214401245, "epoch": 0.47291908352668216, "grad_norm": 0.027767766267061234, "grad_norm_var": 1.566308104902408e-06, "learning_rate": 0.005593053299924524, "loss": 2.534, "step": 13045 }, { "crossentropy": 2.51727032661438, "epoch": 0.4729553364269142, "grad_norm": 0.029786139726638794, "grad_norm_var": 1.787494688365067e-06, "learning_rate": 0.00559247631019132, "loss": 2.5746, "step": 13046 }, { "crossentropy": 2.525604724884033, "epoch": 0.4729915893271462, "grad_norm": 0.029563741758465767, "grad_norm_var": 1.7212750509005737e-06, "learning_rate": 0.005591899312455823, "loss": 2.5469, "step": 13047 }, { "crossentropy": 2.4131617546081543, "epoch": 0.4730278422273782, "grad_norm": 0.02752750739455223, "grad_norm_var": 1.718105002216965e-06, "learning_rate": 0.005591322306725829, "loss": 2.4577, "step": 13048 }, { "crossentropy": 2.6277642250061035, "epoch": 0.4730640951276102, "grad_norm": 0.028605472296476364, "grad_norm_var": 1.5683406199990442e-06, "learning_rate": 0.005590745293009129, "loss": 2.6033, "step": 13049 }, { "crossentropy": 2.551081657409668, "epoch": 0.47310034802784223, "grad_norm": 0.027202308177947998, "grad_norm_var": 1.320993793156574e-06, "learning_rate": 0.005590168271313518, "loss": 2.6093, "step": 13050 }, { "crossentropy": 2.6454970836639404, "epoch": 0.47313660092807425, "grad_norm": 0.026397112756967545, "grad_norm_var": 1.4664801715552477e-06, "learning_rate": 0.005589591241646789, "loss": 2.6382, "step": 13051 }, { "crossentropy": 2.63335919380188, "epoch": 0.47317285382830626, "grad_norm": 0.028355440124869347, "grad_norm_var": 1.236974393225137e-06, "learning_rate": 0.005589014204016735, "loss": 2.5764, "step": 13052 }, { "crossentropy": 2.499631881713867, "epoch": 0.4732091067285383, "grad_norm": 0.028334129601716995, "grad_norm_var": 9.84035720653674e-07, "learning_rate": 0.005588437158431149, "loss": 2.5121, "step": 13053 }, { "crossentropy": 2.474954128265381, "epoch": 0.4732453596287703, "grad_norm": 0.028126206248998642, "grad_norm_var": 9.16561649502466e-07, "learning_rate": 0.005587860104897828, "loss": 2.4524, "step": 13054 }, { "crossentropy": 2.6527771949768066, "epoch": 0.4732816125290023, "grad_norm": 0.027022315189242363, "grad_norm_var": 9.6580808588025e-07, "learning_rate": 0.0055872830434245645, "loss": 2.6511, "step": 13055 }, { "crossentropy": 2.6075594425201416, "epoch": 0.4733178654292343, "grad_norm": 0.02832331880927086, "grad_norm_var": 9.121303751147598e-07, "learning_rate": 0.005586705974019152, "loss": 2.6612, "step": 13056 }, { "crossentropy": 2.640855550765991, "epoch": 0.47335411832946633, "grad_norm": 0.027529703453183174, "grad_norm_var": 8.950428107702546e-07, "learning_rate": 0.0055861288966893834, "loss": 2.5509, "step": 13057 }, { "crossentropy": 2.5702664852142334, "epoch": 0.4733903712296984, "grad_norm": 0.029030993580818176, "grad_norm_var": 8.684872327125442e-07, "learning_rate": 0.005585551811443056, "loss": 2.7429, "step": 13058 }, { "crossentropy": 2.5608983039855957, "epoch": 0.4734266241299304, "grad_norm": 0.026921197772026062, "grad_norm_var": 8.975449042395902e-07, "learning_rate": 0.0055849747182879625, "loss": 2.5963, "step": 13059 }, { "crossentropy": 2.589808940887451, "epoch": 0.47346287703016243, "grad_norm": 0.028474140912294388, "grad_norm_var": 8.942320152478811e-07, "learning_rate": 0.005584397617231898, "loss": 2.6458, "step": 13060 }, { "crossentropy": 2.722851514816284, "epoch": 0.47349912993039445, "grad_norm": 0.027166003361344337, "grad_norm_var": 9.403493972344786e-07, "learning_rate": 0.005583820508282655, "loss": 2.7518, "step": 13061 }, { "crossentropy": 2.586454153060913, "epoch": 0.47353538283062646, "grad_norm": 0.027511922642588615, "grad_norm_var": 7.289253594046248e-07, "learning_rate": 0.005583243391448033, "loss": 2.5488, "step": 13062 }, { "crossentropy": 2.4756481647491455, "epoch": 0.4735716357308585, "grad_norm": 0.02630958892405033, "grad_norm_var": 6.605282789862512e-07, "learning_rate": 0.005582666266735822, "loss": 2.4162, "step": 13063 }, { "crossentropy": 2.502357006072998, "epoch": 0.4736078886310905, "grad_norm": 0.027229124680161476, "grad_norm_var": 6.720535930313284e-07, "learning_rate": 0.005582089134153819, "loss": 2.5755, "step": 13064 }, { "crossentropy": 2.624330997467041, "epoch": 0.4736441415313225, "grad_norm": 0.027401333674788475, "grad_norm_var": 6.106672227269734e-07, "learning_rate": 0.005581511993709818, "loss": 2.5797, "step": 13065 }, { "crossentropy": 2.6462957859039307, "epoch": 0.4736803944315545, "grad_norm": 0.03447675704956055, "grad_norm_var": 3.5483598909729607e-06, "learning_rate": 0.005580934845411618, "loss": 2.6228, "step": 13066 }, { "crossentropy": 2.513399362564087, "epoch": 0.47371664733178653, "grad_norm": 0.031214721500873566, "grad_norm_var": 3.944872412194507e-06, "learning_rate": 0.005580357689267009, "loss": 2.6154, "step": 13067 }, { "crossentropy": 2.596696138381958, "epoch": 0.47375290023201855, "grad_norm": 0.031574372202157974, "grad_norm_var": 4.599445872798891e-06, "learning_rate": 0.005579780525283788, "loss": 2.5754, "step": 13068 }, { "crossentropy": 2.5863969326019287, "epoch": 0.47378915313225056, "grad_norm": 0.028552338480949402, "grad_norm_var": 4.596421517041741e-06, "learning_rate": 0.005579203353469753, "loss": 2.5987, "step": 13069 }, { "crossentropy": 2.6655771732330322, "epoch": 0.4738254060324826, "grad_norm": 0.027295943349599838, "grad_norm_var": 4.686862817434541e-06, "learning_rate": 0.005578626173832698, "loss": 2.6701, "step": 13070 }, { "crossentropy": 2.7308127880096436, "epoch": 0.4738616589327146, "grad_norm": 0.02870958112180233, "grad_norm_var": 4.53188409426281e-06, "learning_rate": 0.005578048986380417, "loss": 2.6285, "step": 13071 }, { "crossentropy": 2.634474277496338, "epoch": 0.47389791183294666, "grad_norm": 0.027861373499035835, "grad_norm_var": 4.562728684584285e-06, "learning_rate": 0.005577471791120708, "loss": 2.5986, "step": 13072 }, { "crossentropy": 2.612509250640869, "epoch": 0.4739341647331787, "grad_norm": 0.030256198719143867, "grad_norm_var": 4.645997520903028e-06, "learning_rate": 0.005576894588061364, "loss": 2.5726, "step": 13073 }, { "crossentropy": 2.6708874702453613, "epoch": 0.4739704176334107, "grad_norm": 0.03207824006676674, "grad_norm_var": 5.340887913921934e-06, "learning_rate": 0.005576317377210186, "loss": 2.5723, "step": 13074 }, { "crossentropy": 2.666118621826172, "epoch": 0.4740066705336427, "grad_norm": 0.0280122309923172, "grad_norm_var": 5.1216727510081455e-06, "learning_rate": 0.0055757401585749666, "loss": 2.6428, "step": 13075 }, { "crossentropy": 2.7042970657348633, "epoch": 0.4740429234338747, "grad_norm": 0.02767302468419075, "grad_norm_var": 5.2187813116258764e-06, "learning_rate": 0.005575162932163501, "loss": 2.6506, "step": 13076 }, { "crossentropy": 2.634559392929077, "epoch": 0.47407917633410673, "grad_norm": 0.025196444243192673, "grad_norm_var": 5.9317353622227125e-06, "learning_rate": 0.005574585697983588, "loss": 2.489, "step": 13077 }, { "crossentropy": 2.7305715084075928, "epoch": 0.47411542923433875, "grad_norm": 0.026809390634298325, "grad_norm_var": 6.086476371489937e-06, "learning_rate": 0.005574008456043024, "loss": 2.6883, "step": 13078 }, { "crossentropy": 2.5720114707946777, "epoch": 0.47415168213457076, "grad_norm": 0.026774004101753235, "grad_norm_var": 5.946323118811515e-06, "learning_rate": 0.005573431206349606, "loss": 2.4855, "step": 13079 }, { "crossentropy": 2.4523000717163086, "epoch": 0.4741879350348028, "grad_norm": 0.025756772607564926, "grad_norm_var": 6.394062005806757e-06, "learning_rate": 0.005572853948911126, "loss": 2.4263, "step": 13080 }, { "crossentropy": 2.6076982021331787, "epoch": 0.4742241879350348, "grad_norm": 0.026840582489967346, "grad_norm_var": 6.512880611830154e-06, "learning_rate": 0.005572276683735385, "loss": 2.5932, "step": 13081 }, { "crossentropy": 2.6705801486968994, "epoch": 0.4742604408352668, "grad_norm": 0.029130693525075912, "grad_norm_var": 4.176175915537048e-06, "learning_rate": 0.00557169941083018, "loss": 2.7059, "step": 13082 }, { "crossentropy": 2.539591073989868, "epoch": 0.4742966937354988, "grad_norm": 0.02716229297220707, "grad_norm_var": 3.6592745464774624e-06, "learning_rate": 0.005571122130203308, "loss": 2.5149, "step": 13083 }, { "crossentropy": 2.5645930767059326, "epoch": 0.47433294663573083, "grad_norm": 0.026205595582723618, "grad_norm_var": 2.9774109736854216e-06, "learning_rate": 0.0055705448418625625, "loss": 2.5983, "step": 13084 }, { "crossentropy": 2.5928571224212646, "epoch": 0.4743691995359629, "grad_norm": 0.027715979143977165, "grad_norm_var": 2.9338504967828105e-06, "learning_rate": 0.005569967545815744, "loss": 2.6036, "step": 13085 }, { "crossentropy": 2.6346161365509033, "epoch": 0.4744054524361949, "grad_norm": 0.0261969156563282, "grad_norm_var": 3.0711003890493504e-06, "learning_rate": 0.005569390242070649, "loss": 2.5613, "step": 13086 }, { "crossentropy": 2.5235073566436768, "epoch": 0.47444170533642693, "grad_norm": 0.02670934610068798, "grad_norm_var": 3.0382262722785608e-06, "learning_rate": 0.005568812930635075, "loss": 2.5348, "step": 13087 }, { "crossentropy": 2.660953998565674, "epoch": 0.47447795823665895, "grad_norm": 0.026810383424162865, "grad_norm_var": 3.0599426577028554e-06, "learning_rate": 0.005568235611516818, "loss": 2.5808, "step": 13088 }, { "crossentropy": 2.661522388458252, "epoch": 0.47451421113689096, "grad_norm": 0.02934512309730053, "grad_norm_var": 2.7719059677654e-06, "learning_rate": 0.005567658284723677, "loss": 2.6967, "step": 13089 }, { "crossentropy": 2.544959783554077, "epoch": 0.474550464037123, "grad_norm": 0.028159124776721, "grad_norm_var": 1.2878200577103847e-06, "learning_rate": 0.00556708095026345, "loss": 2.5217, "step": 13090 }, { "crossentropy": 2.659419298171997, "epoch": 0.474586716937355, "grad_norm": 0.026279764249920845, "grad_norm_var": 1.2776520475959862e-06, "learning_rate": 0.005566503608143936, "loss": 2.6286, "step": 13091 }, { "crossentropy": 2.674896001815796, "epoch": 0.474622969837587, "grad_norm": 0.026296036317944527, "grad_norm_var": 1.2813751382969561e-06, "learning_rate": 0.005565926258372928, "loss": 2.6466, "step": 13092 }, { "crossentropy": 2.4933016300201416, "epoch": 0.474659222737819, "grad_norm": 0.025898169726133347, "grad_norm_var": 1.1469806743264268e-06, "learning_rate": 0.005565348900958227, "loss": 2.4873, "step": 13093 }, { "crossentropy": 2.6098923683166504, "epoch": 0.47469547563805103, "grad_norm": 0.02611650712788105, "grad_norm_var": 1.1951161608326899e-06, "learning_rate": 0.005564771535907633, "loss": 2.6608, "step": 13094 }, { "crossentropy": 2.5044496059417725, "epoch": 0.47473172853828305, "grad_norm": 0.027580948546528816, "grad_norm_var": 1.2155510744395076e-06, "learning_rate": 0.005564194163228941, "loss": 2.5513, "step": 13095 }, { "crossentropy": 2.465182304382324, "epoch": 0.47476798143851506, "grad_norm": 0.02775469794869423, "grad_norm_var": 1.1304488856874716e-06, "learning_rate": 0.005563616782929952, "loss": 2.574, "step": 13096 }, { "crossentropy": 2.659274101257324, "epoch": 0.4748042343387471, "grad_norm": 0.027296746149659157, "grad_norm_var": 1.125386938675615e-06, "learning_rate": 0.005563039395018462, "loss": 2.5859, "step": 13097 }, { "crossentropy": 2.5992863178253174, "epoch": 0.4748404872389791, "grad_norm": 0.02691488340497017, "grad_norm_var": 8.518415573396336e-07, "learning_rate": 0.00556246199950227, "loss": 2.6163, "step": 13098 }, { "crossentropy": 2.456389904022217, "epoch": 0.47487674013921116, "grad_norm": 0.025964217260479927, "grad_norm_var": 9.200459559882776e-07, "learning_rate": 0.005561884596389177, "loss": 2.605, "step": 13099 }, { "crossentropy": 2.3348827362060547, "epoch": 0.4749129930394432, "grad_norm": 0.028705717995762825, "grad_norm_var": 1.0616364075570368e-06, "learning_rate": 0.005561307185686979, "loss": 2.4501, "step": 13100 }, { "crossentropy": 2.5604939460754395, "epoch": 0.4749492459396752, "grad_norm": 0.028559863567352295, "grad_norm_var": 1.174437303066049e-06, "learning_rate": 0.005560729767403475, "loss": 2.5682, "step": 13101 }, { "crossentropy": 2.722161293029785, "epoch": 0.4749854988399072, "grad_norm": 0.028256282210350037, "grad_norm_var": 1.174565229301279e-06, "learning_rate": 0.005560152341546465, "loss": 2.5881, "step": 13102 }, { "crossentropy": 2.4053540229797363, "epoch": 0.4750217517401392, "grad_norm": 0.024960506707429886, "grad_norm_var": 1.5012275838800307e-06, "learning_rate": 0.0055595749081237475, "loss": 2.4796, "step": 13103 }, { "crossentropy": 2.5451507568359375, "epoch": 0.47505800464037123, "grad_norm": 0.02634819597005844, "grad_norm_var": 1.5374293545806443e-06, "learning_rate": 0.005558997467143122, "loss": 2.5579, "step": 13104 }, { "crossentropy": 2.4613490104675293, "epoch": 0.47509425754060325, "grad_norm": 0.026387829333543777, "grad_norm_var": 1.2193851938945935e-06, "learning_rate": 0.0055584200186123865, "loss": 2.5311, "step": 13105 }, { "crossentropy": 2.6266844272613525, "epoch": 0.47513051044083526, "grad_norm": 0.026463719084858894, "grad_norm_var": 1.129656378242155e-06, "learning_rate": 0.005557842562539343, "loss": 2.5886, "step": 13106 }, { "crossentropy": 2.6172454357147217, "epoch": 0.4751667633410673, "grad_norm": 0.02854694426059723, "grad_norm_var": 1.2750580474647401e-06, "learning_rate": 0.005557265098931788, "loss": 2.6359, "step": 13107 }, { "crossentropy": 2.693174362182617, "epoch": 0.4752030162412993, "grad_norm": 0.030657144263386726, "grad_norm_var": 2.0525572829294673e-06, "learning_rate": 0.005556687627797522, "loss": 2.7101, "step": 13108 }, { "crossentropy": 2.487273693084717, "epoch": 0.4752392691415313, "grad_norm": 0.028074417263269424, "grad_norm_var": 1.9488264184822398e-06, "learning_rate": 0.005556110149144345, "loss": 2.6138, "step": 13109 }, { "crossentropy": 2.483793020248413, "epoch": 0.4752755220417633, "grad_norm": 0.028296269476413727, "grad_norm_var": 1.8693324940146731e-06, "learning_rate": 0.0055555326629800564, "loss": 2.6013, "step": 13110 }, { "crossentropy": 2.6998305320739746, "epoch": 0.47531177494199534, "grad_norm": 0.028990022838115692, "grad_norm_var": 1.999611398945097e-06, "learning_rate": 0.005554955169312456, "loss": 2.7035, "step": 13111 }, { "crossentropy": 2.679011821746826, "epoch": 0.4753480278422274, "grad_norm": 0.027309831231832504, "grad_norm_var": 2.0049453172642777e-06, "learning_rate": 0.005554377668149344, "loss": 2.6841, "step": 13112 }, { "crossentropy": 2.537849187850952, "epoch": 0.4753842807424594, "grad_norm": 0.026249324902892113, "grad_norm_var": 2.1170221137391572e-06, "learning_rate": 0.00555380015949852, "loss": 2.6003, "step": 13113 }, { "crossentropy": 2.579974889755249, "epoch": 0.47542053364269143, "grad_norm": 0.027005301788449287, "grad_norm_var": 2.109962775744915e-06, "learning_rate": 0.005553222643367784, "loss": 2.598, "step": 13114 }, { "crossentropy": 2.653449773788452, "epoch": 0.47545678654292345, "grad_norm": 0.027096103876829147, "grad_norm_var": 1.9509424935290243e-06, "learning_rate": 0.005552645119764939, "loss": 2.5353, "step": 13115 }, { "crossentropy": 2.495938539505005, "epoch": 0.47549303944315546, "grad_norm": 0.029421551153063774, "grad_norm_var": 2.0866690047442824e-06, "learning_rate": 0.0055520675886977814, "loss": 2.5196, "step": 13116 }, { "crossentropy": 2.4707443714141846, "epoch": 0.4755292923433875, "grad_norm": 0.03021537885069847, "grad_norm_var": 2.455723007879296e-06, "learning_rate": 0.005551490050174113, "loss": 2.4388, "step": 13117 }, { "crossentropy": 2.57476806640625, "epoch": 0.4755655452436195, "grad_norm": 0.029389644041657448, "grad_norm_var": 2.609878217448881e-06, "learning_rate": 0.005550912504201734, "loss": 2.6288, "step": 13118 }, { "crossentropy": 2.6229915618896484, "epoch": 0.4756017981438515, "grad_norm": 0.029451699927449226, "grad_norm_var": 2.1472805036951695e-06, "learning_rate": 0.005550334950788447, "loss": 2.668, "step": 13119 }, { "crossentropy": 2.8802168369293213, "epoch": 0.4756380510440835, "grad_norm": 0.02891186997294426, "grad_norm_var": 1.952768410249045e-06, "learning_rate": 0.005549757389942051, "loss": 2.7402, "step": 13120 }, { "crossentropy": 2.5204968452453613, "epoch": 0.47567430394431554, "grad_norm": 0.02623874507844448, "grad_norm_var": 1.991753837160548e-06, "learning_rate": 0.005549179821670346, "loss": 2.5436, "step": 13121 }, { "crossentropy": 2.83141827583313, "epoch": 0.47571055684454755, "grad_norm": 0.029711604118347168, "grad_norm_var": 1.8688937920660475e-06, "learning_rate": 0.005548602245981136, "loss": 2.7118, "step": 13122 }, { "crossentropy": 2.5806543827056885, "epoch": 0.47574680974477956, "grad_norm": 0.03128107637166977, "grad_norm_var": 2.3631165595953124e-06, "learning_rate": 0.005548024662882218, "loss": 2.5756, "step": 13123 }, { "crossentropy": 2.5463855266571045, "epoch": 0.4757830626450116, "grad_norm": 0.026050584390759468, "grad_norm_var": 2.452747776466784e-06, "learning_rate": 0.0055474470723813965, "loss": 2.6101, "step": 13124 }, { "crossentropy": 2.5820257663726807, "epoch": 0.4758193155452436, "grad_norm": 0.025509074330329895, "grad_norm_var": 2.9603184284146398e-06, "learning_rate": 0.005546869474486471, "loss": 2.6159, "step": 13125 }, { "crossentropy": 2.657395839691162, "epoch": 0.47585556844547566, "grad_norm": 0.02814953401684761, "grad_norm_var": 2.959692707664195e-06, "learning_rate": 0.005546291869205244, "loss": 2.6512, "step": 13126 }, { "crossentropy": 2.5022523403167725, "epoch": 0.4758918213457077, "grad_norm": 0.02829631417989731, "grad_norm_var": 2.9154329183764577e-06, "learning_rate": 0.005545714256545517, "loss": 2.5504, "step": 13127 }, { "crossentropy": 2.456394910812378, "epoch": 0.4759280742459397, "grad_norm": 0.027113035321235657, "grad_norm_var": 2.9397147586794944e-06, "learning_rate": 0.005545136636515089, "loss": 2.5038, "step": 13128 }, { "crossentropy": 2.6342110633850098, "epoch": 0.4759643271461717, "grad_norm": 0.02678198181092739, "grad_norm_var": 2.823832065422975e-06, "learning_rate": 0.005544559009121764, "loss": 2.5693, "step": 13129 }, { "crossentropy": 2.6056225299835205, "epoch": 0.4760005800464037, "grad_norm": 0.026294708251953125, "grad_norm_var": 2.9651698430864977e-06, "learning_rate": 0.005543981374373344, "loss": 2.631, "step": 13130 }, { "crossentropy": 2.556095838546753, "epoch": 0.47603683294663574, "grad_norm": 0.026419678702950478, "grad_norm_var": 3.086072006932408e-06, "learning_rate": 0.00554340373227763, "loss": 2.5407, "step": 13131 }, { "crossentropy": 2.596627712249756, "epoch": 0.47607308584686775, "grad_norm": 0.02779492549598217, "grad_norm_var": 2.9598913268841088e-06, "learning_rate": 0.005542826082842423, "loss": 2.5771, "step": 13132 }, { "crossentropy": 2.564697265625, "epoch": 0.47610933874709976, "grad_norm": 0.02749530039727688, "grad_norm_var": 2.610007219510981e-06, "learning_rate": 0.005542248426075526, "loss": 2.5095, "step": 13133 }, { "crossentropy": 2.607391834259033, "epoch": 0.4761455916473318, "grad_norm": 0.02722674049437046, "grad_norm_var": 2.445576953580044e-06, "learning_rate": 0.005541670761984742, "loss": 2.594, "step": 13134 }, { "crossentropy": 2.62280011177063, "epoch": 0.4761818445475638, "grad_norm": 0.028085213154554367, "grad_norm_var": 2.2377380142622603e-06, "learning_rate": 0.005541093090577871, "loss": 2.6424, "step": 13135 }, { "crossentropy": 2.6148135662078857, "epoch": 0.4762180974477958, "grad_norm": 0.027889221906661987, "grad_norm_var": 2.122181569645997e-06, "learning_rate": 0.005540515411862718, "loss": 2.637, "step": 13136 }, { "crossentropy": 2.651679515838623, "epoch": 0.4762543503480278, "grad_norm": 0.02904248982667923, "grad_norm_var": 2.134103766257787e-06, "learning_rate": 0.005539937725847083, "loss": 2.6748, "step": 13137 }, { "crossentropy": 2.7235634326934814, "epoch": 0.47629060324825984, "grad_norm": 0.03126071020960808, "grad_norm_var": 2.700334042079512e-06, "learning_rate": 0.00553936003253877, "loss": 2.6372, "step": 13138 }, { "crossentropy": 2.6127936840057373, "epoch": 0.4763268561484919, "grad_norm": 0.027125461027026176, "grad_norm_var": 1.8470643830279052e-06, "learning_rate": 0.005538782331945582, "loss": 2.5543, "step": 13139 }, { "crossentropy": 2.489022731781006, "epoch": 0.4763631090487239, "grad_norm": 0.027363460510969162, "grad_norm_var": 1.6952187462032063e-06, "learning_rate": 0.005538204624075321, "loss": 2.5466, "step": 13140 }, { "crossentropy": 2.604619026184082, "epoch": 0.47639936194895594, "grad_norm": 0.030439458787441254, "grad_norm_var": 1.8297863207796781e-06, "learning_rate": 0.005537626908935788, "loss": 2.6656, "step": 13141 }, { "crossentropy": 2.580561399459839, "epoch": 0.47643561484918795, "grad_norm": 0.028046514838933945, "grad_norm_var": 1.8273467696935364e-06, "learning_rate": 0.005537049186534791, "loss": 2.622, "step": 13142 }, { "crossentropy": 2.4669175148010254, "epoch": 0.47647186774941996, "grad_norm": 0.027062389999628067, "grad_norm_var": 1.8601343910865097e-06, "learning_rate": 0.005536471456880127, "loss": 2.5474, "step": 13143 }, { "crossentropy": 2.674560070037842, "epoch": 0.476508120649652, "grad_norm": 0.026175834238529205, "grad_norm_var": 2.005882699790638e-06, "learning_rate": 0.005535893719979602, "loss": 2.5148, "step": 13144 }, { "crossentropy": 2.6524860858917236, "epoch": 0.476544373549884, "grad_norm": 0.027674568817019463, "grad_norm_var": 1.936722245920545e-06, "learning_rate": 0.005535315975841019, "loss": 2.6811, "step": 13145 }, { "crossentropy": 2.3527684211730957, "epoch": 0.476580626450116, "grad_norm": 0.02668026275932789, "grad_norm_var": 1.8667129827259898e-06, "learning_rate": 0.005534738224472183, "loss": 2.4627, "step": 13146 }, { "crossentropy": 2.683588743209839, "epoch": 0.476616879350348, "grad_norm": 0.034007273614406586, "grad_norm_var": 4.006388818173408e-06, "learning_rate": 0.005534160465880895, "loss": 2.6863, "step": 13147 }, { "crossentropy": 2.5007941722869873, "epoch": 0.47665313225058004, "grad_norm": 0.029099486768245697, "grad_norm_var": 4.018708131619724e-06, "learning_rate": 0.005533582700074959, "loss": 2.5932, "step": 13148 }, { "crossentropy": 2.433274269104004, "epoch": 0.47668938515081205, "grad_norm": 0.03071056306362152, "grad_norm_var": 4.2696296148438005e-06, "learning_rate": 0.005533004927062178, "loss": 2.5275, "step": 13149 }, { "crossentropy": 2.505751848220825, "epoch": 0.47672563805104406, "grad_norm": 0.027096664533019066, "grad_norm_var": 4.294818143830518e-06, "learning_rate": 0.005532427146850359, "loss": 2.5518, "step": 13150 }, { "crossentropy": 2.534303903579712, "epoch": 0.4767618909512761, "grad_norm": 0.026126820594072342, "grad_norm_var": 4.671549351782887e-06, "learning_rate": 0.005531849359447302, "loss": 2.6048, "step": 13151 }, { "crossentropy": 2.5308029651641846, "epoch": 0.4767981438515081, "grad_norm": 0.02681438811123371, "grad_norm_var": 4.829504100032596e-06, "learning_rate": 0.005531271564860812, "loss": 2.5267, "step": 13152 }, { "crossentropy": 2.681879758834839, "epoch": 0.47683439675174016, "grad_norm": 0.02756083756685257, "grad_norm_var": 4.843813186631536e-06, "learning_rate": 0.005530693763098692, "loss": 2.6684, "step": 13153 }, { "crossentropy": 2.7696726322174072, "epoch": 0.4768706496519722, "grad_norm": 0.029910743236541748, "grad_norm_var": 4.429801764740703e-06, "learning_rate": 0.0055301159541687505, "loss": 2.6322, "step": 13154 }, { "crossentropy": 2.6399612426757812, "epoch": 0.4769069025522042, "grad_norm": 0.030067499727010727, "grad_norm_var": 4.532232230706976e-06, "learning_rate": 0.005529538138078787, "loss": 2.6014, "step": 13155 }, { "crossentropy": 2.535862922668457, "epoch": 0.4769431554524362, "grad_norm": 0.02951306663453579, "grad_norm_var": 4.5161217735827645e-06, "learning_rate": 0.005528960314836608, "loss": 2.6423, "step": 13156 }, { "crossentropy": 2.8306641578674316, "epoch": 0.4769794083526682, "grad_norm": 0.027533208951354027, "grad_norm_var": 4.316363269476058e-06, "learning_rate": 0.005528382484450015, "loss": 2.6229, "step": 13157 }, { "crossentropy": 2.670240640640259, "epoch": 0.47701566125290024, "grad_norm": 0.030178885906934738, "grad_norm_var": 4.505733759408286e-06, "learning_rate": 0.005527804646926816, "loss": 2.5416, "step": 13158 }, { "crossentropy": 2.511591911315918, "epoch": 0.47705191415313225, "grad_norm": 0.027483897283673286, "grad_norm_var": 4.4352965556701394e-06, "learning_rate": 0.005527226802274815, "loss": 2.5911, "step": 13159 }, { "crossentropy": 2.591688632965088, "epoch": 0.47708816705336426, "grad_norm": 0.02602425590157509, "grad_norm_var": 4.484505820470649e-06, "learning_rate": 0.005526648950501816, "loss": 2.5031, "step": 13160 }, { "crossentropy": 2.7306206226348877, "epoch": 0.4771244199535963, "grad_norm": 0.02777790278196335, "grad_norm_var": 4.4733850900676815e-06, "learning_rate": 0.005526071091615622, "loss": 2.7329, "step": 13161 }, { "crossentropy": 2.6108787059783936, "epoch": 0.4771606728538283, "grad_norm": 0.02668692357838154, "grad_norm_var": 4.471739224040881e-06, "learning_rate": 0.0055254932256240405, "loss": 2.5754, "step": 13162 }, { "crossentropy": 2.489685297012329, "epoch": 0.4771969257540603, "grad_norm": 0.027389289811253548, "grad_norm_var": 2.382161517974291e-06, "learning_rate": 0.005524915352534877, "loss": 2.4842, "step": 13163 }, { "crossentropy": 2.455247402191162, "epoch": 0.4772331786542923, "grad_norm": 0.026883700862526894, "grad_norm_var": 2.4006456123316805e-06, "learning_rate": 0.005524337472355934, "loss": 2.6145, "step": 13164 }, { "crossentropy": 2.473628520965576, "epoch": 0.47726943155452434, "grad_norm": 0.029146654531359673, "grad_norm_var": 1.9851536072504655e-06, "learning_rate": 0.005523759585095016, "loss": 2.5814, "step": 13165 }, { "crossentropy": 2.566432476043701, "epoch": 0.4773056844547564, "grad_norm": 0.02920517884194851, "grad_norm_var": 2.0407788474283467e-06, "learning_rate": 0.005523181690759932, "loss": 2.6011, "step": 13166 }, { "crossentropy": 2.6540982723236084, "epoch": 0.4773419373549884, "grad_norm": 0.031010234728455544, "grad_norm_var": 2.2992531226481244e-06, "learning_rate": 0.005522603789358486, "loss": 2.6413, "step": 13167 }, { "crossentropy": 2.424377918243408, "epoch": 0.47737819025522044, "grad_norm": 0.028977898880839348, "grad_norm_var": 2.1562788132792677e-06, "learning_rate": 0.005522025880898482, "loss": 2.5599, "step": 13168 }, { "crossentropy": 2.542801856994629, "epoch": 0.47741444315545245, "grad_norm": 0.027846548706293106, "grad_norm_var": 2.1271506944933736e-06, "learning_rate": 0.005521447965387725, "loss": 2.5399, "step": 13169 }, { "crossentropy": 2.4667105674743652, "epoch": 0.47745069605568446, "grad_norm": 0.026026999577879906, "grad_norm_var": 2.3275543276300803e-06, "learning_rate": 0.005520870042834022, "loss": 2.4609, "step": 13170 }, { "crossentropy": 2.558823347091675, "epoch": 0.4774869489559165, "grad_norm": 0.02527415193617344, "grad_norm_var": 2.5920776023724904e-06, "learning_rate": 0.005520292113245181, "loss": 2.5742, "step": 13171 }, { "crossentropy": 2.5941436290740967, "epoch": 0.4775232018561485, "grad_norm": 0.026293190196156502, "grad_norm_var": 2.562530031936331e-06, "learning_rate": 0.0055197141766290024, "loss": 2.6054, "step": 13172 }, { "crossentropy": 2.5249783992767334, "epoch": 0.4775594547563805, "grad_norm": 0.027959072962403297, "grad_norm_var": 2.5624817702860415e-06, "learning_rate": 0.005519136232993297, "loss": 2.5881, "step": 13173 }, { "crossentropy": 2.667064905166626, "epoch": 0.4775957076566125, "grad_norm": 0.030893128365278244, "grad_norm_var": 2.8246933096433165e-06, "learning_rate": 0.005518558282345868, "loss": 2.5881, "step": 13174 }, { "crossentropy": 2.585289239883423, "epoch": 0.47763196055684454, "grad_norm": 0.02991516888141632, "grad_norm_var": 3.0900638607595273e-06, "learning_rate": 0.0055179803246945235, "loss": 2.5013, "step": 13175 }, { "crossentropy": 2.681781768798828, "epoch": 0.47766821345707655, "grad_norm": 0.029225114732980728, "grad_norm_var": 2.905593968241281e-06, "learning_rate": 0.005517402360047068, "loss": 2.6241, "step": 13176 }, { "crossentropy": 2.6510002613067627, "epoch": 0.47770446635730857, "grad_norm": 0.026895878836512566, "grad_norm_var": 2.9987937226987987e-06, "learning_rate": 0.0055168243884113075, "loss": 2.5959, "step": 13177 }, { "crossentropy": 2.329796314239502, "epoch": 0.4777407192575406, "grad_norm": 0.02688485197722912, "grad_norm_var": 2.9639024176718207e-06, "learning_rate": 0.005516246409795052, "loss": 2.3908, "step": 13178 }, { "crossentropy": 2.623865842819214, "epoch": 0.4777769721577726, "grad_norm": 0.027490748092532158, "grad_norm_var": 2.9547394753193234e-06, "learning_rate": 0.005515668424206103, "loss": 2.6153, "step": 13179 }, { "crossentropy": 2.544769287109375, "epoch": 0.47781322505800466, "grad_norm": 0.02587313763797283, "grad_norm_var": 3.1852197457206293e-06, "learning_rate": 0.00551509043165227, "loss": 2.578, "step": 13180 }, { "crossentropy": 2.6641948223114014, "epoch": 0.4778494779582367, "grad_norm": 0.029789507389068604, "grad_norm_var": 3.3044148978777678e-06, "learning_rate": 0.005514512432141358, "loss": 2.562, "step": 13181 }, { "crossentropy": 2.4904327392578125, "epoch": 0.4778857308584687, "grad_norm": 0.026934558525681496, "grad_norm_var": 3.2913134686816107e-06, "learning_rate": 0.005513934425681178, "loss": 2.5194, "step": 13182 }, { "crossentropy": 2.5096499919891357, "epoch": 0.4779219837587007, "grad_norm": 0.027036551386117935, "grad_norm_var": 2.659797858128807e-06, "learning_rate": 0.005513356412279531, "loss": 2.5809, "step": 13183 }, { "crossentropy": 2.6169912815093994, "epoch": 0.4779582366589327, "grad_norm": 0.026590989902615547, "grad_norm_var": 2.6115016058742465e-06, "learning_rate": 0.0055127783919442265, "loss": 2.6144, "step": 13184 }, { "crossentropy": 2.615992784500122, "epoch": 0.47799448955916474, "grad_norm": 0.025641342625021935, "grad_norm_var": 2.830623110255586e-06, "learning_rate": 0.005512200364683073, "loss": 2.6102, "step": 13185 }, { "crossentropy": 2.7151551246643066, "epoch": 0.47803074245939675, "grad_norm": 0.026028629392385483, "grad_norm_var": 2.830320505636345e-06, "learning_rate": 0.005511622330503876, "loss": 2.647, "step": 13186 }, { "crossentropy": 2.689304828643799, "epoch": 0.47806699535962877, "grad_norm": 0.03256240487098694, "grad_norm_var": 4.064604673586916e-06, "learning_rate": 0.005511044289414442, "loss": 2.6113, "step": 13187 }, { "crossentropy": 2.4767677783966064, "epoch": 0.4781032482598608, "grad_norm": 0.027849677950143814, "grad_norm_var": 3.887559710864103e-06, "learning_rate": 0.00551046624142258, "loss": 2.5859, "step": 13188 }, { "crossentropy": 2.590404987335205, "epoch": 0.4781395011600928, "grad_norm": 0.026769796386361122, "grad_norm_var": 3.978194185867007e-06, "learning_rate": 0.005509888186536096, "loss": 2.5458, "step": 13189 }, { "crossentropy": 2.5260541439056396, "epoch": 0.4781757540603248, "grad_norm": 0.027761181816458702, "grad_norm_var": 3.3408700037688368e-06, "learning_rate": 0.0055093101247628, "loss": 2.537, "step": 13190 }, { "crossentropy": 2.7408015727996826, "epoch": 0.4782120069605568, "grad_norm": 0.027390826493501663, "grad_norm_var": 2.9946018818198614e-06, "learning_rate": 0.005508732056110497, "loss": 2.6865, "step": 13191 }, { "crossentropy": 2.643322229385376, "epoch": 0.47824825986078884, "grad_norm": 0.0269413273781538, "grad_norm_var": 2.809077796555685e-06, "learning_rate": 0.005508153980586995, "loss": 2.55, "step": 13192 }, { "crossentropy": 2.4865314960479736, "epoch": 0.4782845127610209, "grad_norm": 0.02772033028304577, "grad_norm_var": 2.795859342964724e-06, "learning_rate": 0.005507575898200103, "loss": 2.5849, "step": 13193 }, { "crossentropy": 2.5532567501068115, "epoch": 0.4783207656612529, "grad_norm": 0.026989132165908813, "grad_norm_var": 2.788623923723185e-06, "learning_rate": 0.005506997808957628, "loss": 2.5951, "step": 13194 }, { "crossentropy": 2.540036678314209, "epoch": 0.47835701856148494, "grad_norm": 0.026667334139347076, "grad_norm_var": 2.8276933888103537e-06, "learning_rate": 0.005506419712867377, "loss": 2.5993, "step": 13195 }, { "crossentropy": 2.568734884262085, "epoch": 0.47839327146171695, "grad_norm": 0.026777993887662888, "grad_norm_var": 2.6935476636054476e-06, "learning_rate": 0.005505841609937161, "loss": 2.6057, "step": 13196 }, { "crossentropy": 2.5056402683258057, "epoch": 0.47842952436194897, "grad_norm": 0.026993149891495705, "grad_norm_var": 2.315856417100098e-06, "learning_rate": 0.005505263500174786, "loss": 2.5247, "step": 13197 }, { "crossentropy": 2.467329263687134, "epoch": 0.478465777262181, "grad_norm": 0.028334124013781548, "grad_norm_var": 2.3717743122808146e-06, "learning_rate": 0.00550468538358806, "loss": 2.571, "step": 13198 }, { "crossentropy": 2.630558490753174, "epoch": 0.478502030162413, "grad_norm": 0.02765519917011261, "grad_norm_var": 2.3674947542406374e-06, "learning_rate": 0.005504107260184793, "loss": 2.5981, "step": 13199 }, { "crossentropy": 2.6297366619110107, "epoch": 0.478538283062645, "grad_norm": 0.02647869475185871, "grad_norm_var": 2.38065183013608e-06, "learning_rate": 0.005503529129972792, "loss": 2.5968, "step": 13200 }, { "crossentropy": 2.6169400215148926, "epoch": 0.478574535962877, "grad_norm": 0.026319971308112144, "grad_norm_var": 2.249394025896576e-06, "learning_rate": 0.005502950992959866, "loss": 2.5929, "step": 13201 }, { "crossentropy": 2.587252378463745, "epoch": 0.47861078886310904, "grad_norm": 0.028432540595531464, "grad_norm_var": 2.154191738254485e-06, "learning_rate": 0.005502372849153824, "loss": 2.5787, "step": 13202 }, { "crossentropy": 2.5770504474639893, "epoch": 0.47864704176334105, "grad_norm": 0.028314026072621346, "grad_norm_var": 4.728267273062473e-07, "learning_rate": 0.0055017946985624734, "loss": 2.5808, "step": 13203 }, { "crossentropy": 2.6366028785705566, "epoch": 0.47868329466357307, "grad_norm": 0.03024698793888092, "grad_norm_var": 9.958271870067707e-07, "learning_rate": 0.005501216541193624, "loss": 2.6008, "step": 13204 }, { "crossentropy": 2.5448577404022217, "epoch": 0.4787195475638051, "grad_norm": 0.02789541520178318, "grad_norm_var": 9.673702958352814e-07, "learning_rate": 0.005500638377055085, "loss": 2.5732, "step": 13205 }, { "crossentropy": 2.598226308822632, "epoch": 0.4787558004640371, "grad_norm": 0.02816946990787983, "grad_norm_var": 9.888831132339454e-07, "learning_rate": 0.005500060206154665, "loss": 2.5513, "step": 13206 }, { "crossentropy": 2.551842451095581, "epoch": 0.47879205336426917, "grad_norm": 0.02793947421014309, "grad_norm_var": 9.936451859990716e-07, "learning_rate": 0.005499482028500173, "loss": 2.4865, "step": 13207 }, { "crossentropy": 2.6621556282043457, "epoch": 0.4788283062645012, "grad_norm": 0.029623348265886307, "grad_norm_var": 1.2015291567655928e-06, "learning_rate": 0.005498903844099417, "loss": 2.6178, "step": 13208 }, { "crossentropy": 2.5431554317474365, "epoch": 0.4788645591647332, "grad_norm": 0.027680478990077972, "grad_norm_var": 1.201971104938746e-06, "learning_rate": 0.00549832565296021, "loss": 2.6194, "step": 13209 }, { "crossentropy": 2.486781597137451, "epoch": 0.4789008120649652, "grad_norm": 0.026233678683638573, "grad_norm_var": 1.3175374007442457e-06, "learning_rate": 0.005497747455090358, "loss": 2.4947, "step": 13210 }, { "crossentropy": 2.6018190383911133, "epoch": 0.4789370649651972, "grad_norm": 0.026920929551124573, "grad_norm_var": 1.2854521389737595e-06, "learning_rate": 0.005497169250497669, "loss": 2.5152, "step": 13211 }, { "crossentropy": 2.730320930480957, "epoch": 0.47897331786542924, "grad_norm": 0.030234307050704956, "grad_norm_var": 1.5836964940935435e-06, "learning_rate": 0.005496591039189956, "loss": 2.7347, "step": 13212 }, { "crossentropy": 2.6252028942108154, "epoch": 0.47900957076566125, "grad_norm": 0.030968153849244118, "grad_norm_var": 2.055103222329393e-06, "learning_rate": 0.005496012821175028, "loss": 2.6277, "step": 13213 }, { "crossentropy": 2.585304021835327, "epoch": 0.47904582366589327, "grad_norm": 0.030543241649866104, "grad_norm_var": 2.395078453996888e-06, "learning_rate": 0.005495434596460695, "loss": 2.5816, "step": 13214 }, { "crossentropy": 2.6519744396209717, "epoch": 0.4790820765661253, "grad_norm": 0.029308078810572624, "grad_norm_var": 2.4119359920159033e-06, "learning_rate": 0.005494856365054764, "loss": 2.564, "step": 13215 }, { "crossentropy": 2.541165351867676, "epoch": 0.4791183294663573, "grad_norm": 0.028071768581867218, "grad_norm_var": 2.1503847710599895e-06, "learning_rate": 0.005494278126965049, "loss": 2.5916, "step": 13216 }, { "crossentropy": 2.6609363555908203, "epoch": 0.4791545823665893, "grad_norm": 0.02783951163291931, "grad_norm_var": 1.84159167137426e-06, "learning_rate": 0.0054936998821993556, "loss": 2.6456, "step": 13217 }, { "crossentropy": 2.580996513366699, "epoch": 0.4791908352668213, "grad_norm": 0.02731885015964508, "grad_norm_var": 1.951600525236157e-06, "learning_rate": 0.005493121630765497, "loss": 2.5775, "step": 13218 }, { "crossentropy": 2.6041676998138428, "epoch": 0.47922708816705334, "grad_norm": 0.02645571157336235, "grad_norm_var": 2.2337648639000316e-06, "learning_rate": 0.005492543372671283, "loss": 2.6207, "step": 13219 }, { "crossentropy": 2.7277112007141113, "epoch": 0.4792633410672854, "grad_norm": 0.030216535553336143, "grad_norm_var": 2.226589772277261e-06, "learning_rate": 0.005491965107924522, "loss": 2.5373, "step": 13220 }, { "crossentropy": 2.553006410598755, "epoch": 0.4792995939675174, "grad_norm": 0.03094629757106304, "grad_norm_var": 2.577169393991547e-06, "learning_rate": 0.005491386836533025, "loss": 2.5671, "step": 13221 }, { "crossentropy": 2.532339334487915, "epoch": 0.47933584686774944, "grad_norm": 0.026701539754867554, "grad_norm_var": 2.806751145851584e-06, "learning_rate": 0.005490808558504606, "loss": 2.5748, "step": 13222 }, { "crossentropy": 2.6565895080566406, "epoch": 0.47937209976798145, "grad_norm": 0.02921934612095356, "grad_norm_var": 2.8027912307458657e-06, "learning_rate": 0.005490230273847071, "loss": 2.6198, "step": 13223 }, { "crossentropy": 2.5434658527374268, "epoch": 0.47940835266821347, "grad_norm": 0.030825991183519363, "grad_norm_var": 3.0504516505068522e-06, "learning_rate": 0.005489651982568232, "loss": 2.6403, "step": 13224 }, { "crossentropy": 2.6793272495269775, "epoch": 0.4794446055684455, "grad_norm": 0.0292592104524374, "grad_norm_var": 2.987877696126193e-06, "learning_rate": 0.005489073684675899, "loss": 2.5957, "step": 13225 }, { "crossentropy": 2.5937304496765137, "epoch": 0.4794808584686775, "grad_norm": 0.02603716216981411, "grad_norm_var": 3.057965584422071e-06, "learning_rate": 0.005488495380177887, "loss": 2.6101, "step": 13226 }, { "crossentropy": 2.599428653717041, "epoch": 0.4795171113689095, "grad_norm": 0.026918426156044006, "grad_norm_var": 3.0585945736660935e-06, "learning_rate": 0.0054879170690820015, "loss": 2.6518, "step": 13227 }, { "crossentropy": 2.3633081912994385, "epoch": 0.4795533642691415, "grad_norm": 0.026495324447751045, "grad_norm_var": 3.219295702611841e-06, "learning_rate": 0.005487338751396055, "loss": 2.4442, "step": 13228 }, { "crossentropy": 2.4563663005828857, "epoch": 0.47958961716937354, "grad_norm": 0.03153006732463837, "grad_norm_var": 3.418679756025467e-06, "learning_rate": 0.0054867604271278585, "loss": 2.5491, "step": 13229 }, { "crossentropy": 2.5288422107696533, "epoch": 0.47962587006960555, "grad_norm": 0.031424373388290405, "grad_norm_var": 3.6948652955687826e-06, "learning_rate": 0.005486182096285226, "loss": 2.5367, "step": 13230 }, { "crossentropy": 2.674072265625, "epoch": 0.47966212296983757, "grad_norm": 0.028639014810323715, "grad_norm_var": 3.6650747409811463e-06, "learning_rate": 0.005485603758875964, "loss": 2.6209, "step": 13231 }, { "crossentropy": 2.556334972381592, "epoch": 0.4796983758700696, "grad_norm": 0.029634546488523483, "grad_norm_var": 3.7037535036643064e-06, "learning_rate": 0.005485025414907889, "loss": 2.609, "step": 13232 }, { "crossentropy": 2.6392815113067627, "epoch": 0.4797346287703016, "grad_norm": 0.027173850685358047, "grad_norm_var": 3.8092728554035375e-06, "learning_rate": 0.005484447064388807, "loss": 2.6321, "step": 13233 }, { "crossentropy": 2.4598851203918457, "epoch": 0.47977088167053367, "grad_norm": 0.026781899854540825, "grad_norm_var": 3.924367135664967e-06, "learning_rate": 0.0054838687073265335, "loss": 2.5202, "step": 13234 }, { "crossentropy": 2.4737725257873535, "epoch": 0.4798071345707657, "grad_norm": 0.026846976950764656, "grad_norm_var": 3.819920727662849e-06, "learning_rate": 0.00548329034372888, "loss": 2.4899, "step": 13235 }, { "crossentropy": 2.535971164703369, "epoch": 0.4798433874709977, "grad_norm": 0.027036475017666817, "grad_norm_var": 3.7943861156094124e-06, "learning_rate": 0.0054827119736036555, "loss": 2.5861, "step": 13236 }, { "crossentropy": 2.53340744972229, "epoch": 0.4798796403712297, "grad_norm": 0.03327490761876106, "grad_norm_var": 4.90309235561917e-06, "learning_rate": 0.005482133596958673, "loss": 2.4616, "step": 13237 }, { "crossentropy": 2.6572682857513428, "epoch": 0.4799158932714617, "grad_norm": 0.026478927582502365, "grad_norm_var": 4.9629083718111115e-06, "learning_rate": 0.005481555213801745, "loss": 2.5926, "step": 13238 }, { "crossentropy": 2.624656915664673, "epoch": 0.47995214617169374, "grad_norm": 0.027848031371831894, "grad_norm_var": 4.966928871643662e-06, "learning_rate": 0.005480976824140683, "loss": 2.7282, "step": 13239 }, { "crossentropy": 2.7046163082122803, "epoch": 0.47998839907192575, "grad_norm": 0.027318570762872696, "grad_norm_var": 4.654037148902647e-06, "learning_rate": 0.0054803984279833, "loss": 2.6789, "step": 13240 }, { "crossentropy": 2.7444894313812256, "epoch": 0.48002465197215777, "grad_norm": 0.026049381121993065, "grad_norm_var": 4.88472003500693e-06, "learning_rate": 0.005479820025337406, "loss": 2.6273, "step": 13241 }, { "crossentropy": 2.5911359786987305, "epoch": 0.4800609048723898, "grad_norm": 0.027147550135850906, "grad_norm_var": 4.65741037430505e-06, "learning_rate": 0.005479241616210813, "loss": 2.5194, "step": 13242 }, { "crossentropy": 2.652684211730957, "epoch": 0.4800971577726218, "grad_norm": 0.026274800300598145, "grad_norm_var": 4.790054685869478e-06, "learning_rate": 0.005478663200611339, "loss": 2.6371, "step": 13243 }, { "crossentropy": 2.424175500869751, "epoch": 0.4801334106728538, "grad_norm": 0.02668893337249756, "grad_norm_var": 4.750401255157235e-06, "learning_rate": 0.00547808477854679, "loss": 2.4888, "step": 13244 }, { "crossentropy": 2.4601261615753174, "epoch": 0.4801696635730858, "grad_norm": 0.02643614634871483, "grad_norm_var": 4.065762863156239e-06, "learning_rate": 0.005477506350024979, "loss": 2.4991, "step": 13245 }, { "crossentropy": 2.628082752227783, "epoch": 0.48020591647331784, "grad_norm": 0.027312085032463074, "grad_norm_var": 3.1441501840769856e-06, "learning_rate": 0.005476927915053721, "loss": 2.6129, "step": 13246 }, { "crossentropy": 2.699908494949341, "epoch": 0.4802421693735499, "grad_norm": 0.027293644845485687, "grad_norm_var": 3.06351919359682e-06, "learning_rate": 0.005476349473640829, "loss": 2.6366, "step": 13247 }, { "crossentropy": 2.6539530754089355, "epoch": 0.4802784222737819, "grad_norm": 0.026903677731752396, "grad_norm_var": 2.743222409491014e-06, "learning_rate": 0.005475771025794114, "loss": 2.6121, "step": 13248 }, { "crossentropy": 2.630910634994507, "epoch": 0.48031467517401394, "grad_norm": 0.027243811637163162, "grad_norm_var": 2.7423131849342587e-06, "learning_rate": 0.005475192571521388, "loss": 2.5949, "step": 13249 }, { "crossentropy": 2.653998374938965, "epoch": 0.48035092807424595, "grad_norm": 0.026553098112344742, "grad_norm_var": 2.7616496683884833e-06, "learning_rate": 0.0054746141108304655, "loss": 2.6198, "step": 13250 }, { "crossentropy": 2.573885440826416, "epoch": 0.48038718097447797, "grad_norm": 0.026275135576725006, "grad_norm_var": 2.8161852130387592e-06, "learning_rate": 0.005474035643729161, "loss": 2.5108, "step": 13251 }, { "crossentropy": 2.655888080596924, "epoch": 0.48042343387471, "grad_norm": 0.0269569531083107, "grad_norm_var": 2.8189340141474093e-06, "learning_rate": 0.0054734571702252845, "loss": 2.6439, "step": 13252 }, { "crossentropy": 2.569634437561035, "epoch": 0.480459686774942, "grad_norm": 0.0273760873824358, "grad_norm_var": 2.577785679046263e-07, "learning_rate": 0.00547287869032665, "loss": 2.5239, "step": 13253 }, { "crossentropy": 2.5882725715637207, "epoch": 0.480495939675174, "grad_norm": 0.026864681392908096, "grad_norm_var": 2.462032548412411e-07, "learning_rate": 0.005472300204041071, "loss": 2.6618, "step": 13254 }, { "crossentropy": 2.728742837905884, "epoch": 0.480532192575406, "grad_norm": 0.026729607954621315, "grad_norm_var": 1.843382405661605e-07, "learning_rate": 0.005471721711376363, "loss": 2.6486, "step": 13255 }, { "crossentropy": 2.6531407833099365, "epoch": 0.48056844547563804, "grad_norm": 0.026157304644584656, "grad_norm_var": 1.943689524435867e-07, "learning_rate": 0.005471143212340335, "loss": 2.6045, "step": 13256 }, { "crossentropy": 2.507291793823242, "epoch": 0.48060469837587005, "grad_norm": 0.026425760239362717, "grad_norm_var": 1.6723842478684933e-07, "learning_rate": 0.0054705647069408034, "loss": 2.5459, "step": 13257 }, { "crossentropy": 2.6384177207946777, "epoch": 0.48064095127610207, "grad_norm": 0.02569729834794998, "grad_norm_var": 2.295432242277809e-07, "learning_rate": 0.005469986195185581, "loss": 2.5619, "step": 13258 }, { "crossentropy": 2.467557668685913, "epoch": 0.4806772041763341, "grad_norm": 0.02707798406481743, "grad_norm_var": 2.2440055245885142e-07, "learning_rate": 0.005469407677082484, "loss": 2.5384, "step": 13259 }, { "crossentropy": 2.64408802986145, "epoch": 0.48071345707656615, "grad_norm": 0.02686474844813347, "grad_norm_var": 2.249123753956042e-07, "learning_rate": 0.005468829152639323, "loss": 2.6304, "step": 13260 }, { "crossentropy": 2.4563241004943848, "epoch": 0.48074970997679817, "grad_norm": 0.027338631451129913, "grad_norm_var": 2.3678723661754718e-07, "learning_rate": 0.005468250621863912, "loss": 2.503, "step": 13261 }, { "crossentropy": 2.6626689434051514, "epoch": 0.4807859628770302, "grad_norm": 0.02803889848291874, "grad_norm_var": 3.177903014787842e-07, "learning_rate": 0.005467672084764066, "loss": 2.6069, "step": 13262 }, { "crossentropy": 2.6447558403015137, "epoch": 0.4808222157772622, "grad_norm": 0.02631743997335434, "grad_norm_var": 3.2121143125270143e-07, "learning_rate": 0.005467093541347599, "loss": 2.615, "step": 13263 }, { "crossentropy": 2.66703462600708, "epoch": 0.4808584686774942, "grad_norm": 0.027539506554603577, "grad_norm_var": 3.551564316701805e-07, "learning_rate": 0.0054665149916223255, "loss": 2.6772, "step": 13264 }, { "crossentropy": 2.6223137378692627, "epoch": 0.4808947215777262, "grad_norm": 0.027767078951001167, "grad_norm_var": 4.0036909832261853e-07, "learning_rate": 0.005465936435596057, "loss": 2.6518, "step": 13265 }, { "crossentropy": 2.632312297821045, "epoch": 0.48093097447795824, "grad_norm": 0.042428821325302124, "grad_norm_var": 1.547400833471745e-05, "learning_rate": 0.005465357873276612, "loss": 2.6044, "step": 13266 }, { "crossentropy": 2.649447202682495, "epoch": 0.48096722737819025, "grad_norm": 0.027400478720664978, "grad_norm_var": 1.5314456292019515e-05, "learning_rate": 0.005464779304671802, "loss": 2.6319, "step": 13267 }, { "crossentropy": 2.4909658432006836, "epoch": 0.48100348027842227, "grad_norm": 0.027989109978079796, "grad_norm_var": 1.524625777588096e-05, "learning_rate": 0.005464200729789442, "loss": 2.5091, "step": 13268 }, { "crossentropy": 2.632514476776123, "epoch": 0.4810397331786543, "grad_norm": 0.028619106858968735, "grad_norm_var": 1.5239282429981247e-05, "learning_rate": 0.005463622148637346, "loss": 2.6639, "step": 13269 }, { "crossentropy": 2.5542099475860596, "epoch": 0.4810759860788863, "grad_norm": 0.0273469015955925, "grad_norm_var": 1.517577039824297e-05, "learning_rate": 0.005463043561223331, "loss": 2.5661, "step": 13270 }, { "crossentropy": 2.520545482635498, "epoch": 0.4811122389791183, "grad_norm": 0.02912812866270542, "grad_norm_var": 1.5094299745282922e-05, "learning_rate": 0.005462464967555209, "loss": 2.5574, "step": 13271 }, { "crossentropy": 2.495386838912964, "epoch": 0.4811484918793503, "grad_norm": 0.02819104492664337, "grad_norm_var": 1.4783014239663714e-05, "learning_rate": 0.005461886367640797, "loss": 2.5358, "step": 13272 }, { "crossentropy": 2.60190486907959, "epoch": 0.48118474477958234, "grad_norm": 0.027422897517681122, "grad_norm_var": 1.4584581884093348e-05, "learning_rate": 0.005461307761487907, "loss": 2.6049, "step": 13273 }, { "crossentropy": 2.699385166168213, "epoch": 0.4812209976798144, "grad_norm": 0.026155540719628334, "grad_norm_var": 1.4429640650684287e-05, "learning_rate": 0.005460729149104356, "loss": 2.6194, "step": 13274 }, { "crossentropy": 2.5877788066864014, "epoch": 0.4812572505800464, "grad_norm": 0.02754783071577549, "grad_norm_var": 1.435581707899374e-05, "learning_rate": 0.0054601505304979585, "loss": 2.5335, "step": 13275 }, { "crossentropy": 2.779634475708008, "epoch": 0.48129350348027844, "grad_norm": 0.027769001200795174, "grad_norm_var": 1.4209039570432777e-05, "learning_rate": 0.00545957190567653, "loss": 2.6986, "step": 13276 }, { "crossentropy": 2.7147207260131836, "epoch": 0.48132975638051045, "grad_norm": 0.02730320207774639, "grad_norm_var": 1.4214899599040466e-05, "learning_rate": 0.0054589932746478866, "loss": 2.6641, "step": 13277 }, { "crossentropy": 2.4885714054107666, "epoch": 0.48136600928074247, "grad_norm": 0.026724576950073242, "grad_norm_var": 1.441423862806033e-05, "learning_rate": 0.005458414637419841, "loss": 2.5294, "step": 13278 }, { "crossentropy": 2.664219856262207, "epoch": 0.4814022621809745, "grad_norm": 0.0292829517275095, "grad_norm_var": 1.4109525153410165e-05, "learning_rate": 0.0054578359940002106, "loss": 2.6341, "step": 13279 }, { "crossentropy": 2.713888168334961, "epoch": 0.4814385150812065, "grad_norm": 0.03127181902527809, "grad_norm_var": 1.4420808408708667e-05, "learning_rate": 0.00545725734439681, "loss": 2.5976, "step": 13280 }, { "crossentropy": 2.573758840560913, "epoch": 0.4814747679814385, "grad_norm": 0.027882106602191925, "grad_norm_var": 1.4404309110480869e-05, "learning_rate": 0.005456678688617456, "loss": 2.5871, "step": 13281 }, { "crossentropy": 2.6264405250549316, "epoch": 0.4815110208816705, "grad_norm": 0.02800690196454525, "grad_norm_var": 1.3965517968288094e-06, "learning_rate": 0.005456100026669962, "loss": 2.5595, "step": 13282 }, { "crossentropy": 2.411466121673584, "epoch": 0.48154727378190254, "grad_norm": 0.02825913578271866, "grad_norm_var": 1.3736971239657869e-06, "learning_rate": 0.005455521358562146, "loss": 2.4492, "step": 13283 }, { "crossentropy": 2.549537181854248, "epoch": 0.48158352668213456, "grad_norm": 0.02487492747604847, "grad_norm_var": 2.0077152373519184e-06, "learning_rate": 0.005454942684301823, "loss": 2.5526, "step": 13284 }, { "crossentropy": 2.481146812438965, "epoch": 0.48161977958236657, "grad_norm": 0.025066306814551353, "grad_norm_var": 2.4377924712066228e-06, "learning_rate": 0.005454364003896808, "loss": 2.4703, "step": 13285 }, { "crossentropy": 2.522956609725952, "epoch": 0.4816560324825986, "grad_norm": 0.02655990608036518, "grad_norm_var": 2.507214097147564e-06, "learning_rate": 0.005453785317354917, "loss": 2.5422, "step": 13286 }, { "crossentropy": 2.4386818408966064, "epoch": 0.48169228538283065, "grad_norm": 0.026384685188531876, "grad_norm_var": 2.41512688926349e-06, "learning_rate": 0.005453206624683967, "loss": 2.5545, "step": 13287 }, { "crossentropy": 2.6581826210021973, "epoch": 0.48172853828306267, "grad_norm": 0.02598380856215954, "grad_norm_var": 2.4923873112115205e-06, "learning_rate": 0.0054526279258917745, "loss": 2.6368, "step": 13288 }, { "crossentropy": 2.489187240600586, "epoch": 0.4817647911832947, "grad_norm": 0.027447814121842384, "grad_norm_var": 2.4928976108245462e-06, "learning_rate": 0.005452049220986155, "loss": 2.4567, "step": 13289 }, { "crossentropy": 2.6595945358276367, "epoch": 0.4818010440835267, "grad_norm": 0.028912462294101715, "grad_norm_var": 2.55366586715122e-06, "learning_rate": 0.005451470509974924, "loss": 2.5924, "step": 13290 }, { "crossentropy": 2.5141959190368652, "epoch": 0.4818372969837587, "grad_norm": 0.027328163385391235, "grad_norm_var": 2.553958116429513e-06, "learning_rate": 0.005450891792865898, "loss": 2.5836, "step": 13291 }, { "crossentropy": 2.5907132625579834, "epoch": 0.4818735498839907, "grad_norm": 0.026530912145972252, "grad_norm_var": 2.5956344427178197e-06, "learning_rate": 0.0054503130696668955, "loss": 2.5968, "step": 13292 }, { "crossentropy": 2.5469706058502197, "epoch": 0.48190980278422274, "grad_norm": 0.026237737387418747, "grad_norm_var": 2.6751840975729077e-06, "learning_rate": 0.005449734340385731, "loss": 2.4867, "step": 13293 }, { "crossentropy": 2.5479326248168945, "epoch": 0.48194605568445475, "grad_norm": 0.028084442019462585, "grad_norm_var": 2.686946972498399e-06, "learning_rate": 0.005449155605030223, "loss": 2.5568, "step": 13294 }, { "crossentropy": 2.538346529006958, "epoch": 0.48198230858468677, "grad_norm": 0.028961770236492157, "grad_norm_var": 2.611993153384797e-06, "learning_rate": 0.005448576863608186, "loss": 2.5023, "step": 13295 }, { "crossentropy": 2.4454245567321777, "epoch": 0.4820185614849188, "grad_norm": 0.027482928708195686, "grad_norm_var": 1.5340687630612434e-06, "learning_rate": 0.005447998116127437, "loss": 2.4781, "step": 13296 }, { "crossentropy": 2.488548994064331, "epoch": 0.4820548143851508, "grad_norm": 0.025549594312906265, "grad_norm_var": 1.6387236388128053e-06, "learning_rate": 0.005447419362595794, "loss": 2.5031, "step": 13297 }, { "crossentropy": 2.6848785877227783, "epoch": 0.4820910672853828, "grad_norm": 0.030556200072169304, "grad_norm_var": 2.3941374061132886e-06, "learning_rate": 0.005446840603021074, "loss": 2.6297, "step": 13298 }, { "crossentropy": 2.5613620281219482, "epoch": 0.4821273201856148, "grad_norm": 0.029805650934576988, "grad_norm_var": 2.7746347972923267e-06, "learning_rate": 0.005446261837411093, "loss": 2.5692, "step": 13299 }, { "crossentropy": 2.632570743560791, "epoch": 0.48216357308584684, "grad_norm": 0.026999864727258682, "grad_norm_var": 2.3880477833373415e-06, "learning_rate": 0.005445683065773669, "loss": 2.6505, "step": 13300 }, { "crossentropy": 2.601004123687744, "epoch": 0.4821998259860789, "grad_norm": 0.027274465188384056, "grad_norm_var": 2.0150502683104084e-06, "learning_rate": 0.005445104288116617, "loss": 2.5953, "step": 13301 }, { "crossentropy": 2.553056001663208, "epoch": 0.4822360788863109, "grad_norm": 0.027076635509729385, "grad_norm_var": 1.9665361037473807e-06, "learning_rate": 0.005444525504447759, "loss": 2.5258, "step": 13302 }, { "crossentropy": 2.6337273120880127, "epoch": 0.48227233178654294, "grad_norm": 0.026660295203328133, "grad_norm_var": 1.9288806634556433e-06, "learning_rate": 0.005443946714774909, "loss": 2.6066, "step": 13303 }, { "crossentropy": 2.5760746002197266, "epoch": 0.48230858468677495, "grad_norm": 0.025865739211440086, "grad_norm_var": 1.954499083497002e-06, "learning_rate": 0.005443367919105884, "loss": 2.4965, "step": 13304 }, { "crossentropy": 2.540757656097412, "epoch": 0.48234483758700697, "grad_norm": 0.02552805468440056, "grad_norm_var": 2.21059251740197e-06, "learning_rate": 0.005442789117448501, "loss": 2.4239, "step": 13305 }, { "crossentropy": 2.6281650066375732, "epoch": 0.482381090487239, "grad_norm": 0.030299272388219833, "grad_norm_var": 2.6052042029963643e-06, "learning_rate": 0.005442210309810582, "loss": 2.6927, "step": 13306 }, { "crossentropy": 2.6894168853759766, "epoch": 0.482417343387471, "grad_norm": 0.03026842698454857, "grad_norm_var": 3.0722372668219445e-06, "learning_rate": 0.00544163149619994, "loss": 2.6964, "step": 13307 }, { "crossentropy": 2.761094093322754, "epoch": 0.482453596287703, "grad_norm": 0.027013175189495087, "grad_norm_var": 3.0116713680939747e-06, "learning_rate": 0.0054410526766243954, "loss": 2.6506, "step": 13308 }, { "crossentropy": 2.60905122756958, "epoch": 0.482489849187935, "grad_norm": 0.027206722646951675, "grad_norm_var": 2.8776844055539613e-06, "learning_rate": 0.0054404738510917625, "loss": 2.5958, "step": 13309 }, { "crossentropy": 2.641221284866333, "epoch": 0.48252610208816704, "grad_norm": 0.027388527989387512, "grad_norm_var": 2.880592867660301e-06, "learning_rate": 0.005439895019609865, "loss": 2.5921, "step": 13310 }, { "crossentropy": 2.7474613189697266, "epoch": 0.48256235498839906, "grad_norm": 0.02694307081401348, "grad_norm_var": 2.8080752270916484e-06, "learning_rate": 0.005439316182186516, "loss": 2.7021, "step": 13311 }, { "crossentropy": 2.722008228302002, "epoch": 0.48259860788863107, "grad_norm": 0.0286350566893816, "grad_norm_var": 2.869994373621643e-06, "learning_rate": 0.005438737338829537, "loss": 2.6538, "step": 13312 }, { "crossentropy": 2.604719400405884, "epoch": 0.4826348607888631, "grad_norm": 0.029859649017453194, "grad_norm_var": 2.7998901394071217e-06, "learning_rate": 0.005438158489546742, "loss": 2.6299, "step": 13313 }, { "crossentropy": 2.6358370780944824, "epoch": 0.48267111368909515, "grad_norm": 0.0302736796438694, "grad_norm_var": 2.707130464531928e-06, "learning_rate": 0.005437579634345954, "loss": 2.5601, "step": 13314 }, { "crossentropy": 2.6601529121398926, "epoch": 0.48270736658932717, "grad_norm": 0.02907566912472248, "grad_norm_var": 2.559204124599048e-06, "learning_rate": 0.0054370007732349864, "loss": 2.6537, "step": 13315 }, { "crossentropy": 2.5507421493530273, "epoch": 0.4827436194895592, "grad_norm": 0.02825385145843029, "grad_norm_var": 2.507314478731454e-06, "learning_rate": 0.005436421906221662, "loss": 2.5808, "step": 13316 }, { "crossentropy": 2.6590065956115723, "epoch": 0.4827798723897912, "grad_norm": 0.027725093066692352, "grad_norm_var": 2.477831625403862e-06, "learning_rate": 0.005435843033313796, "loss": 2.6676, "step": 13317 }, { "crossentropy": 2.5249733924865723, "epoch": 0.4828161252900232, "grad_norm": 0.02732822112739086, "grad_norm_var": 2.4506606736094187e-06, "learning_rate": 0.00543526415451921, "loss": 2.5139, "step": 13318 }, { "crossentropy": 2.483210802078247, "epoch": 0.4828523781902552, "grad_norm": 0.027464911341667175, "grad_norm_var": 2.3452213546303375e-06, "learning_rate": 0.005434685269845722, "loss": 2.5602, "step": 13319 }, { "crossentropy": 2.7239229679107666, "epoch": 0.48288863109048724, "grad_norm": 0.026469437405467033, "grad_norm_var": 2.1905259183476924e-06, "learning_rate": 0.005434106379301148, "loss": 2.6273, "step": 13320 }, { "crossentropy": 2.603515386581421, "epoch": 0.48292488399071926, "grad_norm": 0.026505539193749428, "grad_norm_var": 1.913956363286164e-06, "learning_rate": 0.005433527482893308, "loss": 2.5357, "step": 13321 }, { "crossentropy": 2.5904619693756104, "epoch": 0.48296113689095127, "grad_norm": 0.026865286752581596, "grad_norm_var": 1.6757762109535404e-06, "learning_rate": 0.0054329485806300214, "loss": 2.5755, "step": 13322 }, { "crossentropy": 2.42317271232605, "epoch": 0.4829973897911833, "grad_norm": 0.027727684006094933, "grad_norm_var": 1.2954494098012132e-06, "learning_rate": 0.005432369672519109, "loss": 2.5397, "step": 13323 }, { "crossentropy": 2.5357441902160645, "epoch": 0.4830336426914153, "grad_norm": 0.027931392192840576, "grad_norm_var": 1.2523074054345806e-06, "learning_rate": 0.0054317907585683875, "loss": 2.5711, "step": 13324 }, { "crossentropy": 2.807159423828125, "epoch": 0.4830698955916473, "grad_norm": 0.02754810079932213, "grad_norm_var": 1.2301579545986296e-06, "learning_rate": 0.0054312118387856755, "loss": 2.6918, "step": 13325 }, { "crossentropy": 2.5814943313598633, "epoch": 0.48310614849187933, "grad_norm": 0.026024652644991875, "grad_norm_var": 1.4348277703565693e-06, "learning_rate": 0.005430632913178793, "loss": 2.5454, "step": 13326 }, { "crossentropy": 2.5435004234313965, "epoch": 0.48314240139211134, "grad_norm": 0.026874078437685966, "grad_norm_var": 1.4429111500605889e-06, "learning_rate": 0.005430053981755563, "loss": 2.4825, "step": 13327 }, { "crossentropy": 2.627627372741699, "epoch": 0.4831786542923434, "grad_norm": 0.029947742819786072, "grad_norm_var": 1.6993635301318367e-06, "learning_rate": 0.005429475044523798, "loss": 2.5427, "step": 13328 }, { "crossentropy": 2.612844705581665, "epoch": 0.4832149071925754, "grad_norm": 0.029634203761816025, "grad_norm_var": 1.6426479744131949e-06, "learning_rate": 0.0054288961014913205, "loss": 2.6028, "step": 13329 }, { "crossentropy": 2.7217984199523926, "epoch": 0.48325116009280744, "grad_norm": 0.028066379949450493, "grad_norm_var": 1.2347650587875176e-06, "learning_rate": 0.005428317152665951, "loss": 2.6807, "step": 13330 }, { "crossentropy": 2.6612842082977295, "epoch": 0.48328741299303946, "grad_norm": 0.027006980031728745, "grad_norm_var": 1.1269640545844324e-06, "learning_rate": 0.00542773819805551, "loss": 2.5969, "step": 13331 }, { "crossentropy": 2.593376874923706, "epoch": 0.48332366589327147, "grad_norm": 0.02645661123096943, "grad_norm_var": 1.1687683619194482e-06, "learning_rate": 0.005427159237667813, "loss": 2.5367, "step": 13332 }, { "crossentropy": 2.5343520641326904, "epoch": 0.4833599187935035, "grad_norm": 0.02610034868121147, "grad_norm_var": 1.2792565122007968e-06, "learning_rate": 0.005426580271510684, "loss": 2.5456, "step": 13333 }, { "crossentropy": 2.6299636363983154, "epoch": 0.4833961716937355, "grad_norm": 0.02790812961757183, "grad_norm_var": 1.2968919224581501e-06, "learning_rate": 0.00542600129959194, "loss": 2.4955, "step": 13334 }, { "crossentropy": 2.649500608444214, "epoch": 0.4834324245939675, "grad_norm": 0.0331655852496624, "grad_norm_var": 3.3710895779743513e-06, "learning_rate": 0.0054254223219194035, "loss": 2.6412, "step": 13335 }, { "crossentropy": 2.630783796310425, "epoch": 0.48346867749419953, "grad_norm": 0.030555056408047676, "grad_norm_var": 3.7088677283461743e-06, "learning_rate": 0.005424843338500892, "loss": 2.5881, "step": 13336 }, { "crossentropy": 2.5016369819641113, "epoch": 0.48350493039443154, "grad_norm": 0.025882026180624962, "grad_norm_var": 3.859058983219721e-06, "learning_rate": 0.005424264349344225, "loss": 2.5355, "step": 13337 }, { "crossentropy": 2.5618631839752197, "epoch": 0.48354118329466356, "grad_norm": 0.027261191979050636, "grad_norm_var": 3.809965476407823e-06, "learning_rate": 0.005423685354457226, "loss": 2.5542, "step": 13338 }, { "crossentropy": 2.5386993885040283, "epoch": 0.48357743619489557, "grad_norm": 0.027031516656279564, "grad_norm_var": 3.866056114985384e-06, "learning_rate": 0.005423106353847713, "loss": 2.6371, "step": 13339 }, { "crossentropy": 2.518588066101074, "epoch": 0.4836136890951276, "grad_norm": 0.028239509090781212, "grad_norm_var": 3.870727052073843e-06, "learning_rate": 0.005422527347523507, "loss": 2.5704, "step": 13340 }, { "crossentropy": 2.5204412937164307, "epoch": 0.48364994199535966, "grad_norm": 0.02630988135933876, "grad_norm_var": 4.038084240055341e-06, "learning_rate": 0.005421948335492426, "loss": 2.4793, "step": 13341 }, { "crossentropy": 2.621459484100342, "epoch": 0.48368619489559167, "grad_norm": 0.026682816445827484, "grad_norm_var": 3.900236092084378e-06, "learning_rate": 0.0054213693177622944, "loss": 2.6246, "step": 13342 }, { "crossentropy": 2.635477304458618, "epoch": 0.4837224477958237, "grad_norm": 0.028866387903690338, "grad_norm_var": 3.8638020332012095e-06, "learning_rate": 0.00542079029434093, "loss": 2.5891, "step": 13343 }, { "crossentropy": 2.609968662261963, "epoch": 0.4837587006960557, "grad_norm": 0.02694554254412651, "grad_norm_var": 3.675338518526784e-06, "learning_rate": 0.005420211265236155, "loss": 2.5138, "step": 13344 }, { "crossentropy": 2.337691307067871, "epoch": 0.4837949535962877, "grad_norm": 0.027180124074220657, "grad_norm_var": 3.478408942145181e-06, "learning_rate": 0.005419632230455788, "loss": 2.4794, "step": 13345 }, { "crossentropy": 2.6807827949523926, "epoch": 0.48383120649651973, "grad_norm": 0.02879904769361019, "grad_norm_var": 3.5449534892136845e-06, "learning_rate": 0.005419053190007651, "loss": 2.5964, "step": 13346 }, { "crossentropy": 2.4533233642578125, "epoch": 0.48386745939675174, "grad_norm": 0.027750886976718903, "grad_norm_var": 3.503420094788425e-06, "learning_rate": 0.005418474143899565, "loss": 2.4929, "step": 13347 }, { "crossentropy": 2.5612845420837402, "epoch": 0.48390371229698376, "grad_norm": 0.026789076626300812, "grad_norm_var": 3.4498505221025453e-06, "learning_rate": 0.005417895092139352, "loss": 2.5412, "step": 13348 }, { "crossentropy": 2.502918004989624, "epoch": 0.48393996519721577, "grad_norm": 0.026474636048078537, "grad_norm_var": 3.3717043311838006e-06, "learning_rate": 0.00541731603473483, "loss": 2.5516, "step": 13349 }, { "crossentropy": 2.5201520919799805, "epoch": 0.4839762180974478, "grad_norm": 0.028330152854323387, "grad_norm_var": 3.3852577245535673e-06, "learning_rate": 0.005416736971693821, "loss": 2.5602, "step": 13350 }, { "crossentropy": 2.5722053050994873, "epoch": 0.4840124709976798, "grad_norm": 0.027696382254362106, "grad_norm_var": 1.4087377396671072e-06, "learning_rate": 0.005416157903024148, "loss": 2.6188, "step": 13351 }, { "crossentropy": 2.581547498703003, "epoch": 0.4840487238979118, "grad_norm": 0.02881561405956745, "grad_norm_var": 9.008082554732746e-07, "learning_rate": 0.005415578828733632, "loss": 2.6184, "step": 13352 }, { "crossentropy": 2.661266565322876, "epoch": 0.48408497679814383, "grad_norm": 0.026996059343218803, "grad_norm_var": 7.468198598465133e-07, "learning_rate": 0.005414999748830091, "loss": 2.5898, "step": 13353 }, { "crossentropy": 2.5999467372894287, "epoch": 0.48412122969837584, "grad_norm": 0.02616262249648571, "grad_norm_var": 8.58773477065824e-07, "learning_rate": 0.005414420663321349, "loss": 2.6017, "step": 13354 }, { "crossentropy": 2.6244864463806152, "epoch": 0.4841574825986079, "grad_norm": 0.026626279577612877, "grad_norm_var": 8.912102333224892e-07, "learning_rate": 0.005413841572215228, "loss": 2.6222, "step": 13355 }, { "crossentropy": 2.576395273208618, "epoch": 0.48419373549883993, "grad_norm": 0.028713788837194443, "grad_norm_var": 9.57309906769501e-07, "learning_rate": 0.0054132624755195485, "loss": 2.601, "step": 13356 }, { "crossentropy": 2.5412843227386475, "epoch": 0.48422998839907194, "grad_norm": 0.025887342169880867, "grad_norm_var": 1.0324875204307244e-06, "learning_rate": 0.005412683373242131, "loss": 2.4844, "step": 13357 }, { "crossentropy": 2.4305672645568848, "epoch": 0.48426624129930396, "grad_norm": 0.026069113984704018, "grad_norm_var": 1.1163318842699147e-06, "learning_rate": 0.0054121042653908, "loss": 2.5065, "step": 13358 }, { "crossentropy": 2.550330400466919, "epoch": 0.48430249419953597, "grad_norm": 0.026751672849059105, "grad_norm_var": 9.77134535186487e-07, "learning_rate": 0.0054115251519733735, "loss": 2.5458, "step": 13359 }, { "crossentropy": 2.5826988220214844, "epoch": 0.484338747099768, "grad_norm": 0.02579827606678009, "grad_norm_var": 1.1058593661529645e-06, "learning_rate": 0.0054109460329976765, "loss": 2.5263, "step": 13360 }, { "crossentropy": 2.3326144218444824, "epoch": 0.484375, "grad_norm": 0.029172953218221664, "grad_norm_var": 1.3547492426125522e-06, "learning_rate": 0.005410366908471529, "loss": 2.4269, "step": 13361 }, { "crossentropy": 2.533184766769409, "epoch": 0.484411252900232, "grad_norm": 0.030401352792978287, "grad_norm_var": 1.8350154443652621e-06, "learning_rate": 0.005409787778402754, "loss": 2.5607, "step": 13362 }, { "crossentropy": 2.426501989364624, "epoch": 0.48444750580046403, "grad_norm": 0.025592776015400887, "grad_norm_var": 2.025789768977113e-06, "learning_rate": 0.005409208642799174, "loss": 2.5006, "step": 13363 }, { "crossentropy": 2.5971322059631348, "epoch": 0.48448375870069604, "grad_norm": 0.027016719803214073, "grad_norm_var": 2.0145109044626686e-06, "learning_rate": 0.005408629501668609, "loss": 2.6613, "step": 13364 }, { "crossentropy": 2.564319610595703, "epoch": 0.48452001160092806, "grad_norm": 0.028561746701598167, "grad_norm_var": 2.062197283454964e-06, "learning_rate": 0.005408050355018883, "loss": 2.6103, "step": 13365 }, { "crossentropy": 2.537966728210449, "epoch": 0.48455626450116007, "grad_norm": 0.028865082189440727, "grad_norm_var": 2.1455640707739117e-06, "learning_rate": 0.005407471202857818, "loss": 2.6005, "step": 13366 }, { "crossentropy": 2.5769095420837402, "epoch": 0.4845925174013921, "grad_norm": 0.027197344228625298, "grad_norm_var": 2.144434795475249e-06, "learning_rate": 0.005406892045193235, "loss": 2.6036, "step": 13367 }, { "crossentropy": 2.584911584854126, "epoch": 0.48462877030162416, "grad_norm": 0.02641770988702774, "grad_norm_var": 2.0557762986897805e-06, "learning_rate": 0.00540631288203296, "loss": 2.5771, "step": 13368 }, { "crossentropy": 2.6486427783966064, "epoch": 0.48466502320185617, "grad_norm": 0.026990633457899094, "grad_norm_var": 2.0559722900238367e-06, "learning_rate": 0.005405733713384811, "loss": 2.572, "step": 13369 }, { "crossentropy": 2.5649468898773193, "epoch": 0.4847012761020882, "grad_norm": 0.026711655780673027, "grad_norm_var": 1.994179944963629e-06, "learning_rate": 0.005405154539256613, "loss": 2.5925, "step": 13370 }, { "crossentropy": 2.6184802055358887, "epoch": 0.4847375290023202, "grad_norm": 0.02692406065762043, "grad_norm_var": 1.973035959131469e-06, "learning_rate": 0.005404575359656188, "loss": 2.6391, "step": 13371 }, { "crossentropy": 2.680739402770996, "epoch": 0.4847737819025522, "grad_norm": 0.026558954268693924, "grad_norm_var": 1.8619338830779797e-06, "learning_rate": 0.005403996174591358, "loss": 2.6547, "step": 13372 }, { "crossentropy": 2.625676393508911, "epoch": 0.48481003480278423, "grad_norm": 0.026756655424833298, "grad_norm_var": 1.7590646412324184e-06, "learning_rate": 0.0054034169840699465, "loss": 2.6334, "step": 13373 }, { "crossentropy": 2.6395304203033447, "epoch": 0.48484628770301624, "grad_norm": 0.027403168380260468, "grad_norm_var": 1.6626183381976759e-06, "learning_rate": 0.005402837788099777, "loss": 2.5352, "step": 13374 }, { "crossentropy": 2.545171022415161, "epoch": 0.48488254060324826, "grad_norm": 0.026534710079431534, "grad_norm_var": 1.6820025458183594e-06, "learning_rate": 0.005402258586688672, "loss": 2.5928, "step": 13375 }, { "crossentropy": 2.5323774814605713, "epoch": 0.48491879350348027, "grad_norm": 0.026068836450576782, "grad_norm_var": 1.6321694288638864e-06, "learning_rate": 0.005401679379844455, "loss": 2.5332, "step": 13376 }, { "crossentropy": 2.5468525886535645, "epoch": 0.4849550464037123, "grad_norm": 0.02742720954120159, "grad_norm_var": 1.392132370776379e-06, "learning_rate": 0.005401100167574947, "loss": 2.5652, "step": 13377 }, { "crossentropy": 2.631535291671753, "epoch": 0.4849912993039443, "grad_norm": 0.028737498447299004, "grad_norm_var": 8.581166347712641e-07, "learning_rate": 0.005400520949887974, "loss": 2.6032, "step": 13378 }, { "crossentropy": 2.5765669345855713, "epoch": 0.4850275522041763, "grad_norm": 0.028198082000017166, "grad_norm_var": 7.551951034252824e-07, "learning_rate": 0.005399941726791356, "loss": 2.6506, "step": 13379 }, { "crossentropy": 2.591219425201416, "epoch": 0.48506380510440833, "grad_norm": 0.02726827748119831, "grad_norm_var": 7.505499486410231e-07, "learning_rate": 0.00539936249829292, "loss": 2.5184, "step": 13380 }, { "crossentropy": 2.496352434158325, "epoch": 0.48510005800464034, "grad_norm": 0.026948830112814903, "grad_norm_var": 6.39400535387633e-07, "learning_rate": 0.005398783264400487, "loss": 2.4455, "step": 13381 }, { "crossentropy": 2.5103249549865723, "epoch": 0.4851363109048724, "grad_norm": 0.027586422860622406, "grad_norm_var": 4.556714503264747e-07, "learning_rate": 0.00539820402512188, "loss": 2.511, "step": 13382 }, { "crossentropy": 2.557633399963379, "epoch": 0.48517256380510443, "grad_norm": 0.02676093578338623, "grad_norm_var": 4.6238342945268775e-07, "learning_rate": 0.005397624780464924, "loss": 2.5699, "step": 13383 }, { "crossentropy": 2.626859664916992, "epoch": 0.48520881670533644, "grad_norm": 0.03277745097875595, "grad_norm_var": 2.4279555256629763e-06, "learning_rate": 0.005397045530437441, "loss": 2.5458, "step": 13384 }, { "crossentropy": 2.6892879009246826, "epoch": 0.48524506960556846, "grad_norm": 0.02780766598880291, "grad_norm_var": 2.41654769262919e-06, "learning_rate": 0.005396466275047256, "loss": 2.6571, "step": 13385 }, { "crossentropy": 2.5850393772125244, "epoch": 0.48528132250580047, "grad_norm": 0.02685578539967537, "grad_norm_var": 2.402131188019435e-06, "learning_rate": 0.0053958870143021925, "loss": 2.5549, "step": 13386 }, { "crossentropy": 2.6254384517669678, "epoch": 0.4853175754060325, "grad_norm": 0.028469596058130264, "grad_norm_var": 2.4248240546715365e-06, "learning_rate": 0.005395307748210074, "loss": 2.6156, "step": 13387 }, { "crossentropy": 2.483733892440796, "epoch": 0.4853538283062645, "grad_norm": 0.028009934350848198, "grad_norm_var": 2.3482309231284113e-06, "learning_rate": 0.005394728476778724, "loss": 2.3998, "step": 13388 }, { "crossentropy": 2.569711685180664, "epoch": 0.4853900812064965, "grad_norm": 0.02705284021794796, "grad_norm_var": 2.315445274189221e-06, "learning_rate": 0.0053941492000159665, "loss": 2.5892, "step": 13389 }, { "crossentropy": 2.594472885131836, "epoch": 0.48542633410672853, "grad_norm": 0.02638845704495907, "grad_norm_var": 2.425937910764617e-06, "learning_rate": 0.005393569917929628, "loss": 2.5861, "step": 13390 }, { "crossentropy": 2.5605757236480713, "epoch": 0.48546258700696054, "grad_norm": 0.028209136798977852, "grad_norm_var": 2.3453007135272223e-06, "learning_rate": 0.005392990630527529, "loss": 2.6105, "step": 13391 }, { "crossentropy": 2.6197330951690674, "epoch": 0.48549883990719256, "grad_norm": 0.027222057804465294, "grad_norm_var": 2.164471605944444e-06, "learning_rate": 0.005392411337817495, "loss": 2.5585, "step": 13392 }, { "crossentropy": 2.5313215255737305, "epoch": 0.4855350928074246, "grad_norm": 0.02995152398943901, "grad_norm_var": 2.417902892844663e-06, "learning_rate": 0.00539183203980735, "loss": 2.663, "step": 13393 }, { "crossentropy": 2.5377113819122314, "epoch": 0.4855713457076566, "grad_norm": 0.03224572911858559, "grad_norm_var": 3.524960452147505e-06, "learning_rate": 0.005391252736504921, "loss": 2.5878, "step": 13394 }, { "crossentropy": 2.664344549179077, "epoch": 0.48560759860788866, "grad_norm": 0.028756750747561455, "grad_norm_var": 3.541751248888738e-06, "learning_rate": 0.005390673427918028, "loss": 2.6874, "step": 13395 }, { "crossentropy": 2.2673308849334717, "epoch": 0.48564385150812067, "grad_norm": 0.0312897190451622, "grad_norm_var": 4.015673334686829e-06, "learning_rate": 0.005390094114054499, "loss": 2.4289, "step": 13396 }, { "crossentropy": 2.5396339893341064, "epoch": 0.4856801044083527, "grad_norm": 0.027911268174648285, "grad_norm_var": 3.8718428289462864e-06, "learning_rate": 0.005389514794922155, "loss": 2.5912, "step": 13397 }, { "crossentropy": 2.795227527618408, "epoch": 0.4857163573085847, "grad_norm": 0.02552890032529831, "grad_norm_var": 4.409266473920009e-06, "learning_rate": 0.005388935470528824, "loss": 2.6782, "step": 13398 }, { "crossentropy": 2.464982271194458, "epoch": 0.4857526102088167, "grad_norm": 0.027844242751598358, "grad_norm_var": 4.238302786575123e-06, "learning_rate": 0.005388356140882329, "loss": 2.562, "step": 13399 }, { "crossentropy": 2.402817487716675, "epoch": 0.48578886310904873, "grad_norm": 0.027407318353652954, "grad_norm_var": 2.9923352577423117e-06, "learning_rate": 0.005387776805990495, "loss": 2.4998, "step": 13400 }, { "crossentropy": 2.394587755203247, "epoch": 0.48582511600928074, "grad_norm": 0.028478797525167465, "grad_norm_var": 2.986771667762231e-06, "learning_rate": 0.0053871974658611455, "loss": 2.5328, "step": 13401 }, { "crossentropy": 2.599810838699341, "epoch": 0.48586136890951276, "grad_norm": 0.029850570484995842, "grad_norm_var": 3.000033495234343e-06, "learning_rate": 0.005386618120502109, "loss": 2.6191, "step": 13402 }, { "crossentropy": 2.593137741088867, "epoch": 0.4858976218097448, "grad_norm": 0.029901666566729546, "grad_norm_var": 3.1389111935109053e-06, "learning_rate": 0.005386038769921207, "loss": 2.5693, "step": 13403 }, { "crossentropy": 2.5604946613311768, "epoch": 0.4859338747099768, "grad_norm": 0.031078679487109184, "grad_norm_var": 3.525716923901088e-06, "learning_rate": 0.005385459414126265, "loss": 2.547, "step": 13404 }, { "crossentropy": 2.574936866760254, "epoch": 0.4859701276102088, "grad_norm": 0.027811678126454353, "grad_norm_var": 3.3955703375808274e-06, "learning_rate": 0.0053848800531251075, "loss": 2.5303, "step": 13405 }, { "crossentropy": 2.3592355251312256, "epoch": 0.4860063805104408, "grad_norm": 0.02918073907494545, "grad_norm_var": 3.006534052139145e-06, "learning_rate": 0.005384300686925563, "loss": 2.4754, "step": 13406 }, { "crossentropy": 2.607668399810791, "epoch": 0.48604263341067283, "grad_norm": 0.03123236633837223, "grad_norm_var": 3.2925222941775627e-06, "learning_rate": 0.005383721315535455, "loss": 2.6002, "step": 13407 }, { "crossentropy": 2.5502381324768066, "epoch": 0.48607888631090485, "grad_norm": 0.029135217890143394, "grad_norm_var": 3.0407762162978073e-06, "learning_rate": 0.005383141938962607, "loss": 2.5656, "step": 13408 }, { "crossentropy": 2.5581352710723877, "epoch": 0.4861151392111369, "grad_norm": 0.02768167294561863, "grad_norm_var": 3.14300779151247e-06, "learning_rate": 0.005382562557214844, "loss": 2.5898, "step": 13409 }, { "crossentropy": 2.6404993534088135, "epoch": 0.48615139211136893, "grad_norm": 0.028212103992700577, "grad_norm_var": 2.459168483624994e-06, "learning_rate": 0.005381983170299995, "loss": 2.6805, "step": 13410 }, { "crossentropy": 2.4970531463623047, "epoch": 0.48618764501160094, "grad_norm": 0.02785622514784336, "grad_norm_var": 2.518810458193056e-06, "learning_rate": 0.0053814037782258855, "loss": 2.5509, "step": 13411 }, { "crossentropy": 2.49151611328125, "epoch": 0.48622389791183296, "grad_norm": 0.027001557871699333, "grad_norm_var": 2.230319780775859e-06, "learning_rate": 0.0053808243810003365, "loss": 2.4898, "step": 13412 }, { "crossentropy": 2.552929401397705, "epoch": 0.486260150812065, "grad_norm": 0.027401752769947052, "grad_norm_var": 2.287020692214369e-06, "learning_rate": 0.005380244978631175, "loss": 2.6145, "step": 13413 }, { "crossentropy": 2.6714916229248047, "epoch": 0.486296403712297, "grad_norm": 0.027843188494443893, "grad_norm_var": 1.7126158501835764e-06, "learning_rate": 0.005379665571126231, "loss": 2.5896, "step": 13414 }, { "crossentropy": 2.5822787284851074, "epoch": 0.486332656612529, "grad_norm": 0.027541832998394966, "grad_norm_var": 1.7496055198612988e-06, "learning_rate": 0.005379086158493328, "loss": 2.5672, "step": 13415 }, { "crossentropy": 2.5934739112854004, "epoch": 0.486368909512761, "grad_norm": 0.02691745012998581, "grad_norm_var": 1.8425673476480757e-06, "learning_rate": 0.0053785067407402885, "loss": 2.6249, "step": 13416 }, { "crossentropy": 2.5637686252593994, "epoch": 0.48640516241299303, "grad_norm": 0.026358317583799362, "grad_norm_var": 2.149477468222268e-06, "learning_rate": 0.005377927317874942, "loss": 2.5636, "step": 13417 }, { "crossentropy": 2.581450939178467, "epoch": 0.48644141531322505, "grad_norm": 0.02761063352227211, "grad_norm_var": 2.0411283455521914e-06, "learning_rate": 0.005377347889905113, "loss": 2.585, "step": 13418 }, { "crossentropy": 2.496821641921997, "epoch": 0.48647766821345706, "grad_norm": 0.025367911905050278, "grad_norm_var": 2.356283907193185e-06, "learning_rate": 0.00537676845683863, "loss": 2.5038, "step": 13419 }, { "crossentropy": 2.4561870098114014, "epoch": 0.4865139211136891, "grad_norm": 0.027389097958803177, "grad_norm_var": 1.6996711569772314e-06, "learning_rate": 0.005376189018683316, "loss": 2.4988, "step": 13420 }, { "crossentropy": 2.626657009124756, "epoch": 0.4865501740139211, "grad_norm": 0.02918202430009842, "grad_norm_var": 1.822119576426473e-06, "learning_rate": 0.005375609575446997, "loss": 2.6036, "step": 13421 }, { "crossentropy": 2.4839460849761963, "epoch": 0.48658642691415316, "grad_norm": 0.02840210683643818, "grad_norm_var": 1.7238822040660374e-06, "learning_rate": 0.005375030127137502, "loss": 2.4734, "step": 13422 }, { "crossentropy": 2.5381970405578613, "epoch": 0.4866226798143852, "grad_norm": 0.030185900628566742, "grad_norm_var": 1.3163194954076673e-06, "learning_rate": 0.005374450673762658, "loss": 2.5614, "step": 13423 }, { "crossentropy": 2.7917611598968506, "epoch": 0.4866589327146172, "grad_norm": 0.030169373378157616, "grad_norm_var": 1.573416215405685e-06, "learning_rate": 0.005373871215330287, "loss": 2.6884, "step": 13424 }, { "crossentropy": 2.423923969268799, "epoch": 0.4866951856148492, "grad_norm": 0.02616022527217865, "grad_norm_var": 1.7461669741066725e-06, "learning_rate": 0.005373291751848217, "loss": 2.4842, "step": 13425 }, { "crossentropy": 2.562732219696045, "epoch": 0.4867314385150812, "grad_norm": 0.026699621230363846, "grad_norm_var": 1.790906965727631e-06, "learning_rate": 0.005372712283324277, "loss": 2.5978, "step": 13426 }, { "crossentropy": 2.6352012157440186, "epoch": 0.48676769141531323, "grad_norm": 0.03063013032078743, "grad_norm_var": 2.35531971877889e-06, "learning_rate": 0.005372132809766291, "loss": 2.6869, "step": 13427 }, { "crossentropy": 2.586731195449829, "epoch": 0.48680394431554525, "grad_norm": 0.029395831748843193, "grad_norm_var": 2.4574924599090587e-06, "learning_rate": 0.005371553331182089, "loss": 2.5739, "step": 13428 }, { "crossentropy": 2.600890636444092, "epoch": 0.48684019721577726, "grad_norm": 0.02690390683710575, "grad_norm_var": 2.509605309582843e-06, "learning_rate": 0.005370973847579492, "loss": 2.6217, "step": 13429 }, { "crossentropy": 2.6004374027252197, "epoch": 0.4868764501160093, "grad_norm": 0.02704046480357647, "grad_norm_var": 2.5583504710190624e-06, "learning_rate": 0.005370394358966333, "loss": 2.5708, "step": 13430 }, { "crossentropy": 2.5632734298706055, "epoch": 0.4869127030162413, "grad_norm": 0.03386932611465454, "grad_norm_var": 4.78197375677751e-06, "learning_rate": 0.005369814865350435, "loss": 2.53, "step": 13431 }, { "crossentropy": 2.629192590713501, "epoch": 0.4869489559164733, "grad_norm": 0.02650618925690651, "grad_norm_var": 4.866582377504785e-06, "learning_rate": 0.0053692353667396265, "loss": 2.5179, "step": 13432 }, { "crossentropy": 2.4553334712982178, "epoch": 0.4869852088167053, "grad_norm": 0.02972973883152008, "grad_norm_var": 4.730255559867478e-06, "learning_rate": 0.0053686558631417325, "loss": 2.5253, "step": 13433 }, { "crossentropy": 2.590691566467285, "epoch": 0.48702146171693733, "grad_norm": 0.029019545763731003, "grad_norm_var": 4.696142216122642e-06, "learning_rate": 0.005368076354564584, "loss": 2.4926, "step": 13434 }, { "crossentropy": 2.51627254486084, "epoch": 0.48705771461716935, "grad_norm": 0.026428794488310814, "grad_norm_var": 4.3176883974626235e-06, "learning_rate": 0.0053674968410160045, "loss": 2.561, "step": 13435 }, { "crossentropy": 2.6659507751464844, "epoch": 0.4870939675174014, "grad_norm": 0.027022812515497208, "grad_norm_var": 4.3855545313206675e-06, "learning_rate": 0.005366917322503823, "loss": 2.6344, "step": 13436 }, { "crossentropy": 2.56107234954834, "epoch": 0.48713022041763343, "grad_norm": 0.02930876426398754, "grad_norm_var": 4.3966621761716646e-06, "learning_rate": 0.005366337799035864, "loss": 2.5512, "step": 13437 }, { "crossentropy": 2.691331148147583, "epoch": 0.48716647331786544, "grad_norm": 0.027999019250273705, "grad_norm_var": 4.4170254213719e-06, "learning_rate": 0.00536575827061996, "loss": 2.5543, "step": 13438 }, { "crossentropy": 2.600921630859375, "epoch": 0.48720272621809746, "grad_norm": 0.02779546193778515, "grad_norm_var": 4.258131464150253e-06, "learning_rate": 0.005365178737263934, "loss": 2.594, "step": 13439 }, { "crossentropy": 2.617351770401001, "epoch": 0.4872389791183295, "grad_norm": 0.028315333649516106, "grad_norm_var": 4.039888274989569e-06, "learning_rate": 0.005364599198975616, "loss": 2.6227, "step": 13440 }, { "crossentropy": 2.4924521446228027, "epoch": 0.4872752320185615, "grad_norm": 0.029529033228754997, "grad_norm_var": 3.7873539452151397e-06, "learning_rate": 0.005364019655762831, "loss": 2.5553, "step": 13441 }, { "crossentropy": 2.6511452198028564, "epoch": 0.4873114849187935, "grad_norm": 0.02952129952609539, "grad_norm_var": 3.6030643198232497e-06, "learning_rate": 0.005363440107633408, "loss": 2.5035, "step": 13442 }, { "crossentropy": 2.5729310512542725, "epoch": 0.4873477378190255, "grad_norm": 0.027503417804837227, "grad_norm_var": 3.404619986981554e-06, "learning_rate": 0.005362860554595177, "loss": 2.5333, "step": 13443 }, { "crossentropy": 2.628066301345825, "epoch": 0.48738399071925753, "grad_norm": 0.026371333748102188, "grad_norm_var": 3.6122862299513167e-06, "learning_rate": 0.005362280996655962, "loss": 2.6092, "step": 13444 }, { "crossentropy": 2.707606792449951, "epoch": 0.48742024361948955, "grad_norm": 0.027115171775221825, "grad_norm_var": 3.5756362603301763e-06, "learning_rate": 0.0053617014338235925, "loss": 2.7009, "step": 13445 }, { "crossentropy": 2.570824384689331, "epoch": 0.48745649651972156, "grad_norm": 0.027471203356981277, "grad_norm_var": 3.5139052123994796e-06, "learning_rate": 0.005361121866105896, "loss": 2.5707, "step": 13446 }, { "crossentropy": 2.572981595993042, "epoch": 0.4874927494199536, "grad_norm": 0.026107311248779297, "grad_norm_var": 1.5612629447766253e-06, "learning_rate": 0.005360542293510702, "loss": 2.6298, "step": 13447 }, { "crossentropy": 2.594745397567749, "epoch": 0.4875290023201856, "grad_norm": 0.028315430507063866, "grad_norm_var": 1.4394996040725342e-06, "learning_rate": 0.005359962716045836, "loss": 2.6235, "step": 13448 }, { "crossentropy": 2.509813070297241, "epoch": 0.48756525522041766, "grad_norm": 0.02667238563299179, "grad_norm_var": 1.3072181116767964e-06, "learning_rate": 0.005359383133719127, "loss": 2.4985, "step": 13449 }, { "crossentropy": 2.560861587524414, "epoch": 0.4876015081206497, "grad_norm": 0.0264974944293499, "grad_norm_var": 1.2882811102949791e-06, "learning_rate": 0.0053588035465384045, "loss": 2.5575, "step": 13450 }, { "crossentropy": 2.5516810417175293, "epoch": 0.4876377610208817, "grad_norm": 0.028050027787685394, "grad_norm_var": 1.1943265090822205e-06, "learning_rate": 0.005358223954511494, "loss": 2.5652, "step": 13451 }, { "crossentropy": 2.48964262008667, "epoch": 0.4876740139211137, "grad_norm": 0.028279047459363937, "grad_norm_var": 1.175391506565944e-06, "learning_rate": 0.005357644357646228, "loss": 2.539, "step": 13452 }, { "crossentropy": 2.377599000930786, "epoch": 0.4877102668213457, "grad_norm": 0.027657337486743927, "grad_norm_var": 1.014338975510211e-06, "learning_rate": 0.005357064755950429, "loss": 2.4502, "step": 13453 }, { "crossentropy": 2.6403539180755615, "epoch": 0.48774651972157773, "grad_norm": 0.02712896652519703, "grad_norm_var": 1.0269648635458096e-06, "learning_rate": 0.0053564851494319315, "loss": 2.6214, "step": 13454 }, { "crossentropy": 2.5807487964630127, "epoch": 0.48778277262180975, "grad_norm": 0.028362615033984184, "grad_norm_var": 1.0583982988754404e-06, "learning_rate": 0.005355905538098559, "loss": 2.5948, "step": 13455 }, { "crossentropy": 2.569270610809326, "epoch": 0.48781902552204176, "grad_norm": 0.027821946889162064, "grad_norm_var": 1.031888922263394e-06, "learning_rate": 0.005355325921958143, "loss": 2.5244, "step": 13456 }, { "crossentropy": 2.669274091720581, "epoch": 0.4878552784222738, "grad_norm": 0.02797997184097767, "grad_norm_var": 7.938169201301131e-07, "learning_rate": 0.00535474630101851, "loss": 2.7197, "step": 13457 }, { "crossentropy": 2.468686819076538, "epoch": 0.4878915313225058, "grad_norm": 0.026379650458693504, "grad_norm_var": 5.863774095143792e-07, "learning_rate": 0.005354166675287492, "loss": 2.5643, "step": 13458 }, { "crossentropy": 2.550887107849121, "epoch": 0.4879277842227378, "grad_norm": 0.027974378317594528, "grad_norm_var": 6.094292654495436e-07, "learning_rate": 0.005353587044772914, "loss": 2.5348, "step": 13459 }, { "crossentropy": 2.739548683166504, "epoch": 0.4879640371229698, "grad_norm": 0.026536433026194572, "grad_norm_var": 5.887854058885528e-07, "learning_rate": 0.0053530074094826074, "loss": 2.6467, "step": 13460 }, { "crossentropy": 2.5334596633911133, "epoch": 0.48800029002320183, "grad_norm": 0.028593657538294792, "grad_norm_var": 6.69880594566828e-07, "learning_rate": 0.005352427769424399, "loss": 2.5805, "step": 13461 }, { "crossentropy": 2.5821352005004883, "epoch": 0.48803654292343385, "grad_norm": 0.026625005528330803, "grad_norm_var": 7.166689001597097e-07, "learning_rate": 0.005351848124606119, "loss": 2.625, "step": 13462 }, { "crossentropy": 2.590261936187744, "epoch": 0.4880727958236659, "grad_norm": 0.029351748526096344, "grad_norm_var": 7.996345646789823e-07, "learning_rate": 0.005351268475035597, "loss": 2.5715, "step": 13463 }, { "crossentropy": 2.605778694152832, "epoch": 0.48810904872389793, "grad_norm": 0.026601647958159447, "grad_norm_var": 8.28662870438109e-07, "learning_rate": 0.005350688820720661, "loss": 2.5491, "step": 13464 }, { "crossentropy": 2.5662992000579834, "epoch": 0.48814530162412995, "grad_norm": 0.027794944122433662, "grad_norm_var": 7.787562058995107e-07, "learning_rate": 0.005350109161669139, "loss": 2.6064, "step": 13465 }, { "crossentropy": 2.5450148582458496, "epoch": 0.48818155452436196, "grad_norm": 0.028804464265704155, "grad_norm_var": 7.715913672813485e-07, "learning_rate": 0.005349529497888863, "loss": 2.5673, "step": 13466 }, { "crossentropy": 2.627798557281494, "epoch": 0.488217807424594, "grad_norm": 0.029933810234069824, "grad_norm_var": 1.069652376838442e-06, "learning_rate": 0.00534894982938766, "loss": 2.4934, "step": 13467 }, { "crossentropy": 2.446516752243042, "epoch": 0.488254060324826, "grad_norm": 0.026998987421393394, "grad_norm_var": 1.1012412306570744e-06, "learning_rate": 0.00534837015617336, "loss": 2.4638, "step": 13468 }, { "crossentropy": 2.523245096206665, "epoch": 0.488290313225058, "grad_norm": 0.027276156470179558, "grad_norm_var": 1.1167649006460847e-06, "learning_rate": 0.005347790478253792, "loss": 2.6346, "step": 13469 }, { "crossentropy": 2.720437526702881, "epoch": 0.48832656612529, "grad_norm": 0.02695784717798233, "grad_norm_var": 1.132998871703492e-06, "learning_rate": 0.005347210795636786, "loss": 2.6031, "step": 13470 }, { "crossentropy": 2.5341665744781494, "epoch": 0.48836281902552203, "grad_norm": 0.026384610682725906, "grad_norm_var": 1.2158518185326207e-06, "learning_rate": 0.005346631108330171, "loss": 2.5291, "step": 13471 }, { "crossentropy": 2.5800626277923584, "epoch": 0.48839907192575405, "grad_norm": 0.029153598472476006, "grad_norm_var": 1.361482086203652e-06, "learning_rate": 0.005346051416341778, "loss": 2.5562, "step": 13472 }, { "crossentropy": 2.61920166015625, "epoch": 0.48843532482598606, "grad_norm": 0.028576787561178207, "grad_norm_var": 1.4052921166027646e-06, "learning_rate": 0.005345471719679434, "loss": 2.5415, "step": 13473 }, { "crossentropy": 2.6840126514434814, "epoch": 0.4884715777262181, "grad_norm": 0.026324063539505005, "grad_norm_var": 1.415615636927052e-06, "learning_rate": 0.005344892018350972, "loss": 2.5863, "step": 13474 }, { "crossentropy": 2.5130503177642822, "epoch": 0.4885078306264501, "grad_norm": 0.030302852392196655, "grad_norm_var": 1.8263093289780693e-06, "learning_rate": 0.005344312312364219, "loss": 2.5342, "step": 13475 }, { "crossentropy": 2.7381319999694824, "epoch": 0.48854408352668216, "grad_norm": 0.036837007850408554, "grad_norm_var": 6.600679381686218e-06, "learning_rate": 0.0053437326017270045, "loss": 2.6916, "step": 13476 }, { "crossentropy": 2.582765817642212, "epoch": 0.4885803364269142, "grad_norm": 0.028941074386239052, "grad_norm_var": 6.611064126477566e-06, "learning_rate": 0.005343152886447162, "loss": 2.5954, "step": 13477 }, { "crossentropy": 2.4981086254119873, "epoch": 0.4886165893271462, "grad_norm": 0.026990476995706558, "grad_norm_var": 6.525411374128084e-06, "learning_rate": 0.005342573166532518, "loss": 2.5336, "step": 13478 }, { "crossentropy": 2.462205648422241, "epoch": 0.4886528422273782, "grad_norm": 0.028106769546866417, "grad_norm_var": 6.493658641580172e-06, "learning_rate": 0.005341993441990902, "loss": 2.5163, "step": 13479 }, { "crossentropy": 2.5591232776641846, "epoch": 0.4886890951276102, "grad_norm": 0.026351070031523705, "grad_norm_var": 6.560976539798924e-06, "learning_rate": 0.005341413712830146, "loss": 2.4976, "step": 13480 }, { "crossentropy": 2.5521485805511475, "epoch": 0.48872534802784223, "grad_norm": 0.02694125846028328, "grad_norm_var": 6.684889423840207e-06, "learning_rate": 0.005340833979058081, "loss": 2.5728, "step": 13481 }, { "crossentropy": 2.564436197280884, "epoch": 0.48876160092807425, "grad_norm": 0.027035599574446678, "grad_norm_var": 6.7921400005232216e-06, "learning_rate": 0.005340254240682535, "loss": 2.5397, "step": 13482 }, { "crossentropy": 2.594040870666504, "epoch": 0.48879785382830626, "grad_norm": 0.02700418420135975, "grad_norm_var": 6.697981875919354e-06, "learning_rate": 0.00533967449771134, "loss": 2.5547, "step": 13483 }, { "crossentropy": 2.5875797271728516, "epoch": 0.4888341067285383, "grad_norm": 0.026523804292082787, "grad_norm_var": 6.784157997549202e-06, "learning_rate": 0.005339094750152325, "loss": 2.5936, "step": 13484 }, { "crossentropy": 2.483919382095337, "epoch": 0.4888703596287703, "grad_norm": 0.027124935761094093, "grad_norm_var": 6.8023332314988414e-06, "learning_rate": 0.00533851499801332, "loss": 2.5201, "step": 13485 }, { "crossentropy": 2.5432791709899902, "epoch": 0.4889066125290023, "grad_norm": 0.02771012671291828, "grad_norm_var": 6.723417292993809e-06, "learning_rate": 0.005337935241302157, "loss": 2.5988, "step": 13486 }, { "crossentropy": 2.6207282543182373, "epoch": 0.4889428654292343, "grad_norm": 0.02810545079410076, "grad_norm_var": 6.504753743737506e-06, "learning_rate": 0.005337355480026665, "loss": 2.6039, "step": 13487 }, { "crossentropy": 2.7158803939819336, "epoch": 0.48897911832946633, "grad_norm": 0.02620369754731655, "grad_norm_var": 6.69393437959098e-06, "learning_rate": 0.005336775714194677, "loss": 2.6349, "step": 13488 }, { "crossentropy": 2.6891698837280273, "epoch": 0.4890153712296984, "grad_norm": 0.026176147162914276, "grad_norm_var": 6.891094207335524e-06, "learning_rate": 0.005336195943814021, "loss": 2.6229, "step": 13489 }, { "crossentropy": 2.4897677898406982, "epoch": 0.4890516241299304, "grad_norm": 0.026645410805940628, "grad_norm_var": 6.829279316623339e-06, "learning_rate": 0.005335616168892527, "loss": 2.5153, "step": 13490 }, { "crossentropy": 2.543165683746338, "epoch": 0.48908787703016243, "grad_norm": 0.03056146204471588, "grad_norm_var": 6.915019933514434e-06, "learning_rate": 0.005335036389438029, "loss": 2.735, "step": 13491 }, { "crossentropy": 2.6103367805480957, "epoch": 0.48912412993039445, "grad_norm": 0.028238145634531975, "grad_norm_var": 1.3513996651380213e-06, "learning_rate": 0.005334456605458357, "loss": 2.5487, "step": 13492 }, { "crossentropy": 2.5325677394866943, "epoch": 0.48916038283062646, "grad_norm": 0.0285897646099329, "grad_norm_var": 1.2876874330127457e-06, "learning_rate": 0.005333876816961338, "loss": 2.5734, "step": 13493 }, { "crossentropy": 2.632382392883301, "epoch": 0.4891966357308585, "grad_norm": 0.027480704709887505, "grad_norm_var": 1.2763142955266964e-06, "learning_rate": 0.0053332970239548085, "loss": 2.5784, "step": 13494 }, { "crossentropy": 2.585698366165161, "epoch": 0.4892328886310905, "grad_norm": 0.025802848860621452, "grad_norm_var": 1.3986068718448153e-06, "learning_rate": 0.005332717226446597, "loss": 2.5413, "step": 13495 }, { "crossentropy": 2.5585503578186035, "epoch": 0.4892691415313225, "grad_norm": 0.02689102292060852, "grad_norm_var": 1.3498858246182987e-06, "learning_rate": 0.005332137424444533, "loss": 2.5946, "step": 13496 }, { "crossentropy": 2.500964641571045, "epoch": 0.4893053944315545, "grad_norm": 0.028254633769392967, "grad_norm_var": 1.392306600382122e-06, "learning_rate": 0.00533155761795645, "loss": 2.4903, "step": 13497 }, { "crossentropy": 2.5242834091186523, "epoch": 0.48934164733178653, "grad_norm": 0.026169203221797943, "grad_norm_var": 1.4809412493857101e-06, "learning_rate": 0.005330977806990177, "loss": 2.5099, "step": 13498 }, { "crossentropy": 2.4236485958099365, "epoch": 0.48937790023201855, "grad_norm": 0.02912949211895466, "grad_norm_var": 1.6673522469003716e-06, "learning_rate": 0.005330397991553548, "loss": 2.5877, "step": 13499 }, { "crossentropy": 2.636092185974121, "epoch": 0.48941415313225056, "grad_norm": 0.028783347457647324, "grad_norm_var": 1.6997501689638406e-06, "learning_rate": 0.005329818171654392, "loss": 2.7, "step": 13500 }, { "crossentropy": 2.604403495788574, "epoch": 0.4894504060324826, "grad_norm": 0.029339056462049484, "grad_norm_var": 1.8609839754185476e-06, "learning_rate": 0.00532923834730054, "loss": 2.643, "step": 13501 }, { "crossentropy": 2.5609822273254395, "epoch": 0.4894866589327146, "grad_norm": 0.030954264104366302, "grad_norm_var": 2.499336756113466e-06, "learning_rate": 0.0053286585184998255, "loss": 2.5965, "step": 13502 }, { "crossentropy": 2.6027188301086426, "epoch": 0.48952291183294666, "grad_norm": 0.029604598879814148, "grad_norm_var": 2.6693173010631226e-06, "learning_rate": 0.00532807868526008, "loss": 2.5515, "step": 13503 }, { "crossentropy": 2.5673937797546387, "epoch": 0.4895591647331787, "grad_norm": 0.027861133217811584, "grad_norm_var": 2.4326648788342935e-06, "learning_rate": 0.005327498847589134, "loss": 2.6123, "step": 13504 }, { "crossentropy": 2.6321358680725098, "epoch": 0.4895954176334107, "grad_norm": 0.02773246541619301, "grad_norm_var": 2.1734017834747095e-06, "learning_rate": 0.005326919005494817, "loss": 2.6471, "step": 13505 }, { "crossentropy": 2.5271215438842773, "epoch": 0.4896316705336427, "grad_norm": 0.026973526924848557, "grad_norm_var": 2.1098289834398446e-06, "learning_rate": 0.005326339158984963, "loss": 2.5494, "step": 13506 }, { "crossentropy": 2.639970064163208, "epoch": 0.4896679234338747, "grad_norm": 0.028191236779093742, "grad_norm_var": 1.7376832156135025e-06, "learning_rate": 0.005325759308067406, "loss": 2.6324, "step": 13507 }, { "crossentropy": 2.4977152347564697, "epoch": 0.48970417633410673, "grad_norm": 0.02855776622891426, "grad_norm_var": 1.7489020037455652e-06, "learning_rate": 0.005325179452749973, "loss": 2.5317, "step": 13508 }, { "crossentropy": 2.5187175273895264, "epoch": 0.48974042923433875, "grad_norm": 0.028431590646505356, "grad_norm_var": 1.7410791632566178e-06, "learning_rate": 0.005324599593040498, "loss": 2.5843, "step": 13509 }, { "crossentropy": 2.474916696548462, "epoch": 0.48977668213457076, "grad_norm": 0.026913978159427643, "grad_norm_var": 1.8105790385131506e-06, "learning_rate": 0.005324019728946813, "loss": 2.551, "step": 13510 }, { "crossentropy": 2.5903539657592773, "epoch": 0.4898129350348028, "grad_norm": 0.0273230392485857, "grad_norm_var": 1.489525527062265e-06, "learning_rate": 0.00532343986047675, "loss": 2.5808, "step": 13511 }, { "crossentropy": 2.59769344329834, "epoch": 0.4898491879350348, "grad_norm": 0.027060626074671745, "grad_norm_var": 1.461849168264565e-06, "learning_rate": 0.005322859987638142, "loss": 2.6451, "step": 13512 }, { "crossentropy": 2.614405632019043, "epoch": 0.4898854408352668, "grad_norm": 0.02736218273639679, "grad_norm_var": 1.5057220695789224e-06, "learning_rate": 0.005322280110438817, "loss": 2.5864, "step": 13513 }, { "crossentropy": 2.623716115951538, "epoch": 0.4899216937354988, "grad_norm": 0.026486527174711227, "grad_norm_var": 1.4282412744060294e-06, "learning_rate": 0.005321700228886612, "loss": 2.5741, "step": 13514 }, { "crossentropy": 2.5629165172576904, "epoch": 0.48995794663573083, "grad_norm": 0.026959678158164024, "grad_norm_var": 1.444633539595073e-06, "learning_rate": 0.005321120342989357, "loss": 2.4985, "step": 13515 }, { "crossentropy": 2.5700089931488037, "epoch": 0.4899941995359629, "grad_norm": 0.027224240824580193, "grad_norm_var": 1.4406676672699582e-06, "learning_rate": 0.005320540452754886, "loss": 2.6191, "step": 13516 }, { "crossentropy": 2.5057404041290283, "epoch": 0.4900304524361949, "grad_norm": 0.02633073180913925, "grad_norm_var": 1.4435116005222786e-06, "learning_rate": 0.0053199605581910284, "loss": 2.5958, "step": 13517 }, { "crossentropy": 2.5005602836608887, "epoch": 0.49006670533642693, "grad_norm": 0.02723604440689087, "grad_norm_var": 7.180252288685318e-07, "learning_rate": 0.005319380659305617, "loss": 2.5, "step": 13518 }, { "crossentropy": 2.571289539337158, "epoch": 0.49010295823665895, "grad_norm": 0.027532564476132393, "grad_norm_var": 4.092237711061005e-07, "learning_rate": 0.005318800756106486, "loss": 2.5124, "step": 13519 }, { "crossentropy": 2.599534034729004, "epoch": 0.49013921113689096, "grad_norm": 0.027221256867051125, "grad_norm_var": 3.94284116315618e-07, "learning_rate": 0.005318220848601468, "loss": 2.6217, "step": 13520 }, { "crossentropy": 2.4713594913482666, "epoch": 0.490175464037123, "grad_norm": 0.026753870770335197, "grad_norm_var": 4.0372322531404685e-07, "learning_rate": 0.0053176409367983925, "loss": 2.6092, "step": 13521 }, { "crossentropy": 2.455463409423828, "epoch": 0.490211716937355, "grad_norm": 0.027980852872133255, "grad_norm_var": 4.253178938104854e-07, "learning_rate": 0.005317061020705095, "loss": 2.5413, "step": 13522 }, { "crossentropy": 2.609546184539795, "epoch": 0.490247969837587, "grad_norm": 0.03495987877249718, "grad_norm_var": 4.04983634554241e-06, "learning_rate": 0.005316481100329408, "loss": 2.5621, "step": 13523 }, { "crossentropy": 2.8161423206329346, "epoch": 0.490284222737819, "grad_norm": 0.02714807726442814, "grad_norm_var": 4.026144605283795e-06, "learning_rate": 0.005315901175679163, "loss": 2.7197, "step": 13524 }, { "crossentropy": 2.699932098388672, "epoch": 0.49032047563805103, "grad_norm": 0.026937376707792282, "grad_norm_var": 4.016510591905072e-06, "learning_rate": 0.005315321246762193, "loss": 2.6273, "step": 13525 }, { "crossentropy": 2.6426074504852295, "epoch": 0.49035672853828305, "grad_norm": 0.02608717791736126, "grad_norm_var": 4.133697658793021e-06, "learning_rate": 0.005314741313586331, "loss": 2.613, "step": 13526 }, { "crossentropy": 2.7711453437805176, "epoch": 0.49039298143851506, "grad_norm": 0.0271313339471817, "grad_norm_var": 4.141482951449706e-06, "learning_rate": 0.00531416137615941, "loss": 2.7001, "step": 13527 }, { "crossentropy": 2.587610960006714, "epoch": 0.4904292343387471, "grad_norm": 0.02594090811908245, "grad_norm_var": 4.28928841125862e-06, "learning_rate": 0.005313581434489263, "loss": 2.5512, "step": 13528 }, { "crossentropy": 2.49099063873291, "epoch": 0.4904654872389791, "grad_norm": 0.027542587369680405, "grad_norm_var": 4.289070806751116e-06, "learning_rate": 0.005313001488583722, "loss": 2.5033, "step": 13529 }, { "crossentropy": 2.5881738662719727, "epoch": 0.49050174013921116, "grad_norm": 0.02820552885532379, "grad_norm_var": 4.249015774771397e-06, "learning_rate": 0.005312421538450621, "loss": 2.6534, "step": 13530 }, { "crossentropy": 2.4745872020721436, "epoch": 0.4905379930394432, "grad_norm": 0.027495240792632103, "grad_norm_var": 4.223038582106391e-06, "learning_rate": 0.005311841584097794, "loss": 2.4322, "step": 13531 }, { "crossentropy": 2.4441893100738525, "epoch": 0.4905742459396752, "grad_norm": 0.02712327614426613, "grad_norm_var": 4.228841572273415e-06, "learning_rate": 0.005311261625533073, "loss": 2.5338, "step": 13532 }, { "crossentropy": 2.5681347846984863, "epoch": 0.4906104988399072, "grad_norm": 0.026913633570075035, "grad_norm_var": 4.151299924306842e-06, "learning_rate": 0.005310681662764292, "loss": 2.5481, "step": 13533 }, { "crossentropy": 2.5541934967041016, "epoch": 0.4906467517401392, "grad_norm": 0.02711309678852558, "grad_norm_var": 4.158835594364112e-06, "learning_rate": 0.005310101695799282, "loss": 2.4783, "step": 13534 }, { "crossentropy": 2.524785041809082, "epoch": 0.49068300464037123, "grad_norm": 0.02615957334637642, "grad_norm_var": 4.294567930978171e-06, "learning_rate": 0.005309521724645879, "loss": 2.4987, "step": 13535 }, { "crossentropy": 2.6136908531188965, "epoch": 0.49071925754060325, "grad_norm": 0.027006398886442184, "grad_norm_var": 4.30671635050088e-06, "learning_rate": 0.005308941749311915, "loss": 2.574, "step": 13536 }, { "crossentropy": 2.5176479816436768, "epoch": 0.49075551044083526, "grad_norm": 0.028198281303048134, "grad_norm_var": 4.287411794060567e-06, "learning_rate": 0.005308361769805224, "loss": 2.5495, "step": 13537 }, { "crossentropy": 2.65846586227417, "epoch": 0.4907917633410673, "grad_norm": 0.029124949127435684, "grad_norm_var": 4.4240468738214555e-06, "learning_rate": 0.00530778178613364, "loss": 2.5589, "step": 13538 }, { "crossentropy": 2.3946826457977295, "epoch": 0.4908280162412993, "grad_norm": 0.026029884815216064, "grad_norm_var": 7.556212241595679e-07, "learning_rate": 0.005307201798304995, "loss": 2.5599, "step": 13539 }, { "crossentropy": 2.697378158569336, "epoch": 0.4908642691415313, "grad_norm": 0.028490735217928886, "grad_norm_var": 8.706629092502013e-07, "learning_rate": 0.005306621806327124, "loss": 2.7263, "step": 13540 }, { "crossentropy": 2.6102466583251953, "epoch": 0.4909005220417633, "grad_norm": 0.027995191514492035, "grad_norm_var": 9.009133756169984e-07, "learning_rate": 0.00530604181020786, "loss": 2.5809, "step": 13541 }, { "crossentropy": 2.4789135456085205, "epoch": 0.49093677494199534, "grad_norm": 0.02609184756875038, "grad_norm_var": 9.001690359619966e-07, "learning_rate": 0.005305461809955037, "loss": 2.5048, "step": 13542 }, { "crossentropy": 2.581178903579712, "epoch": 0.4909730278422274, "grad_norm": 0.02679342031478882, "grad_norm_var": 9.142360325809298e-07, "learning_rate": 0.005304881805576488, "loss": 2.6079, "step": 13543 }, { "crossentropy": 2.6475982666015625, "epoch": 0.4910092807424594, "grad_norm": 0.027572015300393105, "grad_norm_var": 7.927631315921002e-07, "learning_rate": 0.005304301797080049, "loss": 2.5962, "step": 13544 }, { "crossentropy": 2.693009853363037, "epoch": 0.49104553364269143, "grad_norm": 0.02923276647925377, "grad_norm_var": 1.0111072342088486e-06, "learning_rate": 0.005303721784473552, "loss": 2.6759, "step": 13545 }, { "crossentropy": 2.5517191886901855, "epoch": 0.49108178654292345, "grad_norm": 0.025931060314178467, "grad_norm_var": 1.1118641511103275e-06, "learning_rate": 0.005303141767764831, "loss": 2.6217, "step": 13546 }, { "crossentropy": 2.4767162799835205, "epoch": 0.49111803944315546, "grad_norm": 0.027224326506257057, "grad_norm_var": 1.1104630184290953e-06, "learning_rate": 0.005302561746961719, "loss": 2.6058, "step": 13547 }, { "crossentropy": 2.6118216514587402, "epoch": 0.4911542923433875, "grad_norm": 0.029404504224658012, "grad_norm_var": 1.378149387748018e-06, "learning_rate": 0.005301981722072053, "loss": 2.5858, "step": 13548 }, { "crossentropy": 2.7363007068634033, "epoch": 0.4911905452436195, "grad_norm": 0.030962616205215454, "grad_norm_var": 2.1104693611035304e-06, "learning_rate": 0.0053014016931036655, "loss": 2.6912, "step": 13549 }, { "crossentropy": 2.6970207691192627, "epoch": 0.4912267981438515, "grad_norm": 0.028406735509634018, "grad_norm_var": 2.1124224441478826e-06, "learning_rate": 0.005300821660064391, "loss": 2.6964, "step": 13550 }, { "crossentropy": 2.6293253898620605, "epoch": 0.4912630510440835, "grad_norm": 0.02701704576611519, "grad_norm_var": 1.9720821487464403e-06, "learning_rate": 0.005300241622962063, "loss": 2.5916, "step": 13551 }, { "crossentropy": 2.5944643020629883, "epoch": 0.49129930394431554, "grad_norm": 0.027090590447187424, "grad_norm_var": 1.9631382265579277e-06, "learning_rate": 0.005299661581804517, "loss": 2.6445, "step": 13552 }, { "crossentropy": 2.5544841289520264, "epoch": 0.49133555684454755, "grad_norm": 0.02780287154018879, "grad_norm_var": 1.954436058340077e-06, "learning_rate": 0.005299081536599586, "loss": 2.5752, "step": 13553 }, { "crossentropy": 2.592900037765503, "epoch": 0.49137180974477956, "grad_norm": 0.02569602243602276, "grad_norm_var": 2.0941169588653897e-06, "learning_rate": 0.005298501487355105, "loss": 2.5929, "step": 13554 }, { "crossentropy": 2.6331686973571777, "epoch": 0.4914080626450116, "grad_norm": 0.02701508440077305, "grad_norm_var": 1.9473675379498886e-06, "learning_rate": 0.005297921434078908, "loss": 2.562, "step": 13555 }, { "crossentropy": 2.4979941844940186, "epoch": 0.4914443155452436, "grad_norm": 0.02641790732741356, "grad_norm_var": 1.9891916675289385e-06, "learning_rate": 0.00529734137677883, "loss": 2.5272, "step": 13556 }, { "crossentropy": 2.5756337642669678, "epoch": 0.49148056844547566, "grad_norm": 0.026617281138896942, "grad_norm_var": 2.024388888911763e-06, "learning_rate": 0.005296761315462706, "loss": 2.613, "step": 13557 }, { "crossentropy": 2.402218818664551, "epoch": 0.4915168213457077, "grad_norm": 0.027151690796017647, "grad_norm_var": 1.901997219956467e-06, "learning_rate": 0.00529618125013837, "loss": 2.4739, "step": 13558 }, { "crossentropy": 2.637944459915161, "epoch": 0.4915530742459397, "grad_norm": 0.02807028777897358, "grad_norm_var": 1.8800275693638008e-06, "learning_rate": 0.005295601180813656, "loss": 2.6096, "step": 13559 }, { "crossentropy": 2.4240469932556152, "epoch": 0.4915893271461717, "grad_norm": 0.027866579592227936, "grad_norm_var": 1.8843200358878517e-06, "learning_rate": 0.0052950211074964, "loss": 2.4619, "step": 13560 }, { "crossentropy": 2.6418986320495605, "epoch": 0.4916255800464037, "grad_norm": 0.02729218453168869, "grad_norm_var": 1.7021878718727433e-06, "learning_rate": 0.005294441030194437, "loss": 2.591, "step": 13561 }, { "crossentropy": 2.550354480743408, "epoch": 0.49166183294663574, "grad_norm": 0.026410862803459167, "grad_norm_var": 1.616337992703951e-06, "learning_rate": 0.005293860948915601, "loss": 2.5509, "step": 13562 }, { "crossentropy": 2.501316547393799, "epoch": 0.49169808584686775, "grad_norm": 0.026906926184892654, "grad_norm_var": 1.6354821764250072e-06, "learning_rate": 0.005293280863667725, "loss": 2.5833, "step": 13563 }, { "crossentropy": 2.5615928173065186, "epoch": 0.49173433874709976, "grad_norm": 0.027589043602347374, "grad_norm_var": 1.3824232708739095e-06, "learning_rate": 0.005292700774458647, "loss": 2.6592, "step": 13564 }, { "crossentropy": 2.6789700984954834, "epoch": 0.4917705916473318, "grad_norm": 0.026855945587158203, "grad_norm_var": 4.827852193876881e-07, "learning_rate": 0.005292120681296202, "loss": 2.6543, "step": 13565 }, { "crossentropy": 2.6676769256591797, "epoch": 0.4918068445475638, "grad_norm": 0.02660592459142208, "grad_norm_var": 3.808199012973601e-07, "learning_rate": 0.005291540584188222, "loss": 2.645, "step": 13566 }, { "crossentropy": 2.635077714920044, "epoch": 0.4918430974477958, "grad_norm": 0.026650894433259964, "grad_norm_var": 3.896064691993276e-07, "learning_rate": 0.005290960483142546, "loss": 2.6269, "step": 13567 }, { "crossentropy": 2.734666109085083, "epoch": 0.4918793503480278, "grad_norm": 0.02848811447620392, "grad_norm_var": 5.280868944182535e-07, "learning_rate": 0.005290380378167005, "loss": 2.6664, "step": 13568 }, { "crossentropy": 2.5345895290374756, "epoch": 0.49191560324825984, "grad_norm": 0.02868269756436348, "grad_norm_var": 6.601122576270491e-07, "learning_rate": 0.005289800269269438, "loss": 2.5563, "step": 13569 }, { "crossentropy": 2.6632654666900635, "epoch": 0.4919518561484919, "grad_norm": 0.02759038470685482, "grad_norm_var": 5.184554583393768e-07, "learning_rate": 0.005289220156457678, "loss": 2.6425, "step": 13570 }, { "crossentropy": 2.4751479625701904, "epoch": 0.4919881090487239, "grad_norm": 0.027125917375087738, "grad_norm_var": 5.15556056836216e-07, "learning_rate": 0.0052886400397395604, "loss": 2.5495, "step": 13571 }, { "crossentropy": 2.6791913509368896, "epoch": 0.49202436194895594, "grad_norm": 0.031658533960580826, "grad_norm_var": 1.6365511830442134e-06, "learning_rate": 0.005288059919122921, "loss": 2.6231, "step": 13572 }, { "crossentropy": 2.4949445724487305, "epoch": 0.49206061484918795, "grad_norm": 0.027855807915329933, "grad_norm_var": 1.570518920734204e-06, "learning_rate": 0.005287479794615596, "loss": 2.5653, "step": 13573 }, { "crossentropy": 2.628338098526001, "epoch": 0.49209686774941996, "grad_norm": 0.02725401520729065, "grad_norm_var": 1.5640321413574644e-06, "learning_rate": 0.005286899666225419, "loss": 2.6212, "step": 13574 }, { "crossentropy": 2.576770782470703, "epoch": 0.492133120649652, "grad_norm": 0.028685908764600754, "grad_norm_var": 1.6196311377706756e-06, "learning_rate": 0.005286319533960229, "loss": 2.613, "step": 13575 }, { "crossentropy": 2.414560079574585, "epoch": 0.492169373549884, "grad_norm": 0.02646029181778431, "grad_norm_var": 1.7157465257406814e-06, "learning_rate": 0.005285739397827857, "loss": 2.5223, "step": 13576 }, { "crossentropy": 2.6141161918640137, "epoch": 0.492205626450116, "grad_norm": 0.029847851023077965, "grad_norm_var": 2.0081360199562805e-06, "learning_rate": 0.0052851592578361415, "loss": 2.6256, "step": 13577 }, { "crossentropy": 2.5875165462493896, "epoch": 0.492241879350348, "grad_norm": 0.029425259679555893, "grad_norm_var": 2.0210140612888483e-06, "learning_rate": 0.005284579113992917, "loss": 2.6474, "step": 13578 }, { "crossentropy": 2.595492362976074, "epoch": 0.49227813225058004, "grad_norm": 0.02692233957350254, "grad_norm_var": 2.018823164017302e-06, "learning_rate": 0.00528399896630602, "loss": 2.5983, "step": 13579 }, { "crossentropy": 2.54799747467041, "epoch": 0.49231438515081205, "grad_norm": 0.028046004474163055, "grad_norm_var": 2.007981667935937e-06, "learning_rate": 0.005283418814783287, "loss": 2.5346, "step": 13580 }, { "crossentropy": 2.4573302268981934, "epoch": 0.49235063805104406, "grad_norm": 0.028846178203821182, "grad_norm_var": 1.949369033257357e-06, "learning_rate": 0.005282838659432552, "loss": 2.5629, "step": 13581 }, { "crossentropy": 2.5588395595550537, "epoch": 0.4923868909512761, "grad_norm": 0.028632046654820442, "grad_norm_var": 1.7930973650052856e-06, "learning_rate": 0.0052822585002616496, "loss": 2.557, "step": 13582 }, { "crossentropy": 2.6177690029144287, "epoch": 0.4924231438515081, "grad_norm": 0.027781672775745392, "grad_norm_var": 1.6302926359952156e-06, "learning_rate": 0.005281678337278422, "loss": 2.6325, "step": 13583 }, { "crossentropy": 2.5025460720062256, "epoch": 0.49245939675174016, "grad_norm": 0.026951070874929428, "grad_norm_var": 1.7458401392985688e-06, "learning_rate": 0.005281098170490698, "loss": 2.5344, "step": 13584 }, { "crossentropy": 2.530569076538086, "epoch": 0.4924956496519722, "grad_norm": 0.027260348200798035, "grad_norm_var": 1.7874490707191216e-06, "learning_rate": 0.005280517999906317, "loss": 2.6192, "step": 13585 }, { "crossentropy": 2.66302490234375, "epoch": 0.4925319025522042, "grad_norm": 0.026672210544347763, "grad_norm_var": 1.9082179117735673e-06, "learning_rate": 0.005279937825533114, "loss": 2.6698, "step": 13586 }, { "crossentropy": 2.5308966636657715, "epoch": 0.4925681554524362, "grad_norm": 0.026101240888237953, "grad_norm_var": 2.1054327217796356e-06, "learning_rate": 0.005279357647378928, "loss": 2.5929, "step": 13587 }, { "crossentropy": 2.6386494636535645, "epoch": 0.4926044083526682, "grad_norm": 0.028133779764175415, "grad_norm_var": 1.1743070314447375e-06, "learning_rate": 0.0052787774654515915, "loss": 2.5182, "step": 13588 }, { "crossentropy": 2.662579298019409, "epoch": 0.49264066125290024, "grad_norm": 0.027973880991339684, "grad_norm_var": 1.1759821423931265e-06, "learning_rate": 0.0052781972797589435, "loss": 2.547, "step": 13589 }, { "crossentropy": 2.557866096496582, "epoch": 0.49267691415313225, "grad_norm": 0.027025289833545685, "grad_norm_var": 1.1962725540877e-06, "learning_rate": 0.005277617090308817, "loss": 2.6289, "step": 13590 }, { "crossentropy": 2.647041082382202, "epoch": 0.49271316705336426, "grad_norm": 0.02775219827890396, "grad_norm_var": 1.140200613155134e-06, "learning_rate": 0.005277036897109053, "loss": 2.6524, "step": 13591 }, { "crossentropy": 2.7121899127960205, "epoch": 0.4927494199535963, "grad_norm": 0.02706874907016754, "grad_norm_var": 1.0595619504497142e-06, "learning_rate": 0.005276456700167484, "loss": 2.6231, "step": 13592 }, { "crossentropy": 2.6560912132263184, "epoch": 0.4927856728538283, "grad_norm": 0.028646331280469894, "grad_norm_var": 8.181155822526801e-07, "learning_rate": 0.005275876499491946, "loss": 2.5495, "step": 13593 }, { "crossentropy": 2.4422502517700195, "epoch": 0.4928219257540603, "grad_norm": 0.025432538241147995, "grad_norm_var": 8.972997995774594e-07, "learning_rate": 0.005275296295090279, "loss": 2.5004, "step": 13594 }, { "crossentropy": 2.528006076812744, "epoch": 0.4928581786542923, "grad_norm": 0.025265712291002274, "grad_norm_var": 1.186010605106026e-06, "learning_rate": 0.005274716086970318, "loss": 2.5328, "step": 13595 }, { "crossentropy": 2.5835928916931152, "epoch": 0.49289443155452434, "grad_norm": 0.03011118248105049, "grad_norm_var": 1.6444053334293731e-06, "learning_rate": 0.005274135875139901, "loss": 2.6835, "step": 13596 }, { "crossentropy": 2.6732177734375, "epoch": 0.4929306844547564, "grad_norm": 0.026134518906474113, "grad_norm_var": 1.6094480644490705e-06, "learning_rate": 0.005273555659606859, "loss": 2.6693, "step": 13597 }, { "crossentropy": 2.665877103805542, "epoch": 0.4929669373549884, "grad_norm": 0.02612907625734806, "grad_norm_var": 1.5594366839351282e-06, "learning_rate": 0.0052729754403790345, "loss": 2.6449, "step": 13598 }, { "crossentropy": 2.658595561981201, "epoch": 0.49300319025522044, "grad_norm": 0.029276559129357338, "grad_norm_var": 1.824512565453878e-06, "learning_rate": 0.005272395217464263, "loss": 2.5678, "step": 13599 }, { "crossentropy": 2.5422396659851074, "epoch": 0.49303944315545245, "grad_norm": 0.030202265828847885, "grad_norm_var": 2.3573403162248923e-06, "learning_rate": 0.005271814990870382, "loss": 2.6063, "step": 13600 }, { "crossentropy": 2.2851145267486572, "epoch": 0.49307569605568446, "grad_norm": 0.02565772831439972, "grad_norm_var": 2.558201473410931e-06, "learning_rate": 0.005271234760605223, "loss": 2.3261, "step": 13601 }, { "crossentropy": 2.6192195415496826, "epoch": 0.4931119489559165, "grad_norm": 0.028020799160003662, "grad_norm_var": 2.550183228811744e-06, "learning_rate": 0.005270654526676629, "loss": 2.5742, "step": 13602 }, { "crossentropy": 2.7224435806274414, "epoch": 0.4931482018561485, "grad_norm": 0.027406370267271996, "grad_norm_var": 2.424852457611953e-06, "learning_rate": 0.005270074289092436, "loss": 2.7271, "step": 13603 }, { "crossentropy": 2.4028732776641846, "epoch": 0.4931844547563805, "grad_norm": 0.03430705517530441, "grad_norm_var": 5.316160615228164e-06, "learning_rate": 0.00526949404786048, "loss": 2.5682, "step": 13604 }, { "crossentropy": 2.6054515838623047, "epoch": 0.4932207076566125, "grad_norm": 0.027182750403881073, "grad_norm_var": 5.347552934393112e-06, "learning_rate": 0.005268913802988597, "loss": 2.5662, "step": 13605 }, { "crossentropy": 2.5326454639434814, "epoch": 0.49325696055684454, "grad_norm": 0.028044817969202995, "grad_norm_var": 5.300246603200442e-06, "learning_rate": 0.005268333554484624, "loss": 2.6233, "step": 13606 }, { "crossentropy": 2.542743444442749, "epoch": 0.49329321345707655, "grad_norm": 0.030276557430624962, "grad_norm_var": 5.643753256466313e-06, "learning_rate": 0.0052677533023564, "loss": 2.5646, "step": 13607 }, { "crossentropy": 2.5593230724334717, "epoch": 0.49332946635730857, "grad_norm": 0.030808083713054657, "grad_norm_var": 6.0171252826484355e-06, "learning_rate": 0.005267173046611763, "loss": 2.6479, "step": 13608 }, { "crossentropy": 2.6013128757476807, "epoch": 0.4933657192575406, "grad_norm": 0.027747943997383118, "grad_norm_var": 6.026849958034632e-06, "learning_rate": 0.005266592787258546, "loss": 2.5625, "step": 13609 }, { "crossentropy": 2.640817642211914, "epoch": 0.4934019721577726, "grad_norm": 0.0280354805290699, "grad_norm_var": 5.472395474090339e-06, "learning_rate": 0.00526601252430459, "loss": 2.6157, "step": 13610 }, { "crossentropy": 2.563032865524292, "epoch": 0.49343822505800466, "grad_norm": 0.028330374509096146, "grad_norm_var": 4.773383345577954e-06, "learning_rate": 0.0052654322577577305, "loss": 2.5162, "step": 13611 }, { "crossentropy": 2.5597097873687744, "epoch": 0.4934744779582367, "grad_norm": 0.02665177546441555, "grad_norm_var": 4.826375608918626e-06, "learning_rate": 0.005264851987625806, "loss": 2.4425, "step": 13612 }, { "crossentropy": 2.5210416316986084, "epoch": 0.4935107308584687, "grad_norm": 0.025807959958910942, "grad_norm_var": 4.93117122349015e-06, "learning_rate": 0.005264271713916654, "loss": 2.475, "step": 13613 }, { "crossentropy": 2.4743709564208984, "epoch": 0.4935469837587007, "grad_norm": 0.02696649730205536, "grad_norm_var": 4.725028029216721e-06, "learning_rate": 0.00526369143663811, "loss": 2.5519, "step": 13614 }, { "crossentropy": 2.4865314960479736, "epoch": 0.4935832366589327, "grad_norm": 0.02612444758415222, "grad_norm_var": 4.986099295254175e-06, "learning_rate": 0.005263111155798015, "loss": 2.5393, "step": 13615 }, { "crossentropy": 2.6298179626464844, "epoch": 0.49361948955916474, "grad_norm": 0.02696489542722702, "grad_norm_var": 4.786864300276499e-06, "learning_rate": 0.005262530871404203, "loss": 2.5682, "step": 13616 }, { "crossentropy": 2.736478567123413, "epoch": 0.49365574245939675, "grad_norm": 0.025855490937829018, "grad_norm_var": 4.726997163010301e-06, "learning_rate": 0.005261950583464514, "loss": 2.6353, "step": 13617 }, { "crossentropy": 2.482761859893799, "epoch": 0.49369199535962877, "grad_norm": 0.025504808872938156, "grad_norm_var": 5.126797253779345e-06, "learning_rate": 0.005261370291986784, "loss": 2.5195, "step": 13618 }, { "crossentropy": 2.639491558074951, "epoch": 0.4937282482598608, "grad_norm": 0.027414685115218163, "grad_norm_var": 5.126280969355799e-06, "learning_rate": 0.005260789996978852, "loss": 2.6112, "step": 13619 }, { "crossentropy": 2.611543893814087, "epoch": 0.4937645011600928, "grad_norm": 0.028962107375264168, "grad_norm_var": 2.328995860841398e-06, "learning_rate": 0.005260209698448556, "loss": 2.6528, "step": 13620 }, { "crossentropy": 2.5161659717559814, "epoch": 0.4938007540603248, "grad_norm": 0.029516080394387245, "grad_norm_var": 2.557376460331449e-06, "learning_rate": 0.005259629396403732, "loss": 2.5101, "step": 13621 }, { "crossentropy": 2.663313865661621, "epoch": 0.4938370069605568, "grad_norm": 0.029332462698221207, "grad_norm_var": 2.7222209144546803e-06, "learning_rate": 0.005259049090852221, "loss": 2.6184, "step": 13622 }, { "crossentropy": 2.624764919281006, "epoch": 0.49387325986078884, "grad_norm": 0.029311800375580788, "grad_norm_var": 2.4578003838792204e-06, "learning_rate": 0.005258468781801856, "loss": 2.6113, "step": 13623 }, { "crossentropy": 2.5108561515808105, "epoch": 0.4939095127610209, "grad_norm": 0.03149845078587532, "grad_norm_var": 2.772908068407913e-06, "learning_rate": 0.00525788846926048, "loss": 2.5146, "step": 13624 }, { "crossentropy": 2.4060959815979004, "epoch": 0.4939457656612529, "grad_norm": 0.029586629942059517, "grad_norm_var": 2.983314834122582e-06, "learning_rate": 0.005257308153235928, "loss": 2.5386, "step": 13625 }, { "crossentropy": 2.3353941440582275, "epoch": 0.49398201856148494, "grad_norm": 0.02749416045844555, "grad_norm_var": 2.9894324696893286e-06, "learning_rate": 0.0052567278337360395, "loss": 2.5083, "step": 13626 }, { "crossentropy": 2.7018253803253174, "epoch": 0.49401827146171695, "grad_norm": 0.027425890788435936, "grad_norm_var": 2.9805403618030426e-06, "learning_rate": 0.005256147510768651, "loss": 2.6524, "step": 13627 }, { "crossentropy": 2.6682395935058594, "epoch": 0.49405452436194897, "grad_norm": 0.027757981792092323, "grad_norm_var": 2.89118480716379e-06, "learning_rate": 0.005255567184341603, "loss": 2.7089, "step": 13628 }, { "crossentropy": 2.6874570846557617, "epoch": 0.494090777262181, "grad_norm": 0.031219758093357086, "grad_norm_var": 3.2515878127210014e-06, "learning_rate": 0.005254986854462731, "loss": 2.6663, "step": 13629 }, { "crossentropy": 2.4909849166870117, "epoch": 0.494127030162413, "grad_norm": 0.02942761965095997, "grad_norm_var": 3.2307959743751425e-06, "learning_rate": 0.005254406521139877, "loss": 2.5792, "step": 13630 }, { "crossentropy": 2.5466227531433105, "epoch": 0.494163283062645, "grad_norm": 0.02628745697438717, "grad_norm_var": 3.184360659624893e-06, "learning_rate": 0.005253826184380873, "loss": 2.598, "step": 13631 }, { "crossentropy": 2.544830560684204, "epoch": 0.494199535962877, "grad_norm": 0.026823172345757484, "grad_norm_var": 3.211742593493307e-06, "learning_rate": 0.005253245844193564, "loss": 2.6028, "step": 13632 }, { "crossentropy": 2.706428050994873, "epoch": 0.49423578886310904, "grad_norm": 0.027460139244794846, "grad_norm_var": 2.841391913226696e-06, "learning_rate": 0.005252665500585785, "loss": 2.6508, "step": 13633 }, { "crossentropy": 2.6389899253845215, "epoch": 0.49427204176334105, "grad_norm": 0.026512863114476204, "grad_norm_var": 2.5105329188293902e-06, "learning_rate": 0.005252085153565375, "loss": 2.6032, "step": 13634 }, { "crossentropy": 2.5810387134552, "epoch": 0.49430829466357307, "grad_norm": 0.02740221656858921, "grad_norm_var": 2.5123501899047366e-06, "learning_rate": 0.005251504803140171, "loss": 2.6384, "step": 13635 }, { "crossentropy": 2.481231451034546, "epoch": 0.4943445475638051, "grad_norm": 0.02802996337413788, "grad_norm_var": 2.5093685145312205e-06, "learning_rate": 0.005250924449318016, "loss": 2.5608, "step": 13636 }, { "crossentropy": 2.483391523361206, "epoch": 0.4943808004640371, "grad_norm": 0.030892284587025642, "grad_norm_var": 2.8246588579151996e-06, "learning_rate": 0.005250344092106742, "loss": 2.5938, "step": 13637 }, { "crossentropy": 2.5111424922943115, "epoch": 0.49441705336426917, "grad_norm": 0.02974388375878334, "grad_norm_var": 2.8793168669344888e-06, "learning_rate": 0.005249763731514193, "loss": 2.6017, "step": 13638 }, { "crossentropy": 2.5597000122070312, "epoch": 0.4944533062645012, "grad_norm": 0.025429850444197655, "grad_norm_var": 3.4292626960453897e-06, "learning_rate": 0.0052491833675482056, "loss": 2.5453, "step": 13639 }, { "crossentropy": 2.683518886566162, "epoch": 0.4944895591647332, "grad_norm": 0.027633506804704666, "grad_norm_var": 2.7208245582876437e-06, "learning_rate": 0.005248603000216619, "loss": 2.6676, "step": 13640 }, { "crossentropy": 2.53521466255188, "epoch": 0.4945258120649652, "grad_norm": 0.0264606811106205, "grad_norm_var": 2.6996179939036667e-06, "learning_rate": 0.005248022629527271, "loss": 2.682, "step": 13641 }, { "crossentropy": 2.522430658340454, "epoch": 0.4945620649651972, "grad_norm": 0.0256841778755188, "grad_norm_var": 2.996300250470207e-06, "learning_rate": 0.005247442255488002, "loss": 2.5651, "step": 13642 }, { "crossentropy": 2.4364020824432373, "epoch": 0.49459831786542924, "grad_norm": 0.025311583653092384, "grad_norm_var": 3.370435662364612e-06, "learning_rate": 0.005246861878106649, "loss": 2.4545, "step": 13643 }, { "crossentropy": 2.5730624198913574, "epoch": 0.49463457076566125, "grad_norm": 0.026130158454179764, "grad_norm_var": 3.5082323424838353e-06, "learning_rate": 0.005246281497391052, "loss": 2.5381, "step": 13644 }, { "crossentropy": 2.580211639404297, "epoch": 0.49467082366589327, "grad_norm": 0.025910140946507454, "grad_norm_var": 2.656716437481503e-06, "learning_rate": 0.00524570111334905, "loss": 2.6318, "step": 13645 }, { "crossentropy": 2.491401195526123, "epoch": 0.4947070765661253, "grad_norm": 0.026875872164964676, "grad_norm_var": 2.3044879006154743e-06, "learning_rate": 0.00524512072598848, "loss": 2.5874, "step": 13646 }, { "crossentropy": 2.7242822647094727, "epoch": 0.4947433294663573, "grad_norm": 0.02681434340775013, "grad_norm_var": 2.2691997187050557e-06, "learning_rate": 0.0052445403353171835, "loss": 2.5944, "step": 13647 }, { "crossentropy": 2.328620672225952, "epoch": 0.4947795823665893, "grad_norm": 0.026989884674549103, "grad_norm_var": 2.265457391110873e-06, "learning_rate": 0.0052439599413429995, "loss": 2.4738, "step": 13648 }, { "crossentropy": 2.6592588424682617, "epoch": 0.4948158352668213, "grad_norm": 0.027430349960923195, "grad_norm_var": 2.2640033617411793e-06, "learning_rate": 0.0052433795440737655, "loss": 2.6994, "step": 13649 }, { "crossentropy": 2.480971097946167, "epoch": 0.49485208816705334, "grad_norm": 0.026487691327929497, "grad_norm_var": 2.265940485830266e-06, "learning_rate": 0.0052427991435173215, "loss": 2.5637, "step": 13650 }, { "crossentropy": 2.4512884616851807, "epoch": 0.4948883410672854, "grad_norm": 0.02790036052465439, "grad_norm_var": 2.3030727824680282e-06, "learning_rate": 0.005242218739681507, "loss": 2.5982, "step": 13651 }, { "crossentropy": 2.4542276859283447, "epoch": 0.4949245939675174, "grad_norm": 0.02728763036429882, "grad_norm_var": 2.246239870277537e-06, "learning_rate": 0.00524163833257416, "loss": 2.4727, "step": 13652 }, { "crossentropy": 2.607496976852417, "epoch": 0.49496084686774944, "grad_norm": 0.02729105018079281, "grad_norm_var": 1.217340332628341e-06, "learning_rate": 0.0052410579222031215, "loss": 2.5737, "step": 13653 }, { "crossentropy": 2.498774290084839, "epoch": 0.49499709976798145, "grad_norm": 0.026501329615712166, "grad_norm_var": 6.174186802772175e-07, "learning_rate": 0.005240477508576228, "loss": 2.5244, "step": 13654 }, { "crossentropy": 2.624553680419922, "epoch": 0.49503335266821347, "grad_norm": 0.02729170396924019, "grad_norm_var": 5.352317650382984e-07, "learning_rate": 0.005239897091701323, "loss": 2.5807, "step": 13655 }, { "crossentropy": 2.5024662017822266, "epoch": 0.4950696055684455, "grad_norm": 0.029841596260666847, "grad_norm_var": 1.1000668396838325e-06, "learning_rate": 0.005239316671586244, "loss": 2.5423, "step": 13656 }, { "crossentropy": 2.7437541484832764, "epoch": 0.4951058584686775, "grad_norm": 0.029902145266532898, "grad_norm_var": 1.6442003063991818e-06, "learning_rate": 0.005238736248238829, "loss": 2.5879, "step": 13657 }, { "crossentropy": 2.4718222618103027, "epoch": 0.4951421113689095, "grad_norm": 0.028014613315463066, "grad_norm_var": 1.542731072490718e-06, "learning_rate": 0.005238155821666918, "loss": 2.5008, "step": 13658 }, { "crossentropy": 2.5363950729370117, "epoch": 0.4951783642691415, "grad_norm": 0.02746688574552536, "grad_norm_var": 1.2763653702660271e-06, "learning_rate": 0.005237575391878353, "loss": 2.568, "step": 13659 }, { "crossentropy": 2.5630574226379395, "epoch": 0.49521461716937354, "grad_norm": 0.025646604597568512, "grad_norm_var": 1.3717861602801588e-06, "learning_rate": 0.00523699495888097, "loss": 2.5723, "step": 13660 }, { "crossentropy": 2.547778606414795, "epoch": 0.49525087006960555, "grad_norm": 0.026512345299124718, "grad_norm_var": 1.278577901317338e-06, "learning_rate": 0.0052364145226826125, "loss": 2.6126, "step": 13661 }, { "crossentropy": 2.4562370777130127, "epoch": 0.49528712296983757, "grad_norm": 0.02836436964571476, "grad_norm_var": 1.3148386807363203e-06, "learning_rate": 0.005235834083291115, "loss": 2.4795, "step": 13662 }, { "crossentropy": 2.673536777496338, "epoch": 0.4953233758700696, "grad_norm": 0.027897994965314865, "grad_norm_var": 1.2914857824160384e-06, "learning_rate": 0.005235253640714323, "loss": 2.5956, "step": 13663 }, { "crossentropy": 2.522794246673584, "epoch": 0.4953596287703016, "grad_norm": 0.027396738529205322, "grad_norm_var": 1.271356698089943e-06, "learning_rate": 0.005234673194960075, "loss": 2.6219, "step": 13664 }, { "crossentropy": 2.606492042541504, "epoch": 0.49539588167053367, "grad_norm": 0.026959965005517006, "grad_norm_var": 1.2943886950932907e-06, "learning_rate": 0.005234092746036207, "loss": 2.5766, "step": 13665 }, { "crossentropy": 2.651367664337158, "epoch": 0.4954321345707657, "grad_norm": 0.0273149311542511, "grad_norm_var": 1.2202427436538062e-06, "learning_rate": 0.00523351229395056, "loss": 2.56, "step": 13666 }, { "crossentropy": 2.588300943374634, "epoch": 0.4954683874709977, "grad_norm": 0.02778753638267517, "grad_norm_var": 1.2165107815762428e-06, "learning_rate": 0.005232931838710977, "loss": 2.5828, "step": 13667 }, { "crossentropy": 2.5069739818573, "epoch": 0.4955046403712297, "grad_norm": 0.029211223125457764, "grad_norm_var": 1.3696222129642622e-06, "learning_rate": 0.005232351380325295, "loss": 2.521, "step": 13668 }, { "crossentropy": 2.4433023929595947, "epoch": 0.4955408932714617, "grad_norm": 0.028020938858389854, "grad_norm_var": 1.3618971619251309e-06, "learning_rate": 0.005231770918801356, "loss": 2.4881, "step": 13669 }, { "crossentropy": 2.6692605018615723, "epoch": 0.49557714617169374, "grad_norm": 0.028912803158164024, "grad_norm_var": 1.3212317537749136e-06, "learning_rate": 0.005231190454146998, "loss": 2.5748, "step": 13670 }, { "crossentropy": 2.445495843887329, "epoch": 0.49561339907192575, "grad_norm": 0.029346715658903122, "grad_norm_var": 1.416061225513104e-06, "learning_rate": 0.005230609986370064, "loss": 2.4948, "step": 13671 }, { "crossentropy": 2.591451406478882, "epoch": 0.49564965197215777, "grad_norm": 0.02686060220003128, "grad_norm_var": 1.2543254613814684e-06, "learning_rate": 0.005230029515478391, "loss": 2.5323, "step": 13672 }, { "crossentropy": 2.5502703189849854, "epoch": 0.4956859048723898, "grad_norm": 0.02669239230453968, "grad_norm_var": 1.0204210229991603e-06, "learning_rate": 0.00522944904147982, "loss": 2.5204, "step": 13673 }, { "crossentropy": 2.4003713130950928, "epoch": 0.4957221577726218, "grad_norm": 0.027312541380524635, "grad_norm_var": 1.0171352468367795e-06, "learning_rate": 0.00522886856438219, "loss": 2.4569, "step": 13674 }, { "crossentropy": 2.6384966373443604, "epoch": 0.4957584106728538, "grad_norm": 0.026582255959510803, "grad_norm_var": 1.0825177881552122e-06, "learning_rate": 0.005228288084193344, "loss": 2.4724, "step": 13675 }, { "crossentropy": 2.5685253143310547, "epoch": 0.4957946635730858, "grad_norm": 0.0271871630102396, "grad_norm_var": 8.396218386755652e-07, "learning_rate": 0.0052277076009211235, "loss": 2.7355, "step": 13676 }, { "crossentropy": 2.650369882583618, "epoch": 0.49583091647331784, "grad_norm": 0.027140041813254356, "grad_norm_var": 7.692399651962681e-07, "learning_rate": 0.005227127114573364, "loss": 2.6344, "step": 13677 }, { "crossentropy": 2.5741076469421387, "epoch": 0.4958671693735499, "grad_norm": 0.028252052143216133, "grad_norm_var": 7.598808098705293e-07, "learning_rate": 0.005226546625157907, "loss": 2.6008, "step": 13678 }, { "crossentropy": 2.625582695007324, "epoch": 0.4959034222737819, "grad_norm": 0.030126407742500305, "grad_norm_var": 1.135092032673174e-06, "learning_rate": 0.005225966132682594, "loss": 2.5856, "step": 13679 }, { "crossentropy": 2.681079387664795, "epoch": 0.49593967517401394, "grad_norm": 0.0269336961209774, "grad_norm_var": 1.1745637355610346e-06, "learning_rate": 0.005225385637155269, "loss": 2.6107, "step": 13680 }, { "crossentropy": 2.62524151802063, "epoch": 0.49597592807424595, "grad_norm": 0.027739904820919037, "grad_norm_var": 1.126257660589514e-06, "learning_rate": 0.005224805138583767, "loss": 2.5285, "step": 13681 }, { "crossentropy": 2.501055955886841, "epoch": 0.49601218097447797, "grad_norm": 0.02694230154156685, "grad_norm_var": 1.1609650952469255e-06, "learning_rate": 0.005224224636975928, "loss": 2.5289, "step": 13682 }, { "crossentropy": 2.6092092990875244, "epoch": 0.49604843387471, "grad_norm": 0.027106061577796936, "grad_norm_var": 1.192534728690223e-06, "learning_rate": 0.005223644132339598, "loss": 2.6287, "step": 13683 }, { "crossentropy": 2.658639907836914, "epoch": 0.496084686774942, "grad_norm": 0.026850290596485138, "grad_norm_var": 1.0881524985108105e-06, "learning_rate": 0.005223063624682615, "loss": 2.5624, "step": 13684 }, { "crossentropy": 2.520454168319702, "epoch": 0.496120939675174, "grad_norm": 0.025882119312882423, "grad_norm_var": 1.2612595292738379e-06, "learning_rate": 0.005222483114012818, "loss": 2.5206, "step": 13685 }, { "crossentropy": 2.329878807067871, "epoch": 0.496157192575406, "grad_norm": 0.02629001811146736, "grad_norm_var": 1.1942339658635448e-06, "learning_rate": 0.005221902600338048, "loss": 2.4397, "step": 13686 }, { "crossentropy": 2.632059335708618, "epoch": 0.49619344547563804, "grad_norm": 0.02689732238650322, "grad_norm_var": 9.098504930929452e-07, "learning_rate": 0.0052213220836661465, "loss": 2.5869, "step": 13687 }, { "crossentropy": 2.5013859272003174, "epoch": 0.49622969837587005, "grad_norm": 0.027662284672260284, "grad_norm_var": 9.164448840521625e-07, "learning_rate": 0.005220741564004957, "loss": 2.4974, "step": 13688 }, { "crossentropy": 2.5362601280212402, "epoch": 0.49626595127610207, "grad_norm": 0.02750234119594097, "grad_norm_var": 8.999491953755207e-07, "learning_rate": 0.005220161041362316, "loss": 2.5621, "step": 13689 }, { "crossentropy": 2.557682991027832, "epoch": 0.4963022041763341, "grad_norm": 0.0259663425385952, "grad_norm_var": 1.0065527926381937e-06, "learning_rate": 0.005219580515746065, "loss": 2.5809, "step": 13690 }, { "crossentropy": 2.591407299041748, "epoch": 0.49633845707656615, "grad_norm": 0.029582619667053223, "grad_norm_var": 1.325546951842999e-06, "learning_rate": 0.005218999987164047, "loss": 2.5736, "step": 13691 }, { "crossentropy": 2.7200582027435303, "epoch": 0.49637470997679817, "grad_norm": 0.027561483904719353, "grad_norm_var": 1.3247391911245007e-06, "learning_rate": 0.005218419455624104, "loss": 2.6244, "step": 13692 }, { "crossentropy": 2.5863451957702637, "epoch": 0.4964109628770302, "grad_norm": 0.02700914815068245, "grad_norm_var": 1.3303854219021138e-06, "learning_rate": 0.0052178389211340725, "loss": 2.6483, "step": 13693 }, { "crossentropy": 2.378427267074585, "epoch": 0.4964472157772622, "grad_norm": 0.02723761647939682, "grad_norm_var": 1.2786477462395448e-06, "learning_rate": 0.005217258383701794, "loss": 2.5604, "step": 13694 }, { "crossentropy": 2.6676783561706543, "epoch": 0.4964834686774942, "grad_norm": 0.02806229330599308, "grad_norm_var": 7.754904939862491e-07, "learning_rate": 0.005216677843335114, "loss": 2.6607, "step": 13695 }, { "crossentropy": 2.4724032878875732, "epoch": 0.4965197215777262, "grad_norm": 0.026622362434864044, "grad_norm_var": 7.92670169823204e-07, "learning_rate": 0.00521609730004187, "loss": 2.5317, "step": 13696 }, { "crossentropy": 2.6213457584381104, "epoch": 0.49655597447795824, "grad_norm": 0.025861864909529686, "grad_norm_var": 8.734467463074874e-07, "learning_rate": 0.005215516753829904, "loss": 2.5885, "step": 13697 }, { "crossentropy": 2.6495587825775146, "epoch": 0.49659222737819025, "grad_norm": 0.0304122194647789, "grad_norm_var": 1.5693023026112773e-06, "learning_rate": 0.0052149362047070585, "loss": 2.6023, "step": 13698 }, { "crossentropy": 2.6221044063568115, "epoch": 0.49662848027842227, "grad_norm": 0.026932384818792343, "grad_norm_var": 1.575253596053242e-06, "learning_rate": 0.005214355652681171, "loss": 2.5768, "step": 13699 }, { "crossentropy": 2.5046591758728027, "epoch": 0.4966647331786543, "grad_norm": 0.02908812277019024, "grad_norm_var": 1.7627779440805106e-06, "learning_rate": 0.005213775097760088, "loss": 2.5829, "step": 13700 }, { "crossentropy": 2.5072548389434814, "epoch": 0.4967009860788863, "grad_norm": 0.030676282942295074, "grad_norm_var": 2.222202329193766e-06, "learning_rate": 0.005213194539951646, "loss": 2.5346, "step": 13701 }, { "crossentropy": 2.47701358795166, "epoch": 0.4967372389791183, "grad_norm": 0.02656754106283188, "grad_norm_var": 2.174461453119146e-06, "learning_rate": 0.005212613979263687, "loss": 2.5393, "step": 13702 }, { "crossentropy": 2.581118583679199, "epoch": 0.4967734918793503, "grad_norm": 0.02671683020889759, "grad_norm_var": 2.1964796394956244e-06, "learning_rate": 0.005212033415704056, "loss": 2.6286, "step": 13703 }, { "crossentropy": 2.569883346557617, "epoch": 0.49680974477958234, "grad_norm": 0.025821160525083542, "grad_norm_var": 2.421612528427132e-06, "learning_rate": 0.00521145284928059, "loss": 2.5306, "step": 13704 }, { "crossentropy": 2.6231656074523926, "epoch": 0.4968459976798144, "grad_norm": 0.028724640607833862, "grad_norm_var": 2.498862771493438e-06, "learning_rate": 0.005210872280001134, "loss": 2.6524, "step": 13705 }, { "crossentropy": 2.4901528358459473, "epoch": 0.4968822505800464, "grad_norm": 0.028456753119826317, "grad_norm_var": 2.3182384012223702e-06, "learning_rate": 0.005210291707873527, "loss": 2.5222, "step": 13706 }, { "crossentropy": 2.6941845417022705, "epoch": 0.49691850348027844, "grad_norm": 0.027945062145590782, "grad_norm_var": 2.103897011303243e-06, "learning_rate": 0.005209711132905612, "loss": 2.635, "step": 13707 }, { "crossentropy": 2.4870989322662354, "epoch": 0.49695475638051045, "grad_norm": 0.02717091143131256, "grad_norm_var": 2.1222582071013057e-06, "learning_rate": 0.005209130555105229, "loss": 2.5797, "step": 13708 }, { "crossentropy": 2.6081955432891846, "epoch": 0.49699100928074247, "grad_norm": 0.026177257299423218, "grad_norm_var": 2.2428685527847134e-06, "learning_rate": 0.005208549974480221, "loss": 2.5187, "step": 13709 }, { "crossentropy": 2.686737537384033, "epoch": 0.4970272621809745, "grad_norm": 0.027414748445153236, "grad_norm_var": 2.234981825647422e-06, "learning_rate": 0.00520796939103843, "loss": 2.5995, "step": 13710 }, { "crossentropy": 2.5688915252685547, "epoch": 0.4970635150812065, "grad_norm": 0.028289329260587692, "grad_norm_var": 2.250210313504049e-06, "learning_rate": 0.005207388804787695, "loss": 2.6014, "step": 13711 }, { "crossentropy": 2.5219686031341553, "epoch": 0.4970997679814385, "grad_norm": 0.026110386475920677, "grad_norm_var": 2.33877998780962e-06, "learning_rate": 0.005206808215735861, "loss": 2.538, "step": 13712 }, { "crossentropy": 2.6271345615386963, "epoch": 0.4971360208816705, "grad_norm": 0.028120940551161766, "grad_norm_var": 2.119789155583191e-06, "learning_rate": 0.005206227623890767, "loss": 2.5792, "step": 13713 }, { "crossentropy": 2.504802942276001, "epoch": 0.49717227378190254, "grad_norm": 0.026818888261914253, "grad_norm_var": 1.6699952312763018e-06, "learning_rate": 0.005205647029260256, "loss": 2.5454, "step": 13714 }, { "crossentropy": 2.5879900455474854, "epoch": 0.49720852668213456, "grad_norm": 0.02622014284133911, "grad_norm_var": 1.7617254551811786e-06, "learning_rate": 0.005205066431852172, "loss": 2.5974, "step": 13715 }, { "crossentropy": 2.5460574626922607, "epoch": 0.49724477958236657, "grad_norm": 0.02538028545677662, "grad_norm_var": 1.8457022070197022e-06, "learning_rate": 0.005204485831674352, "loss": 2.5363, "step": 13716 }, { "crossentropy": 2.4342193603515625, "epoch": 0.4972810324825986, "grad_norm": 0.027038374915719032, "grad_norm_var": 1.0294448885464348e-06, "learning_rate": 0.0052039052287346415, "loss": 2.4511, "step": 13717 }, { "crossentropy": 2.528942584991455, "epoch": 0.49731728538283065, "grad_norm": 0.02633855678141117, "grad_norm_var": 1.0477826705587146e-06, "learning_rate": 0.0052033246230408815, "loss": 2.5755, "step": 13718 }, { "crossentropy": 2.6559956073760986, "epoch": 0.49735353828306267, "grad_norm": 0.027311906218528748, "grad_norm_var": 1.043756411502047e-06, "learning_rate": 0.005202744014600913, "loss": 2.7243, "step": 13719 }, { "crossentropy": 2.50238299369812, "epoch": 0.4973897911832947, "grad_norm": 0.029146993532776833, "grad_norm_var": 1.1752091746119093e-06, "learning_rate": 0.005202163403422581, "loss": 2.5934, "step": 13720 }, { "crossentropy": 2.5816283226013184, "epoch": 0.4974260440835267, "grad_norm": 0.02683313563466072, "grad_norm_var": 1.0374006624402332e-06, "learning_rate": 0.005201582789513723, "loss": 2.584, "step": 13721 }, { "crossentropy": 2.6570639610290527, "epoch": 0.4974622969837587, "grad_norm": 0.025672441348433495, "grad_norm_var": 1.0454742556508778e-06, "learning_rate": 0.005201002172882186, "loss": 2.5531, "step": 13722 }, { "crossentropy": 2.5172805786132812, "epoch": 0.4974985498839907, "grad_norm": 0.025985267013311386, "grad_norm_var": 1.0383998813072997e-06, "learning_rate": 0.005200421553535807, "loss": 2.5411, "step": 13723 }, { "crossentropy": 2.578336715698242, "epoch": 0.49753480278422274, "grad_norm": 0.026315277442336082, "grad_norm_var": 1.050608619550253e-06, "learning_rate": 0.005199840931482433, "loss": 2.5341, "step": 13724 }, { "crossentropy": 2.7588906288146973, "epoch": 0.49757105568445475, "grad_norm": 0.02589135803282261, "grad_norm_var": 1.080347049279987e-06, "learning_rate": 0.005199260306729903, "loss": 2.6828, "step": 13725 }, { "crossentropy": 2.5119056701660156, "epoch": 0.49760730858468677, "grad_norm": 0.026176707819104195, "grad_norm_var": 1.075574008410152e-06, "learning_rate": 0.005198679679286059, "loss": 2.5398, "step": 13726 }, { "crossentropy": 2.5203921794891357, "epoch": 0.4976435614849188, "grad_norm": 0.026263877749443054, "grad_norm_var": 9.103580932306634e-07, "learning_rate": 0.005198099049158747, "loss": 2.5716, "step": 13727 }, { "crossentropy": 2.4520554542541504, "epoch": 0.4976798143851508, "grad_norm": 0.026904450729489326, "grad_norm_var": 8.977663876768519e-07, "learning_rate": 0.005197518416355804, "loss": 2.4429, "step": 13728 }, { "crossentropy": 2.6106014251708984, "epoch": 0.4977160672853828, "grad_norm": 0.02618466131389141, "grad_norm_var": 7.526366079765492e-07, "learning_rate": 0.005196937780885077, "loss": 2.6524, "step": 13729 }, { "crossentropy": 2.452728509902954, "epoch": 0.4977523201856148, "grad_norm": 0.028111251071095467, "grad_norm_var": 9.067789621273649e-07, "learning_rate": 0.005196357142754405, "loss": 2.4828, "step": 13730 }, { "crossentropy": 2.4175539016723633, "epoch": 0.49778857308584684, "grad_norm": 0.02726130746304989, "grad_norm_var": 9.202822792135172e-07, "learning_rate": 0.005195776501971633, "loss": 2.5864, "step": 13731 }, { "crossentropy": 2.5052602291107178, "epoch": 0.4978248259860789, "grad_norm": 0.029316483065485954, "grad_norm_var": 1.208615211085023e-06, "learning_rate": 0.005195195858544602, "loss": 2.6188, "step": 13732 }, { "crossentropy": 2.5793826580047607, "epoch": 0.4978610788863109, "grad_norm": 0.030416594818234444, "grad_norm_var": 1.9743055808406237e-06, "learning_rate": 0.005194615212481154, "loss": 2.528, "step": 13733 }, { "crossentropy": 2.5427398681640625, "epoch": 0.49789733178654294, "grad_norm": 0.02704063057899475, "grad_norm_var": 1.930731323498376e-06, "learning_rate": 0.005194034563789133, "loss": 2.5376, "step": 13734 }, { "crossentropy": 2.6232340335845947, "epoch": 0.49793358468677495, "grad_norm": 0.02578883245587349, "grad_norm_var": 2.048324009997386e-06, "learning_rate": 0.0051934539124763816, "loss": 2.4963, "step": 13735 }, { "crossentropy": 2.540964126586914, "epoch": 0.49796983758700697, "grad_norm": 0.027417635545134544, "grad_norm_var": 1.7590537006415084e-06, "learning_rate": 0.00519287325855074, "loss": 2.5435, "step": 13736 }, { "crossentropy": 2.686610698699951, "epoch": 0.498006090487239, "grad_norm": 0.026529263705015182, "grad_norm_var": 1.770521780163981e-06, "learning_rate": 0.005192292602020052, "loss": 2.648, "step": 13737 }, { "crossentropy": 2.5147504806518555, "epoch": 0.498042343387471, "grad_norm": 0.025228142738342285, "grad_norm_var": 1.8588232341381605e-06, "learning_rate": 0.005191711942892161, "loss": 2.5114, "step": 13738 }, { "crossentropy": 2.543362617492676, "epoch": 0.498078596287703, "grad_norm": 0.02672303467988968, "grad_norm_var": 1.8002062906884164e-06, "learning_rate": 0.0051911312811749105, "loss": 2.532, "step": 13739 }, { "crossentropy": 2.6472089290618896, "epoch": 0.498114849187935, "grad_norm": 0.02679130993783474, "grad_norm_var": 1.772616929239292e-06, "learning_rate": 0.005190550616876141, "loss": 2.6055, "step": 13740 }, { "crossentropy": 2.7037837505340576, "epoch": 0.49815110208816704, "grad_norm": 0.02635934203863144, "grad_norm_var": 1.7169504925130246e-06, "learning_rate": 0.005189969950003696, "loss": 2.6304, "step": 13741 }, { "crossentropy": 2.456815719604492, "epoch": 0.49818735498839906, "grad_norm": 0.027079500257968903, "grad_norm_var": 1.6649251423930234e-06, "learning_rate": 0.005189389280565419, "loss": 2.422, "step": 13742 }, { "crossentropy": 2.5059683322906494, "epoch": 0.49822360788863107, "grad_norm": 0.027042878791689873, "grad_norm_var": 1.6171998620467434e-06, "learning_rate": 0.005188808608569153, "loss": 2.6032, "step": 13743 }, { "crossentropy": 2.6695470809936523, "epoch": 0.4982598607888631, "grad_norm": 0.02796773426234722, "grad_norm_var": 1.6548624190052999e-06, "learning_rate": 0.005188227934022739, "loss": 2.6421, "step": 13744 }, { "crossentropy": 2.394965171813965, "epoch": 0.49829611368909515, "grad_norm": 0.027934782207012177, "grad_norm_var": 1.6085117067789663e-06, "learning_rate": 0.005187647256934022, "loss": 2.491, "step": 13745 }, { "crossentropy": 2.6795244216918945, "epoch": 0.49833236658932717, "grad_norm": 0.0297029260545969, "grad_norm_var": 1.9362489384664357e-06, "learning_rate": 0.005187066577310843, "loss": 2.6041, "step": 13746 }, { "crossentropy": 2.499619960784912, "epoch": 0.4983686194895592, "grad_norm": 0.03171635791659355, "grad_norm_var": 3.086891915204702e-06, "learning_rate": 0.005186485895161047, "loss": 2.5121, "step": 13747 }, { "crossentropy": 2.533371686935425, "epoch": 0.4984048723897912, "grad_norm": 0.028576400130987167, "grad_norm_var": 2.9607222189147073e-06, "learning_rate": 0.005185905210492475, "loss": 2.5704, "step": 13748 }, { "crossentropy": 2.569946050643921, "epoch": 0.4984411252900232, "grad_norm": 0.027691032737493515, "grad_norm_var": 2.4176895118624506e-06, "learning_rate": 0.005185324523312972, "loss": 2.5575, "step": 13749 }, { "crossentropy": 2.479797601699829, "epoch": 0.4984773781902552, "grad_norm": 0.027380503714084625, "grad_norm_var": 2.4052539326343957e-06, "learning_rate": 0.005184743833630379, "loss": 2.5828, "step": 13750 }, { "crossentropy": 2.680150270462036, "epoch": 0.49851363109048724, "grad_norm": 0.025655224919319153, "grad_norm_var": 2.436774638695293e-06, "learning_rate": 0.005184163141452543, "loss": 2.6663, "step": 13751 }, { "crossentropy": 2.557018280029297, "epoch": 0.49854988399071926, "grad_norm": 0.026196947321295738, "grad_norm_var": 2.541235670177342e-06, "learning_rate": 0.0051835824467873015, "loss": 2.4573, "step": 13752 }, { "crossentropy": 2.5838875770568848, "epoch": 0.49858613689095127, "grad_norm": 0.03193788230419159, "grad_norm_var": 3.733722371358989e-06, "learning_rate": 0.0051830017496425, "loss": 2.5628, "step": 13753 }, { "crossentropy": 2.5071146488189697, "epoch": 0.4986223897911833, "grad_norm": 0.027441317215561867, "grad_norm_var": 3.2959766066674445e-06, "learning_rate": 0.005182421050025983, "loss": 2.6027, "step": 13754 }, { "crossentropy": 2.503648281097412, "epoch": 0.4986586426914153, "grad_norm": 0.02765985205769539, "grad_norm_var": 3.20539816015767e-06, "learning_rate": 0.005181840347945594, "loss": 2.4592, "step": 13755 }, { "crossentropy": 2.6096112728118896, "epoch": 0.4986948955916473, "grad_norm": 0.029789317399263382, "grad_norm_var": 3.3056321200370613e-06, "learning_rate": 0.005181259643409174, "loss": 2.6443, "step": 13756 }, { "crossentropy": 2.6368439197540283, "epoch": 0.49873114849187933, "grad_norm": 0.027379563078284264, "grad_norm_var": 3.129381548074017e-06, "learning_rate": 0.005180678936424569, "loss": 2.6485, "step": 13757 }, { "crossentropy": 2.470386028289795, "epoch": 0.49876740139211134, "grad_norm": 0.026474958285689354, "grad_norm_var": 3.2423013285982297e-06, "learning_rate": 0.005180098226999618, "loss": 2.5225, "step": 13758 }, { "crossentropy": 2.4618031978607178, "epoch": 0.4988036542923434, "grad_norm": 0.027362234890460968, "grad_norm_var": 3.2011404604979922e-06, "learning_rate": 0.005179517515142168, "loss": 2.5229, "step": 13759 }, { "crossentropy": 2.63291597366333, "epoch": 0.4988399071925754, "grad_norm": 0.031833939254283905, "grad_norm_var": 4.026357789045628e-06, "learning_rate": 0.0051789368008600635, "loss": 2.6061, "step": 13760 }, { "crossentropy": 2.433839797973633, "epoch": 0.49887616009280744, "grad_norm": 0.027757350355386734, "grad_norm_var": 4.039824072667855e-06, "learning_rate": 0.005178356084161145, "loss": 2.5364, "step": 13761 }, { "crossentropy": 2.488349437713623, "epoch": 0.49891241299303946, "grad_norm": 0.028661226853728294, "grad_norm_var": 3.9280300843296066e-06, "learning_rate": 0.005177775365053256, "loss": 2.5324, "step": 13762 }, { "crossentropy": 2.467808723449707, "epoch": 0.49894866589327147, "grad_norm": 0.025529099628329277, "grad_norm_var": 3.5391000044108014e-06, "learning_rate": 0.005177194643544241, "loss": 2.5246, "step": 13763 }, { "crossentropy": 2.6098437309265137, "epoch": 0.4989849187935035, "grad_norm": 0.02854146994650364, "grad_norm_var": 3.5362958163127987e-06, "learning_rate": 0.005176613919641945, "loss": 2.6434, "step": 13764 }, { "crossentropy": 2.6455078125, "epoch": 0.4990211716937355, "grad_norm": 0.026327630504965782, "grad_norm_var": 3.70059615747888e-06, "learning_rate": 0.005176033193354209, "loss": 2.6988, "step": 13765 }, { "crossentropy": 2.6102728843688965, "epoch": 0.4990574245939675, "grad_norm": 0.02749096415936947, "grad_norm_var": 3.6941415808899724e-06, "learning_rate": 0.0051754524646888764, "loss": 2.6214, "step": 13766 }, { "crossentropy": 2.5920162200927734, "epoch": 0.49909367749419953, "grad_norm": 0.029885822907090187, "grad_norm_var": 3.5592597324398655e-06, "learning_rate": 0.005174871733653793, "loss": 2.6251, "step": 13767 }, { "crossentropy": 2.8313019275665283, "epoch": 0.49912993039443154, "grad_norm": 0.03508208692073822, "grad_norm_var": 6.189270529465329e-06, "learning_rate": 0.0051742910002568025, "loss": 2.6192, "step": 13768 }, { "crossentropy": 2.6762969493865967, "epoch": 0.49916618329466356, "grad_norm": 0.02931586652994156, "grad_norm_var": 5.485996019360936e-06, "learning_rate": 0.005173710264505747, "loss": 2.5735, "step": 13769 }, { "crossentropy": 2.5680017471313477, "epoch": 0.49920243619489557, "grad_norm": 0.02683224156498909, "grad_norm_var": 5.597861350137058e-06, "learning_rate": 0.005173129526408469, "loss": 2.566, "step": 13770 }, { "crossentropy": 2.6340651512145996, "epoch": 0.4992386890951276, "grad_norm": 0.02787013351917267, "grad_norm_var": 5.577203158380929e-06, "learning_rate": 0.0051725487859728145, "loss": 2.6731, "step": 13771 }, { "crossentropy": 2.4859464168548584, "epoch": 0.49927494199535966, "grad_norm": 0.028646819293498993, "grad_norm_var": 5.463653729640555e-06, "learning_rate": 0.00517196804320663, "loss": 2.4854, "step": 13772 }, { "crossentropy": 2.5375003814697266, "epoch": 0.49931119489559167, "grad_norm": 0.02710980921983719, "grad_norm_var": 5.506233369965613e-06, "learning_rate": 0.005171387298117753, "loss": 2.5457, "step": 13773 }, { "crossentropy": 2.6532015800476074, "epoch": 0.4993474477958237, "grad_norm": 0.029145782813429832, "grad_norm_var": 5.259379357011696e-06, "learning_rate": 0.005170806550714031, "loss": 2.6401, "step": 13774 }, { "crossentropy": 2.6646270751953125, "epoch": 0.4993837006960557, "grad_norm": 0.03781632333993912, "grad_norm_var": 1.0382661810563843e-05, "learning_rate": 0.005170225801003308, "loss": 2.6788, "step": 13775 }, { "crossentropy": 2.5574700832366943, "epoch": 0.4994199535962877, "grad_norm": 0.033272746950387955, "grad_norm_var": 1.100959251652971e-05, "learning_rate": 0.005169645048993427, "loss": 2.4808, "step": 13776 }, { "crossentropy": 2.576655626296997, "epoch": 0.49945620649651973, "grad_norm": 0.029765352606773376, "grad_norm_var": 1.0840455944501507e-05, "learning_rate": 0.005169064294692232, "loss": 2.5377, "step": 13777 }, { "crossentropy": 2.652881145477295, "epoch": 0.49949245939675174, "grad_norm": 0.027221765369176865, "grad_norm_var": 1.1122466940457348e-05, "learning_rate": 0.005168483538107566, "loss": 2.5709, "step": 13778 }, { "crossentropy": 2.5885109901428223, "epoch": 0.49952871229698376, "grad_norm": 0.02860037237405777, "grad_norm_var": 1.0140842725609107e-05, "learning_rate": 0.0051679027792472755, "loss": 2.5928, "step": 13779 }, { "crossentropy": 2.695544719696045, "epoch": 0.49956496519721577, "grad_norm": 0.030260032042860985, "grad_norm_var": 1.0092544640936474e-05, "learning_rate": 0.0051673220181192026, "loss": 2.5683, "step": 13780 }, { "crossentropy": 2.568748712539673, "epoch": 0.4996012180974478, "grad_norm": 0.029270896688103676, "grad_norm_var": 9.324176413351644e-06, "learning_rate": 0.0051667412547311935, "loss": 2.618, "step": 13781 }, { "crossentropy": 2.7083539962768555, "epoch": 0.4996374709976798, "grad_norm": 0.030430197715759277, "grad_norm_var": 8.93993682597686e-06, "learning_rate": 0.005166160489091088, "loss": 2.6457, "step": 13782 }, { "crossentropy": 2.6772403717041016, "epoch": 0.4996737238979118, "grad_norm": 0.029021160677075386, "grad_norm_var": 9.003619559979209e-06, "learning_rate": 0.005165579721206735, "loss": 2.5875, "step": 13783 }, { "crossentropy": 2.5120766162872314, "epoch": 0.49970997679814383, "grad_norm": 0.036078739911317825, "grad_norm_var": 9.743856165221841e-06, "learning_rate": 0.005164998951085975, "loss": 2.6333, "step": 13784 }, { "crossentropy": 2.4430699348449707, "epoch": 0.49974622969837584, "grad_norm": 0.02599506825208664, "grad_norm_var": 1.0754219092794458e-05, "learning_rate": 0.005164418178736656, "loss": 2.4702, "step": 13785 }, { "crossentropy": 2.3958213329315186, "epoch": 0.4997824825986079, "grad_norm": 0.027073334902524948, "grad_norm_var": 1.0661371281426671e-05, "learning_rate": 0.005163837404166618, "loss": 2.4877, "step": 13786 }, { "crossentropy": 2.65712308883667, "epoch": 0.49981873549883993, "grad_norm": 0.026343973353505135, "grad_norm_var": 1.1209550215512707e-05, "learning_rate": 0.005163256627383708, "loss": 2.5879, "step": 13787 }, { "crossentropy": 2.481910228729248, "epoch": 0.49985498839907194, "grad_norm": 0.02689114212989807, "grad_norm_var": 1.1661210536662821e-05, "learning_rate": 0.005162675848395769, "loss": 2.5644, "step": 13788 }, { "crossentropy": 2.5389342308044434, "epoch": 0.49989124129930396, "grad_norm": 0.026602033525705338, "grad_norm_var": 1.1848867797362263e-05, "learning_rate": 0.005162095067210648, "loss": 2.5393, "step": 13789 }, { "crossentropy": 2.5831146240234375, "epoch": 0.49992749419953597, "grad_norm": 0.027576537802815437, "grad_norm_var": 1.2100283539572205e-05, "learning_rate": 0.005161514283836184, "loss": 2.6398, "step": 13790 }, { "crossentropy": 2.6329801082611084, "epoch": 0.499963747099768, "grad_norm": 0.028465241193771362, "grad_norm_var": 7.213690124279942e-06, "learning_rate": 0.005160933498280226, "loss": 2.6093, "step": 13791 }, { "crossentropy": 2.558945655822754, "epoch": 0.5, "grad_norm": 0.027812480926513672, "grad_norm_var": 5.9149039433098086e-06, "learning_rate": 0.005160352710550617, "loss": 2.588, "step": 13792 }, { "crossentropy": 2.5019142627716064, "epoch": 0.500036252900232, "grad_norm": 0.029867803677916527, "grad_norm_var": 5.931642480650791e-06, "learning_rate": 0.005159771920655202, "loss": 2.5223, "step": 13793 }, { "crossentropy": 2.637026309967041, "epoch": 0.500072505800464, "grad_norm": 0.031256526708602905, "grad_norm_var": 6.210652125065534e-06, "learning_rate": 0.005159191128601822, "loss": 2.5883, "step": 13794 }, { "crossentropy": 2.5945024490356445, "epoch": 0.500108758700696, "grad_norm": 0.028174810111522675, "grad_norm_var": 6.235942229495621e-06, "learning_rate": 0.005158610334398326, "loss": 2.6002, "step": 13795 }, { "crossentropy": 2.496598958969116, "epoch": 0.5001450116009281, "grad_norm": 0.027208205312490463, "grad_norm_var": 6.232080839906858e-06, "learning_rate": 0.005158029538052558, "loss": 2.5488, "step": 13796 }, { "crossentropy": 2.6667966842651367, "epoch": 0.5001812645011601, "grad_norm": 0.026180386543273926, "grad_norm_var": 6.564636007306129e-06, "learning_rate": 0.00515744873957236, "loss": 2.6625, "step": 13797 }, { "crossentropy": 2.57909893989563, "epoch": 0.5002175174013921, "grad_norm": 0.026938818395137787, "grad_norm_var": 6.398208478073687e-06, "learning_rate": 0.005156867938965577, "loss": 2.5696, "step": 13798 }, { "crossentropy": 2.7305848598480225, "epoch": 0.5002537703016241, "grad_norm": 0.027670158073306084, "grad_norm_var": 6.3675881405017565e-06, "learning_rate": 0.0051562871362400555, "loss": 2.6519, "step": 13799 }, { "crossentropy": 2.5823495388031006, "epoch": 0.5002900232018561, "grad_norm": 0.027272609993815422, "grad_norm_var": 1.8853705026780256e-06, "learning_rate": 0.005155706331403639, "loss": 2.562, "step": 13800 }, { "crossentropy": 2.5587241649627686, "epoch": 0.5003262761020881, "grad_norm": 0.027882764115929604, "grad_norm_var": 1.7083939833068342e-06, "learning_rate": 0.005155125524464172, "loss": 2.5793, "step": 13801 }, { "crossentropy": 2.7350215911865234, "epoch": 0.5003625290023201, "grad_norm": 0.02900685928761959, "grad_norm_var": 1.7802238432091844e-06, "learning_rate": 0.005154544715429499, "loss": 2.7789, "step": 13802 }, { "crossentropy": 2.571458339691162, "epoch": 0.5003987819025522, "grad_norm": 0.026933662593364716, "grad_norm_var": 1.6857550952962981e-06, "learning_rate": 0.005153963904307466, "loss": 2.6035, "step": 13803 }, { "crossentropy": 2.573596715927124, "epoch": 0.5004350348027842, "grad_norm": 0.02728762850165367, "grad_norm_var": 1.6444276075335922e-06, "learning_rate": 0.0051533830911059154, "loss": 2.5593, "step": 13804 }, { "crossentropy": 2.701620101928711, "epoch": 0.5004712877030162, "grad_norm": 0.027769049629569054, "grad_norm_var": 1.5301439614140057e-06, "learning_rate": 0.005152802275832695, "loss": 2.6784, "step": 13805 }, { "crossentropy": 2.613999605178833, "epoch": 0.5005075406032483, "grad_norm": 0.026486894115805626, "grad_norm_var": 1.6595506420346522e-06, "learning_rate": 0.005152221458495647, "loss": 2.5524, "step": 13806 }, { "crossentropy": 2.5102782249450684, "epoch": 0.5005437935034803, "grad_norm": 0.026646386831998825, "grad_norm_var": 1.726415484205286e-06, "learning_rate": 0.005151640639102617, "loss": 2.5664, "step": 13807 }, { "crossentropy": 2.6028809547424316, "epoch": 0.5005800464037123, "grad_norm": 0.026860801503062248, "grad_norm_var": 1.7782260672789817e-06, "learning_rate": 0.005151059817661451, "loss": 2.6725, "step": 13808 }, { "crossentropy": 2.6692276000976562, "epoch": 0.5006162993039444, "grad_norm": 0.026135120540857315, "grad_norm_var": 1.577707264701827e-06, "learning_rate": 0.005150478994179993, "loss": 2.5495, "step": 13809 }, { "crossentropy": 2.579509735107422, "epoch": 0.5006525522041764, "grad_norm": 0.027036964893341064, "grad_norm_var": 5.668749624303798e-07, "learning_rate": 0.005149898168666087, "loss": 2.6115, "step": 13810 }, { "crossentropy": 2.5930254459381104, "epoch": 0.5006888051044084, "grad_norm": 0.034944452345371246, "grad_norm_var": 4.294587377247693e-06, "learning_rate": 0.005149317341127578, "loss": 2.5845, "step": 13811 }, { "crossentropy": 2.598137378692627, "epoch": 0.5007250580046404, "grad_norm": 0.026400191709399223, "grad_norm_var": 4.382052019714228e-06, "learning_rate": 0.005148736511572314, "loss": 2.6444, "step": 13812 }, { "crossentropy": 2.5973262786865234, "epoch": 0.5007613109048724, "grad_norm": 0.19737432897090912, "grad_norm_var": 0.001803898596235732, "learning_rate": 0.005148155680008136, "loss": 2.5903, "step": 13813 }, { "crossentropy": 2.474660873413086, "epoch": 0.5007975638051044, "grad_norm": 0.04069199413061142, "grad_norm_var": 0.0017949043884442227, "learning_rate": 0.005147574846442892, "loss": 2.4269, "step": 13814 }, { "crossentropy": 2.605435609817505, "epoch": 0.5008338167053364, "grad_norm": 0.03346055746078491, "grad_norm_var": 0.0017881368908790642, "learning_rate": 0.005146994010884424, "loss": 2.6759, "step": 13815 }, { "crossentropy": 2.472806692123413, "epoch": 0.5008700696055685, "grad_norm": 0.028045522049069405, "grad_norm_var": 0.001786912909497648, "learning_rate": 0.00514641317334058, "loss": 2.5446, "step": 13816 }, { "crossentropy": 2.554351568222046, "epoch": 0.5009063225058005, "grad_norm": 0.02723141573369503, "grad_norm_var": 0.00178795356916451, "learning_rate": 0.0051458323338192035, "loss": 2.6371, "step": 13817 }, { "crossentropy": 2.6011486053466797, "epoch": 0.5009425754060325, "grad_norm": 0.029728872701525688, "grad_norm_var": 0.001786974115964869, "learning_rate": 0.00514525149232814, "loss": 2.65, "step": 13818 }, { "crossentropy": 2.4024462699890137, "epoch": 0.5009788283062645, "grad_norm": 0.027541974559426308, "grad_norm_var": 0.0017859727690626654, "learning_rate": 0.005144670648875234, "loss": 2.4352, "step": 13819 }, { "crossentropy": 2.5647757053375244, "epoch": 0.5010150812064965, "grad_norm": 0.028058255091309547, "grad_norm_var": 0.001784744516146298, "learning_rate": 0.005144089803468333, "loss": 2.5712, "step": 13820 }, { "crossentropy": 2.5926568508148193, "epoch": 0.5010513341067285, "grad_norm": 0.028921274468302727, "grad_norm_var": 0.001783002099642582, "learning_rate": 0.00514350895611528, "loss": 2.5021, "step": 13821 }, { "crossentropy": 2.359318256378174, "epoch": 0.5010875870069605, "grad_norm": 0.027723686769604683, "grad_norm_var": 0.0017809150248571154, "learning_rate": 0.005142928106823918, "loss": 2.4597, "step": 13822 }, { "crossentropy": 2.531630277633667, "epoch": 0.5011238399071926, "grad_norm": 0.025900600478053093, "grad_norm_var": 0.001782257769733543, "learning_rate": 0.0051423472556021, "loss": 2.6144, "step": 13823 }, { "crossentropy": 2.5552995204925537, "epoch": 0.5011600928074246, "grad_norm": 0.02768184430897236, "grad_norm_var": 0.0017808885072841038, "learning_rate": 0.005141766402457663, "loss": 2.5362, "step": 13824 }, { "crossentropy": 2.7334115505218506, "epoch": 0.5011963457076566, "grad_norm": 0.028756897896528244, "grad_norm_var": 0.0017765395948257485, "learning_rate": 0.005141185547398457, "loss": 2.6347, "step": 13825 }, { "crossentropy": 2.5840837955474854, "epoch": 0.5012325986078886, "grad_norm": 0.029477179050445557, "grad_norm_var": 0.001772704273853566, "learning_rate": 0.005140604690432324, "loss": 2.5455, "step": 13826 }, { "crossentropy": 2.6750986576080322, "epoch": 0.5012688515081206, "grad_norm": 0.027312923222780228, "grad_norm_var": 0.0017816118123644342, "learning_rate": 0.005140023831567113, "loss": 2.6506, "step": 13827 }, { "crossentropy": 2.6813900470733643, "epoch": 0.5013051044083526, "grad_norm": 0.02793550305068493, "grad_norm_var": 0.0017790479752776464, "learning_rate": 0.005139442970810668, "loss": 2.6327, "step": 13828 }, { "crossentropy": 2.539919853210449, "epoch": 0.5013413573085846, "grad_norm": 0.027721120044589043, "grad_norm_var": 1.2184216123914159e-05, "learning_rate": 0.0051388621081708335, "loss": 2.6166, "step": 13829 }, { "crossentropy": 2.669463634490967, "epoch": 0.5013776102088167, "grad_norm": 0.027328716591000557, "grad_norm_var": 2.7566795565118114e-06, "learning_rate": 0.005138281243655454, "loss": 2.6461, "step": 13830 }, { "crossentropy": 2.5046353340148926, "epoch": 0.5014138631090487, "grad_norm": 0.028462940827012062, "grad_norm_var": 8.800557218866748e-07, "learning_rate": 0.0051377003772723785, "loss": 2.5462, "step": 13831 }, { "crossentropy": 2.5524449348449707, "epoch": 0.5014501160092807, "grad_norm": 0.027260882779955864, "grad_norm_var": 9.126520471205845e-07, "learning_rate": 0.0051371195090294496, "loss": 2.5511, "step": 13832 }, { "crossentropy": 2.385460615158081, "epoch": 0.5014863689095128, "grad_norm": 0.02744959481060505, "grad_norm_var": 8.950066456268052e-07, "learning_rate": 0.005136538638934513, "loss": 2.5243, "step": 13833 }, { "crossentropy": 2.5302422046661377, "epoch": 0.5015226218097448, "grad_norm": 0.026837307959794998, "grad_norm_var": 7.332486342020994e-07, "learning_rate": 0.005135957766995416, "loss": 2.619, "step": 13834 }, { "crossentropy": 2.77718186378479, "epoch": 0.5015588747099768, "grad_norm": 0.02757357619702816, "grad_norm_var": 7.323369014978926e-07, "learning_rate": 0.005135376893220003, "loss": 2.7102, "step": 13835 }, { "crossentropy": 2.6670241355895996, "epoch": 0.5015951276102089, "grad_norm": 0.0270046666264534, "grad_norm_var": 7.619439276706676e-07, "learning_rate": 0.005134796017616119, "loss": 2.6807, "step": 13836 }, { "crossentropy": 2.666889190673828, "epoch": 0.5016313805104409, "grad_norm": 0.02797229029238224, "grad_norm_var": 6.648763292328806e-07, "learning_rate": 0.005134215140191612, "loss": 2.6293, "step": 13837 }, { "crossentropy": 2.5415308475494385, "epoch": 0.5016676334106729, "grad_norm": 0.02726922184228897, "grad_norm_var": 6.733188846002579e-07, "learning_rate": 0.005133634260954322, "loss": 2.5502, "step": 13838 }, { "crossentropy": 2.630995512008667, "epoch": 0.5017038863109049, "grad_norm": 0.027562279254198074, "grad_norm_var": 4.645972481793903e-07, "learning_rate": 0.005133053379912103, "loss": 2.6358, "step": 13839 }, { "crossentropy": 2.6670498847961426, "epoch": 0.5017401392111369, "grad_norm": 0.027701804414391518, "grad_norm_var": 4.6450614110646267e-07, "learning_rate": 0.005132472497072793, "loss": 2.6559, "step": 13840 }, { "crossentropy": 2.562047243118286, "epoch": 0.5017763921113689, "grad_norm": 0.028578925877809525, "grad_norm_var": 4.4203914427326e-07, "learning_rate": 0.0051318916124442416, "loss": 2.642, "step": 13841 }, { "crossentropy": 2.6565215587615967, "epoch": 0.5018126450116009, "grad_norm": 0.031084666028618813, "grad_norm_var": 9.811110316901643e-07, "learning_rate": 0.005131310726034294, "loss": 2.5872, "step": 13842 }, { "crossentropy": 2.3798668384552, "epoch": 0.501848897911833, "grad_norm": 0.028776943683624268, "grad_norm_var": 1.0168636948168034e-06, "learning_rate": 0.005130729837850795, "loss": 2.5144, "step": 13843 }, { "crossentropy": 2.4718847274780273, "epoch": 0.501885150812065, "grad_norm": 0.028474267572164536, "grad_norm_var": 1.0370150253817632e-06, "learning_rate": 0.005130148947901594, "loss": 2.503, "step": 13844 }, { "crossentropy": 2.5404462814331055, "epoch": 0.501921403712297, "grad_norm": 0.029221409931778908, "grad_norm_var": 1.1336698269162932e-06, "learning_rate": 0.005129568056194531, "loss": 2.5412, "step": 13845 }, { "crossentropy": 2.3806328773498535, "epoch": 0.501957656612529, "grad_norm": 0.03081597574055195, "grad_norm_var": 1.5653464309800842e-06, "learning_rate": 0.005128987162737455, "loss": 2.5223, "step": 13846 }, { "crossentropy": 2.619593858718872, "epoch": 0.501993909512761, "grad_norm": 0.028837891295552254, "grad_norm_var": 1.5846327204465839e-06, "learning_rate": 0.005128406267538212, "loss": 2.5863, "step": 13847 }, { "crossentropy": 2.5864615440368652, "epoch": 0.502030162412993, "grad_norm": 0.026644719764590263, "grad_norm_var": 1.691787589116903e-06, "learning_rate": 0.005127825370604649, "loss": 2.5533, "step": 13848 }, { "crossentropy": 2.666895866394043, "epoch": 0.502066415313225, "grad_norm": 0.029250822961330414, "grad_norm_var": 1.7052545735798026e-06, "learning_rate": 0.00512724447194461, "loss": 2.6396, "step": 13849 }, { "crossentropy": 2.629818916320801, "epoch": 0.5021026682134571, "grad_norm": 0.026207223534584045, "grad_norm_var": 1.8571861761365026e-06, "learning_rate": 0.00512666357156594, "loss": 2.5379, "step": 13850 }, { "crossentropy": 2.6499805450439453, "epoch": 0.5021389211136891, "grad_norm": 0.0258952509611845, "grad_norm_var": 2.198262495386958e-06, "learning_rate": 0.005126082669476486, "loss": 2.5346, "step": 13851 }, { "crossentropy": 2.6594948768615723, "epoch": 0.5021751740139211, "grad_norm": 0.027069756761193275, "grad_norm_var": 2.1881000172651317e-06, "learning_rate": 0.005125501765684096, "loss": 2.6323, "step": 13852 }, { "crossentropy": 2.635721206665039, "epoch": 0.5022114269141531, "grad_norm": 0.028660597279667854, "grad_norm_var": 2.1958750020231123e-06, "learning_rate": 0.005124920860196613, "loss": 2.6142, "step": 13853 }, { "crossentropy": 2.5579004287719727, "epoch": 0.5022476798143851, "grad_norm": 0.026983868330717087, "grad_norm_var": 2.23840304015171e-06, "learning_rate": 0.005124339953021884, "loss": 2.5216, "step": 13854 }, { "crossentropy": 2.6136491298675537, "epoch": 0.5022839327146171, "grad_norm": 0.026693303138017654, "grad_norm_var": 2.3635881463911993e-06, "learning_rate": 0.0051237590441677566, "loss": 2.5648, "step": 13855 }, { "crossentropy": 2.624612808227539, "epoch": 0.5023201856148491, "grad_norm": 0.027578486129641533, "grad_norm_var": 2.3724192192844102e-06, "learning_rate": 0.005123178133642075, "loss": 2.5993, "step": 13856 }, { "crossentropy": 2.7151479721069336, "epoch": 0.5023564385150812, "grad_norm": 0.027028482407331467, "grad_norm_var": 2.4388249841251896e-06, "learning_rate": 0.005122597221452685, "loss": 2.6125, "step": 13857 }, { "crossentropy": 2.507073402404785, "epoch": 0.5023926914153132, "grad_norm": 0.026889869943261147, "grad_norm_var": 1.8560971850801877e-06, "learning_rate": 0.0051220163076074335, "loss": 2.5399, "step": 13858 }, { "crossentropy": 2.5998318195343018, "epoch": 0.5024289443155452, "grad_norm": 0.026892783120274544, "grad_norm_var": 1.8361403875534013e-06, "learning_rate": 0.005121435392114166, "loss": 2.6037, "step": 13859 }, { "crossentropy": 2.489696979522705, "epoch": 0.5024651972157773, "grad_norm": 0.02675001136958599, "grad_norm_var": 1.8431573808879205e-06, "learning_rate": 0.005120854474980732, "loss": 2.5566, "step": 13860 }, { "crossentropy": 2.5754964351654053, "epoch": 0.5025014501160093, "grad_norm": 0.02652817592024803, "grad_norm_var": 1.7102271794734609e-06, "learning_rate": 0.005120273556214972, "loss": 2.5847, "step": 13861 }, { "crossentropy": 2.5767719745635986, "epoch": 0.5025377030162413, "grad_norm": 0.026712432503700256, "grad_norm_var": 9.048446192816273e-07, "learning_rate": 0.005119692635824736, "loss": 2.4743, "step": 13862 }, { "crossentropy": 2.3999385833740234, "epoch": 0.5025739559164734, "grad_norm": 0.027096359059214592, "grad_norm_var": 7.057135633670945e-07, "learning_rate": 0.00511911171381787, "loss": 2.5989, "step": 13863 }, { "crossentropy": 2.4659242630004883, "epoch": 0.5026102088167054, "grad_norm": 0.027315253391861916, "grad_norm_var": 6.971216542113092e-07, "learning_rate": 0.005118530790202218, "loss": 2.5545, "step": 13864 }, { "crossentropy": 2.5331311225891113, "epoch": 0.5026464617169374, "grad_norm": 0.030764056369662285, "grad_norm_var": 1.274795243443964e-06, "learning_rate": 0.00511794986498563, "loss": 2.5525, "step": 13865 }, { "crossentropy": 2.562532663345337, "epoch": 0.5026827146171694, "grad_norm": 0.02809843420982361, "grad_norm_var": 1.2501109473340525e-06, "learning_rate": 0.005117368938175948, "loss": 2.5627, "step": 13866 }, { "crossentropy": 2.627603769302368, "epoch": 0.5027189675174014, "grad_norm": 0.027906060218811035, "grad_norm_var": 1.1235634133998126e-06, "learning_rate": 0.005116788009781021, "loss": 2.6036, "step": 13867 }, { "crossentropy": 2.657804250717163, "epoch": 0.5027552204176334, "grad_norm": 0.026962237432599068, "grad_norm_var": 1.1295291385235527e-06, "learning_rate": 0.0051162070798086955, "loss": 2.6185, "step": 13868 }, { "crossentropy": 2.6216697692871094, "epoch": 0.5027914733178654, "grad_norm": 0.027191417291760445, "grad_norm_var": 1.023132400260066e-06, "learning_rate": 0.005115626148266818, "loss": 2.5514, "step": 13869 }, { "crossentropy": 2.6574082374572754, "epoch": 0.5028277262180975, "grad_norm": 0.027319012209773064, "grad_norm_var": 1.0143746481648558e-06, "learning_rate": 0.005115045215163231, "loss": 2.6588, "step": 13870 }, { "crossentropy": 2.4537885189056396, "epoch": 0.5028639791183295, "grad_norm": 0.025912120938301086, "grad_norm_var": 1.1217376671981508e-06, "learning_rate": 0.005114464280505785, "loss": 2.5824, "step": 13871 }, { "crossentropy": 2.5982143878936768, "epoch": 0.5029002320185615, "grad_norm": 0.02657514438033104, "grad_norm_var": 1.1486144966947233e-06, "learning_rate": 0.005113883344302326, "loss": 2.6037, "step": 13872 }, { "crossentropy": 2.479989767074585, "epoch": 0.5029364849187935, "grad_norm": 0.027146346867084503, "grad_norm_var": 1.1460586561727459e-06, "learning_rate": 0.0051133024065607, "loss": 2.5167, "step": 13873 }, { "crossentropy": 2.384552001953125, "epoch": 0.5029727378190255, "grad_norm": 0.027420250698924065, "grad_norm_var": 1.1379087350219525e-06, "learning_rate": 0.005112721467288751, "loss": 2.4894, "step": 13874 }, { "crossentropy": 2.6564149856567383, "epoch": 0.5030089907192575, "grad_norm": 0.030293697491288185, "grad_norm_var": 1.6820916431028866e-06, "learning_rate": 0.005112140526494328, "loss": 2.5793, "step": 13875 }, { "crossentropy": 2.49385929107666, "epoch": 0.5030452436194895, "grad_norm": 0.027013728395104408, "grad_norm_var": 1.6600867632030608e-06, "learning_rate": 0.005111559584185278, "loss": 2.5175, "step": 13876 }, { "crossentropy": 2.4853739738464355, "epoch": 0.5030814965197216, "grad_norm": 0.025729268789291382, "grad_norm_var": 1.8051930385856585e-06, "learning_rate": 0.005110978640369448, "loss": 2.4718, "step": 13877 }, { "crossentropy": 2.653294324874878, "epoch": 0.5031177494199536, "grad_norm": 0.02820120006799698, "grad_norm_var": 1.7941371596474996e-06, "learning_rate": 0.00511039769505468, "loss": 2.6167, "step": 13878 }, { "crossentropy": 2.6865503787994385, "epoch": 0.5031540023201856, "grad_norm": 0.02718106284737587, "grad_norm_var": 1.7893601734324133e-06, "learning_rate": 0.0051098167482488245, "loss": 2.684, "step": 13879 }, { "crossentropy": 2.6429927349090576, "epoch": 0.5031902552204176, "grad_norm": 0.027844661846756935, "grad_norm_var": 1.789295435961474e-06, "learning_rate": 0.005109235799959727, "loss": 2.641, "step": 13880 }, { "crossentropy": 2.5692944526672363, "epoch": 0.5032265081206496, "grad_norm": 0.027228618040680885, "grad_norm_var": 1.07777626343226e-06, "learning_rate": 0.005108654850195236, "loss": 2.5183, "step": 13881 }, { "crossentropy": 2.542062759399414, "epoch": 0.5032627610208816, "grad_norm": 0.026140911504626274, "grad_norm_var": 1.1288306522130952e-06, "learning_rate": 0.005108073898963193, "loss": 2.6103, "step": 13882 }, { "crossentropy": 2.506890296936035, "epoch": 0.5032990139211136, "grad_norm": 0.02615814469754696, "grad_norm_var": 1.167840367598203e-06, "learning_rate": 0.005107492946271452, "loss": 2.4552, "step": 13883 }, { "crossentropy": 2.5188498497009277, "epoch": 0.5033352668213457, "grad_norm": 0.02677735686302185, "grad_norm_var": 1.174478548896758e-06, "learning_rate": 0.0051069119921278525, "loss": 2.5223, "step": 13884 }, { "crossentropy": 2.5041770935058594, "epoch": 0.5033715197215777, "grad_norm": 0.026317529380321503, "grad_norm_var": 1.2154378617170355e-06, "learning_rate": 0.005106331036540246, "loss": 2.5691, "step": 13885 }, { "crossentropy": 2.6417148113250732, "epoch": 0.5034077726218097, "grad_norm": 0.029131973162293434, "grad_norm_var": 1.4789569859243712e-06, "learning_rate": 0.005105750079516477, "loss": 2.5439, "step": 13886 }, { "crossentropy": 2.640089511871338, "epoch": 0.5034440255220418, "grad_norm": 0.026690784841775894, "grad_norm_var": 1.3839723076308763e-06, "learning_rate": 0.0051051691210643926, "loss": 2.547, "step": 13887 }, { "crossentropy": 2.506948471069336, "epoch": 0.5034802784222738, "grad_norm": 0.026218365877866745, "grad_norm_var": 1.4235872340950806e-06, "learning_rate": 0.00510458816119184, "loss": 2.53, "step": 13888 }, { "crossentropy": 2.4587221145629883, "epoch": 0.5035165313225058, "grad_norm": 0.026005303487181664, "grad_norm_var": 1.5159183425113616e-06, "learning_rate": 0.005104007199906665, "loss": 2.413, "step": 13889 }, { "crossentropy": 2.5666141510009766, "epoch": 0.5035527842227379, "grad_norm": 0.026930170133709908, "grad_norm_var": 1.5130777173168042e-06, "learning_rate": 0.005103426237216715, "loss": 2.5276, "step": 13890 }, { "crossentropy": 2.620181083679199, "epoch": 0.5035890371229699, "grad_norm": 0.026499206200242043, "grad_norm_var": 8.054778904038739e-07, "learning_rate": 0.005102845273129837, "loss": 2.543, "step": 13891 }, { "crossentropy": 2.7555017471313477, "epoch": 0.5036252900232019, "grad_norm": 0.026992931962013245, "grad_norm_var": 8.051320812596056e-07, "learning_rate": 0.005102264307653878, "loss": 2.6863, "step": 13892 }, { "crossentropy": 2.5878958702087402, "epoch": 0.5036615429234339, "grad_norm": 0.03305695205926895, "grad_norm_var": 3.0387586603823164e-06, "learning_rate": 0.0051016833407966835, "loss": 2.5397, "step": 13893 }, { "crossentropy": 2.765425443649292, "epoch": 0.5036977958236659, "grad_norm": 0.033616580069065094, "grad_norm_var": 5.49641055337761e-06, "learning_rate": 0.005101102372566101, "loss": 2.6607, "step": 13894 }, { "crossentropy": 2.6649694442749023, "epoch": 0.5037340487238979, "grad_norm": 0.03343632072210312, "grad_norm_var": 7.530458123904531e-06, "learning_rate": 0.005100521402969978, "loss": 2.634, "step": 13895 }, { "crossentropy": 2.625037431716919, "epoch": 0.50377030162413, "grad_norm": 0.026563692837953568, "grad_norm_var": 7.670708099407466e-06, "learning_rate": 0.0050999404320161605, "loss": 2.5403, "step": 13896 }, { "crossentropy": 2.617401599884033, "epoch": 0.503806554524362, "grad_norm": 0.03207922354340553, "grad_norm_var": 8.651847677026287e-06, "learning_rate": 0.005099359459712496, "loss": 2.5849, "step": 13897 }, { "crossentropy": 2.5645718574523926, "epoch": 0.503842807424594, "grad_norm": 0.02714747190475464, "grad_norm_var": 8.426951381302347e-06, "learning_rate": 0.005098778486066832, "loss": 2.6389, "step": 13898 }, { "crossentropy": 2.4270341396331787, "epoch": 0.503879060324826, "grad_norm": 0.027274081483483315, "grad_norm_var": 8.178449307917136e-06, "learning_rate": 0.0050981975110870135, "loss": 2.4504, "step": 13899 }, { "crossentropy": 2.526949882507324, "epoch": 0.503915313225058, "grad_norm": 0.02842787094414234, "grad_norm_var": 7.986970705813057e-06, "learning_rate": 0.005097616534780889, "loss": 2.5553, "step": 13900 }, { "crossentropy": 2.5498225688934326, "epoch": 0.50395156612529, "grad_norm": 0.02763545885682106, "grad_norm_var": 7.707750680441678e-06, "learning_rate": 0.005097035557156305, "loss": 2.6026, "step": 13901 }, { "crossentropy": 2.599670886993408, "epoch": 0.503987819025522, "grad_norm": 0.028001628816127777, "grad_norm_var": 7.70843265567044e-06, "learning_rate": 0.005096454578221109, "loss": 2.5561, "step": 13902 }, { "crossentropy": 2.5634777545928955, "epoch": 0.504024071925754, "grad_norm": 0.026997480541467667, "grad_norm_var": 7.638855494105376e-06, "learning_rate": 0.005095873597983147, "loss": 2.522, "step": 13903 }, { "crossentropy": 2.5490708351135254, "epoch": 0.5040603248259861, "grad_norm": 0.0255188699811697, "grad_norm_var": 7.88738116690718e-06, "learning_rate": 0.005095292616450266, "loss": 2.5779, "step": 13904 }, { "crossentropy": 2.7159597873687744, "epoch": 0.5040965777262181, "grad_norm": 0.02707020193338394, "grad_norm_var": 7.602417453968557e-06, "learning_rate": 0.005094711633630316, "loss": 2.602, "step": 13905 }, { "crossentropy": 2.586519479751587, "epoch": 0.5041328306264501, "grad_norm": 0.028683297336101532, "grad_norm_var": 7.40932560119491e-06, "learning_rate": 0.00509413064953114, "loss": 2.6342, "step": 13906 }, { "crossentropy": 2.5537867546081543, "epoch": 0.5041690835266821, "grad_norm": 0.02822287380695343, "grad_norm_var": 7.092077928596178e-06, "learning_rate": 0.005093549664160586, "loss": 2.602, "step": 13907 }, { "crossentropy": 2.4401559829711914, "epoch": 0.5042053364269141, "grad_norm": 0.027709703892469406, "grad_norm_var": 6.9519356607742555e-06, "learning_rate": 0.005092968677526503, "loss": 2.5281, "step": 13908 }, { "crossentropy": 2.576223373413086, "epoch": 0.5042415893271461, "grad_norm": 0.028325658291578293, "grad_norm_var": 5.690855793457705e-06, "learning_rate": 0.005092387689636736, "loss": 2.5608, "step": 13909 }, { "crossentropy": 2.6940534114837646, "epoch": 0.5042778422273781, "grad_norm": 0.02668107859790325, "grad_norm_var": 4.006765471704227e-06, "learning_rate": 0.005091806700499134, "loss": 2.59, "step": 13910 }, { "crossentropy": 2.596287488937378, "epoch": 0.5043140951276102, "grad_norm": 0.02715460956096649, "grad_norm_var": 2.0126684911192516e-06, "learning_rate": 0.005091225710121543, "loss": 2.6363, "step": 13911 }, { "crossentropy": 2.4990720748901367, "epoch": 0.5043503480278422, "grad_norm": 0.02757669985294342, "grad_norm_var": 1.9208515249191998e-06, "learning_rate": 0.00509064471851181, "loss": 2.5632, "step": 13912 }, { "crossentropy": 2.4646155834198, "epoch": 0.5043866009280742, "grad_norm": 0.02641787938773632, "grad_norm_var": 6.800129180825707e-07, "learning_rate": 0.005090063725677782, "loss": 2.4969, "step": 13913 }, { "crossentropy": 2.5942845344543457, "epoch": 0.5044228538283063, "grad_norm": 0.027943283319473267, "grad_norm_var": 6.898496173501998e-07, "learning_rate": 0.0050894827316273076, "loss": 2.6331, "step": 13914 }, { "crossentropy": 2.6904635429382324, "epoch": 0.5044591067285383, "grad_norm": 0.02759493701159954, "grad_norm_var": 6.875796810499318e-07, "learning_rate": 0.005088901736368234, "loss": 2.6279, "step": 13915 }, { "crossentropy": 2.5122764110565186, "epoch": 0.5044953596287703, "grad_norm": 0.02696603536605835, "grad_norm_var": 6.398186360674629e-07, "learning_rate": 0.005088320739908405, "loss": 2.5533, "step": 13916 }, { "crossentropy": 2.563786745071411, "epoch": 0.5045316125290024, "grad_norm": 0.028370369225740433, "grad_norm_var": 6.960360513385648e-07, "learning_rate": 0.005087739742255673, "loss": 2.6497, "step": 13917 }, { "crossentropy": 2.6403634548187256, "epoch": 0.5045678654292344, "grad_norm": 0.02835528366267681, "grad_norm_var": 7.297625387735526e-07, "learning_rate": 0.00508715874341788, "loss": 2.518, "step": 13918 }, { "crossentropy": 2.7168986797332764, "epoch": 0.5046041183294664, "grad_norm": 0.026542704552412033, "grad_norm_var": 7.715996294439896e-07, "learning_rate": 0.005086577743402878, "loss": 2.5765, "step": 13919 }, { "crossentropy": 2.500903367996216, "epoch": 0.5046403712296984, "grad_norm": 0.0265720896422863, "grad_norm_var": 5.703256780295943e-07, "learning_rate": 0.005085996742218511, "loss": 2.475, "step": 13920 }, { "crossentropy": 2.657565116882324, "epoch": 0.5046766241299304, "grad_norm": 0.02851586975157261, "grad_norm_var": 6.15852578246057e-07, "learning_rate": 0.005085415739872629, "loss": 2.6881, "step": 13921 }, { "crossentropy": 2.7024288177490234, "epoch": 0.5047128770301624, "grad_norm": 0.029640652239322662, "grad_norm_var": 8.111573336168209e-07, "learning_rate": 0.005084834736373075, "loss": 2.6808, "step": 13922 }, { "crossentropy": 2.6053669452667236, "epoch": 0.5047491299303944, "grad_norm": 0.027838323265314102, "grad_norm_var": 7.916345880862174e-07, "learning_rate": 0.005084253731727701, "loss": 2.6406, "step": 13923 }, { "crossentropy": 2.5253050327301025, "epoch": 0.5047853828306265, "grad_norm": 0.027161739766597748, "grad_norm_var": 8.051494213297116e-07, "learning_rate": 0.005083672725944354, "loss": 2.5046, "step": 13924 }, { "crossentropy": 2.5863747596740723, "epoch": 0.5048216357308585, "grad_norm": 0.028680574148893356, "grad_norm_var": 8.471927185941941e-07, "learning_rate": 0.005083091719030878, "loss": 2.6409, "step": 13925 }, { "crossentropy": 2.594864845275879, "epoch": 0.5048578886310905, "grad_norm": 0.027080554515123367, "grad_norm_var": 8.06849637704996e-07, "learning_rate": 0.0050825107109951214, "loss": 2.5268, "step": 13926 }, { "crossentropy": 2.6607284545898438, "epoch": 0.5048941415313225, "grad_norm": 0.027084002271294594, "grad_norm_var": 8.118318098416921e-07, "learning_rate": 0.005081929701844934, "loss": 2.5891, "step": 13927 }, { "crossentropy": 2.544952869415283, "epoch": 0.5049303944315545, "grad_norm": 0.02584179677069187, "grad_norm_var": 1.0160526507996722e-06, "learning_rate": 0.005081348691588162, "loss": 2.5457, "step": 13928 }, { "crossentropy": 2.560152769088745, "epoch": 0.5049666473317865, "grad_norm": 0.02650178223848343, "grad_norm_var": 1.003963121679038e-06, "learning_rate": 0.005080767680232652, "loss": 2.548, "step": 13929 }, { "crossentropy": 2.5186734199523926, "epoch": 0.5050029002320185, "grad_norm": 0.027930475771427155, "grad_norm_var": 1.0032900339320411e-06, "learning_rate": 0.00508018666778625, "loss": 2.5519, "step": 13930 }, { "crossentropy": 2.451953172683716, "epoch": 0.5050391531322506, "grad_norm": 0.027869796380400658, "grad_norm_var": 1.0099399064919379e-06, "learning_rate": 0.005079605654256807, "loss": 2.5222, "step": 13931 }, { "crossentropy": 2.3903486728668213, "epoch": 0.5050754060324826, "grad_norm": 0.02923869714140892, "grad_norm_var": 1.152918354152529e-06, "learning_rate": 0.00507902463965217, "loss": 2.4588, "step": 13932 }, { "crossentropy": 2.5408272743225098, "epoch": 0.5051116589327146, "grad_norm": 0.02688092365860939, "grad_norm_var": 1.1587476157039981e-06, "learning_rate": 0.0050784436239801846, "loss": 2.5393, "step": 13933 }, { "crossentropy": 2.275336980819702, "epoch": 0.5051479118329466, "grad_norm": 0.02833605743944645, "grad_norm_var": 1.1568562237927362e-06, "learning_rate": 0.0050778626072486965, "loss": 2.4502, "step": 13934 }, { "crossentropy": 2.752617359161377, "epoch": 0.5051841647331786, "grad_norm": 0.027047645300626755, "grad_norm_var": 1.1011203965008519e-06, "learning_rate": 0.005077281589465557, "loss": 2.6563, "step": 13935 }, { "crossentropy": 2.5148346424102783, "epoch": 0.5052204176334106, "grad_norm": 0.027943303808569908, "grad_norm_var": 1.0236074905055231e-06, "learning_rate": 0.005076700570638616, "loss": 2.5123, "step": 13936 }, { "crossentropy": 2.552872896194458, "epoch": 0.5052566705336426, "grad_norm": 0.027593011036515236, "grad_norm_var": 9.794619194457171e-07, "learning_rate": 0.005076119550775714, "loss": 2.5819, "step": 13937 }, { "crossentropy": 2.579911470413208, "epoch": 0.5052929234338747, "grad_norm": 0.028221461921930313, "grad_norm_var": 7.318466464146615e-07, "learning_rate": 0.005075538529884701, "loss": 2.5474, "step": 13938 }, { "crossentropy": 2.7331321239471436, "epoch": 0.5053291763341067, "grad_norm": 0.028547445312142372, "grad_norm_var": 7.87875810348198e-07, "learning_rate": 0.005074957507973428, "loss": 2.6947, "step": 13939 }, { "crossentropy": 2.5676472187042236, "epoch": 0.5053654292343387, "grad_norm": 0.02649574726819992, "grad_norm_var": 8.565084182825805e-07, "learning_rate": 0.00507437648504974, "loss": 2.4731, "step": 13940 }, { "crossentropy": 2.5973422527313232, "epoch": 0.5054016821345708, "grad_norm": 0.026473460718989372, "grad_norm_var": 8.373330044023538e-07, "learning_rate": 0.005073795461121483, "loss": 2.5895, "step": 13941 }, { "crossentropy": 2.649029493331909, "epoch": 0.5054379350348028, "grad_norm": 0.026898210868239403, "grad_norm_var": 8.482202370428768e-07, "learning_rate": 0.005073214436196506, "loss": 2.598, "step": 13942 }, { "crossentropy": 2.426480531692505, "epoch": 0.5054741879350348, "grad_norm": 0.02746649831533432, "grad_norm_var": 8.396425680676685e-07, "learning_rate": 0.0050726334102826585, "loss": 2.4895, "step": 13943 }, { "crossentropy": 2.5479886531829834, "epoch": 0.5055104408352669, "grad_norm": 0.026155581697821617, "grad_norm_var": 7.782866881258487e-07, "learning_rate": 0.005072052383387786, "loss": 2.5048, "step": 13944 }, { "crossentropy": 2.701275587081909, "epoch": 0.5055466937354989, "grad_norm": 0.026766471564769745, "grad_norm_var": 7.483185336747943e-07, "learning_rate": 0.005071471355519738, "loss": 2.7012, "step": 13945 }, { "crossentropy": 2.594034194946289, "epoch": 0.5055829466357309, "grad_norm": 0.02697942964732647, "grad_norm_var": 7.491904986679873e-07, "learning_rate": 0.005070890326686358, "loss": 2.6646, "step": 13946 }, { "crossentropy": 2.6177802085876465, "epoch": 0.5056191995359629, "grad_norm": 0.027053071185946465, "grad_norm_var": 7.432177801904346e-07, "learning_rate": 0.005070309296895499, "loss": 2.6164, "step": 13947 }, { "crossentropy": 2.4761552810668945, "epoch": 0.5056554524361949, "grad_norm": 0.02845313400030136, "grad_norm_var": 5.872153240817398e-07, "learning_rate": 0.005069728266155005, "loss": 2.5133, "step": 13948 }, { "crossentropy": 2.538626194000244, "epoch": 0.5056917053364269, "grad_norm": 0.027619658038020134, "grad_norm_var": 5.768966369522343e-07, "learning_rate": 0.005069147234472726, "loss": 2.6453, "step": 13949 }, { "crossentropy": 2.570300340652466, "epoch": 0.505727958236659, "grad_norm": 0.02766324020922184, "grad_norm_var": 5.192552540558374e-07, "learning_rate": 0.005068566201856507, "loss": 2.5538, "step": 13950 }, { "crossentropy": 2.656766891479492, "epoch": 0.505764211136891, "grad_norm": 0.02683429606258869, "grad_norm_var": 5.303052606875213e-07, "learning_rate": 0.005067985168314199, "loss": 2.6748, "step": 13951 }, { "crossentropy": 2.5952513217926025, "epoch": 0.505800464037123, "grad_norm": 0.036568425595760345, "grad_norm_var": 5.89349602597757e-06, "learning_rate": 0.005067404133853648, "loss": 2.5596, "step": 13952 }, { "crossentropy": 2.6553447246551514, "epoch": 0.505836716937355, "grad_norm": 0.026406005024909973, "grad_norm_var": 6.024101426428313e-06, "learning_rate": 0.005066823098482703, "loss": 2.5449, "step": 13953 }, { "crossentropy": 2.471635341644287, "epoch": 0.505872969837587, "grad_norm": 0.02681351825594902, "grad_norm_var": 6.066554731464395e-06, "learning_rate": 0.005066242062209208, "loss": 2.5869, "step": 13954 }, { "crossentropy": 2.649571418762207, "epoch": 0.505909222737819, "grad_norm": 0.028366822749376297, "grad_norm_var": 6.048175991331626e-06, "learning_rate": 0.005065661025041014, "loss": 2.6205, "step": 13955 }, { "crossentropy": 2.502389907836914, "epoch": 0.505945475638051, "grad_norm": 0.026548659428954124, "grad_norm_var": 6.039937226748439e-06, "learning_rate": 0.005065079986985969, "loss": 2.5522, "step": 13956 }, { "crossentropy": 2.641329526901245, "epoch": 0.505981728538283, "grad_norm": 0.027101948857307434, "grad_norm_var": 5.962541788778521e-06, "learning_rate": 0.005064498948051921, "loss": 2.6105, "step": 13957 }, { "crossentropy": 2.4547131061553955, "epoch": 0.5060179814385151, "grad_norm": 0.027179867029190063, "grad_norm_var": 5.93622764790042e-06, "learning_rate": 0.005063917908246714, "loss": 2.522, "step": 13958 }, { "crossentropy": 2.6671199798583984, "epoch": 0.5060542343387471, "grad_norm": 0.02589779533445835, "grad_norm_var": 6.1490212526220275e-06, "learning_rate": 0.005063336867578201, "loss": 2.5937, "step": 13959 }, { "crossentropy": 2.669227123260498, "epoch": 0.5060904872389791, "grad_norm": 0.027139466255903244, "grad_norm_var": 6.01341340055402e-06, "learning_rate": 0.005062755826054226, "loss": 2.6388, "step": 13960 }, { "crossentropy": 2.467593193054199, "epoch": 0.5061267401392111, "grad_norm": 0.027300097048282623, "grad_norm_var": 5.963937099508311e-06, "learning_rate": 0.0050621747836826405, "loss": 2.4884, "step": 13961 }, { "crossentropy": 2.5881266593933105, "epoch": 0.5061629930394431, "grad_norm": 0.029739687219262123, "grad_norm_var": 6.158244773958006e-06, "learning_rate": 0.005061593740471287, "loss": 2.5274, "step": 13962 }, { "crossentropy": 2.390659809112549, "epoch": 0.5061992459396751, "grad_norm": 0.026402834802865982, "grad_norm_var": 6.259645486786312e-06, "learning_rate": 0.005061012696428019, "loss": 2.488, "step": 13963 }, { "crossentropy": 2.6106605529785156, "epoch": 0.5062354988399071, "grad_norm": 0.027443857863545418, "grad_norm_var": 6.245809008472213e-06, "learning_rate": 0.005060431651560681, "loss": 2.627, "step": 13964 }, { "crossentropy": 2.557727336883545, "epoch": 0.5062717517401392, "grad_norm": 0.026880227029323578, "grad_norm_var": 6.299155158355496e-06, "learning_rate": 0.005059850605877122, "loss": 2.4828, "step": 13965 }, { "crossentropy": 2.551992177963257, "epoch": 0.5063080046403712, "grad_norm": 0.027490634471178055, "grad_norm_var": 6.3034263570396855e-06, "learning_rate": 0.00505926955938519, "loss": 2.514, "step": 13966 }, { "crossentropy": 2.4619550704956055, "epoch": 0.5063442575406032, "grad_norm": 0.02709820494055748, "grad_norm_var": 6.275306669386154e-06, "learning_rate": 0.005058688512092733, "loss": 2.5228, "step": 13967 }, { "crossentropy": 2.5738940238952637, "epoch": 0.5063805104408353, "grad_norm": 0.02741464599967003, "grad_norm_var": 7.782018520916442e-07, "learning_rate": 0.005058107464007597, "loss": 2.5528, "step": 13968 }, { "crossentropy": 2.6765260696411133, "epoch": 0.5064167633410673, "grad_norm": 0.028706157580018044, "grad_norm_var": 8.648975202340021e-07, "learning_rate": 0.005057526415137633, "loss": 2.6497, "step": 13969 }, { "crossentropy": 2.708192825317383, "epoch": 0.5064530162412993, "grad_norm": 0.027396978810429573, "grad_norm_var": 8.448061600323242e-07, "learning_rate": 0.005056945365490687, "loss": 2.7177, "step": 13970 }, { "crossentropy": 2.662442445755005, "epoch": 0.5064892691415314, "grad_norm": 0.027152476832270622, "grad_norm_var": 7.774738747694561e-07, "learning_rate": 0.005056364315074609, "loss": 2.5778, "step": 13971 }, { "crossentropy": 2.460662603378296, "epoch": 0.5065255220417634, "grad_norm": 0.02625049091875553, "grad_norm_var": 8.131329711644758e-07, "learning_rate": 0.005055783263897244, "loss": 2.553, "step": 13972 }, { "crossentropy": 2.533081293106079, "epoch": 0.5065617749419954, "grad_norm": 0.027199771255254745, "grad_norm_var": 8.113146799860882e-07, "learning_rate": 0.005055202211966441, "loss": 2.5448, "step": 13973 }, { "crossentropy": 2.51202130317688, "epoch": 0.5065980278422274, "grad_norm": 0.030028248205780983, "grad_norm_var": 1.275305005023613e-06, "learning_rate": 0.005054621159290048, "loss": 2.6041, "step": 13974 }, { "crossentropy": 2.657179355621338, "epoch": 0.5066342807424594, "grad_norm": 0.03138270601630211, "grad_norm_var": 2.0047972525844324e-06, "learning_rate": 0.005054040105875914, "loss": 2.5928, "step": 13975 }, { "crossentropy": 2.485675096511841, "epoch": 0.5066705336426914, "grad_norm": 0.027137085795402527, "grad_norm_var": 2.005011749504658e-06, "learning_rate": 0.005053459051731887, "loss": 2.5274, "step": 13976 }, { "crossentropy": 2.395724058151245, "epoch": 0.5067067865429234, "grad_norm": 0.02945387363433838, "grad_norm_var": 2.147354348332288e-06, "learning_rate": 0.005052877996865814, "loss": 2.4537, "step": 13977 }, { "crossentropy": 2.5647268295288086, "epoch": 0.5067430394431555, "grad_norm": 0.028277790173888206, "grad_norm_var": 1.9318111967008086e-06, "learning_rate": 0.005052296941285542, "loss": 2.5251, "step": 13978 }, { "crossentropy": 2.5665178298950195, "epoch": 0.5067792923433875, "grad_norm": 0.030191466212272644, "grad_norm_var": 2.09422060896019e-06, "learning_rate": 0.005051715884998922, "loss": 2.5651, "step": 13979 }, { "crossentropy": 2.517352342605591, "epoch": 0.5068155452436195, "grad_norm": 0.027810342609882355, "grad_norm_var": 2.0708442135052017e-06, "learning_rate": 0.0050511348280138, "loss": 2.5351, "step": 13980 }, { "crossentropy": 2.669818639755249, "epoch": 0.5068517981438515, "grad_norm": 0.028919504955410957, "grad_norm_var": 1.994492227756989e-06, "learning_rate": 0.005050553770338024, "loss": 2.6607, "step": 13981 }, { "crossentropy": 2.61747407913208, "epoch": 0.5068880510440835, "grad_norm": 0.029630839824676514, "grad_norm_var": 2.0656774781445714e-06, "learning_rate": 0.005049972711979444, "loss": 2.7085, "step": 13982 }, { "crossentropy": 2.5451619625091553, "epoch": 0.5069243039443155, "grad_norm": 0.027688369154930115, "grad_norm_var": 1.9867279008181957e-06, "learning_rate": 0.005049391652945906, "loss": 2.6282, "step": 13983 }, { "crossentropy": 2.5700736045837402, "epoch": 0.5069605568445475, "grad_norm": 0.027604103088378906, "grad_norm_var": 1.9637002066364886e-06, "learning_rate": 0.005048810593245258, "loss": 2.6147, "step": 13984 }, { "crossentropy": 2.6522955894470215, "epoch": 0.5069968097447796, "grad_norm": 0.029185781255364418, "grad_norm_var": 1.995936888044798e-06, "learning_rate": 0.00504822953288535, "loss": 2.5889, "step": 13985 }, { "crossentropy": 2.46823787689209, "epoch": 0.5070330626450116, "grad_norm": 0.027085458859801292, "grad_norm_var": 2.046025577092852e-06, "learning_rate": 0.005047648471874028, "loss": 2.4954, "step": 13986 }, { "crossentropy": 2.5271875858306885, "epoch": 0.5070693155452436, "grad_norm": 0.02798011526465416, "grad_norm_var": 1.9470442192343526e-06, "learning_rate": 0.005047067410219141, "loss": 2.5787, "step": 13987 }, { "crossentropy": 2.611494541168213, "epoch": 0.5071055684454756, "grad_norm": 0.02670414373278618, "grad_norm_var": 1.8244986200710664e-06, "learning_rate": 0.005046486347928539, "loss": 2.5802, "step": 13988 }, { "crossentropy": 2.6333189010620117, "epoch": 0.5071418213457076, "grad_norm": 0.026179831475019455, "grad_norm_var": 2.0687130716543485e-06, "learning_rate": 0.0050459052850100675, "loss": 2.5521, "step": 13989 }, { "crossentropy": 2.5853352546691895, "epoch": 0.5071780742459396, "grad_norm": 0.02812686748802662, "grad_norm_var": 1.8954979909784383e-06, "learning_rate": 0.005045324221471574, "loss": 2.483, "step": 13990 }, { "crossentropy": 2.4432637691497803, "epoch": 0.5072143271461717, "grad_norm": 0.027589814737439156, "grad_norm_var": 1.2532877848935032e-06, "learning_rate": 0.00504474315732091, "loss": 2.4416, "step": 13991 }, { "crossentropy": 2.656045436859131, "epoch": 0.5072505800464037, "grad_norm": 0.028712177649140358, "grad_norm_var": 1.2065754612419622e-06, "learning_rate": 0.005044162092565921, "loss": 2.7183, "step": 13992 }, { "crossentropy": 2.547550678253174, "epoch": 0.5072868329466357, "grad_norm": 0.028019223362207413, "grad_norm_var": 1.094653360372251e-06, "learning_rate": 0.005043581027214456, "loss": 2.5347, "step": 13993 }, { "crossentropy": 2.658001184463501, "epoch": 0.5073230858468677, "grad_norm": 0.02717006951570511, "grad_norm_var": 1.1460616791518496e-06, "learning_rate": 0.0050429999612743625, "loss": 2.6172, "step": 13994 }, { "crossentropy": 2.5329301357269287, "epoch": 0.5073593387470998, "grad_norm": 0.02722582407295704, "grad_norm_var": 8.439854692152687e-07, "learning_rate": 0.00504241889475349, "loss": 2.6037, "step": 13995 }, { "crossentropy": 2.58150053024292, "epoch": 0.5073955916473318, "grad_norm": 0.02695493772625923, "grad_norm_var": 8.944723345055318e-07, "learning_rate": 0.005041837827659685, "loss": 2.7227, "step": 13996 }, { "crossentropy": 2.7174665927886963, "epoch": 0.5074318445475638, "grad_norm": 0.02713189274072647, "grad_norm_var": 8.270208765434953e-07, "learning_rate": 0.005041256760000798, "loss": 2.6827, "step": 13997 }, { "crossentropy": 2.363328456878662, "epoch": 0.5074680974477959, "grad_norm": 0.026551764458417892, "grad_norm_var": 6.214689382881555e-07, "learning_rate": 0.005040675691784675, "loss": 2.4738, "step": 13998 }, { "crossentropy": 2.694010019302368, "epoch": 0.5075043503480279, "grad_norm": 0.03046296536922455, "grad_norm_var": 1.1743766799162761e-06, "learning_rate": 0.005040094623019165, "loss": 2.6878, "step": 13999 }, { "crossentropy": 2.7227187156677246, "epoch": 0.5075406032482599, "grad_norm": 0.028207281604409218, "grad_norm_var": 1.1919920937323066e-06, "learning_rate": 0.005039513553712116, "loss": 2.6647, "step": 14000 }, { "crossentropy": 2.329241991043091, "epoch": 0.5075768561484919, "grad_norm": 0.02742275968194008, "grad_norm_var": 1.038290565783378e-06, "learning_rate": 0.005038932483871378, "loss": 2.5297, "step": 14001 }, { "crossentropy": 2.537813186645508, "epoch": 0.5076131090487239, "grad_norm": 0.027799366042017937, "grad_norm_var": 1.0216120156391594e-06, "learning_rate": 0.0050383514135047954, "loss": 2.5518, "step": 14002 }, { "crossentropy": 2.480675458908081, "epoch": 0.5076493619489559, "grad_norm": 0.026874369010329247, "grad_norm_var": 1.0478761392283676e-06, "learning_rate": 0.005037770342620222, "loss": 2.5031, "step": 14003 }, { "crossentropy": 2.5403902530670166, "epoch": 0.507685614849188, "grad_norm": 0.027375467121601105, "grad_norm_var": 9.984663824246503e-07, "learning_rate": 0.0050371892712255, "loss": 2.5502, "step": 14004 }, { "crossentropy": 2.717736005783081, "epoch": 0.50772186774942, "grad_norm": 0.026866590604186058, "grad_norm_var": 8.967309438616284e-07, "learning_rate": 0.005036608199328481, "loss": 2.6889, "step": 14005 }, { "crossentropy": 2.7168943881988525, "epoch": 0.507758120649652, "grad_norm": 0.027119871228933334, "grad_norm_var": 8.968481189479723e-07, "learning_rate": 0.005036027126937013, "loss": 2.6621, "step": 14006 }, { "crossentropy": 2.666703224182129, "epoch": 0.507794373549884, "grad_norm": 0.02701089344918728, "grad_norm_var": 9.180233653064181e-07, "learning_rate": 0.005035446054058943, "loss": 2.6909, "step": 14007 }, { "crossentropy": 2.498929500579834, "epoch": 0.507830626450116, "grad_norm": 0.02625315636396408, "grad_norm_var": 9.170657996683077e-07, "learning_rate": 0.005034864980702123, "loss": 2.5741, "step": 14008 }, { "crossentropy": 2.6216604709625244, "epoch": 0.507866879350348, "grad_norm": 0.027477620169520378, "grad_norm_var": 8.908922950704902e-07, "learning_rate": 0.005034283906874396, "loss": 2.6345, "step": 14009 }, { "crossentropy": 2.482038736343384, "epoch": 0.50790313225058, "grad_norm": 0.02602490596473217, "grad_norm_var": 1.0032370721400433e-06, "learning_rate": 0.005033702832583614, "loss": 2.4309, "step": 14010 }, { "crossentropy": 2.6627466678619385, "epoch": 0.507939385150812, "grad_norm": 0.025528667494654655, "grad_norm_var": 1.1994729905322073e-06, "learning_rate": 0.005033121757837623, "loss": 2.6304, "step": 14011 }, { "crossentropy": 2.5397350788116455, "epoch": 0.5079756380510441, "grad_norm": 0.02714225836098194, "grad_norm_var": 1.1957599823370166e-06, "learning_rate": 0.005032540682644275, "loss": 2.5454, "step": 14012 }, { "crossentropy": 2.5815436840057373, "epoch": 0.5080118909512761, "grad_norm": 0.026879532262682915, "grad_norm_var": 1.202136814901685e-06, "learning_rate": 0.005031959607011412, "loss": 2.5806, "step": 14013 }, { "crossentropy": 2.60718035697937, "epoch": 0.5080481438515081, "grad_norm": 0.02882666513323784, "grad_norm_var": 1.3328017486523754e-06, "learning_rate": 0.005031378530946888, "loss": 2.6136, "step": 14014 }, { "crossentropy": 2.628610849380493, "epoch": 0.5080843967517401, "grad_norm": 0.02782057784497738, "grad_norm_var": 6.652208256617178e-07, "learning_rate": 0.005030797454458549, "loss": 2.5555, "step": 14015 }, { "crossentropy": 2.5830702781677246, "epoch": 0.5081206496519721, "grad_norm": 0.029387280344963074, "grad_norm_var": 9.163296108717518e-07, "learning_rate": 0.0050302163775542445, "loss": 2.5963, "step": 14016 }, { "crossentropy": 2.463308095932007, "epoch": 0.5081569025522041, "grad_norm": 0.026855867356061935, "grad_norm_var": 9.224592220653689e-07, "learning_rate": 0.005029635300241821, "loss": 2.4368, "step": 14017 }, { "crossentropy": 2.6287386417388916, "epoch": 0.5081931554524362, "grad_norm": 0.026869576424360275, "grad_norm_var": 9.025203037640265e-07, "learning_rate": 0.005029054222529127, "loss": 2.5949, "step": 14018 }, { "crossentropy": 2.699263572692871, "epoch": 0.5082294083526682, "grad_norm": 0.026277124881744385, "grad_norm_var": 9.463317735916923e-07, "learning_rate": 0.005028473144424013, "loss": 2.5995, "step": 14019 }, { "crossentropy": 2.5559771060943604, "epoch": 0.5082656612529002, "grad_norm": 0.028288329020142555, "grad_norm_var": 1.0310596834618106e-06, "learning_rate": 0.0050278920659343265, "loss": 2.524, "step": 14020 }, { "crossentropy": 2.6427204608917236, "epoch": 0.5083019141531323, "grad_norm": 0.027976511046290398, "grad_norm_var": 1.063995981349106e-06, "learning_rate": 0.0050273109870679145, "loss": 2.5971, "step": 14021 }, { "crossentropy": 2.5727319717407227, "epoch": 0.5083381670533643, "grad_norm": 0.026983439922332764, "grad_norm_var": 1.067229553939119e-06, "learning_rate": 0.005026729907832626, "loss": 2.5709, "step": 14022 }, { "crossentropy": 2.468247890472412, "epoch": 0.5083744199535963, "grad_norm": 0.026630502194166183, "grad_norm_var": 1.087140009700671e-06, "learning_rate": 0.005026148828236309, "loss": 2.4779, "step": 14023 }, { "crossentropy": 2.512324094772339, "epoch": 0.5084106728538283, "grad_norm": 0.02722904272377491, "grad_norm_var": 1.0232815417654614e-06, "learning_rate": 0.005025567748286814, "loss": 2.5617, "step": 14024 }, { "crossentropy": 2.5184850692749023, "epoch": 0.5084469257540604, "grad_norm": 0.026566628366708755, "grad_norm_var": 1.0490050423408078e-06, "learning_rate": 0.005024986667991987, "loss": 2.5495, "step": 14025 }, { "crossentropy": 2.502652883529663, "epoch": 0.5084831786542924, "grad_norm": 0.02747097983956337, "grad_norm_var": 9.520836298359252e-07, "learning_rate": 0.0050244055873596765, "loss": 2.4566, "step": 14026 }, { "crossentropy": 2.4443373680114746, "epoch": 0.5085194315545244, "grad_norm": 0.026157520711421967, "grad_norm_var": 8.286297646556418e-07, "learning_rate": 0.005023824506397731, "loss": 2.5145, "step": 14027 }, { "crossentropy": 2.7026801109313965, "epoch": 0.5085556844547564, "grad_norm": 0.026532839983701706, "grad_norm_var": 8.675123862862764e-07, "learning_rate": 0.005023243425114001, "loss": 2.7653, "step": 14028 }, { "crossentropy": 2.682586431503296, "epoch": 0.5085919373549884, "grad_norm": 0.026522260159254074, "grad_norm_var": 8.953779509109083e-07, "learning_rate": 0.005022662343516333, "loss": 2.5969, "step": 14029 }, { "crossentropy": 2.534475088119507, "epoch": 0.5086281902552204, "grad_norm": 0.028539108112454414, "grad_norm_var": 8.410420903997058e-07, "learning_rate": 0.005022081261612575, "loss": 2.5599, "step": 14030 }, { "crossentropy": 2.632401466369629, "epoch": 0.5086644431554525, "grad_norm": 0.03115684911608696, "grad_norm_var": 1.7875336624802058e-06, "learning_rate": 0.005021500179410575, "loss": 2.6876, "step": 14031 }, { "crossentropy": 2.7596983909606934, "epoch": 0.5087006960556845, "grad_norm": 0.030934154987335205, "grad_norm_var": 2.3335054540000896e-06, "learning_rate": 0.005020919096918185, "loss": 2.6445, "step": 14032 }, { "crossentropy": 2.718222141265869, "epoch": 0.5087369489559165, "grad_norm": 0.030244963243603706, "grad_norm_var": 2.7323275365694615e-06, "learning_rate": 0.00502033801414325, "loss": 2.6747, "step": 14033 }, { "crossentropy": 2.4125454425811768, "epoch": 0.5087732018561485, "grad_norm": 0.027129564434289932, "grad_norm_var": 2.705209275154667e-06, "learning_rate": 0.005019756931093617, "loss": 2.4722, "step": 14034 }, { "crossentropy": 2.547492027282715, "epoch": 0.5088094547563805, "grad_norm": 0.02615426480770111, "grad_norm_var": 2.7309354284791997e-06, "learning_rate": 0.00501917584777714, "loss": 2.5756, "step": 14035 }, { "crossentropy": 2.5199389457702637, "epoch": 0.5088457076566125, "grad_norm": 0.03926629573106766, "grad_norm_var": 1.1003844780491101e-05, "learning_rate": 0.005018594764201662, "loss": 2.457, "step": 14036 }, { "crossentropy": 2.557427167892456, "epoch": 0.5088819605568445, "grad_norm": 0.026422392576932907, "grad_norm_var": 1.1256734009271865e-05, "learning_rate": 0.005018013680375035, "loss": 2.5574, "step": 14037 }, { "crossentropy": 2.6523401737213135, "epoch": 0.5089182134570766, "grad_norm": 0.026438351720571518, "grad_norm_var": 1.1376171599597112e-05, "learning_rate": 0.005017432596305105, "loss": 2.4945, "step": 14038 }, { "crossentropy": 2.6351425647735596, "epoch": 0.5089544663573086, "grad_norm": 0.025114521384239197, "grad_norm_var": 1.1864791665635897e-05, "learning_rate": 0.0050168515119997205, "loss": 2.5793, "step": 14039 }, { "crossentropy": 2.6213366985321045, "epoch": 0.5089907192575406, "grad_norm": 0.02736630290746689, "grad_norm_var": 1.1847421843560041e-05, "learning_rate": 0.005016270427466731, "loss": 2.6683, "step": 14040 }, { "crossentropy": 2.469179630279541, "epoch": 0.5090269721577726, "grad_norm": 0.02677481807768345, "grad_norm_var": 1.1803373200906204e-05, "learning_rate": 0.005015689342713987, "loss": 2.5243, "step": 14041 }, { "crossentropy": 2.660162925720215, "epoch": 0.5090632250580046, "grad_norm": 0.02714676968753338, "grad_norm_var": 1.1844226615516315e-05, "learning_rate": 0.005015108257749332, "loss": 2.672, "step": 14042 }, { "crossentropy": 2.6141774654388428, "epoch": 0.5090994779582366, "grad_norm": 0.026649875566363335, "grad_norm_var": 1.1722418086577849e-05, "learning_rate": 0.005014527172580619, "loss": 2.6078, "step": 14043 }, { "crossentropy": 2.56023907661438, "epoch": 0.5091357308584686, "grad_norm": 0.02723154053092003, "grad_norm_var": 1.159066855733573e-05, "learning_rate": 0.005013946087215693, "loss": 2.6683, "step": 14044 }, { "crossentropy": 2.6190199851989746, "epoch": 0.5091719837587007, "grad_norm": 0.029336918145418167, "grad_norm_var": 1.1411798552866701e-05, "learning_rate": 0.005013365001662405, "loss": 2.6637, "step": 14045 }, { "crossentropy": 2.4961354732513428, "epoch": 0.5092082366589327, "grad_norm": 0.026388268917798996, "grad_norm_var": 1.1688042574044671e-05, "learning_rate": 0.0050127839159286, "loss": 2.5527, "step": 14046 }, { "crossentropy": 2.4687747955322266, "epoch": 0.5092444895591647, "grad_norm": 0.032307982444763184, "grad_norm_var": 1.220017444378459e-05, "learning_rate": 0.005012202830022131, "loss": 2.6044, "step": 14047 }, { "crossentropy": 2.5426461696624756, "epoch": 0.5092807424593968, "grad_norm": 0.02719144895672798, "grad_norm_var": 1.1826864487190075e-05, "learning_rate": 0.005011621743950844, "loss": 2.51, "step": 14048 }, { "crossentropy": 2.4872541427612305, "epoch": 0.5093169953596288, "grad_norm": 0.027648424729704857, "grad_norm_var": 1.1539490555149273e-05, "learning_rate": 0.005011040657722588, "loss": 2.5108, "step": 14049 }, { "crossentropy": 2.436400890350342, "epoch": 0.5093532482598608, "grad_norm": 0.026111872866749763, "grad_norm_var": 1.1727147761444881e-05, "learning_rate": 0.005010459571345211, "loss": 2.5059, "step": 14050 }, { "crossentropy": 2.6657168865203857, "epoch": 0.5093895011600929, "grad_norm": 0.027002178132534027, "grad_norm_var": 1.156659210756355e-05, "learning_rate": 0.0050098784848265614, "loss": 2.6012, "step": 14051 }, { "crossentropy": 2.4369804859161377, "epoch": 0.5094257540603249, "grad_norm": 0.025705307722091675, "grad_norm_var": 2.7343930269715556e-06, "learning_rate": 0.0050092973981744875, "loss": 2.533, "step": 14052 }, { "crossentropy": 2.767667293548584, "epoch": 0.5094620069605569, "grad_norm": 0.0275119636207819, "grad_norm_var": 2.6989192241199644e-06, "learning_rate": 0.005008716311396838, "loss": 2.6283, "step": 14053 }, { "crossentropy": 2.584364175796509, "epoch": 0.5094982598607889, "grad_norm": 0.028166912496089935, "grad_norm_var": 2.699658010047364e-06, "learning_rate": 0.005008135224501462, "loss": 2.5264, "step": 14054 }, { "crossentropy": 2.478935956954956, "epoch": 0.5095345127610209, "grad_norm": 0.03063133731484413, "grad_norm_var": 2.954965008067761e-06, "learning_rate": 0.005007554137496207, "loss": 2.4926, "step": 14055 }, { "crossentropy": 2.6341233253479004, "epoch": 0.5095707656612529, "grad_norm": 0.026341792196035385, "grad_norm_var": 3.0659101774274206e-06, "learning_rate": 0.005006973050388922, "loss": 2.6258, "step": 14056 }, { "crossentropy": 2.5097711086273193, "epoch": 0.5096070185614849, "grad_norm": 0.02724926546216011, "grad_norm_var": 3.025613910917245e-06, "learning_rate": 0.005006391963187457, "loss": 2.5095, "step": 14057 }, { "crossentropy": 2.5766797065734863, "epoch": 0.509643271461717, "grad_norm": 0.02702259086072445, "grad_norm_var": 3.0351393427430755e-06, "learning_rate": 0.005005810875899657, "loss": 2.6151, "step": 14058 }, { "crossentropy": 2.3804850578308105, "epoch": 0.509679524361949, "grad_norm": 0.02710087224841118, "grad_norm_var": 2.987344233537405e-06, "learning_rate": 0.005005229788533372, "loss": 2.4301, "step": 14059 }, { "crossentropy": 2.6770317554473877, "epoch": 0.509715777262181, "grad_norm": 0.02698986977338791, "grad_norm_var": 3.0055834454848286e-06, "learning_rate": 0.005004648701096452, "loss": 2.563, "step": 14060 }, { "crossentropy": 2.4882254600524902, "epoch": 0.509752030162413, "grad_norm": 0.026045959442853928, "grad_norm_var": 2.9506931566983597e-06, "learning_rate": 0.005004067613596743, "loss": 2.5206, "step": 14061 }, { "crossentropy": 2.7483813762664795, "epoch": 0.509788283062645, "grad_norm": 0.028974343091249466, "grad_norm_var": 2.997928055773792e-06, "learning_rate": 0.005003486526042096, "loss": 2.6231, "step": 14062 }, { "crossentropy": 2.492828845977783, "epoch": 0.509824535962877, "grad_norm": 0.029096025973558426, "grad_norm_var": 1.6372382576242128e-06, "learning_rate": 0.005002905438440358, "loss": 2.5407, "step": 14063 }, { "crossentropy": 2.6255416870117188, "epoch": 0.509860788863109, "grad_norm": 0.02805113047361374, "grad_norm_var": 1.6567288838762253e-06, "learning_rate": 0.005002324350799377, "loss": 2.5979, "step": 14064 }, { "crossentropy": 2.5357797145843506, "epoch": 0.509897041763341, "grad_norm": 0.028410475701093674, "grad_norm_var": 1.7103285743847214e-06, "learning_rate": 0.005001743263127003, "loss": 2.5722, "step": 14065 }, { "crossentropy": 2.532841205596924, "epoch": 0.5099332946635731, "grad_norm": 0.026529720053076744, "grad_norm_var": 1.6424699210800063e-06, "learning_rate": 0.005001162175431083, "loss": 2.5306, "step": 14066 }, { "crossentropy": 2.4885690212249756, "epoch": 0.5099695475638051, "grad_norm": 0.027588006108999252, "grad_norm_var": 1.6209837787542566e-06, "learning_rate": 0.005000581087719466, "loss": 2.5455, "step": 14067 }, { "crossentropy": 2.5565123558044434, "epoch": 0.5100058004640371, "grad_norm": 0.027873622253537178, "grad_norm_var": 1.370393655982828e-06, "learning_rate": 0.005, "loss": 2.5123, "step": 14068 }, { "crossentropy": 2.6872594356536865, "epoch": 0.5100420533642691, "grad_norm": 0.02778623439371586, "grad_norm_var": 1.3673413917664538e-06, "learning_rate": 0.004999418912280535, "loss": 2.6567, "step": 14069 }, { "crossentropy": 2.544419765472412, "epoch": 0.5100783062645011, "grad_norm": 0.02645527757704258, "grad_norm_var": 1.453277158004274e-06, "learning_rate": 0.004998837824568918, "loss": 2.601, "step": 14070 }, { "crossentropy": 2.479182243347168, "epoch": 0.5101145591647331, "grad_norm": 0.026421867311000824, "grad_norm_var": 8.785494807694253e-07, "learning_rate": 0.004998256736872997, "loss": 2.5009, "step": 14071 }, { "crossentropy": 2.553100347518921, "epoch": 0.5101508120649652, "grad_norm": 0.02692870981991291, "grad_norm_var": 8.195324896952127e-07, "learning_rate": 0.004997675649200623, "loss": 2.6048, "step": 14072 }, { "crossentropy": 2.58467435836792, "epoch": 0.5101870649651972, "grad_norm": 0.02761704847216606, "grad_norm_var": 8.20214878660223e-07, "learning_rate": 0.004997094561559644, "loss": 2.5296, "step": 14073 }, { "crossentropy": 2.5687265396118164, "epoch": 0.5102233178654292, "grad_norm": 0.029667852446436882, "grad_norm_var": 1.1136000058922327e-06, "learning_rate": 0.004996513473957904, "loss": 2.5098, "step": 14074 }, { "crossentropy": 2.5220284461975098, "epoch": 0.5102595707656613, "grad_norm": 0.030077317729592323, "grad_norm_var": 1.4707804568764853e-06, "learning_rate": 0.004995932386403257, "loss": 2.6192, "step": 14075 }, { "crossentropy": 2.5910451412200928, "epoch": 0.5102958236658933, "grad_norm": 0.030214490368962288, "grad_norm_var": 1.7800514056383794e-06, "learning_rate": 0.004995351298903549, "loss": 2.5494, "step": 14076 }, { "crossentropy": 2.5792105197906494, "epoch": 0.5103320765661253, "grad_norm": 0.02772422879934311, "grad_norm_var": 1.5224970581414453e-06, "learning_rate": 0.004994770211466628, "loss": 2.5623, "step": 14077 }, { "crossentropy": 2.5515873432159424, "epoch": 0.5103683294663574, "grad_norm": 0.026732338592410088, "grad_norm_var": 1.5718565593871576e-06, "learning_rate": 0.004994189124100344, "loss": 2.5948, "step": 14078 }, { "crossentropy": 2.530543088912964, "epoch": 0.5104045823665894, "grad_norm": 0.027315888553857803, "grad_norm_var": 1.4975204112466773e-06, "learning_rate": 0.0049936080368125435, "loss": 2.5415, "step": 14079 }, { "crossentropy": 2.707411050796509, "epoch": 0.5104408352668214, "grad_norm": 0.026777585968375206, "grad_norm_var": 1.562552951764271e-06, "learning_rate": 0.004993026949611077, "loss": 2.6071, "step": 14080 }, { "crossentropy": 2.4996447563171387, "epoch": 0.5104770881670534, "grad_norm": 0.027618249878287315, "grad_norm_var": 1.5328098011031655e-06, "learning_rate": 0.004992445862503795, "loss": 2.5442, "step": 14081 }, { "crossentropy": 2.5066304206848145, "epoch": 0.5105133410672854, "grad_norm": 0.031767766922712326, "grad_norm_var": 2.424693527663888e-06, "learning_rate": 0.004991864775498539, "loss": 2.6164, "step": 14082 }, { "crossentropy": 2.7222912311553955, "epoch": 0.5105495939675174, "grad_norm": 0.02895408123731613, "grad_norm_var": 2.459837814030678e-06, "learning_rate": 0.004991283688603162, "loss": 2.6573, "step": 14083 }, { "crossentropy": 2.5219922065734863, "epoch": 0.5105858468677494, "grad_norm": 0.028094789013266563, "grad_norm_var": 2.45560642657307e-06, "learning_rate": 0.0049907026018255135, "loss": 2.6208, "step": 14084 }, { "crossentropy": 2.5869836807250977, "epoch": 0.5106220997679815, "grad_norm": 0.027172014117240906, "grad_norm_var": 2.507715998841077e-06, "learning_rate": 0.0049901215151734405, "loss": 2.5385, "step": 14085 }, { "crossentropy": 2.5103671550750732, "epoch": 0.5106583526682135, "grad_norm": 0.02769639529287815, "grad_norm_var": 2.3324424310773207e-06, "learning_rate": 0.004989540428654791, "loss": 2.4917, "step": 14086 }, { "crossentropy": 2.4505221843719482, "epoch": 0.5106946055684455, "grad_norm": 0.027183759957551956, "grad_norm_var": 2.1907522860986253e-06, "learning_rate": 0.004988959342277412, "loss": 2.4792, "step": 14087 }, { "crossentropy": 2.540539264678955, "epoch": 0.5107308584686775, "grad_norm": 0.026655346155166626, "grad_norm_var": 2.242539637176075e-06, "learning_rate": 0.004988378256049156, "loss": 2.5514, "step": 14088 }, { "crossentropy": 2.7377805709838867, "epoch": 0.5107671113689095, "grad_norm": 0.02995280735194683, "grad_norm_var": 2.4006279567340224e-06, "learning_rate": 0.004987797169977868, "loss": 2.5331, "step": 14089 }, { "crossentropy": 2.5083680152893066, "epoch": 0.5108033642691415, "grad_norm": 0.02970140054821968, "grad_norm_var": 2.4065917853208566e-06, "learning_rate": 0.0049872160840714, "loss": 2.4782, "step": 14090 }, { "crossentropy": 2.6552317142486572, "epoch": 0.5108396171693735, "grad_norm": 0.027651982381939888, "grad_norm_var": 2.216433204103342e-06, "learning_rate": 0.004986634998337597, "loss": 2.5525, "step": 14091 }, { "crossentropy": 2.466831922531128, "epoch": 0.5108758700696056, "grad_norm": 0.02638697810471058, "grad_norm_var": 2.104402596895573e-06, "learning_rate": 0.004986053912784309, "loss": 2.5408, "step": 14092 }, { "crossentropy": 2.7286882400512695, "epoch": 0.5109121229698376, "grad_norm": 0.025821279734373093, "grad_norm_var": 2.3909561214736918e-06, "learning_rate": 0.0049854728274193825, "loss": 2.6286, "step": 14093 }, { "crossentropy": 2.4382214546203613, "epoch": 0.5109483758700696, "grad_norm": 0.025875771418213844, "grad_norm_var": 2.5636222196284118e-06, "learning_rate": 0.004984891742250669, "loss": 2.559, "step": 14094 }, { "crossentropy": 2.3444509506225586, "epoch": 0.5109846287703016, "grad_norm": 0.025929037481546402, "grad_norm_var": 2.771340876380329e-06, "learning_rate": 0.004984310657286013, "loss": 2.493, "step": 14095 }, { "crossentropy": 2.5498502254486084, "epoch": 0.5110208816705336, "grad_norm": 0.037905748933553696, "grad_norm_var": 9.138815648656558e-06, "learning_rate": 0.0049837295725332674, "loss": 2.5636, "step": 14096 }, { "crossentropy": 2.5617034435272217, "epoch": 0.5110571345707656, "grad_norm": 0.025742320343852043, "grad_norm_var": 9.553785049514841e-06, "learning_rate": 0.004983148488000279, "loss": 2.4984, "step": 14097 }, { "crossentropy": 2.5247604846954346, "epoch": 0.5110933874709976, "grad_norm": 0.02735593356192112, "grad_norm_var": 8.719064649578834e-06, "learning_rate": 0.004982567403694896, "loss": 2.489, "step": 14098 }, { "crossentropy": 2.5293421745300293, "epoch": 0.5111296403712297, "grad_norm": 0.028217792510986328, "grad_norm_var": 8.65977200125781e-06, "learning_rate": 0.004981986319624966, "loss": 2.5365, "step": 14099 }, { "crossentropy": 2.5600430965423584, "epoch": 0.5111658932714617, "grad_norm": 0.026660650968551636, "grad_norm_var": 8.762345952603246e-06, "learning_rate": 0.004981405235798339, "loss": 2.5719, "step": 14100 }, { "crossentropy": 2.4401774406433105, "epoch": 0.5112021461716937, "grad_norm": 0.02803145721554756, "grad_norm_var": 8.7286044332186e-06, "learning_rate": 0.004980824152222861, "loss": 2.5575, "step": 14101 }, { "crossentropy": 2.6527936458587646, "epoch": 0.5112383990719258, "grad_norm": 0.026150355115532875, "grad_norm_var": 8.924714973621582e-06, "learning_rate": 0.004980243068906383, "loss": 2.5944, "step": 14102 }, { "crossentropy": 2.535423517227173, "epoch": 0.5112746519721578, "grad_norm": 0.026276150718331337, "grad_norm_var": 9.053970114592837e-06, "learning_rate": 0.00497966198585675, "loss": 2.5172, "step": 14103 }, { "crossentropy": 2.588325262069702, "epoch": 0.5113109048723898, "grad_norm": 0.02708677388727665, "grad_norm_var": 9.001502152935223e-06, "learning_rate": 0.004979080903081815, "loss": 2.5385, "step": 14104 }, { "crossentropy": 2.4961016178131104, "epoch": 0.5113471577726219, "grad_norm": 0.025933733209967613, "grad_norm_var": 8.855629143907996e-06, "learning_rate": 0.004978499820589424, "loss": 2.5169, "step": 14105 }, { "crossentropy": 2.5402920246124268, "epoch": 0.5113834106728539, "grad_norm": 0.02682008221745491, "grad_norm_var": 8.546243862687453e-06, "learning_rate": 0.004977918738387426, "loss": 2.5792, "step": 14106 }, { "crossentropy": 2.5653867721557617, "epoch": 0.5114196635730859, "grad_norm": 0.027656884863972664, "grad_norm_var": 8.546432707885498e-06, "learning_rate": 0.004977337656483669, "loss": 2.4876, "step": 14107 }, { "crossentropy": 2.549994707107544, "epoch": 0.5114559164733179, "grad_norm": 0.02766331098973751, "grad_norm_var": 8.481692807259692e-06, "learning_rate": 0.0049767565748860005, "loss": 2.5628, "step": 14108 }, { "crossentropy": 2.6685664653778076, "epoch": 0.5114921693735499, "grad_norm": 0.02826760709285736, "grad_norm_var": 8.325956504203258e-06, "learning_rate": 0.0049761754936022695, "loss": 2.6346, "step": 14109 }, { "crossentropy": 2.7028985023498535, "epoch": 0.5115284222737819, "grad_norm": 0.029772642999887466, "grad_norm_var": 8.380034333687243e-06, "learning_rate": 0.004975594412640325, "loss": 2.6594, "step": 14110 }, { "crossentropy": 2.557737350463867, "epoch": 0.5115646751740139, "grad_norm": 0.02682780660688877, "grad_norm_var": 8.201290770307512e-06, "learning_rate": 0.0049750133320080134, "loss": 2.5441, "step": 14111 }, { "crossentropy": 2.631911039352417, "epoch": 0.511600928074246, "grad_norm": 0.026452552527189255, "grad_norm_var": 1.117130054730991e-06, "learning_rate": 0.004974432251713187, "loss": 2.6393, "step": 14112 }, { "crossentropy": 2.5898892879486084, "epoch": 0.511637180974478, "grad_norm": 0.028115276247262955, "grad_norm_var": 1.0134761758858724e-06, "learning_rate": 0.0049738511717636906, "loss": 2.5305, "step": 14113 }, { "crossentropy": 2.5465450286865234, "epoch": 0.51167343387471, "grad_norm": 0.02764136902987957, "grad_norm_var": 1.0195338117108265e-06, "learning_rate": 0.0049732700921673745, "loss": 2.6258, "step": 14114 }, { "crossentropy": 2.5797007083892822, "epoch": 0.511709686774942, "grad_norm": 0.02695239894092083, "grad_norm_var": 9.729274452469852e-07, "learning_rate": 0.0049726890129320874, "loss": 2.5932, "step": 14115 }, { "crossentropy": 2.4292149543762207, "epoch": 0.511745939675174, "grad_norm": 0.029175108298659325, "grad_norm_var": 1.1640218615724742e-06, "learning_rate": 0.004972107934065675, "loss": 2.5065, "step": 14116 }, { "crossentropy": 2.7625765800476074, "epoch": 0.511782192575406, "grad_norm": 0.02850271202623844, "grad_norm_var": 1.215915720963985e-06, "learning_rate": 0.004971526855575988, "loss": 2.6302, "step": 14117 }, { "crossentropy": 2.559119462966919, "epoch": 0.511818445475638, "grad_norm": 0.026782799512147903, "grad_norm_var": 1.1308216452611425e-06, "learning_rate": 0.004970945777470874, "loss": 2.5086, "step": 14118 }, { "crossentropy": 2.6066505908966064, "epoch": 0.51185469837587, "grad_norm": 0.02668391354382038, "grad_norm_var": 1.0749221990267894e-06, "learning_rate": 0.00497036469975818, "loss": 2.5714, "step": 14119 }, { "crossentropy": 2.690938711166382, "epoch": 0.5118909512761021, "grad_norm": 0.02791055105626583, "grad_norm_var": 1.0696482332218e-06, "learning_rate": 0.004969783622445757, "loss": 2.5835, "step": 14120 }, { "crossentropy": 2.557453155517578, "epoch": 0.5119272041763341, "grad_norm": 0.026708733290433884, "grad_norm_var": 9.378561292870535e-07, "learning_rate": 0.004969202545541451, "loss": 2.6188, "step": 14121 }, { "crossentropy": 2.4523916244506836, "epoch": 0.5119634570765661, "grad_norm": 0.03382746875286102, "grad_norm_var": 3.258642125773915e-06, "learning_rate": 0.004968621469053113, "loss": 2.5417, "step": 14122 }, { "crossentropy": 2.6679136753082275, "epoch": 0.5119997099767981, "grad_norm": 0.027680646628141403, "grad_norm_var": 3.2574039864340906e-06, "learning_rate": 0.0049680403929885895, "loss": 2.604, "step": 14123 }, { "crossentropy": 2.6460773944854736, "epoch": 0.5120359628770301, "grad_norm": 0.0267068799585104, "grad_norm_var": 3.36520296485143e-06, "learning_rate": 0.004967459317355728, "loss": 2.5494, "step": 14124 }, { "crossentropy": 2.6079368591308594, "epoch": 0.5120722157772621, "grad_norm": 0.026043076068162918, "grad_norm_var": 3.5952701867632277e-06, "learning_rate": 0.004966878242162378, "loss": 2.6621, "step": 14125 }, { "crossentropy": 2.6708648204803467, "epoch": 0.5121084686774942, "grad_norm": 0.027275215834379196, "grad_norm_var": 3.3486981877245415e-06, "learning_rate": 0.004966297167416387, "loss": 2.6027, "step": 14126 }, { "crossentropy": 2.575071334838867, "epoch": 0.5121447215777262, "grad_norm": 0.029133453965187073, "grad_norm_var": 3.411157263463156e-06, "learning_rate": 0.004965716093125605, "loss": 2.5464, "step": 14127 }, { "crossentropy": 2.588413953781128, "epoch": 0.5121809744779582, "grad_norm": 0.07981020957231522, "grad_norm_var": 0.00017141268165294374, "learning_rate": 0.004965135019297878, "loss": 2.5653, "step": 14128 }, { "crossentropy": 2.5185070037841797, "epoch": 0.5122172273781903, "grad_norm": 0.028208794072270393, "grad_norm_var": 0.00017137495967251928, "learning_rate": 0.004964553945941056, "loss": 2.6232, "step": 14129 }, { "crossentropy": 2.557150363922119, "epoch": 0.5122534802784223, "grad_norm": 0.02618389017879963, "grad_norm_var": 0.000172197372690389, "learning_rate": 0.004963972873062987, "loss": 2.5837, "step": 14130 }, { "crossentropy": 2.471818685531616, "epoch": 0.5122897331786543, "grad_norm": 0.026145491749048233, "grad_norm_var": 0.00017268420180483826, "learning_rate": 0.004963391800671519, "loss": 2.5053, "step": 14131 }, { "crossentropy": 2.5889391899108887, "epoch": 0.5123259860788864, "grad_norm": 0.0283626951277256, "grad_norm_var": 0.0001729284017659929, "learning_rate": 0.004962810728774502, "loss": 2.5737, "step": 14132 }, { "crossentropy": 2.5506069660186768, "epoch": 0.5123622389791184, "grad_norm": 0.02989097498357296, "grad_norm_var": 0.0001725869912692597, "learning_rate": 0.004962229657379781, "loss": 2.6523, "step": 14133 }, { "crossentropy": 2.530479669570923, "epoch": 0.5123984918793504, "grad_norm": 0.03459741920232773, "grad_norm_var": 0.00017192142307830954, "learning_rate": 0.004961648586495205, "loss": 2.6445, "step": 14134 }, { "crossentropy": 2.497460126876831, "epoch": 0.5124347447795824, "grad_norm": 0.030223222449421883, "grad_norm_var": 0.00017039710229656117, "learning_rate": 0.004961067516128624, "loss": 2.4607, "step": 14135 }, { "crossentropy": 2.5821022987365723, "epoch": 0.5124709976798144, "grad_norm": 0.028582831844687462, "grad_norm_var": 0.00017007722101279252, "learning_rate": 0.004960486446287884, "loss": 2.492, "step": 14136 }, { "crossentropy": 2.5460145473480225, "epoch": 0.5125072505800464, "grad_norm": 0.026774562895298004, "grad_norm_var": 0.00017003248565505968, "learning_rate": 0.004959905376980835, "loss": 2.5235, "step": 14137 }, { "crossentropy": 2.6127734184265137, "epoch": 0.5125435034802784, "grad_norm": 0.029506361111998558, "grad_norm_var": 0.00017005465439719398, "learning_rate": 0.004959324308215326, "loss": 2.5159, "step": 14138 }, { "crossentropy": 2.600044012069702, "epoch": 0.5125797563805105, "grad_norm": 0.02710927277803421, "grad_norm_var": 0.00017037138922239834, "learning_rate": 0.004958743239999202, "loss": 2.5873, "step": 14139 }, { "crossentropy": 2.6638705730438232, "epoch": 0.5126160092807425, "grad_norm": 0.02705959789454937, "grad_norm_var": 0.00017015211950753721, "learning_rate": 0.004958162172340317, "loss": 2.6481, "step": 14140 }, { "crossentropy": 2.58599591255188, "epoch": 0.5126522621809745, "grad_norm": 0.02605312317609787, "grad_norm_var": 0.00017014473969750055, "learning_rate": 0.004957581105246512, "loss": 2.6278, "step": 14141 }, { "crossentropy": 2.4465253353118896, "epoch": 0.5126885150812065, "grad_norm": 0.027240842580795288, "grad_norm_var": 0.00017016443885527695, "learning_rate": 0.004957000038725638, "loss": 2.5148, "step": 14142 }, { "crossentropy": 2.5376975536346436, "epoch": 0.5127247679814385, "grad_norm": 0.02858375385403633, "grad_norm_var": 0.00017036082027803353, "learning_rate": 0.0049564189727855455, "loss": 2.5875, "step": 14143 }, { "crossentropy": 2.6348109245300293, "epoch": 0.5127610208816705, "grad_norm": 0.026673035696148872, "grad_norm_var": 4.705014018115532e-06, "learning_rate": 0.00495583790743408, "loss": 2.595, "step": 14144 }, { "crossentropy": 2.453948974609375, "epoch": 0.5127972737819025, "grad_norm": 0.02671852894127369, "grad_norm_var": 4.8420209582649676e-06, "learning_rate": 0.004955256842679091, "loss": 2.4399, "step": 14145 }, { "crossentropy": 2.6666407585144043, "epoch": 0.5128335266821346, "grad_norm": 0.026417868211865425, "grad_norm_var": 4.7854596421092445e-06, "learning_rate": 0.004954675778528426, "loss": 2.5984, "step": 14146 }, { "crossentropy": 2.7406105995178223, "epoch": 0.5128697795823666, "grad_norm": 0.028784872964024544, "grad_norm_var": 4.52556075484522e-06, "learning_rate": 0.004954094714989933, "loss": 2.6232, "step": 14147 }, { "crossentropy": 2.5440902709960938, "epoch": 0.5129060324825986, "grad_norm": 0.028418073430657387, "grad_norm_var": 4.5263173588802145e-06, "learning_rate": 0.004953513652071463, "loss": 2.5236, "step": 14148 }, { "crossentropy": 2.5756471157073975, "epoch": 0.5129422853828306, "grad_norm": 0.027185853570699692, "grad_norm_var": 4.406100700949734e-06, "learning_rate": 0.00495293258978086, "loss": 2.6357, "step": 14149 }, { "crossentropy": 2.654782295227051, "epoch": 0.5129785382830626, "grad_norm": 0.027437880635261536, "grad_norm_var": 1.4269601796906926e-06, "learning_rate": 0.0049523515281259724, "loss": 2.5016, "step": 14150 }, { "crossentropy": 2.5596959590911865, "epoch": 0.5130147911832946, "grad_norm": 0.0283815935254097, "grad_norm_var": 1.0127523553481061e-06, "learning_rate": 0.004951770467114651, "loss": 2.5941, "step": 14151 }, { "crossentropy": 2.5783803462982178, "epoch": 0.5130510440835266, "grad_norm": 0.026438690721988678, "grad_norm_var": 1.0071025795127848e-06, "learning_rate": 0.004951189406754743, "loss": 2.6224, "step": 14152 }, { "crossentropy": 2.4988698959350586, "epoch": 0.5130872969837587, "grad_norm": 0.02805924601852894, "grad_norm_var": 9.990114071172877e-07, "learning_rate": 0.004950608347054095, "loss": 2.5972, "step": 14153 }, { "crossentropy": 2.5537331104278564, "epoch": 0.5131235498839907, "grad_norm": 0.028311580419540405, "grad_norm_var": 7.692916453527621e-07, "learning_rate": 0.004950027288020557, "loss": 2.6032, "step": 14154 }, { "crossentropy": 2.637845993041992, "epoch": 0.5131598027842227, "grad_norm": 0.028123728930950165, "grad_norm_var": 7.9028219024875e-07, "learning_rate": 0.004949446229661975, "loss": 2.5742, "step": 14155 }, { "crossentropy": 2.710312604904175, "epoch": 0.5131960556844548, "grad_norm": 0.026652712374925613, "grad_norm_var": 8.241430189276829e-07, "learning_rate": 0.0049488651719862, "loss": 2.6084, "step": 14156 }, { "crossentropy": 2.4939794540405273, "epoch": 0.5132323085846868, "grad_norm": 0.027537474408745766, "grad_norm_var": 6.819077834962212e-07, "learning_rate": 0.00494828411500108, "loss": 2.5671, "step": 14157 }, { "crossentropy": 2.749056100845337, "epoch": 0.5132685614849188, "grad_norm": 0.027973618358373642, "grad_norm_var": 6.842499995337771e-07, "learning_rate": 0.004947703058714458, "loss": 2.5964, "step": 14158 }, { "crossentropy": 2.632025957107544, "epoch": 0.5133048143851509, "grad_norm": 0.027998343110084534, "grad_norm_var": 6.293630149867135e-07, "learning_rate": 0.004947122003134188, "loss": 2.6668, "step": 14159 }, { "crossentropy": 2.4665210247039795, "epoch": 0.5133410672853829, "grad_norm": 0.026355359703302383, "grad_norm_var": 6.736446661543063e-07, "learning_rate": 0.004946540948268114, "loss": 2.5105, "step": 14160 }, { "crossentropy": 2.538761615753174, "epoch": 0.5133773201856149, "grad_norm": 0.027293363586068153, "grad_norm_var": 6.305909767087905e-07, "learning_rate": 0.004945959894124087, "loss": 2.4796, "step": 14161 }, { "crossentropy": 2.4857230186462402, "epoch": 0.5134135730858469, "grad_norm": 0.02535155415534973, "grad_norm_var": 8.67683454692245e-07, "learning_rate": 0.0049453788407099524, "loss": 2.4844, "step": 14162 }, { "crossentropy": 2.529261827468872, "epoch": 0.5134498259860789, "grad_norm": 0.02667202427983284, "grad_norm_var": 7.900775298519749e-07, "learning_rate": 0.004944797788033559, "loss": 2.5752, "step": 14163 }, { "crossentropy": 2.46511173248291, "epoch": 0.5134860788863109, "grad_norm": 0.026019474491477013, "grad_norm_var": 8.198884044942345e-07, "learning_rate": 0.004944216736102756, "loss": 2.4766, "step": 14164 }, { "crossentropy": 2.551893949508667, "epoch": 0.5135223317865429, "grad_norm": 0.02777101844549179, "grad_norm_var": 8.372965494583712e-07, "learning_rate": 0.004943635684925394, "loss": 2.6087, "step": 14165 }, { "crossentropy": 2.4875094890594482, "epoch": 0.513558584686775, "grad_norm": 0.02860109694302082, "grad_norm_var": 9.47342136863371e-07, "learning_rate": 0.004943054634509313, "loss": 2.6028, "step": 14166 }, { "crossentropy": 2.5963873863220215, "epoch": 0.513594837587007, "grad_norm": 0.025667527690529823, "grad_norm_var": 1.0330812142256747e-06, "learning_rate": 0.004942473584862367, "loss": 2.5325, "step": 14167 }, { "crossentropy": 2.4703569412231445, "epoch": 0.513631090487239, "grad_norm": 0.02685507945716381, "grad_norm_var": 1.0029456118961263e-06, "learning_rate": 0.004941892535992403, "loss": 2.527, "step": 14168 }, { "crossentropy": 2.59338116645813, "epoch": 0.513667343387471, "grad_norm": 0.029048990458250046, "grad_norm_var": 1.1772051087120816e-06, "learning_rate": 0.004941311487907268, "loss": 2.5901, "step": 14169 }, { "crossentropy": 2.702561855316162, "epoch": 0.513703596287703, "grad_norm": 0.03281456604599953, "grad_norm_var": 3.073139628861389e-06, "learning_rate": 0.004940730440614811, "loss": 2.6773, "step": 14170 }, { "crossentropy": 2.625225305557251, "epoch": 0.513739849187935, "grad_norm": 0.026515662670135498, "grad_norm_var": 3.1108858553114195e-06, "learning_rate": 0.004940149394122877, "loss": 2.5028, "step": 14171 }, { "crossentropy": 2.6613059043884277, "epoch": 0.513776102088167, "grad_norm": 0.028080198913812637, "grad_norm_var": 3.087352322482008e-06, "learning_rate": 0.004939568348439318, "loss": 2.6025, "step": 14172 }, { "crossentropy": 2.4836912155151367, "epoch": 0.5138123549883991, "grad_norm": 0.027748817577958107, "grad_norm_var": 3.090221854776659e-06, "learning_rate": 0.004938987303571981, "loss": 2.5037, "step": 14173 }, { "crossentropy": 2.672346830368042, "epoch": 0.5138486078886311, "grad_norm": 0.027922986075282097, "grad_norm_var": 3.0875081940524474e-06, "learning_rate": 0.004938406259528713, "loss": 2.6119, "step": 14174 }, { "crossentropy": 2.4240763187408447, "epoch": 0.5138848607888631, "grad_norm": 0.026620415970683098, "grad_norm_var": 3.1228408569398842e-06, "learning_rate": 0.0049378252163173614, "loss": 2.4432, "step": 14175 }, { "crossentropy": 2.658439874649048, "epoch": 0.5139211136890951, "grad_norm": 0.027026761323213577, "grad_norm_var": 3.0522493042185623e-06, "learning_rate": 0.004937244173945775, "loss": 2.6271, "step": 14176 }, { "crossentropy": 2.6147594451904297, "epoch": 0.5139573665893271, "grad_norm": 0.027417974546551704, "grad_norm_var": 3.0497766712521297e-06, "learning_rate": 0.0049366631324218, "loss": 2.6141, "step": 14177 }, { "crossentropy": 2.483825206756592, "epoch": 0.5139936194895591, "grad_norm": 0.02820958010852337, "grad_norm_var": 2.738392649903248e-06, "learning_rate": 0.004936082091753287, "loss": 2.5507, "step": 14178 }, { "crossentropy": 2.4682705402374268, "epoch": 0.5140298723897911, "grad_norm": 0.02674965374171734, "grad_norm_var": 2.7282635799984997e-06, "learning_rate": 0.0049355010519480805, "loss": 2.491, "step": 14179 }, { "crossentropy": 2.5475316047668457, "epoch": 0.5140661252900232, "grad_norm": 0.02674778178334236, "grad_norm_var": 2.5990138715343508e-06, "learning_rate": 0.004934920013014031, "loss": 2.5305, "step": 14180 }, { "crossentropy": 2.521911144256592, "epoch": 0.5141023781902552, "grad_norm": 0.02709740400314331, "grad_norm_var": 2.6243525801361666e-06, "learning_rate": 0.004934338974958986, "loss": 2.6094, "step": 14181 }, { "crossentropy": 2.656336545944214, "epoch": 0.5141386310904872, "grad_norm": 0.027415838092565536, "grad_norm_var": 2.5690048202001505e-06, "learning_rate": 0.004933757937790793, "loss": 2.5417, "step": 14182 }, { "crossentropy": 2.421694040298462, "epoch": 0.5141748839907193, "grad_norm": 0.02626986987888813, "grad_norm_var": 2.43477673358457e-06, "learning_rate": 0.004933176901517299, "loss": 2.5316, "step": 14183 }, { "crossentropy": 2.7223963737487793, "epoch": 0.5142111368909513, "grad_norm": 0.026707712560892105, "grad_norm_var": 2.4519272451455396e-06, "learning_rate": 0.004932595866146353, "loss": 2.6782, "step": 14184 }, { "crossentropy": 2.4816091060638428, "epoch": 0.5142473897911833, "grad_norm": 0.02698192372918129, "grad_norm_var": 2.3333011602852016e-06, "learning_rate": 0.004932014831685801, "loss": 2.5671, "step": 14185 }, { "crossentropy": 2.5972065925598145, "epoch": 0.5142836426914154, "grad_norm": 0.027022553607821465, "grad_norm_var": 3.4153347677519605e-07, "learning_rate": 0.004931433798143493, "loss": 2.5521, "step": 14186 }, { "crossentropy": 2.7297492027282715, "epoch": 0.5143198955916474, "grad_norm": 0.027554327622056007, "grad_norm_var": 3.199418419928216e-07, "learning_rate": 0.004930852765527275, "loss": 2.6794, "step": 14187 }, { "crossentropy": 2.41485333442688, "epoch": 0.5143561484918794, "grad_norm": 0.02720884047448635, "grad_norm_var": 2.6784771552610754e-07, "learning_rate": 0.004930271733844995, "loss": 2.4015, "step": 14188 }, { "crossentropy": 2.4466428756713867, "epoch": 0.5143924013921114, "grad_norm": 0.026632219552993774, "grad_norm_var": 2.594345684691814e-07, "learning_rate": 0.004929690703104501, "loss": 2.5205, "step": 14189 }, { "crossentropy": 2.6278035640716553, "epoch": 0.5144286542923434, "grad_norm": 0.027706757187843323, "grad_norm_var": 2.386041271051446e-07, "learning_rate": 0.004929109673313643, "loss": 2.5589, "step": 14190 }, { "crossentropy": 2.6517250537872314, "epoch": 0.5144649071925754, "grad_norm": 0.027487821877002716, "grad_norm_var": 2.3182813753268247e-07, "learning_rate": 0.004928528644480264, "loss": 2.6106, "step": 14191 }, { "crossentropy": 2.5685436725616455, "epoch": 0.5145011600928074, "grad_norm": 0.0260365791618824, "grad_norm_var": 3.0803259898902424e-07, "learning_rate": 0.004927947616612215, "loss": 2.6275, "step": 14192 }, { "crossentropy": 2.6509883403778076, "epoch": 0.5145374129930395, "grad_norm": 0.02665988728404045, "grad_norm_var": 3.0957972655115455e-07, "learning_rate": 0.0049273665897173425, "loss": 2.588, "step": 14193 }, { "crossentropy": 2.6395325660705566, "epoch": 0.5145736658932715, "grad_norm": 0.025964565575122833, "grad_norm_var": 2.716591473505275e-07, "learning_rate": 0.004926785563803495, "loss": 2.6618, "step": 14194 }, { "crossentropy": 2.557082176208496, "epoch": 0.5146099187935035, "grad_norm": 0.02764718420803547, "grad_norm_var": 3.051834228448193e-07, "learning_rate": 0.004926204538878517, "loss": 2.6082, "step": 14195 }, { "crossentropy": 2.4254558086395264, "epoch": 0.5146461716937355, "grad_norm": 0.027546081691980362, "grad_norm_var": 3.238802137387932e-07, "learning_rate": 0.004925623514950262, "loss": 2.5541, "step": 14196 }, { "crossentropy": 2.670792579650879, "epoch": 0.5146824245939675, "grad_norm": 0.027118053287267685, "grad_norm_var": 3.241854389248912e-07, "learning_rate": 0.004925042492026572, "loss": 2.6541, "step": 14197 }, { "crossentropy": 2.473522901535034, "epoch": 0.5147186774941995, "grad_norm": 0.0300817359238863, "grad_norm_var": 9.170683963656441e-07, "learning_rate": 0.004924461470115298, "loss": 2.5164, "step": 14198 }, { "crossentropy": 2.440988063812256, "epoch": 0.5147549303944315, "grad_norm": 0.027908815070986748, "grad_norm_var": 8.895326609296983e-07, "learning_rate": 0.004923880449224287, "loss": 2.5141, "step": 14199 }, { "crossentropy": 2.4889464378356934, "epoch": 0.5147911832946636, "grad_norm": 0.025747276842594147, "grad_norm_var": 1.018750688753546e-06, "learning_rate": 0.004923299429361387, "loss": 2.5706, "step": 14200 }, { "crossentropy": 2.5030856132507324, "epoch": 0.5148274361948956, "grad_norm": 0.026126716285943985, "grad_norm_var": 1.0900742715341226e-06, "learning_rate": 0.004922718410534443, "loss": 2.428, "step": 14201 }, { "crossentropy": 2.3099563121795654, "epoch": 0.5148636890951276, "grad_norm": 0.029158851131796837, "grad_norm_var": 1.3381282065349475e-06, "learning_rate": 0.004922137392751304, "loss": 2.4897, "step": 14202 }, { "crossentropy": 2.673154592514038, "epoch": 0.5148999419953596, "grad_norm": 0.02886875532567501, "grad_norm_var": 1.4930306257131727e-06, "learning_rate": 0.0049215563760198165, "loss": 2.5789, "step": 14203 }, { "crossentropy": 2.6108157634735107, "epoch": 0.5149361948955916, "grad_norm": 0.026799574494361877, "grad_norm_var": 1.5122258418442108e-06, "learning_rate": 0.004920975360347831, "loss": 2.5524, "step": 14204 }, { "crossentropy": 2.543012857437134, "epoch": 0.5149724477958236, "grad_norm": 0.03073847107589245, "grad_norm_var": 2.1768063752945765e-06, "learning_rate": 0.0049203943457431926, "loss": 2.5839, "step": 14205 }, { "crossentropy": 2.604856014251709, "epoch": 0.5150087006960556, "grad_norm": 0.027348361909389496, "grad_norm_var": 2.179724238529172e-06, "learning_rate": 0.00491981333221375, "loss": 2.6269, "step": 14206 }, { "crossentropy": 2.4869871139526367, "epoch": 0.5150449535962877, "grad_norm": 0.025530129671096802, "grad_norm_var": 2.442646749954471e-06, "learning_rate": 0.00491923231976735, "loss": 2.5397, "step": 14207 }, { "crossentropy": 2.4199490547180176, "epoch": 0.5150812064965197, "grad_norm": 0.029293237254023552, "grad_norm_var": 2.489574198186401e-06, "learning_rate": 0.00491865130841184, "loss": 2.4899, "step": 14208 }, { "crossentropy": 2.5161352157592773, "epoch": 0.5151174593967517, "grad_norm": 0.02818535454571247, "grad_norm_var": 2.431879805659537e-06, "learning_rate": 0.004918070298155067, "loss": 2.5578, "step": 14209 }, { "crossentropy": 2.472998857498169, "epoch": 0.5151537122969838, "grad_norm": 0.02743471972644329, "grad_norm_var": 2.2162086944915583e-06, "learning_rate": 0.00491748928900488, "loss": 2.5299, "step": 14210 }, { "crossentropy": 2.604952335357666, "epoch": 0.5151899651972158, "grad_norm": 0.027070427313447, "grad_norm_var": 2.2522754554992933e-06, "learning_rate": 0.004916908280969123, "loss": 2.5838, "step": 14211 }, { "crossentropy": 2.6211984157562256, "epoch": 0.5152262180974478, "grad_norm": 0.02844962291419506, "grad_norm_var": 2.27153070719253e-06, "learning_rate": 0.004916327274055647, "loss": 2.568, "step": 14212 }, { "crossentropy": 2.5928938388824463, "epoch": 0.5152624709976799, "grad_norm": 0.035019103437662125, "grad_norm_var": 5.384980732908949e-06, "learning_rate": 0.004915746268272299, "loss": 2.5307, "step": 14213 }, { "crossentropy": 2.5655322074890137, "epoch": 0.5152987238979119, "grad_norm": 0.029313012957572937, "grad_norm_var": 5.24544982978072e-06, "learning_rate": 0.004915165263626924, "loss": 2.6254, "step": 14214 }, { "crossentropy": 2.382148504257202, "epoch": 0.5153349767981439, "grad_norm": 0.02920185588300228, "grad_norm_var": 5.280431079962321e-06, "learning_rate": 0.004914584260127372, "loss": 2.4674, "step": 14215 }, { "crossentropy": 2.6846108436584473, "epoch": 0.5153712296983759, "grad_norm": 0.02964646741747856, "grad_norm_var": 4.855253202571995e-06, "learning_rate": 0.00491400325778149, "loss": 2.6206, "step": 14216 }, { "crossentropy": 2.6353635787963867, "epoch": 0.5154074825986079, "grad_norm": 0.02871721237897873, "grad_norm_var": 4.407777813921727e-06, "learning_rate": 0.0049134222565971235, "loss": 2.6162, "step": 14217 }, { "crossentropy": 2.5022120475769043, "epoch": 0.5154437354988399, "grad_norm": 0.02777176722884178, "grad_norm_var": 4.461373217076678e-06, "learning_rate": 0.004912841256582121, "loss": 2.5265, "step": 14218 }, { "crossentropy": 2.64341139793396, "epoch": 0.5154799883990719, "grad_norm": 0.02683987095952034, "grad_norm_var": 4.676174942619043e-06, "learning_rate": 0.004912260257744328, "loss": 2.5844, "step": 14219 }, { "crossentropy": 2.5158283710479736, "epoch": 0.515516241299304, "grad_norm": 0.026368966326117516, "grad_norm_var": 4.790270169389524e-06, "learning_rate": 0.004911679260091595, "loss": 2.5553, "step": 14220 }, { "crossentropy": 2.567572593688965, "epoch": 0.515552494199536, "grad_norm": 0.027905603870749474, "grad_norm_var": 4.468256922442413e-06, "learning_rate": 0.0049110982636317675, "loss": 2.6045, "step": 14221 }, { "crossentropy": 2.48537540435791, "epoch": 0.515588747099768, "grad_norm": 0.025655923411250114, "grad_norm_var": 4.8802981681618365e-06, "learning_rate": 0.004910517268372693, "loss": 2.5142, "step": 14222 }, { "crossentropy": 2.6337313652038574, "epoch": 0.515625, "grad_norm": 0.0263235941529274, "grad_norm_var": 4.629231347219237e-06, "learning_rate": 0.004909936274322218, "loss": 2.5933, "step": 14223 }, { "crossentropy": 2.595782995223999, "epoch": 0.515661252900232, "grad_norm": 0.026111003011465073, "grad_norm_var": 4.8512372814168404e-06, "learning_rate": 0.004909355281488192, "loss": 2.6203, "step": 14224 }, { "crossentropy": 2.601649284362793, "epoch": 0.515697505800464, "grad_norm": 0.03045443631708622, "grad_norm_var": 5.191018671291068e-06, "learning_rate": 0.004908774289878458, "loss": 2.6164, "step": 14225 }, { "crossentropy": 2.599320650100708, "epoch": 0.515733758700696, "grad_norm": 0.02974873222410679, "grad_norm_var": 5.268673499202413e-06, "learning_rate": 0.004908193299500867, "loss": 2.5555, "step": 14226 }, { "crossentropy": 2.5334150791168213, "epoch": 0.5157700116009281, "grad_norm": 0.02617526613175869, "grad_norm_var": 5.478920540445629e-06, "learning_rate": 0.004907612310363264, "loss": 2.5232, "step": 14227 }, { "crossentropy": 2.5026516914367676, "epoch": 0.5158062645011601, "grad_norm": 0.026608673855662346, "grad_norm_var": 5.66785698790691e-06, "learning_rate": 0.004907031322473498, "loss": 2.5618, "step": 14228 }, { "crossentropy": 2.530980110168457, "epoch": 0.5158425174013921, "grad_norm": 0.02671036124229431, "grad_norm_var": 2.4739350153947976e-06, "learning_rate": 0.004906450335839414, "loss": 2.5758, "step": 14229 }, { "crossentropy": 2.496239423751831, "epoch": 0.5158787703016241, "grad_norm": 0.02722945809364319, "grad_norm_var": 2.303278008188488e-06, "learning_rate": 0.004905869350468862, "loss": 2.4865, "step": 14230 }, { "crossentropy": 2.7129926681518555, "epoch": 0.5159150232018561, "grad_norm": 0.030342616140842438, "grad_norm_var": 2.6294993516695476e-06, "learning_rate": 0.004905288366369685, "loss": 2.6179, "step": 14231 }, { "crossentropy": 2.5025806427001953, "epoch": 0.5159512761020881, "grad_norm": 0.028069477528333664, "grad_norm_var": 2.367901675090937e-06, "learning_rate": 0.004904707383549735, "loss": 2.5564, "step": 14232 }, { "crossentropy": 2.6686301231384277, "epoch": 0.5159875290023201, "grad_norm": 0.026546040549874306, "grad_norm_var": 2.328845071686234e-06, "learning_rate": 0.004904126402016855, "loss": 2.6214, "step": 14233 }, { "crossentropy": 2.5632989406585693, "epoch": 0.5160237819025522, "grad_norm": 0.027599044144153595, "grad_norm_var": 2.3228126294590664e-06, "learning_rate": 0.004903545421778892, "loss": 2.5029, "step": 14234 }, { "crossentropy": 2.447589635848999, "epoch": 0.5160600348027842, "grad_norm": 0.028937742114067078, "grad_norm_var": 2.4361483887425443e-06, "learning_rate": 0.004902964442843695, "loss": 2.4625, "step": 14235 }, { "crossentropy": 2.5675172805786133, "epoch": 0.5160962877030162, "grad_norm": 0.025416797026991844, "grad_norm_var": 2.642648103077551e-06, "learning_rate": 0.004902383465219112, "loss": 2.5013, "step": 14236 }, { "crossentropy": 2.6041674613952637, "epoch": 0.5161325406032483, "grad_norm": 0.027119562029838562, "grad_norm_var": 2.637672600301954e-06, "learning_rate": 0.004901802488912987, "loss": 2.5563, "step": 14237 }, { "crossentropy": 2.657866954803467, "epoch": 0.5161687935034803, "grad_norm": 0.025863230228424072, "grad_norm_var": 2.591030030947341e-06, "learning_rate": 0.004901221513933169, "loss": 2.5995, "step": 14238 }, { "crossentropy": 2.548300266265869, "epoch": 0.5162050464037123, "grad_norm": 0.026807542890310287, "grad_norm_var": 2.5327589216722046e-06, "learning_rate": 0.0049006405402875035, "loss": 2.556, "step": 14239 }, { "crossentropy": 2.557934045791626, "epoch": 0.5162412993039444, "grad_norm": 0.026083016768097878, "grad_norm_var": 2.5379302738681345e-06, "learning_rate": 0.004900059567983839, "loss": 2.4845, "step": 14240 }, { "crossentropy": 2.5493619441986084, "epoch": 0.5162775522041764, "grad_norm": 0.026164699345827103, "grad_norm_var": 1.987916053442819e-06, "learning_rate": 0.004899478597030024, "loss": 2.5284, "step": 14241 }, { "crossentropy": 2.5655388832092285, "epoch": 0.5163138051044084, "grad_norm": 0.026497632265090942, "grad_norm_var": 1.5497163943215074e-06, "learning_rate": 0.0048988976274339, "loss": 2.5164, "step": 14242 }, { "crossentropy": 2.693781614303589, "epoch": 0.5163500580046404, "grad_norm": 0.02650405280292034, "grad_norm_var": 1.5198488590710402e-06, "learning_rate": 0.004898316659203318, "loss": 2.5913, "step": 14243 }, { "crossentropy": 2.6594033241271973, "epoch": 0.5163863109048724, "grad_norm": 0.02957056649029255, "grad_norm_var": 1.9012673093504364e-06, "learning_rate": 0.004897735692346123, "loss": 2.7542, "step": 14244 }, { "crossentropy": 2.5430400371551514, "epoch": 0.5164225638051044, "grad_norm": 0.02689005434513092, "grad_norm_var": 1.8911620258559575e-06, "learning_rate": 0.004897154726870164, "loss": 2.5126, "step": 14245 }, { "crossentropy": 2.6225671768188477, "epoch": 0.5164588167053364, "grad_norm": 0.02566346898674965, "grad_norm_var": 2.044043298858007e-06, "learning_rate": 0.004896573762783286, "loss": 2.554, "step": 14246 }, { "crossentropy": 2.5061185359954834, "epoch": 0.5164950696055685, "grad_norm": 0.026845304295420647, "grad_norm_var": 1.3102934081331747e-06, "learning_rate": 0.004895992800093335, "loss": 2.557, "step": 14247 }, { "crossentropy": 2.568535804748535, "epoch": 0.5165313225058005, "grad_norm": 0.02733699604868889, "grad_norm_var": 1.2306983231361035e-06, "learning_rate": 0.004895411838808161, "loss": 2.6563, "step": 14248 }, { "crossentropy": 2.53666090965271, "epoch": 0.5165675754060325, "grad_norm": 0.026280617341399193, "grad_norm_var": 1.2464020332905144e-06, "learning_rate": 0.004894830878935609, "loss": 2.4676, "step": 14249 }, { "crossentropy": 2.3544232845306396, "epoch": 0.5166038283062645, "grad_norm": 0.027355894446372986, "grad_norm_var": 1.2257733011744945e-06, "learning_rate": 0.004894249920483524, "loss": 2.464, "step": 14250 }, { "crossentropy": 2.5241785049438477, "epoch": 0.5166400812064965, "grad_norm": 0.027141377329826355, "grad_norm_var": 9.23475638439121e-07, "learning_rate": 0.004893668963459755, "loss": 2.4848, "step": 14251 }, { "crossentropy": 2.543179988861084, "epoch": 0.5166763341067285, "grad_norm": 0.02788909524679184, "grad_norm_var": 8.754754830862885e-07, "learning_rate": 0.004893088007872148, "loss": 2.5564, "step": 14252 }, { "crossentropy": 2.7292895317077637, "epoch": 0.5167125870069605, "grad_norm": 0.03446035459637642, "grad_norm_var": 4.481996218635162e-06, "learning_rate": 0.00489250705372855, "loss": 2.6065, "step": 14253 }, { "crossentropy": 2.719660758972168, "epoch": 0.5167488399071926, "grad_norm": 0.027882840484380722, "grad_norm_var": 4.340705246912953e-06, "learning_rate": 0.004891926101036807, "loss": 2.6342, "step": 14254 }, { "crossentropy": 2.482705593109131, "epoch": 0.5167850928074246, "grad_norm": 0.032166458666324615, "grad_norm_var": 5.66878037476999e-06, "learning_rate": 0.004891345149804764, "loss": 2.5495, "step": 14255 }, { "crossentropy": 2.619999647140503, "epoch": 0.5168213457076566, "grad_norm": 0.028376257047057152, "grad_norm_var": 5.473762092448213e-06, "learning_rate": 0.004890764200040272, "loss": 2.5496, "step": 14256 }, { "crossentropy": 2.4980263710021973, "epoch": 0.5168575986078886, "grad_norm": 0.02740497700870037, "grad_norm_var": 5.276471148036911e-06, "learning_rate": 0.004890183251751175, "loss": 2.5068, "step": 14257 }, { "crossentropy": 2.464816093444824, "epoch": 0.5168938515081206, "grad_norm": 0.02753952145576477, "grad_norm_var": 5.133301141075954e-06, "learning_rate": 0.0048896023049453214, "loss": 2.5456, "step": 14258 }, { "crossentropy": 2.4466605186462402, "epoch": 0.5169301044083526, "grad_norm": 0.027669807896018028, "grad_norm_var": 4.97301148137443e-06, "learning_rate": 0.004889021359630554, "loss": 2.5277, "step": 14259 }, { "crossentropy": 2.49528431892395, "epoch": 0.5169663573085846, "grad_norm": 0.02673032321035862, "grad_norm_var": 4.940972411496501e-06, "learning_rate": 0.004888440415814723, "loss": 2.5703, "step": 14260 }, { "crossentropy": 2.5891807079315186, "epoch": 0.5170026102088167, "grad_norm": 0.028664980083703995, "grad_norm_var": 4.880617010691255e-06, "learning_rate": 0.004887859473505672, "loss": 2.6015, "step": 14261 }, { "crossentropy": 2.5785536766052246, "epoch": 0.5170388631090487, "grad_norm": 0.027244962751865387, "grad_norm_var": 4.5256827983476225e-06, "learning_rate": 0.004887278532711249, "loss": 2.4848, "step": 14262 }, { "crossentropy": 2.5532712936401367, "epoch": 0.5170751160092807, "grad_norm": 0.02750410884618759, "grad_norm_var": 4.434966152579026e-06, "learning_rate": 0.004886697593439301, "loss": 2.6038, "step": 14263 }, { "crossentropy": 2.497230291366577, "epoch": 0.5171113689095128, "grad_norm": 0.02829616144299507, "grad_norm_var": 4.3785121135624425e-06, "learning_rate": 0.0048861166556976735, "loss": 2.4736, "step": 14264 }, { "crossentropy": 2.478095531463623, "epoch": 0.5171476218097448, "grad_norm": 0.03046489506959915, "grad_norm_var": 4.352856304277025e-06, "learning_rate": 0.004885535719494214, "loss": 2.5685, "step": 14265 }, { "crossentropy": 2.4761300086975098, "epoch": 0.5171838747099768, "grad_norm": 0.028755590319633484, "grad_norm_var": 4.252544937095812e-06, "learning_rate": 0.00488495478483677, "loss": 2.4717, "step": 14266 }, { "crossentropy": 2.6140289306640625, "epoch": 0.5172201276102089, "grad_norm": 0.02940031699836254, "grad_norm_var": 4.121006378254146e-06, "learning_rate": 0.004884373851733184, "loss": 2.6142, "step": 14267 }, { "crossentropy": 2.5820488929748535, "epoch": 0.5172563805104409, "grad_norm": 0.02729756571352482, "grad_norm_var": 4.2129971012059075e-06, "learning_rate": 0.004883792920191305, "loss": 2.5562, "step": 14268 }, { "crossentropy": 2.5464913845062256, "epoch": 0.5172926334106729, "grad_norm": 0.026981554925441742, "grad_norm_var": 2.0057817950727233e-06, "learning_rate": 0.004883211990218979, "loss": 2.5581, "step": 14269 }, { "crossentropy": 2.4417426586151123, "epoch": 0.5173288863109049, "grad_norm": 0.028194144368171692, "grad_norm_var": 1.995612288312592e-06, "learning_rate": 0.004882631061824052, "loss": 2.5184, "step": 14270 }, { "crossentropy": 2.449265718460083, "epoch": 0.5173651392111369, "grad_norm": 0.02746761403977871, "grad_norm_var": 9.489298401868142e-07, "learning_rate": 0.00488205013501437, "loss": 2.512, "step": 14271 }, { "crossentropy": 2.5352630615234375, "epoch": 0.5174013921113689, "grad_norm": 0.02818124182522297, "grad_norm_var": 9.415115953642263e-07, "learning_rate": 0.004881469209797781, "loss": 2.5821, "step": 14272 }, { "crossentropy": 2.557504653930664, "epoch": 0.5174376450116009, "grad_norm": 0.02690667100250721, "grad_norm_var": 9.95724917161285e-07, "learning_rate": 0.00488088828618213, "loss": 2.5667, "step": 14273 }, { "crossentropy": 2.5057430267333984, "epoch": 0.517473897911833, "grad_norm": 0.028686629608273506, "grad_norm_var": 1.0142334545011506e-06, "learning_rate": 0.004880307364175265, "loss": 2.4915, "step": 14274 }, { "crossentropy": 2.4792661666870117, "epoch": 0.517510150812065, "grad_norm": 0.026538783684372902, "grad_norm_var": 1.148187469631375e-06, "learning_rate": 0.00487972644378503, "loss": 2.5421, "step": 14275 }, { "crossentropy": 2.7130866050720215, "epoch": 0.517546403712297, "grad_norm": 0.026631774380803108, "grad_norm_var": 1.164915713586003e-06, "learning_rate": 0.00487914552501927, "loss": 2.5104, "step": 14276 }, { "crossentropy": 2.5838892459869385, "epoch": 0.517582656612529, "grad_norm": 0.027063913643360138, "grad_norm_var": 1.1727250755724204e-06, "learning_rate": 0.004878564607885834, "loss": 2.5715, "step": 14277 }, { "crossentropy": 2.4318156242370605, "epoch": 0.517618909512761, "grad_norm": 0.02867424674332142, "grad_norm_var": 1.1849109763402045e-06, "learning_rate": 0.004877983692392568, "loss": 2.5714, "step": 14278 }, { "crossentropy": 2.480534315109253, "epoch": 0.517655162412993, "grad_norm": 0.02875739336013794, "grad_norm_var": 1.2101872474736304e-06, "learning_rate": 0.0048774027785473155, "loss": 2.4863, "step": 14279 }, { "crossentropy": 2.5641958713531494, "epoch": 0.517691415313225, "grad_norm": 0.025845520198345184, "grad_norm_var": 1.4948644227699559e-06, "learning_rate": 0.004876821866357926, "loss": 2.5893, "step": 14280 }, { "crossentropy": 2.4874069690704346, "epoch": 0.5177276682134571, "grad_norm": 0.02648847922682762, "grad_norm_var": 1.1049322336046843e-06, "learning_rate": 0.004876240955832244, "loss": 2.508, "step": 14281 }, { "crossentropy": 2.621934652328491, "epoch": 0.5177639211136891, "grad_norm": 0.02612539567053318, "grad_norm_var": 1.1379949812951688e-06, "learning_rate": 0.004875660046978115, "loss": 2.6374, "step": 14282 }, { "crossentropy": 2.4118080139160156, "epoch": 0.5178001740139211, "grad_norm": 0.026675870642066002, "grad_norm_var": 8.943731965057692e-07, "learning_rate": 0.0048750791398033885, "loss": 2.5297, "step": 14283 }, { "crossentropy": 2.4288220405578613, "epoch": 0.5178364269141531, "grad_norm": 0.027926314622163773, "grad_norm_var": 9.203607993853487e-07, "learning_rate": 0.004874498234315906, "loss": 2.5007, "step": 14284 }, { "crossentropy": 2.633776903152466, "epoch": 0.5178726798143851, "grad_norm": 0.028348075225949287, "grad_norm_var": 9.751153674473005e-07, "learning_rate": 0.004873917330523515, "loss": 2.5968, "step": 14285 }, { "crossentropy": 2.486306667327881, "epoch": 0.5179089327146171, "grad_norm": 0.025890236720442772, "grad_norm_var": 1.065065023423993e-06, "learning_rate": 0.0048733364284340615, "loss": 2.5162, "step": 14286 }, { "crossentropy": 2.566612958908081, "epoch": 0.5179451856148491, "grad_norm": 0.02916724607348442, "grad_norm_var": 1.291978707998696e-06, "learning_rate": 0.0048727555280553905, "loss": 2.5775, "step": 14287 }, { "crossentropy": 2.442723035812378, "epoch": 0.5179814385150812, "grad_norm": 0.0279729962348938, "grad_norm_var": 1.2721429083159165e-06, "learning_rate": 0.004872174629395351, "loss": 2.5447, "step": 14288 }, { "crossentropy": 2.5564582347869873, "epoch": 0.5180176914153132, "grad_norm": 0.026516875252127647, "grad_norm_var": 1.3050035967547645e-06, "learning_rate": 0.004871593732461787, "loss": 2.5344, "step": 14289 }, { "crossentropy": 2.3617959022521973, "epoch": 0.5180539443155452, "grad_norm": 0.02829269878566265, "grad_norm_var": 1.2435443439670711e-06, "learning_rate": 0.004871012837262544, "loss": 2.5003, "step": 14290 }, { "crossentropy": 2.731494188308716, "epoch": 0.5180901972157773, "grad_norm": 0.028783036395907402, "grad_norm_var": 1.3283885762268294e-06, "learning_rate": 0.00487043194380547, "loss": 2.5832, "step": 14291 }, { "crossentropy": 2.408694267272949, "epoch": 0.5181264501160093, "grad_norm": 0.02663029171526432, "grad_norm_var": 1.3285499742931531e-06, "learning_rate": 0.004869851052098409, "loss": 2.4878, "step": 14292 }, { "crossentropy": 2.513051986694336, "epoch": 0.5181627030162413, "grad_norm": 0.027069956064224243, "grad_norm_var": 1.3282432885000725e-06, "learning_rate": 0.004869270162149205, "loss": 2.5478, "step": 14293 }, { "crossentropy": 2.547358989715576, "epoch": 0.5181989559164734, "grad_norm": 0.026205871254205704, "grad_norm_var": 1.3054005551469198e-06, "learning_rate": 0.004868689273965708, "loss": 2.5388, "step": 14294 }, { "crossentropy": 2.530025005340576, "epoch": 0.5182352088167054, "grad_norm": 0.026341494172811508, "grad_norm_var": 1.198642122763466e-06, "learning_rate": 0.004868108387555759, "loss": 2.5657, "step": 14295 }, { "crossentropy": 2.626133918762207, "epoch": 0.5182714617169374, "grad_norm": 0.02702755481004715, "grad_norm_var": 1.0815539556385348e-06, "learning_rate": 0.004867527502927208, "loss": 2.5894, "step": 14296 }, { "crossentropy": 2.5205821990966797, "epoch": 0.5183077146171694, "grad_norm": 0.027075687423348427, "grad_norm_var": 1.046112692917285e-06, "learning_rate": 0.004866946620087899, "loss": 2.511, "step": 14297 }, { "crossentropy": 2.501408815383911, "epoch": 0.5183439675174014, "grad_norm": 0.028105664998292923, "grad_norm_var": 9.934498992706746e-07, "learning_rate": 0.004866365739045677, "loss": 2.5643, "step": 14298 }, { "crossentropy": 2.5424952507019043, "epoch": 0.5183802204176334, "grad_norm": 0.0270316693931818, "grad_norm_var": 9.681068005056835e-07, "learning_rate": 0.00486578485980839, "loss": 2.5751, "step": 14299 }, { "crossentropy": 2.597490072250366, "epoch": 0.5184164733178654, "grad_norm": 0.02780892699956894, "grad_norm_var": 9.607163128678986e-07, "learning_rate": 0.004865203982383882, "loss": 2.5745, "step": 14300 }, { "crossentropy": 2.628230571746826, "epoch": 0.5184527262180975, "grad_norm": 0.027374718338251114, "grad_norm_var": 8.958198072109378e-07, "learning_rate": 0.004864623106779998, "loss": 2.638, "step": 14301 }, { "crossentropy": 2.5735456943511963, "epoch": 0.5184889791183295, "grad_norm": 0.026629526168107986, "grad_norm_var": 7.879669258574473e-07, "learning_rate": 0.004864042233004585, "loss": 2.5787, "step": 14302 }, { "crossentropy": 2.6481125354766846, "epoch": 0.5185252320185615, "grad_norm": 0.028410062193870544, "grad_norm_var": 6.430744649713396e-07, "learning_rate": 0.004863461361065487, "loss": 2.5439, "step": 14303 }, { "crossentropy": 2.575305223464966, "epoch": 0.5185614849187935, "grad_norm": 0.02699635922908783, "grad_norm_var": 6.189341792217987e-07, "learning_rate": 0.004862880490970551, "loss": 2.5635, "step": 14304 }, { "crossentropy": 2.559664726257324, "epoch": 0.5185977378190255, "grad_norm": 0.025724714621901512, "grad_norm_var": 7.375707553833668e-07, "learning_rate": 0.0048622996227276226, "loss": 2.5883, "step": 14305 }, { "crossentropy": 2.717174768447876, "epoch": 0.5186339907192575, "grad_norm": 0.027728183194994926, "grad_norm_var": 6.766920718168437e-07, "learning_rate": 0.004861718756344546, "loss": 2.6005, "step": 14306 }, { "crossentropy": 2.6302688121795654, "epoch": 0.5186702436194895, "grad_norm": 0.02595071867108345, "grad_norm_var": 5.741979889598795e-07, "learning_rate": 0.0048611378918291676, "loss": 2.5997, "step": 14307 }, { "crossentropy": 2.5380356311798096, "epoch": 0.5187064965197216, "grad_norm": 0.027043500915169716, "grad_norm_var": 5.641168455837772e-07, "learning_rate": 0.004860557029189334, "loss": 2.6109, "step": 14308 }, { "crossentropy": 2.64346981048584, "epoch": 0.5187427494199536, "grad_norm": 0.026727527379989624, "grad_norm_var": 5.697484464222177e-07, "learning_rate": 0.004859976168432888, "loss": 2.6659, "step": 14309 }, { "crossentropy": 2.506643533706665, "epoch": 0.5187790023201856, "grad_norm": 0.02578970044851303, "grad_norm_var": 6.2527090596251e-07, "learning_rate": 0.004859395309567677, "loss": 2.4907, "step": 14310 }, { "crossentropy": 2.5972886085510254, "epoch": 0.5188152552204176, "grad_norm": 0.027352921664714813, "grad_norm_var": 6.023755926516516e-07, "learning_rate": 0.004858814452601543, "loss": 2.5921, "step": 14311 }, { "crossentropy": 2.601060390472412, "epoch": 0.5188515081206496, "grad_norm": 0.027867479249835014, "grad_norm_var": 6.441119546815125e-07, "learning_rate": 0.004858233597542338, "loss": 2.5736, "step": 14312 }, { "crossentropy": 2.6293394565582275, "epoch": 0.5188877610208816, "grad_norm": 0.026098642498254776, "grad_norm_var": 7.070841208140263e-07, "learning_rate": 0.0048576527443979015, "loss": 2.5744, "step": 14313 }, { "crossentropy": 2.558490037918091, "epoch": 0.5189240139211136, "grad_norm": 0.0274544395506382, "grad_norm_var": 6.410599915879078e-07, "learning_rate": 0.00485707189317608, "loss": 2.5868, "step": 14314 }, { "crossentropy": 2.609621047973633, "epoch": 0.5189602668213457, "grad_norm": 0.028414415195584297, "grad_norm_var": 7.665235851283253e-07, "learning_rate": 0.004856491043884721, "loss": 2.5839, "step": 14315 }, { "crossentropy": 2.486299753189087, "epoch": 0.5189965197215777, "grad_norm": 0.029622621834278107, "grad_norm_var": 1.1470021037651336e-06, "learning_rate": 0.004855910196531669, "loss": 2.4667, "step": 14316 }, { "crossentropy": 2.599151611328125, "epoch": 0.5190327726218097, "grad_norm": 0.026682576164603233, "grad_norm_var": 1.1607359581744873e-06, "learning_rate": 0.004855329351124767, "loss": 2.5899, "step": 14317 }, { "crossentropy": 2.5369999408721924, "epoch": 0.5190690255220418, "grad_norm": 0.027737712487578392, "grad_norm_var": 1.159724064982145e-06, "learning_rate": 0.004854748507671862, "loss": 2.5263, "step": 14318 }, { "crossentropy": 2.6767499446868896, "epoch": 0.5191052784222738, "grad_norm": 0.026671210303902626, "grad_norm_var": 1.0739692421423789e-06, "learning_rate": 0.004854167666180797, "loss": 2.5673, "step": 14319 }, { "crossentropy": 2.627105951309204, "epoch": 0.5191415313225058, "grad_norm": 0.026487622410058975, "grad_norm_var": 1.0982889898304587e-06, "learning_rate": 0.004853586826659421, "loss": 2.6457, "step": 14320 }, { "crossentropy": 2.5277979373931885, "epoch": 0.5191777842227379, "grad_norm": 0.027076253667473793, "grad_norm_var": 9.673923229036596e-07, "learning_rate": 0.004853005989115576, "loss": 2.583, "step": 14321 }, { "crossentropy": 2.6607754230499268, "epoch": 0.5192140371229699, "grad_norm": 0.02856210060417652, "grad_norm_var": 1.0730203746093952e-06, "learning_rate": 0.004852425153557109, "loss": 2.5602, "step": 14322 }, { "crossentropy": 2.487825632095337, "epoch": 0.5192502900232019, "grad_norm": 0.028150619938969612, "grad_norm_var": 1.0028309700720832e-06, "learning_rate": 0.004851844319991865, "loss": 2.5021, "step": 14323 }, { "crossentropy": 2.5228259563446045, "epoch": 0.5192865429234339, "grad_norm": 0.02829158306121826, "grad_norm_var": 1.0477337042853752e-06, "learning_rate": 0.004851263488427686, "loss": 2.4426, "step": 14324 }, { "crossentropy": 2.5868914127349854, "epoch": 0.5193227958236659, "grad_norm": 0.02614549547433853, "grad_norm_var": 1.1239421860700002e-06, "learning_rate": 0.004850682658872423, "loss": 2.4894, "step": 14325 }, { "crossentropy": 2.5406458377838135, "epoch": 0.5193590487238979, "grad_norm": 0.028378769755363464, "grad_norm_var": 9.868905018665016e-07, "learning_rate": 0.004850101831333915, "loss": 2.591, "step": 14326 }, { "crossentropy": 2.6450467109680176, "epoch": 0.51939530162413, "grad_norm": 0.027592457830905914, "grad_norm_var": 9.837941057177593e-07, "learning_rate": 0.004849521005820008, "loss": 2.6131, "step": 14327 }, { "crossentropy": 2.6891794204711914, "epoch": 0.519431554524362, "grad_norm": 0.026729799807071686, "grad_norm_var": 1.0206447549766651e-06, "learning_rate": 0.00484894018233855, "loss": 2.6333, "step": 14328 }, { "crossentropy": 2.5453104972839355, "epoch": 0.519467807424594, "grad_norm": 0.031751036643981934, "grad_norm_var": 1.9568185542220232e-06, "learning_rate": 0.004848359360897383, "loss": 2.653, "step": 14329 }, { "crossentropy": 2.493819236755371, "epoch": 0.519504060324826, "grad_norm": 0.028819259256124496, "grad_norm_var": 1.9995654565839546e-06, "learning_rate": 0.004847778541504353, "loss": 2.5255, "step": 14330 }, { "crossentropy": 2.7216079235076904, "epoch": 0.519540313225058, "grad_norm": 0.025206448510289192, "grad_norm_var": 2.441800811963388e-06, "learning_rate": 0.004847197724167306, "loss": 2.588, "step": 14331 }, { "crossentropy": 2.535259962081909, "epoch": 0.51957656612529, "grad_norm": 0.027862457558512688, "grad_norm_var": 2.194568878086979e-06, "learning_rate": 0.004846616908894084, "loss": 2.5657, "step": 14332 }, { "crossentropy": 2.4481356143951416, "epoch": 0.519612819025522, "grad_norm": 0.02680891938507557, "grad_norm_var": 2.1795376033230395e-06, "learning_rate": 0.004846036095692536, "loss": 2.5049, "step": 14333 }, { "crossentropy": 2.5673811435699463, "epoch": 0.519649071925754, "grad_norm": 0.028587503358721733, "grad_norm_var": 2.2355181763439313e-06, "learning_rate": 0.004845455284570502, "loss": 2.5122, "step": 14334 }, { "crossentropy": 2.6738646030426025, "epoch": 0.5196853248259861, "grad_norm": 0.027611512690782547, "grad_norm_var": 2.1624104117204477e-06, "learning_rate": 0.004844874475535829, "loss": 2.6971, "step": 14335 }, { "crossentropy": 2.3991456031799316, "epoch": 0.5197215777262181, "grad_norm": 0.027849679812788963, "grad_norm_var": 2.048401089503084e-06, "learning_rate": 0.004844293668596363, "loss": 2.5548, "step": 14336 }, { "crossentropy": 2.5975794792175293, "epoch": 0.5197578306264501, "grad_norm": 0.02809853106737137, "grad_norm_var": 2.00975254489367e-06, "learning_rate": 0.0048437128637599455, "loss": 2.5898, "step": 14337 }, { "crossentropy": 2.573333263397217, "epoch": 0.5197940835266821, "grad_norm": 0.029527729377150536, "grad_norm_var": 2.1529041997705394e-06, "learning_rate": 0.004843132061034424, "loss": 2.6478, "step": 14338 }, { "crossentropy": 2.738424777984619, "epoch": 0.5198303364269141, "grad_norm": 0.02852669544517994, "grad_norm_var": 2.17113973060543e-06, "learning_rate": 0.004842551260427641, "loss": 2.6462, "step": 14339 }, { "crossentropy": 2.4774341583251953, "epoch": 0.5198665893271461, "grad_norm": 0.027332229539752007, "grad_norm_var": 2.1896688561344352e-06, "learning_rate": 0.004841970461947442, "loss": 2.4951, "step": 14340 }, { "crossentropy": 2.6245172023773193, "epoch": 0.5199028422273781, "grad_norm": 0.028569966554641724, "grad_norm_var": 1.9812236453521534e-06, "learning_rate": 0.004841389665601673, "loss": 2.669, "step": 14341 }, { "crossentropy": 2.6513733863830566, "epoch": 0.5199390951276102, "grad_norm": 0.026878520846366882, "grad_norm_var": 2.0617938544677136e-06, "learning_rate": 0.004840808871398179, "loss": 2.6747, "step": 14342 }, { "crossentropy": 2.570441961288452, "epoch": 0.5199753480278422, "grad_norm": 0.02888367883861065, "grad_norm_var": 2.098493961968335e-06, "learning_rate": 0.0048402280793448, "loss": 2.5703, "step": 14343 }, { "crossentropy": 2.4879229068756104, "epoch": 0.5200116009280742, "grad_norm": 0.029534542933106422, "grad_norm_var": 2.090743560735573e-06, "learning_rate": 0.0048396472894493845, "loss": 2.5735, "step": 14344 }, { "crossentropy": 2.5685617923736572, "epoch": 0.5200478538283063, "grad_norm": 0.03355446085333824, "grad_norm_var": 3.1381356126229666e-06, "learning_rate": 0.004839066501719775, "loss": 2.6329, "step": 14345 }, { "crossentropy": 2.56533145904541, "epoch": 0.5200841067285383, "grad_norm": 0.027529384940862656, "grad_norm_var": 3.161977276416178e-06, "learning_rate": 0.004838485716163817, "loss": 2.562, "step": 14346 }, { "crossentropy": 2.64994215965271, "epoch": 0.5201203596287703, "grad_norm": 0.02643316425383091, "grad_norm_var": 2.7545163077536777e-06, "learning_rate": 0.004837904932789353, "loss": 2.6234, "step": 14347 }, { "crossentropy": 2.583878993988037, "epoch": 0.5201566125290024, "grad_norm": 0.02653573453426361, "grad_norm_var": 2.9506510704951307e-06, "learning_rate": 0.0048373241516042305, "loss": 2.5577, "step": 14348 }, { "crossentropy": 2.6038761138916016, "epoch": 0.5201928654292344, "grad_norm": 0.027620859444141388, "grad_norm_var": 2.8340700521108096e-06, "learning_rate": 0.004836743372616292, "loss": 2.6683, "step": 14349 }, { "crossentropy": 2.6842539310455322, "epoch": 0.5202291183294664, "grad_norm": 0.027392320334911346, "grad_norm_var": 2.8802640097413812e-06, "learning_rate": 0.004836162595833382, "loss": 2.6966, "step": 14350 }, { "crossentropy": 2.5415096282958984, "epoch": 0.5202653712296984, "grad_norm": 0.02949722297489643, "grad_norm_var": 2.9438755974377693e-06, "learning_rate": 0.004835581821263346, "loss": 2.5621, "step": 14351 }, { "crossentropy": 2.5972824096679688, "epoch": 0.5203016241299304, "grad_norm": 0.028101587668061256, "grad_norm_var": 2.9306912948426704e-06, "learning_rate": 0.004835001048914025, "loss": 2.6479, "step": 14352 }, { "crossentropy": 2.5710837841033936, "epoch": 0.5203378770301624, "grad_norm": 0.025861507281661034, "grad_norm_var": 3.3262308591077653e-06, "learning_rate": 0.004834420278793266, "loss": 2.5818, "step": 14353 }, { "crossentropy": 2.644307851791382, "epoch": 0.5203741299303944, "grad_norm": 0.027194473892450333, "grad_norm_var": 3.264698088390842e-06, "learning_rate": 0.004833839510908913, "loss": 2.5787, "step": 14354 }, { "crossentropy": 2.6226651668548584, "epoch": 0.5204103828306265, "grad_norm": 0.026887377724051476, "grad_norm_var": 3.3372939912007863e-06, "learning_rate": 0.004833258745268808, "loss": 2.5903, "step": 14355 }, { "crossentropy": 2.504739284515381, "epoch": 0.5204466357308585, "grad_norm": 0.025247512385249138, "grad_norm_var": 3.7911844931107565e-06, "learning_rate": 0.004832677981880797, "loss": 2.4802, "step": 14356 }, { "crossentropy": 2.5194430351257324, "epoch": 0.5204828886310905, "grad_norm": 0.02791518159210682, "grad_norm_var": 3.755791935208909e-06, "learning_rate": 0.004832097220752725, "loss": 2.5806, "step": 14357 }, { "crossentropy": 2.563873291015625, "epoch": 0.5205191415313225, "grad_norm": 0.025623511523008347, "grad_norm_var": 4.011225728104542e-06, "learning_rate": 0.004831516461892434, "loss": 2.5505, "step": 14358 }, { "crossentropy": 2.577569007873535, "epoch": 0.5205553944315545, "grad_norm": 0.027671515941619873, "grad_norm_var": 3.91793849465327e-06, "learning_rate": 0.004830935705307769, "loss": 2.5592, "step": 14359 }, { "crossentropy": 2.662936210632324, "epoch": 0.5205916473317865, "grad_norm": 0.028205931186676025, "grad_norm_var": 3.6966389372821424e-06, "learning_rate": 0.004830354951006575, "loss": 2.6692, "step": 14360 }, { "crossentropy": 2.5460262298583984, "epoch": 0.5206279002320185, "grad_norm": 0.027524089440703392, "grad_norm_var": 1.1652979943709818e-06, "learning_rate": 0.0048297741989966935, "loss": 2.5413, "step": 14361 }, { "crossentropy": 2.7366061210632324, "epoch": 0.5206641531322506, "grad_norm": 0.027418360114097595, "grad_norm_var": 1.1612306944591236e-06, "learning_rate": 0.00482919344928597, "loss": 2.6335, "step": 14362 }, { "crossentropy": 2.537902355194092, "epoch": 0.5207004060324826, "grad_norm": 0.026987630873918533, "grad_norm_var": 1.1240757819159455e-06, "learning_rate": 0.004828612701882247, "loss": 2.5751, "step": 14363 }, { "crossentropy": 2.5278830528259277, "epoch": 0.5207366589327146, "grad_norm": 0.028409341350197792, "grad_norm_var": 1.1699633287310176e-06, "learning_rate": 0.004828031956793371, "loss": 2.6067, "step": 14364 }, { "crossentropy": 2.6477982997894287, "epoch": 0.5207729118329466, "grad_norm": 0.027127452194690704, "grad_norm_var": 1.1671888462835267e-06, "learning_rate": 0.004827451214027184, "loss": 2.625, "step": 14365 }, { "crossentropy": 2.555408239364624, "epoch": 0.5208091647331786, "grad_norm": 0.026322050020098686, "grad_norm_var": 1.2279705541825507e-06, "learning_rate": 0.004826870473591531, "loss": 2.5727, "step": 14366 }, { "crossentropy": 2.6236376762390137, "epoch": 0.5208454176334106, "grad_norm": 0.027616890147328377, "grad_norm_var": 8.85462833498833e-07, "learning_rate": 0.004826289735494255, "loss": 2.6016, "step": 14367 }, { "crossentropy": 2.623373031616211, "epoch": 0.5208816705336426, "grad_norm": 0.027129901573061943, "grad_norm_var": 8.188752612493423e-07, "learning_rate": 0.004825708999743199, "loss": 2.6772, "step": 14368 }, { "crossentropy": 2.5507352352142334, "epoch": 0.5209179234338747, "grad_norm": 0.02903740666806698, "grad_norm_var": 9.36929652011253e-07, "learning_rate": 0.004825128266346208, "loss": 2.5592, "step": 14369 }, { "crossentropy": 2.4936788082122803, "epoch": 0.5209541763341067, "grad_norm": 0.026741541922092438, "grad_norm_var": 9.54307270746218e-07, "learning_rate": 0.004824547535311125, "loss": 2.5051, "step": 14370 }, { "crossentropy": 2.525580883026123, "epoch": 0.5209904292343387, "grad_norm": 0.026824191212654114, "grad_norm_var": 9.575411300231812e-07, "learning_rate": 0.004823966806645792, "loss": 2.5002, "step": 14371 }, { "crossentropy": 2.498366117477417, "epoch": 0.5210266821345708, "grad_norm": 0.029617248103022575, "grad_norm_var": 9.914323596977096e-07, "learning_rate": 0.004823386080358056, "loss": 2.5791, "step": 14372 }, { "crossentropy": 2.7155442237854004, "epoch": 0.5210629350348028, "grad_norm": 0.027860675007104874, "grad_norm_var": 9.886789312933773e-07, "learning_rate": 0.004822805356455759, "loss": 2.6466, "step": 14373 }, { "crossentropy": 2.517003059387207, "epoch": 0.5210991879350348, "grad_norm": 0.027049534022808075, "grad_norm_var": 7.57587420007139e-07, "learning_rate": 0.0048222246349467444, "loss": 2.476, "step": 14374 }, { "crossentropy": 2.7101268768310547, "epoch": 0.5211354408352669, "grad_norm": 0.027097245678305626, "grad_norm_var": 7.724539930899222e-07, "learning_rate": 0.004821643915838857, "loss": 2.6511, "step": 14375 }, { "crossentropy": 2.7452049255371094, "epoch": 0.5211716937354989, "grad_norm": 0.027394920587539673, "grad_norm_var": 7.437791590660103e-07, "learning_rate": 0.0048210631991399385, "loss": 2.6283, "step": 14376 }, { "crossentropy": 2.6743576526641846, "epoch": 0.5212079466357309, "grad_norm": 0.026210499927401543, "grad_norm_var": 8.491396476129065e-07, "learning_rate": 0.004820482484857832, "loss": 2.6081, "step": 14377 }, { "crossentropy": 2.5873866081237793, "epoch": 0.5212441995359629, "grad_norm": 0.02650444023311138, "grad_norm_var": 9.02493731892009e-07, "learning_rate": 0.004819901773000383, "loss": 2.6625, "step": 14378 }, { "crossentropy": 2.6480469703674316, "epoch": 0.5212804524361949, "grad_norm": 0.026500724256038666, "grad_norm_var": 9.421793608001128e-07, "learning_rate": 0.004819321063575433, "loss": 2.5724, "step": 14379 }, { "crossentropy": 2.542074203491211, "epoch": 0.5213167053364269, "grad_norm": 0.027334824204444885, "grad_norm_var": 8.611740194348904e-07, "learning_rate": 0.0048187403565908265, "loss": 2.5391, "step": 14380 }, { "crossentropy": 2.4730188846588135, "epoch": 0.521352958236659, "grad_norm": 0.02629469893872738, "grad_norm_var": 9.206878416433227e-07, "learning_rate": 0.004818159652054406, "loss": 2.5303, "step": 14381 }, { "crossentropy": 2.5260250568389893, "epoch": 0.521389211136891, "grad_norm": 0.026886776089668274, "grad_norm_var": 8.729282677577092e-07, "learning_rate": 0.004817578949974017, "loss": 2.6338, "step": 14382 }, { "crossentropy": 2.497023582458496, "epoch": 0.521425464037123, "grad_norm": 0.028412669897079468, "grad_norm_var": 9.507626356457331e-07, "learning_rate": 0.0048169982503575, "loss": 2.5993, "step": 14383 }, { "crossentropy": 2.5916354656219482, "epoch": 0.521461716937355, "grad_norm": 0.027115702629089355, "grad_norm_var": 9.511087781209626e-07, "learning_rate": 0.0048164175532127, "loss": 2.4626, "step": 14384 }, { "crossentropy": 2.68109393119812, "epoch": 0.521497969837587, "grad_norm": 0.026272829622030258, "grad_norm_var": 7.90277677704014e-07, "learning_rate": 0.00481583685854746, "loss": 2.6361, "step": 14385 }, { "crossentropy": 2.498962640762329, "epoch": 0.521534222737819, "grad_norm": 0.05929547920823097, "grad_norm_var": 6.532864229577491e-05, "learning_rate": 0.004815256166369622, "loss": 2.5449, "step": 14386 }, { "crossentropy": 2.539473295211792, "epoch": 0.521570475638051, "grad_norm": 0.02732020802795887, "grad_norm_var": 6.518907443138344e-05, "learning_rate": 0.004814675476687028, "loss": 2.461, "step": 14387 }, { "crossentropy": 2.5524206161499023, "epoch": 0.521606728538283, "grad_norm": 0.026587925851345062, "grad_norm_var": 6.559329769248525e-05, "learning_rate": 0.004814094789507525, "loss": 2.5851, "step": 14388 }, { "crossentropy": 2.680882215499878, "epoch": 0.5216429814385151, "grad_norm": 0.03226366639137268, "grad_norm_var": 6.613097930726387e-05, "learning_rate": 0.004813514104838954, "loss": 2.6383, "step": 14389 }, { "crossentropy": 2.5586631298065186, "epoch": 0.5216792343387471, "grad_norm": 0.028841810300946236, "grad_norm_var": 6.579780218013693e-05, "learning_rate": 0.004812933422689157, "loss": 2.5727, "step": 14390 }, { "crossentropy": 2.5913753509521484, "epoch": 0.5217154872389791, "grad_norm": 0.028559457510709763, "grad_norm_var": 6.548328160611223e-05, "learning_rate": 0.004812352743065979, "loss": 2.5431, "step": 14391 }, { "crossentropy": 2.5085058212280273, "epoch": 0.5217517401392111, "grad_norm": 0.02886098623275757, "grad_norm_var": 6.520860917477142e-05, "learning_rate": 0.0048117720659772625, "loss": 2.592, "step": 14392 }, { "crossentropy": 2.627089262008667, "epoch": 0.5217879930394431, "grad_norm": 0.0273517407476902, "grad_norm_var": 6.477745415299103e-05, "learning_rate": 0.004811191391430849, "loss": 2.6219, "step": 14393 }, { "crossentropy": 2.562614679336548, "epoch": 0.5218242459396751, "grad_norm": 0.02616974711418152, "grad_norm_var": 6.492483932143815e-05, "learning_rate": 0.004810610719434582, "loss": 2.5547, "step": 14394 }, { "crossentropy": 2.6881675720214844, "epoch": 0.5218604988399071, "grad_norm": 0.028531881049275398, "grad_norm_var": 6.433539790818791e-05, "learning_rate": 0.004810030049996304, "loss": 2.6589, "step": 14395 }, { "crossentropy": 2.537522792816162, "epoch": 0.5218967517401392, "grad_norm": 0.02678261697292328, "grad_norm_var": 6.453274191274879e-05, "learning_rate": 0.004809449383123859, "loss": 2.5343, "step": 14396 }, { "crossentropy": 2.547586679458618, "epoch": 0.5219330046403712, "grad_norm": 0.031462348997592926, "grad_norm_var": 6.384046464364978e-05, "learning_rate": 0.00480886871882509, "loss": 2.4677, "step": 14397 }, { "crossentropy": 2.578364610671997, "epoch": 0.5219692575406032, "grad_norm": 0.027232352644205093, "grad_norm_var": 6.370241946671213e-05, "learning_rate": 0.004808288057107839, "loss": 2.6215, "step": 14398 }, { "crossentropy": 2.583735942840576, "epoch": 0.5220055104408353, "grad_norm": 0.02802242524921894, "grad_norm_var": 6.379798237999227e-05, "learning_rate": 0.004807707397979948, "loss": 2.5935, "step": 14399 }, { "crossentropy": 2.584803819656372, "epoch": 0.5220417633410673, "grad_norm": 0.026296911761164665, "grad_norm_var": 6.415934800557952e-05, "learning_rate": 0.004807126741449261, "loss": 2.6019, "step": 14400 }, { "crossentropy": 2.632920503616333, "epoch": 0.5220780162412993, "grad_norm": 0.026585396379232407, "grad_norm_var": 6.401050667723938e-05, "learning_rate": 0.004806546087523621, "loss": 2.6534, "step": 14401 }, { "crossentropy": 2.674447536468506, "epoch": 0.5221142691415314, "grad_norm": 0.02668854594230652, "grad_norm_var": 3.141320155276026e-06, "learning_rate": 0.004805965436210868, "loss": 2.6128, "step": 14402 }, { "crossentropy": 2.506213665008545, "epoch": 0.5221505220417634, "grad_norm": 0.026771213859319687, "grad_norm_var": 3.207895529749871e-06, "learning_rate": 0.004805384787518846, "loss": 2.4995, "step": 14403 }, { "crossentropy": 2.624131202697754, "epoch": 0.5221867749419954, "grad_norm": 0.028625067323446274, "grad_norm_var": 3.100544076075201e-06, "learning_rate": 0.004804804141455399, "loss": 2.6454, "step": 14404 }, { "crossentropy": 2.5645105838775635, "epoch": 0.5222230278422274, "grad_norm": 0.02909218706190586, "grad_norm_var": 1.953885184237146e-06, "learning_rate": 0.0048042234980283675, "loss": 2.6117, "step": 14405 }, { "crossentropy": 2.615426540374756, "epoch": 0.5222592807424594, "grad_norm": 0.029882887378335, "grad_norm_var": 2.1569156346095865e-06, "learning_rate": 0.004803642857245595, "loss": 2.5901, "step": 14406 }, { "crossentropy": 2.6260886192321777, "epoch": 0.5222955336426914, "grad_norm": 0.028757916763424873, "grad_norm_var": 2.1759743354373673e-06, "learning_rate": 0.0048030622191149236, "loss": 2.6879, "step": 14407 }, { "crossentropy": 2.7665390968322754, "epoch": 0.5223317865429234, "grad_norm": 0.027317365631461143, "grad_norm_var": 2.1362981439597734e-06, "learning_rate": 0.004802481583644195, "loss": 2.672, "step": 14408 }, { "crossentropy": 2.58906888961792, "epoch": 0.5223680394431555, "grad_norm": 0.02664089761674404, "grad_norm_var": 2.2149296941668834e-06, "learning_rate": 0.004801900950841256, "loss": 2.5551, "step": 14409 }, { "crossentropy": 2.5330514907836914, "epoch": 0.5224042923433875, "grad_norm": 0.025965997949242592, "grad_norm_var": 2.2619141270684004e-06, "learning_rate": 0.004801320320713942, "loss": 2.6272, "step": 14410 }, { "crossentropy": 2.694439172744751, "epoch": 0.5224405452436195, "grad_norm": 0.027477648109197617, "grad_norm_var": 2.2272356865885487e-06, "learning_rate": 0.0048007396932700985, "loss": 2.6587, "step": 14411 }, { "crossentropy": 2.6660375595092773, "epoch": 0.5224767981438515, "grad_norm": 0.029684744775295258, "grad_norm_var": 2.388933793955513e-06, "learning_rate": 0.004800159068517568, "loss": 2.5844, "step": 14412 }, { "crossentropy": 2.502988815307617, "epoch": 0.5225130510440835, "grad_norm": 0.02798830345273018, "grad_norm_var": 1.4961522932496172e-06, "learning_rate": 0.004799578446464193, "loss": 2.6069, "step": 14413 }, { "crossentropy": 2.519885540008545, "epoch": 0.5225493039443155, "grad_norm": 0.02590421587228775, "grad_norm_var": 1.6873292295715992e-06, "learning_rate": 0.004798997827117816, "loss": 2.4665, "step": 14414 }, { "crossentropy": 2.5156383514404297, "epoch": 0.5225855568445475, "grad_norm": 0.027440566569566727, "grad_norm_var": 1.676210207122057e-06, "learning_rate": 0.004798417210486278, "loss": 2.5459, "step": 14415 }, { "crossentropy": 2.4387593269348145, "epoch": 0.5226218097447796, "grad_norm": 0.02759587951004505, "grad_norm_var": 1.5611755701045443e-06, "learning_rate": 0.004797836596577419, "loss": 2.516, "step": 14416 }, { "crossentropy": 2.695307731628418, "epoch": 0.5226580626450116, "grad_norm": 0.02619978040456772, "grad_norm_var": 1.6252669115322197e-06, "learning_rate": 0.004797255985399088, "loss": 2.6666, "step": 14417 }, { "crossentropy": 2.561998128890991, "epoch": 0.5226943155452436, "grad_norm": 0.02541978843510151, "grad_norm_var": 1.8846449766166466e-06, "learning_rate": 0.004796675376959121, "loss": 2.5601, "step": 14418 }, { "crossentropy": 2.588214635848999, "epoch": 0.5227305684454756, "grad_norm": 0.026262076571583748, "grad_norm_var": 1.9535633638761607e-06, "learning_rate": 0.00479609477126536, "loss": 2.6303, "step": 14419 }, { "crossentropy": 2.5012013912200928, "epoch": 0.5227668213457076, "grad_norm": 0.02648880146443844, "grad_norm_var": 1.9228766203244553e-06, "learning_rate": 0.004795514168325649, "loss": 2.5472, "step": 14420 }, { "crossentropy": 2.7270562648773193, "epoch": 0.5228030742459396, "grad_norm": 0.026987342163920403, "grad_norm_var": 1.7199415305436024e-06, "learning_rate": 0.00479493356814783, "loss": 2.664, "step": 14421 }, { "crossentropy": 2.5152013301849365, "epoch": 0.5228393271461717, "grad_norm": 0.025398360565304756, "grad_norm_var": 1.4031084298357644e-06, "learning_rate": 0.004794352970739745, "loss": 2.537, "step": 14422 }, { "crossentropy": 2.5016028881073, "epoch": 0.5228755800464037, "grad_norm": 0.02772844396531582, "grad_norm_var": 1.2240150245677756e-06, "learning_rate": 0.004793772376109234, "loss": 2.585, "step": 14423 }, { "crossentropy": 2.535252809524536, "epoch": 0.5229118329466357, "grad_norm": 0.02900204434990883, "grad_norm_var": 1.4937422998425083e-06, "learning_rate": 0.004793191784264139, "loss": 2.525, "step": 14424 }, { "crossentropy": 2.742129325866699, "epoch": 0.5229480858468677, "grad_norm": 0.027712376788258553, "grad_norm_var": 1.5125428651813066e-06, "learning_rate": 0.004792611195212305, "loss": 2.6193, "step": 14425 }, { "crossentropy": 2.6297380924224854, "epoch": 0.5229843387470998, "grad_norm": 0.026516320183873177, "grad_norm_var": 1.4498382921858948e-06, "learning_rate": 0.004792030608961573, "loss": 2.6527, "step": 14426 }, { "crossentropy": 2.6006243228912354, "epoch": 0.5230205916473318, "grad_norm": 0.026273442432284355, "grad_norm_var": 1.4819089587958985e-06, "learning_rate": 0.00479145002551978, "loss": 2.5419, "step": 14427 }, { "crossentropy": 2.584376573562622, "epoch": 0.5230568445475638, "grad_norm": 0.02613167278468609, "grad_norm_var": 1.016889145018226e-06, "learning_rate": 0.004790869444894772, "loss": 2.6246, "step": 14428 }, { "crossentropy": 2.417698383331299, "epoch": 0.5230930974477959, "grad_norm": 0.027584655210375786, "grad_norm_var": 9.639571434282938e-07, "learning_rate": 0.004790288867094389, "loss": 2.5103, "step": 14429 }, { "crossentropy": 2.493865728378296, "epoch": 0.5231293503480279, "grad_norm": 0.026626603677868843, "grad_norm_var": 9.112203969282069e-07, "learning_rate": 0.004789708292126473, "loss": 2.5237, "step": 14430 }, { "crossentropy": 2.6074211597442627, "epoch": 0.5231656032482599, "grad_norm": 0.02622898668050766, "grad_norm_var": 9.052224619234082e-07, "learning_rate": 0.004789127719998867, "loss": 2.5801, "step": 14431 }, { "crossentropy": 2.6347522735595703, "epoch": 0.5232018561484919, "grad_norm": 0.02817627415060997, "grad_norm_var": 9.909779793241629e-07, "learning_rate": 0.004788547150719409, "loss": 2.6065, "step": 14432 }, { "crossentropy": 2.6267569065093994, "epoch": 0.5232381090487239, "grad_norm": 0.02589484304189682, "grad_norm_var": 1.02103340262986e-06, "learning_rate": 0.004787966584295944, "loss": 2.6567, "step": 14433 }, { "crossentropy": 2.7234232425689697, "epoch": 0.5232743619489559, "grad_norm": 0.02538244053721428, "grad_norm_var": 1.0278791250182268e-06, "learning_rate": 0.004787386020736313, "loss": 2.6686, "step": 14434 }, { "crossentropy": 2.622192144393921, "epoch": 0.523310614849188, "grad_norm": 0.02766859158873558, "grad_norm_var": 1.0553929405219981e-06, "learning_rate": 0.004786805460048356, "loss": 2.5289, "step": 14435 }, { "crossentropy": 2.5498204231262207, "epoch": 0.52334686774942, "grad_norm": 0.02687528356909752, "grad_norm_var": 1.0454675635756706e-06, "learning_rate": 0.004786224902239914, "loss": 2.4949, "step": 14436 }, { "crossentropy": 2.5495903491973877, "epoch": 0.523383120649652, "grad_norm": 0.02884616330265999, "grad_norm_var": 1.2863545412948132e-06, "learning_rate": 0.0047856443473188294, "loss": 2.5863, "step": 14437 }, { "crossentropy": 2.5753583908081055, "epoch": 0.523419373549884, "grad_norm": 0.028277436271309853, "grad_norm_var": 1.188473980989526e-06, "learning_rate": 0.0047850637952929435, "loss": 2.5942, "step": 14438 }, { "crossentropy": 2.709913969039917, "epoch": 0.523455626450116, "grad_norm": 0.02722427062690258, "grad_norm_var": 1.1676842888802553e-06, "learning_rate": 0.004784483246170096, "loss": 2.584, "step": 14439 }, { "crossentropy": 2.616454601287842, "epoch": 0.523491879350348, "grad_norm": 0.026236986741423607, "grad_norm_var": 9.632227413962676e-07, "learning_rate": 0.00478390269995813, "loss": 2.5594, "step": 14440 }, { "crossentropy": 2.6113393306732178, "epoch": 0.52352813225058, "grad_norm": 0.02748546190559864, "grad_norm_var": 9.442378710454765e-07, "learning_rate": 0.004783322156664886, "loss": 2.5744, "step": 14441 }, { "crossentropy": 2.5075008869171143, "epoch": 0.523564385150812, "grad_norm": 0.027479439973831177, "grad_norm_var": 9.446800125471426e-07, "learning_rate": 0.004782741616298205, "loss": 2.5734, "step": 14442 }, { "crossentropy": 2.437236785888672, "epoch": 0.5236006380510441, "grad_norm": 0.025327028706669807, "grad_norm_var": 1.095440379147936e-06, "learning_rate": 0.0047821610788659295, "loss": 2.5233, "step": 14443 }, { "crossentropy": 2.586754322052002, "epoch": 0.5236368909512761, "grad_norm": 0.029187943786382675, "grad_norm_var": 1.339500384783795e-06, "learning_rate": 0.004781580544375898, "loss": 2.5558, "step": 14444 }, { "crossentropy": 2.4785618782043457, "epoch": 0.5236731438515081, "grad_norm": 0.026893172413110733, "grad_norm_var": 1.3299005716618332e-06, "learning_rate": 0.004781000012835953, "loss": 2.5503, "step": 14445 }, { "crossentropy": 2.6035261154174805, "epoch": 0.5237093967517401, "grad_norm": 0.027016261592507362, "grad_norm_var": 1.3141102244508077e-06, "learning_rate": 0.004780419484253936, "loss": 2.5541, "step": 14446 }, { "crossentropy": 2.4177186489105225, "epoch": 0.5237456496519721, "grad_norm": 0.02932373620569706, "grad_norm_var": 1.5378044825186161e-06, "learning_rate": 0.004779838958637685, "loss": 2.5071, "step": 14447 }, { "crossentropy": 2.4553022384643555, "epoch": 0.5237819025522041, "grad_norm": 0.02770509198307991, "grad_norm_var": 1.4985739112436385e-06, "learning_rate": 0.004779258435995044, "loss": 2.5412, "step": 14448 }, { "crossentropy": 2.5521011352539062, "epoch": 0.5238181554524362, "grad_norm": 0.029259784147143364, "grad_norm_var": 1.5751373896209323e-06, "learning_rate": 0.004778677916333852, "loss": 2.5546, "step": 14449 }, { "crossentropy": 2.5990779399871826, "epoch": 0.5238544083526682, "grad_norm": 0.03243635594844818, "grad_norm_var": 2.682268272545934e-06, "learning_rate": 0.004778097399661952, "loss": 2.5755, "step": 14450 }, { "crossentropy": 2.42879319190979, "epoch": 0.5238906612529002, "grad_norm": 0.028588321059942245, "grad_norm_var": 2.7002982481834244e-06, "learning_rate": 0.0047775168859871845, "loss": 2.4996, "step": 14451 }, { "crossentropy": 2.524951696395874, "epoch": 0.5239269141531323, "grad_norm": 0.02603605017066002, "grad_norm_var": 2.871309198768969e-06, "learning_rate": 0.004776936375317387, "loss": 2.5528, "step": 14452 }, { "crossentropy": 2.5768325328826904, "epoch": 0.5239631670533643, "grad_norm": 0.029569534584879875, "grad_norm_var": 2.9897033320657227e-06, "learning_rate": 0.004776355867660403, "loss": 2.6466, "step": 14453 }, { "crossentropy": 2.7550508975982666, "epoch": 0.5239994199535963, "grad_norm": 0.028014710173010826, "grad_norm_var": 2.9844013913534468e-06, "learning_rate": 0.004775775363024072, "loss": 2.6704, "step": 14454 }, { "crossentropy": 2.641932964324951, "epoch": 0.5240356728538283, "grad_norm": 0.028473565354943275, "grad_norm_var": 2.954979360229509e-06, "learning_rate": 0.004775194861416234, "loss": 2.6422, "step": 14455 }, { "crossentropy": 2.4574928283691406, "epoch": 0.5240719257540604, "grad_norm": 0.027956856414675713, "grad_norm_var": 2.720752683778753e-06, "learning_rate": 0.004774614362844732, "loss": 2.4319, "step": 14456 }, { "crossentropy": 2.428358316421509, "epoch": 0.5241081786542924, "grad_norm": 0.02663743495941162, "grad_norm_var": 2.84333586947205e-06, "learning_rate": 0.004774033867317404, "loss": 2.4616, "step": 14457 }, { "crossentropy": 2.519862651824951, "epoch": 0.5241444315545244, "grad_norm": 0.026466427370905876, "grad_norm_var": 2.9938682119704514e-06, "learning_rate": 0.004773453374842093, "loss": 2.5596, "step": 14458 }, { "crossentropy": 2.673300266265869, "epoch": 0.5241806844547564, "grad_norm": 0.026791857555508614, "grad_norm_var": 2.5950246227929383e-06, "learning_rate": 0.004772872885426638, "loss": 2.6814, "step": 14459 }, { "crossentropy": 2.6114518642425537, "epoch": 0.5242169373549884, "grad_norm": 0.027584930881857872, "grad_norm_var": 2.53320977246594e-06, "learning_rate": 0.004772292399078879, "loss": 2.6142, "step": 14460 }, { "crossentropy": 2.682387351989746, "epoch": 0.5242531902552204, "grad_norm": 0.02893122099339962, "grad_norm_var": 2.4792359985265164e-06, "learning_rate": 0.004771711915806657, "loss": 2.7104, "step": 14461 }, { "crossentropy": 2.507694721221924, "epoch": 0.5242894431554525, "grad_norm": 0.025798842310905457, "grad_norm_var": 2.759877505419522e-06, "learning_rate": 0.0047711314356178106, "loss": 2.5494, "step": 14462 }, { "crossentropy": 2.482578992843628, "epoch": 0.5243256960556845, "grad_norm": 0.026080915704369545, "grad_norm_var": 2.8873229070931644e-06, "learning_rate": 0.004770550958520181, "loss": 2.5258, "step": 14463 }, { "crossentropy": 2.5057616233825684, "epoch": 0.5243619489559165, "grad_norm": 0.02711763046681881, "grad_norm_var": 2.9238257549103147e-06, "learning_rate": 0.0047699704845216104, "loss": 2.5974, "step": 14464 }, { "crossentropy": 2.607229471206665, "epoch": 0.5243982018561485, "grad_norm": 0.0272220429033041, "grad_norm_var": 2.8027661129836987e-06, "learning_rate": 0.0047693900136299375, "loss": 2.4845, "step": 14465 }, { "crossentropy": 2.2876601219177246, "epoch": 0.5244344547563805, "grad_norm": 0.026366934180259705, "grad_norm_var": 1.297836038688021e-06, "learning_rate": 0.004768809545853001, "loss": 2.4319, "step": 14466 }, { "crossentropy": 2.5658743381500244, "epoch": 0.5244707076566125, "grad_norm": 0.02712417021393776, "grad_norm_var": 1.1905292708155735e-06, "learning_rate": 0.004768229081198645, "loss": 2.5815, "step": 14467 }, { "crossentropy": 2.6556293964385986, "epoch": 0.5245069605568445, "grad_norm": 0.02753644809126854, "grad_norm_var": 1.0862098904708723e-06, "learning_rate": 0.004767648619674706, "loss": 2.5541, "step": 14468 }, { "crossentropy": 2.6401355266571045, "epoch": 0.5245432134570766, "grad_norm": 0.027875108644366264, "grad_norm_var": 7.6524556330026e-07, "learning_rate": 0.004767068161289025, "loss": 2.601, "step": 14469 }, { "crossentropy": 2.4826951026916504, "epoch": 0.5245794663573086, "grad_norm": 0.02795938216149807, "grad_norm_var": 7.597859304643891e-07, "learning_rate": 0.004766487706049441, "loss": 2.5271, "step": 14470 }, { "crossentropy": 2.670974016189575, "epoch": 0.5246157192575406, "grad_norm": 0.02673952654004097, "grad_norm_var": 6.637203772227411e-07, "learning_rate": 0.004765907253963794, "loss": 2.5385, "step": 14471 }, { "crossentropy": 2.6042351722717285, "epoch": 0.5246519721577726, "grad_norm": 0.02678101323544979, "grad_norm_var": 6.215747340346245e-07, "learning_rate": 0.0047653268050399265, "loss": 2.5605, "step": 14472 }, { "crossentropy": 2.5683624744415283, "epoch": 0.5246882250580046, "grad_norm": 0.026278162375092506, "grad_norm_var": 6.500455032824494e-07, "learning_rate": 0.004764746359285677, "loss": 2.4603, "step": 14473 }, { "crossentropy": 2.661684274673462, "epoch": 0.5247244779582366, "grad_norm": 0.027666376903653145, "grad_norm_var": 6.481240421060541e-07, "learning_rate": 0.004764165916708884, "loss": 2.6127, "step": 14474 }, { "crossentropy": 2.528799057006836, "epoch": 0.5247607308584686, "grad_norm": 0.029219307005405426, "grad_norm_var": 9.115231147202006e-07, "learning_rate": 0.004763585477317388, "loss": 2.62, "step": 14475 }, { "crossentropy": 2.6728508472442627, "epoch": 0.5247969837587007, "grad_norm": 0.029499955475330353, "grad_norm_var": 1.2217501720455098e-06, "learning_rate": 0.004763005041119031, "loss": 2.6861, "step": 14476 }, { "crossentropy": 2.5888938903808594, "epoch": 0.5248332366589327, "grad_norm": 0.02881101705133915, "grad_norm_var": 1.1979087527659668e-06, "learning_rate": 0.004762424608121649, "loss": 2.6464, "step": 14477 }, { "crossentropy": 2.5834407806396484, "epoch": 0.5248694895591647, "grad_norm": 0.02645573578774929, "grad_norm_var": 1.0864084465910435e-06, "learning_rate": 0.004761844178333083, "loss": 2.5753, "step": 14478 }, { "crossentropy": 2.6402270793914795, "epoch": 0.5249057424593968, "grad_norm": 0.026549838483333588, "grad_norm_var": 1.016374225315516e-06, "learning_rate": 0.0047612637517611715, "loss": 2.4721, "step": 14479 }, { "crossentropy": 2.584542989730835, "epoch": 0.5249419953596288, "grad_norm": 0.029530426487326622, "grad_norm_var": 1.2732446739308849e-06, "learning_rate": 0.004760683328413757, "loss": 2.5075, "step": 14480 }, { "crossentropy": 2.581045389175415, "epoch": 0.5249782482598608, "grad_norm": 0.027493467554450035, "grad_norm_var": 1.264135947175388e-06, "learning_rate": 0.004760102908298677, "loss": 2.571, "step": 14481 }, { "crossentropy": 2.598592519760132, "epoch": 0.5250145011600929, "grad_norm": 0.02610624022781849, "grad_norm_var": 1.3118671153819482e-06, "learning_rate": 0.004759522491423772, "loss": 2.5217, "step": 14482 }, { "crossentropy": 2.468214750289917, "epoch": 0.5250507540603249, "grad_norm": 0.026710763573646545, "grad_norm_var": 1.3488670187045484e-06, "learning_rate": 0.0047589420777968796, "loss": 2.5336, "step": 14483 }, { "crossentropy": 2.4711098670959473, "epoch": 0.5250870069605569, "grad_norm": 0.02493048645555973, "grad_norm_var": 1.78697938666276e-06, "learning_rate": 0.0047583616674258394, "loss": 2.4598, "step": 14484 }, { "crossentropy": 2.632835865020752, "epoch": 0.5251232598607889, "grad_norm": 0.02820012904703617, "grad_norm_var": 1.813610971708558e-06, "learning_rate": 0.004757781260318495, "loss": 2.6416, "step": 14485 }, { "crossentropy": 2.5769925117492676, "epoch": 0.5251595127610209, "grad_norm": 0.030088912695646286, "grad_norm_var": 2.2464338864426498e-06, "learning_rate": 0.00475720085648268, "loss": 2.6065, "step": 14486 }, { "crossentropy": 2.5585498809814453, "epoch": 0.5251957656612529, "grad_norm": 0.030821187421679497, "grad_norm_var": 2.837714246256663e-06, "learning_rate": 0.004756620455926236, "loss": 2.4724, "step": 14487 }, { "crossentropy": 2.571331739425659, "epoch": 0.5252320185614849, "grad_norm": 0.026579566299915314, "grad_norm_var": 2.8681959543357116e-06, "learning_rate": 0.004756040058657002, "loss": 2.5692, "step": 14488 }, { "crossentropy": 2.3772666454315186, "epoch": 0.525268271461717, "grad_norm": 0.02893189899623394, "grad_norm_var": 2.766735859154451e-06, "learning_rate": 0.004755459664682817, "loss": 2.4597, "step": 14489 }, { "crossentropy": 2.573197603225708, "epoch": 0.525304524361949, "grad_norm": 0.02947063557803631, "grad_norm_var": 2.8960209323869907e-06, "learning_rate": 0.00475487927401152, "loss": 2.5389, "step": 14490 }, { "crossentropy": 2.6008431911468506, "epoch": 0.525340777262181, "grad_norm": 0.027380308136343956, "grad_norm_var": 2.829865209611722e-06, "learning_rate": 0.004754298886650952, "loss": 2.6045, "step": 14491 }, { "crossentropy": 2.6541407108306885, "epoch": 0.525377030162413, "grad_norm": 0.026567408815026283, "grad_norm_var": 2.7701238973570444e-06, "learning_rate": 0.004753718502608948, "loss": 2.583, "step": 14492 }, { "crossentropy": 2.546114444732666, "epoch": 0.525413283062645, "grad_norm": 0.026248469948768616, "grad_norm_var": 2.831429698925631e-06, "learning_rate": 0.004753138121893353, "loss": 2.5156, "step": 14493 }, { "crossentropy": 2.482107400894165, "epoch": 0.525449535962877, "grad_norm": 0.027704725041985512, "grad_norm_var": 2.733526807166759e-06, "learning_rate": 0.004752557744512, "loss": 2.5572, "step": 14494 }, { "crossentropy": 2.5953824520111084, "epoch": 0.525485788863109, "grad_norm": 0.026772771030664444, "grad_norm_var": 2.7022325487980563e-06, "learning_rate": 0.004751977370472729, "loss": 2.6499, "step": 14495 }, { "crossentropy": 2.4718925952911377, "epoch": 0.525522041763341, "grad_norm": 0.02602168172597885, "grad_norm_var": 2.6252202568942856e-06, "learning_rate": 0.004751396999783382, "loss": 2.5344, "step": 14496 }, { "crossentropy": 2.562965154647827, "epoch": 0.5255582946635731, "grad_norm": 0.02880309522151947, "grad_norm_var": 2.7309621684293845e-06, "learning_rate": 0.004750816632451795, "loss": 2.598, "step": 14497 }, { "crossentropy": 2.5537261962890625, "epoch": 0.5255945475638051, "grad_norm": 0.0280519500374794, "grad_norm_var": 2.5842943522193703e-06, "learning_rate": 0.004750236268485807, "loss": 2.4817, "step": 14498 }, { "crossentropy": 2.4265761375427246, "epoch": 0.5256308004640371, "grad_norm": 0.02595284767448902, "grad_norm_var": 2.7206948476454168e-06, "learning_rate": 0.004749655907893258, "loss": 2.5413, "step": 14499 }, { "crossentropy": 2.6329517364501953, "epoch": 0.5256670533642691, "grad_norm": 0.027740858495235443, "grad_norm_var": 2.192333181659974e-06, "learning_rate": 0.0047490755506819846, "loss": 2.5913, "step": 14500 }, { "crossentropy": 2.590211868286133, "epoch": 0.5257033062645011, "grad_norm": 0.026193752884864807, "grad_norm_var": 2.3458577855978705e-06, "learning_rate": 0.00474849519685983, "loss": 2.5661, "step": 14501 }, { "crossentropy": 2.722520351409912, "epoch": 0.5257395591647331, "grad_norm": 0.027810655534267426, "grad_norm_var": 1.94705637984667e-06, "learning_rate": 0.004747914846434627, "loss": 2.6478, "step": 14502 }, { "crossentropy": 2.54872727394104, "epoch": 0.5257758120649652, "grad_norm": 0.027217064052820206, "grad_norm_var": 1.1945075923347944e-06, "learning_rate": 0.004747334499414216, "loss": 2.5808, "step": 14503 }, { "crossentropy": 2.5016863346099854, "epoch": 0.5258120649651972, "grad_norm": 0.026411833241581917, "grad_norm_var": 1.213283388527816e-06, "learning_rate": 0.004746754155806437, "loss": 2.5347, "step": 14504 }, { "crossentropy": 2.6973555088043213, "epoch": 0.5258483178654292, "grad_norm": 0.028032781556248665, "grad_norm_var": 1.0717694460289264e-06, "learning_rate": 0.004746173815619127, "loss": 2.5062, "step": 14505 }, { "crossentropy": 2.5721395015716553, "epoch": 0.5258845707656613, "grad_norm": 0.02657240815460682, "grad_norm_var": 7.478291424557372e-07, "learning_rate": 0.004745593478860125, "loss": 2.5907, "step": 14506 }, { "crossentropy": 2.582252025604248, "epoch": 0.5259208236658933, "grad_norm": 0.02748940698802471, "grad_norm_var": 7.527572830669556e-07, "learning_rate": 0.004745013145537269, "loss": 2.6179, "step": 14507 }, { "crossentropy": 2.683040142059326, "epoch": 0.5259570765661253, "grad_norm": 0.02657368779182434, "grad_norm_var": 7.523142972228984e-07, "learning_rate": 0.004744432815658397, "loss": 2.6189, "step": 14508 }, { "crossentropy": 2.6164774894714355, "epoch": 0.5259933294663574, "grad_norm": 0.028167519718408585, "grad_norm_var": 7.646346458754667e-07, "learning_rate": 0.004743852489231349, "loss": 2.5814, "step": 14509 }, { "crossentropy": 2.608825206756592, "epoch": 0.5260295823665894, "grad_norm": 0.02674582228064537, "grad_norm_var": 7.601055019216488e-07, "learning_rate": 0.004743272166263962, "loss": 2.5611, "step": 14510 }, { "crossentropy": 2.620734214782715, "epoch": 0.5260658352668214, "grad_norm": 0.027126844972372055, "grad_norm_var": 7.496654979835625e-07, "learning_rate": 0.004742691846764073, "loss": 2.5898, "step": 14511 }, { "crossentropy": 2.494187593460083, "epoch": 0.5261020881670534, "grad_norm": 0.027270454913377762, "grad_norm_var": 6.53931383864228e-07, "learning_rate": 0.004742111530739521, "loss": 2.5381, "step": 14512 }, { "crossentropy": 2.4646103382110596, "epoch": 0.5261383410672854, "grad_norm": 0.027586745098233223, "grad_norm_var": 4.961513710566648e-07, "learning_rate": 0.004741531218198144, "loss": 2.5024, "step": 14513 }, { "crossentropy": 2.7892565727233887, "epoch": 0.5261745939675174, "grad_norm": 0.02752740867435932, "grad_norm_var": 4.526471912342459e-07, "learning_rate": 0.004740950909147781, "loss": 2.7478, "step": 14514 }, { "crossentropy": 2.6010351181030273, "epoch": 0.5262108468677494, "grad_norm": 0.02934996783733368, "grad_norm_var": 6.311056130122704e-07, "learning_rate": 0.004740370603596268, "loss": 2.5565, "step": 14515 }, { "crossentropy": 2.5869052410125732, "epoch": 0.5262470997679815, "grad_norm": 0.02780243754386902, "grad_norm_var": 6.344403066168546e-07, "learning_rate": 0.004739790301551443, "loss": 2.5424, "step": 14516 }, { "crossentropy": 2.5499157905578613, "epoch": 0.5262833526682135, "grad_norm": 0.026247944682836533, "grad_norm_var": 6.261434038872434e-07, "learning_rate": 0.004739210003021147, "loss": 2.5823, "step": 14517 }, { "crossentropy": 2.5708086490631104, "epoch": 0.5263196055684455, "grad_norm": 0.026297729462385178, "grad_norm_var": 6.804756053796837e-07, "learning_rate": 0.004738629708013218, "loss": 2.5732, "step": 14518 }, { "crossentropy": 2.555225133895874, "epoch": 0.5263558584686775, "grad_norm": 0.026111600920557976, "grad_norm_var": 7.65577893911901e-07, "learning_rate": 0.004738049416535487, "loss": 2.5413, "step": 14519 }, { "crossentropy": 2.460726499557495, "epoch": 0.5263921113689095, "grad_norm": 0.026092465966939926, "grad_norm_var": 8.058195452097239e-07, "learning_rate": 0.0047374691285957974, "loss": 2.4188, "step": 14520 }, { "crossentropy": 2.596306324005127, "epoch": 0.5264283642691415, "grad_norm": 0.027403561398386955, "grad_norm_var": 7.596236318879156e-07, "learning_rate": 0.004736888844201987, "loss": 2.5751, "step": 14521 }, { "crossentropy": 2.4968788623809814, "epoch": 0.5264646171693735, "grad_norm": 0.02899751253426075, "grad_norm_var": 9.411185834473067e-07, "learning_rate": 0.00473630856336189, "loss": 2.5602, "step": 14522 }, { "crossentropy": 2.6648919582366943, "epoch": 0.5265008700696056, "grad_norm": 0.0304899662733078, "grad_norm_var": 1.5798275263618205e-06, "learning_rate": 0.004735728286083347, "loss": 2.6726, "step": 14523 }, { "crossentropy": 2.464127540588379, "epoch": 0.5265371229698376, "grad_norm": 0.028691044077277184, "grad_norm_var": 1.6021922210691813e-06, "learning_rate": 0.0047351480123741935, "loss": 2.5183, "step": 14524 }, { "crossentropy": 2.4464468955993652, "epoch": 0.5265733758700696, "grad_norm": 0.027121147140860558, "grad_norm_var": 1.5941395573775778e-06, "learning_rate": 0.00473456774224227, "loss": 2.4868, "step": 14525 }, { "crossentropy": 2.544175148010254, "epoch": 0.5266096287703016, "grad_norm": 0.027553919702768326, "grad_norm_var": 1.5478842857159405e-06, "learning_rate": 0.00473398747569541, "loss": 2.549, "step": 14526 }, { "crossentropy": 2.6131908893585205, "epoch": 0.5266458816705336, "grad_norm": 0.026691826060414314, "grad_norm_var": 1.5874125455306656e-06, "learning_rate": 0.004733407212741455, "loss": 2.6077, "step": 14527 }, { "crossentropy": 2.532050132751465, "epoch": 0.5266821345707656, "grad_norm": 0.028003189712762833, "grad_norm_var": 1.590997189941136e-06, "learning_rate": 0.004732826953388239, "loss": 2.5644, "step": 14528 }, { "crossentropy": 2.645256280899048, "epoch": 0.5267183874709976, "grad_norm": 0.026054929941892624, "grad_norm_var": 1.7450615313370585e-06, "learning_rate": 0.004732246697643601, "loss": 2.5932, "step": 14529 }, { "crossentropy": 2.650296211242676, "epoch": 0.5267546403712297, "grad_norm": 0.02817799709737301, "grad_norm_var": 1.7715258420884407e-06, "learning_rate": 0.004731666445515377, "loss": 2.5557, "step": 14530 }, { "crossentropy": 2.5570180416107178, "epoch": 0.5267908932714617, "grad_norm": 0.02921089343726635, "grad_norm_var": 1.7396903361848829e-06, "learning_rate": 0.004731086197011405, "loss": 2.4815, "step": 14531 }, { "crossentropy": 2.588761329650879, "epoch": 0.5268271461716937, "grad_norm": 0.02741321362555027, "grad_norm_var": 1.7365387402932017e-06, "learning_rate": 0.00473050595213952, "loss": 2.5826, "step": 14532 }, { "crossentropy": 2.4655888080596924, "epoch": 0.5268633990719258, "grad_norm": 0.026120904833078384, "grad_norm_var": 1.7593472906926212e-06, "learning_rate": 0.004729925710907564, "loss": 2.5168, "step": 14533 }, { "crossentropy": 2.402806520462036, "epoch": 0.5268996519721578, "grad_norm": 0.026542900130152702, "grad_norm_var": 1.722920137748179e-06, "learning_rate": 0.0047293454733233695, "loss": 2.511, "step": 14534 }, { "crossentropy": 2.566781520843506, "epoch": 0.5269359048723898, "grad_norm": 0.026486650109291077, "grad_norm_var": 1.6601663161354736e-06, "learning_rate": 0.004728765239394777, "loss": 2.5319, "step": 14535 }, { "crossentropy": 2.50854229927063, "epoch": 0.5269721577726219, "grad_norm": 0.026589535176753998, "grad_norm_var": 1.577964955715956e-06, "learning_rate": 0.004728185009129621, "loss": 2.5447, "step": 14536 }, { "crossentropy": 2.407296895980835, "epoch": 0.5270084106728539, "grad_norm": 0.024386031553149223, "grad_norm_var": 2.2248147917695483e-06, "learning_rate": 0.004727604782535738, "loss": 2.444, "step": 14537 }, { "crossentropy": 2.7169995307922363, "epoch": 0.5270446635730859, "grad_norm": 0.025938421487808228, "grad_norm_var": 2.161457031894104e-06, "learning_rate": 0.004727024559620966, "loss": 2.5973, "step": 14538 }, { "crossentropy": 2.5566015243530273, "epoch": 0.5270809164733179, "grad_norm": 0.026045532897114754, "grad_norm_var": 1.4565091692563067e-06, "learning_rate": 0.004726444340393141, "loss": 2.5904, "step": 14539 }, { "crossentropy": 2.630811929702759, "epoch": 0.5271171693735499, "grad_norm": 0.026718491688370705, "grad_norm_var": 1.238962524984866e-06, "learning_rate": 0.004725864124860101, "loss": 2.5226, "step": 14540 }, { "crossentropy": 2.525038957595825, "epoch": 0.5271534222737819, "grad_norm": 0.02746720239520073, "grad_norm_var": 1.260528065728274e-06, "learning_rate": 0.004725283913029682, "loss": 2.53, "step": 14541 }, { "crossentropy": 2.5209884643554688, "epoch": 0.5271896751740139, "grad_norm": 0.026595305651426315, "grad_norm_var": 1.226405639297099e-06, "learning_rate": 0.00472470370490972, "loss": 2.5645, "step": 14542 }, { "crossentropy": 2.341280937194824, "epoch": 0.527225928074246, "grad_norm": 0.026569057255983353, "grad_norm_var": 1.2287531577367342e-06, "learning_rate": 0.0047241235005080545, "loss": 2.395, "step": 14543 }, { "crossentropy": 2.549659252166748, "epoch": 0.527262180974478, "grad_norm": 0.028568627312779427, "grad_norm_var": 1.3417066709662042e-06, "learning_rate": 0.0047235432998325175, "loss": 2.4981, "step": 14544 }, { "crossentropy": 2.6113693714141846, "epoch": 0.52729843387471, "grad_norm": 0.0276446882635355, "grad_norm_var": 1.3405987669267884e-06, "learning_rate": 0.004722963102890949, "loss": 2.5593, "step": 14545 }, { "crossentropy": 2.5134594440460205, "epoch": 0.527334686774942, "grad_norm": 0.026396282017230988, "grad_norm_var": 1.236522292634839e-06, "learning_rate": 0.004722382909691183, "loss": 2.4553, "step": 14546 }, { "crossentropy": 2.5206527709960938, "epoch": 0.527370939675174, "grad_norm": 0.029817447066307068, "grad_norm_var": 1.4550317685218014e-06, "learning_rate": 0.0047218027202410575, "loss": 2.5518, "step": 14547 }, { "crossentropy": 2.6870150566101074, "epoch": 0.527407192575406, "grad_norm": 0.02734033390879631, "grad_norm_var": 1.449708798797559e-06, "learning_rate": 0.004721222534548409, "loss": 2.7027, "step": 14548 }, { "crossentropy": 2.653542995452881, "epoch": 0.527443445475638, "grad_norm": 0.026534304022789, "grad_norm_var": 1.4214858966113756e-06, "learning_rate": 0.004720642352621073, "loss": 2.5503, "step": 14549 }, { "crossentropy": 2.6145875453948975, "epoch": 0.52747969837587, "grad_norm": 0.027981963008642197, "grad_norm_var": 1.4915030540684025e-06, "learning_rate": 0.004720062174466885, "loss": 2.5427, "step": 14550 }, { "crossentropy": 2.7366998195648193, "epoch": 0.5275159512761021, "grad_norm": 0.02842695824801922, "grad_norm_var": 1.6088729235588333e-06, "learning_rate": 0.004719482000093683, "loss": 2.6467, "step": 14551 }, { "crossentropy": 2.4305787086486816, "epoch": 0.5275522041763341, "grad_norm": 0.02780606783926487, "grad_norm_var": 1.624448351849568e-06, "learning_rate": 0.004718901829509304, "loss": 2.4309, "step": 14552 }, { "crossentropy": 2.5353429317474365, "epoch": 0.5275884570765661, "grad_norm": 0.026163412258028984, "grad_norm_var": 1.1692929317203554e-06, "learning_rate": 0.004718321662721581, "loss": 2.5796, "step": 14553 }, { "crossentropy": 2.5395665168762207, "epoch": 0.5276247099767981, "grad_norm": 0.027516674250364304, "grad_norm_var": 1.0487873386060085e-06, "learning_rate": 0.0047177414997383515, "loss": 2.538, "step": 14554 }, { "crossentropy": 2.5853888988494873, "epoch": 0.5276609628770301, "grad_norm": 0.0263878982514143, "grad_norm_var": 9.96587801523049e-07, "learning_rate": 0.004717161340567449, "loss": 2.587, "step": 14555 }, { "crossentropy": 2.732011318206787, "epoch": 0.5276972157772621, "grad_norm": 0.02791369892656803, "grad_norm_var": 9.818987828741673e-07, "learning_rate": 0.0047165811852167136, "loss": 2.6919, "step": 14556 }, { "crossentropy": 2.5776920318603516, "epoch": 0.5277334686774942, "grad_norm": 0.02688039094209671, "grad_norm_var": 1.0017318767164036e-06, "learning_rate": 0.00471600103369398, "loss": 2.5512, "step": 14557 }, { "crossentropy": 2.360766887664795, "epoch": 0.5277697215777262, "grad_norm": 0.026681149378418922, "grad_norm_var": 9.928796781359545e-07, "learning_rate": 0.004715420886007083, "loss": 2.4315, "step": 14558 }, { "crossentropy": 2.592611312866211, "epoch": 0.5278059744779582, "grad_norm": 0.02680378220975399, "grad_norm_var": 9.698695915022337e-07, "learning_rate": 0.0047148407421638595, "loss": 2.5783, "step": 14559 }, { "crossentropy": 2.5941805839538574, "epoch": 0.5278422273781903, "grad_norm": 0.0272427536547184, "grad_norm_var": 8.782704312991033e-07, "learning_rate": 0.004714260602172144, "loss": 2.5502, "step": 14560 }, { "crossentropy": 2.6652164459228516, "epoch": 0.5278784802784223, "grad_norm": 0.027208548039197922, "grad_norm_var": 8.72796303454321e-07, "learning_rate": 0.004713680466039773, "loss": 2.573, "step": 14561 }, { "crossentropy": 2.4508018493652344, "epoch": 0.5279147331786543, "grad_norm": 0.027937239035964012, "grad_norm_var": 8.316530835744684e-07, "learning_rate": 0.004713100333774581, "loss": 2.5295, "step": 14562 }, { "crossentropy": 2.55376935005188, "epoch": 0.5279509860788864, "grad_norm": 0.028944360092282295, "grad_norm_var": 5.99642008972329e-07, "learning_rate": 0.004712520205384404, "loss": 2.5549, "step": 14563 }, { "crossentropy": 2.7229719161987305, "epoch": 0.5279872389791184, "grad_norm": 0.029557067900896072, "grad_norm_var": 9.007726386663171e-07, "learning_rate": 0.004711940080877079, "loss": 2.6521, "step": 14564 }, { "crossentropy": 2.596558094024658, "epoch": 0.5280234918793504, "grad_norm": 0.026682207360863686, "grad_norm_var": 8.831128215253461e-07, "learning_rate": 0.00471135996026044, "loss": 2.5887, "step": 14565 }, { "crossentropy": 2.553826093673706, "epoch": 0.5280597447795824, "grad_norm": 0.026622449979186058, "grad_norm_var": 9.127854507064477e-07, "learning_rate": 0.004710779843542323, "loss": 2.5888, "step": 14566 }, { "crossentropy": 2.5657236576080322, "epoch": 0.5280959976798144, "grad_norm": 0.02638143114745617, "grad_norm_var": 9.005937623501273e-07, "learning_rate": 0.004710199730730563, "loss": 2.6336, "step": 14567 }, { "crossentropy": 2.4977879524230957, "epoch": 0.5281322505800464, "grad_norm": 0.02784748561680317, "grad_norm_var": 9.035201311054318e-07, "learning_rate": 0.004709619621832994, "loss": 2.5862, "step": 14568 }, { "crossentropy": 2.5894110202789307, "epoch": 0.5281685034802784, "grad_norm": 0.02758392132818699, "grad_norm_var": 8.147130423063424e-07, "learning_rate": 0.004709039516857456, "loss": 2.6347, "step": 14569 }, { "crossentropy": 2.5848724842071533, "epoch": 0.5282047563805105, "grad_norm": 0.02689395286142826, "grad_norm_var": 8.28177734478001e-07, "learning_rate": 0.004708459415811779, "loss": 2.5841, "step": 14570 }, { "crossentropy": 2.7341232299804688, "epoch": 0.5282410092807425, "grad_norm": 0.029305238276720047, "grad_norm_var": 9.86639998479125e-07, "learning_rate": 0.0047078793187037985, "loss": 2.6798, "step": 14571 }, { "crossentropy": 2.6077771186828613, "epoch": 0.5282772621809745, "grad_norm": 0.02927803248167038, "grad_norm_var": 1.1727124444771767e-06, "learning_rate": 0.004707299225541353, "loss": 2.5741, "step": 14572 }, { "crossentropy": 2.5987746715545654, "epoch": 0.5283135150812065, "grad_norm": 0.027067571878433228, "grad_norm_var": 1.1565526490836879e-06, "learning_rate": 0.004706719136332275, "loss": 2.6491, "step": 14573 }, { "crossentropy": 2.607123374938965, "epoch": 0.5283497679814385, "grad_norm": 0.02711334079504013, "grad_norm_var": 1.1137031551122864e-06, "learning_rate": 0.0047061390510844, "loss": 2.5712, "step": 14574 }, { "crossentropy": 2.3856730461120605, "epoch": 0.5283860208816705, "grad_norm": 0.026486413553357124, "grad_norm_var": 1.1559902335403118e-06, "learning_rate": 0.004705558969805564, "loss": 2.4959, "step": 14575 }, { "crossentropy": 2.5090506076812744, "epoch": 0.5284222737819025, "grad_norm": 0.026684802025556564, "grad_norm_var": 1.204590576467596e-06, "learning_rate": 0.004704978892503599, "loss": 2.6022, "step": 14576 }, { "crossentropy": 2.656399726867676, "epoch": 0.5284585266821346, "grad_norm": 0.02676333487033844, "grad_norm_var": 1.2401942458011799e-06, "learning_rate": 0.004704398819186345, "loss": 2.6315, "step": 14577 }, { "crossentropy": 2.547471523284912, "epoch": 0.5284947795823666, "grad_norm": 0.02742954157292843, "grad_norm_var": 1.2315665857377204e-06, "learning_rate": 0.004703818749861632, "loss": 2.6124, "step": 14578 }, { "crossentropy": 2.829483985900879, "epoch": 0.5285310324825986, "grad_norm": 0.026409227401018143, "grad_norm_var": 1.1585734703208871e-06, "learning_rate": 0.004703238684537294, "loss": 2.6497, "step": 14579 }, { "crossentropy": 2.53429913520813, "epoch": 0.5285672853828306, "grad_norm": 0.027257701382040977, "grad_norm_var": 8.220646585948745e-07, "learning_rate": 0.00470265862322117, "loss": 2.5287, "step": 14580 }, { "crossentropy": 2.609710454940796, "epoch": 0.5286035382830626, "grad_norm": 0.025655122473835945, "grad_norm_var": 9.640974152060374e-07, "learning_rate": 0.004702078565921093, "loss": 2.5746, "step": 14581 }, { "crossentropy": 2.5957155227661133, "epoch": 0.5286397911832946, "grad_norm": 0.026073157787322998, "grad_norm_var": 1.02332969520146e-06, "learning_rate": 0.004701498512644896, "loss": 2.5601, "step": 14582 }, { "crossentropy": 2.685743570327759, "epoch": 0.5286760440835266, "grad_norm": 0.027166998013854027, "grad_norm_var": 9.825088547132933e-07, "learning_rate": 0.0047009184634004145, "loss": 2.6756, "step": 14583 }, { "crossentropy": 2.5541107654571533, "epoch": 0.5287122969837587, "grad_norm": 0.02794252522289753, "grad_norm_var": 9.914241436426163e-07, "learning_rate": 0.004700338418195483, "loss": 2.5487, "step": 14584 }, { "crossentropy": 2.6644158363342285, "epoch": 0.5287485498839907, "grad_norm": 0.027278823778033257, "grad_norm_var": 9.813975507255134e-07, "learning_rate": 0.004699758377037939, "loss": 2.6488, "step": 14585 }, { "crossentropy": 2.5806565284729004, "epoch": 0.5287848027842227, "grad_norm": 0.02618994377553463, "grad_norm_var": 1.0387895842733918e-06, "learning_rate": 0.004699178339935611, "loss": 2.5416, "step": 14586 }, { "crossentropy": 2.4340107440948486, "epoch": 0.5288210556844548, "grad_norm": 0.026178274303674698, "grad_norm_var": 7.4355708946372e-07, "learning_rate": 0.004698598306896335, "loss": 2.4918, "step": 14587 }, { "crossentropy": 2.6898913383483887, "epoch": 0.5288573085846868, "grad_norm": 0.028874434530735016, "grad_norm_var": 6.27701863416726e-07, "learning_rate": 0.004698018277927947, "loss": 2.6451, "step": 14588 }, { "crossentropy": 2.6523118019104004, "epoch": 0.5288935614849188, "grad_norm": 0.02628849819302559, "grad_norm_var": 6.493413878103441e-07, "learning_rate": 0.004697438253038281, "loss": 2.636, "step": 14589 }, { "crossentropy": 2.6190969944000244, "epoch": 0.5289298143851509, "grad_norm": 0.02702401578426361, "grad_norm_var": 6.468467074008735e-07, "learning_rate": 0.00469685823223517, "loss": 2.5922, "step": 14590 }, { "crossentropy": 2.533702850341797, "epoch": 0.5289660672853829, "grad_norm": 0.02698219195008278, "grad_norm_var": 6.3774975289318e-07, "learning_rate": 0.004696278215526449, "loss": 2.5319, "step": 14591 }, { "crossentropy": 2.623476505279541, "epoch": 0.5290023201856149, "grad_norm": 0.027178708463907242, "grad_norm_var": 6.396535055108819e-07, "learning_rate": 0.004695698202919951, "loss": 2.5701, "step": 14592 }, { "crossentropy": 2.650038957595825, "epoch": 0.5290385730858469, "grad_norm": 0.025598647072911263, "grad_norm_var": 7.484964924704685e-07, "learning_rate": 0.004695118194423511, "loss": 2.5493, "step": 14593 }, { "crossentropy": 2.3096022605895996, "epoch": 0.5290748259860789, "grad_norm": 0.027973497286438942, "grad_norm_var": 8.093493680443852e-07, "learning_rate": 0.004694538190044965, "loss": 2.3895, "step": 14594 }, { "crossentropy": 2.6835265159606934, "epoch": 0.5291110788863109, "grad_norm": 0.03078843094408512, "grad_norm_var": 1.733357577605995e-06, "learning_rate": 0.004693958189792141, "loss": 2.6144, "step": 14595 }, { "crossentropy": 2.565089464187622, "epoch": 0.5291473317865429, "grad_norm": 0.02931239828467369, "grad_norm_var": 2.0258518648644715e-06, "learning_rate": 0.004693378193672877, "loss": 2.5769, "step": 14596 }, { "crossentropy": 2.514697551727295, "epoch": 0.529183584686775, "grad_norm": 0.02811429463326931, "grad_norm_var": 1.8705158559145207e-06, "learning_rate": 0.004692798201695006, "loss": 2.5435, "step": 14597 }, { "crossentropy": 2.5798799991607666, "epoch": 0.529219837587007, "grad_norm": 0.02668604999780655, "grad_norm_var": 1.7826801084493022e-06, "learning_rate": 0.004692218213866361, "loss": 2.5167, "step": 14598 }, { "crossentropy": 2.675067186355591, "epoch": 0.529256090487239, "grad_norm": 0.028128840029239655, "grad_norm_var": 1.8011799461445463e-06, "learning_rate": 0.004691638230194777, "loss": 2.5565, "step": 14599 }, { "crossentropy": 2.401870012283325, "epoch": 0.529292343387471, "grad_norm": 0.026765305548906326, "grad_norm_var": 1.8236287232402392e-06, "learning_rate": 0.0046910582506880846, "loss": 2.5183, "step": 14600 }, { "crossentropy": 2.5923891067504883, "epoch": 0.529328596287703, "grad_norm": 0.027420852333307266, "grad_norm_var": 1.8214557337305984e-06, "learning_rate": 0.004690478275354121, "loss": 2.6287, "step": 14601 }, { "crossentropy": 2.536447525024414, "epoch": 0.529364849187935, "grad_norm": 0.027138330042362213, "grad_norm_var": 1.7159289066620564e-06, "learning_rate": 0.00468989830420072, "loss": 2.5495, "step": 14602 }, { "crossentropy": 2.512369394302368, "epoch": 0.529401102088167, "grad_norm": 0.02782428450882435, "grad_norm_var": 1.588976199679926e-06, "learning_rate": 0.0046893183372357095, "loss": 2.477, "step": 14603 }, { "crossentropy": 2.5681827068328857, "epoch": 0.5294373549883991, "grad_norm": 0.030146174132823944, "grad_norm_var": 1.9008726675930846e-06, "learning_rate": 0.0046887383744669275, "loss": 2.5561, "step": 14604 }, { "crossentropy": 2.656377077102661, "epoch": 0.5294736078886311, "grad_norm": 0.028940698131918907, "grad_norm_var": 1.8375945237777968e-06, "learning_rate": 0.004688158415902206, "loss": 2.5259, "step": 14605 }, { "crossentropy": 2.4268431663513184, "epoch": 0.5295098607888631, "grad_norm": 0.026490207761526108, "grad_norm_var": 1.9160733189915157e-06, "learning_rate": 0.004687578461549379, "loss": 2.486, "step": 14606 }, { "crossentropy": 2.602083921432495, "epoch": 0.5295461136890951, "grad_norm": 0.02821807749569416, "grad_norm_var": 1.869679222726951e-06, "learning_rate": 0.004686998511416279, "loss": 2.5695, "step": 14607 }, { "crossentropy": 2.6306450366973877, "epoch": 0.5295823665893271, "grad_norm": 0.028425315394997597, "grad_norm_var": 1.8435429733602386e-06, "learning_rate": 0.0046864185655107376, "loss": 2.5655, "step": 14608 }, { "crossentropy": 2.5690910816192627, "epoch": 0.5296186194895591, "grad_norm": 0.02792748063802719, "grad_norm_var": 1.4374177132883317e-06, "learning_rate": 0.004685838623840591, "loss": 2.5819, "step": 14609 }, { "crossentropy": 2.5356032848358154, "epoch": 0.5296548723897911, "grad_norm": 0.027662206441164017, "grad_norm_var": 1.4505411177252274e-06, "learning_rate": 0.004685258686413669, "loss": 2.566, "step": 14610 }, { "crossentropy": 2.5732686519622803, "epoch": 0.5296911252900232, "grad_norm": 0.026989376172423363, "grad_norm_var": 1.0031062389817893e-06, "learning_rate": 0.004684678753237809, "loss": 2.593, "step": 14611 }, { "crossentropy": 2.6368730068206787, "epoch": 0.5297273781902552, "grad_norm": 0.02663130685687065, "grad_norm_var": 9.427750841528929e-07, "learning_rate": 0.004684098824320838, "loss": 2.6524, "step": 14612 }, { "crossentropy": 2.6668856143951416, "epoch": 0.5297636310904872, "grad_norm": 0.02757919579744339, "grad_norm_var": 9.324892683179915e-07, "learning_rate": 0.0046835188996705935, "loss": 2.71, "step": 14613 }, { "crossentropy": 2.698721408843994, "epoch": 0.5297998839907193, "grad_norm": 0.02695896103978157, "grad_norm_var": 9.007632038934467e-07, "learning_rate": 0.004682938979294906, "loss": 2.7168, "step": 14614 }, { "crossentropy": 2.5371971130371094, "epoch": 0.5298361368909513, "grad_norm": 0.02665150724351406, "grad_norm_var": 9.532721385127156e-07, "learning_rate": 0.0046823590632016085, "loss": 2.6378, "step": 14615 }, { "crossentropy": 2.604414224624634, "epoch": 0.5298723897911833, "grad_norm": 0.028422638773918152, "grad_norm_var": 9.381573759352387e-07, "learning_rate": 0.004681779151398532, "loss": 2.6542, "step": 14616 }, { "crossentropy": 2.6121106147766113, "epoch": 0.5299086426914154, "grad_norm": 0.030344244092702866, "grad_norm_var": 1.3579677062497755e-06, "learning_rate": 0.0046811992438935135, "loss": 2.6467, "step": 14617 }, { "crossentropy": 2.606727123260498, "epoch": 0.5299448955916474, "grad_norm": 0.02762007713317871, "grad_norm_var": 1.3237491274279214e-06, "learning_rate": 0.004680619340694383, "loss": 2.5938, "step": 14618 }, { "crossentropy": 2.4198074340820312, "epoch": 0.5299811484918794, "grad_norm": 0.026604196056723595, "grad_norm_var": 1.433494689381115e-06, "learning_rate": 0.0046800394418089735, "loss": 2.4447, "step": 14619 }, { "crossentropy": 2.4695634841918945, "epoch": 0.5300174013921114, "grad_norm": 0.026824014261364937, "grad_norm_var": 1.1065132101012772e-06, "learning_rate": 0.004679459547245116, "loss": 2.5263, "step": 14620 }, { "crossentropy": 2.6156978607177734, "epoch": 0.5300536542923434, "grad_norm": 0.028010517358779907, "grad_norm_var": 9.99656282906341e-07, "learning_rate": 0.004678879657010643, "loss": 2.5799, "step": 14621 }, { "crossentropy": 2.57397723197937, "epoch": 0.5300899071925754, "grad_norm": 0.02774238772690296, "grad_norm_var": 9.148769312868384e-07, "learning_rate": 0.004678299771113389, "loss": 2.5867, "step": 14622 }, { "crossentropy": 2.4732282161712646, "epoch": 0.5301261600928074, "grad_norm": 0.030224816873669624, "grad_norm_var": 1.3150254921772171e-06, "learning_rate": 0.004677719889561184, "loss": 2.5169, "step": 14623 }, { "crossentropy": 2.5040040016174316, "epoch": 0.5301624129930395, "grad_norm": 0.029006848111748695, "grad_norm_var": 1.3855280948993209e-06, "learning_rate": 0.004677140012361859, "loss": 2.5064, "step": 14624 }, { "crossentropy": 2.7000033855438232, "epoch": 0.5301986658932715, "grad_norm": 0.02633274532854557, "grad_norm_var": 1.5226832944386625e-06, "learning_rate": 0.0046765601395232495, "loss": 2.6005, "step": 14625 }, { "crossentropy": 2.7039003372192383, "epoch": 0.5302349187935035, "grad_norm": 0.026977229863405228, "grad_norm_var": 1.5577715652238696e-06, "learning_rate": 0.004675980271053187, "loss": 2.6851, "step": 14626 }, { "crossentropy": 2.5651543140411377, "epoch": 0.5302711716937355, "grad_norm": 0.02709365077316761, "grad_norm_var": 1.5488143905525073e-06, "learning_rate": 0.004675400406959503, "loss": 2.5464, "step": 14627 }, { "crossentropy": 2.676571846008301, "epoch": 0.5303074245939675, "grad_norm": 0.030094088986516, "grad_norm_var": 1.809891957034792e-06, "learning_rate": 0.004674820547250029, "loss": 2.6254, "step": 14628 }, { "crossentropy": 2.4222607612609863, "epoch": 0.5303436774941995, "grad_norm": 0.02768632583320141, "grad_norm_var": 1.8059491151716765e-06, "learning_rate": 0.004674240691932597, "loss": 2.4955, "step": 14629 }, { "crossentropy": 2.5678558349609375, "epoch": 0.5303799303944315, "grad_norm": 0.027035413309931755, "grad_norm_var": 1.7965980591148061e-06, "learning_rate": 0.004673660841015038, "loss": 2.4987, "step": 14630 }, { "crossentropy": 2.56097674369812, "epoch": 0.5304161832946636, "grad_norm": 0.027274100109934807, "grad_norm_var": 1.715779593882309e-06, "learning_rate": 0.004673080994505184, "loss": 2.5145, "step": 14631 }, { "crossentropy": 2.727074146270752, "epoch": 0.5304524361948956, "grad_norm": 0.026834027841687202, "grad_norm_var": 1.7746330775873324e-06, "learning_rate": 0.004672501152410868, "loss": 2.6468, "step": 14632 }, { "crossentropy": 2.552196502685547, "epoch": 0.5304886890951276, "grad_norm": 0.027757175266742706, "grad_norm_var": 1.3348269440789535e-06, "learning_rate": 0.004671921314739921, "loss": 2.5821, "step": 14633 }, { "crossentropy": 2.506377935409546, "epoch": 0.5305249419953596, "grad_norm": 0.02934982441365719, "grad_norm_var": 1.5045832452218691e-06, "learning_rate": 0.004671341481500174, "loss": 2.5775, "step": 14634 }, { "crossentropy": 2.4603545665740967, "epoch": 0.5305611948955916, "grad_norm": 0.02986924722790718, "grad_norm_var": 1.648998052289352e-06, "learning_rate": 0.004670761652699459, "loss": 2.506, "step": 14635 }, { "crossentropy": 2.4800612926483154, "epoch": 0.5305974477958236, "grad_norm": 0.027314145117998123, "grad_norm_var": 1.5867015884934854e-06, "learning_rate": 0.00467018182834561, "loss": 2.6558, "step": 14636 }, { "crossentropy": 2.709963083267212, "epoch": 0.5306337006960556, "grad_norm": 0.027544613927602768, "grad_norm_var": 1.6019542656703212e-06, "learning_rate": 0.004669602008446454, "loss": 2.6573, "step": 14637 }, { "crossentropy": 2.6039812564849854, "epoch": 0.5306699535962877, "grad_norm": 0.026336848735809326, "grad_norm_var": 1.775303834639028e-06, "learning_rate": 0.004669022193009824, "loss": 2.4819, "step": 14638 }, { "crossentropy": 2.561520576477051, "epoch": 0.5307062064965197, "grad_norm": 0.02627231366932392, "grad_norm_var": 1.5374226195130047e-06, "learning_rate": 0.004668442382043551, "loss": 2.5798, "step": 14639 }, { "crossentropy": 2.6467130184173584, "epoch": 0.5307424593967517, "grad_norm": 0.026682287454605103, "grad_norm_var": 1.4619370384362362e-06, "learning_rate": 0.004667862575555467, "loss": 2.6466, "step": 14640 }, { "crossentropy": 2.6224992275238037, "epoch": 0.5307787122969838, "grad_norm": 0.026237014681100845, "grad_norm_var": 1.4777709612833733e-06, "learning_rate": 0.004667282773553404, "loss": 2.5904, "step": 14641 }, { "crossentropy": 2.490006685256958, "epoch": 0.5308149651972158, "grad_norm": 0.026608776301145554, "grad_norm_var": 1.5130382036214358e-06, "learning_rate": 0.004666702976045192, "loss": 2.5212, "step": 14642 }, { "crossentropy": 2.6265900135040283, "epoch": 0.5308512180974478, "grad_norm": 0.02731660008430481, "grad_norm_var": 1.5040843336500912e-06, "learning_rate": 0.004666123183038661, "loss": 2.6479, "step": 14643 }, { "crossentropy": 2.7070930004119873, "epoch": 0.5308874709976799, "grad_norm": 0.027727888897061348, "grad_norm_var": 1.0397940702360724e-06, "learning_rate": 0.004665543394541645, "loss": 2.5964, "step": 14644 }, { "crossentropy": 2.4065780639648438, "epoch": 0.5309237238979119, "grad_norm": 0.027087576687335968, "grad_norm_var": 1.0365808237423775e-06, "learning_rate": 0.004664963610561972, "loss": 2.4174, "step": 14645 }, { "crossentropy": 2.422940254211426, "epoch": 0.5309599767981439, "grad_norm": 0.02714524231851101, "grad_norm_var": 1.0330502575011646e-06, "learning_rate": 0.004664383831107474, "loss": 2.5623, "step": 14646 }, { "crossentropy": 2.5788369178771973, "epoch": 0.5309962296983759, "grad_norm": 0.02892063558101654, "grad_norm_var": 1.1891546502020561e-06, "learning_rate": 0.00466380405618598, "loss": 2.6335, "step": 14647 }, { "crossentropy": 2.4997973442077637, "epoch": 0.5310324825986079, "grad_norm": 0.026732448488473892, "grad_norm_var": 1.197976493572811e-06, "learning_rate": 0.004663224285805324, "loss": 2.5929, "step": 14648 }, { "crossentropy": 2.556009531021118, "epoch": 0.5310687354988399, "grad_norm": 0.026393964886665344, "grad_norm_var": 1.2549122523713986e-06, "learning_rate": 0.004662644519973335, "loss": 2.5013, "step": 14649 }, { "crossentropy": 2.4684231281280518, "epoch": 0.5311049883990719, "grad_norm": 0.027072807773947716, "grad_norm_var": 9.706620689803326e-07, "learning_rate": 0.004662064758697844, "loss": 2.5051, "step": 14650 }, { "crossentropy": 2.5336480140686035, "epoch": 0.531141241299304, "grad_norm": 0.02563931792974472, "grad_norm_var": 5.857006422489592e-07, "learning_rate": 0.004661485001986681, "loss": 2.4642, "step": 14651 }, { "crossentropy": 2.5878610610961914, "epoch": 0.531177494199536, "grad_norm": 0.026884641498327255, "grad_norm_var": 5.757770963893752e-07, "learning_rate": 0.004660905249847676, "loss": 2.6271, "step": 14652 }, { "crossentropy": 2.656482458114624, "epoch": 0.531213747099768, "grad_norm": 0.026787837967276573, "grad_norm_var": 5.478077654197476e-07, "learning_rate": 0.004660325502288662, "loss": 2.678, "step": 14653 }, { "crossentropy": 2.5150723457336426, "epoch": 0.53125, "grad_norm": 0.028282301500439644, "grad_norm_var": 6.47257244563761e-07, "learning_rate": 0.004659745759317466, "loss": 2.5398, "step": 14654 }, { "crossentropy": 2.632660388946533, "epoch": 0.531286252900232, "grad_norm": 0.027506520971655846, "grad_norm_var": 6.248555361562079e-07, "learning_rate": 0.004659166020941919, "loss": 2.5925, "step": 14655 }, { "crossentropy": 2.5110719203948975, "epoch": 0.531322505800464, "grad_norm": 0.02653828263282776, "grad_norm_var": 6.334829849264159e-07, "learning_rate": 0.004658586287169854, "loss": 2.5013, "step": 14656 }, { "crossentropy": 2.6505067348480225, "epoch": 0.531358758700696, "grad_norm": 0.025691237300634384, "grad_norm_var": 7.11633546883877e-07, "learning_rate": 0.004658006558009099, "loss": 2.6466, "step": 14657 }, { "crossentropy": 2.611867666244507, "epoch": 0.5313950116009281, "grad_norm": 0.026151293888688087, "grad_norm_var": 7.498591688158273e-07, "learning_rate": 0.004657426833467484, "loss": 2.6146, "step": 14658 }, { "crossentropy": 2.6816701889038086, "epoch": 0.5314312645011601, "grad_norm": 0.028502222150564194, "grad_norm_var": 8.88963941947128e-07, "learning_rate": 0.004656847113552839, "loss": 2.6044, "step": 14659 }, { "crossentropy": 2.6083924770355225, "epoch": 0.5314675174013921, "grad_norm": 0.026555130258202553, "grad_norm_var": 8.715063341693431e-07, "learning_rate": 0.004656267398272995, "loss": 2.5387, "step": 14660 }, { "crossentropy": 2.648332357406616, "epoch": 0.5315037703016241, "grad_norm": 0.026733273640275, "grad_norm_var": 8.748943797614687e-07, "learning_rate": 0.004655687687635784, "loss": 2.5796, "step": 14661 }, { "crossentropy": 2.5499539375305176, "epoch": 0.5315400232018561, "grad_norm": 0.027234498411417007, "grad_norm_var": 8.77465058629724e-07, "learning_rate": 0.004655107981649031, "loss": 2.5354, "step": 14662 }, { "crossentropy": 2.5336575508117676, "epoch": 0.5315762761020881, "grad_norm": 0.02691279537975788, "grad_norm_var": 6.090008796854711e-07, "learning_rate": 0.004654528280320566, "loss": 2.5266, "step": 14663 }, { "crossentropy": 2.6146223545074463, "epoch": 0.5316125290023201, "grad_norm": 0.027551408857107162, "grad_norm_var": 6.379566150784462e-07, "learning_rate": 0.004653948583658223, "loss": 2.6759, "step": 14664 }, { "crossentropy": 2.4590046405792236, "epoch": 0.5316487819025522, "grad_norm": 0.0257854163646698, "grad_norm_var": 7.023522600753858e-07, "learning_rate": 0.004653368891669829, "loss": 2.4717, "step": 14665 }, { "crossentropy": 2.638917922973633, "epoch": 0.5316850348027842, "grad_norm": 0.027339505031704903, "grad_norm_var": 7.142117699369838e-07, "learning_rate": 0.004652789204363215, "loss": 2.5631, "step": 14666 }, { "crossentropy": 2.5159003734588623, "epoch": 0.5317212877030162, "grad_norm": 0.02652333304286003, "grad_norm_var": 6.167013406814063e-07, "learning_rate": 0.004652209521746209, "loss": 2.4818, "step": 14667 }, { "crossentropy": 2.5421736240386963, "epoch": 0.5317575406032483, "grad_norm": 0.028431914746761322, "grad_norm_var": 7.556866358059942e-07, "learning_rate": 0.0046516298438266405, "loss": 2.4981, "step": 14668 }, { "crossentropy": 2.4513044357299805, "epoch": 0.5317937935034803, "grad_norm": 0.0300246924161911, "grad_norm_var": 1.3047338556581758e-06, "learning_rate": 0.004651050170612342, "loss": 2.5059, "step": 14669 }, { "crossentropy": 2.6228437423706055, "epoch": 0.5318300464037123, "grad_norm": 0.02745833434164524, "grad_norm_var": 1.232133809082172e-06, "learning_rate": 0.004650470502111139, "loss": 2.587, "step": 14670 }, { "crossentropy": 2.671464443206787, "epoch": 0.5318662993039444, "grad_norm": 0.026886794716119766, "grad_norm_var": 1.229466253216347e-06, "learning_rate": 0.004649890838330862, "loss": 2.6709, "step": 14671 }, { "crossentropy": 2.655956506729126, "epoch": 0.5319025522041764, "grad_norm": 0.0268259197473526, "grad_norm_var": 1.2113682872983214e-06, "learning_rate": 0.00464931117927934, "loss": 2.5852, "step": 14672 }, { "crossentropy": 2.504800796508789, "epoch": 0.5319388051044084, "grad_norm": 0.02824564278125763, "grad_norm_var": 1.117921059558269e-06, "learning_rate": 0.004648731524964404, "loss": 2.616, "step": 14673 }, { "crossentropy": 2.636319875717163, "epoch": 0.5319750580046404, "grad_norm": 0.02826281450688839, "grad_norm_var": 1.0668034518924892e-06, "learning_rate": 0.004648151875393881, "loss": 2.6241, "step": 14674 }, { "crossentropy": 2.555975914001465, "epoch": 0.5320113109048724, "grad_norm": 0.026014098897576332, "grad_norm_var": 1.1061793787418463e-06, "learning_rate": 0.004647572230575602, "loss": 2.5631, "step": 14675 }, { "crossentropy": 2.6425976753234863, "epoch": 0.5320475638051044, "grad_norm": 0.02634039893746376, "grad_norm_var": 1.130361657633525e-06, "learning_rate": 0.004646992590517393, "loss": 2.612, "step": 14676 }, { "crossentropy": 2.6727190017700195, "epoch": 0.5320838167053364, "grad_norm": 0.027202272787690163, "grad_norm_var": 1.1095655711427025e-06, "learning_rate": 0.004646412955227086, "loss": 2.6444, "step": 14677 }, { "crossentropy": 2.5892741680145264, "epoch": 0.5321200696055685, "grad_norm": 0.028420014306902885, "grad_norm_var": 1.18468285005209e-06, "learning_rate": 0.00464583332471251, "loss": 2.6506, "step": 14678 }, { "crossentropy": 2.57785701751709, "epoch": 0.5321563225058005, "grad_norm": 0.02736644819378853, "grad_norm_var": 1.1687360664429844e-06, "learning_rate": 0.00464525369898149, "loss": 2.5547, "step": 14679 }, { "crossentropy": 2.5876898765563965, "epoch": 0.5321925754060325, "grad_norm": 0.026703553274273872, "grad_norm_var": 1.198519712817036e-06, "learning_rate": 0.004644674078041858, "loss": 2.6029, "step": 14680 }, { "crossentropy": 2.5097568035125732, "epoch": 0.5322288283062645, "grad_norm": 0.028839487582445145, "grad_norm_var": 1.1384828285349584e-06, "learning_rate": 0.004644094461901442, "loss": 2.633, "step": 14681 }, { "crossentropy": 2.4898455142974854, "epoch": 0.5322650812064965, "grad_norm": 0.030286816880106926, "grad_norm_var": 1.59658581931793e-06, "learning_rate": 0.0046435148505680705, "loss": 2.4284, "step": 14682 }, { "crossentropy": 2.617314338684082, "epoch": 0.5323013341067285, "grad_norm": 0.02716081030666828, "grad_norm_var": 1.5186110400716134e-06, "learning_rate": 0.004642935244049571, "loss": 2.5771, "step": 14683 }, { "crossentropy": 2.62335467338562, "epoch": 0.5323375870069605, "grad_norm": 0.02572847343981266, "grad_norm_var": 1.74018482548692e-06, "learning_rate": 0.0046423556423537725, "loss": 2.5465, "step": 14684 }, { "crossentropy": 2.703901529312134, "epoch": 0.5323738399071926, "grad_norm": 0.027899304404854774, "grad_norm_var": 1.3383431081021486e-06, "learning_rate": 0.004641776045488505, "loss": 2.6046, "step": 14685 }, { "crossentropy": 2.5435101985931396, "epoch": 0.5324100928074246, "grad_norm": 0.02728305757045746, "grad_norm_var": 1.340712866732665e-06, "learning_rate": 0.004641196453461598, "loss": 2.5327, "step": 14686 }, { "crossentropy": 2.446272850036621, "epoch": 0.5324463457076566, "grad_norm": 0.026583554223179817, "grad_norm_var": 1.3699035460237296e-06, "learning_rate": 0.004640616866280874, "loss": 2.4992, "step": 14687 }, { "crossentropy": 2.5653445720672607, "epoch": 0.5324825986078886, "grad_norm": 0.03284532204270363, "grad_norm_var": 3.135472758944898e-06, "learning_rate": 0.0046400372839541645, "loss": 2.6347, "step": 14688 }, { "crossentropy": 2.410893678665161, "epoch": 0.5325188515081206, "grad_norm": 0.027034079656004906, "grad_norm_var": 3.159083186976191e-06, "learning_rate": 0.0046394577064893, "loss": 2.5615, "step": 14689 }, { "crossentropy": 2.5102455615997314, "epoch": 0.5325551044083526, "grad_norm": 0.02631252259016037, "grad_norm_var": 3.262979532509212e-06, "learning_rate": 0.004638878133894104, "loss": 2.5457, "step": 14690 }, { "crossentropy": 2.4928174018859863, "epoch": 0.5325913573085846, "grad_norm": 0.02950086072087288, "grad_norm_var": 3.273325944459767e-06, "learning_rate": 0.004638298566176409, "loss": 2.5355, "step": 14691 }, { "crossentropy": 2.6571385860443115, "epoch": 0.5326276102088167, "grad_norm": 0.026465794071555138, "grad_norm_var": 3.249166345682617e-06, "learning_rate": 0.004637719003344038, "loss": 2.568, "step": 14692 }, { "crossentropy": 2.560298204421997, "epoch": 0.5326638631090487, "grad_norm": 0.02635606937110424, "grad_norm_var": 3.3672295847537076e-06, "learning_rate": 0.0046371394454048235, "loss": 2.6201, "step": 14693 }, { "crossentropy": 2.4535861015319824, "epoch": 0.5327001160092807, "grad_norm": 0.026634957641363144, "grad_norm_var": 3.418607453363346e-06, "learning_rate": 0.0046365598923665905, "loss": 2.4698, "step": 14694 }, { "crossentropy": 2.628514289855957, "epoch": 0.5327363689095128, "grad_norm": 0.02712877467274666, "grad_norm_var": 3.4323142680859334e-06, "learning_rate": 0.0046359803442371705, "loss": 2.6902, "step": 14695 }, { "crossentropy": 2.506443500518799, "epoch": 0.5327726218097448, "grad_norm": 0.02733701840043068, "grad_norm_var": 3.375536801425207e-06, "learning_rate": 0.004635400801024386, "loss": 2.5417, "step": 14696 }, { "crossentropy": 2.5633442401885986, "epoch": 0.5328088747099768, "grad_norm": 0.0289648175239563, "grad_norm_var": 3.3953544650609673e-06, "learning_rate": 0.004634821262736067, "loss": 2.5668, "step": 14697 }, { "crossentropy": 2.408313035964966, "epoch": 0.5328451276102089, "grad_norm": 0.028290677815675735, "grad_norm_var": 2.9612641901663084e-06, "learning_rate": 0.0046342417293800415, "loss": 2.5507, "step": 14698 }, { "crossentropy": 2.4176931381225586, "epoch": 0.5328813805104409, "grad_norm": 0.02919064834713936, "grad_norm_var": 3.1011649541547057e-06, "learning_rate": 0.004633662200964136, "loss": 2.5461, "step": 14699 }, { "crossentropy": 2.557274103164673, "epoch": 0.5329176334106729, "grad_norm": 0.028612613677978516, "grad_norm_var": 2.8543473066769727e-06, "learning_rate": 0.004633082677496177, "loss": 2.5191, "step": 14700 }, { "crossentropy": 2.578418493270874, "epoch": 0.5329538863109049, "grad_norm": 0.027702366933226585, "grad_norm_var": 2.856855360011433e-06, "learning_rate": 0.004632503158983996, "loss": 2.5107, "step": 14701 }, { "crossentropy": 2.9017348289489746, "epoch": 0.5329901392111369, "grad_norm": 0.027977880090475082, "grad_norm_var": 2.8307818830511914e-06, "learning_rate": 0.0046319236454354165, "loss": 2.7553, "step": 14702 }, { "crossentropy": 2.5794715881347656, "epoch": 0.5330263921113689, "grad_norm": 0.027337223291397095, "grad_norm_var": 2.7306156701644807e-06, "learning_rate": 0.004631344136858268, "loss": 2.5785, "step": 14703 }, { "crossentropy": 2.4961111545562744, "epoch": 0.5330626450116009, "grad_norm": 0.030301423743367195, "grad_norm_var": 1.4850745466103334e-06, "learning_rate": 0.004630764633260375, "loss": 2.538, "step": 14704 }, { "crossentropy": 2.42850661277771, "epoch": 0.533098897911833, "grad_norm": 0.025480518117547035, "grad_norm_var": 1.7990773700079688e-06, "learning_rate": 0.004630185134649566, "loss": 2.4901, "step": 14705 }, { "crossentropy": 2.535203218460083, "epoch": 0.533135150812065, "grad_norm": 0.0262726042419672, "grad_norm_var": 1.8066928571457026e-06, "learning_rate": 0.004629605641033668, "loss": 2.5635, "step": 14706 }, { "crossentropy": 2.569521903991699, "epoch": 0.533171403712297, "grad_norm": 0.02637743018567562, "grad_norm_var": 1.6756703117485102e-06, "learning_rate": 0.004629026152420508, "loss": 2.5695, "step": 14707 }, { "crossentropy": 2.615344762802124, "epoch": 0.533207656612529, "grad_norm": 0.02662626840174198, "grad_norm_var": 1.6545752181772496e-06, "learning_rate": 0.004628446668817911, "loss": 2.5896, "step": 14708 }, { "crossentropy": 2.6523659229278564, "epoch": 0.533243909512761, "grad_norm": 0.026850813999772072, "grad_norm_var": 1.5919751893693808e-06, "learning_rate": 0.004627867190233708, "loss": 2.6232, "step": 14709 }, { "crossentropy": 2.7478842735290527, "epoch": 0.533280162412993, "grad_norm": 0.027331674471497536, "grad_norm_var": 1.5356494737434404e-06, "learning_rate": 0.004627287716675723, "loss": 2.63, "step": 14710 }, { "crossentropy": 2.5608832836151123, "epoch": 0.533316415313225, "grad_norm": 0.02782469242811203, "grad_norm_var": 1.52113392849762e-06, "learning_rate": 0.004626708248151782, "loss": 2.6004, "step": 14711 }, { "crossentropy": 2.7658803462982178, "epoch": 0.5333526682134571, "grad_norm": 0.02849040925502777, "grad_norm_var": 1.5553901573122618e-06, "learning_rate": 0.004626128784669716, "loss": 2.7191, "step": 14712 }, { "crossentropy": 2.771824359893799, "epoch": 0.5333889211136891, "grad_norm": 0.02652662992477417, "grad_norm_var": 1.524534672489528e-06, "learning_rate": 0.004625549326237344, "loss": 2.6663, "step": 14713 }, { "crossentropy": 2.625243663787842, "epoch": 0.5334251740139211, "grad_norm": 0.026071297004818916, "grad_norm_var": 1.6204930410732097e-06, "learning_rate": 0.004624969872862498, "loss": 2.5543, "step": 14714 }, { "crossentropy": 2.5790963172912598, "epoch": 0.5334614269141531, "grad_norm": 0.027977947145700455, "grad_norm_var": 1.428677864421486e-06, "learning_rate": 0.004624390424553003, "loss": 2.5531, "step": 14715 }, { "crossentropy": 2.5472466945648193, "epoch": 0.5334976798143851, "grad_norm": 0.026638507843017578, "grad_norm_var": 1.342570128813021e-06, "learning_rate": 0.004623810981316685, "loss": 2.6078, "step": 14716 }, { "crossentropy": 2.6261258125305176, "epoch": 0.5335339327146171, "grad_norm": 0.02788294106721878, "grad_norm_var": 1.3558189882416245e-06, "learning_rate": 0.004623231543161371, "loss": 2.6509, "step": 14717 }, { "crossentropy": 2.5664076805114746, "epoch": 0.5335701856148491, "grad_norm": 0.031063798815011978, "grad_norm_var": 2.2513065876985627e-06, "learning_rate": 0.004622652110094887, "loss": 2.5856, "step": 14718 }, { "crossentropy": 2.542562961578369, "epoch": 0.5336064385150812, "grad_norm": 0.027649490162730217, "grad_norm_var": 2.253084932595873e-06, "learning_rate": 0.004622072682125058, "loss": 2.5393, "step": 14719 }, { "crossentropy": 2.5608131885528564, "epoch": 0.5336426914153132, "grad_norm": 0.026847485452890396, "grad_norm_var": 1.690329128163579e-06, "learning_rate": 0.004621493259259713, "loss": 2.6014, "step": 14720 }, { "crossentropy": 2.5660340785980225, "epoch": 0.5336789443155452, "grad_norm": 0.027014998719096184, "grad_norm_var": 1.4765809146802718e-06, "learning_rate": 0.004620913841506674, "loss": 2.5573, "step": 14721 }, { "crossentropy": 2.573028802871704, "epoch": 0.5337151972157773, "grad_norm": 0.026037026196718216, "grad_norm_var": 1.5135905329926076e-06, "learning_rate": 0.00462033442887377, "loss": 2.6072, "step": 14722 }, { "crossentropy": 2.5879759788513184, "epoch": 0.5337514501160093, "grad_norm": 0.02819325029850006, "grad_norm_var": 1.4900775198611303e-06, "learning_rate": 0.004619755021368825, "loss": 2.5812, "step": 14723 }, { "crossentropy": 2.5321543216705322, "epoch": 0.5337877030162413, "grad_norm": 0.025353245437145233, "grad_norm_var": 1.7293486465768387e-06, "learning_rate": 0.0046191756189996646, "loss": 2.6394, "step": 14724 }, { "crossentropy": 2.6045970916748047, "epoch": 0.5338239559164734, "grad_norm": 0.026500338688492775, "grad_norm_var": 1.760803072862665e-06, "learning_rate": 0.0046185962217741156, "loss": 2.6339, "step": 14725 }, { "crossentropy": 2.7168984413146973, "epoch": 0.5338602088167054, "grad_norm": 0.026980914175510406, "grad_norm_var": 1.76877598179447e-06, "learning_rate": 0.004618016829700004, "loss": 2.711, "step": 14726 }, { "crossentropy": 2.542781114578247, "epoch": 0.5338964617169374, "grad_norm": 0.026748551055788994, "grad_norm_var": 1.7681388528096997e-06, "learning_rate": 0.004617437442785155, "loss": 2.5634, "step": 14727 }, { "crossentropy": 2.604656457901001, "epoch": 0.5339327146171694, "grad_norm": 0.028293317183852196, "grad_norm_var": 1.7379319860482666e-06, "learning_rate": 0.004616858061037395, "loss": 2.5929, "step": 14728 }, { "crossentropy": 2.4715588092803955, "epoch": 0.5339689675174014, "grad_norm": 0.02634965255856514, "grad_norm_var": 1.75663405539081e-06, "learning_rate": 0.004616278684464548, "loss": 2.4394, "step": 14729 }, { "crossentropy": 2.6378719806671143, "epoch": 0.5340052204176334, "grad_norm": 0.026821639388799667, "grad_norm_var": 1.6763821746127674e-06, "learning_rate": 0.004615699313074439, "loss": 2.6757, "step": 14730 }, { "crossentropy": 2.487945079803467, "epoch": 0.5340414733178654, "grad_norm": 0.02655583992600441, "grad_norm_var": 1.6689368982423461e-06, "learning_rate": 0.004615119946874893, "loss": 2.4741, "step": 14731 }, { "crossentropy": 2.4964089393615723, "epoch": 0.5340777262180975, "grad_norm": 0.025996066629886627, "grad_norm_var": 1.7413891728176954e-06, "learning_rate": 0.004614540585873736, "loss": 2.5286, "step": 14732 }, { "crossentropy": 2.3431692123413086, "epoch": 0.5341139791183295, "grad_norm": 0.02654366008937359, "grad_norm_var": 1.721368105392825e-06, "learning_rate": 0.004613961230078794, "loss": 2.4632, "step": 14733 }, { "crossentropy": 2.608720541000366, "epoch": 0.5341502320185615, "grad_norm": 0.026064038276672363, "grad_norm_var": 6.142002067734941e-07, "learning_rate": 0.004613381879497892, "loss": 2.5823, "step": 14734 }, { "crossentropy": 2.6483826637268066, "epoch": 0.5341864849187935, "grad_norm": 0.027431676164269447, "grad_norm_var": 5.909508796607398e-07, "learning_rate": 0.004612802534138854, "loss": 2.6092, "step": 14735 }, { "crossentropy": 2.4516139030456543, "epoch": 0.5342227378190255, "grad_norm": 0.026812126860022545, "grad_norm_var": 5.904903700667258e-07, "learning_rate": 0.004612223194009506, "loss": 2.494, "step": 14736 }, { "crossentropy": 2.5890958309173584, "epoch": 0.5342589907192575, "grad_norm": 0.026610825210809708, "grad_norm_var": 5.853966503419072e-07, "learning_rate": 0.004611643859117673, "loss": 2.4783, "step": 14737 }, { "crossentropy": 2.727520227432251, "epoch": 0.5342952436194895, "grad_norm": 0.026095863431692123, "grad_norm_var": 5.80366817058466e-07, "learning_rate": 0.004611064529471178, "loss": 2.704, "step": 14738 }, { "crossentropy": 2.5661017894744873, "epoch": 0.5343314965197216, "grad_norm": 0.02711339294910431, "grad_norm_var": 4.396067881257589e-07, "learning_rate": 0.004610485205077846, "loss": 2.5866, "step": 14739 }, { "crossentropy": 2.488924503326416, "epoch": 0.5343677494199536, "grad_norm": 0.027225136756896973, "grad_norm_var": 3.369641837757305e-07, "learning_rate": 0.004609905885945502, "loss": 2.4774, "step": 14740 }, { "crossentropy": 2.55229115486145, "epoch": 0.5344040023201856, "grad_norm": 0.02768642269074917, "grad_norm_var": 3.8399251021248074e-07, "learning_rate": 0.0046093265720819725, "loss": 2.5639, "step": 14741 }, { "crossentropy": 2.6041386127471924, "epoch": 0.5344402552204176, "grad_norm": 0.028681490570306778, "grad_norm_var": 5.98262676089247e-07, "learning_rate": 0.00460874726349508, "loss": 2.5276, "step": 14742 }, { "crossentropy": 2.623555898666382, "epoch": 0.5344765081206496, "grad_norm": 0.027846550568938255, "grad_norm_var": 6.456789935757601e-07, "learning_rate": 0.004608167960192649, "loss": 2.5771, "step": 14743 }, { "crossentropy": 2.703258514404297, "epoch": 0.5345127610208816, "grad_norm": 0.028733568266034126, "grad_norm_var": 7.332422145451059e-07, "learning_rate": 0.004607588662182505, "loss": 2.6687, "step": 14744 }, { "crossentropy": 2.5079879760742188, "epoch": 0.5345490139211136, "grad_norm": 0.02805504947900772, "grad_norm_var": 7.590643044022572e-07, "learning_rate": 0.004607009369472472, "loss": 2.6027, "step": 14745 }, { "crossentropy": 2.5625531673431396, "epoch": 0.5345852668213457, "grad_norm": 0.027729857712984085, "grad_norm_var": 7.718135734767484e-07, "learning_rate": 0.004606430082070374, "loss": 2.6169, "step": 14746 }, { "crossentropy": 2.6873819828033447, "epoch": 0.5346215197215777, "grad_norm": 0.027157314121723175, "grad_norm_var": 7.428572508802814e-07, "learning_rate": 0.004605850799984035, "loss": 2.6008, "step": 14747 }, { "crossentropy": 2.3819661140441895, "epoch": 0.5346577726218097, "grad_norm": 0.026898974552750587, "grad_norm_var": 6.444841966122961e-07, "learning_rate": 0.004605271523221276, "loss": 2.5067, "step": 14748 }, { "crossentropy": 2.605083465576172, "epoch": 0.5346940255220418, "grad_norm": 0.027966560795903206, "grad_norm_var": 6.28884094611134e-07, "learning_rate": 0.004604692251789927, "loss": 2.588, "step": 14749 }, { "crossentropy": 2.4704439640045166, "epoch": 0.5347302784222738, "grad_norm": 0.02710406295955181, "grad_norm_var": 5.1375292142349e-07, "learning_rate": 0.004604112985697809, "loss": 2.5394, "step": 14750 }, { "crossentropy": 2.594947099685669, "epoch": 0.5347665313225058, "grad_norm": 0.02614426054060459, "grad_norm_var": 6.199397306496286e-07, "learning_rate": 0.0046035337249527445, "loss": 2.556, "step": 14751 }, { "crossentropy": 2.6669719219207764, "epoch": 0.5348027842227379, "grad_norm": 0.026204029098153114, "grad_norm_var": 6.879866856656623e-07, "learning_rate": 0.00460295446956256, "loss": 2.6367, "step": 14752 }, { "crossentropy": 2.696281671524048, "epoch": 0.5348390371229699, "grad_norm": 0.027360262349247932, "grad_norm_var": 6.513930579411652e-07, "learning_rate": 0.0046023752195350764, "loss": 2.6488, "step": 14753 }, { "crossentropy": 2.58063006401062, "epoch": 0.5348752900232019, "grad_norm": 0.027772756293416023, "grad_norm_var": 5.411054148291479e-07, "learning_rate": 0.004601795974878122, "loss": 2.5703, "step": 14754 }, { "crossentropy": 2.401244878768921, "epoch": 0.5349115429234339, "grad_norm": 0.02618524059653282, "grad_norm_var": 6.403136487401554e-07, "learning_rate": 0.0046012167355995144, "loss": 2.4623, "step": 14755 }, { "crossentropy": 2.4878323078155518, "epoch": 0.5349477958236659, "grad_norm": 0.028175638988614082, "grad_norm_var": 6.718340254929875e-07, "learning_rate": 0.0046006375017070816, "loss": 2.4506, "step": 14756 }, { "crossentropy": 2.5033059120178223, "epoch": 0.5349840487238979, "grad_norm": 0.02694869041442871, "grad_norm_var": 6.856804525330144e-07, "learning_rate": 0.004600058273208645, "loss": 2.518, "step": 14757 }, { "crossentropy": 2.4945762157440186, "epoch": 0.53502030162413, "grad_norm": 0.03153238445520401, "grad_norm_var": 1.6673678475746685e-06, "learning_rate": 0.0045994790501120274, "loss": 2.5756, "step": 14758 }, { "crossentropy": 2.462599277496338, "epoch": 0.535056554524362, "grad_norm": 0.032012294977903366, "grad_norm_var": 2.8814286134459425e-06, "learning_rate": 0.004598899832425053, "loss": 2.5191, "step": 14759 }, { "crossentropy": 2.536069869995117, "epoch": 0.535092807424594, "grad_norm": 0.02731819450855255, "grad_norm_var": 2.8443830515411756e-06, "learning_rate": 0.004598320620155545, "loss": 2.5899, "step": 14760 }, { "crossentropy": 2.591989517211914, "epoch": 0.535129060324826, "grad_norm": 0.02533484250307083, "grad_norm_var": 3.2090343339052457e-06, "learning_rate": 0.004597741413311327, "loss": 2.6191, "step": 14761 }, { "crossentropy": 2.4353976249694824, "epoch": 0.535165313225058, "grad_norm": 0.0270692165941, "grad_norm_var": 3.2262244902738485e-06, "learning_rate": 0.004597162211900224, "loss": 2.4859, "step": 14762 }, { "crossentropy": 2.4839928150177, "epoch": 0.53520156612529, "grad_norm": 0.026979288086295128, "grad_norm_var": 3.23809718464745e-06, "learning_rate": 0.004596583015930054, "loss": 2.5464, "step": 14763 }, { "crossentropy": 2.4847757816314697, "epoch": 0.535237819025522, "grad_norm": 0.026763813570141792, "grad_norm_var": 3.2512042094539248e-06, "learning_rate": 0.004596003825408644, "loss": 2.543, "step": 14764 }, { "crossentropy": 2.521191358566284, "epoch": 0.535274071925754, "grad_norm": 0.02768854796886444, "grad_norm_var": 3.240759407011984e-06, "learning_rate": 0.004595424640343813, "loss": 2.5422, "step": 14765 }, { "crossentropy": 2.5190906524658203, "epoch": 0.5353103248259861, "grad_norm": 0.027169903740286827, "grad_norm_var": 3.2372288538782263e-06, "learning_rate": 0.004594845460743389, "loss": 2.5501, "step": 14766 }, { "crossentropy": 2.5660974979400635, "epoch": 0.5353465777262181, "grad_norm": 0.02659699134528637, "grad_norm_var": 3.1657135529986146e-06, "learning_rate": 0.00459426628661519, "loss": 2.5939, "step": 14767 }, { "crossentropy": 2.6305198669433594, "epoch": 0.5353828306264501, "grad_norm": 0.02612406015396118, "grad_norm_var": 3.180672674982232e-06, "learning_rate": 0.004593687117967041, "loss": 2.674, "step": 14768 }, { "crossentropy": 2.5334436893463135, "epoch": 0.5354190835266821, "grad_norm": 0.02634602226316929, "grad_norm_var": 3.2725858973919423e-06, "learning_rate": 0.004593107954806764, "loss": 2.533, "step": 14769 }, { "crossentropy": 2.5503342151641846, "epoch": 0.5354553364269141, "grad_norm": 0.026504263281822205, "grad_norm_var": 3.3272102017227556e-06, "learning_rate": 0.004592528797142183, "loss": 2.5102, "step": 14770 }, { "crossentropy": 2.508545160293579, "epoch": 0.5354915893271461, "grad_norm": 0.026656202971935272, "grad_norm_var": 3.2634209928670503e-06, "learning_rate": 0.004591949644981118, "loss": 2.545, "step": 14771 }, { "crossentropy": 2.46098256111145, "epoch": 0.5355278422273781, "grad_norm": 0.027567453682422638, "grad_norm_var": 3.227799183160712e-06, "learning_rate": 0.004591370498331392, "loss": 2.4978, "step": 14772 }, { "crossentropy": 2.6816303730010986, "epoch": 0.5355640951276102, "grad_norm": 0.026965349912643433, "grad_norm_var": 3.2267845950564402e-06, "learning_rate": 0.004590791357200827, "loss": 2.6441, "step": 14773 }, { "crossentropy": 2.2892277240753174, "epoch": 0.5356003480278422, "grad_norm": 0.0257278960198164, "grad_norm_var": 2.145424942229463e-06, "learning_rate": 0.004590212221597246, "loss": 2.4229, "step": 14774 }, { "crossentropy": 2.2567882537841797, "epoch": 0.5356366009280742, "grad_norm": 0.025961022824048996, "grad_norm_var": 4.3151118325455e-07, "learning_rate": 0.0045896330915284715, "loss": 2.348, "step": 14775 }, { "crossentropy": 2.705937147140503, "epoch": 0.5356728538283063, "grad_norm": 0.02578750066459179, "grad_norm_var": 4.46335457179349e-07, "learning_rate": 0.004589053967002324, "loss": 2.6499, "step": 14776 }, { "crossentropy": 2.585733652114868, "epoch": 0.5357091067285383, "grad_norm": 0.027047844603657722, "grad_norm_var": 3.4587673764667884e-07, "learning_rate": 0.004588474848026627, "loss": 2.6293, "step": 14777 }, { "crossentropy": 2.50219988822937, "epoch": 0.5357453596287703, "grad_norm": 0.02729550004005432, "grad_norm_var": 3.6067796361875327e-07, "learning_rate": 0.0045878957346092005, "loss": 2.5112, "step": 14778 }, { "crossentropy": 2.5065860748291016, "epoch": 0.5357816125290024, "grad_norm": 0.02555028349161148, "grad_norm_var": 4.348740996135917e-07, "learning_rate": 0.0045873166267578706, "loss": 2.4937, "step": 14779 }, { "crossentropy": 2.5756261348724365, "epoch": 0.5358178654292344, "grad_norm": 0.025859717279672623, "grad_norm_var": 4.673640172568489e-07, "learning_rate": 0.0045867375244804534, "loss": 2.576, "step": 14780 }, { "crossentropy": 2.682058334350586, "epoch": 0.5358541183294664, "grad_norm": 0.029209528118371964, "grad_norm_var": 8.422293273027219e-07, "learning_rate": 0.004586158427784774, "loss": 2.5928, "step": 14781 }, { "crossentropy": 2.508349657058716, "epoch": 0.5358903712296984, "grad_norm": 0.027852335944771767, "grad_norm_var": 9.188161969615633e-07, "learning_rate": 0.004585579336678652, "loss": 2.4824, "step": 14782 }, { "crossentropy": 2.6352407932281494, "epoch": 0.5359266241299304, "grad_norm": 0.027572281658649445, "grad_norm_var": 9.660736150143469e-07, "learning_rate": 0.004585000251169911, "loss": 2.6638, "step": 14783 }, { "crossentropy": 2.5145771503448486, "epoch": 0.5359628770301624, "grad_norm": 0.026586057618260384, "grad_norm_var": 9.407510740561583e-07, "learning_rate": 0.00458442117126637, "loss": 2.4509, "step": 14784 }, { "crossentropy": 2.5828051567077637, "epoch": 0.5359991299303944, "grad_norm": 0.028527790680527687, "grad_norm_var": 1.111844607278728e-06, "learning_rate": 0.004583842096975852, "loss": 2.6078, "step": 14785 }, { "crossentropy": 2.4160618782043457, "epoch": 0.5360353828306265, "grad_norm": 0.030631309375166893, "grad_norm_var": 1.949292055409635e-06, "learning_rate": 0.004583263028306179, "loss": 2.4369, "step": 14786 }, { "crossentropy": 2.7331531047821045, "epoch": 0.5360716357308585, "grad_norm": 0.026979293674230576, "grad_norm_var": 1.9334723309292426e-06, "learning_rate": 0.004582683965265172, "loss": 2.6727, "step": 14787 }, { "crossentropy": 2.589296340942383, "epoch": 0.5361078886310905, "grad_norm": 0.02798778936266899, "grad_norm_var": 1.9653849546259846e-06, "learning_rate": 0.00458210490786065, "loss": 2.5828, "step": 14788 }, { "crossentropy": 2.5646307468414307, "epoch": 0.5361441415313225, "grad_norm": 0.02860124036669731, "grad_norm_var": 2.076806485956867e-06, "learning_rate": 0.004581525856100436, "loss": 2.5045, "step": 14789 }, { "crossentropy": 2.383882761001587, "epoch": 0.5361803944315545, "grad_norm": 0.026382794603705406, "grad_norm_var": 1.9642768120688707e-06, "learning_rate": 0.00458094680999235, "loss": 2.3833, "step": 14790 }, { "crossentropy": 2.507664203643799, "epoch": 0.5362166473317865, "grad_norm": 0.026930779218673706, "grad_norm_var": 1.8415803870912612e-06, "learning_rate": 0.004580367769544213, "loss": 2.5286, "step": 14791 }, { "crossentropy": 2.4270689487457275, "epoch": 0.5362529002320185, "grad_norm": 0.02608991414308548, "grad_norm_var": 1.7812641826655632e-06, "learning_rate": 0.004579788734763846, "loss": 2.5388, "step": 14792 }, { "crossentropy": 2.3445627689361572, "epoch": 0.5362891531322506, "grad_norm": 0.027791503816843033, "grad_norm_var": 1.776545030229938e-06, "learning_rate": 0.00457920970565907, "loss": 2.4823, "step": 14793 }, { "crossentropy": 2.5128276348114014, "epoch": 0.5363254060324826, "grad_norm": 0.027143193408846855, "grad_norm_var": 1.7819549852334896e-06, "learning_rate": 0.004578630682237706, "loss": 2.5014, "step": 14794 }, { "crossentropy": 2.600400447845459, "epoch": 0.5363616589327146, "grad_norm": 0.02831304632127285, "grad_norm_var": 1.5477980421346824e-06, "learning_rate": 0.004578051664507573, "loss": 2.644, "step": 14795 }, { "crossentropy": 2.578979730606079, "epoch": 0.5363979118329466, "grad_norm": 0.02984756790101528, "grad_norm_var": 1.5878686478775864e-06, "learning_rate": 0.004577472652476495, "loss": 2.5585, "step": 14796 }, { "crossentropy": 2.6554791927337646, "epoch": 0.5364341647331786, "grad_norm": 0.028439903631806374, "grad_norm_var": 1.4908071990081395e-06, "learning_rate": 0.004576893646152288, "loss": 2.6344, "step": 14797 }, { "crossentropy": 2.5716288089752197, "epoch": 0.5364704176334106, "grad_norm": 0.026593975722789764, "grad_norm_var": 1.5901875426710745e-06, "learning_rate": 0.004576314645542775, "loss": 2.5184, "step": 14798 }, { "crossentropy": 2.675950765609741, "epoch": 0.5365066705336426, "grad_norm": 0.026908911764621735, "grad_norm_var": 1.6357235141284746e-06, "learning_rate": 0.004575735650655776, "loss": 2.5891, "step": 14799 }, { "crossentropy": 2.5412960052490234, "epoch": 0.5365429234338747, "grad_norm": 0.02700107917189598, "grad_norm_var": 1.5829276260112632e-06, "learning_rate": 0.004575156661499109, "loss": 2.5094, "step": 14800 }, { "crossentropy": 2.6152596473693848, "epoch": 0.5365791763341067, "grad_norm": 0.026255501434206963, "grad_norm_var": 1.673205897854144e-06, "learning_rate": 0.004574577678080597, "loss": 2.5197, "step": 14801 }, { "crossentropy": 2.605114221572876, "epoch": 0.5366154292343387, "grad_norm": 0.026477908715605736, "grad_norm_var": 1.0829854732572007e-06, "learning_rate": 0.00457399870040806, "loss": 2.5034, "step": 14802 }, { "crossentropy": 2.6158299446105957, "epoch": 0.5366516821345708, "grad_norm": 0.026918061077594757, "grad_norm_var": 1.0863200723637643e-06, "learning_rate": 0.004573419728489316, "loss": 2.5531, "step": 14803 }, { "crossentropy": 2.5746102333068848, "epoch": 0.5366879350348028, "grad_norm": 0.02657441608607769, "grad_norm_var": 1.0919599079624308e-06, "learning_rate": 0.004572840762332188, "loss": 2.5942, "step": 14804 }, { "crossentropy": 2.572927951812744, "epoch": 0.5367241879350348, "grad_norm": 0.02789565548300743, "grad_norm_var": 9.975399390911576e-07, "learning_rate": 0.004572261801944492, "loss": 2.5207, "step": 14805 }, { "crossentropy": 2.6601672172546387, "epoch": 0.5367604408352669, "grad_norm": 0.030176902189850807, "grad_norm_var": 1.4723190082980078e-06, "learning_rate": 0.00457168284733405, "loss": 2.6167, "step": 14806 }, { "crossentropy": 2.5126967430114746, "epoch": 0.5367966937354989, "grad_norm": 0.029767371714115143, "grad_norm_var": 1.775091953874243e-06, "learning_rate": 0.004571103898508681, "loss": 2.4582, "step": 14807 }, { "crossentropy": 2.585369110107422, "epoch": 0.5368329466357309, "grad_norm": 0.027620786800980568, "grad_norm_var": 1.6057424656475225e-06, "learning_rate": 0.004570524955476203, "loss": 2.5202, "step": 14808 }, { "crossentropy": 2.775071382522583, "epoch": 0.5368691995359629, "grad_norm": 0.029236309230327606, "grad_norm_var": 1.747505766633165e-06, "learning_rate": 0.004569946018244438, "loss": 2.6778, "step": 14809 }, { "crossentropy": 2.4922842979431152, "epoch": 0.5369054524361949, "grad_norm": 0.027705997228622437, "grad_norm_var": 1.716277342277544e-06, "learning_rate": 0.004569367086821206, "loss": 2.5595, "step": 14810 }, { "crossentropy": 2.489081859588623, "epoch": 0.5369417053364269, "grad_norm": 0.026882579550147057, "grad_norm_var": 1.7574408607755217e-06, "learning_rate": 0.004568788161214325, "loss": 2.5304, "step": 14811 }, { "crossentropy": 2.49875807762146, "epoch": 0.536977958236659, "grad_norm": 0.0268023069947958, "grad_norm_var": 1.4930436470989364e-06, "learning_rate": 0.004568209241431614, "loss": 2.574, "step": 14812 }, { "crossentropy": 2.652250289916992, "epoch": 0.537014211136891, "grad_norm": 0.025721753016114235, "grad_norm_var": 1.642662850275846e-06, "learning_rate": 0.004567630327480892, "loss": 2.5672, "step": 14813 }, { "crossentropy": 2.5436031818389893, "epoch": 0.537050464037123, "grad_norm": 0.026320351287722588, "grad_norm_var": 1.6770667629605831e-06, "learning_rate": 0.004567051419369979, "loss": 2.5315, "step": 14814 }, { "crossentropy": 2.492119789123535, "epoch": 0.537086716937355, "grad_norm": 0.029090652242302895, "grad_norm_var": 1.834147533349463e-06, "learning_rate": 0.004566472517106694, "loss": 2.5783, "step": 14815 }, { "crossentropy": 2.6724812984466553, "epoch": 0.537122969837587, "grad_norm": 0.027162518352270126, "grad_norm_var": 1.8244348523076155e-06, "learning_rate": 0.004565893620698853, "loss": 2.6456, "step": 14816 }, { "crossentropy": 2.6361918449401855, "epoch": 0.537159222737819, "grad_norm": 0.026770194992423058, "grad_norm_var": 1.7529745980838945e-06, "learning_rate": 0.004565314730154279, "loss": 2.6193, "step": 14817 }, { "crossentropy": 2.6807761192321777, "epoch": 0.537195475638051, "grad_norm": 0.02785182185471058, "grad_norm_var": 1.6708503488100287e-06, "learning_rate": 0.00456473584548079, "loss": 2.6225, "step": 14818 }, { "crossentropy": 2.637036085128784, "epoch": 0.537231728538283, "grad_norm": 0.02623770199716091, "grad_norm_var": 1.7667321858549073e-06, "learning_rate": 0.004564156966686203, "loss": 2.5789, "step": 14819 }, { "crossentropy": 2.449167013168335, "epoch": 0.5372679814385151, "grad_norm": 0.027865128591656685, "grad_norm_var": 1.6920180248656264e-06, "learning_rate": 0.004563578093778338, "loss": 2.4943, "step": 14820 }, { "crossentropy": 2.463980197906494, "epoch": 0.5373042343387471, "grad_norm": 0.02769968844950199, "grad_norm_var": 1.689155757400353e-06, "learning_rate": 0.004562999226765015, "loss": 2.5686, "step": 14821 }, { "crossentropy": 2.729736089706421, "epoch": 0.5373404872389791, "grad_norm": 0.026691364124417305, "grad_norm_var": 1.2889917323797665e-06, "learning_rate": 0.004562420365654049, "loss": 2.6079, "step": 14822 }, { "crossentropy": 2.5973267555236816, "epoch": 0.5373767401392111, "grad_norm": 0.027474887669086456, "grad_norm_var": 9.134486369586729e-07, "learning_rate": 0.004561841510453259, "loss": 2.5901, "step": 14823 }, { "crossentropy": 2.634155035018921, "epoch": 0.5374129930394431, "grad_norm": 0.026060687378048897, "grad_norm_var": 1.0031829457003705e-06, "learning_rate": 0.004561262661170465, "loss": 2.6548, "step": 14824 }, { "crossentropy": 2.538961172103882, "epoch": 0.5374492459396751, "grad_norm": 0.026260847225785255, "grad_norm_var": 7.579295057956545e-07, "learning_rate": 0.004560683817813484, "loss": 2.576, "step": 14825 }, { "crossentropy": 2.582233190536499, "epoch": 0.5374854988399071, "grad_norm": 0.025463933125138283, "grad_norm_var": 8.722375208042657e-07, "learning_rate": 0.004560104980390136, "loss": 2.5897, "step": 14826 }, { "crossentropy": 2.4926702976226807, "epoch": 0.5375217517401392, "grad_norm": 0.026689372956752777, "grad_norm_var": 8.749491648762092e-07, "learning_rate": 0.004559526148908237, "loss": 2.5089, "step": 14827 }, { "crossentropy": 2.6297616958618164, "epoch": 0.5375580046403712, "grad_norm": 0.02552894502878189, "grad_norm_var": 9.903636489336695e-07, "learning_rate": 0.004558947323375606, "loss": 2.5234, "step": 14828 }, { "crossentropy": 2.4631757736206055, "epoch": 0.5375942575406032, "grad_norm": 0.02753726951777935, "grad_norm_var": 9.340005416690865e-07, "learning_rate": 0.004558368503800062, "loss": 2.5324, "step": 14829 }, { "crossentropy": 2.596886396408081, "epoch": 0.5376305104408353, "grad_norm": 0.027813972905278206, "grad_norm_var": 9.54194529255899e-07, "learning_rate": 0.00455778969018942, "loss": 2.6209, "step": 14830 }, { "crossentropy": 2.630262613296509, "epoch": 0.5376667633410673, "grad_norm": 0.025907276198267937, "grad_norm_var": 7.054633219584379e-07, "learning_rate": 0.0045572108825515, "loss": 2.6365, "step": 14831 }, { "crossentropy": 2.5512759685516357, "epoch": 0.5377030162412993, "grad_norm": 0.026768703013658524, "grad_norm_var": 6.968286967436062e-07, "learning_rate": 0.004556632080894117, "loss": 2.5776, "step": 14832 }, { "crossentropy": 2.5286037921905518, "epoch": 0.5377392691415314, "grad_norm": 0.02675856463611126, "grad_norm_var": 6.968660983046211e-07, "learning_rate": 0.004556053285225093, "loss": 2.446, "step": 14833 }, { "crossentropy": 2.5114705562591553, "epoch": 0.5377755220417634, "grad_norm": 0.025555023923516273, "grad_norm_var": 7.00828064797004e-07, "learning_rate": 0.004555474495552241, "loss": 2.5083, "step": 14834 }, { "crossentropy": 2.543006658554077, "epoch": 0.5378117749419954, "grad_norm": 0.027408096939325333, "grad_norm_var": 7.229468446141486e-07, "learning_rate": 0.004554895711883382, "loss": 2.5892, "step": 14835 }, { "crossentropy": 2.6163454055786133, "epoch": 0.5378480278422274, "grad_norm": 0.02673882432281971, "grad_norm_var": 6.299233686607077e-07, "learning_rate": 0.004554316934226333, "loss": 2.592, "step": 14836 }, { "crossentropy": 2.6132616996765137, "epoch": 0.5378842807424594, "grad_norm": 0.026576291769742966, "grad_norm_var": 5.511724873933158e-07, "learning_rate": 0.004553738162588907, "loss": 2.6032, "step": 14837 }, { "crossentropy": 2.6524853706359863, "epoch": 0.5379205336426914, "grad_norm": 0.026483163237571716, "grad_norm_var": 5.50710526746193e-07, "learning_rate": 0.004553159396978928, "loss": 2.5627, "step": 14838 }, { "crossentropy": 2.614309310913086, "epoch": 0.5379567865429234, "grad_norm": 0.02800990827381611, "grad_norm_var": 6.335718312812262e-07, "learning_rate": 0.004552580637404208, "loss": 2.7182, "step": 14839 }, { "crossentropy": 2.6048760414123535, "epoch": 0.5379930394431555, "grad_norm": 0.027129076421260834, "grad_norm_var": 6.28434975988865e-07, "learning_rate": 0.0045520018838725645, "loss": 2.503, "step": 14840 }, { "crossentropy": 2.3374035358428955, "epoch": 0.5380292923433875, "grad_norm": 0.029173430055379868, "grad_norm_var": 1.0019411354542442e-06, "learning_rate": 0.004551423136391816, "loss": 2.456, "step": 14841 }, { "crossentropy": 2.3523318767547607, "epoch": 0.5380655452436195, "grad_norm": 0.027065739035606384, "grad_norm_var": 8.670506837953381e-07, "learning_rate": 0.004550844394969778, "loss": 2.4197, "step": 14842 }, { "crossentropy": 2.689990758895874, "epoch": 0.5381017981438515, "grad_norm": 0.027492450550198555, "grad_norm_var": 8.79828926694283e-07, "learning_rate": 0.00455026565961427, "loss": 2.6733, "step": 14843 }, { "crossentropy": 2.565269708633423, "epoch": 0.5381380510440835, "grad_norm": 0.027143144980072975, "grad_norm_var": 7.2678776283154e-07, "learning_rate": 0.0045496869303331055, "loss": 2.5265, "step": 14844 }, { "crossentropy": 2.5393800735473633, "epoch": 0.5381743039443155, "grad_norm": 0.026641711592674255, "grad_norm_var": 7.244093694550961e-07, "learning_rate": 0.0045491082071341, "loss": 2.5105, "step": 14845 }, { "crossentropy": 2.6044998168945312, "epoch": 0.5382105568445475, "grad_norm": 0.027671383693814278, "grad_norm_var": 7.109955632254398e-07, "learning_rate": 0.004548529490025078, "loss": 2.5835, "step": 14846 }, { "crossentropy": 2.3201301097869873, "epoch": 0.5382468097447796, "grad_norm": 0.028438495472073555, "grad_norm_var": 7.316202560003181e-07, "learning_rate": 0.004547950779013848, "loss": 2.4548, "step": 14847 }, { "crossentropy": 2.488104820251465, "epoch": 0.5382830626450116, "grad_norm": 0.029931344091892242, "grad_norm_var": 1.1787399137086022e-06, "learning_rate": 0.0045473720741082265, "loss": 2.5628, "step": 14848 }, { "crossentropy": 2.3393959999084473, "epoch": 0.5383193155452436, "grad_norm": 0.029197286814451218, "grad_norm_var": 1.345605445103925e-06, "learning_rate": 0.004546793375316034, "loss": 2.4486, "step": 14849 }, { "crossentropy": 2.519977331161499, "epoch": 0.5383555684454756, "grad_norm": 0.0314745157957077, "grad_norm_var": 1.9681979003237255e-06, "learning_rate": 0.0045462146826450835, "loss": 2.5583, "step": 14850 }, { "crossentropy": 2.565579652786255, "epoch": 0.5383918213457076, "grad_norm": 0.02638472430408001, "grad_norm_var": 2.1022648902377457e-06, "learning_rate": 0.004545635996103193, "loss": 2.5141, "step": 14851 }, { "crossentropy": 2.4572625160217285, "epoch": 0.5384280742459396, "grad_norm": 0.029433337971568108, "grad_norm_var": 2.157918972077672e-06, "learning_rate": 0.0045450573156981785, "loss": 2.4934, "step": 14852 }, { "crossentropy": 2.3710830211639404, "epoch": 0.5384643271461717, "grad_norm": 0.03135516494512558, "grad_norm_var": 2.6683111125497117e-06, "learning_rate": 0.004544478641437854, "loss": 2.5371, "step": 14853 }, { "crossentropy": 2.6746726036071777, "epoch": 0.5385005800464037, "grad_norm": 0.02994038164615631, "grad_norm_var": 2.5713612767999517e-06, "learning_rate": 0.0045438999733300395, "loss": 2.6817, "step": 14854 }, { "crossentropy": 2.5682406425476074, "epoch": 0.5385368329466357, "grad_norm": 0.0278434157371521, "grad_norm_var": 2.5846421884453385e-06, "learning_rate": 0.004543321311382545, "loss": 2.4725, "step": 14855 }, { "crossentropy": 2.5145890712738037, "epoch": 0.5385730858468677, "grad_norm": 0.02713562361896038, "grad_norm_var": 2.5834308872314196e-06, "learning_rate": 0.0045427426556031904, "loss": 2.5352, "step": 14856 }, { "crossentropy": 2.5686352252960205, "epoch": 0.5386093387470998, "grad_norm": 0.02607055939733982, "grad_norm_var": 2.9148898571251802e-06, "learning_rate": 0.0045421640059997905, "loss": 2.5919, "step": 14857 }, { "crossentropy": 2.7242281436920166, "epoch": 0.5386455916473318, "grad_norm": 0.026743225753307343, "grad_norm_var": 2.975593050975195e-06, "learning_rate": 0.004541585362580159, "loss": 2.646, "step": 14858 }, { "crossentropy": 2.4876036643981934, "epoch": 0.5386818445475638, "grad_norm": 0.027120478451251984, "grad_norm_var": 3.0245921553687887e-06, "learning_rate": 0.0045410067253521145, "loss": 2.5985, "step": 14859 }, { "crossentropy": 2.561107635498047, "epoch": 0.5387180974477959, "grad_norm": 0.02734880894422531, "grad_norm_var": 2.9959843067759243e-06, "learning_rate": 0.004540428094323471, "loss": 2.5335, "step": 14860 }, { "crossentropy": 2.5630290508270264, "epoch": 0.5387543503480279, "grad_norm": 0.02728809043765068, "grad_norm_var": 2.8795540708385914e-06, "learning_rate": 0.004539849469502041, "loss": 2.5321, "step": 14861 }, { "crossentropy": 2.4026331901550293, "epoch": 0.5387906032482599, "grad_norm": 0.027191268280148506, "grad_norm_var": 2.936510016631616e-06, "learning_rate": 0.004539270850895644, "loss": 2.4363, "step": 14862 }, { "crossentropy": 2.6310462951660156, "epoch": 0.5388268561484919, "grad_norm": 0.028244389221072197, "grad_norm_var": 2.9354369180005385e-06, "learning_rate": 0.004538692238512094, "loss": 2.5821, "step": 14863 }, { "crossentropy": 2.4482309818267822, "epoch": 0.5388631090487239, "grad_norm": 0.028276989236474037, "grad_norm_var": 2.7453070340814245e-06, "learning_rate": 0.004538113632359205, "loss": 2.5249, "step": 14864 }, { "crossentropy": 2.556450843811035, "epoch": 0.5388993619489559, "grad_norm": 0.026720982044935226, "grad_norm_var": 2.7961529440901815e-06, "learning_rate": 0.004537535032444791, "loss": 2.546, "step": 14865 }, { "crossentropy": 2.446652412414551, "epoch": 0.538935614849188, "grad_norm": 0.02835439331829548, "grad_norm_var": 1.9740168344215454e-06, "learning_rate": 0.00453695643877667, "loss": 2.5735, "step": 14866 }, { "crossentropy": 2.7863006591796875, "epoch": 0.53897186774942, "grad_norm": 0.026847628876566887, "grad_norm_var": 1.8975432240051401e-06, "learning_rate": 0.004536377851362654, "loss": 2.6442, "step": 14867 }, { "crossentropy": 2.4864792823791504, "epoch": 0.539008120649652, "grad_norm": 0.02591749280691147, "grad_norm_var": 1.937101435642282e-06, "learning_rate": 0.004535799270210558, "loss": 2.4337, "step": 14868 }, { "crossentropy": 2.5807337760925293, "epoch": 0.539044373549884, "grad_norm": 0.025569098070263863, "grad_norm_var": 1.1710143756656887e-06, "learning_rate": 0.004535220695328198, "loss": 2.6, "step": 14869 }, { "crossentropy": 2.632208824157715, "epoch": 0.539080626450116, "grad_norm": 0.02678874507546425, "grad_norm_var": 6.773628708461877e-07, "learning_rate": 0.004534642126723388, "loss": 2.6092, "step": 14870 }, { "crossentropy": 2.7213332653045654, "epoch": 0.539116879350348, "grad_norm": 0.027278536930680275, "grad_norm_var": 6.406604754002985e-07, "learning_rate": 0.004534063564403943, "loss": 2.6171, "step": 14871 }, { "crossentropy": 2.5683250427246094, "epoch": 0.53915313225058, "grad_norm": 0.026628969237208366, "grad_norm_var": 6.513265652322569e-07, "learning_rate": 0.0045334850083776764, "loss": 2.4847, "step": 14872 }, { "crossentropy": 2.574439287185669, "epoch": 0.539189385150812, "grad_norm": 0.027342740446329117, "grad_norm_var": 5.906928489307975e-07, "learning_rate": 0.004532906458652402, "loss": 2.5598, "step": 14873 }, { "crossentropy": 2.6362833976745605, "epoch": 0.5392256380510441, "grad_norm": 0.026969166472554207, "grad_norm_var": 5.830190177665787e-07, "learning_rate": 0.004532327915235936, "loss": 2.6359, "step": 14874 }, { "crossentropy": 2.7427480220794678, "epoch": 0.5392618909512761, "grad_norm": 0.027090394869446754, "grad_norm_var": 5.830655845172801e-07, "learning_rate": 0.00453174937813609, "loss": 2.6077, "step": 14875 }, { "crossentropy": 2.6491219997406006, "epoch": 0.5392981438515081, "grad_norm": 0.026645679026842117, "grad_norm_var": 5.921490008219931e-07, "learning_rate": 0.0045311708473606774, "loss": 2.5506, "step": 14876 }, { "crossentropy": 2.6335039138793945, "epoch": 0.5393343967517401, "grad_norm": 0.02724285051226616, "grad_norm_var": 5.90974427888633e-07, "learning_rate": 0.0045305923229175165, "loss": 2.5672, "step": 14877 }, { "crossentropy": 2.592113733291626, "epoch": 0.5393706496519721, "grad_norm": 0.026973647996783257, "grad_norm_var": 5.903962544436906e-07, "learning_rate": 0.004530013804814418, "loss": 2.5258, "step": 14878 }, { "crossentropy": 2.6092495918273926, "epoch": 0.5394069025522041, "grad_norm": 0.026130713522434235, "grad_norm_var": 5.346312250143244e-07, "learning_rate": 0.004529435293059197, "loss": 2.6422, "step": 14879 }, { "crossentropy": 2.7250936031341553, "epoch": 0.5394431554524362, "grad_norm": 0.026462530717253685, "grad_norm_var": 4.129814516693238e-07, "learning_rate": 0.004528856787659666, "loss": 2.5929, "step": 14880 }, { "crossentropy": 2.584275245666504, "epoch": 0.5394794083526682, "grad_norm": 0.026277873665094376, "grad_norm_var": 4.305254788777481e-07, "learning_rate": 0.00452827828862364, "loss": 2.6459, "step": 14881 }, { "crossentropy": 2.627948045730591, "epoch": 0.5395156612529002, "grad_norm": 0.02705766074359417, "grad_norm_var": 2.6384847437951363e-07, "learning_rate": 0.00452769979595893, "loss": 2.5483, "step": 14882 }, { "crossentropy": 2.5594708919525146, "epoch": 0.5395519141531323, "grad_norm": 0.028405889868736267, "grad_norm_var": 4.4597400008592606e-07, "learning_rate": 0.0045271213096733515, "loss": 2.5677, "step": 14883 }, { "crossentropy": 2.557835102081299, "epoch": 0.5395881670533643, "grad_norm": 0.027401838451623917, "grad_norm_var": 4.0924247286244773e-07, "learning_rate": 0.0045265428297747165, "loss": 2.6087, "step": 14884 }, { "crossentropy": 2.3709464073181152, "epoch": 0.5396244199535963, "grad_norm": 0.025953054428100586, "grad_norm_var": 3.5074962126100245e-07, "learning_rate": 0.00452596435627084, "loss": 2.4407, "step": 14885 }, { "crossentropy": 2.6543359756469727, "epoch": 0.5396606728538283, "grad_norm": 0.026926305145025253, "grad_norm_var": 3.4960481170239327e-07, "learning_rate": 0.004525385889169535, "loss": 2.6119, "step": 14886 }, { "crossentropy": 2.560053586959839, "epoch": 0.5396969257540604, "grad_norm": 0.027124591171741486, "grad_norm_var": 3.438136985329035e-07, "learning_rate": 0.0045248074284786124, "loss": 2.5544, "step": 14887 }, { "crossentropy": 2.5625247955322266, "epoch": 0.5397331786542924, "grad_norm": 0.027633139863610268, "grad_norm_var": 3.6859061476370486e-07, "learning_rate": 0.004524228974205888, "loss": 2.5644, "step": 14888 }, { "crossentropy": 2.698803424835205, "epoch": 0.5397694315545244, "grad_norm": 0.02748269774019718, "grad_norm_var": 3.7663285238156e-07, "learning_rate": 0.004523650526359173, "loss": 2.6551, "step": 14889 }, { "crossentropy": 2.7438244819641113, "epoch": 0.5398056844547564, "grad_norm": 0.02755168452858925, "grad_norm_var": 3.965234886116794e-07, "learning_rate": 0.0045230720849462796, "loss": 2.6902, "step": 14890 }, { "crossentropy": 2.6747491359710693, "epoch": 0.5398419373549884, "grad_norm": 0.02656714618206024, "grad_norm_var": 4.089009341123739e-07, "learning_rate": 0.0045224936499750215, "loss": 2.6709, "step": 14891 }, { "crossentropy": 2.6501903533935547, "epoch": 0.5398781902552204, "grad_norm": 0.02796303480863571, "grad_norm_var": 4.5691557984393836e-07, "learning_rate": 0.004521915221453211, "loss": 2.5331, "step": 14892 }, { "crossentropy": 2.5781068801879883, "epoch": 0.5399144431554525, "grad_norm": 0.0287157129496336, "grad_norm_var": 6.260175813093659e-07, "learning_rate": 0.004521336799388662, "loss": 2.5672, "step": 14893 }, { "crossentropy": 2.492723226547241, "epoch": 0.5399506960556845, "grad_norm": 0.026596030220389366, "grad_norm_var": 6.445249027990536e-07, "learning_rate": 0.004520758383789185, "loss": 2.4993, "step": 14894 }, { "crossentropy": 2.56290340423584, "epoch": 0.5399869489559165, "grad_norm": 0.026926368474960327, "grad_norm_var": 5.769534053308318e-07, "learning_rate": 0.004520179974662595, "loss": 2.5746, "step": 14895 }, { "crossentropy": 2.4722607135772705, "epoch": 0.5400232018561485, "grad_norm": 0.026712516322731972, "grad_norm_var": 5.566000451348122e-07, "learning_rate": 0.004519601572016702, "loss": 2.4527, "step": 14896 }, { "crossentropy": 2.577345132827759, "epoch": 0.5400594547563805, "grad_norm": 0.026135267689824104, "grad_norm_var": 5.75518047736404e-07, "learning_rate": 0.004519023175859318, "loss": 2.5477, "step": 14897 }, { "crossentropy": 2.532872200012207, "epoch": 0.5400957076566125, "grad_norm": 0.027527719736099243, "grad_norm_var": 5.805910644291386e-07, "learning_rate": 0.004518444786198257, "loss": 2.6026, "step": 14898 }, { "crossentropy": 2.527635097503662, "epoch": 0.5401319605568445, "grad_norm": 0.02718600071966648, "grad_norm_var": 4.81758982986954e-07, "learning_rate": 0.004517866403041328, "loss": 2.5624, "step": 14899 }, { "crossentropy": 2.5372586250305176, "epoch": 0.5401682134570766, "grad_norm": 0.028759516775608063, "grad_norm_var": 6.42518213772924e-07, "learning_rate": 0.0045172880263963456, "loss": 2.638, "step": 14900 }, { "crossentropy": 2.5919902324676514, "epoch": 0.5402044663573086, "grad_norm": 0.02726784162223339, "grad_norm_var": 5.258197627889052e-07, "learning_rate": 0.0045167096562711215, "loss": 2.6384, "step": 14901 }, { "crossentropy": 2.547269821166992, "epoch": 0.5402407192575406, "grad_norm": 0.025200430303812027, "grad_norm_var": 8.019418017997266e-07, "learning_rate": 0.004516131292673467, "loss": 2.5961, "step": 14902 }, { "crossentropy": 2.558070182800293, "epoch": 0.5402769721577726, "grad_norm": 0.026120172813534737, "grad_norm_var": 8.763472560075012e-07, "learning_rate": 0.004515552935611193, "loss": 2.4559, "step": 14903 }, { "crossentropy": 2.5207390785217285, "epoch": 0.5403132250580046, "grad_norm": 0.028028592467308044, "grad_norm_var": 9.117760247973409e-07, "learning_rate": 0.0045149745850921125, "loss": 2.5478, "step": 14904 }, { "crossentropy": 2.5991017818450928, "epoch": 0.5403494779582366, "grad_norm": 0.029526468366384506, "grad_norm_var": 1.2576962987275678e-06, "learning_rate": 0.004514396241124037, "loss": 2.5674, "step": 14905 }, { "crossentropy": 2.6355698108673096, "epoch": 0.5403857308584686, "grad_norm": 0.028104981407523155, "grad_norm_var": 1.2954688395340937e-06, "learning_rate": 0.0045138179037147765, "loss": 2.5929, "step": 14906 }, { "crossentropy": 2.5407495498657227, "epoch": 0.5404219837587007, "grad_norm": 0.026518739759922028, "grad_norm_var": 1.3005622076898057e-06, "learning_rate": 0.004513239572872143, "loss": 2.5543, "step": 14907 }, { "crossentropy": 2.638634204864502, "epoch": 0.5404582366589327, "grad_norm": 0.026848338544368744, "grad_norm_var": 1.2842231708200258e-06, "learning_rate": 0.004512661248603946, "loss": 2.6566, "step": 14908 }, { "crossentropy": 2.54056978225708, "epoch": 0.5404944895591647, "grad_norm": 0.026521028950810432, "grad_norm_var": 1.1595545298068514e-06, "learning_rate": 0.004512082930918, "loss": 2.5789, "step": 14909 }, { "crossentropy": 2.7372381687164307, "epoch": 0.5405307424593968, "grad_norm": 0.026819366961717606, "grad_norm_var": 1.1469573975548721e-06, "learning_rate": 0.004511504619822114, "loss": 2.6271, "step": 14910 }, { "crossentropy": 2.5924289226531982, "epoch": 0.5405669953596288, "grad_norm": 0.028829475864768028, "grad_norm_var": 1.31969373443989e-06, "learning_rate": 0.0045109263153241, "loss": 2.6171, "step": 14911 }, { "crossentropy": 2.6965601444244385, "epoch": 0.5406032482598608, "grad_norm": 0.030333753675222397, "grad_norm_var": 1.8765520130407798e-06, "learning_rate": 0.004510348017431769, "loss": 2.6721, "step": 14912 }, { "crossentropy": 2.5719408988952637, "epoch": 0.5406395011600929, "grad_norm": 0.028663596138358116, "grad_norm_var": 1.8217515630306597e-06, "learning_rate": 0.004509769726152931, "loss": 2.6234, "step": 14913 }, { "crossentropy": 2.492953062057495, "epoch": 0.5406757540603249, "grad_norm": 0.02651338092982769, "grad_norm_var": 1.9013775766962258e-06, "learning_rate": 0.004509191441495396, "loss": 2.5199, "step": 14914 }, { "crossentropy": 2.5377657413482666, "epoch": 0.5407120069605569, "grad_norm": 0.02650115080177784, "grad_norm_var": 1.9664500079743494e-06, "learning_rate": 0.004508613163466975, "loss": 2.6259, "step": 14915 }, { "crossentropy": 2.5499727725982666, "epoch": 0.5407482598607889, "grad_norm": 0.028469938784837723, "grad_norm_var": 1.924404253569647e-06, "learning_rate": 0.004508034892075478, "loss": 2.5423, "step": 14916 }, { "crossentropy": 2.537879705429077, "epoch": 0.5407845127610209, "grad_norm": 0.031668875366449356, "grad_norm_var": 2.9889395590507503e-06, "learning_rate": 0.0045074566273287184, "loss": 2.5185, "step": 14917 }, { "crossentropy": 2.71431040763855, "epoch": 0.5408207656612529, "grad_norm": 0.027768990024924278, "grad_norm_var": 2.5138157646061495e-06, "learning_rate": 0.004506878369234503, "loss": 2.6732, "step": 14918 }, { "crossentropy": 2.63539981842041, "epoch": 0.5408570185614849, "grad_norm": 0.026184847578406334, "grad_norm_var": 2.4982781777718154e-06, "learning_rate": 0.0045063001178006455, "loss": 2.5454, "step": 14919 }, { "crossentropy": 2.4863126277923584, "epoch": 0.540893271461717, "grad_norm": 0.0267312191426754, "grad_norm_var": 2.5909792434460527e-06, "learning_rate": 0.004505721873034952, "loss": 2.5671, "step": 14920 }, { "crossentropy": 2.5518081188201904, "epoch": 0.540929524361949, "grad_norm": 0.027222594246268272, "grad_norm_var": 2.4154953468613714e-06, "learning_rate": 0.004505143634945235, "loss": 2.4679, "step": 14921 }, { "crossentropy": 2.5445950031280518, "epoch": 0.540965777262181, "grad_norm": 0.027993645519018173, "grad_norm_var": 2.4107223725163457e-06, "learning_rate": 0.004504565403539307, "loss": 2.587, "step": 14922 }, { "crossentropy": 2.4647216796875, "epoch": 0.541002030162413, "grad_norm": 0.028668219223618507, "grad_norm_var": 2.353975932813098e-06, "learning_rate": 0.004503987178824973, "loss": 2.4452, "step": 14923 }, { "crossentropy": 2.6479499340057373, "epoch": 0.541038283062645, "grad_norm": 0.031056737527251244, "grad_norm_var": 2.893983217909513e-06, "learning_rate": 0.004503408960810044, "loss": 2.6809, "step": 14924 }, { "crossentropy": 2.59987211227417, "epoch": 0.541074535962877, "grad_norm": 0.029206203296780586, "grad_norm_var": 2.7715493387439614e-06, "learning_rate": 0.004502830749502331, "loss": 2.584, "step": 14925 }, { "crossentropy": 2.590207815170288, "epoch": 0.541110788863109, "grad_norm": 0.026228468865156174, "grad_norm_var": 2.909198358094084e-06, "learning_rate": 0.004502252544909644, "loss": 2.589, "step": 14926 }, { "crossentropy": 2.6372251510620117, "epoch": 0.541147041763341, "grad_norm": 0.026030918583273888, "grad_norm_var": 3.1834257854970276e-06, "learning_rate": 0.004501674347039792, "loss": 2.5196, "step": 14927 }, { "crossentropy": 2.513002872467041, "epoch": 0.5411832946635731, "grad_norm": 0.027139492332935333, "grad_norm_var": 2.8602581322348735e-06, "learning_rate": 0.004501096155900583, "loss": 2.542, "step": 14928 }, { "crossentropy": 2.5200579166412354, "epoch": 0.5412195475638051, "grad_norm": 0.026446759700775146, "grad_norm_var": 2.9352059255575845e-06, "learning_rate": 0.004500517971499827, "loss": 2.4886, "step": 14929 }, { "crossentropy": 2.4224462509155273, "epoch": 0.5412558004640371, "grad_norm": 0.027129238471388817, "grad_norm_var": 2.858231864824922e-06, "learning_rate": 0.004499939793845337, "loss": 2.5217, "step": 14930 }, { "crossentropy": 2.636766195297241, "epoch": 0.5412920533642691, "grad_norm": 0.027919379994273186, "grad_norm_var": 2.742502380820285e-06, "learning_rate": 0.004499361622944917, "loss": 2.6633, "step": 14931 }, { "crossentropy": 2.498361587524414, "epoch": 0.5413283062645011, "grad_norm": 0.027185317128896713, "grad_norm_var": 2.7423008078407967e-06, "learning_rate": 0.004498783458806377, "loss": 2.5392, "step": 14932 }, { "crossentropy": 2.6008803844451904, "epoch": 0.5413645591647331, "grad_norm": 0.02785113826394081, "grad_norm_var": 1.6768953945775575e-06, "learning_rate": 0.004498205301437528, "loss": 2.5765, "step": 14933 }, { "crossentropy": 2.517486572265625, "epoch": 0.5414008120649652, "grad_norm": 0.026567386463284492, "grad_norm_var": 1.7316820705163266e-06, "learning_rate": 0.004497627150846177, "loss": 2.4577, "step": 14934 }, { "crossentropy": 2.5387680530548096, "epoch": 0.5414370649651972, "grad_norm": 0.027226656675338745, "grad_norm_var": 1.620638791593095e-06, "learning_rate": 0.0044970490070401345, "loss": 2.5036, "step": 14935 }, { "crossentropy": 2.485582113265991, "epoch": 0.5414733178654292, "grad_norm": 0.03000514768064022, "grad_norm_var": 1.9384989171816834e-06, "learning_rate": 0.004496470870027209, "loss": 2.5959, "step": 14936 }, { "crossentropy": 2.7513513565063477, "epoch": 0.5415095707656613, "grad_norm": 0.028758171945810318, "grad_norm_var": 1.9794609221271873e-06, "learning_rate": 0.004495892739815207, "loss": 2.6802, "step": 14937 }, { "crossentropy": 2.6014761924743652, "epoch": 0.5415458236658933, "grad_norm": 0.028326375409960747, "grad_norm_var": 1.993271765787424e-06, "learning_rate": 0.004495314616411942, "loss": 2.5998, "step": 14938 }, { "crossentropy": 2.592519998550415, "epoch": 0.5415820765661253, "grad_norm": 0.027856765314936638, "grad_norm_var": 1.9468837068036762e-06, "learning_rate": 0.004494736499825216, "loss": 2.6003, "step": 14939 }, { "crossentropy": 2.5303544998168945, "epoch": 0.5416183294663574, "grad_norm": 0.027323530986905098, "grad_norm_var": 1.2010327892167933e-06, "learning_rate": 0.0044941583900628395, "loss": 2.5341, "step": 14940 }, { "crossentropy": 2.671761989593506, "epoch": 0.5416545823665894, "grad_norm": 0.02597615495324135, "grad_norm_var": 1.1506187583775729e-06, "learning_rate": 0.004493580287132623, "loss": 2.5119, "step": 14941 }, { "crossentropy": 2.6035499572753906, "epoch": 0.5416908352668214, "grad_norm": 0.026465270668268204, "grad_norm_var": 1.1179807853787965e-06, "learning_rate": 0.004493002191042373, "loss": 2.5904, "step": 14942 }, { "crossentropy": 2.773310661315918, "epoch": 0.5417270881670534, "grad_norm": 0.02604849450290203, "grad_norm_var": 1.1148198751868727e-06, "learning_rate": 0.004492424101799898, "loss": 2.7175, "step": 14943 }, { "crossentropy": 2.6685030460357666, "epoch": 0.5417633410672854, "grad_norm": 0.026278041303157806, "grad_norm_var": 1.1898686697319726e-06, "learning_rate": 0.004491846019413006, "loss": 2.6263, "step": 14944 }, { "crossentropy": 2.5775253772735596, "epoch": 0.5417995939675174, "grad_norm": 0.027475660666823387, "grad_norm_var": 1.1341458198511755e-06, "learning_rate": 0.004491267943889503, "loss": 2.5691, "step": 14945 }, { "crossentropy": 2.556086778640747, "epoch": 0.5418358468677494, "grad_norm": 0.02639458328485489, "grad_norm_var": 1.1943558863237885e-06, "learning_rate": 0.0044906898752372, "loss": 2.4901, "step": 14946 }, { "crossentropy": 2.5361521244049072, "epoch": 0.5418720997679815, "grad_norm": 0.026731880381703377, "grad_norm_var": 1.1929134941660499e-06, "learning_rate": 0.004490111813463905, "loss": 2.4286, "step": 14947 }, { "crossentropy": 2.571568727493286, "epoch": 0.5419083526682135, "grad_norm": 0.026419878005981445, "grad_norm_var": 1.2391351415133011e-06, "learning_rate": 0.004489533758577421, "loss": 2.6134, "step": 14948 }, { "crossentropy": 2.4956679344177246, "epoch": 0.5419446055684455, "grad_norm": 0.02956012636423111, "grad_norm_var": 1.562852898902752e-06, "learning_rate": 0.0044889557105855585, "loss": 2.6453, "step": 14949 }, { "crossentropy": 2.478883743286133, "epoch": 0.5419808584686775, "grad_norm": 0.027095699682831764, "grad_norm_var": 1.52598723744841e-06, "learning_rate": 0.004488377669496125, "loss": 2.4707, "step": 14950 }, { "crossentropy": 2.607632875442505, "epoch": 0.5420171113689095, "grad_norm": 0.027380866929888725, "grad_norm_var": 1.5244973683410281e-06, "learning_rate": 0.004487799635316928, "loss": 2.5307, "step": 14951 }, { "crossentropy": 2.7024757862091064, "epoch": 0.5420533642691415, "grad_norm": 0.027346570044755936, "grad_norm_var": 1.0360638719221377e-06, "learning_rate": 0.004487221608055774, "loss": 2.6557, "step": 14952 }, { "crossentropy": 2.658510684967041, "epoch": 0.5420896171693735, "grad_norm": 0.025439949706196785, "grad_norm_var": 1.0414279471360516e-06, "learning_rate": 0.004486643587720469, "loss": 2.6311, "step": 14953 }, { "crossentropy": 2.555694818496704, "epoch": 0.5421258700696056, "grad_norm": 0.029479291290044785, "grad_norm_var": 1.3272456652030363e-06, "learning_rate": 0.0044860655743188226, "loss": 2.6467, "step": 14954 }, { "crossentropy": 2.6264777183532715, "epoch": 0.5421621229698376, "grad_norm": 0.02739633619785309, "grad_norm_var": 1.2927815383161848e-06, "learning_rate": 0.004485487567858643, "loss": 2.5389, "step": 14955 }, { "crossentropy": 2.560459613800049, "epoch": 0.5421983758700696, "grad_norm": 0.026854414492845535, "grad_norm_var": 1.2894750992637274e-06, "learning_rate": 0.004484909568347731, "loss": 2.5903, "step": 14956 }, { "crossentropy": 2.631505250930786, "epoch": 0.5422346287703016, "grad_norm": 0.027129288762807846, "grad_norm_var": 1.211866929394454e-06, "learning_rate": 0.004484331575793898, "loss": 2.5831, "step": 14957 }, { "crossentropy": 2.5522565841674805, "epoch": 0.5422708816705336, "grad_norm": 0.027058565989136696, "grad_norm_var": 1.1841684098688137e-06, "learning_rate": 0.00448375359020495, "loss": 2.5872, "step": 14958 }, { "crossentropy": 2.5296363830566406, "epoch": 0.5423071345707656, "grad_norm": 0.027363887056708336, "grad_norm_var": 1.1025231844181595e-06, "learning_rate": 0.004483175611588693, "loss": 2.6189, "step": 14959 }, { "crossentropy": 2.5662214756011963, "epoch": 0.5423433874709976, "grad_norm": 0.027617312967777252, "grad_norm_var": 1.047704096277764e-06, "learning_rate": 0.004482597639952933, "loss": 2.6256, "step": 14960 }, { "crossentropy": 2.5475125312805176, "epoch": 0.5423796403712297, "grad_norm": 0.027866311371326447, "grad_norm_var": 1.0665729782273366e-06, "learning_rate": 0.004482019675305477, "loss": 2.6537, "step": 14961 }, { "crossentropy": 2.5242722034454346, "epoch": 0.5424158932714617, "grad_norm": 0.02688351646065712, "grad_norm_var": 1.0211240679405357e-06, "learning_rate": 0.0044814417176541315, "loss": 2.4925, "step": 14962 }, { "crossentropy": 2.3398404121398926, "epoch": 0.5424521461716937, "grad_norm": 0.028957463800907135, "grad_norm_var": 1.1468336630025225e-06, "learning_rate": 0.004480863767006703, "loss": 2.3904, "step": 14963 }, { "crossentropy": 2.526319742202759, "epoch": 0.5424883990719258, "grad_norm": 0.028033733367919922, "grad_norm_var": 1.0792196350593257e-06, "learning_rate": 0.004480285823370999, "loss": 2.5832, "step": 14964 }, { "crossentropy": 2.526787042617798, "epoch": 0.5425246519721578, "grad_norm": 0.027634259313344955, "grad_norm_var": 8.055108445169235e-07, "learning_rate": 0.004479707886754821, "loss": 2.4872, "step": 14965 }, { "crossentropy": 2.692314386367798, "epoch": 0.5425609048723898, "grad_norm": 0.025531725957989693, "grad_norm_var": 1.0366671443450543e-06, "learning_rate": 0.0044791299571659785, "loss": 2.5833, "step": 14966 }, { "crossentropy": 2.703911781311035, "epoch": 0.5425971577726219, "grad_norm": 0.027199115604162216, "grad_norm_var": 1.0385494182879133e-06, "learning_rate": 0.004478552034612277, "loss": 2.7042, "step": 14967 }, { "crossentropy": 2.328941583633423, "epoch": 0.5426334106728539, "grad_norm": 0.02606455236673355, "grad_norm_var": 1.1439072790321464e-06, "learning_rate": 0.0044779741191015195, "loss": 2.3861, "step": 14968 }, { "crossentropy": 2.3673911094665527, "epoch": 0.5426696635730859, "grad_norm": 0.02652803622186184, "grad_norm_var": 9.506823318174614e-07, "learning_rate": 0.0044773962106415145, "loss": 2.4816, "step": 14969 }, { "crossentropy": 2.475412607192993, "epoch": 0.5427059164733179, "grad_norm": 0.02605533041059971, "grad_norm_var": 7.112577921582063e-07, "learning_rate": 0.004476818309240068, "loss": 2.5249, "step": 14970 }, { "crossentropy": 2.520686149597168, "epoch": 0.5427421693735499, "grad_norm": 0.026338111609220505, "grad_norm_var": 7.444962348710961e-07, "learning_rate": 0.004476240414904983, "loss": 2.581, "step": 14971 }, { "crossentropy": 2.5131852626800537, "epoch": 0.5427784222737819, "grad_norm": 0.0260174497961998, "grad_norm_var": 8.123059225694732e-07, "learning_rate": 0.004475662527644067, "loss": 2.511, "step": 14972 }, { "crossentropy": 2.4665606021881104, "epoch": 0.5428146751740139, "grad_norm": 0.025658773258328438, "grad_norm_var": 9.255222241358677e-07, "learning_rate": 0.004475084647465125, "loss": 2.5128, "step": 14973 }, { "crossentropy": 2.5418341159820557, "epoch": 0.542850928074246, "grad_norm": 0.027319561690092087, "grad_norm_var": 9.344099499561474e-07, "learning_rate": 0.00447450677437596, "loss": 2.5004, "step": 14974 }, { "crossentropy": 2.593167781829834, "epoch": 0.542887180974478, "grad_norm": 0.027745511382818222, "grad_norm_var": 9.649883435690228e-07, "learning_rate": 0.004473928908384378, "loss": 2.6055, "step": 14975 }, { "crossentropy": 2.4892749786376953, "epoch": 0.54292343387471, "grad_norm": 0.027677051723003387, "grad_norm_var": 9.704018111076142e-07, "learning_rate": 0.004473351049498185, "loss": 2.524, "step": 14976 }, { "crossentropy": 2.507582902908325, "epoch": 0.542959686774942, "grad_norm": 0.029103226959705353, "grad_norm_var": 1.2139437340028329e-06, "learning_rate": 0.004472773197725185, "loss": 2.5469, "step": 14977 }, { "crossentropy": 2.5394253730773926, "epoch": 0.542995939675174, "grad_norm": 0.02583715133368969, "grad_norm_var": 1.3051422590040398e-06, "learning_rate": 0.004472195353073184, "loss": 2.6183, "step": 14978 }, { "crossentropy": 2.598473310470581, "epoch": 0.543032192575406, "grad_norm": 0.02594795823097229, "grad_norm_var": 1.0782487562773264e-06, "learning_rate": 0.004471617515549985, "loss": 2.5685, "step": 14979 }, { "crossentropy": 2.6099298000335693, "epoch": 0.543068445475638, "grad_norm": 0.02579372003674507, "grad_norm_var": 1.0213508370324517e-06, "learning_rate": 0.004471039685163394, "loss": 2.5329, "step": 14980 }, { "crossentropy": 2.5987110137939453, "epoch": 0.54310469837587, "grad_norm": 0.026781657710671425, "grad_norm_var": 9.552592844958945e-07, "learning_rate": 0.004470461861921215, "loss": 2.6878, "step": 14981 }, { "crossentropy": 2.6752219200134277, "epoch": 0.5431409512761021, "grad_norm": 0.027792679145932198, "grad_norm_var": 9.527313493754019e-07, "learning_rate": 0.0044698840458312515, "loss": 2.6654, "step": 14982 }, { "crossentropy": 2.665499448776245, "epoch": 0.5431772041763341, "grad_norm": 0.027904370799660683, "grad_norm_var": 1.026873512495691e-06, "learning_rate": 0.004469306236901309, "loss": 2.5735, "step": 14983 }, { "crossentropy": 2.7357177734375, "epoch": 0.5432134570765661, "grad_norm": 0.027048762887716293, "grad_norm_var": 9.928302783156525e-07, "learning_rate": 0.0044687284351391885, "loss": 2.6726, "step": 14984 }, { "crossentropy": 2.5015785694122314, "epoch": 0.5432497099767981, "grad_norm": 0.027789194136857986, "grad_norm_var": 1.0386303915750165e-06, "learning_rate": 0.004468150640552699, "loss": 2.4984, "step": 14985 }, { "crossentropy": 2.418987512588501, "epoch": 0.5432859628770301, "grad_norm": 0.02708526700735092, "grad_norm_var": 9.854109922044814e-07, "learning_rate": 0.0044675728531496425, "loss": 2.4878, "step": 14986 }, { "crossentropy": 2.521868944168091, "epoch": 0.5433222157772621, "grad_norm": 0.02850889228284359, "grad_norm_var": 1.0912400430239388e-06, "learning_rate": 0.004466995072937822, "loss": 2.5353, "step": 14987 }, { "crossentropy": 2.551213264465332, "epoch": 0.5433584686774942, "grad_norm": 0.029528973624110222, "grad_norm_var": 1.3430279207059566e-06, "learning_rate": 0.004466417299925042, "loss": 2.4744, "step": 14988 }, { "crossentropy": 2.701932430267334, "epoch": 0.5433947215777262, "grad_norm": 0.03051174432039261, "grad_norm_var": 1.7237800146123356e-06, "learning_rate": 0.0044658395341191075, "loss": 2.6636, "step": 14989 }, { "crossentropy": 2.5084478855133057, "epoch": 0.5434309744779582, "grad_norm": 0.02738330326974392, "grad_norm_var": 1.7212384920845299e-06, "learning_rate": 0.004465261775527819, "loss": 2.5419, "step": 14990 }, { "crossentropy": 2.440598964691162, "epoch": 0.5434672273781903, "grad_norm": 0.026267915964126587, "grad_norm_var": 1.8393629907337602e-06, "learning_rate": 0.004464684024158981, "loss": 2.4694, "step": 14991 }, { "crossentropy": 2.439291000366211, "epoch": 0.5435034802784223, "grad_norm": 0.02740803360939026, "grad_norm_var": 1.8396918154245003e-06, "learning_rate": 0.004464106280020398, "loss": 2.4292, "step": 14992 }, { "crossentropy": 2.562699317932129, "epoch": 0.5435397331786543, "grad_norm": 0.028723236173391342, "grad_norm_var": 1.769682157354045e-06, "learning_rate": 0.004463528543119874, "loss": 2.5829, "step": 14993 }, { "crossentropy": 2.5615127086639404, "epoch": 0.5435759860788864, "grad_norm": 0.027798032388091087, "grad_norm_var": 1.5701325856095344e-06, "learning_rate": 0.004462950813465211, "loss": 2.5486, "step": 14994 }, { "crossentropy": 2.56583833694458, "epoch": 0.5436122389791184, "grad_norm": 0.029454747214913368, "grad_norm_var": 1.5465934833526947e-06, "learning_rate": 0.004462373091064212, "loss": 2.564, "step": 14995 }, { "crossentropy": 2.56115460395813, "epoch": 0.5436484918793504, "grad_norm": 0.030518800020217896, "grad_norm_var": 1.6394054689199943e-06, "learning_rate": 0.00446179537592468, "loss": 2.6663, "step": 14996 }, { "crossentropy": 2.626058340072632, "epoch": 0.5436847447795824, "grad_norm": 0.02771489880979061, "grad_norm_var": 1.5227520507058353e-06, "learning_rate": 0.004461217668054419, "loss": 2.641, "step": 14997 }, { "crossentropy": 2.5425143241882324, "epoch": 0.5437209976798144, "grad_norm": 0.027658216655254364, "grad_norm_var": 1.531452282568776e-06, "learning_rate": 0.004460639967461231, "loss": 2.5557, "step": 14998 }, { "crossentropy": 2.425628662109375, "epoch": 0.5437572505800464, "grad_norm": 0.028589192777872086, "grad_norm_var": 1.533174087480632e-06, "learning_rate": 0.004460062274152918, "loss": 2.4264, "step": 14999 }, { "crossentropy": 2.5892980098724365, "epoch": 0.5437935034802784, "grad_norm": 0.02641787938773632, "grad_norm_var": 1.6590386641131358e-06, "learning_rate": 0.0044594845881372825, "loss": 2.6051, "step": 15000 }, { "crossentropy": 2.5816092491149902, "epoch": 0.5438297563805105, "grad_norm": 0.028371620923280716, "grad_norm_var": 1.6475696102162627e-06, "learning_rate": 0.004458906909422129, "loss": 2.5856, "step": 15001 }, { "crossentropy": 2.56308650970459, "epoch": 0.5438660092807425, "grad_norm": 0.02667483501136303, "grad_norm_var": 1.721634534285199e-06, "learning_rate": 0.004458329238015259, "loss": 2.5361, "step": 15002 }, { "crossentropy": 2.640549898147583, "epoch": 0.5439022621809745, "grad_norm": 0.02851059101521969, "grad_norm_var": 1.7217000019387197e-06, "learning_rate": 0.004457751573924474, "loss": 2.5659, "step": 15003 }, { "crossentropy": 2.4976582527160645, "epoch": 0.5439385150812065, "grad_norm": 0.029405813664197922, "grad_norm_var": 1.7011652779302337e-06, "learning_rate": 0.004457173917157578, "loss": 2.5433, "step": 15004 }, { "crossentropy": 2.3938472270965576, "epoch": 0.5439747679814385, "grad_norm": 0.025548746809363365, "grad_norm_var": 1.719504935573414e-06, "learning_rate": 0.0044565962677223705, "loss": 2.4947, "step": 15005 }, { "crossentropy": 2.4973652362823486, "epoch": 0.5440110208816705, "grad_norm": 0.027613822370767593, "grad_norm_var": 1.7068568939888982e-06, "learning_rate": 0.004456018625626658, "loss": 2.4377, "step": 15006 }, { "crossentropy": 2.6068241596221924, "epoch": 0.5440472737819025, "grad_norm": 0.027157368138432503, "grad_norm_var": 1.5606988805737127e-06, "learning_rate": 0.004455440990878238, "loss": 2.4945, "step": 15007 }, { "crossentropy": 2.468146562576294, "epoch": 0.5440835266821346, "grad_norm": 0.026673030108213425, "grad_norm_var": 1.6498169738881722e-06, "learning_rate": 0.004454863363484911, "loss": 2.5838, "step": 15008 }, { "crossentropy": 2.4849588871002197, "epoch": 0.5441197795823666, "grad_norm": 0.02639295905828476, "grad_norm_var": 1.7417877890265586e-06, "learning_rate": 0.004454285743454485, "loss": 2.4739, "step": 15009 }, { "crossentropy": 2.5806972980499268, "epoch": 0.5441560324825986, "grad_norm": 0.026649650186300278, "grad_norm_var": 1.8216472697510915e-06, "learning_rate": 0.004453708130794757, "loss": 2.5991, "step": 15010 }, { "crossentropy": 2.7020747661590576, "epoch": 0.5441922853828306, "grad_norm": 0.028786825016140938, "grad_norm_var": 1.6941054842988286e-06, "learning_rate": 0.004453130525513529, "loss": 2.6336, "step": 15011 }, { "crossentropy": 2.573289155960083, "epoch": 0.5442285382830626, "grad_norm": 0.026516292244195938, "grad_norm_var": 1.1738547855733578e-06, "learning_rate": 0.004452552927618604, "loss": 2.5981, "step": 15012 }, { "crossentropy": 2.466463804244995, "epoch": 0.5442647911832946, "grad_norm": 0.026901274919509888, "grad_norm_var": 1.1829778173419458e-06, "learning_rate": 0.004451975337117781, "loss": 2.4226, "step": 15013 }, { "crossentropy": 2.5397536754608154, "epoch": 0.5443010440835266, "grad_norm": 0.026801543310284615, "grad_norm_var": 1.1955545094284215e-06, "learning_rate": 0.004451397754018867, "loss": 2.48, "step": 15014 }, { "crossentropy": 2.447298049926758, "epoch": 0.5443372969837587, "grad_norm": 0.026623202487826347, "grad_norm_var": 1.1026497979506218e-06, "learning_rate": 0.004450820178329655, "loss": 2.55, "step": 15015 }, { "crossentropy": 2.60146164894104, "epoch": 0.5443735498839907, "grad_norm": 0.028335928916931152, "grad_norm_var": 1.1350326600588114e-06, "learning_rate": 0.00445024261005795, "loss": 2.5021, "step": 15016 }, { "crossentropy": 2.7490789890289307, "epoch": 0.5444098027842227, "grad_norm": 0.027377240359783173, "grad_norm_var": 1.0561072132767574e-06, "learning_rate": 0.004449665049211554, "loss": 2.6536, "step": 15017 }, { "crossentropy": 2.527164936065674, "epoch": 0.5444460556844548, "grad_norm": 0.026945995166897774, "grad_norm_var": 1.0399775626522411e-06, "learning_rate": 0.004449087495798266, "loss": 2.6226, "step": 15018 }, { "crossentropy": 2.6137919425964355, "epoch": 0.5444823085846868, "grad_norm": 0.027669040486216545, "grad_norm_var": 9.444788048254636e-07, "learning_rate": 0.004448509949825889, "loss": 2.5541, "step": 15019 }, { "crossentropy": 2.590111255645752, "epoch": 0.5445185614849188, "grad_norm": 0.026155725121498108, "grad_norm_var": 6.541749021189789e-07, "learning_rate": 0.0044479324113022205, "loss": 2.605, "step": 15020 }, { "crossentropy": 2.539163589477539, "epoch": 0.5445548143851509, "grad_norm": 0.025683755055069923, "grad_norm_var": 6.290227152881082e-07, "learning_rate": 0.004447354880235062, "loss": 2.5245, "step": 15021 }, { "crossentropy": 2.4110193252563477, "epoch": 0.5445910672853829, "grad_norm": 0.025920545682311058, "grad_norm_var": 6.736415462782475e-07, "learning_rate": 0.004446777356632215, "loss": 2.3566, "step": 15022 }, { "crossentropy": 2.6216609477996826, "epoch": 0.5446273201856149, "grad_norm": 0.028050700202584267, "grad_norm_var": 7.527573002492904e-07, "learning_rate": 0.004446199840501482, "loss": 2.6189, "step": 15023 }, { "crossentropy": 2.6832752227783203, "epoch": 0.5446635730858469, "grad_norm": 0.02730748802423477, "grad_norm_var": 7.529857462305773e-07, "learning_rate": 0.004445622331850657, "loss": 2.5945, "step": 15024 }, { "crossentropy": 2.45831298828125, "epoch": 0.5446998259860789, "grad_norm": 0.026929380372166634, "grad_norm_var": 7.270244680110712e-07, "learning_rate": 0.004445044830687545, "loss": 2.5396, "step": 15025 }, { "crossentropy": 2.524199962615967, "epoch": 0.5447360788863109, "grad_norm": 0.027991535142064095, "grad_norm_var": 7.695616910315927e-07, "learning_rate": 0.0044444673370199455, "loss": 2.5079, "step": 15026 }, { "crossentropy": 2.610645055770874, "epoch": 0.5447723317865429, "grad_norm": 0.029254190623760223, "grad_norm_var": 8.867846572588872e-07, "learning_rate": 0.004443889850855656, "loss": 2.6026, "step": 15027 }, { "crossentropy": 2.6674818992614746, "epoch": 0.544808584686775, "grad_norm": 0.025844357907772064, "grad_norm_var": 9.721352670195714e-07, "learning_rate": 0.004443312372202479, "loss": 2.6262, "step": 15028 }, { "crossentropy": 2.5421504974365234, "epoch": 0.544844837587007, "grad_norm": 0.026344990357756615, "grad_norm_var": 1.0071053509432802e-06, "learning_rate": 0.0044427349010682125, "loss": 2.557, "step": 15029 }, { "crossentropy": 2.6132454872131348, "epoch": 0.544881090487239, "grad_norm": 0.027717048302292824, "grad_norm_var": 1.0258378128124555e-06, "learning_rate": 0.0044421574374606575, "loss": 2.6147, "step": 15030 }, { "crossentropy": 2.4781880378723145, "epoch": 0.544917343387471, "grad_norm": 0.02601027302443981, "grad_norm_var": 1.091098738741023e-06, "learning_rate": 0.004441579981387615, "loss": 2.5054, "step": 15031 }, { "crossentropy": 2.5682764053344727, "epoch": 0.544953596287703, "grad_norm": 0.0286271832883358, "grad_norm_var": 1.1445465267723628e-06, "learning_rate": 0.004441002532856879, "loss": 2.624, "step": 15032 }, { "crossentropy": 2.762009620666504, "epoch": 0.544989849187935, "grad_norm": 0.02598297782242298, "grad_norm_var": 1.217171017785914e-06, "learning_rate": 0.004440425091876253, "loss": 2.6625, "step": 15033 }, { "crossentropy": 2.6898980140686035, "epoch": 0.545026102088167, "grad_norm": 0.02825918234884739, "grad_norm_var": 1.310731664447617e-06, "learning_rate": 0.004439847658453536, "loss": 2.6059, "step": 15034 }, { "crossentropy": 2.418531894683838, "epoch": 0.5450623549883991, "grad_norm": 0.027522338554263115, "grad_norm_var": 1.3011275663208823e-06, "learning_rate": 0.004439270232596526, "loss": 2.506, "step": 15035 }, { "crossentropy": 2.508389472961426, "epoch": 0.5450986078886311, "grad_norm": 0.02712300978600979, "grad_norm_var": 1.237807218705996e-06, "learning_rate": 0.0044386928143130226, "loss": 2.5249, "step": 15036 }, { "crossentropy": 2.4510931968688965, "epoch": 0.5451348607888631, "grad_norm": 0.02872515097260475, "grad_norm_var": 1.217064740606149e-06, "learning_rate": 0.004438115403610823, "loss": 2.5547, "step": 15037 }, { "crossentropy": 2.6883492469787598, "epoch": 0.5451711136890951, "grad_norm": 0.02663860283792019, "grad_norm_var": 1.1123708534733669e-06, "learning_rate": 0.004437538000497729, "loss": 2.5084, "step": 15038 }, { "crossentropy": 2.648857355117798, "epoch": 0.5452073665893271, "grad_norm": 0.02687094174325466, "grad_norm_var": 1.0963005255861981e-06, "learning_rate": 0.00443696060498154, "loss": 2.6443, "step": 15039 }, { "crossentropy": 2.5953922271728516, "epoch": 0.5452436194895591, "grad_norm": 0.027177365496754646, "grad_norm_var": 1.0976069142058324e-06, "learning_rate": 0.004436383217070049, "loss": 2.5796, "step": 15040 }, { "crossentropy": 2.41841721534729, "epoch": 0.5452798723897911, "grad_norm": 0.02748919650912285, "grad_norm_var": 1.0885107342457524e-06, "learning_rate": 0.004435805836771059, "loss": 2.5043, "step": 15041 }, { "crossentropy": 2.581205368041992, "epoch": 0.5453161252900232, "grad_norm": 0.02655770815908909, "grad_norm_var": 1.094096503784211e-06, "learning_rate": 0.004435228464092368, "loss": 2.5077, "step": 15042 }, { "crossentropy": 2.6925787925720215, "epoch": 0.5453523781902552, "grad_norm": 0.02603176049888134, "grad_norm_var": 8.858655850557211e-07, "learning_rate": 0.0044346510990417734, "loss": 2.6459, "step": 15043 }, { "crossentropy": 2.5934534072875977, "epoch": 0.5453886310904872, "grad_norm": 0.025510720908641815, "grad_norm_var": 9.467950371597663e-07, "learning_rate": 0.004434073741627073, "loss": 2.6088, "step": 15044 }, { "crossentropy": 2.636993408203125, "epoch": 0.5454248839907193, "grad_norm": 0.026297759264707565, "grad_norm_var": 9.512909798968372e-07, "learning_rate": 0.004433496391856065, "loss": 2.5372, "step": 15045 }, { "crossentropy": 2.5898995399475098, "epoch": 0.5454611368909513, "grad_norm": 0.027016526088118553, "grad_norm_var": 9.181467245525e-07, "learning_rate": 0.0044329190497365495, "loss": 2.6224, "step": 15046 }, { "crossentropy": 2.471330404281616, "epoch": 0.5454973897911833, "grad_norm": 0.028404856100678444, "grad_norm_var": 9.637045487727862e-07, "learning_rate": 0.004432341715276323, "loss": 2.5414, "step": 15047 }, { "crossentropy": 2.72375226020813, "epoch": 0.5455336426914154, "grad_norm": 0.02682643197476864, "grad_norm_var": 8.092298377522973e-07, "learning_rate": 0.004431764388483183, "loss": 2.6519, "step": 15048 }, { "crossentropy": 2.5858988761901855, "epoch": 0.5455698955916474, "grad_norm": 0.02642633393406868, "grad_norm_var": 7.597893021367748e-07, "learning_rate": 0.004431187069364927, "loss": 2.5925, "step": 15049 }, { "crossentropy": 2.602604866027832, "epoch": 0.5456061484918794, "grad_norm": 0.027011841535568237, "grad_norm_var": 6.567384082521721e-07, "learning_rate": 0.004430609757929352, "loss": 2.5356, "step": 15050 }, { "crossentropy": 2.587353229522705, "epoch": 0.5456424013921114, "grad_norm": 0.02678806520998478, "grad_norm_var": 6.370364934393223e-07, "learning_rate": 0.004430032454184257, "loss": 2.5345, "step": 15051 }, { "crossentropy": 2.528716564178467, "epoch": 0.5456786542923434, "grad_norm": 0.026745006442070007, "grad_norm_var": 6.362903763381615e-07, "learning_rate": 0.004429455158137439, "loss": 2.5274, "step": 15052 }, { "crossentropy": 2.6404123306274414, "epoch": 0.5457149071925754, "grad_norm": 0.029032249003648758, "grad_norm_var": 7.166154059652044e-07, "learning_rate": 0.0044288778697966925, "loss": 2.6009, "step": 15053 }, { "crossentropy": 2.504164934158325, "epoch": 0.5457511600928074, "grad_norm": 0.027506805956363678, "grad_norm_var": 7.303894707794134e-07, "learning_rate": 0.004428300589169819, "loss": 2.5448, "step": 15054 }, { "crossentropy": 2.601672649383545, "epoch": 0.5457874129930395, "grad_norm": 0.02678997814655304, "grad_norm_var": 7.319856194487692e-07, "learning_rate": 0.004427723316264613, "loss": 2.6702, "step": 15055 }, { "crossentropy": 2.4805896282196045, "epoch": 0.5458236658932715, "grad_norm": 0.026769885793328285, "grad_norm_var": 7.314112600208112e-07, "learning_rate": 0.004427146051088874, "loss": 2.503, "step": 15056 }, { "crossentropy": 2.5855767726898193, "epoch": 0.5458599187935035, "grad_norm": 0.039591234177351, "grad_norm_var": 1.075465177705416e-05, "learning_rate": 0.004426568793650396, "loss": 2.5819, "step": 15057 }, { "crossentropy": 2.6624083518981934, "epoch": 0.5458961716937355, "grad_norm": 0.027271421626210213, "grad_norm_var": 1.0677148551305525e-05, "learning_rate": 0.0044259915439569765, "loss": 2.561, "step": 15058 }, { "crossentropy": 2.622789144515991, "epoch": 0.5459324245939675, "grad_norm": 0.026242058724164963, "grad_norm_var": 1.0631697020211985e-05, "learning_rate": 0.004425414302016413, "loss": 2.5275, "step": 15059 }, { "crossentropy": 2.4926862716674805, "epoch": 0.5459686774941995, "grad_norm": 0.029104726389050484, "grad_norm_var": 1.0359013857019746e-05, "learning_rate": 0.0044248370678364995, "loss": 2.5439, "step": 15060 }, { "crossentropy": 2.5532007217407227, "epoch": 0.5460049303944315, "grad_norm": 0.027508407831192017, "grad_norm_var": 1.0177606564345713e-05, "learning_rate": 0.004424259841425034, "loss": 2.5563, "step": 15061 }, { "crossentropy": 2.571599006652832, "epoch": 0.5460411832946636, "grad_norm": 0.027652492746710777, "grad_norm_var": 1.0114001092554114e-05, "learning_rate": 0.0044236826227898145, "loss": 2.5846, "step": 15062 }, { "crossentropy": 2.4833872318267822, "epoch": 0.5460774361948956, "grad_norm": 0.028369881212711334, "grad_norm_var": 1.0112676829522547e-05, "learning_rate": 0.004423105411938635, "loss": 2.5589, "step": 15063 }, { "crossentropy": 2.6578986644744873, "epoch": 0.5461136890951276, "grad_norm": 0.02786802500486374, "grad_norm_var": 1.0003292536683038e-05, "learning_rate": 0.0044225282088792926, "loss": 2.6274, "step": 15064 }, { "crossentropy": 2.5170650482177734, "epoch": 0.5461499419953596, "grad_norm": 0.02657688967883587, "grad_norm_var": 9.969758875870427e-06, "learning_rate": 0.004421951013619584, "loss": 2.599, "step": 15065 }, { "crossentropy": 2.600036382675171, "epoch": 0.5461861948955916, "grad_norm": 0.026794973760843277, "grad_norm_var": 1.0006384252450075e-05, "learning_rate": 0.004421373826167304, "loss": 2.606, "step": 15066 }, { "crossentropy": 2.462026357650757, "epoch": 0.5462224477958236, "grad_norm": 0.027615001425147057, "grad_norm_var": 9.897497173204373e-06, "learning_rate": 0.004420796646530249, "loss": 2.4936, "step": 15067 }, { "crossentropy": 2.40484619140625, "epoch": 0.5462587006960556, "grad_norm": 0.028564395383000374, "grad_norm_var": 9.747799097593633e-06, "learning_rate": 0.004420219474716213, "loss": 2.4971, "step": 15068 }, { "crossentropy": 2.4516987800598145, "epoch": 0.5462949535962877, "grad_norm": 0.027029898017644882, "grad_norm_var": 9.810540683238703e-06, "learning_rate": 0.004419642310732992, "loss": 2.6248, "step": 15069 }, { "crossentropy": 2.5590193271636963, "epoch": 0.5463312064965197, "grad_norm": 0.026280304417014122, "grad_norm_var": 1.001849342762498e-05, "learning_rate": 0.004419065154588383, "loss": 2.6277, "step": 15070 }, { "crossentropy": 2.5824532508850098, "epoch": 0.5463674593967517, "grad_norm": 0.026558635756373405, "grad_norm_var": 1.0063075018481314e-05, "learning_rate": 0.004418488006290181, "loss": 2.573, "step": 15071 }, { "crossentropy": 2.7291266918182373, "epoch": 0.5464037122969838, "grad_norm": 0.02715138904750347, "grad_norm_var": 1.0003882293422083e-05, "learning_rate": 0.004417910865846182, "loss": 2.6125, "step": 15072 }, { "crossentropy": 2.5048880577087402, "epoch": 0.5464399651972158, "grad_norm": 0.0265102069824934, "grad_norm_var": 7.19371685263479e-07, "learning_rate": 0.00441733373326418, "loss": 2.5104, "step": 15073 }, { "crossentropy": 2.5328071117401123, "epoch": 0.5464762180974478, "grad_norm": 0.026695802807807922, "grad_norm_var": 7.437064654714492e-07, "learning_rate": 0.004416756608551969, "loss": 2.4733, "step": 15074 }, { "crossentropy": 2.3905608654022217, "epoch": 0.5465124709976799, "grad_norm": 0.025340769439935684, "grad_norm_var": 9.19531625622596e-07, "learning_rate": 0.004416179491717346, "loss": 2.4297, "step": 15075 }, { "crossentropy": 2.5605082511901855, "epoch": 0.5465487238979119, "grad_norm": 0.02916642650961876, "grad_norm_var": 9.352222610183846e-07, "learning_rate": 0.004415602382768103, "loss": 2.5207, "step": 15076 }, { "crossentropy": 2.5783770084381104, "epoch": 0.5465849767981439, "grad_norm": 0.02638545073568821, "grad_norm_var": 9.723842141023115e-07, "learning_rate": 0.004415025281712038, "loss": 2.5667, "step": 15077 }, { "crossentropy": 2.626460075378418, "epoch": 0.5466212296983759, "grad_norm": 0.026678072288632393, "grad_norm_var": 9.677460735289655e-07, "learning_rate": 0.004414448188556944, "loss": 2.5628, "step": 15078 }, { "crossentropy": 2.7030632495880127, "epoch": 0.5466574825986079, "grad_norm": 0.027263902127742767, "grad_norm_var": 8.568059216659674e-07, "learning_rate": 0.004413871103310617, "loss": 2.621, "step": 15079 }, { "crossentropy": 2.736661911010742, "epoch": 0.5466937354988399, "grad_norm": 0.027796750888228416, "grad_norm_var": 8.491595745512467e-07, "learning_rate": 0.004413294025980849, "loss": 2.627, "step": 15080 }, { "crossentropy": 2.5632288455963135, "epoch": 0.5467299883990719, "grad_norm": 0.026043297722935677, "grad_norm_var": 8.988751111215753e-07, "learning_rate": 0.0044127169565754375, "loss": 2.5252, "step": 15081 }, { "crossentropy": 2.5782527923583984, "epoch": 0.546766241299304, "grad_norm": 0.027116449549794197, "grad_norm_var": 8.968802769113015e-07, "learning_rate": 0.004412139895102173, "loss": 2.563, "step": 15082 }, { "crossentropy": 2.447179079055786, "epoch": 0.546802494199536, "grad_norm": 0.026567280292510986, "grad_norm_var": 8.812922713526938e-07, "learning_rate": 0.004411562841568851, "loss": 2.5029, "step": 15083 }, { "crossentropy": 2.450801372528076, "epoch": 0.546838747099768, "grad_norm": 0.02767818607389927, "grad_norm_var": 7.392423399309988e-07, "learning_rate": 0.004410985795983266, "loss": 2.5569, "step": 15084 }, { "crossentropy": 2.5331597328186035, "epoch": 0.546875, "grad_norm": 0.0328839085996151, "grad_norm_var": 2.9891642293832763e-06, "learning_rate": 0.004410408758353212, "loss": 2.5575, "step": 15085 }, { "crossentropy": 2.463494300842285, "epoch": 0.546911252900232, "grad_norm": 0.027684327214956284, "grad_norm_var": 2.9294722986092456e-06, "learning_rate": 0.004409831728686482, "loss": 2.4986, "step": 15086 }, { "crossentropy": 2.5006392002105713, "epoch": 0.546947505800464, "grad_norm": 0.028849394991993904, "grad_norm_var": 3.0172467538768737e-06, "learning_rate": 0.004409254706990871, "loss": 2.5134, "step": 15087 }, { "crossentropy": 2.431678533554077, "epoch": 0.546983758700696, "grad_norm": 0.027913901954889297, "grad_norm_var": 3.0193402095030424e-06, "learning_rate": 0.004408677693274171, "loss": 2.4379, "step": 15088 }, { "crossentropy": 2.5534353256225586, "epoch": 0.5470200116009281, "grad_norm": 0.02852388471364975, "grad_norm_var": 2.9973872043352006e-06, "learning_rate": 0.004408100687544176, "loss": 2.5876, "step": 15089 }, { "crossentropy": 2.5668578147888184, "epoch": 0.5470562645011601, "grad_norm": 0.02697744034230709, "grad_norm_var": 2.96607220973636e-06, "learning_rate": 0.0044075236898086815, "loss": 2.5253, "step": 15090 }, { "crossentropy": 2.5403409004211426, "epoch": 0.5470925174013921, "grad_norm": 0.02713652513921261, "grad_norm_var": 2.6076847948605738e-06, "learning_rate": 0.004406946700075478, "loss": 2.5078, "step": 15091 }, { "crossentropy": 2.5553689002990723, "epoch": 0.5471287703016241, "grad_norm": 0.02739124931395054, "grad_norm_var": 2.4792241556663316e-06, "learning_rate": 0.004406369718352356, "loss": 2.5949, "step": 15092 }, { "crossentropy": 2.590670108795166, "epoch": 0.5471650232018561, "grad_norm": 0.027031533420085907, "grad_norm_var": 2.393741009286164e-06, "learning_rate": 0.004405792744647114, "loss": 2.6295, "step": 15093 }, { "crossentropy": 2.5207762718200684, "epoch": 0.5472012761020881, "grad_norm": 0.026824770495295525, "grad_norm_var": 2.3746864874537997e-06, "learning_rate": 0.004405215778967544, "loss": 2.4985, "step": 15094 }, { "crossentropy": 2.4038422107696533, "epoch": 0.5472375290023201, "grad_norm": 0.02684151381254196, "grad_norm_var": 2.4120970022300924e-06, "learning_rate": 0.004404638821321436, "loss": 2.507, "step": 15095 }, { "crossentropy": 2.507460832595825, "epoch": 0.5472737819025522, "grad_norm": 0.0295004490762949, "grad_norm_var": 2.614628896058452e-06, "learning_rate": 0.004404061871716585, "loss": 2.5101, "step": 15096 }, { "crossentropy": 2.4205105304718018, "epoch": 0.5473100348027842, "grad_norm": 0.028714921325445175, "grad_norm_var": 2.431307157686226e-06, "learning_rate": 0.004403484930160781, "loss": 2.5245, "step": 15097 }, { "crossentropy": 2.626134157180786, "epoch": 0.5473462877030162, "grad_norm": 0.028710531070828438, "grad_norm_var": 2.40717100848057e-06, "learning_rate": 0.004402907996661822, "loss": 2.6688, "step": 15098 }, { "crossentropy": 2.459880828857422, "epoch": 0.5473825406032483, "grad_norm": 0.029784277081489563, "grad_norm_var": 2.4064778196645e-06, "learning_rate": 0.004402331071227494, "loss": 2.4858, "step": 15099 }, { "crossentropy": 2.648216724395752, "epoch": 0.5474187935034803, "grad_norm": 0.02806215174496174, "grad_norm_var": 2.3849882372523405e-06, "learning_rate": 0.004401754153865591, "loss": 2.6172, "step": 15100 }, { "crossentropy": 2.596203565597534, "epoch": 0.5474550464037123, "grad_norm": 0.02733476459980011, "grad_norm_var": 9.194048652801101e-07, "learning_rate": 0.004401177244583908, "loss": 2.5628, "step": 15101 }, { "crossentropy": 2.5354080200195312, "epoch": 0.5474912993039444, "grad_norm": 0.02565581724047661, "grad_norm_var": 1.2498191463546083e-06, "learning_rate": 0.004400600343390234, "loss": 2.4745, "step": 15102 }, { "crossentropy": 2.587982654571533, "epoch": 0.5475275522041764, "grad_norm": 0.02923586778342724, "grad_norm_var": 1.3117698958032067e-06, "learning_rate": 0.004400023450292362, "loss": 2.5465, "step": 15103 }, { "crossentropy": 2.393336772918701, "epoch": 0.5475638051044084, "grad_norm": 0.025536833330988884, "grad_norm_var": 1.6454545193190454e-06, "learning_rate": 0.004399446565298084, "loss": 2.4782, "step": 15104 }, { "crossentropy": 2.5554094314575195, "epoch": 0.5476000580046404, "grad_norm": 0.02700112946331501, "grad_norm_var": 1.6238953435038692e-06, "learning_rate": 0.004398869688415191, "loss": 2.558, "step": 15105 }, { "crossentropy": 2.4887712001800537, "epoch": 0.5476363109048724, "grad_norm": 0.03303927183151245, "grad_norm_var": 3.4102668916874266e-06, "learning_rate": 0.004398292819651476, "loss": 2.6011, "step": 15106 }, { "crossentropy": 2.5602500438690186, "epoch": 0.5476725638051044, "grad_norm": 0.02817019820213318, "grad_norm_var": 3.359749079365146e-06, "learning_rate": 0.004397715959014731, "loss": 2.6612, "step": 15107 }, { "crossentropy": 2.4709436893463135, "epoch": 0.5477088167053364, "grad_norm": 0.02802477590739727, "grad_norm_var": 3.3290027597711788e-06, "learning_rate": 0.0043971391065127435, "loss": 2.5736, "step": 15108 }, { "crossentropy": 2.6227731704711914, "epoch": 0.5477450696055685, "grad_norm": 0.026891455054283142, "grad_norm_var": 3.350031860321669e-06, "learning_rate": 0.0043965622621533074, "loss": 2.6185, "step": 15109 }, { "crossentropy": 2.4954724311828613, "epoch": 0.5477813225058005, "grad_norm": 0.02887747250497341, "grad_norm_var": 3.268999138808515e-06, "learning_rate": 0.004395985425944215, "loss": 2.4935, "step": 15110 }, { "crossentropy": 2.6498820781707764, "epoch": 0.5478175754060325, "grad_norm": 0.02648330293595791, "grad_norm_var": 3.3424436838035e-06, "learning_rate": 0.004395408597893255, "loss": 2.6192, "step": 15111 }, { "crossentropy": 2.452244281768799, "epoch": 0.5478538283062645, "grad_norm": 0.027914196252822876, "grad_norm_var": 3.2223237648675827e-06, "learning_rate": 0.0043948317780082196, "loss": 2.4431, "step": 15112 }, { "crossentropy": 2.4155454635620117, "epoch": 0.5478900812064965, "grad_norm": 0.02720755897462368, "grad_norm_var": 3.238696769314305e-06, "learning_rate": 0.004394254966296899, "loss": 2.4629, "step": 15113 }, { "crossentropy": 2.4248785972595215, "epoch": 0.5479263341067285, "grad_norm": 0.026368863880634308, "grad_norm_var": 3.358192256237326e-06, "learning_rate": 0.004393678162767084, "loss": 2.4269, "step": 15114 }, { "crossentropy": 2.7327613830566406, "epoch": 0.5479625870069605, "grad_norm": 0.027373788878321648, "grad_norm_var": 3.0994296622237668e-06, "learning_rate": 0.004393101367426568, "loss": 2.5867, "step": 15115 }, { "crossentropy": 2.491790771484375, "epoch": 0.5479988399071926, "grad_norm": 0.02842009626328945, "grad_norm_var": 3.1247887284925237e-06, "learning_rate": 0.004392524580283137, "loss": 2.5675, "step": 15116 }, { "crossentropy": 2.4889283180236816, "epoch": 0.5480350928074246, "grad_norm": 0.028914684429764748, "grad_norm_var": 3.1994430899966636e-06, "learning_rate": 0.004391947801344584, "loss": 2.5352, "step": 15117 }, { "crossentropy": 2.4908082485198975, "epoch": 0.5480713457076566, "grad_norm": 0.029494639486074448, "grad_norm_var": 3.012906071917417e-06, "learning_rate": 0.004391371030618698, "loss": 2.5367, "step": 15118 }, { "crossentropy": 2.6352434158325195, "epoch": 0.5481075986078886, "grad_norm": 0.026860956102609634, "grad_norm_var": 2.9929585603643275e-06, "learning_rate": 0.00439079426811327, "loss": 2.6042, "step": 15119 }, { "crossentropy": 2.778639793395996, "epoch": 0.5481438515081206, "grad_norm": 0.02726292610168457, "grad_norm_var": 2.632720202523873e-06, "learning_rate": 0.004390217513836091, "loss": 2.6684, "step": 15120 }, { "crossentropy": 2.6398932933807373, "epoch": 0.5481801044083526, "grad_norm": 0.028097840026021004, "grad_norm_var": 2.559040302242919e-06, "learning_rate": 0.0043896407677949485, "loss": 2.5887, "step": 15121 }, { "crossentropy": 2.547682762145996, "epoch": 0.5482163573085846, "grad_norm": 0.02649955451488495, "grad_norm_var": 9.143862718846244e-07, "learning_rate": 0.004389064029997634, "loss": 2.5625, "step": 15122 }, { "crossentropy": 2.6889102458953857, "epoch": 0.5482526102088167, "grad_norm": 0.026410548016428947, "grad_norm_var": 9.926395971958003e-07, "learning_rate": 0.004388487300451939, "loss": 2.6957, "step": 15123 }, { "crossentropy": 2.497363805770874, "epoch": 0.5482888631090487, "grad_norm": 0.026773912832140923, "grad_norm_var": 1.0144016738827896e-06, "learning_rate": 0.004387910579165649, "loss": 2.543, "step": 15124 }, { "crossentropy": 2.6612708568573, "epoch": 0.5483251160092807, "grad_norm": 0.027759632095694542, "grad_norm_var": 9.921388105071349e-07, "learning_rate": 0.004387333866146556, "loss": 2.6296, "step": 15125 }, { "crossentropy": 2.62368106842041, "epoch": 0.5483613689095128, "grad_norm": 0.028593450784683228, "grad_norm_var": 9.467203673221934e-07, "learning_rate": 0.0043867571614024485, "loss": 2.5209, "step": 15126 }, { "crossentropy": 2.664167642593384, "epoch": 0.5483976218097448, "grad_norm": 0.028307756409049034, "grad_norm_var": 9.008094620439975e-07, "learning_rate": 0.004386180464941116, "loss": 2.5487, "step": 15127 }, { "crossentropy": 2.650681257247925, "epoch": 0.5484338747099768, "grad_norm": 0.02775040827691555, "grad_norm_var": 8.965259626398173e-07, "learning_rate": 0.004385603776770349, "loss": 2.6432, "step": 15128 }, { "crossentropy": 2.518216371536255, "epoch": 0.5484701276102089, "grad_norm": 0.027530089020729065, "grad_norm_var": 8.848162455588147e-07, "learning_rate": 0.004385027096897933, "loss": 2.546, "step": 15129 }, { "crossentropy": 2.3430492877960205, "epoch": 0.5485063805104409, "grad_norm": 0.02723458968102932, "grad_norm_var": 7.836390045464747e-07, "learning_rate": 0.00438445042533166, "loss": 2.4972, "step": 15130 }, { "crossentropy": 2.475712537765503, "epoch": 0.5485426334106729, "grad_norm": 0.02773912064731121, "grad_norm_var": 7.758322829518397e-07, "learning_rate": 0.00438387376207932, "loss": 2.4365, "step": 15131 }, { "crossentropy": 2.7104544639587402, "epoch": 0.5485788863109049, "grad_norm": 0.02705247700214386, "grad_norm_var": 7.665530880873132e-07, "learning_rate": 0.004383297107148699, "loss": 2.6684, "step": 15132 }, { "crossentropy": 2.7302088737487793, "epoch": 0.5486151392111369, "grad_norm": 0.027537092566490173, "grad_norm_var": 6.515192716054103e-07, "learning_rate": 0.004382720460547587, "loss": 2.6181, "step": 15133 }, { "crossentropy": 2.4950032234191895, "epoch": 0.5486513921113689, "grad_norm": 0.026371004059910774, "grad_norm_var": 4.541582886510015e-07, "learning_rate": 0.00438214382228377, "loss": 2.4905, "step": 15134 }, { "crossentropy": 2.532930612564087, "epoch": 0.5486876450116009, "grad_norm": 0.026733729988336563, "grad_norm_var": 4.636581103067133e-07, "learning_rate": 0.00438156719236504, "loss": 2.5517, "step": 15135 }, { "crossentropy": 2.5882601737976074, "epoch": 0.548723897911833, "grad_norm": 0.027611423283815384, "grad_norm_var": 4.6704554493846806e-07, "learning_rate": 0.0043809905707991835, "loss": 2.6227, "step": 15136 }, { "crossentropy": 2.4422686100006104, "epoch": 0.548760150812065, "grad_norm": 0.027480002492666245, "grad_norm_var": 4.3137042306402397e-07, "learning_rate": 0.004380413957593987, "loss": 2.4591, "step": 15137 }, { "crossentropy": 2.678615093231201, "epoch": 0.548796403712297, "grad_norm": 0.027231935411691666, "grad_norm_var": 3.831610678137545e-07, "learning_rate": 0.004379837352757242, "loss": 2.6643, "step": 15138 }, { "crossentropy": 2.6223549842834473, "epoch": 0.548832656612529, "grad_norm": 0.0276616420596838, "grad_norm_var": 3.188840368007276e-07, "learning_rate": 0.004379260756296734, "loss": 2.5705, "step": 15139 }, { "crossentropy": 2.5021543502807617, "epoch": 0.548868909512761, "grad_norm": 0.027012262493371964, "grad_norm_var": 3.0061445812390394e-07, "learning_rate": 0.004378684168220253, "loss": 2.5335, "step": 15140 }, { "crossentropy": 2.622659683227539, "epoch": 0.548905162412993, "grad_norm": 0.02657233737409115, "grad_norm_var": 3.437252596169184e-07, "learning_rate": 0.004378107588535584, "loss": 2.5554, "step": 15141 }, { "crossentropy": 2.5715343952178955, "epoch": 0.548941415313225, "grad_norm": 0.03169432654976845, "grad_norm_var": 1.4376227340947977e-06, "learning_rate": 0.004377531017250516, "loss": 2.6009, "step": 15142 }, { "crossentropy": 2.6797122955322266, "epoch": 0.5489776682134571, "grad_norm": 0.02719919942319393, "grad_norm_var": 1.4090799059729286e-06, "learning_rate": 0.004376954454372836, "loss": 2.6381, "step": 15143 }, { "crossentropy": 2.5616629123687744, "epoch": 0.5490139211136891, "grad_norm": 0.028593972325325012, "grad_norm_var": 1.478825939400565e-06, "learning_rate": 0.004376377899910331, "loss": 2.5405, "step": 15144 }, { "crossentropy": 2.4362056255340576, "epoch": 0.5490501740139211, "grad_norm": 0.02769671566784382, "grad_norm_var": 1.4794867804129024e-06, "learning_rate": 0.004375801353870788, "loss": 2.4789, "step": 15145 }, { "crossentropy": 2.7072818279266357, "epoch": 0.5490864269141531, "grad_norm": 0.027363575994968414, "grad_norm_var": 1.4744337428803633e-06, "learning_rate": 0.004375224816261997, "loss": 2.667, "step": 15146 }, { "crossentropy": 2.638568639755249, "epoch": 0.5491226798143851, "grad_norm": 0.02935406006872654, "grad_norm_var": 1.668053657127161e-06, "learning_rate": 0.004374648287091742, "loss": 2.6777, "step": 15147 }, { "crossentropy": 2.5220208168029785, "epoch": 0.5491589327146171, "grad_norm": 0.028292037546634674, "grad_norm_var": 1.6574200803434746e-06, "learning_rate": 0.004374071766367811, "loss": 2.5424, "step": 15148 }, { "crossentropy": 2.4081079959869385, "epoch": 0.5491951856148491, "grad_norm": 0.026813076809048653, "grad_norm_var": 1.7131810856114837e-06, "learning_rate": 0.0043734952540979925, "loss": 2.5145, "step": 15149 }, { "crossentropy": 2.507171869277954, "epoch": 0.5492314385150812, "grad_norm": 0.02931503765285015, "grad_norm_var": 1.7214002228098697e-06, "learning_rate": 0.00437291875029007, "loss": 2.6082, "step": 15150 }, { "crossentropy": 2.554605007171631, "epoch": 0.5492676914153132, "grad_norm": 0.07740187644958496, "grad_norm_var": 0.000154201040584736, "learning_rate": 0.004372342254951831, "loss": 2.6894, "step": 15151 }, { "crossentropy": 2.5742149353027344, "epoch": 0.5493039443155452, "grad_norm": 0.027112271636724472, "grad_norm_var": 0.00015444751478982385, "learning_rate": 0.004371765768091062, "loss": 2.5302, "step": 15152 }, { "crossentropy": 2.5381855964660645, "epoch": 0.5493401972157773, "grad_norm": 0.028625069186091423, "grad_norm_var": 0.00015398446613741664, "learning_rate": 0.0043711892897155494, "loss": 2.5835, "step": 15153 }, { "crossentropy": 2.5909316539764404, "epoch": 0.5493764501160093, "grad_norm": 0.029681570827960968, "grad_norm_var": 0.00015308920269045654, "learning_rate": 0.00437061281983308, "loss": 2.5626, "step": 15154 }, { "crossentropy": 2.641427755355835, "epoch": 0.5494127030162413, "grad_norm": 0.026956450194120407, "grad_norm_var": 0.00015345996729284464, "learning_rate": 0.00437003635845144, "loss": 2.6099, "step": 15155 }, { "crossentropy": 2.533785820007324, "epoch": 0.5494489559164734, "grad_norm": 0.02773274853825569, "grad_norm_var": 0.00015308721185962532, "learning_rate": 0.004369459905578416, "loss": 2.5348, "step": 15156 }, { "crossentropy": 2.3274853229522705, "epoch": 0.5494852088167054, "grad_norm": 0.02970043011009693, "grad_norm_var": 0.00015173727737247434, "learning_rate": 0.004368883461221793, "loss": 2.4228, "step": 15157 }, { "crossentropy": 2.568894147872925, "epoch": 0.5495214617169374, "grad_norm": 0.02686031349003315, "grad_norm_var": 0.00015305367146364394, "learning_rate": 0.004368307025389356, "loss": 2.5962, "step": 15158 }, { "crossentropy": 2.274954319000244, "epoch": 0.5495577146171694, "grad_norm": 0.0319238044321537, "grad_norm_var": 0.00015194824470077446, "learning_rate": 0.00436773059808889, "loss": 2.468, "step": 15159 }, { "crossentropy": 2.682692766189575, "epoch": 0.5495939675174014, "grad_norm": 0.029402045533061028, "grad_norm_var": 0.00015167983709284724, "learning_rate": 0.004367154179328182, "loss": 2.6742, "step": 15160 }, { "crossentropy": 2.540050745010376, "epoch": 0.5496302204176334, "grad_norm": 0.026449253782629967, "grad_norm_var": 0.00015241209303622655, "learning_rate": 0.004366577769115017, "loss": 2.5641, "step": 15161 }, { "crossentropy": 2.6582541465759277, "epoch": 0.5496664733178654, "grad_norm": 0.02626919560134411, "grad_norm_var": 0.0001530812542925507, "learning_rate": 0.004366001367457181, "loss": 2.5977, "step": 15162 }, { "crossentropy": 2.5129854679107666, "epoch": 0.5497027262180975, "grad_norm": 0.02961639314889908, "grad_norm_var": 0.00015301510966762665, "learning_rate": 0.004365424974362459, "loss": 2.5067, "step": 15163 }, { "crossentropy": 2.596217632293701, "epoch": 0.5497389791183295, "grad_norm": 0.02907661721110344, "grad_norm_var": 0.00015273008085432138, "learning_rate": 0.0043648485898386356, "loss": 2.6125, "step": 15164 }, { "crossentropy": 2.623295307159424, "epoch": 0.5497752320185615, "grad_norm": 0.025904007256031036, "grad_norm_var": 0.00015334177063062668, "learning_rate": 0.004364272213893497, "loss": 2.565, "step": 15165 }, { "crossentropy": 2.727189302444458, "epoch": 0.5498114849187935, "grad_norm": 0.02857792004942894, "grad_norm_var": 0.00015357835383656744, "learning_rate": 0.004363695846534826, "loss": 2.6428, "step": 15166 }, { "crossentropy": 2.5463955402374268, "epoch": 0.5498477378190255, "grad_norm": 0.026136798784136772, "grad_norm_var": 2.9222638549837906e-06, "learning_rate": 0.004363119487770409, "loss": 2.5763, "step": 15167 }, { "crossentropy": 2.6965723037719727, "epoch": 0.5498839907192575, "grad_norm": 0.028776992112398148, "grad_norm_var": 2.870336506265965e-06, "learning_rate": 0.004362543137608027, "loss": 2.7063, "step": 15168 }, { "crossentropy": 2.4736530780792236, "epoch": 0.5499202436194895, "grad_norm": 0.027000127360224724, "grad_norm_var": 2.9498986273259007e-06, "learning_rate": 0.004361966796055469, "loss": 2.5368, "step": 15169 }, { "crossentropy": 2.499441146850586, "epoch": 0.5499564965197216, "grad_norm": 0.026731839403510094, "grad_norm_var": 2.8831000140651375e-06, "learning_rate": 0.004361390463120517, "loss": 2.481, "step": 15170 }, { "crossentropy": 2.4421451091766357, "epoch": 0.5499927494199536, "grad_norm": 0.026982301846146584, "grad_norm_var": 2.879735454620067e-06, "learning_rate": 0.004360814138810958, "loss": 2.4627, "step": 15171 }, { "crossentropy": 2.612098455429077, "epoch": 0.5500290023201856, "grad_norm": 0.026803413406014442, "grad_norm_var": 2.9601758036701746e-06, "learning_rate": 0.004360237823134572, "loss": 2.6485, "step": 15172 }, { "crossentropy": 2.5932533740997314, "epoch": 0.5500652552204176, "grad_norm": 0.02787027880549431, "grad_norm_var": 2.727299825940619e-06, "learning_rate": 0.004359661516099146, "loss": 2.5977, "step": 15173 }, { "crossentropy": 2.5507194995880127, "epoch": 0.5501015081206496, "grad_norm": 0.02760743908584118, "grad_norm_var": 2.6711854363374906e-06, "learning_rate": 0.004359085217712464, "loss": 2.5639, "step": 15174 }, { "crossentropy": 2.757962703704834, "epoch": 0.5501377610208816, "grad_norm": 0.02647007256746292, "grad_norm_var": 1.5463780486866247e-06, "learning_rate": 0.004358508927982308, "loss": 2.656, "step": 15175 }, { "crossentropy": 2.550912618637085, "epoch": 0.5501740139211136, "grad_norm": 0.025862127542495728, "grad_norm_var": 1.422225819369451e-06, "learning_rate": 0.004357932646916461, "loss": 2.5316, "step": 15176 }, { "crossentropy": 2.655343770980835, "epoch": 0.5502102668213457, "grad_norm": 0.025619255378842354, "grad_norm_var": 1.5548298605158029e-06, "learning_rate": 0.004357356374522709, "loss": 2.5602, "step": 15177 }, { "crossentropy": 2.5057098865509033, "epoch": 0.5502465197215777, "grad_norm": 0.026295779272913933, "grad_norm_var": 1.5515515904048737e-06, "learning_rate": 0.004356780110808834, "loss": 2.5174, "step": 15178 }, { "crossentropy": 2.5199315547943115, "epoch": 0.5502827726218097, "grad_norm": 0.026021551340818405, "grad_norm_var": 1.2049605017408945e-06, "learning_rate": 0.0043562038557826204, "loss": 2.555, "step": 15179 }, { "crossentropy": 2.638338327407837, "epoch": 0.5503190255220418, "grad_norm": 0.027024228125810623, "grad_norm_var": 8.954527738275208e-07, "learning_rate": 0.00435562760945185, "loss": 2.6001, "step": 15180 }, { "crossentropy": 2.691803216934204, "epoch": 0.5503552784222738, "grad_norm": 0.02667299658060074, "grad_norm_var": 8.348782267492138e-07, "learning_rate": 0.004355051371824307, "loss": 2.6456, "step": 15181 }, { "crossentropy": 2.4693305492401123, "epoch": 0.5503915313225058, "grad_norm": 0.026829499751329422, "grad_norm_var": 6.355518014596051e-07, "learning_rate": 0.004354475142907775, "loss": 2.5197, "step": 15182 }, { "crossentropy": 2.576477527618408, "epoch": 0.5504277842227379, "grad_norm": 0.026163220405578613, "grad_norm_var": 6.332800355149006e-07, "learning_rate": 0.004353898922710035, "loss": 2.5046, "step": 15183 }, { "crossentropy": 2.6364095211029053, "epoch": 0.5504640371229699, "grad_norm": 0.026766138151288033, "grad_norm_var": 3.547877193030754e-07, "learning_rate": 0.004353322711238869, "loss": 2.5678, "step": 15184 }, { "crossentropy": 2.6114773750305176, "epoch": 0.5505002900232019, "grad_norm": 0.0285036638379097, "grad_norm_var": 5.622543569204815e-07, "learning_rate": 0.004352746508502063, "loss": 2.5853, "step": 15185 }, { "crossentropy": 2.7335147857666016, "epoch": 0.5505365429234339, "grad_norm": 0.028077157214283943, "grad_norm_var": 6.696052084140576e-07, "learning_rate": 0.0043521703145073965, "loss": 2.6061, "step": 15186 }, { "crossentropy": 2.6602072715759277, "epoch": 0.5505727958236659, "grad_norm": 0.027669791132211685, "grad_norm_var": 7.114496784812555e-07, "learning_rate": 0.004351594129262654, "loss": 2.6628, "step": 15187 }, { "crossentropy": 2.57669997215271, "epoch": 0.5506090487238979, "grad_norm": 0.026805805042386055, "grad_norm_var": 7.114220937294117e-07, "learning_rate": 0.004351017952775617, "loss": 2.5849, "step": 15188 }, { "crossentropy": 2.645050048828125, "epoch": 0.55064530162413, "grad_norm": 0.027395447716116905, "grad_norm_var": 6.635265802113019e-07, "learning_rate": 0.004350441785054065, "loss": 2.6665, "step": 15189 }, { "crossentropy": 2.583317518234253, "epoch": 0.550681554524362, "grad_norm": 0.02809828333556652, "grad_norm_var": 7.274025316145269e-07, "learning_rate": 0.004349865626105785, "loss": 2.573, "step": 15190 }, { "crossentropy": 2.6048855781555176, "epoch": 0.550717807424594, "grad_norm": 0.029640918597579002, "grad_norm_var": 1.1773320783815822e-06, "learning_rate": 0.004349289475938557, "loss": 2.7186, "step": 15191 }, { "crossentropy": 2.565192461013794, "epoch": 0.550754060324826, "grad_norm": 0.026019735261797905, "grad_norm_var": 1.1530739324297577e-06, "learning_rate": 0.00434871333456016, "loss": 2.5486, "step": 15192 }, { "crossentropy": 2.3984978199005127, "epoch": 0.550790313225058, "grad_norm": 0.027149487286806107, "grad_norm_var": 9.972625854696068e-07, "learning_rate": 0.004348137201978379, "loss": 2.4505, "step": 15193 }, { "crossentropy": 2.601982593536377, "epoch": 0.55082656612529, "grad_norm": 0.027608036994934082, "grad_norm_var": 9.474044275003714e-07, "learning_rate": 0.004347561078200993, "loss": 2.5588, "step": 15194 }, { "crossentropy": 2.712782382965088, "epoch": 0.550862819025522, "grad_norm": 0.02838958241045475, "grad_norm_var": 9.012099370947028e-07, "learning_rate": 0.004346984963235786, "loss": 2.5419, "step": 15195 }, { "crossentropy": 2.525400161743164, "epoch": 0.550899071925754, "grad_norm": 0.02643579989671707, "grad_norm_var": 9.543624312574028e-07, "learning_rate": 0.004346408857090538, "loss": 2.5032, "step": 15196 }, { "crossentropy": 2.4866342544555664, "epoch": 0.5509353248259861, "grad_norm": 0.03141091763973236, "grad_norm_var": 1.9049785026342114e-06, "learning_rate": 0.004345832759773028, "loss": 2.4649, "step": 15197 }, { "crossentropy": 2.666722297668457, "epoch": 0.5509715777262181, "grad_norm": 0.027412833645939827, "grad_norm_var": 1.8596899930116746e-06, "learning_rate": 0.0043452566712910414, "loss": 2.6469, "step": 15198 }, { "crossentropy": 2.4663307666778564, "epoch": 0.5510078306264501, "grad_norm": 0.026706909760832787, "grad_norm_var": 1.7651894345829347e-06, "learning_rate": 0.004344680591652359, "loss": 2.5015, "step": 15199 }, { "crossentropy": 2.4172332286834717, "epoch": 0.5510440835266821, "grad_norm": 0.028628837317228317, "grad_norm_var": 1.7362857180449093e-06, "learning_rate": 0.004344104520864756, "loss": 2.5293, "step": 15200 }, { "crossentropy": 2.7634835243225098, "epoch": 0.5510803364269141, "grad_norm": 0.032829515635967255, "grad_norm_var": 3.270135534960911e-06, "learning_rate": 0.0043435284589360195, "loss": 2.6507, "step": 15201 }, { "crossentropy": 2.5778493881225586, "epoch": 0.5511165893271461, "grad_norm": 0.027116959914565086, "grad_norm_var": 3.336117278441306e-06, "learning_rate": 0.004342952405873926, "loss": 2.5416, "step": 15202 }, { "crossentropy": 2.4842960834503174, "epoch": 0.5511528422273781, "grad_norm": 0.02843892015516758, "grad_norm_var": 3.3307735250892937e-06, "learning_rate": 0.0043423763616862576, "loss": 2.4984, "step": 15203 }, { "crossentropy": 2.603992462158203, "epoch": 0.5511890951276102, "grad_norm": 0.028307415544986725, "grad_norm_var": 3.2064773257669027e-06, "learning_rate": 0.004341800326380795, "loss": 2.6284, "step": 15204 }, { "crossentropy": 2.6588282585144043, "epoch": 0.5512253480278422, "grad_norm": 0.029578516259789467, "grad_norm_var": 3.2630656719120997e-06, "learning_rate": 0.004341224299965316, "loss": 2.6544, "step": 15205 }, { "crossentropy": 2.487982988357544, "epoch": 0.5512616009280742, "grad_norm": 0.02738869935274124, "grad_norm_var": 3.3193712565077194e-06, "learning_rate": 0.004340648282447603, "loss": 2.5547, "step": 15206 }, { "crossentropy": 2.7231605052948, "epoch": 0.5512978538283063, "grad_norm": 0.027542151510715485, "grad_norm_var": 3.2240372322809523e-06, "learning_rate": 0.004340072273835439, "loss": 2.6595, "step": 15207 }, { "crossentropy": 2.3488686084747314, "epoch": 0.5513341067285383, "grad_norm": 0.028062667697668076, "grad_norm_var": 2.8950134105518165e-06, "learning_rate": 0.004339496274136597, "loss": 2.4643, "step": 15208 }, { "crossentropy": 2.5961709022521973, "epoch": 0.5513703596287703, "grad_norm": 0.02751879394054413, "grad_norm_var": 2.846247530121672e-06, "learning_rate": 0.004338920283358861, "loss": 2.6057, "step": 15209 }, { "crossentropy": 2.558568000793457, "epoch": 0.5514066125290024, "grad_norm": 0.027044961228966713, "grad_norm_var": 2.9207191520248433e-06, "learning_rate": 0.00433834430151001, "loss": 2.5437, "step": 15210 }, { "crossentropy": 2.5555953979492188, "epoch": 0.5514428654292344, "grad_norm": 0.028582941740751266, "grad_norm_var": 2.925343713644733e-06, "learning_rate": 0.004337768328597822, "loss": 2.5281, "step": 15211 }, { "crossentropy": 2.4171276092529297, "epoch": 0.5514791183294664, "grad_norm": 0.029237741604447365, "grad_norm_var": 2.7147432349995293e-06, "learning_rate": 0.004337192364630079, "loss": 2.4835, "step": 15212 }, { "crossentropy": 2.5695769786834717, "epoch": 0.5515153712296984, "grad_norm": 0.030135659500956535, "grad_norm_var": 2.3193976618547664e-06, "learning_rate": 0.004336616409614558, "loss": 2.5853, "step": 15213 }, { "crossentropy": 2.629976272583008, "epoch": 0.5515516241299304, "grad_norm": 0.028477411717176437, "grad_norm_var": 2.248923915904944e-06, "learning_rate": 0.00433604046355904, "loss": 2.5673, "step": 15214 }, { "crossentropy": 2.435131311416626, "epoch": 0.5515878770301624, "grad_norm": 0.027033694088459015, "grad_norm_var": 2.178565441274203e-06, "learning_rate": 0.004335464526471303, "loss": 2.5614, "step": 15215 }, { "crossentropy": 2.487396240234375, "epoch": 0.5516241299303944, "grad_norm": 0.04271475225687027, "grad_norm_var": 1.483016705697976e-05, "learning_rate": 0.004334888598359127, "loss": 2.5451, "step": 15216 }, { "crossentropy": 2.5785555839538574, "epoch": 0.5516603828306265, "grad_norm": 0.029566731303930283, "grad_norm_var": 1.3992975600283544e-05, "learning_rate": 0.004334312679230289, "loss": 2.5645, "step": 15217 }, { "crossentropy": 2.6229517459869385, "epoch": 0.5516966357308585, "grad_norm": 0.02854214794933796, "grad_norm_var": 1.3729461314576178e-05, "learning_rate": 0.004333736769092569, "loss": 2.6208, "step": 15218 }, { "crossentropy": 2.531895637512207, "epoch": 0.5517328886310905, "grad_norm": 0.027225369587540627, "grad_norm_var": 1.39544951804881e-05, "learning_rate": 0.0043331608679537436, "loss": 2.5448, "step": 15219 }, { "crossentropy": 2.5649075508117676, "epoch": 0.5517691415313225, "grad_norm": 0.02720322646200657, "grad_norm_var": 1.4159896660867462e-05, "learning_rate": 0.004332584975821593, "loss": 2.5461, "step": 15220 }, { "crossentropy": 2.599440097808838, "epoch": 0.5518053944315545, "grad_norm": 0.027708346024155617, "grad_norm_var": 1.4263153167139844e-05, "learning_rate": 0.004332009092703894, "loss": 2.5536, "step": 15221 }, { "crossentropy": 2.710068702697754, "epoch": 0.5518416473317865, "grad_norm": 0.028820103034377098, "grad_norm_var": 1.4083862952949759e-05, "learning_rate": 0.004331433218608428, "loss": 2.678, "step": 15222 }, { "crossentropy": 2.6782166957855225, "epoch": 0.5518779002320185, "grad_norm": 0.027387559413909912, "grad_norm_var": 1.411723129143815e-05, "learning_rate": 0.0043308573535429705, "loss": 2.5571, "step": 15223 }, { "crossentropy": 2.613734006881714, "epoch": 0.5519141531322506, "grad_norm": 0.027527187019586563, "grad_norm_var": 1.4207707577031567e-05, "learning_rate": 0.004330281497515301, "loss": 2.5985, "step": 15224 }, { "crossentropy": 2.4470937252044678, "epoch": 0.5519504060324826, "grad_norm": 0.02692248485982418, "grad_norm_var": 1.4351309944360965e-05, "learning_rate": 0.004329705650533195, "loss": 2.5988, "step": 15225 }, { "crossentropy": 2.810715436935425, "epoch": 0.5519866589327146, "grad_norm": 0.026731589809060097, "grad_norm_var": 1.4439474966303775e-05, "learning_rate": 0.004329129812604432, "loss": 2.6852, "step": 15226 }, { "crossentropy": 2.56640625, "epoch": 0.5520229118329466, "grad_norm": 0.02635050192475319, "grad_norm_var": 1.4871697213805985e-05, "learning_rate": 0.004328553983736788, "loss": 2.5484, "step": 15227 }, { "crossentropy": 2.344360828399658, "epoch": 0.5520591647331786, "grad_norm": 0.028373220935463905, "grad_norm_var": 1.4873603092145161e-05, "learning_rate": 0.004327978163938042, "loss": 2.4346, "step": 15228 }, { "crossentropy": 2.6180260181427, "epoch": 0.5520954176334106, "grad_norm": 0.026928376406431198, "grad_norm_var": 1.4943202660921914e-05, "learning_rate": 0.004327402353215969, "loss": 2.5987, "step": 15229 }, { "crossentropy": 2.485337734222412, "epoch": 0.5521316705336426, "grad_norm": 0.02670997381210327, "grad_norm_var": 1.5166045644786814e-05, "learning_rate": 0.00432682655157835, "loss": 2.5017, "step": 15230 }, { "crossentropy": 2.495929002761841, "epoch": 0.5521679234338747, "grad_norm": 0.026196658611297607, "grad_norm_var": 1.5371704742222816e-05, "learning_rate": 0.004326250759032959, "loss": 2.511, "step": 15231 }, { "crossentropy": 2.5704495906829834, "epoch": 0.5522041763341067, "grad_norm": 0.026526032015681267, "grad_norm_var": 9.215984327980009e-07, "learning_rate": 0.004325674975587575, "loss": 2.5325, "step": 15232 }, { "crossentropy": 2.6089015007019043, "epoch": 0.5522404292343387, "grad_norm": 0.02613612823188305, "grad_norm_var": 6.752048398131482e-07, "learning_rate": 0.004325099201249973, "loss": 2.494, "step": 15233 }, { "crossentropy": 2.5264480113983154, "epoch": 0.5522766821345708, "grad_norm": 0.02920164354145527, "grad_norm_var": 8.19918386444259e-07, "learning_rate": 0.004324523436027932, "loss": 2.6044, "step": 15234 }, { "crossentropy": 2.6618881225585938, "epoch": 0.5523129350348028, "grad_norm": 0.026929611340165138, "grad_norm_var": 8.262295597171361e-07, "learning_rate": 0.004323947679929226, "loss": 2.6822, "step": 15235 }, { "crossentropy": 2.5096535682678223, "epoch": 0.5523491879350348, "grad_norm": 0.027669763192534447, "grad_norm_var": 8.382740040834812e-07, "learning_rate": 0.004323371932961632, "loss": 2.5447, "step": 15236 }, { "crossentropy": 2.4524788856506348, "epoch": 0.5523854408352669, "grad_norm": 0.027737503871321678, "grad_norm_var": 8.400800994690556e-07, "learning_rate": 0.004322796195132926, "loss": 2.4738, "step": 15237 }, { "crossentropy": 2.5878639221191406, "epoch": 0.5524216937354989, "grad_norm": 0.027455247938632965, "grad_norm_var": 6.724656878700214e-07, "learning_rate": 0.004322220466450886, "loss": 2.5985, "step": 15238 }, { "crossentropy": 2.5614266395568848, "epoch": 0.5524579466357309, "grad_norm": 0.025547828525304794, "grad_norm_var": 8.31610289395027e-07, "learning_rate": 0.004321644746923287, "loss": 2.5939, "step": 15239 }, { "crossentropy": 2.5470049381256104, "epoch": 0.5524941995359629, "grad_norm": 0.02724519744515419, "grad_norm_var": 8.189764080844628e-07, "learning_rate": 0.0043210690365579054, "loss": 2.6133, "step": 15240 }, { "crossentropy": 2.318751096725464, "epoch": 0.5525304524361949, "grad_norm": 0.02855793572962284, "grad_norm_var": 9.602231694104489e-07, "learning_rate": 0.004320493335362518, "loss": 2.4602, "step": 15241 }, { "crossentropy": 2.4632463455200195, "epoch": 0.5525667053364269, "grad_norm": 0.025957629084587097, "grad_norm_var": 1.0401764198262702e-06, "learning_rate": 0.0043199176433448986, "loss": 2.4915, "step": 15242 }, { "crossentropy": 2.7642760276794434, "epoch": 0.552602958236659, "grad_norm": 0.03056616149842739, "grad_norm_var": 1.7323253649189115e-06, "learning_rate": 0.004319341960512823, "loss": 2.7666, "step": 15243 }, { "crossentropy": 2.658308744430542, "epoch": 0.552639211136891, "grad_norm": 0.03170514106750488, "grad_norm_var": 2.8768961224878245e-06, "learning_rate": 0.004318766286874067, "loss": 2.5959, "step": 15244 }, { "crossentropy": 2.6596789360046387, "epoch": 0.552675464037123, "grad_norm": 0.027297984808683395, "grad_norm_var": 2.8539657810954894e-06, "learning_rate": 0.004318190622436406, "loss": 2.6286, "step": 15245 }, { "crossentropy": 2.5140540599823, "epoch": 0.552711716937355, "grad_norm": 0.08488037437200546, "grad_norm_var": 0.00020751544086080136, "learning_rate": 0.004317614967207615, "loss": 2.4572, "step": 15246 }, { "crossentropy": 2.4619593620300293, "epoch": 0.552747969837587, "grad_norm": 0.02550029195845127, "grad_norm_var": 0.00020801268758214596, "learning_rate": 0.00431703932119547, "loss": 2.5012, "step": 15247 }, { "crossentropy": 2.5584168434143066, "epoch": 0.552784222737819, "grad_norm": 0.02789565548300743, "grad_norm_var": 0.00020727964471369797, "learning_rate": 0.004316463684407745, "loss": 2.5276, "step": 15248 }, { "crossentropy": 2.592555522918701, "epoch": 0.552820475638051, "grad_norm": 0.03125346824526787, "grad_norm_var": 0.00020541497146053393, "learning_rate": 0.004315888056852218, "loss": 2.5191, "step": 15249 }, { "crossentropy": 2.7286670207977295, "epoch": 0.552856728538283, "grad_norm": 0.028674161061644554, "grad_norm_var": 0.00020560016730655135, "learning_rate": 0.0043153124385366575, "loss": 2.6627, "step": 15250 }, { "crossentropy": 2.542104721069336, "epoch": 0.5528929814385151, "grad_norm": 0.027633817866444588, "grad_norm_var": 0.00020519689977017025, "learning_rate": 0.004314736829468843, "loss": 2.6313, "step": 15251 }, { "crossentropy": 2.485269784927368, "epoch": 0.5529292343387471, "grad_norm": 0.0273780208081007, "grad_norm_var": 0.00020535504850047285, "learning_rate": 0.004314161229656548, "loss": 2.5088, "step": 15252 }, { "crossentropy": 2.503953218460083, "epoch": 0.5529654872389791, "grad_norm": 0.026660131290555, "grad_norm_var": 0.0002059796251970823, "learning_rate": 0.004313585639107544, "loss": 2.5247, "step": 15253 }, { "crossentropy": 2.730987071990967, "epoch": 0.5530017401392111, "grad_norm": 0.03053489699959755, "grad_norm_var": 0.0002049061695752176, "learning_rate": 0.004313010057829609, "loss": 2.6475, "step": 15254 }, { "crossentropy": 2.6944379806518555, "epoch": 0.5530379930394431, "grad_norm": 0.0284530371427536, "grad_norm_var": 0.0002030484248409624, "learning_rate": 0.004312434485830517, "loss": 2.6008, "step": 15255 }, { "crossentropy": 2.549429416656494, "epoch": 0.5530742459396751, "grad_norm": 0.026638662442564964, "grad_norm_var": 0.0002034468160266254, "learning_rate": 0.004311858923118039, "loss": 2.5276, "step": 15256 }, { "crossentropy": 2.6100118160247803, "epoch": 0.5531104988399071, "grad_norm": 0.027006613090634346, "grad_norm_var": 0.00020427800587962305, "learning_rate": 0.004311283369699951, "loss": 2.5884, "step": 15257 }, { "crossentropy": 2.5298638343811035, "epoch": 0.5531467517401392, "grad_norm": 0.028002440929412842, "grad_norm_var": 0.00020295947894465038, "learning_rate": 0.004310707825584028, "loss": 2.5802, "step": 15258 }, { "crossentropy": 2.6082096099853516, "epoch": 0.5531830046403712, "grad_norm": 0.026072045788168907, "grad_norm_var": 0.00020500910066862757, "learning_rate": 0.004310132290778041, "loss": 2.6245, "step": 15259 }, { "crossentropy": 2.4037258625030518, "epoch": 0.5532192575406032, "grad_norm": 0.026784298941493034, "grad_norm_var": 0.00020645299065907678, "learning_rate": 0.004309556765289763, "loss": 2.5185, "step": 15260 }, { "crossentropy": 2.608833074569702, "epoch": 0.5532555104408353, "grad_norm": 0.02639547549188137, "grad_norm_var": 0.00020698447060532517, "learning_rate": 0.004308981249126968, "loss": 2.5825, "step": 15261 }, { "crossentropy": 2.6301827430725098, "epoch": 0.5532917633410673, "grad_norm": 0.026977939531207085, "grad_norm_var": 2.3696489244490975e-06, "learning_rate": 0.004308405742297431, "loss": 2.5672, "step": 15262 }, { "crossentropy": 2.506669282913208, "epoch": 0.5533280162412993, "grad_norm": 0.028237242251634598, "grad_norm_var": 2.0656385547326387e-06, "learning_rate": 0.004307830244808924, "loss": 2.5865, "step": 15263 }, { "crossentropy": 2.4911088943481445, "epoch": 0.5533642691415314, "grad_norm": 0.029707953333854675, "grad_norm_var": 2.297081287407944e-06, "learning_rate": 0.004307254756669221, "loss": 2.6246, "step": 15264 }, { "crossentropy": 2.570307970046997, "epoch": 0.5534005220417634, "grad_norm": 0.026316260918974876, "grad_norm_var": 1.613433110823773e-06, "learning_rate": 0.0043066792778860925, "loss": 2.512, "step": 15265 }, { "crossentropy": 2.5359604358673096, "epoch": 0.5534367749419954, "grad_norm": 0.02664664201438427, "grad_norm_var": 1.5778301087930919e-06, "learning_rate": 0.004306103808467314, "loss": 2.4812, "step": 15266 }, { "crossentropy": 2.5797042846679688, "epoch": 0.5534730278422274, "grad_norm": 0.025759264826774597, "grad_norm_var": 1.7553431079864526e-06, "learning_rate": 0.0043055283484206555, "loss": 2.4978, "step": 15267 }, { "crossentropy": 2.50607967376709, "epoch": 0.5535092807424594, "grad_norm": 0.025564393028616905, "grad_norm_var": 1.953705642070552e-06, "learning_rate": 0.004304952897753891, "loss": 2.5098, "step": 15268 }, { "crossentropy": 2.591836452484131, "epoch": 0.5535455336426914, "grad_norm": 0.02711952105164528, "grad_norm_var": 1.93169408056897e-06, "learning_rate": 0.004304377456474791, "loss": 2.5127, "step": 15269 }, { "crossentropy": 2.678481340408325, "epoch": 0.5535817865429234, "grad_norm": 0.026370778679847717, "grad_norm_var": 1.19912931001626e-06, "learning_rate": 0.004303802024591129, "loss": 2.5825, "step": 15270 }, { "crossentropy": 2.5058913230895996, "epoch": 0.5536180394431555, "grad_norm": 0.028161533176898956, "grad_norm_var": 1.1480924432551251e-06, "learning_rate": 0.004303226602110679, "loss": 2.4506, "step": 15271 }, { "crossentropy": 2.4847447872161865, "epoch": 0.5536542923433875, "grad_norm": 0.0258487556129694, "grad_norm_var": 1.2235730938498728e-06, "learning_rate": 0.004302651189041211, "loss": 2.5639, "step": 15272 }, { "crossentropy": 2.6727135181427, "epoch": 0.5536905452436195, "grad_norm": 0.026874562725424767, "grad_norm_var": 1.2234143329500757e-06, "learning_rate": 0.0043020757853904955, "loss": 2.6261, "step": 15273 }, { "crossentropy": 2.6412885189056396, "epoch": 0.5537267981438515, "grad_norm": 0.026744598522782326, "grad_norm_var": 1.142009618075995e-06, "learning_rate": 0.004301500391166306, "loss": 2.6182, "step": 15274 }, { "crossentropy": 2.50882625579834, "epoch": 0.5537630510440835, "grad_norm": 0.026890166103839874, "grad_norm_var": 1.0991085414651606e-06, "learning_rate": 0.0043009250063764165, "loss": 2.5309, "step": 15275 }, { "crossentropy": 2.6456644535064697, "epoch": 0.5537993039443155, "grad_norm": 0.02677750401198864, "grad_norm_var": 1.0992162164729594e-06, "learning_rate": 0.004300349631028592, "loss": 2.6545, "step": 15276 }, { "crossentropy": 2.6763076782226562, "epoch": 0.5538355568445475, "grad_norm": 0.027853090316057205, "grad_norm_var": 1.1340426159600972e-06, "learning_rate": 0.004299774265130609, "loss": 2.6825, "step": 15277 }, { "crossentropy": 2.478471517562866, "epoch": 0.5538718097447796, "grad_norm": 0.026341881603002548, "grad_norm_var": 1.1604051396613223e-06, "learning_rate": 0.004299198908690237, "loss": 2.5177, "step": 15278 }, { "crossentropy": 2.646230459213257, "epoch": 0.5539080626450116, "grad_norm": 0.026336390525102615, "grad_norm_var": 1.0602090291037993e-06, "learning_rate": 0.004298623561715247, "loss": 2.6171, "step": 15279 }, { "crossentropy": 2.5016276836395264, "epoch": 0.5539443155452436, "grad_norm": 0.026123305782675743, "grad_norm_var": 4.887833425393228e-07, "learning_rate": 0.0042980482242134105, "loss": 2.5506, "step": 15280 }, { "crossentropy": 2.5864317417144775, "epoch": 0.5539805684454756, "grad_norm": 0.026005737483501434, "grad_norm_var": 5.068904815062285e-07, "learning_rate": 0.004297472896192498, "loss": 2.5669, "step": 15281 }, { "crossentropy": 2.564070463180542, "epoch": 0.5540168213457076, "grad_norm": 0.02746659517288208, "grad_norm_var": 5.552526543161605e-07, "learning_rate": 0.004296897577660279, "loss": 2.5464, "step": 15282 }, { "crossentropy": 2.4254043102264404, "epoch": 0.5540530742459396, "grad_norm": 0.027893774211406708, "grad_norm_var": 5.893866431594743e-07, "learning_rate": 0.004296322268624529, "loss": 2.5017, "step": 15283 }, { "crossentropy": 2.4682891368865967, "epoch": 0.5540893271461717, "grad_norm": 0.027046948671340942, "grad_norm_var": 4.877928840848195e-07, "learning_rate": 0.00429574696909301, "loss": 2.5269, "step": 15284 }, { "crossentropy": 2.648467540740967, "epoch": 0.5541255800464037, "grad_norm": 0.027647797018289566, "grad_norm_var": 5.230960830295555e-07, "learning_rate": 0.004295171679073499, "loss": 2.58, "step": 15285 }, { "crossentropy": 2.530588388442993, "epoch": 0.5541618329466357, "grad_norm": 0.026968050748109818, "grad_norm_var": 5.033292632478641e-07, "learning_rate": 0.004294596398573765, "loss": 2.4273, "step": 15286 }, { "crossentropy": 2.7843334674835205, "epoch": 0.5541980858468677, "grad_norm": 0.027195077389478683, "grad_norm_var": 4.038211975920658e-07, "learning_rate": 0.0042940211276015754, "loss": 2.6605, "step": 15287 }, { "crossentropy": 2.705659866333008, "epoch": 0.5542343387470998, "grad_norm": 0.02696872316300869, "grad_norm_var": 3.2883573520581187e-07, "learning_rate": 0.004293445866164703, "loss": 2.6255, "step": 15288 }, { "crossentropy": 2.6916511058807373, "epoch": 0.5542705916473318, "grad_norm": 0.027729695662856102, "grad_norm_var": 3.6640669338858174e-07, "learning_rate": 0.004292870614270914, "loss": 2.611, "step": 15289 }, { "crossentropy": 2.6143414974212646, "epoch": 0.5543068445475638, "grad_norm": 0.026268359273672104, "grad_norm_var": 3.9675723972430047e-07, "learning_rate": 0.004292295371927983, "loss": 2.5843, "step": 15290 }, { "crossentropy": 2.4839229583740234, "epoch": 0.5543430974477959, "grad_norm": 0.02665114216506481, "grad_norm_var": 4.028585628266471e-07, "learning_rate": 0.004291720139143679, "loss": 2.5387, "step": 15291 }, { "crossentropy": 2.6257097721099854, "epoch": 0.5543793503480279, "grad_norm": 0.02674432285130024, "grad_norm_var": 4.0371100556769493e-07, "learning_rate": 0.004291144915925765, "loss": 2.6816, "step": 15292 }, { "crossentropy": 2.748241424560547, "epoch": 0.5544156032482599, "grad_norm": 0.027470167726278305, "grad_norm_var": 3.6689735878566756e-07, "learning_rate": 0.0042905697022820165, "loss": 2.6145, "step": 15293 }, { "crossentropy": 2.5707175731658936, "epoch": 0.5544518561484919, "grad_norm": 0.026643695309758186, "grad_norm_var": 3.489790286183176e-07, "learning_rate": 0.004289994498220201, "loss": 2.488, "step": 15294 }, { "crossentropy": 2.6262598037719727, "epoch": 0.5544881090487239, "grad_norm": 0.027535347267985344, "grad_norm_var": 3.411322645666444e-07, "learning_rate": 0.004289419303748087, "loss": 2.5932, "step": 15295 }, { "crossentropy": 2.6414577960968018, "epoch": 0.5545243619489559, "grad_norm": 0.03014960139989853, "grad_norm_var": 8.716426707693533e-07, "learning_rate": 0.004288844118873443, "loss": 2.6829, "step": 15296 }, { "crossentropy": 2.4447715282440186, "epoch": 0.554560614849188, "grad_norm": 0.02798287197947502, "grad_norm_var": 7.816051676073332e-07, "learning_rate": 0.004288268943604038, "loss": 2.4571, "step": 15297 }, { "crossentropy": 2.5163233280181885, "epoch": 0.55459686774942, "grad_norm": 0.02676769532263279, "grad_norm_var": 8.057078711875964e-07, "learning_rate": 0.0042876937779476425, "loss": 2.4834, "step": 15298 }, { "crossentropy": 2.656520128250122, "epoch": 0.554633120649652, "grad_norm": 0.027281470596790314, "grad_norm_var": 7.850689526765619e-07, "learning_rate": 0.004287118621912023, "loss": 2.5319, "step": 15299 }, { "crossentropy": 2.4940524101257324, "epoch": 0.554669373549884, "grad_norm": 0.026936030015349388, "grad_norm_var": 7.898122759780167e-07, "learning_rate": 0.004286543475504949, "loss": 2.5888, "step": 15300 }, { "crossentropy": 2.5594944953918457, "epoch": 0.554705626450116, "grad_norm": 0.026419449597597122, "grad_norm_var": 8.285860987559841e-07, "learning_rate": 0.004285968338734188, "loss": 2.5783, "step": 15301 }, { "crossentropy": 2.611635208129883, "epoch": 0.554741879350348, "grad_norm": 0.030803652480244637, "grad_norm_var": 1.6130984855341428e-06, "learning_rate": 0.004285393211607507, "loss": 2.5899, "step": 15302 }, { "crossentropy": 2.4571728706359863, "epoch": 0.55477813225058, "grad_norm": 0.027625882998108864, "grad_norm_var": 1.6088083010415422e-06, "learning_rate": 0.004284818094132676, "loss": 2.5397, "step": 15303 }, { "crossentropy": 2.4582579135894775, "epoch": 0.554814385150812, "grad_norm": 0.025549771264195442, "grad_norm_var": 1.8349026355862196e-06, "learning_rate": 0.0042842429863174615, "loss": 2.5187, "step": 15304 }, { "crossentropy": 2.634697914123535, "epoch": 0.5548506380510441, "grad_norm": 0.02672920748591423, "grad_norm_var": 1.85480973763732e-06, "learning_rate": 0.004283667888169631, "loss": 2.6012, "step": 15305 }, { "crossentropy": 2.452695846557617, "epoch": 0.5548868909512761, "grad_norm": 0.02857154607772827, "grad_norm_var": 1.8549821249300707e-06, "learning_rate": 0.004283092799696954, "loss": 2.5441, "step": 15306 }, { "crossentropy": 2.6492888927459717, "epoch": 0.5549231438515081, "grad_norm": 0.027663180604577065, "grad_norm_var": 1.805617490781503e-06, "learning_rate": 0.0042825177209071955, "loss": 2.6325, "step": 15307 }, { "crossentropy": 2.569432258605957, "epoch": 0.5549593967517401, "grad_norm": 0.027308693155646324, "grad_norm_var": 1.7645503797728088e-06, "learning_rate": 0.004281942651808126, "loss": 2.6109, "step": 15308 }, { "crossentropy": 2.524080514907837, "epoch": 0.5549956496519721, "grad_norm": 0.027950795367360115, "grad_norm_var": 1.771315725074216e-06, "learning_rate": 0.00428136759240751, "loss": 2.5291, "step": 15309 }, { "crossentropy": 2.612891912460327, "epoch": 0.5550319025522041, "grad_norm": 0.025754880160093307, "grad_norm_var": 1.9363826200980422e-06, "learning_rate": 0.004280792542713114, "loss": 2.57, "step": 15310 }, { "crossentropy": 2.6642985343933105, "epoch": 0.5550681554524362, "grad_norm": 0.027194533497095108, "grad_norm_var": 1.9449615353966362e-06, "learning_rate": 0.004280217502732707, "loss": 2.6192, "step": 15311 }, { "crossentropy": 2.574251174926758, "epoch": 0.5551044083526682, "grad_norm": 0.02638714201748371, "grad_norm_var": 1.5221264726809064e-06, "learning_rate": 0.004279642472474055, "loss": 2.597, "step": 15312 }, { "crossentropy": 2.4891135692596436, "epoch": 0.5551406612529002, "grad_norm": 0.027579711750149727, "grad_norm_var": 1.496003546604061e-06, "learning_rate": 0.004279067451944923, "loss": 2.5319, "step": 15313 }, { "crossentropy": 2.54707670211792, "epoch": 0.5551769141531323, "grad_norm": 0.02737734653055668, "grad_norm_var": 1.477367864791077e-06, "learning_rate": 0.00427849244115308, "loss": 2.515, "step": 15314 }, { "crossentropy": 2.698972702026367, "epoch": 0.5552131670533643, "grad_norm": 0.0270468071103096, "grad_norm_var": 1.4820410711814554e-06, "learning_rate": 0.0042779174401062926, "loss": 2.608, "step": 15315 }, { "crossentropy": 2.41640567779541, "epoch": 0.5552494199535963, "grad_norm": 0.032351668924093246, "grad_norm_var": 3.0478440768655674e-06, "learning_rate": 0.004277342448812325, "loss": 2.4224, "step": 15316 }, { "crossentropy": 2.4984967708587646, "epoch": 0.5552856728538283, "grad_norm": 0.02839650772511959, "grad_norm_var": 2.9691713404920877e-06, "learning_rate": 0.0042767674672789455, "loss": 2.5965, "step": 15317 }, { "crossentropy": 2.5371246337890625, "epoch": 0.5553219257540604, "grad_norm": 0.029657134786248207, "grad_norm_var": 2.587302318007798e-06, "learning_rate": 0.004276192495513919, "loss": 2.4898, "step": 15318 }, { "crossentropy": 2.5913400650024414, "epoch": 0.5553581786542924, "grad_norm": 0.027438826858997345, "grad_norm_var": 2.5912517005659825e-06, "learning_rate": 0.00427561753352501, "loss": 2.5903, "step": 15319 }, { "crossentropy": 2.5129570960998535, "epoch": 0.5553944315545244, "grad_norm": 0.026191992685198784, "grad_norm_var": 2.434203128856199e-06, "learning_rate": 0.004275042581319986, "loss": 2.5016, "step": 15320 }, { "crossentropy": 2.5252466201782227, "epoch": 0.5554306844547564, "grad_norm": 0.027905823662877083, "grad_norm_var": 2.3645079008477803e-06, "learning_rate": 0.0042744676389066105, "loss": 2.4831, "step": 15321 }, { "crossentropy": 2.6791493892669678, "epoch": 0.5554669373549884, "grad_norm": 0.027863940224051476, "grad_norm_var": 2.3228705890882775e-06, "learning_rate": 0.004273892706292652, "loss": 2.6184, "step": 15322 }, { "crossentropy": 2.5356383323669434, "epoch": 0.5555031902552204, "grad_norm": 0.02710556983947754, "grad_norm_var": 2.3490791134573216e-06, "learning_rate": 0.004273317783485875, "loss": 2.5253, "step": 15323 }, { "crossentropy": 2.396695613861084, "epoch": 0.5555394431554525, "grad_norm": 0.026327993720769882, "grad_norm_var": 2.4629017832455823e-06, "learning_rate": 0.004272742870494043, "loss": 2.4304, "step": 15324 }, { "crossentropy": 2.640087842941284, "epoch": 0.5555756960556845, "grad_norm": 0.02885565347969532, "grad_norm_var": 2.549379728010896e-06, "learning_rate": 0.004272167967324924, "loss": 2.6393, "step": 15325 }, { "crossentropy": 2.543278217315674, "epoch": 0.5556119489559165, "grad_norm": 0.028167566284537315, "grad_norm_var": 2.2827315780879023e-06, "learning_rate": 0.0042715930739862795, "loss": 2.4812, "step": 15326 }, { "crossentropy": 2.552445888519287, "epoch": 0.5556482018561485, "grad_norm": 0.026174291968345642, "grad_norm_var": 2.439062291500517e-06, "learning_rate": 0.004271018190485877, "loss": 2.5824, "step": 15327 }, { "crossentropy": 2.5492262840270996, "epoch": 0.5556844547563805, "grad_norm": 0.025939011946320534, "grad_norm_var": 2.536137276222018e-06, "learning_rate": 0.004270443316831479, "loss": 2.5763, "step": 15328 }, { "crossentropy": 2.533644437789917, "epoch": 0.5557207076566125, "grad_norm": 0.027418063953518867, "grad_norm_var": 2.5419523104983425e-06, "learning_rate": 0.00426986845303085, "loss": 2.579, "step": 15329 }, { "crossentropy": 2.6825339794158936, "epoch": 0.5557569605568445, "grad_norm": 0.02777366153895855, "grad_norm_var": 2.531356522766463e-06, "learning_rate": 0.004269293599091757, "loss": 2.6193, "step": 15330 }, { "crossentropy": 2.4679715633392334, "epoch": 0.5557932134570766, "grad_norm": 0.02763189561665058, "grad_norm_var": 2.4948985016746596e-06, "learning_rate": 0.004268718755021963, "loss": 2.5519, "step": 15331 }, { "crossentropy": 2.4113829135894775, "epoch": 0.5558294663573086, "grad_norm": 0.027731502428650856, "grad_norm_var": 1.0404758312059873e-06, "learning_rate": 0.004268143920829232, "loss": 2.4263, "step": 15332 }, { "crossentropy": 2.644050359725952, "epoch": 0.5558657192575406, "grad_norm": 0.026104390621185303, "grad_norm_var": 1.1059194303930045e-06, "learning_rate": 0.004267569096521328, "loss": 2.6354, "step": 15333 }, { "crossentropy": 2.637667655944824, "epoch": 0.5559019721577726, "grad_norm": 0.02624489925801754, "grad_norm_var": 8.035080519367636e-07, "learning_rate": 0.004266994282106015, "loss": 2.6125, "step": 15334 }, { "crossentropy": 2.6245651245117188, "epoch": 0.5559382250580046, "grad_norm": 0.026432931423187256, "grad_norm_var": 8.319922454042093e-07, "learning_rate": 0.004266419477591057, "loss": 2.6134, "step": 15335 }, { "crossentropy": 2.5419325828552246, "epoch": 0.5559744779582366, "grad_norm": 0.026103433221578598, "grad_norm_var": 8.434027645229728e-07, "learning_rate": 0.0042658446829842165, "loss": 2.4462, "step": 15336 }, { "crossentropy": 2.427703380584717, "epoch": 0.5560107308584686, "grad_norm": 0.026325060054659843, "grad_norm_var": 8.321158178563788e-07, "learning_rate": 0.004265269898293256, "loss": 2.3909, "step": 15337 }, { "crossentropy": 2.6109421253204346, "epoch": 0.5560469837587007, "grad_norm": 0.02673446014523506, "grad_norm_var": 7.836227444918727e-07, "learning_rate": 0.004264695123525943, "loss": 2.6408, "step": 15338 }, { "crossentropy": 2.6993567943573, "epoch": 0.5560832366589327, "grad_norm": 0.03100978396832943, "grad_norm_var": 1.8215039939381909e-06, "learning_rate": 0.004264120358690037, "loss": 2.649, "step": 15339 }, { "crossentropy": 2.6918437480926514, "epoch": 0.5561194895591647, "grad_norm": 0.026497693732380867, "grad_norm_var": 1.8038920317603544e-06, "learning_rate": 0.004263545603793303, "loss": 2.6585, "step": 15340 }, { "crossentropy": 2.569664239883423, "epoch": 0.5561557424593968, "grad_norm": 0.0261542946100235, "grad_norm_var": 1.6623866485342142e-06, "learning_rate": 0.004262970858843502, "loss": 2.5649, "step": 15341 }, { "crossentropy": 2.767009973526001, "epoch": 0.5561919953596288, "grad_norm": 0.029035184532403946, "grad_norm_var": 1.8412986169427283e-06, "learning_rate": 0.004262396123848401, "loss": 2.6802, "step": 15342 }, { "crossentropy": 2.506561517715454, "epoch": 0.5562282482598608, "grad_norm": 0.02932080812752247, "grad_norm_var": 2.0793059151732556e-06, "learning_rate": 0.004261821398815757, "loss": 2.5165, "step": 15343 }, { "crossentropy": 2.624580144882202, "epoch": 0.5562645011600929, "grad_norm": 0.02637619338929653, "grad_norm_var": 2.0131675646313404e-06, "learning_rate": 0.004261246683753337, "loss": 2.6043, "step": 15344 }, { "crossentropy": 2.617612361907959, "epoch": 0.5563007540603249, "grad_norm": 0.0270557664334774, "grad_norm_var": 2.0159526204388862e-06, "learning_rate": 0.004260671978668898, "loss": 2.5745, "step": 15345 }, { "crossentropy": 2.603760242462158, "epoch": 0.5563370069605569, "grad_norm": 0.026582300662994385, "grad_norm_var": 2.026760061029272e-06, "learning_rate": 0.004260097283570209, "loss": 2.5583, "step": 15346 }, { "crossentropy": 2.6448075771331787, "epoch": 0.5563732598607889, "grad_norm": 0.028439728543162346, "grad_norm_var": 2.1131206227107193e-06, "learning_rate": 0.004259522598465028, "loss": 2.546, "step": 15347 }, { "crossentropy": 2.488335132598877, "epoch": 0.5564095127610209, "grad_norm": 0.03039691597223282, "grad_norm_var": 2.7249709501916586e-06, "learning_rate": 0.004258947923361119, "loss": 2.5069, "step": 15348 }, { "crossentropy": 2.57340407371521, "epoch": 0.5564457656612529, "grad_norm": 0.028949033468961716, "grad_norm_var": 2.729504045062891e-06, "learning_rate": 0.004258373258266243, "loss": 2.6055, "step": 15349 }, { "crossentropy": 2.380241870880127, "epoch": 0.5564820185614849, "grad_norm": 0.026787031441926956, "grad_norm_var": 2.6496565218284486e-06, "learning_rate": 0.004257798603188162, "loss": 2.4737, "step": 15350 }, { "crossentropy": 2.5444793701171875, "epoch": 0.556518271461717, "grad_norm": 0.02715391479432583, "grad_norm_var": 2.5663448404564334e-06, "learning_rate": 0.004257223958134636, "loss": 2.4792, "step": 15351 }, { "crossentropy": 2.691162109375, "epoch": 0.556554524361949, "grad_norm": 0.027541261166334152, "grad_norm_var": 2.392811445124524e-06, "learning_rate": 0.004256649323113427, "loss": 2.6325, "step": 15352 }, { "crossentropy": 2.5068070888519287, "epoch": 0.556590777262181, "grad_norm": 0.027353458106517792, "grad_norm_var": 2.260443891311838e-06, "learning_rate": 0.004256074698132297, "loss": 2.5116, "step": 15353 }, { "crossentropy": 2.6029272079467773, "epoch": 0.556627030162413, "grad_norm": 0.029390133917331696, "grad_norm_var": 2.3109257383230793e-06, "learning_rate": 0.004255500083199008, "loss": 2.5167, "step": 15354 }, { "crossentropy": 2.6154398918151855, "epoch": 0.556663283062645, "grad_norm": 0.027540652081370354, "grad_norm_var": 1.672184796272976e-06, "learning_rate": 0.004254925478321321, "loss": 2.6401, "step": 15355 }, { "crossentropy": 2.516611099243164, "epoch": 0.556699535962877, "grad_norm": 0.02700723148882389, "grad_norm_var": 1.6008930839641704e-06, "learning_rate": 0.004254350883506995, "loss": 2.5052, "step": 15356 }, { "crossentropy": 2.4406557083129883, "epoch": 0.556735788863109, "grad_norm": 0.02654588595032692, "grad_norm_var": 1.523624737799894e-06, "learning_rate": 0.004253776298763791, "loss": 2.5125, "step": 15357 }, { "crossentropy": 2.519867420196533, "epoch": 0.556772041763341, "grad_norm": 0.025910452008247375, "grad_norm_var": 1.6368452932941553e-06, "learning_rate": 0.004253201724099471, "loss": 2.5126, "step": 15358 }, { "crossentropy": 2.5375304222106934, "epoch": 0.5568082946635731, "grad_norm": 0.027215149253606796, "grad_norm_var": 1.4440069315881417e-06, "learning_rate": 0.004252627159521798, "loss": 2.4957, "step": 15359 }, { "crossentropy": 2.4028828144073486, "epoch": 0.5568445475638051, "grad_norm": 0.0270868968218565, "grad_norm_var": 1.3676315502235963e-06, "learning_rate": 0.004252052605038527, "loss": 2.4443, "step": 15360 }, { "crossentropy": 2.5910916328430176, "epoch": 0.5568808004640371, "grad_norm": 0.027794182300567627, "grad_norm_var": 1.3520914026341243e-06, "learning_rate": 0.00425147806065742, "loss": 2.5515, "step": 15361 }, { "crossentropy": 2.701275587081909, "epoch": 0.5569170533642691, "grad_norm": 0.02853589691221714, "grad_norm_var": 1.3240012038009488e-06, "learning_rate": 0.004250903526386239, "loss": 2.6031, "step": 15362 }, { "crossentropy": 2.3800535202026367, "epoch": 0.5569533062645011, "grad_norm": 0.027658725157380104, "grad_norm_var": 1.288007965920673e-06, "learning_rate": 0.0042503290022327415, "loss": 2.4451, "step": 15363 }, { "crossentropy": 2.6168935298919678, "epoch": 0.5569895591647331, "grad_norm": 0.025899553671479225, "grad_norm_var": 9.224617038224575e-07, "learning_rate": 0.004249754488204689, "loss": 2.5973, "step": 15364 }, { "crossentropy": 2.5955188274383545, "epoch": 0.5570258120649652, "grad_norm": 0.02684858627617359, "grad_norm_var": 7.638477815624077e-07, "learning_rate": 0.004249179984309841, "loss": 2.5788, "step": 15365 }, { "crossentropy": 2.6762747764587402, "epoch": 0.5570620649651972, "grad_norm": 0.02682163193821907, "grad_norm_var": 7.61709181240575e-07, "learning_rate": 0.004248605490555954, "loss": 2.6499, "step": 15366 }, { "crossentropy": 2.58967661857605, "epoch": 0.5570983178654292, "grad_norm": 0.029988201335072517, "grad_norm_var": 1.2203008513601842e-06, "learning_rate": 0.004248031006950795, "loss": 2.5488, "step": 15367 }, { "crossentropy": 2.561685562133789, "epoch": 0.5571345707656613, "grad_norm": 0.027157310396432877, "grad_norm_var": 1.224643816101027e-06, "learning_rate": 0.004247456533502117, "loss": 2.5442, "step": 15368 }, { "crossentropy": 2.5599730014801025, "epoch": 0.5571708236658933, "grad_norm": 0.02559738978743553, "grad_norm_var": 1.4334568800052882e-06, "learning_rate": 0.0042468820702176785, "loss": 2.5106, "step": 15369 }, { "crossentropy": 2.4880712032318115, "epoch": 0.5572070765661253, "grad_norm": 0.02789117582142353, "grad_norm_var": 1.158621943026641e-06, "learning_rate": 0.004246307617105241, "loss": 2.5547, "step": 15370 }, { "crossentropy": 2.5518453121185303, "epoch": 0.5572433294663574, "grad_norm": 0.025944828987121582, "grad_norm_var": 1.2492801357723756e-06, "learning_rate": 0.004245733174172563, "loss": 2.5376, "step": 15371 }, { "crossentropy": 2.5021255016326904, "epoch": 0.5572795823665894, "grad_norm": 0.03238243609666824, "grad_norm_var": 2.9750182551355517e-06, "learning_rate": 0.004245158741427404, "loss": 2.4974, "step": 15372 }, { "crossentropy": 2.71186900138855, "epoch": 0.5573158352668214, "grad_norm": 0.02669602818787098, "grad_norm_var": 2.958229774626247e-06, "learning_rate": 0.004244584318877522, "loss": 2.7413, "step": 15373 }, { "crossentropy": 2.4772088527679443, "epoch": 0.5573520881670534, "grad_norm": 0.026389501988887787, "grad_norm_var": 2.873324800539643e-06, "learning_rate": 0.0042440099065306726, "loss": 2.5727, "step": 15374 }, { "crossentropy": 2.672794818878174, "epoch": 0.5573883410672854, "grad_norm": 0.028389861807227135, "grad_norm_var": 2.915861507025288e-06, "learning_rate": 0.004243435504394618, "loss": 2.6154, "step": 15375 }, { "crossentropy": 2.4557077884674072, "epoch": 0.5574245939675174, "grad_norm": 0.026943515986204147, "grad_norm_var": 2.9263369283271643e-06, "learning_rate": 0.0042428611124771184, "loss": 2.4316, "step": 15376 }, { "crossentropy": 2.6365621089935303, "epoch": 0.5574608468677494, "grad_norm": 0.0271796602755785, "grad_norm_var": 2.930642807488073e-06, "learning_rate": 0.0042422867307859245, "loss": 2.5718, "step": 15377 }, { "crossentropy": 2.4300262928009033, "epoch": 0.5574970997679815, "grad_norm": 0.027409130707383156, "grad_norm_var": 2.8574095822929266e-06, "learning_rate": 0.004241712359328799, "loss": 2.4426, "step": 15378 }, { "crossentropy": 2.555680513381958, "epoch": 0.5575333526682135, "grad_norm": 0.026914622634649277, "grad_norm_var": 2.8712914648463633e-06, "learning_rate": 0.0042411379981135, "loss": 2.5773, "step": 15379 }, { "crossentropy": 2.484630584716797, "epoch": 0.5575696055684455, "grad_norm": 0.02585967630147934, "grad_norm_var": 2.879386457116138e-06, "learning_rate": 0.0042405636471477826, "loss": 2.5261, "step": 15380 }, { "crossentropy": 2.6879825592041016, "epoch": 0.5576058584686775, "grad_norm": 0.02609839476644993, "grad_norm_var": 2.9698008784101075e-06, "learning_rate": 0.0042399893064394055, "loss": 2.5478, "step": 15381 }, { "crossentropy": 2.5554046630859375, "epoch": 0.5576421113689095, "grad_norm": 0.027254005894064903, "grad_norm_var": 2.9507964168282626e-06, "learning_rate": 0.004239414975996126, "loss": 2.5192, "step": 15382 }, { "crossentropy": 2.5039236545562744, "epoch": 0.5576783642691415, "grad_norm": 0.02671797201037407, "grad_norm_var": 2.4823698461177844e-06, "learning_rate": 0.004238840655825703, "loss": 2.5392, "step": 15383 }, { "crossentropy": 2.725785970687866, "epoch": 0.5577146171693735, "grad_norm": 0.025866709649562836, "grad_norm_var": 2.589791395796514e-06, "learning_rate": 0.004238266345935892, "loss": 2.5936, "step": 15384 }, { "crossentropy": 2.583375930786133, "epoch": 0.5577508700696056, "grad_norm": 0.026566404849290848, "grad_norm_var": 2.4548636282345347e-06, "learning_rate": 0.00423769204633445, "loss": 2.5801, "step": 15385 }, { "crossentropy": 2.6060988903045654, "epoch": 0.5577871229698376, "grad_norm": 0.026065338402986526, "grad_norm_var": 2.484364496810794e-06, "learning_rate": 0.0042371177570291315, "loss": 2.5754, "step": 15386 }, { "crossentropy": 2.5582735538482666, "epoch": 0.5578233758700696, "grad_norm": 0.028478488326072693, "grad_norm_var": 2.5148026194081927e-06, "learning_rate": 0.004236543478027698, "loss": 2.5514, "step": 15387 }, { "crossentropy": 2.46728777885437, "epoch": 0.5578596287703016, "grad_norm": 0.028173651546239853, "grad_norm_var": 7.140971037368377e-07, "learning_rate": 0.004235969209337902, "loss": 2.4693, "step": 15388 }, { "crossentropy": 2.582868814468384, "epoch": 0.5578958816705336, "grad_norm": 0.027447247877717018, "grad_norm_var": 7.251627910366427e-07, "learning_rate": 0.004235394950967502, "loss": 2.5387, "step": 15389 }, { "crossentropy": 2.4018239974975586, "epoch": 0.5579321345707656, "grad_norm": 0.026611143723130226, "grad_norm_var": 7.106455562996683e-07, "learning_rate": 0.004234820702924251, "loss": 2.5331, "step": 15390 }, { "crossentropy": 2.408907651901245, "epoch": 0.5579683874709976, "grad_norm": 0.0265395175665617, "grad_norm_var": 5.813622763247422e-07, "learning_rate": 0.004234246465215911, "loss": 2.4976, "step": 15391 }, { "crossentropy": 2.512458324432373, "epoch": 0.5580046403712297, "grad_norm": 0.03022698499262333, "grad_norm_var": 1.2817479084436512e-06, "learning_rate": 0.004233672237850234, "loss": 2.5563, "step": 15392 }, { "crossentropy": 2.6703591346740723, "epoch": 0.5580408932714617, "grad_norm": 0.02623390406370163, "grad_norm_var": 1.3261003810310085e-06, "learning_rate": 0.0042330980208349755, "loss": 2.5989, "step": 15393 }, { "crossentropy": 2.406278610229492, "epoch": 0.5580771461716937, "grad_norm": 0.027312852442264557, "grad_norm_var": 1.3217993011610992e-06, "learning_rate": 0.004232523814177892, "loss": 2.397, "step": 15394 }, { "crossentropy": 2.5003533363342285, "epoch": 0.5581133990719258, "grad_norm": 0.026421427726745605, "grad_norm_var": 1.3441242400879494e-06, "learning_rate": 0.00423194961788674, "loss": 2.4846, "step": 15395 }, { "crossentropy": 2.6991939544677734, "epoch": 0.5581496519721578, "grad_norm": 0.027184022590517998, "grad_norm_var": 1.2537784161679855e-06, "learning_rate": 0.004231375431969274, "loss": 2.6634, "step": 15396 }, { "crossentropy": 2.5468504428863525, "epoch": 0.5581859048723898, "grad_norm": 0.026673894375562668, "grad_norm_var": 1.199549552732768e-06, "learning_rate": 0.004230801256433246, "loss": 2.5138, "step": 15397 }, { "crossentropy": 2.5602009296417236, "epoch": 0.5582221577726219, "grad_norm": 0.027008019387722015, "grad_norm_var": 1.1986360624289254e-06, "learning_rate": 0.004230227091286417, "loss": 2.5511, "step": 15398 }, { "crossentropy": 2.584151268005371, "epoch": 0.5582584106728539, "grad_norm": 0.0264887697994709, "grad_norm_var": 1.213455979321556e-06, "learning_rate": 0.00422965293653654, "loss": 2.5135, "step": 15399 }, { "crossentropy": 2.6615149974823, "epoch": 0.5582946635730859, "grad_norm": 0.028875522315502167, "grad_norm_var": 1.2920630171962678e-06, "learning_rate": 0.004229078792191368, "loss": 2.5742, "step": 15400 }, { "crossentropy": 2.582082748413086, "epoch": 0.5583309164733179, "grad_norm": 0.02924908697605133, "grad_norm_var": 1.4904787378636295e-06, "learning_rate": 0.004228504658258658, "loss": 2.5182, "step": 15401 }, { "crossentropy": 2.5198967456817627, "epoch": 0.5583671693735499, "grad_norm": 0.02909328043460846, "grad_norm_var": 1.509784582665374e-06, "learning_rate": 0.004227930534746162, "loss": 2.5414, "step": 15402 }, { "crossentropy": 2.5329747200012207, "epoch": 0.5584034222737819, "grad_norm": 0.02845843881368637, "grad_norm_var": 1.507531079671789e-06, "learning_rate": 0.004227356421661637, "loss": 2.5454, "step": 15403 }, { "crossentropy": 2.618596315383911, "epoch": 0.5584396751740139, "grad_norm": 0.02704516053199768, "grad_norm_var": 1.5045501941592879e-06, "learning_rate": 0.004226782319012835, "loss": 2.5786, "step": 15404 }, { "crossentropy": 2.386352300643921, "epoch": 0.558475928074246, "grad_norm": 0.028238527476787567, "grad_norm_var": 1.5323853624821248e-06, "learning_rate": 0.004226208226807511, "loss": 2.4751, "step": 15405 }, { "crossentropy": 2.6268391609191895, "epoch": 0.558512180974478, "grad_norm": 0.02865718863904476, "grad_norm_var": 1.523230730458326e-06, "learning_rate": 0.004225634145053419, "loss": 2.619, "step": 15406 }, { "crossentropy": 2.715358018875122, "epoch": 0.55854843387471, "grad_norm": 0.02702246978878975, "grad_norm_var": 1.461041876722963e-06, "learning_rate": 0.004225060073758314, "loss": 2.6456, "step": 15407 }, { "crossentropy": 2.525346040725708, "epoch": 0.558584686774942, "grad_norm": 0.02826559916138649, "grad_norm_var": 1.0568032677529236e-06, "learning_rate": 0.004224486012929947, "loss": 2.5097, "step": 15408 }, { "crossentropy": 2.601135015487671, "epoch": 0.558620939675174, "grad_norm": 0.02850579470396042, "grad_norm_var": 9.53687458298859e-07, "learning_rate": 0.004223911962576077, "loss": 2.5399, "step": 15409 }, { "crossentropy": 2.5962135791778564, "epoch": 0.558657192575406, "grad_norm": 0.027354903519153595, "grad_norm_var": 9.51171741056002e-07, "learning_rate": 0.004223337922704451, "loss": 2.5516, "step": 15410 }, { "crossentropy": 2.5475220680236816, "epoch": 0.558693445475638, "grad_norm": 0.026578150689601898, "grad_norm_var": 9.242364962922315e-07, "learning_rate": 0.0042227638933228255, "loss": 2.556, "step": 15411 }, { "crossentropy": 2.4948480129241943, "epoch": 0.55872969837587, "grad_norm": 0.02622446045279503, "grad_norm_var": 1.0597841142339585e-06, "learning_rate": 0.004222189874438953, "loss": 2.4957, "step": 15412 }, { "crossentropy": 2.410736083984375, "epoch": 0.5587659512761021, "grad_norm": 0.02651134878396988, "grad_norm_var": 1.0844044195386227e-06, "learning_rate": 0.004221615866060586, "loss": 2.4598, "step": 15413 }, { "crossentropy": 2.486928939819336, "epoch": 0.5588022041763341, "grad_norm": 0.02804880030453205, "grad_norm_var": 1.0528119103498287e-06, "learning_rate": 0.004221041868195478, "loss": 2.4861, "step": 15414 }, { "crossentropy": 2.595669746398926, "epoch": 0.5588384570765661, "grad_norm": 0.02707868441939354, "grad_norm_var": 9.72323832036659e-07, "learning_rate": 0.004220467880851383, "loss": 2.5772, "step": 15415 }, { "crossentropy": 2.354329824447632, "epoch": 0.5588747099767981, "grad_norm": 0.02620287798345089, "grad_norm_var": 1.0445719218693845e-06, "learning_rate": 0.004219893904036051, "loss": 2.4373, "step": 15416 }, { "crossentropy": 2.64605450630188, "epoch": 0.5589109628770301, "grad_norm": 0.028427358716726303, "grad_norm_var": 9.124951484773171e-07, "learning_rate": 0.0042193199377572365, "loss": 2.6199, "step": 15417 }, { "crossentropy": 2.5226547718048096, "epoch": 0.5589472157772621, "grad_norm": 0.02896830253303051, "grad_norm_var": 8.887054921485431e-07, "learning_rate": 0.004218745982022692, "loss": 2.599, "step": 15418 }, { "crossentropy": 2.287229061126709, "epoch": 0.5589834686774942, "grad_norm": 0.027140222489833832, "grad_norm_var": 8.462992192109313e-07, "learning_rate": 0.004218172036840166, "loss": 2.4955, "step": 15419 }, { "crossentropy": 2.6201515197753906, "epoch": 0.5590197215777262, "grad_norm": 0.027634397149086, "grad_norm_var": 8.309397500865882e-07, "learning_rate": 0.004217598102217415, "loss": 2.6226, "step": 15420 }, { "crossentropy": 2.5484728813171387, "epoch": 0.5590559744779582, "grad_norm": 0.027237050235271454, "grad_norm_var": 8.021783793867032e-07, "learning_rate": 0.004217024178162187, "loss": 2.6041, "step": 15421 }, { "crossentropy": 2.515324592590332, "epoch": 0.5590922273781903, "grad_norm": 0.026239855214953423, "grad_norm_var": 7.915540383559202e-07, "learning_rate": 0.004216450264682237, "loss": 2.4566, "step": 15422 }, { "crossentropy": 2.6608707904815674, "epoch": 0.5591284802784223, "grad_norm": 0.027761084958910942, "grad_norm_var": 7.943783442757122e-07, "learning_rate": 0.004215876361785316, "loss": 2.6576, "step": 15423 }, { "crossentropy": 2.5591280460357666, "epoch": 0.5591647331786543, "grad_norm": 0.02979770489037037, "grad_norm_var": 1.1207358656416254e-06, "learning_rate": 0.004215302469479174, "loss": 2.5705, "step": 15424 }, { "crossentropy": 2.5290369987487793, "epoch": 0.5592009860788864, "grad_norm": 0.027589235454797745, "grad_norm_var": 1.048117457966696e-06, "learning_rate": 0.004214728587771562, "loss": 2.4906, "step": 15425 }, { "crossentropy": 2.5242996215820312, "epoch": 0.5592372389791184, "grad_norm": 0.026201730594038963, "grad_norm_var": 1.1419547754888536e-06, "learning_rate": 0.004214154716670234, "loss": 2.4664, "step": 15426 }, { "crossentropy": 2.617746114730835, "epoch": 0.5592734918793504, "grad_norm": 0.026749521493911743, "grad_norm_var": 1.1260950176231737e-06, "learning_rate": 0.0042135808561829385, "loss": 2.5373, "step": 15427 }, { "crossentropy": 2.4738800525665283, "epoch": 0.5593097447795824, "grad_norm": 0.02600168250501156, "grad_norm_var": 1.1630243667237172e-06, "learning_rate": 0.004213007006317427, "loss": 2.449, "step": 15428 }, { "crossentropy": 2.460312604904175, "epoch": 0.5593459976798144, "grad_norm": 0.025720100849866867, "grad_norm_var": 1.2905645443962722e-06, "learning_rate": 0.004212433167081448, "loss": 2.5355, "step": 15429 }, { "crossentropy": 2.6329243183135986, "epoch": 0.5593822505800464, "grad_norm": 0.02599945105612278, "grad_norm_var": 1.3484232117675725e-06, "learning_rate": 0.004211859338482757, "loss": 2.6351, "step": 15430 }, { "crossentropy": 2.654320478439331, "epoch": 0.5594185034802784, "grad_norm": 0.02776295877993107, "grad_norm_var": 1.3691895008898729e-06, "learning_rate": 0.0042112855205291, "loss": 2.6404, "step": 15431 }, { "crossentropy": 2.5329296588897705, "epoch": 0.5594547563805105, "grad_norm": 0.026837067678570747, "grad_norm_var": 1.3087773068674493e-06, "learning_rate": 0.004210711713228231, "loss": 2.4763, "step": 15432 }, { "crossentropy": 2.464653968811035, "epoch": 0.5594910092807425, "grad_norm": 0.02584325708448887, "grad_norm_var": 1.321929250930563e-06, "learning_rate": 0.004210137916587896, "loss": 2.4796, "step": 15433 }, { "crossentropy": 2.5353305339813232, "epoch": 0.5595272621809745, "grad_norm": 0.026765696704387665, "grad_norm_var": 1.074325632276204e-06, "learning_rate": 0.004209564130615849, "loss": 2.5255, "step": 15434 }, { "crossentropy": 2.5463147163391113, "epoch": 0.5595635150812065, "grad_norm": 0.02597041241824627, "grad_norm_var": 1.1309740051222706e-06, "learning_rate": 0.004208990355319837, "loss": 2.578, "step": 15435 }, { "crossentropy": 2.589308500289917, "epoch": 0.5595997679814385, "grad_norm": 0.025794697925448418, "grad_norm_var": 1.1579347491039258e-06, "learning_rate": 0.00420841659070761, "loss": 2.5763, "step": 15436 }, { "crossentropy": 2.663783311843872, "epoch": 0.5596360208816705, "grad_norm": 0.02840767428278923, "grad_norm_var": 1.3169540393303254e-06, "learning_rate": 0.004207842836786917, "loss": 2.6406, "step": 15437 }, { "crossentropy": 2.5376181602478027, "epoch": 0.5596722737819025, "grad_norm": 0.031422048807144165, "grad_norm_var": 2.5806320974513497e-06, "learning_rate": 0.004207269093565511, "loss": 2.548, "step": 15438 }, { "crossentropy": 2.425441026687622, "epoch": 0.5597085266821346, "grad_norm": 0.027843618765473366, "grad_norm_var": 2.5876282388493286e-06, "learning_rate": 0.0042066953610511385, "loss": 2.4259, "step": 15439 }, { "crossentropy": 2.544522285461426, "epoch": 0.5597447795823666, "grad_norm": 0.02571098692715168, "grad_norm_var": 2.1991845306687055e-06, "learning_rate": 0.0042061216392515486, "loss": 2.5699, "step": 15440 }, { "crossentropy": 2.533564805984497, "epoch": 0.5597810324825986, "grad_norm": 0.026941338554024696, "grad_norm_var": 2.167068285651334e-06, "learning_rate": 0.004205547928174491, "loss": 2.492, "step": 15441 }, { "crossentropy": 2.641812801361084, "epoch": 0.5598172853828306, "grad_norm": 0.028058314695954323, "grad_norm_var": 2.2162650698665283e-06, "learning_rate": 0.004204974227827713, "loss": 2.5448, "step": 15442 }, { "crossentropy": 2.5158979892730713, "epoch": 0.5598535382830626, "grad_norm": 0.028113320469856262, "grad_norm_var": 2.288910182756754e-06, "learning_rate": 0.004204400538218967, "loss": 2.586, "step": 15443 }, { "crossentropy": 2.5353901386260986, "epoch": 0.5598897911832946, "grad_norm": 0.02628384344279766, "grad_norm_var": 2.253523673227545e-06, "learning_rate": 0.004203826859355999, "loss": 2.4869, "step": 15444 }, { "crossentropy": 2.509298801422119, "epoch": 0.5599260440835266, "grad_norm": 0.026771342381834984, "grad_norm_var": 2.130275561011176e-06, "learning_rate": 0.004203253191246555, "loss": 2.4953, "step": 15445 }, { "crossentropy": 2.753242254257202, "epoch": 0.5599622969837587, "grad_norm": 0.02902250923216343, "grad_norm_var": 2.234523788564348e-06, "learning_rate": 0.004202679533898387, "loss": 2.6684, "step": 15446 }, { "crossentropy": 2.6267781257629395, "epoch": 0.5599985498839907, "grad_norm": 0.029054908081889153, "grad_norm_var": 2.4105289713932204e-06, "learning_rate": 0.004202105887319242, "loss": 2.6698, "step": 15447 }, { "crossentropy": 2.5138676166534424, "epoch": 0.5600348027842227, "grad_norm": 0.02565363235771656, "grad_norm_var": 2.5912367786223393e-06, "learning_rate": 0.004201532251516867, "loss": 2.4433, "step": 15448 }, { "crossentropy": 2.7415857315063477, "epoch": 0.5600710556844548, "grad_norm": 0.026658790186047554, "grad_norm_var": 2.4685738684804282e-06, "learning_rate": 0.004200958626499012, "loss": 2.63, "step": 15449 }, { "crossentropy": 2.4627466201782227, "epoch": 0.5601073085846868, "grad_norm": 0.026910407468676567, "grad_norm_var": 2.4575557630564324e-06, "learning_rate": 0.004200385012273422, "loss": 2.4954, "step": 15450 }, { "crossentropy": 2.7043745517730713, "epoch": 0.5601435614849188, "grad_norm": 0.027181699872016907, "grad_norm_var": 2.316172359428705e-06, "learning_rate": 0.004199811408847848, "loss": 2.6373, "step": 15451 }, { "crossentropy": 2.4760663509368896, "epoch": 0.5601798143851509, "grad_norm": 0.02641911990940571, "grad_norm_var": 2.199453311601112e-06, "learning_rate": 0.004199237816230035, "loss": 2.484, "step": 15452 }, { "crossentropy": 2.4336068630218506, "epoch": 0.5602160672853829, "grad_norm": 0.02591538056731224, "grad_norm_var": 2.295468299272294e-06, "learning_rate": 0.004198664234427728, "loss": 2.5076, "step": 15453 }, { "crossentropy": 2.420584201812744, "epoch": 0.5602523201856149, "grad_norm": 0.02646648697555065, "grad_norm_var": 1.1546647031696606e-06, "learning_rate": 0.004198090663448677, "loss": 2.509, "step": 15454 }, { "crossentropy": 2.5379931926727295, "epoch": 0.5602885730858469, "grad_norm": 0.026707885786890984, "grad_norm_var": 1.117051091221533e-06, "learning_rate": 0.00419751710330063, "loss": 2.4986, "step": 15455 }, { "crossentropy": 2.4044151306152344, "epoch": 0.5603248259860789, "grad_norm": 0.026031160727143288, "grad_norm_var": 1.068777226512551e-06, "learning_rate": 0.004196943553991332, "loss": 2.4382, "step": 15456 }, { "crossentropy": 2.558582067489624, "epoch": 0.5603610788863109, "grad_norm": 0.027277100831270218, "grad_norm_var": 1.0726650549510348e-06, "learning_rate": 0.0041963700155285295, "loss": 2.5359, "step": 15457 }, { "crossentropy": 2.6138086318969727, "epoch": 0.5603973317865429, "grad_norm": 0.026592714712023735, "grad_norm_var": 1.0065282402275827e-06, "learning_rate": 0.004195796487919968, "loss": 2.6054, "step": 15458 }, { "crossentropy": 2.599020004272461, "epoch": 0.560433584686775, "grad_norm": 0.0262534748762846, "grad_norm_var": 9.320726918777504e-07, "learning_rate": 0.004195222971173398, "loss": 2.6024, "step": 15459 }, { "crossentropy": 2.5616414546966553, "epoch": 0.560469837587007, "grad_norm": 0.02595326118171215, "grad_norm_var": 9.627571425351813e-07, "learning_rate": 0.004194649465296564, "loss": 2.5485, "step": 15460 }, { "crossentropy": 2.5264220237731934, "epoch": 0.560506090487239, "grad_norm": 0.026279591023921967, "grad_norm_var": 9.800361879210827e-07, "learning_rate": 0.004194075970297208, "loss": 2.565, "step": 15461 }, { "crossentropy": 2.6816110610961914, "epoch": 0.560542343387471, "grad_norm": 0.025861212983727455, "grad_norm_var": 6.567330007033999e-07, "learning_rate": 0.004193502486183081, "loss": 2.6047, "step": 15462 }, { "crossentropy": 2.545675754547119, "epoch": 0.560578596287703, "grad_norm": 0.027344783768057823, "grad_norm_var": 2.7429548606344494e-07, "learning_rate": 0.004192929012961926, "loss": 2.6319, "step": 15463 }, { "crossentropy": 2.5715713500976562, "epoch": 0.560614849187935, "grad_norm": 0.026753054931759834, "grad_norm_var": 2.3029186531024066e-07, "learning_rate": 0.00419235555064149, "loss": 2.5148, "step": 15464 }, { "crossentropy": 2.4791903495788574, "epoch": 0.560651102088167, "grad_norm": 0.026840290054678917, "grad_norm_var": 2.3527670806415083e-07, "learning_rate": 0.004191782099229518, "loss": 2.528, "step": 15465 }, { "crossentropy": 2.558936357498169, "epoch": 0.5606873549883991, "grad_norm": 0.0277811698615551, "grad_norm_var": 3.245995997690528e-07, "learning_rate": 0.004191208658733753, "loss": 2.5555, "step": 15466 }, { "crossentropy": 2.624783754348755, "epoch": 0.5607236078886311, "grad_norm": 0.027628548443317413, "grad_norm_var": 3.7151934833289506e-07, "learning_rate": 0.0041906352291619445, "loss": 2.6168, "step": 15467 }, { "crossentropy": 2.670532464981079, "epoch": 0.5607598607888631, "grad_norm": 0.02745025046169758, "grad_norm_var": 4.0876173071229387e-07, "learning_rate": 0.004190061810521837, "loss": 2.6124, "step": 15468 }, { "crossentropy": 2.3892662525177, "epoch": 0.5607961136890951, "grad_norm": 0.028170878067612648, "grad_norm_var": 4.919511161206753e-07, "learning_rate": 0.004189488402821171, "loss": 2.546, "step": 15469 }, { "crossentropy": 2.551579475402832, "epoch": 0.5608323665893271, "grad_norm": 0.02654142677783966, "grad_norm_var": 4.886000426232064e-07, "learning_rate": 0.004188915006067696, "loss": 2.5906, "step": 15470 }, { "crossentropy": 2.6662449836730957, "epoch": 0.5608686194895591, "grad_norm": 0.027254994958639145, "grad_norm_var": 4.975484092746447e-07, "learning_rate": 0.004188341620269154, "loss": 2.5923, "step": 15471 }, { "crossentropy": 2.5153841972351074, "epoch": 0.5609048723897911, "grad_norm": 0.026660822331905365, "grad_norm_var": 4.5141058489330847e-07, "learning_rate": 0.00418776824543329, "loss": 2.5078, "step": 15472 }, { "crossentropy": 2.4957058429718018, "epoch": 0.5609411252900232, "grad_norm": 0.027148401364684105, "grad_norm_var": 4.4623601688261135e-07, "learning_rate": 0.004187194881567848, "loss": 2.479, "step": 15473 }, { "crossentropy": 2.519503116607666, "epoch": 0.5609773781902552, "grad_norm": 0.026709774509072304, "grad_norm_var": 4.421842925303454e-07, "learning_rate": 0.004186621528680573, "loss": 2.6067, "step": 15474 }, { "crossentropy": 2.329486608505249, "epoch": 0.5610136310904872, "grad_norm": 0.025699783116579056, "grad_norm_var": 5.101454608585995e-07, "learning_rate": 0.004186048186779209, "loss": 2.3977, "step": 15475 }, { "crossentropy": 2.5161354541778564, "epoch": 0.5610498839907193, "grad_norm": 0.02729833871126175, "grad_norm_var": 4.5703750182202187e-07, "learning_rate": 0.004185474855871501, "loss": 2.4941, "step": 15476 }, { "crossentropy": 2.6486423015594482, "epoch": 0.5610861368909513, "grad_norm": 0.02671082690358162, "grad_norm_var": 4.29310486996642e-07, "learning_rate": 0.0041849015359651884, "loss": 2.6259, "step": 15477 }, { "crossentropy": 2.527867317199707, "epoch": 0.5611223897911833, "grad_norm": 0.026896407827734947, "grad_norm_var": 3.4035974669939054e-07, "learning_rate": 0.004184328227068018, "loss": 2.506, "step": 15478 }, { "crossentropy": 2.4304323196411133, "epoch": 0.5611586426914154, "grad_norm": 0.028388893231749535, "grad_norm_var": 4.4875231033520795e-07, "learning_rate": 0.004183754929187734, "loss": 2.4562, "step": 15479 }, { "crossentropy": 2.600039005279541, "epoch": 0.5611948955916474, "grad_norm": 0.025912079960107803, "grad_norm_var": 5.341974343816274e-07, "learning_rate": 0.0041831816423320775, "loss": 2.545, "step": 15480 }, { "crossentropy": 2.55885648727417, "epoch": 0.5612311484918794, "grad_norm": 0.026170531287789345, "grad_norm_var": 5.825955226301659e-07, "learning_rate": 0.004182608366508793, "loss": 2.5205, "step": 15481 }, { "crossentropy": 2.558518886566162, "epoch": 0.5612674013921114, "grad_norm": 0.027589164674282074, "grad_norm_var": 5.655781811799426e-07, "learning_rate": 0.004182035101725621, "loss": 2.5658, "step": 15482 }, { "crossentropy": 2.5647568702697754, "epoch": 0.5613036542923434, "grad_norm": 0.028768830001354218, "grad_norm_var": 7.402100606696543e-07, "learning_rate": 0.004181461847990307, "loss": 2.5985, "step": 15483 }, { "crossentropy": 2.5645437240600586, "epoch": 0.5613399071925754, "grad_norm": 0.029221002012491226, "grad_norm_var": 1.0222500529496447e-06, "learning_rate": 0.004180888605310594, "loss": 2.5761, "step": 15484 }, { "crossentropy": 2.639368772506714, "epoch": 0.5613761600928074, "grad_norm": 0.03750978782773018, "grad_norm_var": 7.68662964667105e-06, "learning_rate": 0.004180315373694225, "loss": 2.5989, "step": 15485 }, { "crossentropy": 2.518946886062622, "epoch": 0.5614124129930395, "grad_norm": 0.030830074101686478, "grad_norm_var": 8.127882060994619e-06, "learning_rate": 0.004179742153148939, "loss": 2.4595, "step": 15486 }, { "crossentropy": 2.658168077468872, "epoch": 0.5614486658932715, "grad_norm": 0.02555890753865242, "grad_norm_var": 8.487034920240836e-06, "learning_rate": 0.00417916894368248, "loss": 2.5847, "step": 15487 }, { "crossentropy": 2.641683578491211, "epoch": 0.5614849187935035, "grad_norm": 0.028528083115816116, "grad_norm_var": 8.385953679798186e-06, "learning_rate": 0.00417859574530259, "loss": 2.6766, "step": 15488 }, { "crossentropy": 2.567481756210327, "epoch": 0.5615211716937355, "grad_norm": 0.02730054222047329, "grad_norm_var": 8.368932404365079e-06, "learning_rate": 0.00417802255801701, "loss": 2.526, "step": 15489 }, { "crossentropy": 2.397261619567871, "epoch": 0.5615574245939675, "grad_norm": 0.0264538936316967, "grad_norm_var": 8.419374503396138e-06, "learning_rate": 0.004177449381833484, "loss": 2.454, "step": 15490 }, { "crossentropy": 2.761138677597046, "epoch": 0.5615936774941995, "grad_norm": 0.027862051501870155, "grad_norm_var": 8.033344667161e-06, "learning_rate": 0.004176876216759752, "loss": 2.7212, "step": 15491 }, { "crossentropy": 2.507416248321533, "epoch": 0.5616299303944315, "grad_norm": 0.026617521420121193, "grad_norm_var": 8.143025041106727e-06, "learning_rate": 0.004176303062803556, "loss": 2.5075, "step": 15492 }, { "crossentropy": 2.577104330062866, "epoch": 0.5616661832946636, "grad_norm": 0.027801858261227608, "grad_norm_var": 8.008804257639612e-06, "learning_rate": 0.0041757299199726376, "loss": 2.5817, "step": 15493 }, { "crossentropy": 2.4794373512268066, "epoch": 0.5617024361948956, "grad_norm": 0.027695178985595703, "grad_norm_var": 7.908449846533624e-06, "learning_rate": 0.004175156788274737, "loss": 2.4211, "step": 15494 }, { "crossentropy": 2.6267905235290527, "epoch": 0.5617386890951276, "grad_norm": 0.025733422487974167, "grad_norm_var": 8.304604891678696e-06, "learning_rate": 0.004174583667717596, "loss": 2.5897, "step": 15495 }, { "crossentropy": 2.3792080879211426, "epoch": 0.5617749419953596, "grad_norm": 0.026482317596673965, "grad_norm_var": 8.158800510135371e-06, "learning_rate": 0.004174010558308955, "loss": 2.4021, "step": 15496 }, { "crossentropy": 2.5824944972991943, "epoch": 0.5618111948955916, "grad_norm": 0.02616404928267002, "grad_norm_var": 8.160498972710816e-06, "learning_rate": 0.004173437460056554, "loss": 2.6496, "step": 15497 }, { "crossentropy": 2.61476731300354, "epoch": 0.5618474477958236, "grad_norm": 0.02711373381316662, "grad_norm_var": 8.209055444972854e-06, "learning_rate": 0.004172864372968135, "loss": 2.5953, "step": 15498 }, { "crossentropy": 2.7759501934051514, "epoch": 0.5618837006960556, "grad_norm": 0.029482988640666008, "grad_norm_var": 8.304373103621544e-06, "learning_rate": 0.004172291297051438, "loss": 2.6174, "step": 15499 }, { "crossentropy": 2.586111068725586, "epoch": 0.5619199535962877, "grad_norm": 0.02748003602027893, "grad_norm_var": 8.244550975835387e-06, "learning_rate": 0.004171718232314203, "loss": 2.5233, "step": 15500 }, { "crossentropy": 2.6191766262054443, "epoch": 0.5619562064965197, "grad_norm": 0.027073809877038002, "grad_norm_var": 1.8723146474621788e-06, "learning_rate": 0.00417114517876417, "loss": 2.5773, "step": 15501 }, { "crossentropy": 2.6611721515655518, "epoch": 0.5619924593967517, "grad_norm": 0.025289496406912804, "grad_norm_var": 1.2467656764550967e-06, "learning_rate": 0.004170572136409082, "loss": 2.6443, "step": 15502 }, { "crossentropy": 2.6718223094940186, "epoch": 0.5620287122969838, "grad_norm": 0.02702944353222847, "grad_norm_var": 1.091546310444261e-06, "learning_rate": 0.004169999105256673, "loss": 2.6438, "step": 15503 }, { "crossentropy": 2.6208784580230713, "epoch": 0.5620649651972158, "grad_norm": 0.026924405246973038, "grad_norm_var": 9.537192979234733e-07, "learning_rate": 0.004169426085314687, "loss": 2.5796, "step": 15504 }, { "crossentropy": 2.662968158721924, "epoch": 0.5621012180974478, "grad_norm": 0.027143508195877075, "grad_norm_var": 9.496283372628645e-07, "learning_rate": 0.00416885307659086, "loss": 2.6804, "step": 15505 }, { "crossentropy": 2.4687485694885254, "epoch": 0.5621374709976799, "grad_norm": 0.02692844159901142, "grad_norm_var": 9.277741891620574e-07, "learning_rate": 0.004168280079092935, "loss": 2.5437, "step": 15506 }, { "crossentropy": 2.4902961254119873, "epoch": 0.5621737238979119, "grad_norm": 0.026000209152698517, "grad_norm_var": 9.431849136246409e-07, "learning_rate": 0.00416770709282865, "loss": 2.5103, "step": 15507 }, { "crossentropy": 2.4314379692077637, "epoch": 0.5622099767981439, "grad_norm": 0.02697536163032055, "grad_norm_var": 9.36039213737111e-07, "learning_rate": 0.004167134117805743, "loss": 2.5444, "step": 15508 }, { "crossentropy": 2.5702402591705322, "epoch": 0.5622462296983759, "grad_norm": 0.026405541226267815, "grad_norm_var": 9.00676394673984e-07, "learning_rate": 0.004166561154031955, "loss": 2.5268, "step": 15509 }, { "crossentropy": 2.6232991218566895, "epoch": 0.5622824825986079, "grad_norm": 0.026644403114914894, "grad_norm_var": 8.540911129427715e-07, "learning_rate": 0.004165988201515023, "loss": 2.5695, "step": 15510 }, { "crossentropy": 2.5869855880737305, "epoch": 0.5623187354988399, "grad_norm": 0.02649904042482376, "grad_norm_var": 7.813939449336499e-07, "learning_rate": 0.004165415260262686, "loss": 2.5303, "step": 15511 }, { "crossentropy": 2.599583387374878, "epoch": 0.5623549883990719, "grad_norm": 0.025932546705007553, "grad_norm_var": 8.274051225496167e-07, "learning_rate": 0.004164842330282682, "loss": 2.6205, "step": 15512 }, { "crossentropy": 2.639909267425537, "epoch": 0.562391241299304, "grad_norm": 0.02687862701714039, "grad_norm_var": 7.970183395460216e-07, "learning_rate": 0.004164269411582749, "loss": 2.6102, "step": 15513 }, { "crossentropy": 2.495556354522705, "epoch": 0.562427494199536, "grad_norm": 0.026392893865704536, "grad_norm_var": 8.053569631884105e-07, "learning_rate": 0.004163696504170626, "loss": 2.5637, "step": 15514 }, { "crossentropy": 2.58556866645813, "epoch": 0.562463747099768, "grad_norm": 0.026538001373410225, "grad_norm_var": 3.007907954363642e-07, "learning_rate": 0.004163123608054051, "loss": 2.5884, "step": 15515 }, { "crossentropy": 2.5218544006347656, "epoch": 0.5625, "grad_norm": 0.025621050968766212, "grad_norm_var": 3.0694987550126256e-07, "learning_rate": 0.0041625507232407625, "loss": 2.5227, "step": 15516 }, { "crossentropy": 2.6033616065979004, "epoch": 0.562536252900232, "grad_norm": 0.027102788910269737, "grad_norm_var": 3.0915264920203246e-07, "learning_rate": 0.004161977849738497, "loss": 2.5861, "step": 15517 }, { "crossentropy": 2.545377731323242, "epoch": 0.562572505800464, "grad_norm": 0.027147352695465088, "grad_norm_var": 2.2028682040360567e-07, "learning_rate": 0.004161404987554993, "loss": 2.5436, "step": 15518 }, { "crossentropy": 2.595102548599243, "epoch": 0.562608758700696, "grad_norm": 0.028600113466382027, "grad_norm_var": 4.5703266088187313e-07, "learning_rate": 0.004160832136697987, "loss": 2.5269, "step": 15519 }, { "crossentropy": 2.4314401149749756, "epoch": 0.5626450116009281, "grad_norm": 0.025995811447501183, "grad_norm_var": 4.872758399438685e-07, "learning_rate": 0.004160259297175216, "loss": 2.4555, "step": 15520 }, { "crossentropy": 2.5833609104156494, "epoch": 0.5626812645011601, "grad_norm": 0.027824345976114273, "grad_norm_var": 5.587452125776672e-07, "learning_rate": 0.004159686468994416, "loss": 2.6199, "step": 15521 }, { "crossentropy": 2.517735719680786, "epoch": 0.5627175174013921, "grad_norm": 0.027391135692596436, "grad_norm_var": 5.851139242261823e-07, "learning_rate": 0.0041591136521633276, "loss": 2.5459, "step": 15522 }, { "crossentropy": 2.635577440261841, "epoch": 0.5627537703016241, "grad_norm": 0.026510950177907944, "grad_norm_var": 5.505737023988692e-07, "learning_rate": 0.004158540846689684, "loss": 2.6344, "step": 15523 }, { "crossentropy": 2.6311161518096924, "epoch": 0.5627900232018561, "grad_norm": 0.027184810489416122, "grad_norm_var": 5.588062431147728e-07, "learning_rate": 0.004157968052581225, "loss": 2.5882, "step": 15524 }, { "crossentropy": 2.596381902694702, "epoch": 0.5628262761020881, "grad_norm": 0.027654416859149933, "grad_norm_var": 5.919619482837776e-07, "learning_rate": 0.004157395269845684, "loss": 2.5215, "step": 15525 }, { "crossentropy": 2.5742764472961426, "epoch": 0.5628625290023201, "grad_norm": 0.029652297496795654, "grad_norm_var": 1.0669929057401657e-06, "learning_rate": 0.004156822498490798, "loss": 2.5805, "step": 15526 }, { "crossentropy": 2.5252318382263184, "epoch": 0.5628987819025522, "grad_norm": 0.027772139757871628, "grad_norm_var": 1.0734295710832504e-06, "learning_rate": 0.004156249738524307, "loss": 2.5812, "step": 15527 }, { "crossentropy": 2.5131285190582275, "epoch": 0.5629350348027842, "grad_norm": 0.02665020525455475, "grad_norm_var": 9.903241339419134e-07, "learning_rate": 0.004155676989953941, "loss": 2.5089, "step": 15528 }, { "crossentropy": 2.5656018257141113, "epoch": 0.5629712877030162, "grad_norm": 0.027288509532809258, "grad_norm_var": 9.842278477661571e-07, "learning_rate": 0.004155104252787438, "loss": 2.5681, "step": 15529 }, { "crossentropy": 2.7052996158599854, "epoch": 0.5630075406032483, "grad_norm": 0.02855370007455349, "grad_norm_var": 1.0412285552006816e-06, "learning_rate": 0.004154531527032536, "loss": 2.613, "step": 15530 }, { "crossentropy": 2.3322296142578125, "epoch": 0.5630437935034803, "grad_norm": 0.02692544460296631, "grad_norm_var": 1.0090262629084204e-06, "learning_rate": 0.004153958812696967, "loss": 2.3506, "step": 15531 }, { "crossentropy": 2.715627908706665, "epoch": 0.5630800464037123, "grad_norm": 0.028249122202396393, "grad_norm_var": 8.288342308709981e-07, "learning_rate": 0.004153386109788469, "loss": 2.728, "step": 15532 }, { "crossentropy": 2.6879963874816895, "epoch": 0.5631162993039444, "grad_norm": 0.02894168347120285, "grad_norm_var": 9.350792318016735e-07, "learning_rate": 0.004152813418314777, "loss": 2.6221, "step": 15533 }, { "crossentropy": 2.491732597351074, "epoch": 0.5631525522041764, "grad_norm": 0.02608124352991581, "grad_norm_var": 1.0770513441214393e-06, "learning_rate": 0.004152240738283624, "loss": 2.4416, "step": 15534 }, { "crossentropy": 2.5724036693573, "epoch": 0.5631888051044084, "grad_norm": 0.026367759332060814, "grad_norm_var": 1.0848045068013635e-06, "learning_rate": 0.0041516680697027495, "loss": 2.6051, "step": 15535 }, { "crossentropy": 2.5715081691741943, "epoch": 0.5632250580046404, "grad_norm": 0.02703680656850338, "grad_norm_var": 9.5205048206066e-07, "learning_rate": 0.004151095412579883, "loss": 2.5977, "step": 15536 }, { "crossentropy": 2.688891649246216, "epoch": 0.5632613109048724, "grad_norm": 0.027557017281651497, "grad_norm_var": 9.451444921447505e-07, "learning_rate": 0.00415052276692276, "loss": 2.6525, "step": 15537 }, { "crossentropy": 2.6894006729125977, "epoch": 0.5632975638051044, "grad_norm": 0.02738906256854534, "grad_norm_var": 9.451716953273824e-07, "learning_rate": 0.004149950132739118, "loss": 2.7078, "step": 15538 }, { "crossentropy": 2.5421268939971924, "epoch": 0.5633338167053364, "grad_norm": 0.026735451072454453, "grad_norm_var": 9.190618490798139e-07, "learning_rate": 0.004149377510036688, "loss": 2.5477, "step": 15539 }, { "crossentropy": 2.5158822536468506, "epoch": 0.5633700696055685, "grad_norm": 0.026544239372015, "grad_norm_var": 9.718394944310903e-07, "learning_rate": 0.004148804898823207, "loss": 2.525, "step": 15540 }, { "crossentropy": 2.528022050857544, "epoch": 0.5634063225058005, "grad_norm": 0.0265897735953331, "grad_norm_var": 1.0154300149763182e-06, "learning_rate": 0.004148232299106406, "loss": 2.5308, "step": 15541 }, { "crossentropy": 2.590675115585327, "epoch": 0.5634425754060325, "grad_norm": 0.027103004977107048, "grad_norm_var": 6.546496691595475e-07, "learning_rate": 0.0041476597108940205, "loss": 2.5646, "step": 15542 }, { "crossentropy": 2.5964760780334473, "epoch": 0.5634788283062645, "grad_norm": 0.04000027850270271, "grad_norm_var": 1.0873309181229768e-05, "learning_rate": 0.004147087134193785, "loss": 2.5437, "step": 15543 }, { "crossentropy": 2.486703395843506, "epoch": 0.5635150812064965, "grad_norm": 0.029162272810935974, "grad_norm_var": 1.0815332521104243e-05, "learning_rate": 0.004146514569013434, "loss": 2.4828, "step": 15544 }, { "crossentropy": 2.5646536350250244, "epoch": 0.5635513341067285, "grad_norm": 0.028493130579590797, "grad_norm_var": 1.0766399216863055e-05, "learning_rate": 0.004145942015360696, "loss": 2.5161, "step": 15545 }, { "crossentropy": 2.5505850315093994, "epoch": 0.5635875870069605, "grad_norm": 0.028949938714504242, "grad_norm_var": 1.0793148628536114e-05, "learning_rate": 0.004145369473243308, "loss": 2.5478, "step": 15546 }, { "crossentropy": 2.508415699005127, "epoch": 0.5636238399071926, "grad_norm": 0.028637230396270752, "grad_norm_var": 1.067217213202401e-05, "learning_rate": 0.004144796942669003, "loss": 2.4724, "step": 15547 }, { "crossentropy": 2.682091236114502, "epoch": 0.5636600928074246, "grad_norm": 0.030749637633562088, "grad_norm_var": 1.1024365687939047e-05, "learning_rate": 0.004144224423645512, "loss": 2.6478, "step": 15548 }, { "crossentropy": 2.6051852703094482, "epoch": 0.5636963457076566, "grad_norm": 0.0304579921066761, "grad_norm_var": 1.1253084670364413e-05, "learning_rate": 0.004143651916180569, "loss": 2.4193, "step": 15549 }, { "crossentropy": 2.49554443359375, "epoch": 0.5637325986078886, "grad_norm": 0.028648069128394127, "grad_norm_var": 1.0797392901043982e-05, "learning_rate": 0.004143079420281906, "loss": 2.5129, "step": 15550 }, { "crossentropy": 2.684446096420288, "epoch": 0.5637688515081206, "grad_norm": 0.026967015117406845, "grad_norm_var": 1.0627388544454513e-05, "learning_rate": 0.004142506935957256, "loss": 2.5808, "step": 15551 }, { "crossentropy": 2.7092490196228027, "epoch": 0.5638051044083526, "grad_norm": 0.02646828629076481, "grad_norm_var": 1.0782290978190769e-05, "learning_rate": 0.004141934463214354, "loss": 2.6575, "step": 15552 }, { "crossentropy": 2.6368627548217773, "epoch": 0.5638413573085846, "grad_norm": 0.02776281349360943, "grad_norm_var": 1.0751427288022546e-05, "learning_rate": 0.004141362002060925, "loss": 2.5943, "step": 15553 }, { "crossentropy": 2.531778573989868, "epoch": 0.5638776102088167, "grad_norm": 0.026122771203517914, "grad_norm_var": 1.1088370330608133e-05, "learning_rate": 0.0041407895525047075, "loss": 2.5221, "step": 15554 }, { "crossentropy": 2.4957144260406494, "epoch": 0.5639138631090487, "grad_norm": 0.024977359920740128, "grad_norm_var": 1.17448763105454e-05, "learning_rate": 0.004140217114553431, "loss": 2.4124, "step": 15555 }, { "crossentropy": 2.6373538970947266, "epoch": 0.5639501160092807, "grad_norm": 0.028110835701227188, "grad_norm_var": 1.1468417601721138e-05, "learning_rate": 0.004139644688214827, "loss": 2.6939, "step": 15556 }, { "crossentropy": 2.447840690612793, "epoch": 0.5639863689095128, "grad_norm": 0.029976896941661835, "grad_norm_var": 1.1232430859883082e-05, "learning_rate": 0.004139072273496626, "loss": 2.4613, "step": 15557 }, { "crossentropy": 2.660219430923462, "epoch": 0.5640226218097448, "grad_norm": 0.026316387578845024, "grad_norm_var": 1.1460806109103702e-05, "learning_rate": 0.0041384998704065605, "loss": 2.5926, "step": 15558 }, { "crossentropy": 2.5584306716918945, "epoch": 0.5640588747099768, "grad_norm": 0.026605095714330673, "grad_norm_var": 2.7829971399614584e-06, "learning_rate": 0.004137927478952363, "loss": 2.4099, "step": 15559 }, { "crossentropy": 2.6549317836761475, "epoch": 0.5640951276102089, "grad_norm": 0.027879226952791214, "grad_norm_var": 2.691389935538625e-06, "learning_rate": 0.004137355099141764, "loss": 2.599, "step": 15560 }, { "crossentropy": 2.5363450050354004, "epoch": 0.5641313805104409, "grad_norm": 0.02681584842503071, "grad_norm_var": 2.7446745948284894e-06, "learning_rate": 0.004136782730982491, "loss": 2.4734, "step": 15561 }, { "crossentropy": 2.635138511657715, "epoch": 0.5641676334106729, "grad_norm": 0.028204599395394325, "grad_norm_var": 2.6691247081623083e-06, "learning_rate": 0.004136210374482278, "loss": 2.5887, "step": 15562 }, { "crossentropy": 2.425536632537842, "epoch": 0.5642038863109049, "grad_norm": 0.025997687131166458, "grad_norm_var": 2.807721717208491e-06, "learning_rate": 0.004135638029648855, "loss": 2.449, "step": 15563 }, { "crossentropy": 2.5932278633117676, "epoch": 0.5642401392111369, "grad_norm": 0.028000814840197563, "grad_norm_var": 2.136149754904101e-06, "learning_rate": 0.004135065696489953, "loss": 2.5546, "step": 15564 }, { "crossentropy": 2.5690762996673584, "epoch": 0.5642763921113689, "grad_norm": 0.027186721563339233, "grad_norm_var": 1.4960263167072684e-06, "learning_rate": 0.004134493375013299, "loss": 2.6613, "step": 15565 }, { "crossentropy": 2.4739091396331787, "epoch": 0.5643126450116009, "grad_norm": 0.026955386623740196, "grad_norm_var": 1.3601383594923858e-06, "learning_rate": 0.004133921065226627, "loss": 2.4686, "step": 15566 }, { "crossentropy": 2.4974591732025146, "epoch": 0.564348897911833, "grad_norm": 0.026810212060809135, "grad_norm_var": 1.3654324591994004e-06, "learning_rate": 0.004133348767137664, "loss": 2.5505, "step": 15567 }, { "crossentropy": 2.506727933883667, "epoch": 0.564385150812065, "grad_norm": 0.02804739587008953, "grad_norm_var": 1.380499252432293e-06, "learning_rate": 0.0041327764807541426, "loss": 2.5562, "step": 15568 }, { "crossentropy": 2.498616933822632, "epoch": 0.564421403712297, "grad_norm": 0.026436887681484222, "grad_norm_var": 1.39717808548428e-06, "learning_rate": 0.004132204206083792, "loss": 2.5263, "step": 15569 }, { "crossentropy": 2.5390069484710693, "epoch": 0.564457656612529, "grad_norm": 0.028912419453263283, "grad_norm_var": 1.5004548976377263e-06, "learning_rate": 0.004131631943134339, "loss": 2.5752, "step": 15570 }, { "crossentropy": 2.5966250896453857, "epoch": 0.564493909512761, "grad_norm": 0.02669086679816246, "grad_norm_var": 1.147119562134452e-06, "learning_rate": 0.004131059691913515, "loss": 2.596, "step": 15571 }, { "crossentropy": 2.421907663345337, "epoch": 0.564530162412993, "grad_norm": 0.027214327827095985, "grad_norm_var": 1.1164718470279503e-06, "learning_rate": 0.004130487452429048, "loss": 2.4758, "step": 15572 }, { "crossentropy": 2.4938583374023438, "epoch": 0.564566415313225, "grad_norm": 0.026392444968223572, "grad_norm_var": 6.774903939145514e-07, "learning_rate": 0.0041299152246886675, "loss": 2.5338, "step": 15573 }, { "crossentropy": 2.5973386764526367, "epoch": 0.5646026682134571, "grad_norm": 0.02865375578403473, "grad_norm_var": 7.578596293390805e-07, "learning_rate": 0.004129343008700101, "loss": 2.5479, "step": 15574 }, { "crossentropy": 2.554774761199951, "epoch": 0.5646389211136891, "grad_norm": 0.02845863252878189, "grad_norm_var": 8.007901129489584e-07, "learning_rate": 0.0041287708044710795, "loss": 2.6395, "step": 15575 }, { "crossentropy": 2.6196177005767822, "epoch": 0.5646751740139211, "grad_norm": 0.027119334787130356, "grad_norm_var": 7.899539740530495e-07, "learning_rate": 0.004128198612009331, "loss": 2.607, "step": 15576 }, { "crossentropy": 2.394364595413208, "epoch": 0.5647114269141531, "grad_norm": 0.025817016139626503, "grad_norm_var": 9.259200445236108e-07, "learning_rate": 0.004127626431322584, "loss": 2.4817, "step": 15577 }, { "crossentropy": 2.437711238861084, "epoch": 0.5647476798143851, "grad_norm": 0.027105895802378654, "grad_norm_var": 8.697505636208469e-07, "learning_rate": 0.004127054262418565, "loss": 2.5422, "step": 15578 }, { "crossentropy": 2.5861082077026367, "epoch": 0.5647839327146171, "grad_norm": 0.02630542777478695, "grad_norm_var": 8.247979878358285e-07, "learning_rate": 0.0041264821053050025, "loss": 2.5518, "step": 15579 }, { "crossentropy": 2.4590044021606445, "epoch": 0.5648201856148491, "grad_norm": 0.026575062423944473, "grad_norm_var": 8.103936846230268e-07, "learning_rate": 0.004125909959989626, "loss": 2.5286, "step": 15580 }, { "crossentropy": 2.5038394927978516, "epoch": 0.5648564385150812, "grad_norm": 0.026521997526288033, "grad_norm_var": 8.363161126412651e-07, "learning_rate": 0.00412533782648016, "loss": 2.5588, "step": 15581 }, { "crossentropy": 2.6520566940307617, "epoch": 0.5648926914153132, "grad_norm": 0.028180407360196114, "grad_norm_var": 9.022301649006288e-07, "learning_rate": 0.004124765704784334, "loss": 2.6406, "step": 15582 }, { "crossentropy": 2.6020753383636475, "epoch": 0.5649289443155452, "grad_norm": 0.02811528742313385, "grad_norm_var": 9.403968046245436e-07, "learning_rate": 0.0041241935949098765, "loss": 2.5872, "step": 15583 }, { "crossentropy": 2.5898427963256836, "epoch": 0.5649651972157773, "grad_norm": 0.02729484625160694, "grad_norm_var": 8.992132029423186e-07, "learning_rate": 0.004123621496864514, "loss": 2.5834, "step": 15584 }, { "crossentropy": 2.6155495643615723, "epoch": 0.5650014501160093, "grad_norm": 0.026735395193099976, "grad_norm_var": 8.729306083624447e-07, "learning_rate": 0.004123049410655972, "loss": 2.6047, "step": 15585 }, { "crossentropy": 2.53806209564209, "epoch": 0.5650377030162413, "grad_norm": 0.025396838784217834, "grad_norm_var": 8.688660641288966e-07, "learning_rate": 0.004122477336291981, "loss": 2.4928, "step": 15586 }, { "crossentropy": 2.583937168121338, "epoch": 0.5650739559164734, "grad_norm": 0.02791006490588188, "grad_norm_var": 9.056484298508901e-07, "learning_rate": 0.004121905273780263, "loss": 2.6071, "step": 15587 }, { "crossentropy": 2.5812320709228516, "epoch": 0.5651102088167054, "grad_norm": 0.027773750945925713, "grad_norm_var": 9.328185997818274e-07, "learning_rate": 0.004121333223128547, "loss": 2.553, "step": 15588 }, { "crossentropy": 2.597217321395874, "epoch": 0.5651464617169374, "grad_norm": 0.027175772935152054, "grad_norm_var": 8.923330813448051e-07, "learning_rate": 0.004120761184344559, "loss": 2.5543, "step": 15589 }, { "crossentropy": 2.5146727561950684, "epoch": 0.5651827146171694, "grad_norm": 0.025614116340875626, "grad_norm_var": 8.790774631184702e-07, "learning_rate": 0.004120189157436025, "loss": 2.4501, "step": 15590 }, { "crossentropy": 2.49108624458313, "epoch": 0.5652189675174014, "grad_norm": 0.026162447407841682, "grad_norm_var": 7.639451160212456e-07, "learning_rate": 0.004119617142410673, "loss": 2.545, "step": 15591 }, { "crossentropy": 2.436540365219116, "epoch": 0.5652552204176334, "grad_norm": 0.02623867057263851, "grad_norm_var": 7.822870546657686e-07, "learning_rate": 0.004119045139276228, "loss": 2.4605, "step": 15592 }, { "crossentropy": 2.494711399078369, "epoch": 0.5652914733178654, "grad_norm": 0.027845488861203194, "grad_norm_var": 7.71515963541774e-07, "learning_rate": 0.004118473148040413, "loss": 2.5241, "step": 15593 }, { "crossentropy": 2.652663230895996, "epoch": 0.5653277262180975, "grad_norm": 0.027058329433202744, "grad_norm_var": 7.70570140157939e-07, "learning_rate": 0.0041179011687109595, "loss": 2.5293, "step": 15594 }, { "crossentropy": 2.4448189735412598, "epoch": 0.5653639791183295, "grad_norm": 0.02786540426313877, "grad_norm_var": 7.924457414033743e-07, "learning_rate": 0.004117329201295586, "loss": 2.5182, "step": 15595 }, { "crossentropy": 2.7013657093048096, "epoch": 0.5654002320185615, "grad_norm": 0.02636753022670746, "grad_norm_var": 8.076982741057892e-07, "learning_rate": 0.004116757245802023, "loss": 2.6176, "step": 15596 }, { "crossentropy": 2.548351526260376, "epoch": 0.5654364849187935, "grad_norm": 0.028068846091628075, "grad_norm_var": 8.553537939122331e-07, "learning_rate": 0.004116185302237992, "loss": 2.5512, "step": 15597 }, { "crossentropy": 2.622710704803467, "epoch": 0.5654727378190255, "grad_norm": 0.02756587229669094, "grad_norm_var": 7.914712978276025e-07, "learning_rate": 0.00411561337061122, "loss": 2.6768, "step": 15598 }, { "crossentropy": 2.581200122833252, "epoch": 0.5655089907192575, "grad_norm": 0.026414187625050545, "grad_norm_var": 7.362183239288522e-07, "learning_rate": 0.0041150414509294325, "loss": 2.5552, "step": 15599 }, { "crossentropy": 2.528212785720825, "epoch": 0.5655452436194895, "grad_norm": 0.027453092858195305, "grad_norm_var": 7.446803336578899e-07, "learning_rate": 0.0041144695432003536, "loss": 2.5992, "step": 15600 }, { "crossentropy": 2.563859462738037, "epoch": 0.5655814965197216, "grad_norm": 0.026068463921546936, "grad_norm_var": 7.94041418708521e-07, "learning_rate": 0.004113897647431706, "loss": 2.5918, "step": 15601 }, { "crossentropy": 2.5528790950775146, "epoch": 0.5656177494199536, "grad_norm": 0.026302941143512726, "grad_norm_var": 6.593818530072621e-07, "learning_rate": 0.004113325763631217, "loss": 2.576, "step": 15602 }, { "crossentropy": 2.640073537826538, "epoch": 0.5656540023201856, "grad_norm": 0.026900460943579674, "grad_norm_var": 5.996130548917751e-07, "learning_rate": 0.004112753891806609, "loss": 2.6602, "step": 15603 }, { "crossentropy": 2.7201266288757324, "epoch": 0.5656902552204176, "grad_norm": 0.028554657474160194, "grad_norm_var": 7.256086674714904e-07, "learning_rate": 0.004112182031965605, "loss": 2.5681, "step": 15604 }, { "crossentropy": 2.5418436527252197, "epoch": 0.5657265081206496, "grad_norm": 0.027578411623835564, "grad_norm_var": 7.46330718336e-07, "learning_rate": 0.004111610184115929, "loss": 2.5534, "step": 15605 }, { "crossentropy": 2.6001622676849365, "epoch": 0.5657627610208816, "grad_norm": 0.02729007788002491, "grad_norm_var": 6.113690539039443e-07, "learning_rate": 0.004111038348265307, "loss": 2.5983, "step": 15606 }, { "crossentropy": 2.586782932281494, "epoch": 0.5657990139211136, "grad_norm": 0.027214236557483673, "grad_norm_var": 5.478470767128572e-07, "learning_rate": 0.004110466524421462, "loss": 2.5595, "step": 15607 }, { "crossentropy": 2.611015796661377, "epoch": 0.5658352668213457, "grad_norm": 0.025360573083162308, "grad_norm_var": 7.0556563368356e-07, "learning_rate": 0.004109894712592116, "loss": 2.5552, "step": 15608 }, { "crossentropy": 2.5245978832244873, "epoch": 0.5658715197215777, "grad_norm": 0.026619739830493927, "grad_norm_var": 6.807837486101035e-07, "learning_rate": 0.004109322912784992, "loss": 2.5408, "step": 15609 }, { "crossentropy": 2.5840628147125244, "epoch": 0.5659077726218097, "grad_norm": 0.027847860008478165, "grad_norm_var": 7.213914443650413e-07, "learning_rate": 0.004108751125007812, "loss": 2.7016, "step": 15610 }, { "crossentropy": 2.5597591400146484, "epoch": 0.5659440255220418, "grad_norm": 0.02623073011636734, "grad_norm_var": 7.198377372396568e-07, "learning_rate": 0.004108179349268305, "loss": 2.5638, "step": 15611 }, { "crossentropy": 2.4841408729553223, "epoch": 0.5659802784222738, "grad_norm": 0.02662818692624569, "grad_norm_var": 7.024556870838799e-07, "learning_rate": 0.004107607585574187, "loss": 2.5182, "step": 15612 }, { "crossentropy": 2.4606549739837646, "epoch": 0.5660165313225058, "grad_norm": 0.027074892073869705, "grad_norm_var": 6.233655735402484e-07, "learning_rate": 0.004107035833933182, "loss": 2.5189, "step": 15613 }, { "crossentropy": 2.652738094329834, "epoch": 0.5660527842227379, "grad_norm": 0.028510602191090584, "grad_norm_var": 7.574782216093177e-07, "learning_rate": 0.004106464094353014, "loss": 2.593, "step": 15614 }, { "crossentropy": 2.6926190853118896, "epoch": 0.5660890371229699, "grad_norm": 0.028792576864361763, "grad_norm_var": 9.242787624476219e-07, "learning_rate": 0.004105892366841404, "loss": 2.5876, "step": 15615 }, { "crossentropy": 2.55999493598938, "epoch": 0.5661252900232019, "grad_norm": 0.027751978486776352, "grad_norm_var": 9.418722277269719e-07, "learning_rate": 0.004105320651406075, "loss": 2.4945, "step": 15616 }, { "crossentropy": 2.31923770904541, "epoch": 0.5661615429234339, "grad_norm": 0.025947945192456245, "grad_norm_var": 9.60487205987383e-07, "learning_rate": 0.004104748948054748, "loss": 2.5541, "step": 15617 }, { "crossentropy": 2.504055976867676, "epoch": 0.5661977958236659, "grad_norm": 0.02663624845445156, "grad_norm_var": 9.292146273521951e-07, "learning_rate": 0.004104177256795144, "loss": 2.5067, "step": 15618 }, { "crossentropy": 2.5086910724639893, "epoch": 0.5662340487238979, "grad_norm": 0.02722245082259178, "grad_norm_var": 9.235345152058353e-07, "learning_rate": 0.004103605577634989, "loss": 2.4689, "step": 15619 }, { "crossentropy": 2.6134889125823975, "epoch": 0.56627030162413, "grad_norm": 0.026709729805588722, "grad_norm_var": 8.039771372913655e-07, "learning_rate": 0.0041030339105819985, "loss": 2.524, "step": 15620 }, { "crossentropy": 2.4799959659576416, "epoch": 0.566306554524362, "grad_norm": 0.027924224734306335, "grad_norm_var": 8.340396634469536e-07, "learning_rate": 0.004102462255643895, "loss": 2.5438, "step": 15621 }, { "crossentropy": 2.4685888290405273, "epoch": 0.566342807424594, "grad_norm": 0.02585522085428238, "grad_norm_var": 9.282886613832623e-07, "learning_rate": 0.004101890612828403, "loss": 2.5325, "step": 15622 }, { "crossentropy": 2.5370075702667236, "epoch": 0.566379060324826, "grad_norm": 0.026455864310264587, "grad_norm_var": 9.446391855907436e-07, "learning_rate": 0.0041013189821432405, "loss": 2.5144, "step": 15623 }, { "crossentropy": 2.5936074256896973, "epoch": 0.566415313225058, "grad_norm": 0.027933817356824875, "grad_norm_var": 8.0524820792414e-07, "learning_rate": 0.004100747363596129, "loss": 2.5574, "step": 15624 }, { "crossentropy": 2.6185462474823, "epoch": 0.56645156612529, "grad_norm": 0.028125347569584846, "grad_norm_var": 8.43714318870499e-07, "learning_rate": 0.004100175757194789, "loss": 2.5958, "step": 15625 }, { "crossentropy": 2.3337485790252686, "epoch": 0.566487819025522, "grad_norm": 0.026649612933397293, "grad_norm_var": 8.344156089751341e-07, "learning_rate": 0.0040996041629469395, "loss": 2.396, "step": 15626 }, { "crossentropy": 2.512006998062134, "epoch": 0.566524071925754, "grad_norm": 0.027538606896996498, "grad_norm_var": 7.804801767725851e-07, "learning_rate": 0.004099032580860303, "loss": 2.5047, "step": 15627 }, { "crossentropy": 2.6366584300994873, "epoch": 0.5665603248259861, "grad_norm": 0.026744414120912552, "grad_norm_var": 7.719233266322561e-07, "learning_rate": 0.004098461010942602, "loss": 2.6489, "step": 15628 }, { "crossentropy": 2.562655210494995, "epoch": 0.5665965777262181, "grad_norm": 0.02587081491947174, "grad_norm_var": 8.893794289827611e-07, "learning_rate": 0.00409788945320155, "loss": 2.555, "step": 15629 }, { "crossentropy": 2.600739002227783, "epoch": 0.5666328306264501, "grad_norm": 0.02625405229628086, "grad_norm_var": 8.033286191463784e-07, "learning_rate": 0.0040973179076448695, "loss": 2.5077, "step": 15630 }, { "crossentropy": 2.6058475971221924, "epoch": 0.5666690835266821, "grad_norm": 0.028044993057847023, "grad_norm_var": 6.621508707346415e-07, "learning_rate": 0.004096746374280282, "loss": 2.6018, "step": 15631 }, { "crossentropy": 2.612522840499878, "epoch": 0.5667053364269141, "grad_norm": 0.026339024305343628, "grad_norm_var": 6.413194798203139e-07, "learning_rate": 0.004096174853115505, "loss": 2.5417, "step": 15632 }, { "crossentropy": 2.375603199005127, "epoch": 0.5667415893271461, "grad_norm": 0.02639215625822544, "grad_norm_var": 5.978102587587819e-07, "learning_rate": 0.004095603344158257, "loss": 2.5271, "step": 15633 }, { "crossentropy": 2.6200759410858154, "epoch": 0.5667778422273781, "grad_norm": 0.0282316654920578, "grad_norm_var": 6.968460951966548e-07, "learning_rate": 0.004095031847416259, "loss": 2.5496, "step": 15634 }, { "crossentropy": 2.6051185131073, "epoch": 0.5668140951276102, "grad_norm": 0.025844553485512733, "grad_norm_var": 7.779929105589199e-07, "learning_rate": 0.00409446036289723, "loss": 2.5922, "step": 15635 }, { "crossentropy": 2.636223316192627, "epoch": 0.5668503480278422, "grad_norm": 0.029583115130662918, "grad_norm_var": 1.2088083943724454e-06, "learning_rate": 0.004093888890608889, "loss": 2.5779, "step": 15636 }, { "crossentropy": 2.5100131034851074, "epoch": 0.5668866009280742, "grad_norm": 0.02729220688343048, "grad_norm_var": 1.1653046140748408e-06, "learning_rate": 0.004093317430558951, "loss": 2.5265, "step": 15637 }, { "crossentropy": 2.476457118988037, "epoch": 0.5669228538283063, "grad_norm": 0.02839077077805996, "grad_norm_var": 1.1556841508058065e-06, "learning_rate": 0.004092745982755139, "loss": 2.4795, "step": 15638 }, { "crossentropy": 2.6170103549957275, "epoch": 0.5669591067285383, "grad_norm": 0.027230022475123405, "grad_norm_var": 1.1131635097675712e-06, "learning_rate": 0.004092174547205169, "loss": 2.5947, "step": 15639 }, { "crossentropy": 2.604133367538452, "epoch": 0.5669953596287703, "grad_norm": 0.025943217799067497, "grad_norm_var": 1.1870411774699316e-06, "learning_rate": 0.004091603123916758, "loss": 2.5564, "step": 15640 }, { "crossentropy": 2.435253381729126, "epoch": 0.5670316125290024, "grad_norm": 0.026868926361203194, "grad_norm_var": 1.1230911425137568e-06, "learning_rate": 0.004091031712897627, "loss": 2.4197, "step": 15641 }, { "crossentropy": 2.594503402709961, "epoch": 0.5670678654292344, "grad_norm": 0.026102816686034203, "grad_norm_var": 1.1728738337323774e-06, "learning_rate": 0.0040904603141554904, "loss": 2.4811, "step": 15642 }, { "crossentropy": 2.3842177391052246, "epoch": 0.5671041183294664, "grad_norm": 0.026334205642342567, "grad_norm_var": 1.1837802491892456e-06, "learning_rate": 0.004089888927698069, "loss": 2.4886, "step": 15643 }, { "crossentropy": 2.492063045501709, "epoch": 0.5671403712296984, "grad_norm": 0.025316547602415085, "grad_norm_var": 1.3535217920939678e-06, "learning_rate": 0.004089317553533078, "loss": 2.5001, "step": 15644 }, { "crossentropy": 2.7181079387664795, "epoch": 0.5671766241299304, "grad_norm": 0.02647389844059944, "grad_norm_var": 1.2953095369032848e-06, "learning_rate": 0.004088746191668238, "loss": 2.7109, "step": 15645 }, { "crossentropy": 2.4805808067321777, "epoch": 0.5672128770301624, "grad_norm": 0.027019182220101357, "grad_norm_var": 1.2644565556529004e-06, "learning_rate": 0.004088174842111261, "loss": 2.4778, "step": 15646 }, { "crossentropy": 2.3411264419555664, "epoch": 0.5672491299303944, "grad_norm": 0.02973000332713127, "grad_norm_var": 1.6850093287612411e-06, "learning_rate": 0.004087603504869867, "loss": 2.3935, "step": 15647 }, { "crossentropy": 2.4895336627960205, "epoch": 0.5672853828306265, "grad_norm": 0.02807731181383133, "grad_norm_var": 1.704843657293721e-06, "learning_rate": 0.004087032179951771, "loss": 2.5412, "step": 15648 }, { "crossentropy": 2.360792636871338, "epoch": 0.5673216357308585, "grad_norm": 0.027295857667922974, "grad_norm_var": 1.6613278762530233e-06, "learning_rate": 0.004086460867364693, "loss": 2.5039, "step": 15649 }, { "crossentropy": 2.6541168689727783, "epoch": 0.5673578886310905, "grad_norm": 0.027242284268140793, "grad_norm_var": 1.590818082467839e-06, "learning_rate": 0.004085889567116345, "loss": 2.6338, "step": 15650 }, { "crossentropy": 2.4516096115112305, "epoch": 0.5673941415313225, "grad_norm": 0.02684854157269001, "grad_norm_var": 1.47617808626075e-06, "learning_rate": 0.004085318279214447, "loss": 2.492, "step": 15651 }, { "crossentropy": 2.7192962169647217, "epoch": 0.5674303944315545, "grad_norm": 0.026428652927279472, "grad_norm_var": 1.1101956229042034e-06, "learning_rate": 0.004084747003666714, "loss": 2.5725, "step": 15652 }, { "crossentropy": 2.5755043029785156, "epoch": 0.5674666473317865, "grad_norm": 0.02559623122215271, "grad_norm_var": 1.2322910842947013e-06, "learning_rate": 0.004084175740480862, "loss": 2.5396, "step": 15653 }, { "crossentropy": 2.5280210971832275, "epoch": 0.5675029002320185, "grad_norm": 0.02675282023847103, "grad_norm_var": 1.0812006217393443e-06, "learning_rate": 0.004083604489664606, "loss": 2.5857, "step": 15654 }, { "crossentropy": 2.5370473861694336, "epoch": 0.5675391531322506, "grad_norm": 0.026031360030174255, "grad_norm_var": 1.106873264327309e-06, "learning_rate": 0.004083033251225662, "loss": 2.604, "step": 15655 }, { "crossentropy": 2.484989881515503, "epoch": 0.5675754060324826, "grad_norm": 0.02651817910373211, "grad_norm_var": 1.0653890240112467e-06, "learning_rate": 0.004082462025171745, "loss": 2.5375, "step": 15656 }, { "crossentropy": 2.5480661392211914, "epoch": 0.5676116589327146, "grad_norm": 0.02682863548398018, "grad_norm_var": 1.0650654140393109e-06, "learning_rate": 0.004081890811510571, "loss": 2.5334, "step": 15657 }, { "crossentropy": 2.512955904006958, "epoch": 0.5676479118329466, "grad_norm": 0.026339253410696983, "grad_norm_var": 1.0469815790619159e-06, "learning_rate": 0.004081319610249855, "loss": 2.5298, "step": 15658 }, { "crossentropy": 2.6315624713897705, "epoch": 0.5676841647331786, "grad_norm": 0.02745983563363552, "grad_norm_var": 1.0559542593861163e-06, "learning_rate": 0.0040807484213973115, "loss": 2.5998, "step": 15659 }, { "crossentropy": 2.4594383239746094, "epoch": 0.5677204176334106, "grad_norm": 0.026872945949435234, "grad_norm_var": 8.844800724910895e-07, "learning_rate": 0.004080177244960656, "loss": 2.5619, "step": 15660 }, { "crossentropy": 2.5592541694641113, "epoch": 0.5677566705336426, "grad_norm": 0.02691865898668766, "grad_norm_var": 8.674423553725694e-07, "learning_rate": 0.004079606080947603, "loss": 2.5322, "step": 15661 }, { "crossentropy": 2.628122329711914, "epoch": 0.5677929234338747, "grad_norm": 0.02636856772005558, "grad_norm_var": 8.920163203229945e-07, "learning_rate": 0.004079034929365867, "loss": 2.5814, "step": 15662 }, { "crossentropy": 2.4792027473449707, "epoch": 0.5678291763341067, "grad_norm": 0.02764754556119442, "grad_norm_var": 3.930510987788331e-07, "learning_rate": 0.00407846379022316, "loss": 2.5367, "step": 15663 }, { "crossentropy": 2.430340528488159, "epoch": 0.5678654292343387, "grad_norm": 0.02966299280524254, "grad_norm_var": 8.146164794030237e-07, "learning_rate": 0.0040778926635272, "loss": 2.5172, "step": 15664 }, { "crossentropy": 2.491818428039551, "epoch": 0.5679016821345708, "grad_norm": 0.026053862646222115, "grad_norm_var": 8.497401250865394e-07, "learning_rate": 0.004077321549285698, "loss": 2.5866, "step": 15665 }, { "crossentropy": 2.6052839756011963, "epoch": 0.5679379350348028, "grad_norm": 0.026982605457305908, "grad_norm_var": 8.40308180278026e-07, "learning_rate": 0.0040767504475063675, "loss": 2.6317, "step": 15666 }, { "crossentropy": 2.3936188220977783, "epoch": 0.5679741879350348, "grad_norm": 0.027644319459795952, "grad_norm_var": 8.816509010352852e-07, "learning_rate": 0.0040761793581969245, "loss": 2.4445, "step": 15667 }, { "crossentropy": 2.5949151515960693, "epoch": 0.5680104408352669, "grad_norm": 0.02611599490046501, "grad_norm_var": 9.066451875506046e-07, "learning_rate": 0.004075608281365081, "loss": 2.5336, "step": 15668 }, { "crossentropy": 2.586174726486206, "epoch": 0.5680466937354989, "grad_norm": 0.027007468044757843, "grad_norm_var": 7.929249855829004e-07, "learning_rate": 0.004075037217018551, "loss": 2.6162, "step": 15669 }, { "crossentropy": 2.5354385375976562, "epoch": 0.5680829466357309, "grad_norm": 0.02679278329014778, "grad_norm_var": 7.919724666695469e-07, "learning_rate": 0.004074466165165048, "loss": 2.6217, "step": 15670 }, { "crossentropy": 2.3775136470794678, "epoch": 0.5681191995359629, "grad_norm": 0.027538204565644264, "grad_norm_var": 7.487522226743218e-07, "learning_rate": 0.004073895125812283, "loss": 2.4142, "step": 15671 }, { "crossentropy": 2.539233446121216, "epoch": 0.5681554524361949, "grad_norm": 0.0278214979916811, "grad_norm_var": 7.630225251041567e-07, "learning_rate": 0.004073324098967969, "loss": 2.5646, "step": 15672 }, { "crossentropy": 2.5815248489379883, "epoch": 0.5681917053364269, "grad_norm": 0.028644489124417305, "grad_norm_var": 8.965164905467276e-07, "learning_rate": 0.00407275308463982, "loss": 2.5587, "step": 15673 }, { "crossentropy": 2.60906720161438, "epoch": 0.568227958236659, "grad_norm": 0.02666103094816208, "grad_norm_var": 8.642592592360277e-07, "learning_rate": 0.004072182082835546, "loss": 2.5128, "step": 15674 }, { "crossentropy": 2.5275309085845947, "epoch": 0.568264211136891, "grad_norm": 0.030124608427286148, "grad_norm_var": 1.378346405075355e-06, "learning_rate": 0.004071611093562863, "loss": 2.4648, "step": 15675 }, { "crossentropy": 2.6078596115112305, "epoch": 0.568300464037123, "grad_norm": 0.025264553725719452, "grad_norm_var": 1.659190215550869e-06, "learning_rate": 0.004071040116829481, "loss": 2.4659, "step": 15676 }, { "crossentropy": 2.713907241821289, "epoch": 0.568336716937355, "grad_norm": 0.02732699364423752, "grad_norm_var": 1.6473208460261127e-06, "learning_rate": 0.004070469152643112, "loss": 2.646, "step": 15677 }, { "crossentropy": 2.463616132736206, "epoch": 0.568372969837587, "grad_norm": 0.02638738416135311, "grad_norm_var": 1.6448716805460833e-06, "learning_rate": 0.004069898201011468, "loss": 2.5199, "step": 15678 }, { "crossentropy": 2.522411346435547, "epoch": 0.568409222737819, "grad_norm": 0.027532970532774925, "grad_norm_var": 1.6412195236878973e-06, "learning_rate": 0.00406932726194226, "loss": 2.5603, "step": 15679 }, { "crossentropy": 2.5882139205932617, "epoch": 0.568445475638051, "grad_norm": 0.027158457785844803, "grad_norm_var": 1.2600686877091282e-06, "learning_rate": 0.004068756335443198, "loss": 2.5519, "step": 15680 }, { "crossentropy": 2.7205584049224854, "epoch": 0.568481728538283, "grad_norm": 0.028231238946318626, "grad_norm_var": 1.2262268144684705e-06, "learning_rate": 0.004068185421521996, "loss": 2.6282, "step": 15681 }, { "crossentropy": 2.256821632385254, "epoch": 0.5685179814385151, "grad_norm": 0.027054397389292717, "grad_norm_var": 1.2232507551903615e-06, "learning_rate": 0.004067614520186363, "loss": 2.3998, "step": 15682 }, { "crossentropy": 2.5840375423431396, "epoch": 0.5685542343387471, "grad_norm": 0.02816919982433319, "grad_norm_var": 1.262351373068959e-06, "learning_rate": 0.004067043631444013, "loss": 2.5804, "step": 15683 }, { "crossentropy": 2.4700210094451904, "epoch": 0.5685904872389791, "grad_norm": 0.027265679091215134, "grad_norm_var": 1.1535843173277598e-06, "learning_rate": 0.0040664727553026535, "loss": 2.4813, "step": 15684 }, { "crossentropy": 2.5971827507019043, "epoch": 0.5686267401392111, "grad_norm": 0.026746515184640884, "grad_norm_var": 1.1727613412111042e-06, "learning_rate": 0.004065901891769996, "loss": 2.5962, "step": 15685 }, { "crossentropy": 2.522270441055298, "epoch": 0.5686629930394431, "grad_norm": 0.026902608573436737, "grad_norm_var": 1.1643306194782185e-06, "learning_rate": 0.004065331040853751, "loss": 2.5517, "step": 15686 }, { "crossentropy": 2.5701770782470703, "epoch": 0.5686992459396751, "grad_norm": 0.026721172034740448, "grad_norm_var": 1.1939228626596405e-06, "learning_rate": 0.0040647602025616295, "loss": 2.605, "step": 15687 }, { "crossentropy": 2.611603260040283, "epoch": 0.5687354988399071, "grad_norm": 0.02558480203151703, "grad_norm_var": 1.3736795787389847e-06, "learning_rate": 0.00406418937690134, "loss": 2.6044, "step": 15688 }, { "crossentropy": 2.4507265090942383, "epoch": 0.5687717517401392, "grad_norm": 0.027519265189766884, "grad_norm_var": 1.2414981629695152e-06, "learning_rate": 0.004063618563880593, "loss": 2.5857, "step": 15689 }, { "crossentropy": 2.488929271697998, "epoch": 0.5688080046403712, "grad_norm": 0.02735939994454384, "grad_norm_var": 1.2249897974366072e-06, "learning_rate": 0.004063047763507098, "loss": 2.5708, "step": 15690 }, { "crossentropy": 2.7875092029571533, "epoch": 0.5688442575406032, "grad_norm": 0.027046218514442444, "grad_norm_var": 6.206873890299598e-07, "learning_rate": 0.004062476975788565, "loss": 2.6442, "step": 15691 }, { "crossentropy": 2.644554853439331, "epoch": 0.5688805104408353, "grad_norm": 0.026236969977617264, "grad_norm_var": 4.5258194182165996e-07, "learning_rate": 0.004061906200732704, "loss": 2.6071, "step": 15692 }, { "crossentropy": 2.579092025756836, "epoch": 0.5689167633410673, "grad_norm": 0.02745182439684868, "grad_norm_var": 4.5770505429456634e-07, "learning_rate": 0.004061335438347222, "loss": 2.5597, "step": 15693 }, { "crossentropy": 2.4561595916748047, "epoch": 0.5689530162412993, "grad_norm": 0.02614862471818924, "grad_norm_var": 4.834923730489526e-07, "learning_rate": 0.004060764688639831, "loss": 2.5333, "step": 15694 }, { "crossentropy": 2.610978841781616, "epoch": 0.5689892691415314, "grad_norm": 0.027186136692762375, "grad_norm_var": 4.696278909124453e-07, "learning_rate": 0.004060193951618238, "loss": 2.6297, "step": 15695 }, { "crossentropy": 2.4011995792388916, "epoch": 0.5690255220417634, "grad_norm": 0.026615530252456665, "grad_norm_var": 4.801206089656649e-07, "learning_rate": 0.004059623227290152, "loss": 2.4276, "step": 15696 }, { "crossentropy": 2.393402338027954, "epoch": 0.5690617749419954, "grad_norm": 0.02625654637813568, "grad_norm_var": 4.036005130478304e-07, "learning_rate": 0.004059052515663281, "loss": 2.4477, "step": 15697 }, { "crossentropy": 2.5131032466888428, "epoch": 0.5690980278422274, "grad_norm": 0.027125028893351555, "grad_norm_var": 4.0544588135044136e-07, "learning_rate": 0.004058481816745332, "loss": 2.5482, "step": 15698 }, { "crossentropy": 2.454191207885742, "epoch": 0.5691342807424594, "grad_norm": 0.02670004963874817, "grad_norm_var": 2.9093726618180844e-07, "learning_rate": 0.0040579111305440166, "loss": 2.5221, "step": 15699 }, { "crossentropy": 2.426891326904297, "epoch": 0.5691705336426914, "grad_norm": 0.029002664610743523, "grad_norm_var": 5.863968434144703e-07, "learning_rate": 0.00405734045706704, "loss": 2.5293, "step": 15700 }, { "crossentropy": 2.612011432647705, "epoch": 0.5692067865429234, "grad_norm": 0.02711624465882778, "grad_norm_var": 5.867476457550753e-07, "learning_rate": 0.004056769796322112, "loss": 2.5369, "step": 15701 }, { "crossentropy": 2.4919612407684326, "epoch": 0.5692430394431555, "grad_norm": 0.031915441155433655, "grad_norm_var": 2.135081924704331e-06, "learning_rate": 0.004056199148316939, "loss": 2.5325, "step": 15702 }, { "crossentropy": 2.6052000522613525, "epoch": 0.5692792923433875, "grad_norm": 0.026896968483924866, "grad_norm_var": 2.1246386001589138e-06, "learning_rate": 0.00405562851305923, "loss": 2.6087, "step": 15703 }, { "crossentropy": 2.6302032470703125, "epoch": 0.5693155452436195, "grad_norm": 0.028392594307661057, "grad_norm_var": 1.9901818289230768e-06, "learning_rate": 0.00405505789055669, "loss": 2.596, "step": 15704 }, { "crossentropy": 2.676629066467285, "epoch": 0.5693517981438515, "grad_norm": 0.028691628947854042, "grad_norm_var": 2.0891631665603816e-06, "learning_rate": 0.004054487280817027, "loss": 2.6431, "step": 15705 }, { "crossentropy": 2.617131471633911, "epoch": 0.5693880510440835, "grad_norm": 0.02862202748656273, "grad_norm_var": 2.1636396109893373e-06, "learning_rate": 0.004053916683847948, "loss": 2.5851, "step": 15706 }, { "crossentropy": 2.6464130878448486, "epoch": 0.5694243039443155, "grad_norm": 0.026306938380002975, "grad_norm_var": 2.251180259945212e-06, "learning_rate": 0.00405334609965716, "loss": 2.5465, "step": 15707 }, { "crossentropy": 2.615997791290283, "epoch": 0.5694605568445475, "grad_norm": 0.02789788320660591, "grad_norm_var": 2.1346831123582206e-06, "learning_rate": 0.00405277552825237, "loss": 2.6004, "step": 15708 }, { "crossentropy": 2.5887792110443115, "epoch": 0.5694968097447796, "grad_norm": 0.026070773601531982, "grad_norm_var": 2.2895313937007962e-06, "learning_rate": 0.004052204969641283, "loss": 2.6213, "step": 15709 }, { "crossentropy": 2.618391513824463, "epoch": 0.5695330626450116, "grad_norm": 0.02727825939655304, "grad_norm_var": 2.1568480266372553e-06, "learning_rate": 0.004051634423831607, "loss": 2.5462, "step": 15710 }, { "crossentropy": 2.7073380947113037, "epoch": 0.5695693155452436, "grad_norm": 0.030836770310997963, "grad_norm_var": 2.773903162002706e-06, "learning_rate": 0.004051063890831047, "loss": 2.623, "step": 15711 }, { "crossentropy": 2.4001712799072266, "epoch": 0.5696055684454756, "grad_norm": 0.0273686945438385, "grad_norm_var": 2.684602144721821e-06, "learning_rate": 0.004050493370647311, "loss": 2.5121, "step": 15712 }, { "crossentropy": 2.6220271587371826, "epoch": 0.5696418213457076, "grad_norm": 0.02935747243463993, "grad_norm_var": 2.6040600731333057e-06, "learning_rate": 0.0040499228632881, "loss": 2.6929, "step": 15713 }, { "crossentropy": 2.5898070335388184, "epoch": 0.5696780742459396, "grad_norm": 0.02839108742773533, "grad_norm_var": 2.539875771791616e-06, "learning_rate": 0.004049352368761124, "loss": 2.4708, "step": 15714 }, { "crossentropy": 2.5851495265960693, "epoch": 0.5697143271461717, "grad_norm": 0.026939887553453445, "grad_norm_var": 2.4962134392886765e-06, "learning_rate": 0.004048781887074087, "loss": 2.5975, "step": 15715 }, { "crossentropy": 2.655921697616577, "epoch": 0.5697505800464037, "grad_norm": 0.033977895975112915, "grad_norm_var": 4.580484316125824e-06, "learning_rate": 0.004048211418234694, "loss": 2.6794, "step": 15716 }, { "crossentropy": 2.429460048675537, "epoch": 0.5697868329466357, "grad_norm": 0.025504695251584053, "grad_norm_var": 5.040947948654764e-06, "learning_rate": 0.004047640962250651, "loss": 2.4666, "step": 15717 }, { "crossentropy": 2.6719489097595215, "epoch": 0.5698230858468677, "grad_norm": 0.028153836727142334, "grad_norm_var": 4.163678889059505e-06, "learning_rate": 0.00404707051912966, "loss": 2.5542, "step": 15718 }, { "crossentropy": 2.5264110565185547, "epoch": 0.5698593387470998, "grad_norm": 0.027086010202765465, "grad_norm_var": 4.133876296040911e-06, "learning_rate": 0.004046500088879429, "loss": 2.5764, "step": 15719 }, { "crossentropy": 2.4631845951080322, "epoch": 0.5698955916473318, "grad_norm": 0.02738882042467594, "grad_norm_var": 4.168366392878171e-06, "learning_rate": 0.004045929671507663, "loss": 2.4903, "step": 15720 }, { "crossentropy": 2.6242218017578125, "epoch": 0.5699318445475638, "grad_norm": 0.02713695913553238, "grad_norm_var": 4.200323178863106e-06, "learning_rate": 0.004045359267022062, "loss": 2.6561, "step": 15721 }, { "crossentropy": 2.624884605407715, "epoch": 0.5699680974477959, "grad_norm": 0.027027998119592667, "grad_norm_var": 4.231151607538934e-06, "learning_rate": 0.004044788875430334, "loss": 2.5723, "step": 15722 }, { "crossentropy": 2.548657178878784, "epoch": 0.5700043503480279, "grad_norm": 0.026471784338355064, "grad_norm_var": 4.19739029817374e-06, "learning_rate": 0.004044218496740182, "loss": 2.5176, "step": 15723 }, { "crossentropy": 2.4828312397003174, "epoch": 0.5700406032482599, "grad_norm": 0.029408272355794907, "grad_norm_var": 4.333391039570269e-06, "learning_rate": 0.004043648130959309, "loss": 2.5057, "step": 15724 }, { "crossentropy": 2.782257080078125, "epoch": 0.5700768561484919, "grad_norm": 0.03042483516037464, "grad_norm_var": 4.383775569277657e-06, "learning_rate": 0.004043077778095419, "loss": 2.6866, "step": 15725 }, { "crossentropy": 2.6202099323272705, "epoch": 0.5701131090487239, "grad_norm": 0.027380213141441345, "grad_norm_var": 4.3705755534185925e-06, "learning_rate": 0.004042507438156215, "loss": 2.5408, "step": 15726 }, { "crossentropy": 2.5421910285949707, "epoch": 0.5701493619489559, "grad_norm": 0.027377095073461533, "grad_norm_var": 3.950065648131957e-06, "learning_rate": 0.004041937111149403, "loss": 2.4976, "step": 15727 }, { "crossentropy": 2.6255247592926025, "epoch": 0.570185614849188, "grad_norm": 0.026232292875647545, "grad_norm_var": 4.139650190621418e-06, "learning_rate": 0.004041366797082683, "loss": 2.5963, "step": 15728 }, { "crossentropy": 2.699228286743164, "epoch": 0.57022186774942, "grad_norm": 0.026798337697982788, "grad_norm_var": 4.091306168494989e-06, "learning_rate": 0.004040796495963761, "loss": 2.5532, "step": 15729 }, { "crossentropy": 2.642335891723633, "epoch": 0.570258120649652, "grad_norm": 0.027000607922673225, "grad_norm_var": 4.112988597967344e-06, "learning_rate": 0.004040226207800337, "loss": 2.5221, "step": 15730 }, { "crossentropy": 2.5661299228668213, "epoch": 0.570294373549884, "grad_norm": 0.025219950824975967, "grad_norm_var": 4.4880905531287e-06, "learning_rate": 0.004039655932600114, "loss": 2.5221, "step": 15731 }, { "crossentropy": 2.6016695499420166, "epoch": 0.570330626450116, "grad_norm": 0.02596326544880867, "grad_norm_var": 1.7532981564557e-06, "learning_rate": 0.004039085670370796, "loss": 2.5668, "step": 15732 }, { "crossentropy": 2.5087831020355225, "epoch": 0.570366879350348, "grad_norm": 0.02578265592455864, "grad_norm_var": 1.6967444038759143e-06, "learning_rate": 0.004038515421120083, "loss": 2.4824, "step": 15733 }, { "crossentropy": 2.6272013187408447, "epoch": 0.57040313225058, "grad_norm": 0.027254998683929443, "grad_norm_var": 1.6303265123929675e-06, "learning_rate": 0.004037945184855678, "loss": 2.6007, "step": 15734 }, { "crossentropy": 2.634584903717041, "epoch": 0.570439385150812, "grad_norm": 0.02710702456533909, "grad_norm_var": 1.6302529049706688e-06, "learning_rate": 0.004037374961585284, "loss": 2.528, "step": 15735 }, { "crossentropy": 2.675374984741211, "epoch": 0.5704756380510441, "grad_norm": 0.029871495440602303, "grad_norm_var": 2.1033282372244384e-06, "learning_rate": 0.004036804751316602, "loss": 2.6653, "step": 15736 }, { "crossentropy": 2.60074520111084, "epoch": 0.5705118909512761, "grad_norm": 0.027750512585043907, "grad_norm_var": 2.115268040172669e-06, "learning_rate": 0.004036234554057335, "loss": 2.5514, "step": 15737 }, { "crossentropy": 2.4026033878326416, "epoch": 0.5705481438515081, "grad_norm": 0.025201544165611267, "grad_norm_var": 2.3941336919225718e-06, "learning_rate": 0.0040356643698151815, "loss": 2.392, "step": 15738 }, { "crossentropy": 2.6078410148620605, "epoch": 0.5705843967517401, "grad_norm": 0.027573179453611374, "grad_norm_var": 2.362598240184382e-06, "learning_rate": 0.004035094198597845, "loss": 2.5901, "step": 15739 }, { "crossentropy": 2.7991690635681152, "epoch": 0.5706206496519721, "grad_norm": 0.02924237959086895, "grad_norm_var": 2.317058076346439e-06, "learning_rate": 0.004034524040413025, "loss": 2.6736, "step": 15740 }, { "crossentropy": 2.450810194015503, "epoch": 0.5706569025522041, "grad_norm": 0.02574503980576992, "grad_norm_var": 1.7118626960649912e-06, "learning_rate": 0.004033953895268423, "loss": 2.4023, "step": 15741 }, { "crossentropy": 2.7048275470733643, "epoch": 0.5706931554524362, "grad_norm": 0.026863614097237587, "grad_norm_var": 1.7002033818213612e-06, "learning_rate": 0.004033383763171738, "loss": 2.6088, "step": 15742 }, { "crossentropy": 2.5435914993286133, "epoch": 0.5707294083526682, "grad_norm": 0.026974990963935852, "grad_norm_var": 1.6866868324636372e-06, "learning_rate": 0.004032813644130674, "loss": 2.5784, "step": 15743 }, { "crossentropy": 2.6432135105133057, "epoch": 0.5707656612529002, "grad_norm": 0.02609424479305744, "grad_norm_var": 1.700377250234084e-06, "learning_rate": 0.004032243538152929, "loss": 2.6451, "step": 15744 }, { "crossentropy": 2.6037845611572266, "epoch": 0.5708019141531323, "grad_norm": 0.027651090174913406, "grad_norm_var": 1.733955834240288e-06, "learning_rate": 0.004031673445246205, "loss": 2.5834, "step": 15745 }, { "crossentropy": 2.6740620136260986, "epoch": 0.5708381670533643, "grad_norm": 0.028130466118454933, "grad_norm_var": 1.8204565423054116e-06, "learning_rate": 0.0040311033654181995, "loss": 2.5373, "step": 15746 }, { "crossentropy": 2.506237268447876, "epoch": 0.5708744199535963, "grad_norm": 0.026934469118714333, "grad_norm_var": 1.5911632632248923e-06, "learning_rate": 0.004030533298676613, "loss": 2.6187, "step": 15747 }, { "crossentropy": 2.627241849899292, "epoch": 0.5709106728538283, "grad_norm": 0.026591068133711815, "grad_norm_var": 1.517813917633476e-06, "learning_rate": 0.004029963245029146, "loss": 2.5934, "step": 15748 }, { "crossentropy": 2.641268014907837, "epoch": 0.5709469257540604, "grad_norm": 0.02570284530520439, "grad_norm_var": 1.5330077701614561e-06, "learning_rate": 0.004029393204483497, "loss": 2.5623, "step": 15749 }, { "crossentropy": 2.6922390460968018, "epoch": 0.5709831786542924, "grad_norm": 0.027564160525798798, "grad_norm_var": 1.5425653275423965e-06, "learning_rate": 0.0040288231770473654, "loss": 2.5963, "step": 15750 }, { "crossentropy": 2.534359931945801, "epoch": 0.5710194315545244, "grad_norm": 0.02575664222240448, "grad_norm_var": 1.671004686301528e-06, "learning_rate": 0.004028253162728451, "loss": 2.441, "step": 15751 }, { "crossentropy": 2.576866865158081, "epoch": 0.5710556844547564, "grad_norm": 0.026837052777409554, "grad_norm_var": 1.1263762266523844e-06, "learning_rate": 0.004027683161534453, "loss": 2.5397, "step": 15752 }, { "crossentropy": 2.498659372329712, "epoch": 0.5710919373549884, "grad_norm": 0.02658749185502529, "grad_norm_var": 1.0810935681829312e-06, "learning_rate": 0.004027113173473069, "loss": 2.5588, "step": 15753 }, { "crossentropy": 2.542449951171875, "epoch": 0.5711281902552204, "grad_norm": 0.026518002152442932, "grad_norm_var": 9.017027175128954e-07, "learning_rate": 0.004026543198552, "loss": 2.5412, "step": 15754 }, { "crossentropy": 2.518833875656128, "epoch": 0.5711644431554525, "grad_norm": 0.033583976328372955, "grad_norm_var": 3.680950468940231e-06, "learning_rate": 0.00402597323677894, "loss": 2.4783, "step": 15755 }, { "crossentropy": 2.553118944168091, "epoch": 0.5712006960556845, "grad_norm": 0.0260139312595129, "grad_norm_var": 3.495659676349645e-06, "learning_rate": 0.0040254032881615895, "loss": 2.5131, "step": 15756 }, { "crossentropy": 2.400733470916748, "epoch": 0.5712369489559165, "grad_norm": 0.02758796513080597, "grad_norm_var": 3.375769569923838e-06, "learning_rate": 0.004024833352707647, "loss": 2.4828, "step": 15757 }, { "crossentropy": 2.518217086791992, "epoch": 0.5712732018561485, "grad_norm": 0.026831883937120438, "grad_norm_var": 3.3773064100760106e-06, "learning_rate": 0.00402426343042481, "loss": 2.6228, "step": 15758 }, { "crossentropy": 2.570680618286133, "epoch": 0.5713094547563805, "grad_norm": 0.025585835799574852, "grad_norm_var": 3.5414477005626365e-06, "learning_rate": 0.004023693521320776, "loss": 2.577, "step": 15759 }, { "crossentropy": 2.5444724559783936, "epoch": 0.5713457076566125, "grad_norm": 0.027728809043765068, "grad_norm_var": 3.484183778742119e-06, "learning_rate": 0.004023123625403242, "loss": 2.4936, "step": 15760 }, { "crossentropy": 2.5830154418945312, "epoch": 0.5713819605568445, "grad_norm": 0.02888251654803753, "grad_norm_var": 3.6488608956621126e-06, "learning_rate": 0.004022553742679907, "loss": 2.584, "step": 15761 }, { "crossentropy": 2.68029522895813, "epoch": 0.5714182134570766, "grad_norm": 0.03525935485959053, "grad_norm_var": 7.6123452410961984e-06, "learning_rate": 0.004021983873158468, "loss": 2.6982, "step": 15762 }, { "crossentropy": 2.528205633163452, "epoch": 0.5714544663573086, "grad_norm": 0.026939017698168755, "grad_norm_var": 7.61185322183458e-06, "learning_rate": 0.00402141401684662, "loss": 2.5029, "step": 15763 }, { "crossentropy": 2.469339609146118, "epoch": 0.5714907192575406, "grad_norm": 0.028938395902514458, "grad_norm_var": 7.594081893357492e-06, "learning_rate": 0.00402084417375206, "loss": 2.4949, "step": 15764 }, { "crossentropy": 2.4614017009735107, "epoch": 0.5715269721577726, "grad_norm": 0.02646367996931076, "grad_norm_var": 7.407892358751202e-06, "learning_rate": 0.004020274343882486, "loss": 2.4532, "step": 15765 }, { "crossentropy": 2.440608501434326, "epoch": 0.5715632250580046, "grad_norm": 0.026978958398103714, "grad_norm_var": 7.4588106181501525e-06, "learning_rate": 0.0040197045272455925, "loss": 2.3373, "step": 15766 }, { "crossentropy": 2.464656114578247, "epoch": 0.5715994779582366, "grad_norm": 0.02643135003745556, "grad_norm_var": 7.293918045922368e-06, "learning_rate": 0.004019134723849078, "loss": 2.5373, "step": 15767 }, { "crossentropy": 2.5181190967559814, "epoch": 0.5716357308584686, "grad_norm": 0.027206769213080406, "grad_norm_var": 7.247695776741225e-06, "learning_rate": 0.004018564933700637, "loss": 2.5105, "step": 15768 }, { "crossentropy": 2.583630323410034, "epoch": 0.5716719837587007, "grad_norm": 0.02684173919260502, "grad_norm_var": 7.204831342172872e-06, "learning_rate": 0.004017995156807967, "loss": 2.5629, "step": 15769 }, { "crossentropy": 2.48622465133667, "epoch": 0.5717082366589327, "grad_norm": 0.02628246136009693, "grad_norm_var": 7.254433695037809e-06, "learning_rate": 0.004017425393178762, "loss": 2.4689, "step": 15770 }, { "crossentropy": 2.510411024093628, "epoch": 0.5717444895591647, "grad_norm": 0.024823663756251335, "grad_norm_var": 5.496192803365477e-06, "learning_rate": 0.004016855642820719, "loss": 2.3696, "step": 15771 }, { "crossentropy": 2.5324714183807373, "epoch": 0.5717807424593968, "grad_norm": 0.029753858223557472, "grad_norm_var": 5.666858769098068e-06, "learning_rate": 0.004016285905741533, "loss": 2.6514, "step": 15772 }, { "crossentropy": 2.56221866607666, "epoch": 0.5718169953596288, "grad_norm": 0.02758810669183731, "grad_norm_var": 5.666857438713076e-06, "learning_rate": 0.004015716181948897, "loss": 2.5647, "step": 15773 }, { "crossentropy": 2.6928231716156006, "epoch": 0.5718532482598608, "grad_norm": 0.027467243373394012, "grad_norm_var": 5.622058976239074e-06, "learning_rate": 0.004015146471450507, "loss": 2.6682, "step": 15774 }, { "crossentropy": 2.4579062461853027, "epoch": 0.5718895011600929, "grad_norm": 0.028565913438796997, "grad_norm_var": 5.337764412580622e-06, "learning_rate": 0.00401457677425406, "loss": 2.5691, "step": 15775 }, { "crossentropy": 2.618518114089966, "epoch": 0.5719257540603249, "grad_norm": 0.027166714891791344, "grad_norm_var": 5.369178913911208e-06, "learning_rate": 0.004014007090367248, "loss": 2.5806, "step": 15776 }, { "crossentropy": 2.452205181121826, "epoch": 0.5719620069605569, "grad_norm": 0.02645106054842472, "grad_norm_var": 5.4037339314351335e-06, "learning_rate": 0.0040134374197977685, "loss": 2.5427, "step": 15777 }, { "crossentropy": 2.6913797855377197, "epoch": 0.5719982598607889, "grad_norm": 0.027564195916056633, "grad_norm_var": 1.3459677261894956e-06, "learning_rate": 0.004012867762553313, "loss": 2.6248, "step": 15778 }, { "crossentropy": 2.6717886924743652, "epoch": 0.5720345127610209, "grad_norm": 0.028479913249611855, "grad_norm_var": 1.4373668600248334e-06, "learning_rate": 0.004012298118641577, "loss": 2.7143, "step": 15779 }, { "crossentropy": 2.439958333969116, "epoch": 0.5720707656612529, "grad_norm": 0.026864537969231606, "grad_norm_var": 1.2566582111913817e-06, "learning_rate": 0.004011728488070253, "loss": 2.531, "step": 15780 }, { "crossentropy": 2.5212454795837402, "epoch": 0.5721070185614849, "grad_norm": 0.02669835276901722, "grad_norm_var": 1.2375886198493872e-06, "learning_rate": 0.004011158870847035, "loss": 2.529, "step": 15781 }, { "crossentropy": 2.3847177028656006, "epoch": 0.572143271461717, "grad_norm": 0.025972489267587662, "grad_norm_var": 1.3302678463528172e-06, "learning_rate": 0.0040105892669796165, "loss": 2.5341, "step": 15782 }, { "crossentropy": 2.5785326957702637, "epoch": 0.572179524361949, "grad_norm": 0.026156233623623848, "grad_norm_var": 1.3608060947670544e-06, "learning_rate": 0.004010019676475692, "loss": 2.5111, "step": 15783 }, { "crossentropy": 2.659559726715088, "epoch": 0.572215777262181, "grad_norm": 0.0276825949549675, "grad_norm_var": 1.3806073728136186e-06, "learning_rate": 0.004009450099342955, "loss": 2.659, "step": 15784 }, { "crossentropy": 2.629446029663086, "epoch": 0.572252030162413, "grad_norm": 0.02770947478711605, "grad_norm_var": 1.3922984092866781e-06, "learning_rate": 0.004008880535589097, "loss": 2.6306, "step": 15785 }, { "crossentropy": 2.5918474197387695, "epoch": 0.572288283062645, "grad_norm": 0.02695324830710888, "grad_norm_var": 1.3382076537994988e-06, "learning_rate": 0.004008310985221811, "loss": 2.5643, "step": 15786 }, { "crossentropy": 2.602376699447632, "epoch": 0.572324535962877, "grad_norm": 0.0278867669403553, "grad_norm_var": 9.362849232987779e-07, "learning_rate": 0.004007741448248793, "loss": 2.5203, "step": 15787 }, { "crossentropy": 2.506103038787842, "epoch": 0.572360788863109, "grad_norm": 0.03049894981086254, "grad_norm_var": 1.2013463722095978e-06, "learning_rate": 0.00400717192467773, "loss": 2.5404, "step": 15788 }, { "crossentropy": 2.551409959793091, "epoch": 0.572397041763341, "grad_norm": 0.029707269743084908, "grad_norm_var": 1.5121151598786505e-06, "learning_rate": 0.004006602414516317, "loss": 2.5111, "step": 15789 }, { "crossentropy": 2.514185667037964, "epoch": 0.5724332946635731, "grad_norm": 0.030194539576768875, "grad_norm_var": 1.923610881704437e-06, "learning_rate": 0.004006032917772244, "loss": 2.6201, "step": 15790 }, { "crossentropy": 2.5932843685150146, "epoch": 0.5724695475638051, "grad_norm": 0.026531482115387917, "grad_norm_var": 1.9703328804204314e-06, "learning_rate": 0.004005463434453207, "loss": 2.6891, "step": 15791 }, { "crossentropy": 2.599003791809082, "epoch": 0.5725058004640371, "grad_norm": 0.03183066472411156, "grad_norm_var": 3.0247445710534914e-06, "learning_rate": 0.004004893964566896, "loss": 2.603, "step": 15792 }, { "crossentropy": 2.660336971282959, "epoch": 0.5725420533642691, "grad_norm": 0.026204276829957962, "grad_norm_var": 3.0778353239569362e-06, "learning_rate": 0.004004324508121001, "loss": 2.6185, "step": 15793 }, { "crossentropy": 2.6857316493988037, "epoch": 0.5725783062645011, "grad_norm": 0.02629205398261547, "grad_norm_var": 3.241612153155373e-06, "learning_rate": 0.004003755065123215, "loss": 2.6558, "step": 15794 }, { "crossentropy": 2.47096586227417, "epoch": 0.5726145591647331, "grad_norm": 0.026931963860988617, "grad_norm_var": 3.262172237160456e-06, "learning_rate": 0.004003185635581228, "loss": 2.4074, "step": 15795 }, { "crossentropy": 2.605820417404175, "epoch": 0.5726508120649652, "grad_norm": 0.026238100603222847, "grad_norm_var": 3.3612567332363806e-06, "learning_rate": 0.004002616219502735, "loss": 2.5432, "step": 15796 }, { "crossentropy": 2.5969276428222656, "epoch": 0.5726870649651972, "grad_norm": 0.026592055335640907, "grad_norm_var": 3.3764147900584936e-06, "learning_rate": 0.004002046816895421, "loss": 2.5957, "step": 15797 }, { "crossentropy": 2.5171825885772705, "epoch": 0.5727233178654292, "grad_norm": 0.026119334623217583, "grad_norm_var": 3.3437160053703675e-06, "learning_rate": 0.004001477427766978, "loss": 2.5165, "step": 15798 }, { "crossentropy": 2.5258865356445312, "epoch": 0.5727595707656613, "grad_norm": 0.027118217200040817, "grad_norm_var": 3.200906376778324e-06, "learning_rate": 0.004000908052125099, "loss": 2.5358, "step": 15799 }, { "crossentropy": 2.7364819049835205, "epoch": 0.5727958236658933, "grad_norm": 0.028273511677980423, "grad_norm_var": 3.2150017167788345e-06, "learning_rate": 0.004000338689977473, "loss": 2.6425, "step": 15800 }, { "crossentropy": 2.546121835708618, "epoch": 0.5728320765661253, "grad_norm": 0.026257911697030067, "grad_norm_var": 3.3676219232816784e-06, "learning_rate": 0.0039997693413317895, "loss": 2.5277, "step": 15801 }, { "crossentropy": 2.5799646377563477, "epoch": 0.5728683294663574, "grad_norm": 0.027713080868124962, "grad_norm_var": 3.325326920790974e-06, "learning_rate": 0.003999200006195739, "loss": 2.5105, "step": 15802 }, { "crossentropy": 2.5302810668945312, "epoch": 0.5729045823665894, "grad_norm": 0.02655879221856594, "grad_norm_var": 3.4156482246360785e-06, "learning_rate": 0.00399863068457701, "loss": 2.5549, "step": 15803 }, { "crossentropy": 2.5280849933624268, "epoch": 0.5729408352668214, "grad_norm": 0.026361407712101936, "grad_norm_var": 2.936747505345496e-06, "learning_rate": 0.0039980613764832975, "loss": 2.5281, "step": 15804 }, { "crossentropy": 2.578038215637207, "epoch": 0.5729770881670534, "grad_norm": 0.026250025257468224, "grad_norm_var": 2.635324202650222e-06, "learning_rate": 0.003997492081922282, "loss": 2.5256, "step": 15805 }, { "crossentropy": 2.4571011066436768, "epoch": 0.5730133410672854, "grad_norm": 0.0261788759380579, "grad_norm_var": 2.048778379574997e-06, "learning_rate": 0.003996922800901659, "loss": 2.4853, "step": 15806 }, { "crossentropy": 2.528021812438965, "epoch": 0.5730495939675174, "grad_norm": 0.02717754617333412, "grad_norm_var": 2.0374584739028698e-06, "learning_rate": 0.003996353533429117, "loss": 2.5548, "step": 15807 }, { "crossentropy": 2.5252652168273926, "epoch": 0.5730858468677494, "grad_norm": 0.026836423203349113, "grad_norm_var": 3.8369840369028606e-07, "learning_rate": 0.003995784279512343, "loss": 2.6239, "step": 15808 }, { "crossentropy": 2.543147087097168, "epoch": 0.5731220997679815, "grad_norm": 0.026087602600455284, "grad_norm_var": 3.9216720713423453e-07, "learning_rate": 0.003995215039159025, "loss": 2.5201, "step": 15809 }, { "crossentropy": 2.599931478500366, "epoch": 0.5731583526682135, "grad_norm": 0.02689124457538128, "grad_norm_var": 3.830789349614413e-07, "learning_rate": 0.003994645812376853, "loss": 2.5735, "step": 15810 }, { "crossentropy": 2.5313045978546143, "epoch": 0.5731946055684455, "grad_norm": 0.02757241204380989, "grad_norm_var": 4.264623056641932e-07, "learning_rate": 0.003994076599173515, "loss": 2.588, "step": 15811 }, { "crossentropy": 2.6026711463928223, "epoch": 0.5732308584686775, "grad_norm": 0.028942013159394264, "grad_norm_var": 6.937533474170194e-07, "learning_rate": 0.003993507399556699, "loss": 2.6173, "step": 15812 }, { "crossentropy": 2.4750962257385254, "epoch": 0.5732671113689095, "grad_norm": 0.027078038081526756, "grad_norm_var": 6.864121798603554e-07, "learning_rate": 0.003992938213534095, "loss": 2.5676, "step": 15813 }, { "crossentropy": 2.6915664672851562, "epoch": 0.5733033642691415, "grad_norm": 0.026022031903266907, "grad_norm_var": 6.979562179020334e-07, "learning_rate": 0.003992369041113387, "loss": 2.5554, "step": 15814 }, { "crossentropy": 2.6841118335723877, "epoch": 0.5733396171693735, "grad_norm": 0.025782575830817223, "grad_norm_var": 7.808212936887803e-07, "learning_rate": 0.003991799882302264, "loss": 2.5907, "step": 15815 }, { "crossentropy": 2.646502733230591, "epoch": 0.5733758700696056, "grad_norm": 0.029171496629714966, "grad_norm_var": 9.987890391793355e-07, "learning_rate": 0.003991230737108414, "loss": 2.6128, "step": 15816 }, { "crossentropy": 2.6304032802581787, "epoch": 0.5734121229698376, "grad_norm": 0.025715382769703865, "grad_norm_var": 1.0658088026295939e-06, "learning_rate": 0.003990661605539524, "loss": 2.5831, "step": 15817 }, { "crossentropy": 2.5922932624816895, "epoch": 0.5734483758700696, "grad_norm": 0.0260639451444149, "grad_norm_var": 1.0561637361897279e-06, "learning_rate": 0.00399009248760328, "loss": 2.6403, "step": 15818 }, { "crossentropy": 2.474693536758423, "epoch": 0.5734846287703016, "grad_norm": 0.026795873418450356, "grad_norm_var": 1.052269620690456e-06, "learning_rate": 0.003989523383307369, "loss": 2.562, "step": 15819 }, { "crossentropy": 2.5260133743286133, "epoch": 0.5735208816705336, "grad_norm": 0.02580266445875168, "grad_norm_var": 1.1050473231572858e-06, "learning_rate": 0.003988954292659479, "loss": 2.5394, "step": 15820 }, { "crossentropy": 2.5439810752868652, "epoch": 0.5735571345707656, "grad_norm": 0.027457833290100098, "grad_norm_var": 1.1120004040191528e-06, "learning_rate": 0.003988385215667298, "loss": 2.5239, "step": 15821 }, { "crossentropy": 2.5297913551330566, "epoch": 0.5735933874709976, "grad_norm": 0.0280085951089859, "grad_norm_var": 1.1578798432961464e-06, "learning_rate": 0.003987816152338507, "loss": 2.6061, "step": 15822 }, { "crossentropy": 2.5336415767669678, "epoch": 0.5736296403712297, "grad_norm": 0.027625882998108864, "grad_norm_var": 1.1832765797785965e-06, "learning_rate": 0.003987247102680796, "loss": 2.6051, "step": 15823 }, { "crossentropy": 2.6405045986175537, "epoch": 0.5736658932714617, "grad_norm": 0.026796432211995125, "grad_norm_var": 1.1842000972066367e-06, "learning_rate": 0.003986678066701849, "loss": 2.5284, "step": 15824 }, { "crossentropy": 2.5531601905822754, "epoch": 0.5737021461716937, "grad_norm": 0.027163635939359665, "grad_norm_var": 1.1273305464456114e-06, "learning_rate": 0.003986109044409353, "loss": 2.5195, "step": 15825 }, { "crossentropy": 2.668236255645752, "epoch": 0.5737383990719258, "grad_norm": 0.027329230681061745, "grad_norm_var": 1.1297203133720305e-06, "learning_rate": 0.003985540035810991, "loss": 2.6022, "step": 15826 }, { "crossentropy": 2.544116497039795, "epoch": 0.5737746519721578, "grad_norm": 0.026851043105125427, "grad_norm_var": 1.115171011276811e-06, "learning_rate": 0.003984971040914453, "loss": 2.6075, "step": 15827 }, { "crossentropy": 2.598633289337158, "epoch": 0.5738109048723898, "grad_norm": 0.027929501608014107, "grad_norm_var": 9.22188847746197e-07, "learning_rate": 0.003984402059727421, "loss": 2.5561, "step": 15828 }, { "crossentropy": 2.5897409915924072, "epoch": 0.5738471577726219, "grad_norm": 0.027364833280444145, "grad_norm_var": 9.31283625674878e-07, "learning_rate": 0.003983833092257581, "loss": 2.5934, "step": 15829 }, { "crossentropy": 2.4992516040802, "epoch": 0.5738834106728539, "grad_norm": 0.025828776881098747, "grad_norm_var": 9.586257658843276e-07, "learning_rate": 0.003983264138512616, "loss": 2.5277, "step": 15830 }, { "crossentropy": 2.7363553047180176, "epoch": 0.5739196635730859, "grad_norm": 0.02792874351143837, "grad_norm_var": 9.037154852656673e-07, "learning_rate": 0.0039826951985002116, "loss": 2.6601, "step": 15831 }, { "crossentropy": 2.50300931930542, "epoch": 0.5739559164733179, "grad_norm": 0.026294991374015808, "grad_norm_var": 6.319747755264537e-07, "learning_rate": 0.0039821262722280525, "loss": 2.4753, "step": 15832 }, { "crossentropy": 2.567572832107544, "epoch": 0.5739921693735499, "grad_norm": 0.026441048830747604, "grad_norm_var": 5.468980130311629e-07, "learning_rate": 0.003981557359703823, "loss": 2.5449, "step": 15833 }, { "crossentropy": 2.56467342376709, "epoch": 0.5740284222737819, "grad_norm": 0.028761517256498337, "grad_norm_var": 6.721526099595498e-07, "learning_rate": 0.003980988460935206, "loss": 2.455, "step": 15834 }, { "crossentropy": 2.7051963806152344, "epoch": 0.5740646751740139, "grad_norm": 0.027541832998394966, "grad_norm_var": 6.718297809736682e-07, "learning_rate": 0.003980419575929887, "loss": 2.6572, "step": 15835 }, { "crossentropy": 2.525141716003418, "epoch": 0.574100928074246, "grad_norm": 0.0264581385999918, "grad_norm_var": 5.769615001850747e-07, "learning_rate": 0.003979850704695549, "loss": 2.6237, "step": 15836 }, { "crossentropy": 2.440244436264038, "epoch": 0.574137180974478, "grad_norm": 0.026355277746915817, "grad_norm_var": 6.203826329967757e-07, "learning_rate": 0.003979281847239876, "loss": 2.53, "step": 15837 }, { "crossentropy": 2.5148165225982666, "epoch": 0.57417343387471, "grad_norm": 0.027298657223582268, "grad_norm_var": 5.722636023714703e-07, "learning_rate": 0.003978713003570551, "loss": 2.5821, "step": 15838 }, { "crossentropy": 2.4814276695251465, "epoch": 0.574209686774942, "grad_norm": 0.026408564299345016, "grad_norm_var": 5.832733016880008e-07, "learning_rate": 0.0039781441736952555, "loss": 2.508, "step": 15839 }, { "crossentropy": 2.507692813873291, "epoch": 0.574245939675174, "grad_norm": 0.02586926333606243, "grad_norm_var": 6.679784977250043e-07, "learning_rate": 0.003977575357621674, "loss": 2.4988, "step": 15840 }, { "crossentropy": 2.7003157138824463, "epoch": 0.574282192575406, "grad_norm": 0.027398165315389633, "grad_norm_var": 6.768751520354795e-07, "learning_rate": 0.003977006555357488, "loss": 2.663, "step": 15841 }, { "crossentropy": 2.5977845191955566, "epoch": 0.574318445475638, "grad_norm": 0.025475848466157913, "grad_norm_var": 8.111258466911384e-07, "learning_rate": 0.003976437766910381, "loss": 2.5242, "step": 15842 }, { "crossentropy": 2.6250131130218506, "epoch": 0.57435469837587, "grad_norm": 0.026507597416639328, "grad_norm_var": 8.201852479460068e-07, "learning_rate": 0.003975868992288036, "loss": 2.6086, "step": 15843 }, { "crossentropy": 2.571288824081421, "epoch": 0.5743909512761021, "grad_norm": 0.02641323208808899, "grad_norm_var": 7.489553765694171e-07, "learning_rate": 0.003975300231498134, "loss": 2.6031, "step": 15844 }, { "crossentropy": 2.5680010318756104, "epoch": 0.5744272041763341, "grad_norm": 0.02750491350889206, "grad_norm_var": 7.612607774394916e-07, "learning_rate": 0.0039747314845483585, "loss": 2.5197, "step": 15845 }, { "crossentropy": 2.6244077682495117, "epoch": 0.5744634570765661, "grad_norm": 0.027231955900788307, "grad_norm_var": 7.062760837467924e-07, "learning_rate": 0.003974162751446391, "loss": 2.6012, "step": 15846 }, { "crossentropy": 2.673264503479004, "epoch": 0.5744997099767981, "grad_norm": 0.026237601414322853, "grad_norm_var": 6.458659149411831e-07, "learning_rate": 0.003973594032199911, "loss": 2.564, "step": 15847 }, { "crossentropy": 2.4653773307800293, "epoch": 0.5745359628770301, "grad_norm": 0.028259051963686943, "grad_norm_var": 7.645559070908048e-07, "learning_rate": 0.0039730253268166005, "loss": 2.5668, "step": 15848 }, { "crossentropy": 2.6121654510498047, "epoch": 0.5745722157772621, "grad_norm": 0.028107844293117523, "grad_norm_var": 8.394933767043397e-07, "learning_rate": 0.003972456635304143, "loss": 2.5678, "step": 15849 }, { "crossentropy": 2.5021724700927734, "epoch": 0.5746084686774942, "grad_norm": 0.02770354226231575, "grad_norm_var": 6.59461282484031e-07, "learning_rate": 0.003971887957670215, "loss": 2.5014, "step": 15850 }, { "crossentropy": 2.474602222442627, "epoch": 0.5746447215777262, "grad_norm": 0.02626229077577591, "grad_norm_var": 6.562488374321774e-07, "learning_rate": 0.003971319293922503, "loss": 2.504, "step": 15851 }, { "crossentropy": 2.6024582386016846, "epoch": 0.5746809744779582, "grad_norm": 0.03020424395799637, "grad_norm_var": 1.3409764339179873e-06, "learning_rate": 0.003970750644068683, "loss": 2.5297, "step": 15852 }, { "crossentropy": 2.4524667263031006, "epoch": 0.5747172273781903, "grad_norm": 0.027778929099440575, "grad_norm_var": 1.3305811586423558e-06, "learning_rate": 0.00397018200811644, "loss": 2.53, "step": 15853 }, { "crossentropy": 2.542510986328125, "epoch": 0.5747534802784223, "grad_norm": 0.027132857590913773, "grad_norm_var": 1.3293745283125448e-06, "learning_rate": 0.00396961338607345, "loss": 2.6097, "step": 15854 }, { "crossentropy": 2.610399007797241, "epoch": 0.5747897331786543, "grad_norm": 0.026798581704497337, "grad_norm_var": 1.3000135553595248e-06, "learning_rate": 0.003969044777947397, "loss": 2.5473, "step": 15855 }, { "crossentropy": 2.624677896499634, "epoch": 0.5748259860788864, "grad_norm": 0.02744772657752037, "grad_norm_var": 1.1797973817779475e-06, "learning_rate": 0.003968476183745957, "loss": 2.5622, "step": 15856 }, { "crossentropy": 2.563493251800537, "epoch": 0.5748622389791184, "grad_norm": 0.028828918933868408, "grad_norm_var": 1.3304666455030925e-06, "learning_rate": 0.003967907603476812, "loss": 2.5403, "step": 15857 }, { "crossentropy": 2.464345932006836, "epoch": 0.5748984918793504, "grad_norm": 0.027268733829259872, "grad_norm_var": 1.0789409609608294e-06, "learning_rate": 0.003967339037147639, "loss": 2.483, "step": 15858 }, { "crossentropy": 2.4798803329467773, "epoch": 0.5749347447795824, "grad_norm": 0.026202674955129623, "grad_norm_var": 1.1243067685858988e-06, "learning_rate": 0.003966770484766123, "loss": 2.455, "step": 15859 }, { "crossentropy": 2.4489777088165283, "epoch": 0.5749709976798144, "grad_norm": 0.026640012860298157, "grad_norm_var": 1.095825882313596e-06, "learning_rate": 0.003966201946339938, "loss": 2.4168, "step": 15860 }, { "crossentropy": 2.561821699142456, "epoch": 0.5750072505800464, "grad_norm": 0.027479520067572594, "grad_norm_var": 1.0957669937160242e-06, "learning_rate": 0.003965633421876764, "loss": 2.6078, "step": 15861 }, { "crossentropy": 2.57944655418396, "epoch": 0.5750435034802784, "grad_norm": 0.026430346071720123, "grad_norm_var": 1.161801376893173e-06, "learning_rate": 0.0039650649113842805, "loss": 2.5084, "step": 15862 }, { "crossentropy": 2.641531467437744, "epoch": 0.5750797563805105, "grad_norm": 0.026740796864032745, "grad_norm_var": 1.0980327264657635e-06, "learning_rate": 0.003964496414870167, "loss": 2.6786, "step": 15863 }, { "crossentropy": 2.5438928604125977, "epoch": 0.5751160092807425, "grad_norm": 0.026475775986909866, "grad_norm_var": 1.1056979928255323e-06, "learning_rate": 0.003963927932342101, "loss": 2.5276, "step": 15864 }, { "crossentropy": 2.5435032844543457, "epoch": 0.5751522621809745, "grad_norm": 0.026300327852368355, "grad_norm_var": 1.125786436014832e-06, "learning_rate": 0.003963359463807759, "loss": 2.571, "step": 15865 }, { "crossentropy": 2.594616413116455, "epoch": 0.5751885150812065, "grad_norm": 0.0262764859944582, "grad_norm_var": 1.1631458795934343e-06, "learning_rate": 0.003962791009274821, "loss": 2.588, "step": 15866 }, { "crossentropy": 2.668724298477173, "epoch": 0.5752247679814385, "grad_norm": 0.02784368023276329, "grad_norm_var": 1.1340068049852376e-06, "learning_rate": 0.003962222568750964, "loss": 2.5852, "step": 15867 }, { "crossentropy": 2.4605555534362793, "epoch": 0.5752610208816705, "grad_norm": 0.026856793090701103, "grad_norm_var": 5.11592691011288e-07, "learning_rate": 0.003961654142243867, "loss": 2.4991, "step": 15868 }, { "crossentropy": 2.481363296508789, "epoch": 0.5752972737819025, "grad_norm": 0.026721080765128136, "grad_norm_var": 4.7609446235981574e-07, "learning_rate": 0.003961085729761207, "loss": 2.5786, "step": 15869 }, { "crossentropy": 2.540666341781616, "epoch": 0.5753335266821346, "grad_norm": 0.02799227088689804, "grad_norm_var": 5.414600611398461e-07, "learning_rate": 0.00396051733131066, "loss": 2.5612, "step": 15870 }, { "crossentropy": 2.658092975616455, "epoch": 0.5753697795823666, "grad_norm": 0.02792147733271122, "grad_norm_var": 5.872676320577026e-07, "learning_rate": 0.003959948946899905, "loss": 2.5505, "step": 15871 }, { "crossentropy": 2.499943256378174, "epoch": 0.5754060324825986, "grad_norm": 0.027222128584980965, "grad_norm_var": 5.796630658956667e-07, "learning_rate": 0.003959380576536617, "loss": 2.5372, "step": 15872 }, { "crossentropy": 2.5555179119110107, "epoch": 0.5754422853828306, "grad_norm": 0.026696518063545227, "grad_norm_var": 3.6520267504753547e-07, "learning_rate": 0.003958812220228473, "loss": 2.5954, "step": 15873 }, { "crossentropy": 2.4890146255493164, "epoch": 0.5754785382830626, "grad_norm": 0.027173733338713646, "grad_norm_var": 3.61625420373414e-07, "learning_rate": 0.00395824387798315, "loss": 2.5901, "step": 15874 }, { "crossentropy": 2.7465481758117676, "epoch": 0.5755147911832946, "grad_norm": 0.02615448273718357, "grad_norm_var": 3.664816960802549e-07, "learning_rate": 0.003957675549808325, "loss": 2.6441, "step": 15875 }, { "crossentropy": 2.491772413253784, "epoch": 0.5755510440835266, "grad_norm": 0.02635466307401657, "grad_norm_var": 3.8271179355098213e-07, "learning_rate": 0.003957107235711673, "loss": 2.6069, "step": 15876 }, { "crossentropy": 2.541874647140503, "epoch": 0.5755872969837587, "grad_norm": 0.026423096656799316, "grad_norm_var": 3.7294811220337647e-07, "learning_rate": 0.003956538935700871, "loss": 2.61, "step": 15877 }, { "crossentropy": 2.5301513671875, "epoch": 0.5756235498839907, "grad_norm": 0.027065806090831757, "grad_norm_var": 3.627163043787643e-07, "learning_rate": 0.003955970649783594, "loss": 2.582, "step": 15878 }, { "crossentropy": 2.48767352104187, "epoch": 0.5756598027842227, "grad_norm": 0.027261273935437202, "grad_norm_var": 3.6938366037250236e-07, "learning_rate": 0.003955402377967516, "loss": 2.428, "step": 15879 }, { "crossentropy": 2.4036951065063477, "epoch": 0.5756960556844548, "grad_norm": 0.027798539027571678, "grad_norm_var": 4.0017696636187225e-07, "learning_rate": 0.003954834120260318, "loss": 2.43, "step": 15880 }, { "crossentropy": 2.638859510421753, "epoch": 0.5757323085846868, "grad_norm": 0.028087828308343887, "grad_norm_var": 4.321902265370329e-07, "learning_rate": 0.003954265876669669, "loss": 2.5138, "step": 15881 }, { "crossentropy": 2.6332077980041504, "epoch": 0.5757685614849188, "grad_norm": 0.02651629038155079, "grad_norm_var": 4.0895408208196405e-07, "learning_rate": 0.003953697647203246, "loss": 2.587, "step": 15882 }, { "crossentropy": 2.576038360595703, "epoch": 0.5758048143851509, "grad_norm": 0.02620161697268486, "grad_norm_var": 4.213551249081556e-07, "learning_rate": 0.003953129431868724, "loss": 2.5542, "step": 15883 }, { "crossentropy": 2.4837253093719482, "epoch": 0.5758410672853829, "grad_norm": 0.026777608320116997, "grad_norm_var": 4.2355434701674536e-07, "learning_rate": 0.003952561230673778, "loss": 2.4543, "step": 15884 }, { "crossentropy": 2.60341215133667, "epoch": 0.5758773201856149, "grad_norm": 0.02810516767203808, "grad_norm_var": 4.875629404641937e-07, "learning_rate": 0.003951993043626083, "loss": 2.6102, "step": 15885 }, { "crossentropy": 2.544914960861206, "epoch": 0.5759135730858469, "grad_norm": 0.026231730356812477, "grad_norm_var": 4.7406870787066133e-07, "learning_rate": 0.003951424870733311, "loss": 2.63, "step": 15886 }, { "crossentropy": 2.654179096221924, "epoch": 0.5759498259860789, "grad_norm": 0.026545949280261993, "grad_norm_var": 4.23229006769526e-07, "learning_rate": 0.003950856712003137, "loss": 2.5487, "step": 15887 }, { "crossentropy": 2.4923577308654785, "epoch": 0.5759860788863109, "grad_norm": 0.026546737179160118, "grad_norm_var": 4.239483604423355e-07, "learning_rate": 0.003950288567443239, "loss": 2.4971, "step": 15888 }, { "crossentropy": 2.6680033206939697, "epoch": 0.5760223317865429, "grad_norm": 0.02717013843357563, "grad_norm_var": 4.2692980223811107e-07, "learning_rate": 0.003949720437061284, "loss": 2.55, "step": 15889 }, { "crossentropy": 2.6431946754455566, "epoch": 0.576058584686775, "grad_norm": 0.027275966480374336, "grad_norm_var": 4.313018197647673e-07, "learning_rate": 0.0039491523208649475, "loss": 2.6069, "step": 15890 }, { "crossentropy": 2.663391351699829, "epoch": 0.576094837587007, "grad_norm": 0.028057845309376717, "grad_norm_var": 4.666734050921712e-07, "learning_rate": 0.003948584218861904, "loss": 2.5885, "step": 15891 }, { "crossentropy": 2.5710525512695312, "epoch": 0.576131090487239, "grad_norm": 0.02743213064968586, "grad_norm_var": 4.4274786515524113e-07, "learning_rate": 0.003948016131059827, "loss": 2.551, "step": 15892 }, { "crossentropy": 2.4913744926452637, "epoch": 0.576167343387471, "grad_norm": 0.02732899971306324, "grad_norm_var": 4.130500557577207e-07, "learning_rate": 0.0039474480574663885, "loss": 2.5092, "step": 15893 }, { "crossentropy": 2.5386962890625, "epoch": 0.576203596287703, "grad_norm": 0.027186501771211624, "grad_norm_var": 4.1260195991389925e-07, "learning_rate": 0.003946879998089261, "loss": 2.4999, "step": 15894 }, { "crossentropy": 2.509312391281128, "epoch": 0.576239849187935, "grad_norm": 0.02826300635933876, "grad_norm_var": 4.891431019176147e-07, "learning_rate": 0.003946311952936117, "loss": 2.5265, "step": 15895 }, { "crossentropy": 2.556609630584717, "epoch": 0.576276102088167, "grad_norm": 0.027599409222602844, "grad_norm_var": 4.762708623135455e-07, "learning_rate": 0.003945743922014629, "loss": 2.5932, "step": 15896 }, { "crossentropy": 2.4778761863708496, "epoch": 0.5763123549883991, "grad_norm": 0.026344716548919678, "grad_norm_var": 4.6167247088052303e-07, "learning_rate": 0.003945175905332473, "loss": 2.5116, "step": 15897 }, { "crossentropy": 2.677908182144165, "epoch": 0.5763486078886311, "grad_norm": 0.027333438396453857, "grad_norm_var": 4.3991891185561567e-07, "learning_rate": 0.003944607902897313, "loss": 2.6291, "step": 15898 }, { "crossentropy": 2.5613574981689453, "epoch": 0.5763848607888631, "grad_norm": 0.027095871046185493, "grad_norm_var": 3.7681299998431245e-07, "learning_rate": 0.003944039914716826, "loss": 2.5979, "step": 15899 }, { "crossentropy": 2.4827630519866943, "epoch": 0.5764211136890951, "grad_norm": 0.026477420702576637, "grad_norm_var": 3.9958946216652144e-07, "learning_rate": 0.003943471940798682, "loss": 2.4917, "step": 15900 }, { "crossentropy": 2.5012598037719727, "epoch": 0.5764573665893271, "grad_norm": 0.026270082220435143, "grad_norm_var": 3.854514631144885e-07, "learning_rate": 0.0039429039811505525, "loss": 2.4737, "step": 15901 }, { "crossentropy": 2.5193591117858887, "epoch": 0.5764936194895591, "grad_norm": 0.02666218765079975, "grad_norm_var": 3.487771177367848e-07, "learning_rate": 0.00394233603578011, "loss": 2.6173, "step": 15902 }, { "crossentropy": 2.5347485542297363, "epoch": 0.5765298723897911, "grad_norm": 0.025885077193379402, "grad_norm_var": 4.248421353658516e-07, "learning_rate": 0.003941768104695023, "loss": 2.5018, "step": 15903 }, { "crossentropy": 2.656822681427002, "epoch": 0.5765661252900232, "grad_norm": 0.02703923173248768, "grad_norm_var": 4.064227370403992e-07, "learning_rate": 0.003941200187902965, "loss": 2.588, "step": 15904 }, { "crossentropy": 2.442286968231201, "epoch": 0.5766023781902552, "grad_norm": 0.025855422019958496, "grad_norm_var": 5.002078307653664e-07, "learning_rate": 0.003940632285411606, "loss": 2.4721, "step": 15905 }, { "crossentropy": 2.417769432067871, "epoch": 0.5766386310904872, "grad_norm": 0.02639530599117279, "grad_norm_var": 5.170636505112304e-07, "learning_rate": 0.003940064397228614, "loss": 2.4307, "step": 15906 }, { "crossentropy": 2.585010290145874, "epoch": 0.5766748839907193, "grad_norm": 0.02723432332277298, "grad_norm_var": 4.3798863342033267e-07, "learning_rate": 0.003939496523361661, "loss": 2.5567, "step": 15907 }, { "crossentropy": 2.6489906311035156, "epoch": 0.5767111368909513, "grad_norm": 0.027916194871068, "grad_norm_var": 4.869656362324962e-07, "learning_rate": 0.003938928663818417, "loss": 2.6138, "step": 15908 }, { "crossentropy": 2.684018611907959, "epoch": 0.5767473897911833, "grad_norm": 0.026601111516356468, "grad_norm_var": 4.813994438246371e-07, "learning_rate": 0.0039383608186065525, "loss": 2.6787, "step": 15909 }, { "crossentropy": 2.6146936416625977, "epoch": 0.5767836426914154, "grad_norm": 0.026934528723359108, "grad_norm_var": 4.752367501924809e-07, "learning_rate": 0.003937792987733735, "loss": 2.621, "step": 15910 }, { "crossentropy": 2.7565414905548096, "epoch": 0.5768198955916474, "grad_norm": 0.02952772192656994, "grad_norm_var": 8.102403112502284e-07, "learning_rate": 0.003937225171207635, "loss": 2.5919, "step": 15911 }, { "crossentropy": 2.7746357917785645, "epoch": 0.5768561484918794, "grad_norm": 0.02777157537639141, "grad_norm_var": 8.27040500648426e-07, "learning_rate": 0.003936657369035922, "loss": 2.6659, "step": 15912 }, { "crossentropy": 2.326429605484009, "epoch": 0.5768924013921114, "grad_norm": 0.026126110926270485, "grad_norm_var": 8.479324334701351e-07, "learning_rate": 0.003936089581226265, "loss": 2.46, "step": 15913 }, { "crossentropy": 2.6338155269622803, "epoch": 0.5769286542923434, "grad_norm": 0.026067087426781654, "grad_norm_var": 8.826327879923271e-07, "learning_rate": 0.003935521807786334, "loss": 2.5106, "step": 15914 }, { "crossentropy": 2.5987584590911865, "epoch": 0.5769649071925754, "grad_norm": 0.02671808935701847, "grad_norm_var": 8.799841560876703e-07, "learning_rate": 0.003934954048723797, "loss": 2.5322, "step": 15915 }, { "crossentropy": 2.6229336261749268, "epoch": 0.5770011600928074, "grad_norm": 0.027208831161260605, "grad_norm_var": 8.778072269815506e-07, "learning_rate": 0.00393438630404632, "loss": 2.5062, "step": 15916 }, { "crossentropy": 2.5810019969940186, "epoch": 0.5770374129930395, "grad_norm": 0.02622251957654953, "grad_norm_var": 8.818691900153482e-07, "learning_rate": 0.003933818573761573, "loss": 2.573, "step": 15917 }, { "crossentropy": 2.479400873184204, "epoch": 0.5770736658932715, "grad_norm": 0.027323566377162933, "grad_norm_var": 8.895302930249255e-07, "learning_rate": 0.003933250857877224, "loss": 2.4902, "step": 15918 }, { "crossentropy": 2.591313123703003, "epoch": 0.5771099187935035, "grad_norm": 0.027425570413470268, "grad_norm_var": 8.239083210649183e-07, "learning_rate": 0.003932683156400942, "loss": 2.6717, "step": 15919 }, { "crossentropy": 2.612163782119751, "epoch": 0.5771461716937355, "grad_norm": 0.026369107887148857, "grad_norm_var": 8.505201219490809e-07, "learning_rate": 0.003932115469340393, "loss": 2.6109, "step": 15920 }, { "crossentropy": 2.2934460639953613, "epoch": 0.5771824245939675, "grad_norm": 0.026649171486496925, "grad_norm_var": 7.707668640142267e-07, "learning_rate": 0.0039315477967032456, "loss": 2.4115, "step": 15921 }, { "crossentropy": 2.4410581588745117, "epoch": 0.5772186774941995, "grad_norm": 0.027859102934598923, "grad_norm_var": 7.806786795104866e-07, "learning_rate": 0.0039309801384971674, "loss": 2.5215, "step": 15922 }, { "crossentropy": 2.443481683731079, "epoch": 0.5772549303944315, "grad_norm": 0.027970273047685623, "grad_norm_var": 8.255359364506594e-07, "learning_rate": 0.003930412494729824, "loss": 2.4647, "step": 15923 }, { "crossentropy": 2.552833080291748, "epoch": 0.5772911832946636, "grad_norm": 0.026195991784334183, "grad_norm_var": 8.389100599059026e-07, "learning_rate": 0.0039298448654088835, "loss": 2.5434, "step": 15924 }, { "crossentropy": 2.5185513496398926, "epoch": 0.5773274361948956, "grad_norm": 0.02637111395597458, "grad_norm_var": 8.563085269694722e-07, "learning_rate": 0.003929277250542011, "loss": 2.5842, "step": 15925 }, { "crossentropy": 2.6048200130462646, "epoch": 0.5773636890951276, "grad_norm": 0.0268208310008049, "grad_norm_var": 8.588104791854937e-07, "learning_rate": 0.003928709650136874, "loss": 2.5359, "step": 15926 }, { "crossentropy": 2.5184454917907715, "epoch": 0.5773999419953596, "grad_norm": 0.027307778596878052, "grad_norm_var": 4.3022616255035337e-07, "learning_rate": 0.003928142064201139, "loss": 2.4781, "step": 15927 }, { "crossentropy": 2.519270658493042, "epoch": 0.5774361948955916, "grad_norm": 0.029170431196689606, "grad_norm_var": 7.150087668202886e-07, "learning_rate": 0.003927574492742473, "loss": 2.5325, "step": 15928 }, { "crossentropy": 2.520294666290283, "epoch": 0.5774724477958236, "grad_norm": 0.02649756520986557, "grad_norm_var": 6.809529243789833e-07, "learning_rate": 0.00392700693576854, "loss": 2.5081, "step": 15929 }, { "crossentropy": 2.6007654666900635, "epoch": 0.5775087006960556, "grad_norm": 0.02719663828611374, "grad_norm_var": 6.185264028298054e-07, "learning_rate": 0.003926439393287008, "loss": 2.6193, "step": 15930 }, { "crossentropy": 2.6085829734802246, "epoch": 0.5775449535962877, "grad_norm": 0.025993691757321358, "grad_norm_var": 6.864394900774062e-07, "learning_rate": 0.003925871865305541, "loss": 2.4829, "step": 15931 }, { "crossentropy": 2.4677047729492188, "epoch": 0.5775812064965197, "grad_norm": 0.028084391728043556, "grad_norm_var": 7.544838120168561e-07, "learning_rate": 0.003925304351831803, "loss": 2.5203, "step": 15932 }, { "crossentropy": 2.5092058181762695, "epoch": 0.5776174593967517, "grad_norm": 0.026477137580513954, "grad_norm_var": 7.290479041822488e-07, "learning_rate": 0.003924736852873463, "loss": 2.5212, "step": 15933 }, { "crossentropy": 2.604947566986084, "epoch": 0.5776537122969838, "grad_norm": 0.026283755898475647, "grad_norm_var": 7.66601351082317e-07, "learning_rate": 0.00392416936843818, "loss": 2.552, "step": 15934 }, { "crossentropy": 2.763807773590088, "epoch": 0.5776899651972158, "grad_norm": 0.028439562767744064, "grad_norm_var": 8.827162900440124e-07, "learning_rate": 0.003923601898533625, "loss": 2.7294, "step": 15935 }, { "crossentropy": 2.5965659618377686, "epoch": 0.5777262180974478, "grad_norm": 0.030858907848596573, "grad_norm_var": 1.7018309346393419e-06, "learning_rate": 0.003923034443167458, "loss": 2.6773, "step": 15936 }, { "crossentropy": 2.604710340499878, "epoch": 0.5777624709976799, "grad_norm": 0.027781452983617783, "grad_norm_var": 1.6707168763836159e-06, "learning_rate": 0.003922467002347348, "loss": 2.6499, "step": 15937 }, { "crossentropy": 2.528440475463867, "epoch": 0.5777987238979119, "grad_norm": 0.025791078805923462, "grad_norm_var": 1.8270795015574848e-06, "learning_rate": 0.003921899576080954, "loss": 2.5449, "step": 15938 }, { "crossentropy": 2.6442646980285645, "epoch": 0.5778349767981439, "grad_norm": 0.026925373822450638, "grad_norm_var": 1.8057720722018522e-06, "learning_rate": 0.003921332164375944, "loss": 2.5427, "step": 15939 }, { "crossentropy": 2.5507049560546875, "epoch": 0.5778712296983759, "grad_norm": 0.027558760717511177, "grad_norm_var": 1.7281048308716759e-06, "learning_rate": 0.003920764767239979, "loss": 2.5857, "step": 15940 }, { "crossentropy": 2.558422803878784, "epoch": 0.5779074825986079, "grad_norm": 0.027530984953045845, "grad_norm_var": 1.661203313410214e-06, "learning_rate": 0.003920197384680723, "loss": 2.5788, "step": 15941 }, { "crossentropy": 2.433196544647217, "epoch": 0.5779437354988399, "grad_norm": 0.026983119547367096, "grad_norm_var": 1.6498865502718169e-06, "learning_rate": 0.003919630016705841, "loss": 2.4115, "step": 15942 }, { "crossentropy": 2.634962320327759, "epoch": 0.5779799883990719, "grad_norm": 0.027267538011074066, "grad_norm_var": 1.650643736766224e-06, "learning_rate": 0.003919062663322994, "loss": 2.6108, "step": 15943 }, { "crossentropy": 2.4393415451049805, "epoch": 0.578016241299304, "grad_norm": 0.02734578400850296, "grad_norm_var": 1.434702001266288e-06, "learning_rate": 0.003918495324539847, "loss": 2.603, "step": 15944 }, { "crossentropy": 2.467108964920044, "epoch": 0.578052494199536, "grad_norm": 0.026582900434732437, "grad_norm_var": 1.425873584149255e-06, "learning_rate": 0.003917928000364062, "loss": 2.4946, "step": 15945 }, { "crossentropy": 2.8170769214630127, "epoch": 0.578088747099768, "grad_norm": 0.03008163906633854, "grad_norm_var": 1.899077164776534e-06, "learning_rate": 0.003917360690803302, "loss": 2.7159, "step": 15946 }, { "crossentropy": 2.6127939224243164, "epoch": 0.578125, "grad_norm": 0.0322076641023159, "grad_norm_var": 3.0651177230781584e-06, "learning_rate": 0.0039167933958652295, "loss": 2.6081, "step": 15947 }, { "crossentropy": 2.6575188636779785, "epoch": 0.578161252900232, "grad_norm": 0.028498847037553787, "grad_norm_var": 3.0867337440287335e-06, "learning_rate": 0.003916226115557504, "loss": 2.6253, "step": 15948 }, { "crossentropy": 2.5243489742279053, "epoch": 0.578197505800464, "grad_norm": 0.027647780254483223, "grad_norm_var": 2.9482029508045546e-06, "learning_rate": 0.00391565884988779, "loss": 2.5067, "step": 15949 }, { "crossentropy": 2.6055142879486084, "epoch": 0.578233758700696, "grad_norm": 0.025961391627788544, "grad_norm_var": 3.0278881424089867e-06, "learning_rate": 0.003915091598863748, "loss": 2.5928, "step": 15950 }, { "crossentropy": 2.6185407638549805, "epoch": 0.5782700116009281, "grad_norm": 0.027817871421575546, "grad_norm_var": 3.0128249071734516e-06, "learning_rate": 0.003914524362493042, "loss": 2.5297, "step": 15951 }, { "crossentropy": 2.567819356918335, "epoch": 0.5783062645011601, "grad_norm": 0.026542052626609802, "grad_norm_var": 2.4903049530733414e-06, "learning_rate": 0.00391395714078333, "loss": 2.5554, "step": 15952 }, { "crossentropy": 2.509542465209961, "epoch": 0.5783425174013921, "grad_norm": 0.027514623478055, "grad_norm_var": 2.490354345382204e-06, "learning_rate": 0.003913389933742276, "loss": 2.4587, "step": 15953 }, { "crossentropy": 2.553770065307617, "epoch": 0.5783787703016241, "grad_norm": 0.02634773775935173, "grad_norm_var": 2.3724112699916812e-06, "learning_rate": 0.003912822741377541, "loss": 2.5611, "step": 15954 }, { "crossentropy": 2.5795369148254395, "epoch": 0.5784150232018561, "grad_norm": 0.027167007327079773, "grad_norm_var": 2.3518808059253776e-06, "learning_rate": 0.003912255563696782, "loss": 2.556, "step": 15955 }, { "crossentropy": 2.581085443496704, "epoch": 0.5784512761020881, "grad_norm": 0.028476115316152573, "grad_norm_var": 2.388304588874651e-06, "learning_rate": 0.003911688400707667, "loss": 2.5817, "step": 15956 }, { "crossentropy": 2.644251585006714, "epoch": 0.5784875290023201, "grad_norm": 0.02745814621448517, "grad_norm_var": 2.3907468646540505e-06, "learning_rate": 0.003911121252417848, "loss": 2.5629, "step": 15957 }, { "crossentropy": 2.425896644592285, "epoch": 0.5785237819025522, "grad_norm": 0.027922121807932854, "grad_norm_var": 2.350621822876965e-06, "learning_rate": 0.003910554118834989, "loss": 2.5241, "step": 15958 }, { "crossentropy": 2.4069933891296387, "epoch": 0.5785600348027842, "grad_norm": 0.026807736605405807, "grad_norm_var": 2.3966292559500038e-06, "learning_rate": 0.003909986999966751, "loss": 2.4397, "step": 15959 }, { "crossentropy": 2.3893179893493652, "epoch": 0.5785962877030162, "grad_norm": 0.02711881697177887, "grad_norm_var": 2.412799007507858e-06, "learning_rate": 0.003909419895820792, "loss": 2.4579, "step": 15960 }, { "crossentropy": 2.5855400562286377, "epoch": 0.5786325406032483, "grad_norm": 0.027287790551781654, "grad_norm_var": 2.3332676112338176e-06, "learning_rate": 0.003908852806404772, "loss": 2.581, "step": 15961 }, { "crossentropy": 2.511533260345459, "epoch": 0.5786687935034803, "grad_norm": 0.027294157072901726, "grad_norm_var": 1.9722243965141722e-06, "learning_rate": 0.003908285731726351, "loss": 2.5575, "step": 15962 }, { "crossentropy": 2.706282138824463, "epoch": 0.5787050464037123, "grad_norm": 0.0288919098675251, "grad_norm_var": 6.352954698590849e-07, "learning_rate": 0.003907718671793187, "loss": 2.6964, "step": 15963 }, { "crossentropy": 2.660860300064087, "epoch": 0.5787412993039444, "grad_norm": 0.02825678512454033, "grad_norm_var": 6.042066871760384e-07, "learning_rate": 0.003907151626612944, "loss": 2.5637, "step": 15964 }, { "crossentropy": 2.625281810760498, "epoch": 0.5787775522041764, "grad_norm": 0.02596968412399292, "grad_norm_var": 7.263340616967909e-07, "learning_rate": 0.003906584596193273, "loss": 2.6008, "step": 15965 }, { "crossentropy": 2.5671112537384033, "epoch": 0.5788138051044084, "grad_norm": 0.028380276635289192, "grad_norm_var": 6.596122573470821e-07, "learning_rate": 0.0039060175805418367, "loss": 2.5396, "step": 15966 }, { "crossentropy": 2.3891260623931885, "epoch": 0.5788500580046404, "grad_norm": 0.026565836742520332, "grad_norm_var": 6.967262196330704e-07, "learning_rate": 0.0039054505796662937, "loss": 2.461, "step": 15967 }, { "crossentropy": 2.4809179306030273, "epoch": 0.5788863109048724, "grad_norm": 0.02586696669459343, "grad_norm_var": 8.00189330882733e-07, "learning_rate": 0.003904883593574302, "loss": 2.538, "step": 15968 }, { "crossentropy": 2.5559589862823486, "epoch": 0.5789225638051044, "grad_norm": 0.026099102571606636, "grad_norm_var": 8.911146491601014e-07, "learning_rate": 0.003904316622273519, "loss": 2.52, "step": 15969 }, { "crossentropy": 2.4958832263946533, "epoch": 0.5789588167053364, "grad_norm": 0.029677614569664, "grad_norm_var": 1.1860221042903998e-06, "learning_rate": 0.0039037496657716025, "loss": 2.4897, "step": 15970 }, { "crossentropy": 2.561903953552246, "epoch": 0.5789950696055685, "grad_norm": 0.02765248715877533, "grad_norm_var": 1.1822723704336493e-06, "learning_rate": 0.00390318272407621, "loss": 2.6269, "step": 15971 }, { "crossentropy": 2.7268810272216797, "epoch": 0.5790313225058005, "grad_norm": 0.02788354828953743, "grad_norm_var": 1.1257412571910135e-06, "learning_rate": 0.0039026157971950018, "loss": 2.6659, "step": 15972 }, { "crossentropy": 2.38464617729187, "epoch": 0.5790675754060325, "grad_norm": 0.026088546961545944, "grad_norm_var": 1.24072637298165e-06, "learning_rate": 0.0039020488851356306, "loss": 2.4048, "step": 15973 }, { "crossentropy": 2.432328224182129, "epoch": 0.5791038283062645, "grad_norm": 0.025700507685542107, "grad_norm_var": 1.3827530296994217e-06, "learning_rate": 0.0039014819879057546, "loss": 2.4439, "step": 15974 }, { "crossentropy": 2.5099966526031494, "epoch": 0.5791400812064965, "grad_norm": 0.025992967188358307, "grad_norm_var": 1.4691780156149877e-06, "learning_rate": 0.003900915105513032, "loss": 2.4866, "step": 15975 }, { "crossentropy": 2.4890339374542236, "epoch": 0.5791763341067285, "grad_norm": 0.026688801124691963, "grad_norm_var": 1.4836947970197119e-06, "learning_rate": 0.003900348237965119, "loss": 2.5585, "step": 15976 }, { "crossentropy": 2.641817092895508, "epoch": 0.5792125870069605, "grad_norm": 0.02613813802599907, "grad_norm_var": 1.5441926503063803e-06, "learning_rate": 0.003899781385269672, "loss": 2.5607, "step": 15977 }, { "crossentropy": 2.61708402633667, "epoch": 0.5792488399071926, "grad_norm": 0.02635030262172222, "grad_norm_var": 1.5718768849698819e-06, "learning_rate": 0.0038992145474343472, "loss": 2.5676, "step": 15978 }, { "crossentropy": 2.5144689083099365, "epoch": 0.5792850928074246, "grad_norm": 0.02572719007730484, "grad_norm_var": 1.404893502678778e-06, "learning_rate": 0.003898647724466799, "loss": 2.4892, "step": 15979 }, { "crossentropy": 2.629612922668457, "epoch": 0.5793213457076566, "grad_norm": 0.028389936313033104, "grad_norm_var": 1.431599681283206e-06, "learning_rate": 0.003898080916374686, "loss": 2.5541, "step": 15980 }, { "crossentropy": 2.4347853660583496, "epoch": 0.5793575986078886, "grad_norm": 0.026112886145710945, "grad_norm_var": 1.4165838208295028e-06, "learning_rate": 0.0038975141231656642, "loss": 2.5093, "step": 15981 }, { "crossentropy": 2.5569450855255127, "epoch": 0.5793938515081206, "grad_norm": 0.027542265132069588, "grad_norm_var": 1.2875005018540716e-06, "learning_rate": 0.0038969473448473846, "loss": 2.5888, "step": 15982 }, { "crossentropy": 2.6173336505889893, "epoch": 0.5794301044083526, "grad_norm": 0.0255398191511631, "grad_norm_var": 1.3825682254235319e-06, "learning_rate": 0.0038963805814275056, "loss": 2.5193, "step": 15983 }, { "crossentropy": 2.585235834121704, "epoch": 0.5794663573085846, "grad_norm": 0.026775218546390533, "grad_norm_var": 1.3313449121613347e-06, "learning_rate": 0.0038958138329136817, "loss": 2.5687, "step": 15984 }, { "crossentropy": 2.370624542236328, "epoch": 0.5795026102088167, "grad_norm": 0.02694832906126976, "grad_norm_var": 1.3001748199671531e-06, "learning_rate": 0.003895247099313568, "loss": 2.4913, "step": 15985 }, { "crossentropy": 2.674593687057495, "epoch": 0.5795388631090487, "grad_norm": 0.02746478095650673, "grad_norm_var": 7.647239829762047e-07, "learning_rate": 0.0038946803806348186, "loss": 2.5904, "step": 15986 }, { "crossentropy": 2.6053993701934814, "epoch": 0.5795751160092807, "grad_norm": 0.02547175996005535, "grad_norm_var": 7.812863132808697e-07, "learning_rate": 0.0038941136768850873, "loss": 2.5139, "step": 15987 }, { "crossentropy": 2.6658382415771484, "epoch": 0.5796113689095128, "grad_norm": 0.028027022257447243, "grad_norm_var": 8.080655263170122e-07, "learning_rate": 0.003893546988072029, "loss": 2.5876, "step": 15988 }, { "crossentropy": 2.479241132736206, "epoch": 0.5796476218097448, "grad_norm": 0.027192745357751846, "grad_norm_var": 8.148726163336705e-07, "learning_rate": 0.003892980314203301, "loss": 2.506, "step": 15989 }, { "crossentropy": 2.546659469604492, "epoch": 0.5796838747099768, "grad_norm": 0.02747935615479946, "grad_norm_var": 7.924415874067831e-07, "learning_rate": 0.0038924136552865502, "loss": 2.5485, "step": 15990 }, { "crossentropy": 2.4731521606445312, "epoch": 0.5797201276102089, "grad_norm": 0.026085149496793747, "grad_norm_var": 7.837897586605858e-07, "learning_rate": 0.0038918470113294347, "loss": 2.478, "step": 15991 }, { "crossentropy": 2.6350302696228027, "epoch": 0.5797563805104409, "grad_norm": 0.02638237550854683, "grad_norm_var": 7.919893877007898e-07, "learning_rate": 0.0038912803823396075, "loss": 2.5767, "step": 15992 }, { "crossentropy": 2.401850938796997, "epoch": 0.5797926334106729, "grad_norm": 0.026772288605570793, "grad_norm_var": 7.6735825518161e-07, "learning_rate": 0.0038907137683247208, "loss": 2.5016, "step": 15993 }, { "crossentropy": 2.5982532501220703, "epoch": 0.5798288863109049, "grad_norm": 0.02681044116616249, "grad_norm_var": 7.550666343868513e-07, "learning_rate": 0.003890147169292428, "loss": 2.4755, "step": 15994 }, { "crossentropy": 2.7332448959350586, "epoch": 0.5798651392111369, "grad_norm": 0.027418281883001328, "grad_norm_var": 6.930129491134005e-07, "learning_rate": 0.0038895805852503813, "loss": 2.6068, "step": 15995 }, { "crossentropy": 2.5243871212005615, "epoch": 0.5799013921113689, "grad_norm": 0.026790494099259377, "grad_norm_var": 5.353278237840796e-07, "learning_rate": 0.003889014016206235, "loss": 2.5348, "step": 15996 }, { "crossentropy": 2.5846312046051025, "epoch": 0.5799376450116009, "grad_norm": 0.026452235877513885, "grad_norm_var": 5.113982602597544e-07, "learning_rate": 0.0038884474621676403, "loss": 2.5946, "step": 15997 }, { "crossentropy": 2.5280940532684326, "epoch": 0.579973897911833, "grad_norm": 0.026676036417484283, "grad_norm_var": 4.75110758884585e-07, "learning_rate": 0.0038878809231422504, "loss": 2.5154, "step": 15998 }, { "crossentropy": 2.5871570110321045, "epoch": 0.580010150812065, "grad_norm": 0.02627771906554699, "grad_norm_var": 3.883154122895823e-07, "learning_rate": 0.0038873143991377153, "loss": 2.5741, "step": 15999 }, { "crossentropy": 2.53275465965271, "epoch": 0.580046403712297, "grad_norm": 0.02708536759018898, "grad_norm_var": 3.9272309583036206e-07, "learning_rate": 0.0038867478901616877, "loss": 2.5172, "step": 16000 }, { "crossentropy": 2.5918259620666504, "epoch": 0.580082656612529, "grad_norm": 0.026979872956871986, "grad_norm_var": 3.932686634711222e-07, "learning_rate": 0.0038861813962218195, "loss": 2.5486, "step": 16001 }, { "crossentropy": 2.506756067276001, "epoch": 0.580118909512761, "grad_norm": 0.027155352756381035, "grad_norm_var": 3.7328512903402287e-07, "learning_rate": 0.0038856149173257617, "loss": 2.5118, "step": 16002 }, { "crossentropy": 2.4474923610687256, "epoch": 0.580155162412993, "grad_norm": 0.027710244059562683, "grad_norm_var": 2.8524351524913894e-07, "learning_rate": 0.0038850484534811644, "loss": 2.4657, "step": 16003 }, { "crossentropy": 2.644932746887207, "epoch": 0.580191415313225, "grad_norm": 0.02877635695040226, "grad_norm_var": 4.273509873349655e-07, "learning_rate": 0.0038844820046956808, "loss": 2.5942, "step": 16004 }, { "crossentropy": 2.550104856491089, "epoch": 0.5802276682134571, "grad_norm": 0.026452722027897835, "grad_norm_var": 4.428333064682502e-07, "learning_rate": 0.00388391557097696, "loss": 2.5665, "step": 16005 }, { "crossentropy": 2.4219350814819336, "epoch": 0.5802639211136891, "grad_norm": 0.026512136682868004, "grad_norm_var": 4.338763881089313e-07, "learning_rate": 0.003883349152332655, "loss": 2.4913, "step": 16006 }, { "crossentropy": 2.632175922393799, "epoch": 0.5803001740139211, "grad_norm": 0.02648361772298813, "grad_norm_var": 4.0071662107861914e-07, "learning_rate": 0.0038827827487704137, "loss": 2.5981, "step": 16007 }, { "crossentropy": 2.4803617000579834, "epoch": 0.5803364269141531, "grad_norm": 0.027365779504179955, "grad_norm_var": 3.905383465667289e-07, "learning_rate": 0.003882216360297886, "loss": 2.5112, "step": 16008 }, { "crossentropy": 2.565936803817749, "epoch": 0.5803726798143851, "grad_norm": 0.02879432402551174, "grad_norm_var": 5.894213015360337e-07, "learning_rate": 0.003881649986922723, "loss": 2.5864, "step": 16009 }, { "crossentropy": 2.53285813331604, "epoch": 0.5804089327146171, "grad_norm": 0.02660008706152439, "grad_norm_var": 6.005553090812632e-07, "learning_rate": 0.0038810836286525736, "loss": 2.4959, "step": 16010 }, { "crossentropy": 2.4502017498016357, "epoch": 0.5804451856148491, "grad_norm": 0.02554839290678501, "grad_norm_var": 7.386510645923979e-07, "learning_rate": 0.003880517285495087, "loss": 2.4749, "step": 16011 }, { "crossentropy": 2.545198440551758, "epoch": 0.5804814385150812, "grad_norm": 0.028228631243109703, "grad_norm_var": 8.318087286112436e-07, "learning_rate": 0.0038799509574579147, "loss": 2.5173, "step": 16012 }, { "crossentropy": 2.3494136333465576, "epoch": 0.5805176914153132, "grad_norm": 0.027253856882452965, "grad_norm_var": 8.060837401233552e-07, "learning_rate": 0.003879384644548704, "loss": 2.4515, "step": 16013 }, { "crossentropy": 2.2587509155273438, "epoch": 0.5805539443155452, "grad_norm": 0.026854176074266434, "grad_norm_var": 7.975510468909963e-07, "learning_rate": 0.003878818346775106, "loss": 2.4529, "step": 16014 }, { "crossentropy": 2.5203213691711426, "epoch": 0.5805901972157773, "grad_norm": 0.028174281120300293, "grad_norm_var": 8.068613166236354e-07, "learning_rate": 0.003878252064144766, "loss": 2.5419, "step": 16015 }, { "crossentropy": 2.4306397438049316, "epoch": 0.5806264501160093, "grad_norm": 0.03023766167461872, "grad_norm_var": 1.3593767072841412e-06, "learning_rate": 0.0038776857966653345, "loss": 2.5093, "step": 16016 }, { "crossentropy": 2.4271013736724854, "epoch": 0.5806627030162413, "grad_norm": 0.02877676859498024, "grad_norm_var": 1.4496286499590528e-06, "learning_rate": 0.003877119544344459, "loss": 2.448, "step": 16017 }, { "crossentropy": 2.4451754093170166, "epoch": 0.5806989559164734, "grad_norm": 0.026165692135691643, "grad_norm_var": 1.5639443469427863e-06, "learning_rate": 0.0038765533071897884, "loss": 2.5536, "step": 16018 }, { "crossentropy": 2.510077476501465, "epoch": 0.5807352088167054, "grad_norm": 0.02584015019237995, "grad_norm_var": 1.7290818594334344e-06, "learning_rate": 0.0038759870852089697, "loss": 2.5227, "step": 16019 }, { "crossentropy": 2.6079599857330322, "epoch": 0.5807714617169374, "grad_norm": 0.025747520849108696, "grad_norm_var": 1.7381480329702859e-06, "learning_rate": 0.003875420878409651, "loss": 2.4515, "step": 16020 }, { "crossentropy": 2.44805908203125, "epoch": 0.5808077146171694, "grad_norm": 0.025789272040128708, "grad_norm_var": 1.8308547811276965e-06, "learning_rate": 0.003874854686799481, "loss": 2.5634, "step": 16021 }, { "crossentropy": 2.477417469024658, "epoch": 0.5808439675174014, "grad_norm": 0.02546802908182144, "grad_norm_var": 1.9875489578246214e-06, "learning_rate": 0.003874288510386106, "loss": 2.5424, "step": 16022 }, { "crossentropy": 2.4574460983276367, "epoch": 0.5808802204176334, "grad_norm": 0.026309560984373093, "grad_norm_var": 2.0033529950566966e-06, "learning_rate": 0.0038737223491771733, "loss": 2.5679, "step": 16023 }, { "crossentropy": 2.556215286254883, "epoch": 0.5809164733178654, "grad_norm": 0.026767875999212265, "grad_norm_var": 2.0022866769635413e-06, "learning_rate": 0.0038731562031803293, "loss": 2.5653, "step": 16024 }, { "crossentropy": 2.527256488800049, "epoch": 0.5809527262180975, "grad_norm": 0.02754652500152588, "grad_norm_var": 1.8068562808799698e-06, "learning_rate": 0.0038725900724032205, "loss": 2.5356, "step": 16025 }, { "crossentropy": 2.5810728073120117, "epoch": 0.5809889791183295, "grad_norm": 0.026208069175481796, "grad_norm_var": 1.835105164895443e-06, "learning_rate": 0.003872023956853492, "loss": 2.5082, "step": 16026 }, { "crossentropy": 2.6136345863342285, "epoch": 0.5810252320185615, "grad_norm": 0.027763262391090393, "grad_norm_var": 1.7330244823860266e-06, "learning_rate": 0.003871457856538793, "loss": 2.6454, "step": 16027 }, { "crossentropy": 2.4521732330322266, "epoch": 0.5810614849187935, "grad_norm": 0.03317161649465561, "grad_norm_var": 4.023239543176011e-06, "learning_rate": 0.003870891771466769, "loss": 2.5556, "step": 16028 }, { "crossentropy": 2.5321428775787354, "epoch": 0.5810977378190255, "grad_norm": 0.02672641910612583, "grad_norm_var": 4.0494725044218674e-06, "learning_rate": 0.0038703257016450645, "loss": 2.486, "step": 16029 }, { "crossentropy": 2.661261558532715, "epoch": 0.5811339907192575, "grad_norm": 0.02719871699810028, "grad_norm_var": 4.034266745462954e-06, "learning_rate": 0.0038697596470813255, "loss": 2.574, "step": 16030 }, { "crossentropy": 2.620757818222046, "epoch": 0.5811702436194895, "grad_norm": 0.027281738817691803, "grad_norm_var": 3.988129680701141e-06, "learning_rate": 0.003869193607783199, "loss": 2.5346, "step": 16031 }, { "crossentropy": 2.646512746810913, "epoch": 0.5812064965197216, "grad_norm": 0.0266688484698534, "grad_norm_var": 3.3922090072708295e-06, "learning_rate": 0.0038686275837583285, "loss": 2.584, "step": 16032 }, { "crossentropy": 2.5341062545776367, "epoch": 0.5812427494199536, "grad_norm": 0.025415664538741112, "grad_norm_var": 3.3420739174461664e-06, "learning_rate": 0.003868061575014359, "loss": 2.4805, "step": 16033 }, { "crossentropy": 2.4980711936950684, "epoch": 0.5812790023201856, "grad_norm": 0.027025938034057617, "grad_norm_var": 3.306473760214173e-06, "learning_rate": 0.003867495581558934, "loss": 2.4843, "step": 16034 }, { "crossentropy": 2.6680948734283447, "epoch": 0.5813152552204176, "grad_norm": 0.02708417735993862, "grad_norm_var": 3.2219151463693293e-06, "learning_rate": 0.0038669296033997016, "loss": 2.6294, "step": 16035 }, { "crossentropy": 2.4939045906066895, "epoch": 0.5813515081206496, "grad_norm": 0.026561565697193146, "grad_norm_var": 3.1262135552922547e-06, "learning_rate": 0.0038663636405443036, "loss": 2.5051, "step": 16036 }, { "crossentropy": 2.615497350692749, "epoch": 0.5813877610208816, "grad_norm": 0.029130280017852783, "grad_norm_var": 3.2570316215303647e-06, "learning_rate": 0.0038657976930003856, "loss": 2.5976, "step": 16037 }, { "crossentropy": 2.4074172973632812, "epoch": 0.5814240139211136, "grad_norm": 0.02595636621117592, "grad_norm_var": 3.1545732243091112e-06, "learning_rate": 0.003865231760775591, "loss": 2.5241, "step": 16038 }, { "crossentropy": 2.674616575241089, "epoch": 0.5814602668213457, "grad_norm": 0.026458444073796272, "grad_norm_var": 3.136276700586383e-06, "learning_rate": 0.0038646658438775626, "loss": 2.6132, "step": 16039 }, { "crossentropy": 2.523355007171631, "epoch": 0.5814965197215777, "grad_norm": 0.027630701661109924, "grad_norm_var": 3.1203985518116525e-06, "learning_rate": 0.003864099942313948, "loss": 2.5074, "step": 16040 }, { "crossentropy": 2.647136688232422, "epoch": 0.5815327726218097, "grad_norm": 0.02792375721037388, "grad_norm_var": 3.138459510601023e-06, "learning_rate": 0.0038635340560923847, "loss": 2.5603, "step": 16041 }, { "crossentropy": 2.5403013229370117, "epoch": 0.5815690255220418, "grad_norm": 0.02681908942759037, "grad_norm_var": 3.0656777947833757e-06, "learning_rate": 0.0038629681852205182, "loss": 2.6187, "step": 16042 }, { "crossentropy": 2.609604597091675, "epoch": 0.5816052784222738, "grad_norm": 0.02670067921280861, "grad_norm_var": 3.0884680972890898e-06, "learning_rate": 0.003862402329705993, "loss": 2.4524, "step": 16043 }, { "crossentropy": 2.629321575164795, "epoch": 0.5816415313225058, "grad_norm": 0.025982236489653587, "grad_norm_var": 7.476352915892006e-07, "learning_rate": 0.0038618364895564505, "loss": 2.5458, "step": 16044 }, { "crossentropy": 2.5209410190582275, "epoch": 0.5816777842227379, "grad_norm": 0.025867177173495293, "grad_norm_var": 8.148439934999955e-07, "learning_rate": 0.003861270664779534, "loss": 2.5221, "step": 16045 }, { "crossentropy": 2.5830788612365723, "epoch": 0.5817140371229699, "grad_norm": 0.02603595331311226, "grad_norm_var": 8.463029241095298e-07, "learning_rate": 0.003860704855382884, "loss": 2.5808, "step": 16046 }, { "crossentropy": 2.6127471923828125, "epoch": 0.5817502900232019, "grad_norm": 0.027734307572245598, "grad_norm_var": 8.89144097051273e-07, "learning_rate": 0.003860139061374144, "loss": 2.5957, "step": 16047 }, { "crossentropy": 2.5760107040405273, "epoch": 0.5817865429234339, "grad_norm": 0.026738060638308525, "grad_norm_var": 8.881206107583337e-07, "learning_rate": 0.003859573282760958, "loss": 2.5987, "step": 16048 }, { "crossentropy": 2.4744009971618652, "epoch": 0.5818227958236659, "grad_norm": 0.026379428803920746, "grad_norm_var": 7.661599854350462e-07, "learning_rate": 0.0038590075195509637, "loss": 2.5511, "step": 16049 }, { "crossentropy": 2.4372007846832275, "epoch": 0.5818590487238979, "grad_norm": 0.026467986404895782, "grad_norm_var": 7.745189915651467e-07, "learning_rate": 0.0038584417717518037, "loss": 2.4416, "step": 16050 }, { "crossentropy": 2.459073066711426, "epoch": 0.58189530162413, "grad_norm": 0.02992439828813076, "grad_norm_var": 1.3704513882245583e-06, "learning_rate": 0.0038578760393711206, "loss": 2.4676, "step": 16051 }, { "crossentropy": 2.4764482975006104, "epoch": 0.581931554524362, "grad_norm": 0.026431716978549957, "grad_norm_var": 1.3794317758469843e-06, "learning_rate": 0.003857310322416555, "loss": 2.5369, "step": 16052 }, { "crossentropy": 2.4902281761169434, "epoch": 0.581967807424594, "grad_norm": 0.02693250961601734, "grad_norm_var": 1.0603774394386668e-06, "learning_rate": 0.003856744620895747, "loss": 2.4934, "step": 16053 }, { "crossentropy": 2.580799102783203, "epoch": 0.582004060324826, "grad_norm": 0.027589276432991028, "grad_norm_var": 1.0272548484971245e-06, "learning_rate": 0.003856178934816339, "loss": 2.5818, "step": 16054 }, { "crossentropy": 2.455726146697998, "epoch": 0.582040313225058, "grad_norm": 0.02810564450919628, "grad_norm_var": 1.0831688634252983e-06, "learning_rate": 0.0038556132641859686, "loss": 2.5906, "step": 16055 }, { "crossentropy": 2.461357831954956, "epoch": 0.58207656612529, "grad_norm": 0.02689269557595253, "grad_norm_var": 1.0629151646043226e-06, "learning_rate": 0.003855047609012281, "loss": 2.5522, "step": 16056 }, { "crossentropy": 2.57955002784729, "epoch": 0.582112819025522, "grad_norm": 0.027869345620274544, "grad_norm_var": 1.0566364702999184e-06, "learning_rate": 0.003854481969302911, "loss": 2.5634, "step": 16057 }, { "crossentropy": 2.6164815425872803, "epoch": 0.582149071925754, "grad_norm": 0.0253695547580719, "grad_norm_var": 1.228606669272508e-06, "learning_rate": 0.003853916345065499, "loss": 2.543, "step": 16058 }, { "crossentropy": 2.6414437294006348, "epoch": 0.5821853248259861, "grad_norm": 0.02645205892622471, "grad_norm_var": 1.2403638317379084e-06, "learning_rate": 0.003853350736307687, "loss": 2.5855, "step": 16059 }, { "crossentropy": 2.5567893981933594, "epoch": 0.5822215777262181, "grad_norm": 0.02763443998992443, "grad_norm_var": 1.203670580966172e-06, "learning_rate": 0.0038527851430371135, "loss": 2.6169, "step": 16060 }, { "crossentropy": 2.5470588207244873, "epoch": 0.5822578306264501, "grad_norm": 0.026650870218873024, "grad_norm_var": 1.1209124517948342e-06, "learning_rate": 0.003852219565261418, "loss": 2.5242, "step": 16061 }, { "crossentropy": 2.515415668487549, "epoch": 0.5822940835266821, "grad_norm": 0.027486149221658707, "grad_norm_var": 1.0513450395401574e-06, "learning_rate": 0.0038516540029882385, "loss": 2.5258, "step": 16062 }, { "crossentropy": 2.4214181900024414, "epoch": 0.5823303364269141, "grad_norm": 0.028107881546020508, "grad_norm_var": 1.0883670994978057e-06, "learning_rate": 0.0038510884562252133, "loss": 2.4337, "step": 16063 }, { "crossentropy": 2.506467580795288, "epoch": 0.5823665893271461, "grad_norm": 0.026026401668787003, "grad_norm_var": 1.1628569751354385e-06, "learning_rate": 0.0038505229249799836, "loss": 2.5507, "step": 16064 }, { "crossentropy": 2.493530511856079, "epoch": 0.5824028422273781, "grad_norm": 0.029078958556056023, "grad_norm_var": 1.3427575846260026e-06, "learning_rate": 0.003849957409260188, "loss": 2.5923, "step": 16065 }, { "crossentropy": 2.524559736251831, "epoch": 0.5824390951276102, "grad_norm": 0.02807578258216381, "grad_norm_var": 1.3230133852770806e-06, "learning_rate": 0.00384939190907346, "loss": 2.5973, "step": 16066 }, { "crossentropy": 2.619180917739868, "epoch": 0.5824753480278422, "grad_norm": 0.02628723531961441, "grad_norm_var": 9.325043560779248e-07, "learning_rate": 0.003848826424427441, "loss": 2.5765, "step": 16067 }, { "crossentropy": 2.410156488418579, "epoch": 0.5825116009280742, "grad_norm": 0.026163626462221146, "grad_norm_var": 9.639909811932912e-07, "learning_rate": 0.003848260955329768, "loss": 2.5336, "step": 16068 }, { "crossentropy": 2.5389373302459717, "epoch": 0.5825478538283063, "grad_norm": 0.02897561900317669, "grad_norm_var": 1.1601472447437046e-06, "learning_rate": 0.0038476955017880793, "loss": 2.5808, "step": 16069 }, { "crossentropy": 2.868663787841797, "epoch": 0.5825841067285383, "grad_norm": 0.02828582562506199, "grad_norm_var": 1.2175371021681669e-06, "learning_rate": 0.003847130063810011, "loss": 2.8375, "step": 16070 }, { "crossentropy": 2.459364175796509, "epoch": 0.5826203596287703, "grad_norm": 0.027421222999691963, "grad_norm_var": 1.1770703311208964e-06, "learning_rate": 0.0038465646414031994, "loss": 2.5713, "step": 16071 }, { "crossentropy": 2.431975841522217, "epoch": 0.5826566125290024, "grad_norm": 0.030603954568505287, "grad_norm_var": 1.8370529068068482e-06, "learning_rate": 0.0038459992345752847, "loss": 2.4597, "step": 16072 }, { "crossentropy": 2.4589362144470215, "epoch": 0.5826928654292344, "grad_norm": 0.026238055899739265, "grad_norm_var": 1.929683926981535e-06, "learning_rate": 0.0038454338433339032, "loss": 2.4952, "step": 16073 }, { "crossentropy": 2.5364155769348145, "epoch": 0.5827291183294664, "grad_norm": 0.02787758596241474, "grad_norm_var": 1.634268633345888e-06, "learning_rate": 0.0038448684676866867, "loss": 2.4785, "step": 16074 }, { "crossentropy": 2.658607006072998, "epoch": 0.5827653712296984, "grad_norm": 0.026718391105532646, "grad_norm_var": 1.5984575321992585e-06, "learning_rate": 0.0038443031076412755, "loss": 2.583, "step": 16075 }, { "crossentropy": 2.514110565185547, "epoch": 0.5828016241299304, "grad_norm": 0.02670191042125225, "grad_norm_var": 1.648774750170875e-06, "learning_rate": 0.0038437377632053044, "loss": 2.5062, "step": 16076 }, { "crossentropy": 2.6157186031341553, "epoch": 0.5828378770301624, "grad_norm": 0.026833105832338333, "grad_norm_var": 1.6291559010027378e-06, "learning_rate": 0.00384317243438641, "loss": 2.6052, "step": 16077 }, { "crossentropy": 2.590567111968994, "epoch": 0.5828741299303944, "grad_norm": 0.02691926248371601, "grad_norm_var": 1.654453078838676e-06, "learning_rate": 0.003842607121192227, "loss": 2.6209, "step": 16078 }, { "crossentropy": 2.4327480792999268, "epoch": 0.5829103828306265, "grad_norm": 0.02754564955830574, "grad_norm_var": 1.6301152518071917e-06, "learning_rate": 0.0038420418236303905, "loss": 2.5299, "step": 16079 }, { "crossentropy": 2.515484571456909, "epoch": 0.5829466357308585, "grad_norm": 0.025869550183415413, "grad_norm_var": 1.6621476549310658e-06, "learning_rate": 0.0038414765417085373, "loss": 2.5435, "step": 16080 }, { "crossentropy": 2.5692572593688965, "epoch": 0.5829828886310905, "grad_norm": 0.026355838403105736, "grad_norm_var": 1.5431427841725945e-06, "learning_rate": 0.003840911275434302, "loss": 2.5634, "step": 16081 }, { "crossentropy": 2.4811830520629883, "epoch": 0.5830191415313225, "grad_norm": 0.027290519326925278, "grad_norm_var": 1.5009320671365329e-06, "learning_rate": 0.0038403460248153188, "loss": 2.4455, "step": 16082 }, { "crossentropy": 2.6564385890960693, "epoch": 0.5830553944315545, "grad_norm": 0.027918510138988495, "grad_norm_var": 1.4566561841259136e-06, "learning_rate": 0.0038397807898592224, "loss": 2.6487, "step": 16083 }, { "crossentropy": 2.4299988746643066, "epoch": 0.5830916473317865, "grad_norm": 0.025681210681796074, "grad_norm_var": 1.5479884379052585e-06, "learning_rate": 0.0038392155705736465, "loss": 2.4422, "step": 16084 }, { "crossentropy": 2.5644357204437256, "epoch": 0.5831279002320185, "grad_norm": 0.027383575215935707, "grad_norm_var": 1.3565005494583482e-06, "learning_rate": 0.0038386503669662264, "loss": 2.5977, "step": 16085 }, { "crossentropy": 2.5864155292510986, "epoch": 0.5831641531322506, "grad_norm": 0.025185244157910347, "grad_norm_var": 1.5199352839125857e-06, "learning_rate": 0.0038380851790445953, "loss": 2.5389, "step": 16086 }, { "crossentropy": 2.49881911277771, "epoch": 0.5832004060324826, "grad_norm": 0.02679331786930561, "grad_norm_var": 1.5121561014393969e-06, "learning_rate": 0.003837520006816386, "loss": 2.4852, "step": 16087 }, { "crossentropy": 2.6162872314453125, "epoch": 0.5832366589327146, "grad_norm": 0.028420181944966316, "grad_norm_var": 7.593132898330834e-07, "learning_rate": 0.0038369548502892345, "loss": 2.5998, "step": 16088 }, { "crossentropy": 2.549659013748169, "epoch": 0.5832729118329466, "grad_norm": 0.02584671415388584, "grad_norm_var": 8.012458118942037e-07, "learning_rate": 0.0038363897094707724, "loss": 2.5921, "step": 16089 }, { "crossentropy": 2.6172778606414795, "epoch": 0.5833091647331786, "grad_norm": 0.027252227067947388, "grad_norm_var": 7.386545977260215e-07, "learning_rate": 0.003835824584368634, "loss": 2.5951, "step": 16090 }, { "crossentropy": 2.4964630603790283, "epoch": 0.5833454176334106, "grad_norm": 0.026996226981282234, "grad_norm_var": 7.406522798373046e-07, "learning_rate": 0.0038352594749904503, "loss": 2.551, "step": 16091 }, { "crossentropy": 2.4516189098358154, "epoch": 0.5833816705336426, "grad_norm": 0.02790805883705616, "grad_norm_var": 8.138618243724716e-07, "learning_rate": 0.003834694381343855, "loss": 2.5283, "step": 16092 }, { "crossentropy": 2.5168886184692383, "epoch": 0.5834179234338747, "grad_norm": 0.027884405106306076, "grad_norm_var": 8.753211744750294e-07, "learning_rate": 0.003834129303436481, "loss": 2.5007, "step": 16093 }, { "crossentropy": 2.5444345474243164, "epoch": 0.5834541763341067, "grad_norm": 0.02649141289293766, "grad_norm_var": 8.886956254054578e-07, "learning_rate": 0.003833564241275959, "loss": 2.6358, "step": 16094 }, { "crossentropy": 2.2550079822540283, "epoch": 0.5834904292343387, "grad_norm": 0.026860902085900307, "grad_norm_var": 8.614646771124464e-07, "learning_rate": 0.0038329991948699215, "loss": 2.4281, "step": 16095 }, { "crossentropy": 2.6728627681732178, "epoch": 0.5835266821345708, "grad_norm": 0.02682105451822281, "grad_norm_var": 7.893976626582259e-07, "learning_rate": 0.0038324341642260026, "loss": 2.6189, "step": 16096 }, { "crossentropy": 2.5171639919281006, "epoch": 0.5835629350348028, "grad_norm": 0.028105823323130608, "grad_norm_var": 8.437770144183263e-07, "learning_rate": 0.0038318691493518316, "loss": 2.6006, "step": 16097 }, { "crossentropy": 2.5241363048553467, "epoch": 0.5835991879350348, "grad_norm": 0.02698255516588688, "grad_norm_var": 8.39929531615211e-07, "learning_rate": 0.0038313041502550417, "loss": 2.5379, "step": 16098 }, { "crossentropy": 2.623544931411743, "epoch": 0.5836354408352669, "grad_norm": 0.02589322440326214, "grad_norm_var": 8.572271753267408e-07, "learning_rate": 0.0038307391669432614, "loss": 2.5113, "step": 16099 }, { "crossentropy": 2.572796106338501, "epoch": 0.5836716937354989, "grad_norm": 0.027089480310678482, "grad_norm_var": 7.510818882746258e-07, "learning_rate": 0.003830174199424123, "loss": 2.5938, "step": 16100 }, { "crossentropy": 2.574310779571533, "epoch": 0.5837079466357309, "grad_norm": 0.026094242930412292, "grad_norm_var": 7.881200288554207e-07, "learning_rate": 0.0038296092477052572, "loss": 2.5517, "step": 16101 }, { "crossentropy": 2.545821189880371, "epoch": 0.5837441995359629, "grad_norm": 0.02624710276722908, "grad_norm_var": 6.138227901975509e-07, "learning_rate": 0.0038290443117942954, "loss": 2.4986, "step": 16102 }, { "crossentropy": 2.5762240886688232, "epoch": 0.5837804524361949, "grad_norm": 0.02650669403374195, "grad_norm_var": 6.261082598021476e-07, "learning_rate": 0.0038284793916988653, "loss": 2.5102, "step": 16103 }, { "crossentropy": 2.461611270904541, "epoch": 0.5838167053364269, "grad_norm": 0.027015509083867073, "grad_norm_var": 4.764221274963324e-07, "learning_rate": 0.0038279144874266004, "loss": 2.5339, "step": 16104 }, { "crossentropy": 2.472573757171631, "epoch": 0.583852958236659, "grad_norm": 0.028270667418837547, "grad_norm_var": 5.113965873798064e-07, "learning_rate": 0.0038273495989851287, "loss": 2.5367, "step": 16105 }, { "crossentropy": 2.594985008239746, "epoch": 0.583889211136891, "grad_norm": 0.028882455080747604, "grad_norm_var": 7.266241294568802e-07, "learning_rate": 0.003826784726382081, "loss": 2.587, "step": 16106 }, { "crossentropy": 2.562769889831543, "epoch": 0.583925464037123, "grad_norm": 0.029148781672120094, "grad_norm_var": 9.783649869423797e-07, "learning_rate": 0.003826219869625086, "loss": 2.5097, "step": 16107 }, { "crossentropy": 2.478583335876465, "epoch": 0.583961716937355, "grad_norm": 0.028150184080004692, "grad_norm_var": 1.0028650573288994e-06, "learning_rate": 0.0038256550287217726, "loss": 2.4658, "step": 16108 }, { "crossentropy": 2.6112754344940186, "epoch": 0.583997969837587, "grad_norm": 0.027130454778671265, "grad_norm_var": 9.774106929568798e-07, "learning_rate": 0.00382509020367977, "loss": 2.4762, "step": 16109 }, { "crossentropy": 2.654292106628418, "epoch": 0.584034222737819, "grad_norm": 0.02712092362344265, "grad_norm_var": 9.401299774992762e-07, "learning_rate": 0.0038245253945067073, "loss": 2.5244, "step": 16110 }, { "crossentropy": 2.369053602218628, "epoch": 0.584070475638051, "grad_norm": 0.025427360087633133, "grad_norm_var": 1.1467653429734903e-06, "learning_rate": 0.0038239606012102113, "loss": 2.4713, "step": 16111 }, { "crossentropy": 2.548177719116211, "epoch": 0.584106728538283, "grad_norm": 0.027144161984324455, "grad_norm_var": 1.1378089790551118e-06, "learning_rate": 0.003823395823797914, "loss": 2.5186, "step": 16112 }, { "crossentropy": 2.5954909324645996, "epoch": 0.5841429814385151, "grad_norm": 0.026460278779268265, "grad_norm_var": 1.1084364429988682e-06, "learning_rate": 0.003822831062277441, "loss": 2.6158, "step": 16113 }, { "crossentropy": 2.603161573410034, "epoch": 0.5841792343387471, "grad_norm": 0.02613545022904873, "grad_norm_var": 1.166297102644107e-06, "learning_rate": 0.003822266316656421, "loss": 2.573, "step": 16114 }, { "crossentropy": 2.4920620918273926, "epoch": 0.5842154872389791, "grad_norm": 0.026674015447497368, "grad_norm_var": 1.0845128325337548e-06, "learning_rate": 0.003821701586942483, "loss": 2.4981, "step": 16115 }, { "crossentropy": 2.6182639598846436, "epoch": 0.5842517401392111, "grad_norm": 0.026593133807182312, "grad_norm_var": 1.1001836337130259e-06, "learning_rate": 0.0038211368731432523, "loss": 2.5976, "step": 16116 }, { "crossentropy": 2.431830406188965, "epoch": 0.5842879930394431, "grad_norm": 0.027143388986587524, "grad_norm_var": 1.0335197261626076e-06, "learning_rate": 0.003820572175266357, "loss": 2.4716, "step": 16117 }, { "crossentropy": 2.567948579788208, "epoch": 0.5843242459396751, "grad_norm": 0.02992899902164936, "grad_norm_var": 1.448263994154042e-06, "learning_rate": 0.003820007493319424, "loss": 2.5545, "step": 16118 }, { "crossentropy": 2.440685272216797, "epoch": 0.5843604988399071, "grad_norm": 0.027720492333173752, "grad_norm_var": 1.4025253803172463e-06, "learning_rate": 0.0038194428273100795, "loss": 2.4788, "step": 16119 }, { "crossentropy": 2.545077323913574, "epoch": 0.5843967517401392, "grad_norm": 0.029171975329518318, "grad_norm_var": 1.5728032934669468e-06, "learning_rate": 0.0038188781772459522, "loss": 2.5886, "step": 16120 }, { "crossentropy": 2.656953811645508, "epoch": 0.5844330046403712, "grad_norm": 0.027290690690279007, "grad_norm_var": 1.5411326458313983e-06, "learning_rate": 0.0038183135431346677, "loss": 2.6545, "step": 16121 }, { "crossentropy": 2.562735080718994, "epoch": 0.5844692575406032, "grad_norm": 0.027589945122599602, "grad_norm_var": 1.4086211786708356e-06, "learning_rate": 0.0038177489249838522, "loss": 2.6754, "step": 16122 }, { "crossentropy": 2.5767762660980225, "epoch": 0.5845055104408353, "grad_norm": 0.025974292308092117, "grad_norm_var": 1.3096405857318874e-06, "learning_rate": 0.0038171843228011317, "loss": 2.435, "step": 16123 }, { "crossentropy": 2.6192269325256348, "epoch": 0.5845417633410673, "grad_norm": 0.02622111327946186, "grad_norm_var": 1.305152800774173e-06, "learning_rate": 0.0038166197365941323, "loss": 2.536, "step": 16124 }, { "crossentropy": 2.5067384243011475, "epoch": 0.5845780162412993, "grad_norm": 0.026561247184872627, "grad_norm_var": 1.3236921591725046e-06, "learning_rate": 0.003816055166370479, "loss": 2.5936, "step": 16125 }, { "crossentropy": 2.473810911178589, "epoch": 0.5846142691415314, "grad_norm": 0.0280764140188694, "grad_norm_var": 1.3869415444102417e-06, "learning_rate": 0.0038154906121377957, "loss": 2.4876, "step": 16126 }, { "crossentropy": 2.7949252128601074, "epoch": 0.5846505220417634, "grad_norm": 0.026119479909539223, "grad_norm_var": 1.2595667046190791e-06, "learning_rate": 0.0038149260739037106, "loss": 2.6904, "step": 16127 }, { "crossentropy": 2.5263359546661377, "epoch": 0.5846867749419954, "grad_norm": 0.026612266898155212, "grad_norm_var": 1.2794582503463289e-06, "learning_rate": 0.0038143615516758474, "loss": 2.5215, "step": 16128 }, { "crossentropy": 2.5232818126678467, "epoch": 0.5847230278422274, "grad_norm": 0.02612552046775818, "grad_norm_var": 1.3168937428850145e-06, "learning_rate": 0.0038137970454618303, "loss": 2.485, "step": 16129 }, { "crossentropy": 2.4186151027679443, "epoch": 0.5847592807424594, "grad_norm": 0.026386752724647522, "grad_norm_var": 1.2878129078300875e-06, "learning_rate": 0.0038132325552692843, "loss": 2.494, "step": 16130 }, { "crossentropy": 2.60379695892334, "epoch": 0.5847955336426914, "grad_norm": 0.026008551940321922, "grad_norm_var": 1.3565578206188907e-06, "learning_rate": 0.003812668081105834, "loss": 2.536, "step": 16131 }, { "crossentropy": 2.546637773513794, "epoch": 0.5848317865429234, "grad_norm": 0.026826294139027596, "grad_norm_var": 1.3443452309747083e-06, "learning_rate": 0.003812103622979104, "loss": 2.552, "step": 16132 }, { "crossentropy": 2.568620204925537, "epoch": 0.5848680394431555, "grad_norm": 0.02896500751376152, "grad_norm_var": 1.559887307872078e-06, "learning_rate": 0.003811539180896716, "loss": 2.5312, "step": 16133 }, { "crossentropy": 2.446571111679077, "epoch": 0.5849042923433875, "grad_norm": 0.027836348861455917, "grad_norm_var": 1.0787510204320147e-06, "learning_rate": 0.0038109747548662936, "loss": 2.5866, "step": 16134 }, { "crossentropy": 2.6323695182800293, "epoch": 0.5849405452436195, "grad_norm": 0.026719871908426285, "grad_norm_var": 1.057597642626718e-06, "learning_rate": 0.003810410344895463, "loss": 2.5392, "step": 16135 }, { "crossentropy": 2.6948561668395996, "epoch": 0.5849767981438515, "grad_norm": 0.026311727240681648, "grad_norm_var": 7.521714775609999e-07, "learning_rate": 0.003809845950991846, "loss": 2.6659, "step": 16136 }, { "crossentropy": 2.5831944942474365, "epoch": 0.5850130510440835, "grad_norm": 0.02723512053489685, "grad_norm_var": 7.491110667136863e-07, "learning_rate": 0.003809281573163066, "loss": 2.5491, "step": 16137 }, { "crossentropy": 2.5108609199523926, "epoch": 0.5850493039443155, "grad_norm": 0.026748182252049446, "grad_norm_var": 7.10137821935625e-07, "learning_rate": 0.0038087172114167444, "loss": 2.5154, "step": 16138 }, { "crossentropy": 2.618349552154541, "epoch": 0.5850855568445475, "grad_norm": 0.026495689526200294, "grad_norm_var": 6.700378783049017e-07, "learning_rate": 0.003808152865760504, "loss": 2.575, "step": 16139 }, { "crossentropy": 2.5521531105041504, "epoch": 0.5851218097447796, "grad_norm": 0.027254903689026833, "grad_norm_var": 6.531668680285925e-07, "learning_rate": 0.0038075885362019715, "loss": 2.5739, "step": 16140 }, { "crossentropy": 2.6007497310638428, "epoch": 0.5851580626450116, "grad_norm": 0.02561887912452221, "grad_norm_var": 7.503186093380428e-07, "learning_rate": 0.003807024222748763, "loss": 2.6552, "step": 16141 }, { "crossentropy": 2.493422269821167, "epoch": 0.5851943155452436, "grad_norm": 0.028441710397601128, "grad_norm_var": 8.191810448918465e-07, "learning_rate": 0.0038064599254085014, "loss": 2.4438, "step": 16142 }, { "crossentropy": 2.5559256076812744, "epoch": 0.5852305684454756, "grad_norm": 0.028980493545532227, "grad_norm_var": 1.0495635407603103e-06, "learning_rate": 0.003805895644188811, "loss": 2.5804, "step": 16143 }, { "crossentropy": 2.547790765762329, "epoch": 0.5852668213457076, "grad_norm": 0.027353385463356972, "grad_norm_var": 1.0420741599690645e-06, "learning_rate": 0.003805331379097312, "loss": 2.5631, "step": 16144 }, { "crossentropy": 2.614623546600342, "epoch": 0.5853030742459396, "grad_norm": 0.03089689277112484, "grad_norm_var": 1.856594347750031e-06, "learning_rate": 0.0038047671301416255, "loss": 2.6123, "step": 16145 }, { "crossentropy": 2.6650874614715576, "epoch": 0.5853393271461717, "grad_norm": 0.02947753295302391, "grad_norm_var": 2.0443356452857013e-06, "learning_rate": 0.003804202897329373, "loss": 2.6007, "step": 16146 }, { "crossentropy": 2.5131964683532715, "epoch": 0.5853755800464037, "grad_norm": 0.029123792424798012, "grad_norm_var": 2.000995973607672e-06, "learning_rate": 0.003803638680668174, "loss": 2.4608, "step": 16147 }, { "crossentropy": 2.541480779647827, "epoch": 0.5854118329466357, "grad_norm": 0.026457486674189568, "grad_norm_var": 2.0557982491749167e-06, "learning_rate": 0.003803074480165651, "loss": 2.5288, "step": 16148 }, { "crossentropy": 2.611628770828247, "epoch": 0.5854480858468677, "grad_norm": 0.026580937206745148, "grad_norm_var": 2.0231649351410788e-06, "learning_rate": 0.003802510295829426, "loss": 2.5857, "step": 16149 }, { "crossentropy": 2.542668104171753, "epoch": 0.5854843387470998, "grad_norm": 0.02665504440665245, "grad_norm_var": 2.0724957780109595e-06, "learning_rate": 0.0038019461276671135, "loss": 2.5785, "step": 16150 }, { "crossentropy": 2.6061525344848633, "epoch": 0.5855205916473318, "grad_norm": 0.026513565331697464, "grad_norm_var": 2.0972199001355793e-06, "learning_rate": 0.0038013819756863377, "loss": 2.5344, "step": 16151 }, { "crossentropy": 2.559093952178955, "epoch": 0.5855568445475638, "grad_norm": 0.027562597766518593, "grad_norm_var": 1.9953137847151154e-06, "learning_rate": 0.003800817839894717, "loss": 2.5133, "step": 16152 }, { "crossentropy": 2.515745162963867, "epoch": 0.5855930974477959, "grad_norm": 0.027659960091114044, "grad_norm_var": 1.986647097285595e-06, "learning_rate": 0.003800253720299872, "loss": 2.505, "step": 16153 }, { "crossentropy": 2.4810681343078613, "epoch": 0.5856293503480279, "grad_norm": 0.026686208322644234, "grad_norm_var": 1.9940400406921283e-06, "learning_rate": 0.0037996896169094196, "loss": 2.5549, "step": 16154 }, { "crossentropy": 2.55824613571167, "epoch": 0.5856656032482599, "grad_norm": 0.026466475799679756, "grad_norm_var": 1.9984335782848956e-06, "learning_rate": 0.003799125529730981, "loss": 2.5481, "step": 16155 }, { "crossentropy": 2.6579151153564453, "epoch": 0.5857018561484919, "grad_norm": 0.027938958257436752, "grad_norm_var": 1.995463654587979e-06, "learning_rate": 0.0037985614587721745, "loss": 2.5833, "step": 16156 }, { "crossentropy": 2.5543911457061768, "epoch": 0.5857381090487239, "grad_norm": 0.027228249236941338, "grad_norm_var": 1.7213131061523476e-06, "learning_rate": 0.0037979974040406217, "loss": 2.5653, "step": 16157 }, { "crossentropy": 2.327502727508545, "epoch": 0.5857743619489559, "grad_norm": 0.026202740147709846, "grad_norm_var": 1.8285635344128843e-06, "learning_rate": 0.0037974333655439343, "loss": 2.4565, "step": 16158 }, { "crossentropy": 2.6802408695220947, "epoch": 0.585810614849188, "grad_norm": 0.027195937931537628, "grad_norm_var": 1.7018688714186818e-06, "learning_rate": 0.0037968693432897356, "loss": 2.624, "step": 16159 }, { "crossentropy": 2.5794708728790283, "epoch": 0.58584686774942, "grad_norm": 0.026995809748768806, "grad_norm_var": 1.7168495513662641e-06, "learning_rate": 0.003796305337285642, "loss": 2.5561, "step": 16160 }, { "crossentropy": 2.5048599243164062, "epoch": 0.585883120649652, "grad_norm": 0.028760354965925217, "grad_norm_var": 1.0280998818542586e-06, "learning_rate": 0.0037957413475392717, "loss": 2.5555, "step": 16161 }, { "crossentropy": 2.488523483276367, "epoch": 0.585919373549884, "grad_norm": 0.02848662994801998, "grad_norm_var": 8.075983364621974e-07, "learning_rate": 0.0037951773740582417, "loss": 2.5147, "step": 16162 }, { "crossentropy": 2.431703567504883, "epoch": 0.585955626450116, "grad_norm": 0.02737630531191826, "grad_norm_var": 5.693608380559952e-07, "learning_rate": 0.003794613416850169, "loss": 2.4572, "step": 16163 }, { "crossentropy": 2.7033355236053467, "epoch": 0.585991879350348, "grad_norm": 0.02767244540154934, "grad_norm_var": 5.457168917095967e-07, "learning_rate": 0.0037940494759226722, "loss": 2.6697, "step": 16164 }, { "crossentropy": 2.62024188041687, "epoch": 0.58602813225058, "grad_norm": 0.028127914294600487, "grad_norm_var": 5.575139274579712e-07, "learning_rate": 0.0037934855512833676, "loss": 2.5665, "step": 16165 }, { "crossentropy": 2.47536039352417, "epoch": 0.586064385150812, "grad_norm": 0.02649684064090252, "grad_norm_var": 5.736441391200608e-07, "learning_rate": 0.0037929216429398722, "loss": 2.5359, "step": 16166 }, { "crossentropy": 2.586808443069458, "epoch": 0.5861006380510441, "grad_norm": 0.025961263105273247, "grad_norm_var": 6.532502876671938e-07, "learning_rate": 0.0037923577508998008, "loss": 2.5456, "step": 16167 }, { "crossentropy": 2.621777296066284, "epoch": 0.5861368909512761, "grad_norm": 0.026380112394690514, "grad_norm_var": 6.994240453949181e-07, "learning_rate": 0.0037917938751707713, "loss": 2.5696, "step": 16168 }, { "crossentropy": 2.5390982627868652, "epoch": 0.5861731438515081, "grad_norm": 0.02811640128493309, "grad_norm_var": 7.38778651652759e-07, "learning_rate": 0.0037912300157603987, "loss": 2.6548, "step": 16169 }, { "crossentropy": 2.5529778003692627, "epoch": 0.5862093967517401, "grad_norm": 0.025452466681599617, "grad_norm_var": 9.276066749016083e-07, "learning_rate": 0.003790666172676299, "loss": 2.5758, "step": 16170 }, { "crossentropy": 2.646758794784546, "epoch": 0.5862456496519721, "grad_norm": 0.03773084655404091, "grad_norm_var": 7.78831138930665e-06, "learning_rate": 0.0037901023459260864, "loss": 2.6127, "step": 16171 }, { "crossentropy": 2.466268539428711, "epoch": 0.5862819025522041, "grad_norm": 0.027906125411391258, "grad_norm_var": 7.788132502317214e-06, "learning_rate": 0.0037895385355173794, "loss": 2.4012, "step": 16172 }, { "crossentropy": 2.607668399810791, "epoch": 0.5863181554524362, "grad_norm": 0.026378197595477104, "grad_norm_var": 7.907237789685666e-06, "learning_rate": 0.0037889747414577906, "loss": 2.5144, "step": 16173 }, { "crossentropy": 2.4756624698638916, "epoch": 0.5863544083526682, "grad_norm": 0.025142448022961617, "grad_norm_var": 8.207200970161915e-06, "learning_rate": 0.003788410963754938, "loss": 2.4584, "step": 16174 }, { "crossentropy": 2.4956979751586914, "epoch": 0.5863906612529002, "grad_norm": 0.02760959230363369, "grad_norm_var": 8.186715824611406e-06, "learning_rate": 0.0037878472024164317, "loss": 2.5835, "step": 16175 }, { "crossentropy": 2.532472848892212, "epoch": 0.5864269141531323, "grad_norm": 0.026235951110720634, "grad_norm_var": 8.302972533856766e-06, "learning_rate": 0.0037872834574498894, "loss": 2.521, "step": 16176 }, { "crossentropy": 2.580200672149658, "epoch": 0.5864631670533643, "grad_norm": 0.02582249976694584, "grad_norm_var": 8.442572792092733e-06, "learning_rate": 0.003786719728862924, "loss": 2.5498, "step": 16177 }, { "crossentropy": 2.5530571937561035, "epoch": 0.5864994199535963, "grad_norm": 0.027056358754634857, "grad_norm_var": 8.39295423637453e-06, "learning_rate": 0.0037861560166631497, "loss": 2.5022, "step": 16178 }, { "crossentropy": 2.6108548641204834, "epoch": 0.5865356728538283, "grad_norm": 0.027155950665473938, "grad_norm_var": 8.398642220551419e-06, "learning_rate": 0.0037855923208581794, "loss": 2.5632, "step": 16179 }, { "crossentropy": 2.4337046146392822, "epoch": 0.5865719257540604, "grad_norm": 0.024900825694203377, "grad_norm_var": 8.79760383624024e-06, "learning_rate": 0.003785028641455629, "loss": 2.5337, "step": 16180 }, { "crossentropy": 2.472346782684326, "epoch": 0.5866081786542924, "grad_norm": 0.02731148898601532, "grad_norm_var": 8.74691984952453e-06, "learning_rate": 0.0037844649784631104, "loss": 2.4717, "step": 16181 }, { "crossentropy": 2.6018145084381104, "epoch": 0.5866444315545244, "grad_norm": 0.027625329792499542, "grad_norm_var": 8.716410674341052e-06, "learning_rate": 0.0037839013318882384, "loss": 2.6101, "step": 16182 }, { "crossentropy": 2.495811700820923, "epoch": 0.5866806844547564, "grad_norm": 0.02637694962322712, "grad_norm_var": 8.653060050544762e-06, "learning_rate": 0.003783337701738623, "loss": 2.6392, "step": 16183 }, { "crossentropy": 2.5503640174865723, "epoch": 0.5867169373549884, "grad_norm": 0.026575149968266487, "grad_norm_var": 8.630863207015743e-06, "learning_rate": 0.003782774088021878, "loss": 2.5294, "step": 16184 }, { "crossentropy": 2.6674232482910156, "epoch": 0.5867531902552204, "grad_norm": 0.026729371398687363, "grad_norm_var": 8.60701653161011e-06, "learning_rate": 0.0037822104907456167, "loss": 2.6162, "step": 16185 }, { "crossentropy": 2.766094207763672, "epoch": 0.5867894431554525, "grad_norm": 0.027008796110749245, "grad_norm_var": 8.385270523822371e-06, "learning_rate": 0.0037816469099174505, "loss": 2.7262, "step": 16186 }, { "crossentropy": 2.5534145832061768, "epoch": 0.5868256960556845, "grad_norm": 0.02641858346760273, "grad_norm_var": 7.225612741456559e-07, "learning_rate": 0.0037810833455449904, "loss": 2.5589, "step": 16187 }, { "crossentropy": 2.462928056716919, "epoch": 0.5868619489559165, "grad_norm": 0.0266717541962862, "grad_norm_var": 6.095483784944774e-07, "learning_rate": 0.0037805197976358514, "loss": 2.5668, "step": 16188 }, { "crossentropy": 2.5639936923980713, "epoch": 0.5868982018561485, "grad_norm": 0.027454588562250137, "grad_norm_var": 6.55338465717301e-07, "learning_rate": 0.003779956266197642, "loss": 2.4998, "step": 16189 }, { "crossentropy": 2.4845104217529297, "epoch": 0.5869344547563805, "grad_norm": 0.026603810489177704, "grad_norm_var": 4.98774769998655e-07, "learning_rate": 0.0037793927512379754, "loss": 2.5235, "step": 16190 }, { "crossentropy": 2.465763568878174, "epoch": 0.5869707076566125, "grad_norm": 0.028542201966047287, "grad_norm_var": 6.634662410650212e-07, "learning_rate": 0.0037788292527644635, "loss": 2.5211, "step": 16191 }, { "crossentropy": 2.55649471282959, "epoch": 0.5870069605568445, "grad_norm": 0.02771064080297947, "grad_norm_var": 6.922937198532962e-07, "learning_rate": 0.003778265770784714, "loss": 2.5866, "step": 16192 }, { "crossentropy": 2.3611340522766113, "epoch": 0.5870432134570766, "grad_norm": 0.026101088151335716, "grad_norm_var": 6.581320720573658e-07, "learning_rate": 0.0037777023053063396, "loss": 2.4365, "step": 16193 }, { "crossentropy": 2.4829070568084717, "epoch": 0.5870794663573086, "grad_norm": 0.026568587869405746, "grad_norm_var": 6.621945133131538e-07, "learning_rate": 0.0037771388563369503, "loss": 2.5495, "step": 16194 }, { "crossentropy": 2.5076401233673096, "epoch": 0.5871157192575406, "grad_norm": 0.02947639301419258, "grad_norm_var": 1.0903820759198639e-06, "learning_rate": 0.0037765754238841555, "loss": 2.5708, "step": 16195 }, { "crossentropy": 2.460340976715088, "epoch": 0.5871519721577726, "grad_norm": 0.02671147510409355, "grad_norm_var": 7.873626785456402e-07, "learning_rate": 0.0037760120079555672, "loss": 2.5747, "step": 16196 }, { "crossentropy": 2.453361749649048, "epoch": 0.5871882250580046, "grad_norm": 0.02865833230316639, "grad_norm_var": 9.35503536799292e-07, "learning_rate": 0.0037754486085587948, "loss": 2.4824, "step": 16197 }, { "crossentropy": 2.495837926864624, "epoch": 0.5872244779582366, "grad_norm": 0.02951982244849205, "grad_norm_var": 1.2667385080045168e-06, "learning_rate": 0.0037748852257014466, "loss": 2.4862, "step": 16198 }, { "crossentropy": 2.550191879272461, "epoch": 0.5872607308584686, "grad_norm": 0.028781961649656296, "grad_norm_var": 1.3256861282662785e-06, "learning_rate": 0.003774321859391133, "loss": 2.5979, "step": 16199 }, { "crossentropy": 2.5211946964263916, "epoch": 0.5872969837587007, "grad_norm": 0.02801997773349285, "grad_norm_var": 1.2836181646050931e-06, "learning_rate": 0.003773758509635463, "loss": 2.5786, "step": 16200 }, { "crossentropy": 2.5366764068603516, "epoch": 0.5873332366589327, "grad_norm": 0.02694045938551426, "grad_norm_var": 1.2629943704984062e-06, "learning_rate": 0.0037731951764420446, "loss": 2.5832, "step": 16201 }, { "crossentropy": 2.631450891494751, "epoch": 0.5873694895591647, "grad_norm": 0.026708019897341728, "grad_norm_var": 1.2913263796439368e-06, "learning_rate": 0.003772631859818487, "loss": 2.5811, "step": 16202 }, { "crossentropy": 2.4083659648895264, "epoch": 0.5874057424593968, "grad_norm": 0.02735961228609085, "grad_norm_var": 1.2040252115862962e-06, "learning_rate": 0.003772068559772398, "loss": 2.4993, "step": 16203 }, { "crossentropy": 2.517136335372925, "epoch": 0.5874419953596288, "grad_norm": 0.02843586727976799, "grad_norm_var": 1.1768312526798386e-06, "learning_rate": 0.003771505276311388, "loss": 2.5259, "step": 16204 }, { "crossentropy": 2.4057555198669434, "epoch": 0.5874782482598608, "grad_norm": 0.026066796854138374, "grad_norm_var": 1.3471579330978731e-06, "learning_rate": 0.0037709420094430624, "loss": 2.42, "step": 16205 }, { "crossentropy": 2.6357903480529785, "epoch": 0.5875145011600929, "grad_norm": 0.026074647903442383, "grad_norm_var": 1.437612980216537e-06, "learning_rate": 0.003770378759175031, "loss": 2.5364, "step": 16206 }, { "crossentropy": 2.5703775882720947, "epoch": 0.5875507540603249, "grad_norm": 0.026898503303527832, "grad_norm_var": 1.401018675320252e-06, "learning_rate": 0.0037698155255149004, "loss": 2.6004, "step": 16207 }, { "crossentropy": 2.61073899269104, "epoch": 0.5875870069605569, "grad_norm": 0.027057567611336708, "grad_norm_var": 1.4095085292658056e-06, "learning_rate": 0.0037692523084702782, "loss": 2.6011, "step": 16208 }, { "crossentropy": 2.495182752609253, "epoch": 0.5876232598607889, "grad_norm": 0.026014622300863266, "grad_norm_var": 1.4256561682507904e-06, "learning_rate": 0.0037686891080487713, "loss": 2.5502, "step": 16209 }, { "crossentropy": 2.541459083557129, "epoch": 0.5876595127610209, "grad_norm": 0.0275901947170496, "grad_norm_var": 1.3700365739951661e-06, "learning_rate": 0.0037681259242579863, "loss": 2.5645, "step": 16210 }, { "crossentropy": 2.550642728805542, "epoch": 0.5876957656612529, "grad_norm": 0.02796182967722416, "grad_norm_var": 1.1182554461201695e-06, "learning_rate": 0.003767562757105529, "loss": 2.5263, "step": 16211 }, { "crossentropy": 2.4675283432006836, "epoch": 0.5877320185614849, "grad_norm": 0.027180522680282593, "grad_norm_var": 1.087383391445984e-06, "learning_rate": 0.0037669996065990076, "loss": 2.4892, "step": 16212 }, { "crossentropy": 2.5294981002807617, "epoch": 0.587768271461717, "grad_norm": 0.02573852241039276, "grad_norm_var": 1.1514731060951876e-06, "learning_rate": 0.0037664364727460283, "loss": 2.4662, "step": 16213 }, { "crossentropy": 2.5298731327056885, "epoch": 0.587804524361949, "grad_norm": 0.026558594778180122, "grad_norm_var": 8.119430966448011e-07, "learning_rate": 0.003765873355554196, "loss": 2.4988, "step": 16214 }, { "crossentropy": 2.6466028690338135, "epoch": 0.587840777262181, "grad_norm": 0.026402320712804794, "grad_norm_var": 6.279893385136362e-07, "learning_rate": 0.003765310255031117, "loss": 2.6622, "step": 16215 }, { "crossentropy": 2.509413003921509, "epoch": 0.587877030162413, "grad_norm": 0.026536164805293083, "grad_norm_var": 5.515360351306004e-07, "learning_rate": 0.0037647471711843984, "loss": 2.5371, "step": 16216 }, { "crossentropy": 2.5743634700775146, "epoch": 0.587913283062645, "grad_norm": 0.03709463030099869, "grad_norm_var": 7.124617337441357e-06, "learning_rate": 0.003764184104021642, "loss": 2.5814, "step": 16217 }, { "crossentropy": 2.474865436553955, "epoch": 0.587949535962877, "grad_norm": 0.026901766657829285, "grad_norm_var": 7.1070235195230235e-06, "learning_rate": 0.0037636210535504544, "loss": 2.4985, "step": 16218 }, { "crossentropy": 2.6906228065490723, "epoch": 0.587985788863109, "grad_norm": 0.026789577677845955, "grad_norm_var": 7.137395092960301e-06, "learning_rate": 0.0037630580197784403, "loss": 2.703, "step": 16219 }, { "crossentropy": 2.639502763748169, "epoch": 0.588022041763341, "grad_norm": 0.02663148008286953, "grad_norm_var": 7.105234254493193e-06, "learning_rate": 0.003762495002713206, "loss": 2.5262, "step": 16220 }, { "crossentropy": 2.581808567047119, "epoch": 0.5880582946635731, "grad_norm": 0.026505593210458755, "grad_norm_var": 7.042566747909974e-06, "learning_rate": 0.003761932002362354, "loss": 2.5866, "step": 16221 }, { "crossentropy": 2.6710731983184814, "epoch": 0.5880945475638051, "grad_norm": 0.028065849095582962, "grad_norm_var": 6.946189898289475e-06, "learning_rate": 0.0037613690187334897, "loss": 2.5732, "step": 16222 }, { "crossentropy": 2.5032706260681152, "epoch": 0.5881308004640371, "grad_norm": 0.02732122130692005, "grad_norm_var": 6.923710804123314e-06, "learning_rate": 0.0037608060518342164, "loss": 2.5457, "step": 16223 }, { "crossentropy": 2.589921474456787, "epoch": 0.5881670533642691, "grad_norm": 0.026089537888765335, "grad_norm_var": 7.042210542483708e-06, "learning_rate": 0.0037602431016721368, "loss": 2.4936, "step": 16224 }, { "crossentropy": 2.495952844619751, "epoch": 0.5882033062645011, "grad_norm": 0.026608102023601532, "grad_norm_var": 6.949739608469784e-06, "learning_rate": 0.003759680168254859, "loss": 2.5293, "step": 16225 }, { "crossentropy": 2.2613096237182617, "epoch": 0.5882395591647331, "grad_norm": 0.026516437530517578, "grad_norm_var": 7.008670726613095e-06, "learning_rate": 0.0037591172515899795, "loss": 2.3845, "step": 16226 }, { "crossentropy": 2.4706201553344727, "epoch": 0.5882758120649652, "grad_norm": 0.026124678552150726, "grad_norm_var": 7.089681580986134e-06, "learning_rate": 0.0037585543516851045, "loss": 2.5481, "step": 16227 }, { "crossentropy": 2.5343151092529297, "epoch": 0.5883120649651972, "grad_norm": 0.02743254043161869, "grad_norm_var": 7.089079880844874e-06, "learning_rate": 0.0037579914685478377, "loss": 2.5557, "step": 16228 }, { "crossentropy": 2.624694585800171, "epoch": 0.5883483178654292, "grad_norm": 0.027773287147283554, "grad_norm_var": 6.915447922615133e-06, "learning_rate": 0.003757428602185781, "loss": 2.5726, "step": 16229 }, { "crossentropy": 2.5209460258483887, "epoch": 0.5883845707656613, "grad_norm": 0.02599327452480793, "grad_norm_var": 7.003327745811379e-06, "learning_rate": 0.003756865752606537, "loss": 2.6079, "step": 16230 }, { "crossentropy": 2.524473190307617, "epoch": 0.5884208236658933, "grad_norm": 0.02560313418507576, "grad_norm_var": 7.1521311458472755e-06, "learning_rate": 0.0037563029198177072, "loss": 2.4995, "step": 16231 }, { "crossentropy": 2.5636508464813232, "epoch": 0.5884570765661253, "grad_norm": 0.027427643537521362, "grad_norm_var": 7.102189407441487e-06, "learning_rate": 0.0037557401038268933, "loss": 2.5152, "step": 16232 }, { "crossentropy": 2.310877561569214, "epoch": 0.5884933294663574, "grad_norm": 0.027409274131059647, "grad_norm_var": 4.842533834438588e-07, "learning_rate": 0.0037551773046417005, "loss": 2.3664, "step": 16233 }, { "crossentropy": 2.6041629314422607, "epoch": 0.5885295823665894, "grad_norm": 0.026868296787142754, "grad_norm_var": 4.839789738724763e-07, "learning_rate": 0.003754614522269724, "loss": 2.5517, "step": 16234 }, { "crossentropy": 2.5827672481536865, "epoch": 0.5885658352668214, "grad_norm": 0.0274799931794405, "grad_norm_var": 5.107408077311669e-07, "learning_rate": 0.003754051756718569, "loss": 2.6064, "step": 16235 }, { "crossentropy": 2.5068376064300537, "epoch": 0.5886020881670534, "grad_norm": 0.026979679241776466, "grad_norm_var": 5.074469348084291e-07, "learning_rate": 0.0037534890079958362, "loss": 2.537, "step": 16236 }, { "crossentropy": 2.533282518386841, "epoch": 0.5886383410672854, "grad_norm": 0.02590601146221161, "grad_norm_var": 5.604395412134658e-07, "learning_rate": 0.0037529262761091258, "loss": 2.4304, "step": 16237 }, { "crossentropy": 2.6603384017944336, "epoch": 0.5886745939675174, "grad_norm": 0.02553427405655384, "grad_norm_var": 5.505703707895608e-07, "learning_rate": 0.003752363561066039, "loss": 2.5091, "step": 16238 }, { "crossentropy": 2.717832088470459, "epoch": 0.5887108468677494, "grad_norm": 0.025958595797419548, "grad_norm_var": 5.522456779404368e-07, "learning_rate": 0.003751800862874174, "loss": 2.5668, "step": 16239 }, { "crossentropy": 2.5512430667877197, "epoch": 0.5887470997679815, "grad_norm": 0.026542631909251213, "grad_norm_var": 5.338427013175384e-07, "learning_rate": 0.0037512381815411337, "loss": 2.5343, "step": 16240 }, { "crossentropy": 2.6422722339630127, "epoch": 0.5887833526682135, "grad_norm": 0.026893271133303642, "grad_norm_var": 5.37907659132553e-07, "learning_rate": 0.0037506755170745192, "loss": 2.4921, "step": 16241 }, { "crossentropy": 2.550358533859253, "epoch": 0.5888196055684455, "grad_norm": 0.02656625770032406, "grad_norm_var": 5.371577111986636e-07, "learning_rate": 0.0037501128694819243, "loss": 2.5421, "step": 16242 }, { "crossentropy": 2.381939172744751, "epoch": 0.5888558584686775, "grad_norm": 0.02587525174021721, "grad_norm_var": 5.587096178633529e-07, "learning_rate": 0.0037495502387709535, "loss": 2.3977, "step": 16243 }, { "crossentropy": 2.409336566925049, "epoch": 0.5888921113689095, "grad_norm": 0.02533024176955223, "grad_norm_var": 6.128439840342164e-07, "learning_rate": 0.003748987624949204, "loss": 2.5077, "step": 16244 }, { "crossentropy": 2.592266082763672, "epoch": 0.5889283642691415, "grad_norm": 0.026928773149847984, "grad_norm_var": 5.150378546420545e-07, "learning_rate": 0.003748425028024275, "loss": 2.5266, "step": 16245 }, { "crossentropy": 2.506216526031494, "epoch": 0.5889646171693735, "grad_norm": 0.02579433098435402, "grad_norm_var": 5.297866806714396e-07, "learning_rate": 0.0037478624480037654, "loss": 2.5431, "step": 16246 }, { "crossentropy": 2.514294385910034, "epoch": 0.5890008700696056, "grad_norm": 0.026119913905858994, "grad_norm_var": 4.885663237532648e-07, "learning_rate": 0.003747299884895272, "loss": 2.5301, "step": 16247 }, { "crossentropy": 2.5735578536987305, "epoch": 0.5890371229698376, "grad_norm": 0.02660703845322132, "grad_norm_var": 4.2651956652234314e-07, "learning_rate": 0.0037467373387063967, "loss": 2.6788, "step": 16248 }, { "crossentropy": 2.579256772994995, "epoch": 0.5890733758700696, "grad_norm": 0.028598759323358536, "grad_norm_var": 6.711142951370171e-07, "learning_rate": 0.0037461748094447357, "loss": 2.5549, "step": 16249 }, { "crossentropy": 2.5843698978424072, "epoch": 0.5891096287703016, "grad_norm": 0.026691962033510208, "grad_norm_var": 6.643740206326107e-07, "learning_rate": 0.003745612297117887, "loss": 2.5046, "step": 16250 }, { "crossentropy": 2.458860158920288, "epoch": 0.5891458816705336, "grad_norm": 0.0273889172822237, "grad_norm_var": 6.528454545974647e-07, "learning_rate": 0.003745049801733447, "loss": 2.4953, "step": 16251 }, { "crossentropy": 2.488889455795288, "epoch": 0.5891821345707656, "grad_norm": 0.028504718095064163, "grad_norm_var": 8.993520908329167e-07, "learning_rate": 0.0037444873232990133, "loss": 2.5489, "step": 16252 }, { "crossentropy": 2.6248772144317627, "epoch": 0.5892183874709976, "grad_norm": 0.027361901476979256, "grad_norm_var": 9.014681021252453e-07, "learning_rate": 0.0037439248618221844, "loss": 2.6348, "step": 16253 }, { "crossentropy": 2.4792773723602295, "epoch": 0.5892546403712297, "grad_norm": 0.025552507489919662, "grad_norm_var": 8.987313088810895e-07, "learning_rate": 0.0037433624173105564, "loss": 2.4265, "step": 16254 }, { "crossentropy": 2.5263564586639404, "epoch": 0.5892908932714617, "grad_norm": 0.02710168994963169, "grad_norm_var": 8.720178334936194e-07, "learning_rate": 0.0037427999897717244, "loss": 2.572, "step": 16255 }, { "crossentropy": 2.448012351989746, "epoch": 0.5893271461716937, "grad_norm": 0.028501683846116066, "grad_norm_var": 1.0600351278537585e-06, "learning_rate": 0.003742237579213288, "loss": 2.5047, "step": 16256 }, { "crossentropy": 2.4911305904388428, "epoch": 0.5893633990719258, "grad_norm": 0.028628027066588402, "grad_norm_var": 1.2549897361994905e-06, "learning_rate": 0.0037416751856428412, "loss": 2.5302, "step": 16257 }, { "crossentropy": 2.561016798019409, "epoch": 0.5893996519721578, "grad_norm": 0.02638525515794754, "grad_norm_var": 1.2668293691117978e-06, "learning_rate": 0.003741112809067982, "loss": 2.5333, "step": 16258 }, { "crossentropy": 2.6309494972229004, "epoch": 0.5894359048723898, "grad_norm": 0.027562644332647324, "grad_norm_var": 1.2005781253584946e-06, "learning_rate": 0.003740550449496304, "loss": 2.6383, "step": 16259 }, { "crossentropy": 2.5721945762634277, "epoch": 0.5894721577726219, "grad_norm": 0.027082907035946846, "grad_norm_var": 9.86906226045116e-07, "learning_rate": 0.0037399881069354025, "loss": 2.5353, "step": 16260 }, { "crossentropy": 2.6107687950134277, "epoch": 0.5895084106728539, "grad_norm": 0.027951186522841454, "grad_norm_var": 1.0185792385953089e-06, "learning_rate": 0.0037394257813928747, "loss": 2.5647, "step": 16261 }, { "crossentropy": 2.64986252784729, "epoch": 0.5895446635730859, "grad_norm": 0.026605453342199326, "grad_norm_var": 9.033949326798033e-07, "learning_rate": 0.0037388634728763143, "loss": 2.6158, "step": 16262 }, { "crossentropy": 2.598876953125, "epoch": 0.5895809164733179, "grad_norm": 0.027111854404211044, "grad_norm_var": 8.100997125859779e-07, "learning_rate": 0.0037383011813933156, "loss": 2.6343, "step": 16263 }, { "crossentropy": 2.5549869537353516, "epoch": 0.5896171693735499, "grad_norm": 0.02623014897108078, "grad_norm_var": 8.564274755167803e-07, "learning_rate": 0.0037377389069514757, "loss": 2.5293, "step": 16264 }, { "crossentropy": 2.5486090183258057, "epoch": 0.5896534222737819, "grad_norm": 0.028106804937124252, "grad_norm_var": 7.882472098755342e-07, "learning_rate": 0.0037371766495583863, "loss": 2.5352, "step": 16265 }, { "crossentropy": 2.3246145248413086, "epoch": 0.5896896751740139, "grad_norm": 0.02957707643508911, "grad_norm_var": 1.0753663083443253e-06, "learning_rate": 0.0037366144092216436, "loss": 2.3992, "step": 16266 }, { "crossentropy": 2.5465211868286133, "epoch": 0.589725928074246, "grad_norm": 0.026670046150684357, "grad_norm_var": 1.1162319355212737e-06, "learning_rate": 0.003736052185948841, "loss": 2.5507, "step": 16267 }, { "crossentropy": 2.4969561100006104, "epoch": 0.589762180974478, "grad_norm": 0.026393232867121696, "grad_norm_var": 1.0932617018627552e-06, "learning_rate": 0.003735489979747571, "loss": 2.5093, "step": 16268 }, { "crossentropy": 2.569976806640625, "epoch": 0.58979843387471, "grad_norm": 0.02652893215417862, "grad_norm_var": 1.1299072589824113e-06, "learning_rate": 0.0037349277906254274, "loss": 2.5825, "step": 16269 }, { "crossentropy": 2.429603338241577, "epoch": 0.589834686774942, "grad_norm": 0.026348939165472984, "grad_norm_var": 9.89362997023133e-07, "learning_rate": 0.003734365618590004, "loss": 2.5219, "step": 16270 }, { "crossentropy": 2.5608861446380615, "epoch": 0.589870939675174, "grad_norm": 0.027266882359981537, "grad_norm_var": 9.867200560254378e-07, "learning_rate": 0.0037338034636488927, "loss": 2.5548, "step": 16271 }, { "crossentropy": 2.5061440467834473, "epoch": 0.589907192575406, "grad_norm": 0.02680123597383499, "grad_norm_var": 8.97127598983383e-07, "learning_rate": 0.0037332413258096887, "loss": 2.5671, "step": 16272 }, { "crossentropy": 2.55230450630188, "epoch": 0.589943445475638, "grad_norm": 0.02687447890639305, "grad_norm_var": 7.561687033805682e-07, "learning_rate": 0.0037326792050799825, "loss": 2.4812, "step": 16273 }, { "crossentropy": 2.4877991676330566, "epoch": 0.58997969837587, "grad_norm": 0.026807229965925217, "grad_norm_var": 7.274456337834241e-07, "learning_rate": 0.0037321171014673667, "loss": 2.536, "step": 16274 }, { "crossentropy": 2.5764453411102295, "epoch": 0.5900159512761021, "grad_norm": 0.028099987655878067, "grad_norm_var": 7.772095843021238e-07, "learning_rate": 0.003731555014979434, "loss": 2.5097, "step": 16275 }, { "crossentropy": 2.5580365657806396, "epoch": 0.5900522041763341, "grad_norm": 0.02567886933684349, "grad_norm_var": 9.136372015390437e-07, "learning_rate": 0.0037309929456237757, "loss": 2.601, "step": 16276 }, { "crossentropy": 2.3142011165618896, "epoch": 0.5900884570765661, "grad_norm": 0.027004212141036987, "grad_norm_var": 8.578894718865952e-07, "learning_rate": 0.0037304308934079833, "loss": 2.3628, "step": 16277 }, { "crossentropy": 2.475512742996216, "epoch": 0.5901247099767981, "grad_norm": 0.02869890071451664, "grad_norm_var": 1.0198302566101115e-06, "learning_rate": 0.003729868858339648, "loss": 2.534, "step": 16278 }, { "crossentropy": 2.5450899600982666, "epoch": 0.5901609628770301, "grad_norm": 0.027981974184513092, "grad_norm_var": 1.0641827030801227e-06, "learning_rate": 0.003729306840426361, "loss": 2.566, "step": 16279 }, { "crossentropy": 2.6279125213623047, "epoch": 0.5901972157772621, "grad_norm": 0.026111317798495293, "grad_norm_var": 1.080301955744961e-06, "learning_rate": 0.0037287448396757135, "loss": 2.5866, "step": 16280 }, { "crossentropy": 2.4989635944366455, "epoch": 0.5902334686774942, "grad_norm": 0.02825400047004223, "grad_norm_var": 1.0997596409298977e-06, "learning_rate": 0.0037281828560952967, "loss": 2.546, "step": 16281 }, { "crossentropy": 2.6300556659698486, "epoch": 0.5902697215777262, "grad_norm": 0.030004078522324562, "grad_norm_var": 1.2468562460995458e-06, "learning_rate": 0.0037276208896927, "loss": 2.5875, "step": 16282 }, { "crossentropy": 2.5501768589019775, "epoch": 0.5903059744779582, "grad_norm": 0.02740367501974106, "grad_norm_var": 1.2266731167192546e-06, "learning_rate": 0.0037270589404755157, "loss": 2.5612, "step": 16283 }, { "crossentropy": 2.6412789821624756, "epoch": 0.5903422273781903, "grad_norm": 0.026661034673452377, "grad_norm_var": 1.1999873187826105e-06, "learning_rate": 0.0037264970084513304, "loss": 2.609, "step": 16284 }, { "crossentropy": 2.590566873550415, "epoch": 0.5903784802784223, "grad_norm": 0.046383440494537354, "grad_norm_var": 2.3841734077843197e-05, "learning_rate": 0.0037259350936277363, "loss": 2.5789, "step": 16285 }, { "crossentropy": 2.565402030944824, "epoch": 0.5904147331786543, "grad_norm": 0.027980508282780647, "grad_norm_var": 2.3534992797413187e-05, "learning_rate": 0.003725373196012322, "loss": 2.5854, "step": 16286 }, { "crossentropy": 2.556527853012085, "epoch": 0.5904509860788864, "grad_norm": 0.027478208765387535, "grad_norm_var": 2.3499495665885103e-05, "learning_rate": 0.0037248113156126764, "loss": 2.5469, "step": 16287 }, { "crossentropy": 2.583494186401367, "epoch": 0.5904872389791184, "grad_norm": 0.02727900817990303, "grad_norm_var": 2.3396694671356613e-05, "learning_rate": 0.003724249452436389, "loss": 2.5787, "step": 16288 }, { "crossentropy": 2.4941840171813965, "epoch": 0.5905234918793504, "grad_norm": 0.025762353092432022, "grad_norm_var": 2.3740065427551445e-05, "learning_rate": 0.0037236876064910495, "loss": 2.5055, "step": 16289 }, { "crossentropy": 2.623782157897949, "epoch": 0.5905597447795824, "grad_norm": 0.026134047657251358, "grad_norm_var": 2.3929240803952554e-05, "learning_rate": 0.0037231257777842466, "loss": 2.6014, "step": 16290 }, { "crossentropy": 2.5456862449645996, "epoch": 0.5905959976798144, "grad_norm": 0.027733806520700455, "grad_norm_var": 2.395994562007555e-05, "learning_rate": 0.0037225639663235667, "loss": 2.5114, "step": 16291 }, { "crossentropy": 2.464484214782715, "epoch": 0.5906322505800464, "grad_norm": 0.026001349091529846, "grad_norm_var": 2.3843667675199574e-05, "learning_rate": 0.0037220021721166, "loss": 2.4742, "step": 16292 }, { "crossentropy": 2.43603515625, "epoch": 0.5906685034802784, "grad_norm": 0.02755715511739254, "grad_norm_var": 2.374848108061439e-05, "learning_rate": 0.003721440395170933, "loss": 2.4572, "step": 16293 }, { "crossentropy": 2.6002891063690186, "epoch": 0.5907047563805105, "grad_norm": 0.026781320571899414, "grad_norm_var": 2.3950215277161734e-05, "learning_rate": 0.0037208786354941533, "loss": 2.5545, "step": 16294 }, { "crossentropy": 2.5458714962005615, "epoch": 0.5907410092807425, "grad_norm": 0.02623330056667328, "grad_norm_var": 2.425493249136717e-05, "learning_rate": 0.003720316893093848, "loss": 2.4875, "step": 16295 }, { "crossentropy": 2.3914268016815186, "epoch": 0.5907772621809745, "grad_norm": 0.026062561199069023, "grad_norm_var": 2.4269698912852734e-05, "learning_rate": 0.0037197551679776053, "loss": 2.4898, "step": 16296 }, { "crossentropy": 2.554716110229492, "epoch": 0.5908135150812065, "grad_norm": 0.026470107957720757, "grad_norm_var": 2.4493057634731908e-05, "learning_rate": 0.003719193460153013, "loss": 2.4962, "step": 16297 }, { "crossentropy": 2.46728253364563, "epoch": 0.5908497679814385, "grad_norm": 0.02627958357334137, "grad_norm_var": 2.4486676689478e-05, "learning_rate": 0.0037186317696276563, "loss": 2.4405, "step": 16298 }, { "crossentropy": 2.640921115875244, "epoch": 0.5908860208816705, "grad_norm": 0.027207665145397186, "grad_norm_var": 2.450499174515701e-05, "learning_rate": 0.003718070096409122, "loss": 2.6511, "step": 16299 }, { "crossentropy": 2.6915173530578613, "epoch": 0.5909222737819025, "grad_norm": 0.027163010090589523, "grad_norm_var": 2.443110062707239e-05, "learning_rate": 0.0037175084405049975, "loss": 2.6653, "step": 16300 }, { "crossentropy": 2.561270236968994, "epoch": 0.5909585266821346, "grad_norm": 0.026502590626478195, "grad_norm_var": 4.877237421414913e-07, "learning_rate": 0.003716946801922867, "loss": 2.5043, "step": 16301 }, { "crossentropy": 2.6844704151153564, "epoch": 0.5909947795823666, "grad_norm": 0.02761080488562584, "grad_norm_var": 4.3754026275797525e-07, "learning_rate": 0.0037163851806703164, "loss": 2.5577, "step": 16302 }, { "crossentropy": 2.4675076007843018, "epoch": 0.5910310324825986, "grad_norm": 0.026956886053085327, "grad_norm_var": 4.050247271685277e-07, "learning_rate": 0.0037158235767549307, "loss": 2.4244, "step": 16303 }, { "crossentropy": 2.5552000999450684, "epoch": 0.5910672853828306, "grad_norm": 0.02750965766608715, "grad_norm_var": 4.2512669808439374e-07, "learning_rate": 0.003715261990184299, "loss": 2.5764, "step": 16304 }, { "crossentropy": 2.589829921722412, "epoch": 0.5911035382830626, "grad_norm": 0.026013385504484177, "grad_norm_var": 3.960784673656681e-07, "learning_rate": 0.0037147004209660017, "loss": 2.6368, "step": 16305 }, { "crossentropy": 2.5124542713165283, "epoch": 0.5911397911832946, "grad_norm": 0.02869351953268051, "grad_norm_var": 5.906744604169377e-07, "learning_rate": 0.003714138869107627, "loss": 2.4794, "step": 16306 }, { "crossentropy": 2.581467866897583, "epoch": 0.5911760440835266, "grad_norm": 0.028155826032161713, "grad_norm_var": 6.473986183944587e-07, "learning_rate": 0.0037135773346167584, "loss": 2.5105, "step": 16307 }, { "crossentropy": 2.6993274688720703, "epoch": 0.5912122969837587, "grad_norm": 0.027012227103114128, "grad_norm_var": 5.834138052522673e-07, "learning_rate": 0.003713015817500979, "loss": 2.6675, "step": 16308 }, { "crossentropy": 2.4679956436157227, "epoch": 0.5912485498839907, "grad_norm": 0.0258783008903265, "grad_norm_var": 6.377880665860234e-07, "learning_rate": 0.0037124543177678767, "loss": 2.4534, "step": 16309 }, { "crossentropy": 2.5285873413085938, "epoch": 0.5912848027842227, "grad_norm": 0.0256721880286932, "grad_norm_var": 7.334333010653241e-07, "learning_rate": 0.0037118928354250304, "loss": 2.4143, "step": 16310 }, { "crossentropy": 2.6602606773376465, "epoch": 0.5913210556844548, "grad_norm": 0.028270317241549492, "grad_norm_var": 8.28304284828625e-07, "learning_rate": 0.0037113313704800256, "loss": 2.5496, "step": 16311 }, { "crossentropy": 2.3858273029327393, "epoch": 0.5913573085846868, "grad_norm": 0.02638981305062771, "grad_norm_var": 7.955701998988192e-07, "learning_rate": 0.003710769922940447, "loss": 2.3574, "step": 16312 }, { "crossentropy": 2.4488284587860107, "epoch": 0.5913935614849188, "grad_norm": 0.02682584710419178, "grad_norm_var": 7.789805643151528e-07, "learning_rate": 0.003710208492813878, "loss": 2.3997, "step": 16313 }, { "crossentropy": 2.288632869720459, "epoch": 0.5914298143851509, "grad_norm": 0.02933439612388611, "grad_norm_var": 1.0651861755362935e-06, "learning_rate": 0.0037096470801078994, "loss": 2.4013, "step": 16314 }, { "crossentropy": 2.5103507041931152, "epoch": 0.5914660672853829, "grad_norm": 0.027049381285905838, "grad_norm_var": 1.0665855653788399e-06, "learning_rate": 0.003709085684830096, "loss": 2.5734, "step": 16315 }, { "crossentropy": 2.5151259899139404, "epoch": 0.5915023201856149, "grad_norm": 0.03062397800385952, "grad_norm_var": 1.8028277397783357e-06, "learning_rate": 0.003708524306988048, "loss": 2.5979, "step": 16316 }, { "crossentropy": 2.469296455383301, "epoch": 0.5915385730858469, "grad_norm": 0.02826249785721302, "grad_norm_var": 1.7843726943721535e-06, "learning_rate": 0.003707962946589343, "loss": 2.5125, "step": 16317 }, { "crossentropy": 2.5013375282287598, "epoch": 0.5915748259860789, "grad_norm": 0.027073755860328674, "grad_norm_var": 1.7956239425359707e-06, "learning_rate": 0.003707401603641556, "loss": 2.5655, "step": 16318 }, { "crossentropy": 2.4294798374176025, "epoch": 0.5916110788863109, "grad_norm": 0.026366524398326874, "grad_norm_var": 1.8587902432164038e-06, "learning_rate": 0.0037068402781522715, "loss": 2.4116, "step": 16319 }, { "crossentropy": 2.705134391784668, "epoch": 0.5916473317865429, "grad_norm": 0.027773290872573853, "grad_norm_var": 1.865381416151409e-06, "learning_rate": 0.003706278970129072, "loss": 2.6686, "step": 16320 }, { "crossentropy": 2.343398094177246, "epoch": 0.591683584686775, "grad_norm": 0.02689187042415142, "grad_norm_var": 1.7439129847786194e-06, "learning_rate": 0.0037057176795795388, "loss": 2.4729, "step": 16321 }, { "crossentropy": 2.389451742172241, "epoch": 0.591719837587007, "grad_norm": 0.02967352420091629, "grad_norm_var": 1.9576570150327585e-06, "learning_rate": 0.003705156406511252, "loss": 2.5478, "step": 16322 }, { "crossentropy": 2.7368576526641846, "epoch": 0.591756090487239, "grad_norm": 0.027035914361476898, "grad_norm_var": 1.94981632204622e-06, "learning_rate": 0.0037045951509317933, "loss": 2.6277, "step": 16323 }, { "crossentropy": 2.6432602405548096, "epoch": 0.591792343387471, "grad_norm": 0.02741212397813797, "grad_norm_var": 1.9333573444875346e-06, "learning_rate": 0.003704033912848742, "loss": 2.5468, "step": 16324 }, { "crossentropy": 2.7349517345428467, "epoch": 0.591828596287703, "grad_norm": 0.02769341506063938, "grad_norm_var": 1.738723360510582e-06, "learning_rate": 0.0037034726922696827, "loss": 2.6053, "step": 16325 }, { "crossentropy": 2.62040376663208, "epoch": 0.591864849187935, "grad_norm": 0.02508186362683773, "grad_norm_var": 1.915925278716165e-06, "learning_rate": 0.0037029114892021893, "loss": 2.5255, "step": 16326 }, { "crossentropy": 2.5152125358581543, "epoch": 0.591901102088167, "grad_norm": 0.025756387040019035, "grad_norm_var": 2.0895522583571266e-06, "learning_rate": 0.0037023503036538443, "loss": 2.497, "step": 16327 }, { "crossentropy": 2.560051441192627, "epoch": 0.5919373549883991, "grad_norm": 0.02726091630756855, "grad_norm_var": 2.013517284568609e-06, "learning_rate": 0.0037017891356322286, "loss": 2.5842, "step": 16328 }, { "crossentropy": 2.6490018367767334, "epoch": 0.5919736078886311, "grad_norm": 0.029322542250156403, "grad_norm_var": 2.176282682480448e-06, "learning_rate": 0.0037012279851449206, "loss": 2.6439, "step": 16329 }, { "crossentropy": 2.671050548553467, "epoch": 0.5920098607888631, "grad_norm": 0.026620520278811455, "grad_norm_var": 2.0319070319592866e-06, "learning_rate": 0.0037006668521995, "loss": 2.5673, "step": 16330 }, { "crossentropy": 2.607783079147339, "epoch": 0.5920461136890951, "grad_norm": 0.02748061716556549, "grad_norm_var": 2.0179848110918676e-06, "learning_rate": 0.0037001057368035444, "loss": 2.5218, "step": 16331 }, { "crossentropy": 2.3307275772094727, "epoch": 0.5920823665893271, "grad_norm": 0.02683844231069088, "grad_norm_var": 1.347238717287237e-06, "learning_rate": 0.0036995446389646337, "loss": 2.4489, "step": 16332 }, { "crossentropy": 2.5453944206237793, "epoch": 0.5921186194895591, "grad_norm": 0.0276185255497694, "grad_norm_var": 1.2891418571430816e-06, "learning_rate": 0.0036989835586903465, "loss": 2.582, "step": 16333 }, { "crossentropy": 2.47356915473938, "epoch": 0.5921548723897911, "grad_norm": 0.03467506542801857, "grad_norm_var": 4.728080840260956e-06, "learning_rate": 0.0036984224959882616, "loss": 2.5099, "step": 16334 }, { "crossentropy": 2.5747690200805664, "epoch": 0.5921911252900232, "grad_norm": 0.026807166635990143, "grad_norm_var": 4.660764160262637e-06, "learning_rate": 0.003697861450865956, "loss": 2.4996, "step": 16335 }, { "crossentropy": 2.4036924839019775, "epoch": 0.5922273781902552, "grad_norm": 0.026535717770457268, "grad_norm_var": 4.752048890875243e-06, "learning_rate": 0.0036973004233310063, "loss": 2.3991, "step": 16336 }, { "crossentropy": 2.4960777759552, "epoch": 0.5922636310904872, "grad_norm": 0.025234375149011612, "grad_norm_var": 5.095508157856865e-06, "learning_rate": 0.003696739413390992, "loss": 2.5459, "step": 16337 }, { "crossentropy": 2.540571928024292, "epoch": 0.5922998839907193, "grad_norm": 0.027488350868225098, "grad_norm_var": 4.779742050767949e-06, "learning_rate": 0.00369617842105349, "loss": 2.5607, "step": 16338 }, { "crossentropy": 2.560887336730957, "epoch": 0.5923361368909513, "grad_norm": 0.025265229865908623, "grad_norm_var": 5.0684735198172945e-06, "learning_rate": 0.003695617446326076, "loss": 2.5727, "step": 16339 }, { "crossentropy": 2.6228370666503906, "epoch": 0.5923723897911833, "grad_norm": 0.026717031374573708, "grad_norm_var": 5.089966189530485e-06, "learning_rate": 0.0036950564892163276, "loss": 2.5811, "step": 16340 }, { "crossentropy": 2.5147907733917236, "epoch": 0.5924086426914154, "grad_norm": 0.02611961029469967, "grad_norm_var": 5.156919273286743e-06, "learning_rate": 0.003694495549731822, "loss": 2.5364, "step": 16341 }, { "crossentropy": 2.73181414604187, "epoch": 0.5924448955916474, "grad_norm": 0.02832682803273201, "grad_norm_var": 4.908806297473792e-06, "learning_rate": 0.003693934627880137, "loss": 2.6177, "step": 16342 }, { "crossentropy": 2.5317177772521973, "epoch": 0.5924811484918794, "grad_norm": 0.025842497125267982, "grad_norm_var": 4.890637566023906e-06, "learning_rate": 0.003693373723668846, "loss": 2.5328, "step": 16343 }, { "crossentropy": 2.5135912895202637, "epoch": 0.5925174013921114, "grad_norm": 0.027068953961133957, "grad_norm_var": 4.896106082305507e-06, "learning_rate": 0.0036928128371055253, "loss": 2.5652, "step": 16344 }, { "crossentropy": 2.517960548400879, "epoch": 0.5925536542923434, "grad_norm": 0.027564814314246178, "grad_norm_var": 4.63220897725911e-06, "learning_rate": 0.003692251968197751, "loss": 2.5015, "step": 16345 }, { "crossentropy": 2.58571195602417, "epoch": 0.5925899071925754, "grad_norm": 0.026510171592235565, "grad_norm_var": 4.642419023337165e-06, "learning_rate": 0.0036916911169530987, "loss": 2.5918, "step": 16346 }, { "crossentropy": 2.5191025733947754, "epoch": 0.5926261600928074, "grad_norm": 0.02646825835108757, "grad_norm_var": 4.676132376372523e-06, "learning_rate": 0.0036911302833791417, "loss": 2.5339, "step": 16347 }, { "crossentropy": 2.6437604427337646, "epoch": 0.5926624129930395, "grad_norm": 0.027084436267614365, "grad_norm_var": 4.6682995044935624e-06, "learning_rate": 0.0036905694674834576, "loss": 2.5641, "step": 16348 }, { "crossentropy": 2.4741337299346924, "epoch": 0.5926986658932715, "grad_norm": 0.025877714157104492, "grad_norm_var": 4.76240064889253e-06, "learning_rate": 0.0036900086692736203, "loss": 2.5459, "step": 16349 }, { "crossentropy": 2.608666181564331, "epoch": 0.5927349187935035, "grad_norm": 0.027976425364613533, "grad_norm_var": 8.004327555445699e-07, "learning_rate": 0.0036894478887572035, "loss": 2.5204, "step": 16350 }, { "crossentropy": 2.5851552486419678, "epoch": 0.5927711716937355, "grad_norm": 0.028055518865585327, "grad_norm_var": 9.189193456398629e-07, "learning_rate": 0.003688887125941783, "loss": 2.6043, "step": 16351 }, { "crossentropy": 2.359863519668579, "epoch": 0.5928074245939675, "grad_norm": 0.027574613690376282, "grad_norm_var": 9.555167951572422e-07, "learning_rate": 0.003688326380834931, "loss": 2.4577, "step": 16352 }, { "crossentropy": 2.411360025405884, "epoch": 0.5928436774941995, "grad_norm": 0.026258938014507294, "grad_norm_var": 8.040470826302382e-07, "learning_rate": 0.0036877656534442214, "loss": 2.4062, "step": 16353 }, { "crossentropy": 2.4546992778778076, "epoch": 0.5928799303944315, "grad_norm": 0.025961581617593765, "grad_norm_var": 8.27413619678731e-07, "learning_rate": 0.003687204943777228, "loss": 2.5388, "step": 16354 }, { "crossentropy": 2.6140899658203125, "epoch": 0.5929161832946636, "grad_norm": 0.0286552831530571, "grad_norm_var": 8.555639051470529e-07, "learning_rate": 0.003686644251841523, "loss": 2.6196, "step": 16355 }, { "crossentropy": 2.498232364654541, "epoch": 0.5929524361948956, "grad_norm": 0.026158420369029045, "grad_norm_var": 8.964344779178637e-07, "learning_rate": 0.0036860835776446822, "loss": 2.4879, "step": 16356 }, { "crossentropy": 2.4234495162963867, "epoch": 0.5929886890951276, "grad_norm": 0.026338333263993263, "grad_norm_var": 8.746535375883976e-07, "learning_rate": 0.003685522921194276, "loss": 2.4792, "step": 16357 }, { "crossentropy": 2.7916672229766846, "epoch": 0.5930249419953596, "grad_norm": 0.026269713416695595, "grad_norm_var": 7.70458954939673e-07, "learning_rate": 0.0036849622824978775, "loss": 2.6587, "step": 16358 }, { "crossentropy": 2.611755847930908, "epoch": 0.5930611948955916, "grad_norm": 0.02629449963569641, "grad_norm_var": 7.222615479787377e-07, "learning_rate": 0.00368440166156306, "loss": 2.5677, "step": 16359 }, { "crossentropy": 2.654813528060913, "epoch": 0.5930974477958236, "grad_norm": 0.028064310550689697, "grad_norm_var": 8.089468362196579e-07, "learning_rate": 0.0036838410583973935, "loss": 2.5908, "step": 16360 }, { "crossentropy": 2.5569138526916504, "epoch": 0.5931337006960556, "grad_norm": 0.028307946398854256, "grad_norm_var": 9.049191682947704e-07, "learning_rate": 0.003683280473008451, "loss": 2.6334, "step": 16361 }, { "crossentropy": 2.5363516807556152, "epoch": 0.5931699535962877, "grad_norm": 0.025625353679060936, "grad_norm_var": 1.0105778824360165e-06, "learning_rate": 0.003682719905403804, "loss": 2.4825, "step": 16362 }, { "crossentropy": 2.552720308303833, "epoch": 0.5932062064965197, "grad_norm": 0.026415536180138588, "grad_norm_var": 1.0140376121073875e-06, "learning_rate": 0.0036821593555910227, "loss": 2.483, "step": 16363 }, { "crossentropy": 2.5772409439086914, "epoch": 0.5932424593967517, "grad_norm": 0.026900652796030045, "grad_norm_var": 1.0124234118273478e-06, "learning_rate": 0.00368159882357768, "loss": 2.5471, "step": 16364 }, { "crossentropy": 2.4450857639312744, "epoch": 0.5932787122969838, "grad_norm": 0.027195801958441734, "grad_norm_var": 9.376685440528586e-07, "learning_rate": 0.003681038309371346, "loss": 2.4765, "step": 16365 }, { "crossentropy": 2.6129541397094727, "epoch": 0.5933149651972158, "grad_norm": 0.02960093319416046, "grad_norm_var": 1.313385873700348e-06, "learning_rate": 0.0036804778129795913, "loss": 2.6357, "step": 16366 }, { "crossentropy": 2.522704839706421, "epoch": 0.5933512180974478, "grad_norm": 0.02719498798251152, "grad_norm_var": 1.2505894683531438e-06, "learning_rate": 0.0036799173344099867, "loss": 2.5732, "step": 16367 }, { "crossentropy": 2.579472064971924, "epoch": 0.5933874709976799, "grad_norm": 0.027543790638446808, "grad_norm_var": 1.2484971635622615e-06, "learning_rate": 0.0036793568736701015, "loss": 2.4971, "step": 16368 }, { "crossentropy": 2.4558815956115723, "epoch": 0.5934237238979119, "grad_norm": 0.025848908349871635, "grad_norm_var": 1.3022052309722212e-06, "learning_rate": 0.003678796430767506, "loss": 2.4326, "step": 16369 }, { "crossentropy": 2.504815101623535, "epoch": 0.5934599767981439, "grad_norm": 0.02918793074786663, "grad_norm_var": 1.4959709188481675e-06, "learning_rate": 0.003678236005709769, "loss": 2.4816, "step": 16370 }, { "crossentropy": 2.495131731033325, "epoch": 0.5934962296983759, "grad_norm": 0.026867462322115898, "grad_norm_var": 1.3548303321124384e-06, "learning_rate": 0.0036776755985044596, "loss": 2.5331, "step": 16371 }, { "crossentropy": 2.457348108291626, "epoch": 0.5935324825986079, "grad_norm": 0.026362065225839615, "grad_norm_var": 1.331491750498195e-06, "learning_rate": 0.00367711520915915, "loss": 2.4602, "step": 16372 }, { "crossentropy": 2.666928291320801, "epoch": 0.5935687354988399, "grad_norm": 0.027389051392674446, "grad_norm_var": 1.2901240307999397e-06, "learning_rate": 0.0036765548376814063, "loss": 2.6667, "step": 16373 }, { "crossentropy": 2.5181705951690674, "epoch": 0.5936049883990719, "grad_norm": 0.02651093900203705, "grad_norm_var": 1.2641031517194683e-06, "learning_rate": 0.003675994484078797, "loss": 2.4985, "step": 16374 }, { "crossentropy": 2.447737693786621, "epoch": 0.593641241299304, "grad_norm": 0.02678685076534748, "grad_norm_var": 1.2193585173405578e-06, "learning_rate": 0.003675434148358893, "loss": 2.5, "step": 16375 }, { "crossentropy": 2.5692591667175293, "epoch": 0.593677494199536, "grad_norm": 0.026146739721298218, "grad_norm_var": 1.2378204673162921e-06, "learning_rate": 0.003674873830529261, "loss": 2.519, "step": 16376 }, { "crossentropy": 2.527578592300415, "epoch": 0.593713747099768, "grad_norm": 0.025349102914333344, "grad_norm_var": 1.315468758434496e-06, "learning_rate": 0.0036743135305974685, "loss": 2.5418, "step": 16377 }, { "crossentropy": 2.4501028060913086, "epoch": 0.59375, "grad_norm": 0.02660425379872322, "grad_norm_var": 1.2047005232376872e-06, "learning_rate": 0.0036737532485710826, "loss": 2.4864, "step": 16378 }, { "crossentropy": 2.6123931407928467, "epoch": 0.593786252900232, "grad_norm": 0.02793494611978531, "grad_norm_var": 1.2317858595950818e-06, "learning_rate": 0.0036731929844576717, "loss": 2.529, "step": 16379 }, { "crossentropy": 2.5044050216674805, "epoch": 0.593822505800464, "grad_norm": 0.025709133595228195, "grad_norm_var": 1.3504449494512622e-06, "learning_rate": 0.003672632738264804, "loss": 2.5034, "step": 16380 }, { "crossentropy": 2.539537191390991, "epoch": 0.593858758700696, "grad_norm": 0.02604060247540474, "grad_norm_var": 1.4059336331106633e-06, "learning_rate": 0.003672072510000045, "loss": 2.5556, "step": 16381 }, { "crossentropy": 2.494320869445801, "epoch": 0.5938950116009281, "grad_norm": 0.027683107182383537, "grad_norm_var": 9.559870742454254e-07, "learning_rate": 0.003671512299670962, "loss": 2.5056, "step": 16382 }, { "crossentropy": 2.6128506660461426, "epoch": 0.5939312645011601, "grad_norm": 0.027535663917660713, "grad_norm_var": 9.801608850050348e-07, "learning_rate": 0.0036709521072851217, "loss": 2.5692, "step": 16383 }, { "crossentropy": 2.565260887145996, "epoch": 0.5939675174013921, "grad_norm": 0.027514653280377388, "grad_norm_var": 9.774944350761372e-07, "learning_rate": 0.0036703919328500913, "loss": 2.5311, "step": 16384 }, { "crossentropy": 2.6312906742095947, "epoch": 0.5940037703016241, "grad_norm": 0.026459285989403725, "grad_norm_var": 9.199610276029608e-07, "learning_rate": 0.003669831776373434, "loss": 2.6308, "step": 16385 }, { "crossentropy": 2.628237247467041, "epoch": 0.5940400232018561, "grad_norm": 0.02796691283583641, "grad_norm_var": 6.374228839811502e-07, "learning_rate": 0.003669271637862718, "loss": 2.5716, "step": 16386 }, { "crossentropy": 2.643375873565674, "epoch": 0.5940762761020881, "grad_norm": 0.025785604491829872, "grad_norm_var": 7.013904969571516e-07, "learning_rate": 0.003668711517325506, "loss": 2.6768, "step": 16387 }, { "crossentropy": 2.6029539108276367, "epoch": 0.5941125290023201, "grad_norm": 0.0265620369464159, "grad_norm_var": 6.939147521713961e-07, "learning_rate": 0.003668151414769366, "loss": 2.5241, "step": 16388 }, { "crossentropy": 2.460179090499878, "epoch": 0.5941487819025522, "grad_norm": 0.026490576565265656, "grad_norm_var": 6.67654005263848e-07, "learning_rate": 0.0036675913302018637, "loss": 2.5296, "step": 16389 }, { "crossentropy": 2.4947354793548584, "epoch": 0.5941850348027842, "grad_norm": 0.029051771387457848, "grad_norm_var": 1.0096258497291807e-06, "learning_rate": 0.0036670312636305612, "loss": 2.5303, "step": 16390 }, { "crossentropy": 2.5581982135772705, "epoch": 0.5942212877030162, "grad_norm": 0.026662688702344894, "grad_norm_var": 1.0116567739882568e-06, "learning_rate": 0.0036664712150630253, "loss": 2.4972, "step": 16391 }, { "crossentropy": 2.6155943870544434, "epoch": 0.5942575406032483, "grad_norm": 0.02766844443976879, "grad_norm_var": 1.0149987039187458e-06, "learning_rate": 0.0036659111845068182, "loss": 2.4904, "step": 16392 }, { "crossentropy": 2.6593375205993652, "epoch": 0.5942937935034803, "grad_norm": 0.026897167786955833, "grad_norm_var": 8.366790273816541e-07, "learning_rate": 0.0036653511719695075, "loss": 2.5844, "step": 16393 }, { "crossentropy": 2.432753801345825, "epoch": 0.5943300464037123, "grad_norm": 0.02731633372604847, "grad_norm_var": 8.274327331358443e-07, "learning_rate": 0.003664791177458653, "loss": 2.5489, "step": 16394 }, { "crossentropy": 2.6008825302124023, "epoch": 0.5943662993039444, "grad_norm": 0.02698504738509655, "grad_norm_var": 7.755368525201445e-07, "learning_rate": 0.003664231200981819, "loss": 2.5382, "step": 16395 }, { "crossentropy": 2.5450544357299805, "epoch": 0.5944025522041764, "grad_norm": 0.02717372216284275, "grad_norm_var": 6.53506369773423e-07, "learning_rate": 0.003663671242546571, "loss": 2.481, "step": 16396 }, { "crossentropy": 2.4678516387939453, "epoch": 0.5944388051044084, "grad_norm": 0.02558545581996441, "grad_norm_var": 7.314789830383984e-07, "learning_rate": 0.00366311130216047, "loss": 2.5136, "step": 16397 }, { "crossentropy": 2.6863603591918945, "epoch": 0.5944750580046404, "grad_norm": 0.028317293152213097, "grad_norm_var": 8.073045674667252e-07, "learning_rate": 0.0036625513798310807, "loss": 2.6086, "step": 16398 }, { "crossentropy": 2.6060986518859863, "epoch": 0.5945113109048724, "grad_norm": 0.02948109060525894, "grad_norm_var": 1.1508128416888586e-06, "learning_rate": 0.003661991475565963, "loss": 2.5909, "step": 16399 }, { "crossentropy": 2.618960380554199, "epoch": 0.5945475638051044, "grad_norm": 0.02885650098323822, "grad_norm_var": 1.3116134404937782e-06, "learning_rate": 0.0036614315893726814, "loss": 2.625, "step": 16400 }, { "crossentropy": 2.480771780014038, "epoch": 0.5945838167053364, "grad_norm": 0.029099641367793083, "grad_norm_var": 1.4412396676695502e-06, "learning_rate": 0.0036608717212588005, "loss": 2.508, "step": 16401 }, { "crossentropy": 2.475288152694702, "epoch": 0.5946200696055685, "grad_norm": 0.02767711877822876, "grad_norm_var": 1.42820651419886e-06, "learning_rate": 0.0036603118712318773, "loss": 2.517, "step": 16402 }, { "crossentropy": 2.4629247188568115, "epoch": 0.5946563225058005, "grad_norm": 0.02603895030915737, "grad_norm_var": 1.3751290268120084e-06, "learning_rate": 0.003659752039299473, "loss": 2.4489, "step": 16403 }, { "crossentropy": 2.4833545684814453, "epoch": 0.5946925754060325, "grad_norm": 0.027254508808255196, "grad_norm_var": 1.319282844561388e-06, "learning_rate": 0.0036591922254691532, "loss": 2.494, "step": 16404 }, { "crossentropy": 2.5780160427093506, "epoch": 0.5947288283062645, "grad_norm": 0.028908902779221535, "grad_norm_var": 1.3481084802530533e-06, "learning_rate": 0.0036586324297484775, "loss": 2.5341, "step": 16405 }, { "crossentropy": 2.560500383377075, "epoch": 0.5947650812064965, "grad_norm": 0.02736167050898075, "grad_norm_var": 1.2188446719870367e-06, "learning_rate": 0.003658072652145006, "loss": 2.5152, "step": 16406 }, { "crossentropy": 2.460019111633301, "epoch": 0.5948013341067285, "grad_norm": 0.027203302830457687, "grad_norm_var": 1.1709691817481567e-06, "learning_rate": 0.0036575128926662997, "loss": 2.5569, "step": 16407 }, { "crossentropy": 2.612710952758789, "epoch": 0.5948375870069605, "grad_norm": 0.025868205353617668, "grad_norm_var": 1.360471852518419e-06, "learning_rate": 0.0036569531513199185, "loss": 2.5368, "step": 16408 }, { "crossentropy": 2.6403114795684814, "epoch": 0.5948738399071926, "grad_norm": 0.0269306730479002, "grad_norm_var": 1.3578419859695822e-06, "learning_rate": 0.0036563934281134258, "loss": 2.6258, "step": 16409 }, { "crossentropy": 2.412092685699463, "epoch": 0.5949100928074246, "grad_norm": 0.025638917461037636, "grad_norm_var": 1.5755943804124372e-06, "learning_rate": 0.003655833723054377, "loss": 2.4304, "step": 16410 }, { "crossentropy": 2.432382822036743, "epoch": 0.5949463457076566, "grad_norm": 0.026996556669473648, "grad_norm_var": 1.5749677072096248e-06, "learning_rate": 0.0036552740361503323, "loss": 2.4273, "step": 16411 }, { "crossentropy": 2.4457290172576904, "epoch": 0.5949825986078886, "grad_norm": 0.026784062385559082, "grad_norm_var": 1.5961892422768885e-06, "learning_rate": 0.0036547143674088532, "loss": 2.4594, "step": 16412 }, { "crossentropy": 2.5074923038482666, "epoch": 0.5950188515081206, "grad_norm": 0.026273829862475395, "grad_norm_var": 1.46153896332975e-06, "learning_rate": 0.003654154716837499, "loss": 2.4922, "step": 16413 }, { "crossentropy": 2.743955373764038, "epoch": 0.5950551044083526, "grad_norm": 0.0280653927475214, "grad_norm_var": 1.4353072884531101e-06, "learning_rate": 0.0036535950844438277, "loss": 2.635, "step": 16414 }, { "crossentropy": 2.4952282905578613, "epoch": 0.5950913573085846, "grad_norm": 0.02773676998913288, "grad_norm_var": 1.1420329107300706e-06, "learning_rate": 0.0036530354702353973, "loss": 2.514, "step": 16415 }, { "crossentropy": 2.662076711654663, "epoch": 0.5951276102088167, "grad_norm": 0.027939843013882637, "grad_norm_var": 1.0035100251307508e-06, "learning_rate": 0.003652475874219766, "loss": 2.5839, "step": 16416 }, { "crossentropy": 2.462425470352173, "epoch": 0.5951638631090487, "grad_norm": 0.026948580518364906, "grad_norm_var": 7.582360359421891e-07, "learning_rate": 0.003651916296404495, "loss": 2.6066, "step": 16417 }, { "crossentropy": 2.7044644355773926, "epoch": 0.5952001160092807, "grad_norm": 0.026753365993499756, "grad_norm_var": 7.406965147067901e-07, "learning_rate": 0.0036513567367971412, "loss": 2.5731, "step": 16418 }, { "crossentropy": 2.62278413772583, "epoch": 0.5952363689095128, "grad_norm": 0.02652711234986782, "grad_norm_var": 6.901753557837787e-07, "learning_rate": 0.0036507971954052587, "loss": 2.6794, "step": 16419 }, { "crossentropy": 2.5589993000030518, "epoch": 0.5952726218097448, "grad_norm": 0.027009855955839157, "grad_norm_var": 6.880437154204669e-07, "learning_rate": 0.0036502376722364082, "loss": 2.6023, "step": 16420 }, { "crossentropy": 2.807997703552246, "epoch": 0.5953088747099768, "grad_norm": 0.02642032690346241, "grad_norm_var": 4.613534506674264e-07, "learning_rate": 0.0036496781672981466, "loss": 2.6188, "step": 16421 }, { "crossentropy": 2.5433740615844727, "epoch": 0.5953451276102089, "grad_norm": 0.02680356614291668, "grad_norm_var": 4.4673818649549185e-07, "learning_rate": 0.0036491186805980304, "loss": 2.6455, "step": 16422 }, { "crossentropy": 2.392169237136841, "epoch": 0.5953813805104409, "grad_norm": 0.028257306665182114, "grad_norm_var": 5.631837644461904e-07, "learning_rate": 0.003648559212143616, "loss": 2.4975, "step": 16423 }, { "crossentropy": 2.496380090713501, "epoch": 0.5954176334106729, "grad_norm": 0.025706980377435684, "grad_norm_var": 5.877333125065345e-07, "learning_rate": 0.0036479997619424604, "loss": 2.5554, "step": 16424 }, { "crossentropy": 2.513396978378296, "epoch": 0.5954538863109049, "grad_norm": 0.025812562555074692, "grad_norm_var": 6.649593408527965e-07, "learning_rate": 0.0036474403300021203, "loss": 2.4635, "step": 16425 }, { "crossentropy": 2.3880109786987305, "epoch": 0.5954901392111369, "grad_norm": 0.02853705734014511, "grad_norm_var": 7.201133307997246e-07, "learning_rate": 0.003646880916330153, "loss": 2.3974, "step": 16426 }, { "crossentropy": 2.535548686981201, "epoch": 0.5955263921113689, "grad_norm": 0.02618442475795746, "grad_norm_var": 7.655876561961256e-07, "learning_rate": 0.0036463215209341093, "loss": 2.4888, "step": 16427 }, { "crossentropy": 2.6815075874328613, "epoch": 0.5955626450116009, "grad_norm": 0.029165221378207207, "grad_norm_var": 1.056141697856995e-06, "learning_rate": 0.0036457621438215487, "loss": 2.718, "step": 16428 }, { "crossentropy": 2.488499879837036, "epoch": 0.595598897911833, "grad_norm": 0.028010277077555656, "grad_norm_var": 1.0454688473399693e-06, "learning_rate": 0.0036452027850000257, "loss": 2.4809, "step": 16429 }, { "crossentropy": 2.5850279331207275, "epoch": 0.595635150812065, "grad_norm": 0.02700078673660755, "grad_norm_var": 9.994858955745944e-07, "learning_rate": 0.003644643444477095, "loss": 2.6311, "step": 16430 }, { "crossentropy": 2.712494134902954, "epoch": 0.595671403712297, "grad_norm": 0.029086001217365265, "grad_norm_var": 1.2141656324531995e-06, "learning_rate": 0.003644084122260312, "loss": 2.659, "step": 16431 }, { "crossentropy": 2.422074556350708, "epoch": 0.595707656612529, "grad_norm": 0.02996394783258438, "grad_norm_var": 1.6536494819446712e-06, "learning_rate": 0.003643524818357228, "loss": 2.5, "step": 16432 }, { "crossentropy": 2.714191198348999, "epoch": 0.595743909512761, "grad_norm": 0.028111599385738373, "grad_norm_var": 1.6702472730782565e-06, "learning_rate": 0.0036429655327754017, "loss": 2.67, "step": 16433 }, { "crossentropy": 2.6373744010925293, "epoch": 0.595780162412993, "grad_norm": 0.026136208325624466, "grad_norm_var": 1.7521503663515495e-06, "learning_rate": 0.0036424062655223857, "loss": 2.5683, "step": 16434 }, { "crossentropy": 2.529611110687256, "epoch": 0.595816415313225, "grad_norm": 0.025966709479689598, "grad_norm_var": 1.8385572920993446e-06, "learning_rate": 0.003641847016605734, "loss": 2.4659, "step": 16435 }, { "crossentropy": 2.608827829360962, "epoch": 0.5958526682134571, "grad_norm": 0.027604635804891586, "grad_norm_var": 1.8308534657308454e-06, "learning_rate": 0.0036412877860329987, "loss": 2.5585, "step": 16436 }, { "crossentropy": 2.663691520690918, "epoch": 0.5958889211136891, "grad_norm": 0.027634942904114723, "grad_norm_var": 1.7606814401385322e-06, "learning_rate": 0.0036407285738117336, "loss": 2.643, "step": 16437 }, { "crossentropy": 2.5969064235687256, "epoch": 0.5959251740139211, "grad_norm": 0.027563301846385002, "grad_norm_var": 1.7263214311338352e-06, "learning_rate": 0.0036401693799494915, "loss": 2.5633, "step": 16438 }, { "crossentropy": 2.5654444694519043, "epoch": 0.5959614269141531, "grad_norm": 0.026554744690656662, "grad_norm_var": 1.7461033830918256e-06, "learning_rate": 0.003639610204453826, "loss": 2.5334, "step": 16439 }, { "crossentropy": 2.5004334449768066, "epoch": 0.5959976798143851, "grad_norm": 0.025861872360110283, "grad_norm_var": 1.7118128463576432e-06, "learning_rate": 0.0036390510473322873, "loss": 2.4938, "step": 16440 }, { "crossentropy": 2.727396011352539, "epoch": 0.5960339327146171, "grad_norm": 0.027741942554712296, "grad_norm_var": 1.523329418002039e-06, "learning_rate": 0.0036384919085924315, "loss": 2.5975, "step": 16441 }, { "crossentropy": 2.4317774772644043, "epoch": 0.5960701856148491, "grad_norm": 0.02618260681629181, "grad_norm_var": 1.5662811719087255e-06, "learning_rate": 0.0036379327882418077, "loss": 2.5371, "step": 16442 }, { "crossentropy": 2.5073440074920654, "epoch": 0.5961064385150812, "grad_norm": 0.026747314259409904, "grad_norm_var": 1.4931207545300826e-06, "learning_rate": 0.0036373736862879706, "loss": 2.571, "step": 16443 }, { "crossentropy": 2.454709529876709, "epoch": 0.5961426914153132, "grad_norm": 0.027432924136519432, "grad_norm_var": 1.2864114980117025e-06, "learning_rate": 0.0036368146027384684, "loss": 2.5061, "step": 16444 }, { "crossentropy": 2.5580859184265137, "epoch": 0.5961789443155452, "grad_norm": 0.026517048478126526, "grad_norm_var": 1.2943081473140803e-06, "learning_rate": 0.0036362555376008533, "loss": 2.5538, "step": 16445 }, { "crossentropy": 2.5570943355560303, "epoch": 0.5962151972157773, "grad_norm": 0.02645004726946354, "grad_norm_var": 1.3320546602850336e-06, "learning_rate": 0.0036356964908826773, "loss": 2.5709, "step": 16446 }, { "crossentropy": 2.449059009552002, "epoch": 0.5962514501160093, "grad_norm": 0.02621309831738472, "grad_norm_var": 1.1339823610761878e-06, "learning_rate": 0.003635137462591489, "loss": 2.4304, "step": 16447 }, { "crossentropy": 2.518383026123047, "epoch": 0.5962877030162413, "grad_norm": 0.027076680213212967, "grad_norm_var": 5.304059534403839e-07, "learning_rate": 0.0036345784527348425, "loss": 2.6102, "step": 16448 }, { "crossentropy": 2.507697105407715, "epoch": 0.5963239559164734, "grad_norm": 0.027020366862416267, "grad_norm_var": 4.2304986665828855e-07, "learning_rate": 0.003634019461320285, "loss": 2.4377, "step": 16449 }, { "crossentropy": 2.605933427810669, "epoch": 0.5963602088167054, "grad_norm": 0.027712734416127205, "grad_norm_var": 4.4011358821213245e-07, "learning_rate": 0.003633460488355368, "loss": 2.6338, "step": 16450 }, { "crossentropy": 2.37925124168396, "epoch": 0.5963964617169374, "grad_norm": 0.02889079600572586, "grad_norm_var": 6.135370227519065e-07, "learning_rate": 0.0036329015338476423, "loss": 2.4567, "step": 16451 }, { "crossentropy": 2.604851722717285, "epoch": 0.5964327146171694, "grad_norm": 0.026746684685349464, "grad_norm_var": 5.989913012672491e-07, "learning_rate": 0.003632342597804655, "loss": 2.5087, "step": 16452 }, { "crossentropy": 2.5267131328582764, "epoch": 0.5964689675174014, "grad_norm": 0.02723570540547371, "grad_norm_var": 5.763089574427223e-07, "learning_rate": 0.0036317836802339555, "loss": 2.5713, "step": 16453 }, { "crossentropy": 2.47509503364563, "epoch": 0.5965052204176334, "grad_norm": 0.02757171168923378, "grad_norm_var": 5.769486686115252e-07, "learning_rate": 0.0036312247811430944, "loss": 2.4356, "step": 16454 }, { "crossentropy": 2.689711570739746, "epoch": 0.5965414733178654, "grad_norm": 0.02772645838558674, "grad_norm_var": 5.936210765359236e-07, "learning_rate": 0.0036306659005396194, "loss": 2.576, "step": 16455 }, { "crossentropy": 2.5584187507629395, "epoch": 0.5965777262180975, "grad_norm": 0.027846302837133408, "grad_norm_var": 5.199523112477672e-07, "learning_rate": 0.0036301070384310795, "loss": 2.5675, "step": 16456 }, { "crossentropy": 2.414476156234741, "epoch": 0.5966139791183295, "grad_norm": 0.026399319991469383, "grad_norm_var": 5.346205822568303e-07, "learning_rate": 0.003629548194825023, "loss": 2.4804, "step": 16457 }, { "crossentropy": 2.6280415058135986, "epoch": 0.5966502320185615, "grad_norm": 0.028500668704509735, "grad_norm_var": 5.836354880772721e-07, "learning_rate": 0.0036289893697289983, "loss": 2.624, "step": 16458 }, { "crossentropy": 2.589956045150757, "epoch": 0.5966864849187935, "grad_norm": 0.027080636471509933, "grad_norm_var": 5.679945740561283e-07, "learning_rate": 0.0036284305631505524, "loss": 2.5293, "step": 16459 }, { "crossentropy": 2.5343985557556152, "epoch": 0.5967227378190255, "grad_norm": 0.027117298915982246, "grad_norm_var": 5.676305183521425e-07, "learning_rate": 0.0036278717750972346, "loss": 2.5106, "step": 16460 }, { "crossentropy": 2.6570444107055664, "epoch": 0.5967589907192575, "grad_norm": 0.027464326471090317, "grad_norm_var": 5.303062020145145e-07, "learning_rate": 0.0036273130055765896, "loss": 2.6807, "step": 16461 }, { "crossentropy": 2.640490770339966, "epoch": 0.5967952436194895, "grad_norm": 0.025784790515899658, "grad_norm_var": 6.347598611964114e-07, "learning_rate": 0.003626754254596165, "loss": 2.6148, "step": 16462 }, { "crossentropy": 2.4320480823516846, "epoch": 0.5968314965197216, "grad_norm": 0.027099547907710075, "grad_norm_var": 5.584540227813178e-07, "learning_rate": 0.0036261955221635078, "loss": 2.4304, "step": 16463 }, { "crossentropy": 2.574404001235962, "epoch": 0.5968677494199536, "grad_norm": 0.025514058768749237, "grad_norm_var": 7.637669442266675e-07, "learning_rate": 0.0036256368082861657, "loss": 2.5422, "step": 16464 }, { "crossentropy": 2.6060831546783447, "epoch": 0.5969040023201856, "grad_norm": 0.027090590447187424, "grad_norm_var": 7.620939484572741e-07, "learning_rate": 0.003625078112971685, "loss": 2.4824, "step": 16465 }, { "crossentropy": 2.5301806926727295, "epoch": 0.5969402552204176, "grad_norm": 0.029141388833522797, "grad_norm_var": 9.804045723571606e-07, "learning_rate": 0.0036245194362276097, "loss": 2.6134, "step": 16466 }, { "crossentropy": 2.673085927963257, "epoch": 0.5969765081206496, "grad_norm": 0.027848083525896072, "grad_norm_var": 8.307570279286833e-07, "learning_rate": 0.003623960778061487, "loss": 2.6303, "step": 16467 }, { "crossentropy": 2.428441286087036, "epoch": 0.5970127610208816, "grad_norm": 0.027024174109101295, "grad_norm_var": 8.165600957335626e-07, "learning_rate": 0.0036234021384808635, "loss": 2.4923, "step": 16468 }, { "crossentropy": 2.5277581214904785, "epoch": 0.5970490139211136, "grad_norm": 0.027521366253495216, "grad_norm_var": 8.200562985375206e-07, "learning_rate": 0.0036228435174932823, "loss": 2.5528, "step": 16469 }, { "crossentropy": 2.476552724838257, "epoch": 0.5970852668213457, "grad_norm": 0.026028014719486237, "grad_norm_var": 9.121772394570423e-07, "learning_rate": 0.003622284915106289, "loss": 2.4905, "step": 16470 }, { "crossentropy": 2.5229761600494385, "epoch": 0.5971215197215777, "grad_norm": 0.025894034653902054, "grad_norm_var": 9.932142041965541e-07, "learning_rate": 0.0036217263313274273, "loss": 2.4876, "step": 16471 }, { "crossentropy": 2.584207773208618, "epoch": 0.5971577726218097, "grad_norm": 0.026900727301836014, "grad_norm_var": 9.530711678782383e-07, "learning_rate": 0.003621167766164245, "loss": 2.5915, "step": 16472 }, { "crossentropy": 2.4897940158843994, "epoch": 0.5971940255220418, "grad_norm": 0.025479363277554512, "grad_norm_var": 1.0827818683208177e-06, "learning_rate": 0.003620609219624284, "loss": 2.5143, "step": 16473 }, { "crossentropy": 2.499263286590576, "epoch": 0.5972302784222738, "grad_norm": 0.025954877957701683, "grad_norm_var": 9.676230706962343e-07, "learning_rate": 0.003620050691715089, "loss": 2.4765, "step": 16474 }, { "crossentropy": 2.412686824798584, "epoch": 0.5972665313225058, "grad_norm": 0.027695994824171066, "grad_norm_var": 1.013580550614177e-06, "learning_rate": 0.0036194921824442036, "loss": 2.4029, "step": 16475 }, { "crossentropy": 2.4554829597473145, "epoch": 0.5973027842227379, "grad_norm": 0.026332741603255272, "grad_norm_var": 1.0238192567746847e-06, "learning_rate": 0.0036189336918191696, "loss": 2.4538, "step": 16476 }, { "crossentropy": 2.4718408584594727, "epoch": 0.5973390371229699, "grad_norm": 0.029418829828500748, "grad_norm_var": 1.4361203970724341e-06, "learning_rate": 0.0036183752198475356, "loss": 2.5717, "step": 16477 }, { "crossentropy": 2.497652530670166, "epoch": 0.5973752900232019, "grad_norm": 0.02614215388894081, "grad_norm_var": 1.3899856463351138e-06, "learning_rate": 0.003617816766536839, "loss": 2.4752, "step": 16478 }, { "crossentropy": 2.5546135902404785, "epoch": 0.5974115429234339, "grad_norm": 0.028072968125343323, "grad_norm_var": 1.4695422282489836e-06, "learning_rate": 0.003617258331894623, "loss": 2.5718, "step": 16479 }, { "crossentropy": 2.6878581047058105, "epoch": 0.5974477958236659, "grad_norm": 0.026198159903287888, "grad_norm_var": 1.3629155499594493e-06, "learning_rate": 0.003616699915928433, "loss": 2.6042, "step": 16480 }, { "crossentropy": 2.364778518676758, "epoch": 0.5974840487238979, "grad_norm": 0.02664417214691639, "grad_norm_var": 1.372744785677523e-06, "learning_rate": 0.0036161415186458097, "loss": 2.4426, "step": 16481 }, { "crossentropy": 2.6191020011901855, "epoch": 0.59752030162413, "grad_norm": 0.027071284130215645, "grad_norm_var": 1.0546493036816257e-06, "learning_rate": 0.003615583140054295, "loss": 2.5873, "step": 16482 }, { "crossentropy": 2.4819841384887695, "epoch": 0.597556554524362, "grad_norm": 0.026251859962940216, "grad_norm_var": 1.009812535173353e-06, "learning_rate": 0.0036150247801614312, "loss": 2.5142, "step": 16483 }, { "crossentropy": 2.448355197906494, "epoch": 0.597592807424594, "grad_norm": 0.02584468200802803, "grad_norm_var": 1.0598439136276505e-06, "learning_rate": 0.003614466438974759, "loss": 2.4262, "step": 16484 }, { "crossentropy": 2.4561827182769775, "epoch": 0.597629060324826, "grad_norm": 0.026292014867067337, "grad_norm_var": 1.0222411888238196e-06, "learning_rate": 0.003613908116501823, "loss": 2.4345, "step": 16485 }, { "crossentropy": 2.4905550479888916, "epoch": 0.597665313225058, "grad_norm": 0.025950772687792778, "grad_norm_var": 1.028905218668913e-06, "learning_rate": 0.0036133498127501585, "loss": 2.5026, "step": 16486 }, { "crossentropy": 2.726241111755371, "epoch": 0.59770156612529, "grad_norm": 0.025528104975819588, "grad_norm_var": 1.0733795655229097e-06, "learning_rate": 0.0036127915277273086, "loss": 2.6241, "step": 16487 }, { "crossentropy": 2.611910820007324, "epoch": 0.597737819025522, "grad_norm": 0.025899721309542656, "grad_norm_var": 1.0973587942173884e-06, "learning_rate": 0.0036122332614408155, "loss": 2.6345, "step": 16488 }, { "crossentropy": 2.492877721786499, "epoch": 0.597774071925754, "grad_norm": 0.026098176836967468, "grad_norm_var": 1.033070318708533e-06, "learning_rate": 0.003611675013898218, "loss": 2.527, "step": 16489 }, { "crossentropy": 2.495932102203369, "epoch": 0.5978103248259861, "grad_norm": 0.02569693699479103, "grad_norm_var": 1.058978392566058e-06, "learning_rate": 0.0036111167851070563, "loss": 2.5036, "step": 16490 }, { "crossentropy": 2.4927620887756348, "epoch": 0.5978465777262181, "grad_norm": 0.02627815119922161, "grad_norm_var": 9.719757751945004e-07, "learning_rate": 0.0036105585750748704, "loss": 2.5702, "step": 16491 }, { "crossentropy": 2.4144630432128906, "epoch": 0.5978828306264501, "grad_norm": 0.027254419401288033, "grad_norm_var": 1.0066594221870185e-06, "learning_rate": 0.0036100003838091983, "loss": 2.5366, "step": 16492 }, { "crossentropy": 2.6098036766052246, "epoch": 0.5979190835266821, "grad_norm": 0.026778852567076683, "grad_norm_var": 4.2896552833269346e-07, "learning_rate": 0.0036094422113175833, "loss": 2.4977, "step": 16493 }, { "crossentropy": 2.6995930671691895, "epoch": 0.5979553364269141, "grad_norm": 0.028223592787981033, "grad_norm_var": 6.350769445046326e-07, "learning_rate": 0.00360888405760756, "loss": 2.6489, "step": 16494 }, { "crossentropy": 2.1952083110809326, "epoch": 0.5979915893271461, "grad_norm": 0.027788538485765457, "grad_norm_var": 5.806788330407301e-07, "learning_rate": 0.003608325922686667, "loss": 2.3995, "step": 16495 }, { "crossentropy": 2.6585159301757812, "epoch": 0.5980278422273781, "grad_norm": 0.027849877253174782, "grad_norm_var": 6.874761007887812e-07, "learning_rate": 0.0036077678065624453, "loss": 2.6371, "step": 16496 }, { "crossentropy": 2.5406689643859863, "epoch": 0.5980640951276102, "grad_norm": 0.029311226680874825, "grad_norm_var": 1.1510658700179202e-06, "learning_rate": 0.0036072097092424327, "loss": 2.5512, "step": 16497 }, { "crossentropy": 2.603727102279663, "epoch": 0.5981003480278422, "grad_norm": 0.028310641646385193, "grad_norm_var": 1.2989368717477556e-06, "learning_rate": 0.0036066516307341656, "loss": 2.5033, "step": 16498 }, { "crossentropy": 2.521841287612915, "epoch": 0.5981366009280742, "grad_norm": 0.026429887861013412, "grad_norm_var": 1.2870793220802016e-06, "learning_rate": 0.0036060935710451837, "loss": 2.5051, "step": 16499 }, { "crossentropy": 2.6372904777526855, "epoch": 0.5981728538283063, "grad_norm": 0.026023048907518387, "grad_norm_var": 1.2652547441020327e-06, "learning_rate": 0.0036055355301830225, "loss": 2.6388, "step": 16500 }, { "crossentropy": 2.3919084072113037, "epoch": 0.5982091067285383, "grad_norm": 0.02606591023504734, "grad_norm_var": 1.2854864195014816e-06, "learning_rate": 0.0036049775081552207, "loss": 2.4104, "step": 16501 }, { "crossentropy": 2.3842766284942627, "epoch": 0.5982453596287703, "grad_norm": 0.027219422161579132, "grad_norm_var": 1.235156707995085e-06, "learning_rate": 0.003604419504969317, "loss": 2.5201, "step": 16502 }, { "crossentropy": 2.6686818599700928, "epoch": 0.5982816125290024, "grad_norm": 0.027428215369582176, "grad_norm_var": 1.1075959360450893e-06, "learning_rate": 0.003603861520632842, "loss": 2.5092, "step": 16503 }, { "crossentropy": 2.442067861557007, "epoch": 0.5983178654292344, "grad_norm": 0.027807803824543953, "grad_norm_var": 1.0447809074764594e-06, "learning_rate": 0.0036033035551533378, "loss": 2.511, "step": 16504 }, { "crossentropy": 2.6377058029174805, "epoch": 0.5983541183294664, "grad_norm": 0.026711968705058098, "grad_norm_var": 9.814046781840182e-07, "learning_rate": 0.0036027456085383383, "loss": 2.518, "step": 16505 }, { "crossentropy": 2.4842891693115234, "epoch": 0.5983903712296984, "grad_norm": 0.028379520401358604, "grad_norm_var": 8.940390481162556e-07, "learning_rate": 0.003602187680795379, "loss": 2.4652, "step": 16506 }, { "crossentropy": 2.543654203414917, "epoch": 0.5984266241299304, "grad_norm": 0.026820266619324684, "grad_norm_var": 8.337522325064031e-07, "learning_rate": 0.0036016297719319978, "loss": 2.6047, "step": 16507 }, { "crossentropy": 2.559866428375244, "epoch": 0.5984628770301624, "grad_norm": 0.027421345934271812, "grad_norm_var": 8.322491515964925e-07, "learning_rate": 0.0036010718819557263, "loss": 2.536, "step": 16508 }, { "crossentropy": 2.542886972427368, "epoch": 0.5984991299303944, "grad_norm": 0.026166461408138275, "grad_norm_var": 9.07274274672048e-07, "learning_rate": 0.003600514010874104, "loss": 2.5067, "step": 16509 }, { "crossentropy": 2.6300227642059326, "epoch": 0.5985353828306265, "grad_norm": 0.02742997743189335, "grad_norm_var": 8.565646266227763e-07, "learning_rate": 0.003599956158694665, "loss": 2.6179, "step": 16510 }, { "crossentropy": 2.5621752738952637, "epoch": 0.5985716357308585, "grad_norm": 0.02647777460515499, "grad_norm_var": 8.825420689454691e-07, "learning_rate": 0.00359939832542494, "loss": 2.5001, "step": 16511 }, { "crossentropy": 2.5496935844421387, "epoch": 0.5986078886310905, "grad_norm": 0.027984879910945892, "grad_norm_var": 8.946441650036403e-07, "learning_rate": 0.0035988405110724676, "loss": 2.5441, "step": 16512 }, { "crossentropy": 2.5631954669952393, "epoch": 0.5986441415313225, "grad_norm": 0.026243044063448906, "grad_norm_var": 6.39476058631773e-07, "learning_rate": 0.0035982827156447795, "loss": 2.4837, "step": 16513 }, { "crossentropy": 2.5871410369873047, "epoch": 0.5986803944315545, "grad_norm": 0.025773996487259865, "grad_norm_var": 6.178034155330242e-07, "learning_rate": 0.0035977249391494115, "loss": 2.5507, "step": 16514 }, { "crossentropy": 2.57766056060791, "epoch": 0.5987166473317865, "grad_norm": 0.02689564973115921, "grad_norm_var": 6.02231039533565e-07, "learning_rate": 0.0035971671815938954, "loss": 2.4217, "step": 16515 }, { "crossentropy": 2.5232834815979004, "epoch": 0.5987529002320185, "grad_norm": 0.025784721598029137, "grad_norm_var": 6.345401952350648e-07, "learning_rate": 0.003596609442985765, "loss": 2.4041, "step": 16516 }, { "crossentropy": 2.52288556098938, "epoch": 0.5987891531322506, "grad_norm": 0.027683649212121964, "grad_norm_var": 6.153517564657136e-07, "learning_rate": 0.003596051723332554, "loss": 2.5488, "step": 16517 }, { "crossentropy": 2.586848735809326, "epoch": 0.5988254060324826, "grad_norm": 0.027960965409874916, "grad_norm_var": 6.70001216608133e-07, "learning_rate": 0.003595494022641796, "loss": 2.5057, "step": 16518 }, { "crossentropy": 2.7273013591766357, "epoch": 0.5988616589327146, "grad_norm": 0.028264859691262245, "grad_norm_var": 7.547535486387401e-07, "learning_rate": 0.0035949363409210236, "loss": 2.6513, "step": 16519 }, { "crossentropy": 2.5525689125061035, "epoch": 0.5988979118329466, "grad_norm": 0.028238970786333084, "grad_norm_var": 8.063201412945488e-07, "learning_rate": 0.0035943786781777666, "loss": 2.5598, "step": 16520 }, { "crossentropy": 2.6072168350219727, "epoch": 0.5989341647331786, "grad_norm": 0.025179913267493248, "grad_norm_var": 1.0404305770731244e-06, "learning_rate": 0.003593821034419559, "loss": 2.5005, "step": 16521 }, { "crossentropy": 2.5649819374084473, "epoch": 0.5989704176334106, "grad_norm": 0.026696953922510147, "grad_norm_var": 9.177843626019417e-07, "learning_rate": 0.0035932634096539316, "loss": 2.5333, "step": 16522 }, { "crossentropy": 2.493323564529419, "epoch": 0.5990066705336426, "grad_norm": 0.025281835347414017, "grad_norm_var": 1.090055315831996e-06, "learning_rate": 0.0035927058038884175, "loss": 2.3953, "step": 16523 }, { "crossentropy": 2.429213523864746, "epoch": 0.5990429234338747, "grad_norm": 0.028676440939307213, "grad_norm_var": 1.2853245517655999e-06, "learning_rate": 0.003592148217130545, "loss": 2.6027, "step": 16524 }, { "crossentropy": 2.3932642936706543, "epoch": 0.5990791763341067, "grad_norm": 0.026848353445529938, "grad_norm_var": 1.245760488172024e-06, "learning_rate": 0.00359159064938785, "loss": 2.5039, "step": 16525 }, { "crossentropy": 2.5925369262695312, "epoch": 0.5991154292343387, "grad_norm": 0.02665538527071476, "grad_norm_var": 1.235121388336183e-06, "learning_rate": 0.00359103310066786, "loss": 2.5557, "step": 16526 }, { "crossentropy": 2.555272340774536, "epoch": 0.5991516821345708, "grad_norm": 0.025696448981761932, "grad_norm_var": 1.3188726076582742e-06, "learning_rate": 0.003590475570978107, "loss": 2.5608, "step": 16527 }, { "crossentropy": 2.6815707683563232, "epoch": 0.5991879350348028, "grad_norm": 0.026835525408387184, "grad_norm_var": 1.2300672364472792e-06, "learning_rate": 0.0035899180603261188, "loss": 2.5876, "step": 16528 }, { "crossentropy": 2.612968921661377, "epoch": 0.5992241879350348, "grad_norm": 0.027644924819469452, "grad_norm_var": 1.2497647956900722e-06, "learning_rate": 0.0035893605687194277, "loss": 2.5279, "step": 16529 }, { "crossentropy": 2.6044681072235107, "epoch": 0.5992604408352669, "grad_norm": 0.0269426628947258, "grad_norm_var": 1.1624103663216186e-06, "learning_rate": 0.0035888030961655627, "loss": 2.6043, "step": 16530 }, { "crossentropy": 2.5152575969696045, "epoch": 0.5992966937354989, "grad_norm": 0.02604549750685692, "grad_norm_var": 1.214361799364719e-06, "learning_rate": 0.0035882456426720535, "loss": 2.534, "step": 16531 }, { "crossentropy": 2.5300581455230713, "epoch": 0.5993329466357309, "grad_norm": 0.026777101680636406, "grad_norm_var": 1.1280353811292705e-06, "learning_rate": 0.0035876882082464287, "loss": 2.5415, "step": 16532 }, { "crossentropy": 2.6072278022766113, "epoch": 0.5993691995359629, "grad_norm": 0.027420254424214363, "grad_norm_var": 1.1071098990145592e-06, "learning_rate": 0.0035871307928962183, "loss": 2.5441, "step": 16533 }, { "crossentropy": 2.5522806644439697, "epoch": 0.5994054524361949, "grad_norm": 0.02614874579012394, "grad_norm_var": 1.0675777534204765e-06, "learning_rate": 0.003586573396628951, "loss": 2.6012, "step": 16534 }, { "crossentropy": 2.5790131092071533, "epoch": 0.5994417053364269, "grad_norm": 0.026428569108247757, "grad_norm_var": 9.281466560904223e-07, "learning_rate": 0.0035860160194521548, "loss": 2.5398, "step": 16535 }, { "crossentropy": 2.7667665481567383, "epoch": 0.599477958236659, "grad_norm": 0.027779223397374153, "grad_norm_var": 8.482354967503358e-07, "learning_rate": 0.0035854586613733586, "loss": 2.6194, "step": 16536 }, { "crossentropy": 2.494757652282715, "epoch": 0.599514211136891, "grad_norm": 0.026763642206788063, "grad_norm_var": 6.858866924935407e-07, "learning_rate": 0.00358490132240009, "loss": 2.487, "step": 16537 }, { "crossentropy": 2.6064436435699463, "epoch": 0.599550464037123, "grad_norm": 0.027157742530107498, "grad_norm_var": 6.934344566294585e-07, "learning_rate": 0.0035843440025398756, "loss": 2.5716, "step": 16538 }, { "crossentropy": 2.685662269592285, "epoch": 0.599586716937355, "grad_norm": 0.026457183063030243, "grad_norm_var": 5.388969933100539e-07, "learning_rate": 0.003583786701800244, "loss": 2.6603, "step": 16539 }, { "crossentropy": 2.6176562309265137, "epoch": 0.599622969837587, "grad_norm": 0.027449171990156174, "grad_norm_var": 3.410935931674597e-07, "learning_rate": 0.003583229420188721, "loss": 2.616, "step": 16540 }, { "crossentropy": 2.577967405319214, "epoch": 0.599659222737819, "grad_norm": 0.02706858702003956, "grad_norm_var": 3.450852797574784e-07, "learning_rate": 0.003582672157712836, "loss": 2.5858, "step": 16541 }, { "crossentropy": 2.502509117126465, "epoch": 0.599695475638051, "grad_norm": 0.025962207466363907, "grad_norm_var": 3.912008700105897e-07, "learning_rate": 0.0035821149143801146, "loss": 2.5221, "step": 16542 }, { "crossentropy": 2.652733564376831, "epoch": 0.599731728538283, "grad_norm": 0.026521427556872368, "grad_norm_var": 3.138799918431592e-07, "learning_rate": 0.0035815576901980823, "loss": 2.6292, "step": 16543 }, { "crossentropy": 2.6603267192840576, "epoch": 0.5997679814385151, "grad_norm": 0.025508934631943703, "grad_norm_var": 4.242467191735317e-07, "learning_rate": 0.003581000485174267, "loss": 2.5442, "step": 16544 }, { "crossentropy": 2.5807511806488037, "epoch": 0.5998042343387471, "grad_norm": 0.026096796616911888, "grad_norm_var": 3.902916296879734e-07, "learning_rate": 0.0035804432993161934, "loss": 2.5789, "step": 16545 }, { "crossentropy": 2.5301175117492676, "epoch": 0.5998404872389791, "grad_norm": 0.0269135981798172, "grad_norm_var": 3.8924121319333126e-07, "learning_rate": 0.0035798861326313868, "loss": 2.5366, "step": 16546 }, { "crossentropy": 2.5318639278411865, "epoch": 0.5998767401392111, "grad_norm": 0.02629147656261921, "grad_norm_var": 3.729945422491221e-07, "learning_rate": 0.003579328985127373, "loss": 2.5847, "step": 16547 }, { "crossentropy": 2.581347703933716, "epoch": 0.5999129930394431, "grad_norm": 0.026651054620742798, "grad_norm_var": 3.722134583285328e-07, "learning_rate": 0.003578771856811676, "loss": 2.5728, "step": 16548 }, { "crossentropy": 2.5413050651550293, "epoch": 0.5999492459396751, "grad_norm": 0.02765139937400818, "grad_norm_var": 3.98870332000284e-07, "learning_rate": 0.0035782147476918235, "loss": 2.4995, "step": 16549 }, { "crossentropy": 2.4749016761779785, "epoch": 0.5999854988399071, "grad_norm": 0.028193896636366844, "grad_norm_var": 5.159348324576887e-07, "learning_rate": 0.003577657657775338, "loss": 2.4269, "step": 16550 }, { "crossentropy": 2.4305524826049805, "epoch": 0.6000217517401392, "grad_norm": 0.026333153247833252, "grad_norm_var": 5.21304697283694e-07, "learning_rate": 0.003577100587069744, "loss": 2.4991, "step": 16551 }, { "crossentropy": 2.588060140609741, "epoch": 0.6000580046403712, "grad_norm": 0.02659693732857704, "grad_norm_var": 4.5429928714339706e-07, "learning_rate": 0.0035765435355825676, "loss": 2.5698, "step": 16552 }, { "crossentropy": 2.539525032043457, "epoch": 0.6000942575406032, "grad_norm": 0.02712729200720787, "grad_norm_var": 4.643858409896761e-07, "learning_rate": 0.00357598650332133, "loss": 2.5597, "step": 16553 }, { "crossentropy": 2.3539364337921143, "epoch": 0.6001305104408353, "grad_norm": 0.028675785288214684, "grad_norm_var": 6.91185759709685e-07, "learning_rate": 0.0035754294902935546, "loss": 2.4531, "step": 16554 }, { "crossentropy": 2.451404333114624, "epoch": 0.6001667633410673, "grad_norm": 0.027669133618474007, "grad_norm_var": 7.205316939632815e-07, "learning_rate": 0.003574872496506766, "loss": 2.4337, "step": 16555 }, { "crossentropy": 2.3721234798431396, "epoch": 0.6002030162412993, "grad_norm": 0.02596399188041687, "grad_norm_var": 7.534897110180715e-07, "learning_rate": 0.0035743155219684873, "loss": 2.4135, "step": 16556 }, { "crossentropy": 2.481886863708496, "epoch": 0.6002392691415314, "grad_norm": 0.028959892690181732, "grad_norm_var": 1.0380762531938717e-06, "learning_rate": 0.0035737585666862416, "loss": 2.4781, "step": 16557 }, { "crossentropy": 2.6630680561065674, "epoch": 0.6002755220417634, "grad_norm": 0.02999928779900074, "grad_norm_var": 1.5277890431521224e-06, "learning_rate": 0.003573201630667551, "loss": 2.5599, "step": 16558 }, { "crossentropy": 2.608868360519409, "epoch": 0.6003117749419954, "grad_norm": 0.026125865057110786, "grad_norm_var": 1.5732059994624214e-06, "learning_rate": 0.003572644713919937, "loss": 2.5709, "step": 16559 }, { "crossentropy": 2.449817419052124, "epoch": 0.6003480278422274, "grad_norm": 0.025488609448075294, "grad_norm_var": 1.5777398671418598e-06, "learning_rate": 0.0035720878164509206, "loss": 2.4966, "step": 16560 }, { "crossentropy": 2.427727460861206, "epoch": 0.6003842807424594, "grad_norm": 0.027203526347875595, "grad_norm_var": 1.4957593095987157e-06, "learning_rate": 0.0035715309382680295, "loss": 2.5272, "step": 16561 }, { "crossentropy": 2.5074961185455322, "epoch": 0.6004205336426914, "grad_norm": 0.029353460296988487, "grad_norm_var": 1.7615342355899178e-06, "learning_rate": 0.003570974079378778, "loss": 2.5167, "step": 16562 }, { "crossentropy": 2.5737833976745605, "epoch": 0.6004567865429234, "grad_norm": 0.027897736057639122, "grad_norm_var": 1.6869209255619573e-06, "learning_rate": 0.0035704172397906897, "loss": 2.5152, "step": 16563 }, { "crossentropy": 2.6698906421661377, "epoch": 0.6004930394431555, "grad_norm": 0.028778573498129845, "grad_norm_var": 1.730929430325762e-06, "learning_rate": 0.003569860419511287, "loss": 2.5925, "step": 16564 }, { "crossentropy": 2.5577707290649414, "epoch": 0.6005292923433875, "grad_norm": 0.028052957728505135, "grad_norm_var": 1.7423589090278502e-06, "learning_rate": 0.0035693036185480893, "loss": 2.5846, "step": 16565 }, { "crossentropy": 2.5724406242370605, "epoch": 0.6005655452436195, "grad_norm": 0.027874896302819252, "grad_norm_var": 1.725638653418913e-06, "learning_rate": 0.0035687468369086173, "loss": 2.5999, "step": 16566 }, { "crossentropy": 2.479811668395996, "epoch": 0.6006017981438515, "grad_norm": 0.02916988916695118, "grad_norm_var": 1.7375735758989498e-06, "learning_rate": 0.003568190074600392, "loss": 2.518, "step": 16567 }, { "crossentropy": 2.5845425128936768, "epoch": 0.6006380510440835, "grad_norm": 0.026454966515302658, "grad_norm_var": 1.7617696833250544e-06, "learning_rate": 0.003567633331630931, "loss": 2.4713, "step": 16568 }, { "crossentropy": 2.5805559158325195, "epoch": 0.6006743039443155, "grad_norm": 0.030138559639453888, "grad_norm_var": 2.0585129355818785e-06, "learning_rate": 0.003567076608007759, "loss": 2.5438, "step": 16569 }, { "crossentropy": 2.456752061843872, "epoch": 0.6007105568445475, "grad_norm": 0.025188351050019264, "grad_norm_var": 2.4988109099910766e-06, "learning_rate": 0.0035665199037383894, "loss": 2.5435, "step": 16570 }, { "crossentropy": 2.4763286113739014, "epoch": 0.6007468097447796, "grad_norm": 0.025952564552426338, "grad_norm_var": 2.7060555433254847e-06, "learning_rate": 0.0035659632188303433, "loss": 2.5511, "step": 16571 }, { "crossentropy": 2.6487741470336914, "epoch": 0.6007830626450116, "grad_norm": 0.027136657387018204, "grad_norm_var": 2.5264005820053163e-06, "learning_rate": 0.0035654065532911405, "loss": 2.5492, "step": 16572 }, { "crossentropy": 2.6090927124023438, "epoch": 0.6008193155452436, "grad_norm": 0.026937801390886307, "grad_norm_var": 2.4519740613612585e-06, "learning_rate": 0.0035648499071283, "loss": 2.5076, "step": 16573 }, { "crossentropy": 2.5303385257720947, "epoch": 0.6008555684454756, "grad_norm": 0.02638138271868229, "grad_norm_var": 2.1172993164486705e-06, "learning_rate": 0.0035642932803493393, "loss": 2.5481, "step": 16574 }, { "crossentropy": 2.43961238861084, "epoch": 0.6008918213457076, "grad_norm": 0.02687898464500904, "grad_norm_var": 2.026463307767826e-06, "learning_rate": 0.0035637366729617767, "loss": 2.5517, "step": 16575 }, { "crossentropy": 2.6293671131134033, "epoch": 0.6009280742459396, "grad_norm": 0.02722364291548729, "grad_norm_var": 1.7653636940256321e-06, "learning_rate": 0.003563180084973129, "loss": 2.5948, "step": 16576 }, { "crossentropy": 2.663010597229004, "epoch": 0.6009643271461717, "grad_norm": 0.026597877964377403, "grad_norm_var": 1.8153796088739842e-06, "learning_rate": 0.003562623516390915, "loss": 2.5768, "step": 16577 }, { "crossentropy": 2.6003968715667725, "epoch": 0.6010005800464037, "grad_norm": 0.026186760514974594, "grad_norm_var": 1.6600314890868306e-06, "learning_rate": 0.003562066967222655, "loss": 2.4827, "step": 16578 }, { "crossentropy": 2.425196409225464, "epoch": 0.6010368329466357, "grad_norm": 0.029177583754062653, "grad_norm_var": 1.863858247807395e-06, "learning_rate": 0.003561510437475858, "loss": 2.3743, "step": 16579 }, { "crossentropy": 2.5095748901367188, "epoch": 0.6010730858468677, "grad_norm": 0.027777718380093575, "grad_norm_var": 1.7402583719541004e-06, "learning_rate": 0.0035609539271580478, "loss": 2.5752, "step": 16580 }, { "crossentropy": 2.5382254123687744, "epoch": 0.6011093387470998, "grad_norm": 0.026977282017469406, "grad_norm_var": 1.7075474214080117e-06, "learning_rate": 0.0035603974362767377, "loss": 2.4877, "step": 16581 }, { "crossentropy": 2.606346845626831, "epoch": 0.6011455916473318, "grad_norm": 0.029163042083382607, "grad_norm_var": 1.917993035103517e-06, "learning_rate": 0.003559840964839446, "loss": 2.4862, "step": 16582 }, { "crossentropy": 2.586245536804199, "epoch": 0.6011818445475638, "grad_norm": 0.029810409992933273, "grad_norm_var": 2.100429743235677e-06, "learning_rate": 0.0035592845128536864, "loss": 2.5571, "step": 16583 }, { "crossentropy": 2.692854881286621, "epoch": 0.6012180974477959, "grad_norm": 0.026756195351481438, "grad_norm_var": 2.069190041142921e-06, "learning_rate": 0.003558728080326976, "loss": 2.5771, "step": 16584 }, { "crossentropy": 2.6814045906066895, "epoch": 0.6012543503480279, "grad_norm": 0.025788843631744385, "grad_norm_var": 1.6592557748697222e-06, "learning_rate": 0.003558171667266831, "loss": 2.621, "step": 16585 }, { "crossentropy": 2.566690683364868, "epoch": 0.6012906032482599, "grad_norm": 0.026224100962281227, "grad_norm_var": 1.4594133773986977e-06, "learning_rate": 0.0035576152736807665, "loss": 2.5641, "step": 16586 }, { "crossentropy": 2.453839063644409, "epoch": 0.6013268561484919, "grad_norm": 0.026603542268276215, "grad_norm_var": 1.3788685417359987e-06, "learning_rate": 0.003557058899576294, "loss": 2.4713, "step": 16587 }, { "crossentropy": 2.4992198944091797, "epoch": 0.6013631090487239, "grad_norm": 0.02609378471970558, "grad_norm_var": 1.459316198075761e-06, "learning_rate": 0.0035565025449609327, "loss": 2.5076, "step": 16588 }, { "crossentropy": 2.548696279525757, "epoch": 0.6013993619489559, "grad_norm": 0.02730507217347622, "grad_norm_var": 1.4568077380576009e-06, "learning_rate": 0.0035559462098421948, "loss": 2.5735, "step": 16589 }, { "crossentropy": 2.5459909439086914, "epoch": 0.601435614849188, "grad_norm": 0.027404826134443283, "grad_norm_var": 1.4127290964296845e-06, "learning_rate": 0.0035553898942275943, "loss": 2.5219, "step": 16590 }, { "crossentropy": 2.3968324661254883, "epoch": 0.60147186774942, "grad_norm": 0.026707163080573082, "grad_norm_var": 1.423030622418062e-06, "learning_rate": 0.003554833598124646, "loss": 2.462, "step": 16591 }, { "crossentropy": 2.5353164672851562, "epoch": 0.601508120649652, "grad_norm": 0.02777457796037197, "grad_norm_var": 1.4409931871387874e-06, "learning_rate": 0.003554277321540862, "loss": 2.5938, "step": 16592 }, { "crossentropy": 2.365078926086426, "epoch": 0.601544373549884, "grad_norm": 0.02707217074930668, "grad_norm_var": 1.4124346839273932e-06, "learning_rate": 0.0035537210644837574, "loss": 2.4602, "step": 16593 }, { "crossentropy": 2.5176424980163574, "epoch": 0.601580626450116, "grad_norm": 0.027232779189944267, "grad_norm_var": 1.3253556735688763e-06, "learning_rate": 0.0035531648269608476, "loss": 2.5305, "step": 16594 }, { "crossentropy": 2.418633460998535, "epoch": 0.601616879350348, "grad_norm": 0.025603286921977997, "grad_norm_var": 1.260868764890371e-06, "learning_rate": 0.0035526086089796385, "loss": 2.56, "step": 16595 }, { "crossentropy": 2.605469226837158, "epoch": 0.60165313225058, "grad_norm": 0.026105714961886406, "grad_norm_var": 1.294187991611216e-06, "learning_rate": 0.003552052410547647, "loss": 2.5911, "step": 16596 }, { "crossentropy": 2.605233669281006, "epoch": 0.601689385150812, "grad_norm": 0.026434553787112236, "grad_norm_var": 1.317058295325207e-06, "learning_rate": 0.0035514962316723865, "loss": 2.5852, "step": 16597 }, { "crossentropy": 2.369649648666382, "epoch": 0.6017256380510441, "grad_norm": 0.027088403701782227, "grad_norm_var": 9.891129191226158e-07, "learning_rate": 0.0035509400723613667, "loss": 2.4075, "step": 16598 }, { "crossentropy": 2.498728036880493, "epoch": 0.6017618909512761, "grad_norm": 0.027557462453842163, "grad_norm_var": 4.2467394121372717e-07, "learning_rate": 0.0035503839326221005, "loss": 2.5115, "step": 16599 }, { "crossentropy": 2.6967530250549316, "epoch": 0.6017981438515081, "grad_norm": 0.026981202885508537, "grad_norm_var": 4.2848820157961356e-07, "learning_rate": 0.0035498278124620982, "loss": 2.6004, "step": 16600 }, { "crossentropy": 2.6076016426086426, "epoch": 0.6018343967517401, "grad_norm": 0.026369689032435417, "grad_norm_var": 3.7524574865253206e-07, "learning_rate": 0.003549271711888873, "loss": 2.5087, "step": 16601 }, { "crossentropy": 2.6229066848754883, "epoch": 0.6018706496519721, "grad_norm": 0.026290906593203545, "grad_norm_var": 3.7052945295740213e-07, "learning_rate": 0.0035487156309099346, "loss": 2.5954, "step": 16602 }, { "crossentropy": 2.570803642272949, "epoch": 0.6019069025522041, "grad_norm": 0.02665104903280735, "grad_norm_var": 3.694953258704155e-07, "learning_rate": 0.0035481595695327954, "loss": 2.5413, "step": 16603 }, { "crossentropy": 2.599817991256714, "epoch": 0.6019431554524362, "grad_norm": 0.0270709078758955, "grad_norm_var": 3.381975484846473e-07, "learning_rate": 0.003547603527764963, "loss": 2.6198, "step": 16604 }, { "crossentropy": 2.6098711490631104, "epoch": 0.6019794083526682, "grad_norm": 0.02566358633339405, "grad_norm_var": 4.0768377200216717e-07, "learning_rate": 0.0035470475056139494, "loss": 2.5076, "step": 16605 }, { "crossentropy": 2.4458508491516113, "epoch": 0.6020156612529002, "grad_norm": 0.028019776567816734, "grad_norm_var": 4.849680044371422e-07, "learning_rate": 0.0035464915030872647, "loss": 2.5769, "step": 16606 }, { "crossentropy": 2.538331985473633, "epoch": 0.6020519141531323, "grad_norm": 0.027110746130347252, "grad_norm_var": 4.907468127107132e-07, "learning_rate": 0.003545935520192417, "loss": 2.5377, "step": 16607 }, { "crossentropy": 2.7521286010742188, "epoch": 0.6020881670533643, "grad_norm": 0.02823304757475853, "grad_norm_var": 5.625926510708276e-07, "learning_rate": 0.0035453795569369163, "loss": 2.6467, "step": 16608 }, { "crossentropy": 2.521866798400879, "epoch": 0.6021244199535963, "grad_norm": 0.02945907600224018, "grad_norm_var": 9.916634871135848e-07, "learning_rate": 0.003544823613328273, "loss": 2.5045, "step": 16609 }, { "crossentropy": 2.8314285278320312, "epoch": 0.6021606728538283, "grad_norm": 0.027814684435725212, "grad_norm_var": 1.031507350747011e-06, "learning_rate": 0.0035442676893739956, "loss": 2.7369, "step": 16610 }, { "crossentropy": 2.6134257316589355, "epoch": 0.6021969257540604, "grad_norm": 0.026124795898795128, "grad_norm_var": 9.494123367153092e-07, "learning_rate": 0.0035437117850815924, "loss": 2.5371, "step": 16611 }, { "crossentropy": 2.5615427494049072, "epoch": 0.6022331786542924, "grad_norm": 0.027227552607655525, "grad_norm_var": 8.851835646673934e-07, "learning_rate": 0.003543155900458571, "loss": 2.5647, "step": 16612 }, { "crossentropy": 2.6760904788970947, "epoch": 0.6022694315545244, "grad_norm": 0.026497630402445793, "grad_norm_var": 8.7957421083111e-07, "learning_rate": 0.00354260003551244, "loss": 2.6433, "step": 16613 }, { "crossentropy": 2.5119924545288086, "epoch": 0.6023056844547564, "grad_norm": 0.02653050422668457, "grad_norm_var": 9.024959959677414e-07, "learning_rate": 0.0035420441902507074, "loss": 2.5004, "step": 16614 }, { "crossentropy": 2.5445070266723633, "epoch": 0.6023419373549884, "grad_norm": 0.02744441293179989, "grad_norm_var": 8.964017696328393e-07, "learning_rate": 0.0035414883646808803, "loss": 2.5319, "step": 16615 }, { "crossentropy": 2.51763653755188, "epoch": 0.6023781902552204, "grad_norm": 0.026401543989777565, "grad_norm_var": 9.260501825466537e-07, "learning_rate": 0.0035409325588104646, "loss": 2.5145, "step": 16616 }, { "crossentropy": 2.545769214630127, "epoch": 0.6024144431554525, "grad_norm": 0.02745981700718403, "grad_norm_var": 9.00441938088183e-07, "learning_rate": 0.0035403767726469703, "loss": 2.5219, "step": 16617 }, { "crossentropy": 2.6119186878204346, "epoch": 0.6024506960556845, "grad_norm": 0.027884429320693016, "grad_norm_var": 8.819290329230752e-07, "learning_rate": 0.003539821006197903, "loss": 2.5647, "step": 16618 }, { "crossentropy": 2.5174400806427, "epoch": 0.6024869489559165, "grad_norm": 0.027432825416326523, "grad_norm_var": 8.603425391412311e-07, "learning_rate": 0.0035392652594707693, "loss": 2.5148, "step": 16619 }, { "crossentropy": 2.5472893714904785, "epoch": 0.6025232018561485, "grad_norm": 0.02625967189669609, "grad_norm_var": 9.233828795751631e-07, "learning_rate": 0.0035387095324730757, "loss": 2.5042, "step": 16620 }, { "crossentropy": 2.444756031036377, "epoch": 0.6025594547563805, "grad_norm": 0.02667267806828022, "grad_norm_var": 7.772451096038635e-07, "learning_rate": 0.0035381538252123256, "loss": 2.4762, "step": 16621 }, { "crossentropy": 2.5163516998291016, "epoch": 0.6025957076566125, "grad_norm": 0.026789681985974312, "grad_norm_var": 7.514385120860681e-07, "learning_rate": 0.0035375981376960276, "loss": 2.5562, "step": 16622 }, { "crossentropy": 2.6506145000457764, "epoch": 0.6026319605568445, "grad_norm": 0.030775923281908035, "grad_norm_var": 1.5430455754693405e-06, "learning_rate": 0.0035370424699316854, "loss": 2.5955, "step": 16623 }, { "crossentropy": 2.4931328296661377, "epoch": 0.6026682134570766, "grad_norm": 0.025279417634010315, "grad_norm_var": 1.7751944500568924e-06, "learning_rate": 0.003536486821926804, "loss": 2.4735, "step": 16624 }, { "crossentropy": 2.4497742652893066, "epoch": 0.6027044663573086, "grad_norm": 0.02754603698849678, "grad_norm_var": 1.441324850099309e-06, "learning_rate": 0.0035359311936888904, "loss": 2.4637, "step": 16625 }, { "crossentropy": 2.6273961067199707, "epoch": 0.6027407192575406, "grad_norm": 0.02697637863457203, "grad_norm_var": 1.409147520230134e-06, "learning_rate": 0.003535375585225447, "loss": 2.5881, "step": 16626 }, { "crossentropy": 2.7065954208374023, "epoch": 0.6027769721577726, "grad_norm": 0.027534715831279755, "grad_norm_var": 1.353547722271114e-06, "learning_rate": 0.00353481999654398, "loss": 2.5557, "step": 16627 }, { "crossentropy": 2.681267023086548, "epoch": 0.6028132250580046, "grad_norm": 0.028643755242228508, "grad_norm_var": 1.489847091243089e-06, "learning_rate": 0.0035342644276519924, "loss": 2.7249, "step": 16628 }, { "crossentropy": 2.385197639465332, "epoch": 0.6028494779582366, "grad_norm": 0.02649943344295025, "grad_norm_var": 1.4896644760911537e-06, "learning_rate": 0.0035337088785569876, "loss": 2.5196, "step": 16629 }, { "crossentropy": 2.5229058265686035, "epoch": 0.6028857308584686, "grad_norm": 0.026503702625632286, "grad_norm_var": 1.4923098321464416e-06, "learning_rate": 0.0035331533492664693, "loss": 2.5598, "step": 16630 }, { "crossentropy": 2.595102310180664, "epoch": 0.6029219837587007, "grad_norm": 0.02605225332081318, "grad_norm_var": 1.5785658892647838e-06, "learning_rate": 0.003532597839787941, "loss": 2.5486, "step": 16631 }, { "crossentropy": 2.5684666633605957, "epoch": 0.6029582366589327, "grad_norm": 0.026720011606812477, "grad_norm_var": 1.5522948221819224e-06, "learning_rate": 0.0035320423501289055, "loss": 2.5666, "step": 16632 }, { "crossentropy": 2.57582688331604, "epoch": 0.6029944895591647, "grad_norm": 0.029090221971273422, "grad_norm_var": 1.7772143097343247e-06, "learning_rate": 0.0035314868802968657, "loss": 2.5672, "step": 16633 }, { "crossentropy": 2.530856132507324, "epoch": 0.6030307424593968, "grad_norm": 0.026192370802164078, "grad_norm_var": 1.8223458396373604e-06, "learning_rate": 0.003530931430299325, "loss": 2.5337, "step": 16634 }, { "crossentropy": 2.76737642288208, "epoch": 0.6030669953596288, "grad_norm": 0.0268861074000597, "grad_norm_var": 1.8230030731022773e-06, "learning_rate": 0.0035303760001437847, "loss": 2.6328, "step": 16635 }, { "crossentropy": 2.5956666469573975, "epoch": 0.6031032482598608, "grad_norm": 0.02891460433602333, "grad_norm_var": 1.94788186714336e-06, "learning_rate": 0.0035298205898377467, "loss": 2.562, "step": 16636 }, { "crossentropy": 2.5871493816375732, "epoch": 0.6031395011600929, "grad_norm": 0.027209201827645302, "grad_norm_var": 1.919756773553063e-06, "learning_rate": 0.0035292651993887128, "loss": 2.5607, "step": 16637 }, { "crossentropy": 2.474839448928833, "epoch": 0.6031757540603249, "grad_norm": 0.026484301313757896, "grad_norm_var": 1.94843522567481e-06, "learning_rate": 0.0035287098288041842, "loss": 2.5685, "step": 16638 }, { "crossentropy": 2.5570387840270996, "epoch": 0.6032120069605569, "grad_norm": 0.027186404913663864, "grad_norm_var": 1.1053486397620404e-06, "learning_rate": 0.003528154478091662, "loss": 2.5355, "step": 16639 }, { "crossentropy": 2.623835325241089, "epoch": 0.6032482598607889, "grad_norm": 0.027745356783270836, "grad_norm_var": 8.843657196208032e-07, "learning_rate": 0.0035275991472586456, "loss": 2.6135, "step": 16640 }, { "crossentropy": 2.4497568607330322, "epoch": 0.6032845127610209, "grad_norm": 0.028267282992601395, "grad_norm_var": 9.44235628010046e-07, "learning_rate": 0.003527043836312639, "loss": 2.6341, "step": 16641 }, { "crossentropy": 2.600839853286743, "epoch": 0.6033207656612529, "grad_norm": 0.02968308888375759, "grad_norm_var": 1.2829416641151058e-06, "learning_rate": 0.0035264885452611396, "loss": 2.6269, "step": 16642 }, { "crossentropy": 2.5618228912353516, "epoch": 0.6033570185614849, "grad_norm": 0.026361316442489624, "grad_norm_var": 1.3597783535698738e-06, "learning_rate": 0.0035259332741116494, "loss": 2.5154, "step": 16643 }, { "crossentropy": 2.6882567405700684, "epoch": 0.603393271461717, "grad_norm": 0.025903167203068733, "grad_norm_var": 1.3756221571699252e-06, "learning_rate": 0.003525378022871667, "loss": 2.5896, "step": 16644 }, { "crossentropy": 2.6354098320007324, "epoch": 0.603429524361949, "grad_norm": 0.026526067405939102, "grad_norm_var": 1.3730679299216261e-06, "learning_rate": 0.0035248227915486937, "loss": 2.7409, "step": 16645 }, { "crossentropy": 2.4960885047912598, "epoch": 0.603465777262181, "grad_norm": 0.02558327652513981, "grad_norm_var": 1.5154993709467437e-06, "learning_rate": 0.0035242675801502254, "loss": 2.5143, "step": 16646 }, { "crossentropy": 2.4590377807617188, "epoch": 0.603502030162413, "grad_norm": 0.030604062601923943, "grad_norm_var": 2.128840082117481e-06, "learning_rate": 0.003523712388683764, "loss": 2.4384, "step": 16647 }, { "crossentropy": 2.5189313888549805, "epoch": 0.603538283062645, "grad_norm": 0.02623971551656723, "grad_norm_var": 2.1906336882035933e-06, "learning_rate": 0.003523157217156805, "loss": 2.4717, "step": 16648 }, { "crossentropy": 2.4445910453796387, "epoch": 0.603574535962877, "grad_norm": 0.0528532974421978, "grad_norm_var": 4.274431622672247e-05, "learning_rate": 0.003522602065576851, "loss": 2.5284, "step": 16649 }, { "crossentropy": 2.495453119277954, "epoch": 0.603610788863109, "grad_norm": 0.029206519946455956, "grad_norm_var": 4.2217956090740886e-05, "learning_rate": 0.003522046933951397, "loss": 2.547, "step": 16650 }, { "crossentropy": 2.5355629920959473, "epoch": 0.603647041763341, "grad_norm": 0.02777737006545067, "grad_norm_var": 4.2004115564632455e-05, "learning_rate": 0.0035214918222879433, "loss": 2.59, "step": 16651 }, { "crossentropy": 2.4910311698913574, "epoch": 0.6036832946635731, "grad_norm": 0.027570629492402077, "grad_norm_var": 4.216081380445968e-05, "learning_rate": 0.0035209367305939856, "loss": 2.5186, "step": 16652 }, { "crossentropy": 2.4155118465423584, "epoch": 0.6037195475638051, "grad_norm": 0.0267945546656847, "grad_norm_var": 4.22747162873542e-05, "learning_rate": 0.003520381658877023, "loss": 2.4801, "step": 16653 }, { "crossentropy": 2.657757043838501, "epoch": 0.6037558004640371, "grad_norm": 0.028960423544049263, "grad_norm_var": 4.181113102036184e-05, "learning_rate": 0.0035198266071445506, "loss": 2.5091, "step": 16654 }, { "crossentropy": 2.558389186859131, "epoch": 0.6037920533642691, "grad_norm": 0.028682822361588478, "grad_norm_var": 4.1548548104614e-05, "learning_rate": 0.003519271575404065, "loss": 2.5199, "step": 16655 }, { "crossentropy": 2.4512596130371094, "epoch": 0.6038283062645011, "grad_norm": 0.03018122911453247, "grad_norm_var": 4.141530181093511e-05, "learning_rate": 0.003518716563663065, "loss": 2.5967, "step": 16656 }, { "crossentropy": 2.5625083446502686, "epoch": 0.6038645591647331, "grad_norm": 0.02808663249015808, "grad_norm_var": 4.1445821471487025e-05, "learning_rate": 0.003518161571929045, "loss": 2.5299, "step": 16657 }, { "crossentropy": 2.5684075355529785, "epoch": 0.6039008120649652, "grad_norm": 0.026310743764042854, "grad_norm_var": 4.204658620551789e-05, "learning_rate": 0.0035176066002095027, "loss": 2.5806, "step": 16658 }, { "crossentropy": 2.450446844100952, "epoch": 0.6039370649651972, "grad_norm": 0.027132079005241394, "grad_norm_var": 4.1789151212293544e-05, "learning_rate": 0.003517051648511932, "loss": 2.4557, "step": 16659 }, { "crossentropy": 2.57133412361145, "epoch": 0.6039733178654292, "grad_norm": 0.027257833629846573, "grad_norm_var": 4.129467631788323e-05, "learning_rate": 0.00351649671684383, "loss": 2.6304, "step": 16660 }, { "crossentropy": 2.4345321655273438, "epoch": 0.6040095707656613, "grad_norm": 0.02637280337512493, "grad_norm_var": 4.135406569469092e-05, "learning_rate": 0.0035159418052126902, "loss": 2.473, "step": 16661 }, { "crossentropy": 2.4380109310150146, "epoch": 0.6040458236658933, "grad_norm": 0.024751432240009308, "grad_norm_var": 4.1815187491745645e-05, "learning_rate": 0.003515386913626011, "loss": 2.516, "step": 16662 }, { "crossentropy": 2.5116140842437744, "epoch": 0.6040820765661253, "grad_norm": 0.02673123963177204, "grad_norm_var": 4.2078646598611194e-05, "learning_rate": 0.0035148320420912817, "loss": 2.4615, "step": 16663 }, { "crossentropy": 2.698446273803711, "epoch": 0.6041183294663574, "grad_norm": 0.02677926793694496, "grad_norm_var": 4.189417706632776e-05, "learning_rate": 0.0035142771906160005, "loss": 2.7335, "step": 16664 }, { "crossentropy": 2.6740059852600098, "epoch": 0.6041545823665894, "grad_norm": 0.026006311178207397, "grad_norm_var": 1.8806494175066871e-06, "learning_rate": 0.0035137223592076605, "loss": 2.5257, "step": 16665 }, { "crossentropy": 2.6256420612335205, "epoch": 0.6041908352668214, "grad_norm": 0.027117175981402397, "grad_norm_var": 1.6537406218444216e-06, "learning_rate": 0.0035131675478737552, "loss": 2.5205, "step": 16666 }, { "crossentropy": 2.625378370285034, "epoch": 0.6042270881670534, "grad_norm": 0.027518615126609802, "grad_norm_var": 1.640835843452528e-06, "learning_rate": 0.003512612756621779, "loss": 2.5291, "step": 16667 }, { "crossentropy": 2.5854897499084473, "epoch": 0.6042633410672854, "grad_norm": 0.026800233870744705, "grad_norm_var": 1.6466246527881e-06, "learning_rate": 0.003512057985459223, "loss": 2.5328, "step": 16668 }, { "crossentropy": 2.5607542991638184, "epoch": 0.6042995939675174, "grad_norm": 0.02563629485666752, "grad_norm_var": 1.79582272256105e-06, "learning_rate": 0.0035115032343935836, "loss": 2.5782, "step": 16669 }, { "crossentropy": 2.670837640762329, "epoch": 0.6043358468677494, "grad_norm": 0.02559695392847061, "grad_norm_var": 1.6888751384063391e-06, "learning_rate": 0.0035109485034323542, "loss": 2.5877, "step": 16670 }, { "crossentropy": 2.519174814224243, "epoch": 0.6043720997679815, "grad_norm": 0.025962458923459053, "grad_norm_var": 1.5174749333086064e-06, "learning_rate": 0.003510393792583021, "loss": 2.5202, "step": 16671 }, { "crossentropy": 2.581975221633911, "epoch": 0.6044083526682135, "grad_norm": 0.028018716722726822, "grad_norm_var": 8.247588194282632e-07, "learning_rate": 0.003509839101853082, "loss": 2.6061, "step": 16672 }, { "crossentropy": 2.5334055423736572, "epoch": 0.6044446055684455, "grad_norm": 0.026642579585313797, "grad_norm_var": 6.746142562607157e-07, "learning_rate": 0.003509284431250027, "loss": 2.5845, "step": 16673 }, { "crossentropy": 2.527406930923462, "epoch": 0.6044808584686775, "grad_norm": 0.026038048788905144, "grad_norm_var": 6.875855650613378e-07, "learning_rate": 0.003508729780781348, "loss": 2.4263, "step": 16674 }, { "crossentropy": 2.4931888580322266, "epoch": 0.6045171113689095, "grad_norm": 0.025792788714170456, "grad_norm_var": 6.908607758142338e-07, "learning_rate": 0.0035081751504545365, "loss": 2.5193, "step": 16675 }, { "crossentropy": 2.5981149673461914, "epoch": 0.6045533642691415, "grad_norm": 0.027591800317168236, "grad_norm_var": 7.34296855378957e-07, "learning_rate": 0.003507620540277082, "loss": 2.5106, "step": 16676 }, { "crossentropy": 2.486363410949707, "epoch": 0.6045896171693735, "grad_norm": 0.028747977688908577, "grad_norm_var": 1.0593382761499148e-06, "learning_rate": 0.0035070659502564783, "loss": 2.5106, "step": 16677 }, { "crossentropy": 2.5509989261627197, "epoch": 0.6046258700696056, "grad_norm": 0.028082365170121193, "grad_norm_var": 9.281277573676878e-07, "learning_rate": 0.003506511380400217, "loss": 2.5361, "step": 16678 }, { "crossentropy": 2.65594744682312, "epoch": 0.6046621229698376, "grad_norm": 0.029186846688389778, "grad_norm_var": 1.2771116462852635e-06, "learning_rate": 0.003505956830715783, "loss": 2.6714, "step": 16679 }, { "crossentropy": 2.621428966522217, "epoch": 0.6046983758700696, "grad_norm": 0.02724059298634529, "grad_norm_var": 1.2786870305148134e-06, "learning_rate": 0.0035054023012106696, "loss": 2.6264, "step": 16680 }, { "crossentropy": 2.5615835189819336, "epoch": 0.6047346287703016, "grad_norm": 0.026112807914614677, "grad_norm_var": 1.2653038908051606e-06, "learning_rate": 0.003504847791892366, "loss": 2.5493, "step": 16681 }, { "crossentropy": 2.6868720054626465, "epoch": 0.6047708816705336, "grad_norm": 0.028004931285977364, "grad_norm_var": 1.327792418698972e-06, "learning_rate": 0.003504293302768363, "loss": 2.6464, "step": 16682 }, { "crossentropy": 2.550415277481079, "epoch": 0.6048071345707656, "grad_norm": 0.028292061761021614, "grad_norm_var": 1.4123860767554023e-06, "learning_rate": 0.0035037388338461477, "loss": 2.5378, "step": 16683 }, { "crossentropy": 2.6832942962646484, "epoch": 0.6048433874709976, "grad_norm": 0.027274537831544876, "grad_norm_var": 1.4069061365177064e-06, "learning_rate": 0.00350318438513321, "loss": 2.5662, "step": 16684 }, { "crossentropy": 2.4053828716278076, "epoch": 0.6048796403712297, "grad_norm": 0.030586203560233116, "grad_norm_var": 1.9465810627778785e-06, "learning_rate": 0.003502629956637039, "loss": 2.4437, "step": 16685 }, { "crossentropy": 2.53165602684021, "epoch": 0.6049158932714617, "grad_norm": 0.027283040806651115, "grad_norm_var": 1.7080734314827885e-06, "learning_rate": 0.0035020755483651236, "loss": 2.6071, "step": 16686 }, { "crossentropy": 2.5077335834503174, "epoch": 0.6049521461716937, "grad_norm": 0.026254579424858093, "grad_norm_var": 1.651432453707707e-06, "learning_rate": 0.003501521160324951, "loss": 2.5364, "step": 16687 }, { "crossentropy": 2.617079019546509, "epoch": 0.6049883990719258, "grad_norm": 0.02825053036212921, "grad_norm_var": 1.6686024854870029e-06, "learning_rate": 0.0035009667925240085, "loss": 2.5909, "step": 16688 }, { "crossentropy": 2.6174068450927734, "epoch": 0.6050246519721578, "grad_norm": 0.026358334347605705, "grad_norm_var": 1.7094207146539267e-06, "learning_rate": 0.003500412444969785, "loss": 2.5188, "step": 16689 }, { "crossentropy": 2.33661150932312, "epoch": 0.6050609048723898, "grad_norm": 0.026213914155960083, "grad_norm_var": 1.6754645170683272e-06, "learning_rate": 0.0034998581176697665, "loss": 2.4526, "step": 16690 }, { "crossentropy": 2.5761613845825195, "epoch": 0.6050971577726219, "grad_norm": 0.02587248384952545, "grad_norm_var": 1.65687497585415e-06, "learning_rate": 0.0034993038106314414, "loss": 2.481, "step": 16691 }, { "crossentropy": 2.5500528812408447, "epoch": 0.6051334106728539, "grad_norm": 0.026783786714076996, "grad_norm_var": 1.696900637365471e-06, "learning_rate": 0.0034987495238622936, "loss": 2.5617, "step": 16692 }, { "crossentropy": 2.6447010040283203, "epoch": 0.6051696635730859, "grad_norm": 0.026517249643802643, "grad_norm_var": 1.6468544742129683e-06, "learning_rate": 0.0034981952573698133, "loss": 2.5855, "step": 16693 }, { "crossentropy": 2.5170373916625977, "epoch": 0.6052059164733179, "grad_norm": 0.027051180601119995, "grad_norm_var": 1.6187573418308902e-06, "learning_rate": 0.0034976410111614855, "loss": 2.5662, "step": 16694 }, { "crossentropy": 2.4367547035217285, "epoch": 0.6052421693735499, "grad_norm": 0.0249203871935606, "grad_norm_var": 1.7002460473111801e-06, "learning_rate": 0.003497086785244796, "loss": 2.4288, "step": 16695 }, { "crossentropy": 2.483227252960205, "epoch": 0.6052784222737819, "grad_norm": 0.02677186019718647, "grad_norm_var": 1.7029124750940483e-06, "learning_rate": 0.0034965325796272296, "loss": 2.4808, "step": 16696 }, { "crossentropy": 2.6267805099487305, "epoch": 0.6053146751740139, "grad_norm": 0.02520057186484337, "grad_norm_var": 1.8669989077810343e-06, "learning_rate": 0.003495978394316272, "loss": 2.5322, "step": 16697 }, { "crossentropy": 2.454153060913086, "epoch": 0.605350928074246, "grad_norm": 0.025667015463113785, "grad_norm_var": 1.88825683009435e-06, "learning_rate": 0.0034954242293194094, "loss": 2.513, "step": 16698 }, { "crossentropy": 2.499192476272583, "epoch": 0.605387180974478, "grad_norm": 0.02934173122048378, "grad_norm_var": 2.16158875723757e-06, "learning_rate": 0.0034948700846441252, "loss": 2.5073, "step": 16699 }, { "crossentropy": 2.5265541076660156, "epoch": 0.60542343387471, "grad_norm": 0.036777645349502563, "grad_norm_var": 8.284639265085482e-06, "learning_rate": 0.0034943159602979037, "loss": 2.5368, "step": 16700 }, { "crossentropy": 2.503572940826416, "epoch": 0.605459686774942, "grad_norm": 0.02638929896056652, "grad_norm_var": 7.653286395479597e-06, "learning_rate": 0.003493761856288231, "loss": 2.5729, "step": 16701 }, { "crossentropy": 2.563464641571045, "epoch": 0.605495939675174, "grad_norm": 0.02881649322807789, "grad_norm_var": 7.811435636245278e-06, "learning_rate": 0.003493207772622591, "loss": 2.5309, "step": 16702 }, { "crossentropy": 2.60931396484375, "epoch": 0.605532192575406, "grad_norm": 0.02755696140229702, "grad_norm_var": 7.731708945707636e-06, "learning_rate": 0.0034926537093084653, "loss": 2.5653, "step": 16703 }, { "crossentropy": 2.4036786556243896, "epoch": 0.605568445475638, "grad_norm": 0.02645774930715561, "grad_norm_var": 7.730615598342445e-06, "learning_rate": 0.0034920996663533412, "loss": 2.3946, "step": 16704 }, { "crossentropy": 2.5505547523498535, "epoch": 0.60560469837587, "grad_norm": 0.027085527777671814, "grad_norm_var": 7.672989377911994e-06, "learning_rate": 0.0034915456437646973, "loss": 2.5919, "step": 16705 }, { "crossentropy": 2.6200554370880127, "epoch": 0.6056409512761021, "grad_norm": 0.026054201647639275, "grad_norm_var": 7.698542147122529e-06, "learning_rate": 0.0034909916415500186, "loss": 2.6144, "step": 16706 }, { "crossentropy": 2.4664714336395264, "epoch": 0.6056772041763341, "grad_norm": 0.026801424100995064, "grad_norm_var": 7.572071956350951e-06, "learning_rate": 0.0034904376597167885, "loss": 2.5333, "step": 16707 }, { "crossentropy": 2.6638011932373047, "epoch": 0.6057134570765661, "grad_norm": 0.02792665734887123, "grad_norm_var": 7.561776906377552e-06, "learning_rate": 0.003489883698272488, "loss": 2.6175, "step": 16708 }, { "crossentropy": 2.492001533508301, "epoch": 0.6057497099767981, "grad_norm": 0.02671854943037033, "grad_norm_var": 7.539046448830109e-06, "learning_rate": 0.0034893297572245996, "loss": 2.4622, "step": 16709 }, { "crossentropy": 2.468719720840454, "epoch": 0.6057859628770301, "grad_norm": 0.02672787383198738, "grad_norm_var": 7.56368017831344e-06, "learning_rate": 0.0034887758365806064, "loss": 2.545, "step": 16710 }, { "crossentropy": 2.6797046661376953, "epoch": 0.6058222157772621, "grad_norm": 0.026911068707704544, "grad_norm_var": 7.1397041348279785e-06, "learning_rate": 0.0034882219363479884, "loss": 2.6272, "step": 16711 }, { "crossentropy": 2.4620869159698486, "epoch": 0.6058584686774942, "grad_norm": 0.027773302048444748, "grad_norm_var": 7.09510615042549e-06, "learning_rate": 0.0034876680565342284, "loss": 2.6537, "step": 16712 }, { "crossentropy": 2.5883548259735107, "epoch": 0.6058947215777262, "grad_norm": 0.027043579146265984, "grad_norm_var": 6.708468299330452e-06, "learning_rate": 0.0034871141971468058, "loss": 2.5979, "step": 16713 }, { "crossentropy": 2.6798884868621826, "epoch": 0.6059309744779582, "grad_norm": 0.027048366144299507, "grad_norm_var": 6.4435171640807165e-06, "learning_rate": 0.0034865603581932013, "loss": 2.5674, "step": 16714 }, { "crossentropy": 2.617738962173462, "epoch": 0.6059672273781903, "grad_norm": 0.027875708416104317, "grad_norm_var": 6.284183711412668e-06, "learning_rate": 0.003486006539680896, "loss": 2.6248, "step": 16715 }, { "crossentropy": 2.5734341144561768, "epoch": 0.6060034802784223, "grad_norm": 0.027009014040231705, "grad_norm_var": 4.870558937603367e-07, "learning_rate": 0.0034854527416173686, "loss": 2.5288, "step": 16716 }, { "crossentropy": 2.5702059268951416, "epoch": 0.6060397331786543, "grad_norm": 0.026601752266287804, "grad_norm_var": 4.68690027155029e-07, "learning_rate": 0.003484898964010102, "loss": 2.5615, "step": 16717 }, { "crossentropy": 2.63647723197937, "epoch": 0.6060759860788864, "grad_norm": 0.02824602648615837, "grad_norm_var": 3.623114701824335e-07, "learning_rate": 0.0034843452068665736, "loss": 2.6165, "step": 16718 }, { "crossentropy": 2.584606647491455, "epoch": 0.6061122389791184, "grad_norm": 0.028452260419726372, "grad_norm_var": 4.651840353033735e-07, "learning_rate": 0.0034837914701942637, "loss": 2.6008, "step": 16719 }, { "crossentropy": 2.546295166015625, "epoch": 0.6061484918793504, "grad_norm": 0.028894178569316864, "grad_norm_var": 6.045507790197139e-07, "learning_rate": 0.003483237754000652, "loss": 2.5128, "step": 16720 }, { "crossentropy": 2.3510589599609375, "epoch": 0.6061847447795824, "grad_norm": 0.02654186822474003, "grad_norm_var": 6.402442603301491e-07, "learning_rate": 0.003482684058293215, "loss": 2.4513, "step": 16721 }, { "crossentropy": 2.4535369873046875, "epoch": 0.6062209976798144, "grad_norm": 0.025729825720191002, "grad_norm_var": 7.002306246330543e-07, "learning_rate": 0.0034821303830794323, "loss": 2.5414, "step": 16722 }, { "crossentropy": 2.5282747745513916, "epoch": 0.6062572505800464, "grad_norm": 0.026535246521234512, "grad_norm_var": 7.21247565216978e-07, "learning_rate": 0.003481576728366782, "loss": 2.5552, "step": 16723 }, { "crossentropy": 2.5991017818450928, "epoch": 0.6062935034802784, "grad_norm": 0.02608986385166645, "grad_norm_var": 7.669333793749085e-07, "learning_rate": 0.003481023094162742, "loss": 2.5937, "step": 16724 }, { "crossentropy": 2.6838018894195557, "epoch": 0.6063297563805105, "grad_norm": 0.026935674250125885, "grad_norm_var": 7.577539629895676e-07, "learning_rate": 0.0034804694804747905, "loss": 2.6282, "step": 16725 }, { "crossentropy": 2.531031608581543, "epoch": 0.6063660092807425, "grad_norm": 0.025672050192952156, "grad_norm_var": 8.869894566637526e-07, "learning_rate": 0.003479915887310405, "loss": 2.5678, "step": 16726 }, { "crossentropy": 2.569746494293213, "epoch": 0.6064022621809745, "grad_norm": 0.027827225625514984, "grad_norm_var": 9.182036191435061e-07, "learning_rate": 0.003479362314677062, "loss": 2.511, "step": 16727 }, { "crossentropy": 2.5435612201690674, "epoch": 0.6064385150812065, "grad_norm": 0.026975281536579132, "grad_norm_var": 8.908598651796573e-07, "learning_rate": 0.0034788087625822386, "loss": 2.5559, "step": 16728 }, { "crossentropy": 2.6344921588897705, "epoch": 0.6064747679814385, "grad_norm": 0.02669493667781353, "grad_norm_var": 9.007249176136289e-07, "learning_rate": 0.0034782552310334124, "loss": 2.5665, "step": 16729 }, { "crossentropy": 2.5060460567474365, "epoch": 0.6065110208816705, "grad_norm": 0.030276689678430557, "grad_norm_var": 1.5425426968631214e-06, "learning_rate": 0.003477701720038058, "loss": 2.6048, "step": 16730 }, { "crossentropy": 2.3641560077667236, "epoch": 0.6065472737819025, "grad_norm": 0.027305657044053078, "grad_norm_var": 1.5169932466620184e-06, "learning_rate": 0.0034771482296036514, "loss": 2.4645, "step": 16731 }, { "crossentropy": 2.7147040367126465, "epoch": 0.6065835266821346, "grad_norm": 0.028784459456801414, "grad_norm_var": 1.6601017544730916e-06, "learning_rate": 0.003476594759737669, "loss": 2.5665, "step": 16732 }, { "crossentropy": 2.621095895767212, "epoch": 0.6066197795823666, "grad_norm": 0.0314139761030674, "grad_norm_var": 2.6288312266029474e-06, "learning_rate": 0.0034760413104475865, "loss": 2.6804, "step": 16733 }, { "crossentropy": 2.684687852859497, "epoch": 0.6066560324825986, "grad_norm": 0.026553478091955185, "grad_norm_var": 2.6730195683108877e-06, "learning_rate": 0.0034754878817408784, "loss": 2.6437, "step": 16734 }, { "crossentropy": 2.630497455596924, "epoch": 0.6066922853828306, "grad_norm": 0.0256003737449646, "grad_norm_var": 2.8354738540587447e-06, "learning_rate": 0.003474934473625021, "loss": 2.535, "step": 16735 }, { "crossentropy": 2.3968400955200195, "epoch": 0.6067285382830626, "grad_norm": 0.025844190269708633, "grad_norm_var": 2.7947778710400805e-06, "learning_rate": 0.003474381086107488, "loss": 2.4552, "step": 16736 }, { "crossentropy": 2.6412177085876465, "epoch": 0.6067647911832946, "grad_norm": 0.02786427177488804, "grad_norm_var": 2.7926523413121606e-06, "learning_rate": 0.0034738277191957534, "loss": 2.6779, "step": 16737 }, { "crossentropy": 2.4419198036193848, "epoch": 0.6068010440835266, "grad_norm": 0.0296710804104805, "grad_norm_var": 2.9612533431887246e-06, "learning_rate": 0.003473274372897291, "loss": 2.4964, "step": 16738 }, { "crossentropy": 2.6281185150146484, "epoch": 0.6068372969837587, "grad_norm": 0.02959413081407547, "grad_norm_var": 3.151442566266758e-06, "learning_rate": 0.0034727210472195753, "loss": 2.5891, "step": 16739 }, { "crossentropy": 2.487198829650879, "epoch": 0.6068735498839907, "grad_norm": 0.027267325669527054, "grad_norm_var": 2.9862588590605222e-06, "learning_rate": 0.003472167742170078, "loss": 2.5326, "step": 16740 }, { "crossentropy": 2.4274721145629883, "epoch": 0.6069098027842227, "grad_norm": 0.027028875425457954, "grad_norm_var": 2.9764641886959505e-06, "learning_rate": 0.0034716144577562756, "loss": 2.4537, "step": 16741 }, { "crossentropy": 2.621136426925659, "epoch": 0.6069460556844548, "grad_norm": 0.027007771655917168, "grad_norm_var": 2.7137356971267934e-06, "learning_rate": 0.0034710611939856397, "loss": 2.5682, "step": 16742 }, { "crossentropy": 2.3245725631713867, "epoch": 0.6069823085846868, "grad_norm": 0.027688391506671906, "grad_norm_var": 2.7154889057897625e-06, "learning_rate": 0.0034705079508656417, "loss": 2.4833, "step": 16743 }, { "crossentropy": 2.6534337997436523, "epoch": 0.6070185614849188, "grad_norm": 0.0267396941781044, "grad_norm_var": 2.7463769419602247e-06, "learning_rate": 0.003469954728403755, "loss": 2.5675, "step": 16744 }, { "crossentropy": 2.6062657833099365, "epoch": 0.6070548143851509, "grad_norm": 0.02631671167910099, "grad_norm_var": 2.8127333760290716e-06, "learning_rate": 0.003469401526607451, "loss": 2.5968, "step": 16745 }, { "crossentropy": 2.52858567237854, "epoch": 0.6070910672853829, "grad_norm": 0.026562239974737167, "grad_norm_var": 2.453311318693687e-06, "learning_rate": 0.003468848345484205, "loss": 2.5866, "step": 16746 }, { "crossentropy": 2.594266176223755, "epoch": 0.6071273201856149, "grad_norm": 0.02613723836839199, "grad_norm_var": 2.5810122275893646e-06, "learning_rate": 0.003468295185041484, "loss": 2.6115, "step": 16747 }, { "crossentropy": 2.641773223876953, "epoch": 0.6071635730858469, "grad_norm": 0.026399658992886543, "grad_norm_var": 2.5295176543626194e-06, "learning_rate": 0.00346774204528676, "loss": 2.5314, "step": 16748 }, { "crossentropy": 2.59718918800354, "epoch": 0.6071998259860789, "grad_norm": 0.026660816743969917, "grad_norm_var": 1.3695283711541296e-06, "learning_rate": 0.0034671889262275052, "loss": 2.5053, "step": 16749 }, { "crossentropy": 2.429581880569458, "epoch": 0.6072360788863109, "grad_norm": 0.026753408834338188, "grad_norm_var": 1.358563637323303e-06, "learning_rate": 0.003466635827871191, "loss": 2.4915, "step": 16750 }, { "crossentropy": 2.633056163787842, "epoch": 0.6072723317865429, "grad_norm": 0.027749400585889816, "grad_norm_var": 1.2258164796922003e-06, "learning_rate": 0.003466082750225286, "loss": 2.4925, "step": 16751 }, { "crossentropy": 2.6484761238098145, "epoch": 0.607308584686775, "grad_norm": 0.02636863850057125, "grad_norm_var": 1.1478275277818864e-06, "learning_rate": 0.003465529693297262, "loss": 2.6018, "step": 16752 }, { "crossentropy": 2.609825611114502, "epoch": 0.607344837587007, "grad_norm": 0.026765404269099236, "grad_norm_var": 1.131553419158682e-06, "learning_rate": 0.003464976657094587, "loss": 2.5644, "step": 16753 }, { "crossentropy": 2.552295446395874, "epoch": 0.607381090487239, "grad_norm": 0.026371823623776436, "grad_norm_var": 7.113910587242325e-07, "learning_rate": 0.003464423641624734, "loss": 2.55, "step": 16754 }, { "crossentropy": 2.582012414932251, "epoch": 0.607417343387471, "grad_norm": 0.026501618325710297, "grad_norm_var": 2.243018243585614e-07, "learning_rate": 0.0034638706468951685, "loss": 2.6054, "step": 16755 }, { "crossentropy": 2.604708671569824, "epoch": 0.607453596287703, "grad_norm": 0.03079795464873314, "grad_norm_var": 1.2375308414330187e-06, "learning_rate": 0.003463317672913361, "loss": 2.5236, "step": 16756 }, { "crossentropy": 2.4378607273101807, "epoch": 0.607489849187935, "grad_norm": 0.026087690144777298, "grad_norm_var": 1.2880923352002187e-06, "learning_rate": 0.00346276471968678, "loss": 2.5861, "step": 16757 }, { "crossentropy": 2.659438371658325, "epoch": 0.607526102088167, "grad_norm": 0.02647426724433899, "grad_norm_var": 1.3004758556531734e-06, "learning_rate": 0.003462211787222895, "loss": 2.6122, "step": 16758 }, { "crossentropy": 2.607872724533081, "epoch": 0.6075623549883991, "grad_norm": 0.02572096511721611, "grad_norm_var": 1.3351745586545038e-06, "learning_rate": 0.0034616588755291733, "loss": 2.5471, "step": 16759 }, { "crossentropy": 2.515054941177368, "epoch": 0.6075986078886311, "grad_norm": 0.027270909398794174, "grad_norm_var": 1.3502774047861414e-06, "learning_rate": 0.0034611059846130833, "loss": 2.5478, "step": 16760 }, { "crossentropy": 2.584822654724121, "epoch": 0.6076348607888631, "grad_norm": 0.02667440101504326, "grad_norm_var": 1.334811247072108e-06, "learning_rate": 0.003460553114482091, "loss": 2.5733, "step": 16761 }, { "crossentropy": 2.511323928833008, "epoch": 0.6076711136890951, "grad_norm": 0.026142233982682228, "grad_norm_var": 1.3608888630759855e-06, "learning_rate": 0.003460000265143668, "loss": 2.5037, "step": 16762 }, { "crossentropy": 2.4907853603363037, "epoch": 0.6077073665893271, "grad_norm": 0.02684912271797657, "grad_norm_var": 1.3292012101539496e-06, "learning_rate": 0.0034594474366052754, "loss": 2.4609, "step": 16763 }, { "crossentropy": 2.5523664951324463, "epoch": 0.6077436194895591, "grad_norm": 0.034263018518686295, "grad_norm_var": 4.7223343005081545e-06, "learning_rate": 0.0034588946288743845, "loss": 2.5792, "step": 16764 }, { "crossentropy": 2.655552387237549, "epoch": 0.6077798723897911, "grad_norm": 0.026638779789209366, "grad_norm_var": 4.7243624134121755e-06, "learning_rate": 0.00345834184195846, "loss": 2.692, "step": 16765 }, { "crossentropy": 2.6480910778045654, "epoch": 0.6078161252900232, "grad_norm": 0.027202920988202095, "grad_norm_var": 4.701872742680855e-06, "learning_rate": 0.003457789075864969, "loss": 2.576, "step": 16766 }, { "crossentropy": 2.5514237880706787, "epoch": 0.6078523781902552, "grad_norm": 0.026896845549345016, "grad_norm_var": 4.703882657310955e-06, "learning_rate": 0.003457236330601376, "loss": 2.5292, "step": 16767 }, { "crossentropy": 2.7105624675750732, "epoch": 0.6078886310904872, "grad_norm": 0.027290988713502884, "grad_norm_var": 4.64077275611507e-06, "learning_rate": 0.0034566836061751473, "loss": 2.5791, "step": 16768 }, { "crossentropy": 2.6226956844329834, "epoch": 0.6079248839907193, "grad_norm": 0.026707859709858894, "grad_norm_var": 4.6456324228844135e-06, "learning_rate": 0.0034561309025937493, "loss": 2.5418, "step": 16769 }, { "crossentropy": 2.505706548690796, "epoch": 0.6079611368909513, "grad_norm": 0.0298195518553257, "grad_norm_var": 4.930522235591789e-06, "learning_rate": 0.0034555782198646455, "loss": 2.4998, "step": 16770 }, { "crossentropy": 2.476184606552124, "epoch": 0.6079973897911833, "grad_norm": 0.028808826580643654, "grad_norm_var": 4.930345876514317e-06, "learning_rate": 0.0034550255579953036, "loss": 2.4646, "step": 16771 }, { "crossentropy": 2.6195223331451416, "epoch": 0.6080336426914154, "grad_norm": 0.026343930512666702, "grad_norm_var": 4.347026187273547e-06, "learning_rate": 0.0034544729169931855, "loss": 2.5552, "step": 16772 }, { "crossentropy": 2.445528268814087, "epoch": 0.6080698955916474, "grad_norm": 0.026261288672685623, "grad_norm_var": 4.317388172719395e-06, "learning_rate": 0.0034539202968657556, "loss": 2.4895, "step": 16773 }, { "crossentropy": 2.554959774017334, "epoch": 0.6081061484918794, "grad_norm": 0.026832370087504387, "grad_norm_var": 4.278319560967192e-06, "learning_rate": 0.003453367697620478, "loss": 2.5098, "step": 16774 }, { "crossentropy": 2.5610527992248535, "epoch": 0.6081424013921114, "grad_norm": 0.02652609348297119, "grad_norm_var": 4.129705554875978e-06, "learning_rate": 0.0034528151192648173, "loss": 2.5661, "step": 16775 }, { "crossentropy": 2.583857297897339, "epoch": 0.6081786542923434, "grad_norm": 0.027895759791135788, "grad_norm_var": 4.132266331063836e-06, "learning_rate": 0.0034522625618062354, "loss": 2.5523, "step": 16776 }, { "crossentropy": 2.5998127460479736, "epoch": 0.6082149071925754, "grad_norm": 0.027517547830939293, "grad_norm_var": 4.07577567281199e-06, "learning_rate": 0.0034517100252521964, "loss": 2.6283, "step": 16777 }, { "crossentropy": 2.6360597610473633, "epoch": 0.6082511600928074, "grad_norm": 0.03277216851711273, "grad_norm_var": 5.5124335325228955e-06, "learning_rate": 0.0034511575096101642, "loss": 2.5185, "step": 16778 }, { "crossentropy": 2.5277445316314697, "epoch": 0.6082874129930395, "grad_norm": 0.026977451518177986, "grad_norm_var": 5.493100110257921e-06, "learning_rate": 0.003450605014887601, "loss": 2.5236, "step": 16779 }, { "crossentropy": 2.674731969833374, "epoch": 0.6083236658932715, "grad_norm": 0.027823250740766525, "grad_norm_var": 2.74790031345584e-06, "learning_rate": 0.003450052541091967, "loss": 2.6002, "step": 16780 }, { "crossentropy": 2.7268319129943848, "epoch": 0.6083599187935035, "grad_norm": 0.027671415358781815, "grad_norm_var": 2.6760427242440593e-06, "learning_rate": 0.003449500088230726, "loss": 2.7243, "step": 16781 }, { "crossentropy": 2.537623643875122, "epoch": 0.6083961716937355, "grad_norm": 0.032337501645088196, "grad_norm_var": 3.9771377962677e-06, "learning_rate": 0.0034489476563113386, "loss": 2.5738, "step": 16782 }, { "crossentropy": 2.5822393894195557, "epoch": 0.6084324245939675, "grad_norm": 0.031191639602184296, "grad_norm_var": 4.480975605819837e-06, "learning_rate": 0.003448395245341268, "loss": 2.4838, "step": 16783 }, { "crossentropy": 2.639183282852173, "epoch": 0.6084686774941995, "grad_norm": 0.026198038831353188, "grad_norm_var": 4.702470545758728e-06, "learning_rate": 0.003447842855327973, "loss": 2.6412, "step": 16784 }, { "crossentropy": 2.4250950813293457, "epoch": 0.6085049303944315, "grad_norm": 0.027238281443715096, "grad_norm_var": 4.6123838229050175e-06, "learning_rate": 0.0034472904862789156, "loss": 2.42, "step": 16785 }, { "crossentropy": 2.714845657348633, "epoch": 0.6085411832946636, "grad_norm": 0.02807770110666752, "grad_norm_var": 4.440610738134588e-06, "learning_rate": 0.0034467381382015574, "loss": 2.6509, "step": 16786 }, { "crossentropy": 2.53530216217041, "epoch": 0.6085774361948956, "grad_norm": 0.026651084423065186, "grad_norm_var": 4.543375125434096e-06, "learning_rate": 0.0034461858111033577, "loss": 2.5028, "step": 16787 }, { "crossentropy": 2.5879974365234375, "epoch": 0.6086136890951276, "grad_norm": 0.02677777223289013, "grad_norm_var": 4.458201791578348e-06, "learning_rate": 0.0034456335049917776, "loss": 2.6053, "step": 16788 }, { "crossentropy": 2.5147252082824707, "epoch": 0.6086499419953596, "grad_norm": 0.02549753338098526, "grad_norm_var": 4.676488853457339e-06, "learning_rate": 0.0034450812198742755, "loss": 2.5248, "step": 16789 }, { "crossentropy": 2.5974984169006348, "epoch": 0.6086861948955916, "grad_norm": 0.026152146980166435, "grad_norm_var": 4.811226096873205e-06, "learning_rate": 0.0034445289557583103, "loss": 2.5932, "step": 16790 }, { "crossentropy": 2.594017505645752, "epoch": 0.6087224477958236, "grad_norm": 0.027973784133791924, "grad_norm_var": 4.666092560260254e-06, "learning_rate": 0.003443976712651342, "loss": 2.5308, "step": 16791 }, { "crossentropy": 2.609200954437256, "epoch": 0.6087587006960556, "grad_norm": 0.028234370052814484, "grad_norm_var": 4.666427378323864e-06, "learning_rate": 0.003443424490560829, "loss": 2.5269, "step": 16792 }, { "crossentropy": 2.505343437194824, "epoch": 0.6087949535962877, "grad_norm": 0.025911204516887665, "grad_norm_var": 4.945643276436914e-06, "learning_rate": 0.003442872289494231, "loss": 2.5842, "step": 16793 }, { "crossentropy": 2.5609476566314697, "epoch": 0.6088312064965197, "grad_norm": 0.02760440483689308, "grad_norm_var": 3.304399248190435e-06, "learning_rate": 0.003442320109459006, "loss": 2.5952, "step": 16794 }, { "crossentropy": 2.391693115234375, "epoch": 0.6088674593967517, "grad_norm": 0.02721559815108776, "grad_norm_var": 3.286752073124394e-06, "learning_rate": 0.003441767950462612, "loss": 2.4535, "step": 16795 }, { "crossentropy": 2.5510520935058594, "epoch": 0.6089037122969838, "grad_norm": 0.02597726881504059, "grad_norm_var": 3.4594833713153987e-06, "learning_rate": 0.0034412158125125074, "loss": 2.5268, "step": 16796 }, { "crossentropy": 2.480950117111206, "epoch": 0.6089399651972158, "grad_norm": 0.027754008769989014, "grad_norm_var": 3.4613089271602007e-06, "learning_rate": 0.003440663695616148, "loss": 2.5568, "step": 16797 }, { "crossentropy": 2.45298433303833, "epoch": 0.6089762180974478, "grad_norm": 0.02651679329574108, "grad_norm_var": 1.8629242354508321e-06, "learning_rate": 0.003440111599780992, "loss": 2.4792, "step": 16798 }, { "crossentropy": 2.6095778942108154, "epoch": 0.6090124709976799, "grad_norm": 0.02845635451376438, "grad_norm_var": 8.695606676108047e-07, "learning_rate": 0.003439559525014495, "loss": 2.5884, "step": 16799 }, { "crossentropy": 2.44069504737854, "epoch": 0.6090487238979119, "grad_norm": 0.02651507593691349, "grad_norm_var": 8.413180884104943e-07, "learning_rate": 0.0034390074713241143, "loss": 2.4942, "step": 16800 }, { "crossentropy": 2.473297595977783, "epoch": 0.6090849767981439, "grad_norm": 0.027506522834300995, "grad_norm_var": 8.531004371155337e-07, "learning_rate": 0.0034384554387173073, "loss": 2.4623, "step": 16801 }, { "crossentropy": 2.4684391021728516, "epoch": 0.6091212296983759, "grad_norm": 0.02844884991645813, "grad_norm_var": 9.125003601048586e-07, "learning_rate": 0.0034379034272015298, "loss": 2.6135, "step": 16802 }, { "crossentropy": 2.6110899448394775, "epoch": 0.6091574825986079, "grad_norm": 0.026829971000552177, "grad_norm_var": 9.044001184426697e-07, "learning_rate": 0.0034373514367842368, "loss": 2.5424, "step": 16803 }, { "crossentropy": 2.5586040019989014, "epoch": 0.6091937354988399, "grad_norm": 0.026210736483335495, "grad_norm_var": 9.47778692388434e-07, "learning_rate": 0.0034367994674728854, "loss": 2.6214, "step": 16804 }, { "crossentropy": 2.583251953125, "epoch": 0.6092299883990719, "grad_norm": 0.02860059030354023, "grad_norm_var": 9.071503007703739e-07, "learning_rate": 0.0034362475192749277, "loss": 2.5486, "step": 16805 }, { "crossentropy": 2.488020658493042, "epoch": 0.609266241299304, "grad_norm": 0.027069907635450363, "grad_norm_var": 8.26156975017297e-07, "learning_rate": 0.003435695592197821, "loss": 2.4647, "step": 16806 }, { "crossentropy": 2.5963826179504395, "epoch": 0.609302494199536, "grad_norm": 0.028918400406837463, "grad_norm_var": 9.665877785446895e-07, "learning_rate": 0.0034351436862490193, "loss": 2.6104, "step": 16807 }, { "crossentropy": 2.4784741401672363, "epoch": 0.609338747099768, "grad_norm": 0.027136191725730896, "grad_norm_var": 9.140259811370987e-07, "learning_rate": 0.0034345918014359756, "loss": 2.5761, "step": 16808 }, { "crossentropy": 2.5158822536468506, "epoch": 0.609375, "grad_norm": 0.02662295289337635, "grad_norm_var": 8.146511149869356e-07, "learning_rate": 0.003434039937766146, "loss": 2.4869, "step": 16809 }, { "crossentropy": 2.630108594894409, "epoch": 0.609411252900232, "grad_norm": 0.028737111017107964, "grad_norm_var": 9.35304581845912e-07, "learning_rate": 0.0034334880952469836, "loss": 2.5973, "step": 16810 }, { "crossentropy": 2.5587995052337646, "epoch": 0.609447505800464, "grad_norm": 0.026678407564759254, "grad_norm_var": 9.670690761025165e-07, "learning_rate": 0.0034329362738859414, "loss": 2.5319, "step": 16811 }, { "crossentropy": 2.527256965637207, "epoch": 0.609483758700696, "grad_norm": 0.027894277125597, "grad_norm_var": 8.398231805743321e-07, "learning_rate": 0.003432384473690474, "loss": 2.4839, "step": 16812 }, { "crossentropy": 2.5909957885742188, "epoch": 0.6095200116009281, "grad_norm": 0.029377669095993042, "grad_norm_var": 1.0609852190498112e-06, "learning_rate": 0.0034318326946680333, "loss": 2.6123, "step": 16813 }, { "crossentropy": 2.476886510848999, "epoch": 0.6095562645011601, "grad_norm": 0.02691861055791378, "grad_norm_var": 1.013311310118827e-06, "learning_rate": 0.0034312809368260712, "loss": 2.4681, "step": 16814 }, { "crossentropy": 2.5623602867126465, "epoch": 0.6095925174013921, "grad_norm": 0.026823021471500397, "grad_norm_var": 9.979301394739121e-07, "learning_rate": 0.0034307292001720404, "loss": 2.6111, "step": 16815 }, { "crossentropy": 2.593888759613037, "epoch": 0.6096287703016241, "grad_norm": 0.028117654845118523, "grad_norm_var": 9.441403908311824e-07, "learning_rate": 0.0034301774847133927, "loss": 2.5964, "step": 16816 }, { "crossentropy": 2.730238437652588, "epoch": 0.6096650232018561, "grad_norm": 0.0265842042863369, "grad_norm_var": 1.0110384504468146e-06, "learning_rate": 0.0034296257904575813, "loss": 2.6395, "step": 16817 }, { "crossentropy": 2.453500747680664, "epoch": 0.6097012761020881, "grad_norm": 0.027340279892086983, "grad_norm_var": 9.565451467445178e-07, "learning_rate": 0.003429074117412056, "loss": 2.5311, "step": 16818 }, { "crossentropy": 2.4781417846679688, "epoch": 0.6097375290023201, "grad_norm": 0.029129870235919952, "grad_norm_var": 1.0843580979928337e-06, "learning_rate": 0.0034285224655842693, "loss": 2.5168, "step": 16819 }, { "crossentropy": 2.430521249771118, "epoch": 0.6097737819025522, "grad_norm": 0.02715511992573738, "grad_norm_var": 9.607601364644877e-07, "learning_rate": 0.0034279708349816713, "loss": 2.4997, "step": 16820 }, { "crossentropy": 2.4943835735321045, "epoch": 0.6098100348027842, "grad_norm": 0.02619899809360504, "grad_norm_var": 1.0309419797085995e-06, "learning_rate": 0.0034274192256117143, "loss": 2.5046, "step": 16821 }, { "crossentropy": 2.568464756011963, "epoch": 0.6098462877030162, "grad_norm": 0.027538252994418144, "grad_norm_var": 1.0150511605681722e-06, "learning_rate": 0.003426867637481845, "loss": 2.5387, "step": 16822 }, { "crossentropy": 2.596359968185425, "epoch": 0.6098825406032483, "grad_norm": 0.02613055892288685, "grad_norm_var": 1.0007733872769223e-06, "learning_rate": 0.0034263160705995167, "loss": 2.6146, "step": 16823 }, { "crossentropy": 2.4281842708587646, "epoch": 0.6099187935034803, "grad_norm": 0.025879060849547386, "grad_norm_var": 1.1435896839762644e-06, "learning_rate": 0.0034257645249721764, "loss": 2.5038, "step": 16824 }, { "crossentropy": 2.7057597637176514, "epoch": 0.6099550464037123, "grad_norm": 0.02680562622845173, "grad_norm_var": 1.1286884819625228e-06, "learning_rate": 0.0034252130006072767, "loss": 2.5897, "step": 16825 }, { "crossentropy": 2.486894369125366, "epoch": 0.6099912993039444, "grad_norm": 0.025930486619472504, "grad_norm_var": 1.0951172738008288e-06, "learning_rate": 0.0034246614975122647, "loss": 2.5109, "step": 16826 }, { "crossentropy": 2.4240872859954834, "epoch": 0.6100275522041764, "grad_norm": 0.029166387394070625, "grad_norm_var": 1.3234365091253802e-06, "learning_rate": 0.0034241100156945903, "loss": 2.454, "step": 16827 }, { "crossentropy": 2.4662225246429443, "epoch": 0.6100638051044084, "grad_norm": 0.027952319011092186, "grad_norm_var": 1.3281541872485798e-06, "learning_rate": 0.003423558555161701, "loss": 2.5738, "step": 16828 }, { "crossentropy": 2.494235038757324, "epoch": 0.6101000580046404, "grad_norm": 0.028383513912558556, "grad_norm_var": 1.1165779051779007e-06, "learning_rate": 0.0034230071159210454, "loss": 2.5116, "step": 16829 }, { "crossentropy": 2.490079641342163, "epoch": 0.6101363109048724, "grad_norm": 0.027368389070034027, "grad_norm_var": 1.1091458494775676e-06, "learning_rate": 0.003422455697980075, "loss": 2.5114, "step": 16830 }, { "crossentropy": 2.6020195484161377, "epoch": 0.6101725638051044, "grad_norm": 0.026851674541831017, "grad_norm_var": 1.107445647395971e-06, "learning_rate": 0.0034219043013462313, "loss": 2.5436, "step": 16831 }, { "crossentropy": 2.4648311138153076, "epoch": 0.6102088167053364, "grad_norm": 0.027560468763113022, "grad_norm_var": 1.0648618416949125e-06, "learning_rate": 0.003421352926026965, "loss": 2.5266, "step": 16832 }, { "crossentropy": 2.540877342224121, "epoch": 0.6102450696055685, "grad_norm": 0.025818083435297012, "grad_norm_var": 1.169398064863519e-06, "learning_rate": 0.0034208015720297237, "loss": 2.5092, "step": 16833 }, { "crossentropy": 2.476454019546509, "epoch": 0.6102813225058005, "grad_norm": 0.027875656262040138, "grad_norm_var": 1.1972854236081173e-06, "learning_rate": 0.003420250239361954, "loss": 2.6083, "step": 16834 }, { "crossentropy": 2.622162103652954, "epoch": 0.6103175754060325, "grad_norm": 0.026181282475590706, "grad_norm_var": 9.95330521912953e-07, "learning_rate": 0.0034196989280311013, "loss": 2.4948, "step": 16835 }, { "crossentropy": 2.463714361190796, "epoch": 0.6103538283062645, "grad_norm": 0.025670140981674194, "grad_norm_var": 1.1122887229137365e-06, "learning_rate": 0.0034191476380446137, "loss": 2.4631, "step": 16836 }, { "crossentropy": 2.637686014175415, "epoch": 0.6103900812064965, "grad_norm": 0.025818416848778725, "grad_norm_var": 1.1598020376366354e-06, "learning_rate": 0.003418596369409935, "loss": 2.6365, "step": 16837 }, { "crossentropy": 2.595409870147705, "epoch": 0.6104263341067285, "grad_norm": 0.025973312556743622, "grad_norm_var": 1.1866058638333616e-06, "learning_rate": 0.003418045122134515, "loss": 2.5946, "step": 16838 }, { "crossentropy": 2.5335540771484375, "epoch": 0.6104625870069605, "grad_norm": 0.027074752375483513, "grad_norm_var": 1.153598558606648e-06, "learning_rate": 0.003417493896225794, "loss": 2.4925, "step": 16839 }, { "crossentropy": 2.496957778930664, "epoch": 0.6104988399071926, "grad_norm": 0.0308462493121624, "grad_norm_var": 2.0232421480586057e-06, "learning_rate": 0.0034169426916912196, "loss": 2.5873, "step": 16840 }, { "crossentropy": 2.650554656982422, "epoch": 0.6105350928074246, "grad_norm": 0.027506006881594658, "grad_norm_var": 2.01662422071532e-06, "learning_rate": 0.003416391508538237, "loss": 2.6575, "step": 16841 }, { "crossentropy": 2.6249654293060303, "epoch": 0.6105713457076566, "grad_norm": 0.03370991349220276, "grad_norm_var": 4.431899446983193e-06, "learning_rate": 0.0034158403467742893, "loss": 2.554, "step": 16842 }, { "crossentropy": 2.500354766845703, "epoch": 0.6106075986078886, "grad_norm": 0.02615092694759369, "grad_norm_var": 4.424620184712014e-06, "learning_rate": 0.0034152892064068232, "loss": 2.5128, "step": 16843 }, { "crossentropy": 2.448873519897461, "epoch": 0.6106438515081206, "grad_norm": 0.030126960948109627, "grad_norm_var": 4.837907474954973e-06, "learning_rate": 0.0034147380874432805, "loss": 2.5776, "step": 16844 }, { "crossentropy": 2.5307083129882812, "epoch": 0.6106801044083526, "grad_norm": 0.027175424620509148, "grad_norm_var": 4.816163839845879e-06, "learning_rate": 0.003414186989891105, "loss": 2.5289, "step": 16845 }, { "crossentropy": 2.4932022094726562, "epoch": 0.6107163573085846, "grad_norm": 0.02572808228433132, "grad_norm_var": 5.0364534227171114e-06, "learning_rate": 0.0034136359137577413, "loss": 2.5266, "step": 16846 }, { "crossentropy": 2.3328611850738525, "epoch": 0.6107526102088167, "grad_norm": 0.025351352989673615, "grad_norm_var": 5.307673697469278e-06, "learning_rate": 0.0034130848590506343, "loss": 2.4874, "step": 16847 }, { "crossentropy": 2.376962184906006, "epoch": 0.6107888631090487, "grad_norm": 0.02671092376112938, "grad_norm_var": 5.335787396277119e-06, "learning_rate": 0.003412533825777221, "loss": 2.4425, "step": 16848 }, { "crossentropy": 2.36000919342041, "epoch": 0.6108251160092807, "grad_norm": 0.026529081165790558, "grad_norm_var": 5.221460926947928e-06, "learning_rate": 0.0034119828139449492, "loss": 2.4223, "step": 16849 }, { "crossentropy": 2.5473339557647705, "epoch": 0.6108613689095128, "grad_norm": 0.027486717328429222, "grad_norm_var": 5.2063410092432e-06, "learning_rate": 0.003411431823561259, "loss": 2.4897, "step": 16850 }, { "crossentropy": 2.6654458045959473, "epoch": 0.6108976218097448, "grad_norm": 0.027104496955871582, "grad_norm_var": 5.112366109021266e-06, "learning_rate": 0.003410880854633593, "loss": 2.5883, "step": 16851 }, { "crossentropy": 2.535141706466675, "epoch": 0.6109338747099768, "grad_norm": 0.02619992569088936, "grad_norm_var": 5.005229873372577e-06, "learning_rate": 0.0034103299071693927, "loss": 2.5184, "step": 16852 }, { "crossentropy": 2.6212470531463623, "epoch": 0.6109701276102089, "grad_norm": 0.0280903410166502, "grad_norm_var": 4.8280492604400494e-06, "learning_rate": 0.0034097789811760983, "loss": 2.6439, "step": 16853 }, { "crossentropy": 2.597637176513672, "epoch": 0.6110063805104409, "grad_norm": 0.02769056148827076, "grad_norm_var": 4.63754767506813e-06, "learning_rate": 0.003409228076661153, "loss": 2.5385, "step": 16854 }, { "crossentropy": 2.586524486541748, "epoch": 0.6110426334106729, "grad_norm": 0.028264479711651802, "grad_norm_var": 4.624037089503754e-06, "learning_rate": 0.003408677193631998, "loss": 2.5747, "step": 16855 }, { "crossentropy": 2.57537841796875, "epoch": 0.6110788863109049, "grad_norm": 0.028351373970508575, "grad_norm_var": 3.997054555108086e-06, "learning_rate": 0.00340812633209607, "loss": 2.5461, "step": 16856 }, { "crossentropy": 2.5242841243743896, "epoch": 0.6111151392111369, "grad_norm": 0.027405260130763054, "grad_norm_var": 3.999435586840514e-06, "learning_rate": 0.0034075754920608126, "loss": 2.569, "step": 16857 }, { "crossentropy": 2.5845448970794678, "epoch": 0.6111513921113689, "grad_norm": 0.02646823041141033, "grad_norm_var": 1.4062992951674314e-06, "learning_rate": 0.0034070246735336644, "loss": 2.5064, "step": 16858 }, { "crossentropy": 2.4650394916534424, "epoch": 0.6111876450116009, "grad_norm": 0.02688070945441723, "grad_norm_var": 1.339731335685268e-06, "learning_rate": 0.003406473876522066, "loss": 2.4756, "step": 16859 }, { "crossentropy": 2.5322799682617188, "epoch": 0.611223897911833, "grad_norm": 0.026877442374825478, "grad_norm_var": 7.413849175595677e-07, "learning_rate": 0.003405923101033456, "loss": 2.5629, "step": 16860 }, { "crossentropy": 2.556360960006714, "epoch": 0.611260150812065, "grad_norm": 0.027011731639504433, "grad_norm_var": 7.396597392054019e-07, "learning_rate": 0.0034053723470752724, "loss": 2.57, "step": 16861 }, { "crossentropy": 2.5513131618499756, "epoch": 0.611296403712297, "grad_norm": 0.02595403976738453, "grad_norm_var": 7.042470936501529e-07, "learning_rate": 0.003404821614654956, "loss": 2.5689, "step": 16862 }, { "crossentropy": 2.540581464767456, "epoch": 0.611332656612529, "grad_norm": 0.02738841623067856, "grad_norm_var": 5.094181941412794e-07, "learning_rate": 0.003404270903779946, "loss": 2.5729, "step": 16863 }, { "crossentropy": 2.6582319736480713, "epoch": 0.611368909512761, "grad_norm": 0.02881607785820961, "grad_norm_var": 6.62913835181254e-07, "learning_rate": 0.003403720214457678, "loss": 2.5635, "step": 16864 }, { "crossentropy": 2.609273910522461, "epoch": 0.611405162412993, "grad_norm": 0.027249442413449287, "grad_norm_var": 6.22988550288495e-07, "learning_rate": 0.0034031695466955904, "loss": 2.6072, "step": 16865 }, { "crossentropy": 2.6828830242156982, "epoch": 0.611441415313225, "grad_norm": 0.026412665843963623, "grad_norm_var": 6.722799531908367e-07, "learning_rate": 0.003402618900501122, "loss": 2.6975, "step": 16866 }, { "crossentropy": 2.5969605445861816, "epoch": 0.6114776682134571, "grad_norm": 0.02523769624531269, "grad_norm_var": 9.288755843705425e-07, "learning_rate": 0.003402068275881709, "loss": 2.5301, "step": 16867 }, { "crossentropy": 2.4955716133117676, "epoch": 0.6115139211136891, "grad_norm": 0.027201887220144272, "grad_norm_var": 8.655443371036273e-07, "learning_rate": 0.0034015176728447896, "loss": 2.5841, "step": 16868 }, { "crossentropy": 2.3072292804718018, "epoch": 0.6115501740139211, "grad_norm": 0.02605779469013214, "grad_norm_var": 8.84159022379825e-07, "learning_rate": 0.003400967091397798, "loss": 2.4337, "step": 16869 }, { "crossentropy": 2.533094644546509, "epoch": 0.6115864269141531, "grad_norm": 0.02790447697043419, "grad_norm_var": 9.044552172217246e-07, "learning_rate": 0.003400416531548174, "loss": 2.5622, "step": 16870 }, { "crossentropy": 2.541813611984253, "epoch": 0.6116226798143851, "grad_norm": 0.027141060680150986, "grad_norm_var": 8.078008524152354e-07, "learning_rate": 0.0033998659933033516, "loss": 2.5975, "step": 16871 }, { "crossentropy": 2.632967472076416, "epoch": 0.6116589327146171, "grad_norm": 0.026801951229572296, "grad_norm_var": 6.832917320402528e-07, "learning_rate": 0.0033993154766707684, "loss": 2.5945, "step": 16872 }, { "crossentropy": 2.5997087955474854, "epoch": 0.6116951856148491, "grad_norm": 0.026571352034807205, "grad_norm_var": 6.734170912382619e-07, "learning_rate": 0.003398764981657858, "loss": 2.56, "step": 16873 }, { "crossentropy": 2.5956075191497803, "epoch": 0.6117314385150812, "grad_norm": 0.03048892319202423, "grad_norm_var": 1.4665626502138296e-06, "learning_rate": 0.0033982145082720565, "loss": 2.5441, "step": 16874 }, { "crossentropy": 2.463038682937622, "epoch": 0.6117676914153132, "grad_norm": 0.028971383348107338, "grad_norm_var": 1.6717228772014648e-06, "learning_rate": 0.003397664056520799, "loss": 2.5552, "step": 16875 }, { "crossentropy": 2.647695541381836, "epoch": 0.6118039443155452, "grad_norm": 0.02615349180996418, "grad_norm_var": 1.740962070153479e-06, "learning_rate": 0.003397113626411519, "loss": 2.5211, "step": 16876 }, { "crossentropy": 2.571558713912964, "epoch": 0.6118401972157773, "grad_norm": 0.026090286672115326, "grad_norm_var": 1.8184058507542209e-06, "learning_rate": 0.0033965632179516524, "loss": 2.5823, "step": 16877 }, { "crossentropy": 2.536968231201172, "epoch": 0.6118764501160093, "grad_norm": 0.025465620681643486, "grad_norm_var": 1.911366057320363e-06, "learning_rate": 0.0033960128311486338, "loss": 2.5047, "step": 16878 }, { "crossentropy": 2.5346310138702393, "epoch": 0.6119127030162413, "grad_norm": 0.027591250836849213, "grad_norm_var": 1.9211416578392465e-06, "learning_rate": 0.0033954624660098953, "loss": 2.5601, "step": 16879 }, { "crossentropy": 2.5883758068084717, "epoch": 0.6119489559164734, "grad_norm": 0.026108646765351295, "grad_norm_var": 1.772319792915413e-06, "learning_rate": 0.0033949121225428725, "loss": 2.6189, "step": 16880 }, { "crossentropy": 2.6251819133758545, "epoch": 0.6119852088167054, "grad_norm": 0.029398057609796524, "grad_norm_var": 2.1421996367187163e-06, "learning_rate": 0.003394361800754996, "loss": 2.5312, "step": 16881 }, { "crossentropy": 2.5936648845672607, "epoch": 0.6120214617169374, "grad_norm": 0.03287315368652344, "grad_norm_var": 4.158936020121175e-06, "learning_rate": 0.0033938115006537, "loss": 2.5569, "step": 16882 }, { "crossentropy": 2.391305685043335, "epoch": 0.6120577146171694, "grad_norm": 0.026865899562835693, "grad_norm_var": 3.832720480541572e-06, "learning_rate": 0.0033932612222464177, "loss": 2.4639, "step": 16883 }, { "crossentropy": 2.572160482406616, "epoch": 0.6120939675174014, "grad_norm": 0.027270661666989326, "grad_norm_var": 3.8293165849391375e-06, "learning_rate": 0.0033927109655405795, "loss": 2.5849, "step": 16884 }, { "crossentropy": 2.57724666595459, "epoch": 0.6121302204176334, "grad_norm": 0.02621740661561489, "grad_norm_var": 3.7978834011445986e-06, "learning_rate": 0.00339216073054362, "loss": 2.6009, "step": 16885 }, { "crossentropy": 2.4877431392669678, "epoch": 0.6121664733178654, "grad_norm": 0.02621445618569851, "grad_norm_var": 3.912201304160029e-06, "learning_rate": 0.0033916105172629696, "loss": 2.5389, "step": 16886 }, { "crossentropy": 2.6303539276123047, "epoch": 0.6122027262180975, "grad_norm": 0.026765519753098488, "grad_norm_var": 3.939688360889553e-06, "learning_rate": 0.00339106032570606, "loss": 2.4607, "step": 16887 }, { "crossentropy": 2.5426862239837646, "epoch": 0.6122389791183295, "grad_norm": 0.0275088082998991, "grad_norm_var": 3.906021854550611e-06, "learning_rate": 0.0033905101558803223, "loss": 2.5352, "step": 16888 }, { "crossentropy": 2.593656539916992, "epoch": 0.6122752320185615, "grad_norm": 0.02820032276213169, "grad_norm_var": 3.862636856674882e-06, "learning_rate": 0.0033899600077931893, "loss": 2.561, "step": 16889 }, { "crossentropy": 2.475999355316162, "epoch": 0.6123114849187935, "grad_norm": 0.026454638689756393, "grad_norm_var": 3.3455172983730293e-06, "learning_rate": 0.003389409881452088, "loss": 2.5473, "step": 16890 }, { "crossentropy": 2.3209848403930664, "epoch": 0.6123477378190255, "grad_norm": 0.025654124096035957, "grad_norm_var": 3.331333686553267e-06, "learning_rate": 0.00338885977686445, "loss": 2.4641, "step": 16891 }, { "crossentropy": 2.5022177696228027, "epoch": 0.6123839907192575, "grad_norm": 0.025808176025748253, "grad_norm_var": 3.3859118393667063e-06, "learning_rate": 0.003388309694037705, "loss": 2.5452, "step": 16892 }, { "crossentropy": 2.4687235355377197, "epoch": 0.6124202436194895, "grad_norm": 0.027997763827443123, "grad_norm_var": 3.3424155664019217e-06, "learning_rate": 0.0033877596329792846, "loss": 2.4757, "step": 16893 }, { "crossentropy": 2.6041696071624756, "epoch": 0.6124564965197216, "grad_norm": 0.028005456551909447, "grad_norm_var": 3.1329678504068837e-06, "learning_rate": 0.0033872095936966174, "loss": 2.5908, "step": 16894 }, { "crossentropy": 2.5447511672973633, "epoch": 0.6124927494199536, "grad_norm": 0.026854034513235092, "grad_norm_var": 3.1514194655888334e-06, "learning_rate": 0.0033866595761971314, "loss": 2.4577, "step": 16895 }, { "crossentropy": 2.3534610271453857, "epoch": 0.6125290023201856, "grad_norm": 0.026630451902747154, "grad_norm_var": 3.0794745393733357e-06, "learning_rate": 0.003386109580488257, "loss": 2.4261, "step": 16896 }, { "crossentropy": 2.5241687297821045, "epoch": 0.6125652552204176, "grad_norm": 0.027185659855604172, "grad_norm_var": 2.8018738048595496e-06, "learning_rate": 0.0033855596065774218, "loss": 2.4618, "step": 16897 }, { "crossentropy": 2.5651416778564453, "epoch": 0.6126015081206496, "grad_norm": 0.02598733641207218, "grad_norm_var": 6.316767748453777e-07, "learning_rate": 0.003385009654472054, "loss": 2.6308, "step": 16898 }, { "crossentropy": 2.6304025650024414, "epoch": 0.6126377610208816, "grad_norm": 0.027188653126358986, "grad_norm_var": 6.388158900478968e-07, "learning_rate": 0.003384459724179581, "loss": 2.6228, "step": 16899 }, { "crossentropy": 2.584451198577881, "epoch": 0.6126740139211136, "grad_norm": 0.026810048148036003, "grad_norm_var": 6.275595896196856e-07, "learning_rate": 0.0033839098157074305, "loss": 2.5488, "step": 16900 }, { "crossentropy": 2.502713680267334, "epoch": 0.6127102668213457, "grad_norm": 0.02721148543059826, "grad_norm_var": 6.064456885795325e-07, "learning_rate": 0.003383359929063031, "loss": 2.5663, "step": 16901 }, { "crossentropy": 2.480440616607666, "epoch": 0.6127465197215777, "grad_norm": 0.02657782845199108, "grad_norm_var": 5.812508345962118e-07, "learning_rate": 0.0033828100642538097, "loss": 2.4802, "step": 16902 }, { "crossentropy": 2.550192356109619, "epoch": 0.6127827726218097, "grad_norm": 0.027364768087863922, "grad_norm_var": 5.907507719483951e-07, "learning_rate": 0.0033822602212871923, "loss": 2.5403, "step": 16903 }, { "crossentropy": 2.355694055557251, "epoch": 0.6128190255220418, "grad_norm": 0.027022378519177437, "grad_norm_var": 5.702674033403535e-07, "learning_rate": 0.0033817104001706054, "loss": 2.4294, "step": 16904 }, { "crossentropy": 2.58831524848938, "epoch": 0.6128552784222738, "grad_norm": 0.02783854864537716, "grad_norm_var": 5.173919096025659e-07, "learning_rate": 0.003381160600911477, "loss": 2.5855, "step": 16905 }, { "crossentropy": 2.437338352203369, "epoch": 0.6128915313225058, "grad_norm": 0.025930387899279594, "grad_norm_var": 5.665361143865431e-07, "learning_rate": 0.0033806108235172293, "loss": 2.5366, "step": 16906 }, { "crossentropy": 2.5490076541900635, "epoch": 0.6129277842227379, "grad_norm": 0.02771201729774475, "grad_norm_var": 4.950771972922022e-07, "learning_rate": 0.0033800610679952914, "loss": 2.5637, "step": 16907 }, { "crossentropy": 2.5886943340301514, "epoch": 0.6129640371229699, "grad_norm": 0.02699876017868519, "grad_norm_var": 3.9323465726339113e-07, "learning_rate": 0.0033795113343530847, "loss": 2.4741, "step": 16908 }, { "crossentropy": 2.5157599449157715, "epoch": 0.6130002900232019, "grad_norm": 0.02663317881524563, "grad_norm_var": 3.430377744350714e-07, "learning_rate": 0.0033789616225980384, "loss": 2.5168, "step": 16909 }, { "crossentropy": 2.5967233180999756, "epoch": 0.6130365429234339, "grad_norm": 0.027017833665013313, "grad_norm_var": 2.711952927854251e-07, "learning_rate": 0.003378411932737575, "loss": 2.5594, "step": 16910 }, { "crossentropy": 2.601348876953125, "epoch": 0.6130727958236659, "grad_norm": 0.02720651961863041, "grad_norm_var": 2.751455333016428e-07, "learning_rate": 0.00337786226477912, "loss": 2.5994, "step": 16911 }, { "crossentropy": 2.5852620601654053, "epoch": 0.6131090487238979, "grad_norm": 0.027194373309612274, "grad_norm_var": 2.7044987695312485e-07, "learning_rate": 0.003377312618730096, "loss": 2.5562, "step": 16912 }, { "crossentropy": 2.5655744075775146, "epoch": 0.61314530162413, "grad_norm": 0.02721404656767845, "grad_norm_var": 2.712313821955891e-07, "learning_rate": 0.003376762994597927, "loss": 2.6094, "step": 16913 }, { "crossentropy": 2.468170404434204, "epoch": 0.613181554524362, "grad_norm": 0.026153629645705223, "grad_norm_var": 2.5063377245652475e-07, "learning_rate": 0.00337621339239004, "loss": 2.4861, "step": 16914 }, { "crossentropy": 2.3950467109680176, "epoch": 0.613217807424594, "grad_norm": 0.02609199471771717, "grad_norm_var": 2.988953827348188e-07, "learning_rate": 0.0033756638121138537, "loss": 2.5156, "step": 16915 }, { "crossentropy": 2.5206384658813477, "epoch": 0.613254060324826, "grad_norm": 0.025186952203512192, "grad_norm_var": 4.908298354636218e-07, "learning_rate": 0.003375114253776791, "loss": 2.5336, "step": 16916 }, { "crossentropy": 2.5003068447113037, "epoch": 0.613290313225058, "grad_norm": 0.026740608736872673, "grad_norm_var": 4.810297588800448e-07, "learning_rate": 0.0033745647173862776, "loss": 2.5379, "step": 16917 }, { "crossentropy": 2.3829734325408936, "epoch": 0.61332656612529, "grad_norm": 0.02869047410786152, "grad_norm_var": 6.959258090203826e-07, "learning_rate": 0.003374015202949734, "loss": 2.4977, "step": 16918 }, { "crossentropy": 2.4473869800567627, "epoch": 0.613362819025522, "grad_norm": 0.025974296033382416, "grad_norm_var": 7.375093012609481e-07, "learning_rate": 0.003373465710474583, "loss": 2.4505, "step": 16919 }, { "crossentropy": 2.5964772701263428, "epoch": 0.613399071925754, "grad_norm": 0.027428943663835526, "grad_norm_var": 7.57164337198725e-07, "learning_rate": 0.0033729162399682455, "loss": 2.4803, "step": 16920 }, { "crossentropy": 2.646272659301758, "epoch": 0.6134353248259861, "grad_norm": 0.02641269564628601, "grad_norm_var": 7.01195855485577e-07, "learning_rate": 0.003372366791438143, "loss": 2.5206, "step": 16921 }, { "crossentropy": 2.6503918170928955, "epoch": 0.6134715777262181, "grad_norm": 0.0267227441072464, "grad_norm_var": 6.499711216844765e-07, "learning_rate": 0.0033718173648916994, "loss": 2.6058, "step": 16922 }, { "crossentropy": 2.578845739364624, "epoch": 0.6135078306264501, "grad_norm": 0.02707417495548725, "grad_norm_var": 6.00913652536505e-07, "learning_rate": 0.0033712679603363316, "loss": 2.672, "step": 16923 }, { "crossentropy": 2.4599597454071045, "epoch": 0.6135440835266821, "grad_norm": 0.028078000992536545, "grad_norm_var": 7.028411391047165e-07, "learning_rate": 0.00337071857777946, "loss": 2.5426, "step": 16924 }, { "crossentropy": 2.71142840385437, "epoch": 0.6135803364269141, "grad_norm": 0.03530361130833626, "grad_norm_var": 5.134778802531172e-06, "learning_rate": 0.0033701692172285083, "loss": 2.5384, "step": 16925 }, { "crossentropy": 2.2899093627929688, "epoch": 0.6136165893271461, "grad_norm": 0.026853524148464203, "grad_norm_var": 5.144963092512898e-06, "learning_rate": 0.003369619878690895, "loss": 2.3536, "step": 16926 }, { "crossentropy": 2.559149742126465, "epoch": 0.6136528422273781, "grad_norm": 0.028179766610264778, "grad_norm_var": 5.17965185840343e-06, "learning_rate": 0.0033690705621740383, "loss": 2.5965, "step": 16927 }, { "crossentropy": 2.4389495849609375, "epoch": 0.6136890951276102, "grad_norm": 0.02825978398323059, "grad_norm_var": 5.213396222248043e-06, "learning_rate": 0.0033685212676853593, "loss": 2.439, "step": 16928 }, { "crossentropy": 2.5287294387817383, "epoch": 0.6137253480278422, "grad_norm": 0.025960279628634453, "grad_norm_var": 5.3632605975308195e-06, "learning_rate": 0.003367971995232275, "loss": 2.4751, "step": 16929 }, { "crossentropy": 2.6863059997558594, "epoch": 0.6137616009280742, "grad_norm": 0.027807554230093956, "grad_norm_var": 5.249567454073409e-06, "learning_rate": 0.003367422744822207, "loss": 2.6466, "step": 16930 }, { "crossentropy": 2.5358378887176514, "epoch": 0.6137978538283063, "grad_norm": 0.02891298197209835, "grad_norm_var": 5.199351878457552e-06, "learning_rate": 0.003366873516462574, "loss": 2.5102, "step": 16931 }, { "crossentropy": 2.5783274173736572, "epoch": 0.6138341067285383, "grad_norm": 0.026975415647029877, "grad_norm_var": 4.794239851448852e-06, "learning_rate": 0.00336632431016079, "loss": 2.5415, "step": 16932 }, { "crossentropy": 2.4671072959899902, "epoch": 0.6138703596287703, "grad_norm": 0.026572344824671745, "grad_norm_var": 4.8205831027653035e-06, "learning_rate": 0.0033657751259242763, "loss": 2.4655, "step": 16933 }, { "crossentropy": 2.57360577583313, "epoch": 0.6139066125290024, "grad_norm": 0.026334991678595543, "grad_norm_var": 4.895666546129056e-06, "learning_rate": 0.00336522596376045, "loss": 2.516, "step": 16934 }, { "crossentropy": 2.6146953105926514, "epoch": 0.6139428654292344, "grad_norm": 0.026848455891013145, "grad_norm_var": 4.744828995426458e-06, "learning_rate": 0.0033646768236767267, "loss": 2.5822, "step": 16935 }, { "crossentropy": 2.6982290744781494, "epoch": 0.6139791183294664, "grad_norm": 0.027090227231383324, "grad_norm_var": 4.765723689294912e-06, "learning_rate": 0.0033641277056805255, "loss": 2.6374, "step": 16936 }, { "crossentropy": 2.520601272583008, "epoch": 0.6140153712296984, "grad_norm": 0.027486665174365044, "grad_norm_var": 4.651805494580198e-06, "learning_rate": 0.003363578609779261, "loss": 2.5069, "step": 16937 }, { "crossentropy": 2.7135214805603027, "epoch": 0.6140516241299304, "grad_norm": 0.02630060724914074, "grad_norm_var": 4.722382003954555e-06, "learning_rate": 0.003363029535980351, "loss": 2.5639, "step": 16938 }, { "crossentropy": 2.583867311477661, "epoch": 0.6140878770301624, "grad_norm": 0.02676459588110447, "grad_norm_var": 4.7563671543522505e-06, "learning_rate": 0.0033624804842912145, "loss": 2.5787, "step": 16939 }, { "crossentropy": 2.446686029434204, "epoch": 0.6141241299303944, "grad_norm": 0.02771740034222603, "grad_norm_var": 4.74790895191901e-06, "learning_rate": 0.0033619314547192604, "loss": 2.503, "step": 16940 }, { "crossentropy": 2.465475559234619, "epoch": 0.6141603828306265, "grad_norm": 0.026433032006025314, "grad_norm_var": 6.851666025063507e-07, "learning_rate": 0.003361382447271909, "loss": 2.5387, "step": 16941 }, { "crossentropy": 2.468491554260254, "epoch": 0.6141966357308585, "grad_norm": 0.025795726105570793, "grad_norm_var": 7.97775598954803e-07, "learning_rate": 0.003360833461956574, "loss": 2.5268, "step": 16942 }, { "crossentropy": 2.522156238555908, "epoch": 0.6142328886310905, "grad_norm": 0.026044800877571106, "grad_norm_var": 7.724372151531132e-07, "learning_rate": 0.003360284498780671, "loss": 2.3966, "step": 16943 }, { "crossentropy": 2.6149754524230957, "epoch": 0.6142691415313225, "grad_norm": 0.027308840304613113, "grad_norm_var": 6.637157972803984e-07, "learning_rate": 0.0033597355577516144, "loss": 2.5278, "step": 16944 }, { "crossentropy": 2.6201703548431396, "epoch": 0.6143053944315545, "grad_norm": 0.02730649895966053, "grad_norm_var": 6.088259477197843e-07, "learning_rate": 0.0033591866388768166, "loss": 2.5569, "step": 16945 }, { "crossentropy": 2.411320924758911, "epoch": 0.6143416473317865, "grad_norm": 0.0261350367218256, "grad_norm_var": 5.993922800452081e-07, "learning_rate": 0.003358637742163695, "loss": 2.4608, "step": 16946 }, { "crossentropy": 2.5835883617401123, "epoch": 0.6143779002320185, "grad_norm": 0.028064781799912453, "grad_norm_var": 4.140705130489432e-07, "learning_rate": 0.003358088867619663, "loss": 2.5451, "step": 16947 }, { "crossentropy": 2.5488665103912354, "epoch": 0.6144141531322506, "grad_norm": 0.026272766292095184, "grad_norm_var": 4.3071534235407785e-07, "learning_rate": 0.0033575400152521294, "loss": 2.581, "step": 16948 }, { "crossentropy": 2.526484727859497, "epoch": 0.6144504060324826, "grad_norm": 0.02750631794333458, "grad_norm_var": 4.5940033612425444e-07, "learning_rate": 0.003356991185068512, "loss": 2.5067, "step": 16949 }, { "crossentropy": 2.717916965484619, "epoch": 0.6144866589327146, "grad_norm": 0.02616039477288723, "grad_norm_var": 4.7301941040322804e-07, "learning_rate": 0.003356442377076222, "loss": 2.6404, "step": 16950 }, { "crossentropy": 2.495993137359619, "epoch": 0.6145229118329466, "grad_norm": 0.027227625250816345, "grad_norm_var": 4.830766151035637e-07, "learning_rate": 0.0033558935912826717, "loss": 2.451, "step": 16951 }, { "crossentropy": 2.4309263229370117, "epoch": 0.6145591647331786, "grad_norm": 0.025859970599412918, "grad_norm_var": 5.384238715502287e-07, "learning_rate": 0.0033553448276952736, "loss": 2.489, "step": 16952 }, { "crossentropy": 2.4609649181365967, "epoch": 0.6145954176334106, "grad_norm": 0.025866834446787834, "grad_norm_var": 5.48509314912624e-07, "learning_rate": 0.0033547960863214376, "loss": 2.507, "step": 16953 }, { "crossentropy": 2.567274808883667, "epoch": 0.6146316705336426, "grad_norm": 0.02630268782377243, "grad_norm_var": 5.484063280416686e-07, "learning_rate": 0.0033542473671685793, "loss": 2.5565, "step": 16954 }, { "crossentropy": 2.598076820373535, "epoch": 0.6146679234338747, "grad_norm": 0.027063678950071335, "grad_norm_var": 5.576513520485586e-07, "learning_rate": 0.0033536986702441076, "loss": 2.5544, "step": 16955 }, { "crossentropy": 2.4299519062042236, "epoch": 0.6147041763341067, "grad_norm": 0.026135172694921494, "grad_norm_var": 4.977204594104149e-07, "learning_rate": 0.0033531499955554347, "loss": 2.4844, "step": 16956 }, { "crossentropy": 2.5716209411621094, "epoch": 0.6147404292343387, "grad_norm": 0.026257658377289772, "grad_norm_var": 5.033776555957477e-07, "learning_rate": 0.003352601343109969, "loss": 2.5878, "step": 16957 }, { "crossentropy": 2.5220746994018555, "epoch": 0.6147766821345708, "grad_norm": 0.02741570957005024, "grad_norm_var": 4.976091881302476e-07, "learning_rate": 0.003352052712915122, "loss": 2.6034, "step": 16958 }, { "crossentropy": 2.5333592891693115, "epoch": 0.6148129350348028, "grad_norm": 0.027010425925254822, "grad_norm_var": 4.737117909279446e-07, "learning_rate": 0.003351504104978304, "loss": 2.5952, "step": 16959 }, { "crossentropy": 2.5160529613494873, "epoch": 0.6148491879350348, "grad_norm": 0.027565861120820045, "grad_norm_var": 4.972178450632821e-07, "learning_rate": 0.0033509555193069252, "loss": 2.5366, "step": 16960 }, { "crossentropy": 2.442415714263916, "epoch": 0.6148854408352669, "grad_norm": 0.03270233795046806, "grad_norm_var": 2.710472044976222e-06, "learning_rate": 0.0033504069559083926, "loss": 2.4586, "step": 16961 }, { "crossentropy": 2.4678432941436768, "epoch": 0.6149216937354989, "grad_norm": 0.026564696803689003, "grad_norm_var": 2.666918039532816e-06, "learning_rate": 0.00334985841479012, "loss": 2.5266, "step": 16962 }, { "crossentropy": 2.591447114944458, "epoch": 0.6149579466357309, "grad_norm": 0.02594948746263981, "grad_norm_var": 2.681110237638196e-06, "learning_rate": 0.0033493098959595127, "loss": 2.5645, "step": 16963 }, { "crossentropy": 2.4721782207489014, "epoch": 0.6149941995359629, "grad_norm": 0.027396412566304207, "grad_norm_var": 2.652363449528958e-06, "learning_rate": 0.0033487613994239813, "loss": 2.484, "step": 16964 }, { "crossentropy": 2.556741237640381, "epoch": 0.6150304524361949, "grad_norm": 0.02638459950685501, "grad_norm_var": 2.664488042776701e-06, "learning_rate": 0.0033482129251909323, "loss": 2.5948, "step": 16965 }, { "crossentropy": 2.5946121215820312, "epoch": 0.6150667053364269, "grad_norm": 0.02731332741677761, "grad_norm_var": 2.619809571724243e-06, "learning_rate": 0.003347664473267774, "loss": 2.6563, "step": 16966 }, { "crossentropy": 2.7311043739318848, "epoch": 0.615102958236659, "grad_norm": 0.02665117383003235, "grad_norm_var": 2.6279657257541797e-06, "learning_rate": 0.003347116043661915, "loss": 2.6292, "step": 16967 }, { "crossentropy": 2.5520381927490234, "epoch": 0.615139211136891, "grad_norm": 0.025880005210638046, "grad_norm_var": 2.6248720068215253e-06, "learning_rate": 0.003346567636380762, "loss": 2.581, "step": 16968 }, { "crossentropy": 2.4002485275268555, "epoch": 0.615175464037123, "grad_norm": 0.03018859587609768, "grad_norm_var": 3.122684620979893e-06, "learning_rate": 0.0033460192514317214, "loss": 2.5145, "step": 16969 }, { "crossentropy": 2.428941488265991, "epoch": 0.615211716937355, "grad_norm": 0.02566278725862503, "grad_norm_var": 3.233270531014659e-06, "learning_rate": 0.0033454708888222013, "loss": 2.4014, "step": 16970 }, { "crossentropy": 2.6587727069854736, "epoch": 0.615247969837587, "grad_norm": 0.028200825676321983, "grad_norm_var": 3.2844945668443335e-06, "learning_rate": 0.003344922548559608, "loss": 2.619, "step": 16971 }, { "crossentropy": 2.4882872104644775, "epoch": 0.615284222737819, "grad_norm": 0.025996584445238113, "grad_norm_var": 3.3077724567404185e-06, "learning_rate": 0.0033443742306513476, "loss": 2.467, "step": 16972 }, { "crossentropy": 2.5420122146606445, "epoch": 0.615320475638051, "grad_norm": 0.02646538056433201, "grad_norm_var": 3.2810108482324847e-06, "learning_rate": 0.003343825935104826, "loss": 2.5641, "step": 16973 }, { "crossentropy": 2.646984100341797, "epoch": 0.615356728538283, "grad_norm": 0.026916801929473877, "grad_norm_var": 3.291149754771951e-06, "learning_rate": 0.0033432776619274486, "loss": 2.6789, "step": 16974 }, { "crossentropy": 2.5442962646484375, "epoch": 0.6153929814385151, "grad_norm": 0.026602989062666893, "grad_norm_var": 3.3174235422601804e-06, "learning_rate": 0.0033427294111266193, "loss": 2.606, "step": 16975 }, { "crossentropy": 2.4847960472106934, "epoch": 0.6154292343387471, "grad_norm": 0.025814691558480263, "grad_norm_var": 3.441783731106248e-06, "learning_rate": 0.003342181182709744, "loss": 2.4896, "step": 16976 }, { "crossentropy": 2.490586280822754, "epoch": 0.6154654872389791, "grad_norm": 0.025772029533982277, "grad_norm_var": 1.3298070679914598e-06, "learning_rate": 0.003341632976684229, "loss": 2.5356, "step": 16977 }, { "crossentropy": 2.4335615634918213, "epoch": 0.6155017401392111, "grad_norm": 0.02962852641940117, "grad_norm_var": 1.8469172247794711e-06, "learning_rate": 0.0033410847930574763, "loss": 2.5132, "step": 16978 }, { "crossentropy": 2.4699018001556396, "epoch": 0.6155379930394431, "grad_norm": 0.026523394510149956, "grad_norm_var": 1.792739844434079e-06, "learning_rate": 0.0033405366318368917, "loss": 2.4851, "step": 16979 }, { "crossentropy": 2.5162863731384277, "epoch": 0.6155742459396751, "grad_norm": 0.027424650266766548, "grad_norm_var": 1.7944238134550191e-06, "learning_rate": 0.0033399884930298785, "loss": 2.4858, "step": 16980 }, { "crossentropy": 2.531033754348755, "epoch": 0.6156104988399071, "grad_norm": 0.027027105912566185, "grad_norm_var": 1.7705762623910585e-06, "learning_rate": 0.0033394403766438398, "loss": 2.5132, "step": 16981 }, { "crossentropy": 2.605839967727661, "epoch": 0.6156467517401392, "grad_norm": 0.027124907821416855, "grad_norm_var": 1.7650316668575318e-06, "learning_rate": 0.003338892282686179, "loss": 2.53, "step": 16982 }, { "crossentropy": 2.6462342739105225, "epoch": 0.6156830046403712, "grad_norm": 0.02827036753296852, "grad_norm_var": 1.8551976040206598e-06, "learning_rate": 0.003338344211164298, "loss": 2.6621, "step": 16983 }, { "crossentropy": 2.592146396636963, "epoch": 0.6157192575406032, "grad_norm": 0.0275302492082119, "grad_norm_var": 1.75834574332212e-06, "learning_rate": 0.003337796162085599, "loss": 2.5867, "step": 16984 }, { "crossentropy": 2.6884987354278564, "epoch": 0.6157555104408353, "grad_norm": 0.02646123245358467, "grad_norm_var": 1.1398388363279463e-06, "learning_rate": 0.003337248135457487, "loss": 2.6645, "step": 16985 }, { "crossentropy": 2.459066390991211, "epoch": 0.6157917633410673, "grad_norm": 0.028363725170493126, "grad_norm_var": 1.1272143446845364e-06, "learning_rate": 0.0033367001312873614, "loss": 2.4726, "step": 16986 }, { "crossentropy": 2.682995557785034, "epoch": 0.6158280162412993, "grad_norm": 0.02548888325691223, "grad_norm_var": 1.2006587035216323e-06, "learning_rate": 0.003336152149582625, "loss": 2.6075, "step": 16987 }, { "crossentropy": 2.5657436847686768, "epoch": 0.6158642691415314, "grad_norm": 0.029170265421271324, "grad_norm_var": 1.4211353470151236e-06, "learning_rate": 0.003335604190350679, "loss": 2.5532, "step": 16988 }, { "crossentropy": 2.4245901107788086, "epoch": 0.6159005220417634, "grad_norm": 0.02890247292816639, "grad_norm_var": 1.566123681347277e-06, "learning_rate": 0.0033350562535989247, "loss": 2.5103, "step": 16989 }, { "crossentropy": 2.6834683418273926, "epoch": 0.6159367749419954, "grad_norm": 0.02846710756421089, "grad_norm_var": 1.6342574196086575e-06, "learning_rate": 0.003334508339334762, "loss": 2.674, "step": 16990 }, { "crossentropy": 2.3275792598724365, "epoch": 0.6159730278422274, "grad_norm": 0.026681529358029366, "grad_norm_var": 1.6261836595875818e-06, "learning_rate": 0.0033339604475655918, "loss": 2.3872, "step": 16991 }, { "crossentropy": 2.4930005073547363, "epoch": 0.6160092807424594, "grad_norm": 0.026643868535757065, "grad_norm_var": 1.4921524067054137e-06, "learning_rate": 0.0033334125782988125, "loss": 2.4448, "step": 16992 }, { "crossentropy": 2.651484251022339, "epoch": 0.6160455336426914, "grad_norm": 0.03068043291568756, "grad_norm_var": 1.8883089078984417e-06, "learning_rate": 0.003332864731541826, "loss": 2.7219, "step": 16993 }, { "crossentropy": 2.4127044677734375, "epoch": 0.6160817865429234, "grad_norm": 0.02819308079779148, "grad_norm_var": 1.6622039578630584e-06, "learning_rate": 0.003332316907302032, "loss": 2.5486, "step": 16994 }, { "crossentropy": 2.615264415740967, "epoch": 0.6161180394431555, "grad_norm": 0.02693818137049675, "grad_norm_var": 1.6087377240361766e-06, "learning_rate": 0.0033317691055868293, "loss": 2.5496, "step": 16995 }, { "crossentropy": 2.511950731277466, "epoch": 0.6161542923433875, "grad_norm": 0.02682933583855629, "grad_norm_var": 1.653577374588026e-06, "learning_rate": 0.0033312213264036156, "loss": 2.55, "step": 16996 }, { "crossentropy": 2.5923972129821777, "epoch": 0.6161905452436195, "grad_norm": 0.027125675231218338, "grad_norm_var": 1.6456920084959806e-06, "learning_rate": 0.0033306735697597903, "loss": 2.5494, "step": 16997 }, { "crossentropy": 2.496504783630371, "epoch": 0.6162267981438515, "grad_norm": 0.02542733959853649, "grad_norm_var": 1.9513186820978764e-06, "learning_rate": 0.003330125835662754, "loss": 2.4317, "step": 16998 }, { "crossentropy": 2.5420663356781006, "epoch": 0.6162630510440835, "grad_norm": 0.027115507051348686, "grad_norm_var": 1.927348785460053e-06, "learning_rate": 0.0033295781241199003, "loss": 2.5908, "step": 16999 }, { "crossentropy": 2.372527599334717, "epoch": 0.6162993039443155, "grad_norm": 0.026783255860209465, "grad_norm_var": 1.9593285011859063e-06, "learning_rate": 0.003329030435138628, "loss": 2.4539, "step": 17000 }, { "crossentropy": 2.7463185787200928, "epoch": 0.6163355568445475, "grad_norm": 0.025840729475021362, "grad_norm_var": 2.065568675850791e-06, "learning_rate": 0.0033284827687263367, "loss": 2.5993, "step": 17001 }, { "crossentropy": 2.5449318885803223, "epoch": 0.6163718097447796, "grad_norm": 0.027063416317105293, "grad_norm_var": 2.006882530369789e-06, "learning_rate": 0.003327935124890422, "loss": 2.5045, "step": 17002 }, { "crossentropy": 2.5421195030212402, "epoch": 0.6164080626450116, "grad_norm": 0.026802171021699905, "grad_norm_var": 1.7915110943842367e-06, "learning_rate": 0.0033273875036382815, "loss": 2.5402, "step": 17003 }, { "crossentropy": 2.48626446723938, "epoch": 0.6164443155452436, "grad_norm": 0.025838982313871384, "grad_norm_var": 1.706139909384456e-06, "learning_rate": 0.0033268399049773112, "loss": 2.4435, "step": 17004 }, { "crossentropy": 2.469046115875244, "epoch": 0.6164805684454756, "grad_norm": 0.02846161276102066, "grad_norm_var": 1.6187025348186774e-06, "learning_rate": 0.003326292328914905, "loss": 2.5075, "step": 17005 }, { "crossentropy": 2.65775728225708, "epoch": 0.6165168213457076, "grad_norm": 0.02791009657084942, "grad_norm_var": 1.5425595378708618e-06, "learning_rate": 0.0033257447754584646, "loss": 2.6118, "step": 17006 }, { "crossentropy": 2.4646239280700684, "epoch": 0.6165530742459396, "grad_norm": 0.027309082448482513, "grad_norm_var": 1.5283135748685056e-06, "learning_rate": 0.00332519724461538, "loss": 2.4785, "step": 17007 }, { "crossentropy": 2.655413866043091, "epoch": 0.6165893271461717, "grad_norm": 0.027860119938850403, "grad_norm_var": 1.532986149112986e-06, "learning_rate": 0.003324649736393046, "loss": 2.6275, "step": 17008 }, { "crossentropy": 2.5127665996551514, "epoch": 0.6166255800464037, "grad_norm": 0.027927806600928307, "grad_norm_var": 7.516254190764592e-07, "learning_rate": 0.0033241022507988624, "loss": 2.5037, "step": 17009 }, { "crossentropy": 2.4603610038757324, "epoch": 0.6166618329466357, "grad_norm": 0.025828082114458084, "grad_norm_var": 7.53095473805578e-07, "learning_rate": 0.00332355478784022, "loss": 2.4919, "step": 17010 }, { "crossentropy": 2.4631597995758057, "epoch": 0.6166980858468677, "grad_norm": 0.02905404381453991, "grad_norm_var": 1.032009796740257e-06, "learning_rate": 0.003323007347524515, "loss": 2.4873, "step": 17011 }, { "crossentropy": 2.558844804763794, "epoch": 0.6167343387470998, "grad_norm": 0.026237500831484795, "grad_norm_var": 1.0731751079850449e-06, "learning_rate": 0.0033224599298591396, "loss": 2.546, "step": 17012 }, { "crossentropy": 2.5064189434051514, "epoch": 0.6167705916473318, "grad_norm": 0.02593955770134926, "grad_norm_var": 1.1470158636491788e-06, "learning_rate": 0.003321912534851489, "loss": 2.5679, "step": 17013 }, { "crossentropy": 2.6357851028442383, "epoch": 0.6168068445475638, "grad_norm": 0.026422886177897453, "grad_norm_var": 1.0051897047904633e-06, "learning_rate": 0.0033213651625089557, "loss": 2.5984, "step": 17014 }, { "crossentropy": 2.4368081092834473, "epoch": 0.6168430974477959, "grad_norm": 0.026003601029515266, "grad_norm_var": 1.0689948876131672e-06, "learning_rate": 0.0033208178128389366, "loss": 2.3798, "step": 17015 }, { "crossentropy": 2.4980664253234863, "epoch": 0.6168793503480279, "grad_norm": 0.02805192954838276, "grad_norm_var": 1.140507924261326e-06, "learning_rate": 0.0033202704858488175, "loss": 2.4769, "step": 17016 }, { "crossentropy": 2.4190428256988525, "epoch": 0.6169156032482599, "grad_norm": 0.02740454114973545, "grad_norm_var": 1.0444461071608488e-06, "learning_rate": 0.0033197231815459962, "loss": 2.4611, "step": 17017 }, { "crossentropy": 2.6718814373016357, "epoch": 0.6169518561484919, "grad_norm": 0.026849357411265373, "grad_norm_var": 1.0492735119819873e-06, "learning_rate": 0.003319175899937863, "loss": 2.5804, "step": 17018 }, { "crossentropy": 2.6796936988830566, "epoch": 0.6169881090487239, "grad_norm": 0.028299788013100624, "grad_norm_var": 1.1262197294926077e-06, "learning_rate": 0.0033186286410318093, "loss": 2.5952, "step": 17019 }, { "crossentropy": 2.6405954360961914, "epoch": 0.6170243619489559, "grad_norm": 0.02698216401040554, "grad_norm_var": 9.985510202384218e-07, "learning_rate": 0.0033180814048352283, "loss": 2.554, "step": 17020 }, { "crossentropy": 2.4725468158721924, "epoch": 0.617060614849188, "grad_norm": 0.025427840650081635, "grad_norm_var": 1.0973927510415319e-06, "learning_rate": 0.003317534191355509, "loss": 2.5045, "step": 17021 }, { "crossentropy": 2.5379815101623535, "epoch": 0.61709686774942, "grad_norm": 0.026857005432248116, "grad_norm_var": 1.0521540304681085e-06, "learning_rate": 0.0033169870006000447, "loss": 2.5575, "step": 17022 }, { "crossentropy": 2.557377815246582, "epoch": 0.617133120649652, "grad_norm": 0.02605031430721283, "grad_norm_var": 1.1040860705526738e-06, "learning_rate": 0.0033164398325762267, "loss": 2.5294, "step": 17023 }, { "crossentropy": 2.527719497680664, "epoch": 0.617169373549884, "grad_norm": 0.02647286094725132, "grad_norm_var": 1.0559835834670258e-06, "learning_rate": 0.0033158926872914407, "loss": 2.5333, "step": 17024 }, { "crossentropy": 2.475477933883667, "epoch": 0.617205626450116, "grad_norm": 0.02742835506796837, "grad_norm_var": 1.000670410926371e-06, "learning_rate": 0.003315345564753081, "loss": 2.6428, "step": 17025 }, { "crossentropy": 2.635680913925171, "epoch": 0.617241879350348, "grad_norm": 0.02767804265022278, "grad_norm_var": 9.669732200952847e-07, "learning_rate": 0.0033147984649685358, "loss": 2.6334, "step": 17026 }, { "crossentropy": 2.7257275581359863, "epoch": 0.61727813225058, "grad_norm": 0.02634165808558464, "grad_norm_var": 6.649486324184402e-07, "learning_rate": 0.0033142513879451953, "loss": 2.5583, "step": 17027 }, { "crossentropy": 2.59224271774292, "epoch": 0.617314385150812, "grad_norm": 0.02807542495429516, "grad_norm_var": 7.436277321001365e-07, "learning_rate": 0.003313704333690447, "loss": 2.5132, "step": 17028 }, { "crossentropy": 2.7388367652893066, "epoch": 0.6173506380510441, "grad_norm": 0.029285402968525887, "grad_norm_var": 1.0180270711612383e-06, "learning_rate": 0.00331315730221168, "loss": 2.5854, "step": 17029 }, { "crossentropy": 2.501323938369751, "epoch": 0.6173868909512761, "grad_norm": 0.02546190656721592, "grad_norm_var": 1.1627533284102083e-06, "learning_rate": 0.0033126102935162847, "loss": 2.5045, "step": 17030 }, { "crossentropy": 2.547959089279175, "epoch": 0.6174231438515081, "grad_norm": 0.025673778727650642, "grad_norm_var": 1.2152122350191405e-06, "learning_rate": 0.0033120633076116506, "loss": 2.5315, "step": 17031 }, { "crossentropy": 2.5737297534942627, "epoch": 0.6174593967517401, "grad_norm": 0.026947451755404472, "grad_norm_var": 1.1396759049963108e-06, "learning_rate": 0.00331151634450516, "loss": 2.569, "step": 17032 }, { "crossentropy": 2.649151563644409, "epoch": 0.6174956496519721, "grad_norm": 0.028064878657460213, "grad_norm_var": 1.2067513217092386e-06, "learning_rate": 0.003310969404204204, "loss": 2.5551, "step": 17033 }, { "crossentropy": 2.5476763248443604, "epoch": 0.6175319025522041, "grad_norm": 0.028154555708169937, "grad_norm_var": 1.2881356084695182e-06, "learning_rate": 0.003310422486716169, "loss": 2.5865, "step": 17034 }, { "crossentropy": 2.4892961978912354, "epoch": 0.6175681554524362, "grad_norm": 0.02708420902490616, "grad_norm_var": 1.181991886407468e-06, "learning_rate": 0.0033098755920484436, "loss": 2.6031, "step": 17035 }, { "crossentropy": 2.486992359161377, "epoch": 0.6176044083526682, "grad_norm": 0.02664598822593689, "grad_norm_var": 1.1898150995711654e-06, "learning_rate": 0.0033093287202084123, "loss": 2.4461, "step": 17036 }, { "crossentropy": 2.527881145477295, "epoch": 0.6176406612529002, "grad_norm": 0.025850433856248856, "grad_norm_var": 1.1136258609980206e-06, "learning_rate": 0.003308781871203461, "loss": 2.5121, "step": 17037 }, { "crossentropy": 2.4923813343048096, "epoch": 0.6176769141531323, "grad_norm": 0.026378726586699486, "grad_norm_var": 1.1373296450076578e-06, "learning_rate": 0.003308235045040979, "loss": 2.5076, "step": 17038 }, { "crossentropy": 2.4649534225463867, "epoch": 0.6177131670533643, "grad_norm": 0.026429908350110054, "grad_norm_var": 1.099553698981912e-06, "learning_rate": 0.003307688241728349, "loss": 2.49, "step": 17039 }, { "crossentropy": 2.515143632888794, "epoch": 0.6177494199535963, "grad_norm": 0.02719748392701149, "grad_norm_var": 1.0816003516952074e-06, "learning_rate": 0.0033071414612729595, "loss": 2.5041, "step": 17040 }, { "crossentropy": 2.6114978790283203, "epoch": 0.6177856728538283, "grad_norm": 0.02679857239127159, "grad_norm_var": 1.0740843893094116e-06, "learning_rate": 0.003306594703682192, "loss": 2.6337, "step": 17041 }, { "crossentropy": 2.5800061225891113, "epoch": 0.6178219257540604, "grad_norm": 0.025923600420355797, "grad_norm_var": 1.1088524159741463e-06, "learning_rate": 0.0033060479689634333, "loss": 2.5385, "step": 17042 }, { "crossentropy": 2.6071858406066895, "epoch": 0.6178581786542924, "grad_norm": 0.0265891645103693, "grad_norm_var": 1.0944327893247854e-06, "learning_rate": 0.0033055012571240674, "loss": 2.6521, "step": 17043 }, { "crossentropy": 2.537721633911133, "epoch": 0.6178944315545244, "grad_norm": 0.02673165313899517, "grad_norm_var": 9.984984108062898e-07, "learning_rate": 0.0033049545681714775, "loss": 2.5632, "step": 17044 }, { "crossentropy": 2.498136520385742, "epoch": 0.6179306844547564, "grad_norm": 0.02653188817203045, "grad_norm_var": 5.694693303974948e-07, "learning_rate": 0.0033044079021130484, "loss": 2.5573, "step": 17045 }, { "crossentropy": 2.424665689468384, "epoch": 0.6179669373549884, "grad_norm": 0.028041502460837364, "grad_norm_var": 5.75343806674157e-07, "learning_rate": 0.0033038612589561646, "loss": 2.5156, "step": 17046 }, { "crossentropy": 2.3618152141571045, "epoch": 0.6180031902552204, "grad_norm": 0.02637162059545517, "grad_norm_var": 4.995725887385471e-07, "learning_rate": 0.0033033146387082086, "loss": 2.5252, "step": 17047 }, { "crossentropy": 2.548074722290039, "epoch": 0.6180394431554525, "grad_norm": 0.027566825971007347, "grad_norm_var": 5.308659389509449e-07, "learning_rate": 0.003302768041376564, "loss": 2.5241, "step": 17048 }, { "crossentropy": 2.497044801712036, "epoch": 0.6180756960556845, "grad_norm": 0.030011652037501335, "grad_norm_var": 1.0707361592598159e-06, "learning_rate": 0.003302221466968612, "loss": 2.525, "step": 17049 }, { "crossentropy": 2.445560932159424, "epoch": 0.6181119489559165, "grad_norm": 0.028559673577547073, "grad_norm_var": 1.1423187649405119e-06, "learning_rate": 0.0033016749154917346, "loss": 2.4153, "step": 17050 }, { "crossentropy": 2.5512490272521973, "epoch": 0.6181482018561485, "grad_norm": 0.02747263014316559, "grad_norm_var": 1.1538017864416572e-06, "learning_rate": 0.003301128386953316, "loss": 2.5342, "step": 17051 }, { "crossentropy": 2.369849920272827, "epoch": 0.6181844547563805, "grad_norm": 0.0253326203674078, "grad_norm_var": 1.3356569571113321e-06, "learning_rate": 0.0033005818813607356, "loss": 2.4093, "step": 17052 }, { "crossentropy": 2.4336025714874268, "epoch": 0.6182207076566125, "grad_norm": 0.026109719648957253, "grad_norm_var": 1.3005747869532324e-06, "learning_rate": 0.0033000353987213756, "loss": 2.4929, "step": 17053 }, { "crossentropy": 2.5488357543945312, "epoch": 0.6182569605568445, "grad_norm": 0.029444444924592972, "grad_norm_var": 1.6328289255964726e-06, "learning_rate": 0.003299488939042618, "loss": 2.5529, "step": 17054 }, { "crossentropy": 2.53704571723938, "epoch": 0.6182932134570766, "grad_norm": 0.027408717200160027, "grad_norm_var": 1.592915052929927e-06, "learning_rate": 0.0032989425023318432, "loss": 2.4919, "step": 17055 }, { "crossentropy": 2.611806869506836, "epoch": 0.6183294663573086, "grad_norm": 0.02804623171687126, "grad_norm_var": 1.6313462239120682e-06, "learning_rate": 0.003298396088596431, "loss": 2.5264, "step": 17056 }, { "crossentropy": 2.4954071044921875, "epoch": 0.6183657192575406, "grad_norm": 0.027041815221309662, "grad_norm_var": 1.6184968448122727e-06, "learning_rate": 0.0032978496978437635, "loss": 2.4947, "step": 17057 }, { "crossentropy": 2.3266613483428955, "epoch": 0.6184019721577726, "grad_norm": 0.02684556134045124, "grad_norm_var": 1.4994759427251642e-06, "learning_rate": 0.0032973033300812173, "loss": 2.4501, "step": 17058 }, { "crossentropy": 2.451192617416382, "epoch": 0.6184382250580046, "grad_norm": 0.026884889230132103, "grad_norm_var": 1.47369576209098e-06, "learning_rate": 0.0032967569853161733, "loss": 2.5848, "step": 17059 }, { "crossentropy": 2.6285529136657715, "epoch": 0.6184744779582366, "grad_norm": 0.027891427278518677, "grad_norm_var": 1.454398188599247e-06, "learning_rate": 0.0032962106635560117, "loss": 2.6268, "step": 17060 }, { "crossentropy": 2.5644071102142334, "epoch": 0.6185107308584686, "grad_norm": 0.029698986560106277, "grad_norm_var": 1.684071644704697e-06, "learning_rate": 0.0032956643648081093, "loss": 2.5563, "step": 17061 }, { "crossentropy": 2.5762670040130615, "epoch": 0.6185469837587007, "grad_norm": 0.02592436969280243, "grad_norm_var": 1.8594897617025952e-06, "learning_rate": 0.0032951180890798473, "loss": 2.5917, "step": 17062 }, { "crossentropy": 2.5717904567718506, "epoch": 0.6185832366589327, "grad_norm": 0.02644343301653862, "grad_norm_var": 1.8486420988416865e-06, "learning_rate": 0.0032945718363786027, "loss": 2.5178, "step": 17063 }, { "crossentropy": 2.6470015048980713, "epoch": 0.6186194895591647, "grad_norm": 0.026722684502601624, "grad_norm_var": 1.8904611725124402e-06, "learning_rate": 0.003294025606711753, "loss": 2.6316, "step": 17064 }, { "crossentropy": 2.4796695709228516, "epoch": 0.6186557424593968, "grad_norm": 0.028850024566054344, "grad_norm_var": 1.5842235476462112e-06, "learning_rate": 0.0032934794000866775, "loss": 2.5222, "step": 17065 }, { "crossentropy": 2.547210216522217, "epoch": 0.6186919953596288, "grad_norm": 0.02743246778845787, "grad_norm_var": 1.491947618316447e-06, "learning_rate": 0.0032929332165107517, "loss": 2.5478, "step": 17066 }, { "crossentropy": 2.4793543815612793, "epoch": 0.6187282482598608, "grad_norm": 0.027054985985159874, "grad_norm_var": 1.495846577439933e-06, "learning_rate": 0.003292387055991354, "loss": 2.4813, "step": 17067 }, { "crossentropy": 2.6087849140167236, "epoch": 0.6187645011600929, "grad_norm": 0.027091316878795624, "grad_norm_var": 1.2229521411298573e-06, "learning_rate": 0.00329184091853586, "loss": 2.6151, "step": 17068 }, { "crossentropy": 2.495306968688965, "epoch": 0.6188007540603249, "grad_norm": 0.026722969487309456, "grad_norm_var": 1.1384453558124785e-06, "learning_rate": 0.0032912948041516456, "loss": 2.509, "step": 17069 }, { "crossentropy": 2.5654540061950684, "epoch": 0.6188370069605569, "grad_norm": 0.026299873366951942, "grad_norm_var": 9.282174686721465e-07, "learning_rate": 0.003290748712846089, "loss": 2.527, "step": 17070 }, { "crossentropy": 2.5795037746429443, "epoch": 0.6188732598607889, "grad_norm": 0.027174487709999084, "grad_norm_var": 9.273918093403999e-07, "learning_rate": 0.0032902026446265643, "loss": 2.5882, "step": 17071 }, { "crossentropy": 2.5226476192474365, "epoch": 0.6189095127610209, "grad_norm": 0.026197388768196106, "grad_norm_var": 9.466835454838632e-07, "learning_rate": 0.0032896565995004477, "loss": 2.538, "step": 17072 }, { "crossentropy": 2.5058984756469727, "epoch": 0.6189457656612529, "grad_norm": 0.0258394256234169, "grad_norm_var": 1.0531507339784154e-06, "learning_rate": 0.0032891105774751164, "loss": 2.4762, "step": 17073 }, { "crossentropy": 2.6348912715911865, "epoch": 0.6189820185614849, "grad_norm": 0.027794959023594856, "grad_norm_var": 1.081436236270196e-06, "learning_rate": 0.003288564578557941, "loss": 2.5839, "step": 17074 }, { "crossentropy": 2.5285696983337402, "epoch": 0.619018271461717, "grad_norm": 0.027134310454130173, "grad_norm_var": 1.07729001868477e-06, "learning_rate": 0.0032880186027562986, "loss": 2.4387, "step": 17075 }, { "crossentropy": 2.4306397438049316, "epoch": 0.619054524361949, "grad_norm": 0.026511207222938538, "grad_norm_var": 1.0584491484145886e-06, "learning_rate": 0.003287472650077562, "loss": 2.4558, "step": 17076 }, { "crossentropy": 2.5891261100769043, "epoch": 0.619090777262181, "grad_norm": 0.026705153286457062, "grad_norm_var": 5.63539948052064e-07, "learning_rate": 0.003286926720529107, "loss": 2.4992, "step": 17077 }, { "crossentropy": 2.5361905097961426, "epoch": 0.619127030162413, "grad_norm": 0.028125746175646782, "grad_norm_var": 5.892443450033457e-07, "learning_rate": 0.0032863808141183064, "loss": 2.5235, "step": 17078 }, { "crossentropy": 2.306729316711426, "epoch": 0.619163283062645, "grad_norm": 0.025788404047489166, "grad_norm_var": 6.652180047588459e-07, "learning_rate": 0.0032858349308525336, "loss": 2.4627, "step": 17079 }, { "crossentropy": 2.575232744216919, "epoch": 0.619199535962877, "grad_norm": 0.025870613753795624, "grad_norm_var": 7.381622440139531e-07, "learning_rate": 0.0032852890707391613, "loss": 2.4872, "step": 17080 }, { "crossentropy": 2.4155774116516113, "epoch": 0.619235788863109, "grad_norm": 0.026271622627973557, "grad_norm_var": 4.874331885777594e-07, "learning_rate": 0.003284743233785562, "loss": 2.4111, "step": 17081 }, { "crossentropy": 2.4493343830108643, "epoch": 0.619272041763341, "grad_norm": 0.027000878006219864, "grad_norm_var": 4.598559356510373e-07, "learning_rate": 0.0032841974199991097, "loss": 2.511, "step": 17082 }, { "crossentropy": 2.505387306213379, "epoch": 0.6193082946635731, "grad_norm": 0.027724284678697586, "grad_norm_var": 5.173942812898453e-07, "learning_rate": 0.0032836516293871733, "loss": 2.5511, "step": 17083 }, { "crossentropy": 2.5465922355651855, "epoch": 0.6193445475638051, "grad_norm": 0.027311867102980614, "grad_norm_var": 5.300070994797609e-07, "learning_rate": 0.0032831058619571265, "loss": 2.5448, "step": 17084 }, { "crossentropy": 2.5300111770629883, "epoch": 0.6193808004640371, "grad_norm": 0.02723781205713749, "grad_norm_var": 5.426878424512887e-07, "learning_rate": 0.00328256011771634, "loss": 2.5256, "step": 17085 }, { "crossentropy": 2.2891335487365723, "epoch": 0.6194170533642691, "grad_norm": 0.026852712035179138, "grad_norm_var": 5.240582388349457e-07, "learning_rate": 0.0032820143966721866, "loss": 2.3867, "step": 17086 }, { "crossentropy": 2.5297787189483643, "epoch": 0.6194533062645011, "grad_norm": 0.026442358270287514, "grad_norm_var": 5.255227418289223e-07, "learning_rate": 0.0032814686988320353, "loss": 2.59, "step": 17087 }, { "crossentropy": 2.462864398956299, "epoch": 0.6194895591647331, "grad_norm": 0.026267433539032936, "grad_norm_var": 5.201963115240733e-07, "learning_rate": 0.0032809230242032577, "loss": 2.4516, "step": 17088 }, { "crossentropy": 2.514073371887207, "epoch": 0.6195258120649652, "grad_norm": 0.026706455275416374, "grad_norm_var": 4.555646298727191e-07, "learning_rate": 0.003280377372793222, "loss": 2.4947, "step": 17089 }, { "crossentropy": 2.4644174575805664, "epoch": 0.6195620649651972, "grad_norm": 0.027632977813482285, "grad_norm_var": 4.3699258278253006e-07, "learning_rate": 0.003279831744609303, "loss": 2.5888, "step": 17090 }, { "crossentropy": 2.5073070526123047, "epoch": 0.6195983178654292, "grad_norm": 0.026721132919192314, "grad_norm_var": 4.3194389966135353e-07, "learning_rate": 0.0032792861396588645, "loss": 2.47, "step": 17091 }, { "crossentropy": 2.4807169437408447, "epoch": 0.6196345707656613, "grad_norm": 0.027323555201292038, "grad_norm_var": 4.3939899708003195e-07, "learning_rate": 0.0032787405579492767, "loss": 2.5443, "step": 17092 }, { "crossentropy": 2.557666301727295, "epoch": 0.6196708236658933, "grad_norm": 0.028744518756866455, "grad_norm_var": 6.534420755265144e-07, "learning_rate": 0.003278194999487911, "loss": 2.6047, "step": 17093 }, { "crossentropy": 2.5959725379943848, "epoch": 0.6197070765661253, "grad_norm": 0.02681180089712143, "grad_norm_var": 5.643677874561081e-07, "learning_rate": 0.0032776494642821354, "loss": 2.581, "step": 17094 }, { "crossentropy": 2.534287214279175, "epoch": 0.6197433294663574, "grad_norm": 0.026575906202197075, "grad_norm_var": 4.843858132080042e-07, "learning_rate": 0.0032771039523393163, "loss": 2.533, "step": 17095 }, { "crossentropy": 2.629305362701416, "epoch": 0.6197795823665894, "grad_norm": 0.025969820097088814, "grad_norm_var": 4.704786864729096e-07, "learning_rate": 0.0032765584636668234, "loss": 2.6456, "step": 17096 }, { "crossentropy": 2.5340542793273926, "epoch": 0.6198158352668214, "grad_norm": 0.025687988847494125, "grad_norm_var": 5.464796076446246e-07, "learning_rate": 0.0032760129982720232, "loss": 2.501, "step": 17097 }, { "crossentropy": 2.6166045665740967, "epoch": 0.6198520881670534, "grad_norm": 0.026598449796438217, "grad_norm_var": 5.532392787054839e-07, "learning_rate": 0.0032754675561622844, "loss": 2.6342, "step": 17098 }, { "crossentropy": 2.5504043102264404, "epoch": 0.6198883410672854, "grad_norm": 0.026578538119792938, "grad_norm_var": 5.113585929898227e-07, "learning_rate": 0.0032749221373449757, "loss": 2.5001, "step": 17099 }, { "crossentropy": 2.4971041679382324, "epoch": 0.6199245939675174, "grad_norm": 0.028195757418870926, "grad_norm_var": 6.156261549391014e-07, "learning_rate": 0.003274376741827458, "loss": 2.4769, "step": 17100 }, { "crossentropy": 2.5450148582458496, "epoch": 0.6199608468677494, "grad_norm": 0.026330038905143738, "grad_norm_var": 6.258425590700718e-07, "learning_rate": 0.0032738313696171017, "loss": 2.5749, "step": 17101 }, { "crossentropy": 2.6317570209503174, "epoch": 0.6199970997679815, "grad_norm": 0.02652849070727825, "grad_norm_var": 6.318614888857806e-07, "learning_rate": 0.0032732860207212735, "loss": 2.4842, "step": 17102 }, { "crossentropy": 2.539597749710083, "epoch": 0.6200333526682135, "grad_norm": 0.02522466331720352, "grad_norm_var": 7.858004816503154e-07, "learning_rate": 0.003272740695147337, "loss": 2.5087, "step": 17103 }, { "crossentropy": 2.6157546043395996, "epoch": 0.6200696055684455, "grad_norm": 0.026751484721899033, "grad_norm_var": 7.697130110691548e-07, "learning_rate": 0.0032721953929026595, "loss": 2.5208, "step": 17104 }, { "crossentropy": 2.575479030609131, "epoch": 0.6201058584686775, "grad_norm": 0.028181036934256554, "grad_norm_var": 8.923621743108044e-07, "learning_rate": 0.003271650113994603, "loss": 2.6266, "step": 17105 }, { "crossentropy": 2.4995579719543457, "epoch": 0.6201421113689095, "grad_norm": 0.025544414296746254, "grad_norm_var": 9.514118176033592e-07, "learning_rate": 0.003271104858430537, "loss": 2.6054, "step": 17106 }, { "crossentropy": 2.6088387966156006, "epoch": 0.6201783642691415, "grad_norm": 0.027421940118074417, "grad_norm_var": 9.80767367140843e-07, "learning_rate": 0.0032705596262178243, "loss": 2.5817, "step": 17107 }, { "crossentropy": 2.4963583946228027, "epoch": 0.6202146171693735, "grad_norm": 0.025748703628778458, "grad_norm_var": 1.0214891954984551e-06, "learning_rate": 0.0032700144173638257, "loss": 2.5052, "step": 17108 }, { "crossentropy": 2.535477876663208, "epoch": 0.6202508700696056, "grad_norm": 0.026077071204781532, "grad_norm_var": 7.322291667162066e-07, "learning_rate": 0.0032694692318759096, "loss": 2.5379, "step": 17109 }, { "crossentropy": 2.4862921237945557, "epoch": 0.6202871229698376, "grad_norm": 0.026165464892983437, "grad_norm_var": 7.32685969482176e-07, "learning_rate": 0.003268924069761437, "loss": 2.546, "step": 17110 }, { "crossentropy": 2.553929090499878, "epoch": 0.6203233758700696, "grad_norm": 0.026575112715363503, "grad_norm_var": 7.326751993605758e-07, "learning_rate": 0.0032683789310277715, "loss": 2.4708, "step": 17111 }, { "crossentropy": 2.4714293479919434, "epoch": 0.6203596287703016, "grad_norm": 0.02556772157549858, "grad_norm_var": 7.697942310316378e-07, "learning_rate": 0.0032678338156822773, "loss": 2.5559, "step": 17112 }, { "crossentropy": 2.409087896347046, "epoch": 0.6203958816705336, "grad_norm": 0.0271997582167387, "grad_norm_var": 7.593279320579436e-07, "learning_rate": 0.0032672887237323147, "loss": 2.4617, "step": 17113 }, { "crossentropy": 2.472332000732422, "epoch": 0.6204321345707656, "grad_norm": 0.027112985029816628, "grad_norm_var": 7.796759328759321e-07, "learning_rate": 0.003266743655185248, "loss": 2.5878, "step": 17114 }, { "crossentropy": 2.2865192890167236, "epoch": 0.6204683874709976, "grad_norm": 0.026412630453705788, "grad_norm_var": 7.813223997000235e-07, "learning_rate": 0.0032661986100484407, "loss": 2.395, "step": 17115 }, { "crossentropy": 2.461156129837036, "epoch": 0.6205046403712297, "grad_norm": 0.026934858411550522, "grad_norm_var": 6.064976804669483e-07, "learning_rate": 0.0032656535883292493, "loss": 2.5862, "step": 17116 }, { "crossentropy": 2.476557493209839, "epoch": 0.6205408932714617, "grad_norm": 0.02607945166528225, "grad_norm_var": 6.156340013267759e-07, "learning_rate": 0.003265108590035039, "loss": 2.47, "step": 17117 }, { "crossentropy": 2.6167025566101074, "epoch": 0.6205771461716937, "grad_norm": 0.026237376034259796, "grad_norm_var": 6.186744432939376e-07, "learning_rate": 0.0032645636151731705, "loss": 2.6217, "step": 17118 }, { "crossentropy": 2.4275310039520264, "epoch": 0.6206133990719258, "grad_norm": 0.02649020031094551, "grad_norm_var": 5.116465803802246e-07, "learning_rate": 0.003264018663751004, "loss": 2.4898, "step": 17119 }, { "crossentropy": 2.4905333518981934, "epoch": 0.6206496519721578, "grad_norm": 0.025866758078336716, "grad_norm_var": 5.3458978714591e-07, "learning_rate": 0.0032634737357758992, "loss": 2.4849, "step": 17120 }, { "crossentropy": 2.5534400939941406, "epoch": 0.6206859048723898, "grad_norm": 0.026256510987877846, "grad_norm_var": 3.2855063619995017e-07, "learning_rate": 0.003262928831255216, "loss": 2.5079, "step": 17121 }, { "crossentropy": 2.612002372741699, "epoch": 0.6207221577726219, "grad_norm": 0.02679138258099556, "grad_norm_var": 2.908499348742872e-07, "learning_rate": 0.0032623839501963164, "loss": 2.5812, "step": 17122 }, { "crossentropy": 2.6730258464813232, "epoch": 0.6207584106728539, "grad_norm": 0.026949508115649223, "grad_norm_var": 2.425442509398536e-07, "learning_rate": 0.003261839092606559, "loss": 2.6082, "step": 17123 }, { "crossentropy": 2.4795284271240234, "epoch": 0.6207946635730859, "grad_norm": 0.02719290368258953, "grad_norm_var": 2.4669951009202253e-07, "learning_rate": 0.0032612942584933024, "loss": 2.4424, "step": 17124 }, { "crossentropy": 2.592273712158203, "epoch": 0.6208309164733179, "grad_norm": 0.02686198242008686, "grad_norm_var": 2.4153400568794707e-07, "learning_rate": 0.003260749447863905, "loss": 2.5486, "step": 17125 }, { "crossentropy": 2.60752534866333, "epoch": 0.6208671693735499, "grad_norm": 0.026422932744026184, "grad_norm_var": 2.3270251939543015e-07, "learning_rate": 0.003260204660725725, "loss": 2.5484, "step": 17126 }, { "crossentropy": 2.5166332721710205, "epoch": 0.6209034222737819, "grad_norm": 0.027606181800365448, "grad_norm_var": 3.01292229846981e-07, "learning_rate": 0.0032596598970861222, "loss": 2.4451, "step": 17127 }, { "crossentropy": 2.4971437454223633, "epoch": 0.6209396751740139, "grad_norm": 0.02716701664030552, "grad_norm_var": 2.3592259120844225e-07, "learning_rate": 0.003259115156952453, "loss": 2.5061, "step": 17128 }, { "crossentropy": 2.373706340789795, "epoch": 0.620975928074246, "grad_norm": 0.026766380295157433, "grad_norm_var": 2.2016439256351768e-07, "learning_rate": 0.0032585704403320748, "loss": 2.3771, "step": 17129 }, { "crossentropy": 2.6107044219970703, "epoch": 0.621012180974478, "grad_norm": 0.02875985950231552, "grad_norm_var": 4.810603219847683e-07, "learning_rate": 0.003258025747232346, "loss": 2.5915, "step": 17130 }, { "crossentropy": 2.6089231967926025, "epoch": 0.62104843387471, "grad_norm": 0.031205343082547188, "grad_norm_var": 1.6693135003185052e-06, "learning_rate": 0.0032574810776606236, "loss": 2.5976, "step": 17131 }, { "crossentropy": 2.6764400005340576, "epoch": 0.621084686774942, "grad_norm": 0.028815891593694687, "grad_norm_var": 1.8492160953895628e-06, "learning_rate": 0.0032569364316242643, "loss": 2.5415, "step": 17132 }, { "crossentropy": 2.5190603733062744, "epoch": 0.621120939675174, "grad_norm": 0.027238747105002403, "grad_norm_var": 1.7574024364639304e-06, "learning_rate": 0.003256391809130623, "loss": 2.5104, "step": 17133 }, { "crossentropy": 2.5612869262695312, "epoch": 0.621157192575406, "grad_norm": 0.026422550901770592, "grad_norm_var": 1.7335732905478326e-06, "learning_rate": 0.003255847210187056, "loss": 2.5686, "step": 17134 }, { "crossentropy": 2.6234724521636963, "epoch": 0.621193445475638, "grad_norm": 0.026016108691692352, "grad_norm_var": 1.7988661049717167e-06, "learning_rate": 0.0032553026348009196, "loss": 2.5887, "step": 17135 }, { "crossentropy": 2.518172025680542, "epoch": 0.62122969837587, "grad_norm": 0.02599853090941906, "grad_norm_var": 1.7752747790755173e-06, "learning_rate": 0.0032547580829795685, "loss": 2.5377, "step": 17136 }, { "crossentropy": 2.6656596660614014, "epoch": 0.6212659512761021, "grad_norm": 0.02575988695025444, "grad_norm_var": 1.8584275857089072e-06, "learning_rate": 0.0032542135547303577, "loss": 2.635, "step": 17137 }, { "crossentropy": 2.489487648010254, "epoch": 0.6213022041763341, "grad_norm": 0.026849236339330673, "grad_norm_var": 1.8551110310107877e-06, "learning_rate": 0.003253669050060642, "loss": 2.5301, "step": 17138 }, { "crossentropy": 2.553572654724121, "epoch": 0.6213384570765661, "grad_norm": 0.028660455718636513, "grad_norm_var": 1.969048394268484e-06, "learning_rate": 0.003253124568977777, "loss": 2.5899, "step": 17139 }, { "crossentropy": 2.434262990951538, "epoch": 0.6213747099767981, "grad_norm": 0.02631578966975212, "grad_norm_var": 2.036556238467028e-06, "learning_rate": 0.0032525801114891145, "loss": 2.4344, "step": 17140 }, { "crossentropy": 2.550339698791504, "epoch": 0.6214109628770301, "grad_norm": 0.025944916531443596, "grad_norm_var": 2.1431893723768023e-06, "learning_rate": 0.003252035677602011, "loss": 2.5357, "step": 17141 }, { "crossentropy": 2.4722187519073486, "epoch": 0.6214472157772621, "grad_norm": 0.025923127308487892, "grad_norm_var": 2.213709609138201e-06, "learning_rate": 0.003251491267323817, "loss": 2.5294, "step": 17142 }, { "crossentropy": 2.630438804626465, "epoch": 0.6214834686774942, "grad_norm": 0.025789810344576836, "grad_norm_var": 2.325324123765513e-06, "learning_rate": 0.0032509468806618882, "loss": 2.5608, "step": 17143 }, { "crossentropy": 2.513859510421753, "epoch": 0.6215197215777262, "grad_norm": 0.025882214307785034, "grad_norm_var": 2.4173738172152905e-06, "learning_rate": 0.0032504025176235754, "loss": 2.5732, "step": 17144 }, { "crossentropy": 2.64485239982605, "epoch": 0.6215559744779582, "grad_norm": 0.02726145274937153, "grad_norm_var": 2.415831991936729e-06, "learning_rate": 0.0032498581782162295, "loss": 2.6433, "step": 17145 }, { "crossentropy": 2.5210044384002686, "epoch": 0.6215922273781903, "grad_norm": 0.026780230924487114, "grad_norm_var": 2.2101714144619086e-06, "learning_rate": 0.0032493138624472076, "loss": 2.5039, "step": 17146 }, { "crossentropy": 2.441713809967041, "epoch": 0.6216284802784223, "grad_norm": 0.0269571952521801, "grad_norm_var": 9.158992791168426e-07, "learning_rate": 0.0032487695703238575, "loss": 2.4761, "step": 17147 }, { "crossentropy": 2.609969139099121, "epoch": 0.6216647331786543, "grad_norm": 0.030095161870121956, "grad_norm_var": 1.3853130862050954e-06, "learning_rate": 0.003248225301853532, "loss": 2.5452, "step": 17148 }, { "crossentropy": 2.478524923324585, "epoch": 0.6217009860788864, "grad_norm": 0.027194974943995476, "grad_norm_var": 1.3825422183528017e-06, "learning_rate": 0.003247681057043584, "loss": 2.5375, "step": 17149 }, { "crossentropy": 2.623873472213745, "epoch": 0.6217372389791184, "grad_norm": 0.026052238419651985, "grad_norm_var": 1.4068229060658355e-06, "learning_rate": 0.003247136835901361, "loss": 2.543, "step": 17150 }, { "crossentropy": 2.6266653537750244, "epoch": 0.6217734918793504, "grad_norm": 0.02718685194849968, "grad_norm_var": 1.3829883544669644e-06, "learning_rate": 0.0032465926384342148, "loss": 2.6062, "step": 17151 }, { "crossentropy": 2.4461495876312256, "epoch": 0.6218097447795824, "grad_norm": 0.02740003541111946, "grad_norm_var": 1.3577111010405156e-06, "learning_rate": 0.003246048464649497, "loss": 2.4903, "step": 17152 }, { "crossentropy": 2.57967209815979, "epoch": 0.6218459976798144, "grad_norm": 0.026170210912823677, "grad_norm_var": 1.3070430748082322e-06, "learning_rate": 0.0032455043145545545, "loss": 2.6634, "step": 17153 }, { "crossentropy": 2.4857356548309326, "epoch": 0.6218822505800464, "grad_norm": 0.02649461291730404, "grad_norm_var": 1.3174920447421896e-06, "learning_rate": 0.0032449601881567396, "loss": 2.5562, "step": 17154 }, { "crossentropy": 2.415149211883545, "epoch": 0.6219185034802784, "grad_norm": 0.028647175058722496, "grad_norm_var": 1.3143535583813448e-06, "learning_rate": 0.0032444160854634015, "loss": 2.4871, "step": 17155 }, { "crossentropy": 2.556893825531006, "epoch": 0.6219547563805105, "grad_norm": 0.028058307245373726, "grad_norm_var": 1.372808023392067e-06, "learning_rate": 0.0032438720064818876, "loss": 2.5873, "step": 17156 }, { "crossentropy": 2.397254467010498, "epoch": 0.6219910092807425, "grad_norm": 0.02548374980688095, "grad_norm_var": 1.4503555219431829e-06, "learning_rate": 0.003243327951219548, "loss": 2.4567, "step": 17157 }, { "crossentropy": 2.4298295974731445, "epoch": 0.6220272621809745, "grad_norm": 0.027148108929395676, "grad_norm_var": 1.3746113321032778e-06, "learning_rate": 0.0032427839196837306, "loss": 2.4858, "step": 17158 }, { "crossentropy": 2.473184823989868, "epoch": 0.6220635150812065, "grad_norm": 0.027652103453874588, "grad_norm_var": 1.2815251199170628e-06, "learning_rate": 0.0032422399118817824, "loss": 2.471, "step": 17159 }, { "crossentropy": 2.4185638427734375, "epoch": 0.6220997679814385, "grad_norm": 0.027079537510871887, "grad_norm_var": 1.1680860181288345e-06, "learning_rate": 0.0032416959278210524, "loss": 2.4358, "step": 17160 }, { "crossentropy": 2.61818265914917, "epoch": 0.6221360208816705, "grad_norm": 0.02885558269917965, "grad_norm_var": 1.3338392831753563e-06, "learning_rate": 0.0032411519675088853, "loss": 2.5815, "step": 17161 }, { "crossentropy": 2.589942216873169, "epoch": 0.6221722737819025, "grad_norm": 0.02777489274740219, "grad_norm_var": 1.322960842203326e-06, "learning_rate": 0.003240608030952631, "loss": 2.5873, "step": 17162 }, { "crossentropy": 2.586684226989746, "epoch": 0.6222085266821346, "grad_norm": 0.026139628142118454, "grad_norm_var": 1.4119895974833037e-06, "learning_rate": 0.0032400641181596353, "loss": 2.5, "step": 17163 }, { "crossentropy": 2.6202962398529053, "epoch": 0.6222447795823666, "grad_norm": 0.029497651383280754, "grad_norm_var": 1.214770858378363e-06, "learning_rate": 0.0032395202291372444, "loss": 2.6417, "step": 17164 }, { "crossentropy": 2.524569034576416, "epoch": 0.6222810324825986, "grad_norm": 0.027654236182570457, "grad_norm_var": 1.221385739077809e-06, "learning_rate": 0.003238976363892805, "loss": 2.5302, "step": 17165 }, { "crossentropy": 2.613699436187744, "epoch": 0.6223172853828306, "grad_norm": 0.027642808854579926, "grad_norm_var": 1.1083249467177947e-06, "learning_rate": 0.003238432522433663, "loss": 2.5153, "step": 17166 }, { "crossentropy": 2.578885316848755, "epoch": 0.6223535382830626, "grad_norm": 0.026781033724546432, "grad_norm_var": 1.1317930720566716e-06, "learning_rate": 0.0032378887047671613, "loss": 2.5541, "step": 17167 }, { "crossentropy": 2.519375801086426, "epoch": 0.6223897911832946, "grad_norm": 0.026651185005903244, "grad_norm_var": 1.1673352998019798e-06, "learning_rate": 0.0032373449109006475, "loss": 2.4735, "step": 17168 }, { "crossentropy": 2.5320558547973633, "epoch": 0.6224260440835266, "grad_norm": 0.026216957718133926, "grad_norm_var": 1.1600673990967703e-06, "learning_rate": 0.0032368011408414644, "loss": 2.5757, "step": 17169 }, { "crossentropy": 2.572394847869873, "epoch": 0.6224622969837587, "grad_norm": 0.026311537250876427, "grad_norm_var": 1.1833131758428687e-06, "learning_rate": 0.0032362573945969586, "loss": 2.5578, "step": 17170 }, { "crossentropy": 2.520625114440918, "epoch": 0.6224985498839907, "grad_norm": 0.02628975175321102, "grad_norm_var": 1.1228132378538388e-06, "learning_rate": 0.003235713672174474, "loss": 2.5509, "step": 17171 }, { "crossentropy": 2.62593412399292, "epoch": 0.6225348027842227, "grad_norm": 0.026452738791704178, "grad_norm_var": 1.1006820807463268e-06, "learning_rate": 0.0032351699735813527, "loss": 2.6352, "step": 17172 }, { "crossentropy": 2.5972092151641846, "epoch": 0.6225710556844548, "grad_norm": 0.026888420805335045, "grad_norm_var": 9.209254603517273e-07, "learning_rate": 0.0032346262988249396, "loss": 2.5671, "step": 17173 }, { "crossentropy": 2.6476216316223145, "epoch": 0.6226073085846868, "grad_norm": 0.027718544006347656, "grad_norm_var": 9.38094752779041e-07, "learning_rate": 0.0032340826479125783, "loss": 2.5932, "step": 17174 }, { "crossentropy": 2.561237335205078, "epoch": 0.6226435614849188, "grad_norm": 0.02639293298125267, "grad_norm_var": 9.655523637953796e-07, "learning_rate": 0.003233539020851609, "loss": 2.5848, "step": 17175 }, { "crossentropy": 2.4485273361206055, "epoch": 0.6226798143851509, "grad_norm": 0.026610542088747025, "grad_norm_var": 9.835004475047353e-07, "learning_rate": 0.003232995417649377, "loss": 2.4497, "step": 17176 }, { "crossentropy": 2.5130224227905273, "epoch": 0.6227160672853829, "grad_norm": 0.026746559888124466, "grad_norm_var": 7.727175376391702e-07, "learning_rate": 0.0032324518383132217, "loss": 2.5702, "step": 17177 }, { "crossentropy": 2.5617971420288086, "epoch": 0.6227523201856149, "grad_norm": 0.025679081678390503, "grad_norm_var": 8.266797891345272e-07, "learning_rate": 0.0032319082828504886, "loss": 2.5479, "step": 17178 }, { "crossentropy": 2.514887809753418, "epoch": 0.6227885730858469, "grad_norm": 0.04766343533992767, "grad_norm_var": 2.7729464694966004e-05, "learning_rate": 0.003231364751268516, "loss": 2.4874, "step": 17179 }, { "crossentropy": 2.655156135559082, "epoch": 0.6228248259860789, "grad_norm": 0.026469513773918152, "grad_norm_var": 2.777857168438687e-05, "learning_rate": 0.0032308212435746475, "loss": 2.6259, "step": 17180 }, { "crossentropy": 2.490847110748291, "epoch": 0.6228610788863109, "grad_norm": 0.02701215073466301, "grad_norm_var": 2.78348458818172e-05, "learning_rate": 0.003230277759776222, "loss": 2.5607, "step": 17181 }, { "crossentropy": 2.5727903842926025, "epoch": 0.6228973317865429, "grad_norm": 0.03142799437046051, "grad_norm_var": 2.8564965226374643e-05, "learning_rate": 0.003229734299880581, "loss": 2.5572, "step": 17182 }, { "crossentropy": 2.453859806060791, "epoch": 0.622933584686775, "grad_norm": 0.027198903262615204, "grad_norm_var": 2.8496428288680277e-05, "learning_rate": 0.0032291908638950673, "loss": 2.5712, "step": 17183 }, { "crossentropy": 2.424384832382202, "epoch": 0.622969837587007, "grad_norm": 0.02791072614490986, "grad_norm_var": 2.832990927936572e-05, "learning_rate": 0.0032286474518270153, "loss": 2.4943, "step": 17184 }, { "crossentropy": 2.600776195526123, "epoch": 0.623006090487239, "grad_norm": 0.028322236612439156, "grad_norm_var": 2.8018874052858802e-05, "learning_rate": 0.003228104063683769, "loss": 2.6055, "step": 17185 }, { "crossentropy": 2.6067843437194824, "epoch": 0.623042343387471, "grad_norm": 0.027689645066857338, "grad_norm_var": 2.7745840290768864e-05, "learning_rate": 0.003227560699472666, "loss": 2.4964, "step": 17186 }, { "crossentropy": 2.5831124782562256, "epoch": 0.623078596287703, "grad_norm": 0.02544945292174816, "grad_norm_var": 2.8040920962284207e-05, "learning_rate": 0.0032270173592010448, "loss": 2.5477, "step": 17187 }, { "crossentropy": 2.531066417694092, "epoch": 0.623114849187935, "grad_norm": 0.02729339525103569, "grad_norm_var": 2.7858189328686212e-05, "learning_rate": 0.003226474042876245, "loss": 2.5938, "step": 17188 }, { "crossentropy": 2.482450485229492, "epoch": 0.623151102088167, "grad_norm": 0.027096020057797432, "grad_norm_var": 2.781545535533469e-05, "learning_rate": 0.0032259307505056044, "loss": 2.5258, "step": 17189 }, { "crossentropy": 2.509152412414551, "epoch": 0.6231873549883991, "grad_norm": 0.027681872248649597, "grad_norm_var": 2.7819568541657942e-05, "learning_rate": 0.003225387482096462, "loss": 2.5282, "step": 17190 }, { "crossentropy": 2.527785301208496, "epoch": 0.6232236078886311, "grad_norm": 0.02619958482682705, "grad_norm_var": 2.787726306036764e-05, "learning_rate": 0.0032248442376561566, "loss": 2.5008, "step": 17191 }, { "crossentropy": 2.6135237216949463, "epoch": 0.6232598607888631, "grad_norm": 0.02735375240445137, "grad_norm_var": 2.772175643746492e-05, "learning_rate": 0.0032243010171920205, "loss": 2.498, "step": 17192 }, { "crossentropy": 2.5125350952148438, "epoch": 0.6232961136890951, "grad_norm": 0.02637428604066372, "grad_norm_var": 2.7821157960687204e-05, "learning_rate": 0.003223757820711395, "loss": 2.5655, "step": 17193 }, { "crossentropy": 2.4424960613250732, "epoch": 0.6233323665893271, "grad_norm": 0.02728172205388546, "grad_norm_var": 2.7367918694258936e-05, "learning_rate": 0.0032232146482216156, "loss": 2.5917, "step": 17194 }, { "crossentropy": 2.5236032009124756, "epoch": 0.6233686194895591, "grad_norm": 0.02612478844821453, "grad_norm_var": 1.7637763425945903e-06, "learning_rate": 0.003222671499730019, "loss": 2.5336, "step": 17195 }, { "crossentropy": 2.5969948768615723, "epoch": 0.6234048723897911, "grad_norm": 0.025709930807352066, "grad_norm_var": 1.8844911418216716e-06, "learning_rate": 0.0032221283752439413, "loss": 2.5531, "step": 17196 }, { "crossentropy": 2.4113609790802, "epoch": 0.6234411252900232, "grad_norm": 0.06676894426345825, "grad_norm_var": 9.936944162222947e-05, "learning_rate": 0.0032215852747707165, "loss": 2.3872, "step": 17197 }, { "crossentropy": 2.514986276626587, "epoch": 0.6234773781902552, "grad_norm": 0.029904015362262726, "grad_norm_var": 9.917215221993702e-05, "learning_rate": 0.003221042198317682, "loss": 2.5358, "step": 17198 }, { "crossentropy": 2.4805920124053955, "epoch": 0.6235136310904872, "grad_norm": 0.027219191193580627, "grad_norm_var": 9.916555447251212e-05, "learning_rate": 0.003220499145892173, "loss": 2.4726, "step": 17199 }, { "crossentropy": 2.240346908569336, "epoch": 0.6235498839907193, "grad_norm": 0.026987239718437195, "grad_norm_var": 9.943285833885184e-05, "learning_rate": 0.0032199561175015236, "loss": 2.3676, "step": 17200 }, { "crossentropy": 2.6430063247680664, "epoch": 0.6235861368909513, "grad_norm": 0.029164055362343788, "grad_norm_var": 9.93347397977055e-05, "learning_rate": 0.003219413113153068, "loss": 2.6254, "step": 17201 }, { "crossentropy": 2.65592098236084, "epoch": 0.6236223897911833, "grad_norm": 0.027199141681194305, "grad_norm_var": 9.947756764062096e-05, "learning_rate": 0.0032188701328541404, "loss": 2.5961, "step": 17202 }, { "crossentropy": 2.6263253688812256, "epoch": 0.6236586426914154, "grad_norm": 0.028104227036237717, "grad_norm_var": 9.844429990950396e-05, "learning_rate": 0.0032183271766120735, "loss": 2.6053, "step": 17203 }, { "crossentropy": 2.5737011432647705, "epoch": 0.6236948955916474, "grad_norm": 0.02629205211997032, "grad_norm_var": 9.883881171501059e-05, "learning_rate": 0.0032177842444342034, "loss": 2.5887, "step": 17204 }, { "crossentropy": 2.519461154937744, "epoch": 0.6237311484918794, "grad_norm": 0.026369253173470497, "grad_norm_var": 9.912573476489229e-05, "learning_rate": 0.0032172413363278597, "loss": 2.5208, "step": 17205 }, { "crossentropy": 2.492326021194458, "epoch": 0.6237674013921114, "grad_norm": 0.027430696412920952, "grad_norm_var": 9.919628988871534e-05, "learning_rate": 0.003216698452300378, "loss": 2.5409, "step": 17206 }, { "crossentropy": 2.446323871612549, "epoch": 0.6238036542923434, "grad_norm": 0.026435723528265953, "grad_norm_var": 9.909097501224321e-05, "learning_rate": 0.003216155592359091, "loss": 2.5133, "step": 17207 }, { "crossentropy": 2.467895030975342, "epoch": 0.6238399071925754, "grad_norm": 0.02653157338500023, "grad_norm_var": 9.938713296492675e-05, "learning_rate": 0.00321561275651133, "loss": 2.4693, "step": 17208 }, { "crossentropy": 2.5007646083831787, "epoch": 0.6238761600928074, "grad_norm": 0.027549801394343376, "grad_norm_var": 9.8965006392928e-05, "learning_rate": 0.003215069944764426, "loss": 2.5376, "step": 17209 }, { "crossentropy": 2.520592212677002, "epoch": 0.6239124129930395, "grad_norm": 0.028761126101017, "grad_norm_var": 9.862635511591512e-05, "learning_rate": 0.0032145271571257113, "loss": 2.5954, "step": 17210 }, { "crossentropy": 2.413656234741211, "epoch": 0.6239486658932715, "grad_norm": 0.02890975959599018, "grad_norm_var": 9.775215589371856e-05, "learning_rate": 0.0032139843936025165, "loss": 2.5405, "step": 17211 }, { "crossentropy": 2.494415044784546, "epoch": 0.6239849187935035, "grad_norm": 0.026227906346321106, "grad_norm_var": 9.747550074832896e-05, "learning_rate": 0.0032134416542021725, "loss": 2.5318, "step": 17212 }, { "crossentropy": 2.5972681045532227, "epoch": 0.6240211716937355, "grad_norm": 0.026949703693389893, "grad_norm_var": 1.31067518843956e-06, "learning_rate": 0.0032128989389320103, "loss": 2.5132, "step": 17213 }, { "crossentropy": 2.5339224338531494, "epoch": 0.6240574245939675, "grad_norm": 0.026084745302796364, "grad_norm_var": 9.992692776571254e-07, "learning_rate": 0.00321235624779936, "loss": 2.5632, "step": 17214 }, { "crossentropy": 2.662163257598877, "epoch": 0.6240936774941995, "grad_norm": 0.07561212033033371, "grad_norm_var": 0.00014708051689589656, "learning_rate": 0.0032118135808115516, "loss": 2.5709, "step": 17215 }, { "crossentropy": 2.680450201034546, "epoch": 0.6241299303944315, "grad_norm": 0.028843767940998077, "grad_norm_var": 0.0001464788574373838, "learning_rate": 0.0032112709379759157, "loss": 2.6404, "step": 17216 }, { "crossentropy": 2.602430820465088, "epoch": 0.6241661832946636, "grad_norm": 0.027133354917168617, "grad_norm_var": 0.00014707234691944715, "learning_rate": 0.0032107283192997793, "loss": 2.5575, "step": 17217 }, { "crossentropy": 2.5888893604278564, "epoch": 0.6242024361948956, "grad_norm": 0.026030270382761955, "grad_norm_var": 0.0001476374496399775, "learning_rate": 0.003210185724790472, "loss": 2.6469, "step": 17218 }, { "crossentropy": 2.6967074871063232, "epoch": 0.6242386890951276, "grad_norm": 0.028182506561279297, "grad_norm_var": 0.0001476159153649283, "learning_rate": 0.003209643154455323, "loss": 2.6057, "step": 17219 }, { "crossentropy": 2.2523036003112793, "epoch": 0.6242749419953596, "grad_norm": 0.0286817979067564, "grad_norm_var": 0.00014672477052079825, "learning_rate": 0.0032091006083016594, "loss": 2.4672, "step": 17220 }, { "crossentropy": 2.6168813705444336, "epoch": 0.6243111948955916, "grad_norm": 0.026303553953766823, "grad_norm_var": 0.00014675998464575612, "learning_rate": 0.003208558086336809, "loss": 2.4638, "step": 17221 }, { "crossentropy": 2.6716725826263428, "epoch": 0.6243474477958236, "grad_norm": 0.02758469618856907, "grad_norm_var": 0.00014670143615528756, "learning_rate": 0.0032080155885681016, "loss": 2.6434, "step": 17222 }, { "crossentropy": 2.667128324508667, "epoch": 0.6243837006960556, "grad_norm": 0.02616272121667862, "grad_norm_var": 0.00014684908114807974, "learning_rate": 0.0032074731150028638, "loss": 2.5092, "step": 17223 }, { "crossentropy": 2.6536049842834473, "epoch": 0.6244199535962877, "grad_norm": 0.028032030910253525, "grad_norm_var": 0.00014622650631495592, "learning_rate": 0.003206930665648421, "loss": 2.5225, "step": 17224 }, { "crossentropy": 2.5961735248565674, "epoch": 0.6244562064965197, "grad_norm": 0.02999827265739441, "grad_norm_var": 0.00014565745086592372, "learning_rate": 0.0032063882405121012, "loss": 2.6024, "step": 17225 }, { "crossentropy": 2.6698174476623535, "epoch": 0.6244924593967517, "grad_norm": 0.02852018177509308, "grad_norm_var": 0.00014571995061700484, "learning_rate": 0.0032058458396012303, "loss": 2.5878, "step": 17226 }, { "crossentropy": 2.4304511547088623, "epoch": 0.6245287122969838, "grad_norm": 0.02605041302740574, "grad_norm_var": 0.00014686717610816352, "learning_rate": 0.0032053034629231335, "loss": 2.5231, "step": 17227 }, { "crossentropy": 2.6514525413513184, "epoch": 0.6245649651972158, "grad_norm": 0.025609077885746956, "grad_norm_var": 0.0001472353416805613, "learning_rate": 0.003204761110485137, "loss": 2.5751, "step": 17228 }, { "crossentropy": 2.535602331161499, "epoch": 0.6246012180974478, "grad_norm": 0.027460603043437004, "grad_norm_var": 0.00014701926441954003, "learning_rate": 0.0032042187822945646, "loss": 2.5679, "step": 17229 }, { "crossentropy": 2.5931618213653564, "epoch": 0.6246374709976799, "grad_norm": 0.025484219193458557, "grad_norm_var": 0.00014738677706515672, "learning_rate": 0.0032036764783587445, "loss": 2.4687, "step": 17230 }, { "crossentropy": 2.437437057495117, "epoch": 0.6246737238979119, "grad_norm": 0.02504819631576538, "grad_norm_var": 2.0682053444169107e-06, "learning_rate": 0.0032031341986849993, "loss": 2.4903, "step": 17231 }, { "crossentropy": 2.512103796005249, "epoch": 0.6247099767981439, "grad_norm": 0.026188097894191742, "grad_norm_var": 1.9253060003156703e-06, "learning_rate": 0.003202591943280653, "loss": 2.4415, "step": 17232 }, { "crossentropy": 2.436281442642212, "epoch": 0.6247462296983759, "grad_norm": 0.02660125307738781, "grad_norm_var": 1.9356246919191e-06, "learning_rate": 0.0032020497121530325, "loss": 2.5553, "step": 17233 }, { "crossentropy": 2.553818941116333, "epoch": 0.6247824825986079, "grad_norm": 0.02659597434103489, "grad_norm_var": 1.8827748076137111e-06, "learning_rate": 0.0032015075053094567, "loss": 2.5698, "step": 17234 }, { "crossentropy": 2.5399551391601562, "epoch": 0.6248187354988399, "grad_norm": 0.026508470997214317, "grad_norm_var": 1.8010087404241125e-06, "learning_rate": 0.003200965322757252, "loss": 2.5613, "step": 17235 }, { "crossentropy": 2.5162932872772217, "epoch": 0.6248549883990719, "grad_norm": 0.02623821049928665, "grad_norm_var": 1.6024204165770995e-06, "learning_rate": 0.003200423164503741, "loss": 2.5001, "step": 17236 }, { "crossentropy": 2.576545476913452, "epoch": 0.624891241299304, "grad_norm": 0.026729509234428406, "grad_norm_var": 1.5870347517107542e-06, "learning_rate": 0.003199881030556245, "loss": 2.5306, "step": 17237 }, { "crossentropy": 2.569728136062622, "epoch": 0.624927494199536, "grad_norm": 0.02636495977640152, "grad_norm_var": 1.552524468165633e-06, "learning_rate": 0.003199338920922088, "loss": 2.5142, "step": 17238 }, { "crossentropy": 2.6552951335906982, "epoch": 0.624963747099768, "grad_norm": 0.027072321623563766, "grad_norm_var": 1.5361012589229043e-06, "learning_rate": 0.0031987968356085917, "loss": 2.5993, "step": 17239 }, { "crossentropy": 2.674708127975464, "epoch": 0.625, "grad_norm": 0.026842771098017693, "grad_norm_var": 1.4261814062831255e-06, "learning_rate": 0.0031982547746230777, "loss": 2.6869, "step": 17240 }, { "crossentropy": 2.5340023040771484, "epoch": 0.625036252900232, "grad_norm": 0.026933006942272186, "grad_norm_var": 6.682858444376535e-07, "learning_rate": 0.003197712737972867, "loss": 2.6057, "step": 17241 }, { "crossentropy": 2.429286241531372, "epoch": 0.625072505800464, "grad_norm": 0.025818394497036934, "grad_norm_var": 4.02334578614103e-07, "learning_rate": 0.0031971707256652823, "loss": 2.4481, "step": 17242 }, { "crossentropy": 2.480222702026367, "epoch": 0.625108758700696, "grad_norm": 0.027209272608160973, "grad_norm_var": 4.4050524852697523e-07, "learning_rate": 0.0031966287377076415, "loss": 2.4799, "step": 17243 }, { "crossentropy": 2.4245998859405518, "epoch": 0.6251450116009281, "grad_norm": 0.025625865906476974, "grad_norm_var": 4.3870988405031396e-07, "learning_rate": 0.0031960867741072665, "loss": 2.4766, "step": 17244 }, { "crossentropy": 2.476130485534668, "epoch": 0.6251812645011601, "grad_norm": 0.02719874680042267, "grad_norm_var": 4.0666609954908886e-07, "learning_rate": 0.0031955448348714756, "loss": 2.5692, "step": 17245 }, { "crossentropy": 2.4666528701782227, "epoch": 0.6252175174013921, "grad_norm": 0.02608267217874527, "grad_norm_var": 3.5568106543272755e-07, "learning_rate": 0.0031950029200075915, "loss": 2.3717, "step": 17246 }, { "crossentropy": 2.6182355880737305, "epoch": 0.6252537703016241, "grad_norm": 0.02761758491396904, "grad_norm_var": 2.9110016279929963e-07, "learning_rate": 0.0031944610295229327, "loss": 2.6, "step": 17247 }, { "crossentropy": 2.585420608520508, "epoch": 0.6252900232018561, "grad_norm": 0.026213956996798515, "grad_norm_var": 2.897159244307958e-07, "learning_rate": 0.0031939191634248166, "loss": 2.4884, "step": 17248 }, { "crossentropy": 2.6049468517303467, "epoch": 0.6253262761020881, "grad_norm": 0.025702735409140587, "grad_norm_var": 3.404208096570756e-07, "learning_rate": 0.0031933773217205637, "loss": 2.5816, "step": 17249 }, { "crossentropy": 2.388892650604248, "epoch": 0.6253625290023201, "grad_norm": 0.02737192064523697, "grad_norm_var": 3.8310259557790504e-07, "learning_rate": 0.003192835504417493, "loss": 2.4285, "step": 17250 }, { "crossentropy": 2.442580461502075, "epoch": 0.6253987819025522, "grad_norm": 0.029488835483789444, "grad_norm_var": 9.036200114547934e-07, "learning_rate": 0.003192293711522919, "loss": 2.4983, "step": 17251 }, { "crossentropy": 2.499095916748047, "epoch": 0.6254350348027842, "grad_norm": 0.02765035443007946, "grad_norm_var": 9.258810819863054e-07, "learning_rate": 0.0031917519430441628, "loss": 2.584, "step": 17252 }, { "crossentropy": 2.387798309326172, "epoch": 0.6254712877030162, "grad_norm": 0.02570798434317112, "grad_norm_var": 1.0102607190775783e-06, "learning_rate": 0.0031912101989885395, "loss": 2.4349, "step": 17253 }, { "crossentropy": 2.7105300426483154, "epoch": 0.6255075406032483, "grad_norm": 0.02736005373299122, "grad_norm_var": 1.013587462824333e-06, "learning_rate": 0.0031906684793633682, "loss": 2.6289, "step": 17254 }, { "crossentropy": 2.435530662536621, "epoch": 0.6255437935034803, "grad_norm": 0.026820775121450424, "grad_norm_var": 1.0107071094754444e-06, "learning_rate": 0.003190126784175965, "loss": 2.4936, "step": 17255 }, { "crossentropy": 2.6274688243865967, "epoch": 0.6255800464037123, "grad_norm": 0.027737563475966454, "grad_norm_var": 1.0595504646603836e-06, "learning_rate": 0.003189585113433646, "loss": 2.6545, "step": 17256 }, { "crossentropy": 2.670957088470459, "epoch": 0.6256162993039444, "grad_norm": 0.027712028473615646, "grad_norm_var": 1.1000014762674315e-06, "learning_rate": 0.003189043467143728, "loss": 2.5808, "step": 17257 }, { "crossentropy": 2.604964256286621, "epoch": 0.6256525522041764, "grad_norm": 0.027334513142704964, "grad_norm_var": 1.0134116040366736e-06, "learning_rate": 0.003188501845313527, "loss": 2.5245, "step": 17258 }, { "crossentropy": 2.492067337036133, "epoch": 0.6256888051044084, "grad_norm": 0.026109235361218452, "grad_norm_var": 1.066000546454983e-06, "learning_rate": 0.003187960247950356, "loss": 2.4781, "step": 17259 }, { "crossentropy": 2.4492878913879395, "epoch": 0.6257250580046404, "grad_norm": 0.02765423245728016, "grad_norm_var": 9.55991701501628e-07, "learning_rate": 0.0031874186750615323, "loss": 2.4685, "step": 17260 }, { "crossentropy": 2.536027669906616, "epoch": 0.6257613109048724, "grad_norm": 0.027406224980950356, "grad_norm_var": 9.61131700580262e-07, "learning_rate": 0.0031868771266543687, "loss": 2.4674, "step": 17261 }, { "crossentropy": 2.644472360610962, "epoch": 0.6257975638051044, "grad_norm": 0.027003206312656403, "grad_norm_var": 8.863850171375008e-07, "learning_rate": 0.0031863356027361824, "loss": 2.5684, "step": 17262 }, { "crossentropy": 2.6520543098449707, "epoch": 0.6258338167053364, "grad_norm": 0.02599758468568325, "grad_norm_var": 9.560429799438842e-07, "learning_rate": 0.0031857941033142864, "loss": 2.5666, "step": 17263 }, { "crossentropy": 2.642098903656006, "epoch": 0.6258700696055685, "grad_norm": 0.026301540434360504, "grad_norm_var": 9.464153579270367e-07, "learning_rate": 0.0031852526283959936, "loss": 2.5939, "step": 17264 }, { "crossentropy": 2.663076639175415, "epoch": 0.6259063225058005, "grad_norm": 0.026080919429659843, "grad_norm_var": 8.856580724074525e-07, "learning_rate": 0.0031847111779886184, "loss": 2.606, "step": 17265 }, { "crossentropy": 2.566807985305786, "epoch": 0.6259425754060325, "grad_norm": 0.027046436443924904, "grad_norm_var": 8.808500569366101e-07, "learning_rate": 0.003184169752099472, "loss": 2.5789, "step": 17266 }, { "crossentropy": 2.578399181365967, "epoch": 0.6259788283062645, "grad_norm": 0.027906419709324837, "grad_norm_var": 5.308492112487499e-07, "learning_rate": 0.003183628350735872, "loss": 2.5536, "step": 17267 }, { "crossentropy": 2.3769447803497314, "epoch": 0.6260150812064965, "grad_norm": 0.02864004299044609, "grad_norm_var": 6.79296413246784e-07, "learning_rate": 0.003183086973905126, "loss": 2.4394, "step": 17268 }, { "crossentropy": 2.589278221130371, "epoch": 0.6260513341067285, "grad_norm": 0.04353317990899086, "grad_norm_var": 1.7345550516891887e-05, "learning_rate": 0.0031825456216145465, "loss": 2.5394, "step": 17269 }, { "crossentropy": 2.6588919162750244, "epoch": 0.6260875870069605, "grad_norm": 0.02991986833512783, "grad_norm_var": 1.7480271694634136e-05, "learning_rate": 0.0031820042938714467, "loss": 2.515, "step": 17270 }, { "crossentropy": 2.5468196868896484, "epoch": 0.6261238399071926, "grad_norm": 0.02698463387787342, "grad_norm_var": 1.7449080595036008e-05, "learning_rate": 0.0031814629906831383, "loss": 2.5777, "step": 17271 }, { "crossentropy": 2.477565288543701, "epoch": 0.6261600928074246, "grad_norm": 0.026172520592808723, "grad_norm_var": 1.772693356158031e-05, "learning_rate": 0.0031809217120569323, "loss": 2.4401, "step": 17272 }, { "crossentropy": 2.607386350631714, "epoch": 0.6261963457076566, "grad_norm": 0.025843560695648193, "grad_norm_var": 1.8076082299962057e-05, "learning_rate": 0.0031803804580001387, "loss": 2.5494, "step": 17273 }, { "crossentropy": 2.4587817192077637, "epoch": 0.6262325986078886, "grad_norm": 0.02609257586300373, "grad_norm_var": 1.8302698995635914e-05, "learning_rate": 0.0031798392285200677, "loss": 2.5274, "step": 17274 }, { "crossentropy": 2.4335994720458984, "epoch": 0.6262688515081206, "grad_norm": 0.026605362072587013, "grad_norm_var": 1.8190146590295316e-05, "learning_rate": 0.0031792980236240334, "loss": 2.4176, "step": 17275 }, { "crossentropy": 2.6877083778381348, "epoch": 0.6263051044083526, "grad_norm": 0.026464499533176422, "grad_norm_var": 1.8345244005267317e-05, "learning_rate": 0.0031787568433193393, "loss": 2.6216, "step": 17276 }, { "crossentropy": 2.5295486450195312, "epoch": 0.6263413573085846, "grad_norm": 0.02649681642651558, "grad_norm_var": 1.846892008509972e-05, "learning_rate": 0.0031782156876132973, "loss": 2.4329, "step": 17277 }, { "crossentropy": 2.5471808910369873, "epoch": 0.6263776102088167, "grad_norm": 0.026126310229301453, "grad_norm_var": 1.8626867973650986e-05, "learning_rate": 0.003177674556513218, "loss": 2.516, "step": 17278 }, { "crossentropy": 2.6364855766296387, "epoch": 0.6264138631090487, "grad_norm": 0.028153592720627785, "grad_norm_var": 1.837388098477586e-05, "learning_rate": 0.0031771334500264106, "loss": 2.5465, "step": 17279 }, { "crossentropy": 2.673938751220703, "epoch": 0.6264501160092807, "grad_norm": 0.026720575988292694, "grad_norm_var": 1.828867406596008e-05, "learning_rate": 0.0031765923681601805, "loss": 2.6398, "step": 17280 }, { "crossentropy": 2.5424389839172363, "epoch": 0.6264863689095128, "grad_norm": 0.02780495025217533, "grad_norm_var": 1.8021989875950067e-05, "learning_rate": 0.0031760513109218382, "loss": 2.5937, "step": 17281 }, { "crossentropy": 2.390326738357544, "epoch": 0.6265226218097448, "grad_norm": 0.02680373564362526, "grad_norm_var": 1.8061607987585453e-05, "learning_rate": 0.00317551027831869, "loss": 2.5321, "step": 17282 }, { "crossentropy": 2.469703197479248, "epoch": 0.6265588747099768, "grad_norm": 0.026849646121263504, "grad_norm_var": 1.8164570595444287e-05, "learning_rate": 0.003174969270358045, "loss": 2.4742, "step": 17283 }, { "crossentropy": 2.698167324066162, "epoch": 0.6265951276102089, "grad_norm": 0.02645140513777733, "grad_norm_var": 1.8299280658860236e-05, "learning_rate": 0.0031744282870472106, "loss": 2.6396, "step": 17284 }, { "crossentropy": 2.4891185760498047, "epoch": 0.6266313805104409, "grad_norm": 0.026411311700940132, "grad_norm_var": 1.0213703666031929e-06, "learning_rate": 0.0031738873283934923, "loss": 2.548, "step": 17285 }, { "crossentropy": 2.4967262744903564, "epoch": 0.6266676334106729, "grad_norm": 0.026571182534098625, "grad_norm_var": 3.59966280872627e-07, "learning_rate": 0.0031733463944041964, "loss": 2.5184, "step": 17286 }, { "crossentropy": 2.5428061485290527, "epoch": 0.6267038863109049, "grad_norm": 0.025623267516493797, "grad_norm_var": 4.167895432394264e-07, "learning_rate": 0.0031728054850866296, "loss": 2.5591, "step": 17287 }, { "crossentropy": 2.7617437839508057, "epoch": 0.6267401392111369, "grad_norm": 0.027082547545433044, "grad_norm_var": 4.1977912284273056e-07, "learning_rate": 0.0031722646004480972, "loss": 2.67, "step": 17288 }, { "crossentropy": 2.408247709274292, "epoch": 0.6267763921113689, "grad_norm": 0.027710000053048134, "grad_norm_var": 4.4145978087157624e-07, "learning_rate": 0.0031717237404959055, "loss": 2.4338, "step": 17289 }, { "crossentropy": 2.571671724319458, "epoch": 0.6268126450116009, "grad_norm": 0.028887517750263214, "grad_norm_var": 6.854465299928654e-07, "learning_rate": 0.0031711829052373576, "loss": 2.5395, "step": 17290 }, { "crossentropy": 2.3258132934570312, "epoch": 0.626848897911833, "grad_norm": 0.027432002127170563, "grad_norm_var": 6.931816237985717e-07, "learning_rate": 0.003170642094679761, "loss": 2.414, "step": 17291 }, { "crossentropy": 2.5243260860443115, "epoch": 0.626885150812065, "grad_norm": 0.02621765062212944, "grad_norm_var": 7.137703361271656e-07, "learning_rate": 0.003170101308830421, "loss": 2.4919, "step": 17292 }, { "crossentropy": 2.491670846939087, "epoch": 0.626921403712297, "grad_norm": 0.027480393648147583, "grad_norm_var": 7.136341075454517e-07, "learning_rate": 0.0031695605476966377, "loss": 2.5869, "step": 17293 }, { "crossentropy": 2.5872416496276855, "epoch": 0.626957656612529, "grad_norm": 0.027263911440968513, "grad_norm_var": 6.589049016622924e-07, "learning_rate": 0.0031690198112857173, "loss": 2.526, "step": 17294 }, { "crossentropy": 2.5298473834991455, "epoch": 0.626993909512761, "grad_norm": 0.03290843591094017, "grad_norm_var": 2.745295092177376e-06, "learning_rate": 0.0031684790996049633, "loss": 2.5473, "step": 17295 }, { "crossentropy": 2.5840353965759277, "epoch": 0.627030162412993, "grad_norm": 0.027193326503038406, "grad_norm_var": 2.7171519021526013e-06, "learning_rate": 0.003167938412661678, "loss": 2.6124, "step": 17296 }, { "crossentropy": 2.332186460494995, "epoch": 0.627066415313225, "grad_norm": 0.0264629814773798, "grad_norm_var": 2.760506957630709e-06, "learning_rate": 0.003167397750463164, "loss": 2.4422, "step": 17297 }, { "crossentropy": 2.523475408554077, "epoch": 0.6271026682134571, "grad_norm": 0.02983243204653263, "grad_norm_var": 3.11955077877192e-06, "learning_rate": 0.0031668571130167257, "loss": 2.5689, "step": 17298 }, { "crossentropy": 2.3928005695343018, "epoch": 0.6271389211136891, "grad_norm": 0.026309339329600334, "grad_norm_var": 3.186350596482273e-06, "learning_rate": 0.0031663165003296633, "loss": 2.5229, "step": 17299 }, { "crossentropy": 2.4444680213928223, "epoch": 0.6271751740139211, "grad_norm": 0.0270936768501997, "grad_norm_var": 3.1232035908675444e-06, "learning_rate": 0.003165775912409281, "loss": 2.5679, "step": 17300 }, { "crossentropy": 2.6402740478515625, "epoch": 0.6272114269141531, "grad_norm": 0.028152167797088623, "grad_norm_var": 3.0529517910844623e-06, "learning_rate": 0.003165235349262877, "loss": 2.6305, "step": 17301 }, { "crossentropy": 2.4052574634552, "epoch": 0.6272476798143851, "grad_norm": 0.02723795361816883, "grad_norm_var": 2.985823892769542e-06, "learning_rate": 0.0031646948108977548, "loss": 2.4968, "step": 17302 }, { "crossentropy": 2.513516426086426, "epoch": 0.6272839327146171, "grad_norm": 0.027203042060136795, "grad_norm_var": 2.7084811107506383e-06, "learning_rate": 0.0031641542973212134, "loss": 2.562, "step": 17303 }, { "crossentropy": 2.5188629627227783, "epoch": 0.6273201856148491, "grad_norm": 0.02668418362736702, "grad_norm_var": 2.7554028942925694e-06, "learning_rate": 0.003163613808540554, "loss": 2.5488, "step": 17304 }, { "crossentropy": 2.657025098800659, "epoch": 0.6273564385150812, "grad_norm": 0.02605094574391842, "grad_norm_var": 2.937234158950858e-06, "learning_rate": 0.0031630733445630766, "loss": 2.6193, "step": 17305 }, { "crossentropy": 2.7355504035949707, "epoch": 0.6273926914153132, "grad_norm": 0.027219900861382484, "grad_norm_var": 2.836020975926131e-06, "learning_rate": 0.0031625329053960828, "loss": 2.6193, "step": 17306 }, { "crossentropy": 2.6466848850250244, "epoch": 0.6274289443155452, "grad_norm": 0.027069050818681717, "grad_norm_var": 2.849790273154867e-06, "learning_rate": 0.0031619924910468696, "loss": 2.6589, "step": 17307 }, { "crossentropy": 2.5820393562316895, "epoch": 0.6274651972157773, "grad_norm": 0.026054631918668747, "grad_norm_var": 2.8798395415710032e-06, "learning_rate": 0.003161452101522737, "loss": 2.5498, "step": 17308 }, { "crossentropy": 2.505836009979248, "epoch": 0.6275014501160093, "grad_norm": 0.031711645424366, "grad_norm_var": 3.980117083031363e-06, "learning_rate": 0.0031609117368309857, "loss": 2.6494, "step": 17309 }, { "crossentropy": 2.345550775527954, "epoch": 0.6275377030162413, "grad_norm": 0.027418073266744614, "grad_norm_var": 3.971035886903455e-06, "learning_rate": 0.0031603713969789106, "loss": 2.4237, "step": 17310 }, { "crossentropy": 2.570054292678833, "epoch": 0.6275739559164734, "grad_norm": 0.025998463854193687, "grad_norm_var": 2.237301043407867e-06, "learning_rate": 0.003159831081973812, "loss": 2.5516, "step": 17311 }, { "crossentropy": 2.4443089962005615, "epoch": 0.6276102088167054, "grad_norm": 0.03322863206267357, "grad_norm_var": 4.3831640215606186e-06, "learning_rate": 0.0031592907918229873, "loss": 2.4228, "step": 17312 }, { "crossentropy": 2.5294744968414307, "epoch": 0.6276464617169374, "grad_norm": 0.02966318093240261, "grad_norm_var": 4.481358909467064e-06, "learning_rate": 0.0031587505265337325, "loss": 2.5179, "step": 17313 }, { "crossentropy": 2.6680216789245605, "epoch": 0.6276827146171694, "grad_norm": 0.027001801878213882, "grad_norm_var": 4.26524345061723e-06, "learning_rate": 0.0031582102861133464, "loss": 2.677, "step": 17314 }, { "crossentropy": 2.485043525695801, "epoch": 0.6277189675174014, "grad_norm": 0.026826882734894753, "grad_norm_var": 4.182153214454563e-06, "learning_rate": 0.0031576700705691265, "loss": 2.4997, "step": 17315 }, { "crossentropy": 2.5208473205566406, "epoch": 0.6277552204176334, "grad_norm": 0.02580336295068264, "grad_norm_var": 4.405729754252647e-06, "learning_rate": 0.0031571298799083682, "loss": 2.4795, "step": 17316 }, { "crossentropy": 2.7466633319854736, "epoch": 0.6277914733178654, "grad_norm": 0.02762017399072647, "grad_norm_var": 4.391894316987004e-06, "learning_rate": 0.003156589714138368, "loss": 2.6705, "step": 17317 }, { "crossentropy": 2.4908480644226074, "epoch": 0.6278277262180975, "grad_norm": 0.027676735073328018, "grad_norm_var": 4.3783878671228736e-06, "learning_rate": 0.0031560495732664198, "loss": 2.4793, "step": 17318 }, { "crossentropy": 2.4537453651428223, "epoch": 0.6278639791183295, "grad_norm": 0.026718752458691597, "grad_norm_var": 4.425259861410184e-06, "learning_rate": 0.003155509457299821, "loss": 2.4354, "step": 17319 }, { "crossentropy": 2.5244028568267822, "epoch": 0.6279002320185615, "grad_norm": 0.025522608309984207, "grad_norm_var": 4.662524149893103e-06, "learning_rate": 0.0031549693662458658, "loss": 2.489, "step": 17320 }, { "crossentropy": 2.4705541133880615, "epoch": 0.6279364849187935, "grad_norm": 0.02590664103627205, "grad_norm_var": 4.693612188887142e-06, "learning_rate": 0.0031544293001118484, "loss": 2.5538, "step": 17321 }, { "crossentropy": 2.532522201538086, "epoch": 0.6279727378190255, "grad_norm": 0.02607565186917782, "grad_norm_var": 4.831913666798691e-06, "learning_rate": 0.003153889258905065, "loss": 2.4749, "step": 17322 }, { "crossentropy": 2.5332024097442627, "epoch": 0.6280089907192575, "grad_norm": 0.02748280204832554, "grad_norm_var": 4.8178173649412405e-06, "learning_rate": 0.0031533492426328085, "loss": 2.5026, "step": 17323 }, { "crossentropy": 2.53409481048584, "epoch": 0.6280452436194895, "grad_norm": 0.026590559631586075, "grad_norm_var": 4.729315730587222e-06, "learning_rate": 0.0031528092513023733, "loss": 2.4665, "step": 17324 }, { "crossentropy": 2.4865200519561768, "epoch": 0.6280814965197216, "grad_norm": 0.026155494153499603, "grad_norm_var": 3.5963597796324997e-06, "learning_rate": 0.003152269284921052, "loss": 2.5397, "step": 17325 }, { "crossentropy": 2.569817543029785, "epoch": 0.6281177494199536, "grad_norm": 0.026667291298508644, "grad_norm_var": 3.6128238572590934e-06, "learning_rate": 0.0031517293434961393, "loss": 2.5133, "step": 17326 }, { "crossentropy": 2.571240186691284, "epoch": 0.6281540023201856, "grad_norm": 0.028455398976802826, "grad_norm_var": 3.601837278820518e-06, "learning_rate": 0.003151189427034926, "loss": 2.5358, "step": 17327 }, { "crossentropy": 2.5080997943878174, "epoch": 0.6281902552204176, "grad_norm": 0.028451964259147644, "grad_norm_var": 1.2757140986384274e-06, "learning_rate": 0.0031506495355447047, "loss": 2.5499, "step": 17328 }, { "crossentropy": 2.4283335208892822, "epoch": 0.6282265081206496, "grad_norm": 0.026681628078222275, "grad_norm_var": 7.879830633346236e-07, "learning_rate": 0.003150109669032767, "loss": 2.4534, "step": 17329 }, { "crossentropy": 2.4636831283569336, "epoch": 0.6282627610208816, "grad_norm": 0.026596996933221817, "grad_norm_var": 7.901587407571053e-07, "learning_rate": 0.0031495698275064068, "loss": 2.5221, "step": 17330 }, { "crossentropy": 2.4793872833251953, "epoch": 0.6282990139211136, "grad_norm": 0.027957340702414513, "grad_norm_var": 8.700031255938027e-07, "learning_rate": 0.0031490300109729137, "loss": 2.6381, "step": 17331 }, { "crossentropy": 2.453263998031616, "epoch": 0.6283352668213457, "grad_norm": 0.03068237192928791, "grad_norm_var": 1.6458864216893516e-06, "learning_rate": 0.0031484902194395792, "loss": 2.5286, "step": 17332 }, { "crossentropy": 2.661435604095459, "epoch": 0.6283715197215777, "grad_norm": 0.0278649739921093, "grad_norm_var": 1.6632598229375979e-06, "learning_rate": 0.0031479504529136944, "loss": 2.5437, "step": 17333 }, { "crossentropy": 2.5811779499053955, "epoch": 0.6284077726218097, "grad_norm": 0.026423929259181023, "grad_norm_var": 1.6847192713638117e-06, "learning_rate": 0.0031474107114025504, "loss": 2.5984, "step": 17334 }, { "crossentropy": 2.4118096828460693, "epoch": 0.6284440255220418, "grad_norm": 0.027079839259386063, "grad_norm_var": 1.6726041634805451e-06, "learning_rate": 0.003146870994913435, "loss": 2.6095, "step": 17335 }, { "crossentropy": 2.647284984588623, "epoch": 0.6284802784222738, "grad_norm": 0.02563536912202835, "grad_norm_var": 1.6487476848371188e-06, "learning_rate": 0.003146331303453639, "loss": 2.5216, "step": 17336 }, { "crossentropy": 2.473757743835449, "epoch": 0.6285165313225058, "grad_norm": 0.026434000581502914, "grad_norm_var": 1.5773484787348172e-06, "learning_rate": 0.0031457916370304507, "loss": 2.4826, "step": 17337 }, { "crossentropy": 2.558720827102661, "epoch": 0.6285527842227379, "grad_norm": 0.026655035093426704, "grad_norm_var": 1.5112997181561277e-06, "learning_rate": 0.0031452519956511617, "loss": 2.5237, "step": 17338 }, { "crossentropy": 2.42276668548584, "epoch": 0.6285890371229699, "grad_norm": 0.02739742025732994, "grad_norm_var": 1.5089734385926307e-06, "learning_rate": 0.0031447123793230582, "loss": 2.4599, "step": 17339 }, { "crossentropy": 2.681882858276367, "epoch": 0.6286252900232019, "grad_norm": 0.028935998678207397, "grad_norm_var": 1.6518523750642336e-06, "learning_rate": 0.00314417278805343, "loss": 2.6254, "step": 17340 }, { "crossentropy": 2.5671377182006836, "epoch": 0.6286615429234339, "grad_norm": 0.02585514634847641, "grad_norm_var": 1.7065150615072488e-06, "learning_rate": 0.003143633221849565, "loss": 2.5529, "step": 17341 }, { "crossentropy": 2.576413631439209, "epoch": 0.6286977958236659, "grad_norm": 0.02807212807238102, "grad_norm_var": 1.6999384564205747e-06, "learning_rate": 0.0031430936807187518, "loss": 2.5933, "step": 17342 }, { "crossentropy": 2.5422298908233643, "epoch": 0.6287340487238979, "grad_norm": 0.025732742622494698, "grad_norm_var": 1.7977970916279706e-06, "learning_rate": 0.0031425541646682743, "loss": 2.516, "step": 17343 }, { "crossentropy": 2.4830427169799805, "epoch": 0.62877030162413, "grad_norm": 0.027630163356661797, "grad_norm_var": 1.7114324172836364e-06, "learning_rate": 0.0031420146737054224, "loss": 2.555, "step": 17344 }, { "crossentropy": 2.574594259262085, "epoch": 0.628806554524362, "grad_norm": 0.027569526806473732, "grad_norm_var": 1.696117684208548e-06, "learning_rate": 0.0031414752078374798, "loss": 2.594, "step": 17345 }, { "crossentropy": 2.5348522663116455, "epoch": 0.628842807424594, "grad_norm": 0.02761019393801689, "grad_norm_var": 1.6676463878160611e-06, "learning_rate": 0.003140935767071737, "loss": 2.647, "step": 17346 }, { "crossentropy": 2.5086631774902344, "epoch": 0.628879060324826, "grad_norm": 0.024919599294662476, "grad_norm_var": 1.9967803365627716e-06, "learning_rate": 0.0031403963514154776, "loss": 2.4887, "step": 17347 }, { "crossentropy": 2.4938273429870605, "epoch": 0.628915313225058, "grad_norm": 0.028248663991689682, "grad_norm_var": 1.222725275114705e-06, "learning_rate": 0.0031398569608759876, "loss": 2.5133, "step": 17348 }, { "crossentropy": 2.584214687347412, "epoch": 0.62895156612529, "grad_norm": 0.032503146678209305, "grad_norm_var": 3.099683540628134e-06, "learning_rate": 0.0031393175954605524, "loss": 2.6045, "step": 17349 }, { "crossentropy": 2.4890029430389404, "epoch": 0.628987819025522, "grad_norm": 0.028214966878294945, "grad_norm_var": 3.0924111358099707e-06, "learning_rate": 0.0031387782551764555, "loss": 2.5387, "step": 17350 }, { "crossentropy": 2.4942080974578857, "epoch": 0.629024071925754, "grad_norm": 0.02775168977677822, "grad_norm_var": 3.0914166069742247e-06, "learning_rate": 0.0031382389400309856, "loss": 2.544, "step": 17351 }, { "crossentropy": 2.4986765384674072, "epoch": 0.6290603248259861, "grad_norm": 0.027048660442233086, "grad_norm_var": 2.8747095748219222e-06, "learning_rate": 0.0031376996500314227, "loss": 2.5381, "step": 17352 }, { "crossentropy": 2.557765007019043, "epoch": 0.6290965777262181, "grad_norm": 0.02638229727745056, "grad_norm_var": 2.882474914865189e-06, "learning_rate": 0.0031371603851850505, "loss": 2.6035, "step": 17353 }, { "crossentropy": 2.4053754806518555, "epoch": 0.6291328306264501, "grad_norm": 0.02680540643632412, "grad_norm_var": 2.866286145423019e-06, "learning_rate": 0.0031366211454991557, "loss": 2.4554, "step": 17354 }, { "crossentropy": 2.5641930103302, "epoch": 0.6291690835266821, "grad_norm": 0.025658881291747093, "grad_norm_var": 3.088791156742353e-06, "learning_rate": 0.0031360819309810195, "loss": 2.4289, "step": 17355 }, { "crossentropy": 2.4180502891540527, "epoch": 0.6292053364269141, "grad_norm": 0.028185104951262474, "grad_norm_var": 2.9736224308383903e-06, "learning_rate": 0.0031355427416379263, "loss": 2.4541, "step": 17356 }, { "crossentropy": 2.506971597671509, "epoch": 0.6292415893271461, "grad_norm": 0.025169873610138893, "grad_norm_var": 3.1429163318891314e-06, "learning_rate": 0.003135003577477157, "loss": 2.5153, "step": 17357 }, { "crossentropy": 2.4463117122650146, "epoch": 0.6292778422273781, "grad_norm": 0.02550184726715088, "grad_norm_var": 3.3062598683533094e-06, "learning_rate": 0.003134464438505994, "loss": 2.4586, "step": 17358 }, { "crossentropy": 2.487504005432129, "epoch": 0.6293140951276102, "grad_norm": 0.028095578774809837, "grad_norm_var": 3.1982071443286316e-06, "learning_rate": 0.0031339253247317234, "loss": 2.5289, "step": 17359 }, { "crossentropy": 2.4966514110565186, "epoch": 0.6293503480278422, "grad_norm": 0.027116956189274788, "grad_norm_var": 3.1941957495492644e-06, "learning_rate": 0.0031333862361616204, "loss": 2.5785, "step": 17360 }, { "crossentropy": 2.4842898845672607, "epoch": 0.6293866009280742, "grad_norm": 0.026135040447115898, "grad_norm_var": 3.27104372281455e-06, "learning_rate": 0.0031328471728029684, "loss": 2.4937, "step": 17361 }, { "crossentropy": 2.468843698501587, "epoch": 0.6294228538283063, "grad_norm": 0.028476357460021973, "grad_norm_var": 3.364238747047219e-06, "learning_rate": 0.003132308134663049, "loss": 2.509, "step": 17362 }, { "crossentropy": 2.5612709522247314, "epoch": 0.6294591067285383, "grad_norm": 0.02675609476864338, "grad_norm_var": 3.0011212606555984e-06, "learning_rate": 0.003131769121749143, "loss": 2.5668, "step": 17363 }, { "crossentropy": 2.520556688308716, "epoch": 0.6294953596287703, "grad_norm": 0.027204761281609535, "grad_norm_var": 2.9480667557308175e-06, "learning_rate": 0.0031312301340685306, "loss": 2.5929, "step": 17364 }, { "crossentropy": 2.657663583755493, "epoch": 0.6295316125290024, "grad_norm": 0.02601335383951664, "grad_norm_var": 1.0892691045471473e-06, "learning_rate": 0.0031306911716284906, "loss": 2.6555, "step": 17365 }, { "crossentropy": 2.5403716564178467, "epoch": 0.6295678654292344, "grad_norm": 0.027676081284880638, "grad_norm_var": 1.0134615649542381e-06, "learning_rate": 0.0031301522344363026, "loss": 2.607, "step": 17366 }, { "crossentropy": 2.5520451068878174, "epoch": 0.6296041183294664, "grad_norm": 0.026760276407003403, "grad_norm_var": 9.58822699975483e-07, "learning_rate": 0.003129613322499247, "loss": 2.5511, "step": 17367 }, { "crossentropy": 2.5129027366638184, "epoch": 0.6296403712296984, "grad_norm": 0.025995753705501556, "grad_norm_var": 9.948391751911223e-07, "learning_rate": 0.0031290744358246038, "loss": 2.5578, "step": 17368 }, { "crossentropy": 2.5265443325042725, "epoch": 0.6296766241299304, "grad_norm": 0.025818631052970886, "grad_norm_var": 1.0420199416660692e-06, "learning_rate": 0.003128535574419647, "loss": 2.5807, "step": 17369 }, { "crossentropy": 2.505107879638672, "epoch": 0.6297128770301624, "grad_norm": 0.02661830745637417, "grad_norm_var": 1.0418433485065293e-06, "learning_rate": 0.003127996738291658, "loss": 2.537, "step": 17370 }, { "crossentropy": 2.5182313919067383, "epoch": 0.6297491299303944, "grad_norm": 0.027040543034672737, "grad_norm_var": 9.695555424488045e-07, "learning_rate": 0.003127457927447914, "loss": 2.5432, "step": 17371 }, { "crossentropy": 2.5790889263153076, "epoch": 0.6297853828306265, "grad_norm": 0.02531522512435913, "grad_norm_var": 9.486767126099434e-07, "learning_rate": 0.003126919141895692, "loss": 2.5354, "step": 17372 }, { "crossentropy": 2.540250301361084, "epoch": 0.6298216357308585, "grad_norm": 0.02804231271147728, "grad_norm_var": 9.143651718546691e-07, "learning_rate": 0.0031263803816422697, "loss": 2.6083, "step": 17373 }, { "crossentropy": 2.549760103225708, "epoch": 0.6298578886310905, "grad_norm": 0.025947079062461853, "grad_norm_var": 8.50554825264924e-07, "learning_rate": 0.0031258416466949236, "loss": 2.4705, "step": 17374 }, { "crossentropy": 2.4139463901519775, "epoch": 0.6298941415313225, "grad_norm": 0.025926770642399788, "grad_norm_var": 7.737275659617232e-07, "learning_rate": 0.0031253029370609297, "loss": 2.4644, "step": 17375 }, { "crossentropy": 2.5592050552368164, "epoch": 0.6299303944315545, "grad_norm": 0.026516487821936607, "grad_norm_var": 7.610965046662138e-07, "learning_rate": 0.003124764252747567, "loss": 2.5328, "step": 17376 }, { "crossentropy": 2.361053705215454, "epoch": 0.6299666473317865, "grad_norm": 0.02783900685608387, "grad_norm_var": 8.277971212743007e-07, "learning_rate": 0.0031242255937621068, "loss": 2.4693, "step": 17377 }, { "crossentropy": 2.4549155235290527, "epoch": 0.6300029002320185, "grad_norm": 0.027822505682706833, "grad_norm_var": 7.037244500655935e-07, "learning_rate": 0.003123686960111827, "loss": 2.5376, "step": 17378 }, { "crossentropy": 2.349918842315674, "epoch": 0.6300391531322506, "grad_norm": 0.027321187779307365, "grad_norm_var": 7.274702382800326e-07, "learning_rate": 0.003123148351804002, "loss": 2.4201, "step": 17379 }, { "crossentropy": 2.4547324180603027, "epoch": 0.6300754060324826, "grad_norm": 0.026704134419560432, "grad_norm_var": 7.121877885577725e-07, "learning_rate": 0.0031226097688459064, "loss": 2.5273, "step": 17380 }, { "crossentropy": 2.563506841659546, "epoch": 0.6301116589327146, "grad_norm": 0.026013992726802826, "grad_norm_var": 7.12128482760419e-07, "learning_rate": 0.0031220712112448156, "loss": 2.4841, "step": 17381 }, { "crossentropy": 2.568967580795288, "epoch": 0.6301479118329466, "grad_norm": 0.026847589761018753, "grad_norm_var": 6.482979169505534e-07, "learning_rate": 0.003121532679008001, "loss": 2.5392, "step": 17382 }, { "crossentropy": 2.550642728805542, "epoch": 0.6301841647331786, "grad_norm": 0.0276382677257061, "grad_norm_var": 7.084370529177591e-07, "learning_rate": 0.0031209941721427404, "loss": 2.5011, "step": 17383 }, { "crossentropy": 2.3922834396362305, "epoch": 0.6302204176334106, "grad_norm": 0.027051478624343872, "grad_norm_var": 6.771365740350221e-07, "learning_rate": 0.0031204556906563063, "loss": 2.4378, "step": 17384 }, { "crossentropy": 2.4856221675872803, "epoch": 0.6302566705336426, "grad_norm": 0.026739398017525673, "grad_norm_var": 6.12225024397041e-07, "learning_rate": 0.003119917234555968, "loss": 2.4966, "step": 17385 }, { "crossentropy": 2.4449961185455322, "epoch": 0.6302929234338747, "grad_norm": 0.025707781314849854, "grad_norm_var": 6.905326566759938e-07, "learning_rate": 0.003119378803849001, "loss": 2.4958, "step": 17386 }, { "crossentropy": 2.5738213062286377, "epoch": 0.6303291763341067, "grad_norm": 0.026095349341630936, "grad_norm_var": 7.134853106348371e-07, "learning_rate": 0.003118840398542677, "loss": 2.4446, "step": 17387 }, { "crossentropy": 2.3251638412475586, "epoch": 0.6303654292343387, "grad_norm": 0.025720493867993355, "grad_norm_var": 6.478133680581082e-07, "learning_rate": 0.003118302018644269, "loss": 2.4099, "step": 17388 }, { "crossentropy": 2.5455398559570312, "epoch": 0.6304016821345708, "grad_norm": 0.027454739436507225, "grad_norm_var": 5.678232602843517e-07, "learning_rate": 0.0031177636641610473, "loss": 2.5163, "step": 17389 }, { "crossentropy": 2.538421630859375, "epoch": 0.6304379350348028, "grad_norm": 0.027074331417679787, "grad_norm_var": 5.327036622685636e-07, "learning_rate": 0.0031172253351002825, "loss": 2.5631, "step": 17390 }, { "crossentropy": 2.616197109222412, "epoch": 0.6304741879350348, "grad_norm": 0.025749314576387405, "grad_norm_var": 5.54850335546253e-07, "learning_rate": 0.0031166870314692486, "loss": 2.4876, "step": 17391 }, { "crossentropy": 2.466280460357666, "epoch": 0.6305104408352669, "grad_norm": 0.02801411785185337, "grad_norm_var": 6.447077754804034e-07, "learning_rate": 0.0031161487532752135, "loss": 2.5585, "step": 17392 }, { "crossentropy": 2.5351414680480957, "epoch": 0.6305466937354989, "grad_norm": 0.025850124657154083, "grad_norm_var": 6.328771781294867e-07, "learning_rate": 0.0031156105005254496, "loss": 2.509, "step": 17393 }, { "crossentropy": 2.528944969177246, "epoch": 0.6305829466357309, "grad_norm": 0.025998108088970184, "grad_norm_var": 5.770460238685986e-07, "learning_rate": 0.003115072273227225, "loss": 2.5109, "step": 17394 }, { "crossentropy": 2.6619772911071777, "epoch": 0.6306191995359629, "grad_norm": 0.027613606303930283, "grad_norm_var": 6.095818096841949e-07, "learning_rate": 0.003114534071387809, "loss": 2.5849, "step": 17395 }, { "crossentropy": 2.4792563915252686, "epoch": 0.6306554524361949, "grad_norm": 0.026080047711730003, "grad_norm_var": 6.287585782030018e-07, "learning_rate": 0.003113995895014472, "loss": 2.4573, "step": 17396 }, { "crossentropy": 2.7020280361175537, "epoch": 0.6306917053364269, "grad_norm": 0.026375308632850647, "grad_norm_var": 6.08539976920212e-07, "learning_rate": 0.003113457744114482, "loss": 2.5712, "step": 17397 }, { "crossentropy": 2.387279748916626, "epoch": 0.630727958236659, "grad_norm": 0.027182355523109436, "grad_norm_var": 6.254515678528911e-07, "learning_rate": 0.0031129196186951086, "loss": 2.5267, "step": 17398 }, { "crossentropy": 2.5469274520874023, "epoch": 0.630764211136891, "grad_norm": 0.025815367698669434, "grad_norm_var": 5.920968978949855e-07, "learning_rate": 0.0031123815187636197, "loss": 2.51, "step": 17399 }, { "crossentropy": 2.553774356842041, "epoch": 0.630800464037123, "grad_norm": 0.0257155392318964, "grad_norm_var": 6.112209874045328e-07, "learning_rate": 0.003111843444327283, "loss": 2.5569, "step": 17400 }, { "crossentropy": 2.6095175743103027, "epoch": 0.630836716937355, "grad_norm": 0.026805562898516655, "grad_norm_var": 6.140553923288424e-07, "learning_rate": 0.003111305395393367, "loss": 2.6494, "step": 17401 }, { "crossentropy": 2.4696567058563232, "epoch": 0.630872969837587, "grad_norm": 0.026139475405216217, "grad_norm_var": 5.82793754518362e-07, "learning_rate": 0.0031107673719691364, "loss": 2.5033, "step": 17402 }, { "crossentropy": 2.5965330600738525, "epoch": 0.630909222737819, "grad_norm": 0.025970077142119408, "grad_norm_var": 5.902033907728236e-07, "learning_rate": 0.0031102293740618604, "loss": 2.595, "step": 17403 }, { "crossentropy": 2.631577253341675, "epoch": 0.630945475638051, "grad_norm": 0.027877865359187126, "grad_norm_var": 6.648056138429277e-07, "learning_rate": 0.0031096914016788034, "loss": 2.5729, "step": 17404 }, { "crossentropy": 2.7376928329467773, "epoch": 0.630981728538283, "grad_norm": 0.026225468143820763, "grad_norm_var": 6.203433387890732e-07, "learning_rate": 0.0031091534548272322, "loss": 2.6435, "step": 17405 }, { "crossentropy": 2.6394951343536377, "epoch": 0.6310179814385151, "grad_norm": 0.02713715471327305, "grad_norm_var": 6.251460785690217e-07, "learning_rate": 0.003108615533514414, "loss": 2.5324, "step": 17406 }, { "crossentropy": 2.49210262298584, "epoch": 0.6310542343387471, "grad_norm": 0.026365628466010094, "grad_norm_var": 5.843763675643e-07, "learning_rate": 0.0031080776377476133, "loss": 2.5405, "step": 17407 }, { "crossentropy": 2.449097156524658, "epoch": 0.6310904872389791, "grad_norm": 0.02582935057580471, "grad_norm_var": 4.628610093574629e-07, "learning_rate": 0.0031075397675340956, "loss": 2.5274, "step": 17408 }, { "crossentropy": 2.5361030101776123, "epoch": 0.6311267401392111, "grad_norm": 0.026217347010970116, "grad_norm_var": 4.425876474076218e-07, "learning_rate": 0.0031070019228811242, "loss": 2.6003, "step": 17409 }, { "crossentropy": 2.6831088066101074, "epoch": 0.6311629930394431, "grad_norm": 0.026320582255721092, "grad_norm_var": 4.2925877636623177e-07, "learning_rate": 0.0031064641037959664, "loss": 2.5498, "step": 17410 }, { "crossentropy": 2.46958065032959, "epoch": 0.6311992459396751, "grad_norm": 0.027732079848647118, "grad_norm_var": 4.4805215121656214e-07, "learning_rate": 0.0031059263102858835, "loss": 2.4578, "step": 17411 }, { "crossentropy": 2.6428189277648926, "epoch": 0.6312354988399071, "grad_norm": 0.027415059506893158, "grad_norm_var": 4.870360668759171e-07, "learning_rate": 0.0031053885423581395, "loss": 2.5625, "step": 17412 }, { "crossentropy": 2.5428667068481445, "epoch": 0.6312717517401392, "grad_norm": 0.026266643777489662, "grad_norm_var": 4.905987081827226e-07, "learning_rate": 0.0031048508000199983, "loss": 2.583, "step": 17413 }, { "crossentropy": 2.6782572269439697, "epoch": 0.6313080046403712, "grad_norm": 0.02598036825656891, "grad_norm_var": 4.817117520894768e-07, "learning_rate": 0.003104313083278723, "loss": 2.5717, "step": 17414 }, { "crossentropy": 2.5299947261810303, "epoch": 0.6313442575406032, "grad_norm": 0.028276750817894936, "grad_norm_var": 6.395004867249295e-07, "learning_rate": 0.0031037753921415766, "loss": 2.5405, "step": 17415 }, { "crossentropy": 2.391324996948242, "epoch": 0.6313805104408353, "grad_norm": 0.028569413349032402, "grad_norm_var": 7.959339359488952e-07, "learning_rate": 0.0031032377266158218, "loss": 2.5338, "step": 17416 }, { "crossentropy": 2.579594373703003, "epoch": 0.6314167633410673, "grad_norm": 0.02713496796786785, "grad_norm_var": 8.020573479906722e-07, "learning_rate": 0.0031027000867087195, "loss": 2.572, "step": 17417 }, { "crossentropy": 2.663417339324951, "epoch": 0.6314530162412993, "grad_norm": 0.025936754420399666, "grad_norm_var": 8.235914411943689e-07, "learning_rate": 0.003102162472427533, "loss": 2.5988, "step": 17418 }, { "crossentropy": 2.5907349586486816, "epoch": 0.6314892691415314, "grad_norm": 0.02651732601225376, "grad_norm_var": 7.796751261006836e-07, "learning_rate": 0.0031016248837795212, "loss": 2.6453, "step": 17419 }, { "crossentropy": 2.5917694568634033, "epoch": 0.6315255220417634, "grad_norm": 0.026605933904647827, "grad_norm_var": 7.086208021034179e-07, "learning_rate": 0.003101087320771946, "loss": 2.5056, "step": 17420 }, { "crossentropy": 2.438819408416748, "epoch": 0.6315617749419954, "grad_norm": 0.026869919151067734, "grad_norm_var": 6.866559989824235e-07, "learning_rate": 0.0031005497834120676, "loss": 2.5269, "step": 17421 }, { "crossentropy": 2.5509729385375977, "epoch": 0.6315980278422274, "grad_norm": 0.027170991525053978, "grad_norm_var": 6.881428368409858e-07, "learning_rate": 0.0031000122717071477, "loss": 2.555, "step": 17422 }, { "crossentropy": 2.6323070526123047, "epoch": 0.6316342807424594, "grad_norm": 0.028230082243680954, "grad_norm_var": 7.910659981662788e-07, "learning_rate": 0.0030994747856644455, "loss": 2.6047, "step": 17423 }, { "crossentropy": 2.583852529525757, "epoch": 0.6316705336426914, "grad_norm": 0.027950188145041466, "grad_norm_var": 7.575270791763116e-07, "learning_rate": 0.00309893732529122, "loss": 2.597, "step": 17424 }, { "crossentropy": 2.6616897583007812, "epoch": 0.6317067865429234, "grad_norm": 0.026889750733971596, "grad_norm_var": 7.089244599772637e-07, "learning_rate": 0.003098399890594732, "loss": 2.624, "step": 17425 }, { "crossentropy": 2.5299339294433594, "epoch": 0.6317430394431555, "grad_norm": 0.027077483013272285, "grad_norm_var": 6.643887969509741e-07, "learning_rate": 0.003097862481582239, "loss": 2.5039, "step": 17426 }, { "crossentropy": 2.5308315753936768, "epoch": 0.6317792923433875, "grad_norm": 0.02704249881207943, "grad_norm_var": 6.418756590852258e-07, "learning_rate": 0.0030973250982609998, "loss": 2.5713, "step": 17427 }, { "crossentropy": 2.6348023414611816, "epoch": 0.6318155452436195, "grad_norm": 0.028687819838523865, "grad_norm_var": 7.930427078313284e-07, "learning_rate": 0.003096787740638272, "loss": 2.5754, "step": 17428 }, { "crossentropy": 2.4998693466186523, "epoch": 0.6318517981438515, "grad_norm": 0.02622109092772007, "grad_norm_var": 7.988439535588776e-07, "learning_rate": 0.003096250408721313, "loss": 2.441, "step": 17429 }, { "crossentropy": 2.5178298950195312, "epoch": 0.6318880510440835, "grad_norm": 0.026813697069883347, "grad_norm_var": 7.07000834261603e-07, "learning_rate": 0.0030957131025173825, "loss": 2.5633, "step": 17430 }, { "crossentropy": 2.5503456592559814, "epoch": 0.6319243039443155, "grad_norm": 0.025521088391542435, "grad_norm_var": 8.042326157626142e-07, "learning_rate": 0.003095175822033736, "loss": 2.5343, "step": 17431 }, { "crossentropy": 2.6069087982177734, "epoch": 0.6319605568445475, "grad_norm": 0.026240816339850426, "grad_norm_var": 6.799024079115478e-07, "learning_rate": 0.0030946385672776314, "loss": 2.6151, "step": 17432 }, { "crossentropy": 2.5191304683685303, "epoch": 0.6319968097447796, "grad_norm": 0.026634925976395607, "grad_norm_var": 6.819910664210629e-07, "learning_rate": 0.0030941013382563237, "loss": 2.5309, "step": 17433 }, { "crossentropy": 2.624540328979492, "epoch": 0.6320330626450116, "grad_norm": 0.025500135496258736, "grad_norm_var": 7.50019706651818e-07, "learning_rate": 0.0030935641349770695, "loss": 2.6577, "step": 17434 }, { "crossentropy": 2.5461485385894775, "epoch": 0.6320693155452436, "grad_norm": 0.02681228704750538, "grad_norm_var": 7.414552091323037e-07, "learning_rate": 0.0030930269574471272, "loss": 2.5479, "step": 17435 }, { "crossentropy": 2.5169332027435303, "epoch": 0.6321055684454756, "grad_norm": 0.02761770598590374, "grad_norm_var": 7.668719732511897e-07, "learning_rate": 0.0030924898056737477, "loss": 2.4497, "step": 17436 }, { "crossentropy": 2.5729081630706787, "epoch": 0.6321418213457076, "grad_norm": 0.026554996147751808, "grad_norm_var": 7.766442880965026e-07, "learning_rate": 0.003091952679664187, "loss": 2.5511, "step": 17437 }, { "crossentropy": 2.2454779148101807, "epoch": 0.6321780742459396, "grad_norm": 0.026837455108761787, "grad_norm_var": 7.731177420153626e-07, "learning_rate": 0.0030914155794257024, "loss": 2.3638, "step": 17438 }, { "crossentropy": 2.4635958671569824, "epoch": 0.6322143271461717, "grad_norm": 0.025846369564533234, "grad_norm_var": 7.101192049392691e-07, "learning_rate": 0.003090878504965547, "loss": 2.4735, "step": 17439 }, { "crossentropy": 2.653794765472412, "epoch": 0.6322505800464037, "grad_norm": 0.028209414333105087, "grad_norm_var": 7.552653857519857e-07, "learning_rate": 0.0030903414562909737, "loss": 2.625, "step": 17440 }, { "crossentropy": 2.5072672367095947, "epoch": 0.6322868329466357, "grad_norm": 0.028042376041412354, "grad_norm_var": 8.549018401563315e-07, "learning_rate": 0.003089804433409238, "loss": 2.5616, "step": 17441 }, { "crossentropy": 2.484567880630493, "epoch": 0.6323230858468677, "grad_norm": 0.028114568442106247, "grad_norm_var": 9.530594596296315e-07, "learning_rate": 0.003089267436327591, "loss": 2.5758, "step": 17442 }, { "crossentropy": 2.49009370803833, "epoch": 0.6323593387470998, "grad_norm": 0.028916645795106888, "grad_norm_var": 1.203552289246769e-06, "learning_rate": 0.0030887304650532897, "loss": 2.521, "step": 17443 }, { "crossentropy": 2.5183324813842773, "epoch": 0.6323955916473318, "grad_norm": 0.025872519239783287, "grad_norm_var": 1.078764784191707e-06, "learning_rate": 0.0030881935195935816, "loss": 2.5533, "step": 17444 }, { "crossentropy": 2.472909927368164, "epoch": 0.6324318445475638, "grad_norm": 0.02629021368920803, "grad_norm_var": 1.073177236347158e-06, "learning_rate": 0.0030876565999557203, "loss": 2.4548, "step": 17445 }, { "crossentropy": 2.5496928691864014, "epoch": 0.6324680974477959, "grad_norm": 0.02615259401500225, "grad_norm_var": 1.104934062368575e-06, "learning_rate": 0.00308711970614696, "loss": 2.5247, "step": 17446 }, { "crossentropy": 2.3299388885498047, "epoch": 0.6325043503480279, "grad_norm": 0.02626924030482769, "grad_norm_var": 1.0100711554574532e-06, "learning_rate": 0.0030865828381745515, "loss": 2.4119, "step": 17447 }, { "crossentropy": 2.650010585784912, "epoch": 0.6325406032482599, "grad_norm": 0.027223331853747368, "grad_norm_var": 9.880436838083228e-07, "learning_rate": 0.0030860459960457454, "loss": 2.5254, "step": 17448 }, { "crossentropy": 2.477708578109741, "epoch": 0.6325768561484919, "grad_norm": 0.026862850412726402, "grad_norm_var": 9.822951823274754e-07, "learning_rate": 0.003085509179767792, "loss": 2.4436, "step": 17449 }, { "crossentropy": 2.540867805480957, "epoch": 0.6326131090487239, "grad_norm": 0.026331834495067596, "grad_norm_var": 8.652834983783123e-07, "learning_rate": 0.003084972389347942, "loss": 2.5837, "step": 17450 }, { "crossentropy": 2.578545093536377, "epoch": 0.6326493619489559, "grad_norm": 0.026507724076509476, "grad_norm_var": 8.785879053753992e-07, "learning_rate": 0.003084435624793446, "loss": 2.5286, "step": 17451 }, { "crossentropy": 2.618177652359009, "epoch": 0.632685614849188, "grad_norm": 0.027084339410066605, "grad_norm_var": 8.508830366464488e-07, "learning_rate": 0.003083898886111557, "loss": 2.5447, "step": 17452 }, { "crossentropy": 2.655331611633301, "epoch": 0.63272186774942, "grad_norm": 0.027349479496479034, "grad_norm_var": 8.490430830575818e-07, "learning_rate": 0.003083362173309518, "loss": 2.5819, "step": 17453 }, { "crossentropy": 2.436255693435669, "epoch": 0.632758120649652, "grad_norm": 0.027883796021342278, "grad_norm_var": 8.955693494041289e-07, "learning_rate": 0.0030828254863945826, "loss": 2.4268, "step": 17454 }, { "crossentropy": 2.4535305500030518, "epoch": 0.632794373549884, "grad_norm": 0.02754482999444008, "grad_norm_var": 8.010651578629294e-07, "learning_rate": 0.0030822888253739977, "loss": 2.4923, "step": 17455 }, { "crossentropy": 2.527646064758301, "epoch": 0.632830626450116, "grad_norm": 0.026815839111804962, "grad_norm_var": 7.285637386509561e-07, "learning_rate": 0.0030817521902550132, "loss": 2.568, "step": 17456 }, { "crossentropy": 2.6065006256103516, "epoch": 0.632866879350348, "grad_norm": 0.027559742331504822, "grad_norm_var": 6.811205174250216e-07, "learning_rate": 0.0030812155810448763, "loss": 2.5804, "step": 17457 }, { "crossentropy": 2.475769519805908, "epoch": 0.63290313225058, "grad_norm": 0.025542285293340683, "grad_norm_var": 7.291059715123452e-07, "learning_rate": 0.0030806789977508337, "loss": 2.5339, "step": 17458 }, { "crossentropy": 2.509679079055786, "epoch": 0.632939385150812, "grad_norm": 0.025690563023090363, "grad_norm_var": 5.069512878599306e-07, "learning_rate": 0.003080142440380135, "loss": 2.499, "step": 17459 }, { "crossentropy": 2.4463415145874023, "epoch": 0.6329756380510441, "grad_norm": 0.030244851484894753, "grad_norm_var": 1.227351938991418e-06, "learning_rate": 0.003079605908940028, "loss": 2.5968, "step": 17460 }, { "crossentropy": 2.3484387397766113, "epoch": 0.6330118909512761, "grad_norm": 0.026273027062416077, "grad_norm_var": 1.2289043204387802e-06, "learning_rate": 0.0030790694034377544, "loss": 2.489, "step": 17461 }, { "crossentropy": 2.523665428161621, "epoch": 0.6330481438515081, "grad_norm": 0.027534058317542076, "grad_norm_var": 1.1997342195806115e-06, "learning_rate": 0.0030785329238805646, "loss": 2.4981, "step": 17462 }, { "crossentropy": 2.5637471675872803, "epoch": 0.6330843967517401, "grad_norm": 0.0257706455886364, "grad_norm_var": 1.266834299498568e-06, "learning_rate": 0.003077996470275704, "loss": 2.5198, "step": 17463 }, { "crossentropy": 2.509382724761963, "epoch": 0.6331206496519721, "grad_norm": 0.027117019519209862, "grad_norm_var": 1.26456916533095e-06, "learning_rate": 0.0030774600426304176, "loss": 2.4855, "step": 17464 }, { "crossentropy": 2.564157724380493, "epoch": 0.6331569025522041, "grad_norm": 0.02807648852467537, "grad_norm_var": 1.3332914922350424e-06, "learning_rate": 0.003076923640951951, "loss": 2.6088, "step": 17465 }, { "crossentropy": 2.541728973388672, "epoch": 0.6331931554524362, "grad_norm": 0.029097052291035652, "grad_norm_var": 1.5342758550865859e-06, "learning_rate": 0.0030763872652475478, "loss": 2.4488, "step": 17466 }, { "crossentropy": 2.5642333030700684, "epoch": 0.6332294083526682, "grad_norm": 0.02968725562095642, "grad_norm_var": 1.8490052364088094e-06, "learning_rate": 0.003075850915524455, "loss": 2.533, "step": 17467 }, { "crossentropy": 2.4214835166931152, "epoch": 0.6332656612529002, "grad_norm": 0.02601395919919014, "grad_norm_var": 1.973434207452176e-06, "learning_rate": 0.003075314591789915, "loss": 2.6149, "step": 17468 }, { "crossentropy": 2.6146881580352783, "epoch": 0.6333019141531323, "grad_norm": 0.0265900157392025, "grad_norm_var": 2.013338959213591e-06, "learning_rate": 0.003074778294051174, "loss": 2.5634, "step": 17469 }, { "crossentropy": 2.484438896179199, "epoch": 0.6333381670533643, "grad_norm": 0.026273254305124283, "grad_norm_var": 2.0586992714676764e-06, "learning_rate": 0.003074242022315472, "loss": 2.5118, "step": 17470 }, { "crossentropy": 2.543182134628296, "epoch": 0.6333744199535963, "grad_norm": 0.02672179788351059, "grad_norm_var": 2.067521821299175e-06, "learning_rate": 0.003073705776590055, "loss": 2.4713, "step": 17471 }, { "crossentropy": 2.5067996978759766, "epoch": 0.6334106728538283, "grad_norm": 0.026162512600421906, "grad_norm_var": 2.1266172640976854e-06, "learning_rate": 0.003073169556882164, "loss": 2.5608, "step": 17472 }, { "crossentropy": 2.4564177989959717, "epoch": 0.6334469257540604, "grad_norm": 0.025729240849614143, "grad_norm_var": 2.2353400917387528e-06, "learning_rate": 0.003072633363199042, "loss": 2.4602, "step": 17473 }, { "crossentropy": 2.529649496078491, "epoch": 0.6334831786542924, "grad_norm": 0.026486316695809364, "grad_norm_var": 2.1034335175210138e-06, "learning_rate": 0.003072097195547931, "loss": 2.5708, "step": 17474 }, { "crossentropy": 2.323702573776245, "epoch": 0.6335194315545244, "grad_norm": 0.02660125307738781, "grad_norm_var": 1.9851282395609195e-06, "learning_rate": 0.003071561053936074, "loss": 2.4574, "step": 17475 }, { "crossentropy": 2.5194780826568604, "epoch": 0.6335556844547564, "grad_norm": 0.026560157537460327, "grad_norm_var": 1.3125588918656723e-06, "learning_rate": 0.0030710249383707113, "loss": 2.4796, "step": 17476 }, { "crossentropy": 2.571403741836548, "epoch": 0.6335919373549884, "grad_norm": 0.026274308562278748, "grad_norm_var": 1.3124487254813239e-06, "learning_rate": 0.0030704888488590843, "loss": 2.5037, "step": 17477 }, { "crossentropy": 2.4626317024230957, "epoch": 0.6336281902552204, "grad_norm": 0.025905204936861992, "grad_norm_var": 1.3445751923417289e-06, "learning_rate": 0.0030699527854084335, "loss": 2.5059, "step": 17478 }, { "crossentropy": 2.441422939300537, "epoch": 0.6336644431554525, "grad_norm": 0.02671671286225319, "grad_norm_var": 1.268569338354284e-06, "learning_rate": 0.0030694167480259994, "loss": 2.5035, "step": 17479 }, { "crossentropy": 2.5651745796203613, "epoch": 0.6337006960556845, "grad_norm": 0.02777092531323433, "grad_norm_var": 1.3163265609228975e-06, "learning_rate": 0.0030688807367190213, "loss": 2.5562, "step": 17480 }, { "crossentropy": 2.3476204872131348, "epoch": 0.6337369489559165, "grad_norm": 0.026503078639507294, "grad_norm_var": 1.2277332751830928e-06, "learning_rate": 0.0030683447514947395, "loss": 2.405, "step": 17481 }, { "crossentropy": 2.4576568603515625, "epoch": 0.6337732018561485, "grad_norm": 0.02701541595160961, "grad_norm_var": 8.660920164376874e-07, "learning_rate": 0.0030678087923603926, "loss": 2.4536, "step": 17482 }, { "crossentropy": 2.5938408374786377, "epoch": 0.6338094547563805, "grad_norm": 0.02717021480202675, "grad_norm_var": 2.5556543296240535e-07, "learning_rate": 0.003067272859323221, "loss": 2.5297, "step": 17483 }, { "crossentropy": 2.515644073486328, "epoch": 0.6338457076566125, "grad_norm": 0.02618417516350746, "grad_norm_var": 2.456441089007984e-07, "learning_rate": 0.003066736952390462, "loss": 2.5919, "step": 17484 }, { "crossentropy": 2.618056058883667, "epoch": 0.6338819605568445, "grad_norm": 0.027450066059827805, "grad_norm_var": 2.974337899632236e-07, "learning_rate": 0.0030662010715693556, "loss": 2.5767, "step": 17485 }, { "crossentropy": 2.569673538208008, "epoch": 0.6339182134570766, "grad_norm": 0.030626270920038223, "grad_norm_var": 1.2948208350058021e-06, "learning_rate": 0.003065665216867138, "loss": 2.5266, "step": 17486 }, { "crossentropy": 2.4526724815368652, "epoch": 0.6339544663573086, "grad_norm": 0.027584563940763474, "grad_norm_var": 1.3245996347714528e-06, "learning_rate": 0.0030651293882910462, "loss": 2.4504, "step": 17487 }, { "crossentropy": 2.439098358154297, "epoch": 0.6339907192575406, "grad_norm": 0.02587158791720867, "grad_norm_var": 1.3593218628355404e-06, "learning_rate": 0.0030645935858483186, "loss": 2.4918, "step": 17488 }, { "crossentropy": 2.4123575687408447, "epoch": 0.6340269721577726, "grad_norm": 0.025277165696024895, "grad_norm_var": 1.4428510478905454e-06, "learning_rate": 0.003064057809546192, "loss": 2.4724, "step": 17489 }, { "crossentropy": 2.643883466720581, "epoch": 0.6340632250580046, "grad_norm": 0.027186917141079903, "grad_norm_var": 1.4372354573974691e-06, "learning_rate": 0.003063522059391901, "loss": 2.583, "step": 17490 }, { "crossentropy": 2.6210670471191406, "epoch": 0.6340994779582366, "grad_norm": 0.027242396026849747, "grad_norm_var": 1.4357961066394867e-06, "learning_rate": 0.0030629863353926847, "loss": 2.5109, "step": 17491 }, { "crossentropy": 2.584855794906616, "epoch": 0.6341357308584686, "grad_norm": 0.026491930708289146, "grad_norm_var": 1.439712521131518e-06, "learning_rate": 0.0030624506375557777, "loss": 2.6442, "step": 17492 }, { "crossentropy": 2.5380098819732666, "epoch": 0.6341719837587007, "grad_norm": 0.0264801736921072, "grad_norm_var": 1.4236927672118022e-06, "learning_rate": 0.003061914965888415, "loss": 2.578, "step": 17493 }, { "crossentropy": 2.5066065788269043, "epoch": 0.6342082366589327, "grad_norm": 0.026319071650505066, "grad_norm_var": 1.3757893490946397e-06, "learning_rate": 0.0030613793203978323, "loss": 2.4786, "step": 17494 }, { "crossentropy": 2.4951157569885254, "epoch": 0.6342444895591647, "grad_norm": 0.026626192033290863, "grad_norm_var": 1.379638118909933e-06, "learning_rate": 0.0030608437010912633, "loss": 2.4916, "step": 17495 }, { "crossentropy": 2.598196506500244, "epoch": 0.6342807424593968, "grad_norm": 0.026606827974319458, "grad_norm_var": 1.3427369189777013e-06, "learning_rate": 0.003060308107975943, "loss": 2.5252, "step": 17496 }, { "crossentropy": 2.543947219848633, "epoch": 0.6343169953596288, "grad_norm": 0.025766048580408096, "grad_norm_var": 1.4171432697745433e-06, "learning_rate": 0.0030597725410591047, "loss": 2.5256, "step": 17497 }, { "crossentropy": 2.5361945629119873, "epoch": 0.6343532482598608, "grad_norm": 0.02671339362859726, "grad_norm_var": 1.416935706077454e-06, "learning_rate": 0.003059237000347982, "loss": 2.6221, "step": 17498 }, { "crossentropy": 2.362865447998047, "epoch": 0.6343895011600929, "grad_norm": 0.024902142584323883, "grad_norm_var": 1.6415523619929627e-06, "learning_rate": 0.00305870148584981, "loss": 2.3868, "step": 17499 }, { "crossentropy": 2.605966567993164, "epoch": 0.6344257540603249, "grad_norm": 0.02564007043838501, "grad_norm_var": 1.6980617481335367e-06, "learning_rate": 0.00305816599757182, "loss": 2.4954, "step": 17500 }, { "crossentropy": 2.409985303878784, "epoch": 0.6344620069605569, "grad_norm": 0.0270234402269125, "grad_norm_var": 1.6652949493522102e-06, "learning_rate": 0.003057630535521246, "loss": 2.4591, "step": 17501 }, { "crossentropy": 2.383148431777954, "epoch": 0.6344982598607889, "grad_norm": 0.0266096368432045, "grad_norm_var": 5.427332210496937e-07, "learning_rate": 0.003057095099705319, "loss": 2.4833, "step": 17502 }, { "crossentropy": 2.499180793762207, "epoch": 0.6345345127610209, "grad_norm": 0.02663743682205677, "grad_norm_var": 4.487465564677878e-07, "learning_rate": 0.0030565596901312715, "loss": 2.4785, "step": 17503 }, { "crossentropy": 2.375506639480591, "epoch": 0.6345707656612529, "grad_norm": 0.027126500383019447, "grad_norm_var": 4.692729317513072e-07, "learning_rate": 0.003056024306806334, "loss": 2.5343, "step": 17504 }, { "crossentropy": 2.572970390319824, "epoch": 0.6346070185614849, "grad_norm": 0.026193859055638313, "grad_norm_var": 3.8264928156488536e-07, "learning_rate": 0.003055488949737737, "loss": 2.5594, "step": 17505 }, { "crossentropy": 2.7232046127319336, "epoch": 0.634643271461717, "grad_norm": 0.027184516191482544, "grad_norm_var": 3.8242105871374996e-07, "learning_rate": 0.003054953618932714, "loss": 2.6377, "step": 17506 }, { "crossentropy": 2.53273868560791, "epoch": 0.634679524361949, "grad_norm": 0.028022320941090584, "grad_norm_var": 5.004765800528753e-07, "learning_rate": 0.003054418314398494, "loss": 2.5331, "step": 17507 }, { "crossentropy": 2.569697618484497, "epoch": 0.634715777262181, "grad_norm": 0.027836639434099197, "grad_norm_var": 6.081949770525911e-07, "learning_rate": 0.003053883036142307, "loss": 2.6084, "step": 17508 }, { "crossentropy": 2.5689494609832764, "epoch": 0.634752030162413, "grad_norm": 0.027780892327427864, "grad_norm_var": 6.92198620532806e-07, "learning_rate": 0.003053347784171383, "loss": 2.5801, "step": 17509 }, { "crossentropy": 2.5991063117980957, "epoch": 0.634788283062645, "grad_norm": 0.026782266795635223, "grad_norm_var": 6.828965846336169e-07, "learning_rate": 0.0030528125584929505, "loss": 2.6319, "step": 17510 }, { "crossentropy": 2.4703450202941895, "epoch": 0.634824535962877, "grad_norm": 0.02784145623445511, "grad_norm_var": 7.606873655615724e-07, "learning_rate": 0.0030522773591142426, "loss": 2.5019, "step": 17511 }, { "crossentropy": 2.467254400253296, "epoch": 0.634860788863109, "grad_norm": 0.026590216904878616, "grad_norm_var": 7.611141017058523e-07, "learning_rate": 0.0030517421860424823, "loss": 2.5712, "step": 17512 }, { "crossentropy": 2.46773362159729, "epoch": 0.634897041763341, "grad_norm": 0.027389274910092354, "grad_norm_var": 7.040325006004427e-07, "learning_rate": 0.0030512070392848993, "loss": 2.529, "step": 17513 }, { "crossentropy": 2.5754761695861816, "epoch": 0.6349332946635731, "grad_norm": 0.02664787881076336, "grad_norm_var": 7.058620711144122e-07, "learning_rate": 0.003050671918848724, "loss": 2.5338, "step": 17514 }, { "crossentropy": 2.525409698486328, "epoch": 0.6349695475638051, "grad_norm": 0.029236355796456337, "grad_norm_var": 7.323127462983363e-07, "learning_rate": 0.0030501368247411826, "loss": 2.5323, "step": 17515 }, { "crossentropy": 2.5520529747009277, "epoch": 0.6350058004640371, "grad_norm": 0.027394330129027367, "grad_norm_var": 5.693904470603639e-07, "learning_rate": 0.003049601756969502, "loss": 2.5739, "step": 17516 }, { "crossentropy": 2.452252149581909, "epoch": 0.6350420533642691, "grad_norm": 0.028805071488022804, "grad_norm_var": 7.095492446114344e-07, "learning_rate": 0.0030490667155409104, "loss": 2.5853, "step": 17517 }, { "crossentropy": 2.451284646987915, "epoch": 0.6350783062645011, "grad_norm": 0.02683732844889164, "grad_norm_var": 6.894046538104589e-07, "learning_rate": 0.003048531700462631, "loss": 2.4571, "step": 17518 }, { "crossentropy": 2.5741333961486816, "epoch": 0.6351145591647331, "grad_norm": 0.026909681037068367, "grad_norm_var": 6.665689816863814e-07, "learning_rate": 0.003047996711741897, "loss": 2.5074, "step": 17519 }, { "crossentropy": 2.4412450790405273, "epoch": 0.6351508120649652, "grad_norm": 0.02639012411236763, "grad_norm_var": 7.284086629933314e-07, "learning_rate": 0.003047461749385927, "loss": 2.5354, "step": 17520 }, { "crossentropy": 2.695037841796875, "epoch": 0.6351870649651972, "grad_norm": 0.028365500271320343, "grad_norm_var": 6.840138267837862e-07, "learning_rate": 0.003046926813401948, "loss": 2.5489, "step": 17521 }, { "crossentropy": 2.656832456588745, "epoch": 0.6352233178654292, "grad_norm": 0.02670265920460224, "grad_norm_var": 7.188501682247235e-07, "learning_rate": 0.0030463919037971876, "loss": 2.606, "step": 17522 }, { "crossentropy": 2.4681453704833984, "epoch": 0.6352595707656613, "grad_norm": 0.027366921305656433, "grad_norm_var": 6.974970244717626e-07, "learning_rate": 0.003045857020578869, "loss": 2.5454, "step": 17523 }, { "crossentropy": 2.431495428085327, "epoch": 0.6352958236658933, "grad_norm": 0.026543742045760155, "grad_norm_var": 7.31835265896837e-07, "learning_rate": 0.003045322163754217, "loss": 2.5397, "step": 17524 }, { "crossentropy": 2.4800498485565186, "epoch": 0.6353320765661253, "grad_norm": 0.02632862888276577, "grad_norm_var": 7.800189274125823e-07, "learning_rate": 0.0030447873333304554, "loss": 2.5307, "step": 17525 }, { "crossentropy": 2.6149041652679443, "epoch": 0.6353683294663574, "grad_norm": 0.028270026668906212, "grad_norm_var": 8.23945426992031e-07, "learning_rate": 0.003044252529314807, "loss": 2.5736, "step": 17526 }, { "crossentropy": 2.544431447982788, "epoch": 0.6354045823665894, "grad_norm": 0.02639259770512581, "grad_norm_var": 8.604365646903833e-07, "learning_rate": 0.003043717751714499, "loss": 2.5165, "step": 17527 }, { "crossentropy": 2.5111868381500244, "epoch": 0.6354408352668214, "grad_norm": 0.026109566912055016, "grad_norm_var": 9.178411628627907e-07, "learning_rate": 0.00304318300053675, "loss": 2.4925, "step": 17528 }, { "crossentropy": 2.508699893951416, "epoch": 0.6354770881670534, "grad_norm": 0.0280969999730587, "grad_norm_var": 9.641184130251459e-07, "learning_rate": 0.0030426482757887827, "loss": 2.5352, "step": 17529 }, { "crossentropy": 2.5217068195343018, "epoch": 0.6355133410672854, "grad_norm": 0.0257065761834383, "grad_norm_var": 1.0981843929835777e-06, "learning_rate": 0.003042113577477822, "loss": 2.5797, "step": 17530 }, { "crossentropy": 2.35237717628479, "epoch": 0.6355495939675174, "grad_norm": 0.02856992371380329, "grad_norm_var": 9.46419247858339e-07, "learning_rate": 0.003041578905611089, "loss": 2.3692, "step": 17531 }, { "crossentropy": 2.638747453689575, "epoch": 0.6355858468677494, "grad_norm": 0.02749442495405674, "grad_norm_var": 9.499812182732579e-07, "learning_rate": 0.003041044260195804, "loss": 2.5822, "step": 17532 }, { "crossentropy": 2.582787036895752, "epoch": 0.6356220997679815, "grad_norm": 0.02725793421268463, "grad_norm_var": 7.644815045795839e-07, "learning_rate": 0.0030405096412391896, "loss": 2.6328, "step": 17533 }, { "crossentropy": 2.6439380645751953, "epoch": 0.6356583526682135, "grad_norm": 0.026876531541347504, "grad_norm_var": 7.632886337788214e-07, "learning_rate": 0.003039975048748465, "loss": 2.6049, "step": 17534 }, { "crossentropy": 2.675131320953369, "epoch": 0.6356946055684455, "grad_norm": 0.0270861629396677, "grad_norm_var": 7.610777157686926e-07, "learning_rate": 0.003039440482730853, "loss": 2.6895, "step": 17535 }, { "crossentropy": 2.53682804107666, "epoch": 0.6357308584686775, "grad_norm": 0.027150148525834084, "grad_norm_var": 7.255076122452663e-07, "learning_rate": 0.0030389059431935746, "loss": 2.4933, "step": 17536 }, { "crossentropy": 2.6681199073791504, "epoch": 0.6357671113689095, "grad_norm": 0.02774938941001892, "grad_norm_var": 6.489618558928762e-07, "learning_rate": 0.0030383714301438444, "loss": 2.7103, "step": 17537 }, { "crossentropy": 2.4931936264038086, "epoch": 0.6358033642691415, "grad_norm": 0.02672259695827961, "grad_norm_var": 6.479134367996681e-07, "learning_rate": 0.0030378369435888853, "loss": 2.5114, "step": 17538 }, { "crossentropy": 2.610278844833374, "epoch": 0.6358396171693735, "grad_norm": 0.026301955804228783, "grad_norm_var": 6.819805499131872e-07, "learning_rate": 0.003037302483535917, "loss": 2.566, "step": 17539 }, { "crossentropy": 2.6335067749023438, "epoch": 0.6358758700696056, "grad_norm": 0.027055688202381134, "grad_norm_var": 6.644133840840703e-07, "learning_rate": 0.0030367680499921567, "loss": 2.6152, "step": 17540 }, { "crossentropy": 2.4987683296203613, "epoch": 0.6359121229698376, "grad_norm": 0.027086136862635612, "grad_norm_var": 6.250874796821677e-07, "learning_rate": 0.003036233642964823, "loss": 2.5468, "step": 17541 }, { "crossentropy": 2.4853782653808594, "epoch": 0.6359483758700696, "grad_norm": 0.026880253106355667, "grad_norm_var": 5.327779736288696e-07, "learning_rate": 0.0030356992624611333, "loss": 2.4425, "step": 17542 }, { "crossentropy": 2.5885584354400635, "epoch": 0.6359846287703016, "grad_norm": 0.027364695444703102, "grad_norm_var": 5.08762374503563e-07, "learning_rate": 0.003035164908488307, "loss": 2.5923, "step": 17543 }, { "crossentropy": 2.5097086429595947, "epoch": 0.6360208816705336, "grad_norm": 0.025692712515592575, "grad_norm_var": 5.743555327809239e-07, "learning_rate": 0.0030346305810535624, "loss": 2.5354, "step": 17544 }, { "crossentropy": 2.4756019115448, "epoch": 0.6360571345707656, "grad_norm": 0.026844915002584457, "grad_norm_var": 5.005948738830464e-07, "learning_rate": 0.0030340962801641115, "loss": 2.4671, "step": 17545 }, { "crossentropy": 2.5826239585876465, "epoch": 0.6360933874709976, "grad_norm": 0.025275908410549164, "grad_norm_var": 5.858844443017037e-07, "learning_rate": 0.003033562005827174, "loss": 2.5475, "step": 17546 }, { "crossentropy": 2.5542423725128174, "epoch": 0.6361296403712297, "grad_norm": 0.02779589220881462, "grad_norm_var": 4.574973509705819e-07, "learning_rate": 0.0030330277580499668, "loss": 2.5684, "step": 17547 }, { "crossentropy": 2.6146669387817383, "epoch": 0.6361658932714617, "grad_norm": 0.025831229984760284, "grad_norm_var": 5.018285530700508e-07, "learning_rate": 0.003032493536839704, "loss": 2.5264, "step": 17548 }, { "crossentropy": 2.343597888946533, "epoch": 0.6362021461716937, "grad_norm": 0.026487194001674652, "grad_norm_var": 4.930020060508444e-07, "learning_rate": 0.0030319593422036017, "loss": 2.4319, "step": 17549 }, { "crossentropy": 2.5585570335388184, "epoch": 0.6362383990719258, "grad_norm": 0.027737339958548546, "grad_norm_var": 5.523917387342206e-07, "learning_rate": 0.003031425174148874, "loss": 2.513, "step": 17550 }, { "crossentropy": 2.3920419216156006, "epoch": 0.6362746519721578, "grad_norm": 0.025821829214692116, "grad_norm_var": 6.068225197684707e-07, "learning_rate": 0.003030891032682739, "loss": 2.503, "step": 17551 }, { "crossentropy": 2.5290911197662354, "epoch": 0.6363109048723898, "grad_norm": 0.026214633136987686, "grad_norm_var": 6.100334784375193e-07, "learning_rate": 0.0030303569178124073, "loss": 2.5311, "step": 17552 }, { "crossentropy": 2.663909912109375, "epoch": 0.6363471577726219, "grad_norm": 0.025370284914970398, "grad_norm_var": 6.242173698391949e-07, "learning_rate": 0.0030298228295450964, "loss": 2.4993, "step": 17553 }, { "crossentropy": 2.4751453399658203, "epoch": 0.6363834106728539, "grad_norm": 0.02623658999800682, "grad_norm_var": 6.265128074321349e-07, "learning_rate": 0.0030292887678880164, "loss": 2.5813, "step": 17554 }, { "crossentropy": 2.645264148712158, "epoch": 0.6364196635730859, "grad_norm": 0.025565261021256447, "grad_norm_var": 6.798690123147131e-07, "learning_rate": 0.003028754732848383, "loss": 2.5804, "step": 17555 }, { "crossentropy": 2.6068592071533203, "epoch": 0.6364559164733179, "grad_norm": 0.026225542649626732, "grad_norm_var": 6.563180900820882e-07, "learning_rate": 0.003028220724433408, "loss": 2.6357, "step": 17556 }, { "crossentropy": 2.568964958190918, "epoch": 0.6364921693735499, "grad_norm": 0.027180036529898643, "grad_norm_var": 6.654357625428727e-07, "learning_rate": 0.003027686742650304, "loss": 2.4911, "step": 17557 }, { "crossentropy": 2.499262809753418, "epoch": 0.6365284222737819, "grad_norm": 0.027509218081831932, "grad_norm_var": 7.297839580003614e-07, "learning_rate": 0.0030271527875062832, "loss": 2.6266, "step": 17558 }, { "crossentropy": 2.63179612159729, "epoch": 0.6365646751740139, "grad_norm": 0.025782356038689613, "grad_norm_var": 6.926741629174286e-07, "learning_rate": 0.0030266188590085587, "loss": 2.5451, "step": 17559 }, { "crossentropy": 2.432572603225708, "epoch": 0.636600928074246, "grad_norm": 0.0263905618339777, "grad_norm_var": 6.621218921448735e-07, "learning_rate": 0.0030260849571643413, "loss": 2.4552, "step": 17560 }, { "crossentropy": 2.55900502204895, "epoch": 0.636637180974478, "grad_norm": 0.026614727452397346, "grad_norm_var": 6.515266653053421e-07, "learning_rate": 0.003025551081980843, "loss": 2.5499, "step": 17561 }, { "crossentropy": 2.335491180419922, "epoch": 0.63667343387471, "grad_norm": 0.027246598154306412, "grad_norm_var": 6.048229969179367e-07, "learning_rate": 0.003025017233465273, "loss": 2.4738, "step": 17562 }, { "crossentropy": 2.717188596725464, "epoch": 0.636709686774942, "grad_norm": 0.03339362144470215, "grad_norm_var": 3.530007345630192e-06, "learning_rate": 0.003024483411624842, "loss": 2.6017, "step": 17563 }, { "crossentropy": 2.6451449394226074, "epoch": 0.636745939675174, "grad_norm": 0.026800306513905525, "grad_norm_var": 3.4570094700652384e-06, "learning_rate": 0.00302394961646676, "loss": 2.6408, "step": 17564 }, { "crossentropy": 2.5512325763702393, "epoch": 0.636782192575406, "grad_norm": 0.026756783947348595, "grad_norm_var": 3.4463178196497546e-06, "learning_rate": 0.0030234158479982383, "loss": 2.5215, "step": 17565 }, { "crossentropy": 2.74566650390625, "epoch": 0.636818445475638, "grad_norm": 0.027699271216988564, "grad_norm_var": 3.442299590098114e-06, "learning_rate": 0.0030228821062264835, "loss": 2.6149, "step": 17566 }, { "crossentropy": 2.4888548851013184, "epoch": 0.63685469837587, "grad_norm": 0.026703717187047005, "grad_norm_var": 3.361135067882085e-06, "learning_rate": 0.003022348391158707, "loss": 2.4805, "step": 17567 }, { "crossentropy": 2.5144598484039307, "epoch": 0.6368909512761021, "grad_norm": 0.0270765982568264, "grad_norm_var": 3.3195406427028213e-06, "learning_rate": 0.003021814702802117, "loss": 2.5566, "step": 17568 }, { "crossentropy": 2.7007646560668945, "epoch": 0.6369272041763341, "grad_norm": 0.028064055368304253, "grad_norm_var": 3.1753422527684243e-06, "learning_rate": 0.0030212810411639223, "loss": 2.5951, "step": 17569 }, { "crossentropy": 2.4957261085510254, "epoch": 0.6369634570765661, "grad_norm": 0.027268998324871063, "grad_norm_var": 3.1089520026523928e-06, "learning_rate": 0.0030207474062513297, "loss": 2.5232, "step": 17570 }, { "crossentropy": 2.5353000164031982, "epoch": 0.6369997099767981, "grad_norm": 0.02589236944913864, "grad_norm_var": 3.0414036639578397e-06, "learning_rate": 0.0030202137980715462, "loss": 2.4683, "step": 17571 }, { "crossentropy": 2.477443218231201, "epoch": 0.6370359628770301, "grad_norm": 0.028090093284845352, "grad_norm_var": 2.9946042153776444e-06, "learning_rate": 0.0030196802166317805, "loss": 2.5119, "step": 17572 }, { "crossentropy": 2.5719544887542725, "epoch": 0.6370722157772621, "grad_norm": 0.025837497785687447, "grad_norm_var": 3.1474049178340306e-06, "learning_rate": 0.003019146661939238, "loss": 2.5588, "step": 17573 }, { "crossentropy": 2.4941372871398926, "epoch": 0.6371084686774942, "grad_norm": 0.025777975097298622, "grad_norm_var": 3.291150129824255e-06, "learning_rate": 0.0030186131340011247, "loss": 2.5031, "step": 17574 }, { "crossentropy": 2.570674419403076, "epoch": 0.6371447215777262, "grad_norm": 0.027709132060408592, "grad_norm_var": 3.1558420496855324e-06, "learning_rate": 0.00301807963282465, "loss": 2.5113, "step": 17575 }, { "crossentropy": 2.5170469284057617, "epoch": 0.6371809744779582, "grad_norm": 0.026039741933345795, "grad_norm_var": 3.2076010451448683e-06, "learning_rate": 0.003017546158417016, "loss": 2.5903, "step": 17576 }, { "crossentropy": 2.440429925918579, "epoch": 0.6372172273781903, "grad_norm": 0.025937041267752647, "grad_norm_var": 3.2991931263311267e-06, "learning_rate": 0.0030170127107854304, "loss": 2.4411, "step": 17577 }, { "crossentropy": 2.5915451049804688, "epoch": 0.6372534802784223, "grad_norm": 0.025500573217868805, "grad_norm_var": 3.4947976495985074e-06, "learning_rate": 0.003016479289937099, "loss": 2.5658, "step": 17578 }, { "crossentropy": 2.3663856983184814, "epoch": 0.6372897331786543, "grad_norm": 0.027572989463806152, "grad_norm_var": 7.738740178583978e-07, "learning_rate": 0.003015945895879223, "loss": 2.4002, "step": 17579 }, { "crossentropy": 2.6556830406188965, "epoch": 0.6373259860788864, "grad_norm": 0.027023756876587868, "grad_norm_var": 7.771394424450299e-07, "learning_rate": 0.003015412528619009, "loss": 2.5732, "step": 17580 }, { "crossentropy": 2.555586576461792, "epoch": 0.6373622389791184, "grad_norm": 0.027226051315665245, "grad_norm_var": 7.876097894705339e-07, "learning_rate": 0.0030148791881636607, "loss": 2.5917, "step": 17581 }, { "crossentropy": 2.593151569366455, "epoch": 0.6373984918793504, "grad_norm": 0.02764519676566124, "grad_norm_var": 7.815881852038866e-07, "learning_rate": 0.00301434587452038, "loss": 2.529, "step": 17582 }, { "crossentropy": 2.6905829906463623, "epoch": 0.6374347447795824, "grad_norm": 0.02655387856066227, "grad_norm_var": 7.856214693767345e-07, "learning_rate": 0.0030138125876963727, "loss": 2.6127, "step": 17583 }, { "crossentropy": 2.5342211723327637, "epoch": 0.6374709976798144, "grad_norm": 0.02811490371823311, "grad_norm_var": 8.876948050147322e-07, "learning_rate": 0.0030132793276988414, "loss": 2.5033, "step": 17584 }, { "crossentropy": 2.547231912612915, "epoch": 0.6375072505800464, "grad_norm": 0.027225390076637268, "grad_norm_var": 8.004691298122881e-07, "learning_rate": 0.0030127460945349872, "loss": 2.5576, "step": 17585 }, { "crossentropy": 2.472057342529297, "epoch": 0.6375435034802784, "grad_norm": 0.027855629101395607, "grad_norm_var": 8.556520928485375e-07, "learning_rate": 0.0030122128882120137, "loss": 2.5176, "step": 17586 }, { "crossentropy": 2.3136024475097656, "epoch": 0.6375797563805105, "grad_norm": 0.026531590148806572, "grad_norm_var": 7.974289228974828e-07, "learning_rate": 0.00301167970873712, "loss": 2.4183, "step": 17587 }, { "crossentropy": 2.474433660507202, "epoch": 0.6376160092807425, "grad_norm": 0.028189675882458687, "grad_norm_var": 8.13650032938038e-07, "learning_rate": 0.00301114655611751, "loss": 2.5848, "step": 17588 }, { "crossentropy": 2.520777940750122, "epoch": 0.6376522621809745, "grad_norm": 0.027499886229634285, "grad_norm_var": 7.46141195008485e-07, "learning_rate": 0.0030106134303603837, "loss": 2.5457, "step": 17589 }, { "crossentropy": 2.5109663009643555, "epoch": 0.6376885150812065, "grad_norm": 0.02707403525710106, "grad_norm_var": 6.355941869207037e-07, "learning_rate": 0.003010080331472941, "loss": 2.4713, "step": 17590 }, { "crossentropy": 2.4492993354797363, "epoch": 0.6377247679814385, "grad_norm": 0.025690896436572075, "grad_norm_var": 7.279305873430687e-07, "learning_rate": 0.0030095472594623847, "loss": 2.4956, "step": 17591 }, { "crossentropy": 2.582017660140991, "epoch": 0.6377610208816705, "grad_norm": 0.02590576559305191, "grad_norm_var": 7.458501326094615e-07, "learning_rate": 0.0030090142143359127, "loss": 2.4709, "step": 17592 }, { "crossentropy": 2.612945318222046, "epoch": 0.6377972737819025, "grad_norm": 0.02780904248356819, "grad_norm_var": 7.066224951880636e-07, "learning_rate": 0.0030084811961007254, "loss": 2.603, "step": 17593 }, { "crossentropy": 2.448932647705078, "epoch": 0.6378335266821346, "grad_norm": 0.027272028848528862, "grad_norm_var": 5.276438828040551e-07, "learning_rate": 0.0030079482047640216, "loss": 2.5062, "step": 17594 }, { "crossentropy": 2.485377311706543, "epoch": 0.6378697795823666, "grad_norm": 0.025942955166101456, "grad_norm_var": 6.12516271857955e-07, "learning_rate": 0.003007415240333002, "loss": 2.5538, "step": 17595 }, { "crossentropy": 2.6491336822509766, "epoch": 0.6379060324825986, "grad_norm": 0.0268207099288702, "grad_norm_var": 6.170906227068722e-07, "learning_rate": 0.003006882302814862, "loss": 2.5199, "step": 17596 }, { "crossentropy": 2.5352609157562256, "epoch": 0.6379422853828306, "grad_norm": 0.031180977821350098, "grad_norm_var": 1.669138458161521e-06, "learning_rate": 0.0030063493922168004, "loss": 2.5025, "step": 17597 }, { "crossentropy": 2.4743802547454834, "epoch": 0.6379785382830626, "grad_norm": 0.025523653253912926, "grad_norm_var": 1.8618628317562525e-06, "learning_rate": 0.0030058165085460158, "loss": 2.4699, "step": 17598 }, { "crossentropy": 2.4171319007873535, "epoch": 0.6380147911832946, "grad_norm": 0.02582280896604061, "grad_norm_var": 1.958193334794161e-06, "learning_rate": 0.003005283651809706, "loss": 2.5052, "step": 17599 }, { "crossentropy": 2.602538585662842, "epoch": 0.6380510440835266, "grad_norm": 0.027831850573420525, "grad_norm_var": 1.926926312617999e-06, "learning_rate": 0.003004750822015068, "loss": 2.5236, "step": 17600 }, { "crossentropy": 2.508730173110962, "epoch": 0.6380872969837587, "grad_norm": 0.029942244291305542, "grad_norm_var": 2.420618391051471e-06, "learning_rate": 0.003004218019169298, "loss": 2.552, "step": 17601 }, { "crossentropy": 2.6174914836883545, "epoch": 0.6381235498839907, "grad_norm": 0.02797345630824566, "grad_norm_var": 2.4301231384057467e-06, "learning_rate": 0.0030036852432795925, "loss": 2.5604, "step": 17602 }, { "crossentropy": 2.6386184692382812, "epoch": 0.6381598027842227, "grad_norm": 0.02747862972319126, "grad_norm_var": 2.3874800140728674e-06, "learning_rate": 0.003003152494353147, "loss": 2.6393, "step": 17603 }, { "crossentropy": 2.6832969188690186, "epoch": 0.6381960556844548, "grad_norm": 0.02726651541888714, "grad_norm_var": 2.3401488548925133e-06, "learning_rate": 0.003002619772397158, "loss": 2.6482, "step": 17604 }, { "crossentropy": 2.454901695251465, "epoch": 0.6382323085846868, "grad_norm": 0.027503428980708122, "grad_norm_var": 2.3402371075971367e-06, "learning_rate": 0.003002087077418818, "loss": 2.469, "step": 17605 }, { "crossentropy": 2.5491721630096436, "epoch": 0.6382685614849188, "grad_norm": 0.025610024109482765, "grad_norm_var": 2.5212196119181085e-06, "learning_rate": 0.0030015544094253257, "loss": 2.5014, "step": 17606 }, { "crossentropy": 2.451859474182129, "epoch": 0.6383048143851509, "grad_norm": 0.026620598509907722, "grad_norm_var": 2.3852671133253126e-06, "learning_rate": 0.003001021768423874, "loss": 2.5073, "step": 17607 }, { "crossentropy": 2.584352493286133, "epoch": 0.6383410672853829, "grad_norm": 0.027432432398200035, "grad_norm_var": 2.250889399255512e-06, "learning_rate": 0.0030004891544216563, "loss": 2.5694, "step": 17608 }, { "crossentropy": 2.601729393005371, "epoch": 0.6383773201856149, "grad_norm": 0.02775188535451889, "grad_norm_var": 2.2478007020737547e-06, "learning_rate": 0.0029999565674258665, "loss": 2.6436, "step": 17609 }, { "crossentropy": 2.4334394931793213, "epoch": 0.6384135730858469, "grad_norm": 0.026730354875326157, "grad_norm_var": 2.2734593146285365e-06, "learning_rate": 0.0029994240074436994, "loss": 2.4368, "step": 17610 }, { "crossentropy": 2.4832284450531006, "epoch": 0.6384498259860789, "grad_norm": 0.026375925168395042, "grad_norm_var": 2.2045522542054235e-06, "learning_rate": 0.002998891474482348, "loss": 2.5304, "step": 17611 }, { "crossentropy": 2.5746853351593018, "epoch": 0.6384860788863109, "grad_norm": 0.02836388349533081, "grad_norm_var": 2.2410697196674577e-06, "learning_rate": 0.0029983589685490027, "loss": 2.483, "step": 17612 }, { "crossentropy": 2.512453556060791, "epoch": 0.6385223317865429, "grad_norm": 0.026824867352843285, "grad_norm_var": 1.2676188836241442e-06, "learning_rate": 0.0029978264896508558, "loss": 2.5004, "step": 17613 }, { "crossentropy": 2.4884486198425293, "epoch": 0.638558584686775, "grad_norm": 0.026618463918566704, "grad_norm_var": 1.09917289555575e-06, "learning_rate": 0.002997294037795102, "loss": 2.4987, "step": 17614 }, { "crossentropy": 2.522909164428711, "epoch": 0.638594837587007, "grad_norm": 0.02692667581140995, "grad_norm_var": 9.639177040415258e-07, "learning_rate": 0.0029967616129889314, "loss": 2.505, "step": 17615 }, { "crossentropy": 2.481863498687744, "epoch": 0.638631090487239, "grad_norm": 0.027479631826281548, "grad_norm_var": 9.480187485602064e-07, "learning_rate": 0.0029962292152395354, "loss": 2.5211, "step": 17616 }, { "crossentropy": 2.47502064704895, "epoch": 0.638667343387471, "grad_norm": 0.025983933359384537, "grad_norm_var": 5.360390949375275e-07, "learning_rate": 0.0029956968445541046, "loss": 2.5529, "step": 17617 }, { "crossentropy": 2.598808526992798, "epoch": 0.638703596287703, "grad_norm": 0.026459913700819016, "grad_norm_var": 4.946307794050405e-07, "learning_rate": 0.0029951645009398276, "loss": 2.6086, "step": 17618 }, { "crossentropy": 2.5074779987335205, "epoch": 0.638739849187935, "grad_norm": 0.025839919224381447, "grad_norm_var": 5.500658589123782e-07, "learning_rate": 0.002994632184403899, "loss": 2.5031, "step": 17619 }, { "crossentropy": 2.513946294784546, "epoch": 0.638776102088167, "grad_norm": 0.02807941660284996, "grad_norm_var": 6.352345461558064e-07, "learning_rate": 0.002994099894953507, "loss": 2.5321, "step": 17620 }, { "crossentropy": 2.526921510696411, "epoch": 0.6388123549883991, "grad_norm": 0.028262455016374588, "grad_norm_var": 7.310375743455836e-07, "learning_rate": 0.002993567632595837, "loss": 2.5644, "step": 17621 }, { "crossentropy": 2.365696668624878, "epoch": 0.6388486078886311, "grad_norm": 0.026593070477247238, "grad_norm_var": 6.144880294082014e-07, "learning_rate": 0.0029930353973380824, "loss": 2.3397, "step": 17622 }, { "crossentropy": 2.5945582389831543, "epoch": 0.6388848607888631, "grad_norm": 0.02786162495613098, "grad_norm_var": 6.444158725025003e-07, "learning_rate": 0.00299250318918743, "loss": 2.6193, "step": 17623 }, { "crossentropy": 2.6427972316741943, "epoch": 0.6389211136890951, "grad_norm": 0.026375189423561096, "grad_norm_var": 6.672774961473278e-07, "learning_rate": 0.002991971008151069, "loss": 2.6352, "step": 17624 }, { "crossentropy": 2.5113229751586914, "epoch": 0.6389573665893271, "grad_norm": 0.026149103417992592, "grad_norm_var": 6.741949616075367e-07, "learning_rate": 0.002991438854236187, "loss": 2.5264, "step": 17625 }, { "crossentropy": 2.519470453262329, "epoch": 0.6389936194895591, "grad_norm": 0.0269523523747921, "grad_norm_var": 6.712835210563913e-07, "learning_rate": 0.00299090672744997, "loss": 2.452, "step": 17626 }, { "crossentropy": 2.4029541015625, "epoch": 0.6390298723897911, "grad_norm": 0.02737598866224289, "grad_norm_var": 6.576897644230972e-07, "learning_rate": 0.002990374627799607, "loss": 2.4507, "step": 17627 }, { "crossentropy": 2.4571666717529297, "epoch": 0.6390661252900232, "grad_norm": 0.026309123262763023, "grad_norm_var": 5.504151065662075e-07, "learning_rate": 0.002989842555292287, "loss": 2.4764, "step": 17628 }, { "crossentropy": 2.2914881706237793, "epoch": 0.6391023781902552, "grad_norm": 0.026748433709144592, "grad_norm_var": 5.513495741474306e-07, "learning_rate": 0.0029893105099351907, "loss": 2.4555, "step": 17629 }, { "crossentropy": 2.603252410888672, "epoch": 0.6391386310904872, "grad_norm": 0.029920227825641632, "grad_norm_var": 1.1193453125438266e-06, "learning_rate": 0.0029887784917355087, "loss": 2.4094, "step": 17630 }, { "crossentropy": 2.578188419342041, "epoch": 0.6391748839907193, "grad_norm": 0.025998054072260857, "grad_norm_var": 1.1925122683798753e-06, "learning_rate": 0.0029882465007004247, "loss": 2.5527, "step": 17631 }, { "crossentropy": 2.565920114517212, "epoch": 0.6392111368909513, "grad_norm": 0.02776283770799637, "grad_norm_var": 1.2147196579851324e-06, "learning_rate": 0.002987714536837125, "loss": 2.5701, "step": 17632 }, { "crossentropy": 2.5231595039367676, "epoch": 0.6392473897911833, "grad_norm": 0.027499986812472343, "grad_norm_var": 1.1444972098766422e-06, "learning_rate": 0.002987182600152794, "loss": 2.5418, "step": 17633 }, { "crossentropy": 2.443589687347412, "epoch": 0.6392836426914154, "grad_norm": 0.02860886976122856, "grad_norm_var": 1.2391961953174864e-06, "learning_rate": 0.0029866506906546155, "loss": 2.4893, "step": 17634 }, { "crossentropy": 2.45758056640625, "epoch": 0.6393198955916474, "grad_norm": 0.02927745319902897, "grad_norm_var": 1.3217989648981228e-06, "learning_rate": 0.002986118808349776, "loss": 2.5392, "step": 17635 }, { "crossentropy": 2.606095552444458, "epoch": 0.6393561484918794, "grad_norm": 0.025682473555207253, "grad_norm_var": 1.4911948149893642e-06, "learning_rate": 0.0029855869532454584, "loss": 2.619, "step": 17636 }, { "crossentropy": 2.527780294418335, "epoch": 0.6393924013921114, "grad_norm": 0.027035685256123543, "grad_norm_var": 1.4337281879111025e-06, "learning_rate": 0.0029850551253488467, "loss": 2.5078, "step": 17637 }, { "crossentropy": 2.508957624435425, "epoch": 0.6394286542923434, "grad_norm": 0.027076641097664833, "grad_norm_var": 1.4053806044286275e-06, "learning_rate": 0.0029845233246671224, "loss": 2.5109, "step": 17638 }, { "crossentropy": 2.408205986022949, "epoch": 0.6394649071925754, "grad_norm": 0.02753867208957672, "grad_norm_var": 1.3872688486820639e-06, "learning_rate": 0.002983991551207469, "loss": 2.5026, "step": 17639 }, { "crossentropy": 2.5172343254089355, "epoch": 0.6395011600928074, "grad_norm": 0.028048984706401825, "grad_norm_var": 1.3627952162123883e-06, "learning_rate": 0.002983459804977069, "loss": 2.5782, "step": 17640 }, { "crossentropy": 2.5892741680145264, "epoch": 0.6395374129930395, "grad_norm": 0.027844442054629326, "grad_norm_var": 1.265536562411231e-06, "learning_rate": 0.002982928085983105, "loss": 2.6593, "step": 17641 }, { "crossentropy": 2.494192123413086, "epoch": 0.6395736658932715, "grad_norm": 0.026289308443665504, "grad_norm_var": 1.339661656125562e-06, "learning_rate": 0.002982396394232757, "loss": 2.453, "step": 17642 }, { "crossentropy": 2.3130908012390137, "epoch": 0.6396099187935035, "grad_norm": 0.02690078876912594, "grad_norm_var": 1.3577404888206445e-06, "learning_rate": 0.002981864729733208, "loss": 2.4172, "step": 17643 }, { "crossentropy": 2.3788740634918213, "epoch": 0.6396461716937355, "grad_norm": 0.026242926716804504, "grad_norm_var": 1.3677209885101186e-06, "learning_rate": 0.0029813330924916394, "loss": 2.4763, "step": 17644 }, { "crossentropy": 2.4987776279449463, "epoch": 0.6396824245939675, "grad_norm": 0.027287574484944344, "grad_norm_var": 1.3387094162584101e-06, "learning_rate": 0.0029808014825152313, "loss": 2.453, "step": 17645 }, { "crossentropy": 2.4324417114257812, "epoch": 0.6397186774941995, "grad_norm": 0.028596939519047737, "grad_norm_var": 1.010268637079018e-06, "learning_rate": 0.002980269899811162, "loss": 2.4473, "step": 17646 }, { "crossentropy": 2.461608409881592, "epoch": 0.6397549303944315, "grad_norm": 0.027930017560720444, "grad_norm_var": 8.938187823088103e-07, "learning_rate": 0.002979738344386614, "loss": 2.5753, "step": 17647 }, { "crossentropy": 2.609640598297119, "epoch": 0.6397911832946636, "grad_norm": 0.027402332052588463, "grad_norm_var": 8.881768412297703e-07, "learning_rate": 0.0029792068162487646, "loss": 2.5696, "step": 17648 }, { "crossentropy": 2.464374542236328, "epoch": 0.6398274361948956, "grad_norm": 0.02599690482020378, "grad_norm_var": 1.0201527262609345e-06, "learning_rate": 0.002978675315404794, "loss": 2.5312, "step": 17649 }, { "crossentropy": 2.6789565086364746, "epoch": 0.6398636890951276, "grad_norm": 0.0351201631128788, "grad_norm_var": 4.754195097792825e-06, "learning_rate": 0.0029781438418618796, "loss": 2.6109, "step": 17650 }, { "crossentropy": 2.536611795425415, "epoch": 0.6398999419953596, "grad_norm": 0.026099609211087227, "grad_norm_var": 4.7453470840766804e-06, "learning_rate": 0.0029776123956272017, "loss": 2.5056, "step": 17651 }, { "crossentropy": 2.5199620723724365, "epoch": 0.6399361948955916, "grad_norm": 0.026560353115200996, "grad_norm_var": 4.5727721311671885e-06, "learning_rate": 0.0029770809767079376, "loss": 2.6026, "step": 17652 }, { "crossentropy": 2.651115655899048, "epoch": 0.6399724477958236, "grad_norm": 0.026687877252697945, "grad_norm_var": 4.607578840129002e-06, "learning_rate": 0.002976549585111267, "loss": 2.5037, "step": 17653 }, { "crossentropy": 2.3854217529296875, "epoch": 0.6400087006960556, "grad_norm": 0.02682550996541977, "grad_norm_var": 4.629093999581575e-06, "learning_rate": 0.002976018220844362, "loss": 2.4718, "step": 17654 }, { "crossentropy": 2.5140316486358643, "epoch": 0.6400449535962877, "grad_norm": 0.02717619203031063, "grad_norm_var": 4.639582513806898e-06, "learning_rate": 0.0029754868839144033, "loss": 2.6117, "step": 17655 }, { "crossentropy": 2.6073405742645264, "epoch": 0.6400812064965197, "grad_norm": 0.026081467047333717, "grad_norm_var": 4.75406828307257e-06, "learning_rate": 0.0029749555743285674, "loss": 2.5717, "step": 17656 }, { "crossentropy": 2.498387098312378, "epoch": 0.6401174593967517, "grad_norm": 0.028515245765447617, "grad_norm_var": 4.8183519312095895e-06, "learning_rate": 0.0029744242920940285, "loss": 2.4872, "step": 17657 }, { "crossentropy": 2.581214666366577, "epoch": 0.6401537122969838, "grad_norm": 0.026616359129548073, "grad_norm_var": 4.77302435604963e-06, "learning_rate": 0.002973893037217963, "loss": 2.6011, "step": 17658 }, { "crossentropy": 2.4174554347991943, "epoch": 0.6401899651972158, "grad_norm": 0.026766614988446236, "grad_norm_var": 4.7849143261885845e-06, "learning_rate": 0.002973361809707547, "loss": 2.5191, "step": 17659 }, { "crossentropy": 2.6604080200195312, "epoch": 0.6402262180974478, "grad_norm": 0.028515100479125977, "grad_norm_var": 4.728527378814774e-06, "learning_rate": 0.002972830609569956, "loss": 2.576, "step": 17660 }, { "crossentropy": 2.6603457927703857, "epoch": 0.6402624709976799, "grad_norm": 0.02695648744702339, "grad_norm_var": 4.750766002826325e-06, "learning_rate": 0.0029722994368123635, "loss": 2.6189, "step": 17661 }, { "crossentropy": 2.590369701385498, "epoch": 0.6402987238979119, "grad_norm": 0.02592981606721878, "grad_norm_var": 4.8463283044474755e-06, "learning_rate": 0.0029717682914419453, "loss": 2.5134, "step": 17662 }, { "crossentropy": 2.5243427753448486, "epoch": 0.6403349767981439, "grad_norm": 0.027266183868050575, "grad_norm_var": 4.831273229916041e-06, "learning_rate": 0.002971237173465874, "loss": 2.4582, "step": 17663 }, { "crossentropy": 2.5603716373443604, "epoch": 0.6403712296983759, "grad_norm": 0.026894252747297287, "grad_norm_var": 4.8477413432046016e-06, "learning_rate": 0.0029707060828913223, "loss": 2.5443, "step": 17664 }, { "crossentropy": 2.5899908542633057, "epoch": 0.6404074825986079, "grad_norm": 0.027102919295430183, "grad_norm_var": 4.720894834704424e-06, "learning_rate": 0.002970175019725465, "loss": 2.5534, "step": 17665 }, { "crossentropy": 2.4251651763916016, "epoch": 0.6404437354988399, "grad_norm": 0.025423219427466393, "grad_norm_var": 6.739254365178045e-07, "learning_rate": 0.002969643983975474, "loss": 2.4475, "step": 17666 }, { "crossentropy": 2.5660829544067383, "epoch": 0.6404799883990719, "grad_norm": 0.02615361101925373, "grad_norm_var": 6.687869633597206e-07, "learning_rate": 0.0029691129756485224, "loss": 2.5072, "step": 17667 }, { "crossentropy": 2.5375277996063232, "epoch": 0.640516241299304, "grad_norm": 0.027431316673755646, "grad_norm_var": 6.834965717263657e-07, "learning_rate": 0.0029685819947517823, "loss": 2.5814, "step": 17668 }, { "crossentropy": 2.542053699493408, "epoch": 0.640552494199536, "grad_norm": 0.02735712006688118, "grad_norm_var": 6.928837280648544e-07, "learning_rate": 0.0029680510412924254, "loss": 2.5073, "step": 17669 }, { "crossentropy": 2.2161381244659424, "epoch": 0.640588747099768, "grad_norm": 0.02613336220383644, "grad_norm_var": 7.33226510801245e-07, "learning_rate": 0.002967520115277624, "loss": 2.3523, "step": 17670 }, { "crossentropy": 2.596364736557007, "epoch": 0.640625, "grad_norm": 0.026820944622159004, "grad_norm_var": 7.277928576188142e-07, "learning_rate": 0.0029669892167145462, "loss": 2.5187, "step": 17671 }, { "crossentropy": 2.530669927597046, "epoch": 0.640661252900232, "grad_norm": 0.02667301334440708, "grad_norm_var": 6.872524602833755e-07, "learning_rate": 0.0029664583456103645, "loss": 2.4847, "step": 17672 }, { "crossentropy": 2.5669734477996826, "epoch": 0.640697505800464, "grad_norm": 0.02528521977365017, "grad_norm_var": 6.478684798001302e-07, "learning_rate": 0.0029659275019722486, "loss": 2.5413, "step": 17673 }, { "crossentropy": 2.4012975692749023, "epoch": 0.640733758700696, "grad_norm": 0.025598447769880295, "grad_norm_var": 7.250442311501972e-07, "learning_rate": 0.002965396685807368, "loss": 2.4899, "step": 17674 }, { "crossentropy": 2.4661576747894287, "epoch": 0.6407700116009281, "grad_norm": 0.02630981057882309, "grad_norm_var": 7.306317981685725e-07, "learning_rate": 0.002964865897122894, "loss": 2.4873, "step": 17675 }, { "crossentropy": 2.6061763763427734, "epoch": 0.6408062645011601, "grad_norm": 0.026239482685923576, "grad_norm_var": 4.779690285505733e-07, "learning_rate": 0.0029643351359259934, "loss": 2.6096, "step": 17676 }, { "crossentropy": 2.54528546333313, "epoch": 0.6408425174013921, "grad_norm": 0.02890898659825325, "grad_norm_var": 8.41985407553277e-07, "learning_rate": 0.002963804402223837, "loss": 2.5818, "step": 17677 }, { "crossentropy": 2.484422206878662, "epoch": 0.6408787703016241, "grad_norm": 0.02723035030066967, "grad_norm_var": 8.322677841589848e-07, "learning_rate": 0.002963273696023592, "loss": 2.4328, "step": 17678 }, { "crossentropy": 2.5566294193267822, "epoch": 0.6409150232018561, "grad_norm": 0.027445113286376, "grad_norm_var": 8.483306829451849e-07, "learning_rate": 0.002962743017332427, "loss": 2.5561, "step": 17679 }, { "crossentropy": 2.386427164077759, "epoch": 0.6409512761020881, "grad_norm": 0.026186393573880196, "grad_norm_var": 8.60175940768417e-07, "learning_rate": 0.002962212366157509, "loss": 2.4674, "step": 17680 }, { "crossentropy": 2.523452043533325, "epoch": 0.6409875290023201, "grad_norm": 0.027961548417806625, "grad_norm_var": 9.588261011515687e-07, "learning_rate": 0.002961681742506005, "loss": 2.5381, "step": 17681 }, { "crossentropy": 2.593538999557495, "epoch": 0.6410237819025522, "grad_norm": 0.025298738852143288, "grad_norm_var": 9.809421851151758e-07, "learning_rate": 0.0029611511463850815, "loss": 2.5629, "step": 17682 }, { "crossentropy": 2.455725908279419, "epoch": 0.6410600348027842, "grad_norm": 0.02998441457748413, "grad_norm_var": 1.6243685230458877e-06, "learning_rate": 0.0029606205778019068, "loss": 2.4893, "step": 17683 }, { "crossentropy": 2.445119857788086, "epoch": 0.6410962877030162, "grad_norm": 0.02782392129302025, "grad_norm_var": 1.6602962235728418e-06, "learning_rate": 0.0029600900367636462, "loss": 2.4654, "step": 17684 }, { "crossentropy": 2.4836063385009766, "epoch": 0.6411325406032483, "grad_norm": 0.027065733447670937, "grad_norm_var": 1.6499237013543744e-06, "learning_rate": 0.0029595595232774653, "loss": 2.5098, "step": 17685 }, { "crossentropy": 2.3968610763549805, "epoch": 0.6411687935034803, "grad_norm": 0.02605362795293331, "grad_norm_var": 1.6588470889121927e-06, "learning_rate": 0.002959029037350529, "loss": 2.4464, "step": 17686 }, { "crossentropy": 2.530440092086792, "epoch": 0.6412050464037123, "grad_norm": 0.027088595554232597, "grad_norm_var": 1.6594197478652627e-06, "learning_rate": 0.0029584985789900043, "loss": 2.5293, "step": 17687 }, { "crossentropy": 2.4939687252044678, "epoch": 0.6412412993039444, "grad_norm": 0.024891627952456474, "grad_norm_var": 1.9228506475589084e-06, "learning_rate": 0.0029579681482030528, "loss": 2.5291, "step": 17688 }, { "crossentropy": 2.4200315475463867, "epoch": 0.6412775522041764, "grad_norm": 0.02608782984316349, "grad_norm_var": 1.7971825062130034e-06, "learning_rate": 0.0029574377449968414, "loss": 2.4335, "step": 17689 }, { "crossentropy": 2.639333486557007, "epoch": 0.6413138051044084, "grad_norm": 0.028376737609505653, "grad_norm_var": 1.802686192591676e-06, "learning_rate": 0.0029569073693785307, "loss": 2.6241, "step": 17690 }, { "crossentropy": 2.51708984375, "epoch": 0.6413500580046404, "grad_norm": 0.02657068334519863, "grad_norm_var": 1.7808611475702194e-06, "learning_rate": 0.0029563770213552887, "loss": 2.6243, "step": 17691 }, { "crossentropy": 2.468951463699341, "epoch": 0.6413863109048724, "grad_norm": 0.031464628875255585, "grad_norm_var": 2.904552092017213e-06, "learning_rate": 0.002955846700934276, "loss": 2.4958, "step": 17692 }, { "crossentropy": 2.4990270137786865, "epoch": 0.6414225638051044, "grad_norm": 0.026953186839818954, "grad_norm_var": 2.750755237041847e-06, "learning_rate": 0.0029553164081226554, "loss": 2.5267, "step": 17693 }, { "crossentropy": 2.567976236343384, "epoch": 0.6414588167053364, "grad_norm": 0.026200756430625916, "grad_norm_var": 2.8238519482743154e-06, "learning_rate": 0.0029547861429275895, "loss": 2.5193, "step": 17694 }, { "crossentropy": 2.60386061668396, "epoch": 0.6414950696055685, "grad_norm": 0.02760949730873108, "grad_norm_var": 2.83056587791045e-06, "learning_rate": 0.0029542559053562416, "loss": 2.571, "step": 17695 }, { "crossentropy": 2.541672706604004, "epoch": 0.6415313225058005, "grad_norm": 0.02957259677350521, "grad_norm_var": 3.0777840884050095e-06, "learning_rate": 0.0029537256954157713, "loss": 2.5171, "step": 17696 }, { "crossentropy": 2.616518020629883, "epoch": 0.6415675754060325, "grad_norm": 0.027334682643413544, "grad_norm_var": 3.058564612027946e-06, "learning_rate": 0.0029531955131133414, "loss": 2.5417, "step": 17697 }, { "crossentropy": 2.4320244789123535, "epoch": 0.6416038283062645, "grad_norm": 0.025562869384884834, "grad_norm_var": 2.988974011430241e-06, "learning_rate": 0.0029526653584561104, "loss": 2.4687, "step": 17698 }, { "crossentropy": 2.6233346462249756, "epoch": 0.6416400812064965, "grad_norm": 0.02596859075129032, "grad_norm_var": 2.6211725811691363e-06, "learning_rate": 0.0029521352314512417, "loss": 2.5788, "step": 17699 }, { "crossentropy": 2.427135467529297, "epoch": 0.6416763341067285, "grad_norm": 0.026209749281406403, "grad_norm_var": 2.642010403908155e-06, "learning_rate": 0.0029516051321058947, "loss": 2.5194, "step": 17700 }, { "crossentropy": 2.4853594303131104, "epoch": 0.6417125870069605, "grad_norm": 0.026494715362787247, "grad_norm_var": 2.6621972944300033e-06, "learning_rate": 0.0029510750604272292, "loss": 2.4517, "step": 17701 }, { "crossentropy": 2.471318244934082, "epoch": 0.6417488399071926, "grad_norm": 0.029279856011271477, "grad_norm_var": 2.8937969626681755e-06, "learning_rate": 0.002950545016422404, "loss": 2.4427, "step": 17702 }, { "crossentropy": 2.4491400718688965, "epoch": 0.6417850928074246, "grad_norm": 0.02565181627869606, "grad_norm_var": 3.0497464180193405e-06, "learning_rate": 0.0029500150000985777, "loss": 2.4939, "step": 17703 }, { "crossentropy": 2.650270938873291, "epoch": 0.6418213457076566, "grad_norm": 0.027355559170246124, "grad_norm_var": 2.6907456169328643e-06, "learning_rate": 0.002949485011462912, "loss": 2.6112, "step": 17704 }, { "crossentropy": 2.6230225563049316, "epoch": 0.6418575986078886, "grad_norm": 0.02747596800327301, "grad_norm_var": 2.5880529810372193e-06, "learning_rate": 0.0029489550505225628, "loss": 2.5882, "step": 17705 }, { "crossentropy": 2.4071855545043945, "epoch": 0.6418938515081206, "grad_norm": 0.025546550750732422, "grad_norm_var": 2.712592848066195e-06, "learning_rate": 0.0029484251172846856, "loss": 2.3977, "step": 17706 }, { "crossentropy": 2.67537260055542, "epoch": 0.6419301044083526, "grad_norm": 0.027101274579763412, "grad_norm_var": 2.6854383408214664e-06, "learning_rate": 0.002947895211756442, "loss": 2.6455, "step": 17707 }, { "crossentropy": 2.3840088844299316, "epoch": 0.6419663573085846, "grad_norm": 0.026707271113991737, "grad_norm_var": 1.4179362915485295e-06, "learning_rate": 0.002947365333944988, "loss": 2.4502, "step": 17708 }, { "crossentropy": 2.61920428276062, "epoch": 0.6420026102088167, "grad_norm": 0.026575520634651184, "grad_norm_var": 1.4261393519180426e-06, "learning_rate": 0.00294683548385748, "loss": 2.5862, "step": 17709 }, { "crossentropy": 2.539515972137451, "epoch": 0.6420388631090487, "grad_norm": 0.029088616371154785, "grad_norm_var": 1.6721796754609694e-06, "learning_rate": 0.002946305661501074, "loss": 2.491, "step": 17710 }, { "crossentropy": 2.457167387008667, "epoch": 0.6420751160092807, "grad_norm": 0.02677927538752556, "grad_norm_var": 1.6584107347084823e-06, "learning_rate": 0.0029457758668829273, "loss": 2.5176, "step": 17711 }, { "crossentropy": 2.6423048973083496, "epoch": 0.6421113689095128, "grad_norm": 0.02663934789597988, "grad_norm_var": 1.2072460310016357e-06, "learning_rate": 0.002945246100010196, "loss": 2.5367, "step": 17712 }, { "crossentropy": 2.508993625640869, "epoch": 0.6421476218097448, "grad_norm": 0.03214702755212784, "grad_norm_var": 2.9587729699377066e-06, "learning_rate": 0.0029447163608900323, "loss": 2.5738, "step": 17713 }, { "crossentropy": 2.3904452323913574, "epoch": 0.6421838747099768, "grad_norm": 0.025478286668658257, "grad_norm_var": 2.9772489849819952e-06, "learning_rate": 0.0029441866495295943, "loss": 2.4707, "step": 17714 }, { "crossentropy": 2.452512502670288, "epoch": 0.6422201276102089, "grad_norm": 0.025609267875552177, "grad_norm_var": 3.0422172419460367e-06, "learning_rate": 0.0029436569659360345, "loss": 2.5321, "step": 17715 }, { "crossentropy": 2.5041139125823975, "epoch": 0.6422563805104409, "grad_norm": 0.02674206718802452, "grad_norm_var": 2.9943453153242663e-06, "learning_rate": 0.002943127310116509, "loss": 2.5822, "step": 17716 }, { "crossentropy": 2.567310094833374, "epoch": 0.6422926334106729, "grad_norm": 0.026764165610074997, "grad_norm_var": 2.9747291168548725e-06, "learning_rate": 0.00294259768207817, "loss": 2.5005, "step": 17717 }, { "crossentropy": 2.4787521362304688, "epoch": 0.6423288863109049, "grad_norm": 0.0264678243547678, "grad_norm_var": 2.68308422937713e-06, "learning_rate": 0.002942068081828171, "loss": 2.5162, "step": 17718 }, { "crossentropy": 2.4066245555877686, "epoch": 0.6423651392111369, "grad_norm": 0.025844156742095947, "grad_norm_var": 2.650613592040116e-06, "learning_rate": 0.0029415385093736657, "loss": 2.4393, "step": 17719 }, { "crossentropy": 2.6767773628234863, "epoch": 0.6424013921113689, "grad_norm": 0.028313232585787773, "grad_norm_var": 2.750764822850802e-06, "learning_rate": 0.0029410089647218074, "loss": 2.5806, "step": 17720 }, { "crossentropy": 2.590712308883667, "epoch": 0.6424376450116009, "grad_norm": 0.028689973056316376, "grad_norm_var": 2.90697361120523e-06, "learning_rate": 0.0029404794478797487, "loss": 2.6183, "step": 17721 }, { "crossentropy": 2.52760648727417, "epoch": 0.642473897911833, "grad_norm": 0.025831853970885277, "grad_norm_var": 2.850841932993869e-06, "learning_rate": 0.002939949958854639, "loss": 2.568, "step": 17722 }, { "crossentropy": 2.519134044647217, "epoch": 0.642510150812065, "grad_norm": 0.026460275053977966, "grad_norm_var": 2.882711706980257e-06, "learning_rate": 0.002939420497653632, "loss": 2.5179, "step": 17723 }, { "crossentropy": 2.5322563648223877, "epoch": 0.642546403712297, "grad_norm": 0.027534451335668564, "grad_norm_var": 2.878451917882885e-06, "learning_rate": 0.002938891064283877, "loss": 2.6063, "step": 17724 }, { "crossentropy": 2.500291109085083, "epoch": 0.642582656612529, "grad_norm": 0.028118804097175598, "grad_norm_var": 2.901827696055274e-06, "learning_rate": 0.0029383616587525274, "loss": 2.5692, "step": 17725 }, { "crossentropy": 2.5308873653411865, "epoch": 0.642618909512761, "grad_norm": 0.026593981310725212, "grad_norm_var": 2.6897946665459106e-06, "learning_rate": 0.0029378322810667307, "loss": 2.5269, "step": 17726 }, { "crossentropy": 2.5419018268585205, "epoch": 0.642655162412993, "grad_norm": 0.030072057619690895, "grad_norm_var": 3.2152755761668514e-06, "learning_rate": 0.00293730293123364, "loss": 2.4799, "step": 17727 }, { "crossentropy": 2.5137205123901367, "epoch": 0.642691415313225, "grad_norm": 0.027352258563041687, "grad_norm_var": 3.1812318545888872e-06, "learning_rate": 0.0029367736092604027, "loss": 2.5187, "step": 17728 }, { "crossentropy": 2.396756410598755, "epoch": 0.6427276682134571, "grad_norm": 0.027741286903619766, "grad_norm_var": 1.5918718424027482e-06, "learning_rate": 0.0029362443151541694, "loss": 2.4709, "step": 17729 }, { "crossentropy": 2.621072292327881, "epoch": 0.6427639211136891, "grad_norm": 0.028131632134318352, "grad_norm_var": 1.457849920284447e-06, "learning_rate": 0.0029357150489220884, "loss": 2.6123, "step": 17730 }, { "crossentropy": 2.602994203567505, "epoch": 0.6428001740139211, "grad_norm": 0.028258688747882843, "grad_norm_var": 1.3110643680965793e-06, "learning_rate": 0.002935185810571308, "loss": 2.5799, "step": 17731 }, { "crossentropy": 2.547081708908081, "epoch": 0.6428364269141531, "grad_norm": 0.026989294216036797, "grad_norm_var": 1.2921320693248662e-06, "learning_rate": 0.002934656600108977, "loss": 2.5214, "step": 17732 }, { "crossentropy": 2.531851053237915, "epoch": 0.6428726798143851, "grad_norm": 0.026532666757702827, "grad_norm_var": 1.316581295759435e-06, "learning_rate": 0.002934127417542243, "loss": 2.5632, "step": 17733 }, { "crossentropy": 2.5409793853759766, "epoch": 0.6429089327146171, "grad_norm": 0.02654934860765934, "grad_norm_var": 1.3065023057145497e-06, "learning_rate": 0.002933598262878252, "loss": 2.6197, "step": 17734 }, { "crossentropy": 2.4238758087158203, "epoch": 0.6429451856148491, "grad_norm": 0.026760747656226158, "grad_norm_var": 1.1641784836869852e-06, "learning_rate": 0.0029330691361241536, "loss": 2.496, "step": 17735 }, { "crossentropy": 2.540977716445923, "epoch": 0.6429814385150812, "grad_norm": 0.025715850293636322, "grad_norm_var": 1.3026881895175993e-06, "learning_rate": 0.0029325400372870927, "loss": 2.5307, "step": 17736 }, { "crossentropy": 2.511000871658325, "epoch": 0.6430176914153132, "grad_norm": 0.027277713641524315, "grad_norm_var": 1.1718840991769975e-06, "learning_rate": 0.0029320109663742173, "loss": 2.4685, "step": 17737 }, { "crossentropy": 2.5275938510894775, "epoch": 0.6430539443155452, "grad_norm": 0.02671241946518421, "grad_norm_var": 1.054423955019557e-06, "learning_rate": 0.0029314819233926715, "loss": 2.5198, "step": 17738 }, { "crossentropy": 2.4958064556121826, "epoch": 0.6430901972157773, "grad_norm": 0.026673458516597748, "grad_norm_var": 1.0333930518888751e-06, "learning_rate": 0.0029309529083496005, "loss": 2.5317, "step": 17739 }, { "crossentropy": 2.5727121829986572, "epoch": 0.6431264501160093, "grad_norm": 0.026768216863274574, "grad_norm_var": 1.0475057949057541e-06, "learning_rate": 0.002930423921252151, "loss": 2.5493, "step": 17740 }, { "crossentropy": 2.458927631378174, "epoch": 0.6431627030162413, "grad_norm": 0.02632349170744419, "grad_norm_var": 1.0446991645323186e-06, "learning_rate": 0.002929894962107467, "loss": 2.4662, "step": 17741 }, { "crossentropy": 2.5616703033447266, "epoch": 0.6431989559164734, "grad_norm": 0.026365630328655243, "grad_norm_var": 1.0649882337762188e-06, "learning_rate": 0.002929366030922692, "loss": 2.5089, "step": 17742 }, { "crossentropy": 2.480490207672119, "epoch": 0.6432352088167054, "grad_norm": 0.02678670547902584, "grad_norm_var": 4.5478831378290703e-07, "learning_rate": 0.0029288371277049717, "loss": 2.4913, "step": 17743 }, { "crossentropy": 2.5493247509002686, "epoch": 0.6432714617169374, "grad_norm": 0.02753712795674801, "grad_norm_var": 4.672411894431159e-07, "learning_rate": 0.0029283082524614503, "loss": 2.5691, "step": 17744 }, { "crossentropy": 2.464711904525757, "epoch": 0.6433077146171694, "grad_norm": 0.028093084692955017, "grad_norm_var": 5.123146790481641e-07, "learning_rate": 0.002927779405199269, "loss": 2.4768, "step": 17745 }, { "crossentropy": 2.5580742359161377, "epoch": 0.6433439675174014, "grad_norm": 0.028102807700634003, "grad_norm_var": 5.078916048094813e-07, "learning_rate": 0.0029272505859255725, "loss": 2.5126, "step": 17746 }, { "crossentropy": 2.4318017959594727, "epoch": 0.6433802204176334, "grad_norm": 0.027176419273018837, "grad_norm_var": 3.944810710934521e-07, "learning_rate": 0.002926721794647501, "loss": 2.4954, "step": 17747 }, { "crossentropy": 2.4463634490966797, "epoch": 0.6434164733178654, "grad_norm": 0.025887666270136833, "grad_norm_var": 4.5689274934047837e-07, "learning_rate": 0.0029261930313721984, "loss": 2.5087, "step": 17748 }, { "crossentropy": 2.436896800994873, "epoch": 0.6434527262180975, "grad_norm": 0.0268817488104105, "grad_norm_var": 4.5071815198033324e-07, "learning_rate": 0.0029256642961068063, "loss": 2.5233, "step": 17749 }, { "crossentropy": 2.4559614658355713, "epoch": 0.6434889791183295, "grad_norm": 0.025824178010225296, "grad_norm_var": 5.127301428116288e-07, "learning_rate": 0.0029251355888584647, "loss": 2.3917, "step": 17750 }, { "crossentropy": 2.41400408744812, "epoch": 0.6435252320185615, "grad_norm": 0.025790289044380188, "grad_norm_var": 5.773767883770897e-07, "learning_rate": 0.0029246069096343164, "loss": 2.4187, "step": 17751 }, { "crossentropy": 2.615020751953125, "epoch": 0.6435614849187935, "grad_norm": 0.027397604659199715, "grad_norm_var": 5.234198632198446e-07, "learning_rate": 0.0029240782584415014, "loss": 2.568, "step": 17752 }, { "crossentropy": 2.622873067855835, "epoch": 0.6435977378190255, "grad_norm": 0.02654510736465454, "grad_norm_var": 5.151761575575142e-07, "learning_rate": 0.0029235496352871607, "loss": 2.552, "step": 17753 }, { "crossentropy": 2.4320051670074463, "epoch": 0.6436339907192575, "grad_norm": 0.027497783303260803, "grad_norm_var": 5.441232555368029e-07, "learning_rate": 0.002923021040178433, "loss": 2.4435, "step": 17754 }, { "crossentropy": 2.5878000259399414, "epoch": 0.6436702436194895, "grad_norm": 0.02753905951976776, "grad_norm_var": 5.702068708394089e-07, "learning_rate": 0.002922492473122458, "loss": 2.6575, "step": 17755 }, { "crossentropy": 2.5131402015686035, "epoch": 0.6437064965197216, "grad_norm": 0.025856193155050278, "grad_norm_var": 6.391074421938642e-07, "learning_rate": 0.0029219639341263736, "loss": 2.5755, "step": 17756 }, { "crossentropy": 2.4354710578918457, "epoch": 0.6437427494199536, "grad_norm": 0.024941353127360344, "grad_norm_var": 8.555856955661748e-07, "learning_rate": 0.0029214354231973205, "loss": 2.4521, "step": 17757 }, { "crossentropy": 2.4845387935638428, "epoch": 0.6437790023201856, "grad_norm": 0.02531193569302559, "grad_norm_var": 9.809348180982256e-07, "learning_rate": 0.002920906940342436, "loss": 2.503, "step": 17758 }, { "crossentropy": 2.584097146987915, "epoch": 0.6438152552204176, "grad_norm": 0.026045184582471848, "grad_norm_var": 1.006536958610138e-06, "learning_rate": 0.0029203784855688585, "loss": 2.6093, "step": 17759 }, { "crossentropy": 2.505311965942383, "epoch": 0.6438515081206496, "grad_norm": 0.027124376967549324, "grad_norm_var": 9.684576814762587e-07, "learning_rate": 0.0029198500588837263, "loss": 2.5707, "step": 17760 }, { "crossentropy": 2.7936055660247803, "epoch": 0.6438877610208816, "grad_norm": 0.026756351813673973, "grad_norm_var": 8.18642605279043e-07, "learning_rate": 0.002919321660294175, "loss": 2.6911, "step": 17761 }, { "crossentropy": 2.486828327178955, "epoch": 0.6439240139211136, "grad_norm": 0.026978885754942894, "grad_norm_var": 6.637525867834541e-07, "learning_rate": 0.002918793289807342, "loss": 2.3793, "step": 17762 }, { "crossentropy": 2.39908766746521, "epoch": 0.6439602668213457, "grad_norm": 0.027019649744033813, "grad_norm_var": 6.505672248511742e-07, "learning_rate": 0.0029182649474303658, "loss": 2.453, "step": 17763 }, { "crossentropy": 2.619117021560669, "epoch": 0.6439965197215777, "grad_norm": 0.027039695531129837, "grad_norm_var": 6.452440044550529e-07, "learning_rate": 0.0029177366331703805, "loss": 2.6347, "step": 17764 }, { "crossentropy": 2.5126805305480957, "epoch": 0.6440327726218097, "grad_norm": 0.025045879185199738, "grad_norm_var": 7.708547902161942e-07, "learning_rate": 0.0029172083470345197, "loss": 2.4426, "step": 17765 }, { "crossentropy": 2.5451443195343018, "epoch": 0.6440690255220418, "grad_norm": 0.027544081211090088, "grad_norm_var": 8.191925939171242e-07, "learning_rate": 0.002916680089029923, "loss": 2.4999, "step": 17766 }, { "crossentropy": 2.4163448810577393, "epoch": 0.6441052784222738, "grad_norm": 0.02671520784497261, "grad_norm_var": 7.81795699342326e-07, "learning_rate": 0.002916151859163721, "loss": 2.5155, "step": 17767 }, { "crossentropy": 2.4040932655334473, "epoch": 0.6441415313225058, "grad_norm": 0.02603205479681492, "grad_norm_var": 7.503686944507059e-07, "learning_rate": 0.0029156236574430505, "loss": 2.3955, "step": 17768 }, { "crossentropy": 2.5394341945648193, "epoch": 0.6441777842227379, "grad_norm": 0.027195986360311508, "grad_norm_var": 7.808000575676288e-07, "learning_rate": 0.002915095483875048, "loss": 2.5671, "step": 17769 }, { "crossentropy": 2.5885090827941895, "epoch": 0.6442140371229699, "grad_norm": 0.02581639215350151, "grad_norm_var": 7.428227469559543e-07, "learning_rate": 0.002914567338466843, "loss": 2.5945, "step": 17770 }, { "crossentropy": 2.582993507385254, "epoch": 0.6442502900232019, "grad_norm": 0.0271000899374485, "grad_norm_var": 6.902547018185993e-07, "learning_rate": 0.002914039221225573, "loss": 2.6211, "step": 17771 }, { "crossentropy": 2.557888984680176, "epoch": 0.6442865429234339, "grad_norm": 0.02616940625011921, "grad_norm_var": 6.733539080963982e-07, "learning_rate": 0.0029135111321583686, "loss": 2.5231, "step": 17772 }, { "crossentropy": 2.402709484100342, "epoch": 0.6443227958236659, "grad_norm": 0.026917144656181335, "grad_norm_var": 5.258866824954146e-07, "learning_rate": 0.0029129830712723605, "loss": 2.4656, "step": 17773 }, { "crossentropy": 2.4437689781188965, "epoch": 0.6443590487238979, "grad_norm": 0.027489114552736282, "grad_norm_var": 4.6252152564329036e-07, "learning_rate": 0.0029124550385746855, "loss": 2.5692, "step": 17774 }, { "crossentropy": 2.430778741836548, "epoch": 0.64439530162413, "grad_norm": 0.027268776670098305, "grad_norm_var": 4.514112343852993e-07, "learning_rate": 0.0029119270340724713, "loss": 2.5216, "step": 17775 }, { "crossentropy": 2.599304437637329, "epoch": 0.644431554524362, "grad_norm": 0.026991214603185654, "grad_norm_var": 4.461089084267181e-07, "learning_rate": 0.002911399057772851, "loss": 2.5568, "step": 17776 }, { "crossentropy": 2.662734031677246, "epoch": 0.644467807424594, "grad_norm": 0.027189621701836586, "grad_norm_var": 4.579199254244829e-07, "learning_rate": 0.002910871109682958, "loss": 2.5674, "step": 17777 }, { "crossentropy": 2.6051220893859863, "epoch": 0.644504060324826, "grad_norm": 0.027219198644161224, "grad_norm_var": 4.678354711300121e-07, "learning_rate": 0.0029103431898099188, "loss": 2.615, "step": 17778 }, { "crossentropy": 2.5493526458740234, "epoch": 0.644540313225058, "grad_norm": 0.025727219879627228, "grad_norm_var": 5.338823197790013e-07, "learning_rate": 0.0029098152981608684, "loss": 2.539, "step": 17779 }, { "crossentropy": 2.565551280975342, "epoch": 0.64457656612529, "grad_norm": 0.026773082092404366, "grad_norm_var": 5.268294084565984e-07, "learning_rate": 0.0029092874347429343, "loss": 2.6013, "step": 17780 }, { "crossentropy": 2.4073679447174072, "epoch": 0.644612819025522, "grad_norm": 0.026534611359238625, "grad_norm_var": 3.3707917514626493e-07, "learning_rate": 0.0029087595995632444, "loss": 2.4895, "step": 17781 }, { "crossentropy": 2.43377685546875, "epoch": 0.644649071925754, "grad_norm": 0.02876994013786316, "grad_norm_var": 5.538114278659407e-07, "learning_rate": 0.0029082317926289315, "loss": 2.4686, "step": 17782 }, { "crossentropy": 2.478929281234741, "epoch": 0.6446853248259861, "grad_norm": 0.026459764689207077, "grad_norm_var": 5.631384233636382e-07, "learning_rate": 0.00290770401394712, "loss": 2.5408, "step": 17783 }, { "crossentropy": 2.6246020793914795, "epoch": 0.6447215777262181, "grad_norm": 0.02642354741692543, "grad_norm_var": 5.298467160695045e-07, "learning_rate": 0.0029071762635249415, "loss": 2.4426, "step": 17784 }, { "crossentropy": 2.3431029319763184, "epoch": 0.6447578306264501, "grad_norm": 0.02596922591328621, "grad_norm_var": 5.71863595596198e-07, "learning_rate": 0.0029066485413695256, "loss": 2.4788, "step": 17785 }, { "crossentropy": 2.517651319503784, "epoch": 0.6447940835266821, "grad_norm": 0.03335800766944885, "grad_norm_var": 3.136392476212159e-06, "learning_rate": 0.002906120847487996, "loss": 2.521, "step": 17786 }, { "crossentropy": 2.6703693866729736, "epoch": 0.6448303364269141, "grad_norm": 0.03034915216267109, "grad_norm_var": 3.7214792565042065e-06, "learning_rate": 0.0029055931818874835, "loss": 2.609, "step": 17787 }, { "crossentropy": 2.4671638011932373, "epoch": 0.6448665893271461, "grad_norm": 0.026020169258117676, "grad_norm_var": 3.7488615157842217e-06, "learning_rate": 0.002905065544575114, "loss": 2.4319, "step": 17788 }, { "crossentropy": 2.653956890106201, "epoch": 0.6449028422273781, "grad_norm": 0.027319664135575294, "grad_norm_var": 3.7295185065626866e-06, "learning_rate": 0.002904537935558011, "loss": 2.6527, "step": 17789 }, { "crossentropy": 2.4846608638763428, "epoch": 0.6449390951276102, "grad_norm": 0.027966758236289024, "grad_norm_var": 3.743632278701405e-06, "learning_rate": 0.002904010354843305, "loss": 2.4152, "step": 17790 }, { "crossentropy": 2.513763189315796, "epoch": 0.6449753480278422, "grad_norm": 0.02698521502315998, "grad_norm_var": 3.758203188923429e-06, "learning_rate": 0.0029034828024381188, "loss": 2.556, "step": 17791 }, { "crossentropy": 2.519765853881836, "epoch": 0.6450116009280742, "grad_norm": 0.027975086122751236, "grad_norm_var": 3.75149709707181e-06, "learning_rate": 0.0029029552783495782, "loss": 2.5438, "step": 17792 }, { "crossentropy": 2.622284412384033, "epoch": 0.6450478538283063, "grad_norm": 0.02609528787434101, "grad_norm_var": 3.88111930582229e-06, "learning_rate": 0.002902427782584811, "loss": 2.5936, "step": 17793 }, { "crossentropy": 2.480646848678589, "epoch": 0.6450841067285383, "grad_norm": 0.026514098048210144, "grad_norm_var": 3.938273624789568e-06, "learning_rate": 0.002901900315150938, "loss": 2.5007, "step": 17794 }, { "crossentropy": 2.7104060649871826, "epoch": 0.6451203596287703, "grad_norm": 0.027741309255361557, "grad_norm_var": 3.7284786631097773e-06, "learning_rate": 0.002901372876055087, "loss": 2.7064, "step": 17795 }, { "crossentropy": 2.510005235671997, "epoch": 0.6451566125290024, "grad_norm": 0.026222199201583862, "grad_norm_var": 3.8065994925487803e-06, "learning_rate": 0.0029008454653043805, "loss": 2.5721, "step": 17796 }, { "crossentropy": 2.529071569442749, "epoch": 0.6451928654292344, "grad_norm": 0.029418345540761948, "grad_norm_var": 3.938235988730831e-06, "learning_rate": 0.0029003180829059393, "loss": 2.506, "step": 17797 }, { "crossentropy": 2.609609842300415, "epoch": 0.6452291183294664, "grad_norm": 0.02718985453248024, "grad_norm_var": 3.873970885511048e-06, "learning_rate": 0.0028997907288668906, "loss": 2.6108, "step": 17798 }, { "crossentropy": 2.506025552749634, "epoch": 0.6452653712296984, "grad_norm": 0.02679685689508915, "grad_norm_var": 3.8286790123717255e-06, "learning_rate": 0.0028992634031943534, "loss": 2.4548, "step": 17799 }, { "crossentropy": 2.542956590652466, "epoch": 0.6453016241299304, "grad_norm": 0.02656072936952114, "grad_norm_var": 3.80748536786186e-06, "learning_rate": 0.002898736105895452, "loss": 2.4565, "step": 17800 }, { "crossentropy": 2.7154862880706787, "epoch": 0.6453378770301624, "grad_norm": 0.027235668152570724, "grad_norm_var": 3.6230488693565158e-06, "learning_rate": 0.0028982088369773106, "loss": 2.6677, "step": 17801 }, { "crossentropy": 2.516681432723999, "epoch": 0.6453741299303944, "grad_norm": 0.027750814333558083, "grad_norm_var": 1.3836399946152873e-06, "learning_rate": 0.002897681596447046, "loss": 2.5927, "step": 17802 }, { "crossentropy": 2.445240020751953, "epoch": 0.6454103828306265, "grad_norm": 0.026797616854310036, "grad_norm_var": 7.677827101208618e-07, "learning_rate": 0.002897154384311784, "loss": 2.4571, "step": 17803 }, { "crossentropy": 2.5608513355255127, "epoch": 0.6454466357308585, "grad_norm": 0.027148041874170303, "grad_norm_var": 6.755986727666659e-07, "learning_rate": 0.002896627200578641, "loss": 2.5499, "step": 17804 }, { "crossentropy": 2.5592405796051025, "epoch": 0.6454828886310905, "grad_norm": 0.026713071390986443, "grad_norm_var": 6.915336865614384e-07, "learning_rate": 0.0028961000452547413, "loss": 2.5249, "step": 17805 }, { "crossentropy": 2.5962066650390625, "epoch": 0.6455191415313225, "grad_norm": 0.026753077283501625, "grad_norm_var": 6.586169202247675e-07, "learning_rate": 0.0028955729183472046, "loss": 2.5027, "step": 17806 }, { "crossentropy": 2.4798245429992676, "epoch": 0.6455553944315545, "grad_norm": 0.0270038153976202, "grad_norm_var": 6.583077930725929e-07, "learning_rate": 0.002895045819863146, "loss": 2.4633, "step": 17807 }, { "crossentropy": 2.5997745990753174, "epoch": 0.6455916473317865, "grad_norm": 0.027620231732726097, "grad_norm_var": 6.257082135713849e-07, "learning_rate": 0.0028945187498096883, "loss": 2.6178, "step": 17808 }, { "crossentropy": 2.6464169025421143, "epoch": 0.6456279002320185, "grad_norm": 0.027499713003635406, "grad_norm_var": 5.613010047112246e-07, "learning_rate": 0.0028939917081939525, "loss": 2.6775, "step": 17809 }, { "crossentropy": 2.504132032394409, "epoch": 0.6456641531322506, "grad_norm": 0.025861823931336403, "grad_norm_var": 6.462701990199525e-07, "learning_rate": 0.002893464695023052, "loss": 2.5449, "step": 17810 }, { "crossentropy": 2.4980294704437256, "epoch": 0.6457004060324826, "grad_norm": 0.027173588052392006, "grad_norm_var": 6.212437695333912e-07, "learning_rate": 0.0028929377103041097, "loss": 2.5635, "step": 17811 }, { "crossentropy": 2.692141056060791, "epoch": 0.6457366589327146, "grad_norm": 0.027461405843496323, "grad_norm_var": 5.706819647006515e-07, "learning_rate": 0.0028924107540442396, "loss": 2.651, "step": 17812 }, { "crossentropy": 2.54313325881958, "epoch": 0.6457729118329466, "grad_norm": 0.027194013819098473, "grad_norm_var": 2.1800700507345188e-07, "learning_rate": 0.0028918838262505624, "loss": 2.5282, "step": 17813 }, { "crossentropy": 2.5158982276916504, "epoch": 0.6458091647331786, "grad_norm": 0.026918288320302963, "grad_norm_var": 2.174625069836644e-07, "learning_rate": 0.002891356926930193, "loss": 2.6008, "step": 17814 }, { "crossentropy": 2.643291473388672, "epoch": 0.6458454176334106, "grad_norm": 0.02725585736334324, "grad_norm_var": 2.163282310662599e-07, "learning_rate": 0.0028908300560902466, "loss": 2.5261, "step": 17815 }, { "crossentropy": 2.5188140869140625, "epoch": 0.6458816705336426, "grad_norm": 0.026614364236593246, "grad_norm_var": 2.1294305511977233e-07, "learning_rate": 0.0028903032137378415, "loss": 2.5115, "step": 17816 }, { "crossentropy": 2.401003837585449, "epoch": 0.6459179234338747, "grad_norm": 0.027305932715535164, "grad_norm_var": 2.148731541996244e-07, "learning_rate": 0.0028897763998800945, "loss": 2.4255, "step": 17817 }, { "crossentropy": 2.6582565307617188, "epoch": 0.6459541763341067, "grad_norm": 0.027101751416921616, "grad_norm_var": 1.820229911339566e-07, "learning_rate": 0.0028892496145241178, "loss": 2.5597, "step": 17818 }, { "crossentropy": 2.518310308456421, "epoch": 0.6459904292343387, "grad_norm": 0.02739490009844303, "grad_norm_var": 1.8609895534177356e-07, "learning_rate": 0.0028887228576770303, "loss": 2.5692, "step": 17819 }, { "crossentropy": 2.5243310928344727, "epoch": 0.6460266821345708, "grad_norm": 0.02637360990047455, "grad_norm_var": 2.14878436980738e-07, "learning_rate": 0.0028881961293459424, "loss": 2.6009, "step": 17820 }, { "crossentropy": 2.5252504348754883, "epoch": 0.6460629350348028, "grad_norm": 0.026119926944375038, "grad_norm_var": 2.607724230739572e-07, "learning_rate": 0.0028876694295379706, "loss": 2.598, "step": 17821 }, { "crossentropy": 2.511803150177002, "epoch": 0.6460991879350348, "grad_norm": 0.026249386370182037, "grad_norm_var": 2.9175254289263467e-07, "learning_rate": 0.002887142758260233, "loss": 2.4796, "step": 17822 }, { "crossentropy": 2.5013232231140137, "epoch": 0.6461354408352669, "grad_norm": 0.02645881101489067, "grad_norm_var": 3.0617288495574796e-07, "learning_rate": 0.0028866161155198344, "loss": 2.4588, "step": 17823 }, { "crossentropy": 2.6023049354553223, "epoch": 0.6461716937354989, "grad_norm": 0.027495592832565308, "grad_norm_var": 2.953861035651182e-07, "learning_rate": 0.0028860895013238927, "loss": 2.5445, "step": 17824 }, { "crossentropy": 2.4962098598480225, "epoch": 0.6462079466357309, "grad_norm": 0.027308689430356026, "grad_norm_var": 2.8251785563759846e-07, "learning_rate": 0.0028855629156795217, "loss": 2.6676, "step": 17825 }, { "crossentropy": 2.3449172973632812, "epoch": 0.6462441995359629, "grad_norm": 0.02622794173657894, "grad_norm_var": 2.4055808295567885e-07, "learning_rate": 0.002885036358593831, "loss": 2.4602, "step": 17826 }, { "crossentropy": 2.5277040004730225, "epoch": 0.6462804524361949, "grad_norm": 0.02640879526734352, "grad_norm_var": 2.508356051372871e-07, "learning_rate": 0.0028845098300739353, "loss": 2.5376, "step": 17827 }, { "crossentropy": 2.546260356903076, "epoch": 0.6463167053364269, "grad_norm": 0.026504158973693848, "grad_norm_var": 2.3237769449361601e-07, "learning_rate": 0.002883983330126943, "loss": 2.4635, "step": 17828 }, { "crossentropy": 2.521369218826294, "epoch": 0.646352958236659, "grad_norm": 0.025698913261294365, "grad_norm_var": 2.951850438907688e-07, "learning_rate": 0.0028834568587599667, "loss": 2.4642, "step": 17829 }, { "crossentropy": 2.457848310470581, "epoch": 0.646389211136891, "grad_norm": 0.030555779114365578, "grad_norm_var": 1.2208317003289023e-06, "learning_rate": 0.0028829304159801216, "loss": 2.5473, "step": 17830 }, { "crossentropy": 2.549100637435913, "epoch": 0.646425464037123, "grad_norm": 0.03169873729348183, "grad_norm_var": 2.640365404431632e-06, "learning_rate": 0.0028824040017945106, "loss": 2.5233, "step": 17831 }, { "crossentropy": 2.4899628162384033, "epoch": 0.646461716937355, "grad_norm": 0.025928592309355736, "grad_norm_var": 2.7251196841867587e-06, "learning_rate": 0.0028818776162102466, "loss": 2.5453, "step": 17832 }, { "crossentropy": 2.601043701171875, "epoch": 0.646497969837587, "grad_norm": 0.028445569798350334, "grad_norm_var": 2.825889081647011e-06, "learning_rate": 0.0028813512592344415, "loss": 2.5915, "step": 17833 }, { "crossentropy": 2.626100778579712, "epoch": 0.646534222737819, "grad_norm": 0.026920562610030174, "grad_norm_var": 2.831478830541883e-06, "learning_rate": 0.0028808249308742012, "loss": 2.5327, "step": 17834 }, { "crossentropy": 2.610480308532715, "epoch": 0.646570475638051, "grad_norm": 0.042020246386528015, "grad_norm_var": 1.6508436224059914e-05, "learning_rate": 0.0028802986311366385, "loss": 2.6509, "step": 17835 }, { "crossentropy": 2.5842320919036865, "epoch": 0.646606728538283, "grad_norm": 0.02574915811419487, "grad_norm_var": 1.66807898254378e-05, "learning_rate": 0.002879772360028857, "loss": 2.563, "step": 17836 }, { "crossentropy": 2.505972385406494, "epoch": 0.6466429814385151, "grad_norm": 0.02795032039284706, "grad_norm_var": 1.6404033123636528e-05, "learning_rate": 0.0028792461175579675, "loss": 2.4236, "step": 17837 }, { "crossentropy": 2.553227663040161, "epoch": 0.6466792343387471, "grad_norm": 0.0276552252471447, "grad_norm_var": 1.6156988790220677e-05, "learning_rate": 0.0028787199037310814, "loss": 2.5036, "step": 17838 }, { "crossentropy": 2.4114716053009033, "epoch": 0.6467154872389791, "grad_norm": 0.02695014514029026, "grad_norm_var": 1.6050528511641488e-05, "learning_rate": 0.002878193718555298, "loss": 2.4227, "step": 17839 }, { "crossentropy": 2.4697251319885254, "epoch": 0.6467517401392111, "grad_norm": 0.025565216317772865, "grad_norm_var": 1.650202373495815e-05, "learning_rate": 0.002877667562037729, "loss": 2.4448, "step": 17840 }, { "crossentropy": 2.370680809020996, "epoch": 0.6467879930394431, "grad_norm": 0.02656595967710018, "grad_norm_var": 1.6627170564417166e-05, "learning_rate": 0.002877141434185482, "loss": 2.4512, "step": 17841 }, { "crossentropy": 2.5114529132843018, "epoch": 0.6468242459396751, "grad_norm": 0.02709077298641205, "grad_norm_var": 1.644937685505258e-05, "learning_rate": 0.002876615335005659, "loss": 2.4903, "step": 17842 }, { "crossentropy": 2.345510244369507, "epoch": 0.6468604988399071, "grad_norm": 0.025901950895786285, "grad_norm_var": 1.6588627114656693e-05, "learning_rate": 0.002876089264505371, "loss": 2.4653, "step": 17843 }, { "crossentropy": 2.4551358222961426, "epoch": 0.6468967517401392, "grad_norm": 0.032007940113544464, "grad_norm_var": 1.7237320812629684e-05, "learning_rate": 0.002875563222691718, "loss": 2.5114, "step": 17844 }, { "crossentropy": 2.492792844772339, "epoch": 0.6469330046403712, "grad_norm": 0.027899224311113358, "grad_norm_var": 1.6705209605630948e-05, "learning_rate": 0.002875037209571808, "loss": 2.5192, "step": 17845 }, { "crossentropy": 2.5322647094726562, "epoch": 0.6469692575406032, "grad_norm": 0.027531012892723083, "grad_norm_var": 1.65211697602236e-05, "learning_rate": 0.0028745112251527473, "loss": 2.552, "step": 17846 }, { "crossentropy": 2.381011486053467, "epoch": 0.6470055104408353, "grad_norm": 0.027247777208685875, "grad_norm_var": 1.585660575353583e-05, "learning_rate": 0.0028739852694416377, "loss": 2.4924, "step": 17847 }, { "crossentropy": 2.532438278198242, "epoch": 0.6470417633410673, "grad_norm": 0.031689323484897614, "grad_norm_var": 1.617504391649884e-05, "learning_rate": 0.0028734593424455824, "loss": 2.4906, "step": 17848 }, { "crossentropy": 2.527740716934204, "epoch": 0.6470780162412993, "grad_norm": 0.027599673718214035, "grad_norm_var": 1.623429547864135e-05, "learning_rate": 0.002872933444171687, "loss": 2.5399, "step": 17849 }, { "crossentropy": 2.5255465507507324, "epoch": 0.6471142691415314, "grad_norm": 0.026403037831187248, "grad_norm_var": 1.636150713865644e-05, "learning_rate": 0.0028724075746270512, "loss": 2.5254, "step": 17850 }, { "crossentropy": 2.4645111560821533, "epoch": 0.6471505220417634, "grad_norm": 0.027810240164399147, "grad_norm_var": 3.3449151685159433e-06, "learning_rate": 0.002871881733818782, "loss": 2.4936, "step": 17851 }, { "crossentropy": 2.676162004470825, "epoch": 0.6471867749419954, "grad_norm": 0.028120867908000946, "grad_norm_var": 3.1108545834465188e-06, "learning_rate": 0.0028713559217539773, "loss": 2.5897, "step": 17852 }, { "crossentropy": 2.613593339920044, "epoch": 0.6472230278422274, "grad_norm": 0.029287029057741165, "grad_norm_var": 3.25835763302663e-06, "learning_rate": 0.002870830138439742, "loss": 2.5734, "step": 17853 }, { "crossentropy": 2.5382308959960938, "epoch": 0.6472592807424594, "grad_norm": 0.02793155424296856, "grad_norm_var": 3.256586075095592e-06, "learning_rate": 0.0028703043838831785, "loss": 2.5509, "step": 17854 }, { "crossentropy": 2.6408252716064453, "epoch": 0.6472955336426914, "grad_norm": 0.02750074490904808, "grad_norm_var": 3.2094643454230164e-06, "learning_rate": 0.002869778658091386, "loss": 2.6261, "step": 17855 }, { "crossentropy": 2.431795120239258, "epoch": 0.6473317865429234, "grad_norm": 0.02671753242611885, "grad_norm_var": 2.9361110172567422e-06, "learning_rate": 0.0028692529610714634, "loss": 2.477, "step": 17856 }, { "crossentropy": 2.6866707801818848, "epoch": 0.6473680394431555, "grad_norm": 0.028294017538428307, "grad_norm_var": 2.802347054683601e-06, "learning_rate": 0.002868727292830515, "loss": 2.5986, "step": 17857 }, { "crossentropy": 2.5092859268188477, "epoch": 0.6474042923433875, "grad_norm": 0.027954986318945885, "grad_norm_var": 2.7368200094495787e-06, "learning_rate": 0.0028682016533756374, "loss": 2.5116, "step": 17858 }, { "crossentropy": 2.6015260219573975, "epoch": 0.6474405452436195, "grad_norm": 0.026123156771063805, "grad_norm_var": 2.674501420895841e-06, "learning_rate": 0.0028676760427139326, "loss": 2.6121, "step": 17859 }, { "crossentropy": 2.439863443374634, "epoch": 0.6474767981438515, "grad_norm": 0.026667866855859756, "grad_norm_var": 1.697340395561186e-06, "learning_rate": 0.002867150460852498, "loss": 2.4257, "step": 17860 }, { "crossentropy": 2.6223483085632324, "epoch": 0.6475130510440835, "grad_norm": 0.02569492720067501, "grad_norm_var": 1.9714573239207794e-06, "learning_rate": 0.002866624907798432, "loss": 2.5993, "step": 17861 }, { "crossentropy": 2.4432005882263184, "epoch": 0.6475493039443155, "grad_norm": 0.026704465970396996, "grad_norm_var": 2.0284659469205995e-06, "learning_rate": 0.0028660993835588366, "loss": 2.5456, "step": 17862 }, { "crossentropy": 2.6748812198638916, "epoch": 0.6475855568445475, "grad_norm": 0.027069661766290665, "grad_norm_var": 2.0390320994470927e-06, "learning_rate": 0.0028655738881408054, "loss": 2.6763, "step": 17863 }, { "crossentropy": 2.4758002758026123, "epoch": 0.6476218097447796, "grad_norm": 0.028230246156454086, "grad_norm_var": 8.999288319322396e-07, "learning_rate": 0.00286504842155144, "loss": 2.5347, "step": 17864 }, { "crossentropy": 2.4946188926696777, "epoch": 0.6476580626450116, "grad_norm": 0.02515588514506817, "grad_norm_var": 1.2022182212532377e-06, "learning_rate": 0.0028645229837978353, "loss": 2.4162, "step": 17865 }, { "crossentropy": 2.390531063079834, "epoch": 0.6476943155452436, "grad_norm": 0.02849176898598671, "grad_norm_var": 1.2448260513449327e-06, "learning_rate": 0.002863997574887087, "loss": 2.4341, "step": 17866 }, { "crossentropy": 2.5067508220672607, "epoch": 0.6477305684454756, "grad_norm": 0.027414074167609215, "grad_norm_var": 1.2308359567988458e-06, "learning_rate": 0.002863472194826294, "loss": 2.4866, "step": 17867 }, { "crossentropy": 2.5666344165802, "epoch": 0.6477668213457076, "grad_norm": 0.028695693239569664, "grad_norm_var": 1.3117248557750615e-06, "learning_rate": 0.00286294684362255, "loss": 2.4789, "step": 17868 }, { "crossentropy": 2.6214396953582764, "epoch": 0.6478030742459396, "grad_norm": 0.02678781747817993, "grad_norm_var": 1.0635788185564861e-06, "learning_rate": 0.0028624215212829517, "loss": 2.6705, "step": 17869 }, { "crossentropy": 2.6285769939422607, "epoch": 0.6478393271461717, "grad_norm": 0.025933939963579178, "grad_norm_var": 1.1220363099990911e-06, "learning_rate": 0.002861896227814597, "loss": 2.6185, "step": 17870 }, { "crossentropy": 2.506120443344116, "epoch": 0.6478755800464037, "grad_norm": 0.02754283882677555, "grad_norm_var": 1.124453496409887e-06, "learning_rate": 0.0028613709632245767, "loss": 2.5354, "step": 17871 }, { "crossentropy": 2.303433895111084, "epoch": 0.6479118329466357, "grad_norm": 0.025818804278969765, "grad_norm_var": 1.2198596387525315e-06, "learning_rate": 0.0028608457275199886, "loss": 2.4224, "step": 17872 }, { "crossentropy": 2.477778434753418, "epoch": 0.6479480858468677, "grad_norm": 0.026489686220884323, "grad_norm_var": 1.1207470418117856e-06, "learning_rate": 0.0028603205207079247, "loss": 2.457, "step": 17873 }, { "crossentropy": 2.576962471008301, "epoch": 0.6479843387470998, "grad_norm": 0.026278415694832802, "grad_norm_var": 1.0658437957227247e-06, "learning_rate": 0.0028597953427954774, "loss": 2.5125, "step": 17874 }, { "crossentropy": 2.48895263671875, "epoch": 0.6480205916473318, "grad_norm": 0.026794612407684326, "grad_norm_var": 1.0317516255519665e-06, "learning_rate": 0.002859270193789745, "loss": 2.5067, "step": 17875 }, { "crossentropy": 2.5994362831115723, "epoch": 0.6480568445475638, "grad_norm": 0.027086669579148293, "grad_norm_var": 1.031947715128068e-06, "learning_rate": 0.0028587450736978137, "loss": 2.5584, "step": 17876 }, { "crossentropy": 2.519606828689575, "epoch": 0.6480930974477959, "grad_norm": 0.02576196752488613, "grad_norm_var": 1.0215744152351625e-06, "learning_rate": 0.002858219982526781, "loss": 2.4697, "step": 17877 }, { "crossentropy": 2.4755189418792725, "epoch": 0.6481293503480279, "grad_norm": 0.026920830830931664, "grad_norm_var": 1.019118033220703e-06, "learning_rate": 0.0028576949202837386, "loss": 2.4421, "step": 17878 }, { "crossentropy": 2.5644168853759766, "epoch": 0.6481656032482599, "grad_norm": 0.02668825164437294, "grad_norm_var": 1.0198137885990132e-06, "learning_rate": 0.0028571698869757755, "loss": 2.5074, "step": 17879 }, { "crossentropy": 2.4806087017059326, "epoch": 0.6482018561484919, "grad_norm": 0.02833724208176136, "grad_norm_var": 1.039781819166021e-06, "learning_rate": 0.002856644882609988, "loss": 2.4574, "step": 17880 }, { "crossentropy": 2.552626132965088, "epoch": 0.6482381090487239, "grad_norm": 0.027271904051303864, "grad_norm_var": 8.311036679718575e-07, "learning_rate": 0.002856119907193463, "loss": 2.5734, "step": 17881 }, { "crossentropy": 2.5030195713043213, "epoch": 0.6482743619489559, "grad_norm": 0.024980578571558, "grad_norm_var": 9.12450380593357e-07, "learning_rate": 0.002855594960733291, "loss": 2.4477, "step": 17882 }, { "crossentropy": 2.58984112739563, "epoch": 0.648310614849188, "grad_norm": 0.026674456894397736, "grad_norm_var": 8.861031758570943e-07, "learning_rate": 0.0028550700432365644, "loss": 2.5514, "step": 17883 }, { "crossentropy": 2.499276638031006, "epoch": 0.64834686774942, "grad_norm": 0.0279029980301857, "grad_norm_var": 7.201513208687869e-07, "learning_rate": 0.00285454515471037, "loss": 2.6071, "step": 17884 }, { "crossentropy": 2.5150253772735596, "epoch": 0.648383120649652, "grad_norm": 0.027581127360463142, "grad_norm_var": 7.68304501463692e-07, "learning_rate": 0.0028540202951618, "loss": 2.5537, "step": 17885 }, { "crossentropy": 2.2515528202056885, "epoch": 0.648419373549884, "grad_norm": 0.025703558698296547, "grad_norm_var": 7.968125397215016e-07, "learning_rate": 0.002853495464597944, "loss": 2.4181, "step": 17886 }, { "crossentropy": 2.4637932777404785, "epoch": 0.648455626450116, "grad_norm": 0.02690829522907734, "grad_norm_var": 7.540210264459958e-07, "learning_rate": 0.0028529706630258866, "loss": 2.5667, "step": 17887 }, { "crossentropy": 2.495819568634033, "epoch": 0.648491879350348, "grad_norm": 0.02618388831615448, "grad_norm_var": 7.194585195615642e-07, "learning_rate": 0.002852445890452722, "loss": 2.5541, "step": 17888 }, { "crossentropy": 2.7113330364227295, "epoch": 0.64852813225058, "grad_norm": 0.02680163085460663, "grad_norm_var": 7.158453718431046e-07, "learning_rate": 0.002851921146885533, "loss": 2.6457, "step": 17889 }, { "crossentropy": 2.49312686920166, "epoch": 0.648564385150812, "grad_norm": 0.026378557085990906, "grad_norm_var": 7.102785818251331e-07, "learning_rate": 0.002851396432331408, "loss": 2.4692, "step": 17890 }, { "crossentropy": 2.53822660446167, "epoch": 0.6486006380510441, "grad_norm": 0.026680225506424904, "grad_norm_var": 7.103936096717704e-07, "learning_rate": 0.0028508717467974364, "loss": 2.5725, "step": 17891 }, { "crossentropy": 2.604703187942505, "epoch": 0.6486368909512761, "grad_norm": 0.02705259434878826, "grad_norm_var": 7.088974325321236e-07, "learning_rate": 0.002850347090290701, "loss": 2.5416, "step": 17892 }, { "crossentropy": 2.5202064514160156, "epoch": 0.6486731438515081, "grad_norm": 0.027849651873111725, "grad_norm_var": 7.092628813409971e-07, "learning_rate": 0.0028498224628182908, "loss": 2.561, "step": 17893 }, { "crossentropy": 2.456998348236084, "epoch": 0.6487093967517401, "grad_norm": 0.026922132819890976, "grad_norm_var": 7.092718571100401e-07, "learning_rate": 0.002849297864387293, "loss": 2.5521, "step": 17894 }, { "crossentropy": 2.489877462387085, "epoch": 0.6487456496519721, "grad_norm": 0.027698246762156487, "grad_norm_var": 7.485766278665737e-07, "learning_rate": 0.0028487732950047895, "loss": 2.5413, "step": 17895 }, { "crossentropy": 2.6130762100219727, "epoch": 0.6487819025522041, "grad_norm": 0.02797853574156761, "grad_norm_var": 6.89454387999e-07, "learning_rate": 0.002848248754677869, "loss": 2.5368, "step": 17896 }, { "crossentropy": 2.6013569831848145, "epoch": 0.6488181554524362, "grad_norm": 0.027191678062081337, "grad_norm_var": 6.859910402288397e-07, "learning_rate": 0.0028477242434136146, "loss": 2.5823, "step": 17897 }, { "crossentropy": 2.596458911895752, "epoch": 0.6488544083526682, "grad_norm": 0.02721158042550087, "grad_norm_var": 4.2447327982807145e-07, "learning_rate": 0.0028471997612191085, "loss": 2.5259, "step": 17898 }, { "crossentropy": 2.434235095977783, "epoch": 0.6488906612529002, "grad_norm": 0.027953099459409714, "grad_norm_var": 4.634928855110626e-07, "learning_rate": 0.0028466753081014384, "loss": 2.4183, "step": 17899 }, { "crossentropy": 2.48844838142395, "epoch": 0.6489269141531323, "grad_norm": 0.029446246102452278, "grad_norm_var": 7.724579456737841e-07, "learning_rate": 0.002846150884067685, "loss": 2.5073, "step": 17900 }, { "crossentropy": 2.502957582473755, "epoch": 0.6489631670533643, "grad_norm": 0.027069373056292534, "grad_norm_var": 7.64274858563949e-07, "learning_rate": 0.0028456264891249315, "loss": 2.5502, "step": 17901 }, { "crossentropy": 2.5707828998565674, "epoch": 0.6489994199535963, "grad_norm": 0.02699989080429077, "grad_norm_var": 6.124974396221842e-07, "learning_rate": 0.0028451021232802644, "loss": 2.4712, "step": 17902 }, { "crossentropy": 2.5363450050354004, "epoch": 0.6490356728538283, "grad_norm": 0.027025381103157997, "grad_norm_var": 6.077020327768425e-07, "learning_rate": 0.0028445777865407605, "loss": 2.5529, "step": 17903 }, { "crossentropy": 2.4323599338531494, "epoch": 0.6490719257540604, "grad_norm": 0.026293115690350533, "grad_norm_var": 5.925182499676047e-07, "learning_rate": 0.002844053478913507, "loss": 2.5163, "step": 17904 }, { "crossentropy": 2.5830094814300537, "epoch": 0.6491081786542924, "grad_norm": 0.03101024404168129, "grad_norm_var": 1.4285856644434085e-06, "learning_rate": 0.0028435292004055806, "loss": 2.5608, "step": 17905 }, { "crossentropy": 2.6099040508270264, "epoch": 0.6491444315545244, "grad_norm": 0.02808828465640545, "grad_norm_var": 1.3447992861521786e-06, "learning_rate": 0.0028430049510240673, "loss": 2.5675, "step": 17906 }, { "crossentropy": 2.403714656829834, "epoch": 0.6491806844547564, "grad_norm": 0.026711802929639816, "grad_norm_var": 1.3407600490946322e-06, "learning_rate": 0.0028424807307760457, "loss": 2.4764, "step": 17907 }, { "crossentropy": 2.485931873321533, "epoch": 0.6492169373549884, "grad_norm": 0.025896254926919937, "grad_norm_var": 1.5174187824536772e-06, "learning_rate": 0.0028419565396685938, "loss": 2.5019, "step": 17908 }, { "crossentropy": 2.598076343536377, "epoch": 0.6492531902552204, "grad_norm": 0.02596060000360012, "grad_norm_var": 1.6735643074721728e-06, "learning_rate": 0.0028414323777087936, "loss": 2.5297, "step": 17909 }, { "crossentropy": 2.5098042488098145, "epoch": 0.6492894431554525, "grad_norm": 0.02658945880830288, "grad_norm_var": 1.7046066643970018e-06, "learning_rate": 0.002840908244903727, "loss": 2.4958, "step": 17910 }, { "crossentropy": 2.3450090885162354, "epoch": 0.6493256960556845, "grad_norm": 0.02739318087697029, "grad_norm_var": 1.7001319542260412e-06, "learning_rate": 0.002840384141260469, "loss": 2.5264, "step": 17911 }, { "crossentropy": 2.5945417881011963, "epoch": 0.6493619489559165, "grad_norm": 0.027207380160689354, "grad_norm_var": 1.6805048987563227e-06, "learning_rate": 0.002839860066786103, "loss": 2.6432, "step": 17912 }, { "crossentropy": 2.546830177307129, "epoch": 0.6493982018561485, "grad_norm": 0.027726242318749428, "grad_norm_var": 1.6850866036093356e-06, "learning_rate": 0.002839336021487702, "loss": 2.5337, "step": 17913 }, { "crossentropy": 2.449575185775757, "epoch": 0.6494344547563805, "grad_norm": 0.027011651545763016, "grad_norm_var": 1.6929110121736835e-06, "learning_rate": 0.0028388120053723497, "loss": 2.4806, "step": 17914 }, { "crossentropy": 2.585810422897339, "epoch": 0.6494707076566125, "grad_norm": 0.029187403619289398, "grad_norm_var": 1.8793389348439505e-06, "learning_rate": 0.0028382880184471206, "loss": 2.5283, "step": 17915 }, { "crossentropy": 2.4062914848327637, "epoch": 0.6495069605568445, "grad_norm": 0.02858271263539791, "grad_norm_var": 1.6990984445045306e-06, "learning_rate": 0.0028377640607190903, "loss": 2.5097, "step": 17916 }, { "crossentropy": 2.4677257537841797, "epoch": 0.6495432134570766, "grad_norm": 0.027105920016765594, "grad_norm_var": 1.6974633011875018e-06, "learning_rate": 0.002837240132195338, "loss": 2.5002, "step": 17917 }, { "crossentropy": 2.5091888904571533, "epoch": 0.6495794663573086, "grad_norm": 0.031176572665572166, "grad_norm_var": 2.5513804324546792e-06, "learning_rate": 0.002836716232882942, "loss": 2.5884, "step": 17918 }, { "crossentropy": 2.503682851791382, "epoch": 0.6496157192575406, "grad_norm": 0.02614053525030613, "grad_norm_var": 2.6781821778951483e-06, "learning_rate": 0.0028361923627889733, "loss": 2.536, "step": 17919 }, { "crossentropy": 2.7047152519226074, "epoch": 0.6496519721577726, "grad_norm": 0.02842407487332821, "grad_norm_var": 2.5821235948975437e-06, "learning_rate": 0.0028356685219205137, "loss": 2.6042, "step": 17920 }, { "crossentropy": 2.4524455070495605, "epoch": 0.6496882250580046, "grad_norm": 0.028653115034103394, "grad_norm_var": 1.9089055940559823e-06, "learning_rate": 0.002835144710284632, "loss": 2.5566, "step": 17921 }, { "crossentropy": 2.7693779468536377, "epoch": 0.6497244779582366, "grad_norm": 0.027326107025146484, "grad_norm_var": 1.897212315990493e-06, "learning_rate": 0.002834620927888409, "loss": 2.7265, "step": 17922 }, { "crossentropy": 2.6111204624176025, "epoch": 0.6497607308584686, "grad_norm": 0.025570515543222427, "grad_norm_var": 2.1089574687029043e-06, "learning_rate": 0.002834097174738916, "loss": 2.5394, "step": 17923 }, { "crossentropy": 2.4635257720947266, "epoch": 0.6497969837587007, "grad_norm": 0.026753218844532967, "grad_norm_var": 1.9719545251388097e-06, "learning_rate": 0.0028335734508432254, "loss": 2.5107, "step": 17924 }, { "crossentropy": 2.434267997741699, "epoch": 0.6498332366589327, "grad_norm": 0.026621762663125992, "grad_norm_var": 1.8591140636384576e-06, "learning_rate": 0.0028330497562084134, "loss": 2.4538, "step": 17925 }, { "crossentropy": 2.453010082244873, "epoch": 0.6498694895591647, "grad_norm": 0.02669769711792469, "grad_norm_var": 1.8453797734185873e-06, "learning_rate": 0.002832526090841555, "loss": 2.5084, "step": 17926 }, { "crossentropy": 2.6158156394958496, "epoch": 0.6499057424593968, "grad_norm": 0.026565788313746452, "grad_norm_var": 1.9108309405744255e-06, "learning_rate": 0.0028320024547497193, "loss": 2.5619, "step": 17927 }, { "crossentropy": 2.4411256313323975, "epoch": 0.6499419953596288, "grad_norm": 0.02564086765050888, "grad_norm_var": 2.1351223596023535e-06, "learning_rate": 0.002831478847939982, "loss": 2.5242, "step": 17928 }, { "crossentropy": 2.571702480316162, "epoch": 0.6499782482598608, "grad_norm": 0.028060005977749825, "grad_norm_var": 2.154422019075691e-06, "learning_rate": 0.0028309552704194117, "loss": 2.4479, "step": 17929 }, { "crossentropy": 2.678626537322998, "epoch": 0.6500145011600929, "grad_norm": 0.027131980285048485, "grad_norm_var": 2.147975348208578e-06, "learning_rate": 0.002830431722195082, "loss": 2.653, "step": 17930 }, { "crossentropy": 2.495262622833252, "epoch": 0.6500507540603249, "grad_norm": 0.026173697784543037, "grad_norm_var": 2.028497316110267e-06, "learning_rate": 0.0028299082032740685, "loss": 2.5174, "step": 17931 }, { "crossentropy": 2.510981321334839, "epoch": 0.6500870069605569, "grad_norm": 0.028697190806269646, "grad_norm_var": 2.0490627631118695e-06, "learning_rate": 0.0028293847136634342, "loss": 2.5133, "step": 17932 }, { "crossentropy": 2.42179799079895, "epoch": 0.6501232598607889, "grad_norm": 0.026230759918689728, "grad_norm_var": 2.119134136082529e-06, "learning_rate": 0.002828861253370253, "loss": 2.4992, "step": 17933 }, { "crossentropy": 2.5444836616516113, "epoch": 0.6501595127610209, "grad_norm": 0.027316566556692123, "grad_norm_var": 1.0251046002956402e-06, "learning_rate": 0.0028283378224015976, "loss": 2.6068, "step": 17934 }, { "crossentropy": 2.432831287384033, "epoch": 0.6501957656612529, "grad_norm": 0.026060545817017555, "grad_norm_var": 1.0346734964914212e-06, "learning_rate": 0.0028278144207645336, "loss": 2.4239, "step": 17935 }, { "crossentropy": 2.610753297805786, "epoch": 0.6502320185614849, "grad_norm": 0.02628212980926037, "grad_norm_var": 9.133552313315598e-07, "learning_rate": 0.002827291048466134, "loss": 2.6242, "step": 17936 }, { "crossentropy": 2.5807337760925293, "epoch": 0.650268271461717, "grad_norm": 0.028393402695655823, "grad_norm_var": 8.555258448916427e-07, "learning_rate": 0.002826767705513464, "loss": 2.5514, "step": 17937 }, { "crossentropy": 2.405599594116211, "epoch": 0.650304524361949, "grad_norm": 0.026545453816652298, "grad_norm_var": 8.435520784480135e-07, "learning_rate": 0.002826244391913594, "loss": 2.4339, "step": 17938 }, { "crossentropy": 2.4672911167144775, "epoch": 0.650340777262181, "grad_norm": 0.025898322463035583, "grad_norm_var": 7.96689940291935e-07, "learning_rate": 0.002825721107673597, "loss": 2.4579, "step": 17939 }, { "crossentropy": 2.5283262729644775, "epoch": 0.650377030162413, "grad_norm": 0.026831207796931267, "grad_norm_var": 7.96408548440564e-07, "learning_rate": 0.0028251978528005314, "loss": 2.4234, "step": 17940 }, { "crossentropy": 2.616509437561035, "epoch": 0.650413283062645, "grad_norm": 0.028388841077685356, "grad_norm_var": 9.444589613183743e-07, "learning_rate": 0.002824674627301469, "loss": 2.5769, "step": 17941 }, { "crossentropy": 2.413882255554199, "epoch": 0.650449535962877, "grad_norm": 0.0273906160145998, "grad_norm_var": 9.528062534106332e-07, "learning_rate": 0.002824151431183479, "loss": 2.3798, "step": 17942 }, { "crossentropy": 2.5550034046173096, "epoch": 0.650485788863109, "grad_norm": 0.02703743986785412, "grad_norm_var": 9.409466650566722e-07, "learning_rate": 0.0028236282644536247, "loss": 2.4758, "step": 17943 }, { "crossentropy": 2.3974201679229736, "epoch": 0.650522041763341, "grad_norm": 0.02606423757970333, "grad_norm_var": 8.751483753902661e-07, "learning_rate": 0.0028231051271189744, "loss": 2.4963, "step": 17944 }, { "crossentropy": 2.3570690155029297, "epoch": 0.6505582946635731, "grad_norm": 0.02644490636885166, "grad_norm_var": 8.16675708753993e-07, "learning_rate": 0.0028225820191865912, "loss": 2.3433, "step": 17945 }, { "crossentropy": 2.4883639812469482, "epoch": 0.6505945475638051, "grad_norm": 0.027731142938137054, "grad_norm_var": 8.552123816557913e-07, "learning_rate": 0.002822058940663543, "loss": 2.5171, "step": 17946 }, { "crossentropy": 2.617892265319824, "epoch": 0.6506308004640371, "grad_norm": 0.02769952453672886, "grad_norm_var": 8.391454753221723e-07, "learning_rate": 0.002821535891556895, "loss": 2.6022, "step": 17947 }, { "crossentropy": 2.689892292022705, "epoch": 0.6506670533642691, "grad_norm": 0.027117125689983368, "grad_norm_var": 6.509560797078219e-07, "learning_rate": 0.0028210128718737116, "loss": 2.6045, "step": 17948 }, { "crossentropy": 2.4458279609680176, "epoch": 0.6507033062645011, "grad_norm": 0.026519475504755974, "grad_norm_var": 6.279197100606329e-07, "learning_rate": 0.0028204898816210533, "loss": 2.5064, "step": 17949 }, { "crossentropy": 2.5588676929473877, "epoch": 0.6507395591647331, "grad_norm": 0.027387036010622978, "grad_norm_var": 6.313683954507158e-07, "learning_rate": 0.0028199669208059885, "loss": 2.489, "step": 17950 }, { "crossentropy": 2.550919532775879, "epoch": 0.6507758120649652, "grad_norm": 0.026321040466427803, "grad_norm_var": 6.034325909709624e-07, "learning_rate": 0.0028194439894355773, "loss": 2.5654, "step": 17951 }, { "crossentropy": 2.388935089111328, "epoch": 0.6508120649651972, "grad_norm": 0.026105379685759544, "grad_norm_var": 6.223793964842446e-07, "learning_rate": 0.002818921087516886, "loss": 2.4641, "step": 17952 }, { "crossentropy": 2.6111528873443604, "epoch": 0.6508483178654292, "grad_norm": 0.02756704017519951, "grad_norm_var": 5.106719057585413e-07, "learning_rate": 0.0028183982150569733, "loss": 2.5295, "step": 17953 }, { "crossentropy": 2.7797064781188965, "epoch": 0.6508845707656613, "grad_norm": 0.027070648968219757, "grad_norm_var": 5.002443079380232e-07, "learning_rate": 0.0028178753720629036, "loss": 2.7344, "step": 17954 }, { "crossentropy": 2.518296241760254, "epoch": 0.6509208236658933, "grad_norm": 0.028178242966532707, "grad_norm_var": 4.9831735789285e-07, "learning_rate": 0.00281735255854174, "loss": 2.5454, "step": 17955 }, { "crossentropy": 2.44246506690979, "epoch": 0.6509570765661253, "grad_norm": 0.028031840920448303, "grad_norm_var": 5.428425124000124e-07, "learning_rate": 0.0028168297745005434, "loss": 2.4983, "step": 17956 }, { "crossentropy": 2.5873758792877197, "epoch": 0.6509933294663574, "grad_norm": 0.03173393756151199, "grad_norm_var": 1.7764901809873279e-06, "learning_rate": 0.002816307019946371, "loss": 2.5234, "step": 17957 }, { "crossentropy": 2.571009635925293, "epoch": 0.6510295823665894, "grad_norm": 0.026903966441750526, "grad_norm_var": 1.791899334717406e-06, "learning_rate": 0.0028157842948862877, "loss": 2.5464, "step": 17958 }, { "crossentropy": 2.6217052936553955, "epoch": 0.6510658352668214, "grad_norm": 0.02776174433529377, "grad_norm_var": 1.7926135937396462e-06, "learning_rate": 0.002815261599327351, "loss": 2.6259, "step": 17959 }, { "crossentropy": 2.3237709999084473, "epoch": 0.6511020881670534, "grad_norm": 0.026418844237923622, "grad_norm_var": 1.736615472737226e-06, "learning_rate": 0.0028147389332766237, "loss": 2.4024, "step": 17960 }, { "crossentropy": 2.55595064163208, "epoch": 0.6511383410672854, "grad_norm": 0.027806881815195084, "grad_norm_var": 1.6723917582415893e-06, "learning_rate": 0.002814216296741161, "loss": 2.5459, "step": 17961 }, { "crossentropy": 2.6574559211730957, "epoch": 0.6511745939675174, "grad_norm": 0.0265854112803936, "grad_norm_var": 1.722503890528544e-06, "learning_rate": 0.0028136936897280246, "loss": 2.6347, "step": 17962 }, { "crossentropy": 2.36072039604187, "epoch": 0.6512108468677494, "grad_norm": 0.025543341413140297, "grad_norm_var": 1.941484436907105e-06, "learning_rate": 0.002813171112244275, "loss": 2.445, "step": 17963 }, { "crossentropy": 2.310532808303833, "epoch": 0.6512470997679815, "grad_norm": 0.025809625163674355, "grad_norm_var": 2.0829581771697933e-06, "learning_rate": 0.0028126485642969677, "loss": 2.5032, "step": 17964 }, { "crossentropy": 2.4570984840393066, "epoch": 0.6512833526682135, "grad_norm": 0.02627599611878395, "grad_norm_var": 2.1098605072398936e-06, "learning_rate": 0.0028121260458931585, "loss": 2.4909, "step": 17965 }, { "crossentropy": 2.4911794662475586, "epoch": 0.6513196055684455, "grad_norm": 0.02643849328160286, "grad_norm_var": 2.1448180366210115e-06, "learning_rate": 0.00281160355703991, "loss": 2.4904, "step": 17966 }, { "crossentropy": 2.6120729446411133, "epoch": 0.6513558584686775, "grad_norm": 0.026477476581931114, "grad_norm_var": 2.128858272240399e-06, "learning_rate": 0.0028110810977442745, "loss": 2.5221, "step": 17967 }, { "crossentropy": 2.4614202976226807, "epoch": 0.6513921113689095, "grad_norm": 0.026814240962266922, "grad_norm_var": 2.0597068675343715e-06, "learning_rate": 0.002810558668013312, "loss": 2.4406, "step": 17968 }, { "crossentropy": 2.5084786415100098, "epoch": 0.6514283642691415, "grad_norm": 0.026102857664227486, "grad_norm_var": 2.124697759362838e-06, "learning_rate": 0.002810036267854076, "loss": 2.4986, "step": 17969 }, { "crossentropy": 2.5560641288757324, "epoch": 0.6514646171693735, "grad_norm": 0.028913388028740883, "grad_norm_var": 2.3242875409406564e-06, "learning_rate": 0.0028095138972736235, "loss": 2.5886, "step": 17970 }, { "crossentropy": 2.587538242340088, "epoch": 0.6515008700696056, "grad_norm": 0.026332227513194084, "grad_norm_var": 2.3056661247880847e-06, "learning_rate": 0.0028089915562790114, "loss": 2.5572, "step": 17971 }, { "crossentropy": 2.4627788066864014, "epoch": 0.6515371229698376, "grad_norm": 0.027431199327111244, "grad_norm_var": 2.255340520698988e-06, "learning_rate": 0.002808469244877292, "loss": 2.5043, "step": 17972 }, { "crossentropy": 2.440823554992676, "epoch": 0.6515733758700696, "grad_norm": 0.027300579473376274, "grad_norm_var": 7.353199726248579e-07, "learning_rate": 0.002807946963075523, "loss": 2.4417, "step": 17973 }, { "crossentropy": 2.4842143058776855, "epoch": 0.6516096287703016, "grad_norm": 0.02518908679485321, "grad_norm_var": 8.970103714806346e-07, "learning_rate": 0.0028074247108807564, "loss": 2.4699, "step": 17974 }, { "crossentropy": 2.3505682945251465, "epoch": 0.6516458816705336, "grad_norm": 0.026420900598168373, "grad_norm_var": 8.195745523302429e-07, "learning_rate": 0.002806902488300045, "loss": 2.4285, "step": 17975 }, { "crossentropy": 2.639012575149536, "epoch": 0.6516821345707656, "grad_norm": 0.026510799303650856, "grad_norm_var": 8.176822860796069e-07, "learning_rate": 0.0028063802953404433, "loss": 2.5832, "step": 17976 }, { "crossentropy": 2.6077115535736084, "epoch": 0.6517183874709976, "grad_norm": 0.025595417246222496, "grad_norm_var": 7.739760161049365e-07, "learning_rate": 0.0028058581320090066, "loss": 2.5537, "step": 17977 }, { "crossentropy": 2.4617936611175537, "epoch": 0.6517546403712297, "grad_norm": 0.02584296464920044, "grad_norm_var": 7.98370399448138e-07, "learning_rate": 0.0028053359983127835, "loss": 2.497, "step": 17978 }, { "crossentropy": 2.5677614212036133, "epoch": 0.6517908932714617, "grad_norm": 0.02674819342792034, "grad_norm_var": 7.454699520314702e-07, "learning_rate": 0.002804813894258831, "loss": 2.5441, "step": 17979 }, { "crossentropy": 2.5685558319091797, "epoch": 0.6518271461716937, "grad_norm": 0.027295289561152458, "grad_norm_var": 7.441457232076522e-07, "learning_rate": 0.0028042918198541954, "loss": 2.5597, "step": 17980 }, { "crossentropy": 2.438403844833374, "epoch": 0.6518633990719258, "grad_norm": 0.02576182223856449, "grad_norm_var": 7.832635438446515e-07, "learning_rate": 0.0028037697751059332, "loss": 2.4964, "step": 17981 }, { "crossentropy": 2.5481841564178467, "epoch": 0.6518996519721578, "grad_norm": 0.027455389499664307, "grad_norm_var": 8.295973818451981e-07, "learning_rate": 0.0028032477600210925, "loss": 2.4978, "step": 17982 }, { "crossentropy": 2.3581039905548096, "epoch": 0.6519359048723898, "grad_norm": 0.028032662346959114, "grad_norm_var": 9.476837473651929e-07, "learning_rate": 0.0028027257746067237, "loss": 2.4421, "step": 17983 }, { "crossentropy": 2.6382720470428467, "epoch": 0.6519721577726219, "grad_norm": 0.02586931176483631, "grad_norm_var": 9.934036128532528e-07, "learning_rate": 0.002802203818869877, "loss": 2.5112, "step": 17984 }, { "crossentropy": 2.421499729156494, "epoch": 0.6520084106728539, "grad_norm": 0.02596462331712246, "grad_norm_var": 1.0051456120054123e-06, "learning_rate": 0.0028016818928176044, "loss": 2.4339, "step": 17985 }, { "crossentropy": 2.5153586864471436, "epoch": 0.6520446635730859, "grad_norm": 0.026771020144224167, "grad_norm_var": 6.501803507515924e-07, "learning_rate": 0.0028011599964569527, "loss": 2.5136, "step": 17986 }, { "crossentropy": 2.519033670425415, "epoch": 0.6520809164733179, "grad_norm": 0.027289576828479767, "grad_norm_var": 6.818867458382628e-07, "learning_rate": 0.002800638129794974, "loss": 2.5627, "step": 17987 }, { "crossentropy": 2.3902251720428467, "epoch": 0.6521171693735499, "grad_norm": 0.026629004627466202, "grad_norm_var": 6.323920517373183e-07, "learning_rate": 0.002800116292838713, "loss": 2.4723, "step": 17988 }, { "crossentropy": 2.6349010467529297, "epoch": 0.6521534222737819, "grad_norm": 0.028100254014134407, "grad_norm_var": 7.532108051437498e-07, "learning_rate": 0.0027995944855952197, "loss": 2.6533, "step": 17989 }, { "crossentropy": 2.5933432579040527, "epoch": 0.6521896751740139, "grad_norm": 0.02733905054628849, "grad_norm_var": 6.398683082342911e-07, "learning_rate": 0.002799072708071547, "loss": 2.5575, "step": 17990 }, { "crossentropy": 2.524294376373291, "epoch": 0.652225928074246, "grad_norm": 0.026864539831876755, "grad_norm_var": 6.340840796791499e-07, "learning_rate": 0.002798550960274732, "loss": 2.5398, "step": 17991 }, { "crossentropy": 2.432490110397339, "epoch": 0.652262180974478, "grad_norm": 0.02644694223999977, "grad_norm_var": 6.364127650462416e-07, "learning_rate": 0.002798029242211828, "loss": 2.4989, "step": 17992 }, { "crossentropy": 2.345231533050537, "epoch": 0.65229843387471, "grad_norm": 0.02553807944059372, "grad_norm_var": 6.454479703964118e-07, "learning_rate": 0.0027975075538898826, "loss": 2.4371, "step": 17993 }, { "crossentropy": 2.4350674152374268, "epoch": 0.652334686774942, "grad_norm": 0.02706434577703476, "grad_norm_var": 5.914941549305917e-07, "learning_rate": 0.002796985895315938, "loss": 2.4866, "step": 17994 }, { "crossentropy": 2.4575724601745605, "epoch": 0.652370939675174, "grad_norm": 0.026447471231222153, "grad_norm_var": 6.001510128171891e-07, "learning_rate": 0.0027964642664970443, "loss": 2.4605, "step": 17995 }, { "crossentropy": 2.65346097946167, "epoch": 0.652407192575406, "grad_norm": 0.026404229924082756, "grad_norm_var": 5.914460857024076e-07, "learning_rate": 0.002795942667440242, "loss": 2.5552, "step": 17996 }, { "crossentropy": 2.6295418739318848, "epoch": 0.652443445475638, "grad_norm": 0.025929396972060204, "grad_norm_var": 5.711522863068349e-07, "learning_rate": 0.0027954210981525796, "loss": 2.6133, "step": 17997 }, { "crossentropy": 2.411776542663574, "epoch": 0.65247969837587, "grad_norm": 0.02544376626610756, "grad_norm_var": 6.373152577917081e-07, "learning_rate": 0.0027948995586411043, "loss": 2.5354, "step": 17998 }, { "crossentropy": 2.54309344291687, "epoch": 0.6525159512761021, "grad_norm": 0.025483226403594017, "grad_norm_var": 5.678951218796574e-07, "learning_rate": 0.0027943780489128526, "loss": 2.5278, "step": 17999 }, { "crossentropy": 2.5427160263061523, "epoch": 0.6525522041763341, "grad_norm": 0.02598855458199978, "grad_norm_var": 5.59169002114695e-07, "learning_rate": 0.002793856568974873, "loss": 2.5347, "step": 18000 }, { "crossentropy": 2.612816333770752, "epoch": 0.6525884570765661, "grad_norm": 0.026300719007849693, "grad_norm_var": 5.430661215220876e-07, "learning_rate": 0.0027933351188342104, "loss": 2.5412, "step": 18001 }, { "crossentropy": 2.5241518020629883, "epoch": 0.6526247099767981, "grad_norm": 0.026716845110058784, "grad_norm_var": 5.413100234719082e-07, "learning_rate": 0.0027928136984979035, "loss": 2.4881, "step": 18002 }, { "crossentropy": 2.512357473373413, "epoch": 0.6526609628770301, "grad_norm": 0.025973757728934288, "grad_norm_var": 5.108424163682238e-07, "learning_rate": 0.002792292307972999, "loss": 2.544, "step": 18003 }, { "crossentropy": 2.475661516189575, "epoch": 0.6526972157772621, "grad_norm": 0.026277795433998108, "grad_norm_var": 5.086186142360851e-07, "learning_rate": 0.0027917709472665355, "loss": 2.4774, "step": 18004 }, { "crossentropy": 2.550896167755127, "epoch": 0.6527334686774942, "grad_norm": 0.026210669428110123, "grad_norm_var": 3.021310255012117e-07, "learning_rate": 0.002791249616385556, "loss": 2.5452, "step": 18005 }, { "crossentropy": 2.464846134185791, "epoch": 0.6527697215777262, "grad_norm": 0.027123551815748215, "grad_norm_var": 2.745127470967264e-07, "learning_rate": 0.0027907283153371073, "loss": 2.3981, "step": 18006 }, { "crossentropy": 2.540269136428833, "epoch": 0.6528059744779582, "grad_norm": 0.02708313800394535, "grad_norm_var": 2.950213223046685e-07, "learning_rate": 0.002790207044128221, "loss": 2.4544, "step": 18007 }, { "crossentropy": 2.506558418273926, "epoch": 0.6528422273781903, "grad_norm": 0.026953857392072678, "grad_norm_var": 3.225656136524382e-07, "learning_rate": 0.002789685802765941, "loss": 2.5367, "step": 18008 }, { "crossentropy": 2.430694341659546, "epoch": 0.6528784802784223, "grad_norm": 0.027546919882297516, "grad_norm_var": 3.683700189835784e-07, "learning_rate": 0.0027891645912573116, "loss": 2.4944, "step": 18009 }, { "crossentropy": 2.541430711746216, "epoch": 0.6529147331786543, "grad_norm": 0.02661244198679924, "grad_norm_var": 3.4316881687955956e-07, "learning_rate": 0.002788643409609367, "loss": 2.5787, "step": 18010 }, { "crossentropy": 2.5515658855438232, "epoch": 0.6529509860788864, "grad_norm": 0.030243825167417526, "grad_norm_var": 1.2649189037037175e-06, "learning_rate": 0.0027881222578291513, "loss": 2.5589, "step": 18011 }, { "crossentropy": 2.5891833305358887, "epoch": 0.6529872389791184, "grad_norm": 0.026202524080872536, "grad_norm_var": 1.2738911328366922e-06, "learning_rate": 0.0027876011359236996, "loss": 2.5595, "step": 18012 }, { "crossentropy": 2.5771090984344482, "epoch": 0.6530234918793504, "grad_norm": 0.026633743196725845, "grad_norm_var": 1.2390374930275193e-06, "learning_rate": 0.002787080043900051, "loss": 2.4789, "step": 18013 }, { "crossentropy": 2.490102529525757, "epoch": 0.6530597447795824, "grad_norm": 0.02821124531328678, "grad_norm_var": 1.2635070267439282e-06, "learning_rate": 0.002786558981765247, "loss": 2.4881, "step": 18014 }, { "crossentropy": 2.5509607791900635, "epoch": 0.6530959976798144, "grad_norm": 0.02675113081932068, "grad_norm_var": 1.1333153369243033e-06, "learning_rate": 0.0027860379495263233, "loss": 2.5708, "step": 18015 }, { "crossentropy": 2.570347309112549, "epoch": 0.6531322505800464, "grad_norm": 0.026116911321878433, "grad_norm_var": 1.1182856503907935e-06, "learning_rate": 0.0027855169471903145, "loss": 2.4833, "step": 18016 }, { "crossentropy": 2.732865810394287, "epoch": 0.6531685034802784, "grad_norm": 0.02814539521932602, "grad_norm_var": 1.1749710134828824e-06, "learning_rate": 0.002784995974764262, "loss": 2.684, "step": 18017 }, { "crossentropy": 2.5770387649536133, "epoch": 0.6532047563805105, "grad_norm": 0.028689846396446228, "grad_norm_var": 1.330563202173174e-06, "learning_rate": 0.0027844750322551987, "loss": 2.6238, "step": 18018 }, { "crossentropy": 2.59012508392334, "epoch": 0.6532410092807425, "grad_norm": 0.02694793790578842, "grad_norm_var": 1.234035924526491e-06, "learning_rate": 0.0027839541196701635, "loss": 2.5794, "step": 18019 }, { "crossentropy": 2.5677504539489746, "epoch": 0.6532772621809745, "grad_norm": 0.02667403407394886, "grad_norm_var": 1.193307820640937e-06, "learning_rate": 0.0027834332370161897, "loss": 2.5788, "step": 18020 }, { "crossentropy": 2.5037736892700195, "epoch": 0.6533135150812065, "grad_norm": 0.025904279202222824, "grad_norm_var": 1.2420095373203223e-06, "learning_rate": 0.0027829123843003136, "loss": 2.4609, "step": 18021 }, { "crossentropy": 2.4664385318756104, "epoch": 0.6533497679814385, "grad_norm": 0.025767790153622627, "grad_norm_var": 1.3779491063183828e-06, "learning_rate": 0.0027823915615295715, "loss": 2.4364, "step": 18022 }, { "crossentropy": 2.454599618911743, "epoch": 0.6533860208816705, "grad_norm": 0.025706613436341286, "grad_norm_var": 1.5096222421587112e-06, "learning_rate": 0.0027818707687109973, "loss": 2.4453, "step": 18023 }, { "crossentropy": 2.580681085586548, "epoch": 0.6534222737819025, "grad_norm": 0.02679114043712616, "grad_norm_var": 1.5137812257452842e-06, "learning_rate": 0.0027813500058516227, "loss": 2.5482, "step": 18024 }, { "crossentropy": 2.4668757915496826, "epoch": 0.6534585266821346, "grad_norm": 0.028403175994753838, "grad_norm_var": 1.6152965269961365e-06, "learning_rate": 0.002780829272958485, "loss": 2.4848, "step": 18025 }, { "crossentropy": 2.5563225746154785, "epoch": 0.6534947795823666, "grad_norm": 0.025664083659648895, "grad_norm_var": 1.7347553011574923e-06, "learning_rate": 0.002780308570038613, "loss": 2.5428, "step": 18026 }, { "crossentropy": 2.7467877864837646, "epoch": 0.6535310324825986, "grad_norm": 0.026486163958907127, "grad_norm_var": 1.018762166297736e-06, "learning_rate": 0.0027797878970990454, "loss": 2.6467, "step": 18027 }, { "crossentropy": 2.6063992977142334, "epoch": 0.6535672853828306, "grad_norm": 0.025875240564346313, "grad_norm_var": 1.0523366992552795e-06, "learning_rate": 0.002779267254146809, "loss": 2.5532, "step": 18028 }, { "crossentropy": 2.607541084289551, "epoch": 0.6536035382830626, "grad_norm": 0.02740865759551525, "grad_norm_var": 1.072891416593645e-06, "learning_rate": 0.0027787466411889382, "loss": 2.6371, "step": 18029 }, { "crossentropy": 2.4535398483276367, "epoch": 0.6536397911832946, "grad_norm": 0.02624530717730522, "grad_norm_var": 9.567086841202309e-07, "learning_rate": 0.0027782260582324674, "loss": 2.5254, "step": 18030 }, { "crossentropy": 2.4615345001220703, "epoch": 0.6536760440835266, "grad_norm": 0.026623418554663658, "grad_norm_var": 9.572593972769488e-07, "learning_rate": 0.0027777055052844233, "loss": 2.581, "step": 18031 }, { "crossentropy": 2.550091028213501, "epoch": 0.6537122969837587, "grad_norm": 0.028137624263763428, "grad_norm_var": 1.051154056375458e-06, "learning_rate": 0.0027771849823518403, "loss": 2.5766, "step": 18032 }, { "crossentropy": 2.60406756401062, "epoch": 0.6537485498839907, "grad_norm": 0.027062347158789635, "grad_norm_var": 9.362358478356402e-07, "learning_rate": 0.0027766644894417483, "loss": 2.5874, "step": 18033 }, { "crossentropy": 2.5262696743011475, "epoch": 0.6537848027842227, "grad_norm": 0.026697389781475067, "grad_norm_var": 6.754488933252122e-07, "learning_rate": 0.0027761440265611733, "loss": 2.5492, "step": 18034 }, { "crossentropy": 2.551997661590576, "epoch": 0.6538210556844548, "grad_norm": 0.02836420014500618, "grad_norm_var": 8.571290122908378e-07, "learning_rate": 0.00277562359371715, "loss": 2.501, "step": 18035 }, { "crossentropy": 2.6674985885620117, "epoch": 0.6538573085846868, "grad_norm": 0.026423761621117592, "grad_norm_var": 8.631855313403121e-07, "learning_rate": 0.0027751031909167044, "loss": 2.5997, "step": 18036 }, { "crossentropy": 2.478508949279785, "epoch": 0.6538935614849188, "grad_norm": 0.027220850810408592, "grad_norm_var": 8.278746484505304e-07, "learning_rate": 0.0027745828181668666, "loss": 2.5456, "step": 18037 }, { "crossentropy": 2.4493932723999023, "epoch": 0.6539298143851509, "grad_norm": 0.025702670216560364, "grad_norm_var": 8.371442123046579e-07, "learning_rate": 0.002774062475474667, "loss": 2.3723, "step": 18038 }, { "crossentropy": 2.576467275619507, "epoch": 0.6539660672853829, "grad_norm": 0.02624022401869297, "grad_norm_var": 7.770918951393683e-07, "learning_rate": 0.002773542162847129, "loss": 2.4987, "step": 18039 }, { "crossentropy": 2.567019462585449, "epoch": 0.6540023201856149, "grad_norm": 0.02616468258202076, "grad_norm_var": 8.052117239026503e-07, "learning_rate": 0.0027730218802912853, "loss": 2.5539, "step": 18040 }, { "crossentropy": 2.5139667987823486, "epoch": 0.6540385730858469, "grad_norm": 0.025673266500234604, "grad_norm_var": 6.856258408227215e-07, "learning_rate": 0.0027725016278141615, "loss": 2.5544, "step": 18041 }, { "crossentropy": 2.6640126705169678, "epoch": 0.6540748259860789, "grad_norm": 0.02731015533208847, "grad_norm_var": 6.442132493833056e-07, "learning_rate": 0.0027719814054227806, "loss": 2.6307, "step": 18042 }, { "crossentropy": 2.2870075702667236, "epoch": 0.6541110788863109, "grad_norm": 0.026575865224003792, "grad_norm_var": 6.418327441659947e-07, "learning_rate": 0.0027714612131241735, "loss": 2.4258, "step": 18043 }, { "crossentropy": 2.4235641956329346, "epoch": 0.6541473317865429, "grad_norm": 0.026905998587608337, "grad_norm_var": 5.903710560315259e-07, "learning_rate": 0.0027709410509253636, "loss": 2.4231, "step": 18044 }, { "crossentropy": 2.5730080604553223, "epoch": 0.654183584686775, "grad_norm": 0.026512904092669487, "grad_norm_var": 5.674998513564728e-07, "learning_rate": 0.0027704209188333774, "loss": 2.5526, "step": 18045 }, { "crossentropy": 2.5567355155944824, "epoch": 0.654219837587007, "grad_norm": 0.028279446065425873, "grad_norm_var": 6.915872466798737e-07, "learning_rate": 0.002769900816855242, "loss": 2.4989, "step": 18046 }, { "crossentropy": 2.590085983276367, "epoch": 0.654256090487239, "grad_norm": 0.026955537497997284, "grad_norm_var": 6.876316657686606e-07, "learning_rate": 0.002769380744997978, "loss": 2.4906, "step": 18047 }, { "crossentropy": 2.470858335494995, "epoch": 0.654292343387471, "grad_norm": 0.025538207963109016, "grad_norm_var": 6.772460863908754e-07, "learning_rate": 0.002768860703268614, "loss": 2.4529, "step": 18048 }, { "crossentropy": 2.457447052001953, "epoch": 0.654328596287703, "grad_norm": 0.027818985283374786, "grad_norm_var": 7.468872666827603e-07, "learning_rate": 0.0027683406916741734, "loss": 2.567, "step": 18049 }, { "crossentropy": 2.549276351928711, "epoch": 0.654364849187935, "grad_norm": 0.02608659118413925, "grad_norm_var": 7.764443131805793e-07, "learning_rate": 0.0027678207102216756, "loss": 2.618, "step": 18050 }, { "crossentropy": 2.5915305614471436, "epoch": 0.654401102088167, "grad_norm": 0.026010936126112938, "grad_norm_var": 6.116300347161947e-07, "learning_rate": 0.0027673007589181483, "loss": 2.5863, "step": 18051 }, { "crossentropy": 2.3271641731262207, "epoch": 0.6544373549883991, "grad_norm": 0.026459690183401108, "grad_norm_var": 6.109203159836701e-07, "learning_rate": 0.0027667808377706106, "loss": 2.4181, "step": 18052 }, { "crossentropy": 2.503431558609009, "epoch": 0.6544736078886311, "grad_norm": 0.02636306919157505, "grad_norm_var": 5.848706379513036e-07, "learning_rate": 0.0027662609467860866, "loss": 2.5535, "step": 18053 }, { "crossentropy": 2.3404712677001953, "epoch": 0.6545098607888631, "grad_norm": 0.028729304671287537, "grad_norm_var": 8.205509257133558e-07, "learning_rate": 0.002765741085971601, "loss": 2.461, "step": 18054 }, { "crossentropy": 2.453091859817505, "epoch": 0.6545461136890951, "grad_norm": 0.026376208290457726, "grad_norm_var": 8.128888939985899e-07, "learning_rate": 0.0027652212553341705, "loss": 2.5121, "step": 18055 }, { "crossentropy": 2.6656084060668945, "epoch": 0.6545823665893271, "grad_norm": 0.02596251852810383, "grad_norm_var": 8.308177421396569e-07, "learning_rate": 0.0027647014548808197, "loss": 2.5986, "step": 18056 }, { "crossentropy": 2.582409143447876, "epoch": 0.6546186194895591, "grad_norm": 0.02709333784878254, "grad_norm_var": 7.582061142744993e-07, "learning_rate": 0.0027641816846185685, "loss": 2.5617, "step": 18057 }, { "crossentropy": 2.4847307205200195, "epoch": 0.6546548723897911, "grad_norm": 0.02627292089164257, "grad_norm_var": 7.56438745099748e-07, "learning_rate": 0.0027636619445544344, "loss": 2.4997, "step": 18058 }, { "crossentropy": 2.5639538764953613, "epoch": 0.6546911252900232, "grad_norm": 0.02743758261203766, "grad_norm_var": 7.832611519782591e-07, "learning_rate": 0.0027631422346954416, "loss": 2.5369, "step": 18059 }, { "crossentropy": 2.5327816009521484, "epoch": 0.6547273781902552, "grad_norm": 0.027677888050675392, "grad_norm_var": 8.313878769474244e-07, "learning_rate": 0.002762622555048605, "loss": 2.5146, "step": 18060 }, { "crossentropy": 2.409266471862793, "epoch": 0.6547636310904872, "grad_norm": 0.02653050795197487, "grad_norm_var": 8.306196688715254e-07, "learning_rate": 0.002762102905620947, "loss": 2.4865, "step": 18061 }, { "crossentropy": 2.6077282428741455, "epoch": 0.6547998839907193, "grad_norm": 0.026441726833581924, "grad_norm_var": 6.913280479525774e-07, "learning_rate": 0.002761583286419487, "loss": 2.6069, "step": 18062 }, { "crossentropy": 2.421198606491089, "epoch": 0.6548361368909513, "grad_norm": 0.02659919299185276, "grad_norm_var": 6.887712670234584e-07, "learning_rate": 0.0027610636974512394, "loss": 2.4499, "step": 18063 }, { "crossentropy": 2.5458576679229736, "epoch": 0.6548723897911833, "grad_norm": 0.026243962347507477, "grad_norm_var": 6.09408108402885e-07, "learning_rate": 0.0027605441387232262, "loss": 2.5047, "step": 18064 }, { "crossentropy": 2.5037736892700195, "epoch": 0.6549086426914154, "grad_norm": 0.025992397218942642, "grad_norm_var": 5.591779859411816e-07, "learning_rate": 0.002760024610242463, "loss": 2.5654, "step": 18065 }, { "crossentropy": 2.4984352588653564, "epoch": 0.6549448955916474, "grad_norm": 0.028102759271860123, "grad_norm_var": 6.638319735916924e-07, "learning_rate": 0.0027595051120159653, "loss": 2.5013, "step": 18066 }, { "crossentropy": 2.4880735874176025, "epoch": 0.6549811484918794, "grad_norm": 0.02645256370306015, "grad_norm_var": 6.31420858868585e-07, "learning_rate": 0.0027589856440507522, "loss": 2.4978, "step": 18067 }, { "crossentropy": 2.537522315979004, "epoch": 0.6550174013921114, "grad_norm": 0.026892393827438354, "grad_norm_var": 6.237212208894203e-07, "learning_rate": 0.002758466206353838, "loss": 2.5678, "step": 18068 }, { "crossentropy": 2.4239161014556885, "epoch": 0.6550536542923434, "grad_norm": 0.027271760627627373, "grad_norm_var": 6.196015062505476e-07, "learning_rate": 0.0027579467989322383, "loss": 2.506, "step": 18069 }, { "crossentropy": 2.4373037815093994, "epoch": 0.6550899071925754, "grad_norm": 0.027046887204051018, "grad_norm_var": 3.816275082166772e-07, "learning_rate": 0.002757427421792972, "loss": 2.5054, "step": 18070 }, { "crossentropy": 2.4469892978668213, "epoch": 0.6551261600928074, "grad_norm": 0.026278380304574966, "grad_norm_var": 3.8742298928243154e-07, "learning_rate": 0.0027569080749430494, "loss": 2.4223, "step": 18071 }, { "crossentropy": 2.5280439853668213, "epoch": 0.6551624129930395, "grad_norm": 0.025763921439647675, "grad_norm_var": 4.112314072277426e-07, "learning_rate": 0.002756388758389489, "loss": 2.4464, "step": 18072 }, { "crossentropy": 2.6044962406158447, "epoch": 0.6551986658932715, "grad_norm": 0.0316634476184845, "grad_norm_var": 1.9220733543606847e-06, "learning_rate": 0.002755869472139302, "loss": 2.5567, "step": 18073 }, { "crossentropy": 2.5873570442199707, "epoch": 0.6552349187935035, "grad_norm": 0.02584480307996273, "grad_norm_var": 1.9774162944893724e-06, "learning_rate": 0.0027553502161995046, "loss": 2.4395, "step": 18074 }, { "crossentropy": 2.574256181716919, "epoch": 0.6552711716937355, "grad_norm": 0.026417722925543785, "grad_norm_var": 1.984961564386924e-06, "learning_rate": 0.0027548309905771097, "loss": 2.54, "step": 18075 }, { "crossentropy": 2.3623435497283936, "epoch": 0.6553074245939675, "grad_norm": 0.02767263352870941, "grad_norm_var": 1.984454219076008e-06, "learning_rate": 0.002754311795279126, "loss": 2.5008, "step": 18076 }, { "crossentropy": 2.5427474975585938, "epoch": 0.6553436774941995, "grad_norm": 0.02590942196547985, "grad_norm_var": 2.0433801585555134e-06, "learning_rate": 0.002753792630312571, "loss": 2.4921, "step": 18077 }, { "crossentropy": 2.708012104034424, "epoch": 0.6553799303944315, "grad_norm": 0.028482098132371902, "grad_norm_var": 2.1756036863045113e-06, "learning_rate": 0.0027532734956844567, "loss": 2.5863, "step": 18078 }, { "crossentropy": 2.6568117141723633, "epoch": 0.6554161832946636, "grad_norm": 0.02638055756688118, "grad_norm_var": 2.1914311128463317e-06, "learning_rate": 0.0027527543914017917, "loss": 2.6574, "step": 18079 }, { "crossentropy": 2.6112985610961914, "epoch": 0.6554524361948956, "grad_norm": 0.028719456866383553, "grad_norm_var": 2.3163176594906756e-06, "learning_rate": 0.0027522353174715903, "loss": 2.5597, "step": 18080 }, { "crossentropy": 2.5306107997894287, "epoch": 0.6554886890951276, "grad_norm": 0.026871806010603905, "grad_norm_var": 2.225318754675099e-06, "learning_rate": 0.0027517162739008607, "loss": 2.4954, "step": 18081 }, { "crossentropy": 2.59826397895813, "epoch": 0.6555249419953596, "grad_norm": 0.02712002769112587, "grad_norm_var": 2.1720625040734838e-06, "learning_rate": 0.002751197260696616, "loss": 2.5853, "step": 18082 }, { "crossentropy": 2.478527784347534, "epoch": 0.6555611948955916, "grad_norm": 0.026233995333313942, "grad_norm_var": 2.1960797546285066e-06, "learning_rate": 0.0027506782778658656, "loss": 2.4713, "step": 18083 }, { "crossentropy": 2.5471417903900146, "epoch": 0.6555974477958236, "grad_norm": 0.02745080552995205, "grad_norm_var": 2.1956007997257864e-06, "learning_rate": 0.0027501593254156166, "loss": 2.5179, "step": 18084 }, { "crossentropy": 2.5088510513305664, "epoch": 0.6556337006960556, "grad_norm": 0.027789708226919174, "grad_norm_var": 2.2176353748884837e-06, "learning_rate": 0.0027496404033528796, "loss": 2.4947, "step": 18085 }, { "crossentropy": 2.716160535812378, "epoch": 0.6556699535962877, "grad_norm": 0.027364319190382957, "grad_norm_var": 2.2162737549813806e-06, "learning_rate": 0.0027491215116846658, "loss": 2.6129, "step": 18086 }, { "crossentropy": 2.4380247592926025, "epoch": 0.6557062064965197, "grad_norm": 0.026624029502272606, "grad_norm_var": 2.1790685029253783e-06, "learning_rate": 0.0027486026504179805, "loss": 2.443, "step": 18087 }, { "crossentropy": 2.6823132038116455, "epoch": 0.6557424593967517, "grad_norm": 0.026172876358032227, "grad_norm_var": 2.1074371532771394e-06, "learning_rate": 0.0027480838195598346, "loss": 2.562, "step": 18088 }, { "crossentropy": 2.620661973953247, "epoch": 0.6557787122969838, "grad_norm": 0.028006264939904213, "grad_norm_var": 8.131424574587635e-07, "learning_rate": 0.0027475650191172325, "loss": 2.5418, "step": 18089 }, { "crossentropy": 2.588629722595215, "epoch": 0.6558149651972158, "grad_norm": 0.027792366221547127, "grad_norm_var": 7.330172103349348e-07, "learning_rate": 0.0027470462490971825, "loss": 2.536, "step": 18090 }, { "crossentropy": 2.4691193103790283, "epoch": 0.6558512180974478, "grad_norm": 0.02676447108387947, "grad_norm_var": 7.049193055394155e-07, "learning_rate": 0.0027465275095066967, "loss": 2.4818, "step": 18091 }, { "crossentropy": 2.4229819774627686, "epoch": 0.6558874709976799, "grad_norm": 0.026019375771284103, "grad_norm_var": 7.736966863598384e-07, "learning_rate": 0.0027460088003527718, "loss": 2.4521, "step": 18092 }, { "crossentropy": 2.6468515396118164, "epoch": 0.6559237238979119, "grad_norm": 0.02619239315390587, "grad_norm_var": 7.335417894918997e-07, "learning_rate": 0.002745490121642419, "loss": 2.5877, "step": 18093 }, { "crossentropy": 2.5544872283935547, "epoch": 0.6559599767981439, "grad_norm": 0.02773282490670681, "grad_norm_var": 6.329551723155963e-07, "learning_rate": 0.0027449714733826457, "loss": 2.6082, "step": 18094 }, { "crossentropy": 2.5557820796966553, "epoch": 0.6559962296983759, "grad_norm": 0.026061981916427612, "grad_norm_var": 6.688896419787856e-07, "learning_rate": 0.0027444528555804525, "loss": 2.4936, "step": 18095 }, { "crossentropy": 2.5659518241882324, "epoch": 0.6560324825986079, "grad_norm": 0.027666691690683365, "grad_norm_var": 5.048436635188627e-07, "learning_rate": 0.002743934268242848, "loss": 2.5642, "step": 18096 }, { "crossentropy": 2.608337163925171, "epoch": 0.6560687354988399, "grad_norm": 0.027815617620944977, "grad_norm_var": 5.454554501965834e-07, "learning_rate": 0.0027434157113768332, "loss": 2.6459, "step": 18097 }, { "crossentropy": 2.3810460567474365, "epoch": 0.6561049883990719, "grad_norm": 0.027568722143769264, "grad_norm_var": 5.62198866267162e-07, "learning_rate": 0.002742897184989414, "loss": 2.4844, "step": 18098 }, { "crossentropy": 2.534963846206665, "epoch": 0.656141241299304, "grad_norm": 0.02564093843102455, "grad_norm_var": 6.509619153481262e-07, "learning_rate": 0.0027423786890875975, "loss": 2.5723, "step": 18099 }, { "crossentropy": 2.5016603469848633, "epoch": 0.656177494199536, "grad_norm": 0.02713792212307453, "grad_norm_var": 6.40003496709289e-07, "learning_rate": 0.0027418602236783784, "loss": 2.4938, "step": 18100 }, { "crossentropy": 2.54966402053833, "epoch": 0.656213747099768, "grad_norm": 0.026348279789090157, "grad_norm_var": 6.222965685668176e-07, "learning_rate": 0.0027413417887687643, "loss": 2.4681, "step": 18101 }, { "crossentropy": 2.6453189849853516, "epoch": 0.65625, "grad_norm": 0.0272130835801363, "grad_norm_var": 6.15004787569926e-07, "learning_rate": 0.002740823384365759, "loss": 2.5733, "step": 18102 }, { "crossentropy": 2.579608201980591, "epoch": 0.656286252900232, "grad_norm": 0.02648775838315487, "grad_norm_var": 6.215860019373811e-07, "learning_rate": 0.0027403050104763604, "loss": 2.5677, "step": 18103 }, { "crossentropy": 2.457963705062866, "epoch": 0.656322505800464, "grad_norm": 0.02734297513961792, "grad_norm_var": 5.915553628526559e-07, "learning_rate": 0.0027397866671075733, "loss": 2.549, "step": 18104 }, { "crossentropy": 2.6046383380889893, "epoch": 0.656358758700696, "grad_norm": 0.02724006213247776, "grad_norm_var": 5.241164195567023e-07, "learning_rate": 0.0027392683542663953, "loss": 2.5601, "step": 18105 }, { "crossentropy": 2.7312281131744385, "epoch": 0.6563950116009281, "grad_norm": 0.026670238003134727, "grad_norm_var": 4.7515016828938996e-07, "learning_rate": 0.0027387500719598295, "loss": 2.6164, "step": 18106 }, { "crossentropy": 2.429360866546631, "epoch": 0.6564312645011601, "grad_norm": 0.02728133089840412, "grad_norm_var": 4.846459602253581e-07, "learning_rate": 0.0027382318201948797, "loss": 2.4638, "step": 18107 }, { "crossentropy": 2.667266368865967, "epoch": 0.6564675174013921, "grad_norm": 0.025880498811602592, "grad_norm_var": 5.021812138732151e-07, "learning_rate": 0.0027377135989785366, "loss": 2.5914, "step": 18108 }, { "crossentropy": 2.4911210536956787, "epoch": 0.6565037703016241, "grad_norm": 0.02587953209877014, "grad_norm_var": 5.375071017174951e-07, "learning_rate": 0.0027371954083178053, "loss": 2.4812, "step": 18109 }, { "crossentropy": 2.4533989429473877, "epoch": 0.6565400232018561, "grad_norm": 0.026870504021644592, "grad_norm_var": 4.851258923913576e-07, "learning_rate": 0.0027366772482196856, "loss": 2.4773, "step": 18110 }, { "crossentropy": 2.4389641284942627, "epoch": 0.6565762761020881, "grad_norm": 0.025766106322407722, "grad_norm_var": 5.204669822106896e-07, "learning_rate": 0.002736159118691173, "loss": 2.4414, "step": 18111 }, { "crossentropy": 2.486937999725342, "epoch": 0.6566125290023201, "grad_norm": 0.027267353609204292, "grad_norm_var": 4.843209947684995e-07, "learning_rate": 0.002735641019739268, "loss": 2.4945, "step": 18112 }, { "crossentropy": 2.6202707290649414, "epoch": 0.6566487819025522, "grad_norm": 0.02564946934580803, "grad_norm_var": 4.772296252822007e-07, "learning_rate": 0.002735122951370965, "loss": 2.5179, "step": 18113 }, { "crossentropy": 2.5607059001922607, "epoch": 0.6566850348027842, "grad_norm": 0.027233164757490158, "grad_norm_var": 4.427284563193167e-07, "learning_rate": 0.0027346049135932647, "loss": 2.531, "step": 18114 }, { "crossentropy": 2.413736343383789, "epoch": 0.6567212877030162, "grad_norm": 0.026391711086034775, "grad_norm_var": 3.8001761994693506e-07, "learning_rate": 0.0027340869064131647, "loss": 2.5279, "step": 18115 }, { "crossentropy": 2.4801948070526123, "epoch": 0.6567575406032483, "grad_norm": 0.02754807472229004, "grad_norm_var": 4.1632606557691297e-07, "learning_rate": 0.00273356892983766, "loss": 2.4718, "step": 18116 }, { "crossentropy": 2.570317029953003, "epoch": 0.6567937935034803, "grad_norm": 0.027472611516714096, "grad_norm_var": 4.438236768780056e-07, "learning_rate": 0.0027330509838737437, "loss": 2.523, "step": 18117 }, { "crossentropy": 2.509155511856079, "epoch": 0.6568300464037123, "grad_norm": 0.026402978226542473, "grad_norm_var": 4.3613393609897205e-07, "learning_rate": 0.002732533068528416, "loss": 2.5534, "step": 18118 }, { "crossentropy": 2.4336345195770264, "epoch": 0.6568662993039444, "grad_norm": 0.02585725113749504, "grad_norm_var": 4.797915150581e-07, "learning_rate": 0.0027320151838086683, "loss": 2.4936, "step": 18119 }, { "crossentropy": 2.482135057449341, "epoch": 0.6569025522041764, "grad_norm": 0.026682734489440918, "grad_norm_var": 4.479792744561343e-07, "learning_rate": 0.0027314973297214995, "loss": 2.4297, "step": 18120 }, { "crossentropy": 2.5406339168548584, "epoch": 0.6569388051044084, "grad_norm": 0.026671603322029114, "grad_norm_var": 4.22001029599881e-07, "learning_rate": 0.0027309795062739, "loss": 2.5164, "step": 18121 }, { "crossentropy": 2.3946421146392822, "epoch": 0.6569750580046404, "grad_norm": 0.026353631168603897, "grad_norm_var": 4.2510352745576905e-07, "learning_rate": 0.0027304617134728654, "loss": 2.4569, "step": 18122 }, { "crossentropy": 2.519383192062378, "epoch": 0.6570113109048724, "grad_norm": 0.029276439920067787, "grad_norm_var": 8.616343208005337e-07, "learning_rate": 0.0027299439513253916, "loss": 2.5142, "step": 18123 }, { "crossentropy": 2.4703128337860107, "epoch": 0.6570475638051044, "grad_norm": 0.02585374377667904, "grad_norm_var": 8.64603315060353e-07, "learning_rate": 0.00272942621983847, "loss": 2.5121, "step": 18124 }, { "crossentropy": 2.4725680351257324, "epoch": 0.6570838167053364, "grad_norm": 0.026485878974199295, "grad_norm_var": 8.213667689513457e-07, "learning_rate": 0.0027289085190190902, "loss": 2.4256, "step": 18125 }, { "crossentropy": 2.4291253089904785, "epoch": 0.6571200696055685, "grad_norm": 0.03102840855717659, "grad_norm_var": 1.9761932910449064e-06, "learning_rate": 0.0027283908488742498, "loss": 2.5188, "step": 18126 }, { "crossentropy": 2.419231653213501, "epoch": 0.6571563225058005, "grad_norm": 0.028675401583313942, "grad_norm_var": 2.027984921259398e-06, "learning_rate": 0.0027278732094109365, "loss": 2.4138, "step": 18127 }, { "crossentropy": 2.489664077758789, "epoch": 0.6571925754060325, "grad_norm": 0.025886595249176025, "grad_norm_var": 2.1307189369971597e-06, "learning_rate": 0.0027273556006361453, "loss": 2.4961, "step": 18128 }, { "crossentropy": 2.713850259780884, "epoch": 0.6572288283062645, "grad_norm": 0.028658466413617134, "grad_norm_var": 2.1179128889153477e-06, "learning_rate": 0.0027268380225568636, "loss": 2.5758, "step": 18129 }, { "crossentropy": 2.55005145072937, "epoch": 0.6572650812064965, "grad_norm": 0.028063466772437096, "grad_norm_var": 2.155824522723689e-06, "learning_rate": 0.002726320475180084, "loss": 2.5153, "step": 18130 }, { "crossentropy": 2.4861271381378174, "epoch": 0.6573013341067285, "grad_norm": 0.026520874351263046, "grad_norm_var": 2.14067701375803e-06, "learning_rate": 0.0027258029585127993, "loss": 2.5696, "step": 18131 }, { "crossentropy": 2.4490416049957275, "epoch": 0.6573375870069605, "grad_norm": 0.026022523641586304, "grad_norm_var": 2.2437864398552886e-06, "learning_rate": 0.002725285472561994, "loss": 2.4952, "step": 18132 }, { "crossentropy": 2.584645986557007, "epoch": 0.6573738399071926, "grad_norm": 0.026281988248229027, "grad_norm_var": 2.29617881811552e-06, "learning_rate": 0.0027247680173346635, "loss": 2.5272, "step": 18133 }, { "crossentropy": 2.554391384124756, "epoch": 0.6574100928074246, "grad_norm": 0.025121666491031647, "grad_norm_var": 2.529849207214864e-06, "learning_rate": 0.0027242505928377924, "loss": 2.4929, "step": 18134 }, { "crossentropy": 2.4890151023864746, "epoch": 0.6574463457076566, "grad_norm": 0.027606159448623657, "grad_norm_var": 2.433544936165701e-06, "learning_rate": 0.00272373319907837, "loss": 2.5345, "step": 18135 }, { "crossentropy": 2.3897480964660645, "epoch": 0.6574825986078886, "grad_norm": 0.025590553879737854, "grad_norm_var": 2.5833300904177143e-06, "learning_rate": 0.0027232158360633862, "loss": 2.4331, "step": 18136 }, { "crossentropy": 2.6557071208953857, "epoch": 0.6575188515081206, "grad_norm": 0.034592173993587494, "grad_norm_var": 6.019044703075663e-06, "learning_rate": 0.002722698503799825, "loss": 2.5723, "step": 18137 }, { "crossentropy": 2.3847129344940186, "epoch": 0.6575551044083526, "grad_norm": 0.026912037283182144, "grad_norm_var": 5.943790973157725e-06, "learning_rate": 0.0027221812022946777, "loss": 2.4607, "step": 18138 }, { "crossentropy": 2.526158571243286, "epoch": 0.6575913573085846, "grad_norm": 0.02776653692126274, "grad_norm_var": 5.761062667685107e-06, "learning_rate": 0.0027216639315549318, "loss": 2.5333, "step": 18139 }, { "crossentropy": 2.567877769470215, "epoch": 0.6576276102088167, "grad_norm": 0.025623196735978127, "grad_norm_var": 5.817038870462596e-06, "learning_rate": 0.00272114669158757, "loss": 2.4904, "step": 18140 }, { "crossentropy": 2.5517661571502686, "epoch": 0.6576638631090487, "grad_norm": 0.02607799880206585, "grad_norm_var": 5.885430063602704e-06, "learning_rate": 0.002720629482399582, "loss": 2.565, "step": 18141 }, { "crossentropy": 2.495814323425293, "epoch": 0.6577001160092807, "grad_norm": 0.026346465572714806, "grad_norm_var": 5.069526737330035e-06, "learning_rate": 0.0027201123039979525, "loss": 2.4682, "step": 18142 }, { "crossentropy": 2.588534116744995, "epoch": 0.6577363689095128, "grad_norm": 0.026896752417087555, "grad_norm_var": 4.925449467166795e-06, "learning_rate": 0.002719595156389664, "loss": 2.5681, "step": 18143 }, { "crossentropy": 2.6288251876831055, "epoch": 0.6577726218097448, "grad_norm": 0.02680593729019165, "grad_norm_var": 4.826720812640961e-06, "learning_rate": 0.002719078039581705, "loss": 2.6793, "step": 18144 }, { "crossentropy": 2.5285661220550537, "epoch": 0.6578088747099768, "grad_norm": 0.02665325067937374, "grad_norm_var": 4.682854164285837e-06, "learning_rate": 0.002718560953581056, "loss": 2.4602, "step": 18145 }, { "crossentropy": 2.5478217601776123, "epoch": 0.6578451276102089, "grad_norm": 0.02702450193464756, "grad_norm_var": 4.6106318262131385e-06, "learning_rate": 0.0027180438983947053, "loss": 2.5234, "step": 18146 }, { "crossentropy": 2.368372678756714, "epoch": 0.6578813805104409, "grad_norm": 0.02860712818801403, "grad_norm_var": 4.752119400502084e-06, "learning_rate": 0.0027175268740296356, "loss": 2.4596, "step": 18147 }, { "crossentropy": 2.4279706478118896, "epoch": 0.6579176334106729, "grad_norm": 0.02547626383602619, "grad_norm_var": 4.850744067393976e-06, "learning_rate": 0.0027170098804928274, "loss": 2.4604, "step": 18148 }, { "crossentropy": 2.585604190826416, "epoch": 0.6579538863109049, "grad_norm": 0.027508489787578583, "grad_norm_var": 4.8132127298027795e-06, "learning_rate": 0.002716492917791267, "loss": 2.5545, "step": 18149 }, { "crossentropy": 2.577512264251709, "epoch": 0.6579901392111369, "grad_norm": 0.028138110414147377, "grad_norm_var": 4.560859042629373e-06, "learning_rate": 0.0027159759859319353, "loss": 2.6158, "step": 18150 }, { "crossentropy": 2.5089242458343506, "epoch": 0.6580263921113689, "grad_norm": 0.025937560945749283, "grad_norm_var": 4.678237922198178e-06, "learning_rate": 0.002715459084921813, "loss": 2.4245, "step": 18151 }, { "crossentropy": 2.4968464374542236, "epoch": 0.6580626450116009, "grad_norm": 0.026566674932837486, "grad_norm_var": 4.522162765139984e-06, "learning_rate": 0.0027149422147678837, "loss": 2.4367, "step": 18152 }, { "crossentropy": 2.5165555477142334, "epoch": 0.658098897911833, "grad_norm": 0.02778942510485649, "grad_norm_var": 8.078002221243204e-07, "learning_rate": 0.0027144253754771254, "loss": 2.4585, "step": 18153 }, { "crossentropy": 2.5577762126922607, "epoch": 0.658135150812065, "grad_norm": 0.02712843380868435, "grad_norm_var": 8.115605438743789e-07, "learning_rate": 0.002713908567056521, "loss": 2.4893, "step": 18154 }, { "crossentropy": 2.472283124923706, "epoch": 0.658171403712297, "grad_norm": 0.02641673944890499, "grad_norm_var": 7.688801465976682e-07, "learning_rate": 0.0027133917895130534, "loss": 2.4679, "step": 18155 }, { "crossentropy": 2.4598147869110107, "epoch": 0.658207656612529, "grad_norm": 0.027362769469618797, "grad_norm_var": 6.82206129093291e-07, "learning_rate": 0.0027128750428536967, "loss": 2.4259, "step": 18156 }, { "crossentropy": 2.44519305229187, "epoch": 0.658243909512761, "grad_norm": 0.026841960847377777, "grad_norm_var": 6.328108486594597e-07, "learning_rate": 0.002712358327085436, "loss": 2.4479, "step": 18157 }, { "crossentropy": 2.440647602081299, "epoch": 0.658280162412993, "grad_norm": 0.026981906965374947, "grad_norm_var": 6.053216271686401e-07, "learning_rate": 0.002711841642215248, "loss": 2.5064, "step": 18158 }, { "crossentropy": 2.582547187805176, "epoch": 0.658316415313225, "grad_norm": 0.0277712345123291, "grad_norm_var": 6.400877437047365e-07, "learning_rate": 0.0027113249882501082, "loss": 2.5221, "step": 18159 }, { "crossentropy": 2.5636236667633057, "epoch": 0.6583526682134571, "grad_norm": 0.02654549479484558, "grad_norm_var": 6.5325899402015e-07, "learning_rate": 0.002710808365197, "loss": 2.5115, "step": 18160 }, { "crossentropy": 2.4708669185638428, "epoch": 0.6583889211136891, "grad_norm": 0.02640615776181221, "grad_norm_var": 6.700430515931318e-07, "learning_rate": 0.002710291773062897, "loss": 2.5144, "step": 18161 }, { "crossentropy": 2.4356720447540283, "epoch": 0.6584251740139211, "grad_norm": 0.026526235044002533, "grad_norm_var": 6.860200769287505e-07, "learning_rate": 0.002709775211854778, "loss": 2.432, "step": 18162 }, { "crossentropy": 2.588329315185547, "epoch": 0.6584614269141531, "grad_norm": 0.027560217306017876, "grad_norm_var": 5.30225486342073e-07, "learning_rate": 0.0027092586815796226, "loss": 2.5493, "step": 18163 }, { "crossentropy": 2.416368246078491, "epoch": 0.6584976798143851, "grad_norm": 0.026052962988615036, "grad_norm_var": 4.388561082388585e-07, "learning_rate": 0.0027087421822444033, "loss": 2.4786, "step": 18164 }, { "crossentropy": 2.5904324054718018, "epoch": 0.6585339327146171, "grad_norm": 0.02682541497051716, "grad_norm_var": 4.19056038577538e-07, "learning_rate": 0.0027082257138560995, "loss": 2.6783, "step": 18165 }, { "crossentropy": 2.5907363891601562, "epoch": 0.6585701856148491, "grad_norm": 0.027007024735212326, "grad_norm_var": 3.165483382188873e-07, "learning_rate": 0.002707709276421685, "loss": 2.5304, "step": 18166 }, { "crossentropy": 2.484837055206299, "epoch": 0.6586064385150812, "grad_norm": 0.027264820411801338, "grad_norm_var": 2.6384737187018274e-07, "learning_rate": 0.002707192869948134, "loss": 2.6194, "step": 18167 }, { "crossentropy": 2.514230728149414, "epoch": 0.6586426914153132, "grad_norm": 0.025597885251045227, "grad_norm_var": 3.7079042428550013e-07, "learning_rate": 0.0027066764944424244, "loss": 2.4548, "step": 18168 }, { "crossentropy": 2.514613389968872, "epoch": 0.6586789443155452, "grad_norm": 0.02672741189599037, "grad_norm_var": 3.1249457939351054e-07, "learning_rate": 0.002706160149911527, "loss": 2.5854, "step": 18169 }, { "crossentropy": 2.5608744621276855, "epoch": 0.6587151972157773, "grad_norm": 0.026652825996279716, "grad_norm_var": 3.066635795224837e-07, "learning_rate": 0.002705643836362418, "loss": 2.6214, "step": 18170 }, { "crossentropy": 2.515150785446167, "epoch": 0.6587514501160093, "grad_norm": 0.026955798268318176, "grad_norm_var": 2.9844162947901635e-07, "learning_rate": 0.0027051275538020727, "loss": 2.5076, "step": 18171 }, { "crossentropy": 2.4512338638305664, "epoch": 0.6587877030162413, "grad_norm": 0.025950852781534195, "grad_norm_var": 3.2038734313331353e-07, "learning_rate": 0.0027046113022374607, "loss": 2.4383, "step": 18172 }, { "crossentropy": 2.439321994781494, "epoch": 0.6588239559164734, "grad_norm": 0.026022514328360558, "grad_norm_var": 3.500422922926456e-07, "learning_rate": 0.002704095081675558, "loss": 2.4764, "step": 18173 }, { "crossentropy": 2.6051743030548096, "epoch": 0.6588602088167054, "grad_norm": 0.02697172947227955, "grad_norm_var": 3.496364289446233e-07, "learning_rate": 0.0027035788921233334, "loss": 2.4666, "step": 18174 }, { "crossentropy": 2.5028512477874756, "epoch": 0.6588964617169374, "grad_norm": 0.028885554522275925, "grad_norm_var": 5.89759128385529e-07, "learning_rate": 0.0027030627335877626, "loss": 2.4691, "step": 18175 }, { "crossentropy": 2.4586334228515625, "epoch": 0.6589327146171694, "grad_norm": 0.026688696816563606, "grad_norm_var": 5.871922695595596e-07, "learning_rate": 0.002702546606075816, "loss": 2.5289, "step": 18176 }, { "crossentropy": 2.4902565479278564, "epoch": 0.6589689675174014, "grad_norm": 0.025278598070144653, "grad_norm_var": 7.192508959643577e-07, "learning_rate": 0.0027020305095944615, "loss": 2.5721, "step": 18177 }, { "crossentropy": 2.4280343055725098, "epoch": 0.6590052204176334, "grad_norm": 0.026630576699972153, "grad_norm_var": 7.177151435312634e-07, "learning_rate": 0.0027015144441506723, "loss": 2.4787, "step": 18178 }, { "crossentropy": 2.5348100662231445, "epoch": 0.6590414733178654, "grad_norm": 0.02628283016383648, "grad_norm_var": 6.718336454258292e-07, "learning_rate": 0.0027009984097514208, "loss": 2.5314, "step": 18179 }, { "crossentropy": 2.473440408706665, "epoch": 0.6590777262180975, "grad_norm": 0.026757732033729553, "grad_norm_var": 6.503245539335543e-07, "learning_rate": 0.0027004824064036726, "loss": 2.4343, "step": 18180 }, { "crossentropy": 2.513352394104004, "epoch": 0.6591139791183295, "grad_norm": 0.025395695120096207, "grad_norm_var": 7.458361041131716e-07, "learning_rate": 0.002699966434114401, "loss": 2.537, "step": 18181 }, { "crossentropy": 2.4425790309906006, "epoch": 0.6591502320185615, "grad_norm": 0.026630345731973648, "grad_norm_var": 7.325997399466415e-07, "learning_rate": 0.002699450492890571, "loss": 2.5549, "step": 18182 }, { "crossentropy": 2.5524795055389404, "epoch": 0.6591864849187935, "grad_norm": 0.026435963809490204, "grad_norm_var": 6.958065570234478e-07, "learning_rate": 0.002698934582739155, "loss": 2.4838, "step": 18183 }, { "crossentropy": 2.5185952186584473, "epoch": 0.6592227378190255, "grad_norm": 0.025959372520446777, "grad_norm_var": 6.608998629130016e-07, "learning_rate": 0.0026984187036671182, "loss": 2.5635, "step": 18184 }, { "crossentropy": 2.354281425476074, "epoch": 0.6592589907192575, "grad_norm": 0.026436563581228256, "grad_norm_var": 6.579168989803101e-07, "learning_rate": 0.002697902855681432, "loss": 2.4206, "step": 18185 }, { "crossentropy": 2.4255547523498535, "epoch": 0.6592952436194895, "grad_norm": 0.026614906266331673, "grad_norm_var": 6.572137508843115e-07, "learning_rate": 0.0026973870387890586, "loss": 2.4753, "step": 18186 }, { "crossentropy": 2.576198101043701, "epoch": 0.6593314965197216, "grad_norm": 0.027380619198083878, "grad_norm_var": 6.946730411434467e-07, "learning_rate": 0.0026968712529969697, "loss": 2.5245, "step": 18187 }, { "crossentropy": 2.6252543926239014, "epoch": 0.6593677494199536, "grad_norm": 0.02841128408908844, "grad_norm_var": 8.862653547991727e-07, "learning_rate": 0.0026963554983121273, "loss": 2.5835, "step": 18188 }, { "crossentropy": 2.5630524158477783, "epoch": 0.6594040023201856, "grad_norm": 0.02789895050227642, "grad_norm_var": 9.433483904709803e-07, "learning_rate": 0.002695839774741502, "loss": 2.4962, "step": 18189 }, { "crossentropy": 2.3712167739868164, "epoch": 0.6594402552204176, "grad_norm": 0.026059037074446678, "grad_norm_var": 9.73443966134158e-07, "learning_rate": 0.0026953240822920554, "loss": 2.4795, "step": 18190 }, { "crossentropy": 2.6483089923858643, "epoch": 0.6594765081206496, "grad_norm": 0.02746783010661602, "grad_norm_var": 6.923893981783391e-07, "learning_rate": 0.0026948084209707564, "loss": 2.5951, "step": 18191 }, { "crossentropy": 2.384303331375122, "epoch": 0.6595127610208816, "grad_norm": 0.027663826942443848, "grad_norm_var": 7.574275158674412e-07, "learning_rate": 0.002694292790784566, "loss": 2.5637, "step": 18192 }, { "crossentropy": 2.4661643505096436, "epoch": 0.6595490139211136, "grad_norm": 0.025850940495729446, "grad_norm_var": 6.689338636060715e-07, "learning_rate": 0.0026937771917404517, "loss": 2.5032, "step": 18193 }, { "crossentropy": 2.520005941390991, "epoch": 0.6595852668213457, "grad_norm": 0.02653689682483673, "grad_norm_var": 6.708776009485638e-07, "learning_rate": 0.0026932616238453757, "loss": 2.5565, "step": 18194 }, { "crossentropy": 2.556833267211914, "epoch": 0.6596215197215777, "grad_norm": 0.026003798469901085, "grad_norm_var": 6.926194006895743e-07, "learning_rate": 0.0026927460871063033, "loss": 2.5756, "step": 18195 }, { "crossentropy": 2.5571987628936768, "epoch": 0.6596577726218097, "grad_norm": 0.026444310322403908, "grad_norm_var": 6.971397594370913e-07, "learning_rate": 0.002692230581530194, "loss": 2.5919, "step": 18196 }, { "crossentropy": 2.595045566558838, "epoch": 0.6596940255220418, "grad_norm": 0.026928970590233803, "grad_norm_var": 5.775487051102538e-07, "learning_rate": 0.0026917151071240136, "loss": 2.5604, "step": 18197 }, { "crossentropy": 2.4369394779205322, "epoch": 0.6597302784222738, "grad_norm": 0.02616187371313572, "grad_norm_var": 6.015642397928673e-07, "learning_rate": 0.0026911996638947257, "loss": 2.4994, "step": 18198 }, { "crossentropy": 2.3887929916381836, "epoch": 0.6597665313225058, "grad_norm": 0.025478767231106758, "grad_norm_var": 7.009427647431161e-07, "learning_rate": 0.0026906842518492876, "loss": 2.4289, "step": 18199 }, { "crossentropy": 2.5873336791992188, "epoch": 0.6598027842227379, "grad_norm": 0.02630865015089512, "grad_norm_var": 6.73791070372511e-07, "learning_rate": 0.0026901688709946646, "loss": 2.5811, "step": 18200 }, { "crossentropy": 2.5830752849578857, "epoch": 0.6598390371229699, "grad_norm": 0.026343636214733124, "grad_norm_var": 6.779411783008315e-07, "learning_rate": 0.002689653521337817, "loss": 2.5734, "step": 18201 }, { "crossentropy": 2.667008638381958, "epoch": 0.6598752900232019, "grad_norm": 0.026413457468152046, "grad_norm_var": 6.833579080364171e-07, "learning_rate": 0.0026891382028857032, "loss": 2.5848, "step": 18202 }, { "crossentropy": 2.5580904483795166, "epoch": 0.6599115429234339, "grad_norm": 0.027449406683444977, "grad_norm_var": 6.89808433791452e-07, "learning_rate": 0.0026886229156452865, "loss": 2.4382, "step": 18203 }, { "crossentropy": 2.660740852355957, "epoch": 0.6599477958236659, "grad_norm": 0.02618064545094967, "grad_norm_var": 4.959451398944681e-07, "learning_rate": 0.002688107659623522, "loss": 2.5286, "step": 18204 }, { "crossentropy": 2.53054141998291, "epoch": 0.6599840487238979, "grad_norm": 0.027562977746129036, "grad_norm_var": 4.436666195847209e-07, "learning_rate": 0.002687592434827373, "loss": 2.6032, "step": 18205 }, { "crossentropy": 2.5896615982055664, "epoch": 0.66002030162413, "grad_norm": 0.025916850194334984, "grad_norm_var": 4.5430318629129977e-07, "learning_rate": 0.002687077241263799, "loss": 2.5603, "step": 18206 }, { "crossentropy": 2.418370008468628, "epoch": 0.660056554524362, "grad_norm": 0.026159455999732018, "grad_norm_var": 4.0022767777643923e-07, "learning_rate": 0.002686562078939755, "loss": 2.4717, "step": 18207 }, { "crossentropy": 2.4968366622924805, "epoch": 0.660092807424594, "grad_norm": 0.026094920933246613, "grad_norm_var": 3.0282513549758675e-07, "learning_rate": 0.002686046947862203, "loss": 2.4728, "step": 18208 }, { "crossentropy": 2.5404491424560547, "epoch": 0.660129060324826, "grad_norm": 0.02604036033153534, "grad_norm_var": 2.9209156296818896e-07, "learning_rate": 0.0026855318480380986, "loss": 2.4913, "step": 18209 }, { "crossentropy": 2.554245948791504, "epoch": 0.660165313225058, "grad_norm": 0.02597016468644142, "grad_norm_var": 3.0004999425786194e-07, "learning_rate": 0.002685016779474396, "loss": 2.492, "step": 18210 }, { "crossentropy": 2.4811325073242188, "epoch": 0.66020156612529, "grad_norm": 0.02635130286216736, "grad_norm_var": 2.9196707763776835e-07, "learning_rate": 0.0026845017421780583, "loss": 2.5514, "step": 18211 }, { "crossentropy": 2.4549248218536377, "epoch": 0.660237819025522, "grad_norm": 0.026354974135756493, "grad_norm_var": 2.9149568543685814e-07, "learning_rate": 0.0026839867361560356, "loss": 2.4735, "step": 18212 }, { "crossentropy": 2.3965511322021484, "epoch": 0.660274071925754, "grad_norm": 0.02707350254058838, "grad_norm_var": 3.038183647168276e-07, "learning_rate": 0.002683471761415287, "loss": 2.4735, "step": 18213 }, { "crossentropy": 2.5510687828063965, "epoch": 0.6603103248259861, "grad_norm": 0.027916157618165016, "grad_norm_var": 4.4834449390711236e-07, "learning_rate": 0.0026829568179627694, "loss": 2.5693, "step": 18214 }, { "crossentropy": 2.663999080657959, "epoch": 0.6603465777262181, "grad_norm": 0.027419356629252434, "grad_norm_var": 4.256956476111068e-07, "learning_rate": 0.002682441905805434, "loss": 2.5991, "step": 18215 }, { "crossentropy": 2.4026498794555664, "epoch": 0.6603828306264501, "grad_norm": 0.027355553582310677, "grad_norm_var": 4.5391282275027085e-07, "learning_rate": 0.0026819270249502404, "loss": 2.5276, "step": 18216 }, { "crossentropy": 2.4894793033599854, "epoch": 0.6604190835266821, "grad_norm": 0.027692919597029686, "grad_norm_var": 5.103025430316932e-07, "learning_rate": 0.0026814121754041398, "loss": 2.5203, "step": 18217 }, { "crossentropy": 2.4238784313201904, "epoch": 0.6604553364269141, "grad_norm": 0.02628934755921364, "grad_norm_var": 5.167847118229951e-07, "learning_rate": 0.002680897357174084, "loss": 2.4899, "step": 18218 }, { "crossentropy": 2.536346197128296, "epoch": 0.6604915893271461, "grad_norm": 0.02582094632089138, "grad_norm_var": 5.283310702916543e-07, "learning_rate": 0.0026803825702670313, "loss": 2.4704, "step": 18219 }, { "crossentropy": 2.3875656127929688, "epoch": 0.6605278422273781, "grad_norm": 0.027029670774936676, "grad_norm_var": 5.21670331894231e-07, "learning_rate": 0.0026798678146899304, "loss": 2.521, "step": 18220 }, { "crossentropy": 2.677910089492798, "epoch": 0.6605640951276102, "grad_norm": 0.027278877794742584, "grad_norm_var": 4.93666522053543e-07, "learning_rate": 0.0026793530904497343, "loss": 2.6109, "step": 18221 }, { "crossentropy": 2.51757550239563, "epoch": 0.6606003480278422, "grad_norm": 0.027509115636348724, "grad_norm_var": 4.916394668262689e-07, "learning_rate": 0.0026788383975533993, "loss": 2.5241, "step": 18222 }, { "crossentropy": 2.4439473152160645, "epoch": 0.6606366009280742, "grad_norm": 0.025718018412590027, "grad_norm_var": 5.39889010457648e-07, "learning_rate": 0.0026783237360078725, "loss": 2.4473, "step": 18223 }, { "crossentropy": 2.405364990234375, "epoch": 0.6606728538283063, "grad_norm": 0.027512019500136375, "grad_norm_var": 5.426261926503995e-07, "learning_rate": 0.002677809105820108, "loss": 2.5131, "step": 18224 }, { "crossentropy": 2.6366164684295654, "epoch": 0.6607091067285383, "grad_norm": 0.026375561952590942, "grad_norm_var": 5.14210842513874e-07, "learning_rate": 0.0026772945069970565, "loss": 2.6674, "step": 18225 }, { "crossentropy": 2.4718613624572754, "epoch": 0.6607453596287703, "grad_norm": 0.026469700038433075, "grad_norm_var": 4.709246910776134e-07, "learning_rate": 0.002676779939545665, "loss": 2.5263, "step": 18226 }, { "crossentropy": 2.3672468662261963, "epoch": 0.6607816125290024, "grad_norm": 0.027910761535167694, "grad_norm_var": 5.11857373853873e-07, "learning_rate": 0.0026762654034728885, "loss": 2.487, "step": 18227 }, { "crossentropy": 2.491002082824707, "epoch": 0.6608178654292344, "grad_norm": 0.026577822864055634, "grad_norm_var": 4.963034017401353e-07, "learning_rate": 0.0026757508987856715, "loss": 2.4678, "step": 18228 }, { "crossentropy": 2.5392589569091797, "epoch": 0.6608541183294664, "grad_norm": 0.026573767885565758, "grad_norm_var": 5.068032500329106e-07, "learning_rate": 0.002675236425490967, "loss": 2.5784, "step": 18229 }, { "crossentropy": 2.438441753387451, "epoch": 0.6608903712296984, "grad_norm": 0.026714354753494263, "grad_norm_var": 4.4475614405231393e-07, "learning_rate": 0.002674721983595723, "loss": 2.4942, "step": 18230 }, { "crossentropy": 2.6161880493164062, "epoch": 0.6609266241299304, "grad_norm": 0.027025334537029266, "grad_norm_var": 4.2667464906184725e-07, "learning_rate": 0.002674207573106886, "loss": 2.6338, "step": 18231 }, { "crossentropy": 2.4733920097351074, "epoch": 0.6609628770301624, "grad_norm": 0.026099471375346184, "grad_norm_var": 4.4327096996517265e-07, "learning_rate": 0.002673693194031407, "loss": 2.5606, "step": 18232 }, { "crossentropy": 2.4779388904571533, "epoch": 0.6609991299303944, "grad_norm": 0.027133386582136154, "grad_norm_var": 3.952792385508256e-07, "learning_rate": 0.0026731788463762317, "loss": 2.488, "step": 18233 }, { "crossentropy": 2.4550740718841553, "epoch": 0.6610353828306265, "grad_norm": 0.036021504551172256, "grad_norm_var": 5.714112346137898e-06, "learning_rate": 0.0026726645301483054, "loss": 2.4757, "step": 18234 }, { "crossentropy": 2.6327996253967285, "epoch": 0.6610716357308585, "grad_norm": 0.027861827984452248, "grad_norm_var": 5.555458272007001e-06, "learning_rate": 0.0026721502453545785, "loss": 2.6036, "step": 18235 }, { "crossentropy": 2.5920512676239014, "epoch": 0.6611078886310905, "grad_norm": 0.02503751404583454, "grad_norm_var": 5.925296179801762e-06, "learning_rate": 0.002671635992001994, "loss": 2.5576, "step": 18236 }, { "crossentropy": 2.4814820289611816, "epoch": 0.6611441415313225, "grad_norm": 0.02660270780324936, "grad_norm_var": 5.961517871865791e-06, "learning_rate": 0.0026711217700974977, "loss": 2.4682, "step": 18237 }, { "crossentropy": 2.488884210586548, "epoch": 0.6611803944315545, "grad_norm": 0.02698654495179653, "grad_norm_var": 5.965508131949328e-06, "learning_rate": 0.0026706075796480377, "loss": 2.4433, "step": 18238 }, { "crossentropy": 2.6003549098968506, "epoch": 0.6612166473317865, "grad_norm": 0.026021571829915047, "grad_norm_var": 5.907692959684777e-06, "learning_rate": 0.0026700934206605564, "loss": 2.5689, "step": 18239 }, { "crossentropy": 2.589470624923706, "epoch": 0.6612529002320185, "grad_norm": 0.027211477980017662, "grad_norm_var": 5.905152386014587e-06, "learning_rate": 0.002669579293142001, "loss": 2.5937, "step": 18240 }, { "crossentropy": 2.5271997451782227, "epoch": 0.6612891531322506, "grad_norm": 0.027567852288484573, "grad_norm_var": 5.8487953861593165e-06, "learning_rate": 0.0026690651970993117, "loss": 2.5405, "step": 18241 }, { "crossentropy": 2.4269285202026367, "epoch": 0.6613254060324826, "grad_norm": 0.02613687328994274, "grad_norm_var": 5.895381703643898e-06, "learning_rate": 0.0026685511325394364, "loss": 2.4837, "step": 18242 }, { "crossentropy": 2.647939443588257, "epoch": 0.6613616589327146, "grad_norm": 0.026774125173687935, "grad_norm_var": 5.890033473474618e-06, "learning_rate": 0.0026680370994693154, "loss": 2.5904, "step": 18243 }, { "crossentropy": 2.5808305740356445, "epoch": 0.6613979118329466, "grad_norm": 0.027918484061956406, "grad_norm_var": 5.878347230488925e-06, "learning_rate": 0.002667523097895891, "loss": 2.6206, "step": 18244 }, { "crossentropy": 2.7143778800964355, "epoch": 0.6614341647331786, "grad_norm": 0.027813468128442764, "grad_norm_var": 5.845198047828235e-06, "learning_rate": 0.0026670091278261066, "loss": 2.6896, "step": 18245 }, { "crossentropy": 2.5417003631591797, "epoch": 0.6614704176334106, "grad_norm": 0.026316339150071144, "grad_norm_var": 5.893231699577552e-06, "learning_rate": 0.002666495189266906, "loss": 2.4916, "step": 18246 }, { "crossentropy": 2.3978824615478516, "epoch": 0.6615066705336426, "grad_norm": 0.025624819099903107, "grad_norm_var": 6.087284748401108e-06, "learning_rate": 0.002665981282225227, "loss": 2.4395, "step": 18247 }, { "crossentropy": 2.649874687194824, "epoch": 0.6615429234338747, "grad_norm": 0.02642037719488144, "grad_norm_var": 6.041476425050485e-06, "learning_rate": 0.0026654674067080153, "loss": 2.6049, "step": 18248 }, { "crossentropy": 2.4999239444732666, "epoch": 0.6615791763341067, "grad_norm": 0.026763351634144783, "grad_norm_var": 6.060255547291162e-06, "learning_rate": 0.0026649535627222077, "loss": 2.5703, "step": 18249 }, { "crossentropy": 2.5293378829956055, "epoch": 0.6616154292343387, "grad_norm": 0.025766611099243164, "grad_norm_var": 7.316819238006259e-07, "learning_rate": 0.002664439750274746, "loss": 2.419, "step": 18250 }, { "crossentropy": 2.573875665664673, "epoch": 0.6616516821345708, "grad_norm": 0.02840999700129032, "grad_norm_var": 8.370974279951153e-07, "learning_rate": 0.0026639259693725715, "loss": 2.6175, "step": 18251 }, { "crossentropy": 2.552440643310547, "epoch": 0.6616879350348028, "grad_norm": 0.026343844830989838, "grad_norm_var": 6.523124848174799e-07, "learning_rate": 0.0026634122200226195, "loss": 2.5775, "step": 18252 }, { "crossentropy": 2.540104627609253, "epoch": 0.6617241879350348, "grad_norm": 0.026752693578600883, "grad_norm_var": 6.499249270648048e-07, "learning_rate": 0.0026628985022318307, "loss": 2.6087, "step": 18253 }, { "crossentropy": 2.491976261138916, "epoch": 0.6617604408352669, "grad_norm": 0.027061183005571365, "grad_norm_var": 6.521118676558816e-07, "learning_rate": 0.0026623848160071467, "loss": 2.5071, "step": 18254 }, { "crossentropy": 2.424078941345215, "epoch": 0.6617966937354989, "grad_norm": 0.027925122529268265, "grad_norm_var": 6.79375633978726e-07, "learning_rate": 0.002661871161355501, "loss": 2.4724, "step": 18255 }, { "crossentropy": 2.475292682647705, "epoch": 0.6618329466357309, "grad_norm": 0.026156499981880188, "grad_norm_var": 7.086979329257633e-07, "learning_rate": 0.0026613575382838364, "loss": 2.4069, "step": 18256 }, { "crossentropy": 2.588651180267334, "epoch": 0.6618691995359629, "grad_norm": 0.027020709589123726, "grad_norm_var": 6.757306505827115e-07, "learning_rate": 0.002660843946799084, "loss": 2.5686, "step": 18257 }, { "crossentropy": 2.6331071853637695, "epoch": 0.6619054524361949, "grad_norm": 0.02562137134373188, "grad_norm_var": 7.39656291246513e-07, "learning_rate": 0.002660330386908185, "loss": 2.6078, "step": 18258 }, { "crossentropy": 2.5212504863739014, "epoch": 0.6619417053364269, "grad_norm": 0.026347151026129723, "grad_norm_var": 7.521285645075651e-07, "learning_rate": 0.0026598168586180783, "loss": 2.5366, "step": 18259 }, { "crossentropy": 2.437986135482788, "epoch": 0.661977958236659, "grad_norm": 0.026005201041698456, "grad_norm_var": 6.870115859181157e-07, "learning_rate": 0.002659303361935692, "loss": 2.5274, "step": 18260 }, { "crossentropy": 2.594003677368164, "epoch": 0.662014211136891, "grad_norm": 0.025979015976190567, "grad_norm_var": 6.119770097407027e-07, "learning_rate": 0.0026587898968679657, "loss": 2.5617, "step": 18261 }, { "crossentropy": 2.458822727203369, "epoch": 0.662050464037123, "grad_norm": 0.025218047201633453, "grad_norm_var": 7.18969419857135e-07, "learning_rate": 0.0026582764634218376, "loss": 2.5272, "step": 18262 }, { "crossentropy": 2.4862465858459473, "epoch": 0.662086716937355, "grad_norm": 0.028564009815454483, "grad_norm_var": 9.302247281234646e-07, "learning_rate": 0.002657763061604237, "loss": 2.4749, "step": 18263 }, { "crossentropy": 2.571953058242798, "epoch": 0.662122969837587, "grad_norm": 0.02647075429558754, "grad_norm_var": 9.288597927051279e-07, "learning_rate": 0.0026572496914221036, "loss": 2.5443, "step": 18264 }, { "crossentropy": 2.7124550342559814, "epoch": 0.662159222737819, "grad_norm": 0.026555785909295082, "grad_norm_var": 9.284250821697352e-07, "learning_rate": 0.002656736352882365, "loss": 2.5639, "step": 18265 }, { "crossentropy": 2.5016167163848877, "epoch": 0.662195475638051, "grad_norm": 0.025352289900183678, "grad_norm_var": 9.872574136708984e-07, "learning_rate": 0.002656223045991959, "loss": 2.4422, "step": 18266 }, { "crossentropy": 2.5067453384399414, "epoch": 0.662231728538283, "grad_norm": 0.026328710839152336, "grad_norm_var": 7.588947280240717e-07, "learning_rate": 0.0026557097707578204, "loss": 2.4797, "step": 18267 }, { "crossentropy": 2.554764986038208, "epoch": 0.6622679814385151, "grad_norm": 0.02660353295505047, "grad_norm_var": 7.583467587323848e-07, "learning_rate": 0.002655196527186876, "loss": 2.5631, "step": 18268 }, { "crossentropy": 2.6336288452148438, "epoch": 0.6623042343387471, "grad_norm": 0.026531094685196877, "grad_norm_var": 7.538796453098716e-07, "learning_rate": 0.0026546833152860604, "loss": 2.5434, "step": 18269 }, { "crossentropy": 2.452892541885376, "epoch": 0.6623404872389791, "grad_norm": 0.025767451152205467, "grad_norm_var": 7.588879067066025e-07, "learning_rate": 0.002654170135062308, "loss": 2.545, "step": 18270 }, { "crossentropy": 2.6346733570098877, "epoch": 0.6623767401392111, "grad_norm": 0.026312517002224922, "grad_norm_var": 5.941243612490761e-07, "learning_rate": 0.002653656986522546, "loss": 2.6152, "step": 18271 }, { "crossentropy": 2.516191005706787, "epoch": 0.6624129930394431, "grad_norm": 0.025813831016421318, "grad_norm_var": 6.081171349319221e-07, "learning_rate": 0.0026531438696737086, "loss": 2.5352, "step": 18272 }, { "crossentropy": 2.5287258625030518, "epoch": 0.6624492459396751, "grad_norm": 0.029405249282717705, "grad_norm_var": 1.1987661739644758e-06, "learning_rate": 0.0026526307845227233, "loss": 2.621, "step": 18273 }, { "crossentropy": 2.6110377311706543, "epoch": 0.6624854988399071, "grad_norm": 0.0262820515781641, "grad_norm_var": 1.1548366081952915e-06, "learning_rate": 0.0026521177310765208, "loss": 2.6046, "step": 18274 }, { "crossentropy": 2.5251216888427734, "epoch": 0.6625217517401392, "grad_norm": 0.025988169014453888, "grad_norm_var": 1.1688208791412838e-06, "learning_rate": 0.0026516047093420357, "loss": 2.4929, "step": 18275 }, { "crossentropy": 2.4649269580841064, "epoch": 0.6625580046403712, "grad_norm": 0.026491686701774597, "grad_norm_var": 1.1548512303805972e-06, "learning_rate": 0.0026510917193261897, "loss": 2.536, "step": 18276 }, { "crossentropy": 2.4235689640045166, "epoch": 0.6625942575406032, "grad_norm": 0.027197320014238358, "grad_norm_var": 1.1663980985727555e-06, "learning_rate": 0.0026505787610359133, "loss": 2.4817, "step": 18277 }, { "crossentropy": 2.375300168991089, "epoch": 0.6626305104408353, "grad_norm": 0.026707325130701065, "grad_norm_var": 1.0395096125913212e-06, "learning_rate": 0.002650065834478139, "loss": 2.3876, "step": 18278 }, { "crossentropy": 2.5088412761688232, "epoch": 0.6626667633410673, "grad_norm": 0.02724335715174675, "grad_norm_var": 8.111744593063015e-07, "learning_rate": 0.0026495529396597895, "loss": 2.5399, "step": 18279 }, { "crossentropy": 2.477789878845215, "epoch": 0.6627030162412993, "grad_norm": 0.026863621547818184, "grad_norm_var": 8.158477674815075e-07, "learning_rate": 0.0026490400765877965, "loss": 2.5007, "step": 18280 }, { "crossentropy": 2.5659732818603516, "epoch": 0.6627392691415314, "grad_norm": 0.02640531398355961, "grad_norm_var": 8.179543227492299e-07, "learning_rate": 0.0026485272452690827, "loss": 2.5324, "step": 18281 }, { "crossentropy": 2.2605395317077637, "epoch": 0.6627755220417634, "grad_norm": 0.02840329147875309, "grad_norm_var": 8.999659643683557e-07, "learning_rate": 0.002648014445710577, "loss": 2.3582, "step": 18282 }, { "crossentropy": 2.502915859222412, "epoch": 0.6628117749419954, "grad_norm": 0.029576951637864113, "grad_norm_var": 1.3676221185613165e-06, "learning_rate": 0.0026475016779192073, "loss": 2.4922, "step": 18283 }, { "crossentropy": 2.5865321159362793, "epoch": 0.6628480278422274, "grad_norm": 0.025969676673412323, "grad_norm_var": 1.4240889897117076e-06, "learning_rate": 0.0026469889419018983, "loss": 2.4701, "step": 18284 }, { "crossentropy": 2.491840362548828, "epoch": 0.6628842807424594, "grad_norm": 0.02753579244017601, "grad_norm_var": 1.433079695643347e-06, "learning_rate": 0.0026464762376655717, "loss": 2.5351, "step": 18285 }, { "crossentropy": 2.5495123863220215, "epoch": 0.6629205336426914, "grad_norm": 0.028154408559203148, "grad_norm_var": 1.397629264258945e-06, "learning_rate": 0.0026459635652171577, "loss": 2.499, "step": 18286 }, { "crossentropy": 2.605527639389038, "epoch": 0.6629567865429234, "grad_norm": 0.029590819031000137, "grad_norm_var": 1.704614220814545e-06, "learning_rate": 0.0026454509245635756, "loss": 2.5785, "step": 18287 }, { "crossentropy": 2.5939018726348877, "epoch": 0.6629930394431555, "grad_norm": 0.025745263323187828, "grad_norm_var": 1.7189687683575224e-06, "learning_rate": 0.002644938315711754, "loss": 2.502, "step": 18288 }, { "crossentropy": 2.503059148788452, "epoch": 0.6630292923433875, "grad_norm": 0.026466110721230507, "grad_norm_var": 1.4524831549982941e-06, "learning_rate": 0.0026444257386686125, "loss": 2.4331, "step": 18289 }, { "crossentropy": 2.566676378250122, "epoch": 0.6630655452436195, "grad_norm": 0.027430091053247452, "grad_norm_var": 1.3998834526257e-06, "learning_rate": 0.0026439131934410764, "loss": 2.5487, "step": 18290 }, { "crossentropy": 2.3230557441711426, "epoch": 0.6631017981438515, "grad_norm": 0.025715507566928864, "grad_norm_var": 1.449879235248377e-06, "learning_rate": 0.0026434006800360697, "loss": 2.3455, "step": 18291 }, { "crossentropy": 2.6216988563537598, "epoch": 0.6631380510440835, "grad_norm": 0.026351671665906906, "grad_norm_var": 1.4646737636441471e-06, "learning_rate": 0.002642888198460513, "loss": 2.577, "step": 18292 }, { "crossentropy": 2.4142343997955322, "epoch": 0.6631743039443155, "grad_norm": 0.028221020475029945, "grad_norm_var": 1.528470364361989e-06, "learning_rate": 0.0026423757487213264, "loss": 2.4829, "step": 18293 }, { "crossentropy": 2.5303568840026855, "epoch": 0.6632105568445475, "grad_norm": 0.02720707654953003, "grad_norm_var": 1.506336021250311e-06, "learning_rate": 0.002641863330825435, "loss": 2.5611, "step": 18294 }, { "crossentropy": 2.4243416786193848, "epoch": 0.6632468097447796, "grad_norm": 0.027650795876979828, "grad_norm_var": 1.5133627468834343e-06, "learning_rate": 0.0026413509447797564, "loss": 2.4661, "step": 18295 }, { "crossentropy": 2.6613831520080566, "epoch": 0.6632830626450116, "grad_norm": 0.027216730639338493, "grad_norm_var": 1.4991761488271243e-06, "learning_rate": 0.002640838590591214, "loss": 2.5495, "step": 18296 }, { "crossentropy": 2.4784398078918457, "epoch": 0.6633193155452436, "grad_norm": 0.02756168134510517, "grad_norm_var": 1.4367058813056938e-06, "learning_rate": 0.002640326268266725, "loss": 2.4851, "step": 18297 }, { "crossentropy": 2.4539103507995605, "epoch": 0.6633555684454756, "grad_norm": 0.025944914668798447, "grad_norm_var": 1.4937003044096826e-06, "learning_rate": 0.0026398139778132106, "loss": 2.4412, "step": 18298 }, { "crossentropy": 2.627795696258545, "epoch": 0.6633918213457076, "grad_norm": 0.028885256499052048, "grad_norm_var": 1.3109486820858663e-06, "learning_rate": 0.0026393017192375924, "loss": 2.607, "step": 18299 }, { "crossentropy": 2.3887572288513184, "epoch": 0.6634280742459396, "grad_norm": 0.02541414462029934, "grad_norm_var": 1.423436889397495e-06, "learning_rate": 0.002638789492546785, "loss": 2.3579, "step": 18300 }, { "crossentropy": 2.4253973960876465, "epoch": 0.6634643271461717, "grad_norm": 0.026687445119023323, "grad_norm_var": 1.4296666636233353e-06, "learning_rate": 0.00263827729774771, "loss": 2.4898, "step": 18301 }, { "crossentropy": 2.5725836753845215, "epoch": 0.6635005800464037, "grad_norm": 0.026322949677705765, "grad_norm_var": 1.3916385929420611e-06, "learning_rate": 0.002637765134847284, "loss": 2.6234, "step": 18302 }, { "crossentropy": 2.4694836139678955, "epoch": 0.6635368329466357, "grad_norm": 0.02840825542807579, "grad_norm_var": 1.0745893625049475e-06, "learning_rate": 0.002637253003852424, "loss": 2.5095, "step": 18303 }, { "crossentropy": 2.5190529823303223, "epoch": 0.6635730858468677, "grad_norm": 0.026185771450400352, "grad_norm_var": 1.0158516743874093e-06, "learning_rate": 0.0026367409047700486, "loss": 2.5277, "step": 18304 }, { "crossentropy": 2.504246711730957, "epoch": 0.6636093387470998, "grad_norm": 0.026675399392843246, "grad_norm_var": 1.0042675694705086e-06, "learning_rate": 0.0026362288376070727, "loss": 2.5514, "step": 18305 }, { "crossentropy": 2.5035557746887207, "epoch": 0.6636455916473318, "grad_norm": 0.0260937437415123, "grad_norm_var": 1.037897417903444e-06, "learning_rate": 0.0026357168023704125, "loss": 2.5138, "step": 18306 }, { "crossentropy": 2.6691269874572754, "epoch": 0.6636818445475638, "grad_norm": 0.026229476556181908, "grad_norm_var": 9.726255996304282e-07, "learning_rate": 0.0026352047990669874, "loss": 2.5017, "step": 18307 }, { "crossentropy": 2.5269582271575928, "epoch": 0.6637180974477959, "grad_norm": 0.026508823037147522, "grad_norm_var": 9.618201964286017e-07, "learning_rate": 0.002634692827703708, "loss": 2.5436, "step": 18308 }, { "crossentropy": 2.5736210346221924, "epoch": 0.6637543503480279, "grad_norm": 0.026711147278547287, "grad_norm_var": 8.48594886582119e-07, "learning_rate": 0.0026341808882874933, "loss": 2.4723, "step": 18309 }, { "crossentropy": 2.462584972381592, "epoch": 0.6637906032482599, "grad_norm": 0.02640567347407341, "grad_norm_var": 8.512723085019973e-07, "learning_rate": 0.002633668980825256, "loss": 2.5168, "step": 18310 }, { "crossentropy": 2.501525402069092, "epoch": 0.6638268561484919, "grad_norm": 0.02650873363018036, "grad_norm_var": 8.042092711770903e-07, "learning_rate": 0.002633157105323909, "loss": 2.542, "step": 18311 }, { "crossentropy": 2.3745510578155518, "epoch": 0.6638631090487239, "grad_norm": 0.027166809886693954, "grad_norm_var": 8.011586397644315e-07, "learning_rate": 0.002632645261790368, "loss": 2.4567, "step": 18312 }, { "crossentropy": 2.477994680404663, "epoch": 0.6638993619489559, "grad_norm": 0.027942484244704247, "grad_norm_var": 8.523534562000866e-07, "learning_rate": 0.0026321334502315435, "loss": 2.5863, "step": 18313 }, { "crossentropy": 2.385590076446533, "epoch": 0.663935614849188, "grad_norm": 0.028285618871450424, "grad_norm_var": 9.41746656763738e-07, "learning_rate": 0.0026316216706543504, "loss": 2.4854, "step": 18314 }, { "crossentropy": 2.510580539703369, "epoch": 0.66397186774942, "grad_norm": 0.02547171525657177, "grad_norm_var": 7.673485002056164e-07, "learning_rate": 0.0026311099230657033, "loss": 2.4545, "step": 18315 }, { "crossentropy": 2.5849318504333496, "epoch": 0.664008120649652, "grad_norm": 0.02636853978037834, "grad_norm_var": 6.62095304299739e-07, "learning_rate": 0.002630598207472509, "loss": 2.5278, "step": 18316 }, { "crossentropy": 2.4237115383148193, "epoch": 0.664044373549884, "grad_norm": 0.026788590475916862, "grad_norm_var": 6.619141911104783e-07, "learning_rate": 0.002630086523881683, "loss": 2.4397, "step": 18317 }, { "crossentropy": 2.5306808948516846, "epoch": 0.664080626450116, "grad_norm": 0.0272071436047554, "grad_norm_var": 6.598873026318937e-07, "learning_rate": 0.0026295748723001357, "loss": 2.4809, "step": 18318 }, { "crossentropy": 2.4381465911865234, "epoch": 0.664116879350348, "grad_norm": 0.025930041447281837, "grad_norm_var": 5.15581813607946e-07, "learning_rate": 0.0026290632527347746, "loss": 2.4867, "step": 18319 }, { "crossentropy": 2.4924347400665283, "epoch": 0.66415313225058, "grad_norm": 0.025367598980665207, "grad_norm_var": 6.086057219334504e-07, "learning_rate": 0.0026285516651925146, "loss": 2.4792, "step": 18320 }, { "crossentropy": 2.5893826484680176, "epoch": 0.664189385150812, "grad_norm": 0.030258936807513237, "grad_norm_var": 1.445402955867897e-06, "learning_rate": 0.0026280401096802613, "loss": 2.4623, "step": 18321 }, { "crossentropy": 2.4904468059539795, "epoch": 0.6642256380510441, "grad_norm": 0.025678522884845734, "grad_norm_var": 1.4968188328775648e-06, "learning_rate": 0.002627528586204926, "loss": 2.5756, "step": 18322 }, { "crossentropy": 2.484586000442505, "epoch": 0.6642618909512761, "grad_norm": 0.026242243126034737, "grad_norm_var": 1.4958546927937647e-06, "learning_rate": 0.0026270170947734197, "loss": 2.4935, "step": 18323 }, { "crossentropy": 2.3922882080078125, "epoch": 0.6642981438515081, "grad_norm": 0.026672150939702988, "grad_norm_var": 1.4911229542599602e-06, "learning_rate": 0.0026265056353926455, "loss": 2.4661, "step": 18324 }, { "crossentropy": 2.6077957153320312, "epoch": 0.6643343967517401, "grad_norm": 0.026653051376342773, "grad_norm_var": 1.492121871420465e-06, "learning_rate": 0.0026259942080695178, "loss": 2.6646, "step": 18325 }, { "crossentropy": 2.599116802215576, "epoch": 0.6643706496519721, "grad_norm": 0.027280086651444435, "grad_norm_var": 1.4928579788721218e-06, "learning_rate": 0.00262548281281094, "loss": 2.4948, "step": 18326 }, { "crossentropy": 2.3092026710510254, "epoch": 0.6644069025522041, "grad_norm": 0.026449916884303093, "grad_norm_var": 1.4958594246050616e-06, "learning_rate": 0.0026249714496238185, "loss": 2.3971, "step": 18327 }, { "crossentropy": 2.594909429550171, "epoch": 0.6644431554524362, "grad_norm": 0.02780350111424923, "grad_norm_var": 1.5472228508375441e-06, "learning_rate": 0.0026244601185150644, "loss": 2.5846, "step": 18328 }, { "crossentropy": 2.7061986923217773, "epoch": 0.6644794083526682, "grad_norm": 0.026265976950526237, "grad_norm_var": 1.489861143110629e-06, "learning_rate": 0.0026239488194915793, "loss": 2.5591, "step": 18329 }, { "crossentropy": 2.586740732192993, "epoch": 0.6645156612529002, "grad_norm": 0.0276061799377203, "grad_norm_var": 1.3836961214684524e-06, "learning_rate": 0.002623437552560271, "loss": 2.552, "step": 18330 }, { "crossentropy": 2.622924566268921, "epoch": 0.6645519141531323, "grad_norm": 0.027623485773801804, "grad_norm_var": 1.3055425030951503e-06, "learning_rate": 0.0026229263177280472, "loss": 2.505, "step": 18331 }, { "crossentropy": 2.4398410320281982, "epoch": 0.6645881670533643, "grad_norm": 0.057669997215270996, "grad_norm_var": 6.037702707869818e-05, "learning_rate": 0.0026224151150018083, "loss": 2.3669, "step": 18332 }, { "crossentropy": 2.5797204971313477, "epoch": 0.6646244199535963, "grad_norm": 0.027292296290397644, "grad_norm_var": 6.025486925566037e-05, "learning_rate": 0.002621903944388464, "loss": 2.5836, "step": 18333 }, { "crossentropy": 2.3013863563537598, "epoch": 0.6646606728538283, "grad_norm": 0.025923507288098335, "grad_norm_var": 6.0643320118817707e-05, "learning_rate": 0.0026213928058949156, "loss": 2.485, "step": 18334 }, { "crossentropy": 2.443300485610962, "epoch": 0.6646969257540604, "grad_norm": 0.02704092673957348, "grad_norm_var": 6.0296120411601236e-05, "learning_rate": 0.002620881699528065, "loss": 2.4702, "step": 18335 }, { "crossentropy": 2.4693846702575684, "epoch": 0.6647331786542924, "grad_norm": 0.02819337137043476, "grad_norm_var": 5.9477741350111704e-05, "learning_rate": 0.0026203706252948193, "loss": 2.4747, "step": 18336 }, { "crossentropy": 2.5419788360595703, "epoch": 0.6647694315545244, "grad_norm": 0.027033686637878418, "grad_norm_var": 5.960407812717002e-05, "learning_rate": 0.0026198595832020778, "loss": 2.5951, "step": 18337 }, { "crossentropy": 2.503706693649292, "epoch": 0.6648056844547564, "grad_norm": 0.026935230940580368, "grad_norm_var": 5.917316105523393e-05, "learning_rate": 0.0026193485732567447, "loss": 2.5418, "step": 18338 }, { "crossentropy": 2.4831371307373047, "epoch": 0.6648419373549884, "grad_norm": 0.02645014598965645, "grad_norm_var": 5.9101693670139154e-05, "learning_rate": 0.0026188375954657238, "loss": 2.5369, "step": 18339 }, { "crossentropy": 2.60840106010437, "epoch": 0.6648781902552204, "grad_norm": 0.02689531445503235, "grad_norm_var": 5.903759855945175e-05, "learning_rate": 0.0026183266498359128, "loss": 2.6318, "step": 18340 }, { "crossentropy": 2.673025369644165, "epoch": 0.6649144431554525, "grad_norm": 0.03199324384331703, "grad_norm_var": 5.918817373729007e-05, "learning_rate": 0.0026178157363742163, "loss": 2.6207, "step": 18341 }, { "crossentropy": 2.4237983226776123, "epoch": 0.6649506960556845, "grad_norm": 0.026120996102690697, "grad_norm_var": 5.958099590674485e-05, "learning_rate": 0.0026173048550875324, "loss": 2.4672, "step": 18342 }, { "crossentropy": 2.457287311553955, "epoch": 0.6649869489559165, "grad_norm": 0.026723355054855347, "grad_norm_var": 5.948518244009575e-05, "learning_rate": 0.0026167940059827633, "loss": 2.5124, "step": 18343 }, { "crossentropy": 2.5397655963897705, "epoch": 0.6650232018561485, "grad_norm": 0.026171838864684105, "grad_norm_var": 5.9960440303351974e-05, "learning_rate": 0.0026162831890668074, "loss": 2.4506, "step": 18344 }, { "crossentropy": 2.606673002243042, "epoch": 0.6650594547563805, "grad_norm": 0.02705085277557373, "grad_norm_var": 5.970014046411254e-05, "learning_rate": 0.0026157724043465638, "loss": 2.5646, "step": 18345 }, { "crossentropy": 2.449240207672119, "epoch": 0.6650957076566125, "grad_norm": 0.02623211219906807, "grad_norm_var": 6.010470102581557e-05, "learning_rate": 0.002615261651828931, "loss": 2.4866, "step": 18346 }, { "crossentropy": 2.499993324279785, "epoch": 0.6651319605568445, "grad_norm": 0.029017608612775803, "grad_norm_var": 5.9954615943118974e-05, "learning_rate": 0.0026147509315208105, "loss": 2.4501, "step": 18347 }, { "crossentropy": 2.570004463195801, "epoch": 0.6651682134570766, "grad_norm": 0.026557303965091705, "grad_norm_var": 2.2327213494852665e-06, "learning_rate": 0.0026142402434290975, "loss": 2.5502, "step": 18348 }, { "crossentropy": 2.4845025539398193, "epoch": 0.6652044663573086, "grad_norm": 0.02518988400697708, "grad_norm_var": 2.490672315082889e-06, "learning_rate": 0.0026137295875606915, "loss": 2.5001, "step": 18349 }, { "crossentropy": 2.572646379470825, "epoch": 0.6652407192575406, "grad_norm": 0.029428333044052124, "grad_norm_var": 2.710685734679437e-06, "learning_rate": 0.0026132189639224875, "loss": 2.5214, "step": 18350 }, { "crossentropy": 2.54850697517395, "epoch": 0.6652769721577726, "grad_norm": 0.026047559455037117, "grad_norm_var": 2.808612141545705e-06, "learning_rate": 0.0026127083725213853, "loss": 2.5109, "step": 18351 }, { "crossentropy": 2.5760066509246826, "epoch": 0.6653132250580046, "grad_norm": 0.02650238759815693, "grad_norm_var": 2.775204977122023e-06, "learning_rate": 0.002612197813364279, "loss": 2.4858, "step": 18352 }, { "crossentropy": 2.598208427429199, "epoch": 0.6653494779582366, "grad_norm": 0.02910463511943817, "grad_norm_var": 3.012004928228523e-06, "learning_rate": 0.0026116872864580627, "loss": 2.55, "step": 18353 }, { "crossentropy": 2.3304481506347656, "epoch": 0.6653857308584686, "grad_norm": 0.026662709191441536, "grad_norm_var": 3.0290398542031355e-06, "learning_rate": 0.002611176791809634, "loss": 2.4586, "step": 18354 }, { "crossentropy": 2.6869964599609375, "epoch": 0.6654219837587007, "grad_norm": 0.028542501851916313, "grad_norm_var": 3.0769325638612654e-06, "learning_rate": 0.0026106663294258904, "loss": 2.6342, "step": 18355 }, { "crossentropy": 2.5026092529296875, "epoch": 0.6654582366589327, "grad_norm": 0.030145704746246338, "grad_norm_var": 3.522840669314906e-06, "learning_rate": 0.002610155899313722, "loss": 2.4874, "step": 18356 }, { "crossentropy": 2.412785768508911, "epoch": 0.6654944895591647, "grad_norm": 0.02783881314098835, "grad_norm_var": 2.1642502661985924e-06, "learning_rate": 0.002609645501480026, "loss": 2.4312, "step": 18357 }, { "crossentropy": 2.3950672149658203, "epoch": 0.6655307424593968, "grad_norm": 0.027609776705503464, "grad_norm_var": 2.0620851344042844e-06, "learning_rate": 0.002609135135931694, "loss": 2.45, "step": 18358 }, { "crossentropy": 2.682398796081543, "epoch": 0.6655669953596288, "grad_norm": 0.028290096670389175, "grad_norm_var": 2.0685984274089347e-06, "learning_rate": 0.002608624802675622, "loss": 2.6202, "step": 18359 }, { "crossentropy": 2.493805170059204, "epoch": 0.6656032482598608, "grad_norm": 0.02694731019437313, "grad_norm_var": 1.9663224110875894e-06, "learning_rate": 0.002608114501718701, "loss": 2.5963, "step": 18360 }, { "crossentropy": 2.5267422199249268, "epoch": 0.6656395011600929, "grad_norm": 0.02724332921206951, "grad_norm_var": 1.9552383802600893e-06, "learning_rate": 0.002607604233067821, "loss": 2.5182, "step": 18361 }, { "crossentropy": 2.459069013595581, "epoch": 0.6656757540603249, "grad_norm": 0.026921918615698814, "grad_norm_var": 1.8605468001479825e-06, "learning_rate": 0.0026070939967298766, "loss": 2.5107, "step": 18362 }, { "crossentropy": 2.549015522003174, "epoch": 0.6657120069605569, "grad_norm": 0.027082834392786026, "grad_norm_var": 1.7360592505538096e-06, "learning_rate": 0.002606583792711762, "loss": 2.5001, "step": 18363 }, { "crossentropy": 2.5015175342559814, "epoch": 0.6657482598607889, "grad_norm": 0.02646101452410221, "grad_norm_var": 1.7488339749235076e-06, "learning_rate": 0.0026060736210203627, "loss": 2.5375, "step": 18364 }, { "crossentropy": 2.4139606952667236, "epoch": 0.6657845127610209, "grad_norm": 0.027603721246123314, "grad_norm_var": 1.369119573869868e-06, "learning_rate": 0.002605563481662574, "loss": 2.4245, "step": 18365 }, { "crossentropy": 2.525359869003296, "epoch": 0.6658207656612529, "grad_norm": 0.02909865602850914, "grad_norm_var": 1.2978321243644249e-06, "learning_rate": 0.0026050533746452826, "loss": 2.5651, "step": 18366 }, { "crossentropy": 2.359856128692627, "epoch": 0.6658570185614849, "grad_norm": 0.02804732508957386, "grad_norm_var": 1.1254560736755533e-06, "learning_rate": 0.002604543299975379, "loss": 2.5262, "step": 18367 }, { "crossentropy": 2.3533835411071777, "epoch": 0.665893271461717, "grad_norm": 0.02564234845340252, "grad_norm_var": 1.3154876450995866e-06, "learning_rate": 0.002604033257659758, "loss": 2.4445, "step": 18368 }, { "crossentropy": 2.5575850009918213, "epoch": 0.665929524361949, "grad_norm": 0.02726892940700054, "grad_norm_var": 1.1829546814187773e-06, "learning_rate": 0.0026035232477053007, "loss": 2.5589, "step": 18369 }, { "crossentropy": 2.4670093059539795, "epoch": 0.665965777262181, "grad_norm": 0.025907330214977264, "grad_norm_var": 1.3118033542106056e-06, "learning_rate": 0.0026030132701188972, "loss": 2.5911, "step": 18370 }, { "crossentropy": 2.691164493560791, "epoch": 0.666002030162413, "grad_norm": 0.02605251595377922, "grad_norm_var": 1.3667174015135104e-06, "learning_rate": 0.00260250332490744, "loss": 2.6335, "step": 18371 }, { "crossentropy": 2.2456142902374268, "epoch": 0.666038283062645, "grad_norm": 0.026408882811665535, "grad_norm_var": 8.640062706311939e-07, "learning_rate": 0.002601993412077811, "loss": 2.3858, "step": 18372 }, { "crossentropy": 2.5565030574798584, "epoch": 0.666074535962877, "grad_norm": 0.02720421925187111, "grad_norm_var": 8.310245571946691e-07, "learning_rate": 0.0026014835316369024, "loss": 2.5279, "step": 18373 }, { "crossentropy": 2.5679032802581787, "epoch": 0.666110788863109, "grad_norm": 0.026331959292292595, "grad_norm_var": 8.482475348930406e-07, "learning_rate": 0.002600973683591597, "loss": 2.5244, "step": 18374 }, { "crossentropy": 2.5477614402770996, "epoch": 0.666147041763341, "grad_norm": 0.027733536437153816, "grad_norm_var": 7.742484290006497e-07, "learning_rate": 0.0026004638679487826, "loss": 2.473, "step": 18375 }, { "crossentropy": 2.4405596256256104, "epoch": 0.6661832946635731, "grad_norm": 0.026482045650482178, "grad_norm_var": 7.908752471784293e-07, "learning_rate": 0.0025999540847153487, "loss": 2.4909, "step": 18376 }, { "crossentropy": 2.557041883468628, "epoch": 0.6662195475638051, "grad_norm": 0.02705366164445877, "grad_norm_var": 7.861648625339847e-07, "learning_rate": 0.0025994443338981732, "loss": 2.465, "step": 18377 }, { "crossentropy": 2.4043266773223877, "epoch": 0.6662558004640371, "grad_norm": 0.02610466070473194, "grad_norm_var": 8.31656403789159e-07, "learning_rate": 0.002598934615504145, "loss": 2.4758, "step": 18378 }, { "crossentropy": 2.4873719215393066, "epoch": 0.6662920533642691, "grad_norm": 0.02703157439827919, "grad_norm_var": 8.306067446702762e-07, "learning_rate": 0.0025984249295401504, "loss": 2.4949, "step": 18379 }, { "crossentropy": 2.4259839057922363, "epoch": 0.6663283062645011, "grad_norm": 0.026987366378307343, "grad_norm_var": 8.169719954570498e-07, "learning_rate": 0.0025979152760130697, "loss": 2.4818, "step": 18380 }, { "crossentropy": 2.5594000816345215, "epoch": 0.6663645591647331, "grad_norm": 0.027057167142629623, "grad_norm_var": 7.869039991842495e-07, "learning_rate": 0.00259740565492979, "loss": 2.5784, "step": 18381 }, { "crossentropy": 2.6092021465301514, "epoch": 0.6664008120649652, "grad_norm": 0.02669544331729412, "grad_norm_var": 4.436006121782502e-07, "learning_rate": 0.0025968960662971917, "loss": 2.4999, "step": 18382 }, { "crossentropy": 2.5289454460144043, "epoch": 0.6664370649651972, "grad_norm": 0.027194809168577194, "grad_norm_var": 3.416229001229013e-07, "learning_rate": 0.002596386510122159, "loss": 2.5844, "step": 18383 }, { "crossentropy": 2.5099220275878906, "epoch": 0.6664733178654292, "grad_norm": 0.02967599220573902, "grad_norm_var": 7.911541153444162e-07, "learning_rate": 0.002595876986411576, "loss": 2.5125, "step": 18384 }, { "crossentropy": 2.3278660774230957, "epoch": 0.6665095707656613, "grad_norm": 0.02683015912771225, "grad_norm_var": 7.844921219917385e-07, "learning_rate": 0.0025953674951723223, "loss": 2.4491, "step": 18385 }, { "crossentropy": 2.4470479488372803, "epoch": 0.6665458236658933, "grad_norm": 0.02757885493338108, "grad_norm_var": 7.329868058007638e-07, "learning_rate": 0.002594858036411278, "loss": 2.5255, "step": 18386 }, { "crossentropy": 2.468771457672119, "epoch": 0.6665820765661253, "grad_norm": 0.02617848478257656, "grad_norm_var": 7.176208898340118e-07, "learning_rate": 0.0025943486101353275, "loss": 2.5388, "step": 18387 }, { "crossentropy": 2.5732390880584717, "epoch": 0.6666183294663574, "grad_norm": 0.02649041824042797, "grad_norm_var": 7.11237225268363e-07, "learning_rate": 0.002593839216351348, "loss": 2.5448, "step": 18388 }, { "crossentropy": 2.4314513206481934, "epoch": 0.6666545823665894, "grad_norm": 0.027475617825984955, "grad_norm_var": 7.218051344186234e-07, "learning_rate": 0.0025933298550662232, "loss": 2.5217, "step": 18389 }, { "crossentropy": 2.649808883666992, "epoch": 0.6666908352668214, "grad_norm": 0.02737068198621273, "grad_norm_var": 6.889124017259509e-07, "learning_rate": 0.0025928205262868292, "loss": 2.637, "step": 18390 }, { "crossentropy": 2.567878246307373, "epoch": 0.6667270881670534, "grad_norm": 0.026565179228782654, "grad_norm_var": 6.788506069300375e-07, "learning_rate": 0.0025923112300200468, "loss": 2.531, "step": 18391 }, { "crossentropy": 2.4908218383789062, "epoch": 0.6667633410672854, "grad_norm": 0.026250874623656273, "grad_norm_var": 6.99642839989643e-07, "learning_rate": 0.0025918019662727575, "loss": 2.5416, "step": 18392 }, { "crossentropy": 2.5859293937683105, "epoch": 0.6667995939675174, "grad_norm": 0.02633173018693924, "grad_norm_var": 7.30305948131125e-07, "learning_rate": 0.002591292735051837, "loss": 2.5216, "step": 18393 }, { "crossentropy": 2.420433759689331, "epoch": 0.6668358468677494, "grad_norm": 0.02648155950009823, "grad_norm_var": 6.94759046014568e-07, "learning_rate": 0.002590783536364161, "loss": 2.5023, "step": 18394 }, { "crossentropy": 2.442108392715454, "epoch": 0.6668720997679815, "grad_norm": 0.025938214734196663, "grad_norm_var": 7.66655827762067e-07, "learning_rate": 0.002590274370216612, "loss": 2.4232, "step": 18395 }, { "crossentropy": 2.6568968296051025, "epoch": 0.6669083526682135, "grad_norm": 0.027709251269698143, "grad_norm_var": 8.034084635716773e-07, "learning_rate": 0.002589765236616062, "loss": 2.5905, "step": 18396 }, { "crossentropy": 2.4143600463867188, "epoch": 0.6669446055684455, "grad_norm": 0.028385356068611145, "grad_norm_var": 9.257308224550412e-07, "learning_rate": 0.002589256135569393, "loss": 2.4353, "step": 18397 }, { "crossentropy": 2.668344736099243, "epoch": 0.6669808584686775, "grad_norm": 0.0294449795037508, "grad_norm_var": 1.2601657823745522e-06, "learning_rate": 0.0025887470670834757, "loss": 2.5906, "step": 18398 }, { "crossentropy": 2.4518320560455322, "epoch": 0.6670171113689095, "grad_norm": 0.025057384744286537, "grad_norm_var": 1.5596884082188921e-06, "learning_rate": 0.0025882380311651886, "loss": 2.4871, "step": 18399 }, { "crossentropy": 2.4736080169677734, "epoch": 0.6670533642691415, "grad_norm": 0.025848153978586197, "grad_norm_var": 1.165984093954205e-06, "learning_rate": 0.002587729027821409, "loss": 2.4856, "step": 18400 }, { "crossentropy": 2.608478307723999, "epoch": 0.6670896171693735, "grad_norm": 0.026793913915753365, "grad_norm_var": 1.1662638447034925e-06, "learning_rate": 0.002587220057059007, "loss": 2.5598, "step": 18401 }, { "crossentropy": 2.4936258792877197, "epoch": 0.6671258700696056, "grad_norm": 0.027032364159822464, "grad_norm_var": 1.1331904844493734e-06, "learning_rate": 0.0025867111188848622, "loss": 2.528, "step": 18402 }, { "crossentropy": 2.4618709087371826, "epoch": 0.6671621229698376, "grad_norm": 0.02739589475095272, "grad_norm_var": 1.1193137003178677e-06, "learning_rate": 0.002586202213305846, "loss": 2.4706, "step": 18403 }, { "crossentropy": 2.4191179275512695, "epoch": 0.6671983758700696, "grad_norm": 0.026061715558171272, "grad_norm_var": 1.154825119050601e-06, "learning_rate": 0.00258569334032883, "loss": 2.4755, "step": 18404 }, { "crossentropy": 2.6339378356933594, "epoch": 0.6672346287703016, "grad_norm": 0.027065880596637726, "grad_norm_var": 1.1329930084479079e-06, "learning_rate": 0.0025851844999606887, "loss": 2.6633, "step": 18405 }, { "crossentropy": 2.555335760116577, "epoch": 0.6672708816705336, "grad_norm": 0.02679535374045372, "grad_norm_var": 1.1143772316363991e-06, "learning_rate": 0.0025846756922082983, "loss": 2.5657, "step": 18406 }, { "crossentropy": 2.505410671234131, "epoch": 0.6673071345707656, "grad_norm": 0.02646239660680294, "grad_norm_var": 1.118562033426175e-06, "learning_rate": 0.0025841669170785253, "loss": 2.4567, "step": 18407 }, { "crossentropy": 2.5100064277648926, "epoch": 0.6673433874709976, "grad_norm": 0.027796944603323936, "grad_norm_var": 1.1514739322504922e-06, "learning_rate": 0.0025836581745782472, "loss": 2.4539, "step": 18408 }, { "crossentropy": 2.5271615982055664, "epoch": 0.6673796403712297, "grad_norm": 0.0269011203199625, "grad_norm_var": 1.127640281774533e-06, "learning_rate": 0.0025831494647143294, "loss": 2.4725, "step": 18409 }, { "crossentropy": 2.5269575119018555, "epoch": 0.6674158932714617, "grad_norm": 0.028012527152895927, "grad_norm_var": 1.1788859199043789e-06, "learning_rate": 0.002582640787493648, "loss": 2.5302, "step": 18410 }, { "crossentropy": 2.55533504486084, "epoch": 0.6674521461716937, "grad_norm": 0.026022905483841896, "grad_norm_var": 1.1668493633248117e-06, "learning_rate": 0.0025821321429230714, "loss": 2.5753, "step": 18411 }, { "crossentropy": 2.5454771518707275, "epoch": 0.6674883990719258, "grad_norm": 0.027032535523176193, "grad_norm_var": 1.1359093004092413e-06, "learning_rate": 0.002581623531009467, "loss": 2.5722, "step": 18412 }, { "crossentropy": 2.50537371635437, "epoch": 0.6675246519721578, "grad_norm": 0.028145402669906616, "grad_norm_var": 1.0954039271477214e-06, "learning_rate": 0.0025811149517597077, "loss": 2.5426, "step": 18413 }, { "crossentropy": 2.5908894538879395, "epoch": 0.6675609048723898, "grad_norm": 0.02673587016761303, "grad_norm_var": 6.679995153156815e-07, "learning_rate": 0.0025806064051806634, "loss": 2.5368, "step": 18414 }, { "crossentropy": 2.4826347827911377, "epoch": 0.6675971577726219, "grad_norm": 0.026295697316527367, "grad_norm_var": 4.72399131034619e-07, "learning_rate": 0.002580097891279199, "loss": 2.4408, "step": 18415 }, { "crossentropy": 2.497457265853882, "epoch": 0.6676334106728539, "grad_norm": 0.025882666930556297, "grad_norm_var": 4.676336499995302e-07, "learning_rate": 0.002579589410062187, "loss": 2.5011, "step": 18416 }, { "crossentropy": 2.5930662155151367, "epoch": 0.6676696635730859, "grad_norm": 0.026532946154475212, "grad_norm_var": 4.756536787354441e-07, "learning_rate": 0.0025790809615364907, "loss": 2.522, "step": 18417 }, { "crossentropy": 2.467833995819092, "epoch": 0.6677059164733179, "grad_norm": 0.026917051523923874, "grad_norm_var": 4.7423075919526304e-07, "learning_rate": 0.0025785725457089827, "loss": 2.5392, "step": 18418 }, { "crossentropy": 2.525973320007324, "epoch": 0.6677421693735499, "grad_norm": 0.027233319357037544, "grad_norm_var": 4.6466849253135915e-07, "learning_rate": 0.0025780641625865264, "loss": 2.6156, "step": 18419 }, { "crossentropy": 2.3928518295288086, "epoch": 0.6677784222737819, "grad_norm": 0.02752070315182209, "grad_norm_var": 4.4078391944328033e-07, "learning_rate": 0.0025775558121759875, "loss": 2.4866, "step": 18420 }, { "crossentropy": 2.5733399391174316, "epoch": 0.6678146751740139, "grad_norm": 0.026129717007279396, "grad_norm_var": 4.822907501790541e-07, "learning_rate": 0.002577047494484234, "loss": 2.5336, "step": 18421 }, { "crossentropy": 2.4220128059387207, "epoch": 0.667850928074246, "grad_norm": 0.02647048607468605, "grad_norm_var": 4.934662123802965e-07, "learning_rate": 0.0025765392095181317, "loss": 2.4524, "step": 18422 }, { "crossentropy": 2.463183641433716, "epoch": 0.667887180974478, "grad_norm": 0.02619122713804245, "grad_norm_var": 5.131886285064524e-07, "learning_rate": 0.002576030957284544, "loss": 2.438, "step": 18423 }, { "crossentropy": 2.1612372398376465, "epoch": 0.66792343387471, "grad_norm": 0.02576526813209057, "grad_norm_var": 5.183961616376699e-07, "learning_rate": 0.0025755227377903386, "loss": 2.3784, "step": 18424 }, { "crossentropy": 2.3701624870300293, "epoch": 0.667959686774942, "grad_norm": 0.027916746214032173, "grad_norm_var": 6.051109371106907e-07, "learning_rate": 0.0025750145510423754, "loss": 2.5145, "step": 18425 }, { "crossentropy": 2.6447384357452393, "epoch": 0.667995939675174, "grad_norm": 0.02588128112256527, "grad_norm_var": 5.445299273271284e-07, "learning_rate": 0.0025745063970475214, "loss": 2.5102, "step": 18426 }, { "crossentropy": 2.575590133666992, "epoch": 0.668032192575406, "grad_norm": 0.028838416561484337, "grad_norm_var": 7.981370138843696e-07, "learning_rate": 0.0025739982758126428, "loss": 2.604, "step": 18427 }, { "crossentropy": 2.3824210166931152, "epoch": 0.668068445475638, "grad_norm": 0.027757979929447174, "grad_norm_var": 8.493537902142286e-07, "learning_rate": 0.002573490187344596, "loss": 2.3876, "step": 18428 }, { "crossentropy": 2.496154308319092, "epoch": 0.66810469837587, "grad_norm": 0.027321042492985725, "grad_norm_var": 7.536664441369518e-07, "learning_rate": 0.0025729821316502455, "loss": 2.4698, "step": 18429 }, { "crossentropy": 2.592975616455078, "epoch": 0.6681409512761021, "grad_norm": 0.026707828044891357, "grad_norm_var": 7.540933416426508e-07, "learning_rate": 0.002572474108736457, "loss": 2.5021, "step": 18430 }, { "crossentropy": 2.4890503883361816, "epoch": 0.6681772041763341, "grad_norm": 0.026139769703149796, "grad_norm_var": 7.668283100040543e-07, "learning_rate": 0.002571966118610087, "loss": 2.4324, "step": 18431 }, { "crossentropy": 2.473188877105713, "epoch": 0.6682134570765661, "grad_norm": 0.025524809956550598, "grad_norm_var": 8.198141224035563e-07, "learning_rate": 0.002571458161278002, "loss": 2.4795, "step": 18432 }, { "crossentropy": 2.56070876121521, "epoch": 0.6682497099767981, "grad_norm": 0.026495428755879402, "grad_norm_var": 8.212531755676881e-07, "learning_rate": 0.002570950236747058, "loss": 2.5507, "step": 18433 }, { "crossentropy": 2.231776475906372, "epoch": 0.6682859628770301, "grad_norm": 0.02627641335129738, "grad_norm_var": 8.369650246967207e-07, "learning_rate": 0.0025704423450241165, "loss": 2.4444, "step": 18434 }, { "crossentropy": 2.5565600395202637, "epoch": 0.6683222157772621, "grad_norm": 0.02758100815117359, "grad_norm_var": 8.664326312800372e-07, "learning_rate": 0.0025699344861160422, "loss": 2.5582, "step": 18435 }, { "crossentropy": 2.47424578666687, "epoch": 0.6683584686774942, "grad_norm": 0.026183968409895897, "grad_norm_var": 8.465195825940134e-07, "learning_rate": 0.0025694266600296872, "loss": 2.4989, "step": 18436 }, { "crossentropy": 2.5574023723602295, "epoch": 0.6683947215777262, "grad_norm": 0.026275726035237312, "grad_norm_var": 8.367724442636387e-07, "learning_rate": 0.0025689188667719133, "loss": 2.5599, "step": 18437 }, { "crossentropy": 2.4896132946014404, "epoch": 0.6684309744779582, "grad_norm": 0.02659892477095127, "grad_norm_var": 8.337366536922385e-07, "learning_rate": 0.002568411106349581, "loss": 2.473, "step": 18438 }, { "crossentropy": 2.5929036140441895, "epoch": 0.6684672273781903, "grad_norm": 0.027195345610380173, "grad_norm_var": 8.264959887043418e-07, "learning_rate": 0.0025679033787695455, "loss": 2.6315, "step": 18439 }, { "crossentropy": 2.5567421913146973, "epoch": 0.6685034802784223, "grad_norm": 0.027986476197838783, "grad_norm_var": 8.347032296846315e-07, "learning_rate": 0.0025673956840386674, "loss": 2.5933, "step": 18440 }, { "crossentropy": 2.5011813640594482, "epoch": 0.6685397331786543, "grad_norm": 0.02886279672384262, "grad_norm_var": 1.0166772509906703e-06, "learning_rate": 0.0025668880221638, "loss": 2.5558, "step": 18441 }, { "crossentropy": 2.48241925239563, "epoch": 0.6685759860788864, "grad_norm": 0.02859773114323616, "grad_norm_var": 1.0811172699687344e-06, "learning_rate": 0.0025663803931518024, "loss": 2.5172, "step": 18442 }, { "crossentropy": 2.4402945041656494, "epoch": 0.6686122389791184, "grad_norm": 0.025981176644563675, "grad_norm_var": 9.467865218176167e-07, "learning_rate": 0.002565872797009533, "loss": 2.5333, "step": 18443 }, { "crossentropy": 2.447441577911377, "epoch": 0.6686484918793504, "grad_norm": 0.02751239389181137, "grad_norm_var": 9.246850929960401e-07, "learning_rate": 0.002565365233743845, "loss": 2.467, "step": 18444 }, { "crossentropy": 2.550387382507324, "epoch": 0.6686847447795824, "grad_norm": 0.026608821004629135, "grad_norm_var": 9.213959423356548e-07, "learning_rate": 0.0025648577033615923, "loss": 2.5272, "step": 18445 }, { "crossentropy": 2.322190761566162, "epoch": 0.6687209976798144, "grad_norm": 0.026299526914954185, "grad_norm_var": 9.4271480111951e-07, "learning_rate": 0.002564350205869634, "loss": 2.4128, "step": 18446 }, { "crossentropy": 2.55001163482666, "epoch": 0.6687572505800464, "grad_norm": 0.035060249269008636, "grad_norm_var": 5.032724555297874e-06, "learning_rate": 0.00256384274127482, "loss": 2.5356, "step": 18447 }, { "crossentropy": 2.5873799324035645, "epoch": 0.6687935034802784, "grad_norm": 0.027146220207214355, "grad_norm_var": 4.782983319649006e-06, "learning_rate": 0.002563335309584009, "loss": 2.5635, "step": 18448 }, { "crossentropy": 2.6519622802734375, "epoch": 0.6688297563805105, "grad_norm": 0.026162181049585342, "grad_norm_var": 4.836399330072847e-06, "learning_rate": 0.0025628279108040504, "loss": 2.6013, "step": 18449 }, { "crossentropy": 2.5437605381011963, "epoch": 0.6688660092807425, "grad_norm": 0.027771499007940292, "grad_norm_var": 4.728090299997653e-06, "learning_rate": 0.0025623205449417995, "loss": 2.5267, "step": 18450 }, { "crossentropy": 2.464949369430542, "epoch": 0.6689022621809745, "grad_norm": 0.026129143312573433, "grad_norm_var": 4.866221941554558e-06, "learning_rate": 0.0025618132120041105, "loss": 2.4109, "step": 18451 }, { "crossentropy": 2.4782655239105225, "epoch": 0.6689385150812065, "grad_norm": 0.02673584222793579, "grad_norm_var": 4.786707814684885e-06, "learning_rate": 0.0025613059119978343, "loss": 2.4899, "step": 18452 }, { "crossentropy": 2.4535253047943115, "epoch": 0.6689747679814385, "grad_norm": 0.02643338032066822, "grad_norm_var": 4.761312297056258e-06, "learning_rate": 0.0025607986449298213, "loss": 2.4736, "step": 18453 }, { "crossentropy": 2.5377161502838135, "epoch": 0.6690110208816705, "grad_norm": 0.02677825465798378, "grad_norm_var": 4.740160429819728e-06, "learning_rate": 0.0025602914108069255, "loss": 2.5148, "step": 18454 }, { "crossentropy": 2.6723999977111816, "epoch": 0.6690472737819025, "grad_norm": 0.09881794452667236, "grad_norm_var": 0.00032169044382860756, "learning_rate": 0.0025597842096359946, "loss": 2.6121, "step": 18455 }, { "crossentropy": 2.3839006423950195, "epoch": 0.6690835266821346, "grad_norm": 0.033320102840662, "grad_norm_var": 0.00032057492390053394, "learning_rate": 0.0025592770414238834, "loss": 2.3709, "step": 18456 }, { "crossentropy": 2.496433734893799, "epoch": 0.6691197795823666, "grad_norm": 0.02806866355240345, "grad_norm_var": 0.0003209876647886446, "learning_rate": 0.0025587699061774384, "loss": 2.517, "step": 18457 }, { "crossentropy": 2.540496587753296, "epoch": 0.6691560324825986, "grad_norm": 0.026929566636681557, "grad_norm_var": 0.00032199371625922025, "learning_rate": 0.0025582628039035092, "loss": 2.4505, "step": 18458 }, { "crossentropy": 2.549743413925171, "epoch": 0.6691922853828306, "grad_norm": 0.02692599967122078, "grad_norm_var": 0.0003212617148787319, "learning_rate": 0.0025577557346089484, "loss": 2.5858, "step": 18459 }, { "crossentropy": 2.612567186355591, "epoch": 0.6692285382830626, "grad_norm": 0.028828192502260208, "grad_norm_var": 0.00032053108483552786, "learning_rate": 0.002557248698300604, "loss": 2.5333, "step": 18460 }, { "crossentropy": 2.5833561420440674, "epoch": 0.6692647911832946, "grad_norm": 0.027886157855391502, "grad_norm_var": 0.000319650846196183, "learning_rate": 0.0025567416949853197, "loss": 2.6062, "step": 18461 }, { "crossentropy": 2.5061211585998535, "epoch": 0.6693010440835266, "grad_norm": 0.026369750499725342, "grad_norm_var": 0.0003195935122574711, "learning_rate": 0.002556234724669949, "loss": 2.5423, "step": 18462 }, { "crossentropy": 2.5523359775543213, "epoch": 0.6693372969837587, "grad_norm": 0.02720642276108265, "grad_norm_var": 0.00032072595933976383, "learning_rate": 0.002555727787361335, "loss": 2.5404, "step": 18463 }, { "crossentropy": 2.699085235595703, "epoch": 0.6693735498839907, "grad_norm": 0.026623444631695747, "grad_norm_var": 0.00032107922761471586, "learning_rate": 0.0025552208830663284, "loss": 2.5855, "step": 18464 }, { "crossentropy": 2.440080404281616, "epoch": 0.6694098027842227, "grad_norm": 0.02789796143770218, "grad_norm_var": 0.00031993110585517475, "learning_rate": 0.002554714011791771, "loss": 2.4393, "step": 18465 }, { "crossentropy": 2.3749642372131348, "epoch": 0.6694460556844548, "grad_norm": 0.02645864337682724, "grad_norm_var": 0.0003207869209890601, "learning_rate": 0.0025542071735445127, "loss": 2.4397, "step": 18466 }, { "crossentropy": 2.564422607421875, "epoch": 0.6694823085846868, "grad_norm": 0.02868431992828846, "grad_norm_var": 0.0003192074097503417, "learning_rate": 0.0025537003683313996, "loss": 2.5188, "step": 18467 }, { "crossentropy": 2.3663792610168457, "epoch": 0.6695185614849188, "grad_norm": 0.027602659538388252, "grad_norm_var": 0.00031863177051838726, "learning_rate": 0.0025531935961592743, "loss": 2.5521, "step": 18468 }, { "crossentropy": 2.489203453063965, "epoch": 0.6695548143851509, "grad_norm": 0.031194303184747696, "grad_norm_var": 0.00031640245021141293, "learning_rate": 0.0025526868570349835, "loss": 2.5019, "step": 18469 }, { "crossentropy": 2.5424413681030273, "epoch": 0.6695910672853829, "grad_norm": 0.028132924810051918, "grad_norm_var": 0.00031548827039795706, "learning_rate": 0.002552180150965372, "loss": 2.5442, "step": 18470 }, { "crossentropy": 2.4196925163269043, "epoch": 0.6696273201856149, "grad_norm": 0.026149755343794823, "grad_norm_var": 3.5427273985242825e-06, "learning_rate": 0.002551673477957279, "loss": 2.4455, "step": 18471 }, { "crossentropy": 2.693424940109253, "epoch": 0.6696635730858469, "grad_norm": 0.027011459693312645, "grad_norm_var": 1.569806854694912e-06, "learning_rate": 0.002551166838017554, "loss": 2.5709, "step": 18472 }, { "crossentropy": 2.476928472518921, "epoch": 0.6696998259860789, "grad_norm": 0.02671150676906109, "grad_norm_var": 1.6043044734137998e-06, "learning_rate": 0.002550660231153035, "loss": 2.4983, "step": 18473 }, { "crossentropy": 2.641648769378662, "epoch": 0.6697360788863109, "grad_norm": 0.027241064235568047, "grad_norm_var": 1.5850856674011367e-06, "learning_rate": 0.0025501536573705663, "loss": 2.5968, "step": 18474 }, { "crossentropy": 2.5954430103302, "epoch": 0.6697723317865429, "grad_norm": 0.026035919785499573, "grad_norm_var": 1.7095794397961144e-06, "learning_rate": 0.002549647116676992, "loss": 2.5631, "step": 18475 }, { "crossentropy": 2.658398389816284, "epoch": 0.669808584686775, "grad_norm": 0.026455987244844437, "grad_norm_var": 1.641871675597123e-06, "learning_rate": 0.00254914060907915, "loss": 2.532, "step": 18476 }, { "crossentropy": 2.585811138153076, "epoch": 0.669844837587007, "grad_norm": 0.026868894696235657, "grad_norm_var": 1.6343543475659845e-06, "learning_rate": 0.002548634134583886, "loss": 2.5847, "step": 18477 }, { "crossentropy": 2.4246153831481934, "epoch": 0.669881090487239, "grad_norm": 0.02660294808447361, "grad_norm_var": 1.6091300876282127e-06, "learning_rate": 0.0025481276931980375, "loss": 2.4917, "step": 18478 }, { "crossentropy": 2.6220803260803223, "epoch": 0.669917343387471, "grad_norm": 0.026863250881433487, "grad_norm_var": 1.6209959429737727e-06, "learning_rate": 0.0025476212849284436, "loss": 2.6106, "step": 18479 }, { "crossentropy": 2.5079777240753174, "epoch": 0.669953596287703, "grad_norm": 0.028191814199090004, "grad_norm_var": 1.6367167857509925e-06, "learning_rate": 0.0025471149097819478, "loss": 2.5829, "step": 18480 }, { "crossentropy": 2.395615577697754, "epoch": 0.669989849187935, "grad_norm": 0.06439969688653946, "grad_norm_var": 8.742400361909662e-05, "learning_rate": 0.0025466085677653856, "loss": 2.5321, "step": 18481 }, { "crossentropy": 2.6564340591430664, "epoch": 0.670026102088167, "grad_norm": 0.02617398463189602, "grad_norm_var": 8.755068101912357e-05, "learning_rate": 0.0025461022588855976, "loss": 2.5888, "step": 18482 }, { "crossentropy": 2.3757107257843018, "epoch": 0.6700623549883991, "grad_norm": 0.025158314034342766, "grad_norm_var": 8.877938874153011e-05, "learning_rate": 0.0025455959831494246, "loss": 2.4099, "step": 18483 }, { "crossentropy": 2.4812281131744385, "epoch": 0.6700986078886311, "grad_norm": 0.028834568336606026, "grad_norm_var": 8.857496769045097e-05, "learning_rate": 0.0025450897405637006, "loss": 2.506, "step": 18484 }, { "crossentropy": 2.574376344680786, "epoch": 0.6701348607888631, "grad_norm": 0.026764893904328346, "grad_norm_var": 8.880153608950014e-05, "learning_rate": 0.0025445835311352673, "loss": 2.4969, "step": 18485 }, { "crossentropy": 2.644888162612915, "epoch": 0.6701711136890951, "grad_norm": 0.026134636253118515, "grad_norm_var": 8.93420290278217e-05, "learning_rate": 0.0025440773548709595, "loss": 2.5441, "step": 18486 }, { "crossentropy": 2.492328643798828, "epoch": 0.6702073665893271, "grad_norm": 0.02732725627720356, "grad_norm_var": 8.896550983315491e-05, "learning_rate": 0.002543571211777612, "loss": 2.5195, "step": 18487 }, { "crossentropy": 2.6245007514953613, "epoch": 0.6702436194895591, "grad_norm": 0.04018666595220566, "grad_norm_var": 9.60165734734965e-05, "learning_rate": 0.002543065101862065, "loss": 2.5579, "step": 18488 }, { "crossentropy": 2.5979092121124268, "epoch": 0.6702798723897911, "grad_norm": 0.027729107066988945, "grad_norm_var": 9.563552208888627e-05, "learning_rate": 0.0025425590251311504, "loss": 2.5244, "step": 18489 }, { "crossentropy": 2.518820285797119, "epoch": 0.6703161252900232, "grad_norm": 0.026273075491189957, "grad_norm_var": 9.605798375819323e-05, "learning_rate": 0.0025420529815917054, "loss": 2.5817, "step": 18490 }, { "crossentropy": 2.7180354595184326, "epoch": 0.6703523781902552, "grad_norm": 0.026405498385429382, "grad_norm_var": 9.587117884535205e-05, "learning_rate": 0.002541546971250567, "loss": 2.6353, "step": 18491 }, { "crossentropy": 2.6207783222198486, "epoch": 0.6703886310904872, "grad_norm": 0.02739279344677925, "grad_norm_var": 9.548046235118793e-05, "learning_rate": 0.002541040994114565, "loss": 2.6561, "step": 18492 }, { "crossentropy": 2.450848340988159, "epoch": 0.6704248839907193, "grad_norm": 0.029263410717248917, "grad_norm_var": 9.481306657266597e-05, "learning_rate": 0.002540535050190539, "loss": 2.4307, "step": 18493 }, { "crossentropy": 2.536668062210083, "epoch": 0.6704611368909513, "grad_norm": 0.034870944917201996, "grad_norm_var": 9.508558188749057e-05, "learning_rate": 0.002540029139485319, "loss": 2.6199, "step": 18494 }, { "crossentropy": 2.514227867126465, "epoch": 0.6704973897911833, "grad_norm": 0.026422958821058273, "grad_norm_var": 9.532576152832087e-05, "learning_rate": 0.0025395232620057367, "loss": 2.4997, "step": 18495 }, { "crossentropy": 2.4059410095214844, "epoch": 0.6705336426914154, "grad_norm": 0.027574272826313972, "grad_norm_var": 9.555781377593788e-05, "learning_rate": 0.002539017417758629, "loss": 2.3816, "step": 18496 }, { "crossentropy": 2.387389898300171, "epoch": 0.6705698955916474, "grad_norm": 0.02712303213775158, "grad_norm_var": 1.4820258688914787e-05, "learning_rate": 0.002538511606750823, "loss": 2.4682, "step": 18497 }, { "crossentropy": 2.5548126697540283, "epoch": 0.6706061484918794, "grad_norm": 0.027292082086205482, "grad_norm_var": 1.4573662964529472e-05, "learning_rate": 0.002538005828989154, "loss": 2.577, "step": 18498 }, { "crossentropy": 2.6881704330444336, "epoch": 0.6706424013921114, "grad_norm": 0.02673727087676525, "grad_norm_var": 1.4042366226841435e-05, "learning_rate": 0.002537500084480454, "loss": 2.5767, "step": 18499 }, { "crossentropy": 2.415626049041748, "epoch": 0.6706786542923434, "grad_norm": 0.02753816917538643, "grad_norm_var": 1.4093167440986385e-05, "learning_rate": 0.00253699437323155, "loss": 2.4654, "step": 18500 }, { "crossentropy": 2.577336072921753, "epoch": 0.6707149071925754, "grad_norm": 0.02698436751961708, "grad_norm_var": 1.4047166294755587e-05, "learning_rate": 0.0025364886952492774, "loss": 2.5192, "step": 18501 }, { "crossentropy": 2.584078550338745, "epoch": 0.6707511600928074, "grad_norm": 0.0261747557669878, "grad_norm_var": 1.4034862821523164e-05, "learning_rate": 0.0025359830505404628, "loss": 2.5603, "step": 18502 }, { "crossentropy": 2.516005039215088, "epoch": 0.6707874129930395, "grad_norm": 0.028069866821169853, "grad_norm_var": 1.3957569552443825e-05, "learning_rate": 0.002535477439111935, "loss": 2.4886, "step": 18503 }, { "crossentropy": 2.5693440437316895, "epoch": 0.6708236658932715, "grad_norm": 0.026277434080839157, "grad_norm_var": 4.380003503314531e-06, "learning_rate": 0.0025349718609705262, "loss": 2.4979, "step": 18504 }, { "crossentropy": 2.621816635131836, "epoch": 0.6708599187935035, "grad_norm": 0.026861578226089478, "grad_norm_var": 4.415932162342833e-06, "learning_rate": 0.0025344663161230613, "loss": 2.5921, "step": 18505 }, { "crossentropy": 2.5610768795013428, "epoch": 0.6708961716937355, "grad_norm": 0.02807171270251274, "grad_norm_var": 4.3049783643648105e-06, "learning_rate": 0.002533960804576371, "loss": 2.5339, "step": 18506 }, { "crossentropy": 2.5323071479797363, "epoch": 0.6709324245939675, "grad_norm": 0.027636367827653885, "grad_norm_var": 4.188654498720607e-06, "learning_rate": 0.0025334553263372827, "loss": 2.5583, "step": 18507 }, { "crossentropy": 2.480776309967041, "epoch": 0.6709686774941995, "grad_norm": 0.02689223177731037, "grad_norm_var": 4.2293690808519e-06, "learning_rate": 0.0025329498814126226, "loss": 2.4281, "step": 18508 }, { "crossentropy": 2.5137722492218018, "epoch": 0.6710049303944315, "grad_norm": 0.026653077453374863, "grad_norm_var": 4.123941728995092e-06, "learning_rate": 0.00253244446980922, "loss": 2.4912, "step": 18509 }, { "crossentropy": 2.3626115322113037, "epoch": 0.6710411832946636, "grad_norm": 0.02521137334406376, "grad_norm_var": 5.572891059522448e-07, "learning_rate": 0.002531939091533898, "loss": 2.5165, "step": 18510 }, { "crossentropy": 2.5509772300720215, "epoch": 0.6710774361948956, "grad_norm": 0.02712494321167469, "grad_norm_var": 5.368828445547937e-07, "learning_rate": 0.0025314337465934854, "loss": 2.5459, "step": 18511 }, { "crossentropy": 2.454439878463745, "epoch": 0.6711136890951276, "grad_norm": 0.02559581957757473, "grad_norm_var": 6.337045541538923e-07, "learning_rate": 0.002530928434994806, "loss": 2.4391, "step": 18512 }, { "crossentropy": 2.5019805431365967, "epoch": 0.6711499419953596, "grad_norm": 0.026998011395335197, "grad_norm_var": 6.308011807302684e-07, "learning_rate": 0.002530423156744683, "loss": 2.5247, "step": 18513 }, { "crossentropy": 2.448350191116333, "epoch": 0.6711861948955916, "grad_norm": 0.026838654652237892, "grad_norm_var": 6.188853126317852e-07, "learning_rate": 0.0025299179118499437, "loss": 2.5031, "step": 18514 }, { "crossentropy": 2.511854648590088, "epoch": 0.6712224477958236, "grad_norm": 0.028346111997961998, "grad_norm_var": 7.555966945453485e-07, "learning_rate": 0.002529412700317413, "loss": 2.4772, "step": 18515 }, { "crossentropy": 2.5534608364105225, "epoch": 0.6712587006960556, "grad_norm": 0.02629600092768669, "grad_norm_var": 7.553899830639675e-07, "learning_rate": 0.0025289075221539116, "loss": 2.4527, "step": 18516 }, { "crossentropy": 2.4655725955963135, "epoch": 0.6712949535962877, "grad_norm": 0.02550562657415867, "grad_norm_var": 8.708917773089798e-07, "learning_rate": 0.0025284023773662657, "loss": 2.5448, "step": 18517 }, { "crossentropy": 2.5315120220184326, "epoch": 0.6713312064965197, "grad_norm": 0.026182718575000763, "grad_norm_var": 8.702482660978648e-07, "learning_rate": 0.002527897265961294, "loss": 2.5335, "step": 18518 }, { "crossentropy": 2.6085216999053955, "epoch": 0.6713674593967517, "grad_norm": 0.026314930990338326, "grad_norm_var": 7.621102716909387e-07, "learning_rate": 0.002527392187945824, "loss": 2.5154, "step": 18519 }, { "crossentropy": 2.4903452396392822, "epoch": 0.6714037122969838, "grad_norm": 0.027350327000021935, "grad_norm_var": 7.771222687519633e-07, "learning_rate": 0.0025268871433266745, "loss": 2.4924, "step": 18520 }, { "crossentropy": 2.5080270767211914, "epoch": 0.6714399651972158, "grad_norm": 0.027300896123051643, "grad_norm_var": 7.961617558371967e-07, "learning_rate": 0.002526382132110665, "loss": 2.512, "step": 18521 }, { "crossentropy": 2.4888577461242676, "epoch": 0.6714762180974478, "grad_norm": 0.025934990495443344, "grad_norm_var": 7.106361988819341e-07, "learning_rate": 0.0025258771543046187, "loss": 2.4842, "step": 18522 }, { "crossentropy": 2.4636332988739014, "epoch": 0.6715124709976799, "grad_norm": 0.026664765551686287, "grad_norm_var": 6.400915076290939e-07, "learning_rate": 0.0025253722099153587, "loss": 2.527, "step": 18523 }, { "crossentropy": 2.4895436763763428, "epoch": 0.6715487238979119, "grad_norm": 0.026563355699181557, "grad_norm_var": 6.329695366411383e-07, "learning_rate": 0.0025248672989497, "loss": 2.5532, "step": 18524 }, { "crossentropy": 2.509115219116211, "epoch": 0.6715849767981439, "grad_norm": 0.02745787240564823, "grad_norm_var": 6.839640128952397e-07, "learning_rate": 0.0025243624214144663, "loss": 2.5919, "step": 18525 }, { "crossentropy": 2.609100580215454, "epoch": 0.6716212296983759, "grad_norm": 0.02577967196702957, "grad_norm_var": 5.985194371771011e-07, "learning_rate": 0.002523857577316473, "loss": 2.5671, "step": 18526 }, { "crossentropy": 2.679002046585083, "epoch": 0.6716574825986079, "grad_norm": 0.02767838165163994, "grad_norm_var": 6.533798644223066e-07, "learning_rate": 0.002523352766662541, "loss": 2.5702, "step": 18527 }, { "crossentropy": 2.4999101161956787, "epoch": 0.6716937354988399, "grad_norm": 0.025962313637137413, "grad_norm_var": 6.09014790674484e-07, "learning_rate": 0.0025228479894594926, "loss": 2.4874, "step": 18528 }, { "crossentropy": 2.7707607746124268, "epoch": 0.6717299883990719, "grad_norm": 0.027373937889933586, "grad_norm_var": 6.328641983944916e-07, "learning_rate": 0.002522343245714137, "loss": 2.6683, "step": 18529 }, { "crossentropy": 2.6093204021453857, "epoch": 0.671766241299304, "grad_norm": 0.025981994345784187, "grad_norm_var": 6.653961162578764e-07, "learning_rate": 0.0025218385354332964, "loss": 2.5576, "step": 18530 }, { "crossentropy": 2.517387866973877, "epoch": 0.671802494199536, "grad_norm": 0.02844218723475933, "grad_norm_var": 6.874649667682238e-07, "learning_rate": 0.0025213338586237877, "loss": 2.4979, "step": 18531 }, { "crossentropy": 2.564974069595337, "epoch": 0.671838747099768, "grad_norm": 0.026870451867580414, "grad_norm_var": 6.791087440495876e-07, "learning_rate": 0.0025208292152924263, "loss": 2.6207, "step": 18532 }, { "crossentropy": 2.5941250324249268, "epoch": 0.671875, "grad_norm": 0.026228925213217735, "grad_norm_var": 5.956300933791878e-07, "learning_rate": 0.00252032460544603, "loss": 2.6185, "step": 18533 }, { "crossentropy": 2.5226094722747803, "epoch": 0.671911252900232, "grad_norm": 0.026210853829979897, "grad_norm_var": 5.935309198868213e-07, "learning_rate": 0.002519820029091411, "loss": 2.4685, "step": 18534 }, { "crossentropy": 2.427135705947876, "epoch": 0.671947505800464, "grad_norm": 0.027393566444516182, "grad_norm_var": 6.026346556973923e-07, "learning_rate": 0.002519315486235386, "loss": 2.4241, "step": 18535 }, { "crossentropy": 2.63177490234375, "epoch": 0.671983758700696, "grad_norm": 0.027809100225567818, "grad_norm_var": 6.479443986802391e-07, "learning_rate": 0.0025188109768847743, "loss": 2.5363, "step": 18536 }, { "crossentropy": 2.546994209289551, "epoch": 0.6720200116009281, "grad_norm": 0.026690751314163208, "grad_norm_var": 6.348009270607056e-07, "learning_rate": 0.002518306501046381, "loss": 2.5053, "step": 18537 }, { "crossentropy": 2.4923174381256104, "epoch": 0.6720562645011601, "grad_norm": 0.0278627909719944, "grad_norm_var": 6.408290262604887e-07, "learning_rate": 0.002517802058727026, "loss": 2.5002, "step": 18538 }, { "crossentropy": 2.5032105445861816, "epoch": 0.6720925174013921, "grad_norm": 0.026699671521782875, "grad_norm_var": 6.396442953400614e-07, "learning_rate": 0.002517297649933522, "loss": 2.4996, "step": 18539 }, { "crossentropy": 2.407750368118286, "epoch": 0.6721287703016241, "grad_norm": 0.025797855108976364, "grad_norm_var": 7.144935999766714e-07, "learning_rate": 0.0025167932746726785, "loss": 2.4783, "step": 18540 }, { "crossentropy": 2.55531644821167, "epoch": 0.6721650232018561, "grad_norm": 0.026139574125409126, "grad_norm_var": 7.232998790809853e-07, "learning_rate": 0.0025162889329513128, "loss": 2.5659, "step": 18541 }, { "crossentropy": 2.4935572147369385, "epoch": 0.6722012761020881, "grad_norm": 0.026419607922434807, "grad_norm_var": 6.611847290850878e-07, "learning_rate": 0.002515784624776233, "loss": 2.5016, "step": 18542 }, { "crossentropy": 2.5184686183929443, "epoch": 0.6722375290023201, "grad_norm": 0.025801286101341248, "grad_norm_var": 6.734808547525223e-07, "learning_rate": 0.002515280350154251, "loss": 2.5377, "step": 18543 }, { "crossentropy": 2.550764560699463, "epoch": 0.6722737819025522, "grad_norm": 0.027063900604844093, "grad_norm_var": 6.365230957425568e-07, "learning_rate": 0.0025147761090921817, "loss": 2.5324, "step": 18544 }, { "crossentropy": 2.3760929107666016, "epoch": 0.6723100348027842, "grad_norm": 0.02663765288889408, "grad_norm_var": 6.139779630564737e-07, "learning_rate": 0.0025142719015968295, "loss": 2.4929, "step": 18545 }, { "crossentropy": 2.351490020751953, "epoch": 0.6723462877030162, "grad_norm": 0.026121510192751884, "grad_norm_var": 6.008496481925505e-07, "learning_rate": 0.0025137677276750076, "loss": 2.4447, "step": 18546 }, { "crossentropy": 2.556764841079712, "epoch": 0.6723825406032483, "grad_norm": 0.027288343757390976, "grad_norm_var": 4.2554732302268796e-07, "learning_rate": 0.0025132635873335273, "loss": 2.5597, "step": 18547 }, { "crossentropy": 2.5939993858337402, "epoch": 0.6724187935034803, "grad_norm": 0.02656308189034462, "grad_norm_var": 4.2404604362657656e-07, "learning_rate": 0.0025127594805791943, "loss": 2.468, "step": 18548 }, { "crossentropy": 2.418092727661133, "epoch": 0.6724550464037123, "grad_norm": 0.026473509147763252, "grad_norm_var": 4.1338363284676206e-07, "learning_rate": 0.00251225540741882, "loss": 2.5132, "step": 18549 }, { "crossentropy": 2.4071736335754395, "epoch": 0.6724912993039444, "grad_norm": 0.02604532428085804, "grad_norm_var": 4.2557883807066663e-07, "learning_rate": 0.002511751367859211, "loss": 2.5028, "step": 18550 }, { "crossentropy": 2.3005292415618896, "epoch": 0.6725275522041764, "grad_norm": 0.028159575536847115, "grad_norm_var": 5.355943712698555e-07, "learning_rate": 0.002511247361907175, "loss": 2.4457, "step": 18551 }, { "crossentropy": 2.3244471549987793, "epoch": 0.6725638051044084, "grad_norm": 0.025915473699569702, "grad_norm_var": 4.855731366025836e-07, "learning_rate": 0.0025107433895695227, "loss": 2.404, "step": 18552 }, { "crossentropy": 2.525456190109253, "epoch": 0.6726000580046404, "grad_norm": 0.026678983122110367, "grad_norm_var": 4.854472316067794e-07, "learning_rate": 0.002510239450853058, "loss": 2.5294, "step": 18553 }, { "crossentropy": 2.4225356578826904, "epoch": 0.6726363109048724, "grad_norm": 0.028610315173864365, "grad_norm_var": 6.458095258269573e-07, "learning_rate": 0.002509735545764586, "loss": 2.4999, "step": 18554 }, { "crossentropy": 2.5691466331481934, "epoch": 0.6726725638051044, "grad_norm": 0.026526158675551414, "grad_norm_var": 6.465646928802687e-07, "learning_rate": 0.002509231674310917, "loss": 2.5297, "step": 18555 }, { "crossentropy": 2.5129103660583496, "epoch": 0.6727088167053364, "grad_norm": 0.026574984192848206, "grad_norm_var": 5.97035650675319e-07, "learning_rate": 0.002508727836498852, "loss": 2.4762, "step": 18556 }, { "crossentropy": 2.5115678310394287, "epoch": 0.6727450696055685, "grad_norm": 0.02613420970737934, "grad_norm_var": 5.97430218303889e-07, "learning_rate": 0.0025082240323351995, "loss": 2.4315, "step": 18557 }, { "crossentropy": 2.519188165664673, "epoch": 0.6727813225058005, "grad_norm": 0.035657402127981186, "grad_norm_var": 5.599947233809038e-06, "learning_rate": 0.0025077202618267614, "loss": 2.5476, "step": 18558 }, { "crossentropy": 2.650486707687378, "epoch": 0.6728175754060325, "grad_norm": 0.02586369775235653, "grad_norm_var": 5.5880042203156385e-06, "learning_rate": 0.0025072165249803435, "loss": 2.5194, "step": 18559 }, { "crossentropy": 2.504631519317627, "epoch": 0.6728538283062645, "grad_norm": 0.02604706957936287, "grad_norm_var": 5.680518436885068e-06, "learning_rate": 0.002506712821802751, "loss": 2.5523, "step": 18560 }, { "crossentropy": 2.55332088470459, "epoch": 0.6728900812064965, "grad_norm": 0.02933843433856964, "grad_norm_var": 5.931713964119515e-06, "learning_rate": 0.002506209152300786, "loss": 2.5202, "step": 18561 }, { "crossentropy": 2.5056684017181396, "epoch": 0.6729263341067285, "grad_norm": 0.02736317552626133, "grad_norm_var": 5.820570222797337e-06, "learning_rate": 0.002505705516481248, "loss": 2.4674, "step": 18562 }, { "crossentropy": 2.58143949508667, "epoch": 0.6729625870069605, "grad_norm": 0.027094917371869087, "grad_norm_var": 5.827141781539607e-06, "learning_rate": 0.002505201914350945, "loss": 2.5907, "step": 18563 }, { "crossentropy": 2.6241707801818848, "epoch": 0.6729988399071926, "grad_norm": 0.02645447850227356, "grad_norm_var": 5.8405828330526655e-06, "learning_rate": 0.002504698345916674, "loss": 2.5551, "step": 18564 }, { "crossentropy": 2.5180819034576416, "epoch": 0.6730350928074246, "grad_norm": 0.026384642347693443, "grad_norm_var": 5.852452522245716e-06, "learning_rate": 0.0025041948111852405, "loss": 2.5416, "step": 18565 }, { "crossentropy": 2.5984909534454346, "epoch": 0.6730713457076566, "grad_norm": 0.02678452804684639, "grad_norm_var": 5.75032151071141e-06, "learning_rate": 0.002503691310163442, "loss": 2.6036, "step": 18566 }, { "crossentropy": 2.5453686714172363, "epoch": 0.6731075986078886, "grad_norm": 0.025928311049938202, "grad_norm_var": 5.857595515157352e-06, "learning_rate": 0.0025031878428580802, "loss": 2.4949, "step": 18567 }, { "crossentropy": 2.385193347930908, "epoch": 0.6731438515081206, "grad_norm": 0.02609972096979618, "grad_norm_var": 5.824849635515161e-06, "learning_rate": 0.002502684409275957, "loss": 2.4008, "step": 18568 }, { "crossentropy": 2.506265163421631, "epoch": 0.6731801044083526, "grad_norm": 0.02592449076473713, "grad_norm_var": 5.927561139840107e-06, "learning_rate": 0.0025021810094238697, "loss": 2.5052, "step": 18569 }, { "crossentropy": 2.4484472274780273, "epoch": 0.6732163573085846, "grad_norm": 0.025880880653858185, "grad_norm_var": 5.916012271102907e-06, "learning_rate": 0.00250167764330862, "loss": 2.4597, "step": 18570 }, { "crossentropy": 2.665398120880127, "epoch": 0.6732526102088167, "grad_norm": 0.02585260011255741, "grad_norm_var": 5.998468479780026e-06, "learning_rate": 0.002501174310937005, "loss": 2.5783, "step": 18571 }, { "crossentropy": 2.590127944946289, "epoch": 0.6732888631090487, "grad_norm": 0.026951881125569344, "grad_norm_var": 5.981642955941178e-06, "learning_rate": 0.0025006710123158214, "loss": 2.5517, "step": 18572 }, { "crossentropy": 2.420569896697998, "epoch": 0.6733251160092807, "grad_norm": 0.02550181746482849, "grad_norm_var": 6.088917899557157e-06, "learning_rate": 0.00250016774745187, "loss": 2.4302, "step": 18573 }, { "crossentropy": 2.539365291595459, "epoch": 0.6733613689095128, "grad_norm": 0.02630073018372059, "grad_norm_var": 8.479849913572141e-07, "learning_rate": 0.0024996645163519455, "loss": 2.5501, "step": 18574 }, { "crossentropy": 2.5469918251037598, "epoch": 0.6733976218097448, "grad_norm": 0.026162942871451378, "grad_norm_var": 8.287637956790599e-07, "learning_rate": 0.0024991613190228456, "loss": 2.5554, "step": 18575 }, { "crossentropy": 2.5404059886932373, "epoch": 0.6734338747099768, "grad_norm": 0.02605941705405712, "grad_norm_var": 8.280203849339006e-07, "learning_rate": 0.00249865815547137, "loss": 2.5306, "step": 18576 }, { "crossentropy": 2.4882280826568604, "epoch": 0.6734701276102089, "grad_norm": 0.02673063799738884, "grad_norm_var": 2.679198757798059e-07, "learning_rate": 0.002498155025704309, "loss": 2.5441, "step": 18577 }, { "crossentropy": 2.592583656311035, "epoch": 0.6735063805104409, "grad_norm": 0.024765390902757645, "grad_norm_var": 3.360629768602569e-07, "learning_rate": 0.002497651929728464, "loss": 2.456, "step": 18578 }, { "crossentropy": 2.635711431503296, "epoch": 0.6735426334106729, "grad_norm": 0.027217630296945572, "grad_norm_var": 3.5197642729989145e-07, "learning_rate": 0.0024971488675506258, "loss": 2.5646, "step": 18579 }, { "crossentropy": 2.4697353839874268, "epoch": 0.6735788863109049, "grad_norm": 0.027027906849980354, "grad_norm_var": 3.929396076934732e-07, "learning_rate": 0.00249664583917759, "loss": 2.5548, "step": 18580 }, { "crossentropy": 2.4177656173706055, "epoch": 0.6736151392111369, "grad_norm": 0.02570495381951332, "grad_norm_var": 4.071955936379411e-07, "learning_rate": 0.002496142844616152, "loss": 2.4135, "step": 18581 }, { "crossentropy": 2.6096866130828857, "epoch": 0.6736513921113689, "grad_norm": 0.027331756427884102, "grad_norm_var": 4.6995731444603575e-07, "learning_rate": 0.0024956398838731028, "loss": 2.5277, "step": 18582 }, { "crossentropy": 2.7145657539367676, "epoch": 0.6736876450116009, "grad_norm": 0.02600116841495037, "grad_norm_var": 4.6750344104793763e-07, "learning_rate": 0.002495136956955238, "loss": 2.6366, "step": 18583 }, { "crossentropy": 2.4189274311065674, "epoch": 0.673723897911833, "grad_norm": 0.026933645829558372, "grad_norm_var": 4.976362460338071e-07, "learning_rate": 0.002494634063869351, "loss": 2.4005, "step": 18584 }, { "crossentropy": 2.5994348526000977, "epoch": 0.673760150812065, "grad_norm": 0.02740606665611267, "grad_norm_var": 5.662309954825578e-07, "learning_rate": 0.0024941312046222316, "loss": 2.5765, "step": 18585 }, { "crossentropy": 2.463690757751465, "epoch": 0.673796403712297, "grad_norm": 0.026774663478136063, "grad_norm_var": 5.58544723751432e-07, "learning_rate": 0.0024936283792206747, "loss": 2.4409, "step": 18586 }, { "crossentropy": 2.3419370651245117, "epoch": 0.673832656612529, "grad_norm": 0.026784349232912064, "grad_norm_var": 5.422896671112657e-07, "learning_rate": 0.00249312558767147, "loss": 2.3944, "step": 18587 }, { "crossentropy": 2.4420604705810547, "epoch": 0.673868909512761, "grad_norm": 0.025680270045995712, "grad_norm_var": 5.630799115954501e-07, "learning_rate": 0.0024926228299814073, "loss": 2.4073, "step": 18588 }, { "crossentropy": 2.3627004623413086, "epoch": 0.673905162412993, "grad_norm": 0.025619138032197952, "grad_norm_var": 5.499064115128815e-07, "learning_rate": 0.0024921201061572796, "loss": 2.3852, "step": 18589 }, { "crossentropy": 2.4517266750335693, "epoch": 0.673941415313225, "grad_norm": 0.02760445885360241, "grad_norm_var": 6.377883695688937e-07, "learning_rate": 0.0024916174162058742, "loss": 2.4679, "step": 18590 }, { "crossentropy": 2.571211338043213, "epoch": 0.6739776682134571, "grad_norm": 0.031982000917196274, "grad_norm_var": 2.502099375431092e-06, "learning_rate": 0.0024911147601339822, "loss": 2.5482, "step": 18591 }, { "crossentropy": 2.630971670150757, "epoch": 0.6740139211136891, "grad_norm": 0.027356375008821487, "grad_norm_var": 2.4702634101942394e-06, "learning_rate": 0.002490612137948394, "loss": 2.5174, "step": 18592 }, { "crossentropy": 2.513726234436035, "epoch": 0.6740501740139211, "grad_norm": 0.028819695115089417, "grad_norm_var": 2.686789368249e-06, "learning_rate": 0.0024901095496558962, "loss": 2.4649, "step": 18593 }, { "crossentropy": 2.3633766174316406, "epoch": 0.6740864269141531, "grad_norm": 0.028640586882829666, "grad_norm_var": 2.4381553220132e-06, "learning_rate": 0.002489606995263279, "loss": 2.4073, "step": 18594 }, { "crossentropy": 2.5024118423461914, "epoch": 0.6741226798143851, "grad_norm": 0.02604244463145733, "grad_norm_var": 2.53820743308508e-06, "learning_rate": 0.002489104474777329, "loss": 2.5546, "step": 18595 }, { "crossentropy": 2.4558751583099365, "epoch": 0.6741589327146171, "grad_norm": 0.02668127417564392, "grad_norm_var": 2.5551425064687977e-06, "learning_rate": 0.0024886019882048318, "loss": 2.3993, "step": 18596 }, { "crossentropy": 2.5235118865966797, "epoch": 0.6741951856148491, "grad_norm": 0.02762037329375744, "grad_norm_var": 2.4000264047904206e-06, "learning_rate": 0.0024880995355525777, "loss": 2.4951, "step": 18597 }, { "crossentropy": 2.58940052986145, "epoch": 0.6742314385150812, "grad_norm": 0.02668800577521324, "grad_norm_var": 2.4257672791503828e-06, "learning_rate": 0.0024875971168273497, "loss": 2.569, "step": 18598 }, { "crossentropy": 2.5621676445007324, "epoch": 0.6742676914153132, "grad_norm": 0.027265939861536026, "grad_norm_var": 2.3084593495777013e-06, "learning_rate": 0.002487094732035935, "loss": 2.4876, "step": 18599 }, { "crossentropy": 2.3706557750701904, "epoch": 0.6743039443155452, "grad_norm": 0.026639319956302643, "grad_norm_var": 2.3309468238648233e-06, "learning_rate": 0.0024865923811851214, "loss": 2.4802, "step": 18600 }, { "crossentropy": 2.5178756713867188, "epoch": 0.6743401972157773, "grad_norm": 0.02570183016359806, "grad_norm_var": 2.499803557082005e-06, "learning_rate": 0.0024860900642816905, "loss": 2.5533, "step": 18601 }, { "crossentropy": 2.495572805404663, "epoch": 0.6743764501160093, "grad_norm": 0.027190933004021645, "grad_norm_var": 2.4845955373273593e-06, "learning_rate": 0.00248558778133243, "loss": 2.5558, "step": 18602 }, { "crossentropy": 2.4633994102478027, "epoch": 0.6744127030162413, "grad_norm": 0.026355747133493423, "grad_norm_var": 2.5238194976628793e-06, "learning_rate": 0.002485085532344121, "loss": 2.4493, "step": 18603 }, { "crossentropy": 2.4147894382476807, "epoch": 0.6744489559164734, "grad_norm": 0.02519695833325386, "grad_norm_var": 2.6391252268869452e-06, "learning_rate": 0.0024845833173235475, "loss": 2.5367, "step": 18604 }, { "crossentropy": 2.520207405090332, "epoch": 0.6744852088167054, "grad_norm": 0.026304319500923157, "grad_norm_var": 2.522872704750059e-06, "learning_rate": 0.002484081136277494, "loss": 2.5708, "step": 18605 }, { "crossentropy": 2.635798931121826, "epoch": 0.6745214617169374, "grad_norm": 0.026504741981625557, "grad_norm_var": 2.547312056356054e-06, "learning_rate": 0.002483578989212744, "loss": 2.5369, "step": 18606 }, { "crossentropy": 2.5126023292541504, "epoch": 0.6745577146171694, "grad_norm": 0.02830476686358452, "grad_norm_var": 1.0414168805822442e-06, "learning_rate": 0.002483076876136077, "loss": 2.4944, "step": 18607 }, { "crossentropy": 2.521559000015259, "epoch": 0.6745939675174014, "grad_norm": 0.02603337913751602, "grad_norm_var": 1.080376689836894e-06, "learning_rate": 0.0024825747970542785, "loss": 2.4841, "step": 18608 }, { "crossentropy": 2.310042142868042, "epoch": 0.6746302204176334, "grad_norm": 0.02486768737435341, "grad_norm_var": 1.0314788940498128e-06, "learning_rate": 0.0024820727519741254, "loss": 2.3786, "step": 18609 }, { "crossentropy": 2.491246461868286, "epoch": 0.6746664733178654, "grad_norm": 0.027776433154940605, "grad_norm_var": 8.461904428207212e-07, "learning_rate": 0.002481570740902403, "loss": 2.5557, "step": 18610 }, { "crossentropy": 2.624752998352051, "epoch": 0.6747027262180975, "grad_norm": 0.026787877082824707, "grad_norm_var": 8.281491334676532e-07, "learning_rate": 0.0024810687638458875, "loss": 2.6195, "step": 18611 }, { "crossentropy": 2.6761813163757324, "epoch": 0.6747389791183295, "grad_norm": 0.027170220389962196, "grad_norm_var": 8.470872289625374e-07, "learning_rate": 0.002480566820811363, "loss": 2.4993, "step": 18612 }, { "crossentropy": 2.5745842456817627, "epoch": 0.6747752320185615, "grad_norm": 0.02672429569065571, "grad_norm_var": 7.813983361649087e-07, "learning_rate": 0.0024800649118056047, "loss": 2.6151, "step": 18613 }, { "crossentropy": 2.448969602584839, "epoch": 0.6748114849187935, "grad_norm": 0.02719937637448311, "grad_norm_var": 8.041156191837539e-07, "learning_rate": 0.002479563036835396, "loss": 2.4914, "step": 18614 }, { "crossentropy": 2.4972403049468994, "epoch": 0.6748477378190255, "grad_norm": 0.026526527479290962, "grad_norm_var": 7.752439138172199e-07, "learning_rate": 0.0024790611959075114, "loss": 2.5279, "step": 18615 }, { "crossentropy": 2.527348756790161, "epoch": 0.6748839907192575, "grad_norm": 0.026431284844875336, "grad_norm_var": 7.763110605296519e-07, "learning_rate": 0.0024785593890287332, "loss": 2.5894, "step": 18616 }, { "crossentropy": 2.395551919937134, "epoch": 0.6749202436194895, "grad_norm": 0.03035874292254448, "grad_norm_var": 1.5943656921984147e-06, "learning_rate": 0.0024780576162058347, "loss": 2.461, "step": 18617 }, { "crossentropy": 2.4369966983795166, "epoch": 0.6749564965197216, "grad_norm": 0.027545815333724022, "grad_norm_var": 1.6179749911442974e-06, "learning_rate": 0.002477555877445596, "loss": 2.5, "step": 18618 }, { "crossentropy": 2.5879974365234375, "epoch": 0.6749927494199536, "grad_norm": 0.029057646170258522, "grad_norm_var": 1.885193330804904e-06, "learning_rate": 0.002477054172754794, "loss": 2.6532, "step": 18619 }, { "crossentropy": 2.509617805480957, "epoch": 0.6750290023201856, "grad_norm": 0.026391180232167244, "grad_norm_var": 1.6793684395323879e-06, "learning_rate": 0.0024765525021402048, "loss": 2.4895, "step": 18620 }, { "crossentropy": 2.6041088104248047, "epoch": 0.6750652552204176, "grad_norm": 0.026103660464286804, "grad_norm_var": 1.703815608340246e-06, "learning_rate": 0.0024760508656086013, "loss": 2.5403, "step": 18621 }, { "crossentropy": 2.4009926319122314, "epoch": 0.6751015081206496, "grad_norm": 0.026029415428638458, "grad_norm_var": 1.7563895490889005e-06, "learning_rate": 0.002475549263166763, "loss": 2.4098, "step": 18622 }, { "crossentropy": 2.5419063568115234, "epoch": 0.6751377610208816, "grad_norm": 0.02750035561621189, "grad_norm_var": 1.6656594915652352e-06, "learning_rate": 0.0024750476948214614, "loss": 2.4824, "step": 18623 }, { "crossentropy": 2.55802583694458, "epoch": 0.6751740139211136, "grad_norm": 0.026735873892903328, "grad_norm_var": 1.6030138143936171e-06, "learning_rate": 0.002474546160579474, "loss": 2.5607, "step": 18624 }, { "crossentropy": 2.4762208461761475, "epoch": 0.6752102668213457, "grad_norm": 0.025627080351114273, "grad_norm_var": 1.4155200229644711e-06, "learning_rate": 0.0024740446604475715, "loss": 2.4897, "step": 18625 }, { "crossentropy": 2.608898878097534, "epoch": 0.6752465197215777, "grad_norm": 0.027985215187072754, "grad_norm_var": 1.4364382606389208e-06, "learning_rate": 0.0024735431944325286, "loss": 2.5378, "step": 18626 }, { "crossentropy": 2.6179611682891846, "epoch": 0.6752827726218097, "grad_norm": 0.026791764423251152, "grad_norm_var": 1.4362588152035434e-06, "learning_rate": 0.002473041762541121, "loss": 2.4709, "step": 18627 }, { "crossentropy": 2.651205539703369, "epoch": 0.6753190255220418, "grad_norm": 0.027315810322761536, "grad_norm_var": 1.4382448996036814e-06, "learning_rate": 0.0024725403647801194, "loss": 2.6008, "step": 18628 }, { "crossentropy": 2.6182684898376465, "epoch": 0.6753552784222738, "grad_norm": 0.026758182793855667, "grad_norm_var": 1.4364146683398427e-06, "learning_rate": 0.002472039001156294, "loss": 2.5618, "step": 18629 }, { "crossentropy": 2.5793468952178955, "epoch": 0.6753915313225058, "grad_norm": 0.027605773881077766, "grad_norm_var": 1.4495550963749525e-06, "learning_rate": 0.00247153767167642, "loss": 2.5046, "step": 18630 }, { "crossentropy": 2.5333352088928223, "epoch": 0.6754277842227379, "grad_norm": 0.025750843808054924, "grad_norm_var": 1.5539977819352865e-06, "learning_rate": 0.0024710363763472644, "loss": 2.5208, "step": 18631 }, { "crossentropy": 2.5148706436157227, "epoch": 0.6754640371229699, "grad_norm": 0.025791270658373833, "grad_norm_var": 1.6387366978700408e-06, "learning_rate": 0.002470535115175603, "loss": 2.535, "step": 18632 }, { "crossentropy": 2.409053087234497, "epoch": 0.6755002900232019, "grad_norm": 0.02609890326857567, "grad_norm_var": 9.130567590557297e-07, "learning_rate": 0.0024700338881682013, "loss": 2.4935, "step": 18633 }, { "crossentropy": 2.6924009323120117, "epoch": 0.6755365429234339, "grad_norm": 0.027407847344875336, "grad_norm_var": 9.008586716104105e-07, "learning_rate": 0.0024695326953318307, "loss": 2.6054, "step": 18634 }, { "crossentropy": 2.6068973541259766, "epoch": 0.6755727958236659, "grad_norm": 0.02648104727268219, "grad_norm_var": 5.434194925002841e-07, "learning_rate": 0.002469031536673264, "loss": 2.5807, "step": 18635 }, { "crossentropy": 2.4598331451416016, "epoch": 0.6756090487238979, "grad_norm": 0.027132341638207436, "grad_norm_var": 5.523342394190735e-07, "learning_rate": 0.0024685304121992648, "loss": 2.5778, "step": 18636 }, { "crossentropy": 2.510521173477173, "epoch": 0.67564530162413, "grad_norm": 0.025865260511636734, "grad_norm_var": 5.746739412042784e-07, "learning_rate": 0.0024680293219166063, "loss": 2.5528, "step": 18637 }, { "crossentropy": 2.428842306137085, "epoch": 0.675681554524362, "grad_norm": 0.02685266174376011, "grad_norm_var": 5.456408414885571e-07, "learning_rate": 0.0024675282658320535, "loss": 2.4143, "step": 18638 }, { "crossentropy": 2.6178972721099854, "epoch": 0.675717807424594, "grad_norm": 0.02622687816619873, "grad_norm_var": 5.164105409173932e-07, "learning_rate": 0.002467027243952374, "loss": 2.5118, "step": 18639 }, { "crossentropy": 2.5134077072143555, "epoch": 0.675754060324826, "grad_norm": 0.030818652361631393, "grad_norm_var": 1.6040649547794174e-06, "learning_rate": 0.002466526256284336, "loss": 2.4608, "step": 18640 }, { "crossentropy": 2.511803388595581, "epoch": 0.675790313225058, "grad_norm": 0.02738730050623417, "grad_norm_var": 1.4973575199868216e-06, "learning_rate": 0.0024660253028347045, "loss": 2.5502, "step": 18641 }, { "crossentropy": 2.584188461303711, "epoch": 0.67582656612529, "grad_norm": 0.028350723907351494, "grad_norm_var": 1.5528996296670927e-06, "learning_rate": 0.0024655243836102466, "loss": 2.5119, "step": 18642 }, { "crossentropy": 2.4801623821258545, "epoch": 0.675862819025522, "grad_norm": 0.026857543736696243, "grad_norm_var": 1.5509954901250875e-06, "learning_rate": 0.0024650234986177302, "loss": 2.4769, "step": 18643 }, { "crossentropy": 2.48219633102417, "epoch": 0.675899071925754, "grad_norm": 0.026665927842259407, "grad_norm_var": 1.5538235369323695e-06, "learning_rate": 0.002464522647863916, "loss": 2.4998, "step": 18644 }, { "crossentropy": 2.4975051879882812, "epoch": 0.6759353248259861, "grad_norm": 0.026451431214809418, "grad_norm_var": 1.56972572177416e-06, "learning_rate": 0.0024640218313555736, "loss": 2.5603, "step": 18645 }, { "crossentropy": 2.469951868057251, "epoch": 0.6759715777262181, "grad_norm": 0.02682311460375786, "grad_norm_var": 1.5431281572524188e-06, "learning_rate": 0.0024635210490994647, "loss": 2.5281, "step": 18646 }, { "crossentropy": 2.6530280113220215, "epoch": 0.6760078306264501, "grad_norm": 0.02684105932712555, "grad_norm_var": 1.4452664941787262e-06, "learning_rate": 0.002463020301102351, "loss": 2.6384, "step": 18647 }, { "crossentropy": 2.5594730377197266, "epoch": 0.6760440835266821, "grad_norm": 0.026737244799733162, "grad_norm_var": 1.3483291538019319e-06, "learning_rate": 0.0024625195873710006, "loss": 2.5125, "step": 18648 }, { "crossentropy": 2.5086092948913574, "epoch": 0.6760803364269141, "grad_norm": 0.02574499323964119, "grad_norm_var": 1.4016215487382943e-06, "learning_rate": 0.0024620189079121715, "loss": 2.5605, "step": 18649 }, { "crossentropy": 2.5614261627197266, "epoch": 0.6761165893271461, "grad_norm": 0.02659907191991806, "grad_norm_var": 1.4028636012380336e-06, "learning_rate": 0.002461518262732629, "loss": 2.4926, "step": 18650 }, { "crossentropy": 2.399031400680542, "epoch": 0.6761528422273781, "grad_norm": 0.026824623346328735, "grad_norm_var": 1.3869397782338583e-06, "learning_rate": 0.0024610176518391363, "loss": 2.5226, "step": 18651 }, { "crossentropy": 2.39261794090271, "epoch": 0.6761890951276102, "grad_norm": 0.02674468606710434, "grad_norm_var": 1.390069384450673e-06, "learning_rate": 0.0024605170752384515, "loss": 2.5039, "step": 18652 }, { "crossentropy": 2.54587459564209, "epoch": 0.6762253480278422, "grad_norm": 0.0266359094530344, "grad_norm_var": 1.311931117316025e-06, "learning_rate": 0.002460016532937339, "loss": 2.575, "step": 18653 }, { "crossentropy": 2.47641658782959, "epoch": 0.6762616009280742, "grad_norm": 0.027136271819472313, "grad_norm_var": 1.3100589155822772e-06, "learning_rate": 0.002459516024942558, "loss": 2.4714, "step": 18654 }, { "crossentropy": 2.4119815826416016, "epoch": 0.6762978538283063, "grad_norm": 0.0276974868029356, "grad_norm_var": 1.283271572440287e-06, "learning_rate": 0.002459015551260867, "loss": 2.403, "step": 18655 }, { "crossentropy": 2.3831732273101807, "epoch": 0.6763341067285383, "grad_norm": 0.026419274508953094, "grad_norm_var": 3.378796257988281e-07, "learning_rate": 0.0024585151118990283, "loss": 2.4629, "step": 18656 }, { "crossentropy": 2.5631558895111084, "epoch": 0.6763703596287703, "grad_norm": 0.027941977605223656, "grad_norm_var": 3.9538218364319237e-07, "learning_rate": 0.002458014706863798, "loss": 2.6276, "step": 18657 }, { "crossentropy": 2.679478645324707, "epoch": 0.6764066125290024, "grad_norm": 0.03035500831902027, "grad_norm_var": 1.0329513252450706e-06, "learning_rate": 0.002457514336161937, "loss": 2.5896, "step": 18658 }, { "crossentropy": 2.6029140949249268, "epoch": 0.6764428654292344, "grad_norm": 0.027390873059630394, "grad_norm_var": 1.0384848174273728e-06, "learning_rate": 0.002457013999800205, "loss": 2.5464, "step": 18659 }, { "crossentropy": 2.4511735439300537, "epoch": 0.6764791183294664, "grad_norm": 0.02853768691420555, "grad_norm_var": 1.1583411093852297e-06, "learning_rate": 0.002456513697785356, "loss": 2.4919, "step": 18660 }, { "crossentropy": 2.3764023780822754, "epoch": 0.6765153712296984, "grad_norm": 0.026142174378037453, "grad_norm_var": 1.1943624175488863e-06, "learning_rate": 0.0024560134301241515, "loss": 2.4222, "step": 18661 }, { "crossentropy": 2.548095464706421, "epoch": 0.6765516241299304, "grad_norm": 0.026639094576239586, "grad_norm_var": 1.2047622675554475e-06, "learning_rate": 0.002455513196823346, "loss": 2.522, "step": 18662 }, { "crossentropy": 2.3715810775756836, "epoch": 0.6765878770301624, "grad_norm": 0.027257734909653664, "grad_norm_var": 1.1984933138535954e-06, "learning_rate": 0.0024550129978896944, "loss": 2.4548, "step": 18663 }, { "crossentropy": 2.5441012382507324, "epoch": 0.6766241299303944, "grad_norm": 0.026623601093888283, "grad_norm_var": 1.2059374716158015e-06, "learning_rate": 0.0024545128333299565, "loss": 2.5971, "step": 18664 }, { "crossentropy": 2.6110761165618896, "epoch": 0.6766603828306265, "grad_norm": 0.027748992666602135, "grad_norm_var": 1.0766698146166906e-06, "learning_rate": 0.0024540127031508836, "loss": 2.491, "step": 18665 }, { "crossentropy": 2.436248779296875, "epoch": 0.6766966357308585, "grad_norm": 0.0250684916973114, "grad_norm_var": 1.364784557070448e-06, "learning_rate": 0.002453512607359233, "loss": 2.4866, "step": 18666 }, { "crossentropy": 2.4450955390930176, "epoch": 0.6767328886310905, "grad_norm": 0.028565488755702972, "grad_norm_var": 1.4675910286163306e-06, "learning_rate": 0.0024530125459617607, "loss": 2.5647, "step": 18667 }, { "crossentropy": 2.616027593612671, "epoch": 0.6767691415313225, "grad_norm": 0.026260752230882645, "grad_norm_var": 1.5184818307338675e-06, "learning_rate": 0.0024525125189652176, "loss": 2.6184, "step": 18668 }, { "crossentropy": 2.574244976043701, "epoch": 0.6768053944315545, "grad_norm": 0.0260162353515625, "grad_norm_var": 1.5953928021750567e-06, "learning_rate": 0.0024520125263763604, "loss": 2.497, "step": 18669 }, { "crossentropy": 2.546123743057251, "epoch": 0.6768416473317865, "grad_norm": 0.026051770895719528, "grad_norm_var": 1.6835496455371805e-06, "learning_rate": 0.002451512568201941, "loss": 2.6329, "step": 18670 }, { "crossentropy": 2.4976279735565186, "epoch": 0.6768779002320185, "grad_norm": 0.027262534946203232, "grad_norm_var": 1.6647705768100633e-06, "learning_rate": 0.0024510126444487104, "loss": 2.4179, "step": 18671 }, { "crossentropy": 2.5100202560424805, "epoch": 0.6769141531322506, "grad_norm": 0.02655143290758133, "grad_norm_var": 1.6531162855900247e-06, "learning_rate": 0.002450512755123423, "loss": 2.4938, "step": 18672 }, { "crossentropy": 2.364523410797119, "epoch": 0.6769504060324826, "grad_norm": 0.025683967396616936, "grad_norm_var": 1.7336008990951243e-06, "learning_rate": 0.0024500129002328285, "loss": 2.4977, "step": 18673 }, { "crossentropy": 2.5434577465057373, "epoch": 0.6769866589327146, "grad_norm": 0.028755517676472664, "grad_norm_var": 1.1800689953931262e-06, "learning_rate": 0.0024495130797836796, "loss": 2.4991, "step": 18674 }, { "crossentropy": 2.480637311935425, "epoch": 0.6770229118329466, "grad_norm": 0.03301027789711952, "grad_norm_var": 3.5141429178925e-06, "learning_rate": 0.0024490132937827294, "loss": 2.4832, "step": 18675 }, { "crossentropy": 2.4483072757720947, "epoch": 0.6770591647331786, "grad_norm": 0.027459071949124336, "grad_norm_var": 3.4032467013689686e-06, "learning_rate": 0.0024485135422367232, "loss": 2.4999, "step": 18676 }, { "crossentropy": 2.768115758895874, "epoch": 0.6770954176334106, "grad_norm": 0.026761557906866074, "grad_norm_var": 3.3403948981029168e-06, "learning_rate": 0.0024480138251524158, "loss": 2.6604, "step": 18677 }, { "crossentropy": 2.443936347961426, "epoch": 0.6771316705336426, "grad_norm": 0.025540009140968323, "grad_norm_var": 3.5028227848864223e-06, "learning_rate": 0.002447514142536553, "loss": 2.566, "step": 18678 }, { "crossentropy": 2.5286622047424316, "epoch": 0.6771679234338747, "grad_norm": 0.02624158002436161, "grad_norm_var": 3.5546029936738e-06, "learning_rate": 0.0024470144943958866, "loss": 2.5219, "step": 18679 }, { "crossentropy": 2.7422590255737305, "epoch": 0.6772041763341067, "grad_norm": 0.027956565842032433, "grad_norm_var": 3.58096870955258e-06, "learning_rate": 0.002446514880737164, "loss": 2.6157, "step": 18680 }, { "crossentropy": 2.587653636932373, "epoch": 0.6772404292343387, "grad_norm": 0.02665485441684723, "grad_norm_var": 3.5732769312665634e-06, "learning_rate": 0.0024460153015671312, "loss": 2.5079, "step": 18681 }, { "crossentropy": 2.6336493492126465, "epoch": 0.6772766821345708, "grad_norm": 0.02654942311346531, "grad_norm_var": 3.306249493343951e-06, "learning_rate": 0.0024455157568925375, "loss": 2.6146, "step": 18682 }, { "crossentropy": 2.5259716510772705, "epoch": 0.6773129350348028, "grad_norm": 0.02847468852996826, "grad_norm_var": 3.2903248151079183e-06, "learning_rate": 0.002445016246720132, "loss": 2.5409, "step": 18683 }, { "crossentropy": 2.554658889770508, "epoch": 0.6773491879350348, "grad_norm": 0.025633463636040688, "grad_norm_var": 3.3936333344467082e-06, "learning_rate": 0.002444516771056657, "loss": 2.4944, "step": 18684 }, { "crossentropy": 2.579267978668213, "epoch": 0.6773854408352669, "grad_norm": 0.02830289863049984, "grad_norm_var": 3.3708960617448784e-06, "learning_rate": 0.0024440173299088634, "loss": 2.6075, "step": 18685 }, { "crossentropy": 2.514756679534912, "epoch": 0.6774216937354989, "grad_norm": 0.027850156649947166, "grad_norm_var": 3.2723836759436607e-06, "learning_rate": 0.002443517923283493, "loss": 2.5832, "step": 18686 }, { "crossentropy": 2.6688971519470215, "epoch": 0.6774579466357309, "grad_norm": 0.02655397728085518, "grad_norm_var": 3.3184495134267693e-06, "learning_rate": 0.0024430185511872945, "loss": 2.6934, "step": 18687 }, { "crossentropy": 2.3707115650177, "epoch": 0.6774941995359629, "grad_norm": 0.02712073363363743, "grad_norm_var": 3.276289184194851e-06, "learning_rate": 0.0024425192136270104, "loss": 2.4735, "step": 18688 }, { "crossentropy": 2.615574359893799, "epoch": 0.6775304524361949, "grad_norm": 0.026355158537626266, "grad_norm_var": 3.150041867867404e-06, "learning_rate": 0.002442019910609383, "loss": 2.5487, "step": 18689 }, { "crossentropy": 2.5736491680145264, "epoch": 0.6775667053364269, "grad_norm": 0.026297256350517273, "grad_norm_var": 3.10023364185983e-06, "learning_rate": 0.0024415206421411597, "loss": 2.5002, "step": 18690 }, { "crossentropy": 2.359164237976074, "epoch": 0.677602958236659, "grad_norm": 0.02504817396402359, "grad_norm_var": 9.977741909611427e-07, "learning_rate": 0.0024410214082290845, "loss": 2.3671, "step": 18691 }, { "crossentropy": 2.329054355621338, "epoch": 0.677639211136891, "grad_norm": 0.027221063151955605, "grad_norm_var": 9.803985261224815e-07, "learning_rate": 0.002440522208879896, "loss": 2.4931, "step": 18692 }, { "crossentropy": 2.4951963424682617, "epoch": 0.677675464037123, "grad_norm": 0.02677384577691555, "grad_norm_var": 9.80369396162449e-07, "learning_rate": 0.002440023044100342, "loss": 2.5609, "step": 18693 }, { "crossentropy": 2.3958446979522705, "epoch": 0.677711716937355, "grad_norm": 0.02751191146671772, "grad_norm_var": 8.958333686308213e-07, "learning_rate": 0.002439523913897159, "loss": 2.5586, "step": 18694 }, { "crossentropy": 2.5017967224121094, "epoch": 0.677747969837587, "grad_norm": 0.02652980014681816, "grad_norm_var": 8.753725748282162e-07, "learning_rate": 0.002439024818277091, "loss": 2.5328, "step": 18695 }, { "crossentropy": 2.448747158050537, "epoch": 0.677784222737819, "grad_norm": 0.02753775380551815, "grad_norm_var": 8.288495654458119e-07, "learning_rate": 0.0024385257572468837, "loss": 2.489, "step": 18696 }, { "crossentropy": 2.560209035873413, "epoch": 0.677820475638051, "grad_norm": 0.028790714219212532, "grad_norm_var": 1.0438854402191704e-06, "learning_rate": 0.00243802673081327, "loss": 2.5454, "step": 18697 }, { "crossentropy": 2.548774480819702, "epoch": 0.677856728538283, "grad_norm": 0.027432924136519432, "grad_norm_var": 1.0355364188293613e-06, "learning_rate": 0.002437527738982992, "loss": 2.5088, "step": 18698 }, { "crossentropy": 2.5205628871917725, "epoch": 0.6778929814385151, "grad_norm": 0.027617312967777252, "grad_norm_var": 9.231474872537296e-07, "learning_rate": 0.0024370287817627933, "loss": 2.5501, "step": 18699 }, { "crossentropy": 2.699403762817383, "epoch": 0.6779292343387471, "grad_norm": 0.026945898309350014, "grad_norm_var": 7.853586224450145e-07, "learning_rate": 0.0024365298591594083, "loss": 2.6358, "step": 18700 }, { "crossentropy": 2.4418537616729736, "epoch": 0.6779654872389791, "grad_norm": 0.025913871824741364, "grad_norm_var": 7.646716806431238e-07, "learning_rate": 0.002436030971179579, "loss": 2.4586, "step": 18701 }, { "crossentropy": 2.3269739151000977, "epoch": 0.6780017401392111, "grad_norm": 0.025545019656419754, "grad_norm_var": 8.258840801221463e-07, "learning_rate": 0.002435532117830041, "loss": 2.4261, "step": 18702 }, { "crossentropy": 2.630295991897583, "epoch": 0.6780379930394431, "grad_norm": 0.029618211090564728, "grad_norm_var": 1.3021164421931723e-06, "learning_rate": 0.0024350332991175338, "loss": 2.5369, "step": 18703 }, { "crossentropy": 2.4613306522369385, "epoch": 0.6780742459396751, "grad_norm": 0.027679741382598877, "grad_norm_var": 1.329436302520516e-06, "learning_rate": 0.002434534515048798, "loss": 2.5125, "step": 18704 }, { "crossentropy": 2.556483507156372, "epoch": 0.6781104988399071, "grad_norm": 0.028190050274133682, "grad_norm_var": 1.3695832419421353e-06, "learning_rate": 0.002434035765630563, "loss": 2.5776, "step": 18705 }, { "crossentropy": 2.5786261558532715, "epoch": 0.6781467517401392, "grad_norm": 0.027340693399310112, "grad_norm_var": 1.3167882049894622e-06, "learning_rate": 0.0024335370508695685, "loss": 2.4977, "step": 18706 }, { "crossentropy": 2.5547780990600586, "epoch": 0.6781830046403712, "grad_norm": 0.026415344327688217, "grad_norm_var": 1.0356931427581614e-06, "learning_rate": 0.002433038370772553, "loss": 2.5546, "step": 18707 }, { "crossentropy": 2.3090169429779053, "epoch": 0.6782192575406032, "grad_norm": 0.026223869994282722, "grad_norm_var": 1.1105332719342497e-06, "learning_rate": 0.0024325397253462484, "loss": 2.3674, "step": 18708 }, { "crossentropy": 2.4547388553619385, "epoch": 0.6782555104408353, "grad_norm": 0.026113100349903107, "grad_norm_var": 1.1801374123667027e-06, "learning_rate": 0.0024320411145973926, "loss": 2.4656, "step": 18709 }, { "crossentropy": 2.485771417617798, "epoch": 0.6782917633410673, "grad_norm": 0.02712343819439411, "grad_norm_var": 1.1740810603981608e-06, "learning_rate": 0.0024315425385327163, "loss": 2.4674, "step": 18710 }, { "crossentropy": 2.5283806324005127, "epoch": 0.6783280162412993, "grad_norm": 0.025097064673900604, "grad_norm_var": 1.4282299199806866e-06, "learning_rate": 0.002431043997158956, "loss": 2.5205, "step": 18711 }, { "crossentropy": 2.3970892429351807, "epoch": 0.6783642691415314, "grad_norm": 0.0258072130382061, "grad_norm_var": 1.5141801615066337e-06, "learning_rate": 0.002430545490482847, "loss": 2.4392, "step": 18712 }, { "crossentropy": 2.4910082817077637, "epoch": 0.6784005220417634, "grad_norm": 0.02788444049656391, "grad_norm_var": 1.3480306832823359e-06, "learning_rate": 0.0024300470185111204, "loss": 2.5065, "step": 18713 }, { "crossentropy": 2.519261360168457, "epoch": 0.6784367749419954, "grad_norm": 0.02699849382042885, "grad_norm_var": 1.3309417705531599e-06, "learning_rate": 0.0024295485812505075, "loss": 2.5344, "step": 18714 }, { "crossentropy": 2.5942647457122803, "epoch": 0.6784730278422274, "grad_norm": 0.026180315762758255, "grad_norm_var": 1.3239272525543036e-06, "learning_rate": 0.002429050178707743, "loss": 2.6117, "step": 18715 }, { "crossentropy": 2.586750030517578, "epoch": 0.6785092807424594, "grad_norm": 0.025535928085446358, "grad_norm_var": 1.424001890471431e-06, "learning_rate": 0.002428551810889556, "loss": 2.5528, "step": 18716 }, { "crossentropy": 2.33048677444458, "epoch": 0.6785455336426914, "grad_norm": 0.02834179997444153, "grad_norm_var": 1.5284961536790684e-06, "learning_rate": 0.002428053477802681, "loss": 2.4258, "step": 18717 }, { "crossentropy": 2.5368964672088623, "epoch": 0.6785817865429234, "grad_norm": 0.02753373607993126, "grad_norm_var": 1.421452881614664e-06, "learning_rate": 0.0024275551794538442, "loss": 2.5367, "step": 18718 }, { "crossentropy": 2.381051778793335, "epoch": 0.6786180394431555, "grad_norm": 0.027133841067552567, "grad_norm_var": 9.416555808528047e-07, "learning_rate": 0.002427056915849779, "loss": 2.4883, "step": 18719 }, { "crossentropy": 2.493302345275879, "epoch": 0.6786542923433875, "grad_norm": 0.02562171034514904, "grad_norm_var": 9.786734122350927e-07, "learning_rate": 0.0024265586869972175, "loss": 2.5237, "step": 18720 }, { "crossentropy": 2.5675604343414307, "epoch": 0.6786905452436195, "grad_norm": 0.02560601755976677, "grad_norm_var": 8.899653102657316e-07, "learning_rate": 0.002426060492902885, "loss": 2.5584, "step": 18721 }, { "crossentropy": 2.6046741008758545, "epoch": 0.6787267981438515, "grad_norm": 0.02874653972685337, "grad_norm_var": 1.1598636090838384e-06, "learning_rate": 0.00242556233357351, "loss": 2.6027, "step": 18722 }, { "crossentropy": 2.4528000354766846, "epoch": 0.6787630510440835, "grad_norm": 0.02786032296717167, "grad_norm_var": 1.2455989546773418e-06, "learning_rate": 0.002425064209015825, "loss": 2.4899, "step": 18723 }, { "crossentropy": 2.5681567192077637, "epoch": 0.6787993039443155, "grad_norm": 0.027225008234381676, "grad_norm_var": 1.2396140215693244e-06, "learning_rate": 0.0024245661192365533, "loss": 2.5524, "step": 18724 }, { "crossentropy": 2.5091359615325928, "epoch": 0.6788355568445475, "grad_norm": 0.026572680100798607, "grad_norm_var": 1.2106891572433511e-06, "learning_rate": 0.002424068064242426, "loss": 2.485, "step": 18725 }, { "crossentropy": 2.6049270629882812, "epoch": 0.6788718097447796, "grad_norm": 0.030678441748023033, "grad_norm_var": 2.1399963448714975e-06, "learning_rate": 0.002423570044040167, "loss": 2.5846, "step": 18726 }, { "crossentropy": 2.535358428955078, "epoch": 0.6789080626450116, "grad_norm": 0.02966344729065895, "grad_norm_var": 2.253294033225696e-06, "learning_rate": 0.0024230720586365046, "loss": 2.5326, "step": 18727 }, { "crossentropy": 2.520461320877075, "epoch": 0.6789443155452436, "grad_norm": 0.02751218155026436, "grad_norm_var": 2.0872405594738255e-06, "learning_rate": 0.0024225741080381663, "loss": 2.5796, "step": 18728 }, { "crossentropy": 2.534670352935791, "epoch": 0.6789805684454756, "grad_norm": 0.027314256876707077, "grad_norm_var": 2.074032417538835e-06, "learning_rate": 0.0024220761922518763, "loss": 2.5825, "step": 18729 }, { "crossentropy": 2.5040767192840576, "epoch": 0.6790168213457076, "grad_norm": 0.026145311072468758, "grad_norm_var": 2.166088636774456e-06, "learning_rate": 0.002421578311284357, "loss": 2.5418, "step": 18730 }, { "crossentropy": 2.570843458175659, "epoch": 0.6790530742459396, "grad_norm": 0.027106106281280518, "grad_norm_var": 2.0747203850717915e-06, "learning_rate": 0.0024210804651423378, "loss": 2.5706, "step": 18731 }, { "crossentropy": 2.4161441326141357, "epoch": 0.6790893271461717, "grad_norm": 0.028090760111808777, "grad_norm_var": 1.843481699613615e-06, "learning_rate": 0.0024205826538325386, "loss": 2.5154, "step": 18732 }, { "crossentropy": 2.466623067855835, "epoch": 0.6791255800464037, "grad_norm": 0.026521604508161545, "grad_norm_var": 1.8637287560264397e-06, "learning_rate": 0.0024200848773616875, "loss": 2.5263, "step": 18733 }, { "crossentropy": 2.792966604232788, "epoch": 0.6791618329466357, "grad_norm": 0.02613298036158085, "grad_norm_var": 1.9722622988771475e-06, "learning_rate": 0.0024195871357365027, "loss": 2.659, "step": 18734 }, { "crossentropy": 2.4889378547668457, "epoch": 0.6791980858468677, "grad_norm": 0.02653472311794758, "grad_norm_var": 2.0136171010582168e-06, "learning_rate": 0.00241908942896371, "loss": 2.5031, "step": 18735 }, { "crossentropy": 2.4909048080444336, "epoch": 0.6792343387470998, "grad_norm": 0.0261774230748415, "grad_norm_var": 1.906101132420609e-06, "learning_rate": 0.0024185917570500332, "loss": 2.549, "step": 18736 }, { "crossentropy": 2.4527273178100586, "epoch": 0.6792705916473318, "grad_norm": 0.025545375421643257, "grad_norm_var": 1.9205775931417647e-06, "learning_rate": 0.00241809412000219, "loss": 2.5554, "step": 18737 }, { "crossentropy": 2.5027735233306885, "epoch": 0.6793068445475638, "grad_norm": 0.02605435810983181, "grad_norm_var": 1.8773655866327968e-06, "learning_rate": 0.0024175965178269056, "loss": 2.5367, "step": 18738 }, { "crossentropy": 2.6535134315490723, "epoch": 0.6793430974477959, "grad_norm": 0.026652084663510323, "grad_norm_var": 1.861573918350047e-06, "learning_rate": 0.0024170989505308996, "loss": 2.5666, "step": 18739 }, { "crossentropy": 2.574276924133301, "epoch": 0.6793793503480279, "grad_norm": 0.025737952440977097, "grad_norm_var": 1.9790454921150227e-06, "learning_rate": 0.0024166014181208895, "loss": 2.5425, "step": 18740 }, { "crossentropy": 2.483264207839966, "epoch": 0.6794156032482599, "grad_norm": 0.026128558441996574, "grad_norm_var": 2.0183048014341207e-06, "learning_rate": 0.0024161039206036, "loss": 2.5344, "step": 18741 }, { "crossentropy": 2.5782837867736816, "epoch": 0.6794518561484919, "grad_norm": 0.02584230527281761, "grad_norm_var": 1.1079633946013227e-06, "learning_rate": 0.0024156064579857464, "loss": 2.5572, "step": 18742 }, { "crossentropy": 2.560011386871338, "epoch": 0.6794881090487239, "grad_norm": 0.02670333907008171, "grad_norm_var": 4.849860142280375e-07, "learning_rate": 0.0024151090302740492, "loss": 2.5796, "step": 18743 }, { "crossentropy": 2.504549264907837, "epoch": 0.6795243619489559, "grad_norm": 0.02691948600113392, "grad_norm_var": 4.2793725096381257e-07, "learning_rate": 0.002414611637475229, "loss": 2.5704, "step": 18744 }, { "crossentropy": 2.5673441886901855, "epoch": 0.679560614849188, "grad_norm": 0.026690689846873283, "grad_norm_var": 3.824961938202198e-07, "learning_rate": 0.0024141142795960006, "loss": 2.4912, "step": 18745 }, { "crossentropy": 2.6196863651275635, "epoch": 0.67959686774942, "grad_norm": 0.026703616604208946, "grad_norm_var": 3.803058256002592e-07, "learning_rate": 0.0024136169566430854, "loss": 2.6281, "step": 18746 }, { "crossentropy": 2.6589982509613037, "epoch": 0.679633120649652, "grad_norm": 0.028750626370310783, "grad_norm_var": 6.885195583849196e-07, "learning_rate": 0.0024131196686231976, "loss": 2.5847, "step": 18747 }, { "crossentropy": 2.662660598754883, "epoch": 0.679669373549884, "grad_norm": 0.02575760893523693, "grad_norm_var": 5.569367430742489e-07, "learning_rate": 0.0024126224155430528, "loss": 2.5795, "step": 18748 }, { "crossentropy": 2.3657844066619873, "epoch": 0.679705626450116, "grad_norm": 0.028363758698105812, "grad_norm_var": 7.919510424002439e-07, "learning_rate": 0.0024121251974093707, "loss": 2.5017, "step": 18749 }, { "crossentropy": 2.569836378097534, "epoch": 0.679741879350348, "grad_norm": 0.027445586398243904, "grad_norm_var": 8.277998952708425e-07, "learning_rate": 0.0024116280142288627, "loss": 2.5645, "step": 18750 }, { "crossentropy": 2.591709852218628, "epoch": 0.67977813225058, "grad_norm": 0.0296609029173851, "grad_norm_var": 1.4007876452397255e-06, "learning_rate": 0.0024111308660082465, "loss": 2.5634, "step": 18751 }, { "crossentropy": 2.347075939178467, "epoch": 0.679814385150812, "grad_norm": 0.026907026767730713, "grad_norm_var": 1.3714644145519506e-06, "learning_rate": 0.002410633752754239, "loss": 2.4712, "step": 18752 }, { "crossentropy": 2.4868268966674805, "epoch": 0.6798506380510441, "grad_norm": 0.026311667636036873, "grad_norm_var": 1.2731869424830218e-06, "learning_rate": 0.0024101366744735497, "loss": 2.4891, "step": 18753 }, { "crossentropy": 2.548532009124756, "epoch": 0.6798868909512761, "grad_norm": 0.02633105032145977, "grad_norm_var": 1.2462448527040717e-06, "learning_rate": 0.002409639631172897, "loss": 2.5377, "step": 18754 }, { "crossentropy": 2.3618345260620117, "epoch": 0.6799231438515081, "grad_norm": 0.026306908577680588, "grad_norm_var": 1.266557678392989e-06, "learning_rate": 0.0024091426228589918, "loss": 2.4917, "step": 18755 }, { "crossentropy": 2.3971452713012695, "epoch": 0.6799593967517401, "grad_norm": 0.026102809235453606, "grad_norm_var": 1.2178571429221347e-06, "learning_rate": 0.0024086456495385463, "loss": 2.4193, "step": 18756 }, { "crossentropy": 2.528243064880371, "epoch": 0.6799956496519721, "grad_norm": 0.02677067555487156, "grad_norm_var": 1.1747650504797483e-06, "learning_rate": 0.002408148711218275, "loss": 2.5704, "step": 18757 }, { "crossentropy": 2.563744306564331, "epoch": 0.6800319025522041, "grad_norm": 0.026898568496108055, "grad_norm_var": 1.085253786984169e-06, "learning_rate": 0.0024076518079048865, "loss": 2.5402, "step": 18758 }, { "crossentropy": 2.5693788528442383, "epoch": 0.6800681554524362, "grad_norm": 0.027002928778529167, "grad_norm_var": 1.0774545344096557e-06, "learning_rate": 0.0024071549396050946, "loss": 2.5598, "step": 18759 }, { "crossentropy": 2.5111095905303955, "epoch": 0.6801044083526682, "grad_norm": 0.025835329666733742, "grad_norm_var": 1.1709025551169625e-06, "learning_rate": 0.0024066581063256123, "loss": 2.4993, "step": 18760 }, { "crossentropy": 2.389063835144043, "epoch": 0.6801406612529002, "grad_norm": 0.02555171400308609, "grad_norm_var": 1.2974336223806986e-06, "learning_rate": 0.0024061613080731455, "loss": 2.3921, "step": 18761 }, { "crossentropy": 2.450301170349121, "epoch": 0.6801769141531323, "grad_norm": 0.02727649360895157, "grad_norm_var": 1.3015089941628554e-06, "learning_rate": 0.0024056645448544086, "loss": 2.5938, "step": 18762 }, { "crossentropy": 2.5978167057037354, "epoch": 0.6802131670533643, "grad_norm": 0.027342524379491806, "grad_norm_var": 1.0882331608984737e-06, "learning_rate": 0.0024051678166761093, "loss": 2.5887, "step": 18763 }, { "crossentropy": 2.398993968963623, "epoch": 0.6802494199535963, "grad_norm": 0.027004912495613098, "grad_norm_var": 1.0010358912226296e-06, "learning_rate": 0.0024046711235449544, "loss": 2.4694, "step": 18764 }, { "crossentropy": 2.3820693492889404, "epoch": 0.6802856728538283, "grad_norm": 0.02771027386188507, "grad_norm_var": 9.040688413395245e-07, "learning_rate": 0.0024041744654676555, "loss": 2.4338, "step": 18765 }, { "crossentropy": 2.5446982383728027, "epoch": 0.6803219257540604, "grad_norm": 0.026729203760623932, "grad_norm_var": 8.84385393244919e-07, "learning_rate": 0.0024036778424509193, "loss": 2.4836, "step": 18766 }, { "crossentropy": 2.650064706802368, "epoch": 0.6803581786542924, "grad_norm": 0.02593313157558441, "grad_norm_var": 3.6022434857309733e-07, "learning_rate": 0.0024031812545014532, "loss": 2.5263, "step": 18767 }, { "crossentropy": 2.455885410308838, "epoch": 0.6803944315545244, "grad_norm": 0.02887020632624626, "grad_norm_var": 6.746775546432089e-07, "learning_rate": 0.002402684701625967, "loss": 2.4616, "step": 18768 }, { "crossentropy": 2.51212739944458, "epoch": 0.6804306844547564, "grad_norm": 0.026649368926882744, "grad_norm_var": 6.621292601515268e-07, "learning_rate": 0.0024021881838311634, "loss": 2.5387, "step": 18769 }, { "crossentropy": 2.486842393875122, "epoch": 0.6804669373549884, "grad_norm": 0.02748902700841427, "grad_norm_var": 6.782013080888396e-07, "learning_rate": 0.002401691701123753, "loss": 2.4128, "step": 18770 }, { "crossentropy": 2.5688347816467285, "epoch": 0.6805031902552204, "grad_norm": 0.025145690888166428, "grad_norm_var": 8.453457406334588e-07, "learning_rate": 0.002401195253510439, "loss": 2.5414, "step": 18771 }, { "crossentropy": 2.487485408782959, "epoch": 0.6805394431554525, "grad_norm": 0.0262246523052454, "grad_norm_var": 8.354418406531268e-07, "learning_rate": 0.0024006988409979247, "loss": 2.5375, "step": 18772 }, { "crossentropy": 2.540708541870117, "epoch": 0.6805756960556845, "grad_norm": 0.027618946507573128, "grad_norm_var": 8.796801561511038e-07, "learning_rate": 0.0024002024635929194, "loss": 2.5803, "step": 18773 }, { "crossentropy": 2.553992986679077, "epoch": 0.6806119489559165, "grad_norm": 0.025868264958262444, "grad_norm_var": 9.366314994330479e-07, "learning_rate": 0.002399706121302123, "loss": 2.5852, "step": 18774 }, { "crossentropy": 2.5749387741088867, "epoch": 0.6806482018561485, "grad_norm": 0.02715146169066429, "grad_norm_var": 9.427067285456651e-07, "learning_rate": 0.0023992098141322423, "loss": 2.5269, "step": 18775 }, { "crossentropy": 2.382371664047241, "epoch": 0.6806844547563805, "grad_norm": 0.025354351848363876, "grad_norm_var": 1.017431686637958e-06, "learning_rate": 0.0023987135420899808, "loss": 2.4208, "step": 18776 }, { "crossentropy": 2.4718401432037354, "epoch": 0.6807207076566125, "grad_norm": 0.026170600205659866, "grad_norm_var": 9.429015236405199e-07, "learning_rate": 0.0023982173051820396, "loss": 2.4726, "step": 18777 }, { "crossentropy": 2.5260329246520996, "epoch": 0.6807569605568445, "grad_norm": 0.027130931615829468, "grad_norm_var": 9.346614139335661e-07, "learning_rate": 0.002397721103415123, "loss": 2.4898, "step": 18778 }, { "crossentropy": 2.5527844429016113, "epoch": 0.6807932134570766, "grad_norm": 0.026921693235635757, "grad_norm_var": 9.138632088780448e-07, "learning_rate": 0.002397224936795931, "loss": 2.6059, "step": 18779 }, { "crossentropy": 2.5258145332336426, "epoch": 0.6808294663573086, "grad_norm": 0.026860464364290237, "grad_norm_var": 9.102248954081001e-07, "learning_rate": 0.002396728805331167, "loss": 2.5354, "step": 18780 }, { "crossentropy": 2.399583339691162, "epoch": 0.6808657192575406, "grad_norm": 0.025755271315574646, "grad_norm_var": 8.959925809177375e-07, "learning_rate": 0.0023962327090275314, "loss": 2.478, "step": 18781 }, { "crossentropy": 2.567481756210327, "epoch": 0.6809019721577726, "grad_norm": 0.02741895616054535, "grad_norm_var": 9.360392410470795e-07, "learning_rate": 0.002395736647891723, "loss": 2.5672, "step": 18782 }, { "crossentropy": 2.6022355556488037, "epoch": 0.6809382250580046, "grad_norm": 0.026409948244690895, "grad_norm_var": 9.040258147624894e-07, "learning_rate": 0.0023952406219304428, "loss": 2.6446, "step": 18783 }, { "crossentropy": 2.3918097019195557, "epoch": 0.6809744779582366, "grad_norm": 0.027437780052423477, "grad_norm_var": 6.158661901913808e-07, "learning_rate": 0.0023947446311503927, "loss": 2.4117, "step": 18784 }, { "crossentropy": 2.5091450214385986, "epoch": 0.6810107308584686, "grad_norm": 0.02622632496058941, "grad_norm_var": 6.24293001241262e-07, "learning_rate": 0.002394248675558269, "loss": 2.5028, "step": 18785 }, { "crossentropy": 2.384556531906128, "epoch": 0.6810469837587007, "grad_norm": 0.025901852175593376, "grad_norm_var": 5.881020353421321e-07, "learning_rate": 0.0023937527551607723, "loss": 2.5352, "step": 18786 }, { "crossentropy": 2.4911303520202637, "epoch": 0.6810832366589327, "grad_norm": 0.025432230904698372, "grad_norm_var": 5.42453617057116e-07, "learning_rate": 0.002393256869964598, "loss": 2.5216, "step": 18787 }, { "crossentropy": 2.690715789794922, "epoch": 0.6811194895591647, "grad_norm": 0.02762479893863201, "grad_norm_var": 6.14932278576484e-07, "learning_rate": 0.0023927610199764478, "loss": 2.5646, "step": 18788 }, { "crossentropy": 2.352327346801758, "epoch": 0.6811557424593968, "grad_norm": 0.02653546631336212, "grad_norm_var": 5.382474736618029e-07, "learning_rate": 0.0023922652052030162, "loss": 2.4946, "step": 18789 }, { "crossentropy": 2.558969020843506, "epoch": 0.6811919953596288, "grad_norm": 0.026358675211668015, "grad_norm_var": 5.111519771474332e-07, "learning_rate": 0.0023917694256509986, "loss": 2.5177, "step": 18790 }, { "crossentropy": 2.422935724258423, "epoch": 0.6812282482598608, "grad_norm": 0.02623981423676014, "grad_norm_var": 4.891567812780513e-07, "learning_rate": 0.002391273681327093, "loss": 2.4859, "step": 18791 }, { "crossentropy": 2.5260980129241943, "epoch": 0.6812645011600929, "grad_norm": 0.025873549282550812, "grad_norm_var": 4.276511412603633e-07, "learning_rate": 0.0023907779722379973, "loss": 2.5771, "step": 18792 }, { "crossentropy": 2.3729357719421387, "epoch": 0.6813007540603249, "grad_norm": 0.027098797261714935, "grad_norm_var": 4.3842382717457216e-07, "learning_rate": 0.0023902822983904027, "loss": 2.5128, "step": 18793 }, { "crossentropy": 2.4464962482452393, "epoch": 0.6813370069605569, "grad_norm": 0.026391085237264633, "grad_norm_var": 4.179578036722064e-07, "learning_rate": 0.002389786659791008, "loss": 2.4284, "step": 18794 }, { "crossentropy": 2.3539657592773438, "epoch": 0.6813732598607889, "grad_norm": 0.025472447276115417, "grad_norm_var": 4.7362045241551114e-07, "learning_rate": 0.0023892910564465037, "loss": 2.4232, "step": 18795 }, { "crossentropy": 2.52921986579895, "epoch": 0.6814095127610209, "grad_norm": 0.025654006749391556, "grad_norm_var": 4.969298591869324e-07, "learning_rate": 0.0023887954883635852, "loss": 2.5535, "step": 18796 }, { "crossentropy": 2.3912594318389893, "epoch": 0.6814457656612529, "grad_norm": 0.026005921885371208, "grad_norm_var": 4.804980767553288e-07, "learning_rate": 0.002388299955548951, "loss": 2.3315, "step": 18797 }, { "crossentropy": 2.4239501953125, "epoch": 0.6814820185614849, "grad_norm": 0.027095122262835503, "grad_norm_var": 4.4219692085043646e-07, "learning_rate": 0.002387804458009285, "loss": 2.4727, "step": 18798 }, { "crossentropy": 2.4273841381073, "epoch": 0.681518271461717, "grad_norm": 0.026761969551444054, "grad_norm_var": 4.5229263003025246e-07, "learning_rate": 0.002387308995751284, "loss": 2.5075, "step": 18799 }, { "crossentropy": 2.708976984024048, "epoch": 0.681554524361949, "grad_norm": 0.030984798446297646, "grad_norm_var": 1.7380062950471948e-06, "learning_rate": 0.002386813568781641, "loss": 2.5509, "step": 18800 }, { "crossentropy": 2.6068475246429443, "epoch": 0.681590777262181, "grad_norm": 0.02604280784726143, "grad_norm_var": 1.749341596088821e-06, "learning_rate": 0.0023863181771070457, "loss": 2.5506, "step": 18801 }, { "crossentropy": 2.574359893798828, "epoch": 0.681627030162413, "grad_norm": 0.02732754498720169, "grad_norm_var": 1.7451712939143265e-06, "learning_rate": 0.002385822820734191, "loss": 2.6098, "step": 18802 }, { "crossentropy": 2.3853166103363037, "epoch": 0.681663283062645, "grad_norm": 0.025944704189896584, "grad_norm_var": 1.6762445241532182e-06, "learning_rate": 0.002385327499669765, "loss": 2.4436, "step": 18803 }, { "crossentropy": 2.6046814918518066, "epoch": 0.681699535962877, "grad_norm": 0.02941540814936161, "grad_norm_var": 2.0942748147853197e-06, "learning_rate": 0.002384832213920458, "loss": 2.5542, "step": 18804 }, { "crossentropy": 2.5005383491516113, "epoch": 0.681735788863109, "grad_norm": 0.031061435118317604, "grad_norm_var": 3.1997467748951305e-06, "learning_rate": 0.002384336963492965, "loss": 2.5424, "step": 18805 }, { "crossentropy": 2.446507453918457, "epoch": 0.681772041763341, "grad_norm": 0.026359781622886658, "grad_norm_var": 3.1996363090820713e-06, "learning_rate": 0.0023838417483939673, "loss": 2.4508, "step": 18806 }, { "crossentropy": 2.368762493133545, "epoch": 0.6818082946635731, "grad_norm": 0.02564230002462864, "grad_norm_var": 3.2911233179500067e-06, "learning_rate": 0.0023833465686301566, "loss": 2.4591, "step": 18807 }, { "crossentropy": 2.7005739212036133, "epoch": 0.6818445475638051, "grad_norm": 0.027305541560053825, "grad_norm_var": 3.1907054646207554e-06, "learning_rate": 0.0023828514242082233, "loss": 2.5168, "step": 18808 }, { "crossentropy": 2.6218409538269043, "epoch": 0.6818808004640371, "grad_norm": 0.027437420561909676, "grad_norm_var": 3.195098421220066e-06, "learning_rate": 0.002382356315134851, "loss": 2.573, "step": 18809 }, { "crossentropy": 2.492466449737549, "epoch": 0.6819170533642691, "grad_norm": 0.02689630538225174, "grad_norm_var": 3.1578140982872646e-06, "learning_rate": 0.002381861241416731, "loss": 2.4541, "step": 18810 }, { "crossentropy": 2.645120859146118, "epoch": 0.6819533062645011, "grad_norm": 0.027322102338075638, "grad_norm_var": 2.9423917674959813e-06, "learning_rate": 0.002381366203060547, "loss": 2.6036, "step": 18811 }, { "crossentropy": 2.470845937728882, "epoch": 0.6819895591647331, "grad_norm": 0.026967473328113556, "grad_norm_var": 2.756951484327435e-06, "learning_rate": 0.002380871200072985, "loss": 2.4661, "step": 18812 }, { "crossentropy": 2.567335367202759, "epoch": 0.6820258120649652, "grad_norm": 0.026744863018393517, "grad_norm_var": 2.6526756302741015e-06, "learning_rate": 0.002380376232460735, "loss": 2.5782, "step": 18813 }, { "crossentropy": 2.497408628463745, "epoch": 0.6820620649651972, "grad_norm": 0.02621687948703766, "grad_norm_var": 2.7432403400330043e-06, "learning_rate": 0.0023798813002304797, "loss": 2.5943, "step": 18814 }, { "crossentropy": 2.536702871322632, "epoch": 0.6820983178654292, "grad_norm": 0.02588324435055256, "grad_norm_var": 2.866483466519446e-06, "learning_rate": 0.002379386403388901, "loss": 2.5138, "step": 18815 }, { "crossentropy": 2.5020742416381836, "epoch": 0.6821345707656613, "grad_norm": 0.02673761174082756, "grad_norm_var": 1.9338626931481375e-06, "learning_rate": 0.0023788915419426884, "loss": 2.4829, "step": 18816 }, { "crossentropy": 2.4754185676574707, "epoch": 0.6821708236658933, "grad_norm": 0.026052704080939293, "grad_norm_var": 1.932498144667028e-06, "learning_rate": 0.0023783967158985202, "loss": 2.4654, "step": 18817 }, { "crossentropy": 2.530776023864746, "epoch": 0.6822070765661253, "grad_norm": 0.02789204753935337, "grad_norm_var": 1.9708804090139604e-06, "learning_rate": 0.0023779019252630857, "loss": 2.5832, "step": 18818 }, { "crossentropy": 2.5821433067321777, "epoch": 0.6822433294663574, "grad_norm": 0.02657623589038849, "grad_norm_var": 1.8970539969343498e-06, "learning_rate": 0.0023774071700430627, "loss": 2.6054, "step": 18819 }, { "crossentropy": 2.575634717941284, "epoch": 0.6822795823665894, "grad_norm": 0.025607265532016754, "grad_norm_var": 1.6566933468959224e-06, "learning_rate": 0.0023769124502451356, "loss": 2.541, "step": 18820 }, { "crossentropy": 2.5068435668945312, "epoch": 0.6823158352668214, "grad_norm": 0.027425656095147133, "grad_norm_var": 4.7471953377262873e-07, "learning_rate": 0.002376417765875989, "loss": 2.4705, "step": 18821 }, { "crossentropy": 2.5367581844329834, "epoch": 0.6823520881670534, "grad_norm": 0.026222847402095795, "grad_norm_var": 4.819518666889765e-07, "learning_rate": 0.002375923116942301, "loss": 2.5505, "step": 18822 }, { "crossentropy": 2.5110394954681396, "epoch": 0.6823883410672854, "grad_norm": 0.02561803162097931, "grad_norm_var": 4.853566652675134e-07, "learning_rate": 0.002375428503450752, "loss": 2.4293, "step": 18823 }, { "crossentropy": 2.6097230911254883, "epoch": 0.6824245939675174, "grad_norm": 0.02811652608215809, "grad_norm_var": 5.93925994581236e-07, "learning_rate": 0.002374933925408026, "loss": 2.5611, "step": 18824 }, { "crossentropy": 2.561570405960083, "epoch": 0.6824608468677494, "grad_norm": 0.026587696745991707, "grad_norm_var": 5.591681415854545e-07, "learning_rate": 0.002374439382820799, "loss": 2.5815, "step": 18825 }, { "crossentropy": 2.6523616313934326, "epoch": 0.6824970997679815, "grad_norm": 0.026483988389372826, "grad_norm_var": 5.578589731648648e-07, "learning_rate": 0.0023739448756957527, "loss": 2.5498, "step": 18826 }, { "crossentropy": 2.6192188262939453, "epoch": 0.6825333526682135, "grad_norm": 0.027858907356858253, "grad_norm_var": 6.237271919584299e-07, "learning_rate": 0.002373450404039568, "loss": 2.6499, "step": 18827 }, { "crossentropy": 2.5360500812530518, "epoch": 0.6825696055684455, "grad_norm": 0.0267525315284729, "grad_norm_var": 6.185765933926079e-07, "learning_rate": 0.0023729559678589193, "loss": 2.5632, "step": 18828 }, { "crossentropy": 2.498849630355835, "epoch": 0.6826058584686775, "grad_norm": 0.027035683393478394, "grad_norm_var": 6.26627287188206e-07, "learning_rate": 0.002372461567160489, "loss": 2.5204, "step": 18829 }, { "crossentropy": 2.4815306663513184, "epoch": 0.6826421113689095, "grad_norm": 0.026238322257995605, "grad_norm_var": 6.252983777728256e-07, "learning_rate": 0.0023719672019509525, "loss": 2.5608, "step": 18830 }, { "crossentropy": 2.5773184299468994, "epoch": 0.6826783642691415, "grad_norm": 0.026831720024347305, "grad_norm_var": 5.791090126910781e-07, "learning_rate": 0.0023714728722369857, "loss": 2.5386, "step": 18831 }, { "crossentropy": 2.523451328277588, "epoch": 0.6827146171693735, "grad_norm": 0.028111683204770088, "grad_norm_var": 6.944113357826645e-07, "learning_rate": 0.002370978578025268, "loss": 2.5174, "step": 18832 }, { "crossentropy": 2.518001079559326, "epoch": 0.6827508700696056, "grad_norm": 0.026208512485027313, "grad_norm_var": 6.796095129600697e-07, "learning_rate": 0.002370484319322473, "loss": 2.5297, "step": 18833 }, { "crossentropy": 2.4872119426727295, "epoch": 0.6827871229698376, "grad_norm": 0.026326926425099373, "grad_norm_var": 6.148304970326428e-07, "learning_rate": 0.002369990096135277, "loss": 2.3992, "step": 18834 }, { "crossentropy": 2.4798502922058105, "epoch": 0.6828233758700696, "grad_norm": 0.02624649368226528, "grad_norm_var": 6.292727296801244e-07, "learning_rate": 0.002369495908470358, "loss": 2.4311, "step": 18835 }, { "crossentropy": 2.4005844593048096, "epoch": 0.6828596287703016, "grad_norm": 0.026088930666446686, "grad_norm_var": 5.71697469022893e-07, "learning_rate": 0.0023690017563343874, "loss": 2.3924, "step": 18836 }, { "crossentropy": 2.5673279762268066, "epoch": 0.6828958816705336, "grad_norm": 0.02618366852402687, "grad_norm_var": 5.578168774625214e-07, "learning_rate": 0.002368507639734042, "loss": 2.5678, "step": 18837 }, { "crossentropy": 2.485095739364624, "epoch": 0.6829321345707656, "grad_norm": 0.0259049404412508, "grad_norm_var": 5.835970485366012e-07, "learning_rate": 0.002368013558675993, "loss": 2.4727, "step": 18838 }, { "crossentropy": 2.507704019546509, "epoch": 0.6829683874709976, "grad_norm": 0.025812039151787758, "grad_norm_var": 5.589403074344878e-07, "learning_rate": 0.0023675195131669163, "loss": 2.4643, "step": 18839 }, { "crossentropy": 2.503269672393799, "epoch": 0.6830046403712297, "grad_norm": 0.025917300954461098, "grad_norm_var": 4.383190678551672e-07, "learning_rate": 0.002367025503213484, "loss": 2.4883, "step": 18840 }, { "crossentropy": 2.5564939975738525, "epoch": 0.6830408932714617, "grad_norm": 0.02641112729907036, "grad_norm_var": 4.390701768427462e-07, "learning_rate": 0.002366531528822366, "loss": 2.5601, "step": 18841 }, { "crossentropy": 2.658461093902588, "epoch": 0.6830771461716937, "grad_norm": 0.026637207716703415, "grad_norm_var": 4.396832883225467e-07, "learning_rate": 0.0023660375900002363, "loss": 2.6096, "step": 18842 }, { "crossentropy": 2.4445738792419434, "epoch": 0.6831133990719258, "grad_norm": 0.027299214154481888, "grad_norm_var": 3.604921871393103e-07, "learning_rate": 0.002365543686753768, "loss": 2.483, "step": 18843 }, { "crossentropy": 2.5984482765197754, "epoch": 0.6831496519721578, "grad_norm": 0.02645397186279297, "grad_norm_var": 3.5602621956763033e-07, "learning_rate": 0.002365049819089628, "loss": 2.5232, "step": 18844 }, { "crossentropy": 2.614043712615967, "epoch": 0.6831859048723898, "grad_norm": 0.026515934616327286, "grad_norm_var": 3.3452128377284287e-07, "learning_rate": 0.0023645559870144905, "loss": 2.5479, "step": 18845 }, { "crossentropy": 2.4781484603881836, "epoch": 0.6832221577726219, "grad_norm": 0.02657540887594223, "grad_norm_var": 3.321428913169444e-07, "learning_rate": 0.0023640621905350223, "loss": 2.5415, "step": 18846 }, { "crossentropy": 2.5942184925079346, "epoch": 0.6832584106728539, "grad_norm": 0.030302751809358597, "grad_norm_var": 1.2524053620413532e-06, "learning_rate": 0.0023635684296578964, "loss": 2.6346, "step": 18847 }, { "crossentropy": 2.516502857208252, "epoch": 0.6832946635730859, "grad_norm": 0.025947220623493195, "grad_norm_var": 1.1341292053003524e-06, "learning_rate": 0.0023630747043897793, "loss": 2.5089, "step": 18848 }, { "crossentropy": 2.6848623752593994, "epoch": 0.6833309164733179, "grad_norm": 0.03483574464917183, "grad_norm_var": 5.390862473468117e-06, "learning_rate": 0.002362581014737339, "loss": 2.6023, "step": 18849 }, { "crossentropy": 2.528116464614868, "epoch": 0.6833671693735499, "grad_norm": 0.026589898392558098, "grad_norm_var": 5.3683876424334555e-06, "learning_rate": 0.0023620873607072437, "loss": 2.5101, "step": 18850 }, { "crossentropy": 2.6504836082458496, "epoch": 0.6834034222737819, "grad_norm": 0.026536142453551292, "grad_norm_var": 5.340374771495279e-06, "learning_rate": 0.002361593742306164, "loss": 2.6335, "step": 18851 }, { "crossentropy": 2.627424478530884, "epoch": 0.6834396751740139, "grad_norm": 0.02646872214972973, "grad_norm_var": 5.2968880915192525e-06, "learning_rate": 0.0023611001595407626, "loss": 2.5466, "step": 18852 }, { "crossentropy": 2.5047707557678223, "epoch": 0.683475928074246, "grad_norm": 0.02770720049738884, "grad_norm_var": 5.245772229108397e-06, "learning_rate": 0.00236060661241771, "loss": 2.4779, "step": 18853 }, { "crossentropy": 2.364400625228882, "epoch": 0.683512180974478, "grad_norm": 0.02593950368463993, "grad_norm_var": 5.239672809115708e-06, "learning_rate": 0.0023601131009436693, "loss": 2.4864, "step": 18854 }, { "crossentropy": 2.614016532897949, "epoch": 0.68354843387471, "grad_norm": 0.02739894948899746, "grad_norm_var": 5.093479554752385e-06, "learning_rate": 0.002359619625125309, "loss": 2.5802, "step": 18855 }, { "crossentropy": 2.553101062774658, "epoch": 0.683584686774942, "grad_norm": 0.02623811550438404, "grad_norm_var": 5.0387983860739526e-06, "learning_rate": 0.002359126184969292, "loss": 2.52, "step": 18856 }, { "crossentropy": 2.501812696456909, "epoch": 0.683620939675174, "grad_norm": 0.026819845661520958, "grad_norm_var": 4.997198727074006e-06, "learning_rate": 0.002358632780482282, "loss": 2.4951, "step": 18857 }, { "crossentropy": 2.2193093299865723, "epoch": 0.683657192575406, "grad_norm": 0.025852853432297707, "grad_norm_var": 5.1145457480676155e-06, "learning_rate": 0.0023581394116709447, "loss": 2.3204, "step": 18858 }, { "crossentropy": 2.5685060024261475, "epoch": 0.683693445475638, "grad_norm": 0.028983410447835922, "grad_norm_var": 5.282087080477754e-06, "learning_rate": 0.002357646078541946, "loss": 2.5176, "step": 18859 }, { "crossentropy": 2.6328487396240234, "epoch": 0.68372969837587, "grad_norm": 0.0271320641040802, "grad_norm_var": 5.2209659223041716e-06, "learning_rate": 0.002357152781101945, "loss": 2.506, "step": 18860 }, { "crossentropy": 2.3461179733276367, "epoch": 0.6837659512761021, "grad_norm": 0.02810513973236084, "grad_norm_var": 5.1723657233923e-06, "learning_rate": 0.002356659519357608, "loss": 2.418, "step": 18861 }, { "crossentropy": 2.5192744731903076, "epoch": 0.6838022041763341, "grad_norm": 0.02560168132185936, "grad_norm_var": 5.363292486915832e-06, "learning_rate": 0.0023561662933155934, "loss": 2.5516, "step": 18862 }, { "crossentropy": 2.596426248550415, "epoch": 0.6838384570765661, "grad_norm": 0.027119506150484085, "grad_norm_var": 4.8192110100438165e-06, "learning_rate": 0.0023556731029825656, "loss": 2.5617, "step": 18863 }, { "crossentropy": 2.465649127960205, "epoch": 0.6838747099767981, "grad_norm": 0.025154411792755127, "grad_norm_var": 5.0046393136993215e-06, "learning_rate": 0.002355179948365189, "loss": 2.5116, "step": 18864 }, { "crossentropy": 2.5823581218719482, "epoch": 0.6839109628770301, "grad_norm": 0.02626875415444374, "grad_norm_var": 9.612842194445248e-07, "learning_rate": 0.002354686829470118, "loss": 2.5442, "step": 18865 }, { "crossentropy": 2.5713205337524414, "epoch": 0.6839472157772621, "grad_norm": 0.027312764897942543, "grad_norm_var": 9.790165812062836e-07, "learning_rate": 0.0023541937463040148, "loss": 2.4956, "step": 18866 }, { "crossentropy": 2.568556070327759, "epoch": 0.6839834686774942, "grad_norm": 0.026495199650526047, "grad_norm_var": 9.805068505281626e-07, "learning_rate": 0.002353700698873542, "loss": 2.5444, "step": 18867 }, { "crossentropy": 2.5634939670562744, "epoch": 0.6840197215777262, "grad_norm": 0.02718093991279602, "grad_norm_var": 9.819494769709608e-07, "learning_rate": 0.002353207687185356, "loss": 2.571, "step": 18868 }, { "crossentropy": 2.7123260498046875, "epoch": 0.6840559744779582, "grad_norm": 0.027717307209968567, "grad_norm_var": 9.831353874965865e-07, "learning_rate": 0.0023527147112461185, "loss": 2.6145, "step": 18869 }, { "crossentropy": 2.391749620437622, "epoch": 0.6840922273781903, "grad_norm": 0.027930252254009247, "grad_norm_var": 9.93789642873471e-07, "learning_rate": 0.0023522217710624837, "loss": 2.5349, "step": 18870 }, { "crossentropy": 2.397122859954834, "epoch": 0.6841284802784223, "grad_norm": 0.025662649422883987, "grad_norm_var": 1.079884868602996e-06, "learning_rate": 0.0023517288666411126, "loss": 2.4023, "step": 18871 }, { "crossentropy": 2.4744858741760254, "epoch": 0.6841647331786543, "grad_norm": 0.025944864377379417, "grad_norm_var": 1.1091230593051707e-06, "learning_rate": 0.0023512359979886654, "loss": 2.5237, "step": 18872 }, { "crossentropy": 2.6021974086761475, "epoch": 0.6842009860788864, "grad_norm": 0.026321474462747574, "grad_norm_var": 1.125328005544092e-06, "learning_rate": 0.002350743165111793, "loss": 2.5888, "step": 18873 }, { "crossentropy": 2.4923672676086426, "epoch": 0.6842372389791184, "grad_norm": 0.027660369873046875, "grad_norm_var": 1.101510284554942e-06, "learning_rate": 0.002350250368017153, "loss": 2.5696, "step": 18874 }, { "crossentropy": 2.4445676803588867, "epoch": 0.6842734918793504, "grad_norm": 0.027699321508407593, "grad_norm_var": 8.49902576739725e-07, "learning_rate": 0.0023497576067114056, "loss": 2.569, "step": 18875 }, { "crossentropy": 2.571229934692383, "epoch": 0.6843097447795824, "grad_norm": 0.026487555354833603, "grad_norm_var": 8.500502271403693e-07, "learning_rate": 0.002349264881201202, "loss": 2.4656, "step": 18876 }, { "crossentropy": 2.547780752182007, "epoch": 0.6843459976798144, "grad_norm": 0.02815702185034752, "grad_norm_var": 8.59306498311498e-07, "learning_rate": 0.0023487721914932005, "loss": 2.5723, "step": 18877 }, { "crossentropy": 2.588498592376709, "epoch": 0.6843822505800464, "grad_norm": 0.02727748081088066, "grad_norm_var": 7.682732034814307e-07, "learning_rate": 0.0023482795375940517, "loss": 2.5271, "step": 18878 }, { "crossentropy": 2.5976269245147705, "epoch": 0.6844185034802784, "grad_norm": 0.02709929645061493, "grad_norm_var": 7.677055379415667e-07, "learning_rate": 0.0023477869195104124, "loss": 2.5929, "step": 18879 }, { "crossentropy": 2.5668811798095703, "epoch": 0.6844547563805105, "grad_norm": 0.026045339182019234, "grad_norm_var": 6.101812610881472e-07, "learning_rate": 0.002347294337248937, "loss": 2.5064, "step": 18880 }, { "crossentropy": 2.5272607803344727, "epoch": 0.6844910092807425, "grad_norm": 0.02675074152648449, "grad_norm_var": 5.806771286503976e-07, "learning_rate": 0.0023468017908162774, "loss": 2.519, "step": 18881 }, { "crossentropy": 2.4779884815216064, "epoch": 0.6845272621809745, "grad_norm": 0.02680821344256401, "grad_norm_var": 5.744647391783212e-07, "learning_rate": 0.0023463092802190845, "loss": 2.5159, "step": 18882 }, { "crossentropy": 2.521284341812134, "epoch": 0.6845635150812065, "grad_norm": 0.027023207396268845, "grad_norm_var": 5.597035135675021e-07, "learning_rate": 0.0023458168054640133, "loss": 2.5532, "step": 18883 }, { "crossentropy": 2.5117006301879883, "epoch": 0.6845997679814385, "grad_norm": 0.026509171351790428, "grad_norm_var": 5.703916908292768e-07, "learning_rate": 0.0023453243665577123, "loss": 2.5136, "step": 18884 }, { "crossentropy": 2.543590545654297, "epoch": 0.6846360208816705, "grad_norm": 0.026385223492980003, "grad_norm_var": 5.438385888690041e-07, "learning_rate": 0.002344831963506836, "loss": 2.6192, "step": 18885 }, { "crossentropy": 2.406497001647949, "epoch": 0.6846722737819025, "grad_norm": 0.03002787195146084, "grad_norm_var": 1.118131919843462e-06, "learning_rate": 0.002344339596318032, "loss": 2.4172, "step": 18886 }, { "crossentropy": 2.716899871826172, "epoch": 0.6847085266821346, "grad_norm": 0.025044962763786316, "grad_norm_var": 1.2513981337866167e-06, "learning_rate": 0.0023438472649979516, "loss": 2.5828, "step": 18887 }, { "crossentropy": 2.5314652919769287, "epoch": 0.6847447795823666, "grad_norm": 0.026401974260807037, "grad_norm_var": 1.2030360542738717e-06, "learning_rate": 0.0023433549695532465, "loss": 2.4624, "step": 18888 }, { "crossentropy": 2.2769501209259033, "epoch": 0.6847810324825986, "grad_norm": 0.026100456714630127, "grad_norm_var": 1.2255306280638806e-06, "learning_rate": 0.002342862709990564, "loss": 2.3442, "step": 18889 }, { "crossentropy": 2.5856146812438965, "epoch": 0.6848172853828306, "grad_norm": 0.026712674647569656, "grad_norm_var": 1.1940987692190258e-06, "learning_rate": 0.002342370486316551, "loss": 2.6219, "step": 18890 }, { "crossentropy": 2.496039867401123, "epoch": 0.6848535382830626, "grad_norm": 0.02593541145324707, "grad_norm_var": 1.2024875430348964e-06, "learning_rate": 0.00234187829853786, "loss": 2.6157, "step": 18891 }, { "crossentropy": 2.4999313354492188, "epoch": 0.6848897911832946, "grad_norm": 0.026184210553765297, "grad_norm_var": 1.2207913758668457e-06, "learning_rate": 0.002341386146661134, "loss": 2.4968, "step": 18892 }, { "crossentropy": 2.368354320526123, "epoch": 0.6849260440835266, "grad_norm": 0.02592574432492256, "grad_norm_var": 1.1219734806504808e-06, "learning_rate": 0.0023408940306930254, "loss": 2.4606, "step": 18893 }, { "crossentropy": 2.2959213256835938, "epoch": 0.6849622969837587, "grad_norm": 0.0257203858345747, "grad_norm_var": 1.141054377749847e-06, "learning_rate": 0.0023404019506401757, "loss": 2.458, "step": 18894 }, { "crossentropy": 2.4701919555664062, "epoch": 0.6849985498839907, "grad_norm": 0.026858670637011528, "grad_norm_var": 1.126798974131667e-06, "learning_rate": 0.002339909906509234, "loss": 2.4585, "step": 18895 }, { "crossentropy": 2.349787473678589, "epoch": 0.6850348027842227, "grad_norm": 0.025610223412513733, "grad_norm_var": 1.166583787753902e-06, "learning_rate": 0.0023394178983068476, "loss": 2.4144, "step": 18896 }, { "crossentropy": 2.5790116786956787, "epoch": 0.6850710556844548, "grad_norm": 0.026687920093536377, "grad_norm_var": 1.164729738849235e-06, "learning_rate": 0.00233892592603966, "loss": 2.5279, "step": 18897 }, { "crossentropy": 2.5069949626922607, "epoch": 0.6851073085846868, "grad_norm": 0.027519579976797104, "grad_norm_var": 1.225968567288016e-06, "learning_rate": 0.002338433989714315, "loss": 2.5439, "step": 18898 }, { "crossentropy": 2.5125348567962646, "epoch": 0.6851435614849188, "grad_norm": 0.026564422994852066, "grad_norm_var": 1.2095947566804119e-06, "learning_rate": 0.002337942089337459, "loss": 2.5645, "step": 18899 }, { "crossentropy": 2.563371181488037, "epoch": 0.6851798143851509, "grad_norm": 0.028369173407554626, "grad_norm_var": 1.425166709971776e-06, "learning_rate": 0.002337450224915733, "loss": 2.5507, "step": 18900 }, { "crossentropy": 2.637514591217041, "epoch": 0.6852160672853829, "grad_norm": 0.03353016823530197, "grad_norm_var": 4.384469148368577e-06, "learning_rate": 0.0023369583964557845, "loss": 2.5863, "step": 18901 }, { "crossentropy": 2.5037264823913574, "epoch": 0.6852523201856149, "grad_norm": 0.02631429024040699, "grad_norm_var": 3.7840994334588838e-06, "learning_rate": 0.0023364666039642525, "loss": 2.5374, "step": 18902 }, { "crossentropy": 2.5776100158691406, "epoch": 0.6852885730858469, "grad_norm": 0.02621331252157688, "grad_norm_var": 3.5893915887172475e-06, "learning_rate": 0.0023359748474477806, "loss": 2.5894, "step": 18903 }, { "crossentropy": 2.498650312423706, "epoch": 0.6853248259860789, "grad_norm": 0.027021680027246475, "grad_norm_var": 3.5709592778764095e-06, "learning_rate": 0.0023354831269130133, "loss": 2.5197, "step": 18904 }, { "crossentropy": 2.6321322917938232, "epoch": 0.6853610788863109, "grad_norm": 0.027639709413051605, "grad_norm_var": 3.54380913832838e-06, "learning_rate": 0.002334991442366588, "loss": 2.6316, "step": 18905 }, { "crossentropy": 2.5016839504241943, "epoch": 0.6853973317865429, "grad_norm": 0.026508482173085213, "grad_norm_var": 3.5556118495690935e-06, "learning_rate": 0.0023344997938151498, "loss": 2.488, "step": 18906 }, { "crossentropy": 2.6206109523773193, "epoch": 0.685433584686775, "grad_norm": 0.0270342119038105, "grad_norm_var": 3.4695776178183166e-06, "learning_rate": 0.0023340081812653363, "loss": 2.4922, "step": 18907 }, { "crossentropy": 2.610825538635254, "epoch": 0.685469837587007, "grad_norm": 0.026271317154169083, "grad_norm_var": 3.459341490817539e-06, "learning_rate": 0.0023335166047237866, "loss": 2.5508, "step": 18908 }, { "crossentropy": 2.501279592514038, "epoch": 0.685506090487239, "grad_norm": 0.025923192501068115, "grad_norm_var": 3.4597454556305283e-06, "learning_rate": 0.0023330250641971436, "loss": 2.5501, "step": 18909 }, { "crossentropy": 2.3138325214385986, "epoch": 0.685542343387471, "grad_norm": 0.02624652162194252, "grad_norm_var": 3.379445959493926e-06, "learning_rate": 0.002332533559692042, "loss": 2.3531, "step": 18910 }, { "crossentropy": 2.6526424884796143, "epoch": 0.685578596287703, "grad_norm": 0.02647305093705654, "grad_norm_var": 3.4034388761811434e-06, "learning_rate": 0.002332042091215123, "loss": 2.5845, "step": 18911 }, { "crossentropy": 2.4706871509552, "epoch": 0.685614849187935, "grad_norm": 0.02609214186668396, "grad_norm_var": 3.320913177767779e-06, "learning_rate": 0.002331550658773026, "loss": 2.5473, "step": 18912 }, { "crossentropy": 2.552560806274414, "epoch": 0.685651102088167, "grad_norm": 0.02747146040201187, "grad_norm_var": 3.3109498015811955e-06, "learning_rate": 0.002331059262372386, "loss": 2.5791, "step": 18913 }, { "crossentropy": 2.4678280353546143, "epoch": 0.6856873549883991, "grad_norm": 0.02668786607682705, "grad_norm_var": 3.3186936817589656e-06, "learning_rate": 0.0023305679020198416, "loss": 2.4694, "step": 18914 }, { "crossentropy": 2.5244951248168945, "epoch": 0.6857236078886311, "grad_norm": 0.0282564889639616, "grad_norm_var": 3.366075212088889e-06, "learning_rate": 0.0023300765777220294, "loss": 2.5532, "step": 18915 }, { "crossentropy": 2.511770009994507, "epoch": 0.6857598607888631, "grad_norm": 0.02644052915275097, "grad_norm_var": 3.3116090902932885e-06, "learning_rate": 0.0023295852894855825, "loss": 2.5596, "step": 18916 }, { "crossentropy": 2.5585262775421143, "epoch": 0.6857961136890951, "grad_norm": 0.025958755984902382, "grad_norm_var": 4.361964018040343e-07, "learning_rate": 0.002329094037317141, "loss": 2.5255, "step": 18917 }, { "crossentropy": 2.2543604373931885, "epoch": 0.6858323665893271, "grad_norm": 0.026463467627763748, "grad_norm_var": 4.307196813611251e-07, "learning_rate": 0.002328602821223336, "loss": 2.4866, "step": 18918 }, { "crossentropy": 2.662820816040039, "epoch": 0.6858686194895591, "grad_norm": 0.02603735215961933, "grad_norm_var": 4.4334321072079555e-07, "learning_rate": 0.0023281116412108035, "loss": 2.5177, "step": 18919 }, { "crossentropy": 2.652667284011841, "epoch": 0.6859048723897911, "grad_norm": 0.02617768384516239, "grad_norm_var": 4.4692540408231005e-07, "learning_rate": 0.0023276204972861795, "loss": 2.6312, "step": 18920 }, { "crossentropy": 2.5780324935913086, "epoch": 0.6859411252900232, "grad_norm": 0.027324765920639038, "grad_norm_var": 4.0968059968685134e-07, "learning_rate": 0.002327129389456095, "loss": 2.4724, "step": 18921 }, { "crossentropy": 2.3928775787353516, "epoch": 0.6859773781902552, "grad_norm": 0.02634880319237709, "grad_norm_var": 4.1291298911413864e-07, "learning_rate": 0.002326638317727186, "loss": 2.4273, "step": 18922 }, { "crossentropy": 2.4681341648101807, "epoch": 0.6860136310904872, "grad_norm": 0.026787055656313896, "grad_norm_var": 4.016136042269668e-07, "learning_rate": 0.002326147282106083, "loss": 2.5382, "step": 18923 }, { "crossentropy": 2.5473921298980713, "epoch": 0.6860498839907193, "grad_norm": 0.025577623397111893, "grad_norm_var": 4.58392910962444e-07, "learning_rate": 0.0023256562825994176, "loss": 2.5046, "step": 18924 }, { "crossentropy": 2.6163330078125, "epoch": 0.6860861368909513, "grad_norm": 0.027277523651719093, "grad_norm_var": 4.658620962487281e-07, "learning_rate": 0.002325165319213824, "loss": 2.5984, "step": 18925 }, { "crossentropy": 2.3979249000549316, "epoch": 0.6861223897911833, "grad_norm": 0.026147251948714256, "grad_norm_var": 4.71174071934218e-07, "learning_rate": 0.00232467439195593, "loss": 2.4997, "step": 18926 }, { "crossentropy": 2.4735045433044434, "epoch": 0.6861586426914154, "grad_norm": 0.02670292928814888, "grad_norm_var": 4.7073554419600705e-07, "learning_rate": 0.0023241835008323687, "loss": 2.5432, "step": 18927 }, { "crossentropy": 2.6769912242889404, "epoch": 0.6861948955916474, "grad_norm": 0.025690900161862373, "grad_norm_var": 5.084748077259967e-07, "learning_rate": 0.0023236926458497713, "loss": 2.4888, "step": 18928 }, { "crossentropy": 2.608346939086914, "epoch": 0.6862311484918794, "grad_norm": 0.0260099358856678, "grad_norm_var": 4.6911747334227087e-07, "learning_rate": 0.002323201827014765, "loss": 2.5654, "step": 18929 }, { "crossentropy": 2.4595909118652344, "epoch": 0.6862674013921114, "grad_norm": 0.02645888365805149, "grad_norm_var": 4.6644685869114594e-07, "learning_rate": 0.0023227110443339817, "loss": 2.5297, "step": 18930 }, { "crossentropy": 2.6141979694366455, "epoch": 0.6863036542923434, "grad_norm": 0.026898879557847977, "grad_norm_var": 2.5984356962920467e-07, "learning_rate": 0.0023222202978140484, "loss": 2.5642, "step": 18931 }, { "crossentropy": 2.4526333808898926, "epoch": 0.6863399071925754, "grad_norm": 0.025642335414886475, "grad_norm_var": 2.94700209107814e-07, "learning_rate": 0.0023217295874615918, "loss": 2.5299, "step": 18932 }, { "crossentropy": 2.5311052799224854, "epoch": 0.6863761600928074, "grad_norm": 0.028866345062851906, "grad_norm_var": 6.737254404474054e-07, "learning_rate": 0.0023212389132832435, "loss": 2.5441, "step": 18933 }, { "crossentropy": 2.4385788440704346, "epoch": 0.6864124129930395, "grad_norm": 0.030549699440598488, "grad_norm_var": 1.6833817058805548e-06, "learning_rate": 0.0023207482752856273, "loss": 2.5319, "step": 18934 }, { "crossentropy": 2.436288833618164, "epoch": 0.6864486658932715, "grad_norm": 0.02944072149693966, "grad_norm_var": 2.0698041272982122e-06, "learning_rate": 0.002320257673475371, "loss": 2.4417, "step": 18935 }, { "crossentropy": 2.4995272159576416, "epoch": 0.6864849187935035, "grad_norm": 0.02845171093940735, "grad_norm_var": 2.145544525221597e-06, "learning_rate": 0.0023197671078591035, "loss": 2.6149, "step": 18936 }, { "crossentropy": 2.6078691482543945, "epoch": 0.6865211716937355, "grad_norm": 0.02774740196764469, "grad_norm_var": 2.1673478272539315e-06, "learning_rate": 0.002319276578443447, "loss": 2.5444, "step": 18937 }, { "crossentropy": 2.5212466716766357, "epoch": 0.6865574245939675, "grad_norm": 0.025273578241467476, "grad_norm_var": 2.356240988069432e-06, "learning_rate": 0.0023187860852350306, "loss": 2.485, "step": 18938 }, { "crossentropy": 2.5296542644500732, "epoch": 0.6865936774941995, "grad_norm": 0.026787538081407547, "grad_norm_var": 2.35622118344562e-06, "learning_rate": 0.0023182956282404767, "loss": 2.5209, "step": 18939 }, { "crossentropy": 2.3743770122528076, "epoch": 0.6866299303944315, "grad_norm": 0.02695063129067421, "grad_norm_var": 2.196223147524776e-06, "learning_rate": 0.002317805207466408, "loss": 2.4164, "step": 18940 }, { "crossentropy": 2.3982558250427246, "epoch": 0.6866661832946636, "grad_norm": 0.028080005198717117, "grad_norm_var": 2.2467977009295775e-06, "learning_rate": 0.002317314822919452, "loss": 2.5105, "step": 18941 }, { "crossentropy": 2.622466802597046, "epoch": 0.6867024361948956, "grad_norm": 0.02631475031375885, "grad_norm_var": 2.2243438768918536e-06, "learning_rate": 0.0023168244746062288, "loss": 2.6429, "step": 18942 }, { "crossentropy": 2.4634859561920166, "epoch": 0.6867386890951276, "grad_norm": 0.02596503123641014, "grad_norm_var": 2.3113765710055932e-06, "learning_rate": 0.0023163341625333633, "loss": 2.5107, "step": 18943 }, { "crossentropy": 2.542398452758789, "epoch": 0.6867749419953596, "grad_norm": 0.026576906442642212, "grad_norm_var": 2.1826922933486834e-06, "learning_rate": 0.002315843886707479, "loss": 2.4987, "step": 18944 }, { "crossentropy": 2.5605998039245605, "epoch": 0.6868111948955916, "grad_norm": 0.032917119562625885, "grad_norm_var": 4.021643464730863e-06, "learning_rate": 0.0023153536471351944, "loss": 2.574, "step": 18945 }, { "crossentropy": 2.5600883960723877, "epoch": 0.6868474477958236, "grad_norm": 0.028453124687075615, "grad_norm_var": 3.944822115167085e-06, "learning_rate": 0.002314863443823135, "loss": 2.5395, "step": 18946 }, { "crossentropy": 2.565199136734009, "epoch": 0.6868837006960556, "grad_norm": 0.03021489642560482, "grad_norm_var": 4.2304533212188335e-06, "learning_rate": 0.0023143732767779176, "loss": 2.5876, "step": 18947 }, { "crossentropy": 2.6139183044433594, "epoch": 0.6869199535962877, "grad_norm": 0.025968827307224274, "grad_norm_var": 4.133850519282912e-06, "learning_rate": 0.0023138831460061664, "loss": 2.5109, "step": 18948 }, { "crossentropy": 2.550694704055786, "epoch": 0.6869562064965197, "grad_norm": 0.02675822377204895, "grad_norm_var": 4.177904568841012e-06, "learning_rate": 0.0023133930515145, "loss": 2.4866, "step": 18949 }, { "crossentropy": 2.305370807647705, "epoch": 0.6869924593967517, "grad_norm": 0.026574011892080307, "grad_norm_var": 3.762863726355101e-06, "learning_rate": 0.0023129029933095357, "loss": 2.4043, "step": 18950 }, { "crossentropy": 2.5323691368103027, "epoch": 0.6870287122969838, "grad_norm": 0.02722160890698433, "grad_norm_var": 3.542178144811531e-06, "learning_rate": 0.002312412971397894, "loss": 2.5024, "step": 18951 }, { "crossentropy": 2.412348508834839, "epoch": 0.6870649651972158, "grad_norm": 0.026673631742596626, "grad_norm_var": 3.5179308523692326e-06, "learning_rate": 0.002311922985786196, "loss": 2.434, "step": 18952 }, { "crossentropy": 2.5120363235473633, "epoch": 0.6871012180974478, "grad_norm": 0.026605576276779175, "grad_norm_var": 3.54726195908502e-06, "learning_rate": 0.002311433036481055, "loss": 2.5141, "step": 18953 }, { "crossentropy": 2.322601318359375, "epoch": 0.6871374709976799, "grad_norm": 0.02616785280406475, "grad_norm_var": 3.3516308261861855e-06, "learning_rate": 0.002310943123489093, "loss": 2.3623, "step": 18954 }, { "crossentropy": 2.440981388092041, "epoch": 0.6871737238979119, "grad_norm": 0.026057597249746323, "grad_norm_var": 3.4435041165144114e-06, "learning_rate": 0.0023104532468169224, "loss": 2.4523, "step": 18955 }, { "crossentropy": 2.5178229808807373, "epoch": 0.6872099767981439, "grad_norm": 0.026573874056339264, "grad_norm_var": 3.4721231414400664e-06, "learning_rate": 0.002309963406471165, "loss": 2.4496, "step": 18956 }, { "crossentropy": 2.4084959030151367, "epoch": 0.6872462296983759, "grad_norm": 0.02757558599114418, "grad_norm_var": 3.4369235028417394e-06, "learning_rate": 0.002309473602458434, "loss": 2.4735, "step": 18957 }, { "crossentropy": 2.4655323028564453, "epoch": 0.6872824825986079, "grad_norm": 0.027296995744109154, "grad_norm_var": 3.3696742911181227e-06, "learning_rate": 0.0023089838347853425, "loss": 2.5334, "step": 18958 }, { "crossentropy": 2.4456307888031006, "epoch": 0.6873187354988399, "grad_norm": 0.02631940320134163, "grad_norm_var": 3.3120812438314304e-06, "learning_rate": 0.002308494103458509, "loss": 2.3939, "step": 18959 }, { "crossentropy": 2.574094772338867, "epoch": 0.6873549883990719, "grad_norm": 0.025993110612034798, "grad_norm_var": 3.3952877258993747e-06, "learning_rate": 0.002308004408484548, "loss": 2.5164, "step": 18960 }, { "crossentropy": 2.458385944366455, "epoch": 0.687391241299304, "grad_norm": 0.02632199041545391, "grad_norm_var": 1.2057595794636207e-06, "learning_rate": 0.0023075147498700715, "loss": 2.5051, "step": 18961 }, { "crossentropy": 2.5336623191833496, "epoch": 0.687427494199536, "grad_norm": 0.026964550837874413, "grad_norm_var": 1.0406596204970507e-06, "learning_rate": 0.0023070251276216956, "loss": 2.4473, "step": 18962 }, { "crossentropy": 2.4837963581085205, "epoch": 0.687463747099768, "grad_norm": 0.027032556012272835, "grad_norm_var": 2.375681060979521e-07, "learning_rate": 0.002306535541746031, "loss": 2.5643, "step": 18963 }, { "crossentropy": 2.4650049209594727, "epoch": 0.6875, "grad_norm": 0.030193326994776726, "grad_norm_var": 9.79657371632849e-07, "learning_rate": 0.002306045992249691, "loss": 2.4598, "step": 18964 }, { "crossentropy": 2.478700637817383, "epoch": 0.687536252900232, "grad_norm": 0.030387133359909058, "grad_norm_var": 1.7362398332801862e-06, "learning_rate": 0.002305556479139292, "loss": 2.468, "step": 18965 }, { "crossentropy": 2.5987327098846436, "epoch": 0.687572505800464, "grad_norm": 0.02823757566511631, "grad_norm_var": 1.7875623349026716e-06, "learning_rate": 0.0023050670024214377, "loss": 2.5526, "step": 18966 }, { "crossentropy": 2.5568673610687256, "epoch": 0.687608758700696, "grad_norm": 0.026682162657380104, "grad_norm_var": 1.8060944454269142e-06, "learning_rate": 0.002304577562102743, "loss": 2.6187, "step": 18967 }, { "crossentropy": 2.4195189476013184, "epoch": 0.6876450116009281, "grad_norm": 0.02647535689175129, "grad_norm_var": 1.8222734702236578e-06, "learning_rate": 0.002304088158189821, "loss": 2.4152, "step": 18968 }, { "crossentropy": 2.5552315711975098, "epoch": 0.6876812645011601, "grad_norm": 0.026698041707277298, "grad_norm_var": 1.8157223425449564e-06, "learning_rate": 0.002303598790689278, "loss": 2.5138, "step": 18969 }, { "crossentropy": 2.4800400733947754, "epoch": 0.6877175174013921, "grad_norm": 0.026831476017832756, "grad_norm_var": 1.7531521011818207e-06, "learning_rate": 0.0023031094596077267, "loss": 2.5924, "step": 18970 }, { "crossentropy": 2.435586929321289, "epoch": 0.6877537703016241, "grad_norm": 0.026370862498879433, "grad_norm_var": 1.710418304029732e-06, "learning_rate": 0.002302620164951773, "loss": 2.4335, "step": 18971 }, { "crossentropy": 2.4973456859588623, "epoch": 0.6877900232018561, "grad_norm": 0.026455767452716827, "grad_norm_var": 1.7218921807437508e-06, "learning_rate": 0.0023021309067280276, "loss": 2.5004, "step": 18972 }, { "crossentropy": 2.474848985671997, "epoch": 0.6878262761020881, "grad_norm": 0.02601412497460842, "grad_norm_var": 1.8043565521421353e-06, "learning_rate": 0.002301641684943102, "loss": 2.5462, "step": 18973 }, { "crossentropy": 2.431015729904175, "epoch": 0.6878625290023201, "grad_norm": 0.026315785944461823, "grad_norm_var": 1.8442719757416019e-06, "learning_rate": 0.002301152499603597, "loss": 2.5067, "step": 18974 }, { "crossentropy": 2.5439109802246094, "epoch": 0.6878987819025522, "grad_norm": 0.02748391032218933, "grad_norm_var": 1.8108023853935361e-06, "learning_rate": 0.002300663350716123, "loss": 2.556, "step": 18975 }, { "crossentropy": 2.4662275314331055, "epoch": 0.6879350348027842, "grad_norm": 0.0266689695417881, "grad_norm_var": 1.7347737719381842e-06, "learning_rate": 0.0023001742382872882, "loss": 2.5621, "step": 18976 }, { "crossentropy": 2.437389373779297, "epoch": 0.6879712877030162, "grad_norm": 0.027110155671834946, "grad_norm_var": 1.681766340112092e-06, "learning_rate": 0.002299685162323696, "loss": 2.4957, "step": 18977 }, { "crossentropy": 2.4966254234313965, "epoch": 0.6880075406032483, "grad_norm": 0.028374766930937767, "grad_norm_var": 1.753307513904081e-06, "learning_rate": 0.0022991961228319554, "loss": 2.5006, "step": 18978 }, { "crossentropy": 2.6202392578125, "epoch": 0.6880437935034803, "grad_norm": 0.02661457471549511, "grad_norm_var": 1.780984619785643e-06, "learning_rate": 0.0022987071198186677, "loss": 2.5516, "step": 18979 }, { "crossentropy": 2.7335047721862793, "epoch": 0.6880800464037123, "grad_norm": 0.02936081774532795, "grad_norm_var": 1.503929557048371e-06, "learning_rate": 0.00229821815329044, "loss": 2.6464, "step": 18980 }, { "crossentropy": 2.4850008487701416, "epoch": 0.6881162993039444, "grad_norm": 0.027289390563964844, "grad_norm_var": 8.100459950070252e-07, "learning_rate": 0.00229772922325388, "loss": 2.5153, "step": 18981 }, { "crossentropy": 2.6253652572631836, "epoch": 0.6881525522041764, "grad_norm": 0.02871546894311905, "grad_norm_var": 8.992593946006873e-07, "learning_rate": 0.0022972403297155837, "loss": 2.6095, "step": 18982 }, { "crossentropy": 2.4776434898376465, "epoch": 0.6881888051044084, "grad_norm": 0.026292983442544937, "grad_norm_var": 9.299587402530057e-07, "learning_rate": 0.0022967514726821587, "loss": 2.4434, "step": 18983 }, { "crossentropy": 2.6512839794158936, "epoch": 0.6882250580046404, "grad_norm": 0.02607896737754345, "grad_norm_var": 9.710500114991679e-07, "learning_rate": 0.0022962626521602085, "loss": 2.5215, "step": 18984 }, { "crossentropy": 2.55991792678833, "epoch": 0.6882613109048724, "grad_norm": 0.027635805308818817, "grad_norm_var": 9.829739138649997e-07, "learning_rate": 0.0022957738681563334, "loss": 2.5448, "step": 18985 }, { "crossentropy": 2.5098233222961426, "epoch": 0.6882975638051044, "grad_norm": 0.028163662180304527, "grad_norm_var": 1.0460438814687318e-06, "learning_rate": 0.002295285120677137, "loss": 2.5253, "step": 18986 }, { "crossentropy": 2.6419951915740967, "epoch": 0.6883338167053364, "grad_norm": 0.02723008207976818, "grad_norm_var": 9.990154516158056e-07, "learning_rate": 0.0022947964097292184, "loss": 2.5699, "step": 18987 }, { "crossentropy": 2.4289209842681885, "epoch": 0.6883700696055685, "grad_norm": 0.02682984434068203, "grad_norm_var": 9.687545725510041e-07, "learning_rate": 0.002294307735319179, "loss": 2.4409, "step": 18988 }, { "crossentropy": 2.469482660293579, "epoch": 0.6884063225058005, "grad_norm": 0.026674969121813774, "grad_norm_var": 8.861656887781492e-07, "learning_rate": 0.0022938190974536215, "loss": 2.539, "step": 18989 }, { "crossentropy": 2.6725687980651855, "epoch": 0.6884425754060325, "grad_norm": 0.02616458386182785, "grad_norm_var": 9.074871918283063e-07, "learning_rate": 0.002293330496139144, "loss": 2.6164, "step": 18990 }, { "crossentropy": 2.477607011795044, "epoch": 0.6884788283062645, "grad_norm": 0.02635287120938301, "grad_norm_var": 9.586589861105068e-07, "learning_rate": 0.0022928419313823444, "loss": 2.5225, "step": 18991 }, { "crossentropy": 2.3995511531829834, "epoch": 0.6885150812064965, "grad_norm": 0.02715485915541649, "grad_norm_var": 9.375623544582411e-07, "learning_rate": 0.002292353403189824, "loss": 2.4385, "step": 18992 }, { "crossentropy": 2.481302499771118, "epoch": 0.6885513341067285, "grad_norm": 0.027334576472640038, "grad_norm_var": 9.364436992532425e-07, "learning_rate": 0.0022918649115681784, "loss": 2.568, "step": 18993 }, { "crossentropy": 2.3915023803710938, "epoch": 0.6885875870069605, "grad_norm": 0.02622688189148903, "grad_norm_var": 9.074667674491623e-07, "learning_rate": 0.0022913764565240086, "loss": 2.4411, "step": 18994 }, { "crossentropy": 2.5197055339813232, "epoch": 0.6886238399071926, "grad_norm": 0.026023875921964645, "grad_norm_var": 9.700679619048611e-07, "learning_rate": 0.0022908880380639084, "loss": 2.4885, "step": 18995 }, { "crossentropy": 2.5355663299560547, "epoch": 0.6886600928074246, "grad_norm": 0.02697746828198433, "grad_norm_var": 6.052501822418136e-07, "learning_rate": 0.0022903996561944766, "loss": 2.5852, "step": 18996 }, { "crossentropy": 2.4233198165893555, "epoch": 0.6886963457076566, "grad_norm": 0.02733301930129528, "grad_norm_var": 6.07362967254193e-07, "learning_rate": 0.0022899113109223113, "loss": 2.4448, "step": 18997 }, { "crossentropy": 2.5564420223236084, "epoch": 0.6887325986078886, "grad_norm": 0.026953618973493576, "grad_norm_var": 3.864899756106695e-07, "learning_rate": 0.0022894230022540065, "loss": 2.5595, "step": 18998 }, { "crossentropy": 2.516139268875122, "epoch": 0.6887688515081206, "grad_norm": 0.02611406333744526, "grad_norm_var": 4.0152259627209097e-07, "learning_rate": 0.002288934730196155, "loss": 2.4993, "step": 18999 }, { "crossentropy": 2.3012290000915527, "epoch": 0.6888051044083526, "grad_norm": 0.024744896218180656, "grad_norm_var": 6.4600452663547e-07, "learning_rate": 0.0022884464947553564, "loss": 2.3669, "step": 19000 }, { "crossentropy": 2.4907467365264893, "epoch": 0.6888413573085846, "grad_norm": 0.027790682390332222, "grad_norm_var": 6.65905438375922e-07, "learning_rate": 0.002287958295938201, "loss": 2.457, "step": 19001 }, { "crossentropy": 2.474666118621826, "epoch": 0.6888776102088167, "grad_norm": 0.02595348283648491, "grad_norm_var": 5.559067450847457e-07, "learning_rate": 0.0022874701337512867, "loss": 2.4857, "step": 19002 }, { "crossentropy": 2.597385883331299, "epoch": 0.6889138631090487, "grad_norm": 0.02667325548827648, "grad_norm_var": 5.297111237126742e-07, "learning_rate": 0.002286982008201202, "loss": 2.5839, "step": 19003 }, { "crossentropy": 2.5365078449249268, "epoch": 0.6889501160092807, "grad_norm": 0.027590343728661537, "grad_norm_var": 5.910473389945474e-07, "learning_rate": 0.002286493919294543, "loss": 2.5576, "step": 19004 }, { "crossentropy": 2.461690902709961, "epoch": 0.6889863689095128, "grad_norm": 0.027974458411335945, "grad_norm_var": 7.045601912886515e-07, "learning_rate": 0.002286005867037903, "loss": 2.5098, "step": 19005 }, { "crossentropy": 2.509704828262329, "epoch": 0.6890226218097448, "grad_norm": 0.025789974257349968, "grad_norm_var": 7.405825500994643e-07, "learning_rate": 0.002285517851437871, "loss": 2.5126, "step": 19006 }, { "crossentropy": 2.500107526779175, "epoch": 0.6890588747099768, "grad_norm": 0.0261029414832592, "grad_norm_var": 7.556134518582469e-07, "learning_rate": 0.0022850298725010415, "loss": 2.5457, "step": 19007 }, { "crossentropy": 2.351781129837036, "epoch": 0.6890951276102089, "grad_norm": 0.025859374552965164, "grad_norm_var": 7.769542523603506e-07, "learning_rate": 0.0022845419302340037, "loss": 2.4893, "step": 19008 }, { "crossentropy": 2.4376254081726074, "epoch": 0.6891313805104409, "grad_norm": 0.02772502601146698, "grad_norm_var": 8.252355554474432e-07, "learning_rate": 0.002284054024643346, "loss": 2.384, "step": 19009 }, { "crossentropy": 2.6343510150909424, "epoch": 0.6891676334106729, "grad_norm": 0.028728608042001724, "grad_norm_var": 1.0870764986787087e-06, "learning_rate": 0.0022835661557356625, "loss": 2.5053, "step": 19010 }, { "crossentropy": 2.622366428375244, "epoch": 0.6892038863109049, "grad_norm": 0.027976520359516144, "grad_norm_var": 1.130876897356692e-06, "learning_rate": 0.002283078323517538, "loss": 2.5554, "step": 19011 }, { "crossentropy": 2.353011131286621, "epoch": 0.6892401392111369, "grad_norm": 0.027221044525504112, "grad_norm_var": 1.1373287868390724e-06, "learning_rate": 0.002282590527995564, "loss": 2.4752, "step": 19012 }, { "crossentropy": 2.4628641605377197, "epoch": 0.6892763921113689, "grad_norm": 0.027382811531424522, "grad_norm_var": 1.1403040554218511e-06, "learning_rate": 0.002282102769176331, "loss": 2.4396, "step": 19013 }, { "crossentropy": 2.5333211421966553, "epoch": 0.6893126450116009, "grad_norm": 0.026717543601989746, "grad_norm_var": 1.1424558113312985e-06, "learning_rate": 0.0022816150470664222, "loss": 2.5298, "step": 19014 }, { "crossentropy": 2.695295810699463, "epoch": 0.689348897911833, "grad_norm": 0.026697272434830666, "grad_norm_var": 1.1028659003617466e-06, "learning_rate": 0.0022811273616724294, "loss": 2.6176, "step": 19015 }, { "crossentropy": 2.4458136558532715, "epoch": 0.689385150812065, "grad_norm": 0.026676269248127937, "grad_norm_var": 7.725270978724362e-07, "learning_rate": 0.0022806397130009375, "loss": 2.4546, "step": 19016 }, { "crossentropy": 2.655298948287964, "epoch": 0.689421403712297, "grad_norm": 0.025790473446249962, "grad_norm_var": 8.260369832225559e-07, "learning_rate": 0.002280152101058532, "loss": 2.5149, "step": 19017 }, { "crossentropy": 2.6133205890655518, "epoch": 0.689457656612529, "grad_norm": 0.026244154199957848, "grad_norm_var": 7.935214202848169e-07, "learning_rate": 0.002279664525851801, "loss": 2.505, "step": 19018 }, { "crossentropy": 2.5843286514282227, "epoch": 0.689493909512761, "grad_norm": 0.02683008648455143, "grad_norm_var": 7.893369719083032e-07, "learning_rate": 0.0022791769873873277, "loss": 2.4908, "step": 19019 }, { "crossentropy": 2.405761241912842, "epoch": 0.689530162412993, "grad_norm": 0.025908932089805603, "grad_norm_var": 8.239739108789236e-07, "learning_rate": 0.0022786894856716973, "loss": 2.4067, "step": 19020 }, { "crossentropy": 2.5657174587249756, "epoch": 0.689566415313225, "grad_norm": 0.026151128113269806, "grad_norm_var": 7.587766842435487e-07, "learning_rate": 0.002278202020711497, "loss": 2.5145, "step": 19021 }, { "crossentropy": 2.432279348373413, "epoch": 0.6896026682134571, "grad_norm": 0.026414403691887856, "grad_norm_var": 7.04246554785309e-07, "learning_rate": 0.002277714592513308, "loss": 2.4512, "step": 19022 }, { "crossentropy": 2.440877914428711, "epoch": 0.6896389211136891, "grad_norm": 0.02686271071434021, "grad_norm_var": 6.720750330267237e-07, "learning_rate": 0.002277227201083716, "loss": 2.452, "step": 19023 }, { "crossentropy": 2.445202112197876, "epoch": 0.6896751740139211, "grad_norm": 0.026606058701872826, "grad_norm_var": 6.10870359511031e-07, "learning_rate": 0.002276739846429302, "loss": 2.5541, "step": 19024 }, { "crossentropy": 2.498173475265503, "epoch": 0.6897114269141531, "grad_norm": 0.026008982211351395, "grad_norm_var": 5.994723324419798e-07, "learning_rate": 0.0022762525285566484, "loss": 2.4701, "step": 19025 }, { "crossentropy": 2.6293389797210693, "epoch": 0.6897476798143851, "grad_norm": 0.026377998292446136, "grad_norm_var": 3.28933679879711e-07, "learning_rate": 0.0022757652474723385, "loss": 2.5497, "step": 19026 }, { "crossentropy": 2.577683925628662, "epoch": 0.6897839327146171, "grad_norm": 0.02822023071348667, "grad_norm_var": 3.7683447016798737e-07, "learning_rate": 0.0022752780031829517, "loss": 2.4987, "step": 19027 }, { "crossentropy": 2.36777400970459, "epoch": 0.6898201856148491, "grad_norm": 0.027405893430113792, "grad_norm_var": 3.934908648273519e-07, "learning_rate": 0.0022747907956950707, "loss": 2.4715, "step": 19028 }, { "crossentropy": 2.5373244285583496, "epoch": 0.6898564385150812, "grad_norm": 0.026986146345734596, "grad_norm_var": 3.642201252831989e-07, "learning_rate": 0.002274303625015277, "loss": 2.5143, "step": 19029 }, { "crossentropy": 2.5474016666412354, "epoch": 0.6898926914153132, "grad_norm": 0.026239747181534767, "grad_norm_var": 3.721876164727942e-07, "learning_rate": 0.002273816491150148, "loss": 2.5233, "step": 19030 }, { "crossentropy": 2.529097557067871, "epoch": 0.6899289443155452, "grad_norm": 0.027192242443561554, "grad_norm_var": 3.94659860065751e-07, "learning_rate": 0.002273329394106266, "loss": 2.5749, "step": 19031 }, { "crossentropy": 2.500336170196533, "epoch": 0.6899651972157773, "grad_norm": 0.025624942034482956, "grad_norm_var": 4.558129659085978e-07, "learning_rate": 0.002272842333890208, "loss": 2.4828, "step": 19032 }, { "crossentropy": 2.600637435913086, "epoch": 0.6900014501160093, "grad_norm": 0.0277031809091568, "grad_norm_var": 4.897436155517608e-07, "learning_rate": 0.0022723553105085514, "loss": 2.5229, "step": 19033 }, { "crossentropy": 2.488069772720337, "epoch": 0.6900377030162413, "grad_norm": 0.02699718438088894, "grad_norm_var": 4.820712169394924e-07, "learning_rate": 0.0022718683239678758, "loss": 2.4857, "step": 19034 }, { "crossentropy": 2.475078582763672, "epoch": 0.6900739559164734, "grad_norm": 0.027104564011096954, "grad_norm_var": 4.907861013366538e-07, "learning_rate": 0.0022713813742747603, "loss": 2.5114, "step": 19035 }, { "crossentropy": 2.5579895973205566, "epoch": 0.6901102088167054, "grad_norm": 0.027171947062015533, "grad_norm_var": 4.5090830298927645e-07, "learning_rate": 0.0022708944614357783, "loss": 2.5447, "step": 19036 }, { "crossentropy": 2.5451598167419434, "epoch": 0.6901464617169374, "grad_norm": 0.026663541793823242, "grad_norm_var": 4.218450196554936e-07, "learning_rate": 0.00227040758545751, "loss": 2.4759, "step": 19037 }, { "crossentropy": 2.5468244552612305, "epoch": 0.6901827146171694, "grad_norm": 0.025996554642915726, "grad_norm_var": 4.5695542222236547e-07, "learning_rate": 0.0022699207463465275, "loss": 2.524, "step": 19038 }, { "crossentropy": 2.495701313018799, "epoch": 0.6902189675174014, "grad_norm": 0.027779284864664078, "grad_norm_var": 5.143616275064822e-07, "learning_rate": 0.0022694339441094087, "loss": 2.43, "step": 19039 }, { "crossentropy": 2.447030544281006, "epoch": 0.6902552204176334, "grad_norm": 0.0269946176558733, "grad_norm_var": 5.096103038975651e-07, "learning_rate": 0.002268947178752732, "loss": 2.5639, "step": 19040 }, { "crossentropy": 2.4196279048919678, "epoch": 0.6902914733178654, "grad_norm": 0.02526940405368805, "grad_norm_var": 6.320732223151956e-07, "learning_rate": 0.0022684604502830642, "loss": 2.4694, "step": 19041 }, { "crossentropy": 2.428941249847412, "epoch": 0.6903277262180975, "grad_norm": 0.025354109704494476, "grad_norm_var": 7.63119626665439e-07, "learning_rate": 0.002267973758706984, "loss": 2.4449, "step": 19042 }, { "crossentropy": 2.390529155731201, "epoch": 0.6903639791183295, "grad_norm": 0.026473291218280792, "grad_norm_var": 6.216458574646721e-07, "learning_rate": 0.0022674871040310653, "loss": 2.4345, "step": 19043 }, { "crossentropy": 2.5890684127807617, "epoch": 0.6904002320185615, "grad_norm": 0.02586578018963337, "grad_norm_var": 6.218153489046524e-07, "learning_rate": 0.0022670004862618783, "loss": 2.5804, "step": 19044 }, { "crossentropy": 2.4149415493011475, "epoch": 0.6904364849187935, "grad_norm": 0.025805413722991943, "grad_norm_var": 6.463518743995052e-07, "learning_rate": 0.0022665139054059984, "loss": 2.4961, "step": 19045 }, { "crossentropy": 2.5042083263397217, "epoch": 0.6904727378190255, "grad_norm": 0.025308825075626373, "grad_norm_var": 7.346480267337995e-07, "learning_rate": 0.0022660273614699957, "loss": 2.5267, "step": 19046 }, { "crossentropy": 2.5853781700134277, "epoch": 0.6905089907192575, "grad_norm": 0.027051309123635292, "grad_norm_var": 7.220650355151588e-07, "learning_rate": 0.0022655408544604416, "loss": 2.5207, "step": 19047 }, { "crossentropy": 2.437722682952881, "epoch": 0.6905452436194895, "grad_norm": 0.029954630881547928, "grad_norm_var": 1.4187043385402905e-06, "learning_rate": 0.00226505438438391, "loss": 2.4696, "step": 19048 }, { "crossentropy": 2.5851571559906006, "epoch": 0.6905814965197216, "grad_norm": 0.025554006919264793, "grad_norm_var": 1.4251796147282062e-06, "learning_rate": 0.0022645679512469695, "loss": 2.433, "step": 19049 }, { "crossentropy": 2.5127131938934326, "epoch": 0.6906177494199536, "grad_norm": 0.026862233877182007, "grad_norm_var": 1.4188837733591456e-06, "learning_rate": 0.0022640815550561883, "loss": 2.4671, "step": 19050 }, { "crossentropy": 2.5234005451202393, "epoch": 0.6906540023201856, "grad_norm": 0.02644822560250759, "grad_norm_var": 1.3995164777036064e-06, "learning_rate": 0.0022635951958181392, "loss": 2.4727, "step": 19051 }, { "crossentropy": 2.48451566696167, "epoch": 0.6906902552204176, "grad_norm": 0.0260452963411808, "grad_norm_var": 1.3831040290253655e-06, "learning_rate": 0.0022631088735393877, "loss": 2.4808, "step": 19052 }, { "crossentropy": 2.6232314109802246, "epoch": 0.6907265081206496, "grad_norm": 0.02581779845058918, "grad_norm_var": 1.4053254563989662e-06, "learning_rate": 0.0022626225882265067, "loss": 2.5223, "step": 19053 }, { "crossentropy": 2.444415807723999, "epoch": 0.6907627610208816, "grad_norm": 0.026094503700733185, "grad_norm_var": 1.4005085746858493e-06, "learning_rate": 0.0022621363398860595, "loss": 2.5057, "step": 19054 }, { "crossentropy": 2.5609169006347656, "epoch": 0.6907990139211136, "grad_norm": 0.02664322592318058, "grad_norm_var": 1.2748852181353042e-06, "learning_rate": 0.002261650128524616, "loss": 2.5346, "step": 19055 }, { "crossentropy": 2.5374741554260254, "epoch": 0.6908352668213457, "grad_norm": 0.025827951729297638, "grad_norm_var": 1.2591234097594195e-06, "learning_rate": 0.0022611639541487445, "loss": 2.5214, "step": 19056 }, { "crossentropy": 2.56807804107666, "epoch": 0.6908715197215777, "grad_norm": 0.0263974666595459, "grad_norm_var": 1.1876317642331783e-06, "learning_rate": 0.0022606778167650107, "loss": 2.5204, "step": 19057 }, { "crossentropy": 2.4782822132110596, "epoch": 0.6909077726218097, "grad_norm": 0.026461264118552208, "grad_norm_var": 1.1181148695369685e-06, "learning_rate": 0.002260191716379978, "loss": 2.5018, "step": 19058 }, { "crossentropy": 2.338387966156006, "epoch": 0.6909440255220418, "grad_norm": 0.02864868752658367, "grad_norm_var": 1.4313158901340754e-06, "learning_rate": 0.002259705653000216, "loss": 2.4391, "step": 19059 }, { "crossentropy": 2.5674564838409424, "epoch": 0.6909802784222738, "grad_norm": 0.026067104190587997, "grad_norm_var": 1.4155048983094705e-06, "learning_rate": 0.002259219626632285, "loss": 2.5467, "step": 19060 }, { "crossentropy": 2.3138339519500732, "epoch": 0.6910165313225058, "grad_norm": 0.026185091584920883, "grad_norm_var": 1.3862262288428003e-06, "learning_rate": 0.002258733637282755, "loss": 2.4144, "step": 19061 }, { "crossentropy": 2.508633852005005, "epoch": 0.6910527842227379, "grad_norm": 0.026922758668661118, "grad_norm_var": 1.2743010296615512e-06, "learning_rate": 0.002258247684958185, "loss": 2.4834, "step": 19062 }, { "crossentropy": 2.6259565353393555, "epoch": 0.6910890371229699, "grad_norm": 0.026399411261081696, "grad_norm_var": 1.269139320253058e-06, "learning_rate": 0.0022577617696651407, "loss": 2.5436, "step": 19063 }, { "crossentropy": 2.4893455505371094, "epoch": 0.6911252900232019, "grad_norm": 0.02701890841126442, "grad_norm_var": 5.125420349384272e-07, "learning_rate": 0.002257275891410188, "loss": 2.3885, "step": 19064 }, { "crossentropy": 2.582883834838867, "epoch": 0.6911615429234339, "grad_norm": 0.02649788185954094, "grad_norm_var": 4.53937137179329e-07, "learning_rate": 0.0022567900501998836, "loss": 2.5161, "step": 19065 }, { "crossentropy": 2.529181957244873, "epoch": 0.6911977958236659, "grad_norm": 0.026099590584635735, "grad_norm_var": 4.5560155891062836e-07, "learning_rate": 0.0022563042460407947, "loss": 2.4782, "step": 19066 }, { "crossentropy": 2.769962787628174, "epoch": 0.6912340487238979, "grad_norm": 0.026112161576747894, "grad_norm_var": 4.6379042308943424e-07, "learning_rate": 0.002255818478939481, "loss": 2.5895, "step": 19067 }, { "crossentropy": 2.4976203441619873, "epoch": 0.69127030162413, "grad_norm": 0.027696974575519562, "grad_norm_var": 5.446294142338043e-07, "learning_rate": 0.002255332748902501, "loss": 2.5173, "step": 19068 }, { "crossentropy": 2.4646809101104736, "epoch": 0.691306554524362, "grad_norm": 0.02603599801659584, "grad_norm_var": 5.261378940565308e-07, "learning_rate": 0.002254847055936419, "loss": 2.4386, "step": 19069 }, { "crossentropy": 2.462777853012085, "epoch": 0.691342807424594, "grad_norm": 0.02605966478586197, "grad_norm_var": 5.284193241819034e-07, "learning_rate": 0.002254361400047792, "loss": 2.5338, "step": 19070 }, { "crossentropy": 2.379988193511963, "epoch": 0.691379060324826, "grad_norm": 0.026394372805953026, "grad_norm_var": 5.297650494254698e-07, "learning_rate": 0.002253875781243181, "loss": 2.4648, "step": 19071 }, { "crossentropy": 2.409423589706421, "epoch": 0.691415313225058, "grad_norm": 0.026370741426944733, "grad_norm_var": 4.958084065890175e-07, "learning_rate": 0.0022533901995291468, "loss": 2.4504, "step": 19072 }, { "crossentropy": 2.529101848602295, "epoch": 0.69145156612529, "grad_norm": 0.027790173888206482, "grad_norm_var": 5.821178707646664e-07, "learning_rate": 0.002252904654912244, "loss": 2.6065, "step": 19073 }, { "crossentropy": 2.510174512863159, "epoch": 0.691487819025522, "grad_norm": 0.02613929659128189, "grad_norm_var": 5.976670666858838e-07, "learning_rate": 0.0022524191473990347, "loss": 2.54, "step": 19074 }, { "crossentropy": 2.476560115814209, "epoch": 0.691524071925754, "grad_norm": 0.026926189661026, "grad_norm_var": 3.246303604370962e-07, "learning_rate": 0.0022519336769960745, "loss": 2.4499, "step": 19075 }, { "crossentropy": 2.618377923965454, "epoch": 0.6915603248259861, "grad_norm": 0.027298208326101303, "grad_norm_var": 3.4094893482609217e-07, "learning_rate": 0.002251448243709918, "loss": 2.5522, "step": 19076 }, { "crossentropy": 2.5105838775634766, "epoch": 0.6915965777262181, "grad_norm": 0.027856143191456795, "grad_norm_var": 4.181922980463996e-07, "learning_rate": 0.0022509628475471267, "loss": 2.5438, "step": 19077 }, { "crossentropy": 2.431673526763916, "epoch": 0.6916328306264501, "grad_norm": 0.026414088904857635, "grad_norm_var": 4.2102965867063006e-07, "learning_rate": 0.0022504774885142515, "loss": 2.4957, "step": 19078 }, { "crossentropy": 2.4416747093200684, "epoch": 0.6916690835266821, "grad_norm": 0.025976503267884254, "grad_norm_var": 4.488395090559057e-07, "learning_rate": 0.0022499921666178507, "loss": 2.5206, "step": 19079 }, { "crossentropy": 2.403200626373291, "epoch": 0.6917053364269141, "grad_norm": 0.02651357837021351, "grad_norm_var": 4.4115149614395476e-07, "learning_rate": 0.0022495068818644804, "loss": 2.3894, "step": 19080 }, { "crossentropy": 2.5426669120788574, "epoch": 0.6917415893271461, "grad_norm": 0.025834960862994194, "grad_norm_var": 4.808569589421589e-07, "learning_rate": 0.0022490216342606925, "loss": 2.5357, "step": 19081 }, { "crossentropy": 2.4600677490234375, "epoch": 0.6917778422273781, "grad_norm": 0.0263275895267725, "grad_norm_var": 4.690481234419115e-07, "learning_rate": 0.0022485364238130435, "loss": 2.419, "step": 19082 }, { "crossentropy": 2.5739994049072266, "epoch": 0.6918140951276102, "grad_norm": 0.02587040141224861, "grad_norm_var": 4.887218842362928e-07, "learning_rate": 0.0022480512505280857, "loss": 2.5142, "step": 19083 }, { "crossentropy": 2.5376088619232178, "epoch": 0.6918503480278422, "grad_norm": 0.02706088125705719, "grad_norm_var": 4.2046903031787525e-07, "learning_rate": 0.00224756611441237, "loss": 2.4838, "step": 19084 }, { "crossentropy": 2.3458800315856934, "epoch": 0.6918866009280742, "grad_norm": 0.026481300592422485, "grad_norm_var": 4.020889634752274e-07, "learning_rate": 0.002247081015472453, "loss": 2.4258, "step": 19085 }, { "crossentropy": 2.4851248264312744, "epoch": 0.6919228538283063, "grad_norm": 0.02583904378116131, "grad_norm_var": 4.2049999927697086e-07, "learning_rate": 0.0022465959537148828, "loss": 2.492, "step": 19086 }, { "crossentropy": 2.4108333587646484, "epoch": 0.6919591067285383, "grad_norm": 0.029478242620825768, "grad_norm_var": 9.433576531686494e-07, "learning_rate": 0.002246110929146212, "loss": 2.4994, "step": 19087 }, { "crossentropy": 2.491543769836426, "epoch": 0.6919953596287703, "grad_norm": 0.02745603770017624, "grad_norm_var": 9.604894274500233e-07, "learning_rate": 0.0022456259417729953, "loss": 2.5201, "step": 19088 }, { "crossentropy": 2.4166853427886963, "epoch": 0.6920316125290024, "grad_norm": 0.026861578226089478, "grad_norm_var": 8.953664349127851e-07, "learning_rate": 0.0022451409916017773, "loss": 2.5229, "step": 19089 }, { "crossentropy": 2.4513299465179443, "epoch": 0.6920678654292344, "grad_norm": 0.027744488790631294, "grad_norm_var": 9.212320025717494e-07, "learning_rate": 0.0022446560786391133, "loss": 2.4949, "step": 19090 }, { "crossentropy": 2.4166111946105957, "epoch": 0.6921041183294664, "grad_norm": 0.027821116149425507, "grad_norm_var": 9.77849129718024e-07, "learning_rate": 0.00224417120289155, "loss": 2.487, "step": 19091 }, { "crossentropy": 2.4862053394317627, "epoch": 0.6921403712296984, "grad_norm": 0.02623773366212845, "grad_norm_var": 9.956685483397677e-07, "learning_rate": 0.0022436863643656353, "loss": 2.4652, "step": 19092 }, { "crossentropy": 2.426605701446533, "epoch": 0.6921766241299304, "grad_norm": 0.026286309584975243, "grad_norm_var": 9.413673406584328e-07, "learning_rate": 0.00224320156306792, "loss": 2.4549, "step": 19093 }, { "crossentropy": 2.4401369094848633, "epoch": 0.6922128770301624, "grad_norm": 0.02797842212021351, "grad_norm_var": 1.021592434280446e-06, "learning_rate": 0.002242716799004951, "loss": 2.4902, "step": 19094 }, { "crossentropy": 2.680828809738159, "epoch": 0.6922491299303944, "grad_norm": 0.027629686519503593, "grad_norm_var": 9.97548795352238e-07, "learning_rate": 0.0022422320721832744, "loss": 2.6201, "step": 19095 }, { "crossentropy": 2.5582683086395264, "epoch": 0.6922853828306265, "grad_norm": 0.026207618415355682, "grad_norm_var": 1.0217676086395066e-06, "learning_rate": 0.0022417473826094416, "loss": 2.5524, "step": 19096 }, { "crossentropy": 2.5053679943084717, "epoch": 0.6923216357308585, "grad_norm": 0.02721499651670456, "grad_norm_var": 9.365990531503502e-07, "learning_rate": 0.0022412627302899934, "loss": 2.5893, "step": 19097 }, { "crossentropy": 2.5528900623321533, "epoch": 0.6923578886310905, "grad_norm": 0.02708698809146881, "grad_norm_var": 9.01422919251151e-07, "learning_rate": 0.002240778115231481, "loss": 2.5132, "step": 19098 }, { "crossentropy": 2.5330049991607666, "epoch": 0.6923941415313225, "grad_norm": 0.026238715276122093, "grad_norm_var": 8.505769789761969e-07, "learning_rate": 0.002240293537440447, "loss": 2.4567, "step": 19099 }, { "crossentropy": 2.5590505599975586, "epoch": 0.6924303944315545, "grad_norm": 0.028163332492113113, "grad_norm_var": 9.20576431771601e-07, "learning_rate": 0.002239808996923435, "loss": 2.5984, "step": 19100 }, { "crossentropy": 2.5203044414520264, "epoch": 0.6924666473317865, "grad_norm": 0.0262856837362051, "grad_norm_var": 9.409400282896103e-07, "learning_rate": 0.002239324493686994, "loss": 2.5445, "step": 19101 }, { "crossentropy": 2.598430871963501, "epoch": 0.6925029002320185, "grad_norm": 0.02685348130762577, "grad_norm_var": 8.26841068288231e-07, "learning_rate": 0.0022388400277376626, "loss": 2.5678, "step": 19102 }, { "crossentropy": 2.6136529445648193, "epoch": 0.6925391531322506, "grad_norm": 0.027191244065761566, "grad_norm_var": 4.655913207837151e-07, "learning_rate": 0.002238355599081987, "loss": 2.5365, "step": 19103 }, { "crossentropy": 2.5511298179626465, "epoch": 0.6925754060324826, "grad_norm": 0.02667083404958248, "grad_norm_var": 4.6460887941450016e-07, "learning_rate": 0.0022378712077265116, "loss": 2.484, "step": 19104 }, { "crossentropy": 2.509969711303711, "epoch": 0.6926116589327146, "grad_norm": 0.027534956112504005, "grad_norm_var": 4.77870814256107e-07, "learning_rate": 0.0022373868536777752, "loss": 2.5771, "step": 19105 }, { "crossentropy": 2.697296619415283, "epoch": 0.6926479118329466, "grad_norm": 0.0279857125133276, "grad_norm_var": 5.031498375512259e-07, "learning_rate": 0.002236902536942324, "loss": 2.5681, "step": 19106 }, { "crossentropy": 2.417064666748047, "epoch": 0.6926841647331786, "grad_norm": 0.027099348604679108, "grad_norm_var": 4.650298579466124e-07, "learning_rate": 0.0022364182575266957, "loss": 2.4767, "step": 19107 }, { "crossentropy": 2.5430116653442383, "epoch": 0.6927204176334106, "grad_norm": 0.026138167828321457, "grad_norm_var": 4.7632068010994694e-07, "learning_rate": 0.0022359340154374336, "loss": 2.5686, "step": 19108 }, { "crossentropy": 2.5633275508880615, "epoch": 0.6927566705336426, "grad_norm": 0.02542935311794281, "grad_norm_var": 6.078043490758665e-07, "learning_rate": 0.002235449810681078, "loss": 2.4841, "step": 19109 }, { "crossentropy": 2.4938135147094727, "epoch": 0.6927929234338747, "grad_norm": 0.027378186583518982, "grad_norm_var": 5.505596526959027e-07, "learning_rate": 0.0022349656432641657, "loss": 2.4787, "step": 19110 }, { "crossentropy": 2.414332628250122, "epoch": 0.6928291763341067, "grad_norm": 0.02572876587510109, "grad_norm_var": 6.02680136497132e-07, "learning_rate": 0.0022344815131932387, "loss": 2.3728, "step": 19111 }, { "crossentropy": 2.7761611938476562, "epoch": 0.6928654292343387, "grad_norm": 0.025951238349080086, "grad_norm_var": 6.279086623640067e-07, "learning_rate": 0.0022339974204748374, "loss": 2.6118, "step": 19112 }, { "crossentropy": 2.5604159832000732, "epoch": 0.6929016821345708, "grad_norm": 0.026706669479608536, "grad_norm_var": 6.165709066075492e-07, "learning_rate": 0.002233513365115497, "loss": 2.5895, "step": 19113 }, { "crossentropy": 2.5577900409698486, "epoch": 0.6929379350348028, "grad_norm": 0.025452662259340286, "grad_norm_var": 7.161055957684842e-07, "learning_rate": 0.002233029347121759, "loss": 2.532, "step": 19114 }, { "crossentropy": 2.6321048736572266, "epoch": 0.6929741879350348, "grad_norm": 0.025918569415807724, "grad_norm_var": 7.411570125001598e-07, "learning_rate": 0.002232545366500156, "loss": 2.5333, "step": 19115 }, { "crossentropy": 2.678341865539551, "epoch": 0.6930104408352669, "grad_norm": 0.028771227225661278, "grad_norm_var": 8.864657612547237e-07, "learning_rate": 0.00223206142325723, "loss": 2.6465, "step": 19116 }, { "crossentropy": 2.5006864070892334, "epoch": 0.6930466937354989, "grad_norm": 0.025550508871674538, "grad_norm_var": 9.602220093994726e-07, "learning_rate": 0.002231577517399515, "loss": 2.5223, "step": 19117 }, { "crossentropy": 2.574397325515747, "epoch": 0.6930829466357309, "grad_norm": 0.027294442057609558, "grad_norm_var": 9.844821297616892e-07, "learning_rate": 0.002231093648933545, "loss": 2.5431, "step": 19118 }, { "crossentropy": 2.5173256397247314, "epoch": 0.6931191995359629, "grad_norm": 0.02567329630255699, "grad_norm_var": 1.0240321269408547e-06, "learning_rate": 0.002230609817865857, "loss": 2.5349, "step": 19119 }, { "crossentropy": 2.459951400756836, "epoch": 0.6931554524361949, "grad_norm": 0.028447266668081284, "grad_norm_var": 1.2427206162532542e-06, "learning_rate": 0.002230126024202988, "loss": 2.4996, "step": 19120 }, { "crossentropy": 2.527974843978882, "epoch": 0.6931917053364269, "grad_norm": 0.02890286222100258, "grad_norm_var": 1.5135457619053216e-06, "learning_rate": 0.0022296422679514683, "loss": 2.5148, "step": 19121 }, { "crossentropy": 2.5440549850463867, "epoch": 0.693227958236659, "grad_norm": 0.026408882811665535, "grad_norm_var": 1.4147718343505862e-06, "learning_rate": 0.002229158549117836, "loss": 2.6335, "step": 19122 }, { "crossentropy": 2.4904894828796387, "epoch": 0.693264211136891, "grad_norm": 0.027050599455833435, "grad_norm_var": 1.4121830467995916e-06, "learning_rate": 0.002228674867708621, "loss": 2.4824, "step": 19123 }, { "crossentropy": 2.586533546447754, "epoch": 0.693300464037123, "grad_norm": 0.026552699506282806, "grad_norm_var": 1.3932423148205198e-06, "learning_rate": 0.0022281912237303594, "loss": 2.5311, "step": 19124 }, { "crossentropy": 2.6112136840820312, "epoch": 0.693336716937355, "grad_norm": 0.026822946965694427, "grad_norm_var": 1.2783216135863842e-06, "learning_rate": 0.002227707617189581, "loss": 2.6412, "step": 19125 }, { "crossentropy": 2.5922558307647705, "epoch": 0.693372969837587, "grad_norm": 0.026257460936903954, "grad_norm_var": 1.2686579798276553e-06, "learning_rate": 0.002227224048092817, "loss": 2.5201, "step": 19126 }, { "crossentropy": 2.616875648498535, "epoch": 0.693409222737819, "grad_norm": 0.026926105841994286, "grad_norm_var": 1.2003118723462788e-06, "learning_rate": 0.0022267405164466, "loss": 2.6727, "step": 19127 }, { "crossentropy": 2.3065147399902344, "epoch": 0.693445475638051, "grad_norm": 0.026564044877886772, "grad_norm_var": 1.1550072091650575e-06, "learning_rate": 0.0022262570222574625, "loss": 2.324, "step": 19128 }, { "crossentropy": 2.4934473037719727, "epoch": 0.693481728538283, "grad_norm": 0.027061251923441887, "grad_norm_var": 1.1569746544873854e-06, "learning_rate": 0.0022257735655319315, "loss": 2.4728, "step": 19129 }, { "crossentropy": 2.3616092205047607, "epoch": 0.6935179814385151, "grad_norm": 0.02725510485470295, "grad_norm_var": 1.0233849433562957e-06, "learning_rate": 0.002225290146276541, "loss": 2.4084, "step": 19130 }, { "crossentropy": 2.569324016571045, "epoch": 0.6935542343387471, "grad_norm": 0.02636014297604561, "grad_norm_var": 9.738979505489903e-07, "learning_rate": 0.0022248067644978146, "loss": 2.5292, "step": 19131 }, { "crossentropy": 2.591862440109253, "epoch": 0.6935904872389791, "grad_norm": 0.02568880282342434, "grad_norm_var": 8.371767825203376e-07, "learning_rate": 0.0022243234202022856, "loss": 2.5689, "step": 19132 }, { "crossentropy": 2.6055641174316406, "epoch": 0.6936267401392111, "grad_norm": 0.02683275006711483, "grad_norm_var": 7.26140377650906e-07, "learning_rate": 0.002223840113396484, "loss": 2.6006, "step": 19133 }, { "crossentropy": 2.6087000370025635, "epoch": 0.6936629930394431, "grad_norm": 0.02719874307513237, "grad_norm_var": 7.214394280992294e-07, "learning_rate": 0.0022233568440869315, "loss": 2.5545, "step": 19134 }, { "crossentropy": 2.3827273845672607, "epoch": 0.6936992459396751, "grad_norm": 0.02639084868133068, "grad_norm_var": 6.386304296141967e-07, "learning_rate": 0.0022228736122801586, "loss": 2.3894, "step": 19135 }, { "crossentropy": 2.681459665298462, "epoch": 0.6937354988399071, "grad_norm": 0.02796678990125656, "grad_norm_var": 5.552189531828847e-07, "learning_rate": 0.002222390417982693, "loss": 2.566, "step": 19136 }, { "crossentropy": 2.3616080284118652, "epoch": 0.6937717517401392, "grad_norm": 0.028178537264466286, "grad_norm_var": 3.93614079099297e-07, "learning_rate": 0.002221907261201059, "loss": 2.4135, "step": 19137 }, { "crossentropy": 2.6226298809051514, "epoch": 0.6938080046403712, "grad_norm": 0.0267848651856184, "grad_norm_var": 3.8059970247448365e-07, "learning_rate": 0.002221424141941785, "loss": 2.5491, "step": 19138 }, { "crossentropy": 2.4574286937713623, "epoch": 0.6938442575406032, "grad_norm": 0.02705087698996067, "grad_norm_var": 3.8060645575510067e-07, "learning_rate": 0.002220941060211393, "loss": 2.5107, "step": 19139 }, { "crossentropy": 2.4961657524108887, "epoch": 0.6938805104408353, "grad_norm": 0.026085112243890762, "grad_norm_var": 4.1394419718053003e-07, "learning_rate": 0.0022204580160164085, "loss": 2.4762, "step": 19140 }, { "crossentropy": 2.497587203979492, "epoch": 0.6939167633410673, "grad_norm": 0.026411019265651703, "grad_norm_var": 4.254324862515048e-07, "learning_rate": 0.0022199750093633607, "loss": 2.4614, "step": 19141 }, { "crossentropy": 2.3890137672424316, "epoch": 0.6939530162412993, "grad_norm": 0.026185592636466026, "grad_norm_var": 4.310813910655588e-07, "learning_rate": 0.002219492040258766, "loss": 2.4075, "step": 19142 }, { "crossentropy": 2.4894649982452393, "epoch": 0.6939892691415314, "grad_norm": 0.026494339108467102, "grad_norm_var": 4.359788694484219e-07, "learning_rate": 0.0022190091087091503, "loss": 2.5061, "step": 19143 }, { "crossentropy": 2.5812253952026367, "epoch": 0.6940255220417634, "grad_norm": 0.026270495727658272, "grad_norm_var": 4.4988752774186014e-07, "learning_rate": 0.002218526214721039, "loss": 2.599, "step": 19144 }, { "crossentropy": 2.426208257675171, "epoch": 0.6940617749419954, "grad_norm": 0.02635440230369568, "grad_norm_var": 4.530484066658415e-07, "learning_rate": 0.0022180433583009504, "loss": 2.4975, "step": 19145 }, { "crossentropy": 2.451904535293579, "epoch": 0.6940980278422274, "grad_norm": 0.02629234455525875, "grad_norm_var": 4.421968759909933e-07, "learning_rate": 0.0022175605394554095, "loss": 2.4018, "step": 19146 }, { "crossentropy": 2.48738694190979, "epoch": 0.6941342807424594, "grad_norm": 0.02619888074696064, "grad_norm_var": 4.5025036802809106e-07, "learning_rate": 0.0022170777581909345, "loss": 2.5408, "step": 19147 }, { "crossentropy": 2.413902759552002, "epoch": 0.6941705336426914, "grad_norm": 0.02688675746321678, "grad_norm_var": 3.865701381571777e-07, "learning_rate": 0.0022165950145140475, "loss": 2.4587, "step": 19148 }, { "crossentropy": 2.4377667903900146, "epoch": 0.6942067865429234, "grad_norm": 0.02670874260365963, "grad_norm_var": 3.8573144480213464e-07, "learning_rate": 0.0022161123084312707, "loss": 2.5076, "step": 19149 }, { "crossentropy": 2.337955951690674, "epoch": 0.6942430394431555, "grad_norm": 0.028220755979418755, "grad_norm_var": 5.167759683542711e-07, "learning_rate": 0.0022156296399491217, "loss": 2.4384, "step": 19150 }, { "crossentropy": 2.6322100162506104, "epoch": 0.6942792923433875, "grad_norm": 0.026799270883202553, "grad_norm_var": 5.060085458303646e-07, "learning_rate": 0.0022151470090741184, "loss": 2.5291, "step": 19151 }, { "crossentropy": 2.6526126861572266, "epoch": 0.6943155452436195, "grad_norm": 0.027611905708909035, "grad_norm_var": 4.589324949140812e-07, "learning_rate": 0.0022146644158127825, "loss": 2.53, "step": 19152 }, { "crossentropy": 2.4176039695739746, "epoch": 0.6943517981438515, "grad_norm": 0.025744978338479996, "grad_norm_var": 3.763738943770555e-07, "learning_rate": 0.0022141818601716283, "loss": 2.527, "step": 19153 }, { "crossentropy": 2.67696213722229, "epoch": 0.6943880510440835, "grad_norm": 0.027854392305016518, "grad_norm_var": 4.6976996348780757e-07, "learning_rate": 0.0022136993421571784, "loss": 2.7263, "step": 19154 }, { "crossentropy": 2.3863399028778076, "epoch": 0.6944243039443155, "grad_norm": 0.026270348578691483, "grad_norm_var": 4.7113457500139584e-07, "learning_rate": 0.0022132168617759454, "loss": 2.3619, "step": 19155 }, { "crossentropy": 2.577397108078003, "epoch": 0.6944605568445475, "grad_norm": 0.026116808876395226, "grad_norm_var": 4.688128447958087e-07, "learning_rate": 0.0022127344190344465, "loss": 2.5367, "step": 19156 }, { "crossentropy": 2.620180606842041, "epoch": 0.6944968097447796, "grad_norm": 0.02653375267982483, "grad_norm_var": 4.6582200922632573e-07, "learning_rate": 0.0022122520139392023, "loss": 2.4968, "step": 19157 }, { "crossentropy": 2.547403573989868, "epoch": 0.6945330626450116, "grad_norm": 0.02854730561375618, "grad_norm_var": 6.653584352831008e-07, "learning_rate": 0.002211769646496725, "loss": 2.5246, "step": 19158 }, { "crossentropy": 2.4813621044158936, "epoch": 0.6945693155452436, "grad_norm": 0.027106687426567078, "grad_norm_var": 6.632997031358421e-07, "learning_rate": 0.002211287316713527, "loss": 2.5499, "step": 19159 }, { "crossentropy": 2.5857269763946533, "epoch": 0.6946055684454756, "grad_norm": 0.026349693536758423, "grad_norm_var": 6.576265564529892e-07, "learning_rate": 0.0022108050245961287, "loss": 2.5698, "step": 19160 }, { "crossentropy": 2.4563493728637695, "epoch": 0.6946418213457076, "grad_norm": 0.025911197066307068, "grad_norm_var": 6.991793743635807e-07, "learning_rate": 0.002210322770151039, "loss": 2.4832, "step": 19161 }, { "crossentropy": 2.4262137413024902, "epoch": 0.6946780742459396, "grad_norm": 0.0271727554500103, "grad_norm_var": 6.854359565720229e-07, "learning_rate": 0.002209840553384776, "loss": 2.4678, "step": 19162 }, { "crossentropy": 2.673029661178589, "epoch": 0.6947143271461717, "grad_norm": 0.02829729951918125, "grad_norm_var": 7.708765652567611e-07, "learning_rate": 0.0022093583743038486, "loss": 2.6432, "step": 19163 }, { "crossentropy": 2.63187313079834, "epoch": 0.6947505800464037, "grad_norm": 0.027123626321554184, "grad_norm_var": 7.705449169164893e-07, "learning_rate": 0.002208876232914771, "loss": 2.6147, "step": 19164 }, { "crossentropy": 2.3795254230499268, "epoch": 0.6947868329466357, "grad_norm": 0.02720271237194538, "grad_norm_var": 7.650912191269384e-07, "learning_rate": 0.002208394129224058, "loss": 2.5031, "step": 19165 }, { "crossentropy": 2.6370298862457275, "epoch": 0.6948230858468677, "grad_norm": 0.025706259533762932, "grad_norm_var": 7.69074984621508e-07, "learning_rate": 0.0022079120632382183, "loss": 2.5673, "step": 19166 }, { "crossentropy": 2.632599115371704, "epoch": 0.6948593387470998, "grad_norm": 0.026398498564958572, "grad_norm_var": 7.843258824625938e-07, "learning_rate": 0.002207430034963761, "loss": 2.5883, "step": 19167 }, { "crossentropy": 2.529829502105713, "epoch": 0.6948955916473318, "grad_norm": 0.027517259120941162, "grad_norm_var": 7.755455026903079e-07, "learning_rate": 0.0022069480444072017, "loss": 2.5826, "step": 19168 }, { "crossentropy": 2.3345208168029785, "epoch": 0.6949318445475638, "grad_norm": 0.02933528460562229, "grad_norm_var": 1.0446202613444144e-06, "learning_rate": 0.002206466091575045, "loss": 2.4444, "step": 19169 }, { "crossentropy": 2.675480604171753, "epoch": 0.6949680974477959, "grad_norm": 0.02615673467516899, "grad_norm_var": 1.051779239923907e-06, "learning_rate": 0.002205984176473806, "loss": 2.5854, "step": 19170 }, { "crossentropy": 2.5029172897338867, "epoch": 0.6950043503480279, "grad_norm": 0.02731209062039852, "grad_norm_var": 1.020461163410829e-06, "learning_rate": 0.002205502299109988, "loss": 2.4483, "step": 19171 }, { "crossentropy": 2.362109422683716, "epoch": 0.6950406032482599, "grad_norm": 0.027930302545428276, "grad_norm_var": 1.000545656377969e-06, "learning_rate": 0.0022050204594901025, "loss": 2.4541, "step": 19172 }, { "crossentropy": 2.579049587249756, "epoch": 0.6950768561484919, "grad_norm": 0.02575874514877796, "grad_norm_var": 1.1030660632318218e-06, "learning_rate": 0.0022045386576206587, "loss": 2.5409, "step": 19173 }, { "crossentropy": 2.5603246688842773, "epoch": 0.6951131090487239, "grad_norm": 0.027088144794106483, "grad_norm_var": 9.5731132126736e-07, "learning_rate": 0.0022040568935081614, "loss": 2.4801, "step": 19174 }, { "crossentropy": 2.5472755432128906, "epoch": 0.6951493619489559, "grad_norm": 0.026949873194098473, "grad_norm_var": 9.570975301777107e-07, "learning_rate": 0.0022035751671591205, "loss": 2.4954, "step": 19175 }, { "crossentropy": 2.58256459236145, "epoch": 0.695185614849188, "grad_norm": 0.028719572350382805, "grad_norm_var": 1.098474835463381e-06, "learning_rate": 0.0022030934785800403, "loss": 2.6224, "step": 19176 }, { "crossentropy": 2.449538230895996, "epoch": 0.69522186774942, "grad_norm": 0.02879061922430992, "grad_norm_var": 1.1367342782832185e-06, "learning_rate": 0.0022026118277774253, "loss": 2.512, "step": 19177 }, { "crossentropy": 2.4831109046936035, "epoch": 0.695258120649652, "grad_norm": 0.026597779244184494, "grad_norm_var": 1.1703129480418345e-06, "learning_rate": 0.002202130214757785, "loss": 2.4317, "step": 19178 }, { "crossentropy": 2.4727182388305664, "epoch": 0.695294373549884, "grad_norm": 0.028109999373555183, "grad_norm_var": 1.1477319810196744e-06, "learning_rate": 0.00220164863952762, "loss": 2.5033, "step": 19179 }, { "crossentropy": 2.4973292350769043, "epoch": 0.695330626450116, "grad_norm": 0.025700241327285767, "grad_norm_var": 1.3066157652973088e-06, "learning_rate": 0.002201167102093437, "loss": 2.527, "step": 19180 }, { "crossentropy": 2.5136890411376953, "epoch": 0.695366879350348, "grad_norm": 0.02644299902021885, "grad_norm_var": 1.3428830175118978e-06, "learning_rate": 0.002200685602461741, "loss": 2.5315, "step": 19181 }, { "crossentropy": 2.5327253341674805, "epoch": 0.69540313225058, "grad_norm": 0.027998512610793114, "grad_norm_var": 1.2278433794664367e-06, "learning_rate": 0.002200204140639033, "loss": 2.575, "step": 19182 }, { "crossentropy": 2.554853677749634, "epoch": 0.695439385150812, "grad_norm": 0.025788389146327972, "grad_norm_var": 1.3244770851146444e-06, "learning_rate": 0.002199722716631818, "loss": 2.484, "step": 19183 }, { "crossentropy": 2.527777910232544, "epoch": 0.6954756380510441, "grad_norm": 0.027535652741789818, "grad_norm_var": 1.3251235520784447e-06, "learning_rate": 0.0021992413304465974, "loss": 2.4863, "step": 19184 }, { "crossentropy": 2.5228586196899414, "epoch": 0.6955118909512761, "grad_norm": 0.027066022157669067, "grad_norm_var": 1.0200941215318208e-06, "learning_rate": 0.0021987599820898714, "loss": 2.5409, "step": 19185 }, { "crossentropy": 2.5211715698242188, "epoch": 0.6955481438515081, "grad_norm": 0.026881538331508636, "grad_norm_var": 9.59682365245645e-07, "learning_rate": 0.0021982786715681445, "loss": 2.5601, "step": 19186 }, { "crossentropy": 2.5777034759521484, "epoch": 0.6955843967517401, "grad_norm": 0.025943662971258163, "grad_norm_var": 1.050229389928806e-06, "learning_rate": 0.002197797398887914, "loss": 2.5365, "step": 19187 }, { "crossentropy": 2.496007204055786, "epoch": 0.6956206496519721, "grad_norm": 0.025889821350574493, "grad_norm_var": 1.0794902714290783e-06, "learning_rate": 0.0021973161640556826, "loss": 2.4206, "step": 19188 }, { "crossentropy": 2.5941996574401855, "epoch": 0.6956569025522041, "grad_norm": 0.026131536811590195, "grad_norm_var": 1.0287728568765387e-06, "learning_rate": 0.0021968349670779506, "loss": 2.5758, "step": 19189 }, { "crossentropy": 2.5468013286590576, "epoch": 0.6956931554524362, "grad_norm": 0.026664460077881813, "grad_norm_var": 1.0337217885250516e-06, "learning_rate": 0.0021963538079612156, "loss": 2.4902, "step": 19190 }, { "crossentropy": 2.4876773357391357, "epoch": 0.6957294083526682, "grad_norm": 0.02604837343096733, "grad_norm_var": 1.0846111275020567e-06, "learning_rate": 0.0021958726867119783, "loss": 2.5091, "step": 19191 }, { "crossentropy": 2.409968376159668, "epoch": 0.6957656612529002, "grad_norm": 0.02624480240046978, "grad_norm_var": 8.65115473429728e-07, "learning_rate": 0.0021953916033367357, "loss": 2.5056, "step": 19192 }, { "crossentropy": 2.5505337715148926, "epoch": 0.6958019141531323, "grad_norm": 0.025149356573820114, "grad_norm_var": 6.980413713335182e-07, "learning_rate": 0.0021949105578419835, "loss": 2.5376, "step": 19193 }, { "crossentropy": 2.405697822570801, "epoch": 0.6958381670533643, "grad_norm": 0.025978054851293564, "grad_norm_var": 7.149630110021728e-07, "learning_rate": 0.0021944295502342235, "loss": 2.509, "step": 19194 }, { "crossentropy": 2.439079999923706, "epoch": 0.6958744199535963, "grad_norm": 0.026339862495660782, "grad_norm_var": 5.245178164246886e-07, "learning_rate": 0.0021939485805199483, "loss": 2.4375, "step": 19195 }, { "crossentropy": 2.4944145679473877, "epoch": 0.6959106728538283, "grad_norm": 0.027298513799905777, "grad_norm_var": 5.429994199451868e-07, "learning_rate": 0.0021934676487056553, "loss": 2.4647, "step": 19196 }, { "crossentropy": 2.385343313217163, "epoch": 0.6959469257540604, "grad_norm": 0.02686314284801483, "grad_norm_var": 5.529340857892325e-07, "learning_rate": 0.0021929867547978432, "loss": 2.4196, "step": 19197 }, { "crossentropy": 2.4136087894439697, "epoch": 0.6959831786542924, "grad_norm": 0.027236059308052063, "grad_norm_var": 4.3579521179606895e-07, "learning_rate": 0.002192505898803003, "loss": 2.5556, "step": 19198 }, { "crossentropy": 2.4217588901519775, "epoch": 0.6960194315545244, "grad_norm": 0.026995934545993805, "grad_norm_var": 4.218235849005967e-07, "learning_rate": 0.0021920250807276327, "loss": 2.4098, "step": 19199 }, { "crossentropy": 2.6156299114227295, "epoch": 0.6960556844547564, "grad_norm": 0.027058564126491547, "grad_norm_var": 3.712303865129861e-07, "learning_rate": 0.0021915443005782248, "loss": 2.511, "step": 19200 }, { "crossentropy": 2.55798077583313, "epoch": 0.6960919373549884, "grad_norm": 0.027720076963305473, "grad_norm_var": 4.4847458327266535e-07, "learning_rate": 0.002191063558361271, "loss": 2.4233, "step": 19201 }, { "crossentropy": 2.5275561809539795, "epoch": 0.6961281902552204, "grad_norm": 0.02622380666434765, "grad_norm_var": 4.4448508491334857e-07, "learning_rate": 0.002190582854083269, "loss": 2.4825, "step": 19202 }, { "crossentropy": 2.545804977416992, "epoch": 0.6961644431554525, "grad_norm": 0.026515325531363487, "grad_norm_var": 4.2352435076068765e-07, "learning_rate": 0.002190102187750707, "loss": 2.5387, "step": 19203 }, { "crossentropy": 2.571202039718628, "epoch": 0.6962006960556845, "grad_norm": 0.02702266350388527, "grad_norm_var": 4.0819101104817327e-07, "learning_rate": 0.002189621559370079, "loss": 2.5265, "step": 19204 }, { "crossentropy": 2.281541347503662, "epoch": 0.6962369489559165, "grad_norm": 0.0268789641559124, "grad_norm_var": 3.97102678092416e-07, "learning_rate": 0.0021891409689478785, "loss": 2.4168, "step": 19205 }, { "crossentropy": 2.6096866130828857, "epoch": 0.6962732018561485, "grad_norm": 0.026432134211063385, "grad_norm_var": 3.997144932941024e-07, "learning_rate": 0.002188660416490593, "loss": 2.5402, "step": 19206 }, { "crossentropy": 2.3521125316619873, "epoch": 0.6963094547563805, "grad_norm": 0.026213737204670906, "grad_norm_var": 3.8870204839475986e-07, "learning_rate": 0.002188179902004717, "loss": 2.4844, "step": 19207 }, { "crossentropy": 2.5064234733581543, "epoch": 0.6963457076566125, "grad_norm": 0.026849592104554176, "grad_norm_var": 3.8004227631802054e-07, "learning_rate": 0.0021876994254967385, "loss": 2.5562, "step": 19208 }, { "crossentropy": 2.4452033042907715, "epoch": 0.6963819605568445, "grad_norm": 0.026352375745773315, "grad_norm_var": 2.2602133658484492e-07, "learning_rate": 0.002187218986973145, "loss": 2.4656, "step": 19209 }, { "crossentropy": 2.5593857765197754, "epoch": 0.6964182134570766, "grad_norm": 0.02800540253520012, "grad_norm_var": 2.7459620172130305e-07, "learning_rate": 0.0021867385864404298, "loss": 2.511, "step": 19210 }, { "crossentropy": 2.604818105697632, "epoch": 0.6964544663573086, "grad_norm": 0.02764310874044895, "grad_norm_var": 2.8769372898002075e-07, "learning_rate": 0.002186258223905077, "loss": 2.6482, "step": 19211 }, { "crossentropy": 2.5011942386627197, "epoch": 0.6964907192575406, "grad_norm": 0.02667390927672386, "grad_norm_var": 2.836219066634319e-07, "learning_rate": 0.002185777899373577, "loss": 2.58, "step": 19212 }, { "crossentropy": 2.555145263671875, "epoch": 0.6965269721577726, "grad_norm": 0.026115646585822105, "grad_norm_var": 3.2399127616020554e-07, "learning_rate": 0.0021852976128524195, "loss": 2.5375, "step": 19213 }, { "crossentropy": 2.4856512546539307, "epoch": 0.6965632250580046, "grad_norm": 0.02597203478217125, "grad_norm_var": 3.623390017270702e-07, "learning_rate": 0.002184817364348087, "loss": 2.4458, "step": 19214 }, { "crossentropy": 2.6299164295196533, "epoch": 0.6965994779582366, "grad_norm": 0.02661760337650776, "grad_norm_var": 3.610016253600852e-07, "learning_rate": 0.0021843371538670703, "loss": 2.5617, "step": 19215 }, { "crossentropy": 2.616947889328003, "epoch": 0.6966357308584686, "grad_norm": 0.02844640426337719, "grad_norm_var": 5.350701045256454e-07, "learning_rate": 0.002183856981415851, "loss": 2.4633, "step": 19216 }, { "crossentropy": 2.5061957836151123, "epoch": 0.6966719837587007, "grad_norm": 0.027193458750844002, "grad_norm_var": 4.916732127694616e-07, "learning_rate": 0.002183376847000919, "loss": 2.4829, "step": 19217 }, { "crossentropy": 2.469778537750244, "epoch": 0.6967082366589327, "grad_norm": 0.02769959345459938, "grad_norm_var": 5.100361833584287e-07, "learning_rate": 0.0021828967506287568, "loss": 2.5414, "step": 19218 }, { "crossentropy": 2.5595602989196777, "epoch": 0.6967444895591647, "grad_norm": 0.025438586249947548, "grad_norm_var": 6.398038175201478e-07, "learning_rate": 0.0021824166923058476, "loss": 2.5349, "step": 19219 }, { "crossentropy": 2.5359764099121094, "epoch": 0.6967807424593968, "grad_norm": 0.024904921650886536, "grad_norm_var": 8.705611368383636e-07, "learning_rate": 0.002181936672038676, "loss": 2.4893, "step": 19220 }, { "crossentropy": 2.4585158824920654, "epoch": 0.6968169953596288, "grad_norm": 0.02578066661953926, "grad_norm_var": 9.219182446006923e-07, "learning_rate": 0.0021814566898337292, "loss": 2.516, "step": 19221 }, { "crossentropy": 2.5085649490356445, "epoch": 0.6968532482598608, "grad_norm": 0.02561277337372303, "grad_norm_var": 9.872638722401104e-07, "learning_rate": 0.0021809767456974845, "loss": 2.4905, "step": 19222 }, { "crossentropy": 2.598111391067505, "epoch": 0.6968895011600929, "grad_norm": 0.0268190149217844, "grad_norm_var": 9.793930599087067e-07, "learning_rate": 0.002180496839636429, "loss": 2.5532, "step": 19223 }, { "crossentropy": 2.6473472118377686, "epoch": 0.6969257540603249, "grad_norm": 0.02771567367017269, "grad_norm_var": 1.0513066485971866e-06, "learning_rate": 0.00218001697165704, "loss": 2.4898, "step": 19224 }, { "crossentropy": 2.423363447189331, "epoch": 0.6969620069605569, "grad_norm": 0.02656925842165947, "grad_norm_var": 1.044571461931394e-06, "learning_rate": 0.002179537141765804, "loss": 2.3931, "step": 19225 }, { "crossentropy": 2.6481685638427734, "epoch": 0.6969982598607889, "grad_norm": 0.02700330875813961, "grad_norm_var": 9.329826366324956e-07, "learning_rate": 0.0021790573499691985, "loss": 2.5996, "step": 19226 }, { "crossentropy": 2.4583683013916016, "epoch": 0.6970345127610209, "grad_norm": 0.025915957987308502, "grad_norm_var": 8.879306821039327e-07, "learning_rate": 0.0021785775962737024, "loss": 2.4848, "step": 19227 }, { "crossentropy": 2.4722249507904053, "epoch": 0.6970707656612529, "grad_norm": 0.02607041224837303, "grad_norm_var": 8.991079068746636e-07, "learning_rate": 0.002178097880685798, "loss": 2.4655, "step": 19228 }, { "crossentropy": 2.3094613552093506, "epoch": 0.6971070185614849, "grad_norm": 0.02626117691397667, "grad_norm_var": 8.931248006202018e-07, "learning_rate": 0.0021776182032119664, "loss": 2.3793, "step": 19229 }, { "crossentropy": 2.6000194549560547, "epoch": 0.697143271461717, "grad_norm": 0.026553908362984657, "grad_norm_var": 8.732235746061138e-07, "learning_rate": 0.0021771385638586814, "loss": 2.557, "step": 19230 }, { "crossentropy": 2.5621461868286133, "epoch": 0.697179524361949, "grad_norm": 0.02591417171061039, "grad_norm_var": 8.966525532441791e-07, "learning_rate": 0.002176658962632426, "loss": 2.5303, "step": 19231 }, { "crossentropy": 2.539033889770508, "epoch": 0.697215777262181, "grad_norm": 0.026310989633202553, "grad_norm_var": 6.256760684725335e-07, "learning_rate": 0.002176179399539674, "loss": 2.5962, "step": 19232 }, { "crossentropy": 2.61108660697937, "epoch": 0.697252030162413, "grad_norm": 0.02646930143237114, "grad_norm_var": 5.780006439998209e-07, "learning_rate": 0.002175699874586905, "loss": 2.4734, "step": 19233 }, { "crossentropy": 2.6092779636383057, "epoch": 0.697288283062645, "grad_norm": 0.026030512526631355, "grad_norm_var": 4.439779950571176e-07, "learning_rate": 0.0021752203877805983, "loss": 2.5781, "step": 19234 }, { "crossentropy": 2.343761444091797, "epoch": 0.697324535962877, "grad_norm": 0.02707352675497532, "grad_norm_var": 4.427354308118714e-07, "learning_rate": 0.002174740939127224, "loss": 2.3947, "step": 19235 }, { "crossentropy": 2.4944212436676025, "epoch": 0.697360788863109, "grad_norm": 0.02655470184981823, "grad_norm_var": 3.0314371883549455e-07, "learning_rate": 0.0021742615286332602, "loss": 2.4924, "step": 19236 }, { "crossentropy": 2.5297434329986572, "epoch": 0.697397041763341, "grad_norm": 0.027191083878278732, "grad_norm_var": 3.080030784043929e-07, "learning_rate": 0.0021737821563051845, "loss": 2.4954, "step": 19237 }, { "crossentropy": 2.6526894569396973, "epoch": 0.6974332946635731, "grad_norm": 0.0283478032797575, "grad_norm_var": 4.504827709014836e-07, "learning_rate": 0.0021733028221494676, "loss": 2.6793, "step": 19238 }, { "crossentropy": 2.5601372718811035, "epoch": 0.6974695475638051, "grad_norm": 0.026303887367248535, "grad_norm_var": 4.571795160833534e-07, "learning_rate": 0.002172823526172588, "loss": 2.5916, "step": 19239 }, { "crossentropy": 2.6321020126342773, "epoch": 0.6975058004640371, "grad_norm": 0.02732160873711109, "grad_norm_var": 4.1051691821608095e-07, "learning_rate": 0.002172344268381016, "loss": 2.6384, "step": 19240 }, { "crossentropy": 2.399181365966797, "epoch": 0.6975420533642691, "grad_norm": 0.02746110036969185, "grad_norm_var": 4.5440549720301e-07, "learning_rate": 0.0021718650487812245, "loss": 2.4671, "step": 19241 }, { "crossentropy": 2.51784348487854, "epoch": 0.6975783062645011, "grad_norm": 0.026385720819234848, "grad_norm_var": 4.511241558415498e-07, "learning_rate": 0.00217138586737969, "loss": 2.4649, "step": 19242 }, { "crossentropy": 2.5436766147613525, "epoch": 0.6976145591647331, "grad_norm": 0.02753404900431633, "grad_norm_var": 4.595536893548316e-07, "learning_rate": 0.002170906724182882, "loss": 2.5026, "step": 19243 }, { "crossentropy": 2.2678959369659424, "epoch": 0.6976508120649652, "grad_norm": 0.025862587615847588, "grad_norm_var": 4.807103125569473e-07, "learning_rate": 0.0021704276191972694, "loss": 2.4382, "step": 19244 }, { "crossentropy": 2.6650562286376953, "epoch": 0.6976870649651972, "grad_norm": 0.027522770687937737, "grad_norm_var": 5.024165977097113e-07, "learning_rate": 0.0021699485524293285, "loss": 2.5973, "step": 19245 }, { "crossentropy": 2.5990898609161377, "epoch": 0.6977233178654292, "grad_norm": 0.026538221165537834, "grad_norm_var": 5.029516415024892e-07, "learning_rate": 0.0021694695238855246, "loss": 2.5954, "step": 19246 }, { "crossentropy": 2.498975992202759, "epoch": 0.6977595707656613, "grad_norm": 0.026246948167681694, "grad_norm_var": 4.7050741769684897e-07, "learning_rate": 0.0021689905335723304, "loss": 2.4826, "step": 19247 }, { "crossentropy": 2.6018614768981934, "epoch": 0.6977958236658933, "grad_norm": 0.026533059775829315, "grad_norm_var": 4.584537222268448e-07, "learning_rate": 0.0021685115814962176, "loss": 2.5421, "step": 19248 }, { "crossentropy": 2.518529176712036, "epoch": 0.6978320765661253, "grad_norm": 0.027792053297162056, "grad_norm_var": 5.031250199811927e-07, "learning_rate": 0.002168032667663651, "loss": 2.5358, "step": 19249 }, { "crossentropy": 2.712733507156372, "epoch": 0.6978683294663574, "grad_norm": 0.02809334732592106, "grad_norm_var": 5.247817963899628e-07, "learning_rate": 0.0021675537920811017, "loss": 2.6333, "step": 19250 }, { "crossentropy": 2.5878987312316895, "epoch": 0.6979045823665894, "grad_norm": 0.025498272851109505, "grad_norm_var": 6.744367758330811e-07, "learning_rate": 0.002167074954755038, "loss": 2.5331, "step": 19251 }, { "crossentropy": 2.4285430908203125, "epoch": 0.6979408352668214, "grad_norm": 0.026854636147618294, "grad_norm_var": 6.642828017102791e-07, "learning_rate": 0.0021665961556919236, "loss": 2.5267, "step": 19252 }, { "crossentropy": 2.5582544803619385, "epoch": 0.6979770881670534, "grad_norm": 0.02799702063202858, "grad_norm_var": 7.288565848829569e-07, "learning_rate": 0.00216611739489823, "loss": 2.5487, "step": 19253 }, { "crossentropy": 2.6581969261169434, "epoch": 0.6980133410672854, "grad_norm": 0.02599155716598034, "grad_norm_var": 6.581707844987202e-07, "learning_rate": 0.00216563867238042, "loss": 2.6244, "step": 19254 }, { "crossentropy": 2.267434597015381, "epoch": 0.6980495939675174, "grad_norm": 0.025486674159765244, "grad_norm_var": 7.617099429923478e-07, "learning_rate": 0.0021651599881449607, "loss": 2.3226, "step": 19255 }, { "crossentropy": 2.3774938583374023, "epoch": 0.6980858468677494, "grad_norm": 0.025806456804275513, "grad_norm_var": 8.03850454928256e-07, "learning_rate": 0.0021646813421983195, "loss": 2.4665, "step": 19256 }, { "crossentropy": 2.515794277191162, "epoch": 0.6981220997679815, "grad_norm": 0.027202680706977844, "grad_norm_var": 7.82670847855243e-07, "learning_rate": 0.002164202734546958, "loss": 2.4777, "step": 19257 }, { "crossentropy": 2.3854899406433105, "epoch": 0.6981583526682135, "grad_norm": 0.026146573945879936, "grad_norm_var": 7.965575582869385e-07, "learning_rate": 0.0021637241651973438, "loss": 2.4299, "step": 19258 }, { "crossentropy": 2.5351057052612305, "epoch": 0.6981946055684455, "grad_norm": 0.02642509900033474, "grad_norm_var": 7.492354310115971e-07, "learning_rate": 0.0021632456341559385, "loss": 2.5821, "step": 19259 }, { "crossentropy": 2.5269253253936768, "epoch": 0.6982308584686775, "grad_norm": 0.029677478596568108, "grad_norm_var": 1.271084692601671e-06, "learning_rate": 0.002162767141429205, "loss": 2.5694, "step": 19260 }, { "crossentropy": 2.4235851764678955, "epoch": 0.6982671113689095, "grad_norm": 0.02636885456740856, "grad_norm_var": 1.2528421473231246e-06, "learning_rate": 0.002162288687023608, "loss": 2.4909, "step": 19261 }, { "crossentropy": 2.361633777618408, "epoch": 0.6983033642691415, "grad_norm": 0.026584316045045853, "grad_norm_var": 1.2514202416484544e-06, "learning_rate": 0.0021618102709456068, "loss": 2.4, "step": 19262 }, { "crossentropy": 2.474503755569458, "epoch": 0.6983396171693735, "grad_norm": 0.026486888527870178, "grad_norm_var": 1.2375150863169846e-06, "learning_rate": 0.0021613318932016653, "loss": 2.3915, "step": 19263 }, { "crossentropy": 2.4909255504608154, "epoch": 0.6983758700696056, "grad_norm": 0.02707831934094429, "grad_norm_var": 1.2360312242150427e-06, "learning_rate": 0.002160853553798246, "loss": 2.5193, "step": 19264 }, { "crossentropy": 2.4414050579071045, "epoch": 0.6984121229698376, "grad_norm": 0.027282224968075752, "grad_norm_var": 1.1877721016914269e-06, "learning_rate": 0.002160375252741806, "loss": 2.4917, "step": 19265 }, { "crossentropy": 2.592078447341919, "epoch": 0.6984483758700696, "grad_norm": 0.026074562221765518, "grad_norm_var": 1.0973933107560895e-06, "learning_rate": 0.0021598969900388097, "loss": 2.6056, "step": 19266 }, { "crossentropy": 2.4899277687072754, "epoch": 0.6984846287703016, "grad_norm": 0.02711806632578373, "grad_norm_var": 1.0050542975229467e-06, "learning_rate": 0.002159418765695715, "loss": 2.5025, "step": 19267 }, { "crossentropy": 2.529451847076416, "epoch": 0.6985208816705336, "grad_norm": 0.026380866765975952, "grad_norm_var": 1.0147685484526337e-06, "learning_rate": 0.0021589405797189784, "loss": 2.4954, "step": 19268 }, { "crossentropy": 2.458548069000244, "epoch": 0.6985571345707656, "grad_norm": 0.026457861065864563, "grad_norm_var": 9.08297267660169e-07, "learning_rate": 0.0021584624321150624, "loss": 2.5834, "step": 19269 }, { "crossentropy": 2.514242172241211, "epoch": 0.6985933874709976, "grad_norm": 0.026367204263806343, "grad_norm_var": 8.836103279058107e-07, "learning_rate": 0.002157984322890422, "loss": 2.4018, "step": 19270 }, { "crossentropy": 2.282318592071533, "epoch": 0.6986296403712297, "grad_norm": 0.02731069177389145, "grad_norm_var": 8.003559525460024e-07, "learning_rate": 0.0021575062520515155, "loss": 2.4067, "step": 19271 }, { "crossentropy": 2.4566383361816406, "epoch": 0.6986658932714617, "grad_norm": 0.026363050565123558, "grad_norm_var": 7.461326679306564e-07, "learning_rate": 0.002157028219604803, "loss": 2.5177, "step": 19272 }, { "crossentropy": 2.585024118423462, "epoch": 0.6987021461716937, "grad_norm": 0.027582399547100067, "grad_norm_var": 7.738712682142656e-07, "learning_rate": 0.0021565502255567366, "loss": 2.5801, "step": 19273 }, { "crossentropy": 2.468374013900757, "epoch": 0.6987383990719258, "grad_norm": 0.026692358776926994, "grad_norm_var": 7.408245053908136e-07, "learning_rate": 0.002156072269913777, "loss": 2.483, "step": 19274 }, { "crossentropy": 2.581974744796753, "epoch": 0.6987746519721578, "grad_norm": 0.02530813030898571, "grad_norm_var": 8.881333478675165e-07, "learning_rate": 0.0021555943526823745, "loss": 2.5351, "step": 19275 }, { "crossentropy": 2.390148401260376, "epoch": 0.6988109048723898, "grad_norm": 0.02618706040084362, "grad_norm_var": 3.201187636051502e-07, "learning_rate": 0.0021551164738689895, "loss": 2.4725, "step": 19276 }, { "crossentropy": 2.5300140380859375, "epoch": 0.6988471577726219, "grad_norm": 0.026502950116991997, "grad_norm_var": 3.17061982062448e-07, "learning_rate": 0.0021546386334800735, "loss": 2.5452, "step": 19277 }, { "crossentropy": 2.602437973022461, "epoch": 0.6988834106728539, "grad_norm": 0.02614026889204979, "grad_norm_var": 3.3096897621894457e-07, "learning_rate": 0.0021541608315220786, "loss": 2.5095, "step": 19278 }, { "crossentropy": 2.62310791015625, "epoch": 0.6989196635730859, "grad_norm": 0.02712123468518257, "grad_norm_var": 3.4796368472232023e-07, "learning_rate": 0.0021536830680014604, "loss": 2.642, "step": 19279 }, { "crossentropy": 2.474555253982544, "epoch": 0.6989559164733179, "grad_norm": 0.028241513296961784, "grad_norm_var": 5.031513430971454e-07, "learning_rate": 0.002153205342924674, "loss": 2.4593, "step": 19280 }, { "crossentropy": 2.506427764892578, "epoch": 0.6989921693735499, "grad_norm": 0.02595880813896656, "grad_norm_var": 5.091119099795745e-07, "learning_rate": 0.0021527276562981674, "loss": 2.4865, "step": 19281 }, { "crossentropy": 2.592843770980835, "epoch": 0.6990284222737819, "grad_norm": 0.026632923632860184, "grad_norm_var": 4.885161862482375e-07, "learning_rate": 0.002152250008128397, "loss": 2.6253, "step": 19282 }, { "crossentropy": 2.335993528366089, "epoch": 0.6990646751740139, "grad_norm": 0.02723221853375435, "grad_norm_var": 4.964876387067903e-07, "learning_rate": 0.0021517723984218096, "loss": 2.4192, "step": 19283 }, { "crossentropy": 2.4727752208709717, "epoch": 0.699100928074246, "grad_norm": 0.02863679826259613, "grad_norm_var": 7.321161814767551e-07, "learning_rate": 0.0021512948271848603, "loss": 2.4859, "step": 19284 }, { "crossentropy": 2.581507682800293, "epoch": 0.699137180974478, "grad_norm": 0.026474691927433014, "grad_norm_var": 7.313751377524546e-07, "learning_rate": 0.0021508172944239975, "loss": 2.5384, "step": 19285 }, { "crossentropy": 2.5397679805755615, "epoch": 0.69917343387471, "grad_norm": 0.026110561564564705, "grad_norm_var": 7.501995699460417e-07, "learning_rate": 0.0021503398001456687, "loss": 2.5271, "step": 19286 }, { "crossentropy": 2.5284829139709473, "epoch": 0.699209686774942, "grad_norm": 0.026202991604804993, "grad_norm_var": 7.486519587482828e-07, "learning_rate": 0.002149862344356326, "loss": 2.5499, "step": 19287 }, { "crossentropy": 2.547362804412842, "epoch": 0.699245939675174, "grad_norm": 0.026338985189795494, "grad_norm_var": 7.498070248743623e-07, "learning_rate": 0.002149384927062419, "loss": 2.4913, "step": 19288 }, { "crossentropy": 2.4828414916992188, "epoch": 0.699282192575406, "grad_norm": 0.02645396627485752, "grad_norm_var": 6.981694638415742e-07, "learning_rate": 0.002148907548270393, "loss": 2.4541, "step": 19289 }, { "crossentropy": 2.5009191036224365, "epoch": 0.699318445475638, "grad_norm": 0.026205480098724365, "grad_norm_var": 7.095677451292188e-07, "learning_rate": 0.0021484302079866984, "loss": 2.4786, "step": 19290 }, { "crossentropy": 2.537971019744873, "epoch": 0.69935469837587, "grad_norm": 0.02616475708782673, "grad_norm_var": 6.068168111041568e-07, "learning_rate": 0.00214795290621778, "loss": 2.5576, "step": 19291 }, { "crossentropy": 2.4931931495666504, "epoch": 0.6993909512761021, "grad_norm": 0.027461716905236244, "grad_norm_var": 6.275053411274855e-07, "learning_rate": 0.002147475642970086, "loss": 2.5495, "step": 19292 }, { "crossentropy": 2.4943366050720215, "epoch": 0.6994272041763341, "grad_norm": 0.02685610204935074, "grad_norm_var": 6.240208356020898e-07, "learning_rate": 0.002146998418250065, "loss": 2.4642, "step": 19293 }, { "crossentropy": 2.357591152191162, "epoch": 0.6994634570765661, "grad_norm": 0.027313582599163055, "grad_norm_var": 6.123965911477055e-07, "learning_rate": 0.002146521232064157, "loss": 2.4533, "step": 19294 }, { "crossentropy": 2.55554461479187, "epoch": 0.6994997099767981, "grad_norm": 0.026047520339488983, "grad_norm_var": 6.438871519135464e-07, "learning_rate": 0.00214604408441881, "loss": 2.3993, "step": 19295 }, { "crossentropy": 2.420729160308838, "epoch": 0.6995359628770301, "grad_norm": 0.026569223031401634, "grad_norm_var": 4.907413421704886e-07, "learning_rate": 0.0021455669753204704, "loss": 2.4422, "step": 19296 }, { "crossentropy": 2.5232093334198, "epoch": 0.6995722157772621, "grad_norm": 0.026956014335155487, "grad_norm_var": 4.588278308375581e-07, "learning_rate": 0.002145089904775579, "loss": 2.5362, "step": 19297 }, { "crossentropy": 2.570049524307251, "epoch": 0.6996084686774942, "grad_norm": 0.027481425553560257, "grad_norm_var": 4.930013099960565e-07, "learning_rate": 0.002144612872790582, "loss": 2.5737, "step": 19298 }, { "crossentropy": 2.506462812423706, "epoch": 0.6996447215777262, "grad_norm": 0.026048781350255013, "grad_norm_var": 5.094344959833171e-07, "learning_rate": 0.0021441358793719205, "loss": 2.5737, "step": 19299 }, { "crossentropy": 2.4827160835266113, "epoch": 0.6996809744779582, "grad_norm": 0.0262753888964653, "grad_norm_var": 2.5055311276358087e-07, "learning_rate": 0.0021436589245260374, "loss": 2.4707, "step": 19300 }, { "crossentropy": 2.5714290142059326, "epoch": 0.6997172273781903, "grad_norm": 0.02815178595483303, "grad_norm_var": 4.0725081725326295e-07, "learning_rate": 0.002143182008259379, "loss": 2.5576, "step": 19301 }, { "crossentropy": 2.5381925106048584, "epoch": 0.6997534802784223, "grad_norm": 0.027707062661647797, "grad_norm_var": 4.48553104548605e-07, "learning_rate": 0.0021427051305783792, "loss": 2.5855, "step": 19302 }, { "crossentropy": 2.3769752979278564, "epoch": 0.6997897331786543, "grad_norm": 0.02552502416074276, "grad_norm_var": 5.280542454974904e-07, "learning_rate": 0.0021422282914894827, "loss": 2.4743, "step": 19303 }, { "crossentropy": 2.3855690956115723, "epoch": 0.6998259860788864, "grad_norm": 0.02600766532123089, "grad_norm_var": 5.518484028169471e-07, "learning_rate": 0.0021417514909991314, "loss": 2.4325, "step": 19304 }, { "crossentropy": 2.5197484493255615, "epoch": 0.6998622389791184, "grad_norm": 0.026527414098381996, "grad_norm_var": 5.497605397636677e-07, "learning_rate": 0.0021412747291137623, "loss": 2.5417, "step": 19305 }, { "crossentropy": 2.6055092811584473, "epoch": 0.6998984918793504, "grad_norm": 0.02658837102353573, "grad_norm_var": 5.333613823088005e-07, "learning_rate": 0.0021407980058398184, "loss": 2.5148, "step": 19306 }, { "crossentropy": 2.5780200958251953, "epoch": 0.6999347447795824, "grad_norm": 0.02537471428513527, "grad_norm_var": 6.319260886228709e-07, "learning_rate": 0.0021403213211837343, "loss": 2.5799, "step": 19307 }, { "crossentropy": 2.5517892837524414, "epoch": 0.6999709976798144, "grad_norm": 0.026027927175164223, "grad_norm_var": 6.111091949233145e-07, "learning_rate": 0.00213984467515195, "loss": 2.5789, "step": 19308 }, { "crossentropy": 2.5059547424316406, "epoch": 0.7000072505800464, "grad_norm": 0.02724486030638218, "grad_norm_var": 6.342899325857311e-07, "learning_rate": 0.002139368067750908, "loss": 2.5652, "step": 19309 }, { "crossentropy": 2.481255292892456, "epoch": 0.7000435034802784, "grad_norm": 0.026575231924653053, "grad_norm_var": 5.996309493663347e-07, "learning_rate": 0.002138891498987038, "loss": 2.513, "step": 19310 }, { "crossentropy": 2.5108957290649414, "epoch": 0.7000797563805105, "grad_norm": 0.026099229231476784, "grad_norm_var": 5.9620081058694e-07, "learning_rate": 0.0021384149688667797, "loss": 2.5402, "step": 19311 }, { "crossentropy": 2.4395673274993896, "epoch": 0.7001160092807425, "grad_norm": 0.025411013513803482, "grad_norm_var": 6.805485980488265e-07, "learning_rate": 0.002137938477396572, "loss": 2.3778, "step": 19312 }, { "crossentropy": 2.465845823287964, "epoch": 0.7001522621809745, "grad_norm": 0.02655099704861641, "grad_norm_var": 6.661816563700735e-07, "learning_rate": 0.0021374620245828463, "loss": 2.4769, "step": 19313 }, { "crossentropy": 2.539586067199707, "epoch": 0.7001885150812065, "grad_norm": 0.026284342631697655, "grad_norm_var": 5.950769814433057e-07, "learning_rate": 0.0021369856104320423, "loss": 2.4586, "step": 19314 }, { "crossentropy": 2.6017017364501953, "epoch": 0.7002247679814385, "grad_norm": 0.027021357789635658, "grad_norm_var": 6.086526517047463e-07, "learning_rate": 0.0021365092349505898, "loss": 2.6344, "step": 19315 }, { "crossentropy": 2.5059738159179688, "epoch": 0.7002610208816705, "grad_norm": 0.0271115992218256, "grad_norm_var": 6.316861610039526e-07, "learning_rate": 0.002136032898144927, "loss": 2.5355, "step": 19316 }, { "crossentropy": 2.551743268966675, "epoch": 0.7002972737819025, "grad_norm": 0.026960035786032677, "grad_norm_var": 4.600557374390135e-07, "learning_rate": 0.0021355566000214867, "loss": 2.5465, "step": 19317 }, { "crossentropy": 2.572174310684204, "epoch": 0.7003335266821346, "grad_norm": 0.026546187698841095, "grad_norm_var": 3.4793848978179977e-07, "learning_rate": 0.0021350803405867013, "loss": 2.4773, "step": 19318 }, { "crossentropy": 2.403348684310913, "epoch": 0.7003697795823666, "grad_norm": 0.025915702804923058, "grad_norm_var": 3.1367110608822766e-07, "learning_rate": 0.002134604119847003, "loss": 2.4711, "step": 19319 }, { "crossentropy": 2.5832998752593994, "epoch": 0.7004060324825986, "grad_norm": 0.025806229561567307, "grad_norm_var": 3.264870751117406e-07, "learning_rate": 0.0021341279378088254, "loss": 2.5283, "step": 19320 }, { "crossentropy": 2.529151439666748, "epoch": 0.7004422853828306, "grad_norm": 0.026097945868968964, "grad_norm_var": 3.294489752220589e-07, "learning_rate": 0.0021336517944785974, "loss": 2.513, "step": 19321 }, { "crossentropy": 2.513298273086548, "epoch": 0.7004785382830626, "grad_norm": 0.026136059314012527, "grad_norm_var": 3.2791920921529325e-07, "learning_rate": 0.0021331756898627535, "loss": 2.4563, "step": 19322 }, { "crossentropy": 2.5090787410736084, "epoch": 0.7005147911832946, "grad_norm": 0.0255951639264822, "grad_norm_var": 3.030917420334655e-07, "learning_rate": 0.002132699623967721, "loss": 2.5533, "step": 19323 }, { "crossentropy": 2.4394054412841797, "epoch": 0.7005510440835266, "grad_norm": 0.02621917612850666, "grad_norm_var": 2.9750937338372114e-07, "learning_rate": 0.002132223596799931, "loss": 2.4776, "step": 19324 }, { "crossentropy": 2.5539944171905518, "epoch": 0.7005872969837587, "grad_norm": 0.028355365619063377, "grad_norm_var": 5.073154947646264e-07, "learning_rate": 0.0021317476083658154, "loss": 2.5591, "step": 19325 }, { "crossentropy": 2.5188169479370117, "epoch": 0.7006235498839907, "grad_norm": 0.02615473046898842, "grad_norm_var": 5.095430598526697e-07, "learning_rate": 0.002131271658671801, "loss": 2.4504, "step": 19326 }, { "crossentropy": 2.5825862884521484, "epoch": 0.7006598027842227, "grad_norm": 0.025811931118369102, "grad_norm_var": 5.259003893165095e-07, "learning_rate": 0.0021307957477243143, "loss": 2.4573, "step": 19327 }, { "crossentropy": 2.4225549697875977, "epoch": 0.7006960556844548, "grad_norm": 0.025474773719906807, "grad_norm_var": 5.17971052801415e-07, "learning_rate": 0.002130319875529787, "loss": 2.4606, "step": 19328 }, { "crossentropy": 2.741286516189575, "epoch": 0.7007323085846868, "grad_norm": 0.0268790852278471, "grad_norm_var": 5.322839417650719e-07, "learning_rate": 0.002129844042094643, "loss": 2.6537, "step": 19329 }, { "crossentropy": 2.5643205642700195, "epoch": 0.7007685614849188, "grad_norm": 0.026778018102049828, "grad_norm_var": 5.400279048862888e-07, "learning_rate": 0.002129368247425313, "loss": 2.5721, "step": 19330 }, { "crossentropy": 2.4141154289245605, "epoch": 0.7008048143851509, "grad_norm": 0.027139276266098022, "grad_norm_var": 5.502109031551548e-07, "learning_rate": 0.0021288924915282186, "loss": 2.4872, "step": 19331 }, { "crossentropy": 2.45296311378479, "epoch": 0.7008410672853829, "grad_norm": 0.029139991849660873, "grad_norm_var": 9.899875780382811e-07, "learning_rate": 0.0021284167744097886, "loss": 2.5836, "step": 19332 }, { "crossentropy": 2.4937663078308105, "epoch": 0.7008773201856149, "grad_norm": 0.02648879401385784, "grad_norm_var": 9.789268039121634e-07, "learning_rate": 0.00212794109607645, "loss": 2.5157, "step": 19333 }, { "crossentropy": 2.5200705528259277, "epoch": 0.7009135730858469, "grad_norm": 0.0263636764138937, "grad_norm_var": 9.807036473097678e-07, "learning_rate": 0.002127465456534623, "loss": 2.5551, "step": 19334 }, { "crossentropy": 2.463947057723999, "epoch": 0.7009498259860789, "grad_norm": 0.02782789058983326, "grad_norm_var": 1.0545895134719356e-06, "learning_rate": 0.002126989855790736, "loss": 2.4842, "step": 19335 }, { "crossentropy": 2.5987374782562256, "epoch": 0.7009860788863109, "grad_norm": 0.02834976091980934, "grad_norm_var": 1.1755778811855152e-06, "learning_rate": 0.0021265142938512115, "loss": 2.5621, "step": 19336 }, { "crossentropy": 2.5493407249450684, "epoch": 0.7010223317865429, "grad_norm": 0.02839634381234646, "grad_norm_var": 1.2903728782672248e-06, "learning_rate": 0.00212603877072247, "loss": 2.6009, "step": 19337 }, { "crossentropy": 2.5963428020477295, "epoch": 0.701058584686775, "grad_norm": 0.026455575600266457, "grad_norm_var": 1.262317441035922e-06, "learning_rate": 0.002125563286410938, "loss": 2.4975, "step": 19338 }, { "crossentropy": 2.2863054275512695, "epoch": 0.701094837587007, "grad_norm": 0.025657042860984802, "grad_norm_var": 1.2512602741476201e-06, "learning_rate": 0.0021250878409230347, "loss": 2.3982, "step": 19339 }, { "crossentropy": 2.5985002517700195, "epoch": 0.701131090487239, "grad_norm": 0.02734730765223503, "grad_norm_var": 1.2181343068039253e-06, "learning_rate": 0.0021246124342651825, "loss": 2.5807, "step": 19340 }, { "crossentropy": 2.3580820560455322, "epoch": 0.701167343387471, "grad_norm": 0.027077801525592804, "grad_norm_var": 1.095865514657299e-06, "learning_rate": 0.002124137066443805, "loss": 2.4874, "step": 19341 }, { "crossentropy": 2.488060474395752, "epoch": 0.701203596287703, "grad_norm": 0.026670677587389946, "grad_norm_var": 1.057183629552388e-06, "learning_rate": 0.002123661737465318, "loss": 2.513, "step": 19342 }, { "crossentropy": 2.3570687770843506, "epoch": 0.701239849187935, "grad_norm": 0.026644881814718246, "grad_norm_var": 9.695855425645062e-07, "learning_rate": 0.002123186447336147, "loss": 2.4046, "step": 19343 }, { "crossentropy": 2.49299693107605, "epoch": 0.701276102088167, "grad_norm": 0.02608644589781761, "grad_norm_var": 8.650559813365317e-07, "learning_rate": 0.0021227111960627084, "loss": 2.4583, "step": 19344 }, { "crossentropy": 2.6936230659484863, "epoch": 0.7013123549883991, "grad_norm": 0.02744109556078911, "grad_norm_var": 8.696357614273864e-07, "learning_rate": 0.00212223598365142, "loss": 2.6545, "step": 19345 }, { "crossentropy": 2.3351235389709473, "epoch": 0.7013486078886311, "grad_norm": 0.02572019211947918, "grad_norm_var": 9.873187759645013e-07, "learning_rate": 0.002121760810108703, "loss": 2.4751, "step": 19346 }, { "crossentropy": 2.513760805130005, "epoch": 0.7013848607888631, "grad_norm": 0.02617202326655388, "grad_norm_var": 1.0343331697592322e-06, "learning_rate": 0.0021212856754409728, "loss": 2.5554, "step": 19347 }, { "crossentropy": 2.436452865600586, "epoch": 0.7014211136890951, "grad_norm": 0.02711634710431099, "grad_norm_var": 7.101615968657978e-07, "learning_rate": 0.002120810579654648, "loss": 2.465, "step": 19348 }, { "crossentropy": 2.563758611679077, "epoch": 0.7014573665893271, "grad_norm": 0.026387060061097145, "grad_norm_var": 7.158910472129422e-07, "learning_rate": 0.002120335522756148, "loss": 2.5728, "step": 19349 }, { "crossentropy": 2.6746573448181152, "epoch": 0.7014936194895591, "grad_norm": 0.027672840282320976, "grad_norm_var": 7.368750768112538e-07, "learning_rate": 0.002119860504751884, "loss": 2.6407, "step": 19350 }, { "crossentropy": 2.527501106262207, "epoch": 0.7015298723897911, "grad_norm": 0.02561187371611595, "grad_norm_var": 7.811430381929676e-07, "learning_rate": 0.0021193855256482767, "loss": 2.5384, "step": 19351 }, { "crossentropy": 2.538332462310791, "epoch": 0.7015661252900232, "grad_norm": 0.027806585654616356, "grad_norm_var": 6.8737699863799e-07, "learning_rate": 0.00211891058545174, "loss": 2.5198, "step": 19352 }, { "crossentropy": 2.475283622741699, "epoch": 0.7016023781902552, "grad_norm": 0.027970941737294197, "grad_norm_var": 6.062425745711654e-07, "learning_rate": 0.002118435684168686, "loss": 2.4954, "step": 19353 }, { "crossentropy": 2.5484907627105713, "epoch": 0.7016386310904872, "grad_norm": 0.026446161791682243, "grad_norm_var": 6.066050129797333e-07, "learning_rate": 0.0021179608218055324, "loss": 2.5593, "step": 19354 }, { "crossentropy": 2.56499981880188, "epoch": 0.7016748839907193, "grad_norm": 0.026717405766248703, "grad_norm_var": 5.238625002303241e-07, "learning_rate": 0.00211748599836869, "loss": 2.5519, "step": 19355 }, { "crossentropy": 2.3421308994293213, "epoch": 0.7017111368909513, "grad_norm": 0.026741640642285347, "grad_norm_var": 5.030438156176468e-07, "learning_rate": 0.0021170112138645734, "loss": 2.4566, "step": 19356 }, { "crossentropy": 2.462327480316162, "epoch": 0.7017473897911833, "grad_norm": 0.026228822767734528, "grad_norm_var": 5.129944890732337e-07, "learning_rate": 0.002116536468299598, "loss": 2.567, "step": 19357 }, { "crossentropy": 2.5225114822387695, "epoch": 0.7017836426914154, "grad_norm": 0.026733336970210075, "grad_norm_var": 5.128721941444203e-07, "learning_rate": 0.002116061761680171, "loss": 2.4841, "step": 19358 }, { "crossentropy": 2.459683895111084, "epoch": 0.7018198955916474, "grad_norm": 0.026688599959015846, "grad_norm_var": 5.125619191750311e-07, "learning_rate": 0.002115587094012708, "loss": 2.4171, "step": 19359 }, { "crossentropy": 2.578737497329712, "epoch": 0.7018561484918794, "grad_norm": 0.026827162131667137, "grad_norm_var": 4.841500988295629e-07, "learning_rate": 0.002115112465303619, "loss": 2.5556, "step": 19360 }, { "crossentropy": 2.4123339653015137, "epoch": 0.7018924013921114, "grad_norm": 0.026630796492099762, "grad_norm_var": 4.5242556392510507e-07, "learning_rate": 0.002114637875559312, "loss": 2.488, "step": 19361 }, { "crossentropy": 2.4611053466796875, "epoch": 0.7019286542923434, "grad_norm": 0.027019744738936424, "grad_norm_var": 3.8525957400525657e-07, "learning_rate": 0.0021141633247862015, "loss": 2.554, "step": 19362 }, { "crossentropy": 2.386653423309326, "epoch": 0.7019649071925754, "grad_norm": 0.02596163935959339, "grad_norm_var": 4.055911626434412e-07, "learning_rate": 0.002113688812990693, "loss": 2.4186, "step": 19363 }, { "crossentropy": 2.522927761077881, "epoch": 0.7020011600928074, "grad_norm": 0.026654066517949104, "grad_norm_var": 3.9852793867108567e-07, "learning_rate": 0.0021132143401791964, "loss": 2.5669, "step": 19364 }, { "crossentropy": 2.5604453086853027, "epoch": 0.7020374129930395, "grad_norm": 0.02635863982141018, "grad_norm_var": 3.9997710313121926e-07, "learning_rate": 0.0021127399063581233, "loss": 2.569, "step": 19365 }, { "crossentropy": 2.4288735389709473, "epoch": 0.7020736658932715, "grad_norm": 0.026750465855002403, "grad_norm_var": 3.401966485918594e-07, "learning_rate": 0.002112265511533877, "loss": 2.4653, "step": 19366 }, { "crossentropy": 2.313962459564209, "epoch": 0.7021099187935035, "grad_norm": 0.02657666616141796, "grad_norm_var": 2.5881671838928696e-07, "learning_rate": 0.00211179115571287, "loss": 2.355, "step": 19367 }, { "crossentropy": 2.5620758533477783, "epoch": 0.7021461716937355, "grad_norm": 0.02568736858665943, "grad_norm_var": 2.4294791331016567e-07, "learning_rate": 0.002111316838901506, "loss": 2.4871, "step": 19368 }, { "crossentropy": 2.5595412254333496, "epoch": 0.7021824245939675, "grad_norm": 0.026284387335181236, "grad_norm_var": 1.1796790076513443e-07, "learning_rate": 0.00211084256110619, "loss": 2.4637, "step": 19369 }, { "crossentropy": 2.651973247528076, "epoch": 0.7022186774941995, "grad_norm": 0.027142588049173355, "grad_norm_var": 1.415006123582525e-07, "learning_rate": 0.0021103683223333315, "loss": 2.5157, "step": 19370 }, { "crossentropy": 2.4802637100219727, "epoch": 0.7022549303944315, "grad_norm": 0.026418067514896393, "grad_norm_var": 1.4092657084421866e-07, "learning_rate": 0.002109894122589332, "loss": 2.4981, "step": 19371 }, { "crossentropy": 2.524740219116211, "epoch": 0.7022911832946636, "grad_norm": 0.02654411457479, "grad_norm_var": 1.381598695249578e-07, "learning_rate": 0.002109419961880598, "loss": 2.5404, "step": 19372 }, { "crossentropy": 2.5212059020996094, "epoch": 0.7023274361948956, "grad_norm": 0.025543829426169395, "grad_norm_var": 1.951441951157275e-07, "learning_rate": 0.0021089458402135364, "loss": 2.5351, "step": 19373 }, { "crossentropy": 2.4963557720184326, "epoch": 0.7023636890951276, "grad_norm": 0.026669513434171677, "grad_norm_var": 1.9331818167262853e-07, "learning_rate": 0.0021084717575945468, "loss": 2.574, "step": 19374 }, { "crossentropy": 2.4208035469055176, "epoch": 0.7023999419953596, "grad_norm": 0.02738739177584648, "grad_norm_var": 2.4282110649615806e-07, "learning_rate": 0.0021079977140300355, "loss": 2.4982, "step": 19375 }, { "crossentropy": 2.416302442550659, "epoch": 0.7024361948955916, "grad_norm": 0.026929106563329697, "grad_norm_var": 2.475298650582899e-07, "learning_rate": 0.002107523709526402, "loss": 2.4841, "step": 19376 }, { "crossentropy": 2.500309705734253, "epoch": 0.7024724477958236, "grad_norm": 0.026882831007242203, "grad_norm_var": 2.547225447818135e-07, "learning_rate": 0.0021070497440900528, "loss": 2.4976, "step": 19377 }, { "crossentropy": 2.559556722640991, "epoch": 0.7025087006960556, "grad_norm": 0.026529431343078613, "grad_norm_var": 2.390809564944135e-07, "learning_rate": 0.0021065758177273865, "loss": 2.4886, "step": 19378 }, { "crossentropy": 2.4871575832366943, "epoch": 0.7025449535962877, "grad_norm": 0.026530984789133072, "grad_norm_var": 2.169534087619977e-07, "learning_rate": 0.002106101930444803, "loss": 2.5029, "step": 19379 }, { "crossentropy": 2.4196407794952393, "epoch": 0.7025812064965197, "grad_norm": 0.026755232363939285, "grad_norm_var": 2.189213840663445e-07, "learning_rate": 0.0021056280822487044, "loss": 2.5268, "step": 19380 }, { "crossentropy": 2.512596368789673, "epoch": 0.7026174593967517, "grad_norm": 0.027450380846858025, "grad_norm_var": 2.638253859895175e-07, "learning_rate": 0.0021051542731454933, "loss": 2.5577, "step": 19381 }, { "crossentropy": 2.5088930130004883, "epoch": 0.7026537122969838, "grad_norm": 0.027518371120095253, "grad_norm_var": 3.1299937415945185e-07, "learning_rate": 0.0021046805031415643, "loss": 2.5986, "step": 19382 }, { "crossentropy": 2.6642348766326904, "epoch": 0.7026899651972158, "grad_norm": 0.02767595835030079, "grad_norm_var": 3.736536003295556e-07, "learning_rate": 0.0021042067722433213, "loss": 2.5968, "step": 19383 }, { "crossentropy": 2.625701665878296, "epoch": 0.7027262180974478, "grad_norm": 0.027474749833345413, "grad_norm_var": 3.2083201091052255e-07, "learning_rate": 0.002103733080457158, "loss": 2.564, "step": 19384 }, { "crossentropy": 2.4867610931396484, "epoch": 0.7027624709976799, "grad_norm": 0.02842075750231743, "grad_norm_var": 4.4253452634912944e-07, "learning_rate": 0.0021032594277894767, "loss": 2.5515, "step": 19385 }, { "crossentropy": 2.349440097808838, "epoch": 0.7027987238979119, "grad_norm": 0.026700157672166824, "grad_norm_var": 4.458900935877848e-07, "learning_rate": 0.0021027858142466734, "loss": 2.3762, "step": 19386 }, { "crossentropy": 2.449235677719116, "epoch": 0.7028349767981439, "grad_norm": 0.02544916234910488, "grad_norm_var": 5.751467792196781e-07, "learning_rate": 0.002102312239835142, "loss": 2.4768, "step": 19387 }, { "crossentropy": 2.468266725540161, "epoch": 0.7028712296983759, "grad_norm": 0.026290737092494965, "grad_norm_var": 5.913132568274252e-07, "learning_rate": 0.002101838704561281, "loss": 2.4401, "step": 19388 }, { "crossentropy": 2.42218017578125, "epoch": 0.7029074825986079, "grad_norm": 0.025979572907090187, "grad_norm_var": 5.250829764113224e-07, "learning_rate": 0.0021013652084314882, "loss": 2.4931, "step": 19389 }, { "crossentropy": 2.4351699352264404, "epoch": 0.7029437354988399, "grad_norm": 0.025389306247234344, "grad_norm_var": 6.694655740838018e-07, "learning_rate": 0.002100891751452156, "loss": 2.478, "step": 19390 }, { "crossentropy": 2.589313507080078, "epoch": 0.7029799883990719, "grad_norm": 0.025815976783633232, "grad_norm_var": 7.081155183181915e-07, "learning_rate": 0.0021004183336296813, "loss": 2.5692, "step": 19391 }, { "crossentropy": 2.502563953399658, "epoch": 0.703016241299304, "grad_norm": 0.026019319891929626, "grad_norm_var": 7.365494738354368e-07, "learning_rate": 0.002099944954970456, "loss": 2.3954, "step": 19392 }, { "crossentropy": 2.396822929382324, "epoch": 0.703052494199536, "grad_norm": 0.026195241138339043, "grad_norm_var": 7.475197295422753e-07, "learning_rate": 0.0020994716154808762, "loss": 2.4962, "step": 19393 }, { "crossentropy": 2.4349522590637207, "epoch": 0.703088747099768, "grad_norm": 0.027142321690917015, "grad_norm_var": 7.621894585277863e-07, "learning_rate": 0.002098998315167334, "loss": 2.4878, "step": 19394 }, { "crossentropy": 2.4142470359802246, "epoch": 0.703125, "grad_norm": 0.025963295251131058, "grad_norm_var": 7.932711489991769e-07, "learning_rate": 0.002098525054036221, "loss": 2.5032, "step": 19395 }, { "crossentropy": 2.5267765522003174, "epoch": 0.703161252900232, "grad_norm": 0.025458896532654762, "grad_norm_var": 8.783900931656156e-07, "learning_rate": 0.0020980518320939294, "loss": 2.5811, "step": 19396 }, { "crossentropy": 2.5875141620635986, "epoch": 0.703197505800464, "grad_norm": 0.027247384190559387, "grad_norm_var": 8.568396071069924e-07, "learning_rate": 0.0020975786493468546, "loss": 2.5393, "step": 19397 }, { "crossentropy": 2.5133683681488037, "epoch": 0.703233758700696, "grad_norm": 0.02769049070775509, "grad_norm_var": 8.809989238508485e-07, "learning_rate": 0.0020971055058013823, "loss": 2.5227, "step": 19398 }, { "crossentropy": 2.539839029312134, "epoch": 0.7032700116009281, "grad_norm": 0.025318283587694168, "grad_norm_var": 8.766874293892098e-07, "learning_rate": 0.002096632401463907, "loss": 2.491, "step": 19399 }, { "crossentropy": 2.455551862716675, "epoch": 0.7033062645011601, "grad_norm": 0.02656189538538456, "grad_norm_var": 7.991409339680802e-07, "learning_rate": 0.0020961593363408157, "loss": 2.5502, "step": 19400 }, { "crossentropy": 2.5776848793029785, "epoch": 0.7033425174013921, "grad_norm": 0.027741024270653725, "grad_norm_var": 6.405856700290261e-07, "learning_rate": 0.002095686310438499, "loss": 2.5968, "step": 19401 }, { "crossentropy": 2.595527172088623, "epoch": 0.7033787703016241, "grad_norm": 0.027221590280532837, "grad_norm_var": 6.846910542412325e-07, "learning_rate": 0.0020952133237633497, "loss": 2.5424, "step": 19402 }, { "crossentropy": 2.3577945232391357, "epoch": 0.7034150232018561, "grad_norm": 0.02654212713241577, "grad_norm_var": 6.291259556402695e-07, "learning_rate": 0.0020947403763217493, "loss": 2.4683, "step": 19403 }, { "crossentropy": 2.6199774742126465, "epoch": 0.7034512761020881, "grad_norm": 0.02632034942507744, "grad_norm_var": 6.28705564872413e-07, "learning_rate": 0.0020942674681200886, "loss": 2.5303, "step": 19404 }, { "crossentropy": 2.5591979026794434, "epoch": 0.7034875290023201, "grad_norm": 0.027208874002099037, "grad_norm_var": 6.521222069658453e-07, "learning_rate": 0.0020937945991647577, "loss": 2.5116, "step": 19405 }, { "crossentropy": 2.504058361053467, "epoch": 0.7035237819025522, "grad_norm": 0.025721585378050804, "grad_norm_var": 6.102678206978586e-07, "learning_rate": 0.0020933217694621385, "loss": 2.494, "step": 19406 }, { "crossentropy": 2.569983959197998, "epoch": 0.7035600348027842, "grad_norm": 0.027443448081612587, "grad_norm_var": 6.250914751709889e-07, "learning_rate": 0.0020928489790186223, "loss": 2.5298, "step": 19407 }, { "crossentropy": 2.5918381214141846, "epoch": 0.7035962877030162, "grad_norm": 0.0267680361866951, "grad_norm_var": 6.009351642635111e-07, "learning_rate": 0.0020923762278405905, "loss": 2.5532, "step": 19408 }, { "crossentropy": 2.6298725605010986, "epoch": 0.7036325406032483, "grad_norm": 0.027302540838718414, "grad_norm_var": 6.090900825990452e-07, "learning_rate": 0.00209190351593443, "loss": 2.5714, "step": 19409 }, { "crossentropy": 2.472529888153076, "epoch": 0.7036687935034803, "grad_norm": 0.02594650350511074, "grad_norm_var": 6.324447334511707e-07, "learning_rate": 0.00209143084330653, "loss": 2.3941, "step": 19410 }, { "crossentropy": 2.4392199516296387, "epoch": 0.7037050464037123, "grad_norm": 0.027110083028674126, "grad_norm_var": 6.091009851508089e-07, "learning_rate": 0.0020909582099632655, "loss": 2.4623, "step": 19411 }, { "crossentropy": 2.479506731033325, "epoch": 0.7037412993039444, "grad_norm": 0.027310706675052643, "grad_norm_var": 5.107669045693412e-07, "learning_rate": 0.002090485615911026, "loss": 2.4142, "step": 19412 }, { "crossentropy": 2.489912748336792, "epoch": 0.7037775522041764, "grad_norm": 0.02671632543206215, "grad_norm_var": 4.99613415199697e-07, "learning_rate": 0.002090013061156195, "loss": 2.4835, "step": 19413 }, { "crossentropy": 2.4816532135009766, "epoch": 0.7038138051044084, "grad_norm": 0.02818468026816845, "grad_norm_var": 5.730434321985292e-07, "learning_rate": 0.0020895405457051525, "loss": 2.6017, "step": 19414 }, { "crossentropy": 2.6156060695648193, "epoch": 0.7038500580046404, "grad_norm": 0.027276618406176567, "grad_norm_var": 4.157564268033777e-07, "learning_rate": 0.0020890680695642826, "loss": 2.5747, "step": 19415 }, { "crossentropy": 2.517019510269165, "epoch": 0.7038863109048724, "grad_norm": 0.028389260172843933, "grad_norm_var": 5.272130970604763e-07, "learning_rate": 0.0020885956327399645, "loss": 2.513, "step": 19416 }, { "crossentropy": 2.34393572807312, "epoch": 0.7039225638051044, "grad_norm": 0.027646023780107498, "grad_norm_var": 5.19343785602153e-07, "learning_rate": 0.002088123235238581, "loss": 2.4655, "step": 19417 }, { "crossentropy": 2.685786247253418, "epoch": 0.7039588167053364, "grad_norm": 0.027592556551098824, "grad_norm_var": 5.354775381778698e-07, "learning_rate": 0.0020876508770665135, "loss": 2.61, "step": 19418 }, { "crossentropy": 2.561678409576416, "epoch": 0.7039950696055685, "grad_norm": 0.026658259332180023, "grad_norm_var": 5.277985933342204e-07, "learning_rate": 0.002087178558230141, "loss": 2.493, "step": 19419 }, { "crossentropy": 2.494621753692627, "epoch": 0.7040313225058005, "grad_norm": 0.02717454731464386, "grad_norm_var": 4.846347205049335e-07, "learning_rate": 0.0020867062787358398, "loss": 2.4916, "step": 19420 }, { "crossentropy": 2.412087917327881, "epoch": 0.7040675754060325, "grad_norm": 0.026499444618821144, "grad_norm_var": 5.108173073382788e-07, "learning_rate": 0.002086234038589994, "loss": 2.5178, "step": 19421 }, { "crossentropy": 2.6282575130462646, "epoch": 0.7041038283062645, "grad_norm": 0.02651294320821762, "grad_norm_var": 4.0358786107748806e-07, "learning_rate": 0.0020857618377989767, "loss": 2.5304, "step": 19422 }, { "crossentropy": 2.673848867416382, "epoch": 0.7041400812064965, "grad_norm": 0.02696136012673378, "grad_norm_var": 3.997812453966169e-07, "learning_rate": 0.002085289676369171, "loss": 2.6318, "step": 19423 }, { "crossentropy": 2.5604939460754395, "epoch": 0.7041763341067285, "grad_norm": 0.02707510255277157, "grad_norm_var": 3.909318160848074e-07, "learning_rate": 0.0020848175543069494, "loss": 2.529, "step": 19424 }, { "crossentropy": 2.4517016410827637, "epoch": 0.7042125870069605, "grad_norm": 0.025739429518580437, "grad_norm_var": 5.112866631392631e-07, "learning_rate": 0.00208434547161869, "loss": 2.4906, "step": 19425 }, { "crossentropy": 2.423781633377075, "epoch": 0.7042488399071926, "grad_norm": 0.025434710085392, "grad_norm_var": 6.029328243967729e-07, "learning_rate": 0.002083873428310772, "loss": 2.3849, "step": 19426 }, { "crossentropy": 2.4614148139953613, "epoch": 0.7042850928074246, "grad_norm": 0.02845652960240841, "grad_norm_var": 7.328382893058282e-07, "learning_rate": 0.002083401424389569, "loss": 2.5855, "step": 19427 }, { "crossentropy": 2.572798013687134, "epoch": 0.7043213457076566, "grad_norm": 0.026992257684469223, "grad_norm_var": 7.30305447603482e-07, "learning_rate": 0.0020829294598614535, "loss": 2.5817, "step": 19428 }, { "crossentropy": 2.541597843170166, "epoch": 0.7043575986078886, "grad_norm": 0.027350660413503647, "grad_norm_var": 7.245365442488099e-07, "learning_rate": 0.0020824575347328047, "loss": 2.5262, "step": 19429 }, { "crossentropy": 2.4647226333618164, "epoch": 0.7043938515081206, "grad_norm": 0.02571912296116352, "grad_norm_var": 7.549693115090854e-07, "learning_rate": 0.002081985649009992, "loss": 2.476, "step": 19430 }, { "crossentropy": 2.2818238735198975, "epoch": 0.7044301044083526, "grad_norm": 0.027162538841366768, "grad_norm_var": 7.510797008375743e-07, "learning_rate": 0.002081513802699394, "loss": 2.4054, "step": 19431 }, { "crossentropy": 2.3663227558135986, "epoch": 0.7044663573085846, "grad_norm": 0.027096068486571312, "grad_norm_var": 6.092114178233206e-07, "learning_rate": 0.0020810419958073785, "loss": 2.452, "step": 19432 }, { "crossentropy": 2.3871679306030273, "epoch": 0.7045026102088167, "grad_norm": 0.025983985513448715, "grad_norm_var": 6.11987874856353e-07, "learning_rate": 0.002080570228340321, "loss": 2.4637, "step": 19433 }, { "crossentropy": 2.479384183883667, "epoch": 0.7045388631090487, "grad_norm": 0.026638638228178024, "grad_norm_var": 5.649517475721782e-07, "learning_rate": 0.0020800985003045945, "loss": 2.577, "step": 19434 }, { "crossentropy": 2.618713617324829, "epoch": 0.7045751160092807, "grad_norm": 0.026477716863155365, "grad_norm_var": 5.683783197990653e-07, "learning_rate": 0.0020796268117065694, "loss": 2.5701, "step": 19435 }, { "crossentropy": 2.6385445594787598, "epoch": 0.7046113689095128, "grad_norm": 0.02720760740339756, "grad_norm_var": 5.705177625806371e-07, "learning_rate": 0.0020791551625526134, "loss": 2.5235, "step": 19436 }, { "crossentropy": 2.515214681625366, "epoch": 0.7046476218097448, "grad_norm": 0.026421478018164635, "grad_norm_var": 5.730528152272004e-07, "learning_rate": 0.0020786835528491016, "loss": 2.5557, "step": 19437 }, { "crossentropy": 2.362283706665039, "epoch": 0.7046838747099768, "grad_norm": 0.026940977200865746, "grad_norm_var": 5.737205367302751e-07, "learning_rate": 0.0020782119826024, "loss": 2.3959, "step": 19438 }, { "crossentropy": 2.560760736465454, "epoch": 0.7047201276102089, "grad_norm": 0.025444665923714638, "grad_norm_var": 6.704303685951703e-07, "learning_rate": 0.0020777404518188816, "loss": 2.5346, "step": 19439 }, { "crossentropy": 2.4332633018493652, "epoch": 0.7047563805104409, "grad_norm": 0.028600338846445084, "grad_norm_var": 9.055636395548094e-07, "learning_rate": 0.0020772689605049106, "loss": 2.5786, "step": 19440 }, { "crossentropy": 2.4919111728668213, "epoch": 0.7047926334106729, "grad_norm": 0.026989392936229706, "grad_norm_var": 8.382621961112886e-07, "learning_rate": 0.0020767975086668582, "loss": 2.4756, "step": 19441 }, { "crossentropy": 2.6766881942749023, "epoch": 0.7048288863109049, "grad_norm": 0.028188321739435196, "grad_norm_var": 8.082193696595334e-07, "learning_rate": 0.002076326096311093, "loss": 2.5228, "step": 19442 }, { "crossentropy": 2.333833932876587, "epoch": 0.7048651392111369, "grad_norm": 0.026395617052912712, "grad_norm_var": 6.677797058411243e-07, "learning_rate": 0.0020758547234439794, "loss": 2.443, "step": 19443 }, { "crossentropy": 2.424253463745117, "epoch": 0.7049013921113689, "grad_norm": 0.025892574340105057, "grad_norm_var": 7.225887507250154e-07, "learning_rate": 0.002075383390071887, "loss": 2.4755, "step": 19444 }, { "crossentropy": 2.5097572803497314, "epoch": 0.7049376450116009, "grad_norm": 0.02542297914624214, "grad_norm_var": 8.086396016580381e-07, "learning_rate": 0.0020749120962011798, "loss": 2.4597, "step": 19445 }, { "crossentropy": 2.4246420860290527, "epoch": 0.704973897911833, "grad_norm": 0.026705050840973854, "grad_norm_var": 7.455270960944616e-07, "learning_rate": 0.0020744408418382223, "loss": 2.4616, "step": 19446 }, { "crossentropy": 2.5090136528015137, "epoch": 0.705010150812065, "grad_norm": 0.02645079977810383, "grad_norm_var": 7.354759935182994e-07, "learning_rate": 0.0020739696269893824, "loss": 2.5427, "step": 19447 }, { "crossentropy": 2.5017213821411133, "epoch": 0.705046403712297, "grad_norm": 0.026699427515268326, "grad_norm_var": 7.232261463789667e-07, "learning_rate": 0.0020734984516610215, "loss": 2.5113, "step": 19448 }, { "crossentropy": 2.5121371746063232, "epoch": 0.705082656612529, "grad_norm": 0.026943475008010864, "grad_norm_var": 6.950840642613026e-07, "learning_rate": 0.0020730273158595055, "loss": 2.5283, "step": 19449 }, { "crossentropy": 2.61484694480896, "epoch": 0.705118909512761, "grad_norm": 0.02585778571665287, "grad_norm_var": 7.410062804439251e-07, "learning_rate": 0.0020725562195911982, "loss": 2.4994, "step": 19450 }, { "crossentropy": 2.4022397994995117, "epoch": 0.705155162412993, "grad_norm": 0.026550913229584694, "grad_norm_var": 7.395144373808221e-07, "learning_rate": 0.002072085162862461, "loss": 2.4769, "step": 19451 }, { "crossentropy": 2.5153069496154785, "epoch": 0.705191415313225, "grad_norm": 0.025796378031373024, "grad_norm_var": 7.627280613432443e-07, "learning_rate": 0.0020716141456796583, "loss": 2.5405, "step": 19452 }, { "crossentropy": 2.699824094772339, "epoch": 0.7052276682134571, "grad_norm": 0.026940032839775085, "grad_norm_var": 7.684867601169427e-07, "learning_rate": 0.0020711431680491507, "loss": 2.641, "step": 19453 }, { "crossentropy": 2.436631917953491, "epoch": 0.7052639211136891, "grad_norm": 0.026486260816454887, "grad_norm_var": 7.615654758568712e-07, "learning_rate": 0.002070672229977297, "loss": 2.508, "step": 19454 }, { "crossentropy": 2.5620226860046387, "epoch": 0.7053001740139211, "grad_norm": 0.026966849341988564, "grad_norm_var": 6.748900263169854e-07, "learning_rate": 0.002070201331470461, "loss": 2.5857, "step": 19455 }, { "crossentropy": 2.511749267578125, "epoch": 0.7053364269141531, "grad_norm": 0.026136115193367004, "grad_norm_var": 4.235895872504077e-07, "learning_rate": 0.0020697304725350034, "loss": 2.4822, "step": 19456 }, { "crossentropy": 2.428586721420288, "epoch": 0.7053726798143851, "grad_norm": 0.025446513667702675, "grad_norm_var": 4.77118254231162e-07, "learning_rate": 0.0020692596531772813, "loss": 2.4346, "step": 19457 }, { "crossentropy": 2.5046138763427734, "epoch": 0.7054089327146171, "grad_norm": 0.02628498338162899, "grad_norm_var": 2.5729828303227886e-07, "learning_rate": 0.0020687888734036565, "loss": 2.5324, "step": 19458 }, { "crossentropy": 2.5883748531341553, "epoch": 0.7054451856148491, "grad_norm": 0.027445882558822632, "grad_norm_var": 3.3809090442074067e-07, "learning_rate": 0.002068318133220484, "loss": 2.5818, "step": 19459 }, { "crossentropy": 2.5837817192077637, "epoch": 0.7054814385150812, "grad_norm": 0.02629769593477249, "grad_norm_var": 3.2220196794287224e-07, "learning_rate": 0.0020678474326341267, "loss": 2.6558, "step": 19460 }, { "crossentropy": 2.5657835006713867, "epoch": 0.7055176914153132, "grad_norm": 0.025981148704886436, "grad_norm_var": 2.688167476666906e-07, "learning_rate": 0.0020673767716509394, "loss": 2.5205, "step": 19461 }, { "crossentropy": 2.349118232727051, "epoch": 0.7055539443155452, "grad_norm": 0.026960384100675583, "grad_norm_var": 2.820227973680898e-07, "learning_rate": 0.0020669061502772775, "loss": 2.4238, "step": 19462 }, { "crossentropy": 2.574115753173828, "epoch": 0.7055901972157773, "grad_norm": 0.027881573885679245, "grad_norm_var": 4.0958771277410655e-07, "learning_rate": 0.0020664355685194986, "loss": 2.5522, "step": 19463 }, { "crossentropy": 2.541597843170166, "epoch": 0.7056264501160093, "grad_norm": 0.025750134140253067, "grad_norm_var": 4.460111502402331e-07, "learning_rate": 0.0020659650263839613, "loss": 2.4891, "step": 19464 }, { "crossentropy": 2.4612109661102295, "epoch": 0.7056627030162413, "grad_norm": 0.026321565732359886, "grad_norm_var": 4.3199148048720696e-07, "learning_rate": 0.0020654945238770175, "loss": 2.462, "step": 19465 }, { "crossentropy": 2.550799608230591, "epoch": 0.7056989559164734, "grad_norm": 0.026776010170578957, "grad_norm_var": 4.129156551283073e-07, "learning_rate": 0.0020650240610050248, "loss": 2.5799, "step": 19466 }, { "crossentropy": 2.4903903007507324, "epoch": 0.7057352088167054, "grad_norm": 0.02864060550928116, "grad_norm_var": 6.996364475759034e-07, "learning_rate": 0.002064553637774334, "loss": 2.5351, "step": 19467 }, { "crossentropy": 2.3448615074157715, "epoch": 0.7057714617169374, "grad_norm": 0.026479415595531464, "grad_norm_var": 6.526929618897673e-07, "learning_rate": 0.0020640832541913012, "loss": 2.4803, "step": 19468 }, { "crossentropy": 2.628347158432007, "epoch": 0.7058077146171694, "grad_norm": 0.02655600756406784, "grad_norm_var": 6.483241486005073e-07, "learning_rate": 0.0020636129102622838, "loss": 2.5417, "step": 19469 }, { "crossentropy": 2.4871232509613037, "epoch": 0.7058439675174014, "grad_norm": 0.026964766904711723, "grad_norm_var": 6.521435141243568e-07, "learning_rate": 0.002063142605993626, "loss": 2.5581, "step": 19470 }, { "crossentropy": 2.4972071647644043, "epoch": 0.7058802204176334, "grad_norm": 0.02624444290995598, "grad_norm_var": 6.571889881988476e-07, "learning_rate": 0.002062672341391684, "loss": 2.5064, "step": 19471 }, { "crossentropy": 2.541564702987671, "epoch": 0.7059164733178654, "grad_norm": 0.02693829871714115, "grad_norm_var": 6.439995774005219e-07, "learning_rate": 0.002062202116462811, "loss": 2.5376, "step": 19472 }, { "crossentropy": 2.559741497039795, "epoch": 0.7059527262180975, "grad_norm": 0.025616666302084923, "grad_norm_var": 6.176981403045589e-07, "learning_rate": 0.002061731931213356, "loss": 2.5164, "step": 19473 }, { "crossentropy": 2.6510727405548096, "epoch": 0.7059889791183295, "grad_norm": 0.026639802381396294, "grad_norm_var": 6.061112138981319e-07, "learning_rate": 0.002061261785649671, "loss": 2.5141, "step": 19474 }, { "crossentropy": 2.5659704208374023, "epoch": 0.7060252320185615, "grad_norm": 0.02635739929974079, "grad_norm_var": 5.745806169442254e-07, "learning_rate": 0.0020607916797781047, "loss": 2.5811, "step": 19475 }, { "crossentropy": 2.5240910053253174, "epoch": 0.7060614849187935, "grad_norm": 0.02665243297815323, "grad_norm_var": 5.657646473889236e-07, "learning_rate": 0.002060321613605006, "loss": 2.5093, "step": 19476 }, { "crossentropy": 2.5429861545562744, "epoch": 0.7060977378190255, "grad_norm": 0.025908973067998886, "grad_norm_var": 5.72743786617077e-07, "learning_rate": 0.002059851587136729, "loss": 2.5673, "step": 19477 }, { "crossentropy": 2.441826105117798, "epoch": 0.7061339907192575, "grad_norm": 0.026618676260113716, "grad_norm_var": 5.667215915714414e-07, "learning_rate": 0.0020593816003796153, "loss": 2.4011, "step": 19478 }, { "crossentropy": 2.6481120586395264, "epoch": 0.7061702436194895, "grad_norm": 0.027558637782931328, "grad_norm_var": 5.200670397693576e-07, "learning_rate": 0.002058911653340015, "loss": 2.6686, "step": 19479 }, { "crossentropy": 2.4263546466827393, "epoch": 0.7062064965197216, "grad_norm": 0.0265866257250309, "grad_norm_var": 4.660575512045363e-07, "learning_rate": 0.0020584417460242778, "loss": 2.4771, "step": 19480 }, { "crossentropy": 2.6448025703430176, "epoch": 0.7062427494199536, "grad_norm": 0.026859845966100693, "grad_norm_var": 4.58529821318702e-07, "learning_rate": 0.0020579718784387467, "loss": 2.5893, "step": 19481 }, { "crossentropy": 2.5476341247558594, "epoch": 0.7062790023201856, "grad_norm": 0.026849346235394478, "grad_norm_var": 4.5948782016076254e-07, "learning_rate": 0.0020575020505897725, "loss": 2.5827, "step": 19482 }, { "crossentropy": 2.476451873779297, "epoch": 0.7063152552204176, "grad_norm": 0.02844952791929245, "grad_norm_var": 4.1276192441370523e-07, "learning_rate": 0.002057032262483696, "loss": 2.5526, "step": 19483 }, { "crossentropy": 2.4864754676818848, "epoch": 0.7063515081206496, "grad_norm": 0.026566170156002045, "grad_norm_var": 4.106222989461635e-07, "learning_rate": 0.0020565625141268657, "loss": 2.5876, "step": 19484 }, { "crossentropy": 2.6373839378356934, "epoch": 0.7063877610208816, "grad_norm": 0.02749432809650898, "grad_norm_var": 4.4632465118209173e-07, "learning_rate": 0.002056092805525627, "loss": 2.6107, "step": 19485 }, { "crossentropy": 2.5279831886291504, "epoch": 0.7064240139211136, "grad_norm": 0.0267327930778265, "grad_norm_var": 4.436366040644433e-07, "learning_rate": 0.002055623136686322, "loss": 2.5207, "step": 19486 }, { "crossentropy": 2.3925065994262695, "epoch": 0.7064602668213457, "grad_norm": 0.0302865132689476, "grad_norm_var": 1.189824598497507e-06, "learning_rate": 0.002055153507615293, "loss": 2.413, "step": 19487 }, { "crossentropy": 2.6026229858398438, "epoch": 0.7064965197215777, "grad_norm": 0.026973089203238487, "grad_norm_var": 1.189580389836228e-06, "learning_rate": 0.0020546839183188874, "loss": 2.582, "step": 19488 }, { "crossentropy": 2.5921201705932617, "epoch": 0.7065327726218097, "grad_norm": 0.02760911174118519, "grad_norm_var": 1.0676954275819072e-06, "learning_rate": 0.002054214368803443, "loss": 2.5753, "step": 19489 }, { "crossentropy": 2.472928285598755, "epoch": 0.7065690255220418, "grad_norm": 0.026786180213093758, "grad_norm_var": 1.0593901917645462e-06, "learning_rate": 0.0020537448590753054, "loss": 2.4898, "step": 19490 }, { "crossentropy": 2.5612621307373047, "epoch": 0.7066052784222738, "grad_norm": 0.025608722120523453, "grad_norm_var": 1.1728543460144565e-06, "learning_rate": 0.0020532753891408136, "loss": 2.4668, "step": 19491 }, { "crossentropy": 2.493678331375122, "epoch": 0.7066415313225058, "grad_norm": 0.026540936902165413, "grad_norm_var": 1.1802300592706245e-06, "learning_rate": 0.002052805959006309, "loss": 2.4947, "step": 19492 }, { "crossentropy": 2.3674099445343018, "epoch": 0.7066777842227379, "grad_norm": 0.025772428140044212, "grad_norm_var": 1.202885133569596e-06, "learning_rate": 0.002052336568678134, "loss": 2.4581, "step": 19493 }, { "crossentropy": 2.6070849895477295, "epoch": 0.7067140371229699, "grad_norm": 0.026570376008749008, "grad_norm_var": 1.206007086540954e-06, "learning_rate": 0.0020518672181626275, "loss": 2.5192, "step": 19494 }, { "crossentropy": 2.464172601699829, "epoch": 0.7067502900232019, "grad_norm": 0.0273080263286829, "grad_norm_var": 1.1938649906890422e-06, "learning_rate": 0.002051397907466126, "loss": 2.4781, "step": 19495 }, { "crossentropy": 2.562854290008545, "epoch": 0.7067865429234339, "grad_norm": 0.026418626308441162, "grad_norm_var": 1.2062801547523995e-06, "learning_rate": 0.002050928636594972, "loss": 2.5162, "step": 19496 }, { "crossentropy": 2.4808132648468018, "epoch": 0.7068227958236659, "grad_norm": 0.02623061276972294, "grad_norm_var": 1.2471160006968334e-06, "learning_rate": 0.0020504594055555003, "loss": 2.5858, "step": 19497 }, { "crossentropy": 2.649214506149292, "epoch": 0.7068590487238979, "grad_norm": 0.025700191035866737, "grad_norm_var": 1.3546186337934364e-06, "learning_rate": 0.0020499902143540525, "loss": 2.5491, "step": 19498 }, { "crossentropy": 2.4412682056427, "epoch": 0.70689530162413, "grad_norm": 0.027957255020737648, "grad_norm_var": 1.2707157776547003e-06, "learning_rate": 0.0020495210629969617, "loss": 2.5217, "step": 19499 }, { "crossentropy": 2.547280788421631, "epoch": 0.706931554524362, "grad_norm": 0.025312630459666252, "grad_norm_var": 1.4263446699532e-06, "learning_rate": 0.002049051951490567, "loss": 2.556, "step": 19500 }, { "crossentropy": 2.6147732734680176, "epoch": 0.706967807424594, "grad_norm": 0.028398314490914345, "grad_norm_var": 1.5573272271899678e-06, "learning_rate": 0.0020485828798412053, "loss": 2.5436, "step": 19501 }, { "crossentropy": 2.4660816192626953, "epoch": 0.707004060324826, "grad_norm": 0.0423310287296772, "grad_norm_var": 1.6441378102691643e-05, "learning_rate": 0.002048113848055209, "loss": 2.5509, "step": 19502 }, { "crossentropy": 2.4510393142700195, "epoch": 0.707040313225058, "grad_norm": 0.027441924437880516, "grad_norm_var": 1.6027828143793237e-05, "learning_rate": 0.0020476448561389166, "loss": 2.4739, "step": 19503 }, { "crossentropy": 2.5972135066986084, "epoch": 0.70707656612529, "grad_norm": 0.0261749979108572, "grad_norm_var": 1.6143389843198814e-05, "learning_rate": 0.0020471759040986606, "loss": 2.5278, "step": 19504 }, { "crossentropy": 2.579603433609009, "epoch": 0.707112819025522, "grad_norm": 0.027319345623254776, "grad_norm_var": 1.6149641114098395e-05, "learning_rate": 0.002046706991940773, "loss": 2.5731, "step": 19505 }, { "crossentropy": 2.594587564468384, "epoch": 0.707149071925754, "grad_norm": 0.025970762595534325, "grad_norm_var": 1.6281523669282395e-05, "learning_rate": 0.002046238119671592, "loss": 2.5996, "step": 19506 }, { "crossentropy": 2.529435157775879, "epoch": 0.7071853248259861, "grad_norm": 0.02568449079990387, "grad_norm_var": 1.626210898078548e-05, "learning_rate": 0.002045769287297445, "loss": 2.4474, "step": 19507 }, { "crossentropy": 2.667952537536621, "epoch": 0.7072215777262181, "grad_norm": 0.029156751930713654, "grad_norm_var": 1.6330592229936378e-05, "learning_rate": 0.0020453004948246677, "loss": 2.6, "step": 19508 }, { "crossentropy": 2.405082941055298, "epoch": 0.7072578306264501, "grad_norm": 0.02692835032939911, "grad_norm_var": 1.6111742484301477e-05, "learning_rate": 0.0020448317422595923, "loss": 2.5101, "step": 19509 }, { "crossentropy": 2.6260550022125244, "epoch": 0.7072940835266821, "grad_norm": 0.02747982367873192, "grad_norm_var": 1.6013546303642495e-05, "learning_rate": 0.0020443630296085474, "loss": 2.6063, "step": 19510 }, { "crossentropy": 2.4901223182678223, "epoch": 0.7073303364269141, "grad_norm": 0.02796509675681591, "grad_norm_var": 1.5991881142145034e-05, "learning_rate": 0.002043894356877867, "loss": 2.5565, "step": 19511 }, { "crossentropy": 2.3427271842956543, "epoch": 0.7073665893271461, "grad_norm": 0.026676727458834648, "grad_norm_var": 1.5944914425660797e-05, "learning_rate": 0.002043425724073879, "loss": 2.4544, "step": 19512 }, { "crossentropy": 2.439012289047241, "epoch": 0.7074028422273781, "grad_norm": 0.026116950437426567, "grad_norm_var": 1.5971332362181972e-05, "learning_rate": 0.002042957131202912, "loss": 2.4229, "step": 19513 }, { "crossentropy": 2.6025302410125732, "epoch": 0.7074390951276102, "grad_norm": 0.026752956211566925, "grad_norm_var": 1.5729934655770332e-05, "learning_rate": 0.0020424885782712983, "loss": 2.5295, "step": 19514 }, { "crossentropy": 2.482734441757202, "epoch": 0.7074753480278422, "grad_norm": 0.02659892477095127, "grad_norm_var": 1.5849227784713025e-05, "learning_rate": 0.002042020065285362, "loss": 2.4733, "step": 19515 }, { "crossentropy": 2.3743720054626465, "epoch": 0.7075116009280742, "grad_norm": 0.02573554590344429, "grad_norm_var": 1.5714828345939762e-05, "learning_rate": 0.0020415515922514345, "loss": 2.3556, "step": 19516 }, { "crossentropy": 2.513718605041504, "epoch": 0.7075478538283063, "grad_norm": 0.02675595134496689, "grad_norm_var": 1.577883512200669e-05, "learning_rate": 0.0020410831591758435, "loss": 2.5368, "step": 19517 }, { "crossentropy": 2.520392656326294, "epoch": 0.7075841067285383, "grad_norm": 0.026190634816884995, "grad_norm_var": 8.282738705564826e-07, "learning_rate": 0.0020406147660649134, "loss": 2.5295, "step": 19518 }, { "crossentropy": 2.362597703933716, "epoch": 0.7076203596287703, "grad_norm": 0.025786077603697777, "grad_norm_var": 8.599736128834009e-07, "learning_rate": 0.002040146412924973, "loss": 2.3967, "step": 19519 }, { "crossentropy": 2.429396152496338, "epoch": 0.7076566125290024, "grad_norm": 0.026858795434236526, "grad_norm_var": 8.407991315729942e-07, "learning_rate": 0.0020396780997623478, "loss": 2.4914, "step": 19520 }, { "crossentropy": 2.509120464324951, "epoch": 0.7076928654292344, "grad_norm": 0.026847437024116516, "grad_norm_var": 8.188041411609503e-07, "learning_rate": 0.00203920982658336, "loss": 2.3965, "step": 19521 }, { "crossentropy": 2.431680202484131, "epoch": 0.7077291183294664, "grad_norm": 0.026189545169472694, "grad_norm_var": 7.999665854865312e-07, "learning_rate": 0.0020387415933943386, "loss": 2.5085, "step": 19522 }, { "crossentropy": 2.558238983154297, "epoch": 0.7077653712296984, "grad_norm": 0.027699485421180725, "grad_norm_var": 7.720967379271429e-07, "learning_rate": 0.002038273400201604, "loss": 2.548, "step": 19523 }, { "crossentropy": 2.608043670654297, "epoch": 0.7078016241299304, "grad_norm": 0.02640765719115734, "grad_norm_var": 4.0209685464215733e-07, "learning_rate": 0.002037805247011482, "loss": 2.5767, "step": 19524 }, { "crossentropy": 2.4568402767181396, "epoch": 0.7078378770301624, "grad_norm": 0.02623680606484413, "grad_norm_var": 4.0972076287199877e-07, "learning_rate": 0.002037337133830297, "loss": 2.4575, "step": 19525 }, { "crossentropy": 2.5761475563049316, "epoch": 0.7078741299303944, "grad_norm": 0.028190944343805313, "grad_norm_var": 5.206091774632386e-07, "learning_rate": 0.0020368690606643687, "loss": 2.596, "step": 19526 }, { "crossentropy": 2.4948627948760986, "epoch": 0.7079103828306265, "grad_norm": 0.06065722927451134, "grad_norm_var": 7.288546345375433e-05, "learning_rate": 0.002036401027520021, "loss": 2.5064, "step": 19527 }, { "crossentropy": 2.6069798469543457, "epoch": 0.7079466357308585, "grad_norm": 0.030443958938121796, "grad_norm_var": 7.274043153917448e-05, "learning_rate": 0.002035933034403576, "loss": 2.5538, "step": 19528 }, { "crossentropy": 2.614718437194824, "epoch": 0.7079828886310905, "grad_norm": 0.026930244639515877, "grad_norm_var": 7.247273584448689e-05, "learning_rate": 0.0020354650813213518, "loss": 2.4995, "step": 19529 }, { "crossentropy": 2.5955123901367188, "epoch": 0.7080191415313225, "grad_norm": 0.026110883802175522, "grad_norm_var": 7.26923804572295e-05, "learning_rate": 0.002034997168279671, "loss": 2.5391, "step": 19530 }, { "crossentropy": 2.5116631984710693, "epoch": 0.7080553944315545, "grad_norm": 0.026780031621456146, "grad_norm_var": 7.263699341840858e-05, "learning_rate": 0.0020345292952848528, "loss": 2.4925, "step": 19531 }, { "crossentropy": 2.417609453201294, "epoch": 0.7080916473317865, "grad_norm": 0.027268247678875923, "grad_norm_var": 7.21189755876702e-05, "learning_rate": 0.0020340614623432156, "loss": 2.4941, "step": 19532 }, { "crossentropy": 2.549995183944702, "epoch": 0.7081279002320185, "grad_norm": 0.026259852573275566, "grad_norm_var": 7.228839104291701e-05, "learning_rate": 0.002033593669461082, "loss": 2.5614, "step": 19533 }, { "crossentropy": 2.6008238792419434, "epoch": 0.7081641531322506, "grad_norm": 0.02755374275147915, "grad_norm_var": 7.188417995705752e-05, "learning_rate": 0.002033125916644765, "loss": 2.5124, "step": 19534 }, { "crossentropy": 2.5417943000793457, "epoch": 0.7082004060324826, "grad_norm": 0.026008041575551033, "grad_norm_var": 7.178803447198481e-05, "learning_rate": 0.002032658203900587, "loss": 2.5422, "step": 19535 }, { "crossentropy": 2.5009233951568604, "epoch": 0.7082366589327146, "grad_norm": 0.027029965072870255, "grad_norm_var": 7.173751317649118e-05, "learning_rate": 0.002032190531234863, "loss": 2.5369, "step": 19536 }, { "crossentropy": 2.4826529026031494, "epoch": 0.7082729118329466, "grad_norm": 0.02698131650686264, "grad_norm_var": 7.169729245002827e-05, "learning_rate": 0.0020317228986539075, "loss": 2.5068, "step": 19537 }, { "crossentropy": 2.378434181213379, "epoch": 0.7083091647331786, "grad_norm": 0.028354089707136154, "grad_norm_var": 7.112943959705863e-05, "learning_rate": 0.0020312553061640416, "loss": 2.4721, "step": 19538 }, { "crossentropy": 2.5036096572875977, "epoch": 0.7083454176334106, "grad_norm": 0.026206443086266518, "grad_norm_var": 7.158878088882266e-05, "learning_rate": 0.0020307877537715756, "loss": 2.4888, "step": 19539 }, { "crossentropy": 2.55706524848938, "epoch": 0.7083816705336426, "grad_norm": 0.02856937050819397, "grad_norm_var": 7.107205772960788e-05, "learning_rate": 0.002030320241482827, "loss": 2.6479, "step": 19540 }, { "crossentropy": 2.4970333576202393, "epoch": 0.7084179234338747, "grad_norm": 0.025822049006819725, "grad_norm_var": 7.12549066588223e-05, "learning_rate": 0.0020298527693041117, "loss": 2.4334, "step": 19541 }, { "crossentropy": 2.2832717895507812, "epoch": 0.7084541763341067, "grad_norm": 0.02690150775015354, "grad_norm_var": 7.15534335379186e-05, "learning_rate": 0.002029385337241741, "loss": 2.3534, "step": 19542 }, { "crossentropy": 2.4795360565185547, "epoch": 0.7084904292343387, "grad_norm": 0.026299970224499702, "grad_norm_var": 1.4190298368975946e-06, "learning_rate": 0.00202891794530203, "loss": 2.5035, "step": 19543 }, { "crossentropy": 2.4267990589141846, "epoch": 0.7085266821345708, "grad_norm": 0.027096323668956757, "grad_norm_var": 6.246258595901753e-07, "learning_rate": 0.00202845059349129, "loss": 2.4854, "step": 19544 }, { "crossentropy": 2.4865822792053223, "epoch": 0.7085629350348028, "grad_norm": 0.02552224136888981, "grad_norm_var": 7.401782295373641e-07, "learning_rate": 0.002027983281815836, "loss": 2.4606, "step": 19545 }, { "crossentropy": 2.664710521697998, "epoch": 0.7085991879350348, "grad_norm": 0.026681793853640556, "grad_norm_var": 7.082638310784127e-07, "learning_rate": 0.002027516010281978, "loss": 2.5741, "step": 19546 }, { "crossentropy": 2.5780208110809326, "epoch": 0.7086354408352669, "grad_norm": 0.026112373918294907, "grad_norm_var": 7.408784301954814e-07, "learning_rate": 0.002027048778896025, "loss": 2.5729, "step": 19547 }, { "crossentropy": 2.5870044231414795, "epoch": 0.7086716937354989, "grad_norm": 0.02632058411836624, "grad_norm_var": 7.367943405680999e-07, "learning_rate": 0.00202658158766429, "loss": 2.5472, "step": 19548 }, { "crossentropy": 2.5094079971313477, "epoch": 0.7087079466357309, "grad_norm": 0.02679958939552307, "grad_norm_var": 7.209890547023775e-07, "learning_rate": 0.002026114436593085, "loss": 2.4461, "step": 19549 }, { "crossentropy": 2.5395052433013916, "epoch": 0.7087441995359629, "grad_norm": 0.026863619685173035, "grad_norm_var": 6.782902289784725e-07, "learning_rate": 0.002025647325688716, "loss": 2.5535, "step": 19550 }, { "crossentropy": 2.4505865573883057, "epoch": 0.7087804524361949, "grad_norm": 0.027092093601822853, "grad_norm_var": 6.483864366498786e-07, "learning_rate": 0.002025180254957495, "loss": 2.4853, "step": 19551 }, { "crossentropy": 2.4690048694610596, "epoch": 0.7088167053364269, "grad_norm": 0.02639160491526127, "grad_norm_var": 6.535017841070856e-07, "learning_rate": 0.0020247132244057275, "loss": 2.4778, "step": 19552 }, { "crossentropy": 2.470911979675293, "epoch": 0.708852958236659, "grad_norm": 0.027090633288025856, "grad_norm_var": 6.576066006831836e-07, "learning_rate": 0.0020242462340397246, "loss": 2.48, "step": 19553 }, { "crossentropy": 2.3792049884796143, "epoch": 0.708889211136891, "grad_norm": 0.026693642139434814, "grad_norm_var": 4.7651001872579753e-07, "learning_rate": 0.002023779283865792, "loss": 2.5152, "step": 19554 }, { "crossentropy": 2.617448329925537, "epoch": 0.708925464037123, "grad_norm": 0.026790201663970947, "grad_norm_var": 4.629738159904357e-07, "learning_rate": 0.0020233123738902354, "loss": 2.5453, "step": 19555 }, { "crossentropy": 2.505478858947754, "epoch": 0.708961716937355, "grad_norm": 0.026914697140455246, "grad_norm_var": 2.1956751970660566e-07, "learning_rate": 0.0020228455041193623, "loss": 2.4707, "step": 19556 }, { "crossentropy": 2.4100539684295654, "epoch": 0.708997969837587, "grad_norm": 0.025440014898777008, "grad_norm_var": 2.6765732835501835e-07, "learning_rate": 0.0020223786745594808, "loss": 2.4667, "step": 19557 }, { "crossentropy": 2.438692569732666, "epoch": 0.709034222737819, "grad_norm": 0.026191098615527153, "grad_norm_var": 2.6715315181202677e-07, "learning_rate": 0.0020219118852168915, "loss": 2.5023, "step": 19558 }, { "crossentropy": 2.4904279708862305, "epoch": 0.709070475638051, "grad_norm": 0.02635238878428936, "grad_norm_var": 2.657955898696518e-07, "learning_rate": 0.002021445136097904, "loss": 2.5744, "step": 19559 }, { "crossentropy": 2.4112720489501953, "epoch": 0.709106728538283, "grad_norm": 0.02564871497452259, "grad_norm_var": 2.8592685139451854e-07, "learning_rate": 0.0020209784272088183, "loss": 2.4745, "step": 19560 }, { "crossentropy": 2.409852981567383, "epoch": 0.7091429814385151, "grad_norm": 0.026234347373247147, "grad_norm_var": 2.3128081323163924e-07, "learning_rate": 0.002020511758555941, "loss": 2.4866, "step": 19561 }, { "crossentropy": 2.3393197059631348, "epoch": 0.7091792343387471, "grad_norm": 0.026665156707167625, "grad_norm_var": 2.3084179710552466e-07, "learning_rate": 0.0020200451301455743, "loss": 2.4285, "step": 19562 }, { "crossentropy": 2.5840699672698975, "epoch": 0.7092154872389791, "grad_norm": 0.026951083913445473, "grad_norm_var": 2.342493810170501e-07, "learning_rate": 0.0020195785419840186, "loss": 2.4945, "step": 19563 }, { "crossentropy": 2.4766178131103516, "epoch": 0.7092517401392111, "grad_norm": 0.025664472952485085, "grad_norm_var": 2.792529155889404e-07, "learning_rate": 0.0020191119940775777, "loss": 2.4973, "step": 19564 }, { "crossentropy": 2.5128796100616455, "epoch": 0.7092879930394431, "grad_norm": 0.026717575266957283, "grad_norm_var": 2.762491725191718e-07, "learning_rate": 0.0020186454864325543, "loss": 2.5501, "step": 19565 }, { "crossentropy": 2.469428300857544, "epoch": 0.7093242459396751, "grad_norm": 0.027373014017939568, "grad_norm_var": 3.1843138087737865e-07, "learning_rate": 0.0020181790190552467, "loss": 2.4988, "step": 19566 }, { "crossentropy": 2.4509785175323486, "epoch": 0.7093604988399071, "grad_norm": 0.026393583044409752, "grad_norm_var": 2.950084114886614e-07, "learning_rate": 0.002017712591951958, "loss": 2.5067, "step": 19567 }, { "crossentropy": 2.5158531665802, "epoch": 0.7093967517401392, "grad_norm": 0.0263061560690403, "grad_norm_var": 2.963523918139121e-07, "learning_rate": 0.002017246205128985, "loss": 2.5564, "step": 19568 }, { "crossentropy": 2.579566478729248, "epoch": 0.7094330046403712, "grad_norm": 0.027723371982574463, "grad_norm_var": 3.742261370110461e-07, "learning_rate": 0.002016779858592628, "loss": 2.6217, "step": 19569 }, { "crossentropy": 2.6417155265808105, "epoch": 0.7094692575406032, "grad_norm": 0.030089061707258224, "grad_norm_var": 1.1807628239841044e-06, "learning_rate": 0.0020163135523491903, "loss": 2.5653, "step": 19570 }, { "crossentropy": 2.5746712684631348, "epoch": 0.7095055104408353, "grad_norm": 0.026752905920147896, "grad_norm_var": 1.180480442544013e-06, "learning_rate": 0.002015847286404962, "loss": 2.5899, "step": 19571 }, { "crossentropy": 2.5147600173950195, "epoch": 0.7095417633410673, "grad_norm": 0.026984399184584618, "grad_norm_var": 1.1826529833560642e-06, "learning_rate": 0.0020153810607662455, "loss": 2.5978, "step": 19572 }, { "crossentropy": 2.520358085632324, "epoch": 0.7095780162412993, "grad_norm": 0.027514416724443436, "grad_norm_var": 1.0981367821127762e-06, "learning_rate": 0.0020149148754393382, "loss": 2.5581, "step": 19573 }, { "crossentropy": 2.337777614593506, "epoch": 0.7096142691415314, "grad_norm": 0.02630826272070408, "grad_norm_var": 1.0887388162744792e-06, "learning_rate": 0.0020144487304305336, "loss": 2.297, "step": 19574 }, { "crossentropy": 2.3855483531951904, "epoch": 0.7096505220417634, "grad_norm": 0.02741939015686512, "grad_norm_var": 1.0883993275562076e-06, "learning_rate": 0.002013982625746132, "loss": 2.4962, "step": 19575 }, { "crossentropy": 2.589136838912964, "epoch": 0.7096867749419954, "grad_norm": 0.02730732038617134, "grad_norm_var": 9.788355508720999e-07, "learning_rate": 0.0020135165613924247, "loss": 2.6221, "step": 19576 }, { "crossentropy": 2.58237886428833, "epoch": 0.7097230278422274, "grad_norm": 0.02715264819562435, "grad_norm_var": 9.346981565796446e-07, "learning_rate": 0.0020130505373757075, "loss": 2.615, "step": 19577 }, { "crossentropy": 2.526179313659668, "epoch": 0.7097592807424594, "grad_norm": 0.02674131840467453, "grad_norm_var": 9.308208287233642e-07, "learning_rate": 0.00201258455370228, "loss": 2.5199, "step": 19578 }, { "crossentropy": 2.4640047550201416, "epoch": 0.7097955336426914, "grad_norm": 0.026812510564923286, "grad_norm_var": 9.345402969257802e-07, "learning_rate": 0.0020121186103784277, "loss": 2.5323, "step": 19579 }, { "crossentropy": 2.5988497734069824, "epoch": 0.7098317865429234, "grad_norm": 0.026416271924972534, "grad_norm_var": 8.28095913657266e-07, "learning_rate": 0.0020116527074104484, "loss": 2.4824, "step": 19580 }, { "crossentropy": 2.4140214920043945, "epoch": 0.7098680394431555, "grad_norm": 0.02687356434762478, "grad_norm_var": 8.211269924226211e-07, "learning_rate": 0.0020111868448046354, "loss": 2.4559, "step": 19581 }, { "crossentropy": 2.4009101390838623, "epoch": 0.7099042923433875, "grad_norm": 0.027181120589375496, "grad_norm_var": 8.173517639097159e-07, "learning_rate": 0.0020107210225672784, "loss": 2.5412, "step": 19582 }, { "crossentropy": 2.513368606567383, "epoch": 0.7099405452436195, "grad_norm": 0.025441350415349007, "grad_norm_var": 9.666992782101363e-07, "learning_rate": 0.002010255240704672, "loss": 2.4455, "step": 19583 }, { "crossentropy": 2.6357405185699463, "epoch": 0.7099767981438515, "grad_norm": 0.026958927512168884, "grad_norm_var": 9.273709608785396e-07, "learning_rate": 0.0020097894992231035, "loss": 2.6013, "step": 19584 }, { "crossentropy": 2.5262632369995117, "epoch": 0.7100130510440835, "grad_norm": 0.0265479888767004, "grad_norm_var": 9.167754924999567e-07, "learning_rate": 0.0020093237981288663, "loss": 2.5595, "step": 19585 }, { "crossentropy": 2.488806962966919, "epoch": 0.7100493039443155, "grad_norm": 0.02605392038822174, "grad_norm_var": 2.8931203317709294e-07, "learning_rate": 0.002008858137428251, "loss": 2.4216, "step": 19586 }, { "crossentropy": 2.496473550796509, "epoch": 0.7100855568445475, "grad_norm": 0.025763602927327156, "grad_norm_var": 3.5394314853252203e-07, "learning_rate": 0.0020083925171275464, "loss": 2.5643, "step": 19587 }, { "crossentropy": 2.5872535705566406, "epoch": 0.7101218097447796, "grad_norm": 0.025487976148724556, "grad_norm_var": 4.4060847787172344e-07, "learning_rate": 0.0020079269372330383, "loss": 2.6034, "step": 19588 }, { "crossentropy": 2.492844581604004, "epoch": 0.7101580626450116, "grad_norm": 0.027083706110715866, "grad_norm_var": 4.0105579206482317e-07, "learning_rate": 0.00200746139775102, "loss": 2.4706, "step": 19589 }, { "crossentropy": 2.4968044757843018, "epoch": 0.7101943155452436, "grad_norm": 0.02595737762749195, "grad_norm_var": 4.2225309487484153e-07, "learning_rate": 0.0020069958986877745, "loss": 2.4745, "step": 19590 }, { "crossentropy": 2.575500965118408, "epoch": 0.7102305684454756, "grad_norm": 0.027331531047821045, "grad_norm_var": 4.1284316126947476e-07, "learning_rate": 0.002006530440049594, "loss": 2.4916, "step": 19591 }, { "crossentropy": 2.5079281330108643, "epoch": 0.7102668213457076, "grad_norm": 0.02579541690647602, "grad_norm_var": 4.069629325392209e-07, "learning_rate": 0.0020060650218427603, "loss": 2.5061, "step": 19592 }, { "crossentropy": 2.4958112239837646, "epoch": 0.7103030742459396, "grad_norm": 0.026366209611296654, "grad_norm_var": 3.745560938354815e-07, "learning_rate": 0.0020055996440735612, "loss": 2.4971, "step": 19593 }, { "crossentropy": 2.5357296466827393, "epoch": 0.7103393271461717, "grad_norm": 0.027052579447627068, "grad_norm_var": 3.9370580410985564e-07, "learning_rate": 0.0020051343067482856, "loss": 2.5729, "step": 19594 }, { "crossentropy": 2.6313352584838867, "epoch": 0.7103755800464037, "grad_norm": 0.025878332555294037, "grad_norm_var": 4.0250436343231e-07, "learning_rate": 0.002004669009873216, "loss": 2.5386, "step": 19595 }, { "crossentropy": 2.528616428375244, "epoch": 0.7104118329466357, "grad_norm": 0.025784149765968323, "grad_norm_var": 4.249997043454893e-07, "learning_rate": 0.0020042037534546355, "loss": 2.5237, "step": 19596 }, { "crossentropy": 2.480264186859131, "epoch": 0.7104480858468677, "grad_norm": 0.025844035670161247, "grad_norm_var": 4.190129075517616e-07, "learning_rate": 0.0020037385374988316, "loss": 2.4184, "step": 19597 }, { "crossentropy": 2.3924553394317627, "epoch": 0.7104843387470998, "grad_norm": 0.027042711153626442, "grad_norm_var": 4.0363604247167784e-07, "learning_rate": 0.002003273362012083, "loss": 2.468, "step": 19598 }, { "crossentropy": 2.3786606788635254, "epoch": 0.7105205916473318, "grad_norm": 0.027562394738197327, "grad_norm_var": 4.492318300660606e-07, "learning_rate": 0.0020028082270006774, "loss": 2.4221, "step": 19599 }, { "crossentropy": 2.483234167098999, "epoch": 0.7105568445475638, "grad_norm": 0.02721683867275715, "grad_norm_var": 4.723714332924289e-07, "learning_rate": 0.0020023431324708942, "loss": 2.4501, "step": 19600 }, { "crossentropy": 2.4012527465820312, "epoch": 0.7105930974477959, "grad_norm": 0.02577986754477024, "grad_norm_var": 4.964511384148232e-07, "learning_rate": 0.0020018780784290154, "loss": 2.4838, "step": 19601 }, { "crossentropy": 2.4982352256774902, "epoch": 0.7106293503480279, "grad_norm": 0.026081478223204613, "grad_norm_var": 4.953186858513142e-07, "learning_rate": 0.0020014130648813243, "loss": 2.4838, "step": 19602 }, { "crossentropy": 2.490476131439209, "epoch": 0.7106656032482599, "grad_norm": 0.0257441233843565, "grad_norm_var": 4.969349453941835e-07, "learning_rate": 0.0020009480918340996, "loss": 2.5041, "step": 19603 }, { "crossentropy": 2.467604637145996, "epoch": 0.7107018561484919, "grad_norm": 0.026041453704237938, "grad_norm_var": 4.505810683805376e-07, "learning_rate": 0.002000483159293623, "loss": 2.558, "step": 19604 }, { "crossentropy": 2.5977985858917236, "epoch": 0.7107381090487239, "grad_norm": 0.026030486449599266, "grad_norm_var": 4.253218403688433e-07, "learning_rate": 0.0020000182672661737, "loss": 2.5168, "step": 19605 }, { "crossentropy": 2.4683594703674316, "epoch": 0.7107743619489559, "grad_norm": 0.026548687368631363, "grad_norm_var": 4.16668409593127e-07, "learning_rate": 0.001999553415758028, "loss": 2.4866, "step": 19606 }, { "crossentropy": 2.654937982559204, "epoch": 0.710810614849188, "grad_norm": 0.025547996163368225, "grad_norm_var": 3.8950385218340136e-07, "learning_rate": 0.0019990886047754693, "loss": 2.6189, "step": 19607 }, { "crossentropy": 2.438199758529663, "epoch": 0.71084686774942, "grad_norm": 0.025046486407518387, "grad_norm_var": 4.7193032904925083e-07, "learning_rate": 0.001998623834324771, "loss": 2.4485, "step": 19608 }, { "crossentropy": 2.5490756034851074, "epoch": 0.710883120649652, "grad_norm": 0.02533859945833683, "grad_norm_var": 5.18305974716154e-07, "learning_rate": 0.001998159104412212, "loss": 2.5061, "step": 19609 }, { "crossentropy": 2.267312526702881, "epoch": 0.710919373549884, "grad_norm": 0.025084787979722023, "grad_norm_var": 5.258063029869401e-07, "learning_rate": 0.0019976944150440724, "loss": 2.3273, "step": 19610 }, { "crossentropy": 2.6825125217437744, "epoch": 0.710955626450116, "grad_norm": 0.027156082913279533, "grad_norm_var": 6.010234132163324e-07, "learning_rate": 0.0019972297662266235, "loss": 2.6444, "step": 19611 }, { "crossentropy": 2.4701387882232666, "epoch": 0.710991879350348, "grad_norm": 0.026768183335661888, "grad_norm_var": 6.180510352328412e-07, "learning_rate": 0.0019967651579661455, "loss": 2.4336, "step": 19612 }, { "crossentropy": 2.6021358966827393, "epoch": 0.71102813225058, "grad_norm": 0.026175837963819504, "grad_norm_var": 6.101952696328187e-07, "learning_rate": 0.0019963005902689118, "loss": 2.6136, "step": 19613 }, { "crossentropy": 2.608886957168579, "epoch": 0.711064385150812, "grad_norm": 0.025038141757249832, "grad_norm_var": 6.355348221603306e-07, "learning_rate": 0.001995836063141195, "loss": 2.5357, "step": 19614 }, { "crossentropy": 2.3533012866973877, "epoch": 0.7111006380510441, "grad_norm": 0.025926033034920692, "grad_norm_var": 4.778419588315757e-07, "learning_rate": 0.0019953715765892725, "loss": 2.4767, "step": 19615 }, { "crossentropy": 2.5037858486175537, "epoch": 0.7111368909512761, "grad_norm": 0.026483189314603806, "grad_norm_var": 3.895474773488172e-07, "learning_rate": 0.0019949071306194156, "loss": 2.5601, "step": 19616 }, { "crossentropy": 2.4748036861419678, "epoch": 0.7111731438515081, "grad_norm": 0.027048612013459206, "grad_norm_var": 4.656936002373861e-07, "learning_rate": 0.001994442725237898, "loss": 2.4639, "step": 19617 }, { "crossentropy": 2.4128782749176025, "epoch": 0.7112093967517401, "grad_norm": 0.025965068489313126, "grad_norm_var": 4.6533428306991656e-07, "learning_rate": 0.0019939783604509937, "loss": 2.452, "step": 19618 }, { "crossentropy": 2.5830790996551514, "epoch": 0.7112456496519721, "grad_norm": 0.026776231825351715, "grad_norm_var": 4.971835817784802e-07, "learning_rate": 0.0019935140362649715, "loss": 2.518, "step": 19619 }, { "crossentropy": 2.4597532749176025, "epoch": 0.7112819025522041, "grad_norm": 0.02565647102892399, "grad_norm_var": 5.074497510868317e-07, "learning_rate": 0.0019930497526861072, "loss": 2.4628, "step": 19620 }, { "crossentropy": 2.4179399013519287, "epoch": 0.7113181554524362, "grad_norm": 0.02655753307044506, "grad_norm_var": 5.243580099155316e-07, "learning_rate": 0.0019925855097206696, "loss": 2.5435, "step": 19621 }, { "crossentropy": 2.540079355239868, "epoch": 0.7113544083526682, "grad_norm": 0.026493553072214127, "grad_norm_var": 5.210281058315794e-07, "learning_rate": 0.001992121307374926, "loss": 2.62, "step": 19622 }, { "crossentropy": 2.4981625080108643, "epoch": 0.7113906612529002, "grad_norm": 0.028014743700623512, "grad_norm_var": 7.308195840644334e-07, "learning_rate": 0.0019916571456551507, "loss": 2.4093, "step": 19623 }, { "crossentropy": 2.4341437816619873, "epoch": 0.7114269141531323, "grad_norm": 0.02675079181790352, "grad_norm_var": 6.455548564874055e-07, "learning_rate": 0.0019911930245676097, "loss": 2.4498, "step": 19624 }, { "crossentropy": 2.5017287731170654, "epoch": 0.7114631670533643, "grad_norm": 0.02636287920176983, "grad_norm_var": 5.761242984831146e-07, "learning_rate": 0.001990728944118572, "loss": 2.4721, "step": 19625 }, { "crossentropy": 2.347426414489746, "epoch": 0.7114994199535963, "grad_norm": 0.026700805872678757, "grad_norm_var": 4.578668206665799e-07, "learning_rate": 0.0019902649043143095, "loss": 2.4844, "step": 19626 }, { "crossentropy": 2.4745121002197266, "epoch": 0.7115356728538283, "grad_norm": 0.027010826393961906, "grad_norm_var": 4.4632649971772474e-07, "learning_rate": 0.0019898009051610844, "loss": 2.4597, "step": 19627 }, { "crossentropy": 2.78119158744812, "epoch": 0.7115719257540604, "grad_norm": 0.029540393501520157, "grad_norm_var": 1.0320392419433613e-06, "learning_rate": 0.0019893369466651685, "loss": 2.7018, "step": 19628 }, { "crossentropy": 2.604780912399292, "epoch": 0.7116081786542924, "grad_norm": 0.026268918067216873, "grad_norm_var": 1.0266176331917189e-06, "learning_rate": 0.0019888730288328254, "loss": 2.5019, "step": 19629 }, { "crossentropy": 2.4942169189453125, "epoch": 0.7116444315545244, "grad_norm": 0.025780122727155685, "grad_norm_var": 9.003629751253798e-07, "learning_rate": 0.00198840915167032, "loss": 2.4715, "step": 19630 }, { "crossentropy": 2.572662830352783, "epoch": 0.7116806844547564, "grad_norm": 0.02860836870968342, "grad_norm_var": 1.0701966898453053e-06, "learning_rate": 0.001987945315183921, "loss": 2.5356, "step": 19631 }, { "crossentropy": 2.583904981613159, "epoch": 0.7117169373549884, "grad_norm": 0.026994941756129265, "grad_norm_var": 1.0597512405586055e-06, "learning_rate": 0.00198748151937989, "loss": 2.5712, "step": 19632 }, { "crossentropy": 2.635361909866333, "epoch": 0.7117531902552204, "grad_norm": 0.0267594363540411, "grad_norm_var": 1.0595615569382322e-06, "learning_rate": 0.0019870177642644926, "loss": 2.5963, "step": 19633 }, { "crossentropy": 2.577258348464966, "epoch": 0.7117894431554525, "grad_norm": 0.025968195870518684, "grad_norm_var": 1.0591764581384938e-06, "learning_rate": 0.001986554049843994, "loss": 2.4729, "step": 19634 }, { "crossentropy": 2.6210694313049316, "epoch": 0.7118256960556845, "grad_norm": 0.030312132090330124, "grad_norm_var": 1.786827848010763e-06, "learning_rate": 0.001986090376124655, "loss": 2.5936, "step": 19635 }, { "crossentropy": 2.5725185871124268, "epoch": 0.7118619489559165, "grad_norm": 0.02682776376605034, "grad_norm_var": 1.6453758876086e-06, "learning_rate": 0.0019856267431127405, "loss": 2.5098, "step": 19636 }, { "crossentropy": 2.560264825820923, "epoch": 0.7118982018561485, "grad_norm": 0.026582421734929085, "grad_norm_var": 1.643334143557514e-06, "learning_rate": 0.0019851631508145118, "loss": 2.4939, "step": 19637 }, { "crossentropy": 2.5365195274353027, "epoch": 0.7119344547563805, "grad_norm": 0.027586795389652252, "grad_norm_var": 1.6170951608981386e-06, "learning_rate": 0.001984699599236228, "loss": 2.5503, "step": 19638 }, { "crossentropy": 2.6767239570617676, "epoch": 0.7119707076566125, "grad_norm": 0.026250256225466728, "grad_norm_var": 1.6327887155338782e-06, "learning_rate": 0.0019842360883841536, "loss": 2.5788, "step": 19639 }, { "crossentropy": 2.5587263107299805, "epoch": 0.7120069605568445, "grad_norm": 0.025984499603509903, "grad_norm_var": 1.7096706346056886e-06, "learning_rate": 0.0019837726182645454, "loss": 2.549, "step": 19640 }, { "crossentropy": 2.571749210357666, "epoch": 0.7120432134570766, "grad_norm": 0.025945788249373436, "grad_norm_var": 1.7613234290281276e-06, "learning_rate": 0.0019833091888836656, "loss": 2.5258, "step": 19641 }, { "crossentropy": 2.5960400104522705, "epoch": 0.7120794663573086, "grad_norm": 0.026323426514863968, "grad_norm_var": 1.7888064503819579e-06, "learning_rate": 0.001982845800247775, "loss": 2.4905, "step": 19642 }, { "crossentropy": 2.579566478729248, "epoch": 0.7121157192575406, "grad_norm": 0.026656663045287132, "grad_norm_var": 1.7983313483044272e-06, "learning_rate": 0.0019823824523631283, "loss": 2.5186, "step": 19643 }, { "crossentropy": 2.5285911560058594, "epoch": 0.7121519721577726, "grad_norm": 0.026561789214611053, "grad_norm_var": 1.3536098357575313e-06, "learning_rate": 0.0019819191452359886, "loss": 2.4927, "step": 19644 }, { "crossentropy": 2.3435866832733154, "epoch": 0.7121882250580046, "grad_norm": 0.026896603405475616, "grad_norm_var": 1.3305884810621098e-06, "learning_rate": 0.0019814558788726084, "loss": 2.4896, "step": 19645 }, { "crossentropy": 2.4699082374572754, "epoch": 0.7122244779582366, "grad_norm": 0.026588579639792442, "grad_norm_var": 1.2531530350236616e-06, "learning_rate": 0.0019809926532792493, "loss": 2.4603, "step": 19646 }, { "crossentropy": 2.3343513011932373, "epoch": 0.7122607308584686, "grad_norm": 0.025857962667942047, "grad_norm_var": 1.1097149484752545e-06, "learning_rate": 0.001980529468462166, "loss": 2.4712, "step": 19647 }, { "crossentropy": 2.354341983795166, "epoch": 0.7122969837587007, "grad_norm": 0.02620903216302395, "grad_norm_var": 1.1232883234929548e-06, "learning_rate": 0.0019800663244276128, "loss": 2.3556, "step": 19648 }, { "crossentropy": 2.4750425815582275, "epoch": 0.7123332366589327, "grad_norm": 0.02560662291944027, "grad_norm_var": 1.1982833048512775e-06, "learning_rate": 0.001979603221181847, "loss": 2.4989, "step": 19649 }, { "crossentropy": 2.3574142456054688, "epoch": 0.7123694895591647, "grad_norm": 0.026434490457177162, "grad_norm_var": 1.1704214682159894e-06, "learning_rate": 0.001979140158731125, "loss": 2.4267, "step": 19650 }, { "crossentropy": 2.494488477706909, "epoch": 0.7124057424593968, "grad_norm": 0.02568906731903553, "grad_norm_var": 2.575086982823608e-07, "learning_rate": 0.001978677137081697, "loss": 2.4385, "step": 19651 }, { "crossentropy": 2.4314491748809814, "epoch": 0.7124419953596288, "grad_norm": 0.02698889747262001, "grad_norm_var": 2.688564868505525e-07, "learning_rate": 0.0019782141562398213, "loss": 2.4763, "step": 19652 }, { "crossentropy": 2.54720139503479, "epoch": 0.7124782482598608, "grad_norm": 0.026349863037467003, "grad_norm_var": 2.661207021753629e-07, "learning_rate": 0.001977751216211748, "loss": 2.5395, "step": 19653 }, { "crossentropy": 2.593986988067627, "epoch": 0.7125145011600929, "grad_norm": 0.02651919797062874, "grad_norm_var": 1.6424158625822673e-07, "learning_rate": 0.0019772883170037325, "loss": 2.4961, "step": 19654 }, { "crossentropy": 2.5501363277435303, "epoch": 0.7125507540603249, "grad_norm": 0.026085494086146355, "grad_norm_var": 1.6711717485486517e-07, "learning_rate": 0.001976825458622025, "loss": 2.4935, "step": 19655 }, { "crossentropy": 2.5959951877593994, "epoch": 0.7125870069605569, "grad_norm": 0.026267778128385544, "grad_norm_var": 1.6045683599746375e-07, "learning_rate": 0.0019763626410728764, "loss": 2.5077, "step": 19656 }, { "crossentropy": 2.5020320415496826, "epoch": 0.7126232598607889, "grad_norm": 0.027182139456272125, "grad_norm_var": 1.9573395150055775e-07, "learning_rate": 0.0019758998643625383, "loss": 2.5241, "step": 19657 }, { "crossentropy": 2.3636863231658936, "epoch": 0.7126595127610209, "grad_norm": 0.026762615889310837, "grad_norm_var": 2.0397291376645735e-07, "learning_rate": 0.001975437128497264, "loss": 2.4564, "step": 19658 }, { "crossentropy": 2.4326424598693848, "epoch": 0.7126957656612529, "grad_norm": 0.025812625885009766, "grad_norm_var": 2.214196320691423e-07, "learning_rate": 0.0019749744334832987, "loss": 2.4819, "step": 19659 }, { "crossentropy": 2.563359260559082, "epoch": 0.7127320185614849, "grad_norm": 0.026162710040807724, "grad_norm_var": 2.2081178820119487e-07, "learning_rate": 0.001974511779326897, "loss": 2.5695, "step": 19660 }, { "crossentropy": 2.5136682987213135, "epoch": 0.712768271461717, "grad_norm": 0.0260897446423769, "grad_norm_var": 2.0144359257068256e-07, "learning_rate": 0.0019740491660343025, "loss": 2.4668, "step": 19661 }, { "crossentropy": 2.4439055919647217, "epoch": 0.712804524361949, "grad_norm": 0.02784564718604088, "grad_norm_var": 3.505994658251164e-07, "learning_rate": 0.001973586593611768, "loss": 2.5118, "step": 19662 }, { "crossentropy": 2.557093381881714, "epoch": 0.712840777262181, "grad_norm": 0.025974659249186516, "grad_norm_var": 3.4353809513307935e-07, "learning_rate": 0.0019731240620655378, "loss": 2.5235, "step": 19663 }, { "crossentropy": 2.6239147186279297, "epoch": 0.712877030162413, "grad_norm": 0.02706974558532238, "grad_norm_var": 3.7093230362960964e-07, "learning_rate": 0.0019726615714018626, "loss": 2.5291, "step": 19664 }, { "crossentropy": 2.499375343322754, "epoch": 0.712913283062645, "grad_norm": 0.025880396366119385, "grad_norm_var": 3.4564925336075654e-07, "learning_rate": 0.0019721991216269845, "loss": 2.4435, "step": 19665 }, { "crossentropy": 2.602177381515503, "epoch": 0.712949535962877, "grad_norm": 0.0286602433770895, "grad_norm_var": 6.522452609945453e-07, "learning_rate": 0.0019717367127471542, "loss": 2.5316, "step": 19666 }, { "crossentropy": 2.606717348098755, "epoch": 0.712985788863109, "grad_norm": 0.02681371383368969, "grad_norm_var": 5.971291494829736e-07, "learning_rate": 0.0019712743447686137, "loss": 2.5862, "step": 19667 }, { "crossentropy": 2.4591360092163086, "epoch": 0.713022041763341, "grad_norm": 0.02703707665205002, "grad_norm_var": 5.994249802085029e-07, "learning_rate": 0.001970812017697611, "loss": 2.4949, "step": 19668 }, { "crossentropy": 2.5677669048309326, "epoch": 0.7130582946635731, "grad_norm": 0.025630049407482147, "grad_norm_var": 6.612956310824679e-07, "learning_rate": 0.0019703497315403874, "loss": 2.5219, "step": 19669 }, { "crossentropy": 2.518873453140259, "epoch": 0.7130945475638051, "grad_norm": 0.026427676901221275, "grad_norm_var": 6.62952984790951e-07, "learning_rate": 0.001969887486303188, "loss": 2.4978, "step": 19670 }, { "crossentropy": 2.6378746032714844, "epoch": 0.7131308004640371, "grad_norm": 0.02519557997584343, "grad_norm_var": 7.742572652249512e-07, "learning_rate": 0.001969425281992258, "loss": 2.5197, "step": 19671 }, { "crossentropy": 2.592284679412842, "epoch": 0.7131670533642691, "grad_norm": 0.02608454041182995, "grad_norm_var": 7.832698663630718e-07, "learning_rate": 0.001968963118613839, "loss": 2.4935, "step": 19672 }, { "crossentropy": 2.3693196773529053, "epoch": 0.7132033062645011, "grad_norm": 0.026062291115522385, "grad_norm_var": 7.656676806045651e-07, "learning_rate": 0.001968500996174171, "loss": 2.4482, "step": 19673 }, { "crossentropy": 2.606126308441162, "epoch": 0.7132395591647331, "grad_norm": 0.026814907789230347, "grad_norm_var": 7.678834308819192e-07, "learning_rate": 0.0019680389146794998, "loss": 2.5192, "step": 19674 }, { "crossentropy": 2.4582481384277344, "epoch": 0.7132758120649652, "grad_norm": 0.02580190636217594, "grad_norm_var": 7.688338944072948e-07, "learning_rate": 0.0019675768741360617, "loss": 2.4865, "step": 19675 }, { "crossentropy": 2.340074062347412, "epoch": 0.7133120649651972, "grad_norm": 0.02687024138867855, "grad_norm_var": 7.709503380340001e-07, "learning_rate": 0.0019671148745501, "loss": 2.4149, "step": 19676 }, { "crossentropy": 2.4412639141082764, "epoch": 0.7133483178654292, "grad_norm": 0.026074841618537903, "grad_norm_var": 7.718115190335587e-07, "learning_rate": 0.0019666529159278563, "loss": 2.3825, "step": 19677 }, { "crossentropy": 2.511639356613159, "epoch": 0.7133845707656613, "grad_norm": 0.02719772793352604, "grad_norm_var": 6.831143788307472e-07, "learning_rate": 0.0019661909982755664, "loss": 2.5566, "step": 19678 }, { "crossentropy": 2.355421781539917, "epoch": 0.7134208236658933, "grad_norm": 0.026343077421188354, "grad_norm_var": 6.670331906060793e-07, "learning_rate": 0.0019657291215994725, "loss": 2.5061, "step": 19679 }, { "crossentropy": 2.514286994934082, "epoch": 0.7134570765661253, "grad_norm": 0.02621718868613243, "grad_norm_var": 6.474404634090504e-07, "learning_rate": 0.001965267285905812, "loss": 2.4678, "step": 19680 }, { "crossentropy": 2.51655912399292, "epoch": 0.7134933294663574, "grad_norm": 0.026593206450343132, "grad_norm_var": 6.255866462407181e-07, "learning_rate": 0.0019648054912008205, "loss": 2.4752, "step": 19681 }, { "crossentropy": 2.4119036197662354, "epoch": 0.7135295823665894, "grad_norm": 0.024986177682876587, "grad_norm_var": 4.0562853847133683e-07, "learning_rate": 0.0019643437374907385, "loss": 2.5192, "step": 19682 }, { "crossentropy": 2.3665966987609863, "epoch": 0.7135658352668214, "grad_norm": 0.02593984641134739, "grad_norm_var": 3.8876863531661586e-07, "learning_rate": 0.001963882024781799, "loss": 2.4619, "step": 19683 }, { "crossentropy": 2.464038848876953, "epoch": 0.7136020881670534, "grad_norm": 0.026918692514300346, "grad_norm_var": 3.765069891812726e-07, "learning_rate": 0.001963420353080241, "loss": 2.4686, "step": 19684 }, { "crossentropy": 2.6501851081848145, "epoch": 0.7136383410672854, "grad_norm": 0.02736547961831093, "grad_norm_var": 4.3346619536110734e-07, "learning_rate": 0.0019629587223923, "loss": 2.5425, "step": 19685 }, { "crossentropy": 2.5100879669189453, "epoch": 0.7136745939675174, "grad_norm": 0.026329616084694862, "grad_norm_var": 4.3247415329276485e-07, "learning_rate": 0.0019624971327242093, "loss": 2.5732, "step": 19686 }, { "crossentropy": 2.640598773956299, "epoch": 0.7137108468677494, "grad_norm": 0.026430170983076096, "grad_norm_var": 3.4598478463321316e-07, "learning_rate": 0.001962035584082206, "loss": 2.5686, "step": 19687 }, { "crossentropy": 2.4381513595581055, "epoch": 0.7137470997679815, "grad_norm": 0.026023248210549355, "grad_norm_var": 3.486085798535983e-07, "learning_rate": 0.0019615740764725225, "loss": 2.4256, "step": 19688 }, { "crossentropy": 2.3838303089141846, "epoch": 0.7137833526682135, "grad_norm": 0.028603680431842804, "grad_norm_var": 6.469773735832037e-07, "learning_rate": 0.0019611126099013903, "loss": 2.4779, "step": 19689 }, { "crossentropy": 2.5345003604888916, "epoch": 0.7138196055684455, "grad_norm": 0.02574744261801243, "grad_norm_var": 6.779113911228266e-07, "learning_rate": 0.001960651184375046, "loss": 2.5497, "step": 19690 }, { "crossentropy": 2.4609384536743164, "epoch": 0.7138558584686775, "grad_norm": 0.029615920037031174, "grad_norm_var": 1.2497928633401281e-06, "learning_rate": 0.0019601897998997174, "loss": 2.4693, "step": 19691 }, { "crossentropy": 2.5138566493988037, "epoch": 0.7138921113689095, "grad_norm": 0.02700684778392315, "grad_norm_var": 1.25399561839151e-06, "learning_rate": 0.0019597284564816385, "loss": 2.5646, "step": 19692 }, { "crossentropy": 2.5470223426818848, "epoch": 0.7139283642691415, "grad_norm": 0.02701561339199543, "grad_norm_var": 1.2293794603058001e-06, "learning_rate": 0.001959267154127043, "loss": 2.5918, "step": 19693 }, { "crossentropy": 2.44046688079834, "epoch": 0.7139646171693735, "grad_norm": 0.027530357241630554, "grad_norm_var": 1.2552259508200395e-06, "learning_rate": 0.001958805892842157, "loss": 2.5218, "step": 19694 }, { "crossentropy": 2.5831027030944824, "epoch": 0.7140008700696056, "grad_norm": 0.02563207410275936, "grad_norm_var": 1.3293471681821798e-06, "learning_rate": 0.001958344672633215, "loss": 2.5167, "step": 19695 }, { "crossentropy": 2.3311500549316406, "epoch": 0.7140371229698376, "grad_norm": 0.027645304799079895, "grad_norm_var": 1.3558902244238986e-06, "learning_rate": 0.0019578834935064433, "loss": 2.4249, "step": 19696 }, { "crossentropy": 2.3488073348999023, "epoch": 0.7140733758700696, "grad_norm": 0.02758997119963169, "grad_norm_var": 1.3856549527063346e-06, "learning_rate": 0.00195742235546807, "loss": 2.47, "step": 19697 }, { "crossentropy": 2.4565446376800537, "epoch": 0.7141096287703016, "grad_norm": 0.02621624246239662, "grad_norm_var": 1.1665382412817527e-06, "learning_rate": 0.0019569612585243266, "loss": 2.4552, "step": 19698 }, { "crossentropy": 2.618121862411499, "epoch": 0.7141458816705336, "grad_norm": 0.026413945481181145, "grad_norm_var": 1.115109466471202e-06, "learning_rate": 0.0019565002026814384, "loss": 2.5893, "step": 19699 }, { "crossentropy": 2.5749359130859375, "epoch": 0.7141821345707656, "grad_norm": 0.027429666370153427, "grad_norm_var": 1.1255281262528838e-06, "learning_rate": 0.001956039187945633, "loss": 2.5992, "step": 19700 }, { "crossentropy": 2.520705223083496, "epoch": 0.7142183874709976, "grad_norm": 0.026296017691493034, "grad_norm_var": 1.1502048138796086e-06, "learning_rate": 0.0019555782143231394, "loss": 2.5092, "step": 19701 }, { "crossentropy": 2.5067663192749023, "epoch": 0.7142546403712297, "grad_norm": 0.026729008182883263, "grad_norm_var": 1.1260521731741203e-06, "learning_rate": 0.0019551172818201806, "loss": 2.4814, "step": 19702 }, { "crossentropy": 2.425791025161743, "epoch": 0.7142908932714617, "grad_norm": 0.026927456259727478, "grad_norm_var": 1.1040343052718387e-06, "learning_rate": 0.001954656390442985, "loss": 2.441, "step": 19703 }, { "crossentropy": 2.557875871658325, "epoch": 0.7143271461716937, "grad_norm": 0.026017192751169205, "grad_norm_var": 1.104846556376713e-06, "learning_rate": 0.0019541955401977773, "loss": 2.516, "step": 19704 }, { "crossentropy": 2.567349672317505, "epoch": 0.7143633990719258, "grad_norm": 0.02602674439549446, "grad_norm_var": 9.778223820774053e-07, "learning_rate": 0.001953734731090779, "loss": 2.4819, "step": 19705 }, { "crossentropy": 2.476943016052246, "epoch": 0.7143996519721578, "grad_norm": 0.025967448949813843, "grad_norm_var": 9.480652874472301e-07, "learning_rate": 0.001953273963128217, "loss": 2.471, "step": 19706 }, { "crossentropy": 2.5349924564361572, "epoch": 0.7144359048723898, "grad_norm": 0.02724316157400608, "grad_norm_var": 4.3398302323096003e-07, "learning_rate": 0.0019528132363163125, "loss": 2.5332, "step": 19707 }, { "crossentropy": 2.595477819442749, "epoch": 0.7144721577726219, "grad_norm": 0.026957346126437187, "grad_norm_var": 4.323118268654072e-07, "learning_rate": 0.0019523525506612894, "loss": 2.601, "step": 19708 }, { "crossentropy": 2.4947264194488525, "epoch": 0.7145084106728539, "grad_norm": 0.026547694578766823, "grad_norm_var": 4.280113714685262e-07, "learning_rate": 0.0019518919061693718, "loss": 2.4982, "step": 19709 }, { "crossentropy": 2.5950918197631836, "epoch": 0.7145446635730859, "grad_norm": 0.026066286489367485, "grad_norm_var": 3.9951624711992373e-07, "learning_rate": 0.0019514313028467785, "loss": 2.5593, "step": 19710 }, { "crossentropy": 2.503509521484375, "epoch": 0.7145809164733179, "grad_norm": 0.02647792547941208, "grad_norm_var": 3.3432584399384113e-07, "learning_rate": 0.0019509707406997329, "loss": 2.4731, "step": 19711 }, { "crossentropy": 2.4085757732391357, "epoch": 0.7146171693735499, "grad_norm": 0.028114669024944305, "grad_norm_var": 4.0979059850032353e-07, "learning_rate": 0.0019505102197344526, "loss": 2.4396, "step": 19712 }, { "crossentropy": 2.5320544242858887, "epoch": 0.7146534222737819, "grad_norm": 0.02618107572197914, "grad_norm_var": 3.6456451247764086e-07, "learning_rate": 0.0019500497399571614, "loss": 2.4633, "step": 19713 }, { "crossentropy": 2.3750507831573486, "epoch": 0.7146896751740139, "grad_norm": 0.025849301367998123, "grad_norm_var": 3.9179172621934114e-07, "learning_rate": 0.0019495893013740772, "loss": 2.3867, "step": 19714 }, { "crossentropy": 2.5250895023345947, "epoch": 0.714725928074246, "grad_norm": 0.026746457442641258, "grad_norm_var": 3.914371215120536e-07, "learning_rate": 0.0019491289039914174, "loss": 2.4671, "step": 19715 }, { "crossentropy": 2.6085240840911865, "epoch": 0.714762180974478, "grad_norm": 0.027016587555408478, "grad_norm_var": 3.563284569918819e-07, "learning_rate": 0.0019486685478154009, "loss": 2.5513, "step": 19716 }, { "crossentropy": 2.4452285766601562, "epoch": 0.71479843387471, "grad_norm": 0.027237486094236374, "grad_norm_var": 3.769852347748626e-07, "learning_rate": 0.0019482082328522482, "loss": 2.443, "step": 19717 }, { "crossentropy": 2.3683788776397705, "epoch": 0.714834686774942, "grad_norm": 0.026488499715924263, "grad_norm_var": 3.77477324258318e-07, "learning_rate": 0.0019477479591081725, "loss": 2.3927, "step": 19718 }, { "crossentropy": 2.4778637886047363, "epoch": 0.714870939675174, "grad_norm": 0.025541134178638458, "grad_norm_var": 4.401327165619191e-07, "learning_rate": 0.0019472877265893946, "loss": 2.4953, "step": 19719 }, { "crossentropy": 2.3554866313934326, "epoch": 0.714907192575406, "grad_norm": 0.02655922621488571, "grad_norm_var": 4.2143854046850874e-07, "learning_rate": 0.001946827535302127, "loss": 2.4355, "step": 19720 }, { "crossentropy": 2.5612435340881348, "epoch": 0.714943445475638, "grad_norm": 0.026843957602977753, "grad_norm_var": 4.046581873394401e-07, "learning_rate": 0.0019463673852525881, "loss": 2.5471, "step": 19721 }, { "crossentropy": 2.472224712371826, "epoch": 0.71497969837587, "grad_norm": 0.028309954330325127, "grad_norm_var": 5.453981639622624e-07, "learning_rate": 0.001945907276446992, "loss": 2.5022, "step": 19722 }, { "crossentropy": 2.428983211517334, "epoch": 0.7150159512761021, "grad_norm": 0.026307884603738785, "grad_norm_var": 5.399794465907673e-07, "learning_rate": 0.0019454472088915515, "loss": 2.4283, "step": 19723 }, { "crossentropy": 2.5015623569488525, "epoch": 0.7150522041763341, "grad_norm": 0.026127176359295845, "grad_norm_var": 5.548825280525527e-07, "learning_rate": 0.0019449871825924815, "loss": 2.5062, "step": 19724 }, { "crossentropy": 2.5136351585388184, "epoch": 0.7150884570765661, "grad_norm": 0.027741221711039543, "grad_norm_var": 6.274813023060891e-07, "learning_rate": 0.0019445271975559975, "loss": 2.5238, "step": 19725 }, { "crossentropy": 2.4301469326019287, "epoch": 0.7151247099767981, "grad_norm": 0.025841213762760162, "grad_norm_var": 6.504317910168879e-07, "learning_rate": 0.0019440672537883092, "loss": 2.4289, "step": 19726 }, { "crossentropy": 2.472369432449341, "epoch": 0.7151609628770301, "grad_norm": 0.028097501024603844, "grad_norm_var": 7.639350610886067e-07, "learning_rate": 0.0019436073512956314, "loss": 2.5158, "step": 19727 }, { "crossentropy": 2.4920032024383545, "epoch": 0.7151972157772621, "grad_norm": 0.026301249861717224, "grad_norm_var": 6.546657672116054e-07, "learning_rate": 0.0019431474900841734, "loss": 2.5101, "step": 19728 }, { "crossentropy": 2.611168622970581, "epoch": 0.7152334686774942, "grad_norm": 0.026579027995467186, "grad_norm_var": 6.370627664250046e-07, "learning_rate": 0.001942687670160147, "loss": 2.5198, "step": 19729 }, { "crossentropy": 2.5414950847625732, "epoch": 0.7152697215777262, "grad_norm": 0.02613511122763157, "grad_norm_var": 6.088259813079196e-07, "learning_rate": 0.001942227891529767, "loss": 2.5354, "step": 19730 }, { "crossentropy": 2.51704478263855, "epoch": 0.7153059744779582, "grad_norm": 0.02769184298813343, "grad_norm_var": 6.652341509029899e-07, "learning_rate": 0.001941768154199236, "loss": 2.5156, "step": 19731 }, { "crossentropy": 2.534360885620117, "epoch": 0.7153422273781903, "grad_norm": 0.027274971827864647, "grad_norm_var": 6.76827439154134e-07, "learning_rate": 0.0019413084581747669, "loss": 2.4874, "step": 19732 }, { "crossentropy": 2.7129204273223877, "epoch": 0.7153784802784223, "grad_norm": 0.026998352259397507, "grad_norm_var": 6.67005387052855e-07, "learning_rate": 0.001940848803462571, "loss": 2.5188, "step": 19733 }, { "crossentropy": 2.3613696098327637, "epoch": 0.7154147331786543, "grad_norm": 0.02725733071565628, "grad_norm_var": 6.717715132201555e-07, "learning_rate": 0.0019403891900688524, "loss": 2.4656, "step": 19734 }, { "crossentropy": 2.5231175422668457, "epoch": 0.7154509860788864, "grad_norm": 0.02698833867907524, "grad_norm_var": 5.500257234269483e-07, "learning_rate": 0.0019399296179998227, "loss": 2.5416, "step": 19735 }, { "crossentropy": 2.525604009628296, "epoch": 0.7154872389791184, "grad_norm": 0.026441963389515877, "grad_norm_var": 5.568525823039835e-07, "learning_rate": 0.0019394700872616855, "loss": 2.4397, "step": 19736 }, { "crossentropy": 2.4716544151306152, "epoch": 0.7155234918793504, "grad_norm": 0.026360755786299706, "grad_norm_var": 5.772186945487978e-07, "learning_rate": 0.0019390105978606498, "loss": 2.4183, "step": 19737 }, { "crossentropy": 2.533419370651245, "epoch": 0.7155597447795824, "grad_norm": 0.02581232599914074, "grad_norm_var": 4.986857611286033e-07, "learning_rate": 0.001938551149802924, "loss": 2.5984, "step": 19738 }, { "crossentropy": 2.4058103561401367, "epoch": 0.7155959976798144, "grad_norm": 0.026790771633386612, "grad_norm_var": 4.849699111394989e-07, "learning_rate": 0.0019380917430947076, "loss": 2.4941, "step": 19739 }, { "crossentropy": 2.4878921508789062, "epoch": 0.7156322505800464, "grad_norm": 0.025749294087290764, "grad_norm_var": 5.26658042451257e-07, "learning_rate": 0.001937632377742209, "loss": 2.4585, "step": 19740 }, { "crossentropy": 2.3839282989501953, "epoch": 0.7156685034802784, "grad_norm": 0.027510439977049828, "grad_norm_var": 4.996038624438883e-07, "learning_rate": 0.0019371730537516342, "loss": 2.4134, "step": 19741 }, { "crossentropy": 2.565319299697876, "epoch": 0.7157047563805105, "grad_norm": 0.026318104937672615, "grad_norm_var": 4.5670596249569076e-07, "learning_rate": 0.0019367137711291831, "loss": 2.5492, "step": 19742 }, { "crossentropy": 2.529292106628418, "epoch": 0.7157410092807425, "grad_norm": 0.025516165420413017, "grad_norm_var": 4.1599363269765636e-07, "learning_rate": 0.0019362545298810636, "loss": 2.4879, "step": 19743 }, { "crossentropy": 2.5902652740478516, "epoch": 0.7157772621809745, "grad_norm": 0.027231113985180855, "grad_norm_var": 4.3201776360908086e-07, "learning_rate": 0.001935795330013474, "loss": 2.5692, "step": 19744 }, { "crossentropy": 2.467947244644165, "epoch": 0.7158135150812065, "grad_norm": 0.026692131534218788, "grad_norm_var": 4.315057943700291e-07, "learning_rate": 0.0019353361715326184, "loss": 2.472, "step": 19745 }, { "crossentropy": 2.471817970275879, "epoch": 0.7158497679814385, "grad_norm": 0.02686748467385769, "grad_norm_var": 4.124980549392865e-07, "learning_rate": 0.0019348770544447024, "loss": 2.5505, "step": 19746 }, { "crossentropy": 2.543229103088379, "epoch": 0.7158860208816705, "grad_norm": 0.026929346844553947, "grad_norm_var": 3.499137767601704e-07, "learning_rate": 0.0019344179787559195, "loss": 2.5254, "step": 19747 }, { "crossentropy": 2.5934975147247314, "epoch": 0.7159222737819025, "grad_norm": 0.02612636424601078, "grad_norm_var": 3.3990078412218795e-07, "learning_rate": 0.0019339589444724741, "loss": 2.5441, "step": 19748 }, { "crossentropy": 2.662670850753784, "epoch": 0.7159585266821346, "grad_norm": 0.028531260788440704, "grad_norm_var": 5.683062748275287e-07, "learning_rate": 0.001933499951600568, "loss": 2.6716, "step": 19749 }, { "crossentropy": 2.415520429611206, "epoch": 0.7159947795823666, "grad_norm": 0.02698756754398346, "grad_norm_var": 5.52635563661324e-07, "learning_rate": 0.0019330410001463967, "loss": 2.5782, "step": 19750 }, { "crossentropy": 2.4490602016448975, "epoch": 0.7160310324825986, "grad_norm": 0.026415565982460976, "grad_norm_var": 5.494652948776485e-07, "learning_rate": 0.0019325820901161628, "loss": 2.3731, "step": 19751 }, { "crossentropy": 2.581495523452759, "epoch": 0.7160672853828306, "grad_norm": 0.026810988783836365, "grad_norm_var": 5.481074286067348e-07, "learning_rate": 0.0019321232215160611, "loss": 2.5718, "step": 19752 }, { "crossentropy": 2.497978925704956, "epoch": 0.7161035382830626, "grad_norm": 0.02657792717218399, "grad_norm_var": 5.422278675967871e-07, "learning_rate": 0.0019316643943522904, "loss": 2.4692, "step": 19753 }, { "crossentropy": 2.6154439449310303, "epoch": 0.7161397911832946, "grad_norm": 0.027152273803949356, "grad_norm_var": 4.995725376807924e-07, "learning_rate": 0.0019312056086310508, "loss": 2.5673, "step": 19754 }, { "crossentropy": 2.51665997505188, "epoch": 0.7161760440835266, "grad_norm": 0.02521182782948017, "grad_norm_var": 6.495265913864737e-07, "learning_rate": 0.0019307468643585362, "loss": 2.4336, "step": 19755 }, { "crossentropy": 2.6325318813323975, "epoch": 0.7162122969837587, "grad_norm": 0.027059538289904594, "grad_norm_var": 5.969822972959656e-07, "learning_rate": 0.0019302881615409413, "loss": 2.6256, "step": 19756 }, { "crossentropy": 2.4694952964782715, "epoch": 0.7162485498839907, "grad_norm": 0.0249599851667881, "grad_norm_var": 7.436222536085756e-07, "learning_rate": 0.0019298295001844651, "loss": 2.4829, "step": 19757 }, { "crossentropy": 2.52950119972229, "epoch": 0.7162848027842227, "grad_norm": 0.026583906263113022, "grad_norm_var": 7.385178535535546e-07, "learning_rate": 0.0019293708802952986, "loss": 2.576, "step": 19758 }, { "crossentropy": 2.498081922531128, "epoch": 0.7163210556844548, "grad_norm": 0.025132274255156517, "grad_norm_var": 8.033762190287474e-07, "learning_rate": 0.00192891230187964, "loss": 2.479, "step": 19759 }, { "crossentropy": 2.4897608757019043, "epoch": 0.7163573085846868, "grad_norm": 0.0268463883548975, "grad_norm_var": 7.791935701880465e-07, "learning_rate": 0.0019284537649436801, "loss": 2.4719, "step": 19760 }, { "crossentropy": 2.548102855682373, "epoch": 0.7163935614849188, "grad_norm": 0.026685349643230438, "grad_norm_var": 7.790727163914726e-07, "learning_rate": 0.0019279952694936126, "loss": 2.5544, "step": 19761 }, { "crossentropy": 2.467947244644165, "epoch": 0.7164298143851509, "grad_norm": 0.025903495028614998, "grad_norm_var": 7.969725356216095e-07, "learning_rate": 0.0019275368155356333, "loss": 2.5019, "step": 19762 }, { "crossentropy": 2.598431348800659, "epoch": 0.7164660672853829, "grad_norm": 0.025619560852646828, "grad_norm_var": 8.28275388531768e-07, "learning_rate": 0.0019270784030759314, "loss": 2.5065, "step": 19763 }, { "crossentropy": 2.5070204734802246, "epoch": 0.7165023201856149, "grad_norm": 0.026125861331820488, "grad_norm_var": 8.282946091490532e-07, "learning_rate": 0.0019266200321206978, "loss": 2.5176, "step": 19764 }, { "crossentropy": 2.473750591278076, "epoch": 0.7165385730858469, "grad_norm": 0.02623535506427288, "grad_norm_var": 5.09219033466623e-07, "learning_rate": 0.0019261617026761259, "loss": 2.4552, "step": 19765 }, { "crossentropy": 2.7016513347625732, "epoch": 0.7165748259860789, "grad_norm": 0.02694530226290226, "grad_norm_var": 5.05282647450996e-07, "learning_rate": 0.0019257034147484031, "loss": 2.6782, "step": 19766 }, { "crossentropy": 2.5003137588500977, "epoch": 0.7166110788863109, "grad_norm": 0.027701662853360176, "grad_norm_var": 6.342050936425832e-07, "learning_rate": 0.0019252451683437234, "loss": 2.4665, "step": 19767 }, { "crossentropy": 2.3574929237365723, "epoch": 0.7166473317865429, "grad_norm": 0.02630651369690895, "grad_norm_var": 6.189003266339361e-07, "learning_rate": 0.001924786963468272, "loss": 2.3732, "step": 19768 }, { "crossentropy": 2.5547101497650146, "epoch": 0.716683584686775, "grad_norm": 0.02634243853390217, "grad_norm_var": 6.141249151416963e-07, "learning_rate": 0.001924328800128239, "loss": 2.5505, "step": 19769 }, { "crossentropy": 2.4583756923675537, "epoch": 0.716719837587007, "grad_norm": 0.02541537955403328, "grad_norm_var": 6.054702496072293e-07, "learning_rate": 0.001923870678329815, "loss": 2.405, "step": 19770 }, { "crossentropy": 2.431666374206543, "epoch": 0.716756090487239, "grad_norm": 0.02710159309208393, "grad_norm_var": 5.816536287141788e-07, "learning_rate": 0.0019234125980791838, "loss": 2.4336, "step": 19771 }, { "crossentropy": 2.5505783557891846, "epoch": 0.716792343387471, "grad_norm": 0.027647271752357483, "grad_norm_var": 6.619576566597081e-07, "learning_rate": 0.0019229545593825365, "loss": 2.5492, "step": 19772 }, { "crossentropy": 2.3902103900909424, "epoch": 0.716828596287703, "grad_norm": 0.025690404698252678, "grad_norm_var": 5.602197810656983e-07, "learning_rate": 0.001922496562246057, "loss": 2.4001, "step": 19773 }, { "crossentropy": 2.4210541248321533, "epoch": 0.716864849187935, "grad_norm": 0.026585722342133522, "grad_norm_var": 5.602662933248033e-07, "learning_rate": 0.0019220386066759305, "loss": 2.3923, "step": 19774 }, { "crossentropy": 2.413804292678833, "epoch": 0.716901102088167, "grad_norm": 0.02620924822986126, "grad_norm_var": 4.5175326376078094e-07, "learning_rate": 0.0019215806926783452, "loss": 2.496, "step": 19775 }, { "crossentropy": 2.410032033920288, "epoch": 0.7169373549883991, "grad_norm": 0.02701389603316784, "grad_norm_var": 4.6213451579211333e-07, "learning_rate": 0.0019211228202594822, "loss": 2.5156, "step": 19776 }, { "crossentropy": 2.616379499435425, "epoch": 0.7169736078886311, "grad_norm": 0.0274636410176754, "grad_norm_var": 5.22281681532751e-07, "learning_rate": 0.001920664989425528, "loss": 2.5014, "step": 19777 }, { "crossentropy": 2.5207338333129883, "epoch": 0.7170098607888631, "grad_norm": 0.026381416246294975, "grad_norm_var": 4.973221792154217e-07, "learning_rate": 0.0019202072001826681, "loss": 2.5354, "step": 19778 }, { "crossentropy": 2.499157667160034, "epoch": 0.7170461136890951, "grad_norm": 0.027361346408724785, "grad_norm_var": 4.7106618080531664e-07, "learning_rate": 0.0019197494525370817, "loss": 2.5089, "step": 19779 }, { "crossentropy": 2.5778985023498535, "epoch": 0.7170823665893271, "grad_norm": 0.02718784660100937, "grad_norm_var": 4.662130648590366e-07, "learning_rate": 0.0019192917464949555, "loss": 2.4547, "step": 19780 }, { "crossentropy": 2.534766435623169, "epoch": 0.7171186194895591, "grad_norm": 0.026726771146059036, "grad_norm_var": 4.4926847836921725e-07, "learning_rate": 0.0019188340820624694, "loss": 2.5146, "step": 19781 }, { "crossentropy": 2.6415493488311768, "epoch": 0.7171548723897911, "grad_norm": 0.02568867616355419, "grad_norm_var": 5.160823875670235e-07, "learning_rate": 0.0019183764592458025, "loss": 2.5124, "step": 19782 }, { "crossentropy": 2.5337655544281006, "epoch": 0.7171911252900232, "grad_norm": 0.028050264343619347, "grad_norm_var": 5.713278467195844e-07, "learning_rate": 0.0019179188780511403, "loss": 2.5086, "step": 19783 }, { "crossentropy": 2.6881954669952393, "epoch": 0.7172273781902552, "grad_norm": 0.026623783633112907, "grad_norm_var": 5.610464865642006e-07, "learning_rate": 0.001917461338484659, "loss": 2.599, "step": 19784 }, { "crossentropy": 2.5302443504333496, "epoch": 0.7172636310904872, "grad_norm": 0.02702007070183754, "grad_norm_var": 5.558036352215056e-07, "learning_rate": 0.0019170038405525404, "loss": 2.5382, "step": 19785 }, { "crossentropy": 2.5262787342071533, "epoch": 0.7172998839907193, "grad_norm": 0.0271974615752697, "grad_norm_var": 4.346867482462579e-07, "learning_rate": 0.0019165463842609654, "loss": 2.4904, "step": 19786 }, { "crossentropy": 2.6789233684539795, "epoch": 0.7173361368909513, "grad_norm": 0.02689996547996998, "grad_norm_var": 4.31050951430328e-07, "learning_rate": 0.0019160889696161088, "loss": 2.6527, "step": 19787 }, { "crossentropy": 2.5507938861846924, "epoch": 0.7173723897911833, "grad_norm": 0.027226723730564117, "grad_norm_var": 3.9791719412997593e-07, "learning_rate": 0.0019156315966241523, "loss": 2.5929, "step": 19788 }, { "crossentropy": 2.5070083141326904, "epoch": 0.7174086426914154, "grad_norm": 0.02711748518049717, "grad_norm_var": 3.078011102770432e-07, "learning_rate": 0.0019151742652912718, "loss": 2.5331, "step": 19789 }, { "crossentropy": 2.538933277130127, "epoch": 0.7174448955916474, "grad_norm": 0.026386650279164314, "grad_norm_var": 3.1920761073671773e-07, "learning_rate": 0.0019147169756236428, "loss": 2.446, "step": 19790 }, { "crossentropy": 2.537822961807251, "epoch": 0.7174811484918794, "grad_norm": 0.02583310939371586, "grad_norm_var": 3.631792323210402e-07, "learning_rate": 0.0019142597276274448, "loss": 2.5671, "step": 19791 }, { "crossentropy": 2.301863670349121, "epoch": 0.7175174013921114, "grad_norm": 0.025922192260622978, "grad_norm_var": 4.190794663668761e-07, "learning_rate": 0.00191380252130885, "loss": 2.4403, "step": 19792 }, { "crossentropy": 2.606307029724121, "epoch": 0.7175536542923434, "grad_norm": 0.02546135149896145, "grad_norm_var": 4.972741993180951e-07, "learning_rate": 0.0019133453566740355, "loss": 2.4798, "step": 19793 }, { "crossentropy": 2.361686944961548, "epoch": 0.7175899071925754, "grad_norm": 0.025976238772273064, "grad_norm_var": 5.243579018314314e-07, "learning_rate": 0.0019128882337291781, "loss": 2.4833, "step": 19794 }, { "crossentropy": 2.4199297428131104, "epoch": 0.7176261600928074, "grad_norm": 0.02687118947505951, "grad_norm_var": 4.940276918307303e-07, "learning_rate": 0.0019124311524804482, "loss": 2.4841, "step": 19795 }, { "crossentropy": 2.52054762840271, "epoch": 0.7176624129930395, "grad_norm": 0.027883248403668404, "grad_norm_var": 5.75339163840976e-07, "learning_rate": 0.001911974112934023, "loss": 2.5282, "step": 19796 }, { "crossentropy": 2.4481265544891357, "epoch": 0.7176986658932715, "grad_norm": 0.02813747711479664, "grad_norm_var": 7.084563305362797e-07, "learning_rate": 0.0019115171150960737, "loss": 2.5307, "step": 19797 }, { "crossentropy": 2.58323335647583, "epoch": 0.7177349187935035, "grad_norm": 0.02816181443631649, "grad_norm_var": 7.346606248894346e-07, "learning_rate": 0.0019110601589727705, "loss": 2.5617, "step": 19798 }, { "crossentropy": 2.478865385055542, "epoch": 0.7177711716937355, "grad_norm": 0.027064461261034012, "grad_norm_var": 6.472389495098489e-07, "learning_rate": 0.0019106032445702899, "loss": 2.4706, "step": 19799 }, { "crossentropy": 2.614414691925049, "epoch": 0.7178074245939675, "grad_norm": 0.027734162285923958, "grad_norm_var": 6.89110910533303e-07, "learning_rate": 0.001910146371894798, "loss": 2.6291, "step": 19800 }, { "crossentropy": 2.5103495121002197, "epoch": 0.7178436774941995, "grad_norm": 0.025883441790938377, "grad_norm_var": 7.563348142335922e-07, "learning_rate": 0.0019096895409524695, "loss": 2.4432, "step": 19801 }, { "crossentropy": 2.6252360343933105, "epoch": 0.7178799303944315, "grad_norm": 0.026894431561231613, "grad_norm_var": 7.484315713660386e-07, "learning_rate": 0.0019092327517494746, "loss": 2.5723, "step": 19802 }, { "crossentropy": 2.512554883956909, "epoch": 0.7179161832946636, "grad_norm": 0.02789170667529106, "grad_norm_var": 8.177176172361692e-07, "learning_rate": 0.0019087760042919804, "loss": 2.522, "step": 19803 }, { "crossentropy": 2.411336660385132, "epoch": 0.7179524361948956, "grad_norm": 0.026043690741062164, "grad_norm_var": 8.541042797414415e-07, "learning_rate": 0.0019083192985861585, "loss": 2.4198, "step": 19804 }, { "crossentropy": 2.720942497253418, "epoch": 0.7179886890951276, "grad_norm": 0.030548375099897385, "grad_norm_var": 1.7217988170582325e-06, "learning_rate": 0.0019078626346381773, "loss": 2.6293, "step": 19805 }, { "crossentropy": 2.5215413570404053, "epoch": 0.7180249419953596, "grad_norm": 0.025688625872135162, "grad_norm_var": 1.8133698448016908e-06, "learning_rate": 0.0019074060124542008, "loss": 2.4749, "step": 19806 }, { "crossentropy": 2.4522881507873535, "epoch": 0.7180611948955916, "grad_norm": 0.028991982340812683, "grad_norm_var": 1.945668202135188e-06, "learning_rate": 0.0019069494320404017, "loss": 2.4929, "step": 19807 }, { "crossentropy": 2.470254898071289, "epoch": 0.7180974477958236, "grad_norm": 0.02730843983590603, "grad_norm_var": 1.8301191920005432e-06, "learning_rate": 0.0019064928934029424, "loss": 2.5388, "step": 19808 }, { "crossentropy": 2.5547494888305664, "epoch": 0.7181337006960556, "grad_norm": 0.02692180871963501, "grad_norm_var": 1.6085485532619926e-06, "learning_rate": 0.0019060363965479916, "loss": 2.5074, "step": 19809 }, { "crossentropy": 2.541379928588867, "epoch": 0.7181699535962877, "grad_norm": 0.02681879699230194, "grad_norm_var": 1.495771521992558e-06, "learning_rate": 0.001905579941481716, "loss": 2.5013, "step": 19810 }, { "crossentropy": 2.571594715118408, "epoch": 0.7182062064965197, "grad_norm": 0.02563093788921833, "grad_norm_var": 1.6839436077775572e-06, "learning_rate": 0.0019051235282102775, "loss": 2.5454, "step": 19811 }, { "crossentropy": 2.3565425872802734, "epoch": 0.7182424593967517, "grad_norm": 0.026985174044966698, "grad_norm_var": 1.6705247622087002e-06, "learning_rate": 0.0019046671567398438, "loss": 2.4742, "step": 19812 }, { "crossentropy": 2.5671892166137695, "epoch": 0.7182787122969838, "grad_norm": 0.02762731723487377, "grad_norm_var": 1.629422421749862e-06, "learning_rate": 0.0019042108270765762, "loss": 2.46, "step": 19813 }, { "crossentropy": 2.6140308380126953, "epoch": 0.7183149651972158, "grad_norm": 0.02704600803554058, "grad_norm_var": 1.5733967019529536e-06, "learning_rate": 0.001903754539226641, "loss": 2.6708, "step": 19814 }, { "crossentropy": 2.3802807331085205, "epoch": 0.7183512180974478, "grad_norm": 0.026089433580636978, "grad_norm_var": 1.6494544496160978e-06, "learning_rate": 0.001903298293196199, "loss": 2.4695, "step": 19815 }, { "crossentropy": 2.573486328125, "epoch": 0.7183874709976799, "grad_norm": 0.026859523728489876, "grad_norm_var": 1.6269873561212116e-06, "learning_rate": 0.0019028420889914118, "loss": 2.5367, "step": 19816 }, { "crossentropy": 2.552272319793701, "epoch": 0.7184237238979119, "grad_norm": 0.026582246646285057, "grad_norm_var": 1.5463127285539885e-06, "learning_rate": 0.0019023859266184423, "loss": 2.5907, "step": 19817 }, { "crossentropy": 2.3539555072784424, "epoch": 0.7184599767981439, "grad_norm": 0.028350984677672386, "grad_norm_var": 1.6349992661858377e-06, "learning_rate": 0.0019019298060834534, "loss": 2.4283, "step": 19818 }, { "crossentropy": 2.557908535003662, "epoch": 0.7184962296983759, "grad_norm": 0.025842105969786644, "grad_norm_var": 1.7116842328530747e-06, "learning_rate": 0.001901473727392602, "loss": 2.5339, "step": 19819 }, { "crossentropy": 2.5400002002716064, "epoch": 0.7185324825986079, "grad_norm": 0.02700592391192913, "grad_norm_var": 1.6361517292732992e-06, "learning_rate": 0.0019010176905520516, "loss": 2.5164, "step": 19820 }, { "crossentropy": 2.5002048015594482, "epoch": 0.7185687354988399, "grad_norm": 0.026574814692139626, "grad_norm_var": 8.19101153369051e-07, "learning_rate": 0.001900561695567959, "loss": 2.4595, "step": 19821 }, { "crossentropy": 2.3632900714874268, "epoch": 0.7186049883990719, "grad_norm": 0.026002254337072372, "grad_norm_var": 7.747909483149605e-07, "learning_rate": 0.0019001057424464852, "loss": 2.3953, "step": 19822 }, { "crossentropy": 2.5829601287841797, "epoch": 0.718641241299304, "grad_norm": 0.027276668697595596, "grad_norm_var": 4.83629143672642e-07, "learning_rate": 0.0018996498311937883, "loss": 2.5636, "step": 19823 }, { "crossentropy": 2.3434066772460938, "epoch": 0.718677494199536, "grad_norm": 0.025785958394408226, "grad_norm_var": 5.268423996341766e-07, "learning_rate": 0.0018991939618160237, "loss": 2.4259, "step": 19824 }, { "crossentropy": 2.5395092964172363, "epoch": 0.718713747099768, "grad_norm": 0.026056330651044846, "grad_norm_var": 5.495042472525389e-07, "learning_rate": 0.00189873813431935, "loss": 2.5336, "step": 19825 }, { "crossentropy": 2.5044167041778564, "epoch": 0.71875, "grad_norm": 0.026924610137939453, "grad_norm_var": 5.524669015681636e-07, "learning_rate": 0.001898282348709926, "loss": 2.525, "step": 19826 }, { "crossentropy": 2.603966474533081, "epoch": 0.718786252900232, "grad_norm": 0.026144953444600105, "grad_norm_var": 4.981090295254892e-07, "learning_rate": 0.0018978266049939035, "loss": 2.592, "step": 19827 }, { "crossentropy": 2.4817826747894287, "epoch": 0.718822505800464, "grad_norm": 0.026113832369446754, "grad_norm_var": 5.120983187011646e-07, "learning_rate": 0.0018973709031774428, "loss": 2.4872, "step": 19828 }, { "crossentropy": 2.4338924884796143, "epoch": 0.718858758700696, "grad_norm": 0.025498753413558006, "grad_norm_var": 5.158254235107836e-07, "learning_rate": 0.0018969152432666947, "loss": 2.5225, "step": 19829 }, { "crossentropy": 2.2067160606384277, "epoch": 0.7188950116009281, "grad_norm": 0.025970496237277985, "grad_norm_var": 5.112062133239351e-07, "learning_rate": 0.0018964596252678174, "loss": 2.3793, "step": 19830 }, { "crossentropy": 2.440643787384033, "epoch": 0.7189312645011601, "grad_norm": 0.026701943948864937, "grad_norm_var": 5.058256882807619e-07, "learning_rate": 0.0018960040491869624, "loss": 2.5042, "step": 19831 }, { "crossentropy": 2.5972936153411865, "epoch": 0.7189675174013921, "grad_norm": 0.02613978646695614, "grad_norm_var": 5.018494461816406e-07, "learning_rate": 0.001895548515030281, "loss": 2.5672, "step": 19832 }, { "crossentropy": 2.4378445148468018, "epoch": 0.7190037703016241, "grad_norm": 0.025921182706952095, "grad_norm_var": 5.16247954247687e-07, "learning_rate": 0.0018950930228039287, "loss": 2.4569, "step": 19833 }, { "crossentropy": 2.4458699226379395, "epoch": 0.7190400232018561, "grad_norm": 0.02491779439151287, "grad_norm_var": 3.572847312560545e-07, "learning_rate": 0.001894637572514058, "loss": 2.4931, "step": 19834 }, { "crossentropy": 2.4781479835510254, "epoch": 0.7190762761020881, "grad_norm": 0.02596130408346653, "grad_norm_var": 3.528051386000961e-07, "learning_rate": 0.0018941821641668183, "loss": 2.4588, "step": 19835 }, { "crossentropy": 2.599764347076416, "epoch": 0.7191125290023201, "grad_norm": 0.025960342958569527, "grad_norm_var": 3.0700592592290167e-07, "learning_rate": 0.0018937267977683632, "loss": 2.5531, "step": 19836 }, { "crossentropy": 2.501542568206787, "epoch": 0.7191487819025522, "grad_norm": 0.025345714762806892, "grad_norm_var": 3.272066065549643e-07, "learning_rate": 0.0018932714733248402, "loss": 2.5234, "step": 19837 }, { "crossentropy": 2.417029619216919, "epoch": 0.7191850348027842, "grad_norm": 0.025653032585978508, "grad_norm_var": 3.3682481597966525e-07, "learning_rate": 0.0018928161908424001, "loss": 2.4757, "step": 19838 }, { "crossentropy": 2.5477206707000732, "epoch": 0.7192212877030162, "grad_norm": 0.02684749849140644, "grad_norm_var": 2.766150350846323e-07, "learning_rate": 0.0018923609503271965, "loss": 2.5314, "step": 19839 }, { "crossentropy": 2.6809160709381104, "epoch": 0.7192575406032483, "grad_norm": 0.025945477187633514, "grad_norm_var": 2.737279977843912e-07, "learning_rate": 0.001891905751785371, "loss": 2.5668, "step": 19840 }, { "crossentropy": 2.5529017448425293, "epoch": 0.7192937935034803, "grad_norm": 0.025630654767155647, "grad_norm_var": 2.822214113239272e-07, "learning_rate": 0.0018914505952230749, "loss": 2.4913, "step": 19841 }, { "crossentropy": 2.393012762069702, "epoch": 0.7193300464037123, "grad_norm": 0.026767415925860405, "grad_norm_var": 2.639640533257104e-07, "learning_rate": 0.0018909954806464574, "loss": 2.4312, "step": 19842 }, { "crossentropy": 2.673943042755127, "epoch": 0.7193662993039444, "grad_norm": 0.02937825582921505, "grad_norm_var": 9.92773023892647e-07, "learning_rate": 0.0018905404080616624, "loss": 2.5354, "step": 19843 }, { "crossentropy": 2.6077373027801514, "epoch": 0.7194025522041764, "grad_norm": 0.026958098635077477, "grad_norm_var": 1.0307638011956833e-06, "learning_rate": 0.0018900853774748395, "loss": 2.5933, "step": 19844 }, { "crossentropy": 2.4760589599609375, "epoch": 0.7194388051044084, "grad_norm": 0.02727162279188633, "grad_norm_var": 1.0555665771289427e-06, "learning_rate": 0.0018896303888921312, "loss": 2.5645, "step": 19845 }, { "crossentropy": 2.4066531658172607, "epoch": 0.7194750580046404, "grad_norm": 0.02637948840856552, "grad_norm_var": 1.0461078128335202e-06, "learning_rate": 0.0018891754423196849, "loss": 2.4404, "step": 19846 }, { "crossentropy": 2.482769250869751, "epoch": 0.7195113109048724, "grad_norm": 0.026336748152971268, "grad_norm_var": 1.0378528029856507e-06, "learning_rate": 0.0018887205377636491, "loss": 2.5327, "step": 19847 }, { "crossentropy": 2.3856759071350098, "epoch": 0.7195475638051044, "grad_norm": 0.026018604636192322, "grad_norm_var": 1.041979749606912e-06, "learning_rate": 0.0018882656752301596, "loss": 2.4664, "step": 19848 }, { "crossentropy": 2.3988842964172363, "epoch": 0.7195838167053364, "grad_norm": 0.02716227062046528, "grad_norm_var": 1.0704611280847969e-06, "learning_rate": 0.0018878108547253647, "loss": 2.4123, "step": 19849 }, { "crossentropy": 2.4614856243133545, "epoch": 0.7196200696055685, "grad_norm": 0.027535103261470795, "grad_norm_var": 9.784235175114142e-07, "learning_rate": 0.0018873560762554093, "loss": 2.5026, "step": 19850 }, { "crossentropy": 2.4967668056488037, "epoch": 0.7196563225058005, "grad_norm": 0.026083292439579964, "grad_norm_var": 9.694209241389023e-07, "learning_rate": 0.0018869013398264312, "loss": 2.5103, "step": 19851 }, { "crossentropy": 2.4266457557678223, "epoch": 0.7196925754060325, "grad_norm": 0.027061644941568375, "grad_norm_var": 9.542929881515074e-07, "learning_rate": 0.0018864466454445773, "loss": 2.4487, "step": 19852 }, { "crossentropy": 2.4307312965393066, "epoch": 0.7197288283062645, "grad_norm": 0.02703932113945484, "grad_norm_var": 8.393897073465916e-07, "learning_rate": 0.001885991993115984, "loss": 2.4796, "step": 19853 }, { "crossentropy": 2.548931121826172, "epoch": 0.7197650812064965, "grad_norm": 0.025321681052446365, "grad_norm_var": 8.949052957454243e-07, "learning_rate": 0.0018855373828467947, "loss": 2.4846, "step": 19854 }, { "crossentropy": 2.4426236152648926, "epoch": 0.7198013341067285, "grad_norm": 0.026310894638299942, "grad_norm_var": 9.047507749793474e-07, "learning_rate": 0.0018850828146431515, "loss": 2.505, "step": 19855 }, { "crossentropy": 2.5794010162353516, "epoch": 0.7198375870069605, "grad_norm": 0.02716730907559395, "grad_norm_var": 8.751294945337958e-07, "learning_rate": 0.001884628288511192, "loss": 2.5391, "step": 19856 }, { "crossentropy": 2.627750873565674, "epoch": 0.7198738399071926, "grad_norm": 0.026415109634399414, "grad_norm_var": 7.937519865645285e-07, "learning_rate": 0.0018841738044570527, "loss": 2.5259, "step": 19857 }, { "crossentropy": 2.412360668182373, "epoch": 0.7199100928074246, "grad_norm": 0.025297017768025398, "grad_norm_var": 9.402550157704918e-07, "learning_rate": 0.001883719362486877, "loss": 2.4114, "step": 19858 }, { "crossentropy": 2.4391019344329834, "epoch": 0.7199463457076566, "grad_norm": 0.025842325761914253, "grad_norm_var": 4.74804195070783e-07, "learning_rate": 0.001883264962606799, "loss": 2.4458, "step": 19859 }, { "crossentropy": 2.4511775970458984, "epoch": 0.7199825986078886, "grad_norm": 0.026614492759108543, "grad_norm_var": 4.617700755965426e-07, "learning_rate": 0.0018828106048229582, "loss": 2.5061, "step": 19860 }, { "crossentropy": 2.449033498764038, "epoch": 0.7200188515081206, "grad_norm": 0.027679739519953728, "grad_norm_var": 5.146549058897594e-07, "learning_rate": 0.0018823562891414898, "loss": 2.4158, "step": 19861 }, { "crossentropy": 2.691361665725708, "epoch": 0.7200551044083526, "grad_norm": 0.027113161981105804, "grad_norm_var": 5.348879265292302e-07, "learning_rate": 0.0018819020155685306, "loss": 2.5442, "step": 19862 }, { "crossentropy": 2.3924973011016846, "epoch": 0.7200913573085846, "grad_norm": 0.025731397792696953, "grad_norm_var": 5.760057173467258e-07, "learning_rate": 0.0018814477841102184, "loss": 2.419, "step": 19863 }, { "crossentropy": 2.586030960083008, "epoch": 0.7201276102088167, "grad_norm": 0.02684861794114113, "grad_norm_var": 5.630672413076132e-07, "learning_rate": 0.0018809935947726865, "loss": 2.5498, "step": 19864 }, { "crossentropy": 2.447610855102539, "epoch": 0.7201638631090487, "grad_norm": 0.02673277258872986, "grad_norm_var": 5.410493312212369e-07, "learning_rate": 0.0018805394475620674, "loss": 2.4941, "step": 19865 }, { "crossentropy": 2.352993965148926, "epoch": 0.7202001160092807, "grad_norm": 0.02793811820447445, "grad_norm_var": 6.04156037178373e-07, "learning_rate": 0.0018800853424844987, "loss": 2.4483, "step": 19866 }, { "crossentropy": 2.489549160003662, "epoch": 0.7202363689095128, "grad_norm": 0.027094917371869087, "grad_norm_var": 6.018206064938835e-07, "learning_rate": 0.0018796312795461107, "loss": 2.5617, "step": 19867 }, { "crossentropy": 2.3963863849639893, "epoch": 0.7202726218097448, "grad_norm": 0.02541651763021946, "grad_norm_var": 6.78053858556279e-07, "learning_rate": 0.0018791772587530393, "loss": 2.4796, "step": 19868 }, { "crossentropy": 2.554068088531494, "epoch": 0.7203088747099768, "grad_norm": 0.02655172348022461, "grad_norm_var": 6.60139679319512e-07, "learning_rate": 0.001878723280111413, "loss": 2.5232, "step": 19869 }, { "crossentropy": 2.4660732746124268, "epoch": 0.7203451276102089, "grad_norm": 0.02924242615699768, "grad_norm_var": 1.0024432033720402e-06, "learning_rate": 0.0018782693436273656, "loss": 2.5018, "step": 19870 }, { "crossentropy": 2.453000783920288, "epoch": 0.7203813805104409, "grad_norm": 0.02689294144511223, "grad_norm_var": 9.8955631339098e-07, "learning_rate": 0.0018778154493070299, "loss": 2.4045, "step": 19871 }, { "crossentropy": 2.6614108085632324, "epoch": 0.7204176334106729, "grad_norm": 0.027242673560976982, "grad_norm_var": 9.937412968195747e-07, "learning_rate": 0.0018773615971565344, "loss": 2.6141, "step": 19872 }, { "crossentropy": 2.5072882175445557, "epoch": 0.7204538863109049, "grad_norm": 0.026938902214169502, "grad_norm_var": 9.84645834665084e-07, "learning_rate": 0.0018769077871820079, "loss": 2.4867, "step": 19873 }, { "crossentropy": 2.638787031173706, "epoch": 0.7204901392111369, "grad_norm": 0.027158213779330254, "grad_norm_var": 8.223108946894241e-07, "learning_rate": 0.0018764540193895829, "loss": 2.5528, "step": 19874 }, { "crossentropy": 2.580925226211548, "epoch": 0.7205263921113689, "grad_norm": 0.026393312960863113, "grad_norm_var": 7.606493386942548e-07, "learning_rate": 0.0018760002937853848, "loss": 2.4995, "step": 19875 }, { "crossentropy": 2.437725067138672, "epoch": 0.7205626450116009, "grad_norm": 0.02645842544734478, "grad_norm_var": 7.696603402386106e-07, "learning_rate": 0.0018755466103755436, "loss": 2.492, "step": 19876 }, { "crossentropy": 2.544750690460205, "epoch": 0.720598897911833, "grad_norm": 0.02649076096713543, "grad_norm_var": 7.446459049953071e-07, "learning_rate": 0.0018750929691661884, "loss": 2.508, "step": 19877 }, { "crossentropy": 2.489459276199341, "epoch": 0.720635150812065, "grad_norm": 0.027327753603458405, "grad_norm_var": 7.539004286207512e-07, "learning_rate": 0.0018746393701634435, "loss": 2.4831, "step": 19878 }, { "crossentropy": 2.379866361618042, "epoch": 0.720671403712297, "grad_norm": 0.02607206627726555, "grad_norm_var": 7.079042305124216e-07, "learning_rate": 0.001874185813373438, "loss": 2.5365, "step": 19879 }, { "crossentropy": 2.3796699047088623, "epoch": 0.720707656612529, "grad_norm": 0.026923993602395058, "grad_norm_var": 7.074915870128867e-07, "learning_rate": 0.0018737322988022953, "loss": 2.3952, "step": 19880 }, { "crossentropy": 2.4639875888824463, "epoch": 0.720743909512761, "grad_norm": 0.04156969487667084, "grad_norm_var": 1.4076270652767795e-05, "learning_rate": 0.001873278826456144, "loss": 2.5338, "step": 19881 }, { "crossentropy": 2.571361780166626, "epoch": 0.720780162412993, "grad_norm": 0.02672463469207287, "grad_norm_var": 1.4155184260691136e-05, "learning_rate": 0.001872825396341107, "loss": 2.5405, "step": 19882 }, { "crossentropy": 2.482630729675293, "epoch": 0.720816415313225, "grad_norm": 0.027735790237784386, "grad_norm_var": 1.412221277436094e-05, "learning_rate": 0.0018723720084633073, "loss": 2.5691, "step": 19883 }, { "crossentropy": 2.484407901763916, "epoch": 0.7208526682134571, "grad_norm": 0.026384972035884857, "grad_norm_var": 1.3870316647078955e-05, "learning_rate": 0.0018719186628288692, "loss": 2.4947, "step": 19884 }, { "crossentropy": 2.4386894702911377, "epoch": 0.7208889211136891, "grad_norm": 0.026587307453155518, "grad_norm_var": 1.3864085350805538e-05, "learning_rate": 0.001871465359443919, "loss": 2.4581, "step": 19885 }, { "crossentropy": 2.521836042404175, "epoch": 0.7209251740139211, "grad_norm": 0.02613571099936962, "grad_norm_var": 1.3904612824936153e-05, "learning_rate": 0.001871012098314575, "loss": 2.4173, "step": 19886 }, { "crossentropy": 2.3854761123657227, "epoch": 0.7209614269141531, "grad_norm": 0.027268392965197563, "grad_norm_var": 1.3873531056357984e-05, "learning_rate": 0.0018705588794469625, "loss": 2.4691, "step": 19887 }, { "crossentropy": 2.2741353511810303, "epoch": 0.7209976798143851, "grad_norm": 0.026893336325883865, "grad_norm_var": 1.3903078750662637e-05, "learning_rate": 0.0018701057028471997, "loss": 2.4593, "step": 19888 }, { "crossentropy": 2.5392568111419678, "epoch": 0.7210339327146171, "grad_norm": 0.025771789252758026, "grad_norm_var": 1.4105321727819753e-05, "learning_rate": 0.001869652568521411, "loss": 2.5629, "step": 19889 }, { "crossentropy": 2.407968282699585, "epoch": 0.7210701856148491, "grad_norm": 0.025660645216703415, "grad_norm_var": 1.433740116629684e-05, "learning_rate": 0.0018691994764757152, "loss": 2.5129, "step": 19890 }, { "crossentropy": 2.4898765087127686, "epoch": 0.7211064385150812, "grad_norm": 0.026318084448575974, "grad_norm_var": 1.4349105339470656e-05, "learning_rate": 0.0018687464267162295, "loss": 2.5206, "step": 19891 }, { "crossentropy": 2.547003746032715, "epoch": 0.7211426914153132, "grad_norm": 0.025769120082259178, "grad_norm_var": 1.447638753668012e-05, "learning_rate": 0.0018682934192490752, "loss": 2.5154, "step": 19892 }, { "crossentropy": 2.4446053504943848, "epoch": 0.7211789443155452, "grad_norm": 0.026334794238209724, "grad_norm_var": 1.449841995312764e-05, "learning_rate": 0.0018678404540803722, "loss": 2.4332, "step": 19893 }, { "crossentropy": 2.508178234100342, "epoch": 0.7212151972157773, "grad_norm": 0.02595892921090126, "grad_norm_var": 1.464100824100597e-05, "learning_rate": 0.0018673875312162358, "loss": 2.4717, "step": 19894 }, { "crossentropy": 2.3779468536376953, "epoch": 0.7212514501160093, "grad_norm": 0.02580299787223339, "grad_norm_var": 1.4692521868146329e-05, "learning_rate": 0.0018669346506627865, "loss": 2.4123, "step": 19895 }, { "crossentropy": 2.390944480895996, "epoch": 0.7212877030162413, "grad_norm": 0.027242399752140045, "grad_norm_var": 1.4680135206242039e-05, "learning_rate": 0.0018664818124261373, "loss": 2.4907, "step": 19896 }, { "crossentropy": 2.5068111419677734, "epoch": 0.7213239559164734, "grad_norm": 0.027065061032772064, "grad_norm_var": 3.964837453533278e-07, "learning_rate": 0.0018660290165124066, "loss": 2.476, "step": 19897 }, { "crossentropy": 2.3860549926757812, "epoch": 0.7213602088167054, "grad_norm": 0.025465458631515503, "grad_norm_var": 4.5423408409743976e-07, "learning_rate": 0.001865576262927714, "loss": 2.4447, "step": 19898 }, { "crossentropy": 2.4471819400787354, "epoch": 0.7213964617169374, "grad_norm": 0.026281384751200676, "grad_norm_var": 3.2733947305565077e-07, "learning_rate": 0.0018651235516781667, "loss": 2.4613, "step": 19899 }, { "crossentropy": 2.5794756412506104, "epoch": 0.7214327146171694, "grad_norm": 0.02740403264760971, "grad_norm_var": 4.025981428628274e-07, "learning_rate": 0.0018646708827698834, "loss": 2.6102, "step": 19900 }, { "crossentropy": 2.614649534225464, "epoch": 0.7214689675174014, "grad_norm": 0.026658127084374428, "grad_norm_var": 4.0494027884993507e-07, "learning_rate": 0.0018642182562089787, "loss": 2.5261, "step": 19901 }, { "crossentropy": 2.695263147354126, "epoch": 0.7215052204176334, "grad_norm": 0.027039937674999237, "grad_norm_var": 4.2696431357674747e-07, "learning_rate": 0.0018637656720015645, "loss": 2.5642, "step": 19902 }, { "crossentropy": 2.420395612716675, "epoch": 0.7215414733178654, "grad_norm": 0.03787071257829666, "grad_norm_var": 8.63291171947221e-06, "learning_rate": 0.0018633131301537553, "loss": 2.4609, "step": 19903 }, { "crossentropy": 2.5239405632019043, "epoch": 0.7215777262180975, "grad_norm": 0.028312215581536293, "grad_norm_var": 8.72038757454511e-06, "learning_rate": 0.0018628606306716612, "loss": 2.5313, "step": 19904 }, { "crossentropy": 2.5424108505249023, "epoch": 0.7216139791183295, "grad_norm": 0.02602626010775566, "grad_norm_var": 8.67649446805475e-06, "learning_rate": 0.0018624081735613945, "loss": 2.5685, "step": 19905 }, { "crossentropy": 2.490114212036133, "epoch": 0.7216502320185615, "grad_norm": 0.026752404868602753, "grad_norm_var": 8.526817494561705e-06, "learning_rate": 0.001861955758829071, "loss": 2.5614, "step": 19906 }, { "crossentropy": 2.477989435195923, "epoch": 0.7216864849187935, "grad_norm": 0.027581464499235153, "grad_norm_var": 8.466415094243208e-06, "learning_rate": 0.0018615033864807934, "loss": 2.5404, "step": 19907 }, { "crossentropy": 2.3529422283172607, "epoch": 0.7217227378190255, "grad_norm": 0.02717292308807373, "grad_norm_var": 8.294088252665472e-06, "learning_rate": 0.001861051056522675, "loss": 2.4038, "step": 19908 }, { "crossentropy": 2.448199510574341, "epoch": 0.7217589907192575, "grad_norm": 0.026658937335014343, "grad_norm_var": 8.253080578370965e-06, "learning_rate": 0.0018605987689608277, "loss": 2.5097, "step": 19909 }, { "crossentropy": 2.4369921684265137, "epoch": 0.7217952436194895, "grad_norm": 0.026342405006289482, "grad_norm_var": 8.185734843990988e-06, "learning_rate": 0.0018601465238013571, "loss": 2.3831, "step": 19910 }, { "crossentropy": 2.5385468006134033, "epoch": 0.7218314965197216, "grad_norm": 0.02662930078804493, "grad_norm_var": 8.04366937770215e-06, "learning_rate": 0.0018596943210503736, "loss": 2.5462, "step": 19911 }, { "crossentropy": 2.531602144241333, "epoch": 0.7218677494199536, "grad_norm": 0.026588698849081993, "grad_norm_var": 8.09556988554047e-06, "learning_rate": 0.001859242160713982, "loss": 2.5024, "step": 19912 }, { "crossentropy": 2.6962506771087646, "epoch": 0.7219040023201856, "grad_norm": 0.026987431570887566, "grad_norm_var": 8.100350934638053e-06, "learning_rate": 0.0018587900427982917, "loss": 2.6105, "step": 19913 }, { "crossentropy": 2.581223726272583, "epoch": 0.7219402552204176, "grad_norm": 0.031264640390872955, "grad_norm_var": 8.640134241555741e-06, "learning_rate": 0.0018583379673094104, "loss": 2.6109, "step": 19914 }, { "crossentropy": 2.378016710281372, "epoch": 0.7219765081206496, "grad_norm": 0.027665065601468086, "grad_norm_var": 8.470735794056809e-06, "learning_rate": 0.0018578859342534427, "loss": 2.5451, "step": 19915 }, { "crossentropy": 2.5258090496063232, "epoch": 0.7220127610208816, "grad_norm": 0.025359708815813065, "grad_norm_var": 8.876576052283544e-06, "learning_rate": 0.0018574339436364918, "loss": 2.4356, "step": 19916 }, { "crossentropy": 2.494325637817383, "epoch": 0.7220490139211136, "grad_norm": 0.026988474652171135, "grad_norm_var": 8.83279785927533e-06, "learning_rate": 0.0018569819954646662, "loss": 2.5776, "step": 19917 }, { "crossentropy": 2.7848048210144043, "epoch": 0.7220852668213457, "grad_norm": 0.028002090752124786, "grad_norm_var": 8.789617787976914e-06, "learning_rate": 0.0018565300897440667, "loss": 2.6811, "step": 19918 }, { "crossentropy": 2.543513059616089, "epoch": 0.7221215197215777, "grad_norm": 0.026528770104050636, "grad_norm_var": 1.7326518980064742e-06, "learning_rate": 0.0018560782264808006, "loss": 2.5418, "step": 19919 }, { "crossentropy": 2.4758317470550537, "epoch": 0.7221577726218097, "grad_norm": 0.026506785303354263, "grad_norm_var": 1.6635350589071316e-06, "learning_rate": 0.0018556264056809673, "loss": 2.4926, "step": 19920 }, { "crossentropy": 2.4935131072998047, "epoch": 0.7221940255220418, "grad_norm": 0.027592290192842484, "grad_norm_var": 1.5997196674099548e-06, "learning_rate": 0.0018551746273506714, "loss": 2.5263, "step": 19921 }, { "crossentropy": 2.5092825889587402, "epoch": 0.7222302784222738, "grad_norm": 0.026488807052373886, "grad_norm_var": 1.6185227532367382e-06, "learning_rate": 0.0018547228914960162, "loss": 2.469, "step": 19922 }, { "crossentropy": 2.584751605987549, "epoch": 0.7222665313225058, "grad_norm": 0.02588770166039467, "grad_norm_var": 1.6997892586029327e-06, "learning_rate": 0.0018542711981231015, "loss": 2.5014, "step": 19923 }, { "crossentropy": 2.452331304550171, "epoch": 0.7223027842227379, "grad_norm": 0.025988250970840454, "grad_norm_var": 1.7667459654351618e-06, "learning_rate": 0.001853819547238026, "loss": 2.4548, "step": 19924 }, { "crossentropy": 2.4546492099761963, "epoch": 0.7223390371229699, "grad_norm": 0.026880308985710144, "grad_norm_var": 1.7607023822956697e-06, "learning_rate": 0.0018533679388468944, "loss": 2.5394, "step": 19925 }, { "crossentropy": 2.512422800064087, "epoch": 0.7223752900232019, "grad_norm": 0.02640504762530327, "grad_norm_var": 1.755611400490233e-06, "learning_rate": 0.001852916372955802, "loss": 2.5971, "step": 19926 }, { "crossentropy": 2.570455551147461, "epoch": 0.7224115429234339, "grad_norm": 0.025886164978146553, "grad_norm_var": 1.8253923440100332e-06, "learning_rate": 0.0018524648495708512, "loss": 2.5826, "step": 19927 }, { "crossentropy": 2.5371291637420654, "epoch": 0.7224477958236659, "grad_norm": 0.025485502555966377, "grad_norm_var": 1.95294967390332e-06, "learning_rate": 0.0018520133686981378, "loss": 2.5787, "step": 19928 }, { "crossentropy": 2.3937363624572754, "epoch": 0.7224840487238979, "grad_norm": 0.027187177911400795, "grad_norm_var": 1.9585757983687595e-06, "learning_rate": 0.0018515619303437608, "loss": 2.4596, "step": 19929 }, { "crossentropy": 2.3823719024658203, "epoch": 0.72252030162413, "grad_norm": 0.026059165596961975, "grad_norm_var": 6.105140887310382e-07, "learning_rate": 0.0018511105345138197, "loss": 2.4027, "step": 19930 }, { "crossentropy": 2.451453447341919, "epoch": 0.722556554524362, "grad_norm": 0.02742096036672592, "grad_norm_var": 5.781722859224721e-07, "learning_rate": 0.0018506591812144097, "loss": 2.4732, "step": 19931 }, { "crossentropy": 2.5387001037597656, "epoch": 0.722592807424594, "grad_norm": 0.027036268264055252, "grad_norm_var": 4.89626606422798e-07, "learning_rate": 0.0018502078704516246, "loss": 2.5781, "step": 19932 }, { "crossentropy": 2.3531270027160645, "epoch": 0.722629060324826, "grad_norm": 0.026661543175578117, "grad_norm_var": 4.813992630560282e-07, "learning_rate": 0.0018497566022315642, "loss": 2.4527, "step": 19933 }, { "crossentropy": 2.5767884254455566, "epoch": 0.722665313225058, "grad_norm": 0.028351807966828346, "grad_norm_var": 5.532063946383133e-07, "learning_rate": 0.0018493053765603197, "loss": 2.5943, "step": 19934 }, { "crossentropy": 2.4182300567626953, "epoch": 0.72270156612529, "grad_norm": 0.028094204142689705, "grad_norm_var": 6.815005530173834e-07, "learning_rate": 0.0018488541934439896, "loss": 2.4828, "step": 19935 }, { "crossentropy": 2.448930501937866, "epoch": 0.722737819025522, "grad_norm": 0.026749933138489723, "grad_norm_var": 6.774484677363971e-07, "learning_rate": 0.001848403052888663, "loss": 2.5356, "step": 19936 }, { "crossentropy": 2.413715362548828, "epoch": 0.722774071925754, "grad_norm": 0.027192970737814903, "grad_norm_var": 6.431515433863444e-07, "learning_rate": 0.0018479519549004365, "loss": 2.4519, "step": 19937 }, { "crossentropy": 2.40852427482605, "epoch": 0.7228103248259861, "grad_norm": 0.026869799941778183, "grad_norm_var": 6.396671868201069e-07, "learning_rate": 0.0018475008994854043, "loss": 2.5045, "step": 19938 }, { "crossentropy": 2.3181798458099365, "epoch": 0.7228465777262181, "grad_norm": 0.02698598988354206, "grad_norm_var": 5.873482158069102e-07, "learning_rate": 0.0018470498866496544, "loss": 2.3693, "step": 19939 }, { "crossentropy": 2.592883825302124, "epoch": 0.7228828306264501, "grad_norm": 0.027151256799697876, "grad_norm_var": 5.415981154987485e-07, "learning_rate": 0.0018465989163992818, "loss": 2.6387, "step": 19940 }, { "crossentropy": 2.59854793548584, "epoch": 0.7229190835266821, "grad_norm": 0.02831072360277176, "grad_norm_var": 6.655072015331141e-07, "learning_rate": 0.0018461479887403771, "loss": 2.4945, "step": 19941 }, { "crossentropy": 2.454685688018799, "epoch": 0.7229553364269141, "grad_norm": 0.025947509333491325, "grad_norm_var": 7.143085860271403e-07, "learning_rate": 0.0018456971036790281, "loss": 2.4487, "step": 19942 }, { "crossentropy": 2.604666233062744, "epoch": 0.7229915893271461, "grad_norm": 0.02568366564810276, "grad_norm_var": 7.459171852311486e-07, "learning_rate": 0.0018452462612213278, "loss": 2.5368, "step": 19943 }, { "crossentropy": 2.5752854347229004, "epoch": 0.7230278422273781, "grad_norm": 0.02830999158322811, "grad_norm_var": 6.932694049175195e-07, "learning_rate": 0.0018447954613733631, "loss": 2.5577, "step": 19944 }, { "crossentropy": 2.421699285507202, "epoch": 0.7230640951276102, "grad_norm": 0.026210162788629532, "grad_norm_var": 7.449350631941968e-07, "learning_rate": 0.0018443447041412238, "loss": 2.4248, "step": 19945 }, { "crossentropy": 2.4108705520629883, "epoch": 0.7231003480278422, "grad_norm": 0.028434352949261665, "grad_norm_var": 7.790704982010844e-07, "learning_rate": 0.0018438939895309998, "loss": 2.4517, "step": 19946 }, { "crossentropy": 2.4835286140441895, "epoch": 0.7231366009280742, "grad_norm": 0.026931801810860634, "grad_norm_var": 7.804746426085292e-07, "learning_rate": 0.0018434433175487748, "loss": 2.5252, "step": 19947 }, { "crossentropy": 2.4603734016418457, "epoch": 0.7231728538283063, "grad_norm": 0.026621557772159576, "grad_norm_var": 7.99316385629078e-07, "learning_rate": 0.0018429926882006405, "loss": 2.4067, "step": 19948 }, { "crossentropy": 2.4149363040924072, "epoch": 0.7232091067285383, "grad_norm": 0.02878347598016262, "grad_norm_var": 9.406356174350283e-07, "learning_rate": 0.0018425421014926801, "loss": 2.5024, "step": 19949 }, { "crossentropy": 2.582051992416382, "epoch": 0.7232453596287703, "grad_norm": 0.028757775202393532, "grad_norm_var": 1.008447293956285e-06, "learning_rate": 0.0018420915574309793, "loss": 2.5155, "step": 19950 }, { "crossentropy": 2.4743897914886475, "epoch": 0.7232816125290024, "grad_norm": 0.027692602947354317, "grad_norm_var": 9.767874452332962e-07, "learning_rate": 0.0018416410560216257, "loss": 2.4781, "step": 19951 }, { "crossentropy": 2.645514726638794, "epoch": 0.7233178654292344, "grad_norm": 0.026332424953579903, "grad_norm_var": 1.0177239532927015e-06, "learning_rate": 0.0018411905972707005, "loss": 2.5962, "step": 19952 }, { "crossentropy": 2.530357837677002, "epoch": 0.7233541183294664, "grad_norm": 0.029026394709944725, "grad_norm_var": 1.2105718846979005e-06, "learning_rate": 0.0018407401811842904, "loss": 2.5378, "step": 19953 }, { "crossentropy": 2.408763885498047, "epoch": 0.7233903712296984, "grad_norm": 0.02687235176563263, "grad_norm_var": 1.2103993484834183e-06, "learning_rate": 0.0018402898077684805, "loss": 2.4662, "step": 19954 }, { "crossentropy": 2.487395763397217, "epoch": 0.7234266241299304, "grad_norm": 0.026275668293237686, "grad_norm_var": 1.279085146726512e-06, "learning_rate": 0.0018398394770293496, "loss": 2.5178, "step": 19955 }, { "crossentropy": 2.645162582397461, "epoch": 0.7234628770301624, "grad_norm": 0.027391016483306885, "grad_norm_var": 1.2768405770713711e-06, "learning_rate": 0.001839389188972984, "loss": 2.5984, "step": 19956 }, { "crossentropy": 2.540069818496704, "epoch": 0.7234991299303944, "grad_norm": 0.02650085836648941, "grad_norm_var": 1.249449585687991e-06, "learning_rate": 0.0018389389436054638, "loss": 2.4687, "step": 19957 }, { "crossentropy": 2.7002880573272705, "epoch": 0.7235353828306265, "grad_norm": 0.027385931462049484, "grad_norm_var": 1.1316992080436974e-06, "learning_rate": 0.0018384887409328688, "loss": 2.6366, "step": 19958 }, { "crossentropy": 2.6106674671173096, "epoch": 0.7235716357308585, "grad_norm": 0.027254480868577957, "grad_norm_var": 9.420197594885045e-07, "learning_rate": 0.0018380385809612827, "loss": 2.5137, "step": 19959 }, { "crossentropy": 2.5483617782592773, "epoch": 0.7236078886310905, "grad_norm": 0.025861086323857307, "grad_norm_var": 1.0274819011511319e-06, "learning_rate": 0.0018375884636967822, "loss": 2.5453, "step": 19960 }, { "crossentropy": 2.570967435836792, "epoch": 0.7236441415313225, "grad_norm": 0.027816666290163994, "grad_norm_var": 9.616077365022136e-07, "learning_rate": 0.0018371383891454486, "loss": 2.4975, "step": 19961 }, { "crossentropy": 2.4502437114715576, "epoch": 0.7236803944315545, "grad_norm": 0.027076629921793938, "grad_norm_var": 8.843501474177131e-07, "learning_rate": 0.001836688357313363, "loss": 2.4592, "step": 19962 }, { "crossentropy": 2.3962438106536865, "epoch": 0.7237166473317865, "grad_norm": 0.02670481614768505, "grad_norm_var": 8.982989586349035e-07, "learning_rate": 0.0018362383682065998, "loss": 2.4826, "step": 19963 }, { "crossentropy": 2.4544804096221924, "epoch": 0.7237529002320185, "grad_norm": 0.026677431538701057, "grad_norm_var": 8.936475793715089e-07, "learning_rate": 0.0018357884218312397, "loss": 2.534, "step": 19964 }, { "crossentropy": 2.3961806297302246, "epoch": 0.7237891531322506, "grad_norm": 0.026364753022789955, "grad_norm_var": 7.730020360800782e-07, "learning_rate": 0.0018353385181933597, "loss": 2.444, "step": 19965 }, { "crossentropy": 2.5482141971588135, "epoch": 0.7238254060324826, "grad_norm": 0.028478335589170456, "grad_norm_var": 7.170262812766406e-07, "learning_rate": 0.0018348886572990331, "loss": 2.5409, "step": 19966 }, { "crossentropy": 2.567457675933838, "epoch": 0.7238616589327146, "grad_norm": 0.02610759437084198, "grad_norm_var": 7.502766684705971e-07, "learning_rate": 0.0018344388391543405, "loss": 2.539, "step": 19967 }, { "crossentropy": 2.4654855728149414, "epoch": 0.7238979118329466, "grad_norm": 0.026793064549565315, "grad_norm_var": 7.220515765161973e-07, "learning_rate": 0.0018339890637653533, "loss": 2.4078, "step": 19968 }, { "crossentropy": 2.5401384830474854, "epoch": 0.7239341647331786, "grad_norm": 0.02619142271578312, "grad_norm_var": 4.7226820191013237e-07, "learning_rate": 0.0018335393311381487, "loss": 2.4802, "step": 19969 }, { "crossentropy": 2.4002773761749268, "epoch": 0.7239704176334106, "grad_norm": 0.02616634964942932, "grad_norm_var": 5.022114891945276e-07, "learning_rate": 0.0018330896412788022, "loss": 2.5045, "step": 19970 }, { "crossentropy": 2.577901840209961, "epoch": 0.7240066705336426, "grad_norm": 0.02816874161362648, "grad_norm_var": 5.899654866788872e-07, "learning_rate": 0.0018326399941933846, "loss": 2.5166, "step": 19971 }, { "crossentropy": 2.4654600620269775, "epoch": 0.7240429234338747, "grad_norm": 0.02721545659005642, "grad_norm_var": 5.81186927855086e-07, "learning_rate": 0.0018321903898879722, "loss": 2.5493, "step": 19972 }, { "crossentropy": 2.3921310901641846, "epoch": 0.7240791763341067, "grad_norm": 0.026450615376234055, "grad_norm_var": 5.8417082035241e-07, "learning_rate": 0.0018317408283686355, "loss": 2.4634, "step": 19973 }, { "crossentropy": 2.523020029067993, "epoch": 0.7241154292343387, "grad_norm": 0.026211604475975037, "grad_norm_var": 5.973421210823572e-07, "learning_rate": 0.0018312913096414458, "loss": 2.6012, "step": 19974 }, { "crossentropy": 2.62160587310791, "epoch": 0.7241516821345708, "grad_norm": 0.02752636931836605, "grad_norm_var": 6.167635848370115e-07, "learning_rate": 0.0018308418337124766, "loss": 2.5016, "step": 19975 }, { "crossentropy": 2.4729020595550537, "epoch": 0.7241879350348028, "grad_norm": 0.027565212920308113, "grad_norm_var": 5.705731363200272e-07, "learning_rate": 0.001830392400587797, "loss": 2.4399, "step": 19976 }, { "crossentropy": 2.5467941761016846, "epoch": 0.7242241879350348, "grad_norm": 0.027407566085457802, "grad_norm_var": 5.348336480500797e-07, "learning_rate": 0.0018299430102734777, "loss": 2.5727, "step": 19977 }, { "crossentropy": 2.5618479251861572, "epoch": 0.7242604408352669, "grad_norm": 0.026243582367897034, "grad_norm_var": 5.634886914031387e-07, "learning_rate": 0.0018294936627755904, "loss": 2.4972, "step": 19978 }, { "crossentropy": 2.523594379425049, "epoch": 0.7242966937354989, "grad_norm": 0.027107268571853638, "grad_norm_var": 5.635642693267602e-07, "learning_rate": 0.001829044358100201, "loss": 2.5569, "step": 19979 }, { "crossentropy": 2.3092148303985596, "epoch": 0.7243329466357309, "grad_norm": 0.02585570327937603, "grad_norm_var": 6.320376942303234e-07, "learning_rate": 0.0018285950962533808, "loss": 2.4519, "step": 19980 }, { "crossentropy": 2.5909526348114014, "epoch": 0.7243691995359629, "grad_norm": 0.02631843462586403, "grad_norm_var": 6.35266464805055e-07, "learning_rate": 0.0018281458772411957, "loss": 2.6001, "step": 19981 }, { "crossentropy": 2.6017355918884277, "epoch": 0.7244054524361949, "grad_norm": 0.026379799470305443, "grad_norm_var": 4.585167978960261e-07, "learning_rate": 0.0018276967010697148, "loss": 2.5032, "step": 19982 }, { "crossentropy": 2.5333127975463867, "epoch": 0.7244417053364269, "grad_norm": 0.025469519197940826, "grad_norm_var": 5.370683173780768e-07, "learning_rate": 0.001827247567745005, "loss": 2.4467, "step": 19983 }, { "crossentropy": 2.5277035236358643, "epoch": 0.724477958236659, "grad_norm": 0.026169486343860626, "grad_norm_var": 5.529618425452551e-07, "learning_rate": 0.0018267984772731294, "loss": 2.5219, "step": 19984 }, { "crossentropy": 2.5845768451690674, "epoch": 0.724514211136891, "grad_norm": 0.027292074635624886, "grad_norm_var": 5.609463407396971e-07, "learning_rate": 0.0018263494296601563, "loss": 2.5971, "step": 19985 }, { "crossentropy": 2.543461561203003, "epoch": 0.724550464037123, "grad_norm": 0.026950174942612648, "grad_norm_var": 5.413017165046992e-07, "learning_rate": 0.001825900424912152, "loss": 2.484, "step": 19986 }, { "crossentropy": 2.4158880710601807, "epoch": 0.724586716937355, "grad_norm": 0.025969449430704117, "grad_norm_var": 4.3365434335227576e-07, "learning_rate": 0.0018254514630351771, "loss": 2.4822, "step": 19987 }, { "crossentropy": 2.3780879974365234, "epoch": 0.724622969837587, "grad_norm": 0.026615222916007042, "grad_norm_var": 4.0957879435202934e-07, "learning_rate": 0.0018250025440352997, "loss": 2.458, "step": 19988 }, { "crossentropy": 2.437973976135254, "epoch": 0.724659222737819, "grad_norm": 0.026656093075871468, "grad_norm_var": 4.082412112538554e-07, "learning_rate": 0.0018245536679185792, "loss": 2.4079, "step": 19989 }, { "crossentropy": 2.5123519897460938, "epoch": 0.724695475638051, "grad_norm": 0.02647809498012066, "grad_norm_var": 3.985737990564823e-07, "learning_rate": 0.0018241048346910827, "loss": 2.546, "step": 19990 }, { "crossentropy": 2.589718818664551, "epoch": 0.724731728538283, "grad_norm": 0.02602994255721569, "grad_norm_var": 3.587357235159775e-07, "learning_rate": 0.0018236560443588695, "loss": 2.5472, "step": 19991 }, { "crossentropy": 2.4847872257232666, "epoch": 0.7247679814385151, "grad_norm": 0.026461685076355934, "grad_norm_var": 2.827824539775962e-07, "learning_rate": 0.001823207296928, "loss": 2.362, "step": 19992 }, { "crossentropy": 2.5814223289489746, "epoch": 0.7248042343387471, "grad_norm": 0.0255754292011261, "grad_norm_var": 2.6177497126620745e-07, "learning_rate": 0.001822758592404537, "loss": 2.4505, "step": 19993 }, { "crossentropy": 2.481795310974121, "epoch": 0.7248404872389791, "grad_norm": 0.025650598108768463, "grad_norm_var": 2.9202717375617116e-07, "learning_rate": 0.0018223099307945428, "loss": 2.5115, "step": 19994 }, { "crossentropy": 2.5170271396636963, "epoch": 0.7248767401392111, "grad_norm": 0.02654086984694004, "grad_norm_var": 2.519576249492006e-07, "learning_rate": 0.0018218613121040734, "loss": 2.511, "step": 19995 }, { "crossentropy": 2.447028636932373, "epoch": 0.7249129930394431, "grad_norm": 0.02600182220339775, "grad_norm_var": 2.451077727885862e-07, "learning_rate": 0.001821412736339192, "loss": 2.4638, "step": 19996 }, { "crossentropy": 2.4927074909210205, "epoch": 0.7249492459396751, "grad_norm": 0.026087727397680283, "grad_norm_var": 2.4740339970457693e-07, "learning_rate": 0.0018209642035059532, "loss": 2.5437, "step": 19997 }, { "crossentropy": 2.502392292022705, "epoch": 0.7249854988399071, "grad_norm": 0.025763079524040222, "grad_norm_var": 2.6218719161192706e-07, "learning_rate": 0.0018205157136104173, "loss": 2.4631, "step": 19998 }, { "crossentropy": 2.477696657180786, "epoch": 0.7250217517401392, "grad_norm": 0.028444452211260796, "grad_norm_var": 5.129005471467592e-07, "learning_rate": 0.001820067266658645, "loss": 2.4735, "step": 19999 }, { "crossentropy": 2.4093003273010254, "epoch": 0.7250580046403712, "grad_norm": 0.027932588011026382, "grad_norm_var": 6.487892801724232e-07, "learning_rate": 0.0018196188626566862, "loss": 2.5804, "step": 20000 }, { "crossentropy": 2.575373411178589, "epoch": 0.7250942575406032, "grad_norm": 0.02708546817302704, "grad_norm_var": 6.304110472160155e-07, "learning_rate": 0.001819170501610602, "loss": 2.5963, "step": 20001 }, { "crossentropy": 2.549151659011841, "epoch": 0.7251305104408353, "grad_norm": 0.026602016761898994, "grad_norm_var": 6.177934608914137e-07, "learning_rate": 0.0018187221835264484, "loss": 2.5454, "step": 20002 }, { "crossentropy": 2.5799171924591064, "epoch": 0.7251667633410673, "grad_norm": 0.026023734360933304, "grad_norm_var": 6.141852268030471e-07, "learning_rate": 0.0018182739084102778, "loss": 2.5287, "step": 20003 }, { "crossentropy": 2.5522615909576416, "epoch": 0.7252030162412993, "grad_norm": 0.025662776082754135, "grad_norm_var": 6.558437353033532e-07, "learning_rate": 0.0018178256762681484, "loss": 2.491, "step": 20004 }, { "crossentropy": 2.5255472660064697, "epoch": 0.7252392691415314, "grad_norm": 0.02589454874396324, "grad_norm_var": 6.698718315115149e-07, "learning_rate": 0.00181737748710611, "loss": 2.5006, "step": 20005 }, { "crossentropy": 2.6411924362182617, "epoch": 0.7252755220417634, "grad_norm": 0.02689981274306774, "grad_norm_var": 6.859588518301485e-07, "learning_rate": 0.001816929340930219, "loss": 2.5715, "step": 20006 }, { "crossentropy": 2.4931116104125977, "epoch": 0.7253117749419954, "grad_norm": 0.025670789182186127, "grad_norm_var": 7.125096254319372e-07, "learning_rate": 0.0018164812377465312, "loss": 2.4809, "step": 20007 }, { "crossentropy": 2.2958226203918457, "epoch": 0.7253480278422274, "grad_norm": 0.027074364945292473, "grad_norm_var": 7.415336116237906e-07, "learning_rate": 0.0018160331775610922, "loss": 2.4055, "step": 20008 }, { "crossentropy": 2.4287350177764893, "epoch": 0.7253842807424594, "grad_norm": 0.027437008917331696, "grad_norm_var": 7.455458990861339e-07, "learning_rate": 0.0018155851603799567, "loss": 2.4298, "step": 20009 }, { "crossentropy": 2.427248477935791, "epoch": 0.7254205336426914, "grad_norm": 0.026890341192483902, "grad_norm_var": 6.932285927481781e-07, "learning_rate": 0.0018151371862091775, "loss": 2.4525, "step": 20010 }, { "crossentropy": 2.5371384620666504, "epoch": 0.7254567865429234, "grad_norm": 0.02649002894759178, "grad_norm_var": 6.939652729274932e-07, "learning_rate": 0.0018146892550548028, "loss": 2.5669, "step": 20011 }, { "crossentropy": 2.562692165374756, "epoch": 0.7254930394431555, "grad_norm": 0.02583027444779873, "grad_norm_var": 7.100021465454894e-07, "learning_rate": 0.0018142413669228847, "loss": 2.4979, "step": 20012 }, { "crossentropy": 2.561821937561035, "epoch": 0.7255292923433875, "grad_norm": 0.026687124744057655, "grad_norm_var": 6.905722161225253e-07, "learning_rate": 0.0018137935218194702, "loss": 2.5666, "step": 20013 }, { "crossentropy": 2.4650440216064453, "epoch": 0.7255655452436195, "grad_norm": 0.026197556406259537, "grad_norm_var": 6.510327923273107e-07, "learning_rate": 0.0018133457197506092, "loss": 2.4624, "step": 20014 }, { "crossentropy": 2.5532844066619873, "epoch": 0.7256017981438515, "grad_norm": 0.026408573612570763, "grad_norm_var": 4.301525186307017e-07, "learning_rate": 0.0018128979607223534, "loss": 2.5877, "step": 20015 }, { "crossentropy": 2.5573055744171143, "epoch": 0.7256380510440835, "grad_norm": 0.027037305757403374, "grad_norm_var": 3.1511036053323504e-07, "learning_rate": 0.0018124502447407443, "loss": 2.633, "step": 20016 }, { "crossentropy": 2.5815892219543457, "epoch": 0.7256743039443155, "grad_norm": 0.025082768872380257, "grad_norm_var": 4.076430624152504e-07, "learning_rate": 0.0018120025718118317, "loss": 2.5986, "step": 20017 }, { "crossentropy": 2.469475269317627, "epoch": 0.7257105568445475, "grad_norm": 0.026716196909546852, "grad_norm_var": 4.1201958136573524e-07, "learning_rate": 0.0018115549419416637, "loss": 2.4997, "step": 20018 }, { "crossentropy": 2.425325393676758, "epoch": 0.7257468097447796, "grad_norm": 0.02633863314986229, "grad_norm_var": 4.034603300487674e-07, "learning_rate": 0.0018111073551362838, "loss": 2.5157, "step": 20019 }, { "crossentropy": 2.5852720737457275, "epoch": 0.7257830626450116, "grad_norm": 0.02575068175792694, "grad_norm_var": 3.953624630854968e-07, "learning_rate": 0.0018106598114017397, "loss": 2.5103, "step": 20020 }, { "crossentropy": 2.4191789627075195, "epoch": 0.7258193155452436, "grad_norm": 0.02647973969578743, "grad_norm_var": 3.772981144123388e-07, "learning_rate": 0.0018102123107440731, "loss": 2.4476, "step": 20021 }, { "crossentropy": 2.4951589107513428, "epoch": 0.7258555684454756, "grad_norm": 0.026570690795779228, "grad_norm_var": 3.637564247954694e-07, "learning_rate": 0.00180976485316933, "loss": 2.4475, "step": 20022 }, { "crossentropy": 2.435793876647949, "epoch": 0.7258918213457076, "grad_norm": 0.026208441704511642, "grad_norm_var": 3.2837414627491743e-07, "learning_rate": 0.001809317438683556, "loss": 2.4807, "step": 20023 }, { "crossentropy": 2.468747138977051, "epoch": 0.7259280742459396, "grad_norm": 0.02622833102941513, "grad_norm_var": 3.0267690318366326e-07, "learning_rate": 0.0018088700672927915, "loss": 2.4755, "step": 20024 }, { "crossentropy": 2.4840474128723145, "epoch": 0.7259643271461717, "grad_norm": 0.025782018899917603, "grad_norm_var": 2.4439340100602424e-07, "learning_rate": 0.0018084227390030778, "loss": 2.4717, "step": 20025 }, { "crossentropy": 2.5504043102264404, "epoch": 0.7260005800464037, "grad_norm": 0.025927625596523285, "grad_norm_var": 2.2572968151623035e-07, "learning_rate": 0.0018079754538204602, "loss": 2.4464, "step": 20026 }, { "crossentropy": 2.5951547622680664, "epoch": 0.7260368329466357, "grad_norm": 0.02711121365427971, "grad_norm_var": 2.7109353826526515e-07, "learning_rate": 0.0018075282117509766, "loss": 2.5709, "step": 20027 }, { "crossentropy": 2.5279505252838135, "epoch": 0.7260730858468677, "grad_norm": 0.02765802852809429, "grad_norm_var": 3.721587391295692e-07, "learning_rate": 0.001807081012800671, "loss": 2.5748, "step": 20028 }, { "crossentropy": 2.3143646717071533, "epoch": 0.7261093387470998, "grad_norm": 0.02606523595750332, "grad_norm_var": 3.7140781416476694e-07, "learning_rate": 0.0018066338569755797, "loss": 2.4582, "step": 20029 }, { "crossentropy": 2.483875036239624, "epoch": 0.7261455916473318, "grad_norm": 0.027199124917387962, "grad_norm_var": 4.1405480173117846e-07, "learning_rate": 0.0018061867442817443, "loss": 2.5241, "step": 20030 }, { "crossentropy": 2.487642526626587, "epoch": 0.7261818445475638, "grad_norm": 0.028071867302060127, "grad_norm_var": 5.865836770030732e-07, "learning_rate": 0.0018057396747252052, "loss": 2.4582, "step": 20031 }, { "crossentropy": 2.4386284351348877, "epoch": 0.7262180974477959, "grad_norm": 0.026100583374500275, "grad_norm_var": 5.76095745102043e-07, "learning_rate": 0.0018052926483120002, "loss": 2.532, "step": 20032 }, { "crossentropy": 2.6505348682403564, "epoch": 0.7262543503480279, "grad_norm": 0.027454527094960213, "grad_norm_var": 4.935053225098846e-07, "learning_rate": 0.0018048456650481637, "loss": 2.6248, "step": 20033 }, { "crossentropy": 2.49605393409729, "epoch": 0.7262906032482599, "grad_norm": 0.02614937350153923, "grad_norm_var": 5.051014211815881e-07, "learning_rate": 0.001804398724939737, "loss": 2.4535, "step": 20034 }, { "crossentropy": 2.255873441696167, "epoch": 0.7263268561484919, "grad_norm": 0.029760019853711128, "grad_norm_var": 1.131854237919324e-06, "learning_rate": 0.0018039518279927536, "loss": 2.3614, "step": 20035 }, { "crossentropy": 2.3217685222625732, "epoch": 0.7263631090487239, "grad_norm": 0.025701209902763367, "grad_norm_var": 1.1388123036447692e-06, "learning_rate": 0.0018035049742132526, "loss": 2.3273, "step": 20036 }, { "crossentropy": 2.5259180068969727, "epoch": 0.7263993619489559, "grad_norm": 0.02777048572897911, "grad_norm_var": 1.1913929791989754e-06, "learning_rate": 0.0018030581636072657, "loss": 2.4638, "step": 20037 }, { "crossentropy": 2.5202624797821045, "epoch": 0.726435614849188, "grad_norm": 0.026535872370004654, "grad_norm_var": 1.192811500232554e-06, "learning_rate": 0.0018026113961808299, "loss": 2.5313, "step": 20038 }, { "crossentropy": 2.4184775352478027, "epoch": 0.72647186774942, "grad_norm": 0.025474298745393753, "grad_norm_var": 1.2900546348466763e-06, "learning_rate": 0.0018021646719399815, "loss": 2.4853, "step": 20039 }, { "crossentropy": 2.3517262935638428, "epoch": 0.726508120649652, "grad_norm": 0.02614779956638813, "grad_norm_var": 1.2967256639142965e-06, "learning_rate": 0.0018017179908907505, "loss": 2.4483, "step": 20040 }, { "crossentropy": 2.479572057723999, "epoch": 0.726544373549884, "grad_norm": 0.027066271752119064, "grad_norm_var": 1.2243249868089938e-06, "learning_rate": 0.0018012713530391727, "loss": 2.4358, "step": 20041 }, { "crossentropy": 2.497286319732666, "epoch": 0.726580626450116, "grad_norm": 0.09837377816438675, "grad_norm_var": 0.0003199841458421549, "learning_rate": 0.00180082475839128, "loss": 2.6084, "step": 20042 }, { "crossentropy": 2.4591739177703857, "epoch": 0.726616879350348, "grad_norm": 0.029769310727715492, "grad_norm_var": 0.00031890042761760474, "learning_rate": 0.0018003782069531023, "loss": 2.3259, "step": 20043 }, { "crossentropy": 2.483914613723755, "epoch": 0.72665313225058, "grad_norm": 0.02691715955734253, "grad_norm_var": 0.0003193222651252678, "learning_rate": 0.0017999316987306734, "loss": 2.4797, "step": 20044 }, { "crossentropy": 2.530731439590454, "epoch": 0.726689385150812, "grad_norm": 0.027052603662014008, "grad_norm_var": 0.0003186631323008235, "learning_rate": 0.001799485233730022, "loss": 2.5515, "step": 20045 }, { "crossentropy": 2.4829421043395996, "epoch": 0.7267256380510441, "grad_norm": 0.027747681364417076, "grad_norm_var": 0.00031836031033484984, "learning_rate": 0.0017990388119571794, "loss": 2.5258, "step": 20046 }, { "crossentropy": 2.548722267150879, "epoch": 0.7267618909512761, "grad_norm": 0.02543763630092144, "grad_norm_var": 0.000320044016361145, "learning_rate": 0.0017985924334181775, "loss": 2.5036, "step": 20047 }, { "crossentropy": 2.3672924041748047, "epoch": 0.7267981438515081, "grad_norm": 0.026338033378124237, "grad_norm_var": 0.00031987766600686645, "learning_rate": 0.0017981460981190407, "loss": 2.5084, "step": 20048 }, { "crossentropy": 2.573262929916382, "epoch": 0.7268343967517401, "grad_norm": 0.0272199846804142, "grad_norm_var": 0.000320007021418384, "learning_rate": 0.0017976998060658018, "loss": 2.507, "step": 20049 }, { "crossentropy": 2.443791151046753, "epoch": 0.7268706496519721, "grad_norm": 0.025309955701231956, "grad_norm_var": 0.0003206461483726695, "learning_rate": 0.0017972535572644867, "loss": 2.4241, "step": 20050 }, { "crossentropy": 2.511362314224243, "epoch": 0.7269069025522041, "grad_norm": 0.026168879121541977, "grad_norm_var": 0.0003222440665212068, "learning_rate": 0.0017968073517211209, "loss": 2.5587, "step": 20051 }, { "crossentropy": 2.6149590015411377, "epoch": 0.7269431554524362, "grad_norm": 0.026570545509457588, "grad_norm_var": 0.0003216551526081695, "learning_rate": 0.0017963611894417342, "loss": 2.5251, "step": 20052 }, { "crossentropy": 2.5689799785614014, "epoch": 0.7269794083526682, "grad_norm": 0.026508668437600136, "grad_norm_var": 0.0003223390171425554, "learning_rate": 0.0017959150704323502, "loss": 2.5458, "step": 20053 }, { "crossentropy": 2.4722628593444824, "epoch": 0.7270156612529002, "grad_norm": 0.026039326563477516, "grad_norm_var": 0.00032266089722929645, "learning_rate": 0.0017954689946989954, "loss": 2.4188, "step": 20054 }, { "crossentropy": 2.5218660831451416, "epoch": 0.7270519141531323, "grad_norm": 0.026952436193823814, "grad_norm_var": 0.0003216820361492797, "learning_rate": 0.0017950229622476965, "loss": 2.5408, "step": 20055 }, { "crossentropy": 2.4184277057647705, "epoch": 0.7270881670533643, "grad_norm": 0.026085685938596725, "grad_norm_var": 0.00032172433611461955, "learning_rate": 0.0017945769730844747, "loss": 2.4429, "step": 20056 }, { "crossentropy": 2.5158097743988037, "epoch": 0.7271244199535963, "grad_norm": 0.025844916701316833, "grad_norm_var": 0.00032249437779664346, "learning_rate": 0.0017941310272153565, "loss": 2.5045, "step": 20057 }, { "crossentropy": 2.44651198387146, "epoch": 0.7271606728538283, "grad_norm": 0.027366891503334045, "grad_norm_var": 1.13365948450807e-06, "learning_rate": 0.0017936851246463641, "loss": 2.4936, "step": 20058 }, { "crossentropy": 2.367144823074341, "epoch": 0.7271969257540604, "grad_norm": 0.026957107707858086, "grad_norm_var": 4.801097445374565e-07, "learning_rate": 0.0017932392653835184, "loss": 2.4315, "step": 20059 }, { "crossentropy": 2.2694640159606934, "epoch": 0.7272331786542924, "grad_norm": 0.027234401553869247, "grad_norm_var": 5.026771652380871e-07, "learning_rate": 0.001792793449432844, "loss": 2.3709, "step": 20060 }, { "crossentropy": 2.4375457763671875, "epoch": 0.7272694315545244, "grad_norm": 0.025718266144394875, "grad_norm_var": 5.24923096978597e-07, "learning_rate": 0.00179234767680036, "loss": 2.4508, "step": 20061 }, { "crossentropy": 2.581712484359741, "epoch": 0.7273056844547564, "grad_norm": 0.026485856622457504, "grad_norm_var": 4.092684528324171e-07, "learning_rate": 0.0017919019474920883, "loss": 2.5092, "step": 20062 }, { "crossentropy": 2.592343807220459, "epoch": 0.7273419373549884, "grad_norm": 0.026946749538183212, "grad_norm_var": 3.599951263773589e-07, "learning_rate": 0.0017914562615140505, "loss": 2.5119, "step": 20063 }, { "crossentropy": 2.4538257122039795, "epoch": 0.7273781902552204, "grad_norm": 0.026199135929346085, "grad_norm_var": 3.639084486928608e-07, "learning_rate": 0.0017910106188722642, "loss": 2.4822, "step": 20064 }, { "crossentropy": 2.495976448059082, "epoch": 0.7274144431554525, "grad_norm": 0.02598317340016365, "grad_norm_var": 3.367515521509086e-07, "learning_rate": 0.0017905650195727501, "loss": 2.5141, "step": 20065 }, { "crossentropy": 2.682760238647461, "epoch": 0.7274506960556845, "grad_norm": 0.025451604276895523, "grad_norm_var": 3.1745153116393097e-07, "learning_rate": 0.0017901194636215268, "loss": 2.6118, "step": 20066 }, { "crossentropy": 2.5864126682281494, "epoch": 0.7274869489559165, "grad_norm": 0.027140680700540543, "grad_norm_var": 3.4560893645002356e-07, "learning_rate": 0.001789673951024609, "loss": 2.4904, "step": 20067 }, { "crossentropy": 2.5314102172851562, "epoch": 0.7275232018561485, "grad_norm": 0.02764715626835823, "grad_norm_var": 4.3279522739836315e-07, "learning_rate": 0.0017892284817880182, "loss": 2.4934, "step": 20068 }, { "crossentropy": 2.430445432662964, "epoch": 0.7275594547563805, "grad_norm": 0.027219898998737335, "grad_norm_var": 4.6190154823159276e-07, "learning_rate": 0.0017887830559177675, "loss": 2.4765, "step": 20069 }, { "crossentropy": 2.3779685497283936, "epoch": 0.7275957076566125, "grad_norm": 0.027085667476058006, "grad_norm_var": 4.549564065259304e-07, "learning_rate": 0.0017883376734198746, "loss": 2.4094, "step": 20070 }, { "crossentropy": 2.7364087104797363, "epoch": 0.7276319605568445, "grad_norm": 0.0863015204668045, "grad_norm_var": 0.00022303256021292186, "learning_rate": 0.0017878923343003577, "loss": 2.6562, "step": 20071 }, { "crossentropy": 2.3954312801361084, "epoch": 0.7276682134570766, "grad_norm": 0.028954247012734413, "grad_norm_var": 0.00022191421510035016, "learning_rate": 0.001787447038565227, "loss": 2.4589, "step": 20072 }, { "crossentropy": 2.6013410091400146, "epoch": 0.7277044663573086, "grad_norm": 0.027390383183956146, "grad_norm_var": 0.00022109733807577358, "learning_rate": 0.0017870017862205013, "loss": 2.5257, "step": 20073 }, { "crossentropy": 2.3847200870513916, "epoch": 0.7277407192575406, "grad_norm": 0.029061760753393173, "grad_norm_var": 0.0002205394300927009, "learning_rate": 0.0017865565772721926, "loss": 2.4358, "step": 20074 }, { "crossentropy": 2.3498454093933105, "epoch": 0.7277769721577726, "grad_norm": 0.028331873938441277, "grad_norm_var": 0.0002199648563993646, "learning_rate": 0.0017861114117263116, "loss": 2.5256, "step": 20075 }, { "crossentropy": 2.522094249725342, "epoch": 0.7278132250580046, "grad_norm": 0.02751234732568264, "grad_norm_var": 0.0002198367295177344, "learning_rate": 0.0017856662895888754, "loss": 2.5504, "step": 20076 }, { "crossentropy": 2.566495895385742, "epoch": 0.7278494779582366, "grad_norm": 0.026911661028862, "grad_norm_var": 0.00021911087092634686, "learning_rate": 0.0017852212108658916, "loss": 2.5786, "step": 20077 }, { "crossentropy": 2.530585289001465, "epoch": 0.7278857308584686, "grad_norm": 0.026678849011659622, "grad_norm_var": 0.0002189992528644865, "learning_rate": 0.0017847761755633747, "loss": 2.4469, "step": 20078 }, { "crossentropy": 2.389711856842041, "epoch": 0.7279219837587007, "grad_norm": 0.025665948167443275, "grad_norm_var": 0.00021978133928745198, "learning_rate": 0.001784331183687336, "loss": 2.4675, "step": 20079 }, { "crossentropy": 2.4121785163879395, "epoch": 0.7279582366589327, "grad_norm": 0.02610735222697258, "grad_norm_var": 0.00021983873325105, "learning_rate": 0.001783886235243783, "loss": 2.4709, "step": 20080 }, { "crossentropy": 2.4115591049194336, "epoch": 0.7279944895591647, "grad_norm": 0.026717495173215866, "grad_norm_var": 0.00021939687998986538, "learning_rate": 0.0017834413302387287, "loss": 2.4929, "step": 20081 }, { "crossentropy": 2.605844497680664, "epoch": 0.7280307424593968, "grad_norm": 0.02621365524828434, "grad_norm_var": 0.0002188809880232246, "learning_rate": 0.0017829964686781792, "loss": 2.5906, "step": 20082 }, { "crossentropy": 2.6162784099578857, "epoch": 0.7280669953596288, "grad_norm": 0.026162728667259216, "grad_norm_var": 0.00021943535843747247, "learning_rate": 0.001782551650568146, "loss": 2.5938, "step": 20083 }, { "crossentropy": 2.6423323154449463, "epoch": 0.7281032482598608, "grad_norm": 0.026922406628727913, "grad_norm_var": 0.00021977987824635736, "learning_rate": 0.0017821068759146337, "loss": 2.5638, "step": 20084 }, { "crossentropy": 2.486832618713379, "epoch": 0.7281395011600929, "grad_norm": 0.02663147635757923, "grad_norm_var": 0.00022008454672547967, "learning_rate": 0.0017816621447236526, "loss": 2.4874, "step": 20085 }, { "crossentropy": 2.347161293029785, "epoch": 0.7281757540603249, "grad_norm": 0.02625865302979946, "grad_norm_var": 0.00022053582991060633, "learning_rate": 0.001781217457001207, "loss": 2.433, "step": 20086 }, { "crossentropy": 2.5133204460144043, "epoch": 0.7282120069605569, "grad_norm": 0.029292330145835876, "grad_norm_var": 1.3197957215632082e-06, "learning_rate": 0.0017807728127533058, "loss": 2.549, "step": 20087 }, { "crossentropy": 2.414580821990967, "epoch": 0.7282482598607889, "grad_norm": 0.026095302775502205, "grad_norm_var": 1.1527213426439897e-06, "learning_rate": 0.0017803282119859514, "loss": 2.5776, "step": 20088 }, { "crossentropy": 2.5294830799102783, "epoch": 0.7282845127610209, "grad_norm": 0.02588881179690361, "grad_norm_var": 1.2149098991894036e-06, "learning_rate": 0.0017798836547051522, "loss": 2.5528, "step": 20089 }, { "crossentropy": 2.3324763774871826, "epoch": 0.7283207656612529, "grad_norm": 0.025490108877420425, "grad_norm_var": 9.842963526740522e-07, "learning_rate": 0.001779439140916909, "loss": 2.488, "step": 20090 }, { "crossentropy": 2.4632766246795654, "epoch": 0.7283570185614849, "grad_norm": 0.02639763616025448, "grad_norm_var": 7.921266173019822e-07, "learning_rate": 0.0017789946706272299, "loss": 2.5989, "step": 20091 }, { "crossentropy": 2.6460647583007812, "epoch": 0.728393271461717, "grad_norm": 0.028348537161946297, "grad_norm_var": 9.42098786033072e-07, "learning_rate": 0.0017785502438421137, "loss": 2.5529, "step": 20092 }, { "crossentropy": 2.4571151733398438, "epoch": 0.728429524361949, "grad_norm": 0.0261538065969944, "grad_norm_var": 9.476581802723057e-07, "learning_rate": 0.0017781058605675672, "loss": 2.509, "step": 20093 }, { "crossentropy": 2.466745138168335, "epoch": 0.728465777262181, "grad_norm": 0.025563271716237068, "grad_norm_var": 1.0083673719487312e-06, "learning_rate": 0.0017776615208095886, "loss": 2.4911, "step": 20094 }, { "crossentropy": 2.484788179397583, "epoch": 0.728502030162413, "grad_norm": 0.026327311992645264, "grad_norm_var": 9.626554400250671e-07, "learning_rate": 0.001777217224574183, "loss": 2.4354, "step": 20095 }, { "crossentropy": 2.5522074699401855, "epoch": 0.728538283062645, "grad_norm": 0.026881737634539604, "grad_norm_var": 9.559095211476708e-07, "learning_rate": 0.0017767729718673475, "loss": 2.5235, "step": 20096 }, { "crossentropy": 2.4439597129821777, "epoch": 0.728574535962877, "grad_norm": 0.025527335703372955, "grad_norm_var": 1.0232680167542579e-06, "learning_rate": 0.0017763287626950848, "loss": 2.4622, "step": 20097 }, { "crossentropy": 2.424969434738159, "epoch": 0.728610788863109, "grad_norm": 0.025839276611804962, "grad_norm_var": 1.0468054084562806e-06, "learning_rate": 0.0017758845970633958, "loss": 2.4505, "step": 20098 }, { "crossentropy": 2.4848744869232178, "epoch": 0.728647041763341, "grad_norm": 0.02738511748611927, "grad_norm_var": 1.0874584235652605e-06, "learning_rate": 0.0017754404749782782, "loss": 2.4991, "step": 20099 }, { "crossentropy": 2.640052080154419, "epoch": 0.7286832946635731, "grad_norm": 0.025856604799628258, "grad_norm_var": 1.107336767502256e-06, "learning_rate": 0.0017749963964457283, "loss": 2.5475, "step": 20100 }, { "crossentropy": 2.467764139175415, "epoch": 0.7287195475638051, "grad_norm": 0.025900376960635185, "grad_norm_var": 1.1275452236381705e-06, "learning_rate": 0.0017745523614717486, "loss": 2.4489, "step": 20101 }, { "crossentropy": 2.544379711151123, "epoch": 0.7287558004640371, "grad_norm": 0.02630634233355522, "grad_norm_var": 1.1264681999791545e-06, "learning_rate": 0.0017741083700623316, "loss": 2.4591, "step": 20102 }, { "crossentropy": 2.50632381439209, "epoch": 0.7287920533642691, "grad_norm": 0.0269214678555727, "grad_norm_var": 5.803419206302454e-07, "learning_rate": 0.0017736644222234787, "loss": 2.4977, "step": 20103 }, { "crossentropy": 2.577052593231201, "epoch": 0.7288283062645011, "grad_norm": 0.026482511311769485, "grad_norm_var": 5.788765362714383e-07, "learning_rate": 0.001773220517961182, "loss": 2.5383, "step": 20104 }, { "crossentropy": 2.552807092666626, "epoch": 0.7288645591647331, "grad_norm": 0.02683277428150177, "grad_norm_var": 5.791160855174449e-07, "learning_rate": 0.0017727766572814396, "loss": 2.5526, "step": 20105 }, { "crossentropy": 2.295414924621582, "epoch": 0.7289008120649652, "grad_norm": 0.02697419747710228, "grad_norm_var": 5.390232966029362e-07, "learning_rate": 0.0017723328401902472, "loss": 2.4363, "step": 20106 }, { "crossentropy": 2.5061116218566895, "epoch": 0.7289370649651972, "grad_norm": 0.0267401784658432, "grad_norm_var": 5.42542746871756e-07, "learning_rate": 0.001771889066693596, "loss": 2.5533, "step": 20107 }, { "crossentropy": 2.4381134510040283, "epoch": 0.7289733178654292, "grad_norm": 0.027604810893535614, "grad_norm_var": 3.940590419549787e-07, "learning_rate": 0.0017714453367974837, "loss": 2.4209, "step": 20108 }, { "crossentropy": 2.490060329437256, "epoch": 0.7290095707656613, "grad_norm": 0.026929285377264023, "grad_norm_var": 4.0039130623711896e-07, "learning_rate": 0.0017710016505079018, "loss": 2.4785, "step": 20109 }, { "crossentropy": 2.476217746734619, "epoch": 0.7290458236658933, "grad_norm": 0.02784973382949829, "grad_norm_var": 4.4017979420062484e-07, "learning_rate": 0.0017705580078308408, "loss": 2.4594, "step": 20110 }, { "crossentropy": 2.5034849643707275, "epoch": 0.7290820765661253, "grad_norm": 0.027406727895140648, "grad_norm_var": 4.669272537467679e-07, "learning_rate": 0.0017701144087722965, "loss": 2.5159, "step": 20111 }, { "crossentropy": 2.547839403152466, "epoch": 0.7291183294663574, "grad_norm": 0.02924933470785618, "grad_norm_var": 8.699376799344942e-07, "learning_rate": 0.0017696708533382571, "loss": 2.558, "step": 20112 }, { "crossentropy": 2.645559787750244, "epoch": 0.7291545823665894, "grad_norm": 0.02744280733168125, "grad_norm_var": 7.581592552299467e-07, "learning_rate": 0.001769227341534715, "loss": 2.5786, "step": 20113 }, { "crossentropy": 2.561547040939331, "epoch": 0.7291908352668214, "grad_norm": 0.02697143517434597, "grad_norm_var": 6.656814057757106e-07, "learning_rate": 0.001768783873367662, "loss": 2.4942, "step": 20114 }, { "crossentropy": 2.4107718467712402, "epoch": 0.7292270881670534, "grad_norm": 0.027453582733869553, "grad_norm_var": 6.69002919080142e-07, "learning_rate": 0.0017683404488430849, "loss": 2.4265, "step": 20115 }, { "crossentropy": 2.397320508956909, "epoch": 0.7292633410672854, "grad_norm": 0.026005424559116364, "grad_norm_var": 6.465555090497931e-07, "learning_rate": 0.0017678970679669758, "loss": 2.3597, "step": 20116 }, { "crossentropy": 2.6005859375, "epoch": 0.7292995939675174, "grad_norm": 0.027779169380664825, "grad_norm_var": 5.749419436769232e-07, "learning_rate": 0.001767453730745322, "loss": 2.6097, "step": 20117 }, { "crossentropy": 2.6014535427093506, "epoch": 0.7293358468677494, "grad_norm": 0.028662189841270447, "grad_norm_var": 6.460208085092792e-07, "learning_rate": 0.0017670104371841089, "loss": 2.5228, "step": 20118 }, { "crossentropy": 2.601468324661255, "epoch": 0.7293720997679815, "grad_norm": 0.025724394246935844, "grad_norm_var": 8.010438109280615e-07, "learning_rate": 0.001766567187289328, "loss": 2.536, "step": 20119 }, { "crossentropy": 2.595114231109619, "epoch": 0.7294083526682135, "grad_norm": 0.031162424013018608, "grad_norm_var": 1.6867550698058028e-06, "learning_rate": 0.0017661239810669628, "loss": 2.5762, "step": 20120 }, { "crossentropy": 2.378150224685669, "epoch": 0.7294446055684455, "grad_norm": 0.026262080296874046, "grad_norm_var": 1.7616314835456563e-06, "learning_rate": 0.0017656808185230006, "loss": 2.5464, "step": 20121 }, { "crossentropy": 2.5580124855041504, "epoch": 0.7294808584686775, "grad_norm": 0.026035411283373833, "grad_norm_var": 1.8842331508783783e-06, "learning_rate": 0.0017652376996634283, "loss": 2.4553, "step": 20122 }, { "crossentropy": 2.533255100250244, "epoch": 0.7295171113689095, "grad_norm": 0.025974178686738014, "grad_norm_var": 1.993906017724737e-06, "learning_rate": 0.0017647946244942292, "loss": 2.5321, "step": 20123 }, { "crossentropy": 2.554737091064453, "epoch": 0.7295533642691415, "grad_norm": 0.026100507006049156, "grad_norm_var": 2.095675875182085e-06, "learning_rate": 0.0017643515930213893, "loss": 2.5274, "step": 20124 }, { "crossentropy": 2.530017852783203, "epoch": 0.7295896171693735, "grad_norm": 0.02728845179080963, "grad_norm_var": 2.0853606989144787e-06, "learning_rate": 0.0017639086052508913, "loss": 2.5593, "step": 20125 }, { "crossentropy": 2.3886497020721436, "epoch": 0.7296258700696056, "grad_norm": 0.02724870853126049, "grad_norm_var": 2.0667279191811564e-06, "learning_rate": 0.0017634656611887174, "loss": 2.4473, "step": 20126 }, { "crossentropy": 2.5839614868164062, "epoch": 0.7296621229698376, "grad_norm": 0.02772834151983261, "grad_norm_var": 2.0778582190562775e-06, "learning_rate": 0.0017630227608408523, "loss": 2.5803, "step": 20127 }, { "crossentropy": 2.6346726417541504, "epoch": 0.7296983758700696, "grad_norm": 0.027609940618276596, "grad_norm_var": 1.8236775625475917e-06, "learning_rate": 0.001762579904213276, "loss": 2.5788, "step": 20128 }, { "crossentropy": 2.4772980213165283, "epoch": 0.7297346287703016, "grad_norm": 0.027468647807836533, "grad_norm_var": 1.8245022341822903e-06, "learning_rate": 0.001762137091311971, "loss": 2.4939, "step": 20129 }, { "crossentropy": 2.4955837726593018, "epoch": 0.7297708816705336, "grad_norm": 0.025986317545175552, "grad_norm_var": 1.917434183465909e-06, "learning_rate": 0.0017616943221429198, "loss": 2.544, "step": 20130 }, { "crossentropy": 2.590700387954712, "epoch": 0.7298071345707656, "grad_norm": 0.02631707675755024, "grad_norm_var": 1.953009102454289e-06, "learning_rate": 0.001761251596712099, "loss": 2.5367, "step": 20131 }, { "crossentropy": 2.4309840202331543, "epoch": 0.7298433874709976, "grad_norm": 0.026622122153639793, "grad_norm_var": 1.8880439218646317e-06, "learning_rate": 0.0017608089150254924, "loss": 2.5057, "step": 20132 }, { "crossentropy": 2.4589035511016846, "epoch": 0.7298796403712297, "grad_norm": 0.02771761640906334, "grad_norm_var": 1.882896502641016e-06, "learning_rate": 0.0017603662770890771, "loss": 2.4679, "step": 20133 }, { "crossentropy": 2.476101875305176, "epoch": 0.7299158932714617, "grad_norm": 0.029129264876246452, "grad_norm_var": 1.9926190130053457e-06, "learning_rate": 0.00175992368290883, "loss": 2.5968, "step": 20134 }, { "crossentropy": 2.507761240005493, "epoch": 0.7299521461716937, "grad_norm": 0.02617708407342434, "grad_norm_var": 1.9194718747339e-06, "learning_rate": 0.0017594811324907322, "loss": 2.503, "step": 20135 }, { "crossentropy": 2.414114475250244, "epoch": 0.7299883990719258, "grad_norm": 0.02812703140079975, "grad_norm_var": 8.822487189803312e-07, "learning_rate": 0.0017590386258407576, "loss": 2.4971, "step": 20136 }, { "crossentropy": 2.4919943809509277, "epoch": 0.7300246519721578, "grad_norm": 0.02729676105082035, "grad_norm_var": 8.491441914429756e-07, "learning_rate": 0.0017585961629648845, "loss": 2.4702, "step": 20137 }, { "crossentropy": 2.4488816261291504, "epoch": 0.7300609048723898, "grad_norm": 0.025454511865973473, "grad_norm_var": 9.489505854798699e-07, "learning_rate": 0.0017581537438690915, "loss": 2.5165, "step": 20138 }, { "crossentropy": 2.3654592037200928, "epoch": 0.7300971577726219, "grad_norm": 0.026273058727383614, "grad_norm_var": 9.130398925536257e-07, "learning_rate": 0.0017577113685593494, "loss": 2.3907, "step": 20139 }, { "crossentropy": 2.632657289505005, "epoch": 0.7301334106728539, "grad_norm": 0.02809448540210724, "grad_norm_var": 9.133308128896386e-07, "learning_rate": 0.0017572690370416377, "loss": 2.6315, "step": 20140 }, { "crossentropy": 2.3555290699005127, "epoch": 0.7301696635730859, "grad_norm": 0.026163658127188683, "grad_norm_var": 9.729462301358641e-07, "learning_rate": 0.0017568267493219287, "loss": 2.4371, "step": 20141 }, { "crossentropy": 2.5935091972351074, "epoch": 0.7302059164733179, "grad_norm": 0.02638085186481476, "grad_norm_var": 1.0014713434541943e-06, "learning_rate": 0.0017563845054061945, "loss": 2.5304, "step": 20142 }, { "crossentropy": 2.528249740600586, "epoch": 0.7302421693735499, "grad_norm": 0.027804533019661903, "grad_norm_var": 1.0088861290190322e-06, "learning_rate": 0.0017559423053004114, "loss": 2.5186, "step": 20143 }, { "crossentropy": 2.428574323654175, "epoch": 0.7302784222737819, "grad_norm": 0.027537720277905464, "grad_norm_var": 1.0037136867924035e-06, "learning_rate": 0.0017555001490105488, "loss": 2.509, "step": 20144 }, { "crossentropy": 2.589066505432129, "epoch": 0.7303146751740139, "grad_norm": 0.027503646910190582, "grad_norm_var": 1.0058165836010578e-06, "learning_rate": 0.0017550580365425805, "loss": 2.5318, "step": 20145 }, { "crossentropy": 2.5251924991607666, "epoch": 0.730350928074246, "grad_norm": 0.02799745835363865, "grad_norm_var": 9.769717349533691e-07, "learning_rate": 0.0017546159679024793, "loss": 2.5131, "step": 20146 }, { "crossentropy": 2.477442502975464, "epoch": 0.730387180974478, "grad_norm": 0.026031333953142166, "grad_norm_var": 1.0142771824853905e-06, "learning_rate": 0.0017541739430962128, "loss": 2.585, "step": 20147 }, { "crossentropy": 2.4876537322998047, "epoch": 0.73042343387471, "grad_norm": 0.0265523549169302, "grad_norm_var": 1.0194402128168988e-06, "learning_rate": 0.001753731962129755, "loss": 2.5422, "step": 20148 }, { "crossentropy": 2.3372316360473633, "epoch": 0.730459686774942, "grad_norm": 0.02527521550655365, "grad_norm_var": 1.2041979789435455e-06, "learning_rate": 0.0017532900250090716, "loss": 2.4199, "step": 20149 }, { "crossentropy": 2.487609624862671, "epoch": 0.730495939675174, "grad_norm": 0.027244986966252327, "grad_norm_var": 8.879975640251146e-07, "learning_rate": 0.001752848131740135, "loss": 2.4994, "step": 20150 }, { "crossentropy": 2.454723596572876, "epoch": 0.730532192575406, "grad_norm": 0.027301380410790443, "grad_norm_var": 8.631775539363751e-07, "learning_rate": 0.0017524062823289122, "loss": 2.4408, "step": 20151 }, { "crossentropy": 2.5277044773101807, "epoch": 0.730568445475638, "grad_norm": 0.026683053001761436, "grad_norm_var": 7.649427929509589e-07, "learning_rate": 0.0017519644767813691, "loss": 2.468, "step": 20152 }, { "crossentropy": 2.520778179168701, "epoch": 0.73060469837587, "grad_norm": 0.02635353058576584, "grad_norm_var": 7.643223204655454e-07, "learning_rate": 0.001751522715103475, "loss": 2.5253, "step": 20153 }, { "crossentropy": 2.4124045372009277, "epoch": 0.7306409512761021, "grad_norm": 0.02635728009045124, "grad_norm_var": 6.544191035318461e-07, "learning_rate": 0.0017510809973011981, "loss": 2.4207, "step": 20154 }, { "crossentropy": 2.5198888778686523, "epoch": 0.7306772041763341, "grad_norm": 0.02629796788096428, "grad_norm_var": 6.52551168217015e-07, "learning_rate": 0.0017506393233805012, "loss": 2.4556, "step": 20155 }, { "crossentropy": 2.4234421253204346, "epoch": 0.7307134570765661, "grad_norm": 0.026122922077775, "grad_norm_var": 5.680107919149117e-07, "learning_rate": 0.0017501976933473528, "loss": 2.4025, "step": 20156 }, { "crossentropy": 2.4076642990112305, "epoch": 0.7307497099767981, "grad_norm": 0.02646738663315773, "grad_norm_var": 5.510237625016978e-07, "learning_rate": 0.0017497561072077146, "loss": 2.4839, "step": 20157 }, { "crossentropy": 2.4121785163879395, "epoch": 0.7307859628770301, "grad_norm": 0.02552962675690651, "grad_norm_var": 6.375804522689658e-07, "learning_rate": 0.001749314564967554, "loss": 2.4714, "step": 20158 }, { "crossentropy": 2.5547425746917725, "epoch": 0.7308222157772621, "grad_norm": 0.025854196399450302, "grad_norm_var": 5.858216661930662e-07, "learning_rate": 0.001748873066632834, "loss": 2.604, "step": 20159 }, { "crossentropy": 2.565617561340332, "epoch": 0.7308584686774942, "grad_norm": 0.025205282494425774, "grad_norm_var": 6.246921238156622e-07, "learning_rate": 0.0017484316122095146, "loss": 2.5586, "step": 20160 }, { "crossentropy": 2.4130859375, "epoch": 0.7308947215777262, "grad_norm": 0.02621866576373577, "grad_norm_var": 5.428455813621108e-07, "learning_rate": 0.001747990201703561, "loss": 2.4179, "step": 20161 }, { "crossentropy": 2.580048084259033, "epoch": 0.7309309744779582, "grad_norm": 0.025960687547922134, "grad_norm_var": 3.529012442373321e-07, "learning_rate": 0.0017475488351209368, "loss": 2.5552, "step": 20162 }, { "crossentropy": 2.336365222930908, "epoch": 0.7309672273781903, "grad_norm": 0.02651864103972912, "grad_norm_var": 3.557449873344548e-07, "learning_rate": 0.0017471075124675995, "loss": 2.5357, "step": 20163 }, { "crossentropy": 2.419823169708252, "epoch": 0.7310034802784223, "grad_norm": 0.026593895629048347, "grad_norm_var": 3.57547181259798e-07, "learning_rate": 0.0017466662337495133, "loss": 2.4555, "step": 20164 }, { "crossentropy": 2.3778462409973145, "epoch": 0.7310397331786543, "grad_norm": 0.02675020880997181, "grad_norm_var": 3.020035931511807e-07, "learning_rate": 0.0017462249989726349, "loss": 2.4857, "step": 20165 }, { "crossentropy": 2.388072967529297, "epoch": 0.7310759860788864, "grad_norm": 0.026840606704354286, "grad_norm_var": 2.634957163854164e-07, "learning_rate": 0.0017457838081429255, "loss": 2.4969, "step": 20166 }, { "crossentropy": 2.4667253494262695, "epoch": 0.7311122389791184, "grad_norm": 0.028265774250030518, "grad_norm_var": 4.4833552656045547e-07, "learning_rate": 0.001745342661266348, "loss": 2.4658, "step": 20167 }, { "crossentropy": 2.4144113063812256, "epoch": 0.7311484918793504, "grad_norm": 0.026758302003145218, "grad_norm_var": 4.5176781528575434e-07, "learning_rate": 0.0017449015583488531, "loss": 2.42, "step": 20168 }, { "crossentropy": 2.5811526775360107, "epoch": 0.7311847447795824, "grad_norm": 0.026705626398324966, "grad_norm_var": 4.5822945797802107e-07, "learning_rate": 0.0017444604993964031, "loss": 2.528, "step": 20169 }, { "crossentropy": 2.50663161277771, "epoch": 0.7312209976798144, "grad_norm": 0.027300242334604263, "grad_norm_var": 5.080620878188894e-07, "learning_rate": 0.001744019484414956, "loss": 2.5117, "step": 20170 }, { "crossentropy": 2.437385082244873, "epoch": 0.7312572505800464, "grad_norm": 0.026229333132505417, "grad_norm_var": 5.098564902635775e-07, "learning_rate": 0.0017435785134104647, "loss": 2.505, "step": 20171 }, { "crossentropy": 2.292642593383789, "epoch": 0.7312935034802784, "grad_norm": 0.02704598940908909, "grad_norm_var": 5.219206745498642e-07, "learning_rate": 0.0017431375863888898, "loss": 2.4434, "step": 20172 }, { "crossentropy": 2.4928529262542725, "epoch": 0.7313297563805105, "grad_norm": 0.027485590428113937, "grad_norm_var": 5.802149570131952e-07, "learning_rate": 0.0017426967033561818, "loss": 2.5395, "step": 20173 }, { "crossentropy": 2.564105987548828, "epoch": 0.7313660092807425, "grad_norm": 0.025761721655726433, "grad_norm_var": 5.511103935410336e-07, "learning_rate": 0.001742255864318299, "loss": 2.5635, "step": 20174 }, { "crossentropy": 2.4828262329101562, "epoch": 0.7314022621809745, "grad_norm": 0.026452308520674706, "grad_norm_var": 5.145169933084044e-07, "learning_rate": 0.0017418150692811967, "loss": 2.5267, "step": 20175 }, { "crossentropy": 2.5105838775634766, "epoch": 0.7314385150812065, "grad_norm": 0.0270919781178236, "grad_norm_var": 3.7838971810587263e-07, "learning_rate": 0.001741374318250824, "loss": 2.5535, "step": 20176 }, { "crossentropy": 2.4881224632263184, "epoch": 0.7314747679814385, "grad_norm": 0.026288144290447235, "grad_norm_var": 3.737810739291858e-07, "learning_rate": 0.0017409336112331358, "loss": 2.5804, "step": 20177 }, { "crossentropy": 2.670203685760498, "epoch": 0.7315110208816705, "grad_norm": 0.026780543848872185, "grad_norm_var": 3.2917318964855757e-07, "learning_rate": 0.001740492948234087, "loss": 2.5497, "step": 20178 }, { "crossentropy": 2.486968517303467, "epoch": 0.7315472737819025, "grad_norm": 0.02672470733523369, "grad_norm_var": 3.2397833952813037e-07, "learning_rate": 0.0017400523292596254, "loss": 2.5485, "step": 20179 }, { "crossentropy": 2.6186680793762207, "epoch": 0.7315835266821346, "grad_norm": 0.028261663392186165, "grad_norm_var": 4.48166100710153e-07, "learning_rate": 0.0017396117543157059, "loss": 2.5195, "step": 20180 }, { "crossentropy": 2.4177680015563965, "epoch": 0.7316197795823666, "grad_norm": 0.02564980648458004, "grad_norm_var": 5.489667786410963e-07, "learning_rate": 0.0017391712234082757, "loss": 2.4624, "step": 20181 }, { "crossentropy": 2.517240047454834, "epoch": 0.7316560324825986, "grad_norm": 0.029260722920298576, "grad_norm_var": 9.11142022358193e-07, "learning_rate": 0.001738730736543287, "loss": 2.544, "step": 20182 }, { "crossentropy": 2.501751661300659, "epoch": 0.7316922853828306, "grad_norm": 0.026117999106645584, "grad_norm_var": 8.380886742468188e-07, "learning_rate": 0.001738290293726692, "loss": 2.5006, "step": 20183 }, { "crossentropy": 2.4317290782928467, "epoch": 0.7317285382830626, "grad_norm": 0.02638966403901577, "grad_norm_var": 8.520558507955304e-07, "learning_rate": 0.0017378498949644334, "loss": 2.4609, "step": 20184 }, { "crossentropy": 2.4003748893737793, "epoch": 0.7317647911832946, "grad_norm": 0.028412465006113052, "grad_norm_var": 1.0020481412020687e-06, "learning_rate": 0.0017374095402624618, "loss": 2.4332, "step": 20185 }, { "crossentropy": 2.591007709503174, "epoch": 0.7318010440835266, "grad_norm": 0.026289567351341248, "grad_norm_var": 1.0191375185196097e-06, "learning_rate": 0.0017369692296267275, "loss": 2.5455, "step": 20186 }, { "crossentropy": 2.587432861328125, "epoch": 0.7318372969837587, "grad_norm": 0.02545943856239319, "grad_norm_var": 1.1240169465067159e-06, "learning_rate": 0.0017365289630631741, "loss": 2.5569, "step": 20187 }, { "crossentropy": 2.517913818359375, "epoch": 0.7318735498839907, "grad_norm": 0.02683403715491295, "grad_norm_var": 1.1210604277445134e-06, "learning_rate": 0.0017360887405777508, "loss": 2.5788, "step": 20188 }, { "crossentropy": 2.5779049396514893, "epoch": 0.7319098027842227, "grad_norm": 0.026364417746663094, "grad_norm_var": 1.1014371633881094e-06, "learning_rate": 0.0017356485621764011, "loss": 2.5096, "step": 20189 }, { "crossentropy": 2.4432830810546875, "epoch": 0.7319460556844548, "grad_norm": 0.026739494875073433, "grad_norm_var": 1.0312139805069743e-06, "learning_rate": 0.0017352084278650709, "loss": 2.4367, "step": 20190 }, { "crossentropy": 2.3924412727355957, "epoch": 0.7319823085846868, "grad_norm": 0.025934269651770592, "grad_norm_var": 1.0733707487325562e-06, "learning_rate": 0.0017347683376497075, "loss": 2.4852, "step": 20191 }, { "crossentropy": 2.5236785411834717, "epoch": 0.7320185614849188, "grad_norm": 0.02649584598839283, "grad_norm_var": 1.0713750025319642e-06, "learning_rate": 0.0017343282915362524, "loss": 2.4595, "step": 20192 }, { "crossentropy": 2.4751429557800293, "epoch": 0.7320548143851509, "grad_norm": 0.026696842163801193, "grad_norm_var": 1.0566372037642494e-06, "learning_rate": 0.001733888289530648, "loss": 2.5557, "step": 20193 }, { "crossentropy": 2.577491521835327, "epoch": 0.7320910672853829, "grad_norm": 0.02632700279355049, "grad_norm_var": 1.0692015834384376e-06, "learning_rate": 0.0017334483316388406, "loss": 2.5414, "step": 20194 }, { "crossentropy": 2.6512715816497803, "epoch": 0.7321273201856149, "grad_norm": 0.0264559518545866, "grad_norm_var": 1.0745280781257558e-06, "learning_rate": 0.0017330084178667688, "loss": 2.5371, "step": 20195 }, { "crossentropy": 2.5121989250183105, "epoch": 0.7321635730858469, "grad_norm": 0.026792045682668686, "grad_norm_var": 9.094986756547422e-07, "learning_rate": 0.0017325685482203778, "loss": 2.4945, "step": 20196 }, { "crossentropy": 2.639782428741455, "epoch": 0.7321998259860789, "grad_norm": 0.02712109126150608, "grad_norm_var": 8.507940231455384e-07, "learning_rate": 0.001732128722705606, "loss": 2.5541, "step": 20197 }, { "crossentropy": 2.5459365844726562, "epoch": 0.7322360788863109, "grad_norm": 0.02689623273909092, "grad_norm_var": 4.025845311954208e-07, "learning_rate": 0.0017316889413283942, "loss": 2.5247, "step": 20198 }, { "crossentropy": 2.502603530883789, "epoch": 0.7322723317865429, "grad_norm": 0.026161406189203262, "grad_norm_var": 4.0001163890037115e-07, "learning_rate": 0.0017312492040946854, "loss": 2.4367, "step": 20199 }, { "crossentropy": 2.410836935043335, "epoch": 0.732308584686775, "grad_norm": 0.029094306752085686, "grad_norm_var": 7.865427653074829e-07, "learning_rate": 0.001730809511010416, "loss": 2.4426, "step": 20200 }, { "crossentropy": 2.4663586616516113, "epoch": 0.732344837587007, "grad_norm": 0.02656334452331066, "grad_norm_var": 5.915126270895071e-07, "learning_rate": 0.0017303698620815238, "loss": 2.503, "step": 20201 }, { "crossentropy": 2.588487148284912, "epoch": 0.732381090487239, "grad_norm": 0.029074322432279587, "grad_norm_var": 9.46416792830607e-07, "learning_rate": 0.0017299302573139503, "loss": 2.478, "step": 20202 }, { "crossentropy": 2.4268598556518555, "epoch": 0.732417343387471, "grad_norm": 0.02805568277835846, "grad_norm_var": 8.990958769885485e-07, "learning_rate": 0.0017294906967136293, "loss": 2.4787, "step": 20203 }, { "crossentropy": 2.600086212158203, "epoch": 0.732453596287703, "grad_norm": 0.02729002572596073, "grad_norm_var": 9.034969805407634e-07, "learning_rate": 0.0017290511802865016, "loss": 2.527, "step": 20204 }, { "crossentropy": 2.4553754329681396, "epoch": 0.732489849187935, "grad_norm": 0.02860979363322258, "grad_norm_var": 1.027155817113221e-06, "learning_rate": 0.0017286117080384995, "loss": 2.4239, "step": 20205 }, { "crossentropy": 2.533586025238037, "epoch": 0.732526102088167, "grad_norm": 0.026407264173030853, "grad_norm_var": 1.0519830611843695e-06, "learning_rate": 0.0017281722799755612, "loss": 2.5117, "step": 20206 }, { "crossentropy": 2.5697829723358154, "epoch": 0.7325623549883991, "grad_norm": 0.027286801487207413, "grad_norm_var": 9.518604815026647e-07, "learning_rate": 0.0017277328961036232, "loss": 2.5831, "step": 20207 }, { "crossentropy": 2.5172324180603027, "epoch": 0.7325986078886311, "grad_norm": 0.025843102484941483, "grad_norm_var": 1.0404704155766998e-06, "learning_rate": 0.0017272935564286163, "loss": 2.4709, "step": 20208 }, { "crossentropy": 2.510934829711914, "epoch": 0.7326348607888631, "grad_norm": 0.025767525658011436, "grad_norm_var": 1.152728874481571e-06, "learning_rate": 0.001726854260956478, "loss": 2.5203, "step": 20209 }, { "crossentropy": 2.530508518218994, "epoch": 0.7326711136890951, "grad_norm": 0.025910286232829094, "grad_norm_var": 1.2070382578448108e-06, "learning_rate": 0.0017264150096931402, "loss": 2.5727, "step": 20210 }, { "crossentropy": 2.6244523525238037, "epoch": 0.7327073665893271, "grad_norm": 0.027414454147219658, "grad_norm_var": 1.1843122743997065e-06, "learning_rate": 0.0017259758026445338, "loss": 2.6455, "step": 20211 }, { "crossentropy": 2.401787757873535, "epoch": 0.7327436194895591, "grad_norm": 0.02583770826458931, "grad_norm_var": 1.2858894489259712e-06, "learning_rate": 0.0017255366398165944, "loss": 2.5024, "step": 20212 }, { "crossentropy": 2.573493242263794, "epoch": 0.7327798723897911, "grad_norm": 0.027049586176872253, "grad_norm_var": 1.2858490345104505e-06, "learning_rate": 0.0017250975212152503, "loss": 2.5112, "step": 20213 }, { "crossentropy": 2.60771107673645, "epoch": 0.7328161252900232, "grad_norm": 0.025958865880966187, "grad_norm_var": 1.3635908907047114e-06, "learning_rate": 0.0017246584468464338, "loss": 2.6143, "step": 20214 }, { "crossentropy": 2.452176094055176, "epoch": 0.7328523781902552, "grad_norm": 0.02540917880833149, "grad_norm_var": 1.4850986990087018e-06, "learning_rate": 0.0017242194167160774, "loss": 2.4089, "step": 20215 }, { "crossentropy": 2.5143790245056152, "epoch": 0.7328886310904872, "grad_norm": 0.02674025669693947, "grad_norm_var": 1.165707397107231e-06, "learning_rate": 0.001723780430830107, "loss": 2.4516, "step": 20216 }, { "crossentropy": 2.4652888774871826, "epoch": 0.7329248839907193, "grad_norm": 0.025287169963121414, "grad_norm_var": 1.312212193360163e-06, "learning_rate": 0.0017233414891944555, "loss": 2.469, "step": 20217 }, { "crossentropy": 2.502523183822632, "epoch": 0.7329611368909513, "grad_norm": 0.027923675253987312, "grad_norm_var": 1.0378089175396935e-06, "learning_rate": 0.0017229025918150498, "loss": 2.4836, "step": 20218 }, { "crossentropy": 2.553619384765625, "epoch": 0.7329973897911833, "grad_norm": 0.02704508788883686, "grad_norm_var": 9.155262165599544e-07, "learning_rate": 0.0017224637386978158, "loss": 2.5472, "step": 20219 }, { "crossentropy": 2.5649259090423584, "epoch": 0.7330336426914154, "grad_norm": 0.025813626125454903, "grad_norm_var": 9.181513518090641e-07, "learning_rate": 0.001722024929848685, "loss": 2.5597, "step": 20220 }, { "crossentropy": 2.546576499938965, "epoch": 0.7330698955916474, "grad_norm": 0.026560844853520393, "grad_norm_var": 6.093542913243137e-07, "learning_rate": 0.0017215861652735804, "loss": 2.5533, "step": 20221 }, { "crossentropy": 2.5599727630615234, "epoch": 0.7331061484918794, "grad_norm": 0.025811780244112015, "grad_norm_var": 6.302227119632404e-07, "learning_rate": 0.0017211474449784292, "loss": 2.5255, "step": 20222 }, { "crossentropy": 2.3398804664611816, "epoch": 0.7331424013921114, "grad_norm": 0.02721427194774151, "grad_norm_var": 6.215282926024286e-07, "learning_rate": 0.0017207087689691597, "loss": 2.4325, "step": 20223 }, { "crossentropy": 2.45066499710083, "epoch": 0.7331786542923434, "grad_norm": 0.02550410106778145, "grad_norm_var": 6.51587241740067e-07, "learning_rate": 0.0017202701372516932, "loss": 2.489, "step": 20224 }, { "crossentropy": 2.616036891937256, "epoch": 0.7332149071925754, "grad_norm": 0.026048915460705757, "grad_norm_var": 6.35506787276122e-07, "learning_rate": 0.0017198315498319573, "loss": 2.5981, "step": 20225 }, { "crossentropy": 2.3942818641662598, "epoch": 0.7332511600928074, "grad_norm": 0.026564864441752434, "grad_norm_var": 6.24292265818356e-07, "learning_rate": 0.0017193930067158736, "loss": 2.4359, "step": 20226 }, { "crossentropy": 2.4279401302337646, "epoch": 0.7332874129930395, "grad_norm": 0.02734949253499508, "grad_norm_var": 6.156525516382797e-07, "learning_rate": 0.001718954507909365, "loss": 2.4272, "step": 20227 }, { "crossentropy": 2.416951894760132, "epoch": 0.7333236658932715, "grad_norm": 0.026429414749145508, "grad_norm_var": 5.945567621237161e-07, "learning_rate": 0.001718516053418356, "loss": 2.4372, "step": 20228 }, { "crossentropy": 2.3121275901794434, "epoch": 0.7333599187935035, "grad_norm": 0.028773440048098564, "grad_norm_var": 9.251222732574334e-07, "learning_rate": 0.001718077643248766, "loss": 2.4928, "step": 20229 }, { "crossentropy": 2.443328380584717, "epoch": 0.7333961716937355, "grad_norm": 0.026359431445598602, "grad_norm_var": 9.047972756322127e-07, "learning_rate": 0.001717639277406518, "loss": 2.3902, "step": 20230 }, { "crossentropy": 2.5018374919891357, "epoch": 0.7334324245939675, "grad_norm": 0.02747405879199505, "grad_norm_var": 8.565807669757194e-07, "learning_rate": 0.0017172009558975344, "loss": 2.4825, "step": 20231 }, { "crossentropy": 2.5505194664001465, "epoch": 0.7334686774941995, "grad_norm": 0.026763916015625, "grad_norm_var": 8.568018082077387e-07, "learning_rate": 0.001716762678727732, "loss": 2.5006, "step": 20232 }, { "crossentropy": 2.470789670944214, "epoch": 0.7335049303944315, "grad_norm": 0.02620827965438366, "grad_norm_var": 7.384311628572664e-07, "learning_rate": 0.0017163244459030337, "loss": 2.4776, "step": 20233 }, { "crossentropy": 2.4362998008728027, "epoch": 0.7335411832946636, "grad_norm": 0.02627953141927719, "grad_norm_var": 6.479686594395549e-07, "learning_rate": 0.0017158862574293565, "loss": 2.4021, "step": 20234 }, { "crossentropy": 2.419541835784912, "epoch": 0.7335774361948956, "grad_norm": 0.02654641680419445, "grad_norm_var": 6.364147978261032e-07, "learning_rate": 0.0017154481133126182, "loss": 2.5246, "step": 20235 }, { "crossentropy": 2.701828956604004, "epoch": 0.7336136890951276, "grad_norm": 0.027435636147856712, "grad_norm_var": 6.293956517882647e-07, "learning_rate": 0.0017150100135587393, "loss": 2.6748, "step": 20236 }, { "crossentropy": 2.3990392684936523, "epoch": 0.7336499419953596, "grad_norm": 0.025489073246717453, "grad_norm_var": 7.22185752525028e-07, "learning_rate": 0.0017145719581736336, "loss": 2.4456, "step": 20237 }, { "crossentropy": 2.45978045463562, "epoch": 0.7336861948955916, "grad_norm": 0.026228325441479683, "grad_norm_var": 6.869875058508496e-07, "learning_rate": 0.001714133947163219, "loss": 2.5103, "step": 20238 }, { "crossentropy": 2.5690207481384277, "epoch": 0.7337224477958236, "grad_norm": 0.02558465301990509, "grad_norm_var": 7.340150490368064e-07, "learning_rate": 0.0017136959805334141, "loss": 2.4738, "step": 20239 }, { "crossentropy": 2.4773707389831543, "epoch": 0.7337587006960556, "grad_norm": 0.026289476081728935, "grad_norm_var": 6.614750600065592e-07, "learning_rate": 0.0017132580582901302, "loss": 2.549, "step": 20240 }, { "crossentropy": 2.440335512161255, "epoch": 0.7337949535962877, "grad_norm": 0.025879202410578728, "grad_norm_var": 6.760634888677263e-07, "learning_rate": 0.0017128201804392856, "loss": 2.4102, "step": 20241 }, { "crossentropy": 2.5175344944000244, "epoch": 0.7338312064965197, "grad_norm": 0.026195399463176727, "grad_norm_var": 6.86495851403993e-07, "learning_rate": 0.0017123823469867934, "loss": 2.5167, "step": 20242 }, { "crossentropy": 2.5803167819976807, "epoch": 0.7338674593967517, "grad_norm": 0.026336556300520897, "grad_norm_var": 6.467456039249784e-07, "learning_rate": 0.0017119445579385645, "loss": 2.5285, "step": 20243 }, { "crossentropy": 2.5310418605804443, "epoch": 0.7339037122969838, "grad_norm": 0.026347341015934944, "grad_norm_var": 6.481256245409461e-07, "learning_rate": 0.0017115068133005164, "loss": 2.4566, "step": 20244 }, { "crossentropy": 2.364917516708374, "epoch": 0.7339399651972158, "grad_norm": 0.026751982048153877, "grad_norm_var": 2.9397682149798904e-07, "learning_rate": 0.0017110691130785567, "loss": 2.4379, "step": 20245 }, { "crossentropy": 2.5338943004608154, "epoch": 0.7339762180974478, "grad_norm": 0.026233380660414696, "grad_norm_var": 2.954093435731843e-07, "learning_rate": 0.0017106314572786002, "loss": 2.5942, "step": 20246 }, { "crossentropy": 2.628939628601074, "epoch": 0.7340124709976799, "grad_norm": 0.026720384135842323, "grad_norm_var": 2.2073807092307974e-07, "learning_rate": 0.0017101938459065597, "loss": 2.5747, "step": 20247 }, { "crossentropy": 2.4979283809661865, "epoch": 0.7340487238979119, "grad_norm": 0.026266098022460938, "grad_norm_var": 2.0746513377371303e-07, "learning_rate": 0.001709756278968342, "loss": 2.551, "step": 20248 }, { "crossentropy": 2.5467071533203125, "epoch": 0.7340849767981439, "grad_norm": 0.02608085423707962, "grad_norm_var": 2.1002951936105463e-07, "learning_rate": 0.0017093187564698603, "loss": 2.6323, "step": 20249 }, { "crossentropy": 2.397974967956543, "epoch": 0.7341212296983759, "grad_norm": 0.02598099783062935, "grad_norm_var": 2.1607683879139538e-07, "learning_rate": 0.001708881278417021, "loss": 2.4658, "step": 20250 }, { "crossentropy": 2.4817373752593994, "epoch": 0.7341574825986079, "grad_norm": 0.026197539642453194, "grad_norm_var": 2.109590629290984e-07, "learning_rate": 0.0017084438448157358, "loss": 2.4647, "step": 20251 }, { "crossentropy": 2.504899501800537, "epoch": 0.7341937354988399, "grad_norm": 0.027461260557174683, "grad_norm_var": 2.1504732249549793e-07, "learning_rate": 0.001708006455671912, "loss": 2.5118, "step": 20252 }, { "crossentropy": 2.721791982650757, "epoch": 0.7342299883990719, "grad_norm": 0.02693619765341282, "grad_norm_var": 1.9859931714276876e-07, "learning_rate": 0.0017075691109914553, "loss": 2.6834, "step": 20253 }, { "crossentropy": 2.5671021938323975, "epoch": 0.734266241299304, "grad_norm": 0.02631932497024536, "grad_norm_var": 1.9772424672234665e-07, "learning_rate": 0.001707131810780274, "loss": 2.5413, "step": 20254 }, { "crossentropy": 2.5701398849487305, "epoch": 0.734302494199536, "grad_norm": 0.026692161336541176, "grad_norm_var": 1.6154668132604502e-07, "learning_rate": 0.0017066945550442763, "loss": 2.5791, "step": 20255 }, { "crossentropy": 2.4519333839416504, "epoch": 0.734338747099768, "grad_norm": 0.02545926719903946, "grad_norm_var": 2.1885258058306338e-07, "learning_rate": 0.0017062573437893647, "loss": 2.4827, "step": 20256 }, { "crossentropy": 2.547121047973633, "epoch": 0.734375, "grad_norm": 0.027571819722652435, "grad_norm_var": 2.880230927217985e-07, "learning_rate": 0.0017058201770214477, "loss": 2.5539, "step": 20257 }, { "crossentropy": 2.5623364448547363, "epoch": 0.734411252900232, "grad_norm": 0.025853388011455536, "grad_norm_var": 3.079431486413889e-07, "learning_rate": 0.0017053830547464272, "loss": 2.5686, "step": 20258 }, { "crossentropy": 2.5084474086761475, "epoch": 0.734447505800464, "grad_norm": 0.026413992047309875, "grad_norm_var": 3.071411179382388e-07, "learning_rate": 0.0017049459769702092, "loss": 2.5574, "step": 20259 }, { "crossentropy": 2.5055596828460693, "epoch": 0.734483758700696, "grad_norm": 0.027026059105992317, "grad_norm_var": 3.2615570409107117e-07, "learning_rate": 0.0017045089436986966, "loss": 2.5268, "step": 20260 }, { "crossentropy": 2.5532546043395996, "epoch": 0.7345200116009281, "grad_norm": 0.0269636120647192, "grad_norm_var": 3.3612741192136306e-07, "learning_rate": 0.0017040719549377902, "loss": 2.5446, "step": 20261 }, { "crossentropy": 2.450484037399292, "epoch": 0.7345562645011601, "grad_norm": 0.025993680581450462, "grad_norm_var": 3.48591810480483e-07, "learning_rate": 0.0017036350106933934, "loss": 2.5037, "step": 20262 }, { "crossentropy": 2.5275063514709473, "epoch": 0.7345925174013921, "grad_norm": 0.026915548369288445, "grad_norm_var": 3.568102427162761e-07, "learning_rate": 0.0017031981109714102, "loss": 2.5401, "step": 20263 }, { "crossentropy": 2.5644428730010986, "epoch": 0.7346287703016241, "grad_norm": 0.026716068387031555, "grad_norm_var": 3.5493740884426304e-07, "learning_rate": 0.0017027612557777379, "loss": 2.5617, "step": 20264 }, { "crossentropy": 2.575546979904175, "epoch": 0.7346650232018561, "grad_norm": 0.026651589199900627, "grad_norm_var": 3.406329276922059e-07, "learning_rate": 0.0017023244451182802, "loss": 2.5173, "step": 20265 }, { "crossentropy": 2.457242012023926, "epoch": 0.7347012761020881, "grad_norm": 0.027021126821637154, "grad_norm_var": 3.2628284044980357e-07, "learning_rate": 0.0017018876789989335, "loss": 2.5469, "step": 20266 }, { "crossentropy": 2.599677085876465, "epoch": 0.7347375290023201, "grad_norm": 0.02663292922079563, "grad_norm_var": 3.126167614199282e-07, "learning_rate": 0.0017014509574255993, "loss": 2.5682, "step": 20267 }, { "crossentropy": 2.4080049991607666, "epoch": 0.7347737819025522, "grad_norm": 0.027235526591539383, "grad_norm_var": 2.918132340382623e-07, "learning_rate": 0.0017010142804041784, "loss": 2.5345, "step": 20268 }, { "crossentropy": 2.540332317352295, "epoch": 0.7348100348027842, "grad_norm": 0.026527386158704758, "grad_norm_var": 2.866663485532236e-07, "learning_rate": 0.0017005776479405627, "loss": 2.5833, "step": 20269 }, { "crossentropy": 2.444430351257324, "epoch": 0.7348462877030162, "grad_norm": 0.025806505233049393, "grad_norm_var": 3.2397581474491225e-07, "learning_rate": 0.001700141060040653, "loss": 2.5101, "step": 20270 }, { "crossentropy": 2.4852702617645264, "epoch": 0.7348825406032483, "grad_norm": 0.02670629508793354, "grad_norm_var": 3.2417603399350023e-07, "learning_rate": 0.0016997045167103481, "loss": 2.5467, "step": 20271 }, { "crossentropy": 2.514338254928589, "epoch": 0.7349187935034803, "grad_norm": 0.026287436485290527, "grad_norm_var": 2.4180597881408537e-07, "learning_rate": 0.0016992680179555397, "loss": 2.4658, "step": 20272 }, { "crossentropy": 2.5807502269744873, "epoch": 0.7349550464037123, "grad_norm": 0.026124024763703346, "grad_norm_var": 1.9393596224468306e-07, "learning_rate": 0.0016988315637821278, "loss": 2.5192, "step": 20273 }, { "crossentropy": 2.5637705326080322, "epoch": 0.7349912993039444, "grad_norm": 0.02614416927099228, "grad_norm_var": 1.7203019716364027e-07, "learning_rate": 0.0016983951541960035, "loss": 2.5702, "step": 20274 }, { "crossentropy": 2.440157413482666, "epoch": 0.7350275522041764, "grad_norm": 0.025401724502444267, "grad_norm_var": 2.575168933255725e-07, "learning_rate": 0.0016979587892030634, "loss": 2.5076, "step": 20275 }, { "crossentropy": 2.488874673843384, "epoch": 0.7350638051044084, "grad_norm": 0.029189813882112503, "grad_norm_var": 6.991288692313139e-07, "learning_rate": 0.0016975224688092038, "loss": 2.5134, "step": 20276 }, { "crossentropy": 2.441363573074341, "epoch": 0.7351000580046404, "grad_norm": 0.02700996957719326, "grad_norm_var": 7.01233514884209e-07, "learning_rate": 0.001697086193020312, "loss": 2.4787, "step": 20277 }, { "crossentropy": 2.547696113586426, "epoch": 0.7351363109048724, "grad_norm": 0.026121901348233223, "grad_norm_var": 6.910792325189128e-07, "learning_rate": 0.001696649961842283, "loss": 2.572, "step": 20278 }, { "crossentropy": 2.508685350418091, "epoch": 0.7351725638051044, "grad_norm": 0.026450861245393753, "grad_norm_var": 6.88478511975954e-07, "learning_rate": 0.0016962137752810115, "loss": 2.5627, "step": 20279 }, { "crossentropy": 2.5578415393829346, "epoch": 0.7352088167053364, "grad_norm": 0.026190048083662987, "grad_norm_var": 6.995047122467045e-07, "learning_rate": 0.0016957776333423847, "loss": 2.5158, "step": 20280 }, { "crossentropy": 2.3379056453704834, "epoch": 0.7352450696055685, "grad_norm": 0.02527049370110035, "grad_norm_var": 8.080829475163766e-07, "learning_rate": 0.001695341536032297, "loss": 2.4156, "step": 20281 }, { "crossentropy": 2.43634295463562, "epoch": 0.7352813225058005, "grad_norm": 0.02582528628408909, "grad_norm_var": 8.155667852613775e-07, "learning_rate": 0.0016949054833566351, "loss": 2.4861, "step": 20282 }, { "crossentropy": 2.4466447830200195, "epoch": 0.7353175754060325, "grad_norm": 0.02628285065293312, "grad_norm_var": 8.138837625991264e-07, "learning_rate": 0.0016944694753212902, "loss": 2.5233, "step": 20283 }, { "crossentropy": 2.378025531768799, "epoch": 0.7353538283062645, "grad_norm": 0.025731412693858147, "grad_norm_var": 7.899021911250317e-07, "learning_rate": 0.0016940335119321548, "loss": 2.4782, "step": 20284 }, { "crossentropy": 2.298250913619995, "epoch": 0.7353900812064965, "grad_norm": 0.02710025943815708, "grad_norm_var": 8.264923195864966e-07, "learning_rate": 0.001693597593195111, "loss": 2.3436, "step": 20285 }, { "crossentropy": 2.4303979873657227, "epoch": 0.7354263341067285, "grad_norm": 0.02786453813314438, "grad_norm_var": 9.413353271215304e-07, "learning_rate": 0.0016931617191160487, "loss": 2.4568, "step": 20286 }, { "crossentropy": 2.3135199546813965, "epoch": 0.7354625870069605, "grad_norm": 0.026350069791078568, "grad_norm_var": 9.385806768864019e-07, "learning_rate": 0.0016927258897008574, "loss": 2.3683, "step": 20287 }, { "crossentropy": 2.547786235809326, "epoch": 0.7354988399071926, "grad_norm": 0.025757376104593277, "grad_norm_var": 9.682699283766604e-07, "learning_rate": 0.0016922901049554202, "loss": 2.5086, "step": 20288 }, { "crossentropy": 2.4194209575653076, "epoch": 0.7355350928074246, "grad_norm": 0.02755867876112461, "grad_norm_var": 1.0391597896739717e-06, "learning_rate": 0.0016918543648856265, "loss": 2.4593, "step": 20289 }, { "crossentropy": 2.4772841930389404, "epoch": 0.7355713457076566, "grad_norm": 0.027166567742824554, "grad_norm_var": 1.053858841509866e-06, "learning_rate": 0.0016914186694973587, "loss": 2.5017, "step": 20290 }, { "crossentropy": 2.6169581413269043, "epoch": 0.7356075986078886, "grad_norm": 0.02587604895234108, "grad_norm_var": 9.934345449777795e-07, "learning_rate": 0.0016909830187965026, "loss": 2.5213, "step": 20291 }, { "crossentropy": 2.539551258087158, "epoch": 0.7356438515081206, "grad_norm": 0.026871317997574806, "grad_norm_var": 5.316263605960412e-07, "learning_rate": 0.0016905474127889436, "loss": 2.4601, "step": 20292 }, { "crossentropy": 2.5807366371154785, "epoch": 0.7356801044083526, "grad_norm": 0.02698202058672905, "grad_norm_var": 5.296414663115885e-07, "learning_rate": 0.0016901118514805642, "loss": 2.4828, "step": 20293 }, { "crossentropy": 2.492554187774658, "epoch": 0.7357163573085846, "grad_norm": 0.026319077238440514, "grad_norm_var": 5.231174242128466e-07, "learning_rate": 0.001689676334877246, "loss": 2.5136, "step": 20294 }, { "crossentropy": 2.5503480434417725, "epoch": 0.7357526102088167, "grad_norm": 0.027327820658683777, "grad_norm_var": 5.683836437416397e-07, "learning_rate": 0.0016892408629848737, "loss": 2.5086, "step": 20295 }, { "crossentropy": 2.4988934993743896, "epoch": 0.7357888631090487, "grad_norm": 0.02623152919113636, "grad_norm_var": 5.666130955460254e-07, "learning_rate": 0.0016888054358093264, "loss": 2.4795, "step": 20296 }, { "crossentropy": 2.5215370655059814, "epoch": 0.7358251160092807, "grad_norm": 0.027730468660593033, "grad_norm_var": 5.309919096030258e-07, "learning_rate": 0.0016883700533564876, "loss": 2.464, "step": 20297 }, { "crossentropy": 2.695162773132324, "epoch": 0.7358613689095128, "grad_norm": 0.026362674310803413, "grad_norm_var": 4.873724273607525e-07, "learning_rate": 0.001687934715632236, "loss": 2.6471, "step": 20298 }, { "crossentropy": 2.5955874919891357, "epoch": 0.7358976218097448, "grad_norm": 0.026367448270320892, "grad_norm_var": 4.828939575652793e-07, "learning_rate": 0.0016874994226424517, "loss": 2.5043, "step": 20299 }, { "crossentropy": 2.680835723876953, "epoch": 0.7359338747099768, "grad_norm": 0.02991661988198757, "grad_norm_var": 1.02328611289285e-06, "learning_rate": 0.0016870641743930166, "loss": 2.6359, "step": 20300 }, { "crossentropy": 2.533726215362549, "epoch": 0.7359701276102089, "grad_norm": 0.02639637514948845, "grad_norm_var": 1.0435667606241311e-06, "learning_rate": 0.0016866289708898075, "loss": 2.5162, "step": 20301 }, { "crossentropy": 2.4294543266296387, "epoch": 0.7360063805104409, "grad_norm": 0.025895746424794197, "grad_norm_var": 1.0437629689813751e-06, "learning_rate": 0.0016861938121387006, "loss": 2.4231, "step": 20302 }, { "crossentropy": 2.6778159141540527, "epoch": 0.7360426334106729, "grad_norm": 0.02768412046134472, "grad_norm_var": 1.0715185264249805e-06, "learning_rate": 0.0016857586981455764, "loss": 2.6797, "step": 20303 }, { "crossentropy": 2.6389706134796143, "epoch": 0.7360788863109049, "grad_norm": 0.026785310357809067, "grad_norm_var": 9.805774758314678e-07, "learning_rate": 0.0016853236289163093, "loss": 2.5518, "step": 20304 }, { "crossentropy": 2.58453631401062, "epoch": 0.7361151392111369, "grad_norm": 0.025830881670117378, "grad_norm_var": 1.0308483462902195e-06, "learning_rate": 0.0016848886044567768, "loss": 2.4834, "step": 20305 }, { "crossentropy": 2.489696979522705, "epoch": 0.7361513921113689, "grad_norm": 0.027312396094202995, "grad_norm_var": 1.0381577116416134e-06, "learning_rate": 0.0016844536247728559, "loss": 2.5495, "step": 20306 }, { "crossentropy": 2.337909698486328, "epoch": 0.7361876450116009, "grad_norm": 0.02594032883644104, "grad_norm_var": 1.0299132950147925e-06, "learning_rate": 0.001684018689870419, "loss": 2.4318, "step": 20307 }, { "crossentropy": 2.4515514373779297, "epoch": 0.736223897911833, "grad_norm": 0.02794218249619007, "grad_norm_var": 1.1014687812715366e-06, "learning_rate": 0.0016835837997553433, "loss": 2.5791, "step": 20308 }, { "crossentropy": 2.648993730545044, "epoch": 0.736260150812065, "grad_norm": 0.027309071272611618, "grad_norm_var": 1.1100271778268725e-06, "learning_rate": 0.0016831489544334994, "loss": 2.5024, "step": 20309 }, { "crossentropy": 2.520258903503418, "epoch": 0.736296403712297, "grad_norm": 0.02559424564242363, "grad_norm_var": 1.2047569420421678e-06, "learning_rate": 0.0016827141539107644, "loss": 2.546, "step": 20310 }, { "crossentropy": 2.538555145263672, "epoch": 0.736332656612529, "grad_norm": 0.02574482001364231, "grad_norm_var": 1.2740738185539838e-06, "learning_rate": 0.001682279398193009, "loss": 2.5092, "step": 20311 }, { "crossentropy": 2.5311121940612793, "epoch": 0.736368909512761, "grad_norm": 0.026055755093693733, "grad_norm_var": 1.2896855720633764e-06, "learning_rate": 0.001681844687286103, "loss": 2.5502, "step": 20312 }, { "crossentropy": 2.248242139816284, "epoch": 0.736405162412993, "grad_norm": 0.025091545656323433, "grad_norm_var": 1.399044049922582e-06, "learning_rate": 0.0016814100211959204, "loss": 2.3036, "step": 20313 }, { "crossentropy": 2.5429582595825195, "epoch": 0.736441415313225, "grad_norm": 0.027183406054973602, "grad_norm_var": 1.4108677536949955e-06, "learning_rate": 0.0016809753999283334, "loss": 2.5478, "step": 20314 }, { "crossentropy": 2.5404675006866455, "epoch": 0.7364776682134571, "grad_norm": 0.025293031707406044, "grad_norm_var": 1.5293150641917479e-06, "learning_rate": 0.0016805408234892084, "loss": 2.4876, "step": 20315 }, { "crossentropy": 2.430184841156006, "epoch": 0.7365139211136891, "grad_norm": 0.025136269629001617, "grad_norm_var": 8.585739858146437e-07, "learning_rate": 0.0016801062918844183, "loss": 2.4841, "step": 20316 }, { "crossentropy": 2.5496246814727783, "epoch": 0.7365501740139211, "grad_norm": 0.026119550690054893, "grad_norm_var": 8.607186086047209e-07, "learning_rate": 0.0016796718051198302, "loss": 2.4777, "step": 20317 }, { "crossentropy": 2.418055772781372, "epoch": 0.7365864269141531, "grad_norm": 0.026009177789092064, "grad_norm_var": 8.552966044473217e-07, "learning_rate": 0.0016792373632013142, "loss": 2.4604, "step": 20318 }, { "crossentropy": 2.347238540649414, "epoch": 0.7366226798143851, "grad_norm": 0.02678295038640499, "grad_norm_var": 7.414858955080243e-07, "learning_rate": 0.001678802966134737, "loss": 2.4299, "step": 20319 }, { "crossentropy": 2.5243635177612305, "epoch": 0.7366589327146171, "grad_norm": 0.02560265362262726, "grad_norm_var": 7.457817303226898e-07, "learning_rate": 0.001678368613925964, "loss": 2.5025, "step": 20320 }, { "crossentropy": 2.436447858810425, "epoch": 0.7366951856148491, "grad_norm": 0.0265020914375782, "grad_norm_var": 7.423133283683172e-07, "learning_rate": 0.0016779343065808645, "loss": 2.4796, "step": 20321 }, { "crossentropy": 2.570897102355957, "epoch": 0.7367314385150812, "grad_norm": 0.02678387612104416, "grad_norm_var": 6.832293702121091e-07, "learning_rate": 0.0016775000441053046, "loss": 2.4916, "step": 20322 }, { "crossentropy": 2.437856435775757, "epoch": 0.7367676914153132, "grad_norm": 0.027111375704407692, "grad_norm_var": 7.294579720120426e-07, "learning_rate": 0.001677065826505147, "loss": 2.4843, "step": 20323 }, { "crossentropy": 2.5393378734588623, "epoch": 0.7368039443155452, "grad_norm": 0.028142789378762245, "grad_norm_var": 7.767969638965e-07, "learning_rate": 0.0016766316537862598, "loss": 2.5503, "step": 20324 }, { "crossentropy": 2.6185147762298584, "epoch": 0.7368401972157773, "grad_norm": 0.026316193863749504, "grad_norm_var": 7.020337120568095e-07, "learning_rate": 0.0016761975259545043, "loss": 2.5507, "step": 20325 }, { "crossentropy": 2.528402805328369, "epoch": 0.7368764501160093, "grad_norm": 0.027665505185723305, "grad_norm_var": 7.982203136862638e-07, "learning_rate": 0.0016757634430157465, "loss": 2.5014, "step": 20326 }, { "crossentropy": 2.6016156673431396, "epoch": 0.7369127030162413, "grad_norm": 0.02550383284687996, "grad_norm_var": 8.211769035500079e-07, "learning_rate": 0.0016753294049758484, "loss": 2.4235, "step": 20327 }, { "crossentropy": 2.5540969371795654, "epoch": 0.7369489559164734, "grad_norm": 0.027215858921408653, "grad_norm_var": 8.626782165737275e-07, "learning_rate": 0.00167489541184067, "loss": 2.577, "step": 20328 }, { "crossentropy": 2.4255247116088867, "epoch": 0.7369852088167054, "grad_norm": 0.027257295325398445, "grad_norm_var": 7.769099098419348e-07, "learning_rate": 0.001674461463616075, "loss": 2.4552, "step": 20329 }, { "crossentropy": 2.5793638229370117, "epoch": 0.7370214617169374, "grad_norm": 0.02616812475025654, "grad_norm_var": 7.541166040166223e-07, "learning_rate": 0.0016740275603079268, "loss": 2.5844, "step": 20330 }, { "crossentropy": 2.724815845489502, "epoch": 0.7370577146171694, "grad_norm": 0.027417780831456184, "grad_norm_var": 7.012377669805691e-07, "learning_rate": 0.0016735937019220816, "loss": 2.589, "step": 20331 }, { "crossentropy": 2.680798292160034, "epoch": 0.7370939675174014, "grad_norm": 0.02580953761935234, "grad_norm_var": 5.974114130360136e-07, "learning_rate": 0.0016731598884644034, "loss": 2.5539, "step": 20332 }, { "crossentropy": 2.4937007427215576, "epoch": 0.7371302204176334, "grad_norm": 0.026274925097823143, "grad_norm_var": 5.879200107855211e-07, "learning_rate": 0.001672726119940748, "loss": 2.5309, "step": 20333 }, { "crossentropy": 2.509655237197876, "epoch": 0.7371664733178654, "grad_norm": 0.027674414217472076, "grad_norm_var": 6.166751601785018e-07, "learning_rate": 0.0016722923963569754, "loss": 2.4615, "step": 20334 }, { "crossentropy": 2.585280179977417, "epoch": 0.7372027262180975, "grad_norm": 0.02772834524512291, "grad_norm_var": 6.748836125458941e-07, "learning_rate": 0.0016718587177189477, "loss": 2.5748, "step": 20335 }, { "crossentropy": 2.5167343616485596, "epoch": 0.7372389791183295, "grad_norm": 0.02876017615199089, "grad_norm_var": 7.840622363823226e-07, "learning_rate": 0.0016714250840325145, "loss": 2.5438, "step": 20336 }, { "crossentropy": 2.41683030128479, "epoch": 0.7372752320185615, "grad_norm": 0.02602771483361721, "grad_norm_var": 8.309325590695356e-07, "learning_rate": 0.0016709914953035377, "loss": 2.4252, "step": 20337 }, { "crossentropy": 2.602185010910034, "epoch": 0.7373114849187935, "grad_norm": 0.027380315586924553, "grad_norm_var": 8.366860519197812e-07, "learning_rate": 0.001670557951537874, "loss": 2.494, "step": 20338 }, { "crossentropy": 2.4005722999572754, "epoch": 0.7373477378190255, "grad_norm": 0.025864573195576668, "grad_norm_var": 9.200471960319936e-07, "learning_rate": 0.0016701244527413761, "loss": 2.464, "step": 20339 }, { "crossentropy": 2.507669448852539, "epoch": 0.7373839907192575, "grad_norm": 0.027937380596995354, "grad_norm_var": 8.900289601126843e-07, "learning_rate": 0.0016696909989199027, "loss": 2.5815, "step": 20340 }, { "crossentropy": 2.5044126510620117, "epoch": 0.7374202436194895, "grad_norm": 0.028089096769690514, "grad_norm_var": 9.395801092346254e-07, "learning_rate": 0.0016692575900793044, "loss": 2.459, "step": 20341 }, { "crossentropy": 2.589245080947876, "epoch": 0.7374564965197216, "grad_norm": 0.02693747915327549, "grad_norm_var": 9.12806887870413e-07, "learning_rate": 0.0016688242262254366, "loss": 2.5461, "step": 20342 }, { "crossentropy": 2.537055015563965, "epoch": 0.7374927494199536, "grad_norm": 0.027487969025969505, "grad_norm_var": 7.622687522447491e-07, "learning_rate": 0.0016683909073641567, "loss": 2.4574, "step": 20343 }, { "crossentropy": 2.4780776500701904, "epoch": 0.7375290023201856, "grad_norm": 0.027513310313224792, "grad_norm_var": 7.713252569002075e-07, "learning_rate": 0.0016679576335013096, "loss": 2.5104, "step": 20344 }, { "crossentropy": 2.593488931655884, "epoch": 0.7375652552204176, "grad_norm": 0.027201568707823753, "grad_norm_var": 7.706888882487989e-07, "learning_rate": 0.0016675244046427517, "loss": 2.6051, "step": 20345 }, { "crossentropy": 2.379426956176758, "epoch": 0.7376015081206496, "grad_norm": 0.02622099407017231, "grad_norm_var": 7.639981893751603e-07, "learning_rate": 0.0016670912207943356, "loss": 2.4581, "step": 20346 }, { "crossentropy": 2.728508234024048, "epoch": 0.7376377610208816, "grad_norm": 0.025384310632944107, "grad_norm_var": 9.485714430585342e-07, "learning_rate": 0.0016666580819619087, "loss": 2.5981, "step": 20347 }, { "crossentropy": 2.52614688873291, "epoch": 0.7376740139211136, "grad_norm": 0.027492210268974304, "grad_norm_var": 8.543492845013966e-07, "learning_rate": 0.0016662249881513247, "loss": 2.4326, "step": 20348 }, { "crossentropy": 2.6476049423217773, "epoch": 0.7377102668213457, "grad_norm": 0.02600591443479061, "grad_norm_var": 8.893062388904208e-07, "learning_rate": 0.0016657919393684296, "loss": 2.5457, "step": 20349 }, { "crossentropy": 2.615640640258789, "epoch": 0.7377465197215777, "grad_norm": 0.026984263211488724, "grad_norm_var": 8.668261702725332e-07, "learning_rate": 0.001665358935619074, "loss": 2.6653, "step": 20350 }, { "crossentropy": 2.554269313812256, "epoch": 0.7377827726218097, "grad_norm": 0.027157017961144447, "grad_norm_var": 8.365794035374105e-07, "learning_rate": 0.0016649259769091086, "loss": 2.5653, "step": 20351 }, { "crossentropy": 2.4628915786743164, "epoch": 0.7378190255220418, "grad_norm": 0.02791444957256317, "grad_norm_var": 6.859303099757217e-07, "learning_rate": 0.0016644930632443782, "loss": 2.4689, "step": 20352 }, { "crossentropy": 2.672562599182129, "epoch": 0.7378552784222738, "grad_norm": 0.02816005051136017, "grad_norm_var": 7.008102207499307e-07, "learning_rate": 0.00166406019463073, "loss": 2.6345, "step": 20353 }, { "crossentropy": 2.445934295654297, "epoch": 0.7378915313225058, "grad_norm": 0.026408091187477112, "grad_norm_var": 7.246097686373996e-07, "learning_rate": 0.0016636273710740124, "loss": 2.4875, "step": 20354 }, { "crossentropy": 2.430885076522827, "epoch": 0.7379277842227379, "grad_norm": 0.027722543105483055, "grad_norm_var": 6.473378194913794e-07, "learning_rate": 0.001663194592580069, "loss": 2.4197, "step": 20355 }, { "crossentropy": 2.5999560356140137, "epoch": 0.7379640371229699, "grad_norm": 0.027487067505717278, "grad_norm_var": 6.135489825546582e-07, "learning_rate": 0.0016627618591547482, "loss": 2.5549, "step": 20356 }, { "crossentropy": 2.4195034503936768, "epoch": 0.7380002900232019, "grad_norm": 0.026575937867164612, "grad_norm_var": 5.642386631878859e-07, "learning_rate": 0.0016623291708038911, "loss": 2.4644, "step": 20357 }, { "crossentropy": 2.718912363052368, "epoch": 0.7380365429234339, "grad_norm": 0.028618087992072105, "grad_norm_var": 7.176089980751958e-07, "learning_rate": 0.0016618965275333436, "loss": 2.6446, "step": 20358 }, { "crossentropy": 2.608933448791504, "epoch": 0.7380727958236659, "grad_norm": 0.027395576238632202, "grad_norm_var": 7.139280908765768e-07, "learning_rate": 0.0016614639293489508, "loss": 2.5364, "step": 20359 }, { "crossentropy": 2.6270973682403564, "epoch": 0.7381090487238979, "grad_norm": 0.02809947356581688, "grad_norm_var": 7.645715985682046e-07, "learning_rate": 0.0016610313762565542, "loss": 2.6154, "step": 20360 }, { "crossentropy": 2.520139217376709, "epoch": 0.73814530162413, "grad_norm": 0.02582881599664688, "grad_norm_var": 8.778019945035002e-07, "learning_rate": 0.0016605988682619943, "loss": 2.4214, "step": 20361 }, { "crossentropy": 2.5088446140289307, "epoch": 0.738181554524362, "grad_norm": 0.02607456035912037, "grad_norm_var": 8.961271384391271e-07, "learning_rate": 0.0016601664053711158, "loss": 2.5199, "step": 20362 }, { "crossentropy": 2.5507521629333496, "epoch": 0.738217807424594, "grad_norm": 0.025798168033361435, "grad_norm_var": 8.131643535270553e-07, "learning_rate": 0.0016597339875897572, "loss": 2.5661, "step": 20363 }, { "crossentropy": 2.5070977210998535, "epoch": 0.738254060324826, "grad_norm": 0.027234427630901337, "grad_norm_var": 8.04099498580522e-07, "learning_rate": 0.0016593016149237612, "loss": 2.461, "step": 20364 }, { "crossentropy": 2.6352524757385254, "epoch": 0.738290313225058, "grad_norm": 0.028205716982483864, "grad_norm_var": 7.881271885685932e-07, "learning_rate": 0.0016588692873789651, "loss": 2.6248, "step": 20365 }, { "crossentropy": 2.4747424125671387, "epoch": 0.73832656612529, "grad_norm": 0.026724563911557198, "grad_norm_var": 8.008173539291909e-07, "learning_rate": 0.0016584370049612096, "loss": 2.4628, "step": 20366 }, { "crossentropy": 2.4725942611694336, "epoch": 0.738362819025522, "grad_norm": 0.02625870332121849, "grad_norm_var": 8.579323547627226e-07, "learning_rate": 0.0016580047676763355, "loss": 2.464, "step": 20367 }, { "crossentropy": 2.467747449874878, "epoch": 0.738399071925754, "grad_norm": 0.02747219242155552, "grad_norm_var": 8.25470566561875e-07, "learning_rate": 0.0016575725755301786, "loss": 2.4775, "step": 20368 }, { "crossentropy": 2.533186197280884, "epoch": 0.7384353248259861, "grad_norm": 0.02790067531168461, "grad_norm_var": 7.940180434697537e-07, "learning_rate": 0.001657140428528574, "loss": 2.4748, "step": 20369 }, { "crossentropy": 2.509150505065918, "epoch": 0.7384715777262181, "grad_norm": 0.025712911039590836, "grad_norm_var": 8.895415515357159e-07, "learning_rate": 0.0016567083266773631, "loss": 2.4843, "step": 20370 }, { "crossentropy": 2.336693048477173, "epoch": 0.7385078306264501, "grad_norm": 0.026563437655568123, "grad_norm_var": 8.725608737334782e-07, "learning_rate": 0.001656276269982378, "loss": 2.3882, "step": 20371 }, { "crossentropy": 2.3446621894836426, "epoch": 0.7385440835266821, "grad_norm": 0.02747700922191143, "grad_norm_var": 8.719098238944382e-07, "learning_rate": 0.001655844258449458, "loss": 2.3981, "step": 20372 }, { "crossentropy": 2.4671077728271484, "epoch": 0.7385803364269141, "grad_norm": 0.02739548869431019, "grad_norm_var": 8.679580831805981e-07, "learning_rate": 0.0016554122920844345, "loss": 2.4635, "step": 20373 }, { "crossentropy": 2.4116737842559814, "epoch": 0.7386165893271461, "grad_norm": 0.02925417572259903, "grad_norm_var": 1.026451307512867e-06, "learning_rate": 0.0016549803708931438, "loss": 2.4566, "step": 20374 }, { "crossentropy": 2.5138509273529053, "epoch": 0.7386528422273781, "grad_norm": 0.029704762622714043, "grad_norm_var": 1.454655700140273e-06, "learning_rate": 0.0016545484948814204, "loss": 2.5615, "step": 20375 }, { "crossentropy": 2.5113813877105713, "epoch": 0.7386890951276102, "grad_norm": 0.02563536912202835, "grad_norm_var": 1.54899576151773e-06, "learning_rate": 0.0016541166640550954, "loss": 2.5402, "step": 20376 }, { "crossentropy": 2.5705366134643555, "epoch": 0.7387253480278422, "grad_norm": 0.027409879490733147, "grad_norm_var": 1.4419848268849728e-06, "learning_rate": 0.0016536848784200042, "loss": 2.5482, "step": 20377 }, { "crossentropy": 2.531677484512329, "epoch": 0.7387616009280742, "grad_norm": 0.026595722883939743, "grad_norm_var": 1.3823970283666538e-06, "learning_rate": 0.0016532531379819765, "loss": 2.499, "step": 20378 }, { "crossentropy": 2.552882194519043, "epoch": 0.7387978538283063, "grad_norm": 0.026431584730744362, "grad_norm_var": 1.2883246708981939e-06, "learning_rate": 0.0016528214427468425, "loss": 2.5553, "step": 20379 }, { "crossentropy": 2.3685507774353027, "epoch": 0.7388341067285383, "grad_norm": 0.02704596519470215, "grad_norm_var": 1.29089914149786e-06, "learning_rate": 0.0016523897927204362, "loss": 2.4567, "step": 20380 }, { "crossentropy": 2.570645332336426, "epoch": 0.7388703596287703, "grad_norm": 0.02712368592619896, "grad_norm_var": 1.2242813705382137e-06, "learning_rate": 0.0016519581879085843, "loss": 2.5068, "step": 20381 }, { "crossentropy": 2.4594836235046387, "epoch": 0.7389066125290024, "grad_norm": 0.028248166665434837, "grad_norm_var": 1.2790538226111476e-06, "learning_rate": 0.0016515266283171177, "loss": 2.5182, "step": 20382 }, { "crossentropy": 2.4975924491882324, "epoch": 0.7389428654292344, "grad_norm": 0.026525745168328285, "grad_norm_var": 1.2477038596315122e-06, "learning_rate": 0.0016510951139518677, "loss": 2.5266, "step": 20383 }, { "crossentropy": 2.608846664428711, "epoch": 0.7389791183294664, "grad_norm": 0.02758008800446987, "grad_norm_var": 1.251181265705427e-06, "learning_rate": 0.001650663644818658, "loss": 2.5003, "step": 20384 }, { "crossentropy": 2.4126718044281006, "epoch": 0.7390153712296984, "grad_norm": 0.02665579691529274, "grad_norm_var": 1.2463101648545513e-06, "learning_rate": 0.0016502322209233206, "loss": 2.3853, "step": 20385 }, { "crossentropy": 2.4577476978302, "epoch": 0.7390516241299304, "grad_norm": 0.026529556140303612, "grad_norm_var": 1.1249813780260984e-06, "learning_rate": 0.0016498008422716804, "loss": 2.4512, "step": 20386 }, { "crossentropy": 2.4351143836975098, "epoch": 0.7390878770301624, "grad_norm": 0.026204664260149002, "grad_norm_var": 1.1663964810053185e-06, "learning_rate": 0.001649369508869562, "loss": 2.4228, "step": 20387 }, { "crossentropy": 2.522463321685791, "epoch": 0.7391241299303944, "grad_norm": 0.025952167809009552, "grad_norm_var": 1.2632470859002519e-06, "learning_rate": 0.0016489382207227955, "loss": 2.5238, "step": 20388 }, { "crossentropy": 2.4806511402130127, "epoch": 0.7391603828306265, "grad_norm": 0.02612038142979145, "grad_norm_var": 1.3219902652039998e-06, "learning_rate": 0.0016485069778372013, "loss": 2.492, "step": 20389 }, { "crossentropy": 2.732694149017334, "epoch": 0.7391966357308585, "grad_norm": 0.026602279394865036, "grad_norm_var": 9.869700802818868e-07, "learning_rate": 0.0016480757802186069, "loss": 2.6348, "step": 20390 }, { "crossentropy": 2.58726167678833, "epoch": 0.7392328886310905, "grad_norm": 0.026860062032938004, "grad_norm_var": 4.281018195936009e-07, "learning_rate": 0.0016476446278728375, "loss": 2.5421, "step": 20391 }, { "crossentropy": 2.3828725814819336, "epoch": 0.7392691415313225, "grad_norm": 0.026347074657678604, "grad_norm_var": 3.5682796375468594e-07, "learning_rate": 0.0016472135208057126, "loss": 2.399, "step": 20392 }, { "crossentropy": 2.5356223583221436, "epoch": 0.7393053944315545, "grad_norm": 0.028444020077586174, "grad_norm_var": 5.126497298467281e-07, "learning_rate": 0.0016467824590230596, "loss": 2.5689, "step": 20393 }, { "crossentropy": 2.3628957271575928, "epoch": 0.7393416473317865, "grad_norm": 0.026644740253686905, "grad_norm_var": 5.11274071712447e-07, "learning_rate": 0.0016463514425306975, "loss": 2.3874, "step": 20394 }, { "crossentropy": 2.3039817810058594, "epoch": 0.7393779002320185, "grad_norm": 0.02735665626823902, "grad_norm_var": 5.153398656677766e-07, "learning_rate": 0.0016459204713344473, "loss": 2.4306, "step": 20395 }, { "crossentropy": 2.482801914215088, "epoch": 0.7394141531322506, "grad_norm": 0.026767874136567116, "grad_norm_var": 5.143927107431033e-07, "learning_rate": 0.0016454895454401325, "loss": 2.5371, "step": 20396 }, { "crossentropy": 2.5585343837738037, "epoch": 0.7394504060324826, "grad_norm": 0.026624079793691635, "grad_norm_var": 5.132728865430679e-07, "learning_rate": 0.0016450586648535704, "loss": 2.5129, "step": 20397 }, { "crossentropy": 2.2420766353607178, "epoch": 0.7394866589327146, "grad_norm": 0.027705905959010124, "grad_norm_var": 4.299438720382793e-07, "learning_rate": 0.0016446278295805827, "loss": 2.3583, "step": 20398 }, { "crossentropy": 2.4475157260894775, "epoch": 0.7395229118329466, "grad_norm": 0.02573714405298233, "grad_norm_var": 4.984449060038554e-07, "learning_rate": 0.0016441970396269894, "loss": 2.516, "step": 20399 }, { "crossentropy": 2.5580224990844727, "epoch": 0.7395591647331786, "grad_norm": 0.026739832013845444, "grad_norm_var": 4.5050132092295615e-07, "learning_rate": 0.001643766294998606, "loss": 2.5028, "step": 20400 }, { "crossentropy": 2.5525312423706055, "epoch": 0.7395954176334106, "grad_norm": 0.028268704190850258, "grad_norm_var": 6.023474013202064e-07, "learning_rate": 0.001643335595701254, "loss": 2.5139, "step": 20401 }, { "crossentropy": 2.451704978942871, "epoch": 0.7396316705336426, "grad_norm": 0.030726931989192963, "grad_norm_var": 1.5484385284393146e-06, "learning_rate": 0.0016429049417407487, "loss": 2.4537, "step": 20402 }, { "crossentropy": 2.612180233001709, "epoch": 0.7396679234338747, "grad_norm": 0.027605267241597176, "grad_norm_var": 1.5096492047334768e-06, "learning_rate": 0.0016424743331229048, "loss": 2.5853, "step": 20403 }, { "crossentropy": 2.504971742630005, "epoch": 0.7397041763341067, "grad_norm": 0.025831198319792747, "grad_norm_var": 1.529987913350718e-06, "learning_rate": 0.0016420437698535417, "loss": 2.5425, "step": 20404 }, { "crossentropy": 2.5995864868164062, "epoch": 0.7397404292343387, "grad_norm": 0.02684316597878933, "grad_norm_var": 1.4635208581322463e-06, "learning_rate": 0.0016416132519384724, "loss": 2.5358, "step": 20405 }, { "crossentropy": 2.4680745601654053, "epoch": 0.7397766821345708, "grad_norm": 0.026201317086815834, "grad_norm_var": 1.5052065147326398e-06, "learning_rate": 0.001641182779383512, "loss": 2.5355, "step": 20406 }, { "crossentropy": 2.447347402572632, "epoch": 0.7398129350348028, "grad_norm": 0.026613466441631317, "grad_norm_var": 1.5191647467715918e-06, "learning_rate": 0.0016407523521944779, "loss": 2.6041, "step": 20407 }, { "crossentropy": 2.478039264678955, "epoch": 0.7398491879350348, "grad_norm": 0.02772500365972519, "grad_norm_var": 1.4896573568772547e-06, "learning_rate": 0.0016403219703771787, "loss": 2.4563, "step": 20408 }, { "crossentropy": 2.4858415126800537, "epoch": 0.7398854408352669, "grad_norm": 0.027102064341306686, "grad_norm_var": 1.3867254853356173e-06, "learning_rate": 0.0016398916339374315, "loss": 2.5521, "step": 20409 }, { "crossentropy": 2.265207290649414, "epoch": 0.7399216937354989, "grad_norm": 0.026213428005576134, "grad_norm_var": 1.4277445364341367e-06, "learning_rate": 0.0016394613428810474, "loss": 2.4147, "step": 20410 }, { "crossentropy": 2.5049102306365967, "epoch": 0.7399579466357309, "grad_norm": 0.02670334465801716, "grad_norm_var": 1.4345791355071703e-06, "learning_rate": 0.0016390310972138355, "loss": 2.5148, "step": 20411 }, { "crossentropy": 2.4677164554595947, "epoch": 0.7399941995359629, "grad_norm": 0.027096694335341454, "grad_norm_var": 1.4272996312114043e-06, "learning_rate": 0.001638600896941611, "loss": 2.3663, "step": 20412 }, { "crossentropy": 2.5281739234924316, "epoch": 0.7400304524361949, "grad_norm": 0.026838582009077072, "grad_norm_var": 1.4163180030455584e-06, "learning_rate": 0.001638170742070181, "loss": 2.5718, "step": 20413 }, { "crossentropy": 2.5329809188842773, "epoch": 0.7400667053364269, "grad_norm": 0.02801852859556675, "grad_norm_var": 1.446765141823814e-06, "learning_rate": 0.0016377406326053563, "loss": 2.5737, "step": 20414 }, { "crossentropy": 2.5217227935791016, "epoch": 0.740102958236659, "grad_norm": 0.026312347501516342, "grad_norm_var": 1.3597352197319227e-06, "learning_rate": 0.0016373105685529488, "loss": 2.4681, "step": 20415 }, { "crossentropy": 2.590808629989624, "epoch": 0.740139211136891, "grad_norm": 0.027981653809547424, "grad_norm_var": 1.3836516656353839e-06, "learning_rate": 0.0016368805499187634, "loss": 2.5456, "step": 20416 }, { "crossentropy": 2.495520830154419, "epoch": 0.740175464037123, "grad_norm": 0.025853071361780167, "grad_norm_var": 1.421892689968766e-06, "learning_rate": 0.0016364505767086119, "loss": 2.4523, "step": 20417 }, { "crossentropy": 2.4125165939331055, "epoch": 0.740211716937355, "grad_norm": 0.026606526225805283, "grad_norm_var": 4.926792720807404e-07, "learning_rate": 0.0016360206489282976, "loss": 2.484, "step": 20418 }, { "crossentropy": 2.6793806552886963, "epoch": 0.740247969837587, "grad_norm": 0.02714761346578598, "grad_norm_var": 4.594756784052073e-07, "learning_rate": 0.0016355907665836311, "loss": 2.5571, "step": 20419 }, { "crossentropy": 2.4916646480560303, "epoch": 0.740284222737819, "grad_norm": 0.0262230783700943, "grad_norm_var": 4.1751273056420816e-07, "learning_rate": 0.0016351609296804172, "loss": 2.5024, "step": 20420 }, { "crossentropy": 2.3897812366485596, "epoch": 0.740320475638051, "grad_norm": 0.025605307891964912, "grad_norm_var": 5.131699249117754e-07, "learning_rate": 0.0016347311382244596, "loss": 2.4321, "step": 20421 }, { "crossentropy": 2.426254987716675, "epoch": 0.740356728538283, "grad_norm": 0.027679145336151123, "grad_norm_var": 5.385732691143835e-07, "learning_rate": 0.001634301392221565, "loss": 2.4793, "step": 20422 }, { "crossentropy": 2.304961681365967, "epoch": 0.7403929814385151, "grad_norm": 0.02776697464287281, "grad_norm_var": 5.842033482569842e-07, "learning_rate": 0.0016338716916775393, "loss": 2.3852, "step": 20423 }, { "crossentropy": 2.6008224487304688, "epoch": 0.7404292343387471, "grad_norm": 0.026479117572307587, "grad_norm_var": 5.490844286512587e-07, "learning_rate": 0.0016334420365981827, "loss": 2.5625, "step": 20424 }, { "crossentropy": 2.5576586723327637, "epoch": 0.7404654872389791, "grad_norm": 0.02570805698633194, "grad_norm_var": 6.240065600408735e-07, "learning_rate": 0.0016330124269893027, "loss": 2.4779, "step": 20425 }, { "crossentropy": 2.6319236755371094, "epoch": 0.7405017401392111, "grad_norm": 0.027091119438409805, "grad_norm_var": 6.076527042924897e-07, "learning_rate": 0.0016325828628566974, "loss": 2.5733, "step": 20426 }, { "crossentropy": 2.525387763977051, "epoch": 0.7405379930394431, "grad_norm": 0.025174062699079514, "grad_norm_var": 7.774955534241175e-07, "learning_rate": 0.001632153344206172, "loss": 2.4822, "step": 20427 }, { "crossentropy": 2.439927101135254, "epoch": 0.7405742459396751, "grad_norm": 0.027352405712008476, "grad_norm_var": 7.942937972788347e-07, "learning_rate": 0.0016317238710435273, "loss": 2.501, "step": 20428 }, { "crossentropy": 2.6025097370147705, "epoch": 0.7406104988399071, "grad_norm": 0.02703464776277542, "grad_norm_var": 7.992774843125494e-07, "learning_rate": 0.0016312944433745614, "loss": 2.586, "step": 20429 }, { "crossentropy": 2.485043525695801, "epoch": 0.7406467517401392, "grad_norm": 0.026695765554904938, "grad_norm_var": 6.852765036039482e-07, "learning_rate": 0.0016308650612050762, "loss": 2.4766, "step": 20430 }, { "crossentropy": 2.4896445274353027, "epoch": 0.7406830046403712, "grad_norm": 0.028555413708090782, "grad_norm_var": 8.929407364540362e-07, "learning_rate": 0.001630435724540873, "loss": 2.5334, "step": 20431 }, { "crossentropy": 2.5786499977111816, "epoch": 0.7407192575406032, "grad_norm": 0.02965731732547283, "grad_norm_var": 1.3302885899295666e-06, "learning_rate": 0.0016300064333877467, "loss": 2.5748, "step": 20432 }, { "crossentropy": 2.3952109813690186, "epoch": 0.7407555104408353, "grad_norm": 0.028920922428369522, "grad_norm_var": 1.484407220111528e-06, "learning_rate": 0.0016295771877514998, "loss": 2.3905, "step": 20433 }, { "crossentropy": 2.3786561489105225, "epoch": 0.7407917633410673, "grad_norm": 0.027781793847680092, "grad_norm_var": 1.492452432901893e-06, "learning_rate": 0.0016291479876379255, "loss": 2.4753, "step": 20434 }, { "crossentropy": 2.640669822692871, "epoch": 0.7408280162412993, "grad_norm": 0.027328690513968468, "grad_norm_var": 1.4937307622758886e-06, "learning_rate": 0.0016287188330528241, "loss": 2.6279, "step": 20435 }, { "crossentropy": 2.5624570846557617, "epoch": 0.7408642691415314, "grad_norm": 0.02862970158457756, "grad_norm_var": 1.5451745042526031e-06, "learning_rate": 0.0016282897240019933, "loss": 2.529, "step": 20436 }, { "crossentropy": 2.4220705032348633, "epoch": 0.7409005220417634, "grad_norm": 0.026009390130639076, "grad_norm_var": 1.4618497162115535e-06, "learning_rate": 0.0016278606604912239, "loss": 2.4206, "step": 20437 }, { "crossentropy": 2.7374377250671387, "epoch": 0.7409367749419954, "grad_norm": 0.02834406867623329, "grad_norm_var": 1.5171975204901807e-06, "learning_rate": 0.0016274316425263131, "loss": 2.5747, "step": 20438 }, { "crossentropy": 2.484919548034668, "epoch": 0.7409730278422274, "grad_norm": 0.025820821523666382, "grad_norm_var": 1.66079122845784e-06, "learning_rate": 0.001627002670113058, "loss": 2.4964, "step": 20439 }, { "crossentropy": 2.550039291381836, "epoch": 0.7410092807424594, "grad_norm": 0.027721229940652847, "grad_norm_var": 1.6235115904450829e-06, "learning_rate": 0.0016265737432572492, "loss": 2.5573, "step": 20440 }, { "crossentropy": 2.468459367752075, "epoch": 0.7410455336426914, "grad_norm": 0.02675788663327694, "grad_norm_var": 1.4605887576280557e-06, "learning_rate": 0.0016261448619646823, "loss": 2.4565, "step": 20441 }, { "crossentropy": 2.602242946624756, "epoch": 0.7410817865429234, "grad_norm": 0.026685679331421852, "grad_norm_var": 1.4891659578306995e-06, "learning_rate": 0.001625716026241148, "loss": 2.6271, "step": 20442 }, { "crossentropy": 2.516533136367798, "epoch": 0.7411180394431555, "grad_norm": 0.02901400811970234, "grad_norm_var": 1.2688425763630507e-06, "learning_rate": 0.0016252872360924392, "loss": 2.4721, "step": 20443 }, { "crossentropy": 2.7174971103668213, "epoch": 0.7411542923433875, "grad_norm": 0.026423074305057526, "grad_norm_var": 1.3589973023346117e-06, "learning_rate": 0.0016248584915243508, "loss": 2.6441, "step": 20444 }, { "crossentropy": 2.434026002883911, "epoch": 0.7411905452436195, "grad_norm": 0.02579803392291069, "grad_norm_var": 1.5455265982825331e-06, "learning_rate": 0.001624429792542667, "loss": 2.5267, "step": 20445 }, { "crossentropy": 2.5217065811157227, "epoch": 0.7412267981438515, "grad_norm": 0.027614517137408257, "grad_norm_var": 1.4986632865989033e-06, "learning_rate": 0.0016240011391531817, "loss": 2.5679, "step": 20446 }, { "crossentropy": 2.5786538124084473, "epoch": 0.7412630510440835, "grad_norm": 0.027383048087358475, "grad_norm_var": 1.4299692260334729e-06, "learning_rate": 0.0016235725313616862, "loss": 2.5154, "step": 20447 }, { "crossentropy": 2.628511905670166, "epoch": 0.7412993039443155, "grad_norm": 0.02768373303115368, "grad_norm_var": 1.1039164650762656e-06, "learning_rate": 0.0016231439691739658, "loss": 2.5391, "step": 20448 }, { "crossentropy": 2.483924150466919, "epoch": 0.7413355568445475, "grad_norm": 0.02653469890356064, "grad_norm_var": 9.662814146812448e-07, "learning_rate": 0.001622715452595812, "loss": 2.5781, "step": 20449 }, { "crossentropy": 2.4169788360595703, "epoch": 0.7413718097447796, "grad_norm": 0.025818180292844772, "grad_norm_var": 1.0603511985174252e-06, "learning_rate": 0.0016222869816330094, "loss": 2.5075, "step": 20450 }, { "crossentropy": 2.6525685787200928, "epoch": 0.7414080626450116, "grad_norm": 0.027286691591143608, "grad_norm_var": 1.059169175874813e-06, "learning_rate": 0.0016218585562913473, "loss": 2.6228, "step": 20451 }, { "crossentropy": 2.40960431098938, "epoch": 0.7414443155452436, "grad_norm": 0.025430934503674507, "grad_norm_var": 1.0442493522448577e-06, "learning_rate": 0.0016214301765766154, "loss": 2.44, "step": 20452 }, { "crossentropy": 2.4366884231567383, "epoch": 0.7414805684454756, "grad_norm": 0.029874825850129128, "grad_norm_var": 1.52147005009941e-06, "learning_rate": 0.0016210018424945927, "loss": 2.5384, "step": 20453 }, { "crossentropy": 2.3219215869903564, "epoch": 0.7415168213457076, "grad_norm": 0.026066817343235016, "grad_norm_var": 1.4790698701065234e-06, "learning_rate": 0.0016205735540510673, "loss": 2.4209, "step": 20454 }, { "crossentropy": 2.528367757797241, "epoch": 0.7415530742459396, "grad_norm": 0.026211272925138474, "grad_norm_var": 1.4274891229852533e-06, "learning_rate": 0.0016201453112518272, "loss": 2.4869, "step": 20455 }, { "crossentropy": 2.535309314727783, "epoch": 0.7415893271461717, "grad_norm": 0.02876807190477848, "grad_norm_var": 1.5939924941550983e-06, "learning_rate": 0.0016197171141026517, "loss": 2.5132, "step": 20456 }, { "crossentropy": 2.56109619140625, "epoch": 0.7416255800464037, "grad_norm": 0.032649654895067215, "grad_norm_var": 3.5069992846007797e-06, "learning_rate": 0.0016192889626093277, "loss": 2.5068, "step": 20457 }, { "crossentropy": 2.582977533340454, "epoch": 0.7416618329466357, "grad_norm": 0.025568729266524315, "grad_norm_var": 3.699203105419063e-06, "learning_rate": 0.0016188608567776353, "loss": 2.5054, "step": 20458 }, { "crossentropy": 2.4859676361083984, "epoch": 0.7416980858468677, "grad_norm": 0.026048988103866577, "grad_norm_var": 3.6038236567095442e-06, "learning_rate": 0.0016184327966133583, "loss": 2.4792, "step": 20459 }, { "crossentropy": 2.342525005340576, "epoch": 0.7417343387470998, "grad_norm": 0.025311285629868507, "grad_norm_var": 3.795889766558478e-06, "learning_rate": 0.0016180047821222794, "loss": 2.4389, "step": 20460 }, { "crossentropy": 2.3072140216827393, "epoch": 0.7417705916473318, "grad_norm": 0.02560023032128811, "grad_norm_var": 3.8334138806174565e-06, "learning_rate": 0.0016175768133101782, "loss": 2.3694, "step": 20461 }, { "crossentropy": 2.5960137844085693, "epoch": 0.7418068445475638, "grad_norm": 0.028457477688789368, "grad_norm_var": 3.933886338528675e-06, "learning_rate": 0.0016171488901828336, "loss": 2.5246, "step": 20462 }, { "crossentropy": 2.554503917694092, "epoch": 0.7418430974477959, "grad_norm": 0.026992086321115494, "grad_norm_var": 3.932251089424031e-06, "learning_rate": 0.0016167210127460286, "loss": 2.4972, "step": 20463 }, { "crossentropy": 2.3617966175079346, "epoch": 0.7418793503480279, "grad_norm": 0.027071602642536163, "grad_norm_var": 3.91161682690339e-06, "learning_rate": 0.0016162931810055393, "loss": 2.3957, "step": 20464 }, { "crossentropy": 2.543854236602783, "epoch": 0.7419156032482599, "grad_norm": 0.025973020121455193, "grad_norm_var": 3.9740987075799485e-06, "learning_rate": 0.001615865394967147, "loss": 2.5006, "step": 20465 }, { "crossentropy": 2.4932005405426025, "epoch": 0.7419518561484919, "grad_norm": 0.026069244369864464, "grad_norm_var": 3.936112704068734e-06, "learning_rate": 0.001615437654636626, "loss": 2.5186, "step": 20466 }, { "crossentropy": 2.5109572410583496, "epoch": 0.7419881090487239, "grad_norm": 0.026753472164273262, "grad_norm_var": 3.939636456753631e-06, "learning_rate": 0.0016150099600197565, "loss": 2.5194, "step": 20467 }, { "crossentropy": 2.4255471229553223, "epoch": 0.7420243619489559, "grad_norm": 0.026241732761263847, "grad_norm_var": 3.8053697851321422e-06, "learning_rate": 0.0016145823111223157, "loss": 2.3952, "step": 20468 }, { "crossentropy": 2.4475884437561035, "epoch": 0.742060614849188, "grad_norm": 0.02629043161869049, "grad_norm_var": 3.283967534333197e-06, "learning_rate": 0.001614154707950078, "loss": 2.4619, "step": 20469 }, { "crossentropy": 2.4672255516052246, "epoch": 0.74209686774942, "grad_norm": 0.026582280173897743, "grad_norm_var": 3.244710446334203e-06, "learning_rate": 0.001613727150508818, "loss": 2.4424, "step": 20470 }, { "crossentropy": 2.6198909282684326, "epoch": 0.742133120649652, "grad_norm": 0.02643309161067009, "grad_norm_var": 3.2270655574217206e-06, "learning_rate": 0.0016132996388043132, "loss": 2.5555, "step": 20471 }, { "crossentropy": 2.6526875495910645, "epoch": 0.742169373549884, "grad_norm": 0.027396012097597122, "grad_norm_var": 3.0076811713102813e-06, "learning_rate": 0.0016128721728423346, "loss": 2.617, "step": 20472 }, { "crossentropy": 2.507065773010254, "epoch": 0.742205626450116, "grad_norm": 0.02521161362528801, "grad_norm_var": 7.037584865254275e-07, "learning_rate": 0.001612444752628659, "loss": 2.4548, "step": 20473 }, { "crossentropy": 2.574124336242676, "epoch": 0.742241879350348, "grad_norm": 0.028073076158761978, "grad_norm_var": 8.264917577508925e-07, "learning_rate": 0.001612017378169056, "loss": 2.4959, "step": 20474 }, { "crossentropy": 2.4901394844055176, "epoch": 0.74227813225058, "grad_norm": 0.027225660160183907, "grad_norm_var": 8.373093420300029e-07, "learning_rate": 0.0016115900494693004, "loss": 2.4951, "step": 20475 }, { "crossentropy": 2.5029642581939697, "epoch": 0.742314385150812, "grad_norm": 0.026423050090670586, "grad_norm_var": 7.227650470315039e-07, "learning_rate": 0.0016111627665351647, "loss": 2.5531, "step": 20476 }, { "crossentropy": 2.481921911239624, "epoch": 0.7423506380510441, "grad_norm": 0.026378383859992027, "grad_norm_var": 6.49137162722635e-07, "learning_rate": 0.0016107355293724162, "loss": 2.5124, "step": 20477 }, { "crossentropy": 2.454658031463623, "epoch": 0.7423868909512761, "grad_norm": 0.025525419041514397, "grad_norm_var": 5.084724383072007e-07, "learning_rate": 0.0016103083379868305, "loss": 2.5493, "step": 20478 }, { "crossentropy": 2.6584811210632324, "epoch": 0.7424231438515081, "grad_norm": 0.027205657213926315, "grad_norm_var": 5.24196570402544e-07, "learning_rate": 0.0016098811923841738, "loss": 2.6061, "step": 20479 }, { "crossentropy": 2.588472843170166, "epoch": 0.7424593967517401, "grad_norm": 0.02496744506061077, "grad_norm_var": 6.555185757677861e-07, "learning_rate": 0.0016094540925702156, "loss": 2.4911, "step": 20480 }, { "crossentropy": 2.458514451980591, "epoch": 0.7424956496519721, "grad_norm": 0.026113631203770638, "grad_norm_var": 6.483395772809616e-07, "learning_rate": 0.001609027038550726, "loss": 2.5245, "step": 20481 }, { "crossentropy": 2.5459210872650146, "epoch": 0.7425319025522041, "grad_norm": 0.02739209681749344, "grad_norm_var": 6.939680880846718e-07, "learning_rate": 0.0016086000303314712, "loss": 2.6466, "step": 20482 }, { "crossentropy": 2.470574140548706, "epoch": 0.7425681554524362, "grad_norm": 0.027813250198960304, "grad_norm_var": 7.9809866514352e-07, "learning_rate": 0.0016081730679182193, "loss": 2.3993, "step": 20483 }, { "crossentropy": 2.45050311088562, "epoch": 0.7426044083526682, "grad_norm": 0.02603759616613388, "grad_norm_var": 8.098979848523864e-07, "learning_rate": 0.001607746151316739, "loss": 2.501, "step": 20484 }, { "crossentropy": 2.453531265258789, "epoch": 0.7426406612529002, "grad_norm": 0.0258119348436594, "grad_norm_var": 8.418396981816108e-07, "learning_rate": 0.0016073192805327935, "loss": 2.3745, "step": 20485 }, { "crossentropy": 2.437743902206421, "epoch": 0.7426769141531323, "grad_norm": 0.026331458240747452, "grad_norm_var": 8.442536085485896e-07, "learning_rate": 0.001606892455572151, "loss": 2.4445, "step": 20486 }, { "crossentropy": 2.618086814880371, "epoch": 0.7427131670533643, "grad_norm": 0.02602543868124485, "grad_norm_var": 8.594295337626395e-07, "learning_rate": 0.001606465676440575, "loss": 2.5261, "step": 20487 }, { "crossentropy": 2.5356249809265137, "epoch": 0.7427494199535963, "grad_norm": 0.02566220425069332, "grad_norm_var": 8.391886094566887e-07, "learning_rate": 0.0016060389431438288, "loss": 2.5349, "step": 20488 }, { "crossentropy": 2.5609147548675537, "epoch": 0.7427856728538283, "grad_norm": 0.026669058948755264, "grad_norm_var": 7.434677581801459e-07, "learning_rate": 0.0016056122556876785, "loss": 2.4962, "step": 20489 }, { "crossentropy": 2.452794313430786, "epoch": 0.7428219257540604, "grad_norm": 0.026362424716353416, "grad_norm_var": 5.626523029343544e-07, "learning_rate": 0.0016051856140778843, "loss": 2.4494, "step": 20490 }, { "crossentropy": 2.465287208557129, "epoch": 0.7428581786542924, "grad_norm": 0.02688736282289028, "grad_norm_var": 5.312791067536935e-07, "learning_rate": 0.0016047590183202098, "loss": 2.4045, "step": 20491 }, { "crossentropy": 2.4509525299072266, "epoch": 0.7428944315545244, "grad_norm": 0.026175905019044876, "grad_norm_var": 5.327026597792249e-07, "learning_rate": 0.0016043324684204196, "loss": 2.492, "step": 20492 }, { "crossentropy": 2.4520201683044434, "epoch": 0.7429306844547564, "grad_norm": 0.02611774578690529, "grad_norm_var": 5.354391659665442e-07, "learning_rate": 0.0016039059643842702, "loss": 2.5514, "step": 20493 }, { "crossentropy": 2.514568328857422, "epoch": 0.7429669373549884, "grad_norm": 0.025839949026703835, "grad_norm_var": 5.083556456292413e-07, "learning_rate": 0.0016034795062175268, "loss": 2.4213, "step": 20494 }, { "crossentropy": 2.4684934616088867, "epoch": 0.7430031902552204, "grad_norm": 0.02650929056107998, "grad_norm_var": 4.581324841291627e-07, "learning_rate": 0.0016030530939259463, "loss": 2.4847, "step": 20495 }, { "crossentropy": 2.5249907970428467, "epoch": 0.7430394431554525, "grad_norm": 0.02672472968697548, "grad_norm_var": 3.401302843614723e-07, "learning_rate": 0.0016026267275152878, "loss": 2.5545, "step": 20496 }, { "crossentropy": 2.5299198627471924, "epoch": 0.7430756960556845, "grad_norm": 0.026752928271889687, "grad_norm_var": 3.4086947319372263e-07, "learning_rate": 0.0016022004069913126, "loss": 2.4811, "step": 20497 }, { "crossentropy": 2.4408485889434814, "epoch": 0.7431119489559165, "grad_norm": 0.02844974771142006, "grad_norm_var": 5.44401667228753e-07, "learning_rate": 0.0016017741323597757, "loss": 2.4724, "step": 20498 }, { "crossentropy": 2.4212429523468018, "epoch": 0.7431482018561485, "grad_norm": 0.026148956269025803, "grad_norm_var": 4.28472863427988e-07, "learning_rate": 0.0016013479036264355, "loss": 2.4634, "step": 20499 }, { "crossentropy": 2.5965442657470703, "epoch": 0.7431844547563805, "grad_norm": 0.03082374855875969, "grad_norm_var": 1.6246500298281822e-06, "learning_rate": 0.0016009217207970518, "loss": 2.5674, "step": 20500 }, { "crossentropy": 2.51043701171875, "epoch": 0.7432207076566125, "grad_norm": 0.027941405773162842, "grad_norm_var": 1.6542693036794652e-06, "learning_rate": 0.0016004955838773765, "loss": 2.5172, "step": 20501 }, { "crossentropy": 2.4353878498077393, "epoch": 0.7432569605568445, "grad_norm": 0.02594178169965744, "grad_norm_var": 1.69012472988488e-06, "learning_rate": 0.0016000694928731691, "loss": 2.4559, "step": 20502 }, { "crossentropy": 2.560317039489746, "epoch": 0.7432932134570766, "grad_norm": 0.026627255603671074, "grad_norm_var": 1.6494417456126235e-06, "learning_rate": 0.0015996434477901827, "loss": 2.4973, "step": 20503 }, { "crossentropy": 2.596597909927368, "epoch": 0.7433294663573086, "grad_norm": 0.026493072509765625, "grad_norm_var": 1.5607623817843105e-06, "learning_rate": 0.0015992174486341704, "loss": 2.5505, "step": 20504 }, { "crossentropy": 2.5258474349975586, "epoch": 0.7433657192575406, "grad_norm": 0.026469355449080467, "grad_norm_var": 1.5695130496999234e-06, "learning_rate": 0.001598791495410889, "loss": 2.5539, "step": 20505 }, { "crossentropy": 2.4850962162017822, "epoch": 0.7434019721577726, "grad_norm": 0.025571737438440323, "grad_norm_var": 1.6643758783521182e-06, "learning_rate": 0.0015983655881260878, "loss": 2.4951, "step": 20506 }, { "crossentropy": 2.5025274753570557, "epoch": 0.7434382250580046, "grad_norm": 0.0261974073946476, "grad_norm_var": 1.6899722630228277e-06, "learning_rate": 0.0015979397267855218, "loss": 2.4861, "step": 20507 }, { "crossentropy": 2.540301561355591, "epoch": 0.7434744779582366, "grad_norm": 0.026925791054964066, "grad_norm_var": 1.6628114458802257e-06, "learning_rate": 0.0015975139113949444, "loss": 2.5748, "step": 20508 }, { "crossentropy": 2.501760959625244, "epoch": 0.7435107308584686, "grad_norm": 0.0269070565700531, "grad_norm_var": 1.6251143619259394e-06, "learning_rate": 0.001597088141960103, "loss": 2.5709, "step": 20509 }, { "crossentropy": 2.3995699882507324, "epoch": 0.7435469837587007, "grad_norm": 0.027377862483263016, "grad_norm_var": 1.5565403615233042e-06, "learning_rate": 0.0015966624184867524, "loss": 2.4609, "step": 20510 }, { "crossentropy": 2.4775238037109375, "epoch": 0.7435832366589327, "grad_norm": 0.026954401284456253, "grad_norm_var": 1.5403118187906716e-06, "learning_rate": 0.0015962367409806406, "loss": 2.4772, "step": 20511 }, { "crossentropy": 2.407785415649414, "epoch": 0.7436194895591647, "grad_norm": 0.02679140865802765, "grad_norm_var": 1.537971681136183e-06, "learning_rate": 0.0015958111094475152, "loss": 2.4124, "step": 20512 }, { "crossentropy": 2.566141128540039, "epoch": 0.7436557424593968, "grad_norm": 0.02627747133374214, "grad_norm_var": 1.5692448272324373e-06, "learning_rate": 0.0015953855238931263, "loss": 2.5667, "step": 20513 }, { "crossentropy": 2.4688620567321777, "epoch": 0.7436919953596288, "grad_norm": 0.02601487748324871, "grad_norm_var": 1.4670619091227405e-06, "learning_rate": 0.0015949599843232249, "loss": 2.4243, "step": 20514 }, { "crossentropy": 2.5664408206939697, "epoch": 0.7437282482598608, "grad_norm": 0.02775699645280838, "grad_norm_var": 1.4801943996864958e-06, "learning_rate": 0.001594534490743555, "loss": 2.5716, "step": 20515 }, { "crossentropy": 2.443234920501709, "epoch": 0.7437645011600929, "grad_norm": 0.02711949311196804, "grad_norm_var": 4.20578814140186e-07, "learning_rate": 0.0015941090431598655, "loss": 2.397, "step": 20516 }, { "crossentropy": 2.5104594230651855, "epoch": 0.7438007540603249, "grad_norm": 0.026796823367476463, "grad_norm_var": 3.1460240584948025e-07, "learning_rate": 0.001593683641577901, "loss": 2.4919, "step": 20517 }, { "crossentropy": 2.4559314250946045, "epoch": 0.7438370069605569, "grad_norm": 0.027754658833146095, "grad_norm_var": 3.5149889404973804e-07, "learning_rate": 0.0015932582860034078, "loss": 2.5187, "step": 20518 }, { "crossentropy": 2.3999674320220947, "epoch": 0.7438732598607889, "grad_norm": 0.02522333711385727, "grad_norm_var": 4.980793098391707e-07, "learning_rate": 0.0015928329764421329, "loss": 2.426, "step": 20519 }, { "crossentropy": 2.4513707160949707, "epoch": 0.7439095127610209, "grad_norm": 0.026261068880558014, "grad_norm_var": 5.067458394388079e-07, "learning_rate": 0.0015924077128998194, "loss": 2.4406, "step": 20520 }, { "crossentropy": 2.5596704483032227, "epoch": 0.7439457656612529, "grad_norm": 0.02622498758137226, "grad_norm_var": 5.163633822986008e-07, "learning_rate": 0.0015919824953822092, "loss": 2.5623, "step": 20521 }, { "crossentropy": 2.5729970932006836, "epoch": 0.7439820185614849, "grad_norm": 0.02567727118730545, "grad_norm_var": 5.021021873965187e-07, "learning_rate": 0.0015915573238950492, "loss": 2.5662, "step": 20522 }, { "crossentropy": 2.6039700508117676, "epoch": 0.744018271461717, "grad_norm": 0.026560263708233833, "grad_norm_var": 4.888549924376058e-07, "learning_rate": 0.0015911321984440785, "loss": 2.589, "step": 20523 }, { "crossentropy": 2.5510354042053223, "epoch": 0.744054524361949, "grad_norm": 0.025870949029922485, "grad_norm_var": 5.215764330231016e-07, "learning_rate": 0.001590707119035042, "loss": 2.4955, "step": 20524 }, { "crossentropy": 2.5429136753082275, "epoch": 0.744090777262181, "grad_norm": 0.026307672262191772, "grad_norm_var": 5.193357560013985e-07, "learning_rate": 0.0015902820856736777, "loss": 2.5041, "step": 20525 }, { "crossentropy": 2.586963176727295, "epoch": 0.744127030162413, "grad_norm": 0.026508428156375885, "grad_norm_var": 4.718392868097628e-07, "learning_rate": 0.0015898570983657284, "loss": 2.5274, "step": 20526 }, { "crossentropy": 2.5660552978515625, "epoch": 0.744163283062645, "grad_norm": 0.02712361328303814, "grad_norm_var": 4.837396876055614e-07, "learning_rate": 0.0015894321571169362, "loss": 2.5035, "step": 20527 }, { "crossentropy": 2.551966905593872, "epoch": 0.744199535962877, "grad_norm": 0.026442455127835274, "grad_norm_var": 4.785749816641275e-07, "learning_rate": 0.0015890072619330381, "loss": 2.4972, "step": 20528 }, { "crossentropy": 2.5088369846343994, "epoch": 0.744235788863109, "grad_norm": 0.02544781006872654, "grad_norm_var": 5.456619796505388e-07, "learning_rate": 0.0015885824128197713, "loss": 2.556, "step": 20529 }, { "crossentropy": 2.571499824523926, "epoch": 0.744272041763341, "grad_norm": 0.0267331525683403, "grad_norm_var": 5.368894299838274e-07, "learning_rate": 0.0015881576097828783, "loss": 2.4698, "step": 20530 }, { "crossentropy": 2.4273018836975098, "epoch": 0.7443082946635731, "grad_norm": 0.028093785047531128, "grad_norm_var": 6.00960308271871e-07, "learning_rate": 0.0015877328528280921, "loss": 2.4556, "step": 20531 }, { "crossentropy": 2.648115873336792, "epoch": 0.7443445475638051, "grad_norm": 0.027051188051700592, "grad_norm_var": 5.956929450434545e-07, "learning_rate": 0.001587308141961154, "loss": 2.6126, "step": 20532 }, { "crossentropy": 2.4866602420806885, "epoch": 0.7443808004640371, "grad_norm": 0.02692178450524807, "grad_norm_var": 6.015337516526575e-07, "learning_rate": 0.0015868834771877972, "loss": 2.5252, "step": 20533 }, { "crossentropy": 2.477743148803711, "epoch": 0.7444170533642691, "grad_norm": 0.02576170302927494, "grad_norm_var": 5.197403975154876e-07, "learning_rate": 0.001586458858513758, "loss": 2.539, "step": 20534 }, { "crossentropy": 2.457646608352661, "epoch": 0.7444533062645011, "grad_norm": 0.02675587683916092, "grad_norm_var": 4.285283859817157e-07, "learning_rate": 0.001586034285944774, "loss": 2.4904, "step": 20535 }, { "crossentropy": 2.5446739196777344, "epoch": 0.7444895591647331, "grad_norm": 0.026772886514663696, "grad_norm_var": 4.2969586340721397e-07, "learning_rate": 0.0015856097594865765, "loss": 2.5088, "step": 20536 }, { "crossentropy": 2.2688539028167725, "epoch": 0.7445258120649652, "grad_norm": 0.02541310526430607, "grad_norm_var": 5.023805955857296e-07, "learning_rate": 0.0015851852791449017, "loss": 2.389, "step": 20537 }, { "crossentropy": 2.5476558208465576, "epoch": 0.7445620649651972, "grad_norm": 0.02795371785759926, "grad_norm_var": 5.871354440699911e-07, "learning_rate": 0.0015847608449254824, "loss": 2.5225, "step": 20538 }, { "crossentropy": 2.52133846282959, "epoch": 0.7445983178654292, "grad_norm": 0.026880856603384018, "grad_norm_var": 5.915443303522797e-07, "learning_rate": 0.001584336456834049, "loss": 2.4628, "step": 20539 }, { "crossentropy": 2.5409748554229736, "epoch": 0.7446345707656613, "grad_norm": 0.026405636221170425, "grad_norm_var": 5.554812575379875e-07, "learning_rate": 0.001583912114876337, "loss": 2.5087, "step": 20540 }, { "crossentropy": 2.560842514038086, "epoch": 0.7446708236658933, "grad_norm": 0.027037015184760094, "grad_norm_var": 5.543821177805223e-07, "learning_rate": 0.0015834878190580737, "loss": 2.5523, "step": 20541 }, { "crossentropy": 2.4140737056732178, "epoch": 0.7447070765661253, "grad_norm": 0.026605093851685524, "grad_norm_var": 5.524140274992817e-07, "learning_rate": 0.0015830635693849927, "loss": 2.3985, "step": 20542 }, { "crossentropy": 2.386775255203247, "epoch": 0.7447433294663574, "grad_norm": 0.026587972417473793, "grad_norm_var": 5.409833352902511e-07, "learning_rate": 0.0015826393658628251, "loss": 2.3605, "step": 20543 }, { "crossentropy": 2.582073450088501, "epoch": 0.7447795823665894, "grad_norm": 0.026943709701299667, "grad_norm_var": 5.408774544240671e-07, "learning_rate": 0.0015822152084972974, "loss": 2.4725, "step": 20544 }, { "crossentropy": 2.4563040733337402, "epoch": 0.7448158352668214, "grad_norm": 0.025610921904444695, "grad_norm_var": 5.150826852180392e-07, "learning_rate": 0.0015817910972941412, "loss": 2.4252, "step": 20545 }, { "crossentropy": 2.5016307830810547, "epoch": 0.7448520881670534, "grad_norm": 0.027305787429213524, "grad_norm_var": 5.365412078644586e-07, "learning_rate": 0.001581367032259084, "loss": 2.4488, "step": 20546 }, { "crossentropy": 2.5514066219329834, "epoch": 0.7448883410672854, "grad_norm": 0.026263196021318436, "grad_norm_var": 4.195344934935022e-07, "learning_rate": 0.0015809430133978508, "loss": 2.4631, "step": 20547 }, { "crossentropy": 2.3618216514587402, "epoch": 0.7449245939675174, "grad_norm": 0.026962339878082275, "grad_norm_var": 4.1517930696958333e-07, "learning_rate": 0.001580519040716173, "loss": 2.3877, "step": 20548 }, { "crossentropy": 2.5864834785461426, "epoch": 0.7449608468677494, "grad_norm": 0.027130890637636185, "grad_norm_var": 4.258702846581756e-07, "learning_rate": 0.0015800951142197729, "loss": 2.5404, "step": 20549 }, { "crossentropy": 2.515254020690918, "epoch": 0.7449970997679815, "grad_norm": 0.026276595890522003, "grad_norm_var": 3.814961123537371e-07, "learning_rate": 0.0015796712339143783, "loss": 2.4343, "step": 20550 }, { "crossentropy": 2.498269557952881, "epoch": 0.7450333526682135, "grad_norm": 0.026557570323348045, "grad_norm_var": 3.81990013998074e-07, "learning_rate": 0.001579247399805716, "loss": 2.4785, "step": 20551 }, { "crossentropy": 2.6552820205688477, "epoch": 0.7450696055684455, "grad_norm": 0.02693798393011093, "grad_norm_var": 3.8597590469670724e-07, "learning_rate": 0.001578823611899507, "loss": 2.5628, "step": 20552 }, { "crossentropy": 2.400845766067505, "epoch": 0.7451058584686775, "grad_norm": 0.025983937084674835, "grad_norm_var": 3.099531401447193e-07, "learning_rate": 0.0015783998702014784, "loss": 2.4385, "step": 20553 }, { "crossentropy": 2.5085837841033936, "epoch": 0.7451421113689095, "grad_norm": 0.02721356973052025, "grad_norm_var": 2.2196710739065772e-07, "learning_rate": 0.0015779761747173522, "loss": 2.6073, "step": 20554 }, { "crossentropy": 2.334268569946289, "epoch": 0.7451783642691415, "grad_norm": 0.026485204696655273, "grad_norm_var": 2.2057164461899056e-07, "learning_rate": 0.0015775525254528488, "loss": 2.4085, "step": 20555 }, { "crossentropy": 2.5485496520996094, "epoch": 0.7452146171693735, "grad_norm": 0.025993002578616142, "grad_norm_var": 2.4433933446404006e-07, "learning_rate": 0.0015771289224136942, "loss": 2.5015, "step": 20556 }, { "crossentropy": 2.5263843536376953, "epoch": 0.7452508700696056, "grad_norm": 0.026753505691885948, "grad_norm_var": 2.3353967704985562e-07, "learning_rate": 0.0015767053656056062, "loss": 2.5314, "step": 20557 }, { "crossentropy": 2.430265188217163, "epoch": 0.7452871229698376, "grad_norm": 0.026084264740347862, "grad_norm_var": 2.50188841025807e-07, "learning_rate": 0.0015762818550343078, "loss": 2.386, "step": 20558 }, { "crossentropy": 2.236238956451416, "epoch": 0.7453233758700696, "grad_norm": 0.026572074741125107, "grad_norm_var": 2.501626266251481e-07, "learning_rate": 0.0015758583907055197, "loss": 2.324, "step": 20559 }, { "crossentropy": 2.572702646255493, "epoch": 0.7453596287703016, "grad_norm": 0.026946302503347397, "grad_norm_var": 2.5029322274240187e-07, "learning_rate": 0.001575434972624959, "loss": 2.5095, "step": 20560 }, { "crossentropy": 2.42382550239563, "epoch": 0.7453958816705336, "grad_norm": 0.02598433755338192, "grad_norm_var": 2.1139022048782818e-07, "learning_rate": 0.0015750116007983473, "loss": 2.4566, "step": 20561 }, { "crossentropy": 2.605128288269043, "epoch": 0.7454321345707656, "grad_norm": 0.02646021544933319, "grad_norm_var": 1.754516808268134e-07, "learning_rate": 0.0015745882752314017, "loss": 2.5044, "step": 20562 }, { "crossentropy": 2.3707385063171387, "epoch": 0.7454683874709976, "grad_norm": 0.026249993592500687, "grad_norm_var": 1.759459878290879e-07, "learning_rate": 0.0015741649959298382, "loss": 2.3779, "step": 20563 }, { "crossentropy": 2.4451828002929688, "epoch": 0.7455046403712297, "grad_norm": 0.02670496702194214, "grad_norm_var": 1.6548945928790357e-07, "learning_rate": 0.0015737417628993767, "loss": 2.5004, "step": 20564 }, { "crossentropy": 2.4258058071136475, "epoch": 0.7455408932714617, "grad_norm": 0.025739887729287148, "grad_norm_var": 1.7328704806594358e-07, "learning_rate": 0.0015733185761457307, "loss": 2.4733, "step": 20565 }, { "crossentropy": 2.526787757873535, "epoch": 0.7455771461716937, "grad_norm": 0.02754291705787182, "grad_norm_var": 2.46939766860539e-07, "learning_rate": 0.0015728954356746172, "loss": 2.5796, "step": 20566 }, { "crossentropy": 2.506810188293457, "epoch": 0.7456133990719258, "grad_norm": 0.02648748643696308, "grad_norm_var": 2.468312760695353e-07, "learning_rate": 0.001572472341491754, "loss": 2.4101, "step": 20567 }, { "crossentropy": 2.4247043132781982, "epoch": 0.7456496519721578, "grad_norm": 0.026180006563663483, "grad_norm_var": 2.3935722701557264e-07, "learning_rate": 0.0015720492936028513, "loss": 2.5455, "step": 20568 }, { "crossentropy": 2.429879665374756, "epoch": 0.7456859048723898, "grad_norm": 0.026345381513237953, "grad_norm_var": 2.2451437118638461e-07, "learning_rate": 0.0015716262920136264, "loss": 2.5095, "step": 20569 }, { "crossentropy": 2.625443696975708, "epoch": 0.7457221577726219, "grad_norm": 0.027234576642513275, "grad_norm_var": 2.2658557392030154e-07, "learning_rate": 0.0015712033367297912, "loss": 2.5969, "step": 20570 }, { "crossentropy": 2.4169669151306152, "epoch": 0.7457584106728539, "grad_norm": 0.027920786291360855, "grad_norm_var": 3.553813199908343e-07, "learning_rate": 0.001570780427757057, "loss": 2.4396, "step": 20571 }, { "crossentropy": 2.4815287590026855, "epoch": 0.7457946635730859, "grad_norm": 0.026332149282097816, "grad_norm_var": 3.3625326341564975e-07, "learning_rate": 0.0015703575651011387, "loss": 2.3654, "step": 20572 }, { "crossentropy": 2.368947982788086, "epoch": 0.7458309164733179, "grad_norm": 0.0268967654556036, "grad_norm_var": 3.4054113139484846e-07, "learning_rate": 0.0015699347487677445, "loss": 2.4342, "step": 20573 }, { "crossentropy": 2.382916212081909, "epoch": 0.7458671693735499, "grad_norm": 0.026834961026906967, "grad_norm_var": 3.236276026181145e-07, "learning_rate": 0.0015695119787625866, "loss": 2.4857, "step": 20574 }, { "crossentropy": 2.4587953090667725, "epoch": 0.7459034222737819, "grad_norm": 0.026414981111884117, "grad_norm_var": 3.268451618955911e-07, "learning_rate": 0.0015690892550913771, "loss": 2.3441, "step": 20575 }, { "crossentropy": 2.476125717163086, "epoch": 0.7459396751740139, "grad_norm": 0.027238773182034492, "grad_norm_var": 3.4404890850123156e-07, "learning_rate": 0.0015686665777598224, "loss": 2.4852, "step": 20576 }, { "crossentropy": 2.4560701847076416, "epoch": 0.745975928074246, "grad_norm": 0.028625059872865677, "grad_norm_var": 5.418089134706071e-07, "learning_rate": 0.0015682439467736343, "loss": 2.4696, "step": 20577 }, { "crossentropy": 2.545766830444336, "epoch": 0.746012180974478, "grad_norm": 0.026212070137262344, "grad_norm_var": 5.577451174157884e-07, "learning_rate": 0.0015678213621385178, "loss": 2.5727, "step": 20578 }, { "crossentropy": 2.5279948711395264, "epoch": 0.74604843387471, "grad_norm": 0.02629280276596546, "grad_norm_var": 5.546629294746747e-07, "learning_rate": 0.0015673988238601839, "loss": 2.5023, "step": 20579 }, { "crossentropy": 2.400841236114502, "epoch": 0.746084686774942, "grad_norm": 0.02588893100619316, "grad_norm_var": 6.080069988529617e-07, "learning_rate": 0.001566976331944338, "loss": 2.4863, "step": 20580 }, { "crossentropy": 2.5307817459106445, "epoch": 0.746120939675174, "grad_norm": 0.027875248342752457, "grad_norm_var": 6.020613042107339e-07, "learning_rate": 0.0015665538863966855, "loss": 2.4736, "step": 20581 }, { "crossentropy": 2.5552468299865723, "epoch": 0.746157192575406, "grad_norm": 0.025989271700382233, "grad_norm_var": 6.187443991277717e-07, "learning_rate": 0.0015661314872229327, "loss": 2.4458, "step": 20582 }, { "crossentropy": 2.5071539878845215, "epoch": 0.746193445475638, "grad_norm": 0.0294159147888422, "grad_norm_var": 1.033452420871406e-06, "learning_rate": 0.001565709134428787, "loss": 2.4967, "step": 20583 }, { "crossentropy": 2.4739277362823486, "epoch": 0.74622969837587, "grad_norm": 0.026847705245018005, "grad_norm_var": 9.89997285589479e-07, "learning_rate": 0.0015652868280199494, "loss": 2.5382, "step": 20584 }, { "crossentropy": 2.5767033100128174, "epoch": 0.7462659512761021, "grad_norm": 0.026089880615472794, "grad_norm_var": 1.017156032197264e-06, "learning_rate": 0.0015648645680021273, "loss": 2.6193, "step": 20585 }, { "crossentropy": 2.400949478149414, "epoch": 0.7463022041763341, "grad_norm": 0.02620045095682144, "grad_norm_var": 1.052597189814264e-06, "learning_rate": 0.001564442354381021, "loss": 2.4639, "step": 20586 }, { "crossentropy": 2.303490400314331, "epoch": 0.7463384570765661, "grad_norm": 0.026725243777036667, "grad_norm_var": 9.859431040706394e-07, "learning_rate": 0.0015640201871623355, "loss": 2.3459, "step": 20587 }, { "crossentropy": 2.5818684101104736, "epoch": 0.7463747099767981, "grad_norm": 0.02832946740090847, "grad_norm_var": 1.0927008416904023e-06, "learning_rate": 0.0015635980663517718, "loss": 2.5166, "step": 20588 }, { "crossentropy": 2.3561911582946777, "epoch": 0.7464109628770301, "grad_norm": 0.02672293782234192, "grad_norm_var": 1.0968046040685097e-06, "learning_rate": 0.0015631759919550292, "loss": 2.3767, "step": 20589 }, { "crossentropy": 2.3957602977752686, "epoch": 0.7464472157772621, "grad_norm": 0.027334341779351234, "grad_norm_var": 1.1026350088579861e-06, "learning_rate": 0.0015627539639778104, "loss": 2.4173, "step": 20590 }, { "crossentropy": 2.5288543701171875, "epoch": 0.7464834686774942, "grad_norm": 0.027184702455997467, "grad_norm_var": 1.0783216114348814e-06, "learning_rate": 0.0015623319824258175, "loss": 2.4848, "step": 20591 }, { "crossentropy": 2.461123466491699, "epoch": 0.7465197215777262, "grad_norm": 0.02628762274980545, "grad_norm_var": 1.112294003435763e-06, "learning_rate": 0.0015619100473047466, "loss": 2.5603, "step": 20592 }, { "crossentropy": 2.6138460636138916, "epoch": 0.7465559744779582, "grad_norm": 0.02696712501347065, "grad_norm_var": 9.251574370587456e-07, "learning_rate": 0.001561488158620299, "loss": 2.6043, "step": 20593 }, { "crossentropy": 2.4901716709136963, "epoch": 0.7465922273781903, "grad_norm": 0.03401864692568779, "grad_norm_var": 4.020382615014953e-06, "learning_rate": 0.0015610663163781707, "loss": 2.5063, "step": 20594 }, { "crossentropy": 2.6073741912841797, "epoch": 0.7466284802784223, "grad_norm": 0.026084689423441887, "grad_norm_var": 4.053414189854896e-06, "learning_rate": 0.0015606445205840619, "loss": 2.5334, "step": 20595 }, { "crossentropy": 2.445202589035034, "epoch": 0.7466647331786543, "grad_norm": 0.02708105929195881, "grad_norm_var": 3.906401706910087e-06, "learning_rate": 0.001560222771243669, "loss": 2.5146, "step": 20596 }, { "crossentropy": 2.3514857292175293, "epoch": 0.7467009860788864, "grad_norm": 0.02704223431646824, "grad_norm_var": 3.902222336721083e-06, "learning_rate": 0.001559801068362685, "loss": 2.562, "step": 20597 }, { "crossentropy": 2.416506052017212, "epoch": 0.7467372389791184, "grad_norm": 0.026299308985471725, "grad_norm_var": 3.850116262945355e-06, "learning_rate": 0.0015593794119468092, "loss": 2.4951, "step": 20598 }, { "crossentropy": 2.3820652961730957, "epoch": 0.7467734918793504, "grad_norm": 0.028118839487433434, "grad_norm_var": 3.6091278833726466e-06, "learning_rate": 0.0015589578020017364, "loss": 2.4729, "step": 20599 }, { "crossentropy": 2.3557727336883545, "epoch": 0.7468097447795824, "grad_norm": 0.02665422484278679, "grad_norm_var": 3.623996973762611e-06, "learning_rate": 0.0015585362385331597, "loss": 2.4781, "step": 20600 }, { "crossentropy": 2.515493631362915, "epoch": 0.7468459976798144, "grad_norm": 0.02719017118215561, "grad_norm_var": 3.519006269119213e-06, "learning_rate": 0.001558114721546775, "loss": 2.4849, "step": 20601 }, { "crossentropy": 2.5669331550598145, "epoch": 0.7468822505800464, "grad_norm": 0.02848409302532673, "grad_norm_var": 3.4827242284998205e-06, "learning_rate": 0.0015576932510482729, "loss": 2.544, "step": 20602 }, { "crossentropy": 2.479729175567627, "epoch": 0.7469185034802784, "grad_norm": 0.027618886902928352, "grad_norm_var": 3.4364150116763475e-06, "learning_rate": 0.0015572718270433466, "loss": 2.5326, "step": 20603 }, { "crossentropy": 2.3178961277008057, "epoch": 0.7469547563805105, "grad_norm": 0.026349162682890892, "grad_norm_var": 3.4859087439963043e-06, "learning_rate": 0.0015568504495376923, "loss": 2.4053, "step": 20604 }, { "crossentropy": 2.641084671020508, "epoch": 0.7469910092807425, "grad_norm": 0.027655910700559616, "grad_norm_var": 3.448016477045054e-06, "learning_rate": 0.001556429118536995, "loss": 2.5369, "step": 20605 }, { "crossentropy": 2.463567018508911, "epoch": 0.7470272621809745, "grad_norm": 0.02631714940071106, "grad_norm_var": 3.538296489243436e-06, "learning_rate": 0.001556007834046948, "loss": 2.4471, "step": 20606 }, { "crossentropy": 2.582247257232666, "epoch": 0.7470635150812065, "grad_norm": 0.025871537625789642, "grad_norm_var": 3.6942055327314365e-06, "learning_rate": 0.0015555865960732436, "loss": 2.5365, "step": 20607 }, { "crossentropy": 2.4547927379608154, "epoch": 0.7470997679814385, "grad_norm": 0.02652447670698166, "grad_norm_var": 3.6632915636963226e-06, "learning_rate": 0.0015551654046215668, "loss": 2.4688, "step": 20608 }, { "crossentropy": 2.4273359775543213, "epoch": 0.7471360208816705, "grad_norm": 0.025925904512405396, "grad_norm_var": 3.790083327347597e-06, "learning_rate": 0.0015547442596976113, "loss": 2.4177, "step": 20609 }, { "crossentropy": 2.5127129554748535, "epoch": 0.7471722737819025, "grad_norm": 0.026739049702882767, "grad_norm_var": 6.073781704447514e-07, "learning_rate": 0.0015543231613070613, "loss": 2.4174, "step": 20610 }, { "crossentropy": 2.4193382263183594, "epoch": 0.7472085266821346, "grad_norm": 0.026761485263705254, "grad_norm_var": 5.649334877569906e-07, "learning_rate": 0.001553902109455606, "loss": 2.4813, "step": 20611 }, { "crossentropy": 2.560467004776001, "epoch": 0.7472447795823666, "grad_norm": 0.02791614830493927, "grad_norm_var": 6.270545181997119e-07, "learning_rate": 0.0015534811041489356, "loss": 2.4601, "step": 20612 }, { "crossentropy": 2.0655415058135986, "epoch": 0.7472810324825986, "grad_norm": 0.026345018297433853, "grad_norm_var": 6.504226120372051e-07, "learning_rate": 0.0015530601453927295, "loss": 2.29, "step": 20613 }, { "crossentropy": 2.473588705062866, "epoch": 0.7473172853828306, "grad_norm": 0.027121607214212418, "grad_norm_var": 6.242790984654998e-07, "learning_rate": 0.0015526392331926776, "loss": 2.4967, "step": 20614 }, { "crossentropy": 2.475273847579956, "epoch": 0.7473535382830626, "grad_norm": 0.026510797441005707, "grad_norm_var": 5.405610714330373e-07, "learning_rate": 0.0015522183675544653, "loss": 2.4702, "step": 20615 }, { "crossentropy": 2.586434841156006, "epoch": 0.7473897911832946, "grad_norm": 0.07362394779920578, "grad_norm_var": 0.0001370482347199079, "learning_rate": 0.0015517975484837755, "loss": 2.5386, "step": 20616 }, { "crossentropy": 2.5624265670776367, "epoch": 0.7474260440835266, "grad_norm": 0.026506206020712852, "grad_norm_var": 0.00013731636243790115, "learning_rate": 0.0015513767759862934, "loss": 2.5615, "step": 20617 }, { "crossentropy": 2.324834108352661, "epoch": 0.7474622969837587, "grad_norm": 0.02686363458633423, "grad_norm_var": 0.00013775765825866642, "learning_rate": 0.0015509560500677, "loss": 2.4634, "step": 20618 }, { "crossentropy": 2.409371852874756, "epoch": 0.7474985498839907, "grad_norm": 0.027381356805562973, "grad_norm_var": 0.00013782600795678512, "learning_rate": 0.0015505353707336795, "loss": 2.4972, "step": 20619 }, { "crossentropy": 2.5929315090179443, "epoch": 0.7475348027842227, "grad_norm": 0.02640884555876255, "grad_norm_var": 0.00013779995679498493, "learning_rate": 0.001550114737989915, "loss": 2.5505, "step": 20620 }, { "crossentropy": 2.5082759857177734, "epoch": 0.7475710556844548, "grad_norm": 0.026291366666555405, "grad_norm_var": 0.000138279964553354, "learning_rate": 0.0015496941518420865, "loss": 2.441, "step": 20621 }, { "crossentropy": 2.631957769393921, "epoch": 0.7476073085846868, "grad_norm": 0.035306621342897415, "grad_norm_var": 0.00013943263195737655, "learning_rate": 0.0015492736122958727, "loss": 2.5809, "step": 20622 }, { "crossentropy": 2.3441827297210693, "epoch": 0.7476435614849188, "grad_norm": 0.02835432067513466, "grad_norm_var": 0.00013840781092979633, "learning_rate": 0.0015488531193569573, "loss": 2.3368, "step": 20623 }, { "crossentropy": 2.550856113433838, "epoch": 0.7476798143851509, "grad_norm": 0.026275314390659332, "grad_norm_var": 0.00013853666496620683, "learning_rate": 0.001548432673031016, "loss": 2.5018, "step": 20624 }, { "crossentropy": 2.6618151664733887, "epoch": 0.7477160672853829, "grad_norm": 0.027557965368032455, "grad_norm_var": 0.00013775767285431568, "learning_rate": 0.0015480122733237311, "loss": 2.5894, "step": 20625 }, { "crossentropy": 2.5790369510650635, "epoch": 0.7477523201856149, "grad_norm": 0.026584861800074577, "grad_norm_var": 0.00013783386133657053, "learning_rate": 0.0015475919202407773, "loss": 2.5807, "step": 20626 }, { "crossentropy": 2.6135413646698, "epoch": 0.7477885730858469, "grad_norm": 0.02676796168088913, "grad_norm_var": 0.00013783075388899024, "learning_rate": 0.0015471716137878345, "loss": 2.4999, "step": 20627 }, { "crossentropy": 2.4433538913726807, "epoch": 0.7478248259860789, "grad_norm": 0.026817018166184425, "grad_norm_var": 0.000138264920159536, "learning_rate": 0.0015467513539705797, "loss": 2.396, "step": 20628 }, { "crossentropy": 2.4671828746795654, "epoch": 0.7478610788863109, "grad_norm": 0.026777474209666252, "grad_norm_var": 0.0001380488611073183, "learning_rate": 0.0015463311407946884, "loss": 2.5879, "step": 20629 }, { "crossentropy": 2.6170287132263184, "epoch": 0.7478973317865429, "grad_norm": 0.027310146018862724, "grad_norm_var": 0.0001379706339325599, "learning_rate": 0.001545910974265835, "loss": 2.555, "step": 20630 }, { "crossentropy": 2.478055715560913, "epoch": 0.747933584686775, "grad_norm": 0.026413995772600174, "grad_norm_var": 0.00013802056027392516, "learning_rate": 0.0015454908543896966, "loss": 2.454, "step": 20631 }, { "crossentropy": 2.35990834236145, "epoch": 0.747969837587007, "grad_norm": 0.02724764123558998, "grad_norm_var": 4.719662644174274e-06, "learning_rate": 0.0015450707811719457, "loss": 2.4479, "step": 20632 }, { "crossentropy": 2.637004852294922, "epoch": 0.748006090487239, "grad_norm": 0.027266858145594597, "grad_norm_var": 4.6622299666015015e-06, "learning_rate": 0.0015446507546182586, "loss": 2.5445, "step": 20633 }, { "crossentropy": 2.3052921295166016, "epoch": 0.748042343387471, "grad_norm": 0.027583755552768707, "grad_norm_var": 4.635787594030096e-06, "learning_rate": 0.0015442307747343049, "loss": 2.3793, "step": 20634 }, { "crossentropy": 2.5616185665130615, "epoch": 0.748078596287703, "grad_norm": 0.02793203480541706, "grad_norm_var": 4.6444437496853066e-06, "learning_rate": 0.0015438108415257584, "loss": 2.5228, "step": 20635 }, { "crossentropy": 2.409313678741455, "epoch": 0.748114849187935, "grad_norm": 0.02599356696009636, "grad_norm_var": 4.718741388910763e-06, "learning_rate": 0.001543390954998294, "loss": 2.4392, "step": 20636 }, { "crossentropy": 2.540487766265869, "epoch": 0.748151102088167, "grad_norm": 0.026997432112693787, "grad_norm_var": 4.633286609929897e-06, "learning_rate": 0.0015429711151575792, "loss": 2.5696, "step": 20637 }, { "crossentropy": 2.3617825508117676, "epoch": 0.7481873549883991, "grad_norm": 0.027185261249542236, "grad_norm_var": 3.8251445249372815e-07, "learning_rate": 0.0015425513220092852, "loss": 2.4819, "step": 20638 }, { "crossentropy": 2.518389940261841, "epoch": 0.7482236078886311, "grad_norm": 0.02795555256307125, "grad_norm_var": 3.239860543409323e-07, "learning_rate": 0.0015421315755590837, "loss": 2.5056, "step": 20639 }, { "crossentropy": 2.5064504146575928, "epoch": 0.7482598607888631, "grad_norm": 0.02702728658914566, "grad_norm_var": 2.8248960722540975e-07, "learning_rate": 0.0015417118758126408, "loss": 2.4901, "step": 20640 }, { "crossentropy": 2.4866878986358643, "epoch": 0.7482961136890951, "grad_norm": 0.026446044445037842, "grad_norm_var": 2.901875586811999e-07, "learning_rate": 0.0015412922227756292, "loss": 2.3988, "step": 20641 }, { "crossentropy": 2.5532066822052, "epoch": 0.7483323665893271, "grad_norm": 0.026396431028842926, "grad_norm_var": 3.033185662743946e-07, "learning_rate": 0.001540872616453713, "loss": 2.5738, "step": 20642 }, { "crossentropy": 2.642219066619873, "epoch": 0.7483686194895591, "grad_norm": 0.026448333635926247, "grad_norm_var": 3.199080182050695e-07, "learning_rate": 0.0015404530568525615, "loss": 2.583, "step": 20643 }, { "crossentropy": 2.328098773956299, "epoch": 0.7484048723897911, "grad_norm": 0.02643176168203354, "grad_norm_var": 3.3793791078401413e-07, "learning_rate": 0.0015400335439778434, "loss": 2.4063, "step": 20644 }, { "crossentropy": 2.579857110977173, "epoch": 0.7484411252900232, "grad_norm": 0.027351567521691322, "grad_norm_var": 3.44308964838648e-07, "learning_rate": 0.0015396140778352213, "loss": 2.6484, "step": 20645 }, { "crossentropy": 2.5025105476379395, "epoch": 0.7484773781902552, "grad_norm": 0.028592824935913086, "grad_norm_var": 5.003122105898116e-07, "learning_rate": 0.001539194658430364, "loss": 2.4725, "step": 20646 }, { "crossentropy": 2.333552360534668, "epoch": 0.7485136310904872, "grad_norm": 0.026625555008649826, "grad_norm_var": 4.843399790784569e-07, "learning_rate": 0.0015387752857689346, "loss": 2.3657, "step": 20647 }, { "crossentropy": 2.7059154510498047, "epoch": 0.7485498839907193, "grad_norm": 0.02738025225698948, "grad_norm_var": 4.881801013623469e-07, "learning_rate": 0.001538355959856596, "loss": 2.5981, "step": 20648 }, { "crossentropy": 2.655775547027588, "epoch": 0.7485861368909513, "grad_norm": 0.027440521866083145, "grad_norm_var": 4.939076607352362e-07, "learning_rate": 0.0015379366806990153, "loss": 2.5972, "step": 20649 }, { "crossentropy": 2.587094783782959, "epoch": 0.7486223897911833, "grad_norm": 0.026808809489011765, "grad_norm_var": 4.826721623677933e-07, "learning_rate": 0.0015375174483018518, "loss": 2.4495, "step": 20650 }, { "crossentropy": 2.4648404121398926, "epoch": 0.7486586426914154, "grad_norm": 0.026127859950065613, "grad_norm_var": 4.771392193328796e-07, "learning_rate": 0.0015370982626707697, "loss": 2.4924, "step": 20651 }, { "crossentropy": 2.573899745941162, "epoch": 0.7486948955916474, "grad_norm": 0.02901269495487213, "grad_norm_var": 6.615946060252454e-07, "learning_rate": 0.0015366791238114325, "loss": 2.6313, "step": 20652 }, { "crossentropy": 2.4916584491729736, "epoch": 0.7487311484918794, "grad_norm": 0.027940358966588974, "grad_norm_var": 6.993327068035547e-07, "learning_rate": 0.0015362600317294973, "loss": 2.4427, "step": 20653 }, { "crossentropy": 2.5411109924316406, "epoch": 0.7487674013921114, "grad_norm": 0.027602901682257652, "grad_norm_var": 7.095139699638073e-07, "learning_rate": 0.001535840986430629, "loss": 2.5109, "step": 20654 }, { "crossentropy": 2.455155849456787, "epoch": 0.7488036542923434, "grad_norm": 0.02585567906498909, "grad_norm_var": 7.803666206597504e-07, "learning_rate": 0.0015354219879204851, "loss": 2.4303, "step": 20655 }, { "crossentropy": 2.4574971199035645, "epoch": 0.7488399071925754, "grad_norm": 0.02645418420433998, "grad_norm_var": 8.059201362412352e-07, "learning_rate": 0.0015350030362047229, "loss": 2.4791, "step": 20656 }, { "crossentropy": 2.4212348461151123, "epoch": 0.7488761600928074, "grad_norm": 0.02580958791077137, "grad_norm_var": 8.831037164800839e-07, "learning_rate": 0.0015345841312890051, "loss": 2.4861, "step": 20657 }, { "crossentropy": 2.4418723583221436, "epoch": 0.7489124129930395, "grad_norm": 0.027656977996230125, "grad_norm_var": 8.780371008686056e-07, "learning_rate": 0.0015341652731789856, "loss": 2.4849, "step": 20658 }, { "crossentropy": 2.532376766204834, "epoch": 0.7489486658932715, "grad_norm": 0.02684524469077587, "grad_norm_var": 8.53594987218718e-07, "learning_rate": 0.0015337464618803242, "loss": 2.4716, "step": 20659 }, { "crossentropy": 2.618032217025757, "epoch": 0.7489849187935035, "grad_norm": 0.027081498876214027, "grad_norm_var": 8.202658173487766e-07, "learning_rate": 0.0015333276973986782, "loss": 2.5982, "step": 20660 }, { "crossentropy": 2.487541675567627, "epoch": 0.7490211716937355, "grad_norm": 0.02656380645930767, "grad_norm_var": 8.391041187357732e-07, "learning_rate": 0.0015329089797397012, "loss": 2.4898, "step": 20661 }, { "crossentropy": 2.419337511062622, "epoch": 0.7490574245939675, "grad_norm": 0.025980176404118538, "grad_norm_var": 7.500220205051515e-07, "learning_rate": 0.0015324903089090514, "loss": 2.4826, "step": 20662 }, { "crossentropy": 2.5655617713928223, "epoch": 0.7490936774941995, "grad_norm": 0.028545916080474854, "grad_norm_var": 8.976574336742756e-07, "learning_rate": 0.0015320716849123823, "loss": 2.5171, "step": 20663 }, { "crossentropy": 2.6007728576660156, "epoch": 0.7491299303944315, "grad_norm": 0.02649194374680519, "grad_norm_var": 9.101289044255764e-07, "learning_rate": 0.0015316531077553459, "loss": 2.5367, "step": 20664 }, { "crossentropy": 2.3983449935913086, "epoch": 0.7491661832946636, "grad_norm": 0.028031915426254272, "grad_norm_var": 9.65649125120417e-07, "learning_rate": 0.0015312345774435993, "loss": 2.43, "step": 20665 }, { "crossentropy": 2.528329849243164, "epoch": 0.7492024361948956, "grad_norm": 0.026001131162047386, "grad_norm_var": 1.032458873195287e-06, "learning_rate": 0.0015308160939827926, "loss": 2.5416, "step": 20666 }, { "crossentropy": 2.5504355430603027, "epoch": 0.7492386890951276, "grad_norm": 0.02767321653664112, "grad_norm_var": 1.0019903139753811e-06, "learning_rate": 0.0015303976573785788, "loss": 2.5671, "step": 20667 }, { "crossentropy": 2.4601094722747803, "epoch": 0.7492749419953596, "grad_norm": 0.026847943663597107, "grad_norm_var": 7.418548494722652e-07, "learning_rate": 0.001529979267636612, "loss": 2.5326, "step": 20668 }, { "crossentropy": 2.5436294078826904, "epoch": 0.7493111948955916, "grad_norm": 0.026472467929124832, "grad_norm_var": 6.849241902272118e-07, "learning_rate": 0.0015295609247625392, "loss": 2.487, "step": 20669 }, { "crossentropy": 2.632568120956421, "epoch": 0.7493474477958236, "grad_norm": 0.026988739147782326, "grad_norm_var": 6.484551366649996e-07, "learning_rate": 0.001529142628762014, "loss": 2.5735, "step": 20670 }, { "crossentropy": 2.6458375453948975, "epoch": 0.7493837006960556, "grad_norm": 0.02675049938261509, "grad_norm_var": 5.821011388104184e-07, "learning_rate": 0.0015287243796406852, "loss": 2.4612, "step": 20671 }, { "crossentropy": 2.2559897899627686, "epoch": 0.7494199535962877, "grad_norm": 0.025953317061066628, "grad_norm_var": 6.266983739661487e-07, "learning_rate": 0.0015283061774042002, "loss": 2.4316, "step": 20672 }, { "crossentropy": 2.5021145343780518, "epoch": 0.7494562064965197, "grad_norm": 0.02652669884264469, "grad_norm_var": 5.587960725945597e-07, "learning_rate": 0.0015278880220582097, "loss": 2.4664, "step": 20673 }, { "crossentropy": 2.379305839538574, "epoch": 0.7494924593967517, "grad_norm": 0.027215810492634773, "grad_norm_var": 5.264754124098866e-07, "learning_rate": 0.0015274699136083592, "loss": 2.4452, "step": 20674 }, { "crossentropy": 2.4162938594818115, "epoch": 0.7495287122969838, "grad_norm": 0.02630412019789219, "grad_norm_var": 5.467894267486047e-07, "learning_rate": 0.001527051852060297, "loss": 2.4321, "step": 20675 }, { "crossentropy": 2.522087574005127, "epoch": 0.7495649651972158, "grad_norm": 0.0268980972468853, "grad_norm_var": 5.429696776244423e-07, "learning_rate": 0.001526633837419672, "loss": 2.486, "step": 20676 }, { "crossentropy": 2.415173053741455, "epoch": 0.7496012180974478, "grad_norm": 0.026757314801216125, "grad_norm_var": 5.384970808312705e-07, "learning_rate": 0.0015262158696921263, "loss": 2.3845, "step": 20677 }, { "crossentropy": 2.401489496231079, "epoch": 0.7496374709976799, "grad_norm": 0.02620607614517212, "grad_norm_var": 5.157899445819754e-07, "learning_rate": 0.0015257979488833091, "loss": 2.5004, "step": 20678 }, { "crossentropy": 2.361576795578003, "epoch": 0.7496737238979119, "grad_norm": 0.02655917778611183, "grad_norm_var": 3.143195787294953e-07, "learning_rate": 0.0015253800749988622, "loss": 2.3739, "step": 20679 }, { "crossentropy": 2.4584908485412598, "epoch": 0.7497099767981439, "grad_norm": 0.026222990825772285, "grad_norm_var": 3.27373918176466e-07, "learning_rate": 0.0015249622480444287, "loss": 2.4304, "step": 20680 }, { "crossentropy": 2.576840400695801, "epoch": 0.7497462296983759, "grad_norm": 0.027406051754951477, "grad_norm_var": 2.418019116302705e-07, "learning_rate": 0.0015245444680256554, "loss": 2.5445, "step": 20681 }, { "crossentropy": 2.599165678024292, "epoch": 0.7497824825986079, "grad_norm": 0.027668582275509834, "grad_norm_var": 2.659845239200009e-07, "learning_rate": 0.0015241267349481813, "loss": 2.6169, "step": 20682 }, { "crossentropy": 2.5068278312683105, "epoch": 0.7498187354988399, "grad_norm": 0.02603740617632866, "grad_norm_var": 2.380151276531554e-07, "learning_rate": 0.001523709048817651, "loss": 2.5169, "step": 20683 }, { "crossentropy": 2.5753254890441895, "epoch": 0.7498549883990719, "grad_norm": 0.027897540479898453, "grad_norm_var": 3.309375170340925e-07, "learning_rate": 0.0015232914096397067, "loss": 2.5985, "step": 20684 }, { "crossentropy": 2.639564275741577, "epoch": 0.749891241299304, "grad_norm": 0.028697118163108826, "grad_norm_var": 5.604374976465876e-07, "learning_rate": 0.0015228738174199868, "loss": 2.644, "step": 20685 }, { "crossentropy": 2.6445252895355225, "epoch": 0.749927494199536, "grad_norm": 0.026154397055506706, "grad_norm_var": 5.919150006208328e-07, "learning_rate": 0.0015224562721641338, "loss": 2.6153, "step": 20686 }, { "crossentropy": 2.5937323570251465, "epoch": 0.749963747099768, "grad_norm": 0.026410114020109177, "grad_norm_var": 6.026941506395104e-07, "learning_rate": 0.0015220387738777853, "loss": 2.5142, "step": 20687 }, { "crossentropy": 2.608064651489258, "epoch": 0.75, "grad_norm": 0.0260156262665987, "grad_norm_var": 5.958430347307765e-07, "learning_rate": 0.001521621322566582, "loss": 2.5402, "step": 20688 }, { "crossentropy": 2.4465231895446777, "epoch": 0.750036252900232, "grad_norm": 0.026976365596055984, "grad_norm_var": 5.914309030570035e-07, "learning_rate": 0.0015212039182361614, "loss": 2.5234, "step": 20689 }, { "crossentropy": 2.59169602394104, "epoch": 0.750072505800464, "grad_norm": 0.026427043601870537, "grad_norm_var": 5.907050605860956e-07, "learning_rate": 0.00152078656089216, "loss": 2.5378, "step": 20690 }, { "crossentropy": 2.6233086585998535, "epoch": 0.750108758700696, "grad_norm": 0.026545194908976555, "grad_norm_var": 5.787235693097137e-07, "learning_rate": 0.001520369250540216, "loss": 2.6119, "step": 20691 }, { "crossentropy": 2.558340072631836, "epoch": 0.7501450116009281, "grad_norm": 0.027331247925758362, "grad_norm_var": 5.95829732564536e-07, "learning_rate": 0.0015199519871859674, "loss": 2.5024, "step": 20692 }, { "crossentropy": 2.548079252243042, "epoch": 0.7501812645011601, "grad_norm": 0.02635725401341915, "grad_norm_var": 6.098174141823023e-07, "learning_rate": 0.0015195347708350465, "loss": 2.5686, "step": 20693 }, { "crossentropy": 2.47941255569458, "epoch": 0.7502175174013921, "grad_norm": 0.02634505182504654, "grad_norm_var": 5.998891640886094e-07, "learning_rate": 0.0015191176014930918, "loss": 2.5147, "step": 20694 }, { "crossentropy": 2.367898464202881, "epoch": 0.7502537703016241, "grad_norm": 0.027348754927515984, "grad_norm_var": 6.1184803672263e-07, "learning_rate": 0.0015187004791657349, "loss": 2.4528, "step": 20695 }, { "crossentropy": 2.613624334335327, "epoch": 0.7502900232018561, "grad_norm": 0.027325982227921486, "grad_norm_var": 5.93460700469041e-07, "learning_rate": 0.001518283403858613, "loss": 2.56, "step": 20696 }, { "crossentropy": 2.396389961242676, "epoch": 0.7503262761020881, "grad_norm": 0.026479769498109818, "grad_norm_var": 5.887831299592379e-07, "learning_rate": 0.001517866375577357, "loss": 2.4788, "step": 20697 }, { "crossentropy": 2.5513315200805664, "epoch": 0.7503625290023201, "grad_norm": 0.026297682896256447, "grad_norm_var": 5.613865957839378e-07, "learning_rate": 0.0015174493943275985, "loss": 2.4625, "step": 20698 }, { "crossentropy": 2.476393938064575, "epoch": 0.7503987819025522, "grad_norm": 0.02688729017972946, "grad_norm_var": 5.212018085351891e-07, "learning_rate": 0.001517032460114971, "loss": 2.5496, "step": 20699 }, { "crossentropy": 2.6047921180725098, "epoch": 0.7504350348027842, "grad_norm": 0.026984689757227898, "grad_norm_var": 4.4499525263933836e-07, "learning_rate": 0.001516615572945107, "loss": 2.4906, "step": 20700 }, { "crossentropy": 2.465059280395508, "epoch": 0.7504712877030162, "grad_norm": 0.026994824409484863, "grad_norm_var": 1.924443047553731e-07, "learning_rate": 0.0015161987328236343, "loss": 2.4558, "step": 20701 }, { "crossentropy": 2.685314416885376, "epoch": 0.7505075406032483, "grad_norm": 0.027424398809671402, "grad_norm_var": 2.0423496860048214e-07, "learning_rate": 0.0015157819397561862, "loss": 2.5758, "step": 20702 }, { "crossentropy": 2.624897003173828, "epoch": 0.7505437935034803, "grad_norm": 0.026063451543450356, "grad_norm_var": 2.2789305076779432e-07, "learning_rate": 0.0015153651937483891, "loss": 2.5776, "step": 20703 }, { "crossentropy": 2.5249886512756348, "epoch": 0.7505800464037123, "grad_norm": 0.026342352852225304, "grad_norm_var": 2.0310496162919723e-07, "learning_rate": 0.0015149484948058735, "loss": 2.5284, "step": 20704 }, { "crossentropy": 2.4736666679382324, "epoch": 0.7506162993039444, "grad_norm": 0.027079494670033455, "grad_norm_var": 2.0676944926563645e-07, "learning_rate": 0.0015145318429342696, "loss": 2.4681, "step": 20705 }, { "crossentropy": 2.4390995502471924, "epoch": 0.7506525522041764, "grad_norm": 0.02596619538962841, "grad_norm_var": 2.4078829654507426e-07, "learning_rate": 0.0015141152381391999, "loss": 2.4314, "step": 20706 }, { "crossentropy": 2.5600333213806152, "epoch": 0.7506888051044084, "grad_norm": 0.027342863380908966, "grad_norm_var": 2.602779712732589e-07, "learning_rate": 0.0015136986804262943, "loss": 2.5178, "step": 20707 }, { "crossentropy": 2.5343141555786133, "epoch": 0.7507250580046404, "grad_norm": 0.029580306261777878, "grad_norm_var": 7.400133601231784e-07, "learning_rate": 0.0015132821698011795, "loss": 2.5232, "step": 20708 }, { "crossentropy": 2.4177894592285156, "epoch": 0.7507613109048724, "grad_norm": 0.026975424960255623, "grad_norm_var": 7.169967110982745e-07, "learning_rate": 0.001512865706269479, "loss": 2.4752, "step": 20709 }, { "crossentropy": 2.556950330734253, "epoch": 0.7507975638051044, "grad_norm": 0.027799177914857864, "grad_norm_var": 7.289719397288724e-07, "learning_rate": 0.001512449289836821, "loss": 2.5366, "step": 20710 }, { "crossentropy": 2.3807220458984375, "epoch": 0.7508338167053364, "grad_norm": 0.02606588788330555, "grad_norm_var": 7.817200508464095e-07, "learning_rate": 0.0015120329205088256, "loss": 2.4246, "step": 20711 }, { "crossentropy": 2.5024404525756836, "epoch": 0.7508700696055685, "grad_norm": 0.02674199268221855, "grad_norm_var": 7.757536174148501e-07, "learning_rate": 0.001511616598291119, "loss": 2.3784, "step": 20712 }, { "crossentropy": 2.3366639614105225, "epoch": 0.7509063225058005, "grad_norm": 0.027111010625958443, "grad_norm_var": 7.619968603135429e-07, "learning_rate": 0.0015112003231893274, "loss": 2.4814, "step": 20713 }, { "crossentropy": 2.4928250312805176, "epoch": 0.7509425754060325, "grad_norm": 0.025829244405031204, "grad_norm_var": 8.18238391913038e-07, "learning_rate": 0.0015107840952090668, "loss": 2.4791, "step": 20714 }, { "crossentropy": 2.4281578063964844, "epoch": 0.7509788283062645, "grad_norm": 0.028403088450431824, "grad_norm_var": 9.493110369814447e-07, "learning_rate": 0.0015103679143559617, "loss": 2.389, "step": 20715 }, { "crossentropy": 2.5302443504333496, "epoch": 0.7510150812064965, "grad_norm": 0.025161176919937134, "grad_norm_var": 1.1715625163666165e-06, "learning_rate": 0.0015099517806356356, "loss": 2.452, "step": 20716 }, { "crossentropy": 2.407938241958618, "epoch": 0.7510513341067285, "grad_norm": 0.026575598865747452, "grad_norm_var": 1.178926535096491e-06, "learning_rate": 0.0015095356940537054, "loss": 2.4974, "step": 20717 }, { "crossentropy": 2.460552453994751, "epoch": 0.7510875870069605, "grad_norm": 0.026693854480981827, "grad_norm_var": 1.1615783423044584e-06, "learning_rate": 0.0015091196546157936, "loss": 2.4775, "step": 20718 }, { "crossentropy": 2.414799451828003, "epoch": 0.7511238399071926, "grad_norm": 0.02609643153846264, "grad_norm_var": 1.1581515705908479e-06, "learning_rate": 0.0015087036623275174, "loss": 2.362, "step": 20719 }, { "crossentropy": 2.575554132461548, "epoch": 0.7511600928074246, "grad_norm": 0.026833923533558846, "grad_norm_var": 1.1393093569473372e-06, "learning_rate": 0.0015082877171944964, "loss": 2.4743, "step": 20720 }, { "crossentropy": 2.564354181289673, "epoch": 0.7511963457076566, "grad_norm": 0.027203269302845, "grad_norm_var": 1.143377986428419e-06, "learning_rate": 0.0015078718192223506, "loss": 2.4979, "step": 20721 }, { "crossentropy": 2.4569356441497803, "epoch": 0.7512325986078886, "grad_norm": 0.02599582076072693, "grad_norm_var": 1.1397493401114756e-06, "learning_rate": 0.001507455968416695, "loss": 2.4245, "step": 20722 }, { "crossentropy": 2.466029167175293, "epoch": 0.7512688515081206, "grad_norm": 0.02602478116750717, "grad_norm_var": 1.1706020691323727e-06, "learning_rate": 0.0015070401647831449, "loss": 2.5148, "step": 20723 }, { "crossentropy": 2.435123920440674, "epoch": 0.7513051044083526, "grad_norm": 0.025372814387083054, "grad_norm_var": 7.274928191455664e-07, "learning_rate": 0.0015066244083273201, "loss": 2.3809, "step": 20724 }, { "crossentropy": 2.533328056335449, "epoch": 0.7513413573085846, "grad_norm": 0.027690624818205833, "grad_norm_var": 7.995331153392981e-07, "learning_rate": 0.0015062086990548324, "loss": 2.5079, "step": 20725 }, { "crossentropy": 2.477391004562378, "epoch": 0.7513776102088167, "grad_norm": 0.02684435434639454, "grad_norm_var": 7.038361448890214e-07, "learning_rate": 0.001505793036971298, "loss": 2.5358, "step": 20726 }, { "crossentropy": 2.313570737838745, "epoch": 0.7514138631090487, "grad_norm": 0.02758854441344738, "grad_norm_var": 7.524375133890458e-07, "learning_rate": 0.001505377422082333, "loss": 2.4106, "step": 20727 }, { "crossentropy": 2.5090181827545166, "epoch": 0.7514501160092807, "grad_norm": 0.02832806296646595, "grad_norm_var": 9.322037681789523e-07, "learning_rate": 0.0015049618543935479, "loss": 2.4615, "step": 20728 }, { "crossentropy": 2.4413580894470215, "epoch": 0.7514863689095128, "grad_norm": 0.026704629883170128, "grad_norm_var": 9.221264961217403e-07, "learning_rate": 0.0015045463339105581, "loss": 2.5147, "step": 20729 }, { "crossentropy": 2.3909246921539307, "epoch": 0.7515226218097448, "grad_norm": 0.026388829573988914, "grad_norm_var": 8.760473574923507e-07, "learning_rate": 0.0015041308606389748, "loss": 2.5703, "step": 20730 }, { "crossentropy": 2.5241618156433105, "epoch": 0.7515588747099768, "grad_norm": 0.02733101323246956, "grad_norm_var": 7.107418848891743e-07, "learning_rate": 0.0015037154345844074, "loss": 2.5251, "step": 20731 }, { "crossentropy": 2.473802089691162, "epoch": 0.7515951276102089, "grad_norm": 0.027635233476758003, "grad_norm_var": 5.932350163382102e-07, "learning_rate": 0.0015033000557524707, "loss": 2.4619, "step": 20732 }, { "crossentropy": 2.546891689300537, "epoch": 0.7516313805104409, "grad_norm": 0.027306945994496346, "grad_norm_var": 6.016875520620593e-07, "learning_rate": 0.0015028847241487714, "loss": 2.5432, "step": 20733 }, { "crossentropy": 2.438642978668213, "epoch": 0.7516676334106729, "grad_norm": 0.02694675326347351, "grad_norm_var": 5.994942424557281e-07, "learning_rate": 0.0015024694397789212, "loss": 2.3416, "step": 20734 }, { "crossentropy": 2.4381494522094727, "epoch": 0.7517038863109049, "grad_norm": 0.025954341515898705, "grad_norm_var": 6.158521232910164e-07, "learning_rate": 0.0015020542026485306, "loss": 2.4968, "step": 20735 }, { "crossentropy": 2.3807430267333984, "epoch": 0.7517401392111369, "grad_norm": 0.02629629150032997, "grad_norm_var": 6.375339555819096e-07, "learning_rate": 0.0015016390127632045, "loss": 2.3628, "step": 20736 }, { "crossentropy": 2.494630813598633, "epoch": 0.7517763921113689, "grad_norm": 0.026237988844513893, "grad_norm_var": 6.504011858818673e-07, "learning_rate": 0.0015012238701285542, "loss": 2.5019, "step": 20737 }, { "crossentropy": 2.6947596073150635, "epoch": 0.7518126450116009, "grad_norm": 0.026917925104498863, "grad_norm_var": 6.058472909532906e-07, "learning_rate": 0.0015008087747501852, "loss": 2.6308, "step": 20738 }, { "crossentropy": 2.5798821449279785, "epoch": 0.751848897911833, "grad_norm": 0.025907747447490692, "grad_norm_var": 6.195503681054402e-07, "learning_rate": 0.001500393726633702, "loss": 2.5476, "step": 20739 }, { "crossentropy": 2.402857780456543, "epoch": 0.751885150812065, "grad_norm": 0.025899738073349, "grad_norm_var": 5.337709554092355e-07, "learning_rate": 0.0014999787257847141, "loss": 2.4404, "step": 20740 }, { "crossentropy": 2.46229887008667, "epoch": 0.751921403712297, "grad_norm": 0.02605505660176277, "grad_norm_var": 5.228097723075064e-07, "learning_rate": 0.001499563772208824, "loss": 2.4391, "step": 20741 }, { "crossentropy": 2.594512701034546, "epoch": 0.751957656612529, "grad_norm": 0.027205025777220726, "grad_norm_var": 5.34445180045325e-07, "learning_rate": 0.0014991488659116365, "loss": 2.5943, "step": 20742 }, { "crossentropy": 2.588207721710205, "epoch": 0.751993909512761, "grad_norm": 0.026958832517266273, "grad_norm_var": 4.925182077892068e-07, "learning_rate": 0.001498734006898758, "loss": 2.5083, "step": 20743 }, { "crossentropy": 2.486881971359253, "epoch": 0.752030162412993, "grad_norm": 0.02640332467854023, "grad_norm_var": 3.2026929585865274e-07, "learning_rate": 0.0014983191951757884, "loss": 2.5356, "step": 20744 }, { "crossentropy": 2.439793109893799, "epoch": 0.752066415313225, "grad_norm": 0.026140321046113968, "grad_norm_var": 3.3488449799368103e-07, "learning_rate": 0.0014979044307483341, "loss": 2.4608, "step": 20745 }, { "crossentropy": 2.492255926132202, "epoch": 0.7521026682134571, "grad_norm": 0.027112893760204315, "grad_norm_var": 3.473527983974614e-07, "learning_rate": 0.0014974897136219934, "loss": 2.3787, "step": 20746 }, { "crossentropy": 2.356536626815796, "epoch": 0.7521389211136891, "grad_norm": 0.026532385498285294, "grad_norm_var": 3.1409613678538144e-07, "learning_rate": 0.0014970750438023706, "loss": 2.4237, "step": 20747 }, { "crossentropy": 2.48106050491333, "epoch": 0.7521751740139211, "grad_norm": 0.028096750378608704, "grad_norm_var": 3.9145524332027426e-07, "learning_rate": 0.0014966604212950658, "loss": 2.5376, "step": 20748 }, { "crossentropy": 2.473595142364502, "epoch": 0.7522114269141531, "grad_norm": 0.025877663865685463, "grad_norm_var": 3.8884444578898314e-07, "learning_rate": 0.001496245846105676, "loss": 2.495, "step": 20749 }, { "crossentropy": 2.4140560626983643, "epoch": 0.7522476798143851, "grad_norm": 0.025303881615400314, "grad_norm_var": 4.671070864383734e-07, "learning_rate": 0.0014958313182398043, "loss": 2.4557, "step": 20750 }, { "crossentropy": 2.5012264251708984, "epoch": 0.7522839327146171, "grad_norm": 0.024796131998300552, "grad_norm_var": 6.245972832288315e-07, "learning_rate": 0.0014954168377030491, "loss": 2.4354, "step": 20751 }, { "crossentropy": 2.4246785640716553, "epoch": 0.7523201856148491, "grad_norm": 0.026329869404435158, "grad_norm_var": 6.243875723107994e-07, "learning_rate": 0.001495002404501007, "loss": 2.4649, "step": 20752 }, { "crossentropy": 2.5001957416534424, "epoch": 0.7523564385150812, "grad_norm": 0.026345908641815186, "grad_norm_var": 6.233458615207318e-07, "learning_rate": 0.0014945880186392774, "loss": 2.5812, "step": 20753 }, { "crossentropy": 2.6110241413116455, "epoch": 0.7523926914153132, "grad_norm": 0.02642017789185047, "grad_norm_var": 6.02315041446586e-07, "learning_rate": 0.001494173680123455, "loss": 2.5966, "step": 20754 }, { "crossentropy": 2.5403249263763428, "epoch": 0.7524289443155452, "grad_norm": 0.02679486945271492, "grad_norm_var": 6.007748867752535e-07, "learning_rate": 0.0014937593889591389, "loss": 2.5595, "step": 20755 }, { "crossentropy": 2.3259897232055664, "epoch": 0.7524651972157773, "grad_norm": 0.030840910971164703, "grad_norm_var": 1.8023765058228687e-06, "learning_rate": 0.0014933451451519231, "loss": 2.4126, "step": 20756 }, { "crossentropy": 2.6204748153686523, "epoch": 0.7525014501160093, "grad_norm": 0.025880087167024612, "grad_norm_var": 1.8193563697544342e-06, "learning_rate": 0.001492930948707401, "loss": 2.5806, "step": 20757 }, { "crossentropy": 2.516812562942505, "epoch": 0.7525377030162413, "grad_norm": 0.026319021359086037, "grad_norm_var": 1.8075699664262788e-06, "learning_rate": 0.001492516799631169, "loss": 2.5053, "step": 20758 }, { "crossentropy": 2.5237796306610107, "epoch": 0.7525739559164734, "grad_norm": 0.027065882459282875, "grad_norm_var": 1.8129145816137352e-06, "learning_rate": 0.0014921026979288216, "loss": 2.5258, "step": 20759 }, { "crossentropy": 2.563361644744873, "epoch": 0.7526102088167054, "grad_norm": 0.02706165984272957, "grad_norm_var": 1.8191173518891974e-06, "learning_rate": 0.001491688643605949, "loss": 2.5159, "step": 20760 }, { "crossentropy": 2.3198485374450684, "epoch": 0.7526464617169374, "grad_norm": 0.026172606274485588, "grad_norm_var": 1.816849008047285e-06, "learning_rate": 0.0014912746366681474, "loss": 2.4437, "step": 20761 }, { "crossentropy": 2.4468300342559814, "epoch": 0.7526827146171694, "grad_norm": 0.026224743574857712, "grad_norm_var": 1.8154096630339118e-06, "learning_rate": 0.0014908606771210043, "loss": 2.4279, "step": 20762 }, { "crossentropy": 2.5197911262512207, "epoch": 0.7527189675174014, "grad_norm": 0.026850329712033272, "grad_norm_var": 1.8176358002644124e-06, "learning_rate": 0.0014904467649701153, "loss": 2.537, "step": 20763 }, { "crossentropy": 2.475933313369751, "epoch": 0.7527552204176334, "grad_norm": 0.026605326682329178, "grad_norm_var": 1.6687192107303483e-06, "learning_rate": 0.0014900329002210683, "loss": 2.5164, "step": 20764 }, { "crossentropy": 2.5438809394836426, "epoch": 0.7527914733178654, "grad_norm": 0.0285437423735857, "grad_norm_var": 1.87198858444131e-06, "learning_rate": 0.001489619082879452, "loss": 2.4863, "step": 20765 }, { "crossentropy": 2.5390403270721436, "epoch": 0.7528277262180975, "grad_norm": 0.026250924915075302, "grad_norm_var": 1.74895014876882e-06, "learning_rate": 0.001489205312950857, "loss": 2.5096, "step": 20766 }, { "crossentropy": 2.5449156761169434, "epoch": 0.7528639791183295, "grad_norm": 0.026799742132425308, "grad_norm_var": 1.4694965461353362e-06, "learning_rate": 0.0014887915904408739, "loss": 2.5269, "step": 20767 }, { "crossentropy": 2.611640691757202, "epoch": 0.7529002320185615, "grad_norm": 0.025912711396813393, "grad_norm_var": 1.512451923021601e-06, "learning_rate": 0.0014883779153550874, "loss": 2.5009, "step": 20768 }, { "crossentropy": 2.5008046627044678, "epoch": 0.7529364849187935, "grad_norm": 0.026736237108707428, "grad_norm_var": 1.4941499316398065e-06, "learning_rate": 0.0014879642876990872, "loss": 2.5672, "step": 20769 }, { "crossentropy": 2.619713544845581, "epoch": 0.7529727378190255, "grad_norm": 0.025635983794927597, "grad_norm_var": 1.583270864187238e-06, "learning_rate": 0.0014875507074784584, "loss": 2.5166, "step": 20770 }, { "crossentropy": 2.4839658737182617, "epoch": 0.7530089907192575, "grad_norm": 0.025542698800563812, "grad_norm_var": 1.6914599514675568e-06, "learning_rate": 0.0014871371746987866, "loss": 2.4994, "step": 20771 }, { "crossentropy": 2.5882973670959473, "epoch": 0.7530452436194895, "grad_norm": 0.025943443179130554, "grad_norm_var": 5.372509219290015e-07, "learning_rate": 0.0014867236893656621, "loss": 2.5313, "step": 20772 }, { "crossentropy": 2.3835532665252686, "epoch": 0.7530814965197216, "grad_norm": 0.025103984400629997, "grad_norm_var": 6.361038870058515e-07, "learning_rate": 0.001486310251484662, "loss": 2.4489, "step": 20773 }, { "crossentropy": 2.5700740814208984, "epoch": 0.7531177494199536, "grad_norm": 0.026470081880688667, "grad_norm_var": 6.354345097005308e-07, "learning_rate": 0.001485896861061375, "loss": 2.5514, "step": 20774 }, { "crossentropy": 2.4719107151031494, "epoch": 0.7531540023201856, "grad_norm": 0.025470705702900887, "grad_norm_var": 6.597583611137828e-07, "learning_rate": 0.0014854835181013848, "loss": 2.4522, "step": 20775 }, { "crossentropy": 2.601223945617676, "epoch": 0.7531902552204176, "grad_norm": 0.027871832251548767, "grad_norm_var": 7.795148651594617e-07, "learning_rate": 0.0014850702226102714, "loss": 2.516, "step": 20776 }, { "crossentropy": 2.5017104148864746, "epoch": 0.7532265081206496, "grad_norm": 0.02665722742676735, "grad_norm_var": 7.80569985777783e-07, "learning_rate": 0.0014846569745936206, "loss": 2.4688, "step": 20777 }, { "crossentropy": 2.476743221282959, "epoch": 0.7532627610208816, "grad_norm": 0.025958677753806114, "grad_norm_var": 7.916988800557665e-07, "learning_rate": 0.0014842437740570102, "loss": 2.4595, "step": 20778 }, { "crossentropy": 2.4439592361450195, "epoch": 0.7532990139211136, "grad_norm": 0.02613835595548153, "grad_norm_var": 7.803558167709338e-07, "learning_rate": 0.001483830621006023, "loss": 2.4861, "step": 20779 }, { "crossentropy": 2.44356107711792, "epoch": 0.7533352668213457, "grad_norm": 0.026831991970539093, "grad_norm_var": 7.912046612383912e-07, "learning_rate": 0.001483417515446243, "loss": 2.5524, "step": 20780 }, { "crossentropy": 2.493396759033203, "epoch": 0.7533715197215777, "grad_norm": 0.02780177630484104, "grad_norm_var": 6.102466097939919e-07, "learning_rate": 0.0014830044573832424, "loss": 2.5723, "step": 20781 }, { "crossentropy": 2.5481748580932617, "epoch": 0.7534077726218097, "grad_norm": 0.027181653305888176, "grad_norm_var": 6.557661046165929e-07, "learning_rate": 0.0014825914468226042, "loss": 2.5758, "step": 20782 }, { "crossentropy": 2.4651424884796143, "epoch": 0.7534440255220418, "grad_norm": 0.02629932574927807, "grad_norm_var": 6.433155453396671e-07, "learning_rate": 0.0014821784837699076, "loss": 2.444, "step": 20783 }, { "crossentropy": 2.29006290435791, "epoch": 0.7534802784222738, "grad_norm": 0.027082564309239388, "grad_norm_var": 6.610640878301159e-07, "learning_rate": 0.0014817655682307274, "loss": 2.3593, "step": 20784 }, { "crossentropy": 2.460963487625122, "epoch": 0.7535165313225058, "grad_norm": 0.02693021297454834, "grad_norm_var": 6.715841633613083e-07, "learning_rate": 0.0014813527002106443, "loss": 2.5219, "step": 20785 }, { "crossentropy": 2.3208515644073486, "epoch": 0.7535527842227379, "grad_norm": 0.026029957458376884, "grad_norm_var": 6.394425687202924e-07, "learning_rate": 0.0014809398797152307, "loss": 2.4155, "step": 20786 }, { "crossentropy": 2.3483381271362305, "epoch": 0.7535890371229699, "grad_norm": 0.026861343532800674, "grad_norm_var": 5.873399140764989e-07, "learning_rate": 0.0014805271067500647, "loss": 2.4286, "step": 20787 }, { "crossentropy": 2.4648120403289795, "epoch": 0.7536252900232019, "grad_norm": 0.02609688974916935, "grad_norm_var": 5.766150294609588e-07, "learning_rate": 0.0014801143813207224, "loss": 2.453, "step": 20788 }, { "crossentropy": 2.3129940032958984, "epoch": 0.7536615429234339, "grad_norm": 0.026693278923630714, "grad_norm_var": 4.2823953386206204e-07, "learning_rate": 0.0014797017034327775, "loss": 2.3982, "step": 20789 }, { "crossentropy": 2.5301806926727295, "epoch": 0.7536977958236659, "grad_norm": 0.026105152443051338, "grad_norm_var": 4.452438344153351e-07, "learning_rate": 0.0014792890730918012, "loss": 2.5301, "step": 20790 }, { "crossentropy": 2.466397762298584, "epoch": 0.7537340487238979, "grad_norm": 0.0276795607060194, "grad_norm_var": 4.100265489420749e-07, "learning_rate": 0.0014788764903033709, "loss": 2.4883, "step": 20791 }, { "crossentropy": 2.550553321838379, "epoch": 0.75377030162413, "grad_norm": 0.025294475257396698, "grad_norm_var": 4.4440553257454365e-07, "learning_rate": 0.0014784639550730549, "loss": 2.4693, "step": 20792 }, { "crossentropy": 2.507021188735962, "epoch": 0.753806554524362, "grad_norm": 0.025675440207123756, "grad_norm_var": 4.975055691274742e-07, "learning_rate": 0.0014780514674064293, "loss": 2.4849, "step": 20793 }, { "crossentropy": 2.4531867504119873, "epoch": 0.753842807424594, "grad_norm": 0.029720181599259377, "grad_norm_var": 1.089612236863241e-06, "learning_rate": 0.0014776390273090617, "loss": 2.5535, "step": 20794 }, { "crossentropy": 2.491814374923706, "epoch": 0.753879060324826, "grad_norm": 0.027989471331238747, "grad_norm_var": 1.1463011051060858e-06, "learning_rate": 0.0014772266347865249, "loss": 2.4931, "step": 20795 }, { "crossentropy": 2.4750494956970215, "epoch": 0.753915313225058, "grad_norm": 0.026922481134533882, "grad_norm_var": 1.1460879008994093e-06, "learning_rate": 0.0014768142898443893, "loss": 2.5091, "step": 20796 }, { "crossentropy": 2.564249038696289, "epoch": 0.75395156612529, "grad_norm": 0.02649182267487049, "grad_norm_var": 1.0954363205183896e-06, "learning_rate": 0.0014764019924882238, "loss": 2.492, "step": 20797 }, { "crossentropy": 2.498591423034668, "epoch": 0.753987819025522, "grad_norm": 0.026045911014080048, "grad_norm_var": 1.1206632924955219e-06, "learning_rate": 0.0014759897427235947, "loss": 2.48, "step": 20798 }, { "crossentropy": 2.4258406162261963, "epoch": 0.754024071925754, "grad_norm": 0.025831876322627068, "grad_norm_var": 1.1620899375080576e-06, "learning_rate": 0.001475577540556074, "loss": 2.4165, "step": 20799 }, { "crossentropy": 2.4463343620300293, "epoch": 0.7540603248259861, "grad_norm": 0.027136310935020447, "grad_norm_var": 1.1648997703474767e-06, "learning_rate": 0.0014751653859912252, "loss": 2.4552, "step": 20800 }, { "crossentropy": 2.5858452320098877, "epoch": 0.7540965777262181, "grad_norm": 0.027301980182528496, "grad_norm_var": 1.1840064219100754e-06, "learning_rate": 0.0014747532790346186, "loss": 2.4756, "step": 20801 }, { "crossentropy": 2.6097476482391357, "epoch": 0.7541328306264501, "grad_norm": 0.027053453028202057, "grad_norm_var": 1.15227296425036e-06, "learning_rate": 0.0014743412196918171, "loss": 2.6162, "step": 20802 }, { "crossentropy": 2.7629554271698, "epoch": 0.7541690835266821, "grad_norm": 0.027015309780836105, "grad_norm_var": 1.1548860464934227e-06, "learning_rate": 0.001473929207968388, "loss": 2.6509, "step": 20803 }, { "crossentropy": 2.564849376678467, "epoch": 0.7542053364269141, "grad_norm": 0.026419732719659805, "grad_norm_var": 1.1304521147351142e-06, "learning_rate": 0.0014735172438698973, "loss": 2.5654, "step": 20804 }, { "crossentropy": 2.497647285461426, "epoch": 0.7542415893271461, "grad_norm": 0.025235140696167946, "grad_norm_var": 1.2910904908916553e-06, "learning_rate": 0.001473105327401908, "loss": 2.4486, "step": 20805 }, { "crossentropy": 2.4803130626678467, "epoch": 0.7542778422273781, "grad_norm": 0.025789307430386543, "grad_norm_var": 1.3242665839717966e-06, "learning_rate": 0.0014726934585699819, "loss": 2.5202, "step": 20806 }, { "crossentropy": 2.366264581680298, "epoch": 0.7543140951276102, "grad_norm": 0.02761058509349823, "grad_norm_var": 1.31578649271966e-06, "learning_rate": 0.0014722816373796848, "loss": 2.5237, "step": 20807 }, { "crossentropy": 2.3607234954833984, "epoch": 0.7543503480278422, "grad_norm": 0.026693152263760567, "grad_norm_var": 1.1720514764866515e-06, "learning_rate": 0.0014718698638365763, "loss": 2.4461, "step": 20808 }, { "crossentropy": 2.4061951637268066, "epoch": 0.7543866009280742, "grad_norm": 0.026803167536854744, "grad_norm_var": 1.0812021563161647e-06, "learning_rate": 0.001471458137946221, "loss": 2.4241, "step": 20809 }, { "crossentropy": 2.597717046737671, "epoch": 0.7544228538283063, "grad_norm": 0.026922153308987617, "grad_norm_var": 5.104554568678583e-07, "learning_rate": 0.001471046459714177, "loss": 2.5114, "step": 20810 }, { "crossentropy": 2.4880528450012207, "epoch": 0.7544591067285383, "grad_norm": 0.02665771171450615, "grad_norm_var": 3.9302212883755793e-07, "learning_rate": 0.001470634829146006, "loss": 2.4975, "step": 20811 }, { "crossentropy": 2.4821927547454834, "epoch": 0.7544953596287703, "grad_norm": 0.02644684724509716, "grad_norm_var": 3.88018665853001e-07, "learning_rate": 0.0014702232462472688, "loss": 2.4662, "step": 20812 }, { "crossentropy": 2.3975253105163574, "epoch": 0.7545316125290024, "grad_norm": 0.026130111888051033, "grad_norm_var": 4.009743325392254e-07, "learning_rate": 0.001469811711023522, "loss": 2.4149, "step": 20813 }, { "crossentropy": 2.3686041831970215, "epoch": 0.7545678654292344, "grad_norm": 0.026174888014793396, "grad_norm_var": 3.930305880875251e-07, "learning_rate": 0.0014694002234803267, "loss": 2.4383, "step": 20814 }, { "crossentropy": 2.6168344020843506, "epoch": 0.7546041183294664, "grad_norm": 0.025836406275629997, "grad_norm_var": 3.9258220834308074e-07, "learning_rate": 0.001468988783623239, "loss": 2.5362, "step": 20815 }, { "crossentropy": 2.4658091068267822, "epoch": 0.7546403712296984, "grad_norm": 0.02713850326836109, "grad_norm_var": 3.927461064409087e-07, "learning_rate": 0.0014685773914578154, "loss": 2.4546, "step": 20816 }, { "crossentropy": 2.3574626445770264, "epoch": 0.7546766241299304, "grad_norm": 0.02619839273393154, "grad_norm_var": 3.6215533731383485e-07, "learning_rate": 0.0014681660469896142, "loss": 2.4452, "step": 20817 }, { "crossentropy": 2.3965630531311035, "epoch": 0.7547128770301624, "grad_norm": 0.02554088458418846, "grad_norm_var": 3.951025805984387e-07, "learning_rate": 0.001467754750224189, "loss": 2.402, "step": 20818 }, { "crossentropy": 2.5190858840942383, "epoch": 0.7547491299303944, "grad_norm": 0.0261378176510334, "grad_norm_var": 3.727888992964556e-07, "learning_rate": 0.0014673435011670966, "loss": 2.4453, "step": 20819 }, { "crossentropy": 2.460381031036377, "epoch": 0.7547853828306265, "grad_norm": 0.027358107268810272, "grad_norm_var": 4.3549366882938746e-07, "learning_rate": 0.001466932299823892, "loss": 2.5328, "step": 20820 }, { "crossentropy": 2.596713066101074, "epoch": 0.7548216357308585, "grad_norm": 0.027068236842751503, "grad_norm_var": 3.5662921751688825e-07, "learning_rate": 0.001466521146200127, "loss": 2.5406, "step": 20821 }, { "crossentropy": 2.574597120285034, "epoch": 0.7548578886310905, "grad_norm": 0.02720118686556816, "grad_norm_var": 3.414720038349596e-07, "learning_rate": 0.0014661100403013582, "loss": 2.4966, "step": 20822 }, { "crossentropy": 2.6318421363830566, "epoch": 0.7548941415313225, "grad_norm": 0.02795746922492981, "grad_norm_var": 3.948136484194318e-07, "learning_rate": 0.0014656989821331358, "loss": 2.6586, "step": 20823 }, { "crossentropy": 2.5303919315338135, "epoch": 0.7549303944315545, "grad_norm": 0.026671575382351875, "grad_norm_var": 3.946943331340023e-07, "learning_rate": 0.001465287971701011, "loss": 2.5416, "step": 20824 }, { "crossentropy": 2.4374828338623047, "epoch": 0.7549666473317865, "grad_norm": 0.02603808231651783, "grad_norm_var": 4.1465616121709865e-07, "learning_rate": 0.0014648770090105374, "loss": 2.4967, "step": 20825 }, { "crossentropy": 2.4647059440612793, "epoch": 0.7550029002320185, "grad_norm": 0.026750043034553528, "grad_norm_var": 4.0894030542594725e-07, "learning_rate": 0.0014644660940672626, "loss": 2.4266, "step": 20826 }, { "crossentropy": 2.3161497116088867, "epoch": 0.7550391531322506, "grad_norm": 0.026087893173098564, "grad_norm_var": 4.2345413421306506e-07, "learning_rate": 0.0014640552268767393, "loss": 2.3565, "step": 20827 }, { "crossentropy": 2.4921884536743164, "epoch": 0.7550754060324826, "grad_norm": 0.02628171443939209, "grad_norm_var": 4.273421667077652e-07, "learning_rate": 0.0014636444074445172, "loss": 2.4091, "step": 20828 }, { "crossentropy": 2.450296640396118, "epoch": 0.7551116589327146, "grad_norm": 0.026415513828396797, "grad_norm_var": 4.1699870521119676e-07, "learning_rate": 0.0014632336357761429, "loss": 2.491, "step": 20829 }, { "crossentropy": 2.506915330886841, "epoch": 0.7551479118329466, "grad_norm": 0.02732076309621334, "grad_norm_var": 4.4121062975793246e-07, "learning_rate": 0.0014628229118771668, "loss": 2.5497, "step": 20830 }, { "crossentropy": 2.4619526863098145, "epoch": 0.7551841647331786, "grad_norm": 0.02537677250802517, "grad_norm_var": 5.027530741044759e-07, "learning_rate": 0.001462412235753135, "loss": 2.5075, "step": 20831 }, { "crossentropy": 2.4308547973632812, "epoch": 0.7552204176334106, "grad_norm": 0.026762237772345543, "grad_norm_var": 4.844066668485778e-07, "learning_rate": 0.0014620016074095932, "loss": 2.4833, "step": 20832 }, { "crossentropy": 2.540066719055176, "epoch": 0.7552566705336426, "grad_norm": 0.02659720554947853, "grad_norm_var": 4.744319898865312e-07, "learning_rate": 0.00146159102685209, "loss": 2.5368, "step": 20833 }, { "crossentropy": 2.334383964538574, "epoch": 0.7552929234338747, "grad_norm": 0.02631298452615738, "grad_norm_var": 4.0288019740406663e-07, "learning_rate": 0.0014611804940861683, "loss": 2.4703, "step": 20834 }, { "crossentropy": 2.5222716331481934, "epoch": 0.7553291763341067, "grad_norm": 0.027204647660255432, "grad_norm_var": 4.017129431839924e-07, "learning_rate": 0.0014607700091173747, "loss": 2.5197, "step": 20835 }, { "crossentropy": 2.510099411010742, "epoch": 0.7553654292343387, "grad_norm": 0.027348186820745468, "grad_norm_var": 4.008654988738383e-07, "learning_rate": 0.0014603595719512546, "loss": 2.562, "step": 20836 }, { "crossentropy": 2.393052816390991, "epoch": 0.7554016821345708, "grad_norm": 0.026251858100295067, "grad_norm_var": 4.0376067808508185e-07, "learning_rate": 0.001459949182593348, "loss": 2.4899, "step": 20837 }, { "crossentropy": 2.4404454231262207, "epoch": 0.7554379350348028, "grad_norm": 0.026878206059336662, "grad_norm_var": 3.8702354489040095e-07, "learning_rate": 0.0014595388410492023, "loss": 2.4225, "step": 20838 }, { "crossentropy": 2.5656392574310303, "epoch": 0.7554741879350348, "grad_norm": 0.02744678221642971, "grad_norm_var": 3.1367952130790616e-07, "learning_rate": 0.0014591285473243565, "loss": 2.5855, "step": 20839 }, { "crossentropy": 2.536982536315918, "epoch": 0.7555104408352669, "grad_norm": 0.026831146329641342, "grad_norm_var": 3.166016931873631e-07, "learning_rate": 0.0014587183014243522, "loss": 2.4662, "step": 20840 }, { "crossentropy": 2.479341745376587, "epoch": 0.7555466937354989, "grad_norm": 0.02595762349665165, "grad_norm_var": 3.2323831314060895e-07, "learning_rate": 0.0014583081033547325, "loss": 2.5219, "step": 20841 }, { "crossentropy": 2.651715040206909, "epoch": 0.7555829466357309, "grad_norm": 0.02612384594976902, "grad_norm_var": 3.3638515496235003e-07, "learning_rate": 0.001457897953121035, "loss": 2.5477, "step": 20842 }, { "crossentropy": 2.3368911743164062, "epoch": 0.7556191995359629, "grad_norm": 0.025388676673173904, "grad_norm_var": 4.123387971516432e-07, "learning_rate": 0.0014574878507288014, "loss": 2.3501, "step": 20843 }, { "crossentropy": 2.4914917945861816, "epoch": 0.7556554524361949, "grad_norm": 0.02680560015141964, "grad_norm_var": 4.120699077639345e-07, "learning_rate": 0.0014570777961835717, "loss": 2.4445, "step": 20844 }, { "crossentropy": 2.358215570449829, "epoch": 0.7556917053364269, "grad_norm": 0.02646912820637226, "grad_norm_var": 4.111889691825928e-07, "learning_rate": 0.0014566677894908813, "loss": 2.4603, "step": 20845 }, { "crossentropy": 2.5030088424682617, "epoch": 0.755727958236659, "grad_norm": 0.02645515650510788, "grad_norm_var": 3.7105010635463527e-07, "learning_rate": 0.0014562578306562713, "loss": 2.546, "step": 20846 }, { "crossentropy": 2.5071804523468018, "epoch": 0.755764211136891, "grad_norm": 0.026849307119846344, "grad_norm_var": 2.834626578679736e-07, "learning_rate": 0.0014558479196852753, "loss": 2.4904, "step": 20847 }, { "crossentropy": 2.3789234161376953, "epoch": 0.755800464037123, "grad_norm": 0.026219036430120468, "grad_norm_var": 2.905278914362332e-07, "learning_rate": 0.001455438056583433, "loss": 2.4301, "step": 20848 }, { "crossentropy": 2.481750965118408, "epoch": 0.755836716937355, "grad_norm": 0.026413317769765854, "grad_norm_var": 2.920039913622037e-07, "learning_rate": 0.0014550282413562793, "loss": 2.5143, "step": 20849 }, { "crossentropy": 2.5013389587402344, "epoch": 0.755872969837587, "grad_norm": 0.0277702696621418, "grad_norm_var": 3.767923249827724e-07, "learning_rate": 0.0014546184740093472, "loss": 2.5153, "step": 20850 }, { "crossentropy": 2.5121006965637207, "epoch": 0.755909222737819, "grad_norm": 0.026620833203196526, "grad_norm_var": 3.5498216024298693e-07, "learning_rate": 0.001454208754548173, "loss": 2.4534, "step": 20851 }, { "crossentropy": 2.491323947906494, "epoch": 0.755945475638051, "grad_norm": 0.026057977229356766, "grad_norm_var": 3.327750458497735e-07, "learning_rate": 0.0014537990829782915, "loss": 2.5074, "step": 20852 }, { "crossentropy": 2.531611680984497, "epoch": 0.755981728538283, "grad_norm": 0.02611084096133709, "grad_norm_var": 3.393166711537198e-07, "learning_rate": 0.0014533894593052338, "loss": 2.4985, "step": 20853 }, { "crossentropy": 2.6139631271362305, "epoch": 0.7560179814385151, "grad_norm": 0.026588991284370422, "grad_norm_var": 3.309187461351803e-07, "learning_rate": 0.0014529798835345353, "loss": 2.549, "step": 20854 }, { "crossentropy": 2.5450916290283203, "epoch": 0.7560542343387471, "grad_norm": 0.02614707313477993, "grad_norm_var": 2.735997976128912e-07, "learning_rate": 0.0014525703556717245, "loss": 2.5429, "step": 20855 }, { "crossentropy": 2.298125743865967, "epoch": 0.7560904872389791, "grad_norm": 0.02634052187204361, "grad_norm_var": 2.6211168801050913e-07, "learning_rate": 0.0014521608757223365, "loss": 2.4816, "step": 20856 }, { "crossentropy": 2.3890480995178223, "epoch": 0.7561267401392111, "grad_norm": 0.026058347895741463, "grad_norm_var": 2.568733555597154e-07, "learning_rate": 0.0014517514436918994, "loss": 2.435, "step": 20857 }, { "crossentropy": 2.5172133445739746, "epoch": 0.7561629930394431, "grad_norm": 0.02711152471601963, "grad_norm_var": 2.813200667498335e-07, "learning_rate": 0.0014513420595859423, "loss": 2.5136, "step": 20858 }, { "crossentropy": 2.41341233253479, "epoch": 0.7561992459396751, "grad_norm": 0.02659791335463524, "grad_norm_var": 1.9951015736047126e-07, "learning_rate": 0.0014509327234099956, "loss": 2.5045, "step": 20859 }, { "crossentropy": 2.527341604232788, "epoch": 0.7562354988399071, "grad_norm": 0.026904916390776634, "grad_norm_var": 2.036637567542996e-07, "learning_rate": 0.00145052343516959, "loss": 2.5738, "step": 20860 }, { "crossentropy": 2.5413453578948975, "epoch": 0.7562717517401392, "grad_norm": 0.02618805505335331, "grad_norm_var": 2.1143344571769063e-07, "learning_rate": 0.0014501141948702512, "loss": 2.512, "step": 20861 }, { "crossentropy": 2.4344139099121094, "epoch": 0.7563080046403712, "grad_norm": 0.02605541981756687, "grad_norm_var": 2.2525635255624815e-07, "learning_rate": 0.0014497050025175078, "loss": 2.4728, "step": 20862 }, { "crossentropy": 2.4638113975524902, "epoch": 0.7563442575406032, "grad_norm": 0.025873959064483643, "grad_norm_var": 2.3956586588236063e-07, "learning_rate": 0.0014492958581168852, "loss": 2.4161, "step": 20863 }, { "crossentropy": 2.542660713195801, "epoch": 0.7563805104408353, "grad_norm": 0.026071693748235703, "grad_norm_var": 2.452870411048224e-07, "learning_rate": 0.001448886761673912, "loss": 2.5009, "step": 20864 }, { "crossentropy": 2.195760726928711, "epoch": 0.7564167633410673, "grad_norm": 0.02564997598528862, "grad_norm_var": 2.8360447192523017e-07, "learning_rate": 0.0014484777131941117, "loss": 2.328, "step": 20865 }, { "crossentropy": 2.561466932296753, "epoch": 0.7564530162412993, "grad_norm": 0.027147827669978142, "grad_norm_var": 1.9279181254088356e-07, "learning_rate": 0.0014480687126830084, "loss": 2.5056, "step": 20866 }, { "crossentropy": 2.372957468032837, "epoch": 0.7564892691415314, "grad_norm": 0.025993019342422485, "grad_norm_var": 1.9436733280063973e-07, "learning_rate": 0.0014476597601461273, "loss": 2.4387, "step": 20867 }, { "crossentropy": 2.4500701427459717, "epoch": 0.7565255220417634, "grad_norm": 0.027491504326462746, "grad_norm_var": 2.7537393468509733e-07, "learning_rate": 0.0014472508555889935, "loss": 2.4706, "step": 20868 }, { "crossentropy": 2.446840763092041, "epoch": 0.7565617749419954, "grad_norm": 0.027217712253332138, "grad_norm_var": 3.099028349448987e-07, "learning_rate": 0.0014468419990171265, "loss": 2.3895, "step": 20869 }, { "crossentropy": 2.3375749588012695, "epoch": 0.7565980278422274, "grad_norm": 0.02576814405620098, "grad_norm_var": 3.384337715620165e-07, "learning_rate": 0.0014464331904360524, "loss": 2.4171, "step": 20870 }, { "crossentropy": 2.466022253036499, "epoch": 0.7566342807424594, "grad_norm": 0.02598675712943077, "grad_norm_var": 3.4573724582154243e-07, "learning_rate": 0.0014460244298512893, "loss": 2.5035, "step": 20871 }, { "crossentropy": 2.4305577278137207, "epoch": 0.7566705336426914, "grad_norm": 0.027592651546001434, "grad_norm_var": 4.331988148207714e-07, "learning_rate": 0.0014456157172683592, "loss": 2.4645, "step": 20872 }, { "crossentropy": 2.526113986968994, "epoch": 0.7567067865429234, "grad_norm": 0.026639580726623535, "grad_norm_var": 4.2149370692647013e-07, "learning_rate": 0.0014452070526927863, "loss": 2.5735, "step": 20873 }, { "crossentropy": 2.393282651901245, "epoch": 0.7567430394431555, "grad_norm": 0.026212885975837708, "grad_norm_var": 4.008703243379407e-07, "learning_rate": 0.0014447984361300832, "loss": 2.4088, "step": 20874 }, { "crossentropy": 2.556161880493164, "epoch": 0.7567792923433875, "grad_norm": 0.02757672406733036, "grad_norm_var": 4.784873839431958e-07, "learning_rate": 0.0014443898675857725, "loss": 2.5599, "step": 20875 }, { "crossentropy": 2.3795626163482666, "epoch": 0.7568155452436195, "grad_norm": 0.025548676028847694, "grad_norm_var": 5.244183581892912e-07, "learning_rate": 0.0014439813470653745, "loss": 2.4525, "step": 20876 }, { "crossentropy": 2.645298957824707, "epoch": 0.7568517981438515, "grad_norm": 0.02619459666311741, "grad_norm_var": 5.242026680214815e-07, "learning_rate": 0.001443572874574403, "loss": 2.4671, "step": 20877 }, { "crossentropy": 2.3738937377929688, "epoch": 0.7568880510440835, "grad_norm": 0.02608521655201912, "grad_norm_var": 5.22734946481936e-07, "learning_rate": 0.0014431644501183784, "loss": 2.3694, "step": 20878 }, { "crossentropy": 2.318633556365967, "epoch": 0.7569243039443155, "grad_norm": 0.025158554315567017, "grad_norm_var": 6.087809425509942e-07, "learning_rate": 0.001442756073702814, "loss": 2.4311, "step": 20879 }, { "crossentropy": 2.4540205001831055, "epoch": 0.7569605568445475, "grad_norm": 0.02639133669435978, "grad_norm_var": 6.013463184709507e-07, "learning_rate": 0.0014423477453332273, "loss": 2.443, "step": 20880 }, { "crossentropy": 2.4219746589660645, "epoch": 0.7569968097447796, "grad_norm": 0.024981675669550896, "grad_norm_var": 6.975136197166313e-07, "learning_rate": 0.0014419394650151364, "loss": 2.3856, "step": 20881 }, { "crossentropy": 2.62182354927063, "epoch": 0.7570330626450116, "grad_norm": 0.02661917544901371, "grad_norm_var": 6.604485435397652e-07, "learning_rate": 0.0014415312327540491, "loss": 2.5581, "step": 20882 }, { "crossentropy": 2.3814334869384766, "epoch": 0.7570693155452436, "grad_norm": 0.026377690955996513, "grad_norm_var": 6.518419514562329e-07, "learning_rate": 0.0014411230485554833, "loss": 2.4181, "step": 20883 }, { "crossentropy": 2.4074649810791016, "epoch": 0.7571055684454756, "grad_norm": 0.02550072968006134, "grad_norm_var": 6.005732505300528e-07, "learning_rate": 0.0014407149124249525, "loss": 2.436, "step": 20884 }, { "crossentropy": 2.381181240081787, "epoch": 0.7571418213457076, "grad_norm": 0.025861462578177452, "grad_norm_var": 5.388704938182603e-07, "learning_rate": 0.0014403068243679668, "loss": 2.4739, "step": 20885 }, { "crossentropy": 2.5799663066864014, "epoch": 0.7571780742459396, "grad_norm": 0.02502378262579441, "grad_norm_var": 6.119932354119722e-07, "learning_rate": 0.0014398987843900412, "loss": 2.4696, "step": 20886 }, { "crossentropy": 2.561711549758911, "epoch": 0.7572143271461717, "grad_norm": 0.027292033657431602, "grad_norm_var": 6.971211012706442e-07, "learning_rate": 0.0014394907924966832, "loss": 2.4858, "step": 20887 }, { "crossentropy": 2.5302629470825195, "epoch": 0.7572505800464037, "grad_norm": 0.025613030418753624, "grad_norm_var": 5.72099892558866e-07, "learning_rate": 0.001439082848693406, "loss": 2.5329, "step": 20888 }, { "crossentropy": 2.5195937156677246, "epoch": 0.7572868329466357, "grad_norm": 0.026203783228993416, "grad_norm_var": 5.507180028137733e-07, "learning_rate": 0.0014386749529857197, "loss": 2.5423, "step": 20889 }, { "crossentropy": 2.518496513366699, "epoch": 0.7573230858468677, "grad_norm": 0.028165556490421295, "grad_norm_var": 8.340155077000052e-07, "learning_rate": 0.001438267105379133, "loss": 2.4265, "step": 20890 }, { "crossentropy": 2.559237241744995, "epoch": 0.7573593387470998, "grad_norm": 0.02586774341762066, "grad_norm_var": 6.942179692075977e-07, "learning_rate": 0.0014378593058791528, "loss": 2.501, "step": 20891 }, { "crossentropy": 2.573338747024536, "epoch": 0.7573955916473318, "grad_norm": 0.027243666350841522, "grad_norm_var": 7.592801649597963e-07, "learning_rate": 0.0014374515544912897, "loss": 2.5887, "step": 20892 }, { "crossentropy": 2.4956343173980713, "epoch": 0.7574318445475638, "grad_norm": 0.026724006980657578, "grad_norm_var": 7.791510925993112e-07, "learning_rate": 0.0014370438512210481, "loss": 2.554, "step": 20893 }, { "crossentropy": 2.5279550552368164, "epoch": 0.7574680974477959, "grad_norm": 0.026408126577734947, "grad_norm_var": 7.809697352108609e-07, "learning_rate": 0.0014366361960739377, "loss": 2.5962, "step": 20894 }, { "crossentropy": 2.408656358718872, "epoch": 0.7575043503480279, "grad_norm": 0.026332220062613487, "grad_norm_var": 7.018158301803796e-07, "learning_rate": 0.0014362285890554616, "loss": 2.4911, "step": 20895 }, { "crossentropy": 2.4227850437164307, "epoch": 0.7575406032482599, "grad_norm": 0.026471102610230446, "grad_norm_var": 7.033138411535166e-07, "learning_rate": 0.0014358210301711266, "loss": 2.4166, "step": 20896 }, { "crossentropy": 2.4904723167419434, "epoch": 0.7575768561484919, "grad_norm": 0.025294411927461624, "grad_norm_var": 6.547525352427191e-07, "learning_rate": 0.0014354135194264395, "loss": 2.4588, "step": 20897 }, { "crossentropy": 2.600752353668213, "epoch": 0.7576131090487239, "grad_norm": 0.028148939833045006, "grad_norm_var": 8.635847131056024e-07, "learning_rate": 0.0014350060568269019, "loss": 2.5394, "step": 20898 }, { "crossentropy": 2.5608866214752197, "epoch": 0.7576493619489559, "grad_norm": 0.027535540983080864, "grad_norm_var": 9.426913725984656e-07, "learning_rate": 0.001434598642378016, "loss": 2.5452, "step": 20899 }, { "crossentropy": 2.509077548980713, "epoch": 0.757685614849188, "grad_norm": 0.02587391436100006, "grad_norm_var": 9.026499704572656e-07, "learning_rate": 0.0014341912760852877, "loss": 2.5036, "step": 20900 }, { "crossentropy": 2.424119472503662, "epoch": 0.75772186774942, "grad_norm": 0.02641456574201584, "grad_norm_var": 8.744064620297883e-07, "learning_rate": 0.0014337839579542156, "loss": 2.4197, "step": 20901 }, { "crossentropy": 2.5331783294677734, "epoch": 0.757758120649652, "grad_norm": 0.02733517996966839, "grad_norm_var": 7.415700102708066e-07, "learning_rate": 0.0014333766879903054, "loss": 2.4625, "step": 20902 }, { "crossentropy": 2.3909847736358643, "epoch": 0.757794373549884, "grad_norm": 0.02632344700396061, "grad_norm_var": 7.215177158815626e-07, "learning_rate": 0.0014329694661990534, "loss": 2.4609, "step": 20903 }, { "crossentropy": 2.4278743267059326, "epoch": 0.757830626450116, "grad_norm": 0.02652377262711525, "grad_norm_var": 6.508123758695254e-07, "learning_rate": 0.0014325622925859623, "loss": 2.4983, "step": 20904 }, { "crossentropy": 2.440169334411621, "epoch": 0.757866879350348, "grad_norm": 0.025581279769539833, "grad_norm_var": 7.14485258671811e-07, "learning_rate": 0.0014321551671565324, "loss": 2.541, "step": 20905 }, { "crossentropy": 2.602633237838745, "epoch": 0.75790313225058, "grad_norm": 0.026981128379702568, "grad_norm_var": 5.612773277859893e-07, "learning_rate": 0.0014317480899162616, "loss": 2.6132, "step": 20906 }, { "crossentropy": 2.6256096363067627, "epoch": 0.757939385150812, "grad_norm": 0.026481302455067635, "grad_norm_var": 5.276672839638989e-07, "learning_rate": 0.0014313410608706462, "loss": 2.5812, "step": 20907 }, { "crossentropy": 2.3376622200012207, "epoch": 0.7579756380510441, "grad_norm": 0.026610830798745155, "grad_norm_var": 4.987689046995521e-07, "learning_rate": 0.0014309340800251868, "loss": 2.2959, "step": 20908 }, { "crossentropy": 2.549616575241089, "epoch": 0.7580118909512761, "grad_norm": 0.027352098375558853, "grad_norm_var": 5.367424053066e-07, "learning_rate": 0.0014305271473853776, "loss": 2.532, "step": 20909 }, { "crossentropy": 2.4701714515686035, "epoch": 0.7580481438515081, "grad_norm": 0.02652471512556076, "grad_norm_var": 5.345433294852315e-07, "learning_rate": 0.0014301202629567178, "loss": 2.5311, "step": 20910 }, { "crossentropy": 2.3699758052825928, "epoch": 0.7580843967517401, "grad_norm": 0.026534222066402435, "grad_norm_var": 5.295708582699384e-07, "learning_rate": 0.0014297134267447, "loss": 2.4007, "step": 20911 }, { "crossentropy": 2.4953291416168213, "epoch": 0.7581206496519721, "grad_norm": 0.025999782606959343, "grad_norm_var": 5.53072877878151e-07, "learning_rate": 0.0014293066387548198, "loss": 2.5085, "step": 20912 }, { "crossentropy": 2.556948184967041, "epoch": 0.7581569025522041, "grad_norm": 0.026311486959457397, "grad_norm_var": 4.4139398423999277e-07, "learning_rate": 0.001428899898992574, "loss": 2.4832, "step": 20913 }, { "crossentropy": 2.4775707721710205, "epoch": 0.7581931554524362, "grad_norm": 0.026082394644618034, "grad_norm_var": 2.9756689324266355e-07, "learning_rate": 0.0014284932074634527, "loss": 2.4719, "step": 20914 }, { "crossentropy": 2.325026035308838, "epoch": 0.7582294083526682, "grad_norm": 0.025465551763772964, "grad_norm_var": 2.87595156324832e-07, "learning_rate": 0.0014280865641729523, "loss": 2.387, "step": 20915 }, { "crossentropy": 2.5291178226470947, "epoch": 0.7582656612529002, "grad_norm": 0.027423355728387833, "grad_norm_var": 3.29013885316809e-07, "learning_rate": 0.0014276799691265636, "loss": 2.4989, "step": 20916 }, { "crossentropy": 2.5046586990356445, "epoch": 0.7583019141531323, "grad_norm": 0.026961708441376686, "grad_norm_var": 3.41741829297049e-07, "learning_rate": 0.0014272734223297761, "loss": 2.5018, "step": 20917 }, { "crossentropy": 2.5429794788360596, "epoch": 0.7583381670533643, "grad_norm": 0.026243962347507477, "grad_norm_var": 2.991253195938973e-07, "learning_rate": 0.0014268669237880848, "loss": 2.5292, "step": 20918 }, { "crossentropy": 2.4570369720458984, "epoch": 0.7583744199535963, "grad_norm": 0.02618888020515442, "grad_norm_var": 3.027531709875412e-07, "learning_rate": 0.0014264604735069763, "loss": 2.5367, "step": 20919 }, { "crossentropy": 2.6305272579193115, "epoch": 0.7584106728538283, "grad_norm": 0.027268819510936737, "grad_norm_var": 3.443624336716405e-07, "learning_rate": 0.001426054071491943, "loss": 2.5011, "step": 20920 }, { "crossentropy": 2.539022207260132, "epoch": 0.7584469257540604, "grad_norm": 0.026495937258005142, "grad_norm_var": 2.845201160751413e-07, "learning_rate": 0.0014256477177484734, "loss": 2.4824, "step": 20921 }, { "crossentropy": 2.425261974334717, "epoch": 0.7584831786542924, "grad_norm": 0.026612015441060066, "grad_norm_var": 2.722054922488573e-07, "learning_rate": 0.0014252414122820551, "loss": 2.4828, "step": 20922 }, { "crossentropy": 2.599026679992676, "epoch": 0.7585194315545244, "grad_norm": 0.027010614052414894, "grad_norm_var": 2.8593941910851767e-07, "learning_rate": 0.0014248351550981775, "loss": 2.5652, "step": 20923 }, { "crossentropy": 2.358431339263916, "epoch": 0.7585556844547564, "grad_norm": 0.02585347555577755, "grad_norm_var": 3.1745326656770765e-07, "learning_rate": 0.0014244289462023263, "loss": 2.4208, "step": 20924 }, { "crossentropy": 2.4198520183563232, "epoch": 0.7585919373549884, "grad_norm": 0.02625620923936367, "grad_norm_var": 2.710114447494836e-07, "learning_rate": 0.001424022785599987, "loss": 2.4501, "step": 20925 }, { "crossentropy": 2.555436372756958, "epoch": 0.7586281902552204, "grad_norm": 0.02825143188238144, "grad_norm_var": 4.740832170403806e-07, "learning_rate": 0.0014236166732966488, "loss": 2.5802, "step": 20926 }, { "crossentropy": 2.6515045166015625, "epoch": 0.7586644431554525, "grad_norm": 0.027033675462007523, "grad_norm_var": 4.879580566183403e-07, "learning_rate": 0.0014232106092977926, "loss": 2.569, "step": 20927 }, { "crossentropy": 2.558417558670044, "epoch": 0.7587006960556845, "grad_norm": 0.026579244062304497, "grad_norm_var": 4.6324973096040163e-07, "learning_rate": 0.001422804593608905, "loss": 2.4879, "step": 20928 }, { "crossentropy": 2.5407657623291016, "epoch": 0.7587369489559165, "grad_norm": 0.02597769908607006, "grad_norm_var": 4.842738615529543e-07, "learning_rate": 0.0014223986262354722, "loss": 2.4088, "step": 20929 }, { "crossentropy": 2.42610764503479, "epoch": 0.7587732018561485, "grad_norm": 0.025931255891919136, "grad_norm_var": 4.962644557735448e-07, "learning_rate": 0.001421992707182973, "loss": 2.4776, "step": 20930 }, { "crossentropy": 2.331587553024292, "epoch": 0.7588094547563805, "grad_norm": 0.026102688163518906, "grad_norm_var": 4.255078850286716e-07, "learning_rate": 0.001421586836456894, "loss": 2.4132, "step": 20931 }, { "crossentropy": 2.436396360397339, "epoch": 0.7588457076566125, "grad_norm": 0.02598262019455433, "grad_norm_var": 4.0417055788808234e-07, "learning_rate": 0.001421181014062715, "loss": 2.4581, "step": 20932 }, { "crossentropy": 2.496323585510254, "epoch": 0.7588819605568445, "grad_norm": 0.02724575251340866, "grad_norm_var": 4.2492335938683574e-07, "learning_rate": 0.001420775240005916, "loss": 2.4956, "step": 20933 }, { "crossentropy": 2.4135491847991943, "epoch": 0.7589182134570766, "grad_norm": 0.026368508115410805, "grad_norm_var": 4.2056758906074074e-07, "learning_rate": 0.0014203695142919793, "loss": 2.4221, "step": 20934 }, { "crossentropy": 2.4251651763916016, "epoch": 0.7589544663573086, "grad_norm": 0.02772924117743969, "grad_norm_var": 4.900887514022155e-07, "learning_rate": 0.0014199638369263856, "loss": 2.4516, "step": 20935 }, { "crossentropy": 2.515697717666626, "epoch": 0.7589907192575406, "grad_norm": 0.026461029425263405, "grad_norm_var": 4.6623539056131887e-07, "learning_rate": 0.0014195582079146124, "loss": 2.4436, "step": 20936 }, { "crossentropy": 2.5598371028900146, "epoch": 0.7590269721577726, "grad_norm": 0.028778010979294777, "grad_norm_var": 7.545212348418368e-07, "learning_rate": 0.0014191526272621401, "loss": 2.5094, "step": 20937 }, { "crossentropy": 2.307023048400879, "epoch": 0.7590632250580046, "grad_norm": 0.027040250599384308, "grad_norm_var": 7.574851199686755e-07, "learning_rate": 0.001418747094974444, "loss": 2.3699, "step": 20938 }, { "crossentropy": 2.419278144836426, "epoch": 0.7590994779582366, "grad_norm": 0.026753999292850494, "grad_norm_var": 7.5397055043618e-07, "learning_rate": 0.0014183416110570034, "loss": 2.4501, "step": 20939 }, { "crossentropy": 2.577455520629883, "epoch": 0.7591357308584686, "grad_norm": 0.02750161662697792, "grad_norm_var": 7.219907804097234e-07, "learning_rate": 0.001417936175515298, "loss": 2.4705, "step": 20940 }, { "crossentropy": 2.408320426940918, "epoch": 0.7591719837587007, "grad_norm": 0.025171760469675064, "grad_norm_var": 8.849043620237062e-07, "learning_rate": 0.0014175307883547972, "loss": 2.3852, "step": 20941 }, { "crossentropy": 2.373441696166992, "epoch": 0.7592082366589327, "grad_norm": 0.026026561856269836, "grad_norm_var": 7.657328890255921e-07, "learning_rate": 0.0014171254495809794, "loss": 2.4896, "step": 20942 }, { "crossentropy": 2.4740891456604004, "epoch": 0.7592444895591647, "grad_norm": 0.026118280366063118, "grad_norm_var": 7.734418105888868e-07, "learning_rate": 0.0014167201591993217, "loss": 2.5523, "step": 20943 }, { "crossentropy": 2.43381667137146, "epoch": 0.7592807424593968, "grad_norm": 0.02649613283574581, "grad_norm_var": 7.742202497451929e-07, "learning_rate": 0.0014163149172152938, "loss": 2.4158, "step": 20944 }, { "crossentropy": 2.5638678073883057, "epoch": 0.7593169953596288, "grad_norm": 0.02672393061220646, "grad_norm_var": 7.465755694861191e-07, "learning_rate": 0.0014159097236343737, "loss": 2.5395, "step": 20945 }, { "crossentropy": 2.4298834800720215, "epoch": 0.7593532482598608, "grad_norm": 0.02600310929119587, "grad_norm_var": 7.399934122867402e-07, "learning_rate": 0.0014155045784620302, "loss": 2.4533, "step": 20946 }, { "crossentropy": 2.6242384910583496, "epoch": 0.7593895011600929, "grad_norm": 0.027064740657806396, "grad_norm_var": 7.268045689359648e-07, "learning_rate": 0.0014150994817037365, "loss": 2.5431, "step": 20947 }, { "crossentropy": 2.4471328258514404, "epoch": 0.7594257540603249, "grad_norm": 0.026016289368271828, "grad_norm_var": 7.235804360492531e-07, "learning_rate": 0.001414694433364968, "loss": 2.4656, "step": 20948 }, { "crossentropy": 2.4534313678741455, "epoch": 0.7594620069605569, "grad_norm": 0.025612210854887962, "grad_norm_var": 7.75564311143497e-07, "learning_rate": 0.001414289433451189, "loss": 2.4397, "step": 20949 }, { "crossentropy": 2.6271274089813232, "epoch": 0.7594982598607889, "grad_norm": 0.02676531858742237, "grad_norm_var": 7.722791696839389e-07, "learning_rate": 0.0014138844819678725, "loss": 2.577, "step": 20950 }, { "crossentropy": 2.561197280883789, "epoch": 0.7595345127610209, "grad_norm": 0.025946181267499924, "grad_norm_var": 7.123620192138324e-07, "learning_rate": 0.0014134795789204901, "loss": 2.5242, "step": 20951 }, { "crossentropy": 2.3388822078704834, "epoch": 0.7595707656612529, "grad_norm": 0.026224134489893913, "grad_norm_var": 7.180468349528146e-07, "learning_rate": 0.001413074724314507, "loss": 2.4303, "step": 20952 }, { "crossentropy": 2.607987403869629, "epoch": 0.7596070185614849, "grad_norm": 0.026240255683660507, "grad_norm_var": 3.5488384199638727e-07, "learning_rate": 0.0014126699181553949, "loss": 2.5322, "step": 20953 }, { "crossentropy": 2.5377628803253174, "epoch": 0.759643271461717, "grad_norm": 0.027338743209838867, "grad_norm_var": 3.8766313463587526e-07, "learning_rate": 0.0014122651604486175, "loss": 2.5504, "step": 20954 }, { "crossentropy": 2.60538387298584, "epoch": 0.759679524361949, "grad_norm": 0.027846522629261017, "grad_norm_var": 5.174426000758546e-07, "learning_rate": 0.001411860451199644, "loss": 2.6024, "step": 20955 }, { "crossentropy": 2.565006732940674, "epoch": 0.759715777262181, "grad_norm": 0.02757629193365574, "grad_norm_var": 5.283266144055169e-07, "learning_rate": 0.0014114557904139418, "loss": 2.5591, "step": 20956 }, { "crossentropy": 2.5407581329345703, "epoch": 0.759752030162413, "grad_norm": 0.027683015912771225, "grad_norm_var": 4.950968290718388e-07, "learning_rate": 0.0014110511780969749, "loss": 2.4721, "step": 20957 }, { "crossentropy": 2.5828566551208496, "epoch": 0.759788283062645, "grad_norm": 0.025775866582989693, "grad_norm_var": 5.183633234160171e-07, "learning_rate": 0.0014106466142542067, "loss": 2.4904, "step": 20958 }, { "crossentropy": 2.494800329208374, "epoch": 0.759824535962877, "grad_norm": 0.026487581431865692, "grad_norm_var": 5.036873592797423e-07, "learning_rate": 0.0014102420988911046, "loss": 2.4795, "step": 20959 }, { "crossentropy": 2.4162635803222656, "epoch": 0.759860788863109, "grad_norm": 0.02576366253197193, "grad_norm_var": 5.485861240897415e-07, "learning_rate": 0.001409837632013129, "loss": 2.3887, "step": 20960 }, { "crossentropy": 2.538055658340454, "epoch": 0.759897041763341, "grad_norm": 0.026016587391495705, "grad_norm_var": 5.65032086246433e-07, "learning_rate": 0.0014094332136257459, "loss": 2.5199, "step": 20961 }, { "crossentropy": 2.443340301513672, "epoch": 0.7599332946635731, "grad_norm": 0.026034485548734665, "grad_norm_var": 5.629206102140035e-07, "learning_rate": 0.0014090288437344151, "loss": 2.4533, "step": 20962 }, { "crossentropy": 2.424046516418457, "epoch": 0.7599695475638051, "grad_norm": 0.026423119008541107, "grad_norm_var": 5.424325611911678e-07, "learning_rate": 0.0014086245223445986, "loss": 2.4476, "step": 20963 }, { "crossentropy": 2.432941436767578, "epoch": 0.7600058004640371, "grad_norm": 0.026136081665754318, "grad_norm_var": 5.358527752789687e-07, "learning_rate": 0.0014082202494617602, "loss": 2.496, "step": 20964 }, { "crossentropy": 2.4918675422668457, "epoch": 0.7600420533642691, "grad_norm": 0.02714497037231922, "grad_norm_var": 5.029113633515791e-07, "learning_rate": 0.0014078160250913575, "loss": 2.4622, "step": 20965 }, { "crossentropy": 2.6055970191955566, "epoch": 0.7600783062645011, "grad_norm": 0.027059057727456093, "grad_norm_var": 5.15261436280602e-07, "learning_rate": 0.0014074118492388493, "loss": 2.5044, "step": 20966 }, { "crossentropy": 2.481867551803589, "epoch": 0.7601145591647331, "grad_norm": 0.02643231302499771, "grad_norm_var": 4.872615882600076e-07, "learning_rate": 0.0014070077219096977, "loss": 2.471, "step": 20967 }, { "crossentropy": 2.5809009075164795, "epoch": 0.7601508120649652, "grad_norm": 0.026297517120838165, "grad_norm_var": 4.835642242285882e-07, "learning_rate": 0.0014066036431093583, "loss": 2.4681, "step": 20968 }, { "crossentropy": 2.407191276550293, "epoch": 0.7601870649651972, "grad_norm": 0.02632283605635166, "grad_norm_var": 4.795779127068998e-07, "learning_rate": 0.0014061996128432908, "loss": 2.4364, "step": 20969 }, { "crossentropy": 2.388843297958374, "epoch": 0.7602233178654292, "grad_norm": 0.026138417422771454, "grad_norm_var": 4.5878431094458806e-07, "learning_rate": 0.0014057956311169495, "loss": 2.4383, "step": 20970 }, { "crossentropy": 2.6403026580810547, "epoch": 0.7602595707656613, "grad_norm": 0.028661426156759262, "grad_norm_var": 6.388631332184993e-07, "learning_rate": 0.0014053916979357933, "loss": 2.5855, "step": 20971 }, { "crossentropy": 2.4468419551849365, "epoch": 0.7602958236658933, "grad_norm": 0.027414556592702866, "grad_norm_var": 6.199206550541274e-07, "learning_rate": 0.0014049878133052785, "loss": 2.5008, "step": 20972 }, { "crossentropy": 2.4761855602264404, "epoch": 0.7603320765661253, "grad_norm": 0.02919902838766575, "grad_norm_var": 9.800602025913554e-07, "learning_rate": 0.0014045839772308573, "loss": 2.5702, "step": 20973 }, { "crossentropy": 2.308999538421631, "epoch": 0.7603683294663574, "grad_norm": 0.026579923927783966, "grad_norm_var": 9.206724493161673e-07, "learning_rate": 0.0014041801897179873, "loss": 2.3994, "step": 20974 }, { "crossentropy": 2.5503528118133545, "epoch": 0.7604045823665894, "grad_norm": 0.02702300064265728, "grad_norm_var": 9.193579176195218e-07, "learning_rate": 0.0014037764507721206, "loss": 2.5231, "step": 20975 }, { "crossentropy": 2.656508445739746, "epoch": 0.7604408352668214, "grad_norm": 0.026866436004638672, "grad_norm_var": 8.443915421591878e-07, "learning_rate": 0.001403372760398709, "loss": 2.6064, "step": 20976 }, { "crossentropy": 2.603559970855713, "epoch": 0.7604770881670534, "grad_norm": 0.026536237448453903, "grad_norm_var": 8.028759050819941e-07, "learning_rate": 0.0014029691186032078, "loss": 2.5489, "step": 20977 }, { "crossentropy": 2.5529026985168457, "epoch": 0.7605133410672854, "grad_norm": 0.02742987871170044, "grad_norm_var": 7.650585455696759e-07, "learning_rate": 0.0014025655253910657, "loss": 2.4525, "step": 20978 }, { "crossentropy": 2.4726829528808594, "epoch": 0.7605495939675174, "grad_norm": 0.02732047438621521, "grad_norm_var": 7.488707720150796e-07, "learning_rate": 0.0014021619807677355, "loss": 2.5194, "step": 20979 }, { "crossentropy": 2.0892996788024902, "epoch": 0.7605858468677494, "grad_norm": 0.026032764464616776, "grad_norm_var": 7.619229442445598e-07, "learning_rate": 0.0014017584847386695, "loss": 2.2769, "step": 20980 }, { "crossentropy": 2.6680150032043457, "epoch": 0.7606220997679815, "grad_norm": 0.02615232951939106, "grad_norm_var": 8.081148153747287e-07, "learning_rate": 0.0014013550373093136, "loss": 2.5447, "step": 20981 }, { "crossentropy": 2.5157580375671387, "epoch": 0.7606583526682135, "grad_norm": 0.02586137130856514, "grad_norm_var": 8.830093619118425e-07, "learning_rate": 0.0014009516384851212, "loss": 2.5387, "step": 20982 }, { "crossentropy": 2.6352367401123047, "epoch": 0.7606946055684455, "grad_norm": 0.027486542239785194, "grad_norm_var": 8.87887072583771e-07, "learning_rate": 0.0014005482882715387, "loss": 2.5036, "step": 20983 }, { "crossentropy": 2.5457684993743896, "epoch": 0.7607308584686775, "grad_norm": 0.026783207431435585, "grad_norm_var": 8.598797848266907e-07, "learning_rate": 0.001400144986674012, "loss": 2.5795, "step": 20984 }, { "crossentropy": 2.564793825149536, "epoch": 0.7607671113689095, "grad_norm": 0.026647260412573814, "grad_norm_var": 8.376841001980772e-07, "learning_rate": 0.0013997417336979918, "loss": 2.5912, "step": 20985 }, { "crossentropy": 2.7466378211975098, "epoch": 0.7608033642691415, "grad_norm": 0.027854977175593376, "grad_norm_var": 8.227503429453769e-07, "learning_rate": 0.0013993385293489219, "loss": 2.5807, "step": 20986 }, { "crossentropy": 2.5619757175445557, "epoch": 0.7608396171693735, "grad_norm": 0.026538414880633354, "grad_norm_var": 6.668714399635696e-07, "learning_rate": 0.001398935373632249, "loss": 2.5259, "step": 20987 }, { "crossentropy": 2.497398614883423, "epoch": 0.7608758700696056, "grad_norm": 0.026722682639956474, "grad_norm_var": 6.569692978442097e-07, "learning_rate": 0.00139853226655342, "loss": 2.4646, "step": 20988 }, { "crossentropy": 2.471111297607422, "epoch": 0.7609121229698376, "grad_norm": 0.026978885754942894, "grad_norm_var": 2.9621730618990294e-07, "learning_rate": 0.001398129208117877, "loss": 2.4986, "step": 20989 }, { "crossentropy": 2.5001580715179443, "epoch": 0.7609483758700696, "grad_norm": 0.028573520481586456, "grad_norm_var": 4.858809394137163e-07, "learning_rate": 0.0013977261983310668, "loss": 2.6318, "step": 20990 }, { "crossentropy": 2.545492172241211, "epoch": 0.7609846287703016, "grad_norm": 0.02563910186290741, "grad_norm_var": 5.875884359766066e-07, "learning_rate": 0.0013973232371984306, "loss": 2.5882, "step": 20991 }, { "crossentropy": 2.3288838863372803, "epoch": 0.7610208816705336, "grad_norm": 0.027338458225131035, "grad_norm_var": 6.032401342678419e-07, "learning_rate": 0.0013969203247254098, "loss": 2.4294, "step": 20992 }, { "crossentropy": 2.4636926651000977, "epoch": 0.7610571345707656, "grad_norm": 0.027421118691563606, "grad_norm_var": 6.129760516124586e-07, "learning_rate": 0.0013965174609174497, "loss": 2.5084, "step": 20993 }, { "crossentropy": 2.6436727046966553, "epoch": 0.7610933874709976, "grad_norm": 0.02701100893318653, "grad_norm_var": 5.956783079852223e-07, "learning_rate": 0.0013961146457799878, "loss": 2.5102, "step": 20994 }, { "crossentropy": 2.403442144393921, "epoch": 0.7611296403712297, "grad_norm": 0.026707276701927185, "grad_norm_var": 5.846075902919545e-07, "learning_rate": 0.001395711879318467, "loss": 2.4223, "step": 20995 }, { "crossentropy": 2.534453868865967, "epoch": 0.7611658932714617, "grad_norm": 0.026791760697960854, "grad_norm_var": 5.369665507796879e-07, "learning_rate": 0.0013953091615383285, "loss": 2.5248, "step": 20996 }, { "crossentropy": 2.5203065872192383, "epoch": 0.7612021461716937, "grad_norm": 0.027521219104528427, "grad_norm_var": 5.163878881065161e-07, "learning_rate": 0.0013949064924450084, "loss": 2.4817, "step": 20997 }, { "crossentropy": 2.4871294498443604, "epoch": 0.7612383990719258, "grad_norm": 0.029882565140724182, "grad_norm_var": 9.206548614643182e-07, "learning_rate": 0.001394503872043949, "loss": 2.472, "step": 20998 }, { "crossentropy": 2.45761775970459, "epoch": 0.7612746519721578, "grad_norm": 0.026443682610988617, "grad_norm_var": 9.548499832194579e-07, "learning_rate": 0.0013941013003405862, "loss": 2.4905, "step": 20999 }, { "crossentropy": 2.4441728591918945, "epoch": 0.7613109048723898, "grad_norm": 0.026235349476337433, "grad_norm_var": 1.002480555264149e-06, "learning_rate": 0.0013936987773403554, "loss": 2.4894, "step": 21000 }, { "crossentropy": 2.4329426288604736, "epoch": 0.7613471577726219, "grad_norm": 0.027145909145474434, "grad_norm_var": 9.849810984158588e-07, "learning_rate": 0.0013932963030486973, "loss": 2.493, "step": 21001 }, { "crossentropy": 2.588245153427124, "epoch": 0.7613834106728539, "grad_norm": 0.026356123387813568, "grad_norm_var": 9.895738377581227e-07, "learning_rate": 0.0013928938774710447, "loss": 2.5165, "step": 21002 }, { "crossentropy": 2.4572713375091553, "epoch": 0.7614196635730859, "grad_norm": 0.02646283619105816, "grad_norm_var": 9.954055397164962e-07, "learning_rate": 0.0013924915006128336, "loss": 2.4841, "step": 21003 }, { "crossentropy": 2.3591864109039307, "epoch": 0.7614559164733179, "grad_norm": 0.03165486082434654, "grad_norm_var": 2.282817468519345e-06, "learning_rate": 0.0013920891724795016, "loss": 2.3724, "step": 21004 }, { "crossentropy": 2.5789308547973633, "epoch": 0.7614921693735499, "grad_norm": 0.027794666588306427, "grad_norm_var": 2.2802127557702257e-06, "learning_rate": 0.0013916868930764787, "loss": 2.5196, "step": 21005 }, { "crossentropy": 2.580571174621582, "epoch": 0.7615284222737819, "grad_norm": 0.025703586637973785, "grad_norm_var": 2.3597968388392246e-06, "learning_rate": 0.0013912846624092017, "loss": 2.5298, "step": 21006 }, { "crossentropy": 2.4443957805633545, "epoch": 0.7615646751740139, "grad_norm": 0.02679063007235527, "grad_norm_var": 2.194289293346724e-06, "learning_rate": 0.0013908824804831016, "loss": 2.4654, "step": 21007 }, { "crossentropy": 2.4969074726104736, "epoch": 0.761600928074246, "grad_norm": 0.02711905539035797, "grad_norm_var": 2.1970158161927484e-06, "learning_rate": 0.0013904803473036092, "loss": 2.4458, "step": 21008 }, { "crossentropy": 2.4846537113189697, "epoch": 0.761637180974478, "grad_norm": 0.02644944190979004, "grad_norm_var": 2.242290499822562e-06, "learning_rate": 0.0013900782628761589, "loss": 2.4937, "step": 21009 }, { "crossentropy": 2.3930344581604004, "epoch": 0.76167343387471, "grad_norm": 0.0273010041564703, "grad_norm_var": 2.2381366427838442e-06, "learning_rate": 0.0013896762272061786, "loss": 2.4876, "step": 21010 }, { "crossentropy": 2.5509259700775146, "epoch": 0.761709686774942, "grad_norm": 0.027681192383170128, "grad_norm_var": 2.22402157571525e-06, "learning_rate": 0.0013892742402990992, "loss": 2.5137, "step": 21011 }, { "crossentropy": 2.5659797191619873, "epoch": 0.761745939675174, "grad_norm": 0.02564663626253605, "grad_norm_var": 2.3886727756006515e-06, "learning_rate": 0.0013888723021603527, "loss": 2.5346, "step": 21012 }, { "crossentropy": 2.3714075088500977, "epoch": 0.761782192575406, "grad_norm": 0.027027340605854988, "grad_norm_var": 2.3868344871865823e-06, "learning_rate": 0.001388470412795364, "loss": 2.4348, "step": 21013 }, { "crossentropy": 2.507405996322632, "epoch": 0.761818445475638, "grad_norm": 0.028194839134812355, "grad_norm_var": 1.9681628787876346e-06, "learning_rate": 0.0013880685722095648, "loss": 2.5442, "step": 21014 }, { "crossentropy": 2.395895004272461, "epoch": 0.76185469837587, "grad_norm": 0.028058437630534172, "grad_norm_var": 1.9843431229920697e-06, "learning_rate": 0.0013876667804083793, "loss": 2.4854, "step": 21015 }, { "crossentropy": 2.522792100906372, "epoch": 0.7618909512761021, "grad_norm": 0.028620313853025436, "grad_norm_var": 2.024706948749027e-06, "learning_rate": 0.0013872650373972374, "loss": 2.5744, "step": 21016 }, { "crossentropy": 2.58304500579834, "epoch": 0.7619272041763341, "grad_norm": 0.025563692674040794, "grad_norm_var": 2.2295901556279587e-06, "learning_rate": 0.0013868633431815631, "loss": 2.4961, "step": 21017 }, { "crossentropy": 2.4999160766601562, "epoch": 0.7619634570765661, "grad_norm": 0.026703281328082085, "grad_norm_var": 2.194518527763228e-06, "learning_rate": 0.0013864616977667815, "loss": 2.5057, "step": 21018 }, { "crossentropy": 2.359503984451294, "epoch": 0.7619997099767981, "grad_norm": 0.026317071169614792, "grad_norm_var": 2.212082817151737e-06, "learning_rate": 0.0013860601011583184, "loss": 2.4562, "step": 21019 }, { "crossentropy": 2.5019030570983887, "epoch": 0.7620359628770301, "grad_norm": 0.02624804712831974, "grad_norm_var": 8.918912113464594e-07, "learning_rate": 0.0013856585533615995, "loss": 2.522, "step": 21020 }, { "crossentropy": 2.318486213684082, "epoch": 0.7620722157772621, "grad_norm": 0.02641911245882511, "grad_norm_var": 8.554530974979579e-07, "learning_rate": 0.0013852570543820453, "loss": 2.4225, "step": 21021 }, { "crossentropy": 2.3858466148376465, "epoch": 0.7621084686774942, "grad_norm": 0.025744888931512833, "grad_norm_var": 8.491625759846078e-07, "learning_rate": 0.0013848556042250816, "loss": 2.4298, "step": 21022 }, { "crossentropy": 2.5210928916931152, "epoch": 0.7621447215777262, "grad_norm": 0.02720746025443077, "grad_norm_var": 8.557322449388264e-07, "learning_rate": 0.0013844542028961277, "loss": 2.5453, "step": 21023 }, { "crossentropy": 2.626509428024292, "epoch": 0.7621809744779582, "grad_norm": 0.026775961741805077, "grad_norm_var": 8.527877334471203e-07, "learning_rate": 0.0013840528504006078, "loss": 2.5113, "step": 21024 }, { "crossentropy": 2.463815212249756, "epoch": 0.7622172273781903, "grad_norm": 0.02571799047291279, "grad_norm_var": 9.274782928676839e-07, "learning_rate": 0.0013836515467439414, "loss": 2.4797, "step": 21025 }, { "crossentropy": 2.6758439540863037, "epoch": 0.7622534802784223, "grad_norm": 0.026977717876434326, "grad_norm_var": 9.135658046175669e-07, "learning_rate": 0.001383250291931548, "loss": 2.5825, "step": 21026 }, { "crossentropy": 2.5035085678100586, "epoch": 0.7622897331786543, "grad_norm": 0.026974136009812355, "grad_norm_var": 8.623503445833915e-07, "learning_rate": 0.0013828490859688475, "loss": 2.4637, "step": 21027 }, { "crossentropy": 2.65216064453125, "epoch": 0.7623259860788864, "grad_norm": 0.028086766600608826, "grad_norm_var": 8.715055144071019e-07, "learning_rate": 0.0013824479288612613, "loss": 2.5722, "step": 21028 }, { "crossentropy": 2.3818585872650146, "epoch": 0.7623622389791184, "grad_norm": 0.027242593467235565, "grad_norm_var": 8.776308723155382e-07, "learning_rate": 0.001382046820614204, "loss": 2.387, "step": 21029 }, { "crossentropy": 2.550858497619629, "epoch": 0.7623984918793504, "grad_norm": 0.02552896738052368, "grad_norm_var": 8.71608728719488e-07, "learning_rate": 0.0013816457612330962, "loss": 2.4826, "step": 21030 }, { "crossentropy": 2.4392411708831787, "epoch": 0.7624347447795824, "grad_norm": 0.027372900396585464, "grad_norm_var": 7.824487052200534e-07, "learning_rate": 0.0013812447507233515, "loss": 2.5109, "step": 21031 }, { "crossentropy": 2.5000147819519043, "epoch": 0.7624709976798144, "grad_norm": 0.027286184951663017, "grad_norm_var": 5.554449700332734e-07, "learning_rate": 0.0013808437890903901, "loss": 2.5283, "step": 21032 }, { "crossentropy": 2.6618781089782715, "epoch": 0.7625072505800464, "grad_norm": 0.02764813043177128, "grad_norm_var": 5.291392461136945e-07, "learning_rate": 0.001380442876339626, "loss": 2.5892, "step": 21033 }, { "crossentropy": 2.608213424682617, "epoch": 0.7625435034802784, "grad_norm": 0.026046788319945335, "grad_norm_var": 5.615394021884467e-07, "learning_rate": 0.0013800420124764718, "loss": 2.6048, "step": 21034 }, { "crossentropy": 2.407153367996216, "epoch": 0.7625797563805105, "grad_norm": 0.02585390768945217, "grad_norm_var": 6.00118237517932e-07, "learning_rate": 0.0013796411975063433, "loss": 2.4686, "step": 21035 }, { "crossentropy": 2.6750142574310303, "epoch": 0.7626160092807425, "grad_norm": 0.026508258655667305, "grad_norm_var": 5.888180887585048e-07, "learning_rate": 0.0013792404314346563, "loss": 2.581, "step": 21036 }, { "crossentropy": 2.6126365661621094, "epoch": 0.7626522621809745, "grad_norm": 0.027458354830741882, "grad_norm_var": 6.157375102307123e-07, "learning_rate": 0.0013788397142668208, "loss": 2.5053, "step": 21037 }, { "crossentropy": 2.5895025730133057, "epoch": 0.7626885150812065, "grad_norm": 0.026603667065501213, "grad_norm_var": 5.43657763102144e-07, "learning_rate": 0.0013784390460082512, "loss": 2.5021, "step": 21038 }, { "crossentropy": 2.2283005714416504, "epoch": 0.7627247679814385, "grad_norm": 0.02644674852490425, "grad_norm_var": 5.416023202983357e-07, "learning_rate": 0.0013780384266643575, "loss": 2.3472, "step": 21039 }, { "crossentropy": 2.466779947280884, "epoch": 0.7627610208816705, "grad_norm": 0.02631363831460476, "grad_norm_var": 5.553992543727194e-07, "learning_rate": 0.0013776378562405506, "loss": 2.4252, "step": 21040 }, { "crossentropy": 2.6206915378570557, "epoch": 0.7627972737819025, "grad_norm": 0.02902247942984104, "grad_norm_var": 7.813371812333125e-07, "learning_rate": 0.0013772373347422451, "loss": 2.6618, "step": 21041 }, { "crossentropy": 2.4736440181732178, "epoch": 0.7628335266821346, "grad_norm": 0.026786310598254204, "grad_norm_var": 7.831927288610869e-07, "learning_rate": 0.0013768368621748439, "loss": 2.4994, "step": 21042 }, { "crossentropy": 2.5205979347229004, "epoch": 0.7628697795823666, "grad_norm": 0.02658652514219284, "grad_norm_var": 7.912703391324173e-07, "learning_rate": 0.0013764364385437594, "loss": 2.5353, "step": 21043 }, { "crossentropy": 2.4882566928863525, "epoch": 0.7629060324825986, "grad_norm": 0.026384614408016205, "grad_norm_var": 7.085754966778031e-07, "learning_rate": 0.0013760360638544011, "loss": 2.5142, "step": 21044 }, { "crossentropy": 2.5251290798187256, "epoch": 0.7629422853828306, "grad_norm": 0.026705197989940643, "grad_norm_var": 6.962111024804192e-07, "learning_rate": 0.0013756357381121736, "loss": 2.5195, "step": 21045 }, { "crossentropy": 2.493065595626831, "epoch": 0.7629785382830626, "grad_norm": 0.027141744270920753, "grad_norm_var": 5.887817576016284e-07, "learning_rate": 0.0013752354613224876, "loss": 2.5799, "step": 21046 }, { "crossentropy": 2.4492785930633545, "epoch": 0.7630147911832946, "grad_norm": 0.02689957059919834, "grad_norm_var": 5.720141211401948e-07, "learning_rate": 0.0013748352334907454, "loss": 2.4827, "step": 21047 }, { "crossentropy": 2.6333043575286865, "epoch": 0.7630510440835266, "grad_norm": 0.026806224137544632, "grad_norm_var": 5.588666669574526e-07, "learning_rate": 0.001374435054622355, "loss": 2.5501, "step": 21048 }, { "crossentropy": 2.446438789367676, "epoch": 0.7630872969837587, "grad_norm": 0.02739126980304718, "grad_norm_var": 5.348256427571089e-07, "learning_rate": 0.0013740349247227247, "loss": 2.5272, "step": 21049 }, { "crossentropy": 2.4835691452026367, "epoch": 0.7631235498839907, "grad_norm": 0.02647378481924534, "grad_norm_var": 5.027859144734418e-07, "learning_rate": 0.0013736348437972518, "loss": 2.3968, "step": 21050 }, { "crossentropy": 2.3715879917144775, "epoch": 0.7631598027842227, "grad_norm": 0.02604066953063011, "grad_norm_var": 4.805004652670697e-07, "learning_rate": 0.0013732348118513432, "loss": 2.3975, "step": 21051 }, { "crossentropy": 2.4846394062042236, "epoch": 0.7631960556844548, "grad_norm": 0.025882508605718613, "grad_norm_var": 5.333244333283512e-07, "learning_rate": 0.0013728348288904047, "loss": 2.5084, "step": 21052 }, { "crossentropy": 2.4919772148132324, "epoch": 0.7632323085846868, "grad_norm": 0.025921911001205444, "grad_norm_var": 5.478305052685247e-07, "learning_rate": 0.001372434894919834, "loss": 2.4802, "step": 21053 }, { "crossentropy": 2.3707447052001953, "epoch": 0.7632685614849188, "grad_norm": 0.025824980810284615, "grad_norm_var": 5.97071626799829e-07, "learning_rate": 0.0013720350099450367, "loss": 2.4127, "step": 21054 }, { "crossentropy": 2.4322052001953125, "epoch": 0.7633048143851509, "grad_norm": 0.027064772322773933, "grad_norm_var": 6.030199868689926e-07, "learning_rate": 0.0013716351739714112, "loss": 2.4675, "step": 21055 }, { "crossentropy": 2.497201442718506, "epoch": 0.7633410672853829, "grad_norm": 0.02617938630282879, "grad_norm_var": 6.111141288603697e-07, "learning_rate": 0.0013712353870043582, "loss": 2.5041, "step": 21056 }, { "crossentropy": 2.3014302253723145, "epoch": 0.7633773201856149, "grad_norm": 0.02585291862487793, "grad_norm_var": 2.5517205245622656e-07, "learning_rate": 0.0013708356490492802, "loss": 2.4486, "step": 21057 }, { "crossentropy": 2.5344924926757812, "epoch": 0.7634135730858469, "grad_norm": 0.028317369520664215, "grad_norm_var": 4.608637266427402e-07, "learning_rate": 0.001370435960111574, "loss": 2.4617, "step": 21058 }, { "crossentropy": 2.436521530151367, "epoch": 0.7634498259860789, "grad_norm": 0.02659212425351143, "grad_norm_var": 4.608615312181689e-07, "learning_rate": 0.001370036320196637, "loss": 2.5034, "step": 21059 }, { "crossentropy": 2.4536049365997314, "epoch": 0.7634860788863109, "grad_norm": 0.027711788192391396, "grad_norm_var": 5.34172254345562e-07, "learning_rate": 0.0013696367293098695, "loss": 2.4873, "step": 21060 }, { "crossentropy": 2.534569025039673, "epoch": 0.7635223317865429, "grad_norm": 0.026047904044389725, "grad_norm_var": 5.58562009098197e-07, "learning_rate": 0.0013692371874566656, "loss": 2.5155, "step": 21061 }, { "crossentropy": 2.393589496612549, "epoch": 0.763558584686775, "grad_norm": 0.027150388807058334, "grad_norm_var": 5.591515531729966e-07, "learning_rate": 0.0013688376946424247, "loss": 2.4648, "step": 21062 }, { "crossentropy": 2.470560312271118, "epoch": 0.763594837587007, "grad_norm": 0.02742689661681652, "grad_norm_var": 5.951437657617312e-07, "learning_rate": 0.0013684382508725402, "loss": 2.529, "step": 21063 }, { "crossentropy": 2.5108888149261475, "epoch": 0.763631090487239, "grad_norm": 0.026720549911260605, "grad_norm_var": 5.940213390768015e-07, "learning_rate": 0.0013680388561524077, "loss": 2.4993, "step": 21064 }, { "crossentropy": 2.5428030490875244, "epoch": 0.763667343387471, "grad_norm": 0.026596233248710632, "grad_norm_var": 5.562682364077323e-07, "learning_rate": 0.001367639510487424, "loss": 2.4906, "step": 21065 }, { "crossentropy": 2.5219573974609375, "epoch": 0.763703596287703, "grad_norm": 0.024990325793623924, "grad_norm_var": 7.212977644346918e-07, "learning_rate": 0.001367240213882981, "loss": 2.4879, "step": 21066 }, { "crossentropy": 2.57057523727417, "epoch": 0.763739849187935, "grad_norm": 0.026516124606132507, "grad_norm_var": 7.050367965952612e-07, "learning_rate": 0.0013668409663444697, "loss": 2.5066, "step": 21067 }, { "crossentropy": 2.493157148361206, "epoch": 0.763776102088167, "grad_norm": 0.02582925371825695, "grad_norm_var": 7.099519815639456e-07, "learning_rate": 0.0013664417678772873, "loss": 2.4234, "step": 21068 }, { "crossentropy": 2.5090386867523193, "epoch": 0.7638123549883991, "grad_norm": 0.02725186012685299, "grad_norm_var": 7.097554448613648e-07, "learning_rate": 0.0013660426184868208, "loss": 2.4575, "step": 21069 }, { "crossentropy": 2.5452558994293213, "epoch": 0.7638486078886311, "grad_norm": 0.027399051934480667, "grad_norm_var": 6.957507962803185e-07, "learning_rate": 0.001365643518178465, "loss": 2.5833, "step": 21070 }, { "crossentropy": 2.6895344257354736, "epoch": 0.7638848607888631, "grad_norm": 0.026615194976329803, "grad_norm_var": 6.88191981016289e-07, "learning_rate": 0.001365244466957607, "loss": 2.6508, "step": 21071 }, { "crossentropy": 2.545942783355713, "epoch": 0.7639211136890951, "grad_norm": 0.02647879719734192, "grad_norm_var": 6.730178166120532e-07, "learning_rate": 0.0013648454648296387, "loss": 2.5026, "step": 21072 }, { "crossentropy": 2.4467363357543945, "epoch": 0.7639573665893271, "grad_norm": 0.02559645101428032, "grad_norm_var": 7.067296091588093e-07, "learning_rate": 0.0013644465117999506, "loss": 2.4988, "step": 21073 }, { "crossentropy": 2.5532822608947754, "epoch": 0.7639936194895591, "grad_norm": 0.02608092501759529, "grad_norm_var": 5.377985697505808e-07, "learning_rate": 0.0013640476078739295, "loss": 2.4999, "step": 21074 }, { "crossentropy": 2.6074016094207764, "epoch": 0.7640298723897911, "grad_norm": 0.025801964104175568, "grad_norm_var": 5.737250612594061e-07, "learning_rate": 0.0013636487530569613, "loss": 2.6146, "step": 21075 }, { "crossentropy": 2.4234979152679443, "epoch": 0.7640661252900232, "grad_norm": 0.026899145916104317, "grad_norm_var": 4.851464795784053e-07, "learning_rate": 0.0013632499473544368, "loss": 2.49, "step": 21076 }, { "crossentropy": 2.4066569805145264, "epoch": 0.7641023781902552, "grad_norm": 0.025694113224744797, "grad_norm_var": 5.125299876985867e-07, "learning_rate": 0.0013628511907717395, "loss": 2.5211, "step": 21077 }, { "crossentropy": 2.575611114501953, "epoch": 0.7641386310904872, "grad_norm": 0.026571523398160934, "grad_norm_var": 4.786786422336509e-07, "learning_rate": 0.0013624524833142576, "loss": 2.4943, "step": 21078 }, { "crossentropy": 2.585395574569702, "epoch": 0.7641748839907193, "grad_norm": 0.02697027288377285, "grad_norm_var": 4.2944982042539213e-07, "learning_rate": 0.0013620538249873738, "loss": 2.5993, "step": 21079 }, { "crossentropy": 2.5363988876342773, "epoch": 0.7642111368909513, "grad_norm": 0.026387713849544525, "grad_norm_var": 4.210713908167253e-07, "learning_rate": 0.001361655215796474, "loss": 2.5018, "step": 21080 }, { "crossentropy": 2.61320161819458, "epoch": 0.7642473897911833, "grad_norm": 0.027193648740649223, "grad_norm_var": 4.6259872320906367e-07, "learning_rate": 0.001361256655746943, "loss": 2.5717, "step": 21081 }, { "crossentropy": 2.477334499359131, "epoch": 0.7642836426914154, "grad_norm": 0.026314621791243553, "grad_norm_var": 3.246629895850908e-07, "learning_rate": 0.0013608581448441616, "loss": 2.4688, "step": 21082 }, { "crossentropy": 2.3445281982421875, "epoch": 0.7643198955916474, "grad_norm": 0.026526495814323425, "grad_norm_var": 3.247265231967958e-07, "learning_rate": 0.0013604596830935152, "loss": 2.4738, "step": 21083 }, { "crossentropy": 2.5894713401794434, "epoch": 0.7643561484918794, "grad_norm": 0.025797303766012192, "grad_norm_var": 3.275441359311722e-07, "learning_rate": 0.0013600612705003835, "loss": 2.5666, "step": 21084 }, { "crossentropy": 2.3986687660217285, "epoch": 0.7643924013921114, "grad_norm": 0.025660378858447075, "grad_norm_var": 3.2071974806097077e-07, "learning_rate": 0.0013596629070701466, "loss": 2.43, "step": 21085 }, { "crossentropy": 2.48321270942688, "epoch": 0.7644286542923434, "grad_norm": 0.027888603508472443, "grad_norm_var": 4.0259261956825977e-07, "learning_rate": 0.0013592645928081886, "loss": 2.478, "step": 21086 }, { "crossentropy": 2.5501627922058105, "epoch": 0.7644649071925754, "grad_norm": 0.027618739753961563, "grad_norm_var": 4.936856451086355e-07, "learning_rate": 0.0013588663277198853, "loss": 2.555, "step": 21087 }, { "crossentropy": 2.474877119064331, "epoch": 0.7645011600928074, "grad_norm": 0.026663297787308693, "grad_norm_var": 4.960900118149032e-07, "learning_rate": 0.0013584681118106185, "loss": 2.6008, "step": 21088 }, { "crossentropy": 2.5121898651123047, "epoch": 0.7645374129930395, "grad_norm": 0.026410240679979324, "grad_norm_var": 4.4171149650157107e-07, "learning_rate": 0.001358069945085767, "loss": 2.4351, "step": 21089 }, { "crossentropy": 2.359180450439453, "epoch": 0.7645736658932715, "grad_norm": 0.02664964646100998, "grad_norm_var": 4.278783995850799e-07, "learning_rate": 0.001357671827550706, "loss": 2.4657, "step": 21090 }, { "crossentropy": 2.342611074447632, "epoch": 0.7646099187935035, "grad_norm": 0.025509437546133995, "grad_norm_var": 4.630065343275465e-07, "learning_rate": 0.0013572737592108164, "loss": 2.387, "step": 21091 }, { "crossentropy": 2.440786361694336, "epoch": 0.7646461716937355, "grad_norm": 0.026339750736951828, "grad_norm_var": 4.5631389854588107e-07, "learning_rate": 0.0013568757400714726, "loss": 2.4679, "step": 21092 }, { "crossentropy": 2.3910367488861084, "epoch": 0.7646824245939675, "grad_norm": 0.026145124807953835, "grad_norm_var": 4.1982935286345205e-07, "learning_rate": 0.0013564777701380493, "loss": 2.4646, "step": 21093 }, { "crossentropy": 2.4445364475250244, "epoch": 0.7647186774941995, "grad_norm": 0.026319162920117378, "grad_norm_var": 4.227633159672427e-07, "learning_rate": 0.0013560798494159244, "loss": 2.4671, "step": 21094 }, { "crossentropy": 2.6016578674316406, "epoch": 0.7647549303944315, "grad_norm": 0.027120167389512062, "grad_norm_var": 4.330737283860427e-07, "learning_rate": 0.001355681977910469, "loss": 2.5073, "step": 21095 }, { "crossentropy": 2.3926308155059814, "epoch": 0.7647911832946636, "grad_norm": 0.026180386543273926, "grad_norm_var": 4.398047262060371e-07, "learning_rate": 0.0013552841556270596, "loss": 2.3906, "step": 21096 }, { "crossentropy": 2.485370635986328, "epoch": 0.7648274361948956, "grad_norm": 0.02544541098177433, "grad_norm_var": 4.740470286349399e-07, "learning_rate": 0.0013548863825710695, "loss": 2.4606, "step": 21097 }, { "crossentropy": 2.2800955772399902, "epoch": 0.7648636890951276, "grad_norm": 0.024680061265826225, "grad_norm_var": 6.622125232414247e-07, "learning_rate": 0.0013544886587478695, "loss": 2.34, "step": 21098 }, { "crossentropy": 2.53015398979187, "epoch": 0.7648999419953596, "grad_norm": 0.026809483766555786, "grad_norm_var": 6.754000774645991e-07, "learning_rate": 0.0013540909841628335, "loss": 2.4981, "step": 21099 }, { "crossentropy": 2.4754836559295654, "epoch": 0.7649361948955916, "grad_norm": 0.0264262892305851, "grad_norm_var": 6.55676427097016e-07, "learning_rate": 0.0013536933588213325, "loss": 2.4732, "step": 21100 }, { "crossentropy": 2.5030922889709473, "epoch": 0.7649724477958236, "grad_norm": 0.02737209014594555, "grad_norm_var": 6.776107827098527e-07, "learning_rate": 0.0013532957827287334, "loss": 2.5696, "step": 21101 }, { "crossentropy": 2.306339740753174, "epoch": 0.7650087006960556, "grad_norm": 0.02547922357916832, "grad_norm_var": 5.858653467997096e-07, "learning_rate": 0.0013528982558904112, "loss": 2.3472, "step": 21102 }, { "crossentropy": 2.4500443935394287, "epoch": 0.7650449535962877, "grad_norm": 0.037207480520009995, "grad_norm_var": 7.988922829842611e-06, "learning_rate": 0.001352500778311731, "loss": 2.541, "step": 21103 }, { "crossentropy": 2.3938536643981934, "epoch": 0.7650812064965197, "grad_norm": 0.02754918672144413, "grad_norm_var": 8.007376466564094e-06, "learning_rate": 0.0013521033499980628, "loss": 2.4916, "step": 21104 }, { "crossentropy": 2.6681020259857178, "epoch": 0.7651174593967517, "grad_norm": 0.026965230703353882, "grad_norm_var": 7.984636364354441e-06, "learning_rate": 0.0013517059709547768, "loss": 2.5364, "step": 21105 }, { "crossentropy": 2.3282554149627686, "epoch": 0.7651537122969838, "grad_norm": 0.02581864967942238, "grad_norm_var": 8.067987182242692e-06, "learning_rate": 0.001351308641187236, "loss": 2.4574, "step": 21106 }, { "crossentropy": 2.549551248550415, "epoch": 0.7651899651972158, "grad_norm": 0.027327856048941612, "grad_norm_var": 7.92284712376722e-06, "learning_rate": 0.001350911360700811, "loss": 2.4719, "step": 21107 }, { "crossentropy": 2.361452341079712, "epoch": 0.7652262180974478, "grad_norm": 0.02593115158379078, "grad_norm_var": 7.973288814863453e-06, "learning_rate": 0.0013505141295008661, "loss": 2.4176, "step": 21108 }, { "crossentropy": 2.3927924633026123, "epoch": 0.7652624709976799, "grad_norm": 0.02498728223145008, "grad_norm_var": 8.196547670411105e-06, "learning_rate": 0.0013501169475927644, "loss": 2.3498, "step": 21109 }, { "crossentropy": 2.555560350418091, "epoch": 0.7652987238979119, "grad_norm": 0.026587311178445816, "grad_norm_var": 8.17755071900068e-06, "learning_rate": 0.0013497198149818735, "loss": 2.5491, "step": 21110 }, { "crossentropy": 2.625896692276001, "epoch": 0.7653349767981439, "grad_norm": 0.026744024828076363, "grad_norm_var": 8.180013363682607e-06, "learning_rate": 0.0013493227316735547, "loss": 2.6025, "step": 21111 }, { "crossentropy": 2.4346256256103516, "epoch": 0.7653712296983759, "grad_norm": 0.02706168219447136, "grad_norm_var": 8.135836825991632e-06, "learning_rate": 0.0013489256976731722, "loss": 2.4545, "step": 21112 }, { "crossentropy": 2.466492176055908, "epoch": 0.7654074825986079, "grad_norm": 0.02692546881735325, "grad_norm_var": 7.961123334851843e-06, "learning_rate": 0.0013485287129860906, "loss": 2.469, "step": 21113 }, { "crossentropy": 2.4371578693389893, "epoch": 0.7654437354988399, "grad_norm": 0.026800844818353653, "grad_norm_var": 7.553126719356016e-06, "learning_rate": 0.0013481317776176682, "loss": 2.4504, "step": 21114 }, { "crossentropy": 2.4477522373199463, "epoch": 0.7654799883990719, "grad_norm": 0.026041710749268532, "grad_norm_var": 7.635021314057223e-06, "learning_rate": 0.00134773489157327, "loss": 2.4537, "step": 21115 }, { "crossentropy": 2.429382801055908, "epoch": 0.765516241299304, "grad_norm": 0.025953203439712524, "grad_norm_var": 7.697914122731936e-06, "learning_rate": 0.0013473380548582542, "loss": 2.3988, "step": 21116 }, { "crossentropy": 2.569028854370117, "epoch": 0.765552494199536, "grad_norm": 0.026529701426625252, "grad_norm_var": 7.719794259227324e-06, "learning_rate": 0.0013469412674779802, "loss": 2.5183, "step": 21117 }, { "crossentropy": 2.5998194217681885, "epoch": 0.765588747099768, "grad_norm": 0.02763710729777813, "grad_norm_var": 7.53892218368402e-06, "learning_rate": 0.0013465445294378092, "loss": 2.5605, "step": 21118 }, { "crossentropy": 2.4848294258117676, "epoch": 0.765625, "grad_norm": 0.026778964325785637, "grad_norm_var": 4.963779395345906e-07, "learning_rate": 0.0013461478407430971, "loss": 2.5053, "step": 21119 }, { "crossentropy": 2.592423915863037, "epoch": 0.765661252900232, "grad_norm": 0.027180157601833344, "grad_norm_var": 4.583067640994646e-07, "learning_rate": 0.0013457512013992036, "loss": 2.496, "step": 21120 }, { "crossentropy": 2.4661080837249756, "epoch": 0.765697505800464, "grad_norm": 0.026052741333842278, "grad_norm_var": 4.634039749035548e-07, "learning_rate": 0.0013453546114114868, "loss": 2.5349, "step": 21121 }, { "crossentropy": 2.6298797130584717, "epoch": 0.765733758700696, "grad_norm": 0.02627323754131794, "grad_norm_var": 4.336661487667198e-07, "learning_rate": 0.0013449580707853004, "loss": 2.4859, "step": 21122 }, { "crossentropy": 2.4681015014648438, "epoch": 0.7657700116009281, "grad_norm": 0.028117895126342773, "grad_norm_var": 5.545325447414127e-07, "learning_rate": 0.0013445615795260042, "loss": 2.5375, "step": 21123 }, { "crossentropy": 2.454319715499878, "epoch": 0.7658062645011601, "grad_norm": 0.026525825262069702, "grad_norm_var": 5.235896584729194e-07, "learning_rate": 0.001344165137638949, "loss": 2.5171, "step": 21124 }, { "crossentropy": 2.428135395050049, "epoch": 0.7658425174013921, "grad_norm": 0.027333347126841545, "grad_norm_var": 3.5144410169689273e-07, "learning_rate": 0.0013437687451294932, "loss": 2.4926, "step": 21125 }, { "crossentropy": 2.5378148555755615, "epoch": 0.7658787703016241, "grad_norm": 0.026173673570156097, "grad_norm_var": 3.72982647228491e-07, "learning_rate": 0.0013433724020029897, "loss": 2.5654, "step": 21126 }, { "crossentropy": 2.3760199546813965, "epoch": 0.7659150232018561, "grad_norm": 0.02814546227455139, "grad_norm_var": 4.931044362453212e-07, "learning_rate": 0.001342976108264789, "loss": 2.3886, "step": 21127 }, { "crossentropy": 2.502413511276245, "epoch": 0.7659512761020881, "grad_norm": 0.028585664927959442, "grad_norm_var": 6.821514807399283e-07, "learning_rate": 0.0013425798639202452, "loss": 2.4805, "step": 21128 }, { "crossentropy": 2.6032533645629883, "epoch": 0.7659875290023201, "grad_norm": 0.026214484125375748, "grad_norm_var": 7.152116157113728e-07, "learning_rate": 0.001342183668974713, "loss": 2.5341, "step": 21129 }, { "crossentropy": 2.54679536819458, "epoch": 0.7660237819025522, "grad_norm": 0.027104560285806656, "grad_norm_var": 7.171031610886478e-07, "learning_rate": 0.0013417875234335392, "loss": 2.5035, "step": 21130 }, { "crossentropy": 2.3921854496002197, "epoch": 0.7660600348027842, "grad_norm": 0.026312144473195076, "grad_norm_var": 6.90167710913148e-07, "learning_rate": 0.0013413914273020782, "loss": 2.3708, "step": 21131 }, { "crossentropy": 2.556750774383545, "epoch": 0.7660962877030162, "grad_norm": 0.02696371078491211, "grad_norm_var": 6.220585894236984e-07, "learning_rate": 0.0013409953805856767, "loss": 2.5272, "step": 21132 }, { "crossentropy": 2.543013095855713, "epoch": 0.7661325406032483, "grad_norm": 0.02647818997502327, "grad_norm_var": 6.254239143507756e-07, "learning_rate": 0.0013405993832896867, "loss": 2.5129, "step": 21133 }, { "crossentropy": 2.5420632362365723, "epoch": 0.7661687935034803, "grad_norm": 0.027423512190580368, "grad_norm_var": 6.099123040524534e-07, "learning_rate": 0.0013402034354194554, "loss": 2.4974, "step": 21134 }, { "crossentropy": 2.427722215652466, "epoch": 0.7662050464037123, "grad_norm": 0.026534873992204666, "grad_norm_var": 6.201454235553345e-07, "learning_rate": 0.0013398075369803293, "loss": 2.4014, "step": 21135 }, { "crossentropy": 2.6723926067352295, "epoch": 0.7662412993039444, "grad_norm": 0.026345916092395782, "grad_norm_var": 6.39567749707801e-07, "learning_rate": 0.0013394116879776568, "loss": 2.6127, "step": 21136 }, { "crossentropy": 2.459914445877075, "epoch": 0.7662775522041764, "grad_norm": 0.025996772572398186, "grad_norm_var": 6.461725968714392e-07, "learning_rate": 0.0013390158884167857, "loss": 2.4951, "step": 21137 }, { "crossentropy": 2.4007985591888428, "epoch": 0.7663138051044084, "grad_norm": 0.02534732036292553, "grad_norm_var": 7.781300577593593e-07, "learning_rate": 0.00133862013830306, "loss": 2.421, "step": 21138 }, { "crossentropy": 2.6026339530944824, "epoch": 0.7663500580046404, "grad_norm": 0.027664050459861755, "grad_norm_var": 7.142925170400271e-07, "learning_rate": 0.0013382244376418267, "loss": 2.5931, "step": 21139 }, { "crossentropy": 2.465825319290161, "epoch": 0.7663863109048724, "grad_norm": 0.02664632350206375, "grad_norm_var": 7.104440367368403e-07, "learning_rate": 0.0013378287864384282, "loss": 2.5598, "step": 21140 }, { "crossentropy": 2.57493257522583, "epoch": 0.7664225638051044, "grad_norm": 0.026470471173524857, "grad_norm_var": 6.989967097042419e-07, "learning_rate": 0.001337433184698209, "loss": 2.506, "step": 21141 }, { "crossentropy": 2.3673367500305176, "epoch": 0.7664588167053364, "grad_norm": 0.026158234104514122, "grad_norm_var": 7.002504135664514e-07, "learning_rate": 0.0013370376324265143, "loss": 2.4905, "step": 21142 }, { "crossentropy": 2.360684871673584, "epoch": 0.7664950696055685, "grad_norm": 0.02579808235168457, "grad_norm_var": 6.155421638323957e-07, "learning_rate": 0.0013366421296286857, "loss": 2.3592, "step": 21143 }, { "crossentropy": 2.6202452182769775, "epoch": 0.7665313225058005, "grad_norm": 0.02703191712498665, "grad_norm_var": 3.6081532448287157e-07, "learning_rate": 0.0013362466763100623, "loss": 2.5349, "step": 21144 }, { "crossentropy": 2.6593570709228516, "epoch": 0.7665675754060325, "grad_norm": 0.026686394587159157, "grad_norm_var": 3.5483980111761644e-07, "learning_rate": 0.0013358512724759898, "loss": 2.6082, "step": 21145 }, { "crossentropy": 2.6520626544952393, "epoch": 0.7666038283062645, "grad_norm": 0.02679266594350338, "grad_norm_var": 3.382800751817553e-07, "learning_rate": 0.0013354559181318042, "loss": 2.5388, "step": 21146 }, { "crossentropy": 2.4461278915405273, "epoch": 0.7666400812064965, "grad_norm": 0.026478761807084084, "grad_norm_var": 3.349385116278858e-07, "learning_rate": 0.0013350606132828474, "loss": 2.5129, "step": 21147 }, { "crossentropy": 2.260234832763672, "epoch": 0.7666763341067285, "grad_norm": 0.02731364034116268, "grad_norm_var": 3.618441496475022e-07, "learning_rate": 0.0013346653579344608, "loss": 2.4828, "step": 21148 }, { "crossentropy": 2.363046646118164, "epoch": 0.7667125870069605, "grad_norm": 0.025789359584450722, "grad_norm_var": 4.0020232955900503e-07, "learning_rate": 0.0013342701520919792, "loss": 2.462, "step": 21149 }, { "crossentropy": 2.466543436050415, "epoch": 0.7667488399071926, "grad_norm": 0.025943268090486526, "grad_norm_var": 3.607776548882867e-07, "learning_rate": 0.0013338749957607437, "loss": 2.4577, "step": 21150 }, { "crossentropy": 2.6300013065338135, "epoch": 0.7667850928074246, "grad_norm": 0.0272358451038599, "grad_norm_var": 4.005999120771138e-07, "learning_rate": 0.0013334798889460897, "loss": 2.5456, "step": 21151 }, { "crossentropy": 2.6604208946228027, "epoch": 0.7668213457076566, "grad_norm": 0.0280085951089859, "grad_norm_var": 5.433925443145872e-07, "learning_rate": 0.0013330848316533527, "loss": 2.5826, "step": 21152 }, { "crossentropy": 2.574939727783203, "epoch": 0.7668575986078886, "grad_norm": 0.025979405269026756, "grad_norm_var": 5.447737653490089e-07, "learning_rate": 0.0013326898238878715, "loss": 2.5945, "step": 21153 }, { "crossentropy": 2.481283664703369, "epoch": 0.7668938515081206, "grad_norm": 0.026315229013562202, "grad_norm_var": 4.4372495536386797e-07, "learning_rate": 0.0013322948656549777, "loss": 2.4834, "step": 21154 }, { "crossentropy": 2.531555414199829, "epoch": 0.7669301044083526, "grad_norm": 0.026582978665828705, "grad_norm_var": 3.6981094765676086e-07, "learning_rate": 0.0013318999569600077, "loss": 2.5145, "step": 21155 }, { "crossentropy": 2.565854549407959, "epoch": 0.7669663573085846, "grad_norm": 0.027030915021896362, "grad_norm_var": 3.826128476922998e-07, "learning_rate": 0.001331505097808297, "loss": 2.5057, "step": 21156 }, { "crossentropy": 2.5184006690979004, "epoch": 0.7670026102088167, "grad_norm": 0.026523025706410408, "grad_norm_var": 3.8187092417553644e-07, "learning_rate": 0.0013311102882051756, "loss": 2.4397, "step": 21157 }, { "crossentropy": 2.5018882751464844, "epoch": 0.7670388631090487, "grad_norm": 0.02736409194767475, "grad_norm_var": 4.0103764943483224e-07, "learning_rate": 0.0013307155281559785, "loss": 2.5357, "step": 21158 }, { "crossentropy": 2.4848439693450928, "epoch": 0.7670751160092807, "grad_norm": 0.02777431719005108, "grad_norm_var": 4.128440592043262e-07, "learning_rate": 0.001330320817666037, "loss": 2.4703, "step": 21159 }, { "crossentropy": 2.518202543258667, "epoch": 0.7671113689095128, "grad_norm": 0.02635042928159237, "grad_norm_var": 4.210837886264862e-07, "learning_rate": 0.00132992615674068, "loss": 2.4127, "step": 21160 }, { "crossentropy": 2.381741762161255, "epoch": 0.7671476218097448, "grad_norm": 0.027035245671868324, "grad_norm_var": 4.252402733301297e-07, "learning_rate": 0.0013295315453852419, "loss": 2.4944, "step": 21161 }, { "crossentropy": 2.6183278560638428, "epoch": 0.7671838747099768, "grad_norm": 0.02838064543902874, "grad_norm_var": 5.850271048914168e-07, "learning_rate": 0.0013291369836050482, "loss": 2.6167, "step": 21162 }, { "crossentropy": 2.473463773727417, "epoch": 0.7672201276102089, "grad_norm": 0.027245916426181793, "grad_norm_var": 5.806037892569129e-07, "learning_rate": 0.001328742471405431, "loss": 2.451, "step": 21163 }, { "crossentropy": 2.389742851257324, "epoch": 0.7672563805104409, "grad_norm": 0.02645443193614483, "grad_norm_var": 5.827426843384483e-07, "learning_rate": 0.001328348008791719, "loss": 2.4686, "step": 21164 }, { "crossentropy": 2.2088162899017334, "epoch": 0.7672926334106729, "grad_norm": 0.026713114231824875, "grad_norm_var": 5.022545081644532e-07, "learning_rate": 0.0013279535957692373, "loss": 2.3405, "step": 21165 }, { "crossentropy": 2.502980947494507, "epoch": 0.7673288863109049, "grad_norm": 0.026756394654512405, "grad_norm_var": 4.362102312874261e-07, "learning_rate": 0.0013275592323433172, "loss": 2.5812, "step": 21166 }, { "crossentropy": 2.4923431873321533, "epoch": 0.7673651392111369, "grad_norm": 0.027207762002944946, "grad_norm_var": 4.353180504231552e-07, "learning_rate": 0.001327164918519282, "loss": 2.4557, "step": 21167 }, { "crossentropy": 2.494908094406128, "epoch": 0.7674013921113689, "grad_norm": 0.026936056092381477, "grad_norm_var": 3.604996810774771e-07, "learning_rate": 0.0013267706543024572, "loss": 2.412, "step": 21168 }, { "crossentropy": 2.3355350494384766, "epoch": 0.7674376450116009, "grad_norm": 0.025212273001670837, "grad_norm_var": 4.930407428246301e-07, "learning_rate": 0.0013263764396981704, "loss": 2.4337, "step": 21169 }, { "crossentropy": 2.5753896236419678, "epoch": 0.767473897911833, "grad_norm": 0.02776416204869747, "grad_norm_var": 5.17525727250602e-07, "learning_rate": 0.0013259822747117434, "loss": 2.5194, "step": 21170 }, { "crossentropy": 2.354249954223633, "epoch": 0.767510150812065, "grad_norm": 0.026722831651568413, "grad_norm_var": 5.117507274789966e-07, "learning_rate": 0.001325588159348501, "loss": 2.4091, "step": 21171 }, { "crossentropy": 2.5587053298950195, "epoch": 0.767546403712297, "grad_norm": 0.027575325220823288, "grad_norm_var": 5.349158557055774e-07, "learning_rate": 0.0013251940936137685, "loss": 2.5073, "step": 21172 }, { "crossentropy": 2.599165916442871, "epoch": 0.767582656612529, "grad_norm": 0.02679922617971897, "grad_norm_var": 5.220814926020408e-07, "learning_rate": 0.001324800077512865, "loss": 2.5793, "step": 21173 }, { "crossentropy": 2.4682161808013916, "epoch": 0.767618909512761, "grad_norm": 0.02669379860162735, "grad_norm_var": 5.192548203685933e-07, "learning_rate": 0.0013244061110511147, "loss": 2.4815, "step": 21174 }, { "crossentropy": 2.5469796657562256, "epoch": 0.767655162412993, "grad_norm": 0.027001097798347473, "grad_norm_var": 4.743565478681581e-07, "learning_rate": 0.0013240121942338378, "loss": 2.5054, "step": 21175 }, { "crossentropy": 2.6162924766540527, "epoch": 0.767691415313225, "grad_norm": 0.02677748166024685, "grad_norm_var": 4.528653187884738e-07, "learning_rate": 0.0013236183270663537, "loss": 2.5575, "step": 21176 }, { "crossentropy": 2.536459445953369, "epoch": 0.7677276682134571, "grad_norm": 0.025950638577342033, "grad_norm_var": 5.147456340050145e-07, "learning_rate": 0.0013232245095539847, "loss": 2.4897, "step": 21177 }, { "crossentropy": 2.479816436767578, "epoch": 0.7677639211136891, "grad_norm": 0.02656930685043335, "grad_norm_var": 3.5905908853058923e-07, "learning_rate": 0.0013228307417020469, "loss": 2.4398, "step": 21178 }, { "crossentropy": 2.4002745151519775, "epoch": 0.7678001740139211, "grad_norm": 0.02586658112704754, "grad_norm_var": 3.9113057517936357e-07, "learning_rate": 0.0013224370235158601, "loss": 2.3499, "step": 21179 }, { "crossentropy": 2.5863537788391113, "epoch": 0.7678364269141531, "grad_norm": 0.0266583114862442, "grad_norm_var": 3.87391978050192e-07, "learning_rate": 0.0013220433550007438, "loss": 2.5888, "step": 21180 }, { "crossentropy": 2.419773817062378, "epoch": 0.7678726798143851, "grad_norm": 0.027462024241685867, "grad_norm_var": 4.237284151990672e-07, "learning_rate": 0.0013216497361620116, "loss": 2.4038, "step": 21181 }, { "crossentropy": 2.5497403144836426, "epoch": 0.7679089327146171, "grad_norm": 0.027182450518012047, "grad_norm_var": 4.356028131424772e-07, "learning_rate": 0.0013212561670049839, "loss": 2.5642, "step": 21182 }, { "crossentropy": 2.393031597137451, "epoch": 0.7679451856148491, "grad_norm": 0.026807168498635292, "grad_norm_var": 4.22448612018159e-07, "learning_rate": 0.0013208626475349722, "loss": 2.4547, "step": 21183 }, { "crossentropy": 2.4379959106445312, "epoch": 0.7679814385150812, "grad_norm": 0.02645895630121231, "grad_norm_var": 4.2475493034025793e-07, "learning_rate": 0.0013204691777572958, "loss": 2.4343, "step": 21184 }, { "crossentropy": 2.363830804824829, "epoch": 0.7680176914153132, "grad_norm": 0.02672545798122883, "grad_norm_var": 2.638985955640312e-07, "learning_rate": 0.0013200757576772666, "loss": 2.4563, "step": 21185 }, { "crossentropy": 2.5553879737854004, "epoch": 0.7680539443155452, "grad_norm": 0.027324799448251724, "grad_norm_var": 2.2026785899779503e-07, "learning_rate": 0.0013196823873001973, "loss": 2.4818, "step": 21186 }, { "crossentropy": 2.5618882179260254, "epoch": 0.7680901972157773, "grad_norm": 0.02633925899863243, "grad_norm_var": 2.326922394773097e-07, "learning_rate": 0.0013192890666314028, "loss": 2.6059, "step": 21187 }, { "crossentropy": 2.5465078353881836, "epoch": 0.7681264501160093, "grad_norm": 0.025567054748535156, "grad_norm_var": 2.6697857934246797e-07, "learning_rate": 0.0013188957956761966, "loss": 2.5185, "step": 21188 }, { "crossentropy": 2.3752803802490234, "epoch": 0.7681627030162413, "grad_norm": 0.025184130296111107, "grad_norm_var": 3.9496433939845547e-07, "learning_rate": 0.001318502574439887, "loss": 2.4163, "step": 21189 }, { "crossentropy": 2.558176279067993, "epoch": 0.7681989559164734, "grad_norm": 0.02574588730931282, "grad_norm_var": 4.311198420639626e-07, "learning_rate": 0.001318109402927789, "loss": 2.5787, "step": 21190 }, { "crossentropy": 2.4314143657684326, "epoch": 0.7682352088167054, "grad_norm": 0.02639886364340782, "grad_norm_var": 4.116465867612471e-07, "learning_rate": 0.0013177162811452094, "loss": 2.5149, "step": 21191 }, { "crossentropy": 2.400571584701538, "epoch": 0.7682714617169374, "grad_norm": 0.027372116222977638, "grad_norm_var": 4.6061025399724875e-07, "learning_rate": 0.0013173232090974607, "loss": 2.4824, "step": 21192 }, { "crossentropy": 2.4345974922180176, "epoch": 0.7683077146171694, "grad_norm": 0.02710830606520176, "grad_norm_var": 4.633087493912848e-07, "learning_rate": 0.001316930186789851, "loss": 2.4691, "step": 21193 }, { "crossentropy": 2.501047134399414, "epoch": 0.7683439675174014, "grad_norm": 0.026156144216656685, "grad_norm_var": 4.728131559890436e-07, "learning_rate": 0.001316537214227687, "loss": 2.4367, "step": 21194 }, { "crossentropy": 2.4278504848480225, "epoch": 0.7683802204176334, "grad_norm": 0.025574425235390663, "grad_norm_var": 5.036925297620283e-07, "learning_rate": 0.0013161442914162774, "loss": 2.4073, "step": 21195 }, { "crossentropy": 2.400754451751709, "epoch": 0.7684164733178654, "grad_norm": 0.025982040911912918, "grad_norm_var": 5.183698596253801e-07, "learning_rate": 0.0013157514183609314, "loss": 2.4337, "step": 21196 }, { "crossentropy": 2.4112367630004883, "epoch": 0.7684527262180975, "grad_norm": 0.026329098269343376, "grad_norm_var": 4.475019560315341e-07, "learning_rate": 0.0013153585950669516, "loss": 2.4358, "step": 21197 }, { "crossentropy": 2.478403091430664, "epoch": 0.7684889791183295, "grad_norm": 0.02666170336306095, "grad_norm_var": 4.0949849712359985e-07, "learning_rate": 0.0013149658215396477, "loss": 2.3728, "step": 21198 }, { "crossentropy": 2.4719653129577637, "epoch": 0.7685252320185615, "grad_norm": 0.0254947729408741, "grad_norm_var": 4.3863020300812426e-07, "learning_rate": 0.0013145730977843207, "loss": 2.4278, "step": 21199 }, { "crossentropy": 2.542149782180786, "epoch": 0.7685614849187935, "grad_norm": 0.02739597298204899, "grad_norm_var": 5.163081828805319e-07, "learning_rate": 0.001314180423806277, "loss": 2.5861, "step": 21200 }, { "crossentropy": 2.590498208999634, "epoch": 0.7685977378190255, "grad_norm": 0.026390530169010162, "grad_norm_var": 5.058826156013885e-07, "learning_rate": 0.0013137877996108234, "loss": 2.4929, "step": 21201 }, { "crossentropy": 2.436643123626709, "epoch": 0.7686339907192575, "grad_norm": 0.0263016689568758, "grad_norm_var": 4.334261540588167e-07, "learning_rate": 0.001313395225203256, "loss": 2.5017, "step": 21202 }, { "crossentropy": 2.6308345794677734, "epoch": 0.7686702436194895, "grad_norm": 0.026108918711543083, "grad_norm_var": 4.3400465849705027e-07, "learning_rate": 0.0013130027005888811, "loss": 2.5398, "step": 21203 }, { "crossentropy": 2.343968391418457, "epoch": 0.7687064965197216, "grad_norm": 0.025998467579483986, "grad_norm_var": 4.0717379370991807e-07, "learning_rate": 0.0013126102257730015, "loss": 2.3801, "step": 21204 }, { "crossentropy": 2.391278028488159, "epoch": 0.7687427494199536, "grad_norm": 0.026467375457286835, "grad_norm_var": 3.2555273471590364e-07, "learning_rate": 0.001312217800760915, "loss": 2.482, "step": 21205 }, { "crossentropy": 2.52944016456604, "epoch": 0.7687790023201856, "grad_norm": 0.026330580934882164, "grad_norm_var": 3.003773235584478e-07, "learning_rate": 0.0013118254255579248, "loss": 2.5012, "step": 21206 }, { "crossentropy": 2.4384613037109375, "epoch": 0.7688152552204176, "grad_norm": 0.02612718939781189, "grad_norm_var": 3.0428654436176427e-07, "learning_rate": 0.001311433100169328, "loss": 2.4575, "step": 21207 }, { "crossentropy": 2.4125945568084717, "epoch": 0.7688515081206496, "grad_norm": 0.027362274006009102, "grad_norm_var": 3.029676272893322e-07, "learning_rate": 0.0013110408246004247, "loss": 2.4179, "step": 21208 }, { "crossentropy": 2.581979274749756, "epoch": 0.7688877610208816, "grad_norm": 0.026706717908382416, "grad_norm_var": 2.730776996478131e-07, "learning_rate": 0.0013106485988565164, "loss": 2.5451, "step": 21209 }, { "crossentropy": 2.5138235092163086, "epoch": 0.7689240139211136, "grad_norm": 0.027012091130018234, "grad_norm_var": 2.982570063731939e-07, "learning_rate": 0.0013102564229428943, "loss": 2.4887, "step": 21210 }, { "crossentropy": 2.3686702251434326, "epoch": 0.7689602668213457, "grad_norm": 0.02600056305527687, "grad_norm_var": 2.6325336921709933e-07, "learning_rate": 0.0013098642968648589, "loss": 2.4212, "step": 21211 }, { "crossentropy": 2.3317666053771973, "epoch": 0.7689965197215777, "grad_norm": 0.02605881355702877, "grad_norm_var": 2.5917065347006755e-07, "learning_rate": 0.0013094722206277072, "loss": 2.4038, "step": 21212 }, { "crossentropy": 2.4950273036956787, "epoch": 0.7690327726218097, "grad_norm": 0.027193620800971985, "grad_norm_var": 2.9521225832843114e-07, "learning_rate": 0.0013090801942367326, "loss": 2.5338, "step": 21213 }, { "crossentropy": 2.4942431449890137, "epoch": 0.7690690255220418, "grad_norm": 0.027776136994361877, "grad_norm_var": 4.004727909720362e-07, "learning_rate": 0.0013086882176972325, "loss": 2.4884, "step": 21214 }, { "crossentropy": 2.4325497150421143, "epoch": 0.7691052784222738, "grad_norm": 0.02685755304992199, "grad_norm_var": 3.2565074922751477e-07, "learning_rate": 0.0013082962910144985, "loss": 2.4929, "step": 21215 }, { "crossentropy": 2.597604751586914, "epoch": 0.7691415313225058, "grad_norm": 0.027453193441033363, "grad_norm_var": 3.3169525456821133e-07, "learning_rate": 0.0013079044141938252, "loss": 2.5435, "step": 21216 }, { "crossentropy": 2.4658639430999756, "epoch": 0.7691777842227379, "grad_norm": 0.026341181248426437, "grad_norm_var": 3.334501553782394e-07, "learning_rate": 0.0013075125872405086, "loss": 2.4736, "step": 21217 }, { "crossentropy": 2.415376901626587, "epoch": 0.7692140371229699, "grad_norm": 0.028839055448770523, "grad_norm_var": 6.244197871184727e-07, "learning_rate": 0.0013071208101598352, "loss": 2.4745, "step": 21218 }, { "crossentropy": 2.4280834197998047, "epoch": 0.7692502900232019, "grad_norm": 0.02673966810107231, "grad_norm_var": 5.920391480733363e-07, "learning_rate": 0.0013067290829571, "loss": 2.5282, "step": 21219 }, { "crossentropy": 2.5754125118255615, "epoch": 0.7692865429234339, "grad_norm": 0.028653379529714584, "grad_norm_var": 7.385645967567286e-07, "learning_rate": 0.0013063374056375943, "loss": 2.5131, "step": 21220 }, { "crossentropy": 2.4622530937194824, "epoch": 0.7693227958236659, "grad_norm": 0.027166130021214485, "grad_norm_var": 7.199269054322203e-07, "learning_rate": 0.0013059457782066053, "loss": 2.4708, "step": 21221 }, { "crossentropy": 2.502258777618408, "epoch": 0.7693590487238979, "grad_norm": 0.027102191001176834, "grad_norm_var": 6.842928069254363e-07, "learning_rate": 0.0013055542006694266, "loss": 2.5026, "step": 21222 }, { "crossentropy": 2.617410898208618, "epoch": 0.76939530162413, "grad_norm": 0.027076343074440956, "grad_norm_var": 6.191486270878029e-07, "learning_rate": 0.0013051626730313432, "loss": 2.6148, "step": 21223 }, { "crossentropy": 2.6118013858795166, "epoch": 0.769431554524362, "grad_norm": 0.026393132284283638, "grad_norm_var": 6.49927689357066e-07, "learning_rate": 0.001304771195297645, "loss": 2.5728, "step": 21224 }, { "crossentropy": 2.5107874870300293, "epoch": 0.769467807424594, "grad_norm": 0.026723651215434074, "grad_norm_var": 6.490901560933602e-07, "learning_rate": 0.0013043797674736214, "loss": 2.5462, "step": 21225 }, { "crossentropy": 2.535346508026123, "epoch": 0.769504060324826, "grad_norm": 0.02648412249982357, "grad_norm_var": 6.71762055655852e-07, "learning_rate": 0.0013039883895645571, "loss": 2.5009, "step": 21226 }, { "crossentropy": 2.424189329147339, "epoch": 0.769540313225058, "grad_norm": 0.02622619830071926, "grad_norm_var": 6.432615749833581e-07, "learning_rate": 0.0013035970615757364, "loss": 2.4865, "step": 21227 }, { "crossentropy": 2.405052423477173, "epoch": 0.76957656612529, "grad_norm": 0.0258990116417408, "grad_norm_var": 6.663554393530428e-07, "learning_rate": 0.0013032057835124494, "loss": 2.3813, "step": 21228 }, { "crossentropy": 2.2799839973449707, "epoch": 0.769612819025522, "grad_norm": 0.025336241349577904, "grad_norm_var": 8.483319082091725e-07, "learning_rate": 0.0013028145553799764, "loss": 2.3281, "step": 21229 }, { "crossentropy": 2.269979476928711, "epoch": 0.769649071925754, "grad_norm": 0.02623213827610016, "grad_norm_var": 8.255449019272845e-07, "learning_rate": 0.0013024233771836047, "loss": 2.3235, "step": 21230 }, { "crossentropy": 2.4913558959960938, "epoch": 0.7696853248259861, "grad_norm": 0.026474036276340485, "grad_norm_var": 8.341060109921342e-07, "learning_rate": 0.0013020322489286156, "loss": 2.4919, "step": 21231 }, { "crossentropy": 2.4338769912719727, "epoch": 0.7697215777262181, "grad_norm": 0.026585524901747704, "grad_norm_var": 8.080477027882481e-07, "learning_rate": 0.0013016411706202925, "loss": 2.4889, "step": 21232 }, { "crossentropy": 2.5310864448547363, "epoch": 0.7697578306264501, "grad_norm": 0.02698114514350891, "grad_norm_var": 7.973103033206049e-07, "learning_rate": 0.0013012501422639195, "loss": 2.4559, "step": 21233 }, { "crossentropy": 2.5930399894714355, "epoch": 0.7697940835266821, "grad_norm": 0.02686656452715397, "grad_norm_var": 5.060516987582247e-07, "learning_rate": 0.0013008591638647765, "loss": 2.561, "step": 21234 }, { "crossentropy": 2.4058682918548584, "epoch": 0.7698303364269141, "grad_norm": 0.025958992540836334, "grad_norm_var": 5.383186800859367e-07, "learning_rate": 0.0013004682354281422, "loss": 2.4629, "step": 21235 }, { "crossentropy": 2.522481679916382, "epoch": 0.7698665893271461, "grad_norm": 0.026305420324206352, "grad_norm_var": 2.50975905036604e-07, "learning_rate": 0.0013000773569593006, "loss": 2.5686, "step": 21236 }, { "crossentropy": 2.5986108779907227, "epoch": 0.7699028422273781, "grad_norm": 0.02670043520629406, "grad_norm_var": 2.224345337517586e-07, "learning_rate": 0.0012996865284635273, "loss": 2.5932, "step": 21237 }, { "crossentropy": 2.4547407627105713, "epoch": 0.7699390951276102, "grad_norm": 0.026312509551644325, "grad_norm_var": 1.9369475794382174e-07, "learning_rate": 0.0012992957499461044, "loss": 2.5047, "step": 21238 }, { "crossentropy": 2.5427350997924805, "epoch": 0.7699753480278422, "grad_norm": 0.025780705735087395, "grad_norm_var": 1.834512112593254e-07, "learning_rate": 0.0012989050214123072, "loss": 2.4426, "step": 21239 }, { "crossentropy": 2.420119285583496, "epoch": 0.7700116009280742, "grad_norm": 0.025929605588316917, "grad_norm_var": 1.9290006175753417e-07, "learning_rate": 0.001298514342867414, "loss": 2.4588, "step": 21240 }, { "crossentropy": 2.6109108924865723, "epoch": 0.7700478538283063, "grad_norm": 0.02729390747845173, "grad_norm_var": 2.4545411073198e-07, "learning_rate": 0.0012981237143167034, "loss": 2.5733, "step": 21241 }, { "crossentropy": 2.567831516265869, "epoch": 0.7700841067285383, "grad_norm": 0.027458054944872856, "grad_norm_var": 3.2404959868768436e-07, "learning_rate": 0.001297733135765448, "loss": 2.5149, "step": 21242 }, { "crossentropy": 2.5972890853881836, "epoch": 0.7701203596287703, "grad_norm": 0.02637431025505066, "grad_norm_var": 3.2206183836151065e-07, "learning_rate": 0.001297342607218927, "loss": 2.5335, "step": 21243 }, { "crossentropy": 2.4367282390594482, "epoch": 0.7701566125290024, "grad_norm": 0.027006298303604126, "grad_norm_var": 3.2390946141519495e-07, "learning_rate": 0.0012969521286824131, "loss": 2.4763, "step": 21244 }, { "crossentropy": 2.4088804721832275, "epoch": 0.7701928654292344, "grad_norm": 0.027070917189121246, "grad_norm_var": 2.4865401928076253e-07, "learning_rate": 0.0012965617001611785, "loss": 2.4915, "step": 21245 }, { "crossentropy": 2.357450485229492, "epoch": 0.7702291183294664, "grad_norm": 0.026097595691680908, "grad_norm_var": 2.560823656245903e-07, "learning_rate": 0.0012961713216604997, "loss": 2.3974, "step": 21246 }, { "crossentropy": 2.255500555038452, "epoch": 0.7702653712296984, "grad_norm": 0.025563186034560204, "grad_norm_var": 3.201669046553242e-07, "learning_rate": 0.0012957809931856469, "loss": 2.4122, "step": 21247 }, { "crossentropy": 2.4316532611846924, "epoch": 0.7703016241299304, "grad_norm": 0.02629784308373928, "grad_norm_var": 3.227425885918289e-07, "learning_rate": 0.001295390714741893, "loss": 2.4758, "step": 21248 }, { "crossentropy": 2.6089437007904053, "epoch": 0.7703378770301624, "grad_norm": 0.026652568951249123, "grad_norm_var": 3.0840431965773774e-07, "learning_rate": 0.0012950004863345106, "loss": 2.5619, "step": 21249 }, { "crossentropy": 2.4692270755767822, "epoch": 0.7703741299303944, "grad_norm": 0.02638987824320793, "grad_norm_var": 2.9799281631200335e-07, "learning_rate": 0.0012946103079687682, "loss": 2.4792, "step": 21250 }, { "crossentropy": 2.4511220455169678, "epoch": 0.7704103828306265, "grad_norm": 0.026114970445632935, "grad_norm_var": 2.893119775674094e-07, "learning_rate": 0.0012942201796499375, "loss": 2.4082, "step": 21251 }, { "crossentropy": 2.474695920944214, "epoch": 0.7704466357308585, "grad_norm": 0.02656584233045578, "grad_norm_var": 2.882088369096965e-07, "learning_rate": 0.0012938301013832876, "loss": 2.5025, "step": 21252 }, { "crossentropy": 2.363016128540039, "epoch": 0.7704828886310905, "grad_norm": 0.026197049766778946, "grad_norm_var": 2.8895156756371617e-07, "learning_rate": 0.0012934400731740847, "loss": 2.4373, "step": 21253 }, { "crossentropy": 2.607105255126953, "epoch": 0.7705191415313225, "grad_norm": 0.028012247756123543, "grad_norm_var": 4.397034136256634e-07, "learning_rate": 0.0012930500950276004, "loss": 2.6519, "step": 21254 }, { "crossentropy": 2.470970869064331, "epoch": 0.7705553944315545, "grad_norm": 0.0263343658298254, "grad_norm_var": 4.020488091388899e-07, "learning_rate": 0.0012926601669490984, "loss": 2.5403, "step": 21255 }, { "crossentropy": 2.5655128955841064, "epoch": 0.7705916473317865, "grad_norm": 0.02645358443260193, "grad_norm_var": 3.73425979445493e-07, "learning_rate": 0.0012922702889438464, "loss": 2.5882, "step": 21256 }, { "crossentropy": 2.4766626358032227, "epoch": 0.7706279002320185, "grad_norm": 0.027834469452500343, "grad_norm_var": 4.404291484177604e-07, "learning_rate": 0.0012918804610171132, "loss": 2.537, "step": 21257 }, { "crossentropy": 2.4894392490386963, "epoch": 0.7706641531322506, "grad_norm": 0.026378139853477478, "grad_norm_var": 3.971755502496973e-07, "learning_rate": 0.0012914906831741596, "loss": 2.4481, "step": 21258 }, { "crossentropy": 2.500211715698242, "epoch": 0.7707004060324826, "grad_norm": 0.027424346655607224, "grad_norm_var": 4.367354044534343e-07, "learning_rate": 0.0012911009554202534, "loss": 2.4625, "step": 21259 }, { "crossentropy": 2.403778553009033, "epoch": 0.7707366589327146, "grad_norm": 0.02536713145673275, "grad_norm_var": 5.267022039629008e-07, "learning_rate": 0.0012907112777606577, "loss": 2.4498, "step": 21260 }, { "crossentropy": 2.4477405548095703, "epoch": 0.7707729118329466, "grad_norm": 0.026703083887696266, "grad_norm_var": 5.094698633480975e-07, "learning_rate": 0.0012903216502006337, "loss": 2.528, "step": 21261 }, { "crossentropy": 2.4802114963531494, "epoch": 0.7708091647331786, "grad_norm": 0.02641712874174118, "grad_norm_var": 4.976783617252969e-07, "learning_rate": 0.0012899320727454472, "loss": 2.4641, "step": 21262 }, { "crossentropy": 2.564603567123413, "epoch": 0.7708454176334106, "grad_norm": 0.027108125388622284, "grad_norm_var": 4.447923168706859e-07, "learning_rate": 0.0012895425454003567, "loss": 2.5445, "step": 21263 }, { "crossentropy": 2.529895305633545, "epoch": 0.7708816705336426, "grad_norm": 0.025730030611157417, "grad_norm_var": 4.908981241056084e-07, "learning_rate": 0.0012891530681706249, "loss": 2.4126, "step": 21264 }, { "crossentropy": 2.468472719192505, "epoch": 0.7709179234338747, "grad_norm": 0.026888402178883553, "grad_norm_var": 4.958641599392035e-07, "learning_rate": 0.0012887636410615134, "loss": 2.5837, "step": 21265 }, { "crossentropy": 2.5058157444000244, "epoch": 0.7709541763341067, "grad_norm": 0.026412414386868477, "grad_norm_var": 4.952046539561753e-07, "learning_rate": 0.0012883742640782798, "loss": 2.5376, "step": 21266 }, { "crossentropy": 2.3589868545532227, "epoch": 0.7709904292343387, "grad_norm": 0.0252698864787817, "grad_norm_var": 5.968959724855869e-07, "learning_rate": 0.0012879849372261854, "loss": 2.4626, "step": 21267 }, { "crossentropy": 2.4395358562469482, "epoch": 0.7710266821345708, "grad_norm": 0.025303704664111137, "grad_norm_var": 6.969078076359802e-07, "learning_rate": 0.0012875956605104882, "loss": 2.4488, "step": 21268 }, { "crossentropy": 2.668174982070923, "epoch": 0.7710629350348028, "grad_norm": 0.027231289073824883, "grad_norm_var": 7.234143227837491e-07, "learning_rate": 0.0012872064339364436, "loss": 2.5729, "step": 21269 }, { "crossentropy": 2.463381052017212, "epoch": 0.7710991879350348, "grad_norm": 0.026348071172833443, "grad_norm_var": 5.729964897758227e-07, "learning_rate": 0.0012868172575093123, "loss": 2.5774, "step": 21270 }, { "crossentropy": 2.4200053215026855, "epoch": 0.7711354408352669, "grad_norm": 0.025805717334151268, "grad_norm_var": 5.98632350114767e-07, "learning_rate": 0.0012864281312343468, "loss": 2.4712, "step": 21271 }, { "crossentropy": 2.5242531299591064, "epoch": 0.7711716937354989, "grad_norm": 0.026798050850629807, "grad_norm_var": 6.077185799035899e-07, "learning_rate": 0.0012860390551168055, "loss": 2.5491, "step": 21272 }, { "crossentropy": 2.454472541809082, "epoch": 0.7712079466357309, "grad_norm": 0.029117045924067497, "grad_norm_var": 9.492135711667287e-07, "learning_rate": 0.0012856500291619439, "loss": 2.4256, "step": 21273 }, { "crossentropy": 2.579986095428467, "epoch": 0.7712441995359629, "grad_norm": 0.027197521179914474, "grad_norm_var": 9.757958646716546e-07, "learning_rate": 0.0012852610533750143, "loss": 2.4993, "step": 21274 }, { "crossentropy": 2.4578258991241455, "epoch": 0.7712804524361949, "grad_norm": 0.02768171951174736, "grad_norm_var": 1.0092498167709225e-06, "learning_rate": 0.001284872127761273, "loss": 2.5651, "step": 21275 }, { "crossentropy": 2.5304625034332275, "epoch": 0.7713167053364269, "grad_norm": 0.0290684811770916, "grad_norm_var": 1.2638688523971633e-06, "learning_rate": 0.0012844832523259714, "loss": 2.5042, "step": 21276 }, { "crossentropy": 2.3793091773986816, "epoch": 0.771352958236659, "grad_norm": 0.0263994000852108, "grad_norm_var": 1.274267387865605e-06, "learning_rate": 0.0012840944270743604, "loss": 2.4495, "step": 21277 }, { "crossentropy": 2.2909433841705322, "epoch": 0.771389211136891, "grad_norm": 0.026804735884070396, "grad_norm_var": 1.2639445203514732e-06, "learning_rate": 0.0012837056520116946, "loss": 2.3741, "step": 21278 }, { "crossentropy": 2.4959123134613037, "epoch": 0.771425464037123, "grad_norm": 0.027266710996627808, "grad_norm_var": 1.2715497606327619e-06, "learning_rate": 0.0012833169271432227, "loss": 2.4691, "step": 21279 }, { "crossentropy": 2.5064427852630615, "epoch": 0.771461716937355, "grad_norm": 0.027190297842025757, "grad_norm_var": 1.1901314963982338e-06, "learning_rate": 0.0012829282524741952, "loss": 2.3825, "step": 21280 }, { "crossentropy": 2.2863688468933105, "epoch": 0.771497969837587, "grad_norm": 0.025595419108867645, "grad_norm_var": 1.3007503668721956e-06, "learning_rate": 0.0012825396280098645, "loss": 2.4105, "step": 21281 }, { "crossentropy": 2.3890860080718994, "epoch": 0.771534222737819, "grad_norm": 0.025852613151073456, "grad_norm_var": 1.3524869393397118e-06, "learning_rate": 0.001282151053755476, "loss": 2.4329, "step": 21282 }, { "crossentropy": 2.375225067138672, "epoch": 0.771570475638051, "grad_norm": 0.025835391134023666, "grad_norm_var": 1.2564868885747703e-06, "learning_rate": 0.0012817625297162811, "loss": 2.4458, "step": 21283 }, { "crossentropy": 2.333575963973999, "epoch": 0.771606728538283, "grad_norm": 0.02525343932211399, "grad_norm_var": 1.266964650746029e-06, "learning_rate": 0.001281374055897524, "loss": 2.3982, "step": 21284 }, { "crossentropy": 2.5489792823791504, "epoch": 0.7716429814385151, "grad_norm": 0.02632748894393444, "grad_norm_var": 1.2709095842245235e-06, "learning_rate": 0.0012809856323044554, "loss": 2.4898, "step": 21285 }, { "crossentropy": 2.399272918701172, "epoch": 0.7716792343387471, "grad_norm": 0.02503071539103985, "grad_norm_var": 1.4559226882586428e-06, "learning_rate": 0.00128059725894232, "loss": 2.4748, "step": 21286 }, { "crossentropy": 2.552985429763794, "epoch": 0.7717154872389791, "grad_norm": 0.02678367681801319, "grad_norm_var": 1.3988866701748642e-06, "learning_rate": 0.0012802089358163605, "loss": 2.5333, "step": 21287 }, { "crossentropy": 2.391350507736206, "epoch": 0.7717517401392111, "grad_norm": 0.026177071034908295, "grad_norm_var": 1.420058161067291e-06, "learning_rate": 0.0012798206629318248, "loss": 2.4432, "step": 21288 }, { "crossentropy": 2.3333466053009033, "epoch": 0.7717879930394431, "grad_norm": 0.026068586856126785, "grad_norm_var": 1.0281390094211135e-06, "learning_rate": 0.0012794324402939582, "loss": 2.3673, "step": 21289 }, { "crossentropy": 2.4650821685791016, "epoch": 0.7718242459396751, "grad_norm": 0.02663567289710045, "grad_norm_var": 9.981119254016581e-07, "learning_rate": 0.0012790442679080011, "loss": 2.4967, "step": 21290 }, { "crossentropy": 2.533784866333008, "epoch": 0.7718604988399071, "grad_norm": 0.027288326993584633, "grad_norm_var": 9.457066389073968e-07, "learning_rate": 0.0012786561457791995, "loss": 2.4893, "step": 21291 }, { "crossentropy": 2.5430924892425537, "epoch": 0.7718967517401392, "grad_norm": 0.026930736377835274, "grad_norm_var": 4.917104765789764e-07, "learning_rate": 0.0012782680739127927, "loss": 2.5762, "step": 21292 }, { "crossentropy": 2.4746241569519043, "epoch": 0.7719330046403712, "grad_norm": 0.029236601665616035, "grad_norm_var": 1.0172815086186466e-06, "learning_rate": 0.001277880052314025, "loss": 2.4779, "step": 21293 }, { "crossentropy": 2.393157720565796, "epoch": 0.7719692575406032, "grad_norm": 0.025577135384082794, "grad_norm_var": 1.06442867226357e-06, "learning_rate": 0.0012774920809881351, "loss": 2.4294, "step": 21294 }, { "crossentropy": 2.5020906925201416, "epoch": 0.7720055104408353, "grad_norm": 0.02731563150882721, "grad_norm_var": 1.069966635629466e-06, "learning_rate": 0.0012771041599403626, "loss": 2.5718, "step": 21295 }, { "crossentropy": 2.6824159622192383, "epoch": 0.7720417633410673, "grad_norm": 0.02589859440922737, "grad_norm_var": 1.045659088126519e-06, "learning_rate": 0.0012767162891759482, "loss": 2.6199, "step": 21296 }, { "crossentropy": 2.478074073791504, "epoch": 0.7720780162412993, "grad_norm": 0.02617419697344303, "grad_norm_var": 1.0073654021463632e-06, "learning_rate": 0.001276328468700132, "loss": 2.39, "step": 21297 }, { "crossentropy": 2.525874376296997, "epoch": 0.7721142691415314, "grad_norm": 0.02642781101167202, "grad_norm_var": 9.86130609475462e-07, "learning_rate": 0.0012759406985181493, "loss": 2.5118, "step": 21298 }, { "crossentropy": 2.511460542678833, "epoch": 0.7721505220417634, "grad_norm": 0.026279840618371964, "grad_norm_var": 9.629398004087301e-07, "learning_rate": 0.0012755529786352398, "loss": 2.5141, "step": 21299 }, { "crossentropy": 2.3803510665893555, "epoch": 0.7721867749419954, "grad_norm": 0.02596679888665676, "grad_norm_var": 8.797127325894099e-07, "learning_rate": 0.001275165309056639, "loss": 2.4757, "step": 21300 }, { "crossentropy": 2.5284807682037354, "epoch": 0.7722230278422274, "grad_norm": 0.026707442477345467, "grad_norm_var": 8.796196086161781e-07, "learning_rate": 0.001274777689787584, "loss": 2.5573, "step": 21301 }, { "crossentropy": 2.4837560653686523, "epoch": 0.7722592807424594, "grad_norm": 0.026029108092188835, "grad_norm_var": 7.421788077899601e-07, "learning_rate": 0.0012743901208333097, "loss": 2.5041, "step": 21302 }, { "crossentropy": 2.60892653465271, "epoch": 0.7722955336426914, "grad_norm": 0.02676909603178501, "grad_norm_var": 7.41822521274045e-07, "learning_rate": 0.001274002602199049, "loss": 2.5265, "step": 21303 }, { "crossentropy": 2.5285885334014893, "epoch": 0.7723317865429234, "grad_norm": 0.02775232493877411, "grad_norm_var": 8.096226257322021e-07, "learning_rate": 0.0012736151338900382, "loss": 2.5082, "step": 21304 }, { "crossentropy": 2.5694077014923096, "epoch": 0.7723680394431555, "grad_norm": 0.027952253818511963, "grad_norm_var": 8.750327536921708e-07, "learning_rate": 0.0012732277159115107, "loss": 2.534, "step": 21305 }, { "crossentropy": 2.4349756240844727, "epoch": 0.7724042923433875, "grad_norm": 0.02646835520863533, "grad_norm_var": 8.806458276648059e-07, "learning_rate": 0.0012728403482686979, "loss": 2.4498, "step": 21306 }, { "crossentropy": 2.521284818649292, "epoch": 0.7724405452436195, "grad_norm": 0.026510324329137802, "grad_norm_var": 8.676534585980513e-07, "learning_rate": 0.001272453030966833, "loss": 2.49, "step": 21307 }, { "crossentropy": 2.4563217163085938, "epoch": 0.7724767981438515, "grad_norm": 0.026145529001951218, "grad_norm_var": 8.872412745522847e-07, "learning_rate": 0.0012720657640111455, "loss": 2.4431, "step": 21308 }, { "crossentropy": 2.5542428493499756, "epoch": 0.7725130510440835, "grad_norm": 0.026760481297969818, "grad_norm_var": 4.3320994902769594e-07, "learning_rate": 0.001271678547406867, "loss": 2.5451, "step": 21309 }, { "crossentropy": 2.489501476287842, "epoch": 0.7725493039443155, "grad_norm": 0.02593351900577545, "grad_norm_var": 3.9511289423119903e-07, "learning_rate": 0.0012712913811592308, "loss": 2.4614, "step": 21310 }, { "crossentropy": 2.4882044792175293, "epoch": 0.7725855568445475, "grad_norm": 0.026775933802127838, "grad_norm_var": 3.5953303628936254e-07, "learning_rate": 0.0012709042652734603, "loss": 2.418, "step": 21311 }, { "crossentropy": 2.422299385070801, "epoch": 0.7726218097447796, "grad_norm": 0.026612402871251106, "grad_norm_var": 3.308585420367608e-07, "learning_rate": 0.0012705171997547866, "loss": 2.4807, "step": 21312 }, { "crossentropy": 2.545722007751465, "epoch": 0.7726580626450116, "grad_norm": 0.02665085718035698, "grad_norm_var": 3.1932608501681474e-07, "learning_rate": 0.001270130184608439, "loss": 2.5231, "step": 21313 }, { "crossentropy": 2.4157536029815674, "epoch": 0.7726943155452436, "grad_norm": 0.026582002639770508, "grad_norm_var": 3.170894517058916e-07, "learning_rate": 0.0012697432198396424, "loss": 2.5446, "step": 21314 }, { "crossentropy": 2.3034508228302, "epoch": 0.7727305684454756, "grad_norm": 0.025936784222722054, "grad_norm_var": 3.399362737271349e-07, "learning_rate": 0.001269356305453626, "loss": 2.4338, "step": 21315 }, { "crossentropy": 2.661849021911621, "epoch": 0.7727668213457076, "grad_norm": 0.027328025549650192, "grad_norm_var": 3.4135159047847474e-07, "learning_rate": 0.0012689694414556125, "loss": 2.5795, "step": 21316 }, { "crossentropy": 2.5748775005340576, "epoch": 0.7728030742459396, "grad_norm": 0.026321278885006905, "grad_norm_var": 3.493695945204345e-07, "learning_rate": 0.001268582627850829, "loss": 2.5257, "step": 21317 }, { "crossentropy": 2.4258201122283936, "epoch": 0.7728393271461717, "grad_norm": 0.026793401688337326, "grad_norm_var": 3.217891338465802e-07, "learning_rate": 0.0012681958646445024, "loss": 2.4591, "step": 21318 }, { "crossentropy": 2.5419745445251465, "epoch": 0.7728755800464037, "grad_norm": 0.026392553001642227, "grad_norm_var": 3.27472130519007e-07, "learning_rate": 0.001267809151841851, "loss": 2.4709, "step": 21319 }, { "crossentropy": 2.464745044708252, "epoch": 0.7729118329466357, "grad_norm": 0.02668185718357563, "grad_norm_var": 2.463604388990825e-07, "learning_rate": 0.0012674224894481006, "loss": 2.4842, "step": 21320 }, { "crossentropy": 2.3761770725250244, "epoch": 0.7729480858468677, "grad_norm": 0.02573557198047638, "grad_norm_var": 1.5833253724546978e-07, "learning_rate": 0.0012670358774684754, "loss": 2.3954, "step": 21321 }, { "crossentropy": 2.5640859603881836, "epoch": 0.7729843387470998, "grad_norm": 0.0266070868819952, "grad_norm_var": 1.593791440647792e-07, "learning_rate": 0.0012666493159081943, "loss": 2.4825, "step": 21322 }, { "crossentropy": 2.4014291763305664, "epoch": 0.7730205916473318, "grad_norm": 0.026173407211899757, "grad_norm_var": 1.653574537129864e-07, "learning_rate": 0.001266262804772481, "loss": 2.4509, "step": 21323 }, { "crossentropy": 2.6331799030303955, "epoch": 0.7730568445475638, "grad_norm": 0.026821112260222435, "grad_norm_var": 1.651584159842543e-07, "learning_rate": 0.0012658763440665539, "loss": 2.6296, "step": 21324 }, { "crossentropy": 2.5383450984954834, "epoch": 0.7730930974477959, "grad_norm": 0.026599129661917686, "grad_norm_var": 1.6132458300986e-07, "learning_rate": 0.0012654899337956327, "loss": 2.5157, "step": 21325 }, { "crossentropy": 2.5113608837127686, "epoch": 0.7731293503480279, "grad_norm": 0.025737591087818146, "grad_norm_var": 1.7843248521955357e-07, "learning_rate": 0.0012651035739649392, "loss": 2.5555, "step": 21326 }, { "crossentropy": 2.435983896255493, "epoch": 0.7731656032482599, "grad_norm": 0.026345742866396904, "grad_norm_var": 1.7327194089832888e-07, "learning_rate": 0.00126471726457969, "loss": 2.4763, "step": 21327 }, { "crossentropy": 2.4930615425109863, "epoch": 0.7732018561484919, "grad_norm": 0.02703275717794895, "grad_norm_var": 1.9300161380798353e-07, "learning_rate": 0.0012643310056451008, "loss": 2.4895, "step": 21328 }, { "crossentropy": 2.607109785079956, "epoch": 0.7732381090487239, "grad_norm": 0.026809168979525566, "grad_norm_var": 1.9809647523686552e-07, "learning_rate": 0.001263944797166392, "loss": 2.525, "step": 21329 }, { "crossentropy": 2.549825668334961, "epoch": 0.7732743619489559, "grad_norm": 0.026490118354558945, "grad_norm_var": 1.9754100500946985e-07, "learning_rate": 0.0012635586391487768, "loss": 2.454, "step": 21330 }, { "crossentropy": 2.4367117881774902, "epoch": 0.773310614849188, "grad_norm": 0.027842916548252106, "grad_norm_var": 2.8457108371750194e-07, "learning_rate": 0.001263172531597474, "loss": 2.4325, "step": 21331 }, { "crossentropy": 2.329066038131714, "epoch": 0.77334686774942, "grad_norm": 0.02784663811326027, "grad_norm_var": 3.5123995486585567e-07, "learning_rate": 0.0012627864745176954, "loss": 2.4905, "step": 21332 }, { "crossentropy": 2.7082042694091797, "epoch": 0.773383120649652, "grad_norm": 0.025967668741941452, "grad_norm_var": 3.740535428694199e-07, "learning_rate": 0.0012624004679146572, "loss": 2.6538, "step": 21333 }, { "crossentropy": 2.4457108974456787, "epoch": 0.773419373549884, "grad_norm": 0.026135485619306564, "grad_norm_var": 3.856584435098694e-07, "learning_rate": 0.0012620145117935738, "loss": 2.4751, "step": 21334 }, { "crossentropy": 2.4140989780426025, "epoch": 0.773455626450116, "grad_norm": 0.02656702883541584, "grad_norm_var": 3.83289368632147e-07, "learning_rate": 0.001261628606159657, "loss": 2.4768, "step": 21335 }, { "crossentropy": 2.5779542922973633, "epoch": 0.773491879350348, "grad_norm": 0.025138992816209793, "grad_norm_var": 5.125691795643261e-07, "learning_rate": 0.001261242751018118, "loss": 2.4774, "step": 21336 }, { "crossentropy": 2.562507390975952, "epoch": 0.77352813225058, "grad_norm": 0.026171857491135597, "grad_norm_var": 4.805417361062319e-07, "learning_rate": 0.001260856946374171, "loss": 2.53, "step": 21337 }, { "crossentropy": 2.472979784011841, "epoch": 0.773564385150812, "grad_norm": 0.02564522624015808, "grad_norm_var": 5.269296085014599e-07, "learning_rate": 0.001260471192233023, "loss": 2.4907, "step": 21338 }, { "crossentropy": 2.537001132965088, "epoch": 0.7736006380510441, "grad_norm": 0.026635214686393738, "grad_norm_var": 5.227472854496164e-07, "learning_rate": 0.0012600854885998886, "loss": 2.519, "step": 21339 }, { "crossentropy": 2.467059850692749, "epoch": 0.7736368909512761, "grad_norm": 0.028468411415815353, "grad_norm_var": 7.65804770638349e-07, "learning_rate": 0.0012596998354799738, "loss": 2.4405, "step": 21340 }, { "crossentropy": 2.4540162086486816, "epoch": 0.7736731438515081, "grad_norm": 0.025474701076745987, "grad_norm_var": 8.434005419025489e-07, "learning_rate": 0.0012593142328784885, "loss": 2.4258, "step": 21341 }, { "crossentropy": 2.4915964603424072, "epoch": 0.7737093967517401, "grad_norm": 0.026258930563926697, "grad_norm_var": 8.06046496429448e-07, "learning_rate": 0.0012589286808006427, "loss": 2.5512, "step": 21342 }, { "crossentropy": 2.35510516166687, "epoch": 0.7737456496519721, "grad_norm": 0.026797344908118248, "grad_norm_var": 8.063778279970701e-07, "learning_rate": 0.0012585431792516422, "loss": 2.4899, "step": 21343 }, { "crossentropy": 2.3898050785064697, "epoch": 0.7737819025522041, "grad_norm": 0.025539785623550415, "grad_norm_var": 8.555915622908301e-07, "learning_rate": 0.001258157728236693, "loss": 2.495, "step": 21344 }, { "crossentropy": 2.5814449787139893, "epoch": 0.7738181554524362, "grad_norm": 0.0266016386449337, "grad_norm_var": 8.493643791432677e-07, "learning_rate": 0.0012577723277610031, "loss": 2.5447, "step": 21345 }, { "crossentropy": 2.4471418857574463, "epoch": 0.7738544083526682, "grad_norm": 0.026506025344133377, "grad_norm_var": 8.494146500574353e-07, "learning_rate": 0.0012573869778297754, "loss": 2.5368, "step": 21346 }, { "crossentropy": 2.4385604858398438, "epoch": 0.7738906612529002, "grad_norm": 0.026922019198536873, "grad_norm_var": 7.344400942581203e-07, "learning_rate": 0.001257001678448218, "loss": 2.5229, "step": 21347 }, { "crossentropy": 2.4398036003112793, "epoch": 0.7739269141531323, "grad_norm": 0.03061164915561676, "grad_norm_var": 1.7392179940493395e-06, "learning_rate": 0.0012566164296215316, "loss": 2.427, "step": 21348 }, { "crossentropy": 2.384495258331299, "epoch": 0.7739631670533643, "grad_norm": 0.026114100590348244, "grad_norm_var": 1.7284051715129075e-06, "learning_rate": 0.0012562312313549211, "loss": 2.4227, "step": 21349 }, { "crossentropy": 2.472635507583618, "epoch": 0.7739994199535963, "grad_norm": 0.02613140642642975, "grad_norm_var": 1.7286584634281274e-06, "learning_rate": 0.0012558460836535907, "loss": 2.4616, "step": 21350 }, { "crossentropy": 2.3671281337738037, "epoch": 0.7740356728538283, "grad_norm": 0.02534761093556881, "grad_norm_var": 1.8267962597775702e-06, "learning_rate": 0.001255460986522739, "loss": 2.4045, "step": 21351 }, { "crossentropy": 2.418043375015259, "epoch": 0.7740719257540604, "grad_norm": 0.02677394635975361, "grad_norm_var": 1.692200364945308e-06, "learning_rate": 0.001255075939967571, "loss": 2.4374, "step": 21352 }, { "crossentropy": 2.4128000736236572, "epoch": 0.7741081786542924, "grad_norm": 0.02746487967669964, "grad_norm_var": 1.718572819460242e-06, "learning_rate": 0.0012546909439932858, "loss": 2.4684, "step": 21353 }, { "crossentropy": 2.4169909954071045, "epoch": 0.7741444315545244, "grad_norm": 0.026558713987469673, "grad_norm_var": 1.6415497181312128e-06, "learning_rate": 0.0012543059986050815, "loss": 2.5095, "step": 21354 }, { "crossentropy": 2.390519380569458, "epoch": 0.7741806844547564, "grad_norm": 0.026049010455608368, "grad_norm_var": 1.6730067822136174e-06, "learning_rate": 0.0012539211038081588, "loss": 2.3969, "step": 21355 }, { "crossentropy": 2.365427017211914, "epoch": 0.7742169373549884, "grad_norm": 0.02702947147190571, "grad_norm_var": 1.4681693798412424e-06, "learning_rate": 0.0012535362596077183, "loss": 2.425, "step": 21356 }, { "crossentropy": 2.3427140712738037, "epoch": 0.7742531902552204, "grad_norm": 0.025255367159843445, "grad_norm_var": 1.5051472898703136e-06, "learning_rate": 0.0012531514660089537, "loss": 2.4312, "step": 21357 }, { "crossentropy": 2.4592881202697754, "epoch": 0.7742894431554525, "grad_norm": 0.0257754847407341, "grad_norm_var": 1.5431979213257133e-06, "learning_rate": 0.001252766723017067, "loss": 2.4468, "step": 21358 }, { "crossentropy": 2.4489407539367676, "epoch": 0.7743256960556845, "grad_norm": 0.026095019653439522, "grad_norm_var": 1.5548353077522726e-06, "learning_rate": 0.0012523820306372508, "loss": 2.5065, "step": 21359 }, { "crossentropy": 2.4719698429107666, "epoch": 0.7743619489559165, "grad_norm": 0.028749441727995872, "grad_norm_var": 1.7670166145143518e-06, "learning_rate": 0.0012519973888747038, "loss": 2.4643, "step": 21360 }, { "crossentropy": 2.4718823432922363, "epoch": 0.7743982018561485, "grad_norm": 0.02679036371409893, "grad_norm_var": 1.7655317741232153e-06, "learning_rate": 0.0012516127977346197, "loss": 2.3925, "step": 21361 }, { "crossentropy": 2.301340341567993, "epoch": 0.7744344547563805, "grad_norm": 0.02578875981271267, "grad_norm_var": 1.8220618410837945e-06, "learning_rate": 0.0012512282572221917, "loss": 2.329, "step": 21362 }, { "crossentropy": 2.3814661502838135, "epoch": 0.7744707076566125, "grad_norm": 0.026428895071148872, "grad_norm_var": 1.8237194320004038e-06, "learning_rate": 0.0012508437673426148, "loss": 2.4402, "step": 21363 }, { "crossentropy": 2.4981253147125244, "epoch": 0.7745069605568445, "grad_norm": 0.027278495952486992, "grad_norm_var": 7.731201938254717e-07, "learning_rate": 0.0012504593281010845, "loss": 2.5834, "step": 21364 }, { "crossentropy": 2.5330419540405273, "epoch": 0.7745432134570766, "grad_norm": 0.02706303820014, "grad_norm_var": 7.834926653608413e-07, "learning_rate": 0.0012500749395027895, "loss": 2.4918, "step": 21365 }, { "crossentropy": 2.3630385398864746, "epoch": 0.7745794663573086, "grad_norm": 0.025936413556337357, "grad_norm_var": 7.963944483119761e-07, "learning_rate": 0.0012496906015529241, "loss": 2.4135, "step": 21366 }, { "crossentropy": 2.632269859313965, "epoch": 0.7746157192575406, "grad_norm": 0.027258729562163353, "grad_norm_var": 7.248907761169537e-07, "learning_rate": 0.0012493063142566775, "loss": 2.4859, "step": 21367 }, { "crossentropy": 2.492640972137451, "epoch": 0.7746519721577726, "grad_norm": 0.030277574434876442, "grad_norm_var": 1.5530410440125897e-06, "learning_rate": 0.0012489220776192406, "loss": 2.5277, "step": 21368 }, { "crossentropy": 2.4577059745788574, "epoch": 0.7746882250580046, "grad_norm": 0.026139235123991966, "grad_norm_var": 1.5563984431658419e-06, "learning_rate": 0.0012485378916458068, "loss": 2.5102, "step": 21369 }, { "crossentropy": 2.378462791442871, "epoch": 0.7747244779582366, "grad_norm": 0.025187291204929352, "grad_norm_var": 1.714343624403293e-06, "learning_rate": 0.0012481537563415585, "loss": 2.3997, "step": 21370 }, { "crossentropy": 2.4773764610290527, "epoch": 0.7747607308584686, "grad_norm": 0.02546696737408638, "grad_norm_var": 1.7855650722257976e-06, "learning_rate": 0.0012477696717116877, "loss": 2.4651, "step": 21371 }, { "crossentropy": 2.560861587524414, "epoch": 0.7747969837587007, "grad_norm": 0.0264532919973135, "grad_norm_var": 1.7777403209831912e-06, "learning_rate": 0.001247385637761383, "loss": 2.5367, "step": 21372 }, { "crossentropy": 2.418400764465332, "epoch": 0.7748332366589327, "grad_norm": 0.026014523580670357, "grad_norm_var": 1.6754767535801743e-06, "learning_rate": 0.0012470016544958284, "loss": 2.4688, "step": 21373 }, { "crossentropy": 2.3857765197753906, "epoch": 0.7748694895591647, "grad_norm": 0.02684069611132145, "grad_norm_var": 1.6194938192382048e-06, "learning_rate": 0.0012466177219202135, "loss": 2.3923, "step": 21374 }, { "crossentropy": 2.6265156269073486, "epoch": 0.7749057424593968, "grad_norm": 0.0256312545388937, "grad_norm_var": 1.6725433734260734e-06, "learning_rate": 0.001246233840039721, "loss": 2.6079, "step": 21375 }, { "crossentropy": 2.559548854827881, "epoch": 0.7749419953596288, "grad_norm": 0.027168866246938705, "grad_norm_var": 1.3981583732099266e-06, "learning_rate": 0.001245850008859537, "loss": 2.5025, "step": 21376 }, { "crossentropy": 2.547630786895752, "epoch": 0.7749782482598608, "grad_norm": 0.026806458830833435, "grad_norm_var": 1.3985664027030623e-06, "learning_rate": 0.0012454662283848483, "loss": 2.4487, "step": 21377 }, { "crossentropy": 2.5632970333099365, "epoch": 0.7750145011600929, "grad_norm": 0.02670975774526596, "grad_norm_var": 1.350882885215116e-06, "learning_rate": 0.0012450824986208332, "loss": 2.539, "step": 21378 }, { "crossentropy": 2.425192356109619, "epoch": 0.7750507540603249, "grad_norm": 0.026224397122859955, "grad_norm_var": 1.3599709495733133e-06, "learning_rate": 0.0012446988195726771, "loss": 2.4822, "step": 21379 }, { "crossentropy": 2.5340075492858887, "epoch": 0.7750870069605569, "grad_norm": 0.02717445231974125, "grad_norm_var": 1.351978130070819e-06, "learning_rate": 0.001244315191245564, "loss": 2.5259, "step": 21380 }, { "crossentropy": 2.552607536315918, "epoch": 0.7751232598607889, "grad_norm": 0.02678479440510273, "grad_norm_var": 1.3413843806490128e-06, "learning_rate": 0.0012439316136446722, "loss": 2.5795, "step": 21381 }, { "crossentropy": 2.5766170024871826, "epoch": 0.7751595127610209, "grad_norm": 0.027178144082427025, "grad_norm_var": 1.322974604852506e-06, "learning_rate": 0.0012435480867751853, "loss": 2.4912, "step": 21382 }, { "crossentropy": 2.430377960205078, "epoch": 0.7751957656612529, "grad_norm": 0.026031821966171265, "grad_norm_var": 1.3268451778571891e-06, "learning_rate": 0.0012431646106422806, "loss": 2.5483, "step": 21383 }, { "crossentropy": 2.507596492767334, "epoch": 0.7752320185614849, "grad_norm": 0.027027271687984467, "grad_norm_var": 4.066196464203031e-07, "learning_rate": 0.0012427811852511394, "loss": 2.5369, "step": 21384 }, { "crossentropy": 2.5126357078552246, "epoch": 0.775268271461717, "grad_norm": 0.028081873431801796, "grad_norm_var": 5.678315092293972e-07, "learning_rate": 0.0012423978106069428, "loss": 2.6046, "step": 21385 }, { "crossentropy": 2.4758481979370117, "epoch": 0.775304524361949, "grad_norm": 0.028242597356438637, "grad_norm_var": 5.965919403574099e-07, "learning_rate": 0.0012420144867148631, "loss": 2.5255, "step": 21386 }, { "crossentropy": 2.4451844692230225, "epoch": 0.775340777262181, "grad_norm": 0.02575141005218029, "grad_norm_var": 5.533747373780614e-07, "learning_rate": 0.0012416312135800805, "loss": 2.5348, "step": 21387 }, { "crossentropy": 2.489985704421997, "epoch": 0.775377030162413, "grad_norm": 0.02628258243203163, "grad_norm_var": 5.621225516801483e-07, "learning_rate": 0.001241247991207774, "loss": 2.4101, "step": 21388 }, { "crossentropy": 2.371340751647949, "epoch": 0.775413283062645, "grad_norm": 0.02693641185760498, "grad_norm_var": 5.252135057929111e-07, "learning_rate": 0.0012408648196031154, "loss": 2.5476, "step": 21389 }, { "crossentropy": 2.4338929653167725, "epoch": 0.775449535962877, "grad_norm": 0.026308119297027588, "grad_norm_var": 5.403740970804135e-07, "learning_rate": 0.0012404816987712835, "loss": 2.4873, "step": 21390 }, { "crossentropy": 2.599745750427246, "epoch": 0.775485788863109, "grad_norm": 0.026703109964728355, "grad_norm_var": 4.492554435156822e-07, "learning_rate": 0.001240098628717451, "loss": 2.6037, "step": 21391 }, { "crossentropy": 2.517033576965332, "epoch": 0.775522041763341, "grad_norm": 0.026496099308133125, "grad_norm_var": 4.4788726480278024e-07, "learning_rate": 0.0012397156094467916, "loss": 2.5437, "step": 21392 }, { "crossentropy": 2.4919321537017822, "epoch": 0.7755582946635731, "grad_norm": 0.025667553767561913, "grad_norm_var": 5.273994334091854e-07, "learning_rate": 0.0012393326409644806, "loss": 2.408, "step": 21393 }, { "crossentropy": 2.3627021312713623, "epoch": 0.7755945475638051, "grad_norm": 0.02560438960790634, "grad_norm_var": 6.060144487724742e-07, "learning_rate": 0.0012389497232756896, "loss": 2.4523, "step": 21394 }, { "crossentropy": 2.4275600910186768, "epoch": 0.7756308004640371, "grad_norm": 0.027508454397320747, "grad_norm_var": 6.35181326790607e-07, "learning_rate": 0.001238566856385589, "loss": 2.4569, "step": 21395 }, { "crossentropy": 2.5126876831054688, "epoch": 0.7756670533642691, "grad_norm": 0.025909075513482094, "grad_norm_var": 6.613132017600776e-07, "learning_rate": 0.0012381840402993526, "loss": 2.4127, "step": 21396 }, { "crossentropy": 2.344468593597412, "epoch": 0.7757033062645011, "grad_norm": 0.025769127532839775, "grad_norm_var": 7.084951615151634e-07, "learning_rate": 0.0012378012750221478, "loss": 2.3953, "step": 21397 }, { "crossentropy": 2.4137215614318848, "epoch": 0.7757395591647331, "grad_norm": 0.027279838919639587, "grad_norm_var": 7.170671670913975e-07, "learning_rate": 0.0012374185605591476, "loss": 2.5061, "step": 21398 }, { "crossentropy": 2.5329160690307617, "epoch": 0.7757758120649652, "grad_norm": 0.025964193046092987, "grad_norm_var": 7.224762422626165e-07, "learning_rate": 0.0012370358969155182, "loss": 2.5241, "step": 21399 }, { "crossentropy": 2.5652663707733154, "epoch": 0.7758120649651972, "grad_norm": 0.02759559266269207, "grad_norm_var": 7.753615691084732e-07, "learning_rate": 0.0012366532840964294, "loss": 2.4429, "step": 21400 }, { "crossentropy": 2.362164258956909, "epoch": 0.7758483178654292, "grad_norm": 0.02758636325597763, "grad_norm_var": 6.948691641633544e-07, "learning_rate": 0.0012362707221070506, "loss": 2.4752, "step": 21401 }, { "crossentropy": 2.518544912338257, "epoch": 0.7758845707656613, "grad_norm": 0.027229109779000282, "grad_norm_var": 5.371410899224766e-07, "learning_rate": 0.0012358882109525476, "loss": 2.5354, "step": 21402 }, { "crossentropy": 2.622962236404419, "epoch": 0.7759208236658933, "grad_norm": 0.02649371698498726, "grad_norm_var": 4.938301492137438e-07, "learning_rate": 0.0012355057506380851, "loss": 2.5598, "step": 21403 }, { "crossentropy": 2.429584503173828, "epoch": 0.7759570765661253, "grad_norm": 0.025516537949442863, "grad_norm_var": 5.612277197152565e-07, "learning_rate": 0.0012351233411688318, "loss": 2.3692, "step": 21404 }, { "crossentropy": 2.476501226425171, "epoch": 0.7759933294663574, "grad_norm": 0.027892570942640305, "grad_norm_var": 6.69481573396903e-07, "learning_rate": 0.001234740982549949, "loss": 2.5349, "step": 21405 }, { "crossentropy": 2.3991758823394775, "epoch": 0.7760295823665894, "grad_norm": 0.027530161663889885, "grad_norm_var": 7.160350082528628e-07, "learning_rate": 0.0012343586747866054, "loss": 2.4641, "step": 21406 }, { "crossentropy": 2.387117385864258, "epoch": 0.7760658352668214, "grad_norm": 0.030786029994487762, "grad_norm_var": 1.775068397896863e-06, "learning_rate": 0.0012339764178839609, "loss": 2.5076, "step": 21407 }, { "crossentropy": 2.370213031768799, "epoch": 0.7761020881670534, "grad_norm": 0.02705248072743416, "grad_norm_var": 1.762464662939898e-06, "learning_rate": 0.00123359421184718, "loss": 2.4363, "step": 21408 }, { "crossentropy": 2.5249850749969482, "epoch": 0.7761383410672854, "grad_norm": 0.0281874630600214, "grad_norm_var": 1.7245606751550269e-06, "learning_rate": 0.0012332120566814259, "loss": 2.5378, "step": 21409 }, { "crossentropy": 2.418346881866455, "epoch": 0.7761745939675174, "grad_norm": 0.026641463860869408, "grad_norm_var": 1.5823361788903547e-06, "learning_rate": 0.0012328299523918585, "loss": 2.4785, "step": 21410 }, { "crossentropy": 2.5734128952026367, "epoch": 0.7762108468677494, "grad_norm": 0.028380144387483597, "grad_norm_var": 1.667549432398235e-06, "learning_rate": 0.0012324478989836408, "loss": 2.5744, "step": 21411 }, { "crossentropy": 2.524742364883423, "epoch": 0.7762470997679815, "grad_norm": 0.02600625343620777, "grad_norm_var": 1.6509159508676722e-06, "learning_rate": 0.0012320658964619315, "loss": 2.5234, "step": 21412 }, { "crossentropy": 2.4251935482025146, "epoch": 0.7762833526682135, "grad_norm": 0.026284102350473404, "grad_norm_var": 1.5661910212561574e-06, "learning_rate": 0.0012316839448318889, "loss": 2.4094, "step": 21413 }, { "crossentropy": 2.4016175270080566, "epoch": 0.7763196055684455, "grad_norm": 0.025975193828344345, "grad_norm_var": 1.6720133775043143e-06, "learning_rate": 0.0012313020440986745, "loss": 2.5418, "step": 21414 }, { "crossentropy": 2.293887138366699, "epoch": 0.7763558584686775, "grad_norm": 0.026658151298761368, "grad_norm_var": 1.5882202184021166e-06, "learning_rate": 0.001230920194267444, "loss": 2.3839, "step": 21415 }, { "crossentropy": 2.2742745876312256, "epoch": 0.7763921113689095, "grad_norm": 0.026246847584843636, "grad_norm_var": 1.6376904082278463e-06, "learning_rate": 0.0012305383953433563, "loss": 2.4203, "step": 21416 }, { "crossentropy": 2.5699734687805176, "epoch": 0.7764283642691415, "grad_norm": 0.027473552152514458, "grad_norm_var": 1.6319848572055387e-06, "learning_rate": 0.001230156647331569, "loss": 2.4889, "step": 21417 }, { "crossentropy": 2.427403688430786, "epoch": 0.7764646171693735, "grad_norm": 0.025979653000831604, "grad_norm_var": 1.7158957648708787e-06, "learning_rate": 0.0012297749502372358, "loss": 2.4332, "step": 21418 }, { "crossentropy": 2.540829658508301, "epoch": 0.7765008700696056, "grad_norm": 0.026997720822691917, "grad_norm_var": 1.693111336121579e-06, "learning_rate": 0.0012293933040655148, "loss": 2.5291, "step": 21419 }, { "crossentropy": 2.48606276512146, "epoch": 0.7765371229698376, "grad_norm": 0.02770650014281273, "grad_norm_var": 1.5303423336931414e-06, "learning_rate": 0.0012290117088215597, "loss": 2.4805, "step": 21420 }, { "crossentropy": 2.501981735229492, "epoch": 0.7765733758700696, "grad_norm": 0.02566877193748951, "grad_norm_var": 1.6451579419556975e-06, "learning_rate": 0.0012286301645105218, "loss": 2.4471, "step": 21421 }, { "crossentropy": 2.565399646759033, "epoch": 0.7766096287703016, "grad_norm": 0.028908973559737206, "grad_norm_var": 1.8433527943543618e-06, "learning_rate": 0.001228248671137559, "loss": 2.5188, "step": 21422 }, { "crossentropy": 2.4258296489715576, "epoch": 0.7766458816705336, "grad_norm": 0.02624739333987236, "grad_norm_var": 9.513819889528721e-07, "learning_rate": 0.0012278672287078202, "loss": 2.4496, "step": 21423 }, { "crossentropy": 2.4446001052856445, "epoch": 0.7766821345707656, "grad_norm": 0.027397308498620987, "grad_norm_var": 9.657820956215344e-07, "learning_rate": 0.0012274858372264585, "loss": 2.4983, "step": 21424 }, { "crossentropy": 2.605907440185547, "epoch": 0.7767183874709976, "grad_norm": 0.026976680383086205, "grad_norm_var": 8.531889362053031e-07, "learning_rate": 0.0012271044966986279, "loss": 2.6138, "step": 21425 }, { "crossentropy": 2.377535104751587, "epoch": 0.7767546403712297, "grad_norm": 0.033361971378326416, "grad_norm_var": 3.4920251990548282e-06, "learning_rate": 0.0012267232071294743, "loss": 2.4387, "step": 21426 }, { "crossentropy": 2.5351767539978027, "epoch": 0.7767908932714617, "grad_norm": 0.02569900080561638, "grad_norm_var": 3.543312901126587e-06, "learning_rate": 0.0012263419685241517, "loss": 2.4574, "step": 21427 }, { "crossentropy": 2.5196518898010254, "epoch": 0.7768271461716937, "grad_norm": 0.02724122814834118, "grad_norm_var": 3.4586583773662013e-06, "learning_rate": 0.0012259607808878077, "loss": 2.598, "step": 21428 }, { "crossentropy": 2.5371387004852295, "epoch": 0.7768633990719258, "grad_norm": 0.02730928175151348, "grad_norm_var": 3.4023711918201397e-06, "learning_rate": 0.001225579644225589, "loss": 2.4201, "step": 21429 }, { "crossentropy": 2.648338556289673, "epoch": 0.7768996519721578, "grad_norm": 0.02763296291232109, "grad_norm_var": 3.294452368877293e-06, "learning_rate": 0.0012251985585426466, "loss": 2.6067, "step": 21430 }, { "crossentropy": 2.4225873947143555, "epoch": 0.7769359048723898, "grad_norm": 0.027075855061411858, "grad_norm_var": 3.267152647761168e-06, "learning_rate": 0.0012248175238441246, "loss": 2.3661, "step": 21431 }, { "crossentropy": 2.481283187866211, "epoch": 0.7769721577726219, "grad_norm": 0.026843544095754623, "grad_norm_var": 3.2000296789966644e-06, "learning_rate": 0.001224436540135171, "loss": 2.4995, "step": 21432 }, { "crossentropy": 2.500577211380005, "epoch": 0.7770084106728539, "grad_norm": 0.033639710396528244, "grad_norm_var": 5.630658516477547e-06, "learning_rate": 0.001224055607420933, "loss": 2.5923, "step": 21433 }, { "crossentropy": 2.3290514945983887, "epoch": 0.7770446635730859, "grad_norm": 0.02690260298550129, "grad_norm_var": 5.460758934961764e-06, "learning_rate": 0.001223674725706553, "loss": 2.4312, "step": 21434 }, { "crossentropy": 2.483398914337158, "epoch": 0.7770809164733179, "grad_norm": 0.026772189885377884, "grad_norm_var": 5.489584523475513e-06, "learning_rate": 0.0012232938949971778, "loss": 2.4629, "step": 21435 }, { "crossentropy": 2.566349506378174, "epoch": 0.7771171693735499, "grad_norm": 0.02593993954360485, "grad_norm_var": 5.715250531596455e-06, "learning_rate": 0.00122291311529795, "loss": 2.5189, "step": 21436 }, { "crossentropy": 2.4189794063568115, "epoch": 0.7771534222737819, "grad_norm": 0.0275341235101223, "grad_norm_var": 5.421039091291314e-06, "learning_rate": 0.0012225323866140116, "loss": 2.4798, "step": 21437 }, { "crossentropy": 2.4867844581604004, "epoch": 0.7771896751740139, "grad_norm": 0.02768547832965851, "grad_norm_var": 5.340649373144583e-06, "learning_rate": 0.0012221517089505063, "loss": 2.4785, "step": 21438 }, { "crossentropy": 2.556929349899292, "epoch": 0.777225928074246, "grad_norm": 0.03152811899781227, "grad_norm_var": 6.014138374792893e-06, "learning_rate": 0.0012217710823125745, "loss": 2.5025, "step": 21439 }, { "crossentropy": 2.5092780590057373, "epoch": 0.777262180974478, "grad_norm": 0.026368729770183563, "grad_norm_var": 6.176117248668916e-06, "learning_rate": 0.0012213905067053577, "loss": 2.4567, "step": 21440 }, { "crossentropy": 2.4749510288238525, "epoch": 0.77729843387471, "grad_norm": 0.02692856825888157, "grad_norm_var": 6.18303151130271e-06, "learning_rate": 0.0012210099821339975, "loss": 2.5194, "step": 21441 }, { "crossentropy": 2.499598264694214, "epoch": 0.777334686774942, "grad_norm": 0.026405099779367447, "grad_norm_var": 4.261097221044894e-06, "learning_rate": 0.0012206295086036312, "loss": 2.5034, "step": 21442 }, { "crossentropy": 2.540268898010254, "epoch": 0.777370939675174, "grad_norm": 0.025796864181756973, "grad_norm_var": 4.236967010858422e-06, "learning_rate": 0.0012202490861193999, "loss": 2.5581, "step": 21443 }, { "crossentropy": 2.4324848651885986, "epoch": 0.777407192575406, "grad_norm": 0.025779524818062782, "grad_norm_var": 4.440477799919305e-06, "learning_rate": 0.0012198687146864411, "loss": 2.4515, "step": 21444 }, { "crossentropy": 2.6683645248413086, "epoch": 0.777443445475638, "grad_norm": 0.02690599486231804, "grad_norm_var": 4.4613772621820605e-06, "learning_rate": 0.00121948839430989, "loss": 2.5856, "step": 21445 }, { "crossentropy": 2.4592645168304443, "epoch": 0.77747969837587, "grad_norm": 0.02648383006453514, "grad_norm_var": 4.52104021234644e-06, "learning_rate": 0.001219108124994887, "loss": 2.4574, "step": 21446 }, { "crossentropy": 2.602015256881714, "epoch": 0.7775159512761021, "grad_norm": 0.027283161878585815, "grad_norm_var": 4.5144380215649906e-06, "learning_rate": 0.0012187279067465652, "loss": 2.5593, "step": 21447 }, { "crossentropy": 2.3776628971099854, "epoch": 0.7775522041763341, "grad_norm": 0.02553318627178669, "grad_norm_var": 4.7233140763466245e-06, "learning_rate": 0.001218347739570061, "loss": 2.3939, "step": 21448 }, { "crossentropy": 2.4454352855682373, "epoch": 0.7775884570765661, "grad_norm": 0.026583151891827583, "grad_norm_var": 1.9110371166784653e-06, "learning_rate": 0.0012179676234705113, "loss": 2.4716, "step": 21449 }, { "crossentropy": 2.607461452484131, "epoch": 0.7776247099767981, "grad_norm": 0.027059132233262062, "grad_norm_var": 1.9125829109557676e-06, "learning_rate": 0.001217587558453046, "loss": 2.4942, "step": 21450 }, { "crossentropy": 2.484922409057617, "epoch": 0.7776609628770301, "grad_norm": 0.026472987607121468, "grad_norm_var": 1.923743338443186e-06, "learning_rate": 0.001217207544522803, "loss": 2.4511, "step": 21451 }, { "crossentropy": 2.3045120239257812, "epoch": 0.7776972157772621, "grad_norm": 0.026106487959623337, "grad_norm_var": 1.9043130421838676e-06, "learning_rate": 0.0012168275816849111, "loss": 2.3823, "step": 21452 }, { "crossentropy": 2.406022787094116, "epoch": 0.7777334686774942, "grad_norm": 0.02585868537425995, "grad_norm_var": 1.9388584860441717e-06, "learning_rate": 0.0012164476699445049, "loss": 2.4962, "step": 21453 }, { "crossentropy": 2.411862373352051, "epoch": 0.7777697215777262, "grad_norm": 0.026674484834074974, "grad_norm_var": 1.8832016109882835e-06, "learning_rate": 0.001216067809306715, "loss": 2.4514, "step": 21454 }, { "crossentropy": 2.4663422107696533, "epoch": 0.7778059744779582, "grad_norm": 0.02891852706670761, "grad_norm_var": 6.412542811890977e-07, "learning_rate": 0.00121568799977667, "loss": 2.4517, "step": 21455 }, { "crossentropy": 2.5584378242492676, "epoch": 0.7778422273781903, "grad_norm": 0.026485886424779892, "grad_norm_var": 6.389306101411597e-07, "learning_rate": 0.0012153082413595018, "loss": 2.5159, "step": 21456 }, { "crossentropy": 2.3666329383850098, "epoch": 0.7778784802784223, "grad_norm": 0.028413614258170128, "grad_norm_var": 8.458391440504247e-07, "learning_rate": 0.0012149285340603405, "loss": 2.4588, "step": 21457 }, { "crossentropy": 2.5545663833618164, "epoch": 0.7779147331786543, "grad_norm": 0.028123654425144196, "grad_norm_var": 9.691473991369488e-07, "learning_rate": 0.0012145488778843123, "loss": 2.556, "step": 21458 }, { "crossentropy": 2.5935750007629395, "epoch": 0.7779509860788864, "grad_norm": 0.027714114636182785, "grad_norm_var": 9.47578829439211e-07, "learning_rate": 0.0012141692728365472, "loss": 2.6067, "step": 21459 }, { "crossentropy": 2.530238389968872, "epoch": 0.7779872389791184, "grad_norm": 0.027262909337878227, "grad_norm_var": 8.635371402733547e-07, "learning_rate": 0.0012137897189221697, "loss": 2.483, "step": 21460 }, { "crossentropy": 2.546621561050415, "epoch": 0.7780234918793504, "grad_norm": 0.026616161689162254, "grad_norm_var": 8.721298246666675e-07, "learning_rate": 0.0012134102161463095, "loss": 2.5601, "step": 21461 }, { "crossentropy": 2.430100440979004, "epoch": 0.7780597447795824, "grad_norm": 0.026491578668355942, "grad_norm_var": 8.716267736584109e-07, "learning_rate": 0.0012130307645140904, "loss": 2.4344, "step": 21462 }, { "crossentropy": 2.444913625717163, "epoch": 0.7780959976798144, "grad_norm": 0.026366932317614555, "grad_norm_var": 8.864304184181958e-07, "learning_rate": 0.001212651364030637, "loss": 2.5602, "step": 21463 }, { "crossentropy": 2.560915470123291, "epoch": 0.7781322505800464, "grad_norm": 0.026013584807515144, "grad_norm_var": 8.121787171369432e-07, "learning_rate": 0.0012122720147010736, "loss": 2.527, "step": 21464 }, { "crossentropy": 2.4276833534240723, "epoch": 0.7781685034802784, "grad_norm": 0.03032071702182293, "grad_norm_var": 1.5036368380589299e-06, "learning_rate": 0.0012118927165305265, "loss": 2.4171, "step": 21465 }, { "crossentropy": 2.3468031883239746, "epoch": 0.7782047563805105, "grad_norm": 0.028416918590664864, "grad_norm_var": 1.5967589719059153e-06, "learning_rate": 0.001211513469524116, "loss": 2.4145, "step": 21466 }, { "crossentropy": 2.6122324466705322, "epoch": 0.7782410092807425, "grad_norm": 0.029246360063552856, "grad_norm_var": 1.7842123745374454e-06, "learning_rate": 0.0012111342736869663, "loss": 2.483, "step": 21467 }, { "crossentropy": 2.5272579193115234, "epoch": 0.7782772621809745, "grad_norm": 0.026333600282669067, "grad_norm_var": 1.7470729467435864e-06, "learning_rate": 0.0012107551290241965, "loss": 2.5123, "step": 21468 }, { "crossentropy": 2.46077561378479, "epoch": 0.7783135150812065, "grad_norm": 0.02722669206559658, "grad_norm_var": 1.5731227605711065e-06, "learning_rate": 0.0012103760355409287, "loss": 2.4697, "step": 21469 }, { "crossentropy": 2.6691701412200928, "epoch": 0.7783497679814385, "grad_norm": 0.03137526661157608, "grad_norm_var": 2.4122861174373397e-06, "learning_rate": 0.001209996993242287, "loss": 2.5376, "step": 21470 }, { "crossentropy": 2.5503122806549072, "epoch": 0.7783860208816705, "grad_norm": 0.026809055358171463, "grad_norm_var": 2.38505851592255e-06, "learning_rate": 0.0012096180021333846, "loss": 2.5382, "step": 21471 }, { "crossentropy": 2.4722630977630615, "epoch": 0.7784222737819025, "grad_norm": 0.026340549811720848, "grad_norm_var": 2.4099266863442884e-06, "learning_rate": 0.0012092390622193427, "loss": 2.4307, "step": 21472 }, { "crossentropy": 2.518448829650879, "epoch": 0.7784585266821346, "grad_norm": 0.029572565108537674, "grad_norm_var": 2.6053861575161172e-06, "learning_rate": 0.0012088601735052817, "loss": 2.3725, "step": 21473 }, { "crossentropy": 2.4029712677001953, "epoch": 0.7784947795823666, "grad_norm": 0.026284636929631233, "grad_norm_var": 2.7286742154647987e-06, "learning_rate": 0.001208481335996316, "loss": 2.4912, "step": 21474 }, { "crossentropy": 2.4160470962524414, "epoch": 0.7785310324825986, "grad_norm": 0.027603626251220703, "grad_norm_var": 2.7284849783331623e-06, "learning_rate": 0.001208102549697565, "loss": 2.5347, "step": 21475 }, { "crossentropy": 2.629432439804077, "epoch": 0.7785672853828306, "grad_norm": 0.02634141407907009, "grad_norm_var": 2.8282047409690456e-06, "learning_rate": 0.0012077238146141424, "loss": 2.566, "step": 21476 }, { "crossentropy": 2.5561683177948, "epoch": 0.7786035382830626, "grad_norm": 0.026731347665190697, "grad_norm_var": 2.8141547608660864e-06, "learning_rate": 0.001207345130751164, "loss": 2.4926, "step": 21477 }, { "crossentropy": 2.624276638031006, "epoch": 0.7786397911832946, "grad_norm": 0.025851290673017502, "grad_norm_var": 2.93373786501073e-06, "learning_rate": 0.0012069664981137485, "loss": 2.5105, "step": 21478 }, { "crossentropy": 2.4705727100372314, "epoch": 0.7786760440835266, "grad_norm": 0.02671339362859726, "grad_norm_var": 2.886488681945002e-06, "learning_rate": 0.0012065879167070038, "loss": 2.5137, "step": 21479 }, { "crossentropy": 2.5956783294677734, "epoch": 0.7787122969837587, "grad_norm": 0.025620777159929276, "grad_norm_var": 2.9778482754157085e-06, "learning_rate": 0.0012062093865360458, "loss": 2.5806, "step": 21480 }, { "crossentropy": 2.6002309322357178, "epoch": 0.7787485498839907, "grad_norm": 0.026647673919796944, "grad_norm_var": 2.4637618846293318e-06, "learning_rate": 0.0012058309076059886, "loss": 2.5565, "step": 21481 }, { "crossentropy": 2.4584171772003174, "epoch": 0.7787848027842227, "grad_norm": 0.02617802657186985, "grad_norm_var": 2.4495106072238188e-06, "learning_rate": 0.0012054524799219412, "loss": 2.4764, "step": 21482 }, { "crossentropy": 2.436187982559204, "epoch": 0.7788210556844548, "grad_norm": 0.025802459567785263, "grad_norm_var": 2.2418367744214307e-06, "learning_rate": 0.0012050741034890183, "loss": 2.3974, "step": 21483 }, { "crossentropy": 2.4002909660339355, "epoch": 0.7788573085846868, "grad_norm": 0.026271551847457886, "grad_norm_var": 2.2472971062275816e-06, "learning_rate": 0.0012046957783123275, "loss": 2.4403, "step": 21484 }, { "crossentropy": 2.4273934364318848, "epoch": 0.7788935614849188, "grad_norm": 0.028505610302090645, "grad_norm_var": 2.394891012473402e-06, "learning_rate": 0.001204317504396979, "loss": 2.4321, "step": 21485 }, { "crossentropy": 2.3624889850616455, "epoch": 0.7789298143851509, "grad_norm": 0.027631329372525215, "grad_norm_var": 1.1071172804734871e-06, "learning_rate": 0.001203939281748086, "loss": 2.4727, "step": 21486 }, { "crossentropy": 2.4987268447875977, "epoch": 0.7789660672853829, "grad_norm": 0.027936028316617012, "grad_norm_var": 1.1868682231600298e-06, "learning_rate": 0.001203561110370751, "loss": 2.4994, "step": 21487 }, { "crossentropy": 2.5136303901672363, "epoch": 0.7790023201856149, "grad_norm": 0.026508692651987076, "grad_norm_var": 1.176608128949872e-06, "learning_rate": 0.0012031829902700848, "loss": 2.453, "step": 21488 }, { "crossentropy": 2.641265869140625, "epoch": 0.7790385730858469, "grad_norm": 0.02678579092025757, "grad_norm_var": 6.643105233570164e-07, "learning_rate": 0.0012028049214511949, "loss": 2.5535, "step": 21489 }, { "crossentropy": 2.524507999420166, "epoch": 0.7790748259860789, "grad_norm": 0.028113441541790962, "grad_norm_var": 7.68804999156982e-07, "learning_rate": 0.001202426903919186, "loss": 2.5743, "step": 21490 }, { "crossentropy": 2.535205364227295, "epoch": 0.7791110788863109, "grad_norm": 0.02641960233449936, "grad_norm_var": 7.339218176074154e-07, "learning_rate": 0.001202048937679166, "loss": 2.5766, "step": 21491 }, { "crossentropy": 2.4613490104675293, "epoch": 0.7791473317865429, "grad_norm": 0.027060678228735924, "grad_norm_var": 7.267212392414457e-07, "learning_rate": 0.0012016710227362377, "loss": 2.3896, "step": 21492 }, { "crossentropy": 2.5732619762420654, "epoch": 0.779183584686775, "grad_norm": 0.02580052800476551, "grad_norm_var": 7.89220190737921e-07, "learning_rate": 0.0012012931590955063, "loss": 2.4886, "step": 21493 }, { "crossentropy": 2.3980889320373535, "epoch": 0.779219837587007, "grad_norm": 0.026179760694503784, "grad_norm_var": 7.570227404866496e-07, "learning_rate": 0.0012009153467620764, "loss": 2.4941, "step": 21494 }, { "crossentropy": 2.378666639328003, "epoch": 0.779256090487239, "grad_norm": 0.026402387768030167, "grad_norm_var": 7.65040448142719e-07, "learning_rate": 0.0012005375857410505, "loss": 2.4128, "step": 21495 }, { "crossentropy": 2.5096542835235596, "epoch": 0.779292343387471, "grad_norm": 0.025890696793794632, "grad_norm_var": 7.29259210056378e-07, "learning_rate": 0.0012001598760375287, "loss": 2.4569, "step": 21496 }, { "crossentropy": 2.5680723190307617, "epoch": 0.779328596287703, "grad_norm": 0.025660773739218712, "grad_norm_var": 8.047013754786114e-07, "learning_rate": 0.001199782217656616, "loss": 2.486, "step": 21497 }, { "crossentropy": 2.411069393157959, "epoch": 0.779364849187935, "grad_norm": 0.026657460257411003, "grad_norm_var": 7.859108476818491e-07, "learning_rate": 0.0011994046106034101, "loss": 2.44, "step": 21498 }, { "crossentropy": 2.505882978439331, "epoch": 0.779401102088167, "grad_norm": 0.026903554797172546, "grad_norm_var": 7.260000265211138e-07, "learning_rate": 0.0011990270548830134, "loss": 2.5259, "step": 21499 }, { "crossentropy": 2.403345823287964, "epoch": 0.7794373549883991, "grad_norm": 0.029283273965120316, "grad_norm_var": 1.0825090650890337e-06, "learning_rate": 0.0011986495505005234, "loss": 2.4017, "step": 21500 }, { "crossentropy": 2.527562379837036, "epoch": 0.7794736078886311, "grad_norm": 0.030148008838295937, "grad_norm_var": 1.5843732842921524e-06, "learning_rate": 0.0011982720974610401, "loss": 2.4661, "step": 21501 }, { "crossentropy": 2.300471782684326, "epoch": 0.7795098607888631, "grad_norm": 0.026883164420723915, "grad_norm_var": 1.5649956586843405e-06, "learning_rate": 0.001197894695769663, "loss": 2.4791, "step": 21502 }, { "crossentropy": 2.4765069484710693, "epoch": 0.7795461136890951, "grad_norm": 0.027966653928160667, "grad_norm_var": 1.5687147058231674e-06, "learning_rate": 0.0011975173454314874, "loss": 2.433, "step": 21503 }, { "crossentropy": 2.4503626823425293, "epoch": 0.7795823665893271, "grad_norm": 0.027144242078065872, "grad_norm_var": 1.548807360659173e-06, "learning_rate": 0.0011971400464516097, "loss": 2.3931, "step": 21504 }, { "crossentropy": 2.4962685108184814, "epoch": 0.7796186194895591, "grad_norm": 0.026008576154708862, "grad_norm_var": 1.6171794249915847e-06, "learning_rate": 0.0011967627988351281, "loss": 2.5954, "step": 21505 }, { "crossentropy": 2.52465558052063, "epoch": 0.7796548723897911, "grad_norm": 0.026395153254270554, "grad_norm_var": 1.5541025086515757e-06, "learning_rate": 0.0011963856025871344, "loss": 2.4324, "step": 21506 }, { "crossentropy": 2.4807627201080322, "epoch": 0.7796911252900232, "grad_norm": 0.02640284039080143, "grad_norm_var": 1.5552502258231806e-06, "learning_rate": 0.0011960084577127279, "loss": 2.5356, "step": 21507 }, { "crossentropy": 2.4830219745635986, "epoch": 0.7797273781902552, "grad_norm": 0.02777327597141266, "grad_norm_var": 1.5999513718640736e-06, "learning_rate": 0.0011956313642169974, "loss": 2.4766, "step": 21508 }, { "crossentropy": 2.5656938552856445, "epoch": 0.7797636310904872, "grad_norm": 0.026425877586007118, "grad_norm_var": 1.5269846388591639e-06, "learning_rate": 0.0011952543221050395, "loss": 2.5836, "step": 21509 }, { "crossentropy": 2.458789110183716, "epoch": 0.7797998839907193, "grad_norm": 0.026170924305915833, "grad_norm_var": 1.5279651688985026e-06, "learning_rate": 0.0011948773313819466, "loss": 2.4752, "step": 21510 }, { "crossentropy": 2.439241409301758, "epoch": 0.7798361368909513, "grad_norm": 0.02578786574304104, "grad_norm_var": 1.6011320823331824e-06, "learning_rate": 0.0011945003920528087, "loss": 2.5149, "step": 21511 }, { "crossentropy": 2.5876638889312744, "epoch": 0.7798723897911833, "grad_norm": 0.026680786162614822, "grad_norm_var": 1.5265639529841538e-06, "learning_rate": 0.0011941235041227193, "loss": 2.5833, "step": 21512 }, { "crossentropy": 2.5234553813934326, "epoch": 0.7799086426914154, "grad_norm": 0.027314919978380203, "grad_norm_var": 1.3981752564157554e-06, "learning_rate": 0.0011937466675967679, "loss": 2.4561, "step": 21513 }, { "crossentropy": 2.4738359451293945, "epoch": 0.7799448955916474, "grad_norm": 0.027036385610699654, "grad_norm_var": 1.3836962863731598e-06, "learning_rate": 0.0011933698824800427, "loss": 2.4502, "step": 21514 }, { "crossentropy": 2.455235242843628, "epoch": 0.7799811484918794, "grad_norm": 0.027982674539089203, "grad_norm_var": 1.4216882403320172e-06, "learning_rate": 0.0011929931487776352, "loss": 2.4319, "step": 21515 }, { "crossentropy": 2.5520334243774414, "epoch": 0.7800174013921114, "grad_norm": 0.02662177011370659, "grad_norm_var": 1.1296662272887766e-06, "learning_rate": 0.001192616466494631, "loss": 2.5188, "step": 21516 }, { "crossentropy": 2.4322192668914795, "epoch": 0.7800536542923434, "grad_norm": 0.027165258303284645, "grad_norm_var": 4.5222409171100504e-07, "learning_rate": 0.0011922398356361192, "loss": 2.4672, "step": 21517 }, { "crossentropy": 2.6013402938842773, "epoch": 0.7800899071925754, "grad_norm": 0.028564775362610817, "grad_norm_var": 6.341511929635098e-07, "learning_rate": 0.0011918632562071885, "loss": 2.578, "step": 21518 }, { "crossentropy": 2.3493924140930176, "epoch": 0.7801261600928074, "grad_norm": 0.02550332248210907, "grad_norm_var": 6.844545247273898e-07, "learning_rate": 0.001191486728212922, "loss": 2.3198, "step": 21519 }, { "crossentropy": 2.545487880706787, "epoch": 0.7801624129930395, "grad_norm": 0.028685390949249268, "grad_norm_var": 9.013435100931981e-07, "learning_rate": 0.0011911102516584087, "loss": 2.5359, "step": 21520 }, { "crossentropy": 2.2896132469177246, "epoch": 0.7801986658932715, "grad_norm": 0.026187585666775703, "grad_norm_var": 8.818911325357184e-07, "learning_rate": 0.0011907338265487316, "loss": 2.3787, "step": 21521 }, { "crossentropy": 2.2720465660095215, "epoch": 0.7802349187935035, "grad_norm": 0.027094803750514984, "grad_norm_var": 8.636478879346278e-07, "learning_rate": 0.0011903574528889732, "loss": 2.4522, "step": 21522 }, { "crossentropy": 2.4703006744384766, "epoch": 0.7802711716937355, "grad_norm": 0.026171231642365456, "grad_norm_var": 8.842805144939804e-07, "learning_rate": 0.0011899811306842207, "loss": 2.5438, "step": 21523 }, { "crossentropy": 2.463650703430176, "epoch": 0.7803074245939675, "grad_norm": 0.02519548125565052, "grad_norm_var": 1.0159175993032972e-06, "learning_rate": 0.001189604859939553, "loss": 2.383, "step": 21524 }, { "crossentropy": 2.361469268798828, "epoch": 0.7803436774941995, "grad_norm": 0.02537698857486248, "grad_norm_var": 1.135155998373214e-06, "learning_rate": 0.0011892286406600544, "loss": 2.4318, "step": 21525 }, { "crossentropy": 2.489250898361206, "epoch": 0.7803799303944315, "grad_norm": 0.026293078437447548, "grad_norm_var": 1.127125158675033e-06, "learning_rate": 0.0011888524728508071, "loss": 2.533, "step": 21526 }, { "crossentropy": 2.542440891265869, "epoch": 0.7804161832946636, "grad_norm": 0.02636539936065674, "grad_norm_var": 1.0755082622058523e-06, "learning_rate": 0.00118847635651689, "loss": 2.5253, "step": 21527 }, { "crossentropy": 2.3541436195373535, "epoch": 0.7804524361948956, "grad_norm": 0.02568449079990387, "grad_norm_var": 1.1487317272175385e-06, "learning_rate": 0.001188100291663385, "loss": 2.3917, "step": 21528 }, { "crossentropy": 2.5760276317596436, "epoch": 0.7804886890951276, "grad_norm": 0.026899082586169243, "grad_norm_var": 1.125595981880352e-06, "learning_rate": 0.00118772427829537, "loss": 2.515, "step": 21529 }, { "crossentropy": 2.550370693206787, "epoch": 0.7805249419953596, "grad_norm": 0.02632344514131546, "grad_norm_var": 1.1231755679051598e-06, "learning_rate": 0.0011873483164179226, "loss": 2.4759, "step": 21530 }, { "crossentropy": 2.6402676105499268, "epoch": 0.7805611948955916, "grad_norm": 0.025615734979510307, "grad_norm_var": 1.0471185616980297e-06, "learning_rate": 0.0011869724060361237, "loss": 2.571, "step": 21531 }, { "crossentropy": 2.391213893890381, "epoch": 0.7805974477958236, "grad_norm": 0.025369679555296898, "grad_norm_var": 1.1221417074893636e-06, "learning_rate": 0.0011865965471550477, "loss": 2.3884, "step": 21532 }, { "crossentropy": 2.4232778549194336, "epoch": 0.7806337006960556, "grad_norm": 0.02733229659497738, "grad_norm_var": 1.1407959470117098e-06, "learning_rate": 0.001186220739779772, "loss": 2.4719, "step": 21533 }, { "crossentropy": 2.4554672241210938, "epoch": 0.7806699535962877, "grad_norm": 0.02717697061598301, "grad_norm_var": 8.636388134477627e-07, "learning_rate": 0.0011858449839153744, "loss": 2.485, "step": 21534 }, { "crossentropy": 2.586337089538574, "epoch": 0.7807062064965197, "grad_norm": 0.02648552507162094, "grad_norm_var": 8.157130299303131e-07, "learning_rate": 0.0011854692795669275, "loss": 2.5427, "step": 21535 }, { "crossentropy": 2.5147480964660645, "epoch": 0.7807424593967517, "grad_norm": 0.0262618325650692, "grad_norm_var": 4.4142710858881505e-07, "learning_rate": 0.0011850936267395074, "loss": 2.4721, "step": 21536 }, { "crossentropy": 2.4479880332946777, "epoch": 0.7807787122969838, "grad_norm": 0.02640410326421261, "grad_norm_var": 4.4285545001086314e-07, "learning_rate": 0.0011847180254381878, "loss": 2.4725, "step": 21537 }, { "crossentropy": 2.4096412658691406, "epoch": 0.7808149651972158, "grad_norm": 0.02630637213587761, "grad_norm_var": 3.9322710492870753e-07, "learning_rate": 0.0011843424756680394, "loss": 2.4336, "step": 21538 }, { "crossentropy": 2.5940616130828857, "epoch": 0.7808512180974478, "grad_norm": 0.02629544772207737, "grad_norm_var": 3.9365111061112393e-07, "learning_rate": 0.0011839669774341378, "loss": 2.4957, "step": 21539 }, { "crossentropy": 2.411010503768921, "epoch": 0.7808874709976799, "grad_norm": 0.026120489463210106, "grad_norm_var": 3.2180366488244553e-07, "learning_rate": 0.0011835915307415517, "loss": 2.4553, "step": 21540 }, { "crossentropy": 2.431720733642578, "epoch": 0.7809237238979119, "grad_norm": 0.02773008495569229, "grad_norm_var": 3.8786886915931755e-07, "learning_rate": 0.0011832161355953541, "loss": 2.4472, "step": 21541 }, { "crossentropy": 2.654947519302368, "epoch": 0.7809599767981439, "grad_norm": 0.027584290131926537, "grad_norm_var": 4.708217902382594e-07, "learning_rate": 0.0011828407920006156, "loss": 2.6285, "step": 21542 }, { "crossentropy": 2.430276393890381, "epoch": 0.7809962296983759, "grad_norm": 0.0267447829246521, "grad_norm_var": 4.7315032501529274e-07, "learning_rate": 0.001182465499962404, "loss": 2.4464, "step": 21543 }, { "crossentropy": 2.559475898742676, "epoch": 0.7810324825986079, "grad_norm": 0.026540879160165787, "grad_norm_var": 4.234807777797004e-07, "learning_rate": 0.0011820902594857903, "loss": 2.4613, "step": 21544 }, { "crossentropy": 2.371016263961792, "epoch": 0.7810687354988399, "grad_norm": 0.025545775890350342, "grad_norm_var": 4.793666571323643e-07, "learning_rate": 0.0011817150705758412, "loss": 2.4734, "step": 21545 }, { "crossentropy": 2.437729835510254, "epoch": 0.7811049883990719, "grad_norm": 0.026331128552556038, "grad_norm_var": 4.791998654921862e-07, "learning_rate": 0.0011813399332376235, "loss": 2.4949, "step": 21546 }, { "crossentropy": 2.48388671875, "epoch": 0.781141241299304, "grad_norm": 0.027179110795259476, "grad_norm_var": 4.49647954897138e-07, "learning_rate": 0.0011809648474762065, "loss": 2.6448, "step": 21547 }, { "crossentropy": 2.5889768600463867, "epoch": 0.781177494199536, "grad_norm": 0.02661842294037342, "grad_norm_var": 3.4425067149586783e-07, "learning_rate": 0.001180589813296653, "loss": 2.5802, "step": 21548 }, { "crossentropy": 2.5518476963043213, "epoch": 0.781213747099768, "grad_norm": 0.026571519672870636, "grad_norm_var": 3.1284703121621456e-07, "learning_rate": 0.0011802148307040305, "loss": 2.6254, "step": 21549 }, { "crossentropy": 2.399198293685913, "epoch": 0.78125, "grad_norm": 0.027895605191588402, "grad_norm_var": 3.9863135862700366e-07, "learning_rate": 0.001179839899703405, "loss": 2.4763, "step": 21550 }, { "crossentropy": 2.2801549434661865, "epoch": 0.781286252900232, "grad_norm": 0.027519389986991882, "grad_norm_var": 4.4090796953176875e-07, "learning_rate": 0.0011794650202998375, "loss": 2.3759, "step": 21551 }, { "crossentropy": 2.5220229625701904, "epoch": 0.781322505800464, "grad_norm": 0.02724369987845421, "grad_norm_var": 4.4012321648866497e-07, "learning_rate": 0.0011790901924983944, "loss": 2.5152, "step": 21552 }, { "crossentropy": 2.5373082160949707, "epoch": 0.781358758700696, "grad_norm": 0.02577090822160244, "grad_norm_var": 4.977144862239873e-07, "learning_rate": 0.0011787154163041347, "loss": 2.5182, "step": 21553 }, { "crossentropy": 2.5153560638427734, "epoch": 0.7813950116009281, "grad_norm": 0.027403539046645164, "grad_norm_var": 5.080717270603922e-07, "learning_rate": 0.001178340691722124, "loss": 2.5069, "step": 21554 }, { "crossentropy": 2.468855381011963, "epoch": 0.7814312645011601, "grad_norm": 0.02709217742085457, "grad_norm_var": 4.92187338738921e-07, "learning_rate": 0.001177966018757422, "loss": 2.5056, "step": 21555 }, { "crossentropy": 2.5424492359161377, "epoch": 0.7814675174013921, "grad_norm": 0.026586290448904037, "grad_norm_var": 4.593077458541391e-07, "learning_rate": 0.0011775913974150876, "loss": 2.4448, "step": 21556 }, { "crossentropy": 2.5972256660461426, "epoch": 0.7815037703016241, "grad_norm": 0.027344772592186928, "grad_norm_var": 4.25805120510024e-07, "learning_rate": 0.0011772168277001816, "loss": 2.5381, "step": 21557 }, { "crossentropy": 2.4817731380462646, "epoch": 0.7815400232018561, "grad_norm": 0.02568759396672249, "grad_norm_var": 4.7083382523025833e-07, "learning_rate": 0.0011768423096177654, "loss": 2.4503, "step": 21558 }, { "crossentropy": 2.4896819591522217, "epoch": 0.7815762761020881, "grad_norm": 0.02655702456831932, "grad_norm_var": 4.732860394382823e-07, "learning_rate": 0.0011764678431728942, "loss": 2.525, "step": 21559 }, { "crossentropy": 2.5055317878723145, "epoch": 0.7816125290023201, "grad_norm": 0.02570025995373726, "grad_norm_var": 5.401041689107391e-07, "learning_rate": 0.0011760934283706287, "loss": 2.4193, "step": 21560 }, { "crossentropy": 2.370582342147827, "epoch": 0.7816487819025522, "grad_norm": 0.026564525440335274, "grad_norm_var": 4.4948484066533766e-07, "learning_rate": 0.0011757190652160233, "loss": 2.4527, "step": 21561 }, { "crossentropy": 2.469952344894409, "epoch": 0.7816850348027842, "grad_norm": 0.026683736592531204, "grad_norm_var": 4.373687831363271e-07, "learning_rate": 0.0011753447537141365, "loss": 2.4727, "step": 21562 }, { "crossentropy": 2.4727554321289062, "epoch": 0.7817212877030162, "grad_norm": 0.027069861069321632, "grad_norm_var": 4.3224513207652286e-07, "learning_rate": 0.0011749704938700213, "loss": 2.4752, "step": 21563 }, { "crossentropy": 2.3509867191314697, "epoch": 0.7817575406032483, "grad_norm": 0.02629736438393593, "grad_norm_var": 4.4514767045556826e-07, "learning_rate": 0.001174596285688736, "loss": 2.4496, "step": 21564 }, { "crossentropy": 2.531365394592285, "epoch": 0.7817937935034803, "grad_norm": 0.02625413052737713, "grad_norm_var": 4.589656625174672e-07, "learning_rate": 0.0011742221291753318, "loss": 2.4464, "step": 21565 }, { "crossentropy": 2.553778886795044, "epoch": 0.7818300464037123, "grad_norm": 0.027076883241534233, "grad_norm_var": 3.735566627105126e-07, "learning_rate": 0.0011738480243348648, "loss": 2.5978, "step": 21566 }, { "crossentropy": 2.565422296524048, "epoch": 0.7818662993039444, "grad_norm": 0.027355575934052467, "grad_norm_var": 3.568619943624303e-07, "learning_rate": 0.001173473971172385, "loss": 2.6047, "step": 21567 }, { "crossentropy": 2.470914363861084, "epoch": 0.7819025522041764, "grad_norm": 0.026509687304496765, "grad_norm_var": 3.341947045565688e-07, "learning_rate": 0.0011730999696929485, "loss": 2.4996, "step": 21568 }, { "crossentropy": 2.461653470993042, "epoch": 0.7819388051044084, "grad_norm": 0.02615349180996418, "grad_norm_var": 2.999202452030891e-07, "learning_rate": 0.001172726019901602, "loss": 2.3871, "step": 21569 }, { "crossentropy": 2.5875916481018066, "epoch": 0.7819750580046404, "grad_norm": 0.026974206790328026, "grad_norm_var": 2.68079109919106e-07, "learning_rate": 0.0011723521218034005, "loss": 2.5919, "step": 21570 }, { "crossentropy": 2.399639844894409, "epoch": 0.7820113109048724, "grad_norm": 0.026835037395358086, "grad_norm_var": 2.559963004805741e-07, "learning_rate": 0.0011719782754033908, "loss": 2.4659, "step": 21571 }, { "crossentropy": 2.3808224201202393, "epoch": 0.7820475638051044, "grad_norm": 0.027278197929263115, "grad_norm_var": 2.8436168918634606e-07, "learning_rate": 0.001171604480706625, "loss": 2.4246, "step": 21572 }, { "crossentropy": 2.309288740158081, "epoch": 0.7820838167053364, "grad_norm": 0.0259417537599802, "grad_norm_var": 2.7674597746536975e-07, "learning_rate": 0.0011712307377181497, "loss": 2.4294, "step": 21573 }, { "crossentropy": 2.497718334197998, "epoch": 0.7821200696055685, "grad_norm": 0.02721346914768219, "grad_norm_var": 2.4503621420280765e-07, "learning_rate": 0.0011708570464430146, "loss": 2.4781, "step": 21574 }, { "crossentropy": 2.4330432415008545, "epoch": 0.7821563225058005, "grad_norm": 0.026163168251514435, "grad_norm_var": 2.598279316992057e-07, "learning_rate": 0.0011704834068862646, "loss": 2.4135, "step": 21575 }, { "crossentropy": 2.4135005474090576, "epoch": 0.7821925754060325, "grad_norm": 0.02689763717353344, "grad_norm_var": 2.0108798665789627e-07, "learning_rate": 0.0011701098190529475, "loss": 2.4565, "step": 21576 }, { "crossentropy": 2.5540647506713867, "epoch": 0.7822288283062645, "grad_norm": 0.027876492589712143, "grad_norm_var": 2.842167826433891e-07, "learning_rate": 0.0011697362829481111, "loss": 2.5285, "step": 21577 }, { "crossentropy": 2.5503382682800293, "epoch": 0.7822650812064965, "grad_norm": 0.02638782560825348, "grad_norm_var": 2.937358464377126e-07, "learning_rate": 0.0011693627985767973, "loss": 2.573, "step": 21578 }, { "crossentropy": 2.4574191570281982, "epoch": 0.7823013341067285, "grad_norm": 0.0257426667958498, "grad_norm_var": 3.5037345127158826e-07, "learning_rate": 0.001168989365944053, "loss": 2.3932, "step": 21579 }, { "crossentropy": 2.607527017593384, "epoch": 0.7823375870069605, "grad_norm": 0.026602735742926598, "grad_norm_var": 3.4042477479802265e-07, "learning_rate": 0.0011686159850549222, "loss": 2.5633, "step": 21580 }, { "crossentropy": 2.614414930343628, "epoch": 0.7823738399071926, "grad_norm": 0.026044348254799843, "grad_norm_var": 3.557567796834965e-07, "learning_rate": 0.0011682426559144448, "loss": 2.5357, "step": 21581 }, { "crossentropy": 2.528468608856201, "epoch": 0.7824100928074246, "grad_norm": 0.02677944116294384, "grad_norm_var": 3.459755492544175e-07, "learning_rate": 0.0011678693785276667, "loss": 2.5742, "step": 21582 }, { "crossentropy": 2.5018715858459473, "epoch": 0.7824463457076566, "grad_norm": 0.02626715786755085, "grad_norm_var": 3.208481139431307e-07, "learning_rate": 0.0011674961528996264, "loss": 2.5104, "step": 21583 }, { "crossentropy": 2.4852633476257324, "epoch": 0.7824825986078886, "grad_norm": 0.026458701118826866, "grad_norm_var": 3.216531505939128e-07, "learning_rate": 0.0011671229790353671, "loss": 2.4924, "step": 21584 }, { "crossentropy": 2.4018466472625732, "epoch": 0.7825188515081206, "grad_norm": 0.02789980359375477, "grad_norm_var": 4.080501206924399e-07, "learning_rate": 0.0011667498569399293, "loss": 2.4946, "step": 21585 }, { "crossentropy": 2.393622398376465, "epoch": 0.7825551044083526, "grad_norm": 0.026380283758044243, "grad_norm_var": 4.091872718584947e-07, "learning_rate": 0.0011663767866183512, "loss": 2.3958, "step": 21586 }, { "crossentropy": 2.5299313068389893, "epoch": 0.7825913573085846, "grad_norm": 0.027263017371296883, "grad_norm_var": 4.2987913359034354e-07, "learning_rate": 0.0011660037680756735, "loss": 2.5192, "step": 21587 }, { "crossentropy": 2.6197526454925537, "epoch": 0.7826276102088167, "grad_norm": 0.02653488889336586, "grad_norm_var": 4.0708649750203995e-07, "learning_rate": 0.0011656308013169331, "loss": 2.5597, "step": 21588 }, { "crossentropy": 2.5219907760620117, "epoch": 0.7826638631090487, "grad_norm": 0.026580948382616043, "grad_norm_var": 3.7197675528902877e-07, "learning_rate": 0.0011652578863471663, "loss": 2.487, "step": 21589 }, { "crossentropy": 2.329240560531616, "epoch": 0.7827001160092807, "grad_norm": 0.02682216838002205, "grad_norm_var": 3.5440681313211876e-07, "learning_rate": 0.0011648850231714125, "loss": 2.4282, "step": 21590 }, { "crossentropy": 2.4602582454681396, "epoch": 0.7827363689095128, "grad_norm": 0.02697664126753807, "grad_norm_var": 3.4091980133047623e-07, "learning_rate": 0.001164512211794705, "loss": 2.5206, "step": 21591 }, { "crossentropy": 2.4609880447387695, "epoch": 0.7827726218097448, "grad_norm": 0.02566366083920002, "grad_norm_var": 4.068078205711016e-07, "learning_rate": 0.0011641394522220805, "loss": 2.4594, "step": 21592 }, { "crossentropy": 2.518662691116333, "epoch": 0.7828088747099768, "grad_norm": 0.027276549488306046, "grad_norm_var": 3.3059741562392145e-07, "learning_rate": 0.0011637667444585754, "loss": 2.5231, "step": 21593 }, { "crossentropy": 2.5910162925720215, "epoch": 0.7828451276102089, "grad_norm": 0.027077673003077507, "grad_norm_var": 3.4036005931308414e-07, "learning_rate": 0.0011633940885092209, "loss": 2.4974, "step": 21594 }, { "crossentropy": 2.3557841777801514, "epoch": 0.7828813805104409, "grad_norm": 0.026733633130788803, "grad_norm_var": 2.8209314085897443e-07, "learning_rate": 0.0011630214843790527, "loss": 2.4316, "step": 21595 }, { "crossentropy": 2.562291145324707, "epoch": 0.7829176334106729, "grad_norm": 0.027005666866898537, "grad_norm_var": 2.8647200512871054e-07, "learning_rate": 0.0011626489320731021, "loss": 2.4977, "step": 21596 }, { "crossentropy": 2.5619301795959473, "epoch": 0.7829538863109049, "grad_norm": 0.026359502226114273, "grad_norm_var": 2.6364604244070944e-07, "learning_rate": 0.0011622764315963995, "loss": 2.5252, "step": 21597 }, { "crossentropy": 2.4029433727264404, "epoch": 0.7829901392111369, "grad_norm": 0.026078933849930763, "grad_norm_var": 2.920310867306295e-07, "learning_rate": 0.0011619039829539797, "loss": 2.4069, "step": 21598 }, { "crossentropy": 2.441680431365967, "epoch": 0.7830263921113689, "grad_norm": 0.0254535973072052, "grad_norm_var": 3.8156619493722364e-07, "learning_rate": 0.001161531586150869, "loss": 2.5021, "step": 21599 }, { "crossentropy": 2.4199585914611816, "epoch": 0.7830626450116009, "grad_norm": 0.026133595034480095, "grad_norm_var": 3.969132264682503e-07, "learning_rate": 0.0011611592411920996, "loss": 2.5148, "step": 21600 }, { "crossentropy": 2.5322318077087402, "epoch": 0.783098897911833, "grad_norm": 0.026264166459441185, "grad_norm_var": 2.893835025625952e-07, "learning_rate": 0.0011607869480827016, "loss": 2.4823, "step": 21601 }, { "crossentropy": 2.592072010040283, "epoch": 0.783135150812065, "grad_norm": 0.026268232613801956, "grad_norm_var": 2.92521653326771e-07, "learning_rate": 0.001160414706827701, "loss": 2.539, "step": 21602 }, { "crossentropy": 2.4360313415527344, "epoch": 0.783171403712297, "grad_norm": 0.027729786932468414, "grad_norm_var": 3.517087125819625e-07, "learning_rate": 0.0011600425174321278, "loss": 2.4748, "step": 21603 }, { "crossentropy": 2.221484661102295, "epoch": 0.783207656612529, "grad_norm": 0.027541067451238632, "grad_norm_var": 4.1161756328701264e-07, "learning_rate": 0.0011596703799010078, "loss": 2.4067, "step": 21604 }, { "crossentropy": 2.4882802963256836, "epoch": 0.783243909512761, "grad_norm": 0.025795914232730865, "grad_norm_var": 4.5452233060853246e-07, "learning_rate": 0.0011592982942393653, "loss": 2.4621, "step": 21605 }, { "crossentropy": 2.4419102668762207, "epoch": 0.783280162412993, "grad_norm": 0.02707434631884098, "grad_norm_var": 4.6684803075383066e-07, "learning_rate": 0.0011589262604522299, "loss": 2.4744, "step": 21606 }, { "crossentropy": 2.385953187942505, "epoch": 0.783316415313225, "grad_norm": 0.02628202922642231, "grad_norm_var": 4.6115393146986536e-07, "learning_rate": 0.001158554278544623, "loss": 2.4448, "step": 21607 }, { "crossentropy": 2.6981403827667236, "epoch": 0.7833526682134571, "grad_norm": 0.026102181524038315, "grad_norm_var": 4.2157423693667666e-07, "learning_rate": 0.0011581823485215697, "loss": 2.6092, "step": 21608 }, { "crossentropy": 2.4312188625335693, "epoch": 0.7833889211136891, "grad_norm": 0.02584846131503582, "grad_norm_var": 4.1518050948776094e-07, "learning_rate": 0.0011578104703880954, "loss": 2.4795, "step": 21609 }, { "crossentropy": 2.426647424697876, "epoch": 0.7834251740139211, "grad_norm": 0.026640623807907104, "grad_norm_var": 3.9254095474786494e-07, "learning_rate": 0.00115743864414922, "loss": 2.4261, "step": 21610 }, { "crossentropy": 2.5286483764648438, "epoch": 0.7834614269141531, "grad_norm": 0.026678036898374557, "grad_norm_var": 3.906833826723081e-07, "learning_rate": 0.001157066869809969, "loss": 2.4666, "step": 21611 }, { "crossentropy": 2.518725872039795, "epoch": 0.7834976798143851, "grad_norm": 0.02632766216993332, "grad_norm_var": 3.694986028451285e-07, "learning_rate": 0.0011566951473753612, "loss": 2.4871, "step": 21612 }, { "crossentropy": 2.576799154281616, "epoch": 0.7835339327146171, "grad_norm": 0.026434121653437614, "grad_norm_var": 3.6933291286595817e-07, "learning_rate": 0.0011563234768504167, "loss": 2.5603, "step": 21613 }, { "crossentropy": 2.3186025619506836, "epoch": 0.7835701856148491, "grad_norm": 0.026071732863783836, "grad_norm_var": 3.69659586958249e-07, "learning_rate": 0.0011559518582401579, "loss": 2.373, "step": 21614 }, { "crossentropy": 2.6186742782592773, "epoch": 0.7836064385150812, "grad_norm": 0.026028361171483994, "grad_norm_var": 3.1660279979036677e-07, "learning_rate": 0.0011555802915496018, "loss": 2.6, "step": 21615 }, { "crossentropy": 2.4006242752075195, "epoch": 0.7836426914153132, "grad_norm": 0.027248837053775787, "grad_norm_var": 3.471001710888636e-07, "learning_rate": 0.001155208776783767, "loss": 2.4998, "step": 21616 }, { "crossentropy": 2.616997718811035, "epoch": 0.7836789443155452, "grad_norm": 0.026870060712099075, "grad_norm_var": 3.4929809567689887e-07, "learning_rate": 0.0011548373139476749, "loss": 2.544, "step": 21617 }, { "crossentropy": 2.4549639225006104, "epoch": 0.7837151972157773, "grad_norm": 0.027127791196107864, "grad_norm_var": 3.6216966485129857e-07, "learning_rate": 0.0011544659030463377, "loss": 2.4718, "step": 21618 }, { "crossentropy": 2.5698256492614746, "epoch": 0.7837514501160093, "grad_norm": 0.027050185948610306, "grad_norm_var": 2.8980026423919683e-07, "learning_rate": 0.0011540945440847766, "loss": 2.5153, "step": 21619 }, { "crossentropy": 2.4849531650543213, "epoch": 0.7837877030162413, "grad_norm": 0.02651430293917656, "grad_norm_var": 2.2276168121294845e-07, "learning_rate": 0.0011537232370680034, "loss": 2.4601, "step": 21620 }, { "crossentropy": 2.5899198055267334, "epoch": 0.7838239559164734, "grad_norm": 0.026721172034740448, "grad_norm_var": 1.8867682000515914e-07, "learning_rate": 0.0011533519820010358, "loss": 2.5589, "step": 21621 }, { "crossentropy": 2.405984878540039, "epoch": 0.7838602088167054, "grad_norm": 0.029657414183020592, "grad_norm_var": 7.815477922647051e-07, "learning_rate": 0.0011529807788888875, "loss": 2.4051, "step": 21622 }, { "crossentropy": 2.52451753616333, "epoch": 0.7838964617169374, "grad_norm": 0.026228440925478935, "grad_norm_var": 7.848936757811336e-07, "learning_rate": 0.0011526096277365706, "loss": 2.5318, "step": 21623 }, { "crossentropy": 2.4374170303344727, "epoch": 0.7839327146171694, "grad_norm": 0.02641068771481514, "grad_norm_var": 7.653531878139593e-07, "learning_rate": 0.001152238528549099, "loss": 2.4617, "step": 21624 }, { "crossentropy": 2.4511144161224365, "epoch": 0.7839689675174014, "grad_norm": 0.026886610314249992, "grad_norm_var": 7.091513129412238e-07, "learning_rate": 0.0011518674813314866, "loss": 2.4658, "step": 21625 }, { "crossentropy": 2.4846253395080566, "epoch": 0.7840052204176334, "grad_norm": 0.026651764288544655, "grad_norm_var": 7.089134166059104e-07, "learning_rate": 0.0011514964860887423, "loss": 2.5087, "step": 21626 }, { "crossentropy": 2.443781852722168, "epoch": 0.7840414733178654, "grad_norm": 0.026089651510119438, "grad_norm_var": 7.406444625330932e-07, "learning_rate": 0.00115112554282588, "loss": 2.4659, "step": 21627 }, { "crossentropy": 2.3614089488983154, "epoch": 0.7840777262180975, "grad_norm": 0.027107903733849525, "grad_norm_var": 7.326834612936261e-07, "learning_rate": 0.001150754651547906, "loss": 2.4962, "step": 21628 }, { "crossentropy": 2.600430965423584, "epoch": 0.7841139791183295, "grad_norm": 0.026463739573955536, "grad_norm_var": 7.312196060902382e-07, "learning_rate": 0.0011503838122598337, "loss": 2.4966, "step": 21629 }, { "crossentropy": 2.5711820125579834, "epoch": 0.7841502320185615, "grad_norm": 0.026656948029994965, "grad_norm_var": 6.941958876612841e-07, "learning_rate": 0.00115001302496667, "loss": 2.5324, "step": 21630 }, { "crossentropy": 2.5102498531341553, "epoch": 0.7841864849187935, "grad_norm": 0.026563255116343498, "grad_norm_var": 6.529716648211026e-07, "learning_rate": 0.0011496422896734222, "loss": 2.5684, "step": 21631 }, { "crossentropy": 2.54840087890625, "epoch": 0.7842227378190255, "grad_norm": 0.026843572035431862, "grad_norm_var": 6.438763697343604e-07, "learning_rate": 0.0011492716063850972, "loss": 2.4988, "step": 21632 }, { "crossentropy": 2.2922370433807373, "epoch": 0.7842589907192575, "grad_norm": 0.02630717121064663, "grad_norm_var": 6.633157592479533e-07, "learning_rate": 0.0011489009751067052, "loss": 2.389, "step": 21633 }, { "crossentropy": 2.4985804557800293, "epoch": 0.7842952436194895, "grad_norm": 0.02876915968954563, "grad_norm_var": 8.968594084217407e-07, "learning_rate": 0.001148530395843248, "loss": 2.5992, "step": 21634 }, { "crossentropy": 2.522037982940674, "epoch": 0.7843314965197216, "grad_norm": 0.027215104550123215, "grad_norm_var": 9.011443859762343e-07, "learning_rate": 0.0011481598685997342, "loss": 2.5612, "step": 21635 }, { "crossentropy": 2.691479206085205, "epoch": 0.7843677494199536, "grad_norm": 0.02716626599431038, "grad_norm_var": 8.904504103302475e-07, "learning_rate": 0.001147789393381165, "loss": 2.5658, "step": 21636 }, { "crossentropy": 2.464796781539917, "epoch": 0.7844040023201856, "grad_norm": 0.0283324234187603, "grad_norm_var": 9.963133459795285e-07, "learning_rate": 0.0011474189701925458, "loss": 2.4694, "step": 21637 }, { "crossentropy": 2.403428316116333, "epoch": 0.7844402552204176, "grad_norm": 0.02610855922102928, "grad_norm_var": 5.659525051833335e-07, "learning_rate": 0.0011470485990388835, "loss": 2.4674, "step": 21638 }, { "crossentropy": 2.5543951988220215, "epoch": 0.7844765081206496, "grad_norm": 0.02613113448023796, "grad_norm_var": 5.747717133667656e-07, "learning_rate": 0.0011466782799251735, "loss": 2.5617, "step": 21639 }, { "crossentropy": 2.5506229400634766, "epoch": 0.7845127610208816, "grad_norm": 0.0259073656052351, "grad_norm_var": 6.205231196204266e-07, "learning_rate": 0.0011463080128564212, "loss": 2.5547, "step": 21640 }, { "crossentropy": 2.3147740364074707, "epoch": 0.7845490139211136, "grad_norm": 0.027755189687013626, "grad_norm_var": 6.748055778031207e-07, "learning_rate": 0.001145937797837629, "loss": 2.4215, "step": 21641 }, { "crossentropy": 2.5354504585266113, "epoch": 0.7845852668213457, "grad_norm": 0.027285313233733177, "grad_norm_var": 6.806692715942304e-07, "learning_rate": 0.0011455676348737936, "loss": 2.5561, "step": 21642 }, { "crossentropy": 2.38604474067688, "epoch": 0.7846215197215777, "grad_norm": 0.028292903676629066, "grad_norm_var": 7.404519178334789e-07, "learning_rate": 0.0011451975239699186, "loss": 2.3913, "step": 21643 }, { "crossentropy": 2.382195472717285, "epoch": 0.7846577726218097, "grad_norm": 0.026341373100876808, "grad_norm_var": 7.719341566271576e-07, "learning_rate": 0.0011448274651309992, "loss": 2.4249, "step": 21644 }, { "crossentropy": 2.463286876678467, "epoch": 0.7846940255220418, "grad_norm": 0.027051247656345367, "grad_norm_var": 7.508164977074646e-07, "learning_rate": 0.0011444574583620354, "loss": 2.4679, "step": 21645 }, { "crossentropy": 2.519857406616211, "epoch": 0.7847302784222738, "grad_norm": 0.026566144078969955, "grad_norm_var": 7.560353396676133e-07, "learning_rate": 0.001144087503668027, "loss": 2.5333, "step": 21646 }, { "crossentropy": 2.480886459350586, "epoch": 0.7847665313225058, "grad_norm": 0.027726395055651665, "grad_norm_var": 7.66692111644982e-07, "learning_rate": 0.0011437176010539662, "loss": 2.4954, "step": 21647 }, { "crossentropy": 2.3880372047424316, "epoch": 0.7848027842227379, "grad_norm": 0.02579190768301487, "grad_norm_var": 8.735206390395392e-07, "learning_rate": 0.0011433477505248513, "loss": 2.3915, "step": 21648 }, { "crossentropy": 2.3820736408233643, "epoch": 0.7848390371229699, "grad_norm": 0.026770874857902527, "grad_norm_var": 8.412347253619699e-07, "learning_rate": 0.0011429779520856786, "loss": 2.4733, "step": 21649 }, { "crossentropy": 2.606448173522949, "epoch": 0.7848752900232019, "grad_norm": 0.026391219347715378, "grad_norm_var": 6.577243037979296e-07, "learning_rate": 0.001142608205741441, "loss": 2.4755, "step": 21650 }, { "crossentropy": 2.441494941711426, "epoch": 0.7849115429234339, "grad_norm": 0.026199959218502045, "grad_norm_var": 6.831481011811945e-07, "learning_rate": 0.0011422385114971346, "loss": 2.4106, "step": 21651 }, { "crossentropy": 2.5330262184143066, "epoch": 0.7849477958236659, "grad_norm": 0.026501720771193504, "grad_norm_var": 6.839350865376342e-07, "learning_rate": 0.00114186886935775, "loss": 2.4678, "step": 21652 }, { "crossentropy": 2.5454294681549072, "epoch": 0.7849840487238979, "grad_norm": 0.025904914364218712, "grad_norm_var": 5.633945816531834e-07, "learning_rate": 0.0011414992793282815, "loss": 2.5379, "step": 21653 }, { "crossentropy": 2.4616963863372803, "epoch": 0.78502030162413, "grad_norm": 0.027046140283346176, "grad_norm_var": 5.481009427053356e-07, "learning_rate": 0.0011411297414137234, "loss": 2.4208, "step": 21654 }, { "crossentropy": 2.4906673431396484, "epoch": 0.785056554524362, "grad_norm": 0.02733272686600685, "grad_norm_var": 5.425565098237754e-07, "learning_rate": 0.001140760255619061, "loss": 2.4876, "step": 21655 }, { "crossentropy": 2.498664617538452, "epoch": 0.785092807424594, "grad_norm": 0.02623148448765278, "grad_norm_var": 5.103697378535642e-07, "learning_rate": 0.0011403908219492882, "loss": 2.5503, "step": 21656 }, { "crossentropy": 2.4387569427490234, "epoch": 0.785129060324826, "grad_norm": 0.02812599018216133, "grad_norm_var": 5.64984090984227e-07, "learning_rate": 0.0011400214404093962, "loss": 2.4592, "step": 21657 }, { "crossentropy": 2.582756519317627, "epoch": 0.785165313225058, "grad_norm": 0.025656186044216156, "grad_norm_var": 6.357663690048035e-07, "learning_rate": 0.0011396521110043705, "loss": 2.526, "step": 21658 }, { "crossentropy": 2.4493210315704346, "epoch": 0.78520156612529, "grad_norm": 0.026856228709220886, "grad_norm_var": 4.6839123433676384e-07, "learning_rate": 0.001139282833739203, "loss": 2.4695, "step": 21659 }, { "crossentropy": 2.509028434753418, "epoch": 0.785237819025522, "grad_norm": 0.03998539224267006, "grad_norm_var": 1.1531143928251982e-05, "learning_rate": 0.0011389136086188785, "loss": 2.4568, "step": 21660 }, { "crossentropy": 2.543637990951538, "epoch": 0.785274071925754, "grad_norm": 0.025403134524822235, "grad_norm_var": 1.1801426486188883e-05, "learning_rate": 0.0011385444356483848, "loss": 2.5491, "step": 21661 }, { "crossentropy": 2.500948905944824, "epoch": 0.7853103248259861, "grad_norm": 0.025882869958877563, "grad_norm_var": 1.1907087252230548e-05, "learning_rate": 0.00113817531483271, "loss": 2.4314, "step": 21662 }, { "crossentropy": 2.6129167079925537, "epoch": 0.7853465777262181, "grad_norm": 0.026841629296541214, "grad_norm_var": 1.1913137331365925e-05, "learning_rate": 0.001137806246176838, "loss": 2.6031, "step": 21663 }, { "crossentropy": 2.551694631576538, "epoch": 0.7853828306264501, "grad_norm": 0.02627311274409294, "grad_norm_var": 1.1830358760755175e-05, "learning_rate": 0.0011374372296857522, "loss": 2.5521, "step": 21664 }, { "crossentropy": 2.4286038875579834, "epoch": 0.7854190835266821, "grad_norm": 0.02570418268442154, "grad_norm_var": 1.1982093744374945e-05, "learning_rate": 0.0011370682653644398, "loss": 2.459, "step": 21665 }, { "crossentropy": 2.485297679901123, "epoch": 0.7854553364269141, "grad_norm": 0.025720350444316864, "grad_norm_var": 1.2088923461670688e-05, "learning_rate": 0.0011366993532178815, "loss": 2.436, "step": 21666 }, { "crossentropy": 2.4083073139190674, "epoch": 0.7854915893271461, "grad_norm": 0.026848597452044487, "grad_norm_var": 1.2026211557413826e-05, "learning_rate": 0.0011363304932510627, "loss": 2.4539, "step": 21667 }, { "crossentropy": 2.4084765911102295, "epoch": 0.7855278422273781, "grad_norm": 0.025830214843153954, "grad_norm_var": 1.2123151402057958e-05, "learning_rate": 0.001135961685468962, "loss": 2.3967, "step": 21668 }, { "crossentropy": 2.301452875137329, "epoch": 0.7855640951276102, "grad_norm": 0.026257315650582314, "grad_norm_var": 1.2068759685879975e-05, "learning_rate": 0.0011355929298765626, "loss": 2.3785, "step": 21669 }, { "crossentropy": 2.4774792194366455, "epoch": 0.7856003480278422, "grad_norm": 0.027085023000836372, "grad_norm_var": 1.2067798734673853e-05, "learning_rate": 0.001135224226478847, "loss": 2.3848, "step": 21670 }, { "crossentropy": 2.566068649291992, "epoch": 0.7856366009280742, "grad_norm": 0.026974724605679512, "grad_norm_var": 1.2071962975879144e-05, "learning_rate": 0.0011348555752807932, "loss": 2.5391, "step": 21671 }, { "crossentropy": 2.6402549743652344, "epoch": 0.7856728538283063, "grad_norm": 0.026903897523880005, "grad_norm_var": 1.2010719667719298e-05, "learning_rate": 0.001134486976287379, "loss": 2.5629, "step": 21672 }, { "crossentropy": 2.5495376586914062, "epoch": 0.7857091067285383, "grad_norm": 0.026176024228334427, "grad_norm_var": 1.2026282860245678e-05, "learning_rate": 0.001134118429503585, "loss": 2.58, "step": 21673 }, { "crossentropy": 2.4673688411712646, "epoch": 0.7857453596287703, "grad_norm": 0.027144310995936394, "grad_norm_var": 1.186830636626714e-05, "learning_rate": 0.0011337499349343878, "loss": 2.5247, "step": 21674 }, { "crossentropy": 2.4494032859802246, "epoch": 0.7857816125290024, "grad_norm": 0.026554148644208908, "grad_norm_var": 1.1889585263761121e-05, "learning_rate": 0.0011333814925847657, "loss": 2.4136, "step": 21675 }, { "crossentropy": 2.396390676498413, "epoch": 0.7858178654292344, "grad_norm": 0.025769727304577827, "grad_norm_var": 3.317923977511622e-07, "learning_rate": 0.0011330131024596935, "loss": 2.4799, "step": 21676 }, { "crossentropy": 2.4671435356140137, "epoch": 0.7858541183294664, "grad_norm": 0.026546122506260872, "grad_norm_var": 2.713407002404832e-07, "learning_rate": 0.0011326447645641474, "loss": 2.491, "step": 21677 }, { "crossentropy": 2.488696575164795, "epoch": 0.7858903712296984, "grad_norm": 0.026486245915293694, "grad_norm_var": 2.5192701413784675e-07, "learning_rate": 0.0011322764789031042, "loss": 2.5885, "step": 21678 }, { "crossentropy": 2.5334787368774414, "epoch": 0.7859266241299304, "grad_norm": 0.02656463533639908, "grad_norm_var": 2.4206375272016874e-07, "learning_rate": 0.0011319082454815349, "loss": 2.5288, "step": 21679 }, { "crossentropy": 2.4978954792022705, "epoch": 0.7859628770301624, "grad_norm": 0.027934115380048752, "grad_norm_var": 3.803240867157233e-07, "learning_rate": 0.0011315400643044165, "loss": 2.5091, "step": 21680 }, { "crossentropy": 2.352670431137085, "epoch": 0.7859991299303944, "grad_norm": 0.02641579508781433, "grad_norm_var": 3.3350224081077754e-07, "learning_rate": 0.0011311719353767203, "loss": 2.3989, "step": 21681 }, { "crossentropy": 2.564378499984741, "epoch": 0.7860353828306265, "grad_norm": 0.026044614613056183, "grad_norm_var": 3.0309258346228734e-07, "learning_rate": 0.0011308038587034169, "loss": 2.5868, "step": 21682 }, { "crossentropy": 2.5387065410614014, "epoch": 0.7860716357308585, "grad_norm": 0.027616305276751518, "grad_norm_var": 3.65787793744895e-07, "learning_rate": 0.00113043583428948, "loss": 2.5232, "step": 21683 }, { "crossentropy": 2.323467493057251, "epoch": 0.7861078886310905, "grad_norm": 0.02623605541884899, "grad_norm_var": 3.320489845536617e-07, "learning_rate": 0.0011300678621398786, "loss": 2.3511, "step": 21684 }, { "crossentropy": 2.5109217166900635, "epoch": 0.7861441415313225, "grad_norm": 0.025679610669612885, "grad_norm_var": 3.846432384939253e-07, "learning_rate": 0.0011296999422595834, "loss": 2.544, "step": 21685 }, { "crossentropy": 2.4466373920440674, "epoch": 0.7861803944315545, "grad_norm": 0.02729237824678421, "grad_norm_var": 3.9982194635991966e-07, "learning_rate": 0.0011293320746535646, "loss": 2.517, "step": 21686 }, { "crossentropy": 2.506882905960083, "epoch": 0.7862166473317865, "grad_norm": 0.030428951606154442, "grad_norm_var": 1.2968727364224037e-06, "learning_rate": 0.0011289642593267895, "loss": 2.5145, "step": 21687 }, { "crossentropy": 2.537491798400879, "epoch": 0.7862529002320185, "grad_norm": 0.026168789714574814, "grad_norm_var": 1.3265458957408543e-06, "learning_rate": 0.0011285964962842276, "loss": 2.5101, "step": 21688 }, { "crossentropy": 2.5588550567626953, "epoch": 0.7862891531322506, "grad_norm": 0.027087045833468437, "grad_norm_var": 1.3006669510035387e-06, "learning_rate": 0.0011282287855308448, "loss": 2.5512, "step": 21689 }, { "crossentropy": 2.302551507949829, "epoch": 0.7863254060324826, "grad_norm": 0.025976944714784622, "grad_norm_var": 1.3436175056857526e-06, "learning_rate": 0.0011278611270716071, "loss": 2.3845, "step": 21690 }, { "crossentropy": 2.5433285236358643, "epoch": 0.7863616589327146, "grad_norm": 0.02726116217672825, "grad_norm_var": 1.3516744759622482e-06, "learning_rate": 0.0011274935209114824, "loss": 2.4954, "step": 21691 }, { "crossentropy": 2.52805233001709, "epoch": 0.7863979118329466, "grad_norm": 0.026827674359083176, "grad_norm_var": 1.2700515683199449e-06, "learning_rate": 0.001127125967055433, "loss": 2.4906, "step": 21692 }, { "crossentropy": 2.522761821746826, "epoch": 0.7864341647331786, "grad_norm": 0.026111384853720665, "grad_norm_var": 1.3029793920613185e-06, "learning_rate": 0.0011267584655084246, "loss": 2.4459, "step": 21693 }, { "crossentropy": 2.64581036567688, "epoch": 0.7864704176334106, "grad_norm": 0.02608545497059822, "grad_norm_var": 1.334233426110385e-06, "learning_rate": 0.001126391016275422, "loss": 2.487, "step": 21694 }, { "crossentropy": 2.342034101486206, "epoch": 0.7865066705336426, "grad_norm": 0.027382269501686096, "grad_norm_var": 1.34401439773098e-06, "learning_rate": 0.001126023619361386, "loss": 2.555, "step": 21695 }, { "crossentropy": 2.506629228591919, "epoch": 0.7865429234338747, "grad_norm": 0.0259319469332695, "grad_norm_var": 1.3209722702046492e-06, "learning_rate": 0.0011256562747712817, "loss": 2.4761, "step": 21696 }, { "crossentropy": 2.429037094116211, "epoch": 0.7865791763341067, "grad_norm": 0.026637310162186623, "grad_norm_var": 1.3131596190243578e-06, "learning_rate": 0.0011252889825100686, "loss": 2.4469, "step": 21697 }, { "crossentropy": 2.5983827114105225, "epoch": 0.7866154292343387, "grad_norm": 0.026838382706046104, "grad_norm_var": 1.272804419112881e-06, "learning_rate": 0.0011249217425827063, "loss": 2.5462, "step": 21698 }, { "crossentropy": 2.4262216091156006, "epoch": 0.7866516821345708, "grad_norm": 0.027583232149481773, "grad_norm_var": 1.2694830037759027e-06, "learning_rate": 0.0011245545549941583, "loss": 2.3747, "step": 21699 }, { "crossentropy": 2.400317907333374, "epoch": 0.7866879350348028, "grad_norm": 0.027982795611023903, "grad_norm_var": 1.3182293423757407e-06, "learning_rate": 0.0011241874197493807, "loss": 2.5339, "step": 21700 }, { "crossentropy": 2.327925682067871, "epoch": 0.7867241879350348, "grad_norm": 0.02562860958278179, "grad_norm_var": 1.327062761065353e-06, "learning_rate": 0.001123820336853334, "loss": 2.3711, "step": 21701 }, { "crossentropy": 2.4041855335235596, "epoch": 0.7867604408352669, "grad_norm": 0.026559891179203987, "grad_norm_var": 1.3273065216561884e-06, "learning_rate": 0.001123453306310977, "loss": 2.4242, "step": 21702 }, { "crossentropy": 2.591291904449463, "epoch": 0.7867966937354989, "grad_norm": 0.026469489559531212, "grad_norm_var": 4.4713743436401027e-07, "learning_rate": 0.0011230863281272646, "loss": 2.5134, "step": 21703 }, { "crossentropy": 2.5436019897460938, "epoch": 0.7868329466357309, "grad_norm": 0.02910519391298294, "grad_norm_var": 7.943986237539391e-07, "learning_rate": 0.0011227194023071562, "loss": 2.4399, "step": 21704 }, { "crossentropy": 2.3284358978271484, "epoch": 0.7868691995359629, "grad_norm": 0.025920288637280464, "grad_norm_var": 8.413288441273698e-07, "learning_rate": 0.0011223525288556063, "loss": 2.4064, "step": 21705 }, { "crossentropy": 2.297811269760132, "epoch": 0.7869054524361949, "grad_norm": 0.027488529682159424, "grad_norm_var": 8.245247011912556e-07, "learning_rate": 0.0011219857077775686, "loss": 2.4212, "step": 21706 }, { "crossentropy": 2.477992534637451, "epoch": 0.7869417053364269, "grad_norm": 0.026094632223248482, "grad_norm_var": 8.476997221169384e-07, "learning_rate": 0.0011216189390780007, "loss": 2.4753, "step": 21707 }, { "crossentropy": 2.438697099685669, "epoch": 0.786977958236659, "grad_norm": 0.02512243576347828, "grad_norm_var": 1.0209744973909413e-06, "learning_rate": 0.0011212522227618533, "loss": 2.2978, "step": 21708 }, { "crossentropy": 2.4670827388763428, "epoch": 0.787014211136891, "grad_norm": 0.026582961902022362, "grad_norm_var": 9.988777255487963e-07, "learning_rate": 0.0011208855588340804, "loss": 2.4577, "step": 21709 }, { "crossentropy": 2.485898971557617, "epoch": 0.787050464037123, "grad_norm": 0.026385366916656494, "grad_norm_var": 9.793914333308992e-07, "learning_rate": 0.0011205189472996363, "loss": 2.4982, "step": 21710 }, { "crossentropy": 2.4010508060455322, "epoch": 0.787086716937355, "grad_norm": 0.027470706030726433, "grad_norm_var": 9.87546939177723e-07, "learning_rate": 0.0011201523881634696, "loss": 2.4771, "step": 21711 }, { "crossentropy": 2.4692060947418213, "epoch": 0.787122969837587, "grad_norm": 0.026469195261597633, "grad_norm_var": 9.478744651440166e-07, "learning_rate": 0.0011197858814305346, "loss": 2.52, "step": 21712 }, { "crossentropy": 2.4581170082092285, "epoch": 0.787159222737819, "grad_norm": 0.028198406100273132, "grad_norm_var": 1.0723219221971113e-06, "learning_rate": 0.0011194194271057795, "loss": 2.5364, "step": 21713 }, { "crossentropy": 2.4355971813201904, "epoch": 0.787195475638051, "grad_norm": 0.026110153645277023, "grad_norm_var": 1.1084160608162366e-06, "learning_rate": 0.0011190530251941528, "loss": 2.4546, "step": 21714 }, { "crossentropy": 2.582512140274048, "epoch": 0.787231728538283, "grad_norm": 0.026642562821507454, "grad_norm_var": 1.0683999384215043e-06, "learning_rate": 0.0011186866757006053, "loss": 2.5771, "step": 21715 }, { "crossentropy": 2.3817362785339355, "epoch": 0.7872679814385151, "grad_norm": 0.027099646627902985, "grad_norm_var": 9.736829932376068e-07, "learning_rate": 0.0011183203786300838, "loss": 2.4044, "step": 21716 }, { "crossentropy": 2.6840202808380127, "epoch": 0.7873042343387471, "grad_norm": 0.026958972215652466, "grad_norm_var": 8.926129474084623e-07, "learning_rate": 0.001117954133987536, "loss": 2.6101, "step": 21717 }, { "crossentropy": 2.6113197803497314, "epoch": 0.7873404872389791, "grad_norm": 0.02747492864727974, "grad_norm_var": 9.165763100495046e-07, "learning_rate": 0.00111758794177791, "loss": 2.5433, "step": 21718 }, { "crossentropy": 2.301100730895996, "epoch": 0.7873767401392111, "grad_norm": 0.028533268719911575, "grad_norm_var": 1.0781823723804515e-06, "learning_rate": 0.0011172218020061493, "loss": 2.3268, "step": 21719 }, { "crossentropy": 2.503002405166626, "epoch": 0.7874129930394431, "grad_norm": 0.02679046243429184, "grad_norm_var": 7.567169511560883e-07, "learning_rate": 0.001116855714677202, "loss": 2.4432, "step": 21720 }, { "crossentropy": 2.3056743144989014, "epoch": 0.7874492459396751, "grad_norm": 0.027482163161039352, "grad_norm_var": 7.18921663612979e-07, "learning_rate": 0.0011164896797960094, "loss": 2.4736, "step": 21721 }, { "crossentropy": 2.5013954639434814, "epoch": 0.7874854988399071, "grad_norm": 0.026847388595342636, "grad_norm_var": 6.969971783755506e-07, "learning_rate": 0.0011161236973675187, "loss": 2.528, "step": 21722 }, { "crossentropy": 2.519711971282959, "epoch": 0.7875217517401392, "grad_norm": 0.026986844837665558, "grad_norm_var": 6.519587251178066e-07, "learning_rate": 0.0011157577673966718, "loss": 2.4985, "step": 21723 }, { "crossentropy": 2.385056495666504, "epoch": 0.7875580046403712, "grad_norm": 0.025854680687189102, "grad_norm_var": 5.073119505083027e-07, "learning_rate": 0.0011153918898884091, "loss": 2.422, "step": 21724 }, { "crossentropy": 2.451627731323242, "epoch": 0.7875942575406032, "grad_norm": 0.025851789861917496, "grad_norm_var": 5.80697911868932e-07, "learning_rate": 0.001115026064847674, "loss": 2.4969, "step": 21725 }, { "crossentropy": 2.406704902648926, "epoch": 0.7876305104408353, "grad_norm": 0.02521049790084362, "grad_norm_var": 7.549915270951257e-07, "learning_rate": 0.0011146602922794092, "loss": 2.4091, "step": 21726 }, { "crossentropy": 2.5170834064483643, "epoch": 0.7876667633410673, "grad_norm": 0.026398759335279465, "grad_norm_var": 7.415026078924954e-07, "learning_rate": 0.0011142945721885516, "loss": 2.5, "step": 21727 }, { "crossentropy": 2.6276142597198486, "epoch": 0.7877030162412993, "grad_norm": 0.026419976726174355, "grad_norm_var": 7.438699110355924e-07, "learning_rate": 0.0011139289045800438, "loss": 2.4647, "step": 21728 }, { "crossentropy": 2.6501317024230957, "epoch": 0.7877392691415314, "grad_norm": 0.02642780914902687, "grad_norm_var": 6.105658252293268e-07, "learning_rate": 0.0011135632894588227, "loss": 2.5623, "step": 21729 }, { "crossentropy": 2.4533815383911133, "epoch": 0.7877755220417634, "grad_norm": 0.02572072297334671, "grad_norm_var": 6.503142904852455e-07, "learning_rate": 0.0011131977268298283, "loss": 2.4907, "step": 21730 }, { "crossentropy": 2.3741402626037598, "epoch": 0.7878117749419954, "grad_norm": 0.025752410292625427, "grad_norm_var": 7.029491097853187e-07, "learning_rate": 0.001112832216697997, "loss": 2.3485, "step": 21731 }, { "crossentropy": 2.663938045501709, "epoch": 0.7878480278422274, "grad_norm": 0.026847396045923233, "grad_norm_var": 6.905633024245049e-07, "learning_rate": 0.0011124667590682647, "loss": 2.6, "step": 21732 }, { "crossentropy": 2.446777582168579, "epoch": 0.7878842807424594, "grad_norm": 0.025632798671722412, "grad_norm_var": 7.365463555202821e-07, "learning_rate": 0.0011121013539455676, "loss": 2.4748, "step": 21733 }, { "crossentropy": 2.5982391834259033, "epoch": 0.7879205336426914, "grad_norm": 0.026297304779291153, "grad_norm_var": 6.724170685270513e-07, "learning_rate": 0.0011117360013348438, "loss": 2.511, "step": 21734 }, { "crossentropy": 2.4113376140594482, "epoch": 0.7879567865429234, "grad_norm": 0.02584671601653099, "grad_norm_var": 3.7401091194788394e-07, "learning_rate": 0.001111370701241024, "loss": 2.5289, "step": 21735 }, { "crossentropy": 2.3845770359039307, "epoch": 0.7879930394431555, "grad_norm": 0.025831397622823715, "grad_norm_var": 3.6532584963671253e-07, "learning_rate": 0.001111005453669046, "loss": 2.4221, "step": 21736 }, { "crossentropy": 2.4810359477996826, "epoch": 0.7880292923433875, "grad_norm": 0.045531414449214935, "grad_norm_var": 2.3780520061503162e-05, "learning_rate": 0.0011106402586238396, "loss": 2.484, "step": 21737 }, { "crossentropy": 2.4267077445983887, "epoch": 0.7880655452436195, "grad_norm": 0.027385512366890907, "grad_norm_var": 2.376319354274249e-05, "learning_rate": 0.0011102751161103403, "loss": 2.3955, "step": 21738 }, { "crossentropy": 2.4796302318573, "epoch": 0.7881017981438515, "grad_norm": 0.02556784451007843, "grad_norm_var": 2.396243321205227e-05, "learning_rate": 0.0011099100261334782, "loss": 2.4445, "step": 21739 }, { "crossentropy": 2.5248093605041504, "epoch": 0.7881380510440835, "grad_norm": 0.02935650758445263, "grad_norm_var": 2.4060529927901324e-05, "learning_rate": 0.0011095449886981835, "loss": 2.5344, "step": 21740 }, { "crossentropy": 2.471947193145752, "epoch": 0.7881743039443155, "grad_norm": 0.02704606018960476, "grad_norm_var": 2.3886433249721663e-05, "learning_rate": 0.0011091800038093875, "loss": 2.408, "step": 21741 }, { "crossentropy": 2.444854736328125, "epoch": 0.7882105568445475, "grad_norm": 0.026393456384539604, "grad_norm_var": 2.3600226563099642e-05, "learning_rate": 0.0011088150714720214, "loss": 2.4726, "step": 21742 }, { "crossentropy": 2.566850185394287, "epoch": 0.7882468097447796, "grad_norm": 0.026925528421998024, "grad_norm_var": 2.3529441222647622e-05, "learning_rate": 0.0011084501916910111, "loss": 2.5007, "step": 21743 }, { "crossentropy": 2.5918900966644287, "epoch": 0.7882830626450116, "grad_norm": 0.02583272196352482, "grad_norm_var": 2.365015946142786e-05, "learning_rate": 0.0011080853644712884, "loss": 2.5234, "step": 21744 }, { "crossentropy": 2.492805004119873, "epoch": 0.7883193155452436, "grad_norm": 0.027647724375128746, "grad_norm_var": 2.3544420352523372e-05, "learning_rate": 0.0011077205898177777, "loss": 2.535, "step": 21745 }, { "crossentropy": 2.490729331970215, "epoch": 0.7883555684454756, "grad_norm": 0.02695443108677864, "grad_norm_var": 2.330969568198082e-05, "learning_rate": 0.0011073558677354074, "loss": 2.5017, "step": 21746 }, { "crossentropy": 2.421508550643921, "epoch": 0.7883918213457076, "grad_norm": 0.025293463841080666, "grad_norm_var": 2.3448346305785365e-05, "learning_rate": 0.001106991198229106, "loss": 2.457, "step": 21747 }, { "crossentropy": 2.6304891109466553, "epoch": 0.7884280742459396, "grad_norm": 0.027312539517879486, "grad_norm_var": 2.3404377202213e-05, "learning_rate": 0.0011066265813037934, "loss": 2.5702, "step": 21748 }, { "crossentropy": 2.50459623336792, "epoch": 0.7884643271461717, "grad_norm": 0.026018992066383362, "grad_norm_var": 2.3301925911409226e-05, "learning_rate": 0.0011062620169643971, "loss": 2.5172, "step": 21749 }, { "crossentropy": 2.4606103897094727, "epoch": 0.7885005800464037, "grad_norm": 0.02554263174533844, "grad_norm_var": 2.3491504723553194e-05, "learning_rate": 0.0011058975052158427, "loss": 2.579, "step": 21750 }, { "crossentropy": 2.44649076461792, "epoch": 0.7885368329466357, "grad_norm": 0.02600344829261303, "grad_norm_var": 2.3452629902764157e-05, "learning_rate": 0.0011055330460630508, "loss": 2.417, "step": 21751 }, { "crossentropy": 2.372380018234253, "epoch": 0.7885730858468677, "grad_norm": 0.02753511257469654, "grad_norm_var": 2.318907300394342e-05, "learning_rate": 0.0011051686395109467, "loss": 2.4045, "step": 21752 }, { "crossentropy": 2.551464319229126, "epoch": 0.7886093387470998, "grad_norm": 0.02793303318321705, "grad_norm_var": 1.1665492720779365e-06, "learning_rate": 0.001104804285564449, "loss": 2.5353, "step": 21753 }, { "crossentropy": 2.3469018936157227, "epoch": 0.7886455916473318, "grad_norm": 0.02607489563524723, "grad_norm_var": 1.1710319714410318e-06, "learning_rate": 0.0011044399842284809, "loss": 2.4085, "step": 21754 }, { "crossentropy": 2.4321882724761963, "epoch": 0.7886818445475638, "grad_norm": 0.026461847126483917, "grad_norm_var": 1.0842551635457567e-06, "learning_rate": 0.001104075735507965, "loss": 2.4348, "step": 21755 }, { "crossentropy": 2.64272403717041, "epoch": 0.7887180974477959, "grad_norm": 0.026425233110785484, "grad_norm_var": 6.106792214727638e-07, "learning_rate": 0.0011037115394078162, "loss": 2.6011, "step": 21756 }, { "crossentropy": 2.4036073684692383, "epoch": 0.7887543503480279, "grad_norm": 0.025695139542222023, "grad_norm_var": 6.421563630685471e-07, "learning_rate": 0.0011033473959329554, "loss": 2.3893, "step": 21757 }, { "crossentropy": 2.4815382957458496, "epoch": 0.7887906032482599, "grad_norm": 0.02622162364423275, "grad_norm_var": 6.465146746631543e-07, "learning_rate": 0.0011029833050883032, "loss": 2.3997, "step": 21758 }, { "crossentropy": 2.404524803161621, "epoch": 0.7888268561484919, "grad_norm": 0.025843117386102676, "grad_norm_var": 6.572304911045893e-07, "learning_rate": 0.0011026192668787738, "loss": 2.4828, "step": 21759 }, { "crossentropy": 2.5130913257598877, "epoch": 0.7888631090487239, "grad_norm": 0.026185519993305206, "grad_norm_var": 6.371609329353419e-07, "learning_rate": 0.0011022552813092867, "loss": 2.578, "step": 21760 }, { "crossentropy": 2.491529703140259, "epoch": 0.7888993619489559, "grad_norm": 0.028223862871527672, "grad_norm_var": 7.501602994492192e-07, "learning_rate": 0.0011018913483847564, "loss": 2.4152, "step": 21761 }, { "crossentropy": 2.5125014781951904, "epoch": 0.788935614849188, "grad_norm": 0.027888696640729904, "grad_norm_var": 8.634633344584229e-07, "learning_rate": 0.0011015274681100984, "loss": 2.5218, "step": 21762 }, { "crossentropy": 2.4389846324920654, "epoch": 0.78897186774942, "grad_norm": 0.027347972616553307, "grad_norm_var": 7.85478976932892e-07, "learning_rate": 0.0011011636404902297, "loss": 2.4667, "step": 21763 }, { "crossentropy": 2.373361587524414, "epoch": 0.789008120649652, "grad_norm": 0.026926908642053604, "grad_norm_var": 7.617153265505738e-07, "learning_rate": 0.0011007998655300634, "loss": 2.4052, "step": 21764 }, { "crossentropy": 2.4770936965942383, "epoch": 0.789044373549884, "grad_norm": 0.028061097487807274, "grad_norm_var": 8.517658561924453e-07, "learning_rate": 0.00110043614323451, "loss": 2.5263, "step": 21765 }, { "crossentropy": 2.3748655319213867, "epoch": 0.789080626450116, "grad_norm": 0.025936990976333618, "grad_norm_var": 7.967844948583251e-07, "learning_rate": 0.0011000724736084854, "loss": 2.4557, "step": 21766 }, { "crossentropy": 2.5271570682525635, "epoch": 0.789116879350348, "grad_norm": 0.02576255425810814, "grad_norm_var": 8.259247080746965e-07, "learning_rate": 0.0010997088566568997, "loss": 2.5212, "step": 21767 }, { "crossentropy": 2.4265058040618896, "epoch": 0.78915313225058, "grad_norm": 0.025551535189151764, "grad_norm_var": 8.728468077862441e-07, "learning_rate": 0.0010993452923846658, "loss": 2.4509, "step": 21768 }, { "crossentropy": 2.5306358337402344, "epoch": 0.789189385150812, "grad_norm": 0.027913950383663177, "grad_norm_var": 8.69627319789951e-07, "learning_rate": 0.0010989817807966922, "loss": 2.5089, "step": 21769 }, { "crossentropy": 2.592834949493408, "epoch": 0.7892256380510441, "grad_norm": 0.026108967140316963, "grad_norm_var": 8.670529112074586e-07, "learning_rate": 0.0010986183218978896, "loss": 2.5469, "step": 21770 }, { "crossentropy": 2.4912614822387695, "epoch": 0.7892618909512761, "grad_norm": 0.026726718991994858, "grad_norm_var": 8.644507128935704e-07, "learning_rate": 0.001098254915693168, "loss": 2.4849, "step": 21771 }, { "crossentropy": 2.3871686458587646, "epoch": 0.7892981438515081, "grad_norm": 0.026015253737568855, "grad_norm_var": 8.886770923934115e-07, "learning_rate": 0.0010978915621874352, "loss": 2.457, "step": 21772 }, { "crossentropy": 2.6067240238189697, "epoch": 0.7893343967517401, "grad_norm": 0.025724804028868675, "grad_norm_var": 8.849529155450225e-07, "learning_rate": 0.001097528261385597, "loss": 2.5946, "step": 21773 }, { "crossentropy": 2.6330127716064453, "epoch": 0.7893706496519721, "grad_norm": 0.02659396454691887, "grad_norm_var": 8.72228044462795e-07, "learning_rate": 0.0010971650132925637, "loss": 2.4924, "step": 21774 }, { "crossentropy": 2.467533588409424, "epoch": 0.7894069025522041, "grad_norm": 0.026360798627138138, "grad_norm_var": 8.315062570128536e-07, "learning_rate": 0.0010968018179132377, "loss": 2.4626, "step": 21775 }, { "crossentropy": 2.5163793563842773, "epoch": 0.7894431554524362, "grad_norm": 0.025899581611156464, "grad_norm_var": 8.565397194163285e-07, "learning_rate": 0.0010964386752525279, "loss": 2.499, "step": 21776 }, { "crossentropy": 2.5227274894714355, "epoch": 0.7894794083526682, "grad_norm": 0.026897678151726723, "grad_norm_var": 6.952782789573786e-07, "learning_rate": 0.0010960755853153365, "loss": 2.5442, "step": 21777 }, { "crossentropy": 2.4303908348083496, "epoch": 0.7895156612529002, "grad_norm": 0.026226593181490898, "grad_norm_var": 5.83974173435026e-07, "learning_rate": 0.0010957125481065688, "loss": 2.4126, "step": 21778 }, { "crossentropy": 2.4328839778900146, "epoch": 0.7895519141531323, "grad_norm": 0.027603963389992714, "grad_norm_var": 6.168948494921032e-07, "learning_rate": 0.0010953495636311294, "loss": 2.4644, "step": 21779 }, { "crossentropy": 2.4133493900299072, "epoch": 0.7895881670533643, "grad_norm": 0.025917192921042442, "grad_norm_var": 6.257609106328129e-07, "learning_rate": 0.0010949866318939183, "loss": 2.4638, "step": 21780 }, { "crossentropy": 2.5201027393341064, "epoch": 0.7896244199535963, "grad_norm": 0.026923196390271187, "grad_norm_var": 4.6321499182344396e-07, "learning_rate": 0.0010946237528998398, "loss": 2.4159, "step": 21781 }, { "crossentropy": 2.347630500793457, "epoch": 0.7896606728538283, "grad_norm": 0.026469726115465164, "grad_norm_var": 4.4911360651429345e-07, "learning_rate": 0.0010942609266537946, "loss": 2.4516, "step": 21782 }, { "crossentropy": 2.390983819961548, "epoch": 0.7896969257540604, "grad_norm": 0.027189774438738823, "grad_norm_var": 4.5159388706593876e-07, "learning_rate": 0.0010938981531606812, "loss": 2.5182, "step": 21783 }, { "crossentropy": 2.467012643814087, "epoch": 0.7897331786542924, "grad_norm": 0.027204088866710663, "grad_norm_var": 4.115885476310323e-07, "learning_rate": 0.001093535432425401, "loss": 2.5064, "step": 21784 }, { "crossentropy": 2.4797961711883545, "epoch": 0.7897694315545244, "grad_norm": 0.027617711573839188, "grad_norm_var": 3.6560941442952537e-07, "learning_rate": 0.0010931727644528544, "loss": 2.5625, "step": 21785 }, { "crossentropy": 2.58185076713562, "epoch": 0.7898056844547564, "grad_norm": 0.026659172028303146, "grad_norm_var": 3.490574045065623e-07, "learning_rate": 0.001092810149247937, "loss": 2.5611, "step": 21786 }, { "crossentropy": 2.336325168609619, "epoch": 0.7898419373549884, "grad_norm": 0.025607122108340263, "grad_norm_var": 4.1249834827985536e-07, "learning_rate": 0.0010924475868155491, "loss": 2.4488, "step": 21787 }, { "crossentropy": 2.434252977371216, "epoch": 0.7898781902552204, "grad_norm": 0.025866232812404633, "grad_norm_var": 4.246487928282842e-07, "learning_rate": 0.0010920850771605857, "loss": 2.5159, "step": 21788 }, { "crossentropy": 2.441087245941162, "epoch": 0.7899144431554525, "grad_norm": 0.027733124792575836, "grad_norm_var": 4.5640819993860285e-07, "learning_rate": 0.001091722620287945, "loss": 2.4316, "step": 21789 }, { "crossentropy": 2.580927610397339, "epoch": 0.7899506960556845, "grad_norm": 0.02674020826816559, "grad_norm_var": 4.562014345986838e-07, "learning_rate": 0.001091360216202521, "loss": 2.4849, "step": 21790 }, { "crossentropy": 2.500649929046631, "epoch": 0.7899869489559165, "grad_norm": 0.02694579027593136, "grad_norm_var": 4.525162315564361e-07, "learning_rate": 0.0010909978649092079, "loss": 2.5769, "step": 21791 }, { "crossentropy": 2.4055638313293457, "epoch": 0.7900232018561485, "grad_norm": 0.026518210768699646, "grad_norm_var": 4.0886095111098787e-07, "learning_rate": 0.0010906355664129003, "loss": 2.4371, "step": 21792 }, { "crossentropy": 2.4512321949005127, "epoch": 0.7900594547563805, "grad_norm": 0.02667098119854927, "grad_norm_var": 4.0783545495931956e-07, "learning_rate": 0.0010902733207184933, "loss": 2.4318, "step": 21793 }, { "crossentropy": 2.5262186527252197, "epoch": 0.7900957076566125, "grad_norm": 0.02638133242726326, "grad_norm_var": 3.9867095364617965e-07, "learning_rate": 0.0010899111278308777, "loss": 2.5074, "step": 21794 }, { "crossentropy": 2.52173113822937, "epoch": 0.7901319605568445, "grad_norm": 0.025248974561691284, "grad_norm_var": 4.780896224890978e-07, "learning_rate": 0.001089548987754947, "loss": 2.4761, "step": 21795 }, { "crossentropy": 2.400564193725586, "epoch": 0.7901682134570766, "grad_norm": 0.02813933789730072, "grad_norm_var": 5.826847737975168e-07, "learning_rate": 0.00108918690049559, "loss": 2.4203, "step": 21796 }, { "crossentropy": 2.527482509613037, "epoch": 0.7902044663573086, "grad_norm": 0.027373583987355232, "grad_norm_var": 6.06082652293329e-07, "learning_rate": 0.0010888248660577006, "loss": 2.5401, "step": 21797 }, { "crossentropy": 2.5349252223968506, "epoch": 0.7902407192575406, "grad_norm": 0.02743682824075222, "grad_norm_var": 6.254529959609136e-07, "learning_rate": 0.0010884628844461664, "loss": 2.4156, "step": 21798 }, { "crossentropy": 2.369312047958374, "epoch": 0.7902769721577726, "grad_norm": 0.026950685307383537, "grad_norm_var": 6.176611842014506e-07, "learning_rate": 0.0010881009556658761, "loss": 2.3867, "step": 21799 }, { "crossentropy": 2.6391372680664062, "epoch": 0.7903132250580046, "grad_norm": 0.027557486668229103, "grad_norm_var": 6.436433447428701e-07, "learning_rate": 0.0010877390797217185, "loss": 2.5587, "step": 21800 }, { "crossentropy": 2.5364766120910645, "epoch": 0.7903494779582366, "grad_norm": 0.02891010232269764, "grad_norm_var": 8.81976707843438e-07, "learning_rate": 0.001087377256618583, "loss": 2.4158, "step": 21801 }, { "crossentropy": 2.598292827606201, "epoch": 0.7903857308584686, "grad_norm": 0.02595672197639942, "grad_norm_var": 9.373578500448954e-07, "learning_rate": 0.001087015486361354, "loss": 2.5139, "step": 21802 }, { "crossentropy": 2.447404146194458, "epoch": 0.7904219837587007, "grad_norm": 0.027582138776779175, "grad_norm_var": 8.46669283729529e-07, "learning_rate": 0.00108665376895492, "loss": 2.3921, "step": 21803 }, { "crossentropy": 2.629784345626831, "epoch": 0.7904582366589327, "grad_norm": 0.027254445478320122, "grad_norm_var": 7.571247046864645e-07, "learning_rate": 0.0010862921044041646, "loss": 2.5297, "step": 21804 }, { "crossentropy": 2.3772575855255127, "epoch": 0.7904944895591647, "grad_norm": 0.028973929584026337, "grad_norm_var": 9.601625525081093e-07, "learning_rate": 0.0010859304927139735, "loss": 2.49, "step": 21805 }, { "crossentropy": 2.487901210784912, "epoch": 0.7905307424593968, "grad_norm": 0.02658936195075512, "grad_norm_var": 9.701294373656942e-07, "learning_rate": 0.0010855689338892338, "loss": 2.5078, "step": 21806 }, { "crossentropy": 2.560053825378418, "epoch": 0.7905669953596288, "grad_norm": 0.02615540288388729, "grad_norm_var": 1.0312867976615952e-06, "learning_rate": 0.0010852074279348234, "loss": 2.5143, "step": 21807 }, { "crossentropy": 2.5078845024108887, "epoch": 0.7906032482598608, "grad_norm": 0.026572084054350853, "grad_norm_var": 1.027244459629537e-06, "learning_rate": 0.0010848459748556273, "loss": 2.4875, "step": 21808 }, { "crossentropy": 2.387274742126465, "epoch": 0.7906395011600929, "grad_norm": 0.02668789215385914, "grad_norm_var": 1.026273366908414e-06, "learning_rate": 0.001084484574656529, "loss": 2.4434, "step": 21809 }, { "crossentropy": 2.4617464542388916, "epoch": 0.7906757540603249, "grad_norm": 0.02805517613887787, "grad_norm_var": 1.0386157102037213e-06, "learning_rate": 0.001084123227342408, "loss": 2.4941, "step": 21810 }, { "crossentropy": 2.6406147480010986, "epoch": 0.7907120069605569, "grad_norm": 0.028034372255206108, "grad_norm_var": 7.932668257848788e-07, "learning_rate": 0.0010837619329181459, "loss": 2.6432, "step": 21811 }, { "crossentropy": 2.3695359230041504, "epoch": 0.7907482598607889, "grad_norm": 0.025639373809099197, "grad_norm_var": 9.338871803855897e-07, "learning_rate": 0.0010834006913886208, "loss": 2.425, "step": 21812 }, { "crossentropy": 2.526834011077881, "epoch": 0.7907845127610209, "grad_norm": 0.026882296428084373, "grad_norm_var": 9.397699332983414e-07, "learning_rate": 0.0010830395027587132, "loss": 2.4745, "step": 21813 }, { "crossentropy": 2.4503543376922607, "epoch": 0.7908207656612529, "grad_norm": 0.028233934193849564, "grad_norm_var": 1.0043969489303596e-06, "learning_rate": 0.0010826783670333034, "loss": 2.4464, "step": 21814 }, { "crossentropy": 2.3619847297668457, "epoch": 0.7908570185614849, "grad_norm": 0.026880411431193352, "grad_norm_var": 1.0075308669804286e-06, "learning_rate": 0.0010823172842172651, "loss": 2.4135, "step": 21815 }, { "crossentropy": 2.579225778579712, "epoch": 0.790893271461717, "grad_norm": 0.027063684538006783, "grad_norm_var": 1.0023824011627514e-06, "learning_rate": 0.0010819562543154771, "loss": 2.5443, "step": 21816 }, { "crossentropy": 2.4930419921875, "epoch": 0.790929524361949, "grad_norm": 0.026544509455561638, "grad_norm_var": 8.180955671189023e-07, "learning_rate": 0.0010815952773328163, "loss": 2.466, "step": 21817 }, { "crossentropy": 2.435964822769165, "epoch": 0.790965777262181, "grad_norm": 0.026737071573734283, "grad_norm_var": 7.404146161936349e-07, "learning_rate": 0.001081234353274157, "loss": 2.5543, "step": 21818 }, { "crossentropy": 2.4738845825195312, "epoch": 0.791002030162413, "grad_norm": 0.025730811059474945, "grad_norm_var": 8.400287314001715e-07, "learning_rate": 0.001080873482144376, "loss": 2.4553, "step": 21819 }, { "crossentropy": 2.352614641189575, "epoch": 0.791038283062645, "grad_norm": 0.026522187516093254, "grad_norm_var": 8.489108011210921e-07, "learning_rate": 0.0010805126639483447, "loss": 2.3082, "step": 21820 }, { "crossentropy": 2.588932514190674, "epoch": 0.791074535962877, "grad_norm": 0.029634961858391762, "grad_norm_var": 1.0540407741309177e-06, "learning_rate": 0.0010801518986909375, "loss": 2.4837, "step": 21821 }, { "crossentropy": 2.7761504650115967, "epoch": 0.791110788863109, "grad_norm": 0.027360083535313606, "grad_norm_var": 1.0492023882375934e-06, "learning_rate": 0.0010797911863770293, "loss": 2.5964, "step": 21822 }, { "crossentropy": 2.5054409503936768, "epoch": 0.791147041763341, "grad_norm": 0.027546418830752373, "grad_norm_var": 1.0049775036224112e-06, "learning_rate": 0.0010794305270114908, "loss": 2.4591, "step": 21823 }, { "crossentropy": 2.523350954055786, "epoch": 0.7911832946635731, "grad_norm": 0.029833147302269936, "grad_norm_var": 1.425819085335691e-06, "learning_rate": 0.0010790699205991906, "loss": 2.5133, "step": 21824 }, { "crossentropy": 2.422396421432495, "epoch": 0.7912195475638051, "grad_norm": 0.02634229138493538, "grad_norm_var": 1.463178711005412e-06, "learning_rate": 0.0010787093671450027, "loss": 2.5513, "step": 21825 }, { "crossentropy": 2.547541618347168, "epoch": 0.7912558004640371, "grad_norm": 0.027018895372748375, "grad_norm_var": 1.4280316843384334e-06, "learning_rate": 0.0010783488666537943, "loss": 2.5586, "step": 21826 }, { "crossentropy": 2.5730369091033936, "epoch": 0.7912920533642691, "grad_norm": 0.027361538261175156, "grad_norm_var": 1.3859837616531437e-06, "learning_rate": 0.0010779884191304374, "loss": 2.5634, "step": 21827 }, { "crossentropy": 2.5608723163604736, "epoch": 0.7913283062645011, "grad_norm": 0.02598179318010807, "grad_norm_var": 1.3216845651291825e-06, "learning_rate": 0.001077628024579797, "loss": 2.5085, "step": 21828 }, { "crossentropy": 2.493313789367676, "epoch": 0.7913645591647331, "grad_norm": 0.026114745065569878, "grad_norm_var": 1.394051407423375e-06, "learning_rate": 0.0010772676830067423, "loss": 2.4531, "step": 21829 }, { "crossentropy": 2.392512321472168, "epoch": 0.7914008120649652, "grad_norm": 0.02839294634759426, "grad_norm_var": 1.4179417295742103e-06, "learning_rate": 0.0010769073944161428, "loss": 2.5153, "step": 21830 }, { "crossentropy": 2.3217644691467285, "epoch": 0.7914370649651972, "grad_norm": 0.027296412736177444, "grad_norm_var": 1.411497508733853e-06, "learning_rate": 0.0010765471588128617, "loss": 2.4184, "step": 21831 }, { "crossentropy": 2.5309081077575684, "epoch": 0.7914733178654292, "grad_norm": 0.02643691375851631, "grad_norm_var": 1.448912205243881e-06, "learning_rate": 0.001076186976201764, "loss": 2.5615, "step": 21832 }, { "crossentropy": 2.4375290870666504, "epoch": 0.7915095707656613, "grad_norm": 0.025569725781679153, "grad_norm_var": 1.5906900513116036e-06, "learning_rate": 0.0010758268465877175, "loss": 2.4293, "step": 21833 }, { "crossentropy": 2.5211517810821533, "epoch": 0.7915458236658933, "grad_norm": 0.02720380201935768, "grad_norm_var": 1.5806307654281689e-06, "learning_rate": 0.0010754667699755826, "loss": 2.5479, "step": 21834 }, { "crossentropy": 2.488452672958374, "epoch": 0.7915820765661253, "grad_norm": 0.026141056790947914, "grad_norm_var": 1.5137030919371944e-06, "learning_rate": 0.0010751067463702268, "loss": 2.605, "step": 21835 }, { "crossentropy": 2.3916521072387695, "epoch": 0.7916183294663574, "grad_norm": 0.026130257174372673, "grad_norm_var": 1.55727724446784e-06, "learning_rate": 0.001074746775776509, "loss": 2.3954, "step": 21836 }, { "crossentropy": 2.536820888519287, "epoch": 0.7916545823665894, "grad_norm": 0.027501124888658524, "grad_norm_var": 1.1342329859568747e-06, "learning_rate": 0.0010743868581992928, "loss": 2.5776, "step": 21837 }, { "crossentropy": 2.5077896118164062, "epoch": 0.7916908352668214, "grad_norm": 0.0265872310847044, "grad_norm_var": 1.135947489849899e-06, "learning_rate": 0.0010740269936434404, "loss": 2.4762, "step": 21838 }, { "crossentropy": 2.5808935165405273, "epoch": 0.7917270881670534, "grad_norm": 0.02660158835351467, "grad_norm_var": 1.118639984911994e-06, "learning_rate": 0.0010736671821138116, "loss": 2.5649, "step": 21839 }, { "crossentropy": 2.417062759399414, "epoch": 0.7917633410672854, "grad_norm": 0.026699557900428772, "grad_norm_var": 5.098105438120295e-07, "learning_rate": 0.0010733074236152645, "loss": 2.4399, "step": 21840 }, { "crossentropy": 2.527973175048828, "epoch": 0.7917995939675174, "grad_norm": 0.02707682177424431, "grad_norm_var": 5.073973665068246e-07, "learning_rate": 0.001072947718152661, "loss": 2.5494, "step": 21841 }, { "crossentropy": 2.4834952354431152, "epoch": 0.7918358468677494, "grad_norm": 0.026143020018935204, "grad_norm_var": 5.247772858230429e-07, "learning_rate": 0.0010725880657308567, "loss": 2.4099, "step": 21842 }, { "crossentropy": 2.5237739086151123, "epoch": 0.7918720997679815, "grad_norm": 0.02618885599076748, "grad_norm_var": 5.076662877771896e-07, "learning_rate": 0.0010722284663547122, "loss": 2.4681, "step": 21843 }, { "crossentropy": 2.576340675354004, "epoch": 0.7919083526682135, "grad_norm": 0.028006315231323242, "grad_norm_var": 5.890985268989952e-07, "learning_rate": 0.001071868920029081, "loss": 2.5096, "step": 21844 }, { "crossentropy": 2.5016634464263916, "epoch": 0.7919446055684455, "grad_norm": 0.025574006140232086, "grad_norm_var": 6.535816248543948e-07, "learning_rate": 0.0010715094267588216, "loss": 2.482, "step": 21845 }, { "crossentropy": 2.433555841445923, "epoch": 0.7919808584686775, "grad_norm": 0.02792338840663433, "grad_norm_var": 5.627385155836424e-07, "learning_rate": 0.0010711499865487905, "loss": 2.5182, "step": 21846 }, { "crossentropy": 2.4113717079162598, "epoch": 0.7920171113689095, "grad_norm": 0.027968214824795723, "grad_norm_var": 6.450401015225514e-07, "learning_rate": 0.0010707905994038398, "loss": 2.4199, "step": 21847 }, { "crossentropy": 2.4544074535369873, "epoch": 0.7920533642691415, "grad_norm": 0.025042029097676277, "grad_norm_var": 8.219916237900502e-07, "learning_rate": 0.0010704312653288257, "loss": 2.3427, "step": 21848 }, { "crossentropy": 2.511462450027466, "epoch": 0.7920896171693735, "grad_norm": 0.025730790570378304, "grad_norm_var": 8.004714928818806e-07, "learning_rate": 0.0010700719843286012, "loss": 2.4067, "step": 21849 }, { "crossentropy": 2.3504772186279297, "epoch": 0.7921258700696056, "grad_norm": 0.02668910287320614, "grad_norm_var": 7.795295493607703e-07, "learning_rate": 0.0010697127564080167, "loss": 2.4065, "step": 21850 }, { "crossentropy": 2.4409992694854736, "epoch": 0.7921621229698376, "grad_norm": 0.02719033882021904, "grad_norm_var": 7.806064865634524e-07, "learning_rate": 0.0010693535815719274, "loss": 2.4841, "step": 21851 }, { "crossentropy": 2.4312872886657715, "epoch": 0.7921983758700696, "grad_norm": 0.027184808626770973, "grad_norm_var": 7.712966210227563e-07, "learning_rate": 0.0010689944598251816, "loss": 2.4383, "step": 21852 }, { "crossentropy": 2.4850661754608154, "epoch": 0.7922346287703016, "grad_norm": 0.026668967679142952, "grad_norm_var": 7.319797965219064e-07, "learning_rate": 0.001068635391172631, "loss": 2.4472, "step": 21853 }, { "crossentropy": 2.426378011703491, "epoch": 0.7922708816705336, "grad_norm": 0.02656511776149273, "grad_norm_var": 7.323566794106927e-07, "learning_rate": 0.0010682763756191267, "loss": 2.4464, "step": 21854 }, { "crossentropy": 2.6733241081237793, "epoch": 0.7923071345707656, "grad_norm": 0.02741359919309616, "grad_norm_var": 7.625538121579234e-07, "learning_rate": 0.0010679174131695158, "loss": 2.6444, "step": 21855 }, { "crossentropy": 2.531775712966919, "epoch": 0.7923433874709976, "grad_norm": 0.02674269489943981, "grad_norm_var": 7.623566469514096e-07, "learning_rate": 0.0010675585038286478, "loss": 2.432, "step": 21856 }, { "crossentropy": 2.3978281021118164, "epoch": 0.7923796403712297, "grad_norm": 0.02677658572793007, "grad_norm_var": 7.551777375447751e-07, "learning_rate": 0.0010671996476013708, "loss": 2.4532, "step": 21857 }, { "crossentropy": 2.483421564102173, "epoch": 0.7924158932714617, "grad_norm": 0.02544443868100643, "grad_norm_var": 8.410966963652884e-07, "learning_rate": 0.0010668408444925292, "loss": 2.3954, "step": 21858 }, { "crossentropy": 2.4212629795074463, "epoch": 0.7924521461716937, "grad_norm": 0.026782285422086716, "grad_norm_var": 8.231116407926448e-07, "learning_rate": 0.001066482094506972, "loss": 2.4227, "step": 21859 }, { "crossentropy": 2.475795269012451, "epoch": 0.7924883990719258, "grad_norm": 0.025703376159071922, "grad_norm_var": 7.631140000464622e-07, "learning_rate": 0.0010661233976495422, "loss": 2.5149, "step": 21860 }, { "crossentropy": 2.502192497253418, "epoch": 0.7925246519721578, "grad_norm": 0.027475683018565178, "grad_norm_var": 7.321630987165244e-07, "learning_rate": 0.0010657647539250854, "loss": 2.5187, "step": 21861 }, { "crossentropy": 2.447929859161377, "epoch": 0.7925609048723898, "grad_norm": 0.030172258615493774, "grad_norm_var": 1.4131832037908893e-06, "learning_rate": 0.0010654061633384477, "loss": 2.5007, "step": 21862 }, { "crossentropy": 2.651801824569702, "epoch": 0.7925971577726219, "grad_norm": 0.026479242369532585, "grad_norm_var": 1.329132547424787e-06, "learning_rate": 0.0010650476258944697, "loss": 2.5728, "step": 21863 }, { "crossentropy": 2.4723613262176514, "epoch": 0.7926334106728539, "grad_norm": 0.026652341708540916, "grad_norm_var": 1.1236632421649743e-06, "learning_rate": 0.0010646891415979959, "loss": 2.5613, "step": 21864 }, { "crossentropy": 2.3657588958740234, "epoch": 0.7926696635730859, "grad_norm": 0.02608495205640793, "grad_norm_var": 1.078440445080366e-06, "learning_rate": 0.0010643307104538674, "loss": 2.4866, "step": 21865 }, { "crossentropy": 2.3334312438964844, "epoch": 0.7927059164733179, "grad_norm": 0.026245728135108948, "grad_norm_var": 1.1018116814330425e-06, "learning_rate": 0.0010639723324669236, "loss": 2.4518, "step": 21866 }, { "crossentropy": 2.437252998352051, "epoch": 0.7927421693735499, "grad_norm": 0.027265984565019608, "grad_norm_var": 1.105613097802345e-06, "learning_rate": 0.0010636140076420086, "loss": 2.4881, "step": 21867 }, { "crossentropy": 2.409876823425293, "epoch": 0.7927784222737819, "grad_norm": 0.026350565254688263, "grad_norm_var": 1.1122728061981057e-06, "learning_rate": 0.0010632557359839584, "loss": 2.4533, "step": 21868 }, { "crossentropy": 2.3605895042419434, "epoch": 0.7928146751740139, "grad_norm": 0.02641289122402668, "grad_norm_var": 1.1208959925945265e-06, "learning_rate": 0.0010628975174976135, "loss": 2.3648, "step": 21869 }, { "crossentropy": 2.4046177864074707, "epoch": 0.792850928074246, "grad_norm": 0.02644425816833973, "grad_norm_var": 1.1253600499943594e-06, "learning_rate": 0.001062539352187814, "loss": 2.4925, "step": 21870 }, { "crossentropy": 2.5164191722869873, "epoch": 0.792887180974478, "grad_norm": 0.027453968301415443, "grad_norm_var": 1.1288834221037513e-06, "learning_rate": 0.0010621812400593944, "loss": 2.5541, "step": 21871 }, { "crossentropy": 2.4075260162353516, "epoch": 0.79292343387471, "grad_norm": 0.02547943778336048, "grad_norm_var": 1.2349819058907978e-06, "learning_rate": 0.0010618231811171947, "loss": 2.4016, "step": 21872 }, { "crossentropy": 2.460386037826538, "epoch": 0.792959686774942, "grad_norm": 0.02666342817246914, "grad_norm_var": 1.234649323774278e-06, "learning_rate": 0.0010614651753660487, "loss": 2.5506, "step": 21873 }, { "crossentropy": 2.4597980976104736, "epoch": 0.792995939675174, "grad_norm": 0.027021469548344612, "grad_norm_var": 1.1272523490021835e-06, "learning_rate": 0.0010611072228107915, "loss": 2.4829, "step": 21874 }, { "crossentropy": 2.4506711959838867, "epoch": 0.793032192575406, "grad_norm": 0.025526557117700577, "grad_norm_var": 1.2275982887909044e-06, "learning_rate": 0.0010607493234562604, "loss": 2.507, "step": 21875 }, { "crossentropy": 2.4284448623657227, "epoch": 0.793068445475638, "grad_norm": 0.026764338836073875, "grad_norm_var": 1.15491435658463e-06, "learning_rate": 0.0010603914773072859, "loss": 2.4115, "step": 21876 }, { "crossentropy": 2.362337350845337, "epoch": 0.79310469837587, "grad_norm": 0.026236962527036667, "grad_norm_var": 1.1360505043098244e-06, "learning_rate": 0.0010600336843687031, "loss": 2.4578, "step": 21877 }, { "crossentropy": 2.5295522212982178, "epoch": 0.7931409512761021, "grad_norm": 0.027025654911994934, "grad_norm_var": 2.995199918095672e-07, "learning_rate": 0.0010596759446453458, "loss": 2.5325, "step": 21878 }, { "crossentropy": 2.66579532623291, "epoch": 0.7931772041763341, "grad_norm": 0.027523279190063477, "grad_norm_var": 3.638185079172148e-07, "learning_rate": 0.0010593182581420436, "loss": 2.6027, "step": 21879 }, { "crossentropy": 2.483253240585327, "epoch": 0.7932134570765661, "grad_norm": 0.02637801133096218, "grad_norm_var": 3.655829734631096e-07, "learning_rate": 0.001058960624863629, "loss": 2.4637, "step": 21880 }, { "crossentropy": 2.35050368309021, "epoch": 0.7932497099767981, "grad_norm": 0.026700768619775772, "grad_norm_var": 3.507026386406186e-07, "learning_rate": 0.0010586030448149326, "loss": 2.4077, "step": 21881 }, { "crossentropy": 2.3534929752349854, "epoch": 0.7932859628770301, "grad_norm": 0.02755201980471611, "grad_norm_var": 3.968096617836444e-07, "learning_rate": 0.0010582455180007822, "loss": 2.363, "step": 21882 }, { "crossentropy": 2.3592588901519775, "epoch": 0.7933222157772621, "grad_norm": 0.028894327580928802, "grad_norm_var": 6.908440422299817e-07, "learning_rate": 0.0010578880444260085, "loss": 2.4619, "step": 21883 }, { "crossentropy": 2.4344077110290527, "epoch": 0.7933584686774942, "grad_norm": 0.02669476717710495, "grad_norm_var": 6.786897563943253e-07, "learning_rate": 0.0010575306240954386, "loss": 2.4836, "step": 21884 }, { "crossentropy": 2.3853790760040283, "epoch": 0.7933947215777262, "grad_norm": 0.02687601000070572, "grad_norm_var": 6.682985678804389e-07, "learning_rate": 0.0010571732570138998, "loss": 2.3563, "step": 21885 }, { "crossentropy": 2.429763078689575, "epoch": 0.7934309744779582, "grad_norm": 0.025232741609215736, "grad_norm_var": 8.218936536056546e-07, "learning_rate": 0.0010568159431862218, "loss": 2.3919, "step": 21886 }, { "crossentropy": 2.365384101867676, "epoch": 0.7934672273781903, "grad_norm": 0.025422191247344017, "grad_norm_var": 8.895954719885842e-07, "learning_rate": 0.0010564586826172261, "loss": 2.4099, "step": 21887 }, { "crossentropy": 2.461646318435669, "epoch": 0.7935034802784223, "grad_norm": 0.025966770946979523, "grad_norm_var": 8.300353906975343e-07, "learning_rate": 0.0010561014753117427, "loss": 2.4757, "step": 21888 }, { "crossentropy": 2.276703357696533, "epoch": 0.7935397331786543, "grad_norm": 0.02531236782670021, "grad_norm_var": 9.425944849955481e-07, "learning_rate": 0.0010557443212745922, "loss": 2.4206, "step": 21889 }, { "crossentropy": 2.5497078895568848, "epoch": 0.7935759860788864, "grad_norm": 0.02622201107442379, "grad_norm_var": 9.344710812242505e-07, "learning_rate": 0.0010553872205106014, "loss": 2.5033, "step": 21890 }, { "crossentropy": 2.4547817707061768, "epoch": 0.7936122389791184, "grad_norm": 0.024723293259739876, "grad_norm_var": 1.0812564719366845e-06, "learning_rate": 0.0010550301730245927, "loss": 2.4648, "step": 21891 }, { "crossentropy": 2.5241973400115967, "epoch": 0.7936484918793504, "grad_norm": 0.025849420577287674, "grad_norm_var": 1.0977096109643225e-06, "learning_rate": 0.001054673178821387, "loss": 2.5168, "step": 21892 }, { "crossentropy": 2.38423752784729, "epoch": 0.7936847447795824, "grad_norm": 0.02762468159198761, "grad_norm_var": 1.1854677588744028e-06, "learning_rate": 0.0010543162379058068, "loss": 2.4618, "step": 21893 }, { "crossentropy": 2.5313355922698975, "epoch": 0.7937209976798144, "grad_norm": 0.026095736771821976, "grad_norm_var": 1.1743259772971909e-06, "learning_rate": 0.001053959350282675, "loss": 2.4928, "step": 21894 }, { "crossentropy": 2.5636496543884277, "epoch": 0.7937572505800464, "grad_norm": 0.026529423892498016, "grad_norm_var": 1.0927457486630306e-06, "learning_rate": 0.0010536025159568097, "loss": 2.5747, "step": 21895 }, { "crossentropy": 2.398552894592285, "epoch": 0.7937935034802784, "grad_norm": 0.02686549536883831, "grad_norm_var": 1.1074911994254083e-06, "learning_rate": 0.0010532457349330321, "loss": 2.5071, "step": 21896 }, { "crossentropy": 2.477506399154663, "epoch": 0.7938297563805105, "grad_norm": 0.02578447386622429, "grad_norm_var": 1.1244574664492306e-06, "learning_rate": 0.001052889007216159, "loss": 2.4558, "step": 21897 }, { "crossentropy": 2.593583822250366, "epoch": 0.7938660092807425, "grad_norm": 0.026492677628993988, "grad_norm_var": 1.0252190156554656e-06, "learning_rate": 0.0010525323328110115, "loss": 2.4814, "step": 21898 }, { "crossentropy": 2.3171989917755127, "epoch": 0.7939022621809745, "grad_norm": 0.027687953785061836, "grad_norm_var": 6.967330075872969e-07, "learning_rate": 0.0010521757117224052, "loss": 2.416, "step": 21899 }, { "crossentropy": 2.575604200363159, "epoch": 0.7939385150812065, "grad_norm": 0.026930473744869232, "grad_norm_var": 7.154010806321072e-07, "learning_rate": 0.001051819143955155, "loss": 2.5291, "step": 21900 }, { "crossentropy": 2.5101065635681152, "epoch": 0.7939747679814385, "grad_norm": 0.025763723999261856, "grad_norm_var": 6.963226709265242e-07, "learning_rate": 0.0010514626295140795, "loss": 2.4689, "step": 21901 }, { "crossentropy": 2.458165407180786, "epoch": 0.7940110208816705, "grad_norm": 0.026757577434182167, "grad_norm_var": 6.5383943911908e-07, "learning_rate": 0.0010511061684039947, "loss": 2.524, "step": 21902 }, { "crossentropy": 2.6360232830047607, "epoch": 0.7940472737819025, "grad_norm": 0.026862062513828278, "grad_norm_var": 6.241519506456807e-07, "learning_rate": 0.0010507497606297117, "loss": 2.5887, "step": 21903 }, { "crossentropy": 2.450274705886841, "epoch": 0.7940835266821346, "grad_norm": 0.026310615241527557, "grad_norm_var": 6.143495890091698e-07, "learning_rate": 0.0010503934061960485, "loss": 2.4911, "step": 21904 }, { "crossentropy": 2.531233310699463, "epoch": 0.7941197795823666, "grad_norm": 0.028053168207406998, "grad_norm_var": 6.998145855806412e-07, "learning_rate": 0.0010500371051078146, "loss": 2.5546, "step": 21905 }, { "crossentropy": 2.4667165279388428, "epoch": 0.7941560324825986, "grad_norm": 0.027369588613510132, "grad_norm_var": 7.343013923799546e-07, "learning_rate": 0.0010496808573698235, "loss": 2.4201, "step": 21906 }, { "crossentropy": 2.405510187149048, "epoch": 0.7941922853828306, "grad_norm": 0.027084195986390114, "grad_norm_var": 4.899301768425401e-07, "learning_rate": 0.0010493246629868901, "loss": 2.4762, "step": 21907 }, { "crossentropy": 2.434972047805786, "epoch": 0.7942285382830626, "grad_norm": 0.026565587148070335, "grad_norm_var": 4.35625111662322e-07, "learning_rate": 0.00104896852196382, "loss": 2.4531, "step": 21908 }, { "crossentropy": 2.53902006149292, "epoch": 0.7942647911832946, "grad_norm": 0.026699963957071304, "grad_norm_var": 3.872154698526664e-07, "learning_rate": 0.001048612434305426, "loss": 2.5458, "step": 21909 }, { "crossentropy": 2.5428075790405273, "epoch": 0.7943010440835266, "grad_norm": 0.0260482095181942, "grad_norm_var": 3.914443594156171e-07, "learning_rate": 0.0010482564000165184, "loss": 2.482, "step": 21910 }, { "crossentropy": 2.5685601234436035, "epoch": 0.7943372969837587, "grad_norm": 0.028195662423968315, "grad_norm_var": 5.186669527150351e-07, "learning_rate": 0.0010479004191019043, "loss": 2.6523, "step": 21911 }, { "crossentropy": 2.5051350593566895, "epoch": 0.7943735498839907, "grad_norm": 0.026674438267946243, "grad_norm_var": 5.203489426783599e-07, "learning_rate": 0.0010475444915663934, "loss": 2.4742, "step": 21912 }, { "crossentropy": 2.4329915046691895, "epoch": 0.7944098027842227, "grad_norm": 0.027720976620912552, "grad_norm_var": 4.847653847676042e-07, "learning_rate": 0.001047188617414791, "loss": 2.5093, "step": 21913 }, { "crossentropy": 2.4198527336120605, "epoch": 0.7944460556844548, "grad_norm": 0.027097122743725777, "grad_norm_var": 4.7065816635756773e-07, "learning_rate": 0.0010468327966519054, "loss": 2.49, "step": 21914 }, { "crossentropy": 2.486027717590332, "epoch": 0.7944823085846868, "grad_norm": 0.02788998931646347, "grad_norm_var": 4.92042291969881e-07, "learning_rate": 0.0010464770292825444, "loss": 2.4874, "step": 21915 }, { "crossentropy": 2.399965763092041, "epoch": 0.7945185614849188, "grad_norm": 0.02641867846250534, "grad_norm_var": 5.132572318788566e-07, "learning_rate": 0.001046121315311508, "loss": 2.4123, "step": 21916 }, { "crossentropy": 2.494117259979248, "epoch": 0.7945548143851509, "grad_norm": 0.02646896429359913, "grad_norm_var": 4.3096348091136347e-07, "learning_rate": 0.0010457656547436035, "loss": 2.4235, "step": 21917 }, { "crossentropy": 2.407958507537842, "epoch": 0.7945910672853829, "grad_norm": 0.02616502158343792, "grad_norm_var": 4.73132375744476e-07, "learning_rate": 0.001045410047583636, "loss": 2.445, "step": 21918 }, { "crossentropy": 2.3498218059539795, "epoch": 0.7946273201856149, "grad_norm": 0.026082361117005348, "grad_norm_var": 5.230268011142813e-07, "learning_rate": 0.0010450544938364055, "loss": 2.5059, "step": 21919 }, { "crossentropy": 2.5082297325134277, "epoch": 0.7946635730858469, "grad_norm": 0.026141587644815445, "grad_norm_var": 5.387215862086379e-07, "learning_rate": 0.0010446989935067176, "loss": 2.495, "step": 21920 }, { "crossentropy": 2.533582925796509, "epoch": 0.7946998259860789, "grad_norm": 0.026737095788121223, "grad_norm_var": 4.4764244053081364e-07, "learning_rate": 0.0010443435465993707, "loss": 2.4442, "step": 21921 }, { "crossentropy": 2.50437068939209, "epoch": 0.7947360788863109, "grad_norm": 0.02993607521057129, "grad_norm_var": 1.0422679519254037e-06, "learning_rate": 0.001043988153119167, "loss": 2.4641, "step": 21922 }, { "crossentropy": 2.4057228565216064, "epoch": 0.7947723317865429, "grad_norm": 0.02735835500061512, "grad_norm_var": 1.0502126193560427e-06, "learning_rate": 0.0010436328130709095, "loss": 2.467, "step": 21923 }, { "crossentropy": 2.3384897708892822, "epoch": 0.794808584686775, "grad_norm": 0.026579542085528374, "grad_norm_var": 1.0493932281425976e-06, "learning_rate": 0.0010432775264593924, "loss": 2.4162, "step": 21924 }, { "crossentropy": 2.5476300716400146, "epoch": 0.794844837587007, "grad_norm": 0.027028853073716164, "grad_norm_var": 1.042409946505141e-06, "learning_rate": 0.0010429222932894168, "loss": 2.4495, "step": 21925 }, { "crossentropy": 2.495893955230713, "epoch": 0.794881090487239, "grad_norm": 0.02657434716820717, "grad_norm_var": 9.905610607553826e-07, "learning_rate": 0.0010425671135657817, "loss": 2.4404, "step": 21926 }, { "crossentropy": 2.5724847316741943, "epoch": 0.794917343387471, "grad_norm": 0.027468938380479813, "grad_norm_var": 9.141878284092571e-07, "learning_rate": 0.0010422119872932822, "loss": 2.5394, "step": 21927 }, { "crossentropy": 2.5004868507385254, "epoch": 0.794953596287703, "grad_norm": 0.02511601150035858, "grad_norm_var": 1.1380757714024118e-06, "learning_rate": 0.0010418569144767175, "loss": 2.418, "step": 21928 }, { "crossentropy": 2.499939441680908, "epoch": 0.794989849187935, "grad_norm": 0.02596196159720421, "grad_norm_var": 1.1445387844681114e-06, "learning_rate": 0.0010415018951208804, "loss": 2.5114, "step": 21929 }, { "crossentropy": 2.316965103149414, "epoch": 0.795026102088167, "grad_norm": 0.026170725002884865, "grad_norm_var": 1.1632128331585944e-06, "learning_rate": 0.0010411469292305674, "loss": 2.4236, "step": 21930 }, { "crossentropy": 2.4009997844696045, "epoch": 0.7950623549883991, "grad_norm": 0.027348557487130165, "grad_norm_var": 1.0996822008618821e-06, "learning_rate": 0.0010407920168105744, "loss": 2.4938, "step": 21931 }, { "crossentropy": 2.552460193634033, "epoch": 0.7950986078886311, "grad_norm": 0.027401719242334366, "grad_norm_var": 1.1202817045077948e-06, "learning_rate": 0.001040437157865693, "loss": 2.531, "step": 21932 }, { "crossentropy": 2.4885687828063965, "epoch": 0.7951348607888631, "grad_norm": 0.026263069361448288, "grad_norm_var": 1.1315731532559234e-06, "learning_rate": 0.0010400823524007157, "loss": 2.4613, "step": 21933 }, { "crossentropy": 2.3309712409973145, "epoch": 0.7951711136890951, "grad_norm": 0.026694942265748978, "grad_norm_var": 1.1063159377200921e-06, "learning_rate": 0.001039727600420437, "loss": 2.4055, "step": 21934 }, { "crossentropy": 2.4573826789855957, "epoch": 0.7952073665893271, "grad_norm": 0.026718322187662125, "grad_norm_var": 1.0704018592151235e-06, "learning_rate": 0.0010393729019296455, "loss": 2.4761, "step": 21935 }, { "crossentropy": 2.3881759643554688, "epoch": 0.7952436194895591, "grad_norm": 0.02711113542318344, "grad_norm_var": 1.0383817977511918e-06, "learning_rate": 0.0010390182569331352, "loss": 2.4546, "step": 21936 }, { "crossentropy": 2.574425220489502, "epoch": 0.7952798723897911, "grad_norm": 0.02651548758149147, "grad_norm_var": 1.0463932663068067e-06, "learning_rate": 0.0010386636654356924, "loss": 2.4726, "step": 21937 }, { "crossentropy": 2.646144151687622, "epoch": 0.7953161252900232, "grad_norm": 0.027201635763049126, "grad_norm_var": 4.033245467525865e-07, "learning_rate": 0.0010383091274421086, "loss": 2.4778, "step": 21938 }, { "crossentropy": 2.366506338119507, "epoch": 0.7953523781902552, "grad_norm": 0.026691485196352005, "grad_norm_var": 3.743237496225584e-07, "learning_rate": 0.0010379546429571734, "loss": 2.4686, "step": 21939 }, { "crossentropy": 2.486795425415039, "epoch": 0.7953886310904872, "grad_norm": 0.02596353180706501, "grad_norm_var": 4.0612085029655534e-07, "learning_rate": 0.0010376002119856737, "loss": 2.4461, "step": 21940 }, { "crossentropy": 2.497431755065918, "epoch": 0.7954248839907193, "grad_norm": 0.027172280475497246, "grad_norm_var": 4.148539435796927e-07, "learning_rate": 0.0010372458345323944, "loss": 2.449, "step": 21941 }, { "crossentropy": 2.49336576461792, "epoch": 0.7954611368909513, "grad_norm": 0.02613200806081295, "grad_norm_var": 4.3144954500149804e-07, "learning_rate": 0.0010368915106021253, "loss": 2.4345, "step": 21942 }, { "crossentropy": 2.4567015171051025, "epoch": 0.7954973897911833, "grad_norm": 0.02634752355515957, "grad_norm_var": 3.832231792640402e-07, "learning_rate": 0.0010365372401996498, "loss": 2.4332, "step": 21943 }, { "crossentropy": 2.583967924118042, "epoch": 0.7955336426914154, "grad_norm": 0.02605312317609787, "grad_norm_var": 2.58853818042035e-07, "learning_rate": 0.0010361830233297541, "loss": 2.5345, "step": 21944 }, { "crossentropy": 2.340254545211792, "epoch": 0.7955698955916474, "grad_norm": 0.02701026387512684, "grad_norm_var": 2.3706786431709758e-07, "learning_rate": 0.001035828859997221, "loss": 2.4701, "step": 21945 }, { "crossentropy": 2.439119577407837, "epoch": 0.7956061484918794, "grad_norm": 0.026538316160440445, "grad_norm_var": 2.208102976684976e-07, "learning_rate": 0.0010354747502068352, "loss": 2.4585, "step": 21946 }, { "crossentropy": 2.4455268383026123, "epoch": 0.7956424013921114, "grad_norm": 0.02611089125275612, "grad_norm_var": 2.0914507251005437e-07, "learning_rate": 0.0010351206939633805, "loss": 2.4812, "step": 21947 }, { "crossentropy": 2.4670217037200928, "epoch": 0.7956786542923434, "grad_norm": 0.026421044021844864, "grad_norm_var": 1.6708466138015456e-07, "learning_rate": 0.0010347666912716364, "loss": 2.4086, "step": 21948 }, { "crossentropy": 2.567756414413452, "epoch": 0.7957149071925754, "grad_norm": 0.026778757572174072, "grad_norm_var": 1.6335327556576037e-07, "learning_rate": 0.0010344127421363868, "loss": 2.5766, "step": 21949 }, { "crossentropy": 2.4911587238311768, "epoch": 0.7957511600928074, "grad_norm": 0.025921791791915894, "grad_norm_var": 1.9002890414606076e-07, "learning_rate": 0.0010340588465624111, "loss": 2.5556, "step": 21950 }, { "crossentropy": 2.433053731918335, "epoch": 0.7957874129930395, "grad_norm": 0.028291499242186546, "grad_norm_var": 3.814895991009383e-07, "learning_rate": 0.0010337050045544883, "loss": 2.4112, "step": 21951 }, { "crossentropy": 2.612154483795166, "epoch": 0.7958236658932715, "grad_norm": 0.029258884489536285, "grad_norm_var": 8.043368211334365e-07, "learning_rate": 0.0010333512161173996, "loss": 2.584, "step": 21952 }, { "crossentropy": 2.5220248699188232, "epoch": 0.7958599187935035, "grad_norm": 0.027759097516536713, "grad_norm_var": 8.578778746371468e-07, "learning_rate": 0.0010329974812559207, "loss": 2.5506, "step": 21953 }, { "crossentropy": 2.521064043045044, "epoch": 0.7958961716937355, "grad_norm": 0.02716629020869732, "grad_norm_var": 8.56314144133729e-07, "learning_rate": 0.0010326437999748311, "loss": 2.4633, "step": 21954 }, { "crossentropy": 2.3697097301483154, "epoch": 0.7959324245939675, "grad_norm": 0.026712719351053238, "grad_norm_var": 8.558905636054376e-07, "learning_rate": 0.0010322901722789092, "loss": 2.4106, "step": 21955 }, { "crossentropy": 2.3738179206848145, "epoch": 0.7959686774941995, "grad_norm": 0.026431040838360786, "grad_norm_var": 8.141451397752069e-07, "learning_rate": 0.0010319365981729283, "loss": 2.4831, "step": 21956 }, { "crossentropy": 2.632532835006714, "epoch": 0.7960049303944315, "grad_norm": 0.027667328715324402, "grad_norm_var": 8.486492505190823e-07, "learning_rate": 0.001031583077661667, "loss": 2.5969, "step": 21957 }, { "crossentropy": 2.567763328552246, "epoch": 0.7960411832946636, "grad_norm": 0.028499945998191833, "grad_norm_var": 9.526625755682521e-07, "learning_rate": 0.001031229610749898, "loss": 2.5142, "step": 21958 }, { "crossentropy": 2.55448317527771, "epoch": 0.7960774361948956, "grad_norm": 0.027017919346690178, "grad_norm_var": 9.170189025908681e-07, "learning_rate": 0.0010308761974423947, "loss": 2.5349, "step": 21959 }, { "crossentropy": 2.673722743988037, "epoch": 0.7961136890951276, "grad_norm": 0.02839123085141182, "grad_norm_var": 9.315709890983139e-07, "learning_rate": 0.001030522837743933, "loss": 2.5339, "step": 21960 }, { "crossentropy": 2.4118800163269043, "epoch": 0.7961499419953596, "grad_norm": 0.02559107169508934, "grad_norm_var": 1.1025450989372739e-06, "learning_rate": 0.0010301695316592835, "loss": 2.4427, "step": 21961 }, { "crossentropy": 2.4944591522216797, "epoch": 0.7961861948955916, "grad_norm": 0.026180274784564972, "grad_norm_var": 1.1402291948419778e-06, "learning_rate": 0.001029816279193218, "loss": 2.4598, "step": 21962 }, { "crossentropy": 2.4835686683654785, "epoch": 0.7962224477958236, "grad_norm": 0.02663550339639187, "grad_norm_var": 1.0856217198243102e-06, "learning_rate": 0.0010294630803505105, "loss": 2.483, "step": 21963 }, { "crossentropy": 2.4642794132232666, "epoch": 0.7962587006960556, "grad_norm": 0.026711609214544296, "grad_norm_var": 1.0618717526548881e-06, "learning_rate": 0.0010291099351359284, "loss": 2.3635, "step": 21964 }, { "crossentropy": 2.3774960041046143, "epoch": 0.7962949535962877, "grad_norm": 0.026956742629408836, "grad_norm_var": 1.0541294680322296e-06, "learning_rate": 0.0010287568435542438, "loss": 2.42, "step": 21965 }, { "crossentropy": 2.5183799266815186, "epoch": 0.7963312064965197, "grad_norm": 0.027092115953564644, "grad_norm_var": 9.403461624985042e-07, "learning_rate": 0.0010284038056102246, "loss": 2.4301, "step": 21966 }, { "crossentropy": 2.5910544395446777, "epoch": 0.7963674593967517, "grad_norm": 0.026885904371738434, "grad_norm_var": 8.728922255557895e-07, "learning_rate": 0.001028050821308638, "loss": 2.5573, "step": 21967 }, { "crossentropy": 2.4562127590179443, "epoch": 0.7964037122969838, "grad_norm": 0.027350809425115585, "grad_norm_var": 5.727852583658118e-07, "learning_rate": 0.0010276978906542539, "loss": 2.3972, "step": 21968 }, { "crossentropy": 2.539571762084961, "epoch": 0.7964399651972158, "grad_norm": 0.026470020413398743, "grad_norm_var": 5.57446554683381e-07, "learning_rate": 0.0010273450136518364, "loss": 2.4653, "step": 21969 }, { "crossentropy": 2.2871696949005127, "epoch": 0.7964762180974478, "grad_norm": 0.025612281635403633, "grad_norm_var": 6.708237587683848e-07, "learning_rate": 0.0010269921903061534, "loss": 2.3389, "step": 21970 }, { "crossentropy": 2.5222041606903076, "epoch": 0.7965124709976799, "grad_norm": 0.029022719711065292, "grad_norm_var": 9.503721809970995e-07, "learning_rate": 0.0010266394206219715, "loss": 2.5179, "step": 21971 }, { "crossentropy": 2.6377811431884766, "epoch": 0.7965487238979119, "grad_norm": 0.02598748356103897, "grad_norm_var": 9.982266336292968e-07, "learning_rate": 0.0010262867046040526, "loss": 2.6056, "step": 21972 }, { "crossentropy": 2.3687007427215576, "epoch": 0.7965849767981439, "grad_norm": 0.02596297487616539, "grad_norm_var": 1.0291657127904265e-06, "learning_rate": 0.0010259340422571633, "loss": 2.4118, "step": 21973 }, { "crossentropy": 2.4292221069335938, "epoch": 0.7966212296983759, "grad_norm": 0.02785729058086872, "grad_norm_var": 9.177152724038536e-07, "learning_rate": 0.0010255814335860657, "loss": 2.442, "step": 21974 }, { "crossentropy": 2.4153800010681152, "epoch": 0.7966574825986079, "grad_norm": 0.0263577401638031, "grad_norm_var": 9.308670692467392e-07, "learning_rate": 0.001025228878595521, "loss": 2.5017, "step": 21975 }, { "crossentropy": 2.525716781616211, "epoch": 0.7966937354988399, "grad_norm": 0.028380658477544785, "grad_norm_var": 9.286543923130482e-07, "learning_rate": 0.0010248763772902935, "loss": 2.473, "step": 21976 }, { "crossentropy": 2.4785690307617188, "epoch": 0.7967299883990719, "grad_norm": 0.02704598940908909, "grad_norm_var": 8.233405262417226e-07, "learning_rate": 0.0010245239296751418, "loss": 2.4624, "step": 21977 }, { "crossentropy": 2.465195417404175, "epoch": 0.796766241299304, "grad_norm": 0.027164706960320473, "grad_norm_var": 7.8853690509092e-07, "learning_rate": 0.0010241715357548276, "loss": 2.429, "step": 21978 }, { "crossentropy": 2.5317347049713135, "epoch": 0.796802494199536, "grad_norm": 0.027149803936481476, "grad_norm_var": 7.82239970483339e-07, "learning_rate": 0.0010238191955341115, "loss": 2.5691, "step": 21979 }, { "crossentropy": 2.4062540531158447, "epoch": 0.796838747099768, "grad_norm": 0.026966581121087074, "grad_norm_var": 7.764801241189375e-07, "learning_rate": 0.0010234669090177495, "loss": 2.4582, "step": 21980 }, { "crossentropy": 2.5348217487335205, "epoch": 0.796875, "grad_norm": 0.02645111456513405, "grad_norm_var": 7.964867801306859e-07, "learning_rate": 0.001023114676210503, "loss": 2.5032, "step": 21981 }, { "crossentropy": 2.53944730758667, "epoch": 0.796911252900232, "grad_norm": 0.027762439101934433, "grad_norm_var": 8.341538226378503e-07, "learning_rate": 0.0010227624971171285, "loss": 2.5543, "step": 21982 }, { "crossentropy": 2.5472543239593506, "epoch": 0.796947505800464, "grad_norm": 0.02856265939772129, "grad_norm_var": 9.783773118734438e-07, "learning_rate": 0.00102241037174238, "loss": 2.5148, "step": 21983 }, { "crossentropy": 2.4665563106536865, "epoch": 0.796983758700696, "grad_norm": 0.027266796678304672, "grad_norm_var": 9.76362698901628e-07, "learning_rate": 0.001022058300091016, "loss": 2.4976, "step": 21984 }, { "crossentropy": 2.5346808433532715, "epoch": 0.7970200116009281, "grad_norm": 0.02590900845825672, "grad_norm_var": 1.0451265122659078e-06, "learning_rate": 0.001021706282167793, "loss": 2.5463, "step": 21985 }, { "crossentropy": 2.379301071166992, "epoch": 0.7970562645011601, "grad_norm": 0.026905996724963188, "grad_norm_var": 8.946148438589238e-07, "learning_rate": 0.001021354317977462, "loss": 2.45, "step": 21986 }, { "crossentropy": 2.4978115558624268, "epoch": 0.7970925174013921, "grad_norm": 0.02583989128470421, "grad_norm_var": 7.424136208886706e-07, "learning_rate": 0.0010210024075247809, "loss": 2.525, "step": 21987 }, { "crossentropy": 2.5469112396240234, "epoch": 0.7971287703016241, "grad_norm": 0.026604261249303818, "grad_norm_var": 6.851274857099746e-07, "learning_rate": 0.0010206505508144987, "loss": 2.5586, "step": 21988 }, { "crossentropy": 2.447187900543213, "epoch": 0.7971650232018561, "grad_norm": 0.026813022792339325, "grad_norm_var": 6.114215621930239e-07, "learning_rate": 0.0010202987478513709, "loss": 2.4325, "step": 21989 }, { "crossentropy": 2.425917387008667, "epoch": 0.7972012761020881, "grad_norm": 0.026325659826397896, "grad_norm_var": 5.962142846321992e-07, "learning_rate": 0.0010199469986401472, "loss": 2.524, "step": 21990 }, { "crossentropy": 2.634563684463501, "epoch": 0.7972375290023201, "grad_norm": 0.028542472049593925, "grad_norm_var": 7.164291860851934e-07, "learning_rate": 0.0010195953031855798, "loss": 2.5085, "step": 21991 }, { "crossentropy": 2.522371530532837, "epoch": 0.7972737819025522, "grad_norm": 0.026340879499912262, "grad_norm_var": 6.297193499394185e-07, "learning_rate": 0.0010192436614924177, "loss": 2.5224, "step": 21992 }, { "crossentropy": 2.6045379638671875, "epoch": 0.7973100348027842, "grad_norm": 0.0297971423715353, "grad_norm_var": 1.1276366475271305e-06, "learning_rate": 0.0010188920735654118, "loss": 2.5396, "step": 21993 }, { "crossentropy": 2.41913104057312, "epoch": 0.7973462877030162, "grad_norm": 0.026284923776984215, "grad_norm_var": 1.174305468847001e-06, "learning_rate": 0.001018540539409309, "loss": 2.4271, "step": 21994 }, { "crossentropy": 2.487013578414917, "epoch": 0.7973825406032483, "grad_norm": 0.02537411078810692, "grad_norm_var": 1.3584372789051298e-06, "learning_rate": 0.0010181890590288596, "loss": 2.3838, "step": 21995 }, { "crossentropy": 2.5379421710968018, "epoch": 0.7974187935034803, "grad_norm": 0.02694377303123474, "grad_norm_var": 1.3585233266859374e-06, "learning_rate": 0.001017837632428808, "loss": 2.517, "step": 21996 }, { "crossentropy": 2.5778353214263916, "epoch": 0.7974550464037123, "grad_norm": 0.027338042855262756, "grad_norm_var": 1.3448176505866886e-06, "learning_rate": 0.0010174862596139024, "loss": 2.4543, "step": 21997 }, { "crossentropy": 2.5300095081329346, "epoch": 0.7974912993039444, "grad_norm": 0.027283838018774986, "grad_norm_var": 1.3129171430129386e-06, "learning_rate": 0.0010171349405888896, "loss": 2.5184, "step": 21998 }, { "crossentropy": 2.3904659748077393, "epoch": 0.7975275522041764, "grad_norm": 0.027944035828113556, "grad_norm_var": 1.2086254843597741e-06, "learning_rate": 0.0010167836753585142, "loss": 2.5054, "step": 21999 }, { "crossentropy": 2.498110055923462, "epoch": 0.7975638051044084, "grad_norm": 0.025462530553340912, "grad_norm_var": 1.3405940843305794e-06, "learning_rate": 0.0010164324639275186, "loss": 2.4137, "step": 22000 }, { "crossentropy": 2.3076326847076416, "epoch": 0.7976000580046404, "grad_norm": 0.02579725719988346, "learning_rate": 0.0010160813063006485, "loss": 2.3953, "step": 22001 }, { "crossentropy": 2.3896899223327637, "epoch": 0.7976363109048724, "grad_norm": 0.02543928287923336, "learning_rate": 0.0010157302024826453, "loss": 2.4237, "step": 22002 }, { "crossentropy": 2.5781548023223877, "epoch": 0.7976725638051044, "grad_norm": 0.0268475953489542, "learning_rate": 0.0010153791524782536, "loss": 2.4654, "step": 22003 }, { "crossentropy": 2.374080181121826, "epoch": 0.7977088167053364, "grad_norm": 0.027372635900974274, "learning_rate": 0.001015028156292212, "loss": 2.414, "step": 22004 }, { "crossentropy": 2.3896524906158447, "epoch": 0.7977450696055685, "grad_norm": 0.025954674929380417, "learning_rate": 0.0010146772139292626, "loss": 2.4344, "step": 22005 }, { "crossentropy": 2.4180727005004883, "epoch": 0.7977813225058005, "grad_norm": 0.027799539268016815, "learning_rate": 0.0010143263253941464, "loss": 2.5009, "step": 22006 }, { "crossentropy": 2.3970892429351807, "epoch": 0.7978175754060325, "grad_norm": 0.026709845289587975, "learning_rate": 0.0010139754906916016, "loss": 2.4637, "step": 22007 }, { "crossentropy": 2.476264476776123, "epoch": 0.7978538283062645, "grad_norm": 0.028048954904079437, "learning_rate": 0.0010136247098263657, "loss": 2.4775, "step": 22008 }, { "crossentropy": 2.3795387744903564, "epoch": 0.7978900812064965, "grad_norm": 0.02878611907362938, "learning_rate": 0.001013273982803179, "loss": 2.4513, "step": 22009 }, { "crossentropy": 2.4783668518066406, "epoch": 0.7979263341067285, "grad_norm": 0.026020996272563934, "learning_rate": 0.0010129233096267754, "loss": 2.3769, "step": 22010 }, { "crossentropy": 2.414163827896118, "epoch": 0.7979625870069605, "grad_norm": 0.02701088786125183, "learning_rate": 0.001012572690301895, "loss": 2.5063, "step": 22011 }, { "crossentropy": 2.4634459018707275, "epoch": 0.7979988399071926, "grad_norm": 0.02606821246445179, "learning_rate": 0.0010122221248332701, "loss": 2.5113, "step": 22012 }, { "crossentropy": 2.3651227951049805, "epoch": 0.7980350928074246, "grad_norm": 0.027171459048986435, "learning_rate": 0.0010118716132256366, "loss": 2.4523, "step": 22013 }, { "crossentropy": 2.395070791244507, "epoch": 0.7980713457076566, "grad_norm": 0.02542257495224476, "learning_rate": 0.0010115211554837312, "loss": 2.4406, "step": 22014 }, { "crossentropy": 2.4116408824920654, "epoch": 0.7981075986078886, "grad_norm": 0.029865723103284836, "learning_rate": 0.0010111707516122837, "loss": 2.4989, "step": 22015 }, { "crossentropy": 2.4867265224456787, "epoch": 0.7981438515081206, "grad_norm": 0.027412842959165573, "grad_norm_var": 1.5249243590332676e-06, "learning_rate": 0.0010108204016160299, "loss": 2.4796, "step": 22016 }, { "crossentropy": 2.511167049407959, "epoch": 0.7981801044083526, "grad_norm": 0.028146328404545784, "grad_norm_var": 1.4984101530818768e-06, "learning_rate": 0.0010104701054997, "loss": 2.4637, "step": 22017 }, { "crossentropy": 2.4482994079589844, "epoch": 0.7982163573085846, "grad_norm": 0.025964053347706795, "grad_norm_var": 1.397333378124789e-06, "learning_rate": 0.0010101198632680246, "loss": 2.4077, "step": 22018 }, { "crossentropy": 2.6349244117736816, "epoch": 0.7982526102088167, "grad_norm": 0.027022721245884895, "grad_norm_var": 1.3918935688897807e-06, "learning_rate": 0.0010097696749257368, "loss": 2.5587, "step": 22019 }, { "crossentropy": 2.5193116664886475, "epoch": 0.7982888631090487, "grad_norm": 0.02680070698261261, "grad_norm_var": 1.3971594209016258e-06, "learning_rate": 0.0010094195404775636, "loss": 2.5228, "step": 22020 }, { "crossentropy": 2.414968490600586, "epoch": 0.7983251160092807, "grad_norm": 0.02665717713534832, "grad_norm_var": 1.3171791074881862e-06, "learning_rate": 0.0010090694599282353, "loss": 2.3367, "step": 22021 }, { "crossentropy": 2.3427464962005615, "epoch": 0.7983613689095128, "grad_norm": 0.025824271142482758, "grad_norm_var": 1.3983301172765799e-06, "learning_rate": 0.0010087194332824817, "loss": 2.4834, "step": 22022 }, { "crossentropy": 2.3928325176239014, "epoch": 0.7983976218097448, "grad_norm": 0.02693749964237213, "grad_norm_var": 1.3909921688449548e-06, "learning_rate": 0.0010083694605450277, "loss": 2.4415, "step": 22023 }, { "crossentropy": 2.4479286670684814, "epoch": 0.7984338747099768, "grad_norm": 0.02635856904089451, "grad_norm_var": 1.3495093116091024e-06, "learning_rate": 0.0010080195417206028, "loss": 2.3877, "step": 22024 }, { "crossentropy": 2.4050087928771973, "epoch": 0.7984701276102089, "grad_norm": 0.02555815689265728, "grad_norm_var": 1.217753414014105e-06, "learning_rate": 0.0010076696768139327, "loss": 2.3624, "step": 22025 }, { "crossentropy": 2.543288230895996, "epoch": 0.7985063805104409, "grad_norm": 0.02683419920504093, "grad_norm_var": 1.1783996904933623e-06, "learning_rate": 0.0010073198658297406, "loss": 2.4312, "step": 22026 }, { "crossentropy": 2.546621561050415, "epoch": 0.7985426334106729, "grad_norm": 0.02727593667805195, "grad_norm_var": 1.1896790383152112e-06, "learning_rate": 0.0010069701087727539, "loss": 2.5084, "step": 22027 }, { "crossentropy": 2.433633804321289, "epoch": 0.7985788863109049, "grad_norm": 0.02658143825829029, "grad_norm_var": 1.1538394574140318e-06, "learning_rate": 0.001006620405647694, "loss": 2.4568, "step": 22028 }, { "crossentropy": 2.5632171630859375, "epoch": 0.7986151392111369, "grad_norm": 0.02671538107097149, "grad_norm_var": 1.1481799025160235e-06, "learning_rate": 0.001006270756459286, "loss": 2.4929, "step": 22029 }, { "crossentropy": 2.5945422649383545, "epoch": 0.7986513921113689, "grad_norm": 0.026751955971121788, "grad_norm_var": 1.00808504263961e-06, "learning_rate": 0.0010059211612122537, "loss": 2.5715, "step": 22030 }, { "crossentropy": 2.4110074043273926, "epoch": 0.7986876450116009, "grad_norm": 0.025755513459444046, "grad_norm_var": 4.491637931743151e-07, "learning_rate": 0.0010055716199113157, "loss": 2.3662, "step": 22031 }, { "crossentropy": 2.3526618480682373, "epoch": 0.798723897911833, "grad_norm": 0.02659480832517147, "grad_norm_var": 4.0912456886754713e-07, "learning_rate": 0.0010052221325611965, "loss": 2.3517, "step": 22032 }, { "crossentropy": 2.4331045150756836, "epoch": 0.798760150812065, "grad_norm": 0.027546660974621773, "grad_norm_var": 3.0885501771567757e-07, "learning_rate": 0.0010048726991666141, "loss": 2.4297, "step": 22033 }, { "crossentropy": 2.497541904449463, "epoch": 0.798796403712297, "grad_norm": 0.026451384648680687, "grad_norm_var": 2.840855472413875e-07, "learning_rate": 0.0010045233197322877, "loss": 2.4377, "step": 22034 }, { "crossentropy": 2.426239252090454, "epoch": 0.798832656612529, "grad_norm": 0.026678109541535378, "grad_norm_var": 2.722752116892634e-07, "learning_rate": 0.0010041739942629386, "loss": 2.4111, "step": 22035 }, { "crossentropy": 2.569514274597168, "epoch": 0.798868909512761, "grad_norm": 0.027169669046998024, "grad_norm_var": 2.9151276531336145e-07, "learning_rate": 0.001003824722763282, "loss": 2.5218, "step": 22036 }, { "crossentropy": 2.5160269737243652, "epoch": 0.798905162412993, "grad_norm": 0.02757907286286354, "grad_norm_var": 3.50962141558384e-07, "learning_rate": 0.0010034755052380374, "loss": 2.4342, "step": 22037 }, { "crossentropy": 2.4134395122528076, "epoch": 0.798941415313225, "grad_norm": 0.026664147153496742, "grad_norm_var": 3.010929748380042e-07, "learning_rate": 0.0010031263416919217, "loss": 2.4741, "step": 22038 }, { "crossentropy": 2.5523693561553955, "epoch": 0.7989776682134571, "grad_norm": 0.027271095663309097, "grad_norm_var": 3.179102776672589e-07, "learning_rate": 0.001002777232129649, "loss": 2.4297, "step": 22039 }, { "crossentropy": 2.377107858657837, "epoch": 0.7990139211136891, "grad_norm": 0.026549167931079865, "grad_norm_var": 3.1057301064600177e-07, "learning_rate": 0.001002428176555938, "loss": 2.3997, "step": 22040 }, { "crossentropy": 2.2857112884521484, "epoch": 0.7990501740139211, "grad_norm": 0.02782227098941803, "grad_norm_var": 2.716051226451007e-07, "learning_rate": 0.0010020791749755, "loss": 2.354, "step": 22041 }, { "crossentropy": 2.481753349304199, "epoch": 0.7990864269141531, "grad_norm": 0.027146657928824425, "grad_norm_var": 2.7538018626907003e-07, "learning_rate": 0.001001730227393049, "loss": 2.4359, "step": 22042 }, { "crossentropy": 2.444882869720459, "epoch": 0.7991226798143851, "grad_norm": 0.02694699913263321, "grad_norm_var": 2.660748590144549e-07, "learning_rate": 0.0010013813338132999, "loss": 2.4442, "step": 22043 }, { "crossentropy": 2.556826591491699, "epoch": 0.7991589327146171, "grad_norm": 0.026116972789168358, "grad_norm_var": 2.986060663153859e-07, "learning_rate": 0.0010010324942409626, "loss": 2.5134, "step": 22044 }, { "crossentropy": 2.454988718032837, "epoch": 0.7991951856148491, "grad_norm": 0.02696818672120571, "grad_norm_var": 2.9772603304158616e-07, "learning_rate": 0.0010006837086807508, "loss": 2.3919, "step": 22045 }, { "crossentropy": 2.400203227996826, "epoch": 0.7992314385150812, "grad_norm": 0.027567308396100998, "grad_norm_var": 3.258133313300265e-07, "learning_rate": 0.0010003349771373754, "loss": 2.457, "step": 22046 }, { "crossentropy": 2.5437233448028564, "epoch": 0.7992676914153132, "grad_norm": 0.027104532346129417, "grad_norm_var": 2.2888442936265056e-07, "learning_rate": 0.0009999862996155446, "loss": 2.4532, "step": 22047 }, { "crossentropy": 2.4305074214935303, "epoch": 0.7993039443155452, "grad_norm": 0.026298336684703827, "grad_norm_var": 2.5083234485889724e-07, "learning_rate": 0.0009996376761199704, "loss": 2.3439, "step": 22048 }, { "crossentropy": 2.5680243968963623, "epoch": 0.7993401972157773, "grad_norm": 0.027500860393047333, "grad_norm_var": 2.475795499908469e-07, "learning_rate": 0.0009992891066553605, "loss": 2.4934, "step": 22049 }, { "crossentropy": 2.3781752586364746, "epoch": 0.7993764501160093, "grad_norm": 0.026971112936735153, "grad_norm_var": 2.2716005057021813e-07, "learning_rate": 0.000998940591226421, "loss": 2.4117, "step": 22050 }, { "crossentropy": 2.4200234413146973, "epoch": 0.7994127030162413, "grad_norm": 0.025711046531796455, "grad_norm_var": 3.2997271317279586e-07, "learning_rate": 0.0009985921298378614, "loss": 2.4742, "step": 22051 }, { "crossentropy": 2.424351930618286, "epoch": 0.7994489559164734, "grad_norm": 0.02601165696978569, "grad_norm_var": 3.816762475817674e-07, "learning_rate": 0.0009982437224943868, "loss": 2.3964, "step": 22052 }, { "crossentropy": 2.4665703773498535, "epoch": 0.7994852088167054, "grad_norm": 0.02713775262236595, "grad_norm_var": 3.532631731105462e-07, "learning_rate": 0.0009978953692007032, "loss": 2.4487, "step": 22053 }, { "crossentropy": 2.4616990089416504, "epoch": 0.7995214617169374, "grad_norm": 0.026432236656546593, "grad_norm_var": 3.6273493788877704e-07, "learning_rate": 0.0009975470699615174, "loss": 2.4663, "step": 22054 }, { "crossentropy": 2.4510204792022705, "epoch": 0.7995577146171694, "grad_norm": 0.026349876075983047, "grad_norm_var": 3.637161210781036e-07, "learning_rate": 0.0009971988247815316, "loss": 2.4551, "step": 22055 }, { "crossentropy": 2.46083402633667, "epoch": 0.7995939675174014, "grad_norm": 0.026374539360404015, "grad_norm_var": 3.7122224276207813e-07, "learning_rate": 0.000996850633665451, "loss": 2.4108, "step": 22056 }, { "crossentropy": 2.390183687210083, "epoch": 0.7996302204176334, "grad_norm": 0.02691020630300045, "grad_norm_var": 2.963150985956879e-07, "learning_rate": 0.0009965024966179764, "loss": 2.4409, "step": 22057 }, { "crossentropy": 2.3607890605926514, "epoch": 0.7996664733178654, "grad_norm": 0.026941128075122833, "grad_norm_var": 2.873115702858488e-07, "learning_rate": 0.0009961544136438123, "loss": 2.5038, "step": 22058 }, { "crossentropy": 2.5617189407348633, "epoch": 0.7997027262180975, "grad_norm": 0.026156805455684662, "grad_norm_var": 3.0125334221652436e-07, "learning_rate": 0.000995806384747659, "loss": 2.5249, "step": 22059 }, { "crossentropy": 2.467133045196533, "epoch": 0.7997389791183295, "grad_norm": 0.026905735954642296, "grad_norm_var": 2.830771498900462e-07, "learning_rate": 0.0009954584099342158, "loss": 2.4929, "step": 22060 }, { "crossentropy": 2.502659320831299, "epoch": 0.7997752320185615, "grad_norm": 0.026722947135567665, "grad_norm_var": 2.7835553342013486e-07, "learning_rate": 0.0009951104892081835, "loss": 2.5447, "step": 22061 }, { "crossentropy": 2.5346596240997314, "epoch": 0.7998114849187935, "grad_norm": 0.026818733662366867, "grad_norm_var": 2.26164017563798e-07, "learning_rate": 0.0009947626225742634, "loss": 2.4884, "step": 22062 }, { "crossentropy": 2.2930004596710205, "epoch": 0.7998477378190255, "grad_norm": 0.02603841945528984, "grad_norm_var": 2.3212391118589368e-07, "learning_rate": 0.0009944148100371507, "loss": 2.3496, "step": 22063 }, { "crossentropy": 2.452749252319336, "epoch": 0.7998839907192575, "grad_norm": 0.026386573910713196, "grad_norm_var": 2.292957403513332e-07, "learning_rate": 0.000994067051601546, "loss": 2.4445, "step": 22064 }, { "crossentropy": 2.485084056854248, "epoch": 0.7999202436194895, "grad_norm": 0.026483530178666115, "grad_norm_var": 1.698314482943223e-07, "learning_rate": 0.0009937193472721439, "loss": 2.4158, "step": 22065 }, { "crossentropy": 2.403686046600342, "epoch": 0.7999564965197216, "grad_norm": 0.028309863060712814, "grad_norm_var": 3.620104971212608e-07, "learning_rate": 0.0009933716970536427, "loss": 2.4136, "step": 22066 }, { "crossentropy": 2.6543960571289062, "epoch": 0.7999927494199536, "grad_norm": 0.027471818029880524, "grad_norm_var": 3.4574506719882667e-07, "learning_rate": 0.0009930241009507373, "loss": 2.5645, "step": 22067 }, { "crossentropy": 2.6361494064331055, "epoch": 0.8000290023201856, "grad_norm": 0.027013972401618958, "grad_norm_var": 3.144398691648046e-07, "learning_rate": 0.0009926765589681208, "loss": 2.5668, "step": 22068 }, { "crossentropy": 2.4732108116149902, "epoch": 0.8000652552204176, "grad_norm": 0.027986926957964897, "grad_norm_var": 4.001973564770328e-07, "learning_rate": 0.0009923290711104887, "loss": 2.535, "step": 22069 }, { "crossentropy": 2.549231767654419, "epoch": 0.8001015081206496, "grad_norm": 0.02633299119770527, "grad_norm_var": 4.060957350086967e-07, "learning_rate": 0.0009919816373825357, "loss": 2.4313, "step": 22070 }, { "crossentropy": 2.400240898132324, "epoch": 0.8001377610208816, "grad_norm": 0.025813449174165726, "grad_norm_var": 4.5808109745703165e-07, "learning_rate": 0.000991634257788952, "loss": 2.3975, "step": 22071 }, { "crossentropy": 2.4171979427337646, "epoch": 0.8001740139211136, "grad_norm": 0.02720191888511181, "grad_norm_var": 4.548428435442124e-07, "learning_rate": 0.000991286932334432, "loss": 2.4063, "step": 22072 }, { "crossentropy": 2.495241641998291, "epoch": 0.8002102668213457, "grad_norm": 0.025267234072089195, "grad_norm_var": 6.089264027333049e-07, "learning_rate": 0.0009909396610236637, "loss": 2.4568, "step": 22073 }, { "crossentropy": 2.7456889152526855, "epoch": 0.8002465197215777, "grad_norm": 0.035791512578725815, "grad_norm_var": 5.740960909350366e-06, "learning_rate": 0.000990592443861339, "loss": 2.6864, "step": 22074 }, { "crossentropy": 2.473853826522827, "epoch": 0.8002827726218097, "grad_norm": 0.028114283457398415, "grad_norm_var": 5.683664552647884e-06, "learning_rate": 0.000990245280852151, "loss": 2.4032, "step": 22075 }, { "crossentropy": 2.325685977935791, "epoch": 0.8003190255220418, "grad_norm": 0.026256505399942398, "grad_norm_var": 5.75420001148647e-06, "learning_rate": 0.0009898981720007827, "loss": 2.3799, "step": 22076 }, { "crossentropy": 2.5764198303222656, "epoch": 0.8003552784222738, "grad_norm": 0.02702045999467373, "grad_norm_var": 5.733839771173314e-06, "learning_rate": 0.000989551117311925, "loss": 2.4951, "step": 22077 }, { "crossentropy": 2.4860033988952637, "epoch": 0.8003915313225058, "grad_norm": 0.027389660477638245, "grad_norm_var": 5.7104008336768585e-06, "learning_rate": 0.0009892041167902665, "loss": 2.484, "step": 22078 }, { "crossentropy": 2.37322735786438, "epoch": 0.8004277842227379, "grad_norm": 0.028032371774315834, "grad_norm_var": 5.588939800511376e-06, "learning_rate": 0.0009888571704404926, "loss": 2.4399, "step": 22079 }, { "crossentropy": 2.41682767868042, "epoch": 0.8004640371229699, "grad_norm": 0.026211179792881012, "grad_norm_var": 5.618177042395154e-06, "learning_rate": 0.0009885102782672905, "loss": 2.5145, "step": 22080 }, { "crossentropy": 2.502943277359009, "epoch": 0.8005002900232019, "grad_norm": 0.02730448544025421, "grad_norm_var": 5.54426349766549e-06, "learning_rate": 0.0009881634402753438, "loss": 2.4827, "step": 22081 }, { "crossentropy": 2.395538806915283, "epoch": 0.8005365429234339, "grad_norm": 0.026814082637429237, "grad_norm_var": 5.541510973727734e-06, "learning_rate": 0.0009878166564693375, "loss": 2.459, "step": 22082 }, { "crossentropy": 2.419008731842041, "epoch": 0.8005727958236659, "grad_norm": 0.026332609355449677, "grad_norm_var": 5.627120880833241e-06, "learning_rate": 0.000987469926853959, "loss": 2.4254, "step": 22083 }, { "crossentropy": 2.388901948928833, "epoch": 0.8006090487238979, "grad_norm": 0.02543787658214569, "grad_norm_var": 5.869850194477275e-06, "learning_rate": 0.0009871232514338851, "loss": 2.4399, "step": 22084 }, { "crossentropy": 2.46968936920166, "epoch": 0.80064530162413, "grad_norm": 0.026927031576633453, "grad_norm_var": 5.847468134726483e-06, "learning_rate": 0.0009867766302138021, "loss": 2.4509, "step": 22085 }, { "crossentropy": 2.5289955139160156, "epoch": 0.800681554524362, "grad_norm": 0.026629295200109482, "grad_norm_var": 5.816115434841007e-06, "learning_rate": 0.000986430063198392, "loss": 2.4908, "step": 22086 }, { "crossentropy": 2.3402490615844727, "epoch": 0.800717807424594, "grad_norm": 0.026714716106653214, "grad_norm_var": 5.690168881618274e-06, "learning_rate": 0.0009860835503923333, "loss": 2.4504, "step": 22087 }, { "crossentropy": 2.511936664581299, "epoch": 0.800754060324826, "grad_norm": 0.02655082941055298, "grad_norm_var": 5.728679154150479e-06, "learning_rate": 0.0009857370918003095, "loss": 2.4289, "step": 22088 }, { "crossentropy": 2.4903452396392822, "epoch": 0.800790313225058, "grad_norm": 0.027047649025917053, "grad_norm_var": 5.444327946121872e-06, "learning_rate": 0.0009853906874269962, "loss": 2.4423, "step": 22089 }, { "crossentropy": 2.5913007259368896, "epoch": 0.80082656612529, "grad_norm": 0.025935761630535126, "grad_norm_var": 5.023652909550332e-07, "learning_rate": 0.000985044337277074, "loss": 2.4687, "step": 22090 }, { "crossentropy": 2.3680481910705566, "epoch": 0.800862819025522, "grad_norm": 0.02585875615477562, "grad_norm_var": 4.2354809440269384e-07, "learning_rate": 0.0009846980413552226, "loss": 2.4541, "step": 22091 }, { "crossentropy": 2.5359201431274414, "epoch": 0.800899071925754, "grad_norm": 0.026792561635375023, "grad_norm_var": 4.1310052636035997e-07, "learning_rate": 0.0009843517996661177, "loss": 2.5039, "step": 22092 }, { "crossentropy": 2.4307780265808105, "epoch": 0.8009353248259861, "grad_norm": 0.026855334639549255, "grad_norm_var": 4.074730639505673e-07, "learning_rate": 0.0009840056122144341, "loss": 2.3479, "step": 22093 }, { "crossentropy": 2.4890553951263428, "epoch": 0.8009715777262181, "grad_norm": 0.026700954884290695, "grad_norm_var": 3.7168864301545016e-07, "learning_rate": 0.0009836594790048508, "loss": 2.4743, "step": 22094 }, { "crossentropy": 2.2971861362457275, "epoch": 0.8010078306264501, "grad_norm": 0.026377925649285316, "grad_norm_var": 2.343129966726609e-07, "learning_rate": 0.0009833134000420396, "loss": 2.3955, "step": 22095 }, { "crossentropy": 2.403092384338379, "epoch": 0.8010440835266821, "grad_norm": 0.026523878797888756, "grad_norm_var": 2.2710285989176643e-07, "learning_rate": 0.0009829673753306777, "loss": 2.5058, "step": 22096 }, { "crossentropy": 2.4143497943878174, "epoch": 0.8010803364269141, "grad_norm": 0.025881312787532806, "grad_norm_var": 2.105676878442116e-07, "learning_rate": 0.0009826214048754367, "loss": 2.4333, "step": 22097 }, { "crossentropy": 2.530996799468994, "epoch": 0.8011165893271461, "grad_norm": 0.024920163676142693, "grad_norm_var": 3.4566165469507074e-07, "learning_rate": 0.0009822754886809899, "loss": 2.4199, "step": 22098 }, { "crossentropy": 2.362313747406006, "epoch": 0.8011528422273781, "grad_norm": 0.026900045573711395, "grad_norm_var": 3.650058590075218e-07, "learning_rate": 0.000981929626752011, "loss": 2.381, "step": 22099 }, { "crossentropy": 2.4542598724365234, "epoch": 0.8011890951276102, "grad_norm": 0.027013929560780525, "grad_norm_var": 3.2261435913628843e-07, "learning_rate": 0.0009815838190931703, "loss": 2.4452, "step": 22100 }, { "crossentropy": 2.497492790222168, "epoch": 0.8012253480278422, "grad_norm": 0.027597807347774506, "grad_norm_var": 3.9099534971802296e-07, "learning_rate": 0.0009812380657091369, "loss": 2.4543, "step": 22101 }, { "crossentropy": 2.457050323486328, "epoch": 0.8012616009280742, "grad_norm": 0.02826172113418579, "grad_norm_var": 5.815946215750734e-07, "learning_rate": 0.0009808923666045832, "loss": 2.4565, "step": 22102 }, { "crossentropy": 2.5088582038879395, "epoch": 0.8012978538283063, "grad_norm": 0.02620854787528515, "grad_norm_var": 5.912715122624011e-07, "learning_rate": 0.0009805467217841762, "loss": 2.4959, "step": 22103 }, { "crossentropy": 2.5624375343322754, "epoch": 0.8013341067285383, "grad_norm": 0.026873508468270302, "grad_norm_var": 5.961283260976959e-07, "learning_rate": 0.0009802011312525865, "loss": 2.5368, "step": 22104 }, { "crossentropy": 2.3886477947235107, "epoch": 0.8013703596287703, "grad_norm": 0.02654993161559105, "grad_norm_var": 5.825255226463473e-07, "learning_rate": 0.00097985559501448, "loss": 2.3832, "step": 22105 }, { "crossentropy": 2.4321625232696533, "epoch": 0.8014066125290024, "grad_norm": 0.02681795507669449, "grad_norm_var": 5.555928588418657e-07, "learning_rate": 0.0009795101130745244, "loss": 2.413, "step": 22106 }, { "crossentropy": 2.565432548522949, "epoch": 0.8014428654292344, "grad_norm": 0.028674637898802757, "grad_norm_var": 7.603280335138328e-07, "learning_rate": 0.000979164685437387, "loss": 2.4422, "step": 22107 }, { "crossentropy": 2.3975539207458496, "epoch": 0.8014791183294664, "grad_norm": 0.02731490693986416, "grad_norm_var": 7.762088948832952e-07, "learning_rate": 0.0009788193121077326, "loss": 2.4278, "step": 22108 }, { "crossentropy": 2.4697580337524414, "epoch": 0.8015153712296984, "grad_norm": 0.026168564334511757, "grad_norm_var": 8.044694069785172e-07, "learning_rate": 0.0009784739930902243, "loss": 2.4453, "step": 22109 }, { "crossentropy": 2.4703071117401123, "epoch": 0.8015516241299304, "grad_norm": 0.02736821584403515, "grad_norm_var": 8.235638785318578e-07, "learning_rate": 0.000978128728389529, "loss": 2.4865, "step": 22110 }, { "crossentropy": 2.4659385681152344, "epoch": 0.8015878770301624, "grad_norm": 0.026667771860957146, "grad_norm_var": 8.109256281663494e-07, "learning_rate": 0.0009777835180103073, "loss": 2.4872, "step": 22111 }, { "crossentropy": 2.4472012519836426, "epoch": 0.8016241299303944, "grad_norm": 0.02682197466492653, "grad_norm_var": 8.031624192933086e-07, "learning_rate": 0.0009774383619572241, "loss": 2.4834, "step": 22112 }, { "crossentropy": 2.5709097385406494, "epoch": 0.8016603828306265, "grad_norm": 0.026034587994217873, "grad_norm_var": 7.84270705931657e-07, "learning_rate": 0.0009770932602349391, "loss": 2.6053, "step": 22113 }, { "crossentropy": 2.4704394340515137, "epoch": 0.8016966357308585, "grad_norm": 0.026682229712605476, "grad_norm_var": 5.161994388838686e-07, "learning_rate": 0.0009767482128481154, "loss": 2.4168, "step": 22114 }, { "crossentropy": 2.430628776550293, "epoch": 0.8017328886310905, "grad_norm": 0.02668563462793827, "grad_norm_var": 5.218521848105214e-07, "learning_rate": 0.0009764032198014128, "loss": 2.4418, "step": 22115 }, { "crossentropy": 2.281547784805298, "epoch": 0.8017691415313225, "grad_norm": 0.026758791878819466, "grad_norm_var": 5.248980707581493e-07, "learning_rate": 0.0009760582810994901, "loss": 2.364, "step": 22116 }, { "crossentropy": 2.571333885192871, "epoch": 0.8018053944315545, "grad_norm": 0.025564484298229218, "grad_norm_var": 6.125307796230832e-07, "learning_rate": 0.0009757133967470078, "loss": 2.4187, "step": 22117 }, { "crossentropy": 2.354646921157837, "epoch": 0.8018416473317865, "grad_norm": 0.026433011516928673, "grad_norm_var": 4.750917951330813e-07, "learning_rate": 0.0009753685667486234, "loss": 2.4402, "step": 22118 }, { "crossentropy": 2.4945437908172607, "epoch": 0.8018779002320185, "grad_norm": 0.027020111680030823, "grad_norm_var": 4.602046035028038e-07, "learning_rate": 0.0009750237911089927, "loss": 2.4848, "step": 22119 }, { "crossentropy": 2.5850281715393066, "epoch": 0.8019141531322506, "grad_norm": 0.026898792013525963, "grad_norm_var": 4.6056899073014e-07, "learning_rate": 0.0009746790698327751, "loss": 2.5877, "step": 22120 }, { "crossentropy": 2.531043291091919, "epoch": 0.8019504060324826, "grad_norm": 0.026833733543753624, "grad_norm_var": 4.56940626483715e-07, "learning_rate": 0.0009743344029246243, "loss": 2.5334, "step": 22121 }, { "crossentropy": 2.303647756576538, "epoch": 0.8019866589327146, "grad_norm": 0.026558009907603264, "grad_norm_var": 4.604232666397066e-07, "learning_rate": 0.0009739897903891959, "loss": 2.3502, "step": 22122 }, { "crossentropy": 2.4691152572631836, "epoch": 0.8020229118329466, "grad_norm": 0.02716217190027237, "grad_norm_var": 2.2138743476761888e-07, "learning_rate": 0.0009736452322311473, "loss": 2.4858, "step": 22123 }, { "crossentropy": 2.474299907684326, "epoch": 0.8020591647331786, "grad_norm": 0.027058422565460205, "grad_norm_var": 2.0398521466840658e-07, "learning_rate": 0.0009733007284551282, "loss": 2.4583, "step": 22124 }, { "crossentropy": 2.489835262298584, "epoch": 0.8020954176334106, "grad_norm": 0.027072306722402573, "grad_norm_var": 1.9463591620170333e-07, "learning_rate": 0.0009729562790657948, "loss": 2.3876, "step": 22125 }, { "crossentropy": 2.4708733558654785, "epoch": 0.8021316705336426, "grad_norm": 0.02766694873571396, "grad_norm_var": 2.2578305086533243e-07, "learning_rate": 0.0009726118840677978, "loss": 2.5656, "step": 22126 }, { "crossentropy": 2.299546480178833, "epoch": 0.8021679234338747, "grad_norm": 0.026824628934264183, "grad_norm_var": 2.2570696760561687e-07, "learning_rate": 0.0009722675434657885, "loss": 2.3772, "step": 22127 }, { "crossentropy": 2.3914167881011963, "epoch": 0.8022041763341067, "grad_norm": 0.025744246318936348, "grad_norm_var": 2.886391958671515e-07, "learning_rate": 0.0009719232572644187, "loss": 2.4076, "step": 22128 }, { "crossentropy": 2.616046667098999, "epoch": 0.8022404292343387, "grad_norm": 0.02661042846739292, "grad_norm_var": 2.5924301528667863e-07, "learning_rate": 0.0009715790254683377, "loss": 2.3698, "step": 22129 }, { "crossentropy": 2.4625751972198486, "epoch": 0.8022766821345708, "grad_norm": 0.027197346091270447, "grad_norm_var": 2.730013248274417e-07, "learning_rate": 0.0009712348480821947, "loss": 2.4719, "step": 22130 }, { "crossentropy": 2.457746982574463, "epoch": 0.8023129350348028, "grad_norm": 0.026088686659932137, "grad_norm_var": 3.0083912234631485e-07, "learning_rate": 0.0009708907251106408, "loss": 2.3805, "step": 22131 }, { "crossentropy": 2.38738751411438, "epoch": 0.8023491879350348, "grad_norm": 0.02688675932586193, "grad_norm_var": 3.0255421104538545e-07, "learning_rate": 0.0009705466565583204, "loss": 2.4908, "step": 22132 }, { "crossentropy": 2.524043321609497, "epoch": 0.8023854408352669, "grad_norm": 0.026683520525693893, "grad_norm_var": 2.074774595472167e-07, "learning_rate": 0.0009702026424298838, "loss": 2.4681, "step": 22133 }, { "crossentropy": 2.477572202682495, "epoch": 0.8024216937354989, "grad_norm": 0.02718486078083515, "grad_norm_var": 2.0639936309598388e-07, "learning_rate": 0.000969858682729976, "loss": 2.4786, "step": 22134 }, { "crossentropy": 2.428441047668457, "epoch": 0.8024579466357309, "grad_norm": 0.026284638792276382, "grad_norm_var": 2.2285700282894244e-07, "learning_rate": 0.0009695147774632407, "loss": 2.507, "step": 22135 }, { "crossentropy": 2.4126360416412354, "epoch": 0.8024941995359629, "grad_norm": 0.027627674862742424, "grad_norm_var": 2.659327176449903e-07, "learning_rate": 0.0009691709266343268, "loss": 2.5052, "step": 22136 }, { "crossentropy": 2.3463661670684814, "epoch": 0.8025304524361949, "grad_norm": 0.027077199891209602, "grad_norm_var": 2.693439858538708e-07, "learning_rate": 0.0009688271302478752, "loss": 2.4633, "step": 22137 }, { "crossentropy": 2.3012092113494873, "epoch": 0.8025667053364269, "grad_norm": 0.026719605550169945, "grad_norm_var": 2.6451264694796136e-07, "learning_rate": 0.0009684833883085303, "loss": 2.4218, "step": 22138 }, { "crossentropy": 2.443162441253662, "epoch": 0.802602958236659, "grad_norm": 0.026932084932923317, "grad_norm_var": 2.5879948779676997e-07, "learning_rate": 0.0009681397008209369, "loss": 2.4121, "step": 22139 }, { "crossentropy": 2.3942580223083496, "epoch": 0.802639211136891, "grad_norm": 0.02660362422466278, "grad_norm_var": 2.593133555950829e-07, "learning_rate": 0.000967796067789734, "loss": 2.3793, "step": 22140 }, { "crossentropy": 2.491219997406006, "epoch": 0.802675464037123, "grad_norm": 0.027439778670668602, "grad_norm_var": 2.798562206224175e-07, "learning_rate": 0.0009674524892195657, "loss": 2.5219, "step": 22141 }, { "crossentropy": 2.2845327854156494, "epoch": 0.802711716937355, "grad_norm": 0.027748409658670425, "grad_norm_var": 2.891632016214918e-07, "learning_rate": 0.000967108965115071, "loss": 2.4601, "step": 22142 }, { "crossentropy": 2.3347322940826416, "epoch": 0.802747969837587, "grad_norm": 0.025737594813108444, "grad_norm_var": 3.6717770782220226e-07, "learning_rate": 0.0009667654954808891, "loss": 2.3857, "step": 22143 }, { "crossentropy": 2.361705780029297, "epoch": 0.802784222737819, "grad_norm": 0.026154080405831337, "grad_norm_var": 3.207818839671856e-07, "learning_rate": 0.0009664220803216605, "loss": 2.3884, "step": 22144 }, { "crossentropy": 2.351069450378418, "epoch": 0.802820475638051, "grad_norm": 0.02674386277794838, "grad_norm_var": 3.1832593558471917e-07, "learning_rate": 0.0009660787196420217, "loss": 2.4043, "step": 22145 }, { "crossentropy": 2.6564414501190186, "epoch": 0.802856728538283, "grad_norm": 0.02662018872797489, "grad_norm_var": 3.1005753460405796e-07, "learning_rate": 0.0009657354134466118, "loss": 2.5909, "step": 22146 }, { "crossentropy": 2.4892473220825195, "epoch": 0.8028929814385151, "grad_norm": 0.026826487854123116, "grad_norm_var": 2.75749317424819e-07, "learning_rate": 0.000965392161740068, "loss": 2.4552, "step": 22147 }, { "crossentropy": 2.482736110687256, "epoch": 0.8029292343387471, "grad_norm": 0.027187997475266457, "grad_norm_var": 2.837247569349606e-07, "learning_rate": 0.0009650489645270249, "loss": 2.4687, "step": 22148 }, { "crossentropy": 2.4747891426086426, "epoch": 0.8029654872389791, "grad_norm": 0.026286480948328972, "grad_norm_var": 3.022965424437681e-07, "learning_rate": 0.0009647058218121201, "loss": 2.4286, "step": 22149 }, { "crossentropy": 2.318760395050049, "epoch": 0.8030017401392111, "grad_norm": 0.02647019363939762, "grad_norm_var": 2.9977615819951934e-07, "learning_rate": 0.0009643627335999871, "loss": 2.3465, "step": 22150 }, { "crossentropy": 2.3850655555725098, "epoch": 0.8030379930394431, "grad_norm": 0.026621097698807716, "grad_norm_var": 2.846853012212006e-07, "learning_rate": 0.0009640196998952583, "loss": 2.4427, "step": 22151 }, { "crossentropy": 2.4455480575561523, "epoch": 0.8030742459396751, "grad_norm": 0.02646365948021412, "grad_norm_var": 2.408764153444265e-07, "learning_rate": 0.0009636767207025692, "loss": 2.5142, "step": 22152 }, { "crossentropy": 2.3321633338928223, "epoch": 0.8031104988399071, "grad_norm": 0.02679033949971199, "grad_norm_var": 2.3262583579360062e-07, "learning_rate": 0.0009633337960265504, "loss": 2.3605, "step": 22153 }, { "crossentropy": 2.53662371635437, "epoch": 0.8031467517401392, "grad_norm": 0.02760367840528488, "grad_norm_var": 2.827140800797911e-07, "learning_rate": 0.0009629909258718344, "loss": 2.5311, "step": 22154 }, { "crossentropy": 2.487081289291382, "epoch": 0.8031830046403712, "grad_norm": 0.025984754785895348, "grad_norm_var": 3.1761669406725016e-07, "learning_rate": 0.000962648110243054, "loss": 2.4673, "step": 22155 }, { "crossentropy": 2.458141803741455, "epoch": 0.8032192575406032, "grad_norm": 0.027382856234908104, "grad_norm_var": 3.4501967737253906e-07, "learning_rate": 0.0009623053491448358, "loss": 2.4205, "step": 22156 }, { "crossentropy": 2.4274864196777344, "epoch": 0.8032555104408353, "grad_norm": 0.026824159547686577, "grad_norm_var": 3.1240287198529223e-07, "learning_rate": 0.0009619626425818134, "loss": 2.4411, "step": 22157 }, { "crossentropy": 2.268833637237549, "epoch": 0.8032917633410673, "grad_norm": 0.02610802836716175, "grad_norm_var": 2.546361216802679e-07, "learning_rate": 0.0009616199905586115, "loss": 2.4069, "step": 22158 }, { "crossentropy": 2.458181858062744, "epoch": 0.8033280162412993, "grad_norm": 0.02639574371278286, "grad_norm_var": 2.0490295251743226e-07, "learning_rate": 0.0009612773930798618, "loss": 2.4652, "step": 22159 }, { "crossentropy": 2.432844400405884, "epoch": 0.8033642691415314, "grad_norm": 0.027111802250146866, "grad_norm_var": 1.983951588753937e-07, "learning_rate": 0.0009609348501501897, "loss": 2.4154, "step": 22160 }, { "crossentropy": 2.3484888076782227, "epoch": 0.8034005220417634, "grad_norm": 0.026006465777754784, "grad_norm_var": 2.2942731003962266e-07, "learning_rate": 0.0009605923617742202, "loss": 2.3313, "step": 22161 }, { "crossentropy": 2.4576709270477295, "epoch": 0.8034367749419954, "grad_norm": 0.027625558897852898, "grad_norm_var": 2.8622537536665493e-07, "learning_rate": 0.0009602499279565813, "loss": 2.4974, "step": 22162 }, { "crossentropy": 2.494683027267456, "epoch": 0.8034730278422274, "grad_norm": 0.02631654404103756, "grad_norm_var": 2.959571325036042e-07, "learning_rate": 0.0009599075487018993, "loss": 2.4362, "step": 22163 }, { "crossentropy": 2.4084763526916504, "epoch": 0.8035092807424594, "grad_norm": 0.026827726513147354, "grad_norm_var": 2.805658567472368e-07, "learning_rate": 0.0009595652240147951, "loss": 2.4548, "step": 22164 }, { "crossentropy": 2.4106125831604004, "epoch": 0.8035455336426914, "grad_norm": 0.027085626497864723, "grad_norm_var": 2.7895555845073676e-07, "learning_rate": 0.0009592229538998953, "loss": 2.4511, "step": 22165 }, { "crossentropy": 2.2767069339752197, "epoch": 0.8035817865429234, "grad_norm": 0.025292685255408287, "grad_norm_var": 4.057972521948408e-07, "learning_rate": 0.0009588807383618209, "loss": 2.2574, "step": 22166 }, { "crossentropy": 2.397655725479126, "epoch": 0.8036180394431555, "grad_norm": 0.031559500843286514, "grad_norm_var": 1.9093294731425584e-06, "learning_rate": 0.0009585385774051958, "loss": 2.5099, "step": 22167 }, { "crossentropy": 2.4565389156341553, "epoch": 0.8036542923433875, "grad_norm": 0.026097923517227173, "grad_norm_var": 1.941951896280008e-06, "learning_rate": 0.0009581964710346402, "loss": 2.4212, "step": 22168 }, { "crossentropy": 2.464056968688965, "epoch": 0.8036905452436195, "grad_norm": 0.02600172348320484, "grad_norm_var": 1.996383371617944e-06, "learning_rate": 0.0009578544192547739, "loss": 2.438, "step": 22169 }, { "crossentropy": 2.4946987628936768, "epoch": 0.8037267981438515, "grad_norm": 0.0267917662858963, "grad_norm_var": 1.960221242503973e-06, "learning_rate": 0.0009575124220702175, "loss": 2.3975, "step": 22170 }, { "crossentropy": 2.509503126144409, "epoch": 0.8037630510440835, "grad_norm": 0.026203472167253494, "grad_norm_var": 1.9383195964490135e-06, "learning_rate": 0.0009571704794855918, "loss": 2.4353, "step": 22171 }, { "crossentropy": 2.3016152381896973, "epoch": 0.8037993039443155, "grad_norm": 0.02687913179397583, "grad_norm_var": 1.9185224574932844e-06, "learning_rate": 0.000956828591505513, "loss": 2.3208, "step": 22172 }, { "crossentropy": 2.426009178161621, "epoch": 0.8038355568445475, "grad_norm": 0.027080291882157326, "grad_norm_var": 1.9227479714817626e-06, "learning_rate": 0.000956486758134601, "loss": 2.4197, "step": 22173 }, { "crossentropy": 2.420119047164917, "epoch": 0.8038718097447796, "grad_norm": 0.025875696912407875, "grad_norm_var": 1.948687823559086e-06, "learning_rate": 0.0009561449793774701, "loss": 2.4153, "step": 22174 }, { "crossentropy": 2.4908223152160645, "epoch": 0.8039080626450116, "grad_norm": 0.025573808699846268, "grad_norm_var": 2.0376230620247048e-06, "learning_rate": 0.0009558032552387385, "loss": 2.4542, "step": 22175 }, { "crossentropy": 2.702033042907715, "epoch": 0.8039443155452436, "grad_norm": 0.027389902621507645, "grad_norm_var": 2.055108307621953e-06, "learning_rate": 0.0009554615857230231, "loss": 2.6008, "step": 22176 }, { "crossentropy": 2.50224232673645, "epoch": 0.8039805684454756, "grad_norm": 0.027034113183617592, "grad_norm_var": 2.014027955261205e-06, "learning_rate": 0.0009551199708349351, "loss": 2.4339, "step": 22177 }, { "crossentropy": 2.2731881141662598, "epoch": 0.8040168213457076, "grad_norm": 0.027108414098620415, "grad_norm_var": 1.97741891388876e-06, "learning_rate": 0.0009547784105790897, "loss": 2.3678, "step": 22178 }, { "crossentropy": 2.545743942260742, "epoch": 0.8040530742459396, "grad_norm": 0.02760463021695614, "grad_norm_var": 1.994668783404883e-06, "learning_rate": 0.0009544369049601021, "loss": 2.5109, "step": 22179 }, { "crossentropy": 2.4735450744628906, "epoch": 0.8040893271461717, "grad_norm": 0.02653457783162594, "grad_norm_var": 2.0028803784607653e-06, "learning_rate": 0.000954095453982582, "loss": 2.3861, "step": 22180 }, { "crossentropy": 2.534905433654785, "epoch": 0.8041255800464037, "grad_norm": 0.026908816769719124, "grad_norm_var": 2.000035678325707e-06, "learning_rate": 0.0009537540576511439, "loss": 2.5342, "step": 22181 }, { "crossentropy": 2.534681558609009, "epoch": 0.8041618329466357, "grad_norm": 0.02709839679300785, "grad_norm_var": 1.823818472112742e-06, "learning_rate": 0.0009534127159703965, "loss": 2.3831, "step": 22182 }, { "crossentropy": 2.570526123046875, "epoch": 0.8041980858468677, "grad_norm": 0.028059182688593864, "grad_norm_var": 4.5410138420770573e-07, "learning_rate": 0.0009530714289449511, "loss": 2.5606, "step": 22183 }, { "crossentropy": 2.5147554874420166, "epoch": 0.8042343387470998, "grad_norm": 0.0286478903144598, "grad_norm_var": 6.33654710340325e-07, "learning_rate": 0.0009527301965794205, "loss": 2.49, "step": 22184 }, { "crossentropy": 2.482541799545288, "epoch": 0.8042705916473318, "grad_norm": 0.026577835902571678, "grad_norm_var": 5.835166217147871e-07, "learning_rate": 0.0009523890188784079, "loss": 2.4396, "step": 22185 }, { "crossentropy": 2.273104667663574, "epoch": 0.8043068445475638, "grad_norm": 0.026315277442336082, "grad_norm_var": 6.0842640189999e-07, "learning_rate": 0.0009520478958465234, "loss": 2.4095, "step": 22186 }, { "crossentropy": 2.5956761837005615, "epoch": 0.8043430974477959, "grad_norm": 0.026827868074178696, "grad_norm_var": 5.722483029007667e-07, "learning_rate": 0.0009517068274883773, "loss": 2.5167, "step": 22187 }, { "crossentropy": 2.5117506980895996, "epoch": 0.8043793503480279, "grad_norm": 0.025564758107066154, "grad_norm_var": 6.961009596507245e-07, "learning_rate": 0.0009513658138085718, "loss": 2.4564, "step": 22188 }, { "crossentropy": 2.641075611114502, "epoch": 0.8044156032482599, "grad_norm": 0.02662777341902256, "grad_norm_var": 6.972725295658801e-07, "learning_rate": 0.0009510248548117167, "loss": 2.554, "step": 22189 }, { "crossentropy": 2.5808913707733154, "epoch": 0.8044518561484919, "grad_norm": 0.02642430178821087, "grad_norm_var": 6.441344056983539e-07, "learning_rate": 0.0009506839505024145, "loss": 2.5517, "step": 22190 }, { "crossentropy": 2.505596876144409, "epoch": 0.8044881090487239, "grad_norm": 0.027143241837620735, "grad_norm_var": 5.219035496062858e-07, "learning_rate": 0.0009503431008852703, "loss": 2.5247, "step": 22191 }, { "crossentropy": 2.610564947128296, "epoch": 0.8045243619489559, "grad_norm": 0.026539908722043037, "grad_norm_var": 5.21928294281446e-07, "learning_rate": 0.0009500023059648893, "loss": 2.6087, "step": 22192 }, { "crossentropy": 2.5036563873291016, "epoch": 0.804560614849188, "grad_norm": 0.025901874527335167, "grad_norm_var": 5.876261231519634e-07, "learning_rate": 0.0009496615657458735, "loss": 2.4952, "step": 22193 }, { "crossentropy": 2.4576897621154785, "epoch": 0.80459686774942, "grad_norm": 0.025105681270360947, "grad_norm_var": 7.740575041395581e-07, "learning_rate": 0.0009493208802328235, "loss": 2.3789, "step": 22194 }, { "crossentropy": 2.529916524887085, "epoch": 0.804633120649652, "grad_norm": 0.02682427130639553, "grad_norm_var": 7.22427814500252e-07, "learning_rate": 0.0009489802494303434, "loss": 2.5297, "step": 22195 }, { "crossentropy": 2.4585084915161133, "epoch": 0.804669373549884, "grad_norm": 0.02701801247894764, "grad_norm_var": 7.26768043896447e-07, "learning_rate": 0.0009486396733430313, "loss": 2.5321, "step": 22196 }, { "crossentropy": 2.4075441360473633, "epoch": 0.804705626450116, "grad_norm": 0.026584576815366745, "grad_norm_var": 7.253517172582282e-07, "learning_rate": 0.0009482991519754897, "loss": 2.4544, "step": 22197 }, { "crossentropy": 2.34543776512146, "epoch": 0.804741879350348, "grad_norm": 0.027290940284729004, "grad_norm_var": 7.377989694874627e-07, "learning_rate": 0.0009479586853323158, "loss": 2.4082, "step": 22198 }, { "crossentropy": 2.338482618331909, "epoch": 0.80477813225058, "grad_norm": 0.028443532064557076, "grad_norm_var": 8.158736145689553e-07, "learning_rate": 0.0009476182734181088, "loss": 2.3613, "step": 22199 }, { "crossentropy": 2.5466103553771973, "epoch": 0.804814385150812, "grad_norm": 0.02662716619670391, "grad_norm_var": 5.570008404470287e-07, "learning_rate": 0.0009472779162374684, "loss": 2.5974, "step": 22200 }, { "crossentropy": 2.4405524730682373, "epoch": 0.8048506380510441, "grad_norm": 0.02677639201283455, "grad_norm_var": 5.585190087900218e-07, "learning_rate": 0.0009469376137949898, "loss": 2.392, "step": 22201 }, { "crossentropy": 2.4460721015930176, "epoch": 0.8048868909512761, "grad_norm": 0.02666587382555008, "grad_norm_var": 5.516775165565387e-07, "learning_rate": 0.0009465973660952676, "loss": 2.483, "step": 22202 }, { "crossentropy": 2.5098066329956055, "epoch": 0.8049231438515081, "grad_norm": 0.026823291555047035, "grad_norm_var": 5.515689999440459e-07, "learning_rate": 0.000946257173142901, "loss": 2.5066, "step": 22203 }, { "crossentropy": 2.4023935794830322, "epoch": 0.8049593967517401, "grad_norm": 0.02614370547235012, "grad_norm_var": 4.889299766649834e-07, "learning_rate": 0.0009459170349424817, "loss": 2.4063, "step": 22204 }, { "crossentropy": 2.541707992553711, "epoch": 0.8049956496519721, "grad_norm": 0.0268008541315794, "grad_norm_var": 4.895097054223441e-07, "learning_rate": 0.0009455769514986051, "loss": 2.4616, "step": 22205 }, { "crossentropy": 2.405546188354492, "epoch": 0.8050319025522041, "grad_norm": 0.026533018797636032, "grad_norm_var": 4.863302600766987e-07, "learning_rate": 0.0009452369228158664, "loss": 2.4722, "step": 22206 }, { "crossentropy": 2.3166120052337646, "epoch": 0.8050681554524362, "grad_norm": 0.026949390769004822, "grad_norm_var": 4.772586033989819e-07, "learning_rate": 0.0009448969488988545, "loss": 2.4212, "step": 22207 }, { "crossentropy": 2.519951343536377, "epoch": 0.8051044083526682, "grad_norm": 0.027257243171334267, "grad_norm_var": 4.951325316305723e-07, "learning_rate": 0.0009445570297521644, "loss": 2.5227, "step": 22208 }, { "crossentropy": 2.5356194972991943, "epoch": 0.8051406612529002, "grad_norm": 0.027093878015875816, "grad_norm_var": 4.5166606404283396e-07, "learning_rate": 0.000944217165380386, "loss": 2.6326, "step": 22209 }, { "crossentropy": 2.4617185592651367, "epoch": 0.8051769141531323, "grad_norm": 0.02764938771724701, "grad_norm_var": 2.785005027817657e-07, "learning_rate": 0.0009438773557881086, "loss": 2.4102, "step": 22210 }, { "crossentropy": 2.633261203765869, "epoch": 0.8052131670533643, "grad_norm": 0.026343010365962982, "grad_norm_var": 3.0217312993781564e-07, "learning_rate": 0.0009435376009799241, "loss": 2.5923, "step": 22211 }, { "crossentropy": 2.565078020095825, "epoch": 0.8052494199535963, "grad_norm": 0.02629704214632511, "grad_norm_var": 3.2692254848479957e-07, "learning_rate": 0.0009431979009604191, "loss": 2.5839, "step": 22212 }, { "crossentropy": 2.4901926517486572, "epoch": 0.8052856728538283, "grad_norm": 0.026941347867250443, "grad_norm_var": 3.202322248818089e-07, "learning_rate": 0.0009428582557341825, "loss": 2.513, "step": 22213 }, { "crossentropy": 2.389105796813965, "epoch": 0.8053219257540604, "grad_norm": 0.026926681399345398, "grad_norm_var": 3.1025448033687036e-07, "learning_rate": 0.0009425186653058038, "loss": 2.3793, "step": 22214 }, { "crossentropy": 2.354278087615967, "epoch": 0.8053581786542924, "grad_norm": 0.0258189644664526, "grad_norm_var": 1.9782586635856229e-07, "learning_rate": 0.0009421791296798659, "loss": 2.4371, "step": 22215 }, { "crossentropy": 2.4904112815856934, "epoch": 0.8053944315545244, "grad_norm": 0.02800896391272545, "grad_norm_var": 2.985922454426844e-07, "learning_rate": 0.0009418396488609588, "loss": 2.5545, "step": 22216 }, { "crossentropy": 2.2909095287323, "epoch": 0.8054306844547564, "grad_norm": 0.02545316331088543, "grad_norm_var": 4.1471646618380466e-07, "learning_rate": 0.0009415002228536645, "loss": 2.4298, "step": 22217 }, { "crossentropy": 2.4494950771331787, "epoch": 0.8054669373549884, "grad_norm": 0.027240918949246407, "grad_norm_var": 4.3034333041604086e-07, "learning_rate": 0.0009411608516625697, "loss": 2.4601, "step": 22218 }, { "crossentropy": 2.5131590366363525, "epoch": 0.8055031902552204, "grad_norm": 0.028099754825234413, "grad_norm_var": 5.416645150905918e-07, "learning_rate": 0.0009408215352922572, "loss": 2.4648, "step": 22219 }, { "crossentropy": 2.4050450325012207, "epoch": 0.8055394431554525, "grad_norm": 0.033135708421468735, "grad_norm_var": 2.9412024824109853e-06, "learning_rate": 0.0009404822737473084, "loss": 2.4578, "step": 22220 }, { "crossentropy": 2.442472219467163, "epoch": 0.8055756960556845, "grad_norm": 0.026841875165700912, "grad_norm_var": 2.938663278892574e-06, "learning_rate": 0.000940143067032307, "loss": 2.495, "step": 22221 }, { "crossentropy": 2.483778715133667, "epoch": 0.8056119489559165, "grad_norm": 0.02711140178143978, "grad_norm_var": 2.901433845097408e-06, "learning_rate": 0.0009398039151518362, "loss": 2.4775, "step": 22222 }, { "crossentropy": 2.506237506866455, "epoch": 0.8056482018561485, "grad_norm": 0.027259619906544685, "grad_norm_var": 2.891993155054945e-06, "learning_rate": 0.0009394648181104737, "loss": 2.4782, "step": 22223 }, { "crossentropy": 2.227536916732788, "epoch": 0.8056844547563805, "grad_norm": 0.025651149451732635, "grad_norm_var": 3.071457713657577e-06, "learning_rate": 0.0009391257759128019, "loss": 2.3, "step": 22224 }, { "crossentropy": 2.2655959129333496, "epoch": 0.8057207076566125, "grad_norm": 0.025991063565015793, "grad_norm_var": 3.169258311965063e-06, "learning_rate": 0.0009387867885633977, "loss": 2.3101, "step": 22225 }, { "crossentropy": 2.4170310497283936, "epoch": 0.8057569605568445, "grad_norm": 0.025393934920430183, "grad_norm_var": 3.343975943269582e-06, "learning_rate": 0.000938447856066843, "loss": 2.3859, "step": 22226 }, { "crossentropy": 2.6364541053771973, "epoch": 0.8057932134570766, "grad_norm": 0.026508601382374763, "grad_norm_var": 3.3304740630404286e-06, "learning_rate": 0.0009381089784277131, "loss": 2.5433, "step": 22227 }, { "crossentropy": 2.4800076484680176, "epoch": 0.8058294663573086, "grad_norm": 0.02695869840681553, "grad_norm_var": 3.2920699061323668e-06, "learning_rate": 0.0009377701556505846, "loss": 2.5428, "step": 22228 }, { "crossentropy": 2.433522939682007, "epoch": 0.8058657192575406, "grad_norm": 0.027068719267845154, "grad_norm_var": 3.2906635174560165e-06, "learning_rate": 0.0009374313877400343, "loss": 2.4029, "step": 22229 }, { "crossentropy": 2.4177725315093994, "epoch": 0.8059019721577726, "grad_norm": 0.026865772902965546, "grad_norm_var": 3.292236545596053e-06, "learning_rate": 0.0009370926747006403, "loss": 2.516, "step": 22230 }, { "crossentropy": 2.216017723083496, "epoch": 0.8059382250580046, "grad_norm": 0.02590223401784897, "grad_norm_var": 3.2785800910666942e-06, "learning_rate": 0.0009367540165369737, "loss": 2.3321, "step": 22231 }, { "crossentropy": 2.419567584991455, "epoch": 0.8059744779582366, "grad_norm": 0.027598856016993523, "grad_norm_var": 3.2390182359376494e-06, "learning_rate": 0.0009364154132536112, "loss": 2.4635, "step": 22232 }, { "crossentropy": 2.3653740882873535, "epoch": 0.8060107308584686, "grad_norm": 0.02676137536764145, "grad_norm_var": 3.0643798967222e-06, "learning_rate": 0.0009360768648551244, "loss": 2.407, "step": 22233 }, { "crossentropy": 2.359684705734253, "epoch": 0.8060469837587007, "grad_norm": 0.027076933532953262, "grad_norm_var": 3.0640585834017327e-06, "learning_rate": 0.0009357383713460882, "loss": 2.3094, "step": 22234 }, { "crossentropy": 2.480496883392334, "epoch": 0.8060832366589327, "grad_norm": 0.027324311435222626, "grad_norm_var": 3.0023167929960417e-06, "learning_rate": 0.000935399932731073, "loss": 2.4112, "step": 22235 }, { "crossentropy": 2.441357135772705, "epoch": 0.8061194895591647, "grad_norm": 0.02633950486779213, "grad_norm_var": 4.1128952961797667e-07, "learning_rate": 0.0009350615490146486, "loss": 2.5077, "step": 22236 }, { "crossentropy": 2.2976038455963135, "epoch": 0.8061557424593968, "grad_norm": 0.02591794542968273, "grad_norm_var": 4.429612431725797e-07, "learning_rate": 0.0009347232202013867, "loss": 2.3823, "step": 22237 }, { "crossentropy": 2.3917956352233887, "epoch": 0.8061919953596288, "grad_norm": 0.026786424219608307, "grad_norm_var": 4.2775507038544935e-07, "learning_rate": 0.0009343849462958581, "loss": 2.3686, "step": 22238 }, { "crossentropy": 2.339951515197754, "epoch": 0.8062282482598608, "grad_norm": 0.02612890675663948, "grad_norm_var": 4.063805854626849e-07, "learning_rate": 0.0009340467273026293, "loss": 2.4248, "step": 22239 }, { "crossentropy": 2.459254264831543, "epoch": 0.8062645011600929, "grad_norm": 0.026211896911263466, "grad_norm_var": 3.612851141721399e-07, "learning_rate": 0.0009337085632262715, "loss": 2.461, "step": 22240 }, { "crossentropy": 2.404282331466675, "epoch": 0.8063007540603249, "grad_norm": 0.025887932628393173, "grad_norm_var": 3.6966591565410627e-07, "learning_rate": 0.0009333704540713483, "loss": 2.4902, "step": 22241 }, { "crossentropy": 2.519263505935669, "epoch": 0.8063370069605569, "grad_norm": 0.026562156155705452, "grad_norm_var": 2.755517731387394e-07, "learning_rate": 0.0009330323998424294, "loss": 2.5321, "step": 22242 }, { "crossentropy": 2.446746587753296, "epoch": 0.8063732598607889, "grad_norm": 0.02679530344903469, "grad_norm_var": 2.7647786457959953e-07, "learning_rate": 0.0009326944005440818, "loss": 2.4657, "step": 22243 }, { "crossentropy": 2.3874399662017822, "epoch": 0.8064095127610209, "grad_norm": 0.028879620134830475, "grad_norm_var": 5.895739571008431e-07, "learning_rate": 0.0009323564561808662, "loss": 2.3476, "step": 22244 }, { "crossentropy": 2.4980504512786865, "epoch": 0.8064457656612529, "grad_norm": 0.02667493186891079, "grad_norm_var": 5.828854463017669e-07, "learning_rate": 0.0009320185667573494, "loss": 2.4043, "step": 22245 }, { "crossentropy": 2.5132079124450684, "epoch": 0.8064820185614849, "grad_norm": 0.02625368721783161, "grad_norm_var": 5.95394345028717e-07, "learning_rate": 0.0009316807322780968, "loss": 2.5078, "step": 22246 }, { "crossentropy": 2.582381248474121, "epoch": 0.806518271461717, "grad_norm": 0.027080779895186424, "grad_norm_var": 5.578067687851568e-07, "learning_rate": 0.000931342952747668, "loss": 2.5641, "step": 22247 }, { "crossentropy": 2.535778760910034, "epoch": 0.806554524361949, "grad_norm": 0.026952840387821198, "grad_norm_var": 5.122841299072851e-07, "learning_rate": 0.0009310052281706288, "loss": 2.5779, "step": 22248 }, { "crossentropy": 2.463488817214966, "epoch": 0.806590777262181, "grad_norm": 0.02581745572388172, "grad_norm_var": 5.636643639633325e-07, "learning_rate": 0.0009306675585515379, "loss": 2.4491, "step": 22249 }, { "crossentropy": 2.3435051441192627, "epoch": 0.806627030162413, "grad_norm": 0.026658926159143448, "grad_norm_var": 5.518025317616711e-07, "learning_rate": 0.0009303299438949564, "loss": 2.3891, "step": 22250 }, { "crossentropy": 2.5022976398468018, "epoch": 0.806663283062645, "grad_norm": 0.025917841121554375, "grad_norm_var": 5.474913208537105e-07, "learning_rate": 0.0009299923842054481, "loss": 2.4826, "step": 22251 }, { "crossentropy": 2.3436858654022217, "epoch": 0.806699535962877, "grad_norm": 0.0318312793970108, "grad_norm_var": 2.275305822919598e-06, "learning_rate": 0.0009296548794875659, "loss": 2.4295, "step": 22252 }, { "crossentropy": 2.3963193893432617, "epoch": 0.806735788863109, "grad_norm": 0.026589656248688698, "grad_norm_var": 2.215786817833298e-06, "learning_rate": 0.0009293174297458717, "loss": 2.3176, "step": 22253 }, { "crossentropy": 2.567683696746826, "epoch": 0.806772041763341, "grad_norm": 0.025665534660220146, "grad_norm_var": 2.317166817873637e-06, "learning_rate": 0.0009289800349849248, "loss": 2.5215, "step": 22254 }, { "crossentropy": 2.458186388015747, "epoch": 0.8068082946635731, "grad_norm": 0.026467645540833473, "grad_norm_var": 2.2908984727729492e-06, "learning_rate": 0.0009286426952092792, "loss": 2.4524, "step": 22255 }, { "crossentropy": 2.5487234592437744, "epoch": 0.8068445475638051, "grad_norm": 0.02651391364634037, "grad_norm_var": 2.269274045832191e-06, "learning_rate": 0.0009283054104234934, "loss": 2.4763, "step": 22256 }, { "crossentropy": 2.6404762268066406, "epoch": 0.8068808004640371, "grad_norm": 0.026822401210665703, "grad_norm_var": 2.1965874365235833e-06, "learning_rate": 0.0009279681806321205, "loss": 2.5888, "step": 22257 }, { "crossentropy": 2.6470253467559814, "epoch": 0.8069170533642691, "grad_norm": 0.02768957056105137, "grad_norm_var": 2.215059498787639e-06, "learning_rate": 0.0009276310058397175, "loss": 2.5224, "step": 22258 }, { "crossentropy": 2.293703079223633, "epoch": 0.8069533062645011, "grad_norm": 0.030934663489460945, "grad_norm_var": 3.151888716840014e-06, "learning_rate": 0.000927293886050839, "loss": 2.3476, "step": 22259 }, { "crossentropy": 2.525484800338745, "epoch": 0.8069895591647331, "grad_norm": 0.025578700006008148, "grad_norm_var": 3.136311841787226e-06, "learning_rate": 0.0009269568212700368, "loss": 2.4961, "step": 22260 }, { "crossentropy": 2.5101630687713623, "epoch": 0.8070258120649652, "grad_norm": 0.026327865198254585, "grad_norm_var": 3.1630762272425243e-06, "learning_rate": 0.0009266198115018626, "loss": 2.4854, "step": 22261 }, { "crossentropy": 2.4005112648010254, "epoch": 0.8070620649651972, "grad_norm": 0.02556406520307064, "grad_norm_var": 3.267760447317238e-06, "learning_rate": 0.0009262828567508707, "loss": 2.3537, "step": 22262 }, { "crossentropy": 2.3951172828674316, "epoch": 0.8070983178654292, "grad_norm": 0.02726336382329464, "grad_norm_var": 3.271181947394943e-06, "learning_rate": 0.00092594595702161, "loss": 2.3664, "step": 22263 }, { "crossentropy": 2.419340133666992, "epoch": 0.8071345707656613, "grad_norm": 0.025199709460139275, "grad_norm_var": 3.483000458147353e-06, "learning_rate": 0.0009256091123186328, "loss": 2.3405, "step": 22264 }, { "crossentropy": 2.476349353790283, "epoch": 0.8071708236658933, "grad_norm": 0.028933828696608543, "grad_norm_var": 3.628677842939136e-06, "learning_rate": 0.000925272322646486, "loss": 2.4441, "step": 22265 }, { "crossentropy": 2.418471574783325, "epoch": 0.8072070765661253, "grad_norm": 0.0266894344240427, "grad_norm_var": 3.6268505672206833e-06, "learning_rate": 0.0009249355880097204, "loss": 2.4425, "step": 22266 }, { "crossentropy": 2.4066832065582275, "epoch": 0.8072433294663574, "grad_norm": 0.026966262608766556, "grad_norm_var": 3.526893594437312e-06, "learning_rate": 0.0009245989084128853, "loss": 2.4532, "step": 22267 }, { "crossentropy": 2.3740665912628174, "epoch": 0.8072795823665894, "grad_norm": 0.026266023516654968, "grad_norm_var": 2.0185628347736076e-06, "learning_rate": 0.0009242622838605264, "loss": 2.4106, "step": 22268 }, { "crossentropy": 2.3768372535705566, "epoch": 0.8073158352668214, "grad_norm": 0.026405762881040573, "grad_norm_var": 2.0268646067217455e-06, "learning_rate": 0.0009239257143571894, "loss": 2.3984, "step": 22269 }, { "crossentropy": 2.2213597297668457, "epoch": 0.8073520881670534, "grad_norm": 0.027972716838121414, "grad_norm_var": 2.001171748531325e-06, "learning_rate": 0.0009235891999074219, "loss": 2.3234, "step": 22270 }, { "crossentropy": 2.4648325443267822, "epoch": 0.8073883410672854, "grad_norm": 0.027280017733573914, "grad_norm_var": 1.9874913533802012e-06, "learning_rate": 0.0009232527405157681, "loss": 2.4331, "step": 22271 }, { "crossentropy": 2.510509967803955, "epoch": 0.8074245939675174, "grad_norm": 0.02808629348874092, "grad_norm_var": 2.0347566703378943e-06, "learning_rate": 0.0009229163361867732, "loss": 2.51, "step": 22272 }, { "crossentropy": 2.403372287750244, "epoch": 0.8074608468677494, "grad_norm": 0.02596081793308258, "grad_norm_var": 2.1157751824927204e-06, "learning_rate": 0.0009225799869249795, "loss": 2.3417, "step": 22273 }, { "crossentropy": 2.444689989089966, "epoch": 0.8074970997679815, "grad_norm": 0.027058742940425873, "grad_norm_var": 2.0885296010877125e-06, "learning_rate": 0.0009222436927349309, "loss": 2.4226, "step": 22274 }, { "crossentropy": 2.3764870166778564, "epoch": 0.8075333526682135, "grad_norm": 0.025809265673160553, "grad_norm_var": 1.062345345125409e-06, "learning_rate": 0.0009219074536211703, "loss": 2.3681, "step": 22275 }, { "crossentropy": 2.430119514465332, "epoch": 0.8075696055684455, "grad_norm": 0.027192892506718636, "grad_norm_var": 9.816730271250032e-07, "learning_rate": 0.0009215712695882389, "loss": 2.4092, "step": 22276 }, { "crossentropy": 2.432506799697876, "epoch": 0.8076058584686775, "grad_norm": 0.026319993659853935, "grad_norm_var": 9.821840379712568e-07, "learning_rate": 0.000921235140640675, "loss": 2.3947, "step": 22277 }, { "crossentropy": 2.3945186138153076, "epoch": 0.8076421113689095, "grad_norm": 0.026685981079936028, "grad_norm_var": 8.743886864208955e-07, "learning_rate": 0.0009208990667830214, "loss": 2.4303, "step": 22278 }, { "crossentropy": 2.572453737258911, "epoch": 0.8076783642691415, "grad_norm": 0.026468489319086075, "grad_norm_var": 8.733212022403458e-07, "learning_rate": 0.0009205630480198151, "loss": 2.5487, "step": 22279 }, { "crossentropy": 2.4105656147003174, "epoch": 0.8077146171693735, "grad_norm": 0.02764841914176941, "grad_norm_var": 7.15470054816443e-07, "learning_rate": 0.0009202270843555971, "loss": 2.494, "step": 22280 }, { "crossentropy": 2.5103557109832764, "epoch": 0.8077508700696056, "grad_norm": 0.025792120024561882, "grad_norm_var": 5.156180683559778e-07, "learning_rate": 0.0009198911757949019, "loss": 2.4607, "step": 22281 }, { "crossentropy": 2.301062822341919, "epoch": 0.8077871229698376, "grad_norm": 0.02665158547461033, "grad_norm_var": 5.162035127850243e-07, "learning_rate": 0.0009195553223422687, "loss": 2.3774, "step": 22282 }, { "crossentropy": 2.3747317790985107, "epoch": 0.8078233758700696, "grad_norm": 0.027103964239358902, "grad_norm_var": 5.207104632281989e-07, "learning_rate": 0.0009192195240022338, "loss": 2.4251, "step": 22283 }, { "crossentropy": 2.3392868041992188, "epoch": 0.8078596287703016, "grad_norm": 0.025866173207759857, "grad_norm_var": 5.58848144042639e-07, "learning_rate": 0.000918883780779331, "loss": 2.4576, "step": 22284 }, { "crossentropy": 2.3612334728240967, "epoch": 0.8078958816705336, "grad_norm": 0.02666160650551319, "grad_norm_var": 5.505498499818369e-07, "learning_rate": 0.0009185480926780976, "loss": 2.4502, "step": 22285 }, { "crossentropy": 2.6654722690582275, "epoch": 0.8079321345707656, "grad_norm": 0.028016716241836548, "grad_norm_var": 5.576390284713882e-07, "learning_rate": 0.0009182124597030655, "loss": 2.5313, "step": 22286 }, { "crossentropy": 2.443323850631714, "epoch": 0.8079683874709976, "grad_norm": 0.028885943815112114, "grad_norm_var": 8.242448418822503e-07, "learning_rate": 0.000917876881858768, "loss": 2.4508, "step": 22287 }, { "crossentropy": 2.3057618141174316, "epoch": 0.8080046403712297, "grad_norm": 0.026095140725374222, "grad_norm_var": 7.53923219377533e-07, "learning_rate": 0.0009175413591497389, "loss": 2.3583, "step": 22288 }, { "crossentropy": 2.5029666423797607, "epoch": 0.8080408932714617, "grad_norm": 0.026807501912117004, "grad_norm_var": 7.080990192902808e-07, "learning_rate": 0.0009172058915805076, "loss": 2.5011, "step": 22289 }, { "crossentropy": 2.406525135040283, "epoch": 0.8080771461716937, "grad_norm": 0.02792060561478138, "grad_norm_var": 7.823579737525321e-07, "learning_rate": 0.0009168704791556065, "loss": 2.3846, "step": 22290 }, { "crossentropy": 2.451585292816162, "epoch": 0.8081133990719258, "grad_norm": 0.026357626542448997, "grad_norm_var": 7.235670997744602e-07, "learning_rate": 0.0009165351218795675, "loss": 2.4529, "step": 22291 }, { "crossentropy": 2.503365993499756, "epoch": 0.8081496519721578, "grad_norm": 0.0279365424066782, "grad_norm_var": 7.867085186191811e-07, "learning_rate": 0.0009161998197569176, "loss": 2.5393, "step": 22292 }, { "crossentropy": 2.472898483276367, "epoch": 0.8081859048723898, "grad_norm": 0.02696685679256916, "grad_norm_var": 7.584242240679055e-07, "learning_rate": 0.0009158645727921871, "loss": 2.4806, "step": 22293 }, { "crossentropy": 2.431218385696411, "epoch": 0.8082221577726219, "grad_norm": 0.02686452865600586, "grad_norm_var": 7.531414938690325e-07, "learning_rate": 0.000915529380989904, "loss": 2.4838, "step": 22294 }, { "crossentropy": 2.407846450805664, "epoch": 0.8082584106728539, "grad_norm": 0.027539901435375214, "grad_norm_var": 7.485665584000892e-07, "learning_rate": 0.0009151942443545936, "loss": 2.4401, "step": 22295 }, { "crossentropy": 2.4697508811950684, "epoch": 0.8082946635730859, "grad_norm": 0.027511660009622574, "grad_norm_var": 7.391828532939126e-07, "learning_rate": 0.0009148591628907848, "loss": 2.5319, "step": 22296 }, { "crossentropy": 2.524263620376587, "epoch": 0.8083309164733179, "grad_norm": 0.028007378801703453, "grad_norm_var": 6.7106157126684e-07, "learning_rate": 0.0009145241366030016, "loss": 2.4966, "step": 22297 }, { "crossentropy": 2.5136287212371826, "epoch": 0.8083671693735499, "grad_norm": 0.02683020755648613, "grad_norm_var": 6.60003823667538e-07, "learning_rate": 0.0009141891654957696, "loss": 2.4817, "step": 22298 }, { "crossentropy": 2.571676731109619, "epoch": 0.8084034222737819, "grad_norm": 0.026832591742277145, "grad_norm_var": 6.684711439093691e-07, "learning_rate": 0.0009138542495736152, "loss": 2.5574, "step": 22299 }, { "crossentropy": 2.6306543350219727, "epoch": 0.8084396751740139, "grad_norm": 0.02750825323164463, "grad_norm_var": 5.463194045655812e-07, "learning_rate": 0.0009135193888410581, "loss": 2.5315, "step": 22300 }, { "crossentropy": 2.337531805038452, "epoch": 0.808475928074246, "grad_norm": 0.026987852528691292, "grad_norm_var": 5.253567036154922e-07, "learning_rate": 0.0009131845833026248, "loss": 2.3495, "step": 22301 }, { "crossentropy": 2.4345932006835938, "epoch": 0.808512180974478, "grad_norm": 0.02657516673207283, "grad_norm_var": 5.20713346107334e-07, "learning_rate": 0.0009128498329628354, "loss": 2.4912, "step": 22302 }, { "crossentropy": 2.4394593238830566, "epoch": 0.80854843387471, "grad_norm": 0.026137687265872955, "grad_norm_var": 3.8477957490753945e-07, "learning_rate": 0.0009125151378262103, "loss": 2.4911, "step": 22303 }, { "crossentropy": 2.6221015453338623, "epoch": 0.808584686774942, "grad_norm": 0.027184106409549713, "grad_norm_var": 3.195323109730774e-07, "learning_rate": 0.0009121804978972725, "loss": 2.5049, "step": 22304 }, { "crossentropy": 2.3913426399230957, "epoch": 0.808620939675174, "grad_norm": 0.026967791840434074, "grad_norm_var": 3.143946680272379e-07, "learning_rate": 0.0009118459131805395, "loss": 2.4291, "step": 22305 }, { "crossentropy": 2.574371337890625, "epoch": 0.808657192575406, "grad_norm": 0.026365172117948532, "grad_norm_var": 3.022729020146862e-07, "learning_rate": 0.0009115113836805311, "loss": 2.535, "step": 22306 }, { "crossentropy": 2.496696949005127, "epoch": 0.808693445475638, "grad_norm": 0.026322321966290474, "grad_norm_var": 3.0554330721446234e-07, "learning_rate": 0.0009111769094017674, "loss": 2.5368, "step": 22307 }, { "crossentropy": 2.435173988342285, "epoch": 0.80872969837587, "grad_norm": 0.0270660612732172, "grad_norm_var": 2.481056940055315e-07, "learning_rate": 0.0009108424903487634, "loss": 2.4189, "step": 22308 }, { "crossentropy": 2.431040048599243, "epoch": 0.8087659512761021, "grad_norm": 0.028258727863430977, "grad_norm_var": 3.502841193318695e-07, "learning_rate": 0.0009105081265260379, "loss": 2.4693, "step": 22309 }, { "crossentropy": 2.4887945652008057, "epoch": 0.8088022041763341, "grad_norm": 0.025753004476428032, "grad_norm_var": 4.564660075441947e-07, "learning_rate": 0.0009101738179381069, "loss": 2.4687, "step": 22310 }, { "crossentropy": 2.39166522026062, "epoch": 0.8088384570765661, "grad_norm": 0.02774362824857235, "grad_norm_var": 4.7398394790314156e-07, "learning_rate": 0.000909839564589483, "loss": 2.4684, "step": 22311 }, { "crossentropy": 2.484769582748413, "epoch": 0.8088747099767981, "grad_norm": 0.027411628514528275, "grad_norm_var": 4.6782808315440755e-07, "learning_rate": 0.0009095053664846847, "loss": 2.5118, "step": 22312 }, { "crossentropy": 2.3530466556549072, "epoch": 0.8089109628770301, "grad_norm": 0.02628452703356743, "grad_norm_var": 4.2123797911319585e-07, "learning_rate": 0.0009091712236282223, "loss": 2.3358, "step": 22313 }, { "crossentropy": 2.4967668056488037, "epoch": 0.8089472157772621, "grad_norm": 0.02650545723736286, "grad_norm_var": 4.303879148599323e-07, "learning_rate": 0.0009088371360246106, "loss": 2.3951, "step": 22314 }, { "crossentropy": 2.599990129470825, "epoch": 0.8089834686774942, "grad_norm": 0.026933850720524788, "grad_norm_var": 4.305372142461109e-07, "learning_rate": 0.0009085031036783636, "loss": 2.4428, "step": 22315 }, { "crossentropy": 2.474388837814331, "epoch": 0.8090197215777262, "grad_norm": 0.02813028171658516, "grad_norm_var": 5.072127366911496e-07, "learning_rate": 0.0009081691265939895, "loss": 2.4797, "step": 22316 }, { "crossentropy": 2.5191783905029297, "epoch": 0.8090559744779582, "grad_norm": 0.027480464428663254, "grad_norm_var": 5.272167361758743e-07, "learning_rate": 0.0009078352047760024, "loss": 2.5313, "step": 22317 }, { "crossentropy": 2.5562565326690674, "epoch": 0.8090922273781903, "grad_norm": 0.02943774312734604, "grad_norm_var": 8.982093286043214e-07, "learning_rate": 0.000907501338228911, "loss": 2.5331, "step": 22318 }, { "crossentropy": 2.464986562728882, "epoch": 0.8091284802784223, "grad_norm": 0.02673415094614029, "grad_norm_var": 8.420126063619915e-07, "learning_rate": 0.0009071675269572233, "loss": 2.4298, "step": 22319 }, { "crossentropy": 2.414254903793335, "epoch": 0.8091647331786543, "grad_norm": 0.02625604346394539, "grad_norm_var": 8.930072480816652e-07, "learning_rate": 0.0009068337709654506, "loss": 2.3653, "step": 22320 }, { "crossentropy": 2.393697500228882, "epoch": 0.8092009860788864, "grad_norm": 0.026923734694719315, "grad_norm_var": 8.93923862182649e-07, "learning_rate": 0.0009065000702580978, "loss": 2.4301, "step": 22321 }, { "crossentropy": 2.571800470352173, "epoch": 0.8092372389791184, "grad_norm": 0.02714107371866703, "grad_norm_var": 8.554858136775497e-07, "learning_rate": 0.0009061664248396739, "loss": 2.5348, "step": 22322 }, { "crossentropy": 2.472278118133545, "epoch": 0.8092734918793504, "grad_norm": 0.02730461023747921, "grad_norm_var": 8.075306154070015e-07, "learning_rate": 0.0009058328347146866, "loss": 2.5083, "step": 22323 }, { "crossentropy": 2.4639129638671875, "epoch": 0.8093097447795824, "grad_norm": 0.026023028418421745, "grad_norm_var": 8.955865237016598e-07, "learning_rate": 0.0009054992998876382, "loss": 2.468, "step": 22324 }, { "crossentropy": 2.330007791519165, "epoch": 0.8093459976798144, "grad_norm": 0.02680707350373268, "grad_norm_var": 8.117500576686053e-07, "learning_rate": 0.0009051658203630369, "loss": 2.3224, "step": 22325 }, { "crossentropy": 2.299621820449829, "epoch": 0.8093822505800464, "grad_norm": 0.02615036629140377, "grad_norm_var": 7.52668927975541e-07, "learning_rate": 0.0009048323961453841, "loss": 2.3974, "step": 22326 }, { "crossentropy": 2.4412753582000732, "epoch": 0.8094185034802784, "grad_norm": 0.027200710028409958, "grad_norm_var": 7.229961714016094e-07, "learning_rate": 0.0009044990272391862, "loss": 2.4124, "step": 22327 }, { "crossentropy": 2.409209966659546, "epoch": 0.8094547563805105, "grad_norm": 0.027187464758753777, "grad_norm_var": 7.151876436932523e-07, "learning_rate": 0.0009041657136489434, "loss": 2.4587, "step": 22328 }, { "crossentropy": 2.3922955989837646, "epoch": 0.8094910092807425, "grad_norm": 0.025878075510263443, "grad_norm_var": 7.659823451543058e-07, "learning_rate": 0.0009038324553791572, "loss": 2.4029, "step": 22329 }, { "crossentropy": 2.331357955932617, "epoch": 0.8095272621809745, "grad_norm": 0.026222096756100655, "grad_norm_var": 7.899074543536577e-07, "learning_rate": 0.0009034992524343294, "loss": 2.3533, "step": 22330 }, { "crossentropy": 2.4690418243408203, "epoch": 0.8095635150812065, "grad_norm": 0.026502249762415886, "grad_norm_var": 8.04675990247874e-07, "learning_rate": 0.0009031661048189627, "loss": 2.5023, "step": 22331 }, { "crossentropy": 2.5188238620758057, "epoch": 0.8095997679814385, "grad_norm": 0.025274019688367844, "grad_norm_var": 8.693375997865232e-07, "learning_rate": 0.0009028330125375534, "loss": 2.4598, "step": 22332 }, { "crossentropy": 2.2677621841430664, "epoch": 0.8096360208816705, "grad_norm": 0.026462528854608536, "grad_norm_var": 8.393932543602437e-07, "learning_rate": 0.0009024999755946034, "loss": 2.3101, "step": 22333 }, { "crossentropy": 2.4317429065704346, "epoch": 0.8096722737819025, "grad_norm": 0.026530280709266663, "grad_norm_var": 3.137978952966278e-07, "learning_rate": 0.0009021669939946081, "loss": 2.4297, "step": 22334 }, { "crossentropy": 2.4880905151367188, "epoch": 0.8097085266821346, "grad_norm": 0.026669511571526527, "grad_norm_var": 3.123628405883594e-07, "learning_rate": 0.0009018340677420672, "loss": 2.4761, "step": 22335 }, { "crossentropy": 2.528899669647217, "epoch": 0.8097447795823666, "grad_norm": 0.026687821373343468, "grad_norm_var": 3.0805283988536747e-07, "learning_rate": 0.0009015011968414771, "loss": 2.4667, "step": 22336 }, { "crossentropy": 2.467024087905884, "epoch": 0.8097810324825986, "grad_norm": 0.02628055214881897, "grad_norm_var": 3.0273993689143055e-07, "learning_rate": 0.0009011683812973309, "loss": 2.5033, "step": 22337 }, { "crossentropy": 2.378317356109619, "epoch": 0.8098172853828306, "grad_norm": 0.028855014592409134, "grad_norm_var": 6.282497697148355e-07, "learning_rate": 0.0009008356211141266, "loss": 2.4691, "step": 22338 }, { "crossentropy": 2.460644006729126, "epoch": 0.8098535382830626, "grad_norm": 0.02741307206451893, "grad_norm_var": 6.387812534077696e-07, "learning_rate": 0.0009005029162963591, "loss": 2.3507, "step": 22339 }, { "crossentropy": 2.4796142578125, "epoch": 0.8098897911832946, "grad_norm": 0.026819441467523575, "grad_norm_var": 6.135461528763841e-07, "learning_rate": 0.000900170266848519, "loss": 2.4897, "step": 22340 }, { "crossentropy": 2.4219279289245605, "epoch": 0.8099260440835266, "grad_norm": 0.026223545894026756, "grad_norm_var": 6.252340168492905e-07, "learning_rate": 0.0008998376727751029, "loss": 2.4562, "step": 22341 }, { "crossentropy": 2.483008623123169, "epoch": 0.8099622969837587, "grad_norm": 0.028060125187039375, "grad_norm_var": 7.266470030548033e-07, "learning_rate": 0.0008995051340806005, "loss": 2.4135, "step": 22342 }, { "crossentropy": 2.380777359008789, "epoch": 0.8099985498839907, "grad_norm": 0.02748098596930504, "grad_norm_var": 7.477772856830563e-07, "learning_rate": 0.000899172650769503, "loss": 2.4588, "step": 22343 }, { "crossentropy": 2.4462077617645264, "epoch": 0.8100348027842227, "grad_norm": 0.025789368897676468, "grad_norm_var": 7.94765762790805e-07, "learning_rate": 0.0008988402228463055, "loss": 2.3727, "step": 22344 }, { "crossentropy": 2.5440828800201416, "epoch": 0.8100710556844548, "grad_norm": 0.025674546137452126, "grad_norm_var": 8.195725207507087e-07, "learning_rate": 0.0008985078503154914, "loss": 2.4261, "step": 22345 }, { "crossentropy": 2.5921144485473633, "epoch": 0.8101073085846868, "grad_norm": 0.02675367332994938, "grad_norm_var": 8.044899716054428e-07, "learning_rate": 0.0008981755331815534, "loss": 2.5659, "step": 22346 }, { "crossentropy": 2.522197723388672, "epoch": 0.8101435614849188, "grad_norm": 0.02617410011589527, "grad_norm_var": 8.206290925995285e-07, "learning_rate": 0.0008978432714489803, "loss": 2.4999, "step": 22347 }, { "crossentropy": 2.434483051300049, "epoch": 0.8101798143851509, "grad_norm": 0.026918785646557808, "grad_norm_var": 6.776916897312565e-07, "learning_rate": 0.0008975110651222584, "loss": 2.433, "step": 22348 }, { "crossentropy": 2.5007529258728027, "epoch": 0.8102160672853829, "grad_norm": 0.02698729932308197, "grad_norm_var": 6.713196040889022e-07, "learning_rate": 0.0008971789142058761, "loss": 2.436, "step": 22349 }, { "crossentropy": 2.5012497901916504, "epoch": 0.8102523201856149, "grad_norm": 0.02752639167010784, "grad_norm_var": 6.93210795384841e-07, "learning_rate": 0.0008968468187043177, "loss": 2.4231, "step": 22350 }, { "crossentropy": 2.442490339279175, "epoch": 0.8102885730858469, "grad_norm": 0.026981374248862267, "grad_norm_var": 6.89928232369165e-07, "learning_rate": 0.0008965147786220701, "loss": 2.3774, "step": 22351 }, { "crossentropy": 2.484049081802368, "epoch": 0.8103248259860789, "grad_norm": 0.027202408760786057, "grad_norm_var": 6.909507574431959e-07, "learning_rate": 0.0008961827939636197, "loss": 2.4819, "step": 22352 }, { "crossentropy": 2.535946846008301, "epoch": 0.8103610788863109, "grad_norm": 0.026961704716086388, "grad_norm_var": 6.594860111494571e-07, "learning_rate": 0.0008958508647334462, "loss": 2.5147, "step": 22353 }, { "crossentropy": 2.554698944091797, "epoch": 0.8103973317865429, "grad_norm": 0.02933921106159687, "grad_norm_var": 7.946166470813258e-07, "learning_rate": 0.000895518990936035, "loss": 2.4741, "step": 22354 }, { "crossentropy": 2.4967286586761475, "epoch": 0.810433584686775, "grad_norm": 0.02696583792567253, "grad_norm_var": 7.83626380573138e-07, "learning_rate": 0.0008951871725758698, "loss": 2.5521, "step": 22355 }, { "crossentropy": 2.374418258666992, "epoch": 0.810469837587007, "grad_norm": 0.026209620758891106, "grad_norm_var": 8.2083251866509e-07, "learning_rate": 0.0008948554096574296, "loss": 2.3462, "step": 22356 }, { "crossentropy": 2.3156676292419434, "epoch": 0.810506090487239, "grad_norm": 0.029225235804915428, "grad_norm_var": 1.0919959048176816e-06, "learning_rate": 0.0008945237021851987, "loss": 2.4299, "step": 22357 }, { "crossentropy": 2.4469404220581055, "epoch": 0.810542343387471, "grad_norm": 0.02649068832397461, "grad_norm_var": 1.0535374173175256e-06, "learning_rate": 0.0008941920501636541, "loss": 2.3988, "step": 22358 }, { "crossentropy": 2.5598504543304443, "epoch": 0.810578596287703, "grad_norm": 0.026708131656050682, "grad_norm_var": 1.0456920735584975e-06, "learning_rate": 0.0008938604535972761, "loss": 2.5427, "step": 22359 }, { "crossentropy": 2.2779150009155273, "epoch": 0.810614849187935, "grad_norm": 0.02676893025636673, "grad_norm_var": 9.482929340361938e-07, "learning_rate": 0.0008935289124905471, "loss": 2.3381, "step": 22360 }, { "crossentropy": 2.398329257965088, "epoch": 0.810651102088167, "grad_norm": 0.026173820719122887, "grad_norm_var": 8.719428574107396e-07, "learning_rate": 0.0008931974268479392, "loss": 2.3867, "step": 22361 }, { "crossentropy": 2.2964658737182617, "epoch": 0.8106873549883991, "grad_norm": 0.0262753963470459, "grad_norm_var": 9.074769177973748e-07, "learning_rate": 0.0008928659966739323, "loss": 2.3723, "step": 22362 }, { "crossentropy": 2.5168511867523193, "epoch": 0.8107236078886311, "grad_norm": 0.027245448902249336, "grad_norm_var": 8.531218622861991e-07, "learning_rate": 0.0008925346219730041, "loss": 2.5269, "step": 22363 }, { "crossentropy": 2.582650661468506, "epoch": 0.8107598607888631, "grad_norm": 0.027768423780798912, "grad_norm_var": 8.750182422906718e-07, "learning_rate": 0.0008922033027496279, "loss": 2.5263, "step": 22364 }, { "crossentropy": 2.464461088180542, "epoch": 0.8107961136890951, "grad_norm": 0.028003597632050514, "grad_norm_var": 9.138840492657936e-07, "learning_rate": 0.0008918720390082812, "loss": 2.447, "step": 22365 }, { "crossentropy": 2.54017972946167, "epoch": 0.8108323665893271, "grad_norm": 0.027435095980763435, "grad_norm_var": 9.109235381482879e-07, "learning_rate": 0.0008915408307534356, "loss": 2.5819, "step": 22366 }, { "crossentropy": 2.4406020641326904, "epoch": 0.8108686194895591, "grad_norm": 0.026029257103800774, "grad_norm_var": 9.997387510043599e-07, "learning_rate": 0.0008912096779895662, "loss": 2.4712, "step": 22367 }, { "crossentropy": 2.4178128242492676, "epoch": 0.8109048723897911, "grad_norm": 0.02624119073152542, "grad_norm_var": 1.0539947409308648e-06, "learning_rate": 0.0008908785807211467, "loss": 2.4553, "step": 22368 }, { "crossentropy": 2.5375685691833496, "epoch": 0.8109411252900232, "grad_norm": 0.027048511430621147, "grad_norm_var": 1.0526902777069677e-06, "learning_rate": 0.0008905475389526474, "loss": 2.5754, "step": 22369 }, { "crossentropy": 2.3257274627685547, "epoch": 0.8109773781902552, "grad_norm": 0.026422366499900818, "grad_norm_var": 7.21564143459028e-07, "learning_rate": 0.0008902165526885386, "loss": 2.3722, "step": 22370 }, { "crossentropy": 2.501155376434326, "epoch": 0.8110136310904872, "grad_norm": 0.026675958186388016, "grad_norm_var": 7.257486664121714e-07, "learning_rate": 0.0008898856219332935, "loss": 2.4928, "step": 22371 }, { "crossentropy": 2.4338443279266357, "epoch": 0.8110498839907193, "grad_norm": 0.02749563381075859, "grad_norm_var": 7.072874766228388e-07, "learning_rate": 0.0008895547466913795, "loss": 2.4766, "step": 22372 }, { "crossentropy": 2.304687023162842, "epoch": 0.8110861368909513, "grad_norm": 0.02847229689359665, "grad_norm_var": 5.193724725219631e-07, "learning_rate": 0.0008892239269672669, "loss": 2.3874, "step": 22373 }, { "crossentropy": 2.401871681213379, "epoch": 0.8111223897911833, "grad_norm": 0.027107002213597298, "grad_norm_var": 5.05087441850802e-07, "learning_rate": 0.0008888931627654229, "loss": 2.3658, "step": 22374 }, { "crossentropy": 2.469966411590576, "epoch": 0.8111586426914154, "grad_norm": 0.027262168005108833, "grad_norm_var": 5.033067550360189e-07, "learning_rate": 0.0008885624540903153, "loss": 2.5167, "step": 22375 }, { "crossentropy": 2.3563077449798584, "epoch": 0.8111948955916474, "grad_norm": 0.027192333713173866, "grad_norm_var": 4.999664952949507e-07, "learning_rate": 0.0008882318009464125, "loss": 2.365, "step": 22376 }, { "crossentropy": 2.464379072189331, "epoch": 0.8112311484918794, "grad_norm": 0.027115147560834885, "grad_norm_var": 4.449975642966854e-07, "learning_rate": 0.0008879012033381789, "loss": 2.441, "step": 22377 }, { "crossentropy": 2.462873935699463, "epoch": 0.8112674013921114, "grad_norm": 0.026404304429888725, "grad_norm_var": 4.316591478123003e-07, "learning_rate": 0.000887570661270079, "loss": 2.3591, "step": 22378 }, { "crossentropy": 2.5084187984466553, "epoch": 0.8113036542923434, "grad_norm": 0.027313601225614548, "grad_norm_var": 4.330901126054724e-07, "learning_rate": 0.0008872401747465792, "loss": 2.5101, "step": 22379 }, { "crossentropy": 2.447467088699341, "epoch": 0.8113399071925754, "grad_norm": 0.02621009573340416, "grad_norm_var": 4.5100528894906343e-07, "learning_rate": 0.0008869097437721407, "loss": 2.4434, "step": 22380 }, { "crossentropy": 2.3149542808532715, "epoch": 0.8113761600928074, "grad_norm": 0.025408172979950905, "grad_norm_var": 5.339871560453476e-07, "learning_rate": 0.000886579368351229, "loss": 2.3265, "step": 22381 }, { "crossentropy": 2.4914097785949707, "epoch": 0.8114124129930395, "grad_norm": 0.027767052873969078, "grad_norm_var": 5.661263258640557e-07, "learning_rate": 0.0008862490484883046, "loss": 2.5109, "step": 22382 }, { "crossentropy": 2.4494266510009766, "epoch": 0.8114486658932715, "grad_norm": 0.027475552633404732, "grad_norm_var": 5.317796603445298e-07, "learning_rate": 0.0008859187841878291, "loss": 2.4562, "step": 22383 }, { "crossentropy": 2.7013375759124756, "epoch": 0.8114849187935035, "grad_norm": 0.027929434552788734, "grad_norm_var": 5.445750212419576e-07, "learning_rate": 0.0008855885754542653, "loss": 2.5566, "step": 22384 }, { "crossentropy": 2.598299741744995, "epoch": 0.8115211716937355, "grad_norm": 0.026349859312176704, "grad_norm_var": 5.781297730610696e-07, "learning_rate": 0.0008852584222920707, "loss": 2.5416, "step": 22385 }, { "crossentropy": 2.364980697631836, "epoch": 0.8115574245939675, "grad_norm": 0.026086725294589996, "grad_norm_var": 6.127020078000908e-07, "learning_rate": 0.000884928324705706, "loss": 2.3788, "step": 22386 }, { "crossentropy": 2.4694297313690186, "epoch": 0.8115936774941995, "grad_norm": 0.02743339352309704, "grad_norm_var": 6.141585344179394e-07, "learning_rate": 0.0008845982826996297, "loss": 2.4601, "step": 22387 }, { "crossentropy": 2.392252206802368, "epoch": 0.8116299303944315, "grad_norm": 0.02599981054663658, "grad_norm_var": 6.67899827761174e-07, "learning_rate": 0.0008842682962782977, "loss": 2.4345, "step": 22388 }, { "crossentropy": 2.394998550415039, "epoch": 0.8116661832946636, "grad_norm": 0.026284612715244293, "grad_norm_var": 5.289423679724163e-07, "learning_rate": 0.0008839383654461691, "loss": 2.3816, "step": 22389 }, { "crossentropy": 2.4506521224975586, "epoch": 0.8117024361948956, "grad_norm": 0.027001647278666496, "grad_norm_var": 5.257969910414546e-07, "learning_rate": 0.0008836084902076979, "loss": 2.4675, "step": 22390 }, { "crossentropy": 2.536492347717285, "epoch": 0.8117386890951276, "grad_norm": 0.02588379941880703, "grad_norm_var": 5.645864504764952e-07, "learning_rate": 0.0008832786705673413, "loss": 2.4557, "step": 22391 }, { "crossentropy": 2.4607181549072266, "epoch": 0.8117749419953596, "grad_norm": 0.026178905740380287, "grad_norm_var": 5.677864690321888e-07, "learning_rate": 0.0008829489065295548, "loss": 2.4051, "step": 22392 }, { "crossentropy": 2.464277505874634, "epoch": 0.8118111948955916, "grad_norm": 0.026882192119956017, "grad_norm_var": 5.575886905711484e-07, "learning_rate": 0.0008826191980987896, "loss": 2.4927, "step": 22393 }, { "crossentropy": 2.5302999019622803, "epoch": 0.8118474477958236, "grad_norm": 0.025760356336832047, "grad_norm_var": 6.057232749677454e-07, "learning_rate": 0.0008822895452795021, "loss": 2.525, "step": 22394 }, { "crossentropy": 2.4559268951416016, "epoch": 0.8118837006960556, "grad_norm": 0.028146319091320038, "grad_norm_var": 7.257581071927362e-07, "learning_rate": 0.0008819599480761425, "loss": 2.5062, "step": 22395 }, { "crossentropy": 2.5631866455078125, "epoch": 0.8119199535962877, "grad_norm": 0.027484653517603874, "grad_norm_var": 7.483048747010014e-07, "learning_rate": 0.0008816304064931624, "loss": 2.5292, "step": 22396 }, { "crossentropy": 2.383103609085083, "epoch": 0.8119562064965197, "grad_norm": 0.026573719456791878, "grad_norm_var": 6.239787353081213e-07, "learning_rate": 0.0008813009205350148, "loss": 2.4322, "step": 22397 }, { "crossentropy": 2.600283145904541, "epoch": 0.8119924593967517, "grad_norm": 0.026229167357087135, "grad_norm_var": 5.79115152043993e-07, "learning_rate": 0.0008809714902061472, "loss": 2.5434, "step": 22398 }, { "crossentropy": 2.4244139194488525, "epoch": 0.8120287122969838, "grad_norm": 0.027805734425783157, "grad_norm_var": 6.186958503665365e-07, "learning_rate": 0.0008806421155110105, "loss": 2.3905, "step": 22399 }, { "crossentropy": 2.387826442718506, "epoch": 0.8120649651972158, "grad_norm": 0.026429325342178345, "grad_norm_var": 5.2381640450864e-07, "learning_rate": 0.0008803127964540547, "loss": 2.4698, "step": 22400 }, { "crossentropy": 2.3887722492218018, "epoch": 0.8121012180974478, "grad_norm": 0.025355393067002296, "grad_norm_var": 6.265030772991161e-07, "learning_rate": 0.0008799835330397254, "loss": 2.3924, "step": 22401 }, { "crossentropy": 2.5524797439575195, "epoch": 0.8121374709976799, "grad_norm": 0.026438534259796143, "grad_norm_var": 6.103504040301608e-07, "learning_rate": 0.000879654325272472, "loss": 2.4526, "step": 22402 }, { "crossentropy": 2.3897225856781006, "epoch": 0.8121737238979119, "grad_norm": 0.02730339579284191, "grad_norm_var": 5.972729040666733e-07, "learning_rate": 0.0008793251731567392, "loss": 2.4676, "step": 22403 }, { "crossentropy": 2.3522825241088867, "epoch": 0.8122099767981439, "grad_norm": 0.02769337221980095, "grad_norm_var": 6.387808986421748e-07, "learning_rate": 0.0008789960766969729, "loss": 2.3154, "step": 22404 }, { "crossentropy": 2.554893970489502, "epoch": 0.8122462296983759, "grad_norm": 0.02678985334932804, "grad_norm_var": 6.256950822629165e-07, "learning_rate": 0.0008786670358976189, "loss": 2.5363, "step": 22405 }, { "crossentropy": 2.445181369781494, "epoch": 0.8122824825986079, "grad_norm": 0.02824365720152855, "grad_norm_var": 7.642315818772443e-07, "learning_rate": 0.00087833805076312, "loss": 2.4169, "step": 22406 }, { "crossentropy": 2.440624475479126, "epoch": 0.8123187354988399, "grad_norm": 0.02650103159248829, "grad_norm_var": 7.105923227310734e-07, "learning_rate": 0.0008780091212979207, "loss": 2.4653, "step": 22407 }, { "crossentropy": 2.5225796699523926, "epoch": 0.8123549883990719, "grad_norm": 0.026547003537416458, "grad_norm_var": 6.854623304621461e-07, "learning_rate": 0.0008776802475064649, "loss": 2.4745, "step": 22408 }, { "crossentropy": 2.566342830657959, "epoch": 0.812391241299304, "grad_norm": 0.02695915661752224, "grad_norm_var": 6.857885310055748e-07, "learning_rate": 0.0008773514293931916, "loss": 2.493, "step": 22409 }, { "crossentropy": 2.4601998329162598, "epoch": 0.812427494199536, "grad_norm": 0.025296658277511597, "grad_norm_var": 7.69148718277451e-07, "learning_rate": 0.0008770226669625447, "loss": 2.4091, "step": 22410 }, { "crossentropy": 2.437434673309326, "epoch": 0.812463747099768, "grad_norm": 0.02548336610198021, "grad_norm_var": 7.564556877207955e-07, "learning_rate": 0.0008766939602189638, "loss": 2.3948, "step": 22411 }, { "crossentropy": 2.4207723140716553, "epoch": 0.8125, "grad_norm": 0.029466308653354645, "grad_norm_var": 1.2103017398569129e-06, "learning_rate": 0.0008763653091668871, "loss": 2.4538, "step": 22412 }, { "crossentropy": 2.5279812812805176, "epoch": 0.812536252900232, "grad_norm": 0.02675752528011799, "grad_norm_var": 1.2063841909663026e-06, "learning_rate": 0.000876036713810755, "loss": 2.5535, "step": 22413 }, { "crossentropy": 2.434291362762451, "epoch": 0.812572505800464, "grad_norm": 0.028077760711312294, "grad_norm_var": 1.2715724446685316e-06, "learning_rate": 0.0008757081741550061, "loss": 2.4388, "step": 22414 }, { "crossentropy": 2.3893792629241943, "epoch": 0.812608758700696, "grad_norm": 0.02648378722369671, "grad_norm_var": 1.2293905375173585e-06, "learning_rate": 0.0008753796902040761, "loss": 2.476, "step": 22415 }, { "crossentropy": 2.399559497833252, "epoch": 0.8126450116009281, "grad_norm": 0.028304073959589005, "grad_norm_var": 1.3403708262774613e-06, "learning_rate": 0.000875051261962404, "loss": 2.3485, "step": 22416 }, { "crossentropy": 2.5074241161346436, "epoch": 0.8126812645011601, "grad_norm": 0.026614340022206306, "grad_norm_var": 1.1665051466371498e-06, "learning_rate": 0.0008747228894344233, "loss": 2.5252, "step": 22417 }, { "crossentropy": 2.537391185760498, "epoch": 0.8127175174013921, "grad_norm": 0.026482637971639633, "grad_norm_var": 1.1629722559922465e-06, "learning_rate": 0.0008743945726245706, "loss": 2.509, "step": 22418 }, { "crossentropy": 2.3876137733459473, "epoch": 0.8127537703016241, "grad_norm": 0.026710543781518936, "grad_norm_var": 1.16591668471327e-06, "learning_rate": 0.0008740663115372821, "loss": 2.3922, "step": 22419 }, { "crossentropy": 2.625901699066162, "epoch": 0.8127900232018561, "grad_norm": 0.027263915166258812, "grad_norm_var": 1.1392117885533048e-06, "learning_rate": 0.0008737381061769867, "loss": 2.4665, "step": 22420 }, { "crossentropy": 2.3055665493011475, "epoch": 0.8128262761020881, "grad_norm": 0.0258626751601696, "grad_norm_var": 1.218777602448442e-06, "learning_rate": 0.0008734099565481201, "loss": 2.4034, "step": 22421 }, { "crossentropy": 2.516634941101074, "epoch": 0.8128625290023201, "grad_norm": 0.026757460087537766, "grad_norm_var": 1.098673114031468e-06, "learning_rate": 0.0008730818626551157, "loss": 2.4757, "step": 22422 }, { "crossentropy": 2.602410316467285, "epoch": 0.8128987819025522, "grad_norm": 0.026645544916391373, "grad_norm_var": 1.093292535265152e-06, "learning_rate": 0.0008727538245024025, "loss": 2.5613, "step": 22423 }, { "crossentropy": 2.5889620780944824, "epoch": 0.8129350348027842, "grad_norm": 0.027291351929306984, "grad_norm_var": 1.097150195672802e-06, "learning_rate": 0.0008724258420944136, "loss": 2.4456, "step": 22424 }, { "crossentropy": 2.5579071044921875, "epoch": 0.8129712877030162, "grad_norm": 0.02816183865070343, "grad_norm_var": 1.1964668287701523e-06, "learning_rate": 0.0008720979154355757, "loss": 2.5425, "step": 22425 }, { "crossentropy": 2.394421100616455, "epoch": 0.8130075406032483, "grad_norm": 0.026535244658589363, "grad_norm_var": 1.0145612454430375e-06, "learning_rate": 0.0008717700445303201, "loss": 2.4912, "step": 22426 }, { "crossentropy": 2.4168243408203125, "epoch": 0.8130437935034803, "grad_norm": 0.026572197675704956, "grad_norm_var": 8.603257138719283e-07, "learning_rate": 0.000871442229383076, "loss": 2.4031, "step": 22427 }, { "crossentropy": 2.4458749294281006, "epoch": 0.8130800464037123, "grad_norm": 0.02650359645485878, "grad_norm_var": 4.837306477857428e-07, "learning_rate": 0.0008711144699982704, "loss": 2.4524, "step": 22428 }, { "crossentropy": 2.4292120933532715, "epoch": 0.8131162993039444, "grad_norm": 0.026209160685539246, "grad_norm_var": 5.157954614814127e-07, "learning_rate": 0.0008707867663803276, "loss": 2.3854, "step": 22429 }, { "crossentropy": 2.455859899520874, "epoch": 0.8131525522041764, "grad_norm": 0.025693072006106377, "grad_norm_var": 4.982505706643247e-07, "learning_rate": 0.000870459118533678, "loss": 2.452, "step": 22430 }, { "crossentropy": 2.4272677898406982, "epoch": 0.8131888051044084, "grad_norm": 0.02607964165508747, "grad_norm_var": 5.231120451515876e-07, "learning_rate": 0.000870131526462743, "loss": 2.467, "step": 22431 }, { "crossentropy": 2.456799030303955, "epoch": 0.8132250580046404, "grad_norm": 0.026114236563444138, "grad_norm_var": 3.6336143440368617e-07, "learning_rate": 0.0008698039901719501, "loss": 2.3164, "step": 22432 }, { "crossentropy": 2.4617865085601807, "epoch": 0.8132613109048724, "grad_norm": 0.026837868615984917, "grad_norm_var": 3.6710264630594043e-07, "learning_rate": 0.0008694765096657209, "loss": 2.4822, "step": 22433 }, { "crossentropy": 2.4502904415130615, "epoch": 0.8132975638051044, "grad_norm": 0.026204749941825867, "grad_norm_var": 3.765576444644814e-07, "learning_rate": 0.0008691490849484802, "loss": 2.4527, "step": 22434 }, { "crossentropy": 2.3218297958374023, "epoch": 0.8133338167053364, "grad_norm": 0.026601053774356842, "grad_norm_var": 3.755499463390169e-07, "learning_rate": 0.0008688217160246509, "loss": 2.4695, "step": 22435 }, { "crossentropy": 2.502751350402832, "epoch": 0.8133700696055685, "grad_norm": 0.027665482833981514, "grad_norm_var": 4.2206751895377627e-07, "learning_rate": 0.0008684944028986536, "loss": 2.4423, "step": 22436 }, { "crossentropy": 2.5813820362091064, "epoch": 0.8134063225058005, "grad_norm": 0.03127428889274597, "grad_norm_var": 1.7143036030110742e-06, "learning_rate": 0.0008681671455749079, "loss": 2.5222, "step": 22437 }, { "crossentropy": 2.3290867805480957, "epoch": 0.8134425754060325, "grad_norm": 0.027931977063417435, "grad_norm_var": 1.77089035667029e-06, "learning_rate": 0.0008678399440578366, "loss": 2.4573, "step": 22438 }, { "crossentropy": 2.597426652908325, "epoch": 0.8134788283062645, "grad_norm": 0.027577238157391548, "grad_norm_var": 1.778616513336513e-06, "learning_rate": 0.0008675127983518561, "loss": 2.5804, "step": 22439 }, { "crossentropy": 2.4433770179748535, "epoch": 0.8135150812064965, "grad_norm": 0.026438336819410324, "grad_norm_var": 1.799863570181986e-06, "learning_rate": 0.0008671857084613877, "loss": 2.4291, "step": 22440 }, { "crossentropy": 2.6110100746154785, "epoch": 0.8135513341067285, "grad_norm": 0.026640895754098892, "grad_norm_var": 1.7139003562866942e-06, "learning_rate": 0.0008668586743908474, "loss": 2.4865, "step": 22441 }, { "crossentropy": 2.5527169704437256, "epoch": 0.8135875870069605, "grad_norm": 0.02611093781888485, "grad_norm_var": 1.7474822212264091e-06, "learning_rate": 0.0008665316961446524, "loss": 2.5206, "step": 22442 }, { "crossentropy": 2.509467363357544, "epoch": 0.8136238399071926, "grad_norm": 0.026808081194758415, "grad_norm_var": 1.740542438639668e-06, "learning_rate": 0.000866204773727221, "loss": 2.4613, "step": 22443 }, { "crossentropy": 2.6175479888916016, "epoch": 0.8136600928074246, "grad_norm": 0.026166336610913277, "grad_norm_var": 1.7662936998184367e-06, "learning_rate": 0.0008658779071429662, "loss": 2.4861, "step": 22444 }, { "crossentropy": 2.3923425674438477, "epoch": 0.8136963457076566, "grad_norm": 0.026995569467544556, "grad_norm_var": 1.7328141706925611e-06, "learning_rate": 0.0008655510963963053, "loss": 2.4163, "step": 22445 }, { "crossentropy": 2.493338108062744, "epoch": 0.8137325986078886, "grad_norm": 0.02612895704805851, "grad_norm_var": 1.6718575463721293e-06, "learning_rate": 0.0008652243414916511, "loss": 2.3999, "step": 22446 }, { "crossentropy": 2.4106197357177734, "epoch": 0.8137688515081206, "grad_norm": 0.026295999065041542, "grad_norm_var": 1.648998114629705e-06, "learning_rate": 0.0008648976424334154, "loss": 2.4146, "step": 22447 }, { "crossentropy": 2.586120128631592, "epoch": 0.8138051044083526, "grad_norm": 0.02575000748038292, "grad_norm_var": 1.6996743472237503e-06, "learning_rate": 0.0008645709992260143, "loss": 2.4757, "step": 22448 }, { "crossentropy": 2.3507237434387207, "epoch": 0.8138413573085846, "grad_norm": 0.02691943757236004, "grad_norm_var": 1.6987158334692716e-06, "learning_rate": 0.0008642444118738557, "loss": 2.3995, "step": 22449 }, { "crossentropy": 2.4437150955200195, "epoch": 0.8138776102088167, "grad_norm": 0.026819001883268356, "grad_norm_var": 1.65967776352971e-06, "learning_rate": 0.0008639178803813524, "loss": 2.4378, "step": 22450 }, { "crossentropy": 2.4716567993164062, "epoch": 0.8139138631090487, "grad_norm": 0.02618079073727131, "grad_norm_var": 1.6935044335834956e-06, "learning_rate": 0.0008635914047529164, "loss": 2.4637, "step": 22451 }, { "crossentropy": 2.478505849838257, "epoch": 0.8139501160092807, "grad_norm": 0.027323653921484947, "grad_norm_var": 1.66963147236325e-06, "learning_rate": 0.0008632649849929542, "loss": 2.499, "step": 22452 }, { "crossentropy": 2.2976765632629395, "epoch": 0.8139863689095128, "grad_norm": 0.026392793282866478, "grad_norm_var": 3.509810335199902e-07, "learning_rate": 0.0008629386211058776, "loss": 2.3363, "step": 22453 }, { "crossentropy": 2.4855809211730957, "epoch": 0.8140226218097448, "grad_norm": 0.027569526806473732, "grad_norm_var": 2.974796235992999e-07, "learning_rate": 0.0008626123130960933, "loss": 2.4889, "step": 22454 }, { "crossentropy": 2.481241464614868, "epoch": 0.8140588747099768, "grad_norm": 0.026538841426372528, "grad_norm_var": 2.3404854130557967e-07, "learning_rate": 0.0008622860609680067, "loss": 2.5267, "step": 22455 }, { "crossentropy": 2.5320467948913574, "epoch": 0.8140951276102089, "grad_norm": 0.026144569739699364, "grad_norm_var": 2.4449938075368714e-07, "learning_rate": 0.0008619598647260279, "loss": 2.5237, "step": 22456 }, { "crossentropy": 2.4758315086364746, "epoch": 0.8141313805104409, "grad_norm": 0.02699960395693779, "grad_norm_var": 2.569323379250036e-07, "learning_rate": 0.0008616337243745592, "loss": 2.4638, "step": 22457 }, { "crossentropy": 2.5386338233947754, "epoch": 0.8141676334106729, "grad_norm": 0.02754051983356476, "grad_norm_var": 2.9687440909609053e-07, "learning_rate": 0.000861307639918007, "loss": 2.5083, "step": 22458 }, { "crossentropy": 2.485746145248413, "epoch": 0.8142038863109049, "grad_norm": 0.02663339674472809, "grad_norm_var": 2.9535250617855554e-07, "learning_rate": 0.0008609816113607777, "loss": 2.4499, "step": 22459 }, { "crossentropy": 2.4173970222473145, "epoch": 0.8142401392111369, "grad_norm": 0.026654819026589394, "grad_norm_var": 2.7876852130811976e-07, "learning_rate": 0.0008606556387072712, "loss": 2.4526, "step": 22460 }, { "crossentropy": 2.3171517848968506, "epoch": 0.8142763921113689, "grad_norm": 0.027453579008579254, "grad_norm_var": 3.1112191470337945e-07, "learning_rate": 0.0008603297219618933, "loss": 2.3774, "step": 22461 }, { "crossentropy": 2.372865915298462, "epoch": 0.8143126450116009, "grad_norm": 0.026554293930530548, "grad_norm_var": 2.8952841842477586e-07, "learning_rate": 0.0008600038611290445, "loss": 2.4104, "step": 22462 }, { "crossentropy": 2.4547719955444336, "epoch": 0.814348897911833, "grad_norm": 0.028651254251599312, "grad_norm_var": 4.981562265521725e-07, "learning_rate": 0.0008596780562131256, "loss": 2.4098, "step": 22463 }, { "crossentropy": 2.3062400817871094, "epoch": 0.814385150812065, "grad_norm": 0.026689104735851288, "grad_norm_var": 4.1142479999256576e-07, "learning_rate": 0.0008593523072185383, "loss": 2.3385, "step": 22464 }, { "crossentropy": 2.3275251388549805, "epoch": 0.814421403712297, "grad_norm": 0.026168961077928543, "grad_norm_var": 4.4884080255950564e-07, "learning_rate": 0.0008590266141496811, "loss": 2.3893, "step": 22465 }, { "crossentropy": 2.523271322250366, "epoch": 0.814457656612529, "grad_norm": 0.025756927207112312, "grad_norm_var": 5.30056239623986e-07, "learning_rate": 0.0008587009770109533, "loss": 2.4126, "step": 22466 }, { "crossentropy": 2.2917370796203613, "epoch": 0.814493909512761, "grad_norm": 0.025665324181318283, "grad_norm_var": 5.911647270693722e-07, "learning_rate": 0.0008583753958067548, "loss": 2.3765, "step": 22467 }, { "crossentropy": 2.415602684020996, "epoch": 0.814530162412993, "grad_norm": 0.027487928047776222, "grad_norm_var": 6.044070691290827e-07, "learning_rate": 0.0008580498705414802, "loss": 2.5258, "step": 22468 }, { "crossentropy": 2.5068576335906982, "epoch": 0.814566415313225, "grad_norm": 0.02619919925928116, "grad_norm_var": 6.17424179479425e-07, "learning_rate": 0.000857724401219529, "loss": 2.4316, "step": 22469 }, { "crossentropy": 2.4860897064208984, "epoch": 0.8146026682134571, "grad_norm": 0.02660408616065979, "grad_norm_var": 5.75879861958621e-07, "learning_rate": 0.0008573989878452959, "loss": 2.4855, "step": 22470 }, { "crossentropy": 2.447404146194458, "epoch": 0.8146389211136891, "grad_norm": 0.0262571033090353, "grad_norm_var": 5.881682964193706e-07, "learning_rate": 0.0008570736304231752, "loss": 2.4298, "step": 22471 }, { "crossentropy": 2.5736441612243652, "epoch": 0.8146751740139211, "grad_norm": 0.029432019218802452, "grad_norm_var": 1.0130250202462314e-06, "learning_rate": 0.0008567483289575634, "loss": 2.5281, "step": 22472 }, { "crossentropy": 2.4858956336975098, "epoch": 0.8147114269141531, "grad_norm": 0.028760170564055443, "grad_norm_var": 1.225023544282203e-06, "learning_rate": 0.0008564230834528513, "loss": 2.466, "step": 22473 }, { "crossentropy": 2.543764591217041, "epoch": 0.8147476798143851, "grad_norm": 0.02769913710653782, "grad_norm_var": 1.2373550595481374e-06, "learning_rate": 0.0008560978939134339, "loss": 2.5444, "step": 22474 }, { "crossentropy": 2.4968421459198, "epoch": 0.8147839327146171, "grad_norm": 0.026539035141468048, "grad_norm_var": 1.2430487348432638e-06, "learning_rate": 0.0008557727603437043, "loss": 2.4984, "step": 22475 }, { "crossentropy": 2.3871726989746094, "epoch": 0.8148201856148491, "grad_norm": 0.026524877175688744, "grad_norm_var": 1.2507049105212515e-06, "learning_rate": 0.0008554476827480517, "loss": 2.4595, "step": 22476 }, { "crossentropy": 2.6507961750030518, "epoch": 0.8148564385150812, "grad_norm": 0.029197625815868378, "grad_norm_var": 1.5398477392844963e-06, "learning_rate": 0.0008551226611308682, "loss": 2.5597, "step": 22477 }, { "crossentropy": 2.3594179153442383, "epoch": 0.8148926914153132, "grad_norm": 0.026840107515454292, "grad_norm_var": 1.5227590822131417e-06, "learning_rate": 0.0008547976954965442, "loss": 2.4536, "step": 22478 }, { "crossentropy": 2.376380681991577, "epoch": 0.8149289443155452, "grad_norm": 0.026150833815336227, "grad_norm_var": 1.4145314261165253e-06, "learning_rate": 0.0008544727858494661, "loss": 2.4366, "step": 22479 }, { "crossentropy": 2.4063308238983154, "epoch": 0.8149651972157773, "grad_norm": 0.02534579299390316, "grad_norm_var": 1.582687219337991e-06, "learning_rate": 0.0008541479321940254, "loss": 2.417, "step": 22480 }, { "crossentropy": 2.5095036029815674, "epoch": 0.8150014501160093, "grad_norm": 0.026705194264650345, "grad_norm_var": 1.5473673158559644e-06, "learning_rate": 0.0008538231345346071, "loss": 2.487, "step": 22481 }, { "crossentropy": 2.506399393081665, "epoch": 0.8150377030162413, "grad_norm": 0.026224061846733093, "grad_norm_var": 1.4868304944725779e-06, "learning_rate": 0.0008534983928755996, "loss": 2.5149, "step": 22482 }, { "crossentropy": 2.4929816722869873, "epoch": 0.8150739559164734, "grad_norm": 0.026112955063581467, "grad_norm_var": 1.4210657667719506e-06, "learning_rate": 0.0008531737072213897, "loss": 2.4775, "step": 22483 }, { "crossentropy": 2.5270509719848633, "epoch": 0.8151102088167054, "grad_norm": 0.02580060437321663, "grad_norm_var": 1.4903614235624162e-06, "learning_rate": 0.0008528490775763608, "loss": 2.431, "step": 22484 }, { "crossentropy": 2.4478278160095215, "epoch": 0.8151464617169374, "grad_norm": 0.02646794728934765, "grad_norm_var": 1.469779791834815e-06, "learning_rate": 0.0008525245039449004, "loss": 2.513, "step": 22485 }, { "crossentropy": 2.45696759223938, "epoch": 0.8151827146171694, "grad_norm": 0.02680005319416523, "grad_norm_var": 1.464020940739385e-06, "learning_rate": 0.000852199986331389, "loss": 2.4418, "step": 22486 }, { "crossentropy": 2.607266664505005, "epoch": 0.8152189675174014, "grad_norm": 0.027596663683652878, "grad_norm_var": 1.4562385087346641e-06, "learning_rate": 0.0008518755247402121, "loss": 2.5769, "step": 22487 }, { "crossentropy": 2.313735246658325, "epoch": 0.8152552204176334, "grad_norm": 0.02554367482662201, "grad_norm_var": 1.146705379160987e-06, "learning_rate": 0.0008515511191757519, "loss": 2.3545, "step": 22488 }, { "crossentropy": 2.449059247970581, "epoch": 0.8152914733178654, "grad_norm": 0.028076933696866035, "grad_norm_var": 9.945159734513296e-07, "learning_rate": 0.0008512267696423881, "loss": 2.491, "step": 22489 }, { "crossentropy": 2.3995370864868164, "epoch": 0.8153277262180975, "grad_norm": 0.02674403227865696, "grad_norm_var": 9.27679252293737e-07, "learning_rate": 0.0008509024761445023, "loss": 2.4775, "step": 22490 }, { "crossentropy": 2.399949312210083, "epoch": 0.8153639791183295, "grad_norm": 0.02638300135731697, "grad_norm_var": 9.31861067053373e-07, "learning_rate": 0.0008505782386864763, "loss": 2.3925, "step": 22491 }, { "crossentropy": 2.544981002807617, "epoch": 0.8154002320185615, "grad_norm": 0.027895910665392876, "grad_norm_var": 1.0251647722539128e-06, "learning_rate": 0.0008502540572726874, "loss": 2.4583, "step": 22492 }, { "crossentropy": 2.5830414295196533, "epoch": 0.8154364849187935, "grad_norm": 0.025808941572904587, "grad_norm_var": 6.337297069500576e-07, "learning_rate": 0.000849929931907516, "loss": 2.5066, "step": 22493 }, { "crossentropy": 2.168583393096924, "epoch": 0.8154727378190255, "grad_norm": 0.026235656812787056, "grad_norm_var": 6.316562821728159e-07, "learning_rate": 0.0008496058625953373, "loss": 2.2946, "step": 22494 }, { "crossentropy": 2.370126485824585, "epoch": 0.8155089907192575, "grad_norm": 0.02645256370306015, "grad_norm_var": 6.235700656807002e-07, "learning_rate": 0.0008492818493405313, "loss": 2.3784, "step": 22495 }, { "crossentropy": 2.545259475708008, "epoch": 0.8155452436194895, "grad_norm": 0.026786549016833305, "grad_norm_var": 5.292530191322905e-07, "learning_rate": 0.0008489578921474728, "loss": 2.5151, "step": 22496 }, { "crossentropy": 2.482088804244995, "epoch": 0.8155814965197216, "grad_norm": 0.0264442078769207, "grad_norm_var": 5.299251315465098e-07, "learning_rate": 0.0008486339910205359, "loss": 2.4659, "step": 22497 }, { "crossentropy": 2.462141990661621, "epoch": 0.8156177494199536, "grad_norm": 0.027555769309401512, "grad_norm_var": 5.765242197669496e-07, "learning_rate": 0.0008483101459640974, "loss": 2.3966, "step": 22498 }, { "crossentropy": 2.7161035537719727, "epoch": 0.8156540023201856, "grad_norm": 0.026924816891551018, "grad_norm_var": 5.575183991068326e-07, "learning_rate": 0.0008479863569825319, "loss": 2.55, "step": 22499 }, { "crossentropy": 2.5114827156066895, "epoch": 0.8156902552204176, "grad_norm": 0.026724377647042274, "grad_norm_var": 4.976320482244038e-07, "learning_rate": 0.0008476626240802099, "loss": 2.5738, "step": 22500 }, { "crossentropy": 2.369346857070923, "epoch": 0.8157265081206496, "grad_norm": 0.026161469519138336, "grad_norm_var": 5.161548660641135e-07, "learning_rate": 0.0008473389472615073, "loss": 2.2821, "step": 22501 }, { "crossentropy": 2.391200542449951, "epoch": 0.8157627610208816, "grad_norm": 0.026951462030410767, "grad_norm_var": 5.184282638431848e-07, "learning_rate": 0.0008470153265307929, "loss": 2.459, "step": 22502 }, { "crossentropy": 2.4524576663970947, "epoch": 0.8157990139211136, "grad_norm": 0.02638038620352745, "grad_norm_var": 4.7648182381528534e-07, "learning_rate": 0.0008466917618924397, "loss": 2.4473, "step": 22503 }, { "crossentropy": 2.360630750656128, "epoch": 0.8158352668213457, "grad_norm": 0.026047589257359505, "grad_norm_var": 4.152075173527303e-07, "learning_rate": 0.0008463682533508171, "loss": 2.3669, "step": 22504 }, { "crossentropy": 2.4154272079467773, "epoch": 0.8158715197215777, "grad_norm": 0.02659781463444233, "grad_norm_var": 2.849972354399769e-07, "learning_rate": 0.0008460448009102939, "loss": 2.3721, "step": 22505 }, { "crossentropy": 2.3677265644073486, "epoch": 0.8159077726218097, "grad_norm": 0.025897158309817314, "grad_norm_var": 3.170485010992128e-07, "learning_rate": 0.0008457214045752393, "loss": 2.3819, "step": 22506 }, { "crossentropy": 2.542274236679077, "epoch": 0.8159440255220418, "grad_norm": 0.026557980105280876, "grad_norm_var": 3.1441315713985153e-07, "learning_rate": 0.0008453980643500225, "loss": 2.4537, "step": 22507 }, { "crossentropy": 2.4472811222076416, "epoch": 0.8159802784222738, "grad_norm": 0.027393421158194542, "grad_norm_var": 2.42627314414832e-07, "learning_rate": 0.0008450747802390085, "loss": 2.4043, "step": 22508 }, { "crossentropy": 2.5773603916168213, "epoch": 0.8160165313225058, "grad_norm": 0.02779330685734749, "grad_norm_var": 2.906760863780981e-07, "learning_rate": 0.0008447515522465665, "loss": 2.5103, "step": 22509 }, { "crossentropy": 2.5304930210113525, "epoch": 0.8160527842227379, "grad_norm": 0.02674134261906147, "grad_norm_var": 2.7659536294642277e-07, "learning_rate": 0.0008444283803770586, "loss": 2.4395, "step": 22510 }, { "crossentropy": 2.4041237831115723, "epoch": 0.8160890371229699, "grad_norm": 0.02708331122994423, "grad_norm_var": 2.795462656384728e-07, "learning_rate": 0.0008441052646348524, "loss": 2.3913, "step": 22511 }, { "crossentropy": 2.490565776824951, "epoch": 0.8161252900232019, "grad_norm": 0.026805194094777107, "grad_norm_var": 2.796524897064982e-07, "learning_rate": 0.0008437822050243138, "loss": 2.4798, "step": 22512 }, { "crossentropy": 2.4066896438598633, "epoch": 0.8161615429234339, "grad_norm": 0.026516180485486984, "grad_norm_var": 2.770060046198656e-07, "learning_rate": 0.000843459201549801, "loss": 2.4421, "step": 22513 }, { "crossentropy": 2.495056390762329, "epoch": 0.8161977958236659, "grad_norm": 0.02736326865851879, "grad_norm_var": 2.5885163123356055e-07, "learning_rate": 0.0008431362542156801, "loss": 2.5272, "step": 22514 }, { "crossentropy": 2.378399610519409, "epoch": 0.8162340487238979, "grad_norm": 0.025969889014959335, "grad_norm_var": 2.9310146049710214e-07, "learning_rate": 0.000842813363026313, "loss": 2.3812, "step": 22515 }, { "crossentropy": 2.4366283416748047, "epoch": 0.81627030162413, "grad_norm": 0.026880888268351555, "grad_norm_var": 2.9542266976587997e-07, "learning_rate": 0.0008424905279860589, "loss": 2.4267, "step": 22516 }, { "crossentropy": 2.3603408336639404, "epoch": 0.816306554524362, "grad_norm": 0.026738880202174187, "grad_norm_var": 2.7508544742603037e-07, "learning_rate": 0.0008421677490992808, "loss": 2.3862, "step": 22517 }, { "crossentropy": 2.530136823654175, "epoch": 0.816342807424594, "grad_norm": 0.026329802349209785, "grad_norm_var": 2.81079947261883e-07, "learning_rate": 0.0008418450263703358, "loss": 2.5212, "step": 22518 }, { "crossentropy": 2.445499897003174, "epoch": 0.816379060324826, "grad_norm": 0.02624344639480114, "grad_norm_var": 2.8796948347882776e-07, "learning_rate": 0.0008415223598035843, "loss": 2.4621, "step": 22519 }, { "crossentropy": 2.4729011058807373, "epoch": 0.816415313225058, "grad_norm": 0.026831068098545074, "grad_norm_var": 2.5975148615780115e-07, "learning_rate": 0.0008411997494033863, "loss": 2.3874, "step": 22520 }, { "crossentropy": 2.5490224361419678, "epoch": 0.81645156612529, "grad_norm": 0.02720717526972294, "grad_norm_var": 2.7189952933833787e-07, "learning_rate": 0.000840877195174094, "loss": 2.493, "step": 22521 }, { "crossentropy": 2.5575132369995117, "epoch": 0.816487819025522, "grad_norm": 0.02643614262342453, "grad_norm_var": 2.271845022239552e-07, "learning_rate": 0.0008405546971200673, "loss": 2.5378, "step": 22522 }, { "crossentropy": 2.4355671405792236, "epoch": 0.816524071925754, "grad_norm": 0.027664117515087128, "grad_norm_var": 2.6711988652659317e-07, "learning_rate": 0.0008402322552456631, "loss": 2.4593, "step": 22523 }, { "crossentropy": 2.393399715423584, "epoch": 0.8165603248259861, "grad_norm": 0.025830551981925964, "grad_norm_var": 3.1171655552978775e-07, "learning_rate": 0.0008399098695552332, "loss": 2.4055, "step": 22524 }, { "crossentropy": 2.3585731983184814, "epoch": 0.8165965777262181, "grad_norm": 0.029232949018478394, "grad_norm_var": 6.363037989583093e-07, "learning_rate": 0.000839587540053135, "loss": 2.2856, "step": 22525 }, { "crossentropy": 2.3340866565704346, "epoch": 0.8166328306264501, "grad_norm": 0.027205418795347214, "grad_norm_var": 6.419803996403437e-07, "learning_rate": 0.0008392652667437195, "loss": 2.3177, "step": 22526 }, { "crossentropy": 2.3772428035736084, "epoch": 0.8166690835266821, "grad_norm": 0.026949100196361542, "grad_norm_var": 6.39756843884157e-07, "learning_rate": 0.000838943049631341, "loss": 2.3577, "step": 22527 }, { "crossentropy": 2.5069282054901123, "epoch": 0.8167053364269141, "grad_norm": 0.025592733174562454, "grad_norm_var": 7.449825316654367e-07, "learning_rate": 0.0008386208887203523, "loss": 2.5225, "step": 22528 }, { "crossentropy": 2.4283573627471924, "epoch": 0.8167415893271461, "grad_norm": 0.026866870000958443, "grad_norm_var": 7.388380044791165e-07, "learning_rate": 0.0008382987840151035, "loss": 2.3963, "step": 22529 }, { "crossentropy": 2.4888272285461426, "epoch": 0.8167778422273781, "grad_norm": 0.028971649706363678, "grad_norm_var": 1.0140434592381098e-06, "learning_rate": 0.0008379767355199441, "loss": 2.4704, "step": 22530 }, { "crossentropy": 2.403733491897583, "epoch": 0.8168140951276102, "grad_norm": 0.026005081832408905, "grad_norm_var": 1.009594936886062e-06, "learning_rate": 0.0008376547432392262, "loss": 2.4035, "step": 22531 }, { "crossentropy": 2.4300220012664795, "epoch": 0.8168503480278422, "grad_norm": 0.028273092582821846, "grad_norm_var": 1.1203896796933727e-06, "learning_rate": 0.0008373328071772962, "loss": 2.5161, "step": 22532 }, { "crossentropy": 2.254258871078491, "epoch": 0.8168866009280742, "grad_norm": 0.02591540850698948, "grad_norm_var": 1.194035733134933e-06, "learning_rate": 0.0008370109273385052, "loss": 2.3783, "step": 22533 }, { "crossentropy": 2.364727735519409, "epoch": 0.8169228538283063, "grad_norm": 0.027195526286959648, "grad_norm_var": 1.1667305048793444e-06, "learning_rate": 0.0008366891037271979, "loss": 2.3132, "step": 22534 }, { "crossentropy": 2.5787694454193115, "epoch": 0.8169591067285383, "grad_norm": 0.03634370490908623, "grad_norm_var": 6.48845143678572e-06, "learning_rate": 0.000836367336347722, "loss": 2.6296, "step": 22535 }, { "crossentropy": 2.336447238922119, "epoch": 0.8169953596287703, "grad_norm": 0.02671152539551258, "grad_norm_var": 6.502517699099631e-06, "learning_rate": 0.0008360456252044252, "loss": 2.4016, "step": 22536 }, { "crossentropy": 2.455526351928711, "epoch": 0.8170316125290024, "grad_norm": 0.026112839579582214, "grad_norm_var": 6.641988604683341e-06, "learning_rate": 0.0008357239703016517, "loss": 2.4534, "step": 22537 }, { "crossentropy": 2.381124496459961, "epoch": 0.8170678654292344, "grad_norm": 0.026055382564663887, "grad_norm_var": 6.7092058613608835e-06, "learning_rate": 0.0008354023716437436, "loss": 2.3967, "step": 22538 }, { "crossentropy": 2.436405658721924, "epoch": 0.8171041183294664, "grad_norm": 0.02682207152247429, "grad_norm_var": 6.741592471184536e-06, "learning_rate": 0.0008350808292350481, "loss": 2.3961, "step": 22539 }, { "crossentropy": 2.4117634296417236, "epoch": 0.8171403712296984, "grad_norm": 0.029687490314245224, "grad_norm_var": 6.8101162244459e-06, "learning_rate": 0.000834759343079905, "loss": 2.3394, "step": 22540 }, { "crossentropy": 2.395089626312256, "epoch": 0.8171766241299304, "grad_norm": 0.026068421080708504, "grad_norm_var": 6.808734898097038e-06, "learning_rate": 0.0008344379131826596, "loss": 2.3949, "step": 22541 }, { "crossentropy": 2.4898927211761475, "epoch": 0.8172128770301624, "grad_norm": 0.026309559121727943, "grad_norm_var": 6.899877894025961e-06, "learning_rate": 0.0008341165395476497, "loss": 2.4837, "step": 22542 }, { "crossentropy": 2.338275671005249, "epoch": 0.8172491299303944, "grad_norm": 0.026639077812433243, "grad_norm_var": 6.928348338529575e-06, "learning_rate": 0.0008337952221792183, "loss": 2.4193, "step": 22543 }, { "crossentropy": 2.4642789363861084, "epoch": 0.8172853828306265, "grad_norm": 0.02771931327879429, "grad_norm_var": 6.677813228307669e-06, "learning_rate": 0.000833473961081706, "loss": 2.488, "step": 22544 }, { "crossentropy": 2.4724206924438477, "epoch": 0.8173216357308585, "grad_norm": 0.0263492651283741, "grad_norm_var": 6.74557258466176e-06, "learning_rate": 0.000833152756259451, "loss": 2.4766, "step": 22545 }, { "crossentropy": 2.4317445755004883, "epoch": 0.8173578886310905, "grad_norm": 0.02687077410519123, "grad_norm_var": 6.629842023852722e-06, "learning_rate": 0.0008328316077167902, "loss": 2.4999, "step": 22546 }, { "crossentropy": 2.555563449859619, "epoch": 0.8173941415313225, "grad_norm": 0.02607285976409912, "grad_norm_var": 6.617139936645787e-06, "learning_rate": 0.0008325105154580637, "loss": 2.5078, "step": 22547 }, { "crossentropy": 2.613389730453491, "epoch": 0.8174303944315545, "grad_norm": 0.028521036729216576, "grad_norm_var": 6.6483039340660495e-06, "learning_rate": 0.0008321894794876055, "loss": 2.5121, "step": 22548 }, { "crossentropy": 2.511967897415161, "epoch": 0.8174666473317865, "grad_norm": 0.02696828916668892, "grad_norm_var": 6.500452158061942e-06, "learning_rate": 0.0008318684998097547, "loss": 2.4481, "step": 22549 }, { "crossentropy": 2.340420961380005, "epoch": 0.8175029002320185, "grad_norm": 0.030759448185563087, "grad_norm_var": 7.136335955729099e-06, "learning_rate": 0.0008315475764288444, "loss": 2.3657, "step": 22550 }, { "crossentropy": 2.330565929412842, "epoch": 0.8175391531322506, "grad_norm": 0.027252938598394394, "grad_norm_var": 1.8858523174055653e-06, "learning_rate": 0.0008312267093492093, "loss": 2.4209, "step": 22551 }, { "crossentropy": 2.428004264831543, "epoch": 0.8175754060324826, "grad_norm": 0.02777736261487007, "grad_norm_var": 1.8899193135378377e-06, "learning_rate": 0.0008309058985751849, "loss": 2.4015, "step": 22552 }, { "crossentropy": 2.4880199432373047, "epoch": 0.8176116589327146, "grad_norm": 0.026372553780674934, "grad_norm_var": 1.8547868217348254e-06, "learning_rate": 0.0008305851441111023, "loss": 2.4641, "step": 22553 }, { "crossentropy": 2.4873368740081787, "epoch": 0.8176479118329466, "grad_norm": 0.025926580652594566, "grad_norm_var": 1.8766034367294232e-06, "learning_rate": 0.0008302644459612963, "loss": 2.462, "step": 22554 }, { "crossentropy": 2.218113899230957, "epoch": 0.8176841647331786, "grad_norm": 0.026634277775883675, "grad_norm_var": 1.889705729114195e-06, "learning_rate": 0.000829943804130096, "loss": 2.3656, "step": 22555 }, { "crossentropy": 2.4965364933013916, "epoch": 0.8177204176334106, "grad_norm": 0.02764054201543331, "grad_norm_var": 1.4851181302746296e-06, "learning_rate": 0.0008296232186218323, "loss": 2.578, "step": 22556 }, { "crossentropy": 2.6266369819641113, "epoch": 0.8177566705336426, "grad_norm": 0.025775354355573654, "grad_norm_var": 1.5314851032820528e-06, "learning_rate": 0.0008293026894408367, "loss": 2.5593, "step": 22557 }, { "crossentropy": 2.5173561573028564, "epoch": 0.8177929234338747, "grad_norm": 0.027736788615584373, "grad_norm_var": 1.508505924240399e-06, "learning_rate": 0.0008289822165914362, "loss": 2.4989, "step": 22558 }, { "crossentropy": 2.5415854454040527, "epoch": 0.8178291763341067, "grad_norm": 0.026765864342451096, "grad_norm_var": 1.5002222009887252e-06, "learning_rate": 0.0008286618000779605, "loss": 2.4868, "step": 22559 }, { "crossentropy": 2.5520217418670654, "epoch": 0.8178654292343387, "grad_norm": 0.026797493919730186, "grad_norm_var": 1.4890672790720023e-06, "learning_rate": 0.000828341439904739, "loss": 2.5196, "step": 22560 }, { "crossentropy": 2.4249234199523926, "epoch": 0.8179016821345708, "grad_norm": 0.025672994554042816, "grad_norm_var": 1.5888465896706267e-06, "learning_rate": 0.0008280211360760959, "loss": 2.4338, "step": 22561 }, { "crossentropy": 2.4181766510009766, "epoch": 0.8179379350348028, "grad_norm": 0.02572358027100563, "grad_norm_var": 1.705637874456894e-06, "learning_rate": 0.0008277008885963594, "loss": 2.4114, "step": 22562 }, { "crossentropy": 2.444331169128418, "epoch": 0.8179741879350348, "grad_norm": 0.026270706206560135, "grad_norm_var": 1.682970672177662e-06, "learning_rate": 0.0008273806974698544, "loss": 2.4641, "step": 22563 }, { "crossentropy": 2.6144416332244873, "epoch": 0.8180104408352669, "grad_norm": 0.02615506388247013, "grad_norm_var": 1.5647512991461808e-06, "learning_rate": 0.0008270605627009042, "loss": 2.5484, "step": 22564 }, { "crossentropy": 2.32738995552063, "epoch": 0.8180466937354989, "grad_norm": 0.026983629912137985, "grad_norm_var": 1.5649274420099017e-06, "learning_rate": 0.0008267404842938347, "loss": 2.4084, "step": 22565 }, { "crossentropy": 2.560847759246826, "epoch": 0.8180829466357309, "grad_norm": 0.026756003499031067, "grad_norm_var": 5.013404279042283e-07, "learning_rate": 0.0008264204622529669, "loss": 2.4508, "step": 22566 }, { "crossentropy": 2.66206693649292, "epoch": 0.8181191995359629, "grad_norm": 0.027112681418657303, "grad_norm_var": 4.911094220093193e-07, "learning_rate": 0.0008261004965826241, "loss": 2.5339, "step": 22567 }, { "crossentropy": 2.3781275749206543, "epoch": 0.8181554524361949, "grad_norm": 0.02622690051794052, "grad_norm_var": 4.0444042170259486e-07, "learning_rate": 0.0008257805872871299, "loss": 2.3712, "step": 22568 }, { "crossentropy": 2.540493965148926, "epoch": 0.8181917053364269, "grad_norm": 0.026191486045718193, "grad_norm_var": 4.103977964968793e-07, "learning_rate": 0.0008254607343708021, "loss": 2.435, "step": 22569 }, { "crossentropy": 2.455034017562866, "epoch": 0.818227958236659, "grad_norm": 0.027245642617344856, "grad_norm_var": 4.142264529089264e-07, "learning_rate": 0.0008251409378379638, "loss": 2.4411, "step": 22570 }, { "crossentropy": 2.449389934539795, "epoch": 0.818264211136891, "grad_norm": 0.026045631617307663, "grad_norm_var": 4.3362926888691294e-07, "learning_rate": 0.0008248211976929321, "loss": 2.514, "step": 22571 }, { "crossentropy": 2.4979660511016846, "epoch": 0.818300464037123, "grad_norm": 0.026747284457087517, "grad_norm_var": 3.5584978430730494e-07, "learning_rate": 0.0008245015139400252, "loss": 2.4885, "step": 22572 }, { "crossentropy": 2.4680840969085693, "epoch": 0.818336716937355, "grad_norm": 0.026725606992840767, "grad_norm_var": 3.188331539804758e-07, "learning_rate": 0.0008241818865835632, "loss": 2.4597, "step": 22573 }, { "crossentropy": 2.4606080055236816, "epoch": 0.818372969837587, "grad_norm": 0.0276753231883049, "grad_norm_var": 3.0952612645300987e-07, "learning_rate": 0.0008238623156278602, "loss": 2.4995, "step": 22574 }, { "crossentropy": 2.52494215965271, "epoch": 0.818409222737819, "grad_norm": 0.025892173871397972, "grad_norm_var": 3.342424112088982e-07, "learning_rate": 0.0008235428010772344, "loss": 2.4355, "step": 22575 }, { "crossentropy": 2.2798633575439453, "epoch": 0.818445475638051, "grad_norm": 0.02785460837185383, "grad_norm_var": 4.4405949633853577e-07, "learning_rate": 0.0008232233429360025, "loss": 2.3762, "step": 22576 }, { "crossentropy": 2.4953091144561768, "epoch": 0.818481728538283, "grad_norm": 0.027442527934908867, "grad_norm_var": 4.257757307098963e-07, "learning_rate": 0.0008229039412084765, "loss": 2.4273, "step": 22577 }, { "crossentropy": 2.410017251968384, "epoch": 0.8185179814385151, "grad_norm": 0.02743138186633587, "grad_norm_var": 3.8787599129715077e-07, "learning_rate": 0.0008225845958989731, "loss": 2.4111, "step": 22578 }, { "crossentropy": 2.454806089401245, "epoch": 0.8185542343387471, "grad_norm": 0.02548178657889366, "grad_norm_var": 4.821666810346139e-07, "learning_rate": 0.0008222653070118035, "loss": 2.5416, "step": 22579 }, { "crossentropy": 2.581197500228882, "epoch": 0.8185904872389791, "grad_norm": 0.028007376939058304, "grad_norm_var": 5.501718430426609e-07, "learning_rate": 0.0008219460745512802, "loss": 2.6582, "step": 22580 }, { "crossentropy": 2.439342498779297, "epoch": 0.8186267401392111, "grad_norm": 0.026597559452056885, "grad_norm_var": 5.533166947836616e-07, "learning_rate": 0.0008216268985217168, "loss": 2.4452, "step": 22581 }, { "crossentropy": 2.4916272163391113, "epoch": 0.8186629930394431, "grad_norm": 0.02644725888967514, "grad_norm_var": 5.627166918768684e-07, "learning_rate": 0.000821307778927422, "loss": 2.4275, "step": 22582 }, { "crossentropy": 2.4100446701049805, "epoch": 0.8186992459396751, "grad_norm": 0.026263834908604622, "grad_norm_var": 5.746619208894423e-07, "learning_rate": 0.000820988715772707, "loss": 2.4156, "step": 22583 }, { "crossentropy": 2.4120144844055176, "epoch": 0.8187354988399071, "grad_norm": 0.02691010758280754, "grad_norm_var": 5.546102320646972e-07, "learning_rate": 0.000820669709061882, "loss": 2.4236, "step": 22584 }, { "crossentropy": 2.4372475147247314, "epoch": 0.8187717517401392, "grad_norm": 0.026010148227214813, "grad_norm_var": 5.716194900804346e-07, "learning_rate": 0.0008203507587992543, "loss": 2.4639, "step": 22585 }, { "crossentropy": 2.520591974258423, "epoch": 0.8188080046403712, "grad_norm": 0.026871085166931152, "grad_norm_var": 5.580641076760219e-07, "learning_rate": 0.0008200318649891331, "loss": 2.5481, "step": 22586 }, { "crossentropy": 2.4281721115112305, "epoch": 0.8188442575406032, "grad_norm": 0.026073794811964035, "grad_norm_var": 5.55373967253099e-07, "learning_rate": 0.0008197130276358256, "loss": 2.4913, "step": 22587 }, { "crossentropy": 2.6339480876922607, "epoch": 0.8188805104408353, "grad_norm": 0.02755284123122692, "grad_norm_var": 5.927408451921924e-07, "learning_rate": 0.0008193942467436355, "loss": 2.5203, "step": 22588 }, { "crossentropy": 2.4343855381011963, "epoch": 0.8189167633410673, "grad_norm": 0.02652897872030735, "grad_norm_var": 5.978243679647152e-07, "learning_rate": 0.0008190755223168722, "loss": 2.4721, "step": 22589 }, { "crossentropy": 2.56484055519104, "epoch": 0.8189530162412993, "grad_norm": 0.1442905068397522, "grad_norm_var": 0.000863917774522062, "learning_rate": 0.0008187568543598372, "loss": 2.5054, "step": 22590 }, { "crossentropy": 2.4255406856536865, "epoch": 0.8189892691415314, "grad_norm": 0.02614212967455387, "grad_norm_var": 0.0008636480169682935, "learning_rate": 0.0008184382428768362, "loss": 2.4375, "step": 22591 }, { "crossentropy": 2.4910683631896973, "epoch": 0.8190255220417634, "grad_norm": 0.027124058455228806, "grad_norm_var": 0.0008642915785872474, "learning_rate": 0.0008181196878721736, "loss": 2.4932, "step": 22592 }, { "crossentropy": 2.5273306369781494, "epoch": 0.8190617749419954, "grad_norm": 0.027694256976246834, "grad_norm_var": 0.0008640729792641339, "learning_rate": 0.0008178011893501497, "loss": 2.4383, "step": 22593 }, { "crossentropy": 2.4581458568573, "epoch": 0.8190980278422274, "grad_norm": 0.02799106016755104, "grad_norm_var": 0.0008635957256762255, "learning_rate": 0.0008174827473150687, "loss": 2.455, "step": 22594 }, { "crossentropy": 2.3582210540771484, "epoch": 0.8191342807424594, "grad_norm": 0.02672293409705162, "grad_norm_var": 0.000862261806590917, "learning_rate": 0.0008171643617712293, "loss": 2.431, "step": 22595 }, { "crossentropy": 2.440131902694702, "epoch": 0.8191705336426914, "grad_norm": 0.02716832235455513, "grad_norm_var": 0.0008629987958523234, "learning_rate": 0.0008168460327229344, "loss": 2.5204, "step": 22596 }, { "crossentropy": 2.4215590953826904, "epoch": 0.8192067865429234, "grad_norm": 0.026986798271536827, "grad_norm_var": 0.0008626163407026344, "learning_rate": 0.0008165277601744814, "loss": 2.4151, "step": 22597 }, { "crossentropy": 2.412853479385376, "epoch": 0.8192430394431555, "grad_norm": 0.02738332934677601, "grad_norm_var": 0.0008617067809644145, "learning_rate": 0.0008162095441301687, "loss": 2.4762, "step": 22598 }, { "crossentropy": 2.583913803100586, "epoch": 0.8192792923433875, "grad_norm": 0.028122970834374428, "grad_norm_var": 0.000859947584302482, "learning_rate": 0.000815891384594295, "loss": 2.5188, "step": 22599 }, { "crossentropy": 2.5670504570007324, "epoch": 0.8193155452436195, "grad_norm": 0.026460079476237297, "grad_norm_var": 0.0008604065635139664, "learning_rate": 0.0008155732815711592, "loss": 2.4272, "step": 22600 }, { "crossentropy": 2.474123239517212, "epoch": 0.8193517981438515, "grad_norm": 0.026339346542954445, "grad_norm_var": 0.000860048582468692, "learning_rate": 0.000815255235065055, "loss": 2.4716, "step": 22601 }, { "crossentropy": 2.4663796424865723, "epoch": 0.8193880510440835, "grad_norm": 0.026995904743671417, "grad_norm_var": 0.000859925240980966, "learning_rate": 0.0008149372450802806, "loss": 2.4688, "step": 22602 }, { "crossentropy": 2.512988328933716, "epoch": 0.8194243039443155, "grad_norm": 0.026492275297641754, "grad_norm_var": 0.0008594744747540308, "learning_rate": 0.0008146193116211287, "loss": 2.5279, "step": 22603 }, { "crossentropy": 2.371466875076294, "epoch": 0.8194605568445475, "grad_norm": 0.0261554978787899, "grad_norm_var": 0.0008608675143876101, "learning_rate": 0.0008143014346918959, "loss": 2.4584, "step": 22604 }, { "crossentropy": 2.572052478790283, "epoch": 0.8194968097447796, "grad_norm": 0.0273601021617651, "grad_norm_var": 0.0008600509261569104, "learning_rate": 0.0008139836142968737, "loss": 2.4799, "step": 22605 }, { "crossentropy": 2.491675853729248, "epoch": 0.8195330626450116, "grad_norm": 0.026670264080166817, "grad_norm_var": 3.7759275998935855e-07, "learning_rate": 0.0008136658504403543, "loss": 2.4872, "step": 22606 }, { "crossentropy": 2.5776302814483643, "epoch": 0.8195693155452436, "grad_norm": 0.026102427393198013, "grad_norm_var": 3.821694479751979e-07, "learning_rate": 0.000813348143126631, "loss": 2.497, "step": 22607 }, { "crossentropy": 2.4094955921173096, "epoch": 0.8196055684454756, "grad_norm": 0.02648903988301754, "grad_norm_var": 3.956494745084438e-07, "learning_rate": 0.0008130304923599952, "loss": 2.5131, "step": 22608 }, { "crossentropy": 2.2065091133117676, "epoch": 0.8196418213457076, "grad_norm": 0.026313481852412224, "grad_norm_var": 3.7703560538126084e-07, "learning_rate": 0.0008127128981447357, "loss": 2.3897, "step": 22609 }, { "crossentropy": 2.457247257232666, "epoch": 0.8196780742459396, "grad_norm": 0.02745787426829338, "grad_norm_var": 3.14367449399698e-07, "learning_rate": 0.0008123953604851442, "loss": 2.3974, "step": 22610 }, { "crossentropy": 2.594667434692383, "epoch": 0.8197143271461717, "grad_norm": 0.02656804956495762, "grad_norm_var": 3.1800121724628874e-07, "learning_rate": 0.0008120778793855066, "loss": 2.5791, "step": 22611 }, { "crossentropy": 2.476186513900757, "epoch": 0.8197505800464037, "grad_norm": 0.027666810899972916, "grad_norm_var": 3.5690848837616194e-07, "learning_rate": 0.0008117604548501129, "loss": 2.4825, "step": 22612 }, { "crossentropy": 2.437486410140991, "epoch": 0.8197868329466357, "grad_norm": 0.02643892914056778, "grad_norm_var": 3.655123102813077e-07, "learning_rate": 0.0008114430868832523, "loss": 2.3474, "step": 22613 }, { "crossentropy": 2.4584076404571533, "epoch": 0.8198230858468677, "grad_norm": 0.02618187665939331, "grad_norm_var": 3.644511169002366e-07, "learning_rate": 0.000811125775489207, "loss": 2.5025, "step": 22614 }, { "crossentropy": 2.5149269104003906, "epoch": 0.8198593387470998, "grad_norm": 0.02603142336010933, "grad_norm_var": 2.5175162615072706e-07, "learning_rate": 0.0008108085206722654, "loss": 2.422, "step": 22615 }, { "crossentropy": 2.5018017292022705, "epoch": 0.8198955916473318, "grad_norm": 0.02695825695991516, "grad_norm_var": 2.574566697721136e-07, "learning_rate": 0.0008104913224367127, "loss": 2.4855, "step": 22616 }, { "crossentropy": 2.3406732082366943, "epoch": 0.8199318445475638, "grad_norm": 0.02746439352631569, "grad_norm_var": 2.916378132409297e-07, "learning_rate": 0.0008101741807868312, "loss": 2.4704, "step": 22617 }, { "crossentropy": 2.244053602218628, "epoch": 0.8199680974477959, "grad_norm": 0.026750117540359497, "grad_norm_var": 2.8601652839742253e-07, "learning_rate": 0.0008098570957269075, "loss": 2.3607, "step": 22618 }, { "crossentropy": 2.504474639892578, "epoch": 0.8200043503480279, "grad_norm": 0.02634395658969879, "grad_norm_var": 2.9137677393594314e-07, "learning_rate": 0.0008095400672612208, "loss": 2.5579, "step": 22619 }, { "crossentropy": 2.290933609008789, "epoch": 0.8200406032482599, "grad_norm": 0.026757681742310524, "grad_norm_var": 2.7156413836450465e-07, "learning_rate": 0.0008092230953940549, "loss": 2.4342, "step": 22620 }, { "crossentropy": 2.2268223762512207, "epoch": 0.8200768561484919, "grad_norm": 0.028874577954411507, "grad_norm_var": 5.437345889836946e-07, "learning_rate": 0.0008089061801296915, "loss": 2.2539, "step": 22621 }, { "crossentropy": 2.4227471351623535, "epoch": 0.8201131090487239, "grad_norm": 0.02588053047657013, "grad_norm_var": 5.98146827291548e-07, "learning_rate": 0.000808589321472411, "loss": 2.4586, "step": 22622 }, { "crossentropy": 2.4593474864959717, "epoch": 0.8201493619489559, "grad_norm": 0.027437418699264526, "grad_norm_var": 5.911586514872052e-07, "learning_rate": 0.0008082725194264911, "loss": 2.4702, "step": 22623 }, { "crossentropy": 2.3525822162628174, "epoch": 0.820185614849188, "grad_norm": 0.02590235322713852, "grad_norm_var": 6.409777888895125e-07, "learning_rate": 0.0008079557739962129, "loss": 2.4412, "step": 22624 }, { "crossentropy": 2.3153011798858643, "epoch": 0.82022186774942, "grad_norm": 0.025882238522171974, "grad_norm_var": 6.813937262084106e-07, "learning_rate": 0.0008076390851858528, "loss": 2.4313, "step": 22625 }, { "crossentropy": 2.5112483501434326, "epoch": 0.820258120649652, "grad_norm": 0.026358455419540405, "grad_norm_var": 6.586370588909073e-07, "learning_rate": 0.0008073224529996886, "loss": 2.4111, "step": 22626 }, { "crossentropy": 2.414583206176758, "epoch": 0.820294373549884, "grad_norm": 0.025563113391399384, "grad_norm_var": 7.419236444306526e-07, "learning_rate": 0.0008070058774419986, "loss": 2.3643, "step": 22627 }, { "crossentropy": 2.2725350856781006, "epoch": 0.820330626450116, "grad_norm": 0.02950456738471985, "grad_norm_var": 1.2007503933945574e-06, "learning_rate": 0.0008066893585170565, "loss": 2.3489, "step": 22628 }, { "crossentropy": 2.512885570526123, "epoch": 0.820366879350348, "grad_norm": 0.026582496240735054, "grad_norm_var": 1.195689328779374e-06, "learning_rate": 0.0008063728962291389, "loss": 2.4799, "step": 22629 }, { "crossentropy": 2.5369632244110107, "epoch": 0.82040313225058, "grad_norm": 0.02558646723628044, "grad_norm_var": 1.2652976611024839e-06, "learning_rate": 0.0008060564905825196, "loss": 2.4691, "step": 22630 }, { "crossentropy": 2.502592086791992, "epoch": 0.820439385150812, "grad_norm": 0.02740212343633175, "grad_norm_var": 1.2527897494805784e-06, "learning_rate": 0.0008057401415814708, "loss": 2.4763, "step": 22631 }, { "crossentropy": 2.3517215251922607, "epoch": 0.8204756380510441, "grad_norm": 0.026325145736336708, "grad_norm_var": 1.2668499424414263e-06, "learning_rate": 0.0008054238492302679, "loss": 2.4332, "step": 22632 }, { "crossentropy": 2.356874942779541, "epoch": 0.8205118909512761, "grad_norm": 0.026410380378365517, "grad_norm_var": 1.2412939822769072e-06, "learning_rate": 0.0008051076135331798, "loss": 2.412, "step": 22633 }, { "crossentropy": 2.5507640838623047, "epoch": 0.8205481438515081, "grad_norm": 0.026316771283745766, "grad_norm_var": 1.2514409302432732e-06, "learning_rate": 0.000804791434494479, "loss": 2.5599, "step": 22634 }, { "crossentropy": 2.3791427612304688, "epoch": 0.8205843967517401, "grad_norm": 0.02713301032781601, "grad_norm_var": 1.2533670811174884e-06, "learning_rate": 0.0008044753121184384, "loss": 2.4307, "step": 22635 }, { "crossentropy": 2.348203182220459, "epoch": 0.8206206496519721, "grad_norm": 0.025582026690244675, "grad_norm_var": 1.3377383232366594e-06, "learning_rate": 0.000804159246409324, "loss": 2.3934, "step": 22636 }, { "crossentropy": 2.3925061225891113, "epoch": 0.8206569025522041, "grad_norm": 0.026275821030139923, "grad_norm_var": 9.96415554550591e-07, "learning_rate": 0.0008038432373714072, "loss": 2.4608, "step": 22637 }, { "crossentropy": 2.5258095264434814, "epoch": 0.8206931554524362, "grad_norm": 0.026507362723350525, "grad_norm_var": 9.684526202325313e-07, "learning_rate": 0.0008035272850089558, "loss": 2.5166, "step": 22638 }, { "crossentropy": 2.265550374984741, "epoch": 0.8207294083526682, "grad_norm": 0.02686375565826893, "grad_norm_var": 9.209988565999355e-07, "learning_rate": 0.0008032113893262355, "loss": 2.3471, "step": 22639 }, { "crossentropy": 2.415029764175415, "epoch": 0.8207656612529002, "grad_norm": 0.02757035195827484, "grad_norm_var": 9.592454201984146e-07, "learning_rate": 0.000802895550327516, "loss": 2.3777, "step": 22640 }, { "crossentropy": 2.5868794918060303, "epoch": 0.8208019141531323, "grad_norm": 0.027281414717435837, "grad_norm_var": 9.44618786728688e-07, "learning_rate": 0.0008025797680170593, "loss": 2.4837, "step": 22641 }, { "crossentropy": 2.5683701038360596, "epoch": 0.8208381670533643, "grad_norm": 0.026736225932836533, "grad_norm_var": 9.361356411806872e-07, "learning_rate": 0.0008022640423991334, "loss": 2.4771, "step": 22642 }, { "crossentropy": 2.4505982398986816, "epoch": 0.8208744199535963, "grad_norm": 0.026711832731962204, "grad_norm_var": 8.402575439260424e-07, "learning_rate": 0.0008019483734780026, "loss": 2.51, "step": 22643 }, { "crossentropy": 2.3717496395111084, "epoch": 0.8209106728538283, "grad_norm": 0.026832833886146545, "grad_norm_var": 3.227132993855256e-07, "learning_rate": 0.0008016327612579288, "loss": 2.4705, "step": 22644 }, { "crossentropy": 2.400190830230713, "epoch": 0.8209469257540604, "grad_norm": 0.025748202577233315, "grad_norm_var": 3.7176477906356724e-07, "learning_rate": 0.0008013172057431767, "loss": 2.4619, "step": 22645 }, { "crossentropy": 2.44930362701416, "epoch": 0.8209831786542924, "grad_norm": 0.026344390586018562, "grad_norm_var": 3.0724134065979587e-07, "learning_rate": 0.0008010017069380077, "loss": 2.4657, "step": 22646 }, { "crossentropy": 2.381406784057617, "epoch": 0.8210194315545244, "grad_norm": 0.02616623416543007, "grad_norm_var": 2.7507571783944954e-07, "learning_rate": 0.0008006862648466817, "loss": 2.3924, "step": 22647 }, { "crossentropy": 2.4390196800231934, "epoch": 0.8210556844547564, "grad_norm": 0.02744053490459919, "grad_norm_var": 3.1933798653576337e-07, "learning_rate": 0.0008003708794734615, "loss": 2.4233, "step": 22648 }, { "crossentropy": 2.6130640506744385, "epoch": 0.8210919373549884, "grad_norm": 0.025964904576539993, "grad_norm_var": 3.441960263444865e-07, "learning_rate": 0.0008000555508226048, "loss": 2.4858, "step": 22649 }, { "crossentropy": 2.4313344955444336, "epoch": 0.8211281902552204, "grad_norm": 0.026814566925168037, "grad_norm_var": 3.414006304474354e-07, "learning_rate": 0.0007997402788983716, "loss": 2.4638, "step": 22650 }, { "crossentropy": 2.379873275756836, "epoch": 0.8211644431554525, "grad_norm": 0.02613048069179058, "grad_norm_var": 3.360895326751101e-07, "learning_rate": 0.0007994250637050204, "loss": 2.414, "step": 22651 }, { "crossentropy": 2.551177501678467, "epoch": 0.8212006960556845, "grad_norm": 0.025832464918494225, "grad_norm_var": 3.073304044731804e-07, "learning_rate": 0.0007991099052468082, "loss": 2.4639, "step": 22652 }, { "crossentropy": 2.5555365085601807, "epoch": 0.8212369489559165, "grad_norm": 0.026312323287129402, "grad_norm_var": 3.0595108317140137e-07, "learning_rate": 0.0007987948035279924, "loss": 2.5293, "step": 22653 }, { "crossentropy": 2.524104595184326, "epoch": 0.8212732018561485, "grad_norm": 0.026597455143928528, "grad_norm_var": 3.05602437881911e-07, "learning_rate": 0.0007984797585528269, "loss": 2.5249, "step": 22654 }, { "crossentropy": 2.5151970386505127, "epoch": 0.8213094547563805, "grad_norm": 0.025800658389925957, "grad_norm_var": 3.3661928973416355e-07, "learning_rate": 0.0007981647703255701, "loss": 2.5441, "step": 22655 }, { "crossentropy": 2.4472811222076416, "epoch": 0.8213457076566125, "grad_norm": 0.026099905371665955, "grad_norm_var": 2.653956784291293e-07, "learning_rate": 0.000797849838850474, "loss": 2.4954, "step": 22656 }, { "crossentropy": 2.4152162075042725, "epoch": 0.8213819605568445, "grad_norm": 0.02740398235619068, "grad_norm_var": 2.803156981473111e-07, "learning_rate": 0.0007975349641317919, "loss": 2.4786, "step": 22657 }, { "crossentropy": 2.49537992477417, "epoch": 0.8214182134570766, "grad_norm": 0.025928204879164696, "grad_norm_var": 2.8851401053751427e-07, "learning_rate": 0.0007972201461737777, "loss": 2.4333, "step": 22658 }, { "crossentropy": 2.4926047325134277, "epoch": 0.8214544663573086, "grad_norm": 0.025575196370482445, "grad_norm_var": 3.1943453938930846e-07, "learning_rate": 0.0007969053849806846, "loss": 2.4414, "step": 22659 }, { "crossentropy": 2.402909994125366, "epoch": 0.8214907192575406, "grad_norm": 0.026631752029061317, "grad_norm_var": 3.079981943991556e-07, "learning_rate": 0.0007965906805567614, "loss": 2.5003, "step": 22660 }, { "crossentropy": 2.4663116931915283, "epoch": 0.8215269721577726, "grad_norm": 0.028094621375203133, "grad_norm_var": 4.796411745892788e-07, "learning_rate": 0.0007962760329062613, "loss": 2.4444, "step": 22661 }, { "crossentropy": 2.4435958862304688, "epoch": 0.8215632250580046, "grad_norm": 0.02567492052912712, "grad_norm_var": 5.167323375297825e-07, "learning_rate": 0.0007959614420334315, "loss": 2.4135, "step": 22662 }, { "crossentropy": 2.519624710083008, "epoch": 0.8215994779582366, "grad_norm": 0.02660062536597252, "grad_norm_var": 5.147394764472609e-07, "learning_rate": 0.0007956469079425233, "loss": 2.5108, "step": 22663 }, { "crossentropy": 2.449770450592041, "epoch": 0.8216357308584686, "grad_norm": 0.027539512142539024, "grad_norm_var": 5.286691127925757e-07, "learning_rate": 0.000795332430637784, "loss": 2.4757, "step": 22664 }, { "crossentropy": 2.425311803817749, "epoch": 0.8216719837587007, "grad_norm": 0.0266870129853487, "grad_norm_var": 5.15747656413638e-07, "learning_rate": 0.00079501801012346, "loss": 2.3787, "step": 22665 }, { "crossentropy": 2.4625067710876465, "epoch": 0.8217082366589327, "grad_norm": 0.026785165071487427, "grad_norm_var": 5.145008036400485e-07, "learning_rate": 0.0007947036464037988, "loss": 2.3696, "step": 22666 }, { "crossentropy": 2.5317440032958984, "epoch": 0.8217444895591647, "grad_norm": 0.026501651853322983, "grad_norm_var": 5.057696017310778e-07, "learning_rate": 0.0007943893394830482, "loss": 2.5683, "step": 22667 }, { "crossentropy": 2.5752718448638916, "epoch": 0.8217807424593968, "grad_norm": 0.026071004569530487, "grad_norm_var": 4.879646722170342e-07, "learning_rate": 0.0007940750893654503, "loss": 2.5198, "step": 22668 }, { "crossentropy": 2.3514151573181152, "epoch": 0.8218169953596288, "grad_norm": 0.027067935094237328, "grad_norm_var": 5.028267342772289e-07, "learning_rate": 0.0007937608960552523, "loss": 2.4036, "step": 22669 }, { "crossentropy": 2.450742721557617, "epoch": 0.8218532482598608, "grad_norm": 0.02718016691505909, "grad_norm_var": 5.264752031603274e-07, "learning_rate": 0.0007934467595566952, "loss": 2.3676, "step": 22670 }, { "crossentropy": 2.6338493824005127, "epoch": 0.8218895011600929, "grad_norm": 0.026461318135261536, "grad_norm_var": 4.831093159630608e-07, "learning_rate": 0.0007931326798740235, "loss": 2.4985, "step": 22671 }, { "crossentropy": 2.4853475093841553, "epoch": 0.8219257540603249, "grad_norm": 0.026603111997246742, "grad_norm_var": 4.624340678322884e-07, "learning_rate": 0.0007928186570114815, "loss": 2.4773, "step": 22672 }, { "crossentropy": 2.4350533485412598, "epoch": 0.8219620069605569, "grad_norm": 0.026390178129076958, "grad_norm_var": 4.2818434514141125e-07, "learning_rate": 0.0007925046909733058, "loss": 2.4279, "step": 22673 }, { "crossentropy": 2.436549663543701, "epoch": 0.8219982598607889, "grad_norm": 0.02675975300371647, "grad_norm_var": 3.9558427397159056e-07, "learning_rate": 0.0007921907817637392, "loss": 2.375, "step": 22674 }, { "crossentropy": 2.4871649742126465, "epoch": 0.8220345127610209, "grad_norm": 0.026126829907298088, "grad_norm_var": 3.3452059229818756e-07, "learning_rate": 0.0007918769293870226, "loss": 2.4835, "step": 22675 }, { "crossentropy": 2.3448517322540283, "epoch": 0.8220707656612529, "grad_norm": 0.026181545108556747, "grad_norm_var": 3.5119354992323565e-07, "learning_rate": 0.0007915631338473934, "loss": 2.3402, "step": 22676 }, { "crossentropy": 2.458482265472412, "epoch": 0.8221070185614849, "grad_norm": 0.02617073431611061, "grad_norm_var": 2.1717177968074106e-07, "learning_rate": 0.0007912493951490918, "loss": 2.5002, "step": 22677 }, { "crossentropy": 2.3697566986083984, "epoch": 0.822143271461717, "grad_norm": 0.026816368103027344, "grad_norm_var": 1.654082750444467e-07, "learning_rate": 0.000790935713296353, "loss": 2.4021, "step": 22678 }, { "crossentropy": 2.3374979496002197, "epoch": 0.822179524361949, "grad_norm": 0.027104903012514114, "grad_norm_var": 1.7990279260355717e-07, "learning_rate": 0.0007906220882934145, "loss": 2.3943, "step": 22679 }, { "crossentropy": 2.506857395172119, "epoch": 0.822215777262181, "grad_norm": 0.02644335851073265, "grad_norm_var": 1.254253117865944e-07, "learning_rate": 0.0007903085201445153, "loss": 2.4376, "step": 22680 }, { "crossentropy": 2.475977659225464, "epoch": 0.822252030162413, "grad_norm": 0.02755631133913994, "grad_norm_var": 1.845441826531698e-07, "learning_rate": 0.0007899950088538854, "loss": 2.5613, "step": 22681 }, { "crossentropy": 2.3901309967041016, "epoch": 0.822288283062645, "grad_norm": 0.027841389179229736, "grad_norm_var": 2.748864359037225e-07, "learning_rate": 0.0007896815544257618, "loss": 2.4403, "step": 22682 }, { "crossentropy": 2.4052460193634033, "epoch": 0.822324535962877, "grad_norm": 0.027324380353093147, "grad_norm_var": 2.9490845421807577e-07, "learning_rate": 0.0007893681568643796, "loss": 2.4974, "step": 22683 }, { "crossentropy": 2.584026336669922, "epoch": 0.822360788863109, "grad_norm": 0.026715638116002083, "grad_norm_var": 2.619866807669624e-07, "learning_rate": 0.0007890548161739691, "loss": 2.5897, "step": 22684 }, { "crossentropy": 2.518065929412842, "epoch": 0.822397041763341, "grad_norm": 0.02684645727276802, "grad_norm_var": 2.5703673150940554e-07, "learning_rate": 0.0007887415323587648, "loss": 2.4338, "step": 22685 }, { "crossentropy": 2.3566014766693115, "epoch": 0.8224332946635731, "grad_norm": 0.027773182839155197, "grad_norm_var": 3.104469413608979e-07, "learning_rate": 0.0007884283054229957, "loss": 2.4034, "step": 22686 }, { "crossentropy": 2.3949337005615234, "epoch": 0.8224695475638051, "grad_norm": 0.029069406911730766, "grad_norm_var": 6.109486842903322e-07, "learning_rate": 0.0007881151353708938, "loss": 2.338, "step": 22687 }, { "crossentropy": 2.6345932483673096, "epoch": 0.8225058004640371, "grad_norm": 0.026824815198779106, "grad_norm_var": 6.027992771190226e-07, "learning_rate": 0.0007878020222066906, "loss": 2.5619, "step": 22688 }, { "crossentropy": 2.334096670150757, "epoch": 0.8225420533642691, "grad_norm": 0.027062389999628067, "grad_norm_var": 5.766904996884944e-07, "learning_rate": 0.0007874889659346113, "loss": 2.3883, "step": 22689 }, { "crossentropy": 2.514909505844116, "epoch": 0.8225783062645011, "grad_norm": 0.027375150471925735, "grad_norm_var": 5.774805977261695e-07, "learning_rate": 0.0007871759665588862, "loss": 2.5693, "step": 22690 }, { "crossentropy": 2.5709493160247803, "epoch": 0.8226145591647331, "grad_norm": 0.027914902195334435, "grad_norm_var": 5.507632221548776e-07, "learning_rate": 0.0007868630240837443, "loss": 2.5568, "step": 22691 }, { "crossentropy": 2.4085593223571777, "epoch": 0.8226508120649652, "grad_norm": 0.025898387655615807, "grad_norm_var": 5.938029003675511e-07, "learning_rate": 0.0007865501385134094, "loss": 2.3827, "step": 22692 }, { "crossentropy": 2.5052521228790283, "epoch": 0.8226870649651972, "grad_norm": 0.027245856821537018, "grad_norm_var": 5.226422652773269e-07, "learning_rate": 0.0007862373098521103, "loss": 2.5149, "step": 22693 }, { "crossentropy": 2.4296364784240723, "epoch": 0.8227233178654292, "grad_norm": 0.027277221903204918, "grad_norm_var": 5.099895056423816e-07, "learning_rate": 0.0007859245381040697, "loss": 2.4292, "step": 22694 }, { "crossentropy": 2.4995670318603516, "epoch": 0.8227595707656613, "grad_norm": 0.026554623618721962, "grad_norm_var": 5.408161528510623e-07, "learning_rate": 0.0007856118232735138, "loss": 2.5013, "step": 22695 }, { "crossentropy": 2.402719736099243, "epoch": 0.8227958236658933, "grad_norm": 0.025756217539310455, "grad_norm_var": 6.426463992764697e-07, "learning_rate": 0.0007852991653646662, "loss": 2.3992, "step": 22696 }, { "crossentropy": 2.3140363693237305, "epoch": 0.8228320765661253, "grad_norm": 0.026892701163887978, "grad_norm_var": 6.377380414731518e-07, "learning_rate": 0.00078498656438175, "loss": 2.298, "step": 22697 }, { "crossentropy": 2.3840837478637695, "epoch": 0.8228683294663574, "grad_norm": 0.026385944336652756, "grad_norm_var": 6.356316573126551e-07, "learning_rate": 0.0007846740203289854, "loss": 2.331, "step": 22698 }, { "crossentropy": 2.475454092025757, "epoch": 0.8229045823665894, "grad_norm": 0.026515640318393707, "grad_norm_var": 6.477138358551571e-07, "learning_rate": 0.0007843615332105964, "loss": 2.4403, "step": 22699 }, { "crossentropy": 2.552501916885376, "epoch": 0.8229408352668214, "grad_norm": 0.027674295008182526, "grad_norm_var": 6.679383022330319e-07, "learning_rate": 0.0007840491030308006, "loss": 2.496, "step": 22700 }, { "crossentropy": 2.508396625518799, "epoch": 0.8229770881670534, "grad_norm": 0.027153629809617996, "grad_norm_var": 6.648151693284249e-07, "learning_rate": 0.0007837367297938213, "loss": 2.4969, "step": 22701 }, { "crossentropy": 2.313480854034424, "epoch": 0.8230133410672854, "grad_norm": 0.025882121175527573, "grad_norm_var": 7.15029204829603e-07, "learning_rate": 0.0007834244135038748, "loss": 2.3459, "step": 22702 }, { "crossentropy": 2.472703456878662, "epoch": 0.8230495939675174, "grad_norm": 0.02684546448290348, "grad_norm_var": 4.009411068463109e-07, "learning_rate": 0.0007831121541651804, "loss": 2.3932, "step": 22703 }, { "crossentropy": 2.4303839206695557, "epoch": 0.8230858468677494, "grad_norm": 0.026993751525878906, "grad_norm_var": 4.0263709232910697e-07, "learning_rate": 0.0007827999517819568, "loss": 2.4621, "step": 22704 }, { "crossentropy": 2.5924901962280273, "epoch": 0.8231220997679815, "grad_norm": 0.027761150151491165, "grad_norm_var": 4.5394147828951154e-07, "learning_rate": 0.0007824878063584195, "loss": 2.5242, "step": 22705 }, { "crossentropy": 2.4599735736846924, "epoch": 0.8231583526682135, "grad_norm": 0.025950033217668533, "grad_norm_var": 4.873489586114273e-07, "learning_rate": 0.0007821757178987837, "loss": 2.4224, "step": 22706 }, { "crossentropy": 2.5473527908325195, "epoch": 0.8231946055684455, "grad_norm": 0.027907541021704674, "grad_norm_var": 4.862520648902039e-07, "learning_rate": 0.0007818636864072665, "loss": 2.4291, "step": 22707 }, { "crossentropy": 2.4172861576080322, "epoch": 0.8232308584686775, "grad_norm": 0.02604101598262787, "grad_norm_var": 4.705027302790195e-07, "learning_rate": 0.0007815517118880805, "loss": 2.4638, "step": 22708 }, { "crossentropy": 2.4020237922668457, "epoch": 0.8232671113689095, "grad_norm": 0.027751905843615532, "grad_norm_var": 5.164345596196546e-07, "learning_rate": 0.0007812397943454419, "loss": 2.4155, "step": 22709 }, { "crossentropy": 2.3148341178894043, "epoch": 0.8233033642691415, "grad_norm": 0.02601957693696022, "grad_norm_var": 5.409590964969146e-07, "learning_rate": 0.0007809279337835606, "loss": 2.4279, "step": 22710 }, { "crossentropy": 2.475614309310913, "epoch": 0.8233396171693735, "grad_norm": 0.02635914459824562, "grad_norm_var": 5.485790754132409e-07, "learning_rate": 0.00078061613020665, "loss": 2.4657, "step": 22711 }, { "crossentropy": 2.500579833984375, "epoch": 0.8233758700696056, "grad_norm": 0.026390191167593002, "grad_norm_var": 4.902754250977563e-07, "learning_rate": 0.0007803043836189233, "loss": 2.4561, "step": 22712 }, { "crossentropy": 2.3931922912597656, "epoch": 0.8234121229698376, "grad_norm": 0.026779454201459885, "grad_norm_var": 4.894168624445938e-07, "learning_rate": 0.0007799926940245878, "loss": 2.4057, "step": 22713 }, { "crossentropy": 2.556439161300659, "epoch": 0.8234483758700696, "grad_norm": 0.028952093794941902, "grad_norm_var": 7.676381568314423e-07, "learning_rate": 0.0007796810614278566, "loss": 2.4848, "step": 22714 }, { "crossentropy": 2.5258195400238037, "epoch": 0.8234846287703016, "grad_norm": 0.02578260563313961, "grad_norm_var": 8.423131594095888e-07, "learning_rate": 0.000779369485832937, "loss": 2.4193, "step": 22715 }, { "crossentropy": 2.3819050788879395, "epoch": 0.8235208816705336, "grad_norm": 0.02669437788426876, "grad_norm_var": 7.99887915781984e-07, "learning_rate": 0.0007790579672440368, "loss": 2.3049, "step": 22716 }, { "crossentropy": 2.355959177017212, "epoch": 0.8235571345707656, "grad_norm": 0.02662131004035473, "grad_norm_var": 7.945575261974428e-07, "learning_rate": 0.0007787465056653653, "loss": 2.3587, "step": 22717 }, { "crossentropy": 2.3260254859924316, "epoch": 0.8235933874709976, "grad_norm": 0.02632216550409794, "grad_norm_var": 7.530559687241381e-07, "learning_rate": 0.0007784351011011265, "loss": 2.3873, "step": 22718 }, { "crossentropy": 2.3646790981292725, "epoch": 0.8236296403712297, "grad_norm": 0.027638714760541916, "grad_norm_var": 7.947348376983413e-07, "learning_rate": 0.0007781237535555286, "loss": 2.3944, "step": 22719 }, { "crossentropy": 2.489464521408081, "epoch": 0.8236658932714617, "grad_norm": 0.02763121947646141, "grad_norm_var": 8.304118034488933e-07, "learning_rate": 0.0007778124630327776, "loss": 2.4482, "step": 22720 }, { "crossentropy": 2.425924062728882, "epoch": 0.8237021461716937, "grad_norm": 0.027700794860720634, "grad_norm_var": 8.238113299751636e-07, "learning_rate": 0.0007775012295370753, "loss": 2.4787, "step": 22721 }, { "crossentropy": 2.318932294845581, "epoch": 0.8237383990719258, "grad_norm": 0.026695528998970985, "grad_norm_var": 7.632373433116933e-07, "learning_rate": 0.0007771900530726278, "loss": 2.4059, "step": 22722 }, { "crossentropy": 2.433647394180298, "epoch": 0.8237746519721578, "grad_norm": 0.027391619980335236, "grad_norm_var": 7.14381306761378e-07, "learning_rate": 0.0007768789336436372, "loss": 2.495, "step": 22723 }, { "crossentropy": 2.394681215286255, "epoch": 0.8238109048723898, "grad_norm": 0.026423349976539612, "grad_norm_var": 6.785439977345537e-07, "learning_rate": 0.0007765678712543045, "loss": 2.4231, "step": 22724 }, { "crossentropy": 2.4391539096832275, "epoch": 0.8238471577726219, "grad_norm": 0.026214580982923508, "grad_norm_var": 6.612938883357079e-07, "learning_rate": 0.0007762568659088326, "loss": 2.4723, "step": 22725 }, { "crossentropy": 2.543250560760498, "epoch": 0.8238834106728539, "grad_norm": 0.02684851735830307, "grad_norm_var": 6.123418849125617e-07, "learning_rate": 0.0007759459176114209, "loss": 2.5543, "step": 22726 }, { "crossentropy": 2.30267596244812, "epoch": 0.8239196635730859, "grad_norm": 0.02794811688363552, "grad_norm_var": 6.549519841146193e-07, "learning_rate": 0.0007756350263662693, "loss": 2.4072, "step": 22727 }, { "crossentropy": 2.495182991027832, "epoch": 0.8239559164733179, "grad_norm": 0.02575024403631687, "grad_norm_var": 7.327652159422645e-07, "learning_rate": 0.0007753241921775788, "loss": 2.3759, "step": 22728 }, { "crossentropy": 2.6441564559936523, "epoch": 0.8239921693735499, "grad_norm": 0.02729993686079979, "grad_norm_var": 7.370166605490432e-07, "learning_rate": 0.0007750134150495447, "loss": 2.5284, "step": 22729 }, { "crossentropy": 2.4535763263702393, "epoch": 0.8240284222737819, "grad_norm": 0.026239529252052307, "grad_norm_var": 4.889506073801318e-07, "learning_rate": 0.0007747026949863678, "loss": 2.4615, "step": 22730 }, { "crossentropy": 2.3388311862945557, "epoch": 0.8240646751740139, "grad_norm": 0.026638751849532127, "grad_norm_var": 4.1575134917207905e-07, "learning_rate": 0.0007743920319922427, "loss": 2.4769, "step": 22731 }, { "crossentropy": 2.374331474304199, "epoch": 0.824100928074246, "grad_norm": 0.026696359738707542, "grad_norm_var": 4.157028953306769e-07, "learning_rate": 0.0007740814260713646, "loss": 2.4434, "step": 22732 }, { "crossentropy": 2.466550827026367, "epoch": 0.824137180974478, "grad_norm": 0.025491712614893913, "grad_norm_var": 5.342330650146346e-07, "learning_rate": 0.0007737708772279317, "loss": 2.535, "step": 22733 }, { "crossentropy": 2.4331464767456055, "epoch": 0.82417343387471, "grad_norm": 0.026111245155334473, "grad_norm_var": 5.506820358778478e-07, "learning_rate": 0.0007734603854661348, "loss": 2.416, "step": 22734 }, { "crossentropy": 2.3992605209350586, "epoch": 0.824209686774942, "grad_norm": 0.026591747999191284, "grad_norm_var": 5.014138507056318e-07, "learning_rate": 0.0007731499507901696, "loss": 2.3649, "step": 22735 }, { "crossentropy": 2.4207923412323, "epoch": 0.824245939675174, "grad_norm": 0.026587262749671936, "grad_norm_var": 4.4402598221361146e-07, "learning_rate": 0.0007728395732042304, "loss": 2.5081, "step": 22736 }, { "crossentropy": 2.4803121089935303, "epoch": 0.824282192575406, "grad_norm": 0.02696838043630123, "grad_norm_var": 3.7633678864680494e-07, "learning_rate": 0.0007725292527125061, "loss": 2.4421, "step": 22737 }, { "crossentropy": 2.4632465839385986, "epoch": 0.824318445475638, "grad_norm": 0.026570534333586693, "grad_norm_var": 3.7603042763894625e-07, "learning_rate": 0.0007722189893191906, "loss": 2.5198, "step": 22738 }, { "crossentropy": 2.4664855003356934, "epoch": 0.82435469837587, "grad_norm": 0.026522943750023842, "grad_norm_var": 3.3274894031080453e-07, "learning_rate": 0.0007719087830284737, "loss": 2.4589, "step": 22739 }, { "crossentropy": 2.5086960792541504, "epoch": 0.8243909512761021, "grad_norm": 0.02646440640091896, "grad_norm_var": 3.3212567259599305e-07, "learning_rate": 0.0007715986338445441, "loss": 2.3979, "step": 22740 }, { "crossentropy": 2.5542922019958496, "epoch": 0.8244272041763341, "grad_norm": 0.02737003192305565, "grad_norm_var": 3.625035117054675e-07, "learning_rate": 0.0007712885417715926, "loss": 2.5168, "step": 22741 }, { "crossentropy": 2.435558795928955, "epoch": 0.8244634570765661, "grad_norm": 0.026456959545612335, "grad_norm_var": 3.60741916614835e-07, "learning_rate": 0.0007709785068138058, "loss": 2.3847, "step": 22742 }, { "crossentropy": 2.4340288639068604, "epoch": 0.8244997099767981, "grad_norm": 0.02582293003797531, "grad_norm_var": 2.6293364075965997e-07, "learning_rate": 0.0007706685289753717, "loss": 2.4347, "step": 22743 }, { "crossentropy": 2.4048562049865723, "epoch": 0.8245359628770301, "grad_norm": 0.02667732909321785, "grad_norm_var": 2.2719502035204824e-07, "learning_rate": 0.000770358608260479, "loss": 2.3499, "step": 22744 }, { "crossentropy": 2.5015194416046143, "epoch": 0.8245722157772621, "grad_norm": 0.02915263921022415, "grad_norm_var": 6.314576937134381e-07, "learning_rate": 0.0007700487446733106, "loss": 2.5211, "step": 22745 }, { "crossentropy": 2.3931334018707275, "epoch": 0.8246084686774942, "grad_norm": 0.025152171030640602, "grad_norm_var": 7.645275301207024e-07, "learning_rate": 0.0007697389382180542, "loss": 2.3189, "step": 22746 }, { "crossentropy": 2.444321393966675, "epoch": 0.8246447215777262, "grad_norm": 0.026453062891960144, "grad_norm_var": 7.652208361431471e-07, "learning_rate": 0.000769429188898893, "loss": 2.4636, "step": 22747 }, { "crossentropy": 2.556191921234131, "epoch": 0.8246809744779582, "grad_norm": 0.027096206322312355, "grad_norm_var": 7.820506730176734e-07, "learning_rate": 0.0007691194967200099, "loss": 2.5096, "step": 22748 }, { "crossentropy": 2.357332229614258, "epoch": 0.8247172273781903, "grad_norm": 0.025479767471551895, "grad_norm_var": 7.838137513123025e-07, "learning_rate": 0.0007688098616855893, "loss": 2.4258, "step": 22749 }, { "crossentropy": 2.602264881134033, "epoch": 0.8247534802784223, "grad_norm": 0.026991361752152443, "grad_norm_var": 7.757693287359253e-07, "learning_rate": 0.0007685002837998118, "loss": 2.5274, "step": 22750 }, { "crossentropy": 2.404578924179077, "epoch": 0.8247897331786543, "grad_norm": 0.02588324062526226, "grad_norm_var": 8.123966345277885e-07, "learning_rate": 0.0007681907630668595, "loss": 2.4743, "step": 22751 }, { "crossentropy": 2.514037847518921, "epoch": 0.8248259860788864, "grad_norm": 0.025878455489873886, "grad_norm_var": 8.452916590008837e-07, "learning_rate": 0.000767881299490914, "loss": 2.4455, "step": 22752 }, { "crossentropy": 2.499509811401367, "epoch": 0.8248622389791184, "grad_norm": 0.026428014039993286, "grad_norm_var": 8.340298888171356e-07, "learning_rate": 0.0007675718930761527, "loss": 2.4733, "step": 22753 }, { "crossentropy": 2.4964847564697266, "epoch": 0.8248984918793504, "grad_norm": 0.026622315868735313, "grad_norm_var": 8.345118268714147e-07, "learning_rate": 0.0007672625438267572, "loss": 2.4681, "step": 22754 }, { "crossentropy": 2.4575588703155518, "epoch": 0.8249347447795824, "grad_norm": 0.026251155883073807, "grad_norm_var": 8.393205343940839e-07, "learning_rate": 0.0007669532517469036, "loss": 2.388, "step": 22755 }, { "crossentropy": 2.454501152038574, "epoch": 0.8249709976798144, "grad_norm": 0.025756556540727615, "grad_norm_var": 8.750576271391071e-07, "learning_rate": 0.0007666440168407707, "loss": 2.4159, "step": 22756 }, { "crossentropy": 2.517252206802368, "epoch": 0.8250072505800464, "grad_norm": 0.02777649462223053, "grad_norm_var": 9.343225418177812e-07, "learning_rate": 0.0007663348391125352, "loss": 2.4221, "step": 22757 }, { "crossentropy": 2.4394876956939697, "epoch": 0.8250435034802784, "grad_norm": 0.0266810804605484, "grad_norm_var": 9.364023823512594e-07, "learning_rate": 0.000766025718566371, "loss": 2.4946, "step": 22758 }, { "crossentropy": 2.466299057006836, "epoch": 0.8250797563805105, "grad_norm": 0.02659706026315689, "grad_norm_var": 9.033088052232911e-07, "learning_rate": 0.0007657166552064554, "loss": 2.4928, "step": 22759 }, { "crossentropy": 2.488849401473999, "epoch": 0.8251160092807425, "grad_norm": 0.02834603562951088, "grad_norm_var": 1.104605610985571e-06, "learning_rate": 0.0007654076490369627, "loss": 2.4829, "step": 22760 }, { "crossentropy": 2.362941265106201, "epoch": 0.8251522621809745, "grad_norm": 0.026507560163736343, "grad_norm_var": 6.624692654223398e-07, "learning_rate": 0.0007650987000620646, "loss": 2.4028, "step": 22761 }, { "crossentropy": 2.5011203289031982, "epoch": 0.8251885150812065, "grad_norm": 0.02811548300087452, "grad_norm_var": 6.812131325456868e-07, "learning_rate": 0.0007647898082859366, "loss": 2.4952, "step": 22762 }, { "crossentropy": 2.4488415718078613, "epoch": 0.8252247679814385, "grad_norm": 0.026018153876066208, "grad_norm_var": 7.061358197582898e-07, "learning_rate": 0.000764480973712749, "loss": 2.4232, "step": 22763 }, { "crossentropy": 2.3795363903045654, "epoch": 0.8252610208816705, "grad_norm": 0.02551102824509144, "grad_norm_var": 7.692586204455318e-07, "learning_rate": 0.0007641721963466735, "loss": 2.4079, "step": 22764 }, { "crossentropy": 2.5598244667053223, "epoch": 0.8252972737819025, "grad_norm": 0.02644541673362255, "grad_norm_var": 6.893904546070861e-07, "learning_rate": 0.0007638634761918811, "loss": 2.529, "step": 22765 }, { "crossentropy": 2.3706138134002686, "epoch": 0.8253335266821346, "grad_norm": 0.028024468570947647, "grad_norm_var": 8.082035584378583e-07, "learning_rate": 0.0007635548132525399, "loss": 2.4486, "step": 22766 }, { "crossentropy": 2.441664218902588, "epoch": 0.8253697795823666, "grad_norm": 0.02670390158891678, "grad_norm_var": 7.633700786962549e-07, "learning_rate": 0.00076324620753282, "loss": 2.4159, "step": 22767 }, { "crossentropy": 2.469604969024658, "epoch": 0.8254060324825986, "grad_norm": 0.027003131806850433, "grad_norm_var": 7.148888370065813e-07, "learning_rate": 0.0007629376590368908, "loss": 2.427, "step": 22768 }, { "crossentropy": 2.3883309364318848, "epoch": 0.8254422853828306, "grad_norm": 0.02750968746840954, "grad_norm_var": 7.344754013938511e-07, "learning_rate": 0.0007626291677689174, "loss": 2.3736, "step": 22769 }, { "crossentropy": 2.578451156616211, "epoch": 0.8254785382830626, "grad_norm": 0.026128016412258148, "grad_norm_var": 7.658622748272596e-07, "learning_rate": 0.0007623207337330685, "loss": 2.5479, "step": 22770 }, { "crossentropy": 2.432889699935913, "epoch": 0.8255147911832946, "grad_norm": 0.027081137523055077, "grad_norm_var": 7.442006197154779e-07, "learning_rate": 0.0007620123569335074, "loss": 2.4691, "step": 22771 }, { "crossentropy": 2.4367313385009766, "epoch": 0.8255510440835266, "grad_norm": 0.025977574288845062, "grad_norm_var": 7.139162611051551e-07, "learning_rate": 0.0007617040373744027, "loss": 2.4385, "step": 22772 }, { "crossentropy": 2.405252456665039, "epoch": 0.8255872969837587, "grad_norm": 0.026425860822200775, "grad_norm_var": 6.703816217231654e-07, "learning_rate": 0.0007613957750599165, "loss": 2.4519, "step": 22773 }, { "crossentropy": 2.380394458770752, "epoch": 0.8256235498839907, "grad_norm": 0.028477324172854424, "grad_norm_var": 8.394308828776319e-07, "learning_rate": 0.0007610875699942116, "loss": 2.3857, "step": 22774 }, { "crossentropy": 2.4714322090148926, "epoch": 0.8256598027842227, "grad_norm": 0.02647094428539276, "grad_norm_var": 8.460149211366706e-07, "learning_rate": 0.0007607794221814512, "loss": 2.4093, "step": 22775 }, { "crossentropy": 2.5497255325317383, "epoch": 0.8256960556844548, "grad_norm": 0.02799759805202484, "grad_norm_var": 7.874264094815767e-07, "learning_rate": 0.0007604713316257999, "loss": 2.5016, "step": 22776 }, { "crossentropy": 2.3045825958251953, "epoch": 0.8257323085846868, "grad_norm": 0.026818063110113144, "grad_norm_var": 7.772120209972403e-07, "learning_rate": 0.0007601632983314155, "loss": 2.3805, "step": 22777 }, { "crossentropy": 2.4818613529205322, "epoch": 0.8257685614849188, "grad_norm": 0.02666490338742733, "grad_norm_var": 6.773566473702285e-07, "learning_rate": 0.0007598553223024612, "loss": 2.4576, "step": 22778 }, { "crossentropy": 2.456249475479126, "epoch": 0.8258048143851509, "grad_norm": 0.027074234560132027, "grad_norm_var": 6.329471943795761e-07, "learning_rate": 0.0007595474035430944, "loss": 2.4647, "step": 22779 }, { "crossentropy": 2.5365560054779053, "epoch": 0.8258410672853829, "grad_norm": 0.027820002287626266, "grad_norm_var": 5.402123742725394e-07, "learning_rate": 0.000759239542057475, "loss": 2.4843, "step": 22780 }, { "crossentropy": 2.3628365993499756, "epoch": 0.8258773201856149, "grad_norm": 0.02626672200858593, "grad_norm_var": 5.563482216900186e-07, "learning_rate": 0.0007589317378497634, "loss": 2.4251, "step": 22781 }, { "crossentropy": 2.4432826042175293, "epoch": 0.8259135730858469, "grad_norm": 0.02616293355822563, "grad_norm_var": 5.255333919225331e-07, "learning_rate": 0.0007586239909241121, "loss": 2.491, "step": 22782 }, { "crossentropy": 2.254357099533081, "epoch": 0.8259498259860789, "grad_norm": 0.025859452784061432, "grad_norm_var": 5.934620919346873e-07, "learning_rate": 0.0007583163012846806, "loss": 2.3902, "step": 22783 }, { "crossentropy": 2.3986310958862305, "epoch": 0.8259860788863109, "grad_norm": 0.02562364749610424, "grad_norm_var": 6.858140826294558e-07, "learning_rate": 0.0007580086689356258, "loss": 2.394, "step": 22784 }, { "crossentropy": 2.498128652572632, "epoch": 0.8260223317865429, "grad_norm": 0.027220388874411583, "grad_norm_var": 6.626047224446748e-07, "learning_rate": 0.0007577010938810997, "loss": 2.4562, "step": 22785 }, { "crossentropy": 2.5089380741119385, "epoch": 0.826058584686775, "grad_norm": 0.026153527200222015, "grad_norm_var": 6.605151316516847e-07, "learning_rate": 0.0007573935761252598, "loss": 2.4136, "step": 22786 }, { "crossentropy": 2.381502628326416, "epoch": 0.826094837587007, "grad_norm": 0.027590688318014145, "grad_norm_var": 6.988397926633213e-07, "learning_rate": 0.000757086115672257, "loss": 2.4255, "step": 22787 }, { "crossentropy": 2.5706450939178467, "epoch": 0.826131090487239, "grad_norm": 0.0268959179520607, "grad_norm_var": 6.523478862330752e-07, "learning_rate": 0.0007567787125262449, "loss": 2.5469, "step": 22788 }, { "crossentropy": 2.5225350856781006, "epoch": 0.826167343387471, "grad_norm": 0.02632460556924343, "grad_norm_var": 6.586492116670741e-07, "learning_rate": 0.0007564713666913776, "loss": 2.5025, "step": 22789 }, { "crossentropy": 2.4956681728363037, "epoch": 0.826203596287703, "grad_norm": 0.027402425184845924, "grad_norm_var": 4.96030513973108e-07, "learning_rate": 0.0007561640781718021, "loss": 2.4787, "step": 22790 }, { "crossentropy": 2.511251926422119, "epoch": 0.826239849187935, "grad_norm": 0.026492226868867874, "grad_norm_var": 4.95205578569209e-07, "learning_rate": 0.0007558568469716715, "loss": 2.5059, "step": 22791 }, { "crossentropy": 2.4447762966156006, "epoch": 0.826276102088167, "grad_norm": 0.027800971642136574, "grad_norm_var": 4.6551575498543496e-07, "learning_rate": 0.0007555496730951355, "loss": 2.433, "step": 22792 }, { "crossentropy": 2.4581189155578613, "epoch": 0.8263123549883991, "grad_norm": 0.026774564757943153, "grad_norm_var": 4.65301140887631e-07, "learning_rate": 0.0007552425565463411, "loss": 2.4442, "step": 22793 }, { "crossentropy": 2.5882070064544678, "epoch": 0.8263486078886311, "grad_norm": 0.025308771058917046, "grad_norm_var": 5.970691816113329e-07, "learning_rate": 0.0007549354973294392, "loss": 2.4967, "step": 22794 }, { "crossentropy": 2.4084312915802, "epoch": 0.8263848607888631, "grad_norm": 0.026431171223521233, "grad_norm_var": 5.88528776318065e-07, "learning_rate": 0.0007546284954485743, "loss": 2.4153, "step": 22795 }, { "crossentropy": 2.419919729232788, "epoch": 0.8264211136890951, "grad_norm": 0.027858922258019447, "grad_norm_var": 5.947831894823064e-07, "learning_rate": 0.0007543215509078944, "loss": 2.4623, "step": 22796 }, { "crossentropy": 2.422679901123047, "epoch": 0.8264573665893271, "grad_norm": 0.026140917092561722, "grad_norm_var": 6.019571325086102e-07, "learning_rate": 0.0007540146637115453, "loss": 2.4503, "step": 22797 }, { "crossentropy": 2.477611541748047, "epoch": 0.8264936194895591, "grad_norm": 0.027076635509729385, "grad_norm_var": 5.975300160626033e-07, "learning_rate": 0.0007537078338636722, "loss": 2.4698, "step": 22798 }, { "crossentropy": 2.444395065307617, "epoch": 0.8265298723897911, "grad_norm": 0.02583729662001133, "grad_norm_var": 5.999985377827533e-07, "learning_rate": 0.0007534010613684178, "loss": 2.394, "step": 22799 }, { "crossentropy": 2.3953428268432617, "epoch": 0.8265661252900232, "grad_norm": 0.027998559176921844, "grad_norm_var": 6.169696550149904e-07, "learning_rate": 0.0007530943462299273, "loss": 2.4161, "step": 22800 }, { "crossentropy": 2.521589994430542, "epoch": 0.8266023781902552, "grad_norm": 0.026722321286797523, "grad_norm_var": 6.066632844731509e-07, "learning_rate": 0.0007527876884523416, "loss": 2.5433, "step": 22801 }, { "crossentropy": 2.4218342304229736, "epoch": 0.8266386310904872, "grad_norm": 0.02566395327448845, "grad_norm_var": 6.638817921807468e-07, "learning_rate": 0.0007524810880398048, "loss": 2.4278, "step": 22802 }, { "crossentropy": 2.3178534507751465, "epoch": 0.8266748839907193, "grad_norm": 0.026481544598937035, "grad_norm_var": 6.19400624063809e-07, "learning_rate": 0.0007521745449964556, "loss": 2.3958, "step": 22803 }, { "crossentropy": 2.3396966457366943, "epoch": 0.8267111368909513, "grad_norm": 0.027167044579982758, "grad_norm_var": 6.310530429755208e-07, "learning_rate": 0.0007518680593264354, "loss": 2.4895, "step": 22804 }, { "crossentropy": 2.460965394973755, "epoch": 0.8267473897911833, "grad_norm": 0.02771657332777977, "grad_norm_var": 6.792095200410745e-07, "learning_rate": 0.0007515616310338846, "loss": 2.4175, "step": 22805 }, { "crossentropy": 2.2879698276519775, "epoch": 0.8267836426914154, "grad_norm": 0.026047520339488983, "grad_norm_var": 6.859488375634178e-07, "learning_rate": 0.0007512552601229417, "loss": 2.3573, "step": 22806 }, { "crossentropy": 2.38484525680542, "epoch": 0.8268198955916474, "grad_norm": 0.02593628503382206, "grad_norm_var": 7.22144947885731e-07, "learning_rate": 0.0007509489465977432, "loss": 2.4405, "step": 22807 }, { "crossentropy": 2.4800031185150146, "epoch": 0.8268561484918794, "grad_norm": 0.027194388210773468, "grad_norm_var": 6.548995193523972e-07, "learning_rate": 0.000750642690462428, "loss": 2.5026, "step": 22808 }, { "crossentropy": 2.4191198348999023, "epoch": 0.8268924013921114, "grad_norm": 0.026149913668632507, "grad_norm_var": 6.68685128475084e-07, "learning_rate": 0.0007503364917211308, "loss": 2.4204, "step": 22809 }, { "crossentropy": 2.371943712234497, "epoch": 0.8269286542923434, "grad_norm": 0.0262070931494236, "grad_norm_var": 5.634761557538985e-07, "learning_rate": 0.0007500303503779898, "loss": 2.4071, "step": 22810 }, { "crossentropy": 2.3960390090942383, "epoch": 0.8269649071925754, "grad_norm": 0.025541603565216064, "grad_norm_var": 6.405954240865415e-07, "learning_rate": 0.0007497242664371373, "loss": 2.3836, "step": 22811 }, { "crossentropy": 2.4268691539764404, "epoch": 0.8270011600928074, "grad_norm": 0.027002759277820587, "grad_norm_var": 5.436994493198306e-07, "learning_rate": 0.0007494182399027083, "loss": 2.444, "step": 22812 }, { "crossentropy": 2.4081530570983887, "epoch": 0.8270374129930395, "grad_norm": 0.02674192376434803, "grad_norm_var": 5.330707204884935e-07, "learning_rate": 0.0007491122707788378, "loss": 2.4214, "step": 22813 }, { "crossentropy": 2.3260765075683594, "epoch": 0.8270736658932715, "grad_norm": 0.02633388340473175, "grad_norm_var": 5.196385934534322e-07, "learning_rate": 0.000748806359069657, "loss": 2.3638, "step": 22814 }, { "crossentropy": 2.4194159507751465, "epoch": 0.8271099187935035, "grad_norm": 0.027217090129852295, "grad_norm_var": 5.081694504397809e-07, "learning_rate": 0.000748500504779297, "loss": 2.3802, "step": 22815 }, { "crossentropy": 2.349081516265869, "epoch": 0.8271461716937355, "grad_norm": 0.026662955060601234, "grad_norm_var": 3.7641812382868155e-07, "learning_rate": 0.00074819470791189, "loss": 2.3815, "step": 22816 }, { "crossentropy": 2.462104320526123, "epoch": 0.8271824245939675, "grad_norm": 0.026492280885577202, "grad_norm_var": 3.7441488426822173e-07, "learning_rate": 0.000747888968471565, "loss": 2.3737, "step": 22817 }, { "crossentropy": 2.5760600566864014, "epoch": 0.8272186774941995, "grad_norm": 0.02643846906721592, "grad_norm_var": 3.2197570237797967e-07, "learning_rate": 0.0007475832864624538, "loss": 2.5532, "step": 22818 }, { "crossentropy": 2.4922726154327393, "epoch": 0.8272549303944315, "grad_norm": 0.02621842548251152, "grad_norm_var": 3.298692930927728e-07, "learning_rate": 0.0007472776618886817, "loss": 2.4303, "step": 22819 }, { "crossentropy": 2.4495465755462646, "epoch": 0.8272911832946636, "grad_norm": 0.026078151538968086, "grad_norm_var": 3.1682248043714507e-07, "learning_rate": 0.000746972094754379, "loss": 2.4634, "step": 22820 }, { "crossentropy": 2.4322965145111084, "epoch": 0.8273274361948956, "grad_norm": 0.026231003925204277, "grad_norm_var": 2.135248131884439e-07, "learning_rate": 0.0007466665850636734, "loss": 2.3862, "step": 22821 }, { "crossentropy": 2.360629081726074, "epoch": 0.8273636890951276, "grad_norm": 0.026335524395108223, "grad_norm_var": 2.049485548466512e-07, "learning_rate": 0.0007463611328206893, "loss": 2.4822, "step": 22822 }, { "crossentropy": 2.388260841369629, "epoch": 0.8273999419953596, "grad_norm": 0.025797145441174507, "grad_norm_var": 2.1520399733146455e-07, "learning_rate": 0.000746055738029554, "loss": 2.3905, "step": 22823 }, { "crossentropy": 2.481840133666992, "epoch": 0.8274361948955916, "grad_norm": 0.02854914776980877, "grad_norm_var": 4.7066984564963535e-07, "learning_rate": 0.0007457504006943916, "loss": 2.511, "step": 22824 }, { "crossentropy": 2.278294801712036, "epoch": 0.8274724477958236, "grad_norm": 0.026949426159262657, "grad_norm_var": 4.7331884274603765e-07, "learning_rate": 0.0007454451208193252, "loss": 2.4503, "step": 22825 }, { "crossentropy": 2.5268547534942627, "epoch": 0.8275087006960556, "grad_norm": 0.02770683355629444, "grad_norm_var": 5.453646275126034e-07, "learning_rate": 0.0007451398984084801, "loss": 2.5187, "step": 22826 }, { "crossentropy": 2.467440128326416, "epoch": 0.8275449535962877, "grad_norm": 0.02607635222375393, "grad_norm_var": 4.846690886626167e-07, "learning_rate": 0.0007448347334659761, "loss": 2.5086, "step": 22827 }, { "crossentropy": 2.6283113956451416, "epoch": 0.8275812064965197, "grad_norm": 0.026341265067458153, "grad_norm_var": 4.832823223593069e-07, "learning_rate": 0.0007445296259959367, "loss": 2.4806, "step": 22828 }, { "crossentropy": 2.40533709526062, "epoch": 0.8276174593967517, "grad_norm": 0.027025409042835236, "grad_norm_var": 4.923232394588376e-07, "learning_rate": 0.000744224576002484, "loss": 2.4959, "step": 22829 }, { "crossentropy": 2.4812653064727783, "epoch": 0.8276537122969838, "grad_norm": 0.026398343965411186, "grad_norm_var": 4.898373316504588e-07, "learning_rate": 0.0007439195834897356, "loss": 2.4538, "step": 22830 }, { "crossentropy": 2.5127835273742676, "epoch": 0.8276899651972158, "grad_norm": 0.02685386687517166, "grad_norm_var": 4.7097562035681013e-07, "learning_rate": 0.0007436146484618129, "loss": 2.4511, "step": 22831 }, { "crossentropy": 2.551893949508667, "epoch": 0.8277262180974478, "grad_norm": 0.026414023712277412, "grad_norm_var": 4.739094932802347e-07, "learning_rate": 0.0007433097709228336, "loss": 2.5062, "step": 22832 }, { "crossentropy": 2.581308126449585, "epoch": 0.8277624709976799, "grad_norm": 0.02694886364042759, "grad_norm_var": 4.7921801250166e-07, "learning_rate": 0.0007430049508769149, "loss": 2.4954, "step": 22833 }, { "crossentropy": 2.26456880569458, "epoch": 0.8277987238979119, "grad_norm": 0.02836368791759014, "grad_norm_var": 6.571787388448634e-07, "learning_rate": 0.0007427001883281742, "loss": 2.4271, "step": 22834 }, { "crossentropy": 2.5382375717163086, "epoch": 0.8278349767981439, "grad_norm": 0.027795841917395592, "grad_norm_var": 6.971131426460862e-07, "learning_rate": 0.0007423954832807295, "loss": 2.5648, "step": 22835 }, { "crossentropy": 2.3410913944244385, "epoch": 0.8278712296983759, "grad_norm": 0.02645483985543251, "grad_norm_var": 6.663838510351788e-07, "learning_rate": 0.0007420908357386935, "loss": 2.3799, "step": 22836 }, { "crossentropy": 2.501249313354492, "epoch": 0.8279074825986079, "grad_norm": 0.026339203119277954, "grad_norm_var": 6.576070762096449e-07, "learning_rate": 0.0007417862457061836, "loss": 2.4333, "step": 22837 }, { "crossentropy": 2.484712839126587, "epoch": 0.8279437354988399, "grad_norm": 0.02613915503025055, "grad_norm_var": 6.747143736571346e-07, "learning_rate": 0.0007414817131873119, "loss": 2.4791, "step": 22838 }, { "crossentropy": 2.30495023727417, "epoch": 0.8279799883990719, "grad_norm": 0.025133058428764343, "grad_norm_var": 7.985651118586415e-07, "learning_rate": 0.0007411772381861925, "loss": 2.3313, "step": 22839 }, { "crossentropy": 2.5469272136688232, "epoch": 0.828016241299304, "grad_norm": 0.026624014601111412, "grad_norm_var": 5.922783130976952e-07, "learning_rate": 0.0007408728207069382, "loss": 2.4826, "step": 22840 }, { "crossentropy": 2.317460536956787, "epoch": 0.828052494199536, "grad_norm": 0.02668105997145176, "grad_norm_var": 5.886690405229599e-07, "learning_rate": 0.0007405684607536584, "loss": 2.3436, "step": 22841 }, { "crossentropy": 2.3714699745178223, "epoch": 0.828088747099768, "grad_norm": 0.025923533365130424, "grad_norm_var": 5.49454766531933e-07, "learning_rate": 0.0007402641583304653, "loss": 2.4585, "step": 22842 }, { "crossentropy": 2.438037633895874, "epoch": 0.828125, "grad_norm": 0.02739202417433262, "grad_norm_var": 5.667411223713355e-07, "learning_rate": 0.0007399599134414709, "loss": 2.4832, "step": 22843 }, { "crossentropy": 2.490473747253418, "epoch": 0.828161252900232, "grad_norm": 0.026079094037413597, "grad_norm_var": 5.827646494098911e-07, "learning_rate": 0.000739655726090781, "loss": 2.4774, "step": 22844 }, { "crossentropy": 2.3657331466674805, "epoch": 0.828197505800464, "grad_norm": 0.02638690173625946, "grad_norm_var": 5.771685573060607e-07, "learning_rate": 0.0007393515962825071, "loss": 2.4549, "step": 22845 }, { "crossentropy": 2.4281649589538574, "epoch": 0.828233758700696, "grad_norm": 0.02749728597700596, "grad_norm_var": 6.201010731005444e-07, "learning_rate": 0.0007390475240207545, "loss": 2.4007, "step": 22846 }, { "crossentropy": 2.464845895767212, "epoch": 0.8282700116009281, "grad_norm": 0.025755422189831734, "grad_norm_var": 6.713885492957375e-07, "learning_rate": 0.0007387435093096317, "loss": 2.3396, "step": 22847 }, { "crossentropy": 2.4930403232574463, "epoch": 0.8283062645011601, "grad_norm": 0.029926836490631104, "grad_norm_var": 1.3459207781659743e-06, "learning_rate": 0.0007384395521532461, "loss": 2.4912, "step": 22848 }, { "crossentropy": 2.3435747623443604, "epoch": 0.8283425174013921, "grad_norm": 0.02624528855085373, "grad_norm_var": 1.3666517200653829e-06, "learning_rate": 0.0007381356525556998, "loss": 2.4544, "step": 22849 }, { "crossentropy": 2.4943079948425293, "epoch": 0.8283787703016241, "grad_norm": 0.026950854808092117, "grad_norm_var": 1.1961049738551389e-06, "learning_rate": 0.000737831810521099, "loss": 2.5318, "step": 22850 }, { "crossentropy": 2.5599117279052734, "epoch": 0.8284150232018561, "grad_norm": 0.027500320225954056, "grad_norm_var": 1.158690342394292e-06, "learning_rate": 0.0007375280260535483, "loss": 2.4801, "step": 22851 }, { "crossentropy": 2.478797674179077, "epoch": 0.8284512761020881, "grad_norm": 0.026101935654878616, "grad_norm_var": 1.177506713180088e-06, "learning_rate": 0.0007372242991571499, "loss": 2.4253, "step": 22852 }, { "crossentropy": 2.551074504852295, "epoch": 0.8284875290023201, "grad_norm": 0.026175739243626595, "grad_norm_var": 1.1863265666767209e-06, "learning_rate": 0.0007369206298360065, "loss": 2.5137, "step": 22853 }, { "crossentropy": 2.3223485946655273, "epoch": 0.8285237819025522, "grad_norm": 0.02702784352004528, "grad_norm_var": 1.1743227525856052e-06, "learning_rate": 0.0007366170180942189, "loss": 2.3923, "step": 22854 }, { "crossentropy": 2.3936173915863037, "epoch": 0.8285600348027842, "grad_norm": 0.02747497335076332, "grad_norm_var": 1.0238953861966498e-06, "learning_rate": 0.0007363134639358881, "loss": 2.4334, "step": 22855 }, { "crossentropy": 2.5384726524353027, "epoch": 0.8285962877030162, "grad_norm": 0.025721676647663116, "grad_norm_var": 1.1030486890152453e-06, "learning_rate": 0.0007360099673651172, "loss": 2.4925, "step": 22856 }, { "crossentropy": 2.4212493896484375, "epoch": 0.8286325406032483, "grad_norm": 0.02680061385035515, "grad_norm_var": 1.1020054057683428e-06, "learning_rate": 0.0007357065283859998, "loss": 2.4614, "step": 22857 }, { "crossentropy": 2.4336116313934326, "epoch": 0.8286687935034803, "grad_norm": 0.026468336582183838, "grad_norm_var": 1.0561611872871023e-06, "learning_rate": 0.0007354031470026368, "loss": 2.5053, "step": 22858 }, { "crossentropy": 2.455575466156006, "epoch": 0.8287050464037123, "grad_norm": 0.027427900582551956, "grad_norm_var": 1.0588627743471675e-06, "learning_rate": 0.0007350998232191275, "loss": 2.4764, "step": 22859 }, { "crossentropy": 2.5090010166168213, "epoch": 0.8287412993039444, "grad_norm": 0.026216771453619003, "grad_norm_var": 1.045963618632323e-06, "learning_rate": 0.000734796557039566, "loss": 2.462, "step": 22860 }, { "crossentropy": 2.2614855766296387, "epoch": 0.8287775522041764, "grad_norm": 0.026027178391814232, "grad_norm_var": 1.0764987292322896e-06, "learning_rate": 0.0007344933484680511, "loss": 2.395, "step": 22861 }, { "crossentropy": 2.347975492477417, "epoch": 0.8288138051044084, "grad_norm": 0.025651901960372925, "grad_norm_var": 1.125751752021192e-06, "learning_rate": 0.0007341901975086751, "loss": 2.3878, "step": 22862 }, { "crossentropy": 2.403897285461426, "epoch": 0.8288500580046404, "grad_norm": 0.026483474299311638, "grad_norm_var": 1.0655269924590242e-06, "learning_rate": 0.0007338871041655348, "loss": 2.4055, "step": 22863 }, { "crossentropy": 2.433356523513794, "epoch": 0.8288863109048724, "grad_norm": 0.026136869564652443, "grad_norm_var": 3.642887949032904e-07, "learning_rate": 0.0007335840684427236, "loss": 2.4388, "step": 22864 }, { "crossentropy": 2.378005266189575, "epoch": 0.8289225638051044, "grad_norm": 0.0275513157248497, "grad_norm_var": 4.220602749572712e-07, "learning_rate": 0.0007332810903443344, "loss": 2.4503, "step": 22865 }, { "crossentropy": 2.477983236312866, "epoch": 0.8289588167053364, "grad_norm": 0.028151320293545723, "grad_norm_var": 5.671111411265547e-07, "learning_rate": 0.000732978169874458, "loss": 2.4224, "step": 22866 }, { "crossentropy": 2.591923952102661, "epoch": 0.8289950696055685, "grad_norm": 0.026138942688703537, "grad_norm_var": 5.344764173081014e-07, "learning_rate": 0.0007326753070371878, "loss": 2.5395, "step": 22867 }, { "crossentropy": 2.4592063426971436, "epoch": 0.8290313225058005, "grad_norm": 0.0267435722053051, "grad_norm_var": 5.178283592248423e-07, "learning_rate": 0.0007323725018366123, "loss": 2.4336, "step": 22868 }, { "crossentropy": 2.526707649230957, "epoch": 0.8290675754060325, "grad_norm": 0.027484750375151634, "grad_norm_var": 5.443465360780233e-07, "learning_rate": 0.000732069754276824, "loss": 2.425, "step": 22869 }, { "crossentropy": 2.406510829925537, "epoch": 0.8291038283062645, "grad_norm": 0.02681158296763897, "grad_norm_var": 5.383703546916914e-07, "learning_rate": 0.000731767064361909, "loss": 2.4141, "step": 22870 }, { "crossentropy": 2.395357847213745, "epoch": 0.8291400812064965, "grad_norm": 0.025357093662023544, "grad_norm_var": 6.014779628174885e-07, "learning_rate": 0.0007314644320959574, "loss": 2.4316, "step": 22871 }, { "crossentropy": 2.557861804962158, "epoch": 0.8291763341067285, "grad_norm": 0.026594463735818863, "grad_norm_var": 5.49979366324301e-07, "learning_rate": 0.0007311618574830569, "loss": 2.5242, "step": 22872 }, { "crossentropy": 2.4731059074401855, "epoch": 0.8292125870069605, "grad_norm": 0.025810016319155693, "grad_norm_var": 5.884950218591148e-07, "learning_rate": 0.0007308593405272946, "loss": 2.4875, "step": 22873 }, { "crossentropy": 2.504234790802002, "epoch": 0.8292488399071926, "grad_norm": 0.027168497443199158, "grad_norm_var": 6.100197227643418e-07, "learning_rate": 0.0007305568812327535, "loss": 2.4709, "step": 22874 }, { "crossentropy": 2.4446070194244385, "epoch": 0.8292850928074246, "grad_norm": 0.027234816923737526, "grad_norm_var": 5.912863757680242e-07, "learning_rate": 0.0007302544796035226, "loss": 2.4377, "step": 22875 }, { "crossentropy": 2.561086893081665, "epoch": 0.8293213457076566, "grad_norm": 0.02630784921348095, "grad_norm_var": 5.871794203442335e-07, "learning_rate": 0.0007299521356436833, "loss": 2.463, "step": 22876 }, { "crossentropy": 2.568802833557129, "epoch": 0.8293575986078886, "grad_norm": 0.02662217803299427, "grad_norm_var": 5.635961425365463e-07, "learning_rate": 0.0007296498493573212, "loss": 2.5579, "step": 22877 }, { "crossentropy": 2.6977057456970215, "epoch": 0.8293938515081206, "grad_norm": 0.02651032991707325, "grad_norm_var": 4.964956593358587e-07, "learning_rate": 0.0007293476207485178, "loss": 2.5533, "step": 22878 }, { "crossentropy": 2.4646902084350586, "epoch": 0.8294301044083526, "grad_norm": 0.026913685724139214, "grad_norm_var": 4.959761830146403e-07, "learning_rate": 0.000729045449821355, "loss": 2.5062, "step": 22879 }, { "crossentropy": 2.4886817932128906, "epoch": 0.8294663573085846, "grad_norm": 0.02699289470911026, "grad_norm_var": 4.750949963560341e-07, "learning_rate": 0.0007287433365799167, "loss": 2.5263, "step": 22880 }, { "crossentropy": 2.1555275917053223, "epoch": 0.8295026102088167, "grad_norm": 0.027400411665439606, "grad_norm_var": 4.608899430023067e-07, "learning_rate": 0.0007284412810282798, "loss": 2.3147, "step": 22881 }, { "crossentropy": 2.3377671241760254, "epoch": 0.8295388631090487, "grad_norm": 0.02655007503926754, "grad_norm_var": 3.2519268863376246e-07, "learning_rate": 0.0007281392831705269, "loss": 2.3626, "step": 22882 }, { "crossentropy": 2.447683095932007, "epoch": 0.8295751160092807, "grad_norm": 0.027222782373428345, "grad_norm_var": 3.225799127269061e-07, "learning_rate": 0.000727837343010736, "loss": 2.4257, "step": 22883 }, { "crossentropy": 2.4305875301361084, "epoch": 0.8296113689095128, "grad_norm": 0.025422189384698868, "grad_norm_var": 4.2981250905313406e-07, "learning_rate": 0.0007275354605529838, "loss": 2.4741, "step": 22884 }, { "crossentropy": 2.407909393310547, "epoch": 0.8296476218097448, "grad_norm": 0.02729921229183674, "grad_norm_var": 4.113192280510847e-07, "learning_rate": 0.0007272336358013504, "loss": 2.3887, "step": 22885 }, { "crossentropy": 2.4173707962036133, "epoch": 0.8296838747099768, "grad_norm": 0.027118626981973648, "grad_norm_var": 4.242920377130199e-07, "learning_rate": 0.0007269318687599092, "loss": 2.5095, "step": 22886 }, { "crossentropy": 2.4363114833831787, "epoch": 0.8297201276102089, "grad_norm": 0.027410900220274925, "grad_norm_var": 3.317325276040121e-07, "learning_rate": 0.0007266301594327379, "loss": 2.4259, "step": 22887 }, { "crossentropy": 2.4985976219177246, "epoch": 0.8297563805104409, "grad_norm": 0.025855178013443947, "grad_norm_var": 3.847895429437883e-07, "learning_rate": 0.000726328507823913, "loss": 2.462, "step": 22888 }, { "crossentropy": 2.393211841583252, "epoch": 0.8297926334106729, "grad_norm": 0.027147863060235977, "grad_norm_var": 3.307680430893045e-07, "learning_rate": 0.0007260269139375053, "loss": 2.4102, "step": 22889 }, { "crossentropy": 2.4547834396362305, "epoch": 0.8298288863109049, "grad_norm": 0.026351360604166985, "grad_norm_var": 3.349222183044245e-07, "learning_rate": 0.0007257253777775913, "loss": 2.466, "step": 22890 }, { "crossentropy": 2.580291986465454, "epoch": 0.8298651392111369, "grad_norm": 0.026779593899846077, "grad_norm_var": 3.1981433749137674e-07, "learning_rate": 0.0007254238993482431, "loss": 2.5273, "step": 22891 }, { "crossentropy": 2.3865926265716553, "epoch": 0.8299013921113689, "grad_norm": 0.02687501162290573, "grad_norm_var": 3.069311221538199e-07, "learning_rate": 0.0007251224786535305, "loss": 2.4832, "step": 22892 }, { "crossentropy": 2.4696905612945557, "epoch": 0.8299376450116009, "grad_norm": 0.025377513840794563, "grad_norm_var": 4.2986687880801287e-07, "learning_rate": 0.000724821115697527, "loss": 2.4089, "step": 22893 }, { "crossentropy": 2.4463279247283936, "epoch": 0.829973897911833, "grad_norm": 0.026092128828167915, "grad_norm_var": 4.514699581230208e-07, "learning_rate": 0.0007245198104843015, "loss": 2.3883, "step": 22894 }, { "crossentropy": 2.562530040740967, "epoch": 0.830010150812065, "grad_norm": 0.026263311505317688, "grad_norm_var": 4.5725971577944187e-07, "learning_rate": 0.0007242185630179243, "loss": 2.4986, "step": 22895 }, { "crossentropy": 2.3067920207977295, "epoch": 0.830046403712297, "grad_norm": 0.026344502344727516, "grad_norm_var": 4.5258956942553255e-07, "learning_rate": 0.0007239173733024651, "loss": 2.363, "step": 22896 }, { "crossentropy": 2.5406341552734375, "epoch": 0.830082656612529, "grad_norm": 0.02716856822371483, "grad_norm_var": 4.3103373385589874e-07, "learning_rate": 0.0007236162413419894, "loss": 2.45, "step": 22897 }, { "crossentropy": 2.590240955352783, "epoch": 0.830118909512761, "grad_norm": 0.026784813031554222, "grad_norm_var": 4.335433125195274e-07, "learning_rate": 0.000723315167140568, "loss": 2.5245, "step": 22898 }, { "crossentropy": 2.5220696926116943, "epoch": 0.830155162412993, "grad_norm": 0.027522949501872063, "grad_norm_var": 4.6431598429794856e-07, "learning_rate": 0.0007230141507022647, "loss": 2.5058, "step": 22899 }, { "crossentropy": 2.2868473529815674, "epoch": 0.830191415313225, "grad_norm": 0.02656608819961548, "grad_norm_var": 3.6442071075643013e-07, "learning_rate": 0.0007227131920311447, "loss": 2.2698, "step": 22900 }, { "crossentropy": 2.452606439590454, "epoch": 0.8302276682134571, "grad_norm": 0.026429012417793274, "grad_norm_var": 3.404664617984021e-07, "learning_rate": 0.0007224122911312753, "loss": 2.4189, "step": 22901 }, { "crossentropy": 2.46913480758667, "epoch": 0.8302639211136891, "grad_norm": 0.02766762301325798, "grad_norm_var": 3.9503703170329304e-07, "learning_rate": 0.0007221114480067186, "loss": 2.4576, "step": 22902 }, { "crossentropy": 2.3642232418060303, "epoch": 0.8303001740139211, "grad_norm": 0.027489205822348595, "grad_norm_var": 4.032103598136747e-07, "learning_rate": 0.0007218106626615384, "loss": 2.4873, "step": 22903 }, { "crossentropy": 2.3487536907196045, "epoch": 0.8303364269141531, "grad_norm": 0.026584498584270477, "grad_norm_var": 3.572511821267957e-07, "learning_rate": 0.0007215099350997989, "loss": 2.4477, "step": 22904 }, { "crossentropy": 2.4267706871032715, "epoch": 0.8303726798143851, "grad_norm": 0.025741854682564735, "grad_norm_var": 3.997044143110116e-07, "learning_rate": 0.0007212092653255592, "loss": 2.4144, "step": 22905 }, { "crossentropy": 2.530308246612549, "epoch": 0.8304089327146171, "grad_norm": 0.027309084311127663, "grad_norm_var": 4.217852239789815e-07, "learning_rate": 0.0007209086533428833, "loss": 2.5644, "step": 22906 }, { "crossentropy": 2.4342751502990723, "epoch": 0.8304451856148491, "grad_norm": 0.025778284296393394, "grad_norm_var": 4.7211838306499864e-07, "learning_rate": 0.0007206080991558289, "loss": 2.4039, "step": 22907 }, { "crossentropy": 2.5183374881744385, "epoch": 0.8304814385150812, "grad_norm": 0.02677360735833645, "grad_norm_var": 4.69376070563049e-07, "learning_rate": 0.0007203076027684557, "loss": 2.452, "step": 22908 }, { "crossentropy": 2.351597309112549, "epoch": 0.8305176914153132, "grad_norm": 0.026182183995842934, "grad_norm_var": 3.7671965441530657e-07, "learning_rate": 0.0007200071641848244, "loss": 2.4435, "step": 22909 }, { "crossentropy": 2.5268514156341553, "epoch": 0.8305539443155452, "grad_norm": 0.027513829991221428, "grad_norm_var": 3.937694513053124e-07, "learning_rate": 0.0007197067834089899, "loss": 2.4893, "step": 22910 }, { "crossentropy": 2.418134927749634, "epoch": 0.8305901972157773, "grad_norm": 0.0251766350120306, "grad_norm_var": 5.391713563602004e-07, "learning_rate": 0.0007194064604450112, "loss": 2.3921, "step": 22911 }, { "crossentropy": 2.448402166366577, "epoch": 0.8306264501160093, "grad_norm": 0.02696266584098339, "grad_norm_var": 5.346150944848891e-07, "learning_rate": 0.0007191061952969452, "loss": 2.4506, "step": 22912 }, { "crossentropy": 2.4261860847473145, "epoch": 0.8306627030162413, "grad_norm": 0.02585607022047043, "grad_norm_var": 5.652132262077969e-07, "learning_rate": 0.0007188059879688452, "loss": 2.4527, "step": 22913 }, { "crossentropy": 2.5259673595428467, "epoch": 0.8306989559164734, "grad_norm": 0.02699972316622734, "grad_norm_var": 5.720732082735436e-07, "learning_rate": 0.0007185058384647691, "loss": 2.5232, "step": 22914 }, { "crossentropy": 2.3789045810699463, "epoch": 0.8307352088167054, "grad_norm": 0.027491511777043343, "grad_norm_var": 5.685160054289331e-07, "learning_rate": 0.0007182057467887682, "loss": 2.4044, "step": 22915 }, { "crossentropy": 2.409734010696411, "epoch": 0.8307714617169374, "grad_norm": 0.026003919541835785, "grad_norm_var": 5.951287531358665e-07, "learning_rate": 0.0007179057129448957, "loss": 2.4408, "step": 22916 }, { "crossentropy": 2.4753971099853516, "epoch": 0.8308077146171694, "grad_norm": 0.026926038786768913, "grad_norm_var": 5.977471974773537e-07, "learning_rate": 0.0007176057369372063, "loss": 2.5091, "step": 22917 }, { "crossentropy": 2.387601137161255, "epoch": 0.8308439675174014, "grad_norm": 0.027273891493678093, "grad_norm_var": 5.54199686183104e-07, "learning_rate": 0.0007173058187697489, "loss": 2.368, "step": 22918 }, { "crossentropy": 2.5033304691314697, "epoch": 0.8308802204176334, "grad_norm": 0.026331041008234024, "grad_norm_var": 5.051895082814565e-07, "learning_rate": 0.0007170059584465754, "loss": 2.4421, "step": 22919 }, { "crossentropy": 2.5319161415100098, "epoch": 0.8309164733178654, "grad_norm": 0.02793864533305168, "grad_norm_var": 6.248423426563707e-07, "learning_rate": 0.0007167061559717375, "loss": 2.5087, "step": 22920 }, { "crossentropy": 2.413998603820801, "epoch": 0.8309527262180975, "grad_norm": 0.026491442695260048, "grad_norm_var": 5.700761818944276e-07, "learning_rate": 0.0007164064113492818, "loss": 2.4287, "step": 22921 }, { "crossentropy": 2.4684131145477295, "epoch": 0.8309889791183295, "grad_norm": 0.02712389826774597, "grad_norm_var": 5.568849497381359e-07, "learning_rate": 0.0007161067245832598, "loss": 2.5299, "step": 22922 }, { "crossentropy": 2.399193048477173, "epoch": 0.8310252320185615, "grad_norm": 0.027333365753293037, "grad_norm_var": 5.21795455248747e-07, "learning_rate": 0.0007158070956777157, "loss": 2.4078, "step": 22923 }, { "crossentropy": 2.569519519805908, "epoch": 0.8310614849187935, "grad_norm": 0.026705216616392136, "grad_norm_var": 5.220882149695964e-07, "learning_rate": 0.0007155075246366999, "loss": 2.4738, "step": 22924 }, { "crossentropy": 2.3506059646606445, "epoch": 0.8310977378190255, "grad_norm": 0.025764795020222664, "grad_norm_var": 5.656551123631257e-07, "learning_rate": 0.0007152080114642561, "loss": 2.3267, "step": 22925 }, { "crossentropy": 2.4265072345733643, "epoch": 0.8311339907192575, "grad_norm": 0.025528525933623314, "grad_norm_var": 6.080279713378215e-07, "learning_rate": 0.0007149085561644303, "loss": 2.3717, "step": 22926 }, { "crossentropy": 2.4668567180633545, "epoch": 0.8311702436194895, "grad_norm": 0.025523049756884575, "grad_norm_var": 5.488975272139775e-07, "learning_rate": 0.0007146091587412667, "loss": 2.4479, "step": 22927 }, { "crossentropy": 2.4790871143341064, "epoch": 0.8312064965197216, "grad_norm": 0.025472665205597878, "grad_norm_var": 6.237222862620947e-07, "learning_rate": 0.0007143098191988112, "loss": 2.3422, "step": 22928 }, { "crossentropy": 2.3397185802459717, "epoch": 0.8312427494199536, "grad_norm": 0.02670845203101635, "grad_norm_var": 5.905233396998981e-07, "learning_rate": 0.0007140105375411044, "loss": 2.3457, "step": 22929 }, { "crossentropy": 2.50046968460083, "epoch": 0.8312790023201856, "grad_norm": 0.0262420866638422, "grad_norm_var": 5.861220114062772e-07, "learning_rate": 0.0007137113137721901, "loss": 2.4215, "step": 22930 }, { "crossentropy": 2.4080889225006104, "epoch": 0.8313152552204176, "grad_norm": 0.027191659435629845, "grad_norm_var": 5.542458334678677e-07, "learning_rate": 0.0007134121478961081, "loss": 2.523, "step": 22931 }, { "crossentropy": 2.4537148475646973, "epoch": 0.8313515081206496, "grad_norm": 0.026389911770820618, "grad_norm_var": 5.362295202297144e-07, "learning_rate": 0.0007131130399169017, "loss": 2.4708, "step": 22932 }, { "crossentropy": 2.440537214279175, "epoch": 0.8313877610208816, "grad_norm": 0.02709398791193962, "grad_norm_var": 5.462106713154576e-07, "learning_rate": 0.0007128139898386094, "loss": 2.4602, "step": 22933 }, { "crossentropy": 2.5560545921325684, "epoch": 0.8314240139211136, "grad_norm": 0.026391683146357536, "grad_norm_var": 5.120025545169808e-07, "learning_rate": 0.0007125149976652684, "loss": 2.51, "step": 22934 }, { "crossentropy": 2.570497512817383, "epoch": 0.8314602668213457, "grad_norm": 0.02741074562072754, "grad_norm_var": 5.58465970552866e-07, "learning_rate": 0.0007122160634009195, "loss": 2.5084, "step": 22935 }, { "crossentropy": 2.4703447818756104, "epoch": 0.8314965197215777, "grad_norm": 0.0259298887103796, "grad_norm_var": 4.4727259530633e-07, "learning_rate": 0.0007119171870495999, "loss": 2.4587, "step": 22936 }, { "crossentropy": 2.302882194519043, "epoch": 0.8315327726218097, "grad_norm": 0.025656839832663536, "grad_norm_var": 4.869010198429857e-07, "learning_rate": 0.000711618368615346, "loss": 2.346, "step": 22937 }, { "crossentropy": 2.394085168838501, "epoch": 0.8315690255220418, "grad_norm": 0.025478798896074295, "grad_norm_var": 4.981787885712513e-07, "learning_rate": 0.0007113196081021944, "loss": 2.3823, "step": 22938 }, { "crossentropy": 2.341395854949951, "epoch": 0.8316052784222738, "grad_norm": 0.02669811062514782, "grad_norm_var": 4.359885487585964e-07, "learning_rate": 0.0007110209055141786, "loss": 2.4825, "step": 22939 }, { "crossentropy": 2.5316507816314697, "epoch": 0.8316415313225058, "grad_norm": 0.026596611365675926, "grad_norm_var": 4.3030260294067005e-07, "learning_rate": 0.0007107222608553354, "loss": 2.5154, "step": 22940 }, { "crossentropy": 2.343709945678711, "epoch": 0.8316777842227379, "grad_norm": 0.026033906266093254, "grad_norm_var": 4.1724452410421554e-07, "learning_rate": 0.0007104236741296977, "loss": 2.3788, "step": 22941 }, { "crossentropy": 2.384211778640747, "epoch": 0.8317140371229699, "grad_norm": 0.026330286636948586, "grad_norm_var": 3.779762694832688e-07, "learning_rate": 0.0007101251453412965, "loss": 2.4558, "step": 22942 }, { "crossentropy": 2.4541308879852295, "epoch": 0.8317502900232019, "grad_norm": 0.02659166418015957, "grad_norm_var": 3.355408819668536e-07, "learning_rate": 0.0007098266744941656, "loss": 2.4142, "step": 22943 }, { "crossentropy": 2.478226900100708, "epoch": 0.8317865429234339, "grad_norm": 0.025964733213186264, "grad_norm_var": 2.905816739524053e-07, "learning_rate": 0.0007095282615923365, "loss": 2.4584, "step": 22944 }, { "crossentropy": 2.5579123497009277, "epoch": 0.8318227958236659, "grad_norm": 0.03132835030555725, "grad_norm_var": 1.8026398550430775e-06, "learning_rate": 0.0007092299066398389, "loss": 2.5733, "step": 22945 }, { "crossentropy": 2.419217109680176, "epoch": 0.8318590487238979, "grad_norm": 0.026080003008246422, "grad_norm_var": 1.814352432467856e-06, "learning_rate": 0.0007089316096407034, "loss": 2.4995, "step": 22946 }, { "crossentropy": 2.447765588760376, "epoch": 0.83189530162413, "grad_norm": 0.026359492912888527, "grad_norm_var": 1.8028538250514556e-06, "learning_rate": 0.0007086333705989578, "loss": 2.4127, "step": 22947 }, { "crossentropy": 2.2841904163360596, "epoch": 0.831931554524362, "grad_norm": 0.027604317292571068, "grad_norm_var": 1.8535716029339921e-06, "learning_rate": 0.0007083351895186308, "loss": 2.404, "step": 22948 }, { "crossentropy": 2.3870551586151123, "epoch": 0.831967807424594, "grad_norm": 0.026555638760328293, "grad_norm_var": 1.8449725159137697e-06, "learning_rate": 0.0007080370664037516, "loss": 2.4766, "step": 22949 }, { "crossentropy": 2.289299964904785, "epoch": 0.832004060324826, "grad_norm": 0.0256140548735857, "grad_norm_var": 1.9135097718218773e-06, "learning_rate": 0.0007077390012583435, "loss": 2.4147, "step": 22950 }, { "crossentropy": 2.5147793292999268, "epoch": 0.832040313225058, "grad_norm": 0.026323216035962105, "grad_norm_var": 1.8756092249063694e-06, "learning_rate": 0.0007074409940864334, "loss": 2.4467, "step": 22951 }, { "crossentropy": 2.4096944332122803, "epoch": 0.83207656612529, "grad_norm": 0.02618166245520115, "grad_norm_var": 1.8580283039376182e-06, "learning_rate": 0.0007071430448920485, "loss": 2.3611, "step": 22952 }, { "crossentropy": 2.517657518386841, "epoch": 0.832112819025522, "grad_norm": 0.0263975877314806, "grad_norm_var": 1.8004188577548536e-06, "learning_rate": 0.0007068451536792103, "loss": 2.4559, "step": 22953 }, { "crossentropy": 2.4640698432922363, "epoch": 0.832149071925754, "grad_norm": 0.02651665359735489, "grad_norm_var": 1.707930938005947e-06, "learning_rate": 0.000706547320451944, "loss": 2.4855, "step": 22954 }, { "crossentropy": 2.592331647872925, "epoch": 0.8321853248259861, "grad_norm": 0.02733786217868328, "grad_norm_var": 1.7334763102679062e-06, "learning_rate": 0.0007062495452142715, "loss": 2.599, "step": 22955 }, { "crossentropy": 2.4168989658355713, "epoch": 0.8322215777262181, "grad_norm": 0.026960201561450958, "grad_norm_var": 1.734859975498085e-06, "learning_rate": 0.0007059518279702138, "loss": 2.5076, "step": 22956 }, { "crossentropy": 2.4086241722106934, "epoch": 0.8322578306264501, "grad_norm": 0.026249252259731293, "grad_norm_var": 1.7168749316846695e-06, "learning_rate": 0.0007056541687237961, "loss": 2.3968, "step": 22957 }, { "crossentropy": 2.5421900749206543, "epoch": 0.8322940835266821, "grad_norm": 0.027296746149659157, "grad_norm_var": 1.7179868073789623e-06, "learning_rate": 0.000705356567479033, "loss": 2.5011, "step": 22958 }, { "crossentropy": 2.539533853530884, "epoch": 0.8323303364269141, "grad_norm": 0.026140958070755005, "grad_norm_var": 1.745311261334735e-06, "learning_rate": 0.0007050590242399474, "loss": 2.4907, "step": 22959 }, { "crossentropy": 2.5256361961364746, "epoch": 0.8323665893271461, "grad_norm": 0.026024218648672104, "grad_norm_var": 1.7388527004541839e-06, "learning_rate": 0.0007047615390105582, "loss": 2.3382, "step": 22960 }, { "crossentropy": 2.4331064224243164, "epoch": 0.8324028422273781, "grad_norm": 0.027118735015392303, "grad_norm_var": 3.106960081844147e-07, "learning_rate": 0.0007044641117948813, "loss": 2.5078, "step": 22961 }, { "crossentropy": 2.4246585369110107, "epoch": 0.8324390951276102, "grad_norm": 0.026750268414616585, "grad_norm_var": 2.969915278064657e-07, "learning_rate": 0.0007041667425969367, "loss": 2.5262, "step": 22962 }, { "crossentropy": 2.3887598514556885, "epoch": 0.8324753480278422, "grad_norm": 0.026420632377266884, "grad_norm_var": 2.9535073137926653e-07, "learning_rate": 0.0007038694314207383, "loss": 2.4642, "step": 22963 }, { "crossentropy": 2.527822256088257, "epoch": 0.8325116009280742, "grad_norm": 0.02707860805094242, "grad_norm_var": 2.41753570441464e-07, "learning_rate": 0.0007035721782703031, "loss": 2.5135, "step": 22964 }, { "crossentropy": 2.54545259475708, "epoch": 0.8325478538283063, "grad_norm": 0.026757584884762764, "grad_norm_var": 2.441744331268513e-07, "learning_rate": 0.0007032749831496466, "loss": 2.5622, "step": 22965 }, { "crossentropy": 2.6245269775390625, "epoch": 0.8325841067285383, "grad_norm": 0.027217941358685493, "grad_norm_var": 1.9987755184574842e-07, "learning_rate": 0.0007029778460627817, "loss": 2.5843, "step": 22966 }, { "crossentropy": 2.5183603763580322, "epoch": 0.8326203596287703, "grad_norm": 0.026457272469997406, "grad_norm_var": 1.9474402943677785e-07, "learning_rate": 0.0007026807670137209, "loss": 2.4721, "step": 22967 }, { "crossentropy": 2.4574966430664062, "epoch": 0.8326566125290024, "grad_norm": 0.026753241196274757, "grad_norm_var": 1.7705964333544246e-07, "learning_rate": 0.0007023837460064791, "loss": 2.4538, "step": 22968 }, { "crossentropy": 2.45798659324646, "epoch": 0.8326928654292344, "grad_norm": 0.025839775800704956, "grad_norm_var": 2.2028983435358935e-07, "learning_rate": 0.0007020867830450655, "loss": 2.4021, "step": 22969 }, { "crossentropy": 2.501422643661499, "epoch": 0.8327291183294664, "grad_norm": 0.02559163048863411, "grad_norm_var": 2.942235997523019e-07, "learning_rate": 0.0007017898781334936, "loss": 2.4048, "step": 22970 }, { "crossentropy": 2.465019464492798, "epoch": 0.8327653712296984, "grad_norm": 0.02648218721151352, "grad_norm_var": 2.5861822507282767e-07, "learning_rate": 0.0007014930312757706, "loss": 2.4453, "step": 22971 }, { "crossentropy": 2.5479087829589844, "epoch": 0.8328016241299304, "grad_norm": 0.02694535441696644, "grad_norm_var": 2.578619341201775e-07, "learning_rate": 0.0007011962424759077, "loss": 2.4707, "step": 22972 }, { "crossentropy": 2.4436700344085693, "epoch": 0.8328378770301624, "grad_norm": 0.026586368680000305, "grad_norm_var": 2.505352775903265e-07, "learning_rate": 0.0007008995117379141, "loss": 2.534, "step": 22973 }, { "crossentropy": 2.4514122009277344, "epoch": 0.8328741299303944, "grad_norm": 0.026691677048802376, "grad_norm_var": 2.1650822386957518e-07, "learning_rate": 0.000700602839065797, "loss": 2.445, "step": 22974 }, { "crossentropy": 2.3108463287353516, "epoch": 0.8329103828306265, "grad_norm": 0.025708800181746483, "grad_norm_var": 2.5195348681664116e-07, "learning_rate": 0.0007003062244635616, "loss": 2.4226, "step": 22975 }, { "crossentropy": 2.4578723907470703, "epoch": 0.8329466357308585, "grad_norm": 0.025441695004701614, "grad_norm_var": 3.121753888772471e-07, "learning_rate": 0.0007000096679352169, "loss": 2.4733, "step": 22976 }, { "crossentropy": 2.4144198894500732, "epoch": 0.8329828886310905, "grad_norm": 0.0272861085832119, "grad_norm_var": 3.27954935638155e-07, "learning_rate": 0.0006997131694847664, "loss": 2.4127, "step": 22977 }, { "crossentropy": 2.3324084281921387, "epoch": 0.8330191415313225, "grad_norm": 0.026049528270959854, "grad_norm_var": 3.353150582020732e-07, "learning_rate": 0.0006994167291162162, "loss": 2.4398, "step": 22978 }, { "crossentropy": 2.4997928142547607, "epoch": 0.8330553944315545, "grad_norm": 0.026729008182883263, "grad_norm_var": 3.3977245200803954e-07, "learning_rate": 0.0006991203468335683, "loss": 2.5297, "step": 22979 }, { "crossentropy": 2.37713360786438, "epoch": 0.8330916473317865, "grad_norm": 0.027729924768209457, "grad_norm_var": 4.186133764879588e-07, "learning_rate": 0.0006988240226408272, "loss": 2.3601, "step": 22980 }, { "crossentropy": 2.3755900859832764, "epoch": 0.8331279002320185, "grad_norm": 0.02773905359208584, "grad_norm_var": 5.1033387567545e-07, "learning_rate": 0.0006985277565419956, "loss": 2.398, "step": 22981 }, { "crossentropy": 2.4915528297424316, "epoch": 0.8331641531322506, "grad_norm": 0.02621523290872574, "grad_norm_var": 4.876293718591417e-07, "learning_rate": 0.0006982315485410734, "loss": 2.3901, "step": 22982 }, { "crossentropy": 2.4893784523010254, "epoch": 0.8332004060324826, "grad_norm": 0.02741936594247818, "grad_norm_var": 5.380206350652852e-07, "learning_rate": 0.0006979353986420639, "loss": 2.5106, "step": 22983 }, { "crossentropy": 2.5407280921936035, "epoch": 0.8332366589327146, "grad_norm": 0.026486949995160103, "grad_norm_var": 5.361438971099656e-07, "learning_rate": 0.0006976393068489651, "loss": 2.5191, "step": 22984 }, { "crossentropy": 2.3899505138397217, "epoch": 0.8332729118329466, "grad_norm": 0.02615535445511341, "grad_norm_var": 5.121088788472093e-07, "learning_rate": 0.0006973432731657753, "loss": 2.4504, "step": 22985 }, { "crossentropy": 2.4838387966156006, "epoch": 0.8333091647331786, "grad_norm": 0.026519382372498512, "grad_norm_var": 4.438107788263974e-07, "learning_rate": 0.0006970472975964953, "loss": 2.5115, "step": 22986 }, { "crossentropy": 2.476062297821045, "epoch": 0.8333454176334106, "grad_norm": 0.026911895722150803, "grad_norm_var": 4.4650296566565534e-07, "learning_rate": 0.0006967513801451203, "loss": 2.4879, "step": 22987 }, { "crossentropy": 2.4007785320281982, "epoch": 0.8333816705336426, "grad_norm": 0.026470595970749855, "grad_norm_var": 4.427473011499317e-07, "learning_rate": 0.0006964555208156486, "loss": 2.4087, "step": 22988 }, { "crossentropy": 2.3800384998321533, "epoch": 0.8334179234338747, "grad_norm": 0.026960914954543114, "grad_norm_var": 4.4914596936279116e-07, "learning_rate": 0.0006961597196120773, "loss": 2.4044, "step": 22989 }, { "crossentropy": 2.428422212600708, "epoch": 0.8334541763341067, "grad_norm": 0.026307372376322746, "grad_norm_var": 4.566108961676172e-07, "learning_rate": 0.0006958639765383983, "loss": 2.5131, "step": 22990 }, { "crossentropy": 2.495899200439453, "epoch": 0.8334904292343387, "grad_norm": 0.026508625596761703, "grad_norm_var": 3.9801242190292314e-07, "learning_rate": 0.00069556829159861, "loss": 2.4301, "step": 22991 }, { "crossentropy": 2.4985530376434326, "epoch": 0.8335266821345708, "grad_norm": 0.027214793488383293, "grad_norm_var": 3.009995548342113e-07, "learning_rate": 0.0006952726647967039, "loss": 2.35, "step": 22992 }, { "crossentropy": 2.3898863792419434, "epoch": 0.8335629350348028, "grad_norm": 0.026340177282691002, "grad_norm_var": 2.948577380099474e-07, "learning_rate": 0.0006949770961366719, "loss": 2.4278, "step": 22993 }, { "crossentropy": 2.4740750789642334, "epoch": 0.8335991879350348, "grad_norm": 0.025915009900927544, "grad_norm_var": 3.082811146473078e-07, "learning_rate": 0.0006946815856225087, "loss": 2.4514, "step": 22994 }, { "crossentropy": 2.6155080795288086, "epoch": 0.8336354408352669, "grad_norm": 0.026725007221102715, "grad_norm_var": 3.0828076568998645e-07, "learning_rate": 0.0006943861332582024, "loss": 2.5305, "step": 22995 }, { "crossentropy": 2.441725492477417, "epoch": 0.8336716937354989, "grad_norm": 0.026807326823472977, "grad_norm_var": 2.3801220682081493e-07, "learning_rate": 0.0006940907390477458, "loss": 2.4548, "step": 22996 }, { "crossentropy": 2.413611650466919, "epoch": 0.8337079466357309, "grad_norm": 0.026243269443511963, "grad_norm_var": 1.6435209509932381e-07, "learning_rate": 0.0006937954029951293, "loss": 2.4582, "step": 22997 }, { "crossentropy": 2.5335140228271484, "epoch": 0.8337441995359629, "grad_norm": 0.027590224519371986, "grad_norm_var": 2.165432150085979e-07, "learning_rate": 0.0006935001251043399, "loss": 2.5051, "step": 22998 }, { "crossentropy": 2.303713321685791, "epoch": 0.8337804524361949, "grad_norm": 0.026741746813058853, "grad_norm_var": 1.7672492842097688e-07, "learning_rate": 0.0006932049053793676, "loss": 2.3171, "step": 22999 }, { "crossentropy": 2.412773370742798, "epoch": 0.8338167053364269, "grad_norm": 0.027326922863721848, "grad_norm_var": 2.0607042705705903e-07, "learning_rate": 0.0006929097438241988, "loss": 2.4601, "step": 23000 }, { "crossentropy": 2.375995397567749, "epoch": 0.833852958236659, "grad_norm": 0.026992307975888252, "grad_norm_var": 1.9229000744700833e-07, "learning_rate": 0.0006926146404428196, "loss": 2.3765, "step": 23001 }, { "crossentropy": 2.6187734603881836, "epoch": 0.833889211136891, "grad_norm": 0.02723105065524578, "grad_norm_var": 2.0457848305440127e-07, "learning_rate": 0.0006923195952392169, "loss": 2.5637, "step": 23002 }, { "crossentropy": 2.535492181777954, "epoch": 0.833925464037123, "grad_norm": 0.026049423962831497, "grad_norm_var": 2.3451666204722897e-07, "learning_rate": 0.0006920246082173742, "loss": 2.4188, "step": 23003 }, { "crossentropy": 2.318010091781616, "epoch": 0.833961716937355, "grad_norm": 0.027341825887560844, "grad_norm_var": 2.5367638779721926e-07, "learning_rate": 0.0006917296793812766, "loss": 2.3934, "step": 23004 }, { "crossentropy": 2.4427106380462646, "epoch": 0.833997969837587, "grad_norm": 0.027267878875136375, "grad_norm_var": 2.674408262751116e-07, "learning_rate": 0.0006914348087349087, "loss": 2.5435, "step": 23005 }, { "crossentropy": 2.455984592437744, "epoch": 0.834034222737819, "grad_norm": 0.027648616582155228, "grad_norm_var": 2.939787498405875e-07, "learning_rate": 0.0006911399962822518, "loss": 2.5127, "step": 23006 }, { "crossentropy": 2.467982053756714, "epoch": 0.834070475638051, "grad_norm": 0.027123020961880684, "grad_norm_var": 2.878438426223128e-07, "learning_rate": 0.0006908452420272887, "loss": 2.3313, "step": 23007 }, { "crossentropy": 2.4401023387908936, "epoch": 0.834106728538283, "grad_norm": 0.028951680287718773, "grad_norm_var": 5.469981152999546e-07, "learning_rate": 0.0006905505459740002, "loss": 2.4477, "step": 23008 }, { "crossentropy": 2.4746084213256836, "epoch": 0.8341429814385151, "grad_norm": 0.026984553784132004, "grad_norm_var": 5.146728082331403e-07, "learning_rate": 0.0006902559081263649, "loss": 2.4428, "step": 23009 }, { "crossentropy": 2.535325288772583, "epoch": 0.8341792343387471, "grad_norm": 0.027444979175925255, "grad_norm_var": 4.2765658880189865e-07, "learning_rate": 0.0006899613284883654, "loss": 2.4544, "step": 23010 }, { "crossentropy": 2.508477210998535, "epoch": 0.8342154872389791, "grad_norm": 0.025791337713599205, "grad_norm_var": 5.355906636677338e-07, "learning_rate": 0.0006896668070639777, "loss": 2.513, "step": 23011 }, { "crossentropy": 2.482635736465454, "epoch": 0.8342517401392111, "grad_norm": 0.026482176035642624, "grad_norm_var": 5.547137787872825e-07, "learning_rate": 0.0006893723438571808, "loss": 2.4452, "step": 23012 }, { "crossentropy": 2.542001962661743, "epoch": 0.8342879930394431, "grad_norm": 0.02689676731824875, "grad_norm_var": 5.088737870837518e-07, "learning_rate": 0.0006890779388719531, "loss": 2.5119, "step": 23013 }, { "crossentropy": 2.4592466354370117, "epoch": 0.8343242459396751, "grad_norm": 0.026160232722759247, "grad_norm_var": 5.463617319842925e-07, "learning_rate": 0.0006887835921122687, "loss": 2.4335, "step": 23014 }, { "crossentropy": 2.410446882247925, "epoch": 0.8343604988399071, "grad_norm": 0.02697698399424553, "grad_norm_var": 5.408683679671474e-07, "learning_rate": 0.000688489303582106, "loss": 2.412, "step": 23015 }, { "crossentropy": 2.471611976623535, "epoch": 0.8343967517401392, "grad_norm": 0.026341093704104424, "grad_norm_var": 5.641397827112059e-07, "learning_rate": 0.0006881950732854375, "loss": 2.5023, "step": 23016 }, { "crossentropy": 2.3828203678131104, "epoch": 0.8344330046403712, "grad_norm": 0.026873577386140823, "grad_norm_var": 5.648298859063534e-07, "learning_rate": 0.0006879009012262372, "loss": 2.4344, "step": 23017 }, { "crossentropy": 2.472836494445801, "epoch": 0.8344692575406032, "grad_norm": 0.025541206821799278, "grad_norm_var": 6.851216630099701e-07, "learning_rate": 0.0006876067874084801, "loss": 2.4905, "step": 23018 }, { "crossentropy": 2.4812076091766357, "epoch": 0.8345055104408353, "grad_norm": 0.026834510266780853, "grad_norm_var": 6.380398768842295e-07, "learning_rate": 0.0006873127318361366, "loss": 2.4275, "step": 23019 }, { "crossentropy": 2.362758159637451, "epoch": 0.8345417633410673, "grad_norm": 0.026385707780718803, "grad_norm_var": 6.409250677926377e-07, "learning_rate": 0.0006870187345131795, "loss": 2.4019, "step": 23020 }, { "crossentropy": 2.422346353530884, "epoch": 0.8345780162412993, "grad_norm": 0.026528816670179367, "grad_norm_var": 6.345274247535609e-07, "learning_rate": 0.0006867247954435807, "loss": 2.3924, "step": 23021 }, { "crossentropy": 2.485647678375244, "epoch": 0.8346142691415314, "grad_norm": 0.0277185570448637, "grad_norm_var": 6.426505185027261e-07, "learning_rate": 0.000686430914631308, "loss": 2.4697, "step": 23022 }, { "crossentropy": 2.483511447906494, "epoch": 0.8346505220417634, "grad_norm": 0.026104314252734184, "grad_norm_var": 6.656322582976673e-07, "learning_rate": 0.0006861370920803328, "loss": 2.404, "step": 23023 }, { "crossentropy": 2.5763094425201416, "epoch": 0.8346867749419954, "grad_norm": 0.027410101145505905, "grad_norm_var": 3.618313801606454e-07, "learning_rate": 0.0006858433277946218, "loss": 2.4992, "step": 23024 }, { "crossentropy": 2.355377435684204, "epoch": 0.8347230278422274, "grad_norm": 0.025220008566975594, "grad_norm_var": 4.788228513933253e-07, "learning_rate": 0.0006855496217781443, "loss": 2.3323, "step": 23025 }, { "crossentropy": 2.5037178993225098, "epoch": 0.8347592807424594, "grad_norm": 0.027835644781589508, "grad_norm_var": 5.352717142994284e-07, "learning_rate": 0.0006852559740348674, "loss": 2.3826, "step": 23026 }, { "crossentropy": 2.4258508682250977, "epoch": 0.8347955336426914, "grad_norm": 0.02772679552435875, "grad_norm_var": 5.687599858505051e-07, "learning_rate": 0.0006849623845687547, "loss": 2.467, "step": 23027 }, { "crossentropy": 2.654926061630249, "epoch": 0.8348317865429234, "grad_norm": 0.02919137105345726, "grad_norm_var": 9.525013220643142e-07, "learning_rate": 0.0006846688533837731, "loss": 2.5943, "step": 23028 }, { "crossentropy": 2.4067773818969727, "epoch": 0.8348680394431555, "grad_norm": 0.02590855397284031, "grad_norm_var": 1.0085742933984977e-06, "learning_rate": 0.0006843753804838887, "loss": 2.4757, "step": 23029 }, { "crossentropy": 2.413944959640503, "epoch": 0.8349042923433875, "grad_norm": 0.027234967797994614, "grad_norm_var": 9.894687363790615e-07, "learning_rate": 0.000684081965873063, "loss": 2.5586, "step": 23030 }, { "crossentropy": 2.358231782913208, "epoch": 0.8349405452436195, "grad_norm": 0.027180563658475876, "grad_norm_var": 9.951119318025165e-07, "learning_rate": 0.0006837886095552615, "loss": 2.4296, "step": 23031 }, { "crossentropy": 2.4531075954437256, "epoch": 0.8349767981438515, "grad_norm": 0.02622484229505062, "grad_norm_var": 1.0042669014754946e-06, "learning_rate": 0.0006834953115344433, "loss": 2.4545, "step": 23032 }, { "crossentropy": 2.322166681289673, "epoch": 0.8350130510440835, "grad_norm": 0.026149731129407883, "grad_norm_var": 1.0366659448529282e-06, "learning_rate": 0.0006832020718145726, "loss": 2.4111, "step": 23033 }, { "crossentropy": 2.3632829189300537, "epoch": 0.8350493039443155, "grad_norm": 0.026371756568551064, "grad_norm_var": 9.376418482192796e-07, "learning_rate": 0.0006829088903996089, "loss": 2.4091, "step": 23034 }, { "crossentropy": 2.37611985206604, "epoch": 0.8350855568445475, "grad_norm": 0.026601511985063553, "grad_norm_var": 9.423436862735958e-07, "learning_rate": 0.0006826157672935108, "loss": 2.399, "step": 23035 }, { "crossentropy": 2.387355327606201, "epoch": 0.8351218097447796, "grad_norm": 0.02581178769469261, "grad_norm_var": 9.993833097678034e-07, "learning_rate": 0.0006823227025002387, "loss": 2.3271, "step": 23036 }, { "crossentropy": 2.3272178173065186, "epoch": 0.8351580626450116, "grad_norm": 0.025550859048962593, "grad_norm_var": 1.0979364966279357e-06, "learning_rate": 0.0006820296960237521, "loss": 2.394, "step": 23037 }, { "crossentropy": 2.367053985595703, "epoch": 0.8351943155452436, "grad_norm": 0.027069391682744026, "grad_norm_var": 1.0417468744097625e-06, "learning_rate": 0.0006817367478680059, "loss": 2.329, "step": 23038 }, { "crossentropy": 2.57985258102417, "epoch": 0.8352305684454756, "grad_norm": 0.028044255450367928, "grad_norm_var": 1.1165378455822572e-06, "learning_rate": 0.0006814438580369597, "loss": 2.4911, "step": 23039 }, { "crossentropy": 2.5037624835968018, "epoch": 0.8352668213457076, "grad_norm": 0.02649890072643757, "grad_norm_var": 1.0998668808999411e-06, "learning_rate": 0.0006811510265345661, "loss": 2.473, "step": 23040 }, { "crossentropy": 2.4329521656036377, "epoch": 0.8353030742459396, "grad_norm": 0.02681540884077549, "grad_norm_var": 9.252330000458775e-07, "learning_rate": 0.0006808582533647834, "loss": 2.3651, "step": 23041 }, { "crossentropy": 2.3813815116882324, "epoch": 0.8353393271461717, "grad_norm": 0.027030397206544876, "grad_norm_var": 8.640703031733675e-07, "learning_rate": 0.0006805655385315635, "loss": 2.4502, "step": 23042 }, { "crossentropy": 2.3685357570648193, "epoch": 0.8353755800464037, "grad_norm": 0.026850363239645958, "grad_norm_var": 8.082387064101224e-07, "learning_rate": 0.0006802728820388621, "loss": 2.4804, "step": 23043 }, { "crossentropy": 2.522847890853882, "epoch": 0.8354118329466357, "grad_norm": 0.025886597111821175, "grad_norm_var": 4.2980160820839616e-07, "learning_rate": 0.0006799802838906294, "loss": 2.4839, "step": 23044 }, { "crossentropy": 2.4356191158294678, "epoch": 0.8354480858468677, "grad_norm": 0.02729838714003563, "grad_norm_var": 4.266828678669976e-07, "learning_rate": 0.00067968774409082, "loss": 2.4392, "step": 23045 }, { "crossentropy": 2.3826379776000977, "epoch": 0.8354843387470998, "grad_norm": 0.026496151462197304, "grad_norm_var": 4.045267469022021e-07, "learning_rate": 0.000679395262643383, "loss": 2.4876, "step": 23046 }, { "crossentropy": 2.4830873012542725, "epoch": 0.8355205916473318, "grad_norm": 0.02606363222002983, "grad_norm_var": 3.986523622882705e-07, "learning_rate": 0.0006791028395522708, "loss": 2.4784, "step": 23047 }, { "crossentropy": 2.471853733062744, "epoch": 0.8355568445475638, "grad_norm": 0.02596351131796837, "grad_norm_var": 4.1417211084890784e-07, "learning_rate": 0.0006788104748214307, "loss": 2.4535, "step": 23048 }, { "crossentropy": 2.3496487140655518, "epoch": 0.8355930974477959, "grad_norm": 0.02782527357339859, "grad_norm_var": 5.043668897861438e-07, "learning_rate": 0.0006785181684548131, "loss": 2.439, "step": 23049 }, { "crossentropy": 2.452927350997925, "epoch": 0.8356293503480279, "grad_norm": 0.02629309706389904, "grad_norm_var": 5.07526397232324e-07, "learning_rate": 0.0006782259204563668, "loss": 2.4558, "step": 23050 }, { "crossentropy": 2.5481295585632324, "epoch": 0.8356656032482599, "grad_norm": 0.02630010060966015, "grad_norm_var": 5.143983739403293e-07, "learning_rate": 0.0006779337308300376, "loss": 2.4754, "step": 23051 }, { "crossentropy": 2.5227906703948975, "epoch": 0.8357018561484919, "grad_norm": 0.026065558195114136, "grad_norm_var": 4.9133437323573e-07, "learning_rate": 0.0006776415995797713, "loss": 2.4971, "step": 23052 }, { "crossentropy": 2.562593698501587, "epoch": 0.8357381090487239, "grad_norm": 0.025585055351257324, "grad_norm_var": 4.864951211783064e-07, "learning_rate": 0.0006773495267095159, "loss": 2.4952, "step": 23053 }, { "crossentropy": 2.53515887260437, "epoch": 0.8357743619489559, "grad_norm": 0.027364950627088547, "grad_norm_var": 5.092553219424497e-07, "learning_rate": 0.0006770575122232136, "loss": 2.5805, "step": 23054 }, { "crossentropy": 2.496384620666504, "epoch": 0.835810614849188, "grad_norm": 0.027155809104442596, "grad_norm_var": 3.932901261540263e-07, "learning_rate": 0.0006767655561248103, "loss": 2.4314, "step": 23055 }, { "crossentropy": 2.3087270259857178, "epoch": 0.83584686774942, "grad_norm": 0.026212599128484726, "grad_norm_var": 4.0201765853809893e-07, "learning_rate": 0.0006764736584182502, "loss": 2.3731, "step": 23056 }, { "crossentropy": 2.4126319885253906, "epoch": 0.835883120649652, "grad_norm": 0.025684598833322525, "grad_norm_var": 4.4575574802810786e-07, "learning_rate": 0.0006761818191074726, "loss": 2.3466, "step": 23057 }, { "crossentropy": 2.363974094390869, "epoch": 0.835919373549884, "grad_norm": 0.027908416464924812, "grad_norm_var": 5.554746263870351e-07, "learning_rate": 0.0006758900381964228, "loss": 2.3431, "step": 23058 }, { "crossentropy": 2.477595806121826, "epoch": 0.835955626450116, "grad_norm": 0.02683490328490734, "grad_norm_var": 5.548902709237117e-07, "learning_rate": 0.0006755983156890399, "loss": 2.4223, "step": 23059 }, { "crossentropy": 2.3286428451538086, "epoch": 0.835991879350348, "grad_norm": 0.025761539116501808, "grad_norm_var": 5.670740697373093e-07, "learning_rate": 0.0006753066515892636, "loss": 2.456, "step": 23060 }, { "crossentropy": 2.3554773330688477, "epoch": 0.83602813225058, "grad_norm": 0.026395760476589203, "grad_norm_var": 5.280286158213312e-07, "learning_rate": 0.0006750150459010345, "loss": 2.4902, "step": 23061 }, { "crossentropy": 2.5541908740997314, "epoch": 0.836064385150812, "grad_norm": 0.026243317872285843, "grad_norm_var": 5.319660467470079e-07, "learning_rate": 0.0006747234986282896, "loss": 2.5449, "step": 23062 }, { "crossentropy": 2.5252938270568848, "epoch": 0.8361006380510441, "grad_norm": 0.027752289548516273, "grad_norm_var": 6.167496225913e-07, "learning_rate": 0.0006744320097749679, "loss": 2.4862, "step": 23063 }, { "crossentropy": 2.5313737392425537, "epoch": 0.8361368909512761, "grad_norm": 0.026774190366268158, "grad_norm_var": 5.907369139158495e-07, "learning_rate": 0.0006741405793450072, "loss": 2.4845, "step": 23064 }, { "crossentropy": 2.4610185623168945, "epoch": 0.8361731438515081, "grad_norm": 0.02633974887430668, "grad_norm_var": 4.928720036119812e-07, "learning_rate": 0.0006738492073423419, "loss": 2.4311, "step": 23065 }, { "crossentropy": 2.4188404083251953, "epoch": 0.8362093967517401, "grad_norm": 0.02586551383137703, "grad_norm_var": 5.184887151068555e-07, "learning_rate": 0.0006735578937709091, "loss": 2.3871, "step": 23066 }, { "crossentropy": 2.465561866760254, "epoch": 0.8362456496519721, "grad_norm": 0.026612484827637672, "grad_norm_var": 5.156255567873372e-07, "learning_rate": 0.0006732666386346425, "loss": 2.3936, "step": 23067 }, { "crossentropy": 2.4913980960845947, "epoch": 0.8362819025522041, "grad_norm": 0.028575902804732323, "grad_norm_var": 7.524301249560525e-07, "learning_rate": 0.0006729754419374745, "loss": 2.5028, "step": 23068 }, { "crossentropy": 2.4607415199279785, "epoch": 0.8363181554524362, "grad_norm": 0.027113815769553185, "grad_norm_var": 6.729283184042644e-07, "learning_rate": 0.0006726843036833413, "loss": 2.4392, "step": 23069 }, { "crossentropy": 2.3830974102020264, "epoch": 0.8363544083526682, "grad_norm": 0.02577998861670494, "grad_norm_var": 7.078482803402064e-07, "learning_rate": 0.0006723932238761721, "loss": 2.379, "step": 23070 }, { "crossentropy": 2.5760436058044434, "epoch": 0.8363906612529002, "grad_norm": 0.02706265076994896, "grad_norm_var": 7.025822110701477e-07, "learning_rate": 0.0006721022025199003, "loss": 2.437, "step": 23071 }, { "crossentropy": 2.221071481704712, "epoch": 0.8364269141531323, "grad_norm": 0.025929376482963562, "grad_norm_var": 7.253351486350331e-07, "learning_rate": 0.0006718112396184567, "loss": 2.2419, "step": 23072 }, { "crossentropy": 2.596622943878174, "epoch": 0.8364631670533643, "grad_norm": 0.027748964726924896, "grad_norm_var": 7.21925979012677e-07, "learning_rate": 0.0006715203351757693, "loss": 2.5154, "step": 23073 }, { "crossentropy": 2.452331781387329, "epoch": 0.8364994199535963, "grad_norm": 0.02580016851425171, "grad_norm_var": 6.863679181481058e-07, "learning_rate": 0.0006712294891957699, "loss": 2.4372, "step": 23074 }, { "crossentropy": 2.6355295181274414, "epoch": 0.8365356728538283, "grad_norm": 0.025748586282134056, "grad_norm_var": 7.350669738802184e-07, "learning_rate": 0.0006709387016823853, "loss": 2.5686, "step": 23075 }, { "crossentropy": 2.6410088539123535, "epoch": 0.8365719257540604, "grad_norm": 0.02835254929959774, "grad_norm_var": 8.670552642561172e-07, "learning_rate": 0.0006706479726395421, "loss": 2.4727, "step": 23076 }, { "crossentropy": 2.365858554840088, "epoch": 0.8366081786542924, "grad_norm": 0.026381617411971092, "grad_norm_var": 8.677470032464676e-07, "learning_rate": 0.0006703573020711695, "loss": 2.4268, "step": 23077 }, { "crossentropy": 2.280794858932495, "epoch": 0.8366444315545244, "grad_norm": 0.026898343116044998, "grad_norm_var": 8.498681399038769e-07, "learning_rate": 0.0006700666899811902, "loss": 2.3167, "step": 23078 }, { "crossentropy": 2.4726574420928955, "epoch": 0.8366806844547564, "grad_norm": 0.026932502165436745, "grad_norm_var": 7.87345441790171e-07, "learning_rate": 0.0006697761363735317, "loss": 2.4344, "step": 23079 }, { "crossentropy": 2.356419324874878, "epoch": 0.8367169373549884, "grad_norm": 0.026515373960137367, "grad_norm_var": 7.90516980285931e-07, "learning_rate": 0.0006694856412521188, "loss": 2.4053, "step": 23080 }, { "crossentropy": 2.3973400592803955, "epoch": 0.8367531902552204, "grad_norm": 0.0267658531665802, "grad_norm_var": 7.79772677917713e-07, "learning_rate": 0.0006691952046208727, "loss": 2.3758, "step": 23081 }, { "crossentropy": 2.316729784011841, "epoch": 0.8367894431554525, "grad_norm": 0.02741030789911747, "grad_norm_var": 7.456647199561752e-07, "learning_rate": 0.0006689048264837188, "loss": 2.3471, "step": 23082 }, { "crossentropy": 2.3567516803741455, "epoch": 0.8368256960556845, "grad_norm": 0.026191284880042076, "grad_norm_var": 7.701916428111497e-07, "learning_rate": 0.0006686145068445781, "loss": 2.4459, "step": 23083 }, { "crossentropy": 2.372722864151001, "epoch": 0.8368619489559165, "grad_norm": 0.02686495892703533, "grad_norm_var": 5.538273920244515e-07, "learning_rate": 0.0006683242457073707, "loss": 2.3693, "step": 23084 }, { "crossentropy": 2.4898877143859863, "epoch": 0.8368982018561485, "grad_norm": 0.02706036902964115, "grad_norm_var": 5.51188967085507e-07, "learning_rate": 0.0006680340430760184, "loss": 2.5402, "step": 23085 }, { "crossentropy": 2.529085636138916, "epoch": 0.8369344547563805, "grad_norm": 0.02736271545290947, "grad_norm_var": 5.103991142671778e-07, "learning_rate": 0.0006677438989544399, "loss": 2.4198, "step": 23086 }, { "crossentropy": 2.345716953277588, "epoch": 0.8369707076566125, "grad_norm": 0.02739277295768261, "grad_norm_var": 5.281506278805835e-07, "learning_rate": 0.0006674538133465541, "loss": 2.4153, "step": 23087 }, { "crossentropy": 2.4813168048858643, "epoch": 0.8370069605568445, "grad_norm": 0.02610187977552414, "grad_norm_var": 5.091868437593142e-07, "learning_rate": 0.0006671637862562802, "loss": 2.4541, "step": 23088 }, { "crossentropy": 2.250441551208496, "epoch": 0.8370432134570766, "grad_norm": 0.026917869225144386, "grad_norm_var": 4.5224314558638064e-07, "learning_rate": 0.0006668738176875339, "loss": 2.3683, "step": 23089 }, { "crossentropy": 2.4282515048980713, "epoch": 0.8370794663573086, "grad_norm": 0.026539448648691177, "grad_norm_var": 3.884811283540497e-07, "learning_rate": 0.0006665839076442337, "loss": 2.5252, "step": 23090 }, { "crossentropy": 2.421973705291748, "epoch": 0.8371157192575406, "grad_norm": 0.02685331366956234, "grad_norm_var": 3.040284362762809e-07, "learning_rate": 0.0006662940561302927, "loss": 2.4263, "step": 23091 }, { "crossentropy": 2.5415055751800537, "epoch": 0.8371519721577726, "grad_norm": 0.02619944140315056, "grad_norm_var": 1.793039200210141e-07, "learning_rate": 0.0006660042631496283, "loss": 2.4599, "step": 23092 }, { "crossentropy": 2.3473877906799316, "epoch": 0.8371882250580046, "grad_norm": 0.026569969952106476, "grad_norm_var": 1.716606853118524e-07, "learning_rate": 0.0006657145287061534, "loss": 2.4013, "step": 23093 }, { "crossentropy": 2.505857467651367, "epoch": 0.8372244779582366, "grad_norm": 0.02704373002052307, "grad_norm_var": 1.7515904244916279e-07, "learning_rate": 0.0006654248528037804, "loss": 2.4329, "step": 23094 }, { "crossentropy": 2.4530625343322754, "epoch": 0.8372607308584686, "grad_norm": 0.026026364415884018, "grad_norm_var": 2.0987763185130923e-07, "learning_rate": 0.0006651352354464224, "loss": 2.4606, "step": 23095 }, { "crossentropy": 2.4091227054595947, "epoch": 0.8372969837587007, "grad_norm": 0.027127772569656372, "grad_norm_var": 2.150999607632009e-07, "learning_rate": 0.0006648456766379929, "loss": 2.3884, "step": 23096 }, { "crossentropy": 2.4129507541656494, "epoch": 0.8373332366589327, "grad_norm": 0.027389755472540855, "grad_norm_var": 2.3852159630229305e-07, "learning_rate": 0.0006645561763823999, "loss": 2.465, "step": 23097 }, { "crossentropy": 2.468259572982788, "epoch": 0.8373694895591647, "grad_norm": 0.027266668155789375, "grad_norm_var": 2.284241125536171e-07, "learning_rate": 0.0006642667346835562, "loss": 2.5341, "step": 23098 }, { "crossentropy": 2.4065754413604736, "epoch": 0.8374057424593968, "grad_norm": 0.027058303356170654, "grad_norm_var": 2.0425512065520326e-07, "learning_rate": 0.0006639773515453689, "loss": 2.5094, "step": 23099 }, { "crossentropy": 2.402263641357422, "epoch": 0.8374419953596288, "grad_norm": 0.026909204199910164, "grad_norm_var": 2.0440107450837333e-07, "learning_rate": 0.0006636880269717488, "loss": 2.4085, "step": 23100 }, { "crossentropy": 2.514862537384033, "epoch": 0.8374782482598608, "grad_norm": 0.027427632361650467, "grad_norm_var": 2.2246064030288086e-07, "learning_rate": 0.0006633987609666026, "loss": 2.4506, "step": 23101 }, { "crossentropy": 2.3966472148895264, "epoch": 0.8375145011600929, "grad_norm": 0.025671688839793205, "grad_norm_var": 2.9385147163178175e-07, "learning_rate": 0.0006631095535338355, "loss": 2.4534, "step": 23102 }, { "crossentropy": 2.548863172531128, "epoch": 0.8375507540603249, "grad_norm": 0.025875605642795563, "grad_norm_var": 3.1395649956420896e-07, "learning_rate": 0.0006628204046773556, "loss": 2.511, "step": 23103 }, { "crossentropy": 2.4279332160949707, "epoch": 0.8375870069605569, "grad_norm": 0.026710299775004387, "grad_norm_var": 2.8969361600138647e-07, "learning_rate": 0.0006625313144010697, "loss": 2.4962, "step": 23104 }, { "crossentropy": 2.4244937896728516, "epoch": 0.8376232598607889, "grad_norm": 0.02694406546652317, "grad_norm_var": 2.904129891749165e-07, "learning_rate": 0.0006622422827088792, "loss": 2.4312, "step": 23105 }, { "crossentropy": 2.498528003692627, "epoch": 0.8376595127610209, "grad_norm": 0.02615095116198063, "grad_norm_var": 3.0950057232677586e-07, "learning_rate": 0.0006619533096046904, "loss": 2.478, "step": 23106 }, { "crossentropy": 2.286362409591675, "epoch": 0.8376957656612529, "grad_norm": 0.02724463865160942, "grad_norm_var": 3.26990156326159e-07, "learning_rate": 0.0006616643950924045, "loss": 2.3576, "step": 23107 }, { "crossentropy": 2.549616813659668, "epoch": 0.8377320185614849, "grad_norm": 0.02694297954440117, "grad_norm_var": 3.093404634721726e-07, "learning_rate": 0.000661375539175924, "loss": 2.5786, "step": 23108 }, { "crossentropy": 2.5642004013061523, "epoch": 0.837768271461717, "grad_norm": 0.055464256554841995, "grad_norm_var": 5.17091555464232e-05, "learning_rate": 0.0006610867418591537, "loss": 2.4306, "step": 23109 }, { "crossentropy": 2.486720323562622, "epoch": 0.837804524361949, "grad_norm": 0.027400532737374306, "grad_norm_var": 5.164410382371767e-05, "learning_rate": 0.0006607980031459892, "loss": 2.4434, "step": 23110 }, { "crossentropy": 2.304161310195923, "epoch": 0.837840777262181, "grad_norm": 0.02589709684252739, "grad_norm_var": 5.1689518102259147e-05, "learning_rate": 0.0006605093230403331, "loss": 2.4558, "step": 23111 }, { "crossentropy": 2.481642246246338, "epoch": 0.837877030162413, "grad_norm": 0.02721712924540043, "grad_norm_var": 5.167256497013623e-05, "learning_rate": 0.0006602207015460848, "loss": 2.4787, "step": 23112 }, { "crossentropy": 2.298051118850708, "epoch": 0.837913283062645, "grad_norm": 0.027170494198799133, "grad_norm_var": 5.1710897651211356e-05, "learning_rate": 0.0006599321386671403, "loss": 2.4108, "step": 23113 }, { "crossentropy": 2.491818904876709, "epoch": 0.837949535962877, "grad_norm": 0.028568444773554802, "grad_norm_var": 5.158808010849887e-05, "learning_rate": 0.0006596436344074003, "loss": 2.5227, "step": 23114 }, { "crossentropy": 2.415360450744629, "epoch": 0.837985788863109, "grad_norm": 0.025699971243739128, "grad_norm_var": 5.199453790712207e-05, "learning_rate": 0.0006593551887707577, "loss": 2.4964, "step": 23115 }, { "crossentropy": 2.53499698638916, "epoch": 0.838022041763341, "grad_norm": 0.02760859951376915, "grad_norm_var": 5.186921641284666e-05, "learning_rate": 0.0006590668017611112, "loss": 2.5035, "step": 23116 }, { "crossentropy": 2.433838129043579, "epoch": 0.8380582946635731, "grad_norm": 0.02688806876540184, "grad_norm_var": 5.1973527525933416e-05, "learning_rate": 0.0006587784733823571, "loss": 2.5019, "step": 23117 }, { "crossentropy": 2.572148323059082, "epoch": 0.8380945475638051, "grad_norm": 0.026333481073379517, "grad_norm_var": 5.174331008350751e-05, "learning_rate": 0.0006584902036383855, "loss": 2.5376, "step": 23118 }, { "crossentropy": 2.3627758026123047, "epoch": 0.8381308004640371, "grad_norm": 0.027910051867365837, "grad_norm_var": 5.125421940810615e-05, "learning_rate": 0.0006582019925330924, "loss": 2.4387, "step": 23119 }, { "crossentropy": 2.448298454284668, "epoch": 0.8381670533642691, "grad_norm": 0.027385393157601357, "grad_norm_var": 5.1098255595706923e-05, "learning_rate": 0.0006579138400703715, "loss": 2.5276, "step": 23120 }, { "crossentropy": 2.4792957305908203, "epoch": 0.8382033062645011, "grad_norm": 0.027657289057970047, "grad_norm_var": 5.095340029796535e-05, "learning_rate": 0.0006576257462541119, "loss": 2.5924, "step": 23121 }, { "crossentropy": 2.5911495685577393, "epoch": 0.8382395591647331, "grad_norm": 0.026752008125185966, "grad_norm_var": 5.075997900228743e-05, "learning_rate": 0.0006573377110882079, "loss": 2.5156, "step": 23122 }, { "crossentropy": 2.5592360496520996, "epoch": 0.8382758120649652, "grad_norm": 0.027310745790600777, "grad_norm_var": 5.074580430217135e-05, "learning_rate": 0.0006570497345765475, "loss": 2.473, "step": 23123 }, { "crossentropy": 2.5056021213531494, "epoch": 0.8383120649651972, "grad_norm": 0.027138564735651016, "grad_norm_var": 5.069747523263034e-05, "learning_rate": 0.0006567618167230204, "loss": 2.4821, "step": 23124 }, { "crossentropy": 2.508204221725464, "epoch": 0.8383483178654292, "grad_norm": 0.02610982581973076, "grad_norm_var": 5.82674567588722e-07, "learning_rate": 0.0006564739575315187, "loss": 2.4656, "step": 23125 }, { "crossentropy": 2.5790653228759766, "epoch": 0.8383845707656613, "grad_norm": 0.02896665409207344, "grad_norm_var": 8.059347854292755e-07, "learning_rate": 0.0006561861570059257, "loss": 2.5686, "step": 23126 }, { "crossentropy": 2.6220345497131348, "epoch": 0.8384208236658933, "grad_norm": 0.026334546506404877, "grad_norm_var": 7.440378585693661e-07, "learning_rate": 0.0006558984151501301, "loss": 2.4476, "step": 23127 }, { "crossentropy": 2.3709983825683594, "epoch": 0.8384570765661253, "grad_norm": 0.026341818273067474, "grad_norm_var": 7.888394276798009e-07, "learning_rate": 0.0006556107319680204, "loss": 2.4325, "step": 23128 }, { "crossentropy": 2.365166187286377, "epoch": 0.8384933294663574, "grad_norm": 0.025795822963118553, "grad_norm_var": 9.006240735269597e-07, "learning_rate": 0.0006553231074634792, "loss": 2.336, "step": 23129 }, { "crossentropy": 2.5010316371917725, "epoch": 0.8385295823665894, "grad_norm": 0.026379352435469627, "grad_norm_var": 7.569532486586025e-07, "learning_rate": 0.0006550355416403936, "loss": 2.4376, "step": 23130 }, { "crossentropy": 2.315459966659546, "epoch": 0.8385658352668214, "grad_norm": 0.02720045857131481, "grad_norm_var": 6.549326324017801e-07, "learning_rate": 0.0006547480345026463, "loss": 2.3469, "step": 23131 }, { "crossentropy": 2.303957223892212, "epoch": 0.8386020881670534, "grad_norm": 0.025852957740426064, "grad_norm_var": 7.067592042932687e-07, "learning_rate": 0.0006544605860541203, "loss": 2.3935, "step": 23132 }, { "crossentropy": 2.3504090309143066, "epoch": 0.8386383410672854, "grad_norm": 0.026160316541790962, "grad_norm_var": 7.407578499664478e-07, "learning_rate": 0.0006541731962987002, "loss": 2.3453, "step": 23133 }, { "crossentropy": 2.515310525894165, "epoch": 0.8386745939675174, "grad_norm": 0.026001451537013054, "grad_norm_var": 7.705957149642742e-07, "learning_rate": 0.0006538858652402663, "loss": 2.51, "step": 23134 }, { "crossentropy": 2.520800828933716, "epoch": 0.8387108468677494, "grad_norm": 0.026748554781079292, "grad_norm_var": 6.878163546629456e-07, "learning_rate": 0.0006535985928826976, "loss": 2.4419, "step": 23135 }, { "crossentropy": 2.3120675086975098, "epoch": 0.8387470997679815, "grad_norm": 0.027149200439453125, "grad_norm_var": 6.71560223519417e-07, "learning_rate": 0.0006533113792298773, "loss": 2.4048, "step": 23136 }, { "crossentropy": 2.565836191177368, "epoch": 0.8387833526682135, "grad_norm": 0.027911454439163208, "grad_norm_var": 7.06557309379851e-07, "learning_rate": 0.0006530242242856815, "loss": 2.5503, "step": 23137 }, { "crossentropy": 2.4213359355926514, "epoch": 0.8388196055684455, "grad_norm": 0.02591104805469513, "grad_norm_var": 7.516103668035619e-07, "learning_rate": 0.0006527371280539918, "loss": 2.4562, "step": 23138 }, { "crossentropy": 2.541123867034912, "epoch": 0.8388558584686775, "grad_norm": 0.02610846608877182, "grad_norm_var": 7.451775475693685e-07, "learning_rate": 0.0006524500905386838, "loss": 2.41, "step": 23139 }, { "crossentropy": 2.6224770545959473, "epoch": 0.8388921113689095, "grad_norm": 0.028403926640748978, "grad_norm_var": 9.307297643273375e-07, "learning_rate": 0.0006521631117436344, "loss": 2.538, "step": 23140 }, { "crossentropy": 2.4486513137817383, "epoch": 0.8389283642691415, "grad_norm": 0.02736690454185009, "grad_norm_var": 9.287336134833693e-07, "learning_rate": 0.0006518761916727211, "loss": 2.4007, "step": 23141 }, { "crossentropy": 2.3633878231048584, "epoch": 0.8389646171693735, "grad_norm": 0.025893723592162132, "grad_norm_var": 6.2690645981366e-07, "learning_rate": 0.0006515893303298187, "loss": 2.4079, "step": 23142 }, { "crossentropy": 2.5075762271881104, "epoch": 0.8390008700696056, "grad_norm": 0.02724541537463665, "grad_norm_var": 6.468260800598346e-07, "learning_rate": 0.0006513025277188001, "loss": 2.5036, "step": 23143 }, { "crossentropy": 2.487330198287964, "epoch": 0.8390371229698376, "grad_norm": 0.026302337646484375, "grad_norm_var": 6.485691116723975e-07, "learning_rate": 0.0006510157838435415, "loss": 2.45, "step": 23144 }, { "crossentropy": 2.475231885910034, "epoch": 0.8390733758700696, "grad_norm": 0.026647645980119705, "grad_norm_var": 5.96682077634566e-07, "learning_rate": 0.0006507290987079134, "loss": 2.5559, "step": 23145 }, { "crossentropy": 2.374626398086548, "epoch": 0.8391096287703016, "grad_norm": 0.026757074519991875, "grad_norm_var": 5.891885127711488e-07, "learning_rate": 0.0006504424723157903, "loss": 2.4955, "step": 23146 }, { "crossentropy": 2.483375310897827, "epoch": 0.8391458816705336, "grad_norm": 0.026117002591490746, "grad_norm_var": 5.944208756026992e-07, "learning_rate": 0.0006501559046710414, "loss": 2.4858, "step": 23147 }, { "crossentropy": 2.4466748237609863, "epoch": 0.8391821345707656, "grad_norm": 0.026631170883774757, "grad_norm_var": 5.48418374459814e-07, "learning_rate": 0.0006498693957775376, "loss": 2.4482, "step": 23148 }, { "crossentropy": 2.353640556335449, "epoch": 0.8392183874709976, "grad_norm": 0.02584301494061947, "grad_norm_var": 5.779548990256783e-07, "learning_rate": 0.0006495829456391505, "loss": 2.4175, "step": 23149 }, { "crossentropy": 2.485400438308716, "epoch": 0.8392546403712297, "grad_norm": 0.026168683543801308, "grad_norm_var": 5.643520700364246e-07, "learning_rate": 0.0006492965542597468, "loss": 2.3775, "step": 23150 }, { "crossentropy": 2.4229116439819336, "epoch": 0.8392908932714617, "grad_norm": 0.026995057240128517, "grad_norm_var": 5.697340832864211e-07, "learning_rate": 0.0006490102216431964, "loss": 2.511, "step": 23151 }, { "crossentropy": 2.5653018951416016, "epoch": 0.8393271461716937, "grad_norm": 0.027485739439725876, "grad_norm_var": 5.962621149785957e-07, "learning_rate": 0.000648723947793366, "loss": 2.6203, "step": 23152 }, { "crossentropy": 2.458819627761841, "epoch": 0.8393633990719258, "grad_norm": 0.02553592249751091, "grad_norm_var": 5.768992343744972e-07, "learning_rate": 0.0006484377327141206, "loss": 2.4173, "step": 23153 }, { "crossentropy": 2.4620437622070312, "epoch": 0.8393996519721578, "grad_norm": 0.026892684400081635, "grad_norm_var": 5.484801086797491e-07, "learning_rate": 0.0006481515764093287, "loss": 2.4278, "step": 23154 }, { "crossentropy": 2.492964029312134, "epoch": 0.8394359048723898, "grad_norm": 0.026586830615997314, "grad_norm_var": 5.282629147289648e-07, "learning_rate": 0.0006478654788828525, "loss": 2.4723, "step": 23155 }, { "crossentropy": 2.5494229793548584, "epoch": 0.8394721577726219, "grad_norm": 0.028290051966905594, "grad_norm_var": 5.02891986850481e-07, "learning_rate": 0.0006475794401385582, "loss": 2.4805, "step": 23156 }, { "crossentropy": 2.568988800048828, "epoch": 0.8395084106728539, "grad_norm": 0.02766493335366249, "grad_norm_var": 5.360388232814502e-07, "learning_rate": 0.0006472934601803093, "loss": 2.5079, "step": 23157 }, { "crossentropy": 2.411989212036133, "epoch": 0.8395446635730859, "grad_norm": 0.026656027883291245, "grad_norm_var": 4.91314246748202e-07, "learning_rate": 0.0006470075390119667, "loss": 2.4356, "step": 23158 }, { "crossentropy": 2.2811005115509033, "epoch": 0.8395809164733179, "grad_norm": 0.027966145426034927, "grad_norm_var": 5.724716247596309e-07, "learning_rate": 0.000646721676637394, "loss": 2.5303, "step": 23159 }, { "crossentropy": 2.5924625396728516, "epoch": 0.8396171693735499, "grad_norm": 0.02573845349252224, "grad_norm_var": 6.285407489327408e-07, "learning_rate": 0.000646435873060452, "loss": 2.5727, "step": 23160 }, { "crossentropy": 2.514516592025757, "epoch": 0.8396534222737819, "grad_norm": 0.025808408856391907, "grad_norm_var": 6.838491436128667e-07, "learning_rate": 0.0006461501282849986, "loss": 2.492, "step": 23161 }, { "crossentropy": 2.5164597034454346, "epoch": 0.8396896751740139, "grad_norm": 0.026819363236427307, "grad_norm_var": 6.845982464247224e-07, "learning_rate": 0.0006458644423148963, "loss": 2.4116, "step": 23162 }, { "crossentropy": 2.499375581741333, "epoch": 0.839725928074246, "grad_norm": 0.028981275856494904, "grad_norm_var": 9.747157255642156e-07, "learning_rate": 0.000645578815154001, "loss": 2.4653, "step": 23163 }, { "crossentropy": 2.540743827819824, "epoch": 0.839762180974478, "grad_norm": 0.025263318791985512, "grad_norm_var": 1.1368508836772368e-06, "learning_rate": 0.0006452932468061723, "loss": 2.4452, "step": 23164 }, { "crossentropy": 2.4958324432373047, "epoch": 0.83979843387471, "grad_norm": 0.0269792340695858, "grad_norm_var": 1.0735442663907986e-06, "learning_rate": 0.0006450077372752677, "loss": 2.4784, "step": 23165 }, { "crossentropy": 2.5266177654266357, "epoch": 0.839834686774942, "grad_norm": 0.02802521362900734, "grad_norm_var": 1.1167206841435622e-06, "learning_rate": 0.0006447222865651414, "loss": 2.5039, "step": 23166 }, { "crossentropy": 2.5100905895233154, "epoch": 0.839870939675174, "grad_norm": 0.027066798880696297, "grad_norm_var": 1.1171812160882019e-06, "learning_rate": 0.0006444368946796509, "loss": 2.4955, "step": 23167 }, { "crossentropy": 2.440584897994995, "epoch": 0.839907192575406, "grad_norm": 0.030587125569581985, "grad_norm_var": 1.92539790867748e-06, "learning_rate": 0.0006441515616226501, "loss": 2.523, "step": 23168 }, { "crossentropy": 2.516840934753418, "epoch": 0.839943445475638, "grad_norm": 0.027730660513043404, "grad_norm_var": 1.745676419235657e-06, "learning_rate": 0.0006438662873979917, "loss": 2.4713, "step": 23169 }, { "crossentropy": 2.5138967037200928, "epoch": 0.83997969837587, "grad_norm": 0.026753393933176994, "grad_norm_var": 1.7547514887314054e-06, "learning_rate": 0.0006435810720095309, "loss": 2.4771, "step": 23170 }, { "crossentropy": 2.50142765045166, "epoch": 0.8400159512761021, "grad_norm": 0.02691241167485714, "grad_norm_var": 1.7300993367126589e-06, "learning_rate": 0.0006432959154611173, "loss": 2.4876, "step": 23171 }, { "crossentropy": 2.333933115005493, "epoch": 0.8400522041763341, "grad_norm": 0.026658615097403526, "grad_norm_var": 1.6871077420256882e-06, "learning_rate": 0.0006430108177566047, "loss": 2.3515, "step": 23172 }, { "crossentropy": 2.521636486053467, "epoch": 0.8400884570765661, "grad_norm": 0.026499612256884575, "grad_norm_var": 1.7037364354439978e-06, "learning_rate": 0.0006427257788998431, "loss": 2.4674, "step": 23173 }, { "crossentropy": 2.5599606037139893, "epoch": 0.8401247099767981, "grad_norm": 0.027466725558042526, "grad_norm_var": 1.6911072487475754e-06, "learning_rate": 0.0006424407988946817, "loss": 2.533, "step": 23174 }, { "crossentropy": 2.510200262069702, "epoch": 0.8401609628770301, "grad_norm": 0.026559337973594666, "grad_norm_var": 1.6717576195002595e-06, "learning_rate": 0.0006421558777449715, "loss": 2.4082, "step": 23175 }, { "crossentropy": 2.5745558738708496, "epoch": 0.8401972157772621, "grad_norm": 0.02566087432205677, "grad_norm_var": 1.6863790545135204e-06, "learning_rate": 0.0006418710154545588, "loss": 2.5181, "step": 23176 }, { "crossentropy": 2.589240312576294, "epoch": 0.8402334686774942, "grad_norm": 0.026498882099986076, "grad_norm_var": 1.596276443724615e-06, "learning_rate": 0.0006415862120272914, "loss": 2.5292, "step": 23177 }, { "crossentropy": 2.5089921951293945, "epoch": 0.8402697215777262, "grad_norm": 0.02632916532456875, "grad_norm_var": 1.6331618595377106e-06, "learning_rate": 0.0006413014674670164, "loss": 2.474, "step": 23178 }, { "crossentropy": 2.457589626312256, "epoch": 0.8403059744779582, "grad_norm": 0.026608379557728767, "grad_norm_var": 1.3972357371586866e-06, "learning_rate": 0.0006410167817775797, "loss": 2.5093, "step": 23179 }, { "crossentropy": 2.4587786197662354, "epoch": 0.8403422273781903, "grad_norm": 0.026005510240793228, "grad_norm_var": 1.262279276427238e-06, "learning_rate": 0.0006407321549628253, "loss": 2.4488, "step": 23180 }, { "crossentropy": 2.3555757999420166, "epoch": 0.8403784802784223, "grad_norm": 0.027771666646003723, "grad_norm_var": 1.2970739919667662e-06, "learning_rate": 0.0006404475870266002, "loss": 2.3821, "step": 23181 }, { "crossentropy": 2.3919272422790527, "epoch": 0.8404147331786543, "grad_norm": 0.027577094733715057, "grad_norm_var": 1.2526050886368008e-06, "learning_rate": 0.0006401630779727452, "loss": 2.3656, "step": 23182 }, { "crossentropy": 2.3522090911865234, "epoch": 0.8404509860788864, "grad_norm": 0.02707165665924549, "grad_norm_var": 1.2526220488034053e-06, "learning_rate": 0.0006398786278051055, "loss": 2.4319, "step": 23183 }, { "crossentropy": 2.516350507736206, "epoch": 0.8404872389791184, "grad_norm": 0.026042193174362183, "grad_norm_var": 3.9605742643318794e-07, "learning_rate": 0.0006395942365275209, "loss": 2.4839, "step": 23184 }, { "crossentropy": 2.479801654815674, "epoch": 0.8405234918793504, "grad_norm": 0.027100440114736557, "grad_norm_var": 3.392444468680676e-07, "learning_rate": 0.0006393099041438327, "loss": 2.4549, "step": 23185 }, { "crossentropy": 2.5518274307250977, "epoch": 0.8405597447795824, "grad_norm": 0.02697673812508583, "grad_norm_var": 3.43364077023435e-07, "learning_rate": 0.0006390256306578829, "loss": 2.5362, "step": 23186 }, { "crossentropy": 2.466461420059204, "epoch": 0.8405959976798144, "grad_norm": 0.02582460455596447, "grad_norm_var": 3.914022753352559e-07, "learning_rate": 0.0006387414160735088, "loss": 2.3978, "step": 23187 }, { "crossentropy": 2.4028608798980713, "epoch": 0.8406322505800464, "grad_norm": 0.0298428013920784, "grad_norm_var": 1.0220766034792767e-06, "learning_rate": 0.0006384572603945505, "loss": 2.4117, "step": 23188 }, { "crossentropy": 2.453122854232788, "epoch": 0.8406685034802784, "grad_norm": 0.027471469715237617, "grad_norm_var": 1.0337959435468736e-06, "learning_rate": 0.0006381731636248467, "loss": 2.4333, "step": 23189 }, { "crossentropy": 2.414309024810791, "epoch": 0.8407047563805105, "grad_norm": 0.025485385209321976, "grad_norm_var": 1.1361649287361242e-06, "learning_rate": 0.0006378891257682323, "loss": 2.434, "step": 23190 }, { "crossentropy": 2.3909032344818115, "epoch": 0.8407410092807425, "grad_norm": 0.026657430455088615, "grad_norm_var": 1.1335972773881173e-06, "learning_rate": 0.0006376051468285466, "loss": 2.4231, "step": 23191 }, { "crossentropy": 2.474936008453369, "epoch": 0.8407772621809745, "grad_norm": 0.026548143476247787, "grad_norm_var": 1.047119703757509e-06, "learning_rate": 0.0006373212268096223, "loss": 2.4792, "step": 23192 }, { "crossentropy": 2.5517466068267822, "epoch": 0.8408135150812065, "grad_norm": 0.026909491047263145, "grad_norm_var": 1.0377103233367383e-06, "learning_rate": 0.0006370373657152962, "loss": 2.5502, "step": 23193 }, { "crossentropy": 2.5318400859832764, "epoch": 0.8408497679814385, "grad_norm": 0.02726045995950699, "grad_norm_var": 1.0224152410383154e-06, "learning_rate": 0.000636753563549402, "loss": 2.4766, "step": 23194 }, { "crossentropy": 2.4747729301452637, "epoch": 0.8408860208816705, "grad_norm": 0.0262767244130373, "grad_norm_var": 1.0442680143417256e-06, "learning_rate": 0.0006364698203157709, "loss": 2.5284, "step": 23195 }, { "crossentropy": 2.3994991779327393, "epoch": 0.8409222737819025, "grad_norm": 0.026819905266165733, "grad_norm_var": 9.857287340850889e-07, "learning_rate": 0.0006361861360182369, "loss": 2.4044, "step": 23196 }, { "crossentropy": 2.378669023513794, "epoch": 0.8409585266821346, "grad_norm": 0.026367001235485077, "grad_norm_var": 9.602636623836594e-07, "learning_rate": 0.0006359025106606325, "loss": 2.424, "step": 23197 }, { "crossentropy": 2.4706082344055176, "epoch": 0.8409947795823666, "grad_norm": 0.027288619428873062, "grad_norm_var": 9.390164671430606e-07, "learning_rate": 0.0006356189442467863, "loss": 2.4965, "step": 23198 }, { "crossentropy": 2.44746994972229, "epoch": 0.8410310324825986, "grad_norm": 0.027815409004688263, "grad_norm_var": 9.934441702622033e-07, "learning_rate": 0.0006353354367805308, "loss": 2.4363, "step": 23199 }, { "crossentropy": 2.500993490219116, "epoch": 0.8410672853828306, "grad_norm": 0.026497462764382362, "grad_norm_var": 9.532392981298574e-07, "learning_rate": 0.0006350519882656925, "loss": 2.4291, "step": 23200 }, { "crossentropy": 2.5374441146850586, "epoch": 0.8411035382830626, "grad_norm": 0.026686707511544228, "grad_norm_var": 9.554391104206471e-07, "learning_rate": 0.0006347685987061025, "loss": 2.4709, "step": 23201 }, { "crossentropy": 2.6370849609375, "epoch": 0.8411397911832946, "grad_norm": 0.025208966806530952, "grad_norm_var": 1.13750229759472e-06, "learning_rate": 0.0006344852681055868, "loss": 2.4982, "step": 23202 }, { "crossentropy": 2.435991048812866, "epoch": 0.8411760440835266, "grad_norm": 0.02584720402956009, "grad_norm_var": 1.1345648531789025e-06, "learning_rate": 0.0006342019964679713, "loss": 2.4108, "step": 23203 }, { "crossentropy": 2.303518295288086, "epoch": 0.8412122969837587, "grad_norm": 0.02668728493154049, "grad_norm_var": 4.814974019356898e-07, "learning_rate": 0.0006339187837970834, "loss": 2.2993, "step": 23204 }, { "crossentropy": 2.242089033126831, "epoch": 0.8412485498839907, "grad_norm": 0.0262833870947361, "grad_norm_var": 4.3392231121227215e-07, "learning_rate": 0.0006336356300967494, "loss": 2.2776, "step": 23205 }, { "crossentropy": 2.3667430877685547, "epoch": 0.8412848027842227, "grad_norm": 0.0257570780813694, "grad_norm_var": 4.003326435344142e-07, "learning_rate": 0.0006333525353707909, "loss": 2.3759, "step": 23206 }, { "crossentropy": 2.411130905151367, "epoch": 0.8413210556844548, "grad_norm": 0.027069278061389923, "grad_norm_var": 4.164512222234179e-07, "learning_rate": 0.000633069499623034, "loss": 2.4354, "step": 23207 }, { "crossentropy": 2.4090800285339355, "epoch": 0.8413573085846868, "grad_norm": 0.030397620052099228, "grad_norm_var": 1.3248714541226237e-06, "learning_rate": 0.0006327865228572999, "loss": 2.4597, "step": 23208 }, { "crossentropy": 2.4412310123443604, "epoch": 0.8413935614849188, "grad_norm": 0.02636554464697838, "grad_norm_var": 1.3371117935446578e-06, "learning_rate": 0.0006325036050774124, "loss": 2.4505, "step": 23209 }, { "crossentropy": 2.5722742080688477, "epoch": 0.8414298143851509, "grad_norm": 0.02628219500184059, "grad_norm_var": 1.3354673392672206e-06, "learning_rate": 0.000632220746287191, "loss": 2.4667, "step": 23210 }, { "crossentropy": 2.4065778255462646, "epoch": 0.8414660672853829, "grad_norm": 0.025549840182065964, "grad_norm_var": 1.4122410301555989e-06, "learning_rate": 0.0006319379464904562, "loss": 2.5195, "step": 23211 }, { "crossentropy": 2.6355011463165283, "epoch": 0.8415023201856149, "grad_norm": 0.026671092957258224, "grad_norm_var": 1.4109030985646916e-06, "learning_rate": 0.000631655205691028, "loss": 2.4897, "step": 23212 }, { "crossentropy": 2.4722185134887695, "epoch": 0.8415385730858469, "grad_norm": 0.02788497507572174, "grad_norm_var": 1.4929006524795502e-06, "learning_rate": 0.000631372523892727, "loss": 2.429, "step": 23213 }, { "crossentropy": 2.5085439682006836, "epoch": 0.8415748259860789, "grad_norm": 0.026636211201548576, "grad_norm_var": 1.4742407793933196e-06, "learning_rate": 0.0006310899010993681, "loss": 2.4576, "step": 23214 }, { "crossentropy": 2.40368914604187, "epoch": 0.8416110788863109, "grad_norm": 0.025663994252681732, "grad_norm_var": 1.451459519729659e-06, "learning_rate": 0.0006308073373147715, "loss": 2.3925, "step": 23215 }, { "crossentropy": 2.5077121257781982, "epoch": 0.8416473317865429, "grad_norm": 0.026507163420319557, "grad_norm_var": 1.4513417631841722e-06, "learning_rate": 0.0006305248325427509, "loss": 2.4824, "step": 23216 }, { "crossentropy": 2.4801137447357178, "epoch": 0.841683584686775, "grad_norm": 0.026478657498955727, "grad_norm_var": 1.4514658966002185e-06, "learning_rate": 0.0006302423867871237, "loss": 2.437, "step": 23217 }, { "crossentropy": 2.391202211380005, "epoch": 0.841719837587007, "grad_norm": 0.026591066271066666, "grad_norm_var": 1.3180785742361588e-06, "learning_rate": 0.0006299600000517064, "loss": 2.3864, "step": 23218 }, { "crossentropy": 2.3228437900543213, "epoch": 0.841756090487239, "grad_norm": 0.026500428095459938, "grad_norm_var": 1.2733428082794489e-06, "learning_rate": 0.0006296776723403086, "loss": 2.3933, "step": 23219 }, { "crossentropy": 2.375777006149292, "epoch": 0.841792343387471, "grad_norm": 0.025238730013370514, "grad_norm_var": 1.408461835880094e-06, "learning_rate": 0.0006293954036567467, "loss": 2.3995, "step": 23220 }, { "crossentropy": 2.5627520084381104, "epoch": 0.841828596287703, "grad_norm": 0.03007718361914158, "grad_norm_var": 2.1390966377357226e-06, "learning_rate": 0.0006291131940048334, "loss": 2.4803, "step": 23221 }, { "crossentropy": 2.3082799911499023, "epoch": 0.841864849187935, "grad_norm": 0.025961659848690033, "grad_norm_var": 2.111779097106039e-06, "learning_rate": 0.0006288310433883781, "loss": 2.293, "step": 23222 }, { "crossentropy": 2.34116268157959, "epoch": 0.841901102088167, "grad_norm": 0.02586260996758938, "grad_norm_var": 2.1702743633717285e-06, "learning_rate": 0.0006285489518111942, "loss": 2.299, "step": 23223 }, { "crossentropy": 2.394435405731201, "epoch": 0.8419373549883991, "grad_norm": 0.028400931507349014, "grad_norm_var": 1.4594901029206404e-06, "learning_rate": 0.0006282669192770896, "loss": 2.3732, "step": 23224 }, { "crossentropy": 2.422710657119751, "epoch": 0.8419736078886311, "grad_norm": 0.02592410333454609, "grad_norm_var": 1.4894138609570686e-06, "learning_rate": 0.0006279849457898745, "loss": 2.4777, "step": 23225 }, { "crossentropy": 2.344541549682617, "epoch": 0.8420098607888631, "grad_norm": 0.027992403134703636, "grad_norm_var": 1.590755646875032e-06, "learning_rate": 0.00062770303135336, "loss": 2.4025, "step": 23226 }, { "crossentropy": 2.3941216468811035, "epoch": 0.8420461136890951, "grad_norm": 0.02657894417643547, "grad_norm_var": 1.4927735620654318e-06, "learning_rate": 0.0006274211759713482, "loss": 2.3813, "step": 23227 }, { "crossentropy": 2.4027369022369385, "epoch": 0.8420823665893271, "grad_norm": 0.02735070139169693, "grad_norm_var": 1.508995793872194e-06, "learning_rate": 0.0006271393796476499, "loss": 2.3873, "step": 23228 }, { "crossentropy": 2.4491207599639893, "epoch": 0.8421186194895591, "grad_norm": 0.026095472276210785, "grad_norm_var": 1.4629374603771414e-06, "learning_rate": 0.0006268576423860711, "loss": 2.4733, "step": 23229 }, { "crossentropy": 2.455697536468506, "epoch": 0.8421548723897911, "grad_norm": 0.026573192328214645, "grad_norm_var": 1.4640683983042305e-06, "learning_rate": 0.0006265759641904156, "loss": 2.4743, "step": 23230 }, { "crossentropy": 2.3456876277923584, "epoch": 0.8421911252900232, "grad_norm": 0.02714317850768566, "grad_norm_var": 1.3891298277542962e-06, "learning_rate": 0.0006262943450644892, "loss": 2.3861, "step": 23231 }, { "crossentropy": 2.475226879119873, "epoch": 0.8422273781902552, "grad_norm": 0.026304561644792557, "grad_norm_var": 1.4004102286024744e-06, "learning_rate": 0.0006260127850120945, "loss": 2.4349, "step": 23232 }, { "crossentropy": 2.3804471492767334, "epoch": 0.8422636310904872, "grad_norm": 0.026162628084421158, "grad_norm_var": 1.4209140175018564e-06, "learning_rate": 0.0006257312840370344, "loss": 2.3967, "step": 23233 }, { "crossentropy": 2.4740123748779297, "epoch": 0.8422998839907193, "grad_norm": 0.03077453002333641, "grad_norm_var": 2.3996787406013744e-06, "learning_rate": 0.0006254498421431127, "loss": 2.4921, "step": 23234 }, { "crossentropy": 2.646867036819458, "epoch": 0.8423361368909513, "grad_norm": 0.0272411797195673, "grad_norm_var": 2.378821821538239e-06, "learning_rate": 0.0006251684593341295, "loss": 2.507, "step": 23235 }, { "crossentropy": 2.3507418632507324, "epoch": 0.8423723897911833, "grad_norm": 0.026257626712322235, "grad_norm_var": 2.1901509866357563e-06, "learning_rate": 0.0006248871356138841, "loss": 2.3529, "step": 23236 }, { "crossentropy": 2.448673725128174, "epoch": 0.8424086426914154, "grad_norm": 0.026448726654052734, "grad_norm_var": 1.605951140188364e-06, "learning_rate": 0.0006246058709861784, "loss": 2.426, "step": 23237 }, { "crossentropy": 2.391770601272583, "epoch": 0.8424448955916474, "grad_norm": 0.025950537994503975, "grad_norm_var": 1.6074126728377972e-06, "learning_rate": 0.0006243246654548096, "loss": 2.4333, "step": 23238 }, { "crossentropy": 2.498976707458496, "epoch": 0.8424811484918794, "grad_norm": 0.025853551924228668, "grad_norm_var": 1.6087206168064234e-06, "learning_rate": 0.0006240435190235777, "loss": 2.4738, "step": 23239 }, { "crossentropy": 2.4908573627471924, "epoch": 0.8425174013921114, "grad_norm": 0.02689918875694275, "grad_norm_var": 1.4573003448613141e-06, "learning_rate": 0.0006237624316962775, "loss": 2.4876, "step": 23240 }, { "crossentropy": 2.4323413372039795, "epoch": 0.8425536542923434, "grad_norm": 0.025885168462991714, "grad_norm_var": 1.4621856605075608e-06, "learning_rate": 0.0006234814034767066, "loss": 2.4299, "step": 23241 }, { "crossentropy": 2.369830369949341, "epoch": 0.8425899071925754, "grad_norm": 0.02620377205312252, "grad_norm_var": 1.3883729509713467e-06, "learning_rate": 0.0006232004343686625, "loss": 2.3898, "step": 23242 }, { "crossentropy": 2.4758057594299316, "epoch": 0.8426261600928074, "grad_norm": 0.027103126049041748, "grad_norm_var": 1.394800775551761e-06, "learning_rate": 0.0006229195243759384, "loss": 2.4898, "step": 23243 }, { "crossentropy": 2.4258053302764893, "epoch": 0.8426624129930395, "grad_norm": 0.025817323476076126, "grad_norm_var": 1.4220981568289585e-06, "learning_rate": 0.000622638673502327, "loss": 2.3776, "step": 23244 }, { "crossentropy": 2.496297836303711, "epoch": 0.8426986658932715, "grad_norm": 0.02725771628320217, "grad_norm_var": 1.4175520568708896e-06, "learning_rate": 0.0006223578817516251, "loss": 2.4225, "step": 23245 }, { "crossentropy": 2.502671480178833, "epoch": 0.8427349187935035, "grad_norm": 0.027915582060813904, "grad_norm_var": 1.4999187592600368e-06, "learning_rate": 0.0006220771491276217, "loss": 2.4812, "step": 23246 }, { "crossentropy": 2.562299966812134, "epoch": 0.8427711716937355, "grad_norm": 0.026323363184928894, "grad_norm_var": 1.5072708439783344e-06, "learning_rate": 0.0006217964756341116, "loss": 2.5123, "step": 23247 }, { "crossentropy": 2.4554975032806396, "epoch": 0.8428074245939675, "grad_norm": 0.025637824088335037, "grad_norm_var": 1.576867846217409e-06, "learning_rate": 0.0006215158612748828, "loss": 2.375, "step": 23248 }, { "crossentropy": 2.490276336669922, "epoch": 0.8428436774941995, "grad_norm": 0.02606033720076084, "grad_norm_var": 1.5853042679274022e-06, "learning_rate": 0.0006212353060537268, "loss": 2.5047, "step": 23249 }, { "crossentropy": 2.533200740814209, "epoch": 0.8428799303944315, "grad_norm": 0.026128772646188736, "grad_norm_var": 4.269719519526023e-07, "learning_rate": 0.0006209548099744345, "loss": 2.4966, "step": 23250 }, { "crossentropy": 2.5060019493103027, "epoch": 0.8429161832946636, "grad_norm": 0.02573978528380394, "grad_norm_var": 4.067704398651057e-07, "learning_rate": 0.0006206743730407927, "loss": 2.4083, "step": 23251 }, { "crossentropy": 2.3266494274139404, "epoch": 0.8429524361948956, "grad_norm": 0.026354392990469933, "grad_norm_var": 4.062586846713719e-07, "learning_rate": 0.0006203939952565879, "loss": 2.4173, "step": 23252 }, { "crossentropy": 2.409186363220215, "epoch": 0.8429886890951276, "grad_norm": 0.026293722912669182, "grad_norm_var": 4.056930123448136e-07, "learning_rate": 0.0006201136766256105, "loss": 2.4213, "step": 23253 }, { "crossentropy": 2.3872499465942383, "epoch": 0.8430249419953596, "grad_norm": 0.02556631900370121, "grad_norm_var": 4.348206536758844e-07, "learning_rate": 0.0006198334171516428, "loss": 2.448, "step": 23254 }, { "crossentropy": 2.351802349090576, "epoch": 0.8430611948955916, "grad_norm": 0.029101630672812462, "grad_norm_var": 8.94355452280337e-07, "learning_rate": 0.0006195532168384721, "loss": 2.3741, "step": 23255 }, { "crossentropy": 2.4767982959747314, "epoch": 0.8430974477958236, "grad_norm": 0.025894805788993835, "grad_norm_var": 9.063568015514235e-07, "learning_rate": 0.000619273075689884, "loss": 2.479, "step": 23256 }, { "crossentropy": 2.3974969387054443, "epoch": 0.8431337006960556, "grad_norm": 0.026596317067742348, "grad_norm_var": 8.839121680513535e-07, "learning_rate": 0.0006189929937096594, "loss": 2.4682, "step": 23257 }, { "crossentropy": 2.4052734375, "epoch": 0.8431699535962877, "grad_norm": 0.029205121099948883, "grad_norm_var": 1.3285040017187948e-06, "learning_rate": 0.0006187129709015848, "loss": 2.41, "step": 23258 }, { "crossentropy": 2.5588080883026123, "epoch": 0.8432062064965197, "grad_norm": 0.026307685300707817, "grad_norm_var": 1.3239430017218552e-06, "learning_rate": 0.0006184330072694389, "loss": 2.4781, "step": 23259 }, { "crossentropy": 2.485682964324951, "epoch": 0.8432424593967517, "grad_norm": 0.026982922106981277, "grad_norm_var": 1.281383754253978e-06, "learning_rate": 0.0006181531028170051, "loss": 2.4816, "step": 23260 }, { "crossentropy": 2.401841163635254, "epoch": 0.8432787122969838, "grad_norm": 0.026484651491045952, "grad_norm_var": 1.2623201253072458e-06, "learning_rate": 0.0006178732575480639, "loss": 2.4234, "step": 23261 }, { "crossentropy": 2.519519329071045, "epoch": 0.8433149651972158, "grad_norm": 0.02815297059714794, "grad_norm_var": 1.30551790340747e-06, "learning_rate": 0.0006175934714663933, "loss": 2.4533, "step": 23262 }, { "crossentropy": 2.4071972370147705, "epoch": 0.8433512180974478, "grad_norm": 0.028349673375487328, "grad_norm_var": 1.4666182758769908e-06, "learning_rate": 0.0006173137445757732, "loss": 2.321, "step": 23263 }, { "crossentropy": 2.47733998298645, "epoch": 0.8433874709976799, "grad_norm": 0.027003634721040726, "grad_norm_var": 1.3709185828457426e-06, "learning_rate": 0.0006170340768799831, "loss": 2.5148, "step": 23264 }, { "crossentropy": 2.439561367034912, "epoch": 0.8434237238979119, "grad_norm": 0.026108261197805405, "grad_norm_var": 1.3657675850164135e-06, "learning_rate": 0.000616754468382798, "loss": 2.3957, "step": 23265 }, { "crossentropy": 2.4604711532592773, "epoch": 0.8434599767981439, "grad_norm": 0.026056908071041107, "grad_norm_var": 1.3734027698677897e-06, "learning_rate": 0.000616474919087997, "loss": 2.4384, "step": 23266 }, { "crossentropy": 2.4489119052886963, "epoch": 0.8434962296983759, "grad_norm": 0.025716813281178474, "grad_norm_var": 1.376950896589344e-06, "learning_rate": 0.0006161954289993532, "loss": 2.3825, "step": 23267 }, { "crossentropy": 2.4018187522888184, "epoch": 0.8435324825986079, "grad_norm": 0.026816129684448242, "grad_norm_var": 1.3575482719525711e-06, "learning_rate": 0.0006159159981206442, "loss": 2.3859, "step": 23268 }, { "crossentropy": 2.3739123344421387, "epoch": 0.8435687354988399, "grad_norm": 0.026947317644953728, "grad_norm_var": 1.330118865755731e-06, "learning_rate": 0.0006156366264556424, "loss": 2.5129, "step": 23269 }, { "crossentropy": 2.459066867828369, "epoch": 0.8436049883990719, "grad_norm": 0.026501689106225967, "grad_norm_var": 1.2115234363986593e-06, "learning_rate": 0.0006153573140081204, "loss": 2.4677, "step": 23270 }, { "crossentropy": 2.400359630584717, "epoch": 0.843641241299304, "grad_norm": 0.026625437662005424, "grad_norm_var": 9.055462094037515e-07, "learning_rate": 0.0006150780607818518, "loss": 2.3711, "step": 23271 }, { "crossentropy": 2.4508421421051025, "epoch": 0.843677494199536, "grad_norm": 0.025866683572530746, "grad_norm_var": 9.092124939046852e-07, "learning_rate": 0.0006147988667806093, "loss": 2.4352, "step": 23272 }, { "crossentropy": 2.6486027240753174, "epoch": 0.843713747099768, "grad_norm": 0.027019092813134193, "grad_norm_var": 9.056529870469024e-07, "learning_rate": 0.0006145197320081619, "loss": 2.5919, "step": 23273 }, { "crossentropy": 2.5169739723205566, "epoch": 0.84375, "grad_norm": 0.025962699204683304, "grad_norm_var": 5.592871676986244e-07, "learning_rate": 0.0006142406564682818, "loss": 2.4739, "step": 23274 }, { "crossentropy": 2.5921707153320312, "epoch": 0.843786252900232, "grad_norm": 0.026003317907452583, "grad_norm_var": 5.802437770147453e-07, "learning_rate": 0.0006139616401647363, "loss": 2.5252, "step": 23275 }, { "crossentropy": 2.508575916290283, "epoch": 0.843822505800464, "grad_norm": 0.02587319351732731, "grad_norm_var": 6.09784873659364e-07, "learning_rate": 0.0006136826831012949, "loss": 2.4816, "step": 23276 }, { "crossentropy": 2.4978559017181396, "epoch": 0.843858758700696, "grad_norm": 0.026736602187156677, "grad_norm_var": 6.101115282198973e-07, "learning_rate": 0.0006134037852817276, "loss": 2.4257, "step": 23277 }, { "crossentropy": 2.516129970550537, "epoch": 0.8438950116009281, "grad_norm": 0.026015864685177803, "grad_norm_var": 4.555487149382013e-07, "learning_rate": 0.0006131249467097965, "loss": 2.5062, "step": 23278 }, { "crossentropy": 2.459218740463257, "epoch": 0.8439312645011601, "grad_norm": 0.026579972356557846, "grad_norm_var": 2.0898957814364088e-07, "learning_rate": 0.0006128461673892704, "loss": 2.4292, "step": 23279 }, { "crossentropy": 2.5055935382843018, "epoch": 0.8439675174013921, "grad_norm": 0.027474211528897285, "grad_norm_var": 2.629249898831893e-07, "learning_rate": 0.0006125674473239162, "loss": 2.418, "step": 23280 }, { "crossentropy": 2.3974199295043945, "epoch": 0.8440037703016241, "grad_norm": 0.02537287026643753, "grad_norm_var": 3.247434667302355e-07, "learning_rate": 0.0006122887865174953, "loss": 2.3996, "step": 23281 }, { "crossentropy": 2.3379411697387695, "epoch": 0.8440400232018561, "grad_norm": 0.02678549289703369, "grad_norm_var": 3.2963780720041135e-07, "learning_rate": 0.0006120101849737741, "loss": 2.3947, "step": 23282 }, { "crossentropy": 2.4952802658081055, "epoch": 0.8440762761020881, "grad_norm": 0.027983687818050385, "grad_norm_var": 4.4625307245569657e-07, "learning_rate": 0.000611731642696513, "loss": 2.5167, "step": 23283 }, { "crossentropy": 2.388288974761963, "epoch": 0.8441125290023201, "grad_norm": 0.02670997940003872, "grad_norm_var": 4.429821531990244e-07, "learning_rate": 0.0006114531596894757, "loss": 2.3596, "step": 23284 }, { "crossentropy": 2.433500051498413, "epoch": 0.8441487819025522, "grad_norm": 0.02666218765079975, "grad_norm_var": 4.3214603774599514e-07, "learning_rate": 0.0006111747359564257, "loss": 2.4528, "step": 23285 }, { "crossentropy": 2.408639430999756, "epoch": 0.8441850348027842, "grad_norm": 0.026785433292388916, "grad_norm_var": 4.3683284032484254e-07, "learning_rate": 0.0006108963715011185, "loss": 2.3843, "step": 23286 }, { "crossentropy": 2.5090551376342773, "epoch": 0.8442212877030162, "grad_norm": 0.026248829439282417, "grad_norm_var": 4.408320613639531e-07, "learning_rate": 0.000610618066327317, "loss": 2.4197, "step": 23287 }, { "crossentropy": 2.575124502182007, "epoch": 0.8442575406032483, "grad_norm": 0.02755478397011757, "grad_norm_var": 4.7526328303288773e-07, "learning_rate": 0.0006103398204387806, "loss": 2.5435, "step": 23288 }, { "crossentropy": 2.4717822074890137, "epoch": 0.8442937935034803, "grad_norm": 0.026711659505963326, "grad_norm_var": 4.644223743512551e-07, "learning_rate": 0.0006100616338392651, "loss": 2.5217, "step": 23289 }, { "crossentropy": 2.4996511936187744, "epoch": 0.8443300464037123, "grad_norm": 0.026134654879570007, "grad_norm_var": 4.5185824554718155e-07, "learning_rate": 0.0006097835065325308, "loss": 2.5025, "step": 23290 }, { "crossentropy": 2.4752957820892334, "epoch": 0.8443662993039444, "grad_norm": 0.027795646339654922, "grad_norm_var": 5.095534241166307e-07, "learning_rate": 0.0006095054385223308, "loss": 2.4714, "step": 23291 }, { "crossentropy": 2.4624249935150146, "epoch": 0.8444025522041764, "grad_norm": 0.027633173391222954, "grad_norm_var": 5.058263088270846e-07, "learning_rate": 0.0006092274298124234, "loss": 2.4516, "step": 23292 }, { "crossentropy": 2.417736291885376, "epoch": 0.8444388051044084, "grad_norm": 0.02634972333908081, "grad_norm_var": 5.196927108016506e-07, "learning_rate": 0.0006089494804065637, "loss": 2.3891, "step": 23293 }, { "crossentropy": 2.3897526264190674, "epoch": 0.8444750580046404, "grad_norm": 0.02631424553692341, "grad_norm_var": 4.940655765961795e-07, "learning_rate": 0.000608671590308505, "loss": 2.4567, "step": 23294 }, { "crossentropy": 2.373419761657715, "epoch": 0.8445113109048724, "grad_norm": 0.02613093890249729, "grad_norm_var": 5.209504997605307e-07, "learning_rate": 0.000608393759521999, "loss": 2.4008, "step": 23295 }, { "crossentropy": 2.4043612480163574, "epoch": 0.8445475638051044, "grad_norm": 0.02639533206820488, "grad_norm_var": 4.953426632750644e-07, "learning_rate": 0.0006081159880508013, "loss": 2.4414, "step": 23296 }, { "crossentropy": 2.3822617530822754, "epoch": 0.8445838167053364, "grad_norm": 0.02704458124935627, "grad_norm_var": 3.6906047242059544e-07, "learning_rate": 0.0006078382758986606, "loss": 2.4206, "step": 23297 }, { "crossentropy": 2.538925886154175, "epoch": 0.8446200696055685, "grad_norm": 0.026846513152122498, "grad_norm_var": 3.6895124035969903e-07, "learning_rate": 0.0006075606230693309, "loss": 2.4797, "step": 23298 }, { "crossentropy": 2.430241346359253, "epoch": 0.8446563225058005, "grad_norm": 0.027120981365442276, "grad_norm_var": 2.8291541833887676e-07, "learning_rate": 0.0006072830295665593, "loss": 2.4868, "step": 23299 }, { "crossentropy": 2.4929146766662598, "epoch": 0.8446925754060325, "grad_norm": 0.027694454416632652, "grad_norm_var": 3.3463784504620766e-07, "learning_rate": 0.0006070054953940968, "loss": 2.5574, "step": 23300 }, { "crossentropy": 2.572334051132202, "epoch": 0.8447288283062645, "grad_norm": 0.02664790488779545, "grad_norm_var": 3.349872081908715e-07, "learning_rate": 0.000606728020555693, "loss": 2.5097, "step": 23301 }, { "crossentropy": 2.4538166522979736, "epoch": 0.8447650812064965, "grad_norm": 0.025659864768385887, "grad_norm_var": 4.220657570279714e-07, "learning_rate": 0.0006064506050550938, "loss": 2.4487, "step": 23302 }, { "crossentropy": 2.416546583175659, "epoch": 0.8448013341067285, "grad_norm": 0.027968721464276314, "grad_norm_var": 4.879544479155362e-07, "learning_rate": 0.0006061732488960458, "loss": 2.4131, "step": 23303 }, { "crossentropy": 2.4617135524749756, "epoch": 0.8448375870069605, "grad_norm": 0.026409650221467018, "grad_norm_var": 4.661505176116349e-07, "learning_rate": 0.0006058959520822971, "loss": 2.4319, "step": 23304 }, { "crossentropy": 2.360140323638916, "epoch": 0.8448738399071926, "grad_norm": 0.025600260123610497, "grad_norm_var": 5.569795280410221e-07, "learning_rate": 0.0006056187146175906, "loss": 2.4592, "step": 23305 }, { "crossentropy": 2.425550937652588, "epoch": 0.8449100928074246, "grad_norm": 0.027227183803915977, "grad_norm_var": 5.442497351711931e-07, "learning_rate": 0.000605341536505673, "loss": 2.4045, "step": 23306 }, { "crossentropy": 2.305372476577759, "epoch": 0.8449463457076566, "grad_norm": 0.025746816769242287, "grad_norm_var": 5.352870512365451e-07, "learning_rate": 0.0006050644177502862, "loss": 2.3699, "step": 23307 }, { "crossentropy": 2.3884875774383545, "epoch": 0.8449825986078886, "grad_norm": 0.026358162984251976, "grad_norm_var": 4.738968859658091e-07, "learning_rate": 0.0006047873583551744, "loss": 2.4056, "step": 23308 }, { "crossentropy": 2.4076921939849854, "epoch": 0.8450188515081206, "grad_norm": 0.026310382410883904, "grad_norm_var": 4.7527867659487703e-07, "learning_rate": 0.0006045103583240797, "loss": 2.3786, "step": 23309 }, { "crossentropy": 2.442784070968628, "epoch": 0.8450551044083526, "grad_norm": 0.027176933363080025, "grad_norm_var": 4.898156362128827e-07, "learning_rate": 0.0006042334176607434, "loss": 2.4281, "step": 23310 }, { "crossentropy": 2.4352853298187256, "epoch": 0.8450913573085846, "grad_norm": 0.02586505375802517, "grad_norm_var": 5.1249962309204e-07, "learning_rate": 0.0006039565363689043, "loss": 2.4042, "step": 23311 }, { "crossentropy": 2.5053718090057373, "epoch": 0.8451276102088167, "grad_norm": 0.02804805338382721, "grad_norm_var": 6.316047221497824e-07, "learning_rate": 0.0006036797144523048, "loss": 2.4888, "step": 23312 }, { "crossentropy": 2.390946388244629, "epoch": 0.8451638631090487, "grad_norm": 0.02746613137423992, "grad_norm_var": 6.602329211290257e-07, "learning_rate": 0.000603402951914681, "loss": 2.4406, "step": 23313 }, { "crossentropy": 2.4968111515045166, "epoch": 0.8452001160092807, "grad_norm": 0.026109686121344566, "grad_norm_var": 6.855862829672927e-07, "learning_rate": 0.0006031262487597738, "loss": 2.4243, "step": 23314 }, { "crossentropy": 2.3802576065063477, "epoch": 0.8452363689095128, "grad_norm": 0.026847995817661285, "grad_norm_var": 6.753992152585171e-07, "learning_rate": 0.0006028496049913179, "loss": 2.488, "step": 23315 }, { "crossentropy": 2.5530598163604736, "epoch": 0.8452726218097448, "grad_norm": 0.02735748700797558, "grad_norm_var": 6.376398845849932e-07, "learning_rate": 0.0006025730206130509, "loss": 2.5135, "step": 23316 }, { "crossentropy": 2.459979295730591, "epoch": 0.8453088747099768, "grad_norm": 0.027961213141679764, "grad_norm_var": 7.406908258076554e-07, "learning_rate": 0.0006022964956287097, "loss": 2.4075, "step": 23317 }, { "crossentropy": 2.5640244483947754, "epoch": 0.8453451276102089, "grad_norm": 0.02649390883743763, "grad_norm_var": 6.621486893865995e-07, "learning_rate": 0.000602020030042027, "loss": 2.504, "step": 23318 }, { "crossentropy": 2.424032211303711, "epoch": 0.8453813805104409, "grad_norm": 0.02549413964152336, "grad_norm_var": 6.623025429056591e-07, "learning_rate": 0.0006017436238567392, "loss": 2.2854, "step": 23319 }, { "crossentropy": 2.2707743644714355, "epoch": 0.8454176334106729, "grad_norm": 0.029642349109053612, "grad_norm_var": 1.2098836543530366e-06, "learning_rate": 0.0006014672770765783, "loss": 2.4223, "step": 23320 }, { "crossentropy": 2.3965511322021484, "epoch": 0.8454538863109049, "grad_norm": 0.026220522820949554, "grad_norm_var": 1.130026768117566e-06, "learning_rate": 0.0006011909897052753, "loss": 2.3544, "step": 23321 }, { "crossentropy": 2.52337646484375, "epoch": 0.8454901392111369, "grad_norm": 0.026999132707715034, "grad_norm_var": 1.1231880149918517e-06, "learning_rate": 0.0006009147617465649, "loss": 2.4964, "step": 23322 }, { "crossentropy": 2.555434465408325, "epoch": 0.8455263921113689, "grad_norm": 0.027343645691871643, "grad_norm_var": 1.0410486785297304e-06, "learning_rate": 0.0006006385932041753, "loss": 2.5155, "step": 23323 }, { "crossentropy": 2.428220748901367, "epoch": 0.8455626450116009, "grad_norm": 0.026183895766735077, "grad_norm_var": 1.0574170094727973e-06, "learning_rate": 0.0006003624840818378, "loss": 2.4202, "step": 23324 }, { "crossentropy": 2.5465149879455566, "epoch": 0.845598897911833, "grad_norm": 0.0270837415009737, "grad_norm_var": 1.0267776874908155e-06, "learning_rate": 0.0006000864343832829, "loss": 2.4564, "step": 23325 }, { "crossentropy": 2.46662974357605, "epoch": 0.845635150812065, "grad_norm": 0.027120450511574745, "grad_norm_var": 1.0257829198041887e-06, "learning_rate": 0.0005998104441122366, "loss": 2.5086, "step": 23326 }, { "crossentropy": 2.5503416061401367, "epoch": 0.845671403712297, "grad_norm": 0.0275725070387125, "grad_norm_var": 9.462348419284611e-07, "learning_rate": 0.0005995345132724289, "loss": 2.4229, "step": 23327 }, { "crossentropy": 2.3485779762268066, "epoch": 0.845707656612529, "grad_norm": 0.02713596448302269, "grad_norm_var": 8.855556430296113e-07, "learning_rate": 0.0005992586418675855, "loss": 2.3497, "step": 23328 }, { "crossentropy": 2.374973773956299, "epoch": 0.845743909512761, "grad_norm": 0.026018116623163223, "grad_norm_var": 9.390692028048037e-07, "learning_rate": 0.0005989828299014316, "loss": 2.44, "step": 23329 }, { "crossentropy": 2.50844669342041, "epoch": 0.845780162412993, "grad_norm": 0.02816258929669857, "grad_norm_var": 9.65876610124703e-07, "learning_rate": 0.0005987070773776943, "loss": 2.4766, "step": 23330 }, { "crossentropy": 2.444345712661743, "epoch": 0.845816415313225, "grad_norm": 0.025733275339007378, "grad_norm_var": 1.0813442853349475e-06, "learning_rate": 0.000598431384300096, "loss": 2.4756, "step": 23331 }, { "crossentropy": 2.461524724960327, "epoch": 0.8458526682134571, "grad_norm": 0.027090542018413544, "grad_norm_var": 1.0742373973373608e-06, "learning_rate": 0.0005981557506723611, "loss": 2.5168, "step": 23332 }, { "crossentropy": 2.4411044120788574, "epoch": 0.8458889211136891, "grad_norm": 0.029206128790974617, "grad_norm_var": 1.3279956408990354e-06, "learning_rate": 0.0005978801764982144, "loss": 2.4784, "step": 23333 }, { "crossentropy": 2.1754138469696045, "epoch": 0.8459251740139211, "grad_norm": 0.025695648044347763, "grad_norm_var": 1.4316719215418452e-06, "learning_rate": 0.0005976046617813752, "loss": 2.333, "step": 23334 }, { "crossentropy": 2.405731678009033, "epoch": 0.8459614269141531, "grad_norm": 0.025726301595568657, "grad_norm_var": 1.3870674182846297e-06, "learning_rate": 0.000597329206525567, "loss": 2.4484, "step": 23335 }, { "crossentropy": 2.298065185546875, "epoch": 0.8459976798143851, "grad_norm": 0.026694096624851227, "grad_norm_var": 9.145883181430522e-07, "learning_rate": 0.0005970538107345091, "loss": 2.3335, "step": 23336 }, { "crossentropy": 2.448911666870117, "epoch": 0.8460339327146171, "grad_norm": 0.0271610040217638, "grad_norm_var": 8.879054188145555e-07, "learning_rate": 0.0005967784744119203, "loss": 2.4246, "step": 23337 }, { "crossentropy": 2.430830717086792, "epoch": 0.8460701856148491, "grad_norm": 0.026064414530992508, "grad_norm_var": 9.342620116721429e-07, "learning_rate": 0.0005965031975615215, "loss": 2.3866, "step": 23338 }, { "crossentropy": 2.576380729675293, "epoch": 0.8461064385150812, "grad_norm": 0.027744244784116745, "grad_norm_var": 9.69349492619768e-07, "learning_rate": 0.000596227980187028, "loss": 2.5851, "step": 23339 }, { "crossentropy": 2.3604073524475098, "epoch": 0.8461426914153132, "grad_norm": 0.025646822527050972, "grad_norm_var": 1.0386258447612662e-06, "learning_rate": 0.000595952822292159, "loss": 2.4337, "step": 23340 }, { "crossentropy": 2.449022054672241, "epoch": 0.8461789443155452, "grad_norm": 0.0254514180123806, "grad_norm_var": 1.1577638295113986e-06, "learning_rate": 0.0005956777238806316, "loss": 2.4565, "step": 23341 }, { "crossentropy": 2.420698404312134, "epoch": 0.8462151972157773, "grad_norm": 0.027732335031032562, "grad_norm_var": 1.2102472953194928e-06, "learning_rate": 0.0005954026849561589, "loss": 2.4258, "step": 23342 }, { "crossentropy": 2.3835012912750244, "epoch": 0.8462514501160093, "grad_norm": 0.025787191465497017, "grad_norm_var": 1.2260943952197562e-06, "learning_rate": 0.0005951277055224586, "loss": 2.3774, "step": 23343 }, { "crossentropy": 2.406449317932129, "epoch": 0.8462877030162413, "grad_norm": 0.026856964454054832, "grad_norm_var": 1.2143930449056683e-06, "learning_rate": 0.0005948527855832431, "loss": 2.4584, "step": 23344 }, { "crossentropy": 2.4947891235351562, "epoch": 0.8463239559164734, "grad_norm": 0.026317795738577843, "grad_norm_var": 1.1938309773979004e-06, "learning_rate": 0.0005945779251422245, "loss": 2.397, "step": 23345 }, { "crossentropy": 2.5814902782440186, "epoch": 0.8463602088167054, "grad_norm": 0.02716461569070816, "grad_norm_var": 1.0603864840343092e-06, "learning_rate": 0.0005943031242031178, "loss": 2.5075, "step": 23346 }, { "crossentropy": 2.558230400085449, "epoch": 0.8463964617169374, "grad_norm": 0.026166679337620735, "grad_norm_var": 1.0203332250421337e-06, "learning_rate": 0.0005940283827696319, "loss": 2.486, "step": 23347 }, { "crossentropy": 2.317060708999634, "epoch": 0.8464327146171694, "grad_norm": 0.02633940801024437, "grad_norm_var": 1.012139826704154e-06, "learning_rate": 0.0005937537008454797, "loss": 2.3261, "step": 23348 }, { "crossentropy": 2.3247649669647217, "epoch": 0.8464689675174014, "grad_norm": 0.026525234803557396, "grad_norm_var": 5.332363835989946e-07, "learning_rate": 0.0005934790784343708, "loss": 2.44, "step": 23349 }, { "crossentropy": 2.3725883960723877, "epoch": 0.8465052204176334, "grad_norm": 0.026773761957883835, "grad_norm_var": 4.985754560167417e-07, "learning_rate": 0.0005932045155400134, "loss": 2.3218, "step": 23350 }, { "crossentropy": 2.3004703521728516, "epoch": 0.8465414733178654, "grad_norm": 0.025669122114777565, "grad_norm_var": 5.047509872568856e-07, "learning_rate": 0.0005929300121661174, "loss": 2.344, "step": 23351 }, { "crossentropy": 2.487969398498535, "epoch": 0.8465777262180975, "grad_norm": 0.026507847011089325, "grad_norm_var": 5.022466054336687e-07, "learning_rate": 0.0005926555683163898, "loss": 2.5039, "step": 23352 }, { "crossentropy": 2.459540843963623, "epoch": 0.8466139791183295, "grad_norm": 0.02787896804511547, "grad_norm_var": 5.982858709183483e-07, "learning_rate": 0.0005923811839945359, "loss": 2.4686, "step": 23353 }, { "crossentropy": 2.5640294551849365, "epoch": 0.8466502320185615, "grad_norm": 0.026999859139323235, "grad_norm_var": 5.937617766017435e-07, "learning_rate": 0.0005921068592042644, "loss": 2.514, "step": 23354 }, { "crossentropy": 2.397449493408203, "epoch": 0.8466864849187935, "grad_norm": 0.02771870419383049, "grad_norm_var": 5.898978909620594e-07, "learning_rate": 0.0005918325939492775, "loss": 2.4293, "step": 23355 }, { "crossentropy": 2.3625335693359375, "epoch": 0.8467227378190255, "grad_norm": 0.027409272268414497, "grad_norm_var": 5.609761782456585e-07, "learning_rate": 0.0005915583882332809, "loss": 2.4381, "step": 23356 }, { "crossentropy": 2.5313618183135986, "epoch": 0.8467589907192575, "grad_norm": 0.026820417493581772, "grad_norm_var": 4.490719592211417e-07, "learning_rate": 0.0005912842420599796, "loss": 2.5239, "step": 23357 }, { "crossentropy": 2.54363751411438, "epoch": 0.8467952436194895, "grad_norm": 0.027140986174345016, "grad_norm_var": 3.967668211198588e-07, "learning_rate": 0.0005910101554330744, "loss": 2.5056, "step": 23358 }, { "crossentropy": 2.587355136871338, "epoch": 0.8468314965197216, "grad_norm": 0.026974303647875786, "grad_norm_var": 3.316891029326302e-07, "learning_rate": 0.0005907361283562684, "loss": 2.5192, "step": 23359 }, { "crossentropy": 2.5994906425476074, "epoch": 0.8468677494199536, "grad_norm": 0.02623118832707405, "grad_norm_var": 3.53830260502117e-07, "learning_rate": 0.0005904621608332617, "loss": 2.4916, "step": 23360 }, { "crossentropy": 2.443779706954956, "epoch": 0.8469040023201856, "grad_norm": 0.026233352720737457, "grad_norm_var": 3.595912128283394e-07, "learning_rate": 0.000590188252867756, "loss": 2.458, "step": 23361 }, { "crossentropy": 2.5170116424560547, "epoch": 0.8469402552204176, "grad_norm": 0.0266331247985363, "grad_norm_var": 3.503169251953351e-07, "learning_rate": 0.0005899144044634497, "loss": 2.4832, "step": 23362 }, { "crossentropy": 2.524190902709961, "epoch": 0.8469765081206496, "grad_norm": 0.026779286563396454, "grad_norm_var": 3.260127216034303e-07, "learning_rate": 0.0005896406156240413, "loss": 2.4931, "step": 23363 }, { "crossentropy": 2.470306634902954, "epoch": 0.8470127610208816, "grad_norm": 0.026526954025030136, "grad_norm_var": 3.1695157031487917e-07, "learning_rate": 0.0005893668863532292, "loss": 2.4536, "step": 23364 }, { "crossentropy": 2.467299699783325, "epoch": 0.8470490139211136, "grad_norm": 0.02649051323533058, "grad_norm_var": 3.1830543317927447e-07, "learning_rate": 0.0005890932166547119, "loss": 2.4245, "step": 23365 }, { "crossentropy": 2.4104878902435303, "epoch": 0.8470852668213457, "grad_norm": 0.026715928688645363, "grad_norm_var": 3.187108539918932e-07, "learning_rate": 0.0005888196065321838, "loss": 2.3614, "step": 23366 }, { "crossentropy": 2.4803249835968018, "epoch": 0.8471215197215777, "grad_norm": 0.026262056082487106, "grad_norm_var": 2.5162595973421815e-07, "learning_rate": 0.0005885460559893413, "loss": 2.5183, "step": 23367 }, { "crossentropy": 2.4256958961486816, "epoch": 0.8471577726218097, "grad_norm": 0.026853013783693314, "grad_norm_var": 2.441230136728991e-07, "learning_rate": 0.0005882725650298787, "loss": 2.4471, "step": 23368 }, { "crossentropy": 2.6224019527435303, "epoch": 0.8471940255220418, "grad_norm": 0.026419654488563538, "grad_norm_var": 1.7783724806184204e-07, "learning_rate": 0.0005879991336574909, "loss": 2.5341, "step": 23369 }, { "crossentropy": 2.4300901889801025, "epoch": 0.8472302784222738, "grad_norm": 0.026259006932377815, "grad_norm_var": 1.887479078773365e-07, "learning_rate": 0.0005877257618758697, "loss": 2.4431, "step": 23370 }, { "crossentropy": 2.37190580368042, "epoch": 0.8472665313225058, "grad_norm": 0.026015492156147957, "grad_norm_var": 1.4251402699696063e-07, "learning_rate": 0.0005874524496887074, "loss": 2.4119, "step": 23371 }, { "crossentropy": 2.2439510822296143, "epoch": 0.8473027842227379, "grad_norm": 0.02639426849782467, "grad_norm_var": 9.87734830205546e-08, "learning_rate": 0.0005871791970996964, "loss": 2.3617, "step": 23372 }, { "crossentropy": 2.524010181427002, "epoch": 0.8473390371229699, "grad_norm": 0.026058055460453033, "grad_norm_var": 1.0729022932181159e-07, "learning_rate": 0.0005869060041125274, "loss": 2.4988, "step": 23373 }, { "crossentropy": 2.345118999481201, "epoch": 0.8473752900232019, "grad_norm": 0.02618180401623249, "grad_norm_var": 8.271336849047008e-08, "learning_rate": 0.0005866328707308893, "loss": 2.3975, "step": 23374 }, { "crossentropy": 2.4970951080322266, "epoch": 0.8474115429234339, "grad_norm": 0.02674654871225357, "grad_norm_var": 6.970724652509823e-08, "learning_rate": 0.0005863597969584728, "loss": 2.3624, "step": 23375 }, { "crossentropy": 2.389026403427124, "epoch": 0.8474477958236659, "grad_norm": 0.027935121208429337, "grad_norm_var": 2.0713314449505245e-07, "learning_rate": 0.0005860867827989641, "loss": 2.4384, "step": 23376 }, { "crossentropy": 2.463390588760376, "epoch": 0.8474840487238979, "grad_norm": 0.02746305614709854, "grad_norm_var": 2.527575790837172e-07, "learning_rate": 0.0005858138282560515, "loss": 2.4799, "step": 23377 }, { "crossentropy": 2.398322820663452, "epoch": 0.84752030162413, "grad_norm": 0.02648293972015381, "grad_norm_var": 2.536715505377923e-07, "learning_rate": 0.0005855409333334244, "loss": 2.371, "step": 23378 }, { "crossentropy": 2.4655075073242188, "epoch": 0.847556554524362, "grad_norm": 0.02630576863884926, "grad_norm_var": 2.5630154460846466e-07, "learning_rate": 0.0005852680980347641, "loss": 2.4654, "step": 23379 }, { "crossentropy": 2.3890392780303955, "epoch": 0.847592807424594, "grad_norm": 0.026788143441081047, "grad_norm_var": 2.5908757207079857e-07, "learning_rate": 0.0005849953223637572, "loss": 2.3879, "step": 23380 }, { "crossentropy": 2.366111993789673, "epoch": 0.847629060324826, "grad_norm": 0.026851294562220573, "grad_norm_var": 2.6264337422373696e-07, "learning_rate": 0.0005847226063240901, "loss": 2.428, "step": 23381 }, { "crossentropy": 2.461843967437744, "epoch": 0.847665313225058, "grad_norm": 0.027380604296922684, "grad_norm_var": 2.997974863943575e-07, "learning_rate": 0.0005844499499194434, "loss": 2.5063, "step": 23382 }, { "crossentropy": 2.3845183849334717, "epoch": 0.84770156612529, "grad_norm": 0.0289474930614233, "grad_norm_var": 6.116851781158648e-07, "learning_rate": 0.0005841773531535016, "loss": 2.4657, "step": 23383 }, { "crossentropy": 2.401035785675049, "epoch": 0.847737819025522, "grad_norm": 0.026508226990699768, "grad_norm_var": 6.174889326327708e-07, "learning_rate": 0.0005839048160299454, "loss": 2.3059, "step": 23384 }, { "crossentropy": 2.350033760070801, "epoch": 0.847774071925754, "grad_norm": 0.02766084484755993, "grad_norm_var": 6.514760466952429e-07, "learning_rate": 0.0005836323385524561, "loss": 2.3615, "step": 23385 }, { "crossentropy": 2.452907085418701, "epoch": 0.8478103248259861, "grad_norm": 0.02685438096523285, "grad_norm_var": 6.24836767283684e-07, "learning_rate": 0.0005833599207247164, "loss": 2.4519, "step": 23386 }, { "crossentropy": 2.4741692543029785, "epoch": 0.8478465777262181, "grad_norm": 0.02775508351624012, "grad_norm_var": 6.062922446564556e-07, "learning_rate": 0.0005830875625504012, "loss": 2.5246, "step": 23387 }, { "crossentropy": 2.1678149700164795, "epoch": 0.8478828306264501, "grad_norm": 0.02612566389143467, "grad_norm_var": 6.33197187707698e-07, "learning_rate": 0.0005828152640331913, "loss": 2.3038, "step": 23388 }, { "crossentropy": 2.526808738708496, "epoch": 0.8479190835266821, "grad_norm": 0.027036326006054878, "grad_norm_var": 5.69779878245522e-07, "learning_rate": 0.0005825430251767661, "loss": 2.4677, "step": 23389 }, { "crossentropy": 2.2983970642089844, "epoch": 0.8479553364269141, "grad_norm": 0.024964606389403343, "grad_norm_var": 8.055451547824462e-07, "learning_rate": 0.0005822708459847992, "loss": 2.3434, "step": 23390 }, { "crossentropy": 2.521639108657837, "epoch": 0.8479915893271461, "grad_norm": 0.02641759067773819, "grad_norm_var": 8.22893601765129e-07, "learning_rate": 0.0005819987264609705, "loss": 2.575, "step": 23391 }, { "crossentropy": 2.422607421875, "epoch": 0.8480278422273781, "grad_norm": 0.02549113892018795, "grad_norm_var": 8.808378187232739e-07, "learning_rate": 0.0005817266666089521, "loss": 2.429, "step": 23392 }, { "crossentropy": 2.5060036182403564, "epoch": 0.8480640951276102, "grad_norm": 0.027311665937304497, "grad_norm_var": 8.691803818463665e-07, "learning_rate": 0.0005814546664324199, "loss": 2.4823, "step": 23393 }, { "crossentropy": 2.58005690574646, "epoch": 0.8481003480278422, "grad_norm": 0.026115594431757927, "grad_norm_var": 8.933940261316899e-07, "learning_rate": 0.0005811827259350505, "loss": 2.6197, "step": 23394 }, { "crossentropy": 2.292447090148926, "epoch": 0.8481366009280742, "grad_norm": 0.026121830567717552, "grad_norm_var": 9.071919318997627e-07, "learning_rate": 0.0005809108451205114, "loss": 2.3453, "step": 23395 }, { "crossentropy": 2.3092737197875977, "epoch": 0.8481728538283063, "grad_norm": 0.026962094008922577, "grad_norm_var": 9.094887111447813e-07, "learning_rate": 0.000580639023992478, "loss": 2.4346, "step": 23396 }, { "crossentropy": 2.2732832431793213, "epoch": 0.8482091067285383, "grad_norm": 0.025970354676246643, "grad_norm_var": 9.497974019472809e-07, "learning_rate": 0.000580367262554622, "loss": 2.3052, "step": 23397 }, { "crossentropy": 2.5046298503875732, "epoch": 0.8482453596287703, "grad_norm": 0.025606529787182808, "grad_norm_var": 9.917748028165218e-07, "learning_rate": 0.0005800955608106123, "loss": 2.5048, "step": 23398 }, { "crossentropy": 2.4264204502105713, "epoch": 0.8482816125290024, "grad_norm": 0.026643851771950722, "grad_norm_var": 6.071981179889361e-07, "learning_rate": 0.0005798239187641208, "loss": 2.4588, "step": 23399 }, { "crossentropy": 2.289602518081665, "epoch": 0.8483178654292344, "grad_norm": 0.026589540764689445, "grad_norm_var": 6.080083431849714e-07, "learning_rate": 0.0005795523364188133, "loss": 2.3591, "step": 23400 }, { "crossentropy": 2.4058916568756104, "epoch": 0.8483541183294664, "grad_norm": 0.025927897542715073, "grad_norm_var": 5.220929226792814e-07, "learning_rate": 0.0005792808137783606, "loss": 2.4156, "step": 23401 }, { "crossentropy": 2.4820125102996826, "epoch": 0.8483903712296984, "grad_norm": 0.02586848847568035, "grad_norm_var": 5.189565364447738e-07, "learning_rate": 0.0005790093508464295, "loss": 2.4564, "step": 23402 }, { "crossentropy": 2.486694812774658, "epoch": 0.8484266241299304, "grad_norm": 0.026017578318715096, "grad_norm_var": 3.721114307251675e-07, "learning_rate": 0.0005787379476266868, "loss": 2.3972, "step": 23403 }, { "crossentropy": 2.6090798377990723, "epoch": 0.8484628770301624, "grad_norm": 0.026847051456570625, "grad_norm_var": 3.976622378607427e-07, "learning_rate": 0.000578466604122796, "loss": 2.4903, "step": 23404 }, { "crossentropy": 2.3920202255249023, "epoch": 0.8484991299303944, "grad_norm": 0.026295840740203857, "grad_norm_var": 3.536315222960601e-07, "learning_rate": 0.0005781953203384249, "loss": 2.4432, "step": 23405 }, { "crossentropy": 2.483106851577759, "epoch": 0.8485353828306265, "grad_norm": 0.026975058019161224, "grad_norm_var": 2.759013469422663e-07, "learning_rate": 0.0005779240962772358, "loss": 2.5315, "step": 23406 }, { "crossentropy": 2.194542169570923, "epoch": 0.8485716357308585, "grad_norm": 0.027845576405525208, "grad_norm_var": 4.214278211516394e-07, "learning_rate": 0.0005776529319428931, "loss": 2.375, "step": 23407 }, { "crossentropy": 2.4696192741394043, "epoch": 0.8486078886310905, "grad_norm": 0.026775188744068146, "grad_norm_var": 3.6683970199590835e-07, "learning_rate": 0.0005773818273390575, "loss": 2.4928, "step": 23408 }, { "crossentropy": 2.420830488204956, "epoch": 0.8486441415313225, "grad_norm": 0.027026429772377014, "grad_norm_var": 3.4075665619002356e-07, "learning_rate": 0.0005771107824693922, "loss": 2.4407, "step": 23409 }, { "crossentropy": 2.4097208976745605, "epoch": 0.8486803944315545, "grad_norm": 0.026279741898179054, "grad_norm_var": 3.345897882913359e-07, "learning_rate": 0.0005768397973375588, "loss": 2.4428, "step": 23410 }, { "crossentropy": 2.4290883541107178, "epoch": 0.8487166473317865, "grad_norm": 0.027171475812792778, "grad_norm_var": 3.526837083570507e-07, "learning_rate": 0.0005765688719472162, "loss": 2.4567, "step": 23411 }, { "crossentropy": 2.408876895904541, "epoch": 0.8487529002320185, "grad_norm": 0.026302581652998924, "grad_norm_var": 3.4364584293257855e-07, "learning_rate": 0.0005762980063020229, "loss": 2.4783, "step": 23412 }, { "crossentropy": 2.399808406829834, "epoch": 0.8487891531322506, "grad_norm": 0.027323240414261818, "grad_norm_var": 3.608853503991871e-07, "learning_rate": 0.0005760272004056388, "loss": 2.4246, "step": 23413 }, { "crossentropy": 2.6244330406188965, "epoch": 0.8488254060324826, "grad_norm": 0.02664325200021267, "grad_norm_var": 2.9163076544300865e-07, "learning_rate": 0.0005757564542617205, "loss": 2.6371, "step": 23414 }, { "crossentropy": 2.381464719772339, "epoch": 0.8488616589327146, "grad_norm": 0.026495948433876038, "grad_norm_var": 2.932828956151891e-07, "learning_rate": 0.0005754857678739262, "loss": 2.4892, "step": 23415 }, { "crossentropy": 2.3804006576538086, "epoch": 0.8488979118329466, "grad_norm": 0.0262601301074028, "grad_norm_var": 3.026788354106834e-07, "learning_rate": 0.0005752151412459094, "loss": 2.4314, "step": 23416 }, { "crossentropy": 2.4172422885894775, "epoch": 0.8489341647331786, "grad_norm": 0.026132766157388687, "grad_norm_var": 2.8616539304021405e-07, "learning_rate": 0.0005749445743813275, "loss": 2.4089, "step": 23417 }, { "crossentropy": 2.533842086791992, "epoch": 0.8489704176334106, "grad_norm": 0.02834606170654297, "grad_norm_var": 4.145298337351744e-07, "learning_rate": 0.0005746740672838358, "loss": 2.5637, "step": 23418 }, { "crossentropy": 2.466770648956299, "epoch": 0.8490066705336426, "grad_norm": 0.026582395657896996, "grad_norm_var": 3.7583731139766095e-07, "learning_rate": 0.0005744036199570851, "loss": 2.3719, "step": 23419 }, { "crossentropy": 2.463005542755127, "epoch": 0.8490429234338747, "grad_norm": 0.026897985488176346, "grad_norm_var": 3.7610560197903533e-07, "learning_rate": 0.0005741332324047304, "loss": 2.435, "step": 23420 }, { "crossentropy": 2.3615543842315674, "epoch": 0.8490791763341067, "grad_norm": 0.02715154178440571, "grad_norm_var": 3.6040018384004593e-07, "learning_rate": 0.0005738629046304234, "loss": 2.3625, "step": 23421 }, { "crossentropy": 2.502924919128418, "epoch": 0.8491154292343387, "grad_norm": 0.02727031521499157, "grad_norm_var": 3.692726212880503e-07, "learning_rate": 0.000573592636637813, "loss": 2.5182, "step": 23422 }, { "crossentropy": 2.44230055809021, "epoch": 0.8491516821345708, "grad_norm": 0.026697812601923943, "grad_norm_var": 3.079020473742509e-07, "learning_rate": 0.0005733224284305527, "loss": 2.4383, "step": 23423 }, { "crossentropy": 2.5008528232574463, "epoch": 0.8491879350348028, "grad_norm": 0.025979436933994293, "grad_norm_var": 3.5380357222985064e-07, "learning_rate": 0.00057305228001229, "loss": 2.4728, "step": 23424 }, { "crossentropy": 2.4229605197906494, "epoch": 0.8492241879350348, "grad_norm": 0.02724168822169304, "grad_norm_var": 3.6362688993287336e-07, "learning_rate": 0.0005727821913866743, "loss": 2.4134, "step": 23425 }, { "crossentropy": 2.4149701595306396, "epoch": 0.8492604408352669, "grad_norm": 0.025660846382379532, "grad_norm_var": 4.303759063887617e-07, "learning_rate": 0.0005725121625573548, "loss": 2.4768, "step": 23426 }, { "crossentropy": 2.477423906326294, "epoch": 0.8492966937354989, "grad_norm": 0.0260182972997427, "grad_norm_var": 4.5019812646499014e-07, "learning_rate": 0.0005722421935279759, "loss": 2.4424, "step": 23427 }, { "crossentropy": 2.390991449356079, "epoch": 0.8493329466357309, "grad_norm": 0.027123937383294106, "grad_norm_var": 4.5017878605163366e-07, "learning_rate": 0.0005719722843021868, "loss": 2.3904, "step": 23428 }, { "crossentropy": 2.4106533527374268, "epoch": 0.8493691995359629, "grad_norm": 0.027385832741856575, "grad_norm_var": 4.552986471755815e-07, "learning_rate": 0.0005717024348836319, "loss": 2.4691, "step": 23429 }, { "crossentropy": 2.5288443565368652, "epoch": 0.8494054524361949, "grad_norm": 0.02668430283665657, "grad_norm_var": 4.54857920319226e-07, "learning_rate": 0.0005714326452759549, "loss": 2.5129, "step": 23430 }, { "crossentropy": 2.4853100776672363, "epoch": 0.8494417053364269, "grad_norm": 0.028332356363534927, "grad_norm_var": 6.045088755398381e-07, "learning_rate": 0.0005711629154828018, "loss": 2.4664, "step": 23431 }, { "crossentropy": 2.3982903957366943, "epoch": 0.849477958236659, "grad_norm": 0.026911698281764984, "grad_norm_var": 5.788975557799534e-07, "learning_rate": 0.000570893245507813, "loss": 2.4383, "step": 23432 }, { "crossentropy": 2.484781265258789, "epoch": 0.849514211136891, "grad_norm": 0.025956260040402412, "grad_norm_var": 5.989263114897572e-07, "learning_rate": 0.0005706236353546323, "loss": 2.518, "step": 23433 }, { "crossentropy": 2.455824851989746, "epoch": 0.849550464037123, "grad_norm": 0.025933528319001198, "grad_norm_var": 4.943386071977792e-07, "learning_rate": 0.0005703540850269029, "loss": 2.4839, "step": 23434 }, { "crossentropy": 2.5225906372070312, "epoch": 0.849586716937355, "grad_norm": 0.026515720412135124, "grad_norm_var": 4.96011027592659e-07, "learning_rate": 0.0005700845945282623, "loss": 2.4592, "step": 23435 }, { "crossentropy": 2.519974708557129, "epoch": 0.849622969837587, "grad_norm": 0.026977645233273506, "grad_norm_var": 4.981377141587025e-07, "learning_rate": 0.0005698151638623528, "loss": 2.4906, "step": 23436 }, { "crossentropy": 2.350820779800415, "epoch": 0.849659222737819, "grad_norm": 0.026927422732114792, "grad_norm_var": 4.889814131878485e-07, "learning_rate": 0.000569545793032813, "loss": 2.3927, "step": 23437 }, { "crossentropy": 2.4387054443359375, "epoch": 0.849695475638051, "grad_norm": 0.0267778467386961, "grad_norm_var": 4.68402678649897e-07, "learning_rate": 0.0005692764820432794, "loss": 2.5006, "step": 23438 }, { "crossentropy": 2.408569812774658, "epoch": 0.849731728538283, "grad_norm": 0.026253292337059975, "grad_norm_var": 4.80603029544381e-07, "learning_rate": 0.0005690072308973914, "loss": 2.4951, "step": 23439 }, { "crossentropy": 2.53942608833313, "epoch": 0.8497679814385151, "grad_norm": 0.025919156149029732, "grad_norm_var": 4.863604609804557e-07, "learning_rate": 0.0005687380395987846, "loss": 2.4504, "step": 23440 }, { "crossentropy": 2.4343037605285645, "epoch": 0.8498042343387471, "grad_norm": 0.026302512735128403, "grad_norm_var": 4.691159160925713e-07, "learning_rate": 0.0005684689081510952, "loss": 2.388, "step": 23441 }, { "crossentropy": 2.515939235687256, "epoch": 0.8498404872389791, "grad_norm": 0.026852425187826157, "grad_norm_var": 4.0784619672046045e-07, "learning_rate": 0.0005681998365579593, "loss": 2.4647, "step": 23442 }, { "crossentropy": 2.4955155849456787, "epoch": 0.8498767401392111, "grad_norm": 0.027426451444625854, "grad_norm_var": 4.076311961195006e-07, "learning_rate": 0.0005679308248230086, "loss": 2.4696, "step": 23443 }, { "crossentropy": 2.39101505279541, "epoch": 0.8499129930394431, "grad_norm": 0.025914154946804047, "grad_norm_var": 4.4161357219818277e-07, "learning_rate": 0.0005676618729498795, "loss": 2.3823, "step": 23444 }, { "crossentropy": 2.416957139968872, "epoch": 0.8499492459396751, "grad_norm": 0.02704726718366146, "grad_norm_var": 4.174527561580377e-07, "learning_rate": 0.0005673929809422029, "loss": 2.3899, "step": 23445 }, { "crossentropy": 2.4495861530303955, "epoch": 0.8499854988399071, "grad_norm": 0.02689584344625473, "grad_norm_var": 4.206317867058139e-07, "learning_rate": 0.00056712414880361, "loss": 2.4608, "step": 23446 }, { "crossentropy": 2.4629666805267334, "epoch": 0.8500217517401392, "grad_norm": 0.026441117748618126, "grad_norm_var": 2.2851613303124092e-07, "learning_rate": 0.0005668553765377338, "loss": 2.4253, "step": 23447 }, { "crossentropy": 2.4170498847961426, "epoch": 0.8500580046403712, "grad_norm": 0.02674291841685772, "grad_norm_var": 2.2251181664428624e-07, "learning_rate": 0.000566586664148202, "loss": 2.3858, "step": 23448 }, { "crossentropy": 2.3598151206970215, "epoch": 0.8500942575406032, "grad_norm": 0.027833567932248116, "grad_norm_var": 2.928546491660795e-07, "learning_rate": 0.0005663180116386451, "loss": 2.4108, "step": 23449 }, { "crossentropy": 2.452073812484741, "epoch": 0.8501305104408353, "grad_norm": 0.02593008615076542, "grad_norm_var": 2.9319456999506397e-07, "learning_rate": 0.000566049419012693, "loss": 2.4162, "step": 23450 }, { "crossentropy": 2.306269407272339, "epoch": 0.8501667633410673, "grad_norm": 0.02659425139427185, "grad_norm_var": 2.9194009001745567e-07, "learning_rate": 0.0005657808862739711, "loss": 2.2539, "step": 23451 }, { "crossentropy": 2.5043561458587646, "epoch": 0.8502030162412993, "grad_norm": 0.026112213730812073, "grad_norm_var": 3.0408766386596833e-07, "learning_rate": 0.0005655124134261086, "loss": 2.4631, "step": 23452 }, { "crossentropy": 2.5005319118499756, "epoch": 0.8502392691415314, "grad_norm": 0.02843710221350193, "grad_norm_var": 5.077790486728336e-07, "learning_rate": 0.0005652440004727305, "loss": 2.4877, "step": 23453 }, { "crossentropy": 2.431670665740967, "epoch": 0.8502755220417634, "grad_norm": 0.02662079595029354, "grad_norm_var": 5.080572129067256e-07, "learning_rate": 0.0005649756474174611, "loss": 2.4269, "step": 23454 }, { "crossentropy": 2.5223357677459717, "epoch": 0.8503117749419954, "grad_norm": 0.02681945264339447, "grad_norm_var": 4.937886649921304e-07, "learning_rate": 0.0005647073542639275, "loss": 2.3813, "step": 23455 }, { "crossentropy": 2.426302433013916, "epoch": 0.8503480278422274, "grad_norm": 0.02545376867055893, "grad_norm_var": 5.584512494134399e-07, "learning_rate": 0.0005644391210157502, "loss": 2.4662, "step": 23456 }, { "crossentropy": 2.4663262367248535, "epoch": 0.8503842807424594, "grad_norm": 0.02726299874484539, "grad_norm_var": 5.634131124502192e-07, "learning_rate": 0.0005641709476765539, "loss": 2.4954, "step": 23457 }, { "crossentropy": 2.2429330348968506, "epoch": 0.8504205336426914, "grad_norm": 0.02627895213663578, "grad_norm_var": 5.779729274370809e-07, "learning_rate": 0.0005639028342499619, "loss": 2.3154, "step": 23458 }, { "crossentropy": 2.5099422931671143, "epoch": 0.8504567865429234, "grad_norm": 0.026422370225191116, "grad_norm_var": 5.488405884510137e-07, "learning_rate": 0.0005636347807395925, "loss": 2.5513, "step": 23459 }, { "crossentropy": 2.396230459213257, "epoch": 0.8504930394431555, "grad_norm": 0.026759499683976173, "grad_norm_var": 5.076983866873659e-07, "learning_rate": 0.0005633667871490694, "loss": 2.4167, "step": 23460 }, { "crossentropy": 2.327479839324951, "epoch": 0.8505292923433875, "grad_norm": 0.026570934802293777, "grad_norm_var": 5.016188937613144e-07, "learning_rate": 0.0005630988534820097, "loss": 2.4462, "step": 23461 }, { "crossentropy": 2.413686990737915, "epoch": 0.8505655452436195, "grad_norm": 0.026210596784949303, "grad_norm_var": 5.129353352804899e-07, "learning_rate": 0.0005628309797420339, "loss": 2.3767, "step": 23462 }, { "crossentropy": 2.4868173599243164, "epoch": 0.8506017981438515, "grad_norm": 0.02646242268383503, "grad_norm_var": 5.123542509421463e-07, "learning_rate": 0.0005625631659327585, "loss": 2.4453, "step": 23463 }, { "crossentropy": 2.357398271560669, "epoch": 0.8506380510440835, "grad_norm": 0.027231866493821144, "grad_norm_var": 5.328977021754055e-07, "learning_rate": 0.0005622954120578028, "loss": 2.3589, "step": 23464 }, { "crossentropy": 2.6030702590942383, "epoch": 0.8506743039443155, "grad_norm": 0.02842041291296482, "grad_norm_var": 6.440928154070879e-07, "learning_rate": 0.0005620277181207806, "loss": 2.5019, "step": 23465 }, { "crossentropy": 2.3607349395751953, "epoch": 0.8507105568445475, "grad_norm": 0.027140123769640923, "grad_norm_var": 6.074784416970046e-07, "learning_rate": 0.0005617600841253101, "loss": 2.3637, "step": 23466 }, { "crossentropy": 2.447737216949463, "epoch": 0.8507468097447796, "grad_norm": 0.026085995137691498, "grad_norm_var": 6.375573114960596e-07, "learning_rate": 0.0005614925100750041, "loss": 2.3912, "step": 23467 }, { "crossentropy": 2.4201278686523438, "epoch": 0.8507830626450116, "grad_norm": 0.02729765512049198, "grad_norm_var": 6.217190591191323e-07, "learning_rate": 0.0005612249959734783, "loss": 2.3982, "step": 23468 }, { "crossentropy": 2.471747875213623, "epoch": 0.8508193155452436, "grad_norm": 0.026946378871798515, "grad_norm_var": 4.4359920571833355e-07, "learning_rate": 0.000560957541824344, "loss": 2.4955, "step": 23469 }, { "crossentropy": 2.313504695892334, "epoch": 0.8508555684454756, "grad_norm": 0.02697048895061016, "grad_norm_var": 4.452637679303589e-07, "learning_rate": 0.0005606901476312154, "loss": 2.3753, "step": 23470 }, { "crossentropy": 2.391900062561035, "epoch": 0.8508918213457076, "grad_norm": 0.026702668517827988, "grad_norm_var": 4.453596830053154e-07, "learning_rate": 0.0005604228133977024, "loss": 2.5122, "step": 23471 }, { "crossentropy": 2.4842562675476074, "epoch": 0.8509280742459396, "grad_norm": 0.026601115241646767, "grad_norm_var": 3.27262004945252e-07, "learning_rate": 0.0005601555391274176, "loss": 2.5097, "step": 23472 }, { "crossentropy": 2.4592435359954834, "epoch": 0.8509643271461717, "grad_norm": 0.02990264631807804, "grad_norm_var": 9.132826097351642e-07, "learning_rate": 0.0005598883248239689, "loss": 2.4265, "step": 23473 }, { "crossentropy": 2.4140493869781494, "epoch": 0.8510005800464037, "grad_norm": 0.025762785226106644, "grad_norm_var": 9.795762715781535e-07, "learning_rate": 0.0005596211704909676, "loss": 2.4888, "step": 23474 }, { "crossentropy": 2.49741792678833, "epoch": 0.8510368329466357, "grad_norm": 0.027391770854592323, "grad_norm_var": 9.677856745446758e-07, "learning_rate": 0.0005593540761320198, "loss": 2.4668, "step": 23475 }, { "crossentropy": 2.403857707977295, "epoch": 0.8510730858468677, "grad_norm": 0.02566242590546608, "grad_norm_var": 1.08236972604322e-06, "learning_rate": 0.0005590870417507344, "loss": 2.4489, "step": 23476 }, { "crossentropy": 2.1694540977478027, "epoch": 0.8511093387470998, "grad_norm": 0.027417395263910294, "grad_norm_var": 1.0832382135716414e-06, "learning_rate": 0.0005588200673507188, "loss": 2.2587, "step": 23477 }, { "crossentropy": 2.3312108516693115, "epoch": 0.8511455916473318, "grad_norm": 0.02682020701467991, "grad_norm_var": 1.0412506853104634e-06, "learning_rate": 0.0005585531529355775, "loss": 2.4008, "step": 23478 }, { "crossentropy": 2.6268253326416016, "epoch": 0.8511818445475638, "grad_norm": 0.026410769671201706, "grad_norm_var": 1.0454711638439746e-06, "learning_rate": 0.0005582862985089155, "loss": 2.4695, "step": 23479 }, { "crossentropy": 2.5187604427337646, "epoch": 0.8512180974477959, "grad_norm": 0.026801183819770813, "grad_norm_var": 1.0464938967474769e-06, "learning_rate": 0.000558019504074338, "loss": 2.4264, "step": 23480 }, { "crossentropy": 2.4066991806030273, "epoch": 0.8512543503480279, "grad_norm": 0.027186445891857147, "grad_norm_var": 9.11396808263761e-07, "learning_rate": 0.0005577527696354473, "loss": 2.4797, "step": 23481 }, { "crossentropy": 2.4318180084228516, "epoch": 0.8512906032482599, "grad_norm": 0.025559455156326294, "grad_norm_var": 1.0261677054990467e-06, "learning_rate": 0.0005574860951958482, "loss": 2.3878, "step": 23482 }, { "crossentropy": 2.4546680450439453, "epoch": 0.8513268561484919, "grad_norm": 0.02666538581252098, "grad_norm_var": 9.88516800616669e-07, "learning_rate": 0.0005572194807591396, "loss": 2.443, "step": 23483 }, { "crossentropy": 2.3428232669830322, "epoch": 0.8513631090487239, "grad_norm": 0.025773128494620323, "grad_norm_var": 1.0491198516907833e-06, "learning_rate": 0.0005569529263289247, "loss": 2.3526, "step": 23484 }, { "crossentropy": 2.6318280696868896, "epoch": 0.8513993619489559, "grad_norm": 0.02653811126947403, "grad_norm_var": 1.0508012204284965e-06, "learning_rate": 0.0005566864319088033, "loss": 2.5739, "step": 23485 }, { "crossentropy": 2.2999656200408936, "epoch": 0.851435614849188, "grad_norm": 0.02838916890323162, "grad_norm_var": 1.2163368103408507e-06, "learning_rate": 0.0005564199975023743, "loss": 2.4537, "step": 23486 }, { "crossentropy": 2.500535249710083, "epoch": 0.85147186774942, "grad_norm": 0.028004217892885208, "grad_norm_var": 1.2968121856207192e-06, "learning_rate": 0.0005561536231132369, "loss": 2.4979, "step": 23487 }, { "crossentropy": 2.4639527797698975, "epoch": 0.851508120649652, "grad_norm": 0.02632831782102585, "grad_norm_var": 1.3134399829861498e-06, "learning_rate": 0.0005558873087449889, "loss": 2.4193, "step": 23488 }, { "crossentropy": 2.457669258117676, "epoch": 0.851544373549884, "grad_norm": 0.026078196242451668, "grad_norm_var": 7.032633085327638e-07, "learning_rate": 0.0005556210544012263, "loss": 2.4585, "step": 23489 }, { "crossentropy": 2.5664374828338623, "epoch": 0.851580626450116, "grad_norm": 0.026332780718803406, "grad_norm_var": 6.542938774848807e-07, "learning_rate": 0.0005553548600855468, "loss": 2.4938, "step": 23490 }, { "crossentropy": 2.3183023929595947, "epoch": 0.851616879350348, "grad_norm": 0.02671704813838005, "grad_norm_var": 6.214070344482311e-07, "learning_rate": 0.0005550887258015436, "loss": 2.4202, "step": 23491 }, { "crossentropy": 2.446915626525879, "epoch": 0.85165313225058, "grad_norm": 0.026350799947977066, "grad_norm_var": 5.587499773462382e-07, "learning_rate": 0.0005548226515528132, "loss": 2.3777, "step": 23492 }, { "crossentropy": 2.4641060829162598, "epoch": 0.851689385150812, "grad_norm": 0.026480229571461678, "grad_norm_var": 5.253480685544667e-07, "learning_rate": 0.0005545566373429489, "loss": 2.4748, "step": 23493 }, { "crossentropy": 2.489577531814575, "epoch": 0.8517256380510441, "grad_norm": 0.02626643516123295, "grad_norm_var": 5.321106521283808e-07, "learning_rate": 0.0005542906831755429, "loss": 2.4695, "step": 23494 }, { "crossentropy": 2.3842973709106445, "epoch": 0.8517618909512761, "grad_norm": 0.027114832773804665, "grad_norm_var": 5.436755544887516e-07, "learning_rate": 0.0005540247890541888, "loss": 2.4134, "step": 23495 }, { "crossentropy": 2.550687074661255, "epoch": 0.8517981438515081, "grad_norm": 0.026526952162384987, "grad_norm_var": 5.432722829647503e-07, "learning_rate": 0.0005537589549824768, "loss": 2.5095, "step": 23496 }, { "crossentropy": 2.423361301422119, "epoch": 0.8518343967517401, "grad_norm": 0.024937083944678307, "grad_norm_var": 6.969521831910625e-07, "learning_rate": 0.0005534931809639965, "loss": 2.3409, "step": 23497 }, { "crossentropy": 2.310934066772461, "epoch": 0.8518706496519721, "grad_norm": 0.025734905153512955, "grad_norm_var": 6.767827636237437e-07, "learning_rate": 0.0005532274670023396, "loss": 2.3712, "step": 23498 }, { "crossentropy": 2.554896116256714, "epoch": 0.8519069025522041, "grad_norm": 0.026377489790320396, "grad_norm_var": 6.761845185076678e-07, "learning_rate": 0.0005529618131010933, "loss": 2.5403, "step": 23499 }, { "crossentropy": 2.343088150024414, "epoch": 0.8519431554524362, "grad_norm": 0.027372151613235474, "grad_norm_var": 6.816882403832062e-07, "learning_rate": 0.0005526962192638458, "loss": 2.5324, "step": 23500 }, { "crossentropy": 2.4105403423309326, "epoch": 0.8519794083526682, "grad_norm": 0.026208480820059776, "grad_norm_var": 6.910584505875287e-07, "learning_rate": 0.0005524306854941863, "loss": 2.45, "step": 23501 }, { "crossentropy": 2.552757740020752, "epoch": 0.8520156612529002, "grad_norm": 0.026178695261478424, "grad_norm_var": 4.621075474275348e-07, "learning_rate": 0.0005521652117956988, "loss": 2.4224, "step": 23502 }, { "crossentropy": 2.349189043045044, "epoch": 0.8520519141531323, "grad_norm": 0.025418676435947418, "grad_norm_var": 3.3999870845772644e-07, "learning_rate": 0.0005518997981719704, "loss": 2.37, "step": 23503 }, { "crossentropy": 2.4593987464904785, "epoch": 0.8520881670533643, "grad_norm": 0.026886293664574623, "grad_norm_var": 3.63316651043087e-07, "learning_rate": 0.0005516344446265858, "loss": 2.3842, "step": 23504 }, { "crossentropy": 2.5097246170043945, "epoch": 0.8521244199535963, "grad_norm": 0.026708047837018967, "grad_norm_var": 3.685337928322277e-07, "learning_rate": 0.000551369151163128, "loss": 2.4545, "step": 23505 }, { "crossentropy": 2.612163782119751, "epoch": 0.8521606728538283, "grad_norm": 0.025892766192555428, "grad_norm_var": 3.81684801937819e-07, "learning_rate": 0.0005511039177851818, "loss": 2.4816, "step": 23506 }, { "crossentropy": 2.3147716522216797, "epoch": 0.8521969257540604, "grad_norm": 0.02636227011680603, "grad_norm_var": 3.7092010774276e-07, "learning_rate": 0.000550838744496327, "loss": 2.3578, "step": 23507 }, { "crossentropy": 2.4436988830566406, "epoch": 0.8522331786542924, "grad_norm": 0.02658216468989849, "grad_norm_var": 3.758017578157797e-07, "learning_rate": 0.0005505736313001475, "loss": 2.4114, "step": 23508 }, { "crossentropy": 2.482713222503662, "epoch": 0.8522694315545244, "grad_norm": 0.027149634435772896, "grad_norm_var": 4.1851388512161256e-07, "learning_rate": 0.0005503085782002237, "loss": 2.4435, "step": 23509 }, { "crossentropy": 2.504208564758301, "epoch": 0.8523056844547564, "grad_norm": 0.02663956768810749, "grad_norm_var": 4.2269476194506155e-07, "learning_rate": 0.000550043585200135, "loss": 2.4856, "step": 23510 }, { "crossentropy": 2.4944052696228027, "epoch": 0.8523419373549884, "grad_norm": 0.025925569236278534, "grad_norm_var": 3.9466941714609516e-07, "learning_rate": 0.0005497786523034609, "loss": 2.3783, "step": 23511 }, { "crossentropy": 2.343127727508545, "epoch": 0.8523781902552204, "grad_norm": 0.025784986093640327, "grad_norm_var": 4.072473493574863e-07, "learning_rate": 0.00054951377951378, "loss": 2.3464, "step": 23512 }, { "crossentropy": 2.4243292808532715, "epoch": 0.8524144431554525, "grad_norm": 0.027433598414063454, "grad_norm_var": 3.564519898566065e-07, "learning_rate": 0.0005492489668346684, "loss": 2.4898, "step": 23513 }, { "crossentropy": 2.3898110389709473, "epoch": 0.8524506960556845, "grad_norm": 0.027434052899479866, "grad_norm_var": 3.8260177607490665e-07, "learning_rate": 0.0005489842142697049, "loss": 2.376, "step": 23514 }, { "crossentropy": 2.5129480361938477, "epoch": 0.8524869489559165, "grad_norm": 0.025831112638115883, "grad_norm_var": 4.117985191430991e-07, "learning_rate": 0.0005487195218224628, "loss": 2.4903, "step": 23515 }, { "crossentropy": 2.5664284229278564, "epoch": 0.8525232018561485, "grad_norm": 0.02670624852180481, "grad_norm_var": 3.6101184063599043e-07, "learning_rate": 0.0005484548894965192, "loss": 2.4798, "step": 23516 }, { "crossentropy": 2.361086130142212, "epoch": 0.8525594547563805, "grad_norm": 0.02548954077064991, "grad_norm_var": 4.161217239820672e-07, "learning_rate": 0.0005481903172954483, "loss": 2.404, "step": 23517 }, { "crossentropy": 2.5036377906799316, "epoch": 0.8525957076566125, "grad_norm": 0.026068130508065224, "grad_norm_var": 4.201696252458294e-07, "learning_rate": 0.000547925805222822, "loss": 2.393, "step": 23518 }, { "crossentropy": 2.453038454055786, "epoch": 0.8526319605568445, "grad_norm": 0.02609906531870365, "grad_norm_var": 3.605736795151785e-07, "learning_rate": 0.000547661353282215, "loss": 2.4571, "step": 23519 }, { "crossentropy": 2.4708969593048096, "epoch": 0.8526682134570766, "grad_norm": 0.026721296831965446, "grad_norm_var": 3.523923481953013e-07, "learning_rate": 0.0005473969614771984, "loss": 2.4325, "step": 23520 }, { "crossentropy": 2.5111072063446045, "epoch": 0.8527044663573086, "grad_norm": 0.026538800448179245, "grad_norm_var": 3.4783485763535915e-07, "learning_rate": 0.0005471326298113416, "loss": 2.4305, "step": 23521 }, { "crossentropy": 2.491159677505493, "epoch": 0.8527407192575406, "grad_norm": 0.02624323219060898, "grad_norm_var": 3.3105322924628584e-07, "learning_rate": 0.0005468683582882173, "loss": 2.5157, "step": 23522 }, { "crossentropy": 2.4653313159942627, "epoch": 0.8527769721577726, "grad_norm": 0.026324748992919922, "grad_norm_var": 3.31520478977322e-07, "learning_rate": 0.0005466041469113925, "loss": 2.4236, "step": 23523 }, { "crossentropy": 2.4878647327423096, "epoch": 0.8528132250580046, "grad_norm": 0.026777416467666626, "grad_norm_var": 3.3771528695301226e-07, "learning_rate": 0.0005463399956844373, "loss": 2.4558, "step": 23524 }, { "crossentropy": 2.589266538619995, "epoch": 0.8528494779582366, "grad_norm": 0.026858346536755562, "grad_norm_var": 3.157655536720552e-07, "learning_rate": 0.0005460759046109198, "loss": 2.5215, "step": 23525 }, { "crossentropy": 2.482107639312744, "epoch": 0.8528857308584686, "grad_norm": 0.02655179239809513, "grad_norm_var": 3.137913010398466e-07, "learning_rate": 0.0005458118736944051, "loss": 2.4747, "step": 23526 }, { "crossentropy": 2.3568968772888184, "epoch": 0.8529219837587007, "grad_norm": 0.027753612026572227, "grad_norm_var": 4.0110304918732944e-07, "learning_rate": 0.0005455479029384619, "loss": 2.4573, "step": 23527 }, { "crossentropy": 2.3171091079711914, "epoch": 0.8529582366589327, "grad_norm": 0.02660243771970272, "grad_norm_var": 3.6073922005118844e-07, "learning_rate": 0.0005452839923466529, "loss": 2.4365, "step": 23528 }, { "crossentropy": 2.4121787548065186, "epoch": 0.8529944895591647, "grad_norm": 0.02602895349264145, "grad_norm_var": 3.259823931182788e-07, "learning_rate": 0.0005450201419225453, "loss": 2.4762, "step": 23529 }, { "crossentropy": 2.4696972370147705, "epoch": 0.8530307424593968, "grad_norm": 0.02811446227133274, "grad_norm_var": 4.3949241755643585e-07, "learning_rate": 0.0005447563516697007, "loss": 2.4637, "step": 23530 }, { "crossentropy": 2.393744468688965, "epoch": 0.8530669953596288, "grad_norm": 0.027883974835276604, "grad_norm_var": 5.076657737880056e-07, "learning_rate": 0.0005444926215916824, "loss": 2.408, "step": 23531 }, { "crossentropy": 2.4581964015960693, "epoch": 0.8531032482598608, "grad_norm": 0.02710934914648533, "grad_norm_var": 5.196283616054245e-07, "learning_rate": 0.0005442289516920518, "loss": 2.4796, "step": 23532 }, { "crossentropy": 2.4770941734313965, "epoch": 0.8531395011600929, "grad_norm": 0.02653675340116024, "grad_norm_var": 4.1945887024444757e-07, "learning_rate": 0.0005439653419743729, "loss": 2.5033, "step": 23533 }, { "crossentropy": 2.5470597743988037, "epoch": 0.8531757540603249, "grad_norm": 0.028280921280384064, "grad_norm_var": 5.203924877585896e-07, "learning_rate": 0.0005437017924422027, "loss": 2.4722, "step": 23534 }, { "crossentropy": 2.6140894889831543, "epoch": 0.8532120069605569, "grad_norm": 0.026177389547228813, "grad_norm_var": 5.123951359167767e-07, "learning_rate": 0.0005434383030991041, "loss": 2.5379, "step": 23535 }, { "crossentropy": 2.624067783355713, "epoch": 0.8532482598607889, "grad_norm": 0.02697046473622322, "grad_norm_var": 5.101235988073099e-07, "learning_rate": 0.0005431748739486331, "loss": 2.4777, "step": 23536 }, { "crossentropy": 2.444518566131592, "epoch": 0.8532845127610209, "grad_norm": 0.027757637202739716, "grad_norm_var": 5.406902708969396e-07, "learning_rate": 0.0005429115049943495, "loss": 2.5373, "step": 23537 }, { "crossentropy": 2.6241042613983154, "epoch": 0.8533207656612529, "grad_norm": 0.02650337480008602, "grad_norm_var": 5.187327006286879e-07, "learning_rate": 0.00054264819623981, "loss": 2.4931, "step": 23538 }, { "crossentropy": 2.573749303817749, "epoch": 0.8533570185614849, "grad_norm": 0.02650979720056057, "grad_norm_var": 5.03855149476641e-07, "learning_rate": 0.00054238494768857, "loss": 2.509, "step": 23539 }, { "crossentropy": 2.521519422531128, "epoch": 0.853393271461717, "grad_norm": 0.027210146188735962, "grad_norm_var": 5.012135268337894e-07, "learning_rate": 0.0005421217593441863, "loss": 2.4737, "step": 23540 }, { "crossentropy": 2.388277769088745, "epoch": 0.853429524361949, "grad_norm": 0.026229195296764374, "grad_norm_var": 5.422892515001438e-07, "learning_rate": 0.0005418586312102142, "loss": 2.4333, "step": 23541 }, { "crossentropy": 2.560396671295166, "epoch": 0.853465777262181, "grad_norm": 0.026682715862989426, "grad_norm_var": 5.352961304350926e-07, "learning_rate": 0.0005415955632902054, "loss": 2.4511, "step": 23542 }, { "crossentropy": 2.5413665771484375, "epoch": 0.853502030162413, "grad_norm": 0.026599744334816933, "grad_norm_var": 5.059436620840577e-07, "learning_rate": 0.0005413325555877158, "loss": 2.4698, "step": 23543 }, { "crossentropy": 2.3559865951538086, "epoch": 0.853538283062645, "grad_norm": 0.02678193710744381, "grad_norm_var": 4.996431312689291e-07, "learning_rate": 0.000541069608106295, "loss": 2.4252, "step": 23544 }, { "crossentropy": 2.3998191356658936, "epoch": 0.853574535962877, "grad_norm": 0.02740495093166828, "grad_norm_var": 4.4697017829298017e-07, "learning_rate": 0.0005408067208494954, "loss": 2.4778, "step": 23545 }, { "crossentropy": 2.419546127319336, "epoch": 0.853610788863109, "grad_norm": 0.026861779391765594, "grad_norm_var": 3.667623487072034e-07, "learning_rate": 0.0005405438938208706, "loss": 2.3935, "step": 23546 }, { "crossentropy": 2.4410760402679443, "epoch": 0.853647041763341, "grad_norm": 0.027616066858172417, "grad_norm_var": 3.3855575496386826e-07, "learning_rate": 0.000540281127023966, "loss": 2.462, "step": 23547 }, { "crossentropy": 2.502908229827881, "epoch": 0.8536832946635731, "grad_norm": 0.026749804615974426, "grad_norm_var": 3.390927374835595e-07, "learning_rate": 0.0005400184204623327, "loss": 2.518, "step": 23548 }, { "crossentropy": 2.4055569171905518, "epoch": 0.8537195475638051, "grad_norm": 0.026189830154180527, "grad_norm_var": 3.647839902014832e-07, "learning_rate": 0.0005397557741395198, "loss": 2.393, "step": 23549 }, { "crossentropy": 2.4571032524108887, "epoch": 0.8537558004640371, "grad_norm": 0.026948267593979836, "grad_norm_var": 2.3180644889963148e-07, "learning_rate": 0.0005394931880590731, "loss": 2.3667, "step": 23550 }, { "crossentropy": 2.3864359855651855, "epoch": 0.8537920533642691, "grad_norm": 0.026102617383003235, "grad_norm_var": 2.3860801177075173e-07, "learning_rate": 0.0005392306622245407, "loss": 2.2859, "step": 23551 }, { "crossentropy": 2.391139268875122, "epoch": 0.8538283062645011, "grad_norm": 0.02583185024559498, "grad_norm_var": 2.9677700384399726e-07, "learning_rate": 0.0005389681966394667, "loss": 2.3512, "step": 23552 }, { "crossentropy": 2.5365161895751953, "epoch": 0.8538645591647331, "grad_norm": 0.026674343273043633, "grad_norm_var": 2.2439694017729825e-07, "learning_rate": 0.0005387057913073967, "loss": 2.4771, "step": 23553 }, { "crossentropy": 2.2177786827087402, "epoch": 0.8539008120649652, "grad_norm": 0.026310088112950325, "grad_norm_var": 2.313102808324524e-07, "learning_rate": 0.0005384434462318777, "loss": 2.3722, "step": 23554 }, { "crossentropy": 2.4967727661132812, "epoch": 0.8539370649651972, "grad_norm": 0.02568097785115242, "grad_norm_var": 2.918315293723902e-07, "learning_rate": 0.0005381811614164483, "loss": 2.4765, "step": 23555 }, { "crossentropy": 2.5068752765655518, "epoch": 0.8539733178654292, "grad_norm": 0.026814904063940048, "grad_norm_var": 2.7034449466689415e-07, "learning_rate": 0.0005379189368646536, "loss": 2.5412, "step": 23556 }, { "crossentropy": 2.266934394836426, "epoch": 0.8540095707656613, "grad_norm": 0.026220010593533516, "grad_norm_var": 2.707946089285414e-07, "learning_rate": 0.0005376567725800364, "loss": 2.3339, "step": 23557 }, { "crossentropy": 2.600632429122925, "epoch": 0.8540458236658933, "grad_norm": 0.025914151221513748, "grad_norm_var": 2.984031761335172e-07, "learning_rate": 0.0005373946685661346, "loss": 2.4417, "step": 23558 }, { "crossentropy": 2.4891462326049805, "epoch": 0.8540820765661253, "grad_norm": 0.026049131527543068, "grad_norm_var": 3.1324682758803304e-07, "learning_rate": 0.0005371326248264913, "loss": 2.4227, "step": 23559 }, { "crossentropy": 2.410731792449951, "epoch": 0.8541183294663574, "grad_norm": 0.026745876297354698, "grad_norm_var": 3.120178070499148e-07, "learning_rate": 0.0005368706413646441, "loss": 2.3455, "step": 23560 }, { "crossentropy": 2.53887939453125, "epoch": 0.8541545823665894, "grad_norm": 0.026889372617006302, "grad_norm_var": 2.669144685762496e-07, "learning_rate": 0.0005366087181841311, "loss": 2.5707, "step": 23561 }, { "crossentropy": 2.3301172256469727, "epoch": 0.8541908352668214, "grad_norm": 0.02564246393740177, "grad_norm_var": 2.9694486128200153e-07, "learning_rate": 0.0005363468552884932, "loss": 2.3847, "step": 23562 }, { "crossentropy": 2.355382204055786, "epoch": 0.8542270881670534, "grad_norm": 0.026398317888379097, "grad_norm_var": 1.9197290996978272e-07, "learning_rate": 0.0005360850526812627, "loss": 2.3691, "step": 23563 }, { "crossentropy": 2.3216705322265625, "epoch": 0.8542633410672854, "grad_norm": 0.027669942006468773, "grad_norm_var": 2.9729717944306184e-07, "learning_rate": 0.0005358233103659782, "loss": 2.3816, "step": 23564 }, { "crossentropy": 2.315265417098999, "epoch": 0.8542995939675174, "grad_norm": 0.025923285633325577, "grad_norm_var": 3.0850081465013064e-07, "learning_rate": 0.000535561628346175, "loss": 2.4235, "step": 23565 }, { "crossentropy": 2.4189395904541016, "epoch": 0.8543358468677494, "grad_norm": 0.02672555297613144, "grad_norm_var": 2.942353464579211e-07, "learning_rate": 0.0005353000066253866, "loss": 2.448, "step": 23566 }, { "crossentropy": 2.409959554672241, "epoch": 0.8543720997679815, "grad_norm": 0.02666977420449257, "grad_norm_var": 2.9566585142182647e-07, "learning_rate": 0.0005350384452071478, "loss": 2.4753, "step": 23567 }, { "crossentropy": 2.3919665813446045, "epoch": 0.8544083526682135, "grad_norm": 0.025494489818811417, "grad_norm_var": 3.276606693229193e-07, "learning_rate": 0.0005347769440949895, "loss": 2.3009, "step": 23568 }, { "crossentropy": 2.5042288303375244, "epoch": 0.8544446055684455, "grad_norm": 0.026920268312096596, "grad_norm_var": 3.416194731935158e-07, "learning_rate": 0.0005345155032924448, "loss": 2.4962, "step": 23569 }, { "crossentropy": 2.4545929431915283, "epoch": 0.8544808584686775, "grad_norm": 0.02808390185236931, "grad_norm_var": 5.219040745531398e-07, "learning_rate": 0.0005342541228030462, "loss": 2.5083, "step": 23570 }, { "crossentropy": 2.4177780151367188, "epoch": 0.8545171113689095, "grad_norm": 0.02731749787926674, "grad_norm_var": 5.127276298159579e-07, "learning_rate": 0.000533992802630322, "loss": 2.4433, "step": 23571 }, { "crossentropy": 2.473193645477295, "epoch": 0.8545533642691415, "grad_norm": 0.026298748329281807, "grad_norm_var": 5.140681026749005e-07, "learning_rate": 0.0005337315427778017, "loss": 2.3643, "step": 23572 }, { "crossentropy": 2.51781964302063, "epoch": 0.8545896171693735, "grad_norm": 0.02696308121085167, "grad_norm_var": 5.14875659632013e-07, "learning_rate": 0.0005334703432490162, "loss": 2.5044, "step": 23573 }, { "crossentropy": 2.3800153732299805, "epoch": 0.8546258700696056, "grad_norm": 0.026053033769130707, "grad_norm_var": 5.032583462492019e-07, "learning_rate": 0.00053320920404749, "loss": 2.4142, "step": 23574 }, { "crossentropy": 2.4152750968933105, "epoch": 0.8546621229698376, "grad_norm": 0.026069600135087967, "grad_norm_var": 5.017393846548067e-07, "learning_rate": 0.0005329481251767537, "loss": 2.4771, "step": 23575 }, { "crossentropy": 2.5330028533935547, "epoch": 0.8546983758700696, "grad_norm": 0.02650337666273117, "grad_norm_var": 5.012340424336564e-07, "learning_rate": 0.000532687106640331, "loss": 2.4398, "step": 23576 }, { "crossentropy": 2.4073476791381836, "epoch": 0.8547346287703016, "grad_norm": 0.02576691284775734, "grad_norm_var": 5.368833051220262e-07, "learning_rate": 0.0005324261484417481, "loss": 2.4317, "step": 23577 }, { "crossentropy": 2.5057907104492188, "epoch": 0.8547708816705336, "grad_norm": 0.0262287687510252, "grad_norm_var": 4.888867398801135e-07, "learning_rate": 0.0005321652505845309, "loss": 2.4587, "step": 23578 }, { "crossentropy": 2.4406070709228516, "epoch": 0.8548071345707656, "grad_norm": 0.026628289371728897, "grad_norm_var": 4.869920049475779e-07, "learning_rate": 0.0005319044130722022, "loss": 2.4544, "step": 23579 }, { "crossentropy": 2.488494396209717, "epoch": 0.8548433874709976, "grad_norm": 0.025814510881900787, "grad_norm_var": 4.33079130721089e-07, "learning_rate": 0.0005316436359082843, "loss": 2.4044, "step": 23580 }, { "crossentropy": 2.545121192932129, "epoch": 0.8548796403712297, "grad_norm": 0.026608644053339958, "grad_norm_var": 4.128134571741715e-07, "learning_rate": 0.0005313829190963004, "loss": 2.5214, "step": 23581 }, { "crossentropy": 2.424642324447632, "epoch": 0.8549158932714617, "grad_norm": 0.027086695656180382, "grad_norm_var": 4.3138511916563685e-07, "learning_rate": 0.000531122262639771, "loss": 2.4695, "step": 23582 }, { "crossentropy": 2.5685715675354004, "epoch": 0.8549521461716937, "grad_norm": 0.026511985808610916, "grad_norm_var": 4.300368428757171e-07, "learning_rate": 0.0005308616665422183, "loss": 2.5122, "step": 23583 }, { "crossentropy": 2.3536806106567383, "epoch": 0.8549883990719258, "grad_norm": 0.026180796325206757, "grad_norm_var": 3.654630103834631e-07, "learning_rate": 0.0005306011308071596, "loss": 2.4044, "step": 23584 }, { "crossentropy": 2.3620381355285645, "epoch": 0.8550246519721578, "grad_norm": 0.02751685306429863, "grad_norm_var": 4.1598661362721864e-07, "learning_rate": 0.0005303406554381157, "loss": 2.4548, "step": 23585 }, { "crossentropy": 2.4501256942749023, "epoch": 0.8550609048723898, "grad_norm": 0.02631291374564171, "grad_norm_var": 2.620977497128432e-07, "learning_rate": 0.0005300802404386046, "loss": 2.4163, "step": 23586 }, { "crossentropy": 2.4733171463012695, "epoch": 0.8550971577726219, "grad_norm": 0.02646375074982643, "grad_norm_var": 2.136109309278447e-07, "learning_rate": 0.0005298198858121422, "loss": 2.4532, "step": 23587 }, { "crossentropy": 2.451094388961792, "epoch": 0.8551334106728539, "grad_norm": 0.028181685134768486, "grad_norm_var": 4.002419460642781e-07, "learning_rate": 0.0005295595915622475, "loss": 2.4998, "step": 23588 }, { "crossentropy": 2.2670156955718994, "epoch": 0.8551696635730859, "grad_norm": 0.026203542947769165, "grad_norm_var": 3.9503996830108957e-07, "learning_rate": 0.0005292993576924343, "loss": 2.3786, "step": 23589 }, { "crossentropy": 2.4327375888824463, "epoch": 0.8552059164733179, "grad_norm": 0.0273087490350008, "grad_norm_var": 4.1738170553534625e-07, "learning_rate": 0.000529039184206217, "loss": 2.4755, "step": 23590 }, { "crossentropy": 2.3112308979034424, "epoch": 0.8552421693735499, "grad_norm": 0.045254603028297424, "grad_norm_var": 2.2098681009451157e-05, "learning_rate": 0.0005287790711071111, "loss": 2.3443, "step": 23591 }, { "crossentropy": 2.4738383293151855, "epoch": 0.8552784222737819, "grad_norm": 0.02612275443971157, "grad_norm_var": 2.2172815808899534e-05, "learning_rate": 0.0005285190183986288, "loss": 2.454, "step": 23592 }, { "crossentropy": 2.6636438369750977, "epoch": 0.8553146751740139, "grad_norm": 0.027647169306874275, "grad_norm_var": 2.1893614539954933e-05, "learning_rate": 0.0005282590260842823, "loss": 2.5496, "step": 23593 }, { "crossentropy": 2.6075897216796875, "epoch": 0.855350928074246, "grad_norm": 0.026694832369685173, "grad_norm_var": 2.180461217613735e-05, "learning_rate": 0.000527999094167585, "loss": 2.5884, "step": 23594 }, { "crossentropy": 2.4079339504241943, "epoch": 0.855387180974478, "grad_norm": 0.0388379767537117, "grad_norm_var": 2.903758106331023e-05, "learning_rate": 0.000527739222652045, "loss": 2.4177, "step": 23595 }, { "crossentropy": 2.3526439666748047, "epoch": 0.85542343387471, "grad_norm": 0.02694365382194519, "grad_norm_var": 2.8687107165822306e-05, "learning_rate": 0.0005274794115411751, "loss": 2.4324, "step": 23596 }, { "crossentropy": 2.4065582752227783, "epoch": 0.855459686774942, "grad_norm": 0.025745168328285217, "grad_norm_var": 2.8979353182495963e-05, "learning_rate": 0.0005272196608384833, "loss": 2.3061, "step": 23597 }, { "crossentropy": 2.429344415664673, "epoch": 0.855495939675174, "grad_norm": 0.027954423800110817, "grad_norm_var": 2.8841109345472774e-05, "learning_rate": 0.0005269599705474765, "loss": 2.4343, "step": 23598 }, { "crossentropy": 2.537142753601074, "epoch": 0.855532192575406, "grad_norm": 0.026183973997831345, "grad_norm_var": 2.894538750999621e-05, "learning_rate": 0.0005267003406716641, "loss": 2.548, "step": 23599 }, { "crossentropy": 2.5346357822418213, "epoch": 0.855568445475638, "grad_norm": 0.029747584834694862, "grad_norm_var": 2.8531961296863896e-05, "learning_rate": 0.0005264407712145514, "loss": 2.5092, "step": 23600 }, { "crossentropy": 2.4101054668426514, "epoch": 0.85560469837587, "grad_norm": 0.02658163011074066, "grad_norm_var": 2.87647083510344e-05, "learning_rate": 0.0005261812621796447, "loss": 2.4213, "step": 23601 }, { "crossentropy": 2.357419729232788, "epoch": 0.8556409512761021, "grad_norm": 0.026574984192848206, "grad_norm_var": 2.8679071887750572e-05, "learning_rate": 0.0005259218135704502, "loss": 2.4597, "step": 23602 }, { "crossentropy": 2.4349539279937744, "epoch": 0.8556772041763341, "grad_norm": 0.026326287537813187, "grad_norm_var": 2.872495876314374e-05, "learning_rate": 0.0005256624253904701, "loss": 2.4357, "step": 23603 }, { "crossentropy": 2.4163081645965576, "epoch": 0.8557134570765661, "grad_norm": 0.02738936059176922, "grad_norm_var": 2.8839479316026156e-05, "learning_rate": 0.0005254030976432106, "loss": 2.3607, "step": 23604 }, { "crossentropy": 2.3795504570007324, "epoch": 0.8557497099767981, "grad_norm": 0.02676038071513176, "grad_norm_var": 2.866275884750694e-05, "learning_rate": 0.0005251438303321726, "loss": 2.4591, "step": 23605 }, { "crossentropy": 2.397207736968994, "epoch": 0.8557859628770301, "grad_norm": 0.026690056547522545, "grad_norm_var": 2.8816265434799384e-05, "learning_rate": 0.0005248846234608567, "loss": 2.472, "step": 23606 }, { "crossentropy": 2.444552421569824, "epoch": 0.8558222157772621, "grad_norm": 0.026114994660019875, "grad_norm_var": 9.824710659536197e-06, "learning_rate": 0.0005246254770327663, "loss": 2.5124, "step": 23607 }, { "crossentropy": 2.3494954109191895, "epoch": 0.8558584686774942, "grad_norm": 0.025049975141882896, "grad_norm_var": 1.0114334301487317e-05, "learning_rate": 0.0005243663910513996, "loss": 2.3945, "step": 23608 }, { "crossentropy": 2.4489145278930664, "epoch": 0.8558947215777262, "grad_norm": 0.027356170117855072, "grad_norm_var": 1.0116929620721882e-05, "learning_rate": 0.0005241073655202566, "loss": 2.4081, "step": 23609 }, { "crossentropy": 2.464536190032959, "epoch": 0.8559309744779582, "grad_norm": 0.028203926980495453, "grad_norm_var": 1.0085289867332349e-05, "learning_rate": 0.0005238484004428374, "loss": 2.3962, "step": 23610 }, { "crossentropy": 2.218245029449463, "epoch": 0.8559672273781903, "grad_norm": 0.02718055061995983, "grad_norm_var": 1.1949108205212688e-06, "learning_rate": 0.0005235894958226367, "loss": 2.3801, "step": 23611 }, { "crossentropy": 2.444761037826538, "epoch": 0.8560034802784223, "grad_norm": 0.03091452084481716, "grad_norm_var": 2.190170315105368e-06, "learning_rate": 0.0005233306516631548, "loss": 2.5253, "step": 23612 }, { "crossentropy": 2.5731704235076904, "epoch": 0.8560397331786543, "grad_norm": 0.02631237544119358, "grad_norm_var": 2.1022662469181387e-06, "learning_rate": 0.0005230718679678859, "loss": 2.5432, "step": 23613 }, { "crossentropy": 2.490356683731079, "epoch": 0.8560759860788864, "grad_norm": 0.027068423107266426, "grad_norm_var": 2.0632484046474807e-06, "learning_rate": 0.0005228131447403239, "loss": 2.4933, "step": 23614 }, { "crossentropy": 2.3385396003723145, "epoch": 0.8561122389791184, "grad_norm": 0.027393009513616562, "grad_norm_var": 1.9983247580310313e-06, "learning_rate": 0.0005225544819839661, "loss": 2.4569, "step": 23615 }, { "crossentropy": 2.532371997833252, "epoch": 0.8561484918793504, "grad_norm": 0.027198901399970055, "grad_norm_var": 1.5484396402144398e-06, "learning_rate": 0.0005222958797023036, "loss": 2.4842, "step": 23616 }, { "crossentropy": 2.576300859451294, "epoch": 0.8561847447795824, "grad_norm": 0.02653418481349945, "grad_norm_var": 1.5516680180942815e-06, "learning_rate": 0.0005220373378988308, "loss": 2.4916, "step": 23617 }, { "crossentropy": 2.2815558910369873, "epoch": 0.8562209976798144, "grad_norm": 0.02635400928556919, "grad_norm_var": 1.5692091295098816e-06, "learning_rate": 0.0005217788565770399, "loss": 2.3842, "step": 23618 }, { "crossentropy": 2.4082534313201904, "epoch": 0.8562572505800464, "grad_norm": 0.025512708351016045, "grad_norm_var": 1.6894044074070574e-06, "learning_rate": 0.0005215204357404208, "loss": 2.4111, "step": 23619 }, { "crossentropy": 2.602332830429077, "epoch": 0.8562935034802784, "grad_norm": 0.027566365897655487, "grad_norm_var": 1.700502287743597e-06, "learning_rate": 0.0005212620753924651, "loss": 2.6096, "step": 23620 }, { "crossentropy": 2.3097524642944336, "epoch": 0.8563297563805105, "grad_norm": 0.02680891565978527, "grad_norm_var": 1.6990137008043947e-06, "learning_rate": 0.000521003775536662, "loss": 2.3964, "step": 23621 }, { "crossentropy": 2.636911630630493, "epoch": 0.8563660092807425, "grad_norm": 0.02622881904244423, "grad_norm_var": 1.7323668030162659e-06, "learning_rate": 0.000520745536176499, "loss": 2.5865, "step": 23622 }, { "crossentropy": 2.528010129928589, "epoch": 0.8564022621809745, "grad_norm": 0.026425490155816078, "grad_norm_var": 1.70227663251517e-06, "learning_rate": 0.0005204873573154661, "loss": 2.5136, "step": 23623 }, { "crossentropy": 2.3067948818206787, "epoch": 0.8564385150812065, "grad_norm": 0.026532083749771118, "grad_norm_var": 1.4528756767236458e-06, "learning_rate": 0.000520229238957049, "loss": 2.3646, "step": 23624 }, { "crossentropy": 2.4348552227020264, "epoch": 0.8564747679814385, "grad_norm": 0.026334989815950394, "grad_norm_var": 1.483090575300179e-06, "learning_rate": 0.0005199711811047336, "loss": 2.3895, "step": 23625 }, { "crossentropy": 2.420126438140869, "epoch": 0.8565110208816705, "grad_norm": 0.02693784236907959, "grad_norm_var": 1.3860460083221175e-06, "learning_rate": 0.0005197131837620072, "loss": 2.4594, "step": 23626 }, { "crossentropy": 2.3757240772247314, "epoch": 0.8565472737819025, "grad_norm": 0.02710217982530594, "grad_norm_var": 1.3840881498536698e-06, "learning_rate": 0.0005194552469323521, "loss": 2.4604, "step": 23627 }, { "crossentropy": 2.5141382217407227, "epoch": 0.8565835266821346, "grad_norm": 0.02711653523147106, "grad_norm_var": 2.7879134581968583e-07, "learning_rate": 0.0005191973706192549, "loss": 2.4212, "step": 23628 }, { "crossentropy": 2.3742873668670654, "epoch": 0.8566197795823666, "grad_norm": 0.02612948976457119, "grad_norm_var": 2.906796298450069e-07, "learning_rate": 0.0005189395548261955, "loss": 2.4445, "step": 23629 }, { "crossentropy": 2.398174285888672, "epoch": 0.8566560324825986, "grad_norm": 0.02686678245663643, "grad_norm_var": 2.83389451835346e-07, "learning_rate": 0.000518681799556659, "loss": 2.4087, "step": 23630 }, { "crossentropy": 2.5688798427581787, "epoch": 0.8566922853828306, "grad_norm": 0.026562822982668877, "grad_norm_var": 2.4866381531877394e-07, "learning_rate": 0.0005184241048141252, "loss": 2.5637, "step": 23631 }, { "crossentropy": 2.637840747833252, "epoch": 0.8567285382830626, "grad_norm": 0.027133051306009293, "grad_norm_var": 2.4401236997256306e-07, "learning_rate": 0.0005181664706020745, "loss": 2.6115, "step": 23632 }, { "crossentropy": 2.4249632358551025, "epoch": 0.8567647911832946, "grad_norm": 0.02880914881825447, "grad_norm_var": 5.371588540730437e-07, "learning_rate": 0.0005179088969239864, "loss": 2.4874, "step": 23633 }, { "crossentropy": 2.5957610607147217, "epoch": 0.8568010440835266, "grad_norm": 0.028086530044674873, "grad_norm_var": 6.272040152551731e-07, "learning_rate": 0.0005176513837833413, "loss": 2.6155, "step": 23634 }, { "crossentropy": 2.422719717025757, "epoch": 0.8568372969837587, "grad_norm": 0.02658884786069393, "grad_norm_var": 5.027361454224087e-07, "learning_rate": 0.0005173939311836162, "loss": 2.4495, "step": 23635 }, { "crossentropy": 2.334763765335083, "epoch": 0.8568735498839907, "grad_norm": 0.02769007347524166, "grad_norm_var": 5.138283505363878e-07, "learning_rate": 0.000517136539128289, "loss": 2.4349, "step": 23636 }, { "crossentropy": 2.4860785007476807, "epoch": 0.8569098027842227, "grad_norm": 0.02615152858197689, "grad_norm_var": 5.540459502591128e-07, "learning_rate": 0.0005168792076208351, "loss": 2.5162, "step": 23637 }, { "crossentropy": 2.5457839965820312, "epoch": 0.8569460556844548, "grad_norm": 0.02642153762280941, "grad_norm_var": 5.386449710616566e-07, "learning_rate": 0.0005166219366647323, "loss": 2.4655, "step": 23638 }, { "crossentropy": 2.4176974296569824, "epoch": 0.8569823085846868, "grad_norm": 0.02738080359995365, "grad_norm_var": 5.313508286118913e-07, "learning_rate": 0.0005163647262634535, "loss": 2.469, "step": 23639 }, { "crossentropy": 2.3033883571624756, "epoch": 0.8570185614849188, "grad_norm": 0.02576122246682644, "grad_norm_var": 6.155826298527451e-07, "learning_rate": 0.0005161075764204726, "loss": 2.3772, "step": 23640 }, { "crossentropy": 2.3470020294189453, "epoch": 0.8570548143851509, "grad_norm": 0.027071412652730942, "grad_norm_var": 5.898668800717483e-07, "learning_rate": 0.0005158504871392627, "loss": 2.3915, "step": 23641 }, { "crossentropy": 2.5166521072387695, "epoch": 0.8570910672853829, "grad_norm": 0.02625182829797268, "grad_norm_var": 6.238785240917129e-07, "learning_rate": 0.0005155934584232985, "loss": 2.4967, "step": 23642 }, { "crossentropy": 2.5784871578216553, "epoch": 0.8571273201856149, "grad_norm": 0.02647167630493641, "grad_norm_var": 6.355307240181347e-07, "learning_rate": 0.0005153364902760487, "loss": 2.5101, "step": 23643 }, { "crossentropy": 2.4329657554626465, "epoch": 0.8571635730858469, "grad_norm": 0.027249924838542938, "grad_norm_var": 6.403902122136996e-07, "learning_rate": 0.0005150795827009868, "loss": 2.4692, "step": 23644 }, { "crossentropy": 2.2720489501953125, "epoch": 0.8571998259860789, "grad_norm": 0.02613491378724575, "grad_norm_var": 6.39824569605947e-07, "learning_rate": 0.00051482273570158, "loss": 2.3351, "step": 23645 }, { "crossentropy": 2.4523680210113525, "epoch": 0.8572360788863109, "grad_norm": 0.026638085022568703, "grad_norm_var": 6.445487278035228e-07, "learning_rate": 0.0005145659492812981, "loss": 2.5252, "step": 23646 }, { "crossentropy": 2.4711365699768066, "epoch": 0.8572723317865429, "grad_norm": 0.026906298473477364, "grad_norm_var": 6.364708318273849e-07, "learning_rate": 0.0005143092234436125, "loss": 2.5165, "step": 23647 }, { "crossentropy": 2.541261672973633, "epoch": 0.857308584686775, "grad_norm": 0.026590796187520027, "grad_norm_var": 6.395660947467226e-07, "learning_rate": 0.0005140525581919864, "loss": 2.5094, "step": 23648 }, { "crossentropy": 2.616975784301758, "epoch": 0.857344837587007, "grad_norm": 0.02711634337902069, "grad_norm_var": 3.8500042914151447e-07, "learning_rate": 0.0005137959535298875, "loss": 2.567, "step": 23649 }, { "crossentropy": 2.4537622928619385, "epoch": 0.857381090487239, "grad_norm": 0.02644854411482811, "grad_norm_var": 2.6777846231686397e-07, "learning_rate": 0.0005135394094607837, "loss": 2.3892, "step": 23650 }, { "crossentropy": 2.5134644508361816, "epoch": 0.857417343387471, "grad_norm": 0.026470378041267395, "grad_norm_var": 2.700894082050305e-07, "learning_rate": 0.0005132829259881377, "loss": 2.4695, "step": 23651 }, { "crossentropy": 2.4609339237213135, "epoch": 0.857453596287703, "grad_norm": 0.02662721276283264, "grad_norm_var": 1.9644774493255782e-07, "learning_rate": 0.0005130265031154152, "loss": 2.4967, "step": 23652 }, { "crossentropy": 2.3314497470855713, "epoch": 0.857489849187935, "grad_norm": 0.026304172351956367, "grad_norm_var": 1.8865881602532728e-07, "learning_rate": 0.0005127701408460784, "loss": 2.4173, "step": 23653 }, { "crossentropy": 2.2832915782928467, "epoch": 0.857526102088167, "grad_norm": 0.026489106938242912, "grad_norm_var": 1.8719831761839177e-07, "learning_rate": 0.0005125138391835904, "loss": 2.3729, "step": 23654 }, { "crossentropy": 2.581594467163086, "epoch": 0.8575623549883991, "grad_norm": 0.026898477226495743, "grad_norm_var": 1.5278155517070512e-07, "learning_rate": 0.0005122575981314149, "loss": 2.5308, "step": 23655 }, { "crossentropy": 2.562572956085205, "epoch": 0.8575986078886311, "grad_norm": 0.02847292833030224, "grad_norm_var": 3.129294935878679e-07, "learning_rate": 0.0005120014176930088, "loss": 2.5579, "step": 23656 }, { "crossentropy": 2.473095178604126, "epoch": 0.8576348607888631, "grad_norm": 0.027325036004185677, "grad_norm_var": 3.2751849816236903e-07, "learning_rate": 0.0005117452978718346, "loss": 2.4881, "step": 23657 }, { "crossentropy": 2.3368613719940186, "epoch": 0.8576711136890951, "grad_norm": 0.02624245174229145, "grad_norm_var": 3.28177732026221e-07, "learning_rate": 0.0005114892386713521, "loss": 2.4013, "step": 23658 }, { "crossentropy": 2.4154303073883057, "epoch": 0.8577073665893271, "grad_norm": 0.027334891259670258, "grad_norm_var": 3.399360687260002e-07, "learning_rate": 0.0005112332400950181, "loss": 2.388, "step": 23659 }, { "crossentropy": 2.2831549644470215, "epoch": 0.8577436194895591, "grad_norm": 0.026925643905997276, "grad_norm_var": 3.2826971097562003e-07, "learning_rate": 0.0005109773021462922, "loss": 2.408, "step": 23660 }, { "crossentropy": 2.422759532928467, "epoch": 0.8577798723897911, "grad_norm": 0.026438916102051735, "grad_norm_var": 3.0677005528820246e-07, "learning_rate": 0.0005107214248286291, "loss": 2.4252, "step": 23661 }, { "crossentropy": 2.305832624435425, "epoch": 0.8578161252900232, "grad_norm": 0.028047999367117882, "grad_norm_var": 3.955292871707719e-07, "learning_rate": 0.0005104656081454861, "loss": 2.4195, "step": 23662 }, { "crossentropy": 2.489041328430176, "epoch": 0.8578523781902552, "grad_norm": 0.026725728064775467, "grad_norm_var": 3.9777543058871987e-07, "learning_rate": 0.0005102098521003201, "loss": 2.4578, "step": 23663 }, { "crossentropy": 2.330897092819214, "epoch": 0.8578886310904872, "grad_norm": 0.02694922313094139, "grad_norm_var": 3.9085275855739917e-07, "learning_rate": 0.0005099541566965809, "loss": 2.4903, "step": 23664 }, { "crossentropy": 2.4372317790985107, "epoch": 0.8579248839907193, "grad_norm": 0.026252171024680138, "grad_norm_var": 4.156030243288583e-07, "learning_rate": 0.0005096985219377254, "loss": 2.4108, "step": 23665 }, { "crossentropy": 2.34069561958313, "epoch": 0.8579611368909513, "grad_norm": 0.026131201535463333, "grad_norm_var": 4.398169080687171e-07, "learning_rate": 0.0005094429478272067, "loss": 2.3916, "step": 23666 }, { "crossentropy": 2.3563337326049805, "epoch": 0.8579973897911833, "grad_norm": 0.026610026136040688, "grad_norm_var": 4.339259364940599e-07, "learning_rate": 0.0005091874343684744, "loss": 2.3743, "step": 23667 }, { "crossentropy": 2.3919639587402344, "epoch": 0.8580336426914154, "grad_norm": 0.026211760938167572, "grad_norm_var": 4.576609428398141e-07, "learning_rate": 0.0005089319815649818, "loss": 2.4838, "step": 23668 }, { "crossentropy": 2.6755547523498535, "epoch": 0.8580698955916474, "grad_norm": 0.027655772864818573, "grad_norm_var": 4.7617817869985364e-07, "learning_rate": 0.0005086765894201773, "loss": 2.5474, "step": 23669 }, { "crossentropy": 2.542146682739258, "epoch": 0.8581061484918794, "grad_norm": 0.027576319873332977, "grad_norm_var": 4.876706849232804e-07, "learning_rate": 0.0005084212579375114, "loss": 2.4925, "step": 23670 }, { "crossentropy": 2.4299654960632324, "epoch": 0.8581424013921114, "grad_norm": 0.027408113703131676, "grad_norm_var": 4.978607035822317e-07, "learning_rate": 0.000508165987120433, "loss": 2.438, "step": 23671 }, { "crossentropy": 2.3709607124328613, "epoch": 0.8581786542923434, "grad_norm": 0.026653684675693512, "grad_norm_var": 3.5210378290882055e-07, "learning_rate": 0.00050791077697239, "loss": 2.44, "step": 23672 }, { "crossentropy": 2.4089810848236084, "epoch": 0.8582149071925754, "grad_norm": 0.02709704451262951, "grad_norm_var": 3.4260090839913216e-07, "learning_rate": 0.000507655627496828, "loss": 2.3599, "step": 23673 }, { "crossentropy": 2.3884224891662598, "epoch": 0.8582511600928074, "grad_norm": 0.025422228500247, "grad_norm_var": 4.556098673562265e-07, "learning_rate": 0.0005074005386971952, "loss": 2.402, "step": 23674 }, { "crossentropy": 2.2818310260772705, "epoch": 0.8582874129930395, "grad_norm": 0.026486042886972427, "grad_norm_var": 4.4463728712066757e-07, "learning_rate": 0.0005071455105769352, "loss": 2.3273, "step": 23675 }, { "crossentropy": 2.4089832305908203, "epoch": 0.8583236658932715, "grad_norm": 0.025546211749315262, "grad_norm_var": 5.38062961301132e-07, "learning_rate": 0.0005068905431394937, "loss": 2.4181, "step": 23676 }, { "crossentropy": 2.4377644062042236, "epoch": 0.8583599187935035, "grad_norm": 0.026939963921904564, "grad_norm_var": 5.362594865062811e-07, "learning_rate": 0.0005066356363883129, "loss": 2.4834, "step": 23677 }, { "crossentropy": 2.407426118850708, "epoch": 0.8583961716937355, "grad_norm": 0.026441270485520363, "grad_norm_var": 4.157008560314353e-07, "learning_rate": 0.0005063807903268369, "loss": 2.4132, "step": 23678 }, { "crossentropy": 2.3052964210510254, "epoch": 0.8584324245939675, "grad_norm": 0.031462814658880234, "grad_norm_var": 1.8776066161545288e-06, "learning_rate": 0.0005061260049585087, "loss": 2.3381, "step": 23679 }, { "crossentropy": 2.4131813049316406, "epoch": 0.8584686774941995, "grad_norm": 0.026258861646056175, "grad_norm_var": 1.9054166298715368e-06, "learning_rate": 0.0005058712802867688, "loss": 2.461, "step": 23680 }, { "crossentropy": 2.369563341140747, "epoch": 0.8585049303944315, "grad_norm": 0.026868682354688644, "grad_norm_var": 1.8771859711551407e-06, "learning_rate": 0.0005056166163150555, "loss": 2.4519, "step": 23681 }, { "crossentropy": 2.455141067504883, "epoch": 0.8585411832946636, "grad_norm": 0.027229566127061844, "grad_norm_var": 1.8366101732460259e-06, "learning_rate": 0.0005053620130468123, "loss": 2.5371, "step": 23682 }, { "crossentropy": 2.4096481800079346, "epoch": 0.8585774361948956, "grad_norm": 0.026334136724472046, "grad_norm_var": 1.8554100075802877e-06, "learning_rate": 0.0005051074704854741, "loss": 2.388, "step": 23683 }, { "crossentropy": 2.337996006011963, "epoch": 0.8586136890951276, "grad_norm": 0.025921674445271492, "grad_norm_var": 1.8901719164390497e-06, "learning_rate": 0.000504852988634481, "loss": 2.3959, "step": 23684 }, { "crossentropy": 2.324808359146118, "epoch": 0.8586499419953596, "grad_norm": 0.02605459652841091, "grad_norm_var": 1.9010978876363508e-06, "learning_rate": 0.0005045985674972714, "loss": 2.3938, "step": 23685 }, { "crossentropy": 2.358281373977661, "epoch": 0.8586861948955916, "grad_norm": 0.026009026914834976, "grad_norm_var": 1.9041644531564428e-06, "learning_rate": 0.0005043442070772786, "loss": 2.4356, "step": 23686 }, { "crossentropy": 2.4691598415374756, "epoch": 0.8587224477958236, "grad_norm": 0.026420533657073975, "grad_norm_var": 1.8795650763413769e-06, "learning_rate": 0.0005040899073779409, "loss": 2.46, "step": 23687 }, { "crossentropy": 2.4310035705566406, "epoch": 0.8587587006960556, "grad_norm": 0.026516322046518326, "grad_norm_var": 1.881531198816621e-06, "learning_rate": 0.0005038356684026907, "loss": 2.4508, "step": 23688 }, { "crossentropy": 2.423248291015625, "epoch": 0.8587949535962877, "grad_norm": 0.0266862902790308, "grad_norm_var": 1.8696772482416768e-06, "learning_rate": 0.000503581490154964, "loss": 2.4855, "step": 23689 }, { "crossentropy": 2.2831456661224365, "epoch": 0.8588312064965197, "grad_norm": 0.025512076914310455, "grad_norm_var": 1.8553249345950572e-06, "learning_rate": 0.0005033273726381931, "loss": 2.2696, "step": 23690 }, { "crossentropy": 2.491443157196045, "epoch": 0.8588674593967517, "grad_norm": 0.027298718690872192, "grad_norm_var": 1.8768857883500375e-06, "learning_rate": 0.0005030733158558088, "loss": 2.4478, "step": 23691 }, { "crossentropy": 2.6158413887023926, "epoch": 0.8589037122969838, "grad_norm": 0.028206510469317436, "grad_norm_var": 1.9032866204443522e-06, "learning_rate": 0.0005028193198112435, "loss": 2.574, "step": 23692 }, { "crossentropy": 2.4545085430145264, "epoch": 0.8589399651972158, "grad_norm": 0.026291465386748314, "grad_norm_var": 1.924824132958529e-06, "learning_rate": 0.0005025653845079293, "loss": 2.4642, "step": 23693 }, { "crossentropy": 2.458928346633911, "epoch": 0.8589762180974478, "grad_norm": 0.028046557679772377, "grad_norm_var": 1.9995694403323554e-06, "learning_rate": 0.000502311509949293, "loss": 2.5042, "step": 23694 }, { "crossentropy": 2.532973051071167, "epoch": 0.8590124709976799, "grad_norm": 0.02764795906841755, "grad_norm_var": 6.110960629170103e-07, "learning_rate": 0.0005020576961387668, "loss": 2.4724, "step": 23695 }, { "crossentropy": 2.441685914993286, "epoch": 0.8590487238979119, "grad_norm": 0.026499848812818527, "grad_norm_var": 6.003444428731893e-07, "learning_rate": 0.0005018039430797766, "loss": 2.3907, "step": 23696 }, { "crossentropy": 2.369493007659912, "epoch": 0.8590849767981439, "grad_norm": 0.02808086946606636, "grad_norm_var": 7.159704753318854e-07, "learning_rate": 0.0005015502507757508, "loss": 2.3997, "step": 23697 }, { "crossentropy": 2.584929943084717, "epoch": 0.8591212296983759, "grad_norm": 0.025873564183712006, "grad_norm_var": 7.52730668225028e-07, "learning_rate": 0.000501296619230116, "loss": 2.4853, "step": 23698 }, { "crossentropy": 2.4025697708129883, "epoch": 0.8591574825986079, "grad_norm": 0.027711492031812668, "grad_norm_var": 8.018127375602658e-07, "learning_rate": 0.0005010430484462958, "loss": 2.4215, "step": 23699 }, { "crossentropy": 2.365448474884033, "epoch": 0.8591937354988399, "grad_norm": 0.026646604761481285, "grad_norm_var": 7.498972270345566e-07, "learning_rate": 0.0005007895384277172, "loss": 2.3492, "step": 23700 }, { "crossentropy": 2.3953189849853516, "epoch": 0.8592299883990719, "grad_norm": 0.025963449850678444, "grad_norm_var": 7.600088059137352e-07, "learning_rate": 0.0005005360891778043, "loss": 2.4206, "step": 23701 }, { "crossentropy": 2.4558799266815186, "epoch": 0.859266241299304, "grad_norm": 0.026706740260124207, "grad_norm_var": 7.132968422898413e-07, "learning_rate": 0.0005002827006999788, "loss": 2.4104, "step": 23702 }, { "crossentropy": 2.4661977291107178, "epoch": 0.859302494199536, "grad_norm": 0.0261333379894495, "grad_norm_var": 7.361155733099159e-07, "learning_rate": 0.0005000293729976651, "loss": 2.4178, "step": 23703 }, { "crossentropy": 2.4229018688201904, "epoch": 0.859338747099768, "grad_norm": 0.027183113619685173, "grad_norm_var": 7.330054443278281e-07, "learning_rate": 0.0004997761060742828, "loss": 2.4445, "step": 23704 }, { "crossentropy": 2.4976861476898193, "epoch": 0.859375, "grad_norm": 0.026502590626478195, "grad_norm_var": 7.404846262631936e-07, "learning_rate": 0.0004995228999332546, "loss": 2.4852, "step": 23705 }, { "crossentropy": 2.311570405960083, "epoch": 0.859411252900232, "grad_norm": 0.027321461588144302, "grad_norm_var": 6.116973879472312e-07, "learning_rate": 0.0004992697545779989, "loss": 2.4349, "step": 23706 }, { "crossentropy": 2.408322811126709, "epoch": 0.859447505800464, "grad_norm": 0.026834754273295403, "grad_norm_var": 6.071138779117537e-07, "learning_rate": 0.0004990166700119342, "loss": 2.487, "step": 23707 }, { "crossentropy": 2.47824764251709, "epoch": 0.859483758700696, "grad_norm": 0.02618020959198475, "grad_norm_var": 5.318605948140583e-07, "learning_rate": 0.0004987636462384803, "loss": 2.5162, "step": 23708 }, { "crossentropy": 2.4358389377593994, "epoch": 0.8595200116009281, "grad_norm": 0.027218982577323914, "grad_norm_var": 5.163695576781517e-07, "learning_rate": 0.0004985106832610553, "loss": 2.4598, "step": 23709 }, { "crossentropy": 2.38521409034729, "epoch": 0.8595562645011601, "grad_norm": 0.02732066623866558, "grad_norm_var": 4.3924842718223493e-07, "learning_rate": 0.0004982577810830735, "loss": 2.4062, "step": 23710 }, { "crossentropy": 2.4802045822143555, "epoch": 0.8595925174013921, "grad_norm": 0.02636023983359337, "grad_norm_var": 4.0830232452905973e-07, "learning_rate": 0.0004980049397079528, "loss": 2.4036, "step": 23711 }, { "crossentropy": 2.4152650833129883, "epoch": 0.8596287703016241, "grad_norm": 0.027010245248675346, "grad_norm_var": 4.052723925329164e-07, "learning_rate": 0.0004977521591391066, "loss": 2.3406, "step": 23712 }, { "crossentropy": 2.3697333335876465, "epoch": 0.8596650232018561, "grad_norm": 0.026285864412784576, "grad_norm_var": 3.0380896279217503e-07, "learning_rate": 0.0004974994393799498, "loss": 2.4759, "step": 23713 }, { "crossentropy": 2.634037494659424, "epoch": 0.8597012761020881, "grad_norm": 0.02691836655139923, "grad_norm_var": 2.564422171687115e-07, "learning_rate": 0.0004972467804338982, "loss": 2.5811, "step": 23714 }, { "crossentropy": 2.2957377433776855, "epoch": 0.8597375290023201, "grad_norm": 0.02683727815747261, "grad_norm_var": 1.9430638059780615e-07, "learning_rate": 0.00049699418230436, "loss": 2.3972, "step": 23715 }, { "crossentropy": 2.3149948120117188, "epoch": 0.8597737819025522, "grad_norm": 0.026996182277798653, "grad_norm_var": 1.9880311846904679e-07, "learning_rate": 0.0004967416449947487, "loss": 2.3791, "step": 23716 }, { "crossentropy": 2.479910135269165, "epoch": 0.8598100348027842, "grad_norm": 0.026144078001379967, "grad_norm_var": 1.8224015774313124e-07, "learning_rate": 0.0004964891685084761, "loss": 2.4117, "step": 23717 }, { "crossentropy": 2.519409656524658, "epoch": 0.8598462877030162, "grad_norm": 0.028172163292765617, "grad_norm_var": 3.0856457700159135e-07, "learning_rate": 0.0004962367528489509, "loss": 2.4836, "step": 23718 }, { "crossentropy": 2.34865403175354, "epoch": 0.8598825406032483, "grad_norm": 0.02581198327243328, "grad_norm_var": 3.4524263014799365e-07, "learning_rate": 0.0004959843980195844, "loss": 2.3874, "step": 23719 }, { "crossentropy": 2.214536666870117, "epoch": 0.8599187935034803, "grad_norm": 0.025219745934009552, "grad_norm_var": 4.907546695280077e-07, "learning_rate": 0.0004957321040237822, "loss": 2.3445, "step": 23720 }, { "crossentropy": 2.5759668350219727, "epoch": 0.8599550464037123, "grad_norm": 0.025377880781888962, "grad_norm_var": 5.988082219410273e-07, "learning_rate": 0.0004954798708649538, "loss": 2.4557, "step": 23721 }, { "crossentropy": 2.691788673400879, "epoch": 0.8599912993039444, "grad_norm": 0.02708679810166359, "grad_norm_var": 5.804784473243766e-07, "learning_rate": 0.0004952276985465071, "loss": 2.5202, "step": 23722 }, { "crossentropy": 2.528177499771118, "epoch": 0.8600275522041764, "grad_norm": 0.02725757658481598, "grad_norm_var": 6.042685346823214e-07, "learning_rate": 0.0004949755870718447, "loss": 2.4873, "step": 23723 }, { "crossentropy": 2.361410140991211, "epoch": 0.8600638051044084, "grad_norm": 0.028353538364171982, "grad_norm_var": 7.669975842681045e-07, "learning_rate": 0.0004947235364443741, "loss": 2.4253, "step": 23724 }, { "crossentropy": 2.3945088386535645, "epoch": 0.8601000580046404, "grad_norm": 0.027569415047764778, "grad_norm_var": 7.955005206919704e-07, "learning_rate": 0.0004944715466674998, "loss": 2.4922, "step": 23725 }, { "crossentropy": 2.4800989627838135, "epoch": 0.8601363109048724, "grad_norm": 0.026768626645207405, "grad_norm_var": 7.758647448466293e-07, "learning_rate": 0.0004942196177446234, "loss": 2.5037, "step": 23726 }, { "crossentropy": 2.4892842769622803, "epoch": 0.8601725638051044, "grad_norm": 0.02761072851717472, "grad_norm_var": 8.068405995746246e-07, "learning_rate": 0.00049396774967915, "loss": 2.551, "step": 23727 }, { "crossentropy": 2.346139430999756, "epoch": 0.8602088167053364, "grad_norm": 0.027804404497146606, "grad_norm_var": 8.644148099358802e-07, "learning_rate": 0.0004937159424744797, "loss": 2.3868, "step": 23728 }, { "crossentropy": 2.416609048843384, "epoch": 0.8602450696055685, "grad_norm": 0.025641607120633125, "grad_norm_var": 9.421161560588601e-07, "learning_rate": 0.0004934641961340136, "loss": 2.4867, "step": 23729 }, { "crossentropy": 2.5301613807678223, "epoch": 0.8602813225058005, "grad_norm": 0.026270970702171326, "grad_norm_var": 9.622500439352214e-07, "learning_rate": 0.000493212510661154, "loss": 2.5012, "step": 23730 }, { "crossentropy": 2.518226146697998, "epoch": 0.8603175754060325, "grad_norm": 0.027105094864964485, "grad_norm_var": 9.67789606062215e-07, "learning_rate": 0.0004929608860592983, "loss": 2.475, "step": 23731 }, { "crossentropy": 2.4201316833496094, "epoch": 0.8603538283062645, "grad_norm": 0.027225198224186897, "grad_norm_var": 9.76312323886054e-07, "learning_rate": 0.0004927093223318446, "loss": 2.4396, "step": 23732 }, { "crossentropy": 2.4669320583343506, "epoch": 0.8603900812064965, "grad_norm": 0.026936961337924004, "grad_norm_var": 9.421658984089572e-07, "learning_rate": 0.0004924578194821927, "loss": 2.4084, "step": 23733 }, { "crossentropy": 2.2960872650146484, "epoch": 0.8604263341067285, "grad_norm": 0.026578599587082863, "grad_norm_var": 8.280907520511697e-07, "learning_rate": 0.000492206377513737, "loss": 2.4136, "step": 23734 }, { "crossentropy": 2.527406930923462, "epoch": 0.8604625870069605, "grad_norm": 0.026935091242194176, "grad_norm_var": 7.606660184579592e-07, "learning_rate": 0.0004919549964298764, "loss": 2.4962, "step": 23735 }, { "crossentropy": 2.453623056411743, "epoch": 0.8604988399071926, "grad_norm": 0.026719970628619194, "grad_norm_var": 5.734552616685923e-07, "learning_rate": 0.0004917036762340033, "loss": 2.3859, "step": 23736 }, { "crossentropy": 2.538020372390747, "epoch": 0.8605350928074246, "grad_norm": 0.026723340153694153, "grad_norm_var": 4.040908042536132e-07, "learning_rate": 0.0004914524169295142, "loss": 2.5086, "step": 23737 }, { "crossentropy": 2.3892979621887207, "epoch": 0.8605713457076566, "grad_norm": 0.02628467231988907, "grad_norm_var": 4.389504912498283e-07, "learning_rate": 0.0004912012185198029, "loss": 2.304, "step": 23738 }, { "crossentropy": 2.4759414196014404, "epoch": 0.8606075986078886, "grad_norm": 0.026857895776629448, "grad_norm_var": 4.344946382422391e-07, "learning_rate": 0.0004909500810082612, "loss": 2.4639, "step": 23739 }, { "crossentropy": 2.604313373565674, "epoch": 0.8606438515081206, "grad_norm": 0.026186246424913406, "grad_norm_var": 3.2584451406026706e-07, "learning_rate": 0.0004906990043982811, "loss": 2.4876, "step": 23740 }, { "crossentropy": 2.4448883533477783, "epoch": 0.8606801044083526, "grad_norm": 0.028801145032048225, "grad_norm_var": 5.427295061695975e-07, "learning_rate": 0.0004904479886932544, "loss": 2.4178, "step": 23741 }, { "crossentropy": 2.4684224128723145, "epoch": 0.8607163573085846, "grad_norm": 0.025891322642564774, "grad_norm_var": 6.065702368569677e-07, "learning_rate": 0.0004901970338965706, "loss": 2.4302, "step": 23742 }, { "crossentropy": 2.424921751022339, "epoch": 0.8607526102088167, "grad_norm": 0.02591545879840851, "grad_norm_var": 6.138615309976877e-07, "learning_rate": 0.00048994614001162, "loss": 2.3788, "step": 23743 }, { "crossentropy": 2.4541516304016113, "epoch": 0.8607888631090487, "grad_norm": 0.0278184711933136, "grad_norm_var": 6.158657999151869e-07, "learning_rate": 0.0004896953070417909, "loss": 2.3741, "step": 23744 }, { "crossentropy": 2.5799942016601562, "epoch": 0.8608251160092807, "grad_norm": 0.02621094509959221, "grad_norm_var": 5.524970724317286e-07, "learning_rate": 0.0004894445349904708, "loss": 2.4983, "step": 23745 }, { "crossentropy": 2.561816930770874, "epoch": 0.8608613689095128, "grad_norm": 0.027023032307624817, "grad_norm_var": 5.369206811021872e-07, "learning_rate": 0.0004891938238610482, "loss": 2.397, "step": 23746 }, { "crossentropy": 2.3948705196380615, "epoch": 0.8608976218097448, "grad_norm": 0.02643497660756111, "grad_norm_var": 5.400356963630119e-07, "learning_rate": 0.0004889431736569089, "loss": 2.4259, "step": 23747 }, { "crossentropy": 2.2176222801208496, "epoch": 0.8609338747099768, "grad_norm": 0.026766005903482437, "grad_norm_var": 5.26199076850605e-07, "learning_rate": 0.0004886925843814366, "loss": 2.3065, "step": 23748 }, { "crossentropy": 2.4523658752441406, "epoch": 0.8609701276102089, "grad_norm": 0.02599981054663658, "grad_norm_var": 5.583854003801536e-07, "learning_rate": 0.0004884420560380176, "loss": 2.3957, "step": 23749 }, { "crossentropy": 2.3832616806030273, "epoch": 0.8610063805104409, "grad_norm": 0.025621309876441956, "grad_norm_var": 6.307330784289961e-07, "learning_rate": 0.00048819158863003476, "loss": 2.4869, "step": 23750 }, { "crossentropy": 2.322366952896118, "epoch": 0.8610426334106729, "grad_norm": 0.027804715558886528, "grad_norm_var": 7.125788413330757e-07, "learning_rate": 0.00048794118216087204, "loss": 2.3728, "step": 23751 }, { "crossentropy": 2.4110822677612305, "epoch": 0.8610788863109049, "grad_norm": 0.02706535905599594, "grad_norm_var": 7.213592610129932e-07, "learning_rate": 0.0004876908366339106, "loss": 2.3745, "step": 23752 }, { "crossentropy": 2.6394379138946533, "epoch": 0.8611151392111369, "grad_norm": 0.025410914793610573, "grad_norm_var": 8.271675954999817e-07, "learning_rate": 0.00048744055205253157, "loss": 2.5096, "step": 23753 }, { "crossentropy": 2.525552749633789, "epoch": 0.8611513921113689, "grad_norm": 0.026953188702464104, "grad_norm_var": 8.24250346271787e-07, "learning_rate": 0.0004871903284201168, "loss": 2.4815, "step": 23754 }, { "crossentropy": 2.415576219558716, "epoch": 0.8611876450116009, "grad_norm": 0.028241777792572975, "grad_norm_var": 9.781455135401817e-07, "learning_rate": 0.0004869401657400441, "loss": 2.3824, "step": 23755 }, { "crossentropy": 2.6145055294036865, "epoch": 0.861223897911833, "grad_norm": 0.027399221435189247, "grad_norm_var": 9.774639736062166e-07, "learning_rate": 0.0004866900640156946, "loss": 2.4815, "step": 23756 }, { "crossentropy": 2.5015618801116943, "epoch": 0.861260150812065, "grad_norm": 0.026607848703861237, "grad_norm_var": 7.031019276274812e-07, "learning_rate": 0.0004864400232504446, "loss": 2.4399, "step": 23757 }, { "crossentropy": 2.5052642822265625, "epoch": 0.861296403712297, "grad_norm": 0.02660788781940937, "grad_norm_var": 6.581436854872017e-07, "learning_rate": 0.00048619004344767125, "loss": 2.4856, "step": 23758 }, { "crossentropy": 2.4493329524993896, "epoch": 0.861332656612529, "grad_norm": 0.02536645345389843, "grad_norm_var": 7.375258482966805e-07, "learning_rate": 0.000485940124610752, "loss": 2.3779, "step": 23759 }, { "crossentropy": 2.2997734546661377, "epoch": 0.861368909512761, "grad_norm": 0.025711581110954285, "grad_norm_var": 7.03079167695926e-07, "learning_rate": 0.00048569026674306074, "loss": 2.3897, "step": 23760 }, { "crossentropy": 2.35981822013855, "epoch": 0.861405162412993, "grad_norm": 0.025896288454532623, "grad_norm_var": 7.246064860092951e-07, "learning_rate": 0.0004854404698479725, "loss": 2.3899, "step": 23761 }, { "crossentropy": 2.3186230659484863, "epoch": 0.861441415313225, "grad_norm": 0.026876123622059822, "grad_norm_var": 7.168248190513964e-07, "learning_rate": 0.0004851907339288636, "loss": 2.3092, "step": 23762 }, { "crossentropy": 2.4663655757904053, "epoch": 0.8614776682134571, "grad_norm": 0.026672476902604103, "grad_norm_var": 7.167801185196797e-07, "learning_rate": 0.00048494105898910404, "loss": 2.4577, "step": 23763 }, { "crossentropy": 2.350851058959961, "epoch": 0.8615139211136891, "grad_norm": 0.026287363842129707, "grad_norm_var": 7.181150701232944e-07, "learning_rate": 0.0004846914450320683, "loss": 2.3728, "step": 23764 }, { "crossentropy": 2.5695505142211914, "epoch": 0.8615501740139211, "grad_norm": 0.02800557017326355, "grad_norm_var": 8.270586442792512e-07, "learning_rate": 0.0004844418920611271, "loss": 2.518, "step": 23765 }, { "crossentropy": 2.4421987533569336, "epoch": 0.8615864269141531, "grad_norm": 0.0263842586427927, "grad_norm_var": 7.579799435355189e-07, "learning_rate": 0.00048419240007964936, "loss": 2.4453, "step": 23766 }, { "crossentropy": 2.4117138385772705, "epoch": 0.8616226798143851, "grad_norm": 0.027782423421740532, "grad_norm_var": 7.547443832695098e-07, "learning_rate": 0.00048394296909100743, "loss": 2.436, "step": 23767 }, { "crossentropy": 2.517287254333496, "epoch": 0.8616589327146171, "grad_norm": 0.027052177116274834, "grad_norm_var": 7.541206421768365e-07, "learning_rate": 0.000483693599098568, "loss": 2.4707, "step": 23768 }, { "crossentropy": 2.3887460231781006, "epoch": 0.8616951856148491, "grad_norm": 0.02676413394510746, "grad_norm_var": 6.353556378985159e-07, "learning_rate": 0.0004834442901057001, "loss": 2.3744, "step": 23769 }, { "crossentropy": 2.4197373390197754, "epoch": 0.8617314385150812, "grad_norm": 0.025787152349948883, "grad_norm_var": 6.946585807478343e-07, "learning_rate": 0.0004831950421157727, "loss": 2.3534, "step": 23770 }, { "crossentropy": 2.5411789417266846, "epoch": 0.8617676914153132, "grad_norm": 0.026666048914194107, "grad_norm_var": 5.291054231054115e-07, "learning_rate": 0.00048294585513214915, "loss": 2.525, "step": 23771 }, { "crossentropy": 2.5038750171661377, "epoch": 0.8618039443155452, "grad_norm": 0.028871335089206696, "grad_norm_var": 8.18147400134791e-07, "learning_rate": 0.000482696729158198, "loss": 2.4271, "step": 23772 }, { "crossentropy": 2.3835837841033936, "epoch": 0.8618401972157773, "grad_norm": 0.026922117918729782, "grad_norm_var": 8.200944938450133e-07, "learning_rate": 0.00048244766419728246, "loss": 2.3905, "step": 23773 }, { "crossentropy": 2.4225380420684814, "epoch": 0.8618764501160093, "grad_norm": 0.028111407533288002, "grad_norm_var": 9.372338423995248e-07, "learning_rate": 0.0004821986602527662, "loss": 2.3931, "step": 23774 }, { "crossentropy": 2.5847058296203613, "epoch": 0.8619127030162413, "grad_norm": 0.02670316770672798, "grad_norm_var": 7.894344576930256e-07, "learning_rate": 0.00048194971732801406, "loss": 2.5123, "step": 23775 }, { "crossentropy": 2.468994379043579, "epoch": 0.8619489559164734, "grad_norm": 0.027797389775514603, "grad_norm_var": 7.292108348708656e-07, "learning_rate": 0.0004817008354263863, "loss": 2.432, "step": 23776 }, { "crossentropy": 2.365114212036133, "epoch": 0.8619852088167054, "grad_norm": 0.025819702073931694, "grad_norm_var": 7.412178038151828e-07, "learning_rate": 0.0004814520145512458, "loss": 2.374, "step": 23777 }, { "crossentropy": 2.514641761779785, "epoch": 0.8620214617169374, "grad_norm": 0.026448918506503105, "grad_norm_var": 7.614705639421287e-07, "learning_rate": 0.00048120325470595363, "loss": 2.4351, "step": 23778 }, { "crossentropy": 2.429234504699707, "epoch": 0.8620577146171694, "grad_norm": 0.026754779741168022, "grad_norm_var": 7.582478987822831e-07, "learning_rate": 0.00048095455589386774, "loss": 2.453, "step": 23779 }, { "crossentropy": 2.5450029373168945, "epoch": 0.8620939675174014, "grad_norm": 0.026998165994882584, "grad_norm_var": 7.213506972830741e-07, "learning_rate": 0.00048070591811834985, "loss": 2.531, "step": 23780 }, { "crossentropy": 2.5652778148651123, "epoch": 0.8621302204176334, "grad_norm": 0.02659306302666664, "grad_norm_var": 6.66891834200968e-07, "learning_rate": 0.00048045734138275675, "loss": 2.5639, "step": 23781 }, { "crossentropy": 2.4569387435913086, "epoch": 0.8621664733178654, "grad_norm": 0.026223214343190193, "grad_norm_var": 6.810045967220804e-07, "learning_rate": 0.0004802088256904452, "loss": 2.4792, "step": 23782 }, { "crossentropy": 2.253563404083252, "epoch": 0.8622027262180975, "grad_norm": 0.026829112321138382, "grad_norm_var": 6.327531999555791e-07, "learning_rate": 0.00047996037104477306, "loss": 2.2375, "step": 23783 }, { "crossentropy": 2.403388261795044, "epoch": 0.8622389791183295, "grad_norm": 0.026244137436151505, "grad_norm_var": 6.567745368828149e-07, "learning_rate": 0.00047971197744909434, "loss": 2.3573, "step": 23784 }, { "crossentropy": 2.5025463104248047, "epoch": 0.8622752320185615, "grad_norm": 0.026814794167876244, "grad_norm_var": 6.563828692402254e-07, "learning_rate": 0.00047946364490676575, "loss": 2.4782, "step": 23785 }, { "crossentropy": 2.382025718688965, "epoch": 0.8623114849187935, "grad_norm": 0.026192909106612206, "grad_norm_var": 6.092241498836196e-07, "learning_rate": 0.0004792153734211413, "loss": 2.4338, "step": 23786 }, { "crossentropy": 2.511394739151001, "epoch": 0.8623477378190255, "grad_norm": 0.026554923504590988, "grad_norm_var": 6.130829071308013e-07, "learning_rate": 0.0004789671629955733, "loss": 2.4042, "step": 23787 }, { "crossentropy": 2.4724321365356445, "epoch": 0.8623839907192575, "grad_norm": 0.02635882794857025, "grad_norm_var": 3.3632145963791564e-07, "learning_rate": 0.0004787190136334152, "loss": 2.3993, "step": 23788 }, { "crossentropy": 2.5413856506347656, "epoch": 0.8624202436194895, "grad_norm": 0.02614615485072136, "grad_norm_var": 3.5205066696203785e-07, "learning_rate": 0.00047847092533801817, "loss": 2.4973, "step": 23789 }, { "crossentropy": 2.282963275909424, "epoch": 0.8624564965197216, "grad_norm": 0.02680326998233795, "grad_norm_var": 2.0618438168404363e-07, "learning_rate": 0.00047822289811273234, "loss": 2.3662, "step": 23790 }, { "crossentropy": 2.479207754135132, "epoch": 0.8624927494199536, "grad_norm": 0.026081403717398643, "grad_norm_var": 2.2014856408875995e-07, "learning_rate": 0.00047797493196090893, "loss": 2.4884, "step": 23791 }, { "crossentropy": 2.4300384521484375, "epoch": 0.8625290023201856, "grad_norm": 0.02566307783126831, "grad_norm_var": 1.474017806305048e-07, "learning_rate": 0.0004777270268858952, "loss": 2.4395, "step": 23792 }, { "crossentropy": 2.496948719024658, "epoch": 0.8625652552204176, "grad_norm": 0.02589344047009945, "grad_norm_var": 1.4195854532391728e-07, "learning_rate": 0.0004774791828910413, "loss": 2.4596, "step": 23793 }, { "crossentropy": 2.306670665740967, "epoch": 0.8626015081206496, "grad_norm": 0.027020534500479698, "grad_norm_var": 1.6515483244176379e-07, "learning_rate": 0.0004772313999796951, "loss": 2.3748, "step": 23794 }, { "crossentropy": 2.514153480529785, "epoch": 0.8626377610208816, "grad_norm": 0.026767276227474213, "grad_norm_var": 1.65675351768609e-07, "learning_rate": 0.00047698367815520115, "loss": 2.5731, "step": 23795 }, { "crossentropy": 2.3019564151763916, "epoch": 0.8626740139211136, "grad_norm": 0.025743447244167328, "grad_norm_var": 1.722003089935434e-07, "learning_rate": 0.0004767360174209079, "loss": 2.3289, "step": 23796 }, { "crossentropy": 2.4190127849578857, "epoch": 0.8627102668213457, "grad_norm": 0.027326814830303192, "grad_norm_var": 2.276142252226332e-07, "learning_rate": 0.0004764884177801582, "loss": 2.4592, "step": 23797 }, { "crossentropy": 2.31402587890625, "epoch": 0.8627465197215777, "grad_norm": 0.02582191303372383, "grad_norm_var": 2.4801928925863244e-07, "learning_rate": 0.00047624087923629765, "loss": 2.4312, "step": 23798 }, { "crossentropy": 2.3087844848632812, "epoch": 0.8627827726218097, "grad_norm": 0.027286387979984283, "grad_norm_var": 2.8777684479053674e-07, "learning_rate": 0.0004759934017926687, "loss": 2.4082, "step": 23799 }, { "crossentropy": 2.4280261993408203, "epoch": 0.8628190255220418, "grad_norm": 0.02707955799996853, "grad_norm_var": 3.118128716816012e-07, "learning_rate": 0.00047574598545261417, "loss": 2.3925, "step": 23800 }, { "crossentropy": 2.4079487323760986, "epoch": 0.8628552784222738, "grad_norm": 0.025878293439745903, "grad_norm_var": 3.2384520605929776e-07, "learning_rate": 0.00047549863021947527, "loss": 2.4923, "step": 23801 }, { "crossentropy": 2.395688533782959, "epoch": 0.8628915313225058, "grad_norm": 0.027097627520561218, "grad_norm_var": 3.483759005387058e-07, "learning_rate": 0.0004752513360965949, "loss": 2.46, "step": 23802 }, { "crossentropy": 2.4346656799316406, "epoch": 0.8629277842227379, "grad_norm": 0.02664324641227722, "grad_norm_var": 3.498613790868608e-07, "learning_rate": 0.0004750041030873109, "loss": 2.4585, "step": 23803 }, { "crossentropy": 2.4532148838043213, "epoch": 0.8629640371229699, "grad_norm": 0.0257594995200634, "grad_norm_var": 3.816507080001922e-07, "learning_rate": 0.00047475693119496397, "loss": 2.3874, "step": 23804 }, { "crossentropy": 2.433466911315918, "epoch": 0.8630002900232019, "grad_norm": 0.027077188715338707, "grad_norm_var": 3.995675705187676e-07, "learning_rate": 0.00047450982042289094, "loss": 2.4231, "step": 23805 }, { "crossentropy": 2.384275197982788, "epoch": 0.8630365429234339, "grad_norm": 0.025951988995075226, "grad_norm_var": 4.100331275504072e-07, "learning_rate": 0.0004742627707744318, "loss": 2.3782, "step": 23806 }, { "crossentropy": 2.5568575859069824, "epoch": 0.8630727958236659, "grad_norm": 0.026927899569272995, "grad_norm_var": 4.139797728798609e-07, "learning_rate": 0.0004740157822529217, "loss": 2.4652, "step": 23807 }, { "crossentropy": 2.4308812618255615, "epoch": 0.8631090487238979, "grad_norm": 0.026303069666028023, "grad_norm_var": 3.6849229867134265e-07, "learning_rate": 0.0004737688548616964, "loss": 2.4285, "step": 23808 }, { "crossentropy": 2.4688704013824463, "epoch": 0.86314530162413, "grad_norm": 0.02631889469921589, "grad_norm_var": 3.4334713159486773e-07, "learning_rate": 0.00047352198860409146, "loss": 2.4997, "step": 23809 }, { "crossentropy": 2.4265010356903076, "epoch": 0.863181554524362, "grad_norm": 0.02647162601351738, "grad_norm_var": 3.286725297600105e-07, "learning_rate": 0.00047327518348344203, "loss": 2.4232, "step": 23810 }, { "crossentropy": 2.4094808101654053, "epoch": 0.863217807424594, "grad_norm": 0.02548408880829811, "grad_norm_var": 3.9071697122103284e-07, "learning_rate": 0.0004730284395030793, "loss": 2.3924, "step": 23811 }, { "crossentropy": 2.361642837524414, "epoch": 0.863254060324826, "grad_norm": 0.026658736169338226, "grad_norm_var": 3.5706697186000703e-07, "learning_rate": 0.00047278175666633893, "loss": 2.4395, "step": 23812 }, { "crossentropy": 2.5072665214538574, "epoch": 0.863290313225058, "grad_norm": 0.034967001527547836, "grad_norm_var": 4.8420860324238035e-06, "learning_rate": 0.00047253513497655055, "loss": 2.4312, "step": 23813 }, { "crossentropy": 2.3433122634887695, "epoch": 0.86332656612529, "grad_norm": 0.025679487735033035, "grad_norm_var": 4.865401767453975e-06, "learning_rate": 0.0004722885744370453, "loss": 2.4005, "step": 23814 }, { "crossentropy": 2.41036319732666, "epoch": 0.863362819025522, "grad_norm": 0.027005406096577644, "grad_norm_var": 4.858634204566124e-06, "learning_rate": 0.00047204207505115614, "loss": 2.4234, "step": 23815 }, { "crossentropy": 2.2312233448028564, "epoch": 0.863399071925754, "grad_norm": 0.026438521221280098, "grad_norm_var": 4.873797185581524e-06, "learning_rate": 0.00047179563682220814, "loss": 2.3789, "step": 23816 }, { "crossentropy": 2.462871789932251, "epoch": 0.8634353248259861, "grad_norm": 0.025893840938806534, "grad_norm_var": 4.871660275790436e-06, "learning_rate": 0.0004715492597535315, "loss": 2.44, "step": 23817 }, { "crossentropy": 2.5406830310821533, "epoch": 0.8634715777262181, "grad_norm": 0.02687443420290947, "grad_norm_var": 4.869409804849585e-06, "learning_rate": 0.0004713029438484556, "loss": 2.6043, "step": 23818 }, { "crossentropy": 2.392202854156494, "epoch": 0.8635078306264501, "grad_norm": 0.028721701353788376, "grad_norm_var": 5.067303385973308e-06, "learning_rate": 0.0004710566891103052, "loss": 2.3572, "step": 23819 }, { "crossentropy": 2.5153627395629883, "epoch": 0.8635440835266821, "grad_norm": 0.025460798293352127, "grad_norm_var": 5.123612679605926e-06, "learning_rate": 0.00047081049554240805, "loss": 2.452, "step": 23820 }, { "crossentropy": 2.5526299476623535, "epoch": 0.8635803364269141, "grad_norm": 0.02617710456252098, "grad_norm_var": 5.166743929931513e-06, "learning_rate": 0.00047056436314808715, "loss": 2.45, "step": 23821 }, { "crossentropy": 2.4671239852905273, "epoch": 0.8636165893271461, "grad_norm": 0.027859020978212357, "grad_norm_var": 5.138137877155833e-06, "learning_rate": 0.00047031829193066866, "loss": 2.5443, "step": 23822 }, { "crossentropy": 2.417262315750122, "epoch": 0.8636528422273781, "grad_norm": 0.025794662535190582, "grad_norm_var": 5.2410217900466604e-06, "learning_rate": 0.0004700722818934772, "loss": 2.444, "step": 23823 }, { "crossentropy": 2.5416202545166016, "epoch": 0.8636890951276102, "grad_norm": 0.027315234765410423, "grad_norm_var": 5.210082923429576e-06, "learning_rate": 0.00046982633303983227, "loss": 2.5381, "step": 23824 }, { "crossentropy": 2.4701879024505615, "epoch": 0.8637253480278422, "grad_norm": 0.027060557156801224, "grad_norm_var": 5.170182865907299e-06, "learning_rate": 0.0004695804453730573, "loss": 2.5391, "step": 23825 }, { "crossentropy": 2.3175361156463623, "epoch": 0.8637616009280742, "grad_norm": 0.029273102059960365, "grad_norm_var": 5.41986124834992e-06, "learning_rate": 0.0004693346188964748, "loss": 2.372, "step": 23826 }, { "crossentropy": 2.524599075317383, "epoch": 0.8637978538283063, "grad_norm": 0.0280452910810709, "grad_norm_var": 5.212633082756615e-06, "learning_rate": 0.0004690888536134025, "loss": 2.404, "step": 23827 }, { "crossentropy": 2.535468578338623, "epoch": 0.8638341067285383, "grad_norm": 0.026465198025107384, "grad_norm_var": 5.235432937462e-06, "learning_rate": 0.00046884314952716223, "loss": 2.5085, "step": 23828 }, { "crossentropy": 2.3776376247406006, "epoch": 0.8638703596287703, "grad_norm": 0.02695149928331375, "grad_norm_var": 1.206013569884865e-06, "learning_rate": 0.0004685975066410708, "loss": 2.3446, "step": 23829 }, { "crossentropy": 2.4216561317443848, "epoch": 0.8639066125290024, "grad_norm": 0.025888128206133842, "grad_norm_var": 1.1737103690707585e-06, "learning_rate": 0.00046835192495844614, "loss": 2.4184, "step": 23830 }, { "crossentropy": 2.384904146194458, "epoch": 0.8639428654292344, "grad_norm": 0.02582593262195587, "grad_norm_var": 1.2521852043802123e-06, "learning_rate": 0.0004681064044826083, "loss": 2.4503, "step": 23831 }, { "crossentropy": 2.421842098236084, "epoch": 0.8639791183294664, "grad_norm": 0.02658492513000965, "grad_norm_var": 1.2449496079179495e-06, "learning_rate": 0.0004678609452168686, "loss": 2.3511, "step": 23832 }, { "crossentropy": 2.541560649871826, "epoch": 0.8640153712296984, "grad_norm": 0.025749055668711662, "grad_norm_var": 1.2654317354210728e-06, "learning_rate": 0.00046761554716454436, "loss": 2.4497, "step": 23833 }, { "crossentropy": 2.333897113800049, "epoch": 0.8640516241299304, "grad_norm": 0.02556716836988926, "grad_norm_var": 1.3728475076287804e-06, "learning_rate": 0.0004673702103289518, "loss": 2.2546, "step": 23834 }, { "crossentropy": 2.3935492038726807, "epoch": 0.8640878770301624, "grad_norm": 0.02581859566271305, "grad_norm_var": 1.1542787552321626e-06, "learning_rate": 0.00046712493471340154, "loss": 2.4796, "step": 23835 }, { "crossentropy": 2.2295074462890625, "epoch": 0.8641241299303944, "grad_norm": 0.026983896270394325, "grad_norm_var": 1.0649202865563775e-06, "learning_rate": 0.0004668797203212094, "loss": 2.3127, "step": 23836 }, { "crossentropy": 2.492433547973633, "epoch": 0.8641603828306265, "grad_norm": 0.027533892542123795, "grad_norm_var": 1.0835785009685276e-06, "learning_rate": 0.0004666345671556849, "loss": 2.473, "step": 23837 }, { "crossentropy": 2.3528740406036377, "epoch": 0.8641966357308585, "grad_norm": 0.026126570999622345, "grad_norm_var": 1.0253277512865694e-06, "learning_rate": 0.00046638947522013977, "loss": 2.346, "step": 23838 }, { "crossentropy": 2.3737547397613525, "epoch": 0.8642328886310905, "grad_norm": 0.026070071384310722, "grad_norm_var": 9.9731971915395e-07, "learning_rate": 0.00046614444451788587, "loss": 2.4591, "step": 23839 }, { "crossentropy": 2.358703851699829, "epoch": 0.8642691415313225, "grad_norm": 0.027438215911388397, "grad_norm_var": 1.0082927074087436e-06, "learning_rate": 0.0004658994750522322, "loss": 2.5728, "step": 23840 }, { "crossentropy": 2.4960052967071533, "epoch": 0.8643053944315545, "grad_norm": 0.02798055112361908, "grad_norm_var": 1.1040239744135779e-06, "learning_rate": 0.00046565456682648497, "loss": 2.51, "step": 23841 }, { "crossentropy": 2.599600076675415, "epoch": 0.8643416473317865, "grad_norm": 0.02659931778907776, "grad_norm_var": 6.580778320933265e-07, "learning_rate": 0.0004654097198439555, "loss": 2.5411, "step": 23842 }, { "crossentropy": 2.3080434799194336, "epoch": 0.8643779002320185, "grad_norm": 0.02684127166867256, "grad_norm_var": 5.169446668692087e-07, "learning_rate": 0.00046516493410794833, "loss": 2.3264, "step": 23843 }, { "crossentropy": 2.423456907272339, "epoch": 0.8644141531322506, "grad_norm": 0.027233432978391647, "grad_norm_var": 5.475501224181383e-07, "learning_rate": 0.0004649202096217714, "loss": 2.4516, "step": 23844 }, { "crossentropy": 2.4955875873565674, "epoch": 0.8644504060324826, "grad_norm": 0.025959398597478867, "grad_norm_var": 5.592014539103439e-07, "learning_rate": 0.0004646755463887281, "loss": 2.482, "step": 23845 }, { "crossentropy": 2.4891910552978516, "epoch": 0.8644866589327146, "grad_norm": 0.0261892918497324, "grad_norm_var": 5.39797364155917e-07, "learning_rate": 0.0004644309444121242, "loss": 2.4207, "step": 23846 }, { "crossentropy": 2.5867509841918945, "epoch": 0.8645229118329466, "grad_norm": 0.02679390273988247, "grad_norm_var": 5.073147816803472e-07, "learning_rate": 0.0004641864036952648, "loss": 2.5129, "step": 23847 }, { "crossentropy": 2.5170860290527344, "epoch": 0.8645591647331786, "grad_norm": 0.026773232966661453, "grad_norm_var": 5.093572189573351e-07, "learning_rate": 0.0004639419242414511, "loss": 2.5203, "step": 23848 }, { "crossentropy": 2.4514267444610596, "epoch": 0.8645954176334106, "grad_norm": 0.026721814647316933, "grad_norm_var": 4.5766088755797885e-07, "learning_rate": 0.0004636975060539839, "loss": 2.3953, "step": 23849 }, { "crossentropy": 2.3383898735046387, "epoch": 0.8646316705336426, "grad_norm": 0.025777975097298622, "grad_norm_var": 4.2959745079767093e-07, "learning_rate": 0.00046345314913616767, "loss": 2.3843, "step": 23850 }, { "crossentropy": 2.5559558868408203, "epoch": 0.8646679234338747, "grad_norm": 0.02608717791736126, "grad_norm_var": 4.033445767990014e-07, "learning_rate": 0.0004632088534912998, "loss": 2.4418, "step": 23851 }, { "crossentropy": 2.416722536087036, "epoch": 0.8647041763341067, "grad_norm": 0.02598819136619568, "grad_norm_var": 4.2687192981684614e-07, "learning_rate": 0.00046296461912268217, "loss": 2.4207, "step": 23852 }, { "crossentropy": 2.5117568969726562, "epoch": 0.8647404292343387, "grad_norm": 0.026420651003718376, "grad_norm_var": 3.704801271796661e-07, "learning_rate": 0.00046272044603361154, "loss": 2.4259, "step": 23853 }, { "crossentropy": 2.3327016830444336, "epoch": 0.8647766821345708, "grad_norm": 0.02636869251728058, "grad_norm_var": 3.600688602263622e-07, "learning_rate": 0.00046247633422738743, "loss": 2.3446, "step": 23854 }, { "crossentropy": 2.423083782196045, "epoch": 0.8648129350348028, "grad_norm": 0.026120418682694435, "grad_norm_var": 3.568195955097001e-07, "learning_rate": 0.00046223228370730675, "loss": 2.464, "step": 23855 }, { "crossentropy": 2.515291690826416, "epoch": 0.8648491879350348, "grad_norm": 0.026403164491057396, "grad_norm_var": 3.0545489213273936e-07, "learning_rate": 0.00046198829447666455, "loss": 2.3928, "step": 23856 }, { "crossentropy": 2.3707709312438965, "epoch": 0.8648854408352669, "grad_norm": 0.026124361902475357, "grad_norm_var": 1.5836870441519185e-07, "learning_rate": 0.0004617443665387577, "loss": 2.4074, "step": 23857 }, { "crossentropy": 2.3883144855499268, "epoch": 0.8649216937354989, "grad_norm": 0.026839347556233406, "grad_norm_var": 1.683439648186933e-07, "learning_rate": 0.00046150049989688083, "loss": 2.4565, "step": 23858 }, { "crossentropy": 2.531486749649048, "epoch": 0.8649579466357309, "grad_norm": 0.028211098164319992, "grad_norm_var": 3.6344971595541654e-07, "learning_rate": 0.00046125669455432617, "loss": 2.4718, "step": 23859 }, { "crossentropy": 2.529027223587036, "epoch": 0.8649941995359629, "grad_norm": 0.027192091569304466, "grad_norm_var": 3.5951790176037374e-07, "learning_rate": 0.00046101295051438786, "loss": 2.4557, "step": 23860 }, { "crossentropy": 2.4747753143310547, "epoch": 0.8650304524361949, "grad_norm": 0.026513801887631416, "grad_norm_var": 3.3890145060091505e-07, "learning_rate": 0.00046076926778035765, "loss": 2.4635, "step": 23861 }, { "crossentropy": 2.4733293056488037, "epoch": 0.8650667053364269, "grad_norm": 0.026276111602783203, "grad_norm_var": 3.353958161209439e-07, "learning_rate": 0.00046052564635552687, "loss": 2.538, "step": 23862 }, { "crossentropy": 2.2839531898498535, "epoch": 0.865102958236659, "grad_norm": 0.025851381942629814, "grad_norm_var": 3.5878993831639273e-07, "learning_rate": 0.0004602820862431867, "loss": 2.4104, "step": 23863 }, { "crossentropy": 2.486595392227173, "epoch": 0.865139211136891, "grad_norm": 0.026780059561133385, "grad_norm_var": 3.590603518753421e-07, "learning_rate": 0.00046003858744662565, "loss": 2.4516, "step": 23864 }, { "crossentropy": 2.474365711212158, "epoch": 0.865175464037123, "grad_norm": 0.026744777336716652, "grad_norm_var": 3.598343695255472e-07, "learning_rate": 0.00045979514996913383, "loss": 2.4794, "step": 23865 }, { "crossentropy": 2.5566000938415527, "epoch": 0.865211716937355, "grad_norm": 0.027030933648347855, "grad_norm_var": 3.404708149035614e-07, "learning_rate": 0.0004595517738139987, "loss": 2.4372, "step": 23866 }, { "crossentropy": 2.531416893005371, "epoch": 0.865247969837587, "grad_norm": 0.02585521899163723, "grad_norm_var": 3.584420383733374e-07, "learning_rate": 0.0004593084589845059, "loss": 2.4909, "step": 23867 }, { "crossentropy": 2.3538522720336914, "epoch": 0.865284222737819, "grad_norm": 0.025783756747841835, "grad_norm_var": 3.762321092292403e-07, "learning_rate": 0.00045906520548394446, "loss": 2.3431, "step": 23868 }, { "crossentropy": 2.526777744293213, "epoch": 0.865320475638051, "grad_norm": 0.026170609518885612, "grad_norm_var": 3.838599637022982e-07, "learning_rate": 0.0004588220133155974, "loss": 2.5718, "step": 23869 }, { "crossentropy": 2.439110517501831, "epoch": 0.865356728538283, "grad_norm": 0.02662450261414051, "grad_norm_var": 3.829045766914318e-07, "learning_rate": 0.0004585788824827508, "loss": 2.3754, "step": 23870 }, { "crossentropy": 2.31364107131958, "epoch": 0.8653929814385151, "grad_norm": 0.027193620800971985, "grad_norm_var": 3.9590891190362187e-07, "learning_rate": 0.0004583358129886889, "loss": 2.3674, "step": 23871 }, { "crossentropy": 2.5906784534454346, "epoch": 0.8654292343387471, "grad_norm": 0.026820305734872818, "grad_norm_var": 3.9585451963817614e-07, "learning_rate": 0.00045809280483669333, "loss": 2.5488, "step": 23872 }, { "crossentropy": 2.4586145877838135, "epoch": 0.8654654872389791, "grad_norm": 0.026702411472797394, "grad_norm_var": 3.7809482636748615e-07, "learning_rate": 0.00045784985803004776, "loss": 2.448, "step": 23873 }, { "crossentropy": 2.285006284713745, "epoch": 0.8655017401392111, "grad_norm": 0.025799192488193512, "grad_norm_var": 4.2110204221481166e-07, "learning_rate": 0.0004576069725720328, "loss": 2.3421, "step": 23874 }, { "crossentropy": 2.294187068939209, "epoch": 0.8655379930394431, "grad_norm": 0.02576453611254692, "grad_norm_var": 2.6863064048510073e-07, "learning_rate": 0.00045736414846592855, "loss": 2.2997, "step": 23875 }, { "crossentropy": 2.398876428604126, "epoch": 0.8655742459396751, "grad_norm": 0.026005161926150322, "grad_norm_var": 2.3828301808173114e-07, "learning_rate": 0.00045712138571501527, "loss": 2.4144, "step": 23876 }, { "crossentropy": 2.4282686710357666, "epoch": 0.8656104988399071, "grad_norm": 0.02492283470928669, "grad_norm_var": 3.6592904626244795e-07, "learning_rate": 0.0004568786843225714, "loss": 2.4228, "step": 23877 }, { "crossentropy": 2.438504219055176, "epoch": 0.8656467517401392, "grad_norm": 0.02613266184926033, "grad_norm_var": 3.6710473978056726e-07, "learning_rate": 0.00045663604429187546, "loss": 2.4306, "step": 23878 }, { "crossentropy": 2.541085958480835, "epoch": 0.8656830046403712, "grad_norm": 0.026252707466483116, "grad_norm_var": 3.5523248418007205e-07, "learning_rate": 0.0004563934656262048, "loss": 2.5194, "step": 23879 }, { "crossentropy": 2.452446460723877, "epoch": 0.8657192575406032, "grad_norm": 0.027163591235876083, "grad_norm_var": 3.8966771438390253e-07, "learning_rate": 0.00045615094832883506, "loss": 2.5019, "step": 23880 }, { "crossentropy": 2.594592332839966, "epoch": 0.8657555104408353, "grad_norm": 0.027213482186198235, "grad_norm_var": 4.305422968593764e-07, "learning_rate": 0.00045590849240304324, "loss": 2.589, "step": 23881 }, { "crossentropy": 2.532297134399414, "epoch": 0.8657917633410673, "grad_norm": 0.026643922552466393, "grad_norm_var": 4.042357734584106e-07, "learning_rate": 0.00045566609785210324, "loss": 2.5636, "step": 23882 }, { "crossentropy": 2.440239429473877, "epoch": 0.8658280162412993, "grad_norm": 0.027669956907629967, "grad_norm_var": 4.986856506867249e-07, "learning_rate": 0.00045542376467928804, "loss": 2.4456, "step": 23883 }, { "crossentropy": 2.4320390224456787, "epoch": 0.8658642691415314, "grad_norm": 0.027565589174628258, "grad_norm_var": 5.438342677802626e-07, "learning_rate": 0.00045518149288787226, "loss": 2.3973, "step": 23884 }, { "crossentropy": 2.405812978744507, "epoch": 0.8659005220417634, "grad_norm": 0.027354326099157333, "grad_norm_var": 5.730576644791611e-07, "learning_rate": 0.000454939282481126, "loss": 2.3479, "step": 23885 }, { "crossentropy": 2.3511598110198975, "epoch": 0.8659367749419954, "grad_norm": 0.026897473260760307, "grad_norm_var": 5.780860529963628e-07, "learning_rate": 0.0004546971334623229, "loss": 2.3939, "step": 23886 }, { "crossentropy": 2.476325511932373, "epoch": 0.8659730278422274, "grad_norm": 0.026198625564575195, "grad_norm_var": 5.653692297750516e-07, "learning_rate": 0.00045445504583473297, "loss": 2.4979, "step": 23887 }, { "crossentropy": 2.5169146060943604, "epoch": 0.8660092807424594, "grad_norm": 0.02764005772769451, "grad_norm_var": 6.348176141596304e-07, "learning_rate": 0.0004542130196016253, "loss": 2.4763, "step": 23888 }, { "crossentropy": 2.5512096881866455, "epoch": 0.8660455336426914, "grad_norm": 0.027078570798039436, "grad_norm_var": 6.477739449292988e-07, "learning_rate": 0.0004539710547662701, "loss": 2.5263, "step": 23889 }, { "crossentropy": 2.37148380279541, "epoch": 0.8660817865429234, "grad_norm": 0.025854889303445816, "grad_norm_var": 6.416946913228212e-07, "learning_rate": 0.0004537291513319347, "loss": 2.3987, "step": 23890 }, { "crossentropy": 2.5289337635040283, "epoch": 0.8661180394431555, "grad_norm": 0.026974665001034737, "grad_norm_var": 5.907700231361773e-07, "learning_rate": 0.0004534873093018854, "loss": 2.4775, "step": 23891 }, { "crossentropy": 2.5821170806884766, "epoch": 0.8661542923433875, "grad_norm": 0.02619752287864685, "grad_norm_var": 5.746706652841327e-07, "learning_rate": 0.0004532455286793896, "loss": 2.5104, "step": 23892 }, { "crossentropy": 2.3082120418548584, "epoch": 0.8661905452436195, "grad_norm": 0.026253309100866318, "grad_norm_var": 3.6382414154340475e-07, "learning_rate": 0.00045300380946771356, "loss": 2.3627, "step": 23893 }, { "crossentropy": 2.389718770980835, "epoch": 0.8662267981438515, "grad_norm": 0.026410333812236786, "grad_norm_var": 3.432620194632746e-07, "learning_rate": 0.0004527621516701208, "loss": 2.4222, "step": 23894 }, { "crossentropy": 2.2977828979492188, "epoch": 0.8662630510440835, "grad_norm": 0.026857692748308182, "grad_norm_var": 3.191215237123452e-07, "learning_rate": 0.00045252055528987646, "loss": 2.4209, "step": 23895 }, { "crossentropy": 2.6110219955444336, "epoch": 0.8662993039443155, "grad_norm": 0.026947811245918274, "grad_norm_var": 3.1368188728892834e-07, "learning_rate": 0.00045227902033024236, "loss": 2.5437, "step": 23896 }, { "crossentropy": 2.4459011554718018, "epoch": 0.8663355568445475, "grad_norm": 0.026192616671323776, "grad_norm_var": 3.3068784568237157e-07, "learning_rate": 0.0004520375467944815, "loss": 2.4709, "step": 23897 }, { "crossentropy": 2.496781587600708, "epoch": 0.8663718097447796, "grad_norm": 0.027183398604393005, "grad_norm_var": 3.37932417354229e-07, "learning_rate": 0.00045179613468585624, "loss": 2.458, "step": 23898 }, { "crossentropy": 2.4684407711029053, "epoch": 0.8664080626450116, "grad_norm": 0.026633765548467636, "grad_norm_var": 2.889634261379516e-07, "learning_rate": 0.0004515547840076262, "loss": 2.4075, "step": 23899 }, { "crossentropy": 2.3883323669433594, "epoch": 0.8664443155452436, "grad_norm": 0.02612454816699028, "grad_norm_var": 2.649344072138659e-07, "learning_rate": 0.00045131349476305096, "loss": 2.4503, "step": 23900 }, { "crossentropy": 2.5157651901245117, "epoch": 0.8664805684454756, "grad_norm": 0.02693122625350952, "grad_norm_var": 2.3779832667991662e-07, "learning_rate": 0.00045107226695539015, "loss": 2.4236, "step": 23901 }, { "crossentropy": 2.516078233718872, "epoch": 0.8665168213457076, "grad_norm": 0.025311125442385674, "grad_norm_var": 3.4242514553306514e-07, "learning_rate": 0.00045083110058790065, "loss": 2.3783, "step": 23902 }, { "crossentropy": 2.324812412261963, "epoch": 0.8665530742459396, "grad_norm": 0.025502044707536697, "grad_norm_var": 4.053293356962376e-07, "learning_rate": 0.00045058999566384206, "loss": 2.3571, "step": 23903 }, { "crossentropy": 2.3796193599700928, "epoch": 0.8665893271461717, "grad_norm": 0.026886286213994026, "grad_norm_var": 3.268487859010849e-07, "learning_rate": 0.00045034895218646844, "loss": 2.4551, "step": 23904 }, { "crossentropy": 2.3233911991119385, "epoch": 0.8666255800464037, "grad_norm": 0.02584017626941204, "grad_norm_var": 3.203537439904735e-07, "learning_rate": 0.0004501079701590366, "loss": 2.4786, "step": 23905 }, { "crossentropy": 2.516918659210205, "epoch": 0.8666618329466357, "grad_norm": 0.10536330938339233, "grad_norm_var": 0.0003898387078746892, "learning_rate": 0.0004498670495848023, "loss": 2.51, "step": 23906 }, { "crossentropy": 2.3780910968780518, "epoch": 0.8666980858468677, "grad_norm": 0.026524316519498825, "grad_norm_var": 0.00039011414402949883, "learning_rate": 0.00044962619046701847, "loss": 2.3974, "step": 23907 }, { "crossentropy": 2.380119800567627, "epoch": 0.8667343387470998, "grad_norm": 0.026050109416246414, "grad_norm_var": 0.00039021623364860245, "learning_rate": 0.0004493853928089375, "loss": 2.3802, "step": 23908 }, { "crossentropy": 2.2247183322906494, "epoch": 0.8667705916473318, "grad_norm": 0.02849012240767479, "grad_norm_var": 0.00038901985495112904, "learning_rate": 0.0004491446566138135, "loss": 2.2794, "step": 23909 }, { "crossentropy": 2.6008975505828857, "epoch": 0.8668068445475638, "grad_norm": 0.02655990608036518, "grad_norm_var": 0.0003889206863555934, "learning_rate": 0.0004489039818848961, "loss": 2.4844, "step": 23910 }, { "crossentropy": 2.5616111755371094, "epoch": 0.8668430974477959, "grad_norm": 0.026067886501550674, "grad_norm_var": 0.00038944458404819004, "learning_rate": 0.00044866336862543765, "loss": 2.4652, "step": 23911 }, { "crossentropy": 2.5463333129882812, "epoch": 0.8668793503480279, "grad_norm": 0.026641661301255226, "grad_norm_var": 0.00038963271266383967, "learning_rate": 0.00044842281683868667, "loss": 2.476, "step": 23912 }, { "crossentropy": 2.4541783332824707, "epoch": 0.8669156032482599, "grad_norm": 0.02717822603881359, "grad_norm_var": 0.0003890099014845558, "learning_rate": 0.0004481823265278928, "loss": 2.5053, "step": 23913 }, { "crossentropy": 2.4869775772094727, "epoch": 0.8669518561484919, "grad_norm": 0.026770804077386856, "grad_norm_var": 0.00038925556091968356, "learning_rate": 0.0004479418976963057, "loss": 2.4626, "step": 23914 }, { "crossentropy": 2.519284725189209, "epoch": 0.8669881090487239, "grad_norm": 0.025794126093387604, "grad_norm_var": 0.000389836539375211, "learning_rate": 0.00044770153034716934, "loss": 2.4513, "step": 23915 }, { "crossentropy": 2.5353317260742188, "epoch": 0.8670243619489559, "grad_norm": 0.027215223759412766, "grad_norm_var": 0.00038914702301600464, "learning_rate": 0.0004474612244837339, "loss": 2.4857, "step": 23916 }, { "crossentropy": 2.573120594024658, "epoch": 0.867060614849188, "grad_norm": 0.027167344465851784, "grad_norm_var": 0.0003890083900567146, "learning_rate": 0.0004472209801092431, "loss": 2.6162, "step": 23917 }, { "crossentropy": 2.4405837059020996, "epoch": 0.86709686774942, "grad_norm": 0.026984473690390587, "grad_norm_var": 0.00038781146427816756, "learning_rate": 0.00044698079722694075, "loss": 2.4325, "step": 23918 }, { "crossentropy": 2.4275853633880615, "epoch": 0.867133120649652, "grad_norm": 0.02759263664484024, "grad_norm_var": 0.00038639467252511706, "learning_rate": 0.00044674067584007384, "loss": 2.5036, "step": 23919 }, { "crossentropy": 2.541642665863037, "epoch": 0.867169373549884, "grad_norm": 0.028109179809689522, "grad_norm_var": 0.00038570399926663454, "learning_rate": 0.0004465006159518825, "loss": 2.5045, "step": 23920 }, { "crossentropy": 2.2649877071380615, "epoch": 0.867205626450116, "grad_norm": 0.0269466582685709, "grad_norm_var": 0.00038490541370589436, "learning_rate": 0.00044626061756561033, "loss": 2.4378, "step": 23921 }, { "crossentropy": 2.4666080474853516, "epoch": 0.867241879350348, "grad_norm": 0.026362206786870956, "grad_norm_var": 5.329907933728732e-07, "learning_rate": 0.0004460206806845002, "loss": 2.4846, "step": 23922 }, { "crossentropy": 2.451094150543213, "epoch": 0.86727813225058, "grad_norm": 0.026596328243613243, "grad_norm_var": 5.296748157324407e-07, "learning_rate": 0.0004457808053117912, "loss": 2.4677, "step": 23923 }, { "crossentropy": 2.2856650352478027, "epoch": 0.867314385150812, "grad_norm": 0.026060005649924278, "grad_norm_var": 5.285490431836907e-07, "learning_rate": 0.0004455409914507241, "loss": 2.4108, "step": 23924 }, { "crossentropy": 2.4382357597351074, "epoch": 0.8673506380510441, "grad_norm": 0.027260661125183105, "grad_norm_var": 3.637581071094653e-07, "learning_rate": 0.00044530123910453744, "loss": 2.37, "step": 23925 }, { "crossentropy": 2.353944778442383, "epoch": 0.8673868909512761, "grad_norm": 0.025819048285484314, "grad_norm_var": 4.249113785502005e-07, "learning_rate": 0.00044506154827646916, "loss": 2.4074, "step": 23926 }, { "crossentropy": 2.352363109588623, "epoch": 0.8674231438515081, "grad_norm": 0.026489829644560814, "grad_norm_var": 3.95671727377667e-07, "learning_rate": 0.00044482191896975774, "loss": 2.3951, "step": 23927 }, { "crossentropy": 2.4959909915924072, "epoch": 0.8674593967517401, "grad_norm": 0.027539944276213646, "grad_norm_var": 4.257289437955506e-07, "learning_rate": 0.0004445823511876379, "loss": 2.3781, "step": 23928 }, { "crossentropy": 2.2260568141937256, "epoch": 0.8674956496519721, "grad_norm": 0.026399627327919006, "grad_norm_var": 4.3140343978786965e-07, "learning_rate": 0.0004443428449333464, "loss": 2.3435, "step": 23929 }, { "crossentropy": 2.451953649520874, "epoch": 0.8675319025522041, "grad_norm": 0.025289395824074745, "grad_norm_var": 5.781343942055089e-07, "learning_rate": 0.00044410340021011966, "loss": 2.387, "step": 23930 }, { "crossentropy": 2.3339264392852783, "epoch": 0.8675681554524362, "grad_norm": 0.027596181258559227, "grad_norm_var": 5.570314472716845e-07, "learning_rate": 0.0004438640170211894, "loss": 2.3822, "step": 23931 }, { "crossentropy": 2.585881233215332, "epoch": 0.8676044083526682, "grad_norm": 0.026677150279283524, "grad_norm_var": 5.48156445406324e-07, "learning_rate": 0.00044362469536979033, "loss": 2.555, "step": 23932 }, { "crossentropy": 2.4181578159332275, "epoch": 0.8676406612529002, "grad_norm": 0.026880595833063126, "grad_norm_var": 5.394674238764213e-07, "learning_rate": 0.0004433854352591543, "loss": 2.4142, "step": 23933 }, { "crossentropy": 2.4778494834899902, "epoch": 0.8676769141531323, "grad_norm": 0.02731127105653286, "grad_norm_var": 5.547142539700228e-07, "learning_rate": 0.0004431462366925121, "loss": 2.4942, "step": 23934 }, { "crossentropy": 2.3271775245666504, "epoch": 0.8677131670533643, "grad_norm": 0.02625623531639576, "grad_norm_var": 5.265556538833205e-07, "learning_rate": 0.00044290709967309604, "loss": 2.4003, "step": 23935 }, { "crossentropy": 2.496605634689331, "epoch": 0.8677494199535963, "grad_norm": 0.02718597836792469, "grad_norm_var": 4.093971814643956e-07, "learning_rate": 0.0004426680242041342, "loss": 2.5169, "step": 23936 }, { "crossentropy": 2.7466161251068115, "epoch": 0.8677856728538283, "grad_norm": 0.027117758989334106, "grad_norm_var": 4.1760811992096595e-07, "learning_rate": 0.00044242901028885727, "loss": 2.6456, "step": 23937 }, { "crossentropy": 2.532142400741577, "epoch": 0.8678219257540604, "grad_norm": 0.026689328253269196, "grad_norm_var": 4.105382108349065e-07, "learning_rate": 0.00044219005793049327, "loss": 2.4153, "step": 23938 }, { "crossentropy": 2.482637405395508, "epoch": 0.8678581786542924, "grad_norm": 0.02661629021167755, "grad_norm_var": 4.102922838676466e-07, "learning_rate": 0.00044195116713226847, "loss": 2.4199, "step": 23939 }, { "crossentropy": 2.444018840789795, "epoch": 0.8678944315545244, "grad_norm": 0.027360832318663597, "grad_norm_var": 4.0516473641573577e-07, "learning_rate": 0.00044171233789741136, "loss": 2.4989, "step": 23940 }, { "crossentropy": 2.3968896865844727, "epoch": 0.8679306844547564, "grad_norm": 0.02584315836429596, "grad_norm_var": 4.4002137821781925e-07, "learning_rate": 0.00044147357022914613, "loss": 2.3647, "step": 23941 }, { "crossentropy": 2.520198106765747, "epoch": 0.8679669373549884, "grad_norm": 0.027436228469014168, "grad_norm_var": 4.152380694934088e-07, "learning_rate": 0.0004412348641306979, "loss": 2.4587, "step": 23942 }, { "crossentropy": 2.3892149925231934, "epoch": 0.8680031902552204, "grad_norm": 0.02646660804748535, "grad_norm_var": 4.162108015771937e-07, "learning_rate": 0.00044099621960529134, "loss": 2.3889, "step": 23943 }, { "crossentropy": 2.5201098918914795, "epoch": 0.8680394431554525, "grad_norm": 0.026567965745925903, "grad_norm_var": 3.782818888631156e-07, "learning_rate": 0.00044075763665614887, "loss": 2.4522, "step": 23944 }, { "crossentropy": 2.3122594356536865, "epoch": 0.8680756960556845, "grad_norm": 0.025592926889657974, "grad_norm_var": 4.545878292212437e-07, "learning_rate": 0.0004405191152864929, "loss": 2.3596, "step": 23945 }, { "crossentropy": 2.4485840797424316, "epoch": 0.8681119489559165, "grad_norm": 0.026338739320635796, "grad_norm_var": 3.2877595283740434e-07, "learning_rate": 0.000440280655499547, "loss": 2.4268, "step": 23946 }, { "crossentropy": 2.4549779891967773, "epoch": 0.8681482018561485, "grad_norm": 0.027236346155405045, "grad_norm_var": 2.9608227051156064e-07, "learning_rate": 0.0004400422572985291, "loss": 2.4778, "step": 23947 }, { "crossentropy": 2.3319339752197266, "epoch": 0.8681844547563805, "grad_norm": 0.02659878320991993, "grad_norm_var": 2.9695133660575095e-07, "learning_rate": 0.0004398039206866616, "loss": 2.3815, "step": 23948 }, { "crossentropy": 2.548328161239624, "epoch": 0.8682207076566125, "grad_norm": 0.027527496218681335, "grad_norm_var": 3.3707123421945704e-07, "learning_rate": 0.00043956564566716264, "loss": 2.4871, "step": 23949 }, { "crossentropy": 2.383826732635498, "epoch": 0.8682569605568445, "grad_norm": 0.026063190773129463, "grad_norm_var": 3.425441926908305e-07, "learning_rate": 0.00043932743224324943, "loss": 2.3985, "step": 23950 }, { "crossentropy": 2.372403621673584, "epoch": 0.8682932134570766, "grad_norm": 0.026941029354929924, "grad_norm_var": 3.330589603786251e-07, "learning_rate": 0.0004390892804181412, "loss": 2.4251, "step": 23951 }, { "crossentropy": 2.4690210819244385, "epoch": 0.8683294663573086, "grad_norm": 0.026369238272309303, "grad_norm_var": 3.244325346112801e-07, "learning_rate": 0.0004388511901950526, "loss": 2.4061, "step": 23952 }, { "crossentropy": 2.349769115447998, "epoch": 0.8683657192575406, "grad_norm": 0.02685079351067543, "grad_norm_var": 3.130509460932124e-07, "learning_rate": 0.00043861316157720043, "loss": 2.4564, "step": 23953 }, { "crossentropy": 2.4139857292175293, "epoch": 0.8684019721577726, "grad_norm": 0.026925895363092422, "grad_norm_var": 3.175941193809025e-07, "learning_rate": 0.00043837519456780107, "loss": 2.4756, "step": 23954 }, { "crossentropy": 2.531233787536621, "epoch": 0.8684382250580046, "grad_norm": 0.02674553357064724, "grad_norm_var": 3.176958407474321e-07, "learning_rate": 0.0004381372891700658, "loss": 2.5011, "step": 23955 }, { "crossentropy": 2.3876545429229736, "epoch": 0.8684744779582366, "grad_norm": 0.026967540383338928, "grad_norm_var": 2.9161120542532614e-07, "learning_rate": 0.0004378994453872104, "loss": 2.4067, "step": 23956 }, { "crossentropy": 2.632077217102051, "epoch": 0.8685107308584686, "grad_norm": 0.026816096156835556, "grad_norm_var": 2.455271444018325e-07, "learning_rate": 0.00043766166322244506, "loss": 2.5342, "step": 23957 }, { "crossentropy": 2.358288288116455, "epoch": 0.8685469837587007, "grad_norm": 0.025923410430550575, "grad_norm_var": 2.431430846965097e-07, "learning_rate": 0.0004374239426789839, "loss": 2.4312, "step": 23958 }, { "crossentropy": 2.4986698627471924, "epoch": 0.8685832366589327, "grad_norm": 0.026365976780653, "grad_norm_var": 2.4584385946226024e-07, "learning_rate": 0.000437186283760036, "loss": 2.4578, "step": 23959 }, { "crossentropy": 2.820298910140991, "epoch": 0.8686194895591647, "grad_norm": 0.027247967198491096, "grad_norm_var": 2.705307494548758e-07, "learning_rate": 0.0004369486864688105, "loss": 2.6206, "step": 23960 }, { "crossentropy": 2.4168522357940674, "epoch": 0.8686557424593968, "grad_norm": 0.02602420188486576, "grad_norm_var": 2.2097160679103849e-07, "learning_rate": 0.00043671115080851807, "loss": 2.3551, "step": 23961 }, { "crossentropy": 2.628676652908325, "epoch": 0.8686919953596288, "grad_norm": 0.02735608071088791, "grad_norm_var": 2.388399448821979e-07, "learning_rate": 0.0004364736767823674, "loss": 2.5258, "step": 23962 }, { "crossentropy": 2.418205738067627, "epoch": 0.8687282482598608, "grad_norm": 0.027511345222592354, "grad_norm_var": 2.6149173610345337e-07, "learning_rate": 0.00043623626439356466, "loss": 2.3826, "step": 23963 }, { "crossentropy": 2.3987679481506348, "epoch": 0.8687645011600929, "grad_norm": 0.026923565194010735, "grad_norm_var": 2.609012207418441e-07, "learning_rate": 0.0004359989136453174, "loss": 2.4359, "step": 23964 }, { "crossentropy": 2.4890201091766357, "epoch": 0.8688007540603249, "grad_norm": 0.026627006009221077, "grad_norm_var": 2.2242851267645358e-07, "learning_rate": 0.0004357616245408297, "loss": 2.4223, "step": 23965 }, { "crossentropy": 2.386321783065796, "epoch": 0.8688370069605569, "grad_norm": 0.02773161605000496, "grad_norm_var": 2.4836378221582536e-07, "learning_rate": 0.0004355243970833095, "loss": 2.4703, "step": 23966 }, { "crossentropy": 2.4847497940063477, "epoch": 0.8688732598607889, "grad_norm": 0.027837924659252167, "grad_norm_var": 3.1156417031000946e-07, "learning_rate": 0.00043528723127595815, "loss": 2.4246, "step": 23967 }, { "crossentropy": 2.331969976425171, "epoch": 0.8689095127610209, "grad_norm": 0.027066046372056007, "grad_norm_var": 2.9361951350229815e-07, "learning_rate": 0.0004350501271219798, "loss": 2.3802, "step": 23968 }, { "crossentropy": 2.3752450942993164, "epoch": 0.8689457656612529, "grad_norm": 0.025553055107593536, "grad_norm_var": 4.1302595259196107e-07, "learning_rate": 0.0004348130846245768, "loss": 2.3648, "step": 23969 }, { "crossentropy": 2.4771180152893066, "epoch": 0.8689820185614849, "grad_norm": 0.027638457715511322, "grad_norm_var": 4.518325873512433e-07, "learning_rate": 0.0004345761037869517, "loss": 2.3999, "step": 23970 }, { "crossentropy": 2.491812229156494, "epoch": 0.869018271461717, "grad_norm": 0.027284778654575348, "grad_norm_var": 4.591890218717433e-07, "learning_rate": 0.000434339184612304, "loss": 2.4626, "step": 23971 }, { "crossentropy": 2.541107177734375, "epoch": 0.869054524361949, "grad_norm": 0.026568034663796425, "grad_norm_var": 4.6714822548495534e-07, "learning_rate": 0.00043410232710383524, "loss": 2.4103, "step": 23972 }, { "crossentropy": 2.398982048034668, "epoch": 0.869090777262181, "grad_norm": 0.02626689337193966, "grad_norm_var": 4.924895641306246e-07, "learning_rate": 0.0004338655312647421, "loss": 2.3884, "step": 23973 }, { "crossentropy": 2.532900810241699, "epoch": 0.869127030162413, "grad_norm": 0.02723914198577404, "grad_norm_var": 4.345556398952736e-07, "learning_rate": 0.0004336287970982261, "loss": 2.4972, "step": 23974 }, { "crossentropy": 2.3260934352874756, "epoch": 0.869163283062645, "grad_norm": 0.027486300095915794, "grad_norm_var": 4.253686318282657e-07, "learning_rate": 0.0004333921246074823, "loss": 2.439, "step": 23975 }, { "crossentropy": 2.3623924255371094, "epoch": 0.869199535962877, "grad_norm": 0.026951132342219353, "grad_norm_var": 4.2195800215623587e-07, "learning_rate": 0.00043315551379570716, "loss": 2.4083, "step": 23976 }, { "crossentropy": 2.532924175262451, "epoch": 0.869235788863109, "grad_norm": 0.02668423391878605, "grad_norm_var": 3.629505353407054e-07, "learning_rate": 0.0004329189646660969, "loss": 2.5732, "step": 23977 }, { "crossentropy": 2.399911642074585, "epoch": 0.869272041763341, "grad_norm": 0.026464417576789856, "grad_norm_var": 3.756997889981309e-07, "learning_rate": 0.00043268247722184796, "loss": 2.4729, "step": 23978 }, { "crossentropy": 2.448418378829956, "epoch": 0.8693082946635731, "grad_norm": 0.027015099301934242, "grad_norm_var": 3.565706244147588e-07, "learning_rate": 0.0004324460514661527, "loss": 2.373, "step": 23979 }, { "crossentropy": 2.471374273300171, "epoch": 0.8693445475638051, "grad_norm": 0.02774597890675068, "grad_norm_var": 3.9500094074241633e-07, "learning_rate": 0.00043220968740220576, "loss": 2.4398, "step": 23980 }, { "crossentropy": 2.456169605255127, "epoch": 0.8693808004640371, "grad_norm": 0.02690069191157818, "grad_norm_var": 3.8570616536960875e-07, "learning_rate": 0.0004319733850331975, "loss": 2.4845, "step": 23981 }, { "crossentropy": 2.462150812149048, "epoch": 0.8694170533642691, "grad_norm": 0.026204559952020645, "grad_norm_var": 3.8800777701578777e-07, "learning_rate": 0.0004317371443623214, "loss": 2.4128, "step": 23982 }, { "crossentropy": 2.4545538425445557, "epoch": 0.8694533062645011, "grad_norm": 0.028349021449685097, "grad_norm_var": 4.660917559220669e-07, "learning_rate": 0.0004315009653927698, "loss": 2.4634, "step": 23983 }, { "crossentropy": 2.5361485481262207, "epoch": 0.8694895591647331, "grad_norm": 0.027315350249409676, "grad_norm_var": 4.73381147226406e-07, "learning_rate": 0.00043126484812772815, "loss": 2.5164, "step": 23984 }, { "crossentropy": 2.3844711780548096, "epoch": 0.8695258120649652, "grad_norm": 0.026433050632476807, "grad_norm_var": 3.5444755904441977e-07, "learning_rate": 0.0004310287925703882, "loss": 2.3915, "step": 23985 }, { "crossentropy": 2.2629899978637695, "epoch": 0.8695620649651972, "grad_norm": 0.027710942551493645, "grad_norm_var": 3.606159080404269e-07, "learning_rate": 0.0004307927987239396, "loss": 2.4106, "step": 23986 }, { "crossentropy": 2.4232890605926514, "epoch": 0.8695983178654292, "grad_norm": 0.027533363550901413, "grad_norm_var": 3.7263336706746163e-07, "learning_rate": 0.0004305568665915666, "loss": 2.4647, "step": 23987 }, { "crossentropy": 2.4699866771698, "epoch": 0.8696345707656613, "grad_norm": 0.027027467265725136, "grad_norm_var": 3.560405247673056e-07, "learning_rate": 0.0004303209961764587, "loss": 2.4802, "step": 23988 }, { "crossentropy": 2.4894909858703613, "epoch": 0.8696708236658933, "grad_norm": 0.02606857381761074, "grad_norm_var": 3.8007809056347004e-07, "learning_rate": 0.0004300851874817996, "loss": 2.4851, "step": 23989 }, { "crossentropy": 2.4044814109802246, "epoch": 0.8697070765661253, "grad_norm": 0.026736507192254066, "grad_norm_var": 3.845717071403997e-07, "learning_rate": 0.000429849440510775, "loss": 2.4706, "step": 23990 }, { "crossentropy": 2.5349647998809814, "epoch": 0.8697433294663574, "grad_norm": 0.02670734003186226, "grad_norm_var": 3.7605565912818215e-07, "learning_rate": 0.0004296137552665713, "loss": 2.4942, "step": 23991 }, { "crossentropy": 2.5569045543670654, "epoch": 0.8697795823665894, "grad_norm": 0.02639157511293888, "grad_norm_var": 3.985605491329715e-07, "learning_rate": 0.0004293781317523676, "loss": 2.4531, "step": 23992 }, { "crossentropy": 2.5821897983551025, "epoch": 0.8698158352668214, "grad_norm": 0.02717311680316925, "grad_norm_var": 3.9581543470930636e-07, "learning_rate": 0.00042914256997134804, "loss": 2.589, "step": 23993 }, { "crossentropy": 2.4413092136383057, "epoch": 0.8698520881670534, "grad_norm": 0.026876099407672882, "grad_norm_var": 3.777742972156546e-07, "learning_rate": 0.0004289070699266961, "loss": 2.3855, "step": 23994 }, { "crossentropy": 2.5304715633392334, "epoch": 0.8698883410672854, "grad_norm": 0.027484508231282234, "grad_norm_var": 3.9175258187875104e-07, "learning_rate": 0.0004286716316215899, "loss": 2.5181, "step": 23995 }, { "crossentropy": 2.5185067653656006, "epoch": 0.8699245939675174, "grad_norm": 0.027070695534348488, "grad_norm_var": 3.5679040344861976e-07, "learning_rate": 0.00042843625505921136, "loss": 2.4686, "step": 23996 }, { "crossentropy": 2.3093152046203613, "epoch": 0.8699608468677494, "grad_norm": 0.02572452463209629, "grad_norm_var": 4.5865675882411636e-07, "learning_rate": 0.00042820094024273783, "loss": 2.3137, "step": 23997 }, { "crossentropy": 2.493748903274536, "epoch": 0.8699970997679815, "grad_norm": 0.026317771524190903, "grad_norm_var": 4.485765415833398e-07, "learning_rate": 0.000427965687175349, "loss": 2.4317, "step": 23998 }, { "crossentropy": 2.446531057357788, "epoch": 0.8700333526682135, "grad_norm": 0.02793136052787304, "grad_norm_var": 3.805953410900819e-07, "learning_rate": 0.0004277304958602224, "loss": 2.4346, "step": 23999 }, { "crossentropy": 2.463080644607544, "epoch": 0.8700696055684455, "grad_norm": 0.027027830481529236, "grad_norm_var": 3.7008419661182086e-07, "learning_rate": 0.0004274953663005343, "loss": 2.4042, "step": 24000 }, { "crossentropy": 2.5072383880615234, "epoch": 0.8701058584686775, "grad_norm": 0.025892751291394234, "grad_norm_var": 4.211342098214254e-07, "learning_rate": 0.00042726029849945893, "loss": 2.4269, "step": 24001 }, { "crossentropy": 2.3052048683166504, "epoch": 0.8701421113689095, "grad_norm": 0.026973431929945946, "grad_norm_var": 3.7092618786106986e-07, "learning_rate": 0.0004270252924601742, "loss": 2.4405, "step": 24002 }, { "crossentropy": 2.5279994010925293, "epoch": 0.8701783642691415, "grad_norm": 0.02612145058810711, "grad_norm_var": 3.5907138707182554e-07, "learning_rate": 0.0004267903481858515, "loss": 2.5046, "step": 24003 }, { "crossentropy": 2.504629135131836, "epoch": 0.8702146171693735, "grad_norm": 0.027008239179849625, "grad_norm_var": 3.583070287994465e-07, "learning_rate": 0.00042655546567966606, "loss": 2.5406, "step": 24004 }, { "crossentropy": 2.465353488922119, "epoch": 0.8702508700696056, "grad_norm": 0.026874985545873642, "grad_norm_var": 3.2900399395418127e-07, "learning_rate": 0.0004263206449447887, "loss": 2.4656, "step": 24005 }, { "crossentropy": 2.4041614532470703, "epoch": 0.8702871229698376, "grad_norm": 0.02840578369796276, "grad_norm_var": 4.958134159924275e-07, "learning_rate": 0.00042608588598439136, "loss": 2.3883, "step": 24006 }, { "crossentropy": 2.4844181537628174, "epoch": 0.8703233758700696, "grad_norm": 0.025556061416864395, "grad_norm_var": 6.042121846843897e-07, "learning_rate": 0.000425851188801647, "loss": 2.4518, "step": 24007 }, { "crossentropy": 2.2334184646606445, "epoch": 0.8703596287703016, "grad_norm": 0.025992311537265778, "grad_norm_var": 6.360183892400544e-07, "learning_rate": 0.00042561655339972305, "loss": 2.3025, "step": 24008 }, { "crossentropy": 2.6094906330108643, "epoch": 0.8703958816705336, "grad_norm": 0.027337249368429184, "grad_norm_var": 6.463723375394261e-07, "learning_rate": 0.0004253819797817893, "loss": 2.5153, "step": 24009 }, { "crossentropy": 2.5040526390075684, "epoch": 0.8704321345707656, "grad_norm": 0.026736794039607048, "grad_norm_var": 6.459338212272027e-07, "learning_rate": 0.0004251474679510142, "loss": 2.4032, "step": 24010 }, { "crossentropy": 2.526137113571167, "epoch": 0.8704683874709976, "grad_norm": 0.027558404952287674, "grad_norm_var": 6.532314965787891e-07, "learning_rate": 0.0004249130179105648, "loss": 2.42, "step": 24011 }, { "crossentropy": 2.4223792552948, "epoch": 0.8705046403712297, "grad_norm": 0.026023104786872864, "grad_norm_var": 6.816513092644382e-07, "learning_rate": 0.000424678629663609, "loss": 2.4365, "step": 24012 }, { "crossentropy": 2.573397159576416, "epoch": 0.8705408932714617, "grad_norm": 0.02771870233118534, "grad_norm_var": 6.661411162088426e-07, "learning_rate": 0.00042444430321331053, "loss": 2.4651, "step": 24013 }, { "crossentropy": 2.5480692386627197, "epoch": 0.8705771461716937, "grad_norm": 0.026211660355329514, "grad_norm_var": 6.742654493921274e-07, "learning_rate": 0.000424210038562835, "loss": 2.559, "step": 24014 }, { "crossentropy": 2.4461755752563477, "epoch": 0.8706133990719258, "grad_norm": 0.02677440084517002, "grad_norm_var": 5.888967804859446e-07, "learning_rate": 0.00042397583571534837, "loss": 2.4022, "step": 24015 }, { "crossentropy": 2.4442615509033203, "epoch": 0.8706496519721578, "grad_norm": 0.025519972667098045, "grad_norm_var": 6.778202856789923e-07, "learning_rate": 0.0004237416946740125, "loss": 2.5041, "step": 24016 }, { "crossentropy": 2.390778064727783, "epoch": 0.8706859048723898, "grad_norm": 0.027802105993032455, "grad_norm_var": 7.080338570594613e-07, "learning_rate": 0.00042350761544198865, "loss": 2.3962, "step": 24017 }, { "crossentropy": 2.370579957962036, "epoch": 0.8707221577726219, "grad_norm": 0.025912299752235413, "grad_norm_var": 7.522321317716037e-07, "learning_rate": 0.00042327359802244005, "loss": 2.4927, "step": 24018 }, { "crossentropy": 2.367896318435669, "epoch": 0.8707584106728539, "grad_norm": 0.026359794661402702, "grad_norm_var": 7.366946065699709e-07, "learning_rate": 0.0004230396424185268, "loss": 2.3616, "step": 24019 }, { "crossentropy": 2.38076114654541, "epoch": 0.8707946635730859, "grad_norm": 0.026325426995754242, "grad_norm_var": 7.411393476370112e-07, "learning_rate": 0.00042280574863340917, "loss": 2.3747, "step": 24020 }, { "crossentropy": 2.485980749130249, "epoch": 0.8708309164733179, "grad_norm": 0.027075977995991707, "grad_norm_var": 7.485059774730521e-07, "learning_rate": 0.0004225719166702452, "loss": 2.505, "step": 24021 }, { "crossentropy": 2.6492233276367188, "epoch": 0.8708671693735499, "grad_norm": 0.026916515082120895, "grad_norm_var": 5.497758244585315e-07, "learning_rate": 0.0004223381465321946, "loss": 2.5668, "step": 24022 }, { "crossentropy": 2.4982059001922607, "epoch": 0.8709034222737819, "grad_norm": 0.02737661451101303, "grad_norm_var": 5.001710336927437e-07, "learning_rate": 0.00042210443822241487, "loss": 2.531, "step": 24023 }, { "crossentropy": 2.607062339782715, "epoch": 0.8709396751740139, "grad_norm": 0.026620695367455482, "grad_norm_var": 4.6324577260291724e-07, "learning_rate": 0.00042187079174406064, "loss": 2.523, "step": 24024 }, { "crossentropy": 2.4324822425842285, "epoch": 0.870975928074246, "grad_norm": 0.027203410863876343, "grad_norm_var": 4.5418659956456865e-07, "learning_rate": 0.00042163720710029, "loss": 2.4465, "step": 24025 }, { "crossentropy": 2.479804754257202, "epoch": 0.871012180974478, "grad_norm": 0.026383209973573685, "grad_norm_var": 4.6302342281654786e-07, "learning_rate": 0.000421403684294257, "loss": 2.4699, "step": 24026 }, { "crossentropy": 2.5286214351654053, "epoch": 0.87104843387471, "grad_norm": 0.026077091693878174, "grad_norm_var": 4.378123023710337e-07, "learning_rate": 0.00042117022332911427, "loss": 2.4527, "step": 24027 }, { "crossentropy": 2.5330069065093994, "epoch": 0.871084686774942, "grad_norm": 0.02661070041358471, "grad_norm_var": 4.107616517475537e-07, "learning_rate": 0.0004209368242080175, "loss": 2.4256, "step": 24028 }, { "crossentropy": 2.325514078140259, "epoch": 0.871120939675174, "grad_norm": 0.025752145797014236, "grad_norm_var": 3.802557091605573e-07, "learning_rate": 0.00042070348693411643, "loss": 2.3683, "step": 24029 }, { "crossentropy": 2.2842469215393066, "epoch": 0.871157192575406, "grad_norm": 0.02790559083223343, "grad_norm_var": 4.814542396494592e-07, "learning_rate": 0.00042047021151056464, "loss": 2.4546, "step": 24030 }, { "crossentropy": 2.562408924102783, "epoch": 0.871193445475638, "grad_norm": 0.028370393440127373, "grad_norm_var": 6.642539718751742e-07, "learning_rate": 0.00042023699794051293, "loss": 2.5178, "step": 24031 }, { "crossentropy": 2.386902093887329, "epoch": 0.87122969837587, "grad_norm": 0.028583159670233727, "grad_norm_var": 7.429145504643404e-07, "learning_rate": 0.00042000384622711, "loss": 2.433, "step": 24032 }, { "crossentropy": 2.4125075340270996, "epoch": 0.8712659512761021, "grad_norm": 0.031706761568784714, "grad_norm_var": 2.136989815058367e-06, "learning_rate": 0.0004197707563735065, "loss": 2.4239, "step": 24033 }, { "crossentropy": 2.440178394317627, "epoch": 0.8713022041763341, "grad_norm": 0.028090553358197212, "grad_norm_var": 2.0599142981635264e-06, "learning_rate": 0.0004195377283828494, "loss": 2.4818, "step": 24034 }, { "crossentropy": 2.3419902324676514, "epoch": 0.8713384570765661, "grad_norm": 0.025934703648090363, "grad_norm_var": 2.12647473106925e-06, "learning_rate": 0.00041930476225828574, "loss": 2.306, "step": 24035 }, { "crossentropy": 2.3346312046051025, "epoch": 0.8713747099767981, "grad_norm": 0.02551899664103985, "grad_norm_var": 2.2728038504582116e-06, "learning_rate": 0.0004190718580029634, "loss": 2.3376, "step": 24036 }, { "crossentropy": 2.6541292667388916, "epoch": 0.8714109628770301, "grad_norm": 0.026385951787233353, "grad_norm_var": 2.3193005160573413e-06, "learning_rate": 0.0004188390156200267, "loss": 2.5475, "step": 24037 }, { "crossentropy": 2.5691990852355957, "epoch": 0.8714472157772621, "grad_norm": 0.026805534958839417, "grad_norm_var": 2.3244838466541957e-06, "learning_rate": 0.00041860623511262075, "loss": 2.5612, "step": 24038 }, { "crossentropy": 2.3951809406280518, "epoch": 0.8714834686774942, "grad_norm": 0.025707868859171867, "grad_norm_var": 2.4609771553232706e-06, "learning_rate": 0.00041837351648389145, "loss": 2.4158, "step": 24039 }, { "crossentropy": 2.522085666656494, "epoch": 0.8715197215777262, "grad_norm": 0.026541586965322495, "grad_norm_var": 2.466461316301347e-06, "learning_rate": 0.0004181408597369796, "loss": 2.4418, "step": 24040 }, { "crossentropy": 2.517068862915039, "epoch": 0.8715559744779582, "grad_norm": 0.026288213208317757, "grad_norm_var": 2.5060212617038773e-06, "learning_rate": 0.0004179082648750299, "loss": 2.4389, "step": 24041 }, { "crossentropy": 2.540541887283325, "epoch": 0.8715922273781903, "grad_norm": 0.02635946497321129, "grad_norm_var": 2.50814034285413e-06, "learning_rate": 0.00041767573190118257, "loss": 2.5065, "step": 24042 }, { "crossentropy": 2.499298334121704, "epoch": 0.8716284802784223, "grad_norm": 0.027130568400025368, "grad_norm_var": 2.4422613928475984e-06, "learning_rate": 0.00041744326081857684, "loss": 2.4863, "step": 24043 }, { "crossentropy": 2.2798001766204834, "epoch": 0.8716647331786543, "grad_norm": 0.027677636593580246, "grad_norm_var": 2.4429818721410547e-06, "learning_rate": 0.00041721085163035575, "loss": 2.3551, "step": 24044 }, { "crossentropy": 2.626519203186035, "epoch": 0.8717009860788864, "grad_norm": 0.02702406607568264, "grad_norm_var": 2.3032254356780226e-06, "learning_rate": 0.0004169785043396557, "loss": 2.5108, "step": 24045 }, { "crossentropy": 2.4743173122406006, "epoch": 0.8717372389791184, "grad_norm": 0.026541132479906082, "grad_norm_var": 2.3006674777230226e-06, "learning_rate": 0.0004167462189496157, "loss": 2.3455, "step": 24046 }, { "crossentropy": 2.5040221214294434, "epoch": 0.8717734918793504, "grad_norm": 0.026352250948548317, "grad_norm_var": 2.231316807347709e-06, "learning_rate": 0.0004165139954633745, "loss": 2.4725, "step": 24047 }, { "crossentropy": 2.429844856262207, "epoch": 0.8718097447795824, "grad_norm": 0.027117419987916946, "grad_norm_var": 2.0641118572253205e-06, "learning_rate": 0.00041628183388406684, "loss": 2.3998, "step": 24048 }, { "crossentropy": 2.4559426307678223, "epoch": 0.8718459976798144, "grad_norm": 0.025965431705117226, "grad_norm_var": 4.821123034829192e-07, "learning_rate": 0.0004160497342148295, "loss": 2.3925, "step": 24049 }, { "crossentropy": 2.377126932144165, "epoch": 0.8718822505800464, "grad_norm": 0.028146522119641304, "grad_norm_var": 4.935053226831979e-07, "learning_rate": 0.00041581769645879674, "loss": 2.4578, "step": 24050 }, { "crossentropy": 2.3841452598571777, "epoch": 0.8719185034802784, "grad_norm": 0.027686981484293938, "grad_norm_var": 5.31471237175104e-07, "learning_rate": 0.00041558572061910215, "loss": 2.336, "step": 24051 }, { "crossentropy": 2.4887399673461914, "epoch": 0.8719547563805105, "grad_norm": 0.027133282274007797, "grad_norm_var": 4.3947661427512494e-07, "learning_rate": 0.00041535380669888047, "loss": 2.4932, "step": 24052 }, { "crossentropy": 2.3210701942443848, "epoch": 0.8719910092807425, "grad_norm": 0.026164326816797256, "grad_norm_var": 4.5489962802551057e-07, "learning_rate": 0.0004151219547012619, "loss": 2.4267, "step": 24053 }, { "crossentropy": 2.4084787368774414, "epoch": 0.8720272621809745, "grad_norm": 0.02632896415889263, "grad_norm_var": 4.6811656263984605e-07, "learning_rate": 0.0004148901646293796, "loss": 2.4065, "step": 24054 }, { "crossentropy": 2.3969321250915527, "epoch": 0.8720635150812065, "grad_norm": 0.0264701209962368, "grad_norm_var": 3.974626226435051e-07, "learning_rate": 0.00041465843648636424, "loss": 2.4506, "step": 24055 }, { "crossentropy": 2.46878719329834, "epoch": 0.8720997679814385, "grad_norm": 0.026630930602550507, "grad_norm_var": 3.9478789715917856e-07, "learning_rate": 0.0004144267702753446, "loss": 2.4218, "step": 24056 }, { "crossentropy": 2.506197690963745, "epoch": 0.8721360208816705, "grad_norm": 0.026534248143434525, "grad_norm_var": 3.8133667625618073e-07, "learning_rate": 0.00041419516599945085, "loss": 2.3734, "step": 24057 }, { "crossentropy": 2.3575994968414307, "epoch": 0.8721722737819025, "grad_norm": 0.026887737214565277, "grad_norm_var": 3.6570921068805085e-07, "learning_rate": 0.00041396362366181104, "loss": 2.352, "step": 24058 }, { "crossentropy": 2.3968915939331055, "epoch": 0.8722085266821346, "grad_norm": 0.02559618279337883, "grad_norm_var": 4.579055528634857e-07, "learning_rate": 0.0004137321432655505, "loss": 2.4089, "step": 24059 }, { "crossentropy": 2.4571313858032227, "epoch": 0.8722447795823666, "grad_norm": 0.02720279060304165, "grad_norm_var": 4.142845929230258e-07, "learning_rate": 0.00041350072481379895, "loss": 2.4094, "step": 24060 }, { "crossentropy": 2.5108089447021484, "epoch": 0.8722810324825986, "grad_norm": 0.02681097574532032, "grad_norm_var": 4.089493591737884e-07, "learning_rate": 0.0004132693683096789, "loss": 2.5104, "step": 24061 }, { "crossentropy": 2.4067885875701904, "epoch": 0.8723172853828306, "grad_norm": 0.02683442085981369, "grad_norm_var": 4.072103654927887e-07, "learning_rate": 0.00041303807375631666, "loss": 2.387, "step": 24062 }, { "crossentropy": 2.3632607460021973, "epoch": 0.8723535382830626, "grad_norm": 0.0264117531478405, "grad_norm_var": 4.0434419176684173e-07, "learning_rate": 0.00041280684115683753, "loss": 2.3547, "step": 24063 }, { "crossentropy": 2.302259922027588, "epoch": 0.8723897911832946, "grad_norm": 0.025818215683102608, "grad_norm_var": 4.4534925722620963e-07, "learning_rate": 0.00041257567051436185, "loss": 2.3308, "step": 24064 }, { "crossentropy": 2.3521530628204346, "epoch": 0.8724260440835266, "grad_norm": 0.026428550481796265, "grad_norm_var": 4.156224879159805e-07, "learning_rate": 0.00041234456183201487, "loss": 2.4295, "step": 24065 }, { "crossentropy": 2.377244710922241, "epoch": 0.8724622969837587, "grad_norm": 0.02728475257754326, "grad_norm_var": 2.9501009014760936e-07, "learning_rate": 0.0004121135151129152, "loss": 2.44, "step": 24066 }, { "crossentropy": 2.6247119903564453, "epoch": 0.8724985498839907, "grad_norm": 0.02657018043100834, "grad_norm_var": 2.169134743328401e-07, "learning_rate": 0.00041188253036018586, "loss": 2.4922, "step": 24067 }, { "crossentropy": 2.4329044818878174, "epoch": 0.8725348027842227, "grad_norm": 0.025897270068526268, "grad_norm_var": 2.1943707842221143e-07, "learning_rate": 0.0004116516075769461, "loss": 2.3676, "step": 24068 }, { "crossentropy": 2.3282341957092285, "epoch": 0.8725710556844548, "grad_norm": 0.02581920102238655, "grad_norm_var": 2.419583619732435e-07, "learning_rate": 0.00041142074676631335, "loss": 2.3897, "step": 24069 }, { "crossentropy": 2.5603156089782715, "epoch": 0.8726073085846868, "grad_norm": 0.026318596675992012, "grad_norm_var": 2.4216058181012606e-07, "learning_rate": 0.00041118994793140683, "loss": 2.5076, "step": 24070 }, { "crossentropy": 2.5667402744293213, "epoch": 0.8726435614849188, "grad_norm": 0.026407217606902122, "grad_norm_var": 2.4240473425322407e-07, "learning_rate": 0.0004109592110753446, "loss": 2.5349, "step": 24071 }, { "crossentropy": 2.4022305011749268, "epoch": 0.8726798143851509, "grad_norm": 0.02554033324122429, "grad_norm_var": 2.927322964473098e-07, "learning_rate": 0.0004107285362012414, "loss": 2.4527, "step": 24072 }, { "crossentropy": 2.3793859481811523, "epoch": 0.8727160672853829, "grad_norm": 0.026815427467226982, "grad_norm_var": 3.0279474123682397e-07, "learning_rate": 0.00041049792331221467, "loss": 2.4254, "step": 24073 }, { "crossentropy": 2.358278274536133, "epoch": 0.8727523201856149, "grad_norm": 0.02508382499217987, "grad_norm_var": 3.9252662446710947e-07, "learning_rate": 0.0004102673724113776, "loss": 2.4326, "step": 24074 }, { "crossentropy": 2.4419970512390137, "epoch": 0.8727885730858469, "grad_norm": 0.026449451223015785, "grad_norm_var": 3.5767590307147986e-07, "learning_rate": 0.0004100368835018453, "loss": 2.4598, "step": 24075 }, { "crossentropy": 2.359955310821533, "epoch": 0.8728248259860789, "grad_norm": 0.02539951540529728, "grad_norm_var": 3.572683051920733e-07, "learning_rate": 0.000409806456586731, "loss": 2.3865, "step": 24076 }, { "crossentropy": 2.414923906326294, "epoch": 0.8728610788863109, "grad_norm": 0.04421031475067139, "grad_norm_var": 2.0595739789150355e-05, "learning_rate": 0.0004095760916691449, "loss": 2.5083, "step": 24077 }, { "crossentropy": 2.469316005706787, "epoch": 0.8728973317865429, "grad_norm": 0.025163214653730392, "grad_norm_var": 2.0880852281031834e-05, "learning_rate": 0.00040934578875220063, "loss": 2.4062, "step": 24078 }, { "crossentropy": 2.498671531677246, "epoch": 0.872933584686775, "grad_norm": 0.026385361328721046, "grad_norm_var": 2.0883761474941434e-05, "learning_rate": 0.0004091155478390085, "loss": 2.4161, "step": 24079 }, { "crossentropy": 2.458951234817505, "epoch": 0.872969837587007, "grad_norm": 0.025739578530192375, "grad_norm_var": 2.089889241344369e-05, "learning_rate": 0.00040888536893267656, "loss": 2.4715, "step": 24080 }, { "crossentropy": 2.349673271179199, "epoch": 0.873006090487239, "grad_norm": 0.02590637281537056, "grad_norm_var": 2.0971006520963338e-05, "learning_rate": 0.00040865525203631625, "loss": 2.3117, "step": 24081 }, { "crossentropy": 2.411693572998047, "epoch": 0.873042343387471, "grad_norm": 0.02617453783750534, "grad_norm_var": 2.1033559554521963e-05, "learning_rate": 0.00040842519715303394, "loss": 2.3639, "step": 24082 }, { "crossentropy": 2.278272867202759, "epoch": 0.873078596287703, "grad_norm": 0.025755029171705246, "grad_norm_var": 2.113457816337172e-05, "learning_rate": 0.0004081952042859366, "loss": 2.3676, "step": 24083 }, { "crossentropy": 2.503697395324707, "epoch": 0.873114849187935, "grad_norm": 0.026430627331137657, "grad_norm_var": 2.1069203024848213e-05, "learning_rate": 0.0004079652734381334, "loss": 2.5081, "step": 24084 }, { "crossentropy": 2.5548386573791504, "epoch": 0.873151102088167, "grad_norm": 0.02646678499877453, "grad_norm_var": 2.098483088504217e-05, "learning_rate": 0.0004077354046127263, "loss": 2.5197, "step": 24085 }, { "crossentropy": 2.5346736907958984, "epoch": 0.8731873549883991, "grad_norm": 0.0259074829518795, "grad_norm_var": 2.1040440848063558e-05, "learning_rate": 0.0004075055978128212, "loss": 2.5192, "step": 24086 }, { "crossentropy": 2.445585250854492, "epoch": 0.8732236078886311, "grad_norm": 0.026205556467175484, "grad_norm_var": 2.1062005231884708e-05, "learning_rate": 0.00040727585304152324, "loss": 2.4826, "step": 24087 }, { "crossentropy": 2.4768829345703125, "epoch": 0.8732598607888631, "grad_norm": 0.026160065084695816, "grad_norm_var": 2.0956960206808668e-05, "learning_rate": 0.000407046170301934, "loss": 2.3906, "step": 24088 }, { "crossentropy": 2.4327142238616943, "epoch": 0.8732961136890951, "grad_norm": 0.025951627641916275, "grad_norm_var": 2.1041071305533917e-05, "learning_rate": 0.0004068165495971565, "loss": 2.4384, "step": 24089 }, { "crossentropy": 2.4912991523742676, "epoch": 0.8733323665893271, "grad_norm": 0.026481080800294876, "grad_norm_var": 2.078992939951219e-05, "learning_rate": 0.0004065869909302916, "loss": 2.4907, "step": 24090 }, { "crossentropy": 2.4148435592651367, "epoch": 0.8733686194895591, "grad_norm": 0.026932554319500923, "grad_norm_var": 2.0757834814870292e-05, "learning_rate": 0.00040635749430444014, "loss": 2.4509, "step": 24091 }, { "crossentropy": 2.5495307445526123, "epoch": 0.8734048723897911, "grad_norm": 0.027238979935646057, "grad_norm_var": 2.0526652849863802e-05, "learning_rate": 0.0004061280597227035, "loss": 2.49, "step": 24092 }, { "crossentropy": 2.3792855739593506, "epoch": 0.8734411252900232, "grad_norm": 0.02636115252971649, "grad_norm_var": 2.4001636389024e-07, "learning_rate": 0.00040589868718817704, "loss": 2.3726, "step": 24093 }, { "crossentropy": 2.372999668121338, "epoch": 0.8734773781902552, "grad_norm": 0.026165761053562164, "grad_norm_var": 1.6374368696989162e-07, "learning_rate": 0.000405669376703961, "loss": 2.4118, "step": 24094 }, { "crossentropy": 2.50028657913208, "epoch": 0.8735136310904872, "grad_norm": 0.027127273380756378, "grad_norm_var": 2.0991268139245767e-07, "learning_rate": 0.0004054401282731524, "loss": 2.4622, "step": 24095 }, { "crossentropy": 2.5707738399505615, "epoch": 0.8735498839907193, "grad_norm": 0.026984218508005142, "grad_norm_var": 2.1160945205631747e-07, "learning_rate": 0.000405210941898847, "loss": 2.4257, "step": 24096 }, { "crossentropy": 2.4135353565216064, "epoch": 0.8735861368909513, "grad_norm": 0.02866419218480587, "grad_norm_var": 5.089139971439175e-07, "learning_rate": 0.00040498181758414134, "loss": 2.4332, "step": 24097 }, { "crossentropy": 2.4468612670898438, "epoch": 0.8736223897911833, "grad_norm": 0.026943020522594452, "grad_norm_var": 5.060277128480917e-07, "learning_rate": 0.0004047527553321284, "loss": 2.5008, "step": 24098 }, { "crossentropy": 2.3363089561462402, "epoch": 0.8736586426914154, "grad_norm": 0.02570566162467003, "grad_norm_var": 5.118140816939698e-07, "learning_rate": 0.00040452375514590355, "loss": 2.3885, "step": 24099 }, { "crossentropy": 2.493905782699585, "epoch": 0.8736948955916474, "grad_norm": 0.02978515811264515, "grad_norm_var": 1.1358400783715763e-06, "learning_rate": 0.0004042948170285593, "loss": 2.4912, "step": 24100 }, { "crossentropy": 2.333132743835449, "epoch": 0.8737311484918794, "grad_norm": 0.026547331362962723, "grad_norm_var": 1.1324786689017226e-06, "learning_rate": 0.0004040659409831887, "loss": 2.3861, "step": 24101 }, { "crossentropy": 2.5057716369628906, "epoch": 0.8737674013921114, "grad_norm": 0.026158595457673073, "grad_norm_var": 1.1057811292032912e-06, "learning_rate": 0.00040383712701288045, "loss": 2.5056, "step": 24102 }, { "crossentropy": 2.443740129470825, "epoch": 0.8738036542923434, "grad_norm": 0.025541044771671295, "grad_norm_var": 1.1894385179612882e-06, "learning_rate": 0.0004036083751207281, "loss": 2.4946, "step": 24103 }, { "crossentropy": 2.493257761001587, "epoch": 0.8738399071925754, "grad_norm": 0.02613624557852745, "grad_norm_var": 1.1914959917683558e-06, "learning_rate": 0.0004033796853098193, "loss": 2.5086, "step": 24104 }, { "crossentropy": 2.229003429412842, "epoch": 0.8738761600928074, "grad_norm": 0.02564135193824768, "grad_norm_var": 1.2324134018505683e-06, "learning_rate": 0.00040315105758324257, "loss": 2.3657, "step": 24105 }, { "crossentropy": 2.4132320880889893, "epoch": 0.8739124129930395, "grad_norm": 0.026366299018263817, "grad_norm_var": 1.2377480694351992e-06, "learning_rate": 0.0004029224919440883, "loss": 2.4404, "step": 24106 }, { "crossentropy": 2.624345541000366, "epoch": 0.8739486658932715, "grad_norm": 0.026993338018655777, "grad_norm_var": 1.2393071239864711e-06, "learning_rate": 0.00040269398839544115, "loss": 2.5506, "step": 24107 }, { "crossentropy": 2.4289913177490234, "epoch": 0.8739849187935035, "grad_norm": 0.0256018228828907, "grad_norm_var": 1.3049929062860141e-06, "learning_rate": 0.00040246554694038926, "loss": 2.3806, "step": 24108 }, { "crossentropy": 2.466578960418701, "epoch": 0.8740211716937355, "grad_norm": 0.026440918445587158, "grad_norm_var": 1.3021041955404062e-06, "learning_rate": 0.0004022371675820169, "loss": 2.4566, "step": 24109 }, { "crossentropy": 2.523554563522339, "epoch": 0.8740574245939675, "grad_norm": 0.027038341388106346, "grad_norm_var": 1.2904283180968038e-06, "learning_rate": 0.0004020088503234082, "loss": 2.4183, "step": 24110 }, { "crossentropy": 2.476062774658203, "epoch": 0.8740936774941995, "grad_norm": 0.026262423023581505, "grad_norm_var": 1.291327883423156e-06, "learning_rate": 0.00040178059516764896, "loss": 2.4102, "step": 24111 }, { "crossentropy": 2.494443893432617, "epoch": 0.8741299303944315, "grad_norm": 0.02620379999279976, "grad_norm_var": 1.2972825129501168e-06, "learning_rate": 0.0004015524021178196, "loss": 2.443, "step": 24112 }, { "crossentropy": 2.2807374000549316, "epoch": 0.8741661832946636, "grad_norm": 0.02589382790029049, "grad_norm_var": 1.0244062828438164e-06, "learning_rate": 0.0004013242711770032, "loss": 2.3901, "step": 24113 }, { "crossentropy": 2.5763185024261475, "epoch": 0.8742024361948956, "grad_norm": 0.027462217956781387, "grad_norm_var": 1.0751281039965422e-06, "learning_rate": 0.00040109620234828226, "loss": 2.5121, "step": 24114 }, { "crossentropy": 2.471965789794922, "epoch": 0.8742386890951276, "grad_norm": 0.025545628741383553, "grad_norm_var": 1.0933825723649042e-06, "learning_rate": 0.00040086819563473543, "loss": 2.4081, "step": 24115 }, { "crossentropy": 2.385037660598755, "epoch": 0.8742749419953596, "grad_norm": 0.025627074763178825, "grad_norm_var": 3.3943335216641593e-07, "learning_rate": 0.00040064025103944413, "loss": 2.3167, "step": 24116 }, { "crossentropy": 2.3655319213867188, "epoch": 0.8743111948955916, "grad_norm": 0.02673519216477871, "grad_norm_var": 3.4993163462119364e-07, "learning_rate": 0.0004004123685654859, "loss": 2.4256, "step": 24117 }, { "crossentropy": 2.55153489112854, "epoch": 0.8743474477958236, "grad_norm": 0.02673320658504963, "grad_norm_var": 3.6524975565345683e-07, "learning_rate": 0.00040018454821593775, "loss": 2.5574, "step": 24118 }, { "crossentropy": 2.2860326766967773, "epoch": 0.8743837006960556, "grad_norm": 0.02728295885026455, "grad_norm_var": 3.8699956725085117e-07, "learning_rate": 0.0003999567899938783, "loss": 2.3529, "step": 24119 }, { "crossentropy": 2.358161687850952, "epoch": 0.8744199535962877, "grad_norm": 0.027250168845057487, "grad_norm_var": 4.2941875391244493e-07, "learning_rate": 0.00039972909390238287, "loss": 2.3848, "step": 24120 }, { "crossentropy": 2.5466325283050537, "epoch": 0.8744562064965197, "grad_norm": 0.0269109345972538, "grad_norm_var": 3.945573941281644e-07, "learning_rate": 0.0003995014599445268, "loss": 2.5094, "step": 24121 }, { "crossentropy": 2.425153970718384, "epoch": 0.8744924593967517, "grad_norm": 0.02648153342306614, "grad_norm_var": 3.9299874235957515e-07, "learning_rate": 0.0003992738881233865, "loss": 2.4939, "step": 24122 }, { "crossentropy": 2.3709561824798584, "epoch": 0.8745287122969838, "grad_norm": 0.026379281654953957, "grad_norm_var": 3.785448892605018e-07, "learning_rate": 0.0003990463784420328, "loss": 2.4031, "step": 24123 }, { "crossentropy": 2.68172550201416, "epoch": 0.8745649651972158, "grad_norm": 0.026395147666335106, "grad_norm_var": 3.2387007426557676e-07, "learning_rate": 0.00039881893090354116, "loss": 2.5401, "step": 24124 }, { "crossentropy": 2.3733654022216797, "epoch": 0.8746012180974478, "grad_norm": 0.026012826710939407, "grad_norm_var": 3.4098892362013913e-07, "learning_rate": 0.00039859154551098123, "loss": 2.4032, "step": 24125 }, { "crossentropy": 2.441253662109375, "epoch": 0.8746374709976799, "grad_norm": 0.026733243837952614, "grad_norm_var": 3.254526792538399e-07, "learning_rate": 0.0003983642222674261, "loss": 2.4758, "step": 24126 }, { "crossentropy": 2.4178860187530518, "epoch": 0.8746737238979119, "grad_norm": 0.026549462229013443, "grad_norm_var": 3.217261824942173e-07, "learning_rate": 0.0003981369611759456, "loss": 2.4704, "step": 24127 }, { "crossentropy": 2.5124471187591553, "epoch": 0.8747099767981439, "grad_norm": 0.025783775374293327, "grad_norm_var": 3.5002845870466676e-07, "learning_rate": 0.0003979097622396083, "loss": 2.4868, "step": 24128 }, { "crossentropy": 2.4927356243133545, "epoch": 0.8747462296983759, "grad_norm": 0.026720942929387093, "grad_norm_var": 3.2747666994237255e-07, "learning_rate": 0.0003976826254614835, "loss": 2.4489, "step": 24129 }, { "crossentropy": 2.4393208026885986, "epoch": 0.8747824825986079, "grad_norm": 0.026061031967401505, "grad_norm_var": 2.774660526949635e-07, "learning_rate": 0.0003974555508446398, "loss": 2.4715, "step": 24130 }, { "crossentropy": 2.402249336242676, "epoch": 0.8748187354988399, "grad_norm": 0.02614370919764042, "grad_norm_var": 2.2769206739561841e-07, "learning_rate": 0.00039722853839214257, "loss": 2.3829, "step": 24131 }, { "crossentropy": 2.456136703491211, "epoch": 0.8748549883990719, "grad_norm": 0.026222916319966316, "grad_norm_var": 1.81521879622816e-07, "learning_rate": 0.00039700158810705987, "loss": 2.4772, "step": 24132 }, { "crossentropy": 2.587186336517334, "epoch": 0.874891241299304, "grad_norm": 0.027265409007668495, "grad_norm_var": 2.1396836060449952e-07, "learning_rate": 0.0003967746999924554, "loss": 2.5738, "step": 24133 }, { "crossentropy": 2.3803839683532715, "epoch": 0.874927494199536, "grad_norm": 0.026963504031300545, "grad_norm_var": 2.2266590256429335e-07, "learning_rate": 0.00039654787405139503, "loss": 2.4218, "step": 24134 }, { "crossentropy": 2.2612783908843994, "epoch": 0.874963747099768, "grad_norm": 0.026416579261422157, "grad_norm_var": 1.8748621247958314e-07, "learning_rate": 0.0003963211102869413, "loss": 2.4052, "step": 24135 }, { "crossentropy": 2.5135180950164795, "epoch": 0.875, "grad_norm": 0.027348026633262634, "grad_norm_var": 1.9763583298166612e-07, "learning_rate": 0.00039609440870215675, "loss": 2.4293, "step": 24136 }, { "crossentropy": 2.4894802570343018, "epoch": 0.875036252900232, "grad_norm": 0.028118683025240898, "grad_norm_var": 3.5106760876424393e-07, "learning_rate": 0.0003958677693001039, "loss": 2.4252, "step": 24137 }, { "crossentropy": 2.4164135456085205, "epoch": 0.875072505800464, "grad_norm": 0.026670189574360847, "grad_norm_var": 3.50318302114877e-07, "learning_rate": 0.0003956411920838443, "loss": 2.3811, "step": 24138 }, { "crossentropy": 2.4657418727874756, "epoch": 0.875108758700696, "grad_norm": 0.02720584161579609, "grad_norm_var": 3.674210452581095e-07, "learning_rate": 0.0003954146770564371, "loss": 2.5605, "step": 24139 }, { "crossentropy": 2.424072265625, "epoch": 0.8751450116009281, "grad_norm": 0.026383165270090103, "grad_norm_var": 3.678582823671139e-07, "learning_rate": 0.0003951882242209437, "loss": 2.4096, "step": 24140 }, { "crossentropy": 2.281830310821533, "epoch": 0.8751812645011601, "grad_norm": 0.025914130732417107, "grad_norm_var": 3.7701587052828743e-07, "learning_rate": 0.00039496183358042006, "loss": 2.317, "step": 24141 }, { "crossentropy": 2.4074835777282715, "epoch": 0.8752175174013921, "grad_norm": 0.026820287108421326, "grad_norm_var": 3.7838253323928205e-07, "learning_rate": 0.00039473550513792645, "loss": 2.5014, "step": 24142 }, { "crossentropy": 2.547055721282959, "epoch": 0.8752537703016241, "grad_norm": 0.02763190306723118, "grad_norm_var": 4.354095547608993e-07, "learning_rate": 0.00039450923889651826, "loss": 2.5115, "step": 24143 }, { "crossentropy": 2.476412773132324, "epoch": 0.8752900232018561, "grad_norm": 0.02601538971066475, "grad_norm_var": 4.095602722558317e-07, "learning_rate": 0.0003942830348592508, "loss": 2.4149, "step": 24144 }, { "crossentropy": 2.426025867462158, "epoch": 0.8753262761020881, "grad_norm": 0.02855139784514904, "grad_norm_var": 6.133782129406993e-07, "learning_rate": 0.00039405689302918104, "loss": 2.3995, "step": 24145 }, { "crossentropy": 2.445448637008667, "epoch": 0.8753625290023201, "grad_norm": 0.02686070092022419, "grad_norm_var": 5.683426184991194e-07, "learning_rate": 0.0003938308134093632, "loss": 2.3793, "step": 24146 }, { "crossentropy": 2.4463391304016113, "epoch": 0.8753987819025522, "grad_norm": 0.02619752287864685, "grad_norm_var": 5.630379871151275e-07, "learning_rate": 0.0003936047960028494, "loss": 2.3498, "step": 24147 }, { "crossentropy": 2.446963310241699, "epoch": 0.8754350348027842, "grad_norm": 0.026897696778178215, "grad_norm_var": 5.295343925995245e-07, "learning_rate": 0.00039337884081269426, "loss": 2.4524, "step": 24148 }, { "crossentropy": 2.3761849403381348, "epoch": 0.8754712877030162, "grad_norm": 0.02649427019059658, "grad_norm_var": 5.34658767257579e-07, "learning_rate": 0.00039315294784194764, "loss": 2.4419, "step": 24149 }, { "crossentropy": 2.4052178859710693, "epoch": 0.8755075406032483, "grad_norm": 0.027169842272996902, "grad_norm_var": 5.389133116396119e-07, "learning_rate": 0.0003929271170936616, "loss": 2.3691, "step": 24150 }, { "crossentropy": 2.3106963634490967, "epoch": 0.8755437935034803, "grad_norm": 0.026430301368236542, "grad_norm_var": 5.380068014875268e-07, "learning_rate": 0.0003927013485708886, "loss": 2.3442, "step": 24151 }, { "crossentropy": 2.3132591247558594, "epoch": 0.8755800464037123, "grad_norm": 0.02617373690009117, "grad_norm_var": 5.570703761991705e-07, "learning_rate": 0.00039247564227667345, "loss": 2.4093, "step": 24152 }, { "crossentropy": 2.484609842300415, "epoch": 0.8756162993039444, "grad_norm": 0.026775633916258812, "grad_norm_var": 4.4189271375936513e-07, "learning_rate": 0.000392249998214067, "loss": 2.4469, "step": 24153 }, { "crossentropy": 2.402165412902832, "epoch": 0.8756525522041764, "grad_norm": 0.026645373553037643, "grad_norm_var": 4.422349881377796e-07, "learning_rate": 0.0003920244163861175, "loss": 2.4246, "step": 24154 }, { "crossentropy": 2.587031364440918, "epoch": 0.8756888051044084, "grad_norm": 0.027192993089556694, "grad_norm_var": 4.414822885165717e-07, "learning_rate": 0.00039179889679587075, "loss": 2.4923, "step": 24155 }, { "crossentropy": 2.441516399383545, "epoch": 0.8757250580046404, "grad_norm": 0.026699984446167946, "grad_norm_var": 4.318521539407264e-07, "learning_rate": 0.0003915734394463738, "loss": 2.4746, "step": 24156 }, { "crossentropy": 2.4194483757019043, "epoch": 0.8757613109048724, "grad_norm": 0.027144107967615128, "grad_norm_var": 3.8449553153088735e-07, "learning_rate": 0.00039134804434067026, "loss": 2.4815, "step": 24157 }, { "crossentropy": 2.381270170211792, "epoch": 0.8757975638051044, "grad_norm": 0.027144944295287132, "grad_norm_var": 3.895233367412343e-07, "learning_rate": 0.000391122711481805, "loss": 2.45, "step": 24158 }, { "crossentropy": 2.44994854927063, "epoch": 0.8758338167053364, "grad_norm": 0.02583901956677437, "grad_norm_var": 4.098722115373118e-07, "learning_rate": 0.0003908974408728233, "loss": 2.4253, "step": 24159 }, { "crossentropy": 2.5581769943237305, "epoch": 0.8758700696055685, "grad_norm": 0.026054270565509796, "grad_norm_var": 4.0608292427573904e-07, "learning_rate": 0.00039067223251676496, "loss": 2.4982, "step": 24160 }, { "crossentropy": 2.4872357845306396, "epoch": 0.8759063225058005, "grad_norm": 0.027178531512618065, "grad_norm_var": 1.9724625628930505e-07, "learning_rate": 0.00039044708641667317, "loss": 2.45, "step": 24161 }, { "crossentropy": 2.6308767795562744, "epoch": 0.8759425754060325, "grad_norm": 0.026780327782034874, "grad_norm_var": 1.9572620871335932e-07, "learning_rate": 0.0003902220025755893, "loss": 2.5914, "step": 24162 }, { "crossentropy": 2.43570613861084, "epoch": 0.8759788283062645, "grad_norm": 0.025607896968722343, "grad_norm_var": 2.550837770804443e-07, "learning_rate": 0.00038999698099655145, "loss": 2.3713, "step": 24163 }, { "crossentropy": 2.348604202270508, "epoch": 0.8760150812064965, "grad_norm": 0.027679210528731346, "grad_norm_var": 3.201810747234379e-07, "learning_rate": 0.0003897720216826017, "loss": 2.406, "step": 24164 }, { "crossentropy": 2.5759599208831787, "epoch": 0.8760513341067285, "grad_norm": 0.026442717760801315, "grad_norm_var": 3.216798607050611e-07, "learning_rate": 0.00038954712463677656, "loss": 2.4381, "step": 24165 }, { "crossentropy": 2.3769583702087402, "epoch": 0.8760875870069605, "grad_norm": 0.02603619545698166, "grad_norm_var": 3.2870627212980284e-07, "learning_rate": 0.0003893222898621135, "loss": 2.3847, "step": 24166 }, { "crossentropy": 2.645097255706787, "epoch": 0.8761238399071926, "grad_norm": 0.027164265513420105, "grad_norm_var": 3.4439051893875306e-07, "learning_rate": 0.0003890975173616512, "loss": 2.5239, "step": 24167 }, { "crossentropy": 2.429718255996704, "epoch": 0.8761600928074246, "grad_norm": 0.026628540828824043, "grad_norm_var": 3.278341779843942e-07, "learning_rate": 0.000388872807138424, "loss": 2.4536, "step": 24168 }, { "crossentropy": 2.3252389430999756, "epoch": 0.8761963457076566, "grad_norm": 0.026745403185486794, "grad_norm_var": 3.27539579909293e-07, "learning_rate": 0.000388648159195466, "loss": 2.3945, "step": 24169 }, { "crossentropy": 2.461347818374634, "epoch": 0.8762325986078886, "grad_norm": 0.027619879692792892, "grad_norm_var": 3.815515004828753e-07, "learning_rate": 0.00038842357353581383, "loss": 2.4179, "step": 24170 }, { "crossentropy": 2.2729976177215576, "epoch": 0.8762688515081206, "grad_norm": 0.027168896049261093, "grad_norm_var": 3.801561066503478e-07, "learning_rate": 0.0003881990501624988, "loss": 2.3323, "step": 24171 }, { "crossentropy": 2.4507482051849365, "epoch": 0.8763051044083526, "grad_norm": 0.027119994163513184, "grad_norm_var": 3.886110139052068e-07, "learning_rate": 0.0003879745890785552, "loss": 2.4441, "step": 24172 }, { "crossentropy": 2.3866169452667236, "epoch": 0.8763413573085846, "grad_norm": 0.02580885775387287, "grad_norm_var": 4.3381870234026945e-07, "learning_rate": 0.00038775019028701284, "loss": 2.4209, "step": 24173 }, { "crossentropy": 2.4310522079467773, "epoch": 0.8763776102088167, "grad_norm": 0.029920218512415886, "grad_norm_var": 1.084035703444117e-06, "learning_rate": 0.0003875258537909032, "loss": 2.4792, "step": 24174 }, { "crossentropy": 2.3665595054626465, "epoch": 0.8764138631090487, "grad_norm": 0.025946712121367455, "grad_norm_var": 1.070069577796872e-06, "learning_rate": 0.0003873015795932577, "loss": 2.3707, "step": 24175 }, { "crossentropy": 2.3531296253204346, "epoch": 0.8764501160092807, "grad_norm": 0.026351528242230415, "grad_norm_var": 1.0433060899534923e-06, "learning_rate": 0.000387077367697104, "loss": 2.358, "step": 24176 }, { "crossentropy": 2.481813430786133, "epoch": 0.8764863689095128, "grad_norm": 0.027639424428343773, "grad_norm_var": 1.0744702286120493e-06, "learning_rate": 0.0003868532181054707, "loss": 2.4721, "step": 24177 }, { "crossentropy": 2.3685553073883057, "epoch": 0.8765226218097448, "grad_norm": 0.025798320770263672, "grad_norm_var": 1.152538780018715e-06, "learning_rate": 0.0003866291308213854, "loss": 2.367, "step": 24178 }, { "crossentropy": 2.4109487533569336, "epoch": 0.8765588747099768, "grad_norm": 0.02623097598552704, "grad_norm_var": 1.0732072284436525e-06, "learning_rate": 0.00038640510584787456, "loss": 2.4017, "step": 24179 }, { "crossentropy": 2.440244674682617, "epoch": 0.8765951276102089, "grad_norm": 0.02613101899623871, "grad_norm_var": 1.0608889009568277e-06, "learning_rate": 0.0003861811431879647, "loss": 2.4524, "step": 24180 }, { "crossentropy": 2.475517749786377, "epoch": 0.8766313805104409, "grad_norm": 0.026203256100416183, "grad_norm_var": 1.0757862653105182e-06, "learning_rate": 0.00038595724284467946, "loss": 2.4606, "step": 24181 }, { "crossentropy": 2.4392800331115723, "epoch": 0.8766676334106729, "grad_norm": 0.026773804798722267, "grad_norm_var": 1.0364330217530379e-06, "learning_rate": 0.0003857334048210437, "loss": 2.4983, "step": 24182 }, { "crossentropy": 2.4841392040252686, "epoch": 0.8767038863109049, "grad_norm": 0.026709897443652153, "grad_norm_var": 1.0289761202341887e-06, "learning_rate": 0.0003855096291200816, "loss": 2.4579, "step": 24183 }, { "crossentropy": 2.549980878829956, "epoch": 0.8767401392111369, "grad_norm": 0.02655212953686714, "grad_norm_var": 1.031085810853973e-06, "learning_rate": 0.00038528591574481464, "loss": 2.4433, "step": 24184 }, { "crossentropy": 2.388747215270996, "epoch": 0.8767763921113689, "grad_norm": 0.028614826500415802, "grad_norm_var": 1.2371400054247133e-06, "learning_rate": 0.00038506226469826323, "loss": 2.3983, "step": 24185 }, { "crossentropy": 2.538140296936035, "epoch": 0.8768126450116009, "grad_norm": 0.02645842358469963, "grad_norm_var": 1.2118065843989457e-06, "learning_rate": 0.00038483867598345045, "loss": 2.4795, "step": 24186 }, { "crossentropy": 2.4191157817840576, "epoch": 0.876848897911833, "grad_norm": 0.025814730674028397, "grad_norm_var": 1.2669006730726817e-06, "learning_rate": 0.0003846151496033945, "loss": 2.3965, "step": 24187 }, { "crossentropy": 2.2667033672332764, "epoch": 0.876885150812065, "grad_norm": 0.027069326490163803, "grad_norm_var": 1.2645928536730943e-06, "learning_rate": 0.0003843916855611157, "loss": 2.3991, "step": 24188 }, { "crossentropy": 2.354090929031372, "epoch": 0.876921403712297, "grad_norm": 0.026513272896409035, "grad_norm_var": 1.207073751218141e-06, "learning_rate": 0.0003841682838596311, "loss": 2.3967, "step": 24189 }, { "crossentropy": 2.2047102451324463, "epoch": 0.876957656612529, "grad_norm": 0.025695210322737694, "grad_norm_var": 5.624759516712479e-07, "learning_rate": 0.00038394494450195825, "loss": 2.3527, "step": 24190 }, { "crossentropy": 2.5764942169189453, "epoch": 0.876993909512761, "grad_norm": 0.0265557412058115, "grad_norm_var": 5.381769761676613e-07, "learning_rate": 0.0003837216674911148, "loss": 2.5204, "step": 24191 }, { "crossentropy": 2.2762880325317383, "epoch": 0.877030162412993, "grad_norm": 0.02772030420601368, "grad_norm_var": 6.154943810232482e-07, "learning_rate": 0.00038349845283011495, "loss": 2.4008, "step": 24192 }, { "crossentropy": 2.3946924209594727, "epoch": 0.877066415313225, "grad_norm": 0.026714157313108444, "grad_norm_var": 5.475595457354297e-07, "learning_rate": 0.00038327530052197523, "loss": 2.4233, "step": 24193 }, { "crossentropy": 2.4920198917388916, "epoch": 0.8771026682134571, "grad_norm": 0.026317371055483818, "grad_norm_var": 5.091092203560506e-07, "learning_rate": 0.00038305221056970874, "loss": 2.3945, "step": 24194 }, { "crossentropy": 2.4780189990997314, "epoch": 0.8771389211136891, "grad_norm": 0.02689153328537941, "grad_norm_var": 5.012670202770292e-07, "learning_rate": 0.00038282918297632805, "loss": 2.5018, "step": 24195 }, { "crossentropy": 2.352954864501953, "epoch": 0.8771751740139211, "grad_norm": 0.02576059103012085, "grad_norm_var": 5.365098771998768e-07, "learning_rate": 0.00038260621774484637, "loss": 2.3596, "step": 24196 }, { "crossentropy": 2.4614059925079346, "epoch": 0.8772114269141531, "grad_norm": 0.0269811749458313, "grad_norm_var": 5.282244742847344e-07, "learning_rate": 0.00038238331487827404, "loss": 2.5199, "step": 24197 }, { "crossentropy": 2.523308277130127, "epoch": 0.8772476798143851, "grad_norm": 0.025794392451643944, "grad_norm_var": 5.780701240369236e-07, "learning_rate": 0.00038216047437962254, "loss": 2.4752, "step": 24198 }, { "crossentropy": 2.5414302349090576, "epoch": 0.8772839327146171, "grad_norm": 0.02776547521352768, "grad_norm_var": 6.582246252962813e-07, "learning_rate": 0.00038193769625190276, "loss": 2.5094, "step": 24199 }, { "crossentropy": 2.2318553924560547, "epoch": 0.8773201856148491, "grad_norm": 0.02586698718369007, "grad_norm_var": 7.011782306995714e-07, "learning_rate": 0.00038171498049812183, "loss": 2.3796, "step": 24200 }, { "crossentropy": 2.4501781463623047, "epoch": 0.8773564385150812, "grad_norm": 0.02650238387286663, "grad_norm_var": 4.2901841153236467e-07, "learning_rate": 0.0003814923271212889, "loss": 2.4089, "step": 24201 }, { "crossentropy": 2.2658495903015137, "epoch": 0.8773926914153132, "grad_norm": 0.026126375421881676, "grad_norm_var": 4.3891527193536987e-07, "learning_rate": 0.00038126973612441164, "loss": 2.3642, "step": 24202 }, { "crossentropy": 2.414630889892578, "epoch": 0.8774289443155452, "grad_norm": 0.026498055085539818, "grad_norm_var": 4.0515674720141095e-07, "learning_rate": 0.0003810472075104943, "loss": 2.4179, "step": 24203 }, { "crossentropy": 2.279632568359375, "epoch": 0.8774651972157773, "grad_norm": 0.02658391185104847, "grad_norm_var": 3.8615979422276734e-07, "learning_rate": 0.000380824741282545, "loss": 2.2942, "step": 24204 }, { "crossentropy": 2.4209089279174805, "epoch": 0.8775014501160093, "grad_norm": 0.025605155155062675, "grad_norm_var": 4.3826648578845715e-07, "learning_rate": 0.0003806023374435663, "loss": 2.4909, "step": 24205 }, { "crossentropy": 2.377230167388916, "epoch": 0.8775377030162413, "grad_norm": 0.0268718171864748, "grad_norm_var": 4.0462628604595537e-07, "learning_rate": 0.0003803799959965637, "loss": 2.4466, "step": 24206 }, { "crossentropy": 2.4063143730163574, "epoch": 0.8775739559164734, "grad_norm": 0.028532041236758232, "grad_norm_var": 6.542771677438824e-07, "learning_rate": 0.00038015771694454025, "loss": 2.4023, "step": 24207 }, { "crossentropy": 2.6145615577697754, "epoch": 0.8776102088167054, "grad_norm": 0.026702675968408585, "grad_norm_var": 5.748942869046013e-07, "learning_rate": 0.00037993550029049694, "loss": 2.4512, "step": 24208 }, { "crossentropy": 2.373457908630371, "epoch": 0.8776464617169374, "grad_norm": 0.026404032483696938, "grad_norm_var": 5.759629705222862e-07, "learning_rate": 0.0003797133460374369, "loss": 2.4326, "step": 24209 }, { "crossentropy": 2.3105483055114746, "epoch": 0.8776827146171694, "grad_norm": 0.025690382346510887, "grad_norm_var": 6.220908007892178e-07, "learning_rate": 0.0003794912541883594, "loss": 2.3982, "step": 24210 }, { "crossentropy": 2.4599721431732178, "epoch": 0.8777189675174014, "grad_norm": 0.02678927220404148, "grad_norm_var": 6.178975938943287e-07, "learning_rate": 0.0003792692247462637, "loss": 2.4632, "step": 24211 }, { "crossentropy": 2.467759132385254, "epoch": 0.8777552204176334, "grad_norm": 0.025958117097616196, "grad_norm_var": 6.000810361810859e-07, "learning_rate": 0.0003790472577141502, "loss": 2.4403, "step": 24212 }, { "crossentropy": 2.351505756378174, "epoch": 0.8777914733178654, "grad_norm": 0.026368489488959312, "grad_norm_var": 5.876669642728421e-07, "learning_rate": 0.0003788253530950153, "loss": 2.3634, "step": 24213 }, { "crossentropy": 2.3871798515319824, "epoch": 0.8778277262180975, "grad_norm": 0.025645798072218895, "grad_norm_var": 6.031006487449414e-07, "learning_rate": 0.00037860351089185674, "loss": 2.4308, "step": 24214 }, { "crossentropy": 2.438049554824829, "epoch": 0.8778639791183295, "grad_norm": 0.0271710567176342, "grad_norm_var": 5.244467236343534e-07, "learning_rate": 0.0003783817311076715, "loss": 2.4897, "step": 24215 }, { "crossentropy": 2.358837366104126, "epoch": 0.8779002320185615, "grad_norm": 0.02743884176015854, "grad_norm_var": 5.551522857036653e-07, "learning_rate": 0.00037816001374545437, "loss": 2.4142, "step": 24216 }, { "crossentropy": 2.6178712844848633, "epoch": 0.8779364849187935, "grad_norm": 0.027240540832281113, "grad_norm_var": 5.839767811127571e-07, "learning_rate": 0.00037793835880820025, "loss": 2.5336, "step": 24217 }, { "crossentropy": 2.4830429553985596, "epoch": 0.8779727378190255, "grad_norm": 0.026048220694065094, "grad_norm_var": 5.893113080111696e-07, "learning_rate": 0.0003777167662989034, "loss": 2.5288, "step": 24218 }, { "crossentropy": 2.4336209297180176, "epoch": 0.8780089907192575, "grad_norm": 0.027531925588846207, "grad_norm_var": 6.425082701120987e-07, "learning_rate": 0.00037749523622055526, "loss": 2.3884, "step": 24219 }, { "crossentropy": 2.558906078338623, "epoch": 0.8780452436194895, "grad_norm": 0.02584744431078434, "grad_norm_var": 6.840155537941034e-07, "learning_rate": 0.0003772737685761496, "loss": 2.4532, "step": 24220 }, { "crossentropy": 2.403870105743408, "epoch": 0.8780814965197216, "grad_norm": 0.026600787416100502, "grad_norm_var": 6.118646832273342e-07, "learning_rate": 0.0003770523633686762, "loss": 2.4396, "step": 24221 }, { "crossentropy": 2.339202404022217, "epoch": 0.8781177494199536, "grad_norm": 0.026551062241196632, "grad_norm_var": 6.09988348350858e-07, "learning_rate": 0.00037683102060112605, "loss": 2.3527, "step": 24222 }, { "crossentropy": 2.269855499267578, "epoch": 0.8781540023201856, "grad_norm": 0.02554161101579666, "grad_norm_var": 4.214979262698778e-07, "learning_rate": 0.00037660974027648955, "loss": 2.3316, "step": 24223 }, { "crossentropy": 2.491650104522705, "epoch": 0.8781902552204176, "grad_norm": 0.026979530230164528, "grad_norm_var": 4.3485375473063746e-07, "learning_rate": 0.00037638852239775425, "loss": 2.4231, "step": 24224 }, { "crossentropy": 2.4367873668670654, "epoch": 0.8782265081206496, "grad_norm": 0.02745564468204975, "grad_norm_var": 4.922060438400464e-07, "learning_rate": 0.0003761673669679094, "loss": 2.4679, "step": 24225 }, { "crossentropy": 2.2090325355529785, "epoch": 0.8782627610208816, "grad_norm": 0.026107773184776306, "grad_norm_var": 4.5505069100892916e-07, "learning_rate": 0.00037594627398993994, "loss": 2.299, "step": 24226 }, { "crossentropy": 2.371262550354004, "epoch": 0.8782990139211136, "grad_norm": 0.02561989054083824, "grad_norm_var": 5.078494692440597e-07, "learning_rate": 0.00037572524346683457, "loss": 2.3529, "step": 24227 }, { "crossentropy": 2.4236927032470703, "epoch": 0.8783352668213457, "grad_norm": 0.027161411941051483, "grad_norm_var": 5.103347846378229e-07, "learning_rate": 0.00037550427540157695, "loss": 2.3863, "step": 24228 }, { "crossentropy": 2.285865545272827, "epoch": 0.8783715197215777, "grad_norm": 0.025647932663559914, "grad_norm_var": 5.632859422598543e-07, "learning_rate": 0.00037528336979715136, "loss": 2.3498, "step": 24229 }, { "crossentropy": 2.1407225131988525, "epoch": 0.8784077726218097, "grad_norm": 0.025849755853414536, "grad_norm_var": 5.416544869817075e-07, "learning_rate": 0.00037506252665654217, "loss": 2.2584, "step": 24230 }, { "crossentropy": 2.481886386871338, "epoch": 0.8784440255220418, "grad_norm": 0.026608997955918312, "grad_norm_var": 5.148253750020809e-07, "learning_rate": 0.000374841745982733, "loss": 2.4663, "step": 24231 }, { "crossentropy": 2.408228635787964, "epoch": 0.8784802784222738, "grad_norm": 0.026722749695181847, "grad_norm_var": 4.586156879967639e-07, "learning_rate": 0.0003746210277787043, "loss": 2.5161, "step": 24232 }, { "crossentropy": 2.390639305114746, "epoch": 0.8785165313225058, "grad_norm": 0.025685034692287445, "grad_norm_var": 4.499686498386316e-07, "learning_rate": 0.0003744003720474387, "loss": 2.3892, "step": 24233 }, { "crossentropy": 2.5062365531921387, "epoch": 0.8785527842227379, "grad_norm": 0.026444872841238976, "grad_norm_var": 4.42652564612321e-07, "learning_rate": 0.00037417977879191435, "loss": 2.4703, "step": 24234 }, { "crossentropy": 2.3113389015197754, "epoch": 0.8785890371229699, "grad_norm": 0.02608196996152401, "grad_norm_var": 3.546920049756393e-07, "learning_rate": 0.00037395924801511326, "loss": 2.3678, "step": 24235 }, { "crossentropy": 2.443629741668701, "epoch": 0.8786252900232019, "grad_norm": 0.027954481542110443, "grad_norm_var": 5.031576933149506e-07, "learning_rate": 0.0003737387797200126, "loss": 2.4657, "step": 24236 }, { "crossentropy": 2.4748597145080566, "epoch": 0.8786615429234339, "grad_norm": 0.0257104579359293, "grad_norm_var": 5.334168680380062e-07, "learning_rate": 0.00037351837390958985, "loss": 2.4105, "step": 24237 }, { "crossentropy": 2.358210325241089, "epoch": 0.8786977958236659, "grad_norm": 0.026286428794264793, "grad_norm_var": 5.3185317539701e-07, "learning_rate": 0.00037329803058682224, "loss": 2.3424, "step": 24238 }, { "crossentropy": 2.4572789669036865, "epoch": 0.8787340487238979, "grad_norm": 0.0285712331533432, "grad_norm_var": 7.724404872582695e-07, "learning_rate": 0.00037307774975468665, "loss": 2.4259, "step": 24239 }, { "crossentropy": 2.4530303478240967, "epoch": 0.87877030162413, "grad_norm": 0.02686280757188797, "grad_norm_var": 7.6669296816545e-07, "learning_rate": 0.000372857531416157, "loss": 2.4731, "step": 24240 }, { "crossentropy": 2.483790397644043, "epoch": 0.878806554524362, "grad_norm": 0.025846384465694427, "grad_norm_var": 7.338450130741815e-07, "learning_rate": 0.0003726373755742091, "loss": 2.3471, "step": 24241 }, { "crossentropy": 2.5563197135925293, "epoch": 0.878842807424594, "grad_norm": 0.026704592630267143, "grad_norm_var": 7.290621716897367e-07, "learning_rate": 0.00037241728223181447, "loss": 2.5577, "step": 24242 }, { "crossentropy": 2.3718769550323486, "epoch": 0.878879060324826, "grad_norm": 0.025643009692430496, "grad_norm_var": 7.264290236355086e-07, "learning_rate": 0.0003721972513919486, "loss": 2.4561, "step": 24243 }, { "crossentropy": 2.4157984256744385, "epoch": 0.878915313225058, "grad_norm": 0.026577498763799667, "grad_norm_var": 6.951842147218112e-07, "learning_rate": 0.0003719772830575813, "loss": 2.4604, "step": 24244 }, { "crossentropy": 2.3197319507598877, "epoch": 0.87895156612529, "grad_norm": 0.026705583557486534, "grad_norm_var": 6.520064640547566e-07, "learning_rate": 0.00037175737723168346, "loss": 2.3916, "step": 24245 }, { "crossentropy": 2.409081220626831, "epoch": 0.878987819025522, "grad_norm": 0.026706276461482048, "grad_norm_var": 6.217722794997811e-07, "learning_rate": 0.00037153753391722555, "loss": 2.4129, "step": 24246 }, { "crossentropy": 2.2376928329467773, "epoch": 0.879024071925754, "grad_norm": 0.026657259091734886, "grad_norm_var": 6.221718600264338e-07, "learning_rate": 0.0003713177531171785, "loss": 2.3603, "step": 24247 }, { "crossentropy": 2.5957343578338623, "epoch": 0.8790603248259861, "grad_norm": 0.02541729249060154, "grad_norm_var": 7.025398825061575e-07, "learning_rate": 0.0003710980348345083, "loss": 2.4598, "step": 24248 }, { "crossentropy": 2.529019832611084, "epoch": 0.8790965777262181, "grad_norm": 0.026117807254195213, "grad_norm_var": 6.677419594785493e-07, "learning_rate": 0.0003708783790721848, "loss": 2.5209, "step": 24249 }, { "crossentropy": 2.391798257827759, "epoch": 0.8791328306264501, "grad_norm": 0.026265699416399002, "grad_norm_var": 6.714953313172728e-07, "learning_rate": 0.0003706587858331728, "loss": 2.3681, "step": 24250 }, { "crossentropy": 2.4473257064819336, "epoch": 0.8791690835266821, "grad_norm": 0.025408487766981125, "grad_norm_var": 7.37992605220894e-07, "learning_rate": 0.0003704392551204394, "loss": 2.469, "step": 24251 }, { "crossentropy": 2.513198137283325, "epoch": 0.8792053364269141, "grad_norm": 0.026610229164361954, "grad_norm_var": 5.839131514264717e-07, "learning_rate": 0.0003702197869369506, "loss": 2.4212, "step": 24252 }, { "crossentropy": 2.325925350189209, "epoch": 0.8792415893271461, "grad_norm": 0.026396246626973152, "grad_norm_var": 5.520222282279566e-07, "learning_rate": 0.0003700003812856684, "loss": 2.3112, "step": 24253 }, { "crossentropy": 2.3954153060913086, "epoch": 0.8792778422273781, "grad_norm": 0.026638174429535866, "grad_norm_var": 5.533240273069655e-07, "learning_rate": 0.0003697810381695571, "loss": 2.3824, "step": 24254 }, { "crossentropy": 2.3948752880096436, "epoch": 0.8793140951276102, "grad_norm": 0.025884682312607765, "grad_norm_var": 2.429822574733272e-07, "learning_rate": 0.00036956175759157993, "loss": 2.4759, "step": 24255 }, { "crossentropy": 2.385763645172119, "epoch": 0.8793503480278422, "grad_norm": 0.027334457263350487, "grad_norm_var": 2.9368562962326744e-07, "learning_rate": 0.0003693425395546979, "loss": 2.4248, "step": 24256 }, { "crossentropy": 2.480194091796875, "epoch": 0.8793866009280742, "grad_norm": 0.026356352493166924, "grad_norm_var": 2.786128032608668e-07, "learning_rate": 0.00036912338406187297, "loss": 2.412, "step": 24257 }, { "crossentropy": 2.4552736282348633, "epoch": 0.8794228538283063, "grad_norm": 0.02640613541007042, "grad_norm_var": 2.696307244707055e-07, "learning_rate": 0.0003689042911160628, "loss": 2.492, "step": 24258 }, { "crossentropy": 2.3575973510742188, "epoch": 0.8794591067285383, "grad_norm": 0.02647153101861477, "grad_norm_var": 2.3771106235462246e-07, "learning_rate": 0.000368685260720229, "loss": 2.457, "step": 24259 }, { "crossentropy": 2.5314340591430664, "epoch": 0.8794953596287703, "grad_norm": 0.03107570670545101, "grad_norm_var": 1.6255141549653684e-06, "learning_rate": 0.00036846629287733026, "loss": 2.4854, "step": 24260 }, { "crossentropy": 2.4016318321228027, "epoch": 0.8795316125290024, "grad_norm": 0.025951813906431198, "grad_norm_var": 1.6557645449216297e-06, "learning_rate": 0.00036824738759032137, "loss": 2.4498, "step": 24261 }, { "crossentropy": 2.469270944595337, "epoch": 0.8795678654292344, "grad_norm": 0.027435695752501488, "grad_norm_var": 1.6987572245860064e-06, "learning_rate": 0.00036802854486216006, "loss": 2.4741, "step": 24262 }, { "crossentropy": 2.256955623626709, "epoch": 0.8796041183294664, "grad_norm": 0.02744472585618496, "grad_norm_var": 1.7380949633926616e-06, "learning_rate": 0.0003678097646958034, "loss": 2.3674, "step": 24263 }, { "crossentropy": 2.428027391433716, "epoch": 0.8796403712296984, "grad_norm": 0.026841096580028534, "grad_norm_var": 1.6211077759106218e-06, "learning_rate": 0.0003675910470942051, "loss": 2.4023, "step": 24264 }, { "crossentropy": 2.4811480045318604, "epoch": 0.8796766241299304, "grad_norm": 0.02737879939377308, "grad_norm_var": 1.6074839659441694e-06, "learning_rate": 0.00036737239206032, "loss": 2.4197, "step": 24265 }, { "crossentropy": 2.4208226203918457, "epoch": 0.8797128770301624, "grad_norm": 0.0255291610956192, "grad_norm_var": 1.7006111398403777e-06, "learning_rate": 0.0003671537995971003, "loss": 2.4193, "step": 24266 }, { "crossentropy": 2.346417188644409, "epoch": 0.8797491299303944, "grad_norm": 0.0263972170650959, "grad_norm_var": 1.5752730431264327e-06, "learning_rate": 0.0003669352697074996, "loss": 2.3856, "step": 24267 }, { "crossentropy": 2.4114584922790527, "epoch": 0.8797853828306265, "grad_norm": 0.026166081428527832, "grad_norm_var": 1.6038445727203457e-06, "learning_rate": 0.00036671680239446945, "loss": 2.3865, "step": 24268 }, { "crossentropy": 2.5021209716796875, "epoch": 0.8798216357308585, "grad_norm": 0.026925155892968178, "grad_norm_var": 1.5888539097892267e-06, "learning_rate": 0.0003664983976609598, "loss": 2.4594, "step": 24269 }, { "crossentropy": 2.4352588653564453, "epoch": 0.8798578886310905, "grad_norm": 0.02802291139960289, "grad_norm_var": 1.6622395634683137e-06, "learning_rate": 0.0003662800555099205, "loss": 2.4893, "step": 24270 }, { "crossentropy": 2.4332432746887207, "epoch": 0.8798941415313225, "grad_norm": 0.027106689289212227, "grad_norm_var": 1.5777015867400226e-06, "learning_rate": 0.0003660617759443019, "loss": 2.4413, "step": 24271 }, { "crossentropy": 2.4214420318603516, "epoch": 0.8799303944315545, "grad_norm": 0.02549048326909542, "grad_norm_var": 1.7209479296725518e-06, "learning_rate": 0.00036584355896705003, "loss": 2.4015, "step": 24272 }, { "crossentropy": 2.458423376083374, "epoch": 0.8799666473317865, "grad_norm": 0.027166398242115974, "grad_norm_var": 1.6991943296418606e-06, "learning_rate": 0.0003656254045811147, "loss": 2.4323, "step": 24273 }, { "crossentropy": 2.503840684890747, "epoch": 0.8800029002320185, "grad_norm": 0.028009243309497833, "grad_norm_var": 1.7354228782465725e-06, "learning_rate": 0.0003654073127894403, "loss": 2.5021, "step": 24274 }, { "crossentropy": 2.4583749771118164, "epoch": 0.8800391531322506, "grad_norm": 0.02753441222012043, "grad_norm_var": 1.7186239974948386e-06, "learning_rate": 0.00036518928359497327, "loss": 2.5233, "step": 24275 }, { "crossentropy": 2.432854652404785, "epoch": 0.8800754060324826, "grad_norm": 0.026609957218170166, "grad_norm_var": 6.303731441105927e-07, "learning_rate": 0.00036497131700065913, "loss": 2.4472, "step": 24276 }, { "crossentropy": 2.3711209297180176, "epoch": 0.8801116589327146, "grad_norm": 0.02586149424314499, "grad_norm_var": 6.42007985315364e-07, "learning_rate": 0.00036475341300944153, "loss": 2.3411, "step": 24277 }, { "crossentropy": 2.667302370071411, "epoch": 0.8801479118329466, "grad_norm": 0.028107473626732826, "grad_norm_var": 7.208855896621383e-07, "learning_rate": 0.00036453557162426266, "loss": 2.5829, "step": 24278 }, { "crossentropy": 2.354813575744629, "epoch": 0.8801841647331786, "grad_norm": 0.02626422978937626, "grad_norm_var": 7.241261195588151e-07, "learning_rate": 0.00036431779284806564, "loss": 2.3746, "step": 24279 }, { "crossentropy": 2.3379123210906982, "epoch": 0.8802204176334106, "grad_norm": 0.02633044682443142, "grad_norm_var": 7.402249146042876e-07, "learning_rate": 0.00036410007668379155, "loss": 2.3948, "step": 24280 }, { "crossentropy": 2.3582468032836914, "epoch": 0.8802566705336426, "grad_norm": 0.025673843920230865, "grad_norm_var": 7.917505168835807e-07, "learning_rate": 0.0003638824231343818, "loss": 2.4169, "step": 24281 }, { "crossentropy": 2.444643974304199, "epoch": 0.8802929234338747, "grad_norm": 0.026731712743639946, "grad_norm_var": 6.944492175657192e-07, "learning_rate": 0.00036366483220277466, "loss": 2.4465, "step": 24282 }, { "crossentropy": 2.5803349018096924, "epoch": 0.8803291763341067, "grad_norm": 0.02688557468354702, "grad_norm_var": 6.847651051108498e-07, "learning_rate": 0.00036344730389191004, "loss": 2.5422, "step": 24283 }, { "crossentropy": 2.3975815773010254, "epoch": 0.8803654292343387, "grad_norm": 0.026271162554621696, "grad_norm_var": 6.764981130219129e-07, "learning_rate": 0.00036322983820472613, "loss": 2.3996, "step": 24284 }, { "crossentropy": 2.478771924972534, "epoch": 0.8804016821345708, "grad_norm": 0.02595643512904644, "grad_norm_var": 7.205272844054787e-07, "learning_rate": 0.00036301243514416046, "loss": 2.483, "step": 24285 }, { "crossentropy": 2.4727914333343506, "epoch": 0.8804379350348028, "grad_norm": 0.027214931324124336, "grad_norm_var": 6.243489494739407e-07, "learning_rate": 0.00036279509471314755, "loss": 2.4886, "step": 24286 }, { "crossentropy": 2.4612104892730713, "epoch": 0.8804741879350348, "grad_norm": 0.02663177251815796, "grad_norm_var": 6.127504352977508e-07, "learning_rate": 0.00036257781691462486, "loss": 2.4593, "step": 24287 }, { "crossentropy": 2.4383959770202637, "epoch": 0.8805104408352669, "grad_norm": 0.026228927075862885, "grad_norm_var": 5.305769576381218e-07, "learning_rate": 0.0003623606017515252, "loss": 2.3763, "step": 24288 }, { "crossentropy": 2.380720376968384, "epoch": 0.8805466937354989, "grad_norm": 0.02649643085896969, "grad_norm_var": 5.185197719506999e-07, "learning_rate": 0.000362143449226785, "loss": 2.4326, "step": 24289 }, { "crossentropy": 2.3098983764648438, "epoch": 0.8805829466357309, "grad_norm": 0.026082972064614296, "grad_norm_var": 4.078745818812766e-07, "learning_rate": 0.00036192635934333416, "loss": 2.3843, "step": 24290 }, { "crossentropy": 2.4260663986206055, "epoch": 0.8806191995359629, "grad_norm": 0.026644788682460785, "grad_norm_var": 3.411777097454711e-07, "learning_rate": 0.00036170933210410706, "loss": 2.3983, "step": 24291 }, { "crossentropy": 2.3741350173950195, "epoch": 0.8806554524361949, "grad_norm": 0.02592429891228676, "grad_norm_var": 3.604634212446093e-07, "learning_rate": 0.0003614923675120346, "loss": 2.3912, "step": 24292 }, { "crossentropy": 2.3814451694488525, "epoch": 0.8806917053364269, "grad_norm": 0.02624397911131382, "grad_norm_var": 3.3925479292136405e-07, "learning_rate": 0.0003612754655700462, "loss": 2.4, "step": 24293 }, { "crossentropy": 2.529463291168213, "epoch": 0.880727958236659, "grad_norm": 0.026884889230132103, "grad_norm_var": 1.6746930414991643e-07, "learning_rate": 0.0003610586262810733, "loss": 2.4832, "step": 24294 }, { "crossentropy": 2.5960657596588135, "epoch": 0.880764211136891, "grad_norm": 0.02667500451207161, "grad_norm_var": 1.7035188710605904e-07, "learning_rate": 0.0003608418496480431, "loss": 2.5522, "step": 24295 }, { "crossentropy": 2.383894443511963, "epoch": 0.880800464037123, "grad_norm": 0.02611268125474453, "grad_norm_var": 1.7620118413982921e-07, "learning_rate": 0.0003606251356738832, "loss": 2.4363, "step": 24296 }, { "crossentropy": 2.5113961696624756, "epoch": 0.880836716937355, "grad_norm": 0.025267772376537323, "grad_norm_var": 2.2670104814330924e-07, "learning_rate": 0.00036040848436152186, "loss": 2.4347, "step": 24297 }, { "crossentropy": 2.5341784954071045, "epoch": 0.880872969837587, "grad_norm": 0.026246588677167892, "grad_norm_var": 2.1936095782359941e-07, "learning_rate": 0.00036019189571388446, "loss": 2.4614, "step": 24298 }, { "crossentropy": 2.5555825233459473, "epoch": 0.880909222737819, "grad_norm": 0.026326943188905716, "grad_norm_var": 1.997564849198196e-07, "learning_rate": 0.00035997536973389645, "loss": 2.4454, "step": 24299 }, { "crossentropy": 2.3927161693573, "epoch": 0.880945475638051, "grad_norm": 0.026167772710323334, "grad_norm_var": 2.0117499395347754e-07, "learning_rate": 0.0003597589064244833, "loss": 2.4368, "step": 24300 }, { "crossentropy": 2.3495399951934814, "epoch": 0.880981728538283, "grad_norm": 0.02565031871199608, "grad_norm_var": 2.2183555288910612e-07, "learning_rate": 0.0003595425057885671, "loss": 2.413, "step": 24301 }, { "crossentropy": 2.5682976245880127, "epoch": 0.8810179814385151, "grad_norm": 0.026074932888150215, "grad_norm_var": 1.639916360364364e-07, "learning_rate": 0.00035932616782907267, "loss": 2.4854, "step": 24302 }, { "crossentropy": 2.3869125843048096, "epoch": 0.8810542343387471, "grad_norm": 0.025918999686837196, "grad_norm_var": 1.5744308743205887e-07, "learning_rate": 0.00035910989254892055, "loss": 2.4613, "step": 24303 }, { "crossentropy": 2.467974901199341, "epoch": 0.8810904872389791, "grad_norm": 0.026354800909757614, "grad_norm_var": 1.5918390894538852e-07, "learning_rate": 0.0003588936799510312, "loss": 2.4736, "step": 24304 }, { "crossentropy": 2.2798900604248047, "epoch": 0.8811267401392111, "grad_norm": 0.025794532150030136, "grad_norm_var": 1.6149151998018362e-07, "learning_rate": 0.00035867753003832715, "loss": 2.3419, "step": 24305 }, { "crossentropy": 2.484623432159424, "epoch": 0.8811629930394431, "grad_norm": 0.02625350095331669, "grad_norm_var": 1.618258200270876e-07, "learning_rate": 0.00035846144281372506, "loss": 2.5294, "step": 24306 }, { "crossentropy": 2.423248052597046, "epoch": 0.8811992459396751, "grad_norm": 0.02663254365324974, "grad_norm_var": 1.6104183435388986e-07, "learning_rate": 0.00035824541828014537, "loss": 2.4176, "step": 24307 }, { "crossentropy": 2.4648826122283936, "epoch": 0.8812354988399071, "grad_norm": 0.02647286280989647, "grad_norm_var": 1.6274902418228255e-07, "learning_rate": 0.0003580294564405062, "loss": 2.4451, "step": 24308 }, { "crossentropy": 2.541829824447632, "epoch": 0.8812717517401392, "grad_norm": 0.026733046397566795, "grad_norm_var": 1.8106275028300274e-07, "learning_rate": 0.0003578135572977226, "loss": 2.4119, "step": 24309 }, { "crossentropy": 2.429643392562866, "epoch": 0.8813080046403712, "grad_norm": 0.02603580430150032, "grad_norm_var": 1.5118273978090232e-07, "learning_rate": 0.00035759772085471264, "loss": 2.4111, "step": 24310 }, { "crossentropy": 2.3994994163513184, "epoch": 0.8813442575406032, "grad_norm": 0.02601468749344349, "grad_norm_var": 1.3396173042055233e-07, "learning_rate": 0.00035738194711439075, "loss": 2.3541, "step": 24311 }, { "crossentropy": 2.397646903991699, "epoch": 0.8813805104408353, "grad_norm": 0.027003800496459007, "grad_norm_var": 1.816997770279563e-07, "learning_rate": 0.00035716623607967015, "loss": 2.4711, "step": 24312 }, { "crossentropy": 2.527036190032959, "epoch": 0.8814167633410673, "grad_norm": 0.02779601328074932, "grad_norm_var": 2.722372931717911e-07, "learning_rate": 0.0003569505877534651, "loss": 2.4895, "step": 24313 }, { "crossentropy": 2.4504966735839844, "epoch": 0.8814530162412993, "grad_norm": 0.025848601013422012, "grad_norm_var": 2.8721700807859444e-07, "learning_rate": 0.00035673500213868946, "loss": 2.4179, "step": 24314 }, { "crossentropy": 2.485856533050537, "epoch": 0.8814892691415314, "grad_norm": 0.026630226522684097, "grad_norm_var": 2.933497897895324e-07, "learning_rate": 0.0003565194792382531, "loss": 2.4881, "step": 24315 }, { "crossentropy": 2.5559237003326416, "epoch": 0.8815255220417634, "grad_norm": 0.026394100859761238, "grad_norm_var": 2.914625534708795e-07, "learning_rate": 0.0003563040190550687, "loss": 2.4532, "step": 24316 }, { "crossentropy": 2.5596723556518555, "epoch": 0.8815617749419954, "grad_norm": 0.02658171020448208, "grad_norm_var": 2.587223065569997e-07, "learning_rate": 0.00035608862159204504, "loss": 2.5869, "step": 24317 }, { "crossentropy": 2.4835832118988037, "epoch": 0.8815980278422274, "grad_norm": 0.025681842118501663, "grad_norm_var": 2.858764213369928e-07, "learning_rate": 0.00035587328685209144, "loss": 2.45, "step": 24318 }, { "crossentropy": 2.544516086578369, "epoch": 0.8816342807424594, "grad_norm": 0.025837311521172523, "grad_norm_var": 2.9136024259149297e-07, "learning_rate": 0.00035565801483811887, "loss": 2.5149, "step": 24319 }, { "crossentropy": 2.43166446685791, "epoch": 0.8816705336426914, "grad_norm": 0.02616666629910469, "grad_norm_var": 2.941816028251276e-07, "learning_rate": 0.0003554428055530312, "loss": 2.4565, "step": 24320 }, { "crossentropy": 2.4658875465393066, "epoch": 0.8817067865429234, "grad_norm": 0.02643921598792076, "grad_norm_var": 2.709213729347922e-07, "learning_rate": 0.0003552276589997366, "loss": 2.4533, "step": 24321 }, { "crossentropy": 2.4942309856414795, "epoch": 0.8817430394431555, "grad_norm": 0.02601020783185959, "grad_norm_var": 2.796203531031516e-07, "learning_rate": 0.0003550125751811423, "loss": 2.4863, "step": 24322 }, { "crossentropy": 2.4803428649902344, "epoch": 0.8817792923433875, "grad_norm": 0.02627459540963173, "grad_norm_var": 2.7616780557784767e-07, "learning_rate": 0.0003547975541001508, "loss": 2.5126, "step": 24323 }, { "crossentropy": 2.521482467651367, "epoch": 0.8818155452436195, "grad_norm": 0.02691529504954815, "grad_norm_var": 2.9446737156319883e-07, "learning_rate": 0.0003545825957596688, "loss": 2.5092, "step": 24324 }, { "crossentropy": 2.351943016052246, "epoch": 0.8818517981438515, "grad_norm": 0.02688532881438732, "grad_norm_var": 3.0272581923266686e-07, "learning_rate": 0.0003543677001625978, "loss": 2.3974, "step": 24325 }, { "crossentropy": 2.4754226207733154, "epoch": 0.8818880510440835, "grad_norm": 0.028072969987988472, "grad_norm_var": 4.6122077737343185e-07, "learning_rate": 0.00035415286731184105, "loss": 2.4426, "step": 24326 }, { "crossentropy": 2.401806116104126, "epoch": 0.8819243039443155, "grad_norm": 0.026355400681495667, "grad_norm_var": 4.4486022955271776e-07, "learning_rate": 0.0003539380972103018, "loss": 2.5094, "step": 24327 }, { "crossentropy": 2.430142641067505, "epoch": 0.8819605568445475, "grad_norm": 0.027542676776647568, "grad_norm_var": 4.951961867232276e-07, "learning_rate": 0.00035372338986087716, "loss": 2.4261, "step": 24328 }, { "crossentropy": 2.5547900199890137, "epoch": 0.8819968097447796, "grad_norm": 0.026633957400918007, "grad_norm_var": 3.9265801212068754e-07, "learning_rate": 0.00035350874526646924, "loss": 2.6024, "step": 24329 }, { "crossentropy": 2.4805214405059814, "epoch": 0.8820330626450116, "grad_norm": 0.02696545608341694, "grad_norm_var": 3.7110199038916055e-07, "learning_rate": 0.000353294163429978, "loss": 2.495, "step": 24330 }, { "crossentropy": 2.5507938861846924, "epoch": 0.8820693155452436, "grad_norm": 0.026068085804581642, "grad_norm_var": 3.875886059550067e-07, "learning_rate": 0.00035307964435429996, "loss": 2.3819, "step": 24331 }, { "crossentropy": 2.4501802921295166, "epoch": 0.8821055684454756, "grad_norm": 0.02531927451491356, "grad_norm_var": 4.823560865336099e-07, "learning_rate": 0.00035286518804233393, "loss": 2.4242, "step": 24332 }, { "crossentropy": 2.377192497253418, "epoch": 0.8821418213457076, "grad_norm": 0.02571498602628708, "grad_norm_var": 5.180583517632679e-07, "learning_rate": 0.00035265079449697535, "loss": 2.4124, "step": 24333 }, { "crossentropy": 2.29600191116333, "epoch": 0.8821780742459396, "grad_norm": 0.0273250974714756, "grad_norm_var": 5.228596527055099e-07, "learning_rate": 0.0003524364637211203, "loss": 2.4238, "step": 24334 }, { "crossentropy": 2.42283034324646, "epoch": 0.8822143271461717, "grad_norm": 0.026505526155233383, "grad_norm_var": 4.887922243443365e-07, "learning_rate": 0.0003522221957176641, "loss": 2.3576, "step": 24335 }, { "crossentropy": 2.514160633087158, "epoch": 0.8822505800464037, "grad_norm": 0.02717920020222664, "grad_norm_var": 4.977862656893594e-07, "learning_rate": 0.00035200799048950074, "loss": 2.4729, "step": 24336 }, { "crossentropy": 2.388390064239502, "epoch": 0.8822868329466357, "grad_norm": 0.026618141680955887, "grad_norm_var": 4.95045905837127e-07, "learning_rate": 0.0003517938480395222, "loss": 2.4061, "step": 24337 }, { "crossentropy": 2.5744707584381104, "epoch": 0.8823230858468677, "grad_norm": 0.02719871886074543, "grad_norm_var": 4.820808167202284e-07, "learning_rate": 0.0003515797683706223, "loss": 2.5405, "step": 24338 }, { "crossentropy": 2.445920705795288, "epoch": 0.8823593387470998, "grad_norm": 0.02723461389541626, "grad_norm_var": 4.822324538564093e-07, "learning_rate": 0.0003513657514856911, "loss": 2.4784, "step": 24339 }, { "crossentropy": 2.4511873722076416, "epoch": 0.8823955916473318, "grad_norm": 0.027306456118822098, "grad_norm_var": 4.986732776295887e-07, "learning_rate": 0.00035115179738762106, "loss": 2.4052, "step": 24340 }, { "crossentropy": 2.4373087882995605, "epoch": 0.8824318445475638, "grad_norm": 0.025892697274684906, "grad_norm_var": 5.500036276306763e-07, "learning_rate": 0.00035093790607929997, "loss": 2.4337, "step": 24341 }, { "crossentropy": 2.3656086921691895, "epoch": 0.8824680974477959, "grad_norm": 0.026209615170955658, "grad_norm_var": 4.3728464033243365e-07, "learning_rate": 0.0003507240775636172, "loss": 2.3724, "step": 24342 }, { "crossentropy": 2.4546196460723877, "epoch": 0.8825043503480279, "grad_norm": 0.027216944843530655, "grad_norm_var": 4.5220434520705045e-07, "learning_rate": 0.00035051031184346273, "loss": 2.5404, "step": 24343 }, { "crossentropy": 2.44242262840271, "epoch": 0.8825406032482599, "grad_norm": 0.025904148817062378, "grad_norm_var": 4.3223587192538446e-07, "learning_rate": 0.0003502966089217219, "loss": 2.4873, "step": 24344 }, { "crossentropy": 2.4390687942504883, "epoch": 0.8825768561484919, "grad_norm": 0.02524569258093834, "grad_norm_var": 5.428526746071993e-07, "learning_rate": 0.0003500829688012813, "loss": 2.4101, "step": 24345 }, { "crossentropy": 2.4543681144714355, "epoch": 0.8826131090487239, "grad_norm": 0.026445535942912102, "grad_norm_var": 5.270677224568452e-07, "learning_rate": 0.00034986939148502706, "loss": 2.4863, "step": 24346 }, { "crossentropy": 2.499372959136963, "epoch": 0.8826493619489559, "grad_norm": 0.025767212733626366, "grad_norm_var": 5.485097189615682e-07, "learning_rate": 0.0003496558769758429, "loss": 2.5353, "step": 24347 }, { "crossentropy": 2.1852574348449707, "epoch": 0.882685614849188, "grad_norm": 0.026524139568209648, "grad_norm_var": 4.58757492242251e-07, "learning_rate": 0.0003494424252766137, "loss": 2.2769, "step": 24348 }, { "crossentropy": 2.454284906387329, "epoch": 0.88272186774942, "grad_norm": 0.02668885700404644, "grad_norm_var": 4.1375719817431967e-07, "learning_rate": 0.0003492290363902212, "loss": 2.4579, "step": 24349 }, { "crossentropy": 2.3067169189453125, "epoch": 0.882758120649652, "grad_norm": 0.025832749903202057, "grad_norm_var": 4.04475362636653e-07, "learning_rate": 0.00034901571031954905, "loss": 2.4456, "step": 24350 }, { "crossentropy": 2.4492647647857666, "epoch": 0.882794373549884, "grad_norm": 0.027201134711503983, "grad_norm_var": 4.3656165157669216e-07, "learning_rate": 0.0003488024470674783, "loss": 2.5113, "step": 24351 }, { "crossentropy": 2.372469663619995, "epoch": 0.882830626450116, "grad_norm": 0.026872804388403893, "grad_norm_var": 4.1587131593637945e-07, "learning_rate": 0.00034858924663688784, "loss": 2.394, "step": 24352 }, { "crossentropy": 2.346575975418091, "epoch": 0.882866879350348, "grad_norm": 0.025871137157082558, "grad_norm_var": 4.3997298443813834e-07, "learning_rate": 0.0003483761090306592, "loss": 2.3838, "step": 24353 }, { "crossentropy": 2.438225269317627, "epoch": 0.88290313225058, "grad_norm": 0.026239382103085518, "grad_norm_var": 4.034221171192309e-07, "learning_rate": 0.00034816303425167064, "loss": 2.3908, "step": 24354 }, { "crossentropy": 2.559413194656372, "epoch": 0.882939385150812, "grad_norm": 0.02592923305928707, "grad_norm_var": 3.652359872872065e-07, "learning_rate": 0.0003479500223027987, "loss": 2.4413, "step": 24355 }, { "crossentropy": 2.3883769512176514, "epoch": 0.8829756380510441, "grad_norm": 0.025886444374918938, "grad_norm_var": 3.0482077770925484e-07, "learning_rate": 0.0003477370731869228, "loss": 2.3622, "step": 24356 }, { "crossentropy": 2.441433906555176, "epoch": 0.8830118909512761, "grad_norm": 0.02708081156015396, "grad_norm_var": 3.391402895599419e-07, "learning_rate": 0.0003475241869069162, "loss": 2.4471, "step": 24357 }, { "crossentropy": 2.4740192890167236, "epoch": 0.8830481438515081, "grad_norm": 0.02729150839149952, "grad_norm_var": 3.9821344425903376e-07, "learning_rate": 0.00034731136346565606, "loss": 2.4948, "step": 24358 }, { "crossentropy": 2.4475789070129395, "epoch": 0.8830843967517401, "grad_norm": 0.027441425248980522, "grad_norm_var": 4.2656715908034823e-07, "learning_rate": 0.0003470986028660178, "loss": 2.4444, "step": 24359 }, { "crossentropy": 2.524806499481201, "epoch": 0.8831206496519721, "grad_norm": 0.02650926075875759, "grad_norm_var": 4.103426107293061e-07, "learning_rate": 0.00034688590511087303, "loss": 2.474, "step": 24360 }, { "crossentropy": 2.4351587295532227, "epoch": 0.8831569025522041, "grad_norm": 0.026217294856905937, "grad_norm_var": 3.163463158942081e-07, "learning_rate": 0.00034667327020309615, "loss": 2.4519, "step": 24361 }, { "crossentropy": 2.377216339111328, "epoch": 0.8831931554524362, "grad_norm": 0.026852848008275032, "grad_norm_var": 3.244398911208945e-07, "learning_rate": 0.0003464606981455581, "loss": 2.3234, "step": 24362 }, { "crossentropy": 2.4110734462738037, "epoch": 0.8832294083526682, "grad_norm": 0.02663719840347767, "grad_norm_var": 2.852474168340968e-07, "learning_rate": 0.00034624818894113006, "loss": 2.4126, "step": 24363 }, { "crossentropy": 2.4933063983917236, "epoch": 0.8832656612529002, "grad_norm": 0.026894407346844673, "grad_norm_var": 2.916870274385774e-07, "learning_rate": 0.000346035742592683, "loss": 2.4561, "step": 24364 }, { "crossentropy": 2.3276684284210205, "epoch": 0.8833019141531323, "grad_norm": 0.02700234390795231, "grad_norm_var": 3.019442315959116e-07, "learning_rate": 0.00034582335910308514, "loss": 2.4182, "step": 24365 }, { "crossentropy": 2.491492986679077, "epoch": 0.8833381670533643, "grad_norm": 0.026581933721899986, "grad_norm_var": 2.5938367891834707e-07, "learning_rate": 0.00034561103847520594, "loss": 2.4443, "step": 24366 }, { "crossentropy": 2.3108692169189453, "epoch": 0.8833744199535963, "grad_norm": 0.025992179289460182, "grad_norm_var": 2.6299219203517777e-07, "learning_rate": 0.00034539878071191354, "loss": 2.3876, "step": 24367 }, { "crossentropy": 2.4960083961486816, "epoch": 0.8834106728538283, "grad_norm": 0.02722390554845333, "grad_norm_var": 2.8434474946986176e-07, "learning_rate": 0.00034518658581607353, "loss": 2.5735, "step": 24368 }, { "crossentropy": 2.4253499507904053, "epoch": 0.8834469257540604, "grad_norm": 0.027583779767155647, "grad_norm_var": 3.0049640150641283e-07, "learning_rate": 0.0003449744537905536, "loss": 2.42, "step": 24369 }, { "crossentropy": 2.5115718841552734, "epoch": 0.8834831786542924, "grad_norm": 0.02648243121802807, "grad_norm_var": 2.8892934102521006e-07, "learning_rate": 0.0003447623846382175, "loss": 2.4595, "step": 24370 }, { "crossentropy": 2.3429553508758545, "epoch": 0.8835194315545244, "grad_norm": 0.02827090211212635, "grad_norm_var": 3.8304963336324195e-07, "learning_rate": 0.00034455037836192973, "loss": 2.4913, "step": 24371 }, { "crossentropy": 2.466806650161743, "epoch": 0.8835556844547564, "grad_norm": 0.026848122477531433, "grad_norm_var": 3.14506196924004e-07, "learning_rate": 0.0003443384349645545, "loss": 2.4682, "step": 24372 }, { "crossentropy": 2.4369232654571533, "epoch": 0.8835919373549884, "grad_norm": 0.026146993041038513, "grad_norm_var": 3.504660426894545e-07, "learning_rate": 0.000344126554448953, "loss": 2.3841, "step": 24373 }, { "crossentropy": 2.457144260406494, "epoch": 0.8836281902552204, "grad_norm": 0.025566386058926582, "grad_norm_var": 4.4032791411078437e-07, "learning_rate": 0.00034391473681798793, "loss": 2.3925, "step": 24374 }, { "crossentropy": 2.3700149059295654, "epoch": 0.8836644431554525, "grad_norm": 0.02649589814245701, "grad_norm_var": 4.110170550901404e-07, "learning_rate": 0.00034370298207452135, "loss": 2.4027, "step": 24375 }, { "crossentropy": 2.394287109375, "epoch": 0.8837006960556845, "grad_norm": 0.028734082356095314, "grad_norm_var": 6.618369429153259e-07, "learning_rate": 0.000343491290221411, "loss": 2.422, "step": 24376 }, { "crossentropy": 2.208522081375122, "epoch": 0.8837369489559165, "grad_norm": 0.026524614542722702, "grad_norm_var": 6.41991541402625e-07, "learning_rate": 0.00034327966126151797, "loss": 2.3396, "step": 24377 }, { "crossentropy": 2.487858295440674, "epoch": 0.8837732018561485, "grad_norm": 0.026721052825450897, "grad_norm_var": 6.432885396432757e-07, "learning_rate": 0.00034306809519770056, "loss": 2.4725, "step": 24378 }, { "crossentropy": 2.3453643321990967, "epoch": 0.8838094547563805, "grad_norm": 0.02637142315506935, "grad_norm_var": 6.554795853213098e-07, "learning_rate": 0.00034285659203281416, "loss": 2.3664, "step": 24379 }, { "crossentropy": 2.502454996109009, "epoch": 0.8838457076566125, "grad_norm": 0.027156244963407516, "grad_norm_var": 6.616629769981911e-07, "learning_rate": 0.0003426451517697182, "loss": 2.4315, "step": 24380 }, { "crossentropy": 2.5053694248199463, "epoch": 0.8838819605568445, "grad_norm": 0.02703293040394783, "grad_norm_var": 6.623166634646491e-07, "learning_rate": 0.00034243377441126656, "loss": 2.4264, "step": 24381 }, { "crossentropy": 2.408243179321289, "epoch": 0.8839182134570766, "grad_norm": 0.02629065327346325, "grad_norm_var": 6.783529712550088e-07, "learning_rate": 0.00034222245996031467, "loss": 2.4517, "step": 24382 }, { "crossentropy": 2.4455225467681885, "epoch": 0.8839544663573086, "grad_norm": 0.026461433619260788, "grad_norm_var": 6.390633891406594e-07, "learning_rate": 0.00034201120841971744, "loss": 2.452, "step": 24383 }, { "crossentropy": 2.512427806854248, "epoch": 0.8839907192575406, "grad_norm": 0.02531014382839203, "grad_norm_var": 7.775173136297195e-07, "learning_rate": 0.00034180001979232713, "loss": 2.5107, "step": 24384 }, { "crossentropy": 2.431687593460083, "epoch": 0.8840269721577726, "grad_norm": 0.026103079319000244, "grad_norm_var": 7.49900632257781e-07, "learning_rate": 0.0003415888940809975, "loss": 2.4281, "step": 24385 }, { "crossentropy": 2.5296971797943115, "epoch": 0.8840632250580046, "grad_norm": 0.025713486596941948, "grad_norm_var": 8.047813408964636e-07, "learning_rate": 0.000341377831288579, "loss": 2.507, "step": 24386 }, { "crossentropy": 2.4171018600463867, "epoch": 0.8840994779582366, "grad_norm": 0.025892626494169235, "grad_norm_var": 6.313669970570464e-07, "learning_rate": 0.0003411668314179217, "loss": 2.4824, "step": 24387 }, { "crossentropy": 2.387420177459717, "epoch": 0.8841357308584686, "grad_norm": 0.026142502203583717, "grad_norm_var": 6.260240609937476e-07, "learning_rate": 0.0003409558944718777, "loss": 2.4005, "step": 24388 }, { "crossentropy": 2.513181447982788, "epoch": 0.8841719837587007, "grad_norm": 0.02620629034936428, "grad_norm_var": 6.241132386594021e-07, "learning_rate": 0.00034074502045329356, "loss": 2.3629, "step": 24389 }, { "crossentropy": 2.4511165618896484, "epoch": 0.8842082366589327, "grad_norm": 0.026598095893859863, "grad_norm_var": 5.731910724786812e-07, "learning_rate": 0.00034053420936501824, "loss": 2.4697, "step": 24390 }, { "crossentropy": 2.482106924057007, "epoch": 0.8842444895591647, "grad_norm": 0.026714958250522614, "grad_norm_var": 5.76518528200504e-07, "learning_rate": 0.00034032346120990096, "loss": 2.5092, "step": 24391 }, { "crossentropy": 2.4494524002075195, "epoch": 0.8842807424593968, "grad_norm": 0.027121825143694878, "grad_norm_var": 2.583695213354779e-07, "learning_rate": 0.00034011277599078514, "loss": 2.4384, "step": 24392 }, { "crossentropy": 2.510599136352539, "epoch": 0.8843169953596288, "grad_norm": 0.026207227259874344, "grad_norm_var": 2.592897667205805e-07, "learning_rate": 0.00033990215371051903, "loss": 2.4687, "step": 24393 }, { "crossentropy": 2.4658894538879395, "epoch": 0.8843532482598608, "grad_norm": 0.026990385726094246, "grad_norm_var": 2.761519553329608e-07, "learning_rate": 0.0003396915943719453, "loss": 2.4394, "step": 24394 }, { "crossentropy": 2.4918103218078613, "epoch": 0.8843895011600929, "grad_norm": 0.026777369901537895, "grad_norm_var": 2.8519802053130793e-07, "learning_rate": 0.00033948109797791006, "loss": 2.5487, "step": 24395 }, { "crossentropy": 2.390134811401367, "epoch": 0.8844257540603249, "grad_norm": 0.02580898068845272, "grad_norm_var": 2.663791622797377e-07, "learning_rate": 0.00033927066453125486, "loss": 2.339, "step": 24396 }, { "crossentropy": 2.336439609527588, "epoch": 0.8844620069605569, "grad_norm": 0.025634491816163063, "grad_norm_var": 2.586107414994602e-07, "learning_rate": 0.0003390602940348214, "loss": 2.3844, "step": 24397 }, { "crossentropy": 2.3259308338165283, "epoch": 0.8844982598607889, "grad_norm": 0.02623048797249794, "grad_norm_var": 2.5849759952099177e-07, "learning_rate": 0.0003388499864914524, "loss": 2.3851, "step": 24398 }, { "crossentropy": 2.315182685852051, "epoch": 0.8845345127610209, "grad_norm": 0.027079856023192406, "grad_norm_var": 3.0028090146649347e-07, "learning_rate": 0.0003386397419039883, "loss": 2.3561, "step": 24399 }, { "crossentropy": 2.324833631515503, "epoch": 0.8845707656612529, "grad_norm": 0.02644955739378929, "grad_norm_var": 2.3358813448213536e-07, "learning_rate": 0.0003384295602752674, "loss": 2.369, "step": 24400 }, { "crossentropy": 2.3872811794281006, "epoch": 0.8846070185614849, "grad_norm": 0.025596778839826584, "grad_norm_var": 2.665786970056181e-07, "learning_rate": 0.00033821944160813077, "loss": 2.4073, "step": 24401 }, { "crossentropy": 2.3563406467437744, "epoch": 0.884643271461717, "grad_norm": 0.026858465746045113, "grad_norm_var": 2.5549347275112025e-07, "learning_rate": 0.00033800938590541397, "loss": 2.3864, "step": 24402 }, { "crossentropy": 2.417248010635376, "epoch": 0.884679524361949, "grad_norm": 0.026413695886731148, "grad_norm_var": 2.3760405170464294e-07, "learning_rate": 0.0003377993931699558, "loss": 2.4357, "step": 24403 }, { "crossentropy": 2.5808284282684326, "epoch": 0.884715777262181, "grad_norm": 0.027640450745821, "grad_norm_var": 3.21035793835013e-07, "learning_rate": 0.00033758946340459227, "loss": 2.4978, "step": 24404 }, { "crossentropy": 2.511544942855835, "epoch": 0.884752030162413, "grad_norm": 0.026390474289655685, "grad_norm_var": 3.15438301921415e-07, "learning_rate": 0.0003373795966121573, "loss": 2.4198, "step": 24405 }, { "crossentropy": 2.258932590484619, "epoch": 0.884788283062645, "grad_norm": 0.026314228773117065, "grad_norm_var": 3.1797553151450643e-07, "learning_rate": 0.000337169792795487, "loss": 2.3531, "step": 24406 }, { "crossentropy": 2.465972661972046, "epoch": 0.884824535962877, "grad_norm": 0.026104988530278206, "grad_norm_var": 3.249122868625159e-07, "learning_rate": 0.0003369600519574156, "loss": 2.4139, "step": 24407 }, { "crossentropy": 2.4007134437561035, "epoch": 0.884860788863109, "grad_norm": 0.026998702436685562, "grad_norm_var": 3.1526098795790306e-07, "learning_rate": 0.00033675037410077437, "loss": 2.4523, "step": 24408 }, { "crossentropy": 2.42495059967041, "epoch": 0.884897041763341, "grad_norm": 0.026019565761089325, "grad_norm_var": 3.2399970698726347e-07, "learning_rate": 0.000336540759228397, "loss": 2.4653, "step": 24409 }, { "crossentropy": 2.513758897781372, "epoch": 0.8849332946635731, "grad_norm": 0.026700440794229507, "grad_norm_var": 3.0862512545383875e-07, "learning_rate": 0.0003363312073431124, "loss": 2.5128, "step": 24410 }, { "crossentropy": 2.4784932136535645, "epoch": 0.8849695475638051, "grad_norm": 0.026423810049891472, "grad_norm_var": 3.0047060943238075e-07, "learning_rate": 0.0003361217184477533, "loss": 2.4785, "step": 24411 }, { "crossentropy": 2.4771549701690674, "epoch": 0.8850058004640371, "grad_norm": 0.027263280004262924, "grad_norm_var": 3.148434512230287e-07, "learning_rate": 0.000335912292545148, "loss": 2.4271, "step": 24412 }, { "crossentropy": 2.539109706878662, "epoch": 0.8850420533642691, "grad_norm": 0.026334764435887337, "grad_norm_var": 2.63984043480323e-07, "learning_rate": 0.0003357029296381248, "loss": 2.6092, "step": 24413 }, { "crossentropy": 2.4887783527374268, "epoch": 0.8850783062645011, "grad_norm": 0.02693064697086811, "grad_norm_var": 2.646810072140833e-07, "learning_rate": 0.00033549362972951134, "loss": 2.4808, "step": 24414 }, { "crossentropy": 2.399845600128174, "epoch": 0.8851145591647331, "grad_norm": 0.026143793016672134, "grad_norm_var": 2.589279862792263e-07, "learning_rate": 0.0003352843928221361, "loss": 2.4189, "step": 24415 }, { "crossentropy": 2.5024847984313965, "epoch": 0.8851508120649652, "grad_norm": 0.02702719159424305, "grad_norm_var": 2.730873944618474e-07, "learning_rate": 0.00033507521891882334, "loss": 2.505, "step": 24416 }, { "crossentropy": 2.502107620239258, "epoch": 0.8851870649651972, "grad_norm": 0.02695348672568798, "grad_norm_var": 2.1161148591718656e-07, "learning_rate": 0.00033486610802239915, "loss": 2.4244, "step": 24417 }, { "crossentropy": 2.500520706176758, "epoch": 0.8852233178654292, "grad_norm": 0.02548935078084469, "grad_norm_var": 2.9205721357167025e-07, "learning_rate": 0.00033465706013568796, "loss": 2.4613, "step": 24418 }, { "crossentropy": 2.312587022781372, "epoch": 0.8852595707656613, "grad_norm": 0.026761839166283607, "grad_norm_var": 2.9229319085521725e-07, "learning_rate": 0.00033444807526151256, "loss": 2.2751, "step": 24419 }, { "crossentropy": 2.576925754547119, "epoch": 0.8852958236658933, "grad_norm": 0.027743907645344734, "grad_norm_var": 3.074031784568946e-07, "learning_rate": 0.00033423915340269783, "loss": 2.5634, "step": 24420 }, { "crossentropy": 2.493039131164551, "epoch": 0.8853320765661253, "grad_norm": 0.02715460956096649, "grad_norm_var": 3.2254663420264115e-07, "learning_rate": 0.0003340302945620621, "loss": 2.473, "step": 24421 }, { "crossentropy": 2.4479591846466064, "epoch": 0.8853683294663574, "grad_norm": 0.026003146544098854, "grad_norm_var": 3.4243013560669724e-07, "learning_rate": 0.00033382149874242815, "loss": 2.394, "step": 24422 }, { "crossentropy": 2.570919990539551, "epoch": 0.8854045823665894, "grad_norm": 0.028395595028996468, "grad_norm_var": 5.105194447449502e-07, "learning_rate": 0.00033361276594661706, "loss": 2.4979, "step": 24423 }, { "crossentropy": 2.4593136310577393, "epoch": 0.8854408352668214, "grad_norm": 0.02525542490184307, "grad_norm_var": 6.476496221789838e-07, "learning_rate": 0.00033340409617744647, "loss": 2.39, "step": 24424 }, { "crossentropy": 2.5327069759368896, "epoch": 0.8854770881670534, "grad_norm": 0.027078764513134956, "grad_norm_var": 6.269616155111027e-07, "learning_rate": 0.00033319548943773535, "loss": 2.5094, "step": 24425 }, { "crossentropy": 2.4649274349212646, "epoch": 0.8855133410672854, "grad_norm": 0.026808133348822594, "grad_norm_var": 6.272799323866664e-07, "learning_rate": 0.00033298694573030143, "loss": 2.3545, "step": 24426 }, { "crossentropy": 2.3967349529266357, "epoch": 0.8855495939675174, "grad_norm": 0.026160230860114098, "grad_norm_var": 6.425754929817306e-07, "learning_rate": 0.0003327784650579607, "loss": 2.3569, "step": 24427 }, { "crossentropy": 2.3836517333984375, "epoch": 0.8855858468677494, "grad_norm": 0.02673374116420746, "grad_norm_var": 6.216729460261586e-07, "learning_rate": 0.00033257004742353105, "loss": 2.419, "step": 24428 }, { "crossentropy": 2.548701763153076, "epoch": 0.8856220997679815, "grad_norm": 0.026393726468086243, "grad_norm_var": 6.191296285734671e-07, "learning_rate": 0.00033236169282982453, "loss": 2.4772, "step": 24429 }, { "crossentropy": 2.368035078048706, "epoch": 0.8856583526682135, "grad_norm": 0.026153596118092537, "grad_norm_var": 6.318934539154923e-07, "learning_rate": 0.00033215340127965676, "loss": 2.4397, "step": 24430 }, { "crossentropy": 2.402527093887329, "epoch": 0.8856946055684455, "grad_norm": 0.02587186172604561, "grad_norm_var": 6.545438208669744e-07, "learning_rate": 0.00033194517277584213, "loss": 2.4355, "step": 24431 }, { "crossentropy": 2.4112911224365234, "epoch": 0.8857308584686775, "grad_norm": 0.02592720091342926, "grad_norm_var": 6.710388296163645e-07, "learning_rate": 0.00033173700732119115, "loss": 2.4345, "step": 24432 }, { "crossentropy": 2.497781276702881, "epoch": 0.8857671113689095, "grad_norm": 0.02547396533191204, "grad_norm_var": 7.292979226816565e-07, "learning_rate": 0.0003315289049185166, "loss": 2.4701, "step": 24433 }, { "crossentropy": 2.3922150135040283, "epoch": 0.8858033642691415, "grad_norm": 0.02627163752913475, "grad_norm_var": 6.660087671555547e-07, "learning_rate": 0.0003313208655706285, "loss": 2.41, "step": 24434 }, { "crossentropy": 2.2475976943969727, "epoch": 0.8858396171693735, "grad_norm": 0.025339124724268913, "grad_norm_var": 7.450679710069759e-07, "learning_rate": 0.00033111288928033665, "loss": 2.3762, "step": 24435 }, { "crossentropy": 2.4437520503997803, "epoch": 0.8858758700696056, "grad_norm": 0.02641923353075981, "grad_norm_var": 6.21400810282358e-07, "learning_rate": 0.0003309049760504512, "loss": 2.3848, "step": 24436 }, { "crossentropy": 2.3490779399871826, "epoch": 0.8859121229698376, "grad_norm": 0.026281094178557396, "grad_norm_var": 5.742135432207825e-07, "learning_rate": 0.0003306971258837799, "loss": 2.3336, "step": 24437 }, { "crossentropy": 2.3329994678497314, "epoch": 0.8859483758700696, "grad_norm": 0.027160417288541794, "grad_norm_var": 6.143650363403154e-07, "learning_rate": 0.00033048933878312926, "loss": 2.3769, "step": 24438 }, { "crossentropy": 2.4138169288635254, "epoch": 0.8859846287703016, "grad_norm": 0.02651267684996128, "grad_norm_var": 3.2433465383117865e-07, "learning_rate": 0.00033028161475130713, "loss": 2.442, "step": 24439 }, { "crossentropy": 2.4982337951660156, "epoch": 0.8860208816705336, "grad_norm": 0.02674003690481186, "grad_norm_var": 2.6718400759438014e-07, "learning_rate": 0.0003300739537911174, "loss": 2.5121, "step": 24440 }, { "crossentropy": 2.4108803272247314, "epoch": 0.8860571345707656, "grad_norm": 0.026767177507281303, "grad_norm_var": 2.4226253049555723e-07, "learning_rate": 0.00032986635590536605, "loss": 2.4035, "step": 24441 }, { "crossentropy": 2.364384651184082, "epoch": 0.8860933874709976, "grad_norm": 0.026098012924194336, "grad_norm_var": 2.269335399015474e-07, "learning_rate": 0.0003296588210968565, "loss": 2.3694, "step": 24442 }, { "crossentropy": 2.4856748580932617, "epoch": 0.8861296403712297, "grad_norm": 0.026861168444156647, "grad_norm_var": 2.474768033277596e-07, "learning_rate": 0.00032945134936839205, "loss": 2.3736, "step": 24443 }, { "crossentropy": 2.4935736656188965, "epoch": 0.8861658932714617, "grad_norm": 0.02696145884692669, "grad_norm_var": 2.634987721146827e-07, "learning_rate": 0.000329243940722776, "loss": 2.4239, "step": 24444 }, { "crossentropy": 2.5145139694213867, "epoch": 0.8862021461716937, "grad_norm": 0.02605874463915825, "grad_norm_var": 2.675328731776046e-07, "learning_rate": 0.00032903659516280825, "loss": 2.4496, "step": 24445 }, { "crossentropy": 2.5276060104370117, "epoch": 0.8862383990719258, "grad_norm": 0.026688989251852036, "grad_norm_var": 2.745624910420463e-07, "learning_rate": 0.0003288293126912889, "loss": 2.5056, "step": 24446 }, { "crossentropy": 2.510854959487915, "epoch": 0.8862746519721578, "grad_norm": 0.026775404810905457, "grad_norm_var": 2.6924333412266474e-07, "learning_rate": 0.00032862209331101954, "loss": 2.4929, "step": 24447 }, { "crossentropy": 2.4778621196746826, "epoch": 0.8863109048723898, "grad_norm": 0.02644011192023754, "grad_norm_var": 2.5362392660533264e-07, "learning_rate": 0.00032841493702479686, "loss": 2.4188, "step": 24448 }, { "crossentropy": 2.429490327835083, "epoch": 0.8863471577726219, "grad_norm": 0.02715764008462429, "grad_norm_var": 2.1660765350064021e-07, "learning_rate": 0.0003282078438354208, "loss": 2.4646, "step": 24449 }, { "crossentropy": 2.450077533721924, "epoch": 0.8863834106728539, "grad_norm": 0.026871031150221825, "grad_norm_var": 2.181497094183929e-07, "learning_rate": 0.00032800081374568634, "loss": 2.4645, "step": 24450 }, { "crossentropy": 2.4945385456085205, "epoch": 0.8864196635730859, "grad_norm": 0.027104312554001808, "grad_norm_var": 1.2301463559553454e-07, "learning_rate": 0.0003277938467583913, "loss": 2.475, "step": 24451 }, { "crossentropy": 2.4158477783203125, "epoch": 0.8864559164733179, "grad_norm": 0.025512881577014923, "grad_norm_var": 2.0600183716397667e-07, "learning_rate": 0.00032758694287633115, "loss": 2.3453, "step": 24452 }, { "crossentropy": 2.39583683013916, "epoch": 0.8864921693735499, "grad_norm": 0.026657933369278908, "grad_norm_var": 1.9762546145755492e-07, "learning_rate": 0.0003273801021023004, "loss": 2.4074, "step": 24453 }, { "crossentropy": 2.3743488788604736, "epoch": 0.8865284222737819, "grad_norm": 0.02699475735425949, "grad_norm_var": 1.8802239115141516e-07, "learning_rate": 0.0003271733244390912, "loss": 2.3375, "step": 24454 }, { "crossentropy": 2.3908019065856934, "epoch": 0.8865646751740139, "grad_norm": 0.026551702991127968, "grad_norm_var": 1.874673052616622e-07, "learning_rate": 0.00032696660988949854, "loss": 2.3961, "step": 24455 }, { "crossentropy": 2.2604925632476807, "epoch": 0.886600928074246, "grad_norm": 0.0263346116989851, "grad_norm_var": 1.9233735019176464e-07, "learning_rate": 0.00032675995845631245, "loss": 2.3963, "step": 24456 }, { "crossentropy": 2.427689552307129, "epoch": 0.886637180974478, "grad_norm": 0.02641315385699272, "grad_norm_var": 1.9297540815218134e-07, "learning_rate": 0.0003265533701423257, "loss": 2.4252, "step": 24457 }, { "crossentropy": 2.5380897521972656, "epoch": 0.88667343387471, "grad_norm": 0.02724190056324005, "grad_norm_var": 1.993187284425139e-07, "learning_rate": 0.0003263468449503271, "loss": 2.4644, "step": 24458 }, { "crossentropy": 2.541849374771118, "epoch": 0.886709686774942, "grad_norm": 0.026580970734357834, "grad_norm_var": 1.9686370809664464e-07, "learning_rate": 0.0003261403828831072, "loss": 2.5284, "step": 24459 }, { "crossentropy": 2.3267390727996826, "epoch": 0.886745939675174, "grad_norm": 0.025924202054739, "grad_norm_var": 2.2056234443543615e-07, "learning_rate": 0.00032593398394345496, "loss": 2.3097, "step": 24460 }, { "crossentropy": 2.418429374694824, "epoch": 0.886782192575406, "grad_norm": 0.026718847453594208, "grad_norm_var": 2.0176226820701043e-07, "learning_rate": 0.00032572764813415647, "loss": 2.4299, "step": 24461 }, { "crossentropy": 2.3893418312072754, "epoch": 0.886818445475638, "grad_norm": 0.02784409187734127, "grad_norm_var": 2.953125503068569e-07, "learning_rate": 0.0003255213754580005, "loss": 2.3818, "step": 24462 }, { "crossentropy": 2.4105703830718994, "epoch": 0.88685469837587, "grad_norm": 0.026520628482103348, "grad_norm_var": 2.9664566647587497e-07, "learning_rate": 0.00032531516591777167, "loss": 2.4057, "step": 24463 }, { "crossentropy": 2.413597345352173, "epoch": 0.8868909512761021, "grad_norm": 0.02548806555569172, "grad_norm_var": 3.8365742614880163e-07, "learning_rate": 0.0003251090195162554, "loss": 2.4049, "step": 24464 }, { "crossentropy": 2.48075270652771, "epoch": 0.8869272041763341, "grad_norm": 0.025320200249552727, "grad_norm_var": 4.629014406303903e-07, "learning_rate": 0.00032490293625623667, "loss": 2.4433, "step": 24465 }, { "crossentropy": 2.3029041290283203, "epoch": 0.8869634570765661, "grad_norm": 0.02841201238334179, "grad_norm_var": 6.865307705337479e-07, "learning_rate": 0.0003246969161404978, "loss": 2.3415, "step": 24466 }, { "crossentropy": 2.455104351043701, "epoch": 0.8869997099767981, "grad_norm": 0.02721540257334709, "grad_norm_var": 6.947531939025294e-07, "learning_rate": 0.000324490959171822, "loss": 2.4294, "step": 24467 }, { "crossentropy": 2.3528554439544678, "epoch": 0.8870359628770301, "grad_norm": 0.025757739320397377, "grad_norm_var": 6.627404426075174e-07, "learning_rate": 0.00032428506535299217, "loss": 2.3742, "step": 24468 }, { "crossentropy": 2.45332670211792, "epoch": 0.8870722157772621, "grad_norm": 0.027364186942577362, "grad_norm_var": 6.971562686334016e-07, "learning_rate": 0.0003240792346867877, "loss": 2.4573, "step": 24469 }, { "crossentropy": 2.533813953399658, "epoch": 0.8871084686774942, "grad_norm": 0.025603890419006348, "grad_norm_var": 7.574023472616999e-07, "learning_rate": 0.0003238734671759891, "loss": 2.5413, "step": 24470 }, { "crossentropy": 2.5692403316497803, "epoch": 0.8871447215777262, "grad_norm": 0.02592424489557743, "grad_norm_var": 7.844368761244682e-07, "learning_rate": 0.0003236677628233764, "loss": 2.4456, "step": 24471 }, { "crossentropy": 2.460611343383789, "epoch": 0.8871809744779582, "grad_norm": 0.02781088277697563, "grad_norm_var": 8.799229654662492e-07, "learning_rate": 0.00032346212163172574, "loss": 2.464, "step": 24472 }, { "crossentropy": 2.5337462425231934, "epoch": 0.8872172273781903, "grad_norm": 0.027002904564142227, "grad_norm_var": 8.843125383485121e-07, "learning_rate": 0.0003232565436038171, "loss": 2.4992, "step": 24473 }, { "crossentropy": 2.4943788051605225, "epoch": 0.8872534802784223, "grad_norm": 0.026603734120726585, "grad_norm_var": 8.611577865868242e-07, "learning_rate": 0.00032305102874242487, "loss": 2.4958, "step": 24474 }, { "crossentropy": 2.5069777965545654, "epoch": 0.8872897331786543, "grad_norm": 0.026238249614834785, "grad_norm_var": 8.707736302769493e-07, "learning_rate": 0.00032284557705032635, "loss": 2.3857, "step": 24475 }, { "crossentropy": 2.365349531173706, "epoch": 0.8873259860788864, "grad_norm": 0.027240632101893425, "grad_norm_var": 8.588289467953281e-07, "learning_rate": 0.0003226401885302971, "loss": 2.3837, "step": 24476 }, { "crossentropy": 2.4064550399780273, "epoch": 0.8873622389791184, "grad_norm": 0.02779998630285263, "grad_norm_var": 9.358095222007038e-07, "learning_rate": 0.0003224348631851093, "loss": 2.4758, "step": 24477 }, { "crossentropy": 2.4940881729125977, "epoch": 0.8873984918793504, "grad_norm": 0.026567379012703896, "grad_norm_var": 8.530011664506827e-07, "learning_rate": 0.000322229601017538, "loss": 2.4497, "step": 24478 }, { "crossentropy": 2.4474639892578125, "epoch": 0.8874347447795824, "grad_norm": 0.027317164465785027, "grad_norm_var": 8.757949481935067e-07, "learning_rate": 0.00032202440203035477, "loss": 2.4741, "step": 24479 }, { "crossentropy": 2.506249189376831, "epoch": 0.8874709976798144, "grad_norm": 0.02663644403219223, "grad_norm_var": 7.681843455097411e-07, "learning_rate": 0.0003218192662263303, "loss": 2.4346, "step": 24480 }, { "crossentropy": 2.4780848026275635, "epoch": 0.8875072505800464, "grad_norm": 0.02675677463412285, "grad_norm_var": 6.135425971480017e-07, "learning_rate": 0.0003216141936082362, "loss": 2.416, "step": 24481 }, { "crossentropy": 2.4390833377838135, "epoch": 0.8875435034802784, "grad_norm": 0.026589425280690193, "grad_norm_var": 4.5146650712029936e-07, "learning_rate": 0.0003214091841788419, "loss": 2.4096, "step": 24482 }, { "crossentropy": 2.554797410964966, "epoch": 0.8875797563805105, "grad_norm": 0.02724062092602253, "grad_norm_var": 4.529809822104708e-07, "learning_rate": 0.00032120423794091633, "loss": 2.4902, "step": 24483 }, { "crossentropy": 2.4614508152008057, "epoch": 0.8876160092807425, "grad_norm": 0.02670956589281559, "grad_norm_var": 3.800732005124757e-07, "learning_rate": 0.000320999354897229, "loss": 2.4466, "step": 24484 }, { "crossentropy": 2.45058012008667, "epoch": 0.8876522621809745, "grad_norm": 0.025342661887407303, "grad_norm_var": 4.936245002761513e-07, "learning_rate": 0.0003207945350505448, "loss": 2.4835, "step": 24485 }, { "crossentropy": 2.2711033821105957, "epoch": 0.8876885150812065, "grad_norm": 0.026559125632047653, "grad_norm_var": 4.0957932309902633e-07, "learning_rate": 0.0003205897784036327, "loss": 2.316, "step": 24486 }, { "crossentropy": 2.4318008422851562, "epoch": 0.8877247679814385, "grad_norm": 0.02675476484000683, "grad_norm_var": 3.5889699417193606e-07, "learning_rate": 0.00032038508495925655, "loss": 2.4417, "step": 24487 }, { "crossentropy": 2.4721884727478027, "epoch": 0.8877610208816705, "grad_norm": 0.02574741095304489, "grad_norm_var": 3.5326079856853355e-07, "learning_rate": 0.00032018045472018033, "loss": 2.4832, "step": 24488 }, { "crossentropy": 2.5839829444885254, "epoch": 0.8877972737819025, "grad_norm": 0.027874844148755074, "grad_norm_var": 4.366703622984969e-07, "learning_rate": 0.00031997588768917007, "loss": 2.5366, "step": 24489 }, { "crossentropy": 2.3446736335754395, "epoch": 0.8878335266821346, "grad_norm": 0.02590952068567276, "grad_norm_var": 4.802070246073213e-07, "learning_rate": 0.00031977138386898694, "loss": 2.4035, "step": 24490 }, { "crossentropy": 2.429983615875244, "epoch": 0.8878697795823666, "grad_norm": 0.025985699146986008, "grad_norm_var": 4.999200725302443e-07, "learning_rate": 0.0003195669432623938, "loss": 2.4838, "step": 24491 }, { "crossentropy": 2.4094741344451904, "epoch": 0.8879060324825986, "grad_norm": 0.02719193510711193, "grad_norm_var": 4.964898293425996e-07, "learning_rate": 0.00031936256587215215, "loss": 2.4406, "step": 24492 }, { "crossentropy": 2.3912036418914795, "epoch": 0.8879422853828306, "grad_norm": 0.02597067691385746, "grad_norm_var": 4.340396842975828e-07, "learning_rate": 0.0003191582517010222, "loss": 2.4053, "step": 24493 }, { "crossentropy": 2.290590286254883, "epoch": 0.8879785382830626, "grad_norm": 0.026175295934081078, "grad_norm_var": 4.438959093923476e-07, "learning_rate": 0.0003189540007517638, "loss": 2.3637, "step": 24494 }, { "crossentropy": 2.372119903564453, "epoch": 0.8880147911832946, "grad_norm": 0.02612117864191532, "grad_norm_var": 4.105796582733277e-07, "learning_rate": 0.0003187498130271349, "loss": 2.4084, "step": 24495 }, { "crossentropy": 2.394651174545288, "epoch": 0.8880510440835266, "grad_norm": 0.026692131534218788, "grad_norm_var": 4.1198800268191227e-07, "learning_rate": 0.00031854568852989374, "loss": 2.4414, "step": 24496 }, { "crossentropy": 2.3567168712615967, "epoch": 0.8880872969837587, "grad_norm": 0.02626199647784233, "grad_norm_var": 4.087887431468429e-07, "learning_rate": 0.00031834162726279816, "loss": 2.4262, "step": 24497 }, { "crossentropy": 2.3392670154571533, "epoch": 0.8881235498839907, "grad_norm": 0.02634900063276291, "grad_norm_var": 4.0778543981289694e-07, "learning_rate": 0.00031813762922860257, "loss": 2.4702, "step": 24498 }, { "crossentropy": 2.511638641357422, "epoch": 0.8881598027842227, "grad_norm": 0.026485856622457504, "grad_norm_var": 3.6185316883684725e-07, "learning_rate": 0.00031793369443006316, "loss": 2.4714, "step": 24499 }, { "crossentropy": 2.4963932037353516, "epoch": 0.8881960556844548, "grad_norm": 0.02590629644691944, "grad_norm_var": 3.672292611991053e-07, "learning_rate": 0.00031772982286993543, "loss": 2.4678, "step": 24500 }, { "crossentropy": 2.370581865310669, "epoch": 0.8882323085846868, "grad_norm": 0.026372749358415604, "grad_norm_var": 2.97525393425091e-07, "learning_rate": 0.0003175260145509712, "loss": 2.3801, "step": 24501 }, { "crossentropy": 2.408418893814087, "epoch": 0.8882685614849188, "grad_norm": 0.026995258405804634, "grad_norm_var": 3.1881784189392336e-07, "learning_rate": 0.00031732226947592533, "loss": 2.4132, "step": 24502 }, { "crossentropy": 2.3849427700042725, "epoch": 0.8883048143851509, "grad_norm": 0.025959106162190437, "grad_norm_var": 3.2336515385770515e-07, "learning_rate": 0.00031711858764754686, "loss": 2.3484, "step": 24503 }, { "crossentropy": 2.4805495738983154, "epoch": 0.8883410672853829, "grad_norm": 0.027524514123797417, "grad_norm_var": 3.720561792305843e-07, "learning_rate": 0.00031691496906858965, "loss": 2.479, "step": 24504 }, { "crossentropy": 2.450439453125, "epoch": 0.8883773201856149, "grad_norm": 0.02720874734222889, "grad_norm_var": 2.7643953694432105e-07, "learning_rate": 0.00031671141374180255, "loss": 2.4937, "step": 24505 }, { "crossentropy": 2.4383256435394287, "epoch": 0.8884135730858469, "grad_norm": 0.02810117043554783, "grad_norm_var": 4.203531321464767e-07, "learning_rate": 0.00031650792166993406, "loss": 2.4439, "step": 24506 }, { "crossentropy": 2.5494213104248047, "epoch": 0.8884498259860789, "grad_norm": 0.026487769559025764, "grad_norm_var": 3.962332559964236e-07, "learning_rate": 0.0003163044928557335, "loss": 2.484, "step": 24507 }, { "crossentropy": 2.598644495010376, "epoch": 0.8884860788863109, "grad_norm": 0.026198597624897957, "grad_norm_var": 3.811904271695694e-07, "learning_rate": 0.00031610112730194985, "loss": 2.4362, "step": 24508 }, { "crossentropy": 2.402585744857788, "epoch": 0.8885223317865429, "grad_norm": 0.027464164420962334, "grad_norm_var": 4.0510664710372937e-07, "learning_rate": 0.00031589782501132704, "loss": 2.439, "step": 24509 }, { "crossentropy": 2.5574111938476562, "epoch": 0.888558584686775, "grad_norm": 0.026613429188728333, "grad_norm_var": 3.897241559519162e-07, "learning_rate": 0.0003156945859866134, "loss": 2.5162, "step": 24510 }, { "crossentropy": 2.4850029945373535, "epoch": 0.888594837587007, "grad_norm": 0.026653258129954338, "grad_norm_var": 3.6838549585132586e-07, "learning_rate": 0.0003154914102305528, "loss": 2.438, "step": 24511 }, { "crossentropy": 2.4051780700683594, "epoch": 0.888631090487239, "grad_norm": 0.02786412462592125, "grad_norm_var": 4.5228072923092497e-07, "learning_rate": 0.00031528829774588984, "loss": 2.5133, "step": 24512 }, { "crossentropy": 2.4266440868377686, "epoch": 0.888667343387471, "grad_norm": 0.025517068803310394, "grad_norm_var": 5.382022606909356e-07, "learning_rate": 0.00031508524853536833, "loss": 2.4275, "step": 24513 }, { "crossentropy": 2.3454692363739014, "epoch": 0.888703596287703, "grad_norm": 0.027457188814878464, "grad_norm_var": 5.584664938121376e-07, "learning_rate": 0.00031488226260172893, "loss": 2.4469, "step": 24514 }, { "crossentropy": 2.321201801300049, "epoch": 0.888739849187935, "grad_norm": 0.02774919383227825, "grad_norm_var": 6.052040220252356e-07, "learning_rate": 0.00031467933994771435, "loss": 2.3632, "step": 24515 }, { "crossentropy": 2.3350658416748047, "epoch": 0.888776102088167, "grad_norm": 0.02628016099333763, "grad_norm_var": 5.654251146037029e-07, "learning_rate": 0.0003144764805760669, "loss": 2.3409, "step": 24516 }, { "crossentropy": 2.3718159198760986, "epoch": 0.8888123549883991, "grad_norm": 0.026760194450616837, "grad_norm_var": 5.474196568414427e-07, "learning_rate": 0.0003142736844895239, "loss": 2.417, "step": 24517 }, { "crossentropy": 2.289112091064453, "epoch": 0.8888486078886311, "grad_norm": 0.025863658636808395, "grad_norm_var": 6.171715673909296e-07, "learning_rate": 0.00031407095169082636, "loss": 2.3734, "step": 24518 }, { "crossentropy": 2.3889172077178955, "epoch": 0.8888848607888631, "grad_norm": 0.03013845719397068, "grad_norm_var": 1.208845225085274e-06, "learning_rate": 0.00031386828218271103, "loss": 2.4642, "step": 24519 }, { "crossentropy": 2.365839719772339, "epoch": 0.8889211136890951, "grad_norm": 0.027075685560703278, "grad_norm_var": 1.1970847438635505e-06, "learning_rate": 0.00031366567596791627, "loss": 2.3555, "step": 24520 }, { "crossentropy": 2.492501735687256, "epoch": 0.8889573665893271, "grad_norm": 0.026951421052217484, "grad_norm_var": 1.1971337630477338e-06, "learning_rate": 0.0003134631330491788, "loss": 2.4917, "step": 24521 }, { "crossentropy": 2.4901983737945557, "epoch": 0.8889936194895591, "grad_norm": 0.026112915948033333, "grad_norm_var": 1.171762393306153e-06, "learning_rate": 0.00031326065342923373, "loss": 2.4121, "step": 24522 }, { "crossentropy": 2.3460447788238525, "epoch": 0.8890298723897911, "grad_norm": 0.025754747912287712, "grad_norm_var": 1.250443948764635e-06, "learning_rate": 0.00031305823711081537, "loss": 2.3717, "step": 24523 }, { "crossentropy": 2.473330020904541, "epoch": 0.8890661252900232, "grad_norm": 0.026751350611448288, "grad_norm_var": 1.2175963363357523e-06, "learning_rate": 0.0003128558840966589, "loss": 2.5138, "step": 24524 }, { "crossentropy": 2.3376622200012207, "epoch": 0.8891023781902552, "grad_norm": 0.02564563974738121, "grad_norm_var": 1.2966919227355799e-06, "learning_rate": 0.00031265359438949593, "loss": 2.4325, "step": 24525 }, { "crossentropy": 2.4421651363372803, "epoch": 0.8891386310904872, "grad_norm": 0.02623751573264599, "grad_norm_var": 1.3160921251586498e-06, "learning_rate": 0.0003124513679920599, "loss": 2.3896, "step": 24526 }, { "crossentropy": 2.511810302734375, "epoch": 0.8891748839907193, "grad_norm": 0.026431025937199593, "grad_norm_var": 1.3235502253472938e-06, "learning_rate": 0.0003122492049070819, "loss": 2.5421, "step": 24527 }, { "crossentropy": 2.5351920127868652, "epoch": 0.8892111368909513, "grad_norm": 0.02684617228806019, "grad_norm_var": 1.242105541574218e-06, "learning_rate": 0.00031204710513729197, "loss": 2.4716, "step": 24528 }, { "crossentropy": 2.432203769683838, "epoch": 0.8892473897911833, "grad_norm": 0.026208126917481422, "grad_norm_var": 1.1608119942616939e-06, "learning_rate": 0.00031184506868542073, "loss": 2.3219, "step": 24529 }, { "crossentropy": 2.3801162242889404, "epoch": 0.8892836426914154, "grad_norm": 0.025944272056221962, "grad_norm_var": 1.1645351598738347e-06, "learning_rate": 0.000311643095554196, "loss": 2.4125, "step": 24530 }, { "crossentropy": 2.361379861831665, "epoch": 0.8893198955916474, "grad_norm": 0.026253173127770424, "grad_norm_var": 1.0895295791986016e-06, "learning_rate": 0.00031144118574634563, "loss": 2.3533, "step": 24531 }, { "crossentropy": 2.4705967903137207, "epoch": 0.8893561484918794, "grad_norm": 0.026813361793756485, "grad_norm_var": 1.0860951605515575e-06, "learning_rate": 0.00031123933926459845, "loss": 2.414, "step": 24532 }, { "crossentropy": 2.5562844276428223, "epoch": 0.8893924013921114, "grad_norm": 0.026374636217951775, "grad_norm_var": 1.0877540071817122e-06, "learning_rate": 0.00031103755611167793, "loss": 2.4565, "step": 24533 }, { "crossentropy": 2.4587459564208984, "epoch": 0.8894286542923434, "grad_norm": 0.027250677347183228, "grad_norm_var": 1.0741036369161609e-06, "learning_rate": 0.00031083583629031134, "loss": 2.4652, "step": 24534 }, { "crossentropy": 2.3963656425476074, "epoch": 0.8894649071925754, "grad_norm": 0.026683760806918144, "grad_norm_var": 2.2436599391757971e-07, "learning_rate": 0.00031063417980322364, "loss": 2.4223, "step": 24535 }, { "crossentropy": 2.543323278427124, "epoch": 0.8895011600928074, "grad_norm": 0.025894436985254288, "grad_norm_var": 2.1435371526055494e-07, "learning_rate": 0.0003104325866531366, "loss": 2.4281, "step": 24536 }, { "crossentropy": 2.2370190620422363, "epoch": 0.8895374129930395, "grad_norm": 0.02539961412549019, "grad_norm_var": 2.4757595658888207e-07, "learning_rate": 0.0003102310568427752, "loss": 2.2887, "step": 24537 }, { "crossentropy": 2.4314403533935547, "epoch": 0.8895736658932715, "grad_norm": 0.025254789739847183, "grad_norm_var": 3.135853053358306e-07, "learning_rate": 0.00031002959037486013, "loss": 2.406, "step": 24538 }, { "crossentropy": 2.5162999629974365, "epoch": 0.8896099187935035, "grad_norm": 0.028042668476700783, "grad_norm_var": 4.945611467403598e-07, "learning_rate": 0.00030982818725211193, "loss": 2.4939, "step": 24539 }, { "crossentropy": 2.5187220573425293, "epoch": 0.8896461716937355, "grad_norm": 0.026313986629247665, "grad_norm_var": 4.846834342602287e-07, "learning_rate": 0.0003096268474772529, "loss": 2.5018, "step": 24540 }, { "crossentropy": 2.299767255783081, "epoch": 0.8896824245939675, "grad_norm": 0.025580769404768944, "grad_norm_var": 4.910354028139311e-07, "learning_rate": 0.0003094255710530003, "loss": 2.3557, "step": 24541 }, { "crossentropy": 2.4672982692718506, "epoch": 0.8897186774941995, "grad_norm": 0.02709897793829441, "grad_norm_var": 5.250073851279305e-07, "learning_rate": 0.00030922435798207305, "loss": 2.4763, "step": 24542 }, { "crossentropy": 2.3707315921783447, "epoch": 0.8897549303944315, "grad_norm": 0.025713667273521423, "grad_norm_var": 5.54145446550686e-07, "learning_rate": 0.00030902320826719067, "loss": 2.3303, "step": 24543 }, { "crossentropy": 2.3348026275634766, "epoch": 0.8897911832946636, "grad_norm": 0.026905134320259094, "grad_norm_var": 5.582275259726396e-07, "learning_rate": 0.0003088221219110676, "loss": 2.3928, "step": 24544 }, { "crossentropy": 2.409548282623291, "epoch": 0.8898274361948956, "grad_norm": 0.02685759961605072, "grad_norm_var": 5.715905499198513e-07, "learning_rate": 0.00030862109891642174, "loss": 2.4825, "step": 24545 }, { "crossentropy": 2.481260061264038, "epoch": 0.8898636890951276, "grad_norm": 0.02562515065073967, "grad_norm_var": 5.972973308614183e-07, "learning_rate": 0.0003084201392859676, "loss": 2.4258, "step": 24546 }, { "crossentropy": 2.524402618408203, "epoch": 0.8898999419953596, "grad_norm": 0.026569826528429985, "grad_norm_var": 5.982559095655051e-07, "learning_rate": 0.0003082192430224179, "loss": 2.4302, "step": 24547 }, { "crossentropy": 2.4690423011779785, "epoch": 0.8899361948955916, "grad_norm": 0.027023345232009888, "grad_norm_var": 6.126215881273323e-07, "learning_rate": 0.0003080184101284883, "loss": 2.5028, "step": 24548 }, { "crossentropy": 2.404956817626953, "epoch": 0.8899724477958236, "grad_norm": 0.025234123691916466, "grad_norm_var": 6.995733673414069e-07, "learning_rate": 0.00030781764060689, "loss": 2.3502, "step": 24549 }, { "crossentropy": 2.588547706604004, "epoch": 0.8900087006960556, "grad_norm": 0.026906700804829597, "grad_norm_var": 6.65225920149628e-07, "learning_rate": 0.00030761693446033466, "loss": 2.5267, "step": 24550 }, { "crossentropy": 2.5230002403259277, "epoch": 0.8900449535962877, "grad_norm": 0.027251899242401123, "grad_norm_var": 7.130284203237779e-07, "learning_rate": 0.0003074162916915346, "loss": 2.48, "step": 24551 }, { "crossentropy": 2.375718116760254, "epoch": 0.8900812064965197, "grad_norm": 0.025914611294865608, "grad_norm_var": 7.118162147235711e-07, "learning_rate": 0.0003072157123031982, "loss": 2.4397, "step": 24552 }, { "crossentropy": 2.515507698059082, "epoch": 0.8901174593967517, "grad_norm": 0.027481934055685997, "grad_norm_var": 7.173406184339421e-07, "learning_rate": 0.00030701519629803555, "loss": 2.5471, "step": 24553 }, { "crossentropy": 2.472043752670288, "epoch": 0.8901537122969838, "grad_norm": 0.026010463014245033, "grad_norm_var": 6.289835239482357e-07, "learning_rate": 0.00030681474367875495, "loss": 2.5034, "step": 24554 }, { "crossentropy": 2.4334850311279297, "epoch": 0.8901899651972158, "grad_norm": 0.028088128194212914, "grad_norm_var": 6.382621498015323e-07, "learning_rate": 0.0003066143544480621, "loss": 2.4755, "step": 24555 }, { "crossentropy": 2.4342031478881836, "epoch": 0.8902262180974478, "grad_norm": 0.026151547208428383, "grad_norm_var": 6.447202369067387e-07, "learning_rate": 0.00030641402860866586, "loss": 2.3766, "step": 24556 }, { "crossentropy": 2.4254777431488037, "epoch": 0.8902624709976799, "grad_norm": 0.02763453498482704, "grad_norm_var": 6.49541004160226e-07, "learning_rate": 0.00030621376616327026, "loss": 2.4472, "step": 24557 }, { "crossentropy": 2.4502291679382324, "epoch": 0.8902987238979119, "grad_norm": 0.026738394051790237, "grad_norm_var": 6.3628466699054e-07, "learning_rate": 0.0003060135671145808, "loss": 2.4021, "step": 24558 }, { "crossentropy": 2.343332290649414, "epoch": 0.8903349767981439, "grad_norm": 0.026234986260533333, "grad_norm_var": 5.89459398569779e-07, "learning_rate": 0.0003058134314653022, "loss": 2.3395, "step": 24559 }, { "crossentropy": 2.498796224594116, "epoch": 0.8903712296983759, "grad_norm": 0.026951951906085014, "grad_norm_var": 5.910999265200274e-07, "learning_rate": 0.0003056133592181359, "loss": 2.5173, "step": 24560 }, { "crossentropy": 2.4994964599609375, "epoch": 0.8904074825986079, "grad_norm": 0.026754828169941902, "grad_norm_var": 5.891510276749875e-07, "learning_rate": 0.0003054133503757861, "loss": 2.4132, "step": 24561 }, { "crossentropy": 2.3897571563720703, "epoch": 0.8904437354988399, "grad_norm": 0.025850683450698853, "grad_norm_var": 5.611877441573232e-07, "learning_rate": 0.0003052134049409527, "loss": 2.3632, "step": 24562 }, { "crossentropy": 2.485321283340454, "epoch": 0.8904799883990719, "grad_norm": 0.026568399742245674, "grad_norm_var": 5.612078551191774e-07, "learning_rate": 0.00030501352291633754, "loss": 2.473, "step": 24563 }, { "crossentropy": 2.4648404121398926, "epoch": 0.890516241299304, "grad_norm": 0.02715843915939331, "grad_norm_var": 5.686269849227427e-07, "learning_rate": 0.0003048137043046395, "loss": 2.4425, "step": 24564 }, { "crossentropy": 2.4978976249694824, "epoch": 0.890552494199536, "grad_norm": 0.026400597766041756, "grad_norm_var": 4.282895524468222e-07, "learning_rate": 0.000304613949108557, "loss": 2.499, "step": 24565 }, { "crossentropy": 2.466604232788086, "epoch": 0.890588747099768, "grad_norm": 0.025451958179473877, "grad_norm_var": 5.31351474064116e-07, "learning_rate": 0.00030441425733078844, "loss": 2.4205, "step": 24566 }, { "crossentropy": 2.5846939086914062, "epoch": 0.890625, "grad_norm": 0.02726631984114647, "grad_norm_var": 5.324925262542926e-07, "learning_rate": 0.00030421462897403217, "loss": 2.5333, "step": 24567 }, { "crossentropy": 2.533369302749634, "epoch": 0.890661252900232, "grad_norm": 0.026958966627717018, "grad_norm_var": 4.960155306538497e-07, "learning_rate": 0.0003040150640409828, "loss": 2.4131, "step": 24568 }, { "crossentropy": 2.6127309799194336, "epoch": 0.890697505800464, "grad_norm": 0.026704546064138412, "grad_norm_var": 4.559903990909806e-07, "learning_rate": 0.0003038155625343364, "loss": 2.5929, "step": 24569 }, { "crossentropy": 2.4890098571777344, "epoch": 0.890733758700696, "grad_norm": 0.027874749153852463, "grad_norm_var": 5.060901319320702e-07, "learning_rate": 0.00030361612445678753, "loss": 2.4556, "step": 24570 }, { "crossentropy": 2.417212963104248, "epoch": 0.8907700116009281, "grad_norm": 0.026369865983724594, "grad_norm_var": 3.9534736256696157e-07, "learning_rate": 0.0003034167498110302, "loss": 2.4371, "step": 24571 }, { "crossentropy": 2.4169228076934814, "epoch": 0.8908062645011601, "grad_norm": 0.027347758412361145, "grad_norm_var": 3.985927913649027e-07, "learning_rate": 0.0003032174385997577, "loss": 2.3628, "step": 24572 }, { "crossentropy": 2.3992536067962646, "epoch": 0.8908425174013921, "grad_norm": 0.025951944291591644, "grad_norm_var": 3.8083935906118904e-07, "learning_rate": 0.00030301819082565964, "loss": 2.3192, "step": 24573 }, { "crossentropy": 2.431950807571411, "epoch": 0.8908787703016241, "grad_norm": 0.026371046900749207, "grad_norm_var": 3.855083042026327e-07, "learning_rate": 0.00030281900649142937, "loss": 2.4286, "step": 24574 }, { "crossentropy": 2.3419480323791504, "epoch": 0.8909150232018561, "grad_norm": 0.02710949257016182, "grad_norm_var": 3.8624825008831627e-07, "learning_rate": 0.0003026198855997575, "loss": 2.4739, "step": 24575 }, { "crossentropy": 2.3699758052825928, "epoch": 0.8909512761020881, "grad_norm": 0.026139704510569572, "grad_norm_var": 3.994619799867754e-07, "learning_rate": 0.00030242082815333136, "loss": 2.3253, "step": 24576 }, { "crossentropy": 2.4556708335876465, "epoch": 0.8909875290023201, "grad_norm": 0.02615657076239586, "grad_norm_var": 4.128678349816945e-07, "learning_rate": 0.0003022218341548422, "loss": 2.4518, "step": 24577 }, { "crossentropy": 2.406113624572754, "epoch": 0.8910237819025522, "grad_norm": 0.026111343875527382, "grad_norm_var": 3.90895996611494e-07, "learning_rate": 0.00030202290360697493, "loss": 2.3848, "step": 24578 }, { "crossentropy": 2.4556257724761963, "epoch": 0.8910600348027842, "grad_norm": 0.027251113206148148, "grad_norm_var": 4.1520652992455e-07, "learning_rate": 0.00030182403651241805, "loss": 2.4259, "step": 24579 }, { "crossentropy": 2.392019033432007, "epoch": 0.8910962877030162, "grad_norm": 0.026757748797535896, "grad_norm_var": 3.988268704631226e-07, "learning_rate": 0.0003016252328738589, "loss": 2.3718, "step": 24580 }, { "crossentropy": 2.180978298187256, "epoch": 0.8911325406032483, "grad_norm": 0.02617751806974411, "grad_norm_var": 4.0902767367544576e-07, "learning_rate": 0.00030142649269397913, "loss": 2.2831, "step": 24581 }, { "crossentropy": 2.4693686962127686, "epoch": 0.8911687935034803, "grad_norm": 0.028012696653604507, "grad_norm_var": 4.1833643135419143e-07, "learning_rate": 0.000301227815975465, "loss": 2.4416, "step": 24582 }, { "crossentropy": 2.3611247539520264, "epoch": 0.8912050464037123, "grad_norm": 0.0273745134472847, "grad_norm_var": 4.2601022857112946e-07, "learning_rate": 0.000301029202721001, "loss": 2.3935, "step": 24583 }, { "crossentropy": 2.3777408599853516, "epoch": 0.8912412993039444, "grad_norm": 0.026670534163713455, "grad_norm_var": 4.2478284451376245e-07, "learning_rate": 0.00030083065293326716, "loss": 2.3679, "step": 24584 }, { "crossentropy": 2.255279779434204, "epoch": 0.8912775522041764, "grad_norm": 0.0259939506649971, "grad_norm_var": 4.6290554943094417e-07, "learning_rate": 0.0003006321666149481, "loss": 2.3224, "step": 24585 }, { "crossentropy": 2.346808910369873, "epoch": 0.8913138051044084, "grad_norm": 0.025437474250793457, "grad_norm_var": 4.619738356028639e-07, "learning_rate": 0.00030043374376872214, "loss": 2.4026, "step": 24586 }, { "crossentropy": 2.473374128341675, "epoch": 0.8913500580046404, "grad_norm": 0.02648099511861801, "grad_norm_var": 4.5967535882775135e-07, "learning_rate": 0.0003002353843972699, "loss": 2.4563, "step": 24587 }, { "crossentropy": 2.482750654220581, "epoch": 0.8913863109048724, "grad_norm": 0.026895662769675255, "grad_norm_var": 4.2641237828298234e-07, "learning_rate": 0.00030003708850327325, "loss": 2.47, "step": 24588 }, { "crossentropy": 2.375993013381958, "epoch": 0.8914225638051044, "grad_norm": 0.026309197768568993, "grad_norm_var": 4.056267746461398e-07, "learning_rate": 0.0002998388560894061, "loss": 2.348, "step": 24589 }, { "crossentropy": 2.2321572303771973, "epoch": 0.8914588167053364, "grad_norm": 0.027041025459766388, "grad_norm_var": 4.15185280999578e-07, "learning_rate": 0.0002996406871583479, "loss": 2.3935, "step": 24590 }, { "crossentropy": 2.346843719482422, "epoch": 0.8914950696055685, "grad_norm": 0.026949310675263405, "grad_norm_var": 4.0633393006648624e-07, "learning_rate": 0.00029944258171277606, "loss": 2.4546, "step": 24591 }, { "crossentropy": 2.5482451915740967, "epoch": 0.8915313225058005, "grad_norm": 0.026551108807325363, "grad_norm_var": 3.9111692233920597e-07, "learning_rate": 0.000299244539755365, "loss": 2.4642, "step": 24592 }, { "crossentropy": 2.5294528007507324, "epoch": 0.8915675754060325, "grad_norm": 0.026562264189124107, "grad_norm_var": 3.75487814346237e-07, "learning_rate": 0.0002990465612887905, "loss": 2.5434, "step": 24593 }, { "crossentropy": 2.3714706897735596, "epoch": 0.8916038283062645, "grad_norm": 0.027699293568730354, "grad_norm_var": 4.167039324566117e-07, "learning_rate": 0.0002988486463157258, "loss": 2.3771, "step": 24594 }, { "crossentropy": 2.5637879371643066, "epoch": 0.8916400812064965, "grad_norm": 0.02699851058423519, "grad_norm_var": 4.041603513906296e-07, "learning_rate": 0.00029865079483884396, "loss": 2.5068, "step": 24595 }, { "crossentropy": 2.516702175140381, "epoch": 0.8916763341067285, "grad_norm": 0.025990542024374008, "grad_norm_var": 4.3959171853082355e-07, "learning_rate": 0.00029845300686081987, "loss": 2.4801, "step": 24596 }, { "crossentropy": 2.4290294647216797, "epoch": 0.8917125870069605, "grad_norm": 0.02590184658765793, "grad_norm_var": 4.6341856805200374e-07, "learning_rate": 0.00029825528238432045, "loss": 2.4632, "step": 24597 }, { "crossentropy": 2.301035165786743, "epoch": 0.8917488399071926, "grad_norm": 0.027295641601085663, "grad_norm_var": 3.6807231445177647e-07, "learning_rate": 0.00029805762141201896, "loss": 2.4523, "step": 24598 }, { "crossentropy": 2.427666664123535, "epoch": 0.8917850928074246, "grad_norm": 0.02738288603723049, "grad_norm_var": 3.6890281522431303e-07, "learning_rate": 0.0002978600239465856, "loss": 2.403, "step": 24599 }, { "crossentropy": 2.5312817096710205, "epoch": 0.8918213457076566, "grad_norm": 0.026780102401971817, "grad_norm_var": 3.701720392106407e-07, "learning_rate": 0.0002976624899906877, "loss": 2.5107, "step": 24600 }, { "crossentropy": 2.340296745300293, "epoch": 0.8918575986078886, "grad_norm": 0.028531165793538094, "grad_norm_var": 5.533275014520112e-07, "learning_rate": 0.00029746501954699414, "loss": 2.41, "step": 24601 }, { "crossentropy": 2.449373960494995, "epoch": 0.8918938515081206, "grad_norm": 0.02527657523751259, "grad_norm_var": 5.841854948807415e-07, "learning_rate": 0.0002972676126181717, "loss": 2.3886, "step": 24602 }, { "crossentropy": 2.3498244285583496, "epoch": 0.8919301044083526, "grad_norm": 0.02622907981276512, "grad_norm_var": 5.985437678102113e-07, "learning_rate": 0.00029707026920688663, "loss": 2.4574, "step": 24603 }, { "crossentropy": 2.532097101211548, "epoch": 0.8919663573085846, "grad_norm": 0.027125591412186623, "grad_norm_var": 6.055582312096293e-07, "learning_rate": 0.0002968729893158051, "loss": 2.4679, "step": 24604 }, { "crossentropy": 2.2534875869750977, "epoch": 0.8920026102088167, "grad_norm": 0.02689916267991066, "grad_norm_var": 5.895689981059385e-07, "learning_rate": 0.00029667577294759107, "loss": 2.3924, "step": 24605 }, { "crossentropy": 2.4926280975341797, "epoch": 0.8920388631090487, "grad_norm": 0.02622019313275814, "grad_norm_var": 6.081330922872811e-07, "learning_rate": 0.00029647862010490745, "loss": 2.437, "step": 24606 }, { "crossentropy": 2.451852560043335, "epoch": 0.8920751160092807, "grad_norm": 0.027164170518517494, "grad_norm_var": 6.160240798016211e-07, "learning_rate": 0.0002962815307904193, "loss": 2.4962, "step": 24607 }, { "crossentropy": 2.330955743789673, "epoch": 0.8921113689095128, "grad_norm": 0.02631257474422455, "grad_norm_var": 6.271147189683297e-07, "learning_rate": 0.00029608450500678564, "loss": 2.4163, "step": 24608 }, { "crossentropy": 2.354564666748047, "epoch": 0.8921476218097448, "grad_norm": 0.02516861818730831, "grad_norm_var": 7.87682694383153e-07, "learning_rate": 0.0002958875427566704, "loss": 2.3837, "step": 24609 }, { "crossentropy": 2.246715545654297, "epoch": 0.8921838747099768, "grad_norm": 0.026211651042103767, "grad_norm_var": 7.250104987694198e-07, "learning_rate": 0.00029569064404273205, "loss": 2.3675, "step": 24610 }, { "crossentropy": 2.1796867847442627, "epoch": 0.8922201276102089, "grad_norm": 0.025761226192116737, "grad_norm_var": 7.537956664108632e-07, "learning_rate": 0.00029549380886763013, "loss": 2.2882, "step": 24611 }, { "crossentropy": 2.3673629760742188, "epoch": 0.8922563805104409, "grad_norm": 0.026319069787859917, "grad_norm_var": 7.375379312789803e-07, "learning_rate": 0.0002952970372340247, "loss": 2.3618, "step": 24612 }, { "crossentropy": 2.3835370540618896, "epoch": 0.8922926334106729, "grad_norm": 0.025637377053499222, "grad_norm_var": 7.642791759732697e-07, "learning_rate": 0.0002951003291445725, "loss": 2.4052, "step": 24613 }, { "crossentropy": 2.44333815574646, "epoch": 0.8923288863109049, "grad_norm": 0.026712190359830856, "grad_norm_var": 7.251913673112503e-07, "learning_rate": 0.0002949036846019293, "loss": 2.4381, "step": 24614 }, { "crossentropy": 2.451918125152588, "epoch": 0.8923651392111369, "grad_norm": 0.026704324409365654, "grad_norm_var": 6.725727017435941e-07, "learning_rate": 0.0002947071036087529, "loss": 2.4439, "step": 24615 }, { "crossentropy": 2.488377809524536, "epoch": 0.8924013921113689, "grad_norm": 0.026992542669177055, "grad_norm_var": 6.850037637779263e-07, "learning_rate": 0.0002945105861676967, "loss": 2.4144, "step": 24616 }, { "crossentropy": 2.479024887084961, "epoch": 0.8924376450116009, "grad_norm": 0.026193059980869293, "grad_norm_var": 3.7915329638448826e-07, "learning_rate": 0.0002943141322814169, "loss": 2.3764, "step": 24617 }, { "crossentropy": 2.5630102157592773, "epoch": 0.892473897911833, "grad_norm": 0.027041813358664513, "grad_norm_var": 3.311547432437977e-07, "learning_rate": 0.00029411774195256535, "loss": 2.4806, "step": 24618 }, { "crossentropy": 2.4325006008148193, "epoch": 0.892510150812065, "grad_norm": 0.025974538177251816, "grad_norm_var": 3.4162580257174103e-07, "learning_rate": 0.0002939214151837949, "loss": 2.4457, "step": 24619 }, { "crossentropy": 2.5166196823120117, "epoch": 0.892546403712297, "grad_norm": 0.02660730667412281, "grad_norm_var": 3.0843733850356946e-07, "learning_rate": 0.00029372515197775785, "loss": 2.4538, "step": 24620 }, { "crossentropy": 2.497589111328125, "epoch": 0.892582656612529, "grad_norm": 0.02778242900967598, "grad_norm_var": 4.1951751116777135e-07, "learning_rate": 0.0002935289523371043, "loss": 2.5356, "step": 24621 }, { "crossentropy": 2.4856817722320557, "epoch": 0.892618909512761, "grad_norm": 0.026468250900506973, "grad_norm_var": 4.1658306650243077e-07, "learning_rate": 0.0002933328162644849, "loss": 2.4621, "step": 24622 }, { "crossentropy": 2.435779333114624, "epoch": 0.892655162412993, "grad_norm": 0.025722339749336243, "grad_norm_var": 4.0742918979060537e-07, "learning_rate": 0.0002931367437625487, "loss": 2.3946, "step": 24623 }, { "crossentropy": 2.4246573448181152, "epoch": 0.892691415313225, "grad_norm": 0.027658626437187195, "grad_norm_var": 5.138488365150999e-07, "learning_rate": 0.000292940734833943, "loss": 2.486, "step": 24624 }, { "crossentropy": 2.331679105758667, "epoch": 0.8927276682134571, "grad_norm": 0.027106869965791702, "grad_norm_var": 4.2144939169628095e-07, "learning_rate": 0.0002927447894813168, "loss": 2.3847, "step": 24625 }, { "crossentropy": 2.526743173599243, "epoch": 0.8927639211136891, "grad_norm": 0.027114881202578545, "grad_norm_var": 4.309862008609975e-07, "learning_rate": 0.00029254890770731513, "loss": 2.4809, "step": 24626 }, { "crossentropy": 2.537323236465454, "epoch": 0.8928001740139211, "grad_norm": 0.025859884917736053, "grad_norm_var": 4.2039906197078686e-07, "learning_rate": 0.0002923530895145837, "loss": 2.4682, "step": 24627 }, { "crossentropy": 2.3998703956604004, "epoch": 0.8928364269141531, "grad_norm": 0.026031699031591415, "grad_norm_var": 4.3703224591056373e-07, "learning_rate": 0.0002921573349057688, "loss": 2.474, "step": 24628 }, { "crossentropy": 2.477186679840088, "epoch": 0.8928726798143851, "grad_norm": 0.027642562985420227, "grad_norm_var": 4.3082942794817465e-07, "learning_rate": 0.0002919616438835132, "loss": 2.4241, "step": 24629 }, { "crossentropy": 2.5651819705963135, "epoch": 0.8929089327146171, "grad_norm": 0.026514418423175812, "grad_norm_var": 4.336337740137256e-07, "learning_rate": 0.0002917660164504604, "loss": 2.4892, "step": 24630 }, { "crossentropy": 2.4290223121643066, "epoch": 0.8929451856148491, "grad_norm": 0.02739226073026657, "grad_norm_var": 4.6237326009959667e-07, "learning_rate": 0.00029157045260925276, "loss": 2.3748, "step": 24631 }, { "crossentropy": 2.315908193588257, "epoch": 0.8929814385150812, "grad_norm": 0.026602892205119133, "grad_norm_var": 4.595975849765177e-07, "learning_rate": 0.00029137495236253086, "loss": 2.4067, "step": 24632 }, { "crossentropy": 2.429621458053589, "epoch": 0.8930176914153132, "grad_norm": 0.026220835745334625, "grad_norm_var": 4.5764944937553194e-07, "learning_rate": 0.0002911795157129366, "loss": 2.3655, "step": 24633 }, { "crossentropy": 2.2999672889709473, "epoch": 0.8930539443155452, "grad_norm": 0.02701280638575554, "grad_norm_var": 4.565109614815121e-07, "learning_rate": 0.00029098414266310826, "loss": 2.364, "step": 24634 }, { "crossentropy": 2.3965303897857666, "epoch": 0.8930901972157773, "grad_norm": 0.02647382579743862, "grad_norm_var": 4.216634543361316e-07, "learning_rate": 0.0002907888332156849, "loss": 2.4505, "step": 24635 }, { "crossentropy": 2.4532697200775146, "epoch": 0.8931264501160093, "grad_norm": 0.026750827208161354, "grad_norm_var": 4.1996682709673677e-07, "learning_rate": 0.00029059358737330566, "loss": 2.4456, "step": 24636 }, { "crossentropy": 2.302539348602295, "epoch": 0.8931627030162413, "grad_norm": 0.026713449507951736, "grad_norm_var": 3.4739998032810396e-07, "learning_rate": 0.0002903984051386061, "loss": 2.3153, "step": 24637 }, { "crossentropy": 2.5260539054870605, "epoch": 0.8931989559164734, "grad_norm": 0.026355551555752754, "grad_norm_var": 3.5175737076753066e-07, "learning_rate": 0.000290203286514224, "loss": 2.4666, "step": 24638 }, { "crossentropy": 2.472440004348755, "epoch": 0.8932352088167054, "grad_norm": 0.025799507275223732, "grad_norm_var": 3.420872898333982e-07, "learning_rate": 0.0002900082315027935, "loss": 2.4754, "step": 24639 }, { "crossentropy": 2.3575098514556885, "epoch": 0.8932714617169374, "grad_norm": 0.026906292885541916, "grad_norm_var": 2.8162085057610815e-07, "learning_rate": 0.00028981324010694923, "loss": 2.4288, "step": 24640 }, { "crossentropy": 2.5265896320343018, "epoch": 0.8933077146171694, "grad_norm": 0.02617582678794861, "grad_norm_var": 2.7984775801357754e-07, "learning_rate": 0.00028961831232932555, "loss": 2.4943, "step": 24641 }, { "crossentropy": 2.2961745262145996, "epoch": 0.8933439675174014, "grad_norm": 0.02711305022239685, "grad_norm_var": 2.797217736954464e-07, "learning_rate": 0.0002894234481725544, "loss": 2.351, "step": 24642 }, { "crossentropy": 2.412707567214966, "epoch": 0.8933802204176334, "grad_norm": 0.02554142102599144, "grad_norm_var": 3.173960843840523e-07, "learning_rate": 0.00028922864763926805, "loss": 2.4229, "step": 24643 }, { "crossentropy": 2.4925777912139893, "epoch": 0.8934164733178654, "grad_norm": 0.02665676735341549, "grad_norm_var": 2.9628945000202227e-07, "learning_rate": 0.0002890339107320983, "loss": 2.4432, "step": 24644 }, { "crossentropy": 2.3828797340393066, "epoch": 0.8934527262180975, "grad_norm": 0.026368459686636925, "grad_norm_var": 2.2352820375133065e-07, "learning_rate": 0.0002888392374536741, "loss": 2.4055, "step": 24645 }, { "crossentropy": 2.599475383758545, "epoch": 0.8934889791183295, "grad_norm": 0.026272239163517952, "grad_norm_var": 2.2793554836179433e-07, "learning_rate": 0.0002886446278066257, "loss": 2.523, "step": 24646 }, { "crossentropy": 2.378328323364258, "epoch": 0.8935252320185615, "grad_norm": 0.02621934376657009, "grad_norm_var": 1.7785902252051046e-07, "learning_rate": 0.000288450081793582, "loss": 2.3545, "step": 24647 }, { "crossentropy": 2.545219659805298, "epoch": 0.8935614849187935, "grad_norm": 0.02695414610207081, "grad_norm_var": 1.9278023811555335e-07, "learning_rate": 0.00028825559941716874, "loss": 2.4601, "step": 24648 }, { "crossentropy": 2.460047960281372, "epoch": 0.8935977378190255, "grad_norm": 0.02735147997736931, "grad_norm_var": 2.3498015088289393e-07, "learning_rate": 0.00028806118068001483, "loss": 2.4751, "step": 24649 }, { "crossentropy": 2.3891139030456543, "epoch": 0.8936339907192575, "grad_norm": 0.02619086764752865, "grad_norm_var": 2.255595895325757e-07, "learning_rate": 0.00028786682558474495, "loss": 2.424, "step": 24650 }, { "crossentropy": 2.393666982650757, "epoch": 0.8936702436194895, "grad_norm": 0.025658149272203445, "grad_norm_var": 2.689224296063223e-07, "learning_rate": 0.0002876725341339842, "loss": 2.4215, "step": 24651 }, { "crossentropy": 2.473466157913208, "epoch": 0.8937064965197216, "grad_norm": 0.026614829897880554, "grad_norm_var": 2.6442785895821005e-07, "learning_rate": 0.00028747830633035765, "loss": 2.46, "step": 24652 }, { "crossentropy": 2.4004108905792236, "epoch": 0.8937427494199536, "grad_norm": 0.026330523192882538, "grad_norm_var": 2.5915667835584486e-07, "learning_rate": 0.00028728414217648766, "loss": 2.4442, "step": 24653 }, { "crossentropy": 2.40771222114563, "epoch": 0.8937790023201856, "grad_norm": 0.026327848434448242, "grad_norm_var": 2.593938642458561e-07, "learning_rate": 0.00028709004167499775, "loss": 2.5062, "step": 24654 }, { "crossentropy": 2.452688455581665, "epoch": 0.8938152552204176, "grad_norm": 0.026479152962565422, "grad_norm_var": 2.333900899377141e-07, "learning_rate": 0.00028689600482850855, "loss": 2.3805, "step": 24655 }, { "crossentropy": 2.5105113983154297, "epoch": 0.8938515081206496, "grad_norm": 0.027738070115447044, "grad_norm_var": 3.275099603197139e-07, "learning_rate": 0.00028670203163964073, "loss": 2.5183, "step": 24656 }, { "crossentropy": 2.2461729049682617, "epoch": 0.8938877610208816, "grad_norm": 0.02635098621249199, "grad_norm_var": 3.21868000348798e-07, "learning_rate": 0.0002865081221110155, "loss": 2.3432, "step": 24657 }, { "crossentropy": 2.2942073345184326, "epoch": 0.8939240139211136, "grad_norm": 0.026533640921115875, "grad_norm_var": 2.9629722532153087e-07, "learning_rate": 0.00028631427624525033, "loss": 2.3321, "step": 24658 }, { "crossentropy": 2.43485164642334, "epoch": 0.8939602668213457, "grad_norm": 0.027054740116000175, "grad_norm_var": 2.5120919279716877e-07, "learning_rate": 0.0002861204940449641, "loss": 2.5035, "step": 24659 }, { "crossentropy": 2.5264015197753906, "epoch": 0.8939965197215777, "grad_norm": 0.02629537507891655, "grad_norm_var": 2.551345430044464e-07, "learning_rate": 0.0002859267755127748, "loss": 2.4624, "step": 24660 }, { "crossentropy": 2.3840527534484863, "epoch": 0.8940327726218097, "grad_norm": 0.025997133925557137, "grad_norm_var": 2.7255417948951247e-07, "learning_rate": 0.00028573312065129753, "loss": 2.4032, "step": 24661 }, { "crossentropy": 2.383449077606201, "epoch": 0.8940690255220418, "grad_norm": 0.02671956829726696, "grad_norm_var": 2.70102325015259e-07, "learning_rate": 0.00028553952946314967, "loss": 2.3799, "step": 24662 }, { "crossentropy": 2.453500986099243, "epoch": 0.8941052784222738, "grad_norm": 0.026244057342410088, "grad_norm_var": 2.690476723761955e-07, "learning_rate": 0.0002853460019509441, "loss": 2.4449, "step": 24663 }, { "crossentropy": 2.592116594314575, "epoch": 0.8941415313225058, "grad_norm": 0.026586739346385002, "grad_norm_var": 2.578104832672822e-07, "learning_rate": 0.0002851525381172959, "loss": 2.5082, "step": 24664 }, { "crossentropy": 2.4791080951690674, "epoch": 0.8941777842227379, "grad_norm": 0.025972329080104828, "grad_norm_var": 2.2555116495140415e-07, "learning_rate": 0.0002849591379648181, "loss": 2.4014, "step": 24665 }, { "crossentropy": 2.315070152282715, "epoch": 0.8942140371229699, "grad_norm": 0.025851162150502205, "grad_norm_var": 2.442007722790069e-07, "learning_rate": 0.0002847658014961213, "loss": 2.4216, "step": 24666 }, { "crossentropy": 2.4521710872650146, "epoch": 0.8942502900232019, "grad_norm": 0.0261435117572546, "grad_norm_var": 2.0948239187131416e-07, "learning_rate": 0.00028457252871381787, "loss": 2.4812, "step": 24667 }, { "crossentropy": 2.2740726470947266, "epoch": 0.8942865429234339, "grad_norm": 0.026899099349975586, "grad_norm_var": 2.206864706187263e-07, "learning_rate": 0.00028437931962051964, "loss": 2.377, "step": 24668 }, { "crossentropy": 2.6538283824920654, "epoch": 0.8943227958236659, "grad_norm": 0.027034694328904152, "grad_norm_var": 2.3855901664486997e-07, "learning_rate": 0.000284186174218834, "loss": 2.5671, "step": 24669 }, { "crossentropy": 2.275291681289673, "epoch": 0.8943590487238979, "grad_norm": 0.026449574157595634, "grad_norm_var": 2.3645966225001719e-07, "learning_rate": 0.0002839930925113715, "loss": 2.3737, "step": 24670 }, { "crossentropy": 2.434272527694702, "epoch": 0.89439530162413, "grad_norm": 0.026581736281514168, "grad_norm_var": 2.3653316931087205e-07, "learning_rate": 0.00028380007450073795, "loss": 2.3869, "step": 24671 }, { "crossentropy": 2.4459946155548096, "epoch": 0.894431554524362, "grad_norm": 0.026545491069555283, "grad_norm_var": 1.3305347764345591e-07, "learning_rate": 0.00028360712018954236, "loss": 2.4101, "step": 24672 }, { "crossentropy": 2.343196392059326, "epoch": 0.894467807424594, "grad_norm": 0.026418115943670273, "grad_norm_var": 1.324154168031814e-07, "learning_rate": 0.00028341422958038986, "loss": 2.3992, "step": 24673 }, { "crossentropy": 2.411336898803711, "epoch": 0.894504060324826, "grad_norm": 0.025824083015322685, "grad_norm_var": 1.567201325964786e-07, "learning_rate": 0.0002832214026758856, "loss": 2.4196, "step": 24674 }, { "crossentropy": 2.482884168624878, "epoch": 0.894540313225058, "grad_norm": 0.027217769995331764, "grad_norm_var": 1.723182264878742e-07, "learning_rate": 0.00028302863947863357, "loss": 2.4391, "step": 24675 }, { "crossentropy": 2.5185303688049316, "epoch": 0.89457656612529, "grad_norm": 0.026422780007123947, "grad_norm_var": 1.7115151278305983e-07, "learning_rate": 0.00028283593999123947, "loss": 2.4785, "step": 24676 }, { "crossentropy": 2.299389600753784, "epoch": 0.894612819025522, "grad_norm": 0.02675997093319893, "grad_norm_var": 1.6331701345317687e-07, "learning_rate": 0.00028264330421630347, "loss": 2.3717, "step": 24677 }, { "crossentropy": 2.540156126022339, "epoch": 0.894649071925754, "grad_norm": 0.0266716368496418, "grad_norm_var": 1.6192583360516374e-07, "learning_rate": 0.000282450732156429, "loss": 2.6151, "step": 24678 }, { "crossentropy": 2.342757225036621, "epoch": 0.8946853248259861, "grad_norm": 0.027511585503816605, "grad_norm_var": 2.230695656098754e-07, "learning_rate": 0.0002822582238142163, "loss": 2.4487, "step": 24679 }, { "crossentropy": 2.330961227416992, "epoch": 0.8947215777262181, "grad_norm": 0.027706904336810112, "grad_norm_var": 3.061371479364435e-07, "learning_rate": 0.00028206577919226596, "loss": 2.388, "step": 24680 }, { "crossentropy": 2.394981622695923, "epoch": 0.8947578306264501, "grad_norm": 0.026150181889533997, "grad_norm_var": 2.926213964857723e-07, "learning_rate": 0.0002818733982931776, "loss": 2.3291, "step": 24681 }, { "crossentropy": 2.581669330596924, "epoch": 0.8947940835266821, "grad_norm": 0.02732543647289276, "grad_norm_var": 2.7403760666128093e-07, "learning_rate": 0.0002816810811195475, "loss": 2.5194, "step": 24682 }, { "crossentropy": 2.3827388286590576, "epoch": 0.8948303364269141, "grad_norm": 0.026944855228066444, "grad_norm_var": 2.516246458489907e-07, "learning_rate": 0.0002814888276739752, "loss": 2.42, "step": 24683 }, { "crossentropy": 2.4870200157165527, "epoch": 0.8948665893271461, "grad_norm": 0.027681903913617134, "grad_norm_var": 3.024593770883354e-07, "learning_rate": 0.0002812966379590576, "loss": 2.4115, "step": 24684 }, { "crossentropy": 2.451620578765869, "epoch": 0.8949028422273781, "grad_norm": 0.025932857766747475, "grad_norm_var": 3.4795957909398297e-07, "learning_rate": 0.00028110451197738915, "loss": 2.4775, "step": 24685 }, { "crossentropy": 2.502861976623535, "epoch": 0.8949390951276102, "grad_norm": 0.025881454348564148, "grad_norm_var": 3.9157506197214955e-07, "learning_rate": 0.0002809124497315657, "loss": 2.452, "step": 24686 }, { "crossentropy": 2.4284751415252686, "epoch": 0.8949753480278422, "grad_norm": 0.027972664684057236, "grad_norm_var": 4.861927343615772e-07, "learning_rate": 0.00028072045122418067, "loss": 2.3914, "step": 24687 }, { "crossentropy": 2.510451078414917, "epoch": 0.8950116009280742, "grad_norm": 0.026583388447761536, "grad_norm_var": 4.84943509027463e-07, "learning_rate": 0.00028052851645782805, "loss": 2.5407, "step": 24688 }, { "crossentropy": 2.421614408493042, "epoch": 0.8950478538283063, "grad_norm": 0.027068259194493294, "grad_norm_var": 4.771436328688949e-07, "learning_rate": 0.0002803366454351008, "loss": 2.4705, "step": 24689 }, { "crossentropy": 2.478490114212036, "epoch": 0.8950841067285383, "grad_norm": 0.027042942121624947, "grad_norm_var": 4.02702208628193e-07, "learning_rate": 0.000280144838158588, "loss": 2.4735, "step": 24690 }, { "crossentropy": 2.51948881149292, "epoch": 0.8951203596287703, "grad_norm": 0.025906691327691078, "grad_norm_var": 4.5977086026934413e-07, "learning_rate": 0.000279953094630882, "loss": 2.4696, "step": 24691 }, { "crossentropy": 2.451094150543213, "epoch": 0.8951566125290024, "grad_norm": 0.027560612186789513, "grad_norm_var": 4.76219252607916e-07, "learning_rate": 0.0002797614148545735, "loss": 2.4659, "step": 24692 }, { "crossentropy": 2.4265613555908203, "epoch": 0.8951928654292344, "grad_norm": 0.02599804289638996, "grad_norm_var": 5.286416193718412e-07, "learning_rate": 0.00027956979883224874, "loss": 2.4417, "step": 24693 }, { "crossentropy": 2.25764536857605, "epoch": 0.8952291183294664, "grad_norm": 0.02597997523844242, "grad_norm_var": 5.769466303049212e-07, "learning_rate": 0.00027937824656649904, "loss": 2.4253, "step": 24694 }, { "crossentropy": 2.4077160358428955, "epoch": 0.8952653712296984, "grad_norm": 0.02596275322139263, "grad_norm_var": 5.8570565909335e-07, "learning_rate": 0.00027918675805990836, "loss": 2.3966, "step": 24695 }, { "crossentropy": 2.4984874725341797, "epoch": 0.8953016241299304, "grad_norm": 0.026144705712795258, "grad_norm_var": 5.349985607465413e-07, "learning_rate": 0.0002789953333150652, "loss": 2.5217, "step": 24696 }, { "crossentropy": 2.448496103286743, "epoch": 0.8953378770301624, "grad_norm": 0.026615053415298462, "grad_norm_var": 5.185449058080216e-07, "learning_rate": 0.00027880397233455537, "loss": 2.4225, "step": 24697 }, { "crossentropy": 2.4563069343566895, "epoch": 0.8953741299303944, "grad_norm": 0.026651913300156593, "grad_norm_var": 4.873722126314765e-07, "learning_rate": 0.00027861267512096165, "loss": 2.4156, "step": 24698 }, { "crossentropy": 2.3410544395446777, "epoch": 0.8954103828306265, "grad_norm": 0.026255548000335693, "grad_norm_var": 4.87258439962196e-07, "learning_rate": 0.00027842144167686825, "loss": 2.4502, "step": 24699 }, { "crossentropy": 2.494246006011963, "epoch": 0.8954466357308585, "grad_norm": 0.02590220607817173, "grad_norm_var": 4.231305422398394e-07, "learning_rate": 0.00027823027200485963, "loss": 2.4575, "step": 24700 }, { "crossentropy": 2.4329543113708496, "epoch": 0.8954828886310905, "grad_norm": 0.0261591337621212, "grad_norm_var": 4.1023983692483646e-07, "learning_rate": 0.00027803916610751544, "loss": 2.3877, "step": 24701 }, { "crossentropy": 2.433414936065674, "epoch": 0.8955191415313225, "grad_norm": 0.02700432576239109, "grad_norm_var": 3.9938037198292836e-07, "learning_rate": 0.0002778481239874192, "loss": 2.3807, "step": 24702 }, { "crossentropy": 2.4456658363342285, "epoch": 0.8955553944315545, "grad_norm": 0.026284735649824142, "grad_norm_var": 2.5738401942480144e-07, "learning_rate": 0.00027765714564714994, "loss": 2.4649, "step": 24703 }, { "crossentropy": 2.3942952156066895, "epoch": 0.8955916473317865, "grad_norm": 0.02613162063062191, "grad_norm_var": 2.618050561025657e-07, "learning_rate": 0.0002774662310892867, "loss": 2.3944, "step": 24704 }, { "crossentropy": 2.4080567359924316, "epoch": 0.8956279002320185, "grad_norm": 0.025433847680687904, "grad_norm_var": 2.8679054653716655e-07, "learning_rate": 0.00027727538031640966, "loss": 2.2967, "step": 24705 }, { "crossentropy": 2.529226779937744, "epoch": 0.8956641531322506, "grad_norm": 0.026377500966191292, "grad_norm_var": 2.498465939217703e-07, "learning_rate": 0.0002770845933310956, "loss": 2.4563, "step": 24706 }, { "crossentropy": 2.392502546310425, "epoch": 0.8957004060324826, "grad_norm": 0.026130925863981247, "grad_norm_var": 2.420360461565773e-07, "learning_rate": 0.00027689387013592025, "loss": 2.4392, "step": 24707 }, { "crossentropy": 2.505373001098633, "epoch": 0.8957366589327146, "grad_norm": 0.026497777551412582, "grad_norm_var": 1.3215989281809409e-07, "learning_rate": 0.00027670321073346214, "loss": 2.5156, "step": 24708 }, { "crossentropy": 2.545894145965576, "epoch": 0.8957729118329466, "grad_norm": 0.025734489783644676, "grad_norm_var": 1.4432292988184915e-07, "learning_rate": 0.0002765126151262942, "loss": 2.4562, "step": 24709 }, { "crossentropy": 2.447957754135132, "epoch": 0.8958091647331786, "grad_norm": 0.025387858971953392, "grad_norm_var": 1.8393442651613799e-07, "learning_rate": 0.00027632208331699216, "loss": 2.3863, "step": 24710 }, { "crossentropy": 2.383047580718994, "epoch": 0.8958454176334106, "grad_norm": 0.026012562215328217, "grad_norm_var": 1.827320468022452e-07, "learning_rate": 0.0002761316153081278, "loss": 2.4322, "step": 24711 }, { "crossentropy": 2.4360713958740234, "epoch": 0.8958816705336426, "grad_norm": 0.02592361532151699, "grad_norm_var": 1.8654049806393567e-07, "learning_rate": 0.00027594121110227464, "loss": 2.3601, "step": 24712 }, { "crossentropy": 2.3908350467681885, "epoch": 0.8959179234338747, "grad_norm": 0.026090968400239944, "grad_norm_var": 1.716604051976104e-07, "learning_rate": 0.0002757508707020057, "loss": 2.3408, "step": 24713 }, { "crossentropy": 2.5059919357299805, "epoch": 0.8959541763341067, "grad_norm": 0.026879610493779182, "grad_norm_var": 1.9093746177812654e-07, "learning_rate": 0.0002755605941098904, "loss": 2.3991, "step": 24714 }, { "crossentropy": 2.500732421875, "epoch": 0.8959904292343387, "grad_norm": 0.026263894513249397, "grad_norm_var": 1.910727197825905e-07, "learning_rate": 0.000275370381328498, "loss": 2.4792, "step": 24715 }, { "crossentropy": 2.4858977794647217, "epoch": 0.8960266821345708, "grad_norm": 0.03426191210746765, "grad_norm_var": 4.295550363186485e-06, "learning_rate": 0.00027518023236039913, "loss": 2.4959, "step": 24716 }, { "crossentropy": 2.4229533672332764, "epoch": 0.8960629350348028, "grad_norm": 0.026069439947605133, "grad_norm_var": 4.302054168628952e-06, "learning_rate": 0.0002749901472081606, "loss": 2.3577, "step": 24717 }, { "crossentropy": 2.324589252471924, "epoch": 0.8960991879350348, "grad_norm": 0.029092740267515182, "grad_norm_var": 4.671829452233765e-06, "learning_rate": 0.00027480012587435154, "loss": 2.4349, "step": 24718 }, { "crossentropy": 2.4318959712982178, "epoch": 0.8961354408352669, "grad_norm": 0.027005510404706, "grad_norm_var": 4.656141073086173e-06, "learning_rate": 0.00027461016836153706, "loss": 2.4575, "step": 24719 }, { "crossentropy": 2.3803796768188477, "epoch": 0.8961716937354989, "grad_norm": 0.02601657807826996, "grad_norm_var": 4.667694378796399e-06, "learning_rate": 0.0002744202746722829, "loss": 2.4383, "step": 24720 }, { "crossentropy": 2.370323657989502, "epoch": 0.8962079466357309, "grad_norm": 0.027556048706173897, "grad_norm_var": 4.555904575065788e-06, "learning_rate": 0.0002742304448091548, "loss": 2.3741, "step": 24721 }, { "crossentropy": 2.2317936420440674, "epoch": 0.8962441995359629, "grad_norm": 0.026156742125749588, "grad_norm_var": 4.575988311902044e-06, "learning_rate": 0.00027404067877471685, "loss": 2.2991, "step": 24722 }, { "crossentropy": 2.4656496047973633, "epoch": 0.8962804524361949, "grad_norm": 0.026769759133458138, "grad_norm_var": 4.53236339192081e-06, "learning_rate": 0.00027385097657153026, "loss": 2.4791, "step": 24723 }, { "crossentropy": 2.445930004119873, "epoch": 0.8963167053364269, "grad_norm": 0.026121066883206367, "grad_norm_var": 4.565577962781216e-06, "learning_rate": 0.0002736613382021591, "loss": 2.5242, "step": 24724 }, { "crossentropy": 2.395381450653076, "epoch": 0.896352958236659, "grad_norm": 0.02699819579720497, "grad_norm_var": 4.459077393375218e-06, "learning_rate": 0.0002734717636691636, "loss": 2.4217, "step": 24725 }, { "crossentropy": 2.451613187789917, "epoch": 0.896389211136891, "grad_norm": 0.02681621164083481, "grad_norm_var": 4.272342721344648e-06, "learning_rate": 0.000273282252975105, "loss": 2.5347, "step": 24726 }, { "crossentropy": 2.5325379371643066, "epoch": 0.896425464037123, "grad_norm": 0.026150666177272797, "grad_norm_var": 4.253010375327099e-06, "learning_rate": 0.0002730928061225418, "loss": 2.4882, "step": 24727 }, { "crossentropy": 2.3999884128570557, "epoch": 0.896461716937355, "grad_norm": 0.025006713345646858, "grad_norm_var": 4.453749861843765e-06, "learning_rate": 0.0002729034231140343, "loss": 2.463, "step": 24728 }, { "crossentropy": 2.268375873565674, "epoch": 0.896497969837587, "grad_norm": 0.026157516986131668, "grad_norm_var": 4.445264113443976e-06, "learning_rate": 0.00027271410395213967, "loss": 2.3675, "step": 24729 }, { "crossentropy": 2.5153815746307373, "epoch": 0.896534222737819, "grad_norm": 0.026346327736973763, "grad_norm_var": 4.4774764334673854e-06, "learning_rate": 0.00027252484863941493, "loss": 2.4347, "step": 24730 }, { "crossentropy": 2.4434633255004883, "epoch": 0.896570475638051, "grad_norm": 0.025870831683278084, "grad_norm_var": 4.528296125730575e-06, "learning_rate": 0.00027233565717841635, "loss": 2.4666, "step": 24731 }, { "crossentropy": 2.5098047256469727, "epoch": 0.896606728538283, "grad_norm": 0.026903463527560234, "grad_norm_var": 8.119136113199713e-07, "learning_rate": 0.0002721465295716996, "loss": 2.4965, "step": 24732 }, { "crossentropy": 2.2768054008483887, "epoch": 0.8966429814385151, "grad_norm": 0.025344185531139374, "grad_norm_var": 8.926959621041152e-07, "learning_rate": 0.00027195746582181826, "loss": 2.3342, "step": 24733 }, { "crossentropy": 2.5110437870025635, "epoch": 0.8966792343387471, "grad_norm": 0.02730216085910797, "grad_norm_var": 4.787447308335357e-07, "learning_rate": 0.0002717684659313258, "loss": 2.4731, "step": 24734 }, { "crossentropy": 2.4070024490356445, "epoch": 0.8967154872389791, "grad_norm": 0.026381824165582657, "grad_norm_var": 4.533370992358069e-07, "learning_rate": 0.0002715795299027768, "loss": 2.4227, "step": 24735 }, { "crossentropy": 2.528646469116211, "epoch": 0.8967517401392111, "grad_norm": 0.02681810036301613, "grad_norm_var": 4.5586439257950564e-07, "learning_rate": 0.00027139065773872095, "loss": 2.4489, "step": 24736 }, { "crossentropy": 2.3732194900512695, "epoch": 0.8967879930394431, "grad_norm": 0.027365176007151604, "grad_norm_var": 4.2919721883008003e-07, "learning_rate": 0.00027120184944171065, "loss": 2.4116, "step": 24737 }, { "crossentropy": 2.424638509750366, "epoch": 0.8968242459396751, "grad_norm": 0.027486607432365417, "grad_norm_var": 4.953903983732303e-07, "learning_rate": 0.00027101310501429556, "loss": 2.4612, "step": 24738 }, { "crossentropy": 2.2907252311706543, "epoch": 0.8968604988399071, "grad_norm": 0.026201864704489708, "grad_norm_var": 4.943581052076988e-07, "learning_rate": 0.00027082442445902524, "loss": 2.3387, "step": 24739 }, { "crossentropy": 2.3525383472442627, "epoch": 0.8968967517401392, "grad_norm": 0.025840945541858673, "grad_norm_var": 5.117133795281651e-07, "learning_rate": 0.0002706358077784477, "loss": 2.3917, "step": 24740 }, { "crossentropy": 2.5078861713409424, "epoch": 0.8969330046403712, "grad_norm": 0.026765942573547363, "grad_norm_var": 4.977037836370949e-07, "learning_rate": 0.00027044725497511025, "loss": 2.3898, "step": 24741 }, { "crossentropy": 2.4197092056274414, "epoch": 0.8969692575406032, "grad_norm": 0.02593599446117878, "grad_norm_var": 4.999100543796267e-07, "learning_rate": 0.00027025876605155986, "loss": 2.4229, "step": 24742 }, { "crossentropy": 2.3192694187164307, "epoch": 0.8970055104408353, "grad_norm": 0.025913819670677185, "grad_norm_var": 5.102602685460306e-07, "learning_rate": 0.00027007034101034276, "loss": 2.2982, "step": 24743 }, { "crossentropy": 2.4780991077423096, "epoch": 0.8970417633410673, "grad_norm": 0.025298912078142166, "grad_norm_var": 4.631613135253528e-07, "learning_rate": 0.0002698819798540036, "loss": 2.4854, "step": 24744 }, { "crossentropy": 2.448883295059204, "epoch": 0.8970780162412993, "grad_norm": 0.02577936463057995, "grad_norm_var": 4.828553123954604e-07, "learning_rate": 0.0002696936825850871, "loss": 2.3547, "step": 24745 }, { "crossentropy": 2.488851308822632, "epoch": 0.8971142691415314, "grad_norm": 0.026906536892056465, "grad_norm_var": 5.02403304283846e-07, "learning_rate": 0.000269505449206135, "loss": 2.4851, "step": 24746 }, { "crossentropy": 2.3025925159454346, "epoch": 0.8971505220417634, "grad_norm": 0.026122624054551125, "grad_norm_var": 4.891968355267278e-07, "learning_rate": 0.0002693172797196902, "loss": 2.3705, "step": 24747 }, { "crossentropy": 2.461940050125122, "epoch": 0.8971867749419954, "grad_norm": 0.026292098686099052, "grad_norm_var": 4.7135182110224994e-07, "learning_rate": 0.00026912917412829696, "loss": 2.4714, "step": 24748 }, { "crossentropy": 2.5079329013824463, "epoch": 0.8972230278422274, "grad_norm": 0.02582082524895668, "grad_norm_var": 4.2100916878318126e-07, "learning_rate": 0.00026894113243449146, "loss": 2.4766, "step": 24749 }, { "crossentropy": 2.5501549243927, "epoch": 0.8972592807424594, "grad_norm": 0.025621933862566948, "grad_norm_var": 3.9300435073537594e-07, "learning_rate": 0.0002687531546408156, "loss": 2.4451, "step": 24750 }, { "crossentropy": 2.2208802700042725, "epoch": 0.8972955336426914, "grad_norm": 0.025563038885593414, "grad_norm_var": 4.2428381337421736e-07, "learning_rate": 0.0002685652407498096, "loss": 2.3136, "step": 24751 }, { "crossentropy": 2.1715681552886963, "epoch": 0.8973317865429234, "grad_norm": 0.026067541912198067, "grad_norm_var": 4.009750184442619e-07, "learning_rate": 0.00026837739076400913, "loss": 2.2857, "step": 24752 }, { "crossentropy": 2.4954073429107666, "epoch": 0.8973680394431555, "grad_norm": 0.026270831003785133, "grad_norm_var": 3.0383364255485006e-07, "learning_rate": 0.0002681896046859528, "loss": 2.3988, "step": 24753 }, { "crossentropy": 2.401771306991577, "epoch": 0.8974042923433875, "grad_norm": 0.02645154483616352, "grad_norm_var": 1.819216599931982e-07, "learning_rate": 0.00026800188251817624, "loss": 2.3978, "step": 24754 }, { "crossentropy": 2.3516252040863037, "epoch": 0.8974405452436195, "grad_norm": 0.027210362255573273, "grad_norm_var": 2.65456754807676e-07, "learning_rate": 0.0002678142242632159, "loss": 2.4146, "step": 24755 }, { "crossentropy": 2.430023193359375, "epoch": 0.8974767981438515, "grad_norm": 0.027020011097192764, "grad_norm_var": 3.0904094982009817e-07, "learning_rate": 0.0002676266299236063, "loss": 2.4561, "step": 24756 }, { "crossentropy": 2.388612985610962, "epoch": 0.8975130510440835, "grad_norm": 0.02568901516497135, "grad_norm_var": 2.988393648830611e-07, "learning_rate": 0.00026743909950187995, "loss": 2.3258, "step": 24757 }, { "crossentropy": 2.3765361309051514, "epoch": 0.8975493039443155, "grad_norm": 0.02588513121008873, "grad_norm_var": 3.0026778203907403e-07, "learning_rate": 0.00026725163300057034, "loss": 2.3944, "step": 24758 }, { "crossentropy": 2.225155830383301, "epoch": 0.8975855568445475, "grad_norm": 0.026056809350848198, "grad_norm_var": 2.976224084233828e-07, "learning_rate": 0.00026706423042220987, "loss": 2.392, "step": 24759 }, { "crossentropy": 2.481041193008423, "epoch": 0.8976218097447796, "grad_norm": 0.027044180780649185, "grad_norm_var": 2.9493945556489236e-07, "learning_rate": 0.00026687689176932926, "loss": 2.4472, "step": 24760 }, { "crossentropy": 2.5527472496032715, "epoch": 0.8976580626450116, "grad_norm": 0.0267020296305418, "grad_norm_var": 2.9177142160148836e-07, "learning_rate": 0.00026668961704445993, "loss": 2.4058, "step": 24761 }, { "crossentropy": 2.5294933319091797, "epoch": 0.8976943155452436, "grad_norm": 0.027331864461302757, "grad_norm_var": 3.3774235706889494e-07, "learning_rate": 0.00026650240625012924, "loss": 2.4676, "step": 24762 }, { "crossentropy": 2.4005556106567383, "epoch": 0.8977305684454756, "grad_norm": 0.02637830562889576, "grad_norm_var": 3.350358701031027e-07, "learning_rate": 0.0002663152593888668, "loss": 2.346, "step": 24763 }, { "crossentropy": 2.4927003383636475, "epoch": 0.8977668213457076, "grad_norm": 0.026165373623371124, "grad_norm_var": 3.3681253753859755e-07, "learning_rate": 0.00026612817646320177, "loss": 2.466, "step": 24764 }, { "crossentropy": 2.375551462173462, "epoch": 0.8978030742459396, "grad_norm": 0.027169307693839073, "grad_norm_var": 3.5892791222045387e-07, "learning_rate": 0.0002659411574756587, "loss": 2.436, "step": 24765 }, { "crossentropy": 2.4537291526794434, "epoch": 0.8978393271461717, "grad_norm": 0.027073198929429054, "grad_norm_var": 3.372574820130481e-07, "learning_rate": 0.000265754202428764, "loss": 2.5455, "step": 24766 }, { "crossentropy": 2.4593863487243652, "epoch": 0.8978755800464037, "grad_norm": 0.02628445066511631, "grad_norm_var": 2.791878258827461e-07, "learning_rate": 0.00026556731132504395, "loss": 2.4423, "step": 24767 }, { "crossentropy": 2.4844229221343994, "epoch": 0.8979118329466357, "grad_norm": 0.025897566229104996, "grad_norm_var": 2.9192765244456567e-07, "learning_rate": 0.0002653804841670221, "loss": 2.4463, "step": 24768 }, { "crossentropy": 2.451305627822876, "epoch": 0.8979480858468677, "grad_norm": 0.026767807081341743, "grad_norm_var": 2.895696421034132e-07, "learning_rate": 0.0002651937209572219, "loss": 2.4348, "step": 24769 }, { "crossentropy": 2.406481981277466, "epoch": 0.8979843387470998, "grad_norm": 0.02738688327372074, "grad_norm_var": 3.2942127191990663e-07, "learning_rate": 0.0002650070216981654, "loss": 2.4766, "step": 24770 }, { "crossentropy": 2.4090311527252197, "epoch": 0.8980205916473318, "grad_norm": 0.02522973157465458, "grad_norm_var": 4.210457604287513e-07, "learning_rate": 0.0002648203863923743, "loss": 2.4179, "step": 24771 }, { "crossentropy": 2.603477954864502, "epoch": 0.8980568445475638, "grad_norm": 0.02789163775742054, "grad_norm_var": 5.283699581687742e-07, "learning_rate": 0.00026463381504237106, "loss": 2.5514, "step": 24772 }, { "crossentropy": 2.3711225986480713, "epoch": 0.8980930974477959, "grad_norm": 0.025668947026133537, "grad_norm_var": 5.307245464529024e-07, "learning_rate": 0.00026444730765067426, "loss": 2.4004, "step": 24773 }, { "crossentropy": 2.410080671310425, "epoch": 0.8981293503480279, "grad_norm": 0.02802026830613613, "grad_norm_var": 6.240016164790376e-07, "learning_rate": 0.0002642608642198019, "loss": 2.4394, "step": 24774 }, { "crossentropy": 2.407487392425537, "epoch": 0.8981656032482599, "grad_norm": 0.02590152621269226, "grad_norm_var": 6.38655215117317e-07, "learning_rate": 0.0002640744847522741, "loss": 2.3806, "step": 24775 }, { "crossentropy": 2.450495481491089, "epoch": 0.8982018561484919, "grad_norm": 0.026636341586709023, "grad_norm_var": 6.293598133327253e-07, "learning_rate": 0.0002638881692506068, "loss": 2.3984, "step": 24776 }, { "crossentropy": 2.484304904937744, "epoch": 0.8982381090487239, "grad_norm": 0.026805704459547997, "grad_norm_var": 6.306598919093477e-07, "learning_rate": 0.00026370191771731775, "loss": 2.4832, "step": 24777 }, { "crossentropy": 2.300900936126709, "epoch": 0.8982743619489559, "grad_norm": 0.025466056540608406, "grad_norm_var": 6.818552129055165e-07, "learning_rate": 0.00026351573015492125, "loss": 2.2253, "step": 24778 }, { "crossentropy": 2.4887611865997314, "epoch": 0.898310614849188, "grad_norm": 0.02639269456267357, "grad_norm_var": 6.815455750985855e-07, "learning_rate": 0.000263329606565933, "loss": 2.4505, "step": 24779 }, { "crossentropy": 2.485081672668457, "epoch": 0.89834686774942, "grad_norm": 0.026044636964797974, "grad_norm_var": 6.886056939573356e-07, "learning_rate": 0.00026314354695286703, "loss": 2.4537, "step": 24780 }, { "crossentropy": 2.3482823371887207, "epoch": 0.898383120649652, "grad_norm": 0.025481367483735085, "grad_norm_var": 7.250003200189846e-07, "learning_rate": 0.0002629575513182353, "loss": 2.3412, "step": 24781 }, { "crossentropy": 2.548565149307251, "epoch": 0.898419373549884, "grad_norm": 0.026168247684836388, "grad_norm_var": 6.990943570379937e-07, "learning_rate": 0.00026277161966455076, "loss": 2.5667, "step": 24782 }, { "crossentropy": 2.4522509574890137, "epoch": 0.898455626450116, "grad_norm": 0.025874922052025795, "grad_norm_var": 7.146705094780714e-07, "learning_rate": 0.0002625857519943248, "loss": 2.4042, "step": 24783 }, { "crossentropy": 2.2593889236450195, "epoch": 0.898491879350348, "grad_norm": 0.02635663002729416, "grad_norm_var": 7.000175731292427e-07, "learning_rate": 0.0002623999483100664, "loss": 2.3337, "step": 24784 }, { "crossentropy": 2.215916872024536, "epoch": 0.89852813225058, "grad_norm": 0.02676095999777317, "grad_norm_var": 6.996672216952673e-07, "learning_rate": 0.00026221420861428755, "loss": 2.3535, "step": 24785 }, { "crossentropy": 2.4861273765563965, "epoch": 0.898564385150812, "grad_norm": 0.02661723457276821, "grad_norm_var": 6.334055250606374e-07, "learning_rate": 0.00026202853290949516, "loss": 2.4246, "step": 24786 }, { "crossentropy": 2.4737985134124756, "epoch": 0.8986006380510441, "grad_norm": 0.026826277375221252, "grad_norm_var": 5.580072112597399e-07, "learning_rate": 0.0002618429211981971, "loss": 2.4991, "step": 24787 }, { "crossentropy": 2.4745254516601562, "epoch": 0.8986368909512761, "grad_norm": 0.026594677940011024, "grad_norm_var": 4.10742256489122e-07, "learning_rate": 0.00026165737348290144, "loss": 2.3893, "step": 24788 }, { "crossentropy": 2.2617380619049072, "epoch": 0.8986731438515081, "grad_norm": 0.025826044380664825, "grad_norm_var": 3.979975893142936e-07, "learning_rate": 0.00026147188976611337, "loss": 2.383, "step": 24789 }, { "crossentropy": 2.4492082595825195, "epoch": 0.8987093967517401, "grad_norm": 0.025372525677084923, "grad_norm_var": 2.5032784113481387e-07, "learning_rate": 0.00026128647005033866, "loss": 2.4778, "step": 24790 }, { "crossentropy": 2.4729557037353516, "epoch": 0.8987456496519721, "grad_norm": 0.026160573586821556, "grad_norm_var": 2.443728324463216e-07, "learning_rate": 0.0002611011143380815, "loss": 2.4362, "step": 24791 }, { "crossentropy": 2.589604616165161, "epoch": 0.8987819025522041, "grad_norm": 0.026941122487187386, "grad_norm_var": 2.674407520736882e-07, "learning_rate": 0.00026091582263184477, "loss": 2.4362, "step": 24792 }, { "crossentropy": 2.3434762954711914, "epoch": 0.8988181554524362, "grad_norm": 0.026363598182797432, "grad_norm_var": 2.4575618162782306e-07, "learning_rate": 0.00026073059493413264, "loss": 2.466, "step": 24793 }, { "crossentropy": 2.4197182655334473, "epoch": 0.8988544083526682, "grad_norm": 0.027725785970687866, "grad_norm_var": 3.428738135409041e-07, "learning_rate": 0.0002605454312474448, "loss": 2.3712, "step": 24794 }, { "crossentropy": 2.5330679416656494, "epoch": 0.8988906612529002, "grad_norm": 0.027087723836302757, "grad_norm_var": 3.775588649084036e-07, "learning_rate": 0.0002603603315742836, "loss": 2.5235, "step": 24795 }, { "crossentropy": 2.558504343032837, "epoch": 0.8989269141531323, "grad_norm": 0.027538100257515907, "grad_norm_var": 4.486581459003357e-07, "learning_rate": 0.0002601752959171488, "loss": 2.4576, "step": 24796 }, { "crossentropy": 2.408149003982544, "epoch": 0.8989631670533643, "grad_norm": 0.026645153760910034, "grad_norm_var": 3.7819559341975505e-07, "learning_rate": 0.00025999032427853955, "loss": 2.397, "step": 24797 }, { "crossentropy": 2.4799416065216064, "epoch": 0.8989994199535963, "grad_norm": 0.025925207883119583, "grad_norm_var": 3.94378830712259e-07, "learning_rate": 0.0002598054166609548, "loss": 2.4591, "step": 24798 }, { "crossentropy": 2.43770694732666, "epoch": 0.8990356728538283, "grad_norm": 0.027204781770706177, "grad_norm_var": 3.8724372033725693e-07, "learning_rate": 0.0002596205730668916, "loss": 2.4133, "step": 24799 }, { "crossentropy": 2.437415361404419, "epoch": 0.8990719257540604, "grad_norm": 0.02706136554479599, "grad_norm_var": 3.933819614743931e-07, "learning_rate": 0.00025943579349884505, "loss": 2.4758, "step": 24800 }, { "crossentropy": 2.454862594604492, "epoch": 0.8991081786542924, "grad_norm": 0.02552768960595131, "grad_norm_var": 4.72776838442851e-07, "learning_rate": 0.0002592510779593132, "loss": 2.4392, "step": 24801 }, { "crossentropy": 2.351508378982544, "epoch": 0.8991444315545244, "grad_norm": 0.027031349018216133, "grad_norm_var": 4.85075170153446e-07, "learning_rate": 0.00025906642645078914, "loss": 2.4742, "step": 24802 }, { "crossentropy": 2.4457225799560547, "epoch": 0.8991806844547564, "grad_norm": 0.028161749243736267, "grad_norm_var": 6.342529394325868e-07, "learning_rate": 0.00025888183897576747, "loss": 2.4036, "step": 24803 }, { "crossentropy": 2.3970847129821777, "epoch": 0.8992169373549884, "grad_norm": 0.026623697951436043, "grad_norm_var": 6.339059200656412e-07, "learning_rate": 0.0002586973155367428, "loss": 2.4294, "step": 24804 }, { "crossentropy": 2.5087807178497314, "epoch": 0.8992531902552204, "grad_norm": 0.025479700416326523, "grad_norm_var": 6.817514303832558e-07, "learning_rate": 0.0002585128561362049, "loss": 2.468, "step": 24805 }, { "crossentropy": 2.4653735160827637, "epoch": 0.8992894431554525, "grad_norm": 0.02549898438155651, "grad_norm_var": 6.60736866339486e-07, "learning_rate": 0.0002583284607766467, "loss": 2.4664, "step": 24806 }, { "crossentropy": 2.451493740081787, "epoch": 0.8993256960556845, "grad_norm": 0.026368234306573868, "grad_norm_var": 6.488829848567129e-07, "learning_rate": 0.0002581441294605591, "loss": 2.5013, "step": 24807 }, { "crossentropy": 2.521912097930908, "epoch": 0.8993619489559165, "grad_norm": 0.025563279166817665, "grad_norm_var": 7.230581434755952e-07, "learning_rate": 0.00025795986219042945, "loss": 2.4773, "step": 24808 }, { "crossentropy": 2.4267609119415283, "epoch": 0.8993982018561485, "grad_norm": 0.025600921362638474, "grad_norm_var": 7.847644597660612e-07, "learning_rate": 0.00025777565896874965, "loss": 2.3934, "step": 24809 }, { "crossentropy": 2.4788472652435303, "epoch": 0.8994344547563805, "grad_norm": 0.0270659402012825, "grad_norm_var": 7.098719221410958e-07, "learning_rate": 0.000257591519798005, "loss": 2.5267, "step": 24810 }, { "crossentropy": 2.2525997161865234, "epoch": 0.8994707076566125, "grad_norm": 0.026003608480095863, "grad_norm_var": 7.018419022706996e-07, "learning_rate": 0.000257407444680684, "loss": 2.3145, "step": 24811 }, { "crossentropy": 2.4196279048919678, "epoch": 0.8995069605568445, "grad_norm": 0.026751277968287468, "grad_norm_var": 6.270369132831133e-07, "learning_rate": 0.00025722343361927357, "loss": 2.4267, "step": 24812 }, { "crossentropy": 2.519195318222046, "epoch": 0.8995432134570766, "grad_norm": 0.02626355178654194, "grad_norm_var": 6.240238340059299e-07, "learning_rate": 0.00025703948661625676, "loss": 2.5523, "step": 24813 }, { "crossentropy": 2.4636480808258057, "epoch": 0.8995794663573086, "grad_norm": 0.025896204635500908, "grad_norm_var": 6.258475430656954e-07, "learning_rate": 0.0002568556036741204, "loss": 2.4813, "step": 24814 }, { "crossentropy": 2.414278030395508, "epoch": 0.8996157192575406, "grad_norm": 0.02620081417262554, "grad_norm_var": 5.786241240877347e-07, "learning_rate": 0.000256671784795347, "loss": 2.4735, "step": 24815 }, { "crossentropy": 2.4304635524749756, "epoch": 0.8996519721577726, "grad_norm": 0.026695316657423973, "grad_norm_var": 5.507491541198451e-07, "learning_rate": 0.00025648802998241903, "loss": 2.4146, "step": 24816 }, { "crossentropy": 2.449984073638916, "epoch": 0.8996882250580046, "grad_norm": 0.02523505501449108, "grad_norm_var": 5.860702592805443e-07, "learning_rate": 0.00025630433923781894, "loss": 2.439, "step": 24817 }, { "crossentropy": 2.5789883136749268, "epoch": 0.8997244779582366, "grad_norm": 0.027818430215120316, "grad_norm_var": 7.039029292494129e-07, "learning_rate": 0.00025612071256402746, "loss": 2.6082, "step": 24818 }, { "crossentropy": 2.5796618461608887, "epoch": 0.8997607308584686, "grad_norm": 0.027206724509596825, "grad_norm_var": 5.272350012871004e-07, "learning_rate": 0.00025593714996352437, "loss": 2.4828, "step": 24819 }, { "crossentropy": 2.5528621673583984, "epoch": 0.8997969837587007, "grad_norm": 0.027127275243401527, "grad_norm_var": 5.670354662443231e-07, "learning_rate": 0.0002557536514387904, "loss": 2.49, "step": 24820 }, { "crossentropy": 2.282911539077759, "epoch": 0.8998332366589327, "grad_norm": 0.025695795193314552, "grad_norm_var": 5.463634778655098e-07, "learning_rate": 0.0002555702169923019, "loss": 2.3572, "step": 24821 }, { "crossentropy": 2.427731990814209, "epoch": 0.8998694895591647, "grad_norm": 0.026157552376389503, "grad_norm_var": 5.020835433979774e-07, "learning_rate": 0.00025538684662653854, "loss": 2.424, "step": 24822 }, { "crossentropy": 2.453151226043701, "epoch": 0.8999057424593968, "grad_norm": 0.026254786178469658, "grad_norm_var": 5.026593808972324e-07, "learning_rate": 0.0002552035403439762, "loss": 2.519, "step": 24823 }, { "crossentropy": 2.4117648601531982, "epoch": 0.8999419953596288, "grad_norm": 0.026762640103697777, "grad_norm_var": 4.6738957926692413e-07, "learning_rate": 0.0002550202981470895, "loss": 2.3928, "step": 24824 }, { "crossentropy": 2.400120258331299, "epoch": 0.8999782482598608, "grad_norm": 0.027820901945233345, "grad_norm_var": 5.32669996628395e-07, "learning_rate": 0.00025483712003835534, "loss": 2.5339, "step": 24825 }, { "crossentropy": 2.251399040222168, "epoch": 0.9000145011600929, "grad_norm": 0.02609759196639061, "grad_norm_var": 5.25919344305549e-07, "learning_rate": 0.00025465400602024625, "loss": 2.3277, "step": 24826 }, { "crossentropy": 2.285066843032837, "epoch": 0.9000507540603249, "grad_norm": 0.025398794561624527, "grad_norm_var": 5.887489017646393e-07, "learning_rate": 0.0002544709560952363, "loss": 2.3751, "step": 24827 }, { "crossentropy": 2.5786099433898926, "epoch": 0.9000870069605569, "grad_norm": 0.02700883522629738, "grad_norm_var": 6.028489047603276e-07, "learning_rate": 0.00025428797026579795, "loss": 2.4157, "step": 24828 }, { "crossentropy": 2.4186854362487793, "epoch": 0.9001232598607889, "grad_norm": 0.026462918147444725, "grad_norm_var": 5.996454304257418e-07, "learning_rate": 0.0002541050485344021, "loss": 2.4112, "step": 24829 }, { "crossentropy": 2.3883869647979736, "epoch": 0.9001595127610209, "grad_norm": 0.028700510039925575, "grad_norm_var": 8.69137510158396e-07, "learning_rate": 0.00025392219090352, "loss": 2.3121, "step": 24830 }, { "crossentropy": 2.5356616973876953, "epoch": 0.9001957656612529, "grad_norm": 0.026675423607230186, "grad_norm_var": 8.538260401484583e-07, "learning_rate": 0.0002537393973756208, "loss": 2.4805, "step": 24831 }, { "crossentropy": 2.5403759479522705, "epoch": 0.9002320185614849, "grad_norm": 0.026586322113871574, "grad_norm_var": 8.545626103269195e-07, "learning_rate": 0.0002535566679531742, "loss": 2.4948, "step": 24832 }, { "crossentropy": 2.503872871398926, "epoch": 0.900268271461717, "grad_norm": 0.026851505041122437, "grad_norm_var": 7.047001676989031e-07, "learning_rate": 0.0002533740026386483, "loss": 2.4625, "step": 24833 }, { "crossentropy": 2.424072504043579, "epoch": 0.900304524361949, "grad_norm": 0.02571130357682705, "grad_norm_var": 6.930156731271379e-07, "learning_rate": 0.00025319140143450824, "loss": 2.3083, "step": 24834 }, { "crossentropy": 2.408510446548462, "epoch": 0.900340777262181, "grad_norm": 0.026016870513558388, "grad_norm_var": 6.943561690600856e-07, "learning_rate": 0.0002530088643432227, "loss": 2.4414, "step": 24835 }, { "crossentropy": 2.5514159202575684, "epoch": 0.900377030162413, "grad_norm": 0.02704080380499363, "grad_norm_var": 6.885490050340462e-07, "learning_rate": 0.00025282639136725625, "loss": 2.4537, "step": 24836 }, { "crossentropy": 2.6811180114746094, "epoch": 0.900413283062645, "grad_norm": 0.027971655130386353, "grad_norm_var": 7.446701601724562e-07, "learning_rate": 0.00025264398250907316, "loss": 2.562, "step": 24837 }, { "crossentropy": 2.2777764797210693, "epoch": 0.900449535962877, "grad_norm": 0.02683114819228649, "grad_norm_var": 7.225222865682532e-07, "learning_rate": 0.0002524616377711381, "loss": 2.3393, "step": 24838 }, { "crossentropy": 2.3894169330596924, "epoch": 0.900485788863109, "grad_norm": 0.025991596281528473, "grad_norm_var": 7.446507568995008e-07, "learning_rate": 0.0002522793571559123, "loss": 2.4487, "step": 24839 }, { "crossentropy": 2.188727617263794, "epoch": 0.900522041763341, "grad_norm": 0.026597406715154648, "grad_norm_var": 7.459806502297841e-07, "learning_rate": 0.0002520971406658601, "loss": 2.3661, "step": 24840 }, { "crossentropy": 2.665329694747925, "epoch": 0.9005582946635731, "grad_norm": 0.027735644951462746, "grad_norm_var": 7.340933977426859e-07, "learning_rate": 0.0002519149883034405, "loss": 2.5383, "step": 24841 }, { "crossentropy": 2.5087077617645264, "epoch": 0.9005945475638051, "grad_norm": 0.026253465563058853, "grad_norm_var": 7.224706746511983e-07, "learning_rate": 0.00025173290007111384, "loss": 2.4472, "step": 24842 }, { "crossentropy": 2.617110252380371, "epoch": 0.9006308004640371, "grad_norm": 0.027700867503881454, "grad_norm_var": 6.421294145117351e-07, "learning_rate": 0.0002515508759713403, "loss": 2.5451, "step": 24843 }, { "crossentropy": 2.567678213119507, "epoch": 0.9006670533642691, "grad_norm": 0.026889631524682045, "grad_norm_var": 6.41025727852497e-07, "learning_rate": 0.0002513689160065791, "loss": 2.4592, "step": 24844 }, { "crossentropy": 2.4848644733428955, "epoch": 0.9007033062645011, "grad_norm": 0.025904184207320213, "grad_norm_var": 6.913159119695602e-07, "learning_rate": 0.00025118702017928597, "loss": 2.4054, "step": 24845 }, { "crossentropy": 2.4708917140960693, "epoch": 0.9007395591647331, "grad_norm": 0.026230039075016975, "grad_norm_var": 4.603003267081643e-07, "learning_rate": 0.00025100518849191956, "loss": 2.4931, "step": 24846 }, { "crossentropy": 2.475874900817871, "epoch": 0.9007758120649652, "grad_norm": 0.025693852454423904, "grad_norm_var": 5.21999220585859e-07, "learning_rate": 0.000250823420946934, "loss": 2.4835, "step": 24847 }, { "crossentropy": 2.497563362121582, "epoch": 0.9008120649651972, "grad_norm": 0.02666923776268959, "grad_norm_var": 5.219969573607338e-07, "learning_rate": 0.0002506417175467857, "loss": 2.4522, "step": 24848 }, { "crossentropy": 2.3622331619262695, "epoch": 0.9008483178654292, "grad_norm": 0.026072774082422256, "grad_norm_var": 5.369590646004188e-07, "learning_rate": 0.0002504600782939298, "loss": 2.3708, "step": 24849 }, { "crossentropy": 2.3581738471984863, "epoch": 0.9008845707656613, "grad_norm": 0.026112597435712814, "grad_norm_var": 5.004415904065097e-07, "learning_rate": 0.0002502785031908167, "loss": 2.3611, "step": 24850 }, { "crossentropy": 2.29528546333313, "epoch": 0.9009208236658933, "grad_norm": 0.027575775980949402, "grad_norm_var": 5.296703407398999e-07, "learning_rate": 0.0002500969922398999, "loss": 2.4551, "step": 24851 }, { "crossentropy": 2.321126937866211, "epoch": 0.9009570765661253, "grad_norm": 0.02851899527013302, "grad_norm_var": 7.325350733495065e-07, "learning_rate": 0.0002499155454436325, "loss": 2.4435, "step": 24852 }, { "crossentropy": 2.3854358196258545, "epoch": 0.9009933294663574, "grad_norm": 0.02559063211083412, "grad_norm_var": 7.138849702802615e-07, "learning_rate": 0.0002497341628044636, "loss": 2.4748, "step": 24853 }, { "crossentropy": 2.3661422729492188, "epoch": 0.9010295823665894, "grad_norm": 0.026322759687900543, "grad_norm_var": 7.176232857516631e-07, "learning_rate": 0.0002495528443248446, "loss": 2.3734, "step": 24854 }, { "crossentropy": 2.274904727935791, "epoch": 0.9010658352668214, "grad_norm": 0.02698500081896782, "grad_norm_var": 6.965681918824484e-07, "learning_rate": 0.0002493715900072224, "loss": 2.4074, "step": 24855 }, { "crossentropy": 2.306749105453491, "epoch": 0.9011020881670534, "grad_norm": 0.02601476013660431, "grad_norm_var": 7.240701156557231e-07, "learning_rate": 0.00024919039985404624, "loss": 2.3097, "step": 24856 }, { "crossentropy": 2.429983615875244, "epoch": 0.9011383410672854, "grad_norm": 0.026294805109500885, "grad_norm_var": 6.43697645746899e-07, "learning_rate": 0.00024900927386776464, "loss": 2.425, "step": 24857 }, { "crossentropy": 2.4302008152008057, "epoch": 0.9011745939675174, "grad_norm": 0.026776693761348724, "grad_norm_var": 6.39992674601221e-07, "learning_rate": 0.00024882821205082117, "loss": 2.3607, "step": 24858 }, { "crossentropy": 2.32525634765625, "epoch": 0.9012108468677494, "grad_norm": 0.026042276993393898, "grad_norm_var": 5.650541785706994e-07, "learning_rate": 0.0002486472144056634, "loss": 2.4314, "step": 24859 }, { "crossentropy": 2.451814889907837, "epoch": 0.9012470997679815, "grad_norm": 0.027543485164642334, "grad_norm_var": 6.27409967754224e-07, "learning_rate": 0.00024846628093473547, "loss": 2.5219, "step": 24860 }, { "crossentropy": 2.505290985107422, "epoch": 0.9012833526682135, "grad_norm": 0.025795385241508484, "grad_norm_var": 6.371084118127057e-07, "learning_rate": 0.0002482854116404809, "loss": 2.4828, "step": 24861 }, { "crossentropy": 2.4638967514038086, "epoch": 0.9013196055684455, "grad_norm": 0.02573356404900551, "grad_norm_var": 6.713734986928006e-07, "learning_rate": 0.00024810460652534337, "loss": 2.415, "step": 24862 }, { "crossentropy": 2.491584300994873, "epoch": 0.9013558584686775, "grad_norm": 0.025831421837210655, "grad_norm_var": 6.58064593607556e-07, "learning_rate": 0.00024792386559176373, "loss": 2.401, "step": 24863 }, { "crossentropy": 2.421030044555664, "epoch": 0.9013921113689095, "grad_norm": 0.026389535516500473, "grad_norm_var": 6.563633701599092e-07, "learning_rate": 0.00024774318884218384, "loss": 2.3452, "step": 24864 }, { "crossentropy": 2.3382625579833984, "epoch": 0.9014283642691415, "grad_norm": 0.025886474177241325, "grad_norm_var": 6.685246043966937e-07, "learning_rate": 0.00024756257627904564, "loss": 2.3573, "step": 24865 }, { "crossentropy": 2.5315849781036377, "epoch": 0.9014646171693735, "grad_norm": 0.02664562128484249, "grad_norm_var": 6.613513228945529e-07, "learning_rate": 0.000247382027904785, "loss": 2.5015, "step": 24866 }, { "crossentropy": 2.591235637664795, "epoch": 0.9015008700696056, "grad_norm": 0.026561178267002106, "grad_norm_var": 5.797121769472348e-07, "learning_rate": 0.0002472015437218428, "loss": 2.5146, "step": 24867 }, { "crossentropy": 2.519338607788086, "epoch": 0.9015371229698376, "grad_norm": 0.027231499552726746, "grad_norm_var": 3.252695803261536e-07, "learning_rate": 0.0002470211237326569, "loss": 2.455, "step": 24868 }, { "crossentropy": 2.436673164367676, "epoch": 0.9015733758700696, "grad_norm": 0.02558271214365959, "grad_norm_var": 3.2607836603354194e-07, "learning_rate": 0.00024684076793966283, "loss": 2.3388, "step": 24869 }, { "crossentropy": 2.4450244903564453, "epoch": 0.9016096287703016, "grad_norm": 0.026365431025624275, "grad_norm_var": 3.260239658731673e-07, "learning_rate": 0.00024666047634529844, "loss": 2.4901, "step": 24870 }, { "crossentropy": 2.4878673553466797, "epoch": 0.9016458816705336, "grad_norm": 0.02720031887292862, "grad_norm_var": 3.470086004162768e-07, "learning_rate": 0.0002464802489519968, "loss": 2.4477, "step": 24871 }, { "crossentropy": 2.5089547634124756, "epoch": 0.9016821345707656, "grad_norm": 0.026984713971614838, "grad_norm_var": 3.600678391119192e-07, "learning_rate": 0.0002463000857621933, "loss": 2.5339, "step": 24872 }, { "crossentropy": 2.491305112838745, "epoch": 0.9017183874709976, "grad_norm": 0.026419509202241898, "grad_norm_var": 3.5880733583608784e-07, "learning_rate": 0.0002461199867783215, "loss": 2.5172, "step": 24873 }, { "crossentropy": 2.587078094482422, "epoch": 0.9017546403712297, "grad_norm": 0.0276622511446476, "grad_norm_var": 4.4794576719620463e-07, "learning_rate": 0.00024593995200281385, "loss": 2.4655, "step": 24874 }, { "crossentropy": 2.3894121646881104, "epoch": 0.9017908932714617, "grad_norm": 0.02677728235721588, "grad_norm_var": 4.376164530951743e-07, "learning_rate": 0.0002457599814381006, "loss": 2.4173, "step": 24875 }, { "crossentropy": 2.39858341217041, "epoch": 0.9018271461716937, "grad_norm": 0.02599598467350006, "grad_norm_var": 3.7985438345114475e-07, "learning_rate": 0.00024558007508661495, "loss": 2.3887, "step": 24876 }, { "crossentropy": 2.5699994564056396, "epoch": 0.9018633990719258, "grad_norm": 0.025647204369306564, "grad_norm_var": 3.939909325007332e-07, "learning_rate": 0.00024540023295078426, "loss": 2.4294, "step": 24877 }, { "crossentropy": 2.541325330734253, "epoch": 0.9018996519721578, "grad_norm": 0.027507640421390533, "grad_norm_var": 4.254496698013569e-07, "learning_rate": 0.00024522045503304004, "loss": 2.5624, "step": 24878 }, { "crossentropy": 2.492962121963501, "epoch": 0.9019359048723898, "grad_norm": 0.026133568957448006, "grad_norm_var": 4.0248667493575134e-07, "learning_rate": 0.000245040741335808, "loss": 2.4081, "step": 24879 }, { "crossentropy": 2.3232839107513428, "epoch": 0.9019721577726219, "grad_norm": 0.026394670829176903, "grad_norm_var": 4.0237028124111225e-07, "learning_rate": 0.00024486109186151685, "loss": 2.4183, "step": 24880 }, { "crossentropy": 2.4557559490203857, "epoch": 0.9020084106728539, "grad_norm": 0.026196852326393127, "grad_norm_var": 3.804248951531456e-07, "learning_rate": 0.00024468150661259347, "loss": 2.4742, "step": 24881 }, { "crossentropy": 2.4304397106170654, "epoch": 0.9020446635730859, "grad_norm": 0.025685520842671394, "grad_norm_var": 4.298480828931449e-07, "learning_rate": 0.00024450198559146266, "loss": 2.4213, "step": 24882 }, { "crossentropy": 2.4343018531799316, "epoch": 0.9020809164733179, "grad_norm": 0.026678429916501045, "grad_norm_var": 4.313253566371461e-07, "learning_rate": 0.0002443225288005485, "loss": 2.3608, "step": 24883 }, { "crossentropy": 2.5680582523345947, "epoch": 0.9021171693735499, "grad_norm": 0.026633810251951218, "grad_norm_var": 3.9766681996001157e-07, "learning_rate": 0.0002441431362422769, "loss": 2.5406, "step": 24884 }, { "crossentropy": 2.5077528953552246, "epoch": 0.9021534222737819, "grad_norm": 0.026242641732096672, "grad_norm_var": 3.449107557788762e-07, "learning_rate": 0.00024396380791906814, "loss": 2.4693, "step": 24885 }, { "crossentropy": 2.351933240890503, "epoch": 0.9021896751740139, "grad_norm": 0.026661043986678123, "grad_norm_var": 3.4377304673783674e-07, "learning_rate": 0.0002437845438333469, "loss": 2.348, "step": 24886 }, { "crossentropy": 2.485142469406128, "epoch": 0.902225928074246, "grad_norm": 0.025852004066109657, "grad_norm_var": 3.40724764712992e-07, "learning_rate": 0.00024360534398753188, "loss": 2.4787, "step": 24887 }, { "crossentropy": 2.396594762802124, "epoch": 0.902262180974478, "grad_norm": 0.026317795738577843, "grad_norm_var": 3.2249340021564545e-07, "learning_rate": 0.0002434262083840455, "loss": 2.3448, "step": 24888 }, { "crossentropy": 2.463529348373413, "epoch": 0.90229843387471, "grad_norm": 0.027246883139014244, "grad_norm_var": 3.646290792144359e-07, "learning_rate": 0.000243247137025307, "loss": 2.4324, "step": 24889 }, { "crossentropy": 2.4554097652435303, "epoch": 0.902334686774942, "grad_norm": 0.025728652253746986, "grad_norm_var": 2.9275653361480564e-07, "learning_rate": 0.0002430681299137333, "loss": 2.4771, "step": 24890 }, { "crossentropy": 2.4098258018493652, "epoch": 0.902370939675174, "grad_norm": 0.027324998751282692, "grad_norm_var": 3.4225368609716533e-07, "learning_rate": 0.00024288918705174468, "loss": 2.4863, "step": 24891 }, { "crossentropy": 2.5268688201904297, "epoch": 0.902407192575406, "grad_norm": 0.027154667302966118, "grad_norm_var": 3.6521657664466873e-07, "learning_rate": 0.000242710308441757, "loss": 2.4848, "step": 24892 }, { "crossentropy": 2.3690483570098877, "epoch": 0.902443445475638, "grad_norm": 0.026382990181446075, "grad_norm_var": 3.190293306267721e-07, "learning_rate": 0.000242531494086185, "loss": 2.4203, "step": 24893 }, { "crossentropy": 2.361171245574951, "epoch": 0.90247969837587, "grad_norm": 0.026753436774015427, "grad_norm_var": 2.5414551333124056e-07, "learning_rate": 0.00024235274398744512, "loss": 2.4151, "step": 24894 }, { "crossentropy": 2.4665729999542236, "epoch": 0.9025159512761021, "grad_norm": 0.026472633704543114, "grad_norm_var": 2.4649429656014903e-07, "learning_rate": 0.00024217405814795146, "loss": 2.5282, "step": 24895 }, { "crossentropy": 2.4846482276916504, "epoch": 0.9025522041763341, "grad_norm": 0.026642074808478355, "grad_norm_var": 2.4740810648463813e-07, "learning_rate": 0.00024199543657011714, "loss": 2.4366, "step": 24896 }, { "crossentropy": 2.492861270904541, "epoch": 0.9025884570765661, "grad_norm": 0.029181910678744316, "grad_norm_var": 6.842997317303788e-07, "learning_rate": 0.00024181687925635577, "loss": 2.4657, "step": 24897 }, { "crossentropy": 2.4176228046417236, "epoch": 0.9026247099767981, "grad_norm": 0.025981321930885315, "grad_norm_var": 6.503500184595427e-07, "learning_rate": 0.00024163838620907708, "loss": 2.3404, "step": 24898 }, { "crossentropy": 2.192011594772339, "epoch": 0.9026609628770301, "grad_norm": 0.027372373268008232, "grad_norm_var": 6.781318010300071e-07, "learning_rate": 0.00024145995743069415, "loss": 2.3895, "step": 24899 }, { "crossentropy": 2.3612349033355713, "epoch": 0.9026972157772621, "grad_norm": 0.02627829648554325, "grad_norm_var": 6.913883991738057e-07, "learning_rate": 0.00024128159292361562, "loss": 2.3672, "step": 24900 }, { "crossentropy": 2.382490634918213, "epoch": 0.9027334686774942, "grad_norm": 0.026444578543305397, "grad_norm_var": 6.809601617876882e-07, "learning_rate": 0.00024110329269025067, "loss": 2.4328, "step": 24901 }, { "crossentropy": 2.469320774078369, "epoch": 0.9027697215777262, "grad_norm": 0.02651629038155079, "grad_norm_var": 6.837401666740511e-07, "learning_rate": 0.0002409250567330079, "loss": 2.4531, "step": 24902 }, { "crossentropy": 2.2364094257354736, "epoch": 0.9028059744779582, "grad_norm": 0.027071556076407433, "grad_norm_var": 6.342242932688326e-07, "learning_rate": 0.00024074688505429376, "loss": 2.4354, "step": 24903 }, { "crossentropy": 2.599081516265869, "epoch": 0.9028422273781903, "grad_norm": 0.026895267888903618, "grad_norm_var": 6.175994134919541e-07, "learning_rate": 0.0002405687776565152, "loss": 2.5259, "step": 24904 }, { "crossentropy": 2.3672430515289307, "epoch": 0.9028784802784223, "grad_norm": 0.025224510580301285, "grad_norm_var": 7.636416120605667e-07, "learning_rate": 0.0002403907345420786, "loss": 2.3497, "step": 24905 }, { "crossentropy": 2.4912827014923096, "epoch": 0.9029147331786543, "grad_norm": 0.026931872591376305, "grad_norm_var": 6.960309465201449e-07, "learning_rate": 0.00024021275571338763, "loss": 2.5511, "step": 24906 }, { "crossentropy": 2.3234212398529053, "epoch": 0.9029509860788864, "grad_norm": 0.027383465319871902, "grad_norm_var": 7.004206648890254e-07, "learning_rate": 0.00024003484117284758, "loss": 2.2983, "step": 24907 }, { "crossentropy": 2.5353775024414062, "epoch": 0.9029872389791184, "grad_norm": 0.027496634051203728, "grad_norm_var": 7.242220662034693e-07, "learning_rate": 0.00023985699092286038, "loss": 2.5171, "step": 24908 }, { "crossentropy": 2.491412878036499, "epoch": 0.9030234918793504, "grad_norm": 0.026320533826947212, "grad_norm_var": 7.280578195822373e-07, "learning_rate": 0.0002396792049658275, "loss": 2.3789, "step": 24909 }, { "crossentropy": 2.4673943519592285, "epoch": 0.9030597447795824, "grad_norm": 0.025484517216682434, "grad_norm_var": 8.383339632202687e-07, "learning_rate": 0.00023950148330415146, "loss": 2.4164, "step": 24910 }, { "crossentropy": 2.485104560852051, "epoch": 0.9030959976798144, "grad_norm": 0.026891613379120827, "grad_norm_var": 8.348656825709599e-07, "learning_rate": 0.00023932382594023138, "loss": 2.4688, "step": 24911 }, { "crossentropy": 2.3653616905212402, "epoch": 0.9031322505800464, "grad_norm": 0.02723403088748455, "grad_norm_var": 8.476719148659342e-07, "learning_rate": 0.0002391462328764682, "loss": 2.3298, "step": 24912 }, { "crossentropy": 2.479757308959961, "epoch": 0.9031685034802784, "grad_norm": 0.025715667754411697, "grad_norm_var": 4.9512682252042e-07, "learning_rate": 0.0002389687041152605, "loss": 2.4034, "step": 24913 }, { "crossentropy": 2.333146810531616, "epoch": 0.9032047563805105, "grad_norm": 0.025910716503858566, "grad_norm_var": 5.01052335907789e-07, "learning_rate": 0.00023879123965900474, "loss": 2.3718, "step": 24914 }, { "crossentropy": 2.364699363708496, "epoch": 0.9032410092807425, "grad_norm": 0.02712923102080822, "grad_norm_var": 4.788403176809863e-07, "learning_rate": 0.00023861383951009953, "loss": 2.4654, "step": 24915 }, { "crossentropy": 2.285975933074951, "epoch": 0.9032772621809745, "grad_norm": 0.02739306539297104, "grad_norm_var": 5.149284348683499e-07, "learning_rate": 0.0002384365036709396, "loss": 2.4608, "step": 24916 }, { "crossentropy": 2.5842444896698, "epoch": 0.9033135150812065, "grad_norm": 0.028414228931069374, "grad_norm_var": 7.0930146602721e-07, "learning_rate": 0.00023825923214392032, "loss": 2.4761, "step": 24917 }, { "crossentropy": 2.411677837371826, "epoch": 0.9033497679814385, "grad_norm": 0.026049194857478142, "grad_norm_var": 7.375442884022717e-07, "learning_rate": 0.0002380820249314375, "loss": 2.3849, "step": 24918 }, { "crossentropy": 2.38614559173584, "epoch": 0.9033860208816705, "grad_norm": 0.027471205219626427, "grad_norm_var": 7.661730199543434e-07, "learning_rate": 0.000237904882035882, "loss": 2.3947, "step": 24919 }, { "crossentropy": 2.290846586227417, "epoch": 0.9034222737819025, "grad_norm": 0.025841621682047844, "grad_norm_var": 8.146742492334361e-07, "learning_rate": 0.00023772780345964807, "loss": 2.4007, "step": 24920 }, { "crossentropy": 2.440089702606201, "epoch": 0.9034585266821346, "grad_norm": 0.026844525709748268, "grad_norm_var": 6.641501862731143e-07, "learning_rate": 0.00023755078920512763, "loss": 2.445, "step": 24921 }, { "crossentropy": 2.4574947357177734, "epoch": 0.9034947795823666, "grad_norm": 0.02610902488231659, "grad_norm_var": 6.900254601364952e-07, "learning_rate": 0.00023737383927471044, "loss": 2.3624, "step": 24922 }, { "crossentropy": 2.3387136459350586, "epoch": 0.9035310324825986, "grad_norm": 0.02574811689555645, "grad_norm_var": 7.148138630461935e-07, "learning_rate": 0.00023719695367078797, "loss": 2.405, "step": 24923 }, { "crossentropy": 2.379831552505493, "epoch": 0.9035672853828306, "grad_norm": 0.026264864951372147, "grad_norm_var": 6.670422815373278e-07, "learning_rate": 0.00023702013239574826, "loss": 2.4168, "step": 24924 }, { "crossentropy": 2.5202078819274902, "epoch": 0.9036035382830626, "grad_norm": 0.027011804282665253, "grad_norm_var": 6.756308015933339e-07, "learning_rate": 0.00023684337545197944, "loss": 2.4073, "step": 24925 }, { "crossentropy": 2.4219725131988525, "epoch": 0.9036397911832946, "grad_norm": 0.02661033347249031, "grad_norm_var": 5.88215558308203e-07, "learning_rate": 0.00023666668284186954, "loss": 2.4538, "step": 24926 }, { "crossentropy": 2.3922536373138428, "epoch": 0.9036760440835266, "grad_norm": 0.026676343753933907, "grad_norm_var": 5.846061216423313e-07, "learning_rate": 0.00023649005456780392, "loss": 2.3994, "step": 24927 }, { "crossentropy": 2.4218525886535645, "epoch": 0.9037122969837587, "grad_norm": 0.026235805824398994, "grad_norm_var": 5.693513291743357e-07, "learning_rate": 0.000236313490632169, "loss": 2.4685, "step": 24928 }, { "crossentropy": 2.3485875129699707, "epoch": 0.9037485498839907, "grad_norm": 0.027135169133543968, "grad_norm_var": 5.29974281417156e-07, "learning_rate": 0.00023613699103735064, "loss": 2.4669, "step": 24929 }, { "crossentropy": 2.4891819953918457, "epoch": 0.9037848027842227, "grad_norm": 0.02595398761332035, "grad_norm_var": 5.256654686940868e-07, "learning_rate": 0.00023596055578573027, "loss": 2.5218, "step": 24930 }, { "crossentropy": 2.5105624198913574, "epoch": 0.9038210556844548, "grad_norm": 0.026490967720746994, "grad_norm_var": 5.129417006624962e-07, "learning_rate": 0.00023578418487969378, "loss": 2.4689, "step": 24931 }, { "crossentropy": 2.396625518798828, "epoch": 0.9038573085846868, "grad_norm": 0.02634977549314499, "grad_norm_var": 4.763038657493515e-07, "learning_rate": 0.00023560787832162033, "loss": 2.3997, "step": 24932 }, { "crossentropy": 2.4915709495544434, "epoch": 0.9038935614849188, "grad_norm": 0.027718927711248398, "grad_norm_var": 3.3605040768457975e-07, "learning_rate": 0.00023543163611389417, "loss": 2.3917, "step": 24933 }, { "crossentropy": 2.5523781776428223, "epoch": 0.9039298143851509, "grad_norm": 0.026505766436457634, "grad_norm_var": 3.196889192057095e-07, "learning_rate": 0.00023525545825889394, "loss": 2.4005, "step": 24934 }, { "crossentropy": 2.4410431385040283, "epoch": 0.9039660672853829, "grad_norm": 0.026516404002904892, "grad_norm_var": 2.607296766811313e-07, "learning_rate": 0.00023507934475899828, "loss": 2.4385, "step": 24935 }, { "crossentropy": 2.3998923301696777, "epoch": 0.9040023201856149, "grad_norm": 0.02682887762784958, "grad_norm_var": 2.3487119696281807e-07, "learning_rate": 0.00023490329561658808, "loss": 2.4386, "step": 24936 }, { "crossentropy": 2.436138153076172, "epoch": 0.9040385730858469, "grad_norm": 0.025710420683026314, "grad_norm_var": 2.726186717056372e-07, "learning_rate": 0.00023472731083404031, "loss": 2.3786, "step": 24937 }, { "crossentropy": 2.5229806900024414, "epoch": 0.9040748259860789, "grad_norm": 0.02694050595164299, "grad_norm_var": 2.734079942737077e-07, "learning_rate": 0.00023455139041373086, "loss": 2.5777, "step": 24938 }, { "crossentropy": 2.4588663578033447, "epoch": 0.9041110788863109, "grad_norm": 0.025257429108023643, "grad_norm_var": 3.4050284164718833e-07, "learning_rate": 0.00023437553435803782, "loss": 2.3634, "step": 24939 }, { "crossentropy": 2.499203681945801, "epoch": 0.9041473317865429, "grad_norm": 0.026638876646757126, "grad_norm_var": 3.3687350437237805e-07, "learning_rate": 0.0002341997426693343, "loss": 2.4692, "step": 24940 }, { "crossentropy": 2.502861976623535, "epoch": 0.904183584686775, "grad_norm": 0.026288706809282303, "grad_norm_var": 3.237116720492764e-07, "learning_rate": 0.00023402401534999617, "loss": 2.4318, "step": 24941 }, { "crossentropy": 2.5163052082061768, "epoch": 0.904219837587007, "grad_norm": 0.026948964223265648, "grad_norm_var": 3.362601091720189e-07, "learning_rate": 0.000233848352402396, "loss": 2.471, "step": 24942 }, { "crossentropy": 2.286966562271118, "epoch": 0.904256090487239, "grad_norm": 0.027044162154197693, "grad_norm_var": 3.5276047119155105e-07, "learning_rate": 0.0002336727538289074, "loss": 2.4101, "step": 24943 }, { "crossentropy": 2.4923956394195557, "epoch": 0.904292343387471, "grad_norm": 0.026343833655118942, "grad_norm_var": 3.4917606706065066e-07, "learning_rate": 0.00023349721963190074, "loss": 2.496, "step": 24944 }, { "crossentropy": 2.4431614875793457, "epoch": 0.904328596287703, "grad_norm": 0.025889402255415916, "grad_norm_var": 3.476533290379391e-07, "learning_rate": 0.000233321749813748, "loss": 2.4907, "step": 24945 }, { "crossentropy": 2.4495794773101807, "epoch": 0.904364849187935, "grad_norm": 0.02688724175095558, "grad_norm_var": 3.386023174053865e-07, "learning_rate": 0.00023314634437681837, "loss": 2.4007, "step": 24946 }, { "crossentropy": 2.2658116817474365, "epoch": 0.904401102088167, "grad_norm": 0.03611929342150688, "grad_norm_var": 6.092141857300101e-06, "learning_rate": 0.00023297100332348166, "loss": 2.3285, "step": 24947 }, { "crossentropy": 2.4772789478302, "epoch": 0.9044373549883991, "grad_norm": 0.026893336325883865, "grad_norm_var": 6.054475475128482e-06, "learning_rate": 0.00023279572665610537, "loss": 2.4795, "step": 24948 }, { "crossentropy": 2.418489456176758, "epoch": 0.9044736078886311, "grad_norm": 0.02558918669819832, "grad_norm_var": 6.178752305377094e-06, "learning_rate": 0.00023262051437705766, "loss": 2.4695, "step": 24949 }, { "crossentropy": 2.4341330528259277, "epoch": 0.9045098607888631, "grad_norm": 0.025898706167936325, "grad_norm_var": 6.243824596926552e-06, "learning_rate": 0.0002324453664887044, "loss": 2.3432, "step": 24950 }, { "crossentropy": 2.524592638015747, "epoch": 0.9045461136890951, "grad_norm": 0.02605142630636692, "grad_norm_var": 6.286525886303263e-06, "learning_rate": 0.000232270282993412, "loss": 2.4816, "step": 24951 }, { "crossentropy": 2.3837897777557373, "epoch": 0.9045823665893271, "grad_norm": 0.025533942505717278, "grad_norm_var": 6.413649032248105e-06, "learning_rate": 0.00023209526389354363, "loss": 2.3247, "step": 24952 }, { "crossentropy": 2.3846640586853027, "epoch": 0.9046186194895591, "grad_norm": 0.027061758562922478, "grad_norm_var": 6.317550106971619e-06, "learning_rate": 0.00023192030919146566, "loss": 2.3682, "step": 24953 }, { "crossentropy": 2.5586273670196533, "epoch": 0.9046548723897911, "grad_norm": 0.026627223938703537, "grad_norm_var": 6.32456838746172e-06, "learning_rate": 0.00023174541888953848, "loss": 2.5269, "step": 24954 }, { "crossentropy": 2.435779571533203, "epoch": 0.9046911252900232, "grad_norm": 0.025656776502728462, "grad_norm_var": 6.244833623899696e-06, "learning_rate": 0.00023157059299012573, "loss": 2.4301, "step": 24955 }, { "crossentropy": 2.3589775562286377, "epoch": 0.9047273781902552, "grad_norm": 0.026342321187257767, "grad_norm_var": 6.26330650218392e-06, "learning_rate": 0.00023139583149558884, "loss": 2.4722, "step": 24956 }, { "crossentropy": 2.5193960666656494, "epoch": 0.9047636310904872, "grad_norm": 0.026134667918086052, "grad_norm_var": 6.278341038391796e-06, "learning_rate": 0.00023122113440828763, "loss": 2.4858, "step": 24957 }, { "crossentropy": 2.369683027267456, "epoch": 0.9047998839907193, "grad_norm": 0.027181539684534073, "grad_norm_var": 6.282034142821487e-06, "learning_rate": 0.0002310465017305824, "loss": 2.4259, "step": 24958 }, { "crossentropy": 2.44998836517334, "epoch": 0.9048361368909513, "grad_norm": 0.025311313569545746, "grad_norm_var": 6.4487426813884806e-06, "learning_rate": 0.00023087193346483126, "loss": 2.3836, "step": 24959 }, { "crossentropy": 2.314013957977295, "epoch": 0.9048723897911833, "grad_norm": 0.026502961292862892, "grad_norm_var": 6.439689413219882e-06, "learning_rate": 0.0002306974296133918, "loss": 2.3758, "step": 24960 }, { "crossentropy": 2.4251291751861572, "epoch": 0.9049086426914154, "grad_norm": 0.02562198042869568, "grad_norm_var": 6.478591100779474e-06, "learning_rate": 0.00023052299017862154, "loss": 2.4454, "step": 24961 }, { "crossentropy": 2.5041255950927734, "epoch": 0.9049448955916474, "grad_norm": 0.026355944573879242, "grad_norm_var": 6.492770258339995e-06, "learning_rate": 0.0002303486151628753, "loss": 2.4811, "step": 24962 }, { "crossentropy": 2.56370210647583, "epoch": 0.9049811484918794, "grad_norm": 0.027052607387304306, "grad_norm_var": 3.707798617094115e-07, "learning_rate": 0.00023017430456850953, "loss": 2.4882, "step": 24963 }, { "crossentropy": 2.433697462081909, "epoch": 0.9050174013921114, "grad_norm": 0.026342280209064484, "grad_norm_var": 3.416438381802223e-07, "learning_rate": 0.00023000005839787896, "loss": 2.3866, "step": 24964 }, { "crossentropy": 2.5342752933502197, "epoch": 0.9050536542923434, "grad_norm": 0.02619619108736515, "grad_norm_var": 3.149097621677702e-07, "learning_rate": 0.00022982587665333566, "loss": 2.5051, "step": 24965 }, { "crossentropy": 2.535041570663452, "epoch": 0.9050899071925754, "grad_norm": 0.02586640603840351, "grad_norm_var": 3.1645332980393544e-07, "learning_rate": 0.00022965175933723327, "loss": 2.5124, "step": 24966 }, { "crossentropy": 2.3784186840057373, "epoch": 0.9051261600928074, "grad_norm": 0.026473382487893105, "grad_norm_var": 3.1697427363506364e-07, "learning_rate": 0.00022947770645192323, "loss": 2.3601, "step": 24967 }, { "crossentropy": 2.3361258506774902, "epoch": 0.9051624129930395, "grad_norm": 0.02770998887717724, "grad_norm_var": 4.0042801847563436e-07, "learning_rate": 0.0002293037179997559, "loss": 2.4111, "step": 24968 }, { "crossentropy": 2.553318500518799, "epoch": 0.9051986658932715, "grad_norm": 0.0282786525785923, "grad_norm_var": 5.999732686370508e-07, "learning_rate": 0.0002291297939830822, "loss": 2.5866, "step": 24969 }, { "crossentropy": 2.518950939178467, "epoch": 0.9052349187935035, "grad_norm": 0.027091221883893013, "grad_norm_var": 6.22636977009224e-07, "learning_rate": 0.00022895593440424967, "loss": 2.4257, "step": 24970 }, { "crossentropy": 2.449369430541992, "epoch": 0.9052711716937355, "grad_norm": 0.026643475517630577, "grad_norm_var": 5.715788464555685e-07, "learning_rate": 0.0002287821392656081, "loss": 2.4468, "step": 24971 }, { "crossentropy": 2.3677444458007812, "epoch": 0.9053074245939675, "grad_norm": 0.02629702165722847, "grad_norm_var": 5.730765780979802e-07, "learning_rate": 0.0002286084085695045, "loss": 2.3744, "step": 24972 }, { "crossentropy": 2.3534460067749023, "epoch": 0.9053436774941995, "grad_norm": 0.026124538853764534, "grad_norm_var": 5.736658293654939e-07, "learning_rate": 0.00022843474231828476, "loss": 2.4547, "step": 24973 }, { "crossentropy": 2.2431600093841553, "epoch": 0.9053799303944315, "grad_norm": 0.02687389776110649, "grad_norm_var": 5.543156282039607e-07, "learning_rate": 0.00022826114051429592, "loss": 2.3634, "step": 24974 }, { "crossentropy": 2.3013362884521484, "epoch": 0.9054161832946636, "grad_norm": 0.025959566235542297, "grad_norm_var": 4.738299163936181e-07, "learning_rate": 0.0002280876031598811, "loss": 2.3619, "step": 24975 }, { "crossentropy": 2.423884391784668, "epoch": 0.9054524361948956, "grad_norm": 0.026071559637784958, "grad_norm_var": 4.90288783246935e-07, "learning_rate": 0.0002279141302573845, "loss": 2.4153, "step": 24976 }, { "crossentropy": 2.3944709300994873, "epoch": 0.9054886890951276, "grad_norm": 0.026506677269935608, "grad_norm_var": 4.285678930271517e-07, "learning_rate": 0.00022774072180915039, "loss": 2.4585, "step": 24977 }, { "crossentropy": 2.3495750427246094, "epoch": 0.9055249419953596, "grad_norm": 0.027005242183804512, "grad_norm_var": 4.32471433464178e-07, "learning_rate": 0.00022756737781751913, "loss": 2.4416, "step": 24978 }, { "crossentropy": 2.5526816844940186, "epoch": 0.9055611948955916, "grad_norm": 0.027574699372053146, "grad_norm_var": 4.771307398713023e-07, "learning_rate": 0.0002273940982848327, "loss": 2.514, "step": 24979 }, { "crossentropy": 2.6180481910705566, "epoch": 0.9055974477958236, "grad_norm": 0.02657780982553959, "grad_norm_var": 4.6972756083129137e-07, "learning_rate": 0.0002272208832134326, "loss": 2.5729, "step": 24980 }, { "crossentropy": 2.4950525760650635, "epoch": 0.9056337006960556, "grad_norm": 0.02748887985944748, "grad_norm_var": 4.867898883491202e-07, "learning_rate": 0.00022704773260565637, "loss": 2.4563, "step": 24981 }, { "crossentropy": 2.4207193851470947, "epoch": 0.9056699535962877, "grad_norm": 0.026895731687545776, "grad_norm_var": 4.2708401656019593e-07, "learning_rate": 0.00022687464646384492, "loss": 2.4722, "step": 24982 }, { "crossentropy": 2.429420232772827, "epoch": 0.9057062064965197, "grad_norm": 0.02565496414899826, "grad_norm_var": 5.098558698434415e-07, "learning_rate": 0.00022670162479033419, "loss": 2.4711, "step": 24983 }, { "crossentropy": 2.4665229320526123, "epoch": 0.9057424593967517, "grad_norm": 0.02609354816377163, "grad_norm_var": 4.7641454922221306e-07, "learning_rate": 0.00022652866758746172, "loss": 2.4438, "step": 24984 }, { "crossentropy": 2.496912956237793, "epoch": 0.9057787122969838, "grad_norm": 0.026058414950966835, "grad_norm_var": 3.1601769261261196e-07, "learning_rate": 0.000226355774857564, "loss": 2.5584, "step": 24985 }, { "crossentropy": 2.544956922531128, "epoch": 0.9058149651972158, "grad_norm": 0.02648870274424553, "grad_norm_var": 2.958161919699535e-07, "learning_rate": 0.00022618294660297467, "loss": 2.4786, "step": 24986 }, { "crossentropy": 2.511376142501831, "epoch": 0.9058512180974478, "grad_norm": 0.026628371328115463, "grad_norm_var": 2.9558112081249215e-07, "learning_rate": 0.0002260101828260297, "loss": 2.5304, "step": 24987 }, { "crossentropy": 2.3970181941986084, "epoch": 0.9058874709976799, "grad_norm": 0.027367770671844482, "grad_norm_var": 3.355855393344e-07, "learning_rate": 0.00022583748352906274, "loss": 2.432, "step": 24988 }, { "crossentropy": 2.495244264602661, "epoch": 0.9059237238979119, "grad_norm": 0.02640044316649437, "grad_norm_var": 3.2338028969249265e-07, "learning_rate": 0.00022566484871440473, "loss": 2.4302, "step": 24989 }, { "crossentropy": 2.4066247940063477, "epoch": 0.9059599767981439, "grad_norm": 0.02771468460559845, "grad_norm_var": 3.979439751257419e-07, "learning_rate": 0.0002254922783843888, "loss": 2.4529, "step": 24990 }, { "crossentropy": 2.4374213218688965, "epoch": 0.9059962296983759, "grad_norm": 0.025788750499486923, "grad_norm_var": 4.1561646314195594e-07, "learning_rate": 0.00022531977254134584, "loss": 2.3819, "step": 24991 }, { "crossentropy": 2.5686001777648926, "epoch": 0.9060324825986079, "grad_norm": 0.02659398876130581, "grad_norm_var": 3.9274679092444045e-07, "learning_rate": 0.000225147331187604, "loss": 2.5303, "step": 24992 }, { "crossentropy": 2.3396384716033936, "epoch": 0.9060687354988399, "grad_norm": 0.02519902214407921, "grad_norm_var": 5.293886480823752e-07, "learning_rate": 0.0002249749543254942, "loss": 2.2809, "step": 24993 }, { "crossentropy": 2.5264768600463867, "epoch": 0.9061049883990719, "grad_norm": 0.02708233892917633, "grad_norm_var": 5.339701712725001e-07, "learning_rate": 0.00022480264195734345, "loss": 2.5483, "step": 24994 }, { "crossentropy": 2.3827764987945557, "epoch": 0.906141241299304, "grad_norm": 0.02652505598962307, "grad_norm_var": 4.664890904467241e-07, "learning_rate": 0.00022463039408547936, "loss": 2.4301, "step": 24995 }, { "crossentropy": 2.5407423973083496, "epoch": 0.906177494199536, "grad_norm": 0.02556653320789337, "grad_norm_var": 5.24621441101945e-07, "learning_rate": 0.00022445821071223005, "loss": 2.5024, "step": 24996 }, { "crossentropy": 2.599154472351074, "epoch": 0.906213747099768, "grad_norm": 0.02639860287308693, "grad_norm_var": 4.5104773752352216e-07, "learning_rate": 0.00022428609183991867, "loss": 2.5088, "step": 24997 }, { "crossentropy": 2.388787269592285, "epoch": 0.90625, "grad_norm": 0.025872481986880302, "grad_norm_var": 4.4933882206766156e-07, "learning_rate": 0.00022411403747087168, "loss": 2.4034, "step": 24998 }, { "crossentropy": 2.1324005126953125, "epoch": 0.906286252900232, "grad_norm": 0.026884106919169426, "grad_norm_var": 4.3156053165261755e-07, "learning_rate": 0.0002239420476074111, "loss": 2.2376, "step": 24999 }, { "crossentropy": 2.3731493949890137, "epoch": 0.906322505800464, "grad_norm": 0.025420701131224632, "grad_norm_var": 4.88822051705982e-07, "learning_rate": 0.00022377012225186234, "loss": 2.3903, "step": 25000 }, { "crossentropy": 2.3515427112579346, "epoch": 0.906358758700696, "grad_norm": 0.025820842012763023, "grad_norm_var": 5.023580227372347e-07, "learning_rate": 0.00022359826140654627, "loss": 2.366, "step": 25001 }, { "crossentropy": 2.3971519470214844, "epoch": 0.9063950116009281, "grad_norm": 0.025763973593711853, "grad_norm_var": 5.227025088818764e-07, "learning_rate": 0.0002234264650737833, "loss": 2.4249, "step": 25002 }, { "crossentropy": 2.419018507003784, "epoch": 0.9064312645011601, "grad_norm": 0.026545871049165726, "grad_norm_var": 5.196723277168617e-07, "learning_rate": 0.00022325473325589485, "loss": 2.5197, "step": 25003 }, { "crossentropy": 2.425881862640381, "epoch": 0.9064675174013921, "grad_norm": 0.026055194437503815, "grad_norm_var": 4.420679920978665e-07, "learning_rate": 0.00022308306595520078, "loss": 2.4738, "step": 25004 }, { "crossentropy": 2.4106600284576416, "epoch": 0.9065037703016241, "grad_norm": 0.025863923132419586, "grad_norm_var": 4.4765406387124034e-07, "learning_rate": 0.0002229114631740181, "loss": 2.3729, "step": 25005 }, { "crossentropy": 2.293797492980957, "epoch": 0.9065400232018561, "grad_norm": 0.026566466316580772, "grad_norm_var": 2.971681419495677e-07, "learning_rate": 0.00022273992491466665, "loss": 2.3875, "step": 25006 }, { "crossentropy": 2.427610397338867, "epoch": 0.9065762761020881, "grad_norm": 0.025817954912781715, "grad_norm_var": 2.9592480983326823e-07, "learning_rate": 0.0002225684511794618, "loss": 2.3732, "step": 25007 }, { "crossentropy": 2.4738121032714844, "epoch": 0.9066125290023201, "grad_norm": 0.025757888332009315, "grad_norm_var": 2.8717355870412863e-07, "learning_rate": 0.00022239704197072054, "loss": 2.4111, "step": 25008 }, { "crossentropy": 2.3729357719421387, "epoch": 0.9066487819025522, "grad_norm": 0.02713986486196518, "grad_norm_var": 2.9687320680626035e-07, "learning_rate": 0.00022222569729075714, "loss": 2.4325, "step": 25009 }, { "crossentropy": 2.3753628730773926, "epoch": 0.9066850348027842, "grad_norm": 0.02738231047987938, "grad_norm_var": 3.3808282301478246e-07, "learning_rate": 0.0002220544171418859, "loss": 2.4912, "step": 25010 }, { "crossentropy": 2.435800075531006, "epoch": 0.9067212877030162, "grad_norm": 0.02691752463579178, "grad_norm_var": 3.641252090343655e-07, "learning_rate": 0.0002218832015264205, "loss": 2.3543, "step": 25011 }, { "crossentropy": 2.4942269325256348, "epoch": 0.9067575406032483, "grad_norm": 0.026520948857069016, "grad_norm_var": 3.358777572924487e-07, "learning_rate": 0.0002217120504466741, "loss": 2.4837, "step": 25012 }, { "crossentropy": 2.3626716136932373, "epoch": 0.9067937935034803, "grad_norm": 0.02630641497671604, "grad_norm_var": 3.351421123109581e-07, "learning_rate": 0.00022154096390495704, "loss": 2.3, "step": 25013 }, { "crossentropy": 2.3883860111236572, "epoch": 0.9068300464037123, "grad_norm": 0.02581333927810192, "grad_norm_var": 3.3865140711570687e-07, "learning_rate": 0.0002213699419035814, "loss": 2.399, "step": 25014 }, { "crossentropy": 2.3522117137908936, "epoch": 0.9068662993039444, "grad_norm": 0.026312757283449173, "grad_norm_var": 3.1349648475762343e-07, "learning_rate": 0.00022119898444485585, "loss": 2.3565, "step": 25015 }, { "crossentropy": 2.5077860355377197, "epoch": 0.9069025522041764, "grad_norm": 0.025486527010798454, "grad_norm_var": 3.064854454234613e-07, "learning_rate": 0.00022102809153109082, "loss": 2.5449, "step": 25016 }, { "crossentropy": 2.4595086574554443, "epoch": 0.9069388051044084, "grad_norm": 0.02611265704035759, "grad_norm_var": 2.949351239122632e-07, "learning_rate": 0.00022085726316459442, "loss": 2.4318, "step": 25017 }, { "crossentropy": 2.387751340866089, "epoch": 0.9069750580046404, "grad_norm": 0.026676002889871597, "grad_norm_var": 2.8505618235076985e-07, "learning_rate": 0.00022068649934767204, "loss": 2.4137, "step": 25018 }, { "crossentropy": 2.5676021575927734, "epoch": 0.9070113109048724, "grad_norm": 0.026780616492033005, "grad_norm_var": 2.952654222928656e-07, "learning_rate": 0.0002205158000826313, "loss": 2.468, "step": 25019 }, { "crossentropy": 2.1284162998199463, "epoch": 0.9070475638051044, "grad_norm": 0.02719554863870144, "grad_norm_var": 3.325680861665138e-07, "learning_rate": 0.00022034516537177863, "loss": 2.335, "step": 25020 }, { "crossentropy": 2.4259555339813232, "epoch": 0.9070838167053364, "grad_norm": 0.026791216805577278, "grad_norm_var": 3.1809247191168695e-07, "learning_rate": 0.00022017459521741722, "loss": 2.4467, "step": 25021 }, { "crossentropy": 2.537496566772461, "epoch": 0.9071200696055685, "grad_norm": 0.026774216443300247, "grad_norm_var": 3.233616179538438e-07, "learning_rate": 0.00022000408962185192, "loss": 2.5321, "step": 25022 }, { "crossentropy": 2.541278123855591, "epoch": 0.9071563225058005, "grad_norm": 0.02559630200266838, "grad_norm_var": 3.461935433171637e-07, "learning_rate": 0.0002198336485873853, "loss": 2.5346, "step": 25023 }, { "crossentropy": 2.4942173957824707, "epoch": 0.9071925754060325, "grad_norm": 0.02663918025791645, "grad_norm_var": 3.1073451832661083e-07, "learning_rate": 0.00021966327211631886, "loss": 2.4294, "step": 25024 }, { "crossentropy": 2.6263668537139893, "epoch": 0.9072288283062645, "grad_norm": 0.027015630155801773, "grad_norm_var": 3.0156118339206425e-07, "learning_rate": 0.00021949296021095578, "loss": 2.5863, "step": 25025 }, { "crossentropy": 2.4920496940612793, "epoch": 0.9072650812064965, "grad_norm": 0.027227001264691353, "grad_norm_var": 2.852136520818758e-07, "learning_rate": 0.0002193227128735936, "loss": 2.4615, "step": 25026 }, { "crossentropy": 2.550828695297241, "epoch": 0.9073013341067285, "grad_norm": 0.026064522564411163, "grad_norm_var": 2.8438201341561033e-07, "learning_rate": 0.00021915253010653334, "loss": 2.5423, "step": 25027 }, { "crossentropy": 2.307218551635742, "epoch": 0.9073375870069605, "grad_norm": 0.025820840150117874, "grad_norm_var": 3.0905218470210435e-07, "learning_rate": 0.00021898241191207368, "loss": 2.3069, "step": 25028 }, { "crossentropy": 2.3723487854003906, "epoch": 0.9073738399071926, "grad_norm": 0.026665018871426582, "grad_norm_var": 3.119789769573013e-07, "learning_rate": 0.00021881235829251166, "loss": 2.373, "step": 25029 }, { "crossentropy": 2.465646743774414, "epoch": 0.9074100928074246, "grad_norm": 0.02740625850856304, "grad_norm_var": 3.383808901345388e-07, "learning_rate": 0.0002186423692501449, "loss": 2.4667, "step": 25030 }, { "crossentropy": 2.2606635093688965, "epoch": 0.9074463457076566, "grad_norm": 0.02567444182932377, "grad_norm_var": 3.827839522722402e-07, "learning_rate": 0.00021847244478726823, "loss": 2.3375, "step": 25031 }, { "crossentropy": 2.490413188934326, "epoch": 0.9074825986078886, "grad_norm": 0.026660004630684853, "grad_norm_var": 3.1100167494814656e-07, "learning_rate": 0.0002183025849061776, "loss": 2.3548, "step": 25032 }, { "crossentropy": 2.412914752960205, "epoch": 0.9075188515081206, "grad_norm": 0.025676481425762177, "grad_norm_var": 3.4941516339181043e-07, "learning_rate": 0.0002181327896091684, "loss": 2.3504, "step": 25033 }, { "crossentropy": 2.5736520290374756, "epoch": 0.9075551044083526, "grad_norm": 0.029199600219726562, "grad_norm_var": 7.927216955416183e-07, "learning_rate": 0.00021796305889853096, "loss": 2.4716, "step": 25034 }, { "crossentropy": 2.605281352996826, "epoch": 0.9075913573085846, "grad_norm": 0.026850897818803787, "grad_norm_var": 7.93793540572405e-07, "learning_rate": 0.00021779339277655963, "loss": 2.5197, "step": 25035 }, { "crossentropy": 2.5551412105560303, "epoch": 0.9076276102088167, "grad_norm": 0.025666959583759308, "grad_norm_var": 8.395595131863945e-07, "learning_rate": 0.00021762379124554642, "loss": 2.4448, "step": 25036 }, { "crossentropy": 2.423248767852783, "epoch": 0.9076638631090487, "grad_norm": 0.027053939178586006, "grad_norm_var": 8.502902205653144e-07, "learning_rate": 0.0002174542543077812, "loss": 2.465, "step": 25037 }, { "crossentropy": 2.4425125122070312, "epoch": 0.9077001160092807, "grad_norm": 0.02591128461062908, "grad_norm_var": 8.79599848094964e-07, "learning_rate": 0.0002172847819655538, "loss": 2.4118, "step": 25038 }, { "crossentropy": 2.452084541320801, "epoch": 0.9077363689095128, "grad_norm": 0.026341231539845467, "grad_norm_var": 8.175189134211932e-07, "learning_rate": 0.00021711537422115345, "loss": 2.514, "step": 25039 }, { "crossentropy": 2.5331177711486816, "epoch": 0.9077726218097448, "grad_norm": 0.02654583565890789, "grad_norm_var": 8.177884404870211e-07, "learning_rate": 0.00021694603107686838, "loss": 2.4994, "step": 25040 }, { "crossentropy": 2.571152925491333, "epoch": 0.9078088747099768, "grad_norm": 0.02573630027472973, "grad_norm_var": 8.511026130488796e-07, "learning_rate": 0.00021677675253498562, "loss": 2.4852, "step": 25041 }, { "crossentropy": 2.4654107093811035, "epoch": 0.9078451276102089, "grad_norm": 0.02681683376431465, "grad_norm_var": 8.235696210539261e-07, "learning_rate": 0.00021660753859779224, "loss": 2.4739, "step": 25042 }, { "crossentropy": 2.5724613666534424, "epoch": 0.9078813805104409, "grad_norm": 0.029556384310126305, "grad_norm_var": 1.3802559986813543e-06, "learning_rate": 0.00021643838926757253, "loss": 2.4425, "step": 25043 }, { "crossentropy": 2.565293550491333, "epoch": 0.9079176334106729, "grad_norm": 0.027094952762126923, "grad_norm_var": 1.3283037874162647e-06, "learning_rate": 0.00021626930454661186, "loss": 2.4896, "step": 25044 }, { "crossentropy": 2.5877726078033447, "epoch": 0.9079538863109049, "grad_norm": 0.027830857783555984, "grad_norm_var": 1.391722206486506e-06, "learning_rate": 0.00021610028443719344, "loss": 2.5079, "step": 25045 }, { "crossentropy": 2.5169851779937744, "epoch": 0.9079901392111369, "grad_norm": 0.026467548683285713, "grad_norm_var": 1.380476865267298e-06, "learning_rate": 0.00021593132894160094, "loss": 2.4481, "step": 25046 }, { "crossentropy": 2.513572931289673, "epoch": 0.9080263921113689, "grad_norm": 0.026196112856268883, "grad_norm_var": 1.3179634954261215e-06, "learning_rate": 0.00021576243806211538, "loss": 2.4609, "step": 25047 }, { "crossentropy": 2.3771812915802, "epoch": 0.9080626450116009, "grad_norm": 0.027284452691674232, "grad_norm_var": 1.3264883013664459e-06, "learning_rate": 0.00021559361180101876, "loss": 2.4324, "step": 25048 }, { "crossentropy": 2.414177179336548, "epoch": 0.908098897911833, "grad_norm": 0.02690143510699272, "grad_norm_var": 1.222175147955875e-06, "learning_rate": 0.00021542485016059153, "loss": 2.3979, "step": 25049 }, { "crossentropy": 2.4724645614624023, "epoch": 0.908135150812065, "grad_norm": 0.02619091048836708, "grad_norm_var": 8.918760823199556e-07, "learning_rate": 0.0002152561531431124, "loss": 2.4271, "step": 25050 }, { "crossentropy": 2.300102949142456, "epoch": 0.908171403712297, "grad_norm": 0.025694238021969795, "grad_norm_var": 9.64230169502034e-07, "learning_rate": 0.00021508752075085903, "loss": 2.3644, "step": 25051 }, { "crossentropy": 2.439323902130127, "epoch": 0.908207656612529, "grad_norm": 0.02624843269586563, "grad_norm_var": 9.048381393377455e-07, "learning_rate": 0.0002149189529861112, "loss": 2.4373, "step": 25052 }, { "crossentropy": 2.3547041416168213, "epoch": 0.908243909512761, "grad_norm": 0.026849890127778053, "grad_norm_var": 8.989514797258976e-07, "learning_rate": 0.0002147504498511438, "loss": 2.4593, "step": 25053 }, { "crossentropy": 2.4262709617614746, "epoch": 0.908280162412993, "grad_norm": 0.02603348344564438, "grad_norm_var": 8.865588311613663e-07, "learning_rate": 0.00021458201134823385, "loss": 2.4525, "step": 25054 }, { "crossentropy": 2.3901190757751465, "epoch": 0.908316415313225, "grad_norm": 0.026027588173747063, "grad_norm_var": 9.092496762674729e-07, "learning_rate": 0.0002144136374796557, "loss": 2.4629, "step": 25055 }, { "crossentropy": 2.426795721054077, "epoch": 0.9083526682134571, "grad_norm": 0.026958810165524483, "grad_norm_var": 9.104728465195948e-07, "learning_rate": 0.00021424532824768362, "loss": 2.422, "step": 25056 }, { "crossentropy": 2.4104321002960205, "epoch": 0.9083889211136891, "grad_norm": 0.025889195501804352, "grad_norm_var": 8.914110000728924e-07, "learning_rate": 0.00021407708365459133, "loss": 2.396, "step": 25057 }, { "crossentropy": 2.319913148880005, "epoch": 0.9084251740139211, "grad_norm": 0.025821512565016747, "grad_norm_var": 9.447991611566252e-07, "learning_rate": 0.00021390890370265092, "loss": 2.3648, "step": 25058 }, { "crossentropy": 2.5185844898223877, "epoch": 0.9084614269141531, "grad_norm": 0.02589549869298935, "grad_norm_var": 3.834724743930392e-07, "learning_rate": 0.0002137407883941339, "loss": 2.4789, "step": 25059 }, { "crossentropy": 2.5255887508392334, "epoch": 0.9084976798143851, "grad_norm": 0.028034640476107597, "grad_norm_var": 5.180199548617531e-07, "learning_rate": 0.00021357273773131124, "loss": 2.5075, "step": 25060 }, { "crossentropy": 2.57338547706604, "epoch": 0.9085339327146171, "grad_norm": 0.02813887968659401, "grad_norm_var": 5.777743593415498e-07, "learning_rate": 0.00021340475171645223, "loss": 2.5275, "step": 25061 }, { "crossentropy": 2.4391911029815674, "epoch": 0.9085701856148491, "grad_norm": 0.026631565764546394, "grad_norm_var": 5.778813499558345e-07, "learning_rate": 0.00021323683035182617, "loss": 2.4306, "step": 25062 }, { "crossentropy": 2.399742603302002, "epoch": 0.9086064385150812, "grad_norm": 0.026370588690042496, "grad_norm_var": 5.715562054923901e-07, "learning_rate": 0.00021306897363970013, "loss": 2.3677, "step": 25063 }, { "crossentropy": 2.27789044380188, "epoch": 0.9086426914153132, "grad_norm": 0.025932233780622482, "grad_norm_var": 5.553467186191942e-07, "learning_rate": 0.00021290118158234227, "loss": 2.452, "step": 25064 }, { "crossentropy": 2.3512158393859863, "epoch": 0.9086789443155452, "grad_norm": 0.02631331793963909, "grad_norm_var": 5.436178020232681e-07, "learning_rate": 0.00021273345418201917, "loss": 2.3519, "step": 25065 }, { "crossentropy": 2.303544521331787, "epoch": 0.9087151972157773, "grad_norm": 0.02689339965581894, "grad_norm_var": 5.511839051299793e-07, "learning_rate": 0.00021256579144099508, "loss": 2.3898, "step": 25066 }, { "crossentropy": 2.5590035915374756, "epoch": 0.9087514501160093, "grad_norm": 0.026576371863484383, "grad_norm_var": 5.070076478263495e-07, "learning_rate": 0.00021239819336153542, "loss": 2.535, "step": 25067 }, { "crossentropy": 2.2995400428771973, "epoch": 0.9087877030162413, "grad_norm": 0.02689696103334427, "grad_norm_var": 5.082154045256299e-07, "learning_rate": 0.00021223065994590396, "loss": 2.324, "step": 25068 }, { "crossentropy": 2.394620418548584, "epoch": 0.9088239559164734, "grad_norm": 0.025672825053334236, "grad_norm_var": 5.522933878056187e-07, "learning_rate": 0.00021206319119636274, "loss": 2.4163, "step": 25069 }, { "crossentropy": 2.391347885131836, "epoch": 0.9088602088167054, "grad_norm": 0.02591073326766491, "grad_norm_var": 5.609593086480415e-07, "learning_rate": 0.00021189578711517442, "loss": 2.4344, "step": 25070 }, { "crossentropy": 2.4584338665008545, "epoch": 0.9088964617169374, "grad_norm": 0.026319319382309914, "grad_norm_var": 5.479900878893694e-07, "learning_rate": 0.00021172844770459887, "loss": 2.463, "step": 25071 }, { "crossentropy": 2.387392044067383, "epoch": 0.9089327146171694, "grad_norm": 0.02628215216100216, "grad_norm_var": 5.366550846202702e-07, "learning_rate": 0.0002115611729668976, "loss": 2.3871, "step": 25072 }, { "crossentropy": 2.215043544769287, "epoch": 0.9089689675174014, "grad_norm": 0.025761056691408157, "grad_norm_var": 5.476676634769689e-07, "learning_rate": 0.00021139396290432988, "loss": 2.3482, "step": 25073 }, { "crossentropy": 2.3067986965179443, "epoch": 0.9090052204176334, "grad_norm": 0.025700505822896957, "grad_norm_var": 5.589761531656888e-07, "learning_rate": 0.00021122681751915284, "loss": 2.3335, "step": 25074 }, { "crossentropy": 2.546966314315796, "epoch": 0.9090414733178654, "grad_norm": 0.02720620110630989, "grad_norm_var": 5.68022154021769e-07, "learning_rate": 0.00021105973681362524, "loss": 2.4606, "step": 25075 }, { "crossentropy": 2.324117660522461, "epoch": 0.9090777262180975, "grad_norm": 0.02683325670659542, "grad_norm_var": 4.188191303183819e-07, "learning_rate": 0.0002108927207900041, "loss": 2.332, "step": 25076 }, { "crossentropy": 2.3500053882598877, "epoch": 0.9091139791183295, "grad_norm": 0.02659977599978447, "grad_norm_var": 2.2336028914007876e-07, "learning_rate": 0.00021072576945054322, "loss": 2.3886, "step": 25077 }, { "crossentropy": 2.4250168800354004, "epoch": 0.9091502320185615, "grad_norm": 0.026726486161351204, "grad_norm_var": 2.2724940741750132e-07, "learning_rate": 0.0002105588827974997, "loss": 2.402, "step": 25078 }, { "crossentropy": 2.3435540199279785, "epoch": 0.9091864849187935, "grad_norm": 0.025352494791150093, "grad_norm_var": 2.9258957529644003e-07, "learning_rate": 0.0002103920608331261, "loss": 2.3625, "step": 25079 }, { "crossentropy": 2.442551851272583, "epoch": 0.9092227378190255, "grad_norm": 0.028681904077529907, "grad_norm_var": 6.262431916320785e-07, "learning_rate": 0.00021022530355967628, "loss": 2.4926, "step": 25080 }, { "crossentropy": 2.501845121383667, "epoch": 0.9092589907192575, "grad_norm": 0.026939278468489647, "grad_norm_var": 6.365769094666355e-07, "learning_rate": 0.0002100586109794028, "loss": 2.4473, "step": 25081 }, { "crossentropy": 2.39030122756958, "epoch": 0.9092952436194895, "grad_norm": 0.025726528838276863, "grad_norm_var": 6.6389976221311e-07, "learning_rate": 0.00020989198309455672, "loss": 2.4055, "step": 25082 }, { "crossentropy": 2.3944077491760254, "epoch": 0.9093314965197216, "grad_norm": 0.025908522307872772, "grad_norm_var": 6.804444731421985e-07, "learning_rate": 0.00020972541990738836, "loss": 2.3827, "step": 25083 }, { "crossentropy": 2.4794445037841797, "epoch": 0.9093677494199536, "grad_norm": 0.02651442587375641, "grad_norm_var": 6.64619118018285e-07, "learning_rate": 0.00020955892142014877, "loss": 2.457, "step": 25084 }, { "crossentropy": 2.3191676139831543, "epoch": 0.9094040023201856, "grad_norm": 0.026128893718123436, "grad_norm_var": 6.344055158107823e-07, "learning_rate": 0.00020939248763508444, "loss": 2.4491, "step": 25085 }, { "crossentropy": 2.4141249656677246, "epoch": 0.9094402552204176, "grad_norm": 0.02596244588494301, "grad_norm_var": 6.311166112520893e-07, "learning_rate": 0.00020922611855444528, "loss": 2.4116, "step": 25086 }, { "crossentropy": 2.4756674766540527, "epoch": 0.9094765081206496, "grad_norm": 0.025481589138507843, "grad_norm_var": 6.856885563043576e-07, "learning_rate": 0.0002090598141804767, "loss": 2.4491, "step": 25087 }, { "crossentropy": 2.4554998874664307, "epoch": 0.9095127610208816, "grad_norm": 0.026503844186663628, "grad_norm_var": 6.863750755803915e-07, "learning_rate": 0.00020889357451542634, "loss": 2.4005, "step": 25088 }, { "crossentropy": 2.425229072570801, "epoch": 0.9095490139211136, "grad_norm": 0.025758106261491776, "grad_norm_var": 6.866178082102209e-07, "learning_rate": 0.00020872739956153964, "loss": 2.4186, "step": 25089 }, { "crossentropy": 2.4692728519439697, "epoch": 0.9095852668213457, "grad_norm": 0.026306085288524628, "grad_norm_var": 6.549544793358798e-07, "learning_rate": 0.00020856128932105977, "loss": 2.499, "step": 25090 }, { "crossentropy": 2.5696630477905273, "epoch": 0.9096215197215777, "grad_norm": 0.027510875836014748, "grad_norm_var": 6.92923144096115e-07, "learning_rate": 0.0002083952437962311, "loss": 2.61, "step": 25091 }, { "crossentropy": 2.5915451049804688, "epoch": 0.9096577726218097, "grad_norm": 0.02632274478673935, "grad_norm_var": 6.819949797310602e-07, "learning_rate": 0.00020822926298929678, "loss": 2.517, "step": 25092 }, { "crossentropy": 2.455548048019409, "epoch": 0.9096940255220418, "grad_norm": 0.025838423520326614, "grad_norm_var": 6.980958666708553e-07, "learning_rate": 0.0002080633469024973, "loss": 2.4757, "step": 25093 }, { "crossentropy": 2.440563678741455, "epoch": 0.9097302784222738, "grad_norm": 0.025913838297128677, "grad_norm_var": 6.9900149187845e-07, "learning_rate": 0.00020789749553807468, "loss": 2.4168, "step": 25094 }, { "crossentropy": 2.417574882507324, "epoch": 0.9097665313225058, "grad_norm": 0.026478854939341545, "grad_norm_var": 6.355274978150337e-07, "learning_rate": 0.0002077317088982683, "loss": 2.3842, "step": 25095 }, { "crossentropy": 2.3345820903778076, "epoch": 0.9098027842227379, "grad_norm": 0.026970500126481056, "grad_norm_var": 2.9184087220721484e-07, "learning_rate": 0.00020756598698531747, "loss": 2.3306, "step": 25096 }, { "crossentropy": 2.4085981845855713, "epoch": 0.9098390371229699, "grad_norm": 0.026987064629793167, "grad_norm_var": 2.962698108056767e-07, "learning_rate": 0.0002074003298014615, "loss": 2.4197, "step": 25097 }, { "crossentropy": 2.448617696762085, "epoch": 0.9098752900232019, "grad_norm": 0.026519669219851494, "grad_norm_var": 2.7816156089067546e-07, "learning_rate": 0.0002072347373489364, "loss": 2.4579, "step": 25098 }, { "crossentropy": 2.5275349617004395, "epoch": 0.9099115429234339, "grad_norm": 0.02645217441022396, "grad_norm_var": 2.6687110130083935e-07, "learning_rate": 0.00020706920962998033, "loss": 2.5177, "step": 25099 }, { "crossentropy": 2.365360736846924, "epoch": 0.9099477958236659, "grad_norm": 0.026474157348275185, "grad_norm_var": 2.661062463234683e-07, "learning_rate": 0.00020690374664682765, "loss": 2.3976, "step": 25100 }, { "crossentropy": 2.412931203842163, "epoch": 0.9099840487238979, "grad_norm": 0.02628372609615326, "grad_norm_var": 2.630280170197076e-07, "learning_rate": 0.0002067383484017138, "loss": 2.4314, "step": 25101 }, { "crossentropy": 2.26814341545105, "epoch": 0.91002030162413, "grad_norm": 0.02565678395330906, "grad_norm_var": 2.8508007425357785e-07, "learning_rate": 0.00020657301489687252, "loss": 2.3909, "step": 25102 }, { "crossentropy": 2.432007074356079, "epoch": 0.910056554524362, "grad_norm": 0.027927132323384285, "grad_norm_var": 3.7859279661763407e-07, "learning_rate": 0.0002064077461345365, "loss": 2.4599, "step": 25103 }, { "crossentropy": 2.4792237281799316, "epoch": 0.910092807424594, "grad_norm": 0.02755161188542843, "grad_norm_var": 4.4858178857945e-07, "learning_rate": 0.00020624254211693894, "loss": 2.4869, "step": 25104 }, { "crossentropy": 2.3492136001586914, "epoch": 0.910129060324826, "grad_norm": 0.02593245543539524, "grad_norm_var": 4.31852360687629e-07, "learning_rate": 0.00020607740284631082, "loss": 2.3781, "step": 25105 }, { "crossentropy": 2.406649351119995, "epoch": 0.910165313225058, "grad_norm": 0.027480633929371834, "grad_norm_var": 4.766846995331164e-07, "learning_rate": 0.00020591232832488149, "loss": 2.3675, "step": 25106 }, { "crossentropy": 2.359107255935669, "epoch": 0.91020156612529, "grad_norm": 0.02687269262969494, "grad_norm_var": 4.283583156337491e-07, "learning_rate": 0.0002057473185548825, "loss": 2.3364, "step": 25107 }, { "crossentropy": 2.4135725498199463, "epoch": 0.910237819025522, "grad_norm": 0.026759913191199303, "grad_norm_var": 4.239145574207991e-07, "learning_rate": 0.00020558237353854092, "loss": 2.4164, "step": 25108 }, { "crossentropy": 2.348740339279175, "epoch": 0.910274071925754, "grad_norm": 0.02767128124833107, "grad_norm_var": 4.401288928344733e-07, "learning_rate": 0.00020541749327808556, "loss": 2.2021, "step": 25109 }, { "crossentropy": 2.4804906845092773, "epoch": 0.9103103248259861, "grad_norm": 0.026975490152835846, "grad_norm_var": 3.9280852173635233e-07, "learning_rate": 0.00020525267777574298, "loss": 2.4312, "step": 25110 }, { "crossentropy": 2.386469841003418, "epoch": 0.9103465777262181, "grad_norm": 0.027189230546355247, "grad_norm_var": 3.927810216936001e-07, "learning_rate": 0.00020508792703373858, "loss": 2.388, "step": 25111 }, { "crossentropy": 2.3666157722473145, "epoch": 0.9103828306264501, "grad_norm": 0.02559639886021614, "grad_norm_var": 4.899102239253137e-07, "learning_rate": 0.00020492324105429838, "loss": 2.4243, "step": 25112 }, { "crossentropy": 2.201587200164795, "epoch": 0.9104190835266821, "grad_norm": 0.026986362412571907, "grad_norm_var": 4.898899921601067e-07, "learning_rate": 0.00020475861983964728, "loss": 2.2249, "step": 25113 }, { "crossentropy": 2.3773865699768066, "epoch": 0.9104553364269141, "grad_norm": 0.026564911007881165, "grad_norm_var": 4.885042012379153e-07, "learning_rate": 0.00020459406339200736, "loss": 2.4512, "step": 25114 }, { "crossentropy": 2.3131394386291504, "epoch": 0.9104915893271461, "grad_norm": 0.02659429982304573, "grad_norm_var": 4.83678777851021e-07, "learning_rate": 0.000204429571713603, "loss": 2.3512, "step": 25115 }, { "crossentropy": 2.4561357498168945, "epoch": 0.9105278422273781, "grad_norm": 0.02662106417119503, "grad_norm_var": 4.789915172201745e-07, "learning_rate": 0.00020426514480665348, "loss": 2.4504, "step": 25116 }, { "crossentropy": 2.4833505153656006, "epoch": 0.9105640951276102, "grad_norm": 0.025167042389512062, "grad_norm_var": 6.325308547505999e-07, "learning_rate": 0.00020410078267338205, "loss": 2.4302, "step": 25117 }, { "crossentropy": 2.2941627502441406, "epoch": 0.9106003480278422, "grad_norm": 0.025402599945664406, "grad_norm_var": 6.726604531240617e-07, "learning_rate": 0.0002039364853160075, "loss": 2.4032, "step": 25118 }, { "crossentropy": 2.5477800369262695, "epoch": 0.9106366009280742, "grad_norm": 0.026380041614174843, "grad_norm_var": 5.703227632738167e-07, "learning_rate": 0.00020377225273674805, "loss": 2.5461, "step": 25119 }, { "crossentropy": 2.3627147674560547, "epoch": 0.9106728538283063, "grad_norm": 0.02600781060755253, "grad_norm_var": 5.252791275011376e-07, "learning_rate": 0.000203608084937823, "loss": 2.3464, "step": 25120 }, { "crossentropy": 2.4940998554229736, "epoch": 0.9107091067285383, "grad_norm": 0.026252055540680885, "grad_norm_var": 5.069395673694959e-07, "learning_rate": 0.00020344398192145063, "loss": 2.4545, "step": 25121 }, { "crossentropy": 2.474059820175171, "epoch": 0.9107453596287703, "grad_norm": 0.027015147730708122, "grad_norm_var": 4.616432283058852e-07, "learning_rate": 0.00020327994368984526, "loss": 2.443, "step": 25122 }, { "crossentropy": 2.4560208320617676, "epoch": 0.9107816125290024, "grad_norm": 0.02562139742076397, "grad_norm_var": 4.979096619072415e-07, "learning_rate": 0.000203115970245224, "loss": 2.4736, "step": 25123 }, { "crossentropy": 2.5055699348449707, "epoch": 0.9108178654292344, "grad_norm": 0.026242433115839958, "grad_norm_var": 4.915599061206758e-07, "learning_rate": 0.00020295206158980062, "loss": 2.4988, "step": 25124 }, { "crossentropy": 2.420799732208252, "epoch": 0.9108541183294664, "grad_norm": 0.026699749752879143, "grad_norm_var": 3.8496308854621407e-07, "learning_rate": 0.0002027882177257895, "loss": 2.3534, "step": 25125 }, { "crossentropy": 2.2905287742614746, "epoch": 0.9108903712296984, "grad_norm": 0.025506604462862015, "grad_norm_var": 3.9383559047167827e-07, "learning_rate": 0.00020262443865540493, "loss": 2.3038, "step": 25126 }, { "crossentropy": 2.3769583702087402, "epoch": 0.9109266241299304, "grad_norm": 0.02637704648077488, "grad_norm_var": 3.3231834058987847e-07, "learning_rate": 0.00020246072438085628, "loss": 2.425, "step": 25127 }, { "crossentropy": 2.5009589195251465, "epoch": 0.9109628770301624, "grad_norm": 0.02632904425263405, "grad_norm_var": 3.079106065559852e-07, "learning_rate": 0.00020229707490435623, "loss": 2.4776, "step": 25128 }, { "crossentropy": 2.392977476119995, "epoch": 0.9109991299303944, "grad_norm": 0.026740984991192818, "grad_norm_var": 2.8710698282118546e-07, "learning_rate": 0.00020213349022811522, "loss": 2.4316, "step": 25129 }, { "crossentropy": 2.4922001361846924, "epoch": 0.9110353828306265, "grad_norm": 0.025912843644618988, "grad_norm_var": 2.837062472384651e-07, "learning_rate": 0.00020196997035434205, "loss": 2.3774, "step": 25130 }, { "crossentropy": 2.358692169189453, "epoch": 0.9110716357308585, "grad_norm": 0.02763582579791546, "grad_norm_var": 4.0912399332595065e-07, "learning_rate": 0.00020180651528524607, "loss": 2.3849, "step": 25131 }, { "crossentropy": 2.611945152282715, "epoch": 0.9111078886310905, "grad_norm": 0.02719680592417717, "grad_norm_var": 4.587500439727036e-07, "learning_rate": 0.00020164312502303383, "loss": 2.5554, "step": 25132 }, { "crossentropy": 2.2249085903167725, "epoch": 0.9111441415313225, "grad_norm": 0.027351291850209236, "grad_norm_var": 4.326685676179128e-07, "learning_rate": 0.00020147979956991357, "loss": 2.3357, "step": 25133 }, { "crossentropy": 2.474074602127075, "epoch": 0.9111803944315545, "grad_norm": 0.026607848703861237, "grad_norm_var": 3.6044686494833034e-07, "learning_rate": 0.00020131653892809133, "loss": 2.4672, "step": 25134 }, { "crossentropy": 2.406416177749634, "epoch": 0.9112166473317865, "grad_norm": 0.02671339362859726, "grad_norm_var": 3.624021802025567e-07, "learning_rate": 0.00020115334309977085, "loss": 2.3834, "step": 25135 }, { "crossentropy": 2.432011842727661, "epoch": 0.9112529002320185, "grad_norm": 0.02648279257118702, "grad_norm_var": 3.4449951841434426e-07, "learning_rate": 0.0002009902120871565, "loss": 2.4203, "step": 25136 }, { "crossentropy": 2.42737078666687, "epoch": 0.9112891531322506, "grad_norm": 0.026493722572922707, "grad_norm_var": 3.3878031810933433e-07, "learning_rate": 0.00020082714589245265, "loss": 2.4519, "step": 25137 }, { "crossentropy": 2.335151195526123, "epoch": 0.9113254060324826, "grad_norm": 0.028220003470778465, "grad_norm_var": 5.029604759134697e-07, "learning_rate": 0.00020066414451786087, "loss": 2.3559, "step": 25138 }, { "crossentropy": 2.3728761672973633, "epoch": 0.9113616589327146, "grad_norm": 0.026113923639059067, "grad_norm_var": 4.51674199899984e-07, "learning_rate": 0.0002005012079655838, "loss": 2.386, "step": 25139 }, { "crossentropy": 2.4496281147003174, "epoch": 0.9113979118329466, "grad_norm": 0.027061741799116135, "grad_norm_var": 4.47573765576433e-07, "learning_rate": 0.00020033833623782027, "loss": 2.3818, "step": 25140 }, { "crossentropy": 2.3637142181396484, "epoch": 0.9114341647331786, "grad_norm": 0.026388777419924736, "grad_norm_var": 4.5425946340419777e-07, "learning_rate": 0.00020017552933677184, "loss": 2.4021, "step": 25141 }, { "crossentropy": 2.4999096393585205, "epoch": 0.9114704176334106, "grad_norm": 0.026499856263399124, "grad_norm_var": 3.584306016990446e-07, "learning_rate": 0.00020001278726463733, "loss": 2.4278, "step": 25142 }, { "crossentropy": 2.3815743923187256, "epoch": 0.9115066705336426, "grad_norm": 0.02679162658751011, "grad_norm_var": 3.4812203791867447e-07, "learning_rate": 0.00019985011002361387, "loss": 2.4499, "step": 25143 }, { "crossentropy": 2.489739179611206, "epoch": 0.9115429234338747, "grad_norm": 0.026150314137339592, "grad_norm_var": 3.6095523395078023e-07, "learning_rate": 0.0001996874976158991, "loss": 2.512, "step": 25144 }, { "crossentropy": 2.477341890335083, "epoch": 0.9115791763341067, "grad_norm": 0.026204843074083328, "grad_norm_var": 3.8118144476909784e-07, "learning_rate": 0.0001995249500436902, "loss": 2.4942, "step": 25145 }, { "crossentropy": 2.342197895050049, "epoch": 0.9116154292343387, "grad_norm": 0.026334373280405998, "grad_norm_var": 3.4584798587337873e-07, "learning_rate": 0.00019936246730918095, "loss": 2.3737, "step": 25146 }, { "crossentropy": 2.4265198707580566, "epoch": 0.9116516821345708, "grad_norm": 0.02693236619234085, "grad_norm_var": 2.951395398422795e-07, "learning_rate": 0.00019920004941456683, "loss": 2.4299, "step": 25147 }, { "crossentropy": 2.362583875656128, "epoch": 0.9116879350348028, "grad_norm": 0.0257036704570055, "grad_norm_var": 3.3985026614209653e-07, "learning_rate": 0.0001990376963620416, "loss": 2.4859, "step": 25148 }, { "crossentropy": 2.365999698638916, "epoch": 0.9117241879350348, "grad_norm": 0.026925884187221527, "grad_norm_var": 3.101441678548626e-07, "learning_rate": 0.000198875408153798, "loss": 2.4072, "step": 25149 }, { "crossentropy": 2.4024367332458496, "epoch": 0.9117604408352669, "grad_norm": 0.026431484147906303, "grad_norm_var": 3.119405773819994e-07, "learning_rate": 0.00019871318479202816, "loss": 2.4515, "step": 25150 }, { "crossentropy": 2.5108566284179688, "epoch": 0.9117966937354989, "grad_norm": 0.028170382604002953, "grad_norm_var": 4.6848119632693594e-07, "learning_rate": 0.00019855102627892363, "loss": 2.4228, "step": 25151 }, { "crossentropy": 2.478268623352051, "epoch": 0.9118329466357309, "grad_norm": 0.025401536375284195, "grad_norm_var": 5.702139132979089e-07, "learning_rate": 0.0001983889326166727, "loss": 2.429, "step": 25152 }, { "crossentropy": 2.411715269088745, "epoch": 0.9118691995359629, "grad_norm": 0.02664729207754135, "grad_norm_var": 5.692244473743183e-07, "learning_rate": 0.00019822690380746688, "loss": 2.397, "step": 25153 }, { "crossentropy": 2.422947645187378, "epoch": 0.9119054524361949, "grad_norm": 0.02630765177309513, "grad_norm_var": 3.9074877360064284e-07, "learning_rate": 0.00019806493985349282, "loss": 2.3883, "step": 25154 }, { "crossentropy": 2.526473045349121, "epoch": 0.9119417053364269, "grad_norm": 0.02687181532382965, "grad_norm_var": 3.872197889414596e-07, "learning_rate": 0.00019790304075693988, "loss": 2.5375, "step": 25155 }, { "crossentropy": 2.2843143939971924, "epoch": 0.911977958236659, "grad_norm": 0.025891978293657303, "grad_norm_var": 3.9315607753409357e-07, "learning_rate": 0.00019774120651999295, "loss": 2.3279, "step": 25156 }, { "crossentropy": 2.4321846961975098, "epoch": 0.912014211136891, "grad_norm": 0.02710331231355667, "grad_norm_var": 4.1653088512746905e-07, "learning_rate": 0.0001975794371448386, "loss": 2.505, "step": 25157 }, { "crossentropy": 2.443359375, "epoch": 0.912050464037123, "grad_norm": 0.026342814788222313, "grad_norm_var": 4.1855737179773606e-07, "learning_rate": 0.0001974177326336629, "loss": 2.4332, "step": 25158 }, { "crossentropy": 2.5103821754455566, "epoch": 0.912086716937355, "grad_norm": 0.02555117942392826, "grad_norm_var": 4.6867840376140407e-07, "learning_rate": 0.000197256092988648, "loss": 2.41, "step": 25159 }, { "crossentropy": 2.4147133827209473, "epoch": 0.912122969837587, "grad_norm": 0.027501394972205162, "grad_norm_var": 5.313599302068465e-07, "learning_rate": 0.00019709451821197876, "loss": 2.3921, "step": 25160 }, { "crossentropy": 2.173219680786133, "epoch": 0.912159222737819, "grad_norm": 0.027614932507276535, "grad_norm_var": 5.963554251038616e-07, "learning_rate": 0.00019693300830583683, "loss": 2.3463, "step": 25161 }, { "crossentropy": 2.364645481109619, "epoch": 0.912195475638051, "grad_norm": 0.02714681811630726, "grad_norm_var": 6.079411595378999e-07, "learning_rate": 0.00019677156327240264, "loss": 2.3928, "step": 25162 }, { "crossentropy": 2.3740835189819336, "epoch": 0.912231728538283, "grad_norm": 0.026212334632873535, "grad_norm_var": 6.14102776637313e-07, "learning_rate": 0.00019661018311385782, "loss": 2.4146, "step": 25163 }, { "crossentropy": 2.318199634552002, "epoch": 0.9122679814385151, "grad_norm": 0.02673182636499405, "grad_norm_var": 5.553729229127904e-07, "learning_rate": 0.00019644886783238224, "loss": 2.3727, "step": 25164 }, { "crossentropy": 2.3669114112854004, "epoch": 0.9123042343387471, "grad_norm": 0.026795372366905212, "grad_norm_var": 5.521289750070087e-07, "learning_rate": 0.00019628761743015422, "loss": 2.2855, "step": 25165 }, { "crossentropy": 2.3332629203796387, "epoch": 0.9123404872389791, "grad_norm": 0.02643931843340397, "grad_norm_var": 5.518835253686168e-07, "learning_rate": 0.00019612643190935198, "loss": 2.3337, "step": 25166 }, { "crossentropy": 2.5024325847625732, "epoch": 0.9123767401392111, "grad_norm": 0.027618154883384705, "grad_norm_var": 4.605153625272344e-07, "learning_rate": 0.0001959653112721521, "loss": 2.4945, "step": 25167 }, { "crossentropy": 2.497866630554199, "epoch": 0.9124129930394431, "grad_norm": 0.025915706530213356, "grad_norm_var": 3.924012090751923e-07, "learning_rate": 0.00019580425552073123, "loss": 2.4334, "step": 25168 }, { "crossentropy": 2.437849760055542, "epoch": 0.9124492459396751, "grad_norm": 0.02734430693089962, "grad_norm_var": 4.208183991618574e-07, "learning_rate": 0.00019564326465726478, "loss": 2.396, "step": 25169 }, { "crossentropy": 2.3777577877044678, "epoch": 0.9124854988399071, "grad_norm": 0.025784723460674286, "grad_norm_var": 4.660885281510072e-07, "learning_rate": 0.00019548233868392663, "loss": 2.4222, "step": 25170 }, { "crossentropy": 2.466432571411133, "epoch": 0.9125217517401392, "grad_norm": 0.026906324550509453, "grad_norm_var": 4.670495741781559e-07, "learning_rate": 0.00019532147760289, "loss": 2.4403, "step": 25171 }, { "crossentropy": 2.365352153778076, "epoch": 0.9125580046403712, "grad_norm": 0.026742136105895042, "grad_norm_var": 4.227516601487304e-07, "learning_rate": 0.00019516068141632925, "loss": 2.3943, "step": 25172 }, { "crossentropy": 2.4858462810516357, "epoch": 0.9125942575406032, "grad_norm": 0.026640303432941437, "grad_norm_var": 4.1337660219655365e-07, "learning_rate": 0.00019499995012641435, "loss": 2.4586, "step": 25173 }, { "crossentropy": 2.4456491470336914, "epoch": 0.9126305104408353, "grad_norm": 0.02658022753894329, "grad_norm_var": 4.054192879798657e-07, "learning_rate": 0.00019483928373531745, "loss": 2.4764, "step": 25174 }, { "crossentropy": 2.5262973308563232, "epoch": 0.9126667633410673, "grad_norm": 0.026083296164870262, "grad_norm_var": 3.4016708064242505e-07, "learning_rate": 0.00019467868224520791, "loss": 2.5052, "step": 25175 }, { "crossentropy": 2.4187567234039307, "epoch": 0.9127030162412993, "grad_norm": 0.02750667929649353, "grad_norm_var": 3.4069572334311883e-07, "learning_rate": 0.00019451814565825566, "loss": 2.4035, "step": 25176 }, { "crossentropy": 2.607475757598877, "epoch": 0.9127392691415314, "grad_norm": 0.02654651179909706, "grad_norm_var": 2.8938211672121146e-07, "learning_rate": 0.00019435767397662785, "loss": 2.4971, "step": 25177 }, { "crossentropy": 2.5120232105255127, "epoch": 0.9127755220417634, "grad_norm": 0.025599557906389236, "grad_norm_var": 3.4417320945929974e-07, "learning_rate": 0.0001941972672024922, "loss": 2.4771, "step": 25178 }, { "crossentropy": 2.371595621109009, "epoch": 0.9128117749419954, "grad_norm": 0.02724594622850418, "grad_norm_var": 3.5883895810501163e-07, "learning_rate": 0.00019403692533801587, "loss": 2.4155, "step": 25179 }, { "crossentropy": 2.301135778427124, "epoch": 0.9128480278422274, "grad_norm": 0.026700593531131744, "grad_norm_var": 3.5858009423763287e-07, "learning_rate": 0.00019387664838536433, "loss": 2.3983, "step": 25180 }, { "crossentropy": 2.4056508541107178, "epoch": 0.9128842807424594, "grad_norm": 0.026030858978629112, "grad_norm_var": 3.8060478015448466e-07, "learning_rate": 0.00019371643634670145, "loss": 2.4552, "step": 25181 }, { "crossentropy": 2.3494160175323486, "epoch": 0.9129205336426914, "grad_norm": 0.026746364310383797, "grad_norm_var": 3.797023038503803e-07, "learning_rate": 0.0001935562892241932, "loss": 2.3367, "step": 25182 }, { "crossentropy": 2.382674217224121, "epoch": 0.9129567865429234, "grad_norm": 0.025835568085312843, "grad_norm_var": 3.4212856052603417e-07, "learning_rate": 0.00019339620702000016, "loss": 2.3688, "step": 25183 }, { "crossentropy": 2.471090316772461, "epoch": 0.9129930394431555, "grad_norm": 0.027560856193304062, "grad_norm_var": 3.80252465112009e-07, "learning_rate": 0.00019323618973628555, "loss": 2.473, "step": 25184 }, { "crossentropy": 2.499584436416626, "epoch": 0.9130292923433875, "grad_norm": 0.02586432173848152, "grad_norm_var": 3.734104018419341e-07, "learning_rate": 0.00019307623737521208, "loss": 2.5161, "step": 25185 }, { "crossentropy": 2.408574342727661, "epoch": 0.9130655452436195, "grad_norm": 0.026476692408323288, "grad_norm_var": 3.3518530574299225e-07, "learning_rate": 0.00019291634993893802, "loss": 2.4277, "step": 25186 }, { "crossentropy": 2.4262895584106445, "epoch": 0.9131017981438515, "grad_norm": 0.027034956961870193, "grad_norm_var": 3.4204537688777246e-07, "learning_rate": 0.00019275652742962334, "loss": 2.3477, "step": 25187 }, { "crossentropy": 2.5223045349121094, "epoch": 0.9131380510440835, "grad_norm": 0.026728913187980652, "grad_norm_var": 3.4176106926257256e-07, "learning_rate": 0.00019259676984942744, "loss": 2.4169, "step": 25188 }, { "crossentropy": 2.5182087421417236, "epoch": 0.9131743039443155, "grad_norm": 0.025411849841475487, "grad_norm_var": 4.251955362484401e-07, "learning_rate": 0.00019243707720050684, "loss": 2.4653, "step": 25189 }, { "crossentropy": 2.4292476177215576, "epoch": 0.9132105568445475, "grad_norm": 0.02765565924346447, "grad_norm_var": 5.094034898210526e-07, "learning_rate": 0.00019227744948502047, "loss": 2.4227, "step": 25190 }, { "crossentropy": 2.3537659645080566, "epoch": 0.9132468097447796, "grad_norm": 0.02623756229877472, "grad_norm_var": 5.009974108992662e-07, "learning_rate": 0.0001921178867051221, "loss": 2.3659, "step": 25191 }, { "crossentropy": 2.438876152038574, "epoch": 0.9132830626450116, "grad_norm": 0.026324644684791565, "grad_norm_var": 4.4131729948515033e-07, "learning_rate": 0.00019195838886296778, "loss": 2.4434, "step": 25192 }, { "crossentropy": 2.45440936088562, "epoch": 0.9133193155452436, "grad_norm": 0.02601500228047371, "grad_norm_var": 4.556812999425049e-07, "learning_rate": 0.0001917989559607136, "loss": 2.4095, "step": 25193 }, { "crossentropy": 2.4653074741363525, "epoch": 0.9133555684454756, "grad_norm": 0.026543932035565376, "grad_norm_var": 4.02217011316539e-07, "learning_rate": 0.00019163958800050952, "loss": 2.4306, "step": 25194 }, { "crossentropy": 2.445390224456787, "epoch": 0.9133918213457076, "grad_norm": 0.026164427399635315, "grad_norm_var": 3.7148361304855984e-07, "learning_rate": 0.0001914802849845104, "loss": 2.3869, "step": 25195 }, { "crossentropy": 2.5125439167022705, "epoch": 0.9134280742459396, "grad_norm": 0.027802463620901108, "grad_norm_var": 4.829682484434883e-07, "learning_rate": 0.0001913210469148674, "loss": 2.4746, "step": 25196 }, { "crossentropy": 2.482255220413208, "epoch": 0.9134643271461717, "grad_norm": 0.026379790157079697, "grad_norm_var": 4.6748923749477476e-07, "learning_rate": 0.0001911618737937304, "loss": 2.4265, "step": 25197 }, { "crossentropy": 2.5428807735443115, "epoch": 0.9135005800464037, "grad_norm": 0.02738647535443306, "grad_norm_var": 5.099481089593239e-07, "learning_rate": 0.00019100276562325047, "loss": 2.5162, "step": 25198 }, { "crossentropy": 2.300200939178467, "epoch": 0.9135368329466357, "grad_norm": 0.025287816300988197, "grad_norm_var": 5.83721896030287e-07, "learning_rate": 0.0001908437224055759, "loss": 2.4159, "step": 25199 }, { "crossentropy": 2.4017887115478516, "epoch": 0.9135730858468677, "grad_norm": 0.028737690299749374, "grad_norm_var": 8.281561376481209e-07, "learning_rate": 0.00019068474414285443, "loss": 2.3227, "step": 25200 }, { "crossentropy": 2.3719100952148438, "epoch": 0.9136093387470998, "grad_norm": 0.02578677050769329, "grad_norm_var": 8.364312961633603e-07, "learning_rate": 0.00019052583083723485, "loss": 2.3765, "step": 25201 }, { "crossentropy": 2.203211545944214, "epoch": 0.9136455916473318, "grad_norm": 0.02599789761006832, "grad_norm_var": 8.601257695981315e-07, "learning_rate": 0.0001903669824908627, "loss": 2.3091, "step": 25202 }, { "crossentropy": 2.5486936569213867, "epoch": 0.9136818445475638, "grad_norm": 0.026887904852628708, "grad_norm_var": 8.528214844963702e-07, "learning_rate": 0.00019020819910588294, "loss": 2.5019, "step": 25203 }, { "crossentropy": 2.4031496047973633, "epoch": 0.9137180974477959, "grad_norm": 0.025805175304412842, "grad_norm_var": 8.883409211526393e-07, "learning_rate": 0.0001900494806844405, "loss": 2.3499, "step": 25204 }, { "crossentropy": 2.524280071258545, "epoch": 0.9137543503480279, "grad_norm": 0.027272608131170273, "grad_norm_var": 8.281798586121884e-07, "learning_rate": 0.00018989082722867923, "loss": 2.3987, "step": 25205 }, { "crossentropy": 2.4564337730407715, "epoch": 0.9137906032482599, "grad_norm": 0.026491407305002213, "grad_norm_var": 7.556776382151853e-07, "learning_rate": 0.00018973223874074185, "loss": 2.4571, "step": 25206 }, { "crossentropy": 2.471308946609497, "epoch": 0.9138268561484919, "grad_norm": 0.025484561920166016, "grad_norm_var": 8.24502338441178e-07, "learning_rate": 0.00018957371522277056, "loss": 2.3596, "step": 25207 }, { "crossentropy": 2.3064091205596924, "epoch": 0.9138631090487239, "grad_norm": 0.02648937702178955, "grad_norm_var": 8.218408675086479e-07, "learning_rate": 0.00018941525667690585, "loss": 2.4064, "step": 25208 }, { "crossentropy": 2.5000405311584473, "epoch": 0.9138993619489559, "grad_norm": 0.026432806625962257, "grad_norm_var": 8.038762171505882e-07, "learning_rate": 0.00018925686310528877, "loss": 2.4606, "step": 25209 }, { "crossentropy": 2.5005128383636475, "epoch": 0.913935614849188, "grad_norm": 0.026332279667258263, "grad_norm_var": 8.071137657871767e-07, "learning_rate": 0.00018909853451005876, "loss": 2.4158, "step": 25210 }, { "crossentropy": 2.362638473510742, "epoch": 0.91397186774942, "grad_norm": 0.02578675001859665, "grad_norm_var": 8.35254488698459e-07, "learning_rate": 0.00018894027089335353, "loss": 2.41, "step": 25211 }, { "crossentropy": 2.372013568878174, "epoch": 0.914008120649652, "grad_norm": 0.026245757937431335, "grad_norm_var": 7.210655785047041e-07, "learning_rate": 0.0001887820722573108, "loss": 2.4355, "step": 25212 }, { "crossentropy": 2.3363149166107178, "epoch": 0.914044373549884, "grad_norm": 0.026412012055516243, "grad_norm_var": 7.209348751363934e-07, "learning_rate": 0.0001886239386040678, "loss": 2.3826, "step": 25213 }, { "crossentropy": 2.2546370029449463, "epoch": 0.914080626450116, "grad_norm": 0.026933889836072922, "grad_norm_var": 6.758576465199298e-07, "learning_rate": 0.00018846586993575998, "loss": 2.3104, "step": 25214 }, { "crossentropy": 2.4653186798095703, "epoch": 0.914116879350348, "grad_norm": 0.026935914531350136, "grad_norm_var": 6.014335428557981e-07, "learning_rate": 0.00018830786625452178, "loss": 2.425, "step": 25215 }, { "crossentropy": 2.456373929977417, "epoch": 0.91415313225058, "grad_norm": 0.025732912123203278, "grad_norm_var": 2.7004645232313794e-07, "learning_rate": 0.00018814992756248817, "loss": 2.4022, "step": 25216 }, { "crossentropy": 2.416332244873047, "epoch": 0.914189385150812, "grad_norm": 0.027866048738360405, "grad_norm_var": 3.940215005079443e-07, "learning_rate": 0.0001879920538617924, "loss": 2.5394, "step": 25217 }, { "crossentropy": 2.357556104660034, "epoch": 0.9142256380510441, "grad_norm": 0.02585572935640812, "grad_norm_var": 4.037448657960458e-07, "learning_rate": 0.00018783424515456614, "loss": 2.3174, "step": 25218 }, { "crossentropy": 2.5555663108825684, "epoch": 0.9142618909512761, "grad_norm": 0.026555834338068962, "grad_norm_var": 3.9059815716093774e-07, "learning_rate": 0.00018767650144294103, "loss": 2.4991, "step": 25219 }, { "crossentropy": 2.4059085845947266, "epoch": 0.9142981438515081, "grad_norm": 0.025813421234488487, "grad_norm_var": 3.8993240718352556e-07, "learning_rate": 0.00018751882272904807, "loss": 2.3926, "step": 25220 }, { "crossentropy": 2.405667304992676, "epoch": 0.9143343967517401, "grad_norm": 0.027078239247202873, "grad_norm_var": 3.700700896351229e-07, "learning_rate": 0.0001873612090150162, "loss": 2.5188, "step": 25221 }, { "crossentropy": 2.4786651134490967, "epoch": 0.9143706496519721, "grad_norm": 0.02636800706386566, "grad_norm_var": 3.695661301307869e-07, "learning_rate": 0.00018720366030297532, "loss": 2.5034, "step": 25222 }, { "crossentropy": 2.3175466060638428, "epoch": 0.9144069025522041, "grad_norm": 0.02549762651324272, "grad_norm_var": 3.679904785250694e-07, "learning_rate": 0.00018704617659505153, "loss": 2.3563, "step": 25223 }, { "crossentropy": 2.35412335395813, "epoch": 0.9144431554524362, "grad_norm": 0.025695765390992165, "grad_norm_var": 3.974775233508513e-07, "learning_rate": 0.00018688875789337367, "loss": 2.411, "step": 25224 }, { "crossentropy": 2.4495112895965576, "epoch": 0.9144794083526682, "grad_norm": 0.02645341120660305, "grad_norm_var": 3.9774133851055763e-07, "learning_rate": 0.00018673140420006728, "loss": 2.3337, "step": 25225 }, { "crossentropy": 2.373244047164917, "epoch": 0.9145156612529002, "grad_norm": 0.026272686198353767, "grad_norm_var": 3.980860249492661e-07, "learning_rate": 0.00018657411551725733, "loss": 2.5177, "step": 25226 }, { "crossentropy": 2.5446105003356934, "epoch": 0.9145519141531323, "grad_norm": 0.02627789042890072, "grad_norm_var": 3.766704513995563e-07, "learning_rate": 0.00018641689184706932, "loss": 2.4777, "step": 25227 }, { "crossentropy": 2.405723810195923, "epoch": 0.9145881670533643, "grad_norm": 0.026893574744462967, "grad_norm_var": 3.9176246060209673e-07, "learning_rate": 0.00018625973319162604, "loss": 2.423, "step": 25228 }, { "crossentropy": 2.4674808979034424, "epoch": 0.9146244199535963, "grad_norm": 0.02676011621952057, "grad_norm_var": 3.9918871499964685e-07, "learning_rate": 0.00018610263955304963, "loss": 2.4135, "step": 25229 }, { "crossentropy": 2.3933515548706055, "epoch": 0.9146606728538283, "grad_norm": 0.02552827075123787, "grad_norm_var": 4.295380566577319e-07, "learning_rate": 0.00018594561093346286, "loss": 2.3861, "step": 25230 }, { "crossentropy": 2.5858752727508545, "epoch": 0.9146969257540604, "grad_norm": 0.0257994644343853, "grad_norm_var": 4.213384869281572e-07, "learning_rate": 0.00018578864733498568, "loss": 2.4965, "step": 25231 }, { "crossentropy": 2.616919994354248, "epoch": 0.9147331786542924, "grad_norm": 0.027322029694914818, "grad_norm_var": 4.6366169053462433e-07, "learning_rate": 0.00018563174875973866, "loss": 2.5004, "step": 25232 }, { "crossentropy": 2.467586040496826, "epoch": 0.9147694315545244, "grad_norm": 0.02609151229262352, "grad_norm_var": 3.0824718365191014e-07, "learning_rate": 0.00018547491520984173, "loss": 2.4013, "step": 25233 }, { "crossentropy": 2.3360445499420166, "epoch": 0.9148056844547564, "grad_norm": 0.02569405362010002, "grad_norm_var": 3.187351929942924e-07, "learning_rate": 0.00018531814668741153, "loss": 2.3938, "step": 25234 }, { "crossentropy": 2.5231876373291016, "epoch": 0.9148419373549884, "grad_norm": 0.026620574295520782, "grad_norm_var": 3.2158213030933765e-07, "learning_rate": 0.00018516144319456697, "loss": 2.532, "step": 25235 }, { "crossentropy": 2.525324821472168, "epoch": 0.9148781902552204, "grad_norm": 0.026151364669203758, "grad_norm_var": 3.0857883344160134e-07, "learning_rate": 0.0001850048047334235, "loss": 2.4915, "step": 25236 }, { "crossentropy": 2.3774759769439697, "epoch": 0.9149144431554525, "grad_norm": 0.026447493582963943, "grad_norm_var": 2.6644161335229177e-07, "learning_rate": 0.00018484823130609728, "loss": 2.4584, "step": 25237 }, { "crossentropy": 2.378518581390381, "epoch": 0.9149506960556845, "grad_norm": 0.026487922295928, "grad_norm_var": 2.693531913166517e-07, "learning_rate": 0.00018469172291470326, "loss": 2.4476, "step": 25238 }, { "crossentropy": 2.423032760620117, "epoch": 0.9149869489559165, "grad_norm": 0.025781814008951187, "grad_norm_var": 2.45906951092907e-07, "learning_rate": 0.00018453527956135418, "loss": 2.3502, "step": 25239 }, { "crossentropy": 2.523589849472046, "epoch": 0.9150232018561485, "grad_norm": 0.027220329269766808, "grad_norm_var": 2.749820550817927e-07, "learning_rate": 0.00018437890124816392, "loss": 2.5367, "step": 25240 }, { "crossentropy": 2.190049171447754, "epoch": 0.9150594547563805, "grad_norm": 0.025925910100340843, "grad_norm_var": 2.8599006555717007e-07, "learning_rate": 0.00018422258797724523, "loss": 2.361, "step": 25241 }, { "crossentropy": 2.374213218688965, "epoch": 0.9150957076566125, "grad_norm": 0.026702167466282845, "grad_norm_var": 2.942542947049435e-07, "learning_rate": 0.00018406633975070807, "loss": 2.4022, "step": 25242 }, { "crossentropy": 2.256427526473999, "epoch": 0.9151319605568445, "grad_norm": 0.026066245511174202, "grad_norm_var": 2.992730622016718e-07, "learning_rate": 0.00018391015657066411, "loss": 2.4009, "step": 25243 }, { "crossentropy": 2.4818122386932373, "epoch": 0.9151682134570766, "grad_norm": 0.026256771758198738, "grad_norm_var": 2.778959544651067e-07, "learning_rate": 0.00018375403843922224, "loss": 2.5004, "step": 25244 }, { "crossentropy": 2.2861156463623047, "epoch": 0.9152044663573086, "grad_norm": 0.027525968849658966, "grad_norm_var": 3.6118060441619793e-07, "learning_rate": 0.0001835979853584907, "loss": 2.3691, "step": 25245 }, { "crossentropy": 2.5771024227142334, "epoch": 0.9152407192575406, "grad_norm": 0.026786290109157562, "grad_norm_var": 3.220308805828668e-07, "learning_rate": 0.00018344199733057732, "loss": 2.5027, "step": 25246 }, { "crossentropy": 2.327646255493164, "epoch": 0.9152769721577726, "grad_norm": 0.02637125924229622, "grad_norm_var": 2.9439403647450273e-07, "learning_rate": 0.00018328607435758927, "loss": 2.3899, "step": 25247 }, { "crossentropy": 2.4342637062072754, "epoch": 0.9153132250580046, "grad_norm": 0.02695409767329693, "grad_norm_var": 2.608469814468587e-07, "learning_rate": 0.00018313021644163208, "loss": 2.4153, "step": 25248 }, { "crossentropy": 2.3054254055023193, "epoch": 0.9153494779582366, "grad_norm": 0.02675969898700714, "grad_norm_var": 2.574605118557093e-07, "learning_rate": 0.00018297442358481187, "loss": 2.3312, "step": 25249 }, { "crossentropy": 2.5418004989624023, "epoch": 0.9153857308584686, "grad_norm": 0.027369612827897072, "grad_norm_var": 2.5633775509978306e-07, "learning_rate": 0.0001828186957892314, "loss": 2.4058, "step": 25250 }, { "crossentropy": 2.3743467330932617, "epoch": 0.9154219837587007, "grad_norm": 0.02659366838634014, "grad_norm_var": 2.56270518631437e-07, "learning_rate": 0.0001826630330569956, "loss": 2.4103, "step": 25251 }, { "crossentropy": 2.4232521057128906, "epoch": 0.9154582366589327, "grad_norm": 0.026852255687117577, "grad_norm_var": 2.4621216365289246e-07, "learning_rate": 0.00018250743539020565, "loss": 2.4145, "step": 25252 }, { "crossentropy": 2.3119938373565674, "epoch": 0.9154944895591647, "grad_norm": 0.025718234479427338, "grad_norm_var": 2.9732746387404333e-07, "learning_rate": 0.0001823519027909637, "loss": 2.2812, "step": 25253 }, { "crossentropy": 2.223428964614868, "epoch": 0.9155307424593968, "grad_norm": 0.026808908209204674, "grad_norm_var": 2.9957945939790054e-07, "learning_rate": 0.0001821964352613703, "loss": 2.3416, "step": 25254 }, { "crossentropy": 2.3832314014434814, "epoch": 0.9155669953596288, "grad_norm": 0.025973014533519745, "grad_norm_var": 2.808574155477768e-07, "learning_rate": 0.0001820410328035249, "loss": 2.4117, "step": 25255 }, { "crossentropy": 2.388456106185913, "epoch": 0.9156032482598608, "grad_norm": 0.02624969184398651, "grad_norm_var": 2.6175968310368906e-07, "learning_rate": 0.00018188569541952693, "loss": 2.4075, "step": 25256 }, { "crossentropy": 2.2819457054138184, "epoch": 0.9156395011600929, "grad_norm": 0.026078665629029274, "grad_norm_var": 2.503621251744113e-07, "learning_rate": 0.00018173042311147526, "loss": 2.3738, "step": 25257 }, { "crossentropy": 2.429945230484009, "epoch": 0.9156757540603249, "grad_norm": 0.02678591199219227, "grad_norm_var": 2.523135207241894e-07, "learning_rate": 0.0001815752158814654, "loss": 2.4505, "step": 25258 }, { "crossentropy": 2.4116268157958984, "epoch": 0.9157120069605569, "grad_norm": 0.026655644178390503, "grad_norm_var": 2.342883463165556e-07, "learning_rate": 0.0001814200737315952, "loss": 2.4602, "step": 25259 }, { "crossentropy": 2.4485526084899902, "epoch": 0.9157482598607889, "grad_norm": 0.025130216032266617, "grad_norm_var": 3.664757121175083e-07, "learning_rate": 0.000181264996663959, "loss": 2.4209, "step": 25260 }, { "crossentropy": 2.4083609580993652, "epoch": 0.9157845127610209, "grad_norm": 0.0271364226937294, "grad_norm_var": 3.2466193563040975e-07, "learning_rate": 0.00018110998468065132, "loss": 2.4557, "step": 25261 }, { "crossentropy": 2.319709062576294, "epoch": 0.9158207656612529, "grad_norm": 0.025445854291319847, "grad_norm_var": 3.882904064592477e-07, "learning_rate": 0.00018095503778376654, "loss": 2.3577, "step": 25262 }, { "crossentropy": 2.4605331420898438, "epoch": 0.9158570185614849, "grad_norm": 0.026092972606420517, "grad_norm_var": 3.9531751176938993e-07, "learning_rate": 0.0001808001559753969, "loss": 2.386, "step": 25263 }, { "crossentropy": 2.356438398361206, "epoch": 0.915893271461717, "grad_norm": 0.026866596192121506, "grad_norm_var": 3.894808482520027e-07, "learning_rate": 0.0001806453392576346, "loss": 2.4421, "step": 25264 }, { "crossentropy": 2.505028486251831, "epoch": 0.915929524361949, "grad_norm": 0.026172585785388947, "grad_norm_var": 3.8344108651161036e-07, "learning_rate": 0.00018049058763257132, "loss": 2.5283, "step": 25265 }, { "crossentropy": 2.434769630432129, "epoch": 0.915965777262181, "grad_norm": 0.026587456464767456, "grad_norm_var": 3.174963288483009e-07, "learning_rate": 0.00018033590110229537, "loss": 2.476, "step": 25266 }, { "crossentropy": 2.4154958724975586, "epoch": 0.916002030162413, "grad_norm": 0.02589261904358864, "grad_norm_var": 3.2279669941851347e-07, "learning_rate": 0.00018018127966889786, "loss": 2.4099, "step": 25267 }, { "crossentropy": 2.346500873565674, "epoch": 0.916038283062645, "grad_norm": 0.02543765679001808, "grad_norm_var": 3.3954142912475067e-07, "learning_rate": 0.00018002672333446656, "loss": 2.2497, "step": 25268 }, { "crossentropy": 2.411405563354492, "epoch": 0.916074535962877, "grad_norm": 0.027023442089557648, "grad_norm_var": 3.6399621202206034e-07, "learning_rate": 0.00017987223210108873, "loss": 2.4117, "step": 25269 }, { "crossentropy": 2.3068857192993164, "epoch": 0.916110788863109, "grad_norm": 0.02617833763360977, "grad_norm_var": 3.43630917674674e-07, "learning_rate": 0.00017971780597085207, "loss": 2.3607, "step": 25270 }, { "crossentropy": 2.4448060989379883, "epoch": 0.916147041763341, "grad_norm": 0.02588726580142975, "grad_norm_var": 3.4704798375025506e-07, "learning_rate": 0.00017956344494583998, "loss": 2.4491, "step": 25271 }, { "crossentropy": 2.447925567626953, "epoch": 0.9161832946635731, "grad_norm": 0.02828606590628624, "grad_norm_var": 6.12566316068166e-07, "learning_rate": 0.00017940914902813966, "loss": 2.5064, "step": 25272 }, { "crossentropy": 2.417335271835327, "epoch": 0.9162195475638051, "grad_norm": 0.026711151003837585, "grad_norm_var": 6.143824833033014e-07, "learning_rate": 0.00017925491821983442, "loss": 2.4453, "step": 25273 }, { "crossentropy": 2.417480945587158, "epoch": 0.9162558004640371, "grad_norm": 0.027362769469618797, "grad_norm_var": 6.653902618544127e-07, "learning_rate": 0.00017910075252300652, "loss": 2.4763, "step": 25274 }, { "crossentropy": 2.6258933544158936, "epoch": 0.9162920533642691, "grad_norm": 0.02659979648888111, "grad_norm_var": 6.63898945549981e-07, "learning_rate": 0.0001789466519397398, "loss": 2.5595, "step": 25275 }, { "crossentropy": 2.4618773460388184, "epoch": 0.9163283062645011, "grad_norm": 0.02648398093879223, "grad_norm_var": 5.446038848812285e-07, "learning_rate": 0.00017879261647211432, "loss": 2.4787, "step": 25276 }, { "crossentropy": 2.402759313583374, "epoch": 0.9163645591647331, "grad_norm": 0.026619289070367813, "grad_norm_var": 5.181469506634317e-07, "learning_rate": 0.00017863864612221113, "loss": 2.3561, "step": 25277 }, { "crossentropy": 2.403963804244995, "epoch": 0.9164008120649652, "grad_norm": 0.02606581337749958, "grad_norm_var": 4.568511940352312e-07, "learning_rate": 0.00017848474089210976, "loss": 2.4528, "step": 25278 }, { "crossentropy": 2.337043285369873, "epoch": 0.9164370649651972, "grad_norm": 0.026458006352186203, "grad_norm_var": 4.445541688028835e-07, "learning_rate": 0.00017833090078388846, "loss": 2.3743, "step": 25279 }, { "crossentropy": 2.3429808616638184, "epoch": 0.9164733178654292, "grad_norm": 0.026120949536561966, "grad_norm_var": 4.4678889202856565e-07, "learning_rate": 0.00017817712579962564, "loss": 2.3414, "step": 25280 }, { "crossentropy": 2.5242159366607666, "epoch": 0.9165095707656613, "grad_norm": 0.025630440562963486, "grad_norm_var": 4.883167749153774e-07, "learning_rate": 0.00017802341594139848, "loss": 2.5199, "step": 25281 }, { "crossentropy": 2.4408605098724365, "epoch": 0.9165458236658933, "grad_norm": 0.026803720742464066, "grad_norm_var": 4.949421121419606e-07, "learning_rate": 0.000177869771211282, "loss": 2.4291, "step": 25282 }, { "crossentropy": 2.450151205062866, "epoch": 0.9165820765661253, "grad_norm": 0.02628493681550026, "grad_norm_var": 4.742244094774195e-07, "learning_rate": 0.0001777161916113529, "loss": 2.5379, "step": 25283 }, { "crossentropy": 2.4928643703460693, "epoch": 0.9166183294663574, "grad_norm": 0.025718390941619873, "grad_norm_var": 4.3949383354922184e-07, "learning_rate": 0.00017756267714368445, "loss": 2.4576, "step": 25284 }, { "crossentropy": 2.2904114723205566, "epoch": 0.9166545823665894, "grad_norm": 0.027316920459270477, "grad_norm_var": 4.647862995249899e-07, "learning_rate": 0.00017740922781035007, "loss": 2.3651, "step": 25285 }, { "crossentropy": 2.4600765705108643, "epoch": 0.9166908352668214, "grad_norm": 0.025749310851097107, "grad_norm_var": 4.965776625883688e-07, "learning_rate": 0.0001772558436134236, "loss": 2.4423, "step": 25286 }, { "crossentropy": 2.419804096221924, "epoch": 0.9167270881670534, "grad_norm": 0.025814183056354523, "grad_norm_var": 5.029423633890265e-07, "learning_rate": 0.00017710252455497454, "loss": 2.409, "step": 25287 }, { "crossentropy": 2.6444873809814453, "epoch": 0.9167633410672854, "grad_norm": 0.026739774271845818, "grad_norm_var": 2.844753014569068e-07, "learning_rate": 0.00017694927063707512, "loss": 2.5909, "step": 25288 }, { "crossentropy": 2.251086711883545, "epoch": 0.9167995939675174, "grad_norm": 0.027209268882870674, "grad_norm_var": 3.2031847849769055e-07, "learning_rate": 0.0001767960818617953, "loss": 2.3781, "step": 25289 }, { "crossentropy": 2.4279701709747314, "epoch": 0.9168358468677494, "grad_norm": 0.027143919840455055, "grad_norm_var": 2.962716688328348e-07, "learning_rate": 0.00017664295823120347, "loss": 2.4747, "step": 25290 }, { "crossentropy": 2.4136385917663574, "epoch": 0.9168720997679815, "grad_norm": 0.02640339359641075, "grad_norm_var": 2.9403755329563257e-07, "learning_rate": 0.00017648989974736905, "loss": 2.4016, "step": 25291 }, { "crossentropy": 2.2623329162597656, "epoch": 0.9169083526682135, "grad_norm": 0.02644779160618782, "grad_norm_var": 2.937631248427104e-07, "learning_rate": 0.00017633690641235766, "loss": 2.4138, "step": 25292 }, { "crossentropy": 2.337646484375, "epoch": 0.9169446055684455, "grad_norm": 0.026860374957323074, "grad_norm_var": 3.041914140484876e-07, "learning_rate": 0.0001761839782282365, "loss": 2.3407, "step": 25293 }, { "crossentropy": 2.4150092601776123, "epoch": 0.9169808584686775, "grad_norm": 0.027031347155570984, "grad_norm_var": 3.1648041840085365e-07, "learning_rate": 0.00017603111519707225, "loss": 2.3967, "step": 25294 }, { "crossentropy": 2.2505950927734375, "epoch": 0.9170171113689095, "grad_norm": 0.02688850462436676, "grad_norm_var": 3.2661186894638164e-07, "learning_rate": 0.00017587831732092718, "loss": 2.3358, "step": 25295 }, { "crossentropy": 2.324949026107788, "epoch": 0.9170533642691415, "grad_norm": 0.026237821206450462, "grad_norm_var": 3.213998815853695e-07, "learning_rate": 0.00017572558460186684, "loss": 2.3637, "step": 25296 }, { "crossentropy": 2.3075637817382812, "epoch": 0.9170896171693735, "grad_norm": 0.02637689746916294, "grad_norm_var": 2.6793724602750847e-07, "learning_rate": 0.00017557291704195345, "loss": 2.3197, "step": 25297 }, { "crossentropy": 2.556356191635132, "epoch": 0.9171258700696056, "grad_norm": 0.027208181098103523, "grad_norm_var": 2.9108056218644234e-07, "learning_rate": 0.0001754203146432487, "loss": 2.5108, "step": 25298 }, { "crossentropy": 2.452402353286743, "epoch": 0.9171621229698376, "grad_norm": 0.026447515934705734, "grad_norm_var": 2.861318104285226e-07, "learning_rate": 0.0001752677774078154, "loss": 2.4549, "step": 25299 }, { "crossentropy": 2.474808692932129, "epoch": 0.9171983758700696, "grad_norm": 0.026350919157266617, "grad_norm_var": 2.3681896580002305e-07, "learning_rate": 0.00017511530533771135, "loss": 2.4679, "step": 25300 }, { "crossentropy": 2.589901924133301, "epoch": 0.9172346287703016, "grad_norm": 0.026341767981648445, "grad_norm_var": 2.081254324577299e-07, "learning_rate": 0.00017496289843499713, "loss": 2.5202, "step": 25301 }, { "crossentropy": 2.476691961288452, "epoch": 0.9172708816705336, "grad_norm": 0.0249883271753788, "grad_norm_var": 3.284203091304434e-07, "learning_rate": 0.00017481055670173274, "loss": 2.3967, "step": 25302 }, { "crossentropy": 2.509850025177002, "epoch": 0.9173071345707656, "grad_norm": 0.02771952375769615, "grad_norm_var": 3.7330689931894663e-07, "learning_rate": 0.00017465828013997264, "loss": 2.5243, "step": 25303 }, { "crossentropy": 2.5232632160186768, "epoch": 0.9173433874709976, "grad_norm": 0.02659786306321621, "grad_norm_var": 3.7286138614502487e-07, "learning_rate": 0.00017450606875177521, "loss": 2.4208, "step": 25304 }, { "crossentropy": 2.4698188304901123, "epoch": 0.9173796403712297, "grad_norm": 0.026455221697688103, "grad_norm_var": 3.5124831559925755e-07, "learning_rate": 0.00017435392253919714, "loss": 2.3352, "step": 25305 }, { "crossentropy": 2.507164478302002, "epoch": 0.9174158932714617, "grad_norm": 0.026379980146884918, "grad_norm_var": 3.316800040161748e-07, "learning_rate": 0.0001742018415042923, "loss": 2.4764, "step": 25306 }, { "crossentropy": 2.4976584911346436, "epoch": 0.9174521461716937, "grad_norm": 0.0269284900277853, "grad_norm_var": 3.3893110505064417e-07, "learning_rate": 0.00017404982564911466, "loss": 2.4219, "step": 25307 }, { "crossentropy": 2.495182752609253, "epoch": 0.9174883990719258, "grad_norm": 0.02694712020456791, "grad_norm_var": 3.4579314132730104e-07, "learning_rate": 0.00017389787497571808, "loss": 2.4369, "step": 25308 }, { "crossentropy": 2.5482728481292725, "epoch": 0.9175246519721578, "grad_norm": 0.026471927762031555, "grad_norm_var": 3.422557155611609e-07, "learning_rate": 0.0001737459894861537, "loss": 2.4423, "step": 25309 }, { "crossentropy": 2.505814790725708, "epoch": 0.9175609048723898, "grad_norm": 0.02617689035832882, "grad_norm_var": 3.3711672722685065e-07, "learning_rate": 0.00017359416918247495, "loss": 2.4274, "step": 25310 }, { "crossentropy": 2.289191484451294, "epoch": 0.9175971577726219, "grad_norm": 0.027476059272885323, "grad_norm_var": 3.8659755474147274e-07, "learning_rate": 0.00017344241406673123, "loss": 2.3623, "step": 25311 }, { "crossentropy": 2.491513967514038, "epoch": 0.9176334106728539, "grad_norm": 0.0263358261436224, "grad_norm_var": 3.828698307989838e-07, "learning_rate": 0.00017329072414097203, "loss": 2.345, "step": 25312 }, { "crossentropy": 2.3914644718170166, "epoch": 0.9176696635730859, "grad_norm": 0.025934938341379166, "grad_norm_var": 4.0676083453125973e-07, "learning_rate": 0.00017313909940724627, "loss": 2.4326, "step": 25313 }, { "crossentropy": 2.4236972332000732, "epoch": 0.9177059164733179, "grad_norm": 0.02757928892970085, "grad_norm_var": 4.4805788159725754e-07, "learning_rate": 0.00017298753986760173, "loss": 2.4393, "step": 25314 }, { "crossentropy": 2.5062153339385986, "epoch": 0.9177421693735499, "grad_norm": 0.025429878383874893, "grad_norm_var": 5.295001494459138e-07, "learning_rate": 0.00017283604552408626, "loss": 2.4559, "step": 25315 }, { "crossentropy": 2.227510452270508, "epoch": 0.9177784222737819, "grad_norm": 0.026318296790122986, "grad_norm_var": 5.302461097940932e-07, "learning_rate": 0.00017268461637874545, "loss": 2.2811, "step": 25316 }, { "crossentropy": 2.3700265884399414, "epoch": 0.9178146751740139, "grad_norm": 0.02673906460404396, "grad_norm_var": 5.314598918226687e-07, "learning_rate": 0.00017253325243362372, "loss": 2.3042, "step": 25317 }, { "crossentropy": 2.540560007095337, "epoch": 0.917850928074246, "grad_norm": 0.026139546185731888, "grad_norm_var": 3.7766356256438834e-07, "learning_rate": 0.00017238195369076727, "loss": 2.4665, "step": 25318 }, { "crossentropy": 2.2869651317596436, "epoch": 0.917887180974478, "grad_norm": 0.026562822982668877, "grad_norm_var": 2.889136844749206e-07, "learning_rate": 0.00017223072015221886, "loss": 2.3378, "step": 25319 }, { "crossentropy": 2.395305633544922, "epoch": 0.91792343387471, "grad_norm": 0.027367189526557922, "grad_norm_var": 3.329098146568106e-07, "learning_rate": 0.0001720795518200202, "loss": 2.3133, "step": 25320 }, { "crossentropy": 2.3851773738861084, "epoch": 0.917959686774942, "grad_norm": 0.025420546531677246, "grad_norm_var": 4.1671038038583926e-07, "learning_rate": 0.00017192844869621472, "loss": 2.3582, "step": 25321 }, { "crossentropy": 2.5010313987731934, "epoch": 0.917995939675174, "grad_norm": 0.027944646775722504, "grad_norm_var": 5.419725821219819e-07, "learning_rate": 0.00017177741078284182, "loss": 2.4433, "step": 25322 }, { "crossentropy": 2.3739397525787354, "epoch": 0.918032192575406, "grad_norm": 0.0261367317289114, "grad_norm_var": 5.476130657206905e-07, "learning_rate": 0.00017162643808194211, "loss": 2.3517, "step": 25323 }, { "crossentropy": 2.485994577407837, "epoch": 0.918068445475638, "grad_norm": 0.02616235800087452, "grad_norm_var": 5.457333495109222e-07, "learning_rate": 0.00017147553059555453, "loss": 2.4629, "step": 25324 }, { "crossentropy": 2.3606863021850586, "epoch": 0.91810469837587, "grad_norm": 0.026674244552850723, "grad_norm_var": 5.472038685454146e-07, "learning_rate": 0.00017132468832571746, "loss": 2.4226, "step": 25325 }, { "crossentropy": 2.3509809970855713, "epoch": 0.9181409512761021, "grad_norm": 0.027829090133309364, "grad_norm_var": 6.411509070700094e-07, "learning_rate": 0.0001711739112744687, "loss": 2.5006, "step": 25326 }, { "crossentropy": 2.567491292953491, "epoch": 0.9181772041763341, "grad_norm": 0.02818651683628559, "grad_norm_var": 7.530174899854821e-07, "learning_rate": 0.0001710231994438438, "loss": 2.4125, "step": 25327 }, { "crossentropy": 2.3571596145629883, "epoch": 0.9182134570765661, "grad_norm": 0.02606080286204815, "grad_norm_var": 7.700928667968288e-07, "learning_rate": 0.000170872552835879, "loss": 2.4201, "step": 25328 }, { "crossentropy": 2.461259126663208, "epoch": 0.9182497099767981, "grad_norm": 0.026791684329509735, "grad_norm_var": 7.336714896910915e-07, "learning_rate": 0.00017072197145260925, "loss": 2.4737, "step": 25329 }, { "crossentropy": 2.4780468940734863, "epoch": 0.9182859628770301, "grad_norm": 0.02716663107275963, "grad_norm_var": 6.964257487704698e-07, "learning_rate": 0.00017057145529606743, "loss": 2.4687, "step": 25330 }, { "crossentropy": 2.2991631031036377, "epoch": 0.9183222157772621, "grad_norm": 0.026561470702290535, "grad_norm_var": 5.873680226354255e-07, "learning_rate": 0.00017042100436828744, "loss": 2.4023, "step": 25331 }, { "crossentropy": 2.5987727642059326, "epoch": 0.9183584686774942, "grad_norm": 0.02754468098282814, "grad_norm_var": 6.101479707594359e-07, "learning_rate": 0.00017027061867130045, "loss": 2.6219, "step": 25332 }, { "crossentropy": 2.494018077850342, "epoch": 0.9183947215777262, "grad_norm": 0.026593534275889397, "grad_norm_var": 6.132459131833375e-07, "learning_rate": 0.00017012029820713816, "loss": 2.4388, "step": 25333 }, { "crossentropy": 2.50144362449646, "epoch": 0.9184309744779582, "grad_norm": 0.0254546906799078, "grad_norm_var": 7.048235061873995e-07, "learning_rate": 0.00016997004297783115, "loss": 2.4295, "step": 25334 }, { "crossentropy": 2.4621806144714355, "epoch": 0.9184672273781903, "grad_norm": 0.02691398561000824, "grad_norm_var": 7.02427539540397e-07, "learning_rate": 0.00016981985298540836, "loss": 2.5346, "step": 25335 }, { "crossentropy": 2.5094025135040283, "epoch": 0.9185034802784223, "grad_norm": 0.02685135416686535, "grad_norm_var": 6.800855811814882e-07, "learning_rate": 0.00016966972823189875, "loss": 2.481, "step": 25336 }, { "crossentropy": 2.2918620109558105, "epoch": 0.9185397331786543, "grad_norm": 0.026045074686408043, "grad_norm_var": 5.922339156689642e-07, "learning_rate": 0.0001695196687193301, "loss": 2.3404, "step": 25337 }, { "crossentropy": 2.323716402053833, "epoch": 0.9185759860788864, "grad_norm": 0.025352410972118378, "grad_norm_var": 6.191265802271695e-07, "learning_rate": 0.00016936967444972805, "loss": 2.2948, "step": 25338 }, { "crossentropy": 2.522027015686035, "epoch": 0.9186122389791184, "grad_norm": 0.026441672816872597, "grad_norm_var": 6.042594423489104e-07, "learning_rate": 0.00016921974542511987, "loss": 2.5378, "step": 25339 }, { "crossentropy": 2.3308637142181396, "epoch": 0.9186484918793504, "grad_norm": 0.026513537392020226, "grad_norm_var": 5.884603812893185e-07, "learning_rate": 0.00016906988164752947, "loss": 2.3497, "step": 25340 }, { "crossentropy": 2.4580349922180176, "epoch": 0.9186847447795824, "grad_norm": 0.02555875852704048, "grad_norm_var": 6.680281335540162e-07, "learning_rate": 0.00016892008311898132, "loss": 2.4284, "step": 25341 }, { "crossentropy": 2.427616834640503, "epoch": 0.9187209976798144, "grad_norm": 0.027248233556747437, "grad_norm_var": 5.952123318222428e-07, "learning_rate": 0.0001687703498414994, "loss": 2.4274, "step": 25342 }, { "crossentropy": 2.496110439300537, "epoch": 0.9187572505800464, "grad_norm": 0.025813136249780655, "grad_norm_var": 4.3898701685696204e-07, "learning_rate": 0.00016862068181710543, "loss": 2.4231, "step": 25343 }, { "crossentropy": 2.479855537414551, "epoch": 0.9187935034802784, "grad_norm": 0.02653990313410759, "grad_norm_var": 4.2962235904755485e-07, "learning_rate": 0.0001684710790478211, "loss": 2.5033, "step": 25344 }, { "crossentropy": 2.4321563243865967, "epoch": 0.9188297563805105, "grad_norm": 0.026037976145744324, "grad_norm_var": 4.319878801040209e-07, "learning_rate": 0.00016832154153566703, "loss": 2.3769, "step": 25345 }, { "crossentropy": 2.457754611968994, "epoch": 0.9188660092807425, "grad_norm": 0.02657288685441017, "grad_norm_var": 3.9450300382870965e-07, "learning_rate": 0.0001681720692826627, "loss": 2.4275, "step": 25346 }, { "crossentropy": 2.5191802978515625, "epoch": 0.9189022621809745, "grad_norm": 0.02664373070001602, "grad_norm_var": 3.9694144685810627e-07, "learning_rate": 0.00016802266229082763, "loss": 2.4543, "step": 25347 }, { "crossentropy": 2.41215181350708, "epoch": 0.9189385150812065, "grad_norm": 0.026985106989741325, "grad_norm_var": 3.2982742902465795e-07, "learning_rate": 0.00016787332056217852, "loss": 2.4217, "step": 25348 }, { "crossentropy": 2.4671881198883057, "epoch": 0.9189747679814385, "grad_norm": 0.026248376816511154, "grad_norm_var": 3.259677784735992e-07, "learning_rate": 0.00016772404409873322, "loss": 2.3528, "step": 25349 }, { "crossentropy": 2.4493536949157715, "epoch": 0.9190110208816705, "grad_norm": 0.02593705616891384, "grad_norm_var": 2.8445201485828804e-07, "learning_rate": 0.00016757483290250842, "loss": 2.5356, "step": 25350 }, { "crossentropy": 2.325953483581543, "epoch": 0.9190472737819025, "grad_norm": 0.027108874171972275, "grad_norm_var": 3.013135023241057e-07, "learning_rate": 0.00016742568697551864, "loss": 2.3949, "step": 25351 }, { "crossentropy": 2.6119019985198975, "epoch": 0.9190835266821346, "grad_norm": 0.026665017008781433, "grad_norm_var": 2.9149035295296795e-07, "learning_rate": 0.00016727660631977892, "loss": 2.5698, "step": 25352 }, { "crossentropy": 2.541754961013794, "epoch": 0.9191197795823666, "grad_norm": 0.026226229965686798, "grad_norm_var": 2.860075486128706e-07, "learning_rate": 0.00016712759093730323, "loss": 2.4766, "step": 25353 }, { "crossentropy": 2.455838918685913, "epoch": 0.9191560324825986, "grad_norm": 0.025727257132530212, "grad_norm_var": 2.4401545211978857e-07, "learning_rate": 0.00016697864083010217, "loss": 2.4196, "step": 25354 }, { "crossentropy": 2.3696954250335693, "epoch": 0.9191922853828306, "grad_norm": 0.02642975002527237, "grad_norm_var": 2.4394494962247324e-07, "learning_rate": 0.00016682975600019022, "loss": 2.3825, "step": 25355 }, { "crossentropy": 2.5632803440093994, "epoch": 0.9192285382830626, "grad_norm": 0.026339950039982796, "grad_norm_var": 2.429918649494844e-07, "learning_rate": 0.00016668093644957584, "loss": 2.4926, "step": 25356 }, { "crossentropy": 2.4692513942718506, "epoch": 0.9192647911832946, "grad_norm": 0.026758279651403427, "grad_norm_var": 2.0155142268136508e-07, "learning_rate": 0.00016653218218027068, "loss": 2.4556, "step": 25357 }, { "crossentropy": 2.367687702178955, "epoch": 0.9193010440835266, "grad_norm": 0.026484597474336624, "grad_norm_var": 1.5724333740974907e-07, "learning_rate": 0.0001663834931942837, "loss": 2.3449, "step": 25358 }, { "crossentropy": 2.350878953933716, "epoch": 0.9193372969837587, "grad_norm": 0.02564348094165325, "grad_norm_var": 1.7248455294969351e-07, "learning_rate": 0.0001662348694936222, "loss": 2.4212, "step": 25359 }, { "crossentropy": 2.541137933731079, "epoch": 0.9193735498839907, "grad_norm": 0.026418045163154602, "grad_norm_var": 1.710872054210925e-07, "learning_rate": 0.00016608631108029514, "loss": 2.4734, "step": 25360 }, { "crossentropy": 2.505162477493286, "epoch": 0.9194098027842227, "grad_norm": 0.02781115099787712, "grad_norm_var": 2.845676545227988e-07, "learning_rate": 0.00016593781795630868, "loss": 2.5203, "step": 25361 }, { "crossentropy": 2.393941879272461, "epoch": 0.9194460556844548, "grad_norm": 0.02586318925023079, "grad_norm_var": 3.091488144493203e-07, "learning_rate": 0.00016578939012366678, "loss": 2.4151, "step": 25362 }, { "crossentropy": 2.4653689861297607, "epoch": 0.9194823085846868, "grad_norm": 0.027259891852736473, "grad_norm_var": 3.4833053987833907e-07, "learning_rate": 0.00016564102758437506, "loss": 2.4784, "step": 25363 }, { "crossentropy": 2.287179470062256, "epoch": 0.9195185614849188, "grad_norm": 0.025820987299084663, "grad_norm_var": 3.568231761611005e-07, "learning_rate": 0.00016549273034043854, "loss": 2.4005, "step": 25364 }, { "crossentropy": 2.279872417449951, "epoch": 0.9195548143851509, "grad_norm": 0.02584201656281948, "grad_norm_var": 3.765174488787326e-07, "learning_rate": 0.000165344498393859, "loss": 2.3669, "step": 25365 }, { "crossentropy": 2.5134692192077637, "epoch": 0.9195910672853829, "grad_norm": 0.026334384456276894, "grad_norm_var": 3.620715431283556e-07, "learning_rate": 0.0001651963317466393, "loss": 2.4883, "step": 25366 }, { "crossentropy": 2.5358753204345703, "epoch": 0.9196273201856149, "grad_norm": 0.027514014393091202, "grad_norm_var": 4.0949805559329506e-07, "learning_rate": 0.00016504823040077942, "loss": 2.4917, "step": 25367 }, { "crossentropy": 2.379687547683716, "epoch": 0.9196635730858469, "grad_norm": 0.026000389829277992, "grad_norm_var": 4.1770993381022986e-07, "learning_rate": 0.00016490019435828117, "loss": 2.428, "step": 25368 }, { "crossentropy": 2.4144442081451416, "epoch": 0.9196998259860789, "grad_norm": 0.02618779055774212, "grad_norm_var": 4.187164796835725e-07, "learning_rate": 0.00016475222362114295, "loss": 2.3416, "step": 25369 }, { "crossentropy": 2.484839916229248, "epoch": 0.9197360788863109, "grad_norm": 0.027056152001023293, "grad_norm_var": 4.0949893272631116e-07, "learning_rate": 0.00016460431819136423, "loss": 2.5433, "step": 25370 }, { "crossentropy": 2.465421676635742, "epoch": 0.9197723317865429, "grad_norm": 0.02744281478226185, "grad_norm_var": 4.661454244126821e-07, "learning_rate": 0.00016445647807094177, "loss": 2.4818, "step": 25371 }, { "crossentropy": 2.3695812225341797, "epoch": 0.919808584686775, "grad_norm": 0.0271468423306942, "grad_norm_var": 4.843930082440064e-07, "learning_rate": 0.00016430870326187286, "loss": 2.3922, "step": 25372 }, { "crossentropy": 2.3918142318725586, "epoch": 0.919844837587007, "grad_norm": 0.02648092247545719, "grad_norm_var": 4.833106956899739e-07, "learning_rate": 0.0001641609937661531, "loss": 2.4216, "step": 25373 }, { "crossentropy": 2.5039138793945312, "epoch": 0.919881090487239, "grad_norm": 0.027386093512177467, "grad_norm_var": 5.224364463885285e-07, "learning_rate": 0.00016401334958577818, "loss": 2.3723, "step": 25374 }, { "crossentropy": 2.513821840286255, "epoch": 0.919917343387471, "grad_norm": 0.02666695974767208, "grad_norm_var": 4.521884408119897e-07, "learning_rate": 0.00016386577072274145, "loss": 2.5129, "step": 25375 }, { "crossentropy": 2.3742380142211914, "epoch": 0.919953596287703, "grad_norm": 0.02661729045212269, "grad_norm_var": 4.471266425820168e-07, "learning_rate": 0.00016371825717903632, "loss": 2.3897, "step": 25376 }, { "crossentropy": 2.35715389251709, "epoch": 0.919989849187935, "grad_norm": 0.026590049266815186, "grad_norm_var": 3.617587847508259e-07, "learning_rate": 0.00016357080895665623, "loss": 2.399, "step": 25377 }, { "crossentropy": 2.353685140609741, "epoch": 0.920026102088167, "grad_norm": 0.026097839698195457, "grad_norm_var": 3.409552970639555e-07, "learning_rate": 0.00016342342605759174, "loss": 2.3239, "step": 25378 }, { "crossentropy": 2.3077762126922607, "epoch": 0.9200623549883991, "grad_norm": 0.027773085981607437, "grad_norm_var": 3.989581499635441e-07, "learning_rate": 0.00016327610848383301, "loss": 2.3997, "step": 25379 }, { "crossentropy": 2.479468584060669, "epoch": 0.9200986078886311, "grad_norm": 0.026263149455189705, "grad_norm_var": 3.6024825193903033e-07, "learning_rate": 0.00016312885623737117, "loss": 2.4649, "step": 25380 }, { "crossentropy": 2.359701633453369, "epoch": 0.9201348607888631, "grad_norm": 0.026537416502833366, "grad_norm_var": 3.09762036954965e-07, "learning_rate": 0.00016298166932019354, "loss": 2.4149, "step": 25381 }, { "crossentropy": 2.476841688156128, "epoch": 0.9201711136890951, "grad_norm": 0.02641185186803341, "grad_norm_var": 3.057827693488576e-07, "learning_rate": 0.00016283454773428964, "loss": 2.4721, "step": 25382 }, { "crossentropy": 2.439828395843506, "epoch": 0.9202073665893271, "grad_norm": 0.02628442458808422, "grad_norm_var": 2.7678859194529777e-07, "learning_rate": 0.00016268749148164562, "loss": 2.4081, "step": 25383 }, { "crossentropy": 2.402139663696289, "epoch": 0.9202436194895591, "grad_norm": 0.02696996182203293, "grad_norm_var": 2.4717587699651006e-07, "learning_rate": 0.0001625405005642472, "loss": 2.4416, "step": 25384 }, { "crossentropy": 2.4575867652893066, "epoch": 0.9202798723897911, "grad_norm": 0.027477869763970375, "grad_norm_var": 2.5542806138389763e-07, "learning_rate": 0.00016239357498408158, "loss": 2.4979, "step": 25385 }, { "crossentropy": 2.3496785163879395, "epoch": 0.9203161252900232, "grad_norm": 0.026687275618314743, "grad_norm_var": 2.5257192871572187e-07, "learning_rate": 0.00016224671474313168, "loss": 2.4341, "step": 25386 }, { "crossentropy": 2.4369447231292725, "epoch": 0.9203523781902552, "grad_norm": 0.02549051307141781, "grad_norm_var": 3.240111090093417e-07, "learning_rate": 0.00016209991984338147, "loss": 2.4526, "step": 25387 }, { "crossentropy": 2.502274990081787, "epoch": 0.9203886310904872, "grad_norm": 0.027298063039779663, "grad_norm_var": 3.3485122245337173e-07, "learning_rate": 0.00016195319028681376, "loss": 2.5121, "step": 25388 }, { "crossentropy": 2.4590539932250977, "epoch": 0.9204248839907193, "grad_norm": 0.026259170845150948, "grad_norm_var": 3.4409298864215785e-07, "learning_rate": 0.00016180652607540924, "loss": 2.4002, "step": 25389 }, { "crossentropy": 2.5135111808776855, "epoch": 0.9204611368909513, "grad_norm": 0.02700597234070301, "grad_norm_var": 3.1711841138314045e-07, "learning_rate": 0.00016165992721115131, "loss": 2.4397, "step": 25390 }, { "crossentropy": 2.5036392211914062, "epoch": 0.9204973897911833, "grad_norm": 0.026614423841238022, "grad_norm_var": 3.17185638692002e-07, "learning_rate": 0.00016151339369601726, "loss": 2.5581, "step": 25391 }, { "crossentropy": 2.4916582107543945, "epoch": 0.9205336426914154, "grad_norm": 0.02614389732480049, "grad_norm_var": 3.3317117535551904e-07, "learning_rate": 0.00016136692553198772, "loss": 2.4671, "step": 25392 }, { "crossentropy": 2.3560590744018555, "epoch": 0.9205698955916474, "grad_norm": 0.025875281542539597, "grad_norm_var": 3.6786680296615507e-07, "learning_rate": 0.00016122052272104115, "loss": 2.4226, "step": 25393 }, { "crossentropy": 2.372157335281372, "epoch": 0.9206061484918794, "grad_norm": 0.025854002684354782, "grad_norm_var": 3.870761604627312e-07, "learning_rate": 0.00016107418526515427, "loss": 2.3546, "step": 25394 }, { "crossentropy": 2.560319423675537, "epoch": 0.9206424013921114, "grad_norm": 0.027777228504419327, "grad_norm_var": 3.877477353639765e-07, "learning_rate": 0.00016092791316630383, "loss": 2.45, "step": 25395 }, { "crossentropy": 2.3125526905059814, "epoch": 0.9206786542923434, "grad_norm": 0.026615863665938377, "grad_norm_var": 3.8159065337544974e-07, "learning_rate": 0.00016078170642646605, "loss": 2.3433, "step": 25396 }, { "crossentropy": 2.4232966899871826, "epoch": 0.9207149071925754, "grad_norm": 0.0257110558450222, "grad_norm_var": 4.291219399456808e-07, "learning_rate": 0.00016063556504761434, "loss": 2.4891, "step": 25397 }, { "crossentropy": 2.4528541564941406, "epoch": 0.9207511600928074, "grad_norm": 0.02632272243499756, "grad_norm_var": 4.310201719846691e-07, "learning_rate": 0.00016048948903172377, "loss": 2.424, "step": 25398 }, { "crossentropy": 2.454204797744751, "epoch": 0.9207874129930395, "grad_norm": 0.02642817795276642, "grad_norm_var": 4.277153021503179e-07, "learning_rate": 0.0001603434783807667, "loss": 2.3668, "step": 25399 }, { "crossentropy": 2.434614658355713, "epoch": 0.9208236658932715, "grad_norm": 0.025970516726374626, "grad_norm_var": 4.319456980682197e-07, "learning_rate": 0.0001601975330967148, "loss": 2.2614, "step": 25400 }, { "crossentropy": 2.5633509159088135, "epoch": 0.9208599187935035, "grad_norm": 0.027773018926382065, "grad_norm_var": 4.770235830458032e-07, "learning_rate": 0.00016005165318154103, "loss": 2.4949, "step": 25401 }, { "crossentropy": 2.5611486434936523, "epoch": 0.9208961716937355, "grad_norm": 0.027348674833774567, "grad_norm_var": 5.218318346705455e-07, "learning_rate": 0.00015990583863721375, "loss": 2.5407, "step": 25402 }, { "crossentropy": 2.222501516342163, "epoch": 0.9209324245939675, "grad_norm": 0.025677748024463654, "grad_norm_var": 4.980590615950738e-07, "learning_rate": 0.00015976008946570365, "loss": 2.3673, "step": 25403 }, { "crossentropy": 2.5178961753845215, "epoch": 0.9209686774941995, "grad_norm": 0.02642206847667694, "grad_norm_var": 4.577397264110897e-07, "learning_rate": 0.0001596144056689791, "loss": 2.4938, "step": 25404 }, { "crossentropy": 2.4109947681427, "epoch": 0.9210049303944315, "grad_norm": 0.027147287502884865, "grad_norm_var": 4.800002556171537e-07, "learning_rate": 0.00015946878724900637, "loss": 2.3692, "step": 25405 }, { "crossentropy": 2.4070370197296143, "epoch": 0.9210411832946636, "grad_norm": 0.026544643566012383, "grad_norm_var": 4.6482388057489386e-07, "learning_rate": 0.00015932323420775498, "loss": 2.4076, "step": 25406 }, { "crossentropy": 2.4559717178344727, "epoch": 0.9210774361948956, "grad_norm": 0.026111485436558723, "grad_norm_var": 4.7390974964738445e-07, "learning_rate": 0.0001591777465471883, "loss": 2.4236, "step": 25407 }, { "crossentropy": 2.531959295272827, "epoch": 0.9211136890951276, "grad_norm": 0.02712930366396904, "grad_norm_var": 4.900805246392381e-07, "learning_rate": 0.00015903232426927206, "loss": 2.5542, "step": 25408 }, { "crossentropy": 2.2714459896087646, "epoch": 0.9211499419953596, "grad_norm": 0.026591818779706955, "grad_norm_var": 4.582510698058432e-07, "learning_rate": 0.0001588869673759713, "loss": 2.4095, "step": 25409 }, { "crossentropy": 2.4151062965393066, "epoch": 0.9211861948955916, "grad_norm": 0.02631213888525963, "grad_norm_var": 4.264657612736463e-07, "learning_rate": 0.00015874167586924838, "loss": 2.5365, "step": 25410 }, { "crossentropy": 2.4166853427886963, "epoch": 0.9212224477958236, "grad_norm": 0.02595791406929493, "grad_norm_var": 3.52070285924005e-07, "learning_rate": 0.00015859644975106668, "loss": 2.4778, "step": 25411 }, { "crossentropy": 2.4377925395965576, "epoch": 0.9212587006960556, "grad_norm": 0.026356052607297897, "grad_norm_var": 3.52414975252517e-07, "learning_rate": 0.0001584512890233869, "loss": 2.3759, "step": 25412 }, { "crossentropy": 2.4104702472686768, "epoch": 0.9212949535962877, "grad_norm": 0.025707537308335304, "grad_norm_var": 3.5278014433176576e-07, "learning_rate": 0.0001583061936881691, "loss": 2.4123, "step": 25413 }, { "crossentropy": 2.5249509811401367, "epoch": 0.9213312064965197, "grad_norm": 0.027429124340415, "grad_norm_var": 4.049697128061334e-07, "learning_rate": 0.00015816116374737455, "loss": 2.418, "step": 25414 }, { "crossentropy": 2.4370296001434326, "epoch": 0.9213674593967517, "grad_norm": 0.026155970990657806, "grad_norm_var": 4.142660706973744e-07, "learning_rate": 0.0001580161992029605, "loss": 2.4494, "step": 25415 }, { "crossentropy": 2.411731243133545, "epoch": 0.9214037122969838, "grad_norm": 0.026461515575647354, "grad_norm_var": 3.920706921576103e-07, "learning_rate": 0.00015787130005688545, "loss": 2.3707, "step": 25416 }, { "crossentropy": 2.4597089290618896, "epoch": 0.9214399651972158, "grad_norm": 0.02635752223432064, "grad_norm_var": 2.9032272986181417e-07, "learning_rate": 0.00015772646631110777, "loss": 2.4955, "step": 25417 }, { "crossentropy": 2.3234291076660156, "epoch": 0.9214762180974478, "grad_norm": 0.02652497962117195, "grad_norm_var": 2.3753569396351738e-07, "learning_rate": 0.00015758169796758147, "loss": 2.4481, "step": 25418 }, { "crossentropy": 2.475032329559326, "epoch": 0.9215124709976799, "grad_norm": 0.025919795036315918, "grad_norm_var": 2.169056436957754e-07, "learning_rate": 0.00015743699502826391, "loss": 2.4471, "step": 25419 }, { "crossentropy": 2.522303819656372, "epoch": 0.9215487238979119, "grad_norm": 0.02659096010029316, "grad_norm_var": 2.1815913580584129e-07, "learning_rate": 0.00015729235749510796, "loss": 2.5401, "step": 25420 }, { "crossentropy": 2.3401334285736084, "epoch": 0.9215849767981439, "grad_norm": 0.026542386040091515, "grad_norm_var": 1.8528380312151279e-07, "learning_rate": 0.00015714778537006814, "loss": 2.418, "step": 25421 }, { "crossentropy": 2.3119566440582275, "epoch": 0.9216212296983759, "grad_norm": 0.025498157367110252, "grad_norm_var": 2.3610376901882686e-07, "learning_rate": 0.0001570032786550968, "loss": 2.3297, "step": 25422 }, { "crossentropy": 2.527531385421753, "epoch": 0.9216574825986079, "grad_norm": 0.026516033336520195, "grad_norm_var": 2.3330973932111688e-07, "learning_rate": 0.00015685883735214512, "loss": 2.5016, "step": 25423 }, { "crossentropy": 2.357022762298584, "epoch": 0.9216937354988399, "grad_norm": 0.02550499700009823, "grad_norm_var": 2.3553844118102903e-07, "learning_rate": 0.00015671446146316438, "loss": 2.4106, "step": 25424 }, { "crossentropy": 2.394775867462158, "epoch": 0.9217299883990719, "grad_norm": 0.025681620463728905, "grad_norm_var": 2.4907225721002015e-07, "learning_rate": 0.00015657015099010575, "loss": 2.3735, "step": 25425 }, { "crossentropy": 2.425798177719116, "epoch": 0.921766241299304, "grad_norm": 0.026318544521927834, "grad_norm_var": 2.4915369204029683e-07, "learning_rate": 0.00015642590593491657, "loss": 2.5151, "step": 25426 }, { "crossentropy": 2.5127296447753906, "epoch": 0.921802494199536, "grad_norm": 0.026017997413873672, "grad_norm_var": 2.4727816018722353e-07, "learning_rate": 0.00015628172629954696, "loss": 2.482, "step": 25427 }, { "crossentropy": 2.4055824279785156, "epoch": 0.921838747099768, "grad_norm": 0.02566489391028881, "grad_norm_var": 2.6496054812360073e-07, "learning_rate": 0.00015613761208594257, "loss": 2.3764, "step": 25428 }, { "crossentropy": 2.375680446624756, "epoch": 0.921875, "grad_norm": 0.02527318149805069, "grad_norm_var": 3.0415792732519743e-07, "learning_rate": 0.00015599356329605018, "loss": 2.3996, "step": 25429 }, { "crossentropy": 2.4350850582122803, "epoch": 0.921911252900232, "grad_norm": 0.027929255738854408, "grad_norm_var": 4.0484811262466124e-07, "learning_rate": 0.00015584957993181658, "loss": 2.4, "step": 25430 }, { "crossentropy": 2.354130744934082, "epoch": 0.921947505800464, "grad_norm": 0.027965575456619263, "grad_norm_var": 6.025437586459325e-07, "learning_rate": 0.00015570566199518522, "loss": 2.3811, "step": 25431 }, { "crossentropy": 2.4096362590789795, "epoch": 0.921983758700696, "grad_norm": 0.02669239230453968, "grad_norm_var": 6.109099797243911e-07, "learning_rate": 0.00015556180948810006, "loss": 2.3767, "step": 25432 }, { "crossentropy": 2.4374184608459473, "epoch": 0.9220200116009281, "grad_norm": 0.02648647129535675, "grad_norm_var": 6.127251329691881e-07, "learning_rate": 0.00015541802241250514, "loss": 2.4323, "step": 25433 }, { "crossentropy": 2.3780016899108887, "epoch": 0.9220562645011601, "grad_norm": 0.027161717414855957, "grad_norm_var": 6.554288354436444e-07, "learning_rate": 0.0001552743007703411, "loss": 2.4782, "step": 25434 }, { "crossentropy": 2.427198648452759, "epoch": 0.9220925174013921, "grad_norm": 0.02730226330459118, "grad_norm_var": 6.936915674742114e-07, "learning_rate": 0.00015513064456354974, "loss": 2.4247, "step": 25435 }, { "crossentropy": 2.5017004013061523, "epoch": 0.9221287703016241, "grad_norm": 0.026943909004330635, "grad_norm_var": 7.0826844903112e-07, "learning_rate": 0.00015498705379407175, "loss": 2.4921, "step": 25436 }, { "crossentropy": 2.474006414413452, "epoch": 0.9221650232018561, "grad_norm": 0.02662498503923416, "grad_norm_var": 7.095062453251848e-07, "learning_rate": 0.00015484352846384608, "loss": 2.4741, "step": 25437 }, { "crossentropy": 2.3342504501342773, "epoch": 0.9222012761020881, "grad_norm": 0.02527168206870556, "grad_norm_var": 7.4217538626233e-07, "learning_rate": 0.00015470006857481178, "loss": 2.3166, "step": 25438 }, { "crossentropy": 2.384075880050659, "epoch": 0.9222375290023201, "grad_norm": 0.026655886322259903, "grad_norm_var": 7.444478935630563e-07, "learning_rate": 0.00015455667412890507, "loss": 2.4555, "step": 25439 }, { "crossentropy": 2.405893325805664, "epoch": 0.9222737819025522, "grad_norm": 0.025992758572101593, "grad_norm_var": 6.966586041672977e-07, "learning_rate": 0.00015441334512806382, "loss": 2.3902, "step": 25440 }, { "crossentropy": 2.4975359439849854, "epoch": 0.9223100348027842, "grad_norm": 0.025931574404239655, "grad_norm_var": 6.733242532913254e-07, "learning_rate": 0.00015427008157422483, "loss": 2.5009, "step": 25441 }, { "crossentropy": 2.435023307800293, "epoch": 0.9223462877030162, "grad_norm": 0.026664352044463158, "grad_norm_var": 6.717599944040896e-07, "learning_rate": 0.000154126883469321, "loss": 2.4647, "step": 25442 }, { "crossentropy": 2.410085678100586, "epoch": 0.9223825406032483, "grad_norm": 0.026712771505117416, "grad_norm_var": 6.539266981196116e-07, "learning_rate": 0.00015398375081528804, "loss": 2.4455, "step": 25443 }, { "crossentropy": 2.397860527038574, "epoch": 0.9224187935034803, "grad_norm": 0.025502294301986694, "grad_norm_var": 6.754099876035702e-07, "learning_rate": 0.00015384068361405824, "loss": 2.4243, "step": 25444 }, { "crossentropy": 2.5225181579589844, "epoch": 0.9224550464037123, "grad_norm": 0.026698624715209007, "grad_norm_var": 5.560369322347384e-07, "learning_rate": 0.0001536976818675645, "loss": 2.4757, "step": 25445 }, { "crossentropy": 2.4931418895721436, "epoch": 0.9224912993039444, "grad_norm": 0.026527907699346542, "grad_norm_var": 4.4134280745983245e-07, "learning_rate": 0.00015355474557773863, "loss": 2.4941, "step": 25446 }, { "crossentropy": 2.3481667041778564, "epoch": 0.9225275522041764, "grad_norm": 0.026717636734247208, "grad_norm_var": 3.066226077101548e-07, "learning_rate": 0.00015341187474650963, "loss": 2.3474, "step": 25447 }, { "crossentropy": 2.251234292984009, "epoch": 0.9225638051044084, "grad_norm": 0.025716423988342285, "grad_norm_var": 3.4020171039866357e-07, "learning_rate": 0.00015326906937580877, "loss": 2.3414, "step": 25448 }, { "crossentropy": 2.3378474712371826, "epoch": 0.9226000580046404, "grad_norm": 0.02603842131793499, "grad_norm_var": 3.4949162537462055e-07, "learning_rate": 0.00015312632946756443, "loss": 2.3829, "step": 25449 }, { "crossentropy": 2.572690010070801, "epoch": 0.9226363109048724, "grad_norm": 0.025545237585902214, "grad_norm_var": 3.4948258004414923e-07, "learning_rate": 0.00015298365502370404, "loss": 2.4751, "step": 25450 }, { "crossentropy": 2.457167387008667, "epoch": 0.9226725638051044, "grad_norm": 0.026052339002490044, "grad_norm_var": 2.8057996126020816e-07, "learning_rate": 0.00015284104604615546, "loss": 2.4971, "step": 25451 }, { "crossentropy": 2.5008301734924316, "epoch": 0.9227088167053364, "grad_norm": 0.027110571041703224, "grad_norm_var": 2.982957261827285e-07, "learning_rate": 0.0001526985025368438, "loss": 2.431, "step": 25452 }, { "crossentropy": 2.4198944568634033, "epoch": 0.9227450696055685, "grad_norm": 0.026355648413300514, "grad_norm_var": 2.8883242763783994e-07, "learning_rate": 0.0001525560244976948, "loss": 2.4305, "step": 25453 }, { "crossentropy": 2.4889254570007324, "epoch": 0.9227813225058005, "grad_norm": 0.02661852352321148, "grad_norm_var": 2.321988027647696e-07, "learning_rate": 0.00015241361193063407, "loss": 2.4327, "step": 25454 }, { "crossentropy": 2.4110846519470215, "epoch": 0.9228175754060325, "grad_norm": 0.026170289143919945, "grad_norm_var": 2.2406006801599782e-07, "learning_rate": 0.00015227126483758347, "loss": 2.4649, "step": 25455 }, { "crossentropy": 2.4437615871429443, "epoch": 0.9228538283062645, "grad_norm": 0.02615448087453842, "grad_norm_var": 2.1966887682269415e-07, "learning_rate": 0.00015212898322046532, "loss": 2.387, "step": 25456 }, { "crossentropy": 2.49236798286438, "epoch": 0.9228900812064965, "grad_norm": 0.027016708627343178, "grad_norm_var": 2.42516385532722e-07, "learning_rate": 0.00015198676708120306, "loss": 2.5027, "step": 25457 }, { "crossentropy": 2.556816339492798, "epoch": 0.9229263341067285, "grad_norm": 0.025665003806352615, "grad_norm_var": 2.630672291527852e-07, "learning_rate": 0.00015184461642171632, "loss": 2.4582, "step": 25458 }, { "crossentropy": 2.4838790893554688, "epoch": 0.9229625870069605, "grad_norm": 0.025509750470519066, "grad_norm_var": 2.853351079870036e-07, "learning_rate": 0.00015170253124392575, "loss": 2.3861, "step": 25459 }, { "crossentropy": 2.1854195594787598, "epoch": 0.9229988399071926, "grad_norm": 0.02645784057676792, "grad_norm_var": 2.5191841769415094e-07, "learning_rate": 0.00015156051154974925, "loss": 2.2482, "step": 25460 }, { "crossentropy": 2.457507610321045, "epoch": 0.9230350928074246, "grad_norm": 0.02649443782866001, "grad_norm_var": 2.429151606003621e-07, "learning_rate": 0.0001514185573411059, "loss": 2.4591, "step": 25461 }, { "crossentropy": 2.505641460418701, "epoch": 0.9230713457076566, "grad_norm": 0.026088619604706764, "grad_norm_var": 2.3925207685386124e-07, "learning_rate": 0.00015127666861991407, "loss": 2.4193, "step": 25462 }, { "crossentropy": 2.335515260696411, "epoch": 0.9231075986078886, "grad_norm": 0.0257056076079607, "grad_norm_var": 2.3773372313886687e-07, "learning_rate": 0.00015113484538808842, "loss": 2.3995, "step": 25463 }, { "crossentropy": 2.4896020889282227, "epoch": 0.9231438515081206, "grad_norm": 0.026457808911800385, "grad_norm_var": 2.27374520427601e-07, "learning_rate": 0.0001509930876475446, "loss": 2.417, "step": 25464 }, { "crossentropy": 2.251443386077881, "epoch": 0.9231801044083526, "grad_norm": 0.02653013914823532, "grad_norm_var": 2.3090397462492233e-07, "learning_rate": 0.0001508513954001983, "loss": 2.3381, "step": 25465 }, { "crossentropy": 2.5119993686676025, "epoch": 0.9232163573085846, "grad_norm": 0.02635618858039379, "grad_norm_var": 1.962556045555693e-07, "learning_rate": 0.00015070976864796194, "loss": 2.4976, "step": 25466 }, { "crossentropy": 2.466644048690796, "epoch": 0.9232526102088167, "grad_norm": 0.02679477445781231, "grad_norm_var": 2.0653668241129204e-07, "learning_rate": 0.00015056820739275056, "loss": 2.4322, "step": 25467 }, { "crossentropy": 2.475757360458374, "epoch": 0.9232888631090487, "grad_norm": 0.02596314810216427, "grad_norm_var": 1.7137704268517748e-07, "learning_rate": 0.00015042671163647382, "loss": 2.4341, "step": 25468 }, { "crossentropy": 2.402184247970581, "epoch": 0.9233251160092807, "grad_norm": 0.028679028153419495, "grad_norm_var": 5.349231049595378e-07, "learning_rate": 0.00015028528138104347, "loss": 2.474, "step": 25469 }, { "crossentropy": 2.523465394973755, "epoch": 0.9233613689095128, "grad_norm": 0.026608821004629135, "grad_norm_var": 5.346675035836079e-07, "learning_rate": 0.0001501439166283708, "loss": 2.5271, "step": 25470 }, { "crossentropy": 2.432650089263916, "epoch": 0.9233976218097448, "grad_norm": 0.02599933184683323, "grad_norm_var": 5.420901849568019e-07, "learning_rate": 0.0001500026173803648, "loss": 2.4525, "step": 25471 }, { "crossentropy": 2.3969783782958984, "epoch": 0.9234338747099768, "grad_norm": 0.027927370741963387, "grad_norm_var": 6.792923352117678e-07, "learning_rate": 0.00014986138363893287, "loss": 2.4844, "step": 25472 }, { "crossentropy": 2.397547721862793, "epoch": 0.9234701276102089, "grad_norm": 0.02744019404053688, "grad_norm_var": 7.187784651023488e-07, "learning_rate": 0.000149720215405984, "loss": 2.3577, "step": 25473 }, { "crossentropy": 2.5137887001037598, "epoch": 0.9235063805104409, "grad_norm": 0.026518696919083595, "grad_norm_var": 6.64460066479563e-07, "learning_rate": 0.00014957911268342338, "loss": 2.4519, "step": 25474 }, { "crossentropy": 2.2894492149353027, "epoch": 0.9235426334106729, "grad_norm": 0.026653742417693138, "grad_norm_var": 5.806072598414965e-07, "learning_rate": 0.00014943807547315835, "loss": 2.3987, "step": 25475 }, { "crossentropy": 2.4992055892944336, "epoch": 0.9235788863109049, "grad_norm": 0.028089037165045738, "grad_norm_var": 7.013657450671464e-07, "learning_rate": 0.0001492971037770924, "loss": 2.5046, "step": 25476 }, { "crossentropy": 2.4716012477874756, "epoch": 0.9236151392111369, "grad_norm": 0.025728942826390266, "grad_norm_var": 7.66031916425699e-07, "learning_rate": 0.0001491561975971306, "loss": 2.4532, "step": 25477 }, { "crossentropy": 2.355410099029541, "epoch": 0.9236513921113689, "grad_norm": 0.026488255709409714, "grad_norm_var": 7.422993029910022e-07, "learning_rate": 0.00014901535693517598, "loss": 2.4191, "step": 25478 }, { "crossentropy": 2.372494697570801, "epoch": 0.9236876450116009, "grad_norm": 0.026869256049394608, "grad_norm_var": 6.654596924087018e-07, "learning_rate": 0.00014887458179313085, "loss": 2.394, "step": 25479 }, { "crossentropy": 2.4762299060821533, "epoch": 0.923723897911833, "grad_norm": 0.02592444233596325, "grad_norm_var": 7.089292585947422e-07, "learning_rate": 0.00014873387217289536, "loss": 2.4927, "step": 25480 }, { "crossentropy": 2.4699230194091797, "epoch": 0.923760150812065, "grad_norm": 0.026530014351010323, "grad_norm_var": 7.089335121824297e-07, "learning_rate": 0.00014859322807637187, "loss": 2.4203, "step": 25481 }, { "crossentropy": 2.5234644412994385, "epoch": 0.923796403712297, "grad_norm": 0.026380768045783043, "grad_norm_var": 7.075636409323149e-07, "learning_rate": 0.00014845264950545835, "loss": 2.4344, "step": 25482 }, { "crossentropy": 2.5875422954559326, "epoch": 0.923832656612529, "grad_norm": 0.02620244398713112, "grad_norm_var": 7.28896972577014e-07, "learning_rate": 0.00014831213646205487, "loss": 2.4978, "step": 25483 }, { "crossentropy": 2.3336029052734375, "epoch": 0.923868909512761, "grad_norm": 0.026539267972111702, "grad_norm_var": 6.891820291385353e-07, "learning_rate": 0.00014817168894805833, "loss": 2.4164, "step": 25484 }, { "crossentropy": 2.3945767879486084, "epoch": 0.923905162412993, "grad_norm": 0.02578897215425968, "grad_norm_var": 4.818345545922933e-07, "learning_rate": 0.00014803130696536604, "loss": 2.3755, "step": 25485 }, { "crossentropy": 2.3899993896484375, "epoch": 0.923941415313225, "grad_norm": 0.025742165744304657, "grad_norm_var": 5.284052568222854e-07, "learning_rate": 0.00014789099051587486, "loss": 2.4636, "step": 25486 }, { "crossentropy": 2.2837681770324707, "epoch": 0.9239776682134571, "grad_norm": 0.027389269322156906, "grad_norm_var": 5.468327966081747e-07, "learning_rate": 0.0001477507396014788, "loss": 2.3846, "step": 25487 }, { "crossentropy": 2.7126529216766357, "epoch": 0.9240139211136891, "grad_norm": 0.02751222252845764, "grad_norm_var": 4.862506312716299e-07, "learning_rate": 0.0001476105542240719, "loss": 2.5528, "step": 25488 }, { "crossentropy": 2.450585126876831, "epoch": 0.9240501740139211, "grad_norm": 0.02617792971432209, "grad_norm_var": 4.465058129597572e-07, "learning_rate": 0.00014747043438554874, "loss": 2.4198, "step": 25489 }, { "crossentropy": 2.342634439468384, "epoch": 0.9240864269141531, "grad_norm": 0.02563866786658764, "grad_norm_var": 4.966417605965778e-07, "learning_rate": 0.0001473303800878012, "loss": 2.3527, "step": 25490 }, { "crossentropy": 2.3753581047058105, "epoch": 0.9241226798143851, "grad_norm": 0.02616233378648758, "grad_norm_var": 5.002498639368555e-07, "learning_rate": 0.0001471903913327216, "loss": 2.3771, "step": 25491 }, { "crossentropy": 2.495126247406006, "epoch": 0.9241589327146171, "grad_norm": 0.027174117043614388, "grad_norm_var": 3.5234765747095956e-07, "learning_rate": 0.00014705046812219901, "loss": 2.4526, "step": 25492 }, { "crossentropy": 2.416278839111328, "epoch": 0.9241951856148491, "grad_norm": 0.025779597461223602, "grad_norm_var": 3.4803944934976245e-07, "learning_rate": 0.00014691061045812525, "loss": 2.3701, "step": 25493 }, { "crossentropy": 2.3971431255340576, "epoch": 0.9242314385150812, "grad_norm": 0.026440897956490517, "grad_norm_var": 3.4758276901493815e-07, "learning_rate": 0.0001467708183423888, "loss": 2.4074, "step": 25494 }, { "crossentropy": 2.463500499725342, "epoch": 0.9242676914153132, "grad_norm": 0.025568902492523193, "grad_norm_var": 3.7030559358225356e-07, "learning_rate": 0.0001466310917768765, "loss": 2.4103, "step": 25495 }, { "crossentropy": 2.404755115509033, "epoch": 0.9243039443155452, "grad_norm": 0.027397897094488144, "grad_norm_var": 4.303485717296306e-07, "learning_rate": 0.00014649143076347738, "loss": 2.4435, "step": 25496 }, { "crossentropy": 2.3699445724487305, "epoch": 0.9243401972157773, "grad_norm": 0.026553744450211525, "grad_norm_var": 4.307900974830891e-07, "learning_rate": 0.00014635183530407715, "loss": 2.4411, "step": 25497 }, { "crossentropy": 2.3616843223571777, "epoch": 0.9243764501160093, "grad_norm": 0.025894591584801674, "grad_norm_var": 4.470090752661804e-07, "learning_rate": 0.00014621230540056042, "loss": 2.3713, "step": 25498 }, { "crossentropy": 2.3193936347961426, "epoch": 0.9244127030162413, "grad_norm": 0.02637636475265026, "grad_norm_var": 4.449517206973983e-07, "learning_rate": 0.0001460728410548129, "loss": 2.3497, "step": 25499 }, { "crossentropy": 2.5465967655181885, "epoch": 0.9244489559164734, "grad_norm": 0.028341419994831085, "grad_norm_var": 6.853510782991086e-07, "learning_rate": 0.00014593344226871696, "loss": 2.5468, "step": 25500 }, { "crossentropy": 2.686728000640869, "epoch": 0.9244852088167054, "grad_norm": 0.026860767975449562, "grad_norm_var": 6.56081488207653e-07, "learning_rate": 0.0001457941090441567, "loss": 2.5401, "step": 25501 }, { "crossentropy": 2.478820562362671, "epoch": 0.9245214617169374, "grad_norm": 0.027578605338931084, "grad_norm_var": 6.658308261685373e-07, "learning_rate": 0.00014565484138301388, "loss": 2.5227, "step": 25502 }, { "crossentropy": 2.393812417984009, "epoch": 0.9245577146171694, "grad_norm": 0.027035005390644073, "grad_norm_var": 6.400758374473185e-07, "learning_rate": 0.0001455156392871687, "loss": 2.383, "step": 25503 }, { "crossentropy": 2.424325942993164, "epoch": 0.9245939675174014, "grad_norm": 0.02574624866247177, "grad_norm_var": 6.333402508189399e-07, "learning_rate": 0.0001453765027585019, "loss": 2.4152, "step": 25504 }, { "crossentropy": 2.4806668758392334, "epoch": 0.9246302204176334, "grad_norm": 0.025930704548954964, "grad_norm_var": 6.492747468580245e-07, "learning_rate": 0.00014523743179889305, "loss": 2.4457, "step": 25505 }, { "crossentropy": 2.4093708992004395, "epoch": 0.9246664733178654, "grad_norm": 0.026306873187422752, "grad_norm_var": 5.977692544921331e-07, "learning_rate": 0.00014509842641021898, "loss": 2.375, "step": 25506 }, { "crossentropy": 2.3926050662994385, "epoch": 0.9247027262180975, "grad_norm": 0.02761806547641754, "grad_norm_var": 6.507488689844737e-07, "learning_rate": 0.00014495948659435932, "loss": 2.4054, "step": 25507 }, { "crossentropy": 2.2715320587158203, "epoch": 0.9247389791183295, "grad_norm": 0.027735410258173943, "grad_norm_var": 7.087106649595789e-07, "learning_rate": 0.00014482061235318868, "loss": 2.3399, "step": 25508 }, { "crossentropy": 2.4139342308044434, "epoch": 0.9247752320185615, "grad_norm": 0.026818206533789635, "grad_norm_var": 6.489735996960284e-07, "learning_rate": 0.00014468180368858386, "loss": 2.4754, "step": 25509 }, { "crossentropy": 2.315711498260498, "epoch": 0.9248114849187935, "grad_norm": 0.028536047786474228, "grad_norm_var": 8.334216048130517e-07, "learning_rate": 0.0001445430606024195, "loss": 2.4786, "step": 25510 }, { "crossentropy": 2.3605189323425293, "epoch": 0.9248477378190255, "grad_norm": 0.02601628378033638, "grad_norm_var": 7.669069831095123e-07, "learning_rate": 0.00014440438309656968, "loss": 2.3209, "step": 25511 }, { "crossentropy": 2.4031169414520264, "epoch": 0.9248839907192575, "grad_norm": 0.02588828094303608, "grad_norm_var": 8.134786972392264e-07, "learning_rate": 0.00014426577117290784, "loss": 2.4664, "step": 25512 }, { "crossentropy": 2.4636707305908203, "epoch": 0.9249202436194895, "grad_norm": 0.026916062459349632, "grad_norm_var": 8.084686713186531e-07, "learning_rate": 0.00014412722483330532, "loss": 2.4428, "step": 25513 }, { "crossentropy": 2.3975257873535156, "epoch": 0.9249564965197216, "grad_norm": 0.02576017566025257, "grad_norm_var": 8.267196574341176e-07, "learning_rate": 0.0001439887440796328, "loss": 2.4667, "step": 25514 }, { "crossentropy": 2.3846888542175293, "epoch": 0.9249927494199536, "grad_norm": 0.02587909810245037, "grad_norm_var": 8.73015955748085e-07, "learning_rate": 0.0001438503289137627, "loss": 2.4144, "step": 25515 }, { "crossentropy": 2.2290921211242676, "epoch": 0.9250290023201856, "grad_norm": 0.026084769517183304, "grad_norm_var": 7.30647915841524e-07, "learning_rate": 0.00014371197933756241, "loss": 2.3294, "step": 25516 }, { "crossentropy": 2.384204864501953, "epoch": 0.9250652552204176, "grad_norm": 0.027492240071296692, "grad_norm_var": 7.716816183346662e-07, "learning_rate": 0.0001435736953529021, "loss": 2.4651, "step": 25517 }, { "crossentropy": 2.3609416484832764, "epoch": 0.9251015081206496, "grad_norm": 0.02609110064804554, "grad_norm_var": 7.374774083494882e-07, "learning_rate": 0.00014343547696164861, "loss": 2.368, "step": 25518 }, { "crossentropy": 2.3897345066070557, "epoch": 0.9251377610208816, "grad_norm": 0.02691194787621498, "grad_norm_var": 7.315474904212669e-07, "learning_rate": 0.00014329732416566932, "loss": 2.3718, "step": 25519 }, { "crossentropy": 2.3847861289978027, "epoch": 0.9251740139211136, "grad_norm": 0.025578221306204796, "grad_norm_var": 7.526233606682785e-07, "learning_rate": 0.00014315923696682996, "loss": 2.354, "step": 25520 }, { "crossentropy": 2.416661024093628, "epoch": 0.9252102668213457, "grad_norm": 0.02641895040869713, "grad_norm_var": 7.241001538087066e-07, "learning_rate": 0.0001430212153669963, "loss": 2.4962, "step": 25521 }, { "crossentropy": 2.4417624473571777, "epoch": 0.9252465197215777, "grad_norm": 0.025692133232951164, "grad_norm_var": 7.740596225578316e-07, "learning_rate": 0.0001428832593680307, "loss": 2.4188, "step": 25522 }, { "crossentropy": 2.4106202125549316, "epoch": 0.9252827726218097, "grad_norm": 0.025294769555330276, "grad_norm_var": 7.928912108361713e-07, "learning_rate": 0.00014274536897179835, "loss": 2.4098, "step": 25523 }, { "crossentropy": 2.4672744274139404, "epoch": 0.9253190255220418, "grad_norm": 0.02572001703083515, "grad_norm_var": 6.998906084414488e-07, "learning_rate": 0.00014260754418016054, "loss": 2.4442, "step": 25524 }, { "crossentropy": 2.5017921924591064, "epoch": 0.9253552784222738, "grad_norm": 0.026306379586458206, "grad_norm_var": 6.821715966113968e-07, "learning_rate": 0.0001424697849949791, "loss": 2.5106, "step": 25525 }, { "crossentropy": 2.462045192718506, "epoch": 0.9253915313225058, "grad_norm": 0.027442757040262222, "grad_norm_var": 4.289781599835114e-07, "learning_rate": 0.00014233209141811532, "loss": 2.4508, "step": 25526 }, { "crossentropy": 2.4643194675445557, "epoch": 0.9254277842227379, "grad_norm": 0.027474505826830864, "grad_norm_var": 5.225962415774151e-07, "learning_rate": 0.00014219446345142773, "loss": 2.4828, "step": 25527 }, { "crossentropy": 2.211752414703369, "epoch": 0.9254640371229699, "grad_norm": 0.02649909444153309, "grad_norm_var": 5.116127309955868e-07, "learning_rate": 0.00014205690109677648, "loss": 2.3657, "step": 25528 }, { "crossentropy": 2.3416600227355957, "epoch": 0.9255002900232019, "grad_norm": 0.025640251114964485, "grad_norm_var": 5.166501497937676e-07, "learning_rate": 0.00014191940435601903, "loss": 2.3315, "step": 25529 }, { "crossentropy": 2.4896888732910156, "epoch": 0.9255365429234339, "grad_norm": 0.025582008063793182, "grad_norm_var": 5.306954842289787e-07, "learning_rate": 0.0001417819732310116, "loss": 2.5044, "step": 25530 }, { "crossentropy": 2.598752498626709, "epoch": 0.9255727958236659, "grad_norm": 0.028819937258958817, "grad_norm_var": 9.231411445083469e-07, "learning_rate": 0.0001416446077236122, "loss": 2.5886, "step": 25531 }, { "crossentropy": 2.4205055236816406, "epoch": 0.9256090487238979, "grad_norm": 0.026015786454081535, "grad_norm_var": 9.267111006183715e-07, "learning_rate": 0.0001415073078356749, "loss": 2.4292, "step": 25532 }, { "crossentropy": 2.430966377258301, "epoch": 0.92564530162413, "grad_norm": 0.026095354929566383, "grad_norm_var": 8.519881911948074e-07, "learning_rate": 0.00014137007356905375, "loss": 2.4692, "step": 25533 }, { "crossentropy": 2.383168935775757, "epoch": 0.925681554524362, "grad_norm": 0.02571827545762062, "grad_norm_var": 8.734933488439729e-07, "learning_rate": 0.0001412329049256039, "loss": 2.3904, "step": 25534 }, { "crossentropy": 2.4176273345947266, "epoch": 0.925717807424594, "grad_norm": 0.02615961991250515, "grad_norm_var": 8.500563472096789e-07, "learning_rate": 0.00014109580190717675, "loss": 2.4135, "step": 25535 }, { "crossentropy": 2.3836724758148193, "epoch": 0.925754060324826, "grad_norm": 0.026515910401940346, "grad_norm_var": 8.174415473800734e-07, "learning_rate": 0.0001409587645156246, "loss": 2.3984, "step": 25536 }, { "crossentropy": 2.4623117446899414, "epoch": 0.925790313225058, "grad_norm": 0.025741884484887123, "grad_norm_var": 8.387157407497662e-07, "learning_rate": 0.00014082179275279772, "loss": 2.4169, "step": 25537 }, { "crossentropy": 2.4578468799591064, "epoch": 0.92582656612529, "grad_norm": 0.026288695633411407, "grad_norm_var": 8.13012178285767e-07, "learning_rate": 0.00014068488662054735, "loss": 2.4486, "step": 25538 }, { "crossentropy": 2.329387903213501, "epoch": 0.925862819025522, "grad_norm": 0.026793356984853745, "grad_norm_var": 7.460811602228208e-07, "learning_rate": 0.00014054804612072147, "loss": 2.3921, "step": 25539 }, { "crossentropy": 2.381716251373291, "epoch": 0.925899071925754, "grad_norm": 0.02519460953772068, "grad_norm_var": 8.127821595583223e-07, "learning_rate": 0.00014041127125516862, "loss": 2.3228, "step": 25540 }, { "crossentropy": 2.5558438301086426, "epoch": 0.9259353248259861, "grad_norm": 0.026905030012130737, "grad_norm_var": 8.28264878149779e-07, "learning_rate": 0.0001402745620257362, "loss": 2.5037, "step": 25541 }, { "crossentropy": 2.5092148780822754, "epoch": 0.9259715777262181, "grad_norm": 0.026651781052350998, "grad_norm_var": 7.606053512333587e-07, "learning_rate": 0.00014013791843427105, "loss": 2.5004, "step": 25542 }, { "crossentropy": 2.5286858081817627, "epoch": 0.9260078306264501, "grad_norm": 0.02633327804505825, "grad_norm_var": 6.756144730639489e-07, "learning_rate": 0.00014000134048261782, "loss": 2.4866, "step": 25543 }, { "crossentropy": 2.4749395847320557, "epoch": 0.9260440835266821, "grad_norm": 0.026517225429415703, "grad_norm_var": 6.760929225905898e-07, "learning_rate": 0.00013986482817262224, "loss": 2.4747, "step": 25544 }, { "crossentropy": 2.5703048706054688, "epoch": 0.9260803364269141, "grad_norm": 0.027588551864027977, "grad_norm_var": 7.391410499751055e-07, "learning_rate": 0.00013972838150612732, "loss": 2.5198, "step": 25545 }, { "crossentropy": 2.4894392490386963, "epoch": 0.9261165893271461, "grad_norm": 0.025977954268455505, "grad_norm_var": 7.040352057500887e-07, "learning_rate": 0.00013959200048497712, "loss": 2.3941, "step": 25546 }, { "crossentropy": 2.539930820465088, "epoch": 0.9261528422273781, "grad_norm": 0.026932232081890106, "grad_norm_var": 3.3209506099597606e-07, "learning_rate": 0.00013945568511101235, "loss": 2.5533, "step": 25547 }, { "crossentropy": 2.4727091789245605, "epoch": 0.9261890951276102, "grad_norm": 0.025814035907387733, "grad_norm_var": 3.4334280929654695e-07, "learning_rate": 0.0001393194353860744, "loss": 2.4582, "step": 25548 }, { "crossentropy": 2.5073421001434326, "epoch": 0.9262253480278422, "grad_norm": 0.026494722813367844, "grad_norm_var": 3.409903394276566e-07, "learning_rate": 0.00013918325131200392, "loss": 2.4779, "step": 25549 }, { "crossentropy": 2.4496753215789795, "epoch": 0.9262616009280742, "grad_norm": 0.026150548830628395, "grad_norm_var": 3.161608945324943e-07, "learning_rate": 0.00013904713289064063, "loss": 2.4235, "step": 25550 }, { "crossentropy": 2.5420923233032227, "epoch": 0.9262978538283063, "grad_norm": 0.02674436755478382, "grad_norm_var": 3.2044947592632136e-07, "learning_rate": 0.00013891108012382248, "loss": 2.404, "step": 25551 }, { "crossentropy": 2.470668077468872, "epoch": 0.9263341067285383, "grad_norm": 0.027092507109045982, "grad_norm_var": 3.489663001470748e-07, "learning_rate": 0.00013877509301338687, "loss": 2.4591, "step": 25552 }, { "crossentropy": 2.373098850250244, "epoch": 0.9263703596287703, "grad_norm": 0.026242341846227646, "grad_norm_var": 3.1728234609710675e-07, "learning_rate": 0.00013863917156117068, "loss": 2.4117, "step": 25553 }, { "crossentropy": 2.451796293258667, "epoch": 0.9264066125290024, "grad_norm": 0.026418613269925117, "grad_norm_var": 3.1497876937940497e-07, "learning_rate": 0.0001385033157690102, "loss": 2.4304, "step": 25554 }, { "crossentropy": 2.619006872177124, "epoch": 0.9264428654292344, "grad_norm": 0.026932232081890106, "grad_norm_var": 3.217884153022445e-07, "learning_rate": 0.00013836752563874068, "loss": 2.5217, "step": 25555 }, { "crossentropy": 2.4100496768951416, "epoch": 0.9264791183294664, "grad_norm": 0.026119748130440712, "grad_norm_var": 2.143355748816424e-07, "learning_rate": 0.00013823180117219448, "loss": 2.3729, "step": 25556 }, { "crossentropy": 2.4038748741149902, "epoch": 0.9265153712296984, "grad_norm": 0.027293335646390915, "grad_norm_var": 2.4176808056014924e-07, "learning_rate": 0.0001380961423712057, "loss": 2.4067, "step": 25557 }, { "crossentropy": 2.510101318359375, "epoch": 0.9265516241299304, "grad_norm": 0.026303613558411598, "grad_norm_var": 2.4608023738668473e-07, "learning_rate": 0.00013796054923760737, "loss": 2.512, "step": 25558 }, { "crossentropy": 2.5406389236450195, "epoch": 0.9265878770301624, "grad_norm": 0.027760053053498268, "grad_norm_var": 3.3023562266045147e-07, "learning_rate": 0.0001378250217732302, "loss": 2.4624, "step": 25559 }, { "crossentropy": 2.444444417953491, "epoch": 0.9266241299303944, "grad_norm": 0.026884792372584343, "grad_norm_var": 3.3222745413752134e-07, "learning_rate": 0.0001376895599799044, "loss": 2.484, "step": 25560 }, { "crossentropy": 2.4497573375701904, "epoch": 0.9266603828306265, "grad_norm": 0.026930248364806175, "grad_norm_var": 2.788505431546827e-07, "learning_rate": 0.00013755416385945963, "loss": 2.4425, "step": 25561 }, { "crossentropy": 2.3346242904663086, "epoch": 0.9266966357308585, "grad_norm": 0.02744540013372898, "grad_norm_var": 2.857202123738526e-07, "learning_rate": 0.00013741883341372553, "loss": 2.4621, "step": 25562 }, { "crossentropy": 2.258415699005127, "epoch": 0.9267328886310905, "grad_norm": 0.026578383520245552, "grad_norm_var": 2.8364708340976784e-07, "learning_rate": 0.0001372835686445295, "loss": 2.3395, "step": 25563 }, { "crossentropy": 2.46901273727417, "epoch": 0.9267691415313225, "grad_norm": 0.025717873126268387, "grad_norm_var": 2.955885699831706e-07, "learning_rate": 0.00013714836955369791, "loss": 2.4063, "step": 25564 }, { "crossentropy": 2.3655471801757812, "epoch": 0.9268053944315545, "grad_norm": 0.026236586272716522, "grad_norm_var": 3.0662227320624587e-07, "learning_rate": 0.00013701323614305705, "loss": 2.3956, "step": 25565 }, { "crossentropy": 2.4319283962249756, "epoch": 0.9268416473317865, "grad_norm": 0.02700856700539589, "grad_norm_var": 2.922738692884748e-07, "learning_rate": 0.00013687816841443268, "loss": 2.4691, "step": 25566 }, { "crossentropy": 2.3373262882232666, "epoch": 0.9268779002320185, "grad_norm": 0.026762070134282112, "grad_norm_var": 2.9232313957632446e-07, "learning_rate": 0.00013674316636964778, "loss": 2.3098, "step": 25567 }, { "crossentropy": 2.5770559310913086, "epoch": 0.9269141531322506, "grad_norm": 0.027324644848704338, "grad_norm_var": 3.068216525695111e-07, "learning_rate": 0.00013660823001052812, "loss": 2.4908, "step": 25568 }, { "crossentropy": 2.4739861488342285, "epoch": 0.9269504060324826, "grad_norm": 0.02686883881688118, "grad_norm_var": 2.891632791955645e-07, "learning_rate": 0.00013647335933889338, "loss": 2.4445, "step": 25569 }, { "crossentropy": 2.4319875240325928, "epoch": 0.9269866589327146, "grad_norm": 0.02587885968387127, "grad_norm_var": 3.3385190708375374e-07, "learning_rate": 0.00013633855435656706, "loss": 2.4361, "step": 25570 }, { "crossentropy": 2.5209522247314453, "epoch": 0.9270229118329466, "grad_norm": 0.026426877826452255, "grad_norm_var": 3.3772498570806365e-07, "learning_rate": 0.00013620381506536993, "loss": 2.5283, "step": 25571 }, { "crossentropy": 2.3920321464538574, "epoch": 0.9270591647331786, "grad_norm": 0.02677757665514946, "grad_norm_var": 3.1201371137542085e-07, "learning_rate": 0.00013606914146712114, "loss": 2.4153, "step": 25572 }, { "crossentropy": 2.428199529647827, "epoch": 0.9270954176334106, "grad_norm": 0.02596641704440117, "grad_norm_var": 3.281163130404528e-07, "learning_rate": 0.0001359345335636397, "loss": 2.4275, "step": 25573 }, { "crossentropy": 2.444121837615967, "epoch": 0.9271316705336426, "grad_norm": 0.026614395901560783, "grad_norm_var": 3.1858016944403115e-07, "learning_rate": 0.00013579999135674426, "loss": 2.3904, "step": 25574 }, { "crossentropy": 2.296658754348755, "epoch": 0.9271679234338747, "grad_norm": 0.02571563795208931, "grad_norm_var": 2.9053503922751857e-07, "learning_rate": 0.00013566551484825106, "loss": 2.3626, "step": 25575 }, { "crossentropy": 2.28739070892334, "epoch": 0.9272041763341067, "grad_norm": 0.027249645441770554, "grad_norm_var": 3.141164244406611e-07, "learning_rate": 0.00013553110403997703, "loss": 2.3287, "step": 25576 }, { "crossentropy": 2.3413639068603516, "epoch": 0.9272404292343387, "grad_norm": 0.025455821305513382, "grad_norm_var": 3.838599186284722e-07, "learning_rate": 0.00013539675893373736, "loss": 2.3562, "step": 25577 }, { "crossentropy": 2.310636281967163, "epoch": 0.9272766821345708, "grad_norm": 0.026317324489355087, "grad_norm_var": 3.2145628618752457e-07, "learning_rate": 0.00013526247953134728, "loss": 2.3677, "step": 25578 }, { "crossentropy": 2.4720613956451416, "epoch": 0.9273129350348028, "grad_norm": 0.02900886908173561, "grad_norm_var": 7.383505478776587e-07, "learning_rate": 0.0001351282658346198, "loss": 2.4096, "step": 25579 }, { "crossentropy": 2.453960418701172, "epoch": 0.9273491879350348, "grad_norm": 0.0267531406134367, "grad_norm_var": 6.859010550641497e-07, "learning_rate": 0.00013499411784536787, "loss": 2.4803, "step": 25580 }, { "crossentropy": 2.3608155250549316, "epoch": 0.9273854408352669, "grad_norm": 0.02687118574976921, "grad_norm_var": 6.762741988085846e-07, "learning_rate": 0.00013486003556540283, "loss": 2.4343, "step": 25581 }, { "crossentropy": 2.326484441757202, "epoch": 0.9274216937354989, "grad_norm": 0.02705121785402298, "grad_norm_var": 6.782137749039198e-07, "learning_rate": 0.00013472601899653714, "loss": 2.3509, "step": 25582 }, { "crossentropy": 2.532965898513794, "epoch": 0.9274579466357309, "grad_norm": 0.025927413254976273, "grad_norm_var": 7.137515718596531e-07, "learning_rate": 0.0001345920681405788, "loss": 2.5323, "step": 25583 }, { "crossentropy": 2.407104253768921, "epoch": 0.9274941995359629, "grad_norm": 0.02532469667494297, "grad_norm_var": 7.806358426951561e-07, "learning_rate": 0.000134458182999338, "loss": 2.4075, "step": 25584 }, { "crossentropy": 2.378891944885254, "epoch": 0.9275304524361949, "grad_norm": 0.026852494105696678, "grad_norm_var": 7.798770507303481e-07, "learning_rate": 0.00013432436357462384, "loss": 2.3124, "step": 25585 }, { "crossentropy": 2.4969570636749268, "epoch": 0.9275667053364269, "grad_norm": 0.026661183685064316, "grad_norm_var": 7.520889759820663e-07, "learning_rate": 0.00013419060986824273, "loss": 2.4706, "step": 25586 }, { "crossentropy": 2.3107502460479736, "epoch": 0.927602958236659, "grad_norm": 0.027220724150538445, "grad_norm_var": 7.772935638135908e-07, "learning_rate": 0.00013405692188200202, "loss": 2.3594, "step": 25587 }, { "crossentropy": 2.4136390686035156, "epoch": 0.927639211136891, "grad_norm": 0.026441365480422974, "grad_norm_var": 7.768679801104742e-07, "learning_rate": 0.00013392329961770699, "loss": 2.3914, "step": 25588 }, { "crossentropy": 2.3939483165740967, "epoch": 0.927675464037123, "grad_norm": 0.025805577635765076, "grad_norm_var": 7.918463586479522e-07, "learning_rate": 0.00013378974307716175, "loss": 2.3823, "step": 25589 }, { "crossentropy": 2.392829179763794, "epoch": 0.927711716937355, "grad_norm": 0.026235582306981087, "grad_norm_var": 7.990484287196392e-07, "learning_rate": 0.00013365625226217093, "loss": 2.4094, "step": 25590 }, { "crossentropy": 2.5007755756378174, "epoch": 0.927747969837587, "grad_norm": 0.02548713982105255, "grad_norm_var": 8.279066164240534e-07, "learning_rate": 0.00013352282717453702, "loss": 2.4457, "step": 25591 }, { "crossentropy": 2.54115891456604, "epoch": 0.927784222737819, "grad_norm": 0.02578517608344555, "grad_norm_var": 8.236666697288819e-07, "learning_rate": 0.000133389467816063, "loss": 2.5426, "step": 25592 }, { "crossentropy": 2.2608959674835205, "epoch": 0.927820475638051, "grad_norm": 0.026677580550312996, "grad_norm_var": 7.550182875020225e-07, "learning_rate": 0.00013325617418854962, "loss": 2.3244, "step": 25593 }, { "crossentropy": 2.354632616043091, "epoch": 0.927856728538283, "grad_norm": 0.025928625836968422, "grad_norm_var": 7.752912531226267e-07, "learning_rate": 0.0001331229462937966, "loss": 2.3153, "step": 25594 }, { "crossentropy": 2.4114856719970703, "epoch": 0.9278929814385151, "grad_norm": 0.02626432664692402, "grad_norm_var": 3.287115633914302e-07, "learning_rate": 0.00013298978413360475, "loss": 2.4423, "step": 25595 }, { "crossentropy": 2.532136917114258, "epoch": 0.9279292343387471, "grad_norm": 0.027309762313961983, "grad_norm_var": 3.794452291941275e-07, "learning_rate": 0.00013285668770977087, "loss": 2.4864, "step": 25596 }, { "crossentropy": 2.4928677082061768, "epoch": 0.9279654872389791, "grad_norm": 0.027321266010403633, "grad_norm_var": 4.224673578729262e-07, "learning_rate": 0.00013272365702409416, "loss": 2.3562, "step": 25597 }, { "crossentropy": 2.4296393394470215, "epoch": 0.9280017401392111, "grad_norm": 0.026518475264310837, "grad_norm_var": 3.9347824164059675e-07, "learning_rate": 0.00013259069207837093, "loss": 2.4314, "step": 25598 }, { "crossentropy": 2.381154775619507, "epoch": 0.9280379930394431, "grad_norm": 0.026665419340133667, "grad_norm_var": 3.849436188328725e-07, "learning_rate": 0.00013245779287439697, "loss": 2.452, "step": 25599 }, { "crossentropy": 2.2667641639709473, "epoch": 0.9280742459396751, "grad_norm": 0.027294334024190903, "grad_norm_var": 3.4338477977583805e-07, "learning_rate": 0.0001323249594139664, "loss": 2.2672, "step": 25600 }, { "crossentropy": 2.4237608909606934, "epoch": 0.9281104988399071, "grad_norm": 0.025329966098070145, "grad_norm_var": 4.2265884976702693e-07, "learning_rate": 0.00013219219169887552, "loss": 2.4425, "step": 25601 }, { "crossentropy": 2.3539838790893555, "epoch": 0.9281467517401392, "grad_norm": 0.026493748649954796, "grad_norm_var": 4.1934269990188896e-07, "learning_rate": 0.0001320594897309152, "loss": 2.3981, "step": 25602 }, { "crossentropy": 2.3778719902038574, "epoch": 0.9281830046403712, "grad_norm": 0.025522777810692787, "grad_norm_var": 4.190891590688846e-07, "learning_rate": 0.0001319268535118795, "loss": 2.4173, "step": 25603 }, { "crossentropy": 2.268075466156006, "epoch": 0.9282192575406032, "grad_norm": 0.026441749185323715, "grad_norm_var": 4.1909550171600973e-07, "learning_rate": 0.00013179428304355866, "loss": 2.3176, "step": 25604 }, { "crossentropy": 2.326737403869629, "epoch": 0.9282555104408353, "grad_norm": 0.02688794583082199, "grad_norm_var": 4.184234958938404e-07, "learning_rate": 0.00013166177832774406, "loss": 2.3089, "step": 25605 }, { "crossentropy": 2.316335916519165, "epoch": 0.9282917633410673, "grad_norm": 0.028214508667588234, "grad_norm_var": 6.236940450734246e-07, "learning_rate": 0.00013152933936622536, "loss": 2.3852, "step": 25606 }, { "crossentropy": 2.3640518188476562, "epoch": 0.9283280162412993, "grad_norm": 0.02639816887676716, "grad_norm_var": 5.514506019140807e-07, "learning_rate": 0.00013139696616079055, "loss": 2.4616, "step": 25607 }, { "crossentropy": 2.323546886444092, "epoch": 0.9283642691415314, "grad_norm": 0.026281513273715973, "grad_norm_var": 5.151828921478302e-07, "learning_rate": 0.00013126465871322825, "loss": 2.3023, "step": 25608 }, { "crossentropy": 2.3543362617492676, "epoch": 0.9284005220417634, "grad_norm": 0.02566877380013466, "grad_norm_var": 5.679344916823444e-07, "learning_rate": 0.00013113241702532587, "loss": 2.4707, "step": 25609 }, { "crossentropy": 2.368678092956543, "epoch": 0.9284367749419954, "grad_norm": 0.02593020349740982, "grad_norm_var": 5.678073386168227e-07, "learning_rate": 0.0001310002410988681, "loss": 2.3964, "step": 25610 }, { "crossentropy": 2.532121181488037, "epoch": 0.9284730278422274, "grad_norm": 0.027150055393576622, "grad_norm_var": 5.849997222786742e-07, "learning_rate": 0.00013086813093564188, "loss": 2.527, "step": 25611 }, { "crossentropy": 2.416010856628418, "epoch": 0.9285092807424594, "grad_norm": 0.026246478781104088, "grad_norm_var": 5.535185330173316e-07, "learning_rate": 0.0001307360865374302, "loss": 2.3489, "step": 25612 }, { "crossentropy": 2.6200075149536133, "epoch": 0.9285455336426914, "grad_norm": 0.026162780821323395, "grad_norm_var": 5.140698654069904e-07, "learning_rate": 0.00013060410790601717, "loss": 2.5577, "step": 25613 }, { "crossentropy": 2.41605544090271, "epoch": 0.9285817865429234, "grad_norm": 0.02717135287821293, "grad_norm_var": 5.466336979645699e-07, "learning_rate": 0.00013047219504318586, "loss": 2.289, "step": 25614 }, { "crossentropy": 2.47760009765625, "epoch": 0.9286180394431555, "grad_norm": 0.024761589244008064, "grad_norm_var": 7.289538793604662e-07, "learning_rate": 0.00013034034795071647, "loss": 2.4971, "step": 25615 }, { "crossentropy": 2.5131306648254395, "epoch": 0.9286542923433875, "grad_norm": 0.0270778089761734, "grad_norm_var": 7.052634052713558e-07, "learning_rate": 0.00013020856663039038, "loss": 2.471, "step": 25616 }, { "crossentropy": 2.4039621353149414, "epoch": 0.9286905452436195, "grad_norm": 0.026143519207835197, "grad_norm_var": 6.350380710523022e-07, "learning_rate": 0.00013007685108398838, "loss": 2.3448, "step": 25617 }, { "crossentropy": 2.425764322280884, "epoch": 0.9287267981438515, "grad_norm": 0.026724673807621002, "grad_norm_var": 6.409631144690374e-07, "learning_rate": 0.00012994520131328903, "loss": 2.394, "step": 25618 }, { "crossentropy": 2.450644016265869, "epoch": 0.9287630510440835, "grad_norm": 0.02662353776395321, "grad_norm_var": 5.844229882941606e-07, "learning_rate": 0.0001298136173200698, "loss": 2.4496, "step": 25619 }, { "crossentropy": 2.376023769378662, "epoch": 0.9287993039443155, "grad_norm": 0.026557600125670433, "grad_norm_var": 5.844733921538201e-07, "learning_rate": 0.00012968209910610873, "loss": 2.4298, "step": 25620 }, { "crossentropy": 2.5104610919952393, "epoch": 0.9288355568445475, "grad_norm": 0.026306290179491043, "grad_norm_var": 5.755342901662587e-07, "learning_rate": 0.00012955064667318162, "loss": 2.483, "step": 25621 }, { "crossentropy": 2.466503858566284, "epoch": 0.9288718097447796, "grad_norm": 0.027398837730288506, "grad_norm_var": 4.267032232303415e-07, "learning_rate": 0.00012941926002306536, "loss": 2.3844, "step": 25622 }, { "crossentropy": 2.6689677238464355, "epoch": 0.9289080626450116, "grad_norm": 0.02656833827495575, "grad_norm_var": 4.281833965851893e-07, "learning_rate": 0.0001292879391575319, "loss": 2.4967, "step": 25623 }, { "crossentropy": 2.4504880905151367, "epoch": 0.9289443155452436, "grad_norm": 0.02521556243300438, "grad_norm_var": 5.193557059065523e-07, "learning_rate": 0.000129156684078357, "loss": 2.3941, "step": 25624 }, { "crossentropy": 2.4719130992889404, "epoch": 0.9289805684454756, "grad_norm": 0.025865215808153152, "grad_norm_var": 5.037488679294928e-07, "learning_rate": 0.00012902549478731264, "loss": 2.4758, "step": 25625 }, { "crossentropy": 2.478189706802368, "epoch": 0.9290168213457076, "grad_norm": 0.027474163100123405, "grad_norm_var": 5.624078567883905e-07, "learning_rate": 0.00012889437128617122, "loss": 2.3969, "step": 25626 }, { "crossentropy": 2.4895520210266113, "epoch": 0.9290530742459396, "grad_norm": 0.026520080864429474, "grad_norm_var": 5.297107465671372e-07, "learning_rate": 0.00012876331357670356, "loss": 2.4581, "step": 25627 }, { "crossentropy": 2.434098243713379, "epoch": 0.9290893271461717, "grad_norm": 0.026007015258073807, "grad_norm_var": 5.390301598621882e-07, "learning_rate": 0.00012863232166067883, "loss": 2.4665, "step": 25628 }, { "crossentropy": 2.326551675796509, "epoch": 0.9291255800464037, "grad_norm": 0.026263359934091568, "grad_norm_var": 5.363316809491817e-07, "learning_rate": 0.00012850139553986784, "loss": 2.4285, "step": 25629 }, { "crossentropy": 2.396848440170288, "epoch": 0.9291618329466357, "grad_norm": 0.02640547789633274, "grad_norm_var": 4.960042906051864e-07, "learning_rate": 0.00012837053521603915, "loss": 2.5011, "step": 25630 }, { "crossentropy": 2.4206433296203613, "epoch": 0.9291980858468677, "grad_norm": 0.026210499927401543, "grad_norm_var": 3.165710397445004e-07, "learning_rate": 0.000128239740690958, "loss": 2.4212, "step": 25631 }, { "crossentropy": 2.388580322265625, "epoch": 0.9292343387470998, "grad_norm": 0.026076896116137505, "grad_norm_var": 2.9675199856004273e-07, "learning_rate": 0.00012810901196639245, "loss": 2.3404, "step": 25632 }, { "crossentropy": 2.457559823989868, "epoch": 0.9292705916473318, "grad_norm": 0.026478899642825127, "grad_norm_var": 2.924216562660379e-07, "learning_rate": 0.00012797834904410888, "loss": 2.4565, "step": 25633 }, { "crossentropy": 2.4819188117980957, "epoch": 0.9293068445475638, "grad_norm": 0.026308918371796608, "grad_norm_var": 2.862540411657875e-07, "learning_rate": 0.00012784775192587028, "loss": 2.4591, "step": 25634 }, { "crossentropy": 2.501673460006714, "epoch": 0.9293430974477959, "grad_norm": 0.026778511703014374, "grad_norm_var": 2.9252818089884474e-07, "learning_rate": 0.00012771722061344248, "loss": 2.4298, "step": 25635 }, { "crossentropy": 2.2101359367370605, "epoch": 0.9293793503480279, "grad_norm": 0.02572062984108925, "grad_norm_var": 3.189718559181426e-07, "learning_rate": 0.00012758675510858686, "loss": 2.3751, "step": 25636 }, { "crossentropy": 2.4065616130828857, "epoch": 0.9294156032482599, "grad_norm": 0.027157457545399666, "grad_norm_var": 3.593008823620884e-07, "learning_rate": 0.0001274563554130659, "loss": 2.3954, "step": 25637 }, { "crossentropy": 2.448258399963379, "epoch": 0.9294518561484919, "grad_norm": 0.027067899703979492, "grad_norm_var": 3.222096130129816e-07, "learning_rate": 0.00012732602152864202, "loss": 2.5178, "step": 25638 }, { "crossentropy": 2.3393237590789795, "epoch": 0.9294881090487239, "grad_norm": 0.0265496838837862, "grad_norm_var": 3.2176896861592595e-07, "learning_rate": 0.000127195753457075, "loss": 2.3446, "step": 25639 }, { "crossentropy": 2.382941246032715, "epoch": 0.9295243619489559, "grad_norm": 0.026911471039056778, "grad_norm_var": 2.379351388978948e-07, "learning_rate": 0.00012706555120012343, "loss": 2.3345, "step": 25640 }, { "crossentropy": 2.5052177906036377, "epoch": 0.929560614849188, "grad_norm": 0.02607801742851734, "grad_norm_var": 2.2311578381042398e-07, "learning_rate": 0.00012693541475954694, "loss": 2.4552, "step": 25641 }, { "crossentropy": 2.4702584743499756, "epoch": 0.92959686774942, "grad_norm": 0.028000663965940475, "grad_norm_var": 3.0878793379207625e-07, "learning_rate": 0.00012680534413710254, "loss": 2.4374, "step": 25642 }, { "crossentropy": 2.6219964027404785, "epoch": 0.929633120649652, "grad_norm": 0.027300508692860603, "grad_norm_var": 3.454616644768625e-07, "learning_rate": 0.00012667533933454712, "loss": 2.435, "step": 25643 }, { "crossentropy": 2.4307267665863037, "epoch": 0.929669373549884, "grad_norm": 0.02566928043961525, "grad_norm_var": 3.784940378100208e-07, "learning_rate": 0.00012654540035363703, "loss": 2.4632, "step": 25644 }, { "crossentropy": 2.486680269241333, "epoch": 0.929705626450116, "grad_norm": 0.026510631665587425, "grad_norm_var": 3.7249794724112524e-07, "learning_rate": 0.00012641552719612703, "loss": 2.4518, "step": 25645 }, { "crossentropy": 2.504666566848755, "epoch": 0.929741879350348, "grad_norm": 0.02691543474793434, "grad_norm_var": 3.771167746097753e-07, "learning_rate": 0.00012628571986377125, "loss": 2.4036, "step": 25646 }, { "crossentropy": 2.4833106994628906, "epoch": 0.92977813225058, "grad_norm": 0.02707824483513832, "grad_norm_var": 3.78134067625877e-07, "learning_rate": 0.00012615597835832326, "loss": 2.5064, "step": 25647 }, { "crossentropy": 2.4681599140167236, "epoch": 0.929814385150812, "grad_norm": 0.02620459720492363, "grad_norm_var": 3.691789717364385e-07, "learning_rate": 0.0001260263026815345, "loss": 2.4285, "step": 25648 }, { "crossentropy": 2.4855289459228516, "epoch": 0.9298506380510441, "grad_norm": 0.027229219675064087, "grad_norm_var": 3.851791951211524e-07, "learning_rate": 0.00012589669283515737, "loss": 2.5478, "step": 25649 }, { "crossentropy": 2.384040355682373, "epoch": 0.9298868909512761, "grad_norm": 0.02678781934082508, "grad_norm_var": 3.734193048123812e-07, "learning_rate": 0.00012576714882094221, "loss": 2.3811, "step": 25650 }, { "crossentropy": 2.2180228233337402, "epoch": 0.9299231438515081, "grad_norm": 0.025949567556381226, "grad_norm_var": 4.1293897912551143e-07, "learning_rate": 0.0001256376706406387, "loss": 2.3833, "step": 25651 }, { "crossentropy": 2.4304890632629395, "epoch": 0.9299593967517401, "grad_norm": 0.026063228026032448, "grad_norm_var": 3.757340618089211e-07, "learning_rate": 0.00012550825829599487, "loss": 2.438, "step": 25652 }, { "crossentropy": 2.5551037788391113, "epoch": 0.9299956496519721, "grad_norm": 0.02686878852546215, "grad_norm_var": 3.639934661974712e-07, "learning_rate": 0.00012537891178875994, "loss": 2.5054, "step": 25653 }, { "crossentropy": 2.438239574432373, "epoch": 0.9300319025522041, "grad_norm": 0.026466818526387215, "grad_norm_var": 3.570147649549031e-07, "learning_rate": 0.0001252496311206802, "loss": 2.5227, "step": 25654 }, { "crossentropy": 2.482067108154297, "epoch": 0.9300681554524362, "grad_norm": 0.025719664990901947, "grad_norm_var": 4.1244740455341895e-07, "learning_rate": 0.00012512041629350213, "loss": 2.4467, "step": 25655 }, { "crossentropy": 2.4183616638183594, "epoch": 0.9301044083526682, "grad_norm": 0.02701035887002945, "grad_norm_var": 4.1703846880788005e-07, "learning_rate": 0.00012499126730897037, "loss": 2.4025, "step": 25656 }, { "crossentropy": 2.3715786933898926, "epoch": 0.9301406612529002, "grad_norm": 0.025738105177879333, "grad_norm_var": 4.486330505317863e-07, "learning_rate": 0.00012486218416882965, "loss": 2.3595, "step": 25657 }, { "crossentropy": 2.358988046646118, "epoch": 0.9301769141531323, "grad_norm": 0.02586006373167038, "grad_norm_var": 3.336972766882287e-07, "learning_rate": 0.0001247331668748236, "loss": 2.4135, "step": 25658 }, { "crossentropy": 2.351735830307007, "epoch": 0.9302131670533643, "grad_norm": 0.026339499279856682, "grad_norm_var": 2.8381899431171677e-07, "learning_rate": 0.00012460421542869471, "loss": 2.4231, "step": 25659 }, { "crossentropy": 2.3753340244293213, "epoch": 0.9302494199535963, "grad_norm": 0.02673932909965515, "grad_norm_var": 2.5102672349437927e-07, "learning_rate": 0.00012447532983218379, "loss": 2.4249, "step": 25660 }, { "crossentropy": 2.4824705123901367, "epoch": 0.9302856728538283, "grad_norm": 0.025964736938476562, "grad_norm_var": 2.665186474717542e-07, "learning_rate": 0.00012434651008703278, "loss": 2.4542, "step": 25661 }, { "crossentropy": 2.407167434692383, "epoch": 0.9303219257540604, "grad_norm": 0.026015255600214005, "grad_norm_var": 2.593161953439906e-07, "learning_rate": 0.000124217756194982, "loss": 2.3255, "step": 25662 }, { "crossentropy": 2.2287967205047607, "epoch": 0.9303581786542924, "grad_norm": 0.026211686432361603, "grad_norm_var": 2.2525010945546355e-07, "learning_rate": 0.00012408906815776887, "loss": 2.2991, "step": 25663 }, { "crossentropy": 2.213068962097168, "epoch": 0.9303944315545244, "grad_norm": 0.026158252730965614, "grad_norm_var": 2.2611627502634273e-07, "learning_rate": 0.00012396044597713264, "loss": 2.3693, "step": 25664 }, { "crossentropy": 2.289836883544922, "epoch": 0.9304306844547564, "grad_norm": 0.026236817240715027, "grad_norm_var": 1.6738174707028874e-07, "learning_rate": 0.00012383188965481073, "loss": 2.3307, "step": 25665 }, { "crossentropy": 2.395606517791748, "epoch": 0.9304669373549884, "grad_norm": 0.026664843782782555, "grad_norm_var": 1.5964166472966456e-07, "learning_rate": 0.00012370339919253793, "loss": 2.4006, "step": 25666 }, { "crossentropy": 2.564897298812866, "epoch": 0.9305031902552204, "grad_norm": 0.02638140879571438, "grad_norm_var": 1.5397329390490616e-07, "learning_rate": 0.0001235749745920517, "loss": 2.4944, "step": 25667 }, { "crossentropy": 2.428924083709717, "epoch": 0.9305394431554525, "grad_norm": 0.0265769325196743, "grad_norm_var": 1.5579512278914253e-07, "learning_rate": 0.0001234466158550851, "loss": 2.4365, "step": 25668 }, { "crossentropy": 2.3335914611816406, "epoch": 0.9305756960556845, "grad_norm": 0.0262052770704031, "grad_norm_var": 1.3383445148686345e-07, "learning_rate": 0.0001233183229833723, "loss": 2.4512, "step": 25669 }, { "crossentropy": 2.4009718894958496, "epoch": 0.9306119489559165, "grad_norm": 0.026119327172636986, "grad_norm_var": 1.3217268852403857e-07, "learning_rate": 0.00012319009597864638, "loss": 2.5386, "step": 25670 }, { "crossentropy": 2.5319714546203613, "epoch": 0.9306482018561485, "grad_norm": 0.026394998654723167, "grad_norm_var": 1.1325255691422137e-07, "learning_rate": 0.00012306193484263873, "loss": 2.481, "step": 25671 }, { "crossentropy": 2.300097703933716, "epoch": 0.9306844547563805, "grad_norm": 0.026972517371177673, "grad_norm_var": 1.0970017435839713e-07, "learning_rate": 0.00012293383957708126, "loss": 2.3643, "step": 25672 }, { "crossentropy": 2.428539752960205, "epoch": 0.9307207076566125, "grad_norm": 0.026690753176808357, "grad_norm_var": 9.680363252268878e-08, "learning_rate": 0.00012280581018370374, "loss": 2.4248, "step": 25673 }, { "crossentropy": 2.4489173889160156, "epoch": 0.9307569605568445, "grad_norm": 0.026080019772052765, "grad_norm_var": 8.558402392125353e-08, "learning_rate": 0.00012267784666423475, "loss": 2.4651, "step": 25674 }, { "crossentropy": 2.4650490283966064, "epoch": 0.9307932134570766, "grad_norm": 0.026327498257160187, "grad_norm_var": 8.562499489576878e-08, "learning_rate": 0.00012254994902040296, "loss": 2.4432, "step": 25675 }, { "crossentropy": 2.3631718158721924, "epoch": 0.9308294663573086, "grad_norm": 0.02545545995235443, "grad_norm_var": 1.2349279698308826e-07, "learning_rate": 0.00012242211725393582, "loss": 2.3729, "step": 25676 }, { "crossentropy": 2.4681453704833984, "epoch": 0.9308657192575406, "grad_norm": 0.026514697819948196, "grad_norm_var": 1.1938968622880222e-07, "learning_rate": 0.00012229435136655974, "loss": 2.3622, "step": 25677 }, { "crossentropy": 2.3741250038146973, "epoch": 0.9309019721577726, "grad_norm": 0.025682564824819565, "grad_norm_var": 1.3950871052315477e-07, "learning_rate": 0.00012216665136000117, "loss": 2.3993, "step": 25678 }, { "crossentropy": 2.552046060562134, "epoch": 0.9309382250580046, "grad_norm": 0.026823190972208977, "grad_norm_var": 1.5632616259710306e-07, "learning_rate": 0.00012203901723598365, "loss": 2.4889, "step": 25679 }, { "crossentropy": 2.2145397663116455, "epoch": 0.9309744779582366, "grad_norm": 0.02777840383350849, "grad_norm_var": 2.832193247013943e-07, "learning_rate": 0.00012191144899623252, "loss": 2.3591, "step": 25680 }, { "crossentropy": 2.29988169670105, "epoch": 0.9310107308584686, "grad_norm": 0.0258012842386961, "grad_norm_var": 3.0638290380705957e-07, "learning_rate": 0.00012178394664247027, "loss": 2.4363, "step": 25681 }, { "crossentropy": 2.335376739501953, "epoch": 0.9310469837587007, "grad_norm": 0.025617895647883415, "grad_norm_var": 3.38522368063795e-07, "learning_rate": 0.00012165651017641777, "loss": 2.3848, "step": 25682 }, { "crossentropy": 2.4805946350097656, "epoch": 0.9310832366589327, "grad_norm": 0.027544893324375153, "grad_norm_var": 4.29724474573269e-07, "learning_rate": 0.00012152913959979805, "loss": 2.511, "step": 25683 }, { "crossentropy": 2.4283037185668945, "epoch": 0.9311194895591647, "grad_norm": 0.026122411713004112, "grad_norm_var": 4.326171220420897e-07, "learning_rate": 0.00012140183491433032, "loss": 2.4511, "step": 25684 }, { "crossentropy": 2.483161449432373, "epoch": 0.9311557424593968, "grad_norm": 0.02595994807779789, "grad_norm_var": 4.4219870777756756e-07, "learning_rate": 0.00012127459612173431, "loss": 2.5016, "step": 25685 }, { "crossentropy": 2.379925489425659, "epoch": 0.9311919953596288, "grad_norm": 0.02591702714562416, "grad_norm_var": 4.514604754683275e-07, "learning_rate": 0.00012114742322372863, "loss": 2.3406, "step": 25686 }, { "crossentropy": 2.4338836669921875, "epoch": 0.9312282482598608, "grad_norm": 0.027614997699856758, "grad_norm_var": 5.509555306703583e-07, "learning_rate": 0.0001210203162220308, "loss": 2.3565, "step": 25687 }, { "crossentropy": 2.3418307304382324, "epoch": 0.9312645011600929, "grad_norm": 0.026643676683306694, "grad_norm_var": 5.339917111857415e-07, "learning_rate": 0.00012089327511835835, "loss": 2.357, "step": 25688 }, { "crossentropy": 2.3692679405212402, "epoch": 0.9313007540603249, "grad_norm": 0.026713237166404724, "grad_norm_var": 5.348622083740928e-07, "learning_rate": 0.000120766299914426, "loss": 2.3175, "step": 25689 }, { "crossentropy": 2.3623759746551514, "epoch": 0.9313370069605569, "grad_norm": 0.02613895945250988, "grad_norm_var": 5.324678608834329e-07, "learning_rate": 0.00012063939061194906, "loss": 2.4425, "step": 25690 }, { "crossentropy": 2.3355886936187744, "epoch": 0.9313732598607889, "grad_norm": 0.025807712227106094, "grad_norm_var": 5.554881875527527e-07, "learning_rate": 0.00012051254721264282, "loss": 2.3161, "step": 25691 }, { "crossentropy": 2.2540206909179688, "epoch": 0.9314095127610209, "grad_norm": 0.025967299938201904, "grad_norm_var": 5.085260087558565e-07, "learning_rate": 0.00012038576971821813, "loss": 2.2769, "step": 25692 }, { "crossentropy": 2.5012614727020264, "epoch": 0.9314457656612529, "grad_norm": 0.02711308002471924, "grad_norm_var": 5.388182646007816e-07, "learning_rate": 0.00012025905813038918, "loss": 2.4257, "step": 25693 }, { "crossentropy": 2.424802780151367, "epoch": 0.9314820185614849, "grad_norm": 0.025523338466882706, "grad_norm_var": 5.567574271921841e-07, "learning_rate": 0.00012013241245086737, "loss": 2.3793, "step": 25694 }, { "crossentropy": 2.3628435134887695, "epoch": 0.931518271461717, "grad_norm": 0.026326054707169533, "grad_norm_var": 5.470003948993083e-07, "learning_rate": 0.00012000583268136245, "loss": 2.4842, "step": 25695 }, { "crossentropy": 2.398869037628174, "epoch": 0.931554524361949, "grad_norm": 0.02708739973604679, "grad_norm_var": 4.5094097705735724e-07, "learning_rate": 0.00011987931882358472, "loss": 2.4477, "step": 25696 }, { "crossentropy": 2.350477695465088, "epoch": 0.931590777262181, "grad_norm": 0.026370944455266, "grad_norm_var": 4.2812505616830234e-07, "learning_rate": 0.0001197528708792428, "loss": 2.3519, "step": 25697 }, { "crossentropy": 2.373696804046631, "epoch": 0.931627030162413, "grad_norm": 0.02599979378283024, "grad_norm_var": 3.9719668535654187e-07, "learning_rate": 0.00011962648885004368, "loss": 2.4433, "step": 25698 }, { "crossentropy": 2.313549757003784, "epoch": 0.931663283062645, "grad_norm": 0.025645533576607704, "grad_norm_var": 3.3986260440279037e-07, "learning_rate": 0.00011950017273769598, "loss": 2.332, "step": 25699 }, { "crossentropy": 2.537205934524536, "epoch": 0.931699535962877, "grad_norm": 0.02641942724585533, "grad_norm_var": 3.379686082550694e-07, "learning_rate": 0.0001193739225439039, "loss": 2.4559, "step": 25700 }, { "crossentropy": 2.416238784790039, "epoch": 0.931735788863109, "grad_norm": 0.026417560875415802, "grad_norm_var": 3.285983563988219e-07, "learning_rate": 0.00011924773827037384, "loss": 2.4562, "step": 25701 }, { "crossentropy": 2.3929145336151123, "epoch": 0.931772041763341, "grad_norm": 0.02599972113966942, "grad_norm_var": 3.2417877320056743e-07, "learning_rate": 0.00011912161991880998, "loss": 2.395, "step": 25702 }, { "crossentropy": 2.517038583755493, "epoch": 0.9318082946635731, "grad_norm": 0.026545492932200432, "grad_norm_var": 2.1696145265642555e-07, "learning_rate": 0.00011899556749091545, "loss": 2.4609, "step": 25703 }, { "crossentropy": 2.489304542541504, "epoch": 0.9318445475638051, "grad_norm": 0.025703616440296173, "grad_norm_var": 2.284839065792941e-07, "learning_rate": 0.00011886958098839385, "loss": 2.4088, "step": 25704 }, { "crossentropy": 2.347442150115967, "epoch": 0.9318808004640371, "grad_norm": 0.02523736283183098, "grad_norm_var": 2.707484585760204e-07, "learning_rate": 0.00011874366041294494, "loss": 2.432, "step": 25705 }, { "crossentropy": 2.403536081314087, "epoch": 0.9319170533642691, "grad_norm": 0.02671593613922596, "grad_norm_var": 2.911704453058359e-07, "learning_rate": 0.00011861780576627123, "loss": 2.3605, "step": 25706 }, { "crossentropy": 2.5152857303619385, "epoch": 0.9319533062645011, "grad_norm": 0.02652949094772339, "grad_norm_var": 2.8790115772733305e-07, "learning_rate": 0.0001184920170500714, "loss": 2.4069, "step": 25707 }, { "crossentropy": 2.446169853210449, "epoch": 0.9319895591647331, "grad_norm": 0.026745881885290146, "grad_norm_var": 2.990226193345328e-07, "learning_rate": 0.00011836629426604462, "loss": 2.4806, "step": 25708 }, { "crossentropy": 2.5431785583496094, "epoch": 0.9320258120649652, "grad_norm": 0.02604549005627632, "grad_norm_var": 2.507878330099452e-07, "learning_rate": 0.00011824063741588953, "loss": 2.4834, "step": 25709 }, { "crossentropy": 2.3467795848846436, "epoch": 0.9320620649651972, "grad_norm": 0.025678105652332306, "grad_norm_var": 2.3817575723210764e-07, "learning_rate": 0.00011811504650130311, "loss": 2.3151, "step": 25710 }, { "crossentropy": 2.4289989471435547, "epoch": 0.9320983178654292, "grad_norm": 0.026263689622282982, "grad_norm_var": 2.3750984116294576e-07, "learning_rate": 0.00011798952152398068, "loss": 2.3717, "step": 25711 }, { "crossentropy": 2.5450029373168945, "epoch": 0.9321345707656613, "grad_norm": 0.026698989793658257, "grad_norm_var": 2.0164706466990698e-07, "learning_rate": 0.0001178640624856192, "loss": 2.5417, "step": 25712 }, { "crossentropy": 2.4275364875793457, "epoch": 0.9321708236658933, "grad_norm": 0.02648269385099411, "grad_norm_var": 2.051450017673609e-07, "learning_rate": 0.00011773866938791233, "loss": 2.3667, "step": 25713 }, { "crossentropy": 2.3795289993286133, "epoch": 0.9322070765661253, "grad_norm": 0.026916544884443283, "grad_norm_var": 2.3374417551604887e-07, "learning_rate": 0.00011761334223255427, "loss": 2.3661, "step": 25714 }, { "crossentropy": 2.505072593688965, "epoch": 0.9322433294663574, "grad_norm": 0.02621704339981079, "grad_norm_var": 2.0788013187032077e-07, "learning_rate": 0.00011748808102123699, "loss": 2.4649, "step": 25715 }, { "crossentropy": 2.3462650775909424, "epoch": 0.9322795823665894, "grad_norm": 0.026451362296938896, "grad_norm_var": 2.0850108262402723e-07, "learning_rate": 0.00011736288575565246, "loss": 2.4929, "step": 25716 }, { "crossentropy": 2.4027628898620605, "epoch": 0.9323158352668214, "grad_norm": 0.026217782869935036, "grad_norm_var": 2.0761264283505886e-07, "learning_rate": 0.00011723775643749157, "loss": 2.4674, "step": 25717 }, { "crossentropy": 2.4110348224639893, "epoch": 0.9323520881670534, "grad_norm": 0.026798659935593605, "grad_norm_var": 2.1785486816621308e-07, "learning_rate": 0.00011711269306844519, "loss": 2.4466, "step": 25718 }, { "crossentropy": 2.487029552459717, "epoch": 0.9323883410672854, "grad_norm": 0.027250025421380997, "grad_norm_var": 2.6930768045786935e-07, "learning_rate": 0.0001169876956502014, "loss": 2.4835, "step": 25719 }, { "crossentropy": 2.3077309131622314, "epoch": 0.9324245939675174, "grad_norm": 0.02660883404314518, "grad_norm_var": 2.398452541426162e-07, "learning_rate": 0.00011686276418444941, "loss": 2.3158, "step": 25720 }, { "crossentropy": 2.4638218879699707, "epoch": 0.9324608468677494, "grad_norm": 0.026335053145885468, "grad_norm_var": 1.4080238627729203e-07, "learning_rate": 0.00011673789867287565, "loss": 2.4583, "step": 25721 }, { "crossentropy": 2.4165422916412354, "epoch": 0.9324970997679815, "grad_norm": 0.02633036859333515, "grad_norm_var": 1.3885000817435467e-07, "learning_rate": 0.00011661309911716767, "loss": 2.4618, "step": 25722 }, { "crossentropy": 2.4462177753448486, "epoch": 0.9325333526682135, "grad_norm": 0.026819339022040367, "grad_norm_var": 1.4627905422133955e-07, "learning_rate": 0.00011648836551901131, "loss": 2.373, "step": 25723 }, { "crossentropy": 2.3589022159576416, "epoch": 0.9325696055684455, "grad_norm": 0.027598939836025238, "grad_norm_var": 2.2072385993012154e-07, "learning_rate": 0.00011636369788008972, "loss": 2.4893, "step": 25724 }, { "crossentropy": 2.519259452819824, "epoch": 0.9326058584686775, "grad_norm": 0.026146937161684036, "grad_norm_var": 2.146165512380167e-07, "learning_rate": 0.00011623909620208706, "loss": 2.3706, "step": 25725 }, { "crossentropy": 2.4202206134796143, "epoch": 0.9326421113689095, "grad_norm": 0.025836046785116196, "grad_norm_var": 1.9779566187057747e-07, "learning_rate": 0.00011611456048668811, "loss": 2.3712, "step": 25726 }, { "crossentropy": 2.4030847549438477, "epoch": 0.9326783642691415, "grad_norm": 0.026939816772937775, "grad_norm_var": 1.9958557848363752e-07, "learning_rate": 0.00011599009073557265, "loss": 2.4374, "step": 25727 }, { "crossentropy": 2.3646137714385986, "epoch": 0.9327146171693735, "grad_norm": 0.025428440421819687, "grad_norm_var": 2.842223927125113e-07, "learning_rate": 0.00011586568695042321, "loss": 2.3047, "step": 25728 }, { "crossentropy": 2.487966299057007, "epoch": 0.9327508700696056, "grad_norm": 0.026306763291358948, "grad_norm_var": 2.8711684043082146e-07, "learning_rate": 0.000115741349132919, "loss": 2.4548, "step": 25729 }, { "crossentropy": 2.4747045040130615, "epoch": 0.9327871229698376, "grad_norm": 0.025770636275410652, "grad_norm_var": 3.0747156060429287e-07, "learning_rate": 0.00011561707728473981, "loss": 2.4327, "step": 25730 }, { "crossentropy": 2.372802734375, "epoch": 0.9328233758700696, "grad_norm": 0.026828233152627945, "grad_norm_var": 3.125677028879229e-07, "learning_rate": 0.00011549287140756592, "loss": 2.3863, "step": 25731 }, { "crossentropy": 2.3168396949768066, "epoch": 0.9328596287703016, "grad_norm": 0.025400150567293167, "grad_norm_var": 3.8553520203474244e-07, "learning_rate": 0.00011536873150307214, "loss": 2.4339, "step": 25732 }, { "crossentropy": 2.409231185913086, "epoch": 0.9328958816705336, "grad_norm": 0.026572125032544136, "grad_norm_var": 3.8413574196399793e-07, "learning_rate": 0.00011524465757293601, "loss": 2.3929, "step": 25733 }, { "crossentropy": 2.3803350925445557, "epoch": 0.9329321345707656, "grad_norm": 0.026642613112926483, "grad_norm_var": 3.7810474248730666e-07, "learning_rate": 0.00011512064961883451, "loss": 2.3245, "step": 25734 }, { "crossentropy": 2.4487144947052, "epoch": 0.9329683874709976, "grad_norm": 0.026874227449297905, "grad_norm_var": 3.456370525014666e-07, "learning_rate": 0.00011499670764244075, "loss": 2.4964, "step": 25735 }, { "crossentropy": 2.2771828174591064, "epoch": 0.9330046403712297, "grad_norm": 0.026058116927742958, "grad_norm_var": 3.49434973677167e-07, "learning_rate": 0.0001148728316454306, "loss": 2.3769, "step": 25736 }, { "crossentropy": 2.4981672763824463, "epoch": 0.9330408932714617, "grad_norm": 0.025687597692012787, "grad_norm_var": 3.784780616580367e-07, "learning_rate": 0.00011474902162947554, "loss": 2.3637, "step": 25737 }, { "crossentropy": 2.315047264099121, "epoch": 0.9330771461716937, "grad_norm": 0.02571052499115467, "grad_norm_var": 4.022556838199909e-07, "learning_rate": 0.00011462527759624919, "loss": 2.3569, "step": 25738 }, { "crossentropy": 2.3060309886932373, "epoch": 0.9331133990719258, "grad_norm": 0.049340564757585526, "grad_norm_var": 3.369577947617979e-05, "learning_rate": 0.00011450159954742246, "loss": 2.3186, "step": 25739 }, { "crossentropy": 2.343616247177124, "epoch": 0.9331496519721578, "grad_norm": 0.025794006884098053, "grad_norm_var": 3.3922835438811634e-05, "learning_rate": 0.0001143779874846651, "loss": 2.4098, "step": 25740 }, { "crossentropy": 2.3942720890045166, "epoch": 0.9331859048723898, "grad_norm": 0.026222271844744682, "grad_norm_var": 3.3908759908784594e-05, "learning_rate": 0.00011425444140964748, "loss": 2.41, "step": 25741 }, { "crossentropy": 2.548959970474243, "epoch": 0.9332221577726219, "grad_norm": 0.02708466164767742, "grad_norm_var": 3.371448817279496e-05, "learning_rate": 0.00011413096132403766, "loss": 2.5345, "step": 25742 }, { "crossentropy": 2.2707529067993164, "epoch": 0.9332584106728539, "grad_norm": 0.025715487077832222, "grad_norm_var": 3.392676811956637e-05, "learning_rate": 0.0001140075472295038, "loss": 2.3415, "step": 25743 }, { "crossentropy": 2.429863691329956, "epoch": 0.9332946635730859, "grad_norm": 0.02691827528178692, "grad_norm_var": 3.36361558051693e-05, "learning_rate": 0.00011388419912771342, "loss": 2.493, "step": 25744 }, { "crossentropy": 2.453545093536377, "epoch": 0.9333309164733179, "grad_norm": 0.02674044668674469, "grad_norm_var": 3.356833705406175e-05, "learning_rate": 0.00011376091702033132, "loss": 2.4816, "step": 25745 }, { "crossentropy": 2.30739164352417, "epoch": 0.9333671693735499, "grad_norm": 0.025772487744688988, "grad_norm_var": 3.356785851292911e-05, "learning_rate": 0.00011363770090902281, "loss": 2.3887, "step": 25746 }, { "crossentropy": 2.526869297027588, "epoch": 0.9334034222737819, "grad_norm": 0.02690223418176174, "grad_norm_var": 3.355949944752045e-05, "learning_rate": 0.00011351455079545326, "loss": 2.4606, "step": 25747 }, { "crossentropy": 2.415893077850342, "epoch": 0.9334396751740139, "grad_norm": 0.02665913663804531, "grad_norm_var": 3.3270027208733345e-05, "learning_rate": 0.0001133914666812852, "loss": 2.3875, "step": 25748 }, { "crossentropy": 2.3696486949920654, "epoch": 0.933475928074246, "grad_norm": 0.026257527992129326, "grad_norm_var": 3.332744183773873e-05, "learning_rate": 0.00011326844856818063, "loss": 2.3708, "step": 25749 }, { "crossentropy": 2.637097120285034, "epoch": 0.933512180974478, "grad_norm": 0.027229739353060722, "grad_norm_var": 3.3260436425950476e-05, "learning_rate": 0.00011314549645780214, "loss": 2.479, "step": 25750 }, { "crossentropy": 2.292588233947754, "epoch": 0.93354843387471, "grad_norm": 0.026935486122965813, "grad_norm_var": 3.3253024010091585e-05, "learning_rate": 0.00011302261035180895, "loss": 2.3461, "step": 25751 }, { "crossentropy": 2.362323760986328, "epoch": 0.933584686774942, "grad_norm": 0.026238558813929558, "grad_norm_var": 3.3212807452054756e-05, "learning_rate": 0.00011289979025186247, "loss": 2.4006, "step": 25752 }, { "crossentropy": 2.4904348850250244, "epoch": 0.933620939675174, "grad_norm": 0.0258637648075819, "grad_norm_var": 3.316452857323551e-05, "learning_rate": 0.00011277703615961977, "loss": 2.4619, "step": 25753 }, { "crossentropy": 2.3735146522521973, "epoch": 0.933657192575406, "grad_norm": 0.026057472452521324, "grad_norm_var": 3.307370157605489e-05, "learning_rate": 0.00011265434807674058, "loss": 2.3403, "step": 25754 }, { "crossentropy": 2.5065693855285645, "epoch": 0.933693445475638, "grad_norm": 0.026555359363555908, "grad_norm_var": 2.577133031075729e-07, "learning_rate": 0.00011253172600488082, "loss": 2.3798, "step": 25755 }, { "crossentropy": 2.301584005355835, "epoch": 0.93372969837587, "grad_norm": 0.026007434353232384, "grad_norm_var": 2.423427890525335e-07, "learning_rate": 0.00011240916994569806, "loss": 2.3173, "step": 25756 }, { "crossentropy": 2.3963162899017334, "epoch": 0.9337659512761021, "grad_norm": 0.026317374780774117, "grad_norm_var": 2.4005182048577e-07, "learning_rate": 0.0001122866799008454, "loss": 2.4626, "step": 25757 }, { "crossentropy": 2.4440388679504395, "epoch": 0.9338022041763341, "grad_norm": 0.025821922346949577, "grad_norm_var": 2.3343723099990507e-07, "learning_rate": 0.00011216425587197932, "loss": 2.4362, "step": 25758 }, { "crossentropy": 2.4047601222991943, "epoch": 0.9338384570765661, "grad_norm": 0.026499925181269646, "grad_norm_var": 2.0296423605635032e-07, "learning_rate": 0.0001120418978607518, "loss": 2.4462, "step": 25759 }, { "crossentropy": 2.2367522716522217, "epoch": 0.9338747099767981, "grad_norm": 0.025836985558271408, "grad_norm_var": 2.047160562334508e-07, "learning_rate": 0.00011191960586881711, "loss": 2.257, "step": 25760 }, { "crossentropy": 2.4894018173217773, "epoch": 0.9339109628770301, "grad_norm": 0.025887737050652504, "grad_norm_var": 2.0645012413523604e-07, "learning_rate": 0.00011179737989782502, "loss": 2.359, "step": 25761 }, { "crossentropy": 2.5565240383148193, "epoch": 0.9339472157772621, "grad_norm": 0.026384325698018074, "grad_norm_var": 1.865931366038312e-07, "learning_rate": 0.00011167521994942809, "loss": 2.515, "step": 25762 }, { "crossentropy": 2.4697723388671875, "epoch": 0.9339834686774942, "grad_norm": 0.02541976608335972, "grad_norm_var": 2.1300266985421757e-07, "learning_rate": 0.00011155312602527556, "loss": 2.4334, "step": 25763 }, { "crossentropy": 2.5429439544677734, "epoch": 0.9340197215777262, "grad_norm": 0.025369251146912575, "grad_norm_var": 2.463297806476819e-07, "learning_rate": 0.00011143109812701613, "loss": 2.4477, "step": 25764 }, { "crossentropy": 2.488463878631592, "epoch": 0.9340559744779582, "grad_norm": 0.026058413088321686, "grad_norm_var": 2.464219459748316e-07, "learning_rate": 0.00011130913625629901, "loss": 2.4718, "step": 25765 }, { "crossentropy": 2.585330009460449, "epoch": 0.9340922273781903, "grad_norm": 0.025990312919020653, "grad_norm_var": 1.6486133256840042e-07, "learning_rate": 0.00011118724041477124, "loss": 2.5092, "step": 25766 }, { "crossentropy": 2.5897769927978516, "epoch": 0.9341284802784223, "grad_norm": 0.026533590629696846, "grad_norm_var": 1.2899392813230972e-07, "learning_rate": 0.0001110654106040776, "loss": 2.5683, "step": 25767 }, { "crossentropy": 2.4312095642089844, "epoch": 0.9341647331786543, "grad_norm": 0.026450088247656822, "grad_norm_var": 1.3703419275942705e-07, "learning_rate": 0.00011094364682586567, "loss": 2.4458, "step": 25768 }, { "crossentropy": 2.391028642654419, "epoch": 0.9342009860788864, "grad_norm": 0.025842128321528435, "grad_norm_var": 1.376464621196388e-07, "learning_rate": 0.00011082194908177856, "loss": 2.3984, "step": 25769 }, { "crossentropy": 2.5066330432891846, "epoch": 0.9342372389791184, "grad_norm": 0.025382941588759422, "grad_norm_var": 1.6671598497586202e-07, "learning_rate": 0.0001107003173734611, "loss": 2.4758, "step": 25770 }, { "crossentropy": 2.611196517944336, "epoch": 0.9342734918793504, "grad_norm": 0.02637125365436077, "grad_norm_var": 1.5575034111152872e-07, "learning_rate": 0.00011057875170255527, "loss": 2.5606, "step": 25771 }, { "crossentropy": 2.4488539695739746, "epoch": 0.9343097447795824, "grad_norm": 0.026265939697623253, "grad_norm_var": 1.5980949777047448e-07, "learning_rate": 0.00011045725207070367, "loss": 2.4089, "step": 25772 }, { "crossentropy": 2.407208204269409, "epoch": 0.9343459976798144, "grad_norm": 0.026473086327314377, "grad_norm_var": 1.673535627235197e-07, "learning_rate": 0.0001103358184795472, "loss": 2.4284, "step": 25773 }, { "crossentropy": 2.5243396759033203, "epoch": 0.9343822505800464, "grad_norm": 0.027220359072089195, "grad_norm_var": 2.4952763278764516e-07, "learning_rate": 0.00011021445093072569, "loss": 2.472, "step": 25774 }, { "crossentropy": 2.3627607822418213, "epoch": 0.9344185034802784, "grad_norm": 0.02630271576344967, "grad_norm_var": 2.420770149639025e-07, "learning_rate": 0.00011009314942587833, "loss": 2.373, "step": 25775 }, { "crossentropy": 2.4265716075897217, "epoch": 0.9344547563805105, "grad_norm": 0.025560883805155754, "grad_norm_var": 2.569586442078067e-07, "learning_rate": 0.00010997191396664386, "loss": 2.4042, "step": 25776 }, { "crossentropy": 2.4867467880249023, "epoch": 0.9344910092807425, "grad_norm": 0.027839792892336845, "grad_norm_var": 4.4128832532783215e-07, "learning_rate": 0.00010985074455465926, "loss": 2.4889, "step": 25777 }, { "crossentropy": 2.339475393295288, "epoch": 0.9345272621809745, "grad_norm": 0.027115946635603905, "grad_norm_var": 4.911087802099308e-07, "learning_rate": 0.00010972964119156159, "loss": 2.4869, "step": 25778 }, { "crossentropy": 2.429150342941284, "epoch": 0.9345635150812065, "grad_norm": 0.027349023148417473, "grad_norm_var": 5.070125376506011e-07, "learning_rate": 0.00010960860387898674, "loss": 2.5344, "step": 25779 }, { "crossentropy": 2.1982932090759277, "epoch": 0.9345997679814385, "grad_norm": 0.025416649878025055, "grad_norm_var": 5.007471292274898e-07, "learning_rate": 0.00010948763261856843, "loss": 2.3292, "step": 25780 }, { "crossentropy": 2.425665855407715, "epoch": 0.9346360208816705, "grad_norm": 0.025037767365574837, "grad_norm_var": 6.104100616912782e-07, "learning_rate": 0.00010936672741194198, "loss": 2.3606, "step": 25781 }, { "crossentropy": 2.4217910766601562, "epoch": 0.9346722737819025, "grad_norm": 0.02735433354973793, "grad_norm_var": 6.66365387593061e-07, "learning_rate": 0.00010924588826073945, "loss": 2.4179, "step": 25782 }, { "crossentropy": 2.340761661529541, "epoch": 0.9347085266821346, "grad_norm": 0.026055769994854927, "grad_norm_var": 6.725878277686794e-07, "learning_rate": 0.00010912511516659285, "loss": 2.4511, "step": 25783 }, { "crossentropy": 2.463949203491211, "epoch": 0.9347447795823666, "grad_norm": 0.02699993923306465, "grad_norm_var": 6.968115789614144e-07, "learning_rate": 0.00010900440813113421, "loss": 2.4365, "step": 25784 }, { "crossentropy": 2.526487350463867, "epoch": 0.9347810324825986, "grad_norm": 0.02567281201481819, "grad_norm_var": 7.114635773261633e-07, "learning_rate": 0.00010888376715599279, "loss": 2.494, "step": 25785 }, { "crossentropy": 2.3964016437530518, "epoch": 0.9348172853828306, "grad_norm": 0.026736294850707054, "grad_norm_var": 6.42194449174691e-07, "learning_rate": 0.00010876319224279895, "loss": 2.4357, "step": 25786 }, { "crossentropy": 2.3963420391082764, "epoch": 0.9348535382830626, "grad_norm": 0.025744346901774406, "grad_norm_var": 6.763311416638768e-07, "learning_rate": 0.0001086426833931814, "loss": 2.4638, "step": 25787 }, { "crossentropy": 2.425428867340088, "epoch": 0.9348897911832946, "grad_norm": 0.027156634256243706, "grad_norm_var": 7.044591512843223e-07, "learning_rate": 0.00010852224060876658, "loss": 2.5351, "step": 25788 }, { "crossentropy": 2.515624761581421, "epoch": 0.9349260440835266, "grad_norm": 0.026839442551136017, "grad_norm_var": 7.114220481475719e-07, "learning_rate": 0.00010840186389118212, "loss": 2.5422, "step": 25789 }, { "crossentropy": 2.456446409225464, "epoch": 0.9349622969837587, "grad_norm": 0.025849049910902977, "grad_norm_var": 7.018432875963935e-07, "learning_rate": 0.00010828155324205447, "loss": 2.4575, "step": 25790 }, { "crossentropy": 2.2737579345703125, "epoch": 0.9349985498839907, "grad_norm": 0.025190798565745354, "grad_norm_var": 7.99389277083987e-07, "learning_rate": 0.00010816130866300677, "loss": 2.2631, "step": 25791 }, { "crossentropy": 2.3924779891967773, "epoch": 0.9350348027842227, "grad_norm": 0.026598036289215088, "grad_norm_var": 7.547338035903776e-07, "learning_rate": 0.00010804113015566496, "loss": 2.4539, "step": 25792 }, { "crossentropy": 2.6059811115264893, "epoch": 0.9350710556844548, "grad_norm": 0.026003552600741386, "grad_norm_var": 6.214802013379077e-07, "learning_rate": 0.00010792101772165108, "loss": 2.5172, "step": 25793 }, { "crossentropy": 2.3232665061950684, "epoch": 0.9351073085846868, "grad_norm": 0.026226740330457687, "grad_norm_var": 5.765330372218032e-07, "learning_rate": 0.00010780097136258826, "loss": 2.3762, "step": 25794 }, { "crossentropy": 2.436575412750244, "epoch": 0.9351435614849188, "grad_norm": 0.02598435804247856, "grad_norm_var": 4.955834840377736e-07, "learning_rate": 0.00010768099108009744, "loss": 2.4156, "step": 25795 }, { "crossentropy": 2.4576776027679443, "epoch": 0.9351798143851509, "grad_norm": 0.03392409905791283, "grad_norm_var": 4.15419361089094e-06, "learning_rate": 0.00010756107687579896, "loss": 2.4587, "step": 25796 }, { "crossentropy": 2.345459222793579, "epoch": 0.9352160672853829, "grad_norm": 0.02670266479253769, "grad_norm_var": 3.956029652993686e-06, "learning_rate": 0.00010744122875131213, "loss": 2.3919, "step": 25797 }, { "crossentropy": 2.3451805114746094, "epoch": 0.9352523201856149, "grad_norm": 0.02658396027982235, "grad_norm_var": 3.937716183433776e-06, "learning_rate": 0.00010732144670825783, "loss": 2.4439, "step": 25798 }, { "crossentropy": 2.4683780670166016, "epoch": 0.9352885730858469, "grad_norm": 0.025851009413599968, "grad_norm_var": 3.959748224236353e-06, "learning_rate": 0.0001072017307482509, "loss": 2.4303, "step": 25799 }, { "crossentropy": 2.5411503314971924, "epoch": 0.9353248259860789, "grad_norm": 0.025867091491818428, "grad_norm_var": 4.002806534167851e-06, "learning_rate": 0.00010708208087291004, "loss": 2.3549, "step": 25800 }, { "crossentropy": 2.513932228088379, "epoch": 0.9353610788863109, "grad_norm": 0.02728055790066719, "grad_norm_var": 3.9477706463453135e-06, "learning_rate": 0.0001069624970838512, "loss": 2.454, "step": 25801 }, { "crossentropy": 2.480052947998047, "epoch": 0.9353973317865429, "grad_norm": 0.027458572760224342, "grad_norm_var": 3.975814077490286e-06, "learning_rate": 0.00010684297938268917, "loss": 2.4187, "step": 25802 }, { "crossentropy": 2.3829262256622314, "epoch": 0.935433584686775, "grad_norm": 0.02622799016535282, "grad_norm_var": 3.920501247397127e-06, "learning_rate": 0.00010672352777103822, "loss": 2.5373, "step": 25803 }, { "crossentropy": 2.4758002758026123, "epoch": 0.935469837587007, "grad_norm": 0.02621513232588768, "grad_norm_var": 3.938544149487133e-06, "learning_rate": 0.00010660414225051207, "loss": 2.4625, "step": 25804 }, { "crossentropy": 2.495739221572876, "epoch": 0.935506090487239, "grad_norm": 0.02749461866915226, "grad_norm_var": 3.968801521495715e-06, "learning_rate": 0.00010648482282272221, "loss": 2.5153, "step": 25805 }, { "crossentropy": 2.4935243129730225, "epoch": 0.935542343387471, "grad_norm": 0.025820648297667503, "grad_norm_var": 3.972608863337977e-06, "learning_rate": 0.00010636556948928233, "loss": 2.4682, "step": 25806 }, { "crossentropy": 2.373991012573242, "epoch": 0.935578596287703, "grad_norm": 0.02778862789273262, "grad_norm_var": 3.823377986911005e-06, "learning_rate": 0.00010624638225180116, "loss": 2.4259, "step": 25807 }, { "crossentropy": 2.4127771854400635, "epoch": 0.935614849187935, "grad_norm": 0.026541436091065407, "grad_norm_var": 3.826624754062188e-06, "learning_rate": 0.00010612726111188854, "loss": 2.4333, "step": 25808 }, { "crossentropy": 2.368460178375244, "epoch": 0.935651102088167, "grad_norm": 0.026515744626522064, "grad_norm_var": 3.7750949142470804e-06, "learning_rate": 0.0001060082060711548, "loss": 2.3747, "step": 25809 }, { "crossentropy": 2.373361349105835, "epoch": 0.9356873549883991, "grad_norm": 0.025447389110922813, "grad_norm_var": 3.896547326180767e-06, "learning_rate": 0.00010588921713120648, "loss": 2.4036, "step": 25810 }, { "crossentropy": 2.6293845176696777, "epoch": 0.9357236078886311, "grad_norm": 0.02759539522230625, "grad_norm_var": 3.844572670438859e-06, "learning_rate": 0.00010577029429365226, "loss": 2.6, "step": 25811 }, { "crossentropy": 2.446859121322632, "epoch": 0.9357598607888631, "grad_norm": 0.026702910661697388, "grad_norm_var": 5.161022479283535e-07, "learning_rate": 0.00010565143756009643, "loss": 2.37, "step": 25812 }, { "crossentropy": 2.2226717472076416, "epoch": 0.9357961136890951, "grad_norm": 0.025749601423740387, "grad_norm_var": 5.637481753688333e-07, "learning_rate": 0.00010553264693214549, "loss": 2.2895, "step": 25813 }, { "crossentropy": 2.4024975299835205, "epoch": 0.9358323665893271, "grad_norm": 0.026961633935570717, "grad_norm_var": 5.73300897222367e-07, "learning_rate": 0.0001054139224114048, "loss": 2.4429, "step": 25814 }, { "crossentropy": 2.3406500816345215, "epoch": 0.9358686194895591, "grad_norm": 0.037353720515966415, "grad_norm_var": 7.701926267268315e-06, "learning_rate": 0.00010529526399947641, "loss": 2.3482, "step": 25815 }, { "crossentropy": 2.2715582847595215, "epoch": 0.9359048723897911, "grad_norm": 0.026509428396821022, "grad_norm_var": 7.603808874477808e-06, "learning_rate": 0.00010517667169796297, "loss": 2.2446, "step": 25816 }, { "crossentropy": 2.4524483680725098, "epoch": 0.9359411252900232, "grad_norm": 0.026266226544976234, "grad_norm_var": 7.678040743165406e-06, "learning_rate": 0.00010505814550846703, "loss": 2.4625, "step": 25817 }, { "crossentropy": 2.3058319091796875, "epoch": 0.9359773781902552, "grad_norm": 0.026743602007627487, "grad_norm_var": 7.693973825404265e-06, "learning_rate": 0.00010493968543258903, "loss": 2.3567, "step": 25818 }, { "crossentropy": 2.345872163772583, "epoch": 0.9360136310904872, "grad_norm": 0.026225190609693527, "grad_norm_var": 7.694354267752332e-06, "learning_rate": 0.00010482129147192932, "loss": 2.3069, "step": 25819 }, { "crossentropy": 2.333404779434204, "epoch": 0.9360498839907193, "grad_norm": 0.025607943534851074, "grad_norm_var": 7.800830413715042e-06, "learning_rate": 0.00010470296362808663, "loss": 2.4137, "step": 25820 }, { "crossentropy": 2.362987995147705, "epoch": 0.9360861368909513, "grad_norm": 0.024357913061976433, "grad_norm_var": 8.295789773542633e-06, "learning_rate": 0.00010458470190265912, "loss": 2.3414, "step": 25821 }, { "crossentropy": 2.3739705085754395, "epoch": 0.9361223897911833, "grad_norm": 0.02558613196015358, "grad_norm_var": 8.336470368330605e-06, "learning_rate": 0.00010446650629724497, "loss": 2.3576, "step": 25822 }, { "crossentropy": 2.415731430053711, "epoch": 0.9361586426914154, "grad_norm": 0.026700876653194427, "grad_norm_var": 8.295616096118498e-06, "learning_rate": 0.00010434837681343901, "loss": 2.3147, "step": 25823 }, { "crossentropy": 2.1677932739257812, "epoch": 0.9361948955916474, "grad_norm": 0.027553772553801537, "grad_norm_var": 8.307345334889364e-06, "learning_rate": 0.0001042303134528383, "loss": 2.2451, "step": 25824 }, { "crossentropy": 2.446856737136841, "epoch": 0.9362311484918794, "grad_norm": 0.02787667140364647, "grad_norm_var": 8.336620956993555e-06, "learning_rate": 0.00010411231621703655, "loss": 2.4206, "step": 25825 }, { "crossentropy": 2.4163408279418945, "epoch": 0.9362674013921114, "grad_norm": 0.026427147909998894, "grad_norm_var": 8.183680683334948e-06, "learning_rate": 0.00010399438510762749, "loss": 2.4004, "step": 25826 }, { "crossentropy": 2.428987741470337, "epoch": 0.9363036542923434, "grad_norm": 0.0270990002900362, "grad_norm_var": 8.168850077393133e-06, "learning_rate": 0.00010387652012620485, "loss": 2.4459, "step": 25827 }, { "crossentropy": 2.3510968685150146, "epoch": 0.9363399071925754, "grad_norm": 0.025790037587285042, "grad_norm_var": 8.270192293281103e-06, "learning_rate": 0.00010375872127435959, "loss": 2.339, "step": 25828 }, { "crossentropy": 2.495971441268921, "epoch": 0.9363761600928074, "grad_norm": 0.02528088539838791, "grad_norm_var": 8.36522698989331e-06, "learning_rate": 0.00010364098855368264, "loss": 2.3912, "step": 25829 }, { "crossentropy": 2.4100184440612793, "epoch": 0.9364124129930395, "grad_norm": 0.026722371578216553, "grad_norm_var": 8.370707108909741e-06, "learning_rate": 0.00010352332196576552, "loss": 2.3939, "step": 25830 }, { "crossentropy": 2.3759443759918213, "epoch": 0.9364486658932715, "grad_norm": 0.02688225544989109, "grad_norm_var": 7.769210427397213e-07, "learning_rate": 0.00010340572151219585, "loss": 2.4284, "step": 25831 }, { "crossentropy": 2.4455575942993164, "epoch": 0.9364849187935035, "grad_norm": 0.02686469443142414, "grad_norm_var": 7.922741446947726e-07, "learning_rate": 0.0001032881871945629, "loss": 2.3699, "step": 25832 }, { "crossentropy": 2.2443947792053223, "epoch": 0.9365211716937355, "grad_norm": 0.026368584483861923, "grad_norm_var": 7.914574889472756e-07, "learning_rate": 0.00010317071901445484, "loss": 2.3688, "step": 25833 }, { "crossentropy": 2.5258679389953613, "epoch": 0.9365574245939675, "grad_norm": 0.027219682931900024, "grad_norm_var": 8.286757505324237e-07, "learning_rate": 0.00010305331697345655, "loss": 2.484, "step": 25834 }, { "crossentropy": 2.4309144020080566, "epoch": 0.9365936774941995, "grad_norm": 0.02701997570693493, "grad_norm_var": 8.485505317877446e-07, "learning_rate": 0.00010293598107315505, "loss": 2.5107, "step": 25835 }, { "crossentropy": 2.274904489517212, "epoch": 0.9366299303944315, "grad_norm": 0.02692355588078499, "grad_norm_var": 8.072868353229771e-07, "learning_rate": 0.00010281871131513409, "loss": 2.4491, "step": 25836 }, { "crossentropy": 2.2883360385894775, "epoch": 0.9366661832946636, "grad_norm": 0.02642987295985222, "grad_norm_var": 4.7219482055871745e-07, "learning_rate": 0.00010270150770097797, "loss": 2.4151, "step": 25837 }, { "crossentropy": 2.439460277557373, "epoch": 0.9367024361948956, "grad_norm": 0.02548537217080593, "grad_norm_var": 4.874121552691143e-07, "learning_rate": 0.00010258437023227097, "loss": 2.436, "step": 25838 }, { "crossentropy": 2.4652822017669678, "epoch": 0.9367386890951276, "grad_norm": 0.02624249830842018, "grad_norm_var": 4.983695693710465e-07, "learning_rate": 0.00010246729891059348, "loss": 2.4211, "step": 25839 }, { "crossentropy": 2.4271886348724365, "epoch": 0.9367749419953596, "grad_norm": 0.02689494378864765, "grad_norm_var": 4.4493435057112637e-07, "learning_rate": 0.00010235029373752758, "loss": 2.4671, "step": 25840 }, { "crossentropy": 2.5278213024139404, "epoch": 0.9368111948955916, "grad_norm": 0.026287831366062164, "grad_norm_var": 3.312939982110773e-07, "learning_rate": 0.00010223335471465367, "loss": 2.4595, "step": 25841 }, { "crossentropy": 2.4764957427978516, "epoch": 0.9368474477958236, "grad_norm": 0.027356943115592003, "grad_norm_var": 3.767696646380635e-07, "learning_rate": 0.00010211648184355049, "loss": 2.5033, "step": 25842 }, { "crossentropy": 2.2853891849517822, "epoch": 0.9368837006960556, "grad_norm": 0.02567937597632408, "grad_norm_var": 3.9962186781174595e-07, "learning_rate": 0.00010199967512579733, "loss": 2.383, "step": 25843 }, { "crossentropy": 2.4351134300231934, "epoch": 0.9369199535962877, "grad_norm": 0.02605036459863186, "grad_norm_var": 3.8041010934779647e-07, "learning_rate": 0.00010188293456297127, "loss": 2.5267, "step": 25844 }, { "crossentropy": 2.442474842071533, "epoch": 0.9369562064965197, "grad_norm": 0.0253113005310297, "grad_norm_var": 3.7559769316449173e-07, "learning_rate": 0.0001017662601566488, "loss": 2.3813, "step": 25845 }, { "crossentropy": 2.5641744136810303, "epoch": 0.9369924593967517, "grad_norm": 0.026511751115322113, "grad_norm_var": 3.71668448197598e-07, "learning_rate": 0.00010164965190840758, "loss": 2.4958, "step": 25846 }, { "crossentropy": 2.3544886112213135, "epoch": 0.9370287122969838, "grad_norm": 0.0255932305008173, "grad_norm_var": 4.0475989639728267e-07, "learning_rate": 0.00010153310981982022, "loss": 2.3924, "step": 25847 }, { "crossentropy": 2.307035207748413, "epoch": 0.9370649651972158, "grad_norm": 0.025950482115149498, "grad_norm_var": 3.991333716357274e-07, "learning_rate": 0.0001014166338924627, "loss": 2.3505, "step": 25848 }, { "crossentropy": 2.4420435428619385, "epoch": 0.9371012180974478, "grad_norm": 0.026769470423460007, "grad_norm_var": 4.110872249003805e-07, "learning_rate": 0.00010130022412790707, "loss": 2.4825, "step": 25849 }, { "crossentropy": 2.338129997253418, "epoch": 0.9371374709976799, "grad_norm": 0.0259273499250412, "grad_norm_var": 3.6697798382404057e-07, "learning_rate": 0.00010118388052772598, "loss": 2.3846, "step": 25850 }, { "crossentropy": 2.3549485206604004, "epoch": 0.9371737238979119, "grad_norm": 0.025901127606630325, "grad_norm_var": 3.344014899440365e-07, "learning_rate": 0.00010106760309349095, "loss": 2.3685, "step": 25851 }, { "crossentropy": 2.3959081172943115, "epoch": 0.9372099767981439, "grad_norm": 0.02601231262087822, "grad_norm_var": 2.992647148255251e-07, "learning_rate": 0.00010095139182677182, "loss": 2.4356, "step": 25852 }, { "crossentropy": 2.468602180480957, "epoch": 0.9372462296983759, "grad_norm": 0.02779945731163025, "grad_norm_var": 4.675595010296957e-07, "learning_rate": 0.00010083524672913902, "loss": 2.4715, "step": 25853 }, { "crossentropy": 2.354177951812744, "epoch": 0.9372824825986079, "grad_norm": 0.026027081534266472, "grad_norm_var": 4.31693662083588e-07, "learning_rate": 0.00010071916780216128, "loss": 2.4044, "step": 25854 }, { "crossentropy": 2.3799455165863037, "epoch": 0.9373187354988399, "grad_norm": 0.026593083515763283, "grad_norm_var": 4.381030642095403e-07, "learning_rate": 0.00010060315504740569, "loss": 2.3872, "step": 25855 }, { "crossentropy": 2.3925254344940186, "epoch": 0.9373549883990719, "grad_norm": 0.025620710104703903, "grad_norm_var": 4.3708110815733557e-07, "learning_rate": 0.00010048720846643989, "loss": 2.423, "step": 25856 }, { "crossentropy": 2.497714042663574, "epoch": 0.937391241299304, "grad_norm": 0.02695835381746292, "grad_norm_var": 4.719613948556452e-07, "learning_rate": 0.00010037132806082982, "loss": 2.4808, "step": 25857 }, { "crossentropy": 2.3580117225646973, "epoch": 0.937427494199536, "grad_norm": 0.026383107528090477, "grad_norm_var": 3.8800923515869683e-07, "learning_rate": 0.00010025551383213982, "loss": 2.3779, "step": 25858 }, { "crossentropy": 2.322866439819336, "epoch": 0.937463747099768, "grad_norm": 0.02548232674598694, "grad_norm_var": 4.0393149087108947e-07, "learning_rate": 0.00010013976578193528, "loss": 2.3571, "step": 25859 }, { "crossentropy": 2.4679088592529297, "epoch": 0.9375, "grad_norm": 0.026416422799229622, "grad_norm_var": 4.0594407912559044e-07, "learning_rate": 0.00010002408391177831, "loss": 2.419, "step": 25860 }, { "crossentropy": 2.39898681640625, "epoch": 0.937536252900232, "grad_norm": 0.025510171428322792, "grad_norm_var": 3.847556644726805e-07, "learning_rate": 9.990846822323207e-05, "loss": 2.3989, "step": 25861 }, { "crossentropy": 2.4716482162475586, "epoch": 0.937572505800464, "grad_norm": 0.025763532146811485, "grad_norm_var": 3.902429995789047e-07, "learning_rate": 9.979291871785868e-05, "loss": 2.4299, "step": 25862 }, { "crossentropy": 2.619018316268921, "epoch": 0.937608758700696, "grad_norm": 0.027546800673007965, "grad_norm_var": 4.787274238216792e-07, "learning_rate": 9.967743539721797e-05, "loss": 2.4717, "step": 25863 }, { "crossentropy": 2.312892198562622, "epoch": 0.9376450116009281, "grad_norm": 0.03065256029367447, "grad_norm_var": 1.646861174643688e-06, "learning_rate": 9.956201826287036e-05, "loss": 2.3416, "step": 25864 }, { "crossentropy": 2.4171319007873535, "epoch": 0.9376812645011601, "grad_norm": 0.026003625243902206, "grad_norm_var": 1.6647065195815603e-06, "learning_rate": 9.944666731637408e-05, "loss": 2.4222, "step": 25865 }, { "crossentropy": 2.6498231887817383, "epoch": 0.9377175174013921, "grad_norm": 0.026743756607174873, "grad_norm_var": 1.6399600466796786e-06, "learning_rate": 9.933138255928731e-05, "loss": 2.5245, "step": 25866 }, { "crossentropy": 2.254528045654297, "epoch": 0.9377537703016241, "grad_norm": 0.02559550665318966, "grad_norm_var": 1.673803862694591e-06, "learning_rate": 9.921616399316769e-05, "loss": 2.3694, "step": 25867 }, { "crossentropy": 2.3986620903015137, "epoch": 0.9377900232018561, "grad_norm": 0.026531009003520012, "grad_norm_var": 1.6520982162765517e-06, "learning_rate": 9.910101161957063e-05, "loss": 2.4663, "step": 25868 }, { "crossentropy": 2.4016411304473877, "epoch": 0.9378262761020881, "grad_norm": 0.026523014530539513, "grad_norm_var": 1.5500839442086333e-06, "learning_rate": 9.898592544005158e-05, "loss": 2.4155, "step": 25869 }, { "crossentropy": 2.4625210762023926, "epoch": 0.9378625290023201, "grad_norm": 0.02548290230333805, "grad_norm_var": 1.6044977964297287e-06, "learning_rate": 9.887090545616594e-05, "loss": 2.4161, "step": 25870 }, { "crossentropy": 2.470895290374756, "epoch": 0.9378987819025522, "grad_norm": 0.02718781679868698, "grad_norm_var": 1.6349429488874868e-06, "learning_rate": 9.875595166946528e-05, "loss": 2.4653, "step": 25871 }, { "crossentropy": 2.303928852081299, "epoch": 0.9379350348027842, "grad_norm": 0.025958789512515068, "grad_norm_var": 1.6013190939109337e-06, "learning_rate": 9.864106408150442e-05, "loss": 2.3861, "step": 25872 }, { "crossentropy": 2.365494728088379, "epoch": 0.9379712877030162, "grad_norm": 0.02699287422001362, "grad_norm_var": 1.6032904585979208e-06, "learning_rate": 9.852624269383326e-05, "loss": 2.3587, "step": 25873 }, { "crossentropy": 2.351935625076294, "epoch": 0.9380075406032483, "grad_norm": 0.026899494230747223, "grad_norm_var": 1.6085765553526415e-06, "learning_rate": 9.841148750800444e-05, "loss": 2.4063, "step": 25874 }, { "crossentropy": 2.564253330230713, "epoch": 0.9380437935034803, "grad_norm": 0.028224315494298935, "grad_norm_var": 1.6769329720218532e-06, "learning_rate": 9.829679852556672e-05, "loss": 2.3942, "step": 25875 }, { "crossentropy": 2.336678981781006, "epoch": 0.9380800464037123, "grad_norm": 0.025264272466301918, "grad_norm_var": 1.811455689210382e-06, "learning_rate": 9.818217574806831e-05, "loss": 2.3843, "step": 25876 }, { "crossentropy": 2.4323596954345703, "epoch": 0.9381162993039444, "grad_norm": 0.026631029322743416, "grad_norm_var": 1.715143503656924e-06, "learning_rate": 9.806761917705909e-05, "loss": 2.4192, "step": 25877 }, { "crossentropy": 2.359609603881836, "epoch": 0.9381525522041764, "grad_norm": 0.025511791929602623, "grad_norm_var": 1.7522182017388569e-06, "learning_rate": 9.795312881408558e-05, "loss": 2.353, "step": 25878 }, { "crossentropy": 2.3538026809692383, "epoch": 0.9381888051044084, "grad_norm": 0.025872409343719482, "grad_norm_var": 1.7460603971843994e-06, "learning_rate": 9.783870466069433e-05, "loss": 2.4282, "step": 25879 }, { "crossentropy": 2.497525215148926, "epoch": 0.9382250580046404, "grad_norm": 0.025506723672151566, "grad_norm_var": 6.409052717888548e-07, "learning_rate": 9.772434671843078e-05, "loss": 2.4464, "step": 25880 }, { "crossentropy": 2.436893939971924, "epoch": 0.9382613109048724, "grad_norm": 0.025456001982092857, "grad_norm_var": 6.81878909231552e-07, "learning_rate": 9.761005498883869e-05, "loss": 2.355, "step": 25881 }, { "crossentropy": 2.4377901554107666, "epoch": 0.9382975638051044, "grad_norm": 0.027219193056225777, "grad_norm_var": 7.257940667868782e-07, "learning_rate": 9.749582947346291e-05, "loss": 2.4347, "step": 25882 }, { "crossentropy": 2.5025529861450195, "epoch": 0.9383338167053364, "grad_norm": 0.02673708088696003, "grad_norm_var": 6.994690670492625e-07, "learning_rate": 9.738167017384614e-05, "loss": 2.464, "step": 25883 }, { "crossentropy": 2.38885235786438, "epoch": 0.9383700696055685, "grad_norm": 0.026355018839240074, "grad_norm_var": 6.977421649250696e-07, "learning_rate": 9.726757709152934e-05, "loss": 2.4086, "step": 25884 }, { "crossentropy": 2.473787784576416, "epoch": 0.9384063225058005, "grad_norm": 0.027125436812639236, "grad_norm_var": 7.332031050332385e-07, "learning_rate": 9.715355022805406e-05, "loss": 2.5226, "step": 25885 }, { "crossentropy": 2.4198405742645264, "epoch": 0.9384425754060325, "grad_norm": 0.026707079261541367, "grad_norm_var": 6.769176231296992e-07, "learning_rate": 9.703958958496073e-05, "loss": 2.3939, "step": 25886 }, { "crossentropy": 2.424701452255249, "epoch": 0.9384788283062645, "grad_norm": 0.025454986840486526, "grad_norm_var": 7.006066178413672e-07, "learning_rate": 9.692569516378813e-05, "loss": 2.4786, "step": 25887 }, { "crossentropy": 2.452270030975342, "epoch": 0.9385150812064965, "grad_norm": 0.027645356953144073, "grad_norm_var": 7.859664682888697e-07, "learning_rate": 9.681186696607503e-05, "loss": 2.3778, "step": 25888 }, { "crossentropy": 2.4090161323547363, "epoch": 0.9385513341067285, "grad_norm": 0.02627737633883953, "grad_norm_var": 7.685757669951993e-07, "learning_rate": 9.669810499335796e-05, "loss": 2.4857, "step": 25889 }, { "crossentropy": 2.426124095916748, "epoch": 0.9385875870069605, "grad_norm": 0.026652509346604347, "grad_norm_var": 7.569428743908382e-07, "learning_rate": 9.658440924717348e-05, "loss": 2.4242, "step": 25890 }, { "crossentropy": 2.4740071296691895, "epoch": 0.9386238399071926, "grad_norm": 0.026297204196453094, "grad_norm_var": 5.241617910771753e-07, "learning_rate": 9.647077972905926e-05, "loss": 2.4751, "step": 25891 }, { "crossentropy": 2.324265718460083, "epoch": 0.9386600928074246, "grad_norm": 0.02590775303542614, "grad_norm_var": 4.61642251126668e-07, "learning_rate": 9.635721644054795e-05, "loss": 2.3791, "step": 25892 }, { "crossentropy": 2.500507116317749, "epoch": 0.9386963457076566, "grad_norm": 0.026606660336256027, "grad_norm_var": 4.6071688964421933e-07, "learning_rate": 9.624371938317333e-05, "loss": 2.4951, "step": 25893 }, { "crossentropy": 2.3979880809783936, "epoch": 0.9387325986078886, "grad_norm": 0.02689298428595066, "grad_norm_var": 4.286620705668334e-07, "learning_rate": 9.613028855846972e-05, "loss": 2.382, "step": 25894 }, { "crossentropy": 2.486194610595703, "epoch": 0.9387688515081206, "grad_norm": 0.026281196624040604, "grad_norm_var": 4.0928105312064886e-07, "learning_rate": 9.601692396796813e-05, "loss": 2.4386, "step": 25895 }, { "crossentropy": 2.4250574111938477, "epoch": 0.9388051044083526, "grad_norm": 0.025037378072738647, "grad_norm_var": 4.817756882782274e-07, "learning_rate": 9.59036256132001e-05, "loss": 2.409, "step": 25896 }, { "crossentropy": 2.4683074951171875, "epoch": 0.9388413573085846, "grad_norm": 0.02662346139550209, "grad_norm_var": 4.175533611268234e-07, "learning_rate": 9.579039349569552e-05, "loss": 2.4673, "step": 25897 }, { "crossentropy": 2.3745181560516357, "epoch": 0.9388776102088167, "grad_norm": 0.026776371523737907, "grad_norm_var": 3.8668408344071035e-07, "learning_rate": 9.567722761698427e-05, "loss": 2.3223, "step": 25898 }, { "crossentropy": 2.6245949268341064, "epoch": 0.9389138631090487, "grad_norm": 0.026632610708475113, "grad_norm_var": 3.8352219502296725e-07, "learning_rate": 9.55641279785957e-05, "loss": 2.5033, "step": 25899 }, { "crossentropy": 2.491827964782715, "epoch": 0.9389501160092807, "grad_norm": 0.0263777207583189, "grad_norm_var": 3.83253022388554e-07, "learning_rate": 9.545109458205526e-05, "loss": 2.5104, "step": 25900 }, { "crossentropy": 2.465592622756958, "epoch": 0.9389863689095128, "grad_norm": 0.026699654757976532, "grad_norm_var": 3.565794191107488e-07, "learning_rate": 9.533812742889059e-05, "loss": 2.4262, "step": 25901 }, { "crossentropy": 2.4928669929504395, "epoch": 0.9390226218097448, "grad_norm": 0.025830915197730064, "grad_norm_var": 3.721186759984002e-07, "learning_rate": 9.52252265206277e-05, "loss": 2.427, "step": 25902 }, { "crossentropy": 2.2838821411132812, "epoch": 0.9390588747099768, "grad_norm": 0.02607950195670128, "grad_norm_var": 3.1991708350335005e-07, "learning_rate": 9.51123918587915e-05, "loss": 2.3101, "step": 25903 }, { "crossentropy": 2.419443368911743, "epoch": 0.9390951276102089, "grad_norm": 0.026126466691493988, "grad_norm_var": 2.1466585115914999e-07, "learning_rate": 9.499962344490631e-05, "loss": 2.4203, "step": 25904 }, { "crossentropy": 2.4697909355163574, "epoch": 0.9391313805104409, "grad_norm": 0.025544052943587303, "grad_norm_var": 2.5231998640010075e-07, "learning_rate": 9.488692128049425e-05, "loss": 2.3997, "step": 25905 }, { "crossentropy": 2.4772231578826904, "epoch": 0.9391676334106729, "grad_norm": 0.026244189590215683, "grad_norm_var": 2.4207350991257516e-07, "learning_rate": 9.4774285367078e-05, "loss": 2.4445, "step": 25906 }, { "crossentropy": 2.2093677520751953, "epoch": 0.9392038863109049, "grad_norm": 0.025667330250144005, "grad_norm_var": 2.626856599261072e-07, "learning_rate": 9.466171570617966e-05, "loss": 2.3149, "step": 25907 }, { "crossentropy": 2.3823673725128174, "epoch": 0.9392401392111369, "grad_norm": 0.02700788713991642, "grad_norm_var": 2.9428523556454015e-07, "learning_rate": 9.454921229931857e-05, "loss": 2.4033, "step": 25908 }, { "crossentropy": 2.4910037517547607, "epoch": 0.9392763921113689, "grad_norm": 0.026547223329544067, "grad_norm_var": 2.9189170453202255e-07, "learning_rate": 9.443677514801407e-05, "loss": 2.4057, "step": 25909 }, { "crossentropy": 2.4159984588623047, "epoch": 0.9393126450116009, "grad_norm": 0.025936847552657127, "grad_norm_var": 2.6999794339222354e-07, "learning_rate": 9.432440425378663e-05, "loss": 2.4824, "step": 25910 }, { "crossentropy": 2.470452070236206, "epoch": 0.939348897911833, "grad_norm": 0.026099862530827522, "grad_norm_var": 2.7041148819388165e-07, "learning_rate": 9.42120996181517e-05, "loss": 2.4696, "step": 25911 }, { "crossentropy": 2.3911056518554688, "epoch": 0.939385150812065, "grad_norm": 0.027230897918343544, "grad_norm_var": 2.3052550320854828e-07, "learning_rate": 9.409986124262749e-05, "loss": 2.4251, "step": 25912 }, { "crossentropy": 2.5706729888916016, "epoch": 0.939421403712297, "grad_norm": 0.02554466761648655, "grad_norm_var": 2.6235500134539717e-07, "learning_rate": 9.398768912872946e-05, "loss": 2.5053, "step": 25913 }, { "crossentropy": 2.148420572280884, "epoch": 0.939457656612529, "grad_norm": 0.026927141472697258, "grad_norm_var": 2.739222207536381e-07, "learning_rate": 9.387558327797307e-05, "loss": 2.2922, "step": 25914 }, { "crossentropy": 2.336172103881836, "epoch": 0.939493909512761, "grad_norm": 0.02610212005674839, "grad_norm_var": 2.666451222588789e-07, "learning_rate": 9.376354369187213e-05, "loss": 2.3208, "step": 25915 }, { "crossentropy": 2.2777011394500732, "epoch": 0.939530162412993, "grad_norm": 0.02556445449590683, "grad_norm_var": 2.9390610962789127e-07, "learning_rate": 9.365157037194038e-05, "loss": 2.3194, "step": 25916 }, { "crossentropy": 2.4882736206054688, "epoch": 0.939566415313225, "grad_norm": 0.027781566604971886, "grad_norm_var": 4.3956391492445897e-07, "learning_rate": 9.353966331968945e-05, "loss": 2.4404, "step": 25917 }, { "crossentropy": 2.4382412433624268, "epoch": 0.9396026682134571, "grad_norm": 0.026119327172636986, "grad_norm_var": 4.2808177184900227e-07, "learning_rate": 9.34278225366314e-05, "loss": 2.4398, "step": 25918 }, { "crossentropy": 2.397334575653076, "epoch": 0.9396389211136891, "grad_norm": 0.026904648169875145, "grad_norm_var": 4.4827784828592917e-07, "learning_rate": 9.331604802427618e-05, "loss": 2.4636, "step": 25919 }, { "crossentropy": 2.315108060836792, "epoch": 0.9396751740139211, "grad_norm": 0.027212021872401237, "grad_norm_var": 4.918488433857618e-07, "learning_rate": 9.320433978413423e-05, "loss": 2.412, "step": 25920 }, { "crossentropy": 2.4487814903259277, "epoch": 0.9397114269141531, "grad_norm": 0.02671809121966362, "grad_norm_var": 4.436731165864567e-07, "learning_rate": 9.309269781771379e-05, "loss": 2.4777, "step": 25921 }, { "crossentropy": 2.291139602661133, "epoch": 0.9397476798143851, "grad_norm": 0.025075530633330345, "grad_norm_var": 5.650791133127019e-07, "learning_rate": 9.29811221265231e-05, "loss": 2.2856, "step": 25922 }, { "crossentropy": 2.4377269744873047, "epoch": 0.9397839327146171, "grad_norm": 0.027357324957847595, "grad_norm_var": 5.779318930881442e-07, "learning_rate": 9.286961271206928e-05, "loss": 2.5105, "step": 25923 }, { "crossentropy": 2.425584077835083, "epoch": 0.9398201856148491, "grad_norm": 0.02588871866464615, "grad_norm_var": 5.816361735162436e-07, "learning_rate": 9.275816957585781e-05, "loss": 2.3984, "step": 25924 }, { "crossentropy": 2.3096795082092285, "epoch": 0.9398564385150812, "grad_norm": 0.0261513851583004, "grad_norm_var": 5.856725923497936e-07, "learning_rate": 9.264679271939414e-05, "loss": 2.3745, "step": 25925 }, { "crossentropy": 2.394789695739746, "epoch": 0.9398926914153132, "grad_norm": 0.02584364078938961, "grad_norm_var": 5.921381093778084e-07, "learning_rate": 9.253548214418317e-05, "loss": 2.3807, "step": 25926 }, { "crossentropy": 2.3309900760650635, "epoch": 0.9399289443155452, "grad_norm": 0.026437493041157722, "grad_norm_var": 5.854097803944935e-07, "learning_rate": 9.242423785172759e-05, "loss": 2.371, "step": 25927 }, { "crossentropy": 2.3611104488372803, "epoch": 0.9399651972157773, "grad_norm": 0.026811793446540833, "grad_norm_var": 5.515599230327313e-07, "learning_rate": 9.231305984353011e-05, "loss": 2.4938, "step": 25928 }, { "crossentropy": 2.5289578437805176, "epoch": 0.9400014501160093, "grad_norm": 0.025800099596381187, "grad_norm_var": 5.264222155215136e-07, "learning_rate": 9.220194812109228e-05, "loss": 2.5226, "step": 25929 }, { "crossentropy": 2.483350992202759, "epoch": 0.9400377030162413, "grad_norm": 0.026612158864736557, "grad_norm_var": 5.112596440273587e-07, "learning_rate": 9.209090268591513e-05, "loss": 2.4711, "step": 25930 }, { "crossentropy": 2.2400975227355957, "epoch": 0.9400739559164734, "grad_norm": 0.02592446282505989, "grad_norm_var": 5.202592901833979e-07, "learning_rate": 9.197992353949913e-05, "loss": 2.3596, "step": 25931 }, { "crossentropy": 2.562750816345215, "epoch": 0.9401102088167054, "grad_norm": 0.027991004288196564, "grad_norm_var": 6.219252124849949e-07, "learning_rate": 9.186901068334142e-05, "loss": 2.5987, "step": 25932 }, { "crossentropy": 2.301798105239868, "epoch": 0.9401464617169374, "grad_norm": 0.026767896488308907, "grad_norm_var": 5.182497994321115e-07, "learning_rate": 9.175816411894134e-05, "loss": 2.2745, "step": 25933 }, { "crossentropy": 2.335369110107422, "epoch": 0.9401827146171694, "grad_norm": 0.025713732466101646, "grad_norm_var": 5.478187443358022e-07, "learning_rate": 9.164738384779658e-05, "loss": 2.4586, "step": 25934 }, { "crossentropy": 2.4620766639709473, "epoch": 0.9402189675174014, "grad_norm": 0.0269818976521492, "grad_norm_var": 5.528681177034786e-07, "learning_rate": 9.153666987140153e-05, "loss": 2.4929, "step": 25935 }, { "crossentropy": 2.408468723297119, "epoch": 0.9402552204176334, "grad_norm": 0.025775449350476265, "grad_norm_var": 5.36936479594606e-07, "learning_rate": 9.142602219125329e-05, "loss": 2.4317, "step": 25936 }, { "crossentropy": 2.4107978343963623, "epoch": 0.9402914733178654, "grad_norm": 0.0266063641756773, "grad_norm_var": 5.32466627623888e-07, "learning_rate": 9.131544080884513e-05, "loss": 2.3825, "step": 25937 }, { "crossentropy": 2.5148723125457764, "epoch": 0.9403277262180975, "grad_norm": 0.026380077004432678, "grad_norm_var": 4.1564053773050257e-07, "learning_rate": 9.120492572567085e-05, "loss": 2.4593, "step": 25938 }, { "crossentropy": 2.498605489730835, "epoch": 0.9403639791183295, "grad_norm": 0.02668740786612034, "grad_norm_var": 3.617718212467827e-07, "learning_rate": 9.109447694322426e-05, "loss": 2.5332, "step": 25939 }, { "crossentropy": 2.4436895847320557, "epoch": 0.9404002320185615, "grad_norm": 0.026123005896806717, "grad_norm_var": 3.492824983852838e-07, "learning_rate": 9.098409446299582e-05, "loss": 2.396, "step": 25940 }, { "crossentropy": 2.4305224418640137, "epoch": 0.9404364849187935, "grad_norm": 0.02531866356730461, "grad_norm_var": 4.216676436732033e-07, "learning_rate": 9.087377828647713e-05, "loss": 2.4097, "step": 25941 }, { "crossentropy": 2.5007758140563965, "epoch": 0.9404727378190255, "grad_norm": 0.02675769291818142, "grad_norm_var": 4.1083990614162086e-07, "learning_rate": 9.076352841515812e-05, "loss": 2.3948, "step": 25942 }, { "crossentropy": 2.4068286418914795, "epoch": 0.9405089907192575, "grad_norm": 0.026522448286414146, "grad_norm_var": 4.115109488985723e-07, "learning_rate": 9.065334485052701e-05, "loss": 2.2968, "step": 25943 }, { "crossentropy": 2.3670706748962402, "epoch": 0.9405452436194895, "grad_norm": 0.026061058044433594, "grad_norm_var": 4.0785721393907933e-07, "learning_rate": 9.054322759407319e-05, "loss": 2.3621, "step": 25944 }, { "crossentropy": 2.407809019088745, "epoch": 0.9405814965197216, "grad_norm": 0.025607626885175705, "grad_norm_var": 4.2496382134161496e-07, "learning_rate": 9.043317664728323e-05, "loss": 2.4199, "step": 25945 }, { "crossentropy": 2.5114078521728516, "epoch": 0.9406177494199536, "grad_norm": 0.026126280426979065, "grad_norm_var": 4.236701417673853e-07, "learning_rate": 9.032319201164374e-05, "loss": 2.4349, "step": 25946 }, { "crossentropy": 2.343308448791504, "epoch": 0.9406540023201856, "grad_norm": 0.026263967156410217, "grad_norm_var": 4.123324499166371e-07, "learning_rate": 9.021327368864019e-05, "loss": 2.2773, "step": 25947 }, { "crossentropy": 2.4903035163879395, "epoch": 0.9406902552204176, "grad_norm": 0.026182636618614197, "grad_norm_var": 2.223221560200868e-07, "learning_rate": 9.010342167975749e-05, "loss": 2.4481, "step": 25948 }, { "crossentropy": 2.351074457168579, "epoch": 0.9407265081206496, "grad_norm": 0.02518612891435623, "grad_norm_var": 2.6783907066697345e-07, "learning_rate": 8.999363598647891e-05, "loss": 2.3476, "step": 25949 }, { "crossentropy": 2.5097954273223877, "epoch": 0.9407627610208816, "grad_norm": 0.02630038559436798, "grad_norm_var": 2.557402332493819e-07, "learning_rate": 8.988391661028772e-05, "loss": 2.4022, "step": 25950 }, { "crossentropy": 2.5768001079559326, "epoch": 0.9407990139211136, "grad_norm": 0.027299189940094948, "grad_norm_var": 2.959542920806408e-07, "learning_rate": 8.977426355266549e-05, "loss": 2.4821, "step": 25951 }, { "crossentropy": 2.4207870960235596, "epoch": 0.9408352668213457, "grad_norm": 0.026530617848038673, "grad_norm_var": 2.888592996181175e-07, "learning_rate": 8.966467681509327e-05, "loss": 2.4241, "step": 25952 }, { "crossentropy": 2.4475674629211426, "epoch": 0.9408715197215777, "grad_norm": 0.024852823466062546, "grad_norm_var": 3.9704223511100904e-07, "learning_rate": 8.955515639905154e-05, "loss": 2.3751, "step": 25953 }, { "crossentropy": 2.38606858253479, "epoch": 0.9409077726218097, "grad_norm": 0.02577727846801281, "grad_norm_var": 4.0025599214385534e-07, "learning_rate": 8.944570230601857e-05, "loss": 2.3942, "step": 25954 }, { "crossentropy": 2.454483985900879, "epoch": 0.9409440255220418, "grad_norm": 0.027637019753456116, "grad_norm_var": 5.310128256254121e-07, "learning_rate": 8.933631453747426e-05, "loss": 2.4825, "step": 25955 }, { "crossentropy": 2.3768746852874756, "epoch": 0.9409802784222738, "grad_norm": 0.026399293914437294, "grad_norm_var": 5.344513006802845e-07, "learning_rate": 8.922699309489468e-05, "loss": 2.3283, "step": 25956 }, { "crossentropy": 2.54099440574646, "epoch": 0.9410165313225058, "grad_norm": 0.026064248755574226, "grad_norm_var": 4.839217137091265e-07, "learning_rate": 8.911773797975752e-05, "loss": 2.5373, "step": 25957 }, { "crossentropy": 2.3752706050872803, "epoch": 0.9410527842227379, "grad_norm": 0.02636910043656826, "grad_norm_var": 4.65658041654326e-07, "learning_rate": 8.900854919353773e-05, "loss": 2.416, "step": 25958 }, { "crossentropy": 2.482893943786621, "epoch": 0.9410890371229699, "grad_norm": 0.027323966845870018, "grad_norm_var": 5.404027015738402e-07, "learning_rate": 8.889942673770968e-05, "loss": 2.5059, "step": 25959 }, { "crossentropy": 2.3172926902770996, "epoch": 0.9411252900232019, "grad_norm": 0.026848865672945976, "grad_norm_var": 5.59466745826918e-07, "learning_rate": 8.879037061374828e-05, "loss": 2.3195, "step": 25960 }, { "crossentropy": 2.317697286605835, "epoch": 0.9411615429234339, "grad_norm": 0.026811787858605385, "grad_norm_var": 5.392349024386691e-07, "learning_rate": 8.868138082312515e-05, "loss": 2.4059, "step": 25961 }, { "crossentropy": 2.3926353454589844, "epoch": 0.9411977958236659, "grad_norm": 0.025760889053344727, "grad_norm_var": 5.596162509935992e-07, "learning_rate": 8.857245736731357e-05, "loss": 2.395, "step": 25962 }, { "crossentropy": 2.3622288703918457, "epoch": 0.9412340487238979, "grad_norm": 0.026587827131152153, "grad_norm_var": 5.624344373566491e-07, "learning_rate": 8.846360024778455e-05, "loss": 2.4226, "step": 25963 }, { "crossentropy": 2.3671035766601562, "epoch": 0.94127030162413, "grad_norm": 0.02514360100030899, "grad_norm_var": 6.55970514870594e-07, "learning_rate": 8.835480946600805e-05, "loss": 2.3649, "step": 25964 }, { "crossentropy": 2.516578435897827, "epoch": 0.941306554524362, "grad_norm": 0.026766372844576836, "grad_norm_var": 5.761269448266756e-07, "learning_rate": 8.824608502345344e-05, "loss": 2.4545, "step": 25965 }, { "crossentropy": 2.5925662517547607, "epoch": 0.941342807424594, "grad_norm": 0.025550538673996925, "grad_norm_var": 6.216860876280424e-07, "learning_rate": 8.813742692158955e-05, "loss": 2.4726, "step": 25966 }, { "crossentropy": 2.3621132373809814, "epoch": 0.941379060324826, "grad_norm": 0.02701379545032978, "grad_norm_var": 5.909511019210866e-07, "learning_rate": 8.802883516188353e-05, "loss": 2.4111, "step": 25967 }, { "crossentropy": 2.3628361225128174, "epoch": 0.941415313225058, "grad_norm": 0.026576612144708633, "grad_norm_var": 5.922530531325543e-07, "learning_rate": 8.7920309745802e-05, "loss": 2.3785, "step": 25968 }, { "crossentropy": 2.321608066558838, "epoch": 0.94145156612529, "grad_norm": 0.02623726986348629, "grad_norm_var": 4.3701625499654543e-07, "learning_rate": 8.7811850674811e-05, "loss": 2.404, "step": 25969 }, { "crossentropy": 2.50321888923645, "epoch": 0.941487819025522, "grad_norm": 0.027054263278841972, "grad_norm_var": 4.2792171498472663e-07, "learning_rate": 8.770345795037604e-05, "loss": 2.5027, "step": 25970 }, { "crossentropy": 2.4042506217956543, "epoch": 0.941524071925754, "grad_norm": 0.027017036452889442, "grad_norm_var": 3.5870580478435054e-07, "learning_rate": 8.759513157396038e-05, "loss": 2.4888, "step": 25971 }, { "crossentropy": 2.24662446975708, "epoch": 0.9415603248259861, "grad_norm": 0.02602371759712696, "grad_norm_var": 3.7107975848665815e-07, "learning_rate": 8.748687154702672e-05, "loss": 2.4008, "step": 25972 }, { "crossentropy": 2.3587045669555664, "epoch": 0.9415965777262181, "grad_norm": 0.026561541482806206, "grad_norm_var": 3.611661553774009e-07, "learning_rate": 8.737867787103893e-05, "loss": 2.3531, "step": 25973 }, { "crossentropy": 2.2830772399902344, "epoch": 0.9416328306264501, "grad_norm": 0.026627371087670326, "grad_norm_var": 3.615868160265391e-07, "learning_rate": 8.727055054745636e-05, "loss": 2.4032, "step": 25974 }, { "crossentropy": 2.4494619369506836, "epoch": 0.9416690835266821, "grad_norm": 0.027597032487392426, "grad_norm_var": 3.9646186184612526e-07, "learning_rate": 8.716248957774064e-05, "loss": 2.3998, "step": 25975 }, { "crossentropy": 2.4334328174591064, "epoch": 0.9417053364269141, "grad_norm": 0.026080170646309853, "grad_norm_var": 3.9878001552006344e-07, "learning_rate": 8.705449496335116e-05, "loss": 2.4138, "step": 25976 }, { "crossentropy": 2.4480206966400146, "epoch": 0.9417415893271461, "grad_norm": 0.026013804599642754, "grad_norm_var": 4.014804956574298e-07, "learning_rate": 8.694656670574619e-05, "loss": 2.4513, "step": 25977 }, { "crossentropy": 2.458230972290039, "epoch": 0.9417778422273781, "grad_norm": 0.026824738830327988, "grad_norm_var": 3.7968269588420367e-07, "learning_rate": 8.683870480638345e-05, "loss": 2.3395, "step": 25978 }, { "crossentropy": 2.3880817890167236, "epoch": 0.9418140951276102, "grad_norm": 0.02624879777431488, "grad_norm_var": 3.8198012733031635e-07, "learning_rate": 8.673090926672066e-05, "loss": 2.3791, "step": 25979 }, { "crossentropy": 2.5241353511810303, "epoch": 0.9418503480278422, "grad_norm": 0.026625365018844604, "grad_norm_var": 2.5941579339867847e-07, "learning_rate": 8.662318008821223e-05, "loss": 2.501, "step": 25980 }, { "crossentropy": 2.3906314373016357, "epoch": 0.9418866009280742, "grad_norm": 0.02622894197702408, "grad_norm_var": 2.6204559592090995e-07, "learning_rate": 8.651551727231477e-05, "loss": 2.4278, "step": 25981 }, { "crossentropy": 2.572084903717041, "epoch": 0.9419228538283063, "grad_norm": 0.028034303337335587, "grad_norm_var": 3.273656341891967e-07, "learning_rate": 8.640792082048154e-05, "loss": 2.4788, "step": 25982 }, { "crossentropy": 2.3810930252075195, "epoch": 0.9419591067285383, "grad_norm": 0.025307822972536087, "grad_norm_var": 4.316976282057091e-07, "learning_rate": 8.630039073416584e-05, "loss": 2.3846, "step": 25983 }, { "crossentropy": 2.400480270385742, "epoch": 0.9419953596287703, "grad_norm": 0.02660818211734295, "grad_norm_var": 4.3180385578589903e-07, "learning_rate": 8.619292701482039e-05, "loss": 2.4449, "step": 25984 }, { "crossentropy": 2.3626086711883545, "epoch": 0.9420316125290024, "grad_norm": 0.026150427758693695, "grad_norm_var": 4.3610641710696796e-07, "learning_rate": 8.608552966389627e-05, "loss": 2.3573, "step": 25985 }, { "crossentropy": 2.418642520904541, "epoch": 0.9420678654292344, "grad_norm": 0.02620372362434864, "grad_norm_var": 4.2557639856495116e-07, "learning_rate": 8.597819868284395e-05, "loss": 2.4315, "step": 25986 }, { "crossentropy": 2.4196646213531494, "epoch": 0.9421041183294664, "grad_norm": 0.025491148233413696, "grad_norm_var": 4.678505742841995e-07, "learning_rate": 8.587093407311397e-05, "loss": 2.3795, "step": 25987 }, { "crossentropy": 2.3333866596221924, "epoch": 0.9421403712296984, "grad_norm": 0.026022326201200485, "grad_norm_var": 4.6792313608568873e-07, "learning_rate": 8.576373583615348e-05, "loss": 2.2859, "step": 25988 }, { "crossentropy": 2.449695587158203, "epoch": 0.9421766241299304, "grad_norm": 0.02645772136747837, "grad_norm_var": 4.6655589895633113e-07, "learning_rate": 8.565660397341246e-05, "loss": 2.4274, "step": 25989 }, { "crossentropy": 2.3825883865356445, "epoch": 0.9422128770301624, "grad_norm": 0.02657247520983219, "grad_norm_var": 4.6513576991419107e-07, "learning_rate": 8.554953848633585e-05, "loss": 2.4475, "step": 25990 }, { "crossentropy": 2.3827621936798096, "epoch": 0.9422491299303944, "grad_norm": 0.02601846680045128, "grad_norm_var": 3.698128313395684e-07, "learning_rate": 8.544253937637137e-05, "loss": 2.4501, "step": 25991 }, { "crossentropy": 2.3247556686401367, "epoch": 0.9422853828306265, "grad_norm": 0.026640845462679863, "grad_norm_var": 3.7261328521501905e-07, "learning_rate": 8.533560664496398e-05, "loss": 2.3515, "step": 25992 }, { "crossentropy": 2.2482213973999023, "epoch": 0.9423216357308585, "grad_norm": 0.02607634663581848, "grad_norm_var": 3.7013289285827736e-07, "learning_rate": 8.522874029355643e-05, "loss": 2.3222, "step": 25993 }, { "crossentropy": 2.440255880355835, "epoch": 0.9423578886310905, "grad_norm": 0.02618531882762909, "grad_norm_var": 3.54741318362101e-07, "learning_rate": 8.51219403235931e-05, "loss": 2.489, "step": 25994 }, { "crossentropy": 2.529947280883789, "epoch": 0.9423941415313225, "grad_norm": 0.02711108885705471, "grad_norm_var": 3.9480720063142514e-07, "learning_rate": 8.501520673651731e-05, "loss": 2.4853, "step": 25995 }, { "crossentropy": 2.337214231491089, "epoch": 0.9424303944315545, "grad_norm": 0.026380926370620728, "grad_norm_var": 3.898409283158155e-07, "learning_rate": 8.490853953376953e-05, "loss": 2.36, "step": 25996 }, { "crossentropy": 2.3868298530578613, "epoch": 0.9424666473317865, "grad_norm": 0.02674247696995735, "grad_norm_var": 3.985047700174581e-07, "learning_rate": 8.480193871679087e-05, "loss": 2.401, "step": 25997 }, { "crossentropy": 2.3131890296936035, "epoch": 0.9425029002320185, "grad_norm": 0.02541668713092804, "grad_norm_var": 2.4770540325060506e-07, "learning_rate": 8.469540428702127e-05, "loss": 2.3546, "step": 25998 }, { "crossentropy": 2.3275504112243652, "epoch": 0.9425391531322506, "grad_norm": 0.025958644226193428, "grad_norm_var": 1.957500303017967e-07, "learning_rate": 8.458893624589903e-05, "loss": 2.31, "step": 25999 }, { "crossentropy": 2.4092328548431396, "epoch": 0.9425754060324826, "grad_norm": 0.026484111323952675, "grad_norm_var": 1.9082485701580643e-07, "learning_rate": 8.448253459486299e-05, "loss": 2.3687, "step": 26000 }, { "crossentropy": 2.4762327671051025, "epoch": 0.9426116589327146, "grad_norm": 0.02618844248354435, "grad_norm_var": 1.9043812674954947e-07, "learning_rate": 8.437619933535034e-05, "loss": 2.4127, "step": 26001 }, { "crossentropy": 2.2660372257232666, "epoch": 0.9426479118329466, "grad_norm": 0.025677470490336418, "grad_norm_var": 2.107781184153279e-07, "learning_rate": 8.426993046879605e-05, "loss": 2.3852, "step": 26002 }, { "crossentropy": 2.490147590637207, "epoch": 0.9426841647331786, "grad_norm": 0.027011889964342117, "grad_norm_var": 2.087433422239803e-07, "learning_rate": 8.416372799663674e-05, "loss": 2.3898, "step": 26003 }, { "crossentropy": 2.4351301193237305, "epoch": 0.9427204176334106, "grad_norm": 0.027019409462809563, "grad_norm_var": 2.327573062073229e-07, "learning_rate": 8.405759192030571e-05, "loss": 2.3954, "step": 26004 }, { "crossentropy": 2.4382119178771973, "epoch": 0.9427566705336426, "grad_norm": 0.02548043616116047, "grad_norm_var": 2.8120148659638245e-07, "learning_rate": 8.395152224123737e-05, "loss": 2.5038, "step": 26005 }, { "crossentropy": 2.2568166255950928, "epoch": 0.9427929234338747, "grad_norm": 0.0268551018089056, "grad_norm_var": 2.960729837540721e-07, "learning_rate": 8.384551896086446e-05, "loss": 2.2684, "step": 26006 }, { "crossentropy": 2.3613874912261963, "epoch": 0.9428291763341067, "grad_norm": 0.025880612432956696, "grad_norm_var": 3.029497362097422e-07, "learning_rate": 8.373958208061749e-05, "loss": 2.4117, "step": 26007 }, { "crossentropy": 2.3228254318237305, "epoch": 0.9428654292343387, "grad_norm": 0.02787075564265251, "grad_norm_var": 4.5021144444845387e-07, "learning_rate": 8.363371160192923e-05, "loss": 2.4192, "step": 26008 }, { "crossentropy": 2.5041096210479736, "epoch": 0.9429016821345708, "grad_norm": 0.026650521904230118, "grad_norm_var": 4.4632687074830366e-07, "learning_rate": 8.352790752622853e-05, "loss": 2.4504, "step": 26009 }, { "crossentropy": 2.449162244796753, "epoch": 0.9429379350348028, "grad_norm": 0.026960546150803566, "grad_norm_var": 4.5837785771136486e-07, "learning_rate": 8.342216985494367e-05, "loss": 2.5174, "step": 26010 }, { "crossentropy": 2.301034927368164, "epoch": 0.9429741879350348, "grad_norm": 0.02705017849802971, "grad_norm_var": 4.534890539078148e-07, "learning_rate": 8.331649858950407e-05, "loss": 2.353, "step": 26011 }, { "crossentropy": 2.483304500579834, "epoch": 0.9430104408352669, "grad_norm": 0.026257958263158798, "grad_norm_var": 4.5600544241180863e-07, "learning_rate": 8.321089373133639e-05, "loss": 2.4541, "step": 26012 }, { "crossentropy": 2.4070818424224854, "epoch": 0.9430466937354989, "grad_norm": 0.02572263777256012, "grad_norm_var": 4.838335003261307e-07, "learning_rate": 8.31053552818667e-05, "loss": 2.5096, "step": 26013 }, { "crossentropy": 2.500221014022827, "epoch": 0.9430829466357309, "grad_norm": 0.027216175571084023, "grad_norm_var": 4.490097052509915e-07, "learning_rate": 8.299988324252106e-05, "loss": 2.5244, "step": 26014 }, { "crossentropy": 2.5157322883605957, "epoch": 0.9431191995359629, "grad_norm": 0.02725287154316902, "grad_norm_var": 4.5720777614947223e-07, "learning_rate": 8.28944776147239e-05, "loss": 2.4393, "step": 26015 }, { "crossentropy": 2.4936485290527344, "epoch": 0.9431554524361949, "grad_norm": 0.02800244651734829, "grad_norm_var": 5.780947857201661e-07, "learning_rate": 8.27891383998991e-05, "loss": 2.4536, "step": 26016 }, { "crossentropy": 2.2630627155303955, "epoch": 0.9431917053364269, "grad_norm": 0.026282651349902153, "grad_norm_var": 5.723042311291925e-07, "learning_rate": 8.268386559946883e-05, "loss": 2.3817, "step": 26017 }, { "crossentropy": 2.403470754623413, "epoch": 0.943227958236659, "grad_norm": 0.02636129967868328, "grad_norm_var": 5.083467312621524e-07, "learning_rate": 8.25786592148553e-05, "loss": 2.3869, "step": 26018 }, { "crossentropy": 2.3388795852661133, "epoch": 0.943264211136891, "grad_norm": 0.02557680942118168, "grad_norm_var": 5.854626661898893e-07, "learning_rate": 8.247351924747959e-05, "loss": 2.3798, "step": 26019 }, { "crossentropy": 2.422745704650879, "epoch": 0.943300464037123, "grad_norm": 0.027236929163336754, "grad_norm_var": 5.990604335019082e-07, "learning_rate": 8.236844569876112e-05, "loss": 2.4878, "step": 26020 }, { "crossentropy": 2.4078621864318848, "epoch": 0.943336716937355, "grad_norm": 0.02648305892944336, "grad_norm_var": 5.033827951759588e-07, "learning_rate": 8.226343857011986e-05, "loss": 2.3877, "step": 26021 }, { "crossentropy": 2.409010887145996, "epoch": 0.943372969837587, "grad_norm": 0.028539329767227173, "grad_norm_var": 7.090380322634458e-07, "learning_rate": 8.215849786297358e-05, "loss": 2.4556, "step": 26022 }, { "crossentropy": 2.433197021484375, "epoch": 0.943409222737819, "grad_norm": 0.02677011862397194, "grad_norm_var": 6.454110077057922e-07, "learning_rate": 8.205362357874002e-05, "loss": 2.4416, "step": 26023 }, { "crossentropy": 2.535998821258545, "epoch": 0.943445475638051, "grad_norm": 0.026272254064679146, "grad_norm_var": 5.960034599169472e-07, "learning_rate": 8.194881571883583e-05, "loss": 2.5195, "step": 26024 }, { "crossentropy": 2.5316455364227295, "epoch": 0.943481728538283, "grad_norm": 0.02576364390552044, "grad_norm_var": 6.616252018549201e-07, "learning_rate": 8.184407428467655e-05, "loss": 2.5069, "step": 26025 }, { "crossentropy": 2.2246203422546387, "epoch": 0.9435179814385151, "grad_norm": 0.024935830384492874, "grad_norm_var": 8.567662791022131e-07, "learning_rate": 8.173939927767548e-05, "loss": 2.2266, "step": 26026 }, { "crossentropy": 2.3700368404388428, "epoch": 0.9435542343387471, "grad_norm": 0.026477953419089317, "grad_norm_var": 8.434764736105766e-07, "learning_rate": 8.163479069924872e-05, "loss": 2.3513, "step": 26027 }, { "crossentropy": 2.46696138381958, "epoch": 0.9435904872389791, "grad_norm": 0.027085743844509125, "grad_norm_var": 8.516422727509182e-07, "learning_rate": 8.153024855080738e-05, "loss": 2.5187, "step": 26028 }, { "crossentropy": 2.3812429904937744, "epoch": 0.9436267401392111, "grad_norm": 0.02630496397614479, "grad_norm_var": 8.028719530390188e-07, "learning_rate": 8.142577283376362e-05, "loss": 2.3861, "step": 26029 }, { "crossentropy": 2.4069223403930664, "epoch": 0.9436629930394431, "grad_norm": 0.026593247428536415, "grad_norm_var": 7.809408961564525e-07, "learning_rate": 8.132136354952968e-05, "loss": 2.4061, "step": 26030 }, { "crossentropy": 2.439052104949951, "epoch": 0.9436992459396751, "grad_norm": 0.02545085735619068, "grad_norm_var": 8.321228102655768e-07, "learning_rate": 8.121702069951497e-05, "loss": 2.4412, "step": 26031 }, { "crossentropy": 2.399749755859375, "epoch": 0.9437354988399071, "grad_norm": 0.027222497388720512, "grad_norm_var": 7.147899412932035e-07, "learning_rate": 8.111274428512894e-05, "loss": 2.4784, "step": 26032 }, { "crossentropy": 2.54396653175354, "epoch": 0.9437717517401392, "grad_norm": 0.027414638549089432, "grad_norm_var": 7.681361293759935e-07, "learning_rate": 8.100853430777989e-05, "loss": 2.5414, "step": 26033 }, { "crossentropy": 2.4672188758850098, "epoch": 0.9438080046403712, "grad_norm": 0.02526349201798439, "grad_norm_var": 8.682373285846696e-07, "learning_rate": 8.090439076887557e-05, "loss": 2.4568, "step": 26034 }, { "crossentropy": 2.380140781402588, "epoch": 0.9438442575406032, "grad_norm": 0.02667972259223461, "grad_norm_var": 8.140974467144938e-07, "learning_rate": 8.080031366982266e-05, "loss": 2.4348, "step": 26035 }, { "crossentropy": 2.4219443798065186, "epoch": 0.9438805104408353, "grad_norm": 0.026275502517819405, "grad_norm_var": 7.813617879108443e-07, "learning_rate": 8.069630301202613e-05, "loss": 2.4956, "step": 26036 }, { "crossentropy": 2.364779233932495, "epoch": 0.9439167633410673, "grad_norm": 0.026987751945853233, "grad_norm_var": 7.981061804401228e-07, "learning_rate": 8.059235879689153e-05, "loss": 2.3929, "step": 26037 }, { "crossentropy": 2.3323769569396973, "epoch": 0.9439530162412993, "grad_norm": 0.026096489280462265, "grad_norm_var": 5.076030840283578e-07, "learning_rate": 8.048848102582274e-05, "loss": 2.4216, "step": 26038 }, { "crossentropy": 2.3373022079467773, "epoch": 0.9439892691415314, "grad_norm": 0.02719176933169365, "grad_norm_var": 5.423526224416676e-07, "learning_rate": 8.038466970022196e-05, "loss": 2.37, "step": 26039 }, { "crossentropy": 2.417041063308716, "epoch": 0.9440255220417634, "grad_norm": 0.02691011317074299, "grad_norm_var": 5.589563624870299e-07, "learning_rate": 8.028092482149308e-05, "loss": 2.3964, "step": 26040 }, { "crossentropy": 2.39251971244812, "epoch": 0.9440617749419954, "grad_norm": 0.026117630302906036, "grad_norm_var": 5.360032454160279e-07, "learning_rate": 8.017724639103552e-05, "loss": 2.5226, "step": 26041 }, { "crossentropy": 2.393977642059326, "epoch": 0.9440980278422274, "grad_norm": 0.027056608349084854, "grad_norm_var": 3.923367513064546e-07, "learning_rate": 8.00736344102504e-05, "loss": 2.4225, "step": 26042 }, { "crossentropy": 2.4571962356567383, "epoch": 0.9441342807424594, "grad_norm": 0.02616894245147705, "grad_norm_var": 4.021203192845392e-07, "learning_rate": 7.997008888053769e-05, "loss": 2.4534, "step": 26043 }, { "crossentropy": 2.2930407524108887, "epoch": 0.9441705336426914, "grad_norm": 0.025590473785996437, "grad_norm_var": 4.3529780036257134e-07, "learning_rate": 7.986660980329464e-05, "loss": 2.3243, "step": 26044 }, { "crossentropy": 2.3605518341064453, "epoch": 0.9442067865429234, "grad_norm": 0.026064207777380943, "grad_norm_var": 4.4382648501279224e-07, "learning_rate": 7.976319717991953e-05, "loss": 2.3617, "step": 26045 }, { "crossentropy": 2.224912643432617, "epoch": 0.9442430394431555, "grad_norm": 0.02659190259873867, "grad_norm_var": 4.437996116332289e-07, "learning_rate": 7.965985101180961e-05, "loss": 2.3356, "step": 26046 }, { "crossentropy": 2.3074512481689453, "epoch": 0.9442792923433875, "grad_norm": 0.026339592412114143, "grad_norm_var": 3.756383076058591e-07, "learning_rate": 7.955657130036042e-05, "loss": 2.3974, "step": 26047 }, { "crossentropy": 2.4377880096435547, "epoch": 0.9443155452436195, "grad_norm": 0.025421610102057457, "grad_norm_var": 4.0442295317400363e-07, "learning_rate": 7.945335804696641e-05, "loss": 2.4013, "step": 26048 }, { "crossentropy": 2.451915979385376, "epoch": 0.9443517981438515, "grad_norm": 0.026651540771126747, "grad_norm_var": 3.361222918572826e-07, "learning_rate": 7.9350211253022e-05, "loss": 2.4607, "step": 26049 }, { "crossentropy": 2.4295568466186523, "epoch": 0.9443880510440835, "grad_norm": 0.027153851464390755, "grad_norm_var": 2.8864620938485497e-07, "learning_rate": 7.924713091991997e-05, "loss": 2.4386, "step": 26050 }, { "crossentropy": 2.3245677947998047, "epoch": 0.9444243039443155, "grad_norm": 0.0257759727537632, "grad_norm_var": 3.1274824730248334e-07, "learning_rate": 7.914411704905422e-05, "loss": 2.4033, "step": 26051 }, { "crossentropy": 2.2554025650024414, "epoch": 0.9444605568445475, "grad_norm": 0.025404181331396103, "grad_norm_var": 3.746180609585087e-07, "learning_rate": 7.904116964181362e-05, "loss": 2.3486, "step": 26052 }, { "crossentropy": 2.3883814811706543, "epoch": 0.9444968097447796, "grad_norm": 0.02632172964513302, "grad_norm_var": 3.4527852399746036e-07, "learning_rate": 7.89382886995904e-05, "loss": 2.4348, "step": 26053 }, { "crossentropy": 2.406369209289551, "epoch": 0.9445330626450116, "grad_norm": 0.025322062894701958, "grad_norm_var": 4.041412856409706e-07, "learning_rate": 7.883547422377346e-05, "loss": 2.4512, "step": 26054 }, { "crossentropy": 2.395206928253174, "epoch": 0.9445693155452436, "grad_norm": 0.026782119646668434, "grad_norm_var": 3.6347076129276436e-07, "learning_rate": 7.873272621575167e-05, "loss": 2.5194, "step": 26055 }, { "crossentropy": 2.4820973873138428, "epoch": 0.9446055684454756, "grad_norm": 0.024908466264605522, "grad_norm_var": 4.322453560449361e-07, "learning_rate": 7.863004467691282e-05, "loss": 2.4068, "step": 26056 }, { "crossentropy": 2.4642562866210938, "epoch": 0.9446418213457076, "grad_norm": 0.027235310524702072, "grad_norm_var": 5.122879688576233e-07, "learning_rate": 7.852742960864301e-05, "loss": 2.4379, "step": 26057 }, { "crossentropy": 2.2563741207122803, "epoch": 0.9446780742459396, "grad_norm": 0.027072478085756302, "grad_norm_var": 5.141706729301267e-07, "learning_rate": 7.842488101232892e-05, "loss": 2.3972, "step": 26058 }, { "crossentropy": 2.3579399585723877, "epoch": 0.9447143271461717, "grad_norm": 0.026365824043750763, "grad_norm_var": 5.164270158059762e-07, "learning_rate": 7.832239888935666e-05, "loss": 2.3366, "step": 26059 }, { "crossentropy": 2.4368996620178223, "epoch": 0.9447505800464037, "grad_norm": 0.026109246537089348, "grad_norm_var": 4.919454903241309e-07, "learning_rate": 7.821998324110846e-05, "loss": 2.4041, "step": 26060 }, { "crossentropy": 2.391737699508667, "epoch": 0.9447868329466357, "grad_norm": 0.025780731812119484, "grad_norm_var": 5.028565798091776e-07, "learning_rate": 7.811763406896821e-05, "loss": 2.4745, "step": 26061 }, { "crossentropy": 2.3499999046325684, "epoch": 0.9448230858468677, "grad_norm": 0.02753971330821514, "grad_norm_var": 6.082405012261857e-07, "learning_rate": 7.801535137431925e-05, "loss": 2.3799, "step": 26062 }, { "crossentropy": 2.30733323097229, "epoch": 0.9448593387470998, "grad_norm": 0.026221388950943947, "grad_norm_var": 6.07883407548063e-07, "learning_rate": 7.79131351585416e-05, "loss": 2.318, "step": 26063 }, { "crossentropy": 2.3962440490722656, "epoch": 0.9448955916473318, "grad_norm": 0.025481756776571274, "grad_norm_var": 6.014329942585193e-07, "learning_rate": 7.781098542301745e-05, "loss": 2.404, "step": 26064 }, { "crossentropy": 2.4624457359313965, "epoch": 0.9449318445475638, "grad_norm": 0.025781184434890747, "grad_norm_var": 6.030967891511626e-07, "learning_rate": 7.770890216912463e-05, "loss": 2.4452, "step": 26065 }, { "crossentropy": 2.473515510559082, "epoch": 0.9449680974477959, "grad_norm": 0.0253057349473238, "grad_norm_var": 5.823866248558559e-07, "learning_rate": 7.760688539824367e-05, "loss": 2.3939, "step": 26066 }, { "crossentropy": 2.481372594833374, "epoch": 0.9450043503480279, "grad_norm": 0.027272790670394897, "grad_norm_var": 6.601437769882298e-07, "learning_rate": 7.750493511175183e-05, "loss": 2.4528, "step": 26067 }, { "crossentropy": 2.495563268661499, "epoch": 0.9450406032482599, "grad_norm": 0.02645387500524521, "grad_norm_var": 6.202106652837171e-07, "learning_rate": 7.740305131102576e-05, "loss": 2.5034, "step": 26068 }, { "crossentropy": 2.412647247314453, "epoch": 0.9450768561484919, "grad_norm": 0.025831392034888268, "grad_norm_var": 6.303617623989172e-07, "learning_rate": 7.730123399744105e-05, "loss": 2.3809, "step": 26069 }, { "crossentropy": 2.5164237022399902, "epoch": 0.9451131090487239, "grad_norm": 0.026612617075443268, "grad_norm_var": 5.80547303801783e-07, "learning_rate": 7.71994831723749e-05, "loss": 2.3975, "step": 26070 }, { "crossentropy": 2.363114356994629, "epoch": 0.9451493619489559, "grad_norm": 0.026603655889630318, "grad_norm_var": 5.709982940438688e-07, "learning_rate": 7.709779883719958e-05, "loss": 2.3672, "step": 26071 }, { "crossentropy": 2.281815767288208, "epoch": 0.945185614849188, "grad_norm": 0.0262767244130373, "grad_norm_var": 4.3669497695186796e-07, "learning_rate": 7.699618099328954e-05, "loss": 2.3451, "step": 26072 }, { "crossentropy": 2.3581087589263916, "epoch": 0.94522186774942, "grad_norm": 0.025970859453082085, "grad_norm_var": 3.9099391293399786e-07, "learning_rate": 7.689462964201644e-05, "loss": 2.4498, "step": 26073 }, { "crossentropy": 2.4444031715393066, "epoch": 0.945258120649652, "grad_norm": 0.026267075911164284, "grad_norm_var": 3.4777630323450567e-07, "learning_rate": 7.679314478475253e-05, "loss": 2.4417, "step": 26074 }, { "crossentropy": 2.434857130050659, "epoch": 0.945294373549884, "grad_norm": 0.027066992595791817, "grad_norm_var": 3.9006481620042164e-07, "learning_rate": 7.669172642286948e-05, "loss": 2.4471, "step": 26075 }, { "crossentropy": 2.3486297130584717, "epoch": 0.945330626450116, "grad_norm": 0.025658532977104187, "grad_norm_var": 4.133822828692249e-07, "learning_rate": 7.65903745577351e-05, "loss": 2.3368, "step": 26076 }, { "crossentropy": 2.307358980178833, "epoch": 0.945366879350348, "grad_norm": 0.02738984115421772, "grad_norm_var": 4.728523199011454e-07, "learning_rate": 7.648908919071939e-05, "loss": 2.4078, "step": 26077 }, { "crossentropy": 2.3805856704711914, "epoch": 0.94540313225058, "grad_norm": 0.026730652898550034, "grad_norm_var": 3.863278650346648e-07, "learning_rate": 7.638787032319073e-05, "loss": 2.4311, "step": 26078 }, { "crossentropy": 2.3873794078826904, "epoch": 0.945439385150812, "grad_norm": 0.025618160143494606, "grad_norm_var": 4.16022145057028e-07, "learning_rate": 7.628671795651521e-05, "loss": 2.3669, "step": 26079 }, { "crossentropy": 2.292301654815674, "epoch": 0.9454756380510441, "grad_norm": 0.02581948973238468, "grad_norm_var": 3.8765049186692545e-07, "learning_rate": 7.618563209205954e-05, "loss": 2.3818, "step": 26080 }, { "crossentropy": 2.312138795852661, "epoch": 0.9455118909512761, "grad_norm": 0.025196559727191925, "grad_norm_var": 4.4876966142313547e-07, "learning_rate": 7.608461273118927e-05, "loss": 2.3952, "step": 26081 }, { "crossentropy": 2.422492504119873, "epoch": 0.9455481438515081, "grad_norm": 0.02587444707751274, "grad_norm_var": 3.970270256969567e-07, "learning_rate": 7.598365987526834e-05, "loss": 2.3984, "step": 26082 }, { "crossentropy": 2.3814423084259033, "epoch": 0.9455843967517401, "grad_norm": 0.029415326192975044, "grad_norm_var": 9.646204014552087e-07, "learning_rate": 7.58827735256612e-05, "loss": 2.4238, "step": 26083 }, { "crossentropy": 2.4211325645446777, "epoch": 0.9456206496519721, "grad_norm": 0.026065854355692863, "grad_norm_var": 9.72491908216239e-07, "learning_rate": 7.578195368372953e-05, "loss": 2.413, "step": 26084 }, { "crossentropy": 2.3626766204833984, "epoch": 0.9456569025522041, "grad_norm": 0.025700237601995468, "grad_norm_var": 9.835084076807721e-07, "learning_rate": 7.568120035083504e-05, "loss": 2.3491, "step": 26085 }, { "crossentropy": 2.490086078643799, "epoch": 0.9456931554524362, "grad_norm": 0.025794267654418945, "grad_norm_var": 1.0012582337455093e-06, "learning_rate": 7.558051352833884e-05, "loss": 2.4187, "step": 26086 }, { "crossentropy": 2.4017927646636963, "epoch": 0.9457294083526682, "grad_norm": 0.026522494852542877, "grad_norm_var": 9.9882265278763e-07, "learning_rate": 7.547989321760096e-05, "loss": 2.3769, "step": 26087 }, { "crossentropy": 2.5106167793273926, "epoch": 0.9457656612529002, "grad_norm": 0.025897542014718056, "grad_norm_var": 1.010778887197217e-06, "learning_rate": 7.537933941998088e-05, "loss": 2.4648, "step": 26088 }, { "crossentropy": 2.503446102142334, "epoch": 0.9458019141531323, "grad_norm": 0.026033593341708183, "grad_norm_var": 1.0081732984270105e-06, "learning_rate": 7.527885213683528e-05, "loss": 2.4876, "step": 26089 }, { "crossentropy": 2.3417367935180664, "epoch": 0.9458381670533643, "grad_norm": 0.025592206045985222, "grad_norm_var": 1.041013465108417e-06, "learning_rate": 7.517843136952307e-05, "loss": 2.3357, "step": 26090 }, { "crossentropy": 2.493306875228882, "epoch": 0.9458744199535963, "grad_norm": 0.026103289797902107, "grad_norm_var": 9.9710144620331e-07, "learning_rate": 7.507807711939929e-05, "loss": 2.5147, "step": 26091 }, { "crossentropy": 2.311211585998535, "epoch": 0.9459106728538283, "grad_norm": 0.025514701381325722, "grad_norm_var": 1.0090331219675744e-06, "learning_rate": 7.497778938782007e-05, "loss": 2.3702, "step": 26092 }, { "crossentropy": 2.427950859069824, "epoch": 0.9459469257540604, "grad_norm": 0.02765725366771221, "grad_norm_var": 1.055773237856189e-06, "learning_rate": 7.487756817614045e-05, "loss": 2.4008, "step": 26093 }, { "crossentropy": 2.2363669872283936, "epoch": 0.9459831786542924, "grad_norm": 0.026028595864772797, "grad_norm_var": 1.0388715532919438e-06, "learning_rate": 7.477741348571265e-05, "loss": 2.3163, "step": 26094 }, { "crossentropy": 2.384580612182617, "epoch": 0.9460194315545244, "grad_norm": 0.02471271902322769, "grad_norm_var": 1.1575919785735638e-06, "learning_rate": 7.467732531789062e-05, "loss": 2.3324, "step": 26095 }, { "crossentropy": 2.4639949798583984, "epoch": 0.9460556844547564, "grad_norm": 0.027204317972064018, "grad_norm_var": 1.2218649636201352e-06, "learning_rate": 7.457730367402549e-05, "loss": 2.4385, "step": 26096 }, { "crossentropy": 2.511504650115967, "epoch": 0.9460919373549884, "grad_norm": 0.025596987456083298, "grad_norm_var": 1.1779338940970008e-06, "learning_rate": 7.44773485554684e-05, "loss": 2.5073, "step": 26097 }, { "crossentropy": 2.5052692890167236, "epoch": 0.9461281902552204, "grad_norm": 0.026881663128733635, "grad_norm_var": 1.1933060197944282e-06, "learning_rate": 7.437745996356937e-05, "loss": 2.4855, "step": 26098 }, { "crossentropy": 2.435426712036133, "epoch": 0.9461644431554525, "grad_norm": 0.026259545236825943, "grad_norm_var": 5.02826102095422e-07, "learning_rate": 7.42776378996779e-05, "loss": 2.4756, "step": 26099 }, { "crossentropy": 2.6045312881469727, "epoch": 0.9462006960556845, "grad_norm": 0.02609444037079811, "grad_norm_var": 5.027553029709452e-07, "learning_rate": 7.417788236514179e-05, "loss": 2.5158, "step": 26100 }, { "crossentropy": 2.3972578048706055, "epoch": 0.9462369489559165, "grad_norm": 0.026127463206648827, "grad_norm_var": 4.914129553883062e-07, "learning_rate": 7.407819336130883e-05, "loss": 2.4516, "step": 26101 }, { "crossentropy": 2.3839783668518066, "epoch": 0.9462732018561485, "grad_norm": 0.025665299966931343, "grad_norm_var": 4.981623247676814e-07, "learning_rate": 7.39785708895252e-05, "loss": 2.3288, "step": 26102 }, { "crossentropy": 2.2911288738250732, "epoch": 0.9463094547563805, "grad_norm": 0.027217620983719826, "grad_norm_var": 5.658285108171701e-07, "learning_rate": 7.387901495113647e-05, "loss": 2.3861, "step": 26103 }, { "crossentropy": 2.5193326473236084, "epoch": 0.9463457076566125, "grad_norm": 0.026593593880534172, "grad_norm_var": 5.71593106627706e-07, "learning_rate": 7.377952554748712e-05, "loss": 2.4125, "step": 26104 }, { "crossentropy": 2.352128744125366, "epoch": 0.9463819605568445, "grad_norm": 0.02589745633304119, "grad_norm_var": 5.758664761711194e-07, "learning_rate": 7.36801026799211e-05, "loss": 2.3774, "step": 26105 }, { "crossentropy": 2.338972330093384, "epoch": 0.9464182134570766, "grad_norm": 0.02723957970738411, "grad_norm_var": 6.127051129798181e-07, "learning_rate": 7.358074634978118e-05, "loss": 2.4095, "step": 26106 }, { "crossentropy": 2.381497621536255, "epoch": 0.9464544663573086, "grad_norm": 0.02732684276998043, "grad_norm_var": 6.742371441182763e-07, "learning_rate": 7.348145655840966e-05, "loss": 2.4277, "step": 26107 }, { "crossentropy": 2.377525568008423, "epoch": 0.9464907192575406, "grad_norm": 0.025399422273039818, "grad_norm_var": 6.883083543161745e-07, "learning_rate": 7.33822333071471e-05, "loss": 2.3831, "step": 26108 }, { "crossentropy": 2.4147725105285645, "epoch": 0.9465269721577726, "grad_norm": 0.0267673060297966, "grad_norm_var": 5.849361189040238e-07, "learning_rate": 7.328307659733413e-05, "loss": 2.473, "step": 26109 }, { "crossentropy": 2.496767997741699, "epoch": 0.9465632250580046, "grad_norm": 0.027194777503609657, "grad_norm_var": 6.256654200659702e-07, "learning_rate": 7.318398643030966e-05, "loss": 2.4131, "step": 26110 }, { "crossentropy": 2.302088975906372, "epoch": 0.9465994779582366, "grad_norm": 0.025983789935708046, "grad_norm_var": 4.430284222780295e-07, "learning_rate": 7.308496280741206e-05, "loss": 2.3684, "step": 26111 }, { "crossentropy": 2.4012131690979004, "epoch": 0.9466357308584686, "grad_norm": 0.026219690218567848, "grad_norm_var": 4.0664418965544124e-07, "learning_rate": 7.298600572997915e-05, "loss": 2.3864, "step": 26112 }, { "crossentropy": 2.469806671142578, "epoch": 0.9466719837587007, "grad_norm": 0.025593528524041176, "grad_norm_var": 4.070171669228716e-07, "learning_rate": 7.28871151993471e-05, "loss": 2.5017, "step": 26113 }, { "crossentropy": 2.368062973022461, "epoch": 0.9467082366589327, "grad_norm": 0.027073750272393227, "grad_norm_var": 4.215601543094971e-07, "learning_rate": 7.278829121685149e-05, "loss": 2.397, "step": 26114 }, { "crossentropy": 2.42586088180542, "epoch": 0.9467444895591647, "grad_norm": 0.026700235903263092, "grad_norm_var": 4.2451203471020187e-07, "learning_rate": 7.268953378382737e-05, "loss": 2.445, "step": 26115 }, { "crossentropy": 2.468430757522583, "epoch": 0.9467807424593968, "grad_norm": 0.026486482471227646, "grad_norm_var": 4.158758823965524e-07, "learning_rate": 7.259084290160867e-05, "loss": 2.4928, "step": 26116 }, { "crossentropy": 2.296314239501953, "epoch": 0.9468169953596288, "grad_norm": 0.026801206171512604, "grad_norm_var": 4.1366175843741913e-07, "learning_rate": 7.249221857152821e-05, "loss": 2.3818, "step": 26117 }, { "crossentropy": 2.426833391189575, "epoch": 0.9468532482598608, "grad_norm": 0.025526834651827812, "grad_norm_var": 4.3045560880181945e-07, "learning_rate": 7.239366079491882e-05, "loss": 2.4897, "step": 26118 }, { "crossentropy": 2.5164332389831543, "epoch": 0.9468895011600929, "grad_norm": 0.02654370665550232, "grad_norm_var": 3.944828445544742e-07, "learning_rate": 7.229516957311e-05, "loss": 2.484, "step": 26119 }, { "crossentropy": 2.5290298461914062, "epoch": 0.9469257540603249, "grad_norm": 0.02675110474228859, "grad_norm_var": 3.988545982286923e-07, "learning_rate": 7.219674490743344e-05, "loss": 2.4708, "step": 26120 }, { "crossentropy": 2.425616502761841, "epoch": 0.9469620069605569, "grad_norm": 0.02661200985312462, "grad_norm_var": 3.763029233197837e-07, "learning_rate": 7.209838679921755e-05, "loss": 2.3634, "step": 26121 }, { "crossentropy": 2.3707053661346436, "epoch": 0.9469982598607889, "grad_norm": 0.02641962468624115, "grad_norm_var": 3.389721137734651e-07, "learning_rate": 7.200009524979124e-05, "loss": 2.3355, "step": 26122 }, { "crossentropy": 2.4531874656677246, "epoch": 0.9470345127610209, "grad_norm": 0.025503020733594894, "grad_norm_var": 3.3668463993197173e-07, "learning_rate": 7.190187026048289e-05, "loss": 2.4444, "step": 26123 }, { "crossentropy": 2.3794314861297607, "epoch": 0.9470707656612529, "grad_norm": 0.02793619967997074, "grad_norm_var": 4.1786355260939264e-07, "learning_rate": 7.180371183261814e-05, "loss": 2.4265, "step": 26124 }, { "crossentropy": 2.3508121967315674, "epoch": 0.9471070185614849, "grad_norm": 0.025653183460235596, "grad_norm_var": 4.5678626785296214e-07, "learning_rate": 7.170561996752311e-05, "loss": 2.4377, "step": 26125 }, { "crossentropy": 2.4805867671966553, "epoch": 0.947143271461717, "grad_norm": 0.026475869119167328, "grad_norm_var": 4.164945579222349e-07, "learning_rate": 7.160759466652289e-05, "loss": 2.4451, "step": 26126 }, { "crossentropy": 2.490262508392334, "epoch": 0.947179524361949, "grad_norm": 0.026115743443369865, "grad_norm_var": 4.103917666924024e-07, "learning_rate": 7.150963593094028e-05, "loss": 2.4958, "step": 26127 }, { "crossentropy": 2.444894313812256, "epoch": 0.947215777262181, "grad_norm": 0.02590283751487732, "grad_norm_var": 4.243162346672837e-07, "learning_rate": 7.141174376210035e-05, "loss": 2.4437, "step": 26128 }, { "crossentropy": 2.3900163173675537, "epoch": 0.947252030162413, "grad_norm": 0.027018792927265167, "grad_norm_var": 4.016379206983149e-07, "learning_rate": 7.131391816132315e-05, "loss": 2.4235, "step": 26129 }, { "crossentropy": 2.554218292236328, "epoch": 0.947288283062645, "grad_norm": 0.026095310226082802, "grad_norm_var": 3.8271243003665243e-07, "learning_rate": 7.121615912993151e-05, "loss": 2.4932, "step": 26130 }, { "crossentropy": 2.3815879821777344, "epoch": 0.947324535962877, "grad_norm": 0.025985896587371826, "grad_norm_var": 3.868551971847323e-07, "learning_rate": 7.111846666924604e-05, "loss": 2.4038, "step": 26131 }, { "crossentropy": 2.5328402519226074, "epoch": 0.947360788863109, "grad_norm": 0.02659216895699501, "grad_norm_var": 3.892758979299101e-07, "learning_rate": 7.102084078058457e-05, "loss": 2.4638, "step": 26132 }, { "crossentropy": 2.448888063430786, "epoch": 0.947397041763341, "grad_norm": 0.02668869122862816, "grad_norm_var": 3.8361083875554137e-07, "learning_rate": 7.092328146526716e-05, "loss": 2.3249, "step": 26133 }, { "crossentropy": 2.416384220123291, "epoch": 0.9474332946635731, "grad_norm": 0.02602166123688221, "grad_norm_var": 3.436930105192119e-07, "learning_rate": 7.082578872461054e-05, "loss": 2.3665, "step": 26134 }, { "crossentropy": 2.4943315982818604, "epoch": 0.9474695475638051, "grad_norm": 0.02661854214966297, "grad_norm_var": 3.4552944338885926e-07, "learning_rate": 7.072836255993253e-05, "loss": 2.4219, "step": 26135 }, { "crossentropy": 2.3414273262023926, "epoch": 0.9475058004640371, "grad_norm": 0.02714942954480648, "grad_norm_var": 3.7412403693874025e-07, "learning_rate": 7.063100297254821e-05, "loss": 2.3143, "step": 26136 }, { "crossentropy": 2.3913276195526123, "epoch": 0.9475420533642691, "grad_norm": 0.026926564052700996, "grad_norm_var": 3.881802394496327e-07, "learning_rate": 7.053370996377262e-05, "loss": 2.3262, "step": 26137 }, { "crossentropy": 2.4199061393737793, "epoch": 0.9475783062645011, "grad_norm": 0.025815090164542198, "grad_norm_var": 4.1298403636961613e-07, "learning_rate": 7.043648353491972e-05, "loss": 2.4566, "step": 26138 }, { "crossentropy": 2.40775728225708, "epoch": 0.9476145591647331, "grad_norm": 0.02693011984229088, "grad_norm_var": 3.6841780081490565e-07, "learning_rate": 7.033932368730345e-05, "loss": 2.4093, "step": 26139 }, { "crossentropy": 2.454244613647461, "epoch": 0.9476508120649652, "grad_norm": 0.0267756599932909, "grad_norm_var": 2.296458123207205e-07, "learning_rate": 7.024223042223554e-05, "loss": 2.4146, "step": 26140 }, { "crossentropy": 2.336454153060913, "epoch": 0.9476870649651972, "grad_norm": 0.02648727037012577, "grad_norm_var": 1.8753155995632614e-07, "learning_rate": 7.014520374102717e-05, "loss": 2.4131, "step": 26141 }, { "crossentropy": 2.4774768352508545, "epoch": 0.9477233178654292, "grad_norm": 0.026167066767811775, "grad_norm_var": 1.9345479844116304e-07, "learning_rate": 7.004824364498952e-05, "loss": 2.4807, "step": 26142 }, { "crossentropy": 2.267669439315796, "epoch": 0.9477595707656613, "grad_norm": 0.025584915652871132, "grad_norm_var": 2.351254804293379e-07, "learning_rate": 6.995135013543208e-05, "loss": 2.308, "step": 26143 }, { "crossentropy": 2.2474656105041504, "epoch": 0.9477958236658933, "grad_norm": 0.026001155376434326, "grad_norm_var": 2.289173358735001e-07, "learning_rate": 6.98545232136627e-05, "loss": 2.343, "step": 26144 }, { "crossentropy": 2.470379590988159, "epoch": 0.9478320765661253, "grad_norm": 0.026522882282733917, "grad_norm_var": 2.0526644011314061e-07, "learning_rate": 6.975776288098979e-05, "loss": 2.3928, "step": 26145 }, { "crossentropy": 2.4257867336273193, "epoch": 0.9478683294663574, "grad_norm": 0.025382045656442642, "grad_norm_var": 2.658163318155441e-07, "learning_rate": 6.966106913872061e-05, "loss": 2.4177, "step": 26146 }, { "crossentropy": 2.461524248123169, "epoch": 0.9479045823665894, "grad_norm": 0.026969900354743004, "grad_norm_var": 2.7815913214643275e-07, "learning_rate": 6.956444198816025e-05, "loss": 2.4732, "step": 26147 }, { "crossentropy": 2.4329118728637695, "epoch": 0.9479408352668214, "grad_norm": 0.025822851806879044, "grad_norm_var": 2.9693264357704326e-07, "learning_rate": 6.946788143061433e-05, "loss": 2.4362, "step": 26148 }, { "crossentropy": 2.4314863681793213, "epoch": 0.9479770881670534, "grad_norm": 0.02642294205725193, "grad_norm_var": 2.8992994387324185e-07, "learning_rate": 6.937138746738791e-05, "loss": 2.3758, "step": 26149 }, { "crossentropy": 2.2220616340637207, "epoch": 0.9480133410672854, "grad_norm": 0.026246318593621254, "grad_norm_var": 2.8325277649624406e-07, "learning_rate": 6.927496009978218e-05, "loss": 2.3153, "step": 26150 }, { "crossentropy": 2.402301073074341, "epoch": 0.9480495939675174, "grad_norm": 0.0271894671022892, "grad_norm_var": 3.230075030747718e-07, "learning_rate": 6.917859932910164e-05, "loss": 2.4127, "step": 26151 }, { "crossentropy": 2.499204397201538, "epoch": 0.9480858468677494, "grad_norm": 0.02691883035004139, "grad_norm_var": 3.032764767984507e-07, "learning_rate": 6.908230515664693e-05, "loss": 2.4955, "step": 26152 }, { "crossentropy": 2.437229633331299, "epoch": 0.9481220997679815, "grad_norm": 0.025523530319333076, "grad_norm_var": 3.2503295355381053e-07, "learning_rate": 6.898607758371812e-05, "loss": 2.4151, "step": 26153 }, { "crossentropy": 2.427375555038452, "epoch": 0.9481583526682135, "grad_norm": 0.027149809524416924, "grad_norm_var": 3.505237702598636e-07, "learning_rate": 6.888991661161526e-05, "loss": 2.4317, "step": 26154 }, { "crossentropy": 2.5884480476379395, "epoch": 0.9481946055684455, "grad_norm": 0.026629257947206497, "grad_norm_var": 3.341501458341163e-07, "learning_rate": 6.879382224163789e-05, "loss": 2.4817, "step": 26155 }, { "crossentropy": 2.317758083343506, "epoch": 0.9482308584686775, "grad_norm": 0.026665354147553444, "grad_norm_var": 3.288284753340138e-07, "learning_rate": 6.869779447508274e-05, "loss": 2.4113, "step": 26156 }, { "crossentropy": 2.5043370723724365, "epoch": 0.9482671113689095, "grad_norm": 0.026871906593441963, "grad_norm_var": 3.4484696909923424e-07, "learning_rate": 6.860183331324766e-05, "loss": 2.4298, "step": 26157 }, { "crossentropy": 2.418804883956909, "epoch": 0.9483033642691415, "grad_norm": 0.026519637554883957, "grad_norm_var": 3.4264080045671476e-07, "learning_rate": 6.850593875742827e-05, "loss": 2.3686, "step": 26158 }, { "crossentropy": 2.5894789695739746, "epoch": 0.9483396171693735, "grad_norm": 0.02716023288667202, "grad_norm_var": 3.2626701061849205e-07, "learning_rate": 6.841011080892023e-05, "loss": 2.6081, "step": 26159 }, { "crossentropy": 2.4089717864990234, "epoch": 0.9483758700696056, "grad_norm": 0.026854095980525017, "grad_norm_var": 3.150324892337945e-07, "learning_rate": 6.831434946901804e-05, "loss": 2.4267, "step": 26160 }, { "crossentropy": 2.4865152835845947, "epoch": 0.9484121229698376, "grad_norm": 0.026363622397184372, "grad_norm_var": 3.172586713945727e-07, "learning_rate": 6.821865473901401e-05, "loss": 2.4629, "step": 26161 }, { "crossentropy": 2.3449344635009766, "epoch": 0.9484483758700696, "grad_norm": 0.025559330359101295, "grad_norm_var": 2.9177778115527187e-07, "learning_rate": 6.812302662020154e-05, "loss": 2.4424, "step": 26162 }, { "crossentropy": 2.3514087200164795, "epoch": 0.9484846287703016, "grad_norm": 0.025820117443799973, "grad_norm_var": 3.106730656524115e-07, "learning_rate": 6.80274651138718e-05, "loss": 2.3875, "step": 26163 }, { "crossentropy": 2.475529193878174, "epoch": 0.9485208816705336, "grad_norm": 0.026017580181360245, "grad_norm_var": 2.959204223454415e-07, "learning_rate": 6.793197022131603e-05, "loss": 2.5253, "step": 26164 }, { "crossentropy": 2.2678964138031006, "epoch": 0.9485571345707656, "grad_norm": 0.025968272238969803, "grad_norm_var": 3.131788708471041e-07, "learning_rate": 6.783654194382371e-05, "loss": 2.4006, "step": 26165 }, { "crossentropy": 2.447833776473999, "epoch": 0.9485933874709976, "grad_norm": 0.0266293715685606, "grad_norm_var": 3.111251680403311e-07, "learning_rate": 6.774118028268327e-05, "loss": 2.4327, "step": 26166 }, { "crossentropy": 2.249502658843994, "epoch": 0.9486296403712297, "grad_norm": 0.02581820636987686, "grad_norm_var": 3.0076527551546547e-07, "learning_rate": 6.764588523918314e-05, "loss": 2.3259, "step": 26167 }, { "crossentropy": 2.289644479751587, "epoch": 0.9486658932714617, "grad_norm": 0.02632475644350052, "grad_norm_var": 2.820689011471163e-07, "learning_rate": 6.755065681461114e-05, "loss": 2.325, "step": 26168 }, { "crossentropy": 2.523348569869995, "epoch": 0.9487021461716937, "grad_norm": 0.027157984673976898, "grad_norm_var": 2.651769626667155e-07, "learning_rate": 6.745549501025239e-05, "loss": 2.5091, "step": 26169 }, { "crossentropy": 2.3513238430023193, "epoch": 0.9487383990719258, "grad_norm": 0.026874586939811707, "grad_norm_var": 2.449406563330888e-07, "learning_rate": 6.736039982739194e-05, "loss": 2.4236, "step": 26170 }, { "crossentropy": 2.415339946746826, "epoch": 0.9487746519721578, "grad_norm": 0.03006182610988617, "grad_norm_var": 1.0624088995894153e-06, "learning_rate": 6.726537126731602e-05, "loss": 2.437, "step": 26171 }, { "crossentropy": 2.364891290664673, "epoch": 0.9488109048723898, "grad_norm": 0.026681093499064445, "grad_norm_var": 1.0624215998830677e-06, "learning_rate": 6.71704093313058e-05, "loss": 2.423, "step": 26172 }, { "crossentropy": 2.4464588165283203, "epoch": 0.9488471577726219, "grad_norm": 0.025872405618429184, "grad_norm_var": 1.0976404581691424e-06, "learning_rate": 6.707551402064582e-05, "loss": 2.4598, "step": 26173 }, { "crossentropy": 2.525238037109375, "epoch": 0.9488834106728539, "grad_norm": 0.02708454243838787, "grad_norm_var": 1.1111410599062266e-06, "learning_rate": 6.698068533661672e-05, "loss": 2.4333, "step": 26174 }, { "crossentropy": 2.3940677642822266, "epoch": 0.9489196635730859, "grad_norm": 0.026736387982964516, "grad_norm_var": 1.0929974454264765e-06, "learning_rate": 6.688592328049914e-05, "loss": 2.4882, "step": 26175 }, { "crossentropy": 2.44700288772583, "epoch": 0.9489559164733179, "grad_norm": 0.0275399349629879, "grad_norm_var": 1.1443504840289668e-06, "learning_rate": 6.679122785357428e-05, "loss": 2.4905, "step": 26176 }, { "crossentropy": 2.380624294281006, "epoch": 0.9489921693735499, "grad_norm": 0.025934187695384026, "grad_norm_var": 1.1726674839049322e-06, "learning_rate": 6.669659905712e-05, "loss": 2.3969, "step": 26177 }, { "crossentropy": 2.4041526317596436, "epoch": 0.9490284222737819, "grad_norm": 0.026426125317811966, "grad_norm_var": 1.0958814658573822e-06, "learning_rate": 6.660203689241473e-05, "loss": 2.4891, "step": 26178 }, { "crossentropy": 2.3276724815368652, "epoch": 0.9490646751740139, "grad_norm": 0.025737576186656952, "grad_norm_var": 1.1058170669057579e-06, "learning_rate": 6.650754136073579e-05, "loss": 2.3267, "step": 26179 }, { "crossentropy": 2.4287123680114746, "epoch": 0.949100928074246, "grad_norm": 0.025710226967930794, "grad_norm_var": 1.1388286062595394e-06, "learning_rate": 6.641311246335879e-05, "loss": 2.4698, "step": 26180 }, { "crossentropy": 2.419729232788086, "epoch": 0.949137180974478, "grad_norm": 0.02718948945403099, "grad_norm_var": 1.1194316053693305e-06, "learning_rate": 6.631875020156053e-05, "loss": 2.4788, "step": 26181 }, { "crossentropy": 2.598724842071533, "epoch": 0.94917343387471, "grad_norm": 0.0277674812823534, "grad_norm_var": 1.184181187152226e-06, "learning_rate": 6.622445457661385e-05, "loss": 2.5065, "step": 26182 }, { "crossentropy": 2.4681482315063477, "epoch": 0.949209686774942, "grad_norm": 0.025491274893283844, "grad_norm_var": 1.2339769270540886e-06, "learning_rate": 6.613022558979331e-05, "loss": 2.4658, "step": 26183 }, { "crossentropy": 2.376577615737915, "epoch": 0.949245939675174, "grad_norm": 0.02608058974146843, "grad_norm_var": 1.2527472985037734e-06, "learning_rate": 6.603606324237176e-05, "loss": 2.3944, "step": 26184 }, { "crossentropy": 2.378108263015747, "epoch": 0.949282192575406, "grad_norm": 0.02689957059919834, "grad_norm_var": 1.2436081930463355e-06, "learning_rate": 6.594196753562098e-05, "loss": 2.4332, "step": 26185 }, { "crossentropy": 2.3390707969665527, "epoch": 0.949318445475638, "grad_norm": 0.025666281580924988, "grad_norm_var": 1.3156654714866945e-06, "learning_rate": 6.584793847081105e-05, "loss": 2.3927, "step": 26186 }, { "crossentropy": 2.4971508979797363, "epoch": 0.94935469837587, "grad_norm": 0.025432368740439415, "grad_norm_var": 5.676495873865294e-07, "learning_rate": 6.575397604921318e-05, "loss": 2.472, "step": 26187 }, { "crossentropy": 2.6133105754852295, "epoch": 0.9493909512761021, "grad_norm": 0.026609187945723534, "grad_norm_var": 5.65187620711558e-07, "learning_rate": 6.566008027209525e-05, "loss": 2.5889, "step": 26188 }, { "crossentropy": 2.512360095977783, "epoch": 0.9494272041763341, "grad_norm": 0.02617822214961052, "grad_norm_var": 5.500866105761481e-07, "learning_rate": 6.556625114072623e-05, "loss": 2.4495, "step": 26189 }, { "crossentropy": 2.3167006969451904, "epoch": 0.9494634570765661, "grad_norm": 0.026464639231562614, "grad_norm_var": 5.179551830311023e-07, "learning_rate": 6.547248865637345e-05, "loss": 2.2797, "step": 26190 }, { "crossentropy": 2.525336980819702, "epoch": 0.9494997099767981, "grad_norm": 0.02552821673452854, "grad_norm_var": 5.495953895456295e-07, "learning_rate": 6.537879282030257e-05, "loss": 2.2664, "step": 26191 }, { "crossentropy": 2.564365863800049, "epoch": 0.9495359628770301, "grad_norm": 0.026741093024611473, "grad_norm_var": 4.5644861290937365e-07, "learning_rate": 6.528516363378034e-05, "loss": 2.492, "step": 26192 }, { "crossentropy": 2.482558012008667, "epoch": 0.9495722157772621, "grad_norm": 0.0261065810918808, "grad_norm_var": 4.5125299494482484e-07, "learning_rate": 6.519160109807077e-05, "loss": 2.4271, "step": 26193 }, { "crossentropy": 2.315453052520752, "epoch": 0.9496084686774942, "grad_norm": 0.024866824969649315, "grad_norm_var": 5.669748131144666e-07, "learning_rate": 6.509810521443671e-05, "loss": 2.365, "step": 26194 }, { "crossentropy": 2.338071346282959, "epoch": 0.9496447215777262, "grad_norm": 0.02641202323138714, "grad_norm_var": 5.579256853324259e-07, "learning_rate": 6.500467598414217e-05, "loss": 2.3725, "step": 26195 }, { "crossentropy": 2.418532133102417, "epoch": 0.9496809744779582, "grad_norm": 0.025317251682281494, "grad_norm_var": 5.930568728705245e-07, "learning_rate": 6.491131340844835e-05, "loss": 2.4069, "step": 26196 }, { "crossentropy": 2.350388288497925, "epoch": 0.9497172273781903, "grad_norm": 0.02656247839331627, "grad_norm_var": 5.325599617107447e-07, "learning_rate": 6.481801748861704e-05, "loss": 2.3288, "step": 26197 }, { "crossentropy": 2.4580743312835693, "epoch": 0.9497534802784223, "grad_norm": 0.026168150827288628, "grad_norm_var": 3.4383048073357603e-07, "learning_rate": 6.472478822590722e-05, "loss": 2.4213, "step": 26198 }, { "crossentropy": 2.357935667037964, "epoch": 0.9497897331786543, "grad_norm": 0.02590925246477127, "grad_norm_var": 3.2457033631674326e-07, "learning_rate": 6.463162562157898e-05, "loss": 2.408, "step": 26199 }, { "crossentropy": 2.4555065631866455, "epoch": 0.9498259860788864, "grad_norm": 0.027724508196115494, "grad_norm_var": 4.982241808933284e-07, "learning_rate": 6.453852967689022e-05, "loss": 2.4364, "step": 26200 }, { "crossentropy": 2.405057430267334, "epoch": 0.9498622389791184, "grad_norm": 0.028233658522367477, "grad_norm_var": 7.407184315706579e-07, "learning_rate": 6.444550039309827e-05, "loss": 2.4085, "step": 26201 }, { "crossentropy": 2.3366925716400146, "epoch": 0.9498984918793504, "grad_norm": 0.027066294103860855, "grad_norm_var": 7.551835993139614e-07, "learning_rate": 6.435253777146044e-05, "loss": 2.4507, "step": 26202 }, { "crossentropy": 2.496793746948242, "epoch": 0.9499347447795824, "grad_norm": 0.02604427933692932, "grad_norm_var": 7.051419485325777e-07, "learning_rate": 6.42596418132313e-05, "loss": 2.4899, "step": 26203 }, { "crossentropy": 2.453835964202881, "epoch": 0.9499709976798144, "grad_norm": 0.026950649917125702, "grad_norm_var": 7.232829979053034e-07, "learning_rate": 6.416681251966594e-05, "loss": 2.4698, "step": 26204 }, { "crossentropy": 2.544996500015259, "epoch": 0.9500072505800464, "grad_norm": 0.02656269446015358, "grad_norm_var": 7.21555989442677e-07, "learning_rate": 6.407404989201837e-05, "loss": 2.4763, "step": 26205 }, { "crossentropy": 2.443401336669922, "epoch": 0.9500435034802784, "grad_norm": 0.02668338268995285, "grad_norm_var": 7.259604025669816e-07, "learning_rate": 6.398135393154092e-05, "loss": 2.4685, "step": 26206 }, { "crossentropy": 2.4323744773864746, "epoch": 0.9500797563805105, "grad_norm": 0.027441220358014107, "grad_norm_var": 7.247114548278859e-07, "learning_rate": 6.388872463948592e-05, "loss": 2.4853, "step": 26207 }, { "crossentropy": 2.5912153720855713, "epoch": 0.9501160092807425, "grad_norm": 0.02704240195453167, "grad_norm_var": 7.380869652413713e-07, "learning_rate": 6.379616201710515e-05, "loss": 2.4866, "step": 26208 }, { "crossentropy": 2.227931261062622, "epoch": 0.9501522621809745, "grad_norm": 0.026464318856596947, "grad_norm_var": 7.240656675908466e-07, "learning_rate": 6.370366606564759e-05, "loss": 2.3641, "step": 26209 }, { "crossentropy": 2.3338005542755127, "epoch": 0.9501885150812065, "grad_norm": 0.026407107710838318, "grad_norm_var": 5.183343390264035e-07, "learning_rate": 6.361123678636338e-05, "loss": 2.2775, "step": 26210 }, { "crossentropy": 2.3897509574890137, "epoch": 0.9502247679814385, "grad_norm": 0.026435865089297295, "grad_norm_var": 5.174962009489436e-07, "learning_rate": 6.351887418050039e-05, "loss": 2.431, "step": 26211 }, { "crossentropy": 2.3077101707458496, "epoch": 0.9502610208816705, "grad_norm": 0.02632349729537964, "grad_norm_var": 3.9682516800701855e-07, "learning_rate": 6.342657824930654e-05, "loss": 2.4133, "step": 26212 }, { "crossentropy": 2.415126085281372, "epoch": 0.9502972737819025, "grad_norm": 0.028683526441454887, "grad_norm_var": 6.246213886696809e-07, "learning_rate": 6.333434899402801e-05, "loss": 2.434, "step": 26213 }, { "crossentropy": 2.261347770690918, "epoch": 0.9503335266821346, "grad_norm": 0.029251080006361008, "grad_norm_var": 9.24476685363222e-07, "learning_rate": 6.324218641591106e-05, "loss": 2.3389, "step": 26214 }, { "crossentropy": 2.4092419147491455, "epoch": 0.9503697795823666, "grad_norm": 0.0257544107735157, "grad_norm_var": 9.500733210197522e-07, "learning_rate": 6.315009051620024e-05, "loss": 2.5197, "step": 26215 }, { "crossentropy": 2.3194408416748047, "epoch": 0.9504060324825986, "grad_norm": 0.02558271959424019, "grad_norm_var": 1.048955757891008e-06, "learning_rate": 6.305806129613956e-05, "loss": 2.3354, "step": 26216 }, { "crossentropy": 2.4855408668518066, "epoch": 0.9504422853828306, "grad_norm": 0.026799136772751808, "grad_norm_var": 9.287844827253477e-07, "learning_rate": 6.296609875697134e-05, "loss": 2.422, "step": 26217 }, { "crossentropy": 2.3866329193115234, "epoch": 0.9504785382830626, "grad_norm": 0.026523690670728683, "grad_norm_var": 9.310516853697215e-07, "learning_rate": 6.28742028999385e-05, "loss": 2.4411, "step": 26218 }, { "crossentropy": 2.505166530609131, "epoch": 0.9505147911832946, "grad_norm": 0.025681177154183388, "grad_norm_var": 9.763328844168782e-07, "learning_rate": 6.27823737262817e-05, "loss": 2.5399, "step": 26219 }, { "crossentropy": 2.732898712158203, "epoch": 0.9505510440835266, "grad_norm": 0.026353133842349052, "grad_norm_var": 9.855836862054934e-07, "learning_rate": 6.269061123724162e-05, "loss": 2.5768, "step": 26220 }, { "crossentropy": 2.416898012161255, "epoch": 0.9505872969837587, "grad_norm": 0.026650456711649895, "grad_norm_var": 9.838810726207527e-07, "learning_rate": 6.259891543405727e-05, "loss": 2.4227, "step": 26221 }, { "crossentropy": 2.5438973903656006, "epoch": 0.9506235498839907, "grad_norm": 0.026270292699337006, "grad_norm_var": 9.984809734761655e-07, "learning_rate": 6.250728631796764e-05, "loss": 2.4987, "step": 26222 }, { "crossentropy": 2.4981136322021484, "epoch": 0.9506598027842227, "grad_norm": 0.026998693123459816, "grad_norm_var": 9.68696915673338e-07, "learning_rate": 6.241572389020955e-05, "loss": 2.414, "step": 26223 }, { "crossentropy": 2.3591959476470947, "epoch": 0.9506960556844548, "grad_norm": 0.02616135962307453, "grad_norm_var": 9.77146808463789e-07, "learning_rate": 6.232422815202088e-05, "loss": 2.4211, "step": 26224 }, { "crossentropy": 2.4149222373962402, "epoch": 0.9507323085846868, "grad_norm": 0.026701046153903008, "grad_norm_var": 9.749059677331088e-07, "learning_rate": 6.223279910463619e-05, "loss": 2.4498, "step": 26225 }, { "crossentropy": 2.5078346729278564, "epoch": 0.9507685614849188, "grad_norm": 0.026723092421889305, "grad_norm_var": 9.704464097911569e-07, "learning_rate": 6.214143674929062e-05, "loss": 2.4016, "step": 26226 }, { "crossentropy": 2.4869840145111084, "epoch": 0.9508048143851509, "grad_norm": 0.02594040520489216, "grad_norm_var": 1.0019712261039501e-06, "learning_rate": 6.205014108721929e-05, "loss": 2.4509, "step": 26227 }, { "crossentropy": 2.463392972946167, "epoch": 0.9508410672853829, "grad_norm": 0.025683656334877014, "grad_norm_var": 1.0554009800418003e-06, "learning_rate": 6.195891211965343e-05, "loss": 2.4388, "step": 26228 }, { "crossentropy": 2.268362283706665, "epoch": 0.9508773201856149, "grad_norm": 0.025920260697603226, "grad_norm_var": 7.686188423498685e-07, "learning_rate": 6.18677498478265e-05, "loss": 2.3754, "step": 26229 }, { "crossentropy": 2.473829746246338, "epoch": 0.9509135730858469, "grad_norm": 0.025494394823908806, "grad_norm_var": 2.4119497368015517e-07, "learning_rate": 6.177665427296975e-05, "loss": 2.305, "step": 26230 }, { "crossentropy": 2.3259847164154053, "epoch": 0.9509498259860789, "grad_norm": 0.0260618943721056, "grad_norm_var": 2.2873874331098774e-07, "learning_rate": 6.168562539631328e-05, "loss": 2.3865, "step": 26231 }, { "crossentropy": 2.4301998615264893, "epoch": 0.9509860788863109, "grad_norm": 0.0262079369276762, "grad_norm_var": 1.999122296617491e-07, "learning_rate": 6.159466321908669e-05, "loss": 2.3897, "step": 26232 }, { "crossentropy": 2.2652223110198975, "epoch": 0.9510223317865429, "grad_norm": 0.026810873299837112, "grad_norm_var": 2.007634784829352e-07, "learning_rate": 6.150376774251787e-05, "loss": 2.2162, "step": 26233 }, { "crossentropy": 2.586355686187744, "epoch": 0.951058584686775, "grad_norm": 0.027236752212047577, "grad_norm_var": 2.574794916353062e-07, "learning_rate": 6.141293896783584e-05, "loss": 2.5338, "step": 26234 }, { "crossentropy": 2.461965322494507, "epoch": 0.951094837587007, "grad_norm": 0.028536492958664894, "grad_norm_var": 5.291693909165954e-07, "learning_rate": 6.132217689626685e-05, "loss": 2.4837, "step": 26235 }, { "crossentropy": 2.148815631866455, "epoch": 0.951131090487239, "grad_norm": 0.02642197161912918, "grad_norm_var": 5.282605506438943e-07, "learning_rate": 6.123148152903546e-05, "loss": 2.3623, "step": 26236 }, { "crossentropy": 2.363616943359375, "epoch": 0.951167343387471, "grad_norm": 0.026296885684132576, "grad_norm_var": 5.284492824506396e-07, "learning_rate": 6.114085286736849e-05, "loss": 2.4482, "step": 26237 }, { "crossentropy": 2.3639612197875977, "epoch": 0.951203596287703, "grad_norm": 0.02643999457359314, "grad_norm_var": 5.25806796712082e-07, "learning_rate": 6.105029091248938e-05, "loss": 2.4102, "step": 26238 }, { "crossentropy": 2.398890733718872, "epoch": 0.951239849187935, "grad_norm": 0.02707843855023384, "grad_norm_var": 5.31748808317101e-07, "learning_rate": 6.095979566562049e-05, "loss": 2.4025, "step": 26239 }, { "crossentropy": 2.390608549118042, "epoch": 0.951276102088167, "grad_norm": 0.025975193828344345, "grad_norm_var": 5.418792449083295e-07, "learning_rate": 6.086936712798585e-05, "loss": 2.4684, "step": 26240 }, { "crossentropy": 2.3691821098327637, "epoch": 0.9513123549883991, "grad_norm": 0.025380907580256462, "grad_norm_var": 6.102359157709088e-07, "learning_rate": 6.0779005300805044e-05, "loss": 2.3996, "step": 26241 }, { "crossentropy": 2.4407293796539307, "epoch": 0.9513486078886311, "grad_norm": 0.026901815086603165, "grad_norm_var": 6.202157113953937e-07, "learning_rate": 6.0688710185299314e-05, "loss": 2.3927, "step": 26242 }, { "crossentropy": 2.5209579467773438, "epoch": 0.9513848607888631, "grad_norm": 0.026553984731435776, "grad_norm_var": 6.062079770057423e-07, "learning_rate": 6.059848178268879e-05, "loss": 2.5116, "step": 26243 }, { "crossentropy": 2.3708314895629883, "epoch": 0.9514211136890951, "grad_norm": 0.025676080957055092, "grad_norm_var": 6.069730754730638e-07, "learning_rate": 6.0508320094191384e-05, "loss": 2.4731, "step": 26244 }, { "crossentropy": 2.439648389816284, "epoch": 0.9514573665893271, "grad_norm": 0.026885762810707092, "grad_norm_var": 5.98698393347224e-07, "learning_rate": 6.041822512102446e-05, "loss": 2.4696, "step": 26245 }, { "crossentropy": 2.2357656955718994, "epoch": 0.9514936194895591, "grad_norm": 0.025658123195171356, "grad_norm_var": 5.784764385864208e-07, "learning_rate": 6.032819686440594e-05, "loss": 2.2163, "step": 26246 }, { "crossentropy": 2.4053940773010254, "epoch": 0.9515298723897911, "grad_norm": 0.026365863159298897, "grad_norm_var": 5.661833518184427e-07, "learning_rate": 6.023823532555151e-05, "loss": 2.3661, "step": 26247 }, { "crossentropy": 2.5946404933929443, "epoch": 0.9515661252900232, "grad_norm": 0.02718711830675602, "grad_norm_var": 5.84492199578029e-07, "learning_rate": 6.0148340505675746e-05, "loss": 2.4906, "step": 26248 }, { "crossentropy": 2.4027163982391357, "epoch": 0.9516023781902552, "grad_norm": 0.025538433343172073, "grad_norm_var": 6.478553134671323e-07, "learning_rate": 6.0058512405993246e-05, "loss": 2.4369, "step": 26249 }, { "crossentropy": 2.46966290473938, "epoch": 0.9516386310904872, "grad_norm": 0.0266135111451149, "grad_norm_var": 6.116039317673992e-07, "learning_rate": 5.996875102771693e-05, "loss": 2.5421, "step": 26250 }, { "crossentropy": 2.434921979904175, "epoch": 0.9516748839907193, "grad_norm": 0.02671343833208084, "grad_norm_var": 3.1687068862266447e-07, "learning_rate": 5.987905637206026e-05, "loss": 2.4613, "step": 26251 }, { "crossentropy": 2.4844322204589844, "epoch": 0.9517111368909513, "grad_norm": 0.02570522017776966, "grad_norm_var": 3.4262364440728935e-07, "learning_rate": 5.9789428440233384e-05, "loss": 2.4433, "step": 26252 }, { "crossentropy": 2.294060230255127, "epoch": 0.9517473897911833, "grad_norm": 0.026132481172680855, "grad_norm_var": 3.446151781188433e-07, "learning_rate": 5.9699867233447e-05, "loss": 2.3129, "step": 26253 }, { "crossentropy": 2.405550479888916, "epoch": 0.9517836426914154, "grad_norm": 0.026974279433488846, "grad_norm_var": 3.7240103036411305e-07, "learning_rate": 5.961037275291126e-05, "loss": 2.4767, "step": 26254 }, { "crossentropy": 2.390979051589966, "epoch": 0.9518198955916474, "grad_norm": 0.02663998492062092, "grad_norm_var": 3.4088365798028557e-07, "learning_rate": 5.952094499983462e-05, "loss": 2.3656, "step": 26255 }, { "crossentropy": 2.4901225566864014, "epoch": 0.9518561484918794, "grad_norm": 0.02622380293905735, "grad_norm_var": 3.337681988124207e-07, "learning_rate": 5.943158397542503e-05, "loss": 2.5007, "step": 26256 }, { "crossentropy": 2.427037239074707, "epoch": 0.9518924013921114, "grad_norm": 0.02615157514810562, "grad_norm_var": 2.741938024209637e-07, "learning_rate": 5.934228968088984e-05, "loss": 2.5064, "step": 26257 }, { "crossentropy": 2.469278573989868, "epoch": 0.9519286542923434, "grad_norm": 0.02561163902282715, "grad_norm_var": 2.867596221963007e-07, "learning_rate": 5.92530621174342e-05, "loss": 2.4489, "step": 26258 }, { "crossentropy": 2.475903272628784, "epoch": 0.9519649071925754, "grad_norm": 0.026389610022306442, "grad_norm_var": 2.826507385664343e-07, "learning_rate": 5.916390128626437e-05, "loss": 2.527, "step": 26259 }, { "crossentropy": 2.432770252227783, "epoch": 0.9520011600928074, "grad_norm": 0.025266746059060097, "grad_norm_var": 3.260390114830641e-07, "learning_rate": 5.907480718858383e-05, "loss": 2.4209, "step": 26260 }, { "crossentropy": 2.4108471870422363, "epoch": 0.9520374129930395, "grad_norm": 0.026189418509602547, "grad_norm_var": 2.9765118093460823e-07, "learning_rate": 5.898577982559605e-05, "loss": 2.4324, "step": 26261 }, { "crossentropy": 2.212865114212036, "epoch": 0.9520736658932715, "grad_norm": 0.026551416143774986, "grad_norm_var": 2.817834943453218e-07, "learning_rate": 5.889681919850398e-05, "loss": 2.2677, "step": 26262 }, { "crossentropy": 2.4712438583374023, "epoch": 0.9521099187935035, "grad_norm": 0.02631252445280552, "grad_norm_var": 2.812504484639289e-07, "learning_rate": 5.880792530850832e-05, "loss": 2.334, "step": 26263 }, { "crossentropy": 2.3474552631378174, "epoch": 0.9521461716937355, "grad_norm": 0.025655148550868034, "grad_norm_var": 2.390839944624388e-07, "learning_rate": 5.8719098156810315e-05, "loss": 2.326, "step": 26264 }, { "crossentropy": 2.4204964637756348, "epoch": 0.9521824245939675, "grad_norm": 0.026063067838549614, "grad_norm_var": 2.1232965868173137e-07, "learning_rate": 5.8630337744609575e-05, "loss": 2.4696, "step": 26265 }, { "crossentropy": 2.4366374015808105, "epoch": 0.9522186774941995, "grad_norm": 0.0268042404204607, "grad_norm_var": 2.2512883918432366e-07, "learning_rate": 5.8541644073104584e-05, "loss": 2.4376, "step": 26266 }, { "crossentropy": 2.224944829940796, "epoch": 0.9522549303944315, "grad_norm": 0.02547001652419567, "grad_norm_var": 2.3854995483037675e-07, "learning_rate": 5.845301714349382e-05, "loss": 2.2824, "step": 26267 }, { "crossentropy": 2.557389736175537, "epoch": 0.9522911832946636, "grad_norm": 0.026508862152695656, "grad_norm_var": 2.329891965105931e-07, "learning_rate": 5.8364456956974124e-05, "loss": 2.4586, "step": 26268 }, { "crossentropy": 2.4843451976776123, "epoch": 0.9523274361948956, "grad_norm": 0.026413166895508766, "grad_norm_var": 2.3598324391989796e-07, "learning_rate": 5.8275963514741734e-05, "loss": 2.3304, "step": 26269 }, { "crossentropy": 2.468094825744629, "epoch": 0.9523636890951276, "grad_norm": 0.025898702442646027, "grad_norm_var": 1.9747629273752833e-07, "learning_rate": 5.818753681799182e-05, "loss": 2.4878, "step": 26270 }, { "crossentropy": 2.3505685329437256, "epoch": 0.9523999419953596, "grad_norm": 0.025512361899018288, "grad_norm_var": 2.0092809692306748e-07, "learning_rate": 5.809917686791844e-05, "loss": 2.4028, "step": 26271 }, { "crossentropy": 2.446352481842041, "epoch": 0.9524361948955916, "grad_norm": 0.026652002707123756, "grad_norm_var": 2.21517534189306e-07, "learning_rate": 5.801088366571561e-05, "loss": 2.3914, "step": 26272 }, { "crossentropy": 2.4715213775634766, "epoch": 0.9524724477958236, "grad_norm": 0.026807166635990143, "grad_norm_var": 2.537051080455631e-07, "learning_rate": 5.792265721257517e-05, "loss": 2.455, "step": 26273 }, { "crossentropy": 2.331120729446411, "epoch": 0.9525087006960556, "grad_norm": 0.02682396024465561, "grad_norm_var": 2.615098628671574e-07, "learning_rate": 5.78344975096895e-05, "loss": 2.3559, "step": 26274 }, { "crossentropy": 2.4354522228240967, "epoch": 0.9525449535962877, "grad_norm": 0.02635667845606804, "grad_norm_var": 2.6077758500381065e-07, "learning_rate": 5.774640455824876e-05, "loss": 2.3614, "step": 26275 }, { "crossentropy": 2.49817156791687, "epoch": 0.9525812064965197, "grad_norm": 0.026035619899630547, "grad_norm_var": 2.0150388848038287e-07, "learning_rate": 5.7658378359443097e-05, "loss": 2.4594, "step": 26276 }, { "crossentropy": 2.5868308544158936, "epoch": 0.9526174593967517, "grad_norm": 0.026834305375814438, "grad_norm_var": 2.2199513574695736e-07, "learning_rate": 5.757041891446102e-05, "loss": 2.4664, "step": 26277 }, { "crossentropy": 2.40523624420166, "epoch": 0.9526537122969838, "grad_norm": 0.02633107826113701, "grad_norm_var": 2.174582253500664e-07, "learning_rate": 5.748252622449157e-05, "loss": 2.3901, "step": 26278 }, { "crossentropy": 2.3245627880096436, "epoch": 0.9526899651972158, "grad_norm": 0.025720838457345963, "grad_norm_var": 2.3676768321835515e-07, "learning_rate": 5.7394700290719917e-05, "loss": 2.3256, "step": 26279 }, { "crossentropy": 2.44982647895813, "epoch": 0.9527262180974478, "grad_norm": 0.02667449414730072, "grad_norm_var": 2.2181942197907247e-07, "learning_rate": 5.7306941114334544e-05, "loss": 2.3731, "step": 26280 }, { "crossentropy": 2.457188129425049, "epoch": 0.9527624709976799, "grad_norm": 0.026332831010222435, "grad_norm_var": 2.1760605135167966e-07, "learning_rate": 5.7219248696518954e-05, "loss": 2.4363, "step": 26281 }, { "crossentropy": 2.4633407592773438, "epoch": 0.9527987238979119, "grad_norm": 0.0265471450984478, "grad_norm_var": 2.0525839231377795e-07, "learning_rate": 5.713162303845887e-05, "loss": 2.4936, "step": 26282 }, { "crossentropy": 2.2490622997283936, "epoch": 0.9528349767981439, "grad_norm": 0.025646638125181198, "grad_norm_var": 1.8748686839472015e-07, "learning_rate": 5.7044064141337225e-05, "loss": 2.2434, "step": 26283 }, { "crossentropy": 2.355079412460327, "epoch": 0.9528712296983759, "grad_norm": 0.0258474200963974, "grad_norm_var": 1.9804168001449848e-07, "learning_rate": 5.6956572006336416e-05, "loss": 2.4093, "step": 26284 }, { "crossentropy": 2.4577856063842773, "epoch": 0.9529074825986079, "grad_norm": 0.026728427037596703, "grad_norm_var": 2.0997089165800063e-07, "learning_rate": 5.6869146634638826e-05, "loss": 2.4253, "step": 26285 }, { "crossentropy": 2.4560165405273438, "epoch": 0.9529437354988399, "grad_norm": 0.02659149095416069, "grad_norm_var": 2.0319012609576413e-07, "learning_rate": 5.678178802742517e-05, "loss": 2.4813, "step": 26286 }, { "crossentropy": 2.558328628540039, "epoch": 0.9529799883990719, "grad_norm": 0.025540921837091446, "grad_norm_var": 2.0008888132050142e-07, "learning_rate": 5.669449618587452e-05, "loss": 2.4315, "step": 26287 }, { "crossentropy": 2.4402217864990234, "epoch": 0.953016241299304, "grad_norm": 0.026651885360479355, "grad_norm_var": 2.000840308508661e-07, "learning_rate": 5.660727111116648e-05, "loss": 2.4101, "step": 26288 }, { "crossentropy": 2.4922144412994385, "epoch": 0.953052494199536, "grad_norm": 0.026022914797067642, "grad_norm_var": 1.8987649394914114e-07, "learning_rate": 5.6520112804478995e-05, "loss": 2.4322, "step": 26289 }, { "crossentropy": 2.4576058387756348, "epoch": 0.953088747099768, "grad_norm": 0.02688957378268242, "grad_norm_var": 1.947913942785719e-07, "learning_rate": 5.643302126698946e-05, "loss": 2.4672, "step": 26290 }, { "crossentropy": 2.3808929920196533, "epoch": 0.953125, "grad_norm": 0.026264159008860588, "grad_norm_var": 1.9459039828801742e-07, "learning_rate": 5.634599649987415e-05, "loss": 2.4353, "step": 26291 }, { "crossentropy": 2.407207727432251, "epoch": 0.953161252900232, "grad_norm": 0.026607908308506012, "grad_norm_var": 1.9555536339605344e-07, "learning_rate": 5.625903850430825e-05, "loss": 2.4105, "step": 26292 }, { "crossentropy": 2.5154542922973633, "epoch": 0.953197505800464, "grad_norm": 0.026519298553466797, "grad_norm_var": 1.804499903810258e-07, "learning_rate": 5.617214728146691e-05, "loss": 2.4852, "step": 26293 }, { "crossentropy": 2.5582144260406494, "epoch": 0.953233758700696, "grad_norm": 0.02602188102900982, "grad_norm_var": 1.8544546305486927e-07, "learning_rate": 5.60853228325231e-05, "loss": 2.5087, "step": 26294 }, { "crossentropy": 2.3920416831970215, "epoch": 0.9532700116009281, "grad_norm": 0.027548521757125854, "grad_norm_var": 2.560125050653112e-07, "learning_rate": 5.599856515864976e-05, "loss": 2.4493, "step": 26295 }, { "crossentropy": 2.3455276489257812, "epoch": 0.9533062645011601, "grad_norm": 0.02576279640197754, "grad_norm_var": 2.748644211384224e-07, "learning_rate": 5.591187426101818e-05, "loss": 2.3694, "step": 26296 }, { "crossentropy": 2.427156925201416, "epoch": 0.9533425174013921, "grad_norm": 0.026420539245009422, "grad_norm_var": 2.7520012058166593e-07, "learning_rate": 5.582525014079964e-05, "loss": 2.4523, "step": 26297 }, { "crossentropy": 2.382038116455078, "epoch": 0.9533787703016241, "grad_norm": 0.025364425033330917, "grad_norm_var": 3.316513523083167e-07, "learning_rate": 5.5738692799164324e-05, "loss": 2.4002, "step": 26298 }, { "crossentropy": 2.610466957092285, "epoch": 0.9534150232018561, "grad_norm": 0.02649580128490925, "grad_norm_var": 3.053706892330952e-07, "learning_rate": 5.565220223728129e-05, "loss": 2.644, "step": 26299 }, { "crossentropy": 2.4646239280700684, "epoch": 0.9534512761020881, "grad_norm": 0.026155708357691765, "grad_norm_var": 2.914795259353435e-07, "learning_rate": 5.556577845631849e-05, "loss": 2.4233, "step": 26300 }, { "crossentropy": 2.4160947799682617, "epoch": 0.9534875290023201, "grad_norm": 0.025955041870474815, "grad_norm_var": 2.8975106745159537e-07, "learning_rate": 5.5479421457443335e-05, "loss": 2.3741, "step": 26301 }, { "crossentropy": 2.4276912212371826, "epoch": 0.9535237819025522, "grad_norm": 0.026896925643086433, "grad_norm_var": 3.074198224155596e-07, "learning_rate": 5.53931312418221e-05, "loss": 2.3999, "step": 26302 }, { "crossentropy": 2.544240713119507, "epoch": 0.9535600348027842, "grad_norm": 0.026602942496538162, "grad_norm_var": 2.6760822626873255e-07, "learning_rate": 5.5306907810620535e-05, "loss": 2.5561, "step": 26303 }, { "crossentropy": 2.3307483196258545, "epoch": 0.9535962877030162, "grad_norm": 0.026121698319911957, "grad_norm_var": 2.6640010658561933e-07, "learning_rate": 5.522075116500269e-05, "loss": 2.3846, "step": 26304 }, { "crossentropy": 2.503538131713867, "epoch": 0.9536325406032483, "grad_norm": 0.025717269629240036, "grad_norm_var": 2.8569609196008445e-07, "learning_rate": 5.513466130613265e-05, "loss": 2.4684, "step": 26305 }, { "crossentropy": 2.222039222717285, "epoch": 0.9536687935034803, "grad_norm": 0.02603389509022236, "grad_norm_var": 2.6807553128333865e-07, "learning_rate": 5.5048638235172807e-05, "loss": 2.3356, "step": 26306 }, { "crossentropy": 2.4131951332092285, "epoch": 0.9537050464037123, "grad_norm": 0.02589685283601284, "grad_norm_var": 2.7731041710536234e-07, "learning_rate": 5.496268195328613e-05, "loss": 2.3972, "step": 26307 }, { "crossentropy": 2.3568594455718994, "epoch": 0.9537412993039444, "grad_norm": 0.025736767798662186, "grad_norm_var": 2.8405106095024993e-07, "learning_rate": 5.487679246163224e-05, "loss": 2.4867, "step": 26308 }, { "crossentropy": 2.5186896324157715, "epoch": 0.9537775522041764, "grad_norm": 0.024925295263528824, "grad_norm_var": 3.756612615803586e-07, "learning_rate": 5.479096976137188e-05, "loss": 2.4517, "step": 26309 }, { "crossentropy": 2.4654855728149414, "epoch": 0.9538138051044084, "grad_norm": 0.02592362090945244, "grad_norm_var": 3.7733431738465735e-07, "learning_rate": 5.470521385366411e-05, "loss": 2.442, "step": 26310 }, { "crossentropy": 2.328313112258911, "epoch": 0.9538500580046404, "grad_norm": 0.02685394324362278, "grad_norm_var": 2.730959833846062e-07, "learning_rate": 5.461952473966747e-05, "loss": 2.4232, "step": 26311 }, { "crossentropy": 2.6186602115631104, "epoch": 0.9538863109048724, "grad_norm": 0.027492258697748184, "grad_norm_var": 3.928927556656169e-07, "learning_rate": 5.4533902420538796e-05, "loss": 2.5764, "step": 26312 }, { "crossentropy": 2.2806413173675537, "epoch": 0.9539225638051044, "grad_norm": 0.0273294560611248, "grad_norm_var": 4.758504896763682e-07, "learning_rate": 5.444834689743439e-05, "loss": 2.3613, "step": 26313 }, { "crossentropy": 2.616478204727173, "epoch": 0.9539588167053364, "grad_norm": 0.026141269132494926, "grad_norm_var": 4.2506578749734503e-07, "learning_rate": 5.4362858171510544e-05, "loss": 2.478, "step": 26314 }, { "crossentropy": 2.2941970825195312, "epoch": 0.9539950696055685, "grad_norm": 0.02634354680776596, "grad_norm_var": 4.218783839878554e-07, "learning_rate": 5.4277436243921895e-05, "loss": 2.3266, "step": 26315 }, { "crossentropy": 2.477109909057617, "epoch": 0.9540313225058005, "grad_norm": 0.02600422501564026, "grad_norm_var": 4.2537674470732636e-07, "learning_rate": 5.419208111582086e-05, "loss": 2.4609, "step": 26316 }, { "crossentropy": 2.2750601768493652, "epoch": 0.9540675754060325, "grad_norm": 0.026732444763183594, "grad_norm_var": 4.3273734304989134e-07, "learning_rate": 5.4106792788362614e-05, "loss": 2.393, "step": 26317 }, { "crossentropy": 2.447730541229248, "epoch": 0.9541038283062645, "grad_norm": 0.025878015905618668, "grad_norm_var": 4.161242273821586e-07, "learning_rate": 5.40215712626968e-05, "loss": 2.4733, "step": 26318 }, { "crossentropy": 2.4605414867401123, "epoch": 0.9541400812064965, "grad_norm": 0.026854010298848152, "grad_norm_var": 4.3243649268096854e-07, "learning_rate": 5.393641653997583e-05, "loss": 2.4864, "step": 26319 }, { "crossentropy": 2.432950735092163, "epoch": 0.9541763341067285, "grad_norm": 0.026296110823750496, "grad_norm_var": 4.3137649650924297e-07, "learning_rate": 5.3851328621349896e-05, "loss": 2.445, "step": 26320 }, { "crossentropy": 2.311124086380005, "epoch": 0.9542125870069605, "grad_norm": 0.026207370683550835, "grad_norm_var": 4.1092739636153634e-07, "learning_rate": 5.376630750796696e-05, "loss": 2.4296, "step": 26321 }, { "crossentropy": 2.479046106338501, "epoch": 0.9542488399071926, "grad_norm": 0.02684440091252327, "grad_norm_var": 4.2424692335418557e-07, "learning_rate": 5.368135320097667e-05, "loss": 2.4817, "step": 26322 }, { "crossentropy": 2.388491630554199, "epoch": 0.9542850928074246, "grad_norm": 0.026872573420405388, "grad_norm_var": 4.2593785205124793e-07, "learning_rate": 5.359646570152588e-05, "loss": 2.333, "step": 26323 }, { "crossentropy": 2.329909563064575, "epoch": 0.9543213457076566, "grad_norm": 0.026451272889971733, "grad_norm_var": 3.9445058455596775e-07, "learning_rate": 5.3511645010761446e-05, "loss": 2.3952, "step": 26324 }, { "crossentropy": 2.4569754600524902, "epoch": 0.9543575986078886, "grad_norm": 0.026411397382616997, "grad_norm_var": 2.3098771425311692e-07, "learning_rate": 5.342689112982857e-05, "loss": 2.4445, "step": 26325 }, { "crossentropy": 2.521514892578125, "epoch": 0.9543938515081206, "grad_norm": 0.02575433999300003, "grad_norm_var": 2.4668511849988e-07, "learning_rate": 5.3342204059872446e-05, "loss": 2.5089, "step": 26326 }, { "crossentropy": 2.5396530628204346, "epoch": 0.9544301044083526, "grad_norm": 0.026563378050923347, "grad_norm_var": 2.3937930003301715e-07, "learning_rate": 5.32575838020366e-05, "loss": 2.4742, "step": 26327 }, { "crossentropy": 2.5531139373779297, "epoch": 0.9544663573085846, "grad_norm": 0.026428185403347015, "grad_norm_var": 1.7092818665164396e-07, "learning_rate": 5.317303035746457e-05, "loss": 2.4938, "step": 26328 }, { "crossentropy": 2.4910898208618164, "epoch": 0.9545026102088167, "grad_norm": 0.025742249563336372, "grad_norm_var": 1.410986152775336e-07, "learning_rate": 5.30885437272971e-05, "loss": 2.4902, "step": 26329 }, { "crossentropy": 2.350338935852051, "epoch": 0.9545388631090487, "grad_norm": 0.025490880012512207, "grad_norm_var": 1.8522970306374583e-07, "learning_rate": 5.300412391267606e-05, "loss": 2.3991, "step": 26330 }, { "crossentropy": 2.544685125350952, "epoch": 0.9545751160092807, "grad_norm": 0.02645489200949669, "grad_norm_var": 1.865820239518289e-07, "learning_rate": 5.291977091474165e-05, "loss": 2.4842, "step": 26331 }, { "crossentropy": 2.369849443435669, "epoch": 0.9546113689095128, "grad_norm": 0.02595875784754753, "grad_norm_var": 1.885746796696323e-07, "learning_rate": 5.283548473463351e-05, "loss": 2.3851, "step": 26332 }, { "crossentropy": 2.336564302444458, "epoch": 0.9546476218097448, "grad_norm": 0.025695346295833588, "grad_norm_var": 1.972120008924642e-07, "learning_rate": 5.275126537348962e-05, "loss": 2.3655, "step": 26333 }, { "crossentropy": 2.363546133041382, "epoch": 0.9546838747099768, "grad_norm": 0.026466388255357742, "grad_norm_var": 1.901410676091268e-07, "learning_rate": 5.266711283244741e-05, "loss": 2.4298, "step": 26334 }, { "crossentropy": 2.5271265506744385, "epoch": 0.9547201276102089, "grad_norm": 0.026435039937496185, "grad_norm_var": 1.6908664351096501e-07, "learning_rate": 5.2583027112643735e-05, "loss": 2.5629, "step": 26335 }, { "crossentropy": 2.3737971782684326, "epoch": 0.9547563805104409, "grad_norm": 0.026049261912703514, "grad_norm_var": 1.7152669796066145e-07, "learning_rate": 5.249900821521492e-05, "loss": 2.5558, "step": 26336 }, { "crossentropy": 2.419037342071533, "epoch": 0.9547926334106729, "grad_norm": 0.025929812341928482, "grad_norm_var": 1.7751615531066965e-07, "learning_rate": 5.241505614129449e-05, "loss": 2.4321, "step": 26337 }, { "crossentropy": 2.47819447517395, "epoch": 0.9548288863109049, "grad_norm": 0.02726735733449459, "grad_norm_var": 2.238101862109352e-07, "learning_rate": 5.233117089201711e-05, "loss": 2.5073, "step": 26338 }, { "crossentropy": 2.461564302444458, "epoch": 0.9548651392111369, "grad_norm": 0.025655075907707214, "grad_norm_var": 2.150968534776638e-07, "learning_rate": 5.224735246851575e-05, "loss": 2.3829, "step": 26339 }, { "crossentropy": 2.4201207160949707, "epoch": 0.9549013921113689, "grad_norm": 0.025723112747073174, "grad_norm_var": 2.211312984426797e-07, "learning_rate": 5.216360087192173e-05, "loss": 2.3294, "step": 26340 }, { "crossentropy": 2.4553778171539307, "epoch": 0.9549376450116009, "grad_norm": 0.02731439471244812, "grad_norm_var": 3.063845036776428e-07, "learning_rate": 5.207991610336749e-05, "loss": 2.451, "step": 26341 }, { "crossentropy": 2.373997688293457, "epoch": 0.954973897911833, "grad_norm": 0.025198638439178467, "grad_norm_var": 3.574478920940199e-07, "learning_rate": 5.199629816398266e-05, "loss": 2.428, "step": 26342 }, { "crossentropy": 2.2542717456817627, "epoch": 0.955010150812065, "grad_norm": 0.02601408027112484, "grad_norm_var": 3.459055679059236e-07, "learning_rate": 5.1912747054896345e-05, "loss": 2.3382, "step": 26343 }, { "crossentropy": 2.3962554931640625, "epoch": 0.955046403712297, "grad_norm": 0.02585143782198429, "grad_norm_var": 3.42532139573024e-07, "learning_rate": 5.182926277723821e-05, "loss": 2.4135, "step": 26344 }, { "crossentropy": 2.385831117630005, "epoch": 0.955082656612529, "grad_norm": 0.025878990069031715, "grad_norm_var": 3.3758079024248424e-07, "learning_rate": 5.174584533213455e-05, "loss": 2.361, "step": 26345 }, { "crossentropy": 2.442972183227539, "epoch": 0.955118909512761, "grad_norm": 0.026681602001190186, "grad_norm_var": 3.3163739271835126e-07, "learning_rate": 5.166249472071172e-05, "loss": 2.3877, "step": 26346 }, { "crossentropy": 2.3719048500061035, "epoch": 0.955155162412993, "grad_norm": 0.025213245302438736, "grad_norm_var": 3.7931937916137424e-07, "learning_rate": 5.157921094409712e-05, "loss": 2.3188, "step": 26347 }, { "crossentropy": 2.374018669128418, "epoch": 0.955191415313225, "grad_norm": 0.02664385549724102, "grad_norm_var": 3.9727930959593665e-07, "learning_rate": 5.149599400341487e-05, "loss": 2.357, "step": 26348 }, { "crossentropy": 2.3348779678344727, "epoch": 0.9552276682134571, "grad_norm": 0.02657371386885643, "grad_norm_var": 3.950516134017575e-07, "learning_rate": 5.141284389978851e-05, "loss": 2.4422, "step": 26349 }, { "crossentropy": 2.4379923343658447, "epoch": 0.9552639211136891, "grad_norm": 0.026013577356934547, "grad_norm_var": 3.906362395118823e-07, "learning_rate": 5.1329760634341024e-05, "loss": 2.5004, "step": 26350 }, { "crossentropy": 2.649822235107422, "epoch": 0.9553001740139211, "grad_norm": 0.027161160483956337, "grad_norm_var": 4.5092450106907526e-07, "learning_rate": 5.124674420819542e-05, "loss": 2.6428, "step": 26351 }, { "crossentropy": 2.465254306793213, "epoch": 0.9553364269141531, "grad_norm": 0.026987425982952118, "grad_norm_var": 4.873182703377457e-07, "learning_rate": 5.1163794622473e-05, "loss": 2.4747, "step": 26352 }, { "crossentropy": 2.493501901626587, "epoch": 0.9553726798143851, "grad_norm": 0.026108603924512863, "grad_norm_var": 4.815231199309725e-07, "learning_rate": 5.108091187829345e-05, "loss": 2.4619, "step": 26353 }, { "crossentropy": 2.3621468544006348, "epoch": 0.9554089327146171, "grad_norm": 0.02665886841714382, "grad_norm_var": 4.235758139333722e-07, "learning_rate": 5.099809597677585e-05, "loss": 2.2458, "step": 26354 }, { "crossentropy": 2.28509259223938, "epoch": 0.9554451856148491, "grad_norm": 0.025322897359728813, "grad_norm_var": 4.5592974814849474e-07, "learning_rate": 5.091534691903987e-05, "loss": 2.324, "step": 26355 }, { "crossentropy": 2.5121607780456543, "epoch": 0.9554814385150812, "grad_norm": 0.027226893231272697, "grad_norm_var": 4.998220093220396e-07, "learning_rate": 5.0832664706202404e-05, "loss": 2.4769, "step": 26356 }, { "crossentropy": 2.5966384410858154, "epoch": 0.9555176914153132, "grad_norm": 0.027307432144880295, "grad_norm_var": 4.988861989643623e-07, "learning_rate": 5.0750049339380875e-05, "loss": 2.5255, "step": 26357 }, { "crossentropy": 2.526787042617798, "epoch": 0.9555539443155452, "grad_norm": 0.026905642822384834, "grad_norm_var": 4.297287048709205e-07, "learning_rate": 5.066750081968996e-05, "loss": 2.5718, "step": 26358 }, { "crossentropy": 2.4683220386505127, "epoch": 0.9555901972157773, "grad_norm": 0.028986651450395584, "grad_norm_var": 8.253318942790747e-07, "learning_rate": 5.0585019148245426e-05, "loss": 2.5024, "step": 26359 }, { "crossentropy": 2.429457426071167, "epoch": 0.9556264501160093, "grad_norm": 0.024900879710912704, "grad_norm_var": 9.76060138715984e-07, "learning_rate": 5.0502604326161384e-05, "loss": 2.3987, "step": 26360 }, { "crossentropy": 2.3749799728393555, "epoch": 0.9556627030162413, "grad_norm": 0.02694130316376686, "grad_norm_var": 9.535722956580203e-07, "learning_rate": 5.042025635455083e-05, "loss": 2.4175, "step": 26361 }, { "crossentropy": 2.330449104309082, "epoch": 0.9556989559164734, "grad_norm": 0.02561413310468197, "grad_norm_var": 1.0134763143598846e-06, "learning_rate": 5.0337975234526215e-05, "loss": 2.3656, "step": 26362 }, { "crossentropy": 2.507380962371826, "epoch": 0.9557352088167054, "grad_norm": 0.025805005803704262, "grad_norm_var": 9.3104331107389e-07, "learning_rate": 5.025576096719886e-05, "loss": 2.486, "step": 26363 }, { "crossentropy": 2.3754889965057373, "epoch": 0.9557714617169374, "grad_norm": 0.02699689194560051, "grad_norm_var": 9.421975476592639e-07, "learning_rate": 5.017361355367844e-05, "loss": 2.3477, "step": 26364 }, { "crossentropy": 2.392727851867676, "epoch": 0.9558077146171694, "grad_norm": 0.025126507505774498, "grad_norm_var": 1.077097762301734e-06, "learning_rate": 5.0091532995075184e-05, "loss": 2.4158, "step": 26365 }, { "crossentropy": 2.43953537940979, "epoch": 0.9558439675174014, "grad_norm": 0.025616033002734184, "grad_norm_var": 1.1129702412207884e-06, "learning_rate": 5.000951929249709e-05, "loss": 2.3566, "step": 26366 }, { "crossentropy": 2.438498020172119, "epoch": 0.9558802204176334, "grad_norm": 0.026324793696403503, "grad_norm_var": 1.0806343024210666e-06, "learning_rate": 4.99275724470527e-05, "loss": 2.3785, "step": 26367 }, { "crossentropy": 2.5205297470092773, "epoch": 0.9559164733178654, "grad_norm": 0.025760607793927193, "grad_norm_var": 1.0830090539530456e-06, "learning_rate": 4.984569245984838e-05, "loss": 2.4762, "step": 26368 }, { "crossentropy": 2.343581199645996, "epoch": 0.9559527262180975, "grad_norm": 0.02606550045311451, "grad_norm_var": 1.0845136374904492e-06, "learning_rate": 4.976387933198989e-05, "loss": 2.4411, "step": 26369 }, { "crossentropy": 2.373908519744873, "epoch": 0.9559889791183295, "grad_norm": 0.025962285697460175, "grad_norm_var": 1.0859214156665274e-06, "learning_rate": 4.968213306458302e-05, "loss": 2.3477, "step": 26370 }, { "crossentropy": 2.455965042114258, "epoch": 0.9560252320185615, "grad_norm": 0.02646733447909355, "grad_norm_var": 1.0180770590041688e-06, "learning_rate": 4.9600453658731335e-05, "loss": 2.4944, "step": 26371 }, { "crossentropy": 2.2673778533935547, "epoch": 0.9560614849187935, "grad_norm": 0.025515640154480934, "grad_norm_var": 1.0068398644304139e-06, "learning_rate": 4.951884111553728e-05, "loss": 2.3254, "step": 26372 }, { "crossentropy": 2.3754212856292725, "epoch": 0.9560977378190255, "grad_norm": 0.026682404801249504, "grad_norm_var": 9.446779454380284e-07, "learning_rate": 4.943729543610442e-05, "loss": 2.3242, "step": 26373 }, { "crossentropy": 2.3438832759857178, "epoch": 0.9561339907192575, "grad_norm": 0.026486754417419434, "grad_norm_var": 9.178795369175925e-07, "learning_rate": 4.935581662153354e-05, "loss": 2.3864, "step": 26374 }, { "crossentropy": 2.3727920055389404, "epoch": 0.9561702436194895, "grad_norm": 0.02620822936296463, "grad_norm_var": 3.6924466230876343e-07, "learning_rate": 4.927440467292488e-05, "loss": 2.4078, "step": 26375 }, { "crossentropy": 2.330378770828247, "epoch": 0.9562064965197216, "grad_norm": 0.026001116260886192, "grad_norm_var": 2.793144844489044e-07, "learning_rate": 4.919305959137921e-05, "loss": 2.4032, "step": 26376 }, { "crossentropy": 2.3066186904907227, "epoch": 0.9562427494199536, "grad_norm": 0.0253602284938097, "grad_norm_var": 2.5786129132463843e-07, "learning_rate": 4.9111781377993434e-05, "loss": 2.387, "step": 26377 }, { "crossentropy": 2.385969877243042, "epoch": 0.9562790023201856, "grad_norm": 0.027609897777438164, "grad_norm_var": 4.0423229809595045e-07, "learning_rate": 4.903057003386724e-05, "loss": 2.468, "step": 26378 }, { "crossentropy": 2.412379741668701, "epoch": 0.9563152552204176, "grad_norm": 0.026472793892025948, "grad_norm_var": 4.036717560824502e-07, "learning_rate": 4.894942556009585e-05, "loss": 2.3603, "step": 26379 }, { "crossentropy": 2.513735294342041, "epoch": 0.9563515081206496, "grad_norm": 0.02656981535255909, "grad_norm_var": 3.677611028000584e-07, "learning_rate": 4.886834795777617e-05, "loss": 2.4214, "step": 26380 }, { "crossentropy": 2.42775559425354, "epoch": 0.9563877610208816, "grad_norm": 0.02727540209889412, "grad_norm_var": 3.6616530501339346e-07, "learning_rate": 4.8787337228003435e-05, "loss": 2.4677, "step": 26381 }, { "crossentropy": 2.4006593227386475, "epoch": 0.9564240139211136, "grad_norm": 0.025981195271015167, "grad_norm_var": 3.424796853642668e-07, "learning_rate": 4.870639337187066e-05, "loss": 2.4408, "step": 26382 }, { "crossentropy": 2.4536566734313965, "epoch": 0.9564602668213457, "grad_norm": 0.026207784190773964, "grad_norm_var": 3.428939693686054e-07, "learning_rate": 4.862551639047197e-05, "loss": 2.4196, "step": 26383 }, { "crossentropy": 2.539135456085205, "epoch": 0.9564965197215777, "grad_norm": 0.02552619017660618, "grad_norm_var": 3.6284954638005076e-07, "learning_rate": 4.854470628490038e-05, "loss": 2.4204, "step": 26384 }, { "crossentropy": 2.3567380905151367, "epoch": 0.9565327726218097, "grad_norm": 0.025701384991407394, "grad_norm_var": 3.812842012089566e-07, "learning_rate": 4.846396305624612e-05, "loss": 2.4161, "step": 26385 }, { "crossentropy": 2.34372878074646, "epoch": 0.9565690255220418, "grad_norm": 0.027029404416680336, "grad_norm_var": 4.1126582766600147e-07, "learning_rate": 4.8383286705600546e-05, "loss": 2.3492, "step": 26386 }, { "crossentropy": 2.303317070007324, "epoch": 0.9566052784222738, "grad_norm": 0.025862427428364754, "grad_norm_var": 4.2212908799853173e-07, "learning_rate": 4.830267723405279e-05, "loss": 2.3041, "step": 26387 }, { "crossentropy": 2.3792061805725098, "epoch": 0.9566415313225058, "grad_norm": 0.026204511523246765, "grad_norm_var": 3.815207445303596e-07, "learning_rate": 4.822213464269198e-05, "loss": 2.4794, "step": 26388 }, { "crossentropy": 2.4999892711639404, "epoch": 0.9566777842227379, "grad_norm": 0.02667292393743992, "grad_norm_var": 3.810729451699778e-07, "learning_rate": 4.814165893260614e-05, "loss": 2.4272, "step": 26389 }, { "crossentropy": 2.355025291442871, "epoch": 0.9567140371229699, "grad_norm": 0.02525532804429531, "grad_norm_var": 4.4898289471421474e-07, "learning_rate": 4.8061250104881046e-05, "loss": 2.2828, "step": 26390 }, { "crossentropy": 2.4765093326568604, "epoch": 0.9567502900232019, "grad_norm": 0.026375431567430496, "grad_norm_var": 4.498844672849809e-07, "learning_rate": 4.7980908160604184e-05, "loss": 2.4879, "step": 26391 }, { "crossentropy": 2.5979015827178955, "epoch": 0.9567865429234339, "grad_norm": 0.027155809104442596, "grad_norm_var": 4.938803919341446e-07, "learning_rate": 4.7900633100860235e-05, "loss": 2.5737, "step": 26392 }, { "crossentropy": 2.1989190578460693, "epoch": 0.9568227958236659, "grad_norm": 0.026044799014925957, "grad_norm_var": 4.347643419087716e-07, "learning_rate": 4.782042492673278e-05, "loss": 2.3466, "step": 26393 }, { "crossentropy": 2.4385783672332764, "epoch": 0.9568590487238979, "grad_norm": 0.02703372947871685, "grad_norm_var": 3.60380998166004e-07, "learning_rate": 4.7740283639306494e-05, "loss": 2.5145, "step": 26394 }, { "crossentropy": 2.3599164485931396, "epoch": 0.95689530162413, "grad_norm": 0.026488717645406723, "grad_norm_var": 3.606882204391319e-07, "learning_rate": 4.766020923966275e-05, "loss": 2.3946, "step": 26395 }, { "crossentropy": 2.4965171813964844, "epoch": 0.956931554524362, "grad_norm": 0.0262452382594347, "grad_norm_var": 3.571777482322258e-07, "learning_rate": 4.758020172888289e-05, "loss": 2.4859, "step": 26396 }, { "crossentropy": 2.436222791671753, "epoch": 0.956967807424594, "grad_norm": 0.025805363431572914, "grad_norm_var": 3.0424574218260866e-07, "learning_rate": 4.7500261108048285e-05, "loss": 2.4184, "step": 26397 }, { "crossentropy": 2.3875555992126465, "epoch": 0.957004060324826, "grad_norm": 0.025796469300985336, "grad_norm_var": 3.1236838772472076e-07, "learning_rate": 4.742038737823806e-05, "loss": 2.3745, "step": 26398 }, { "crossentropy": 2.5331740379333496, "epoch": 0.957040313225058, "grad_norm": 0.025641245767474174, "grad_norm_var": 3.328109996559095e-07, "learning_rate": 4.734058054053136e-05, "loss": 2.384, "step": 26399 }, { "crossentropy": 2.3587279319763184, "epoch": 0.95707656612529, "grad_norm": 0.028385521844029427, "grad_norm_var": 5.955134367439825e-07, "learning_rate": 4.726084059600622e-05, "loss": 2.3878, "step": 26400 }, { "crossentropy": 2.484337091445923, "epoch": 0.957112819025522, "grad_norm": 0.026549916714429855, "grad_norm_var": 5.664359660090615e-07, "learning_rate": 4.718116754573953e-05, "loss": 2.4903, "step": 26401 }, { "crossentropy": 2.514803409576416, "epoch": 0.957149071925754, "grad_norm": 0.02678878791630268, "grad_norm_var": 5.50156235702183e-07, "learning_rate": 4.710156139080713e-05, "loss": 2.5162, "step": 26402 }, { "crossentropy": 2.4236764907836914, "epoch": 0.9571853248259861, "grad_norm": 0.025872711092233658, "grad_norm_var": 5.49433786492974e-07, "learning_rate": 4.702202213228479e-05, "loss": 2.2883, "step": 26403 }, { "crossentropy": 2.424933910369873, "epoch": 0.9572215777262181, "grad_norm": 0.026645295321941376, "grad_norm_var": 5.503945417071326e-07, "learning_rate": 4.6942549771246125e-05, "loss": 2.3682, "step": 26404 }, { "crossentropy": 2.466383934020996, "epoch": 0.9572578306264501, "grad_norm": 0.02608468197286129, "grad_norm_var": 5.523667217982441e-07, "learning_rate": 4.686314430876526e-05, "loss": 2.4259, "step": 26405 }, { "crossentropy": 2.4353530406951904, "epoch": 0.9572940835266821, "grad_norm": 0.025860067456960678, "grad_norm_var": 4.840903712848936e-07, "learning_rate": 4.678380574591357e-05, "loss": 2.47, "step": 26406 }, { "crossentropy": 2.4304561614990234, "epoch": 0.9573303364269141, "grad_norm": 0.026018807664513588, "grad_norm_var": 4.943182261900181e-07, "learning_rate": 4.6704534083763515e-05, "loss": 2.4441, "step": 26407 }, { "crossentropy": 2.321690320968628, "epoch": 0.9573665893271461, "grad_norm": 0.026121899485588074, "grad_norm_var": 4.570848873794066e-07, "learning_rate": 4.662532932338648e-05, "loss": 2.3944, "step": 26408 }, { "crossentropy": 2.502110242843628, "epoch": 0.9574028422273781, "grad_norm": 0.026056526228785515, "grad_norm_var": 4.5663744393669285e-07, "learning_rate": 4.654619146585048e-05, "loss": 2.482, "step": 26409 }, { "crossentropy": 2.397928237915039, "epoch": 0.9574390951276102, "grad_norm": 0.026138102635741234, "grad_norm_var": 4.2359261420315577e-07, "learning_rate": 4.646712051222579e-05, "loss": 2.4296, "step": 26410 }, { "crossentropy": 2.3765313625335693, "epoch": 0.9574753480278422, "grad_norm": 0.025602657347917557, "grad_norm_var": 4.481462506217831e-07, "learning_rate": 4.6388116463579324e-05, "loss": 2.4845, "step": 26411 }, { "crossentropy": 2.2492775917053223, "epoch": 0.9575116009280742, "grad_norm": 0.025562673807144165, "grad_norm_var": 4.754983924598957e-07, "learning_rate": 4.630917932097967e-05, "loss": 2.3207, "step": 26412 }, { "crossentropy": 2.486611843109131, "epoch": 0.9575478538283063, "grad_norm": 0.026476504281163216, "grad_norm_var": 4.698420357722328e-07, "learning_rate": 4.623030908549153e-05, "loss": 2.466, "step": 26413 }, { "crossentropy": 2.400240182876587, "epoch": 0.9575841067285383, "grad_norm": 0.025234777480363846, "grad_norm_var": 5.216630148351919e-07, "learning_rate": 4.615150575818017e-05, "loss": 2.4324, "step": 26414 }, { "crossentropy": 2.3645358085632324, "epoch": 0.9576203596287703, "grad_norm": 0.025607803836464882, "grad_norm_var": 5.241798153301132e-07, "learning_rate": 4.6072769340110846e-05, "loss": 2.3827, "step": 26415 }, { "crossentropy": 2.439075469970703, "epoch": 0.9576566125290024, "grad_norm": 0.02701493538916111, "grad_norm_var": 2.399862467831101e-07, "learning_rate": 4.599409983234715e-05, "loss": 2.5124, "step": 26416 }, { "crossentropy": 2.3529605865478516, "epoch": 0.9576928654292344, "grad_norm": 0.025630980730056763, "grad_norm_var": 2.3791482021551004e-07, "learning_rate": 4.5915497235950456e-05, "loss": 2.3784, "step": 26417 }, { "crossentropy": 2.362172842025757, "epoch": 0.9577291183294664, "grad_norm": 0.030867230147123337, "grad_norm_var": 1.6820813832271828e-06, "learning_rate": 4.5836961551983245e-05, "loss": 2.3472, "step": 26418 }, { "crossentropy": 2.4187777042388916, "epoch": 0.9577653712296984, "grad_norm": 0.026437358930706978, "grad_norm_var": 1.669859493178399e-06, "learning_rate": 4.575849278150579e-05, "loss": 2.4158, "step": 26419 }, { "crossentropy": 2.3878042697906494, "epoch": 0.9578016241299304, "grad_norm": 0.026223205029964447, "grad_norm_var": 1.6635325522988571e-06, "learning_rate": 4.56800909255789e-05, "loss": 2.4137, "step": 26420 }, { "crossentropy": 2.469055652618408, "epoch": 0.9578378770301624, "grad_norm": 0.026326680555939674, "grad_norm_var": 1.659966478349123e-06, "learning_rate": 4.560175598526006e-05, "loss": 2.4255, "step": 26421 }, { "crossentropy": 2.4487099647521973, "epoch": 0.9578741299303944, "grad_norm": 0.026157408952713013, "grad_norm_var": 1.6471087618698139e-06, "learning_rate": 4.552348796160899e-05, "loss": 2.4476, "step": 26422 }, { "crossentropy": 2.3453047275543213, "epoch": 0.9579103828306265, "grad_norm": 0.025893012061715126, "grad_norm_var": 1.6535244391842224e-06, "learning_rate": 4.544528685568094e-05, "loss": 2.368, "step": 26423 }, { "crossentropy": 2.460519552230835, "epoch": 0.9579466357308585, "grad_norm": 0.02663467638194561, "grad_norm_var": 1.6554236861463967e-06, "learning_rate": 4.536715266853397e-05, "loss": 2.3964, "step": 26424 }, { "crossentropy": 2.5043203830718994, "epoch": 0.9579828886310905, "grad_norm": 0.025950290262699127, "grad_norm_var": 1.6605202538201463e-06, "learning_rate": 4.528908540122168e-05, "loss": 2.4673, "step": 26425 }, { "crossentropy": 2.471113920211792, "epoch": 0.9580191415313225, "grad_norm": 0.026187658309936523, "grad_norm_var": 1.6592082722590666e-06, "learning_rate": 4.521108505479987e-05, "loss": 2.4105, "step": 26426 }, { "crossentropy": 2.391517400741577, "epoch": 0.9580553944315545, "grad_norm": 0.02585606835782528, "grad_norm_var": 1.6375315929554693e-06, "learning_rate": 4.513315163032161e-05, "loss": 2.4689, "step": 26427 }, { "crossentropy": 2.2776951789855957, "epoch": 0.9580916473317865, "grad_norm": 0.02744104154407978, "grad_norm_var": 1.6536428919138352e-06, "learning_rate": 4.505528512883883e-05, "loss": 2.3446, "step": 26428 }, { "crossentropy": 2.4678807258605957, "epoch": 0.9581279002320185, "grad_norm": 0.025934875011444092, "grad_norm_var": 1.6734023549042077e-06, "learning_rate": 4.497748555140457e-05, "loss": 2.4769, "step": 26429 }, { "crossentropy": 2.342252492904663, "epoch": 0.9581641531322506, "grad_norm": 0.02569684013724327, "grad_norm_var": 1.611115949234678e-06, "learning_rate": 4.4899752899068555e-05, "loss": 2.2946, "step": 26430 }, { "crossentropy": 2.3442609310150146, "epoch": 0.9582004060324826, "grad_norm": 0.026248494163155556, "grad_norm_var": 1.5613021332181362e-06, "learning_rate": 4.48220871728805e-05, "loss": 2.4339, "step": 26431 }, { "crossentropy": 2.3419787883758545, "epoch": 0.9582366589327146, "grad_norm": 0.026890672743320465, "grad_norm_var": 1.5542541211977043e-06, "learning_rate": 4.474448837389067e-05, "loss": 2.3186, "step": 26432 }, { "crossentropy": 2.365126371383667, "epoch": 0.9582729118329466, "grad_norm": 0.026680001989006996, "grad_norm_var": 1.498191433688123e-06, "learning_rate": 4.4666956503146004e-05, "loss": 2.4902, "step": 26433 }, { "crossentropy": 2.4081413745880127, "epoch": 0.9583091647331786, "grad_norm": 0.0252988263964653, "grad_norm_var": 2.5981836289836144e-07, "learning_rate": 4.458949156169401e-05, "loss": 2.3101, "step": 26434 }, { "crossentropy": 2.476886749267578, "epoch": 0.9583454176334106, "grad_norm": 0.02647603675723076, "grad_norm_var": 2.6092413480751213e-07, "learning_rate": 4.4512093550581056e-05, "loss": 2.4625, "step": 26435 }, { "crossentropy": 2.287503480911255, "epoch": 0.9583816705336426, "grad_norm": 0.02547071874141693, "grad_norm_var": 2.9834876031409736e-07, "learning_rate": 4.4434762470852984e-05, "loss": 2.337, "step": 26436 }, { "crossentropy": 2.481235980987549, "epoch": 0.9584179234338747, "grad_norm": 0.02712470106780529, "grad_norm_var": 3.520072629682128e-07, "learning_rate": 4.4357498323553404e-05, "loss": 2.3876, "step": 26437 }, { "crossentropy": 2.5156242847442627, "epoch": 0.9584541763341067, "grad_norm": 0.026053929701447487, "grad_norm_var": 3.539034116108985e-07, "learning_rate": 4.428030110972647e-05, "loss": 2.4513, "step": 26438 }, { "crossentropy": 2.5464956760406494, "epoch": 0.9584904292343387, "grad_norm": 0.02641354314982891, "grad_norm_var": 3.4676490258400694e-07, "learning_rate": 4.420317083041414e-05, "loss": 2.4816, "step": 26439 }, { "crossentropy": 2.4911739826202393, "epoch": 0.9585266821345708, "grad_norm": 0.02556026540696621, "grad_norm_var": 3.6701428071083303e-07, "learning_rate": 4.412610748665946e-05, "loss": 2.3855, "step": 26440 }, { "crossentropy": 2.5507352352142334, "epoch": 0.9585629350348028, "grad_norm": 0.02595694176852703, "grad_norm_var": 3.6679093238834296e-07, "learning_rate": 4.40491110795016e-05, "loss": 2.4938, "step": 26441 }, { "crossentropy": 2.299060344696045, "epoch": 0.9585991879350348, "grad_norm": 0.02559150569140911, "grad_norm_var": 3.9043447900421794e-07, "learning_rate": 4.3972181609981955e-05, "loss": 2.3227, "step": 26442 }, { "crossentropy": 2.4810779094696045, "epoch": 0.9586354408352669, "grad_norm": 0.026174280792474747, "grad_norm_var": 3.8351130172592447e-07, "learning_rate": 4.389531907913913e-05, "loss": 2.397, "step": 26443 }, { "crossentropy": 2.295253276824951, "epoch": 0.9586716937354989, "grad_norm": 0.025905074551701546, "grad_norm_var": 2.744034102646614e-07, "learning_rate": 4.381852348801063e-05, "loss": 2.3212, "step": 26444 }, { "crossentropy": 2.568000078201294, "epoch": 0.9587079466357309, "grad_norm": 0.02595321089029312, "grad_norm_var": 2.740395670208177e-07, "learning_rate": 4.3741794837635075e-05, "loss": 2.5075, "step": 26445 }, { "crossentropy": 2.3076908588409424, "epoch": 0.9587441995359629, "grad_norm": 0.027107419446110725, "grad_norm_var": 3.2380648673155377e-07, "learning_rate": 4.3665133129047184e-05, "loss": 2.3906, "step": 26446 }, { "crossentropy": 2.4733426570892334, "epoch": 0.9587804524361949, "grad_norm": 0.027119958773255348, "grad_norm_var": 3.7904476435761854e-07, "learning_rate": 4.358853836328336e-05, "loss": 2.5167, "step": 26447 }, { "crossentropy": 2.343205213546753, "epoch": 0.9588167053364269, "grad_norm": 0.025583995506167412, "grad_norm_var": 3.717099848858507e-07, "learning_rate": 4.351201054137832e-05, "loss": 2.3745, "step": 26448 }, { "crossentropy": 2.4212486743927, "epoch": 0.958852958236659, "grad_norm": 0.025582779198884964, "grad_norm_var": 3.7006003366391965e-07, "learning_rate": 4.343554966436514e-05, "loss": 2.3835, "step": 26449 }, { "crossentropy": 2.3101999759674072, "epoch": 0.958889211136891, "grad_norm": 0.026006659492850304, "grad_norm_var": 3.270991817101345e-07, "learning_rate": 4.3359155733276314e-05, "loss": 2.3773, "step": 26450 }, { "crossentropy": 2.413440227508545, "epoch": 0.958925464037123, "grad_norm": 0.02672613598406315, "grad_norm_var": 3.425455420942465e-07, "learning_rate": 4.328282874914491e-05, "loss": 2.358, "step": 26451 }, { "crossentropy": 2.3455100059509277, "epoch": 0.958961716937355, "grad_norm": 0.026398567482829094, "grad_norm_var": 3.128485466648608e-07, "learning_rate": 4.320656871300011e-05, "loss": 2.4606, "step": 26452 }, { "crossentropy": 2.400510787963867, "epoch": 0.958997969837587, "grad_norm": 0.026621360331773758, "grad_norm_var": 2.6687175682082085e-07, "learning_rate": 4.3130375625873296e-05, "loss": 2.3893, "step": 26453 }, { "crossentropy": 2.5435211658477783, "epoch": 0.959034222737819, "grad_norm": 0.026238664984703064, "grad_norm_var": 2.660908833632443e-07, "learning_rate": 4.305424948879311e-05, "loss": 2.4339, "step": 26454 }, { "crossentropy": 2.4741477966308594, "epoch": 0.959070475638051, "grad_norm": 0.026365652680397034, "grad_norm_var": 2.6476705173629954e-07, "learning_rate": 4.29781903027876e-05, "loss": 2.4363, "step": 26455 }, { "crossentropy": 2.526592969894409, "epoch": 0.959106728538283, "grad_norm": 0.02710030786693096, "grad_norm_var": 2.855844727471524e-07, "learning_rate": 4.290219806888429e-05, "loss": 2.5557, "step": 26456 }, { "crossentropy": 2.512817621231079, "epoch": 0.9591429814385151, "grad_norm": 0.026499584317207336, "grad_norm_var": 2.808289884561789e-07, "learning_rate": 4.282627278810902e-05, "loss": 2.4846, "step": 26457 }, { "crossentropy": 2.4003713130950928, "epoch": 0.9591792343387471, "grad_norm": 0.025748448446393013, "grad_norm_var": 2.673136064228517e-07, "learning_rate": 4.275041446148764e-05, "loss": 2.4622, "step": 26458 }, { "crossentropy": 2.473494052886963, "epoch": 0.9592154872389791, "grad_norm": 0.026523642241954803, "grad_norm_var": 2.6811889272923957e-07, "learning_rate": 4.267462309004544e-05, "loss": 2.477, "step": 26459 }, { "crossentropy": 2.3490383625030518, "epoch": 0.9592517401392111, "grad_norm": 0.02620420791208744, "grad_norm_var": 2.5626132318736356e-07, "learning_rate": 4.2598898674805485e-05, "loss": 2.4098, "step": 26460 }, { "crossentropy": 2.34000563621521, "epoch": 0.9592879930394431, "grad_norm": 0.026658181101083755, "grad_norm_var": 2.489652062015544e-07, "learning_rate": 4.2523241216790295e-05, "loss": 2.3911, "step": 26461 }, { "crossentropy": 2.496326446533203, "epoch": 0.9593242459396751, "grad_norm": 0.02572684735059738, "grad_norm_var": 2.388541877654596e-07, "learning_rate": 4.244765071702183e-05, "loss": 2.4804, "step": 26462 }, { "crossentropy": 2.553469657897949, "epoch": 0.9593604988399071, "grad_norm": 0.02586367353796959, "grad_norm_var": 2.0334102012208901e-07, "learning_rate": 4.2372127176521505e-05, "loss": 2.4686, "step": 26463 }, { "crossentropy": 2.519076347351074, "epoch": 0.9593967517401392, "grad_norm": 0.02629748173058033, "grad_norm_var": 1.7269896497305811e-07, "learning_rate": 4.229667059630849e-05, "loss": 2.4775, "step": 26464 }, { "crossentropy": 2.4430713653564453, "epoch": 0.9594330046403712, "grad_norm": 0.026196803897619247, "grad_norm_var": 1.387610920621501e-07, "learning_rate": 4.2221280977403075e-05, "loss": 2.4363, "step": 26465 }, { "crossentropy": 2.269199848175049, "epoch": 0.9594692575406032, "grad_norm": 0.027179664000868797, "grad_norm_var": 1.7520112554751144e-07, "learning_rate": 4.21459583208228e-05, "loss": 2.3854, "step": 26466 }, { "crossentropy": 2.3460216522216797, "epoch": 0.9595055104408353, "grad_norm": 0.02578786201775074, "grad_norm_var": 1.8902583298178026e-07, "learning_rate": 4.2070702627585165e-05, "loss": 2.336, "step": 26467 }, { "crossentropy": 2.325427770614624, "epoch": 0.9595417633410673, "grad_norm": 0.02601429633796215, "grad_norm_var": 1.9516105408238078e-07, "learning_rate": 4.1995513898706596e-05, "loss": 2.3461, "step": 26468 }, { "crossentropy": 2.487354040145874, "epoch": 0.9595780162412993, "grad_norm": 0.02733505517244339, "grad_norm_var": 2.5622834038198774e-07, "learning_rate": 4.192039213520238e-05, "loss": 2.4399, "step": 26469 }, { "crossentropy": 2.371615409851074, "epoch": 0.9596142691415314, "grad_norm": 0.026285121217370033, "grad_norm_var": 2.556192557752931e-07, "learning_rate": 4.184533733808782e-05, "loss": 2.3687, "step": 26470 }, { "crossentropy": 2.3499038219451904, "epoch": 0.9596505220417634, "grad_norm": 0.027533559128642082, "grad_norm_var": 3.414887152629187e-07, "learning_rate": 4.177034950837599e-05, "loss": 2.4622, "step": 26471 }, { "crossentropy": 2.4553377628326416, "epoch": 0.9596867749419954, "grad_norm": 0.026090946048498154, "grad_norm_var": 3.15581964172434e-07, "learning_rate": 4.169542864708054e-05, "loss": 2.3893, "step": 26472 }, { "crossentropy": 2.3918585777282715, "epoch": 0.9597230278422274, "grad_norm": 0.026123613119125366, "grad_norm_var": 3.1800011494973607e-07, "learning_rate": 4.16205747552123e-05, "loss": 2.3505, "step": 26473 }, { "crossentropy": 2.482654094696045, "epoch": 0.9597592807424594, "grad_norm": 0.026019670069217682, "grad_norm_var": 3.0091300365860973e-07, "learning_rate": 4.154578783378271e-05, "loss": 2.4091, "step": 26474 }, { "crossentropy": 2.3998589515686035, "epoch": 0.9597955336426914, "grad_norm": 0.024919768795371056, "grad_norm_var": 4.2777137206998866e-07, "learning_rate": 4.147106788380206e-05, "loss": 2.4146, "step": 26475 }, { "crossentropy": 2.2688021659851074, "epoch": 0.9598317865429234, "grad_norm": 0.025311505421996117, "grad_norm_var": 4.847904636584923e-07, "learning_rate": 4.139641490627954e-05, "loss": 2.3847, "step": 26476 }, { "crossentropy": 2.508164882659912, "epoch": 0.9598680394431555, "grad_norm": 0.025771157816052437, "grad_norm_var": 4.808419236160353e-07, "learning_rate": 4.13218289022238e-05, "loss": 2.4726, "step": 26477 }, { "crossentropy": 2.337273359298706, "epoch": 0.9599042923433875, "grad_norm": 0.026873543858528137, "grad_norm_var": 4.977820305821727e-07, "learning_rate": 4.1247309872641245e-05, "loss": 2.3792, "step": 26478 }, { "crossentropy": 2.3839406967163086, "epoch": 0.9599405452436195, "grad_norm": 0.02657804638147354, "grad_norm_var": 4.952391654616198e-07, "learning_rate": 4.1172857818538856e-05, "loss": 2.4531, "step": 26479 }, { "crossentropy": 2.501878261566162, "epoch": 0.9599767981438515, "grad_norm": 0.02636042982339859, "grad_norm_var": 4.957184749918367e-07, "learning_rate": 4.1098472740923046e-05, "loss": 2.4567, "step": 26480 }, { "crossentropy": 2.450779914855957, "epoch": 0.9600130510440835, "grad_norm": 0.02689954824745655, "grad_norm_var": 5.193681758510014e-07, "learning_rate": 4.10241546407969e-05, "loss": 2.4707, "step": 26481 }, { "crossentropy": 2.3941564559936523, "epoch": 0.9600493039443155, "grad_norm": 0.024916693568229675, "grad_norm_var": 5.793640797756178e-07, "learning_rate": 4.094990351916572e-05, "loss": 2.4076, "step": 26482 }, { "crossentropy": 2.4057767391204834, "epoch": 0.9600855568445475, "grad_norm": 0.02564193494617939, "grad_norm_var": 5.882528354973655e-07, "learning_rate": 4.087571937703149e-05, "loss": 2.4295, "step": 26483 }, { "crossentropy": 2.514085054397583, "epoch": 0.9601218097447796, "grad_norm": 0.025787418708205223, "grad_norm_var": 5.960947298761302e-07, "learning_rate": 4.0801602215396194e-05, "loss": 2.4241, "step": 26484 }, { "crossentropy": 2.595825672149658, "epoch": 0.9601580626450116, "grad_norm": 0.026411017403006554, "grad_norm_var": 5.038250372338491e-07, "learning_rate": 4.072755203526124e-05, "loss": 2.4732, "step": 26485 }, { "crossentropy": 2.3330516815185547, "epoch": 0.9601943155452436, "grad_norm": 0.026049140840768814, "grad_norm_var": 5.013312908081283e-07, "learning_rate": 4.0653568837626944e-05, "loss": 2.4042, "step": 26486 }, { "crossentropy": 2.2889516353607178, "epoch": 0.9602305684454756, "grad_norm": 0.02587473951280117, "grad_norm_var": 3.5192963626171773e-07, "learning_rate": 4.057965262349195e-05, "loss": 2.3785, "step": 26487 }, { "crossentropy": 2.5137786865234375, "epoch": 0.9602668213457076, "grad_norm": 0.02583370730280876, "grad_norm_var": 3.521511367791825e-07, "learning_rate": 4.0505803393855456e-05, "loss": 2.4628, "step": 26488 }, { "crossentropy": 2.4448039531707764, "epoch": 0.9603030742459396, "grad_norm": 0.026283562183380127, "grad_norm_var": 3.5722351091378957e-07, "learning_rate": 4.043202114971389e-05, "loss": 2.4417, "step": 26489 }, { "crossentropy": 2.4885618686676025, "epoch": 0.9603393271461717, "grad_norm": 0.02574821561574936, "grad_norm_var": 3.6005811109475714e-07, "learning_rate": 4.035830589206424e-05, "loss": 2.4495, "step": 26490 }, { "crossentropy": 2.430304527282715, "epoch": 0.9603755800464037, "grad_norm": 0.02612615004181862, "grad_norm_var": 2.8469677801240486e-07, "learning_rate": 4.0284657621902364e-05, "loss": 2.4309, "step": 26491 }, { "crossentropy": 2.3975484371185303, "epoch": 0.9604118329466357, "grad_norm": 0.02555939555168152, "grad_norm_var": 2.648169213742904e-07, "learning_rate": 4.021107634022247e-05, "loss": 2.4184, "step": 26492 }, { "crossentropy": 2.505889415740967, "epoch": 0.9604480858468677, "grad_norm": 0.025851618498563766, "grad_norm_var": 2.6228729006643784e-07, "learning_rate": 4.0137562048019324e-05, "loss": 2.4572, "step": 26493 }, { "crossentropy": 2.3988454341888428, "epoch": 0.9604843387470998, "grad_norm": 0.026251070201396942, "grad_norm_var": 2.1812803972149664e-07, "learning_rate": 4.006411474628491e-05, "loss": 2.4083, "step": 26494 }, { "crossentropy": 2.426797866821289, "epoch": 0.9605205916473318, "grad_norm": 0.025947237387299538, "grad_norm_var": 1.9528757192951995e-07, "learning_rate": 3.999073443601176e-05, "loss": 2.3905, "step": 26495 }, { "crossentropy": 2.592076539993286, "epoch": 0.9605568445475638, "grad_norm": 0.026151033118367195, "grad_norm_var": 1.8716555874966935e-07, "learning_rate": 3.99174211181913e-05, "loss": 2.469, "step": 26496 }, { "crossentropy": 2.3270375728607178, "epoch": 0.9605930974477959, "grad_norm": 0.026961535215377808, "grad_norm_var": 1.951852214340799e-07, "learning_rate": 3.9844174793812196e-05, "loss": 2.3775, "step": 26497 }, { "crossentropy": 2.3876454830169678, "epoch": 0.9606293503480279, "grad_norm": 0.027066629379987717, "grad_norm_var": 1.8438440228616542e-07, "learning_rate": 3.977099546386531e-05, "loss": 2.3763, "step": 26498 }, { "crossentropy": 2.2625606060028076, "epoch": 0.9606656032482599, "grad_norm": 0.02582640014588833, "grad_norm_var": 1.7533030031574582e-07, "learning_rate": 3.969788312933875e-05, "loss": 2.298, "step": 26499 }, { "crossentropy": 2.5072600841522217, "epoch": 0.9607018561484919, "grad_norm": 0.026070401072502136, "grad_norm_var": 1.6823733781910246e-07, "learning_rate": 3.96248377912195e-05, "loss": 2.4281, "step": 26500 }, { "crossentropy": 2.327812671661377, "epoch": 0.9607381090487239, "grad_norm": 0.027058226987719536, "grad_norm_var": 2.1903518317502255e-07, "learning_rate": 3.955185945049455e-05, "loss": 2.4233, "step": 26501 }, { "crossentropy": 2.257115364074707, "epoch": 0.9607743619489559, "grad_norm": 0.025799954310059547, "grad_norm_var": 2.2680504468339245e-07, "learning_rate": 3.947894810814978e-05, "loss": 2.3268, "step": 26502 }, { "crossentropy": 2.3713836669921875, "epoch": 0.960810614849188, "grad_norm": 0.025790277868509293, "grad_norm_var": 2.3035771727895371e-07, "learning_rate": 3.940610376516884e-05, "loss": 2.3835, "step": 26503 }, { "crossentropy": 2.4161367416381836, "epoch": 0.96084686774942, "grad_norm": 0.026976093649864197, "grad_norm_var": 2.644560416367823e-07, "learning_rate": 3.933332642253762e-05, "loss": 2.5084, "step": 26504 }, { "crossentropy": 2.4439890384674072, "epoch": 0.960883120649652, "grad_norm": 0.026354428380727768, "grad_norm_var": 2.654013318585755e-07, "learning_rate": 3.926061608123699e-05, "loss": 2.4129, "step": 26505 }, { "crossentropy": 2.3591790199279785, "epoch": 0.960919373549884, "grad_norm": 0.025507474318146706, "grad_norm_var": 2.842047866142191e-07, "learning_rate": 3.9187972742250054e-05, "loss": 2.4186, "step": 26506 }, { "crossentropy": 2.529270648956299, "epoch": 0.960955626450116, "grad_norm": 0.02833135612308979, "grad_norm_var": 5.646246752279698e-07, "learning_rate": 3.9115396406558255e-05, "loss": 2.5175, "step": 26507 }, { "crossentropy": 2.571453094482422, "epoch": 0.960991879350348, "grad_norm": 0.026881922036409378, "grad_norm_var": 5.355968197313556e-07, "learning_rate": 3.9042887075141366e-05, "loss": 2.4483, "step": 26508 }, { "crossentropy": 2.5820980072021484, "epoch": 0.96102813225058, "grad_norm": 0.02689988538622856, "grad_norm_var": 5.23910731213189e-07, "learning_rate": 3.897044474897859e-05, "loss": 2.5391, "step": 26509 }, { "crossentropy": 2.5753889083862305, "epoch": 0.961064385150812, "grad_norm": 0.025827176868915558, "grad_norm_var": 5.48765018585602e-07, "learning_rate": 3.889806942904861e-05, "loss": 2.487, "step": 26510 }, { "crossentropy": 2.397163152694702, "epoch": 0.9611006380510441, "grad_norm": 0.025637872517108917, "grad_norm_var": 5.761295545250271e-07, "learning_rate": 3.882576111632896e-05, "loss": 2.4276, "step": 26511 }, { "crossentropy": 2.532076835632324, "epoch": 0.9611368909512761, "grad_norm": 0.027544260025024414, "grad_norm_var": 6.425988349072296e-07, "learning_rate": 3.875351981179609e-05, "loss": 2.4694, "step": 26512 }, { "crossentropy": 2.47812819480896, "epoch": 0.9611731438515081, "grad_norm": 0.026203665882349014, "grad_norm_var": 6.352307713833414e-07, "learning_rate": 3.8681345516426435e-05, "loss": 2.5149, "step": 26513 }, { "crossentropy": 2.4730381965637207, "epoch": 0.9612093967517401, "grad_norm": 0.02626592293381691, "grad_norm_var": 6.133131220368518e-07, "learning_rate": 3.8609238231193665e-05, "loss": 2.3838, "step": 26514 }, { "crossentropy": 2.3349263668060303, "epoch": 0.9612456496519721, "grad_norm": 0.0260790903121233, "grad_norm_var": 5.967666753037137e-07, "learning_rate": 3.853719795707311e-05, "loss": 2.4108, "step": 26515 }, { "crossentropy": 2.425649881362915, "epoch": 0.9612819025522041, "grad_norm": 0.024750765413045883, "grad_norm_var": 7.72705548572255e-07, "learning_rate": 3.846522469503622e-05, "loss": 2.4084, "step": 26516 }, { "crossentropy": 2.471626043319702, "epoch": 0.9613181554524362, "grad_norm": 0.02589256316423416, "grad_norm_var": 7.505503580015315e-07, "learning_rate": 3.8393318446056095e-05, "loss": 2.4153, "step": 26517 }, { "crossentropy": 2.3490140438079834, "epoch": 0.9613544083526682, "grad_norm": 0.026021486148238182, "grad_norm_var": 7.38953255915072e-07, "learning_rate": 3.832147921110363e-05, "loss": 2.3995, "step": 26518 }, { "crossentropy": 2.283832550048828, "epoch": 0.9613906612529002, "grad_norm": 0.025171078741550446, "grad_norm_var": 8.058463106918577e-07, "learning_rate": 3.8249706991149156e-05, "loss": 2.3408, "step": 26519 }, { "crossentropy": 2.3637287616729736, "epoch": 0.9614269141531323, "grad_norm": 0.025152236223220825, "grad_norm_var": 8.424218594162957e-07, "learning_rate": 3.8178001787162465e-05, "loss": 2.3954, "step": 26520 }, { "crossentropy": 2.5882222652435303, "epoch": 0.9614631670533643, "grad_norm": 0.026003606617450714, "grad_norm_var": 8.409059983579177e-07, "learning_rate": 3.810636360011166e-05, "loss": 2.4726, "step": 26521 }, { "crossentropy": 2.504465103149414, "epoch": 0.9614994199535963, "grad_norm": 0.025823593139648438, "grad_norm_var": 8.206747034219247e-07, "learning_rate": 3.803479243096375e-05, "loss": 2.4009, "step": 26522 }, { "crossentropy": 2.5321037769317627, "epoch": 0.9615356728538283, "grad_norm": 0.02661840058863163, "grad_norm_var": 5.070889308689581e-07, "learning_rate": 3.796328828068685e-05, "loss": 2.5227, "step": 26523 }, { "crossentropy": 2.3928909301757812, "epoch": 0.9615719257540604, "grad_norm": 0.026137446984648705, "grad_norm_var": 4.589855210216746e-07, "learning_rate": 3.789185115024518e-05, "loss": 2.3143, "step": 26524 }, { "crossentropy": 2.415006160736084, "epoch": 0.9616081786542924, "grad_norm": 0.025506552308797836, "grad_norm_var": 4.134802246619218e-07, "learning_rate": 3.7820481040605204e-05, "loss": 2.3493, "step": 26525 }, { "crossentropy": 2.3740575313568115, "epoch": 0.9616444315545244, "grad_norm": 0.026599552482366562, "grad_norm_var": 4.417487159924863e-07, "learning_rate": 3.774917795272892e-05, "loss": 2.3909, "step": 26526 }, { "crossentropy": 2.3756165504455566, "epoch": 0.9616806844547564, "grad_norm": 0.026681169867515564, "grad_norm_var": 4.645499528036293e-07, "learning_rate": 3.767794188758111e-05, "loss": 2.4438, "step": 26527 }, { "crossentropy": 2.466521739959717, "epoch": 0.9617169373549884, "grad_norm": 0.025680063292384148, "grad_norm_var": 3.0492353915640737e-07, "learning_rate": 3.760677284612324e-05, "loss": 2.3879, "step": 26528 }, { "crossentropy": 2.491183280944824, "epoch": 0.9617531902552204, "grad_norm": 0.027066931128501892, "grad_norm_var": 3.8510612849865176e-07, "learning_rate": 3.7535670829316194e-05, "loss": 2.5172, "step": 26529 }, { "crossentropy": 2.323556661605835, "epoch": 0.9617894431554525, "grad_norm": 0.02509651705622673, "grad_norm_var": 4.2375729733070725e-07, "learning_rate": 3.746463583812143e-05, "loss": 2.3234, "step": 26530 }, { "crossentropy": 2.2200889587402344, "epoch": 0.9618256960556845, "grad_norm": 0.026415741071105003, "grad_norm_var": 4.392131362768951e-07, "learning_rate": 3.739366787349763e-05, "loss": 2.3949, "step": 26531 }, { "crossentropy": 2.388296604156494, "epoch": 0.9618619489559165, "grad_norm": 0.025963032618165016, "grad_norm_var": 3.431060714186492e-07, "learning_rate": 3.732276693640291e-05, "loss": 2.3587, "step": 26532 }, { "crossentropy": 2.417522430419922, "epoch": 0.9618982018561485, "grad_norm": 0.025217220187187195, "grad_norm_var": 3.8032891163687167e-07, "learning_rate": 3.725193302779539e-05, "loss": 2.3683, "step": 26533 }, { "crossentropy": 2.410670757293701, "epoch": 0.9619344547563805, "grad_norm": 0.025455402210354805, "grad_norm_var": 3.9474744512179906e-07, "learning_rate": 3.718116614863209e-05, "loss": 2.4624, "step": 26534 }, { "crossentropy": 2.3628616333007812, "epoch": 0.9619707076566125, "grad_norm": 0.026613853871822357, "grad_norm_var": 3.823579602462689e-07, "learning_rate": 3.711046629986781e-05, "loss": 2.4299, "step": 26535 }, { "crossentropy": 2.438673734664917, "epoch": 0.9620069605568445, "grad_norm": 0.025373363867402077, "grad_norm_var": 3.603611353643661e-07, "learning_rate": 3.7039833482458983e-05, "loss": 2.4939, "step": 26536 }, { "crossentropy": 2.3288135528564453, "epoch": 0.9620432134570766, "grad_norm": 0.02703327313065529, "grad_norm_var": 4.2495346662615024e-07, "learning_rate": 3.69692676973582e-05, "loss": 2.3623, "step": 26537 }, { "crossentropy": 2.4525644779205322, "epoch": 0.9620794663573086, "grad_norm": 0.026966650038957596, "grad_norm_var": 4.675161730511064e-07, "learning_rate": 3.6898768945519134e-05, "loss": 2.4502, "step": 26538 }, { "crossentropy": 2.2995738983154297, "epoch": 0.9621157192575406, "grad_norm": 0.026773031800985336, "grad_norm_var": 4.786354107708113e-07, "learning_rate": 3.682833722789436e-05, "loss": 2.3775, "step": 26539 }, { "crossentropy": 2.3142969608306885, "epoch": 0.9621519721577726, "grad_norm": 0.026090415194630623, "grad_norm_var": 4.789228489213496e-07, "learning_rate": 3.6757972545434784e-05, "loss": 2.442, "step": 26540 }, { "crossentropy": 2.4742543697357178, "epoch": 0.9621882250580046, "grad_norm": 0.0269416905939579, "grad_norm_var": 4.82936499594279e-07, "learning_rate": 3.6687674899090196e-05, "loss": 2.4866, "step": 26541 }, { "crossentropy": 2.3685271739959717, "epoch": 0.9622244779582366, "grad_norm": 0.02526644431054592, "grad_norm_var": 5.315214045428177e-07, "learning_rate": 3.6617444289810954e-05, "loss": 2.4609, "step": 26542 }, { "crossentropy": 2.4365406036376953, "epoch": 0.9622607308584686, "grad_norm": 0.02739422582089901, "grad_norm_var": 6.124047513451798e-07, "learning_rate": 3.6547280718545185e-05, "loss": 2.4809, "step": 26543 }, { "crossentropy": 2.3187499046325684, "epoch": 0.9622969837587007, "grad_norm": 0.0253728199750185, "grad_norm_var": 6.399828305939566e-07, "learning_rate": 3.647718418624102e-05, "loss": 2.3873, "step": 26544 }, { "crossentropy": 2.3476312160491943, "epoch": 0.9623332366589327, "grad_norm": 0.0265604630112648, "grad_norm_var": 5.967989350449028e-07, "learning_rate": 3.6407154693844925e-05, "loss": 2.4113, "step": 26545 }, { "crossentropy": 2.4974796772003174, "epoch": 0.9623694895591647, "grad_norm": 0.025991540402173996, "grad_norm_var": 5.201461764155664e-07, "learning_rate": 3.633719224230281e-05, "loss": 2.483, "step": 26546 }, { "crossentropy": 2.4495701789855957, "epoch": 0.9624057424593968, "grad_norm": 0.025823721662163734, "grad_norm_var": 5.26152492231977e-07, "learning_rate": 3.626729683255947e-05, "loss": 2.4066, "step": 26547 }, { "crossentropy": 2.3370285034179688, "epoch": 0.9624419953596288, "grad_norm": 0.025918763130903244, "grad_norm_var": 5.275398418733233e-07, "learning_rate": 3.61974684655586e-05, "loss": 2.3877, "step": 26548 }, { "crossentropy": 2.421280860900879, "epoch": 0.9624782482598608, "grad_norm": 0.026928909122943878, "grad_norm_var": 4.921693705359538e-07, "learning_rate": 3.612770714224389e-05, "loss": 2.4721, "step": 26549 }, { "crossentropy": 2.4198334217071533, "epoch": 0.9625145011600929, "grad_norm": 0.026324329897761345, "grad_norm_var": 4.436457321782174e-07, "learning_rate": 3.6058012863557366e-05, "loss": 2.4444, "step": 26550 }, { "crossentropy": 2.5817747116088867, "epoch": 0.9625507540603249, "grad_norm": 0.027995062991976738, "grad_norm_var": 6.140781268519173e-07, "learning_rate": 3.598838563044049e-05, "loss": 2.4849, "step": 26551 }, { "crossentropy": 2.570909261703491, "epoch": 0.9625870069605569, "grad_norm": 0.025730807334184647, "grad_norm_var": 5.7207834950206e-07, "learning_rate": 3.591882544383418e-05, "loss": 2.4517, "step": 26552 }, { "crossentropy": 2.588887929916382, "epoch": 0.9626232598607889, "grad_norm": 0.02642420306801796, "grad_norm_var": 5.474506207980006e-07, "learning_rate": 3.5849332304676575e-05, "loss": 2.5654, "step": 26553 }, { "crossentropy": 2.520829677581787, "epoch": 0.9626595127610209, "grad_norm": 0.027001043781638145, "grad_norm_var": 5.500935722240728e-07, "learning_rate": 3.5779906213907474e-05, "loss": 2.4867, "step": 26554 }, { "crossentropy": 2.5445444583892822, "epoch": 0.9626957656612529, "grad_norm": 0.025727178901433945, "grad_norm_var": 5.676365347132354e-07, "learning_rate": 3.571054717246447e-05, "loss": 2.5349, "step": 26555 }, { "crossentropy": 2.249720573425293, "epoch": 0.9627320185614849, "grad_norm": 0.02510477975010872, "grad_norm_var": 6.615777932405016e-07, "learning_rate": 3.564125518128403e-05, "loss": 2.3304, "step": 26556 }, { "crossentropy": 2.489595890045166, "epoch": 0.962768271461717, "grad_norm": 0.02664121799170971, "grad_norm_var": 6.407762722183963e-07, "learning_rate": 3.557203024130207e-05, "loss": 2.5323, "step": 26557 }, { "crossentropy": 2.3440396785736084, "epoch": 0.962804524361949, "grad_norm": 0.02553006075322628, "grad_norm_var": 6.100972960922102e-07, "learning_rate": 3.5502872353453397e-05, "loss": 2.3041, "step": 26558 }, { "crossentropy": 2.355494499206543, "epoch": 0.962840777262181, "grad_norm": 0.0265192873775959, "grad_norm_var": 5.27878986329129e-07, "learning_rate": 3.543378151867227e-05, "loss": 2.3867, "step": 26559 }, { "crossentropy": 2.451373815536499, "epoch": 0.962877030162413, "grad_norm": 0.027097826823592186, "grad_norm_var": 5.179383729175437e-07, "learning_rate": 3.5364757737892386e-05, "loss": 2.4569, "step": 26560 }, { "crossentropy": 2.4237635135650635, "epoch": 0.962913283062645, "grad_norm": 0.026063797995448112, "grad_norm_var": 5.182561368731002e-07, "learning_rate": 3.529580101204466e-05, "loss": 2.4304, "step": 26561 }, { "crossentropy": 2.3525185585021973, "epoch": 0.962949535962877, "grad_norm": 0.02658427320420742, "grad_norm_var": 5.157252244398681e-07, "learning_rate": 3.522691134206224e-05, "loss": 2.4232, "step": 26562 }, { "crossentropy": 2.511895179748535, "epoch": 0.962985788863109, "grad_norm": 0.02600099891424179, "grad_norm_var": 5.055227150068767e-07, "learning_rate": 3.5158088728874384e-05, "loss": 2.5058, "step": 26563 }, { "crossentropy": 2.3251373767852783, "epoch": 0.963022041763341, "grad_norm": 0.02576286531984806, "grad_norm_var": 5.159958838694591e-07, "learning_rate": 3.50893331734109e-05, "loss": 2.3955, "step": 26564 }, { "crossentropy": 2.4429731369018555, "epoch": 0.9630582946635731, "grad_norm": 0.026266705244779587, "grad_norm_var": 4.913874294764821e-07, "learning_rate": 3.502064467660104e-05, "loss": 2.4508, "step": 26565 }, { "crossentropy": 2.4321930408477783, "epoch": 0.9630945475638051, "grad_norm": 0.026230910792946815, "grad_norm_var": 4.916099259699337e-07, "learning_rate": 3.49520232393713e-05, "loss": 2.4299, "step": 26566 }, { "crossentropy": 2.3990256786346436, "epoch": 0.9631308004640371, "grad_norm": 0.0266320388764143, "grad_norm_var": 2.98318263505431e-07, "learning_rate": 3.488346886264926e-05, "loss": 2.4415, "step": 26567 }, { "crossentropy": 2.4474940299987793, "epoch": 0.9631670533642691, "grad_norm": 0.026243772357702255, "grad_norm_var": 2.821690939069776e-07, "learning_rate": 3.481498154736085e-05, "loss": 2.4509, "step": 26568 }, { "crossentropy": 2.4134087562561035, "epoch": 0.9632033062645011, "grad_norm": 0.02627551555633545, "grad_norm_var": 2.798878166073894e-07, "learning_rate": 3.4746561294431434e-05, "loss": 2.3943, "step": 26569 }, { "crossentropy": 2.3893582820892334, "epoch": 0.9632395591647331, "grad_norm": 0.02626054175198078, "grad_norm_var": 2.380453690346464e-07, "learning_rate": 3.467820810478472e-05, "loss": 2.4319, "step": 26570 }, { "crossentropy": 2.392784595489502, "epoch": 0.9632758120649652, "grad_norm": 0.02507018856704235, "grad_norm_var": 3.050273770124438e-07, "learning_rate": 3.4609921979343315e-05, "loss": 2.3757, "step": 26571 }, { "crossentropy": 2.3134939670562744, "epoch": 0.9633120649651972, "grad_norm": 0.02739769034087658, "grad_norm_var": 3.162726542512191e-07, "learning_rate": 3.454170291903092e-05, "loss": 2.4132, "step": 26572 }, { "crossentropy": 2.425912380218506, "epoch": 0.9633483178654292, "grad_norm": 0.02525908872485161, "grad_norm_var": 3.7022376498616985e-07, "learning_rate": 3.4473550924767896e-05, "loss": 2.3955, "step": 26573 }, { "crossentropy": 2.4441158771514893, "epoch": 0.9633845707656613, "grad_norm": 0.025878341868519783, "grad_norm_var": 3.4670758555319784e-07, "learning_rate": 3.440546599747463e-05, "loss": 2.3613, "step": 26574 }, { "crossentropy": 2.4210667610168457, "epoch": 0.9634208236658933, "grad_norm": 0.025614222511649132, "grad_norm_var": 3.6196722182171326e-07, "learning_rate": 3.433744813807149e-05, "loss": 2.4125, "step": 26575 }, { "crossentropy": 2.5796358585357666, "epoch": 0.9634570765661253, "grad_norm": 0.025922056287527084, "grad_norm_var": 3.02118823059377e-07, "learning_rate": 3.426949734747664e-05, "loss": 2.4711, "step": 26576 }, { "crossentropy": 2.383866786956787, "epoch": 0.9634933294663574, "grad_norm": 0.026369212195277214, "grad_norm_var": 3.0682313353387843e-07, "learning_rate": 3.420161362660823e-05, "loss": 2.4111, "step": 26577 }, { "crossentropy": 2.4852569103240967, "epoch": 0.9635295823665894, "grad_norm": 0.026771217584609985, "grad_norm_var": 3.208159696452591e-07, "learning_rate": 3.413379697638275e-05, "loss": 2.4492, "step": 26578 }, { "crossentropy": 2.4601237773895264, "epoch": 0.9635658352668214, "grad_norm": 0.02539575658738613, "grad_norm_var": 3.5349250716940755e-07, "learning_rate": 3.4066047397716125e-05, "loss": 2.4403, "step": 26579 }, { "crossentropy": 2.4203619956970215, "epoch": 0.9636020881670534, "grad_norm": 0.02594272419810295, "grad_norm_var": 3.4780396374702907e-07, "learning_rate": 3.3998364891524306e-05, "loss": 2.4688, "step": 26580 }, { "crossentropy": 2.330660343170166, "epoch": 0.9636383410672854, "grad_norm": 0.025151705369353294, "grad_norm_var": 4.0007142818807017e-07, "learning_rate": 3.393074945871988e-05, "loss": 2.3843, "step": 26581 }, { "crossentropy": 2.564760684967041, "epoch": 0.9636745939675174, "grad_norm": 0.026511551812291145, "grad_norm_var": 4.126637831526151e-07, "learning_rate": 3.386320110021768e-05, "loss": 2.4882, "step": 26582 }, { "crossentropy": 2.3866281509399414, "epoch": 0.9637108468677494, "grad_norm": 0.02668284811079502, "grad_norm_var": 4.16812385377272e-07, "learning_rate": 3.379571981692864e-05, "loss": 2.3977, "step": 26583 }, { "crossentropy": 2.331550121307373, "epoch": 0.9637470997679815, "grad_norm": 0.026545844972133636, "grad_norm_var": 4.304546605505185e-07, "learning_rate": 3.3728305609765365e-05, "loss": 2.345, "step": 26584 }, { "crossentropy": 2.503591775894165, "epoch": 0.9637833526682135, "grad_norm": 0.027326548472046852, "grad_norm_var": 5.289232133910086e-07, "learning_rate": 3.3660958479637685e-05, "loss": 2.4877, "step": 26585 }, { "crossentropy": 2.5366413593292236, "epoch": 0.9638196055684455, "grad_norm": 0.025673890486359596, "grad_norm_var": 5.403177204632657e-07, "learning_rate": 3.3593678427455424e-05, "loss": 2.387, "step": 26586 }, { "crossentropy": 2.3953394889831543, "epoch": 0.9638558584686775, "grad_norm": 0.026663905009627342, "grad_norm_var": 4.813900855879834e-07, "learning_rate": 3.35264654541273e-05, "loss": 2.4992, "step": 26587 }, { "crossentropy": 2.4735045433044434, "epoch": 0.9638921113689095, "grad_norm": 0.026180200278759003, "grad_norm_var": 3.786616683389953e-07, "learning_rate": 3.345931956056148e-05, "loss": 2.4239, "step": 26588 }, { "crossentropy": 2.4655325412750244, "epoch": 0.9639283642691415, "grad_norm": 0.025578822940587997, "grad_norm_var": 3.4843162887426604e-07, "learning_rate": 3.339224074766445e-05, "loss": 2.3763, "step": 26589 }, { "crossentropy": 2.4037983417510986, "epoch": 0.9639646171693735, "grad_norm": 0.025687528774142265, "grad_norm_var": 3.5731473689408953e-07, "learning_rate": 3.332522901634216e-05, "loss": 2.4211, "step": 26590 }, { "crossentropy": 2.4700067043304443, "epoch": 0.9640008700696056, "grad_norm": 0.026574736461043358, "grad_norm_var": 3.4941754501402465e-07, "learning_rate": 3.325828436749945e-05, "loss": 2.3956, "step": 26591 }, { "crossentropy": 2.346907377243042, "epoch": 0.9640371229698376, "grad_norm": 0.02613559179008007, "grad_norm_var": 3.447479992760843e-07, "learning_rate": 3.319140680204169e-05, "loss": 2.389, "step": 26592 }, { "crossentropy": 2.4356939792633057, "epoch": 0.9640733758700696, "grad_norm": 0.025965873152017593, "grad_norm_var": 3.457890601766637e-07, "learning_rate": 3.312459632087095e-05, "loss": 2.4605, "step": 26593 }, { "crossentropy": 2.4367482662200928, "epoch": 0.9641096287703016, "grad_norm": 0.0250615943223238, "grad_norm_var": 3.9239679691219627e-07, "learning_rate": 3.305785292489039e-05, "loss": 2.4435, "step": 26594 }, { "crossentropy": 2.3719685077667236, "epoch": 0.9641458816705336, "grad_norm": 0.026317674666643143, "grad_norm_var": 3.629520261093829e-07, "learning_rate": 3.29911766150004e-05, "loss": 2.3697, "step": 26595 }, { "crossentropy": 2.388373374938965, "epoch": 0.9641821345707656, "grad_norm": 0.025630779564380646, "grad_norm_var": 3.7661790042621777e-07, "learning_rate": 3.292456739210248e-05, "loss": 2.4672, "step": 26596 }, { "crossentropy": 2.258831739425659, "epoch": 0.9642183874709976, "grad_norm": 0.02607039548456669, "grad_norm_var": 3.125267522631018e-07, "learning_rate": 3.285802525709702e-05, "loss": 2.3713, "step": 26597 }, { "crossentropy": 2.46826434135437, "epoch": 0.9642546403712297, "grad_norm": 0.027044912800192833, "grad_norm_var": 3.5509451542185127e-07, "learning_rate": 3.279155021088109e-05, "loss": 2.498, "step": 26598 }, { "crossentropy": 2.4959211349487305, "epoch": 0.9642908932714617, "grad_norm": 0.02729952149093151, "grad_norm_var": 4.188661117614234e-07, "learning_rate": 3.272514225435286e-05, "loss": 2.5386, "step": 26599 }, { "crossentropy": 2.406482458114624, "epoch": 0.9643271461716937, "grad_norm": 0.027137398719787598, "grad_norm_var": 4.652653792389105e-07, "learning_rate": 3.2658801388410484e-05, "loss": 2.4613, "step": 26600 }, { "crossentropy": 2.5540027618408203, "epoch": 0.9643633990719258, "grad_norm": 0.026195799931883812, "grad_norm_var": 3.861621021991034e-07, "learning_rate": 3.259252761394882e-05, "loss": 2.4818, "step": 26601 }, { "crossentropy": 2.419370412826538, "epoch": 0.9643996519721578, "grad_norm": 0.026606854051351547, "grad_norm_var": 3.749731373831374e-07, "learning_rate": 3.252632093186381e-05, "loss": 2.3658, "step": 26602 }, { "crossentropy": 2.4774770736694336, "epoch": 0.9644359048723898, "grad_norm": 0.026066666468977928, "grad_norm_var": 3.65060960137474e-07, "learning_rate": 3.246018134304862e-05, "loss": 2.4402, "step": 26603 }, { "crossentropy": 2.457484483718872, "epoch": 0.9644721577726219, "grad_norm": 0.026718396693468094, "grad_norm_var": 3.801543497176553e-07, "learning_rate": 3.2394108848397555e-05, "loss": 2.4686, "step": 26604 }, { "crossentropy": 2.323350429534912, "epoch": 0.9645084106728539, "grad_norm": 0.026057200506329536, "grad_norm_var": 3.5127809048194715e-07, "learning_rate": 3.232810344880266e-05, "loss": 2.3195, "step": 26605 }, { "crossentropy": 2.3105576038360596, "epoch": 0.9645446635730859, "grad_norm": 0.02650832012295723, "grad_norm_var": 3.279229557163193e-07, "learning_rate": 3.2262165145155455e-05, "loss": 2.427, "step": 26606 }, { "crossentropy": 2.43789005279541, "epoch": 0.9645809164733179, "grad_norm": 0.02589711919426918, "grad_norm_var": 3.3513993169202034e-07, "learning_rate": 3.2196293938346334e-05, "loss": 2.4045, "step": 26607 }, { "crossentropy": 2.605989456176758, "epoch": 0.9646171693735499, "grad_norm": 0.02560761757194996, "grad_norm_var": 3.6375805682215915e-07, "learning_rate": 3.213048982926514e-05, "loss": 2.4586, "step": 26608 }, { "crossentropy": 2.4214894771575928, "epoch": 0.9646534222737819, "grad_norm": 0.02619982324540615, "grad_norm_var": 3.579531143536498e-07, "learning_rate": 3.2064752818800614e-05, "loss": 2.3553, "step": 26609 }, { "crossentropy": 2.4455878734588623, "epoch": 0.9646896751740139, "grad_norm": 0.026126207783818245, "grad_norm_var": 2.563715595920903e-07, "learning_rate": 3.199908290784093e-05, "loss": 2.4719, "step": 26610 }, { "crossentropy": 2.355719804763794, "epoch": 0.964725928074246, "grad_norm": 0.025343310087919235, "grad_norm_var": 3.189714630098672e-07, "learning_rate": 3.193348009727259e-05, "loss": 2.3562, "step": 26611 }, { "crossentropy": 2.3732810020446777, "epoch": 0.964762180974478, "grad_norm": 0.026032032445073128, "grad_norm_var": 2.941992668244996e-07, "learning_rate": 3.186794438798213e-05, "loss": 2.4205, "step": 26612 }, { "crossentropy": 2.4112703800201416, "epoch": 0.96479843387471, "grad_norm": 0.025421030819416046, "grad_norm_var": 3.41037311506762e-07, "learning_rate": 3.180247578085493e-05, "loss": 2.4525, "step": 26613 }, { "crossentropy": 2.4460818767547607, "epoch": 0.964834686774942, "grad_norm": 0.025529803708195686, "grad_norm_var": 3.272362499654914e-07, "learning_rate": 3.173707427677419e-05, "loss": 2.3699, "step": 26614 }, { "crossentropy": 2.2696738243103027, "epoch": 0.964870939675174, "grad_norm": 0.02584741823375225, "grad_norm_var": 2.4066104804483915e-07, "learning_rate": 3.1671739876624196e-05, "loss": 2.3024, "step": 26615 }, { "crossentropy": 2.503331184387207, "epoch": 0.964907192575406, "grad_norm": 0.026491718366742134, "grad_norm_var": 1.7576599025427112e-07, "learning_rate": 3.160647258128702e-05, "loss": 2.5218, "step": 26616 }, { "crossentropy": 2.307774066925049, "epoch": 0.964943445475638, "grad_norm": 0.026030583307147026, "grad_norm_var": 1.7405275582147808e-07, "learning_rate": 3.154127239164417e-05, "loss": 2.3864, "step": 26617 }, { "crossentropy": 2.4220809936523438, "epoch": 0.96497969837587, "grad_norm": 0.026433657854795456, "grad_norm_var": 1.626122944219562e-07, "learning_rate": 3.147613930857607e-05, "loss": 2.4285, "step": 26618 }, { "crossentropy": 2.5341618061065674, "epoch": 0.9650159512761021, "grad_norm": 0.025865904986858368, "grad_norm_var": 1.638669767642231e-07, "learning_rate": 3.1411073332963116e-05, "loss": 2.4399, "step": 26619 }, { "crossentropy": 2.2963593006134033, "epoch": 0.9650522041763341, "grad_norm": 0.026024751365184784, "grad_norm_var": 1.2813347810776825e-07, "learning_rate": 3.134607446568349e-05, "loss": 2.4228, "step": 26620 }, { "crossentropy": 2.404287099838257, "epoch": 0.9650884570765661, "grad_norm": 0.02571687661111355, "grad_norm_var": 1.3112186425279167e-07, "learning_rate": 3.128114270761539e-05, "loss": 2.4382, "step": 26621 }, { "crossentropy": 2.18300461769104, "epoch": 0.9651247099767981, "grad_norm": 0.0261788759380579, "grad_norm_var": 1.1304055386139137e-07, "learning_rate": 3.121627805963589e-05, "loss": 2.2653, "step": 26622 }, { "crossentropy": 2.223503828048706, "epoch": 0.9651609628770301, "grad_norm": 0.02638595923781395, "grad_norm_var": 1.263756047691078e-07, "learning_rate": 3.11514805226204e-05, "loss": 2.2696, "step": 26623 }, { "crossentropy": 2.5002331733703613, "epoch": 0.9651972157772621, "grad_norm": 0.02590305171906948, "grad_norm_var": 1.1825625135467727e-07, "learning_rate": 3.108675009744544e-05, "loss": 2.5025, "step": 26624 }, { "crossentropy": 2.3610799312591553, "epoch": 0.9652334686774942, "grad_norm": 0.02636285498738289, "grad_norm_var": 1.2489830619869837e-07, "learning_rate": 3.1022086784984197e-05, "loss": 2.3955, "step": 26625 }, { "crossentropy": 2.3662819862365723, "epoch": 0.9652697215777262, "grad_norm": 0.025125226005911827, "grad_norm_var": 1.681246719779783e-07, "learning_rate": 3.095749058611041e-05, "loss": 2.3342, "step": 26626 }, { "crossentropy": 2.416375160217285, "epoch": 0.9653059744779582, "grad_norm": 0.02597588300704956, "grad_norm_var": 1.4463619904396483e-07, "learning_rate": 3.089296150169618e-05, "loss": 2.403, "step": 26627 }, { "crossentropy": 2.4688467979431152, "epoch": 0.9653422273781903, "grad_norm": 0.026502760127186775, "grad_norm_var": 1.6314108535474212e-07, "learning_rate": 3.0828499532613574e-05, "loss": 2.5006, "step": 26628 }, { "crossentropy": 2.5512962341308594, "epoch": 0.9653784802784223, "grad_norm": 0.02587188594043255, "grad_norm_var": 1.4180643408343018e-07, "learning_rate": 3.076410467973301e-05, "loss": 2.4356, "step": 26629 }, { "crossentropy": 2.439157009124756, "epoch": 0.9654147331786543, "grad_norm": 0.026812320575118065, "grad_norm_var": 1.6156279882128856e-07, "learning_rate": 3.06997769439249e-05, "loss": 2.3437, "step": 26630 }, { "crossentropy": 2.485464334487915, "epoch": 0.9654509860788864, "grad_norm": 0.025865450501441956, "grad_norm_var": 1.6098639820867564e-07, "learning_rate": 3.0635516326056875e-05, "loss": 2.4413, "step": 26631 }, { "crossentropy": 2.3064239025115967, "epoch": 0.9654872389791184, "grad_norm": 0.02568945102393627, "grad_norm_var": 1.5896248986555818e-07, "learning_rate": 3.057132282699771e-05, "loss": 2.3683, "step": 26632 }, { "crossentropy": 2.4944283962249756, "epoch": 0.9655234918793504, "grad_norm": 0.026170792058110237, "grad_norm_var": 1.598918464636023e-07, "learning_rate": 3.0507196447613904e-05, "loss": 2.4284, "step": 26633 }, { "crossentropy": 2.3889989852905273, "epoch": 0.9655597447795824, "grad_norm": 0.026059357449412346, "grad_norm_var": 1.4976835956738894e-07, "learning_rate": 3.0443137188772007e-05, "loss": 2.4008, "step": 26634 }, { "crossentropy": 2.554992198944092, "epoch": 0.9655959976798144, "grad_norm": 0.026367003098130226, "grad_norm_var": 1.543672463216594e-07, "learning_rate": 3.0379145051337432e-05, "loss": 2.5257, "step": 26635 }, { "crossentropy": 2.333695888519287, "epoch": 0.9656322505800464, "grad_norm": 0.026343833655118942, "grad_norm_var": 1.5909136431532742e-07, "learning_rate": 3.0315220036173374e-05, "loss": 2.3755, "step": 26636 }, { "crossentropy": 2.5654311180114746, "epoch": 0.9656685034802784, "grad_norm": 0.027196137234568596, "grad_norm_var": 2.2359820368923697e-07, "learning_rate": 3.02513621441447e-05, "loss": 2.4908, "step": 26637 }, { "crossentropy": 2.3504929542541504, "epoch": 0.9657047563805105, "grad_norm": 0.026168538257479668, "grad_norm_var": 2.2360047454635043e-07, "learning_rate": 3.0187571376113497e-05, "loss": 2.3698, "step": 26638 }, { "crossentropy": 2.369020462036133, "epoch": 0.9657410092807425, "grad_norm": 0.02656734175980091, "grad_norm_var": 2.3075784676302817e-07, "learning_rate": 3.0123847732940746e-05, "loss": 2.4764, "step": 26639 }, { "crossentropy": 2.4818358421325684, "epoch": 0.9657772621809745, "grad_norm": 0.02668590098619461, "grad_norm_var": 2.394886358973811e-07, "learning_rate": 3.006019121548742e-05, "loss": 2.5125, "step": 26640 }, { "crossentropy": 2.5036280155181885, "epoch": 0.9658135150812065, "grad_norm": 0.02582593634724617, "grad_norm_var": 2.4837440077683273e-07, "learning_rate": 2.999660182461339e-05, "loss": 2.4602, "step": 26641 }, { "crossentropy": 2.465486764907837, "epoch": 0.9658497679814385, "grad_norm": 0.028586870059370995, "grad_norm_var": 5.00443487868327e-07, "learning_rate": 2.9933079561177412e-05, "loss": 2.3664, "step": 26642 }, { "crossentropy": 2.3796186447143555, "epoch": 0.9658860208816705, "grad_norm": 0.026658039540052414, "grad_norm_var": 4.89306373093939e-07, "learning_rate": 2.986962442603769e-05, "loss": 2.3354, "step": 26643 }, { "crossentropy": 2.419196844100952, "epoch": 0.9659222737819025, "grad_norm": 0.02588793635368347, "grad_norm_var": 5.094860927548049e-07, "learning_rate": 2.9806236420051314e-05, "loss": 2.4315, "step": 26644 }, { "crossentropy": 2.362414598464966, "epoch": 0.9659585266821346, "grad_norm": 0.027027832344174385, "grad_norm_var": 5.081661776163333e-07, "learning_rate": 2.9742915544074267e-05, "loss": 2.3506, "step": 26645 }, { "crossentropy": 2.5230507850646973, "epoch": 0.9659947795823666, "grad_norm": 0.02586185745894909, "grad_norm_var": 5.243564092126858e-07, "learning_rate": 2.967966179896142e-05, "loss": 2.4778, "step": 26646 }, { "crossentropy": 2.502246379852295, "epoch": 0.9660310324825986, "grad_norm": 0.026604337617754936, "grad_norm_var": 5.023534793403695e-07, "learning_rate": 2.9616475185567648e-05, "loss": 2.5066, "step": 26647 }, { "crossentropy": 2.1805834770202637, "epoch": 0.9660672853828306, "grad_norm": 0.025967460125684738, "grad_norm_var": 4.77831036447586e-07, "learning_rate": 2.9553355704746155e-05, "loss": 2.2355, "step": 26648 }, { "crossentropy": 2.255823850631714, "epoch": 0.9661035382830626, "grad_norm": 0.026181742548942566, "grad_norm_var": 4.773597663118666e-07, "learning_rate": 2.949030335734959e-05, "loss": 2.3674, "step": 26649 }, { "crossentropy": 2.407743453979492, "epoch": 0.9661397911832946, "grad_norm": 0.026775529608130455, "grad_norm_var": 4.673983221218595e-07, "learning_rate": 2.9427318144228943e-05, "loss": 2.3908, "step": 26650 }, { "crossentropy": 2.350339651107788, "epoch": 0.9661760440835266, "grad_norm": 0.026168132200837135, "grad_norm_var": 4.745672519720767e-07, "learning_rate": 2.936440006623631e-05, "loss": 2.4179, "step": 26651 }, { "crossentropy": 2.376359224319458, "epoch": 0.9662122969837587, "grad_norm": 0.025933634489774704, "grad_norm_var": 4.953595002673972e-07, "learning_rate": 2.930154912422045e-05, "loss": 2.4615, "step": 26652 }, { "crossentropy": 2.5215091705322266, "epoch": 0.9662485498839907, "grad_norm": 0.026354797184467316, "grad_norm_var": 4.6219024059003755e-07, "learning_rate": 2.9238765319030134e-05, "loss": 2.4146, "step": 26653 }, { "crossentropy": 2.4899685382843018, "epoch": 0.9662848027842227, "grad_norm": 0.025883635506033897, "grad_norm_var": 4.780879219058615e-07, "learning_rate": 2.9176048651513577e-05, "loss": 2.4079, "step": 26654 }, { "crossentropy": 2.382448196411133, "epoch": 0.9663210556844548, "grad_norm": 0.02675577811896801, "grad_norm_var": 4.836150064172854e-07, "learning_rate": 2.9113399122518425e-05, "loss": 2.4607, "step": 26655 }, { "crossentropy": 2.4406726360321045, "epoch": 0.9663573085846868, "grad_norm": 0.025118643417954445, "grad_norm_var": 5.873078555822571e-07, "learning_rate": 2.9050816732890674e-05, "loss": 2.3687, "step": 26656 }, { "crossentropy": 2.350477695465088, "epoch": 0.9663935614849188, "grad_norm": 0.026204105466604233, "grad_norm_var": 5.698461707602144e-07, "learning_rate": 2.8988301483474644e-05, "loss": 2.3924, "step": 26657 }, { "crossentropy": 2.5025041103363037, "epoch": 0.9664298143851509, "grad_norm": 0.026445774361491203, "grad_norm_var": 2.243915997045583e-07, "learning_rate": 2.8925853375115773e-05, "loss": 2.4539, "step": 26658 }, { "crossentropy": 2.505945920944214, "epoch": 0.9664660672853829, "grad_norm": 0.026272710412740707, "grad_norm_var": 2.1215923715366994e-07, "learning_rate": 2.8863472408657275e-05, "loss": 2.4508, "step": 26659 }, { "crossentropy": 2.4472179412841797, "epoch": 0.9665023201856149, "grad_norm": 0.025825260207057, "grad_norm_var": 2.151400084809037e-07, "learning_rate": 2.8801158584941255e-05, "loss": 2.4657, "step": 26660 }, { "crossentropy": 2.3856754302978516, "epoch": 0.9665385730858469, "grad_norm": 0.026062188670039177, "grad_norm_var": 1.6829213651438588e-07, "learning_rate": 2.8738911904809817e-05, "loss": 2.3986, "step": 26661 }, { "crossentropy": 2.3781015872955322, "epoch": 0.9665748259860789, "grad_norm": 0.026145948097109795, "grad_norm_var": 1.6238497295746843e-07, "learning_rate": 2.8676732369102844e-05, "loss": 2.2848, "step": 26662 }, { "crossentropy": 2.4440526962280273, "epoch": 0.9666110788863109, "grad_norm": 0.026850156486034393, "grad_norm_var": 1.8043906878383436e-07, "learning_rate": 2.8614619978661326e-05, "loss": 2.3979, "step": 26663 }, { "crossentropy": 2.5152320861816406, "epoch": 0.9666473317865429, "grad_norm": 0.02581573836505413, "grad_norm_var": 1.8626018780965962e-07, "learning_rate": 2.85525747343246e-05, "loss": 2.4663, "step": 26664 }, { "crossentropy": 2.445363998413086, "epoch": 0.966683584686775, "grad_norm": 0.02491634152829647, "grad_norm_var": 2.851344267866858e-07, "learning_rate": 2.8490596636928658e-05, "loss": 2.412, "step": 26665 }, { "crossentropy": 2.48539662361145, "epoch": 0.966719837587007, "grad_norm": 0.025478744879364967, "grad_norm_var": 2.726613661588184e-07, "learning_rate": 2.842868568731227e-05, "loss": 2.3925, "step": 26666 }, { "crossentropy": 2.2360873222351074, "epoch": 0.966756090487239, "grad_norm": 0.026902392506599426, "grad_norm_var": 3.2140081945968474e-07, "learning_rate": 2.836684188631089e-05, "loss": 2.2853, "step": 26667 }, { "crossentropy": 2.440171241760254, "epoch": 0.966792343387471, "grad_norm": 0.02583974041044712, "grad_norm_var": 3.2353839912491133e-07, "learning_rate": 2.8305065234759954e-05, "loss": 2.3673, "step": 26668 }, { "crossentropy": 2.3951802253723145, "epoch": 0.966828596287703, "grad_norm": 0.027497543022036552, "grad_norm_var": 4.509106891915799e-07, "learning_rate": 2.8243355733494345e-05, "loss": 2.3886, "step": 26669 }, { "crossentropy": 2.4136593341827393, "epoch": 0.966864849187935, "grad_norm": 0.026337100192904472, "grad_norm_var": 4.4911365319888253e-07, "learning_rate": 2.818171338334674e-05, "loss": 2.3455, "step": 26670 }, { "crossentropy": 2.418684720993042, "epoch": 0.966901102088167, "grad_norm": 0.026421358808875084, "grad_norm_var": 4.292822009222333e-07, "learning_rate": 2.81201381851498e-05, "loss": 2.3745, "step": 26671 }, { "crossentropy": 2.3841278553009033, "epoch": 0.9669373549883991, "grad_norm": 0.026305843144655228, "grad_norm_var": 3.5674969744955927e-07, "learning_rate": 2.8058630139736196e-05, "loss": 2.4071, "step": 26672 }, { "crossentropy": 2.4238884449005127, "epoch": 0.9669736078886311, "grad_norm": 0.0269129890948534, "grad_norm_var": 3.8783051179291697e-07, "learning_rate": 2.7997189247935817e-05, "loss": 2.3779, "step": 26673 }, { "crossentropy": 2.3795864582061768, "epoch": 0.9670098607888631, "grad_norm": 0.025992102921009064, "grad_norm_var": 3.889646004436118e-07, "learning_rate": 2.7935815510578e-05, "loss": 2.3941, "step": 26674 }, { "crossentropy": 2.337930679321289, "epoch": 0.9670461136890951, "grad_norm": 0.026282696053385735, "grad_norm_var": 3.8903633896372746e-07, "learning_rate": 2.787450892849319e-05, "loss": 2.2774, "step": 26675 }, { "crossentropy": 2.3980047702789307, "epoch": 0.9670823665893271, "grad_norm": 0.02556881308555603, "grad_norm_var": 4.067853316475165e-07, "learning_rate": 2.781326950250851e-05, "loss": 2.4561, "step": 26676 }, { "crossentropy": 2.4424326419830322, "epoch": 0.9671186194895591, "grad_norm": 0.026006178930401802, "grad_norm_var": 4.0807110597653773e-07, "learning_rate": 2.775209723345107e-05, "loss": 2.4539, "step": 26677 }, { "crossentropy": 2.4326670169830322, "epoch": 0.9671548723897911, "grad_norm": 0.027191322296857834, "grad_norm_var": 4.6819570311360665e-07, "learning_rate": 2.7690992122146875e-05, "loss": 2.3926, "step": 26678 }, { "crossentropy": 2.2625489234924316, "epoch": 0.9671911252900232, "grad_norm": 0.025504212826490402, "grad_norm_var": 4.772935576449755e-07, "learning_rate": 2.7629954169421933e-05, "loss": 2.3964, "step": 26679 }, { "crossentropy": 2.3746261596679688, "epoch": 0.9672273781902552, "grad_norm": 0.02601078525185585, "grad_norm_var": 4.7004683018923335e-07, "learning_rate": 2.7568983376100033e-05, "loss": 2.3907, "step": 26680 }, { "crossentropy": 2.49237060546875, "epoch": 0.9672636310904872, "grad_norm": 0.027000421658158302, "grad_norm_var": 3.853619790625349e-07, "learning_rate": 2.7508079743005508e-05, "loss": 2.5006, "step": 26681 }, { "crossentropy": 2.493858575820923, "epoch": 0.9672998839907193, "grad_norm": 0.025420265272259712, "grad_norm_var": 3.9219967011883533e-07, "learning_rate": 2.7447243270959932e-05, "loss": 2.4144, "step": 26682 }, { "crossentropy": 2.3187692165374756, "epoch": 0.9673361368909513, "grad_norm": 0.025743335485458374, "grad_norm_var": 3.868719971926304e-07, "learning_rate": 2.738647396078542e-05, "loss": 2.3286, "step": 26683 }, { "crossentropy": 2.326677083969116, "epoch": 0.9673723897911833, "grad_norm": 0.02646597847342491, "grad_norm_var": 3.7694572510750765e-07, "learning_rate": 2.732577181330298e-05, "loss": 2.3938, "step": 26684 }, { "crossentropy": 2.4754903316497803, "epoch": 0.9674086426914154, "grad_norm": 0.026723165065050125, "grad_norm_var": 2.8988042927341416e-07, "learning_rate": 2.726513682933196e-05, "loss": 2.5269, "step": 26685 }, { "crossentropy": 2.539398670196533, "epoch": 0.9674448955916474, "grad_norm": 0.02556850016117096, "grad_norm_var": 3.1714949828929445e-07, "learning_rate": 2.7204569009691706e-05, "loss": 2.4302, "step": 26686 }, { "crossentropy": 2.4254848957061768, "epoch": 0.9674811484918794, "grad_norm": 0.027024613693356514, "grad_norm_var": 3.5811143016840446e-07, "learning_rate": 2.7144068355200446e-05, "loss": 2.4651, "step": 26687 }, { "crossentropy": 2.323409080505371, "epoch": 0.9675174013921114, "grad_norm": 0.02650388330221176, "grad_norm_var": 3.624973068493941e-07, "learning_rate": 2.708363486667531e-05, "loss": 2.3526, "step": 26688 }, { "crossentropy": 2.398200750350952, "epoch": 0.9675536542923434, "grad_norm": 0.027370182797312737, "grad_norm_var": 4.1628429703182333e-07, "learning_rate": 2.7023268544932313e-05, "loss": 2.4316, "step": 26689 }, { "crossentropy": 2.5341358184814453, "epoch": 0.9675899071925754, "grad_norm": 0.025909194722771645, "grad_norm_var": 4.198249068543819e-07, "learning_rate": 2.6962969390786353e-05, "loss": 2.4959, "step": 26690 }, { "crossentropy": 2.307687520980835, "epoch": 0.9676261600928074, "grad_norm": 0.02600463479757309, "grad_norm_var": 4.241252901784319e-07, "learning_rate": 2.690273740505289e-05, "loss": 2.3924, "step": 26691 }, { "crossentropy": 2.350367307662964, "epoch": 0.9676624129930395, "grad_norm": 0.025727415457367897, "grad_norm_var": 4.112719416214516e-07, "learning_rate": 2.684257258854461e-05, "loss": 2.4173, "step": 26692 }, { "crossentropy": 2.3334057331085205, "epoch": 0.9676986658932715, "grad_norm": 0.026237573474645615, "grad_norm_var": 4.0676019494057893e-07, "learning_rate": 2.6782474942074752e-05, "loss": 2.3292, "step": 26693 }, { "crossentropy": 2.5196914672851562, "epoch": 0.9677349187935035, "grad_norm": 0.02647012285888195, "grad_norm_var": 3.5118771411177196e-07, "learning_rate": 2.6722444466454333e-05, "loss": 2.5189, "step": 26694 }, { "crossentropy": 2.389615297317505, "epoch": 0.9677711716937355, "grad_norm": 0.025546077638864517, "grad_norm_var": 3.4724443464666187e-07, "learning_rate": 2.6662481162494366e-05, "loss": 2.4172, "step": 26695 }, { "crossentropy": 2.2944767475128174, "epoch": 0.9678074245939675, "grad_norm": 0.02563754841685295, "grad_norm_var": 3.6700378659817955e-07, "learning_rate": 2.6602585031005323e-05, "loss": 2.3298, "step": 26696 }, { "crossentropy": 2.5054938793182373, "epoch": 0.9678436774941995, "grad_norm": 0.02583223395049572, "grad_norm_var": 3.2911142358985473e-07, "learning_rate": 2.6542756072795437e-05, "loss": 2.423, "step": 26697 }, { "crossentropy": 2.355520248413086, "epoch": 0.9678799303944315, "grad_norm": 0.026669062674045563, "grad_norm_var": 3.0731469834079324e-07, "learning_rate": 2.648299428867351e-05, "loss": 2.2781, "step": 26698 }, { "crossentropy": 2.5482261180877686, "epoch": 0.9679161832946636, "grad_norm": 0.026031771674752235, "grad_norm_var": 2.943906327700866e-07, "learning_rate": 2.6423299679446122e-05, "loss": 2.5414, "step": 26699 }, { "crossentropy": 2.534266471862793, "epoch": 0.9679524361948956, "grad_norm": 0.02593541145324707, "grad_norm_var": 2.954763331038893e-07, "learning_rate": 2.6363672245919844e-05, "loss": 2.4267, "step": 26700 }, { "crossentropy": 2.378444194793701, "epoch": 0.9679886890951276, "grad_norm": 0.02688620239496231, "grad_norm_var": 3.0852207601865e-07, "learning_rate": 2.6304111988899593e-05, "loss": 2.3981, "step": 26701 }, { "crossentropy": 2.4435791969299316, "epoch": 0.9680249419953596, "grad_norm": 0.025642329826951027, "grad_norm_var": 3.0255128366796226e-07, "learning_rate": 2.6244618909190277e-05, "loss": 2.4061, "step": 26702 }, { "crossentropy": 2.330576181411743, "epoch": 0.9680611948955916, "grad_norm": 0.025344792753458023, "grad_norm_var": 2.9741519040614393e-07, "learning_rate": 2.61851930075957e-05, "loss": 2.3836, "step": 26703 }, { "crossentropy": 2.3943228721618652, "epoch": 0.9680974477958236, "grad_norm": 0.025288067758083344, "grad_norm_var": 3.2583408786182077e-07, "learning_rate": 2.6125834284917993e-05, "loss": 2.3916, "step": 26704 }, { "crossentropy": 2.4527406692504883, "epoch": 0.9681337006960556, "grad_norm": 0.02636992558836937, "grad_norm_var": 2.100678774166753e-07, "learning_rate": 2.6066542741958744e-05, "loss": 2.4253, "step": 26705 }, { "crossentropy": 2.4320218563079834, "epoch": 0.9681699535962877, "grad_norm": 0.025562483817338943, "grad_norm_var": 2.204275463043298e-07, "learning_rate": 2.6007318379518974e-05, "loss": 2.4088, "step": 26706 }, { "crossentropy": 2.208674907684326, "epoch": 0.9682062064965197, "grad_norm": 0.02593380957841873, "grad_norm_var": 2.202166564933914e-07, "learning_rate": 2.5948161198399155e-05, "loss": 2.2649, "step": 26707 }, { "crossentropy": 2.3578739166259766, "epoch": 0.9682424593967517, "grad_norm": 0.026301128789782524, "grad_norm_var": 2.2416891024162368e-07, "learning_rate": 2.588907119939754e-05, "loss": 2.3628, "step": 26708 }, { "crossentropy": 2.4188780784606934, "epoch": 0.9682787122969838, "grad_norm": 0.025720244273543358, "grad_norm_var": 2.2316587816988605e-07, "learning_rate": 2.5830048383312932e-05, "loss": 2.4187, "step": 26709 }, { "crossentropy": 2.489511013031006, "epoch": 0.9683149651972158, "grad_norm": 0.025502139702439308, "grad_norm_var": 2.1436627708152494e-07, "learning_rate": 2.5771092750941362e-05, "loss": 2.5027, "step": 26710 }, { "crossentropy": 2.356700897216797, "epoch": 0.9683512180974478, "grad_norm": 0.02628263644874096, "grad_norm_var": 2.1472352654468227e-07, "learning_rate": 2.5712204303079968e-05, "loss": 2.4069, "step": 26711 }, { "crossentropy": 2.471503973007202, "epoch": 0.9683874709976799, "grad_norm": 0.02700548805296421, "grad_norm_var": 2.7765482390236747e-07, "learning_rate": 2.5653383040524226e-05, "loss": 2.4674, "step": 26712 }, { "crossentropy": 2.4865479469299316, "epoch": 0.9684237238979119, "grad_norm": 0.027393164113163948, "grad_norm_var": 3.910172558685421e-07, "learning_rate": 2.55946289640685e-05, "loss": 2.4836, "step": 26713 }, { "crossentropy": 2.4329488277435303, "epoch": 0.9684599767981439, "grad_norm": 0.0261379387229681, "grad_norm_var": 3.6953809760656833e-07, "learning_rate": 2.5535942074506047e-05, "loss": 2.3947, "step": 26714 }, { "crossentropy": 2.4513046741485596, "epoch": 0.9684962296983759, "grad_norm": 0.025673573836684227, "grad_norm_var": 3.800323150047648e-07, "learning_rate": 2.547732237263012e-05, "loss": 2.4366, "step": 26715 }, { "crossentropy": 2.436652898788452, "epoch": 0.9685324825986079, "grad_norm": 0.025941530242562294, "grad_norm_var": 3.799320248386191e-07, "learning_rate": 2.5418769859231194e-05, "loss": 2.4119, "step": 26716 }, { "crossentropy": 2.3649473190307617, "epoch": 0.9685687354988399, "grad_norm": 0.02628239430487156, "grad_norm_var": 3.363309332467344e-07, "learning_rate": 2.5360284535101973e-05, "loss": 2.3217, "step": 26717 }, { "crossentropy": 2.502617835998535, "epoch": 0.9686049883990719, "grad_norm": 0.027464574202895164, "grad_norm_var": 4.511697189630981e-07, "learning_rate": 2.5301866401030716e-05, "loss": 2.4503, "step": 26718 }, { "crossentropy": 2.405992031097412, "epoch": 0.968641241299304, "grad_norm": 0.026385262608528137, "grad_norm_var": 4.088253338761175e-07, "learning_rate": 2.5243515457807343e-05, "loss": 2.411, "step": 26719 }, { "crossentropy": 2.2906832695007324, "epoch": 0.968677494199536, "grad_norm": 0.026234496384859085, "grad_norm_var": 3.493812306686137e-07, "learning_rate": 2.5185231706219556e-05, "loss": 2.3435, "step": 26720 }, { "crossentropy": 2.3535666465759277, "epoch": 0.968713747099768, "grad_norm": 0.0253546554595232, "grad_norm_var": 3.9918452187093224e-07, "learning_rate": 2.5127015147055067e-05, "loss": 2.4246, "step": 26721 }, { "crossentropy": 2.440948724746704, "epoch": 0.96875, "grad_norm": 0.02963174507021904, "grad_norm_var": 1.0890490926995326e-06, "learning_rate": 2.5068865781099903e-05, "loss": 2.5139, "step": 26722 }, { "crossentropy": 2.327888011932373, "epoch": 0.968786252900232, "grad_norm": 0.02573973499238491, "grad_norm_var": 1.1048328358077465e-06, "learning_rate": 2.5010783609139e-05, "loss": 2.3782, "step": 26723 }, { "crossentropy": 2.4438302516937256, "epoch": 0.968822505800464, "grad_norm": 0.027130959555506706, "grad_norm_var": 1.1324322061045568e-06, "learning_rate": 2.4952768631957833e-05, "loss": 2.4492, "step": 26724 }, { "crossentropy": 2.3396618366241455, "epoch": 0.968858758700696, "grad_norm": 0.02599874511361122, "grad_norm_var": 1.1086021116189666e-06, "learning_rate": 2.4894820850339116e-05, "loss": 2.4292, "step": 26725 }, { "crossentropy": 2.373399019241333, "epoch": 0.9688950116009281, "grad_norm": 0.025292741134762764, "grad_norm_var": 1.139480186396607e-06, "learning_rate": 2.4836940265065556e-05, "loss": 2.3302, "step": 26726 }, { "crossentropy": 2.316059112548828, "epoch": 0.9689312645011601, "grad_norm": 0.02617529220879078, "grad_norm_var": 1.1432663419584192e-06, "learning_rate": 2.4779126876919857e-05, "loss": 2.3955, "step": 26727 }, { "crossentropy": 2.3630259037017822, "epoch": 0.9689675174013921, "grad_norm": 0.025840165093541145, "grad_norm_var": 1.1480675633739981e-06, "learning_rate": 2.472138068668195e-05, "loss": 2.416, "step": 26728 }, { "crossentropy": 2.3018555641174316, "epoch": 0.9690037703016241, "grad_norm": 0.026331104338169098, "grad_norm_var": 1.080377144200871e-06, "learning_rate": 2.466370169513177e-05, "loss": 2.3917, "step": 26729 }, { "crossentropy": 2.406688928604126, "epoch": 0.9690400232018561, "grad_norm": 0.025360489264130592, "grad_norm_var": 1.1402327489721017e-06, "learning_rate": 2.460608990304869e-05, "loss": 2.4097, "step": 26730 }, { "crossentropy": 2.546469211578369, "epoch": 0.9690762761020881, "grad_norm": 0.026388557627797127, "grad_norm_var": 1.1122417055922045e-06, "learning_rate": 2.4548545311210978e-05, "loss": 2.4721, "step": 26731 }, { "crossentropy": 2.3190267086029053, "epoch": 0.9691125290023201, "grad_norm": 0.026435811072587967, "grad_norm_var": 1.1007873343314712e-06, "learning_rate": 2.4491067920395792e-05, "loss": 2.3657, "step": 26732 }, { "crossentropy": 2.5336036682128906, "epoch": 0.9691487819025522, "grad_norm": 0.026159189641475677, "grad_norm_var": 1.1033052829550188e-06, "learning_rate": 2.443365773137918e-05, "loss": 2.4751, "step": 26733 }, { "crossentropy": 2.343759775161743, "epoch": 0.9691850348027842, "grad_norm": 0.026437437161803246, "grad_norm_var": 1.0193699064886629e-06, "learning_rate": 2.4376314744936624e-05, "loss": 2.3451, "step": 26734 }, { "crossentropy": 2.4222657680511475, "epoch": 0.9692212877030162, "grad_norm": 0.02628270350396633, "grad_norm_var": 1.0189437548421252e-06, "learning_rate": 2.4319038961843065e-05, "loss": 2.3916, "step": 26735 }, { "crossentropy": 2.397369623184204, "epoch": 0.9692575406032483, "grad_norm": 0.02658996917307377, "grad_norm_var": 1.0237549618615268e-06, "learning_rate": 2.426183038287122e-05, "loss": 2.4245, "step": 26736 }, { "crossentropy": 2.5032734870910645, "epoch": 0.9692937935034803, "grad_norm": 0.025759490206837654, "grad_norm_var": 9.817919765305367e-07, "learning_rate": 2.420468900879491e-05, "loss": 2.4617, "step": 26737 }, { "crossentropy": 2.5855727195739746, "epoch": 0.9693300464037123, "grad_norm": 0.026097068563103676, "grad_norm_var": 2.1465783040334566e-07, "learning_rate": 2.414761484038519e-05, "loss": 2.4891, "step": 26738 }, { "crossentropy": 2.38057279586792, "epoch": 0.9693662993039444, "grad_norm": 0.025450479239225388, "grad_norm_var": 2.3479272312439532e-07, "learning_rate": 2.4090607878412552e-05, "loss": 2.4475, "step": 26739 }, { "crossentropy": 2.352872133255005, "epoch": 0.9694025522041764, "grad_norm": 0.026097428053617477, "grad_norm_var": 1.6060528306571554e-07, "learning_rate": 2.4033668123648046e-05, "loss": 2.4047, "step": 26740 }, { "crossentropy": 2.341647148132324, "epoch": 0.9694388051044084, "grad_norm": 0.02676199935376644, "grad_norm_var": 1.924562464680381e-07, "learning_rate": 2.3976795576859944e-05, "loss": 2.3943, "step": 26741 }, { "crossentropy": 2.488687038421631, "epoch": 0.9694750580046404, "grad_norm": 0.026316463947296143, "grad_norm_var": 1.4896384015605548e-07, "learning_rate": 2.3919990238816525e-05, "loss": 2.4417, "step": 26742 }, { "crossentropy": 2.4604835510253906, "epoch": 0.9695113109048724, "grad_norm": 0.02733522467315197, "grad_norm_var": 2.3615712705140254e-07, "learning_rate": 2.386325211028495e-05, "loss": 2.4125, "step": 26743 }, { "crossentropy": 2.134713888168335, "epoch": 0.9695475638051044, "grad_norm": 0.02513163909316063, "grad_norm_var": 3.0414541874108393e-07, "learning_rate": 2.380658119203183e-05, "loss": 2.156, "step": 26744 }, { "crossentropy": 2.4600584506988525, "epoch": 0.9695838167053364, "grad_norm": 0.025914330035448074, "grad_norm_var": 3.0679607820568944e-07, "learning_rate": 2.374997748482266e-05, "loss": 2.5245, "step": 26745 }, { "crossentropy": 2.482858896255493, "epoch": 0.9696200696055685, "grad_norm": 0.025880448520183563, "grad_norm_var": 2.684457988186631e-07, "learning_rate": 2.3693440989421834e-05, "loss": 2.4229, "step": 26746 }, { "crossentropy": 2.332573890686035, "epoch": 0.9696563225058005, "grad_norm": 0.026300767436623573, "grad_norm_var": 2.6660201770522854e-07, "learning_rate": 2.3636971706592626e-05, "loss": 2.3491, "step": 26747 }, { "crossentropy": 2.4586689472198486, "epoch": 0.9696925754060325, "grad_norm": 0.02720906026661396, "grad_norm_var": 3.2989179693045505e-07, "learning_rate": 2.358056963709776e-05, "loss": 2.4014, "step": 26748 }, { "crossentropy": 2.4369797706604004, "epoch": 0.9697288283062645, "grad_norm": 0.025551430881023407, "grad_norm_var": 3.5893686821419857e-07, "learning_rate": 2.352423478169996e-05, "loss": 2.3819, "step": 26749 }, { "crossentropy": 2.6966426372528076, "epoch": 0.9697650812064965, "grad_norm": 0.027365610003471375, "grad_norm_var": 4.428154673066208e-07, "learning_rate": 2.3467967141159175e-05, "loss": 2.5185, "step": 26750 }, { "crossentropy": 2.3796067237854004, "epoch": 0.9698013341067285, "grad_norm": 0.026793045923113823, "grad_norm_var": 4.6113127931673763e-07, "learning_rate": 2.341176671623535e-05, "loss": 2.4447, "step": 26751 }, { "crossentropy": 2.527524709701538, "epoch": 0.9698375870069605, "grad_norm": 0.02671380713582039, "grad_norm_var": 4.671310594905048e-07, "learning_rate": 2.3355633507688436e-05, "loss": 2.4697, "step": 26752 }, { "crossentropy": 2.398268461227417, "epoch": 0.9698738399071926, "grad_norm": 0.07139305025339127, "grad_norm_var": 0.0001273760581412162, "learning_rate": 2.3299567516275044e-05, "loss": 2.4291, "step": 26753 }, { "crossentropy": 2.3532636165618896, "epoch": 0.9699100928074246, "grad_norm": 0.026335055008530617, "grad_norm_var": 0.0001272828986287143, "learning_rate": 2.3243568742754573e-05, "loss": 2.4318, "step": 26754 }, { "crossentropy": 2.299158811569214, "epoch": 0.9699463457076566, "grad_norm": 0.02647770382463932, "grad_norm_var": 0.00012684086687529415, "learning_rate": 2.3187637187881417e-05, "loss": 2.3472, "step": 26755 }, { "crossentropy": 2.3889505863189697, "epoch": 0.9699825986078886, "grad_norm": 0.02523065358400345, "grad_norm_var": 0.00012724911060125175, "learning_rate": 2.313177285241219e-05, "loss": 2.415, "step": 26756 }, { "crossentropy": 2.513751268386841, "epoch": 0.9700188515081206, "grad_norm": 0.025669040158391, "grad_norm_var": 0.0001276745950096009, "learning_rate": 2.307597573710074e-05, "loss": 2.4779, "step": 26757 }, { "crossentropy": 2.2918667793273926, "epoch": 0.9700551044083526, "grad_norm": 0.025377321988344193, "grad_norm_var": 0.00012807840626955305, "learning_rate": 2.3020245842701458e-05, "loss": 2.2984, "step": 26758 }, { "crossentropy": 2.4874460697174072, "epoch": 0.9700913573085846, "grad_norm": 0.02527705766260624, "grad_norm_var": 0.00012881164278733513, "learning_rate": 2.296458316996597e-05, "loss": 2.4098, "step": 26759 }, { "crossentropy": 2.380147933959961, "epoch": 0.9701276102088167, "grad_norm": 0.026392925530672073, "grad_norm_var": 0.0001282750269168729, "learning_rate": 2.2908987719647e-05, "loss": 2.4582, "step": 26760 }, { "crossentropy": 2.4727821350097656, "epoch": 0.9701638631090487, "grad_norm": 0.025894371792674065, "grad_norm_var": 0.00012828324334529604, "learning_rate": 2.2853459492495064e-05, "loss": 2.3923, "step": 26761 }, { "crossentropy": 2.4735350608825684, "epoch": 0.9702001160092807, "grad_norm": 0.026294894516468048, "grad_norm_var": 0.00012812207279648216, "learning_rate": 2.2797998489260673e-05, "loss": 2.4375, "step": 26762 }, { "crossentropy": 2.455397605895996, "epoch": 0.9702363689095128, "grad_norm": 0.026923613622784615, "grad_norm_var": 0.00012792072650063817, "learning_rate": 2.2742604710692117e-05, "loss": 2.4514, "step": 26763 }, { "crossentropy": 2.5465402603149414, "epoch": 0.9702726218097448, "grad_norm": 0.025767622515559196, "grad_norm_var": 0.0001284055836499272, "learning_rate": 2.2687278157537685e-05, "loss": 2.4682, "step": 26764 }, { "crossentropy": 2.390817403793335, "epoch": 0.9703088747099768, "grad_norm": 0.026532551273703575, "grad_norm_var": 0.00012801905567664908, "learning_rate": 2.2632018830545663e-05, "loss": 2.3522, "step": 26765 }, { "crossentropy": 2.51638126373291, "epoch": 0.9703451276102089, "grad_norm": 0.02657819353044033, "grad_norm_var": 0.0001282322761694362, "learning_rate": 2.257682673046102e-05, "loss": 2.488, "step": 26766 }, { "crossentropy": 2.348179340362549, "epoch": 0.9703813805104409, "grad_norm": 0.02644115313887596, "grad_norm_var": 0.00012834253992427766, "learning_rate": 2.2521701858030373e-05, "loss": 2.3581, "step": 26767 }, { "crossentropy": 2.416795015335083, "epoch": 0.9704176334106729, "grad_norm": 0.025539763271808624, "grad_norm_var": 0.0001287797091510961, "learning_rate": 2.2466644213997023e-05, "loss": 2.2724, "step": 26768 }, { "crossentropy": 2.4709672927856445, "epoch": 0.9704538863109049, "grad_norm": 0.02607443742454052, "grad_norm_var": 2.734061144796587e-07, "learning_rate": 2.2411653799105925e-05, "loss": 2.4307, "step": 26769 }, { "crossentropy": 2.1932692527770996, "epoch": 0.9704901392111369, "grad_norm": 0.026184789836406708, "grad_norm_var": 2.6911412480215035e-07, "learning_rate": 2.2356730614098708e-05, "loss": 2.2332, "step": 26770 }, { "crossentropy": 2.522688627243042, "epoch": 0.9705263921113689, "grad_norm": 0.027872566133737564, "grad_norm_var": 4.7193448383797454e-07, "learning_rate": 2.2301874659718112e-05, "loss": 2.4813, "step": 26771 }, { "crossentropy": 2.4630188941955566, "epoch": 0.9705626450116009, "grad_norm": 0.026571223512291908, "grad_norm_var": 4.238278618946215e-07, "learning_rate": 2.2247085936704102e-05, "loss": 2.431, "step": 26772 }, { "crossentropy": 2.3862850666046143, "epoch": 0.970598897911833, "grad_norm": 0.027027929201722145, "grad_norm_var": 4.4086815440788573e-07, "learning_rate": 2.2192364445798308e-05, "loss": 2.4053, "step": 26773 }, { "crossentropy": 2.3209712505340576, "epoch": 0.970635150812065, "grad_norm": 0.02687574364244938, "grad_norm_var": 3.9747510283260946e-07, "learning_rate": 2.2137710187737914e-05, "loss": 2.4312, "step": 26774 }, { "crossentropy": 2.338409423828125, "epoch": 0.970671403712297, "grad_norm": 0.026186881586909294, "grad_norm_var": 3.141334528671454e-07, "learning_rate": 2.2083123163261777e-05, "loss": 2.3572, "step": 26775 }, { "crossentropy": 2.390230894088745, "epoch": 0.970707656612529, "grad_norm": 0.025692258030176163, "grad_norm_var": 3.4990754025678493e-07, "learning_rate": 2.2028603373107635e-05, "loss": 2.3451, "step": 26776 }, { "crossentropy": 2.2836718559265137, "epoch": 0.970743909512761, "grad_norm": 0.0256999172270298, "grad_norm_var": 3.6547436325881584e-07, "learning_rate": 2.1974150818011017e-05, "loss": 2.3685, "step": 26777 }, { "crossentropy": 2.430556058883667, "epoch": 0.970780162412993, "grad_norm": 0.026192158460617065, "grad_norm_var": 3.6745695220981254e-07, "learning_rate": 2.1919765498708555e-05, "loss": 2.3681, "step": 26778 }, { "crossentropy": 2.5403664112091064, "epoch": 0.970816415313225, "grad_norm": 0.027224861085414886, "grad_norm_var": 3.9476094658147156e-07, "learning_rate": 2.1865447415934103e-05, "loss": 2.5039, "step": 26779 }, { "crossentropy": 2.2540483474731445, "epoch": 0.9708526682134571, "grad_norm": 0.026690922677516937, "grad_norm_var": 3.6971382591995057e-07, "learning_rate": 2.1811196570420967e-05, "loss": 2.2979, "step": 26780 }, { "crossentropy": 2.5508148670196533, "epoch": 0.9708889211136891, "grad_norm": 0.026491910219192505, "grad_norm_var": 3.6943250094183717e-07, "learning_rate": 2.1757012962902445e-05, "loss": 2.4347, "step": 26781 }, { "crossentropy": 2.3832523822784424, "epoch": 0.9709251740139211, "grad_norm": 0.025188738480210304, "grad_norm_var": 4.6802041989187314e-07, "learning_rate": 2.1702896594110733e-05, "loss": 2.3446, "step": 26782 }, { "crossentropy": 2.4209539890289307, "epoch": 0.9709614269141531, "grad_norm": 0.026090586557984352, "grad_norm_var": 4.7247861326483486e-07, "learning_rate": 2.16488474647758e-05, "loss": 2.4536, "step": 26783 }, { "crossentropy": 2.4136812686920166, "epoch": 0.9709976798143851, "grad_norm": 0.025768188759684563, "grad_norm_var": 4.510536657453823e-07, "learning_rate": 2.1594865575628175e-05, "loss": 2.4455, "step": 26784 }, { "crossentropy": 2.279911756515503, "epoch": 0.9710339327146171, "grad_norm": 0.026282750070095062, "grad_norm_var": 4.4570737516140027e-07, "learning_rate": 2.154095092739672e-05, "loss": 2.3563, "step": 26785 }, { "crossentropy": 2.372321844100952, "epoch": 0.9710701856148491, "grad_norm": 0.02643195353448391, "grad_norm_var": 4.431717632283801e-07, "learning_rate": 2.1487103520809735e-05, "loss": 2.3506, "step": 26786 }, { "crossentropy": 2.357478380203247, "epoch": 0.9711064385150812, "grad_norm": 0.02664908394217491, "grad_norm_var": 2.9537152026840964e-07, "learning_rate": 2.143332335659498e-05, "loss": 2.4248, "step": 26787 }, { "crossentropy": 2.29437518119812, "epoch": 0.9711426914153132, "grad_norm": 0.026451271027326584, "grad_norm_var": 2.9219795179649156e-07, "learning_rate": 2.137961043547798e-05, "loss": 2.3043, "step": 26788 }, { "crossentropy": 2.3690319061279297, "epoch": 0.9711789443155452, "grad_norm": 0.025631174445152283, "grad_norm_var": 2.802550927820391e-07, "learning_rate": 2.1325964758184822e-05, "loss": 2.2976, "step": 26789 }, { "crossentropy": 2.3765718936920166, "epoch": 0.9712151972157773, "grad_norm": 0.027839407324790955, "grad_norm_var": 4.2232301962577024e-07, "learning_rate": 2.1272386325440485e-05, "loss": 2.5121, "step": 26790 }, { "crossentropy": 2.367466449737549, "epoch": 0.9712514501160093, "grad_norm": 0.025995677337050438, "grad_norm_var": 4.270330015736321e-07, "learning_rate": 2.1218875137967165e-05, "loss": 2.432, "step": 26791 }, { "crossentropy": 2.337852954864502, "epoch": 0.9712877030162413, "grad_norm": 0.026255154982209206, "grad_norm_var": 4.0347105664337993e-07, "learning_rate": 2.1165431196489283e-05, "loss": 2.3881, "step": 26792 }, { "crossentropy": 2.4725699424743652, "epoch": 0.9713239559164734, "grad_norm": 0.02620769664645195, "grad_norm_var": 3.7860368154631293e-07, "learning_rate": 2.1112054501727374e-05, "loss": 2.4522, "step": 26793 }, { "crossentropy": 2.3721673488616943, "epoch": 0.9713602088167054, "grad_norm": 0.024967215955257416, "grad_norm_var": 4.960355404361067e-07, "learning_rate": 2.1058745054403085e-05, "loss": 2.2974, "step": 26794 }, { "crossentropy": 2.3215880393981934, "epoch": 0.9713964617169374, "grad_norm": 0.026807129383087158, "grad_norm_var": 4.532243189182712e-07, "learning_rate": 2.100550285523639e-05, "loss": 2.4222, "step": 26795 }, { "crossentropy": 2.29463791847229, "epoch": 0.9714327146171694, "grad_norm": 0.02531980350613594, "grad_norm_var": 4.872451252381042e-07, "learning_rate": 2.0952327904945613e-05, "loss": 2.3675, "step": 26796 }, { "crossentropy": 2.4425299167633057, "epoch": 0.9714689675174014, "grad_norm": 0.025716086849570274, "grad_norm_var": 4.893518343957916e-07, "learning_rate": 2.0899220204250722e-05, "loss": 2.4147, "step": 26797 }, { "crossentropy": 2.451166868209839, "epoch": 0.9715052204176334, "grad_norm": 0.02611144445836544, "grad_norm_var": 4.3043853185073935e-07, "learning_rate": 2.0846179753867268e-05, "loss": 2.4739, "step": 26798 }, { "crossentropy": 2.3981363773345947, "epoch": 0.9715414733178654, "grad_norm": 0.025648245587944984, "grad_norm_var": 4.466311497049498e-07, "learning_rate": 2.0793206554512446e-05, "loss": 2.3081, "step": 26799 }, { "crossentropy": 2.522695541381836, "epoch": 0.9715777262180975, "grad_norm": 0.026714112609624863, "grad_norm_var": 4.5690360855286474e-07, "learning_rate": 2.074030060690124e-05, "loss": 2.4497, "step": 26800 }, { "crossentropy": 2.542649984359741, "epoch": 0.9716139791183295, "grad_norm": 0.026661783456802368, "grad_norm_var": 4.7060738333146166e-07, "learning_rate": 2.068746191174864e-05, "loss": 2.5076, "step": 26801 }, { "crossentropy": 2.265821695327759, "epoch": 0.9716502320185615, "grad_norm": 0.02581029385328293, "grad_norm_var": 4.766086377670873e-07, "learning_rate": 2.0634690469768515e-05, "loss": 2.2923, "step": 26802 }, { "crossentropy": 2.320721387863159, "epoch": 0.9716864849187935, "grad_norm": 0.026924069970846176, "grad_norm_var": 4.987499557762189e-07, "learning_rate": 2.0581986281673626e-05, "loss": 2.407, "step": 26803 }, { "crossentropy": 2.3316709995269775, "epoch": 0.9717227378190255, "grad_norm": 0.025462143123149872, "grad_norm_var": 5.256104678218274e-07, "learning_rate": 2.0529349348175076e-05, "loss": 2.3897, "step": 26804 }, { "crossentropy": 2.4129655361175537, "epoch": 0.9717589907192575, "grad_norm": 0.026986965909600258, "grad_norm_var": 5.504190520519425e-07, "learning_rate": 2.0476779669984514e-05, "loss": 2.4623, "step": 26805 }, { "crossentropy": 2.343841791152954, "epoch": 0.9717952436194895, "grad_norm": 0.026401903480291367, "grad_norm_var": 3.680716006264235e-07, "learning_rate": 2.0424277247811375e-05, "loss": 2.3929, "step": 26806 }, { "crossentropy": 2.3726089000701904, "epoch": 0.9718314965197216, "grad_norm": 0.025892524048686028, "grad_norm_var": 3.705064818883159e-07, "learning_rate": 2.0371842082365645e-05, "loss": 2.3883, "step": 26807 }, { "crossentropy": 2.496821880340576, "epoch": 0.9718677494199536, "grad_norm": 0.02744186669588089, "grad_norm_var": 4.802401688201585e-07, "learning_rate": 2.0319474174354536e-05, "loss": 2.4997, "step": 26808 }, { "crossentropy": 2.4925193786621094, "epoch": 0.9719040023201856, "grad_norm": 0.026060793548822403, "grad_norm_var": 4.812830736034089e-07, "learning_rate": 2.0267173524485817e-05, "loss": 2.51, "step": 26809 }, { "crossentropy": 2.681678295135498, "epoch": 0.9719402552204176, "grad_norm": 0.02592542953789234, "grad_norm_var": 3.8335104803281087e-07, "learning_rate": 2.0214940133466698e-05, "loss": 2.5846, "step": 26810 }, { "crossentropy": 2.3535618782043457, "epoch": 0.9719765081206496, "grad_norm": 0.02802068553864956, "grad_norm_var": 5.667107393431675e-07, "learning_rate": 2.016277400200106e-05, "loss": 2.37, "step": 26811 }, { "crossentropy": 2.479196786880493, "epoch": 0.9720127610208816, "grad_norm": 0.025817006826400757, "grad_norm_var": 5.159451567666167e-07, "learning_rate": 2.0110675130795008e-05, "loss": 2.4426, "step": 26812 }, { "crossentropy": 2.3360238075256348, "epoch": 0.9720490139211136, "grad_norm": 0.026207976043224335, "grad_norm_var": 4.895110440006808e-07, "learning_rate": 2.0058643520551313e-05, "loss": 2.3697, "step": 26813 }, { "crossentropy": 2.347592830657959, "epoch": 0.9720852668213457, "grad_norm": 0.026243656873703003, "grad_norm_var": 4.858613861164248e-07, "learning_rate": 2.0006679171972742e-05, "loss": 2.4149, "step": 26814 }, { "crossentropy": 2.3143677711486816, "epoch": 0.9721215197215777, "grad_norm": 0.026202555745840073, "grad_norm_var": 4.5033841265662333e-07, "learning_rate": 1.9954782085761515e-05, "loss": 2.3967, "step": 26815 }, { "crossentropy": 2.4944205284118652, "epoch": 0.9721577726218097, "grad_norm": 0.027710409834980965, "grad_norm_var": 5.509998315344205e-07, "learning_rate": 1.9902952262618734e-05, "loss": 2.4534, "step": 26816 }, { "crossentropy": 2.3816330432891846, "epoch": 0.9721940255220418, "grad_norm": 0.02598668821156025, "grad_norm_var": 5.636282955023252e-07, "learning_rate": 1.9851189703243843e-05, "loss": 2.4276, "step": 26817 }, { "crossentropy": 2.5170135498046875, "epoch": 0.9722302784222738, "grad_norm": 0.026560867205262184, "grad_norm_var": 5.3547574361544e-07, "learning_rate": 1.9799494408336284e-05, "loss": 2.4991, "step": 26818 }, { "crossentropy": 2.362816333770752, "epoch": 0.9722665313225058, "grad_norm": 0.02538917399942875, "grad_norm_var": 5.939571072523127e-07, "learning_rate": 1.9747866378593825e-05, "loss": 2.3612, "step": 26819 }, { "crossentropy": 2.4190738201141357, "epoch": 0.9723027842227379, "grad_norm": 0.026380106806755066, "grad_norm_var": 5.325175758689899e-07, "learning_rate": 1.9696305614714805e-05, "loss": 2.4227, "step": 26820 }, { "crossentropy": 2.4463791847229004, "epoch": 0.9723390371229699, "grad_norm": 0.02712760865688324, "grad_norm_var": 5.43789699261711e-07, "learning_rate": 1.964481211739533e-05, "loss": 2.4199, "step": 26821 }, { "crossentropy": 2.3652548789978027, "epoch": 0.9723752900232019, "grad_norm": 0.026323288679122925, "grad_norm_var": 5.447909952681207e-07, "learning_rate": 1.959338588732984e-05, "loss": 2.4057, "step": 26822 }, { "crossentropy": 2.3019444942474365, "epoch": 0.9724115429234339, "grad_norm": 0.02655406855046749, "grad_norm_var": 5.224712020457927e-07, "learning_rate": 1.9542026925214452e-05, "loss": 2.3604, "step": 26823 }, { "crossentropy": 2.5057284832000732, "epoch": 0.9724477958236659, "grad_norm": 0.026738742366433144, "grad_norm_var": 4.647900935292789e-07, "learning_rate": 1.949073523174194e-05, "loss": 2.3867, "step": 26824 }, { "crossentropy": 2.236865758895874, "epoch": 0.9724840487238979, "grad_norm": 0.024798806756734848, "grad_norm_var": 6.30333972470587e-07, "learning_rate": 1.9439510807605088e-05, "loss": 2.2917, "step": 26825 }, { "crossentropy": 2.4442780017852783, "epoch": 0.97252030162413, "grad_norm": 0.026092808693647385, "grad_norm_var": 6.220698284043937e-07, "learning_rate": 1.9388353653495562e-05, "loss": 2.4253, "step": 26826 }, { "crossentropy": 2.418132781982422, "epoch": 0.972556554524362, "grad_norm": 0.02588622272014618, "grad_norm_var": 4.412088484233185e-07, "learning_rate": 1.933726377010503e-05, "loss": 2.4362, "step": 26827 }, { "crossentropy": 2.425455331802368, "epoch": 0.972592807424594, "grad_norm": 0.026441412046551704, "grad_norm_var": 4.294240345299825e-07, "learning_rate": 1.9286241158122942e-05, "loss": 2.4816, "step": 26828 }, { "crossentropy": 2.3659424781799316, "epoch": 0.972629060324826, "grad_norm": 0.026247873902320862, "grad_norm_var": 4.2908571978397295e-07, "learning_rate": 1.92352858182393e-05, "loss": 2.3742, "step": 26829 }, { "crossentropy": 2.4673445224761963, "epoch": 0.972665313225058, "grad_norm": 0.026169199496507645, "grad_norm_var": 4.2991977357736444e-07, "learning_rate": 1.9184397751141337e-05, "loss": 2.5302, "step": 26830 }, { "crossentropy": 2.3596460819244385, "epoch": 0.97270156612529, "grad_norm": 0.026486815884709358, "grad_norm_var": 4.3172721632932697e-07, "learning_rate": 1.913357695751683e-05, "loss": 2.3681, "step": 26831 }, { "crossentropy": 2.2424497604370117, "epoch": 0.972737819025522, "grad_norm": 0.026413029059767723, "grad_norm_var": 2.9396583624685437e-07, "learning_rate": 1.90828234380519e-05, "loss": 2.3172, "step": 26832 }, { "crossentropy": 2.5233914852142334, "epoch": 0.972774071925754, "grad_norm": 0.02711520530283451, "grad_norm_var": 3.3773515094309473e-07, "learning_rate": 1.9032137193432108e-05, "loss": 2.5455, "step": 26833 }, { "crossentropy": 2.484743118286133, "epoch": 0.9728103248259861, "grad_norm": 0.026853229850530624, "grad_norm_var": 3.53428605188175e-07, "learning_rate": 1.8981518224342463e-05, "loss": 2.4115, "step": 26834 }, { "crossentropy": 2.551198720932007, "epoch": 0.9728465777262181, "grad_norm": 0.026205474510788918, "grad_norm_var": 2.9446072322308834e-07, "learning_rate": 1.893096653146631e-05, "loss": 2.4919, "step": 26835 }, { "crossentropy": 2.46028995513916, "epoch": 0.9728828306264501, "grad_norm": 0.026436960324645042, "grad_norm_var": 2.9478015329583115e-07, "learning_rate": 1.888048211548643e-05, "loss": 2.3546, "step": 26836 }, { "crossentropy": 2.5067174434661865, "epoch": 0.9729190835266821, "grad_norm": 0.026602642610669136, "grad_norm_var": 2.5884733702943344e-07, "learning_rate": 1.883006497708506e-05, "loss": 2.5653, "step": 26837 }, { "crossentropy": 2.310147285461426, "epoch": 0.9729553364269141, "grad_norm": 0.02624376490712166, "grad_norm_var": 2.5937059694145353e-07, "learning_rate": 1.877971511694221e-05, "loss": 2.4015, "step": 26838 }, { "crossentropy": 2.2662746906280518, "epoch": 0.9729915893271461, "grad_norm": 0.025390448048710823, "grad_norm_var": 3.092929606878277e-07, "learning_rate": 1.8729432535739555e-05, "loss": 2.3899, "step": 26839 }, { "crossentropy": 2.3649661540985107, "epoch": 0.9730278422273781, "grad_norm": 0.026466263458132744, "grad_norm_var": 2.9645545850154736e-07, "learning_rate": 1.8679217234154332e-05, "loss": 2.4125, "step": 26840 }, { "crossentropy": 2.483383893966675, "epoch": 0.9730640951276102, "grad_norm": 0.026106039062142372, "grad_norm_var": 1.5195174257418934e-07, "learning_rate": 1.8629069212866e-05, "loss": 2.3969, "step": 26841 }, { "crossentropy": 2.304542064666748, "epoch": 0.9731003480278422, "grad_norm": 0.027239780873060226, "grad_norm_var": 1.9907165904655317e-07, "learning_rate": 1.8578988472551794e-05, "loss": 2.4044, "step": 26842 }, { "crossentropy": 2.493215322494507, "epoch": 0.9731366009280742, "grad_norm": 0.025411829352378845, "grad_norm_var": 2.45256813591795e-07, "learning_rate": 1.852897501388784e-05, "loss": 2.4503, "step": 26843 }, { "crossentropy": 2.383329391479492, "epoch": 0.9731728538283063, "grad_norm": 0.025342363864183426, "grad_norm_var": 3.0946171840604404e-07, "learning_rate": 1.8479028837549706e-05, "loss": 2.3969, "step": 26844 }, { "crossentropy": 2.4816415309906006, "epoch": 0.9732091067285383, "grad_norm": 0.0265353936702013, "grad_norm_var": 3.1279565432430997e-07, "learning_rate": 1.8429149944211855e-05, "loss": 2.4649, "step": 26845 }, { "crossentropy": 2.308391809463501, "epoch": 0.9732453596287703, "grad_norm": 0.02568867616355419, "grad_norm_var": 3.364821443513673e-07, "learning_rate": 1.837933833454819e-05, "loss": 2.3921, "step": 26846 }, { "crossentropy": 2.419188976287842, "epoch": 0.9732816125290024, "grad_norm": 0.025501657277345657, "grad_norm_var": 3.704500218882726e-07, "learning_rate": 1.8329594009232065e-05, "loss": 2.4173, "step": 26847 }, { "crossentropy": 2.417696475982666, "epoch": 0.9733178654292344, "grad_norm": 0.026330938562750816, "grad_norm_var": 3.6878082936795766e-07, "learning_rate": 1.827991696893405e-05, "loss": 2.4393, "step": 26848 }, { "crossentropy": 2.252023696899414, "epoch": 0.9733541183294664, "grad_norm": 0.02578844130039215, "grad_norm_var": 3.1989082538024364e-07, "learning_rate": 1.8230307214325837e-05, "loss": 2.4164, "step": 26849 }, { "crossentropy": 2.3983161449432373, "epoch": 0.9733903712296984, "grad_norm": 0.027077961713075638, "grad_norm_var": 3.4459871295377566e-07, "learning_rate": 1.8180764746077437e-05, "loss": 2.44, "step": 26850 }, { "crossentropy": 2.4703502655029297, "epoch": 0.9734266241299304, "grad_norm": 0.026801178231835365, "grad_norm_var": 3.713395272474425e-07, "learning_rate": 1.813128956485832e-05, "loss": 2.4929, "step": 26851 }, { "crossentropy": 2.48112416267395, "epoch": 0.9734628770301624, "grad_norm": 0.026222553104162216, "grad_norm_var": 3.670174857342903e-07, "learning_rate": 1.808188167133573e-05, "loss": 2.4608, "step": 26852 }, { "crossentropy": 2.5049984455108643, "epoch": 0.9734991299303944, "grad_norm": 0.026712436228990555, "grad_norm_var": 3.7407703396608905e-07, "learning_rate": 1.8032541066178575e-05, "loss": 2.455, "step": 26853 }, { "crossentropy": 2.373196601867676, "epoch": 0.9735353828306265, "grad_norm": 0.02686537243425846, "grad_norm_var": 4.036166998603111e-07, "learning_rate": 1.7983267750051325e-05, "loss": 2.3618, "step": 26854 }, { "crossentropy": 2.3932461738586426, "epoch": 0.9735716357308585, "grad_norm": 0.02545713074505329, "grad_norm_var": 3.9654052975904993e-07, "learning_rate": 1.793406172362122e-05, "loss": 2.448, "step": 26855 }, { "crossentropy": 2.3260464668273926, "epoch": 0.9736078886310905, "grad_norm": 0.02629699744284153, "grad_norm_var": 3.928128630497722e-07, "learning_rate": 1.788492298755218e-05, "loss": 2.4371, "step": 26856 }, { "crossentropy": 2.2704052925109863, "epoch": 0.9736441415313225, "grad_norm": 0.02579275518655777, "grad_norm_var": 4.033385592954921e-07, "learning_rate": 1.7835851542507553e-05, "loss": 2.3365, "step": 26857 }, { "crossentropy": 2.380642890930176, "epoch": 0.9736803944315545, "grad_norm": 0.0262460820376873, "grad_norm_var": 3.261754718337223e-07, "learning_rate": 1.7786847389150706e-05, "loss": 2.4604, "step": 26858 }, { "crossentropy": 2.4748873710632324, "epoch": 0.9737166473317865, "grad_norm": 0.026360588148236275, "grad_norm_var": 2.916500862146694e-07, "learning_rate": 1.773791052814333e-05, "loss": 2.5653, "step": 26859 }, { "crossentropy": 2.3001139163970947, "epoch": 0.9737529002320185, "grad_norm": 0.02604931779205799, "grad_norm_var": 2.4310267732531065e-07, "learning_rate": 1.7689040960146007e-05, "loss": 2.3275, "step": 26860 }, { "crossentropy": 2.601052761077881, "epoch": 0.9737891531322506, "grad_norm": 0.02677873708307743, "grad_norm_var": 2.566161338030896e-07, "learning_rate": 1.764023868581932e-05, "loss": 2.497, "step": 26861 }, { "crossentropy": 2.3007748126983643, "epoch": 0.9738254060324826, "grad_norm": 0.025130027905106544, "grad_norm_var": 3.1779680868385985e-07, "learning_rate": 1.7591503705822188e-05, "loss": 2.3228, "step": 26862 }, { "crossentropy": 2.3977177143096924, "epoch": 0.9738616589327146, "grad_norm": 0.02538858912885189, "grad_norm_var": 3.2932379536252527e-07, "learning_rate": 1.7542836020812415e-05, "loss": 2.3498, "step": 26863 }, { "crossentropy": 2.506685256958008, "epoch": 0.9738979118329466, "grad_norm": 0.02625429816544056, "grad_norm_var": 3.284161769999405e-07, "learning_rate": 1.7494235631448363e-05, "loss": 2.4253, "step": 26864 }, { "crossentropy": 2.53511118888855, "epoch": 0.9739341647331786, "grad_norm": 0.027458174154162407, "grad_norm_var": 4.1072830958912135e-07, "learning_rate": 1.744570253838562e-05, "loss": 2.4106, "step": 26865 }, { "crossentropy": 2.378087282180786, "epoch": 0.9739704176334106, "grad_norm": 0.027021324262022972, "grad_norm_var": 4.0509741088985234e-07, "learning_rate": 1.739723674228033e-05, "loss": 2.3912, "step": 26866 }, { "crossentropy": 2.358065128326416, "epoch": 0.9740066705336426, "grad_norm": 0.026318328455090523, "grad_norm_var": 3.8754615815835465e-07, "learning_rate": 1.7348838243786412e-05, "loss": 2.4359, "step": 26867 }, { "crossentropy": 2.349536180496216, "epoch": 0.9740429234338747, "grad_norm": 0.027489231899380684, "grad_norm_var": 4.794672265242266e-07, "learning_rate": 1.730050704355779e-05, "loss": 2.3459, "step": 26868 }, { "crossentropy": 2.3444247245788574, "epoch": 0.9740791763341067, "grad_norm": 0.026203414425253868, "grad_norm_var": 4.711450378875233e-07, "learning_rate": 1.7252243142247825e-05, "loss": 2.3939, "step": 26869 }, { "crossentropy": 2.3247923851013184, "epoch": 0.9741154292343387, "grad_norm": 0.02604704722762108, "grad_norm_var": 4.5342726993877143e-07, "learning_rate": 1.7204046540507667e-05, "loss": 2.4225, "step": 26870 }, { "crossentropy": 2.2891464233398438, "epoch": 0.9741516821345708, "grad_norm": 0.025957541540265083, "grad_norm_var": 4.1495872727495493e-07, "learning_rate": 1.7155917238987906e-05, "loss": 2.3063, "step": 26871 }, { "crossentropy": 2.547879219055176, "epoch": 0.9741879350348028, "grad_norm": 0.026340672746300697, "grad_norm_var": 4.1506320915851656e-07, "learning_rate": 1.710785523834024e-05, "loss": 2.481, "step": 26872 }, { "crossentropy": 2.413264751434326, "epoch": 0.9742241879350348, "grad_norm": 0.0255292896181345, "grad_norm_var": 4.372997886178375e-07, "learning_rate": 1.7059860539211934e-05, "loss": 2.442, "step": 26873 }, { "crossentropy": 2.5625386238098145, "epoch": 0.9742604408352669, "grad_norm": 0.02776685357093811, "grad_norm_var": 5.73794540360828e-07, "learning_rate": 1.7011933142252467e-05, "loss": 2.4888, "step": 26874 }, { "crossentropy": 2.5067880153656006, "epoch": 0.9742966937354989, "grad_norm": 0.02633032202720642, "grad_norm_var": 5.739335177623912e-07, "learning_rate": 1.6964073048109098e-05, "loss": 2.5463, "step": 26875 }, { "crossentropy": 2.3776028156280518, "epoch": 0.9743329466357309, "grad_norm": 0.02746783383190632, "grad_norm_var": 6.373504638584737e-07, "learning_rate": 1.691628025742742e-05, "loss": 2.3795, "step": 26876 }, { "crossentropy": 2.4739534854888916, "epoch": 0.9743691995359629, "grad_norm": 0.02658770978450775, "grad_norm_var": 6.317065582641645e-07, "learning_rate": 1.686855477085414e-05, "loss": 2.5136, "step": 26877 }, { "crossentropy": 2.3528144359588623, "epoch": 0.9744054524361949, "grad_norm": 0.026274118572473526, "grad_norm_var": 5.112954750439846e-07, "learning_rate": 1.682089658903263e-05, "loss": 2.4271, "step": 26878 }, { "crossentropy": 2.4968667030334473, "epoch": 0.9744417053364269, "grad_norm": 0.0265993382781744, "grad_norm_var": 4.191099960686029e-07, "learning_rate": 1.6773305712606824e-05, "loss": 2.4546, "step": 26879 }, { "crossentropy": 2.4450488090515137, "epoch": 0.974477958236659, "grad_norm": 0.02579377591609955, "grad_norm_var": 4.537667698015571e-07, "learning_rate": 1.6725782142220646e-05, "loss": 2.309, "step": 26880 }, { "crossentropy": 2.3062570095062256, "epoch": 0.974514211136891, "grad_norm": 0.027912989258766174, "grad_norm_var": 5.203097195649453e-07, "learning_rate": 1.66783258785147e-05, "loss": 2.3949, "step": 26881 }, { "crossentropy": 2.4322023391723633, "epoch": 0.974550464037123, "grad_norm": 0.026582535356283188, "grad_norm_var": 5.07839046392083e-07, "learning_rate": 1.663093692213069e-05, "loss": 2.4175, "step": 26882 }, { "crossentropy": 2.4810893535614014, "epoch": 0.974586716937355, "grad_norm": 0.02652048133313656, "grad_norm_var": 5.034732169883575e-07, "learning_rate": 1.6583615273708108e-05, "loss": 2.4467, "step": 26883 }, { "crossentropy": 2.4713354110717773, "epoch": 0.974622969837587, "grad_norm": 0.02696647122502327, "grad_norm_var": 4.577148863125208e-07, "learning_rate": 1.6536360933886442e-05, "loss": 2.5038, "step": 26884 }, { "crossentropy": 2.39323091506958, "epoch": 0.974659222737819, "grad_norm": 0.026309331879019737, "grad_norm_var": 4.5345048809556675e-07, "learning_rate": 1.6489173903304067e-05, "loss": 2.402, "step": 26885 }, { "crossentropy": 2.3986363410949707, "epoch": 0.974695475638051, "grad_norm": 0.02601523883640766, "grad_norm_var": 4.5569619217996257e-07, "learning_rate": 1.6442054182598254e-05, "loss": 2.3129, "step": 26886 }, { "crossentropy": 2.3510308265686035, "epoch": 0.974731728538283, "grad_norm": 0.025344595313072205, "grad_norm_var": 5.283861790890265e-07, "learning_rate": 1.639500177240516e-05, "loss": 2.3965, "step": 26887 }, { "crossentropy": 2.378946304321289, "epoch": 0.9747679814385151, "grad_norm": 0.026814667508006096, "grad_norm_var": 5.310096101658006e-07, "learning_rate": 1.634801667336039e-05, "loss": 2.3914, "step": 26888 }, { "crossentropy": 2.5871400833129883, "epoch": 0.9748042343387471, "grad_norm": 0.027513742446899414, "grad_norm_var": 5.06807187030528e-07, "learning_rate": 1.6301098886098432e-05, "loss": 2.51, "step": 26889 }, { "crossentropy": 2.4710488319396973, "epoch": 0.9748404872389791, "grad_norm": 0.02639567106962204, "grad_norm_var": 4.2469868508782094e-07, "learning_rate": 1.625424841125378e-05, "loss": 2.4742, "step": 26890 }, { "crossentropy": 2.4393162727355957, "epoch": 0.9748767401392111, "grad_norm": 0.026427172124385834, "grad_norm_var": 4.2194064090658733e-07, "learning_rate": 1.6207465249458153e-05, "loss": 2.3848, "step": 26891 }, { "crossentropy": 2.4467577934265137, "epoch": 0.9749129930394431, "grad_norm": 0.026905927807092667, "grad_norm_var": 3.763074407212106e-07, "learning_rate": 1.616074940134382e-05, "loss": 2.4475, "step": 26892 }, { "crossentropy": 2.3344993591308594, "epoch": 0.9749492459396751, "grad_norm": 0.025777621194720268, "grad_norm_var": 4.143551049599903e-07, "learning_rate": 1.6114100867541948e-05, "loss": 2.3974, "step": 26893 }, { "crossentropy": 2.243907928466797, "epoch": 0.9749854988399071, "grad_norm": 0.02670113928616047, "grad_norm_var": 4.1234410212662105e-07, "learning_rate": 1.606751964868258e-05, "loss": 2.2821, "step": 26894 }, { "crossentropy": 2.3016245365142822, "epoch": 0.9750217517401392, "grad_norm": 0.026480544358491898, "grad_norm_var": 4.122275264653007e-07, "learning_rate": 1.6021005745394667e-05, "loss": 2.3368, "step": 26895 }, { "crossentropy": 2.391772985458374, "epoch": 0.9750580046403712, "grad_norm": 0.02603819966316223, "grad_norm_var": 3.9200489931236436e-07, "learning_rate": 1.597455915830659e-05, "loss": 2.3872, "step": 26896 }, { "crossentropy": 2.4231486320495605, "epoch": 0.9750942575406032, "grad_norm": 0.026570815593004227, "grad_norm_var": 2.5963081827252523e-07, "learning_rate": 1.592817988804507e-05, "loss": 2.4516, "step": 26897 }, { "crossentropy": 2.3559317588806152, "epoch": 0.9751305104408353, "grad_norm": 0.025493977591395378, "grad_norm_var": 3.1594347652967995e-07, "learning_rate": 1.5881867935237938e-05, "loss": 2.435, "step": 26898 }, { "crossentropy": 2.5315985679626465, "epoch": 0.9751667633410673, "grad_norm": 0.025538846850395203, "grad_norm_var": 3.5938206640878425e-07, "learning_rate": 1.5835623300509693e-05, "loss": 2.5055, "step": 26899 }, { "crossentropy": 2.285912275314331, "epoch": 0.9752030162412993, "grad_norm": 0.02609105408191681, "grad_norm_var": 3.3309074884737185e-07, "learning_rate": 1.578944598448484e-05, "loss": 2.3278, "step": 26900 }, { "crossentropy": 2.452881336212158, "epoch": 0.9752392691415314, "grad_norm": 0.025458328425884247, "grad_norm_var": 3.745896587692277e-07, "learning_rate": 1.574333598778732e-05, "loss": 2.4446, "step": 26901 }, { "crossentropy": 2.4173202514648438, "epoch": 0.9752755220417634, "grad_norm": 0.025387749075889587, "grad_norm_var": 4.1657862748141947e-07, "learning_rate": 1.5697293311039972e-05, "loss": 2.4178, "step": 26902 }, { "crossentropy": 2.42018461227417, "epoch": 0.9753117749419954, "grad_norm": 0.0265653133392334, "grad_norm_var": 3.7312979200473014e-07, "learning_rate": 1.5651317954864517e-05, "loss": 2.4353, "step": 26903 }, { "crossentropy": 2.3764536380767822, "epoch": 0.9753480278422274, "grad_norm": 0.02587095834314823, "grad_norm_var": 3.5900489605133803e-07, "learning_rate": 1.560540991988213e-05, "loss": 2.416, "step": 26904 }, { "crossentropy": 2.5018739700317383, "epoch": 0.9753842807424594, "grad_norm": 0.026029499247670174, "grad_norm_var": 2.369135996114439e-07, "learning_rate": 1.555956920671231e-05, "loss": 2.3516, "step": 26905 }, { "crossentropy": 2.50669264793396, "epoch": 0.9754205336426914, "grad_norm": 0.026263616979122162, "grad_norm_var": 2.3294370878842486e-07, "learning_rate": 1.5513795815975675e-05, "loss": 2.4808, "step": 26906 }, { "crossentropy": 2.4018611907958984, "epoch": 0.9754567865429234, "grad_norm": 0.0251110028475523, "grad_norm_var": 2.838057493620774e-07, "learning_rate": 1.5468089748288393e-05, "loss": 2.4102, "step": 26907 }, { "crossentropy": 2.5244507789611816, "epoch": 0.9754930394431555, "grad_norm": 0.026596304029226303, "grad_norm_var": 2.531321682656846e-07, "learning_rate": 1.542245100426942e-05, "loss": 2.506, "step": 26908 }, { "crossentropy": 2.466475248336792, "epoch": 0.9755292923433875, "grad_norm": 0.02576593868434429, "grad_norm_var": 2.5348465400257954e-07, "learning_rate": 1.537687958453493e-05, "loss": 2.4586, "step": 26909 }, { "crossentropy": 2.076565980911255, "epoch": 0.9755655452436195, "grad_norm": 0.025533514097332954, "grad_norm_var": 2.2918100989147843e-07, "learning_rate": 1.533137548969943e-05, "loss": 2.249, "step": 26910 }, { "crossentropy": 2.531670331954956, "epoch": 0.9756017981438515, "grad_norm": 0.026380931958556175, "grad_norm_var": 2.2241902671858658e-07, "learning_rate": 1.5285938720378533e-05, "loss": 2.4548, "step": 26911 }, { "crossentropy": 2.3370699882507324, "epoch": 0.9756380510440835, "grad_norm": 0.02664208970963955, "grad_norm_var": 2.548495283170744e-07, "learning_rate": 1.5240569277185645e-05, "loss": 2.381, "step": 26912 }, { "crossentropy": 2.3967676162719727, "epoch": 0.9756743039443155, "grad_norm": 0.026161814108490944, "grad_norm_var": 2.317900331413967e-07, "learning_rate": 1.5195267160733605e-05, "loss": 2.3382, "step": 26913 }, { "crossentropy": 2.452613353729248, "epoch": 0.9757105568445475, "grad_norm": 0.0260881669819355, "grad_norm_var": 2.192581952544422e-07, "learning_rate": 1.5150032371634149e-05, "loss": 2.3516, "step": 26914 }, { "crossentropy": 2.4590353965759277, "epoch": 0.9757468097447796, "grad_norm": 0.02570316195487976, "grad_norm_var": 2.115474133868034e-07, "learning_rate": 1.5104864910497895e-05, "loss": 2.4637, "step": 26915 }, { "crossentropy": 2.4724488258361816, "epoch": 0.9757830626450116, "grad_norm": 0.02631358988583088, "grad_norm_var": 2.1799434975880844e-07, "learning_rate": 1.505976477793547e-05, "loss": 2.4596, "step": 26916 }, { "crossentropy": 2.3610153198242188, "epoch": 0.9758193155452436, "grad_norm": 0.02667434699833393, "grad_norm_var": 2.238861060575743e-07, "learning_rate": 1.5014731974556383e-05, "loss": 2.3977, "step": 26917 }, { "crossentropy": 2.5586633682250977, "epoch": 0.9758555684454756, "grad_norm": 0.026212725788354874, "grad_norm_var": 1.9159728968607774e-07, "learning_rate": 1.496976650096793e-05, "loss": 2.4795, "step": 26918 }, { "crossentropy": 2.2443082332611084, "epoch": 0.9758918213457076, "grad_norm": 0.02530827187001705, "grad_norm_var": 2.1564630046110438e-07, "learning_rate": 1.4924868357777399e-05, "loss": 2.2671, "step": 26919 }, { "crossentropy": 2.4661712646484375, "epoch": 0.9759280742459396, "grad_norm": 0.026547323912382126, "grad_norm_var": 2.2890386471690246e-07, "learning_rate": 1.4880037545592084e-05, "loss": 2.3748, "step": 26920 }, { "crossentropy": 2.5713298320770264, "epoch": 0.9759643271461717, "grad_norm": 0.027767321094870567, "grad_norm_var": 4.0519651792788e-07, "learning_rate": 1.4835274065017057e-05, "loss": 2.4842, "step": 26921 }, { "crossentropy": 2.5678532123565674, "epoch": 0.9760005800464037, "grad_norm": 0.026719490066170692, "grad_norm_var": 4.225455239333416e-07, "learning_rate": 1.4790577916656834e-05, "loss": 2.5879, "step": 26922 }, { "crossentropy": 2.3501152992248535, "epoch": 0.9760368329466357, "grad_norm": 0.025294285267591476, "grad_norm_var": 3.9753460591608844e-07, "learning_rate": 1.4745949101115374e-05, "loss": 2.3358, "step": 26923 }, { "crossentropy": 2.6368751525878906, "epoch": 0.9760730858468677, "grad_norm": 0.02575615420937538, "grad_norm_var": 4.008219412552153e-07, "learning_rate": 1.4701387618994977e-05, "loss": 2.4667, "step": 26924 }, { "crossentropy": 2.422650098800659, "epoch": 0.9761093387470998, "grad_norm": 0.02605135180056095, "grad_norm_var": 3.901819560859591e-07, "learning_rate": 1.4656893470897936e-05, "loss": 2.4051, "step": 26925 }, { "crossentropy": 2.4340715408325195, "epoch": 0.9761455916473318, "grad_norm": 0.026064876466989517, "grad_norm_var": 3.6081047435468767e-07, "learning_rate": 1.4612466657424883e-05, "loss": 2.4104, "step": 26926 }, { "crossentropy": 2.538261651992798, "epoch": 0.9761818445475638, "grad_norm": 0.02675817534327507, "grad_norm_var": 3.772782001298473e-07, "learning_rate": 1.4568107179176448e-05, "loss": 2.5091, "step": 26927 }, { "crossentropy": 2.4049160480499268, "epoch": 0.9762180974477959, "grad_norm": 0.026347264647483826, "grad_norm_var": 3.674529045057425e-07, "learning_rate": 1.4523815036751043e-05, "loss": 2.4913, "step": 26928 }, { "crossentropy": 2.428572177886963, "epoch": 0.9762543503480279, "grad_norm": 0.026372963562607765, "grad_norm_var": 3.68164348388715e-07, "learning_rate": 1.447959023074763e-05, "loss": 2.3883, "step": 26929 }, { "crossentropy": 2.5499320030212402, "epoch": 0.9762906032482599, "grad_norm": 0.025684459134936333, "grad_norm_var": 3.8699263304967787e-07, "learning_rate": 1.4435432761762957e-05, "loss": 2.54, "step": 26930 }, { "crossentropy": 2.41502046585083, "epoch": 0.9763268561484919, "grad_norm": 0.026006095111370087, "grad_norm_var": 3.7171174541248547e-07, "learning_rate": 1.4391342630393211e-05, "loss": 2.4491, "step": 26931 }, { "crossentropy": 2.395530939102173, "epoch": 0.9763631090487239, "grad_norm": 0.026682842522859573, "grad_norm_var": 3.837374969553695e-07, "learning_rate": 1.4347319837234585e-05, "loss": 2.4288, "step": 26932 }, { "crossentropy": 2.541891574859619, "epoch": 0.9763993619489559, "grad_norm": 0.026088137179613113, "grad_norm_var": 3.732588472760608e-07, "learning_rate": 1.43033643828816e-05, "loss": 2.5003, "step": 26933 }, { "crossentropy": 2.3162002563476562, "epoch": 0.976435614849188, "grad_norm": 0.02615027315914631, "grad_norm_var": 3.7363695635446854e-07, "learning_rate": 1.4259476267927674e-05, "loss": 2.3666, "step": 26934 }, { "crossentropy": 2.4831814765930176, "epoch": 0.97647186774942, "grad_norm": 0.026006389409303665, "grad_norm_var": 3.187704219327509e-07, "learning_rate": 1.4215655492965662e-05, "loss": 2.4339, "step": 26935 }, { "crossentropy": 2.5207996368408203, "epoch": 0.976508120649652, "grad_norm": 0.026840494945645332, "grad_norm_var": 3.350379022341489e-07, "learning_rate": 1.4171902058587315e-05, "loss": 2.4904, "step": 26936 }, { "crossentropy": 2.386598825454712, "epoch": 0.976544373549884, "grad_norm": 0.026548080146312714, "grad_norm_var": 1.872836140073252e-07, "learning_rate": 1.4128215965383829e-05, "loss": 2.4562, "step": 26937 }, { "crossentropy": 2.430107355117798, "epoch": 0.976580626450116, "grad_norm": 0.026295095682144165, "grad_norm_var": 1.6975064346020213e-07, "learning_rate": 1.4084597213945283e-05, "loss": 2.405, "step": 26938 }, { "crossentropy": 2.5155742168426514, "epoch": 0.976616879350348, "grad_norm": 0.02625023014843464, "grad_norm_var": 1.1343921498564262e-07, "learning_rate": 1.4041045804860098e-05, "loss": 2.3098, "step": 26939 }, { "crossentropy": 2.367851734161377, "epoch": 0.97665313225058, "grad_norm": 0.025671958923339844, "grad_norm_var": 1.1935805993562355e-07, "learning_rate": 1.3997561738717247e-05, "loss": 2.4081, "step": 26940 }, { "crossentropy": 2.4049887657165527, "epoch": 0.976689385150812, "grad_norm": 0.026425082236528397, "grad_norm_var": 1.1875360980150886e-07, "learning_rate": 1.395414501610459e-05, "loss": 2.5421, "step": 26941 }, { "crossentropy": 2.2264246940612793, "epoch": 0.9767256380510441, "grad_norm": 0.02567053958773613, "grad_norm_var": 1.388382443531875e-07, "learning_rate": 1.3910795637606665e-05, "loss": 2.3867, "step": 26942 }, { "crossentropy": 2.4454610347747803, "epoch": 0.9767618909512761, "grad_norm": 0.026003004983067513, "grad_norm_var": 1.220423364362358e-07, "learning_rate": 1.3867513603810777e-05, "loss": 2.4419, "step": 26943 }, { "crossentropy": 2.515538454055786, "epoch": 0.9767981438515081, "grad_norm": 0.026440396904945374, "grad_norm_var": 1.245350328790117e-07, "learning_rate": 1.3824298915300903e-05, "loss": 2.4854, "step": 26944 }, { "crossentropy": 2.5244977474212646, "epoch": 0.9768343967517401, "grad_norm": 0.02573291026055813, "grad_norm_var": 1.3503738163061126e-07, "learning_rate": 1.3781151572659911e-05, "loss": 2.5069, "step": 26945 }, { "crossentropy": 2.38495135307312, "epoch": 0.9768706496519721, "grad_norm": 0.02574877068400383, "grad_norm_var": 1.312524821380495e-07, "learning_rate": 1.3738071576472334e-05, "loss": 2.4036, "step": 26946 }, { "crossentropy": 2.4117205142974854, "epoch": 0.9769069025522041, "grad_norm": 0.027161210775375366, "grad_norm_var": 1.9093911218097335e-07, "learning_rate": 1.3695058927318261e-05, "loss": 2.3502, "step": 26947 }, { "crossentropy": 2.4904532432556152, "epoch": 0.9769431554524362, "grad_norm": 0.027107659727334976, "grad_norm_var": 2.2774312183960973e-07, "learning_rate": 1.365211362577945e-05, "loss": 2.4745, "step": 26948 }, { "crossentropy": 2.6034862995147705, "epoch": 0.9769794083526682, "grad_norm": 0.026782497763633728, "grad_norm_var": 2.420797265859174e-07, "learning_rate": 1.3609235672436549e-05, "loss": 2.4853, "step": 26949 }, { "crossentropy": 2.4927027225494385, "epoch": 0.9770156612529002, "grad_norm": 0.026758814230561256, "grad_norm_var": 2.529007696592019e-07, "learning_rate": 1.3566425067867427e-05, "loss": 2.5313, "step": 26950 }, { "crossentropy": 2.4326603412628174, "epoch": 0.9770519141531323, "grad_norm": 0.026849305257201195, "grad_norm_var": 2.597913468108462e-07, "learning_rate": 1.3523681812651067e-05, "loss": 2.3941, "step": 26951 }, { "crossentropy": 2.4110870361328125, "epoch": 0.9770881670533643, "grad_norm": 0.02615930885076523, "grad_norm_var": 2.48137551738534e-07, "learning_rate": 1.3481005907364785e-05, "loss": 2.424, "step": 26952 }, { "crossentropy": 2.425549030303955, "epoch": 0.9771244199535963, "grad_norm": 0.02577493153512478, "grad_norm_var": 2.6510944205771864e-07, "learning_rate": 1.3438397352584786e-05, "loss": 2.4477, "step": 26953 }, { "crossentropy": 2.423156261444092, "epoch": 0.9771606728538283, "grad_norm": 0.025172453373670578, "grad_norm_var": 3.449106364386903e-07, "learning_rate": 1.339585614888672e-05, "loss": 2.4867, "step": 26954 }, { "crossentropy": 2.3028013706207275, "epoch": 0.9771969257540604, "grad_norm": 0.026401879265904427, "grad_norm_var": 3.4672028496277735e-07, "learning_rate": 1.3353382296844574e-05, "loss": 2.3971, "step": 26955 }, { "crossentropy": 2.3219127655029297, "epoch": 0.9772331786542924, "grad_norm": 0.025263020768761635, "grad_norm_var": 3.882152999817723e-07, "learning_rate": 1.3310975797032887e-05, "loss": 2.361, "step": 26956 }, { "crossentropy": 2.4068586826324463, "epoch": 0.9772694315545244, "grad_norm": 0.02598213218152523, "grad_norm_var": 3.8811414433866563e-07, "learning_rate": 1.326863665002398e-05, "loss": 2.4268, "step": 26957 }, { "crossentropy": 2.3856751918792725, "epoch": 0.9773056844547564, "grad_norm": 0.02547411620616913, "grad_norm_var": 4.0407907358619204e-07, "learning_rate": 1.3226364856390172e-05, "loss": 2.342, "step": 26958 }, { "crossentropy": 2.5066373348236084, "epoch": 0.9773419373549884, "grad_norm": 0.025342168286442757, "grad_norm_var": 4.465962507632995e-07, "learning_rate": 1.3184160416701008e-05, "loss": 2.4687, "step": 26959 }, { "crossentropy": 2.441525936126709, "epoch": 0.9773781902552204, "grad_norm": 0.027632657438516617, "grad_norm_var": 5.840711273942534e-07, "learning_rate": 1.3142023331528252e-05, "loss": 2.38, "step": 26960 }, { "crossentropy": 2.3837692737579346, "epoch": 0.9774144431554525, "grad_norm": 0.026555493474006653, "grad_norm_var": 5.741459868461533e-07, "learning_rate": 1.309995360144034e-05, "loss": 2.39, "step": 26961 }, { "crossentropy": 2.399984121322632, "epoch": 0.9774506960556845, "grad_norm": 0.025832319632172585, "grad_norm_var": 5.68882770891436e-07, "learning_rate": 1.3057951227005149e-05, "loss": 2.4037, "step": 26962 }, { "crossentropy": 2.512557029724121, "epoch": 0.9774869489559165, "grad_norm": 0.02652362547814846, "grad_norm_var": 5.181548167759818e-07, "learning_rate": 1.301601620879056e-05, "loss": 2.5135, "step": 26963 }, { "crossentropy": 2.4247889518737793, "epoch": 0.9775232018561485, "grad_norm": 0.0256936177611351, "grad_norm_var": 4.768546877865666e-07, "learning_rate": 1.297414854736223e-05, "loss": 2.4459, "step": 26964 }, { "crossentropy": 2.4100217819213867, "epoch": 0.9775594547563805, "grad_norm": 0.026305582374334335, "grad_norm_var": 4.50049033195648e-07, "learning_rate": 1.2932348243286373e-05, "loss": 2.3645, "step": 26965 }, { "crossentropy": 2.3789799213409424, "epoch": 0.9775957076566125, "grad_norm": 0.026302631944417953, "grad_norm_var": 4.234451116911138e-07, "learning_rate": 1.289061529712754e-05, "loss": 2.3844, "step": 26966 }, { "crossentropy": 2.3974618911743164, "epoch": 0.9776319605568445, "grad_norm": 0.025469861924648285, "grad_norm_var": 4.007094107024428e-07, "learning_rate": 1.2848949709449164e-05, "loss": 2.3637, "step": 26967 }, { "crossentropy": 2.3416757583618164, "epoch": 0.9776682134570766, "grad_norm": 0.025825226679444313, "grad_norm_var": 4.0027085872097974e-07, "learning_rate": 1.2807351480813578e-05, "loss": 2.3974, "step": 26968 }, { "crossentropy": 2.584054946899414, "epoch": 0.9777044663573086, "grad_norm": 0.026066822931170464, "grad_norm_var": 3.9792690199226776e-07, "learning_rate": 1.2765820611783108e-05, "loss": 2.4598, "step": 26969 }, { "crossentropy": 2.3718080520629883, "epoch": 0.9777407192575406, "grad_norm": 0.026869719848036766, "grad_norm_var": 3.929078419791088e-07, "learning_rate": 1.272435710291897e-05, "loss": 2.4192, "step": 26970 }, { "crossentropy": 2.3143885135650635, "epoch": 0.9777769721577726, "grad_norm": 0.025620141997933388, "grad_norm_var": 3.9925188387682165e-07, "learning_rate": 1.268296095478072e-05, "loss": 2.3353, "step": 26971 }, { "crossentropy": 2.3554255962371826, "epoch": 0.9778132250580046, "grad_norm": 0.02597196027636528, "grad_norm_var": 3.565160632173584e-07, "learning_rate": 1.2641632167927353e-05, "loss": 2.3517, "step": 26972 }, { "crossentropy": 2.4070656299591064, "epoch": 0.9778494779582366, "grad_norm": 0.02729920670390129, "grad_norm_var": 4.456830553220866e-07, "learning_rate": 1.2600370742917311e-05, "loss": 2.4531, "step": 26973 }, { "crossentropy": 2.376641273498535, "epoch": 0.9778857308584686, "grad_norm": 0.026516709476709366, "grad_norm_var": 4.163180324515906e-07, "learning_rate": 1.2559176680308482e-05, "loss": 2.3953, "step": 26974 }, { "crossentropy": 2.5138440132141113, "epoch": 0.9779219837587007, "grad_norm": 0.026383375748991966, "grad_norm_var": 3.595375389990694e-07, "learning_rate": 1.2518049980655976e-05, "loss": 2.5496, "step": 26975 }, { "crossentropy": 2.4094316959381104, "epoch": 0.9779582366589327, "grad_norm": 0.026405129581689835, "grad_norm_var": 2.3630288316308219e-07, "learning_rate": 1.2476990644516572e-05, "loss": 2.4571, "step": 26976 }, { "crossentropy": 2.5083959102630615, "epoch": 0.9779944895591647, "grad_norm": 0.027300389483571053, "grad_norm_var": 3.0354954874481364e-07, "learning_rate": 1.2435998672443716e-05, "loss": 2.4005, "step": 26977 }, { "crossentropy": 2.4437811374664307, "epoch": 0.9780307424593968, "grad_norm": 0.025437546893954277, "grad_norm_var": 3.365459808538432e-07, "learning_rate": 1.2395074064991962e-05, "loss": 2.4518, "step": 26978 }, { "crossentropy": 2.6427805423736572, "epoch": 0.9780669953596288, "grad_norm": 0.02638278901576996, "grad_norm_var": 3.326375518083174e-07, "learning_rate": 1.2354216822713649e-05, "loss": 2.494, "step": 26979 }, { "crossentropy": 2.2698402404785156, "epoch": 0.9781032482598608, "grad_norm": 0.026840530335903168, "grad_norm_var": 3.3119450494298264e-07, "learning_rate": 1.2313426946160555e-05, "loss": 2.3063, "step": 26980 }, { "crossentropy": 2.353086471557617, "epoch": 0.9781395011600929, "grad_norm": 0.026941003277897835, "grad_norm_var": 3.558559820995265e-07, "learning_rate": 1.2272704435883908e-05, "loss": 2.3607, "step": 26981 }, { "crossentropy": 2.4105050563812256, "epoch": 0.9781757540603249, "grad_norm": 0.026598511263728142, "grad_norm_var": 3.5937734022179327e-07, "learning_rate": 1.2232049292433268e-05, "loss": 2.4115, "step": 26982 }, { "crossentropy": 2.332075357437134, "epoch": 0.9782120069605569, "grad_norm": 0.02760113961994648, "grad_norm_var": 3.873227966801988e-07, "learning_rate": 1.2191461516358193e-05, "loss": 2.3852, "step": 26983 }, { "crossentropy": 2.542506694793701, "epoch": 0.9782482598607889, "grad_norm": 0.02687949314713478, "grad_norm_var": 3.614091223282419e-07, "learning_rate": 1.2150941108206582e-05, "loss": 2.4612, "step": 26984 }, { "crossentropy": 2.43015456199646, "epoch": 0.9782845127610209, "grad_norm": 0.025768660008907318, "grad_norm_var": 3.869555342141002e-07, "learning_rate": 1.2110488068525771e-05, "loss": 2.4055, "step": 26985 }, { "crossentropy": 2.313385486602783, "epoch": 0.9783207656612529, "grad_norm": 0.025854263454675674, "grad_norm_var": 4.082522954946723e-07, "learning_rate": 1.2070102397862548e-05, "loss": 2.3578, "step": 26986 }, { "crossentropy": 2.4574811458587646, "epoch": 0.9783570185614849, "grad_norm": 0.025896891951560974, "grad_norm_var": 3.8103173815908403e-07, "learning_rate": 1.2029784096762031e-05, "loss": 2.3655, "step": 26987 }, { "crossentropy": 2.4913156032562256, "epoch": 0.978393271461717, "grad_norm": 0.02689184993505478, "grad_norm_var": 3.6855908169166017e-07, "learning_rate": 1.198953316576823e-05, "loss": 2.5024, "step": 26988 }, { "crossentropy": 2.3260717391967773, "epoch": 0.978429524361949, "grad_norm": 0.026187939569354057, "grad_norm_var": 3.3656094231327277e-07, "learning_rate": 1.1949349605425708e-05, "loss": 2.4092, "step": 26989 }, { "crossentropy": 2.3138110637664795, "epoch": 0.978465777262181, "grad_norm": 0.026153607293963432, "grad_norm_var": 3.436479026589408e-07, "learning_rate": 1.190923341627681e-05, "loss": 2.2966, "step": 26990 }, { "crossentropy": 2.5310869216918945, "epoch": 0.978502030162413, "grad_norm": 0.026310697197914124, "grad_norm_var": 3.4481935839735834e-07, "learning_rate": 1.186918459886388e-05, "loss": 2.4833, "step": 26991 }, { "crossentropy": 2.4594266414642334, "epoch": 0.978538283062645, "grad_norm": 0.025625715032219887, "grad_norm_var": 3.890769706654344e-07, "learning_rate": 1.1829203153727042e-05, "loss": 2.4109, "step": 26992 }, { "crossentropy": 2.4007787704467773, "epoch": 0.978574535962877, "grad_norm": 0.027919195592403412, "grad_norm_var": 4.859007893771716e-07, "learning_rate": 1.1789289081406418e-05, "loss": 2.2975, "step": 26993 }, { "crossentropy": 2.4018001556396484, "epoch": 0.978610788863109, "grad_norm": 0.02640422247350216, "grad_norm_var": 4.130858189090586e-07, "learning_rate": 1.1749442382441577e-05, "loss": 2.3646, "step": 26994 }, { "crossentropy": 2.497096061706543, "epoch": 0.978647041763341, "grad_norm": 0.02640877291560173, "grad_norm_var": 4.1266639423177765e-07, "learning_rate": 1.1709663057370424e-05, "loss": 2.4875, "step": 26995 }, { "crossentropy": 2.442751169204712, "epoch": 0.9786832946635731, "grad_norm": 0.02675691619515419, "grad_norm_var": 4.0950376861783633e-07, "learning_rate": 1.1669951106729749e-05, "loss": 2.5173, "step": 26996 }, { "crossentropy": 2.3503501415252686, "epoch": 0.9787195475638051, "grad_norm": 0.026994548738002777, "grad_norm_var": 4.127427176448331e-07, "learning_rate": 1.16303065310569e-05, "loss": 2.3428, "step": 26997 }, { "crossentropy": 2.4626598358154297, "epoch": 0.9787558004640371, "grad_norm": 0.025820773094892502, "grad_norm_var": 4.419680468853293e-07, "learning_rate": 1.1590729330887007e-05, "loss": 2.408, "step": 26998 }, { "crossentropy": 2.412879467010498, "epoch": 0.9787920533642691, "grad_norm": 0.026218146085739136, "grad_norm_var": 3.524065819882072e-07, "learning_rate": 1.155121950675464e-05, "loss": 2.3799, "step": 26999 }, { "crossentropy": 2.4083333015441895, "epoch": 0.9788283062645011, "grad_norm": 0.02651967853307724, "grad_norm_var": 3.3656997723511414e-07, "learning_rate": 1.1511777059192708e-05, "loss": 2.4109, "step": 27000 }, { "crossentropy": 2.4967236518859863, "epoch": 0.9788645591647331, "grad_norm": 0.024991851300001144, "grad_norm_var": 4.35350158887321e-07, "learning_rate": 1.1472401988735227e-05, "loss": 2.3361, "step": 27001 }, { "crossentropy": 2.4168286323547363, "epoch": 0.9789008120649652, "grad_norm": 0.026690024882555008, "grad_norm_var": 4.2825563517807454e-07, "learning_rate": 1.1433094295912882e-05, "loss": 2.4461, "step": 27002 }, { "crossentropy": 2.290266990661621, "epoch": 0.9789370649651972, "grad_norm": 0.026515085250139236, "grad_norm_var": 4.1380995265029537e-07, "learning_rate": 1.1393853981256918e-05, "loss": 2.3514, "step": 27003 }, { "crossentropy": 2.389101028442383, "epoch": 0.9789733178654292, "grad_norm": 0.026516713201999664, "grad_norm_var": 3.980321725377892e-07, "learning_rate": 1.1354681045297467e-05, "loss": 2.3618, "step": 27004 }, { "crossentropy": 2.4280335903167725, "epoch": 0.9790095707656613, "grad_norm": 0.027386104688048363, "grad_norm_var": 4.5753489094999684e-07, "learning_rate": 1.1315575488563546e-05, "loss": 2.4251, "step": 27005 }, { "crossentropy": 2.5912108421325684, "epoch": 0.9790458236658933, "grad_norm": 0.026538440957665443, "grad_norm_var": 4.514799075215163e-07, "learning_rate": 1.1276537311583623e-05, "loss": 2.4972, "step": 27006 }, { "crossentropy": 2.5343856811523438, "epoch": 0.9790820765661253, "grad_norm": 0.026675548404455185, "grad_norm_var": 4.517555315584782e-07, "learning_rate": 1.12375665148845e-05, "loss": 2.4469, "step": 27007 }, { "crossentropy": 2.399695873260498, "epoch": 0.9791183294663574, "grad_norm": 0.025320254266262054, "grad_norm_var": 4.931486537824616e-07, "learning_rate": 1.1198663098992978e-05, "loss": 2.3512, "step": 27008 }, { "crossentropy": 2.3811569213867188, "epoch": 0.9791545823665894, "grad_norm": 0.026336049661040306, "grad_norm_var": 3.4595225159190595e-07, "learning_rate": 1.1159827064434191e-05, "loss": 2.3893, "step": 27009 }, { "crossentropy": 2.44986629486084, "epoch": 0.9791908352668214, "grad_norm": 0.026548948138952255, "grad_norm_var": 3.477129254465198e-07, "learning_rate": 1.1121058411733276e-05, "loss": 2.4545, "step": 27010 }, { "crossentropy": 2.5102665424346924, "epoch": 0.9792270881670534, "grad_norm": 0.026260605081915855, "grad_norm_var": 3.4871151295389387e-07, "learning_rate": 1.1082357141412591e-05, "loss": 2.4513, "step": 27011 }, { "crossentropy": 2.4262025356292725, "epoch": 0.9792633410672854, "grad_norm": 0.026166347786784172, "grad_norm_var": 3.40878077853994e-07, "learning_rate": 1.1043723253995608e-05, "loss": 2.3963, "step": 27012 }, { "crossentropy": 2.4857966899871826, "epoch": 0.9792995939675174, "grad_norm": 0.025353705510497093, "grad_norm_var": 3.667577918304669e-07, "learning_rate": 1.1005156750004685e-05, "loss": 2.4091, "step": 27013 }, { "crossentropy": 2.4217991828918457, "epoch": 0.9793358468677494, "grad_norm": 0.026129458099603653, "grad_norm_var": 3.554116377176934e-07, "learning_rate": 1.0966657629959964e-05, "loss": 2.4382, "step": 27014 }, { "crossentropy": 2.4613375663757324, "epoch": 0.9793720997679815, "grad_norm": 0.02584226056933403, "grad_norm_var": 3.663617023155185e-07, "learning_rate": 1.0928225894381582e-05, "loss": 2.4255, "step": 27015 }, { "crossentropy": 2.2613255977630615, "epoch": 0.9794083526682135, "grad_norm": 0.026880906894803047, "grad_norm_var": 3.881347204755103e-07, "learning_rate": 1.0889861543789125e-05, "loss": 2.3332, "step": 27016 }, { "crossentropy": 2.4894776344299316, "epoch": 0.9794446055684455, "grad_norm": 0.02651619166135788, "grad_norm_var": 2.7571295241831234e-07, "learning_rate": 1.0851564578699958e-05, "loss": 2.4429, "step": 27017 }, { "crossentropy": 2.608586549758911, "epoch": 0.9794808584686775, "grad_norm": 0.026386283338069916, "grad_norm_var": 2.6790251328085285e-07, "learning_rate": 1.0813334999631442e-05, "loss": 2.5081, "step": 27018 }, { "crossentropy": 2.4453084468841553, "epoch": 0.9795171113689095, "grad_norm": 0.025269808247685432, "grad_norm_var": 3.350552900645172e-07, "learning_rate": 1.0775172807100387e-05, "loss": 2.4111, "step": 27019 }, { "crossentropy": 2.426790475845337, "epoch": 0.9795533642691415, "grad_norm": 0.02594822645187378, "grad_norm_var": 3.3564209110052007e-07, "learning_rate": 1.073707800162249e-05, "loss": 2.4112, "step": 27020 }, { "crossentropy": 2.4995014667510986, "epoch": 0.9795896171693735, "grad_norm": 0.027501221746206284, "grad_norm_var": 3.543312622116077e-07, "learning_rate": 1.0699050583711235e-05, "loss": 2.5011, "step": 27021 }, { "crossentropy": 2.478109121322632, "epoch": 0.9796258700696056, "grad_norm": 0.02729669399559498, "grad_norm_var": 4.2148528719951745e-07, "learning_rate": 1.0661090553881203e-05, "loss": 2.4605, "step": 27022 }, { "crossentropy": 2.5413646697998047, "epoch": 0.9796621229698376, "grad_norm": 0.02549796923995018, "grad_norm_var": 4.455822772403735e-07, "learning_rate": 1.0623197912644767e-05, "loss": 2.4837, "step": 27023 }, { "crossentropy": 2.5206618309020996, "epoch": 0.9796983758700696, "grad_norm": 0.025542311370372772, "grad_norm_var": 4.2251529216879617e-07, "learning_rate": 1.0585372660513182e-05, "loss": 2.5278, "step": 27024 }, { "crossentropy": 2.30741810798645, "epoch": 0.9797346287703016, "grad_norm": 0.026392167434096336, "grad_norm_var": 4.2360055849606444e-07, "learning_rate": 1.0547614797998261e-05, "loss": 2.3481, "step": 27025 }, { "crossentropy": 2.235016345977783, "epoch": 0.9797708816705336, "grad_norm": 0.025977520272135735, "grad_norm_var": 4.190083943727529e-07, "learning_rate": 1.0509924325609599e-05, "loss": 2.3346, "step": 27026 }, { "crossentropy": 2.397212505340576, "epoch": 0.9798071345707656, "grad_norm": 0.025988036766648293, "grad_norm_var": 4.2090787470393705e-07, "learning_rate": 1.0472301243856231e-05, "loss": 2.3918, "step": 27027 }, { "crossentropy": 2.248750686645508, "epoch": 0.9798433874709976, "grad_norm": 0.02639200910925865, "grad_norm_var": 4.240387659394286e-07, "learning_rate": 1.0434745553246083e-05, "loss": 2.3982, "step": 27028 }, { "crossentropy": 2.468217611312866, "epoch": 0.9798796403712297, "grad_norm": 0.025327928364276886, "grad_norm_var": 4.2692769903230715e-07, "learning_rate": 1.0397257254287085e-05, "loss": 2.4483, "step": 27029 }, { "crossentropy": 2.344093084335327, "epoch": 0.9799158932714617, "grad_norm": 0.02534942328929901, "grad_norm_var": 4.7027114609497073e-07, "learning_rate": 1.0359836347484386e-05, "loss": 2.3736, "step": 27030 }, { "crossentropy": 2.3569650650024414, "epoch": 0.9799521461716937, "grad_norm": 0.027221955358982086, "grad_norm_var": 5.35978205380705e-07, "learning_rate": 1.0322482833344804e-05, "loss": 2.3407, "step": 27031 }, { "crossentropy": 2.4559037685394287, "epoch": 0.9799883990719258, "grad_norm": 0.026351848617196083, "grad_norm_var": 5.0671281904488e-07, "learning_rate": 1.0285196712371826e-05, "loss": 2.4195, "step": 27032 }, { "crossentropy": 2.412008047103882, "epoch": 0.9800246519721578, "grad_norm": 0.02536601759493351, "grad_norm_var": 5.385998056824653e-07, "learning_rate": 1.024797798506949e-05, "loss": 2.2901, "step": 27033 }, { "crossentropy": 2.3208601474761963, "epoch": 0.9800609048723898, "grad_norm": 0.026412153616547585, "grad_norm_var": 5.3958398422421e-07, "learning_rate": 1.021082665194073e-05, "loss": 2.3234, "step": 27034 }, { "crossentropy": 2.4516515731811523, "epoch": 0.9800971577726219, "grad_norm": 0.026206331327557564, "grad_norm_var": 4.888990709610182e-07, "learning_rate": 1.0173742713486811e-05, "loss": 2.4815, "step": 27035 }, { "crossentropy": 2.4898438453674316, "epoch": 0.9801334106728539, "grad_norm": 0.02567274682223797, "grad_norm_var": 5.019069632143282e-07, "learning_rate": 1.0136726170209e-05, "loss": 2.4381, "step": 27036 }, { "crossentropy": 2.4718048572540283, "epoch": 0.9801696635730859, "grad_norm": 0.025696102529764175, "grad_norm_var": 3.817940463860288e-07, "learning_rate": 1.0099777022606892e-05, "loss": 2.4833, "step": 27037 }, { "crossentropy": 2.3785367012023926, "epoch": 0.9802059164733179, "grad_norm": 0.026330675929784775, "grad_norm_var": 2.786655582943826e-07, "learning_rate": 1.0062895271179539e-05, "loss": 2.389, "step": 27038 }, { "crossentropy": 2.3830819129943848, "epoch": 0.9802421693735499, "grad_norm": 0.026432985439896584, "grad_norm_var": 2.728601338399522e-07, "learning_rate": 1.002608091642543e-05, "loss": 2.3266, "step": 27039 }, { "crossentropy": 2.5023231506347656, "epoch": 0.9802784222737819, "grad_norm": 0.02705492451786995, "grad_norm_var": 3.152305292370422e-07, "learning_rate": 9.98933395884194e-06, "loss": 2.4205, "step": 27040 }, { "crossentropy": 2.3380074501037598, "epoch": 0.9803146751740139, "grad_norm": 0.02500477246940136, "grad_norm_var": 3.8811050558777647e-07, "learning_rate": 9.952654398924788e-06, "loss": 2.3716, "step": 27041 }, { "crossentropy": 2.4160101413726807, "epoch": 0.980350928074246, "grad_norm": 0.026688145473599434, "grad_norm_var": 4.1289108023213837e-07, "learning_rate": 9.916042237169687e-06, "loss": 2.4367, "step": 27042 }, { "crossentropy": 2.5063161849975586, "epoch": 0.980387180974478, "grad_norm": 0.02643214352428913, "grad_norm_var": 4.1897286958655854e-07, "learning_rate": 9.879497474071241e-06, "loss": 2.3938, "step": 27043 }, { "crossentropy": 2.375349283218384, "epoch": 0.98042343387471, "grad_norm": 0.026849256828427315, "grad_norm_var": 4.4854666005206265e-07, "learning_rate": 9.843020110122947e-06, "loss": 2.3865, "step": 27044 }, { "crossentropy": 2.5090489387512207, "epoch": 0.980459686774942, "grad_norm": 0.026858435943722725, "grad_norm_var": 4.272247632017365e-07, "learning_rate": 9.806610145817741e-06, "loss": 2.5168, "step": 27045 }, { "crossentropy": 2.345299243927002, "epoch": 0.980495939675174, "grad_norm": 0.026848208159208298, "grad_norm_var": 3.885528645621115e-07, "learning_rate": 9.770267581646896e-06, "loss": 2.3654, "step": 27046 }, { "crossentropy": 2.4601967334747314, "epoch": 0.980532192575406, "grad_norm": 0.02687826193869114, "grad_norm_var": 3.554813062011756e-07, "learning_rate": 9.733992418101689e-06, "loss": 2.4563, "step": 27047 }, { "crossentropy": 2.3686888217926025, "epoch": 0.980568445475638, "grad_norm": 0.025853324681520462, "grad_norm_var": 3.6874354833088946e-07, "learning_rate": 9.697784655671726e-06, "loss": 2.419, "step": 27048 }, { "crossentropy": 2.3380916118621826, "epoch": 0.98060469837587, "grad_norm": 0.02586773782968521, "grad_norm_var": 3.2289759287161084e-07, "learning_rate": 9.661644294846617e-06, "loss": 2.4503, "step": 27049 }, { "crossentropy": 2.3508362770080566, "epoch": 0.9806409512761021, "grad_norm": 0.02554859220981598, "grad_norm_var": 3.5865234339914747e-07, "learning_rate": 9.625571336113748e-06, "loss": 2.3685, "step": 27050 }, { "crossentropy": 2.450551748275757, "epoch": 0.9806772041763341, "grad_norm": 0.02640506438910961, "grad_norm_var": 3.595949236730496e-07, "learning_rate": 9.589565779961063e-06, "loss": 2.4622, "step": 27051 }, { "crossentropy": 2.3056087493896484, "epoch": 0.9807134570765661, "grad_norm": 0.025921177119016647, "grad_norm_var": 3.4345895707406833e-07, "learning_rate": 9.55362762687373e-06, "loss": 2.4311, "step": 27052 }, { "crossentropy": 2.4538002014160156, "epoch": 0.9807497099767981, "grad_norm": 0.027667609974741936, "grad_norm_var": 4.2978038092866763e-07, "learning_rate": 9.517756877338578e-06, "loss": 2.4491, "step": 27053 }, { "crossentropy": 2.437817335128784, "epoch": 0.9807859628770301, "grad_norm": 0.026045238599181175, "grad_norm_var": 4.380848982687401e-07, "learning_rate": 9.481953531839116e-06, "loss": 2.5022, "step": 27054 }, { "crossentropy": 2.3950462341308594, "epoch": 0.9808222157772621, "grad_norm": 0.02599550038576126, "grad_norm_var": 4.4796203466852755e-07, "learning_rate": 9.446217590859951e-06, "loss": 2.3979, "step": 27055 }, { "crossentropy": 2.397522449493408, "epoch": 0.9808584686774942, "grad_norm": 0.025953980162739754, "grad_norm_var": 4.231603677837259e-07, "learning_rate": 9.410549054882366e-06, "loss": 2.3923, "step": 27056 }, { "crossentropy": 2.279261589050293, "epoch": 0.9808947215777262, "grad_norm": 0.027346758171916008, "grad_norm_var": 3.6117196010275003e-07, "learning_rate": 9.374947924388756e-06, "loss": 2.2976, "step": 27057 }, { "crossentropy": 2.4013595581054688, "epoch": 0.9809309744779582, "grad_norm": 0.026434408500790596, "grad_norm_var": 3.570532489306314e-07, "learning_rate": 9.339414199860396e-06, "loss": 2.3853, "step": 27058 }, { "crossentropy": 2.456953287124634, "epoch": 0.9809672273781903, "grad_norm": 0.026662388816475868, "grad_norm_var": 3.60383051859157e-07, "learning_rate": 9.303947881776908e-06, "loss": 2.3703, "step": 27059 }, { "crossentropy": 2.517326593399048, "epoch": 0.9810034802784223, "grad_norm": 0.026614490896463394, "grad_norm_var": 3.512048365587507e-07, "learning_rate": 9.268548970616797e-06, "loss": 2.5526, "step": 27060 }, { "crossentropy": 2.3894455432891846, "epoch": 0.9810397331786543, "grad_norm": 0.0266933161765337, "grad_norm_var": 3.435055782984118e-07, "learning_rate": 9.233217466859122e-06, "loss": 2.3398, "step": 27061 }, { "crossentropy": 2.446889638900757, "epoch": 0.9810759860788864, "grad_norm": 0.027476491406559944, "grad_norm_var": 4.0396421177359604e-07, "learning_rate": 9.197953370980172e-06, "loss": 2.4505, "step": 27062 }, { "crossentropy": 2.357008695602417, "epoch": 0.9811122389791184, "grad_norm": 0.02548150159418583, "grad_norm_var": 4.4805356689871695e-07, "learning_rate": 9.162756683456785e-06, "loss": 2.3603, "step": 27063 }, { "crossentropy": 2.45196533203125, "epoch": 0.9811484918793504, "grad_norm": 0.02616814896464348, "grad_norm_var": 4.324351240159637e-07, "learning_rate": 9.127627404764138e-06, "loss": 2.3276, "step": 27064 }, { "crossentropy": 2.5171728134155273, "epoch": 0.9811847447795824, "grad_norm": 0.027087882161140442, "grad_norm_var": 4.40086268466607e-07, "learning_rate": 9.092565535376852e-06, "loss": 2.494, "step": 27065 }, { "crossentropy": 2.3927929401397705, "epoch": 0.9812209976798144, "grad_norm": 0.025609109550714493, "grad_norm_var": 4.328891458057459e-07, "learning_rate": 9.05757107576899e-06, "loss": 2.4274, "step": 27066 }, { "crossentropy": 2.275301456451416, "epoch": 0.9812572505800464, "grad_norm": 0.026025159284472466, "grad_norm_var": 4.4533523254336227e-07, "learning_rate": 9.022644026411842e-06, "loss": 2.3674, "step": 27067 }, { "crossentropy": 2.367011070251465, "epoch": 0.9812935034802784, "grad_norm": 0.02640412002801895, "grad_norm_var": 4.2592794619646566e-07, "learning_rate": 8.987784387778919e-06, "loss": 2.5191, "step": 27068 }, { "crossentropy": 2.4298930168151855, "epoch": 0.9813297563805105, "grad_norm": 0.02595081739127636, "grad_norm_var": 3.3808953345306387e-07, "learning_rate": 8.95299216033929e-06, "loss": 2.4742, "step": 27069 }, { "crossentropy": 2.360142946243286, "epoch": 0.9813660092807425, "grad_norm": 0.025458237156271935, "grad_norm_var": 3.851866419412227e-07, "learning_rate": 8.918267344563691e-06, "loss": 2.3261, "step": 27070 }, { "crossentropy": 2.387220621109009, "epoch": 0.9814022621809745, "grad_norm": 0.026259688660502434, "grad_norm_var": 3.775848611908452e-07, "learning_rate": 8.883609940921744e-06, "loss": 2.3653, "step": 27071 }, { "crossentropy": 2.475149154663086, "epoch": 0.9814385150812065, "grad_norm": 0.025978097692131996, "grad_norm_var": 3.763424195624461e-07, "learning_rate": 8.849019949880299e-06, "loss": 2.3458, "step": 27072 }, { "crossentropy": 2.362318992614746, "epoch": 0.9814747679814385, "grad_norm": 0.026108311489224434, "grad_norm_var": 3.0813327412854547e-07, "learning_rate": 8.814497371907315e-06, "loss": 2.3201, "step": 27073 }, { "crossentropy": 2.6194586753845215, "epoch": 0.9815110208816705, "grad_norm": 0.025679148733615875, "grad_norm_var": 3.278083072241293e-07, "learning_rate": 8.780042207469086e-06, "loss": 2.4243, "step": 27074 }, { "crossentropy": 2.3067665100097656, "epoch": 0.9815472737819025, "grad_norm": 0.025775551795959473, "grad_norm_var": 3.2566487086608103e-07, "learning_rate": 8.745654457030793e-06, "loss": 2.3876, "step": 27075 }, { "crossentropy": 2.251420259475708, "epoch": 0.9815835266821346, "grad_norm": 0.025773243978619576, "grad_norm_var": 3.20390046039803e-07, "learning_rate": 8.711334121057068e-06, "loss": 2.4064, "step": 27076 }, { "crossentropy": 2.4320671558380127, "epoch": 0.9816197795823666, "grad_norm": 0.02692342922091484, "grad_norm_var": 3.412729593642392e-07, "learning_rate": 8.677081200011428e-06, "loss": 2.3924, "step": 27077 }, { "crossentropy": 2.5538954734802246, "epoch": 0.9816560324825986, "grad_norm": 0.026082970201969147, "grad_norm_var": 2.1337657769010301e-07, "learning_rate": 8.642895694356278e-06, "loss": 2.4611, "step": 27078 }, { "crossentropy": 2.4802324771881104, "epoch": 0.9816922853828306, "grad_norm": 0.026012014597654343, "grad_norm_var": 1.9090694315172738e-07, "learning_rate": 8.60877760455292e-06, "loss": 2.4457, "step": 27079 }, { "crossentropy": 2.417193651199341, "epoch": 0.9817285382830626, "grad_norm": 0.02626027539372444, "grad_norm_var": 1.9250794751726694e-07, "learning_rate": 8.574726931063204e-06, "loss": 2.3732, "step": 27080 }, { "crossentropy": 2.454353094100952, "epoch": 0.9817647911832946, "grad_norm": 0.026342732831835747, "grad_norm_var": 1.277455459469089e-07, "learning_rate": 8.540743674346762e-06, "loss": 2.3382, "step": 27081 }, { "crossentropy": 2.426426887512207, "epoch": 0.9818010440835266, "grad_norm": 0.02526240609586239, "grad_norm_var": 1.551854807549929e-07, "learning_rate": 8.506827834862119e-06, "loss": 2.3988, "step": 27082 }, { "crossentropy": 2.4542295932769775, "epoch": 0.9818372969837587, "grad_norm": 0.0261528380215168, "grad_norm_var": 1.5631749589456225e-07, "learning_rate": 8.472979413067795e-06, "loss": 2.3947, "step": 27083 }, { "crossentropy": 2.518866539001465, "epoch": 0.9818735498839907, "grad_norm": 0.026781223714351654, "grad_norm_var": 1.8419273266571252e-07, "learning_rate": 8.439198409420646e-06, "loss": 2.4521, "step": 27084 }, { "crossentropy": 2.434962272644043, "epoch": 0.9819098027842227, "grad_norm": 0.03211602196097374, "grad_norm_var": 2.478220427087303e-06, "learning_rate": 8.405484824376975e-06, "loss": 2.3749, "step": 27085 }, { "crossentropy": 2.406292200088501, "epoch": 0.9819460556844548, "grad_norm": 0.026212284341454506, "grad_norm_var": 2.4155148487329013e-06, "learning_rate": 8.371838658392527e-06, "loss": 2.4188, "step": 27086 }, { "crossentropy": 2.3692402839660645, "epoch": 0.9819823085846868, "grad_norm": 0.02586660534143448, "grad_norm_var": 2.436850576170944e-06, "learning_rate": 8.338259911921386e-06, "loss": 2.4191, "step": 27087 }, { "crossentropy": 2.449423313140869, "epoch": 0.9820185614849188, "grad_norm": 0.02602134458720684, "grad_norm_var": 2.4342005359565334e-06, "learning_rate": 8.304748585417077e-06, "loss": 2.4075, "step": 27088 }, { "crossentropy": 2.378483772277832, "epoch": 0.9820548143851509, "grad_norm": 0.026078015565872192, "grad_norm_var": 2.4356811577934026e-06, "learning_rate": 8.27130467933257e-06, "loss": 2.3135, "step": 27089 }, { "crossentropy": 2.4918365478515625, "epoch": 0.9820910672853829, "grad_norm": 0.02584279328584671, "grad_norm_var": 2.4203444005815026e-06, "learning_rate": 8.237928194119172e-06, "loss": 2.4395, "step": 27090 }, { "crossentropy": 2.511082649230957, "epoch": 0.9821273201856149, "grad_norm": 0.026925072073936462, "grad_norm_var": 2.396649721163194e-06, "learning_rate": 8.20461913022763e-06, "loss": 2.4582, "step": 27091 }, { "crossentropy": 2.2466020584106445, "epoch": 0.9821635730858469, "grad_norm": 0.026021940633654594, "grad_norm_var": 2.3750625558835533e-06, "learning_rate": 8.17137748810759e-06, "loss": 2.3567, "step": 27092 }, { "crossentropy": 2.33302640914917, "epoch": 0.9821998259860789, "grad_norm": 0.026008930057287216, "grad_norm_var": 2.3825755022922977e-06, "learning_rate": 8.138203268209243e-06, "loss": 2.3294, "step": 27093 }, { "crossentropy": 2.512502908706665, "epoch": 0.9822360788863109, "grad_norm": 0.025874106213450432, "grad_norm_var": 2.3968938691054073e-06, "learning_rate": 8.105096470980011e-06, "loss": 2.3799, "step": 27094 }, { "crossentropy": 2.4675533771514893, "epoch": 0.9822723317865429, "grad_norm": 0.025710122659802437, "grad_norm_var": 2.4216755762293396e-06, "learning_rate": 8.072057096866203e-06, "loss": 2.3743, "step": 27095 }, { "crossentropy": 2.3093669414520264, "epoch": 0.982308584686775, "grad_norm": 0.026222340762615204, "grad_norm_var": 2.4228126086974032e-06, "learning_rate": 8.03908514631524e-06, "loss": 2.403, "step": 27096 }, { "crossentropy": 2.68410062789917, "epoch": 0.982344837587007, "grad_norm": 0.026388172060251236, "grad_norm_var": 2.4222013531443573e-06, "learning_rate": 8.006180619771763e-06, "loss": 2.5223, "step": 27097 }, { "crossentropy": 2.5232725143432617, "epoch": 0.982381090487239, "grad_norm": 0.027174150571227074, "grad_norm_var": 2.343379550497013e-06, "learning_rate": 7.973343517680975e-06, "loss": 2.4596, "step": 27098 }, { "crossentropy": 2.37886381149292, "epoch": 0.982417343387471, "grad_norm": 0.0257119107991457, "grad_norm_var": 2.381069670224694e-06, "learning_rate": 7.940573840485299e-06, "loss": 2.338, "step": 27099 }, { "crossentropy": 2.3706493377685547, "epoch": 0.982453596287703, "grad_norm": 0.026456652209162712, "grad_norm_var": 2.378066684587584e-06, "learning_rate": 7.907871588627713e-06, "loss": 2.4544, "step": 27100 }, { "crossentropy": 2.3029708862304688, "epoch": 0.982489849187935, "grad_norm": 0.0261548962444067, "grad_norm_var": 1.6661594726510795e-07, "learning_rate": 7.875236762550642e-06, "loss": 2.4521, "step": 27101 }, { "crossentropy": 2.4197940826416016, "epoch": 0.982526102088167, "grad_norm": 0.025276558473706245, "grad_norm_var": 2.1566928571658574e-07, "learning_rate": 7.842669362694287e-06, "loss": 2.4143, "step": 27102 }, { "crossentropy": 2.331981658935547, "epoch": 0.9825623549883991, "grad_norm": 0.025412870571017265, "grad_norm_var": 2.4316159291867814e-07, "learning_rate": 7.810169389498301e-06, "loss": 2.3637, "step": 27103 }, { "crossentropy": 2.4697887897491455, "epoch": 0.9825986078886311, "grad_norm": 0.02676546201109886, "grad_norm_var": 2.7194974306908074e-07, "learning_rate": 7.777736843401773e-06, "loss": 2.5077, "step": 27104 }, { "crossentropy": 2.45358943939209, "epoch": 0.9826348607888631, "grad_norm": 0.027420025318861008, "grad_norm_var": 3.7583614962443705e-07, "learning_rate": 7.745371724843242e-06, "loss": 2.4391, "step": 27105 }, { "crossentropy": 2.491297483444214, "epoch": 0.9826711136890951, "grad_norm": 0.027193687856197357, "grad_norm_var": 4.236848117298523e-07, "learning_rate": 7.713074034259026e-06, "loss": 2.3264, "step": 27106 }, { "crossentropy": 2.3329148292541504, "epoch": 0.9827073665893271, "grad_norm": 0.02542644552886486, "grad_norm_var": 4.3811464240049264e-07, "learning_rate": 7.680843772085999e-06, "loss": 2.3793, "step": 27107 }, { "crossentropy": 2.352863073348999, "epoch": 0.9827436194895591, "grad_norm": 0.02591843530535698, "grad_norm_var": 4.4125733310421537e-07, "learning_rate": 7.648680938759367e-06, "loss": 2.2994, "step": 27108 }, { "crossentropy": 2.4918954372406006, "epoch": 0.9827798723897911, "grad_norm": 0.02558443509042263, "grad_norm_var": 4.6303250276607164e-07, "learning_rate": 7.616585534713782e-06, "loss": 2.4943, "step": 27109 }, { "crossentropy": 2.252239465713501, "epoch": 0.9828161252900232, "grad_norm": 0.025558795779943466, "grad_norm_var": 4.816079672658452e-07, "learning_rate": 7.5845575603822325e-06, "loss": 2.295, "step": 27110 }, { "crossentropy": 2.4877750873565674, "epoch": 0.9828523781902552, "grad_norm": 0.02694779634475708, "grad_norm_var": 5.05016027289804e-07, "learning_rate": 7.552597016197149e-06, "loss": 2.5095, "step": 27111 }, { "crossentropy": 2.495788335800171, "epoch": 0.9828886310904872, "grad_norm": 0.025986747816205025, "grad_norm_var": 5.085933684048493e-07, "learning_rate": 7.520703902590964e-06, "loss": 2.3668, "step": 27112 }, { "crossentropy": 2.2885091304779053, "epoch": 0.9829248839907193, "grad_norm": 0.025858653709292412, "grad_norm_var": 5.136135405964711e-07, "learning_rate": 7.488878219993888e-06, "loss": 2.3142, "step": 27113 }, { "crossentropy": 2.420213460922241, "epoch": 0.9829611368909513, "grad_norm": 0.027088619768619537, "grad_norm_var": 5.027102132308294e-07, "learning_rate": 7.45711996883558e-06, "loss": 2.3577, "step": 27114 }, { "crossentropy": 2.456926107406616, "epoch": 0.9829973897911833, "grad_norm": 0.02584785595536232, "grad_norm_var": 4.95514373999711e-07, "learning_rate": 7.425429149545692e-06, "loss": 2.4307, "step": 27115 }, { "crossentropy": 2.621140956878662, "epoch": 0.9830336426914154, "grad_norm": 0.026567041873931885, "grad_norm_var": 5.003314292641488e-07, "learning_rate": 7.393805762551664e-06, "loss": 2.4909, "step": 27116 }, { "crossentropy": 2.4612295627593994, "epoch": 0.9830698955916474, "grad_norm": 0.026193054392933846, "grad_norm_var": 5.002539041094502e-07, "learning_rate": 7.362249808281485e-06, "loss": 2.4047, "step": 27117 }, { "crossentropy": 2.5233702659606934, "epoch": 0.9831061484918794, "grad_norm": 0.026973353698849678, "grad_norm_var": 4.7345039308996304e-07, "learning_rate": 7.330761287159815e-06, "loss": 2.4415, "step": 27118 }, { "crossentropy": 2.3493196964263916, "epoch": 0.9831424013921114, "grad_norm": 0.025742750614881516, "grad_norm_var": 4.413881102979337e-07, "learning_rate": 7.299340199613535e-06, "loss": 2.3797, "step": 27119 }, { "crossentropy": 2.3883798122406006, "epoch": 0.9831786542923434, "grad_norm": 0.02544925920665264, "grad_norm_var": 4.7097294828857215e-07, "learning_rate": 7.267986546066196e-06, "loss": 2.4276, "step": 27120 }, { "crossentropy": 2.2475481033325195, "epoch": 0.9832149071925754, "grad_norm": 0.02578943967819214, "grad_norm_var": 3.794691763779467e-07, "learning_rate": 7.236700326941348e-06, "loss": 2.3052, "step": 27121 }, { "crossentropy": 2.6388001441955566, "epoch": 0.9832511600928074, "grad_norm": 0.0268646739423275, "grad_norm_var": 3.396995407821257e-07, "learning_rate": 7.2054815426614294e-06, "loss": 2.5105, "step": 27122 }, { "crossentropy": 2.313232898712158, "epoch": 0.9832874129930395, "grad_norm": 0.02650598995387554, "grad_norm_var": 3.1381164213443943e-07, "learning_rate": 7.1743301936483264e-06, "loss": 2.3943, "step": 27123 }, { "crossentropy": 2.409006118774414, "epoch": 0.9833236658932715, "grad_norm": 0.02582627907395363, "grad_norm_var": 3.175540375602853e-07, "learning_rate": 7.143246280323367e-06, "loss": 2.4327, "step": 27124 }, { "crossentropy": 2.3144218921661377, "epoch": 0.9833599187935035, "grad_norm": 0.02534608729183674, "grad_norm_var": 3.398423254214461e-07, "learning_rate": 7.112229803105108e-06, "loss": 2.3323, "step": 27125 }, { "crossentropy": 2.45825457572937, "epoch": 0.9833961716937355, "grad_norm": 0.02573518641293049, "grad_norm_var": 3.27667348752556e-07, "learning_rate": 7.081280762413767e-06, "loss": 2.3792, "step": 27126 }, { "crossentropy": 2.3104963302612305, "epoch": 0.9834324245939675, "grad_norm": 0.02563929744064808, "grad_norm_var": 2.990089328719214e-07, "learning_rate": 7.050399158666232e-06, "loss": 2.2958, "step": 27127 }, { "crossentropy": 2.435270071029663, "epoch": 0.9834686774941995, "grad_norm": 0.025700602680444717, "grad_norm_var": 3.080044161968276e-07, "learning_rate": 7.019584992280503e-06, "loss": 2.5051, "step": 27128 }, { "crossentropy": 2.4067142009735107, "epoch": 0.9835049303944315, "grad_norm": 0.030374187976121902, "grad_norm_var": 1.4548304829670265e-06, "learning_rate": 6.988838263672359e-06, "loss": 2.4817, "step": 27129 }, { "crossentropy": 2.334749698638916, "epoch": 0.9835411832946636, "grad_norm": 0.025687871500849724, "grad_norm_var": 1.4400219552169909e-06, "learning_rate": 6.958158973257023e-06, "loss": 2.3276, "step": 27130 }, { "crossentropy": 2.5177412033081055, "epoch": 0.9835774361948956, "grad_norm": 0.026204081252217293, "grad_norm_var": 1.4281313130860464e-06, "learning_rate": 6.927547121448608e-06, "loss": 2.5613, "step": 27131 }, { "crossentropy": 2.4956796169281006, "epoch": 0.9836136890951276, "grad_norm": 0.02718256041407585, "grad_norm_var": 1.4747563400507956e-06, "learning_rate": 6.897002708661226e-06, "loss": 2.4558, "step": 27132 }, { "crossentropy": 2.335270881652832, "epoch": 0.9836499419953596, "grad_norm": 0.025589799508452415, "grad_norm_var": 1.5081878051690362e-06, "learning_rate": 6.866525735307327e-06, "loss": 2.4168, "step": 27133 }, { "crossentropy": 2.575568675994873, "epoch": 0.9836861948955916, "grad_norm": 0.026596827432513237, "grad_norm_var": 1.4826521341101124e-06, "learning_rate": 6.836116201798248e-06, "loss": 2.4669, "step": 27134 }, { "crossentropy": 2.512472629547119, "epoch": 0.9837224477958236, "grad_norm": 0.026186631992459297, "grad_norm_var": 1.4640765336581518e-06, "learning_rate": 6.805774108544771e-06, "loss": 2.518, "step": 27135 }, { "crossentropy": 2.4131109714508057, "epoch": 0.9837587006960556, "grad_norm": 0.02713947184383869, "grad_norm_var": 1.4526107836592325e-06, "learning_rate": 6.775499455956569e-06, "loss": 2.4072, "step": 27136 }, { "crossentropy": 2.532888412475586, "epoch": 0.9837949535962877, "grad_norm": 0.02710266411304474, "grad_norm_var": 1.4538280301859553e-06, "learning_rate": 6.7452922444427575e-06, "loss": 2.5027, "step": 27137 }, { "crossentropy": 2.3763370513916016, "epoch": 0.9838312064965197, "grad_norm": 0.02674214355647564, "grad_norm_var": 1.4484840794318076e-06, "learning_rate": 6.715152474411901e-06, "loss": 2.4112, "step": 27138 }, { "crossentropy": 2.5156805515289307, "epoch": 0.9838674593967517, "grad_norm": 0.026041047647595406, "grad_norm_var": 1.4599174387955877e-06, "learning_rate": 6.685080146269784e-06, "loss": 2.5127, "step": 27139 }, { "crossentropy": 2.495865821838379, "epoch": 0.9839037122969838, "grad_norm": 0.025670889765024185, "grad_norm_var": 1.474212859972956e-06, "learning_rate": 6.655075260423304e-06, "loss": 2.3993, "step": 27140 }, { "crossentropy": 2.554100751876831, "epoch": 0.9839399651972158, "grad_norm": 0.02603328227996826, "grad_norm_var": 1.4040732054395577e-06, "learning_rate": 6.625137817278248e-06, "loss": 2.493, "step": 27141 }, { "crossentropy": 2.3259496688842773, "epoch": 0.9839762180974478, "grad_norm": 0.026641853153705597, "grad_norm_var": 1.3658151733714295e-06, "learning_rate": 6.595267817238182e-06, "loss": 2.3723, "step": 27142 }, { "crossentropy": 2.4471206665039062, "epoch": 0.9840124709976799, "grad_norm": 0.025348324328660965, "grad_norm_var": 1.4057918524670286e-06, "learning_rate": 6.565465260706671e-06, "loss": 2.4268, "step": 27143 }, { "crossentropy": 2.380629301071167, "epoch": 0.9840487238979119, "grad_norm": 0.02570345252752304, "grad_norm_var": 1.4054828525085956e-06, "learning_rate": 6.535730148086172e-06, "loss": 2.436, "step": 27144 }, { "crossentropy": 2.4792966842651367, "epoch": 0.9840849767981439, "grad_norm": 0.02797621488571167, "grad_norm_var": 5.310795472820406e-07, "learning_rate": 6.50606247977914e-06, "loss": 2.3943, "step": 27145 }, { "crossentropy": 2.468193769454956, "epoch": 0.9841212296983759, "grad_norm": 0.0262942872941494, "grad_norm_var": 4.992778208214119e-07, "learning_rate": 6.476462256185256e-06, "loss": 2.4721, "step": 27146 }, { "crossentropy": 2.400754451751709, "epoch": 0.9841574825986079, "grad_norm": 0.025369862094521523, "grad_norm_var": 5.649369558477357e-07, "learning_rate": 6.4469294777042e-06, "loss": 2.3844, "step": 27147 }, { "crossentropy": 2.3347976207733154, "epoch": 0.9841937354988399, "grad_norm": 0.026298707351088524, "grad_norm_var": 5.157891586332966e-07, "learning_rate": 6.417464144736207e-06, "loss": 2.3788, "step": 27148 }, { "crossentropy": 2.4206719398498535, "epoch": 0.9842299883990719, "grad_norm": 0.02712414227426052, "grad_norm_var": 5.184602333539619e-07, "learning_rate": 6.388066257677627e-06, "loss": 2.4028, "step": 27149 }, { "crossentropy": 2.2979822158813477, "epoch": 0.984266241299304, "grad_norm": 0.02654852904379368, "grad_norm_var": 5.172861001422983e-07, "learning_rate": 6.3587358169264754e-06, "loss": 2.3878, "step": 27150 }, { "crossentropy": 2.4623939990997314, "epoch": 0.984302494199536, "grad_norm": 0.02591727487742901, "grad_norm_var": 5.290829780100921e-07, "learning_rate": 6.329472822879101e-06, "loss": 2.3615, "step": 27151 }, { "crossentropy": 2.365590810775757, "epoch": 0.984338747099768, "grad_norm": 0.0258508138358593, "grad_norm_var": 5.010066916953052e-07, "learning_rate": 6.300277275930188e-06, "loss": 2.3963, "step": 27152 }, { "crossentropy": 2.3059890270233154, "epoch": 0.984375, "grad_norm": 0.027246322482824326, "grad_norm_var": 5.178345632370161e-07, "learning_rate": 6.271149176474422e-06, "loss": 2.3527, "step": 27153 }, { "crossentropy": 2.462156057357788, "epoch": 0.984411252900232, "grad_norm": 0.02625948190689087, "grad_norm_var": 5.039693546694697e-07, "learning_rate": 6.242088524904821e-06, "loss": 2.4036, "step": 27154 }, { "crossentropy": 2.457747459411621, "epoch": 0.984447505800464, "grad_norm": 0.026547936722636223, "grad_norm_var": 5.045351471076598e-07, "learning_rate": 6.213095321613848e-06, "loss": 2.4654, "step": 27155 }, { "crossentropy": 2.410583972930908, "epoch": 0.984483758700696, "grad_norm": 0.026146739721298218, "grad_norm_var": 4.78647862966972e-07, "learning_rate": 6.184169566993969e-06, "loss": 2.4728, "step": 27156 }, { "crossentropy": 2.461615562438965, "epoch": 0.9845200116009281, "grad_norm": 0.02541891299188137, "grad_norm_var": 5.26683748847687e-07, "learning_rate": 6.15531126143487e-06, "loss": 2.4772, "step": 27157 }, { "crossentropy": 2.439946174621582, "epoch": 0.9845562645011601, "grad_norm": 0.025921525433659554, "grad_norm_var": 5.256372503328309e-07, "learning_rate": 6.126520405326796e-06, "loss": 2.4343, "step": 27158 }, { "crossentropy": 2.5135338306427, "epoch": 0.9845925174013921, "grad_norm": 0.026406731456518173, "grad_norm_var": 4.686483323578771e-07, "learning_rate": 6.097796999058325e-06, "loss": 2.488, "step": 27159 }, { "crossentropy": 2.409457206726074, "epoch": 0.9846287703016241, "grad_norm": 0.025215815752744675, "grad_norm_var": 5.232350848446051e-07, "learning_rate": 6.069141043018034e-06, "loss": 2.3586, "step": 27160 }, { "crossentropy": 2.4101295471191406, "epoch": 0.9846650232018561, "grad_norm": 0.025898221880197525, "grad_norm_var": 3.2424660442864396e-07, "learning_rate": 6.040552537592281e-06, "loss": 2.4396, "step": 27161 }, { "crossentropy": 2.4214441776275635, "epoch": 0.9847012761020881, "grad_norm": 0.02576562948524952, "grad_norm_var": 3.3183126673627604e-07, "learning_rate": 6.012031483167424e-06, "loss": 2.4234, "step": 27162 }, { "crossentropy": 2.348153829574585, "epoch": 0.9847375290023201, "grad_norm": 0.026612745597958565, "grad_norm_var": 3.038951034611218e-07, "learning_rate": 5.98357788012871e-06, "loss": 2.4441, "step": 27163 }, { "crossentropy": 2.3854517936706543, "epoch": 0.9847737819025522, "grad_norm": 0.02674666792154312, "grad_norm_var": 3.224089072448658e-07, "learning_rate": 5.955191728860832e-06, "loss": 2.4609, "step": 27164 }, { "crossentropy": 2.3714487552642822, "epoch": 0.9848100348027842, "grad_norm": 0.025158630684018135, "grad_norm_var": 3.2867482590570594e-07, "learning_rate": 5.92687302974626e-06, "loss": 2.366, "step": 27165 }, { "crossentropy": 2.554931879043579, "epoch": 0.9848462877030162, "grad_norm": 0.02684134617447853, "grad_norm_var": 3.51394051711556e-07, "learning_rate": 5.898621783168579e-06, "loss": 2.5564, "step": 27166 }, { "crossentropy": 2.551886558532715, "epoch": 0.9848825406032483, "grad_norm": 0.027338581159710884, "grad_norm_var": 4.3882094902362016e-07, "learning_rate": 5.870437989508593e-06, "loss": 2.4401, "step": 27167 }, { "crossentropy": 2.551410436630249, "epoch": 0.9849187935034803, "grad_norm": 0.02590753138065338, "grad_norm_var": 4.362981052494537e-07, "learning_rate": 5.8423216491471085e-06, "loss": 2.4926, "step": 27168 }, { "crossentropy": 2.429626703262329, "epoch": 0.9849550464037123, "grad_norm": 0.025369105860590935, "grad_norm_var": 3.982967596284521e-07, "learning_rate": 5.814272762464379e-06, "loss": 2.3882, "step": 27169 }, { "crossentropy": 2.3757801055908203, "epoch": 0.9849912993039444, "grad_norm": 0.025597456842660904, "grad_norm_var": 4.113666879567805e-07, "learning_rate": 5.786291329838433e-06, "loss": 2.3632, "step": 27170 }, { "crossentropy": 2.4239437580108643, "epoch": 0.9850275522041764, "grad_norm": 0.026156138628721237, "grad_norm_var": 3.952542434273633e-07, "learning_rate": 5.7583773516478586e-06, "loss": 2.4777, "step": 27171 }, { "crossentropy": 2.344212770462036, "epoch": 0.9850638051044084, "grad_norm": 0.02599506266415119, "grad_norm_var": 3.943587463285658e-07, "learning_rate": 5.73053082826902e-06, "loss": 2.3816, "step": 27172 }, { "crossentropy": 2.468355417251587, "epoch": 0.9851000580046404, "grad_norm": 0.02627820521593094, "grad_norm_var": 3.714242056171984e-07, "learning_rate": 5.7027517600788395e-06, "loss": 2.4943, "step": 27173 }, { "crossentropy": 2.2661385536193848, "epoch": 0.9851363109048724, "grad_norm": 0.02570432610809803, "grad_norm_var": 3.7883429390461686e-07, "learning_rate": 5.675040147452015e-06, "loss": 2.4152, "step": 27174 }, { "crossentropy": 2.401676654815674, "epoch": 0.9851725638051044, "grad_norm": 0.025548497214913368, "grad_norm_var": 3.8542303199250636e-07, "learning_rate": 5.6473959907626934e-06, "loss": 2.3514, "step": 27175 }, { "crossentropy": 2.5246152877807617, "epoch": 0.9852088167053364, "grad_norm": 0.025305187329649925, "grad_norm_var": 3.7647796216111875e-07, "learning_rate": 5.61981929038502e-06, "loss": 2.3397, "step": 27176 }, { "crossentropy": 2.3874621391296387, "epoch": 0.9852450696055685, "grad_norm": 0.02515641786158085, "grad_norm_var": 4.22317211132515e-07, "learning_rate": 5.592310046690363e-06, "loss": 2.3487, "step": 27177 }, { "crossentropy": 2.232835531234741, "epoch": 0.9852813225058005, "grad_norm": 0.025316812098026276, "grad_norm_var": 4.4699314908056416e-07, "learning_rate": 5.5648682600512035e-06, "loss": 2.2933, "step": 27178 }, { "crossentropy": 2.4727039337158203, "epoch": 0.9853175754060325, "grad_norm": 0.025703495368361473, "grad_norm_var": 4.1704971974151385e-07, "learning_rate": 5.5374939308378e-06, "loss": 2.4518, "step": 27179 }, { "crossentropy": 2.404229164123535, "epoch": 0.9853538283062645, "grad_norm": 0.027426617220044136, "grad_norm_var": 5.242711737689249e-07, "learning_rate": 5.510187059419858e-06, "loss": 2.4104, "step": 27180 }, { "crossentropy": 2.5006914138793945, "epoch": 0.9853900812064965, "grad_norm": 0.026239609345793724, "grad_norm_var": 4.868154351582906e-07, "learning_rate": 5.48294764616597e-06, "loss": 2.4175, "step": 27181 }, { "crossentropy": 2.4176392555236816, "epoch": 0.9854263341067285, "grad_norm": 0.025529975071549416, "grad_norm_var": 4.4592398771822695e-07, "learning_rate": 5.455775691444731e-06, "loss": 2.3922, "step": 27182 }, { "crossentropy": 2.499556303024292, "epoch": 0.9854625870069605, "grad_norm": 0.025607232004404068, "grad_norm_var": 3.0367691073012365e-07, "learning_rate": 5.428671195622514e-06, "loss": 2.4472, "step": 27183 }, { "crossentropy": 2.4808707237243652, "epoch": 0.9854988399071926, "grad_norm": 0.026290129870176315, "grad_norm_var": 3.18178415488884e-07, "learning_rate": 5.401634159065693e-06, "loss": 2.3952, "step": 27184 }, { "crossentropy": 2.3942155838012695, "epoch": 0.9855350928074246, "grad_norm": 0.02461983822286129, "grad_norm_var": 3.9896246799188635e-07, "learning_rate": 5.374664582139533e-06, "loss": 2.3841, "step": 27185 }, { "crossentropy": 2.615405559539795, "epoch": 0.9855713457076566, "grad_norm": 0.026396682485938072, "grad_norm_var": 4.1946594077888324e-07, "learning_rate": 5.347762465208183e-06, "loss": 2.543, "step": 27186 }, { "crossentropy": 2.3523547649383545, "epoch": 0.9856075986078886, "grad_norm": 0.02635030820965767, "grad_norm_var": 4.3027514000658755e-07, "learning_rate": 5.3209278086346905e-06, "loss": 2.3468, "step": 27187 }, { "crossentropy": 2.3460335731506348, "epoch": 0.9856438515081206, "grad_norm": 0.026386799290776253, "grad_norm_var": 4.4787270348175106e-07, "learning_rate": 5.29416061278154e-06, "loss": 2.3738, "step": 27188 }, { "crossentropy": 2.4941835403442383, "epoch": 0.9856801044083526, "grad_norm": 0.026719830930233, "grad_norm_var": 4.843191293195005e-07, "learning_rate": 5.267460878011221e-06, "loss": 2.4834, "step": 27189 }, { "crossentropy": 2.3608195781707764, "epoch": 0.9857163573085846, "grad_norm": 0.02580757439136505, "grad_norm_var": 4.823761869683525e-07, "learning_rate": 5.2408286046828905e-06, "loss": 2.3962, "step": 27190 }, { "crossentropy": 2.5182583332061768, "epoch": 0.9857526102088167, "grad_norm": 0.026710575446486473, "grad_norm_var": 5.122661489410467e-07, "learning_rate": 5.21426379315737e-06, "loss": 2.4362, "step": 27191 }, { "crossentropy": 2.615913152694702, "epoch": 0.9857888631090487, "grad_norm": 0.026775792241096497, "grad_norm_var": 5.16499640125613e-07, "learning_rate": 5.187766443792708e-06, "loss": 2.5142, "step": 27192 }, { "crossentropy": 2.2980144023895264, "epoch": 0.9858251160092807, "grad_norm": 0.026380637660622597, "grad_norm_var": 4.618856056465368e-07, "learning_rate": 5.161336556947505e-06, "loss": 2.3354, "step": 27193 }, { "crossentropy": 2.4016525745391846, "epoch": 0.9858613689095128, "grad_norm": 0.025443648919463158, "grad_norm_var": 4.489465162879162e-07, "learning_rate": 5.134974132978698e-06, "loss": 2.3694, "step": 27194 }, { "crossentropy": 2.319214105606079, "epoch": 0.9858976218097448, "grad_norm": 0.026030514389276505, "grad_norm_var": 4.361922890446969e-07, "learning_rate": 5.1086791722415595e-06, "loss": 2.4451, "step": 27195 }, { "crossentropy": 2.440666675567627, "epoch": 0.9859338747099768, "grad_norm": 0.025532837957143784, "grad_norm_var": 3.42974707968439e-07, "learning_rate": 5.0824516750913595e-06, "loss": 2.3325, "step": 27196 }, { "crossentropy": 2.5184743404388428, "epoch": 0.9859701276102089, "grad_norm": 0.026339860633015633, "grad_norm_var": 3.4611896239835025e-07, "learning_rate": 5.0562916418833705e-06, "loss": 2.5087, "step": 27197 }, { "crossentropy": 2.4604568481445312, "epoch": 0.9860063805104409, "grad_norm": 0.026881080120801926, "grad_norm_var": 3.6515435818672226e-07, "learning_rate": 5.030199072970087e-06, "loss": 2.3736, "step": 27198 }, { "crossentropy": 2.437439203262329, "epoch": 0.9860426334106729, "grad_norm": 0.025965405628085136, "grad_norm_var": 3.476297330724798e-07, "learning_rate": 5.004173968704006e-06, "loss": 2.4111, "step": 27199 }, { "crossentropy": 2.4787676334381104, "epoch": 0.9860788863109049, "grad_norm": 0.027013713493943214, "grad_norm_var": 3.9247647348112683e-07, "learning_rate": 4.9782163294370685e-06, "loss": 2.4621, "step": 27200 }, { "crossentropy": 2.4303958415985107, "epoch": 0.9861151392111369, "grad_norm": 0.02513759396970272, "grad_norm_var": 2.994766669517455e-07, "learning_rate": 4.952326155518993e-06, "loss": 2.3876, "step": 27201 }, { "crossentropy": 2.4469525814056396, "epoch": 0.9861513921113689, "grad_norm": 0.02667827717959881, "grad_norm_var": 3.102383343058738e-07, "learning_rate": 4.926503447300612e-06, "loss": 2.4325, "step": 27202 }, { "crossentropy": 2.2795090675354004, "epoch": 0.9861876450116009, "grad_norm": 0.025019964203238487, "grad_norm_var": 4.047714614129524e-07, "learning_rate": 4.900748205129424e-06, "loss": 2.33, "step": 27203 }, { "crossentropy": 2.3661935329437256, "epoch": 0.986223897911833, "grad_norm": 0.025561846792697906, "grad_norm_var": 4.241747672172608e-07, "learning_rate": 4.875060429354594e-06, "loss": 2.2987, "step": 27204 }, { "crossentropy": 2.3542027473449707, "epoch": 0.986260150812065, "grad_norm": 0.02622002549469471, "grad_norm_var": 4.0014412120842336e-07, "learning_rate": 4.849440120321958e-06, "loss": 2.4299, "step": 27205 }, { "crossentropy": 2.3061792850494385, "epoch": 0.986296403712297, "grad_norm": 0.025705089792609215, "grad_norm_var": 4.047104871219281e-07, "learning_rate": 4.823887278378458e-06, "loss": 2.3767, "step": 27206 }, { "crossentropy": 2.5655527114868164, "epoch": 0.986332656612529, "grad_norm": 0.026327352970838547, "grad_norm_var": 3.820423199263692e-07, "learning_rate": 4.798401903868266e-06, "loss": 2.5612, "step": 27207 }, { "crossentropy": 2.469817638397217, "epoch": 0.986368909512761, "grad_norm": 0.025756802409887314, "grad_norm_var": 3.5014275259874955e-07, "learning_rate": 4.77298399713666e-06, "loss": 2.4149, "step": 27208 }, { "crossentropy": 2.5672333240509033, "epoch": 0.986405162412993, "grad_norm": 0.026311401277780533, "grad_norm_var": 3.469254082304977e-07, "learning_rate": 4.747633558526143e-06, "loss": 2.5114, "step": 27209 }, { "crossentropy": 2.477346897125244, "epoch": 0.986441415313225, "grad_norm": 0.025883639231324196, "grad_norm_var": 3.266598021230225e-07, "learning_rate": 4.7223505883797755e-06, "loss": 2.3566, "step": 27210 }, { "crossentropy": 2.545474052429199, "epoch": 0.9864776682134571, "grad_norm": 0.02700020559132099, "grad_norm_var": 3.8642113312451625e-07, "learning_rate": 4.697135087038395e-06, "loss": 2.4008, "step": 27211 }, { "crossentropy": 2.2554659843444824, "epoch": 0.9865139211136891, "grad_norm": 0.02544291876256466, "grad_norm_var": 3.935278092903934e-07, "learning_rate": 4.671987054842841e-06, "loss": 2.4165, "step": 27212 }, { "crossentropy": 2.3400166034698486, "epoch": 0.9865501740139211, "grad_norm": 0.025919491425156593, "grad_norm_var": 3.89885227721727e-07, "learning_rate": 4.646906492132841e-06, "loss": 2.3776, "step": 27213 }, { "crossentropy": 2.360445737838745, "epoch": 0.9865864269141531, "grad_norm": 0.025943534448742867, "grad_norm_var": 3.411259692507121e-07, "learning_rate": 4.621893399247013e-06, "loss": 2.3546, "step": 27214 }, { "crossentropy": 2.4603490829467773, "epoch": 0.9866226798143851, "grad_norm": 0.02543291077017784, "grad_norm_var": 3.6080380223315183e-07, "learning_rate": 4.596947776523419e-06, "loss": 2.462, "step": 27215 }, { "crossentropy": 2.379154682159424, "epoch": 0.9866589327146171, "grad_norm": 0.025909140706062317, "grad_norm_var": 2.81823606165336e-07, "learning_rate": 4.5720696242984586e-06, "loss": 2.4143, "step": 27216 }, { "crossentropy": 2.502828598022461, "epoch": 0.9866951856148491, "grad_norm": 0.026494460180401802, "grad_norm_var": 2.6065428915204166e-07, "learning_rate": 4.5472589429085275e-06, "loss": 2.4178, "step": 27217 }, { "crossentropy": 2.3446004390716553, "epoch": 0.9867314385150812, "grad_norm": 0.02718832902610302, "grad_norm_var": 3.247115411006765e-07, "learning_rate": 4.52251573268947e-06, "loss": 2.4187, "step": 27218 }, { "crossentropy": 2.4352405071258545, "epoch": 0.9867676914153132, "grad_norm": 0.025681093335151672, "grad_norm_var": 2.6499385483214936e-07, "learning_rate": 4.4978399939743505e-06, "loss": 2.4358, "step": 27219 }, { "crossentropy": 2.486652135848999, "epoch": 0.9868039443155452, "grad_norm": 0.0263524129986763, "grad_norm_var": 2.5274372191437e-07, "learning_rate": 4.473231727096238e-06, "loss": 2.4706, "step": 27220 }, { "crossentropy": 2.436387300491333, "epoch": 0.9868401972157773, "grad_norm": 0.026692846789956093, "grad_norm_var": 2.7440586882937895e-07, "learning_rate": 4.448690932388755e-06, "loss": 2.4163, "step": 27221 }, { "crossentropy": 2.4708173274993896, "epoch": 0.9868764501160093, "grad_norm": 0.02677096426486969, "grad_norm_var": 2.853654167131921e-07, "learning_rate": 4.424217610182746e-06, "loss": 2.5076, "step": 27222 }, { "crossentropy": 2.36004376411438, "epoch": 0.9869127030162413, "grad_norm": 0.026206588372588158, "grad_norm_var": 2.8413320634841586e-07, "learning_rate": 4.399811760808503e-06, "loss": 2.3444, "step": 27223 }, { "crossentropy": 2.4629945755004883, "epoch": 0.9869489559164734, "grad_norm": 0.025711718946695328, "grad_norm_var": 2.8684423541458446e-07, "learning_rate": 4.375473384596318e-06, "loss": 2.3678, "step": 27224 }, { "crossentropy": 2.661458969116211, "epoch": 0.9869852088167054, "grad_norm": 0.02645096369087696, "grad_norm_var": 2.9043503954999845e-07, "learning_rate": 4.351202481873706e-06, "loss": 2.5534, "step": 27225 }, { "crossentropy": 2.553640842437744, "epoch": 0.9870214617169374, "grad_norm": 0.026255881413817406, "grad_norm_var": 2.8376205721726786e-07, "learning_rate": 4.326999052969849e-06, "loss": 2.611, "step": 27226 }, { "crossentropy": 2.4903109073638916, "epoch": 0.9870577146171694, "grad_norm": 0.027232853695750237, "grad_norm_var": 3.1147566072152004e-07, "learning_rate": 4.3028630982111514e-06, "loss": 2.4261, "step": 27227 }, { "crossentropy": 2.2440927028656006, "epoch": 0.9870939675174014, "grad_norm": 0.025975888594985008, "grad_norm_var": 2.732700074943121e-07, "learning_rate": 4.278794617923465e-06, "loss": 2.3062, "step": 27228 }, { "crossentropy": 2.3558855056762695, "epoch": 0.9871302204176334, "grad_norm": 0.026750050485134125, "grad_norm_var": 2.7826704595370657e-07, "learning_rate": 4.254793612431529e-06, "loss": 2.4386, "step": 27229 }, { "crossentropy": 2.4810853004455566, "epoch": 0.9871664733178654, "grad_norm": 0.026246221736073494, "grad_norm_var": 2.6897724043927797e-07, "learning_rate": 4.230860082060084e-06, "loss": 2.5157, "step": 27230 }, { "crossentropy": 2.384157657623291, "epoch": 0.9872027262180975, "grad_norm": 0.025299962610006332, "grad_norm_var": 2.860642519241733e-07, "learning_rate": 4.20699402713276e-06, "loss": 2.3858, "step": 27231 }, { "crossentropy": 2.4252750873565674, "epoch": 0.9872389791183295, "grad_norm": 0.026037508621811867, "grad_norm_var": 2.7995568688289155e-07, "learning_rate": 4.183195447970967e-06, "loss": 2.3866, "step": 27232 }, { "crossentropy": 2.3807871341705322, "epoch": 0.9872752320185615, "grad_norm": 0.025704247877001762, "grad_norm_var": 3.021012164737431e-07, "learning_rate": 4.159464344896113e-06, "loss": 2.4459, "step": 27233 }, { "crossentropy": 2.3500049114227295, "epoch": 0.9873114849187935, "grad_norm": 0.025513779371976852, "grad_norm_var": 2.7563482589327133e-07, "learning_rate": 4.13580071822961e-06, "loss": 2.3763, "step": 27234 }, { "crossentropy": 2.3995800018310547, "epoch": 0.9873477378190255, "grad_norm": 0.025456828996539116, "grad_norm_var": 2.937020693117668e-07, "learning_rate": 4.112204568290645e-06, "loss": 2.4137, "step": 27235 }, { "crossentropy": 2.5648536682128906, "epoch": 0.9873839907192575, "grad_norm": 0.02607828751206398, "grad_norm_var": 2.915914210245447e-07, "learning_rate": 4.088675895397853e-06, "loss": 2.5077, "step": 27236 }, { "crossentropy": 2.4823176860809326, "epoch": 0.9874202436194895, "grad_norm": 0.02592974156141281, "grad_norm_var": 2.726558140820802e-07, "learning_rate": 4.065214699869313e-06, "loss": 2.4507, "step": 27237 }, { "crossentropy": 2.335869312286377, "epoch": 0.9874564965197216, "grad_norm": 0.02573363110423088, "grad_norm_var": 2.472935215707863e-07, "learning_rate": 4.041820982020883e-06, "loss": 2.3591, "step": 27238 }, { "crossentropy": 2.5803382396698, "epoch": 0.9874927494199536, "grad_norm": 0.02592925727367401, "grad_norm_var": 2.458114731253754e-07, "learning_rate": 4.018494742170087e-06, "loss": 2.3995, "step": 27239 }, { "crossentropy": 2.427739143371582, "epoch": 0.9875290023201856, "grad_norm": 0.025786129757761955, "grad_norm_var": 2.4310711186924645e-07, "learning_rate": 3.995235980630563e-06, "loss": 2.4588, "step": 27240 }, { "crossentropy": 2.438462972640991, "epoch": 0.9875652552204176, "grad_norm": 0.02614910528063774, "grad_norm_var": 2.3161071690715415e-07, "learning_rate": 3.972044697717614e-06, "loss": 2.4587, "step": 27241 }, { "crossentropy": 2.435896158218384, "epoch": 0.9876015081206496, "grad_norm": 0.025824692100286484, "grad_norm_var": 2.2880508706404844e-07, "learning_rate": 3.9489208937437685e-06, "loss": 2.4057, "step": 27242 }, { "crossentropy": 2.358898162841797, "epoch": 0.9876377610208816, "grad_norm": 0.025987163186073303, "grad_norm_var": 1.1736983145654624e-07, "learning_rate": 3.925864569021553e-06, "loss": 2.3952, "step": 27243 }, { "crossentropy": 2.4005143642425537, "epoch": 0.9876740139211136, "grad_norm": 0.025744011625647545, "grad_norm_var": 1.1838884536729868e-07, "learning_rate": 3.902875723861832e-06, "loss": 2.3957, "step": 27244 }, { "crossentropy": 2.4223403930664062, "epoch": 0.9877102668213457, "grad_norm": 0.026628663763403893, "grad_norm_var": 1.0531975585917284e-07, "learning_rate": 3.879954358576576e-06, "loss": 2.4602, "step": 27245 }, { "crossentropy": 2.42209529876709, "epoch": 0.9877465197215777, "grad_norm": 0.026104401797056198, "grad_norm_var": 9.961544371461564e-08, "learning_rate": 3.857100473473874e-06, "loss": 2.3823, "step": 27246 }, { "crossentropy": 2.4066903591156006, "epoch": 0.9877827726218097, "grad_norm": 0.02704509347677231, "grad_norm_var": 1.575024645016539e-07, "learning_rate": 3.83431406886292e-06, "loss": 2.4778, "step": 27247 }, { "crossentropy": 2.3187620639801025, "epoch": 0.9878190255220418, "grad_norm": 0.025915829464793205, "grad_norm_var": 1.5746697376902105e-07, "learning_rate": 3.81159514505125e-06, "loss": 2.3897, "step": 27248 }, { "crossentropy": 2.3745696544647217, "epoch": 0.9878552784222738, "grad_norm": 0.025356139987707138, "grad_norm_var": 1.774069048966157e-07, "learning_rate": 3.7889437023463926e-06, "loss": 2.3511, "step": 27249 }, { "crossentropy": 2.463146686553955, "epoch": 0.9878915313225058, "grad_norm": 0.02640891633927822, "grad_norm_var": 1.755512930226924e-07, "learning_rate": 3.76635974105366e-06, "loss": 2.4679, "step": 27250 }, { "crossentropy": 2.460005283355713, "epoch": 0.9879277842227379, "grad_norm": 0.026750197634100914, "grad_norm_var": 1.8559252748974695e-07, "learning_rate": 3.7438432614783637e-06, "loss": 2.4488, "step": 27251 }, { "crossentropy": 2.380439281463623, "epoch": 0.9879640371229699, "grad_norm": 0.026574639603495598, "grad_norm_var": 2.0049954849812348e-07, "learning_rate": 3.7213942639247043e-06, "loss": 2.4263, "step": 27252 }, { "crossentropy": 2.4750893115997314, "epoch": 0.9880002900232019, "grad_norm": 0.026390518993139267, "grad_norm_var": 2.022815351933645e-07, "learning_rate": 3.6990127486957738e-06, "loss": 2.4575, "step": 27253 }, { "crossentropy": 2.226266622543335, "epoch": 0.9880365429234339, "grad_norm": 0.026826653629541397, "grad_norm_var": 2.1692234620586797e-07, "learning_rate": 3.6766987160935517e-06, "loss": 2.3625, "step": 27254 }, { "crossentropy": 2.549657106399536, "epoch": 0.9880727958236659, "grad_norm": 0.02571289800107479, "grad_norm_var": 2.2805762248760224e-07, "learning_rate": 3.6544521664200193e-06, "loss": 2.4401, "step": 27255 }, { "crossentropy": 2.298067808151245, "epoch": 0.9881090487238979, "grad_norm": 0.026102542877197266, "grad_norm_var": 2.1684109349818086e-07, "learning_rate": 3.6322730999749365e-06, "loss": 2.2554, "step": 27256 }, { "crossentropy": 2.410365104675293, "epoch": 0.98814530162413, "grad_norm": 0.02698519453406334, "grad_norm_var": 2.526179527601354e-07, "learning_rate": 3.6101615170586187e-06, "loss": 2.3999, "step": 27257 }, { "crossentropy": 2.232828140258789, "epoch": 0.988181554524362, "grad_norm": 0.026492413133382797, "grad_norm_var": 2.406291570423846e-07, "learning_rate": 3.588117417969161e-06, "loss": 2.3196, "step": 27258 }, { "crossentropy": 2.4551169872283936, "epoch": 0.988217807424594, "grad_norm": 0.026666952297091484, "grad_norm_var": 2.398799799481951e-07, "learning_rate": 3.566140803004658e-06, "loss": 2.4, "step": 27259 }, { "crossentropy": 2.444140911102295, "epoch": 0.988254060324826, "grad_norm": 0.025259705260396004, "grad_norm_var": 2.9409476361841095e-07, "learning_rate": 3.5442316724615398e-06, "loss": 2.3676, "step": 27260 }, { "crossentropy": 2.406233549118042, "epoch": 0.988290313225058, "grad_norm": 0.025571636855602264, "grad_norm_var": 3.2131181814710044e-07, "learning_rate": 3.5223900266362352e-06, "loss": 2.4172, "step": 27261 }, { "crossentropy": 2.4419138431549072, "epoch": 0.98832656612529, "grad_norm": 0.02547276020050049, "grad_norm_var": 3.5937147258786825e-07, "learning_rate": 3.5006158658235086e-06, "loss": 2.3593, "step": 27262 }, { "crossentropy": 2.3316240310668945, "epoch": 0.988362819025522, "grad_norm": 0.026783352717757225, "grad_norm_var": 3.3488486630243546e-07, "learning_rate": 3.4789091903170143e-06, "loss": 2.3316, "step": 27263 }, { "crossentropy": 2.473482370376587, "epoch": 0.988399071925754, "grad_norm": 0.026532771065831184, "grad_norm_var": 3.349361851261052e-07, "learning_rate": 3.4572700004109613e-06, "loss": 2.5058, "step": 27264 }, { "crossentropy": 2.429661750793457, "epoch": 0.9884353248259861, "grad_norm": 0.02556810900568962, "grad_norm_var": 3.126806999976967e-07, "learning_rate": 3.435698296396228e-06, "loss": 2.3628, "step": 27265 }, { "crossentropy": 2.3919854164123535, "epoch": 0.9884715777262181, "grad_norm": 0.025501836091279984, "grad_norm_var": 3.4563570080746394e-07, "learning_rate": 3.414194078565358e-06, "loss": 2.3574, "step": 27266 }, { "crossentropy": 2.391944408416748, "epoch": 0.9885078306264501, "grad_norm": 0.026616547256708145, "grad_norm_var": 3.3693884531014555e-07, "learning_rate": 3.392757347208675e-06, "loss": 2.5028, "step": 27267 }, { "crossentropy": 2.342378616333008, "epoch": 0.9885440835266821, "grad_norm": 0.026465347036719322, "grad_norm_var": 3.3209717742293164e-07, "learning_rate": 3.3713881026148365e-06, "loss": 2.352, "step": 27268 }, { "crossentropy": 2.388719081878662, "epoch": 0.9885803364269141, "grad_norm": 0.02558237873017788, "grad_norm_var": 3.5069778518251733e-07, "learning_rate": 3.350086345073611e-06, "loss": 2.3696, "step": 27269 }, { "crossentropy": 2.4114503860473633, "epoch": 0.9886165893271461, "grad_norm": 0.026194168254733086, "grad_norm_var": 3.17272413832316e-07, "learning_rate": 3.3288520748719908e-06, "loss": 2.5187, "step": 27270 }, { "crossentropy": 2.3424432277679443, "epoch": 0.9886528422273781, "grad_norm": 0.026384763419628143, "grad_norm_var": 3.11319380178832e-07, "learning_rate": 3.307685292296969e-06, "loss": 2.3171, "step": 27271 }, { "crossentropy": 2.402249574661255, "epoch": 0.9886890951276102, "grad_norm": 0.026087353006005287, "grad_norm_var": 3.11402129120573e-07, "learning_rate": 3.286585997633873e-06, "loss": 2.3731, "step": 27272 }, { "crossentropy": 2.3718862533569336, "epoch": 0.9887253480278422, "grad_norm": 0.025721576064825058, "grad_norm_var": 2.6801068016812e-07, "learning_rate": 3.265554191168585e-06, "loss": 2.3657, "step": 27273 }, { "crossentropy": 2.441969156265259, "epoch": 0.9887616009280742, "grad_norm": 0.02550625614821911, "grad_norm_var": 2.7145596465069584e-07, "learning_rate": 3.2445898731853217e-06, "loss": 2.4467, "step": 27274 }, { "crossentropy": 2.4219610691070557, "epoch": 0.9887978538283063, "grad_norm": 0.026167338714003563, "grad_norm_var": 2.422759439448293e-07, "learning_rate": 3.2236930439660805e-06, "loss": 2.4304, "step": 27275 }, { "crossentropy": 2.2838189601898193, "epoch": 0.9888341067285383, "grad_norm": 0.02632395550608635, "grad_norm_var": 2.131976223303129e-07, "learning_rate": 3.202863703793968e-06, "loss": 2.3017, "step": 27276 }, { "crossentropy": 2.554750919342041, "epoch": 0.9888703596287703, "grad_norm": 0.02635592594742775, "grad_norm_var": 2.0370908247630917e-07, "learning_rate": 3.1821018529498703e-06, "loss": 2.4485, "step": 27277 }, { "crossentropy": 2.3034355640411377, "epoch": 0.9889066125290024, "grad_norm": 0.02590499073266983, "grad_norm_var": 1.8044590587388646e-07, "learning_rate": 3.161407491714674e-06, "loss": 2.4088, "step": 27278 }, { "crossentropy": 2.4618539810180664, "epoch": 0.9889428654292344, "grad_norm": 0.02576996572315693, "grad_norm_var": 1.5311341179282076e-07, "learning_rate": 3.1407806203676005e-06, "loss": 2.4282, "step": 27279 }, { "crossentropy": 2.233651876449585, "epoch": 0.9889791183294664, "grad_norm": 0.025358855724334717, "grad_norm_var": 1.6253712197924982e-07, "learning_rate": 3.1202212391873153e-06, "loss": 2.2021, "step": 27280 }, { "crossentropy": 2.4690959453582764, "epoch": 0.9890153712296984, "grad_norm": 0.02589605003595352, "grad_norm_var": 1.5171488886010256e-07, "learning_rate": 3.0997293484513744e-06, "loss": 2.4326, "step": 27281 }, { "crossentropy": 2.4455008506774902, "epoch": 0.9890516241299304, "grad_norm": 0.02641187235713005, "grad_norm_var": 1.442627616088232e-07, "learning_rate": 3.0793049484367785e-06, "loss": 2.4641, "step": 27282 }, { "crossentropy": 2.4281344413757324, "epoch": 0.9890878770301624, "grad_norm": 0.026710430160164833, "grad_norm_var": 1.519467120721906e-07, "learning_rate": 3.058948039418863e-06, "loss": 2.3711, "step": 27283 }, { "crossentropy": 2.4134116172790527, "epoch": 0.9891241299303944, "grad_norm": 0.030734160915017128, "grad_norm_var": 1.5258085769518277e-06, "learning_rate": 3.0386586216735178e-06, "loss": 2.3824, "step": 27284 }, { "crossentropy": 2.324469566345215, "epoch": 0.9891603828306265, "grad_norm": 0.025945447385311127, "grad_norm_var": 1.4983697636584593e-06, "learning_rate": 3.0184366954738586e-06, "loss": 2.3197, "step": 27285 }, { "crossentropy": 2.4171195030212402, "epoch": 0.9891966357308585, "grad_norm": 0.026598455384373665, "grad_norm_var": 1.500612665954641e-06, "learning_rate": 2.9982822610941097e-06, "loss": 2.4215, "step": 27286 }, { "crossentropy": 2.3525421619415283, "epoch": 0.9892328886310905, "grad_norm": 0.026289155706763268, "grad_norm_var": 1.500961825606252e-06, "learning_rate": 2.9781953188051657e-06, "loss": 2.3548, "step": 27287 }, { "crossentropy": 2.3393666744232178, "epoch": 0.9892691415313225, "grad_norm": 0.025994285941123962, "grad_norm_var": 1.5049033277962034e-06, "learning_rate": 2.9581758688790318e-06, "loss": 2.4264, "step": 27288 }, { "crossentropy": 2.5242531299591064, "epoch": 0.9893053944315545, "grad_norm": 0.025954890996217728, "grad_norm_var": 1.488583641632993e-06, "learning_rate": 2.9382239115860465e-06, "loss": 2.5258, "step": 27289 }, { "crossentropy": 2.5748541355133057, "epoch": 0.9893416473317865, "grad_norm": 0.026232946664094925, "grad_norm_var": 1.4378863191911542e-06, "learning_rate": 2.9183394471954394e-06, "loss": 2.4306, "step": 27290 }, { "crossentropy": 2.5267364978790283, "epoch": 0.9893779002320185, "grad_norm": 0.025784116238355637, "grad_norm_var": 1.45974749133416e-06, "learning_rate": 2.898522475975884e-06, "loss": 2.4502, "step": 27291 }, { "crossentropy": 2.5230798721313477, "epoch": 0.9894141531322506, "grad_norm": 0.026242610067129135, "grad_norm_var": 1.4608946708866694e-06, "learning_rate": 2.8787729981955002e-06, "loss": 2.4752, "step": 27292 }, { "crossentropy": 2.343491554260254, "epoch": 0.9894504060324826, "grad_norm": 0.026103906333446503, "grad_norm_var": 1.4658919928572558e-06, "learning_rate": 2.8590910141207404e-06, "loss": 2.2886, "step": 27293 }, { "crossentropy": 2.433337450027466, "epoch": 0.9894866589327146, "grad_norm": 0.02606939524412155, "grad_norm_var": 1.457371379956597e-06, "learning_rate": 2.8394765240175036e-06, "loss": 2.397, "step": 27294 }, { "crossentropy": 2.357158899307251, "epoch": 0.9895229118329466, "grad_norm": 0.025857336819171906, "grad_norm_var": 1.4507298587494258e-06, "learning_rate": 2.8199295281505777e-06, "loss": 2.3862, "step": 27295 }, { "crossentropy": 2.3698904514312744, "epoch": 0.9895591647331786, "grad_norm": 0.02633068896830082, "grad_norm_var": 1.3765994307918336e-06, "learning_rate": 2.800450026783641e-06, "loss": 2.3475, "step": 27296 }, { "crossentropy": 2.4960289001464844, "epoch": 0.9895954176334106, "grad_norm": 0.026540856808423996, "grad_norm_var": 1.3551977682386828e-06, "learning_rate": 2.781038020180371e-06, "loss": 2.4384, "step": 27297 }, { "crossentropy": 2.3636739253997803, "epoch": 0.9896316705336426, "grad_norm": 0.025415875017642975, "grad_norm_var": 1.4272463758482061e-06, "learning_rate": 2.7616935086027807e-06, "loss": 2.3697, "step": 27298 }, { "crossentropy": 2.470501661300659, "epoch": 0.9896679234338747, "grad_norm": 0.02576678991317749, "grad_norm_var": 1.4470233292142772e-06, "learning_rate": 2.7424164923117724e-06, "loss": 2.4111, "step": 27299 }, { "crossentropy": 2.4982423782348633, "epoch": 0.9897041763341067, "grad_norm": 0.025650963187217712, "grad_norm_var": 1.0159945051875912e-07, "learning_rate": 2.723206971568248e-06, "loss": 2.5719, "step": 27300 }, { "crossentropy": 2.294949531555176, "epoch": 0.9897404292343387, "grad_norm": 0.02640022709965706, "grad_norm_var": 1.0827063441537904e-07, "learning_rate": 2.704064946631446e-06, "loss": 2.3424, "step": 27301 }, { "crossentropy": 2.5316994190216064, "epoch": 0.9897766821345708, "grad_norm": 0.025579549372196198, "grad_norm_var": 1.0231860435236384e-07, "learning_rate": 2.684990417760047e-06, "loss": 2.458, "step": 27302 }, { "crossentropy": 2.294508695602417, "epoch": 0.9898129350348028, "grad_norm": 0.025655485689640045, "grad_norm_var": 1.041120393987695e-07, "learning_rate": 2.665983385211068e-06, "loss": 2.3812, "step": 27303 }, { "crossentropy": 2.5387728214263916, "epoch": 0.9898491879350348, "grad_norm": 0.027195872738957405, "grad_norm_var": 1.9764106927670084e-07, "learning_rate": 2.6470438492426364e-06, "loss": 2.5636, "step": 27304 }, { "crossentropy": 2.5094449520111084, "epoch": 0.9898854408352669, "grad_norm": 0.026292169466614723, "grad_norm_var": 2.005257370168382e-07, "learning_rate": 2.6281718101089925e-06, "loss": 2.4916, "step": 27305 }, { "crossentropy": 2.3317859172821045, "epoch": 0.9899216937354989, "grad_norm": 0.026318129152059555, "grad_norm_var": 2.0283079335396172e-07, "learning_rate": 2.6093672680654877e-06, "loss": 2.3809, "step": 27306 }, { "crossentropy": 2.3834404945373535, "epoch": 0.9899579466357309, "grad_norm": 0.027130192145705223, "grad_norm_var": 2.6382437559215277e-07, "learning_rate": 2.5906302233663636e-06, "loss": 2.3408, "step": 27307 }, { "crossentropy": 2.3142669200897217, "epoch": 0.9899941995359629, "grad_norm": 0.025747904554009438, "grad_norm_var": 2.736301739030318e-07, "learning_rate": 2.571960676264751e-06, "loss": 2.3247, "step": 27308 }, { "crossentropy": 2.512576103210449, "epoch": 0.9900304524361949, "grad_norm": 0.025686373934149742, "grad_norm_var": 2.858928714187988e-07, "learning_rate": 2.5533586270132247e-06, "loss": 2.4363, "step": 27309 }, { "crossentropy": 2.4664909839630127, "epoch": 0.9900667053364269, "grad_norm": 0.026256373152136803, "grad_norm_var": 2.8725601532301966e-07, "learning_rate": 2.534824075861586e-06, "loss": 2.4011, "step": 27310 }, { "crossentropy": 2.375185966491699, "epoch": 0.990102958236659, "grad_norm": 0.025654830038547516, "grad_norm_var": 2.9675054533938257e-07, "learning_rate": 2.516357023061855e-06, "loss": 2.28, "step": 27311 }, { "crossentropy": 2.545051097869873, "epoch": 0.990139211136891, "grad_norm": 0.025711901485919952, "grad_norm_var": 3.017635689189201e-07, "learning_rate": 2.4979574688621664e-06, "loss": 2.4201, "step": 27312 }, { "crossentropy": 2.289379358291626, "epoch": 0.990175464037123, "grad_norm": 0.0257854126393795, "grad_norm_var": 2.892711417969961e-07, "learning_rate": 2.4796254135117656e-06, "loss": 2.2987, "step": 27313 }, { "crossentropy": 2.5513134002685547, "epoch": 0.990211716937355, "grad_norm": 0.026077156886458397, "grad_norm_var": 2.637322415590387e-07, "learning_rate": 2.4613608572576772e-06, "loss": 2.4474, "step": 27314 }, { "crossentropy": 2.4593939781188965, "epoch": 0.990247969837587, "grad_norm": 0.02750631980597973, "grad_norm_var": 3.855831409963391e-07, "learning_rate": 2.443163800347481e-06, "loss": 2.4635, "step": 27315 }, { "crossentropy": 2.3225295543670654, "epoch": 0.990284222737819, "grad_norm": 0.025897739455103874, "grad_norm_var": 3.724574643127008e-07, "learning_rate": 2.425034243025981e-06, "loss": 2.3908, "step": 27316 }, { "crossentropy": 2.4439144134521484, "epoch": 0.990320475638051, "grad_norm": 0.027165060862898827, "grad_norm_var": 4.31376746860603e-07, "learning_rate": 2.406972185538536e-06, "loss": 2.4837, "step": 27317 }, { "crossentropy": 2.4514169692993164, "epoch": 0.990356728538283, "grad_norm": 0.025807082653045654, "grad_norm_var": 4.1491626535430003e-07, "learning_rate": 2.3889776281293962e-06, "loss": 2.4155, "step": 27318 }, { "crossentropy": 2.431123733520508, "epoch": 0.9903929814385151, "grad_norm": 0.025813555344939232, "grad_norm_var": 4.0409546060760486e-07, "learning_rate": 2.3710505710411446e-06, "loss": 2.4371, "step": 27319 }, { "crossentropy": 2.314892530441284, "epoch": 0.9904292343387471, "grad_norm": 0.026360543444752693, "grad_norm_var": 3.426784325235062e-07, "learning_rate": 2.3531910145158096e-06, "loss": 2.3714, "step": 27320 }, { "crossentropy": 2.4556398391723633, "epoch": 0.9904654872389791, "grad_norm": 0.0256895013153553, "grad_norm_var": 3.5802660588747184e-07, "learning_rate": 2.3353989587948654e-06, "loss": 2.3664, "step": 27321 }, { "crossentropy": 2.4748713970184326, "epoch": 0.9905017401392111, "grad_norm": 0.02502148598432541, "grad_norm_var": 4.3628803484263337e-07, "learning_rate": 2.317674404118675e-06, "loss": 2.4709, "step": 27322 }, { "crossentropy": 2.5079402923583984, "epoch": 0.9905379930394431, "grad_norm": 0.026750952005386353, "grad_norm_var": 3.9227298304979286e-07, "learning_rate": 2.300017350726491e-06, "loss": 2.3918, "step": 27323 }, { "crossentropy": 2.432133674621582, "epoch": 0.9905742459396751, "grad_norm": 0.02579602599143982, "grad_norm_var": 3.9042639911375753e-07, "learning_rate": 2.282427798857012e-06, "loss": 2.3852, "step": 27324 }, { "crossentropy": 2.4366753101348877, "epoch": 0.9906104988399071, "grad_norm": 0.02530263550579548, "grad_norm_var": 4.18811435575391e-07, "learning_rate": 2.26490574874727e-06, "loss": 2.3572, "step": 27325 }, { "crossentropy": 2.4878451824188232, "epoch": 0.9906467517401392, "grad_norm": 0.025982853025197983, "grad_norm_var": 4.1549729952957265e-07, "learning_rate": 2.2474512006342986e-06, "loss": 2.4768, "step": 27326 }, { "crossentropy": 2.3858230113983154, "epoch": 0.9906830046403712, "grad_norm": 0.025726525112986565, "grad_norm_var": 4.1232594990136864e-07, "learning_rate": 2.230064154754574e-06, "loss": 2.4189, "step": 27327 }, { "crossentropy": 2.2626121044158936, "epoch": 0.9907192575406032, "grad_norm": 0.02579846791923046, "grad_norm_var": 4.0918425241290705e-07, "learning_rate": 2.212744611341799e-06, "loss": 2.402, "step": 27328 }, { "crossentropy": 2.3285810947418213, "epoch": 0.9907555104408353, "grad_norm": 0.025227172300219536, "grad_norm_var": 4.468725382834766e-07, "learning_rate": 2.195492570630231e-06, "loss": 2.384, "step": 27329 }, { "crossentropy": 2.447573661804199, "epoch": 0.9907917633410673, "grad_norm": 0.025281893089413643, "grad_norm_var": 4.777091962527662e-07, "learning_rate": 2.178308032853016e-06, "loss": 2.4399, "step": 27330 }, { "crossentropy": 2.4258477687835693, "epoch": 0.9908280162412993, "grad_norm": 0.02594517357647419, "grad_norm_var": 3.0514131027960476e-07, "learning_rate": 2.1611909982421908e-06, "loss": 2.4526, "step": 27331 }, { "crossentropy": 2.394331932067871, "epoch": 0.9908642691415314, "grad_norm": 0.026631787419319153, "grad_norm_var": 3.4369426436957745e-07, "learning_rate": 2.144141467028682e-06, "loss": 2.3691, "step": 27332 }, { "crossentropy": 2.4482855796813965, "epoch": 0.9909005220417634, "grad_norm": 0.025379054248332977, "grad_norm_var": 2.4032602618679715e-07, "learning_rate": 2.127159439443416e-06, "loss": 2.4204, "step": 27333 }, { "crossentropy": 2.3226819038391113, "epoch": 0.9909367749419954, "grad_norm": 0.025186898186802864, "grad_norm_var": 2.623052085432096e-07, "learning_rate": 2.110244915715653e-06, "loss": 2.3861, "step": 27334 }, { "crossentropy": 2.537107229232788, "epoch": 0.9909730278422274, "grad_norm": 0.025327999144792557, "grad_norm_var": 2.7249909862724536e-07, "learning_rate": 2.0933978960735455e-06, "loss": 2.3773, "step": 27335 }, { "crossentropy": 2.479201316833496, "epoch": 0.9910092807424594, "grad_norm": 0.02580397203564644, "grad_norm_var": 2.438104338939404e-07, "learning_rate": 2.076618380744133e-06, "loss": 2.4451, "step": 27336 }, { "crossentropy": 2.2448010444641113, "epoch": 0.9910455336426914, "grad_norm": 0.024417169392108917, "grad_norm_var": 3.4308270336993495e-07, "learning_rate": 2.0599063699550115e-06, "loss": 2.2523, "step": 27337 }, { "crossentropy": 2.406569242477417, "epoch": 0.9910817865429234, "grad_norm": 0.025960784405469894, "grad_norm_var": 3.259283665196181e-07, "learning_rate": 2.043261863931556e-06, "loss": 2.4322, "step": 27338 }, { "crossentropy": 2.3424723148345947, "epoch": 0.9911180394431555, "grad_norm": 0.02629433199763298, "grad_norm_var": 2.7238503337022264e-07, "learning_rate": 2.0266848628980315e-06, "loss": 2.4533, "step": 27339 }, { "crossentropy": 2.4871232509613037, "epoch": 0.9911542923433875, "grad_norm": 0.024949951097369194, "grad_norm_var": 2.9827414387278754e-07, "learning_rate": 2.010175367079259e-06, "loss": 2.4297, "step": 27340 }, { "crossentropy": 2.3947227001190186, "epoch": 0.9911905452436195, "grad_norm": 0.02529023587703705, "grad_norm_var": 2.9873577146835527e-07, "learning_rate": 1.993733376697837e-06, "loss": 2.4732, "step": 27341 }, { "crossentropy": 2.3685057163238525, "epoch": 0.9912267981438515, "grad_norm": 0.025013286620378494, "grad_norm_var": 3.0479853856030654e-07, "learning_rate": 1.977358891975811e-06, "loss": 2.4256, "step": 27342 }, { "crossentropy": 2.5680718421936035, "epoch": 0.9912630510440835, "grad_norm": 0.025420455262064934, "grad_norm_var": 3.020077541188015e-07, "learning_rate": 1.9610519131341155e-06, "loss": 2.4767, "step": 27343 }, { "crossentropy": 2.5303685665130615, "epoch": 0.9912993039443155, "grad_norm": 0.02577643282711506, "grad_norm_var": 3.011480933878905e-07, "learning_rate": 1.944812440393129e-06, "loss": 2.4239, "step": 27344 }, { "crossentropy": 2.616270065307617, "epoch": 0.9913355568445475, "grad_norm": 0.026575317606329918, "grad_norm_var": 3.667493958146254e-07, "learning_rate": 1.928640473972676e-06, "loss": 2.5526, "step": 27345 }, { "crossentropy": 2.505084753036499, "epoch": 0.9913718097447796, "grad_norm": 0.026675386354327202, "grad_norm_var": 4.330186649677357e-07, "learning_rate": 1.91253601409036e-06, "loss": 2.4459, "step": 27346 }, { "crossentropy": 2.2617485523223877, "epoch": 0.9914080626450116, "grad_norm": 0.025044100359082222, "grad_norm_var": 4.5016539715693873e-07, "learning_rate": 1.8964990609643406e-06, "loss": 2.2124, "step": 27347 }, { "crossentropy": 2.500836133956909, "epoch": 0.9914443155452436, "grad_norm": 0.025729741901159286, "grad_norm_var": 3.7803111360919644e-07, "learning_rate": 1.8805296148111106e-06, "loss": 2.4542, "step": 27348 }, { "crossentropy": 2.5718202590942383, "epoch": 0.9914805684454756, "grad_norm": 0.02549479529261589, "grad_norm_var": 3.761867882570337e-07, "learning_rate": 1.8646276758460534e-06, "loss": 2.4172, "step": 27349 }, { "crossentropy": 2.405498743057251, "epoch": 0.9915168213457076, "grad_norm": 0.025677867233753204, "grad_norm_var": 3.6682474695339435e-07, "learning_rate": 1.8487932442839973e-06, "loss": 2.4743, "step": 27350 }, { "crossentropy": 2.450922966003418, "epoch": 0.9915530742459396, "grad_norm": 0.025647219270467758, "grad_norm_var": 3.620106676503754e-07, "learning_rate": 1.833026320339215e-06, "loss": 2.4249, "step": 27351 }, { "crossentropy": 2.5075604915618896, "epoch": 0.9915893271461717, "grad_norm": 0.027005240321159363, "grad_norm_var": 4.831587462172358e-07, "learning_rate": 1.8173269042243146e-06, "loss": 2.4405, "step": 27352 }, { "crossentropy": 2.378122568130493, "epoch": 0.9916255800464037, "grad_norm": 0.02570056915283203, "grad_norm_var": 3.6902057950754843e-07, "learning_rate": 1.8016949961513484e-06, "loss": 2.3868, "step": 27353 }, { "crossentropy": 2.305535316467285, "epoch": 0.9916618329466357, "grad_norm": 0.025724872946739197, "grad_norm_var": 3.663714924472446e-07, "learning_rate": 1.786130596331814e-06, "loss": 2.3407, "step": 27354 }, { "crossentropy": 2.4865057468414307, "epoch": 0.9916980858468677, "grad_norm": 0.0267331525683403, "grad_norm_var": 4.1018283403049016e-07, "learning_rate": 1.7706337049749888e-06, "loss": 2.4682, "step": 27355 }, { "crossentropy": 2.261644124984741, "epoch": 0.9917343387470998, "grad_norm": 0.025513842701911926, "grad_norm_var": 3.6774895652313434e-07, "learning_rate": 1.7552043222912594e-06, "loss": 2.2959, "step": 27356 }, { "crossentropy": 2.3850114345550537, "epoch": 0.9917705916473318, "grad_norm": 0.026220014318823814, "grad_norm_var": 3.5685967073913424e-07, "learning_rate": 1.7398424484887932e-06, "loss": 2.4438, "step": 27357 }, { "crossentropy": 2.4322917461395264, "epoch": 0.9918068445475638, "grad_norm": 0.02621079422533512, "grad_norm_var": 3.093744848087308e-07, "learning_rate": 1.7245480837746464e-06, "loss": 2.4794, "step": 27358 }, { "crossentropy": 2.3721935749053955, "epoch": 0.9918430974477959, "grad_norm": 0.04458559304475784, "grad_norm_var": 2.1920625117465875e-05, "learning_rate": 1.7093212283558756e-06, "loss": 2.3777, "step": 27359 }, { "crossentropy": 2.505495309829712, "epoch": 0.9918793503480279, "grad_norm": 0.025920594111084938, "grad_norm_var": 2.1895624180741774e-05, "learning_rate": 1.6941618824373172e-06, "loss": 2.5079, "step": 27360 }, { "crossentropy": 2.414372682571411, "epoch": 0.9919156032482599, "grad_norm": 0.024907603859901428, "grad_norm_var": 2.21980623009586e-05, "learning_rate": 1.6790700462249174e-06, "loss": 2.3875, "step": 27361 }, { "crossentropy": 2.318732976913452, "epoch": 0.9919518561484919, "grad_norm": 0.026456179097294807, "grad_norm_var": 2.2211998878042178e-05, "learning_rate": 1.6640457199218474e-06, "loss": 2.4162, "step": 27362 }, { "crossentropy": 2.589491367340088, "epoch": 0.9919881090487239, "grad_norm": 0.02623339183628559, "grad_norm_var": 2.1984577719561997e-05, "learning_rate": 1.649088903730722e-06, "loss": 2.5446, "step": 27363 }, { "crossentropy": 2.301842212677002, "epoch": 0.9920243619489559, "grad_norm": 0.02742733433842659, "grad_norm_var": 2.1852255224606694e-05, "learning_rate": 1.6341995978541578e-06, "loss": 2.375, "step": 27364 }, { "crossentropy": 2.511514663696289, "epoch": 0.992060614849188, "grad_norm": 0.025778260082006454, "grad_norm_var": 2.1792216546660094e-05, "learning_rate": 1.619377802493105e-06, "loss": 2.4534, "step": 27365 }, { "crossentropy": 2.465364456176758, "epoch": 0.99209686774942, "grad_norm": 0.026295451447367668, "grad_norm_var": 2.1687923187195304e-05, "learning_rate": 1.6046235178474034e-06, "loss": 2.2876, "step": 27366 }, { "crossentropy": 2.4882965087890625, "epoch": 0.992133120649652, "grad_norm": 0.026308899745345116, "grad_norm_var": 2.1571897508020293e-05, "learning_rate": 1.5899367441168932e-06, "loss": 2.455, "step": 27367 }, { "crossentropy": 2.3152363300323486, "epoch": 0.992169373549884, "grad_norm": 0.025231575593352318, "grad_norm_var": 2.184150096313948e-05, "learning_rate": 1.5753174814991944e-06, "loss": 2.3659, "step": 27368 }, { "crossentropy": 2.4310193061828613, "epoch": 0.992205626450116, "grad_norm": 0.02869819849729538, "grad_norm_var": 2.1802611654968735e-05, "learning_rate": 1.5607657301924816e-06, "loss": 2.324, "step": 27369 }, { "crossentropy": 2.496459484100342, "epoch": 0.992241879350348, "grad_norm": 0.027105065062642097, "grad_norm_var": 2.1615177535500697e-05, "learning_rate": 1.5462814903932642e-06, "loss": 2.5166, "step": 27370 }, { "crossentropy": 2.5486233234405518, "epoch": 0.99227813225058, "grad_norm": 0.025677237659692764, "grad_norm_var": 2.178953435189635e-05, "learning_rate": 1.5318647622969418e-06, "loss": 2.5495, "step": 27371 }, { "crossentropy": 2.3727569580078125, "epoch": 0.992314385150812, "grad_norm": 0.026289932429790497, "grad_norm_var": 2.16309023912822e-05, "learning_rate": 1.517515546097803e-06, "loss": 2.4266, "step": 27372 }, { "crossentropy": 2.283945083618164, "epoch": 0.9923506380510441, "grad_norm": 0.0252460278570652, "grad_norm_var": 2.1851110947496085e-05, "learning_rate": 1.503233841990137e-06, "loss": 2.3379, "step": 27373 }, { "crossentropy": 2.483436346054077, "epoch": 0.9923868909512761, "grad_norm": 0.026242397725582123, "grad_norm_var": 2.184616963356264e-05, "learning_rate": 1.4890196501671228e-06, "loss": 2.4222, "step": 27374 }, { "crossentropy": 2.4446001052856445, "epoch": 0.9924231438515081, "grad_norm": 0.0255720354616642, "grad_norm_var": 8.735716214953396e-07, "learning_rate": 1.4748729708202735e-06, "loss": 2.4765, "step": 27375 }, { "crossentropy": 2.3260512351989746, "epoch": 0.9924593967517401, "grad_norm": 0.02570490725338459, "grad_norm_var": 8.848562329000307e-07, "learning_rate": 1.4607938041405478e-06, "loss": 2.4136, "step": 27376 }, { "crossentropy": 2.5042970180511475, "epoch": 0.9924956496519721, "grad_norm": 0.026340818032622337, "grad_norm_var": 7.665715158514455e-07, "learning_rate": 1.4467821503183488e-06, "loss": 2.4177, "step": 27377 }, { "crossentropy": 2.5010201930999756, "epoch": 0.9925319025522041, "grad_norm": 0.026230184361338615, "grad_norm_var": 7.646953962240262e-07, "learning_rate": 1.4328380095435245e-06, "loss": 2.4169, "step": 27378 }, { "crossentropy": 2.385571241378784, "epoch": 0.9925681554524362, "grad_norm": 0.026250425726175308, "grad_norm_var": 7.646216261618463e-07, "learning_rate": 1.4189613820031478e-06, "loss": 2.4123, "step": 27379 }, { "crossentropy": 2.571833610534668, "epoch": 0.9926044083526682, "grad_norm": 0.02626354806125164, "grad_norm_var": 6.704499699545947e-07, "learning_rate": 1.4051522678859563e-06, "loss": 2.4574, "step": 27380 }, { "crossentropy": 2.4018373489379883, "epoch": 0.9926406612529002, "grad_norm": 0.02555241249501705, "grad_norm_var": 6.864035811152454e-07, "learning_rate": 1.3914106673773574e-06, "loss": 2.3716, "step": 27381 }, { "crossentropy": 2.4269704818725586, "epoch": 0.9926769141531323, "grad_norm": 0.02658858522772789, "grad_norm_var": 6.959710013816506e-07, "learning_rate": 1.3777365806633135e-06, "loss": 2.4835, "step": 27382 }, { "crossentropy": 2.367422103881836, "epoch": 0.9927131670533643, "grad_norm": 0.026142090559005737, "grad_norm_var": 6.954301561819341e-07, "learning_rate": 1.3641300079292319e-06, "loss": 2.3864, "step": 27383 }, { "crossentropy": 2.5424206256866455, "epoch": 0.9927494199535963, "grad_norm": 0.026043888181447983, "grad_norm_var": 6.322194557715003e-07, "learning_rate": 1.3505909493577439e-06, "loss": 2.5443, "step": 27384 }, { "crossentropy": 2.523440361022949, "epoch": 0.9927856728538283, "grad_norm": 0.02661050111055374, "grad_norm_var": 2.2223586041095895e-07, "learning_rate": 1.3371194051325918e-06, "loss": 2.5185, "step": 27385 }, { "crossentropy": 2.5422708988189697, "epoch": 0.9928219257540604, "grad_norm": 0.025398338213562965, "grad_norm_var": 1.7927566211545e-07, "learning_rate": 1.3237153754347419e-06, "loss": 2.4455, "step": 27386 }, { "crossentropy": 2.481443166732788, "epoch": 0.9928581786542924, "grad_norm": 0.02633146569132805, "grad_norm_var": 1.7703592573251742e-07, "learning_rate": 1.3103788604462707e-06, "loss": 2.4906, "step": 27387 }, { "crossentropy": 2.5121026039123535, "epoch": 0.9928944315545244, "grad_norm": 0.025740360841155052, "grad_norm_var": 1.7836601000017066e-07, "learning_rate": 1.2971098603470343e-06, "loss": 2.4329, "step": 27388 }, { "crossentropy": 2.4457290172576904, "epoch": 0.9929306844547564, "grad_norm": 0.02582053281366825, "grad_norm_var": 1.4000461968273943e-07, "learning_rate": 1.2839083753163339e-06, "loss": 2.4676, "step": 27389 }, { "crossentropy": 2.417875289916992, "epoch": 0.9929669373549884, "grad_norm": 0.026342913508415222, "grad_norm_var": 1.4318740187678942e-07, "learning_rate": 1.270774405531805e-06, "loss": 2.4975, "step": 27390 }, { "crossentropy": 2.2612833976745605, "epoch": 0.9930031902552204, "grad_norm": 0.02539762482047081, "grad_norm_var": 1.5639685682541e-07, "learning_rate": 1.2577079511716383e-06, "loss": 2.3829, "step": 27391 }, { "crossentropy": 2.5607597827911377, "epoch": 0.9930394431554525, "grad_norm": 0.02630985900759697, "grad_norm_var": 1.5164323322335433e-07, "learning_rate": 1.2447090124118043e-06, "loss": 2.5021, "step": 27392 }, { "crossentropy": 2.4296066761016846, "epoch": 0.9930756960556845, "grad_norm": 0.02788858860731125, "grad_norm_var": 3.541150883467602e-07, "learning_rate": 1.2317775894282735e-06, "loss": 2.4242, "step": 27393 }, { "crossentropy": 2.2447876930236816, "epoch": 0.9931119489559165, "grad_norm": 0.026583567261695862, "grad_norm_var": 3.6419239751888687e-07, "learning_rate": 1.2189136823953507e-06, "loss": 2.3254, "step": 27394 }, { "crossentropy": 2.4976797103881836, "epoch": 0.9931482018561485, "grad_norm": 0.027071429416537285, "grad_norm_var": 4.1139762734255607e-07, "learning_rate": 1.2061172914867857e-06, "loss": 2.3984, "step": 27395 }, { "crossentropy": 2.3673951625823975, "epoch": 0.9931844547563805, "grad_norm": 0.026210051029920578, "grad_norm_var": 4.115180690305303e-07, "learning_rate": 1.1933884168757736e-06, "loss": 2.4368, "step": 27396 }, { "crossentropy": 2.467693328857422, "epoch": 0.9932207076566125, "grad_norm": 0.026186855509877205, "grad_norm_var": 3.774945489282e-07, "learning_rate": 1.1807270587338436e-06, "loss": 2.4289, "step": 27397 }, { "crossentropy": 2.5317819118499756, "epoch": 0.9932569605568445, "grad_norm": 0.025426233187317848, "grad_norm_var": 4.1591927717234596e-07, "learning_rate": 1.1681332172319703e-06, "loss": 2.4351, "step": 27398 }, { "crossentropy": 2.534424304962158, "epoch": 0.9932932134570766, "grad_norm": 0.02603202685713768, "grad_norm_var": 4.1780533663691306e-07, "learning_rate": 1.1556068925405726e-06, "loss": 2.394, "step": 27399 }, { "crossentropy": 2.370213508605957, "epoch": 0.9933294663573086, "grad_norm": 0.025688478723168373, "grad_norm_var": 4.3367317267618313e-07, "learning_rate": 1.1431480848289599e-06, "loss": 2.4257, "step": 27400 }, { "crossentropy": 2.4809107780456543, "epoch": 0.9933657192575406, "grad_norm": 0.02658824622631073, "grad_norm_var": 4.3245614958057195e-07, "learning_rate": 1.1307567942653308e-06, "loss": 2.5075, "step": 27401 }, { "crossentropy": 2.4656357765197754, "epoch": 0.9934019721577726, "grad_norm": 0.024775199592113495, "grad_norm_var": 5.223786874198703e-07, "learning_rate": 1.1184330210162186e-06, "loss": 2.411, "step": 27402 }, { "crossentropy": 2.3320200443267822, "epoch": 0.9934382250580046, "grad_norm": 0.02626197785139084, "grad_norm_var": 5.209953818908926e-07, "learning_rate": 1.1061767652492672e-06, "loss": 2.3892, "step": 27403 }, { "crossentropy": 2.5359723567962646, "epoch": 0.9934744779582366, "grad_norm": 0.025873543694615364, "grad_norm_var": 5.149141413781736e-07, "learning_rate": 1.0939880271293446e-06, "loss": 2.5028, "step": 27404 }, { "crossentropy": 2.3793246746063232, "epoch": 0.9935107308584686, "grad_norm": 0.026189133524894714, "grad_norm_var": 5.070380725779999e-07, "learning_rate": 1.081866806821319e-06, "loss": 2.4789, "step": 27405 }, { "crossentropy": 2.341827869415283, "epoch": 0.9935469837587007, "grad_norm": 0.026233717799186707, "grad_norm_var": 5.053619916965072e-07, "learning_rate": 1.069813104488948e-06, "loss": 2.4223, "step": 27406 }, { "crossentropy": 2.3675811290740967, "epoch": 0.9935832366589327, "grad_norm": 0.02528027631342411, "grad_norm_var": 5.183042116426705e-07, "learning_rate": 1.0578269202943248e-06, "loss": 2.4263, "step": 27407 }, { "crossentropy": 2.337728500366211, "epoch": 0.9936194895591647, "grad_norm": 0.02563777193427086, "grad_norm_var": 5.333258815723787e-07, "learning_rate": 1.0459082544000963e-06, "loss": 2.3979, "step": 27408 }, { "crossentropy": 2.441967010498047, "epoch": 0.9936557424593968, "grad_norm": 0.02621304616332054, "grad_norm_var": 3.1377743476695975e-07, "learning_rate": 1.0340571069672456e-06, "loss": 2.4675, "step": 27409 }, { "crossentropy": 2.573338031768799, "epoch": 0.9936919953596288, "grad_norm": 0.025788743048906326, "grad_norm_var": 2.9308326236389637e-07, "learning_rate": 1.0222734781556443e-06, "loss": 2.4924, "step": 27410 }, { "crossentropy": 2.313617467880249, "epoch": 0.9937282482598608, "grad_norm": 0.02608490362763405, "grad_norm_var": 2.085117246982861e-07, "learning_rate": 1.0105573681246094e-06, "loss": 2.3199, "step": 27411 }, { "crossentropy": 2.323561668395996, "epoch": 0.9937645011600929, "grad_norm": 0.026139115914702415, "grad_norm_var": 2.0593524425223618e-07, "learning_rate": 9.98908777032348e-07, "loss": 2.3652, "step": 27412 }, { "crossentropy": 2.516895055770874, "epoch": 0.9938007540603249, "grad_norm": 0.025814643129706383, "grad_norm_var": 2.0035570207933377e-07, "learning_rate": 9.873277050354012e-07, "loss": 2.4106, "step": 27413 }, { "crossentropy": 2.4248499870300293, "epoch": 0.9938370069605569, "grad_norm": 0.02563122846186161, "grad_norm_var": 1.9066991104714955e-07, "learning_rate": 9.75814152291421e-07, "loss": 2.4002, "step": 27414 }, { "crossentropy": 2.399925470352173, "epoch": 0.9938732598607889, "grad_norm": 0.029652558267116547, "grad_norm_var": 1.0787368585261799e-06, "learning_rate": 9.643681189552833e-07, "loss": 2.393, "step": 27415 }, { "crossentropy": 2.358591079711914, "epoch": 0.9939095127610209, "grad_norm": 0.025700798258185387, "grad_norm_var": 1.0780444464328353e-06, "learning_rate": 9.529896051818643e-07, "loss": 2.3684, "step": 27416 }, { "crossentropy": 2.33242130279541, "epoch": 0.9939457656612529, "grad_norm": 0.02637651562690735, "grad_norm_var": 1.0675301583824384e-06, "learning_rate": 9.416786111243747e-07, "loss": 2.4076, "step": 27417 }, { "crossentropy": 2.41363263130188, "epoch": 0.9939820185614849, "grad_norm": 0.025413036346435547, "grad_norm_var": 9.800072403450913e-07, "learning_rate": 9.304351369354702e-07, "loss": 2.427, "step": 27418 }, { "crossentropy": 2.209263563156128, "epoch": 0.994018271461717, "grad_norm": 0.024801673367619514, "grad_norm_var": 1.0901585912955243e-06, "learning_rate": 9.192591827678066e-07, "loss": 2.2518, "step": 27419 }, { "crossentropy": 2.4094982147216797, "epoch": 0.994054524361949, "grad_norm": 0.02542886696755886, "grad_norm_var": 1.1130930966809685e-06, "learning_rate": 9.081507487718188e-07, "loss": 2.3626, "step": 27420 }, { "crossentropy": 2.354384422302246, "epoch": 0.994090777262181, "grad_norm": 0.02497417852282524, "grad_norm_var": 1.1786202253199896e-06, "learning_rate": 8.971098350979423e-07, "loss": 2.4004, "step": 27421 }, { "crossentropy": 2.5202817916870117, "epoch": 0.994127030162413, "grad_norm": 0.026735199615359306, "grad_norm_var": 1.2134294349338178e-06, "learning_rate": 8.861364418949469e-07, "loss": 2.5496, "step": 27422 }, { "crossentropy": 2.445469379425049, "epoch": 0.994163283062645, "grad_norm": 0.026995906606316566, "grad_norm_var": 1.237435286662499e-06, "learning_rate": 8.752305693110474e-07, "loss": 2.4721, "step": 27423 }, { "crossentropy": 2.4979448318481445, "epoch": 0.994199535962877, "grad_norm": 0.02620922587811947, "grad_norm_var": 1.2236350056879132e-06, "learning_rate": 8.643922174933483e-07, "loss": 2.4545, "step": 27424 }, { "crossentropy": 2.6057047843933105, "epoch": 0.994235788863109, "grad_norm": 0.0257530827075243, "grad_norm_var": 1.2313034670072277e-06, "learning_rate": 8.536213865883991e-07, "loss": 2.556, "step": 27425 }, { "crossentropy": 2.3923051357269287, "epoch": 0.994272041763341, "grad_norm": 0.02551405318081379, "grad_norm_var": 1.247189609753863e-06, "learning_rate": 8.42918076742194e-07, "loss": 2.4852, "step": 27426 }, { "crossentropy": 2.5338172912597656, "epoch": 0.9943082946635731, "grad_norm": 0.025445081293582916, "grad_norm_var": 1.2720637477235992e-06, "learning_rate": 8.32282288098507e-07, "loss": 2.4648, "step": 27427 }, { "crossentropy": 2.4996373653411865, "epoch": 0.9943445475638051, "grad_norm": 0.02602853812277317, "grad_norm_var": 1.2713160971304273e-06, "learning_rate": 8.217140208016672e-07, "loss": 2.4927, "step": 27428 }, { "crossentropy": 2.440351724624634, "epoch": 0.9943808004640371, "grad_norm": 0.025961529463529587, "grad_norm_var": 1.2684534675931497e-06, "learning_rate": 8.112132749943379e-07, "loss": 2.3597, "step": 27429 }, { "crossentropy": 2.475043535232544, "epoch": 0.9944170533642691, "grad_norm": 0.02693798951804638, "grad_norm_var": 1.3041595223912499e-06, "learning_rate": 8.007800508175178e-07, "loss": 2.4793, "step": 27430 }, { "crossentropy": 2.345937490463257, "epoch": 0.9944533062645011, "grad_norm": 0.02587970532476902, "grad_norm_var": 4.1702655573376486e-07, "learning_rate": 7.904143484127602e-07, "loss": 2.291, "step": 27431 }, { "crossentropy": 2.5218253135681152, "epoch": 0.9944895591647331, "grad_norm": 0.026088690385222435, "grad_norm_var": 4.169185351572535e-07, "learning_rate": 7.801161679205082e-07, "loss": 2.4661, "step": 27432 }, { "crossentropy": 2.3955342769622803, "epoch": 0.9945258120649652, "grad_norm": 0.02650928683578968, "grad_norm_var": 4.2629745038985413e-07, "learning_rate": 7.698855094789847e-07, "loss": 2.4191, "step": 27433 }, { "crossentropy": 2.3274550437927246, "epoch": 0.9945620649651972, "grad_norm": 0.02511361613869667, "grad_norm_var": 4.520304069034753e-07, "learning_rate": 7.597223732269676e-07, "loss": 2.37, "step": 27434 }, { "crossentropy": 2.3440816402435303, "epoch": 0.9945983178654292, "grad_norm": 0.025881435722112656, "grad_norm_var": 3.6698443322846253e-07, "learning_rate": 7.496267593010142e-07, "loss": 2.4058, "step": 27435 }, { "crossentropy": 2.5138890743255615, "epoch": 0.9946345707656613, "grad_norm": 0.025219837203621864, "grad_norm_var": 3.8468618503531504e-07, "learning_rate": 7.395986678387923e-07, "loss": 2.4047, "step": 27436 }, { "crossentropy": 2.352736234664917, "epoch": 0.9946708236658933, "grad_norm": 0.025883322581648827, "grad_norm_var": 3.176980285735098e-07, "learning_rate": 7.296380989746388e-07, "loss": 2.4698, "step": 27437 }, { "crossentropy": 2.4269721508026123, "epoch": 0.9947070765661253, "grad_norm": 0.027080930769443512, "grad_norm_var": 3.586086176599144e-07, "learning_rate": 7.197450528428906e-07, "loss": 2.4471, "step": 27438 }, { "crossentropy": 2.418121337890625, "epoch": 0.9947433294663574, "grad_norm": 0.025441648438572884, "grad_norm_var": 3.0970987204928006e-07, "learning_rate": 7.099195295784399e-07, "loss": 2.4159, "step": 27439 }, { "crossentropy": 2.4432291984558105, "epoch": 0.9947795823665894, "grad_norm": 0.02599276229739189, "grad_norm_var": 3.0470205271531287e-07, "learning_rate": 7.001615293128483e-07, "loss": 2.4954, "step": 27440 }, { "crossentropy": 2.453155994415283, "epoch": 0.9948158352668214, "grad_norm": 0.02565179020166397, "grad_norm_var": 3.0760735851841025e-07, "learning_rate": 6.904710521782321e-07, "loss": 2.4582, "step": 27441 }, { "crossentropy": 2.3861489295959473, "epoch": 0.9948520881670534, "grad_norm": 0.025759415701031685, "grad_norm_var": 2.9827305734612896e-07, "learning_rate": 6.808480983055976e-07, "loss": 2.4773, "step": 27442 }, { "crossentropy": 2.57655668258667, "epoch": 0.9948883410672854, "grad_norm": 0.026207761839032173, "grad_norm_var": 2.853445085255109e-07, "learning_rate": 6.71292667824841e-07, "loss": 2.4874, "step": 27443 }, { "crossentropy": 2.41532826423645, "epoch": 0.9949245939675174, "grad_norm": 0.026778055354952812, "grad_norm_var": 3.2556690521725005e-07, "learning_rate": 6.618047608647482e-07, "loss": 2.3134, "step": 27444 }, { "crossentropy": 2.3806471824645996, "epoch": 0.9949608468677494, "grad_norm": 0.026018768548965454, "grad_norm_var": 3.2529310528689457e-07, "learning_rate": 6.52384377554105e-07, "loss": 2.4897, "step": 27445 }, { "crossentropy": 2.5308001041412354, "epoch": 0.9949970997679815, "grad_norm": 0.025730373337864876, "grad_norm_var": 2.698867282051205e-07, "learning_rate": 6.430315180194768e-07, "loss": 2.4137, "step": 27446 }, { "crossentropy": 2.431112051010132, "epoch": 0.9950333526682135, "grad_norm": 0.026573453098535538, "grad_norm_var": 2.932486429703714e-07, "learning_rate": 6.337461823879842e-07, "loss": 2.475, "step": 27447 }, { "crossentropy": 2.449855089187622, "epoch": 0.9950696055684455, "grad_norm": 0.026070494204759598, "grad_norm_var": 2.930437196549743e-07, "learning_rate": 6.245283707839722e-07, "loss": 2.3855, "step": 27448 }, { "crossentropy": 2.4700584411621094, "epoch": 0.9951058584686775, "grad_norm": 0.026747819036245346, "grad_norm_var": 3.129703589814601e-07, "learning_rate": 6.153780833328959e-07, "loss": 2.4628, "step": 27449 }, { "crossentropy": 2.3352088928222656, "epoch": 0.9951421113689095, "grad_norm": 0.02501685358583927, "grad_norm_var": 3.2511353152680825e-07, "learning_rate": 6.062953201579902e-07, "loss": 2.3986, "step": 27450 }, { "crossentropy": 2.5402276515960693, "epoch": 0.9951783642691415, "grad_norm": 0.025255253538489342, "grad_norm_var": 3.5980463369681727e-07, "learning_rate": 5.972800813819346e-07, "loss": 2.4828, "step": 27451 }, { "crossentropy": 2.3048408031463623, "epoch": 0.9952146171693735, "grad_norm": 0.025625519454479218, "grad_norm_var": 3.2982293171329493e-07, "learning_rate": 5.883323671262986e-07, "loss": 2.311, "step": 27452 }, { "crossentropy": 2.295611619949341, "epoch": 0.9952508700696056, "grad_norm": 0.025641314685344696, "grad_norm_var": 3.369140048909419e-07, "learning_rate": 5.794521775120964e-07, "loss": 2.386, "step": 27453 }, { "crossentropy": 2.4324002265930176, "epoch": 0.9952871229698376, "grad_norm": 0.027667948976159096, "grad_norm_var": 4.4504919017839446e-07, "learning_rate": 5.706395126597874e-07, "loss": 2.4105, "step": 27454 }, { "crossentropy": 2.3857743740081787, "epoch": 0.9953233758700696, "grad_norm": 0.0265971627086401, "grad_norm_var": 4.407497116937567e-07, "learning_rate": 5.618943726870551e-07, "loss": 2.4286, "step": 27455 }, { "crossentropy": 2.1673669815063477, "epoch": 0.9953596287703016, "grad_norm": 0.02646898292005062, "grad_norm_var": 4.491673273320038e-07, "learning_rate": 5.532167577132485e-07, "loss": 2.3037, "step": 27456 }, { "crossentropy": 2.3805935382843018, "epoch": 0.9953958816705336, "grad_norm": 0.026966659352183342, "grad_norm_var": 4.763324655080239e-07, "learning_rate": 5.446066678554962e-07, "loss": 2.405, "step": 27457 }, { "crossentropy": 2.2734625339508057, "epoch": 0.9954321345707656, "grad_norm": 0.02617519721388817, "grad_norm_var": 4.629691652920185e-07, "learning_rate": 5.360641032292613e-07, "loss": 2.3983, "step": 27458 }, { "crossentropy": 2.256056547164917, "epoch": 0.9954683874709976, "grad_norm": 0.0258925911039114, "grad_norm_var": 4.69748511296034e-07, "learning_rate": 5.275890639511172e-07, "loss": 2.3552, "step": 27459 }, { "crossentropy": 2.4733452796936035, "epoch": 0.9955046403712297, "grad_norm": 0.026104804128408432, "grad_norm_var": 4.463358949109708e-07, "learning_rate": 5.191815501343066e-07, "loss": 2.494, "step": 27460 }, { "crossentropy": 2.369797945022583, "epoch": 0.9955408932714617, "grad_norm": 0.026063231751322746, "grad_norm_var": 4.4562469669177406e-07, "learning_rate": 5.108415618931827e-07, "loss": 2.3943, "step": 27461 }, { "crossentropy": 2.5178208351135254, "epoch": 0.9955771461716937, "grad_norm": 0.026108818128705025, "grad_norm_var": 4.3277854671213274e-07, "learning_rate": 5.025690993398779e-07, "loss": 2.4622, "step": 27462 }, { "crossentropy": 2.4375267028808594, "epoch": 0.9956133990719258, "grad_norm": 0.025603607296943665, "grad_norm_var": 4.414643064702076e-07, "learning_rate": 4.943641625865247e-07, "loss": 2.3911, "step": 27463 }, { "crossentropy": 2.3717129230499268, "epoch": 0.9956496519721578, "grad_norm": 0.02575700543820858, "grad_norm_var": 4.4990111615714896e-07, "learning_rate": 4.862267517435904e-07, "loss": 2.4604, "step": 27464 }, { "crossentropy": 2.421496629714966, "epoch": 0.9956859048723898, "grad_norm": 0.02577863074839115, "grad_norm_var": 4.256437620394574e-07, "learning_rate": 4.781568669215419e-07, "loss": 2.4369, "step": 27465 }, { "crossentropy": 2.3352601528167725, "epoch": 0.9957221577726219, "grad_norm": 0.02504015527665615, "grad_norm_var": 4.2248266224086984e-07, "learning_rate": 4.701545082291814e-07, "loss": 2.3324, "step": 27466 }, { "crossentropy": 2.431359052658081, "epoch": 0.9957584106728539, "grad_norm": 0.025410590693354607, "grad_norm_var": 4.0759903605553685e-07, "learning_rate": 4.622196757736452e-07, "loss": 2.3813, "step": 27467 }, { "crossentropy": 2.438524007797241, "epoch": 0.9957946635730859, "grad_norm": 0.026471011340618134, "grad_norm_var": 4.037047042401796e-07, "learning_rate": 4.543523696637353e-07, "loss": 2.3856, "step": 27468 }, { "crossentropy": 2.4011738300323486, "epoch": 0.9958309164733179, "grad_norm": 0.025483567267656326, "grad_norm_var": 4.1510166915886306e-07, "learning_rate": 4.4655259000436763e-07, "loss": 2.4409, "step": 27469 }, { "crossentropy": 2.518216371536255, "epoch": 0.9958671693735499, "grad_norm": 0.02698685973882675, "grad_norm_var": 3.0164894570019104e-07, "learning_rate": 4.3882033690156864e-07, "loss": 2.4873, "step": 27470 }, { "crossentropy": 2.493332862854004, "epoch": 0.9959034222737819, "grad_norm": 0.026349889114499092, "grad_norm_var": 2.8765495765168544e-07, "learning_rate": 4.311556104596992e-07, "loss": 2.4415, "step": 27471 }, { "crossentropy": 2.340038537979126, "epoch": 0.9959396751740139, "grad_norm": 0.02610500529408455, "grad_norm_var": 2.7518176891881165e-07, "learning_rate": 4.2355841078201006e-07, "loss": 2.4706, "step": 27472 }, { "crossentropy": 2.4625720977783203, "epoch": 0.995975928074246, "grad_norm": 0.02652745693922043, "grad_norm_var": 2.317193681386845e-07, "learning_rate": 4.16028737971752e-07, "loss": 2.466, "step": 27473 }, { "crossentropy": 2.2731165885925293, "epoch": 0.996012180974478, "grad_norm": 0.025631451979279518, "grad_norm_var": 2.3685483938906162e-07, "learning_rate": 4.0856659212940015e-07, "loss": 2.3187, "step": 27474 }, { "crossentropy": 2.300994634628296, "epoch": 0.99604843387471, "grad_norm": 0.02509423717856407, "grad_norm_var": 2.8356435216383977e-07, "learning_rate": 4.011719733570951e-07, "loss": 2.297, "step": 27475 }, { "crossentropy": 2.3430418968200684, "epoch": 0.996084686774942, "grad_norm": 0.025077585130929947, "grad_norm_var": 3.224582766969686e-07, "learning_rate": 3.9384488175420175e-07, "loss": 2.3558, "step": 27476 }, { "crossentropy": 2.4486801624298096, "epoch": 0.996120939675174, "grad_norm": 0.025863640010356903, "grad_norm_var": 3.1908905725844204e-07, "learning_rate": 3.8658531741897487e-07, "loss": 2.4505, "step": 27477 }, { "crossentropy": 2.2835090160369873, "epoch": 0.996157192575406, "grad_norm": 0.02590496838092804, "grad_norm_var": 3.141241176463546e-07, "learning_rate": 3.793932804507794e-07, "loss": 2.4077, "step": 27478 }, { "crossentropy": 2.4159584045410156, "epoch": 0.996193445475638, "grad_norm": 0.027081958949565887, "grad_norm_var": 4.084884409445248e-07, "learning_rate": 3.722687709456496e-07, "loss": 2.3747, "step": 27479 }, { "crossentropy": 2.4039838314056396, "epoch": 0.99622969837587, "grad_norm": 0.025637483224272728, "grad_norm_var": 4.1182345242168907e-07, "learning_rate": 3.652117890001749e-07, "loss": 2.4327, "step": 27480 }, { "crossentropy": 2.378627061843872, "epoch": 0.9962659512761021, "grad_norm": 0.026425186544656754, "grad_norm_var": 4.2724795247237916e-07, "learning_rate": 3.5822233470983456e-07, "loss": 2.445, "step": 27481 }, { "crossentropy": 2.415717363357544, "epoch": 0.9963022041763341, "grad_norm": 0.025916608050465584, "grad_norm_var": 3.697295206778282e-07, "learning_rate": 3.5130040816899745e-07, "loss": 2.3966, "step": 27482 }, { "crossentropy": 2.2415521144866943, "epoch": 0.9963384570765661, "grad_norm": 0.025394339114427567, "grad_norm_var": 3.710188038824035e-07, "learning_rate": 3.444460094714774e-07, "loss": 2.3226, "step": 27483 }, { "crossentropy": 2.420877695083618, "epoch": 0.9963747099767981, "grad_norm": 0.027379531413316727, "grad_norm_var": 4.800323828266691e-07, "learning_rate": 3.3765913870886786e-07, "loss": 2.5431, "step": 27484 }, { "crossentropy": 2.3327441215515137, "epoch": 0.9964109628770301, "grad_norm": 0.025062771514058113, "grad_norm_var": 5.230891167621777e-07, "learning_rate": 3.309397959733174e-07, "loss": 2.4092, "step": 27485 }, { "crossentropy": 2.3481550216674805, "epoch": 0.9964472157772621, "grad_norm": 0.02528819814324379, "grad_norm_var": 4.861316014621257e-07, "learning_rate": 3.242879813564192e-07, "loss": 2.34, "step": 27486 }, { "crossentropy": 2.4738340377807617, "epoch": 0.9964834686774942, "grad_norm": 0.026360180228948593, "grad_norm_var": 4.867263505403095e-07, "learning_rate": 3.1770369494699136e-07, "loss": 2.4602, "step": 27487 }, { "crossentropy": 2.290001153945923, "epoch": 0.9965197215777262, "grad_norm": 0.02529998868703842, "grad_norm_var": 5.075772328888562e-07, "learning_rate": 3.1118693683385155e-07, "loss": 2.2747, "step": 27488 }, { "crossentropy": 2.377833843231201, "epoch": 0.9965559744779582, "grad_norm": 0.02555292285978794, "grad_norm_var": 4.817137485769862e-07, "learning_rate": 3.047377071058177e-07, "loss": 2.3739, "step": 27489 }, { "crossentropy": 2.4332501888275146, "epoch": 0.9965922273781903, "grad_norm": 0.026459621265530586, "grad_norm_var": 5.04788269592539e-07, "learning_rate": 2.9835600584948717e-07, "loss": 2.4694, "step": 27490 }, { "crossentropy": 2.3217568397521973, "epoch": 0.9966284802784223, "grad_norm": 0.026856714859604836, "grad_norm_var": 5.184057121902785e-07, "learning_rate": 2.9204183315145737e-07, "loss": 2.3687, "step": 27491 }, { "crossentropy": 2.359003782272339, "epoch": 0.9966647331786543, "grad_norm": 0.02597212605178356, "grad_norm_var": 4.616673716757117e-07, "learning_rate": 2.8579518909610525e-07, "loss": 2.422, "step": 27492 }, { "crossentropy": 2.3474369049072266, "epoch": 0.9967009860788864, "grad_norm": 0.02685791440308094, "grad_norm_var": 5.015962614669684e-07, "learning_rate": 2.7961607376891794e-07, "loss": 2.4153, "step": 27493 }, { "crossentropy": 2.3356006145477295, "epoch": 0.9967372389791184, "grad_norm": 0.02645944617688656, "grad_norm_var": 5.070835764613002e-07, "learning_rate": 2.735044872526071e-07, "loss": 2.401, "step": 27494 }, { "crossentropy": 2.3995020389556885, "epoch": 0.9967734918793504, "grad_norm": 0.025688333436846733, "grad_norm_var": 4.5070956211206345e-07, "learning_rate": 2.674604296304395e-07, "loss": 2.4876, "step": 27495 }, { "crossentropy": 2.475609302520752, "epoch": 0.9968097447795824, "grad_norm": 0.026016872376203537, "grad_norm_var": 4.3943469904942205e-07, "learning_rate": 2.614839009829062e-07, "loss": 2.4642, "step": 27496 }, { "crossentropy": 2.5168066024780273, "epoch": 0.9968459976798144, "grad_norm": 0.025173965841531754, "grad_norm_var": 4.7667859377323725e-07, "learning_rate": 2.5557490139216376e-07, "loss": 2.4251, "step": 27497 }, { "crossentropy": 2.42376708984375, "epoch": 0.9968822505800464, "grad_norm": 0.026625871658325195, "grad_norm_var": 5.017727624890995e-07, "learning_rate": 2.497334309364829e-07, "loss": 2.4745, "step": 27498 }, { "crossentropy": 2.3875410556793213, "epoch": 0.9969185034802784, "grad_norm": 0.025198007002472878, "grad_norm_var": 5.207709424412717e-07, "learning_rate": 2.4395948969635486e-07, "loss": 2.3786, "step": 27499 }, { "crossentropy": 2.4628677368164062, "epoch": 0.9969547563805105, "grad_norm": 0.026439983397722244, "grad_norm_var": 4.0510141223816463e-07, "learning_rate": 2.382530777483849e-07, "loss": 2.5333, "step": 27500 }, { "crossentropy": 2.408813953399658, "epoch": 0.9969910092807425, "grad_norm": 0.025662295520305634, "grad_norm_var": 3.5607961507399164e-07, "learning_rate": 2.326141951702887e-07, "loss": 2.4502, "step": 27501 }, { "crossentropy": 2.398348331451416, "epoch": 0.9970272621809745, "grad_norm": 0.025710035115480423, "grad_norm_var": 3.274738153413263e-07, "learning_rate": 2.2704284203811655e-07, "loss": 2.3834, "step": 27502 }, { "crossentropy": 2.4558370113372803, "epoch": 0.9970635150812065, "grad_norm": 0.025882486253976822, "grad_norm_var": 3.201256878933763e-07, "learning_rate": 2.2153901842736356e-07, "loss": 2.4718, "step": 27503 }, { "crossentropy": 2.229956865310669, "epoch": 0.9970997679814385, "grad_norm": 0.02597583271563053, "grad_norm_var": 2.8640143300907806e-07, "learning_rate": 2.1610272441185962e-07, "loss": 2.3014, "step": 27504 }, { "crossentropy": 2.39788556098938, "epoch": 0.9971360208816705, "grad_norm": 0.02584482543170452, "grad_norm_var": 2.730313370714049e-07, "learning_rate": 2.1073396006598966e-07, "loss": 2.4438, "step": 27505 }, { "crossentropy": 2.383866786956787, "epoch": 0.9971722737819025, "grad_norm": 0.025809520855545998, "grad_norm_var": 2.640715683475814e-07, "learning_rate": 2.0543272546080794e-07, "loss": 2.483, "step": 27506 }, { "crossentropy": 2.4064056873321533, "epoch": 0.9972085266821346, "grad_norm": 0.026926638558506966, "grad_norm_var": 2.7226291663879664e-07, "learning_rate": 2.0019902066903405e-07, "loss": 2.4518, "step": 27507 }, { "crossentropy": 2.3783349990844727, "epoch": 0.9972447795823666, "grad_norm": 0.026648862287402153, "grad_norm_var": 2.969941510082372e-07, "learning_rate": 1.950328457611672e-07, "loss": 2.4541, "step": 27508 }, { "crossentropy": 2.3842110633850098, "epoch": 0.9972810324825986, "grad_norm": 0.02519308775663376, "grad_norm_var": 2.9256100109494265e-07, "learning_rate": 1.8993420080659628e-07, "loss": 2.377, "step": 27509 }, { "crossentropy": 2.4131650924682617, "epoch": 0.9973172853828306, "grad_norm": 0.026601077988743782, "grad_norm_var": 3.0336905806777986e-07, "learning_rate": 1.8490308587471027e-07, "loss": 2.4728, "step": 27510 }, { "crossentropy": 2.4179272651672363, "epoch": 0.9973535382830626, "grad_norm": 0.027418049052357674, "grad_norm_var": 4.271663312714497e-07, "learning_rate": 1.7993950103323275e-07, "loss": 2.4319, "step": 27511 }, { "crossentropy": 2.4397199153900146, "epoch": 0.9973897911832946, "grad_norm": 0.02615547366440296, "grad_norm_var": 4.273766077535993e-07, "learning_rate": 1.750434463487771e-07, "loss": 2.4255, "step": 27512 }, { "crossentropy": 2.609255075454712, "epoch": 0.9974260440835266, "grad_norm": 0.026184314861893654, "grad_norm_var": 3.692399296810478e-07, "learning_rate": 1.7021492188851183e-07, "loss": 2.5081, "step": 27513 }, { "crossentropy": 2.3789145946502686, "epoch": 0.9974622969837587, "grad_norm": 0.02528829127550125, "grad_norm_var": 3.948130003793756e-07, "learning_rate": 1.6545392771627477e-07, "loss": 2.3671, "step": 27514 }, { "crossentropy": 2.3092892169952393, "epoch": 0.9974985498839907, "grad_norm": 0.02542242221534252, "grad_norm_var": 3.722077390165441e-07, "learning_rate": 1.6076046389756905e-07, "loss": 2.3286, "step": 27515 }, { "crossentropy": 2.5378737449645996, "epoch": 0.9975348027842227, "grad_norm": 0.026923878118395805, "grad_norm_var": 4.1053924889089024e-07, "learning_rate": 1.5613453049512226e-07, "loss": 2.4195, "step": 27516 }, { "crossentropy": 2.427891731262207, "epoch": 0.9975710556844548, "grad_norm": 0.025781556963920593, "grad_norm_var": 4.044212337136656e-07, "learning_rate": 1.5157612757166205e-07, "loss": 2.4553, "step": 27517 }, { "crossentropy": 2.2636327743530273, "epoch": 0.9976073085846868, "grad_norm": 0.026215875521302223, "grad_norm_var": 3.9341082385304047e-07, "learning_rate": 1.4708525518880578e-07, "loss": 2.3859, "step": 27518 }, { "crossentropy": 2.4755380153656006, "epoch": 0.9976435614849188, "grad_norm": 0.02534196898341179, "grad_norm_var": 4.303745162423725e-07, "learning_rate": 1.426619134070606e-07, "loss": 2.4212, "step": 27519 }, { "crossentropy": 2.378830671310425, "epoch": 0.9976798143851509, "grad_norm": 0.025630293413996696, "grad_norm_var": 4.4393663886435617e-07, "learning_rate": 1.3830610228637852e-07, "loss": 2.4329, "step": 27520 }, { "crossentropy": 2.487009048461914, "epoch": 0.9977160672853829, "grad_norm": 0.025971129536628723, "grad_norm_var": 4.408615030807115e-07, "learning_rate": 1.340178218850463e-07, "loss": 2.49, "step": 27521 }, { "crossentropy": 2.258967638015747, "epoch": 0.9977523201856149, "grad_norm": 0.026325399056077003, "grad_norm_var": 4.378908147603753e-07, "learning_rate": 1.2979707226135062e-07, "loss": 2.3322, "step": 27522 }, { "crossentropy": 2.398921489715576, "epoch": 0.9977885730858469, "grad_norm": 0.025996174663305283, "grad_norm_var": 3.9276782496742294e-07, "learning_rate": 1.256438534730231e-07, "loss": 2.4376, "step": 27523 }, { "crossentropy": 2.4260659217834473, "epoch": 0.9978248259860789, "grad_norm": 0.026519834995269775, "grad_norm_var": 3.8382597927961483e-07, "learning_rate": 1.2155816557446465e-07, "loss": 2.4155, "step": 27524 }, { "crossentropy": 2.537224531173706, "epoch": 0.9978610788863109, "grad_norm": 0.026182962581515312, "grad_norm_var": 3.305759701472191e-07, "learning_rate": 1.1754000862285175e-07, "loss": 2.4823, "step": 27525 }, { "crossentropy": 2.4263968467712402, "epoch": 0.9978973317865429, "grad_norm": 0.02555926702916622, "grad_norm_var": 3.319219661970796e-07, "learning_rate": 1.1358938267092001e-07, "loss": 2.3317, "step": 27526 }, { "crossentropy": 2.3489255905151367, "epoch": 0.997933584686775, "grad_norm": 0.025084195658564568, "grad_norm_var": 2.4891472729469263e-07, "learning_rate": 1.0970628777251524e-07, "loss": 2.2619, "step": 27527 }, { "crossentropy": 2.4344098567962646, "epoch": 0.997969837587007, "grad_norm": 0.025858279317617416, "grad_norm_var": 2.4476494692921883e-07, "learning_rate": 1.0589072397981791e-07, "loss": 2.4608, "step": 27528 }, { "crossentropy": 2.4031436443328857, "epoch": 0.998006090487239, "grad_norm": 0.026075363159179688, "grad_norm_var": 2.412729941692407e-07, "learning_rate": 1.0214269134556364e-07, "loss": 2.3376, "step": 27529 }, { "crossentropy": 2.3886539936065674, "epoch": 0.998042343387471, "grad_norm": 0.025744706392288208, "grad_norm_var": 2.179154990769941e-07, "learning_rate": 9.846218991860222e-08, "loss": 2.3332, "step": 27530 }, { "crossentropy": 2.444709062576294, "epoch": 0.998078596287703, "grad_norm": 0.025805596262216568, "grad_norm_var": 2.0194752914219107e-07, "learning_rate": 9.484921975000394e-08, "loss": 2.444, "step": 27531 }, { "crossentropy": 2.277510166168213, "epoch": 0.998114849187935, "grad_norm": 0.025308480486273766, "grad_norm_var": 1.528113601226144e-07, "learning_rate": 9.130378088806347e-08, "loss": 2.3187, "step": 27532 }, { "crossentropy": 2.345407485961914, "epoch": 0.998151102088167, "grad_norm": 0.02694571577012539, "grad_norm_var": 2.2882140781957384e-07, "learning_rate": 8.782587338052039e-08, "loss": 2.3042, "step": 27533 }, { "crossentropy": 2.541398048400879, "epoch": 0.9981873549883991, "grad_norm": 0.025458529591560364, "grad_norm_var": 2.3381565587217376e-07, "learning_rate": 8.441549727455921e-08, "loss": 2.5391, "step": 27534 }, { "crossentropy": 2.434981346130371, "epoch": 0.9982236078886311, "grad_norm": 0.02498950995504856, "grad_norm_var": 2.6606517490723504e-07, "learning_rate": 8.10726526168093e-08, "loss": 2.4534, "step": 27535 }, { "crossentropy": 2.370253801345825, "epoch": 0.9982598607888631, "grad_norm": 0.025705065578222275, "grad_norm_var": 2.643142899622636e-07, "learning_rate": 7.779733945112444e-08, "loss": 2.3927, "step": 27536 }, { "crossentropy": 2.3598549365997314, "epoch": 0.9982961136890951, "grad_norm": 0.025763269513845444, "grad_norm_var": 2.635366988124604e-07, "learning_rate": 7.458955782246868e-08, "loss": 2.363, "step": 27537 }, { "crossentropy": 2.4014110565185547, "epoch": 0.9983323665893271, "grad_norm": 0.026237666606903076, "grad_norm_var": 2.582537128106276e-07, "learning_rate": 7.144930777469583e-08, "loss": 2.3673, "step": 27538 }, { "crossentropy": 2.368932008743286, "epoch": 0.9983686194895591, "grad_norm": 0.02588256075978279, "grad_norm_var": 2.565002026638207e-07, "learning_rate": 6.837658934888413e-08, "loss": 2.4921, "step": 27539 }, { "crossentropy": 2.3758559226989746, "epoch": 0.9984048723897911, "grad_norm": 0.02619464136660099, "grad_norm_var": 2.3276810265300481e-07, "learning_rate": 6.537140258777718e-08, "loss": 2.456, "step": 27540 }, { "crossentropy": 2.421700954437256, "epoch": 0.9984411252900232, "grad_norm": 0.026370147243142128, "grad_norm_var": 2.445224824006714e-07, "learning_rate": 6.243374753134301e-08, "loss": 2.4147, "step": 27541 }, { "crossentropy": 2.4110491275787354, "epoch": 0.9984773781902552, "grad_norm": 0.02548501081764698, "grad_norm_var": 2.4736379968397633e-07, "learning_rate": 5.956362421954964e-08, "loss": 2.3794, "step": 27542 }, { "crossentropy": 2.454810380935669, "epoch": 0.9985136310904872, "grad_norm": 0.02640744112432003, "grad_norm_var": 2.2930958246517435e-07, "learning_rate": 5.676103269125488e-08, "loss": 2.4611, "step": 27543 }, { "crossentropy": 2.391317844390869, "epoch": 0.9985498839907193, "grad_norm": 0.025930680334568024, "grad_norm_var": 2.2933582346817912e-07, "learning_rate": 5.40259729836512e-08, "loss": 2.4079, "step": 27544 }, { "crossentropy": 2.437589406967163, "epoch": 0.9985861368909513, "grad_norm": 0.02578529343008995, "grad_norm_var": 2.2758113700607704e-07, "learning_rate": 5.1358445133931065e-08, "loss": 2.4001, "step": 27545 }, { "crossentropy": 2.5105414390563965, "epoch": 0.9986223897911833, "grad_norm": 0.027191871777176857, "grad_norm_var": 3.331606244374082e-07, "learning_rate": 4.8758449178731846e-08, "loss": 2.4984, "step": 27546 }, { "crossentropy": 2.3774683475494385, "epoch": 0.9986586426914154, "grad_norm": 0.025645049288868904, "grad_norm_var": 3.382125596897012e-07, "learning_rate": 4.622598515302556e-08, "loss": 2.3354, "step": 27547 }, { "crossentropy": 2.459627628326416, "epoch": 0.9986948955916474, "grad_norm": 0.025231029838323593, "grad_norm_var": 3.4527743077155996e-07, "learning_rate": 4.376105309011891e-08, "loss": 2.3808, "step": 27548 }, { "crossentropy": 2.4270987510681152, "epoch": 0.9987311484918794, "grad_norm": 0.025625869631767273, "grad_norm_var": 2.7918477938230987e-07, "learning_rate": 4.136365302442879e-08, "loss": 2.5148, "step": 27549 }, { "crossentropy": 2.396599054336548, "epoch": 0.9987674013921114, "grad_norm": 0.025750787928700447, "grad_norm_var": 2.685289774179145e-07, "learning_rate": 3.903378498759658e-08, "loss": 2.4503, "step": 27550 }, { "crossentropy": 2.149345874786377, "epoch": 0.9988036542923434, "grad_norm": 0.024917779490351677, "grad_norm_var": 2.7743653490669197e-07, "learning_rate": 3.677144901126361e-08, "loss": 2.2545, "step": 27551 }, { "crossentropy": 2.4888675212860107, "epoch": 0.9988399071925754, "grad_norm": 0.027239445596933365, "grad_norm_var": 3.882281660080288e-07, "learning_rate": 3.4576645125961034e-08, "loss": 2.4909, "step": 27552 }, { "crossentropy": 2.3657631874084473, "epoch": 0.9988761600928074, "grad_norm": 0.026069670915603638, "grad_norm_var": 3.8529635905208227e-07, "learning_rate": 3.244937336166487e-08, "loss": 2.4066, "step": 27553 }, { "crossentropy": 2.182213306427002, "epoch": 0.9989124129930395, "grad_norm": 0.025883065536618233, "grad_norm_var": 3.818147334649306e-07, "learning_rate": 3.0389633746685796e-08, "loss": 2.3261, "step": 27554 }, { "crossentropy": 2.243090867996216, "epoch": 0.9989486658932715, "grad_norm": 0.025451652705669403, "grad_norm_var": 3.98768032812977e-07, "learning_rate": 2.8397426309334506e-08, "loss": 2.3808, "step": 27555 }, { "crossentropy": 2.3564114570617676, "epoch": 0.9989849187935035, "grad_norm": 0.025825534015893936, "grad_norm_var": 3.9517994115204995e-07, "learning_rate": 2.647275107570124e-08, "loss": 2.3533, "step": 27556 }, { "crossentropy": 2.3635799884796143, "epoch": 0.9990211716937355, "grad_norm": 0.02686135098338127, "grad_norm_var": 4.393721264476924e-07, "learning_rate": 2.461560807243135e-08, "loss": 2.3979, "step": 27557 }, { "crossentropy": 2.4377803802490234, "epoch": 0.9990574245939675, "grad_norm": 0.0262971930205822, "grad_norm_var": 4.2955830004681495e-07, "learning_rate": 2.2825997324504855e-08, "loss": 2.4136, "step": 27558 }, { "crossentropy": 2.3891732692718506, "epoch": 0.9990936774941995, "grad_norm": 0.025440175086259842, "grad_norm_var": 4.3640293984659126e-07, "learning_rate": 2.110391885634666e-08, "loss": 2.3932, "step": 27559 }, { "crossentropy": 2.4574172496795654, "epoch": 0.9991299303944315, "grad_norm": 0.02560953050851822, "grad_norm_var": 4.435329689137507e-07, "learning_rate": 1.944937269071634e-08, "loss": 2.4372, "step": 27560 }, { "crossentropy": 2.4666826725006104, "epoch": 0.9991661832946636, "grad_norm": 0.025622781366109848, "grad_norm_var": 4.482450676291377e-07, "learning_rate": 1.786235884981835e-08, "loss": 2.4668, "step": 27561 }, { "crossentropy": 2.4032516479492188, "epoch": 0.9992024361948956, "grad_norm": 0.0263659730553627, "grad_norm_var": 3.504247948443266e-07, "learning_rate": 1.6342877355857156e-08, "loss": 2.2937, "step": 27562 }, { "crossentropy": 2.455901622772217, "epoch": 0.9992386890951276, "grad_norm": 0.02607361227273941, "grad_norm_var": 3.493466804881887e-07, "learning_rate": 1.4890928228816768e-08, "loss": 2.4934, "step": 27563 }, { "crossentropy": 2.1773388385772705, "epoch": 0.9992749419953596, "grad_norm": 0.025810934603214264, "grad_norm_var": 3.1928978738008684e-07, "learning_rate": 1.3506511488681206e-08, "loss": 2.3761, "step": 27564 }, { "crossentropy": 2.5384602546691895, "epoch": 0.9993111948955916, "grad_norm": 0.02593522146344185, "grad_norm_var": 3.1281581694129975e-07, "learning_rate": 1.2189627153214033e-08, "loss": 2.4841, "step": 27565 }, { "crossentropy": 2.6026902198791504, "epoch": 0.9993474477958236, "grad_norm": 0.027629757300019264, "grad_norm_var": 4.842744119434022e-07, "learning_rate": 1.0940275241289044e-08, "loss": 2.5077, "step": 27566 }, { "crossentropy": 2.4691250324249268, "epoch": 0.9993837006960556, "grad_norm": 0.026164762675762177, "grad_norm_var": 3.9078359147654006e-07, "learning_rate": 9.758455769559582e-09, "loss": 2.43, "step": 27567 }, { "crossentropy": 2.3693113327026367, "epoch": 0.9994199535962877, "grad_norm": 0.02541266940534115, "grad_norm_var": 3.3217989564292224e-07, "learning_rate": 8.644168753568771e-09, "loss": 2.4062, "step": 27568 }, { "crossentropy": 2.363708019256592, "epoch": 0.9994562064965197, "grad_norm": 0.026331203058362007, "grad_norm_var": 3.3789511572618103e-07, "learning_rate": 7.597414208859731e-09, "loss": 2.3881, "step": 27569 }, { "crossentropy": 2.4451675415039062, "epoch": 0.9994924593967517, "grad_norm": 0.02656819112598896, "grad_norm_var": 3.524658726077855e-07, "learning_rate": 6.618192148755142e-09, "loss": 2.3774, "step": 27570 }, { "crossentropy": 2.3684420585632324, "epoch": 0.9995287122969838, "grad_norm": 0.02654656209051609, "grad_norm_var": 3.3456155877872814e-07, "learning_rate": 5.706502587687901e-09, "loss": 2.3677, "step": 27571 }, { "crossentropy": 2.442200183868408, "epoch": 0.9995649651972158, "grad_norm": 0.026851855218410492, "grad_norm_var": 3.5517779655400093e-07, "learning_rate": 4.862345536760237e-09, "loss": 2.4493, "step": 27572 }, { "crossentropy": 2.3472981452941895, "epoch": 0.9996012180974478, "grad_norm": 0.02608988806605339, "grad_norm_var": 3.264159192177164e-07, "learning_rate": 4.085721008184606e-09, "loss": 2.3159, "step": 27573 }, { "crossentropy": 2.5101478099823, "epoch": 0.9996374709976799, "grad_norm": 0.026209263131022453, "grad_norm_var": 3.2543014906640196e-07, "learning_rate": 3.3766290125081254e-09, "loss": 2.3535, "step": 27574 }, { "crossentropy": 2.5481150150299072, "epoch": 0.9996737238979119, "grad_norm": 0.0261169895529747, "grad_norm_var": 2.8852418632920506e-07, "learning_rate": 2.7350695586125797e-09, "loss": 2.4338, "step": 27575 }, { "crossentropy": 2.4426462650299072, "epoch": 0.9997099767981439, "grad_norm": 0.02579839713871479, "grad_norm_var": 2.756651907461488e-07, "learning_rate": 2.1610426553797526e-09, "loss": 2.4137, "step": 27576 }, { "crossentropy": 2.355775833129883, "epoch": 0.9997462296983759, "grad_norm": 0.025994788855314255, "grad_norm_var": 2.546669071700937e-07, "learning_rate": 1.6545483111363168e-09, "loss": 2.3813, "step": 27577 }, { "crossentropy": 2.3557746410369873, "epoch": 0.9997824825986079, "grad_norm": 0.025301501154899597, "grad_norm_var": 3.081392283022736e-07, "learning_rate": 1.2155865319884996e-09, "loss": 2.3423, "step": 27578 }, { "crossentropy": 2.427887439727783, "epoch": 0.9998187354988399, "grad_norm": 0.02587149851024151, "grad_norm_var": 3.1348455450455617e-07, "learning_rate": 8.441573245976387e-10, "loss": 2.4204, "step": 27579 }, { "crossentropy": 2.5523815155029297, "epoch": 0.9998549883990719, "grad_norm": 0.026372015476226807, "grad_norm_var": 3.067028487716931e-07, "learning_rate": 5.402606928495146e-10, "loss": 2.4998, "step": 27580 }, { "crossentropy": 2.307316303253174, "epoch": 0.999891241299304, "grad_norm": 0.02499488927423954, "grad_norm_var": 3.951215967261014e-07, "learning_rate": 3.038966422952427e-10, "loss": 2.3666, "step": 27581 }, { "crossentropy": 2.4937663078308105, "epoch": 0.999927494199536, "grad_norm": 0.025855911895632744, "grad_norm_var": 2.3964339317266287e-07, "learning_rate": 1.3506517515526895e-10, "loss": 2.3849, "step": 27582 }, { "crossentropy": 2.435901403427124, "epoch": 0.999963747099768, "grad_norm": 0.02563844621181488, "grad_norm_var": 2.4750111068578124e-07, "learning_rate": 3.376629365003936e-11, "loss": 2.4645, "step": 27583 }, { "crossentropy": 2.4924559593200684, "epoch": 1.0, "grad_norm": 0.029177285730838776, "grad_norm_var": 8.399031045898281e-07, "learning_rate": 0.0, "loss": 2.4808, "step": 27584 } ], "logging_steps": 1, "max_steps": 27584, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.548450559074238e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }