{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.128413954199511, "learning_rate": 2.734375e-08, "loss": 0.5621, "step": 1 }, { "epoch": 0.0, "grad_norm": 3.037585935585123, "learning_rate": 5.46875e-08, "loss": 0.6275, "step": 2 }, { "epoch": 0.01, "grad_norm": 3.2088168210626695, "learning_rate": 8.203125e-08, "loss": 0.6421, "step": 3 }, { "epoch": 0.01, "grad_norm": 3.2977586966193364, "learning_rate": 1.09375e-07, "loss": 0.5669, "step": 4 }, { "epoch": 0.01, "grad_norm": 2.7279850179103255, "learning_rate": 1.3671875e-07, "loss": 0.5437, "step": 5 }, { "epoch": 0.01, "grad_norm": 2.946143496522244, "learning_rate": 1.640625e-07, "loss": 0.5253, "step": 6 }, { "epoch": 0.01, "grad_norm": 3.102898231772002, "learning_rate": 1.9140625e-07, "loss": 0.6088, "step": 7 }, { "epoch": 0.02, "grad_norm": 3.296417193731637, "learning_rate": 2.1875e-07, "loss": 0.6401, "step": 8 }, { "epoch": 0.02, "grad_norm": 2.589445324499214, "learning_rate": 2.4609375e-07, "loss": 0.4806, "step": 9 }, { "epoch": 0.02, "grad_norm": 3.225832007072029, "learning_rate": 2.734375e-07, "loss": 0.5967, "step": 10 }, { "epoch": 0.02, "grad_norm": 3.0471471509694865, "learning_rate": 3.0078125e-07, "loss": 0.5275, "step": 11 }, { "epoch": 0.02, "grad_norm": 2.8506862807430435, "learning_rate": 3.28125e-07, "loss": 0.4738, "step": 12 }, { "epoch": 0.03, "grad_norm": 2.7534941936602997, "learning_rate": 3.5546875e-07, "loss": 0.5466, "step": 13 }, { "epoch": 0.03, "grad_norm": 2.4971943894440045, "learning_rate": 3.828125e-07, "loss": 0.5133, "step": 14 }, { "epoch": 0.03, "grad_norm": 2.766676234217308, "learning_rate": 4.1015625e-07, "loss": 0.5082, "step": 15 }, { "epoch": 0.03, "grad_norm": 2.968245371584164, "learning_rate": 4.375e-07, "loss": 0.5587, "step": 16 }, { "epoch": 0.03, "grad_norm": 2.952876034161518, "learning_rate": 4.6484374999999997e-07, "loss": 0.5961, "step": 17 }, { "epoch": 0.04, "grad_norm": 2.7529049741475444, "learning_rate": 4.921875e-07, "loss": 0.5186, "step": 18 }, { "epoch": 0.04, "grad_norm": 2.5979866770390605, "learning_rate": 5.1953125e-07, "loss": 0.5303, "step": 19 }, { "epoch": 0.04, "grad_norm": 2.613695448259043, "learning_rate": 5.46875e-07, "loss": 0.5524, "step": 20 }, { "epoch": 0.04, "grad_norm": 2.9115837883461673, "learning_rate": 5.7421875e-07, "loss": 0.5829, "step": 21 }, { "epoch": 0.04, "grad_norm": 2.632517558239255, "learning_rate": 6.015625e-07, "loss": 0.6033, "step": 22 }, { "epoch": 0.04, "grad_norm": 2.991636448394699, "learning_rate": 6.2890625e-07, "loss": 0.5981, "step": 23 }, { "epoch": 0.05, "grad_norm": 3.189113609556015, "learning_rate": 6.5625e-07, "loss": 0.6624, "step": 24 }, { "epoch": 0.05, "grad_norm": 3.201670939774819, "learning_rate": 6.8359375e-07, "loss": 0.6181, "step": 25 }, { "epoch": 0.05, "grad_norm": 2.275094909819829, "learning_rate": 7.109375e-07, "loss": 0.5586, "step": 26 }, { "epoch": 0.05, "grad_norm": 2.3779844175646168, "learning_rate": 7.382812499999999e-07, "loss": 0.5507, "step": 27 }, { "epoch": 0.05, "grad_norm": 2.238956450682287, "learning_rate": 7.65625e-07, "loss": 0.5468, "step": 28 }, { "epoch": 0.06, "grad_norm": 1.949604312765129, "learning_rate": 7.9296875e-07, "loss": 0.5077, "step": 29 }, { "epoch": 0.06, "grad_norm": 2.5687010832740644, "learning_rate": 8.203125e-07, "loss": 0.6007, "step": 30 }, { "epoch": 0.06, "grad_norm": 1.7581913875261437, "learning_rate": 8.4765625e-07, "loss": 0.4693, "step": 31 }, { "epoch": 0.06, "grad_norm": 2.080928575111656, "learning_rate": 8.75e-07, "loss": 0.579, "step": 32 }, { "epoch": 0.06, "grad_norm": 2.0049064634520857, "learning_rate": 9.0234375e-07, "loss": 0.4988, "step": 33 }, { "epoch": 0.07, "grad_norm": 1.9323347227125987, "learning_rate": 9.296874999999999e-07, "loss": 0.5208, "step": 34 }, { "epoch": 0.07, "grad_norm": 2.1486661419855033, "learning_rate": 9.5703125e-07, "loss": 0.5085, "step": 35 }, { "epoch": 0.07, "grad_norm": 2.354236720570912, "learning_rate": 9.84375e-07, "loss": 0.5269, "step": 36 }, { "epoch": 0.07, "grad_norm": 2.0426536501490404, "learning_rate": 1.01171875e-06, "loss": 0.4577, "step": 37 }, { "epoch": 0.07, "grad_norm": 2.4275972673838027, "learning_rate": 1.0390625e-06, "loss": 0.5547, "step": 38 }, { "epoch": 0.08, "grad_norm": 1.8661965431626903, "learning_rate": 1.06640625e-06, "loss": 0.5188, "step": 39 }, { "epoch": 0.08, "grad_norm": 2.2353224713624655, "learning_rate": 1.09375e-06, "loss": 0.505, "step": 40 }, { "epoch": 0.08, "grad_norm": 2.1175750099721697, "learning_rate": 1.12109375e-06, "loss": 0.5114, "step": 41 }, { "epoch": 0.08, "grad_norm": 1.819555197095869, "learning_rate": 1.1484375e-06, "loss": 0.4851, "step": 42 }, { "epoch": 0.08, "grad_norm": 2.0686225356865617, "learning_rate": 1.17578125e-06, "loss": 0.508, "step": 43 }, { "epoch": 0.09, "grad_norm": 2.4044491498533103, "learning_rate": 1.203125e-06, "loss": 0.6324, "step": 44 }, { "epoch": 0.09, "grad_norm": 2.0900695464469288, "learning_rate": 1.23046875e-06, "loss": 0.4948, "step": 45 }, { "epoch": 0.09, "grad_norm": 1.8677129744056007, "learning_rate": 1.2578125e-06, "loss": 0.5125, "step": 46 }, { "epoch": 0.09, "grad_norm": 1.9204494559869265, "learning_rate": 1.28515625e-06, "loss": 0.497, "step": 47 }, { "epoch": 0.09, "grad_norm": 1.6628026092314254, "learning_rate": 1.3125e-06, "loss": 0.4507, "step": 48 }, { "epoch": 0.1, "grad_norm": 1.8268622873360458, "learning_rate": 1.33984375e-06, "loss": 0.5894, "step": 49 }, { "epoch": 0.1, "grad_norm": 1.7682156362759995, "learning_rate": 1.3671875e-06, "loss": 0.4506, "step": 50 }, { "epoch": 0.1, "grad_norm": 1.7292349931490945, "learning_rate": 1.39453125e-06, "loss": 0.4432, "step": 51 }, { "epoch": 0.1, "grad_norm": 1.6365928691895806, "learning_rate": 1.421875e-06, "loss": 0.4281, "step": 52 }, { "epoch": 0.1, "grad_norm": 1.8929951375892726, "learning_rate": 1.44921875e-06, "loss": 0.4704, "step": 53 }, { "epoch": 0.11, "grad_norm": 1.7798658553286189, "learning_rate": 1.4765624999999999e-06, "loss": 0.4758, "step": 54 }, { "epoch": 0.11, "grad_norm": 1.5132049869591295, "learning_rate": 1.50390625e-06, "loss": 0.4431, "step": 55 }, { "epoch": 0.11, "grad_norm": 1.786669693462764, "learning_rate": 1.53125e-06, "loss": 0.4583, "step": 56 }, { "epoch": 0.11, "grad_norm": 1.8071346306214575, "learning_rate": 1.55859375e-06, "loss": 0.4312, "step": 57 }, { "epoch": 0.11, "grad_norm": 1.6815536252349477, "learning_rate": 1.5859375e-06, "loss": 0.5283, "step": 58 }, { "epoch": 0.12, "grad_norm": 1.5179586620456682, "learning_rate": 1.61328125e-06, "loss": 0.3334, "step": 59 }, { "epoch": 0.12, "grad_norm": 1.5090951824642556, "learning_rate": 1.640625e-06, "loss": 0.4235, "step": 60 }, { "epoch": 0.12, "grad_norm": 1.8945788563237862, "learning_rate": 1.6679687499999999e-06, "loss": 0.4998, "step": 61 }, { "epoch": 0.12, "grad_norm": 1.736735337115983, "learning_rate": 1.6953125e-06, "loss": 0.4739, "step": 62 }, { "epoch": 0.12, "grad_norm": 1.7118313788890553, "learning_rate": 1.72265625e-06, "loss": 0.4363, "step": 63 }, { "epoch": 0.12, "grad_norm": 2.0233293870319056, "learning_rate": 1.75e-06, "loss": 0.414, "step": 64 }, { "epoch": 0.13, "grad_norm": 1.9508041864738046, "learning_rate": 1.77734375e-06, "loss": 0.4891, "step": 65 }, { "epoch": 0.13, "grad_norm": 1.8579553947725151, "learning_rate": 1.8046875e-06, "loss": 0.4513, "step": 66 }, { "epoch": 0.13, "grad_norm": 1.7077612740622743, "learning_rate": 1.83203125e-06, "loss": 0.4443, "step": 67 }, { "epoch": 0.13, "grad_norm": 1.777993936008971, "learning_rate": 1.8593749999999999e-06, "loss": 0.4335, "step": 68 }, { "epoch": 0.13, "grad_norm": 1.6575639651076723, "learning_rate": 1.88671875e-06, "loss": 0.4853, "step": 69 }, { "epoch": 0.14, "grad_norm": 1.7880136437994043, "learning_rate": 1.9140625e-06, "loss": 0.4701, "step": 70 }, { "epoch": 0.14, "grad_norm": 1.8888968280304452, "learning_rate": 1.94140625e-06, "loss": 0.5283, "step": 71 }, { "epoch": 0.14, "grad_norm": 1.805920986260904, "learning_rate": 1.96875e-06, "loss": 0.5019, "step": 72 }, { "epoch": 0.14, "grad_norm": 1.922194280119805, "learning_rate": 1.99609375e-06, "loss": 0.5526, "step": 73 }, { "epoch": 0.14, "grad_norm": 1.5879991195192271, "learning_rate": 2.0234375e-06, "loss": 0.4709, "step": 74 }, { "epoch": 0.15, "grad_norm": 1.648788299098461, "learning_rate": 2.05078125e-06, "loss": 0.4395, "step": 75 }, { "epoch": 0.15, "grad_norm": 1.6909038510367775, "learning_rate": 2.078125e-06, "loss": 0.4055, "step": 76 }, { "epoch": 0.15, "grad_norm": 1.7014792704046715, "learning_rate": 2.10546875e-06, "loss": 0.4175, "step": 77 }, { "epoch": 0.15, "grad_norm": 1.6800516005840025, "learning_rate": 2.1328125e-06, "loss": 0.4477, "step": 78 }, { "epoch": 0.15, "grad_norm": 1.730286231271452, "learning_rate": 2.16015625e-06, "loss": 0.4744, "step": 79 }, { "epoch": 0.16, "grad_norm": 1.4985133621089306, "learning_rate": 2.1875e-06, "loss": 0.4242, "step": 80 }, { "epoch": 0.16, "grad_norm": 1.7273343648326056, "learning_rate": 2.21484375e-06, "loss": 0.4987, "step": 81 }, { "epoch": 0.16, "grad_norm": 1.6506990743654066, "learning_rate": 2.2421875e-06, "loss": 0.3981, "step": 82 }, { "epoch": 0.16, "grad_norm": 1.708706990116356, "learning_rate": 2.26953125e-06, "loss": 0.4642, "step": 83 }, { "epoch": 0.16, "grad_norm": 1.533215165569491, "learning_rate": 2.296875e-06, "loss": 0.4032, "step": 84 }, { "epoch": 0.17, "grad_norm": 1.626443933318734, "learning_rate": 2.32421875e-06, "loss": 0.4301, "step": 85 }, { "epoch": 0.17, "grad_norm": 1.5206809398796302, "learning_rate": 2.3515625e-06, "loss": 0.3984, "step": 86 }, { "epoch": 0.17, "grad_norm": 1.6283983599205825, "learning_rate": 2.37890625e-06, "loss": 0.4223, "step": 87 }, { "epoch": 0.17, "grad_norm": 1.7969773816254218, "learning_rate": 2.40625e-06, "loss": 0.4468, "step": 88 }, { "epoch": 0.17, "grad_norm": 1.6643997849274585, "learning_rate": 2.43359375e-06, "loss": 0.3757, "step": 89 }, { "epoch": 0.18, "grad_norm": 1.6304325632471675, "learning_rate": 2.4609375e-06, "loss": 0.4196, "step": 90 }, { "epoch": 0.18, "grad_norm": 1.8215320257930196, "learning_rate": 2.48828125e-06, "loss": 0.5063, "step": 91 }, { "epoch": 0.18, "grad_norm": 1.6044941873769307, "learning_rate": 2.515625e-06, "loss": 0.3913, "step": 92 }, { "epoch": 0.18, "grad_norm": 1.5379877817348175, "learning_rate": 2.54296875e-06, "loss": 0.3972, "step": 93 }, { "epoch": 0.18, "grad_norm": 1.530546582634043, "learning_rate": 2.5703125e-06, "loss": 0.4656, "step": 94 }, { "epoch": 0.19, "grad_norm": 1.729848709901716, "learning_rate": 2.59765625e-06, "loss": 0.4638, "step": 95 }, { "epoch": 0.19, "grad_norm": 1.8261412348731878, "learning_rate": 2.625e-06, "loss": 0.464, "step": 96 }, { "epoch": 0.19, "grad_norm": 1.8979064931234173, "learning_rate": 2.65234375e-06, "loss": 0.5098, "step": 97 }, { "epoch": 0.19, "grad_norm": 1.810958159950579, "learning_rate": 2.6796875e-06, "loss": 0.5244, "step": 98 }, { "epoch": 0.19, "grad_norm": 1.7405405092236963, "learning_rate": 2.70703125e-06, "loss": 0.4607, "step": 99 }, { "epoch": 0.2, "grad_norm": 1.9212445209859856, "learning_rate": 2.734375e-06, "loss": 0.5642, "step": 100 }, { "epoch": 0.2, "grad_norm": 1.6190108674711587, "learning_rate": 2.76171875e-06, "loss": 0.4438, "step": 101 }, { "epoch": 0.2, "grad_norm": 1.7091790904549895, "learning_rate": 2.7890625e-06, "loss": 0.4812, "step": 102 }, { "epoch": 0.2, "grad_norm": 1.781783038220379, "learning_rate": 2.81640625e-06, "loss": 0.396, "step": 103 }, { "epoch": 0.2, "grad_norm": 1.7429296872382805, "learning_rate": 2.84375e-06, "loss": 0.4316, "step": 104 }, { "epoch": 0.21, "grad_norm": 1.6685192066925245, "learning_rate": 2.87109375e-06, "loss": 0.4463, "step": 105 }, { "epoch": 0.21, "grad_norm": 1.6604188558097106, "learning_rate": 2.8984375e-06, "loss": 0.4458, "step": 106 }, { "epoch": 0.21, "grad_norm": 1.6603255217987127, "learning_rate": 2.92578125e-06, "loss": 0.3831, "step": 107 }, { "epoch": 0.21, "grad_norm": 1.7515755648421543, "learning_rate": 2.9531249999999998e-06, "loss": 0.3941, "step": 108 }, { "epoch": 0.21, "grad_norm": 1.479753318796779, "learning_rate": 2.98046875e-06, "loss": 0.4021, "step": 109 }, { "epoch": 0.21, "grad_norm": 1.872749867472135, "learning_rate": 3.0078125e-06, "loss": 0.5431, "step": 110 }, { "epoch": 0.22, "grad_norm": 1.884113873604911, "learning_rate": 3.03515625e-06, "loss": 0.4619, "step": 111 }, { "epoch": 0.22, "grad_norm": 1.6303375325263114, "learning_rate": 3.0625e-06, "loss": 0.4507, "step": 112 }, { "epoch": 0.22, "grad_norm": 1.8118310331615943, "learning_rate": 3.08984375e-06, "loss": 0.4222, "step": 113 }, { "epoch": 0.22, "grad_norm": 1.5470274462052933, "learning_rate": 3.1171875e-06, "loss": 0.4075, "step": 114 }, { "epoch": 0.22, "grad_norm": 1.5658079313414102, "learning_rate": 3.1445312499999998e-06, "loss": 0.4249, "step": 115 }, { "epoch": 0.23, "grad_norm": 1.8829013328690554, "learning_rate": 3.171875e-06, "loss": 0.4704, "step": 116 }, { "epoch": 0.23, "grad_norm": 1.6615560283993915, "learning_rate": 3.19921875e-06, "loss": 0.4277, "step": 117 }, { "epoch": 0.23, "grad_norm": 2.0212579724469757, "learning_rate": 3.2265625e-06, "loss": 0.4585, "step": 118 }, { "epoch": 0.23, "grad_norm": 1.5256586596012434, "learning_rate": 3.25390625e-06, "loss": 0.3688, "step": 119 }, { "epoch": 0.23, "grad_norm": 1.53513175027563, "learning_rate": 3.28125e-06, "loss": 0.3709, "step": 120 }, { "epoch": 0.24, "grad_norm": 1.9214579657390796, "learning_rate": 3.30859375e-06, "loss": 0.4931, "step": 121 }, { "epoch": 0.24, "grad_norm": 1.6063790259065545, "learning_rate": 3.3359374999999998e-06, "loss": 0.3884, "step": 122 }, { "epoch": 0.24, "grad_norm": 1.9574883998278414, "learning_rate": 3.36328125e-06, "loss": 0.6157, "step": 123 }, { "epoch": 0.24, "grad_norm": 1.7886981572141256, "learning_rate": 3.390625e-06, "loss": 0.4522, "step": 124 }, { "epoch": 0.24, "grad_norm": 1.8382951320153509, "learning_rate": 3.41796875e-06, "loss": 0.4402, "step": 125 }, { "epoch": 0.25, "grad_norm": 1.9095954610278296, "learning_rate": 3.4453125e-06, "loss": 0.4161, "step": 126 }, { "epoch": 0.25, "grad_norm": 1.707856398771417, "learning_rate": 3.47265625e-06, "loss": 0.4585, "step": 127 }, { "epoch": 0.25, "grad_norm": 1.6331867464661778, "learning_rate": 3.5e-06, "loss": 0.409, "step": 128 }, { "epoch": 0.25, "grad_norm": 1.5618629563989987, "learning_rate": 3.5273437499999998e-06, "loss": 0.4381, "step": 129 }, { "epoch": 0.25, "grad_norm": 1.7707923896186502, "learning_rate": 3.5546875e-06, "loss": 0.4481, "step": 130 }, { "epoch": 0.26, "grad_norm": 1.5908208626795446, "learning_rate": 3.58203125e-06, "loss": 0.4051, "step": 131 }, { "epoch": 0.26, "grad_norm": 1.7211396878061895, "learning_rate": 3.609375e-06, "loss": 0.3843, "step": 132 }, { "epoch": 0.26, "grad_norm": 1.5041181250326119, "learning_rate": 3.63671875e-06, "loss": 0.4043, "step": 133 }, { "epoch": 0.26, "grad_norm": 1.5553157675396516, "learning_rate": 3.6640625e-06, "loss": 0.38, "step": 134 }, { "epoch": 0.26, "grad_norm": 1.7712363605884898, "learning_rate": 3.69140625e-06, "loss": 0.417, "step": 135 }, { "epoch": 0.27, "grad_norm": 1.784458094130054, "learning_rate": 3.7187499999999998e-06, "loss": 0.4264, "step": 136 }, { "epoch": 0.27, "grad_norm": 1.6480350107533233, "learning_rate": 3.74609375e-06, "loss": 0.4868, "step": 137 }, { "epoch": 0.27, "grad_norm": 1.5089497142423627, "learning_rate": 3.7734375e-06, "loss": 0.401, "step": 138 }, { "epoch": 0.27, "grad_norm": 1.6355710287430631, "learning_rate": 3.80078125e-06, "loss": 0.4196, "step": 139 }, { "epoch": 0.27, "grad_norm": 1.7812735776547552, "learning_rate": 3.828125e-06, "loss": 0.428, "step": 140 }, { "epoch": 0.28, "grad_norm": 1.5254037147416897, "learning_rate": 3.85546875e-06, "loss": 0.3745, "step": 141 }, { "epoch": 0.28, "grad_norm": 1.8618797668136995, "learning_rate": 3.8828125e-06, "loss": 0.4326, "step": 142 }, { "epoch": 0.28, "grad_norm": 1.5968651786544443, "learning_rate": 3.91015625e-06, "loss": 0.3865, "step": 143 }, { "epoch": 0.28, "grad_norm": 1.9707365024664616, "learning_rate": 3.9375e-06, "loss": 0.4476, "step": 144 }, { "epoch": 0.28, "grad_norm": 1.6562468060802487, "learning_rate": 3.96484375e-06, "loss": 0.417, "step": 145 }, { "epoch": 0.29, "grad_norm": 1.5891545226101267, "learning_rate": 3.9921875e-06, "loss": 0.3725, "step": 146 }, { "epoch": 0.29, "grad_norm": 1.8547739637888951, "learning_rate": 4.01953125e-06, "loss": 0.4338, "step": 147 }, { "epoch": 0.29, "grad_norm": 1.7484621704723866, "learning_rate": 4.046875e-06, "loss": 0.4218, "step": 148 }, { "epoch": 0.29, "grad_norm": 1.5630222680029855, "learning_rate": 4.07421875e-06, "loss": 0.4337, "step": 149 }, { "epoch": 0.29, "grad_norm": 1.6611544199636974, "learning_rate": 4.1015625e-06, "loss": 0.4033, "step": 150 }, { "epoch": 0.29, "grad_norm": 2.066419685922582, "learning_rate": 4.12890625e-06, "loss": 0.4346, "step": 151 }, { "epoch": 0.3, "grad_norm": 1.7255432832803852, "learning_rate": 4.15625e-06, "loss": 0.4453, "step": 152 }, { "epoch": 0.3, "grad_norm": 1.5987696694945261, "learning_rate": 4.18359375e-06, "loss": 0.4367, "step": 153 }, { "epoch": 0.3, "grad_norm": 1.74809759863357, "learning_rate": 4.2109375e-06, "loss": 0.3892, "step": 154 }, { "epoch": 0.3, "grad_norm": 1.8300332392362222, "learning_rate": 4.23828125e-06, "loss": 0.4027, "step": 155 }, { "epoch": 0.3, "grad_norm": 1.5046605555845984, "learning_rate": 4.265625e-06, "loss": 0.3847, "step": 156 }, { "epoch": 0.31, "grad_norm": 1.5259805967133637, "learning_rate": 4.29296875e-06, "loss": 0.4534, "step": 157 }, { "epoch": 0.31, "grad_norm": 1.8290074921486483, "learning_rate": 4.3203125e-06, "loss": 0.3846, "step": 158 }, { "epoch": 0.31, "grad_norm": 1.670052857442523, "learning_rate": 4.34765625e-06, "loss": 0.4218, "step": 159 }, { "epoch": 0.31, "grad_norm": 1.838099007107515, "learning_rate": 4.375e-06, "loss": 0.4138, "step": 160 }, { "epoch": 0.31, "grad_norm": 1.8411812559852296, "learning_rate": 4.40234375e-06, "loss": 0.3896, "step": 161 }, { "epoch": 0.32, "grad_norm": 1.7177735152171905, "learning_rate": 4.4296875e-06, "loss": 0.4465, "step": 162 }, { "epoch": 0.32, "grad_norm": 1.8693531440819167, "learning_rate": 4.45703125e-06, "loss": 0.4572, "step": 163 }, { "epoch": 0.32, "grad_norm": 1.9207152701679124, "learning_rate": 4.484375e-06, "loss": 0.4551, "step": 164 }, { "epoch": 0.32, "grad_norm": 1.6812557453273627, "learning_rate": 4.51171875e-06, "loss": 0.4362, "step": 165 }, { "epoch": 0.32, "grad_norm": 1.7603728011911475, "learning_rate": 4.5390625e-06, "loss": 0.4515, "step": 166 }, { "epoch": 0.33, "grad_norm": 1.7184306548945683, "learning_rate": 4.56640625e-06, "loss": 0.3473, "step": 167 }, { "epoch": 0.33, "grad_norm": 1.5972547241908788, "learning_rate": 4.59375e-06, "loss": 0.3994, "step": 168 }, { "epoch": 0.33, "grad_norm": 1.4650628102867491, "learning_rate": 4.62109375e-06, "loss": 0.3954, "step": 169 }, { "epoch": 0.33, "grad_norm": 1.7599778494358067, "learning_rate": 4.6484375e-06, "loss": 0.4331, "step": 170 }, { "epoch": 0.33, "grad_norm": 1.9183014925780575, "learning_rate": 4.67578125e-06, "loss": 0.5148, "step": 171 }, { "epoch": 0.34, "grad_norm": 1.714016735063692, "learning_rate": 4.703125e-06, "loss": 0.4428, "step": 172 }, { "epoch": 0.34, "grad_norm": 1.487633391835312, "learning_rate": 4.73046875e-06, "loss": 0.3435, "step": 173 }, { "epoch": 0.34, "grad_norm": 1.6069336847032472, "learning_rate": 4.7578125e-06, "loss": 0.3874, "step": 174 }, { "epoch": 0.34, "grad_norm": 1.6311644796741402, "learning_rate": 4.78515625e-06, "loss": 0.4326, "step": 175 }, { "epoch": 0.34, "grad_norm": 1.588429067744261, "learning_rate": 4.8125e-06, "loss": 0.4604, "step": 176 }, { "epoch": 0.35, "grad_norm": 1.9303420718498465, "learning_rate": 4.83984375e-06, "loss": 0.4159, "step": 177 }, { "epoch": 0.35, "grad_norm": 1.7179742255184733, "learning_rate": 4.8671875e-06, "loss": 0.452, "step": 178 }, { "epoch": 0.35, "grad_norm": 1.9261905271804165, "learning_rate": 4.89453125e-06, "loss": 0.5006, "step": 179 }, { "epoch": 0.35, "grad_norm": 1.554906700804885, "learning_rate": 4.921875e-06, "loss": 0.3443, "step": 180 }, { "epoch": 0.35, "grad_norm": 1.766649301167674, "learning_rate": 4.94921875e-06, "loss": 0.4229, "step": 181 }, { "epoch": 0.36, "grad_norm": 2.0204707298915525, "learning_rate": 4.9765625e-06, "loss": 0.4444, "step": 182 }, { "epoch": 0.36, "grad_norm": 1.837370637851734, "learning_rate": 5.00390625e-06, "loss": 0.3955, "step": 183 }, { "epoch": 0.36, "grad_norm": 1.6377788789869152, "learning_rate": 5.03125e-06, "loss": 0.427, "step": 184 }, { "epoch": 0.36, "grad_norm": 1.648447040153375, "learning_rate": 5.05859375e-06, "loss": 0.4016, "step": 185 }, { "epoch": 0.36, "grad_norm": 1.5780761828632177, "learning_rate": 5.0859375e-06, "loss": 0.3949, "step": 186 }, { "epoch": 0.37, "grad_norm": 1.841923056360508, "learning_rate": 5.11328125e-06, "loss": 0.4442, "step": 187 }, { "epoch": 0.37, "grad_norm": 1.514454091306901, "learning_rate": 5.140625e-06, "loss": 0.406, "step": 188 }, { "epoch": 0.37, "grad_norm": 1.7317144567353495, "learning_rate": 5.16796875e-06, "loss": 0.4625, "step": 189 }, { "epoch": 0.37, "grad_norm": 1.663115727810181, "learning_rate": 5.1953125e-06, "loss": 0.4368, "step": 190 }, { "epoch": 0.37, "grad_norm": 2.4380073015012114, "learning_rate": 5.22265625e-06, "loss": 0.4849, "step": 191 }, { "epoch": 0.38, "grad_norm": 1.9757286802961604, "learning_rate": 5.25e-06, "loss": 0.4217, "step": 192 }, { "epoch": 0.38, "grad_norm": 1.6436044544748893, "learning_rate": 5.27734375e-06, "loss": 0.4177, "step": 193 }, { "epoch": 0.38, "grad_norm": 1.7363184892575236, "learning_rate": 5.3046875e-06, "loss": 0.4065, "step": 194 }, { "epoch": 0.38, "grad_norm": 1.649248993812584, "learning_rate": 5.33203125e-06, "loss": 0.3235, "step": 195 }, { "epoch": 0.38, "grad_norm": 1.840122302347385, "learning_rate": 5.359375e-06, "loss": 0.387, "step": 196 }, { "epoch": 0.38, "grad_norm": 2.0573028227453296, "learning_rate": 5.38671875e-06, "loss": 0.4631, "step": 197 }, { "epoch": 0.39, "grad_norm": 1.6983923110828483, "learning_rate": 5.4140625e-06, "loss": 0.3954, "step": 198 }, { "epoch": 0.39, "grad_norm": 1.7963601510536131, "learning_rate": 5.44140625e-06, "loss": 0.3834, "step": 199 }, { "epoch": 0.39, "grad_norm": 1.5452713783607808, "learning_rate": 5.46875e-06, "loss": 0.3417, "step": 200 }, { "epoch": 0.39, "grad_norm": 2.480516727527445, "learning_rate": 5.49609375e-06, "loss": 0.5015, "step": 201 }, { "epoch": 0.39, "grad_norm": 1.911898316688283, "learning_rate": 5.5234375e-06, "loss": 0.4459, "step": 202 }, { "epoch": 0.4, "grad_norm": 2.0194873991808535, "learning_rate": 5.55078125e-06, "loss": 0.4264, "step": 203 }, { "epoch": 0.4, "grad_norm": 1.8806385422955485, "learning_rate": 5.578125e-06, "loss": 0.4367, "step": 204 }, { "epoch": 0.4, "grad_norm": 2.08068017780106, "learning_rate": 5.60546875e-06, "loss": 0.43, "step": 205 }, { "epoch": 0.4, "grad_norm": 2.038725054080853, "learning_rate": 5.6328125e-06, "loss": 0.4076, "step": 206 }, { "epoch": 0.4, "grad_norm": 1.7873784029780868, "learning_rate": 5.66015625e-06, "loss": 0.3706, "step": 207 }, { "epoch": 0.41, "grad_norm": 1.7421956668724328, "learning_rate": 5.6875e-06, "loss": 0.3991, "step": 208 }, { "epoch": 0.41, "grad_norm": 1.8121345018164325, "learning_rate": 5.7148437499999996e-06, "loss": 0.4618, "step": 209 }, { "epoch": 0.41, "grad_norm": 1.5264607452490162, "learning_rate": 5.7421875e-06, "loss": 0.3887, "step": 210 }, { "epoch": 0.41, "grad_norm": 1.4601956314428939, "learning_rate": 5.76953125e-06, "loss": 0.3502, "step": 211 }, { "epoch": 0.41, "grad_norm": 1.4858623899201022, "learning_rate": 5.796875e-06, "loss": 0.4083, "step": 212 }, { "epoch": 0.42, "grad_norm": 2.058373755208833, "learning_rate": 5.82421875e-06, "loss": 0.5156, "step": 213 }, { "epoch": 0.42, "grad_norm": 1.9394041335374566, "learning_rate": 5.8515625e-06, "loss": 0.418, "step": 214 }, { "epoch": 0.42, "grad_norm": 1.6261674957091425, "learning_rate": 5.87890625e-06, "loss": 0.453, "step": 215 }, { "epoch": 0.42, "grad_norm": 2.366023652663027, "learning_rate": 5.9062499999999996e-06, "loss": 0.5127, "step": 216 }, { "epoch": 0.42, "grad_norm": 1.7621154537810195, "learning_rate": 5.93359375e-06, "loss": 0.454, "step": 217 }, { "epoch": 0.43, "grad_norm": 1.8432299597815534, "learning_rate": 5.9609375e-06, "loss": 0.3242, "step": 218 }, { "epoch": 0.43, "grad_norm": 1.9593859030752425, "learning_rate": 5.98828125e-06, "loss": 0.4902, "step": 219 }, { "epoch": 0.43, "grad_norm": 1.7610414686459346, "learning_rate": 6.015625e-06, "loss": 0.4008, "step": 220 }, { "epoch": 0.43, "grad_norm": 1.6206522765357234, "learning_rate": 6.04296875e-06, "loss": 0.4447, "step": 221 }, { "epoch": 0.43, "grad_norm": 1.7347394254867612, "learning_rate": 6.0703125e-06, "loss": 0.4157, "step": 222 }, { "epoch": 0.44, "grad_norm": 1.6996608020246171, "learning_rate": 6.0976562499999996e-06, "loss": 0.4549, "step": 223 }, { "epoch": 0.44, "grad_norm": 1.6987258915404204, "learning_rate": 6.125e-06, "loss": 0.4191, "step": 224 }, { "epoch": 0.44, "grad_norm": 1.7323057252346672, "learning_rate": 6.15234375e-06, "loss": 0.4483, "step": 225 }, { "epoch": 0.44, "grad_norm": 1.8080609640530043, "learning_rate": 6.1796875e-06, "loss": 0.4103, "step": 226 }, { "epoch": 0.44, "grad_norm": 1.6294657158516932, "learning_rate": 6.20703125e-06, "loss": 0.4016, "step": 227 }, { "epoch": 0.45, "grad_norm": 1.9035888099107414, "learning_rate": 6.234375e-06, "loss": 0.4156, "step": 228 }, { "epoch": 0.45, "grad_norm": 1.7752702311073079, "learning_rate": 6.26171875e-06, "loss": 0.4438, "step": 229 }, { "epoch": 0.45, "grad_norm": 1.5370591369525417, "learning_rate": 6.2890624999999996e-06, "loss": 0.4218, "step": 230 }, { "epoch": 0.45, "grad_norm": 1.6580984531594065, "learning_rate": 6.31640625e-06, "loss": 0.4186, "step": 231 }, { "epoch": 0.45, "grad_norm": 1.744943108292908, "learning_rate": 6.34375e-06, "loss": 0.4469, "step": 232 }, { "epoch": 0.46, "grad_norm": 1.636866021689124, "learning_rate": 6.37109375e-06, "loss": 0.3554, "step": 233 }, { "epoch": 0.46, "grad_norm": 1.5635276577228927, "learning_rate": 6.3984375e-06, "loss": 0.4142, "step": 234 }, { "epoch": 0.46, "grad_norm": 1.5665407291142348, "learning_rate": 6.42578125e-06, "loss": 0.4057, "step": 235 }, { "epoch": 0.46, "grad_norm": 1.5512577372318066, "learning_rate": 6.453125e-06, "loss": 0.4264, "step": 236 }, { "epoch": 0.46, "grad_norm": 1.5770233095969752, "learning_rate": 6.4804687499999995e-06, "loss": 0.4402, "step": 237 }, { "epoch": 0.46, "grad_norm": 1.4616252772583602, "learning_rate": 6.5078125e-06, "loss": 0.3564, "step": 238 }, { "epoch": 0.47, "grad_norm": 1.5272926904089874, "learning_rate": 6.53515625e-06, "loss": 0.4215, "step": 239 }, { "epoch": 0.47, "grad_norm": 1.7912047504129245, "learning_rate": 6.5625e-06, "loss": 0.4574, "step": 240 }, { "epoch": 0.47, "grad_norm": 1.523442519443845, "learning_rate": 6.58984375e-06, "loss": 0.3812, "step": 241 }, { "epoch": 0.47, "grad_norm": 1.565773828013147, "learning_rate": 6.6171875e-06, "loss": 0.3819, "step": 242 }, { "epoch": 0.47, "grad_norm": 1.6581699806335515, "learning_rate": 6.64453125e-06, "loss": 0.4135, "step": 243 }, { "epoch": 0.48, "grad_norm": 1.7349059892096943, "learning_rate": 6.6718749999999995e-06, "loss": 0.3974, "step": 244 }, { "epoch": 0.48, "grad_norm": 1.5922377783671553, "learning_rate": 6.69921875e-06, "loss": 0.4096, "step": 245 }, { "epoch": 0.48, "grad_norm": 1.462631162243193, "learning_rate": 6.7265625e-06, "loss": 0.4593, "step": 246 }, { "epoch": 0.48, "grad_norm": 1.5298789406064595, "learning_rate": 6.75390625e-06, "loss": 0.3489, "step": 247 }, { "epoch": 0.48, "grad_norm": 1.6522716420116605, "learning_rate": 6.78125e-06, "loss": 0.4269, "step": 248 }, { "epoch": 0.49, "grad_norm": 1.548590736472323, "learning_rate": 6.80859375e-06, "loss": 0.3406, "step": 249 }, { "epoch": 0.49, "grad_norm": 1.6522731460503821, "learning_rate": 6.8359375e-06, "loss": 0.4034, "step": 250 }, { "epoch": 0.49, "grad_norm": 1.4329465574741809, "learning_rate": 6.8632812499999995e-06, "loss": 0.3495, "step": 251 }, { "epoch": 0.49, "grad_norm": 1.8070030088241065, "learning_rate": 6.890625e-06, "loss": 0.4556, "step": 252 }, { "epoch": 0.49, "grad_norm": 1.5170915306322663, "learning_rate": 6.91796875e-06, "loss": 0.408, "step": 253 }, { "epoch": 0.5, "grad_norm": 1.6613486671753739, "learning_rate": 6.9453125e-06, "loss": 0.3875, "step": 254 }, { "epoch": 0.5, "grad_norm": 1.3510574520713194, "learning_rate": 6.97265625e-06, "loss": 0.3958, "step": 255 }, { "epoch": 0.5, "grad_norm": 1.6822067715666484, "learning_rate": 7e-06, "loss": 0.4247, "step": 256 }, { "epoch": 0.5, "grad_norm": 1.7400042732384944, "learning_rate": 6.999996746335437e-06, "loss": 0.4521, "step": 257 }, { "epoch": 0.5, "grad_norm": 1.5325648661935098, "learning_rate": 6.9999869853477956e-06, "loss": 0.3803, "step": 258 }, { "epoch": 0.51, "grad_norm": 1.8004159649287634, "learning_rate": 6.999970717055227e-06, "loss": 0.4728, "step": 259 }, { "epoch": 0.51, "grad_norm": 1.4667087631098832, "learning_rate": 6.9999479414879755e-06, "loss": 0.3933, "step": 260 }, { "epoch": 0.51, "grad_norm": 1.6387998178407548, "learning_rate": 6.999918658688386e-06, "loss": 0.4407, "step": 261 }, { "epoch": 0.51, "grad_norm": 1.6466055903686765, "learning_rate": 6.9998828687109035e-06, "loss": 0.3976, "step": 262 }, { "epoch": 0.51, "grad_norm": 1.5798436432933527, "learning_rate": 6.99984057162207e-06, "loss": 0.3813, "step": 263 }, { "epoch": 0.52, "grad_norm": 1.546091388264012, "learning_rate": 6.999791767500524e-06, "loss": 0.3691, "step": 264 }, { "epoch": 0.52, "grad_norm": 1.7115045846539407, "learning_rate": 6.999736456437006e-06, "loss": 0.3929, "step": 265 }, { "epoch": 0.52, "grad_norm": 1.7596739321355748, "learning_rate": 6.9996746385343505e-06, "loss": 0.3548, "step": 266 }, { "epoch": 0.52, "grad_norm": 1.8192931737950122, "learning_rate": 6.999606313907494e-06, "loss": 0.3926, "step": 267 }, { "epoch": 0.52, "grad_norm": 1.5969623370174006, "learning_rate": 6.999531482683467e-06, "loss": 0.3975, "step": 268 }, { "epoch": 0.53, "grad_norm": 1.5776314644267835, "learning_rate": 6.999450145001397e-06, "loss": 0.4144, "step": 269 }, { "epoch": 0.53, "grad_norm": 1.4650232608993643, "learning_rate": 6.999362301012511e-06, "loss": 0.4002, "step": 270 }, { "epoch": 0.53, "grad_norm": 1.7935831569389924, "learning_rate": 6.999267950880133e-06, "loss": 0.424, "step": 271 }, { "epoch": 0.53, "grad_norm": 1.7409134077893513, "learning_rate": 6.999167094779681e-06, "loss": 0.4328, "step": 272 }, { "epoch": 0.53, "grad_norm": 1.4786180125333104, "learning_rate": 6.999059732898672e-06, "loss": 0.3793, "step": 273 }, { "epoch": 0.54, "grad_norm": 1.6449181205746937, "learning_rate": 6.998945865436715e-06, "loss": 0.3407, "step": 274 }, { "epoch": 0.54, "grad_norm": 1.6974099553662596, "learning_rate": 6.998825492605517e-06, "loss": 0.4038, "step": 275 }, { "epoch": 0.54, "grad_norm": 1.6857571334157586, "learning_rate": 6.998698614628881e-06, "loss": 0.3735, "step": 276 }, { "epoch": 0.54, "grad_norm": 1.7158392754102496, "learning_rate": 6.998565231742702e-06, "loss": 0.3972, "step": 277 }, { "epoch": 0.54, "grad_norm": 1.8094492702273424, "learning_rate": 6.99842534419497e-06, "loss": 0.4936, "step": 278 }, { "epoch": 0.54, "grad_norm": 1.6220885675115237, "learning_rate": 6.99827895224577e-06, "loss": 0.4424, "step": 279 }, { "epoch": 0.55, "grad_norm": 1.7737577503781699, "learning_rate": 6.99812605616728e-06, "loss": 0.4428, "step": 280 }, { "epoch": 0.55, "grad_norm": 1.4359233175768376, "learning_rate": 6.997966656243768e-06, "loss": 0.367, "step": 281 }, { "epoch": 0.55, "grad_norm": 1.5523073060672277, "learning_rate": 6.997800752771598e-06, "loss": 0.407, "step": 282 }, { "epoch": 0.55, "grad_norm": 1.7884652699829031, "learning_rate": 6.997628346059223e-06, "loss": 0.4489, "step": 283 }, { "epoch": 0.55, "grad_norm": 1.401360322759287, "learning_rate": 6.997449436427188e-06, "loss": 0.3978, "step": 284 }, { "epoch": 0.56, "grad_norm": 1.8450506739215302, "learning_rate": 6.997264024208129e-06, "loss": 0.396, "step": 285 }, { "epoch": 0.56, "grad_norm": 1.6699069865395626, "learning_rate": 6.99707210974677e-06, "loss": 0.4511, "step": 286 }, { "epoch": 0.56, "grad_norm": 1.7708671214406244, "learning_rate": 6.9968736933999275e-06, "loss": 0.4243, "step": 287 }, { "epoch": 0.56, "grad_norm": 1.4660520329608013, "learning_rate": 6.996668775536502e-06, "loss": 0.4033, "step": 288 }, { "epoch": 0.56, "grad_norm": 1.792920605277754, "learning_rate": 6.996457356537486e-06, "loss": 0.4882, "step": 289 }, { "epoch": 0.57, "grad_norm": 1.5885110942398821, "learning_rate": 6.996239436795957e-06, "loss": 0.429, "step": 290 }, { "epoch": 0.57, "grad_norm": 1.355725140923794, "learning_rate": 6.996015016717079e-06, "loss": 0.3274, "step": 291 }, { "epoch": 0.57, "grad_norm": 1.5872202743228925, "learning_rate": 6.9957840967181034e-06, "loss": 0.3521, "step": 292 }, { "epoch": 0.57, "grad_norm": 1.5054964866757612, "learning_rate": 6.9955466772283635e-06, "loss": 0.3495, "step": 293 }, { "epoch": 0.57, "grad_norm": 1.6113631932700565, "learning_rate": 6.99530275868928e-06, "loss": 0.3531, "step": 294 }, { "epoch": 0.58, "grad_norm": 1.5355920007893307, "learning_rate": 6.995052341554354e-06, "loss": 0.3431, "step": 295 }, { "epoch": 0.58, "grad_norm": 1.5177369194502275, "learning_rate": 6.994795426289171e-06, "loss": 0.3471, "step": 296 }, { "epoch": 0.58, "grad_norm": 1.4932079043978095, "learning_rate": 6.9945320133713965e-06, "loss": 0.3558, "step": 297 }, { "epoch": 0.58, "grad_norm": 1.6313956627429136, "learning_rate": 6.994262103290778e-06, "loss": 0.4013, "step": 298 }, { "epoch": 0.58, "grad_norm": 1.6900918076635838, "learning_rate": 6.993985696549143e-06, "loss": 0.3739, "step": 299 }, { "epoch": 0.59, "grad_norm": 1.5830207219818921, "learning_rate": 6.993702793660396e-06, "loss": 0.3936, "step": 300 }, { "epoch": 0.59, "grad_norm": 1.7316007424850535, "learning_rate": 6.993413395150521e-06, "loss": 0.356, "step": 301 }, { "epoch": 0.59, "grad_norm": 1.4703605745714758, "learning_rate": 6.99311750155758e-06, "loss": 0.331, "step": 302 }, { "epoch": 0.59, "grad_norm": 1.6656549837385775, "learning_rate": 6.992815113431707e-06, "loss": 0.4528, "step": 303 }, { "epoch": 0.59, "grad_norm": 1.645782406125907, "learning_rate": 6.992506231335112e-06, "loss": 0.4089, "step": 304 }, { "epoch": 0.6, "grad_norm": 1.533542996055402, "learning_rate": 6.992190855842082e-06, "loss": 0.3885, "step": 305 }, { "epoch": 0.6, "grad_norm": 1.7835349792580901, "learning_rate": 6.9918689875389766e-06, "loss": 0.4222, "step": 306 }, { "epoch": 0.6, "grad_norm": 1.5694377711379892, "learning_rate": 6.991540627024222e-06, "loss": 0.3871, "step": 307 }, { "epoch": 0.6, "grad_norm": 1.8371234702999297, "learning_rate": 6.991205774908319e-06, "loss": 0.3515, "step": 308 }, { "epoch": 0.6, "grad_norm": 1.6326786144987195, "learning_rate": 6.990864431813838e-06, "loss": 0.382, "step": 309 }, { "epoch": 0.61, "grad_norm": 1.742696518900445, "learning_rate": 6.990516598375416e-06, "loss": 0.4372, "step": 310 }, { "epoch": 0.61, "grad_norm": 1.532321224363555, "learning_rate": 6.990162275239758e-06, "loss": 0.4247, "step": 311 }, { "epoch": 0.61, "grad_norm": 1.7668068394451153, "learning_rate": 6.9898014630656335e-06, "loss": 0.4626, "step": 312 }, { "epoch": 0.61, "grad_norm": 1.5626538142324693, "learning_rate": 6.989434162523879e-06, "loss": 0.4255, "step": 313 }, { "epoch": 0.61, "grad_norm": 1.5194279034655085, "learning_rate": 6.9890603742973934e-06, "loss": 0.3683, "step": 314 }, { "epoch": 0.62, "grad_norm": 1.816123383634497, "learning_rate": 6.988680099081137e-06, "loss": 0.3902, "step": 315 }, { "epoch": 0.62, "grad_norm": 1.8228044681903763, "learning_rate": 6.988293337582131e-06, "loss": 0.3734, "step": 316 }, { "epoch": 0.62, "grad_norm": 1.9748560454969526, "learning_rate": 6.987900090519458e-06, "loss": 0.5146, "step": 317 }, { "epoch": 0.62, "grad_norm": 1.6707085509357764, "learning_rate": 6.987500358624256e-06, "loss": 0.4168, "step": 318 }, { "epoch": 0.62, "grad_norm": 1.6281526486493931, "learning_rate": 6.987094142639722e-06, "loss": 0.4351, "step": 319 }, { "epoch": 0.62, "grad_norm": 1.7256634539720888, "learning_rate": 6.9866814433211094e-06, "loss": 0.4274, "step": 320 }, { "epoch": 0.63, "grad_norm": 1.5801130727854125, "learning_rate": 6.986262261435721e-06, "loss": 0.383, "step": 321 }, { "epoch": 0.63, "grad_norm": 1.7714897830503342, "learning_rate": 6.985836597762917e-06, "loss": 0.4241, "step": 322 }, { "epoch": 0.63, "grad_norm": 1.7694522156567096, "learning_rate": 6.985404453094107e-06, "loss": 0.4871, "step": 323 }, { "epoch": 0.63, "grad_norm": 1.6620477473773856, "learning_rate": 6.984965828232749e-06, "loss": 0.4174, "step": 324 }, { "epoch": 0.63, "grad_norm": 1.4168899332631593, "learning_rate": 6.984520723994351e-06, "loss": 0.374, "step": 325 }, { "epoch": 0.64, "grad_norm": 1.6240729425324465, "learning_rate": 6.984069141206469e-06, "loss": 0.3688, "step": 326 }, { "epoch": 0.64, "grad_norm": 1.9070650547171257, "learning_rate": 6.983611080708701e-06, "loss": 0.4441, "step": 327 }, { "epoch": 0.64, "grad_norm": 1.6163060533423277, "learning_rate": 6.983146543352689e-06, "loss": 0.4766, "step": 328 }, { "epoch": 0.64, "grad_norm": 1.6713075030505575, "learning_rate": 6.982675530002119e-06, "loss": 0.458, "step": 329 }, { "epoch": 0.64, "grad_norm": 1.5068490293154886, "learning_rate": 6.9821980415327175e-06, "loss": 0.4218, "step": 330 }, { "epoch": 0.65, "grad_norm": 1.4311478203412418, "learning_rate": 6.981714078832247e-06, "loss": 0.4008, "step": 331 }, { "epoch": 0.65, "grad_norm": 1.5779221179646437, "learning_rate": 6.981223642800509e-06, "loss": 0.4078, "step": 332 }, { "epoch": 0.65, "grad_norm": 1.5740448566603595, "learning_rate": 6.980726734349341e-06, "loss": 0.435, "step": 333 }, { "epoch": 0.65, "grad_norm": 1.458531774845749, "learning_rate": 6.980223354402614e-06, "loss": 0.4106, "step": 334 }, { "epoch": 0.65, "grad_norm": 1.5553514341788026, "learning_rate": 6.979713503896228e-06, "loss": 0.4721, "step": 335 }, { "epoch": 0.66, "grad_norm": 1.4685800950248606, "learning_rate": 6.979197183778118e-06, "loss": 0.3805, "step": 336 }, { "epoch": 0.66, "grad_norm": 1.4952369340273903, "learning_rate": 6.978674395008247e-06, "loss": 0.3987, "step": 337 }, { "epoch": 0.66, "grad_norm": 1.495185009426871, "learning_rate": 6.978145138558598e-06, "loss": 0.3786, "step": 338 }, { "epoch": 0.66, "grad_norm": 1.5448667258338398, "learning_rate": 6.97760941541319e-06, "loss": 0.3317, "step": 339 }, { "epoch": 0.66, "grad_norm": 1.4952305514099107, "learning_rate": 6.977067226568055e-06, "loss": 0.4255, "step": 340 }, { "epoch": 0.67, "grad_norm": 1.5486254226518885, "learning_rate": 6.9765185730312525e-06, "loss": 0.4025, "step": 341 }, { "epoch": 0.67, "grad_norm": 1.8167024249898573, "learning_rate": 6.975963455822859e-06, "loss": 0.423, "step": 342 }, { "epoch": 0.67, "grad_norm": 1.555175862196501, "learning_rate": 6.975401875974969e-06, "loss": 0.3905, "step": 343 }, { "epoch": 0.67, "grad_norm": 1.347923975168308, "learning_rate": 6.974833834531692e-06, "loss": 0.4481, "step": 344 }, { "epoch": 0.67, "grad_norm": 1.2999557074030674, "learning_rate": 6.974259332549153e-06, "loss": 0.3379, "step": 345 }, { "epoch": 0.68, "grad_norm": 1.6650006298234694, "learning_rate": 6.973678371095485e-06, "loss": 0.3851, "step": 346 }, { "epoch": 0.68, "grad_norm": 1.8533034176240821, "learning_rate": 6.9730909512508345e-06, "loss": 0.391, "step": 347 }, { "epoch": 0.68, "grad_norm": 1.3674753536803084, "learning_rate": 6.972497074107354e-06, "loss": 0.3752, "step": 348 }, { "epoch": 0.68, "grad_norm": 1.5618987999905256, "learning_rate": 6.971896740769201e-06, "loss": 0.4119, "step": 349 }, { "epoch": 0.68, "grad_norm": 1.7013437002394913, "learning_rate": 6.971289952352539e-06, "loss": 0.388, "step": 350 }, { "epoch": 0.69, "grad_norm": 1.8557878417679454, "learning_rate": 6.970676709985529e-06, "loss": 0.424, "step": 351 }, { "epoch": 0.69, "grad_norm": 1.9131051310198892, "learning_rate": 6.970057014808337e-06, "loss": 0.4174, "step": 352 }, { "epoch": 0.69, "grad_norm": 1.621131626686106, "learning_rate": 6.96943086797312e-06, "loss": 0.3952, "step": 353 }, { "epoch": 0.69, "grad_norm": 1.536300171825336, "learning_rate": 6.9687982706440355e-06, "loss": 0.3657, "step": 354 }, { "epoch": 0.69, "grad_norm": 1.3912231731510822, "learning_rate": 6.968159223997229e-06, "loss": 0.3628, "step": 355 }, { "epoch": 0.7, "grad_norm": 1.5973385550555566, "learning_rate": 6.967513729220844e-06, "loss": 0.3904, "step": 356 }, { "epoch": 0.7, "grad_norm": 1.7818574674019028, "learning_rate": 6.966861787515006e-06, "loss": 0.4773, "step": 357 }, { "epoch": 0.7, "grad_norm": 1.4057723165529261, "learning_rate": 6.966203400091827e-06, "loss": 0.3459, "step": 358 }, { "epoch": 0.7, "grad_norm": 1.7997894430937842, "learning_rate": 6.965538568175408e-06, "loss": 0.4318, "step": 359 }, { "epoch": 0.7, "grad_norm": 1.5735186675853814, "learning_rate": 6.964867293001827e-06, "loss": 0.4504, "step": 360 }, { "epoch": 0.71, "grad_norm": 1.4276776798640247, "learning_rate": 6.964189575819146e-06, "loss": 0.3667, "step": 361 }, { "epoch": 0.71, "grad_norm": 1.7036688888959843, "learning_rate": 6.9635054178874e-06, "loss": 0.3843, "step": 362 }, { "epoch": 0.71, "grad_norm": 1.5267523734548005, "learning_rate": 6.9628148204786e-06, "loss": 0.4431, "step": 363 }, { "epoch": 0.71, "grad_norm": 1.5284082335109002, "learning_rate": 6.962117784876734e-06, "loss": 0.4247, "step": 364 }, { "epoch": 0.71, "grad_norm": 2.0451974735374634, "learning_rate": 6.961414312377751e-06, "loss": 0.4819, "step": 365 }, { "epoch": 0.71, "grad_norm": 1.6014275396545696, "learning_rate": 6.960704404289577e-06, "loss": 0.4165, "step": 366 }, { "epoch": 0.72, "grad_norm": 1.5800424099815258, "learning_rate": 6.959988061932097e-06, "loss": 0.4424, "step": 367 }, { "epoch": 0.72, "grad_norm": 1.5755070817916135, "learning_rate": 6.959265286637163e-06, "loss": 0.3575, "step": 368 }, { "epoch": 0.72, "grad_norm": 1.5444884762283142, "learning_rate": 6.958536079748583e-06, "loss": 0.4782, "step": 369 }, { "epoch": 0.72, "grad_norm": 1.8253818697384032, "learning_rate": 6.957800442622129e-06, "loss": 0.4238, "step": 370 }, { "epoch": 0.72, "grad_norm": 1.8986091445030466, "learning_rate": 6.95705837662552e-06, "loss": 0.3835, "step": 371 }, { "epoch": 0.73, "grad_norm": 1.8129182442131329, "learning_rate": 6.956309883138437e-06, "loss": 0.4478, "step": 372 }, { "epoch": 0.73, "grad_norm": 1.7150979717726202, "learning_rate": 6.9555549635525045e-06, "loss": 0.3542, "step": 373 }, { "epoch": 0.73, "grad_norm": 1.577293473759658, "learning_rate": 6.954793619271297e-06, "loss": 0.3804, "step": 374 }, { "epoch": 0.73, "grad_norm": 1.5331793118621817, "learning_rate": 6.954025851710333e-06, "loss": 0.406, "step": 375 }, { "epoch": 0.73, "grad_norm": 1.579431153391335, "learning_rate": 6.953251662297077e-06, "loss": 0.4258, "step": 376 }, { "epoch": 0.74, "grad_norm": 1.6006289243430118, "learning_rate": 6.952471052470927e-06, "loss": 0.4137, "step": 377 }, { "epoch": 0.74, "grad_norm": 1.5308588255756512, "learning_rate": 6.9516840236832244e-06, "loss": 0.3851, "step": 378 }, { "epoch": 0.74, "grad_norm": 1.495507319928818, "learning_rate": 6.9508905773972405e-06, "loss": 0.3873, "step": 379 }, { "epoch": 0.74, "grad_norm": 1.8440243280991153, "learning_rate": 6.950090715088181e-06, "loss": 0.4345, "step": 380 }, { "epoch": 0.74, "grad_norm": 1.6444520296144443, "learning_rate": 6.949284438243179e-06, "loss": 0.3924, "step": 381 }, { "epoch": 0.75, "grad_norm": 1.4857576495717528, "learning_rate": 6.9484717483612935e-06, "loss": 0.4074, "step": 382 }, { "epoch": 0.75, "grad_norm": 1.4503594508343018, "learning_rate": 6.947652646953509e-06, "loss": 0.42, "step": 383 }, { "epoch": 0.75, "grad_norm": 1.4298052483196666, "learning_rate": 6.946827135542729e-06, "loss": 0.364, "step": 384 }, { "epoch": 0.75, "grad_norm": 1.5866871521481831, "learning_rate": 6.945995215663772e-06, "loss": 0.4048, "step": 385 }, { "epoch": 0.75, "grad_norm": 1.5710090810094302, "learning_rate": 6.945156888863377e-06, "loss": 0.3884, "step": 386 }, { "epoch": 0.76, "grad_norm": 1.4396597250427239, "learning_rate": 6.944312156700191e-06, "loss": 0.3521, "step": 387 }, { "epoch": 0.76, "grad_norm": 1.5361136372869866, "learning_rate": 6.94346102074477e-06, "loss": 0.3954, "step": 388 }, { "epoch": 0.76, "grad_norm": 1.7090128086511363, "learning_rate": 6.942603482579581e-06, "loss": 0.4338, "step": 389 }, { "epoch": 0.76, "grad_norm": 1.6084516269235527, "learning_rate": 6.9417395437989875e-06, "loss": 0.3773, "step": 390 }, { "epoch": 0.76, "grad_norm": 1.6771253197298748, "learning_rate": 6.9408692060092574e-06, "loss": 0.4486, "step": 391 }, { "epoch": 0.77, "grad_norm": 1.4126032752224191, "learning_rate": 6.939992470828554e-06, "loss": 0.3635, "step": 392 }, { "epoch": 0.77, "grad_norm": 1.3596069771869914, "learning_rate": 6.939109339886937e-06, "loss": 0.3789, "step": 393 }, { "epoch": 0.77, "grad_norm": 1.4471196566213127, "learning_rate": 6.938219814826355e-06, "loss": 0.4169, "step": 394 }, { "epoch": 0.77, "grad_norm": 1.4926341217930108, "learning_rate": 6.937323897300646e-06, "loss": 0.3481, "step": 395 }, { "epoch": 0.77, "grad_norm": 1.4897960134026111, "learning_rate": 6.936421588975533e-06, "loss": 0.3976, "step": 396 }, { "epoch": 0.78, "grad_norm": 1.6258331094747263, "learning_rate": 6.935512891528622e-06, "loss": 0.3689, "step": 397 }, { "epoch": 0.78, "grad_norm": 1.4250989224856196, "learning_rate": 6.934597806649395e-06, "loss": 0.3748, "step": 398 }, { "epoch": 0.78, "grad_norm": 1.6479589450364982, "learning_rate": 6.9336763360392125e-06, "loss": 0.4032, "step": 399 }, { "epoch": 0.78, "grad_norm": 1.3251673793487038, "learning_rate": 6.932748481411306e-06, "loss": 0.3561, "step": 400 }, { "epoch": 0.78, "grad_norm": 1.3812009554750335, "learning_rate": 6.931814244490778e-06, "loss": 0.3884, "step": 401 }, { "epoch": 0.79, "grad_norm": 1.5348814363170098, "learning_rate": 6.930873627014596e-06, "loss": 0.4085, "step": 402 }, { "epoch": 0.79, "grad_norm": 1.4353231281935956, "learning_rate": 6.929926630731591e-06, "loss": 0.4369, "step": 403 }, { "epoch": 0.79, "grad_norm": 1.7496164528200289, "learning_rate": 6.928973257402453e-06, "loss": 0.491, "step": 404 }, { "epoch": 0.79, "grad_norm": 1.6993755208006613, "learning_rate": 6.928013508799728e-06, "loss": 0.4376, "step": 405 }, { "epoch": 0.79, "grad_norm": 1.5944056839232883, "learning_rate": 6.9270473867078185e-06, "loss": 0.3591, "step": 406 }, { "epoch": 0.79, "grad_norm": 1.5144531648388075, "learning_rate": 6.926074892922971e-06, "loss": 0.3633, "step": 407 }, { "epoch": 0.8, "grad_norm": 1.5260776904877442, "learning_rate": 6.925096029253284e-06, "loss": 0.4239, "step": 408 }, { "epoch": 0.8, "grad_norm": 1.5216267665762389, "learning_rate": 6.924110797518696e-06, "loss": 0.4004, "step": 409 }, { "epoch": 0.8, "grad_norm": 1.5668302313100648, "learning_rate": 6.923119199550988e-06, "loss": 0.4379, "step": 410 }, { "epoch": 0.8, "grad_norm": 1.6449656107480701, "learning_rate": 6.922121237193773e-06, "loss": 0.4183, "step": 411 }, { "epoch": 0.8, "grad_norm": 1.3815902712945125, "learning_rate": 6.921116912302502e-06, "loss": 0.3924, "step": 412 }, { "epoch": 0.81, "grad_norm": 1.4974122452777632, "learning_rate": 6.92010622674445e-06, "loss": 0.3055, "step": 413 }, { "epoch": 0.81, "grad_norm": 1.8912943839634002, "learning_rate": 6.919089182398723e-06, "loss": 0.4029, "step": 414 }, { "epoch": 0.81, "grad_norm": 1.3454699368820688, "learning_rate": 6.918065781156246e-06, "loss": 0.3671, "step": 415 }, { "epoch": 0.81, "grad_norm": 1.7415289760509933, "learning_rate": 6.917036024919767e-06, "loss": 0.4321, "step": 416 }, { "epoch": 0.81, "grad_norm": 1.6968095275847261, "learning_rate": 6.915999915603844e-06, "loss": 0.4018, "step": 417 }, { "epoch": 0.82, "grad_norm": 1.5723696021060625, "learning_rate": 6.9149574551348496e-06, "loss": 0.4506, "step": 418 }, { "epoch": 0.82, "grad_norm": 1.6885312627811564, "learning_rate": 6.913908645450967e-06, "loss": 0.3795, "step": 419 }, { "epoch": 0.82, "grad_norm": 1.4140085080253653, "learning_rate": 6.912853488502181e-06, "loss": 0.3684, "step": 420 }, { "epoch": 0.82, "grad_norm": 1.490931251764968, "learning_rate": 6.911791986250275e-06, "loss": 0.4088, "step": 421 }, { "epoch": 0.82, "grad_norm": 1.7513308227972242, "learning_rate": 6.910724140668839e-06, "loss": 0.4319, "step": 422 }, { "epoch": 0.83, "grad_norm": 1.3176220420713172, "learning_rate": 6.909649953743247e-06, "loss": 0.3531, "step": 423 }, { "epoch": 0.83, "grad_norm": 1.5859693229558487, "learning_rate": 6.908569427470668e-06, "loss": 0.3382, "step": 424 }, { "epoch": 0.83, "grad_norm": 1.5208488703680316, "learning_rate": 6.907482563860056e-06, "loss": 0.3869, "step": 425 }, { "epoch": 0.83, "grad_norm": 1.893663781671655, "learning_rate": 6.906389364932148e-06, "loss": 0.399, "step": 426 }, { "epoch": 0.83, "grad_norm": 1.6709327638732312, "learning_rate": 6.905289832719461e-06, "loss": 0.4132, "step": 427 }, { "epoch": 0.84, "grad_norm": 1.635619597786822, "learning_rate": 6.904183969266283e-06, "loss": 0.3656, "step": 428 }, { "epoch": 0.84, "grad_norm": 1.59968543690196, "learning_rate": 6.90307177662868e-06, "loss": 0.3488, "step": 429 }, { "epoch": 0.84, "grad_norm": 1.6164015262215174, "learning_rate": 6.901953256874478e-06, "loss": 0.3735, "step": 430 }, { "epoch": 0.84, "grad_norm": 1.7592512016627402, "learning_rate": 6.900828412083273e-06, "loss": 0.3977, "step": 431 }, { "epoch": 0.84, "grad_norm": 1.3937185555373854, "learning_rate": 6.899697244346414e-06, "loss": 0.3596, "step": 432 }, { "epoch": 0.85, "grad_norm": 1.5042754026675247, "learning_rate": 6.8985597557670156e-06, "loss": 0.4224, "step": 433 }, { "epoch": 0.85, "grad_norm": 1.8078217265869445, "learning_rate": 6.897415948459933e-06, "loss": 0.4801, "step": 434 }, { "epoch": 0.85, "grad_norm": 1.3630120177884193, "learning_rate": 6.8962658245517785e-06, "loss": 0.3224, "step": 435 }, { "epoch": 0.85, "grad_norm": 1.4549549758801936, "learning_rate": 6.8951093861809044e-06, "loss": 0.3759, "step": 436 }, { "epoch": 0.85, "grad_norm": 1.7921185961407644, "learning_rate": 6.8939466354974015e-06, "loss": 0.3485, "step": 437 }, { "epoch": 0.86, "grad_norm": 1.6087810609577322, "learning_rate": 6.8927775746631e-06, "loss": 0.3981, "step": 438 }, { "epoch": 0.86, "grad_norm": 1.479973120005004, "learning_rate": 6.8916022058515625e-06, "loss": 0.4143, "step": 439 }, { "epoch": 0.86, "grad_norm": 1.7223887577071961, "learning_rate": 6.890420531248076e-06, "loss": 0.3563, "step": 440 }, { "epoch": 0.86, "grad_norm": 1.5115766132745947, "learning_rate": 6.889232553049655e-06, "loss": 0.4306, "step": 441 }, { "epoch": 0.86, "grad_norm": 1.403857849865393, "learning_rate": 6.888038273465029e-06, "loss": 0.3562, "step": 442 }, { "epoch": 0.87, "grad_norm": 1.463502289399873, "learning_rate": 6.8868376947146514e-06, "loss": 0.3956, "step": 443 }, { "epoch": 0.87, "grad_norm": 1.385785628296623, "learning_rate": 6.885630819030679e-06, "loss": 0.3807, "step": 444 }, { "epoch": 0.87, "grad_norm": 1.6045008958409186, "learning_rate": 6.884417648656982e-06, "loss": 0.3941, "step": 445 }, { "epoch": 0.87, "grad_norm": 1.572341916815721, "learning_rate": 6.883198185849131e-06, "loss": 0.419, "step": 446 }, { "epoch": 0.87, "grad_norm": 1.2650578013363873, "learning_rate": 6.881972432874394e-06, "loss": 0.3766, "step": 447 }, { "epoch": 0.88, "grad_norm": 1.6692034869726693, "learning_rate": 6.880740392011738e-06, "loss": 0.4338, "step": 448 }, { "epoch": 0.88, "grad_norm": 1.6376548503313102, "learning_rate": 6.87950206555182e-06, "loss": 0.3237, "step": 449 }, { "epoch": 0.88, "grad_norm": 1.4747601371004269, "learning_rate": 6.87825745579698e-06, "loss": 0.3497, "step": 450 }, { "epoch": 0.88, "grad_norm": 1.3861070236519208, "learning_rate": 6.877006565061244e-06, "loss": 0.4046, "step": 451 }, { "epoch": 0.88, "grad_norm": 1.3869923108700475, "learning_rate": 6.875749395670313e-06, "loss": 0.4386, "step": 452 }, { "epoch": 0.88, "grad_norm": 1.642982308523025, "learning_rate": 6.874485949961563e-06, "loss": 0.389, "step": 453 }, { "epoch": 0.89, "grad_norm": 1.65391591389462, "learning_rate": 6.87321623028404e-06, "loss": 0.3276, "step": 454 }, { "epoch": 0.89, "grad_norm": 1.4223408652011937, "learning_rate": 6.871940238998452e-06, "loss": 0.4438, "step": 455 }, { "epoch": 0.89, "grad_norm": 1.6054470912910286, "learning_rate": 6.870657978477169e-06, "loss": 0.3799, "step": 456 }, { "epoch": 0.89, "grad_norm": 1.7572627926671025, "learning_rate": 6.86936945110422e-06, "loss": 0.3712, "step": 457 }, { "epoch": 0.89, "grad_norm": 1.9264809754033452, "learning_rate": 6.868074659275278e-06, "loss": 0.4487, "step": 458 }, { "epoch": 0.9, "grad_norm": 1.6748505512783471, "learning_rate": 6.866773605397672e-06, "loss": 0.4335, "step": 459 }, { "epoch": 0.9, "grad_norm": 1.6181852750757655, "learning_rate": 6.865466291890367e-06, "loss": 0.3606, "step": 460 }, { "epoch": 0.9, "grad_norm": 1.618065831483016, "learning_rate": 6.864152721183969e-06, "loss": 0.3935, "step": 461 }, { "epoch": 0.9, "grad_norm": 1.8202089939525627, "learning_rate": 6.862832895720718e-06, "loss": 0.4638, "step": 462 }, { "epoch": 0.9, "grad_norm": 1.32028547735582, "learning_rate": 6.86150681795448e-06, "loss": 0.3314, "step": 463 }, { "epoch": 0.91, "grad_norm": 1.4513641972531524, "learning_rate": 6.860174490350751e-06, "loss": 0.4143, "step": 464 }, { "epoch": 0.91, "grad_norm": 1.4782234419391354, "learning_rate": 6.8588359153866394e-06, "loss": 0.3722, "step": 465 }, { "epoch": 0.91, "grad_norm": 1.620070170411505, "learning_rate": 6.857491095550878e-06, "loss": 0.4398, "step": 466 }, { "epoch": 0.91, "grad_norm": 1.4129254221283196, "learning_rate": 6.8561400333438015e-06, "loss": 0.3467, "step": 467 }, { "epoch": 0.91, "grad_norm": 1.495347443689641, "learning_rate": 6.854782731277357e-06, "loss": 0.4375, "step": 468 }, { "epoch": 0.92, "grad_norm": 1.531369890873368, "learning_rate": 6.8534191918750885e-06, "loss": 0.387, "step": 469 }, { "epoch": 0.92, "grad_norm": 1.442879283526123, "learning_rate": 6.852049417672141e-06, "loss": 0.3471, "step": 470 }, { "epoch": 0.92, "grad_norm": 1.5859441994757626, "learning_rate": 6.850673411215248e-06, "loss": 0.4344, "step": 471 }, { "epoch": 0.92, "grad_norm": 1.475777877974845, "learning_rate": 6.849291175062731e-06, "loss": 0.3872, "step": 472 }, { "epoch": 0.92, "grad_norm": 1.4977482480863609, "learning_rate": 6.847902711784495e-06, "loss": 0.4058, "step": 473 }, { "epoch": 0.93, "grad_norm": 1.5019063430398456, "learning_rate": 6.846508023962023e-06, "loss": 0.4219, "step": 474 }, { "epoch": 0.93, "grad_norm": 1.7723275018435238, "learning_rate": 6.845107114188369e-06, "loss": 0.3958, "step": 475 }, { "epoch": 0.93, "grad_norm": 1.5464237241338619, "learning_rate": 6.843699985068156e-06, "loss": 0.3659, "step": 476 }, { "epoch": 0.93, "grad_norm": 1.6131934584194865, "learning_rate": 6.842286639217572e-06, "loss": 0.3423, "step": 477 }, { "epoch": 0.93, "grad_norm": 1.3636032110936416, "learning_rate": 6.8408670792643595e-06, "loss": 0.3611, "step": 478 }, { "epoch": 0.94, "grad_norm": 1.617320441900244, "learning_rate": 6.839441307847818e-06, "loss": 0.4128, "step": 479 }, { "epoch": 0.94, "grad_norm": 1.3748542011457883, "learning_rate": 6.838009327618794e-06, "loss": 0.3699, "step": 480 }, { "epoch": 0.94, "grad_norm": 1.6617064103066639, "learning_rate": 6.836571141239678e-06, "loss": 0.4103, "step": 481 }, { "epoch": 0.94, "grad_norm": 1.472954662040216, "learning_rate": 6.8351267513844e-06, "loss": 0.3538, "step": 482 }, { "epoch": 0.94, "grad_norm": 1.7023811309386494, "learning_rate": 6.8336761607384215e-06, "loss": 0.3775, "step": 483 }, { "epoch": 0.95, "grad_norm": 1.5741612066742374, "learning_rate": 6.8322193719987345e-06, "loss": 0.3476, "step": 484 }, { "epoch": 0.95, "grad_norm": 1.445909268205409, "learning_rate": 6.830756387873856e-06, "loss": 0.4069, "step": 485 }, { "epoch": 0.95, "grad_norm": 1.6972866737683538, "learning_rate": 6.829287211083817e-06, "loss": 0.3983, "step": 486 }, { "epoch": 0.95, "grad_norm": 1.3542097068912118, "learning_rate": 6.827811844360168e-06, "loss": 0.3449, "step": 487 }, { "epoch": 0.95, "grad_norm": 1.6117358848461774, "learning_rate": 6.8263302904459634e-06, "loss": 0.3627, "step": 488 }, { "epoch": 0.96, "grad_norm": 1.4425920281484288, "learning_rate": 6.824842552095764e-06, "loss": 0.364, "step": 489 }, { "epoch": 0.96, "grad_norm": 1.584196631638136, "learning_rate": 6.823348632075628e-06, "loss": 0.4284, "step": 490 }, { "epoch": 0.96, "grad_norm": 1.4162938063229875, "learning_rate": 6.821848533163106e-06, "loss": 0.4038, "step": 491 }, { "epoch": 0.96, "grad_norm": 1.8137135094624217, "learning_rate": 6.820342258147237e-06, "loss": 0.3758, "step": 492 }, { "epoch": 0.96, "grad_norm": 1.7468095912595143, "learning_rate": 6.818829809828544e-06, "loss": 0.4518, "step": 493 }, { "epoch": 0.96, "grad_norm": 1.43851773359781, "learning_rate": 6.817311191019026e-06, "loss": 0.312, "step": 494 }, { "epoch": 0.97, "grad_norm": 1.5667516026379813, "learning_rate": 6.815786404542154e-06, "loss": 0.4229, "step": 495 }, { "epoch": 0.97, "grad_norm": 1.5272657314735092, "learning_rate": 6.81425545323287e-06, "loss": 0.3628, "step": 496 }, { "epoch": 0.97, "grad_norm": 1.4816546526145773, "learning_rate": 6.812718339937573e-06, "loss": 0.4204, "step": 497 }, { "epoch": 0.97, "grad_norm": 1.5327146257016886, "learning_rate": 6.8111750675141215e-06, "loss": 0.3442, "step": 498 }, { "epoch": 0.97, "grad_norm": 1.5088855977260753, "learning_rate": 6.8096256388318245e-06, "loss": 0.4228, "step": 499 }, { "epoch": 0.98, "grad_norm": 1.5419351636050564, "learning_rate": 6.808070056771437e-06, "loss": 0.4319, "step": 500 }, { "epoch": 0.98, "grad_norm": 1.4733777278641178, "learning_rate": 6.806508324225154e-06, "loss": 0.3655, "step": 501 }, { "epoch": 0.98, "grad_norm": 1.5179153836715853, "learning_rate": 6.804940444096608e-06, "loss": 0.3793, "step": 502 }, { "epoch": 0.98, "grad_norm": 1.81517537744245, "learning_rate": 6.803366419300858e-06, "loss": 0.4075, "step": 503 }, { "epoch": 0.98, "grad_norm": 1.9178939028292705, "learning_rate": 6.801786252764388e-06, "loss": 0.3915, "step": 504 }, { "epoch": 0.99, "grad_norm": 1.4185993570966313, "learning_rate": 6.8001999474251034e-06, "loss": 0.4074, "step": 505 }, { "epoch": 0.99, "grad_norm": 1.6972343771550653, "learning_rate": 6.798607506232324e-06, "loss": 0.4033, "step": 506 }, { "epoch": 0.99, "grad_norm": 1.5195251193435875, "learning_rate": 6.797008932146771e-06, "loss": 0.3761, "step": 507 }, { "epoch": 0.99, "grad_norm": 1.2708352977819897, "learning_rate": 6.795404228140573e-06, "loss": 0.307, "step": 508 }, { "epoch": 0.99, "grad_norm": 1.6789940530990302, "learning_rate": 6.793793397197257e-06, "loss": 0.422, "step": 509 }, { "epoch": 1.0, "grad_norm": 1.5359175926517448, "learning_rate": 6.792176442311738e-06, "loss": 0.3266, "step": 510 }, { "epoch": 1.0, "grad_norm": 1.5440096874504388, "learning_rate": 6.790553366490317e-06, "loss": 0.3874, "step": 511 }, { "epoch": 1.0, "grad_norm": 1.44872597080273, "learning_rate": 6.788924172750679e-06, "loss": 0.3495, "step": 512 }, { "epoch": 1.0, "grad_norm": 1.4490133043507374, "learning_rate": 6.78728886412188e-06, "loss": 0.3966, "step": 513 }, { "epoch": 1.0, "grad_norm": 1.6827141100010667, "learning_rate": 6.785647443644346e-06, "loss": 0.4348, "step": 514 }, { "epoch": 1.01, "grad_norm": 1.7513499771778536, "learning_rate": 6.783999914369867e-06, "loss": 0.4113, "step": 515 }, { "epoch": 1.01, "grad_norm": 1.5845765961911642, "learning_rate": 6.782346279361589e-06, "loss": 0.3975, "step": 516 }, { "epoch": 1.01, "grad_norm": 1.3789076658043473, "learning_rate": 6.7806865416940126e-06, "loss": 0.3891, "step": 517 }, { "epoch": 1.01, "grad_norm": 1.4909575720814823, "learning_rate": 6.779020704452983e-06, "loss": 0.347, "step": 518 }, { "epoch": 1.01, "grad_norm": 1.6771272998248135, "learning_rate": 6.7773487707356845e-06, "loss": 0.4205, "step": 519 }, { "epoch": 1.02, "grad_norm": 2.066534772682538, "learning_rate": 6.77567074365064e-06, "loss": 0.4195, "step": 520 }, { "epoch": 1.02, "grad_norm": 1.2111909496578512, "learning_rate": 6.773986626317699e-06, "loss": 0.334, "step": 521 }, { "epoch": 1.02, "grad_norm": 1.4527956304181684, "learning_rate": 6.772296421868033e-06, "loss": 0.3781, "step": 522 }, { "epoch": 1.02, "grad_norm": 1.467405820611919, "learning_rate": 6.770600133444136e-06, "loss": 0.3494, "step": 523 }, { "epoch": 1.02, "grad_norm": 1.2943673987815012, "learning_rate": 6.768897764199808e-06, "loss": 0.3388, "step": 524 }, { "epoch": 1.03, "grad_norm": 1.31676697431199, "learning_rate": 6.767189317300154e-06, "loss": 0.36, "step": 525 }, { "epoch": 1.03, "grad_norm": 1.2552008571917344, "learning_rate": 6.765474795921586e-06, "loss": 0.3635, "step": 526 }, { "epoch": 1.03, "grad_norm": 1.4630846059300933, "learning_rate": 6.763754203251803e-06, "loss": 0.3419, "step": 527 }, { "epoch": 1.03, "grad_norm": 1.6163476166258437, "learning_rate": 6.762027542489795e-06, "loss": 0.3729, "step": 528 }, { "epoch": 1.03, "grad_norm": 1.6327356245169886, "learning_rate": 6.760294816845832e-06, "loss": 0.3877, "step": 529 }, { "epoch": 1.04, "grad_norm": 1.5737534592544493, "learning_rate": 6.7585560295414646e-06, "loss": 0.3236, "step": 530 }, { "epoch": 1.04, "grad_norm": 1.536383163505699, "learning_rate": 6.756811183809507e-06, "loss": 0.371, "step": 531 }, { "epoch": 1.04, "grad_norm": 1.5641646953195714, "learning_rate": 6.755060282894042e-06, "loss": 0.3784, "step": 532 }, { "epoch": 1.04, "grad_norm": 1.7131384369669833, "learning_rate": 6.75330333005041e-06, "loss": 0.3927, "step": 533 }, { "epoch": 1.04, "grad_norm": 1.6692945580993346, "learning_rate": 6.7515403285452015e-06, "loss": 0.438, "step": 534 }, { "epoch": 1.04, "grad_norm": 1.8618580664526874, "learning_rate": 6.7497712816562545e-06, "loss": 0.3988, "step": 535 }, { "epoch": 1.05, "grad_norm": 1.862062446176349, "learning_rate": 6.747996192672646e-06, "loss": 0.4249, "step": 536 }, { "epoch": 1.05, "grad_norm": 2.026100823620014, "learning_rate": 6.7462150648946865e-06, "loss": 0.4016, "step": 537 }, { "epoch": 1.05, "grad_norm": 1.534212516787495, "learning_rate": 6.744427901633915e-06, "loss": 0.3626, "step": 538 }, { "epoch": 1.05, "grad_norm": 1.5042541647257084, "learning_rate": 6.742634706213091e-06, "loss": 0.3447, "step": 539 }, { "epoch": 1.05, "grad_norm": 1.5788526266425775, "learning_rate": 6.740835481966191e-06, "loss": 0.3869, "step": 540 }, { "epoch": 1.06, "grad_norm": 1.4843269889572055, "learning_rate": 6.739030232238398e-06, "loss": 0.3875, "step": 541 }, { "epoch": 1.06, "grad_norm": 1.695766767780369, "learning_rate": 6.737218960386098e-06, "loss": 0.3588, "step": 542 }, { "epoch": 1.06, "grad_norm": 1.282365559040157, "learning_rate": 6.735401669776875e-06, "loss": 0.3277, "step": 543 }, { "epoch": 1.06, "grad_norm": 1.5091766919282033, "learning_rate": 6.733578363789503e-06, "loss": 0.4109, "step": 544 }, { "epoch": 1.06, "grad_norm": 1.5213211015505608, "learning_rate": 6.73174904581394e-06, "loss": 0.3533, "step": 545 }, { "epoch": 1.07, "grad_norm": 1.5639859615028195, "learning_rate": 6.729913719251323e-06, "loss": 0.3827, "step": 546 }, { "epoch": 1.07, "grad_norm": 1.6113922674738699, "learning_rate": 6.728072387513955e-06, "loss": 0.3748, "step": 547 }, { "epoch": 1.07, "grad_norm": 1.7358681667299882, "learning_rate": 6.726225054025311e-06, "loss": 0.3559, "step": 548 }, { "epoch": 1.07, "grad_norm": 1.646037096526761, "learning_rate": 6.724371722220021e-06, "loss": 0.3358, "step": 549 }, { "epoch": 1.07, "grad_norm": 1.8272261372176661, "learning_rate": 6.722512395543867e-06, "loss": 0.3626, "step": 550 }, { "epoch": 1.08, "grad_norm": 1.4354491237314584, "learning_rate": 6.720647077453778e-06, "loss": 0.3957, "step": 551 }, { "epoch": 1.08, "grad_norm": 1.7609469535394036, "learning_rate": 6.718775771417823e-06, "loss": 0.3735, "step": 552 }, { "epoch": 1.08, "grad_norm": 1.6192114853427093, "learning_rate": 6.716898480915203e-06, "loss": 0.3082, "step": 553 }, { "epoch": 1.08, "grad_norm": 1.5376449624332489, "learning_rate": 6.715015209436244e-06, "loss": 0.3074, "step": 554 }, { "epoch": 1.08, "grad_norm": 1.5931139398561256, "learning_rate": 6.713125960482396e-06, "loss": 0.3437, "step": 555 }, { "epoch": 1.09, "grad_norm": 1.9275049525599572, "learning_rate": 6.711230737566219e-06, "loss": 0.3717, "step": 556 }, { "epoch": 1.09, "grad_norm": 1.6020231822362265, "learning_rate": 6.709329544211383e-06, "loss": 0.3516, "step": 557 }, { "epoch": 1.09, "grad_norm": 1.5265759587673142, "learning_rate": 6.707422383952656e-06, "loss": 0.3583, "step": 558 }, { "epoch": 1.09, "grad_norm": 1.5793539244916939, "learning_rate": 6.7055092603359e-06, "loss": 0.3426, "step": 559 }, { "epoch": 1.09, "grad_norm": 1.4232475879895807, "learning_rate": 6.7035901769180656e-06, "loss": 0.3583, "step": 560 }, { "epoch": 1.1, "grad_norm": 1.5257148594679155, "learning_rate": 6.701665137267182e-06, "loss": 0.3866, "step": 561 }, { "epoch": 1.1, "grad_norm": 1.5317484913936195, "learning_rate": 6.699734144962357e-06, "loss": 0.3384, "step": 562 }, { "epoch": 1.1, "grad_norm": 1.4319122591221911, "learning_rate": 6.6977972035937605e-06, "loss": 0.338, "step": 563 }, { "epoch": 1.1, "grad_norm": 1.4349849435893973, "learning_rate": 6.6958543167626265e-06, "loss": 0.31, "step": 564 }, { "epoch": 1.1, "grad_norm": 1.5792972907790404, "learning_rate": 6.6939054880812415e-06, "loss": 0.3548, "step": 565 }, { "epoch": 1.11, "grad_norm": 1.5057907383972735, "learning_rate": 6.6919507211729395e-06, "loss": 0.3358, "step": 566 }, { "epoch": 1.11, "grad_norm": 1.339292758951916, "learning_rate": 6.689990019672093e-06, "loss": 0.3458, "step": 567 }, { "epoch": 1.11, "grad_norm": 1.6051639075748252, "learning_rate": 6.688023387224115e-06, "loss": 0.3447, "step": 568 }, { "epoch": 1.11, "grad_norm": 1.4724797670906675, "learning_rate": 6.686050827485439e-06, "loss": 0.2993, "step": 569 }, { "epoch": 1.11, "grad_norm": 1.4441870717060707, "learning_rate": 6.68407234412352e-06, "loss": 0.3764, "step": 570 }, { "epoch": 1.12, "grad_norm": 1.3370465507885616, "learning_rate": 6.682087940816828e-06, "loss": 0.2604, "step": 571 }, { "epoch": 1.12, "grad_norm": 1.3447008372858118, "learning_rate": 6.6800976212548396e-06, "loss": 0.3463, "step": 572 }, { "epoch": 1.12, "grad_norm": 1.5598853026581967, "learning_rate": 6.678101389138029e-06, "loss": 0.3612, "step": 573 }, { "epoch": 1.12, "grad_norm": 1.5569144717189214, "learning_rate": 6.676099248177865e-06, "loss": 0.3453, "step": 574 }, { "epoch": 1.12, "grad_norm": 1.3799710109866647, "learning_rate": 6.6740912020968026e-06, "loss": 0.2997, "step": 575 }, { "epoch": 1.12, "grad_norm": 1.7256467193805083, "learning_rate": 6.672077254628275e-06, "loss": 0.2877, "step": 576 }, { "epoch": 1.13, "grad_norm": 1.681192985916992, "learning_rate": 6.6700574095166866e-06, "loss": 0.3161, "step": 577 }, { "epoch": 1.13, "grad_norm": 1.7023068267461663, "learning_rate": 6.6680316705174095e-06, "loss": 0.3152, "step": 578 }, { "epoch": 1.13, "grad_norm": 1.5366006367172114, "learning_rate": 6.666000041396771e-06, "loss": 0.3292, "step": 579 }, { "epoch": 1.13, "grad_norm": 1.6342115420038172, "learning_rate": 6.663962525932052e-06, "loss": 0.3043, "step": 580 }, { "epoch": 1.13, "grad_norm": 1.5005395331317195, "learning_rate": 6.6619191279114745e-06, "loss": 0.3661, "step": 581 }, { "epoch": 1.14, "grad_norm": 1.5484600249874991, "learning_rate": 6.659869851134203e-06, "loss": 0.3224, "step": 582 }, { "epoch": 1.14, "grad_norm": 1.5937086197734212, "learning_rate": 6.657814699410325e-06, "loss": 0.3419, "step": 583 }, { "epoch": 1.14, "grad_norm": 1.7099164426195532, "learning_rate": 6.655753676560856e-06, "loss": 0.3323, "step": 584 }, { "epoch": 1.14, "grad_norm": 1.7161538656835105, "learning_rate": 6.653686786417726e-06, "loss": 0.3655, "step": 585 }, { "epoch": 1.14, "grad_norm": 1.3813396185768745, "learning_rate": 6.651614032823773e-06, "loss": 0.358, "step": 586 }, { "epoch": 1.15, "grad_norm": 1.4013537347993237, "learning_rate": 6.649535419632736e-06, "loss": 0.2944, "step": 587 }, { "epoch": 1.15, "grad_norm": 1.4340564000450675, "learning_rate": 6.647450950709251e-06, "loss": 0.2832, "step": 588 }, { "epoch": 1.15, "grad_norm": 1.4624771270284822, "learning_rate": 6.645360629928838e-06, "loss": 0.3197, "step": 589 }, { "epoch": 1.15, "grad_norm": 1.5197668597603329, "learning_rate": 6.6432644611779e-06, "loss": 0.3383, "step": 590 }, { "epoch": 1.15, "grad_norm": 1.5126689864852634, "learning_rate": 6.641162448353711e-06, "loss": 0.3753, "step": 591 }, { "epoch": 1.16, "grad_norm": 1.3412735800769449, "learning_rate": 6.639054595364409e-06, "loss": 0.3333, "step": 592 }, { "epoch": 1.16, "grad_norm": 1.4824056030838642, "learning_rate": 6.6369409061289945e-06, "loss": 0.3691, "step": 593 }, { "epoch": 1.16, "grad_norm": 1.3845916653458221, "learning_rate": 6.634821384577314e-06, "loss": 0.2807, "step": 594 }, { "epoch": 1.16, "grad_norm": 1.524344933862754, "learning_rate": 6.632696034650063e-06, "loss": 0.3291, "step": 595 }, { "epoch": 1.16, "grad_norm": 1.368740846207929, "learning_rate": 6.630564860298768e-06, "loss": 0.3193, "step": 596 }, { "epoch": 1.17, "grad_norm": 1.469034426530807, "learning_rate": 6.628427865485789e-06, "loss": 0.3246, "step": 597 }, { "epoch": 1.17, "grad_norm": 1.3681223514409826, "learning_rate": 6.626285054184303e-06, "loss": 0.2943, "step": 598 }, { "epoch": 1.17, "grad_norm": 1.513374549601433, "learning_rate": 6.624136430378307e-06, "loss": 0.3273, "step": 599 }, { "epoch": 1.17, "grad_norm": 1.5459215360797343, "learning_rate": 6.6219819980625995e-06, "loss": 0.3385, "step": 600 }, { "epoch": 1.17, "grad_norm": 1.4851367239029383, "learning_rate": 6.619821761242781e-06, "loss": 0.2681, "step": 601 }, { "epoch": 1.18, "grad_norm": 1.431453411077164, "learning_rate": 6.617655723935244e-06, "loss": 0.3112, "step": 602 }, { "epoch": 1.18, "grad_norm": 1.6070811648760868, "learning_rate": 6.615483890167164e-06, "loss": 0.3384, "step": 603 }, { "epoch": 1.18, "grad_norm": 1.4098337589174694, "learning_rate": 6.613306263976496e-06, "loss": 0.2908, "step": 604 }, { "epoch": 1.18, "grad_norm": 1.380870368409901, "learning_rate": 6.6111228494119616e-06, "loss": 0.2932, "step": 605 }, { "epoch": 1.18, "grad_norm": 1.401692049552829, "learning_rate": 6.6089336505330466e-06, "loss": 0.3209, "step": 606 }, { "epoch": 1.19, "grad_norm": 1.5924253936917894, "learning_rate": 6.606738671409989e-06, "loss": 0.3531, "step": 607 }, { "epoch": 1.19, "grad_norm": 1.6237627750347543, "learning_rate": 6.604537916123775e-06, "loss": 0.3223, "step": 608 }, { "epoch": 1.19, "grad_norm": 1.5776967505708548, "learning_rate": 6.602331388766133e-06, "loss": 0.3088, "step": 609 }, { "epoch": 1.19, "grad_norm": 1.6002226216803075, "learning_rate": 6.600119093439517e-06, "loss": 0.3274, "step": 610 }, { "epoch": 1.19, "grad_norm": 1.4877100275202666, "learning_rate": 6.5979010342571085e-06, "loss": 0.2743, "step": 611 }, { "epoch": 1.2, "grad_norm": 1.7061697837867096, "learning_rate": 6.595677215342806e-06, "loss": 0.3764, "step": 612 }, { "epoch": 1.2, "grad_norm": 1.4873416906097539, "learning_rate": 6.593447640831215e-06, "loss": 0.3048, "step": 613 }, { "epoch": 1.2, "grad_norm": 1.5607546071504712, "learning_rate": 6.591212314867643e-06, "loss": 0.3337, "step": 614 }, { "epoch": 1.2, "grad_norm": 1.543056786172919, "learning_rate": 6.58897124160809e-06, "loss": 0.2716, "step": 615 }, { "epoch": 1.2, "grad_norm": 1.5614691718578486, "learning_rate": 6.5867244252192426e-06, "loss": 0.299, "step": 616 }, { "epoch": 1.21, "grad_norm": 1.5556696502280438, "learning_rate": 6.584471869878464e-06, "loss": 0.3274, "step": 617 }, { "epoch": 1.21, "grad_norm": 1.5266763505481131, "learning_rate": 6.58221357977379e-06, "loss": 0.3135, "step": 618 }, { "epoch": 1.21, "grad_norm": 1.4663144858938315, "learning_rate": 6.579949559103914e-06, "loss": 0.2523, "step": 619 }, { "epoch": 1.21, "grad_norm": 1.5813650506947463, "learning_rate": 6.577679812078189e-06, "loss": 0.2747, "step": 620 }, { "epoch": 1.21, "grad_norm": 1.4316267631272948, "learning_rate": 6.575404342916612e-06, "loss": 0.3146, "step": 621 }, { "epoch": 1.21, "grad_norm": 1.7160141228932517, "learning_rate": 6.573123155849819e-06, "loss": 0.3529, "step": 622 }, { "epoch": 1.22, "grad_norm": 1.643795196763233, "learning_rate": 6.570836255119078e-06, "loss": 0.3181, "step": 623 }, { "epoch": 1.22, "grad_norm": 1.6187408650083381, "learning_rate": 6.568543644976277e-06, "loss": 0.3234, "step": 624 }, { "epoch": 1.22, "grad_norm": 1.5601316992040704, "learning_rate": 6.566245329683923e-06, "loss": 0.2813, "step": 625 }, { "epoch": 1.22, "grad_norm": 1.4077318011856856, "learning_rate": 6.563941313515128e-06, "loss": 0.2979, "step": 626 }, { "epoch": 1.22, "grad_norm": 1.4964589435526412, "learning_rate": 6.5616316007536055e-06, "loss": 0.3098, "step": 627 }, { "epoch": 1.23, "grad_norm": 1.6105259788976034, "learning_rate": 6.559316195693656e-06, "loss": 0.2917, "step": 628 }, { "epoch": 1.23, "grad_norm": 1.4880969526455767, "learning_rate": 6.556995102640168e-06, "loss": 0.2927, "step": 629 }, { "epoch": 1.23, "grad_norm": 1.6038816134533636, "learning_rate": 6.5546683259086015e-06, "loss": 0.2601, "step": 630 }, { "epoch": 1.23, "grad_norm": 1.4126687848987542, "learning_rate": 6.552335869824988e-06, "loss": 0.2657, "step": 631 }, { "epoch": 1.23, "grad_norm": 1.4322697940387383, "learning_rate": 6.549997738725915e-06, "loss": 0.2733, "step": 632 }, { "epoch": 1.24, "grad_norm": 1.6466108531211048, "learning_rate": 6.547653936958522e-06, "loss": 0.3141, "step": 633 }, { "epoch": 1.24, "grad_norm": 1.4941018333628282, "learning_rate": 6.54530446888049e-06, "loss": 0.2828, "step": 634 }, { "epoch": 1.24, "grad_norm": 1.7002486813473032, "learning_rate": 6.542949338860039e-06, "loss": 0.4398, "step": 635 }, { "epoch": 1.24, "grad_norm": 1.624169985270247, "learning_rate": 6.540588551275913e-06, "loss": 0.3141, "step": 636 }, { "epoch": 1.24, "grad_norm": 1.546025452412661, "learning_rate": 6.538222110517375e-06, "loss": 0.2746, "step": 637 }, { "epoch": 1.25, "grad_norm": 1.5168444938594585, "learning_rate": 6.5358500209842005e-06, "loss": 0.2386, "step": 638 }, { "epoch": 1.25, "grad_norm": 1.5292959906446877, "learning_rate": 6.533472287086663e-06, "loss": 0.3262, "step": 639 }, { "epoch": 1.25, "grad_norm": 1.481689321484185, "learning_rate": 6.531088913245536e-06, "loss": 0.2684, "step": 640 }, { "epoch": 1.25, "grad_norm": 1.4323958400384247, "learning_rate": 6.528699903892073e-06, "loss": 0.3145, "step": 641 }, { "epoch": 1.25, "grad_norm": 1.5701524470627148, "learning_rate": 6.526305263468012e-06, "loss": 0.319, "step": 642 }, { "epoch": 1.26, "grad_norm": 1.3976110126570156, "learning_rate": 6.523904996425554e-06, "loss": 0.2629, "step": 643 }, { "epoch": 1.26, "grad_norm": 1.5942436576913042, "learning_rate": 6.5214991072273635e-06, "loss": 0.234, "step": 644 }, { "epoch": 1.26, "grad_norm": 1.4617885452205481, "learning_rate": 6.5190876003465626e-06, "loss": 0.2914, "step": 645 }, { "epoch": 1.26, "grad_norm": 1.423483723622913, "learning_rate": 6.516670480266711e-06, "loss": 0.2539, "step": 646 }, { "epoch": 1.26, "grad_norm": 1.5737955056059307, "learning_rate": 6.514247751481805e-06, "loss": 0.2927, "step": 647 }, { "epoch": 1.27, "grad_norm": 1.5796911262541091, "learning_rate": 6.511819418496276e-06, "loss": 0.2868, "step": 648 }, { "epoch": 1.27, "grad_norm": 1.517328819246049, "learning_rate": 6.509385485824968e-06, "loss": 0.3386, "step": 649 }, { "epoch": 1.27, "grad_norm": 1.388730965497905, "learning_rate": 6.506945957993139e-06, "loss": 0.266, "step": 650 }, { "epoch": 1.27, "grad_norm": 1.4830481071198485, "learning_rate": 6.504500839536449e-06, "loss": 0.2845, "step": 651 }, { "epoch": 1.27, "grad_norm": 1.5465628928401935, "learning_rate": 6.502050135000952e-06, "loss": 0.2852, "step": 652 }, { "epoch": 1.28, "grad_norm": 1.412089187628656, "learning_rate": 6.499593848943089e-06, "loss": 0.25, "step": 653 }, { "epoch": 1.28, "grad_norm": 1.6845688034270652, "learning_rate": 6.4971319859296766e-06, "loss": 0.2742, "step": 654 }, { "epoch": 1.28, "grad_norm": 1.4460676753882034, "learning_rate": 6.494664550537902e-06, "loss": 0.27, "step": 655 }, { "epoch": 1.28, "grad_norm": 1.6825963198732656, "learning_rate": 6.492191547355313e-06, "loss": 0.2564, "step": 656 }, { "epoch": 1.28, "grad_norm": 1.4880057150009678, "learning_rate": 6.489712980979807e-06, "loss": 0.2694, "step": 657 }, { "epoch": 1.29, "grad_norm": 1.4553454323059298, "learning_rate": 6.4872288560196266e-06, "loss": 0.2651, "step": 658 }, { "epoch": 1.29, "grad_norm": 1.4841500800446976, "learning_rate": 6.484739177093348e-06, "loss": 0.2056, "step": 659 }, { "epoch": 1.29, "grad_norm": 1.6687333194100546, "learning_rate": 6.482243948829876e-06, "loss": 0.2514, "step": 660 }, { "epoch": 1.29, "grad_norm": 1.5433610270226492, "learning_rate": 6.479743175868428e-06, "loss": 0.2734, "step": 661 }, { "epoch": 1.29, "grad_norm": 1.5220811481932661, "learning_rate": 6.477236862858536e-06, "loss": 0.2386, "step": 662 }, { "epoch": 1.29, "grad_norm": 1.6769457131977623, "learning_rate": 6.474725014460028e-06, "loss": 0.2286, "step": 663 }, { "epoch": 1.3, "grad_norm": 1.6131105750586237, "learning_rate": 6.472207635343026e-06, "loss": 0.2473, "step": 664 }, { "epoch": 1.3, "grad_norm": 1.533144534978118, "learning_rate": 6.469684730187934e-06, "loss": 0.3097, "step": 665 }, { "epoch": 1.3, "grad_norm": 1.5233191568166384, "learning_rate": 6.467156303685431e-06, "loss": 0.2515, "step": 666 }, { "epoch": 1.3, "grad_norm": 1.5113358725103458, "learning_rate": 6.4646223605364595e-06, "loss": 0.2237, "step": 667 }, { "epoch": 1.3, "grad_norm": 1.429739352199233, "learning_rate": 6.46208290545222e-06, "loss": 0.2851, "step": 668 }, { "epoch": 1.31, "grad_norm": 1.3994940217976886, "learning_rate": 6.459537943154163e-06, "loss": 0.3276, "step": 669 }, { "epoch": 1.31, "grad_norm": 1.41604166525969, "learning_rate": 6.456987478373975e-06, "loss": 0.2655, "step": 670 }, { "epoch": 1.31, "grad_norm": 1.4768265197107389, "learning_rate": 6.454431515853573e-06, "loss": 0.2984, "step": 671 }, { "epoch": 1.31, "grad_norm": 1.426949399514644, "learning_rate": 6.4518700603451e-06, "loss": 0.2304, "step": 672 }, { "epoch": 1.31, "grad_norm": 1.5025637068711146, "learning_rate": 6.449303116610906e-06, "loss": 0.2231, "step": 673 }, { "epoch": 1.32, "grad_norm": 1.5457885325918674, "learning_rate": 6.446730689423548e-06, "loss": 0.2896, "step": 674 }, { "epoch": 1.32, "grad_norm": 1.5919246839169228, "learning_rate": 6.444152783565778e-06, "loss": 0.2554, "step": 675 }, { "epoch": 1.32, "grad_norm": 1.7336773803211802, "learning_rate": 6.441569403830533e-06, "loss": 0.2468, "step": 676 }, { "epoch": 1.32, "grad_norm": 1.5841163836284198, "learning_rate": 6.438980555020928e-06, "loss": 0.2564, "step": 677 }, { "epoch": 1.32, "grad_norm": 1.5781575825558105, "learning_rate": 6.436386241950248e-06, "loss": 0.2605, "step": 678 }, { "epoch": 1.33, "grad_norm": 1.563688561460372, "learning_rate": 6.433786469441933e-06, "loss": 0.2037, "step": 679 }, { "epoch": 1.33, "grad_norm": 1.4443012269938738, "learning_rate": 6.431181242329578e-06, "loss": 0.2542, "step": 680 }, { "epoch": 1.33, "grad_norm": 1.4313554516561364, "learning_rate": 6.428570565456915e-06, "loss": 0.2482, "step": 681 }, { "epoch": 1.33, "grad_norm": 1.658513984865977, "learning_rate": 6.4259544436778135e-06, "loss": 0.2468, "step": 682 }, { "epoch": 1.33, "grad_norm": 1.78205064914089, "learning_rate": 6.423332881856262e-06, "loss": 0.245, "step": 683 }, { "epoch": 1.34, "grad_norm": 1.580091135790614, "learning_rate": 6.420705884866365e-06, "loss": 0.2656, "step": 684 }, { "epoch": 1.34, "grad_norm": 1.354923334870393, "learning_rate": 6.418073457592333e-06, "loss": 0.2159, "step": 685 }, { "epoch": 1.34, "grad_norm": 1.4193395921908514, "learning_rate": 6.415435604928471e-06, "loss": 0.2468, "step": 686 }, { "epoch": 1.34, "grad_norm": 1.6126758078740027, "learning_rate": 6.412792331779172e-06, "loss": 0.2713, "step": 687 }, { "epoch": 1.34, "grad_norm": 1.5015955816721034, "learning_rate": 6.4101436430589085e-06, "loss": 0.2837, "step": 688 }, { "epoch": 1.35, "grad_norm": 1.548516270781378, "learning_rate": 6.407489543692218e-06, "loss": 0.2144, "step": 689 }, { "epoch": 1.35, "grad_norm": 1.5995219217348766, "learning_rate": 6.4048300386137025e-06, "loss": 0.2554, "step": 690 }, { "epoch": 1.35, "grad_norm": 1.679758813775177, "learning_rate": 6.4021651327680095e-06, "loss": 0.2431, "step": 691 }, { "epoch": 1.35, "grad_norm": 1.419932234807484, "learning_rate": 6.399494831109832e-06, "loss": 0.2231, "step": 692 }, { "epoch": 1.35, "grad_norm": 1.5233933466687235, "learning_rate": 6.396819138603892e-06, "loss": 0.2261, "step": 693 }, { "epoch": 1.36, "grad_norm": 1.6888302306573564, "learning_rate": 6.394138060224937e-06, "loss": 0.2338, "step": 694 }, { "epoch": 1.36, "grad_norm": 1.6638618646346788, "learning_rate": 6.391451600957725e-06, "loss": 0.2056, "step": 695 }, { "epoch": 1.36, "grad_norm": 1.54427585785311, "learning_rate": 6.3887597657970235e-06, "loss": 0.2688, "step": 696 }, { "epoch": 1.36, "grad_norm": 1.5946152581461341, "learning_rate": 6.386062559747589e-06, "loss": 0.2598, "step": 697 }, { "epoch": 1.36, "grad_norm": 1.487377119633243, "learning_rate": 6.383359987824167e-06, "loss": 0.239, "step": 698 }, { "epoch": 1.37, "grad_norm": 1.6229678779967882, "learning_rate": 6.380652055051478e-06, "loss": 0.2207, "step": 699 }, { "epoch": 1.37, "grad_norm": 1.4993555723099932, "learning_rate": 6.377938766464212e-06, "loss": 0.2544, "step": 700 }, { "epoch": 1.37, "grad_norm": 1.5529690013943946, "learning_rate": 6.375220127107016e-06, "loss": 0.276, "step": 701 }, { "epoch": 1.37, "grad_norm": 1.5458659385556328, "learning_rate": 6.372496142034483e-06, "loss": 0.2691, "step": 702 }, { "epoch": 1.37, "grad_norm": 1.7998702736915513, "learning_rate": 6.369766816311148e-06, "loss": 0.2265, "step": 703 }, { "epoch": 1.38, "grad_norm": 1.6158625479345892, "learning_rate": 6.367032155011471e-06, "loss": 0.2247, "step": 704 }, { "epoch": 1.38, "grad_norm": 1.5621265616372126, "learning_rate": 6.364292163219839e-06, "loss": 0.2554, "step": 705 }, { "epoch": 1.38, "grad_norm": 1.55778482941904, "learning_rate": 6.361546846030543e-06, "loss": 0.215, "step": 706 }, { "epoch": 1.38, "grad_norm": 1.527657854007939, "learning_rate": 6.358796208547779e-06, "loss": 0.1757, "step": 707 }, { "epoch": 1.38, "grad_norm": 1.5132164218022688, "learning_rate": 6.3560402558856354e-06, "loss": 0.1716, "step": 708 }, { "epoch": 1.38, "grad_norm": 1.7526913896126513, "learning_rate": 6.353278993168078e-06, "loss": 0.2412, "step": 709 }, { "epoch": 1.39, "grad_norm": 1.5818420368239936, "learning_rate": 6.350512425528949e-06, "loss": 0.1872, "step": 710 }, { "epoch": 1.39, "grad_norm": 1.5872912063123072, "learning_rate": 6.347740558111955e-06, "loss": 0.1772, "step": 711 }, { "epoch": 1.39, "grad_norm": 1.5016042437481156, "learning_rate": 6.3449633960706536e-06, "loss": 0.2082, "step": 712 }, { "epoch": 1.39, "grad_norm": 2.059905205693757, "learning_rate": 6.342180944568445e-06, "loss": 0.2193, "step": 713 }, { "epoch": 1.39, "grad_norm": 1.739657667727544, "learning_rate": 6.339393208778568e-06, "loss": 0.255, "step": 714 }, { "epoch": 1.4, "grad_norm": 1.6979345382067677, "learning_rate": 6.336600193884082e-06, "loss": 0.2001, "step": 715 }, { "epoch": 1.4, "grad_norm": 1.7297300021739745, "learning_rate": 6.333801905077864e-06, "loss": 0.2254, "step": 716 }, { "epoch": 1.4, "grad_norm": 1.8459933435558482, "learning_rate": 6.330998347562596e-06, "loss": 0.1932, "step": 717 }, { "epoch": 1.4, "grad_norm": 1.8059762126520358, "learning_rate": 6.328189526550756e-06, "loss": 0.2285, "step": 718 }, { "epoch": 1.4, "grad_norm": 1.477559911886489, "learning_rate": 6.325375447264607e-06, "loss": 0.187, "step": 719 }, { "epoch": 1.41, "grad_norm": 1.701061838578498, "learning_rate": 6.322556114936189e-06, "loss": 0.2364, "step": 720 }, { "epoch": 1.41, "grad_norm": 1.6679447748605194, "learning_rate": 6.319731534807309e-06, "loss": 0.2463, "step": 721 }, { "epoch": 1.41, "grad_norm": 1.519668342812649, "learning_rate": 6.31690171212953e-06, "loss": 0.2393, "step": 722 }, { "epoch": 1.41, "grad_norm": 1.396869929090381, "learning_rate": 6.314066652164164e-06, "loss": 0.2159, "step": 723 }, { "epoch": 1.41, "grad_norm": 1.5067283495838106, "learning_rate": 6.311226360182257e-06, "loss": 0.2478, "step": 724 }, { "epoch": 1.42, "grad_norm": 1.6667028557490369, "learning_rate": 6.308380841464587e-06, "loss": 0.2022, "step": 725 }, { "epoch": 1.42, "grad_norm": 1.5739626687355739, "learning_rate": 6.305530101301645e-06, "loss": 0.2186, "step": 726 }, { "epoch": 1.42, "grad_norm": 1.491052065969651, "learning_rate": 6.302674144993634e-06, "loss": 0.2795, "step": 727 }, { "epoch": 1.42, "grad_norm": 1.7427878020149625, "learning_rate": 6.2998129778504535e-06, "loss": 0.2064, "step": 728 }, { "epoch": 1.42, "grad_norm": 1.6137607527961562, "learning_rate": 6.2969466051916905e-06, "loss": 0.2398, "step": 729 }, { "epoch": 1.43, "grad_norm": 1.5714176484918514, "learning_rate": 6.29407503234661e-06, "loss": 0.1631, "step": 730 }, { "epoch": 1.43, "grad_norm": 1.6861223902197622, "learning_rate": 6.291198264654147e-06, "loss": 0.2357, "step": 731 }, { "epoch": 1.43, "grad_norm": 1.6583564945480405, "learning_rate": 6.288316307462895e-06, "loss": 0.2291, "step": 732 }, { "epoch": 1.43, "grad_norm": 1.6109123932238907, "learning_rate": 6.285429166131092e-06, "loss": 0.2628, "step": 733 }, { "epoch": 1.43, "grad_norm": 1.6073728101180118, "learning_rate": 6.282536846026621e-06, "loss": 0.2244, "step": 734 }, { "epoch": 1.44, "grad_norm": 1.602954509323332, "learning_rate": 6.279639352526989e-06, "loss": 0.2364, "step": 735 }, { "epoch": 1.44, "grad_norm": 1.5287905678764164, "learning_rate": 6.276736691019323e-06, "loss": 0.2136, "step": 736 }, { "epoch": 1.44, "grad_norm": 1.630107853737259, "learning_rate": 6.273828866900358e-06, "loss": 0.2488, "step": 737 }, { "epoch": 1.44, "grad_norm": 1.6156099831891164, "learning_rate": 6.270915885576429e-06, "loss": 0.1849, "step": 738 }, { "epoch": 1.44, "grad_norm": 1.4618523644859287, "learning_rate": 6.267997752463455e-06, "loss": 0.1996, "step": 739 }, { "epoch": 1.45, "grad_norm": 1.619284338152327, "learning_rate": 6.265074472986942e-06, "loss": 0.2034, "step": 740 }, { "epoch": 1.45, "grad_norm": 1.6115841739805956, "learning_rate": 6.262146052581954e-06, "loss": 0.2406, "step": 741 }, { "epoch": 1.45, "grad_norm": 1.385773682095175, "learning_rate": 6.259212496693122e-06, "loss": 0.22, "step": 742 }, { "epoch": 1.45, "grad_norm": 1.4853860994690589, "learning_rate": 6.2562738107746195e-06, "loss": 0.2272, "step": 743 }, { "epoch": 1.45, "grad_norm": 1.4593761328277106, "learning_rate": 6.253330000290159e-06, "loss": 0.2103, "step": 744 }, { "epoch": 1.46, "grad_norm": 1.439560811143782, "learning_rate": 6.250381070712984e-06, "loss": 0.1711, "step": 745 }, { "epoch": 1.46, "grad_norm": 1.4935544430885708, "learning_rate": 6.247427027525851e-06, "loss": 0.2326, "step": 746 }, { "epoch": 1.46, "grad_norm": 1.4029595474146663, "learning_rate": 6.244467876221027e-06, "loss": 0.2109, "step": 747 }, { "epoch": 1.46, "grad_norm": 1.4728303789670665, "learning_rate": 6.241503622300277e-06, "loss": 0.261, "step": 748 }, { "epoch": 1.46, "grad_norm": 1.5161079424349184, "learning_rate": 6.238534271274847e-06, "loss": 0.2563, "step": 749 }, { "epoch": 1.46, "grad_norm": 1.419337089524256, "learning_rate": 6.235559828665468e-06, "loss": 0.2172, "step": 750 }, { "epoch": 1.47, "grad_norm": 1.5248577750276426, "learning_rate": 6.2325803000023306e-06, "loss": 0.2134, "step": 751 }, { "epoch": 1.47, "grad_norm": 1.6784825981589788, "learning_rate": 6.229595690825086e-06, "loss": 0.2438, "step": 752 }, { "epoch": 1.47, "grad_norm": 1.4742380818310923, "learning_rate": 6.2266060066828295e-06, "loss": 0.2137, "step": 753 }, { "epoch": 1.47, "grad_norm": 1.4543967845905672, "learning_rate": 6.223611253134092e-06, "loss": 0.2236, "step": 754 }, { "epoch": 1.47, "grad_norm": 1.6542517887252408, "learning_rate": 6.22061143574683e-06, "loss": 0.2393, "step": 755 }, { "epoch": 1.48, "grad_norm": 1.554357335529555, "learning_rate": 6.217606560098415e-06, "loss": 0.1984, "step": 756 }, { "epoch": 1.48, "grad_norm": 1.5359990775038832, "learning_rate": 6.214596631775621e-06, "loss": 0.2304, "step": 757 }, { "epoch": 1.48, "grad_norm": 1.5400405299027415, "learning_rate": 6.21158165637462e-06, "loss": 0.269, "step": 758 }, { "epoch": 1.48, "grad_norm": 1.5235011556724805, "learning_rate": 6.208561639500964e-06, "loss": 0.1994, "step": 759 }, { "epoch": 1.48, "grad_norm": 1.5124463998529605, "learning_rate": 6.205536586769579e-06, "loss": 0.1965, "step": 760 }, { "epoch": 1.49, "grad_norm": 1.4220228340531154, "learning_rate": 6.2025065038047566e-06, "loss": 0.1776, "step": 761 }, { "epoch": 1.49, "grad_norm": 1.5367698331373916, "learning_rate": 6.199471396240139e-06, "loss": 0.2062, "step": 762 }, { "epoch": 1.49, "grad_norm": 1.3438803261086743, "learning_rate": 6.196431269718709e-06, "loss": 0.1992, "step": 763 }, { "epoch": 1.49, "grad_norm": 1.4885002430409782, "learning_rate": 6.193386129892782e-06, "loss": 0.1855, "step": 764 }, { "epoch": 1.49, "grad_norm": 1.4568065487416695, "learning_rate": 6.1903359824239935e-06, "loss": 0.2262, "step": 765 }, { "epoch": 1.5, "grad_norm": 1.537870577785427, "learning_rate": 6.1872808329832926e-06, "loss": 0.2143, "step": 766 }, { "epoch": 1.5, "grad_norm": 1.4873084082908952, "learning_rate": 6.184220687250923e-06, "loss": 0.2397, "step": 767 }, { "epoch": 1.5, "grad_norm": 1.6377625742059532, "learning_rate": 6.181155550916423e-06, "loss": 0.2499, "step": 768 }, { "epoch": 1.5, "grad_norm": 1.5115919138106764, "learning_rate": 6.178085429678607e-06, "loss": 0.2165, "step": 769 }, { "epoch": 1.5, "grad_norm": 1.4154546469010034, "learning_rate": 6.175010329245555e-06, "loss": 0.2223, "step": 770 }, { "epoch": 1.51, "grad_norm": 1.5617140945481642, "learning_rate": 6.1719302553346105e-06, "loss": 0.2357, "step": 771 }, { "epoch": 1.51, "grad_norm": 1.3901779831637933, "learning_rate": 6.168845213672358e-06, "loss": 0.1984, "step": 772 }, { "epoch": 1.51, "grad_norm": 1.576334449867566, "learning_rate": 6.165755209994623e-06, "loss": 0.2383, "step": 773 }, { "epoch": 1.51, "grad_norm": 1.510250375268073, "learning_rate": 6.162660250046452e-06, "loss": 0.2011, "step": 774 }, { "epoch": 1.51, "grad_norm": 1.4648927512769898, "learning_rate": 6.15956033958211e-06, "loss": 0.2034, "step": 775 }, { "epoch": 1.52, "grad_norm": 1.4691830371678116, "learning_rate": 6.156455484365066e-06, "loss": 0.22, "step": 776 }, { "epoch": 1.52, "grad_norm": 1.5344976088680549, "learning_rate": 6.1533456901679806e-06, "loss": 0.205, "step": 777 }, { "epoch": 1.52, "grad_norm": 1.5066209647188327, "learning_rate": 6.150230962772696e-06, "loss": 0.171, "step": 778 }, { "epoch": 1.52, "grad_norm": 1.494462397632066, "learning_rate": 6.147111307970229e-06, "loss": 0.1698, "step": 779 }, { "epoch": 1.52, "grad_norm": 1.530856795809198, "learning_rate": 6.143986731560761e-06, "loss": 0.2159, "step": 780 }, { "epoch": 1.53, "grad_norm": 1.5843616209506817, "learning_rate": 6.140857239353613e-06, "loss": 0.2307, "step": 781 }, { "epoch": 1.53, "grad_norm": 1.434807436553126, "learning_rate": 6.137722837167257e-06, "loss": 0.2196, "step": 782 }, { "epoch": 1.53, "grad_norm": 1.5906697337267315, "learning_rate": 6.134583530829289e-06, "loss": 0.2052, "step": 783 }, { "epoch": 1.53, "grad_norm": 1.6676458630188786, "learning_rate": 6.131439326176421e-06, "loss": 0.2256, "step": 784 }, { "epoch": 1.53, "grad_norm": 1.4459116127229694, "learning_rate": 6.1282902290544755e-06, "loss": 0.2033, "step": 785 }, { "epoch": 1.54, "grad_norm": 1.3992511215135668, "learning_rate": 6.125136245318369e-06, "loss": 0.1792, "step": 786 }, { "epoch": 1.54, "grad_norm": 1.592777715402256, "learning_rate": 6.121977380832107e-06, "loss": 0.2092, "step": 787 }, { "epoch": 1.54, "grad_norm": 1.3838238590509728, "learning_rate": 6.118813641468765e-06, "loss": 0.1622, "step": 788 }, { "epoch": 1.54, "grad_norm": 1.5091694602768735, "learning_rate": 6.115645033110484e-06, "loss": 0.1792, "step": 789 }, { "epoch": 1.54, "grad_norm": 1.6306514612527883, "learning_rate": 6.112471561648458e-06, "loss": 0.2092, "step": 790 }, { "epoch": 1.54, "grad_norm": 1.5760256499476375, "learning_rate": 6.109293232982922e-06, "loss": 0.2232, "step": 791 }, { "epoch": 1.55, "grad_norm": 1.6775191763778843, "learning_rate": 6.1061100530231424e-06, "loss": 0.2302, "step": 792 }, { "epoch": 1.55, "grad_norm": 1.432696701489475, "learning_rate": 6.102922027687403e-06, "loss": 0.2195, "step": 793 }, { "epoch": 1.55, "grad_norm": 1.5454408137000604, "learning_rate": 6.0997291629030006e-06, "loss": 0.2297, "step": 794 }, { "epoch": 1.55, "grad_norm": 1.5239463632284074, "learning_rate": 6.0965314646062255e-06, "loss": 0.1887, "step": 795 }, { "epoch": 1.55, "grad_norm": 1.4378339993028284, "learning_rate": 6.093328938742357e-06, "loss": 0.2428, "step": 796 }, { "epoch": 1.56, "grad_norm": 1.6019937247230225, "learning_rate": 6.090121591265649e-06, "loss": 0.1743, "step": 797 }, { "epoch": 1.56, "grad_norm": 1.5972177580956295, "learning_rate": 6.086909428139321e-06, "loss": 0.2512, "step": 798 }, { "epoch": 1.56, "grad_norm": 1.609015025587239, "learning_rate": 6.083692455335545e-06, "loss": 0.2117, "step": 799 }, { "epoch": 1.56, "grad_norm": 1.5484115210429834, "learning_rate": 6.080470678835434e-06, "loss": 0.2408, "step": 800 }, { "epoch": 1.56, "grad_norm": 1.7154977008665826, "learning_rate": 6.077244104629035e-06, "loss": 0.2571, "step": 801 }, { "epoch": 1.57, "grad_norm": 1.5750055615218004, "learning_rate": 6.074012738715316e-06, "loss": 0.2322, "step": 802 }, { "epoch": 1.57, "grad_norm": 1.4409112024269977, "learning_rate": 6.070776587102147e-06, "loss": 0.2002, "step": 803 }, { "epoch": 1.57, "grad_norm": 1.5601877924013323, "learning_rate": 6.067535655806304e-06, "loss": 0.1891, "step": 804 }, { "epoch": 1.57, "grad_norm": 1.4650521523873927, "learning_rate": 6.064289950853444e-06, "loss": 0.1916, "step": 805 }, { "epoch": 1.57, "grad_norm": 1.4396844164475386, "learning_rate": 6.061039478278104e-06, "loss": 0.1439, "step": 806 }, { "epoch": 1.58, "grad_norm": 1.4516319960848205, "learning_rate": 6.05778424412368e-06, "loss": 0.1774, "step": 807 }, { "epoch": 1.58, "grad_norm": 1.4320549172944732, "learning_rate": 6.054524254442424e-06, "loss": 0.1843, "step": 808 }, { "epoch": 1.58, "grad_norm": 1.5784859107863396, "learning_rate": 6.0512595152954305e-06, "loss": 0.1949, "step": 809 }, { "epoch": 1.58, "grad_norm": 1.524753963687103, "learning_rate": 6.047990032752622e-06, "loss": 0.1879, "step": 810 }, { "epoch": 1.58, "grad_norm": 1.5415042494487146, "learning_rate": 6.04471581289274e-06, "loss": 0.1918, "step": 811 }, { "epoch": 1.59, "grad_norm": 1.5622133043755708, "learning_rate": 6.0414368618033354e-06, "loss": 0.2116, "step": 812 }, { "epoch": 1.59, "grad_norm": 1.6272655891389358, "learning_rate": 6.038153185580757e-06, "loss": 0.1628, "step": 813 }, { "epoch": 1.59, "grad_norm": 1.4687944399572777, "learning_rate": 6.0348647903301345e-06, "loss": 0.1774, "step": 814 }, { "epoch": 1.59, "grad_norm": 1.6747152917437942, "learning_rate": 6.031571682165374e-06, "loss": 0.2447, "step": 815 }, { "epoch": 1.59, "grad_norm": 1.6851546522288474, "learning_rate": 6.028273867209144e-06, "loss": 0.2249, "step": 816 }, { "epoch": 1.6, "grad_norm": 1.4825704341843833, "learning_rate": 6.0249713515928645e-06, "loss": 0.2013, "step": 817 }, { "epoch": 1.6, "grad_norm": 1.5688165623077204, "learning_rate": 6.0216641414566945e-06, "loss": 0.2012, "step": 818 }, { "epoch": 1.6, "grad_norm": 1.498510111034304, "learning_rate": 6.018352242949519e-06, "loss": 0.1959, "step": 819 }, { "epoch": 1.6, "grad_norm": 1.5487837110645661, "learning_rate": 6.015035662228943e-06, "loss": 0.1585, "step": 820 }, { "epoch": 1.6, "grad_norm": 1.5161898744769695, "learning_rate": 6.011714405461277e-06, "loss": 0.1934, "step": 821 }, { "epoch": 1.61, "grad_norm": 1.6369585928059978, "learning_rate": 6.008388478821523e-06, "loss": 0.2444, "step": 822 }, { "epoch": 1.61, "grad_norm": 1.5770821996868292, "learning_rate": 6.005057888493365e-06, "loss": 0.2238, "step": 823 }, { "epoch": 1.61, "grad_norm": 1.6206247784198546, "learning_rate": 6.001722640669162e-06, "loss": 0.2278, "step": 824 }, { "epoch": 1.61, "grad_norm": 1.4956456198486852, "learning_rate": 5.998382741549929e-06, "loss": 0.2439, "step": 825 }, { "epoch": 1.61, "grad_norm": 1.4884186846271075, "learning_rate": 5.995038197345329e-06, "loss": 0.2169, "step": 826 }, { "epoch": 1.62, "grad_norm": 1.4630932836165946, "learning_rate": 5.991689014273663e-06, "loss": 0.2149, "step": 827 }, { "epoch": 1.62, "grad_norm": 1.4842433361333227, "learning_rate": 5.988335198561855e-06, "loss": 0.1427, "step": 828 }, { "epoch": 1.62, "grad_norm": 1.6939331740793713, "learning_rate": 5.984976756445443e-06, "loss": 0.2054, "step": 829 }, { "epoch": 1.62, "grad_norm": 1.5245906739438482, "learning_rate": 5.981613694168567e-06, "loss": 0.188, "step": 830 }, { "epoch": 1.62, "grad_norm": 1.6103902584383138, "learning_rate": 5.978246017983955e-06, "loss": 0.2321, "step": 831 }, { "epoch": 1.62, "grad_norm": 1.7578359946643838, "learning_rate": 5.974873734152916e-06, "loss": 0.1997, "step": 832 }, { "epoch": 1.63, "grad_norm": 1.4862979091337336, "learning_rate": 5.971496848945324e-06, "loss": 0.1939, "step": 833 }, { "epoch": 1.63, "grad_norm": 1.693546290372143, "learning_rate": 5.96811536863961e-06, "loss": 0.1993, "step": 834 }, { "epoch": 1.63, "grad_norm": 1.6513287639716825, "learning_rate": 5.964729299522746e-06, "loss": 0.2268, "step": 835 }, { "epoch": 1.63, "grad_norm": 1.6374901858056834, "learning_rate": 5.961338647890235e-06, "loss": 0.1956, "step": 836 }, { "epoch": 1.63, "grad_norm": 1.508515336773621, "learning_rate": 5.9579434200461045e-06, "loss": 0.2119, "step": 837 }, { "epoch": 1.64, "grad_norm": 1.6974448563305895, "learning_rate": 5.954543622302885e-06, "loss": 0.1949, "step": 838 }, { "epoch": 1.64, "grad_norm": 1.560895488041379, "learning_rate": 5.951139260981607e-06, "loss": 0.1737, "step": 839 }, { "epoch": 1.64, "grad_norm": 1.8419372889085235, "learning_rate": 5.947730342411785e-06, "loss": 0.254, "step": 840 }, { "epoch": 1.64, "grad_norm": 1.54808146851788, "learning_rate": 5.944316872931405e-06, "loss": 0.2254, "step": 841 }, { "epoch": 1.64, "grad_norm": 1.5348635500295191, "learning_rate": 5.940898858886916e-06, "loss": 0.2427, "step": 842 }, { "epoch": 1.65, "grad_norm": 1.4798924438134817, "learning_rate": 5.937476306633216e-06, "loss": 0.2218, "step": 843 }, { "epoch": 1.65, "grad_norm": 1.5631999505920282, "learning_rate": 5.93404922253364e-06, "loss": 0.2219, "step": 844 }, { "epoch": 1.65, "grad_norm": 1.5044944967770686, "learning_rate": 5.9306176129599504e-06, "loss": 0.2394, "step": 845 }, { "epoch": 1.65, "grad_norm": 1.4695565495496319, "learning_rate": 5.927181484292321e-06, "loss": 0.2407, "step": 846 }, { "epoch": 1.65, "grad_norm": 1.356035334652753, "learning_rate": 5.923740842919329e-06, "loss": 0.2144, "step": 847 }, { "epoch": 1.66, "grad_norm": 1.415615732777814, "learning_rate": 5.9202956952379435e-06, "loss": 0.2173, "step": 848 }, { "epoch": 1.66, "grad_norm": 1.384588414759804, "learning_rate": 5.916846047653508e-06, "loss": 0.2148, "step": 849 }, { "epoch": 1.66, "grad_norm": 1.3711770767632199, "learning_rate": 5.913391906579735e-06, "loss": 0.2069, "step": 850 }, { "epoch": 1.66, "grad_norm": 1.3773306287472222, "learning_rate": 5.909933278438691e-06, "loss": 0.1718, "step": 851 }, { "epoch": 1.66, "grad_norm": 1.4852408818964347, "learning_rate": 5.9064701696607854e-06, "loss": 0.2365, "step": 852 }, { "epoch": 1.67, "grad_norm": 1.5014759805661089, "learning_rate": 5.903002586684759e-06, "loss": 0.2333, "step": 853 }, { "epoch": 1.67, "grad_norm": 1.4840467153638823, "learning_rate": 5.8995305359576685e-06, "loss": 0.1923, "step": 854 }, { "epoch": 1.67, "grad_norm": 1.42582686162404, "learning_rate": 5.896054023934879e-06, "loss": 0.2152, "step": 855 }, { "epoch": 1.67, "grad_norm": 1.4390382561313497, "learning_rate": 5.892573057080049e-06, "loss": 0.2819, "step": 856 }, { "epoch": 1.67, "grad_norm": 1.3633252088910854, "learning_rate": 5.8890876418651235e-06, "loss": 0.2086, "step": 857 }, { "epoch": 1.68, "grad_norm": 1.580468279117873, "learning_rate": 5.885597784770311e-06, "loss": 0.205, "step": 858 }, { "epoch": 1.68, "grad_norm": 1.6213928105589794, "learning_rate": 5.882103492284086e-06, "loss": 0.1902, "step": 859 }, { "epoch": 1.68, "grad_norm": 1.406153319312404, "learning_rate": 5.878604770903163e-06, "loss": 0.2158, "step": 860 }, { "epoch": 1.68, "grad_norm": 1.552072247386532, "learning_rate": 5.875101627132497e-06, "loss": 0.2119, "step": 861 }, { "epoch": 1.68, "grad_norm": 1.4683160496507135, "learning_rate": 5.8715940674852605e-06, "loss": 0.1641, "step": 862 }, { "epoch": 1.69, "grad_norm": 1.5784107990669474, "learning_rate": 5.868082098482837e-06, "loss": 0.1848, "step": 863 }, { "epoch": 1.69, "grad_norm": 1.606456752918685, "learning_rate": 5.864565726654811e-06, "loss": 0.1886, "step": 864 }, { "epoch": 1.69, "grad_norm": 1.6393660827319165, "learning_rate": 5.8610449585389485e-06, "loss": 0.2219, "step": 865 }, { "epoch": 1.69, "grad_norm": 1.613059510621349, "learning_rate": 5.857519800681193e-06, "loss": 0.1982, "step": 866 }, { "epoch": 1.69, "grad_norm": 1.403536188844097, "learning_rate": 5.853990259635647e-06, "loss": 0.1975, "step": 867 }, { "epoch": 1.7, "grad_norm": 1.7342346850816917, "learning_rate": 5.850456341964565e-06, "loss": 0.2048, "step": 868 }, { "epoch": 1.7, "grad_norm": 1.6897390175734006, "learning_rate": 5.846918054238335e-06, "loss": 0.1933, "step": 869 }, { "epoch": 1.7, "grad_norm": 1.362949082968982, "learning_rate": 5.8433754030354725e-06, "loss": 0.1955, "step": 870 }, { "epoch": 1.7, "grad_norm": 1.6897216890227944, "learning_rate": 5.839828394942607e-06, "loss": 0.2203, "step": 871 }, { "epoch": 1.7, "grad_norm": 1.444929761409237, "learning_rate": 5.836277036554466e-06, "loss": 0.2117, "step": 872 }, { "epoch": 1.71, "grad_norm": 1.3787681816843602, "learning_rate": 5.8327213344738656e-06, "loss": 0.2025, "step": 873 }, { "epoch": 1.71, "grad_norm": 1.4887437994036987, "learning_rate": 5.829161295311698e-06, "loss": 0.172, "step": 874 }, { "epoch": 1.71, "grad_norm": 1.613534784040808, "learning_rate": 5.8255969256869195e-06, "loss": 0.2645, "step": 875 }, { "epoch": 1.71, "grad_norm": 1.6279486117616642, "learning_rate": 5.822028232226539e-06, "loss": 0.2322, "step": 876 }, { "epoch": 1.71, "grad_norm": 1.691372167975381, "learning_rate": 5.8184552215656015e-06, "loss": 0.1939, "step": 877 }, { "epoch": 1.71, "grad_norm": 1.5785667479959244, "learning_rate": 5.81487790034718e-06, "loss": 0.2275, "step": 878 }, { "epoch": 1.72, "grad_norm": 1.5784131014173868, "learning_rate": 5.811296275222363e-06, "loss": 0.231, "step": 879 }, { "epoch": 1.72, "grad_norm": 1.4867777665182866, "learning_rate": 5.807710352850241e-06, "loss": 0.1806, "step": 880 }, { "epoch": 1.72, "grad_norm": 1.5953224684416196, "learning_rate": 5.804120139897891e-06, "loss": 0.2551, "step": 881 }, { "epoch": 1.72, "grad_norm": 1.6582329410501013, "learning_rate": 5.800525643040371e-06, "loss": 0.1765, "step": 882 }, { "epoch": 1.72, "grad_norm": 1.650492890700563, "learning_rate": 5.796926868960701e-06, "loss": 0.1618, "step": 883 }, { "epoch": 1.73, "grad_norm": 1.7754051607549333, "learning_rate": 5.793323824349856e-06, "loss": 0.2288, "step": 884 }, { "epoch": 1.73, "grad_norm": 1.5430081420550046, "learning_rate": 5.7897165159067485e-06, "loss": 0.149, "step": 885 }, { "epoch": 1.73, "grad_norm": 1.623056327724441, "learning_rate": 5.7861049503382185e-06, "loss": 0.1992, "step": 886 }, { "epoch": 1.73, "grad_norm": 1.5662083600994776, "learning_rate": 5.782489134359023e-06, "loss": 0.222, "step": 887 }, { "epoch": 1.73, "grad_norm": 1.710366610129068, "learning_rate": 5.778869074691822e-06, "loss": 0.2369, "step": 888 }, { "epoch": 1.74, "grad_norm": 1.5464075052444228, "learning_rate": 5.775244778067161e-06, "loss": 0.2036, "step": 889 }, { "epoch": 1.74, "grad_norm": 1.5094900300010041, "learning_rate": 5.771616251223469e-06, "loss": 0.1968, "step": 890 }, { "epoch": 1.74, "grad_norm": 1.5051785161612603, "learning_rate": 5.767983500907034e-06, "loss": 0.223, "step": 891 }, { "epoch": 1.74, "grad_norm": 1.4091347327895254, "learning_rate": 5.764346533872001e-06, "loss": 0.1543, "step": 892 }, { "epoch": 1.74, "grad_norm": 1.503789977282737, "learning_rate": 5.760705356880353e-06, "loss": 0.1909, "step": 893 }, { "epoch": 1.75, "grad_norm": 1.4171732318897141, "learning_rate": 5.757059976701901e-06, "loss": 0.2172, "step": 894 }, { "epoch": 1.75, "grad_norm": 1.404696323642193, "learning_rate": 5.75341040011427e-06, "loss": 0.2319, "step": 895 }, { "epoch": 1.75, "grad_norm": 1.3823662977121143, "learning_rate": 5.749756633902887e-06, "loss": 0.2103, "step": 896 }, { "epoch": 1.75, "grad_norm": 1.4979136251703113, "learning_rate": 5.74609868486097e-06, "loss": 0.2007, "step": 897 }, { "epoch": 1.75, "grad_norm": 1.4616817802780977, "learning_rate": 5.742436559789513e-06, "loss": 0.2076, "step": 898 }, { "epoch": 1.76, "grad_norm": 1.4115746615792877, "learning_rate": 5.738770265497272e-06, "loss": 0.2058, "step": 899 }, { "epoch": 1.76, "grad_norm": 1.5008953809051688, "learning_rate": 5.735099808800758e-06, "loss": 0.1975, "step": 900 }, { "epoch": 1.76, "grad_norm": 1.4895589297528806, "learning_rate": 5.73142519652422e-06, "loss": 0.185, "step": 901 }, { "epoch": 1.76, "grad_norm": 1.4898671289762404, "learning_rate": 5.727746435499632e-06, "loss": 0.1979, "step": 902 }, { "epoch": 1.76, "grad_norm": 1.547487152471578, "learning_rate": 5.724063532566682e-06, "loss": 0.2229, "step": 903 }, { "epoch": 1.77, "grad_norm": 1.4397163431696676, "learning_rate": 5.720376494572759e-06, "loss": 0.2115, "step": 904 }, { "epoch": 1.77, "grad_norm": 1.433457965843154, "learning_rate": 5.716685328372941e-06, "loss": 0.229, "step": 905 }, { "epoch": 1.77, "grad_norm": 1.5400450600867779, "learning_rate": 5.712990040829979e-06, "loss": 0.2533, "step": 906 }, { "epoch": 1.77, "grad_norm": 1.4393542452743997, "learning_rate": 5.70929063881429e-06, "loss": 0.1847, "step": 907 }, { "epoch": 1.77, "grad_norm": 1.5409922541559207, "learning_rate": 5.705587129203936e-06, "loss": 0.2221, "step": 908 }, { "epoch": 1.78, "grad_norm": 1.542358699967642, "learning_rate": 5.701879518884622e-06, "loss": 0.1754, "step": 909 }, { "epoch": 1.78, "grad_norm": 1.4837788374186094, "learning_rate": 5.69816781474967e-06, "loss": 0.2277, "step": 910 }, { "epoch": 1.78, "grad_norm": 1.5016542556561119, "learning_rate": 5.694452023700021e-06, "loss": 0.203, "step": 911 }, { "epoch": 1.78, "grad_norm": 1.3150969074034786, "learning_rate": 5.690732152644207e-06, "loss": 0.2103, "step": 912 }, { "epoch": 1.78, "grad_norm": 1.5069747546409284, "learning_rate": 5.687008208498352e-06, "loss": 0.2252, "step": 913 }, { "epoch": 1.79, "grad_norm": 1.5757567952670728, "learning_rate": 5.6832801981861506e-06, "loss": 0.2093, "step": 914 }, { "epoch": 1.79, "grad_norm": 1.5304273068559795, "learning_rate": 5.6795481286388565e-06, "loss": 0.2698, "step": 915 }, { "epoch": 1.79, "grad_norm": 1.5682306982997618, "learning_rate": 5.675812006795271e-06, "loss": 0.2302, "step": 916 }, { "epoch": 1.79, "grad_norm": 1.5747328935537674, "learning_rate": 5.67207183960173e-06, "loss": 0.2113, "step": 917 }, { "epoch": 1.79, "grad_norm": 1.5595899714321466, "learning_rate": 5.668327634012089e-06, "loss": 0.1684, "step": 918 }, { "epoch": 1.79, "grad_norm": 1.53076843493047, "learning_rate": 5.664579396987714e-06, "loss": 0.2004, "step": 919 }, { "epoch": 1.8, "grad_norm": 1.5166844152353496, "learning_rate": 5.6608271354974675e-06, "loss": 0.2229, "step": 920 }, { "epoch": 1.8, "grad_norm": 1.5131252257233316, "learning_rate": 5.657070856517689e-06, "loss": 0.2194, "step": 921 }, { "epoch": 1.8, "grad_norm": 1.572520998936348, "learning_rate": 5.653310567032194e-06, "loss": 0.2428, "step": 922 }, { "epoch": 1.8, "grad_norm": 1.546825499803589, "learning_rate": 5.64954627403225e-06, "loss": 0.2056, "step": 923 }, { "epoch": 1.8, "grad_norm": 1.4505644441122847, "learning_rate": 5.645777984516568e-06, "loss": 0.2389, "step": 924 }, { "epoch": 1.81, "grad_norm": 1.382605938715351, "learning_rate": 5.6420057054912946e-06, "loss": 0.1367, "step": 925 }, { "epoch": 1.81, "grad_norm": 1.6263346216542405, "learning_rate": 5.638229443969987e-06, "loss": 0.1627, "step": 926 }, { "epoch": 1.81, "grad_norm": 1.5025412823867277, "learning_rate": 5.63444920697361e-06, "loss": 0.2174, "step": 927 }, { "epoch": 1.81, "grad_norm": 1.6003755121158845, "learning_rate": 5.630665001530522e-06, "loss": 0.211, "step": 928 }, { "epoch": 1.81, "grad_norm": 1.4288562376193203, "learning_rate": 5.6268768346764565e-06, "loss": 0.1732, "step": 929 }, { "epoch": 1.82, "grad_norm": 1.6136390808978025, "learning_rate": 5.623084713454511e-06, "loss": 0.252, "step": 930 }, { "epoch": 1.82, "grad_norm": 1.5713445293709851, "learning_rate": 5.61928864491514e-06, "loss": 0.1748, "step": 931 }, { "epoch": 1.82, "grad_norm": 1.4835366730688204, "learning_rate": 5.615488636116131e-06, "loss": 0.2022, "step": 932 }, { "epoch": 1.82, "grad_norm": 1.5900951562126573, "learning_rate": 5.611684694122604e-06, "loss": 0.2407, "step": 933 }, { "epoch": 1.82, "grad_norm": 1.6634390773327132, "learning_rate": 5.607876826006988e-06, "loss": 0.2048, "step": 934 }, { "epoch": 1.83, "grad_norm": 1.3771835458092547, "learning_rate": 5.604065038849008e-06, "loss": 0.2052, "step": 935 }, { "epoch": 1.83, "grad_norm": 1.4176916619502649, "learning_rate": 5.600249339735683e-06, "loss": 0.1792, "step": 936 }, { "epoch": 1.83, "grad_norm": 1.4407303891280816, "learning_rate": 5.596429735761302e-06, "loss": 0.2055, "step": 937 }, { "epoch": 1.83, "grad_norm": 1.452691176311634, "learning_rate": 5.592606234027411e-06, "loss": 0.1472, "step": 938 }, { "epoch": 1.83, "grad_norm": 1.6542047301958496, "learning_rate": 5.588778841642805e-06, "loss": 0.2053, "step": 939 }, { "epoch": 1.84, "grad_norm": 1.49852793622414, "learning_rate": 5.584947565723517e-06, "loss": 0.1821, "step": 940 }, { "epoch": 1.84, "grad_norm": 1.455303228615125, "learning_rate": 5.581112413392794e-06, "loss": 0.1747, "step": 941 }, { "epoch": 1.84, "grad_norm": 1.4744296696403933, "learning_rate": 5.577273391781091e-06, "loss": 0.1839, "step": 942 }, { "epoch": 1.84, "grad_norm": 1.5374786475250082, "learning_rate": 5.573430508026063e-06, "loss": 0.1794, "step": 943 }, { "epoch": 1.84, "grad_norm": 1.5336623213170721, "learning_rate": 5.569583769272539e-06, "loss": 0.2131, "step": 944 }, { "epoch": 1.85, "grad_norm": 1.441470467129757, "learning_rate": 5.5657331826725164e-06, "loss": 0.2049, "step": 945 }, { "epoch": 1.85, "grad_norm": 1.7582109053626103, "learning_rate": 5.561878755385149e-06, "loss": 0.2335, "step": 946 }, { "epoch": 1.85, "grad_norm": 1.423089949235948, "learning_rate": 5.55802049457673e-06, "loss": 0.1813, "step": 947 }, { "epoch": 1.85, "grad_norm": 1.5088274334127034, "learning_rate": 5.554158407420681e-06, "loss": 0.2027, "step": 948 }, { "epoch": 1.85, "grad_norm": 1.4968079567508494, "learning_rate": 5.550292501097536e-06, "loss": 0.145, "step": 949 }, { "epoch": 1.86, "grad_norm": 1.5330745404463335, "learning_rate": 5.546422782794931e-06, "loss": 0.1906, "step": 950 }, { "epoch": 1.86, "grad_norm": 1.5132692854607526, "learning_rate": 5.542549259707588e-06, "loss": 0.247, "step": 951 }, { "epoch": 1.86, "grad_norm": 1.5225968006386879, "learning_rate": 5.5386719390373075e-06, "loss": 0.1773, "step": 952 }, { "epoch": 1.86, "grad_norm": 1.506067174550098, "learning_rate": 5.5347908279929435e-06, "loss": 0.2333, "step": 953 }, { "epoch": 1.86, "grad_norm": 1.3694334141057647, "learning_rate": 5.530905933790402e-06, "loss": 0.1911, "step": 954 }, { "epoch": 1.87, "grad_norm": 1.412871429445833, "learning_rate": 5.527017263652621e-06, "loss": 0.2278, "step": 955 }, { "epoch": 1.87, "grad_norm": 1.4747767037841124, "learning_rate": 5.523124824809562e-06, "loss": 0.2362, "step": 956 }, { "epoch": 1.87, "grad_norm": 1.4669236035563638, "learning_rate": 5.519228624498188e-06, "loss": 0.2007, "step": 957 }, { "epoch": 1.87, "grad_norm": 1.4751307031423995, "learning_rate": 5.515328669962459e-06, "loss": 0.2221, "step": 958 }, { "epoch": 1.87, "grad_norm": 1.4175827718809022, "learning_rate": 5.5114249684533145e-06, "loss": 0.2323, "step": 959 }, { "epoch": 1.88, "grad_norm": 1.610545508455769, "learning_rate": 5.507517527228661e-06, "loss": 0.2254, "step": 960 }, { "epoch": 1.88, "grad_norm": 1.362192280405189, "learning_rate": 5.503606353553358e-06, "loss": 0.136, "step": 961 }, { "epoch": 1.88, "grad_norm": 1.3621297350053299, "learning_rate": 5.499691454699202e-06, "loss": 0.1867, "step": 962 }, { "epoch": 1.88, "grad_norm": 1.3989776967249343, "learning_rate": 5.495772837944917e-06, "loss": 0.2386, "step": 963 }, { "epoch": 1.88, "grad_norm": 1.4812298087341456, "learning_rate": 5.4918505105761435e-06, "loss": 0.2787, "step": 964 }, { "epoch": 1.88, "grad_norm": 1.4959406054581912, "learning_rate": 5.4879244798854145e-06, "loss": 0.187, "step": 965 }, { "epoch": 1.89, "grad_norm": 1.3805571311394118, "learning_rate": 5.483994753172151e-06, "loss": 0.1667, "step": 966 }, { "epoch": 1.89, "grad_norm": 1.4905649754322212, "learning_rate": 5.4800613377426455e-06, "loss": 0.2556, "step": 967 }, { "epoch": 1.89, "grad_norm": 1.4504554340490876, "learning_rate": 5.476124240910052e-06, "loss": 0.191, "step": 968 }, { "epoch": 1.89, "grad_norm": 1.4783500642137533, "learning_rate": 5.472183469994362e-06, "loss": 0.176, "step": 969 }, { "epoch": 1.89, "grad_norm": 1.662078086472112, "learning_rate": 5.468239032322407e-06, "loss": 0.2183, "step": 970 }, { "epoch": 1.9, "grad_norm": 1.549880400707491, "learning_rate": 5.464290935227826e-06, "loss": 0.2103, "step": 971 }, { "epoch": 1.9, "grad_norm": 1.4474578136333722, "learning_rate": 5.460339186051069e-06, "loss": 0.1676, "step": 972 }, { "epoch": 1.9, "grad_norm": 1.507945104043349, "learning_rate": 5.456383792139375e-06, "loss": 0.2033, "step": 973 }, { "epoch": 1.9, "grad_norm": 1.699987038195162, "learning_rate": 5.452424760846757e-06, "loss": 0.2137, "step": 974 }, { "epoch": 1.9, "grad_norm": 1.4443832411800812, "learning_rate": 5.4484620995339936e-06, "loss": 0.2071, "step": 975 }, { "epoch": 1.91, "grad_norm": 1.567656808021611, "learning_rate": 5.444495815568607e-06, "loss": 0.2444, "step": 976 }, { "epoch": 1.91, "grad_norm": 1.440747529387856, "learning_rate": 5.440525916324864e-06, "loss": 0.2145, "step": 977 }, { "epoch": 1.91, "grad_norm": 1.6651981119047268, "learning_rate": 5.436552409183743e-06, "loss": 0.2521, "step": 978 }, { "epoch": 1.91, "grad_norm": 1.5174055733881613, "learning_rate": 5.432575301532938e-06, "loss": 0.1926, "step": 979 }, { "epoch": 1.91, "grad_norm": 1.5276422748620826, "learning_rate": 5.428594600766834e-06, "loss": 0.2419, "step": 980 }, { "epoch": 1.92, "grad_norm": 1.5358908541157168, "learning_rate": 5.424610314286495e-06, "loss": 0.2077, "step": 981 }, { "epoch": 1.92, "grad_norm": 1.3543089161932491, "learning_rate": 5.420622449499655e-06, "loss": 0.171, "step": 982 }, { "epoch": 1.92, "grad_norm": 1.5953598918401888, "learning_rate": 5.4166310138207e-06, "loss": 0.2391, "step": 983 }, { "epoch": 1.92, "grad_norm": 1.3982616945356046, "learning_rate": 5.412636014670652e-06, "loss": 0.2103, "step": 984 }, { "epoch": 1.92, "grad_norm": 1.8469951218402687, "learning_rate": 5.408637459477162e-06, "loss": 0.2434, "step": 985 }, { "epoch": 1.93, "grad_norm": 1.571947770284599, "learning_rate": 5.404635355674492e-06, "loss": 0.2616, "step": 986 }, { "epoch": 1.93, "grad_norm": 1.3933550290404235, "learning_rate": 5.400629710703501e-06, "loss": 0.171, "step": 987 }, { "epoch": 1.93, "grad_norm": 1.4157643202213892, "learning_rate": 5.396620532011631e-06, "loss": 0.1836, "step": 988 }, { "epoch": 1.93, "grad_norm": 1.4330653966628857, "learning_rate": 5.392607827052896e-06, "loss": 0.1637, "step": 989 }, { "epoch": 1.93, "grad_norm": 1.372557520191389, "learning_rate": 5.388591603287863e-06, "loss": 0.2176, "step": 990 }, { "epoch": 1.94, "grad_norm": 1.4376236165103073, "learning_rate": 5.384571868183646e-06, "loss": 0.2395, "step": 991 }, { "epoch": 1.94, "grad_norm": 1.4346277972946018, "learning_rate": 5.380548629213884e-06, "loss": 0.2135, "step": 992 }, { "epoch": 1.94, "grad_norm": 1.478406689331299, "learning_rate": 5.37652189385873e-06, "loss": 0.1991, "step": 993 }, { "epoch": 1.94, "grad_norm": 1.4022488952305974, "learning_rate": 5.372491669604841e-06, "loss": 0.1951, "step": 994 }, { "epoch": 1.94, "grad_norm": 1.5393460026284773, "learning_rate": 5.368457963945356e-06, "loss": 0.183, "step": 995 }, { "epoch": 1.95, "grad_norm": 1.3711315704586695, "learning_rate": 5.364420784379892e-06, "loss": 0.1737, "step": 996 }, { "epoch": 1.95, "grad_norm": 1.514664264512654, "learning_rate": 5.360380138414521e-06, "loss": 0.2523, "step": 997 }, { "epoch": 1.95, "grad_norm": 2.258754420111654, "learning_rate": 5.356336033561761e-06, "loss": 0.1857, "step": 998 }, { "epoch": 1.95, "grad_norm": 1.444599068013082, "learning_rate": 5.352288477340562e-06, "loss": 0.2099, "step": 999 }, { "epoch": 1.95, "grad_norm": 1.5012313177972192, "learning_rate": 5.348237477276288e-06, "loss": 0.2023, "step": 1000 }, { "epoch": 1.96, "grad_norm": 1.3644500541213127, "learning_rate": 5.344183040900709e-06, "loss": 0.1942, "step": 1001 }, { "epoch": 1.96, "grad_norm": 1.536899981887165, "learning_rate": 5.340125175751983e-06, "loss": 0.2077, "step": 1002 }, { "epoch": 1.96, "grad_norm": 1.5777741488098669, "learning_rate": 5.336063889374641e-06, "loss": 0.2533, "step": 1003 }, { "epoch": 1.96, "grad_norm": 1.6296391634504244, "learning_rate": 5.331999189319578e-06, "loss": 0.1648, "step": 1004 }, { "epoch": 1.96, "grad_norm": 1.5734792671217386, "learning_rate": 5.327931083144033e-06, "loss": 0.2095, "step": 1005 }, { "epoch": 1.96, "grad_norm": 1.4333608599033871, "learning_rate": 5.323859578411582e-06, "loss": 0.1718, "step": 1006 }, { "epoch": 1.97, "grad_norm": 1.5367382900479294, "learning_rate": 5.319784682692114e-06, "loss": 0.2223, "step": 1007 }, { "epoch": 1.97, "grad_norm": 1.5267437948541092, "learning_rate": 5.315706403561825e-06, "loss": 0.1858, "step": 1008 }, { "epoch": 1.97, "grad_norm": 1.587130109612982, "learning_rate": 5.311624748603203e-06, "loss": 0.238, "step": 1009 }, { "epoch": 1.97, "grad_norm": 1.3999512354358525, "learning_rate": 5.3075397254050135e-06, "loss": 0.1592, "step": 1010 }, { "epoch": 1.97, "grad_norm": 1.5723722597080947, "learning_rate": 5.30345134156228e-06, "loss": 0.2449, "step": 1011 }, { "epoch": 1.98, "grad_norm": 1.4744357558091978, "learning_rate": 5.299359604676275e-06, "loss": 0.2331, "step": 1012 }, { "epoch": 1.98, "grad_norm": 1.3881502099136842, "learning_rate": 5.295264522354512e-06, "loss": 0.1999, "step": 1013 }, { "epoch": 1.98, "grad_norm": 1.4857553101383152, "learning_rate": 5.291166102210713e-06, "loss": 0.2056, "step": 1014 }, { "epoch": 1.98, "grad_norm": 1.619598943846148, "learning_rate": 5.287064351864818e-06, "loss": 0.2152, "step": 1015 }, { "epoch": 1.98, "grad_norm": 1.638238646463492, "learning_rate": 5.282959278942947e-06, "loss": 0.1666, "step": 1016 }, { "epoch": 1.99, "grad_norm": 1.4256389367405697, "learning_rate": 5.2788508910774055e-06, "loss": 0.2433, "step": 1017 }, { "epoch": 1.99, "grad_norm": 1.6021606610383718, "learning_rate": 5.27473919590666e-06, "loss": 0.1962, "step": 1018 }, { "epoch": 1.99, "grad_norm": 1.4947061576901712, "learning_rate": 5.270624201075326e-06, "loss": 0.199, "step": 1019 }, { "epoch": 1.99, "grad_norm": 1.271398406734294, "learning_rate": 5.266505914234152e-06, "loss": 0.1855, "step": 1020 }, { "epoch": 1.99, "grad_norm": 1.6343625658845287, "learning_rate": 5.2623843430400116e-06, "loss": 0.2181, "step": 1021 }, { "epoch": 2.0, "grad_norm": 1.4219936019664021, "learning_rate": 5.25825949515588e-06, "loss": 0.1609, "step": 1022 }, { "epoch": 2.0, "grad_norm": 1.4625338633354514, "learning_rate": 5.254131378250826e-06, "loss": 0.2049, "step": 1023 }, { "epoch": 2.0, "grad_norm": 1.485413904424515, "learning_rate": 5.25e-06, "loss": 0.1923, "step": 1024 }, { "epoch": 2.0, "grad_norm": 1.4684883683953387, "learning_rate": 5.24586536808461e-06, "loss": 0.2288, "step": 1025 }, { "epoch": 2.0, "grad_norm": 1.5862239648295868, "learning_rate": 5.241727490191916e-06, "loss": 0.2185, "step": 1026 }, { "epoch": 2.01, "grad_norm": 1.548737133960248, "learning_rate": 5.237586374015216e-06, "loss": 0.1954, "step": 1027 }, { "epoch": 2.01, "grad_norm": 1.5393768743672311, "learning_rate": 5.233442027253823e-06, "loss": 0.2101, "step": 1028 }, { "epoch": 2.01, "grad_norm": 1.4765146280325605, "learning_rate": 5.229294457613061e-06, "loss": 0.23, "step": 1029 }, { "epoch": 2.01, "grad_norm": 1.6117131252068877, "learning_rate": 5.2251436728042444e-06, "loss": 0.1844, "step": 1030 }, { "epoch": 2.01, "grad_norm": 1.6279599028063632, "learning_rate": 5.2209896805446645e-06, "loss": 0.2059, "step": 1031 }, { "epoch": 2.02, "grad_norm": 1.4653045470628405, "learning_rate": 5.216832488557577e-06, "loss": 0.1988, "step": 1032 }, { "epoch": 2.02, "grad_norm": 1.4164470110633578, "learning_rate": 5.212672104572189e-06, "loss": 0.2105, "step": 1033 }, { "epoch": 2.02, "grad_norm": 1.4949446770955879, "learning_rate": 5.208508536323637e-06, "loss": 0.2107, "step": 1034 }, { "epoch": 2.02, "grad_norm": 1.464032877604832, "learning_rate": 5.204341791552983e-06, "loss": 0.1868, "step": 1035 }, { "epoch": 2.02, "grad_norm": 1.412426912125762, "learning_rate": 5.20017187800719e-06, "loss": 0.2074, "step": 1036 }, { "epoch": 2.03, "grad_norm": 1.4370098497163, "learning_rate": 5.195998803439117e-06, "loss": 0.2131, "step": 1037 }, { "epoch": 2.03, "grad_norm": 1.362383796187994, "learning_rate": 5.191822575607498e-06, "loss": 0.2187, "step": 1038 }, { "epoch": 2.03, "grad_norm": 1.4418606532060592, "learning_rate": 5.18764320227693e-06, "loss": 0.1853, "step": 1039 }, { "epoch": 2.03, "grad_norm": 1.4459174296577728, "learning_rate": 5.183460691217857e-06, "loss": 0.1668, "step": 1040 }, { "epoch": 2.03, "grad_norm": 1.5099833769668716, "learning_rate": 5.179275050206558e-06, "loss": 0.1803, "step": 1041 }, { "epoch": 2.04, "grad_norm": 1.383510182423923, "learning_rate": 5.175086287025134e-06, "loss": 0.1698, "step": 1042 }, { "epoch": 2.04, "grad_norm": 1.4672781523406975, "learning_rate": 5.170894409461483e-06, "loss": 0.2006, "step": 1043 }, { "epoch": 2.04, "grad_norm": 1.5860334887590435, "learning_rate": 5.166699425309303e-06, "loss": 0.1959, "step": 1044 }, { "epoch": 2.04, "grad_norm": 1.6078374650199438, "learning_rate": 5.1625013423680605e-06, "loss": 0.1849, "step": 1045 }, { "epoch": 2.04, "grad_norm": 1.603623202926053, "learning_rate": 5.158300168442987e-06, "loss": 0.2246, "step": 1046 }, { "epoch": 2.04, "grad_norm": 1.6578347335245123, "learning_rate": 5.154095911345061e-06, "loss": 0.1749, "step": 1047 }, { "epoch": 2.05, "grad_norm": 1.8561627848557876, "learning_rate": 5.1498885788909926e-06, "loss": 0.1768, "step": 1048 }, { "epoch": 2.05, "grad_norm": 1.7171939905156925, "learning_rate": 5.1456781789032064e-06, "loss": 0.1482, "step": 1049 }, { "epoch": 2.05, "grad_norm": 1.4684954591644706, "learning_rate": 5.141464719209837e-06, "loss": 0.1657, "step": 1050 }, { "epoch": 2.05, "grad_norm": 1.5266808833075105, "learning_rate": 5.137248207644702e-06, "loss": 0.1618, "step": 1051 }, { "epoch": 2.05, "grad_norm": 1.7124475419312717, "learning_rate": 5.133028652047296e-06, "loss": 0.2059, "step": 1052 }, { "epoch": 2.06, "grad_norm": 1.6702813560989793, "learning_rate": 5.128806060262774e-06, "loss": 0.215, "step": 1053 }, { "epoch": 2.06, "grad_norm": 1.550224986565842, "learning_rate": 5.12458044014193e-06, "loss": 0.1742, "step": 1054 }, { "epoch": 2.06, "grad_norm": 1.4870216029120185, "learning_rate": 5.120351799541198e-06, "loss": 0.1929, "step": 1055 }, { "epoch": 2.06, "grad_norm": 1.6455407328350369, "learning_rate": 5.11612014632262e-06, "loss": 0.2429, "step": 1056 }, { "epoch": 2.06, "grad_norm": 1.5025074192969599, "learning_rate": 5.1118854883538396e-06, "loss": 0.1763, "step": 1057 }, { "epoch": 2.07, "grad_norm": 1.5595128148291753, "learning_rate": 5.107647833508094e-06, "loss": 0.1972, "step": 1058 }, { "epoch": 2.07, "grad_norm": 1.5159784136321621, "learning_rate": 5.103407189664184e-06, "loss": 0.1834, "step": 1059 }, { "epoch": 2.07, "grad_norm": 1.6222249738313559, "learning_rate": 5.099163564706473e-06, "loss": 0.1552, "step": 1060 }, { "epoch": 2.07, "grad_norm": 1.5081877245018884, "learning_rate": 5.094916966524863e-06, "loss": 0.1567, "step": 1061 }, { "epoch": 2.07, "grad_norm": 1.4974309369370569, "learning_rate": 5.090667403014788e-06, "loss": 0.1347, "step": 1062 }, { "epoch": 2.08, "grad_norm": 1.4515384371316755, "learning_rate": 5.0864148820771915e-06, "loss": 0.2254, "step": 1063 }, { "epoch": 2.08, "grad_norm": 1.7504345474507534, "learning_rate": 5.082159411618519e-06, "loss": 0.1652, "step": 1064 }, { "epoch": 2.08, "grad_norm": 1.3732105989961998, "learning_rate": 5.077900999550697e-06, "loss": 0.1315, "step": 1065 }, { "epoch": 2.08, "grad_norm": 1.4700162459326744, "learning_rate": 5.0736396537911234e-06, "loss": 0.1575, "step": 1066 }, { "epoch": 2.08, "grad_norm": 1.5076997132530245, "learning_rate": 5.069375382262648e-06, "loss": 0.1529, "step": 1067 }, { "epoch": 2.09, "grad_norm": 1.565105914864787, "learning_rate": 5.065108192893563e-06, "loss": 0.139, "step": 1068 }, { "epoch": 2.09, "grad_norm": 1.7096174848746915, "learning_rate": 5.0608380936175835e-06, "loss": 0.1759, "step": 1069 }, { "epoch": 2.09, "grad_norm": 1.5815684886181345, "learning_rate": 5.056565092373836e-06, "loss": 0.182, "step": 1070 }, { "epoch": 2.09, "grad_norm": 1.6057818726666813, "learning_rate": 5.052289197106843e-06, "loss": 0.1614, "step": 1071 }, { "epoch": 2.09, "grad_norm": 1.5936514619853224, "learning_rate": 5.048010415766505e-06, "loss": 0.2021, "step": 1072 }, { "epoch": 2.1, "grad_norm": 1.5726047505973342, "learning_rate": 5.043728756308091e-06, "loss": 0.21, "step": 1073 }, { "epoch": 2.1, "grad_norm": 1.5831546102374623, "learning_rate": 5.0394442266922196e-06, "loss": 0.1704, "step": 1074 }, { "epoch": 2.1, "grad_norm": 1.5379199647509059, "learning_rate": 5.035156834884847e-06, "loss": 0.1798, "step": 1075 }, { "epoch": 2.1, "grad_norm": 1.465297487615899, "learning_rate": 5.030866588857251e-06, "loss": 0.1523, "step": 1076 }, { "epoch": 2.1, "grad_norm": 1.5823623422601996, "learning_rate": 5.026573496586013e-06, "loss": 0.1684, "step": 1077 }, { "epoch": 2.11, "grad_norm": 1.5418447681725764, "learning_rate": 5.02227756605301e-06, "loss": 0.1739, "step": 1078 }, { "epoch": 2.11, "grad_norm": 1.5417949615152027, "learning_rate": 5.017978805245394e-06, "loss": 0.2017, "step": 1079 }, { "epoch": 2.11, "grad_norm": 1.751041564238575, "learning_rate": 5.013677222155581e-06, "loss": 0.1632, "step": 1080 }, { "epoch": 2.11, "grad_norm": 1.4434729501543526, "learning_rate": 5.009372824781232e-06, "loss": 0.1482, "step": 1081 }, { "epoch": 2.11, "grad_norm": 1.4370005127670842, "learning_rate": 5.0050656211252426e-06, "loss": 0.1991, "step": 1082 }, { "epoch": 2.12, "grad_norm": 1.4748072711209026, "learning_rate": 5.000755619195723e-06, "loss": 0.1314, "step": 1083 }, { "epoch": 2.12, "grad_norm": 1.449397261447236, "learning_rate": 4.996442827005987e-06, "loss": 0.2025, "step": 1084 }, { "epoch": 2.12, "grad_norm": 1.5094632231882639, "learning_rate": 4.992127252574539e-06, "loss": 0.1718, "step": 1085 }, { "epoch": 2.12, "grad_norm": 1.4714711557017799, "learning_rate": 4.987808903925054e-06, "loss": 0.1615, "step": 1086 }, { "epoch": 2.12, "grad_norm": 1.3892043149029167, "learning_rate": 4.983487789086366e-06, "loss": 0.1628, "step": 1087 }, { "epoch": 2.12, "grad_norm": 1.5469111417236905, "learning_rate": 4.979163916092448e-06, "loss": 0.1046, "step": 1088 }, { "epoch": 2.13, "grad_norm": 1.3869912681307255, "learning_rate": 4.974837292982406e-06, "loss": 0.1278, "step": 1089 }, { "epoch": 2.13, "grad_norm": 1.452425883802754, "learning_rate": 4.970507927800459e-06, "loss": 0.122, "step": 1090 }, { "epoch": 2.13, "grad_norm": 1.4964603958210405, "learning_rate": 4.966175828595919e-06, "loss": 0.158, "step": 1091 }, { "epoch": 2.13, "grad_norm": 1.500232745567763, "learning_rate": 4.961841003423187e-06, "loss": 0.1315, "step": 1092 }, { "epoch": 2.13, "grad_norm": 1.5370126347119522, "learning_rate": 4.95750346034173e-06, "loss": 0.1908, "step": 1093 }, { "epoch": 2.14, "grad_norm": 1.5083513297834175, "learning_rate": 4.953163207416067e-06, "loss": 0.1475, "step": 1094 }, { "epoch": 2.14, "grad_norm": 1.5683442438116364, "learning_rate": 4.948820252715757e-06, "loss": 0.151, "step": 1095 }, { "epoch": 2.14, "grad_norm": 1.6598180668719211, "learning_rate": 4.944474604315381e-06, "loss": 0.1409, "step": 1096 }, { "epoch": 2.14, "grad_norm": 1.7006379130392166, "learning_rate": 4.9401262702945304e-06, "loss": 0.1476, "step": 1097 }, { "epoch": 2.14, "grad_norm": 1.7457503608183427, "learning_rate": 4.935775258737787e-06, "loss": 0.2082, "step": 1098 }, { "epoch": 2.15, "grad_norm": 1.541474606124428, "learning_rate": 4.931421577734711e-06, "loss": 0.1479, "step": 1099 }, { "epoch": 2.15, "grad_norm": 1.6092959427271285, "learning_rate": 4.927065235379828e-06, "loss": 0.1451, "step": 1100 }, { "epoch": 2.15, "grad_norm": 1.7403371032105734, "learning_rate": 4.922706239772611e-06, "loss": 0.1689, "step": 1101 }, { "epoch": 2.15, "grad_norm": 1.5737492221942955, "learning_rate": 4.918344599017464e-06, "loss": 0.165, "step": 1102 }, { "epoch": 2.15, "grad_norm": 1.6086025976620293, "learning_rate": 4.913980321223712e-06, "loss": 0.1856, "step": 1103 }, { "epoch": 2.16, "grad_norm": 1.5961418039321171, "learning_rate": 4.9096134145055806e-06, "loss": 0.1968, "step": 1104 }, { "epoch": 2.16, "grad_norm": 2.0705758466105593, "learning_rate": 4.905243886982183e-06, "loss": 0.1804, "step": 1105 }, { "epoch": 2.16, "grad_norm": 1.6744701707948308, "learning_rate": 4.900871746777507e-06, "loss": 0.132, "step": 1106 }, { "epoch": 2.16, "grad_norm": 1.4311254392893684, "learning_rate": 4.896497002020397e-06, "loss": 0.166, "step": 1107 }, { "epoch": 2.16, "grad_norm": 1.449157776119781, "learning_rate": 4.892119660844538e-06, "loss": 0.1817, "step": 1108 }, { "epoch": 2.17, "grad_norm": 1.4685642070137366, "learning_rate": 4.8877397313884485e-06, "loss": 0.172, "step": 1109 }, { "epoch": 2.17, "grad_norm": 1.3445810553583444, "learning_rate": 4.883357221795449e-06, "loss": 0.1407, "step": 1110 }, { "epoch": 2.17, "grad_norm": 1.5149722135171848, "learning_rate": 4.878972140213669e-06, "loss": 0.1592, "step": 1111 }, { "epoch": 2.17, "grad_norm": 1.47671079607355, "learning_rate": 4.87458449479601e-06, "loss": 0.1504, "step": 1112 }, { "epoch": 2.17, "grad_norm": 1.38318253383385, "learning_rate": 4.8701942937001455e-06, "loss": 0.1121, "step": 1113 }, { "epoch": 2.18, "grad_norm": 1.4435000843922474, "learning_rate": 4.865801545088499e-06, "loss": 0.1618, "step": 1114 }, { "epoch": 2.18, "grad_norm": 1.530229894894107, "learning_rate": 4.8614062571282305e-06, "loss": 0.1571, "step": 1115 }, { "epoch": 2.18, "grad_norm": 1.4932536608594913, "learning_rate": 4.857008437991222e-06, "loss": 0.1415, "step": 1116 }, { "epoch": 2.18, "grad_norm": 1.5651061804226014, "learning_rate": 4.852608095854062e-06, "loss": 0.1532, "step": 1117 }, { "epoch": 2.18, "grad_norm": 1.4370026147764883, "learning_rate": 4.848205238898028e-06, "loss": 0.1678, "step": 1118 }, { "epoch": 2.19, "grad_norm": 1.5801579554900138, "learning_rate": 4.843799875309074e-06, "loss": 0.1702, "step": 1119 }, { "epoch": 2.19, "grad_norm": 1.7707835365800484, "learning_rate": 4.8393920132778144e-06, "loss": 0.1495, "step": 1120 }, { "epoch": 2.19, "grad_norm": 1.5146581857754198, "learning_rate": 4.834981660999509e-06, "loss": 0.13, "step": 1121 }, { "epoch": 2.19, "grad_norm": 1.55825215997078, "learning_rate": 4.830568826674048e-06, "loss": 0.1488, "step": 1122 }, { "epoch": 2.19, "grad_norm": 1.4851300862121661, "learning_rate": 4.826153518505937e-06, "loss": 0.1141, "step": 1123 }, { "epoch": 2.2, "grad_norm": 1.6651007238264113, "learning_rate": 4.821735744704276e-06, "loss": 0.1647, "step": 1124 }, { "epoch": 2.2, "grad_norm": 1.5487342630357248, "learning_rate": 4.817315513482755e-06, "loss": 0.1457, "step": 1125 }, { "epoch": 2.2, "grad_norm": 1.573493618322773, "learning_rate": 4.812892833059633e-06, "loss": 0.1507, "step": 1126 }, { "epoch": 2.2, "grad_norm": 1.5114926165910871, "learning_rate": 4.808467711657718e-06, "loss": 0.1161, "step": 1127 }, { "epoch": 2.2, "grad_norm": 1.6326933118295988, "learning_rate": 4.804040157504361e-06, "loss": 0.1286, "step": 1128 }, { "epoch": 2.21, "grad_norm": 1.5905454651402373, "learning_rate": 4.7996101788314315e-06, "loss": 0.1585, "step": 1129 }, { "epoch": 2.21, "grad_norm": 1.5266049808381053, "learning_rate": 4.795177783875312e-06, "loss": 0.1484, "step": 1130 }, { "epoch": 2.21, "grad_norm": 1.4204080677779234, "learning_rate": 4.7907429808768716e-06, "loss": 0.1007, "step": 1131 }, { "epoch": 2.21, "grad_norm": 1.4960828037183986, "learning_rate": 4.786305778081462e-06, "loss": 0.1043, "step": 1132 }, { "epoch": 2.21, "grad_norm": 1.5812593208258534, "learning_rate": 4.7818661837388945e-06, "loss": 0.1659, "step": 1133 }, { "epoch": 2.21, "grad_norm": 1.7159153874175994, "learning_rate": 4.777424206103426e-06, "loss": 0.1457, "step": 1134 }, { "epoch": 2.22, "grad_norm": 1.5761464557156277, "learning_rate": 4.772979853433746e-06, "loss": 0.1274, "step": 1135 }, { "epoch": 2.22, "grad_norm": 1.61947391697077, "learning_rate": 4.7685331339929555e-06, "loss": 0.1414, "step": 1136 }, { "epoch": 2.22, "grad_norm": 1.605525907057918, "learning_rate": 4.764084056048564e-06, "loss": 0.1228, "step": 1137 }, { "epoch": 2.22, "grad_norm": 1.5886547587994995, "learning_rate": 4.759632627872458e-06, "loss": 0.1467, "step": 1138 }, { "epoch": 2.22, "grad_norm": 1.5957058811552505, "learning_rate": 4.755178857740899e-06, "loss": 0.152, "step": 1139 }, { "epoch": 2.23, "grad_norm": 1.6943608086455062, "learning_rate": 4.750722753934501e-06, "loss": 0.1224, "step": 1140 }, { "epoch": 2.23, "grad_norm": 1.5027734373244892, "learning_rate": 4.746264324738215e-06, "loss": 0.1305, "step": 1141 }, { "epoch": 2.23, "grad_norm": 1.4780110512271496, "learning_rate": 4.741803578441318e-06, "loss": 0.0921, "step": 1142 }, { "epoch": 2.23, "grad_norm": 1.5374946741600723, "learning_rate": 4.737340523337393e-06, "loss": 0.121, "step": 1143 }, { "epoch": 2.23, "grad_norm": 1.6181884975560115, "learning_rate": 4.732875167724318e-06, "loss": 0.1356, "step": 1144 }, { "epoch": 2.24, "grad_norm": 1.6957072712387138, "learning_rate": 4.728407519904245e-06, "loss": 0.1248, "step": 1145 }, { "epoch": 2.24, "grad_norm": 1.6337342157634005, "learning_rate": 4.723937588183593e-06, "loss": 0.1326, "step": 1146 }, { "epoch": 2.24, "grad_norm": 1.8924098239916443, "learning_rate": 4.71946538087302e-06, "loss": 0.2055, "step": 1147 }, { "epoch": 2.24, "grad_norm": 1.7168653512632088, "learning_rate": 4.714990906287423e-06, "loss": 0.1289, "step": 1148 }, { "epoch": 2.24, "grad_norm": 1.8300552733871294, "learning_rate": 4.710514172745907e-06, "loss": 0.1317, "step": 1149 }, { "epoch": 2.25, "grad_norm": 1.5968528322932645, "learning_rate": 4.706035188571782e-06, "loss": 0.0939, "step": 1150 }, { "epoch": 2.25, "grad_norm": 1.5982336232278067, "learning_rate": 4.70155396209254e-06, "loss": 0.1527, "step": 1151 }, { "epoch": 2.25, "grad_norm": 1.573542364055737, "learning_rate": 4.697070501639841e-06, "loss": 0.1219, "step": 1152 }, { "epoch": 2.25, "grad_norm": 1.5913331934925568, "learning_rate": 4.692584815549502e-06, "loss": 0.1544, "step": 1153 }, { "epoch": 2.25, "grad_norm": 1.6407493489663525, "learning_rate": 4.688096912161476e-06, "loss": 0.1518, "step": 1154 }, { "epoch": 2.26, "grad_norm": 1.534951674261674, "learning_rate": 4.683606799819838e-06, "loss": 0.1265, "step": 1155 }, { "epoch": 2.26, "grad_norm": 1.5112826462671631, "learning_rate": 4.67911448687277e-06, "loss": 0.0943, "step": 1156 }, { "epoch": 2.26, "grad_norm": 1.5379572650027797, "learning_rate": 4.674619981672548e-06, "loss": 0.1407, "step": 1157 }, { "epoch": 2.26, "grad_norm": 1.4684515908193085, "learning_rate": 4.67012329257552e-06, "loss": 0.1134, "step": 1158 }, { "epoch": 2.26, "grad_norm": 1.5900538947444331, "learning_rate": 4.665624427942096e-06, "loss": 0.1238, "step": 1159 }, { "epoch": 2.27, "grad_norm": 1.7344380442789882, "learning_rate": 4.661123396136733e-06, "loss": 0.1214, "step": 1160 }, { "epoch": 2.27, "grad_norm": 1.6250790143621905, "learning_rate": 4.656620205527914e-06, "loss": 0.1605, "step": 1161 }, { "epoch": 2.27, "grad_norm": 1.5996019782700115, "learning_rate": 4.652114864488136e-06, "loss": 0.1296, "step": 1162 }, { "epoch": 2.27, "grad_norm": 1.577791302317765, "learning_rate": 4.647607381393899e-06, "loss": 0.1218, "step": 1163 }, { "epoch": 2.27, "grad_norm": 1.6830445216947998, "learning_rate": 4.643097764625678e-06, "loss": 0.1255, "step": 1164 }, { "epoch": 2.28, "grad_norm": 1.5448615753279147, "learning_rate": 4.638586022567921e-06, "loss": 0.1226, "step": 1165 }, { "epoch": 2.28, "grad_norm": 1.7871649975649324, "learning_rate": 4.634072163609024e-06, "loss": 0.1049, "step": 1166 }, { "epoch": 2.28, "grad_norm": 1.6135349632617884, "learning_rate": 4.62955619614132e-06, "loss": 0.1314, "step": 1167 }, { "epoch": 2.28, "grad_norm": 1.7815712865083242, "learning_rate": 4.625038128561065e-06, "loss": 0.0864, "step": 1168 }, { "epoch": 2.28, "grad_norm": 1.61630896936375, "learning_rate": 4.620517969268416e-06, "loss": 0.1177, "step": 1169 }, { "epoch": 2.29, "grad_norm": 1.5904457221040738, "learning_rate": 4.615995726667416e-06, "loss": 0.1228, "step": 1170 }, { "epoch": 2.29, "grad_norm": 1.3509162945590072, "learning_rate": 4.61147140916599e-06, "loss": 0.0731, "step": 1171 }, { "epoch": 2.29, "grad_norm": 1.5520836577269157, "learning_rate": 4.606945025175914e-06, "loss": 0.0893, "step": 1172 }, { "epoch": 2.29, "grad_norm": 1.455601723891749, "learning_rate": 4.602416583112809e-06, "loss": 0.1055, "step": 1173 }, { "epoch": 2.29, "grad_norm": 1.6474611030836628, "learning_rate": 4.597886091396121e-06, "loss": 0.0942, "step": 1174 }, { "epoch": 2.29, "grad_norm": 1.6572523445960397, "learning_rate": 4.593353558449106e-06, "loss": 0.0843, "step": 1175 }, { "epoch": 2.3, "grad_norm": 1.7652390915681493, "learning_rate": 4.588818992698818e-06, "loss": 0.1002, "step": 1176 }, { "epoch": 2.3, "grad_norm": 1.6921267994751514, "learning_rate": 4.58428240257609e-06, "loss": 0.1476, "step": 1177 }, { "epoch": 2.3, "grad_norm": 1.6681085316575817, "learning_rate": 4.579743796515515e-06, "loss": 0.1043, "step": 1178 }, { "epoch": 2.3, "grad_norm": 1.5279957180601094, "learning_rate": 4.5752031829554385e-06, "loss": 0.0909, "step": 1179 }, { "epoch": 2.3, "grad_norm": 1.5917076216635946, "learning_rate": 4.570660570337937e-06, "loss": 0.1438, "step": 1180 }, { "epoch": 2.31, "grad_norm": 1.620376001982488, "learning_rate": 4.566115967108803e-06, "loss": 0.1726, "step": 1181 }, { "epoch": 2.31, "grad_norm": 1.5124489860528183, "learning_rate": 4.561569381717531e-06, "loss": 0.1321, "step": 1182 }, { "epoch": 2.31, "grad_norm": 1.5445515353233654, "learning_rate": 4.557020822617298e-06, "loss": 0.1385, "step": 1183 }, { "epoch": 2.31, "grad_norm": 1.4235519315755092, "learning_rate": 4.552470298264956e-06, "loss": 0.0959, "step": 1184 }, { "epoch": 2.31, "grad_norm": 1.4644368070928468, "learning_rate": 4.547917817121006e-06, "loss": 0.0906, "step": 1185 }, { "epoch": 2.32, "grad_norm": 1.655418945833978, "learning_rate": 4.54336338764959e-06, "loss": 0.1167, "step": 1186 }, { "epoch": 2.32, "grad_norm": 1.5188520087778923, "learning_rate": 4.5388070183184695e-06, "loss": 0.096, "step": 1187 }, { "epoch": 2.32, "grad_norm": 1.6545551794130209, "learning_rate": 4.534248717599016e-06, "loss": 0.0931, "step": 1188 }, { "epoch": 2.32, "grad_norm": 1.464693306313088, "learning_rate": 4.52968849396619e-06, "loss": 0.1036, "step": 1189 }, { "epoch": 2.32, "grad_norm": 1.7176345158971886, "learning_rate": 4.525126355898528e-06, "loss": 0.1042, "step": 1190 }, { "epoch": 2.33, "grad_norm": 1.5086307005405708, "learning_rate": 4.520562311878125e-06, "loss": 0.0791, "step": 1191 }, { "epoch": 2.33, "grad_norm": 1.511739021449142, "learning_rate": 4.5159963703906175e-06, "loss": 0.1177, "step": 1192 }, { "epoch": 2.33, "grad_norm": 1.430661698055311, "learning_rate": 4.511428539925177e-06, "loss": 0.1066, "step": 1193 }, { "epoch": 2.33, "grad_norm": 1.6966141875441954, "learning_rate": 4.50685882897448e-06, "loss": 0.0953, "step": 1194 }, { "epoch": 2.33, "grad_norm": 1.6454837782501668, "learning_rate": 4.502287246034701e-06, "loss": 0.0949, "step": 1195 }, { "epoch": 2.34, "grad_norm": 1.7788784279480403, "learning_rate": 4.497713799605498e-06, "loss": 0.1047, "step": 1196 }, { "epoch": 2.34, "grad_norm": 1.5947918121083056, "learning_rate": 4.493138498189989e-06, "loss": 0.1008, "step": 1197 }, { "epoch": 2.34, "grad_norm": 1.746597568457484, "learning_rate": 4.488561350294743e-06, "loss": 0.1151, "step": 1198 }, { "epoch": 2.34, "grad_norm": 1.8662934926991552, "learning_rate": 4.483982364429766e-06, "loss": 0.1072, "step": 1199 }, { "epoch": 2.34, "grad_norm": 1.629114098373168, "learning_rate": 4.479401549108473e-06, "loss": 0.1385, "step": 1200 }, { "epoch": 2.35, "grad_norm": 1.563018875691206, "learning_rate": 4.474818912847685e-06, "loss": 0.0752, "step": 1201 }, { "epoch": 2.35, "grad_norm": 1.8017288250940922, "learning_rate": 4.470234464167612e-06, "loss": 0.1091, "step": 1202 }, { "epoch": 2.35, "grad_norm": 1.6370009099495202, "learning_rate": 4.465648211591828e-06, "loss": 0.0804, "step": 1203 }, { "epoch": 2.35, "grad_norm": 1.5107555207093704, "learning_rate": 4.4610601636472636e-06, "loss": 0.1004, "step": 1204 }, { "epoch": 2.35, "grad_norm": 1.5783715871540327, "learning_rate": 4.456470328864186e-06, "loss": 0.0945, "step": 1205 }, { "epoch": 2.36, "grad_norm": 1.6122379272648044, "learning_rate": 4.451878715776184e-06, "loss": 0.0863, "step": 1206 }, { "epoch": 2.36, "grad_norm": 1.6023142582275327, "learning_rate": 4.447285332920157e-06, "loss": 0.0689, "step": 1207 }, { "epoch": 2.36, "grad_norm": 1.6269910982331894, "learning_rate": 4.442690188836292e-06, "loss": 0.1174, "step": 1208 }, { "epoch": 2.36, "grad_norm": 1.5979481883730748, "learning_rate": 4.438093292068047e-06, "loss": 0.113, "step": 1209 }, { "epoch": 2.36, "grad_norm": 1.6090095741398347, "learning_rate": 4.433494651162144e-06, "loss": 0.1022, "step": 1210 }, { "epoch": 2.37, "grad_norm": 1.35593311770173, "learning_rate": 4.428894274668547e-06, "loss": 0.0777, "step": 1211 }, { "epoch": 2.37, "grad_norm": 1.556818357869746, "learning_rate": 4.424292171140445e-06, "loss": 0.1139, "step": 1212 }, { "epoch": 2.37, "grad_norm": 1.5976918577440438, "learning_rate": 4.419688349134237e-06, "loss": 0.1168, "step": 1213 }, { "epoch": 2.37, "grad_norm": 1.614502124793931, "learning_rate": 4.4150828172095205e-06, "loss": 0.1141, "step": 1214 }, { "epoch": 2.37, "grad_norm": 1.70681397490951, "learning_rate": 4.410475583929069e-06, "loss": 0.0726, "step": 1215 }, { "epoch": 2.38, "grad_norm": 1.813154219993849, "learning_rate": 4.405866657858823e-06, "loss": 0.0888, "step": 1216 }, { "epoch": 2.38, "grad_norm": 1.7002112793967525, "learning_rate": 4.401256047567866e-06, "loss": 0.1033, "step": 1217 }, { "epoch": 2.38, "grad_norm": 1.6153036740664126, "learning_rate": 4.396643761628414e-06, "loss": 0.0833, "step": 1218 }, { "epoch": 2.38, "grad_norm": 1.4666844632882907, "learning_rate": 4.392029808615802e-06, "loss": 0.0677, "step": 1219 }, { "epoch": 2.38, "grad_norm": 1.6570145428791507, "learning_rate": 4.387414197108459e-06, "loss": 0.07, "step": 1220 }, { "epoch": 2.38, "grad_norm": 1.6109394372357206, "learning_rate": 4.382796935687899e-06, "loss": 0.0857, "step": 1221 }, { "epoch": 2.39, "grad_norm": 1.507579280376353, "learning_rate": 4.378178032938711e-06, "loss": 0.0734, "step": 1222 }, { "epoch": 2.39, "grad_norm": 1.5653955909206914, "learning_rate": 4.373557497448522e-06, "loss": 0.0668, "step": 1223 }, { "epoch": 2.39, "grad_norm": 1.5931029368809095, "learning_rate": 4.368935337808006e-06, "loss": 0.0862, "step": 1224 }, { "epoch": 2.39, "grad_norm": 2.814341030962141, "learning_rate": 4.364311562610854e-06, "loss": 0.089, "step": 1225 }, { "epoch": 2.39, "grad_norm": 1.641984677747962, "learning_rate": 4.359686180453757e-06, "loss": 0.0932, "step": 1226 }, { "epoch": 2.4, "grad_norm": 2.0647044362527174, "learning_rate": 4.355059199936396e-06, "loss": 0.0769, "step": 1227 }, { "epoch": 2.4, "grad_norm": 1.6615778982679232, "learning_rate": 4.350430629661424e-06, "loss": 0.0884, "step": 1228 }, { "epoch": 2.4, "grad_norm": 1.7045372702131183, "learning_rate": 4.34580047823445e-06, "loss": 0.0624, "step": 1229 }, { "epoch": 2.4, "grad_norm": 1.8892537905946163, "learning_rate": 4.341168754264023e-06, "loss": 0.0936, "step": 1230 }, { "epoch": 2.4, "grad_norm": 1.3936387878501884, "learning_rate": 4.336535466361616e-06, "loss": 0.0619, "step": 1231 }, { "epoch": 2.41, "grad_norm": 1.6617167288013397, "learning_rate": 4.3319006231416055e-06, "loss": 0.0897, "step": 1232 }, { "epoch": 2.41, "grad_norm": 1.700580916776741, "learning_rate": 4.327264233221266e-06, "loss": 0.0922, "step": 1233 }, { "epoch": 2.41, "grad_norm": 1.638319497394559, "learning_rate": 4.3226263052207435e-06, "loss": 0.1045, "step": 1234 }, { "epoch": 2.41, "grad_norm": 1.5978598201574785, "learning_rate": 4.317986847763045e-06, "loss": 0.1001, "step": 1235 }, { "epoch": 2.41, "grad_norm": 1.5832122655240417, "learning_rate": 4.313345869474022e-06, "loss": 0.1202, "step": 1236 }, { "epoch": 2.42, "grad_norm": 1.5534135030877572, "learning_rate": 4.308703378982349e-06, "loss": 0.0696, "step": 1237 }, { "epoch": 2.42, "grad_norm": 1.905759782748856, "learning_rate": 4.3040593849195195e-06, "loss": 0.0858, "step": 1238 }, { "epoch": 2.42, "grad_norm": 1.5485769209262523, "learning_rate": 4.299413895919817e-06, "loss": 0.1231, "step": 1239 }, { "epoch": 2.42, "grad_norm": 1.8606474866966884, "learning_rate": 4.294766920620306e-06, "loss": 0.0776, "step": 1240 }, { "epoch": 2.42, "grad_norm": 1.4278919887689812, "learning_rate": 4.290118467660815e-06, "loss": 0.0964, "step": 1241 }, { "epoch": 2.43, "grad_norm": 1.6825771883432121, "learning_rate": 4.285468545683919e-06, "loss": 0.0617, "step": 1242 }, { "epoch": 2.43, "grad_norm": 1.5769328651190504, "learning_rate": 4.280817163334925e-06, "loss": 0.0781, "step": 1243 }, { "epoch": 2.43, "grad_norm": 1.3910006767998297, "learning_rate": 4.276164329261853e-06, "loss": 0.0956, "step": 1244 }, { "epoch": 2.43, "grad_norm": 1.5200623108584495, "learning_rate": 4.2715100521154245e-06, "loss": 0.1164, "step": 1245 }, { "epoch": 2.43, "grad_norm": 1.5531363794412438, "learning_rate": 4.266854340549044e-06, "loss": 0.0869, "step": 1246 }, { "epoch": 2.44, "grad_norm": 1.5932897215923345, "learning_rate": 4.262197203218782e-06, "loss": 0.0936, "step": 1247 }, { "epoch": 2.44, "grad_norm": 1.552727428508241, "learning_rate": 4.25753864878336e-06, "loss": 0.0899, "step": 1248 }, { "epoch": 2.44, "grad_norm": 1.6505093409426959, "learning_rate": 4.252878685904134e-06, "loss": 0.101, "step": 1249 }, { "epoch": 2.44, "grad_norm": 1.5177029932285184, "learning_rate": 4.248217323245079e-06, "loss": 0.0661, "step": 1250 }, { "epoch": 2.44, "grad_norm": 1.4788186599371305, "learning_rate": 4.243554569472773e-06, "loss": 0.079, "step": 1251 }, { "epoch": 2.45, "grad_norm": 1.5919180283888381, "learning_rate": 4.238890433256378e-06, "loss": 0.0758, "step": 1252 }, { "epoch": 2.45, "grad_norm": 1.8023061016460107, "learning_rate": 4.234224923267631e-06, "loss": 0.0999, "step": 1253 }, { "epoch": 2.45, "grad_norm": 1.6663506871631273, "learning_rate": 4.2295580481808165e-06, "loss": 0.1044, "step": 1254 }, { "epoch": 2.45, "grad_norm": 1.9074736606695508, "learning_rate": 4.224889816672765e-06, "loss": 0.1066, "step": 1255 }, { "epoch": 2.45, "grad_norm": 1.6938047969234558, "learning_rate": 4.220220237422822e-06, "loss": 0.1021, "step": 1256 }, { "epoch": 2.46, "grad_norm": 1.4044711219622936, "learning_rate": 4.215549319112843e-06, "loss": 0.0607, "step": 1257 }, { "epoch": 2.46, "grad_norm": 1.5004150694150318, "learning_rate": 4.21087707042717e-06, "loss": 0.1006, "step": 1258 }, { "epoch": 2.46, "grad_norm": 1.543272927876705, "learning_rate": 4.206203500052622e-06, "loss": 0.0877, "step": 1259 }, { "epoch": 2.46, "grad_norm": 1.6656116670155217, "learning_rate": 4.201528616678472e-06, "loss": 0.1269, "step": 1260 }, { "epoch": 2.46, "grad_norm": 1.615799040226302, "learning_rate": 4.196852428996435e-06, "loss": 0.1128, "step": 1261 }, { "epoch": 2.46, "grad_norm": 1.5338715272079022, "learning_rate": 4.192174945700656e-06, "loss": 0.1089, "step": 1262 }, { "epoch": 2.47, "grad_norm": 1.4409547028565863, "learning_rate": 4.187496175487679e-06, "loss": 0.0904, "step": 1263 }, { "epoch": 2.47, "grad_norm": 1.7234539721917246, "learning_rate": 4.182816127056449e-06, "loss": 0.1068, "step": 1264 }, { "epoch": 2.47, "grad_norm": 1.6033427468640205, "learning_rate": 4.178134809108284e-06, "loss": 0.1004, "step": 1265 }, { "epoch": 2.47, "grad_norm": 1.4274584588308663, "learning_rate": 4.173452230346864e-06, "loss": 0.1056, "step": 1266 }, { "epoch": 2.47, "grad_norm": 1.5665633910713708, "learning_rate": 4.168768399478211e-06, "loss": 0.1072, "step": 1267 }, { "epoch": 2.48, "grad_norm": 1.3000577156577677, "learning_rate": 4.1640833252106775e-06, "loss": 0.0726, "step": 1268 }, { "epoch": 2.48, "grad_norm": 1.4875679082641264, "learning_rate": 4.1593970162549244e-06, "loss": 0.0934, "step": 1269 }, { "epoch": 2.48, "grad_norm": 1.5778956928473644, "learning_rate": 4.154709481323912e-06, "loss": 0.1206, "step": 1270 }, { "epoch": 2.48, "grad_norm": 1.4905611199961823, "learning_rate": 4.150020729132878e-06, "loss": 0.0889, "step": 1271 }, { "epoch": 2.48, "grad_norm": 1.429629820744158, "learning_rate": 4.1453307683993216e-06, "loss": 0.0745, "step": 1272 }, { "epoch": 2.49, "grad_norm": 1.6825654524383553, "learning_rate": 4.140639607842994e-06, "loss": 0.0769, "step": 1273 }, { "epoch": 2.49, "grad_norm": 1.6509976980827084, "learning_rate": 4.135947256185871e-06, "loss": 0.0892, "step": 1274 }, { "epoch": 2.49, "grad_norm": 1.5404148459823537, "learning_rate": 4.131253722152147e-06, "loss": 0.0982, "step": 1275 }, { "epoch": 2.49, "grad_norm": 1.6528169654238793, "learning_rate": 4.1265590144682155e-06, "loss": 0.0755, "step": 1276 }, { "epoch": 2.49, "grad_norm": 1.493097150861608, "learning_rate": 4.121863141862647e-06, "loss": 0.099, "step": 1277 }, { "epoch": 2.5, "grad_norm": 1.772118374676995, "learning_rate": 4.117166113066182e-06, "loss": 0.0875, "step": 1278 }, { "epoch": 2.5, "grad_norm": 1.616600941710714, "learning_rate": 4.112467936811711e-06, "loss": 0.1092, "step": 1279 }, { "epoch": 2.5, "grad_norm": 1.914037869551319, "learning_rate": 4.107768621834257e-06, "loss": 0.118, "step": 1280 }, { "epoch": 2.5, "grad_norm": 1.6451342181146154, "learning_rate": 4.103068176870958e-06, "loss": 0.0928, "step": 1281 }, { "epoch": 2.5, "grad_norm": 1.6773008180613778, "learning_rate": 4.098366610661054e-06, "loss": 0.1059, "step": 1282 }, { "epoch": 2.51, "grad_norm": 1.563694332511682, "learning_rate": 4.093663931945873e-06, "loss": 0.1004, "step": 1283 }, { "epoch": 2.51, "grad_norm": 1.3182526101502152, "learning_rate": 4.088960149468808e-06, "loss": 0.0815, "step": 1284 }, { "epoch": 2.51, "grad_norm": 1.5768481931103118, "learning_rate": 4.084255271975304e-06, "loss": 0.1014, "step": 1285 }, { "epoch": 2.51, "grad_norm": 1.450375849005317, "learning_rate": 4.079549308212843e-06, "loss": 0.0819, "step": 1286 }, { "epoch": 2.51, "grad_norm": 1.4275870788489, "learning_rate": 4.074842266930927e-06, "loss": 0.0819, "step": 1287 }, { "epoch": 2.52, "grad_norm": 1.482274018652211, "learning_rate": 4.070134156881061e-06, "loss": 0.0942, "step": 1288 }, { "epoch": 2.52, "grad_norm": 1.5779972034193117, "learning_rate": 4.065424986816736e-06, "loss": 0.0858, "step": 1289 }, { "epoch": 2.52, "grad_norm": 1.515240084449149, "learning_rate": 4.060714765493415e-06, "loss": 0.0693, "step": 1290 }, { "epoch": 2.52, "grad_norm": 1.450682036353589, "learning_rate": 4.0560035016685145e-06, "loss": 0.0688, "step": 1291 }, { "epoch": 2.52, "grad_norm": 1.4982481925203317, "learning_rate": 4.051291204101393e-06, "loss": 0.091, "step": 1292 }, { "epoch": 2.53, "grad_norm": 1.5845798814623173, "learning_rate": 4.046577881553324e-06, "loss": 0.0954, "step": 1293 }, { "epoch": 2.53, "grad_norm": 1.4056478319217285, "learning_rate": 4.041863542787494e-06, "loss": 0.0949, "step": 1294 }, { "epoch": 2.53, "grad_norm": 1.6345430048292255, "learning_rate": 4.037148196568974e-06, "loss": 0.0754, "step": 1295 }, { "epoch": 2.53, "grad_norm": 1.6652048543449818, "learning_rate": 4.032431851664708e-06, "loss": 0.09, "step": 1296 }, { "epoch": 2.53, "grad_norm": 1.4217249444217572, "learning_rate": 4.027714516843502e-06, "loss": 0.0845, "step": 1297 }, { "epoch": 2.54, "grad_norm": 1.5412319931415714, "learning_rate": 4.022996200875997e-06, "loss": 0.0719, "step": 1298 }, { "epoch": 2.54, "grad_norm": 1.614500954057609, "learning_rate": 4.01827691253466e-06, "loss": 0.0897, "step": 1299 }, { "epoch": 2.54, "grad_norm": 1.4765433170221693, "learning_rate": 4.013556660593766e-06, "loss": 0.0647, "step": 1300 }, { "epoch": 2.54, "grad_norm": 1.6022054211221268, "learning_rate": 4.008835453829383e-06, "loss": 0.0661, "step": 1301 }, { "epoch": 2.54, "grad_norm": 1.6871078315485823, "learning_rate": 4.004113301019351e-06, "loss": 0.0781, "step": 1302 }, { "epoch": 2.54, "grad_norm": 1.7190978652211026, "learning_rate": 3.999390210943271e-06, "loss": 0.0853, "step": 1303 }, { "epoch": 2.55, "grad_norm": 1.705560393349106, "learning_rate": 3.9946661923824864e-06, "loss": 0.0907, "step": 1304 }, { "epoch": 2.55, "grad_norm": 1.7954271083799092, "learning_rate": 3.989941254120068e-06, "loss": 0.1082, "step": 1305 }, { "epoch": 2.55, "grad_norm": 1.6294319813781692, "learning_rate": 3.9852154049407935e-06, "loss": 0.0957, "step": 1306 }, { "epoch": 2.55, "grad_norm": 1.5166030892733473, "learning_rate": 3.980488653631138e-06, "loss": 0.066, "step": 1307 }, { "epoch": 2.55, "grad_norm": 1.6449216996522613, "learning_rate": 3.97576100897925e-06, "loss": 0.1141, "step": 1308 }, { "epoch": 2.56, "grad_norm": 1.7163725972229007, "learning_rate": 3.9710324797749415e-06, "loss": 0.0659, "step": 1309 }, { "epoch": 2.56, "grad_norm": 1.7399322831624398, "learning_rate": 3.96630307480967e-06, "loss": 0.113, "step": 1310 }, { "epoch": 2.56, "grad_norm": 1.89026656002876, "learning_rate": 3.961572802876516e-06, "loss": 0.0844, "step": 1311 }, { "epoch": 2.56, "grad_norm": 1.6262549898899294, "learning_rate": 3.956841672770181e-06, "loss": 0.1051, "step": 1312 }, { "epoch": 2.56, "grad_norm": 1.619807569035199, "learning_rate": 3.952109693286952e-06, "loss": 0.0935, "step": 1313 }, { "epoch": 2.57, "grad_norm": 1.5135948744036394, "learning_rate": 3.947376873224701e-06, "loss": 0.0919, "step": 1314 }, { "epoch": 2.57, "grad_norm": 1.538787378032846, "learning_rate": 3.942643221382863e-06, "loss": 0.0973, "step": 1315 }, { "epoch": 2.57, "grad_norm": 1.675385828791224, "learning_rate": 3.937908746562417e-06, "loss": 0.0773, "step": 1316 }, { "epoch": 2.57, "grad_norm": 1.4811737825639781, "learning_rate": 3.9331734575658735e-06, "loss": 0.0776, "step": 1317 }, { "epoch": 2.57, "grad_norm": 1.3769020022398384, "learning_rate": 3.928437363197257e-06, "loss": 0.0523, "step": 1318 }, { "epoch": 2.58, "grad_norm": 1.518495724016738, "learning_rate": 3.923700472262088e-06, "loss": 0.072, "step": 1319 }, { "epoch": 2.58, "grad_norm": 1.4086590748396146, "learning_rate": 3.918962793567368e-06, "loss": 0.0713, "step": 1320 }, { "epoch": 2.58, "grad_norm": 1.5429874726724162, "learning_rate": 3.914224335921568e-06, "loss": 0.0776, "step": 1321 }, { "epoch": 2.58, "grad_norm": 1.4672598776721533, "learning_rate": 3.909485108134598e-06, "loss": 0.0709, "step": 1322 }, { "epoch": 2.58, "grad_norm": 1.4327867145715625, "learning_rate": 3.90474511901781e-06, "loss": 0.0721, "step": 1323 }, { "epoch": 2.59, "grad_norm": 1.6176212130951308, "learning_rate": 3.900004377383963e-06, "loss": 0.0874, "step": 1324 }, { "epoch": 2.59, "grad_norm": 1.681839751302704, "learning_rate": 3.89526289204722e-06, "loss": 0.0592, "step": 1325 }, { "epoch": 2.59, "grad_norm": 1.6635392583864905, "learning_rate": 3.890520671823126e-06, "loss": 0.0769, "step": 1326 }, { "epoch": 2.59, "grad_norm": 1.7830793792764248, "learning_rate": 3.8857777255285915e-06, "loss": 0.0876, "step": 1327 }, { "epoch": 2.59, "grad_norm": 1.943921161991667, "learning_rate": 3.881034061981876e-06, "loss": 0.1057, "step": 1328 }, { "epoch": 2.6, "grad_norm": 1.4202983985154911, "learning_rate": 3.876289690002576e-06, "loss": 0.0747, "step": 1329 }, { "epoch": 2.6, "grad_norm": 1.5996552330500118, "learning_rate": 3.871544618411602e-06, "loss": 0.0787, "step": 1330 }, { "epoch": 2.6, "grad_norm": 1.8030699847067735, "learning_rate": 3.866798856031164e-06, "loss": 0.0801, "step": 1331 }, { "epoch": 2.6, "grad_norm": 1.5836196103164526, "learning_rate": 3.862052411684763e-06, "loss": 0.0568, "step": 1332 }, { "epoch": 2.6, "grad_norm": 1.5759366312530885, "learning_rate": 3.85730529419716e-06, "loss": 0.0726, "step": 1333 }, { "epoch": 2.61, "grad_norm": 1.632000752359195, "learning_rate": 3.852557512394371e-06, "loss": 0.1024, "step": 1334 }, { "epoch": 2.61, "grad_norm": 1.600885037480587, "learning_rate": 3.8478090751036495e-06, "loss": 0.0985, "step": 1335 }, { "epoch": 2.61, "grad_norm": 1.6251312286566884, "learning_rate": 3.843059991153463e-06, "loss": 0.0862, "step": 1336 }, { "epoch": 2.61, "grad_norm": 1.6695285642084863, "learning_rate": 3.838310269373483e-06, "loss": 0.1136, "step": 1337 }, { "epoch": 2.61, "grad_norm": 1.5336377477749183, "learning_rate": 3.83355991859457e-06, "loss": 0.1074, "step": 1338 }, { "epoch": 2.62, "grad_norm": 1.6084728726586612, "learning_rate": 3.828808947648751e-06, "loss": 0.1012, "step": 1339 }, { "epoch": 2.62, "grad_norm": 1.397024996610413, "learning_rate": 3.824057365369205e-06, "loss": 0.0465, "step": 1340 }, { "epoch": 2.62, "grad_norm": 1.4768362433721565, "learning_rate": 3.8193051805902496e-06, "loss": 0.0726, "step": 1341 }, { "epoch": 2.62, "grad_norm": 1.4495213464951184, "learning_rate": 3.8145524021473225e-06, "loss": 0.0793, "step": 1342 }, { "epoch": 2.62, "grad_norm": 1.5905547539298148, "learning_rate": 3.809799038876965e-06, "loss": 0.0918, "step": 1343 }, { "epoch": 2.62, "grad_norm": 1.6398457047425532, "learning_rate": 3.805045099616804e-06, "loss": 0.0764, "step": 1344 }, { "epoch": 2.63, "grad_norm": 1.3889916843546588, "learning_rate": 3.80029059320554e-06, "loss": 0.0738, "step": 1345 }, { "epoch": 2.63, "grad_norm": 1.5051781171076393, "learning_rate": 3.7955355284829257e-06, "loss": 0.0651, "step": 1346 }, { "epoch": 2.63, "grad_norm": 1.5371044470409079, "learning_rate": 3.7907799142897547e-06, "loss": 0.0906, "step": 1347 }, { "epoch": 2.63, "grad_norm": 1.565404078994156, "learning_rate": 3.786023759467839e-06, "loss": 0.0732, "step": 1348 }, { "epoch": 2.63, "grad_norm": 1.5723332809694566, "learning_rate": 3.7812670728599973e-06, "loss": 0.086, "step": 1349 }, { "epoch": 2.64, "grad_norm": 1.8106740701382753, "learning_rate": 3.776509863310037e-06, "loss": 0.0795, "step": 1350 }, { "epoch": 2.64, "grad_norm": 1.6488618265904378, "learning_rate": 3.771752139662736e-06, "loss": 0.0622, "step": 1351 }, { "epoch": 2.64, "grad_norm": 1.7921064874356765, "learning_rate": 3.766993910763834e-06, "loss": 0.1076, "step": 1352 }, { "epoch": 2.64, "grad_norm": 1.8036123473469177, "learning_rate": 3.7622351854600005e-06, "loss": 0.0838, "step": 1353 }, { "epoch": 2.64, "grad_norm": 1.8970371799347665, "learning_rate": 3.7574759725988363e-06, "loss": 0.1109, "step": 1354 }, { "epoch": 2.65, "grad_norm": 1.6193212993312156, "learning_rate": 3.7527162810288446e-06, "loss": 0.0913, "step": 1355 }, { "epoch": 2.65, "grad_norm": 2.0081232951211514, "learning_rate": 3.7479561195994195e-06, "loss": 0.1033, "step": 1356 }, { "epoch": 2.65, "grad_norm": 1.6397136512585018, "learning_rate": 3.743195497160829e-06, "loss": 0.0982, "step": 1357 }, { "epoch": 2.65, "grad_norm": 1.6805270220526716, "learning_rate": 3.7384344225641987e-06, "loss": 0.1043, "step": 1358 }, { "epoch": 2.65, "grad_norm": 1.5275850918530345, "learning_rate": 3.733672904661492e-06, "loss": 0.0898, "step": 1359 }, { "epoch": 2.66, "grad_norm": 1.5622274322513239, "learning_rate": 3.728910952305501e-06, "loss": 0.1016, "step": 1360 }, { "epoch": 2.66, "grad_norm": 1.5472621700842952, "learning_rate": 3.724148574349822e-06, "loss": 0.0947, "step": 1361 }, { "epoch": 2.66, "grad_norm": 1.4582908094624216, "learning_rate": 3.719385779648844e-06, "loss": 0.0921, "step": 1362 }, { "epoch": 2.66, "grad_norm": 1.4042989242982529, "learning_rate": 3.71462257705773e-06, "loss": 0.07, "step": 1363 }, { "epoch": 2.66, "grad_norm": 1.5080154116361943, "learning_rate": 3.7098589754324037e-06, "loss": 0.1037, "step": 1364 }, { "epoch": 2.67, "grad_norm": 1.5149558972843244, "learning_rate": 3.7050949836295268e-06, "loss": 0.1046, "step": 1365 }, { "epoch": 2.67, "grad_norm": 1.3522044923413263, "learning_rate": 3.700330610506491e-06, "loss": 0.0699, "step": 1366 }, { "epoch": 2.67, "grad_norm": 1.5003098280896372, "learning_rate": 3.695565864921392e-06, "loss": 0.102, "step": 1367 }, { "epoch": 2.67, "grad_norm": 1.502160983482869, "learning_rate": 3.6908007557330225e-06, "loss": 0.1411, "step": 1368 }, { "epoch": 2.67, "grad_norm": 1.3579910573849188, "learning_rate": 3.6860352918008482e-06, "loss": 0.0974, "step": 1369 }, { "epoch": 2.68, "grad_norm": 1.5030755484541507, "learning_rate": 3.6812694819849964e-06, "loss": 0.0857, "step": 1370 }, { "epoch": 2.68, "grad_norm": 1.5470985049717183, "learning_rate": 3.6765033351462366e-06, "loss": 0.0739, "step": 1371 }, { "epoch": 2.68, "grad_norm": 1.3956211994745589, "learning_rate": 3.6717368601459635e-06, "loss": 0.1023, "step": 1372 }, { "epoch": 2.68, "grad_norm": 1.5166210712309423, "learning_rate": 3.6669700658461837e-06, "loss": 0.0885, "step": 1373 }, { "epoch": 2.68, "grad_norm": 1.3966836926383681, "learning_rate": 3.662202961109498e-06, "loss": 0.061, "step": 1374 }, { "epoch": 2.69, "grad_norm": 1.3703293505939813, "learning_rate": 3.657435554799083e-06, "loss": 0.0636, "step": 1375 }, { "epoch": 2.69, "grad_norm": 1.5819455402003142, "learning_rate": 3.6526678557786763e-06, "loss": 0.0725, "step": 1376 }, { "epoch": 2.69, "grad_norm": 1.5065839085694765, "learning_rate": 3.6478998729125588e-06, "loss": 0.0923, "step": 1377 }, { "epoch": 2.69, "grad_norm": 1.5369804678678336, "learning_rate": 3.643131615065542e-06, "loss": 0.0835, "step": 1378 }, { "epoch": 2.69, "grad_norm": 1.314627961273215, "learning_rate": 3.6383630911029457e-06, "loss": 0.0798, "step": 1379 }, { "epoch": 2.7, "grad_norm": 1.765590272921791, "learning_rate": 3.633594309890586e-06, "loss": 0.0824, "step": 1380 }, { "epoch": 2.7, "grad_norm": 1.3664574761050245, "learning_rate": 3.628825280294756e-06, "loss": 0.0629, "step": 1381 }, { "epoch": 2.7, "grad_norm": 1.602454324156175, "learning_rate": 3.6240560111822124e-06, "loss": 0.0913, "step": 1382 }, { "epoch": 2.7, "grad_norm": 1.6946998883169975, "learning_rate": 3.619286511420156e-06, "loss": 0.0816, "step": 1383 }, { "epoch": 2.7, "grad_norm": 1.5651088921314105, "learning_rate": 3.6145167898762167e-06, "loss": 0.0914, "step": 1384 }, { "epoch": 2.71, "grad_norm": 1.5964553503670587, "learning_rate": 3.609746855418437e-06, "loss": 0.0904, "step": 1385 }, { "epoch": 2.71, "grad_norm": 1.4404841914556237, "learning_rate": 3.6049767169152543e-06, "loss": 0.0599, "step": 1386 }, { "epoch": 2.71, "grad_norm": 1.6822360193765002, "learning_rate": 3.6002063832354873e-06, "loss": 0.1113, "step": 1387 }, { "epoch": 2.71, "grad_norm": 1.7162532761375442, "learning_rate": 3.595435863248315e-06, "loss": 0.1045, "step": 1388 }, { "epoch": 2.71, "grad_norm": 1.6119329253617363, "learning_rate": 3.5906651658232647e-06, "loss": 0.0695, "step": 1389 }, { "epoch": 2.71, "grad_norm": 1.4964261571746102, "learning_rate": 3.585894299830193e-06, "loss": 0.0867, "step": 1390 }, { "epoch": 2.72, "grad_norm": 1.605352110699925, "learning_rate": 3.5811232741392703e-06, "loss": 0.0969, "step": 1391 }, { "epoch": 2.72, "grad_norm": 1.5986580397783603, "learning_rate": 3.576352097620964e-06, "loss": 0.0698, "step": 1392 }, { "epoch": 2.72, "grad_norm": 1.532180970412994, "learning_rate": 3.571580779146021e-06, "loss": 0.1099, "step": 1393 }, { "epoch": 2.72, "grad_norm": 1.4631270473569253, "learning_rate": 3.5668093275854518e-06, "loss": 0.0627, "step": 1394 }, { "epoch": 2.72, "grad_norm": 1.5863815474764567, "learning_rate": 3.5620377518105167e-06, "loss": 0.0547, "step": 1395 }, { "epoch": 2.73, "grad_norm": 1.6996278761358348, "learning_rate": 3.557266060692704e-06, "loss": 0.0828, "step": 1396 }, { "epoch": 2.73, "grad_norm": 1.2845276133280015, "learning_rate": 3.5524942631037195e-06, "loss": 0.0456, "step": 1397 }, { "epoch": 2.73, "grad_norm": 1.5946600851774562, "learning_rate": 3.547722367915463e-06, "loss": 0.0741, "step": 1398 }, { "epoch": 2.73, "grad_norm": 1.4928068462105315, "learning_rate": 3.5429503840000197e-06, "loss": 0.0876, "step": 1399 }, { "epoch": 2.73, "grad_norm": 1.542260987058057, "learning_rate": 3.5381783202296382e-06, "loss": 0.0899, "step": 1400 }, { "epoch": 2.74, "grad_norm": 1.4153127613085184, "learning_rate": 3.533406185476716e-06, "loss": 0.0784, "step": 1401 }, { "epoch": 2.74, "grad_norm": 1.603098686012174, "learning_rate": 3.5286339886137804e-06, "loss": 0.0745, "step": 1402 }, { "epoch": 2.74, "grad_norm": 1.6740603110938534, "learning_rate": 3.5238617385134766e-06, "loss": 0.1041, "step": 1403 }, { "epoch": 2.74, "grad_norm": 1.4107503436358924, "learning_rate": 3.5190894440485483e-06, "loss": 0.0549, "step": 1404 }, { "epoch": 2.74, "grad_norm": 1.7269094888104461, "learning_rate": 3.5143171140918213e-06, "loss": 0.0807, "step": 1405 }, { "epoch": 2.75, "grad_norm": 1.5855305939650723, "learning_rate": 3.509544757516189e-06, "loss": 0.0877, "step": 1406 }, { "epoch": 2.75, "grad_norm": 1.5995714063500692, "learning_rate": 3.5047723831945895e-06, "loss": 0.1035, "step": 1407 }, { "epoch": 2.75, "grad_norm": 1.6076247748636305, "learning_rate": 3.5e-06, "loss": 0.1007, "step": 1408 }, { "epoch": 2.75, "grad_norm": 1.6532323251265915, "learning_rate": 3.4952276168054104e-06, "loss": 0.0843, "step": 1409 }, { "epoch": 2.75, "grad_norm": 1.5635482531941574, "learning_rate": 3.490455242483811e-06, "loss": 0.0856, "step": 1410 }, { "epoch": 2.76, "grad_norm": 1.5570512836766586, "learning_rate": 3.485682885908178e-06, "loss": 0.0948, "step": 1411 }, { "epoch": 2.76, "grad_norm": 1.651493212004195, "learning_rate": 3.480910555951451e-06, "loss": 0.0845, "step": 1412 }, { "epoch": 2.76, "grad_norm": 1.3947989403942405, "learning_rate": 3.476138261486524e-06, "loss": 0.0703, "step": 1413 }, { "epoch": 2.76, "grad_norm": 1.353321536029836, "learning_rate": 3.471366011386221e-06, "loss": 0.0761, "step": 1414 }, { "epoch": 2.76, "grad_norm": 1.522548959206914, "learning_rate": 3.466593814523285e-06, "loss": 0.0851, "step": 1415 }, { "epoch": 2.77, "grad_norm": 1.485714242475294, "learning_rate": 3.461821679770362e-06, "loss": 0.0916, "step": 1416 }, { "epoch": 2.77, "grad_norm": 1.5204380973166813, "learning_rate": 3.4570496159999806e-06, "loss": 0.1107, "step": 1417 }, { "epoch": 2.77, "grad_norm": 1.47042957663359, "learning_rate": 3.452277632084538e-06, "loss": 0.1134, "step": 1418 }, { "epoch": 2.77, "grad_norm": 1.3555010324135282, "learning_rate": 3.4475057368962812e-06, "loss": 0.0798, "step": 1419 }, { "epoch": 2.77, "grad_norm": 1.5669692407337088, "learning_rate": 3.442733939307296e-06, "loss": 0.0929, "step": 1420 }, { "epoch": 2.78, "grad_norm": 1.4386245464235201, "learning_rate": 3.4379622481894836e-06, "loss": 0.0597, "step": 1421 }, { "epoch": 2.78, "grad_norm": 1.5911937959563602, "learning_rate": 3.433190672414549e-06, "loss": 0.1114, "step": 1422 }, { "epoch": 2.78, "grad_norm": 1.4907181551164383, "learning_rate": 3.4284192208539802e-06, "loss": 0.0811, "step": 1423 }, { "epoch": 2.78, "grad_norm": 1.3867999991765652, "learning_rate": 3.4236479023790363e-06, "loss": 0.0967, "step": 1424 }, { "epoch": 2.78, "grad_norm": 1.5151677204761602, "learning_rate": 3.4188767258607296e-06, "loss": 0.0981, "step": 1425 }, { "epoch": 2.79, "grad_norm": 1.4239114935437491, "learning_rate": 3.4141057001698072e-06, "loss": 0.0928, "step": 1426 }, { "epoch": 2.79, "grad_norm": 1.548373199968855, "learning_rate": 3.4093348341767356e-06, "loss": 0.1229, "step": 1427 }, { "epoch": 2.79, "grad_norm": 1.4468213286228362, "learning_rate": 3.4045641367516857e-06, "loss": 0.0887, "step": 1428 }, { "epoch": 2.79, "grad_norm": 1.528769752967868, "learning_rate": 3.3997936167645135e-06, "loss": 0.0726, "step": 1429 }, { "epoch": 2.79, "grad_norm": 1.422037594502457, "learning_rate": 3.395023283084745e-06, "loss": 0.062, "step": 1430 }, { "epoch": 2.79, "grad_norm": 1.5045544699989195, "learning_rate": 3.3902531445815646e-06, "loss": 0.0811, "step": 1431 }, { "epoch": 2.8, "grad_norm": 1.535451596451779, "learning_rate": 3.3854832101237836e-06, "loss": 0.0929, "step": 1432 }, { "epoch": 2.8, "grad_norm": 1.4739336610684732, "learning_rate": 3.3807134885798448e-06, "loss": 0.089, "step": 1433 }, { "epoch": 2.8, "grad_norm": 1.6991321164538455, "learning_rate": 3.3759439888177883e-06, "loss": 0.1055, "step": 1434 }, { "epoch": 2.8, "grad_norm": 1.562911931549612, "learning_rate": 3.3711747197052438e-06, "loss": 0.0781, "step": 1435 }, { "epoch": 2.8, "grad_norm": 1.6109288491342446, "learning_rate": 3.366405690109414e-06, "loss": 0.11, "step": 1436 }, { "epoch": 2.81, "grad_norm": 1.4070055280292186, "learning_rate": 3.3616369088970542e-06, "loss": 0.0473, "step": 1437 }, { "epoch": 2.81, "grad_norm": 1.6120989023691683, "learning_rate": 3.3568683849344583e-06, "loss": 0.0566, "step": 1438 }, { "epoch": 2.81, "grad_norm": 1.473573401279048, "learning_rate": 3.3521001270874403e-06, "loss": 0.0971, "step": 1439 }, { "epoch": 2.81, "grad_norm": 1.4382563529375547, "learning_rate": 3.3473321442213245e-06, "loss": 0.0741, "step": 1440 }, { "epoch": 2.81, "grad_norm": 1.5059714265209942, "learning_rate": 3.342564445200917e-06, "loss": 0.0683, "step": 1441 }, { "epoch": 2.82, "grad_norm": 1.6661332202047388, "learning_rate": 3.3377970388905024e-06, "loss": 0.0945, "step": 1442 }, { "epoch": 2.82, "grad_norm": 1.6338712782526863, "learning_rate": 3.333029934153817e-06, "loss": 0.0669, "step": 1443 }, { "epoch": 2.82, "grad_norm": 1.711178856566601, "learning_rate": 3.328263139854037e-06, "loss": 0.0864, "step": 1444 }, { "epoch": 2.82, "grad_norm": 1.7023697803484574, "learning_rate": 3.323496664853764e-06, "loss": 0.1039, "step": 1445 }, { "epoch": 2.82, "grad_norm": 1.8325759233167014, "learning_rate": 3.3187305180150035e-06, "loss": 0.082, "step": 1446 }, { "epoch": 2.83, "grad_norm": 1.4563008691069212, "learning_rate": 3.3139647081991513e-06, "loss": 0.0881, "step": 1447 }, { "epoch": 2.83, "grad_norm": 1.5729123619606715, "learning_rate": 3.3091992442669774e-06, "loss": 0.0763, "step": 1448 }, { "epoch": 2.83, "grad_norm": 1.5344011771159114, "learning_rate": 3.304434135078609e-06, "loss": 0.0894, "step": 1449 }, { "epoch": 2.83, "grad_norm": 1.3706828985654262, "learning_rate": 3.2996693894935104e-06, "loss": 0.054, "step": 1450 }, { "epoch": 2.83, "grad_norm": 1.7244636259556778, "learning_rate": 3.294905016370474e-06, "loss": 0.0831, "step": 1451 }, { "epoch": 2.84, "grad_norm": 1.5705849082349672, "learning_rate": 3.290141024567597e-06, "loss": 0.0661, "step": 1452 }, { "epoch": 2.84, "grad_norm": 1.529154123463132, "learning_rate": 3.28537742294227e-06, "loss": 0.0649, "step": 1453 }, { "epoch": 2.84, "grad_norm": 1.426971945259894, "learning_rate": 3.280614220351157e-06, "loss": 0.0694, "step": 1454 }, { "epoch": 2.84, "grad_norm": 1.476867033897873, "learning_rate": 3.275851425650178e-06, "loss": 0.0659, "step": 1455 }, { "epoch": 2.84, "grad_norm": 1.5609555454104895, "learning_rate": 3.271089047694499e-06, "loss": 0.0902, "step": 1456 }, { "epoch": 2.85, "grad_norm": 1.472228186913368, "learning_rate": 3.2663270953385075e-06, "loss": 0.0886, "step": 1457 }, { "epoch": 2.85, "grad_norm": 1.8180027363234865, "learning_rate": 3.2615655774358025e-06, "loss": 0.0861, "step": 1458 }, { "epoch": 2.85, "grad_norm": 1.521784077472249, "learning_rate": 3.2568045028391715e-06, "loss": 0.0788, "step": 1459 }, { "epoch": 2.85, "grad_norm": 1.5704735868236297, "learning_rate": 3.252043880400581e-06, "loss": 0.0821, "step": 1460 }, { "epoch": 2.85, "grad_norm": 1.5141361836769016, "learning_rate": 3.2472837189711557e-06, "loss": 0.0468, "step": 1461 }, { "epoch": 2.86, "grad_norm": 1.4460545263158997, "learning_rate": 3.2425240274011644e-06, "loss": 0.0715, "step": 1462 }, { "epoch": 2.86, "grad_norm": 1.5744718343520265, "learning_rate": 3.2377648145400002e-06, "loss": 0.1056, "step": 1463 }, { "epoch": 2.86, "grad_norm": 1.6458509127157492, "learning_rate": 3.2330060892361665e-06, "loss": 0.0701, "step": 1464 }, { "epoch": 2.86, "grad_norm": 1.622667696011991, "learning_rate": 3.2282478603372634e-06, "loss": 0.1049, "step": 1465 }, { "epoch": 2.86, "grad_norm": 1.4884271236510687, "learning_rate": 3.2234901366899633e-06, "loss": 0.0766, "step": 1466 }, { "epoch": 2.87, "grad_norm": 1.5345237687255147, "learning_rate": 3.218732927140004e-06, "loss": 0.1, "step": 1467 }, { "epoch": 2.87, "grad_norm": 1.6120637876722197, "learning_rate": 3.2139762405321623e-06, "loss": 0.1112, "step": 1468 }, { "epoch": 2.87, "grad_norm": 1.3753552175221246, "learning_rate": 3.2092200857102456e-06, "loss": 0.0817, "step": 1469 }, { "epoch": 2.87, "grad_norm": 1.4340389877407194, "learning_rate": 3.204464471517074e-06, "loss": 0.0901, "step": 1470 }, { "epoch": 2.87, "grad_norm": 1.5174437062057025, "learning_rate": 3.1997094067944606e-06, "loss": 0.1163, "step": 1471 }, { "epoch": 2.88, "grad_norm": 1.6707949809369398, "learning_rate": 3.1949549003831962e-06, "loss": 0.0829, "step": 1472 }, { "epoch": 2.88, "grad_norm": 1.2051398576833592, "learning_rate": 3.1902009611230357e-06, "loss": 0.051, "step": 1473 }, { "epoch": 2.88, "grad_norm": 1.395671337006273, "learning_rate": 3.1854475978526774e-06, "loss": 0.075, "step": 1474 }, { "epoch": 2.88, "grad_norm": 1.542924238186102, "learning_rate": 3.18069481940975e-06, "loss": 0.1137, "step": 1475 }, { "epoch": 2.88, "grad_norm": 1.7648976516107158, "learning_rate": 3.1759426346307963e-06, "loss": 0.14, "step": 1476 }, { "epoch": 2.88, "grad_norm": 1.5805714346961854, "learning_rate": 3.1711910523512493e-06, "loss": 0.0709, "step": 1477 }, { "epoch": 2.89, "grad_norm": 1.1763926583501763, "learning_rate": 3.16644008140543e-06, "loss": 0.0651, "step": 1478 }, { "epoch": 2.89, "grad_norm": 1.503388140381377, "learning_rate": 3.161689730626517e-06, "loss": 0.1205, "step": 1479 }, { "epoch": 2.89, "grad_norm": 1.3837218731795042, "learning_rate": 3.1569400088465375e-06, "loss": 0.0678, "step": 1480 }, { "epoch": 2.89, "grad_norm": 1.38371475889525, "learning_rate": 3.152190924896351e-06, "loss": 0.0659, "step": 1481 }, { "epoch": 2.89, "grad_norm": 1.52850095227249, "learning_rate": 3.1474424876056288e-06, "loss": 0.0882, "step": 1482 }, { "epoch": 2.9, "grad_norm": 1.405815372395355, "learning_rate": 3.142694705802841e-06, "loss": 0.0761, "step": 1483 }, { "epoch": 2.9, "grad_norm": 1.2936530853874963, "learning_rate": 3.137947588315237e-06, "loss": 0.0588, "step": 1484 }, { "epoch": 2.9, "grad_norm": 1.339289042424607, "learning_rate": 3.1332011439688366e-06, "loss": 0.0787, "step": 1485 }, { "epoch": 2.9, "grad_norm": 1.4598393052752188, "learning_rate": 3.1284553815883995e-06, "loss": 0.0675, "step": 1486 }, { "epoch": 2.9, "grad_norm": 1.395563445293868, "learning_rate": 3.123710309997425e-06, "loss": 0.0853, "step": 1487 }, { "epoch": 2.91, "grad_norm": 1.7203682198797527, "learning_rate": 3.118965938018125e-06, "loss": 0.094, "step": 1488 }, { "epoch": 2.91, "grad_norm": 1.4901536127393413, "learning_rate": 3.1142222744714093e-06, "loss": 0.0973, "step": 1489 }, { "epoch": 2.91, "grad_norm": 1.7300501182954477, "learning_rate": 3.1094793281768745e-06, "loss": 0.1019, "step": 1490 }, { "epoch": 2.91, "grad_norm": 1.6053579797589992, "learning_rate": 3.1047371079527805e-06, "loss": 0.0751, "step": 1491 }, { "epoch": 2.91, "grad_norm": 1.6610807203368463, "learning_rate": 3.099995622616037e-06, "loss": 0.105, "step": 1492 }, { "epoch": 2.92, "grad_norm": 1.72149295077048, "learning_rate": 3.0952548809821907e-06, "loss": 0.0907, "step": 1493 }, { "epoch": 2.92, "grad_norm": 1.4479603427723151, "learning_rate": 3.090514891865402e-06, "loss": 0.0691, "step": 1494 }, { "epoch": 2.92, "grad_norm": 1.9282516396660319, "learning_rate": 3.085775664078433e-06, "loss": 0.1051, "step": 1495 }, { "epoch": 2.92, "grad_norm": 1.5124932753231444, "learning_rate": 3.0810372064326317e-06, "loss": 0.0854, "step": 1496 }, { "epoch": 2.92, "grad_norm": 1.7274938319409392, "learning_rate": 3.0762995277379124e-06, "loss": 0.116, "step": 1497 }, { "epoch": 2.93, "grad_norm": 1.6618653728404895, "learning_rate": 3.0715626368027436e-06, "loss": 0.1238, "step": 1498 }, { "epoch": 2.93, "grad_norm": 1.621711277471934, "learning_rate": 3.0668265424341264e-06, "loss": 0.0656, "step": 1499 }, { "epoch": 2.93, "grad_norm": 1.3889022504003157, "learning_rate": 3.0620912534375834e-06, "loss": 0.0815, "step": 1500 }, { "epoch": 2.93, "grad_norm": 1.3375962964229857, "learning_rate": 3.0573567786171366e-06, "loss": 0.0584, "step": 1501 }, { "epoch": 2.93, "grad_norm": 1.6355429490060287, "learning_rate": 3.052623126775298e-06, "loss": 0.1042, "step": 1502 }, { "epoch": 2.94, "grad_norm": 1.41403285164085, "learning_rate": 3.0478903067130487e-06, "loss": 0.1143, "step": 1503 }, { "epoch": 2.94, "grad_norm": 1.453065909458296, "learning_rate": 3.0431583272298204e-06, "loss": 0.0933, "step": 1504 }, { "epoch": 2.94, "grad_norm": 1.513322314122633, "learning_rate": 3.0384271971234837e-06, "loss": 0.0809, "step": 1505 }, { "epoch": 2.94, "grad_norm": 1.4135706937037438, "learning_rate": 3.0336969251903305e-06, "loss": 0.0915, "step": 1506 }, { "epoch": 2.94, "grad_norm": 1.304587525812431, "learning_rate": 3.0289675202250584e-06, "loss": 0.0709, "step": 1507 }, { "epoch": 2.95, "grad_norm": 1.3295378833390081, "learning_rate": 3.0242389910207505e-06, "loss": 0.0676, "step": 1508 }, { "epoch": 2.95, "grad_norm": 1.61100472791081, "learning_rate": 3.0195113463688624e-06, "loss": 0.1214, "step": 1509 }, { "epoch": 2.95, "grad_norm": 1.3093086464636372, "learning_rate": 3.0147845950592064e-06, "loss": 0.0651, "step": 1510 }, { "epoch": 2.95, "grad_norm": 1.5030000704651876, "learning_rate": 3.0100587458799325e-06, "loss": 0.0928, "step": 1511 }, { "epoch": 2.95, "grad_norm": 1.4705893518019502, "learning_rate": 3.0053338076175147e-06, "loss": 0.089, "step": 1512 }, { "epoch": 2.96, "grad_norm": 1.308164111342756, "learning_rate": 3.0006097890567304e-06, "loss": 0.0856, "step": 1513 }, { "epoch": 2.96, "grad_norm": 1.4424017301309497, "learning_rate": 2.99588669898065e-06, "loss": 0.0791, "step": 1514 }, { "epoch": 2.96, "grad_norm": 1.560570157995841, "learning_rate": 2.991164546170618e-06, "loss": 0.1159, "step": 1515 }, { "epoch": 2.96, "grad_norm": 1.691659742087251, "learning_rate": 2.986443339406234e-06, "loss": 0.0576, "step": 1516 }, { "epoch": 2.96, "grad_norm": 1.4812039569220508, "learning_rate": 2.9817230874653398e-06, "loss": 0.0818, "step": 1517 }, { "epoch": 2.96, "grad_norm": 1.2996638663364246, "learning_rate": 2.977003799124003e-06, "loss": 0.0648, "step": 1518 }, { "epoch": 2.97, "grad_norm": 1.5238974802175047, "learning_rate": 2.972285483156498e-06, "loss": 0.0852, "step": 1519 }, { "epoch": 2.97, "grad_norm": 1.462014653380785, "learning_rate": 2.9675681483352915e-06, "loss": 0.07, "step": 1520 }, { "epoch": 2.97, "grad_norm": 1.6943196772583262, "learning_rate": 2.9628518034310278e-06, "loss": 0.1063, "step": 1521 }, { "epoch": 2.97, "grad_norm": 1.4883787024903536, "learning_rate": 2.9581364572125067e-06, "loss": 0.0625, "step": 1522 }, { "epoch": 2.97, "grad_norm": 1.4825291309017266, "learning_rate": 2.953422118446676e-06, "loss": 0.1088, "step": 1523 }, { "epoch": 2.98, "grad_norm": 1.513263061415094, "learning_rate": 2.948708795898608e-06, "loss": 0.1052, "step": 1524 }, { "epoch": 2.98, "grad_norm": 1.5320969271350355, "learning_rate": 2.9439964983314854e-06, "loss": 0.0868, "step": 1525 }, { "epoch": 2.98, "grad_norm": 1.6214970106916218, "learning_rate": 2.9392852345065854e-06, "loss": 0.0847, "step": 1526 }, { "epoch": 2.98, "grad_norm": 1.7966674360661001, "learning_rate": 2.9345750131832646e-06, "loss": 0.089, "step": 1527 }, { "epoch": 2.98, "grad_norm": 1.6520194692515802, "learning_rate": 2.9298658431189395e-06, "loss": 0.0503, "step": 1528 }, { "epoch": 2.99, "grad_norm": 1.5810153146394994, "learning_rate": 2.925157733069072e-06, "loss": 0.1121, "step": 1529 }, { "epoch": 2.99, "grad_norm": 1.7580609716548374, "learning_rate": 2.9204506917871577e-06, "loss": 0.0651, "step": 1530 }, { "epoch": 2.99, "grad_norm": 1.5050419284959082, "learning_rate": 2.9157447280246964e-06, "loss": 0.0713, "step": 1531 }, { "epoch": 2.99, "grad_norm": 1.373154905614713, "learning_rate": 2.9110398505311927e-06, "loss": 0.0871, "step": 1532 }, { "epoch": 2.99, "grad_norm": 1.6532612046141617, "learning_rate": 2.906336068054127e-06, "loss": 0.0843, "step": 1533 }, { "epoch": 3.0, "grad_norm": 1.424619692704762, "learning_rate": 2.9016333893389455e-06, "loss": 0.0641, "step": 1534 }, { "epoch": 3.0, "grad_norm": 1.4926763921249193, "learning_rate": 2.896931823129043e-06, "loss": 0.0875, "step": 1535 }, { "epoch": 3.0, "grad_norm": 1.5379346085069356, "learning_rate": 2.8922313781657437e-06, "loss": 0.0789, "step": 1536 }, { "epoch": 3.0, "grad_norm": 1.625898957174144, "learning_rate": 2.8875320631882885e-06, "loss": 0.0995, "step": 1537 }, { "epoch": 3.0, "grad_norm": 1.7680435633598603, "learning_rate": 2.8828338869338175e-06, "loss": 0.0797, "step": 1538 }, { "epoch": 3.01, "grad_norm": 1.5183467074844899, "learning_rate": 2.8781368581373545e-06, "loss": 0.0707, "step": 1539 }, { "epoch": 3.01, "grad_norm": 1.479934553341198, "learning_rate": 2.873440985531786e-06, "loss": 0.0814, "step": 1540 }, { "epoch": 3.01, "grad_norm": 1.478311264773331, "learning_rate": 2.8687462778478535e-06, "loss": 0.098, "step": 1541 }, { "epoch": 3.01, "grad_norm": 1.5372193132202843, "learning_rate": 2.864052743814129e-06, "loss": 0.0685, "step": 1542 }, { "epoch": 3.01, "grad_norm": 1.5977967858142377, "learning_rate": 2.859360392157007e-06, "loss": 0.0762, "step": 1543 }, { "epoch": 3.02, "grad_norm": 1.4609779434755985, "learning_rate": 2.8546692316006783e-06, "loss": 0.0757, "step": 1544 }, { "epoch": 3.02, "grad_norm": 1.4222097717988784, "learning_rate": 2.8499792708671227e-06, "loss": 0.097, "step": 1545 }, { "epoch": 3.02, "grad_norm": 1.482451902496579, "learning_rate": 2.845290518676088e-06, "loss": 0.0805, "step": 1546 }, { "epoch": 3.02, "grad_norm": 1.4015477773410392, "learning_rate": 2.840602983745075e-06, "loss": 0.0756, "step": 1547 }, { "epoch": 3.02, "grad_norm": 1.5717411828170875, "learning_rate": 2.8359166747893237e-06, "loss": 0.0984, "step": 1548 }, { "epoch": 3.03, "grad_norm": 1.5210142815052798, "learning_rate": 2.8312316005217896e-06, "loss": 0.0939, "step": 1549 }, { "epoch": 3.03, "grad_norm": 1.4862320997869323, "learning_rate": 2.826547769653137e-06, "loss": 0.1014, "step": 1550 }, { "epoch": 3.03, "grad_norm": 1.5340014447904258, "learning_rate": 2.821865190891716e-06, "loss": 0.076, "step": 1551 }, { "epoch": 3.03, "grad_norm": 1.3175417852927649, "learning_rate": 2.8171838729435513e-06, "loss": 0.0561, "step": 1552 }, { "epoch": 3.03, "grad_norm": 1.493576146940746, "learning_rate": 2.8125038245123216e-06, "loss": 0.0644, "step": 1553 }, { "epoch": 3.04, "grad_norm": 1.404179314548681, "learning_rate": 2.8078250542993445e-06, "loss": 0.0706, "step": 1554 }, { "epoch": 3.04, "grad_norm": 1.4240465092157348, "learning_rate": 2.8031475710035637e-06, "loss": 0.081, "step": 1555 }, { "epoch": 3.04, "grad_norm": 1.543171123482986, "learning_rate": 2.7984713833215282e-06, "loss": 0.0757, "step": 1556 }, { "epoch": 3.04, "grad_norm": 1.57100879982317, "learning_rate": 2.793796499947379e-06, "loss": 0.0654, "step": 1557 }, { "epoch": 3.04, "grad_norm": 1.6183538034728586, "learning_rate": 2.7891229295728308e-06, "loss": 0.0906, "step": 1558 }, { "epoch": 3.04, "grad_norm": 1.521584836546725, "learning_rate": 2.7844506808871573e-06, "loss": 0.0615, "step": 1559 }, { "epoch": 3.05, "grad_norm": 1.6736495844447976, "learning_rate": 2.779779762577178e-06, "loss": 0.0627, "step": 1560 }, { "epoch": 3.05, "grad_norm": 1.5306745994437798, "learning_rate": 2.7751101833272356e-06, "loss": 0.0474, "step": 1561 }, { "epoch": 3.05, "grad_norm": 1.2553688911125465, "learning_rate": 2.7704419518191834e-06, "loss": 0.0615, "step": 1562 }, { "epoch": 3.05, "grad_norm": 1.4919675802710233, "learning_rate": 2.7657750767323693e-06, "loss": 0.061, "step": 1563 }, { "epoch": 3.05, "grad_norm": 1.722731000037934, "learning_rate": 2.761109566743622e-06, "loss": 0.0727, "step": 1564 }, { "epoch": 3.06, "grad_norm": 1.6942027104239346, "learning_rate": 2.756445430527228e-06, "loss": 0.0915, "step": 1565 }, { "epoch": 3.06, "grad_norm": 1.5142708551378292, "learning_rate": 2.751782676754922e-06, "loss": 0.0622, "step": 1566 }, { "epoch": 3.06, "grad_norm": 1.4372524147264827, "learning_rate": 2.7471213140958667e-06, "loss": 0.0784, "step": 1567 }, { "epoch": 3.06, "grad_norm": 1.6121881135472398, "learning_rate": 2.7424613512166398e-06, "loss": 0.1031, "step": 1568 }, { "epoch": 3.06, "grad_norm": 1.3488622272566457, "learning_rate": 2.7378027967812183e-06, "loss": 0.0609, "step": 1569 }, { "epoch": 3.07, "grad_norm": 1.614777098078913, "learning_rate": 2.733145659450956e-06, "loss": 0.0697, "step": 1570 }, { "epoch": 3.07, "grad_norm": 1.6557995052132697, "learning_rate": 2.728489947884575e-06, "loss": 0.0734, "step": 1571 }, { "epoch": 3.07, "grad_norm": 1.7700280411722442, "learning_rate": 2.7238356707381474e-06, "loss": 0.0549, "step": 1572 }, { "epoch": 3.07, "grad_norm": 1.4383734698321915, "learning_rate": 2.7191828366650756e-06, "loss": 0.0592, "step": 1573 }, { "epoch": 3.07, "grad_norm": 1.6085372419324826, "learning_rate": 2.7145314543160805e-06, "loss": 0.048, "step": 1574 }, { "epoch": 3.08, "grad_norm": 1.7418160034408103, "learning_rate": 2.709881532339186e-06, "loss": 0.1066, "step": 1575 }, { "epoch": 3.08, "grad_norm": 1.8500341103206641, "learning_rate": 2.705233079379694e-06, "loss": 0.0604, "step": 1576 }, { "epoch": 3.08, "grad_norm": 1.2626859758497118, "learning_rate": 2.7005861040801835e-06, "loss": 0.0494, "step": 1577 }, { "epoch": 3.08, "grad_norm": 1.3079818570215662, "learning_rate": 2.695940615080481e-06, "loss": 0.0547, "step": 1578 }, { "epoch": 3.08, "grad_norm": 1.5288207078890017, "learning_rate": 2.691296621017651e-06, "loss": 0.0619, "step": 1579 }, { "epoch": 3.09, "grad_norm": 1.7853364516808803, "learning_rate": 2.6866541305259794e-06, "loss": 0.0487, "step": 1580 }, { "epoch": 3.09, "grad_norm": 1.546850156983191, "learning_rate": 2.682013152236955e-06, "loss": 0.0693, "step": 1581 }, { "epoch": 3.09, "grad_norm": 1.433665905871544, "learning_rate": 2.677373694779257e-06, "loss": 0.0702, "step": 1582 }, { "epoch": 3.09, "grad_norm": 1.3591654117025287, "learning_rate": 2.6727357667787335e-06, "loss": 0.0604, "step": 1583 }, { "epoch": 3.09, "grad_norm": 1.391267675199677, "learning_rate": 2.6680993768583944e-06, "loss": 0.0834, "step": 1584 }, { "epoch": 3.1, "grad_norm": 1.3683000889912398, "learning_rate": 2.663464533638385e-06, "loss": 0.0846, "step": 1585 }, { "epoch": 3.1, "grad_norm": 1.4313970758704437, "learning_rate": 2.6588312457359768e-06, "loss": 0.0711, "step": 1586 }, { "epoch": 3.1, "grad_norm": 1.4404975376356313, "learning_rate": 2.6541995217655503e-06, "loss": 0.0743, "step": 1587 }, { "epoch": 3.1, "grad_norm": 1.28749959776305, "learning_rate": 2.6495693703385765e-06, "loss": 0.0603, "step": 1588 }, { "epoch": 3.1, "grad_norm": 1.8600750520008307, "learning_rate": 2.644940800063605e-06, "loss": 0.06, "step": 1589 }, { "epoch": 3.11, "grad_norm": 1.4131068495563734, "learning_rate": 2.640313819546244e-06, "loss": 0.0671, "step": 1590 }, { "epoch": 3.11, "grad_norm": 1.342899670409795, "learning_rate": 2.635688437389146e-06, "loss": 0.0905, "step": 1591 }, { "epoch": 3.11, "grad_norm": 1.3700576442986745, "learning_rate": 2.6310646621919923e-06, "loss": 0.0546, "step": 1592 }, { "epoch": 3.11, "grad_norm": 1.2851670266333595, "learning_rate": 2.626442502551478e-06, "loss": 0.053, "step": 1593 }, { "epoch": 3.11, "grad_norm": 1.4464251575264937, "learning_rate": 2.6218219670612907e-06, "loss": 0.085, "step": 1594 }, { "epoch": 3.12, "grad_norm": 1.4003513116476545, "learning_rate": 2.6172030643121005e-06, "loss": 0.0541, "step": 1595 }, { "epoch": 3.12, "grad_norm": 1.4975857572269227, "learning_rate": 2.612585802891542e-06, "loss": 0.0866, "step": 1596 }, { "epoch": 3.12, "grad_norm": 1.5313501055184167, "learning_rate": 2.607970191384199e-06, "loss": 0.0667, "step": 1597 }, { "epoch": 3.12, "grad_norm": 1.4977632427500662, "learning_rate": 2.6033562383715864e-06, "loss": 0.0574, "step": 1598 }, { "epoch": 3.12, "grad_norm": 1.3937433645020867, "learning_rate": 2.598743952432134e-06, "loss": 0.0675, "step": 1599 }, { "epoch": 3.12, "grad_norm": 1.642404059210013, "learning_rate": 2.594133342141177e-06, "loss": 0.033, "step": 1600 }, { "epoch": 3.13, "grad_norm": 1.326299768204585, "learning_rate": 2.58952441607093e-06, "loss": 0.0452, "step": 1601 }, { "epoch": 3.13, "grad_norm": 1.373598127633892, "learning_rate": 2.5849171827904802e-06, "loss": 0.0408, "step": 1602 }, { "epoch": 3.13, "grad_norm": 1.3540231569397383, "learning_rate": 2.580311650865764e-06, "loss": 0.0551, "step": 1603 }, { "epoch": 3.13, "grad_norm": 1.5273620115784388, "learning_rate": 2.575707828859556e-06, "loss": 0.0523, "step": 1604 }, { "epoch": 3.13, "grad_norm": 1.4866342538206914, "learning_rate": 2.571105725331453e-06, "loss": 0.0759, "step": 1605 }, { "epoch": 3.14, "grad_norm": 1.3210121019967376, "learning_rate": 2.566505348837856e-06, "loss": 0.0543, "step": 1606 }, { "epoch": 3.14, "grad_norm": 1.2971472829156452, "learning_rate": 2.5619067079319538e-06, "loss": 0.0483, "step": 1607 }, { "epoch": 3.14, "grad_norm": 1.3519338031472652, "learning_rate": 2.5573098111637088e-06, "loss": 0.0455, "step": 1608 }, { "epoch": 3.14, "grad_norm": 1.4603582687379129, "learning_rate": 2.5527146670798423e-06, "loss": 0.0504, "step": 1609 }, { "epoch": 3.14, "grad_norm": 1.5222231403679969, "learning_rate": 2.5481212842238152e-06, "loss": 0.079, "step": 1610 }, { "epoch": 3.15, "grad_norm": 1.4721196870761766, "learning_rate": 2.543529671135816e-06, "loss": 0.0543, "step": 1611 }, { "epoch": 3.15, "grad_norm": 1.4755443688826864, "learning_rate": 2.5389398363527376e-06, "loss": 0.0533, "step": 1612 }, { "epoch": 3.15, "grad_norm": 1.5520943140846275, "learning_rate": 2.5343517884081723e-06, "loss": 0.0661, "step": 1613 }, { "epoch": 3.15, "grad_norm": 1.4081298434422738, "learning_rate": 2.5297655358323877e-06, "loss": 0.0566, "step": 1614 }, { "epoch": 3.15, "grad_norm": 1.621226275892968, "learning_rate": 2.5251810871523144e-06, "loss": 0.0715, "step": 1615 }, { "epoch": 3.16, "grad_norm": 1.5941586570904254, "learning_rate": 2.5205984508915277e-06, "loss": 0.0884, "step": 1616 }, { "epoch": 3.16, "grad_norm": 1.5730369491369556, "learning_rate": 2.516017635570235e-06, "loss": 0.0695, "step": 1617 }, { "epoch": 3.16, "grad_norm": 1.4157063410218274, "learning_rate": 2.5114386497052563e-06, "loss": 0.0517, "step": 1618 }, { "epoch": 3.16, "grad_norm": 1.479751403254998, "learning_rate": 2.506861501810011e-06, "loss": 0.0656, "step": 1619 }, { "epoch": 3.16, "grad_norm": 1.6356306286210918, "learning_rate": 2.502286200394503e-06, "loss": 0.0812, "step": 1620 }, { "epoch": 3.17, "grad_norm": 1.5406978973718595, "learning_rate": 2.4977127539652985e-06, "loss": 0.0708, "step": 1621 }, { "epoch": 3.17, "grad_norm": 1.3843602252807212, "learning_rate": 2.4931411710255205e-06, "loss": 0.0547, "step": 1622 }, { "epoch": 3.17, "grad_norm": 1.377247203718581, "learning_rate": 2.488571460074823e-06, "loss": 0.064, "step": 1623 }, { "epoch": 3.17, "grad_norm": 1.469268099324724, "learning_rate": 2.4840036296093824e-06, "loss": 0.0626, "step": 1624 }, { "epoch": 3.17, "grad_norm": 1.3868538416242306, "learning_rate": 2.479437688121876e-06, "loss": 0.0445, "step": 1625 }, { "epoch": 3.18, "grad_norm": 1.3945442761011744, "learning_rate": 2.4748736441014728e-06, "loss": 0.0663, "step": 1626 }, { "epoch": 3.18, "grad_norm": 1.4458120550498659, "learning_rate": 2.4703115060338096e-06, "loss": 0.0574, "step": 1627 }, { "epoch": 3.18, "grad_norm": 1.38249688079666, "learning_rate": 2.465751282400983e-06, "loss": 0.057, "step": 1628 }, { "epoch": 3.18, "grad_norm": 1.331476181259786, "learning_rate": 2.4611929816815317e-06, "loss": 0.0557, "step": 1629 }, { "epoch": 3.18, "grad_norm": 1.3246170417621905, "learning_rate": 2.456636612350411e-06, "loss": 0.0714, "step": 1630 }, { "epoch": 3.19, "grad_norm": 1.5361578491052474, "learning_rate": 2.4520821828789946e-06, "loss": 0.0672, "step": 1631 }, { "epoch": 3.19, "grad_norm": 1.5714231141820163, "learning_rate": 2.4475297017350445e-06, "loss": 0.0593, "step": 1632 }, { "epoch": 3.19, "grad_norm": 1.3995458160205778, "learning_rate": 2.4429791773827017e-06, "loss": 0.0514, "step": 1633 }, { "epoch": 3.19, "grad_norm": 1.2950528856337444, "learning_rate": 2.43843061828247e-06, "loss": 0.0514, "step": 1634 }, { "epoch": 3.19, "grad_norm": 1.3592086981236082, "learning_rate": 2.4338840328911975e-06, "loss": 0.0459, "step": 1635 }, { "epoch": 3.2, "grad_norm": 1.5750366857278235, "learning_rate": 2.4293394296620625e-06, "loss": 0.0577, "step": 1636 }, { "epoch": 3.2, "grad_norm": 1.4433549776019936, "learning_rate": 2.4247968170445606e-06, "loss": 0.0542, "step": 1637 }, { "epoch": 3.2, "grad_norm": 1.4451357812928998, "learning_rate": 2.420256203484486e-06, "loss": 0.0539, "step": 1638 }, { "epoch": 3.2, "grad_norm": 1.5600960604394412, "learning_rate": 2.4157175974239114e-06, "loss": 0.0411, "step": 1639 }, { "epoch": 3.2, "grad_norm": 1.7633698554971189, "learning_rate": 2.411181007301182e-06, "loss": 0.0501, "step": 1640 }, { "epoch": 3.21, "grad_norm": 1.3820895669544935, "learning_rate": 2.4066464415508943e-06, "loss": 0.0569, "step": 1641 }, { "epoch": 3.21, "grad_norm": 1.4664848652551465, "learning_rate": 2.40211390860388e-06, "loss": 0.0538, "step": 1642 }, { "epoch": 3.21, "grad_norm": 1.7316112418730774, "learning_rate": 2.3975834168871913e-06, "loss": 0.0358, "step": 1643 }, { "epoch": 3.21, "grad_norm": 1.4469935271602232, "learning_rate": 2.393054974824086e-06, "loss": 0.0413, "step": 1644 }, { "epoch": 3.21, "grad_norm": 1.6685622291157511, "learning_rate": 2.3885285908340092e-06, "loss": 0.0792, "step": 1645 }, { "epoch": 3.21, "grad_norm": 1.761013957233675, "learning_rate": 2.384004273332583e-06, "loss": 0.0547, "step": 1646 }, { "epoch": 3.22, "grad_norm": 1.4016927568313102, "learning_rate": 2.379482030731586e-06, "loss": 0.0454, "step": 1647 }, { "epoch": 3.22, "grad_norm": 1.5357062890913051, "learning_rate": 2.374961871438935e-06, "loss": 0.0567, "step": 1648 }, { "epoch": 3.22, "grad_norm": 1.5207523975327055, "learning_rate": 2.3704438038586796e-06, "loss": 0.0479, "step": 1649 }, { "epoch": 3.22, "grad_norm": 1.7320586782190837, "learning_rate": 2.3659278363909763e-06, "loss": 0.0624, "step": 1650 }, { "epoch": 3.22, "grad_norm": 1.8757367109674132, "learning_rate": 2.36141397743208e-06, "loss": 0.062, "step": 1651 }, { "epoch": 3.23, "grad_norm": 1.4043389924921532, "learning_rate": 2.356902235374323e-06, "loss": 0.0433, "step": 1652 }, { "epoch": 3.23, "grad_norm": 1.5830332601406976, "learning_rate": 2.352392618606102e-06, "loss": 0.0515, "step": 1653 }, { "epoch": 3.23, "grad_norm": 1.396073872430874, "learning_rate": 2.3478851355118637e-06, "loss": 0.036, "step": 1654 }, { "epoch": 3.23, "grad_norm": 1.2528760714608433, "learning_rate": 2.343379794472087e-06, "loss": 0.0471, "step": 1655 }, { "epoch": 3.23, "grad_norm": 1.602167334962155, "learning_rate": 2.3388766038632686e-06, "loss": 0.0604, "step": 1656 }, { "epoch": 3.24, "grad_norm": 1.5249477430547491, "learning_rate": 2.3343755720579052e-06, "loss": 0.0484, "step": 1657 }, { "epoch": 3.24, "grad_norm": 1.532650987426097, "learning_rate": 2.329876707424481e-06, "loss": 0.057, "step": 1658 }, { "epoch": 3.24, "grad_norm": 1.8969944106439074, "learning_rate": 2.3253800183274527e-06, "loss": 0.0884, "step": 1659 }, { "epoch": 3.24, "grad_norm": 1.4302633598378212, "learning_rate": 2.32088551312723e-06, "loss": 0.0466, "step": 1660 }, { "epoch": 3.24, "grad_norm": 1.7913636573011102, "learning_rate": 2.3163932001801625e-06, "loss": 0.0557, "step": 1661 }, { "epoch": 3.25, "grad_norm": 1.4353416744619556, "learning_rate": 2.311903087838524e-06, "loss": 0.0372, "step": 1662 }, { "epoch": 3.25, "grad_norm": 1.4495976281093075, "learning_rate": 2.307415184450498e-06, "loss": 0.0556, "step": 1663 }, { "epoch": 3.25, "grad_norm": 1.3767887026582837, "learning_rate": 2.3029294983601598e-06, "loss": 0.0506, "step": 1664 }, { "epoch": 3.25, "grad_norm": 1.5172323831041399, "learning_rate": 2.298446037907462e-06, "loss": 0.0631, "step": 1665 }, { "epoch": 3.25, "grad_norm": 1.5765092891187005, "learning_rate": 2.2939648114282187e-06, "loss": 0.0628, "step": 1666 }, { "epoch": 3.26, "grad_norm": 1.3735272373674494, "learning_rate": 2.289485827254093e-06, "loss": 0.0508, "step": 1667 }, { "epoch": 3.26, "grad_norm": 1.3537475023530747, "learning_rate": 2.2850090937125775e-06, "loss": 0.0352, "step": 1668 }, { "epoch": 3.26, "grad_norm": 1.512050787555494, "learning_rate": 2.2805346191269795e-06, "loss": 0.0611, "step": 1669 }, { "epoch": 3.26, "grad_norm": 1.4010622189282875, "learning_rate": 2.276062411816407e-06, "loss": 0.0411, "step": 1670 }, { "epoch": 3.26, "grad_norm": 1.5937061190819943, "learning_rate": 2.2715924800957543e-06, "loss": 0.0475, "step": 1671 }, { "epoch": 3.27, "grad_norm": 1.5106782795696596, "learning_rate": 2.2671248322756827e-06, "loss": 0.0471, "step": 1672 }, { "epoch": 3.27, "grad_norm": 1.580425845283506, "learning_rate": 2.2626594766626067e-06, "loss": 0.0653, "step": 1673 }, { "epoch": 3.27, "grad_norm": 1.3986133092846158, "learning_rate": 2.258196421558684e-06, "loss": 0.0564, "step": 1674 }, { "epoch": 3.27, "grad_norm": 1.343873093528596, "learning_rate": 2.2537356752617857e-06, "loss": 0.0436, "step": 1675 }, { "epoch": 3.27, "grad_norm": 1.761270188638894, "learning_rate": 2.2492772460655e-06, "loss": 0.0543, "step": 1676 }, { "epoch": 3.28, "grad_norm": 1.358158225406825, "learning_rate": 2.244821142259101e-06, "loss": 0.0502, "step": 1677 }, { "epoch": 3.28, "grad_norm": 1.483907193428131, "learning_rate": 2.240367372127541e-06, "loss": 0.0389, "step": 1678 }, { "epoch": 3.28, "grad_norm": 1.592473115417211, "learning_rate": 2.2359159439514365e-06, "loss": 0.0577, "step": 1679 }, { "epoch": 3.28, "grad_norm": 1.3230856553628412, "learning_rate": 2.2314668660070444e-06, "loss": 0.0323, "step": 1680 }, { "epoch": 3.28, "grad_norm": 1.4064466804572944, "learning_rate": 2.2270201465662547e-06, "loss": 0.0443, "step": 1681 }, { "epoch": 3.29, "grad_norm": 1.3368241088863035, "learning_rate": 2.2225757938965737e-06, "loss": 0.0451, "step": 1682 }, { "epoch": 3.29, "grad_norm": 1.0724881754806597, "learning_rate": 2.218133816261106e-06, "loss": 0.0263, "step": 1683 }, { "epoch": 3.29, "grad_norm": 1.0996837966655273, "learning_rate": 2.213694221918538e-06, "loss": 0.029, "step": 1684 }, { "epoch": 3.29, "grad_norm": 1.6372014757948048, "learning_rate": 2.2092570191231287e-06, "loss": 0.0447, "step": 1685 }, { "epoch": 3.29, "grad_norm": 1.4085343704050115, "learning_rate": 2.2048222161246893e-06, "loss": 0.037, "step": 1686 }, { "epoch": 3.29, "grad_norm": 1.351493202291899, "learning_rate": 2.200389821168569e-06, "loss": 0.0316, "step": 1687 }, { "epoch": 3.3, "grad_norm": 1.4389063572503413, "learning_rate": 2.1959598424956397e-06, "loss": 0.0354, "step": 1688 }, { "epoch": 3.3, "grad_norm": 1.579078096089134, "learning_rate": 2.191532288342282e-06, "loss": 0.0603, "step": 1689 }, { "epoch": 3.3, "grad_norm": 1.6014402266546168, "learning_rate": 2.187107166940367e-06, "loss": 0.0422, "step": 1690 }, { "epoch": 3.3, "grad_norm": 1.389028890316008, "learning_rate": 2.1826844865172438e-06, "loss": 0.0338, "step": 1691 }, { "epoch": 3.3, "grad_norm": 1.4642824515717683, "learning_rate": 2.178264255295725e-06, "loss": 0.0577, "step": 1692 }, { "epoch": 3.31, "grad_norm": 1.4641932025699171, "learning_rate": 2.1738464814940648e-06, "loss": 0.0753, "step": 1693 }, { "epoch": 3.31, "grad_norm": 1.5325115226126436, "learning_rate": 2.1694311733259525e-06, "loss": 0.054, "step": 1694 }, { "epoch": 3.31, "grad_norm": 1.4949950738723898, "learning_rate": 2.165018339000491e-06, "loss": 0.058, "step": 1695 }, { "epoch": 3.31, "grad_norm": 1.3379656949059724, "learning_rate": 2.160607986722186e-06, "loss": 0.0399, "step": 1696 }, { "epoch": 3.31, "grad_norm": 1.4442170808355033, "learning_rate": 2.1562001246909267e-06, "loss": 0.0319, "step": 1697 }, { "epoch": 3.32, "grad_norm": 1.5194254204913997, "learning_rate": 2.151794761101972e-06, "loss": 0.0442, "step": 1698 }, { "epoch": 3.32, "grad_norm": 1.4240129462533888, "learning_rate": 2.147391904145938e-06, "loss": 0.0412, "step": 1699 }, { "epoch": 3.32, "grad_norm": 1.3383627461201955, "learning_rate": 2.1429915620087775e-06, "loss": 0.0347, "step": 1700 }, { "epoch": 3.32, "grad_norm": 1.303680896079759, "learning_rate": 2.1385937428717707e-06, "loss": 0.039, "step": 1701 }, { "epoch": 3.32, "grad_norm": 1.2718294346448236, "learning_rate": 2.134198454911503e-06, "loss": 0.0361, "step": 1702 }, { "epoch": 3.33, "grad_norm": 1.3879912773194796, "learning_rate": 2.1298057062998557e-06, "loss": 0.0291, "step": 1703 }, { "epoch": 3.33, "grad_norm": 1.3127847093149452, "learning_rate": 2.125415505203991e-06, "loss": 0.0465, "step": 1704 }, { "epoch": 3.33, "grad_norm": 1.2899816980104957, "learning_rate": 2.121027859786332e-06, "loss": 0.046, "step": 1705 }, { "epoch": 3.33, "grad_norm": 1.3423576348537083, "learning_rate": 2.1166427782045506e-06, "loss": 0.0388, "step": 1706 }, { "epoch": 3.33, "grad_norm": 1.274366612282138, "learning_rate": 2.1122602686115522e-06, "loss": 0.0351, "step": 1707 }, { "epoch": 3.34, "grad_norm": 1.2287031301115972, "learning_rate": 2.1078803391554616e-06, "loss": 0.0377, "step": 1708 }, { "epoch": 3.34, "grad_norm": 1.2727970622249862, "learning_rate": 2.1035029979796034e-06, "loss": 0.0379, "step": 1709 }, { "epoch": 3.34, "grad_norm": 1.5206435624678718, "learning_rate": 2.099128253222494e-06, "loss": 0.0501, "step": 1710 }, { "epoch": 3.34, "grad_norm": 1.6335538659586035, "learning_rate": 2.094756113017818e-06, "loss": 0.0446, "step": 1711 }, { "epoch": 3.34, "grad_norm": 1.438209524151543, "learning_rate": 2.09038658549442e-06, "loss": 0.0569, "step": 1712 }, { "epoch": 3.35, "grad_norm": 1.5478449149157947, "learning_rate": 2.0860196787762884e-06, "loss": 0.0316, "step": 1713 }, { "epoch": 3.35, "grad_norm": 1.6316356321746208, "learning_rate": 2.0816554009825357e-06, "loss": 0.0433, "step": 1714 }, { "epoch": 3.35, "grad_norm": 1.2973048014512991, "learning_rate": 2.0772937602273888e-06, "loss": 0.0301, "step": 1715 }, { "epoch": 3.35, "grad_norm": 1.3511165749120706, "learning_rate": 2.0729347646201717e-06, "loss": 0.0401, "step": 1716 }, { "epoch": 3.35, "grad_norm": 1.2411749331164634, "learning_rate": 2.0685784222652893e-06, "loss": 0.0345, "step": 1717 }, { "epoch": 3.36, "grad_norm": 1.226121247405433, "learning_rate": 2.064224741262213e-06, "loss": 0.0275, "step": 1718 }, { "epoch": 3.36, "grad_norm": 1.4978443597758235, "learning_rate": 2.0598737297054707e-06, "loss": 0.0271, "step": 1719 }, { "epoch": 3.36, "grad_norm": 1.3820366793807461, "learning_rate": 2.0555253956846186e-06, "loss": 0.0433, "step": 1720 }, { "epoch": 3.36, "grad_norm": 1.3739051571080039, "learning_rate": 2.0511797472842434e-06, "loss": 0.0432, "step": 1721 }, { "epoch": 3.36, "grad_norm": 1.3025837952682922, "learning_rate": 2.046836792583933e-06, "loss": 0.0393, "step": 1722 }, { "epoch": 3.37, "grad_norm": 1.1398085610993063, "learning_rate": 2.0424965396582706e-06, "loss": 0.0275, "step": 1723 }, { "epoch": 3.37, "grad_norm": 1.3916061159392539, "learning_rate": 2.0381589965768126e-06, "loss": 0.0459, "step": 1724 }, { "epoch": 3.37, "grad_norm": 1.4231607113414888, "learning_rate": 2.03382417140408e-06, "loss": 0.0455, "step": 1725 }, { "epoch": 3.37, "grad_norm": 1.3744570863179315, "learning_rate": 2.0294920721995417e-06, "loss": 0.0399, "step": 1726 }, { "epoch": 3.37, "grad_norm": 1.8488521803453917, "learning_rate": 2.0251627070175925e-06, "loss": 0.0254, "step": 1727 }, { "epoch": 3.38, "grad_norm": 1.6141553076752386, "learning_rate": 2.0208360839075525e-06, "loss": 0.0356, "step": 1728 }, { "epoch": 3.38, "grad_norm": 1.4927958209118921, "learning_rate": 2.0165122109136357e-06, "loss": 0.0407, "step": 1729 }, { "epoch": 3.38, "grad_norm": 1.6424518554984948, "learning_rate": 2.0121910960749458e-06, "loss": 0.0307, "step": 1730 }, { "epoch": 3.38, "grad_norm": 1.377476845255786, "learning_rate": 2.0078727474254598e-06, "loss": 0.0287, "step": 1731 }, { "epoch": 3.38, "grad_norm": 1.3298610004836713, "learning_rate": 2.0035571729940133e-06, "loss": 0.0275, "step": 1732 }, { "epoch": 3.38, "grad_norm": 1.4703978418227783, "learning_rate": 1.9992443808042776e-06, "loss": 0.0331, "step": 1733 }, { "epoch": 3.39, "grad_norm": 1.2854533624930258, "learning_rate": 1.994934378874757e-06, "loss": 0.0269, "step": 1734 }, { "epoch": 3.39, "grad_norm": 1.142002089375079, "learning_rate": 1.9906271752187675e-06, "loss": 0.027, "step": 1735 }, { "epoch": 3.39, "grad_norm": 1.322291512060729, "learning_rate": 1.9863227778444176e-06, "loss": 0.0377, "step": 1736 }, { "epoch": 3.39, "grad_norm": 2.0548507091728907, "learning_rate": 1.982021194754606e-06, "loss": 0.0341, "step": 1737 }, { "epoch": 3.39, "grad_norm": 1.4203172845251748, "learning_rate": 1.9777224339469897e-06, "loss": 0.0307, "step": 1738 }, { "epoch": 3.4, "grad_norm": 1.5738230270569367, "learning_rate": 1.9734265034139883e-06, "loss": 0.0313, "step": 1739 }, { "epoch": 3.4, "grad_norm": 1.6569596806336608, "learning_rate": 1.9691334111427506e-06, "loss": 0.0339, "step": 1740 }, { "epoch": 3.4, "grad_norm": 1.4504758170687126, "learning_rate": 1.964843165115154e-06, "loss": 0.0197, "step": 1741 }, { "epoch": 3.4, "grad_norm": 1.5074296027796619, "learning_rate": 1.960555773307781e-06, "loss": 0.033, "step": 1742 }, { "epoch": 3.4, "grad_norm": 1.3108711547514604, "learning_rate": 1.9562712436919092e-06, "loss": 0.0246, "step": 1743 }, { "epoch": 3.41, "grad_norm": 1.3000432539512798, "learning_rate": 1.951989584233496e-06, "loss": 0.0324, "step": 1744 }, { "epoch": 3.41, "grad_norm": 1.4963428554579652, "learning_rate": 1.9477108028931577e-06, "loss": 0.0307, "step": 1745 }, { "epoch": 3.41, "grad_norm": 1.2665381947550212, "learning_rate": 1.943434907626164e-06, "loss": 0.0389, "step": 1746 }, { "epoch": 3.41, "grad_norm": 1.415182711670528, "learning_rate": 1.9391619063824164e-06, "loss": 0.0356, "step": 1747 }, { "epoch": 3.41, "grad_norm": 1.280263833739202, "learning_rate": 1.934891807106438e-06, "loss": 0.044, "step": 1748 }, { "epoch": 3.42, "grad_norm": 1.0262882972363716, "learning_rate": 1.930624617737352e-06, "loss": 0.0188, "step": 1749 }, { "epoch": 3.42, "grad_norm": 1.3811456204550292, "learning_rate": 1.9263603462088765e-06, "loss": 0.0259, "step": 1750 }, { "epoch": 3.42, "grad_norm": 1.3237701544243887, "learning_rate": 1.9220990004493035e-06, "loss": 0.0407, "step": 1751 }, { "epoch": 3.42, "grad_norm": 1.8293128388851385, "learning_rate": 1.917840588381481e-06, "loss": 0.0287, "step": 1752 }, { "epoch": 3.42, "grad_norm": 1.3426409364774898, "learning_rate": 1.9135851179228076e-06, "loss": 0.0367, "step": 1753 }, { "epoch": 3.43, "grad_norm": 1.323331739828401, "learning_rate": 1.9093325969852126e-06, "loss": 0.0193, "step": 1754 }, { "epoch": 3.43, "grad_norm": 1.507178545760924, "learning_rate": 1.905083033475138e-06, "loss": 0.0248, "step": 1755 }, { "epoch": 3.43, "grad_norm": 1.147031543336055, "learning_rate": 1.9008364352935276e-06, "loss": 0.0325, "step": 1756 }, { "epoch": 3.43, "grad_norm": 1.281507744138041, "learning_rate": 1.896592810335817e-06, "loss": 0.0461, "step": 1757 }, { "epoch": 3.43, "grad_norm": 1.3588993175547812, "learning_rate": 1.8923521664919068e-06, "loss": 0.0332, "step": 1758 }, { "epoch": 3.44, "grad_norm": 1.094922752380483, "learning_rate": 1.8881145116461597e-06, "loss": 0.0309, "step": 1759 }, { "epoch": 3.44, "grad_norm": 1.384434459945072, "learning_rate": 1.8838798536773821e-06, "loss": 0.0344, "step": 1760 }, { "epoch": 3.44, "grad_norm": 1.332017136945708, "learning_rate": 1.8796482004588025e-06, "loss": 0.0375, "step": 1761 }, { "epoch": 3.44, "grad_norm": 1.30325439261459, "learning_rate": 1.875419559858069e-06, "loss": 0.0234, "step": 1762 }, { "epoch": 3.44, "grad_norm": 1.1127816328905908, "learning_rate": 1.8711939397372273e-06, "loss": 0.0244, "step": 1763 }, { "epoch": 3.45, "grad_norm": 1.1009301207089823, "learning_rate": 1.8669713479527048e-06, "loss": 0.0221, "step": 1764 }, { "epoch": 3.45, "grad_norm": 1.603411816253569, "learning_rate": 1.8627517923552982e-06, "loss": 0.0375, "step": 1765 }, { "epoch": 3.45, "grad_norm": 1.2815579407928852, "learning_rate": 1.8585352807901644e-06, "loss": 0.0384, "step": 1766 }, { "epoch": 3.45, "grad_norm": 1.2946666567894238, "learning_rate": 1.8543218210967937e-06, "loss": 0.037, "step": 1767 }, { "epoch": 3.45, "grad_norm": 1.2851674877441246, "learning_rate": 1.850111421109008e-06, "loss": 0.0339, "step": 1768 }, { "epoch": 3.46, "grad_norm": 1.0305313203209883, "learning_rate": 1.8459040886549394e-06, "loss": 0.02, "step": 1769 }, { "epoch": 3.46, "grad_norm": 1.3267257752953603, "learning_rate": 1.8416998315570125e-06, "loss": 0.0373, "step": 1770 }, { "epoch": 3.46, "grad_norm": 1.2772117860239072, "learning_rate": 1.8374986576319388e-06, "loss": 0.0319, "step": 1771 }, { "epoch": 3.46, "grad_norm": 1.3842166695359135, "learning_rate": 1.8333005746906976e-06, "loss": 0.0442, "step": 1772 }, { "epoch": 3.46, "grad_norm": 1.392089663719773, "learning_rate": 1.8291055905385179e-06, "loss": 0.0403, "step": 1773 }, { "epoch": 3.46, "grad_norm": 1.3227890491007719, "learning_rate": 1.8249137129748679e-06, "loss": 0.0428, "step": 1774 }, { "epoch": 3.47, "grad_norm": 1.166098693512627, "learning_rate": 1.8207249497934416e-06, "loss": 0.0301, "step": 1775 }, { "epoch": 3.47, "grad_norm": 1.3644185963598154, "learning_rate": 1.8165393087821438e-06, "loss": 0.041, "step": 1776 }, { "epoch": 3.47, "grad_norm": 1.3127721066228442, "learning_rate": 1.8123567977230706e-06, "loss": 0.0371, "step": 1777 }, { "epoch": 3.47, "grad_norm": 1.2512796680172147, "learning_rate": 1.8081774243925025e-06, "loss": 0.0411, "step": 1778 }, { "epoch": 3.47, "grad_norm": 1.3896354901285626, "learning_rate": 1.8040011965608827e-06, "loss": 0.0403, "step": 1779 }, { "epoch": 3.48, "grad_norm": 1.104764169544422, "learning_rate": 1.7998281219928094e-06, "loss": 0.0222, "step": 1780 }, { "epoch": 3.48, "grad_norm": 1.2517329365564922, "learning_rate": 1.7956582084470179e-06, "loss": 0.0304, "step": 1781 }, { "epoch": 3.48, "grad_norm": 1.4538142632067081, "learning_rate": 1.7914914636763638e-06, "loss": 0.0461, "step": 1782 }, { "epoch": 3.48, "grad_norm": 1.3293276178444742, "learning_rate": 1.7873278954278112e-06, "loss": 0.0331, "step": 1783 }, { "epoch": 3.48, "grad_norm": 1.3350895603387583, "learning_rate": 1.783167511442422e-06, "loss": 0.0305, "step": 1784 }, { "epoch": 3.49, "grad_norm": 1.2797818500765623, "learning_rate": 1.7790103194553362e-06, "loss": 0.0296, "step": 1785 }, { "epoch": 3.49, "grad_norm": 1.0805595484251211, "learning_rate": 1.7748563271957563e-06, "loss": 0.026, "step": 1786 }, { "epoch": 3.49, "grad_norm": 1.3543429890830208, "learning_rate": 1.7707055423869382e-06, "loss": 0.0397, "step": 1787 }, { "epoch": 3.49, "grad_norm": 1.3625234341633299, "learning_rate": 1.7665579727461771e-06, "loss": 0.0267, "step": 1788 }, { "epoch": 3.49, "grad_norm": 1.276092709279319, "learning_rate": 1.762413625984784e-06, "loss": 0.0405, "step": 1789 }, { "epoch": 3.5, "grad_norm": 1.1898471546525118, "learning_rate": 1.7582725098080826e-06, "loss": 0.0285, "step": 1790 }, { "epoch": 3.5, "grad_norm": 1.2219945545560824, "learning_rate": 1.7541346319153915e-06, "loss": 0.0391, "step": 1791 }, { "epoch": 3.5, "grad_norm": 1.437924035360448, "learning_rate": 1.7500000000000008e-06, "loss": 0.0447, "step": 1792 }, { "epoch": 3.5, "grad_norm": 1.3574117243465895, "learning_rate": 1.7458686217491734e-06, "loss": 0.0346, "step": 1793 }, { "epoch": 3.5, "grad_norm": 1.3858698099350442, "learning_rate": 1.741740504844121e-06, "loss": 0.0416, "step": 1794 }, { "epoch": 3.51, "grad_norm": 1.1082245971199407, "learning_rate": 1.7376156569599887e-06, "loss": 0.0316, "step": 1795 }, { "epoch": 3.51, "grad_norm": 1.0470086398256777, "learning_rate": 1.7334940857658472e-06, "loss": 0.0286, "step": 1796 }, { "epoch": 3.51, "grad_norm": 1.2535290721049908, "learning_rate": 1.729375798924675e-06, "loss": 0.0334, "step": 1797 }, { "epoch": 3.51, "grad_norm": 1.215188949787473, "learning_rate": 1.7252608040933402e-06, "loss": 0.0279, "step": 1798 }, { "epoch": 3.51, "grad_norm": 1.2144021773083795, "learning_rate": 1.721149108922594e-06, "loss": 0.027, "step": 1799 }, { "epoch": 3.52, "grad_norm": 1.329452161289932, "learning_rate": 1.7170407210570539e-06, "loss": 0.0364, "step": 1800 }, { "epoch": 3.52, "grad_norm": 1.2095538949478566, "learning_rate": 1.712935648135184e-06, "loss": 0.0283, "step": 1801 }, { "epoch": 3.52, "grad_norm": 1.4130132251047385, "learning_rate": 1.7088338977892866e-06, "loss": 0.0248, "step": 1802 }, { "epoch": 3.52, "grad_norm": 1.2609027941668738, "learning_rate": 1.7047354776454897e-06, "loss": 0.0234, "step": 1803 }, { "epoch": 3.52, "grad_norm": 1.1838375241981844, "learning_rate": 1.700640395323724e-06, "loss": 0.0294, "step": 1804 }, { "epoch": 3.53, "grad_norm": 1.2985548248797583, "learning_rate": 1.6965486584377205e-06, "loss": 0.0353, "step": 1805 }, { "epoch": 3.53, "grad_norm": 1.3109562158973938, "learning_rate": 1.692460274594987e-06, "loss": 0.0392, "step": 1806 }, { "epoch": 3.53, "grad_norm": 1.2037593726345965, "learning_rate": 1.6883752513967963e-06, "loss": 0.025, "step": 1807 }, { "epoch": 3.53, "grad_norm": 1.178344756253592, "learning_rate": 1.6842935964381741e-06, "loss": 0.0283, "step": 1808 }, { "epoch": 3.53, "grad_norm": 1.1597909145282201, "learning_rate": 1.6802153173078865e-06, "loss": 0.0307, "step": 1809 }, { "epoch": 3.54, "grad_norm": 1.2330917702504232, "learning_rate": 1.6761404215884189e-06, "loss": 0.0262, "step": 1810 }, { "epoch": 3.54, "grad_norm": 1.4397676957455228, "learning_rate": 1.6720689168559663e-06, "loss": 0.0338, "step": 1811 }, { "epoch": 3.54, "grad_norm": 1.1642518206998935, "learning_rate": 1.6680008106804213e-06, "loss": 0.021, "step": 1812 }, { "epoch": 3.54, "grad_norm": 1.4281260180082467, "learning_rate": 1.6639361106253595e-06, "loss": 0.0256, "step": 1813 }, { "epoch": 3.54, "grad_norm": 1.146423497888737, "learning_rate": 1.6598748242480173e-06, "loss": 0.0245, "step": 1814 }, { "epoch": 3.54, "grad_norm": 1.310121110123137, "learning_rate": 1.6558169590992901e-06, "loss": 0.0295, "step": 1815 }, { "epoch": 3.55, "grad_norm": 1.2154456516930219, "learning_rate": 1.651762522723712e-06, "loss": 0.0293, "step": 1816 }, { "epoch": 3.55, "grad_norm": 1.471663111117828, "learning_rate": 1.6477115226594378e-06, "loss": 0.0452, "step": 1817 }, { "epoch": 3.55, "grad_norm": 1.401714936078221, "learning_rate": 1.643663966438239e-06, "loss": 0.0379, "step": 1818 }, { "epoch": 3.55, "grad_norm": 1.2459568756650463, "learning_rate": 1.6396198615854799e-06, "loss": 0.0203, "step": 1819 }, { "epoch": 3.55, "grad_norm": 1.3744634474275639, "learning_rate": 1.6355792156201085e-06, "loss": 0.0415, "step": 1820 }, { "epoch": 3.56, "grad_norm": 1.344037434712068, "learning_rate": 1.6315420360546436e-06, "loss": 0.0256, "step": 1821 }, { "epoch": 3.56, "grad_norm": 1.3005563845698767, "learning_rate": 1.6275083303951604e-06, "loss": 0.0377, "step": 1822 }, { "epoch": 3.56, "grad_norm": 1.2180750398855853, "learning_rate": 1.62347810614127e-06, "loss": 0.0269, "step": 1823 }, { "epoch": 3.56, "grad_norm": 1.325483407766987, "learning_rate": 1.619451370786116e-06, "loss": 0.038, "step": 1824 }, { "epoch": 3.56, "grad_norm": 1.3186147895677194, "learning_rate": 1.6154281318163542e-06, "loss": 0.03, "step": 1825 }, { "epoch": 3.57, "grad_norm": 1.206136725307207, "learning_rate": 1.6114083967121365e-06, "loss": 0.0301, "step": 1826 }, { "epoch": 3.57, "grad_norm": 1.302201835077349, "learning_rate": 1.607392172947105e-06, "loss": 0.0333, "step": 1827 }, { "epoch": 3.57, "grad_norm": 1.2936339116793731, "learning_rate": 1.60337946798837e-06, "loss": 0.0248, "step": 1828 }, { "epoch": 3.57, "grad_norm": 1.078884916946985, "learning_rate": 1.5993702892964996e-06, "loss": 0.0225, "step": 1829 }, { "epoch": 3.57, "grad_norm": 1.0451296987398764, "learning_rate": 1.5953646443255076e-06, "loss": 0.0151, "step": 1830 }, { "epoch": 3.58, "grad_norm": 1.4746153501198396, "learning_rate": 1.591362540522838e-06, "loss": 0.0274, "step": 1831 }, { "epoch": 3.58, "grad_norm": 1.2787012632858374, "learning_rate": 1.5873639853293484e-06, "loss": 0.0249, "step": 1832 }, { "epoch": 3.58, "grad_norm": 1.4020928397789019, "learning_rate": 1.5833689861793e-06, "loss": 0.0254, "step": 1833 }, { "epoch": 3.58, "grad_norm": 1.300365014621732, "learning_rate": 1.5793775505003446e-06, "loss": 0.0264, "step": 1834 }, { "epoch": 3.58, "grad_norm": 1.2079016356533014, "learning_rate": 1.5753896857135043e-06, "loss": 0.0246, "step": 1835 }, { "epoch": 3.59, "grad_norm": 1.324073838638926, "learning_rate": 1.5714053992331667e-06, "loss": 0.0338, "step": 1836 }, { "epoch": 3.59, "grad_norm": 1.316109782094636, "learning_rate": 1.5674246984670614e-06, "loss": 0.0238, "step": 1837 }, { "epoch": 3.59, "grad_norm": 1.2916026389929893, "learning_rate": 1.5634475908162573e-06, "loss": 0.0263, "step": 1838 }, { "epoch": 3.59, "grad_norm": 1.4109498612574178, "learning_rate": 1.5594740836751365e-06, "loss": 0.0343, "step": 1839 }, { "epoch": 3.59, "grad_norm": 1.5495724856044497, "learning_rate": 1.5555041844313931e-06, "loss": 0.0368, "step": 1840 }, { "epoch": 3.6, "grad_norm": 1.2163071535807082, "learning_rate": 1.5515379004660076e-06, "loss": 0.0265, "step": 1841 }, { "epoch": 3.6, "grad_norm": 1.3653451865951074, "learning_rate": 1.5475752391532423e-06, "loss": 0.0258, "step": 1842 }, { "epoch": 3.6, "grad_norm": 1.2924476737444826, "learning_rate": 1.5436162078606252e-06, "loss": 0.0302, "step": 1843 }, { "epoch": 3.6, "grad_norm": 1.3207443492500541, "learning_rate": 1.5396608139489307e-06, "loss": 0.0197, "step": 1844 }, { "epoch": 3.6, "grad_norm": 1.3037358908933159, "learning_rate": 1.5357090647721752e-06, "loss": 0.0228, "step": 1845 }, { "epoch": 3.61, "grad_norm": 1.335301730260492, "learning_rate": 1.5317609676775944e-06, "loss": 0.0359, "step": 1846 }, { "epoch": 3.61, "grad_norm": 1.3064395834576585, "learning_rate": 1.5278165300056381e-06, "loss": 0.0341, "step": 1847 }, { "epoch": 3.61, "grad_norm": 1.3009839128231429, "learning_rate": 1.5238757590899485e-06, "loss": 0.0298, "step": 1848 }, { "epoch": 3.61, "grad_norm": 1.2710069111530582, "learning_rate": 1.5199386622573537e-06, "loss": 0.0391, "step": 1849 }, { "epoch": 3.61, "grad_norm": 1.266943977057315, "learning_rate": 1.5160052468278497e-06, "loss": 0.039, "step": 1850 }, { "epoch": 3.62, "grad_norm": 1.3349399521149272, "learning_rate": 1.5120755201145856e-06, "loss": 0.036, "step": 1851 }, { "epoch": 3.62, "grad_norm": 1.1996540682481904, "learning_rate": 1.5081494894238554e-06, "loss": 0.0184, "step": 1852 }, { "epoch": 3.62, "grad_norm": 1.1717182966047663, "learning_rate": 1.504227162055082e-06, "loss": 0.0219, "step": 1853 }, { "epoch": 3.62, "grad_norm": 1.1368089953123932, "learning_rate": 1.500308545300799e-06, "loss": 0.028, "step": 1854 }, { "epoch": 3.62, "grad_norm": 1.2139913365432375, "learning_rate": 1.4963936464466426e-06, "loss": 0.031, "step": 1855 }, { "epoch": 3.62, "grad_norm": 1.1142476014977438, "learning_rate": 1.4924824727713396e-06, "loss": 0.0232, "step": 1856 }, { "epoch": 3.63, "grad_norm": 1.2488624070794714, "learning_rate": 1.4885750315466856e-06, "loss": 0.0278, "step": 1857 }, { "epoch": 3.63, "grad_norm": 1.1576859764904872, "learning_rate": 1.4846713300375413e-06, "loss": 0.0209, "step": 1858 }, { "epoch": 3.63, "grad_norm": 1.2609745104709302, "learning_rate": 1.4807713755018133e-06, "loss": 0.0321, "step": 1859 }, { "epoch": 3.63, "grad_norm": 1.0941484745546355, "learning_rate": 1.4768751751904387e-06, "loss": 0.0259, "step": 1860 }, { "epoch": 3.63, "grad_norm": 1.2729754101481523, "learning_rate": 1.472982736347378e-06, "loss": 0.0302, "step": 1861 }, { "epoch": 3.64, "grad_norm": 1.304097568246886, "learning_rate": 1.4690940662095984e-06, "loss": 0.0276, "step": 1862 }, { "epoch": 3.64, "grad_norm": 1.0029450302351604, "learning_rate": 1.4652091720070573e-06, "loss": 0.0182, "step": 1863 }, { "epoch": 3.64, "grad_norm": 1.321709083723321, "learning_rate": 1.4613280609626928e-06, "loss": 0.0378, "step": 1864 }, { "epoch": 3.64, "grad_norm": 1.0963401201857914, "learning_rate": 1.4574507402924117e-06, "loss": 0.0264, "step": 1865 }, { "epoch": 3.64, "grad_norm": 1.243853346855574, "learning_rate": 1.4535772172050692e-06, "loss": 0.0423, "step": 1866 }, { "epoch": 3.65, "grad_norm": 1.2789935236988306, "learning_rate": 1.449707498902464e-06, "loss": 0.0332, "step": 1867 }, { "epoch": 3.65, "grad_norm": 1.3303224435645158, "learning_rate": 1.4458415925793196e-06, "loss": 0.0349, "step": 1868 }, { "epoch": 3.65, "grad_norm": 1.065082125710361, "learning_rate": 1.4419795054232702e-06, "loss": 0.0299, "step": 1869 }, { "epoch": 3.65, "grad_norm": 1.1075271193203666, "learning_rate": 1.4381212446148507e-06, "loss": 0.0328, "step": 1870 }, { "epoch": 3.65, "grad_norm": 1.066448908619228, "learning_rate": 1.4342668173274843e-06, "loss": 0.0301, "step": 1871 }, { "epoch": 3.66, "grad_norm": 1.3481921110521264, "learning_rate": 1.4304162307274625e-06, "loss": 0.0324, "step": 1872 }, { "epoch": 3.66, "grad_norm": 1.1617523824981781, "learning_rate": 1.4265694919739373e-06, "loss": 0.0296, "step": 1873 }, { "epoch": 3.66, "grad_norm": 1.3326599630225808, "learning_rate": 1.422726608218908e-06, "loss": 0.0289, "step": 1874 }, { "epoch": 3.66, "grad_norm": 1.0809185779162358, "learning_rate": 1.4188875866072074e-06, "loss": 0.022, "step": 1875 }, { "epoch": 3.66, "grad_norm": 1.364859053983178, "learning_rate": 1.4150524342764833e-06, "loss": 0.0361, "step": 1876 }, { "epoch": 3.67, "grad_norm": 1.4007588543013108, "learning_rate": 1.4112211583571942e-06, "loss": 0.0356, "step": 1877 }, { "epoch": 3.67, "grad_norm": 1.1438533280652077, "learning_rate": 1.4073937659725903e-06, "loss": 0.0195, "step": 1878 }, { "epoch": 3.67, "grad_norm": 1.3824256474802126, "learning_rate": 1.4035702642386989e-06, "loss": 0.0344, "step": 1879 }, { "epoch": 3.67, "grad_norm": 1.3213755854145708, "learning_rate": 1.399750660264317e-06, "loss": 0.0608, "step": 1880 }, { "epoch": 3.67, "grad_norm": 1.2728243859048418, "learning_rate": 1.3959349611509929e-06, "loss": 0.035, "step": 1881 }, { "epoch": 3.68, "grad_norm": 1.4144363753854376, "learning_rate": 1.3921231739930136e-06, "loss": 0.031, "step": 1882 }, { "epoch": 3.68, "grad_norm": 1.2159092377389762, "learning_rate": 1.3883153058773957e-06, "loss": 0.022, "step": 1883 }, { "epoch": 3.68, "grad_norm": 1.3234508532166798, "learning_rate": 1.384511363883869e-06, "loss": 0.0421, "step": 1884 }, { "epoch": 3.68, "grad_norm": 1.2293928184104064, "learning_rate": 1.380711355084861e-06, "loss": 0.0279, "step": 1885 }, { "epoch": 3.68, "grad_norm": 1.0344831038502846, "learning_rate": 1.3769152865454887e-06, "loss": 0.0181, "step": 1886 }, { "epoch": 3.69, "grad_norm": 1.2670720104303181, "learning_rate": 1.3731231653235445e-06, "loss": 0.0204, "step": 1887 }, { "epoch": 3.69, "grad_norm": 1.4723985715113623, "learning_rate": 1.3693349984694776e-06, "loss": 0.0257, "step": 1888 }, { "epoch": 3.69, "grad_norm": 1.2483206451566162, "learning_rate": 1.3655507930263885e-06, "loss": 0.033, "step": 1889 }, { "epoch": 3.69, "grad_norm": 1.308026730990303, "learning_rate": 1.3617705560300144e-06, "loss": 0.0324, "step": 1890 }, { "epoch": 3.69, "grad_norm": 1.122071171205585, "learning_rate": 1.3579942945087064e-06, "loss": 0.0285, "step": 1891 }, { "epoch": 3.7, "grad_norm": 1.2739761189650025, "learning_rate": 1.3542220154834316e-06, "loss": 0.0295, "step": 1892 }, { "epoch": 3.7, "grad_norm": 1.1464570336210467, "learning_rate": 1.3504537259677512e-06, "loss": 0.0233, "step": 1893 }, { "epoch": 3.7, "grad_norm": 1.207620773407543, "learning_rate": 1.3466894329678065e-06, "loss": 0.0315, "step": 1894 }, { "epoch": 3.7, "grad_norm": 1.205916869488868, "learning_rate": 1.3429291434823101e-06, "loss": 0.0279, "step": 1895 }, { "epoch": 3.7, "grad_norm": 1.2227433229450109, "learning_rate": 1.339172864502533e-06, "loss": 0.0316, "step": 1896 }, { "epoch": 3.71, "grad_norm": 1.2065528033663064, "learning_rate": 1.3354206030122852e-06, "loss": 0.0348, "step": 1897 }, { "epoch": 3.71, "grad_norm": 1.097516875183705, "learning_rate": 1.3316723659879105e-06, "loss": 0.0212, "step": 1898 }, { "epoch": 3.71, "grad_norm": 1.347593697005193, "learning_rate": 1.3279281603982706e-06, "loss": 0.0393, "step": 1899 }, { "epoch": 3.71, "grad_norm": 1.3264812337750855, "learning_rate": 1.32418799320473e-06, "loss": 0.0365, "step": 1900 }, { "epoch": 3.71, "grad_norm": 1.190072369984306, "learning_rate": 1.3204518713611436e-06, "loss": 0.0201, "step": 1901 }, { "epoch": 3.71, "grad_norm": 1.257887538656759, "learning_rate": 1.316719801813849e-06, "loss": 0.0295, "step": 1902 }, { "epoch": 3.72, "grad_norm": 1.2574991952324872, "learning_rate": 1.3129917915016482e-06, "loss": 0.0367, "step": 1903 }, { "epoch": 3.72, "grad_norm": 1.3515409313574076, "learning_rate": 1.3092678473557933e-06, "loss": 0.0275, "step": 1904 }, { "epoch": 3.72, "grad_norm": 1.1595997211507088, "learning_rate": 1.3055479762999807e-06, "loss": 0.0392, "step": 1905 }, { "epoch": 3.72, "grad_norm": 1.154244730504199, "learning_rate": 1.3018321852503304e-06, "loss": 0.0195, "step": 1906 }, { "epoch": 3.72, "grad_norm": 1.2709246925582818, "learning_rate": 1.2981204811153784e-06, "loss": 0.0173, "step": 1907 }, { "epoch": 3.73, "grad_norm": 1.130466027618193, "learning_rate": 1.294412870796064e-06, "loss": 0.0222, "step": 1908 }, { "epoch": 3.73, "grad_norm": 1.1407805839236786, "learning_rate": 1.2907093611857113e-06, "loss": 0.0166, "step": 1909 }, { "epoch": 3.73, "grad_norm": 1.0154857455543729, "learning_rate": 1.287009959170021e-06, "loss": 0.021, "step": 1910 }, { "epoch": 3.73, "grad_norm": 1.0052078816062804, "learning_rate": 1.283314671627059e-06, "loss": 0.0262, "step": 1911 }, { "epoch": 3.73, "grad_norm": 1.0636415444738863, "learning_rate": 1.2796235054272411e-06, "loss": 0.0291, "step": 1912 }, { "epoch": 3.74, "grad_norm": 1.1443606200858953, "learning_rate": 1.2759364674333183e-06, "loss": 0.0272, "step": 1913 }, { "epoch": 3.74, "grad_norm": 1.0527963062725316, "learning_rate": 1.2722535645003675e-06, "loss": 0.0218, "step": 1914 }, { "epoch": 3.74, "grad_norm": 1.3505899633335245, "learning_rate": 1.26857480347578e-06, "loss": 0.0359, "step": 1915 }, { "epoch": 3.74, "grad_norm": 0.9235192536894867, "learning_rate": 1.2649001911992413e-06, "loss": 0.0138, "step": 1916 }, { "epoch": 3.74, "grad_norm": 1.2451341560547524, "learning_rate": 1.2612297345027284e-06, "loss": 0.0238, "step": 1917 }, { "epoch": 3.75, "grad_norm": 1.2694852830395116, "learning_rate": 1.2575634402104883e-06, "loss": 0.0275, "step": 1918 }, { "epoch": 3.75, "grad_norm": 1.2082460029180693, "learning_rate": 1.2539013151390298e-06, "loss": 0.0332, "step": 1919 }, { "epoch": 3.75, "grad_norm": 1.3847199115932096, "learning_rate": 1.2502433660971122e-06, "loss": 0.0359, "step": 1920 }, { "epoch": 3.75, "grad_norm": 1.435250133720234, "learning_rate": 1.2465895998857306e-06, "loss": 0.0238, "step": 1921 }, { "epoch": 3.75, "grad_norm": 1.1495771853062136, "learning_rate": 1.2429400232980989e-06, "loss": 0.0293, "step": 1922 }, { "epoch": 3.76, "grad_norm": 1.4178976655932043, "learning_rate": 1.2392946431196465e-06, "loss": 0.0355, "step": 1923 }, { "epoch": 3.76, "grad_norm": 1.3997238525713893, "learning_rate": 1.2356534661279994e-06, "loss": 0.0275, "step": 1924 }, { "epoch": 3.76, "grad_norm": 1.1186261667110438, "learning_rate": 1.2320164990929661e-06, "loss": 0.0196, "step": 1925 }, { "epoch": 3.76, "grad_norm": 0.9943438242646123, "learning_rate": 1.2283837487765322e-06, "loss": 0.0228, "step": 1926 }, { "epoch": 3.76, "grad_norm": 1.26586132093406, "learning_rate": 1.22475522193284e-06, "loss": 0.0285, "step": 1927 }, { "epoch": 3.77, "grad_norm": 1.1675040853803806, "learning_rate": 1.2211309253081786e-06, "loss": 0.0296, "step": 1928 }, { "epoch": 3.77, "grad_norm": 1.2363100088943126, "learning_rate": 1.2175108656409762e-06, "loss": 0.0435, "step": 1929 }, { "epoch": 3.77, "grad_norm": 1.2935193936402838, "learning_rate": 1.213895049661782e-06, "loss": 0.0412, "step": 1930 }, { "epoch": 3.77, "grad_norm": 1.0852066329710008, "learning_rate": 1.2102834840932523e-06, "loss": 0.0274, "step": 1931 }, { "epoch": 3.77, "grad_norm": 1.373275520709265, "learning_rate": 1.2066761756501436e-06, "loss": 0.0306, "step": 1932 }, { "epoch": 3.78, "grad_norm": 0.9920167126920142, "learning_rate": 1.2030731310392987e-06, "loss": 0.0165, "step": 1933 }, { "epoch": 3.78, "grad_norm": 1.3256202989140102, "learning_rate": 1.1994743569596289e-06, "loss": 0.0407, "step": 1934 }, { "epoch": 3.78, "grad_norm": 1.1298966131589396, "learning_rate": 1.195879860102109e-06, "loss": 0.0269, "step": 1935 }, { "epoch": 3.78, "grad_norm": 1.0649339831163818, "learning_rate": 1.192289647149759e-06, "loss": 0.0328, "step": 1936 }, { "epoch": 3.78, "grad_norm": 1.3067079154357142, "learning_rate": 1.188703724777637e-06, "loss": 0.0395, "step": 1937 }, { "epoch": 3.79, "grad_norm": 1.1987244203595413, "learning_rate": 1.1851220996528198e-06, "loss": 0.036, "step": 1938 }, { "epoch": 3.79, "grad_norm": 1.2543350480996123, "learning_rate": 1.1815447784343984e-06, "loss": 0.0465, "step": 1939 }, { "epoch": 3.79, "grad_norm": 1.1269204611398103, "learning_rate": 1.1779717677734615e-06, "loss": 0.0296, "step": 1940 }, { "epoch": 3.79, "grad_norm": 0.9770362396693055, "learning_rate": 1.17440307431308e-06, "loss": 0.0199, "step": 1941 }, { "epoch": 3.79, "grad_norm": 0.9452916765601559, "learning_rate": 1.1708387046883027e-06, "loss": 0.0166, "step": 1942 }, { "epoch": 3.79, "grad_norm": 1.0997430064715286, "learning_rate": 1.1672786655261346e-06, "loss": 0.0256, "step": 1943 }, { "epoch": 3.8, "grad_norm": 1.2671391146419708, "learning_rate": 1.1637229634455348e-06, "loss": 0.0318, "step": 1944 }, { "epoch": 3.8, "grad_norm": 1.0336865072933656, "learning_rate": 1.160171605057393e-06, "loss": 0.0305, "step": 1945 }, { "epoch": 3.8, "grad_norm": 1.2778089252880755, "learning_rate": 1.1566245969645276e-06, "loss": 0.0365, "step": 1946 }, { "epoch": 3.8, "grad_norm": 1.2462437117428917, "learning_rate": 1.1530819457616656e-06, "loss": 0.0264, "step": 1947 }, { "epoch": 3.8, "grad_norm": 1.1550838744592722, "learning_rate": 1.1495436580354353e-06, "loss": 0.0371, "step": 1948 }, { "epoch": 3.81, "grad_norm": 0.9346862337613993, "learning_rate": 1.1460097403643532e-06, "loss": 0.0151, "step": 1949 }, { "epoch": 3.81, "grad_norm": 1.0064174444129326, "learning_rate": 1.142480199318807e-06, "loss": 0.0164, "step": 1950 }, { "epoch": 3.81, "grad_norm": 1.2906686073476068, "learning_rate": 1.1389550414610507e-06, "loss": 0.0366, "step": 1951 }, { "epoch": 3.81, "grad_norm": 1.02989680443413, "learning_rate": 1.135434273345189e-06, "loss": 0.0208, "step": 1952 }, { "epoch": 3.81, "grad_norm": 1.097497822836905, "learning_rate": 1.1319179015171633e-06, "loss": 0.0219, "step": 1953 }, { "epoch": 3.82, "grad_norm": 1.45891681813863, "learning_rate": 1.1284059325147396e-06, "loss": 0.0347, "step": 1954 }, { "epoch": 3.82, "grad_norm": 1.0624211268526589, "learning_rate": 1.1248983728675037e-06, "loss": 0.0185, "step": 1955 }, { "epoch": 3.82, "grad_norm": 1.1707041336326252, "learning_rate": 1.1213952290968368e-06, "loss": 0.0276, "step": 1956 }, { "epoch": 3.82, "grad_norm": 1.3613054732397072, "learning_rate": 1.1178965077159144e-06, "loss": 0.0352, "step": 1957 }, { "epoch": 3.82, "grad_norm": 1.307396288967398, "learning_rate": 1.1144022152296895e-06, "loss": 0.0255, "step": 1958 }, { "epoch": 3.83, "grad_norm": 1.319598329370049, "learning_rate": 1.110912358134877e-06, "loss": 0.0311, "step": 1959 }, { "epoch": 3.83, "grad_norm": 1.213679363570775, "learning_rate": 1.1074269429199503e-06, "loss": 0.0243, "step": 1960 }, { "epoch": 3.83, "grad_norm": 1.3356865875507167, "learning_rate": 1.1039459760651216e-06, "loss": 0.0308, "step": 1961 }, { "epoch": 3.83, "grad_norm": 1.1550184747262129, "learning_rate": 1.1004694640423325e-06, "loss": 0.0175, "step": 1962 }, { "epoch": 3.83, "grad_norm": 1.5216250815692158, "learning_rate": 1.0969974133152416e-06, "loss": 0.0284, "step": 1963 }, { "epoch": 3.84, "grad_norm": 1.1942216642063672, "learning_rate": 1.093529830339214e-06, "loss": 0.0212, "step": 1964 }, { "epoch": 3.84, "grad_norm": 1.430729880011208, "learning_rate": 1.09006672156131e-06, "loss": 0.0229, "step": 1965 }, { "epoch": 3.84, "grad_norm": 1.2064008208450459, "learning_rate": 1.0866080934202657e-06, "loss": 0.0238, "step": 1966 }, { "epoch": 3.84, "grad_norm": 1.2799176834038355, "learning_rate": 1.0831539523464935e-06, "loss": 0.0232, "step": 1967 }, { "epoch": 3.84, "grad_norm": 1.2411536551243383, "learning_rate": 1.0797043047620575e-06, "loss": 0.0317, "step": 1968 }, { "epoch": 3.85, "grad_norm": 1.0815984767682005, "learning_rate": 1.0762591570806703e-06, "loss": 0.0319, "step": 1969 }, { "epoch": 3.85, "grad_norm": 1.2482680560383905, "learning_rate": 1.072818515707679e-06, "loss": 0.0273, "step": 1970 }, { "epoch": 3.85, "grad_norm": 1.3175799508013413, "learning_rate": 1.0693823870400503e-06, "loss": 0.0323, "step": 1971 }, { "epoch": 3.85, "grad_norm": 1.1711396595016155, "learning_rate": 1.0659507774663595e-06, "loss": 0.0262, "step": 1972 }, { "epoch": 3.85, "grad_norm": 1.126643247996475, "learning_rate": 1.0625236933667838e-06, "loss": 0.0135, "step": 1973 }, { "epoch": 3.86, "grad_norm": 0.9561003239630204, "learning_rate": 1.0591011411130844e-06, "loss": 0.0208, "step": 1974 }, { "epoch": 3.86, "grad_norm": 1.173685085745822, "learning_rate": 1.0556831270685953e-06, "loss": 0.0358, "step": 1975 }, { "epoch": 3.86, "grad_norm": 1.4017650614718251, "learning_rate": 1.0522696575882148e-06, "loss": 0.0227, "step": 1976 }, { "epoch": 3.86, "grad_norm": 1.251503800255131, "learning_rate": 1.048860739018393e-06, "loss": 0.0356, "step": 1977 }, { "epoch": 3.86, "grad_norm": 1.2436863273724446, "learning_rate": 1.0454563776971147e-06, "loss": 0.0264, "step": 1978 }, { "epoch": 3.87, "grad_norm": 1.3330336322045298, "learning_rate": 1.042056579953895e-06, "loss": 0.037, "step": 1979 }, { "epoch": 3.87, "grad_norm": 1.2559297328924741, "learning_rate": 1.0386613521097656e-06, "loss": 0.0382, "step": 1980 }, { "epoch": 3.87, "grad_norm": 1.351444723093487, "learning_rate": 1.0352707004772549e-06, "loss": 0.0312, "step": 1981 }, { "epoch": 3.87, "grad_norm": 1.2440430032475671, "learning_rate": 1.0318846313603895e-06, "loss": 0.0287, "step": 1982 }, { "epoch": 3.87, "grad_norm": 1.2573769324421706, "learning_rate": 1.0285031510546756e-06, "loss": 0.0431, "step": 1983 }, { "epoch": 3.88, "grad_norm": 1.428994061266696, "learning_rate": 1.0251262658470838e-06, "loss": 0.0283, "step": 1984 }, { "epoch": 3.88, "grad_norm": 0.9889040531771097, "learning_rate": 1.0217539820160445e-06, "loss": 0.0164, "step": 1985 }, { "epoch": 3.88, "grad_norm": 1.1131130147423873, "learning_rate": 1.0183863058314338e-06, "loss": 0.0256, "step": 1986 }, { "epoch": 3.88, "grad_norm": 1.229835399804949, "learning_rate": 1.0150232435545569e-06, "loss": 0.043, "step": 1987 }, { "epoch": 3.88, "grad_norm": 1.4124251152734235, "learning_rate": 1.0116648014381442e-06, "loss": 0.051, "step": 1988 }, { "epoch": 3.88, "grad_norm": 1.3284771362249352, "learning_rate": 1.0083109857263376e-06, "loss": 0.0238, "step": 1989 }, { "epoch": 3.89, "grad_norm": 0.8902130690649548, "learning_rate": 1.0049618026546712e-06, "loss": 0.0195, "step": 1990 }, { "epoch": 3.89, "grad_norm": 1.3798899382101377, "learning_rate": 1.001617258450071e-06, "loss": 0.0466, "step": 1991 }, { "epoch": 3.89, "grad_norm": 1.0088751363814163, "learning_rate": 9.982773593308383e-07, "loss": 0.0201, "step": 1992 }, { "epoch": 3.89, "grad_norm": 1.1045022029920553, "learning_rate": 9.94942111506635e-07, "loss": 0.0197, "step": 1993 }, { "epoch": 3.89, "grad_norm": 1.0692550351024177, "learning_rate": 9.916115211784778e-07, "loss": 0.0279, "step": 1994 }, { "epoch": 3.9, "grad_norm": 1.0376729907824567, "learning_rate": 9.882855945387237e-07, "loss": 0.0248, "step": 1995 }, { "epoch": 3.9, "grad_norm": 1.0337761702358494, "learning_rate": 9.849643377710566e-07, "loss": 0.0153, "step": 1996 }, { "epoch": 3.9, "grad_norm": 1.0104213437895706, "learning_rate": 9.816477570504808e-07, "loss": 0.0258, "step": 1997 }, { "epoch": 3.9, "grad_norm": 0.9619415153182648, "learning_rate": 9.78335858543306e-07, "loss": 0.0179, "step": 1998 }, { "epoch": 3.9, "grad_norm": 1.182436061169276, "learning_rate": 9.750286484071358e-07, "loss": 0.0309, "step": 1999 }, { "epoch": 3.91, "grad_norm": 1.2146759314218767, "learning_rate": 9.717261327908557e-07, "loss": 0.0301, "step": 2000 }, { "epoch": 3.91, "grad_norm": 1.057644696702824, "learning_rate": 9.684283178346259e-07, "loss": 0.0298, "step": 2001 }, { "epoch": 3.91, "grad_norm": 1.2030129179862126, "learning_rate": 9.651352096698663e-07, "loss": 0.0324, "step": 2002 }, { "epoch": 3.91, "grad_norm": 1.152294129707474, "learning_rate": 9.61846814419243e-07, "loss": 0.0245, "step": 2003 }, { "epoch": 3.91, "grad_norm": 1.2894783272746357, "learning_rate": 9.585631381966645e-07, "loss": 0.0359, "step": 2004 }, { "epoch": 3.92, "grad_norm": 1.295073865802885, "learning_rate": 9.552841871072603e-07, "loss": 0.0297, "step": 2005 }, { "epoch": 3.92, "grad_norm": 0.9583138753485414, "learning_rate": 9.520099672473782e-07, "loss": 0.0193, "step": 2006 }, { "epoch": 3.92, "grad_norm": 1.3444982870497284, "learning_rate": 9.487404847045695e-07, "loss": 0.0308, "step": 2007 }, { "epoch": 3.92, "grad_norm": 0.9840851595210974, "learning_rate": 9.454757455575762e-07, "loss": 0.0266, "step": 2008 }, { "epoch": 3.92, "grad_norm": 1.3606450234354168, "learning_rate": 9.422157558763201e-07, "loss": 0.0331, "step": 2009 }, { "epoch": 3.93, "grad_norm": 1.3439116240283324, "learning_rate": 9.389605217218959e-07, "loss": 0.0389, "step": 2010 }, { "epoch": 3.93, "grad_norm": 1.1215790184167365, "learning_rate": 9.357100491465556e-07, "loss": 0.0196, "step": 2011 }, { "epoch": 3.93, "grad_norm": 1.1730616855331002, "learning_rate": 9.324643441936959e-07, "loss": 0.0311, "step": 2012 }, { "epoch": 3.93, "grad_norm": 1.065120925378081, "learning_rate": 9.292234128978525e-07, "loss": 0.0176, "step": 2013 }, { "epoch": 3.93, "grad_norm": 1.3626936398303806, "learning_rate": 9.25987261284685e-07, "loss": 0.0344, "step": 2014 }, { "epoch": 3.94, "grad_norm": 1.3271599330703854, "learning_rate": 9.227558953709638e-07, "loss": 0.0531, "step": 2015 }, { "epoch": 3.94, "grad_norm": 1.2249757377021109, "learning_rate": 9.195293211645661e-07, "loss": 0.0318, "step": 2016 }, { "epoch": 3.94, "grad_norm": 1.291643860362483, "learning_rate": 9.163075446644564e-07, "loss": 0.026, "step": 2017 }, { "epoch": 3.94, "grad_norm": 1.1593279501791598, "learning_rate": 9.130905718606795e-07, "loss": 0.0314, "step": 2018 }, { "epoch": 3.94, "grad_norm": 1.1798748943651258, "learning_rate": 9.098784087343511e-07, "loss": 0.0256, "step": 2019 }, { "epoch": 3.95, "grad_norm": 1.1364062029053892, "learning_rate": 9.066710612576439e-07, "loss": 0.0249, "step": 2020 }, { "epoch": 3.95, "grad_norm": 1.5644737718020165, "learning_rate": 9.034685353937748e-07, "loss": 0.0549, "step": 2021 }, { "epoch": 3.95, "grad_norm": 1.078578913216433, "learning_rate": 9.002708370969993e-07, "loss": 0.0223, "step": 2022 }, { "epoch": 3.95, "grad_norm": 1.268170181879783, "learning_rate": 8.97077972312597e-07, "loss": 0.0341, "step": 2023 }, { "epoch": 3.95, "grad_norm": 1.2983133288450885, "learning_rate": 8.938899469768581e-07, "loss": 0.0306, "step": 2024 }, { "epoch": 3.96, "grad_norm": 1.0787349410482154, "learning_rate": 8.907067670170782e-07, "loss": 0.0327, "step": 2025 }, { "epoch": 3.96, "grad_norm": 1.0132338388279627, "learning_rate": 8.875284383515417e-07, "loss": 0.0242, "step": 2026 }, { "epoch": 3.96, "grad_norm": 1.2604031978393024, "learning_rate": 8.843549668895162e-07, "loss": 0.0451, "step": 2027 }, { "epoch": 3.96, "grad_norm": 1.1549029463421008, "learning_rate": 8.811863585312348e-07, "loss": 0.0143, "step": 2028 }, { "epoch": 3.96, "grad_norm": 1.1931150048248262, "learning_rate": 8.780226191678929e-07, "loss": 0.0267, "step": 2029 }, { "epoch": 3.96, "grad_norm": 1.1609571532226182, "learning_rate": 8.748637546816303e-07, "loss": 0.0222, "step": 2030 }, { "epoch": 3.97, "grad_norm": 1.1897097595203816, "learning_rate": 8.717097709455242e-07, "loss": 0.0281, "step": 2031 }, { "epoch": 3.97, "grad_norm": 1.0766155470931131, "learning_rate": 8.685606738235796e-07, "loss": 0.022, "step": 2032 }, { "epoch": 3.97, "grad_norm": 1.2162073676604235, "learning_rate": 8.654164691707113e-07, "loss": 0.0362, "step": 2033 }, { "epoch": 3.97, "grad_norm": 1.1804861594035352, "learning_rate": 8.622771628327429e-07, "loss": 0.0191, "step": 2034 }, { "epoch": 3.97, "grad_norm": 1.3337375792048467, "learning_rate": 8.591427606463867e-07, "loss": 0.04, "step": 2035 }, { "epoch": 3.98, "grad_norm": 1.146724653394522, "learning_rate": 8.560132684392404e-07, "loss": 0.0357, "step": 2036 }, { "epoch": 3.98, "grad_norm": 1.1813442814444381, "learning_rate": 8.528886920297698e-07, "loss": 0.0296, "step": 2037 }, { "epoch": 3.98, "grad_norm": 1.246359981123067, "learning_rate": 8.49769037227304e-07, "loss": 0.0281, "step": 2038 }, { "epoch": 3.98, "grad_norm": 1.2379955161970018, "learning_rate": 8.466543098320205e-07, "loss": 0.0232, "step": 2039 }, { "epoch": 3.98, "grad_norm": 1.342365026108788, "learning_rate": 8.435445156349334e-07, "loss": 0.0162, "step": 2040 }, { "epoch": 3.99, "grad_norm": 1.1567815798604482, "learning_rate": 8.404396604178883e-07, "loss": 0.0378, "step": 2041 }, { "epoch": 3.99, "grad_norm": 0.9350678446489761, "learning_rate": 8.373397499535475e-07, "loss": 0.0183, "step": 2042 }, { "epoch": 3.99, "grad_norm": 1.1702603441170645, "learning_rate": 8.342447900053779e-07, "loss": 0.0238, "step": 2043 }, { "epoch": 3.99, "grad_norm": 1.0159567198998871, "learning_rate": 8.311547863276417e-07, "loss": 0.0335, "step": 2044 }, { "epoch": 3.99, "grad_norm": 1.2268119248723326, "learning_rate": 8.280697446653906e-07, "loss": 0.0264, "step": 2045 }, { "epoch": 4.0, "grad_norm": 1.063938689916454, "learning_rate": 8.249896707544451e-07, "loss": 0.0232, "step": 2046 }, { "epoch": 4.0, "grad_norm": 1.1600596650138715, "learning_rate": 8.219145703213937e-07, "loss": 0.0307, "step": 2047 }, { "epoch": 4.0, "grad_norm": 1.0871463778848507, "learning_rate": 8.188444490835774e-07, "loss": 0.0248, "step": 2048 }, { "epoch": 4.0, "grad_norm": 1.2167220500854348, "learning_rate": 8.157793127490769e-07, "loss": 0.0308, "step": 2049 }, { "epoch": 4.0, "grad_norm": 1.1741498242797797, "learning_rate": 8.127191670167078e-07, "loss": 0.0288, "step": 2050 }, { "epoch": 4.01, "grad_norm": 1.218657993553999, "learning_rate": 8.096640175760066e-07, "loss": 0.022, "step": 2051 }, { "epoch": 4.01, "grad_norm": 1.29554562323444, "learning_rate": 8.066138701072195e-07, "loss": 0.0272, "step": 2052 }, { "epoch": 4.01, "grad_norm": 1.1435304347268076, "learning_rate": 8.035687302812919e-07, "loss": 0.0327, "step": 2053 }, { "epoch": 4.01, "grad_norm": 1.1298236756570115, "learning_rate": 8.005286037598621e-07, "loss": 0.0231, "step": 2054 }, { "epoch": 4.01, "grad_norm": 1.320419139301207, "learning_rate": 7.974934961952433e-07, "loss": 0.0259, "step": 2055 }, { "epoch": 4.02, "grad_norm": 1.1722608748176762, "learning_rate": 7.944634132304205e-07, "loss": 0.0281, "step": 2056 }, { "epoch": 4.02, "grad_norm": 1.271342998513779, "learning_rate": 7.914383604990372e-07, "loss": 0.0372, "step": 2057 }, { "epoch": 4.02, "grad_norm": 1.043555148187623, "learning_rate": 7.884183436253804e-07, "loss": 0.0218, "step": 2058 }, { "epoch": 4.02, "grad_norm": 1.128323766348588, "learning_rate": 7.854033682243785e-07, "loss": 0.0248, "step": 2059 }, { "epoch": 4.02, "grad_norm": 1.4261426854622474, "learning_rate": 7.823934399015856e-07, "loss": 0.0329, "step": 2060 }, { "epoch": 4.03, "grad_norm": 1.1456443678582575, "learning_rate": 7.793885642531703e-07, "loss": 0.0329, "step": 2061 }, { "epoch": 4.03, "grad_norm": 1.206943025168992, "learning_rate": 7.763887468659081e-07, "loss": 0.0404, "step": 2062 }, { "epoch": 4.03, "grad_norm": 1.2730736521053825, "learning_rate": 7.733939933171702e-07, "loss": 0.0236, "step": 2063 }, { "epoch": 4.03, "grad_norm": 0.9256023097496454, "learning_rate": 7.704043091749143e-07, "loss": 0.0177, "step": 2064 }, { "epoch": 4.03, "grad_norm": 1.1993246655739707, "learning_rate": 7.674196999976693e-07, "loss": 0.0186, "step": 2065 }, { "epoch": 4.04, "grad_norm": 1.2420996560246957, "learning_rate": 7.644401713345332e-07, "loss": 0.0262, "step": 2066 }, { "epoch": 4.04, "grad_norm": 1.1362024675754412, "learning_rate": 7.614657287251531e-07, "loss": 0.0276, "step": 2067 }, { "epoch": 4.04, "grad_norm": 1.1740392186961193, "learning_rate": 7.584963776997237e-07, "loss": 0.0247, "step": 2068 }, { "epoch": 4.04, "grad_norm": 1.3625317804540418, "learning_rate": 7.555321237789723e-07, "loss": 0.0216, "step": 2069 }, { "epoch": 4.04, "grad_norm": 1.4243480323533864, "learning_rate": 7.525729724741495e-07, "loss": 0.0275, "step": 2070 }, { "epoch": 4.04, "grad_norm": 1.0134511701365687, "learning_rate": 7.496189292870161e-07, "loss": 0.0173, "step": 2071 }, { "epoch": 4.05, "grad_norm": 1.274923630876148, "learning_rate": 7.466699997098405e-07, "loss": 0.0176, "step": 2072 }, { "epoch": 4.05, "grad_norm": 0.7181514676394142, "learning_rate": 7.437261892253815e-07, "loss": 0.0088, "step": 2073 }, { "epoch": 4.05, "grad_norm": 1.1636954578020906, "learning_rate": 7.407875033068782e-07, "loss": 0.0191, "step": 2074 }, { "epoch": 4.05, "grad_norm": 0.8762456316014725, "learning_rate": 7.378539474180453e-07, "loss": 0.0173, "step": 2075 }, { "epoch": 4.05, "grad_norm": 1.042034733773421, "learning_rate": 7.349255270130589e-07, "loss": 0.018, "step": 2076 }, { "epoch": 4.06, "grad_norm": 1.2698681095658246, "learning_rate": 7.320022475365443e-07, "loss": 0.0316, "step": 2077 }, { "epoch": 4.06, "grad_norm": 1.0612479783980844, "learning_rate": 7.290841144235711e-07, "loss": 0.0192, "step": 2078 }, { "epoch": 4.06, "grad_norm": 1.1212948887437069, "learning_rate": 7.261711330996429e-07, "loss": 0.0294, "step": 2079 }, { "epoch": 4.06, "grad_norm": 1.2374532502010793, "learning_rate": 7.232633089806773e-07, "loss": 0.0357, "step": 2080 }, { "epoch": 4.06, "grad_norm": 0.9119528953076954, "learning_rate": 7.203606474730107e-07, "loss": 0.0177, "step": 2081 }, { "epoch": 4.07, "grad_norm": 0.9282745199416229, "learning_rate": 7.174631539733795e-07, "loss": 0.0199, "step": 2082 }, { "epoch": 4.07, "grad_norm": 1.1372575605560875, "learning_rate": 7.145708338689079e-07, "loss": 0.0191, "step": 2083 }, { "epoch": 4.07, "grad_norm": 1.0853249679321468, "learning_rate": 7.116836925371055e-07, "loss": 0.0139, "step": 2084 }, { "epoch": 4.07, "grad_norm": 1.1052038123081154, "learning_rate": 7.088017353458533e-07, "loss": 0.017, "step": 2085 }, { "epoch": 4.07, "grad_norm": 1.0822926265815054, "learning_rate": 7.059249676533898e-07, "loss": 0.011, "step": 2086 }, { "epoch": 4.08, "grad_norm": 2.5314907834816083, "learning_rate": 7.03053394808309e-07, "loss": 0.0348, "step": 2087 }, { "epoch": 4.08, "grad_norm": 1.3935234898529159, "learning_rate": 7.001870221495463e-07, "loss": 0.0165, "step": 2088 }, { "epoch": 4.08, "grad_norm": 0.9944303402989906, "learning_rate": 6.973258550063658e-07, "loss": 0.0165, "step": 2089 }, { "epoch": 4.08, "grad_norm": 0.8181453638907159, "learning_rate": 6.944698986983546e-07, "loss": 0.0167, "step": 2090 }, { "epoch": 4.08, "grad_norm": 0.9432345361198997, "learning_rate": 6.91619158535414e-07, "loss": 0.0163, "step": 2091 }, { "epoch": 4.09, "grad_norm": 1.3204297239646827, "learning_rate": 6.88773639817743e-07, "loss": 0.0174, "step": 2092 }, { "epoch": 4.09, "grad_norm": 0.9806708832119574, "learning_rate": 6.859333478358361e-07, "loss": 0.02, "step": 2093 }, { "epoch": 4.09, "grad_norm": 1.0449316580020471, "learning_rate": 6.830982878704702e-07, "loss": 0.0241, "step": 2094 }, { "epoch": 4.09, "grad_norm": 1.0574692387855231, "learning_rate": 6.802684651926911e-07, "loss": 0.0208, "step": 2095 }, { "epoch": 4.09, "grad_norm": 1.0905401456973403, "learning_rate": 6.774438850638107e-07, "loss": 0.0296, "step": 2096 }, { "epoch": 4.1, "grad_norm": 1.027963331169256, "learning_rate": 6.74624552735393e-07, "loss": 0.0286, "step": 2097 }, { "epoch": 4.1, "grad_norm": 1.0584082847048828, "learning_rate": 6.718104734492447e-07, "loss": 0.0244, "step": 2098 }, { "epoch": 4.1, "grad_norm": 1.2496331854179246, "learning_rate": 6.69001652437404e-07, "loss": 0.0261, "step": 2099 }, { "epoch": 4.1, "grad_norm": 1.0000862432682744, "learning_rate": 6.661980949221356e-07, "loss": 0.0183, "step": 2100 }, { "epoch": 4.1, "grad_norm": 0.9068631395717704, "learning_rate": 6.633998061159187e-07, "loss": 0.0164, "step": 2101 }, { "epoch": 4.11, "grad_norm": 1.6062961529238173, "learning_rate": 6.606067912214323e-07, "loss": 0.0214, "step": 2102 }, { "epoch": 4.11, "grad_norm": 1.060632114859284, "learning_rate": 6.578190554315545e-07, "loss": 0.0322, "step": 2103 }, { "epoch": 4.11, "grad_norm": 0.8941853939787111, "learning_rate": 6.550366039293471e-07, "loss": 0.0135, "step": 2104 }, { "epoch": 4.11, "grad_norm": 1.2081745425342696, "learning_rate": 6.522594418880442e-07, "loss": 0.0169, "step": 2105 }, { "epoch": 4.11, "grad_norm": 1.0197431457437356, "learning_rate": 6.494875744710507e-07, "loss": 0.0279, "step": 2106 }, { "epoch": 4.12, "grad_norm": 0.9164681552352628, "learning_rate": 6.467210068319233e-07, "loss": 0.0135, "step": 2107 }, { "epoch": 4.12, "grad_norm": 1.0961587368192238, "learning_rate": 6.439597441143655e-07, "loss": 0.03, "step": 2108 }, { "epoch": 4.12, "grad_norm": 0.9759905386913261, "learning_rate": 6.412037914522204e-07, "loss": 0.0186, "step": 2109 }, { "epoch": 4.12, "grad_norm": 1.2765748683172213, "learning_rate": 6.384531539694574e-07, "loss": 0.0191, "step": 2110 }, { "epoch": 4.12, "grad_norm": 0.9254934153704132, "learning_rate": 6.357078367801617e-07, "loss": 0.0186, "step": 2111 }, { "epoch": 4.12, "grad_norm": 1.435069195018503, "learning_rate": 6.329678449885283e-07, "loss": 0.0122, "step": 2112 }, { "epoch": 4.13, "grad_norm": 0.9095221403980838, "learning_rate": 6.302331836888529e-07, "loss": 0.0128, "step": 2113 }, { "epoch": 4.13, "grad_norm": 0.7359230970319034, "learning_rate": 6.275038579655167e-07, "loss": 0.0096, "step": 2114 }, { "epoch": 4.13, "grad_norm": 0.9902043000672235, "learning_rate": 6.24779872892984e-07, "loss": 0.0166, "step": 2115 }, { "epoch": 4.13, "grad_norm": 1.1474363049933791, "learning_rate": 6.22061233535788e-07, "loss": 0.0138, "step": 2116 }, { "epoch": 4.13, "grad_norm": 1.0095305162686248, "learning_rate": 6.193479449485223e-07, "loss": 0.0234, "step": 2117 }, { "epoch": 4.14, "grad_norm": 0.7602331885945934, "learning_rate": 6.166400121758337e-07, "loss": 0.0148, "step": 2118 }, { "epoch": 4.14, "grad_norm": 0.8500651310419827, "learning_rate": 6.139374402524123e-07, "loss": 0.0143, "step": 2119 }, { "epoch": 4.14, "grad_norm": 0.8616947112942298, "learning_rate": 6.112402342029767e-07, "loss": 0.0126, "step": 2120 }, { "epoch": 4.14, "grad_norm": 0.8566058595018946, "learning_rate": 6.08548399042274e-07, "loss": 0.0145, "step": 2121 }, { "epoch": 4.14, "grad_norm": 0.9919097937221366, "learning_rate": 6.058619397750635e-07, "loss": 0.0243, "step": 2122 }, { "epoch": 4.15, "grad_norm": 0.8868317721889021, "learning_rate": 6.03180861396108e-07, "loss": 0.0137, "step": 2123 }, { "epoch": 4.15, "grad_norm": 1.0447106287791763, "learning_rate": 6.005051688901686e-07, "loss": 0.0163, "step": 2124 }, { "epoch": 4.15, "grad_norm": 0.8040617691458385, "learning_rate": 5.978348672319908e-07, "loss": 0.015, "step": 2125 }, { "epoch": 4.15, "grad_norm": 0.9436116232310937, "learning_rate": 5.951699613862985e-07, "loss": 0.0149, "step": 2126 }, { "epoch": 4.15, "grad_norm": 1.0373610856422906, "learning_rate": 5.925104563077817e-07, "loss": 0.0213, "step": 2127 }, { "epoch": 4.16, "grad_norm": 1.0952664367758995, "learning_rate": 5.898563569410913e-07, "loss": 0.0298, "step": 2128 }, { "epoch": 4.16, "grad_norm": 0.9746575063557484, "learning_rate": 5.87207668220828e-07, "loss": 0.0199, "step": 2129 }, { "epoch": 4.16, "grad_norm": 1.1027828105668323, "learning_rate": 5.845643950715289e-07, "loss": 0.0135, "step": 2130 }, { "epoch": 4.16, "grad_norm": 1.0741055523667882, "learning_rate": 5.819265424076679e-07, "loss": 0.0196, "step": 2131 }, { "epoch": 4.16, "grad_norm": 1.193890318907871, "learning_rate": 5.79294115133635e-07, "loss": 0.0313, "step": 2132 }, { "epoch": 4.17, "grad_norm": 1.1342607487096874, "learning_rate": 5.766671181437387e-07, "loss": 0.0223, "step": 2133 }, { "epoch": 4.17, "grad_norm": 0.8625512819119924, "learning_rate": 5.740455563221866e-07, "loss": 0.0142, "step": 2134 }, { "epoch": 4.17, "grad_norm": 1.1020591482278288, "learning_rate": 5.714294345430853e-07, "loss": 0.0232, "step": 2135 }, { "epoch": 4.17, "grad_norm": 0.9482560333332212, "learning_rate": 5.688187576704227e-07, "loss": 0.0178, "step": 2136 }, { "epoch": 4.17, "grad_norm": 0.8719017876644144, "learning_rate": 5.662135305580667e-07, "loss": 0.0122, "step": 2137 }, { "epoch": 4.18, "grad_norm": 0.8828781695150704, "learning_rate": 5.636137580497524e-07, "loss": 0.0193, "step": 2138 }, { "epoch": 4.18, "grad_norm": 0.8288226180449051, "learning_rate": 5.610194449790711e-07, "loss": 0.0151, "step": 2139 }, { "epoch": 4.18, "grad_norm": 0.963336669529624, "learning_rate": 5.584305961694664e-07, "loss": 0.019, "step": 2140 }, { "epoch": 4.18, "grad_norm": 0.9425573624962074, "learning_rate": 5.558472164342222e-07, "loss": 0.0164, "step": 2141 }, { "epoch": 4.18, "grad_norm": 1.0776019692425183, "learning_rate": 5.532693105764526e-07, "loss": 0.0274, "step": 2142 }, { "epoch": 4.19, "grad_norm": 1.085335440651618, "learning_rate": 5.506968833890943e-07, "loss": 0.0214, "step": 2143 }, { "epoch": 4.19, "grad_norm": 1.0435068426087146, "learning_rate": 5.481299396549007e-07, "loss": 0.0167, "step": 2144 }, { "epoch": 4.19, "grad_norm": 0.9728974775963855, "learning_rate": 5.455684841464266e-07, "loss": 0.0153, "step": 2145 }, { "epoch": 4.19, "grad_norm": 0.7851788258539769, "learning_rate": 5.43012521626025e-07, "loss": 0.0137, "step": 2146 }, { "epoch": 4.19, "grad_norm": 0.8794168302512871, "learning_rate": 5.404620568458372e-07, "loss": 0.0135, "step": 2147 }, { "epoch": 4.2, "grad_norm": 0.8843867940842597, "learning_rate": 5.379170945477797e-07, "loss": 0.0144, "step": 2148 }, { "epoch": 4.2, "grad_norm": 0.9045697740624453, "learning_rate": 5.353776394635403e-07, "loss": 0.0143, "step": 2149 }, { "epoch": 4.2, "grad_norm": 0.8680216536427943, "learning_rate": 5.328436963145696e-07, "loss": 0.0157, "step": 2150 }, { "epoch": 4.2, "grad_norm": 1.0614062128578177, "learning_rate": 5.303152698120663e-07, "loss": 0.0106, "step": 2151 }, { "epoch": 4.2, "grad_norm": 1.3205761324494372, "learning_rate": 5.277923646569743e-07, "loss": 0.0158, "step": 2152 }, { "epoch": 4.21, "grad_norm": 0.9275324786801225, "learning_rate": 5.252749855399728e-07, "loss": 0.0166, "step": 2153 }, { "epoch": 4.21, "grad_norm": 1.0403292324903823, "learning_rate": 5.227631371414648e-07, "loss": 0.0161, "step": 2154 }, { "epoch": 4.21, "grad_norm": 0.9118860014738188, "learning_rate": 5.202568241315718e-07, "loss": 0.0099, "step": 2155 }, { "epoch": 4.21, "grad_norm": 0.8778762139160802, "learning_rate": 5.177560511701249e-07, "loss": 0.0112, "step": 2156 }, { "epoch": 4.21, "grad_norm": 0.9038696681572288, "learning_rate": 5.152608229066519e-07, "loss": 0.0228, "step": 2157 }, { "epoch": 4.21, "grad_norm": 1.0027477284568318, "learning_rate": 5.127711439803733e-07, "loss": 0.0155, "step": 2158 }, { "epoch": 4.22, "grad_norm": 0.7166869496967001, "learning_rate": 5.10287019020193e-07, "loss": 0.011, "step": 2159 }, { "epoch": 4.22, "grad_norm": 1.0588227547735594, "learning_rate": 5.078084526446877e-07, "loss": 0.0186, "step": 2160 }, { "epoch": 4.22, "grad_norm": 0.930023892432445, "learning_rate": 5.053354494620977e-07, "loss": 0.0138, "step": 2161 }, { "epoch": 4.22, "grad_norm": 0.9216943201813511, "learning_rate": 5.028680140703231e-07, "loss": 0.0181, "step": 2162 }, { "epoch": 4.22, "grad_norm": 1.1010788176483592, "learning_rate": 5.004061510569114e-07, "loss": 0.0175, "step": 2163 }, { "epoch": 4.23, "grad_norm": 0.7371275071770702, "learning_rate": 4.97949864999048e-07, "loss": 0.0121, "step": 2164 }, { "epoch": 4.23, "grad_norm": 1.0220311136732658, "learning_rate": 4.954991604635503e-07, "loss": 0.0162, "step": 2165 }, { "epoch": 4.23, "grad_norm": 0.8219153746635496, "learning_rate": 4.930540420068608e-07, "loss": 0.0085, "step": 2166 }, { "epoch": 4.23, "grad_norm": 0.7685907891555089, "learning_rate": 4.906145141750314e-07, "loss": 0.0133, "step": 2167 }, { "epoch": 4.23, "grad_norm": 0.9941569286162151, "learning_rate": 4.881805815037239e-07, "loss": 0.0179, "step": 2168 }, { "epoch": 4.24, "grad_norm": 0.9159825320517345, "learning_rate": 4.857522485181948e-07, "loss": 0.0118, "step": 2169 }, { "epoch": 4.24, "grad_norm": 0.9052028871474405, "learning_rate": 4.833295197332904e-07, "loss": 0.0155, "step": 2170 }, { "epoch": 4.24, "grad_norm": 0.985120155141913, "learning_rate": 4.809123996534373e-07, "loss": 0.0231, "step": 2171 }, { "epoch": 4.24, "grad_norm": 0.7943914575596037, "learning_rate": 4.785008927726359e-07, "loss": 0.0108, "step": 2172 }, { "epoch": 4.24, "grad_norm": 0.8960496041860067, "learning_rate": 4.7609500357444654e-07, "loss": 0.0132, "step": 2173 }, { "epoch": 4.25, "grad_norm": 0.9012684489545314, "learning_rate": 4.736947365319881e-07, "loss": 0.0105, "step": 2174 }, { "epoch": 4.25, "grad_norm": 1.0815683869356625, "learning_rate": 4.7130009610792695e-07, "loss": 0.0153, "step": 2175 }, { "epoch": 4.25, "grad_norm": 0.9807904446853889, "learning_rate": 4.6891108675446453e-07, "loss": 0.0147, "step": 2176 }, { "epoch": 4.25, "grad_norm": 0.9161499794714695, "learning_rate": 4.665277129133368e-07, "loss": 0.018, "step": 2177 }, { "epoch": 4.25, "grad_norm": 0.9062096667802463, "learning_rate": 4.6414997901580083e-07, "loss": 0.0186, "step": 2178 }, { "epoch": 4.26, "grad_norm": 0.8274607288910362, "learning_rate": 4.61777889482625e-07, "loss": 0.0166, "step": 2179 }, { "epoch": 4.26, "grad_norm": 0.8358667992146226, "learning_rate": 4.59411448724087e-07, "loss": 0.0101, "step": 2180 }, { "epoch": 4.26, "grad_norm": 0.9097262345478171, "learning_rate": 4.5705066113996144e-07, "loss": 0.0166, "step": 2181 }, { "epoch": 4.26, "grad_norm": 0.9239014274633917, "learning_rate": 4.5469553111951026e-07, "loss": 0.0105, "step": 2182 }, { "epoch": 4.26, "grad_norm": 1.0931988168977185, "learning_rate": 4.5234606304147895e-07, "loss": 0.0143, "step": 2183 }, { "epoch": 4.27, "grad_norm": 0.9450495892191794, "learning_rate": 4.500022612740856e-07, "loss": 0.0127, "step": 2184 }, { "epoch": 4.27, "grad_norm": 1.0214199997660212, "learning_rate": 4.4766413017501164e-07, "loss": 0.0196, "step": 2185 }, { "epoch": 4.27, "grad_norm": 0.9463881724223947, "learning_rate": 4.453316740913976e-07, "loss": 0.0161, "step": 2186 }, { "epoch": 4.27, "grad_norm": 0.8973311098137442, "learning_rate": 4.430048973598325e-07, "loss": 0.0119, "step": 2187 }, { "epoch": 4.27, "grad_norm": 1.0353015753365726, "learning_rate": 4.406838043063446e-07, "loss": 0.0159, "step": 2188 }, { "epoch": 4.28, "grad_norm": 0.8819147164260673, "learning_rate": 4.383683992463951e-07, "loss": 0.0148, "step": 2189 }, { "epoch": 4.28, "grad_norm": 0.738013377544508, "learning_rate": 4.3605868648487136e-07, "loss": 0.0097, "step": 2190 }, { "epoch": 4.28, "grad_norm": 1.0173583818169163, "learning_rate": 4.3375467031607726e-07, "loss": 0.0169, "step": 2191 }, { "epoch": 4.28, "grad_norm": 0.6364998985505708, "learning_rate": 4.314563550237231e-07, "loss": 0.0061, "step": 2192 }, { "epoch": 4.28, "grad_norm": 0.9111350885756412, "learning_rate": 4.291637448809228e-07, "loss": 0.0123, "step": 2193 }, { "epoch": 4.29, "grad_norm": 0.943410867111662, "learning_rate": 4.268768441501807e-07, "loss": 0.0149, "step": 2194 }, { "epoch": 4.29, "grad_norm": 0.7144846970156645, "learning_rate": 4.24595657083387e-07, "loss": 0.0073, "step": 2195 }, { "epoch": 4.29, "grad_norm": 0.5333131228975143, "learning_rate": 4.2232018792181037e-07, "loss": 0.0074, "step": 2196 }, { "epoch": 4.29, "grad_norm": 1.1718295162891241, "learning_rate": 4.200504408960861e-07, "loss": 0.0122, "step": 2197 }, { "epoch": 4.29, "grad_norm": 0.9165598771669454, "learning_rate": 4.177864202262105e-07, "loss": 0.01, "step": 2198 }, { "epoch": 4.29, "grad_norm": 0.7279407522868394, "learning_rate": 4.155281301215353e-07, "loss": 0.0067, "step": 2199 }, { "epoch": 4.3, "grad_norm": 0.8665401222478486, "learning_rate": 4.132755747807577e-07, "loss": 0.01, "step": 2200 }, { "epoch": 4.3, "grad_norm": 0.9072887290852577, "learning_rate": 4.1102875839191017e-07, "loss": 0.0187, "step": 2201 }, { "epoch": 4.3, "grad_norm": 0.9079249430062901, "learning_rate": 4.087876851323568e-07, "loss": 0.0098, "step": 2202 }, { "epoch": 4.3, "grad_norm": 0.5566841199812195, "learning_rate": 4.0655235916878516e-07, "loss": 0.0079, "step": 2203 }, { "epoch": 4.3, "grad_norm": 1.2709505162818135, "learning_rate": 4.0432278465719386e-07, "loss": 0.0189, "step": 2204 }, { "epoch": 4.31, "grad_norm": 1.0230362252173293, "learning_rate": 4.0209896574289155e-07, "loss": 0.0243, "step": 2205 }, { "epoch": 4.31, "grad_norm": 0.8055631152521803, "learning_rate": 3.9988090656048367e-07, "loss": 0.0145, "step": 2206 }, { "epoch": 4.31, "grad_norm": 0.8306565732931788, "learning_rate": 3.976686112338672e-07, "loss": 0.0137, "step": 2207 }, { "epoch": 4.31, "grad_norm": 0.6750476380859496, "learning_rate": 3.95462083876224e-07, "loss": 0.0108, "step": 2208 }, { "epoch": 4.31, "grad_norm": 0.9623909359332791, "learning_rate": 3.932613285900116e-07, "loss": 0.0075, "step": 2209 }, { "epoch": 4.32, "grad_norm": 0.9225190978618996, "learning_rate": 3.9106634946695387e-07, "loss": 0.0122, "step": 2210 }, { "epoch": 4.32, "grad_norm": 0.7070750589100916, "learning_rate": 3.888771505880383e-07, "loss": 0.0095, "step": 2211 }, { "epoch": 4.32, "grad_norm": 0.6063233322928235, "learning_rate": 3.8669373602350414e-07, "loss": 0.0073, "step": 2212 }, { "epoch": 4.32, "grad_norm": 0.8142685729956957, "learning_rate": 3.845161098328354e-07, "loss": 0.011, "step": 2213 }, { "epoch": 4.32, "grad_norm": 0.6487502764877683, "learning_rate": 3.823442760647562e-07, "loss": 0.0092, "step": 2214 }, { "epoch": 4.33, "grad_norm": 1.239713638623792, "learning_rate": 3.8017823875721947e-07, "loss": 0.0101, "step": 2215 }, { "epoch": 4.33, "grad_norm": 0.7515322999030987, "learning_rate": 3.7801800193740066e-07, "loss": 0.0116, "step": 2216 }, { "epoch": 4.33, "grad_norm": 0.8037645552805904, "learning_rate": 3.7586356962169313e-07, "loss": 0.015, "step": 2217 }, { "epoch": 4.33, "grad_norm": 0.6916785091874809, "learning_rate": 3.7371494581569677e-07, "loss": 0.0097, "step": 2218 }, { "epoch": 4.33, "grad_norm": 1.0763937515566357, "learning_rate": 3.715721345142115e-07, "loss": 0.0102, "step": 2219 }, { "epoch": 4.34, "grad_norm": 0.809775186757681, "learning_rate": 3.6943513970123184e-07, "loss": 0.0099, "step": 2220 }, { "epoch": 4.34, "grad_norm": 0.6802121977815848, "learning_rate": 3.673039653499374e-07, "loss": 0.0097, "step": 2221 }, { "epoch": 4.34, "grad_norm": 0.8315166158051106, "learning_rate": 3.651786154226854e-07, "loss": 0.0123, "step": 2222 }, { "epoch": 4.34, "grad_norm": 0.8906720265175802, "learning_rate": 3.630590938710062e-07, "loss": 0.0115, "step": 2223 }, { "epoch": 4.34, "grad_norm": 0.8353002778356227, "learning_rate": 3.609454046355911e-07, "loss": 0.0168, "step": 2224 }, { "epoch": 4.35, "grad_norm": 1.0110177354777843, "learning_rate": 3.588375516462901e-07, "loss": 0.0075, "step": 2225 }, { "epoch": 4.35, "grad_norm": 0.8427365174313487, "learning_rate": 3.5673553882209986e-07, "loss": 0.0097, "step": 2226 }, { "epoch": 4.35, "grad_norm": 0.7546788455403841, "learning_rate": 3.5463937007116125e-07, "loss": 0.0075, "step": 2227 }, { "epoch": 4.35, "grad_norm": 0.8459679827209088, "learning_rate": 3.525490492907494e-07, "loss": 0.0105, "step": 2228 }, { "epoch": 4.35, "grad_norm": 0.6807981854151749, "learning_rate": 3.5046458036726355e-07, "loss": 0.0092, "step": 2229 }, { "epoch": 4.36, "grad_norm": 1.0839683686077584, "learning_rate": 3.483859671762278e-07, "loss": 0.0082, "step": 2230 }, { "epoch": 4.36, "grad_norm": 0.6716563850006657, "learning_rate": 3.4631321358227384e-07, "loss": 0.0058, "step": 2231 }, { "epoch": 4.36, "grad_norm": 0.7803150372658572, "learning_rate": 3.442463234391441e-07, "loss": 0.0104, "step": 2232 }, { "epoch": 4.36, "grad_norm": 0.7393278696900559, "learning_rate": 3.421853005896751e-07, "loss": 0.0123, "step": 2233 }, { "epoch": 4.36, "grad_norm": 0.7821106957113308, "learning_rate": 3.401301488657978e-07, "loss": 0.0102, "step": 2234 }, { "epoch": 4.37, "grad_norm": 0.45709789704203396, "learning_rate": 3.380808720885251e-07, "loss": 0.0065, "step": 2235 }, { "epoch": 4.37, "grad_norm": 1.220567278085964, "learning_rate": 3.3603747406794833e-07, "loss": 0.0156, "step": 2236 }, { "epoch": 4.37, "grad_norm": 0.7103363647854286, "learning_rate": 3.3399995860322934e-07, "loss": 0.0124, "step": 2237 }, { "epoch": 4.37, "grad_norm": 1.0076547154106694, "learning_rate": 3.3196832948259083e-07, "loss": 0.0112, "step": 2238 }, { "epoch": 4.37, "grad_norm": 0.6482914733609276, "learning_rate": 3.2994259048331295e-07, "loss": 0.0046, "step": 2239 }, { "epoch": 4.38, "grad_norm": 0.8438841815247563, "learning_rate": 3.279227453717252e-07, "loss": 0.0099, "step": 2240 }, { "epoch": 4.38, "grad_norm": 0.9504304335028091, "learning_rate": 3.2590879790319744e-07, "loss": 0.0114, "step": 2241 }, { "epoch": 4.38, "grad_norm": 1.0274976466156467, "learning_rate": 3.23900751822135e-07, "loss": 0.0075, "step": 2242 }, { "epoch": 4.38, "grad_norm": 0.9280429197999273, "learning_rate": 3.2189861086197146e-07, "loss": 0.0074, "step": 2243 }, { "epoch": 4.38, "grad_norm": 0.7674525741062588, "learning_rate": 3.1990237874516066e-07, "loss": 0.0068, "step": 2244 }, { "epoch": 4.38, "grad_norm": 0.6736750115172612, "learning_rate": 3.1791205918317164e-07, "loss": 0.0081, "step": 2245 }, { "epoch": 4.39, "grad_norm": 0.6370468706705514, "learning_rate": 3.1592765587648043e-07, "loss": 0.0065, "step": 2246 }, { "epoch": 4.39, "grad_norm": 0.7367442247501413, "learning_rate": 3.1394917251456133e-07, "loss": 0.0076, "step": 2247 }, { "epoch": 4.39, "grad_norm": 0.6113609361109954, "learning_rate": 3.1197661277588436e-07, "loss": 0.0083, "step": 2248 }, { "epoch": 4.39, "grad_norm": 0.8643991658703984, "learning_rate": 3.100099803279063e-07, "loss": 0.0075, "step": 2249 }, { "epoch": 4.39, "grad_norm": 0.554632104497548, "learning_rate": 3.0804927882706196e-07, "loss": 0.0063, "step": 2250 }, { "epoch": 4.4, "grad_norm": 0.9139768506629518, "learning_rate": 3.0609451191875913e-07, "loss": 0.0066, "step": 2251 }, { "epoch": 4.4, "grad_norm": 0.700381427532407, "learning_rate": 3.0414568323737346e-07, "loss": 0.0078, "step": 2252 }, { "epoch": 4.4, "grad_norm": 0.43178323288038994, "learning_rate": 3.0220279640623946e-07, "loss": 0.0047, "step": 2253 }, { "epoch": 4.4, "grad_norm": 0.8392957603560209, "learning_rate": 3.002658550376426e-07, "loss": 0.0073, "step": 2254 }, { "epoch": 4.4, "grad_norm": 0.544182147562363, "learning_rate": 2.983348627328177e-07, "loss": 0.0052, "step": 2255 }, { "epoch": 4.41, "grad_norm": 0.9167771374794118, "learning_rate": 2.964098230819351e-07, "loss": 0.0076, "step": 2256 }, { "epoch": 4.41, "grad_norm": 0.7178255473037951, "learning_rate": 2.9449073966410027e-07, "loss": 0.0074, "step": 2257 }, { "epoch": 4.41, "grad_norm": 0.730442424930122, "learning_rate": 2.925776160473445e-07, "loss": 0.0111, "step": 2258 }, { "epoch": 4.41, "grad_norm": 0.8073597795809256, "learning_rate": 2.906704557886173e-07, "loss": 0.0103, "step": 2259 }, { "epoch": 4.41, "grad_norm": 0.6478452045126236, "learning_rate": 2.887692624337806e-07, "loss": 0.0126, "step": 2260 }, { "epoch": 4.42, "grad_norm": 0.43687452985808584, "learning_rate": 2.8687403951760417e-07, "loss": 0.0048, "step": 2261 }, { "epoch": 4.42, "grad_norm": 0.6326754628709955, "learning_rate": 2.8498479056375656e-07, "loss": 0.0055, "step": 2262 }, { "epoch": 4.42, "grad_norm": 0.6384227105108833, "learning_rate": 2.831015190847978e-07, "loss": 0.0101, "step": 2263 }, { "epoch": 4.42, "grad_norm": 1.1503613432423583, "learning_rate": 2.812242285821771e-07, "loss": 0.0062, "step": 2264 }, { "epoch": 4.42, "grad_norm": 0.8252455851916153, "learning_rate": 2.793529225462219e-07, "loss": 0.0095, "step": 2265 }, { "epoch": 4.43, "grad_norm": 0.6089765788272253, "learning_rate": 2.774876044561331e-07, "loss": 0.0033, "step": 2266 }, { "epoch": 4.43, "grad_norm": 0.9705823778527906, "learning_rate": 2.7562827777997873e-07, "loss": 0.0058, "step": 2267 }, { "epoch": 4.43, "grad_norm": 0.6258898281287135, "learning_rate": 2.7377494597468916e-07, "loss": 0.0081, "step": 2268 }, { "epoch": 4.43, "grad_norm": 0.7703005430955759, "learning_rate": 2.719276124860448e-07, "loss": 0.0149, "step": 2269 }, { "epoch": 4.43, "grad_norm": 0.6011539101355894, "learning_rate": 2.700862807486774e-07, "loss": 0.0086, "step": 2270 }, { "epoch": 4.44, "grad_norm": 0.6121508410432737, "learning_rate": 2.682509541860595e-07, "loss": 0.009, "step": 2271 }, { "epoch": 4.44, "grad_norm": 0.8103941215745272, "learning_rate": 2.664216362104964e-07, "loss": 0.0087, "step": 2272 }, { "epoch": 4.44, "grad_norm": 0.8356086728230029, "learning_rate": 2.6459833022312473e-07, "loss": 0.0112, "step": 2273 }, { "epoch": 4.44, "grad_norm": 0.6698016659737879, "learning_rate": 2.6278103961390257e-07, "loss": 0.0054, "step": 2274 }, { "epoch": 4.44, "grad_norm": 0.7453695934625307, "learning_rate": 2.6096976776160246e-07, "loss": 0.0057, "step": 2275 }, { "epoch": 4.45, "grad_norm": 0.47060927983880146, "learning_rate": 2.591645180338085e-07, "loss": 0.0051, "step": 2276 }, { "epoch": 4.45, "grad_norm": 0.6476227638725617, "learning_rate": 2.573652937869088e-07, "loss": 0.0066, "step": 2277 }, { "epoch": 4.45, "grad_norm": 0.706533572096224, "learning_rate": 2.555720983660852e-07, "loss": 0.0115, "step": 2278 }, { "epoch": 4.45, "grad_norm": 0.626725784920759, "learning_rate": 2.5378493510531367e-07, "loss": 0.0094, "step": 2279 }, { "epoch": 4.45, "grad_norm": 0.6410690192579737, "learning_rate": 2.5200380732735444e-07, "loss": 0.0085, "step": 2280 }, { "epoch": 4.46, "grad_norm": 0.4947842053124823, "learning_rate": 2.502287183437458e-07, "loss": 0.0043, "step": 2281 }, { "epoch": 4.46, "grad_norm": 0.7217041255166142, "learning_rate": 2.4845967145479826e-07, "loss": 0.0107, "step": 2282 }, { "epoch": 4.46, "grad_norm": 0.8719565425328935, "learning_rate": 2.4669666994959026e-07, "loss": 0.0088, "step": 2283 }, { "epoch": 4.46, "grad_norm": 0.8777509605373863, "learning_rate": 2.4493971710595773e-07, "loss": 0.0132, "step": 2284 }, { "epoch": 4.46, "grad_norm": 0.8471939602018662, "learning_rate": 2.431888161904926e-07, "loss": 0.0095, "step": 2285 }, { "epoch": 4.46, "grad_norm": 0.720547943371305, "learning_rate": 2.4144397045853586e-07, "loss": 0.0107, "step": 2286 }, { "epoch": 4.47, "grad_norm": 0.6092430387319936, "learning_rate": 2.397051831541677e-07, "loss": 0.0076, "step": 2287 }, { "epoch": 4.47, "grad_norm": 0.7798405729591721, "learning_rate": 2.3797245751020545e-07, "loss": 0.0106, "step": 2288 }, { "epoch": 4.47, "grad_norm": 0.9024492143106418, "learning_rate": 2.3624579674819684e-07, "loss": 0.008, "step": 2289 }, { "epoch": 4.47, "grad_norm": 0.6922087571274677, "learning_rate": 2.3452520407841404e-07, "loss": 0.0108, "step": 2290 }, { "epoch": 4.47, "grad_norm": 0.7536606128399317, "learning_rate": 2.3281068269984535e-07, "loss": 0.0109, "step": 2291 }, { "epoch": 4.48, "grad_norm": 0.3700721749804377, "learning_rate": 2.3110223580019317e-07, "loss": 0.0042, "step": 2292 }, { "epoch": 4.48, "grad_norm": 0.6323861127319746, "learning_rate": 2.2939986655586364e-07, "loss": 0.0064, "step": 2293 }, { "epoch": 4.48, "grad_norm": 0.8000852560108355, "learning_rate": 2.2770357813196568e-07, "loss": 0.0125, "step": 2294 }, { "epoch": 4.48, "grad_norm": 0.5621252832084792, "learning_rate": 2.260133736823014e-07, "loss": 0.0072, "step": 2295 }, { "epoch": 4.48, "grad_norm": 0.6406817845752051, "learning_rate": 2.2432925634936062e-07, "loss": 0.0056, "step": 2296 }, { "epoch": 4.49, "grad_norm": 0.7775441265629167, "learning_rate": 2.2265122926431585e-07, "loss": 0.0086, "step": 2297 }, { "epoch": 4.49, "grad_norm": 0.5926778514476541, "learning_rate": 2.2097929554701795e-07, "loss": 0.0081, "step": 2298 }, { "epoch": 4.49, "grad_norm": 0.7062418157236797, "learning_rate": 2.1931345830598803e-07, "loss": 0.01, "step": 2299 }, { "epoch": 4.49, "grad_norm": 0.4987571734572555, "learning_rate": 2.176537206384112e-07, "loss": 0.0055, "step": 2300 }, { "epoch": 4.49, "grad_norm": 0.7787022361926843, "learning_rate": 2.160000856301331e-07, "loss": 0.0121, "step": 2301 }, { "epoch": 4.5, "grad_norm": 0.5953524231770814, "learning_rate": 2.143525563556541e-07, "loss": 0.0064, "step": 2302 }, { "epoch": 4.5, "grad_norm": 0.683514744153867, "learning_rate": 2.127111358781198e-07, "loss": 0.0123, "step": 2303 }, { "epoch": 4.5, "grad_norm": 0.9990672295588554, "learning_rate": 2.1107582724932088e-07, "loss": 0.0123, "step": 2304 }, { "epoch": 4.5, "grad_norm": 0.6715657290550969, "learning_rate": 2.0944663350968328e-07, "loss": 0.0099, "step": 2305 }, { "epoch": 4.5, "grad_norm": 0.7082894633518662, "learning_rate": 2.078235576882631e-07, "loss": 0.0104, "step": 2306 }, { "epoch": 4.51, "grad_norm": 0.8219920162283776, "learning_rate": 2.0620660280274355e-07, "loss": 0.009, "step": 2307 }, { "epoch": 4.51, "grad_norm": 0.7164708996792125, "learning_rate": 2.0459577185942756e-07, "loss": 0.0088, "step": 2308 }, { "epoch": 4.51, "grad_norm": 0.6755801141222959, "learning_rate": 2.0299106785323e-07, "loss": 0.01, "step": 2309 }, { "epoch": 4.51, "grad_norm": 0.6902446954445884, "learning_rate": 2.0139249376767654e-07, "loss": 0.009, "step": 2310 }, { "epoch": 4.51, "grad_norm": 0.7525438182197577, "learning_rate": 1.998000525748958e-07, "loss": 0.0081, "step": 2311 }, { "epoch": 4.52, "grad_norm": 0.7613346074490493, "learning_rate": 1.9821374723561168e-07, "loss": 0.0104, "step": 2312 }, { "epoch": 4.52, "grad_norm": 0.9739583761384975, "learning_rate": 1.9663358069914292e-07, "loss": 0.008, "step": 2313 }, { "epoch": 4.52, "grad_norm": 0.6189593983274871, "learning_rate": 1.9505955590339224e-07, "loss": 0.0053, "step": 2314 }, { "epoch": 4.52, "grad_norm": 0.6277625993876558, "learning_rate": 1.934916757748455e-07, "loss": 0.0061, "step": 2315 }, { "epoch": 4.52, "grad_norm": 0.7236987079415176, "learning_rate": 1.9192994322856282e-07, "loss": 0.0079, "step": 2316 }, { "epoch": 4.53, "grad_norm": 0.8707688751164165, "learning_rate": 1.903743611681759e-07, "loss": 0.012, "step": 2317 }, { "epoch": 4.53, "grad_norm": 0.9045685013861438, "learning_rate": 1.888249324858786e-07, "loss": 0.0121, "step": 2318 }, { "epoch": 4.53, "grad_norm": 0.5028630892107218, "learning_rate": 1.8728166006242702e-07, "loss": 0.0057, "step": 2319 }, { "epoch": 4.53, "grad_norm": 0.5593387352774787, "learning_rate": 1.8574454676713047e-07, "loss": 0.0071, "step": 2320 }, { "epoch": 4.53, "grad_norm": 0.6672635295860229, "learning_rate": 1.8421359545784576e-07, "loss": 0.0098, "step": 2321 }, { "epoch": 4.54, "grad_norm": 0.5521608068534274, "learning_rate": 1.826888089809748e-07, "loss": 0.0052, "step": 2322 }, { "epoch": 4.54, "grad_norm": 0.6509332933356972, "learning_rate": 1.8117019017145636e-07, "loss": 0.0089, "step": 2323 }, { "epoch": 4.54, "grad_norm": 0.6441271365146246, "learning_rate": 1.7965774185276317e-07, "loss": 0.0062, "step": 2324 }, { "epoch": 4.54, "grad_norm": 0.3819442425189732, "learning_rate": 1.7815146683689398e-07, "loss": 0.0039, "step": 2325 }, { "epoch": 4.54, "grad_norm": 0.42500209297663544, "learning_rate": 1.7665136792437163e-07, "loss": 0.0052, "step": 2326 }, { "epoch": 4.54, "grad_norm": 0.6607853346077911, "learning_rate": 1.7515744790423538e-07, "loss": 0.0072, "step": 2327 }, { "epoch": 4.55, "grad_norm": 0.6072994405168517, "learning_rate": 1.736697095540361e-07, "loss": 0.0072, "step": 2328 }, { "epoch": 4.55, "grad_norm": 0.8720298335476313, "learning_rate": 1.7218815563983176e-07, "loss": 0.011, "step": 2329 }, { "epoch": 4.55, "grad_norm": 0.7690119902596777, "learning_rate": 1.7071278891618263e-07, "loss": 0.009, "step": 2330 }, { "epoch": 4.55, "grad_norm": 0.6141349033168962, "learning_rate": 1.692436121261448e-07, "loss": 0.0052, "step": 2331 }, { "epoch": 4.55, "grad_norm": 0.817514516495674, "learning_rate": 1.6778062800126503e-07, "loss": 0.0111, "step": 2332 }, { "epoch": 4.56, "grad_norm": 0.44448222984894026, "learning_rate": 1.6632383926157883e-07, "loss": 0.0045, "step": 2333 }, { "epoch": 4.56, "grad_norm": 0.7805296030927401, "learning_rate": 1.6487324861560043e-07, "loss": 0.0107, "step": 2334 }, { "epoch": 4.56, "grad_norm": 0.6530466288266938, "learning_rate": 1.6342885876032148e-07, "loss": 0.006, "step": 2335 }, { "epoch": 4.56, "grad_norm": 0.6308133753026339, "learning_rate": 1.6199067238120613e-07, "loss": 0.0096, "step": 2336 }, { "epoch": 4.56, "grad_norm": 0.5759354223235199, "learning_rate": 1.6055869215218199e-07, "loss": 0.0079, "step": 2337 }, { "epoch": 4.57, "grad_norm": 0.6739530404054863, "learning_rate": 1.5913292073564023e-07, "loss": 0.0081, "step": 2338 }, { "epoch": 4.57, "grad_norm": 0.5851120053133883, "learning_rate": 1.577133607824281e-07, "loss": 0.0082, "step": 2339 }, { "epoch": 4.57, "grad_norm": 0.9491664599804783, "learning_rate": 1.563000149318439e-07, "loss": 0.0065, "step": 2340 }, { "epoch": 4.57, "grad_norm": 0.5279966086996981, "learning_rate": 1.548928858116309e-07, "loss": 0.0054, "step": 2341 }, { "epoch": 4.57, "grad_norm": 0.3882033650669893, "learning_rate": 1.534919760379771e-07, "loss": 0.0033, "step": 2342 }, { "epoch": 4.58, "grad_norm": 0.5658947057988253, "learning_rate": 1.5209728821550488e-07, "loss": 0.0055, "step": 2343 }, { "epoch": 4.58, "grad_norm": 0.4845880403498325, "learning_rate": 1.5070882493726911e-07, "loss": 0.0059, "step": 2344 }, { "epoch": 4.58, "grad_norm": 0.7674024099085063, "learning_rate": 1.4932658878475274e-07, "loss": 0.0055, "step": 2345 }, { "epoch": 4.58, "grad_norm": 0.6054642166393199, "learning_rate": 1.4795058232785913e-07, "loss": 0.0067, "step": 2346 }, { "epoch": 4.58, "grad_norm": 0.6370746707077004, "learning_rate": 1.465808081249112e-07, "loss": 0.0064, "step": 2347 }, { "epoch": 4.59, "grad_norm": 0.7170238818832992, "learning_rate": 1.4521726872264334e-07, "loss": 0.0115, "step": 2348 }, { "epoch": 4.59, "grad_norm": 0.5583574120097988, "learning_rate": 1.4385996665619865e-07, "loss": 0.0045, "step": 2349 }, { "epoch": 4.59, "grad_norm": 0.5970342866847834, "learning_rate": 1.4250890444912235e-07, "loss": 0.0056, "step": 2350 }, { "epoch": 4.59, "grad_norm": 0.7648665463321589, "learning_rate": 1.4116408461335976e-07, "loss": 0.0092, "step": 2351 }, { "epoch": 4.59, "grad_norm": 0.7742624382061226, "learning_rate": 1.398255096492499e-07, "loss": 0.0099, "step": 2352 }, { "epoch": 4.6, "grad_norm": 0.6812508490446402, "learning_rate": 1.3849318204551976e-07, "loss": 0.007, "step": 2353 }, { "epoch": 4.6, "grad_norm": 0.5439275422722814, "learning_rate": 1.3716710427928297e-07, "loss": 0.0065, "step": 2354 }, { "epoch": 4.6, "grad_norm": 0.5734438169229614, "learning_rate": 1.358472788160312e-07, "loss": 0.0069, "step": 2355 }, { "epoch": 4.6, "grad_norm": 0.46897971045683107, "learning_rate": 1.3453370810963294e-07, "loss": 0.0035, "step": 2356 }, { "epoch": 4.6, "grad_norm": 0.5153328428741342, "learning_rate": 1.332263946023285e-07, "loss": 0.005, "step": 2357 }, { "epoch": 4.61, "grad_norm": 0.6781059594940184, "learning_rate": 1.3192534072472216e-07, "loss": 0.0102, "step": 2358 }, { "epoch": 4.61, "grad_norm": 0.6325314314746578, "learning_rate": 1.3063054889578118e-07, "loss": 0.0088, "step": 2359 }, { "epoch": 4.61, "grad_norm": 0.6192481118131031, "learning_rate": 1.2934202152283052e-07, "loss": 0.0076, "step": 2360 }, { "epoch": 4.61, "grad_norm": 0.7276118130210336, "learning_rate": 1.2805976100154875e-07, "loss": 0.0099, "step": 2361 }, { "epoch": 4.61, "grad_norm": 0.8983195996437013, "learning_rate": 1.2678376971596057e-07, "loss": 0.0118, "step": 2362 }, { "epoch": 4.62, "grad_norm": 0.6826552976786321, "learning_rate": 1.2551405003843678e-07, "loss": 0.0088, "step": 2363 }, { "epoch": 4.62, "grad_norm": 0.4970628097119865, "learning_rate": 1.242506043296871e-07, "loss": 0.0036, "step": 2364 }, { "epoch": 4.62, "grad_norm": 0.6238807874615483, "learning_rate": 1.2299343493875598e-07, "loss": 0.0051, "step": 2365 }, { "epoch": 4.62, "grad_norm": 0.5824754118845686, "learning_rate": 1.2174254420301934e-07, "loss": 0.0073, "step": 2366 }, { "epoch": 4.62, "grad_norm": 0.5434052512400094, "learning_rate": 1.204979344481802e-07, "loss": 0.0077, "step": 2367 }, { "epoch": 4.62, "grad_norm": 0.932488001474171, "learning_rate": 1.192596079882613e-07, "loss": 0.0062, "step": 2368 }, { "epoch": 4.63, "grad_norm": 0.5890509285441361, "learning_rate": 1.1802756712560553e-07, "loss": 0.0069, "step": 2369 }, { "epoch": 4.63, "grad_norm": 0.8699082415595292, "learning_rate": 1.1680181415086965e-07, "loss": 0.006, "step": 2370 }, { "epoch": 4.63, "grad_norm": 0.6622608471433219, "learning_rate": 1.1558235134301776e-07, "loss": 0.0085, "step": 2371 }, { "epoch": 4.63, "grad_norm": 0.5376034783964436, "learning_rate": 1.1436918096932042e-07, "loss": 0.0071, "step": 2372 }, { "epoch": 4.63, "grad_norm": 0.818823468923777, "learning_rate": 1.1316230528534892e-07, "loss": 0.0091, "step": 2373 }, { "epoch": 4.64, "grad_norm": 0.796184933455159, "learning_rate": 1.1196172653497061e-07, "loss": 0.006, "step": 2374 }, { "epoch": 4.64, "grad_norm": 0.6139699305982137, "learning_rate": 1.1076744695034606e-07, "loss": 0.0048, "step": 2375 }, { "epoch": 4.64, "grad_norm": 0.757157457097081, "learning_rate": 1.095794687519242e-07, "loss": 0.0132, "step": 2376 }, { "epoch": 4.64, "grad_norm": 0.6134098561588123, "learning_rate": 1.0839779414843786e-07, "loss": 0.0062, "step": 2377 }, { "epoch": 4.64, "grad_norm": 0.8661235970181036, "learning_rate": 1.0722242533689924e-07, "loss": 0.0129, "step": 2378 }, { "epoch": 4.65, "grad_norm": 0.6739238990818037, "learning_rate": 1.0605336450259867e-07, "loss": 0.0097, "step": 2379 }, { "epoch": 4.65, "grad_norm": 0.7799118643748157, "learning_rate": 1.0489061381909609e-07, "loss": 0.0082, "step": 2380 }, { "epoch": 4.65, "grad_norm": 0.5990319442119106, "learning_rate": 1.0373417544822106e-07, "loss": 0.0094, "step": 2381 }, { "epoch": 4.65, "grad_norm": 0.6526318346295694, "learning_rate": 1.025840515400665e-07, "loss": 0.0095, "step": 2382 }, { "epoch": 4.65, "grad_norm": 0.6694858155174154, "learning_rate": 1.0144024423298487e-07, "loss": 0.0086, "step": 2383 }, { "epoch": 4.66, "grad_norm": 0.8623710730440927, "learning_rate": 1.0030275565358499e-07, "loss": 0.0105, "step": 2384 }, { "epoch": 4.66, "grad_norm": 0.6853862685428406, "learning_rate": 9.91715879167278e-08, "loss": 0.0084, "step": 2385 }, { "epoch": 4.66, "grad_norm": 0.7894091057709539, "learning_rate": 9.804674312552214e-08, "loss": 0.0084, "step": 2386 }, { "epoch": 4.66, "grad_norm": 0.5941963646787021, "learning_rate": 9.692822337132074e-08, "loss": 0.0056, "step": 2387 }, { "epoch": 4.66, "grad_norm": 0.6572232663261427, "learning_rate": 9.581603073371642e-08, "loss": 0.0095, "step": 2388 }, { "epoch": 4.67, "grad_norm": 0.7509089292108363, "learning_rate": 9.471016728053976e-08, "loss": 0.0086, "step": 2389 }, { "epoch": 4.67, "grad_norm": 0.552036428604834, "learning_rate": 9.361063506785172e-08, "loss": 0.0055, "step": 2390 }, { "epoch": 4.67, "grad_norm": 0.6915535902681421, "learning_rate": 9.251743613994395e-08, "loss": 0.0091, "step": 2391 }, { "epoch": 4.67, "grad_norm": 0.888928120705791, "learning_rate": 9.143057252933229e-08, "loss": 0.0237, "step": 2392 }, { "epoch": 4.67, "grad_norm": 0.6992624172935136, "learning_rate": 9.035004625675319e-08, "loss": 0.0115, "step": 2393 }, { "epoch": 4.68, "grad_norm": 0.8689487296128037, "learning_rate": 8.927585933116144e-08, "loss": 0.0078, "step": 2394 }, { "epoch": 4.68, "grad_norm": 0.6318996415617609, "learning_rate": 8.82080137497243e-08, "loss": 0.0055, "step": 2395 }, { "epoch": 4.68, "grad_norm": 0.8322791855746269, "learning_rate": 8.714651149782038e-08, "loss": 0.0125, "step": 2396 }, { "epoch": 4.68, "grad_norm": 0.5589511788310251, "learning_rate": 8.609135454903332e-08, "loss": 0.007, "step": 2397 }, { "epoch": 4.68, "grad_norm": 0.736181031405738, "learning_rate": 8.504254486515039e-08, "loss": 0.0051, "step": 2398 }, { "epoch": 4.69, "grad_norm": 0.6181597623674456, "learning_rate": 8.400008439615653e-08, "loss": 0.0046, "step": 2399 }, { "epoch": 4.69, "grad_norm": 1.0349260484958547, "learning_rate": 8.296397508023323e-08, "loss": 0.0073, "step": 2400 }, { "epoch": 4.69, "grad_norm": 0.709541107479915, "learning_rate": 8.193421884375312e-08, "loss": 0.0102, "step": 2401 }, { "epoch": 4.69, "grad_norm": 0.7438080993515799, "learning_rate": 8.091081760127683e-08, "loss": 0.0094, "step": 2402 }, { "epoch": 4.69, "grad_norm": 0.6230429070771447, "learning_rate": 7.989377325554986e-08, "loss": 0.0073, "step": 2403 }, { "epoch": 4.7, "grad_norm": 0.6862121858758911, "learning_rate": 7.888308769749875e-08, "loss": 0.0088, "step": 2404 }, { "epoch": 4.7, "grad_norm": 0.48151617037319405, "learning_rate": 7.787876280622674e-08, "loss": 0.0065, "step": 2405 }, { "epoch": 4.7, "grad_norm": 0.7670279646505443, "learning_rate": 7.688080044901191e-08, "loss": 0.0105, "step": 2406 }, { "epoch": 4.7, "grad_norm": 0.9169504564860986, "learning_rate": 7.588920248130359e-08, "loss": 0.0081, "step": 2407 }, { "epoch": 4.7, "grad_norm": 0.74206103024135, "learning_rate": 7.490397074671583e-08, "loss": 0.0091, "step": 2408 }, { "epoch": 4.71, "grad_norm": 0.6092227738527938, "learning_rate": 7.392510707702892e-08, "loss": 0.0093, "step": 2409 }, { "epoch": 4.71, "grad_norm": 0.4500219041832704, "learning_rate": 7.2952613292182e-08, "loss": 0.0041, "step": 2410 }, { "epoch": 4.71, "grad_norm": 0.6917255167418394, "learning_rate": 7.19864912002715e-08, "loss": 0.0118, "step": 2411 }, { "epoch": 4.71, "grad_norm": 0.6832264092963957, "learning_rate": 7.102674259754693e-08, "loss": 0.0095, "step": 2412 }, { "epoch": 4.71, "grad_norm": 0.4747138121333257, "learning_rate": 7.007336926840846e-08, "loss": 0.0051, "step": 2413 }, { "epoch": 4.71, "grad_norm": 0.6150743481778702, "learning_rate": 6.912637298540347e-08, "loss": 0.0071, "step": 2414 }, { "epoch": 4.72, "grad_norm": 0.7559615448232916, "learning_rate": 6.818575550922112e-08, "loss": 0.0102, "step": 2415 }, { "epoch": 4.72, "grad_norm": 0.5423969632106725, "learning_rate": 6.72515185886935e-08, "loss": 0.0059, "step": 2416 }, { "epoch": 4.72, "grad_norm": 0.7215507185159425, "learning_rate": 6.632366396078782e-08, "loss": 0.0144, "step": 2417 }, { "epoch": 4.72, "grad_norm": 0.5940426889785871, "learning_rate": 6.540219335060493e-08, "loss": 0.0054, "step": 2418 }, { "epoch": 4.72, "grad_norm": 0.4546084752982406, "learning_rate": 6.44871084713785e-08, "loss": 0.0031, "step": 2419 }, { "epoch": 4.73, "grad_norm": 0.5449186295219698, "learning_rate": 6.357841102446649e-08, "loss": 0.0054, "step": 2420 }, { "epoch": 4.73, "grad_norm": 0.49564513252016845, "learning_rate": 6.267610269935419e-08, "loss": 0.0043, "step": 2421 }, { "epoch": 4.73, "grad_norm": 0.5767410410850843, "learning_rate": 6.178018517364503e-08, "loss": 0.0061, "step": 2422 }, { "epoch": 4.73, "grad_norm": 0.6012167768308524, "learning_rate": 6.089066011306354e-08, "loss": 0.0087, "step": 2423 }, { "epoch": 4.73, "grad_norm": 0.6293822594694154, "learning_rate": 6.000752917144614e-08, "loss": 0.0093, "step": 2424 }, { "epoch": 4.74, "grad_norm": 0.5049095874886438, "learning_rate": 5.9130793990743004e-08, "loss": 0.0072, "step": 2425 }, { "epoch": 4.74, "grad_norm": 0.5322453467653705, "learning_rate": 5.8260456201012664e-08, "loss": 0.0054, "step": 2426 }, { "epoch": 4.74, "grad_norm": 0.8890729459384169, "learning_rate": 5.73965174204189e-08, "loss": 0.0093, "step": 2427 }, { "epoch": 4.74, "grad_norm": 0.3300430206415209, "learning_rate": 5.653897925522877e-08, "loss": 0.0032, "step": 2428 }, { "epoch": 4.74, "grad_norm": 0.49342600707043205, "learning_rate": 5.5687843299809524e-08, "loss": 0.0067, "step": 2429 }, { "epoch": 4.75, "grad_norm": 0.5546794717221433, "learning_rate": 5.4843111136623545e-08, "loss": 0.0064, "step": 2430 }, { "epoch": 4.75, "grad_norm": 0.6109727519261624, "learning_rate": 5.400478433622835e-08, "loss": 0.0114, "step": 2431 }, { "epoch": 4.75, "grad_norm": 0.6935354388582906, "learning_rate": 5.3172864457271926e-08, "loss": 0.0125, "step": 2432 }, { "epoch": 4.75, "grad_norm": 0.569312195583685, "learning_rate": 5.2347353046490795e-08, "loss": 0.0058, "step": 2433 }, { "epoch": 4.75, "grad_norm": 0.7042078083041506, "learning_rate": 5.1528251638705724e-08, "loss": 0.0088, "step": 2434 }, { "epoch": 4.76, "grad_norm": 0.6703765142767085, "learning_rate": 5.071556175682057e-08, "loss": 0.0092, "step": 2435 }, { "epoch": 4.76, "grad_norm": 0.5163952528784854, "learning_rate": 4.990928491181839e-08, "loss": 0.0066, "step": 2436 }, { "epoch": 4.76, "grad_norm": 0.4493814737551696, "learning_rate": 4.9109422602758746e-08, "loss": 0.004, "step": 2437 }, { "epoch": 4.76, "grad_norm": 0.6452861349411588, "learning_rate": 4.83159763167757e-08, "loss": 0.0069, "step": 2438 }, { "epoch": 4.76, "grad_norm": 0.6219716326298758, "learning_rate": 4.752894752907283e-08, "loss": 0.0075, "step": 2439 }, { "epoch": 4.77, "grad_norm": 0.6763587063327379, "learning_rate": 4.674833770292358e-08, "loss": 0.0086, "step": 2440 }, { "epoch": 4.77, "grad_norm": 0.7793902804975035, "learning_rate": 4.597414828966661e-08, "loss": 0.015, "step": 2441 }, { "epoch": 4.77, "grad_norm": 0.7046235176179817, "learning_rate": 4.5206380728703474e-08, "loss": 0.0122, "step": 2442 }, { "epoch": 4.77, "grad_norm": 0.6350272337274077, "learning_rate": 4.444503644749548e-08, "loss": 0.0097, "step": 2443 }, { "epoch": 4.77, "grad_norm": 0.69059020865564, "learning_rate": 4.369011686156293e-08, "loss": 0.0109, "step": 2444 }, { "epoch": 4.78, "grad_norm": 0.4751094277525685, "learning_rate": 4.294162337447932e-08, "loss": 0.0046, "step": 2445 }, { "epoch": 4.78, "grad_norm": 0.837407418944609, "learning_rate": 4.2199557377871676e-08, "loss": 0.0134, "step": 2446 }, { "epoch": 4.78, "grad_norm": 0.6034918464760793, "learning_rate": 4.146392025141671e-08, "loss": 0.0071, "step": 2447 }, { "epoch": 4.78, "grad_norm": 0.7334562775697001, "learning_rate": 4.073471336283768e-08, "loss": 0.0123, "step": 2448 }, { "epoch": 4.78, "grad_norm": 0.7340254541192858, "learning_rate": 4.0011938067902874e-08, "loss": 0.0153, "step": 2449 }, { "epoch": 4.79, "grad_norm": 0.7921390492815453, "learning_rate": 3.929559571042324e-08, "loss": 0.0128, "step": 2450 }, { "epoch": 4.79, "grad_norm": 0.8253048004036094, "learning_rate": 3.85856876222489e-08, "loss": 0.0153, "step": 2451 }, { "epoch": 4.79, "grad_norm": 0.6420810368190379, "learning_rate": 3.788221512326645e-08, "loss": 0.0086, "step": 2452 }, { "epoch": 4.79, "grad_norm": 0.4914126569131877, "learning_rate": 3.718517952139894e-08, "loss": 0.0051, "step": 2453 }, { "epoch": 4.79, "grad_norm": 0.6326294319835615, "learning_rate": 3.6494582112600036e-08, "loss": 0.0046, "step": 2454 }, { "epoch": 4.79, "grad_norm": 0.5885473335460268, "learning_rate": 3.5810424180853674e-08, "loss": 0.0074, "step": 2455 }, { "epoch": 4.8, "grad_norm": 0.552980980258122, "learning_rate": 3.5132706998172444e-08, "loss": 0.0076, "step": 2456 }, { "epoch": 4.8, "grad_norm": 0.6514947244503702, "learning_rate": 3.4461431824592604e-08, "loss": 0.0106, "step": 2457 }, { "epoch": 4.8, "grad_norm": 0.6736005064491258, "learning_rate": 3.3796599908173244e-08, "loss": 0.0133, "step": 2458 }, { "epoch": 4.8, "grad_norm": 0.746246307720691, "learning_rate": 3.3138212484994764e-08, "loss": 0.0082, "step": 2459 }, { "epoch": 4.8, "grad_norm": 0.6870273838277717, "learning_rate": 3.248627077915578e-08, "loss": 0.0131, "step": 2460 }, { "epoch": 4.81, "grad_norm": 0.4353106754009962, "learning_rate": 3.1840776002769965e-08, "loss": 0.0047, "step": 2461 }, { "epoch": 4.81, "grad_norm": 0.4163830064881803, "learning_rate": 3.1201729355964934e-08, "loss": 0.0047, "step": 2462 }, { "epoch": 4.81, "grad_norm": 0.7826636362220234, "learning_rate": 3.0569132026880276e-08, "loss": 0.0136, "step": 2463 }, { "epoch": 4.81, "grad_norm": 0.5419152446172166, "learning_rate": 2.994298519166366e-08, "loss": 0.0068, "step": 2464 }, { "epoch": 4.81, "grad_norm": 0.4644446962252012, "learning_rate": 2.9323290014470483e-08, "loss": 0.0059, "step": 2465 }, { "epoch": 4.82, "grad_norm": 0.7322617750120296, "learning_rate": 2.871004764746149e-08, "loss": 0.0095, "step": 2466 }, { "epoch": 4.82, "grad_norm": 0.4959769458736918, "learning_rate": 2.8103259230798925e-08, "loss": 0.0054, "step": 2467 }, { "epoch": 4.82, "grad_norm": 0.6020493706208434, "learning_rate": 2.7502925892646135e-08, "loss": 0.0087, "step": 2468 }, { "epoch": 4.82, "grad_norm": 0.5924078675138492, "learning_rate": 2.6909048749165607e-08, "loss": 0.0103, "step": 2469 }, { "epoch": 4.82, "grad_norm": 0.5423483575519701, "learning_rate": 2.6321628904515114e-08, "loss": 0.0073, "step": 2470 }, { "epoch": 4.83, "grad_norm": 0.7296204093031614, "learning_rate": 2.5740667450847297e-08, "loss": 0.0095, "step": 2471 }, { "epoch": 4.83, "grad_norm": 0.5245146692066164, "learning_rate": 2.5166165468307356e-08, "loss": 0.0067, "step": 2472 }, { "epoch": 4.83, "grad_norm": 0.6907870792556337, "learning_rate": 2.45981240250307e-08, "loss": 0.0085, "step": 2473 }, { "epoch": 4.83, "grad_norm": 0.5105390492757816, "learning_rate": 2.403654417714024e-08, "loss": 0.0042, "step": 2474 }, { "epoch": 4.83, "grad_norm": 0.5938396653001593, "learning_rate": 2.3481426968747165e-08, "loss": 0.0069, "step": 2475 }, { "epoch": 4.84, "grad_norm": 0.4416173182498096, "learning_rate": 2.293277343194472e-08, "loss": 0.0043, "step": 2476 }, { "epoch": 4.84, "grad_norm": 0.6322302131756588, "learning_rate": 2.2390584586810147e-08, "loss": 0.0052, "step": 2477 }, { "epoch": 4.84, "grad_norm": 0.6688493224965536, "learning_rate": 2.1854861441401195e-08, "loss": 0.007, "step": 2478 }, { "epoch": 4.84, "grad_norm": 0.5023078768611245, "learning_rate": 2.132560499175379e-08, "loss": 0.0056, "step": 2479 }, { "epoch": 4.84, "grad_norm": 0.6690623489521356, "learning_rate": 2.0802816221881235e-08, "loss": 0.0101, "step": 2480 }, { "epoch": 4.85, "grad_norm": 0.6769830082533458, "learning_rate": 2.0286496103771922e-08, "loss": 0.0125, "step": 2481 }, { "epoch": 4.85, "grad_norm": 0.5811716993211655, "learning_rate": 1.9776645597386564e-08, "loss": 0.0066, "step": 2482 }, { "epoch": 4.85, "grad_norm": 0.7122069488505645, "learning_rate": 1.927326565065862e-08, "loss": 0.0107, "step": 2483 }, { "epoch": 4.85, "grad_norm": 0.5275548858205786, "learning_rate": 1.8776357199490778e-08, "loss": 0.0075, "step": 2484 }, { "epoch": 4.85, "grad_norm": 0.6847597891598193, "learning_rate": 1.8285921167753403e-08, "loss": 0.0057, "step": 2485 }, { "epoch": 4.86, "grad_norm": 0.5591401366248862, "learning_rate": 1.780195846728261e-08, "loss": 0.0065, "step": 2486 }, { "epoch": 4.86, "grad_norm": 0.6697324395017826, "learning_rate": 1.732446999788023e-08, "loss": 0.0133, "step": 2487 }, { "epoch": 4.86, "grad_norm": 0.9064241768016845, "learning_rate": 1.6853456647311137e-08, "loss": 0.0067, "step": 2488 }, { "epoch": 4.86, "grad_norm": 0.7850905488293921, "learning_rate": 1.638891929129932e-08, "loss": 0.0115, "step": 2489 }, { "epoch": 4.86, "grad_norm": 0.5068163676010622, "learning_rate": 1.593085879353062e-08, "loss": 0.0061, "step": 2490 }, { "epoch": 4.87, "grad_norm": 0.6964665472899048, "learning_rate": 1.5479276005648467e-08, "loss": 0.0107, "step": 2491 }, { "epoch": 4.87, "grad_norm": 0.8216956673844958, "learning_rate": 1.5034171767251135e-08, "loss": 0.014, "step": 2492 }, { "epoch": 4.87, "grad_norm": 0.6888503451954227, "learning_rate": 1.459554690589332e-08, "loss": 0.009, "step": 2493 }, { "epoch": 4.87, "grad_norm": 0.6411693946072634, "learning_rate": 1.4163402237083011e-08, "loss": 0.0089, "step": 2494 }, { "epoch": 4.87, "grad_norm": 0.7791240099615426, "learning_rate": 1.3737738564278789e-08, "loss": 0.017, "step": 2495 }, { "epoch": 4.88, "grad_norm": 0.6308953614455673, "learning_rate": 1.3318556678890592e-08, "loss": 0.0071, "step": 2496 }, { "epoch": 4.88, "grad_norm": 0.4878271358139344, "learning_rate": 1.2905857360276996e-08, "loss": 0.005, "step": 2497 }, { "epoch": 4.88, "grad_norm": 1.1130838282396527, "learning_rate": 1.2499641375743664e-08, "loss": 0.0088, "step": 2498 }, { "epoch": 4.88, "grad_norm": 0.7660749533189488, "learning_rate": 1.2099909480542181e-08, "loss": 0.0171, "step": 2499 }, { "epoch": 4.88, "grad_norm": 0.8767739660474819, "learning_rate": 1.1706662417868885e-08, "loss": 0.0192, "step": 2500 }, { "epoch": 4.88, "grad_norm": 0.6856581281496097, "learning_rate": 1.1319900918863313e-08, "loss": 0.0069, "step": 2501 }, { "epoch": 4.89, "grad_norm": 0.5382954707232194, "learning_rate": 1.0939625702607036e-08, "loss": 0.0065, "step": 2502 }, { "epoch": 4.89, "grad_norm": 0.8459840096332762, "learning_rate": 1.0565837476121332e-08, "loss": 0.0164, "step": 2503 }, { "epoch": 4.89, "grad_norm": 0.5189712422555551, "learning_rate": 1.0198536934366786e-08, "loss": 0.0054, "step": 2504 }, { "epoch": 4.89, "grad_norm": 0.5401290079556255, "learning_rate": 9.837724760242916e-09, "loss": 0.0052, "step": 2505 }, { "epoch": 4.89, "grad_norm": 0.6759978752264958, "learning_rate": 9.483401624584276e-09, "loss": 0.0101, "step": 2506 }, { "epoch": 4.9, "grad_norm": 0.6487084797313923, "learning_rate": 9.135568186162012e-09, "loss": 0.0095, "step": 2507 }, { "epoch": 4.9, "grad_norm": 0.3863584221924932, "learning_rate": 8.794225091680763e-09, "loss": 0.0041, "step": 2508 }, { "epoch": 4.9, "grad_norm": 0.49756764018088434, "learning_rate": 8.459372975777868e-09, "loss": 0.0087, "step": 2509 }, { "epoch": 4.9, "grad_norm": 0.49070961888619186, "learning_rate": 8.13101246102338e-09, "loss": 0.0064, "step": 2510 }, { "epoch": 4.9, "grad_norm": 0.652322486005463, "learning_rate": 7.809144157916947e-09, "loss": 0.0114, "step": 2511 }, { "epoch": 4.91, "grad_norm": 0.6109551005602656, "learning_rate": 7.493768664887822e-09, "loss": 0.0102, "step": 2512 }, { "epoch": 4.91, "grad_norm": 0.6331811039279497, "learning_rate": 7.18488656829408e-09, "loss": 0.0102, "step": 2513 }, { "epoch": 4.91, "grad_norm": 0.6146723597379349, "learning_rate": 6.882498442420282e-09, "loss": 0.0111, "step": 2514 }, { "epoch": 4.91, "grad_norm": 0.4970449828707489, "learning_rate": 6.586604849477873e-09, "loss": 0.007, "step": 2515 }, { "epoch": 4.91, "grad_norm": 0.6860396865998082, "learning_rate": 6.2972063396032336e-09, "loss": 0.0132, "step": 2516 }, { "epoch": 4.92, "grad_norm": 0.6873182017269688, "learning_rate": 6.0143034508565175e-09, "loss": 0.0091, "step": 2517 }, { "epoch": 4.92, "grad_norm": 0.5611944322846386, "learning_rate": 5.737896709221257e-09, "loss": 0.0066, "step": 2518 }, { "epoch": 4.92, "grad_norm": 0.6067519806706158, "learning_rate": 5.467986628603205e-09, "loss": 0.0079, "step": 2519 }, { "epoch": 4.92, "grad_norm": 0.5494107123184346, "learning_rate": 5.204573710829163e-09, "loss": 0.0106, "step": 2520 }, { "epoch": 4.92, "grad_norm": 0.7192082925010066, "learning_rate": 4.947658445645819e-09, "loss": 0.0133, "step": 2521 }, { "epoch": 4.93, "grad_norm": 0.6610104322312917, "learning_rate": 4.697241310720135e-09, "loss": 0.0133, "step": 2522 }, { "epoch": 4.93, "grad_norm": 0.4504625334814629, "learning_rate": 4.453322771636236e-09, "loss": 0.0066, "step": 2523 }, { "epoch": 4.93, "grad_norm": 0.6909427894332516, "learning_rate": 4.2159032818965825e-09, "loss": 0.0127, "step": 2524 }, { "epoch": 4.93, "grad_norm": 0.5124889558502048, "learning_rate": 3.984983282920795e-09, "loss": 0.0055, "step": 2525 }, { "epoch": 4.93, "grad_norm": 0.6906124483724745, "learning_rate": 3.760563204042944e-09, "loss": 0.0106, "step": 2526 }, { "epoch": 4.94, "grad_norm": 0.8097448237070037, "learning_rate": 3.5426434625138724e-09, "loss": 0.023, "step": 2527 }, { "epoch": 4.94, "grad_norm": 0.6701900952609197, "learning_rate": 3.331224463497706e-09, "loss": 0.01, "step": 2528 }, { "epoch": 4.94, "grad_norm": 0.7015149974787596, "learning_rate": 3.126306600072626e-09, "loss": 0.0076, "step": 2529 }, { "epoch": 4.94, "grad_norm": 0.6824597899075813, "learning_rate": 2.9278902532293148e-09, "loss": 0.0127, "step": 2530 }, { "epoch": 4.94, "grad_norm": 0.6088336349155752, "learning_rate": 2.7359757918709593e-09, "loss": 0.0086, "step": 2531 }, { "epoch": 4.95, "grad_norm": 0.5716860747315653, "learning_rate": 2.5505635728116927e-09, "loss": 0.0078, "step": 2532 }, { "epoch": 4.95, "grad_norm": 0.8981526091034873, "learning_rate": 2.371653940776597e-09, "loss": 0.0207, "step": 2533 }, { "epoch": 4.95, "grad_norm": 1.1659139011965682, "learning_rate": 2.199247228401702e-09, "loss": 0.0082, "step": 2534 }, { "epoch": 4.95, "grad_norm": 0.8019533478284275, "learning_rate": 2.0333437562316535e-09, "loss": 0.0112, "step": 2535 }, { "epoch": 4.95, "grad_norm": 0.6127857436209697, "learning_rate": 1.873943832720104e-09, "loss": 0.0101, "step": 2536 }, { "epoch": 4.96, "grad_norm": 0.7568109719578202, "learning_rate": 1.7210477542297098e-09, "loss": 0.0139, "step": 2537 }, { "epoch": 4.96, "grad_norm": 0.6108443808757446, "learning_rate": 1.5746558050298009e-09, "loss": 0.009, "step": 2538 }, { "epoch": 4.96, "grad_norm": 0.8695094246389805, "learning_rate": 1.4347682572983244e-09, "loss": 0.0198, "step": 2539 }, { "epoch": 4.96, "grad_norm": 0.39766690400669447, "learning_rate": 1.3013853711191237e-09, "loss": 0.0036, "step": 2540 }, { "epoch": 4.96, "grad_norm": 0.7377255398782304, "learning_rate": 1.1745073944827156e-09, "loss": 0.0099, "step": 2541 }, { "epoch": 4.96, "grad_norm": 0.5763337093068304, "learning_rate": 1.054134563285125e-09, "loss": 0.0078, "step": 2542 }, { "epoch": 4.97, "grad_norm": 0.5783661415850757, "learning_rate": 9.402671013282738e-10, "loss": 0.0083, "step": 2543 }, { "epoch": 4.97, "grad_norm": 0.5387864551357069, "learning_rate": 8.329052203180364e-10, "loss": 0.0082, "step": 2544 }, { "epoch": 4.97, "grad_norm": 0.784367271566863, "learning_rate": 7.320491198665735e-10, "loss": 0.0149, "step": 2545 }, { "epoch": 4.97, "grad_norm": 0.46423696116496005, "learning_rate": 6.376989874884443e-10, "loss": 0.0053, "step": 2546 }, { "epoch": 4.97, "grad_norm": 0.8358649117346555, "learning_rate": 5.498549986033274e-10, "loss": 0.0166, "step": 2547 }, { "epoch": 4.98, "grad_norm": 0.6749233417603095, "learning_rate": 4.685173165336897e-10, "loss": 0.0153, "step": 2548 }, { "epoch": 4.98, "grad_norm": 0.622260451080664, "learning_rate": 3.9368609250595154e-10, "loss": 0.0097, "step": 2549 }, { "epoch": 4.98, "grad_norm": 0.7124569886363933, "learning_rate": 3.253614656489323e-10, "loss": 0.0096, "step": 2550 }, { "epoch": 4.98, "grad_norm": 0.5725589266360342, "learning_rate": 2.6354356299423954e-10, "loss": 0.0075, "step": 2551 }, { "epoch": 4.98, "grad_norm": 0.3435439677752349, "learning_rate": 2.0823249947587997e-10, "loss": 0.0034, "step": 2552 }, { "epoch": 4.99, "grad_norm": 0.8291843223050324, "learning_rate": 1.5942837793025965e-10, "loss": 0.0159, "step": 2553 }, { "epoch": 4.99, "grad_norm": 0.49647217724245607, "learning_rate": 1.1713128909618397e-10, "loss": 0.0066, "step": 2554 }, { "epoch": 4.99, "grad_norm": 0.5946674061015385, "learning_rate": 8.134131161330327e-11, "loss": 0.0087, "step": 2555 }, { "epoch": 4.99, "grad_norm": 0.7595161127269165, "learning_rate": 5.205851202444434e-11, "loss": 0.0158, "step": 2556 }, { "epoch": 4.99, "grad_norm": 0.7128316105743503, "learning_rate": 2.92829447728904e-11, "loss": 0.0088, "step": 2557 }, { "epoch": 5.0, "grad_norm": 0.5376998433398467, "learning_rate": 1.3014652203546806e-11, "loss": 0.008, "step": 2558 }, { "epoch": 5.0, "grad_norm": 0.6764238652122477, "learning_rate": 3.25366456332965e-12, "loss": 0.0111, "step": 2559 }, { "epoch": 5.0, "grad_norm": 0.6084787526814462, "learning_rate": 0.0, "loss": 0.0082, "step": 2560 }, { "epoch": 5.0, "step": 2560, "total_flos": 0.0, "train_loss": 0.1665668800491403, "train_runtime": 7422.5523, "train_samples_per_second": 11.048, "train_steps_per_second": 0.345 } ], "logging_steps": 1.0, "max_steps": 2560, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }