{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.997639653815893, "eval_steps": 500, "global_step": 3170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003147128245476003, "grad_norm": 2.0566761052521856, "learning_rate": 4.1666666666666667e-07, "loss": 1.0248, "step": 1 }, { "epoch": 0.006294256490952006, "grad_norm": 2.078116756187581, "learning_rate": 8.333333333333333e-07, "loss": 1.0183, "step": 2 }, { "epoch": 0.00944138473642801, "grad_norm": 2.055186894692077, "learning_rate": 1.25e-06, "loss": 1.0165, "step": 3 }, { "epoch": 0.012588512981904013, "grad_norm": 2.0254172121373073, "learning_rate": 1.6666666666666667e-06, "loss": 1.0132, "step": 4 }, { "epoch": 0.015735641227380016, "grad_norm": 1.979378254901161, "learning_rate": 2.0833333333333334e-06, "loss": 1.0338, "step": 5 }, { "epoch": 0.01888276947285602, "grad_norm": 1.6978990047138407, "learning_rate": 2.5e-06, "loss": 1.0147, "step": 6 }, { "epoch": 0.022029897718332022, "grad_norm": 1.5934719348056317, "learning_rate": 2.916666666666667e-06, "loss": 1.0025, "step": 7 }, { "epoch": 0.025177025963808025, "grad_norm": 1.1983077219680367, "learning_rate": 3.3333333333333333e-06, "loss": 0.9763, "step": 8 }, { "epoch": 0.02832415420928403, "grad_norm": 1.0943483464567008, "learning_rate": 3.7500000000000005e-06, "loss": 0.9673, "step": 9 }, { "epoch": 0.03147128245476003, "grad_norm": 1.0795799352113267, "learning_rate": 4.166666666666667e-06, "loss": 0.9625, "step": 10 }, { "epoch": 0.03461841070023604, "grad_norm": 1.2586895987651956, "learning_rate": 4.583333333333333e-06, "loss": 0.9443, "step": 11 }, { "epoch": 0.03776553894571204, "grad_norm": 1.2360462710902367, "learning_rate": 5e-06, "loss": 0.952, "step": 12 }, { "epoch": 0.040912667191188044, "grad_norm": 1.158712657793634, "learning_rate": 5.416666666666667e-06, "loss": 0.9186, "step": 13 }, { "epoch": 0.044059795436664044, "grad_norm": 0.9342993351221153, "learning_rate": 5.833333333333334e-06, "loss": 0.8874, "step": 14 }, { "epoch": 0.04720692368214005, "grad_norm": 1.0504874222027794, "learning_rate": 6.25e-06, "loss": 0.878, "step": 15 }, { "epoch": 0.05035405192761605, "grad_norm": 0.8975104331120672, "learning_rate": 6.666666666666667e-06, "loss": 0.8668, "step": 16 }, { "epoch": 0.05350118017309206, "grad_norm": 0.6476430875482199, "learning_rate": 7.083333333333335e-06, "loss": 0.8655, "step": 17 }, { "epoch": 0.05664830841856806, "grad_norm": 0.49682103011953394, "learning_rate": 7.500000000000001e-06, "loss": 0.8502, "step": 18 }, { "epoch": 0.05979543666404406, "grad_norm": 0.5685849690063021, "learning_rate": 7.916666666666667e-06, "loss": 0.8249, "step": 19 }, { "epoch": 0.06294256490952006, "grad_norm": 0.7286039018171099, "learning_rate": 8.333333333333334e-06, "loss": 0.8183, "step": 20 }, { "epoch": 0.06608969315499606, "grad_norm": 0.650325267587393, "learning_rate": 8.750000000000001e-06, "loss": 0.8078, "step": 21 }, { "epoch": 0.06923682140047208, "grad_norm": 0.524249002042332, "learning_rate": 9.166666666666666e-06, "loss": 0.7968, "step": 22 }, { "epoch": 0.07238394964594808, "grad_norm": 0.42266135038030506, "learning_rate": 9.583333333333335e-06, "loss": 0.793, "step": 23 }, { "epoch": 0.07553107789142408, "grad_norm": 0.45652357144630545, "learning_rate": 1e-05, "loss": 0.786, "step": 24 }, { "epoch": 0.07867820613690008, "grad_norm": 0.48851305915388266, "learning_rate": 1.0416666666666668e-05, "loss": 0.7842, "step": 25 }, { "epoch": 0.08182533438237609, "grad_norm": 0.47219631692611636, "learning_rate": 1.0833333333333334e-05, "loss": 0.7726, "step": 26 }, { "epoch": 0.08497246262785209, "grad_norm": 0.39201363065054773, "learning_rate": 1.125e-05, "loss": 0.768, "step": 27 }, { "epoch": 0.08811959087332809, "grad_norm": 0.3342565416438565, "learning_rate": 1.1666666666666668e-05, "loss": 0.7589, "step": 28 }, { "epoch": 0.09126671911880409, "grad_norm": 0.35827703185804977, "learning_rate": 1.2083333333333333e-05, "loss": 0.7723, "step": 29 }, { "epoch": 0.0944138473642801, "grad_norm": 0.3625916446194259, "learning_rate": 1.25e-05, "loss": 0.7524, "step": 30 }, { "epoch": 0.0975609756097561, "grad_norm": 0.32147227722705174, "learning_rate": 1.2916666666666668e-05, "loss": 0.7462, "step": 31 }, { "epoch": 0.1007081038552321, "grad_norm": 0.3105919347762339, "learning_rate": 1.3333333333333333e-05, "loss": 0.7432, "step": 32 }, { "epoch": 0.1038552321007081, "grad_norm": 0.2941313278165609, "learning_rate": 1.375e-05, "loss": 0.7487, "step": 33 }, { "epoch": 0.10700236034618411, "grad_norm": 0.2847875994844311, "learning_rate": 1.416666666666667e-05, "loss": 0.7279, "step": 34 }, { "epoch": 0.11014948859166011, "grad_norm": 0.29110050664950804, "learning_rate": 1.4583333333333333e-05, "loss": 0.7264, "step": 35 }, { "epoch": 0.11329661683713611, "grad_norm": 0.2758326744258242, "learning_rate": 1.5000000000000002e-05, "loss": 0.7289, "step": 36 }, { "epoch": 0.11644374508261211, "grad_norm": 0.25172506909717546, "learning_rate": 1.5416666666666668e-05, "loss": 0.7233, "step": 37 }, { "epoch": 0.11959087332808813, "grad_norm": 0.2472744394585722, "learning_rate": 1.5833333333333333e-05, "loss": 0.729, "step": 38 }, { "epoch": 0.12273800157356413, "grad_norm": 0.2646648296393675, "learning_rate": 1.6250000000000002e-05, "loss": 0.7279, "step": 39 }, { "epoch": 0.12588512981904013, "grad_norm": 0.24358773217689053, "learning_rate": 1.6666666666666667e-05, "loss": 0.7184, "step": 40 }, { "epoch": 0.12903225806451613, "grad_norm": 0.2393241155630341, "learning_rate": 1.7083333333333333e-05, "loss": 0.7136, "step": 41 }, { "epoch": 0.13217938630999213, "grad_norm": 0.2330003474602153, "learning_rate": 1.7500000000000002e-05, "loss": 0.711, "step": 42 }, { "epoch": 0.13532651455546812, "grad_norm": 0.22294095752365714, "learning_rate": 1.7916666666666667e-05, "loss": 0.7126, "step": 43 }, { "epoch": 0.13847364280094415, "grad_norm": 0.23816885540234745, "learning_rate": 1.8333333333333333e-05, "loss": 0.7188, "step": 44 }, { "epoch": 0.14162077104642015, "grad_norm": 0.2257889298086421, "learning_rate": 1.8750000000000002e-05, "loss": 0.6991, "step": 45 }, { "epoch": 0.14476789929189615, "grad_norm": 0.20099324635222396, "learning_rate": 1.916666666666667e-05, "loss": 0.7006, "step": 46 }, { "epoch": 0.14791502753737215, "grad_norm": 0.25186139333784574, "learning_rate": 1.9583333333333333e-05, "loss": 0.7087, "step": 47 }, { "epoch": 0.15106215578284815, "grad_norm": 0.2232374205375328, "learning_rate": 2e-05, "loss": 0.6971, "step": 48 }, { "epoch": 0.15420928402832415, "grad_norm": 0.21825531385293007, "learning_rate": 2.0416666666666667e-05, "loss": 0.697, "step": 49 }, { "epoch": 0.15735641227380015, "grad_norm": 0.21596204587349424, "learning_rate": 2.0833333333333336e-05, "loss": 0.6887, "step": 50 }, { "epoch": 0.16050354051927615, "grad_norm": 0.23116942027734438, "learning_rate": 2.125e-05, "loss": 0.6885, "step": 51 }, { "epoch": 0.16365066876475218, "grad_norm": 0.21015812381257615, "learning_rate": 2.1666666666666667e-05, "loss": 0.6866, "step": 52 }, { "epoch": 0.16679779701022818, "grad_norm": 0.19996909500963955, "learning_rate": 2.2083333333333336e-05, "loss": 0.6898, "step": 53 }, { "epoch": 0.16994492525570418, "grad_norm": 0.20997251324092625, "learning_rate": 2.25e-05, "loss": 0.6836, "step": 54 }, { "epoch": 0.17309205350118018, "grad_norm": 0.20108945450898513, "learning_rate": 2.2916666666666667e-05, "loss": 0.6868, "step": 55 }, { "epoch": 0.17623918174665618, "grad_norm": 0.2035143838254788, "learning_rate": 2.3333333333333336e-05, "loss": 0.6936, "step": 56 }, { "epoch": 0.17938630999213218, "grad_norm": 0.2004298904967849, "learning_rate": 2.375e-05, "loss": 0.6746, "step": 57 }, { "epoch": 0.18253343823760818, "grad_norm": 0.20059328010088198, "learning_rate": 2.4166666666666667e-05, "loss": 0.682, "step": 58 }, { "epoch": 0.18568056648308418, "grad_norm": 0.21755269002083433, "learning_rate": 2.4583333333333336e-05, "loss": 0.6735, "step": 59 }, { "epoch": 0.1888276947285602, "grad_norm": 0.2129373116359228, "learning_rate": 2.5e-05, "loss": 0.6663, "step": 60 }, { "epoch": 0.1919748229740362, "grad_norm": 0.1995735536259152, "learning_rate": 2.5416666666666667e-05, "loss": 0.6787, "step": 61 }, { "epoch": 0.1951219512195122, "grad_norm": 0.23037748881825523, "learning_rate": 2.5833333333333336e-05, "loss": 0.6703, "step": 62 }, { "epoch": 0.1982690794649882, "grad_norm": 0.18391751461207972, "learning_rate": 2.625e-05, "loss": 0.6764, "step": 63 }, { "epoch": 0.2014162077104642, "grad_norm": 0.2123421226257098, "learning_rate": 2.6666666666666667e-05, "loss": 0.6714, "step": 64 }, { "epoch": 0.2045633359559402, "grad_norm": 0.20183153602864587, "learning_rate": 2.7083333333333335e-05, "loss": 0.6656, "step": 65 }, { "epoch": 0.2077104642014162, "grad_norm": 0.19119357792446254, "learning_rate": 2.75e-05, "loss": 0.6684, "step": 66 }, { "epoch": 0.2108575924468922, "grad_norm": 0.20177148219300692, "learning_rate": 2.7916666666666666e-05, "loss": 0.6458, "step": 67 }, { "epoch": 0.21400472069236823, "grad_norm": 0.22326018847799878, "learning_rate": 2.833333333333334e-05, "loss": 0.6659, "step": 68 }, { "epoch": 0.21715184893784423, "grad_norm": 0.22960589662619602, "learning_rate": 2.875e-05, "loss": 0.6814, "step": 69 }, { "epoch": 0.22029897718332023, "grad_norm": 0.20556408160244669, "learning_rate": 2.9166666666666666e-05, "loss": 0.6651, "step": 70 }, { "epoch": 0.22344610542879623, "grad_norm": 0.23091408485344644, "learning_rate": 2.958333333333334e-05, "loss": 0.6587, "step": 71 }, { "epoch": 0.22659323367427223, "grad_norm": 0.24593395754345967, "learning_rate": 3.0000000000000004e-05, "loss": 0.6559, "step": 72 }, { "epoch": 0.22974036191974823, "grad_norm": 0.2941140735112936, "learning_rate": 3.0416666666666666e-05, "loss": 0.6523, "step": 73 }, { "epoch": 0.23288749016522423, "grad_norm": 0.2726093572840182, "learning_rate": 3.0833333333333335e-05, "loss": 0.6685, "step": 74 }, { "epoch": 0.23603461841070023, "grad_norm": 0.28195810887905565, "learning_rate": 3.125e-05, "loss": 0.6614, "step": 75 }, { "epoch": 0.23918174665617625, "grad_norm": 0.24022069913129832, "learning_rate": 3.1666666666666666e-05, "loss": 0.6593, "step": 76 }, { "epoch": 0.24232887490165225, "grad_norm": 0.25139586251835144, "learning_rate": 3.208333333333334e-05, "loss": 0.659, "step": 77 }, { "epoch": 0.24547600314712825, "grad_norm": 0.2566709925175564, "learning_rate": 3.2500000000000004e-05, "loss": 0.6542, "step": 78 }, { "epoch": 0.24862313139260425, "grad_norm": 0.2883214448935213, "learning_rate": 3.291666666666667e-05, "loss": 0.6471, "step": 79 }, { "epoch": 0.25177025963808025, "grad_norm": 0.30668415687027056, "learning_rate": 3.3333333333333335e-05, "loss": 0.6439, "step": 80 }, { "epoch": 0.2549173878835563, "grad_norm": 0.29042450307830464, "learning_rate": 3.375e-05, "loss": 0.651, "step": 81 }, { "epoch": 0.25806451612903225, "grad_norm": 0.2736791002695721, "learning_rate": 3.4166666666666666e-05, "loss": 0.6467, "step": 82 }, { "epoch": 0.2612116443745083, "grad_norm": 0.265465779092424, "learning_rate": 3.458333333333334e-05, "loss": 0.6412, "step": 83 }, { "epoch": 0.26435877261998425, "grad_norm": 0.2968535790814613, "learning_rate": 3.5000000000000004e-05, "loss": 0.6574, "step": 84 }, { "epoch": 0.2675059008654603, "grad_norm": 0.43190214956783235, "learning_rate": 3.541666666666667e-05, "loss": 0.6495, "step": 85 }, { "epoch": 0.27065302911093625, "grad_norm": 0.632308039014685, "learning_rate": 3.5833333333333335e-05, "loss": 0.6515, "step": 86 }, { "epoch": 0.2738001573564123, "grad_norm": 0.7849780285031561, "learning_rate": 3.625e-05, "loss": 0.6546, "step": 87 }, { "epoch": 0.2769472856018883, "grad_norm": 0.7233136246737597, "learning_rate": 3.6666666666666666e-05, "loss": 0.6468, "step": 88 }, { "epoch": 0.2800944138473643, "grad_norm": 0.5392685671512011, "learning_rate": 3.708333333333334e-05, "loss": 0.6366, "step": 89 }, { "epoch": 0.2832415420928403, "grad_norm": 0.6548726987366142, "learning_rate": 3.7500000000000003e-05, "loss": 0.6433, "step": 90 }, { "epoch": 0.2863886703383163, "grad_norm": 0.74072769066675, "learning_rate": 3.791666666666667e-05, "loss": 0.6421, "step": 91 }, { "epoch": 0.2895357985837923, "grad_norm": 0.37857830042275176, "learning_rate": 3.833333333333334e-05, "loss": 0.65, "step": 92 }, { "epoch": 0.2926829268292683, "grad_norm": 0.5508648164496704, "learning_rate": 3.875e-05, "loss": 0.6493, "step": 93 }, { "epoch": 0.2958300550747443, "grad_norm": 0.5478861545420471, "learning_rate": 3.9166666666666665e-05, "loss": 0.6469, "step": 94 }, { "epoch": 0.2989771833202203, "grad_norm": 0.3468090216652617, "learning_rate": 3.958333333333334e-05, "loss": 0.6514, "step": 95 }, { "epoch": 0.3021243115656963, "grad_norm": 0.6547220457148604, "learning_rate": 4e-05, "loss": 0.649, "step": 96 }, { "epoch": 0.30527143981117233, "grad_norm": 0.46761393726738054, "learning_rate": 3.999999059985635e-05, "loss": 0.6408, "step": 97 }, { "epoch": 0.3084185680566483, "grad_norm": 0.39367909064047446, "learning_rate": 3.99999623994352e-05, "loss": 0.6365, "step": 98 }, { "epoch": 0.31156569630212433, "grad_norm": 0.5946767742649087, "learning_rate": 3.9999915398766006e-05, "loss": 0.6366, "step": 99 }, { "epoch": 0.3147128245476003, "grad_norm": 0.31375774268214407, "learning_rate": 3.999984959789786e-05, "loss": 0.6389, "step": 100 }, { "epoch": 0.31785995279307633, "grad_norm": 0.5057217370873666, "learning_rate": 3.9999764996899494e-05, "loss": 0.6457, "step": 101 }, { "epoch": 0.3210070810385523, "grad_norm": 0.35265559358910226, "learning_rate": 3.9999661595859275e-05, "loss": 0.6438, "step": 102 }, { "epoch": 0.3241542092840283, "grad_norm": 0.341984180495186, "learning_rate": 3.9999539394885177e-05, "loss": 0.6275, "step": 103 }, { "epoch": 0.32730133752950435, "grad_norm": 0.3862289663549392, "learning_rate": 3.999939839410486e-05, "loss": 0.6279, "step": 104 }, { "epoch": 0.3304484657749803, "grad_norm": 0.30610508770190564, "learning_rate": 3.999923859366557e-05, "loss": 0.6335, "step": 105 }, { "epoch": 0.33359559402045635, "grad_norm": 0.39738483622597887, "learning_rate": 3.999905999373424e-05, "loss": 0.6275, "step": 106 }, { "epoch": 0.3367427222659323, "grad_norm": 0.34695466353973403, "learning_rate": 3.9998862594497396e-05, "loss": 0.634, "step": 107 }, { "epoch": 0.33988985051140835, "grad_norm": 0.4434518808465586, "learning_rate": 3.999864639616121e-05, "loss": 0.6374, "step": 108 }, { "epoch": 0.3430369787568843, "grad_norm": 0.33772070770009105, "learning_rate": 3.99984113989515e-05, "loss": 0.6266, "step": 109 }, { "epoch": 0.34618410700236035, "grad_norm": 0.2584585866122632, "learning_rate": 3.99981576031137e-05, "loss": 0.6292, "step": 110 }, { "epoch": 0.3493312352478363, "grad_norm": 0.3611261393186681, "learning_rate": 3.9997885008912905e-05, "loss": 0.6361, "step": 111 }, { "epoch": 0.35247836349331235, "grad_norm": 0.3023341429429724, "learning_rate": 3.999759361663381e-05, "loss": 0.6325, "step": 112 }, { "epoch": 0.3556254917387884, "grad_norm": 0.30908333541351135, "learning_rate": 3.999728342658079e-05, "loss": 0.6368, "step": 113 }, { "epoch": 0.35877261998426435, "grad_norm": 0.265928899655407, "learning_rate": 3.999695443907781e-05, "loss": 0.6303, "step": 114 }, { "epoch": 0.3619197482297404, "grad_norm": 0.27333927680685793, "learning_rate": 3.9996606654468476e-05, "loss": 0.6277, "step": 115 }, { "epoch": 0.36506687647521635, "grad_norm": 0.2744818724487684, "learning_rate": 3.9996240073116044e-05, "loss": 0.6272, "step": 116 }, { "epoch": 0.3682140047206924, "grad_norm": 0.2869505492537586, "learning_rate": 3.99958546954034e-05, "loss": 0.6165, "step": 117 }, { "epoch": 0.37136113296616835, "grad_norm": 0.26133884085799125, "learning_rate": 3.9995450521733044e-05, "loss": 0.6303, "step": 118 }, { "epoch": 0.3745082612116444, "grad_norm": 0.28364779766814496, "learning_rate": 3.9995027552527126e-05, "loss": 0.6355, "step": 119 }, { "epoch": 0.3776553894571204, "grad_norm": 0.26991948715363395, "learning_rate": 3.9994585788227425e-05, "loss": 0.6353, "step": 120 }, { "epoch": 0.3808025177025964, "grad_norm": 0.29168641093859365, "learning_rate": 3.9994125229295335e-05, "loss": 0.6347, "step": 121 }, { "epoch": 0.3839496459480724, "grad_norm": 0.2778112649452421, "learning_rate": 3.999364587621189e-05, "loss": 0.6314, "step": 122 }, { "epoch": 0.3870967741935484, "grad_norm": 0.25501755874765036, "learning_rate": 3.9993147729477775e-05, "loss": 0.6287, "step": 123 }, { "epoch": 0.3902439024390244, "grad_norm": 0.34414954964963435, "learning_rate": 3.999263078961327e-05, "loss": 0.6278, "step": 124 }, { "epoch": 0.3933910306845004, "grad_norm": 0.34610957165859696, "learning_rate": 3.9992095057158304e-05, "loss": 0.6216, "step": 125 }, { "epoch": 0.3965381589299764, "grad_norm": 0.28674531389146546, "learning_rate": 3.999154053267242e-05, "loss": 0.638, "step": 126 }, { "epoch": 0.3996852871754524, "grad_norm": 0.25576505127419086, "learning_rate": 3.99909672167348e-05, "loss": 0.6259, "step": 127 }, { "epoch": 0.4028324154209284, "grad_norm": 0.31068566841934725, "learning_rate": 3.9990375109944254e-05, "loss": 0.6266, "step": 128 }, { "epoch": 0.40597954366640443, "grad_norm": 0.43495770039787945, "learning_rate": 3.998976421291921e-05, "loss": 0.6194, "step": 129 }, { "epoch": 0.4091266719118804, "grad_norm": 0.40876214950723583, "learning_rate": 3.998913452629773e-05, "loss": 0.6261, "step": 130 }, { "epoch": 0.41227380015735643, "grad_norm": 0.26678746806822895, "learning_rate": 3.998848605073749e-05, "loss": 0.63, "step": 131 }, { "epoch": 0.4154209284028324, "grad_norm": 0.2878327006301991, "learning_rate": 3.9987818786915807e-05, "loss": 0.6204, "step": 132 }, { "epoch": 0.41856805664830843, "grad_norm": 0.3111848739668028, "learning_rate": 3.9987132735529594e-05, "loss": 0.6297, "step": 133 }, { "epoch": 0.4217151848937844, "grad_norm": 0.2609480696864346, "learning_rate": 3.998642789729543e-05, "loss": 0.6231, "step": 134 }, { "epoch": 0.42486231313926043, "grad_norm": 0.2811609267853307, "learning_rate": 3.998570427294947e-05, "loss": 0.6187, "step": 135 }, { "epoch": 0.42800944138473646, "grad_norm": 0.3489218624854075, "learning_rate": 3.998496186324753e-05, "loss": 0.6286, "step": 136 }, { "epoch": 0.4311565696302124, "grad_norm": 0.3209965825815324, "learning_rate": 3.9984200668965e-05, "loss": 0.6146, "step": 137 }, { "epoch": 0.43430369787568845, "grad_norm": 0.2527692210447605, "learning_rate": 3.998342069089694e-05, "loss": 0.6203, "step": 138 }, { "epoch": 0.4374508261211644, "grad_norm": 0.2863723927860823, "learning_rate": 3.9982621929857994e-05, "loss": 0.6186, "step": 139 }, { "epoch": 0.44059795436664045, "grad_norm": 0.3362120402580175, "learning_rate": 3.998180438668244e-05, "loss": 0.6173, "step": 140 }, { "epoch": 0.4437450826121164, "grad_norm": 0.29944731618068043, "learning_rate": 3.998096806222417e-05, "loss": 0.6079, "step": 141 }, { "epoch": 0.44689221085759245, "grad_norm": 0.2521452559671069, "learning_rate": 3.9980112957356705e-05, "loss": 0.6249, "step": 142 }, { "epoch": 0.4500393391030684, "grad_norm": 0.2545938988617545, "learning_rate": 3.997923907297315e-05, "loss": 0.6083, "step": 143 }, { "epoch": 0.45318646734854445, "grad_norm": 0.25898746551692964, "learning_rate": 3.997834640998624e-05, "loss": 0.6146, "step": 144 }, { "epoch": 0.4563335955940205, "grad_norm": 0.2771475593887788, "learning_rate": 3.9977434969328344e-05, "loss": 0.6155, "step": 145 }, { "epoch": 0.45948072383949645, "grad_norm": 0.2715220470047786, "learning_rate": 3.9976504751951415e-05, "loss": 0.6139, "step": 146 }, { "epoch": 0.4626278520849725, "grad_norm": 0.262357157875343, "learning_rate": 3.997555575882702e-05, "loss": 0.6109, "step": 147 }, { "epoch": 0.46577498033044845, "grad_norm": 0.2656139774401674, "learning_rate": 3.9974587990946365e-05, "loss": 0.6195, "step": 148 }, { "epoch": 0.4689221085759245, "grad_norm": 0.27484911731602474, "learning_rate": 3.997360144932023e-05, "loss": 0.6167, "step": 149 }, { "epoch": 0.47206923682140045, "grad_norm": 0.27057626890655806, "learning_rate": 3.997259613497902e-05, "loss": 0.6268, "step": 150 }, { "epoch": 0.4752163650668765, "grad_norm": 0.22502846697134835, "learning_rate": 3.9971572048972754e-05, "loss": 0.6159, "step": 151 }, { "epoch": 0.4783634933123525, "grad_norm": 0.2812778266326769, "learning_rate": 3.997052919237105e-05, "loss": 0.621, "step": 152 }, { "epoch": 0.4815106215578285, "grad_norm": 0.28278818974675024, "learning_rate": 3.9969467566263115e-05, "loss": 0.6238, "step": 153 }, { "epoch": 0.4846577498033045, "grad_norm": 0.3420285701015734, "learning_rate": 3.996838717175779e-05, "loss": 0.6118, "step": 154 }, { "epoch": 0.4878048780487805, "grad_norm": 0.35539942258827856, "learning_rate": 3.9967288009983496e-05, "loss": 0.6168, "step": 155 }, { "epoch": 0.4909520062942565, "grad_norm": 0.29368139728658277, "learning_rate": 3.996617008208827e-05, "loss": 0.6049, "step": 156 }, { "epoch": 0.4940991345397325, "grad_norm": 0.27690796066202816, "learning_rate": 3.996503338923974e-05, "loss": 0.6151, "step": 157 }, { "epoch": 0.4972462627852085, "grad_norm": 0.41334059184826466, "learning_rate": 3.9963877932625134e-05, "loss": 0.6184, "step": 158 }, { "epoch": 0.5003933910306845, "grad_norm": 0.42636029329489616, "learning_rate": 3.996270371345129e-05, "loss": 0.6144, "step": 159 }, { "epoch": 0.5035405192761605, "grad_norm": 0.2810420864283484, "learning_rate": 3.9961510732944624e-05, "loss": 0.6185, "step": 160 }, { "epoch": 0.5066876475216365, "grad_norm": 0.3032979670400302, "learning_rate": 3.996029899235116e-05, "loss": 0.6009, "step": 161 }, { "epoch": 0.5098347757671126, "grad_norm": 0.4124798386817749, "learning_rate": 3.9959068492936517e-05, "loss": 0.608, "step": 162 }, { "epoch": 0.5129819040125885, "grad_norm": 0.46292941599151394, "learning_rate": 3.99578192359859e-05, "loss": 0.6303, "step": 163 }, { "epoch": 0.5161290322580645, "grad_norm": 0.4249023229216657, "learning_rate": 3.99565512228041e-05, "loss": 0.6158, "step": 164 }, { "epoch": 0.5192761605035405, "grad_norm": 0.3081385660055859, "learning_rate": 3.9955264454715524e-05, "loss": 0.604, "step": 165 }, { "epoch": 0.5224232887490166, "grad_norm": 0.3115598725592295, "learning_rate": 3.995395893306414e-05, "loss": 0.6132, "step": 166 }, { "epoch": 0.5255704169944925, "grad_norm": 0.423339283046509, "learning_rate": 3.995263465921351e-05, "loss": 0.6133, "step": 167 }, { "epoch": 0.5287175452399685, "grad_norm": 0.38579662867406184, "learning_rate": 3.9951291634546784e-05, "loss": 0.6046, "step": 168 }, { "epoch": 0.5318646734854445, "grad_norm": 0.2646661405581634, "learning_rate": 3.9949929860466715e-05, "loss": 0.6065, "step": 169 }, { "epoch": 0.5350118017309206, "grad_norm": 0.29299500671952294, "learning_rate": 3.994854933839561e-05, "loss": 0.597, "step": 170 }, { "epoch": 0.5381589299763966, "grad_norm": 0.3801312863346971, "learning_rate": 3.994715006977536e-05, "loss": 0.609, "step": 171 }, { "epoch": 0.5413060582218725, "grad_norm": 0.2636893704090895, "learning_rate": 3.994573205606747e-05, "loss": 0.6059, "step": 172 }, { "epoch": 0.5444531864673485, "grad_norm": 0.2587055020132136, "learning_rate": 3.994429529875298e-05, "loss": 0.5968, "step": 173 }, { "epoch": 0.5476003147128246, "grad_norm": 0.3153597828303425, "learning_rate": 3.994283979933254e-05, "loss": 0.6133, "step": 174 }, { "epoch": 0.5507474429583006, "grad_norm": 0.26775970858600634, "learning_rate": 3.994136555932635e-05, "loss": 0.6045, "step": 175 }, { "epoch": 0.5538945712037766, "grad_norm": 0.3094235529607136, "learning_rate": 3.993987258027419e-05, "loss": 0.6089, "step": 176 }, { "epoch": 0.5570416994492525, "grad_norm": 0.28763348823077006, "learning_rate": 3.9938360863735435e-05, "loss": 0.609, "step": 177 }, { "epoch": 0.5601888276947286, "grad_norm": 0.2177547953667043, "learning_rate": 3.9936830411289e-05, "loss": 0.6154, "step": 178 }, { "epoch": 0.5633359559402046, "grad_norm": 0.2817386027141762, "learning_rate": 3.993528122453339e-05, "loss": 0.6119, "step": 179 }, { "epoch": 0.5664830841856806, "grad_norm": 0.2844686972678976, "learning_rate": 3.993371330508666e-05, "loss": 0.5981, "step": 180 }, { "epoch": 0.5696302124311565, "grad_norm": 0.2448670332544363, "learning_rate": 3.9932126654586446e-05, "loss": 0.5915, "step": 181 }, { "epoch": 0.5727773406766326, "grad_norm": 0.2597177617957836, "learning_rate": 3.993052127468994e-05, "loss": 0.5928, "step": 182 }, { "epoch": 0.5759244689221086, "grad_norm": 0.2215768221036163, "learning_rate": 3.99288971670739e-05, "loss": 0.6161, "step": 183 }, { "epoch": 0.5790715971675846, "grad_norm": 0.2699731828037381, "learning_rate": 3.9927254333434656e-05, "loss": 0.5921, "step": 184 }, { "epoch": 0.5822187254130606, "grad_norm": 0.29227902549722135, "learning_rate": 3.9925592775488046e-05, "loss": 0.5976, "step": 185 }, { "epoch": 0.5853658536585366, "grad_norm": 0.2541877399622803, "learning_rate": 3.9923912494969536e-05, "loss": 0.6102, "step": 186 }, { "epoch": 0.5885129819040126, "grad_norm": 0.3043738596400513, "learning_rate": 3.9922213493634096e-05, "loss": 0.611, "step": 187 }, { "epoch": 0.5916601101494886, "grad_norm": 0.2769382151889082, "learning_rate": 3.992049577325627e-05, "loss": 0.609, "step": 188 }, { "epoch": 0.5948072383949646, "grad_norm": 0.23411388895804314, "learning_rate": 3.991875933563014e-05, "loss": 0.5983, "step": 189 }, { "epoch": 0.5979543666404405, "grad_norm": 0.31989694156952164, "learning_rate": 3.991700418256936e-05, "loss": 0.6045, "step": 190 }, { "epoch": 0.6011014948859166, "grad_norm": 0.38404257854635715, "learning_rate": 3.991523031590711e-05, "loss": 0.6063, "step": 191 }, { "epoch": 0.6042486231313926, "grad_norm": 0.33761081359143924, "learning_rate": 3.9913437737496135e-05, "loss": 0.5951, "step": 192 }, { "epoch": 0.6073957513768686, "grad_norm": 0.2381342715991919, "learning_rate": 3.9911626449208694e-05, "loss": 0.601, "step": 193 }, { "epoch": 0.6105428796223447, "grad_norm": 0.31880643686538623, "learning_rate": 3.9909796452936616e-05, "loss": 0.6009, "step": 194 }, { "epoch": 0.6136900078678206, "grad_norm": 0.3563025725018504, "learning_rate": 3.990794775059126e-05, "loss": 0.6009, "step": 195 }, { "epoch": 0.6168371361132966, "grad_norm": 0.3033415317564058, "learning_rate": 3.9906080344103516e-05, "loss": 0.5992, "step": 196 }, { "epoch": 0.6199842643587726, "grad_norm": 0.2775053050931378, "learning_rate": 3.990419423542383e-05, "loss": 0.5987, "step": 197 }, { "epoch": 0.6231313926042487, "grad_norm": 0.2614901374015711, "learning_rate": 3.990228942652215e-05, "loss": 0.5918, "step": 198 }, { "epoch": 0.6262785208497246, "grad_norm": 0.2977635557001149, "learning_rate": 3.9900365919387985e-05, "loss": 0.6046, "step": 199 }, { "epoch": 0.6294256490952006, "grad_norm": 0.30438529335477493, "learning_rate": 3.9898423716030364e-05, "loss": 0.5966, "step": 200 }, { "epoch": 0.6325727773406766, "grad_norm": 0.28279872927198246, "learning_rate": 3.989646281847783e-05, "loss": 0.5943, "step": 201 }, { "epoch": 0.6357199055861527, "grad_norm": 0.25795220306495825, "learning_rate": 3.989448322877848e-05, "loss": 0.5989, "step": 202 }, { "epoch": 0.6388670338316287, "grad_norm": 0.280857506484411, "learning_rate": 3.98924849489999e-05, "loss": 0.595, "step": 203 }, { "epoch": 0.6420141620771046, "grad_norm": 0.28147222245655734, "learning_rate": 3.989046798122922e-05, "loss": 0.5968, "step": 204 }, { "epoch": 0.6451612903225806, "grad_norm": 0.22772337446133548, "learning_rate": 3.988843232757308e-05, "loss": 0.5895, "step": 205 }, { "epoch": 0.6483084185680567, "grad_norm": 0.24725422874191516, "learning_rate": 3.9886377990157645e-05, "loss": 0.5915, "step": 206 }, { "epoch": 0.6514555468135327, "grad_norm": 0.2478883381265632, "learning_rate": 3.988430497112859e-05, "loss": 0.5946, "step": 207 }, { "epoch": 0.6546026750590087, "grad_norm": 0.20803618305397573, "learning_rate": 3.988221327265111e-05, "loss": 0.6081, "step": 208 }, { "epoch": 0.6577498033044846, "grad_norm": 0.25475933367836384, "learning_rate": 3.988010289690987e-05, "loss": 0.6017, "step": 209 }, { "epoch": 0.6608969315499607, "grad_norm": 0.24824235295137567, "learning_rate": 3.987797384610911e-05, "loss": 0.6028, "step": 210 }, { "epoch": 0.6640440597954367, "grad_norm": 0.23024676822564225, "learning_rate": 3.9875826122472514e-05, "loss": 0.5947, "step": 211 }, { "epoch": 0.6671911880409127, "grad_norm": 0.27973892618861823, "learning_rate": 3.987365972824331e-05, "loss": 0.5977, "step": 212 }, { "epoch": 0.6703383162863886, "grad_norm": 0.21516896519325748, "learning_rate": 3.98714746656842e-05, "loss": 0.601, "step": 213 }, { "epoch": 0.6734854445318647, "grad_norm": 0.20803886239420252, "learning_rate": 3.98692709370774e-05, "loss": 0.5969, "step": 214 }, { "epoch": 0.6766325727773407, "grad_norm": 0.238940879654807, "learning_rate": 3.986704854472462e-05, "loss": 0.5985, "step": 215 }, { "epoch": 0.6797797010228167, "grad_norm": 0.24651655899332123, "learning_rate": 3.9864807490947056e-05, "loss": 0.5984, "step": 216 }, { "epoch": 0.6829268292682927, "grad_norm": 0.26084076415464624, "learning_rate": 3.98625477780854e-05, "loss": 0.5932, "step": 217 }, { "epoch": 0.6860739575137687, "grad_norm": 0.2386108054460097, "learning_rate": 3.9860269408499844e-05, "loss": 0.5842, "step": 218 }, { "epoch": 0.6892210857592447, "grad_norm": 0.2844335881193996, "learning_rate": 3.9857972384570035e-05, "loss": 0.595, "step": 219 }, { "epoch": 0.6923682140047207, "grad_norm": 0.2700792716329994, "learning_rate": 3.985565670869513e-05, "loss": 0.5965, "step": 220 }, { "epoch": 0.6955153422501967, "grad_norm": 0.25960095481333906, "learning_rate": 3.985332238329378e-05, "loss": 0.5916, "step": 221 }, { "epoch": 0.6986624704956726, "grad_norm": 0.28288643057296725, "learning_rate": 3.9850969410804065e-05, "loss": 0.5995, "step": 222 }, { "epoch": 0.7018095987411487, "grad_norm": 0.24327917475329708, "learning_rate": 3.98485977936836e-05, "loss": 0.5959, "step": 223 }, { "epoch": 0.7049567269866247, "grad_norm": 0.2721668752895481, "learning_rate": 3.984620753440943e-05, "loss": 0.5994, "step": 224 }, { "epoch": 0.7081038552321007, "grad_norm": 0.2607076361644052, "learning_rate": 3.984379863547808e-05, "loss": 0.5943, "step": 225 }, { "epoch": 0.7112509834775768, "grad_norm": 0.32932315230284676, "learning_rate": 3.984137109940556e-05, "loss": 0.5918, "step": 226 }, { "epoch": 0.7143981117230527, "grad_norm": 0.2696383359131296, "learning_rate": 3.983892492872733e-05, "loss": 0.5906, "step": 227 }, { "epoch": 0.7175452399685287, "grad_norm": 0.3585597053241887, "learning_rate": 3.9836460125998334e-05, "loss": 0.5948, "step": 228 }, { "epoch": 0.7206923682140047, "grad_norm": 0.2909815414499778, "learning_rate": 3.9833976693792937e-05, "loss": 0.5967, "step": 229 }, { "epoch": 0.7238394964594808, "grad_norm": 0.26425876255452163, "learning_rate": 3.9831474634705005e-05, "loss": 0.5935, "step": 230 }, { "epoch": 0.7269866247049567, "grad_norm": 0.2425151164990489, "learning_rate": 3.982895395134782e-05, "loss": 0.589, "step": 231 }, { "epoch": 0.7301337529504327, "grad_norm": 0.2616193102994722, "learning_rate": 3.982641464635416e-05, "loss": 0.6018, "step": 232 }, { "epoch": 0.7332808811959087, "grad_norm": 0.24405998282776664, "learning_rate": 3.982385672237621e-05, "loss": 0.5784, "step": 233 }, { "epoch": 0.7364280094413848, "grad_norm": 0.21832468260666305, "learning_rate": 3.9821280182085625e-05, "loss": 0.6015, "step": 234 }, { "epoch": 0.7395751376868608, "grad_norm": 0.25650060654854345, "learning_rate": 3.98186850281735e-05, "loss": 0.5913, "step": 235 }, { "epoch": 0.7427222659323367, "grad_norm": 0.27580046393197283, "learning_rate": 3.981607126335038e-05, "loss": 0.5895, "step": 236 }, { "epoch": 0.7458693941778127, "grad_norm": 0.2565257806459118, "learning_rate": 3.981343889034622e-05, "loss": 0.5919, "step": 237 }, { "epoch": 0.7490165224232888, "grad_norm": 0.28129400118590575, "learning_rate": 3.981078791191044e-05, "loss": 0.5824, "step": 238 }, { "epoch": 0.7521636506687648, "grad_norm": 0.27891204249277274, "learning_rate": 3.980811833081189e-05, "loss": 0.592, "step": 239 }, { "epoch": 0.7553107789142408, "grad_norm": 0.23957189182523364, "learning_rate": 3.9805430149838826e-05, "loss": 0.5923, "step": 240 }, { "epoch": 0.7584579071597167, "grad_norm": 0.267527485388114, "learning_rate": 3.980272337179895e-05, "loss": 0.5915, "step": 241 }, { "epoch": 0.7616050354051928, "grad_norm": 0.2893661929586173, "learning_rate": 3.97999979995194e-05, "loss": 0.5911, "step": 242 }, { "epoch": 0.7647521636506688, "grad_norm": 0.28637666161476577, "learning_rate": 3.97972540358467e-05, "loss": 0.5752, "step": 243 }, { "epoch": 0.7678992918961448, "grad_norm": 0.22704240396877157, "learning_rate": 3.979449148364682e-05, "loss": 0.5755, "step": 244 }, { "epoch": 0.7710464201416207, "grad_norm": 0.25773571819059654, "learning_rate": 3.979171034580514e-05, "loss": 0.5983, "step": 245 }, { "epoch": 0.7741935483870968, "grad_norm": 0.3238069584967583, "learning_rate": 3.9788910625226435e-05, "loss": 0.5841, "step": 246 }, { "epoch": 0.7773406766325728, "grad_norm": 0.3222043091055347, "learning_rate": 3.978609232483491e-05, "loss": 0.59, "step": 247 }, { "epoch": 0.7804878048780488, "grad_norm": 0.23894790835594737, "learning_rate": 3.978325544757419e-05, "loss": 0.5855, "step": 248 }, { "epoch": 0.7836349331235248, "grad_norm": 0.3065610131625477, "learning_rate": 3.9780399996407235e-05, "loss": 0.5872, "step": 249 }, { "epoch": 0.7867820613690008, "grad_norm": 0.41439567088342205, "learning_rate": 3.977752597431649e-05, "loss": 0.5922, "step": 250 }, { "epoch": 0.7899291896144768, "grad_norm": 0.33164301867172113, "learning_rate": 3.977463338430375e-05, "loss": 0.5966, "step": 251 }, { "epoch": 0.7930763178599528, "grad_norm": 0.338521625558078, "learning_rate": 3.977172222939019e-05, "loss": 0.5907, "step": 252 }, { "epoch": 0.7962234461054288, "grad_norm": 0.4350142509786572, "learning_rate": 3.976879251261641e-05, "loss": 0.5841, "step": 253 }, { "epoch": 0.7993705743509048, "grad_norm": 0.36993049984822834, "learning_rate": 3.9765844237042385e-05, "loss": 0.5864, "step": 254 }, { "epoch": 0.8025177025963808, "grad_norm": 0.313011441001989, "learning_rate": 3.976287740574748e-05, "loss": 0.5955, "step": 255 }, { "epoch": 0.8056648308418568, "grad_norm": 0.3540292456063131, "learning_rate": 3.975989202183041e-05, "loss": 0.5957, "step": 256 }, { "epoch": 0.8088119590873328, "grad_norm": 0.33646403434651423, "learning_rate": 3.9756888088409314e-05, "loss": 0.5847, "step": 257 }, { "epoch": 0.8119590873328089, "grad_norm": 0.26982248963868355, "learning_rate": 3.975386560862166e-05, "loss": 0.5885, "step": 258 }, { "epoch": 0.8151062155782848, "grad_norm": 0.29403731858846366, "learning_rate": 3.975082458562433e-05, "loss": 0.5897, "step": 259 }, { "epoch": 0.8182533438237608, "grad_norm": 0.4082653715355324, "learning_rate": 3.974776502259354e-05, "loss": 0.5791, "step": 260 }, { "epoch": 0.8214004720692368, "grad_norm": 0.3510739201797744, "learning_rate": 3.9744686922724876e-05, "loss": 0.593, "step": 261 }, { "epoch": 0.8245476003147129, "grad_norm": 0.3513487612479217, "learning_rate": 3.97415902892333e-05, "loss": 0.5836, "step": 262 }, { "epoch": 0.8276947285601888, "grad_norm": 0.28586935653924184, "learning_rate": 3.973847512535313e-05, "loss": 0.5826, "step": 263 }, { "epoch": 0.8308418568056648, "grad_norm": 0.2318470192789233, "learning_rate": 3.973534143433802e-05, "loss": 0.5814, "step": 264 }, { "epoch": 0.8339889850511408, "grad_norm": 0.29154683610806104, "learning_rate": 3.9732189219460994e-05, "loss": 0.5797, "step": 265 }, { "epoch": 0.8371361132966169, "grad_norm": 0.31587273712728664, "learning_rate": 3.972901848401441e-05, "loss": 0.5831, "step": 266 }, { "epoch": 0.8402832415420929, "grad_norm": 0.23591831976720817, "learning_rate": 3.972582923130998e-05, "loss": 0.5737, "step": 267 }, { "epoch": 0.8434303697875688, "grad_norm": 0.25691795262588335, "learning_rate": 3.972262146467874e-05, "loss": 0.5786, "step": 268 }, { "epoch": 0.8465774980330448, "grad_norm": 0.32432381849212155, "learning_rate": 3.971939518747109e-05, "loss": 0.593, "step": 269 }, { "epoch": 0.8497246262785209, "grad_norm": 0.28695101570482007, "learning_rate": 3.9716150403056746e-05, "loss": 0.5796, "step": 270 }, { "epoch": 0.8528717545239969, "grad_norm": 0.2724926989579401, "learning_rate": 3.971288711482476e-05, "loss": 0.5741, "step": 271 }, { "epoch": 0.8560188827694729, "grad_norm": 0.2806703214608174, "learning_rate": 3.970960532618349e-05, "loss": 0.5836, "step": 272 }, { "epoch": 0.8591660110149488, "grad_norm": 0.24677303795151184, "learning_rate": 3.9706305040560644e-05, "loss": 0.5818, "step": 273 }, { "epoch": 0.8623131392604249, "grad_norm": 0.2771238866050482, "learning_rate": 3.9702986261403255e-05, "loss": 0.5781, "step": 274 }, { "epoch": 0.8654602675059009, "grad_norm": 0.2924337657823486, "learning_rate": 3.9699648992177626e-05, "loss": 0.5756, "step": 275 }, { "epoch": 0.8686073957513769, "grad_norm": 0.25885016469830363, "learning_rate": 3.969629323636944e-05, "loss": 0.5844, "step": 276 }, { "epoch": 0.8717545239968528, "grad_norm": 0.23987091757624832, "learning_rate": 3.9692918997483614e-05, "loss": 0.5733, "step": 277 }, { "epoch": 0.8749016522423289, "grad_norm": 0.3337090298700013, "learning_rate": 3.968952627904443e-05, "loss": 0.571, "step": 278 }, { "epoch": 0.8780487804878049, "grad_norm": 0.29538904985776865, "learning_rate": 3.9686115084595444e-05, "loss": 0.5801, "step": 279 }, { "epoch": 0.8811959087332809, "grad_norm": 0.2312585310388234, "learning_rate": 3.968268541769951e-05, "loss": 0.5835, "step": 280 }, { "epoch": 0.8843430369787569, "grad_norm": 0.24512094230884698, "learning_rate": 3.967923728193878e-05, "loss": 0.5854, "step": 281 }, { "epoch": 0.8874901652242329, "grad_norm": 0.2903110478982194, "learning_rate": 3.96757706809147e-05, "loss": 0.5837, "step": 282 }, { "epoch": 0.8906372934697089, "grad_norm": 0.34977008813511384, "learning_rate": 3.967228561824798e-05, "loss": 0.5767, "step": 283 }, { "epoch": 0.8937844217151849, "grad_norm": 0.25337253280718625, "learning_rate": 3.9668782097578656e-05, "loss": 0.5839, "step": 284 }, { "epoch": 0.8969315499606609, "grad_norm": 0.21636074579356032, "learning_rate": 3.9665260122566e-05, "loss": 0.5804, "step": 285 }, { "epoch": 0.9000786782061369, "grad_norm": 0.33233711739245153, "learning_rate": 3.966171969688858e-05, "loss": 0.5737, "step": 286 }, { "epoch": 0.9032258064516129, "grad_norm": 0.2680586022952418, "learning_rate": 3.965816082424423e-05, "loss": 0.5729, "step": 287 }, { "epoch": 0.9063729346970889, "grad_norm": 0.24065778232025806, "learning_rate": 3.965458350835005e-05, "loss": 0.5786, "step": 288 }, { "epoch": 0.9095200629425649, "grad_norm": 0.24359799626407486, "learning_rate": 3.965098775294241e-05, "loss": 0.5743, "step": 289 }, { "epoch": 0.912667191188041, "grad_norm": 0.21669535169553875, "learning_rate": 3.964737356177692e-05, "loss": 0.5798, "step": 290 }, { "epoch": 0.9158143194335169, "grad_norm": 0.24527385896938736, "learning_rate": 3.9643740938628485e-05, "loss": 0.5771, "step": 291 }, { "epoch": 0.9189614476789929, "grad_norm": 0.24211076465986903, "learning_rate": 3.964008988729121e-05, "loss": 0.5733, "step": 292 }, { "epoch": 0.9221085759244689, "grad_norm": 0.2589502093646541, "learning_rate": 3.9636420411578486e-05, "loss": 0.5755, "step": 293 }, { "epoch": 0.925255704169945, "grad_norm": 0.25126849259271594, "learning_rate": 3.963273251532294e-05, "loss": 0.5866, "step": 294 }, { "epoch": 0.9284028324154209, "grad_norm": 0.28516274664724456, "learning_rate": 3.962902620237642e-05, "loss": 0.5803, "step": 295 }, { "epoch": 0.9315499606608969, "grad_norm": 0.23946824983891027, "learning_rate": 3.9625301476610035e-05, "loss": 0.588, "step": 296 }, { "epoch": 0.9346970889063729, "grad_norm": 0.25073581626718944, "learning_rate": 3.9621558341914104e-05, "loss": 0.5811, "step": 297 }, { "epoch": 0.937844217151849, "grad_norm": 0.22421588771080747, "learning_rate": 3.9617796802198193e-05, "loss": 0.5809, "step": 298 }, { "epoch": 0.940991345397325, "grad_norm": 0.28680785833388234, "learning_rate": 3.961401686139108e-05, "loss": 0.5762, "step": 299 }, { "epoch": 0.9441384736428009, "grad_norm": 0.2912561340919721, "learning_rate": 3.961021852344075e-05, "loss": 0.5685, "step": 300 }, { "epoch": 0.9472856018882769, "grad_norm": 0.25530982724652573, "learning_rate": 3.960640179231443e-05, "loss": 0.571, "step": 301 }, { "epoch": 0.950432730133753, "grad_norm": 0.23274023004495994, "learning_rate": 3.960256667199854e-05, "loss": 0.5766, "step": 302 }, { "epoch": 0.953579858379229, "grad_norm": 0.2526565624008317, "learning_rate": 3.959871316649872e-05, "loss": 0.5811, "step": 303 }, { "epoch": 0.956726986624705, "grad_norm": 0.2669242519610837, "learning_rate": 3.959484127983979e-05, "loss": 0.5786, "step": 304 }, { "epoch": 0.9598741148701809, "grad_norm": 0.2272805823487035, "learning_rate": 3.959095101606579e-05, "loss": 0.5804, "step": 305 }, { "epoch": 0.963021243115657, "grad_norm": 0.32599108271772875, "learning_rate": 3.958704237923994e-05, "loss": 0.565, "step": 306 }, { "epoch": 0.966168371361133, "grad_norm": 0.31736583791904754, "learning_rate": 3.958311537344467e-05, "loss": 0.5744, "step": 307 }, { "epoch": 0.969315499606609, "grad_norm": 0.25057991032117954, "learning_rate": 3.957917000278156e-05, "loss": 0.5744, "step": 308 }, { "epoch": 0.9724626278520849, "grad_norm": 0.3171592672476318, "learning_rate": 3.9575206271371416e-05, "loss": 0.5727, "step": 309 }, { "epoch": 0.975609756097561, "grad_norm": 0.2698770129342592, "learning_rate": 3.957122418335419e-05, "loss": 0.5684, "step": 310 }, { "epoch": 0.978756884343037, "grad_norm": 0.22946805293885594, "learning_rate": 3.956722374288902e-05, "loss": 0.5675, "step": 311 }, { "epoch": 0.981904012588513, "grad_norm": 0.3173852201061959, "learning_rate": 3.9563204954154194e-05, "loss": 0.5691, "step": 312 }, { "epoch": 0.985051140833989, "grad_norm": 0.3705950113218817, "learning_rate": 3.955916782134719e-05, "loss": 0.5841, "step": 313 }, { "epoch": 0.988198269079465, "grad_norm": 0.24571947086560714, "learning_rate": 3.9555112348684626e-05, "loss": 0.5763, "step": 314 }, { "epoch": 0.991345397324941, "grad_norm": 0.3095154791392979, "learning_rate": 3.955103854040228e-05, "loss": 0.5672, "step": 315 }, { "epoch": 0.994492525570417, "grad_norm": 0.3430882274581819, "learning_rate": 3.9546946400755104e-05, "loss": 0.5623, "step": 316 }, { "epoch": 0.997639653815893, "grad_norm": 0.21959520260949345, "learning_rate": 3.954283593401715e-05, "loss": 0.5667, "step": 317 }, { "epoch": 1.003147128245476, "grad_norm": 0.6182457687415346, "learning_rate": 3.9538707144481656e-05, "loss": 1.1053, "step": 318 }, { "epoch": 1.006294256490952, "grad_norm": 0.8091197174119846, "learning_rate": 3.953456003646097e-05, "loss": 0.5479, "step": 319 }, { "epoch": 1.009441384736428, "grad_norm": 0.5734582312017469, "learning_rate": 3.953039461428659e-05, "loss": 0.5401, "step": 320 }, { "epoch": 1.012588512981904, "grad_norm": 0.5553450254239277, "learning_rate": 3.952621088230912e-05, "loss": 0.5444, "step": 321 }, { "epoch": 1.01573564122738, "grad_norm": 0.715209019372237, "learning_rate": 3.9522008844898316e-05, "loss": 0.5482, "step": 322 }, { "epoch": 1.018882769472856, "grad_norm": 0.3194143113644003, "learning_rate": 3.9517788506443036e-05, "loss": 0.5486, "step": 323 }, { "epoch": 1.022029897718332, "grad_norm": 0.5794083060853625, "learning_rate": 3.9513549871351244e-05, "loss": 0.5387, "step": 324 }, { "epoch": 1.025177025963808, "grad_norm": 0.4549904471790254, "learning_rate": 3.950929294405005e-05, "loss": 0.5417, "step": 325 }, { "epoch": 1.028324154209284, "grad_norm": 0.39940752965584325, "learning_rate": 3.950501772898563e-05, "loss": 0.5383, "step": 326 }, { "epoch": 1.03147128245476, "grad_norm": 0.37376657983185657, "learning_rate": 3.9500724230623285e-05, "loss": 0.5438, "step": 327 }, { "epoch": 1.034618410700236, "grad_norm": 0.3249939876794576, "learning_rate": 3.9496412453447396e-05, "loss": 0.5423, "step": 328 }, { "epoch": 1.037765538945712, "grad_norm": 0.362363957521138, "learning_rate": 3.949208240196145e-05, "loss": 0.5341, "step": 329 }, { "epoch": 1.0409126671911881, "grad_norm": 0.3241126445768653, "learning_rate": 3.948773408068801e-05, "loss": 0.5377, "step": 330 }, { "epoch": 1.044059795436664, "grad_norm": 0.3368256917676624, "learning_rate": 3.948336749416873e-05, "loss": 0.55, "step": 331 }, { "epoch": 1.04720692368214, "grad_norm": 0.32581092992360716, "learning_rate": 3.947898264696433e-05, "loss": 0.5427, "step": 332 }, { "epoch": 1.050354051927616, "grad_norm": 0.31016030261198896, "learning_rate": 3.947457954365461e-05, "loss": 0.5355, "step": 333 }, { "epoch": 1.053501180173092, "grad_norm": 0.3133942156383092, "learning_rate": 3.947015818883845e-05, "loss": 0.5337, "step": 334 }, { "epoch": 1.056648308418568, "grad_norm": 0.3012762044302009, "learning_rate": 3.946571858713376e-05, "loss": 0.5451, "step": 335 }, { "epoch": 1.059795436664044, "grad_norm": 0.32025563813718905, "learning_rate": 3.946126074317755e-05, "loss": 0.5263, "step": 336 }, { "epoch": 1.06294256490952, "grad_norm": 0.31757922834182667, "learning_rate": 3.9456784661625845e-05, "loss": 0.5407, "step": 337 }, { "epoch": 1.0660896931549961, "grad_norm": 0.2724133341234047, "learning_rate": 3.945229034715374e-05, "loss": 0.5451, "step": 338 }, { "epoch": 1.0692368214004722, "grad_norm": 0.31611533863952757, "learning_rate": 3.944777780445537e-05, "loss": 0.5314, "step": 339 }, { "epoch": 1.072383949645948, "grad_norm": 0.264029405971174, "learning_rate": 3.94432470382439e-05, "loss": 0.5395, "step": 340 }, { "epoch": 1.075531077891424, "grad_norm": 0.2403700990983929, "learning_rate": 3.9438698053251545e-05, "loss": 0.5454, "step": 341 }, { "epoch": 1.0786782061369, "grad_norm": 0.26014062877023636, "learning_rate": 3.943413085422954e-05, "loss": 0.5303, "step": 342 }, { "epoch": 1.081825334382376, "grad_norm": 0.24331842687849348, "learning_rate": 3.942954544594814e-05, "loss": 0.5393, "step": 343 }, { "epoch": 1.084972462627852, "grad_norm": 0.2246092745659522, "learning_rate": 3.942494183319662e-05, "loss": 0.5388, "step": 344 }, { "epoch": 1.088119590873328, "grad_norm": 0.26647954408169594, "learning_rate": 3.942032002078326e-05, "loss": 0.5412, "step": 345 }, { "epoch": 1.0912667191188041, "grad_norm": 0.22770410942233646, "learning_rate": 3.941568001353539e-05, "loss": 0.5319, "step": 346 }, { "epoch": 1.0944138473642802, "grad_norm": 0.2554265454591065, "learning_rate": 3.94110218162993e-05, "loss": 0.5356, "step": 347 }, { "epoch": 1.0975609756097562, "grad_norm": 0.29466263774038076, "learning_rate": 3.9406345433940284e-05, "loss": 0.5375, "step": 348 }, { "epoch": 1.100708103855232, "grad_norm": 0.25834878315712795, "learning_rate": 3.940165087134264e-05, "loss": 0.5379, "step": 349 }, { "epoch": 1.103855232100708, "grad_norm": 0.25134217558928273, "learning_rate": 3.939693813340966e-05, "loss": 0.5249, "step": 350 }, { "epoch": 1.107002360346184, "grad_norm": 0.29107674814907275, "learning_rate": 3.939220722506361e-05, "loss": 0.5397, "step": 351 }, { "epoch": 1.11014948859166, "grad_norm": 0.22038795971485264, "learning_rate": 3.938745815124574e-05, "loss": 0.5178, "step": 352 }, { "epoch": 1.113296616837136, "grad_norm": 0.20162768321484986, "learning_rate": 3.938269091691626e-05, "loss": 0.5424, "step": 353 }, { "epoch": 1.1164437450826121, "grad_norm": 0.22798636250647, "learning_rate": 3.937790552705437e-05, "loss": 0.5401, "step": 354 }, { "epoch": 1.1195908733280882, "grad_norm": 0.24904854499387244, "learning_rate": 3.9373101986658204e-05, "loss": 0.5405, "step": 355 }, { "epoch": 1.1227380015735642, "grad_norm": 0.21942469794432654, "learning_rate": 3.936828030074488e-05, "loss": 0.5375, "step": 356 }, { "epoch": 1.1258851298190402, "grad_norm": 0.2072161315607419, "learning_rate": 3.936344047435046e-05, "loss": 0.5324, "step": 357 }, { "epoch": 1.129032258064516, "grad_norm": 0.22290097704007697, "learning_rate": 3.935858251252994e-05, "loss": 0.5408, "step": 358 }, { "epoch": 1.132179386309992, "grad_norm": 0.2194104787972607, "learning_rate": 3.935370642035729e-05, "loss": 0.5295, "step": 359 }, { "epoch": 1.135326514555468, "grad_norm": 0.22998068727153642, "learning_rate": 3.9348812202925375e-05, "loss": 0.5299, "step": 360 }, { "epoch": 1.138473642800944, "grad_norm": 0.20288897468564898, "learning_rate": 3.9343899865346015e-05, "loss": 0.5181, "step": 361 }, { "epoch": 1.1416207710464201, "grad_norm": 0.2533470728298271, "learning_rate": 3.933896941274996e-05, "loss": 0.5403, "step": 362 }, { "epoch": 1.1447678992918962, "grad_norm": 0.24735853692572296, "learning_rate": 3.933402085028687e-05, "loss": 0.5275, "step": 363 }, { "epoch": 1.1479150275373722, "grad_norm": 0.21213304673435288, "learning_rate": 3.932905418312531e-05, "loss": 0.5299, "step": 364 }, { "epoch": 1.1510621557828482, "grad_norm": 0.2570391773369148, "learning_rate": 3.932406941645278e-05, "loss": 0.5346, "step": 365 }, { "epoch": 1.1542092840283242, "grad_norm": 0.20961822348437115, "learning_rate": 3.931906655547568e-05, "loss": 0.5329, "step": 366 }, { "epoch": 1.1573564122738, "grad_norm": 0.19241742932101327, "learning_rate": 3.9314045605419286e-05, "loss": 0.5161, "step": 367 }, { "epoch": 1.160503540519276, "grad_norm": 0.1981209025761205, "learning_rate": 3.930900657152777e-05, "loss": 0.5285, "step": 368 }, { "epoch": 1.163650668764752, "grad_norm": 0.20579170791942883, "learning_rate": 3.930394945906423e-05, "loss": 0.5337, "step": 369 }, { "epoch": 1.1667977970102281, "grad_norm": 0.23447783712452946, "learning_rate": 3.929887427331061e-05, "loss": 0.5319, "step": 370 }, { "epoch": 1.1699449252557041, "grad_norm": 0.23240141396634467, "learning_rate": 3.9293781019567736e-05, "loss": 0.5216, "step": 371 }, { "epoch": 1.1730920535011802, "grad_norm": 0.20717972878459204, "learning_rate": 3.9288669703155305e-05, "loss": 0.5285, "step": 372 }, { "epoch": 1.1762391817466562, "grad_norm": 0.24921261028100655, "learning_rate": 3.92835403294119e-05, "loss": 0.5307, "step": 373 }, { "epoch": 1.1793863099921322, "grad_norm": 0.2541092436236979, "learning_rate": 3.927839290369494e-05, "loss": 0.529, "step": 374 }, { "epoch": 1.1825334382376083, "grad_norm": 0.20270164744890468, "learning_rate": 3.927322743138071e-05, "loss": 0.526, "step": 375 }, { "epoch": 1.185680566483084, "grad_norm": 0.2399584297754137, "learning_rate": 3.926804391786433e-05, "loss": 0.5365, "step": 376 }, { "epoch": 1.1888276947285603, "grad_norm": 0.24063071154269497, "learning_rate": 3.926284236855979e-05, "loss": 0.5359, "step": 377 }, { "epoch": 1.1919748229740361, "grad_norm": 0.2147952601728759, "learning_rate": 3.92576227888999e-05, "loss": 0.5369, "step": 378 }, { "epoch": 1.1951219512195121, "grad_norm": 0.23183738138121565, "learning_rate": 3.925238518433629e-05, "loss": 0.5406, "step": 379 }, { "epoch": 1.1982690794649882, "grad_norm": 0.24958601341879394, "learning_rate": 3.924712956033945e-05, "loss": 0.5257, "step": 380 }, { "epoch": 1.2014162077104642, "grad_norm": 0.2107193249163421, "learning_rate": 3.9241855922398664e-05, "loss": 0.5265, "step": 381 }, { "epoch": 1.2045633359559402, "grad_norm": 0.21211894634268036, "learning_rate": 3.923656427602203e-05, "loss": 0.5209, "step": 382 }, { "epoch": 1.2077104642014163, "grad_norm": 0.19007716139862285, "learning_rate": 3.9231254626736475e-05, "loss": 0.5248, "step": 383 }, { "epoch": 1.2108575924468923, "grad_norm": 0.2106932996483277, "learning_rate": 3.922592698008771e-05, "loss": 0.5255, "step": 384 }, { "epoch": 1.2140047206923683, "grad_norm": 0.20944881028089596, "learning_rate": 3.922058134164025e-05, "loss": 0.5383, "step": 385 }, { "epoch": 1.2171518489378443, "grad_norm": 0.19091968823309646, "learning_rate": 3.9215217716977405e-05, "loss": 0.5321, "step": 386 }, { "epoch": 1.2202989771833201, "grad_norm": 0.24949906631514807, "learning_rate": 3.9209836111701274e-05, "loss": 0.5337, "step": 387 }, { "epoch": 1.2234461054287962, "grad_norm": 0.24811252472670564, "learning_rate": 3.9204436531432725e-05, "loss": 0.5305, "step": 388 }, { "epoch": 1.2265932336742722, "grad_norm": 0.21473975761364125, "learning_rate": 3.9199018981811405e-05, "loss": 0.5203, "step": 389 }, { "epoch": 1.2297403619197482, "grad_norm": 0.21533576750411015, "learning_rate": 3.919358346849573e-05, "loss": 0.5433, "step": 390 }, { "epoch": 1.2328874901652243, "grad_norm": 0.21369216327652596, "learning_rate": 3.918812999716288e-05, "loss": 0.5305, "step": 391 }, { "epoch": 1.2360346184107003, "grad_norm": 0.21462424108637662, "learning_rate": 3.918265857350879e-05, "loss": 0.5359, "step": 392 }, { "epoch": 1.2391817466561763, "grad_norm": 0.24722325499788872, "learning_rate": 3.917716920324815e-05, "loss": 0.528, "step": 393 }, { "epoch": 1.2423288749016523, "grad_norm": 0.2011641071003589, "learning_rate": 3.917166189211438e-05, "loss": 0.5314, "step": 394 }, { "epoch": 1.2454760031471284, "grad_norm": 0.2129227867593859, "learning_rate": 3.916613664585966e-05, "loss": 0.536, "step": 395 }, { "epoch": 1.2486231313926042, "grad_norm": 0.22745233172578577, "learning_rate": 3.9160593470254884e-05, "loss": 0.5313, "step": 396 }, { "epoch": 1.2517702596380802, "grad_norm": 0.21406731944348806, "learning_rate": 3.915503237108967e-05, "loss": 0.5225, "step": 397 }, { "epoch": 1.2549173878835562, "grad_norm": 0.23441772239476547, "learning_rate": 3.9149453354172387e-05, "loss": 0.5288, "step": 398 }, { "epoch": 1.2580645161290323, "grad_norm": 0.23499009277537974, "learning_rate": 3.914385642533008e-05, "loss": 0.5334, "step": 399 }, { "epoch": 1.2612116443745083, "grad_norm": 0.19728137418451414, "learning_rate": 3.913824159040853e-05, "loss": 0.529, "step": 400 }, { "epoch": 1.2643587726199843, "grad_norm": 0.24085359726093897, "learning_rate": 3.913260885527221e-05, "loss": 0.5276, "step": 401 }, { "epoch": 1.2675059008654603, "grad_norm": 0.25321890170169037, "learning_rate": 3.912695822580428e-05, "loss": 0.5317, "step": 402 }, { "epoch": 1.2706530291109361, "grad_norm": 0.21705384607485445, "learning_rate": 3.912128970790659e-05, "loss": 0.5232, "step": 403 }, { "epoch": 1.2738001573564124, "grad_norm": 0.20299880408423301, "learning_rate": 3.911560330749971e-05, "loss": 0.5366, "step": 404 }, { "epoch": 1.2769472856018882, "grad_norm": 0.22689509092035035, "learning_rate": 3.9109899030522846e-05, "loss": 0.5304, "step": 405 }, { "epoch": 1.2800944138473642, "grad_norm": 0.20734100246332385, "learning_rate": 3.910417688293389e-05, "loss": 0.5308, "step": 406 }, { "epoch": 1.2832415420928402, "grad_norm": 0.2079086103791129, "learning_rate": 3.909843687070939e-05, "loss": 0.533, "step": 407 }, { "epoch": 1.2863886703383163, "grad_norm": 0.2383801744359338, "learning_rate": 3.9092678999844575e-05, "loss": 0.5261, "step": 408 }, { "epoch": 1.2895357985837923, "grad_norm": 0.26103176241188575, "learning_rate": 3.90869032763533e-05, "loss": 0.525, "step": 409 }, { "epoch": 1.2926829268292683, "grad_norm": 0.23498553551570836, "learning_rate": 3.90811097062681e-05, "loss": 0.5257, "step": 410 }, { "epoch": 1.2958300550747444, "grad_norm": 0.22833679759966044, "learning_rate": 3.90752982956401e-05, "loss": 0.5328, "step": 411 }, { "epoch": 1.2989771833202202, "grad_norm": 0.25544426763785694, "learning_rate": 3.906946905053912e-05, "loss": 0.5238, "step": 412 }, { "epoch": 1.3021243115656964, "grad_norm": 0.2672450987154701, "learning_rate": 3.906362197705355e-05, "loss": 0.5389, "step": 413 }, { "epoch": 1.3052714398111722, "grad_norm": 0.2619780531405743, "learning_rate": 3.905775708129045e-05, "loss": 0.5289, "step": 414 }, { "epoch": 1.3084185680566482, "grad_norm": 0.25026525055288934, "learning_rate": 3.905187436937545e-05, "loss": 0.5251, "step": 415 }, { "epoch": 1.3115656963021243, "grad_norm": 0.2615761605038083, "learning_rate": 3.904597384745282e-05, "loss": 0.5232, "step": 416 }, { "epoch": 1.3147128245476003, "grad_norm": 0.2209468332287666, "learning_rate": 3.904005552168541e-05, "loss": 0.5247, "step": 417 }, { "epoch": 1.3178599527930763, "grad_norm": 0.27687648105272317, "learning_rate": 3.9034119398254703e-05, "loss": 0.5394, "step": 418 }, { "epoch": 1.3210070810385524, "grad_norm": 0.264646195521817, "learning_rate": 3.902816548336072e-05, "loss": 0.5195, "step": 419 }, { "epoch": 1.3241542092840284, "grad_norm": 0.24319496232874022, "learning_rate": 3.90221937832221e-05, "loss": 0.5297, "step": 420 }, { "epoch": 1.3273013375295044, "grad_norm": 0.2682903371006935, "learning_rate": 3.901620430407605e-05, "loss": 0.5288, "step": 421 }, { "epoch": 1.3304484657749804, "grad_norm": 0.25696926111301793, "learning_rate": 3.9010197052178334e-05, "loss": 0.5321, "step": 422 }, { "epoch": 1.3335955940204562, "grad_norm": 0.21737804255664558, "learning_rate": 3.9004172033803294e-05, "loss": 0.527, "step": 423 }, { "epoch": 1.3367427222659323, "grad_norm": 0.25367064453625227, "learning_rate": 3.899812925524382e-05, "loss": 0.5294, "step": 424 }, { "epoch": 1.3398898505114083, "grad_norm": 0.23247403805521405, "learning_rate": 3.8992068722811366e-05, "loss": 0.5268, "step": 425 }, { "epoch": 1.3430369787568843, "grad_norm": 0.20969217116963346, "learning_rate": 3.89859904428359e-05, "loss": 0.525, "step": 426 }, { "epoch": 1.3461841070023604, "grad_norm": 0.21944665805836538, "learning_rate": 3.897989442166597e-05, "loss": 0.5303, "step": 427 }, { "epoch": 1.3493312352478364, "grad_norm": 0.20387874014114413, "learning_rate": 3.89737806656686e-05, "loss": 0.5294, "step": 428 }, { "epoch": 1.3524783634933124, "grad_norm": 0.23174913286284687, "learning_rate": 3.8967649181229384e-05, "loss": 0.5279, "step": 429 }, { "epoch": 1.3556254917387884, "grad_norm": 0.2435719017940357, "learning_rate": 3.896149997475241e-05, "loss": 0.5246, "step": 430 }, { "epoch": 1.3587726199842645, "grad_norm": 0.22427280573004413, "learning_rate": 3.895533305266029e-05, "loss": 0.5175, "step": 431 }, { "epoch": 1.3619197482297403, "grad_norm": 0.23620945890539627, "learning_rate": 3.894914842139411e-05, "loss": 0.5263, "step": 432 }, { "epoch": 1.3650668764752163, "grad_norm": 0.3013716827125622, "learning_rate": 3.894294608741349e-05, "loss": 0.5301, "step": 433 }, { "epoch": 1.3682140047206923, "grad_norm": 0.2147253729034818, "learning_rate": 3.893672605719651e-05, "loss": 0.5324, "step": 434 }, { "epoch": 1.3713611329661684, "grad_norm": 0.2818860617617866, "learning_rate": 3.893048833723976e-05, "loss": 0.5257, "step": 435 }, { "epoch": 1.3745082612116444, "grad_norm": 0.29077510474579804, "learning_rate": 3.892423293405828e-05, "loss": 0.5329, "step": 436 }, { "epoch": 1.3776553894571204, "grad_norm": 0.24939201266271596, "learning_rate": 3.891795985418559e-05, "loss": 0.5249, "step": 437 }, { "epoch": 1.3808025177025964, "grad_norm": 0.2715447424635029, "learning_rate": 3.891166910417368e-05, "loss": 0.5176, "step": 438 }, { "epoch": 1.3839496459480725, "grad_norm": 0.31564837515854355, "learning_rate": 3.890536069059299e-05, "loss": 0.5247, "step": 439 }, { "epoch": 1.3870967741935485, "grad_norm": 0.28321629275081395, "learning_rate": 3.88990346200324e-05, "loss": 0.5292, "step": 440 }, { "epoch": 1.3902439024390243, "grad_norm": 0.255553001892679, "learning_rate": 3.889269089909924e-05, "loss": 0.5274, "step": 441 }, { "epoch": 1.3933910306845003, "grad_norm": 0.22898176345383003, "learning_rate": 3.888632953441929e-05, "loss": 0.5278, "step": 442 }, { "epoch": 1.3965381589299763, "grad_norm": 0.25102215567626407, "learning_rate": 3.887995053263673e-05, "loss": 0.5263, "step": 443 }, { "epoch": 1.3996852871754524, "grad_norm": 0.20971240214227582, "learning_rate": 3.887355390041418e-05, "loss": 0.5226, "step": 444 }, { "epoch": 1.4028324154209284, "grad_norm": 0.20633909928885785, "learning_rate": 3.886713964443266e-05, "loss": 0.5289, "step": 445 }, { "epoch": 1.4059795436664044, "grad_norm": 0.2574749675063496, "learning_rate": 3.886070777139163e-05, "loss": 0.5312, "step": 446 }, { "epoch": 1.4091266719118805, "grad_norm": 0.261674083730878, "learning_rate": 3.88542582880089e-05, "loss": 0.536, "step": 447 }, { "epoch": 1.4122738001573565, "grad_norm": 0.23684316877305508, "learning_rate": 3.884779120102071e-05, "loss": 0.5305, "step": 448 }, { "epoch": 1.4154209284028325, "grad_norm": 0.2641611884928782, "learning_rate": 3.884130651718168e-05, "loss": 0.5242, "step": 449 }, { "epoch": 1.4185680566483083, "grad_norm": 0.26330433265613506, "learning_rate": 3.883480424326481e-05, "loss": 0.5336, "step": 450 }, { "epoch": 1.4217151848937843, "grad_norm": 0.24912300302600038, "learning_rate": 3.882828438606145e-05, "loss": 0.5349, "step": 451 }, { "epoch": 1.4248623131392604, "grad_norm": 0.20605940059141603, "learning_rate": 3.882174695238135e-05, "loss": 0.5287, "step": 452 }, { "epoch": 1.4280094413847364, "grad_norm": 0.2693403536580001, "learning_rate": 3.8815191949052586e-05, "loss": 0.5367, "step": 453 }, { "epoch": 1.4311565696302124, "grad_norm": 0.314210855668854, "learning_rate": 3.880861938292162e-05, "loss": 0.523, "step": 454 }, { "epoch": 1.4343036978756885, "grad_norm": 0.24090278288346206, "learning_rate": 3.880202926085321e-05, "loss": 0.5224, "step": 455 }, { "epoch": 1.4374508261211645, "grad_norm": 0.30398502103831454, "learning_rate": 3.87954215897305e-05, "loss": 0.5302, "step": 456 }, { "epoch": 1.4405979543666405, "grad_norm": 0.3216734104886104, "learning_rate": 3.8788796376454936e-05, "loss": 0.5274, "step": 457 }, { "epoch": 1.4437450826121165, "grad_norm": 0.22333114204077223, "learning_rate": 3.878215362794628e-05, "loss": 0.5239, "step": 458 }, { "epoch": 1.4468922108575923, "grad_norm": 0.23789190915199318, "learning_rate": 3.877549335114263e-05, "loss": 0.5209, "step": 459 }, { "epoch": 1.4500393391030684, "grad_norm": 0.1890051988727759, "learning_rate": 3.8768815553000376e-05, "loss": 0.5269, "step": 460 }, { "epoch": 1.4531864673485444, "grad_norm": 0.22963562375613875, "learning_rate": 3.8762120240494223e-05, "loss": 0.529, "step": 461 }, { "epoch": 1.4563335955940204, "grad_norm": 0.24014125362476896, "learning_rate": 3.875540742061715e-05, "loss": 0.5249, "step": 462 }, { "epoch": 1.4594807238394965, "grad_norm": 0.25487459533999646, "learning_rate": 3.874867710038044e-05, "loss": 0.5238, "step": 463 }, { "epoch": 1.4626278520849725, "grad_norm": 0.2329939674520019, "learning_rate": 3.874192928681364e-05, "loss": 0.5245, "step": 464 }, { "epoch": 1.4657749803304485, "grad_norm": 0.2375922581202063, "learning_rate": 3.873516398696457e-05, "loss": 0.5253, "step": 465 }, { "epoch": 1.4689221085759245, "grad_norm": 0.22120136053113018, "learning_rate": 3.8728381207899326e-05, "loss": 0.5322, "step": 466 }, { "epoch": 1.4720692368214006, "grad_norm": 0.2575168541935438, "learning_rate": 3.872158095670225e-05, "loss": 0.5163, "step": 467 }, { "epoch": 1.4752163650668764, "grad_norm": 0.24759034856123785, "learning_rate": 3.871476324047593e-05, "loss": 0.5219, "step": 468 }, { "epoch": 1.4783634933123526, "grad_norm": 0.2122545341024054, "learning_rate": 3.870792806634121e-05, "loss": 0.5219, "step": 469 }, { "epoch": 1.4815106215578284, "grad_norm": 0.21548255412844264, "learning_rate": 3.8701075441437156e-05, "loss": 0.5139, "step": 470 }, { "epoch": 1.4846577498033044, "grad_norm": 0.23209201852334332, "learning_rate": 3.8694205372921054e-05, "loss": 0.5255, "step": 471 }, { "epoch": 1.4878048780487805, "grad_norm": 0.22357478851651288, "learning_rate": 3.868731786796843e-05, "loss": 0.5173, "step": 472 }, { "epoch": 1.4909520062942565, "grad_norm": 0.23628716934333172, "learning_rate": 3.8680412933773007e-05, "loss": 0.5166, "step": 473 }, { "epoch": 1.4940991345397325, "grad_norm": 0.2840109021374993, "learning_rate": 3.867349057754671e-05, "loss": 0.5248, "step": 474 }, { "epoch": 1.4972462627852086, "grad_norm": 0.19780778754861236, "learning_rate": 3.8666550806519676e-05, "loss": 0.5309, "step": 475 }, { "epoch": 1.5003933910306846, "grad_norm": 0.23377357292240997, "learning_rate": 3.8659593627940204e-05, "loss": 0.5242, "step": 476 }, { "epoch": 1.5035405192761604, "grad_norm": 0.27389049583059905, "learning_rate": 3.8652619049074814e-05, "loss": 0.5326, "step": 477 }, { "epoch": 1.5066876475216366, "grad_norm": 0.2155931645603816, "learning_rate": 3.8645627077208166e-05, "loss": 0.5195, "step": 478 }, { "epoch": 1.5098347757671124, "grad_norm": 0.22156798104402176, "learning_rate": 3.8638617719643095e-05, "loss": 0.5171, "step": 479 }, { "epoch": 1.5129819040125885, "grad_norm": 0.23067806483053538, "learning_rate": 3.8631590983700606e-05, "loss": 0.5152, "step": 480 }, { "epoch": 1.5161290322580645, "grad_norm": 0.1921108745110929, "learning_rate": 3.8624546876719834e-05, "loss": 0.5283, "step": 481 }, { "epoch": 1.5192761605035405, "grad_norm": 0.21072974454084317, "learning_rate": 3.861748540605808e-05, "loss": 0.5171, "step": 482 }, { "epoch": 1.5224232887490166, "grad_norm": 0.21970389437346471, "learning_rate": 3.8610406579090766e-05, "loss": 0.5219, "step": 483 }, { "epoch": 1.5255704169944924, "grad_norm": 0.20711315573580885, "learning_rate": 3.860331040321145e-05, "loss": 0.5253, "step": 484 }, { "epoch": 1.5287175452399686, "grad_norm": 0.24879575847966118, "learning_rate": 3.8596196885831804e-05, "loss": 0.5302, "step": 485 }, { "epoch": 1.5318646734854444, "grad_norm": 0.28066766221968237, "learning_rate": 3.858906603438161e-05, "loss": 0.5372, "step": 486 }, { "epoch": 1.5350118017309207, "grad_norm": 0.23712162618272606, "learning_rate": 3.8581917856308775e-05, "loss": 0.53, "step": 487 }, { "epoch": 1.5381589299763965, "grad_norm": 0.24813061042956056, "learning_rate": 3.857475235907928e-05, "loss": 0.5204, "step": 488 }, { "epoch": 1.5413060582218725, "grad_norm": 0.20722206925566874, "learning_rate": 3.8567569550177195e-05, "loss": 0.5318, "step": 489 }, { "epoch": 1.5444531864673485, "grad_norm": 0.2282801127025407, "learning_rate": 3.856036943710469e-05, "loss": 0.5238, "step": 490 }, { "epoch": 1.5476003147128246, "grad_norm": 0.25422893533824964, "learning_rate": 3.8553152027382e-05, "loss": 0.5204, "step": 491 }, { "epoch": 1.5507474429583006, "grad_norm": 0.2259564938535116, "learning_rate": 3.854591732854741e-05, "loss": 0.5257, "step": 492 }, { "epoch": 1.5538945712037766, "grad_norm": 0.23136885066637908, "learning_rate": 3.853866534815728e-05, "loss": 0.5253, "step": 493 }, { "epoch": 1.5570416994492526, "grad_norm": 0.25141412317565787, "learning_rate": 3.853139609378603e-05, "loss": 0.5215, "step": 494 }, { "epoch": 1.5601888276947284, "grad_norm": 0.216386761256824, "learning_rate": 3.85241095730261e-05, "loss": 0.5175, "step": 495 }, { "epoch": 1.5633359559402047, "grad_norm": 0.24355134099428055, "learning_rate": 3.8516805793487974e-05, "loss": 0.519, "step": 496 }, { "epoch": 1.5664830841856805, "grad_norm": 0.19505711198842687, "learning_rate": 3.850948476280015e-05, "loss": 0.5327, "step": 497 }, { "epoch": 1.5696302124311565, "grad_norm": 0.2447172400294907, "learning_rate": 3.8502146488609164e-05, "loss": 0.5212, "step": 498 }, { "epoch": 1.5727773406766326, "grad_norm": 0.19962243706421512, "learning_rate": 3.8494790978579565e-05, "loss": 0.5142, "step": 499 }, { "epoch": 1.5759244689221086, "grad_norm": 0.2841517257055549, "learning_rate": 3.848741824039386e-05, "loss": 0.5178, "step": 500 }, { "epoch": 1.5790715971675846, "grad_norm": 0.20724334543587292, "learning_rate": 3.8480028281752615e-05, "loss": 0.5249, "step": 501 }, { "epoch": 1.5822187254130606, "grad_norm": 0.28838849111673964, "learning_rate": 3.8472621110374335e-05, "loss": 0.5173, "step": 502 }, { "epoch": 1.5853658536585367, "grad_norm": 0.24186826741838155, "learning_rate": 3.8465196733995514e-05, "loss": 0.5154, "step": 503 }, { "epoch": 1.5885129819040125, "grad_norm": 0.1983223561298523, "learning_rate": 3.8457755160370625e-05, "loss": 0.509, "step": 504 }, { "epoch": 1.5916601101494887, "grad_norm": 0.21649035102901398, "learning_rate": 3.8450296397272095e-05, "loss": 0.5321, "step": 505 }, { "epoch": 1.5948072383949645, "grad_norm": 0.25388593478131405, "learning_rate": 3.8442820452490305e-05, "loss": 0.5249, "step": 506 }, { "epoch": 1.5979543666404405, "grad_norm": 0.2642843241520677, "learning_rate": 3.843532733383358e-05, "loss": 0.5256, "step": 507 }, { "epoch": 1.6011014948859166, "grad_norm": 0.23572275621033986, "learning_rate": 3.8427817049128194e-05, "loss": 0.5216, "step": 508 }, { "epoch": 1.6042486231313926, "grad_norm": 0.2110591043789785, "learning_rate": 3.842028960621834e-05, "loss": 0.5149, "step": 509 }, { "epoch": 1.6073957513768686, "grad_norm": 0.22877095520766147, "learning_rate": 3.841274501296613e-05, "loss": 0.5235, "step": 510 }, { "epoch": 1.6105428796223447, "grad_norm": 0.24251420163715415, "learning_rate": 3.84051832772516e-05, "loss": 0.5144, "step": 511 }, { "epoch": 1.6136900078678207, "grad_norm": 0.20511713060127806, "learning_rate": 3.839760440697268e-05, "loss": 0.5258, "step": 512 }, { "epoch": 1.6168371361132965, "grad_norm": 0.23501746426992737, "learning_rate": 3.83900084100452e-05, "loss": 0.5218, "step": 513 }, { "epoch": 1.6199842643587727, "grad_norm": 0.24300834825222642, "learning_rate": 3.838239529440287e-05, "loss": 0.5201, "step": 514 }, { "epoch": 1.6231313926042485, "grad_norm": 0.2415867797392267, "learning_rate": 3.83747650679973e-05, "loss": 0.5214, "step": 515 }, { "epoch": 1.6262785208497246, "grad_norm": 0.24575403339954768, "learning_rate": 3.836711773879795e-05, "loss": 0.5262, "step": 516 }, { "epoch": 1.6294256490952006, "grad_norm": 0.19675787482506665, "learning_rate": 3.835945331479216e-05, "loss": 0.5144, "step": 517 }, { "epoch": 1.6325727773406766, "grad_norm": 0.26577039634688837, "learning_rate": 3.8351771803985115e-05, "loss": 0.5192, "step": 518 }, { "epoch": 1.6357199055861527, "grad_norm": 0.286665435100784, "learning_rate": 3.8344073214399845e-05, "loss": 0.5291, "step": 519 }, { "epoch": 1.6388670338316287, "grad_norm": 0.18836938646912646, "learning_rate": 3.833635755407723e-05, "loss": 0.5109, "step": 520 }, { "epoch": 1.6420141620771047, "grad_norm": 0.2650168026961069, "learning_rate": 3.832862483107597e-05, "loss": 0.5221, "step": 521 }, { "epoch": 1.6451612903225805, "grad_norm": 0.2361325083482264, "learning_rate": 3.832087505347257e-05, "loss": 0.5219, "step": 522 }, { "epoch": 1.6483084185680568, "grad_norm": 0.19034304753715936, "learning_rate": 3.831310822936139e-05, "loss": 0.5249, "step": 523 }, { "epoch": 1.6514555468135326, "grad_norm": 0.22876829423081257, "learning_rate": 3.830532436685457e-05, "loss": 0.5144, "step": 524 }, { "epoch": 1.6546026750590088, "grad_norm": 0.19602966120535223, "learning_rate": 3.829752347408202e-05, "loss": 0.5137, "step": 525 }, { "epoch": 1.6577498033044846, "grad_norm": 0.19990915987444982, "learning_rate": 3.8289705559191495e-05, "loss": 0.5188, "step": 526 }, { "epoch": 1.6608969315499607, "grad_norm": 0.22044066914833604, "learning_rate": 3.8281870630348483e-05, "loss": 0.5147, "step": 527 }, { "epoch": 1.6640440597954367, "grad_norm": 0.21061663961165006, "learning_rate": 3.827401869573626e-05, "loss": 0.5231, "step": 528 }, { "epoch": 1.6671911880409127, "grad_norm": 0.21712763474322638, "learning_rate": 3.826614976355584e-05, "loss": 0.5276, "step": 529 }, { "epoch": 1.6703383162863887, "grad_norm": 0.23698263316551577, "learning_rate": 3.825826384202604e-05, "loss": 0.512, "step": 530 }, { "epoch": 1.6734854445318645, "grad_norm": 0.26433812000215734, "learning_rate": 3.8250360939383384e-05, "loss": 0.5205, "step": 531 }, { "epoch": 1.6766325727773408, "grad_norm": 0.23388260377969083, "learning_rate": 3.8242441063882145e-05, "loss": 0.5158, "step": 532 }, { "epoch": 1.6797797010228166, "grad_norm": 0.23031333677705712, "learning_rate": 3.82345042237943e-05, "loss": 0.5211, "step": 533 }, { "epoch": 1.6829268292682928, "grad_norm": 0.2548042458770785, "learning_rate": 3.822655042740959e-05, "loss": 0.5198, "step": 534 }, { "epoch": 1.6860739575137687, "grad_norm": 0.22638295828893965, "learning_rate": 3.8218579683035425e-05, "loss": 0.5238, "step": 535 }, { "epoch": 1.6892210857592447, "grad_norm": 0.2262970560082153, "learning_rate": 3.8210591998996924e-05, "loss": 0.5202, "step": 536 }, { "epoch": 1.6923682140047207, "grad_norm": 0.21411782792791356, "learning_rate": 3.8202587383636926e-05, "loss": 0.5222, "step": 537 }, { "epoch": 1.6955153422501967, "grad_norm": 0.20447045372047343, "learning_rate": 3.8194565845315936e-05, "loss": 0.5173, "step": 538 }, { "epoch": 1.6986624704956728, "grad_norm": 0.2162189855266448, "learning_rate": 3.818652739241211e-05, "loss": 0.5144, "step": 539 }, { "epoch": 1.7018095987411486, "grad_norm": 0.23817213025322223, "learning_rate": 3.817847203332131e-05, "loss": 0.5239, "step": 540 }, { "epoch": 1.7049567269866248, "grad_norm": 0.2451599843139077, "learning_rate": 3.8170399776457044e-05, "loss": 0.5252, "step": 541 }, { "epoch": 1.7081038552321006, "grad_norm": 0.21173486339403857, "learning_rate": 3.816231063025045e-05, "loss": 0.5144, "step": 542 }, { "epoch": 1.7112509834775769, "grad_norm": 0.22798045746972564, "learning_rate": 3.8154204603150334e-05, "loss": 0.5246, "step": 543 }, { "epoch": 1.7143981117230527, "grad_norm": 0.22874807203605335, "learning_rate": 3.814608170362311e-05, "loss": 0.5171, "step": 544 }, { "epoch": 1.7175452399685287, "grad_norm": 0.20891847321188745, "learning_rate": 3.8137941940152834e-05, "loss": 0.5196, "step": 545 }, { "epoch": 1.7206923682140047, "grad_norm": 0.21664552315581692, "learning_rate": 3.812978532124116e-05, "loss": 0.5074, "step": 546 }, { "epoch": 1.7238394964594808, "grad_norm": 0.20341701815710325, "learning_rate": 3.812161185540736e-05, "loss": 0.5167, "step": 547 }, { "epoch": 1.7269866247049568, "grad_norm": 0.21511450945343744, "learning_rate": 3.811342155118829e-05, "loss": 0.5192, "step": 548 }, { "epoch": 1.7301337529504326, "grad_norm": 0.2194147912509461, "learning_rate": 3.81052144171384e-05, "loss": 0.5225, "step": 549 }, { "epoch": 1.7332808811959088, "grad_norm": 0.22863514788872993, "learning_rate": 3.809699046182972e-05, "loss": 0.5081, "step": 550 }, { "epoch": 1.7364280094413846, "grad_norm": 0.207709622035488, "learning_rate": 3.808874969385184e-05, "loss": 0.5089, "step": 551 }, { "epoch": 1.739575137686861, "grad_norm": 0.21852716207508774, "learning_rate": 3.808049212181192e-05, "loss": 0.5198, "step": 552 }, { "epoch": 1.7427222659323367, "grad_norm": 0.22298890603933133, "learning_rate": 3.8072217754334655e-05, "loss": 0.52, "step": 553 }, { "epoch": 1.7458693941778127, "grad_norm": 0.2532878106195187, "learning_rate": 3.8063926600062315e-05, "loss": 0.5145, "step": 554 }, { "epoch": 1.7490165224232888, "grad_norm": 0.24360819970385728, "learning_rate": 3.805561866765467e-05, "loss": 0.5141, "step": 555 }, { "epoch": 1.7521636506687648, "grad_norm": 0.20861292145147736, "learning_rate": 3.8047293965789025e-05, "loss": 0.5196, "step": 556 }, { "epoch": 1.7553107789142408, "grad_norm": 0.21285677902378522, "learning_rate": 3.803895250316021e-05, "loss": 0.5121, "step": 557 }, { "epoch": 1.7584579071597166, "grad_norm": 0.2123709291037103, "learning_rate": 3.803059428848054e-05, "loss": 0.5176, "step": 558 }, { "epoch": 1.7616050354051929, "grad_norm": 0.20536678541188122, "learning_rate": 3.8022219330479854e-05, "loss": 0.5209, "step": 559 }, { "epoch": 1.7647521636506687, "grad_norm": 0.21563446035753572, "learning_rate": 3.801382763790546e-05, "loss": 0.5206, "step": 560 }, { "epoch": 1.767899291896145, "grad_norm": 0.1986468939526788, "learning_rate": 3.800541921952213e-05, "loss": 0.5208, "step": 561 }, { "epoch": 1.7710464201416207, "grad_norm": 0.1966987426971003, "learning_rate": 3.799699408411215e-05, "loss": 0.5073, "step": 562 }, { "epoch": 1.7741935483870968, "grad_norm": 0.216109363155467, "learning_rate": 3.7988552240475235e-05, "loss": 0.5148, "step": 563 }, { "epoch": 1.7773406766325728, "grad_norm": 0.1879949670400624, "learning_rate": 3.7980093697428545e-05, "loss": 0.5253, "step": 564 }, { "epoch": 1.7804878048780488, "grad_norm": 0.21164171435059104, "learning_rate": 3.797161846380669e-05, "loss": 0.5131, "step": 565 }, { "epoch": 1.7836349331235248, "grad_norm": 0.21236700030393402, "learning_rate": 3.796312654846174e-05, "loss": 0.5262, "step": 566 }, { "epoch": 1.7867820613690006, "grad_norm": 0.23772908828608705, "learning_rate": 3.795461796026314e-05, "loss": 0.5161, "step": 567 }, { "epoch": 1.789929189614477, "grad_norm": 0.1989690141679018, "learning_rate": 3.794609270809779e-05, "loss": 0.5148, "step": 568 }, { "epoch": 1.7930763178599527, "grad_norm": 0.19386929212613396, "learning_rate": 3.793755080086997e-05, "loss": 0.5136, "step": 569 }, { "epoch": 1.796223446105429, "grad_norm": 0.20873828711474318, "learning_rate": 3.792899224750136e-05, "loss": 0.5285, "step": 570 }, { "epoch": 1.7993705743509048, "grad_norm": 0.21110002963411045, "learning_rate": 3.7920417056931046e-05, "loss": 0.5261, "step": 571 }, { "epoch": 1.8025177025963808, "grad_norm": 0.1720427492785605, "learning_rate": 3.791182523811545e-05, "loss": 0.5144, "step": 572 }, { "epoch": 1.8056648308418568, "grad_norm": 0.2174774642242585, "learning_rate": 3.7903216800028416e-05, "loss": 0.5106, "step": 573 }, { "epoch": 1.8088119590873328, "grad_norm": 0.19514122415387664, "learning_rate": 3.789459175166109e-05, "loss": 0.5228, "step": 574 }, { "epoch": 1.8119590873328089, "grad_norm": 0.18369904706820767, "learning_rate": 3.7885950102022014e-05, "loss": 0.5135, "step": 575 }, { "epoch": 1.8151062155782847, "grad_norm": 0.20445427723961576, "learning_rate": 3.787729186013704e-05, "loss": 0.5148, "step": 576 }, { "epoch": 1.818253343823761, "grad_norm": 0.1933783032598251, "learning_rate": 3.786861703504936e-05, "loss": 0.5215, "step": 577 }, { "epoch": 1.8214004720692367, "grad_norm": 0.18342847430157735, "learning_rate": 3.7859925635819476e-05, "loss": 0.5128, "step": 578 }, { "epoch": 1.824547600314713, "grad_norm": 0.2226340450308752, "learning_rate": 3.785121767152523e-05, "loss": 0.5283, "step": 579 }, { "epoch": 1.8276947285601888, "grad_norm": 0.19369436637649629, "learning_rate": 3.784249315126173e-05, "loss": 0.5148, "step": 580 }, { "epoch": 1.8308418568056648, "grad_norm": 0.19807163830925228, "learning_rate": 3.783375208414139e-05, "loss": 0.5151, "step": 581 }, { "epoch": 1.8339889850511408, "grad_norm": 0.18328220410897705, "learning_rate": 3.782499447929392e-05, "loss": 0.514, "step": 582 }, { "epoch": 1.8371361132966169, "grad_norm": 0.1839242669305662, "learning_rate": 3.7816220345866294e-05, "loss": 0.5251, "step": 583 }, { "epoch": 1.8402832415420929, "grad_norm": 0.18986955880243364, "learning_rate": 3.780742969302273e-05, "loss": 0.5131, "step": 584 }, { "epoch": 1.8434303697875687, "grad_norm": 0.19594785121809236, "learning_rate": 3.7798622529944735e-05, "loss": 0.5161, "step": 585 }, { "epoch": 1.846577498033045, "grad_norm": 0.2357648214241525, "learning_rate": 3.7789798865831024e-05, "loss": 0.5156, "step": 586 }, { "epoch": 1.8497246262785207, "grad_norm": 0.19304354227740758, "learning_rate": 3.778095870989758e-05, "loss": 0.5203, "step": 587 }, { "epoch": 1.852871754523997, "grad_norm": 0.22728068274130966, "learning_rate": 3.777210207137759e-05, "loss": 0.5321, "step": 588 }, { "epoch": 1.8560188827694728, "grad_norm": 0.2017775324752321, "learning_rate": 3.7763228959521465e-05, "loss": 0.5242, "step": 589 }, { "epoch": 1.8591660110149488, "grad_norm": 0.22300635096362367, "learning_rate": 3.775433938359681e-05, "loss": 0.5231, "step": 590 }, { "epoch": 1.8623131392604249, "grad_norm": 0.1996306525906858, "learning_rate": 3.774543335288845e-05, "loss": 0.5221, "step": 591 }, { "epoch": 1.8654602675059009, "grad_norm": 0.16852847329609896, "learning_rate": 3.773651087669837e-05, "loss": 0.5107, "step": 592 }, { "epoch": 1.868607395751377, "grad_norm": 0.24727169403577154, "learning_rate": 3.7727571964345745e-05, "loss": 0.522, "step": 593 }, { "epoch": 1.8717545239968527, "grad_norm": 0.22616575051796836, "learning_rate": 3.771861662516692e-05, "loss": 0.5109, "step": 594 }, { "epoch": 1.874901652242329, "grad_norm": 0.2500551252876427, "learning_rate": 3.7709644868515386e-05, "loss": 0.514, "step": 595 }, { "epoch": 1.8780487804878048, "grad_norm": 0.19733979061423199, "learning_rate": 3.770065670376179e-05, "loss": 0.5138, "step": 596 }, { "epoch": 1.881195908733281, "grad_norm": 0.23793266287535486, "learning_rate": 3.769165214029392e-05, "loss": 0.5151, "step": 597 }, { "epoch": 1.8843430369787568, "grad_norm": 0.20047355737675948, "learning_rate": 3.768263118751667e-05, "loss": 0.5195, "step": 598 }, { "epoch": 1.8874901652242329, "grad_norm": 0.1734693730368024, "learning_rate": 3.767359385485208e-05, "loss": 0.5067, "step": 599 }, { "epoch": 1.8906372934697089, "grad_norm": 0.20206213476307608, "learning_rate": 3.766454015173929e-05, "loss": 0.5161, "step": 600 }, { "epoch": 1.893784421715185, "grad_norm": 0.20882657364965593, "learning_rate": 3.765547008763453e-05, "loss": 0.5103, "step": 601 }, { "epoch": 1.896931549960661, "grad_norm": 0.2021864503575807, "learning_rate": 3.764638367201112e-05, "loss": 0.5004, "step": 602 }, { "epoch": 1.9000786782061367, "grad_norm": 0.20424529742189995, "learning_rate": 3.763728091435946e-05, "loss": 0.5162, "step": 603 }, { "epoch": 1.903225806451613, "grad_norm": 0.1961209704262886, "learning_rate": 3.7628161824187025e-05, "loss": 0.518, "step": 604 }, { "epoch": 1.9063729346970888, "grad_norm": 0.23767968238521123, "learning_rate": 3.7619026411018345e-05, "loss": 0.5069, "step": 605 }, { "epoch": 1.909520062942565, "grad_norm": 0.21212885274385532, "learning_rate": 3.7609874684394994e-05, "loss": 0.519, "step": 606 }, { "epoch": 1.9126671911880408, "grad_norm": 0.21268544914645238, "learning_rate": 3.760070665387558e-05, "loss": 0.5136, "step": 607 }, { "epoch": 1.9158143194335169, "grad_norm": 0.20821784947192484, "learning_rate": 3.7591522329035763e-05, "loss": 0.5159, "step": 608 }, { "epoch": 1.918961447678993, "grad_norm": 0.22337933592922737, "learning_rate": 3.75823217194682e-05, "loss": 0.519, "step": 609 }, { "epoch": 1.922108575924469, "grad_norm": 0.25135156804926945, "learning_rate": 3.7573104834782566e-05, "loss": 0.5153, "step": 610 }, { "epoch": 1.925255704169945, "grad_norm": 0.197509832149269, "learning_rate": 3.756387168460552e-05, "loss": 0.5247, "step": 611 }, { "epoch": 1.9284028324154208, "grad_norm": 0.2389600063017852, "learning_rate": 3.7554622278580735e-05, "loss": 0.5166, "step": 612 }, { "epoch": 1.931549960660897, "grad_norm": 0.26288745403046027, "learning_rate": 3.754535662636884e-05, "loss": 0.5236, "step": 613 }, { "epoch": 1.9346970889063728, "grad_norm": 0.20996861395700098, "learning_rate": 3.7536074737647455e-05, "loss": 0.5168, "step": 614 }, { "epoch": 1.937844217151849, "grad_norm": 0.23502763223514755, "learning_rate": 3.752677662211114e-05, "loss": 0.5185, "step": 615 }, { "epoch": 1.9409913453973249, "grad_norm": 0.22436762734979818, "learning_rate": 3.75174622894714e-05, "loss": 0.5207, "step": 616 }, { "epoch": 1.944138473642801, "grad_norm": 0.1810432051540689, "learning_rate": 3.7508131749456696e-05, "loss": 0.5161, "step": 617 }, { "epoch": 1.947285601888277, "grad_norm": 0.2522520252638888, "learning_rate": 3.74987850118124e-05, "loss": 0.5112, "step": 618 }, { "epoch": 1.950432730133753, "grad_norm": 0.25396670669216415, "learning_rate": 3.748942208630082e-05, "loss": 0.5221, "step": 619 }, { "epoch": 1.953579858379229, "grad_norm": 0.22197793438383823, "learning_rate": 3.748004298270115e-05, "loss": 0.5162, "step": 620 }, { "epoch": 1.956726986624705, "grad_norm": 0.23177715709488106, "learning_rate": 3.74706477108095e-05, "loss": 0.5106, "step": 621 }, { "epoch": 1.959874114870181, "grad_norm": 0.22079140874889208, "learning_rate": 3.746123628043886e-05, "loss": 0.5193, "step": 622 }, { "epoch": 1.9630212431156568, "grad_norm": 0.22902705592617217, "learning_rate": 3.745180870141908e-05, "loss": 0.5147, "step": 623 }, { "epoch": 1.966168371361133, "grad_norm": 0.21218179122207526, "learning_rate": 3.744236498359692e-05, "loss": 0.5139, "step": 624 }, { "epoch": 1.969315499606609, "grad_norm": 0.2289304429026494, "learning_rate": 3.743290513683595e-05, "loss": 0.509, "step": 625 }, { "epoch": 1.972462627852085, "grad_norm": 0.19761688027384242, "learning_rate": 3.742342917101661e-05, "loss": 0.5108, "step": 626 }, { "epoch": 1.975609756097561, "grad_norm": 0.20658238399745513, "learning_rate": 3.741393709603617e-05, "loss": 0.5162, "step": 627 }, { "epoch": 1.978756884343037, "grad_norm": 0.22387644395974296, "learning_rate": 3.740442892180873e-05, "loss": 0.5176, "step": 628 }, { "epoch": 1.981904012588513, "grad_norm": 0.2099407145968934, "learning_rate": 3.7394904658265205e-05, "loss": 0.5193, "step": 629 }, { "epoch": 1.985051140833989, "grad_norm": 0.20355440734215408, "learning_rate": 3.7385364315353305e-05, "loss": 0.502, "step": 630 }, { "epoch": 1.988198269079465, "grad_norm": 0.2737712591861954, "learning_rate": 3.7375807903037534e-05, "loss": 0.5146, "step": 631 }, { "epoch": 1.9913453973249409, "grad_norm": 0.3107234957383597, "learning_rate": 3.73662354312992e-05, "loss": 0.5181, "step": 632 }, { "epoch": 1.9944925255704171, "grad_norm": 0.24548807968691783, "learning_rate": 3.735664691013636e-05, "loss": 0.5078, "step": 633 }, { "epoch": 1.997639653815893, "grad_norm": 0.21987770768265905, "learning_rate": 3.734704234956385e-05, "loss": 0.5089, "step": 634 }, { "epoch": 2.003147128245476, "grad_norm": 0.5383301018870941, "learning_rate": 3.7337421759613255e-05, "loss": 1.0306, "step": 635 }, { "epoch": 2.006294256490952, "grad_norm": 0.7404093702367391, "learning_rate": 3.7327785150332896e-05, "loss": 0.4838, "step": 636 }, { "epoch": 2.009441384736428, "grad_norm": 0.7862043686171012, "learning_rate": 3.7318132531787835e-05, "loss": 0.481, "step": 637 }, { "epoch": 2.012588512981904, "grad_norm": 0.5852793874262968, "learning_rate": 3.7308463914059846e-05, "loss": 0.4815, "step": 638 }, { "epoch": 2.01573564122738, "grad_norm": 0.3672440684726176, "learning_rate": 3.729877930724741e-05, "loss": 0.4752, "step": 639 }, { "epoch": 2.018882769472856, "grad_norm": 0.5159055161972755, "learning_rate": 3.7289078721465735e-05, "loss": 0.4769, "step": 640 }, { "epoch": 2.022029897718332, "grad_norm": 0.397198576601526, "learning_rate": 3.7279362166846677e-05, "loss": 0.4794, "step": 641 }, { "epoch": 2.025177025963808, "grad_norm": 0.42002417967064054, "learning_rate": 3.726962965353881e-05, "loss": 0.4833, "step": 642 }, { "epoch": 2.028324154209284, "grad_norm": 0.44343130968400324, "learning_rate": 3.725988119170735e-05, "loss": 0.4759, "step": 643 }, { "epoch": 2.03147128245476, "grad_norm": 0.3602187948121981, "learning_rate": 3.725011679153418e-05, "loss": 0.47, "step": 644 }, { "epoch": 2.034618410700236, "grad_norm": 0.3689059788914393, "learning_rate": 3.7240336463217824e-05, "loss": 0.4845, "step": 645 }, { "epoch": 2.037765538945712, "grad_norm": 0.34291838849435596, "learning_rate": 3.723054021697346e-05, "loss": 0.4788, "step": 646 }, { "epoch": 2.040912667191188, "grad_norm": 0.34322337603866276, "learning_rate": 3.722072806303287e-05, "loss": 0.4714, "step": 647 }, { "epoch": 2.044059795436664, "grad_norm": 0.3243197383293679, "learning_rate": 3.721090001164447e-05, "loss": 0.4784, "step": 648 }, { "epoch": 2.04720692368214, "grad_norm": 0.28998221744555436, "learning_rate": 3.720105607307326e-05, "loss": 0.4787, "step": 649 }, { "epoch": 2.050354051927616, "grad_norm": 0.3306107669906591, "learning_rate": 3.7191196257600845e-05, "loss": 0.475, "step": 650 }, { "epoch": 2.0535011801730922, "grad_norm": 0.25616746795125434, "learning_rate": 3.718132057552542e-05, "loss": 0.4727, "step": 651 }, { "epoch": 2.056648308418568, "grad_norm": 0.3110708535061029, "learning_rate": 3.7171429037161735e-05, "loss": 0.4815, "step": 652 }, { "epoch": 2.059795436664044, "grad_norm": 0.27174269235855014, "learning_rate": 3.7161521652841114e-05, "loss": 0.4792, "step": 653 }, { "epoch": 2.06294256490952, "grad_norm": 0.2751966115922098, "learning_rate": 3.715159843291143e-05, "loss": 0.4737, "step": 654 }, { "epoch": 2.066089693154996, "grad_norm": 0.2793498805071301, "learning_rate": 3.714165938773709e-05, "loss": 0.4797, "step": 655 }, { "epoch": 2.069236821400472, "grad_norm": 0.22565153747735522, "learning_rate": 3.713170452769903e-05, "loss": 0.4734, "step": 656 }, { "epoch": 2.072383949645948, "grad_norm": 0.2703231691240761, "learning_rate": 3.712173386319472e-05, "loss": 0.4798, "step": 657 }, { "epoch": 2.075531077891424, "grad_norm": 0.20941889089054108, "learning_rate": 3.711174740463811e-05, "loss": 0.4767, "step": 658 }, { "epoch": 2.0786782061369, "grad_norm": 0.24127646716392062, "learning_rate": 3.710174516245967e-05, "loss": 0.4752, "step": 659 }, { "epoch": 2.0818253343823763, "grad_norm": 0.22857637349920826, "learning_rate": 3.7091727147106336e-05, "loss": 0.4816, "step": 660 }, { "epoch": 2.084972462627852, "grad_norm": 0.2403738146911501, "learning_rate": 3.7081693369041544e-05, "loss": 0.4802, "step": 661 }, { "epoch": 2.088119590873328, "grad_norm": 0.2239762593172141, "learning_rate": 3.707164383874516e-05, "loss": 0.4729, "step": 662 }, { "epoch": 2.091266719118804, "grad_norm": 0.2532472006395691, "learning_rate": 3.706157856671353e-05, "loss": 0.4775, "step": 663 }, { "epoch": 2.09441384736428, "grad_norm": 0.2368186581534253, "learning_rate": 3.7051497563459436e-05, "loss": 0.4747, "step": 664 }, { "epoch": 2.097560975609756, "grad_norm": 0.21848874983233801, "learning_rate": 3.704140083951208e-05, "loss": 0.4744, "step": 665 }, { "epoch": 2.100708103855232, "grad_norm": 0.2589340916721653, "learning_rate": 3.703128840541709e-05, "loss": 0.4686, "step": 666 }, { "epoch": 2.1038552321007082, "grad_norm": 0.1951064514885459, "learning_rate": 3.7021160271736505e-05, "loss": 0.4716, "step": 667 }, { "epoch": 2.107002360346184, "grad_norm": 0.27598581262671057, "learning_rate": 3.701101644904876e-05, "loss": 0.474, "step": 668 }, { "epoch": 2.1101494885916603, "grad_norm": 0.2065313958477124, "learning_rate": 3.7000856947948676e-05, "loss": 0.4715, "step": 669 }, { "epoch": 2.113296616837136, "grad_norm": 0.23178764564943305, "learning_rate": 3.699068177904745e-05, "loss": 0.4806, "step": 670 }, { "epoch": 2.116443745082612, "grad_norm": 0.20461181525036945, "learning_rate": 3.698049095297265e-05, "loss": 0.4748, "step": 671 }, { "epoch": 2.119590873328088, "grad_norm": 0.2350705336847509, "learning_rate": 3.697028448036817e-05, "loss": 0.4729, "step": 672 }, { "epoch": 2.122738001573564, "grad_norm": 0.20120939367328008, "learning_rate": 3.696006237189429e-05, "loss": 0.4786, "step": 673 }, { "epoch": 2.12588512981904, "grad_norm": 0.22060728235722682, "learning_rate": 3.6949824638227585e-05, "loss": 0.4774, "step": 674 }, { "epoch": 2.129032258064516, "grad_norm": 0.19177609563558462, "learning_rate": 3.693957129006096e-05, "loss": 0.484, "step": 675 }, { "epoch": 2.1321793863099923, "grad_norm": 0.25855343543212544, "learning_rate": 3.692930233810364e-05, "loss": 0.4837, "step": 676 }, { "epoch": 2.135326514555468, "grad_norm": 0.22818364530705276, "learning_rate": 3.691901779308113e-05, "loss": 0.4774, "step": 677 }, { "epoch": 2.1384736428009443, "grad_norm": 0.20914360052852593, "learning_rate": 3.690871766573523e-05, "loss": 0.4728, "step": 678 }, { "epoch": 2.14162077104642, "grad_norm": 0.25587656475568926, "learning_rate": 3.6898401966824035e-05, "loss": 0.4698, "step": 679 }, { "epoch": 2.144767899291896, "grad_norm": 0.1826067226207546, "learning_rate": 3.688807070712186e-05, "loss": 0.4761, "step": 680 }, { "epoch": 2.147915027537372, "grad_norm": 0.22738148878540357, "learning_rate": 3.68777238974193e-05, "loss": 0.4714, "step": 681 }, { "epoch": 2.151062155782848, "grad_norm": 0.2111664202105763, "learning_rate": 3.68673615485232e-05, "loss": 0.4774, "step": 682 }, { "epoch": 2.1542092840283242, "grad_norm": 0.24863885294296703, "learning_rate": 3.685698367125662e-05, "loss": 0.4743, "step": 683 }, { "epoch": 2.1573564122738, "grad_norm": 0.21812972928565583, "learning_rate": 3.684659027645884e-05, "loss": 0.469, "step": 684 }, { "epoch": 2.1605035405192763, "grad_norm": 0.24327955100377346, "learning_rate": 3.683618137498535e-05, "loss": 0.4781, "step": 685 }, { "epoch": 2.163650668764752, "grad_norm": 0.2120560203500377, "learning_rate": 3.6825756977707826e-05, "loss": 0.4718, "step": 686 }, { "epoch": 2.1667977970102283, "grad_norm": 0.24161382970696385, "learning_rate": 3.6815317095514145e-05, "loss": 0.4767, "step": 687 }, { "epoch": 2.169944925255704, "grad_norm": 0.20329140472495538, "learning_rate": 3.680486173930835e-05, "loss": 0.4827, "step": 688 }, { "epoch": 2.17309205350118, "grad_norm": 0.24761268049420182, "learning_rate": 3.679439092001065e-05, "loss": 0.4608, "step": 689 }, { "epoch": 2.176239181746656, "grad_norm": 0.1871251156119415, "learning_rate": 3.6783904648557396e-05, "loss": 0.4695, "step": 690 }, { "epoch": 2.179386309992132, "grad_norm": 0.2205720555772292, "learning_rate": 3.67734029359011e-05, "loss": 0.4717, "step": 691 }, { "epoch": 2.1825334382376083, "grad_norm": 0.21386365254718184, "learning_rate": 3.676288579301036e-05, "loss": 0.4764, "step": 692 }, { "epoch": 2.185680566483084, "grad_norm": 0.18868765682443855, "learning_rate": 3.6752353230869925e-05, "loss": 0.4698, "step": 693 }, { "epoch": 2.1888276947285603, "grad_norm": 0.22008810612187404, "learning_rate": 3.6741805260480644e-05, "loss": 0.4713, "step": 694 }, { "epoch": 2.191974822974036, "grad_norm": 0.20786113581001195, "learning_rate": 3.673124189285945e-05, "loss": 0.4806, "step": 695 }, { "epoch": 2.1951219512195124, "grad_norm": 0.21496969539942087, "learning_rate": 3.672066313903937e-05, "loss": 0.4713, "step": 696 }, { "epoch": 2.198269079464988, "grad_norm": 0.19763508819414857, "learning_rate": 3.671006901006948e-05, "loss": 0.4736, "step": 697 }, { "epoch": 2.201416207710464, "grad_norm": 0.2293997083330246, "learning_rate": 3.669945951701494e-05, "loss": 0.4764, "step": 698 }, { "epoch": 2.2045633359559402, "grad_norm": 0.22585825760183822, "learning_rate": 3.668883467095694e-05, "loss": 0.4734, "step": 699 }, { "epoch": 2.207710464201416, "grad_norm": 0.20186256772917444, "learning_rate": 3.6678194482992716e-05, "loss": 0.4777, "step": 700 }, { "epoch": 2.2108575924468923, "grad_norm": 0.19540930931295644, "learning_rate": 3.666753896423551e-05, "loss": 0.4846, "step": 701 }, { "epoch": 2.214004720692368, "grad_norm": 0.21556486232945823, "learning_rate": 3.6656868125814605e-05, "loss": 0.4797, "step": 702 }, { "epoch": 2.2171518489378443, "grad_norm": 0.19136441764044193, "learning_rate": 3.664618197887526e-05, "loss": 0.4722, "step": 703 }, { "epoch": 2.22029897718332, "grad_norm": 0.24889924698530747, "learning_rate": 3.663548053457873e-05, "loss": 0.4824, "step": 704 }, { "epoch": 2.2234461054287964, "grad_norm": 0.20658957990757543, "learning_rate": 3.662476380410227e-05, "loss": 0.4728, "step": 705 }, { "epoch": 2.226593233674272, "grad_norm": 0.19277909826894754, "learning_rate": 3.661403179863905e-05, "loss": 0.4724, "step": 706 }, { "epoch": 2.229740361919748, "grad_norm": 0.21254918935528136, "learning_rate": 3.660328452939825e-05, "loss": 0.4762, "step": 707 }, { "epoch": 2.2328874901652243, "grad_norm": 0.2037212638248634, "learning_rate": 3.659252200760495e-05, "loss": 0.4609, "step": 708 }, { "epoch": 2.2360346184107, "grad_norm": 0.17871023945609968, "learning_rate": 3.658174424450019e-05, "loss": 0.4748, "step": 709 }, { "epoch": 2.2391817466561763, "grad_norm": 0.21443695153487202, "learning_rate": 3.657095125134091e-05, "loss": 0.4753, "step": 710 }, { "epoch": 2.242328874901652, "grad_norm": 0.19069156080979188, "learning_rate": 3.656014303939996e-05, "loss": 0.4717, "step": 711 }, { "epoch": 2.2454760031471284, "grad_norm": 0.2069028744399135, "learning_rate": 3.654931961996611e-05, "loss": 0.4783, "step": 712 }, { "epoch": 2.248623131392604, "grad_norm": 0.18637707862361488, "learning_rate": 3.653848100434397e-05, "loss": 0.4832, "step": 713 }, { "epoch": 2.2517702596380804, "grad_norm": 0.21210206014309427, "learning_rate": 3.652762720385406e-05, "loss": 0.4826, "step": 714 }, { "epoch": 2.2549173878835562, "grad_norm": 0.18374915556791416, "learning_rate": 3.651675822983273e-05, "loss": 0.4728, "step": 715 }, { "epoch": 2.258064516129032, "grad_norm": 0.21024735238705872, "learning_rate": 3.65058740936322e-05, "loss": 0.4706, "step": 716 }, { "epoch": 2.2612116443745083, "grad_norm": 0.18392863904521575, "learning_rate": 3.649497480662053e-05, "loss": 0.4795, "step": 717 }, { "epoch": 2.264358772619984, "grad_norm": 0.19781125359127497, "learning_rate": 3.648406038018158e-05, "loss": 0.4774, "step": 718 }, { "epoch": 2.2675059008654603, "grad_norm": 0.21244857359184766, "learning_rate": 3.6473130825715036e-05, "loss": 0.4778, "step": 719 }, { "epoch": 2.270653029110936, "grad_norm": 0.2240315261327545, "learning_rate": 3.64621861546364e-05, "loss": 0.4768, "step": 720 }, { "epoch": 2.2738001573564124, "grad_norm": 0.19195015387576453, "learning_rate": 3.645122637837693e-05, "loss": 0.4761, "step": 721 }, { "epoch": 2.276947285601888, "grad_norm": 0.23948727145219656, "learning_rate": 3.644025150838368e-05, "loss": 0.4843, "step": 722 }, { "epoch": 2.2800944138473644, "grad_norm": 0.2044549094308154, "learning_rate": 3.642926155611949e-05, "loss": 0.4799, "step": 723 }, { "epoch": 2.2832415420928402, "grad_norm": 0.1912737434372698, "learning_rate": 3.64182565330629e-05, "loss": 0.477, "step": 724 }, { "epoch": 2.286388670338316, "grad_norm": 0.2225878785242701, "learning_rate": 3.6407236450708235e-05, "loss": 0.4659, "step": 725 }, { "epoch": 2.2895357985837923, "grad_norm": 0.18432866393581965, "learning_rate": 3.639620132056553e-05, "loss": 0.4817, "step": 726 }, { "epoch": 2.292682926829268, "grad_norm": 0.19897028209983136, "learning_rate": 3.638515115416055e-05, "loss": 0.4833, "step": 727 }, { "epoch": 2.2958300550747444, "grad_norm": 0.20081135317562743, "learning_rate": 3.637408596303476e-05, "loss": 0.4704, "step": 728 }, { "epoch": 2.29897718332022, "grad_norm": 0.1910907865185398, "learning_rate": 3.63630057587453e-05, "loss": 0.4825, "step": 729 }, { "epoch": 2.3021243115656964, "grad_norm": 0.2079462815404506, "learning_rate": 3.6351910552865e-05, "loss": 0.4757, "step": 730 }, { "epoch": 2.305271439811172, "grad_norm": 0.18921524052483815, "learning_rate": 3.634080035698238e-05, "loss": 0.4828, "step": 731 }, { "epoch": 2.3084185680566485, "grad_norm": 0.19748083973385058, "learning_rate": 3.632967518270159e-05, "loss": 0.4747, "step": 732 }, { "epoch": 2.3115656963021243, "grad_norm": 0.18371527979865251, "learning_rate": 3.6318535041642434e-05, "loss": 0.4787, "step": 733 }, { "epoch": 2.3147128245476, "grad_norm": 0.1802590221169126, "learning_rate": 3.630737994544036e-05, "loss": 0.4771, "step": 734 }, { "epoch": 2.3178599527930763, "grad_norm": 0.178911843071879, "learning_rate": 3.6296209905746416e-05, "loss": 0.4691, "step": 735 }, { "epoch": 2.321007081038552, "grad_norm": 0.20053075969824302, "learning_rate": 3.628502493422726e-05, "loss": 0.4779, "step": 736 }, { "epoch": 2.3241542092840284, "grad_norm": 0.17770666358757115, "learning_rate": 3.627382504256516e-05, "loss": 0.4771, "step": 737 }, { "epoch": 2.327301337529504, "grad_norm": 0.19730337436045323, "learning_rate": 3.626261024245795e-05, "loss": 0.4707, "step": 738 }, { "epoch": 2.3304484657749804, "grad_norm": 0.19412352087085247, "learning_rate": 3.625138054561906e-05, "loss": 0.4781, "step": 739 }, { "epoch": 2.3335955940204562, "grad_norm": 0.17759746668876183, "learning_rate": 3.6240135963777446e-05, "loss": 0.4705, "step": 740 }, { "epoch": 2.3367427222659325, "grad_norm": 0.2043623039312422, "learning_rate": 3.622887650867765e-05, "loss": 0.4684, "step": 741 }, { "epoch": 2.3398898505114083, "grad_norm": 0.20845845270662547, "learning_rate": 3.6217602192079706e-05, "loss": 0.477, "step": 742 }, { "epoch": 2.343036978756884, "grad_norm": 0.18687350571910952, "learning_rate": 3.620631302575921e-05, "loss": 0.4768, "step": 743 }, { "epoch": 2.3461841070023604, "grad_norm": 0.21502507739077148, "learning_rate": 3.619500902150723e-05, "loss": 0.4772, "step": 744 }, { "epoch": 2.349331235247836, "grad_norm": 0.18886284710156062, "learning_rate": 3.6183690191130365e-05, "loss": 0.4812, "step": 745 }, { "epoch": 2.3524783634933124, "grad_norm": 0.17965446422580023, "learning_rate": 3.617235654645068e-05, "loss": 0.4774, "step": 746 }, { "epoch": 2.355625491738788, "grad_norm": 0.20326170603684612, "learning_rate": 3.616100809930572e-05, "loss": 0.4768, "step": 747 }, { "epoch": 2.3587726199842645, "grad_norm": 0.1882843118623647, "learning_rate": 3.614964486154848e-05, "loss": 0.4722, "step": 748 }, { "epoch": 2.3619197482297403, "grad_norm": 0.17204335083123673, "learning_rate": 3.613826684504743e-05, "loss": 0.4674, "step": 749 }, { "epoch": 2.3650668764752165, "grad_norm": 0.19139874893907688, "learning_rate": 3.612687406168644e-05, "loss": 0.4681, "step": 750 }, { "epoch": 2.3682140047206923, "grad_norm": 0.1742655605287443, "learning_rate": 3.611546652336482e-05, "loss": 0.4735, "step": 751 }, { "epoch": 2.371361132966168, "grad_norm": 0.1785986233463769, "learning_rate": 3.610404424199732e-05, "loss": 0.4725, "step": 752 }, { "epoch": 2.3745082612116444, "grad_norm": 0.1770999597168329, "learning_rate": 3.6092607229514026e-05, "loss": 0.4751, "step": 753 }, { "epoch": 2.3776553894571206, "grad_norm": 0.1861951910359395, "learning_rate": 3.608115549786047e-05, "loss": 0.4772, "step": 754 }, { "epoch": 2.3808025177025964, "grad_norm": 0.1789049105676948, "learning_rate": 3.6069689058997506e-05, "loss": 0.4717, "step": 755 }, { "epoch": 2.3839496459480722, "grad_norm": 0.18717193575284816, "learning_rate": 3.60582079249014e-05, "loss": 0.4742, "step": 756 }, { "epoch": 2.3870967741935485, "grad_norm": 0.2319346851175833, "learning_rate": 3.604671210756373e-05, "loss": 0.48, "step": 757 }, { "epoch": 2.3902439024390243, "grad_norm": 0.20723294874930143, "learning_rate": 3.603520161899144e-05, "loss": 0.4728, "step": 758 }, { "epoch": 2.3933910306845005, "grad_norm": 0.21571506744754684, "learning_rate": 3.6023676471206746e-05, "loss": 0.4695, "step": 759 }, { "epoch": 2.3965381589299763, "grad_norm": 0.17314280493311868, "learning_rate": 3.601213667624724e-05, "loss": 0.4735, "step": 760 }, { "epoch": 2.399685287175452, "grad_norm": 0.21517245659461837, "learning_rate": 3.600058224616576e-05, "loss": 0.4805, "step": 761 }, { "epoch": 2.4028324154209284, "grad_norm": 0.20430687151416146, "learning_rate": 3.598901319303047e-05, "loss": 0.4843, "step": 762 }, { "epoch": 2.4059795436664047, "grad_norm": 0.2004177768069127, "learning_rate": 3.597742952892477e-05, "loss": 0.4833, "step": 763 }, { "epoch": 2.4091266719118805, "grad_norm": 0.24567032007723139, "learning_rate": 3.5965831265947344e-05, "loss": 0.4686, "step": 764 }, { "epoch": 2.4122738001573563, "grad_norm": 0.21956644771343653, "learning_rate": 3.595421841621212e-05, "loss": 0.478, "step": 765 }, { "epoch": 2.4154209284028325, "grad_norm": 0.2038846900874555, "learning_rate": 3.594259099184826e-05, "loss": 0.4739, "step": 766 }, { "epoch": 2.4185680566483083, "grad_norm": 0.21879240881473924, "learning_rate": 3.593094900500015e-05, "loss": 0.4713, "step": 767 }, { "epoch": 2.4217151848937846, "grad_norm": 0.22973634489226144, "learning_rate": 3.591929246782738e-05, "loss": 0.4848, "step": 768 }, { "epoch": 2.4248623131392604, "grad_norm": 0.19432937590163568, "learning_rate": 3.5907621392504747e-05, "loss": 0.4791, "step": 769 }, { "epoch": 2.4280094413847366, "grad_norm": 0.19157056326864344, "learning_rate": 3.589593579122222e-05, "loss": 0.4801, "step": 770 }, { "epoch": 2.4311565696302124, "grad_norm": 0.19492660523958835, "learning_rate": 3.588423567618496e-05, "loss": 0.4739, "step": 771 }, { "epoch": 2.4343036978756887, "grad_norm": 0.19603103831816215, "learning_rate": 3.5872521059613254e-05, "loss": 0.4783, "step": 772 }, { "epoch": 2.4374508261211645, "grad_norm": 0.16793141618091936, "learning_rate": 3.5860791953742574e-05, "loss": 0.4828, "step": 773 }, { "epoch": 2.4405979543666403, "grad_norm": 0.18926327402558274, "learning_rate": 3.5849048370823496e-05, "loss": 0.462, "step": 774 }, { "epoch": 2.4437450826121165, "grad_norm": 0.20901177449925584, "learning_rate": 3.583729032312173e-05, "loss": 0.4704, "step": 775 }, { "epoch": 2.4468922108575923, "grad_norm": 0.21820920525411092, "learning_rate": 3.582551782291809e-05, "loss": 0.4661, "step": 776 }, { "epoch": 2.4500393391030686, "grad_norm": 0.18520013651848868, "learning_rate": 3.581373088250849e-05, "loss": 0.4755, "step": 777 }, { "epoch": 2.4531864673485444, "grad_norm": 0.2426404576575414, "learning_rate": 3.580192951420391e-05, "loss": 0.4723, "step": 778 }, { "epoch": 2.4563335955940206, "grad_norm": 0.23249501540895848, "learning_rate": 3.579011373033044e-05, "loss": 0.4755, "step": 779 }, { "epoch": 2.4594807238394965, "grad_norm": 0.2161001745094506, "learning_rate": 3.577828354322917e-05, "loss": 0.4773, "step": 780 }, { "epoch": 2.4626278520849727, "grad_norm": 0.21146417784405747, "learning_rate": 3.576643896525628e-05, "loss": 0.4871, "step": 781 }, { "epoch": 2.4657749803304485, "grad_norm": 0.20723676007259584, "learning_rate": 3.575458000878294e-05, "loss": 0.4783, "step": 782 }, { "epoch": 2.4689221085759243, "grad_norm": 0.2584014129384574, "learning_rate": 3.5742706686195386e-05, "loss": 0.4767, "step": 783 }, { "epoch": 2.4720692368214006, "grad_norm": 0.19393702218745548, "learning_rate": 3.573081900989482e-05, "loss": 0.4804, "step": 784 }, { "epoch": 2.4752163650668764, "grad_norm": 0.2246269196846869, "learning_rate": 3.5718916992297456e-05, "loss": 0.4748, "step": 785 }, { "epoch": 2.4783634933123526, "grad_norm": 0.2016712184696746, "learning_rate": 3.5707000645834476e-05, "loss": 0.4839, "step": 786 }, { "epoch": 2.4815106215578284, "grad_norm": 0.20378549394086407, "learning_rate": 3.569506998295203e-05, "loss": 0.4726, "step": 787 }, { "epoch": 2.4846577498033047, "grad_norm": 0.2337914377259457, "learning_rate": 3.568312501611123e-05, "loss": 0.4814, "step": 788 }, { "epoch": 2.4878048780487805, "grad_norm": 0.19396608409990398, "learning_rate": 3.5671165757788115e-05, "loss": 0.4761, "step": 789 }, { "epoch": 2.4909520062942567, "grad_norm": 0.2692755192747761, "learning_rate": 3.5659192220473654e-05, "loss": 0.4785, "step": 790 }, { "epoch": 2.4940991345397325, "grad_norm": 0.18587733519714691, "learning_rate": 3.5647204416673746e-05, "loss": 0.4864, "step": 791 }, { "epoch": 2.4972462627852083, "grad_norm": 0.2449196618260009, "learning_rate": 3.5635202358909164e-05, "loss": 0.4763, "step": 792 }, { "epoch": 2.5003933910306846, "grad_norm": 0.22615305309932182, "learning_rate": 3.562318605971559e-05, "loss": 0.4851, "step": 793 }, { "epoch": 2.5035405192761604, "grad_norm": 0.2043080610888049, "learning_rate": 3.561115553164356e-05, "loss": 0.4726, "step": 794 }, { "epoch": 2.5066876475216366, "grad_norm": 0.22066196853846168, "learning_rate": 3.55991107872585e-05, "loss": 0.475, "step": 795 }, { "epoch": 2.5098347757671124, "grad_norm": 0.17253963662301974, "learning_rate": 3.558705183914066e-05, "loss": 0.4734, "step": 796 }, { "epoch": 2.5129819040125883, "grad_norm": 0.19881124746164847, "learning_rate": 3.5574978699885134e-05, "loss": 0.4832, "step": 797 }, { "epoch": 2.5161290322580645, "grad_norm": 0.19723415337076033, "learning_rate": 3.556289138210185e-05, "loss": 0.4689, "step": 798 }, { "epoch": 2.5192761605035408, "grad_norm": 0.18954283260561922, "learning_rate": 3.555078989841551e-05, "loss": 0.4757, "step": 799 }, { "epoch": 2.5224232887490166, "grad_norm": 0.19352808470983424, "learning_rate": 3.5538674261465655e-05, "loss": 0.4713, "step": 800 }, { "epoch": 2.5255704169944924, "grad_norm": 0.20041238629382177, "learning_rate": 3.5526544483906575e-05, "loss": 0.4845, "step": 801 }, { "epoch": 2.5287175452399686, "grad_norm": 0.19181251462489346, "learning_rate": 3.551440057840736e-05, "loss": 0.4882, "step": 802 }, { "epoch": 2.5318646734854444, "grad_norm": 0.18016461406070697, "learning_rate": 3.5502242557651813e-05, "loss": 0.4805, "step": 803 }, { "epoch": 2.5350118017309207, "grad_norm": 0.20267911141580347, "learning_rate": 3.5490070434338525e-05, "loss": 0.4776, "step": 804 }, { "epoch": 2.5381589299763965, "grad_norm": 0.19059012370650052, "learning_rate": 3.5477884221180785e-05, "loss": 0.4886, "step": 805 }, { "epoch": 2.5413060582218723, "grad_norm": 0.21515842908040148, "learning_rate": 3.546568393090662e-05, "loss": 0.483, "step": 806 }, { "epoch": 2.5444531864673485, "grad_norm": 0.19422697970653924, "learning_rate": 3.5453469576258744e-05, "loss": 0.4692, "step": 807 }, { "epoch": 2.5476003147128248, "grad_norm": 0.209559004728807, "learning_rate": 3.544124116999457e-05, "loss": 0.4865, "step": 808 }, { "epoch": 2.5507474429583006, "grad_norm": 0.1920441375389536, "learning_rate": 3.542899872488618e-05, "loss": 0.4793, "step": 809 }, { "epoch": 2.5538945712037764, "grad_norm": 0.2319852120314673, "learning_rate": 3.541674225372033e-05, "loss": 0.4773, "step": 810 }, { "epoch": 2.5570416994492526, "grad_norm": 0.19321156216806282, "learning_rate": 3.540447176929841e-05, "loss": 0.4757, "step": 811 }, { "epoch": 2.5601888276947284, "grad_norm": 0.21514651398865584, "learning_rate": 3.539218728443646e-05, "loss": 0.4785, "step": 812 }, { "epoch": 2.5633359559402047, "grad_norm": 0.21148869854656144, "learning_rate": 3.537988881196514e-05, "loss": 0.4746, "step": 813 }, { "epoch": 2.5664830841856805, "grad_norm": 0.17595269634550698, "learning_rate": 3.536757636472972e-05, "loss": 0.4685, "step": 814 }, { "epoch": 2.5696302124311563, "grad_norm": 0.20185229955902398, "learning_rate": 3.5355249955590056e-05, "loss": 0.4783, "step": 815 }, { "epoch": 2.5727773406766326, "grad_norm": 0.17164696512537717, "learning_rate": 3.53429095974206e-05, "loss": 0.4775, "step": 816 }, { "epoch": 2.575924468922109, "grad_norm": 0.23266847949769962, "learning_rate": 3.533055530311036e-05, "loss": 0.4692, "step": 817 }, { "epoch": 2.5790715971675846, "grad_norm": 0.16306518710715495, "learning_rate": 3.531818708556292e-05, "loss": 0.4783, "step": 818 }, { "epoch": 2.5822187254130604, "grad_norm": 0.20716704551095141, "learning_rate": 3.530580495769638e-05, "loss": 0.4785, "step": 819 }, { "epoch": 2.5853658536585367, "grad_norm": 0.20742333714577207, "learning_rate": 3.5293408932443384e-05, "loss": 0.4795, "step": 820 }, { "epoch": 2.5885129819040125, "grad_norm": 0.19428872955255258, "learning_rate": 3.5280999022751095e-05, "loss": 0.4853, "step": 821 }, { "epoch": 2.5916601101494887, "grad_norm": 0.20147158146515537, "learning_rate": 3.526857524158117e-05, "loss": 0.468, "step": 822 }, { "epoch": 2.5948072383949645, "grad_norm": 0.1893299986265306, "learning_rate": 3.525613760190977e-05, "loss": 0.4774, "step": 823 }, { "epoch": 2.5979543666404403, "grad_norm": 0.17391513342893553, "learning_rate": 3.524368611672749e-05, "loss": 0.4698, "step": 824 }, { "epoch": 2.6011014948859166, "grad_norm": 0.18505439269447876, "learning_rate": 3.5231220799039434e-05, "loss": 0.4759, "step": 825 }, { "epoch": 2.604248623131393, "grad_norm": 0.18433327533924215, "learning_rate": 3.521874166186512e-05, "loss": 0.4745, "step": 826 }, { "epoch": 2.6073957513768686, "grad_norm": 0.17820635418388936, "learning_rate": 3.5206248718238525e-05, "loss": 0.4862, "step": 827 }, { "epoch": 2.6105428796223444, "grad_norm": 0.18696671678977475, "learning_rate": 3.519374198120803e-05, "loss": 0.4758, "step": 828 }, { "epoch": 2.6136900078678207, "grad_norm": 0.18769968466549117, "learning_rate": 3.5181221463836426e-05, "loss": 0.4778, "step": 829 }, { "epoch": 2.6168371361132965, "grad_norm": 0.191487891133253, "learning_rate": 3.51686871792009e-05, "loss": 0.4707, "step": 830 }, { "epoch": 2.6199842643587727, "grad_norm": 0.1825542674839225, "learning_rate": 3.5156139140393e-05, "loss": 0.4706, "step": 831 }, { "epoch": 2.6231313926042485, "grad_norm": 0.18855481959761283, "learning_rate": 3.514357736051868e-05, "loss": 0.4838, "step": 832 }, { "epoch": 2.6262785208497244, "grad_norm": 0.18817841391423204, "learning_rate": 3.513100185269821e-05, "loss": 0.4685, "step": 833 }, { "epoch": 2.6294256490952006, "grad_norm": 0.18928717937714, "learning_rate": 3.51184126300662e-05, "loss": 0.4781, "step": 834 }, { "epoch": 2.632572777340677, "grad_norm": 0.2256933473661262, "learning_rate": 3.510580970577161e-05, "loss": 0.4739, "step": 835 }, { "epoch": 2.6357199055861527, "grad_norm": 0.17564318088263306, "learning_rate": 3.5093193092977694e-05, "loss": 0.4718, "step": 836 }, { "epoch": 2.6388670338316285, "grad_norm": 0.20104156818385482, "learning_rate": 3.5080562804861996e-05, "loss": 0.4802, "step": 837 }, { "epoch": 2.6420141620771047, "grad_norm": 0.18928735051422238, "learning_rate": 3.506791885461636e-05, "loss": 0.4799, "step": 838 }, { "epoch": 2.6451612903225805, "grad_norm": 0.17754299417673466, "learning_rate": 3.505526125544688e-05, "loss": 0.4739, "step": 839 }, { "epoch": 2.6483084185680568, "grad_norm": 0.17924816010567865, "learning_rate": 3.504259002057394e-05, "loss": 0.4833, "step": 840 }, { "epoch": 2.6514555468135326, "grad_norm": 0.18330420763324123, "learning_rate": 3.5029905163232114e-05, "loss": 0.4809, "step": 841 }, { "epoch": 2.654602675059009, "grad_norm": 0.17763845173295095, "learning_rate": 3.501720669667025e-05, "loss": 0.478, "step": 842 }, { "epoch": 2.6577498033044846, "grad_norm": 0.20606096391374554, "learning_rate": 3.500449463415139e-05, "loss": 0.4803, "step": 843 }, { "epoch": 2.660896931549961, "grad_norm": 0.16091387572699684, "learning_rate": 3.4991768988952794e-05, "loss": 0.4777, "step": 844 }, { "epoch": 2.6640440597954367, "grad_norm": 0.17951419173964245, "learning_rate": 3.497902977436587e-05, "loss": 0.4786, "step": 845 }, { "epoch": 2.6671911880409125, "grad_norm": 0.17938289165750493, "learning_rate": 3.4966277003696236e-05, "loss": 0.4818, "step": 846 }, { "epoch": 2.6703383162863887, "grad_norm": 0.18410261157957933, "learning_rate": 3.495351069026365e-05, "loss": 0.4738, "step": 847 }, { "epoch": 2.6734854445318645, "grad_norm": 0.20590921125222097, "learning_rate": 3.494073084740204e-05, "loss": 0.486, "step": 848 }, { "epoch": 2.676632572777341, "grad_norm": 0.20239376639918705, "learning_rate": 3.492793748845942e-05, "loss": 0.4782, "step": 849 }, { "epoch": 2.6797797010228166, "grad_norm": 0.1916725798858272, "learning_rate": 3.491513062679796e-05, "loss": 0.47, "step": 850 }, { "epoch": 2.682926829268293, "grad_norm": 0.2033931578311126, "learning_rate": 3.490231027579393e-05, "loss": 0.4791, "step": 851 }, { "epoch": 2.6860739575137687, "grad_norm": 0.1941921749860495, "learning_rate": 3.4889476448837656e-05, "loss": 0.4882, "step": 852 }, { "epoch": 2.689221085759245, "grad_norm": 0.19874836532319543, "learning_rate": 3.4876629159333575e-05, "loss": 0.4756, "step": 853 }, { "epoch": 2.6923682140047207, "grad_norm": 0.20304633766767777, "learning_rate": 3.486376842070017e-05, "loss": 0.4793, "step": 854 }, { "epoch": 2.6955153422501965, "grad_norm": 0.18295353981827106, "learning_rate": 3.485089424636997e-05, "loss": 0.4822, "step": 855 }, { "epoch": 2.6986624704956728, "grad_norm": 0.2050344464266888, "learning_rate": 3.4838006649789546e-05, "loss": 0.4711, "step": 856 }, { "epoch": 2.7018095987411486, "grad_norm": 0.17837345576674332, "learning_rate": 3.482510564441949e-05, "loss": 0.4835, "step": 857 }, { "epoch": 2.704956726986625, "grad_norm": 0.22822296718181742, "learning_rate": 3.4812191243734375e-05, "loss": 0.4762, "step": 858 }, { "epoch": 2.7081038552321006, "grad_norm": 0.1759543201588798, "learning_rate": 3.479926346122279e-05, "loss": 0.4738, "step": 859 }, { "epoch": 2.711250983477577, "grad_norm": 0.2319777833974616, "learning_rate": 3.478632231038729e-05, "loss": 0.4794, "step": 860 }, { "epoch": 2.7143981117230527, "grad_norm": 0.17035781259506136, "learning_rate": 3.477336780474439e-05, "loss": 0.4769, "step": 861 }, { "epoch": 2.717545239968529, "grad_norm": 0.21472875867046778, "learning_rate": 3.4760399957824576e-05, "loss": 0.4818, "step": 862 }, { "epoch": 2.7206923682140047, "grad_norm": 0.19547886779373266, "learning_rate": 3.474741878317223e-05, "loss": 0.4756, "step": 863 }, { "epoch": 2.7238394964594805, "grad_norm": 0.1896076340759897, "learning_rate": 3.4734424294345673e-05, "loss": 0.4826, "step": 864 }, { "epoch": 2.726986624704957, "grad_norm": 0.2005371691635383, "learning_rate": 3.472141650491716e-05, "loss": 0.4898, "step": 865 }, { "epoch": 2.7301337529504326, "grad_norm": 0.2152647700389423, "learning_rate": 3.470839542847279e-05, "loss": 0.4816, "step": 866 }, { "epoch": 2.733280881195909, "grad_norm": 0.2073082020273619, "learning_rate": 3.4695361078612565e-05, "loss": 0.4766, "step": 867 }, { "epoch": 2.7364280094413846, "grad_norm": 0.22650494077281716, "learning_rate": 3.468231346895035e-05, "loss": 0.4773, "step": 868 }, { "epoch": 2.739575137686861, "grad_norm": 0.18847447202117246, "learning_rate": 3.466925261311386e-05, "loss": 0.4757, "step": 869 }, { "epoch": 2.7427222659323367, "grad_norm": 0.19717018165808387, "learning_rate": 3.4656178524744644e-05, "loss": 0.4723, "step": 870 }, { "epoch": 2.745869394177813, "grad_norm": 0.20419628183572036, "learning_rate": 3.464309121749805e-05, "loss": 0.4685, "step": 871 }, { "epoch": 2.7490165224232888, "grad_norm": 0.20942542946004844, "learning_rate": 3.4629990705043274e-05, "loss": 0.4807, "step": 872 }, { "epoch": 2.7521636506687646, "grad_norm": 0.19751423169250182, "learning_rate": 3.461687700106327e-05, "loss": 0.478, "step": 873 }, { "epoch": 2.755310778914241, "grad_norm": 0.20839382920660143, "learning_rate": 3.46037501192548e-05, "loss": 0.4796, "step": 874 }, { "epoch": 2.7584579071597166, "grad_norm": 0.21264262241811938, "learning_rate": 3.459061007332835e-05, "loss": 0.483, "step": 875 }, { "epoch": 2.761605035405193, "grad_norm": 0.21206865795633334, "learning_rate": 3.457745687700818e-05, "loss": 0.482, "step": 876 }, { "epoch": 2.7647521636506687, "grad_norm": 0.2267998478365036, "learning_rate": 3.4564290544032304e-05, "loss": 0.4852, "step": 877 }, { "epoch": 2.767899291896145, "grad_norm": 0.19962244872664645, "learning_rate": 3.455111108815242e-05, "loss": 0.4781, "step": 878 }, { "epoch": 2.7710464201416207, "grad_norm": 0.20223940173326052, "learning_rate": 3.453791852313395e-05, "loss": 0.4815, "step": 879 }, { "epoch": 2.774193548387097, "grad_norm": 0.2497760291214373, "learning_rate": 3.4524712862756004e-05, "loss": 0.4737, "step": 880 }, { "epoch": 2.777340676632573, "grad_norm": 0.21392418161927715, "learning_rate": 3.451149412081137e-05, "loss": 0.4849, "step": 881 }, { "epoch": 2.7804878048780486, "grad_norm": 0.20756513406871988, "learning_rate": 3.4498262311106505e-05, "loss": 0.4794, "step": 882 }, { "epoch": 2.783634933123525, "grad_norm": 0.19434508478386714, "learning_rate": 3.448501744746151e-05, "loss": 0.4717, "step": 883 }, { "epoch": 2.7867820613690006, "grad_norm": 0.17815978736962265, "learning_rate": 3.4471759543710115e-05, "loss": 0.479, "step": 884 }, { "epoch": 2.789929189614477, "grad_norm": 0.21692701459569838, "learning_rate": 3.4458488613699686e-05, "loss": 0.4711, "step": 885 }, { "epoch": 2.7930763178599527, "grad_norm": 0.1873399635626309, "learning_rate": 3.444520467129118e-05, "loss": 0.484, "step": 886 }, { "epoch": 2.796223446105429, "grad_norm": 0.1976319148693629, "learning_rate": 3.4431907730359137e-05, "loss": 0.4777, "step": 887 }, { "epoch": 2.7993705743509048, "grad_norm": 0.19261763199922333, "learning_rate": 3.44185978047917e-05, "loss": 0.4658, "step": 888 }, { "epoch": 2.802517702596381, "grad_norm": 0.20902878866014565, "learning_rate": 3.440527490849055e-05, "loss": 0.4751, "step": 889 }, { "epoch": 2.805664830841857, "grad_norm": 0.196677203826585, "learning_rate": 3.439193905537094e-05, "loss": 0.4739, "step": 890 }, { "epoch": 2.8088119590873326, "grad_norm": 0.18777554851103495, "learning_rate": 3.4378590259361626e-05, "loss": 0.471, "step": 891 }, { "epoch": 2.811959087332809, "grad_norm": 0.2206571953754081, "learning_rate": 3.4365228534404895e-05, "loss": 0.479, "step": 892 }, { "epoch": 2.8151062155782847, "grad_norm": 0.16662192562786868, "learning_rate": 3.435185389445655e-05, "loss": 0.4745, "step": 893 }, { "epoch": 2.818253343823761, "grad_norm": 0.19851273382341908, "learning_rate": 3.433846635348587e-05, "loss": 0.4773, "step": 894 }, { "epoch": 2.8214004720692367, "grad_norm": 0.19549393874279214, "learning_rate": 3.43250659254756e-05, "loss": 0.4683, "step": 895 }, { "epoch": 2.824547600314713, "grad_norm": 0.16826088149239804, "learning_rate": 3.4311652624421976e-05, "loss": 0.48, "step": 896 }, { "epoch": 2.8276947285601888, "grad_norm": 0.1762752510049042, "learning_rate": 3.429822646433464e-05, "loss": 0.479, "step": 897 }, { "epoch": 2.830841856805665, "grad_norm": 0.18678077326219097, "learning_rate": 3.4284787459236705e-05, "loss": 0.4723, "step": 898 }, { "epoch": 2.833988985051141, "grad_norm": 0.1630882015779384, "learning_rate": 3.427133562316466e-05, "loss": 0.4782, "step": 899 }, { "epoch": 2.8371361132966166, "grad_norm": 0.17882705777771293, "learning_rate": 3.425787097016843e-05, "loss": 0.4714, "step": 900 }, { "epoch": 2.840283241542093, "grad_norm": 0.17187562653472938, "learning_rate": 3.424439351431131e-05, "loss": 0.4742, "step": 901 }, { "epoch": 2.8434303697875687, "grad_norm": 0.174319821594161, "learning_rate": 3.423090326966996e-05, "loss": 0.4823, "step": 902 }, { "epoch": 2.846577498033045, "grad_norm": 0.1690838895842069, "learning_rate": 3.4217400250334416e-05, "loss": 0.4773, "step": 903 }, { "epoch": 2.8497246262785207, "grad_norm": 0.1731044457799335, "learning_rate": 3.420388447040804e-05, "loss": 0.4684, "step": 904 }, { "epoch": 2.852871754523997, "grad_norm": 0.177357699597428, "learning_rate": 3.419035594400753e-05, "loss": 0.477, "step": 905 }, { "epoch": 2.856018882769473, "grad_norm": 0.1827484820724141, "learning_rate": 3.41768146852629e-05, "loss": 0.4791, "step": 906 }, { "epoch": 2.859166011014949, "grad_norm": 0.20812185117779652, "learning_rate": 3.416326070831746e-05, "loss": 0.4818, "step": 907 }, { "epoch": 2.862313139260425, "grad_norm": 0.19582434700048504, "learning_rate": 3.414969402732779e-05, "loss": 0.4736, "step": 908 }, { "epoch": 2.8654602675059007, "grad_norm": 0.17405726221555853, "learning_rate": 3.4136114656463766e-05, "loss": 0.4822, "step": 909 }, { "epoch": 2.868607395751377, "grad_norm": 0.1886351350028885, "learning_rate": 3.4122522609908504e-05, "loss": 0.4799, "step": 910 }, { "epoch": 2.8717545239968527, "grad_norm": 0.17300845271944784, "learning_rate": 3.410891790185834e-05, "loss": 0.4737, "step": 911 }, { "epoch": 2.874901652242329, "grad_norm": 0.17440320919074534, "learning_rate": 3.409530054652287e-05, "loss": 0.4731, "step": 912 }, { "epoch": 2.8780487804878048, "grad_norm": 0.1803068800569239, "learning_rate": 3.408167055812488e-05, "loss": 0.4769, "step": 913 }, { "epoch": 2.881195908733281, "grad_norm": 0.181432403586137, "learning_rate": 3.406802795090034e-05, "loss": 0.4915, "step": 914 }, { "epoch": 2.884343036978757, "grad_norm": 0.16179024096943073, "learning_rate": 3.405437273909843e-05, "loss": 0.4795, "step": 915 }, { "epoch": 2.887490165224233, "grad_norm": 0.17972825825989644, "learning_rate": 3.4040704936981475e-05, "loss": 0.4761, "step": 916 }, { "epoch": 2.890637293469709, "grad_norm": 0.17303120146453108, "learning_rate": 3.4027024558824956e-05, "loss": 0.4737, "step": 917 }, { "epoch": 2.8937844217151847, "grad_norm": 0.19260008079093655, "learning_rate": 3.401333161891747e-05, "loss": 0.4827, "step": 918 }, { "epoch": 2.896931549960661, "grad_norm": 0.19408527896636127, "learning_rate": 3.3999626131560754e-05, "loss": 0.4791, "step": 919 }, { "epoch": 2.9000786782061367, "grad_norm": 0.17709492241260272, "learning_rate": 3.398590811106966e-05, "loss": 0.4758, "step": 920 }, { "epoch": 2.903225806451613, "grad_norm": 0.24719908496023188, "learning_rate": 3.397217757177211e-05, "loss": 0.478, "step": 921 }, { "epoch": 2.906372934697089, "grad_norm": 0.21088224277568318, "learning_rate": 3.395843452800912e-05, "loss": 0.4677, "step": 922 }, { "epoch": 2.909520062942565, "grad_norm": 0.18357158178528057, "learning_rate": 3.394467899413473e-05, "loss": 0.4822, "step": 923 }, { "epoch": 2.912667191188041, "grad_norm": 0.20061925910804337, "learning_rate": 3.393091098451607e-05, "loss": 0.4796, "step": 924 }, { "epoch": 2.915814319433517, "grad_norm": 0.257783364662756, "learning_rate": 3.391713051353328e-05, "loss": 0.4823, "step": 925 }, { "epoch": 2.918961447678993, "grad_norm": 0.20566149454081753, "learning_rate": 3.39033375955795e-05, "loss": 0.4806, "step": 926 }, { "epoch": 2.9221085759244687, "grad_norm": 0.21739345983865224, "learning_rate": 3.388953224506091e-05, "loss": 0.479, "step": 927 }, { "epoch": 2.925255704169945, "grad_norm": 0.2019066969178866, "learning_rate": 3.3875714476396635e-05, "loss": 0.4791, "step": 928 }, { "epoch": 2.9284028324154208, "grad_norm": 0.18964152674565116, "learning_rate": 3.38618843040188e-05, "loss": 0.4872, "step": 929 }, { "epoch": 2.931549960660897, "grad_norm": 0.21572408114004044, "learning_rate": 3.384804174237246e-05, "loss": 0.4856, "step": 930 }, { "epoch": 2.934697088906373, "grad_norm": 0.20435168904218748, "learning_rate": 3.3834186805915634e-05, "loss": 0.4823, "step": 931 }, { "epoch": 2.937844217151849, "grad_norm": 0.23036698086458443, "learning_rate": 3.382031950911925e-05, "loss": 0.4842, "step": 932 }, { "epoch": 2.940991345397325, "grad_norm": 0.2559922205766231, "learning_rate": 3.380643986646714e-05, "loss": 0.4683, "step": 933 }, { "epoch": 2.944138473642801, "grad_norm": 0.20816226293770818, "learning_rate": 3.3792547892456045e-05, "loss": 0.478, "step": 934 }, { "epoch": 2.947285601888277, "grad_norm": 0.27147963925238433, "learning_rate": 3.37786436015956e-05, "loss": 0.4716, "step": 935 }, { "epoch": 2.9504327301337527, "grad_norm": 0.1917348894021088, "learning_rate": 3.376472700840827e-05, "loss": 0.4855, "step": 936 }, { "epoch": 2.953579858379229, "grad_norm": 0.26586313382080357, "learning_rate": 3.375079812742939e-05, "loss": 0.4751, "step": 937 }, { "epoch": 2.9567269866247052, "grad_norm": 0.22006766096753588, "learning_rate": 3.373685697320713e-05, "loss": 0.4777, "step": 938 }, { "epoch": 2.959874114870181, "grad_norm": 0.22686294528313417, "learning_rate": 3.372290356030246e-05, "loss": 0.4788, "step": 939 }, { "epoch": 2.963021243115657, "grad_norm": 0.19665794947101586, "learning_rate": 3.370893790328917e-05, "loss": 0.4904, "step": 940 }, { "epoch": 2.966168371361133, "grad_norm": 0.21016583933505117, "learning_rate": 3.369496001675385e-05, "loss": 0.4846, "step": 941 }, { "epoch": 2.969315499606609, "grad_norm": 0.1823767869897883, "learning_rate": 3.368096991529583e-05, "loss": 0.474, "step": 942 }, { "epoch": 2.972462627852085, "grad_norm": 0.19889515201753574, "learning_rate": 3.366696761352723e-05, "loss": 0.4744, "step": 943 }, { "epoch": 2.975609756097561, "grad_norm": 0.19883183201167187, "learning_rate": 3.36529531260729e-05, "loss": 0.4872, "step": 944 }, { "epoch": 2.9787568843430368, "grad_norm": 0.17416758317261993, "learning_rate": 3.363892646757041e-05, "loss": 0.4791, "step": 945 }, { "epoch": 2.981904012588513, "grad_norm": 0.1922592641536021, "learning_rate": 3.362488765267006e-05, "loss": 0.4815, "step": 946 }, { "epoch": 2.9850511408339893, "grad_norm": 0.17209027351228018, "learning_rate": 3.361083669603482e-05, "loss": 0.4796, "step": 947 }, { "epoch": 2.988198269079465, "grad_norm": 0.17084312866058127, "learning_rate": 3.3596773612340375e-05, "loss": 0.4805, "step": 948 }, { "epoch": 2.991345397324941, "grad_norm": 0.16455840604865776, "learning_rate": 3.358269841627504e-05, "loss": 0.4734, "step": 949 }, { "epoch": 2.994492525570417, "grad_norm": 0.17609503259030376, "learning_rate": 3.356861112253982e-05, "loss": 0.4813, "step": 950 }, { "epoch": 2.997639653815893, "grad_norm": 0.16815201729549342, "learning_rate": 3.355451174584834e-05, "loss": 0.477, "step": 951 }, { "epoch": 3.003147128245476, "grad_norm": 0.4088775165911742, "learning_rate": 3.35404003009268e-05, "loss": 0.9, "step": 952 }, { "epoch": 3.006294256490952, "grad_norm": 0.32652509762470977, "learning_rate": 3.352627680251409e-05, "loss": 0.4413, "step": 953 }, { "epoch": 3.009441384736428, "grad_norm": 0.31181526619361466, "learning_rate": 3.3512141265361625e-05, "loss": 0.4442, "step": 954 }, { "epoch": 3.012588512981904, "grad_norm": 0.28249452749334586, "learning_rate": 3.3497993704233415e-05, "loss": 0.4341, "step": 955 }, { "epoch": 3.01573564122738, "grad_norm": 0.2469014492840724, "learning_rate": 3.348383413390603e-05, "loss": 0.4357, "step": 956 }, { "epoch": 3.018882769472856, "grad_norm": 0.29518171306893987, "learning_rate": 3.346966256916858e-05, "loss": 0.4331, "step": 957 }, { "epoch": 3.022029897718332, "grad_norm": 0.24550117952865097, "learning_rate": 3.345547902482271e-05, "loss": 0.4328, "step": 958 }, { "epoch": 3.025177025963808, "grad_norm": 0.26322265876569234, "learning_rate": 3.344128351568255e-05, "loss": 0.4296, "step": 959 }, { "epoch": 3.028324154209284, "grad_norm": 0.3355141854171973, "learning_rate": 3.3427076056574765e-05, "loss": 0.4399, "step": 960 }, { "epoch": 3.03147128245476, "grad_norm": 0.24848144520611637, "learning_rate": 3.341285666233849e-05, "loss": 0.4379, "step": 961 }, { "epoch": 3.034618410700236, "grad_norm": 0.3051743839260837, "learning_rate": 3.3398625347825295e-05, "loss": 0.4321, "step": 962 }, { "epoch": 3.037765538945712, "grad_norm": 0.2520446263992142, "learning_rate": 3.3384382127899254e-05, "loss": 0.4326, "step": 963 }, { "epoch": 3.040912667191188, "grad_norm": 0.23491369763876913, "learning_rate": 3.337012701743682e-05, "loss": 0.4304, "step": 964 }, { "epoch": 3.044059795436664, "grad_norm": 0.22739503178054998, "learning_rate": 3.33558600313269e-05, "loss": 0.4316, "step": 965 }, { "epoch": 3.04720692368214, "grad_norm": 0.23290557210702711, "learning_rate": 3.334158118447081e-05, "loss": 0.4205, "step": 966 }, { "epoch": 3.050354051927616, "grad_norm": 0.23718213519938236, "learning_rate": 3.3327290491782214e-05, "loss": 0.4276, "step": 967 }, { "epoch": 3.0535011801730922, "grad_norm": 0.22184226524551506, "learning_rate": 3.331298796818719e-05, "loss": 0.4336, "step": 968 }, { "epoch": 3.056648308418568, "grad_norm": 0.24883677057218126, "learning_rate": 3.329867362862416e-05, "loss": 0.4202, "step": 969 }, { "epoch": 3.059795436664044, "grad_norm": 0.20245231032426314, "learning_rate": 3.328434748804389e-05, "loss": 0.4283, "step": 970 }, { "epoch": 3.06294256490952, "grad_norm": 0.22154485320162687, "learning_rate": 3.327000956140944e-05, "loss": 0.4276, "step": 971 }, { "epoch": 3.066089693154996, "grad_norm": 0.23045678598682195, "learning_rate": 3.325565986369624e-05, "loss": 0.438, "step": 972 }, { "epoch": 3.069236821400472, "grad_norm": 0.21978195581485033, "learning_rate": 3.3241298409891967e-05, "loss": 0.4347, "step": 973 }, { "epoch": 3.072383949645948, "grad_norm": 0.2459169417158989, "learning_rate": 3.3226925214996586e-05, "loss": 0.426, "step": 974 }, { "epoch": 3.075531077891424, "grad_norm": 0.2087661039669764, "learning_rate": 3.3212540294022324e-05, "loss": 0.424, "step": 975 }, { "epoch": 3.0786782061369, "grad_norm": 0.22053026125524922, "learning_rate": 3.319814366199368e-05, "loss": 0.4358, "step": 976 }, { "epoch": 3.0818253343823763, "grad_norm": 0.22791109755583255, "learning_rate": 3.318373533394735e-05, "loss": 0.4339, "step": 977 }, { "epoch": 3.084972462627852, "grad_norm": 0.2097976976701724, "learning_rate": 3.3169315324932276e-05, "loss": 0.4315, "step": 978 }, { "epoch": 3.088119590873328, "grad_norm": 0.18718087218415436, "learning_rate": 3.3154883650009584e-05, "loss": 0.4311, "step": 979 }, { "epoch": 3.091266719118804, "grad_norm": 0.20933947694391003, "learning_rate": 3.314044032425258e-05, "loss": 0.4391, "step": 980 }, { "epoch": 3.09441384736428, "grad_norm": 0.22529123099122955, "learning_rate": 3.3125985362746745e-05, "loss": 0.4262, "step": 981 }, { "epoch": 3.097560975609756, "grad_norm": 0.1786820765800306, "learning_rate": 3.3111518780589723e-05, "loss": 0.4397, "step": 982 }, { "epoch": 3.100708103855232, "grad_norm": 0.22962197514376026, "learning_rate": 3.3097040592891284e-05, "loss": 0.4308, "step": 983 }, { "epoch": 3.1038552321007082, "grad_norm": 0.24051879415776045, "learning_rate": 3.30825508147733e-05, "loss": 0.4318, "step": 984 }, { "epoch": 3.107002360346184, "grad_norm": 0.20025474171903923, "learning_rate": 3.30680494613698e-05, "loss": 0.4384, "step": 985 }, { "epoch": 3.1101494885916603, "grad_norm": 0.19206558390260614, "learning_rate": 3.305353654782687e-05, "loss": 0.4297, "step": 986 }, { "epoch": 3.113296616837136, "grad_norm": 0.20782219172064703, "learning_rate": 3.303901208930266e-05, "loss": 0.4231, "step": 987 }, { "epoch": 3.116443745082612, "grad_norm": 0.20010226717176327, "learning_rate": 3.30244761009674e-05, "loss": 0.4254, "step": 988 }, { "epoch": 3.119590873328088, "grad_norm": 0.18984132712344276, "learning_rate": 3.300992859800336e-05, "loss": 0.4244, "step": 989 }, { "epoch": 3.122738001573564, "grad_norm": 0.23519299020760812, "learning_rate": 3.299536959560481e-05, "loss": 0.4365, "step": 990 }, { "epoch": 3.12588512981904, "grad_norm": 0.1950168518008789, "learning_rate": 3.2980799108978065e-05, "loss": 0.434, "step": 991 }, { "epoch": 3.129032258064516, "grad_norm": 0.17931137612435868, "learning_rate": 3.296621715334143e-05, "loss": 0.4321, "step": 992 }, { "epoch": 3.1321793863099923, "grad_norm": 0.20090892857586998, "learning_rate": 3.295162374392518e-05, "loss": 0.4205, "step": 993 }, { "epoch": 3.135326514555468, "grad_norm": 0.18545062281039967, "learning_rate": 3.293701889597153e-05, "loss": 0.4289, "step": 994 }, { "epoch": 3.1384736428009443, "grad_norm": 0.23236278068083843, "learning_rate": 3.292240262473469e-05, "loss": 0.4268, "step": 995 }, { "epoch": 3.14162077104642, "grad_norm": 0.2119237430411335, "learning_rate": 3.290777494548075e-05, "loss": 0.4361, "step": 996 }, { "epoch": 3.144767899291896, "grad_norm": 0.18687999372723774, "learning_rate": 3.289313587348778e-05, "loss": 0.4285, "step": 997 }, { "epoch": 3.147915027537372, "grad_norm": 0.242430728271403, "learning_rate": 3.287848542404568e-05, "loss": 0.4322, "step": 998 }, { "epoch": 3.151062155782848, "grad_norm": 0.20753524478639734, "learning_rate": 3.2863823612456264e-05, "loss": 0.4286, "step": 999 }, { "epoch": 3.1542092840283242, "grad_norm": 0.2204970218249786, "learning_rate": 3.284915045403325e-05, "loss": 0.4213, "step": 1000 }, { "epoch": 3.1573564122738, "grad_norm": 0.2077798788084683, "learning_rate": 3.283446596410212e-05, "loss": 0.4243, "step": 1001 }, { "epoch": 3.1605035405192763, "grad_norm": 0.20913787484273705, "learning_rate": 3.281977015800028e-05, "loss": 0.4349, "step": 1002 }, { "epoch": 3.163650668764752, "grad_norm": 0.1845966280590288, "learning_rate": 3.28050630510769e-05, "loss": 0.4367, "step": 1003 }, { "epoch": 3.1667977970102283, "grad_norm": 0.19783164507955084, "learning_rate": 3.279034465869298e-05, "loss": 0.4256, "step": 1004 }, { "epoch": 3.169944925255704, "grad_norm": 0.2645215067092485, "learning_rate": 3.277561499622129e-05, "loss": 0.4358, "step": 1005 }, { "epoch": 3.17309205350118, "grad_norm": 0.17753403080442126, "learning_rate": 3.276087407904639e-05, "loss": 0.4298, "step": 1006 }, { "epoch": 3.176239181746656, "grad_norm": 0.23434775817714046, "learning_rate": 3.274612192256457e-05, "loss": 0.4328, "step": 1007 }, { "epoch": 3.179386309992132, "grad_norm": 0.18670646442316036, "learning_rate": 3.273135854218389e-05, "loss": 0.4289, "step": 1008 }, { "epoch": 3.1825334382376083, "grad_norm": 0.18890780180865613, "learning_rate": 3.2716583953324094e-05, "loss": 0.4377, "step": 1009 }, { "epoch": 3.185680566483084, "grad_norm": 0.21371681553609642, "learning_rate": 3.2701798171416674e-05, "loss": 0.4315, "step": 1010 }, { "epoch": 3.1888276947285603, "grad_norm": 0.1965403832948832, "learning_rate": 3.268700121190479e-05, "loss": 0.4349, "step": 1011 }, { "epoch": 3.191974822974036, "grad_norm": 0.19377735620113185, "learning_rate": 3.267219309024328e-05, "loss": 0.427, "step": 1012 }, { "epoch": 3.1951219512195124, "grad_norm": 0.18770678346838374, "learning_rate": 3.265737382189863e-05, "loss": 0.4267, "step": 1013 }, { "epoch": 3.198269079464988, "grad_norm": 0.2042394668976984, "learning_rate": 3.2642543422349e-05, "loss": 0.4385, "step": 1014 }, { "epoch": 3.201416207710464, "grad_norm": 0.19338325740387202, "learning_rate": 3.2627701907084136e-05, "loss": 0.4232, "step": 1015 }, { "epoch": 3.2045633359559402, "grad_norm": 0.19512605168524141, "learning_rate": 3.2612849291605425e-05, "loss": 0.4291, "step": 1016 }, { "epoch": 3.207710464201416, "grad_norm": 0.18182108276136827, "learning_rate": 3.259798559142583e-05, "loss": 0.4339, "step": 1017 }, { "epoch": 3.2108575924468923, "grad_norm": 0.17815282317494613, "learning_rate": 3.2583110822069894e-05, "loss": 0.4348, "step": 1018 }, { "epoch": 3.214004720692368, "grad_norm": 0.19146604497628397, "learning_rate": 3.2568224999073725e-05, "loss": 0.4253, "step": 1019 }, { "epoch": 3.2171518489378443, "grad_norm": 0.17868445991882773, "learning_rate": 3.255332813798499e-05, "loss": 0.4366, "step": 1020 }, { "epoch": 3.22029897718332, "grad_norm": 0.22039699239981317, "learning_rate": 3.253842025436286e-05, "loss": 0.4288, "step": 1021 }, { "epoch": 3.2234461054287964, "grad_norm": 0.1801947638201736, "learning_rate": 3.252350136377802e-05, "loss": 0.4271, "step": 1022 }, { "epoch": 3.226593233674272, "grad_norm": 0.1849719695926772, "learning_rate": 3.2508571481812686e-05, "loss": 0.4305, "step": 1023 }, { "epoch": 3.229740361919748, "grad_norm": 0.17788240132478986, "learning_rate": 3.2493630624060494e-05, "loss": 0.4402, "step": 1024 }, { "epoch": 3.2328874901652243, "grad_norm": 0.1845071818835128, "learning_rate": 3.2478678806126614e-05, "loss": 0.4389, "step": 1025 }, { "epoch": 3.2360346184107, "grad_norm": 0.1858623426241377, "learning_rate": 3.24637160436276e-05, "loss": 0.4339, "step": 1026 }, { "epoch": 3.2391817466561763, "grad_norm": 0.17220083563874175, "learning_rate": 3.2448742352191476e-05, "loss": 0.4331, "step": 1027 }, { "epoch": 3.242328874901652, "grad_norm": 0.2247895258402866, "learning_rate": 3.243375774745768e-05, "loss": 0.4289, "step": 1028 }, { "epoch": 3.2454760031471284, "grad_norm": 0.16684480973484458, "learning_rate": 3.241876224507702e-05, "loss": 0.4334, "step": 1029 }, { "epoch": 3.248623131392604, "grad_norm": 0.1779584572036571, "learning_rate": 3.240375586071171e-05, "loss": 0.4399, "step": 1030 }, { "epoch": 3.2517702596380804, "grad_norm": 0.191113784730864, "learning_rate": 3.238873861003533e-05, "loss": 0.4367, "step": 1031 }, { "epoch": 3.2549173878835562, "grad_norm": 0.18550711603052794, "learning_rate": 3.237371050873281e-05, "loss": 0.4406, "step": 1032 }, { "epoch": 3.258064516129032, "grad_norm": 0.1783887143847433, "learning_rate": 3.235867157250039e-05, "loss": 0.4418, "step": 1033 }, { "epoch": 3.2612116443745083, "grad_norm": 0.17835625882815775, "learning_rate": 3.234362181704565e-05, "loss": 0.4295, "step": 1034 }, { "epoch": 3.264358772619984, "grad_norm": 0.18249704241735015, "learning_rate": 3.232856125808746e-05, "loss": 0.4294, "step": 1035 }, { "epoch": 3.2675059008654603, "grad_norm": 0.17475471863362183, "learning_rate": 3.231348991135599e-05, "loss": 0.4364, "step": 1036 }, { "epoch": 3.270653029110936, "grad_norm": 0.19935489428291137, "learning_rate": 3.229840779259266e-05, "loss": 0.4255, "step": 1037 }, { "epoch": 3.2738001573564124, "grad_norm": 0.1693521531229919, "learning_rate": 3.2283314917550136e-05, "loss": 0.4359, "step": 1038 }, { "epoch": 3.276947285601888, "grad_norm": 0.19130787480398617, "learning_rate": 3.226821130199233e-05, "loss": 0.431, "step": 1039 }, { "epoch": 3.2800944138473644, "grad_norm": 0.16737976497369075, "learning_rate": 3.225309696169438e-05, "loss": 0.4311, "step": 1040 }, { "epoch": 3.2832415420928402, "grad_norm": 0.18392433219239301, "learning_rate": 3.223797191244261e-05, "loss": 0.4356, "step": 1041 }, { "epoch": 3.286388670338316, "grad_norm": 0.19864523441807563, "learning_rate": 3.2222836170034543e-05, "loss": 0.4247, "step": 1042 }, { "epoch": 3.2895357985837923, "grad_norm": 0.1984089785922852, "learning_rate": 3.220768975027886e-05, "loss": 0.4374, "step": 1043 }, { "epoch": 3.292682926829268, "grad_norm": 0.19402343905084715, "learning_rate": 3.2192532668995385e-05, "loss": 0.4254, "step": 1044 }, { "epoch": 3.2958300550747444, "grad_norm": 0.2041163961571167, "learning_rate": 3.21773649420151e-05, "loss": 0.4358, "step": 1045 }, { "epoch": 3.29897718332022, "grad_norm": 0.1787832950909068, "learning_rate": 3.2162186585180095e-05, "loss": 0.4231, "step": 1046 }, { "epoch": 3.3021243115656964, "grad_norm": 0.19740027959624745, "learning_rate": 3.214699761434355e-05, "loss": 0.4302, "step": 1047 }, { "epoch": 3.305271439811172, "grad_norm": 0.18856045951279926, "learning_rate": 3.2131798045369765e-05, "loss": 0.4308, "step": 1048 }, { "epoch": 3.3084185680566485, "grad_norm": 0.20807270153545412, "learning_rate": 3.211658789413408e-05, "loss": 0.4351, "step": 1049 }, { "epoch": 3.3115656963021243, "grad_norm": 0.19162342835863225, "learning_rate": 3.2101367176522886e-05, "loss": 0.4354, "step": 1050 }, { "epoch": 3.3147128245476, "grad_norm": 0.19387129843147574, "learning_rate": 3.2086135908433634e-05, "loss": 0.43, "step": 1051 }, { "epoch": 3.3178599527930763, "grad_norm": 0.18150899186866062, "learning_rate": 3.2070894105774766e-05, "loss": 0.4344, "step": 1052 }, { "epoch": 3.321007081038552, "grad_norm": 0.21537331116177905, "learning_rate": 3.2055641784465745e-05, "loss": 0.4415, "step": 1053 }, { "epoch": 3.3241542092840284, "grad_norm": 0.2400691954661002, "learning_rate": 3.2040378960437024e-05, "loss": 0.4406, "step": 1054 }, { "epoch": 3.327301337529504, "grad_norm": 0.19817600331351765, "learning_rate": 3.2025105649630014e-05, "loss": 0.4315, "step": 1055 }, { "epoch": 3.3304484657749804, "grad_norm": 0.2084827766948241, "learning_rate": 3.200982186799709e-05, "loss": 0.4187, "step": 1056 }, { "epoch": 3.3335955940204562, "grad_norm": 0.22112170756734634, "learning_rate": 3.199452763150155e-05, "loss": 0.4315, "step": 1057 }, { "epoch": 3.3367427222659325, "grad_norm": 0.19512676684498764, "learning_rate": 3.197922295611762e-05, "loss": 0.4345, "step": 1058 }, { "epoch": 3.3398898505114083, "grad_norm": 0.2663842173534082, "learning_rate": 3.196390785783043e-05, "loss": 0.4346, "step": 1059 }, { "epoch": 3.343036978756884, "grad_norm": 0.21826809183094342, "learning_rate": 3.194858235263598e-05, "loss": 0.4355, "step": 1060 }, { "epoch": 3.3461841070023604, "grad_norm": 0.20953163723288945, "learning_rate": 3.193324645654118e-05, "loss": 0.4301, "step": 1061 }, { "epoch": 3.349331235247836, "grad_norm": 0.24320015638205855, "learning_rate": 3.191790018556373e-05, "loss": 0.4425, "step": 1062 }, { "epoch": 3.3524783634933124, "grad_norm": 0.1904143985171064, "learning_rate": 3.190254355573223e-05, "loss": 0.4378, "step": 1063 }, { "epoch": 3.355625491738788, "grad_norm": 0.22691576279072934, "learning_rate": 3.1887176583086066e-05, "loss": 0.4263, "step": 1064 }, { "epoch": 3.3587726199842645, "grad_norm": 0.21161650205231633, "learning_rate": 3.187179928367544e-05, "loss": 0.4251, "step": 1065 }, { "epoch": 3.3619197482297403, "grad_norm": 0.20609048888260278, "learning_rate": 3.185641167356131e-05, "loss": 0.4283, "step": 1066 }, { "epoch": 3.3650668764752165, "grad_norm": 0.2065246689397034, "learning_rate": 3.184101376881545e-05, "loss": 0.4292, "step": 1067 }, { "epoch": 3.3682140047206923, "grad_norm": 0.21953821452618635, "learning_rate": 3.1825605585520343e-05, "loss": 0.4334, "step": 1068 }, { "epoch": 3.371361132966168, "grad_norm": 0.19911994569988575, "learning_rate": 3.181018713976924e-05, "loss": 0.4286, "step": 1069 }, { "epoch": 3.3745082612116444, "grad_norm": 0.24900209916902774, "learning_rate": 3.179475844766608e-05, "loss": 0.4332, "step": 1070 }, { "epoch": 3.3776553894571206, "grad_norm": 0.23150612649346244, "learning_rate": 3.1779319525325546e-05, "loss": 0.4268, "step": 1071 }, { "epoch": 3.3808025177025964, "grad_norm": 0.2158927674109808, "learning_rate": 3.176387038887296e-05, "loss": 0.4462, "step": 1072 }, { "epoch": 3.3839496459480722, "grad_norm": 0.24924200720982193, "learning_rate": 3.174841105444434e-05, "loss": 0.4408, "step": 1073 }, { "epoch": 3.3870967741935485, "grad_norm": 0.2239815418692921, "learning_rate": 3.173294153818635e-05, "loss": 0.4326, "step": 1074 }, { "epoch": 3.3902439024390243, "grad_norm": 0.2199325533660836, "learning_rate": 3.17174618562563e-05, "loss": 0.4312, "step": 1075 }, { "epoch": 3.3933910306845005, "grad_norm": 0.22972172889576944, "learning_rate": 3.170197202482208e-05, "loss": 0.4343, "step": 1076 }, { "epoch": 3.3965381589299763, "grad_norm": 0.2094072355653265, "learning_rate": 3.168647206006221e-05, "loss": 0.4362, "step": 1077 }, { "epoch": 3.399685287175452, "grad_norm": 0.19900277532000674, "learning_rate": 3.167096197816581e-05, "loss": 0.4346, "step": 1078 }, { "epoch": 3.4028324154209284, "grad_norm": 0.19471319908950105, "learning_rate": 3.1655441795332523e-05, "loss": 0.434, "step": 1079 }, { "epoch": 3.4059795436664047, "grad_norm": 0.24652320632202052, "learning_rate": 3.163991152777259e-05, "loss": 0.4446, "step": 1080 }, { "epoch": 3.4091266719118805, "grad_norm": 0.18487872132027222, "learning_rate": 3.162437119170673e-05, "loss": 0.4428, "step": 1081 }, { "epoch": 3.4122738001573563, "grad_norm": 0.18070644674221434, "learning_rate": 3.160882080336624e-05, "loss": 0.4345, "step": 1082 }, { "epoch": 3.4154209284028325, "grad_norm": 0.20214727431927682, "learning_rate": 3.1593260378992856e-05, "loss": 0.4393, "step": 1083 }, { "epoch": 3.4185680566483083, "grad_norm": 0.19354862366840503, "learning_rate": 3.1577689934838847e-05, "loss": 0.4286, "step": 1084 }, { "epoch": 3.4217151848937846, "grad_norm": 0.17360041262575412, "learning_rate": 3.156210948716691e-05, "loss": 0.4395, "step": 1085 }, { "epoch": 3.4248623131392604, "grad_norm": 0.21602699476201023, "learning_rate": 3.1546519052250216e-05, "loss": 0.4363, "step": 1086 }, { "epoch": 3.4280094413847366, "grad_norm": 0.19400691100542458, "learning_rate": 3.153091864637236e-05, "loss": 0.4465, "step": 1087 }, { "epoch": 3.4311565696302124, "grad_norm": 0.18853705117037867, "learning_rate": 3.151530828582734e-05, "loss": 0.4367, "step": 1088 }, { "epoch": 3.4343036978756887, "grad_norm": 0.20156628021291076, "learning_rate": 3.149968798691956e-05, "loss": 0.4312, "step": 1089 }, { "epoch": 3.4374508261211645, "grad_norm": 0.17508099200640428, "learning_rate": 3.148405776596381e-05, "loss": 0.4387, "step": 1090 }, { "epoch": 3.4405979543666403, "grad_norm": 0.19428180773708023, "learning_rate": 3.1468417639285234e-05, "loss": 0.4372, "step": 1091 }, { "epoch": 3.4437450826121165, "grad_norm": 0.18564695290847033, "learning_rate": 3.145276762321932e-05, "loss": 0.4372, "step": 1092 }, { "epoch": 3.4468922108575923, "grad_norm": 0.18386558110382897, "learning_rate": 3.1437107734111885e-05, "loss": 0.4303, "step": 1093 }, { "epoch": 3.4500393391030686, "grad_norm": 0.19976923603089122, "learning_rate": 3.142143798831908e-05, "loss": 0.4387, "step": 1094 }, { "epoch": 3.4531864673485444, "grad_norm": 0.1967807399314723, "learning_rate": 3.140575840220733e-05, "loss": 0.4422, "step": 1095 }, { "epoch": 3.4563335955940206, "grad_norm": 0.18143971647103654, "learning_rate": 3.1390068992153336e-05, "loss": 0.4427, "step": 1096 }, { "epoch": 3.4594807238394965, "grad_norm": 0.19909894071095616, "learning_rate": 3.137436977454406e-05, "loss": 0.4413, "step": 1097 }, { "epoch": 3.4626278520849727, "grad_norm": 0.17902127709322027, "learning_rate": 3.135866076577673e-05, "loss": 0.4408, "step": 1098 }, { "epoch": 3.4657749803304485, "grad_norm": 0.1874350511676143, "learning_rate": 3.134294198225877e-05, "loss": 0.4458, "step": 1099 }, { "epoch": 3.4689221085759243, "grad_norm": 0.19777946007747293, "learning_rate": 3.132721344040783e-05, "loss": 0.4363, "step": 1100 }, { "epoch": 3.4720692368214006, "grad_norm": 0.21042566361266743, "learning_rate": 3.1311475156651755e-05, "loss": 0.4287, "step": 1101 }, { "epoch": 3.4752163650668764, "grad_norm": 0.18234625120759887, "learning_rate": 3.129572714742855e-05, "loss": 0.4389, "step": 1102 }, { "epoch": 3.4783634933123526, "grad_norm": 0.2094029102938534, "learning_rate": 3.12799694291864e-05, "loss": 0.4306, "step": 1103 }, { "epoch": 3.4815106215578284, "grad_norm": 0.16484345005981205, "learning_rate": 3.12642020183836e-05, "loss": 0.4322, "step": 1104 }, { "epoch": 3.4846577498033047, "grad_norm": 0.2218688219231824, "learning_rate": 3.12484249314886e-05, "loss": 0.4313, "step": 1105 }, { "epoch": 3.4878048780487805, "grad_norm": 0.18992515126275933, "learning_rate": 3.1232638184979934e-05, "loss": 0.4378, "step": 1106 }, { "epoch": 3.4909520062942567, "grad_norm": 0.1871055104215194, "learning_rate": 3.1216841795346246e-05, "loss": 0.4303, "step": 1107 }, { "epoch": 3.4940991345397325, "grad_norm": 0.19199964417557105, "learning_rate": 3.120103577908623e-05, "loss": 0.441, "step": 1108 }, { "epoch": 3.4972462627852083, "grad_norm": 0.1856197404311817, "learning_rate": 3.1185220152708645e-05, "loss": 0.4327, "step": 1109 }, { "epoch": 3.5003933910306846, "grad_norm": 0.1770808469670125, "learning_rate": 3.116939493273228e-05, "loss": 0.4379, "step": 1110 }, { "epoch": 3.5035405192761604, "grad_norm": 0.17080394749356279, "learning_rate": 3.115356013568597e-05, "loss": 0.434, "step": 1111 }, { "epoch": 3.5066876475216366, "grad_norm": 0.19343264397189958, "learning_rate": 3.113771577810852e-05, "loss": 0.4349, "step": 1112 }, { "epoch": 3.5098347757671124, "grad_norm": 0.17286156111192222, "learning_rate": 3.1121861876548736e-05, "loss": 0.443, "step": 1113 }, { "epoch": 3.5129819040125883, "grad_norm": 0.19138302591060105, "learning_rate": 3.1105998447565383e-05, "loss": 0.4447, "step": 1114 }, { "epoch": 3.5161290322580645, "grad_norm": 0.16413512248871734, "learning_rate": 3.10901255077272e-05, "loss": 0.4468, "step": 1115 }, { "epoch": 3.5192761605035408, "grad_norm": 0.1914477065729763, "learning_rate": 3.1074243073612834e-05, "loss": 0.4309, "step": 1116 }, { "epoch": 3.5224232887490166, "grad_norm": 0.188288353753066, "learning_rate": 3.105835116181086e-05, "loss": 0.4355, "step": 1117 }, { "epoch": 3.5255704169944924, "grad_norm": 0.17465184377745524, "learning_rate": 3.104244978891975e-05, "loss": 0.4355, "step": 1118 }, { "epoch": 3.5287175452399686, "grad_norm": 0.1825953673131463, "learning_rate": 3.102653897154786e-05, "loss": 0.4316, "step": 1119 }, { "epoch": 3.5318646734854444, "grad_norm": 0.1743986993607113, "learning_rate": 3.1010618726313405e-05, "loss": 0.4331, "step": 1120 }, { "epoch": 3.5350118017309207, "grad_norm": 0.17821957810877814, "learning_rate": 3.099468906984446e-05, "loss": 0.4345, "step": 1121 }, { "epoch": 3.5381589299763965, "grad_norm": 0.2093960838490045, "learning_rate": 3.097875001877891e-05, "loss": 0.4387, "step": 1122 }, { "epoch": 3.5413060582218723, "grad_norm": 0.17177569091014847, "learning_rate": 3.0962801589764474e-05, "loss": 0.4282, "step": 1123 }, { "epoch": 3.5444531864673485, "grad_norm": 0.1748649502758884, "learning_rate": 3.094684379945865e-05, "loss": 0.434, "step": 1124 }, { "epoch": 3.5476003147128248, "grad_norm": 0.19080807496580413, "learning_rate": 3.093087666452871e-05, "loss": 0.4386, "step": 1125 }, { "epoch": 3.5507474429583006, "grad_norm": 0.17652468445293854, "learning_rate": 3.09149002016517e-05, "loss": 0.4391, "step": 1126 }, { "epoch": 3.5538945712037764, "grad_norm": 0.19062712789092418, "learning_rate": 3.08989144275144e-05, "loss": 0.43, "step": 1127 }, { "epoch": 3.5570416994492526, "grad_norm": 0.17546579858596842, "learning_rate": 3.088291935881333e-05, "loss": 0.435, "step": 1128 }, { "epoch": 3.5601888276947284, "grad_norm": 0.21065753936700307, "learning_rate": 3.08669150122547e-05, "loss": 0.4233, "step": 1129 }, { "epoch": 3.5633359559402047, "grad_norm": 0.16676658656556034, "learning_rate": 3.0850901404554404e-05, "loss": 0.4419, "step": 1130 }, { "epoch": 3.5664830841856805, "grad_norm": 0.2075035100795957, "learning_rate": 3.083487855243804e-05, "loss": 0.4374, "step": 1131 }, { "epoch": 3.5696302124311563, "grad_norm": 0.16571698042635005, "learning_rate": 3.081884647264083e-05, "loss": 0.4385, "step": 1132 }, { "epoch": 3.5727773406766326, "grad_norm": 0.19707520555123104, "learning_rate": 3.080280518190765e-05, "loss": 0.4445, "step": 1133 }, { "epoch": 3.575924468922109, "grad_norm": 0.17132281905111424, "learning_rate": 3.078675469699299e-05, "loss": 0.4379, "step": 1134 }, { "epoch": 3.5790715971675846, "grad_norm": 0.1865379519918738, "learning_rate": 3.077069503466095e-05, "loss": 0.4324, "step": 1135 }, { "epoch": 3.5822187254130604, "grad_norm": 0.20303319475170387, "learning_rate": 3.075462621168521e-05, "loss": 0.4335, "step": 1136 }, { "epoch": 3.5853658536585367, "grad_norm": 0.1769716572277089, "learning_rate": 3.0738548244849024e-05, "loss": 0.4414, "step": 1137 }, { "epoch": 3.5885129819040125, "grad_norm": 0.2119201910437755, "learning_rate": 3.072246115094519e-05, "loss": 0.4347, "step": 1138 }, { "epoch": 3.5916601101494887, "grad_norm": 0.18291536465188502, "learning_rate": 3.070636494677603e-05, "loss": 0.4297, "step": 1139 }, { "epoch": 3.5948072383949645, "grad_norm": 0.2018907273302855, "learning_rate": 3.0690259649153414e-05, "loss": 0.4369, "step": 1140 }, { "epoch": 3.5979543666404403, "grad_norm": 0.18931019337202662, "learning_rate": 3.067414527489866e-05, "loss": 0.4385, "step": 1141 }, { "epoch": 3.6011014948859166, "grad_norm": 0.17894402262594664, "learning_rate": 3.0658021840842615e-05, "loss": 0.4317, "step": 1142 }, { "epoch": 3.604248623131393, "grad_norm": 0.19224386856066833, "learning_rate": 3.0641889363825566e-05, "loss": 0.4295, "step": 1143 }, { "epoch": 3.6073957513768686, "grad_norm": 0.1823104308503007, "learning_rate": 3.062574786069723e-05, "loss": 0.4381, "step": 1144 }, { "epoch": 3.6105428796223444, "grad_norm": 0.1845807265379901, "learning_rate": 3.0609597348316784e-05, "loss": 0.4443, "step": 1145 }, { "epoch": 3.6136900078678207, "grad_norm": 0.1752786351405681, "learning_rate": 3.05934378435528e-05, "loss": 0.4269, "step": 1146 }, { "epoch": 3.6168371361132965, "grad_norm": 0.17650135711502488, "learning_rate": 3.057726936328323e-05, "loss": 0.4344, "step": 1147 }, { "epoch": 3.6199842643587727, "grad_norm": 0.19322880994465225, "learning_rate": 3.056109192439541e-05, "loss": 0.4286, "step": 1148 }, { "epoch": 3.6231313926042485, "grad_norm": 0.1664813830978989, "learning_rate": 3.0544905543786045e-05, "loss": 0.434, "step": 1149 }, { "epoch": 3.6262785208497244, "grad_norm": 0.19170305910822624, "learning_rate": 3.052871023836116e-05, "loss": 0.4432, "step": 1150 }, { "epoch": 3.6294256490952006, "grad_norm": 0.1854811898273995, "learning_rate": 3.051250602503612e-05, "loss": 0.4335, "step": 1151 }, { "epoch": 3.632572777340677, "grad_norm": 0.1822902853195308, "learning_rate": 3.0496292920735574e-05, "loss": 0.4397, "step": 1152 }, { "epoch": 3.6357199055861527, "grad_norm": 0.16630950273159906, "learning_rate": 3.0480070942393483e-05, "loss": 0.4441, "step": 1153 }, { "epoch": 3.6388670338316285, "grad_norm": 0.1661679586279354, "learning_rate": 3.046384010695304e-05, "loss": 0.4394, "step": 1154 }, { "epoch": 3.6420141620771047, "grad_norm": 0.1564352636813857, "learning_rate": 3.0447600431366724e-05, "loss": 0.4438, "step": 1155 }, { "epoch": 3.6451612903225805, "grad_norm": 0.17161921802692476, "learning_rate": 3.043135193259623e-05, "loss": 0.4343, "step": 1156 }, { "epoch": 3.6483084185680568, "grad_norm": 0.18351798204850334, "learning_rate": 3.0415094627612464e-05, "loss": 0.4402, "step": 1157 }, { "epoch": 3.6514555468135326, "grad_norm": 0.17135389498561554, "learning_rate": 3.0398828533395547e-05, "loss": 0.4324, "step": 1158 }, { "epoch": 3.654602675059009, "grad_norm": 0.19814568295329985, "learning_rate": 3.0382553666934777e-05, "loss": 0.4418, "step": 1159 }, { "epoch": 3.6577498033044846, "grad_norm": 0.17713962766442853, "learning_rate": 3.036627004522859e-05, "loss": 0.4258, "step": 1160 }, { "epoch": 3.660896931549961, "grad_norm": 0.17627230414185083, "learning_rate": 3.0349977685284596e-05, "loss": 0.437, "step": 1161 }, { "epoch": 3.6640440597954367, "grad_norm": 0.21033833731933352, "learning_rate": 3.0333676604119512e-05, "loss": 0.4359, "step": 1162 }, { "epoch": 3.6671911880409125, "grad_norm": 0.16631632754069195, "learning_rate": 3.0317366818759183e-05, "loss": 0.4416, "step": 1163 }, { "epoch": 3.6703383162863887, "grad_norm": 0.16042229001940653, "learning_rate": 3.0301048346238522e-05, "loss": 0.4332, "step": 1164 }, { "epoch": 3.6734854445318645, "grad_norm": 0.1681130029892185, "learning_rate": 3.028472120360153e-05, "loss": 0.4435, "step": 1165 }, { "epoch": 3.676632572777341, "grad_norm": 0.15489295343278095, "learning_rate": 3.0268385407901267e-05, "loss": 0.4301, "step": 1166 }, { "epoch": 3.6797797010228166, "grad_norm": 0.1856105350391872, "learning_rate": 3.025204097619982e-05, "loss": 0.4384, "step": 1167 }, { "epoch": 3.682926829268293, "grad_norm": 0.17420244127013473, "learning_rate": 3.0235687925568308e-05, "loss": 0.4474, "step": 1168 }, { "epoch": 3.6860739575137687, "grad_norm": 0.16283454261310373, "learning_rate": 3.021932627308684e-05, "loss": 0.446, "step": 1169 }, { "epoch": 3.689221085759245, "grad_norm": 0.16652733620538568, "learning_rate": 3.020295603584451e-05, "loss": 0.4385, "step": 1170 }, { "epoch": 3.6923682140047207, "grad_norm": 0.16838451835416196, "learning_rate": 3.0186577230939383e-05, "loss": 0.4383, "step": 1171 }, { "epoch": 3.6955153422501965, "grad_norm": 0.20256218511665658, "learning_rate": 3.017018987547848e-05, "loss": 0.4468, "step": 1172 }, { "epoch": 3.6986624704956728, "grad_norm": 0.1789463978366356, "learning_rate": 3.015379398657774e-05, "loss": 0.4436, "step": 1173 }, { "epoch": 3.7018095987411486, "grad_norm": 0.2254816031556007, "learning_rate": 3.0137389581362012e-05, "loss": 0.4402, "step": 1174 }, { "epoch": 3.704956726986625, "grad_norm": 0.19908506142514173, "learning_rate": 3.0120976676965065e-05, "loss": 0.437, "step": 1175 }, { "epoch": 3.7081038552321006, "grad_norm": 0.17538124081020218, "learning_rate": 3.010455529052952e-05, "loss": 0.4495, "step": 1176 }, { "epoch": 3.711250983477577, "grad_norm": 0.20509162893468286, "learning_rate": 3.0088125439206854e-05, "loss": 0.4432, "step": 1177 }, { "epoch": 3.7143981117230527, "grad_norm": 0.18192579811095336, "learning_rate": 3.0071687140157413e-05, "loss": 0.4388, "step": 1178 }, { "epoch": 3.717545239968529, "grad_norm": 0.21503535709810237, "learning_rate": 3.005524041055034e-05, "loss": 0.4351, "step": 1179 }, { "epoch": 3.7206923682140047, "grad_norm": 0.17400711170435434, "learning_rate": 3.00387852675636e-05, "loss": 0.4492, "step": 1180 }, { "epoch": 3.7238394964594805, "grad_norm": 0.24313295347800967, "learning_rate": 3.0022321728383933e-05, "loss": 0.4315, "step": 1181 }, { "epoch": 3.726986624704957, "grad_norm": 0.1478818146322245, "learning_rate": 3.0005849810206845e-05, "loss": 0.4363, "step": 1182 }, { "epoch": 3.7301337529504326, "grad_norm": 0.20144526501549903, "learning_rate": 2.9989369530236618e-05, "loss": 0.4426, "step": 1183 }, { "epoch": 3.733280881195909, "grad_norm": 0.18206029920285993, "learning_rate": 2.9972880905686246e-05, "loss": 0.4344, "step": 1184 }, { "epoch": 3.7364280094413846, "grad_norm": 0.18180540417395807, "learning_rate": 2.9956383953777442e-05, "loss": 0.4424, "step": 1185 }, { "epoch": 3.739575137686861, "grad_norm": 0.191677223977247, "learning_rate": 2.9939878691740625e-05, "loss": 0.4304, "step": 1186 }, { "epoch": 3.7427222659323367, "grad_norm": 0.17576860046619955, "learning_rate": 2.9923365136814876e-05, "loss": 0.4432, "step": 1187 }, { "epoch": 3.745869394177813, "grad_norm": 0.18177898566663142, "learning_rate": 2.9906843306247965e-05, "loss": 0.4315, "step": 1188 }, { "epoch": 3.7490165224232888, "grad_norm": 0.19227267623614985, "learning_rate": 2.9890313217296277e-05, "loss": 0.4368, "step": 1189 }, { "epoch": 3.7521636506687646, "grad_norm": 0.19404291468297713, "learning_rate": 2.9873774887224844e-05, "loss": 0.4418, "step": 1190 }, { "epoch": 3.755310778914241, "grad_norm": 0.17297469013091823, "learning_rate": 2.985722833330729e-05, "loss": 0.4276, "step": 1191 }, { "epoch": 3.7584579071597166, "grad_norm": 0.21580498854793478, "learning_rate": 2.984067357282584e-05, "loss": 0.438, "step": 1192 }, { "epoch": 3.761605035405193, "grad_norm": 0.19970366933915254, "learning_rate": 2.9824110623071285e-05, "loss": 0.4429, "step": 1193 }, { "epoch": 3.7647521636506687, "grad_norm": 0.1893900043304486, "learning_rate": 2.980753950134297e-05, "loss": 0.4425, "step": 1194 }, { "epoch": 3.767899291896145, "grad_norm": 0.1844133531075253, "learning_rate": 2.979096022494878e-05, "loss": 0.4345, "step": 1195 }, { "epoch": 3.7710464201416207, "grad_norm": 0.1764704510378753, "learning_rate": 2.9774372811205104e-05, "loss": 0.4404, "step": 1196 }, { "epoch": 3.774193548387097, "grad_norm": 0.1937035801525317, "learning_rate": 2.975777727743684e-05, "loss": 0.4386, "step": 1197 }, { "epoch": 3.777340676632573, "grad_norm": 0.18365070141139342, "learning_rate": 2.9741173640977372e-05, "loss": 0.4331, "step": 1198 }, { "epoch": 3.7804878048780486, "grad_norm": 0.17907574402326445, "learning_rate": 2.9724561919168536e-05, "loss": 0.4411, "step": 1199 }, { "epoch": 3.783634933123525, "grad_norm": 0.1915338900258077, "learning_rate": 2.9707942129360622e-05, "loss": 0.4336, "step": 1200 }, { "epoch": 3.7867820613690006, "grad_norm": 0.18426098050440218, "learning_rate": 2.969131428891234e-05, "loss": 0.4352, "step": 1201 }, { "epoch": 3.789929189614477, "grad_norm": 0.19246246891896052, "learning_rate": 2.967467841519081e-05, "loss": 0.4281, "step": 1202 }, { "epoch": 3.7930763178599527, "grad_norm": 0.199901468879607, "learning_rate": 2.9658034525571543e-05, "loss": 0.4401, "step": 1203 }, { "epoch": 3.796223446105429, "grad_norm": 0.17624439473519934, "learning_rate": 2.964138263743843e-05, "loss": 0.4343, "step": 1204 }, { "epoch": 3.7993705743509048, "grad_norm": 0.20949517161628303, "learning_rate": 2.96247227681837e-05, "loss": 0.4284, "step": 1205 }, { "epoch": 3.802517702596381, "grad_norm": 0.1841705003012857, "learning_rate": 2.9608054935207925e-05, "loss": 0.4392, "step": 1206 }, { "epoch": 3.805664830841857, "grad_norm": 0.20023545812113352, "learning_rate": 2.959137915592e-05, "loss": 0.4403, "step": 1207 }, { "epoch": 3.8088119590873326, "grad_norm": 0.18181285159081859, "learning_rate": 2.9574695447737126e-05, "loss": 0.4301, "step": 1208 }, { "epoch": 3.811959087332809, "grad_norm": 0.4591999499033323, "learning_rate": 2.9558003828084768e-05, "loss": 0.4444, "step": 1209 }, { "epoch": 3.8151062155782847, "grad_norm": 0.2085648435684365, "learning_rate": 2.9541304314396653e-05, "loss": 0.4325, "step": 1210 }, { "epoch": 3.818253343823761, "grad_norm": 0.20463216335646361, "learning_rate": 2.9524596924114776e-05, "loss": 0.4345, "step": 1211 }, { "epoch": 3.8214004720692367, "grad_norm": 0.2006047130461185, "learning_rate": 2.950788167468934e-05, "loss": 0.4391, "step": 1212 }, { "epoch": 3.824547600314713, "grad_norm": 0.18827426151401724, "learning_rate": 2.9491158583578753e-05, "loss": 0.4358, "step": 1213 }, { "epoch": 3.8276947285601888, "grad_norm": 0.19581009849077824, "learning_rate": 2.947442766824963e-05, "loss": 0.4441, "step": 1214 }, { "epoch": 3.830841856805665, "grad_norm": 0.17734874484349197, "learning_rate": 2.9457688946176746e-05, "loss": 0.4274, "step": 1215 }, { "epoch": 3.833988985051141, "grad_norm": 0.17277936701165808, "learning_rate": 2.9440942434843042e-05, "loss": 0.4367, "step": 1216 }, { "epoch": 3.8371361132966166, "grad_norm": 0.18624753492705168, "learning_rate": 2.942418815173958e-05, "loss": 0.4431, "step": 1217 }, { "epoch": 3.840283241542093, "grad_norm": 0.17981530092582268, "learning_rate": 2.9407426114365538e-05, "loss": 0.4488, "step": 1218 }, { "epoch": 3.8434303697875687, "grad_norm": 0.17417936860740793, "learning_rate": 2.9390656340228215e-05, "loss": 0.4386, "step": 1219 }, { "epoch": 3.846577498033045, "grad_norm": 0.1767282033703042, "learning_rate": 2.9373878846842964e-05, "loss": 0.4232, "step": 1220 }, { "epoch": 3.8497246262785207, "grad_norm": 0.17726819566715343, "learning_rate": 2.935709365173321e-05, "loss": 0.4372, "step": 1221 }, { "epoch": 3.852871754523997, "grad_norm": 0.18641228873224577, "learning_rate": 2.934030077243044e-05, "loss": 0.4539, "step": 1222 }, { "epoch": 3.856018882769473, "grad_norm": 0.18523506965437314, "learning_rate": 2.932350022647414e-05, "loss": 0.44, "step": 1223 }, { "epoch": 3.859166011014949, "grad_norm": 0.17429118934670704, "learning_rate": 2.9306692031411817e-05, "loss": 0.4419, "step": 1224 }, { "epoch": 3.862313139260425, "grad_norm": 0.16912829692351863, "learning_rate": 2.9289876204798973e-05, "loss": 0.445, "step": 1225 }, { "epoch": 3.8654602675059007, "grad_norm": 0.16689978761928095, "learning_rate": 2.927305276419906e-05, "loss": 0.4399, "step": 1226 }, { "epoch": 3.868607395751377, "grad_norm": 0.1662616285588503, "learning_rate": 2.9256221727183508e-05, "loss": 0.4439, "step": 1227 }, { "epoch": 3.8717545239968527, "grad_norm": 0.17547371330423203, "learning_rate": 2.923938311133165e-05, "loss": 0.4374, "step": 1228 }, { "epoch": 3.874901652242329, "grad_norm": 0.16899679456608838, "learning_rate": 2.922253693423078e-05, "loss": 0.4403, "step": 1229 }, { "epoch": 3.8780487804878048, "grad_norm": 0.17142815065151418, "learning_rate": 2.920568321347604e-05, "loss": 0.4491, "step": 1230 }, { "epoch": 3.881195908733281, "grad_norm": 0.1659629352966584, "learning_rate": 2.918882196667049e-05, "loss": 0.4442, "step": 1231 }, { "epoch": 3.884343036978757, "grad_norm": 0.1827137404665224, "learning_rate": 2.9171953211425027e-05, "loss": 0.4462, "step": 1232 }, { "epoch": 3.887490165224233, "grad_norm": 0.16843932419439855, "learning_rate": 2.9155076965358397e-05, "loss": 0.4425, "step": 1233 }, { "epoch": 3.890637293469709, "grad_norm": 0.1895572765071493, "learning_rate": 2.9138193246097172e-05, "loss": 0.4386, "step": 1234 }, { "epoch": 3.8937844217151847, "grad_norm": 0.17922902546291508, "learning_rate": 2.912130207127573e-05, "loss": 0.4341, "step": 1235 }, { "epoch": 3.896931549960661, "grad_norm": 0.17772714370132225, "learning_rate": 2.9104403458536238e-05, "loss": 0.4444, "step": 1236 }, { "epoch": 3.9000786782061367, "grad_norm": 0.18719240088700048, "learning_rate": 2.9087497425528618e-05, "loss": 0.4329, "step": 1237 }, { "epoch": 3.903225806451613, "grad_norm": 0.19116327201990352, "learning_rate": 2.9070583989910556e-05, "loss": 0.4393, "step": 1238 }, { "epoch": 3.906372934697089, "grad_norm": 0.1888243799061614, "learning_rate": 2.905366316934747e-05, "loss": 0.4404, "step": 1239 }, { "epoch": 3.909520062942565, "grad_norm": 0.19923427176985103, "learning_rate": 2.9036734981512484e-05, "loss": 0.4433, "step": 1240 }, { "epoch": 3.912667191188041, "grad_norm": 0.184137878281804, "learning_rate": 2.9019799444086425e-05, "loss": 0.4451, "step": 1241 }, { "epoch": 3.915814319433517, "grad_norm": 0.16318104017822477, "learning_rate": 2.9002856574757777e-05, "loss": 0.4459, "step": 1242 }, { "epoch": 3.918961447678993, "grad_norm": 0.1834621256694157, "learning_rate": 2.898590639122272e-05, "loss": 0.4432, "step": 1243 }, { "epoch": 3.9221085759244687, "grad_norm": 0.19023161460692964, "learning_rate": 2.8968948911185018e-05, "loss": 0.4411, "step": 1244 }, { "epoch": 3.925255704169945, "grad_norm": 0.18356230097493653, "learning_rate": 2.8951984152356117e-05, "loss": 0.4365, "step": 1245 }, { "epoch": 3.9284028324154208, "grad_norm": 0.19309532709059088, "learning_rate": 2.8935012132455024e-05, "loss": 0.4329, "step": 1246 }, { "epoch": 3.931549960660897, "grad_norm": 0.1790999571642992, "learning_rate": 2.8918032869208335e-05, "loss": 0.44, "step": 1247 }, { "epoch": 3.934697088906373, "grad_norm": 0.17778758227368407, "learning_rate": 2.8901046380350227e-05, "loss": 0.4369, "step": 1248 }, { "epoch": 3.937844217151849, "grad_norm": 0.16705242650281665, "learning_rate": 2.8884052683622408e-05, "loss": 0.4416, "step": 1249 }, { "epoch": 3.940991345397325, "grad_norm": 0.17118532687053076, "learning_rate": 2.886705179677414e-05, "loss": 0.4355, "step": 1250 }, { "epoch": 3.944138473642801, "grad_norm": 0.17794134938829217, "learning_rate": 2.885004373756215e-05, "loss": 0.4362, "step": 1251 }, { "epoch": 3.947285601888277, "grad_norm": 0.16850951168986514, "learning_rate": 2.88330285237507e-05, "loss": 0.439, "step": 1252 }, { "epoch": 3.9504327301337527, "grad_norm": 0.17017397926948555, "learning_rate": 2.8816006173111504e-05, "loss": 0.4379, "step": 1253 }, { "epoch": 3.953579858379229, "grad_norm": 0.1595665503643, "learning_rate": 2.8798976703423726e-05, "loss": 0.4416, "step": 1254 }, { "epoch": 3.9567269866247052, "grad_norm": 0.17016824414153592, "learning_rate": 2.8781940132473977e-05, "loss": 0.437, "step": 1255 }, { "epoch": 3.959874114870181, "grad_norm": 0.1768366345187553, "learning_rate": 2.8764896478056287e-05, "loss": 0.4405, "step": 1256 }, { "epoch": 3.963021243115657, "grad_norm": 0.20215798109859642, "learning_rate": 2.874784575797207e-05, "loss": 0.4407, "step": 1257 }, { "epoch": 3.966168371361133, "grad_norm": 0.19222521069705137, "learning_rate": 2.8730787990030138e-05, "loss": 0.4333, "step": 1258 }, { "epoch": 3.969315499606609, "grad_norm": 0.1768154833619316, "learning_rate": 2.8713723192046637e-05, "loss": 0.4423, "step": 1259 }, { "epoch": 3.972462627852085, "grad_norm": 0.183349858541064, "learning_rate": 2.8696651381845094e-05, "loss": 0.4443, "step": 1260 }, { "epoch": 3.975609756097561, "grad_norm": 0.19201674367546054, "learning_rate": 2.8679572577256324e-05, "loss": 0.4362, "step": 1261 }, { "epoch": 3.9787568843430368, "grad_norm": 0.18098429025124854, "learning_rate": 2.866248679611846e-05, "loss": 0.4339, "step": 1262 }, { "epoch": 3.981904012588513, "grad_norm": 0.21698099039863375, "learning_rate": 2.8645394056276936e-05, "loss": 0.4356, "step": 1263 }, { "epoch": 3.9850511408339893, "grad_norm": 0.19347949036380474, "learning_rate": 2.862829437558443e-05, "loss": 0.4435, "step": 1264 }, { "epoch": 3.988198269079465, "grad_norm": 0.19634210846658384, "learning_rate": 2.8611187771900897e-05, "loss": 0.4359, "step": 1265 }, { "epoch": 3.991345397324941, "grad_norm": 0.20372267012008688, "learning_rate": 2.8594074263093495e-05, "loss": 0.4435, "step": 1266 }, { "epoch": 3.994492525570417, "grad_norm": 0.19018939442925267, "learning_rate": 2.8576953867036605e-05, "loss": 0.435, "step": 1267 }, { "epoch": 3.997639653815893, "grad_norm": 0.17380970464724402, "learning_rate": 2.855982660161181e-05, "loss": 0.4368, "step": 1268 }, { "epoch": 4.003147128245476, "grad_norm": 0.4567009200647522, "learning_rate": 2.854269248470786e-05, "loss": 0.8291, "step": 1269 }, { "epoch": 4.006294256490952, "grad_norm": 0.3540594453182581, "learning_rate": 2.8525551534220657e-05, "loss": 0.3842, "step": 1270 }, { "epoch": 4.009441384736428, "grad_norm": 0.3137640124559675, "learning_rate": 2.8508403768053242e-05, "loss": 0.3803, "step": 1271 }, { "epoch": 4.012588512981904, "grad_norm": 0.3411489806848952, "learning_rate": 2.8491249204115784e-05, "loss": 0.3877, "step": 1272 }, { "epoch": 4.01573564122738, "grad_norm": 0.32660510853569047, "learning_rate": 2.847408786032555e-05, "loss": 0.389, "step": 1273 }, { "epoch": 4.018882769472856, "grad_norm": 0.3039688244842211, "learning_rate": 2.845691975460688e-05, "loss": 0.381, "step": 1274 }, { "epoch": 4.022029897718332, "grad_norm": 0.2867459711491588, "learning_rate": 2.8439744904891178e-05, "loss": 0.3768, "step": 1275 }, { "epoch": 4.025177025963808, "grad_norm": 0.2971765367481167, "learning_rate": 2.8422563329116898e-05, "loss": 0.3887, "step": 1276 }, { "epoch": 4.028324154209284, "grad_norm": 0.26012973313503523, "learning_rate": 2.8405375045229512e-05, "loss": 0.3872, "step": 1277 }, { "epoch": 4.03147128245476, "grad_norm": 0.24236515760372213, "learning_rate": 2.83881800711815e-05, "loss": 0.3911, "step": 1278 }, { "epoch": 4.034618410700236, "grad_norm": 0.27142579496532293, "learning_rate": 2.837097842493234e-05, "loss": 0.3927, "step": 1279 }, { "epoch": 4.037765538945712, "grad_norm": 0.24067712701294466, "learning_rate": 2.8353770124448467e-05, "loss": 0.3851, "step": 1280 }, { "epoch": 4.040912667191188, "grad_norm": 0.2388177326791406, "learning_rate": 2.8336555187703266e-05, "loss": 0.377, "step": 1281 }, { "epoch": 4.044059795436664, "grad_norm": 0.24190387059110036, "learning_rate": 2.8319333632677062e-05, "loss": 0.3819, "step": 1282 }, { "epoch": 4.04720692368214, "grad_norm": 0.233811700600284, "learning_rate": 2.830210547735708e-05, "loss": 0.374, "step": 1283 }, { "epoch": 4.050354051927616, "grad_norm": 0.21990274300198742, "learning_rate": 2.8284870739737456e-05, "loss": 0.3801, "step": 1284 }, { "epoch": 4.053501180173092, "grad_norm": 0.24925704152570827, "learning_rate": 2.826762943781918e-05, "loss": 0.3833, "step": 1285 }, { "epoch": 4.056648308418568, "grad_norm": 0.2281128058726442, "learning_rate": 2.825038158961012e-05, "loss": 0.3849, "step": 1286 }, { "epoch": 4.059795436664044, "grad_norm": 0.20398518468216212, "learning_rate": 2.823312721312496e-05, "loss": 0.3749, "step": 1287 }, { "epoch": 4.06294256490952, "grad_norm": 0.21358990395381772, "learning_rate": 2.8215866326385222e-05, "loss": 0.389, "step": 1288 }, { "epoch": 4.066089693154996, "grad_norm": 0.1998290493946192, "learning_rate": 2.8198598947419222e-05, "loss": 0.3746, "step": 1289 }, { "epoch": 4.069236821400472, "grad_norm": 0.2136797295751118, "learning_rate": 2.818132509426204e-05, "loss": 0.3873, "step": 1290 }, { "epoch": 4.072383949645948, "grad_norm": 0.19695872962181996, "learning_rate": 2.8164044784955536e-05, "loss": 0.387, "step": 1291 }, { "epoch": 4.075531077891424, "grad_norm": 0.23535924524823093, "learning_rate": 2.814675803754831e-05, "loss": 0.3875, "step": 1292 }, { "epoch": 4.0786782061369005, "grad_norm": 0.20439185050875183, "learning_rate": 2.8129464870095697e-05, "loss": 0.3765, "step": 1293 }, { "epoch": 4.081825334382376, "grad_norm": 0.22225008665143714, "learning_rate": 2.8112165300659714e-05, "loss": 0.3779, "step": 1294 }, { "epoch": 4.084972462627852, "grad_norm": 0.21036944845424835, "learning_rate": 2.809485934730907e-05, "loss": 0.4008, "step": 1295 }, { "epoch": 4.088119590873328, "grad_norm": 0.252462473354712, "learning_rate": 2.807754702811916e-05, "loss": 0.3867, "step": 1296 }, { "epoch": 4.091266719118804, "grad_norm": 0.22970311838363114, "learning_rate": 2.8060228361172012e-05, "loss": 0.387, "step": 1297 }, { "epoch": 4.09441384736428, "grad_norm": 0.19785518482856718, "learning_rate": 2.804290336455629e-05, "loss": 0.384, "step": 1298 }, { "epoch": 4.097560975609756, "grad_norm": 0.2375083186949961, "learning_rate": 2.8025572056367263e-05, "loss": 0.3802, "step": 1299 }, { "epoch": 4.100708103855232, "grad_norm": 0.20170571559199502, "learning_rate": 2.8008234454706795e-05, "loss": 0.378, "step": 1300 }, { "epoch": 4.103855232100708, "grad_norm": 0.19637734454839414, "learning_rate": 2.799089057768333e-05, "loss": 0.3841, "step": 1301 }, { "epoch": 4.1070023603461845, "grad_norm": 0.20324902703498704, "learning_rate": 2.797354044341186e-05, "loss": 0.389, "step": 1302 }, { "epoch": 4.11014948859166, "grad_norm": 0.2011077455123792, "learning_rate": 2.7956184070013912e-05, "loss": 0.3813, "step": 1303 }, { "epoch": 4.113296616837136, "grad_norm": 0.18553531787475813, "learning_rate": 2.7938821475617523e-05, "loss": 0.3829, "step": 1304 }, { "epoch": 4.116443745082612, "grad_norm": 0.19878439370656475, "learning_rate": 2.792145267835725e-05, "loss": 0.3738, "step": 1305 }, { "epoch": 4.119590873328088, "grad_norm": 0.19762338018115141, "learning_rate": 2.7904077696374107e-05, "loss": 0.3796, "step": 1306 }, { "epoch": 4.122738001573564, "grad_norm": 0.21096214326769303, "learning_rate": 2.7886696547815568e-05, "loss": 0.3764, "step": 1307 }, { "epoch": 4.12588512981904, "grad_norm": 0.19386489654228445, "learning_rate": 2.7869309250835565e-05, "loss": 0.3808, "step": 1308 }, { "epoch": 4.129032258064516, "grad_norm": 0.22093269977952554, "learning_rate": 2.7851915823594442e-05, "loss": 0.3788, "step": 1309 }, { "epoch": 4.132179386309992, "grad_norm": 0.2004090636824933, "learning_rate": 2.783451628425893e-05, "loss": 0.3789, "step": 1310 }, { "epoch": 4.1353265145554685, "grad_norm": 0.198727713176911, "learning_rate": 2.7817110651002183e-05, "loss": 0.3818, "step": 1311 }, { "epoch": 4.138473642800944, "grad_norm": 0.24944897390496926, "learning_rate": 2.779969894200367e-05, "loss": 0.3815, "step": 1312 }, { "epoch": 4.14162077104642, "grad_norm": 0.17429094692844807, "learning_rate": 2.7782281175449246e-05, "loss": 0.3805, "step": 1313 }, { "epoch": 4.144767899291896, "grad_norm": 0.2344196809712545, "learning_rate": 2.7764857369531078e-05, "loss": 0.3851, "step": 1314 }, { "epoch": 4.147915027537372, "grad_norm": 0.1697952320993672, "learning_rate": 2.774742754244764e-05, "loss": 0.3833, "step": 1315 }, { "epoch": 4.151062155782848, "grad_norm": 0.20972633292156506, "learning_rate": 2.7729991712403697e-05, "loss": 0.3841, "step": 1316 }, { "epoch": 4.154209284028324, "grad_norm": 0.20826674826782807, "learning_rate": 2.7712549897610284e-05, "loss": 0.3873, "step": 1317 }, { "epoch": 4.1573564122738, "grad_norm": 0.18808361463227552, "learning_rate": 2.769510211628468e-05, "loss": 0.3831, "step": 1318 }, { "epoch": 4.160503540519276, "grad_norm": 0.20800615549370643, "learning_rate": 2.767764838665041e-05, "loss": 0.3785, "step": 1319 }, { "epoch": 4.1636506687647525, "grad_norm": 0.22375396662721606, "learning_rate": 2.766018872693719e-05, "loss": 0.3835, "step": 1320 }, { "epoch": 4.166797797010228, "grad_norm": 0.19240780126585705, "learning_rate": 2.764272315538096e-05, "loss": 0.3832, "step": 1321 }, { "epoch": 4.169944925255704, "grad_norm": 0.2037920501367335, "learning_rate": 2.762525169022381e-05, "loss": 0.387, "step": 1322 }, { "epoch": 4.17309205350118, "grad_norm": 0.1946112985945064, "learning_rate": 2.7607774349713997e-05, "loss": 0.3882, "step": 1323 }, { "epoch": 4.176239181746656, "grad_norm": 0.19471855963610726, "learning_rate": 2.7590291152105905e-05, "loss": 0.3859, "step": 1324 }, { "epoch": 4.1793863099921325, "grad_norm": 0.208895734249875, "learning_rate": 2.7572802115660045e-05, "loss": 0.3899, "step": 1325 }, { "epoch": 4.182533438237608, "grad_norm": 0.19332682587345157, "learning_rate": 2.7555307258643028e-05, "loss": 0.3817, "step": 1326 }, { "epoch": 4.185680566483084, "grad_norm": 0.20554112935607244, "learning_rate": 2.753780659932753e-05, "loss": 0.3892, "step": 1327 }, { "epoch": 4.18882769472856, "grad_norm": 0.1863703929014815, "learning_rate": 2.7520300155992296e-05, "loss": 0.3989, "step": 1328 }, { "epoch": 4.191974822974037, "grad_norm": 0.2181184280633318, "learning_rate": 2.7502787946922125e-05, "loss": 0.3857, "step": 1329 }, { "epoch": 4.195121951219512, "grad_norm": 0.1877040187866126, "learning_rate": 2.748526999040782e-05, "loss": 0.3846, "step": 1330 }, { "epoch": 4.198269079464988, "grad_norm": 0.2101059077905614, "learning_rate": 2.7467746304746192e-05, "loss": 0.3791, "step": 1331 }, { "epoch": 4.201416207710464, "grad_norm": 0.20021777258381446, "learning_rate": 2.7450216908240037e-05, "loss": 0.3829, "step": 1332 }, { "epoch": 4.20456333595594, "grad_norm": 0.2135816346994294, "learning_rate": 2.7432681819198114e-05, "loss": 0.385, "step": 1333 }, { "epoch": 4.2077104642014165, "grad_norm": 0.18046740545279863, "learning_rate": 2.7415141055935132e-05, "loss": 0.3744, "step": 1334 }, { "epoch": 4.210857592446892, "grad_norm": 0.20184956661048567, "learning_rate": 2.739759463677172e-05, "loss": 0.3773, "step": 1335 }, { "epoch": 4.214004720692368, "grad_norm": 0.1683940922170243, "learning_rate": 2.738004258003442e-05, "loss": 0.3816, "step": 1336 }, { "epoch": 4.217151848937844, "grad_norm": 0.178967099609675, "learning_rate": 2.736248490405567e-05, "loss": 0.3868, "step": 1337 }, { "epoch": 4.220298977183321, "grad_norm": 0.17657160485024884, "learning_rate": 2.7344921627173745e-05, "loss": 0.3838, "step": 1338 }, { "epoch": 4.223446105428796, "grad_norm": 0.1871577205312587, "learning_rate": 2.732735276773282e-05, "loss": 0.3852, "step": 1339 }, { "epoch": 4.226593233674272, "grad_norm": 0.18868214707434067, "learning_rate": 2.7309778344082853e-05, "loss": 0.3897, "step": 1340 }, { "epoch": 4.229740361919748, "grad_norm": 0.17848169409152329, "learning_rate": 2.7292198374579637e-05, "loss": 0.3841, "step": 1341 }, { "epoch": 4.232887490165224, "grad_norm": 0.19858344201399433, "learning_rate": 2.727461287758476e-05, "loss": 0.3877, "step": 1342 }, { "epoch": 4.2360346184107005, "grad_norm": 0.19726514845450016, "learning_rate": 2.7257021871465566e-05, "loss": 0.3838, "step": 1343 }, { "epoch": 4.239181746656176, "grad_norm": 0.177223025272618, "learning_rate": 2.723942537459518e-05, "loss": 0.394, "step": 1344 }, { "epoch": 4.242328874901652, "grad_norm": 0.2355427079374641, "learning_rate": 2.7221823405352435e-05, "loss": 0.3861, "step": 1345 }, { "epoch": 4.245476003147128, "grad_norm": 0.20054623016987655, "learning_rate": 2.72042159821219e-05, "loss": 0.3888, "step": 1346 }, { "epoch": 4.248623131392605, "grad_norm": 0.2041632465765231, "learning_rate": 2.7186603123293824e-05, "loss": 0.3795, "step": 1347 }, { "epoch": 4.25177025963808, "grad_norm": 0.22641717412431214, "learning_rate": 2.716898484726414e-05, "loss": 0.3778, "step": 1348 }, { "epoch": 4.254917387883556, "grad_norm": 0.1916050281481582, "learning_rate": 2.7151361172434447e-05, "loss": 0.3837, "step": 1349 }, { "epoch": 4.258064516129032, "grad_norm": 0.24137246015980246, "learning_rate": 2.713373211721196e-05, "loss": 0.3862, "step": 1350 }, { "epoch": 4.261211644374509, "grad_norm": 0.18174131822711345, "learning_rate": 2.711609770000955e-05, "loss": 0.3816, "step": 1351 }, { "epoch": 4.2643587726199845, "grad_norm": 0.2242923413154568, "learning_rate": 2.7098457939245654e-05, "loss": 0.3872, "step": 1352 }, { "epoch": 4.26750590086546, "grad_norm": 0.2235462568563175, "learning_rate": 2.7080812853344304e-05, "loss": 0.3996, "step": 1353 }, { "epoch": 4.270653029110936, "grad_norm": 0.24969061397705838, "learning_rate": 2.7063162460735103e-05, "loss": 0.3816, "step": 1354 }, { "epoch": 4.273800157356412, "grad_norm": 0.22715980857633994, "learning_rate": 2.7045506779853186e-05, "loss": 0.3852, "step": 1355 }, { "epoch": 4.276947285601889, "grad_norm": 0.1822166185884246, "learning_rate": 2.7027845829139202e-05, "loss": 0.3803, "step": 1356 }, { "epoch": 4.280094413847364, "grad_norm": 0.20932165509317502, "learning_rate": 2.7010179627039318e-05, "loss": 0.387, "step": 1357 }, { "epoch": 4.28324154209284, "grad_norm": 0.20308647316174985, "learning_rate": 2.699250819200519e-05, "loss": 0.3864, "step": 1358 }, { "epoch": 4.286388670338316, "grad_norm": 0.18217363195930258, "learning_rate": 2.6974831542493923e-05, "loss": 0.3802, "step": 1359 }, { "epoch": 4.289535798583792, "grad_norm": 0.186444098060944, "learning_rate": 2.6957149696968085e-05, "loss": 0.3848, "step": 1360 }, { "epoch": 4.2926829268292686, "grad_norm": 0.1844457489090321, "learning_rate": 2.6939462673895663e-05, "loss": 0.3812, "step": 1361 }, { "epoch": 4.295830055074744, "grad_norm": 0.2010822546007924, "learning_rate": 2.6921770491750044e-05, "loss": 0.3897, "step": 1362 }, { "epoch": 4.29897718332022, "grad_norm": 0.20222906345664107, "learning_rate": 2.690407316901002e-05, "loss": 0.3865, "step": 1363 }, { "epoch": 4.302124311565696, "grad_norm": 0.18971869892805163, "learning_rate": 2.6886370724159738e-05, "loss": 0.3854, "step": 1364 }, { "epoch": 4.305271439811173, "grad_norm": 0.1997246946094328, "learning_rate": 2.686866317568871e-05, "loss": 0.3868, "step": 1365 }, { "epoch": 4.3084185680566485, "grad_norm": 0.18961656604722107, "learning_rate": 2.685095054209176e-05, "loss": 0.3904, "step": 1366 }, { "epoch": 4.311565696302124, "grad_norm": 0.18958617985805862, "learning_rate": 2.6833232841869038e-05, "loss": 0.3832, "step": 1367 }, { "epoch": 4.3147128245476, "grad_norm": 0.20597877250679947, "learning_rate": 2.681551009352598e-05, "loss": 0.3794, "step": 1368 }, { "epoch": 4.317859952793077, "grad_norm": 0.2060368281881852, "learning_rate": 2.679778231557329e-05, "loss": 0.3845, "step": 1369 }, { "epoch": 4.321007081038553, "grad_norm": 0.23435327637660347, "learning_rate": 2.6780049526526934e-05, "loss": 0.392, "step": 1370 }, { "epoch": 4.324154209284028, "grad_norm": 0.19475763086784323, "learning_rate": 2.6762311744908106e-05, "loss": 0.387, "step": 1371 }, { "epoch": 4.327301337529504, "grad_norm": 0.191113901147632, "learning_rate": 2.674456898924322e-05, "loss": 0.3873, "step": 1372 }, { "epoch": 4.33044846577498, "grad_norm": 0.18932113213075516, "learning_rate": 2.6726821278063878e-05, "loss": 0.3815, "step": 1373 }, { "epoch": 4.333595594020457, "grad_norm": 0.18886476158101362, "learning_rate": 2.6709068629906867e-05, "loss": 0.3826, "step": 1374 }, { "epoch": 4.3367427222659325, "grad_norm": 0.19201665124650338, "learning_rate": 2.669131106331412e-05, "loss": 0.3926, "step": 1375 }, { "epoch": 4.339889850511408, "grad_norm": 0.20585223962817667, "learning_rate": 2.667354859683272e-05, "loss": 0.3902, "step": 1376 }, { "epoch": 4.343036978756884, "grad_norm": 0.18087305906555123, "learning_rate": 2.6655781249014843e-05, "loss": 0.3946, "step": 1377 }, { "epoch": 4.34618410700236, "grad_norm": 0.22199132776028002, "learning_rate": 2.6638009038417792e-05, "loss": 0.3883, "step": 1378 }, { "epoch": 4.349331235247837, "grad_norm": 0.20789032314071595, "learning_rate": 2.662023198360394e-05, "loss": 0.3863, "step": 1379 }, { "epoch": 4.352478363493312, "grad_norm": 0.19784286066300738, "learning_rate": 2.6602450103140713e-05, "loss": 0.3964, "step": 1380 }, { "epoch": 4.355625491738788, "grad_norm": 0.24791934452491476, "learning_rate": 2.6584663415600583e-05, "loss": 0.3862, "step": 1381 }, { "epoch": 4.358772619984264, "grad_norm": 0.19522453242970436, "learning_rate": 2.656687193956104e-05, "loss": 0.3907, "step": 1382 }, { "epoch": 4.361919748229741, "grad_norm": 0.21752375333467266, "learning_rate": 2.6549075693604575e-05, "loss": 0.3864, "step": 1383 }, { "epoch": 4.3650668764752165, "grad_norm": 0.20160529341364714, "learning_rate": 2.6531274696318664e-05, "loss": 0.3965, "step": 1384 }, { "epoch": 4.368214004720692, "grad_norm": 0.18568303741674552, "learning_rate": 2.6513468966295737e-05, "loss": 0.3885, "step": 1385 }, { "epoch": 4.371361132966168, "grad_norm": 0.2243222479567149, "learning_rate": 2.649565852213318e-05, "loss": 0.3868, "step": 1386 }, { "epoch": 4.374508261211645, "grad_norm": 0.19214945590700291, "learning_rate": 2.6477843382433302e-05, "loss": 0.3911, "step": 1387 }, { "epoch": 4.377655389457121, "grad_norm": 0.20258061934369762, "learning_rate": 2.6460023565803305e-05, "loss": 0.3823, "step": 1388 }, { "epoch": 4.380802517702596, "grad_norm": 0.2124917879318387, "learning_rate": 2.644219909085528e-05, "loss": 0.386, "step": 1389 }, { "epoch": 4.383949645948072, "grad_norm": 0.1907323815413866, "learning_rate": 2.642436997620619e-05, "loss": 0.3912, "step": 1390 }, { "epoch": 4.387096774193548, "grad_norm": 0.201912862490839, "learning_rate": 2.6406536240477835e-05, "loss": 0.3869, "step": 1391 }, { "epoch": 4.390243902439025, "grad_norm": 0.1897846042537431, "learning_rate": 2.6388697902296848e-05, "loss": 0.3836, "step": 1392 }, { "epoch": 4.3933910306845005, "grad_norm": 0.22784186892176736, "learning_rate": 2.637085498029467e-05, "loss": 0.3838, "step": 1393 }, { "epoch": 4.396538158929976, "grad_norm": 0.20376206510573175, "learning_rate": 2.6353007493107517e-05, "loss": 0.3942, "step": 1394 }, { "epoch": 4.399685287175452, "grad_norm": 0.19112546192154864, "learning_rate": 2.6335155459376395e-05, "loss": 0.3978, "step": 1395 }, { "epoch": 4.402832415420928, "grad_norm": 0.20554479838475062, "learning_rate": 2.6317298897747033e-05, "loss": 0.3971, "step": 1396 }, { "epoch": 4.405979543666405, "grad_norm": 0.19449262983527554, "learning_rate": 2.6299437826869923e-05, "loss": 0.3815, "step": 1397 }, { "epoch": 4.4091266719118805, "grad_norm": 0.22156499828526832, "learning_rate": 2.6281572265400223e-05, "loss": 0.3866, "step": 1398 }, { "epoch": 4.412273800157356, "grad_norm": 0.22531435072563047, "learning_rate": 2.6263702231997824e-05, "loss": 0.3807, "step": 1399 }, { "epoch": 4.415420928402832, "grad_norm": 0.1879250329188261, "learning_rate": 2.624582774532725e-05, "loss": 0.3943, "step": 1400 }, { "epoch": 4.418568056648309, "grad_norm": 0.2321072669481776, "learning_rate": 2.6227948824057712e-05, "loss": 0.3808, "step": 1401 }, { "epoch": 4.421715184893785, "grad_norm": 0.19421971388680198, "learning_rate": 2.6210065486863018e-05, "loss": 0.3868, "step": 1402 }, { "epoch": 4.42486231313926, "grad_norm": 0.23006793771084283, "learning_rate": 2.6192177752421627e-05, "loss": 0.3942, "step": 1403 }, { "epoch": 4.428009441384736, "grad_norm": 0.253444666264355, "learning_rate": 2.617428563941655e-05, "loss": 0.3833, "step": 1404 }, { "epoch": 4.431156569630213, "grad_norm": 0.21355125032513483, "learning_rate": 2.61563891665354e-05, "loss": 0.3897, "step": 1405 }, { "epoch": 4.434303697875689, "grad_norm": 0.19507808857241118, "learning_rate": 2.613848835247033e-05, "loss": 0.3825, "step": 1406 }, { "epoch": 4.4374508261211645, "grad_norm": 0.22181377156510734, "learning_rate": 2.6120583215918038e-05, "loss": 0.3944, "step": 1407 }, { "epoch": 4.44059795436664, "grad_norm": 0.17048806847005354, "learning_rate": 2.6102673775579724e-05, "loss": 0.3915, "step": 1408 }, { "epoch": 4.443745082612116, "grad_norm": 0.2068835428974255, "learning_rate": 2.6084760050161097e-05, "loss": 0.3854, "step": 1409 }, { "epoch": 4.446892210857593, "grad_norm": 0.2207189101858155, "learning_rate": 2.606684205837232e-05, "loss": 0.3831, "step": 1410 }, { "epoch": 4.450039339103069, "grad_norm": 0.18810634531927864, "learning_rate": 2.6048919818928034e-05, "loss": 0.3791, "step": 1411 }, { "epoch": 4.453186467348544, "grad_norm": 0.20997580291783224, "learning_rate": 2.6030993350547316e-05, "loss": 0.3886, "step": 1412 }, { "epoch": 4.45633359559402, "grad_norm": 0.18183918340442795, "learning_rate": 2.6013062671953645e-05, "loss": 0.3861, "step": 1413 }, { "epoch": 4.459480723839496, "grad_norm": 0.1991423678980766, "learning_rate": 2.59951278018749e-05, "loss": 0.3867, "step": 1414 }, { "epoch": 4.462627852084973, "grad_norm": 0.2161981116208714, "learning_rate": 2.597718875904335e-05, "loss": 0.393, "step": 1415 }, { "epoch": 4.4657749803304485, "grad_norm": 0.1851824205661574, "learning_rate": 2.5959245562195615e-05, "loss": 0.3883, "step": 1416 }, { "epoch": 4.468922108575924, "grad_norm": 0.20523032246874873, "learning_rate": 2.594129823007265e-05, "loss": 0.3949, "step": 1417 }, { "epoch": 4.4720692368214, "grad_norm": 0.22841297022323917, "learning_rate": 2.592334678141973e-05, "loss": 0.3896, "step": 1418 }, { "epoch": 4.475216365066877, "grad_norm": 0.20597518241097829, "learning_rate": 2.5905391234986445e-05, "loss": 0.3967, "step": 1419 }, { "epoch": 4.478363493312353, "grad_norm": 0.200755938384984, "learning_rate": 2.5887431609526637e-05, "loss": 0.382, "step": 1420 }, { "epoch": 4.481510621557828, "grad_norm": 0.23257612111660253, "learning_rate": 2.586946792379844e-05, "loss": 0.3903, "step": 1421 }, { "epoch": 4.484657749803304, "grad_norm": 0.2071736737546111, "learning_rate": 2.585150019656419e-05, "loss": 0.3865, "step": 1422 }, { "epoch": 4.487804878048781, "grad_norm": 0.18980283295679137, "learning_rate": 2.5833528446590494e-05, "loss": 0.3876, "step": 1423 }, { "epoch": 4.490952006294257, "grad_norm": 0.20509626159829664, "learning_rate": 2.581555269264811e-05, "loss": 0.3858, "step": 1424 }, { "epoch": 4.4940991345397325, "grad_norm": 0.1972045387860757, "learning_rate": 2.5797572953512014e-05, "loss": 0.3897, "step": 1425 }, { "epoch": 4.497246262785208, "grad_norm": 0.21509560076923515, "learning_rate": 2.5779589247961326e-05, "loss": 0.3904, "step": 1426 }, { "epoch": 4.500393391030684, "grad_norm": 0.19228646717933973, "learning_rate": 2.576160159477932e-05, "loss": 0.3918, "step": 1427 }, { "epoch": 4.503540519276161, "grad_norm": 0.1795849927351903, "learning_rate": 2.5743610012753375e-05, "loss": 0.3953, "step": 1428 }, { "epoch": 4.506687647521637, "grad_norm": 0.20489830860011532, "learning_rate": 2.5725614520675003e-05, "loss": 0.3919, "step": 1429 }, { "epoch": 4.5098347757671124, "grad_norm": 0.21882006418429392, "learning_rate": 2.5707615137339774e-05, "loss": 0.3938, "step": 1430 }, { "epoch": 4.512981904012588, "grad_norm": 0.19280187229135973, "learning_rate": 2.5689611881547333e-05, "loss": 0.3851, "step": 1431 }, { "epoch": 4.516129032258064, "grad_norm": 0.20929709845343097, "learning_rate": 2.5671604772101364e-05, "loss": 0.3869, "step": 1432 }, { "epoch": 4.519276160503541, "grad_norm": 0.19034278382916123, "learning_rate": 2.565359382780959e-05, "loss": 0.3892, "step": 1433 }, { "epoch": 4.522423288749017, "grad_norm": 0.1955253897676763, "learning_rate": 2.5635579067483716e-05, "loss": 0.3948, "step": 1434 }, { "epoch": 4.525570416994492, "grad_norm": 0.21006893895734358, "learning_rate": 2.5617560509939453e-05, "loss": 0.3902, "step": 1435 }, { "epoch": 4.528717545239968, "grad_norm": 0.18703258921272223, "learning_rate": 2.5599538173996466e-05, "loss": 0.3945, "step": 1436 }, { "epoch": 4.531864673485445, "grad_norm": 0.17700701860895593, "learning_rate": 2.5581512078478384e-05, "loss": 0.3872, "step": 1437 }, { "epoch": 4.535011801730921, "grad_norm": 0.1904737922390524, "learning_rate": 2.5563482242212735e-05, "loss": 0.3918, "step": 1438 }, { "epoch": 4.5381589299763965, "grad_norm": 0.19827598790350312, "learning_rate": 2.554544868403098e-05, "loss": 0.3936, "step": 1439 }, { "epoch": 4.541306058221872, "grad_norm": 0.16151435086680768, "learning_rate": 2.5527411422768454e-05, "loss": 0.3915, "step": 1440 }, { "epoch": 4.544453186467349, "grad_norm": 0.18657335720318344, "learning_rate": 2.5509370477264358e-05, "loss": 0.3919, "step": 1441 }, { "epoch": 4.547600314712825, "grad_norm": 0.17941003140458014, "learning_rate": 2.5491325866361737e-05, "loss": 0.3876, "step": 1442 }, { "epoch": 4.550747442958301, "grad_norm": 0.17734405961341654, "learning_rate": 2.547327760890749e-05, "loss": 0.3982, "step": 1443 }, { "epoch": 4.553894571203776, "grad_norm": 0.17524164503141434, "learning_rate": 2.5455225723752308e-05, "loss": 0.3858, "step": 1444 }, { "epoch": 4.557041699449252, "grad_norm": 0.17166835200788202, "learning_rate": 2.5437170229750655e-05, "loss": 0.3926, "step": 1445 }, { "epoch": 4.560188827694729, "grad_norm": 0.17236320310568598, "learning_rate": 2.541911114576079e-05, "loss": 0.3917, "step": 1446 }, { "epoch": 4.563335955940205, "grad_norm": 0.16586917527643763, "learning_rate": 2.5401048490644713e-05, "loss": 0.3905, "step": 1447 }, { "epoch": 4.5664830841856805, "grad_norm": 0.1749389748686086, "learning_rate": 2.538298228326814e-05, "loss": 0.3943, "step": 1448 }, { "epoch": 4.569630212431156, "grad_norm": 0.18445523358454083, "learning_rate": 2.536491254250052e-05, "loss": 0.3809, "step": 1449 }, { "epoch": 4.572777340676632, "grad_norm": 0.18404046799235896, "learning_rate": 2.534683928721498e-05, "loss": 0.3937, "step": 1450 }, { "epoch": 4.575924468922109, "grad_norm": 0.17788876905444853, "learning_rate": 2.532876253628831e-05, "loss": 0.3835, "step": 1451 }, { "epoch": 4.579071597167585, "grad_norm": 0.18602157578478964, "learning_rate": 2.5310682308600976e-05, "loss": 0.3943, "step": 1452 }, { "epoch": 4.58221872541306, "grad_norm": 0.17232298418778785, "learning_rate": 2.5292598623037057e-05, "loss": 0.3851, "step": 1453 }, { "epoch": 4.585365853658536, "grad_norm": 0.1905608180973461, "learning_rate": 2.5274511498484236e-05, "loss": 0.3826, "step": 1454 }, { "epoch": 4.588512981904013, "grad_norm": 0.17927303844283918, "learning_rate": 2.5256420953833813e-05, "loss": 0.3817, "step": 1455 }, { "epoch": 4.591660110149489, "grad_norm": 0.20088651754247414, "learning_rate": 2.5238327007980635e-05, "loss": 0.3862, "step": 1456 }, { "epoch": 4.5948072383949645, "grad_norm": 0.1800905518727353, "learning_rate": 2.5220229679823113e-05, "loss": 0.3935, "step": 1457 }, { "epoch": 4.59795436664044, "grad_norm": 0.19383388730097495, "learning_rate": 2.5202128988263183e-05, "loss": 0.4014, "step": 1458 }, { "epoch": 4.601101494885917, "grad_norm": 0.20016364463516104, "learning_rate": 2.5184024952206315e-05, "loss": 0.3904, "step": 1459 }, { "epoch": 4.604248623131393, "grad_norm": 0.19534288874997346, "learning_rate": 2.5165917590561453e-05, "loss": 0.3884, "step": 1460 }, { "epoch": 4.607395751376869, "grad_norm": 0.19845903664537287, "learning_rate": 2.514780692224102e-05, "loss": 0.3886, "step": 1461 }, { "epoch": 4.610542879622344, "grad_norm": 0.18935447457983023, "learning_rate": 2.5129692966160887e-05, "loss": 0.3847, "step": 1462 }, { "epoch": 4.61369000786782, "grad_norm": 0.1825215910021681, "learning_rate": 2.511157574124037e-05, "loss": 0.396, "step": 1463 }, { "epoch": 4.616837136113297, "grad_norm": 0.1912881105818978, "learning_rate": 2.5093455266402185e-05, "loss": 0.3891, "step": 1464 }, { "epoch": 4.619984264358773, "grad_norm": 0.18458695403611322, "learning_rate": 2.507533156057246e-05, "loss": 0.3951, "step": 1465 }, { "epoch": 4.6231313926042485, "grad_norm": 0.17916510574048766, "learning_rate": 2.5057204642680684e-05, "loss": 0.3915, "step": 1466 }, { "epoch": 4.626278520849724, "grad_norm": 0.17110403262888976, "learning_rate": 2.50390745316597e-05, "loss": 0.3845, "step": 1467 }, { "epoch": 4.6294256490952, "grad_norm": 0.17903953256524813, "learning_rate": 2.50209412464457e-05, "loss": 0.383, "step": 1468 }, { "epoch": 4.632572777340677, "grad_norm": 0.17999296179047053, "learning_rate": 2.5002804805978177e-05, "loss": 0.3944, "step": 1469 }, { "epoch": 4.635719905586153, "grad_norm": 0.17581481498146168, "learning_rate": 2.498466522919993e-05, "loss": 0.3892, "step": 1470 }, { "epoch": 4.6388670338316285, "grad_norm": 0.1783825034649337, "learning_rate": 2.4966522535057024e-05, "loss": 0.3891, "step": 1471 }, { "epoch": 4.642014162077104, "grad_norm": 0.18377927440718408, "learning_rate": 2.494837674249878e-05, "loss": 0.3903, "step": 1472 }, { "epoch": 4.645161290322581, "grad_norm": 0.18349514371989203, "learning_rate": 2.4930227870477773e-05, "loss": 0.3902, "step": 1473 }, { "epoch": 4.648308418568057, "grad_norm": 0.1763906966839621, "learning_rate": 2.491207593794977e-05, "loss": 0.3857, "step": 1474 }, { "epoch": 4.651455546813533, "grad_norm": 0.17906752178646956, "learning_rate": 2.4893920963873746e-05, "loss": 0.3908, "step": 1475 }, { "epoch": 4.654602675059008, "grad_norm": 0.1679211823950241, "learning_rate": 2.487576296721186e-05, "loss": 0.3955, "step": 1476 }, { "epoch": 4.657749803304485, "grad_norm": 0.1755347291395844, "learning_rate": 2.485760196692942e-05, "loss": 0.3916, "step": 1477 }, { "epoch": 4.660896931549961, "grad_norm": 0.16465166030319364, "learning_rate": 2.4839437981994867e-05, "loss": 0.3903, "step": 1478 }, { "epoch": 4.664044059795437, "grad_norm": 0.17776543423913058, "learning_rate": 2.4821271031379765e-05, "loss": 0.394, "step": 1479 }, { "epoch": 4.6671911880409125, "grad_norm": 0.1781942266460176, "learning_rate": 2.4803101134058775e-05, "loss": 0.395, "step": 1480 }, { "epoch": 4.670338316286388, "grad_norm": 0.17946693783829906, "learning_rate": 2.478492830900964e-05, "loss": 0.394, "step": 1481 }, { "epoch": 4.673485444531865, "grad_norm": 0.1919149254694885, "learning_rate": 2.4766752575213146e-05, "loss": 0.3904, "step": 1482 }, { "epoch": 4.676632572777341, "grad_norm": 0.16955030838772125, "learning_rate": 2.4748573951653132e-05, "loss": 0.388, "step": 1483 }, { "epoch": 4.679779701022817, "grad_norm": 0.2002044781358719, "learning_rate": 2.473039245731646e-05, "loss": 0.3934, "step": 1484 }, { "epoch": 4.682926829268292, "grad_norm": 0.17693479469518242, "learning_rate": 2.4712208111192965e-05, "loss": 0.3908, "step": 1485 }, { "epoch": 4.686073957513768, "grad_norm": 0.19217906289741862, "learning_rate": 2.4694020932275483e-05, "loss": 0.3816, "step": 1486 }, { "epoch": 4.689221085759245, "grad_norm": 0.1818600906270269, "learning_rate": 2.467583093955981e-05, "loss": 0.3894, "step": 1487 }, { "epoch": 4.692368214004721, "grad_norm": 0.2030839509068234, "learning_rate": 2.4657638152044667e-05, "loss": 0.3868, "step": 1488 }, { "epoch": 4.6955153422501965, "grad_norm": 0.18302211645178032, "learning_rate": 2.4639442588731695e-05, "loss": 0.3894, "step": 1489 }, { "epoch": 4.698662470495672, "grad_norm": 0.17993702783679505, "learning_rate": 2.4621244268625448e-05, "loss": 0.393, "step": 1490 }, { "epoch": 4.701809598741149, "grad_norm": 0.1836591030041654, "learning_rate": 2.4603043210733343e-05, "loss": 0.3936, "step": 1491 }, { "epoch": 4.704956726986625, "grad_norm": 0.17969588856182217, "learning_rate": 2.4584839434065675e-05, "loss": 0.3896, "step": 1492 }, { "epoch": 4.708103855232101, "grad_norm": 0.18627363426302215, "learning_rate": 2.4566632957635555e-05, "loss": 0.3963, "step": 1493 }, { "epoch": 4.711250983477576, "grad_norm": 0.16348207104757354, "learning_rate": 2.454842380045894e-05, "loss": 0.38, "step": 1494 }, { "epoch": 4.714398111723053, "grad_norm": 0.1932898447384366, "learning_rate": 2.453021198155456e-05, "loss": 0.3915, "step": 1495 }, { "epoch": 4.717545239968529, "grad_norm": 0.17318365673903224, "learning_rate": 2.451199751994395e-05, "loss": 0.3942, "step": 1496 }, { "epoch": 4.720692368214005, "grad_norm": 0.18679178875572805, "learning_rate": 2.449378043465139e-05, "loss": 0.3916, "step": 1497 }, { "epoch": 4.7238394964594805, "grad_norm": 0.18301203217504775, "learning_rate": 2.44755607447039e-05, "loss": 0.3958, "step": 1498 }, { "epoch": 4.726986624704956, "grad_norm": 0.18206504927748668, "learning_rate": 2.4457338469131235e-05, "loss": 0.3935, "step": 1499 }, { "epoch": 4.730133752950433, "grad_norm": 0.19626381276499383, "learning_rate": 2.4439113626965832e-05, "loss": 0.3921, "step": 1500 }, { "epoch": 4.733280881195909, "grad_norm": 0.209994269745939, "learning_rate": 2.4420886237242812e-05, "loss": 0.3896, "step": 1501 }, { "epoch": 4.736428009441385, "grad_norm": 0.20966234827738384, "learning_rate": 2.440265631899998e-05, "loss": 0.3872, "step": 1502 }, { "epoch": 4.7395751376868605, "grad_norm": 0.17956697728831314, "learning_rate": 2.438442389127775e-05, "loss": 0.3905, "step": 1503 }, { "epoch": 4.742722265932336, "grad_norm": 0.230149257151023, "learning_rate": 2.4366188973119173e-05, "loss": 0.3942, "step": 1504 }, { "epoch": 4.745869394177813, "grad_norm": 0.16446551541896534, "learning_rate": 2.43479515835699e-05, "loss": 0.3971, "step": 1505 }, { "epoch": 4.749016522423289, "grad_norm": 0.21606059095007257, "learning_rate": 2.4329711741678158e-05, "loss": 0.3971, "step": 1506 }, { "epoch": 4.752163650668765, "grad_norm": 0.18761423316527837, "learning_rate": 2.4311469466494747e-05, "loss": 0.3822, "step": 1507 }, { "epoch": 4.755310778914241, "grad_norm": 0.20024056830304296, "learning_rate": 2.429322477707299e-05, "loss": 0.394, "step": 1508 }, { "epoch": 4.758457907159717, "grad_norm": 0.20562507776072017, "learning_rate": 2.4274977692468765e-05, "loss": 0.3895, "step": 1509 }, { "epoch": 4.761605035405193, "grad_norm": 0.1760613692982041, "learning_rate": 2.4256728231740406e-05, "loss": 0.3999, "step": 1510 }, { "epoch": 4.764752163650669, "grad_norm": 0.22523629907709936, "learning_rate": 2.423847641394877e-05, "loss": 0.3881, "step": 1511 }, { "epoch": 4.7678992918961445, "grad_norm": 0.18831716496370055, "learning_rate": 2.422022225815714e-05, "loss": 0.394, "step": 1512 }, { "epoch": 4.771046420141621, "grad_norm": 0.22513412157385423, "learning_rate": 2.4201965783431267e-05, "loss": 0.3875, "step": 1513 }, { "epoch": 4.774193548387097, "grad_norm": 0.21031494310694007, "learning_rate": 2.4183707008839323e-05, "loss": 0.3775, "step": 1514 }, { "epoch": 4.777340676632573, "grad_norm": 0.21590978367024655, "learning_rate": 2.4165445953451867e-05, "loss": 0.3899, "step": 1515 }, { "epoch": 4.780487804878049, "grad_norm": 0.22564896784384017, "learning_rate": 2.414718263634185e-05, "loss": 0.3913, "step": 1516 }, { "epoch": 4.783634933123524, "grad_norm": 0.19020615700820612, "learning_rate": 2.4128917076584587e-05, "loss": 0.3944, "step": 1517 }, { "epoch": 4.786782061369001, "grad_norm": 0.22953746850079004, "learning_rate": 2.4110649293257728e-05, "loss": 0.3986, "step": 1518 }, { "epoch": 4.789929189614477, "grad_norm": 0.1805102124166518, "learning_rate": 2.4092379305441252e-05, "loss": 0.3898, "step": 1519 }, { "epoch": 4.793076317859953, "grad_norm": 0.2268972739180654, "learning_rate": 2.407410713221743e-05, "loss": 0.3938, "step": 1520 }, { "epoch": 4.7962234461054285, "grad_norm": 0.19050233625931567, "learning_rate": 2.4055832792670842e-05, "loss": 0.3924, "step": 1521 }, { "epoch": 4.799370574350904, "grad_norm": 0.1922960780024949, "learning_rate": 2.4037556305888288e-05, "loss": 0.3813, "step": 1522 }, { "epoch": 4.802517702596381, "grad_norm": 0.1898465812093314, "learning_rate": 2.4019277690958856e-05, "loss": 0.3939, "step": 1523 }, { "epoch": 4.805664830841857, "grad_norm": 0.18381007690254716, "learning_rate": 2.4000996966973817e-05, "loss": 0.394, "step": 1524 }, { "epoch": 4.808811959087333, "grad_norm": 0.2103100782295867, "learning_rate": 2.398271415302668e-05, "loss": 0.3897, "step": 1525 }, { "epoch": 4.811959087332809, "grad_norm": 0.16769183783145522, "learning_rate": 2.3964429268213115e-05, "loss": 0.3972, "step": 1526 }, { "epoch": 4.815106215578285, "grad_norm": 0.19945262652056686, "learning_rate": 2.3946142331630955e-05, "loss": 0.3941, "step": 1527 }, { "epoch": 4.818253343823761, "grad_norm": 0.1700848589131017, "learning_rate": 2.392785336238019e-05, "loss": 0.3902, "step": 1528 }, { "epoch": 4.821400472069237, "grad_norm": 0.1656556368956296, "learning_rate": 2.390956237956291e-05, "loss": 0.3933, "step": 1529 }, { "epoch": 4.8245476003147125, "grad_norm": 0.18373049425460136, "learning_rate": 2.389126940228333e-05, "loss": 0.3956, "step": 1530 }, { "epoch": 4.827694728560189, "grad_norm": 0.17964107945591998, "learning_rate": 2.387297444964775e-05, "loss": 0.3871, "step": 1531 }, { "epoch": 4.830841856805665, "grad_norm": 0.17450741916661142, "learning_rate": 2.385467754076451e-05, "loss": 0.3788, "step": 1532 }, { "epoch": 4.833988985051141, "grad_norm": 0.18365024872291147, "learning_rate": 2.3836378694744014e-05, "loss": 0.3986, "step": 1533 }, { "epoch": 4.837136113296617, "grad_norm": 0.18009519290000942, "learning_rate": 2.3818077930698683e-05, "loss": 0.4009, "step": 1534 }, { "epoch": 4.840283241542092, "grad_norm": 0.1962147329229101, "learning_rate": 2.3799775267742934e-05, "loss": 0.3919, "step": 1535 }, { "epoch": 4.843430369787569, "grad_norm": 0.1814959883127099, "learning_rate": 2.3781470724993186e-05, "loss": 0.3894, "step": 1536 }, { "epoch": 4.846577498033045, "grad_norm": 0.19522141468087864, "learning_rate": 2.376316432156779e-05, "loss": 0.3915, "step": 1537 }, { "epoch": 4.849724626278521, "grad_norm": 0.1824952868828727, "learning_rate": 2.3744856076587076e-05, "loss": 0.396, "step": 1538 }, { "epoch": 4.8528717545239966, "grad_norm": 0.19970076246737523, "learning_rate": 2.3726546009173275e-05, "loss": 0.3975, "step": 1539 }, { "epoch": 4.856018882769473, "grad_norm": 0.1788577313593828, "learning_rate": 2.3708234138450518e-05, "loss": 0.3888, "step": 1540 }, { "epoch": 4.859166011014949, "grad_norm": 0.2006715626887735, "learning_rate": 2.368992048354485e-05, "loss": 0.3904, "step": 1541 }, { "epoch": 4.862313139260425, "grad_norm": 0.1655853950892595, "learning_rate": 2.3671605063584147e-05, "loss": 0.3917, "step": 1542 }, { "epoch": 4.865460267505901, "grad_norm": 0.20564415582976606, "learning_rate": 2.3653287897698135e-05, "loss": 0.3935, "step": 1543 }, { "epoch": 4.868607395751377, "grad_norm": 0.16818135554428862, "learning_rate": 2.3634969005018377e-05, "loss": 0.39, "step": 1544 }, { "epoch": 4.871754523996853, "grad_norm": 0.17762234599652035, "learning_rate": 2.361664840467823e-05, "loss": 0.3926, "step": 1545 }, { "epoch": 4.874901652242329, "grad_norm": 0.1932106423562817, "learning_rate": 2.359832611581283e-05, "loss": 0.385, "step": 1546 }, { "epoch": 4.878048780487805, "grad_norm": 0.186869483874634, "learning_rate": 2.358000215755909e-05, "loss": 0.388, "step": 1547 }, { "epoch": 4.881195908733281, "grad_norm": 0.18001846076878525, "learning_rate": 2.3561676549055646e-05, "loss": 0.3915, "step": 1548 }, { "epoch": 4.884343036978757, "grad_norm": 0.1920048629271822, "learning_rate": 2.3543349309442887e-05, "loss": 0.392, "step": 1549 }, { "epoch": 4.887490165224233, "grad_norm": 0.17088394010939248, "learning_rate": 2.3525020457862878e-05, "loss": 0.3964, "step": 1550 }, { "epoch": 4.890637293469709, "grad_norm": 0.19786920135937375, "learning_rate": 2.3506690013459376e-05, "loss": 0.3843, "step": 1551 }, { "epoch": 4.893784421715185, "grad_norm": 0.18033263963836252, "learning_rate": 2.348835799537782e-05, "loss": 0.3951, "step": 1552 }, { "epoch": 4.8969315499606605, "grad_norm": 0.20462979636333165, "learning_rate": 2.3470024422765267e-05, "loss": 0.3913, "step": 1553 }, { "epoch": 4.900078678206137, "grad_norm": 0.1679693396240305, "learning_rate": 2.3451689314770404e-05, "loss": 0.3933, "step": 1554 }, { "epoch": 4.903225806451613, "grad_norm": 0.18705636136779824, "learning_rate": 2.3433352690543533e-05, "loss": 0.3875, "step": 1555 }, { "epoch": 4.906372934697089, "grad_norm": 0.1725007429553329, "learning_rate": 2.3415014569236522e-05, "loss": 0.3922, "step": 1556 }, { "epoch": 4.909520062942565, "grad_norm": 0.19115101454023312, "learning_rate": 2.3396674970002824e-05, "loss": 0.3865, "step": 1557 }, { "epoch": 4.912667191188041, "grad_norm": 0.17053442035080676, "learning_rate": 2.337833391199742e-05, "loss": 0.3992, "step": 1558 }, { "epoch": 4.915814319433517, "grad_norm": 0.1821548023265103, "learning_rate": 2.3359991414376814e-05, "loss": 0.388, "step": 1559 }, { "epoch": 4.918961447678993, "grad_norm": 0.1793151595611094, "learning_rate": 2.3341647496299025e-05, "loss": 0.3893, "step": 1560 }, { "epoch": 4.922108575924469, "grad_norm": 0.180962371465097, "learning_rate": 2.3323302176923552e-05, "loss": 0.3948, "step": 1561 }, { "epoch": 4.925255704169945, "grad_norm": 0.20297348545826346, "learning_rate": 2.3304955475411348e-05, "loss": 0.3846, "step": 1562 }, { "epoch": 4.928402832415421, "grad_norm": 0.17305830862076746, "learning_rate": 2.3286607410924815e-05, "loss": 0.3879, "step": 1563 }, { "epoch": 4.931549960660897, "grad_norm": 0.1913355603249613, "learning_rate": 2.3268258002627778e-05, "loss": 0.394, "step": 1564 }, { "epoch": 4.934697088906373, "grad_norm": 0.19290416318807532, "learning_rate": 2.3249907269685473e-05, "loss": 0.3894, "step": 1565 }, { "epoch": 4.937844217151849, "grad_norm": 0.18295228802636587, "learning_rate": 2.3231555231264525e-05, "loss": 0.3948, "step": 1566 }, { "epoch": 4.940991345397325, "grad_norm": 0.17938423256259314, "learning_rate": 2.3213201906532895e-05, "loss": 0.3899, "step": 1567 }, { "epoch": 4.944138473642801, "grad_norm": 0.17823287523578704, "learning_rate": 2.3194847314659908e-05, "loss": 0.3903, "step": 1568 }, { "epoch": 4.947285601888277, "grad_norm": 0.19013188360587602, "learning_rate": 2.3176491474816207e-05, "loss": 0.3892, "step": 1569 }, { "epoch": 4.950432730133753, "grad_norm": 0.16191899543672794, "learning_rate": 2.3158134406173742e-05, "loss": 0.3901, "step": 1570 }, { "epoch": 4.9535798583792285, "grad_norm": 0.18934530299370167, "learning_rate": 2.3139776127905745e-05, "loss": 0.392, "step": 1571 }, { "epoch": 4.956726986624705, "grad_norm": 0.18625354508534378, "learning_rate": 2.312141665918671e-05, "loss": 0.393, "step": 1572 }, { "epoch": 4.959874114870181, "grad_norm": 0.1716080119401114, "learning_rate": 2.3103056019192373e-05, "loss": 0.3934, "step": 1573 }, { "epoch": 4.963021243115657, "grad_norm": 0.17658405885657685, "learning_rate": 2.3084694227099704e-05, "loss": 0.3929, "step": 1574 }, { "epoch": 4.966168371361133, "grad_norm": 0.16588933283792434, "learning_rate": 2.3066331302086858e-05, "loss": 0.3994, "step": 1575 }, { "epoch": 4.969315499606609, "grad_norm": 0.17724140434357114, "learning_rate": 2.3047967263333192e-05, "loss": 0.3866, "step": 1576 }, { "epoch": 4.972462627852085, "grad_norm": 0.16829639574698707, "learning_rate": 2.3029602130019208e-05, "loss": 0.3939, "step": 1577 }, { "epoch": 4.975609756097561, "grad_norm": 0.1696857849206108, "learning_rate": 2.301123592132657e-05, "loss": 0.3942, "step": 1578 }, { "epoch": 4.978756884343037, "grad_norm": 0.1799379463657916, "learning_rate": 2.2992868656438046e-05, "loss": 0.3877, "step": 1579 }, { "epoch": 4.9819040125885135, "grad_norm": 0.17173263518727672, "learning_rate": 2.297450035453752e-05, "loss": 0.3906, "step": 1580 }, { "epoch": 4.985051140833989, "grad_norm": 0.16263371788270237, "learning_rate": 2.2956131034809957e-05, "loss": 0.3943, "step": 1581 }, { "epoch": 4.988198269079465, "grad_norm": 0.18145271609433958, "learning_rate": 2.293776071644139e-05, "loss": 0.3993, "step": 1582 }, { "epoch": 4.991345397324941, "grad_norm": 0.17931042976589995, "learning_rate": 2.291938941861888e-05, "loss": 0.3871, "step": 1583 }, { "epoch": 4.994492525570417, "grad_norm": 0.16386736102567098, "learning_rate": 2.290101716053053e-05, "loss": 0.3738, "step": 1584 }, { "epoch": 4.997639653815893, "grad_norm": 0.1709713086837328, "learning_rate": 2.288264396136543e-05, "loss": 0.3928, "step": 1585 }, { "epoch": 5.003147128245476, "grad_norm": 0.46162362322835515, "learning_rate": 2.2864269840313654e-05, "loss": 0.723, "step": 1586 }, { "epoch": 5.006294256490952, "grad_norm": 0.34659232101376264, "learning_rate": 2.284589481656625e-05, "loss": 0.3346, "step": 1587 }, { "epoch": 5.009441384736428, "grad_norm": 0.34172986216672346, "learning_rate": 2.2827518909315206e-05, "loss": 0.3367, "step": 1588 }, { "epoch": 5.012588512981904, "grad_norm": 0.4217601397562919, "learning_rate": 2.2809142137753422e-05, "loss": 0.3196, "step": 1589 }, { "epoch": 5.01573564122738, "grad_norm": 0.27498607831845434, "learning_rate": 2.2790764521074717e-05, "loss": 0.3274, "step": 1590 }, { "epoch": 5.018882769472856, "grad_norm": 0.337732177614199, "learning_rate": 2.2772386078473775e-05, "loss": 0.3283, "step": 1591 }, { "epoch": 5.022029897718332, "grad_norm": 0.31183657652743435, "learning_rate": 2.2754006829146155e-05, "loss": 0.3296, "step": 1592 }, { "epoch": 5.025177025963808, "grad_norm": 0.3478314549506538, "learning_rate": 2.2735626792288263e-05, "loss": 0.3268, "step": 1593 }, { "epoch": 5.028324154209284, "grad_norm": 0.321081647039957, "learning_rate": 2.27172459870973e-05, "loss": 0.3216, "step": 1594 }, { "epoch": 5.03147128245476, "grad_norm": 0.27287489022056094, "learning_rate": 2.2698864432771313e-05, "loss": 0.3324, "step": 1595 }, { "epoch": 5.034618410700236, "grad_norm": 0.3377680793066894, "learning_rate": 2.2680482148509092e-05, "loss": 0.33, "step": 1596 }, { "epoch": 5.037765538945712, "grad_norm": 0.3099962793181279, "learning_rate": 2.266209915351021e-05, "loss": 0.3208, "step": 1597 }, { "epoch": 5.040912667191188, "grad_norm": 0.25169801292292504, "learning_rate": 2.2643715466974975e-05, "loss": 0.3261, "step": 1598 }, { "epoch": 5.044059795436664, "grad_norm": 0.30251965731477554, "learning_rate": 2.2625331108104426e-05, "loss": 0.3217, "step": 1599 }, { "epoch": 5.04720692368214, "grad_norm": 0.23662305302926548, "learning_rate": 2.2606946096100294e-05, "loss": 0.3315, "step": 1600 }, { "epoch": 5.050354051927616, "grad_norm": 0.2651596494454096, "learning_rate": 2.258856045016499e-05, "loss": 0.3345, "step": 1601 }, { "epoch": 5.053501180173092, "grad_norm": 0.2637402059683434, "learning_rate": 2.2570174189501608e-05, "loss": 0.3269, "step": 1602 }, { "epoch": 5.056648308418568, "grad_norm": 0.23602081515972934, "learning_rate": 2.255178733331385e-05, "loss": 0.3229, "step": 1603 }, { "epoch": 5.059795436664044, "grad_norm": 0.25433290519235396, "learning_rate": 2.253339990080608e-05, "loss": 0.3191, "step": 1604 }, { "epoch": 5.06294256490952, "grad_norm": 0.23107096923107467, "learning_rate": 2.251501191118323e-05, "loss": 0.3356, "step": 1605 }, { "epoch": 5.066089693154996, "grad_norm": 0.25637351601908676, "learning_rate": 2.2496623383650828e-05, "loss": 0.3265, "step": 1606 }, { "epoch": 5.069236821400472, "grad_norm": 0.21472300935571184, "learning_rate": 2.2478234337414962e-05, "loss": 0.33, "step": 1607 }, { "epoch": 5.072383949645948, "grad_norm": 0.2338555364338194, "learning_rate": 2.245984479168227e-05, "loss": 0.3298, "step": 1608 }, { "epoch": 5.075531077891424, "grad_norm": 0.20022516446625999, "learning_rate": 2.2441454765659897e-05, "loss": 0.3342, "step": 1609 }, { "epoch": 5.0786782061369005, "grad_norm": 0.22488563846995296, "learning_rate": 2.2423064278555503e-05, "loss": 0.326, "step": 1610 }, { "epoch": 5.081825334382376, "grad_norm": 0.21302916365806326, "learning_rate": 2.2404673349577218e-05, "loss": 0.3282, "step": 1611 }, { "epoch": 5.084972462627852, "grad_norm": 0.21796018211989795, "learning_rate": 2.2386281997933646e-05, "loss": 0.3258, "step": 1612 }, { "epoch": 5.088119590873328, "grad_norm": 0.22425397553381501, "learning_rate": 2.2367890242833815e-05, "loss": 0.3297, "step": 1613 }, { "epoch": 5.091266719118804, "grad_norm": 0.19706564657591386, "learning_rate": 2.2349498103487197e-05, "loss": 0.3273, "step": 1614 }, { "epoch": 5.09441384736428, "grad_norm": 0.2071088006731519, "learning_rate": 2.233110559910365e-05, "loss": 0.3211, "step": 1615 }, { "epoch": 5.097560975609756, "grad_norm": 0.21348102050857448, "learning_rate": 2.2312712748893403e-05, "loss": 0.3232, "step": 1616 }, { "epoch": 5.100708103855232, "grad_norm": 0.18908349503115035, "learning_rate": 2.2294319572067082e-05, "loss": 0.3229, "step": 1617 }, { "epoch": 5.103855232100708, "grad_norm": 0.21575807977186254, "learning_rate": 2.2275926087835625e-05, "loss": 0.3229, "step": 1618 }, { "epoch": 5.1070023603461845, "grad_norm": 0.20410626875283436, "learning_rate": 2.2257532315410288e-05, "loss": 0.3261, "step": 1619 }, { "epoch": 5.11014948859166, "grad_norm": 0.23313876176564874, "learning_rate": 2.2239138274002642e-05, "loss": 0.3298, "step": 1620 }, { "epoch": 5.113296616837136, "grad_norm": 0.19848085470082366, "learning_rate": 2.2220743982824536e-05, "loss": 0.3244, "step": 1621 }, { "epoch": 5.116443745082612, "grad_norm": 0.23764358322337617, "learning_rate": 2.2202349461088084e-05, "loss": 0.3316, "step": 1622 }, { "epoch": 5.119590873328088, "grad_norm": 0.20745220698687916, "learning_rate": 2.2183954728005625e-05, "loss": 0.3225, "step": 1623 }, { "epoch": 5.122738001573564, "grad_norm": 0.223041599846075, "learning_rate": 2.216555980278974e-05, "loss": 0.3261, "step": 1624 }, { "epoch": 5.12588512981904, "grad_norm": 0.19422755456096907, "learning_rate": 2.2147164704653202e-05, "loss": 0.3271, "step": 1625 }, { "epoch": 5.129032258064516, "grad_norm": 0.20440115995525865, "learning_rate": 2.2128769452808956e-05, "loss": 0.3272, "step": 1626 }, { "epoch": 5.132179386309992, "grad_norm": 0.2187283808498755, "learning_rate": 2.211037406647011e-05, "loss": 0.3265, "step": 1627 }, { "epoch": 5.1353265145554685, "grad_norm": 0.2050591037215658, "learning_rate": 2.2091978564849926e-05, "loss": 0.3229, "step": 1628 }, { "epoch": 5.138473642800944, "grad_norm": 0.22811383006695085, "learning_rate": 2.2073582967161768e-05, "loss": 0.336, "step": 1629 }, { "epoch": 5.14162077104642, "grad_norm": 0.21037766403293978, "learning_rate": 2.2055187292619112e-05, "loss": 0.3234, "step": 1630 }, { "epoch": 5.144767899291896, "grad_norm": 0.22544584006363033, "learning_rate": 2.2036791560435522e-05, "loss": 0.3232, "step": 1631 }, { "epoch": 5.147915027537372, "grad_norm": 0.23307986768402664, "learning_rate": 2.20183957898246e-05, "loss": 0.3299, "step": 1632 }, { "epoch": 5.151062155782848, "grad_norm": 0.20220722711990272, "learning_rate": 2.2000000000000003e-05, "loss": 0.3215, "step": 1633 }, { "epoch": 5.154209284028324, "grad_norm": 0.21396702627741238, "learning_rate": 2.1981604210175407e-05, "loss": 0.3261, "step": 1634 }, { "epoch": 5.1573564122738, "grad_norm": 0.23101866923319364, "learning_rate": 2.196320843956449e-05, "loss": 0.3234, "step": 1635 }, { "epoch": 5.160503540519276, "grad_norm": 0.22249676919651665, "learning_rate": 2.1944812707380897e-05, "loss": 0.3278, "step": 1636 }, { "epoch": 5.1636506687647525, "grad_norm": 0.21159027665052352, "learning_rate": 2.1926417032838238e-05, "loss": 0.3261, "step": 1637 }, { "epoch": 5.166797797010228, "grad_norm": 0.20441901079236766, "learning_rate": 2.1908021435150083e-05, "loss": 0.3249, "step": 1638 }, { "epoch": 5.169944925255704, "grad_norm": 0.22690097885692212, "learning_rate": 2.18896259335299e-05, "loss": 0.3263, "step": 1639 }, { "epoch": 5.17309205350118, "grad_norm": 0.19474865782338907, "learning_rate": 2.1871230547191057e-05, "loss": 0.3241, "step": 1640 }, { "epoch": 5.176239181746656, "grad_norm": 0.24748820815778508, "learning_rate": 2.18528352953468e-05, "loss": 0.3293, "step": 1641 }, { "epoch": 5.1793863099921325, "grad_norm": 0.21000623423513556, "learning_rate": 2.1834440197210254e-05, "loss": 0.3396, "step": 1642 }, { "epoch": 5.182533438237608, "grad_norm": 0.2297339762152351, "learning_rate": 2.1816045271994377e-05, "loss": 0.3355, "step": 1643 }, { "epoch": 5.185680566483084, "grad_norm": 0.23065919694389042, "learning_rate": 2.1797650538911922e-05, "loss": 0.3266, "step": 1644 }, { "epoch": 5.18882769472856, "grad_norm": 0.21981603962817217, "learning_rate": 2.1779256017175473e-05, "loss": 0.3216, "step": 1645 }, { "epoch": 5.191974822974037, "grad_norm": 0.2608827800438322, "learning_rate": 2.1760861725997367e-05, "loss": 0.3191, "step": 1646 }, { "epoch": 5.195121951219512, "grad_norm": 0.19452350370213584, "learning_rate": 2.1742467684589725e-05, "loss": 0.3259, "step": 1647 }, { "epoch": 5.198269079464988, "grad_norm": 0.22996447660538494, "learning_rate": 2.1724073912164387e-05, "loss": 0.3284, "step": 1648 }, { "epoch": 5.201416207710464, "grad_norm": 0.22489712820890972, "learning_rate": 2.170568042793292e-05, "loss": 0.3229, "step": 1649 }, { "epoch": 5.20456333595594, "grad_norm": 0.2002513690412124, "learning_rate": 2.16872872511066e-05, "loss": 0.3335, "step": 1650 }, { "epoch": 5.2077104642014165, "grad_norm": 0.21768101783798655, "learning_rate": 2.166889440089636e-05, "loss": 0.3197, "step": 1651 }, { "epoch": 5.210857592446892, "grad_norm": 0.2105177118679401, "learning_rate": 2.165050189651281e-05, "loss": 0.3312, "step": 1652 }, { "epoch": 5.214004720692368, "grad_norm": 0.21009669854087792, "learning_rate": 2.163210975716619e-05, "loss": 0.3288, "step": 1653 }, { "epoch": 5.217151848937844, "grad_norm": 0.21015093693379167, "learning_rate": 2.1613718002066363e-05, "loss": 0.3296, "step": 1654 }, { "epoch": 5.220298977183321, "grad_norm": 0.22642270974424877, "learning_rate": 2.1595326650422784e-05, "loss": 0.325, "step": 1655 }, { "epoch": 5.223446105428796, "grad_norm": 0.20862201953387366, "learning_rate": 2.15769357214445e-05, "loss": 0.3287, "step": 1656 }, { "epoch": 5.226593233674272, "grad_norm": 0.23556943931498991, "learning_rate": 2.1558545234340108e-05, "loss": 0.3208, "step": 1657 }, { "epoch": 5.229740361919748, "grad_norm": 0.21273958624925166, "learning_rate": 2.1540155208317736e-05, "loss": 0.3254, "step": 1658 }, { "epoch": 5.232887490165224, "grad_norm": 0.21120587575901487, "learning_rate": 2.1521765662585047e-05, "loss": 0.3278, "step": 1659 }, { "epoch": 5.2360346184107005, "grad_norm": 0.2191912573575056, "learning_rate": 2.150337661634918e-05, "loss": 0.3275, "step": 1660 }, { "epoch": 5.239181746656176, "grad_norm": 0.18918312365625706, "learning_rate": 2.1484988088816784e-05, "loss": 0.3245, "step": 1661 }, { "epoch": 5.242328874901652, "grad_norm": 0.24442600792973201, "learning_rate": 2.146660009919393e-05, "loss": 0.3366, "step": 1662 }, { "epoch": 5.245476003147128, "grad_norm": 0.19190784043500905, "learning_rate": 2.1448212666686153e-05, "loss": 0.3235, "step": 1663 }, { "epoch": 5.248623131392605, "grad_norm": 0.19845803273670526, "learning_rate": 2.1429825810498405e-05, "loss": 0.3247, "step": 1664 }, { "epoch": 5.25177025963808, "grad_norm": 0.22683790832172754, "learning_rate": 2.141143954983502e-05, "loss": 0.3277, "step": 1665 }, { "epoch": 5.254917387883556, "grad_norm": 0.20007675897146535, "learning_rate": 2.1393053903899715e-05, "loss": 0.3293, "step": 1666 }, { "epoch": 5.258064516129032, "grad_norm": 0.22364967785365925, "learning_rate": 2.1374668891895586e-05, "loss": 0.3317, "step": 1667 }, { "epoch": 5.261211644374509, "grad_norm": 0.19696570309865535, "learning_rate": 2.1356284533025034e-05, "loss": 0.3357, "step": 1668 }, { "epoch": 5.2643587726199845, "grad_norm": 0.20720814373699586, "learning_rate": 2.1337900846489794e-05, "loss": 0.3304, "step": 1669 }, { "epoch": 5.26750590086546, "grad_norm": 0.22251808274139923, "learning_rate": 2.1319517851490917e-05, "loss": 0.3342, "step": 1670 }, { "epoch": 5.270653029110936, "grad_norm": 0.19960532937969883, "learning_rate": 2.130113556722869e-05, "loss": 0.3213, "step": 1671 }, { "epoch": 5.273800157356412, "grad_norm": 0.22611359477988568, "learning_rate": 2.12827540129027e-05, "loss": 0.3304, "step": 1672 }, { "epoch": 5.276947285601889, "grad_norm": 0.21377559505306823, "learning_rate": 2.126437320771175e-05, "loss": 0.333, "step": 1673 }, { "epoch": 5.280094413847364, "grad_norm": 0.21364402742573374, "learning_rate": 2.124599317085385e-05, "loss": 0.3252, "step": 1674 }, { "epoch": 5.28324154209284, "grad_norm": 0.20440996232135555, "learning_rate": 2.1227613921526234e-05, "loss": 0.3302, "step": 1675 }, { "epoch": 5.286388670338316, "grad_norm": 0.20439524727339334, "learning_rate": 2.1209235478925292e-05, "loss": 0.327, "step": 1676 }, { "epoch": 5.289535798583792, "grad_norm": 0.21129883255126156, "learning_rate": 2.1190857862246587e-05, "loss": 0.3317, "step": 1677 }, { "epoch": 5.2926829268292686, "grad_norm": 0.1832955706368962, "learning_rate": 2.1172481090684803e-05, "loss": 0.3285, "step": 1678 }, { "epoch": 5.295830055074744, "grad_norm": 0.21771566260776035, "learning_rate": 2.1154105183433758e-05, "loss": 0.3296, "step": 1679 }, { "epoch": 5.29897718332022, "grad_norm": 0.1943908762637486, "learning_rate": 2.1135730159686355e-05, "loss": 0.3378, "step": 1680 }, { "epoch": 5.302124311565696, "grad_norm": 0.19415021103057115, "learning_rate": 2.1117356038634584e-05, "loss": 0.3284, "step": 1681 }, { "epoch": 5.305271439811173, "grad_norm": 0.19441982459516802, "learning_rate": 2.109898283946948e-05, "loss": 0.3238, "step": 1682 }, { "epoch": 5.3084185680566485, "grad_norm": 0.19773621537262287, "learning_rate": 2.1080610581381128e-05, "loss": 0.3285, "step": 1683 }, { "epoch": 5.311565696302124, "grad_norm": 0.2120736125028019, "learning_rate": 2.106223928355861e-05, "loss": 0.3324, "step": 1684 }, { "epoch": 5.3147128245476, "grad_norm": 0.19760073719764953, "learning_rate": 2.1043868965190045e-05, "loss": 0.3324, "step": 1685 }, { "epoch": 5.317859952793077, "grad_norm": 0.19405070182884, "learning_rate": 2.1025499645462485e-05, "loss": 0.3375, "step": 1686 }, { "epoch": 5.321007081038553, "grad_norm": 0.1956189468411377, "learning_rate": 2.100713134356196e-05, "loss": 0.3255, "step": 1687 }, { "epoch": 5.324154209284028, "grad_norm": 0.19321084864706617, "learning_rate": 2.098876407867344e-05, "loss": 0.3308, "step": 1688 }, { "epoch": 5.327301337529504, "grad_norm": 0.19304288190055158, "learning_rate": 2.0970397869980798e-05, "loss": 0.3286, "step": 1689 }, { "epoch": 5.33044846577498, "grad_norm": 0.1986064395829299, "learning_rate": 2.0952032736666817e-05, "loss": 0.3291, "step": 1690 }, { "epoch": 5.333595594020457, "grad_norm": 0.19746810657897224, "learning_rate": 2.0933668697913148e-05, "loss": 0.3336, "step": 1691 }, { "epoch": 5.3367427222659325, "grad_norm": 0.19729660360055334, "learning_rate": 2.09153057729003e-05, "loss": 0.3348, "step": 1692 }, { "epoch": 5.339889850511408, "grad_norm": 0.18855256963341346, "learning_rate": 2.0896943980807633e-05, "loss": 0.3372, "step": 1693 }, { "epoch": 5.343036978756884, "grad_norm": 0.2009978457776624, "learning_rate": 2.0878583340813295e-05, "loss": 0.3288, "step": 1694 }, { "epoch": 5.34618410700236, "grad_norm": 0.20225991858456713, "learning_rate": 2.0860223872094264e-05, "loss": 0.3271, "step": 1695 }, { "epoch": 5.349331235247837, "grad_norm": 0.18569033139215133, "learning_rate": 2.084186559382627e-05, "loss": 0.3287, "step": 1696 }, { "epoch": 5.352478363493312, "grad_norm": 0.19538260959362502, "learning_rate": 2.0823508525183805e-05, "loss": 0.3249, "step": 1697 }, { "epoch": 5.355625491738788, "grad_norm": 0.20717894047667273, "learning_rate": 2.08051526853401e-05, "loss": 0.3336, "step": 1698 }, { "epoch": 5.358772619984264, "grad_norm": 0.19701862641818213, "learning_rate": 2.0786798093467114e-05, "loss": 0.3344, "step": 1699 }, { "epoch": 5.361919748229741, "grad_norm": 0.1906266432884064, "learning_rate": 2.0768444768735478e-05, "loss": 0.3334, "step": 1700 }, { "epoch": 5.3650668764752165, "grad_norm": 0.21331000047158513, "learning_rate": 2.0750092730314522e-05, "loss": 0.3349, "step": 1701 }, { "epoch": 5.368214004720692, "grad_norm": 0.19487279888174047, "learning_rate": 2.0731741997372228e-05, "loss": 0.3211, "step": 1702 }, { "epoch": 5.371361132966168, "grad_norm": 0.21324736219816784, "learning_rate": 2.071339258907519e-05, "loss": 0.3385, "step": 1703 }, { "epoch": 5.374508261211645, "grad_norm": 0.18260011303295876, "learning_rate": 2.0695044524588658e-05, "loss": 0.332, "step": 1704 }, { "epoch": 5.377655389457121, "grad_norm": 0.21643053077446903, "learning_rate": 2.0676697823076453e-05, "loss": 0.326, "step": 1705 }, { "epoch": 5.380802517702596, "grad_norm": 0.18935558592473964, "learning_rate": 2.065835250370098e-05, "loss": 0.3286, "step": 1706 }, { "epoch": 5.383949645948072, "grad_norm": 0.21738265810196228, "learning_rate": 2.064000858562319e-05, "loss": 0.327, "step": 1707 }, { "epoch": 5.387096774193548, "grad_norm": 0.188452014974482, "learning_rate": 2.0621666088002586e-05, "loss": 0.3363, "step": 1708 }, { "epoch": 5.390243902439025, "grad_norm": 0.21149246855169024, "learning_rate": 2.060332502999719e-05, "loss": 0.3342, "step": 1709 }, { "epoch": 5.3933910306845005, "grad_norm": 0.1994954339555043, "learning_rate": 2.0584985430763483e-05, "loss": 0.333, "step": 1710 }, { "epoch": 5.396538158929976, "grad_norm": 0.18853756790169016, "learning_rate": 2.0566647309456476e-05, "loss": 0.3344, "step": 1711 }, { "epoch": 5.399685287175452, "grad_norm": 0.19943484129450234, "learning_rate": 2.0548310685229605e-05, "loss": 0.3345, "step": 1712 }, { "epoch": 5.402832415420928, "grad_norm": 0.1941995141451995, "learning_rate": 2.052997557723474e-05, "loss": 0.3282, "step": 1713 }, { "epoch": 5.405979543666405, "grad_norm": 0.19191713025156307, "learning_rate": 2.051164200462218e-05, "loss": 0.3345, "step": 1714 }, { "epoch": 5.4091266719118805, "grad_norm": 0.18974347512799264, "learning_rate": 2.0493309986540626e-05, "loss": 0.3413, "step": 1715 }, { "epoch": 5.412273800157356, "grad_norm": 0.19403906103651297, "learning_rate": 2.047497954213713e-05, "loss": 0.33, "step": 1716 }, { "epoch": 5.415420928402832, "grad_norm": 0.19827267122676, "learning_rate": 2.0456650690557126e-05, "loss": 0.3347, "step": 1717 }, { "epoch": 5.418568056648309, "grad_norm": 0.18940704828298557, "learning_rate": 2.043832345094436e-05, "loss": 0.331, "step": 1718 }, { "epoch": 5.421715184893785, "grad_norm": 0.19382447016721407, "learning_rate": 2.041999784244092e-05, "loss": 0.3403, "step": 1719 }, { "epoch": 5.42486231313926, "grad_norm": 0.1908670547017546, "learning_rate": 2.0401673884187178e-05, "loss": 0.3382, "step": 1720 }, { "epoch": 5.428009441384736, "grad_norm": 0.18977270804467197, "learning_rate": 2.0383351595321777e-05, "loss": 0.3269, "step": 1721 }, { "epoch": 5.431156569630213, "grad_norm": 0.19390756152600916, "learning_rate": 2.036503099498163e-05, "loss": 0.3351, "step": 1722 }, { "epoch": 5.434303697875689, "grad_norm": 0.1822739884958527, "learning_rate": 2.034671210230187e-05, "loss": 0.3283, "step": 1723 }, { "epoch": 5.4374508261211645, "grad_norm": 0.20644018287214425, "learning_rate": 2.0328394936415862e-05, "loss": 0.333, "step": 1724 }, { "epoch": 5.44059795436664, "grad_norm": 0.19229713655862757, "learning_rate": 2.0310079516455158e-05, "loss": 0.3336, "step": 1725 }, { "epoch": 5.443745082612116, "grad_norm": 0.19157755715085112, "learning_rate": 2.0291765861549485e-05, "loss": 0.3319, "step": 1726 }, { "epoch": 5.446892210857593, "grad_norm": 0.20424545745124856, "learning_rate": 2.0273453990826734e-05, "loss": 0.3368, "step": 1727 }, { "epoch": 5.450039339103069, "grad_norm": 0.19085673571348755, "learning_rate": 2.0255143923412926e-05, "loss": 0.3334, "step": 1728 }, { "epoch": 5.453186467348544, "grad_norm": 0.18918109876476505, "learning_rate": 2.0236835678432216e-05, "loss": 0.3475, "step": 1729 }, { "epoch": 5.45633359559402, "grad_norm": 0.20280285926641467, "learning_rate": 2.0218529275006823e-05, "loss": 0.3286, "step": 1730 }, { "epoch": 5.459480723839496, "grad_norm": 0.1916773765490627, "learning_rate": 2.020022473225707e-05, "loss": 0.3391, "step": 1731 }, { "epoch": 5.462627852084973, "grad_norm": 0.19384714862396943, "learning_rate": 2.0181922069301323e-05, "loss": 0.3264, "step": 1732 }, { "epoch": 5.4657749803304485, "grad_norm": 0.18923980213552452, "learning_rate": 2.016362130525599e-05, "loss": 0.3331, "step": 1733 }, { "epoch": 5.468922108575924, "grad_norm": 0.17898327293803962, "learning_rate": 2.0145322459235496e-05, "loss": 0.3362, "step": 1734 }, { "epoch": 5.4720692368214, "grad_norm": 0.19587397865824116, "learning_rate": 2.0127025550352255e-05, "loss": 0.3199, "step": 1735 }, { "epoch": 5.475216365066877, "grad_norm": 0.1941075025448282, "learning_rate": 2.010873059771667e-05, "loss": 0.3349, "step": 1736 }, { "epoch": 5.478363493312353, "grad_norm": 0.1771702814313618, "learning_rate": 2.0090437620437097e-05, "loss": 0.3255, "step": 1737 }, { "epoch": 5.481510621557828, "grad_norm": 0.19485668971197115, "learning_rate": 2.0072146637619822e-05, "loss": 0.3227, "step": 1738 }, { "epoch": 5.484657749803304, "grad_norm": 0.19804684145553095, "learning_rate": 2.0053857668369054e-05, "loss": 0.3282, "step": 1739 }, { "epoch": 5.487804878048781, "grad_norm": 0.1918642737386288, "learning_rate": 2.0035570731786898e-05, "loss": 0.3308, "step": 1740 }, { "epoch": 5.490952006294257, "grad_norm": 0.196904058951029, "learning_rate": 2.001728584697332e-05, "loss": 0.3369, "step": 1741 }, { "epoch": 5.4940991345397325, "grad_norm": 0.19456782195452768, "learning_rate": 1.999900303302618e-05, "loss": 0.3304, "step": 1742 }, { "epoch": 5.497246262785208, "grad_norm": 0.18856607162185943, "learning_rate": 1.9980722309041153e-05, "loss": 0.3387, "step": 1743 }, { "epoch": 5.500393391030684, "grad_norm": 0.19651502014126437, "learning_rate": 1.996244369411171e-05, "loss": 0.337, "step": 1744 }, { "epoch": 5.503540519276161, "grad_norm": 0.18726822437116097, "learning_rate": 1.9944167207329163e-05, "loss": 0.3353, "step": 1745 }, { "epoch": 5.506687647521637, "grad_norm": 0.19603676603474196, "learning_rate": 1.992589286778257e-05, "loss": 0.3258, "step": 1746 }, { "epoch": 5.5098347757671124, "grad_norm": 0.182602364247368, "learning_rate": 1.9907620694558757e-05, "loss": 0.3324, "step": 1747 }, { "epoch": 5.512981904012588, "grad_norm": 0.18782759246781305, "learning_rate": 1.9889350706742278e-05, "loss": 0.3314, "step": 1748 }, { "epoch": 5.516129032258064, "grad_norm": 0.1966254586938506, "learning_rate": 1.9871082923415418e-05, "loss": 0.3361, "step": 1749 }, { "epoch": 5.519276160503541, "grad_norm": 0.2000474732115638, "learning_rate": 1.9852817363658157e-05, "loss": 0.334, "step": 1750 }, { "epoch": 5.522423288749017, "grad_norm": 0.18864691630189573, "learning_rate": 1.983455404654814e-05, "loss": 0.3384, "step": 1751 }, { "epoch": 5.525570416994492, "grad_norm": 0.17699746763746177, "learning_rate": 1.9816292991160682e-05, "loss": 0.3369, "step": 1752 }, { "epoch": 5.528717545239968, "grad_norm": 0.2046692057530345, "learning_rate": 1.979803421656874e-05, "loss": 0.3324, "step": 1753 }, { "epoch": 5.531864673485445, "grad_norm": 0.17773907244324694, "learning_rate": 1.977977774184287e-05, "loss": 0.3306, "step": 1754 }, { "epoch": 5.535011801730921, "grad_norm": 0.2060724377420153, "learning_rate": 1.9761523586051247e-05, "loss": 0.3347, "step": 1755 }, { "epoch": 5.5381589299763965, "grad_norm": 0.19361078551446964, "learning_rate": 1.9743271768259597e-05, "loss": 0.3293, "step": 1756 }, { "epoch": 5.541306058221872, "grad_norm": 0.20747437115821293, "learning_rate": 1.9725022307531238e-05, "loss": 0.3319, "step": 1757 }, { "epoch": 5.544453186467349, "grad_norm": 0.20542783975059725, "learning_rate": 1.970677522292701e-05, "loss": 0.3293, "step": 1758 }, { "epoch": 5.547600314712825, "grad_norm": 0.20422541341547726, "learning_rate": 1.9688530533505262e-05, "loss": 0.3298, "step": 1759 }, { "epoch": 5.550747442958301, "grad_norm": 0.19261928228370265, "learning_rate": 1.9670288258321844e-05, "loss": 0.3291, "step": 1760 }, { "epoch": 5.553894571203776, "grad_norm": 0.21510991198866017, "learning_rate": 1.965204841643011e-05, "loss": 0.3355, "step": 1761 }, { "epoch": 5.557041699449252, "grad_norm": 0.20838170757363106, "learning_rate": 1.9633811026880836e-05, "loss": 0.3361, "step": 1762 }, { "epoch": 5.560188827694729, "grad_norm": 0.19924189860550076, "learning_rate": 1.961557610872226e-05, "loss": 0.3408, "step": 1763 }, { "epoch": 5.563335955940205, "grad_norm": 0.19631983491806956, "learning_rate": 1.9597343681000026e-05, "loss": 0.3314, "step": 1764 }, { "epoch": 5.5664830841856805, "grad_norm": 0.20427189176713215, "learning_rate": 1.9579113762757193e-05, "loss": 0.3343, "step": 1765 }, { "epoch": 5.569630212431156, "grad_norm": 0.20654636293900627, "learning_rate": 1.956088637303418e-05, "loss": 0.3391, "step": 1766 }, { "epoch": 5.572777340676632, "grad_norm": 0.21104773479274883, "learning_rate": 1.954266153086877e-05, "loss": 0.342, "step": 1767 }, { "epoch": 5.575924468922109, "grad_norm": 0.20296107799646898, "learning_rate": 1.9524439255296105e-05, "loss": 0.3327, "step": 1768 }, { "epoch": 5.579071597167585, "grad_norm": 0.18312485399358078, "learning_rate": 1.9506219565348622e-05, "loss": 0.3423, "step": 1769 }, { "epoch": 5.58221872541306, "grad_norm": 0.20577262423625045, "learning_rate": 1.948800248005605e-05, "loss": 0.3312, "step": 1770 }, { "epoch": 5.585365853658536, "grad_norm": 0.18830784164844272, "learning_rate": 1.946978801844544e-05, "loss": 0.3314, "step": 1771 }, { "epoch": 5.588512981904013, "grad_norm": 0.1983034432372846, "learning_rate": 1.9451576199541063e-05, "loss": 0.3369, "step": 1772 }, { "epoch": 5.591660110149489, "grad_norm": 0.18977643322424897, "learning_rate": 1.9433367042364447e-05, "loss": 0.331, "step": 1773 }, { "epoch": 5.5948072383949645, "grad_norm": 0.20788619352037782, "learning_rate": 1.941516056593433e-05, "loss": 0.3308, "step": 1774 }, { "epoch": 5.59795436664044, "grad_norm": 0.18636419597292284, "learning_rate": 1.9396956789266663e-05, "loss": 0.3418, "step": 1775 }, { "epoch": 5.601101494885917, "grad_norm": 0.2241714096919009, "learning_rate": 1.9378755731374557e-05, "loss": 0.3375, "step": 1776 }, { "epoch": 5.604248623131393, "grad_norm": 0.17829719329566568, "learning_rate": 1.9360557411268307e-05, "loss": 0.3348, "step": 1777 }, { "epoch": 5.607395751376869, "grad_norm": 0.20049814376921224, "learning_rate": 1.9342361847955345e-05, "loss": 0.3238, "step": 1778 }, { "epoch": 5.610542879622344, "grad_norm": 0.20325169827837897, "learning_rate": 1.9324169060440194e-05, "loss": 0.3337, "step": 1779 }, { "epoch": 5.61369000786782, "grad_norm": 0.18595173050891842, "learning_rate": 1.930597906772452e-05, "loss": 0.3361, "step": 1780 }, { "epoch": 5.616837136113297, "grad_norm": 0.19965999907742346, "learning_rate": 1.9287791888807048e-05, "loss": 0.338, "step": 1781 }, { "epoch": 5.619984264358773, "grad_norm": 0.19545992600515816, "learning_rate": 1.9269607542683552e-05, "loss": 0.3359, "step": 1782 }, { "epoch": 5.6231313926042485, "grad_norm": 0.2188054711004221, "learning_rate": 1.9251426048346877e-05, "loss": 0.3378, "step": 1783 }, { "epoch": 5.626278520849724, "grad_norm": 0.19412474756377676, "learning_rate": 1.923324742478686e-05, "loss": 0.336, "step": 1784 }, { "epoch": 5.6294256490952, "grad_norm": 0.22045546487078188, "learning_rate": 1.9215071690990365e-05, "loss": 0.3237, "step": 1785 }, { "epoch": 5.632572777340677, "grad_norm": 0.1936651347415692, "learning_rate": 1.9196898865941227e-05, "loss": 0.3348, "step": 1786 }, { "epoch": 5.635719905586153, "grad_norm": 0.21256108861119585, "learning_rate": 1.917872896862024e-05, "loss": 0.3355, "step": 1787 }, { "epoch": 5.6388670338316285, "grad_norm": 0.19411087471884958, "learning_rate": 1.916056201800514e-05, "loss": 0.328, "step": 1788 }, { "epoch": 5.642014162077104, "grad_norm": 0.19203184010960683, "learning_rate": 1.9142398033070585e-05, "loss": 0.3332, "step": 1789 }, { "epoch": 5.645161290322581, "grad_norm": 0.19116732074421183, "learning_rate": 1.9124237032788144e-05, "loss": 0.3289, "step": 1790 }, { "epoch": 5.648308418568057, "grad_norm": 0.2029836866282038, "learning_rate": 1.910607903612626e-05, "loss": 0.3301, "step": 1791 }, { "epoch": 5.651455546813533, "grad_norm": 0.18620591358236213, "learning_rate": 1.9087924062050235e-05, "loss": 0.3273, "step": 1792 }, { "epoch": 5.654602675059008, "grad_norm": 0.194743778672627, "learning_rate": 1.9069772129522236e-05, "loss": 0.3361, "step": 1793 }, { "epoch": 5.657749803304485, "grad_norm": 0.17439717206736727, "learning_rate": 1.9051623257501223e-05, "loss": 0.3359, "step": 1794 }, { "epoch": 5.660896931549961, "grad_norm": 0.21536385200503502, "learning_rate": 1.9033477464942985e-05, "loss": 0.3316, "step": 1795 }, { "epoch": 5.664044059795437, "grad_norm": 0.19927232488706062, "learning_rate": 1.9015334770800084e-05, "loss": 0.3428, "step": 1796 }, { "epoch": 5.6671911880409125, "grad_norm": 0.18222541338952222, "learning_rate": 1.899719519402183e-05, "loss": 0.3371, "step": 1797 }, { "epoch": 5.670338316286388, "grad_norm": 0.20776846100700097, "learning_rate": 1.897905875355431e-05, "loss": 0.3333, "step": 1798 }, { "epoch": 5.673485444531865, "grad_norm": 0.20498121411547227, "learning_rate": 1.89609254683403e-05, "loss": 0.337, "step": 1799 }, { "epoch": 5.676632572777341, "grad_norm": 0.2057071418903273, "learning_rate": 1.8942795357319325e-05, "loss": 0.3422, "step": 1800 }, { "epoch": 5.679779701022817, "grad_norm": 0.18447805097886538, "learning_rate": 1.892466843942754e-05, "loss": 0.3357, "step": 1801 }, { "epoch": 5.682926829268292, "grad_norm": 0.20151427634192418, "learning_rate": 1.8906544733597817e-05, "loss": 0.3341, "step": 1802 }, { "epoch": 5.686073957513768, "grad_norm": 0.19101557171738096, "learning_rate": 1.888842425875964e-05, "loss": 0.3396, "step": 1803 }, { "epoch": 5.689221085759245, "grad_norm": 0.2109852771210735, "learning_rate": 1.887030703383912e-05, "loss": 0.3392, "step": 1804 }, { "epoch": 5.692368214004721, "grad_norm": 0.19115514267570324, "learning_rate": 1.885219307775899e-05, "loss": 0.3363, "step": 1805 }, { "epoch": 5.6955153422501965, "grad_norm": 0.21293008134701097, "learning_rate": 1.8834082409438553e-05, "loss": 0.3328, "step": 1806 }, { "epoch": 5.698662470495672, "grad_norm": 0.18526154866614372, "learning_rate": 1.8815975047793694e-05, "loss": 0.3273, "step": 1807 }, { "epoch": 5.701809598741149, "grad_norm": 0.18777683810128346, "learning_rate": 1.8797871011736823e-05, "loss": 0.3392, "step": 1808 }, { "epoch": 5.704956726986625, "grad_norm": 0.18257028157402827, "learning_rate": 1.87797703201769e-05, "loss": 0.3303, "step": 1809 }, { "epoch": 5.708103855232101, "grad_norm": 0.19418106072360075, "learning_rate": 1.8761672992019377e-05, "loss": 0.3344, "step": 1810 }, { "epoch": 5.711250983477576, "grad_norm": 0.18457444042449656, "learning_rate": 1.87435790461662e-05, "loss": 0.3278, "step": 1811 }, { "epoch": 5.714398111723053, "grad_norm": 0.18582325962284854, "learning_rate": 1.872548850151577e-05, "loss": 0.3264, "step": 1812 }, { "epoch": 5.717545239968529, "grad_norm": 0.18103618532916696, "learning_rate": 1.8707401376962946e-05, "loss": 0.3315, "step": 1813 }, { "epoch": 5.720692368214005, "grad_norm": 0.18382505326981982, "learning_rate": 1.8689317691399026e-05, "loss": 0.3367, "step": 1814 }, { "epoch": 5.7238394964594805, "grad_norm": 0.1908586561032934, "learning_rate": 1.867123746371169e-05, "loss": 0.3315, "step": 1815 }, { "epoch": 5.726986624704956, "grad_norm": 0.18359448429174133, "learning_rate": 1.865316071278503e-05, "loss": 0.3352, "step": 1816 }, { "epoch": 5.730133752950433, "grad_norm": 0.19607618023789522, "learning_rate": 1.8635087457499485e-05, "loss": 0.3319, "step": 1817 }, { "epoch": 5.733280881195909, "grad_norm": 0.17583841531340308, "learning_rate": 1.8617017716731865e-05, "loss": 0.334, "step": 1818 }, { "epoch": 5.736428009441385, "grad_norm": 0.1866341211246049, "learning_rate": 1.8598951509355293e-05, "loss": 0.33, "step": 1819 }, { "epoch": 5.7395751376868605, "grad_norm": 0.18608286547921962, "learning_rate": 1.8580888854239213e-05, "loss": 0.3361, "step": 1820 }, { "epoch": 5.742722265932336, "grad_norm": 0.18536619850476266, "learning_rate": 1.856282977024935e-05, "loss": 0.3387, "step": 1821 }, { "epoch": 5.745869394177813, "grad_norm": 0.2015065945779001, "learning_rate": 1.85447742762477e-05, "loss": 0.3413, "step": 1822 }, { "epoch": 5.749016522423289, "grad_norm": 0.19420792957675226, "learning_rate": 1.8526722391092513e-05, "loss": 0.3379, "step": 1823 }, { "epoch": 5.752163650668765, "grad_norm": 0.20788168048073424, "learning_rate": 1.850867413363827e-05, "loss": 0.3299, "step": 1824 }, { "epoch": 5.755310778914241, "grad_norm": 0.20012818061899734, "learning_rate": 1.8490629522735658e-05, "loss": 0.335, "step": 1825 }, { "epoch": 5.758457907159717, "grad_norm": 0.19094123225319237, "learning_rate": 1.8472588577231558e-05, "loss": 0.3289, "step": 1826 }, { "epoch": 5.761605035405193, "grad_norm": 0.1903248865857444, "learning_rate": 1.8454551315969023e-05, "loss": 0.3328, "step": 1827 }, { "epoch": 5.764752163650669, "grad_norm": 0.19908659395090694, "learning_rate": 1.8436517757787268e-05, "loss": 0.3289, "step": 1828 }, { "epoch": 5.7678992918961445, "grad_norm": 0.19909089376488692, "learning_rate": 1.841848792152162e-05, "loss": 0.3317, "step": 1829 }, { "epoch": 5.771046420141621, "grad_norm": 0.18638784799057997, "learning_rate": 1.8400461826003536e-05, "loss": 0.3296, "step": 1830 }, { "epoch": 5.774193548387097, "grad_norm": 0.21107469641971743, "learning_rate": 1.8382439490060556e-05, "loss": 0.341, "step": 1831 }, { "epoch": 5.777340676632573, "grad_norm": 0.18826238700241835, "learning_rate": 1.8364420932516296e-05, "loss": 0.3352, "step": 1832 }, { "epoch": 5.780487804878049, "grad_norm": 0.2011868737529004, "learning_rate": 1.8346406172190415e-05, "loss": 0.3373, "step": 1833 }, { "epoch": 5.783634933123524, "grad_norm": 0.18535453103657012, "learning_rate": 1.8328395227898638e-05, "loss": 0.3324, "step": 1834 }, { "epoch": 5.786782061369001, "grad_norm": 0.2043412279322104, "learning_rate": 1.8310388118452676e-05, "loss": 0.3263, "step": 1835 }, { "epoch": 5.789929189614477, "grad_norm": 0.18684348371083476, "learning_rate": 1.829238486266023e-05, "loss": 0.3286, "step": 1836 }, { "epoch": 5.793076317859953, "grad_norm": 0.20419794528852878, "learning_rate": 1.8274385479325003e-05, "loss": 0.3272, "step": 1837 }, { "epoch": 5.7962234461054285, "grad_norm": 0.1940073858019324, "learning_rate": 1.825638998724663e-05, "loss": 0.3332, "step": 1838 }, { "epoch": 5.799370574350904, "grad_norm": 0.19347840957399853, "learning_rate": 1.8238398405220693e-05, "loss": 0.3351, "step": 1839 }, { "epoch": 5.802517702596381, "grad_norm": 0.1889872990752563, "learning_rate": 1.8220410752038683e-05, "loss": 0.3316, "step": 1840 }, { "epoch": 5.805664830841857, "grad_norm": 0.18586354781067246, "learning_rate": 1.8202427046487998e-05, "loss": 0.3341, "step": 1841 }, { "epoch": 5.808811959087333, "grad_norm": 0.1966780805383021, "learning_rate": 1.8184447307351892e-05, "loss": 0.3355, "step": 1842 }, { "epoch": 5.811959087332809, "grad_norm": 0.18121589168030264, "learning_rate": 1.8166471553409515e-05, "loss": 0.3383, "step": 1843 }, { "epoch": 5.815106215578285, "grad_norm": 0.20505699055277316, "learning_rate": 1.8148499803435814e-05, "loss": 0.3398, "step": 1844 }, { "epoch": 5.818253343823761, "grad_norm": 0.18962686805322654, "learning_rate": 1.8130532076201567e-05, "loss": 0.3265, "step": 1845 }, { "epoch": 5.821400472069237, "grad_norm": 0.18582561112060875, "learning_rate": 1.811256839047337e-05, "loss": 0.3293, "step": 1846 }, { "epoch": 5.8245476003147125, "grad_norm": 0.19257957858004238, "learning_rate": 1.809460876501356e-05, "loss": 0.3262, "step": 1847 }, { "epoch": 5.827694728560189, "grad_norm": 0.197182828158111, "learning_rate": 1.8076653218580275e-05, "loss": 0.3323, "step": 1848 }, { "epoch": 5.830841856805665, "grad_norm": 0.1879475617365794, "learning_rate": 1.8058701769927355e-05, "loss": 0.334, "step": 1849 }, { "epoch": 5.833988985051141, "grad_norm": 0.1826490417159589, "learning_rate": 1.8040754437804394e-05, "loss": 0.342, "step": 1850 }, { "epoch": 5.837136113296617, "grad_norm": 0.17633023503177056, "learning_rate": 1.8022811240956658e-05, "loss": 0.3273, "step": 1851 }, { "epoch": 5.840283241542092, "grad_norm": 0.18968026969150417, "learning_rate": 1.800487219812511e-05, "loss": 0.346, "step": 1852 }, { "epoch": 5.843430369787569, "grad_norm": 0.1784380943724687, "learning_rate": 1.7986937328046367e-05, "loss": 0.3303, "step": 1853 }, { "epoch": 5.846577498033045, "grad_norm": 0.1823752757582174, "learning_rate": 1.796900664945269e-05, "loss": 0.34, "step": 1854 }, { "epoch": 5.849724626278521, "grad_norm": 0.1813510684645491, "learning_rate": 1.795108018107197e-05, "loss": 0.3412, "step": 1855 }, { "epoch": 5.8528717545239966, "grad_norm": 0.1807432625218474, "learning_rate": 1.7933157941627685e-05, "loss": 0.3373, "step": 1856 }, { "epoch": 5.856018882769473, "grad_norm": 0.19949499343633706, "learning_rate": 1.7915239949838912e-05, "loss": 0.3287, "step": 1857 }, { "epoch": 5.859166011014949, "grad_norm": 0.18250218771454552, "learning_rate": 1.7897326224420278e-05, "loss": 0.3405, "step": 1858 }, { "epoch": 5.862313139260425, "grad_norm": 0.194020691023345, "learning_rate": 1.7879416784081964e-05, "loss": 0.3346, "step": 1859 }, { "epoch": 5.865460267505901, "grad_norm": 0.18925126458655542, "learning_rate": 1.7861511647529673e-05, "loss": 0.3364, "step": 1860 }, { "epoch": 5.868607395751377, "grad_norm": 0.1911546369042113, "learning_rate": 1.7843610833464605e-05, "loss": 0.341, "step": 1861 }, { "epoch": 5.871754523996853, "grad_norm": 0.19577966808550543, "learning_rate": 1.782571436058346e-05, "loss": 0.3364, "step": 1862 }, { "epoch": 5.874901652242329, "grad_norm": 0.17291565385793384, "learning_rate": 1.7807822247578385e-05, "loss": 0.3338, "step": 1863 }, { "epoch": 5.878048780487805, "grad_norm": 0.18608030763102454, "learning_rate": 1.7789934513136988e-05, "loss": 0.3334, "step": 1864 }, { "epoch": 5.881195908733281, "grad_norm": 0.1885179067929735, "learning_rate": 1.7772051175942294e-05, "loss": 0.3379, "step": 1865 }, { "epoch": 5.884343036978757, "grad_norm": 0.19141910450267036, "learning_rate": 1.7754172254672758e-05, "loss": 0.3361, "step": 1866 }, { "epoch": 5.887490165224233, "grad_norm": 0.19065019403622055, "learning_rate": 1.7736297768002185e-05, "loss": 0.3387, "step": 1867 }, { "epoch": 5.890637293469709, "grad_norm": 0.19092741311791903, "learning_rate": 1.7718427734599783e-05, "loss": 0.3428, "step": 1868 }, { "epoch": 5.893784421715185, "grad_norm": 0.18860250598218833, "learning_rate": 1.770056217313009e-05, "loss": 0.336, "step": 1869 }, { "epoch": 5.8969315499606605, "grad_norm": 0.18271946305489614, "learning_rate": 1.7682701102252972e-05, "loss": 0.343, "step": 1870 }, { "epoch": 5.900078678206137, "grad_norm": 0.19868228391520923, "learning_rate": 1.7664844540623608e-05, "loss": 0.3425, "step": 1871 }, { "epoch": 5.903225806451613, "grad_norm": 0.2011603878538626, "learning_rate": 1.764699250689249e-05, "loss": 0.3353, "step": 1872 }, { "epoch": 5.906372934697089, "grad_norm": 0.1802992343069088, "learning_rate": 1.762914501970534e-05, "loss": 0.3409, "step": 1873 }, { "epoch": 5.909520062942565, "grad_norm": 0.20444272040489966, "learning_rate": 1.7611302097703157e-05, "loss": 0.3347, "step": 1874 }, { "epoch": 5.912667191188041, "grad_norm": 0.1862979242929073, "learning_rate": 1.7593463759522168e-05, "loss": 0.3314, "step": 1875 }, { "epoch": 5.915814319433517, "grad_norm": 0.20220253909918987, "learning_rate": 1.7575630023793816e-05, "loss": 0.3377, "step": 1876 }, { "epoch": 5.918961447678993, "grad_norm": 0.18686277913829963, "learning_rate": 1.7557800909144728e-05, "loss": 0.3384, "step": 1877 }, { "epoch": 5.922108575924469, "grad_norm": 0.22245978325014654, "learning_rate": 1.75399764341967e-05, "loss": 0.3441, "step": 1878 }, { "epoch": 5.925255704169945, "grad_norm": 0.1959826366457521, "learning_rate": 1.7522156617566707e-05, "loss": 0.3347, "step": 1879 }, { "epoch": 5.928402832415421, "grad_norm": 0.19789454724861258, "learning_rate": 1.7504341477866824e-05, "loss": 0.3321, "step": 1880 }, { "epoch": 5.931549960660897, "grad_norm": 0.1924382850939243, "learning_rate": 1.7486531033704265e-05, "loss": 0.3326, "step": 1881 }, { "epoch": 5.934697088906373, "grad_norm": 0.20590864368684875, "learning_rate": 1.7468725303681345e-05, "loss": 0.3342, "step": 1882 }, { "epoch": 5.937844217151849, "grad_norm": 0.1797118421700361, "learning_rate": 1.7450924306395434e-05, "loss": 0.3397, "step": 1883 }, { "epoch": 5.940991345397325, "grad_norm": 0.20148625619435923, "learning_rate": 1.7433128060438966e-05, "loss": 0.3316, "step": 1884 }, { "epoch": 5.944138473642801, "grad_norm": 0.18364545990645328, "learning_rate": 1.741533658439942e-05, "loss": 0.3362, "step": 1885 }, { "epoch": 5.947285601888277, "grad_norm": 0.20904376933935323, "learning_rate": 1.7397549896859286e-05, "loss": 0.3363, "step": 1886 }, { "epoch": 5.950432730133753, "grad_norm": 0.17911675822308773, "learning_rate": 1.7379768016396062e-05, "loss": 0.3426, "step": 1887 }, { "epoch": 5.9535798583792285, "grad_norm": 0.19232534709476046, "learning_rate": 1.736199096158221e-05, "loss": 0.3347, "step": 1888 }, { "epoch": 5.956726986624705, "grad_norm": 0.19023265048985652, "learning_rate": 1.7344218750985166e-05, "loss": 0.3388, "step": 1889 }, { "epoch": 5.959874114870181, "grad_norm": 0.17953463564757774, "learning_rate": 1.7326451403167293e-05, "loss": 0.3329, "step": 1890 }, { "epoch": 5.963021243115657, "grad_norm": 0.1802520063583937, "learning_rate": 1.7308688936685882e-05, "loss": 0.3432, "step": 1891 }, { "epoch": 5.966168371361133, "grad_norm": 0.17752047516280367, "learning_rate": 1.729093137009314e-05, "loss": 0.3333, "step": 1892 }, { "epoch": 5.969315499606609, "grad_norm": 0.1986277401406285, "learning_rate": 1.7273178721936128e-05, "loss": 0.3368, "step": 1893 }, { "epoch": 5.972462627852085, "grad_norm": 0.18718316187850806, "learning_rate": 1.7255431010756785e-05, "loss": 0.338, "step": 1894 }, { "epoch": 5.975609756097561, "grad_norm": 0.19078572446696887, "learning_rate": 1.7237688255091903e-05, "loss": 0.336, "step": 1895 }, { "epoch": 5.978756884343037, "grad_norm": 0.20824480546412605, "learning_rate": 1.721995047347308e-05, "loss": 0.3451, "step": 1896 }, { "epoch": 5.9819040125885135, "grad_norm": 0.1845164093165284, "learning_rate": 1.7202217684426717e-05, "loss": 0.3391, "step": 1897 }, { "epoch": 5.985051140833989, "grad_norm": 0.19512142305230165, "learning_rate": 1.7184489906474028e-05, "loss": 0.3343, "step": 1898 }, { "epoch": 5.988198269079465, "grad_norm": 0.17660675867730372, "learning_rate": 1.716676715813096e-05, "loss": 0.3434, "step": 1899 }, { "epoch": 5.991345397324941, "grad_norm": 0.190088158411966, "learning_rate": 1.7149049457908243e-05, "loss": 0.3385, "step": 1900 }, { "epoch": 5.994492525570417, "grad_norm": 0.18592216353409444, "learning_rate": 1.713133682431129e-05, "loss": 0.3351, "step": 1901 }, { "epoch": 5.997639653815893, "grad_norm": 0.18827936723885594, "learning_rate": 1.7113629275840265e-05, "loss": 0.3375, "step": 1902 }, { "epoch": 6.003147128245476, "grad_norm": 0.4975044241803055, "learning_rate": 1.7095926830989985e-05, "loss": 0.6166, "step": 1903 }, { "epoch": 6.006294256490952, "grad_norm": 0.3610313120288688, "learning_rate": 1.7078229508249965e-05, "loss": 0.276, "step": 1904 }, { "epoch": 6.009441384736428, "grad_norm": 0.38073466389678695, "learning_rate": 1.706053732610435e-05, "loss": 0.2739, "step": 1905 }, { "epoch": 6.012588512981904, "grad_norm": 0.42338349975031847, "learning_rate": 1.704285030303192e-05, "loss": 0.2676, "step": 1906 }, { "epoch": 6.01573564122738, "grad_norm": 0.29695193868084807, "learning_rate": 1.702516845750608e-05, "loss": 0.268, "step": 1907 }, { "epoch": 6.018882769472856, "grad_norm": 0.35818009653270233, "learning_rate": 1.700749180799482e-05, "loss": 0.2675, "step": 1908 }, { "epoch": 6.022029897718332, "grad_norm": 0.29739443630667606, "learning_rate": 1.6989820372960685e-05, "loss": 0.2606, "step": 1909 }, { "epoch": 6.025177025963808, "grad_norm": 0.31018644023401853, "learning_rate": 1.6972154170860807e-05, "loss": 0.2663, "step": 1910 }, { "epoch": 6.028324154209284, "grad_norm": 0.3229186420829361, "learning_rate": 1.6954493220146827e-05, "loss": 0.2616, "step": 1911 }, { "epoch": 6.03147128245476, "grad_norm": 0.2769240182680597, "learning_rate": 1.6936837539264903e-05, "loss": 0.2687, "step": 1912 }, { "epoch": 6.034618410700236, "grad_norm": 0.27372353067836536, "learning_rate": 1.6919187146655698e-05, "loss": 0.2662, "step": 1913 }, { "epoch": 6.037765538945712, "grad_norm": 0.26192196377766025, "learning_rate": 1.690154206075435e-05, "loss": 0.2641, "step": 1914 }, { "epoch": 6.040912667191188, "grad_norm": 0.2671939261828635, "learning_rate": 1.6883902299990452e-05, "loss": 0.2705, "step": 1915 }, { "epoch": 6.044059795436664, "grad_norm": 0.2718399605137191, "learning_rate": 1.6866267882788042e-05, "loss": 0.2622, "step": 1916 }, { "epoch": 6.04720692368214, "grad_norm": 0.2548650251007784, "learning_rate": 1.684863882756556e-05, "loss": 0.2575, "step": 1917 }, { "epoch": 6.050354051927616, "grad_norm": 0.24781471314240483, "learning_rate": 1.683101515273587e-05, "loss": 0.2626, "step": 1918 }, { "epoch": 6.053501180173092, "grad_norm": 0.24324637074207814, "learning_rate": 1.681339687670618e-05, "loss": 0.2624, "step": 1919 }, { "epoch": 6.056648308418568, "grad_norm": 0.2436540725116606, "learning_rate": 1.679578401787811e-05, "loss": 0.2726, "step": 1920 }, { "epoch": 6.059795436664044, "grad_norm": 0.253809992047894, "learning_rate": 1.6778176594647574e-05, "loss": 0.2638, "step": 1921 }, { "epoch": 6.06294256490952, "grad_norm": 0.24633420682616938, "learning_rate": 1.6760574625404827e-05, "loss": 0.2502, "step": 1922 }, { "epoch": 6.066089693154996, "grad_norm": 0.24090371222537213, "learning_rate": 1.674297812853444e-05, "loss": 0.2653, "step": 1923 }, { "epoch": 6.069236821400472, "grad_norm": 0.24757439171536427, "learning_rate": 1.6725387122415253e-05, "loss": 0.268, "step": 1924 }, { "epoch": 6.072383949645948, "grad_norm": 0.22694322630286495, "learning_rate": 1.6707801625420375e-05, "loss": 0.2624, "step": 1925 }, { "epoch": 6.075531077891424, "grad_norm": 0.2549701791259606, "learning_rate": 1.669022165591716e-05, "loss": 0.2655, "step": 1926 }, { "epoch": 6.0786782061369005, "grad_norm": 0.2245811781702429, "learning_rate": 1.6672647232267194e-05, "loss": 0.2696, "step": 1927 }, { "epoch": 6.081825334382376, "grad_norm": 0.2711918706767307, "learning_rate": 1.6655078372826253e-05, "loss": 0.2718, "step": 1928 }, { "epoch": 6.084972462627852, "grad_norm": 0.23302194321945463, "learning_rate": 1.663751509594434e-05, "loss": 0.2649, "step": 1929 }, { "epoch": 6.088119590873328, "grad_norm": 0.23949565312026969, "learning_rate": 1.6619957419965582e-05, "loss": 0.2708, "step": 1930 }, { "epoch": 6.091266719118804, "grad_norm": 0.22495551097762426, "learning_rate": 1.6602405363228286e-05, "loss": 0.2643, "step": 1931 }, { "epoch": 6.09441384736428, "grad_norm": 0.2319845411537025, "learning_rate": 1.6584858944064874e-05, "loss": 0.2669, "step": 1932 }, { "epoch": 6.097560975609756, "grad_norm": 0.22569793590685644, "learning_rate": 1.6567318180801892e-05, "loss": 0.2726, "step": 1933 }, { "epoch": 6.100708103855232, "grad_norm": 0.23266554290354272, "learning_rate": 1.6549783091759972e-05, "loss": 0.2719, "step": 1934 }, { "epoch": 6.103855232100708, "grad_norm": 0.2114326207930978, "learning_rate": 1.6532253695253814e-05, "loss": 0.2631, "step": 1935 }, { "epoch": 6.1070023603461845, "grad_norm": 0.22853221650998712, "learning_rate": 1.651473000959219e-05, "loss": 0.2693, "step": 1936 }, { "epoch": 6.11014948859166, "grad_norm": 0.22091900772196124, "learning_rate": 1.649721205307788e-05, "loss": 0.2614, "step": 1937 }, { "epoch": 6.113296616837136, "grad_norm": 0.21192508829344028, "learning_rate": 1.6479699844007706e-05, "loss": 0.2662, "step": 1938 }, { "epoch": 6.116443745082612, "grad_norm": 0.21103131436590075, "learning_rate": 1.646219340067248e-05, "loss": 0.2664, "step": 1939 }, { "epoch": 6.119590873328088, "grad_norm": 0.21105523751019395, "learning_rate": 1.644469274135698e-05, "loss": 0.2602, "step": 1940 }, { "epoch": 6.122738001573564, "grad_norm": 0.1988689455993957, "learning_rate": 1.6427197884339964e-05, "loss": 0.2692, "step": 1941 }, { "epoch": 6.12588512981904, "grad_norm": 0.22070030023746054, "learning_rate": 1.6409708847894097e-05, "loss": 0.2634, "step": 1942 }, { "epoch": 6.129032258064516, "grad_norm": 0.21906464799285424, "learning_rate": 1.639222565028601e-05, "loss": 0.255, "step": 1943 }, { "epoch": 6.132179386309992, "grad_norm": 0.21158801417145398, "learning_rate": 1.637474830977619e-05, "loss": 0.2745, "step": 1944 }, { "epoch": 6.1353265145554685, "grad_norm": 0.21986221418279994, "learning_rate": 1.6357276844619043e-05, "loss": 0.2653, "step": 1945 }, { "epoch": 6.138473642800944, "grad_norm": 0.21102204865676896, "learning_rate": 1.633981127306281e-05, "loss": 0.2689, "step": 1946 }, { "epoch": 6.14162077104642, "grad_norm": 0.21326607733179273, "learning_rate": 1.63223516133496e-05, "loss": 0.2716, "step": 1947 }, { "epoch": 6.144767899291896, "grad_norm": 0.2110880192361947, "learning_rate": 1.6304897883715324e-05, "loss": 0.2666, "step": 1948 }, { "epoch": 6.147915027537372, "grad_norm": 0.22051215932311774, "learning_rate": 1.6287450102389725e-05, "loss": 0.2618, "step": 1949 }, { "epoch": 6.151062155782848, "grad_norm": 0.21340487458450053, "learning_rate": 1.6270008287596305e-05, "loss": 0.2669, "step": 1950 }, { "epoch": 6.154209284028324, "grad_norm": 0.2146633102609745, "learning_rate": 1.6252572457552366e-05, "loss": 0.2644, "step": 1951 }, { "epoch": 6.1573564122738, "grad_norm": 0.2102265150162684, "learning_rate": 1.6235142630468928e-05, "loss": 0.2684, "step": 1952 }, { "epoch": 6.160503540519276, "grad_norm": 0.21811839270757488, "learning_rate": 1.621771882455076e-05, "loss": 0.2686, "step": 1953 }, { "epoch": 6.1636506687647525, "grad_norm": 0.21075077208103193, "learning_rate": 1.6200301057996337e-05, "loss": 0.2655, "step": 1954 }, { "epoch": 6.166797797010228, "grad_norm": 0.218688567959288, "learning_rate": 1.6182889348997832e-05, "loss": 0.2652, "step": 1955 }, { "epoch": 6.169944925255704, "grad_norm": 0.22171656079314364, "learning_rate": 1.6165483715741075e-05, "loss": 0.2844, "step": 1956 }, { "epoch": 6.17309205350118, "grad_norm": 0.25323058930729025, "learning_rate": 1.6148084176405567e-05, "loss": 0.2708, "step": 1957 }, { "epoch": 6.176239181746656, "grad_norm": 0.21655864066814562, "learning_rate": 1.6130690749164437e-05, "loss": 0.2651, "step": 1958 }, { "epoch": 6.1793863099921325, "grad_norm": 0.21929256246949802, "learning_rate": 1.6113303452184434e-05, "loss": 0.2761, "step": 1959 }, { "epoch": 6.182533438237608, "grad_norm": 0.21900714816732905, "learning_rate": 1.6095922303625902e-05, "loss": 0.2731, "step": 1960 }, { "epoch": 6.185680566483084, "grad_norm": 0.23059154637063942, "learning_rate": 1.6078547321642758e-05, "loss": 0.2702, "step": 1961 }, { "epoch": 6.18882769472856, "grad_norm": 0.21225513261785303, "learning_rate": 1.6061178524382483e-05, "loss": 0.273, "step": 1962 }, { "epoch": 6.191974822974037, "grad_norm": 0.23067148079801209, "learning_rate": 1.6043815929986094e-05, "loss": 0.2749, "step": 1963 }, { "epoch": 6.195121951219512, "grad_norm": 0.20922145207589546, "learning_rate": 1.602645955658815e-05, "loss": 0.2603, "step": 1964 }, { "epoch": 6.198269079464988, "grad_norm": 0.23195506433753524, "learning_rate": 1.600910942231668e-05, "loss": 0.2641, "step": 1965 }, { "epoch": 6.201416207710464, "grad_norm": 0.20849955610662246, "learning_rate": 1.599176554529321e-05, "loss": 0.2652, "step": 1966 }, { "epoch": 6.20456333595594, "grad_norm": 0.22725404485600895, "learning_rate": 1.597442794363275e-05, "loss": 0.2714, "step": 1967 }, { "epoch": 6.2077104642014165, "grad_norm": 0.21709605058575326, "learning_rate": 1.595709663544372e-05, "loss": 0.2631, "step": 1968 }, { "epoch": 6.210857592446892, "grad_norm": 0.21948985507840815, "learning_rate": 1.5939771638827997e-05, "loss": 0.2659, "step": 1969 }, { "epoch": 6.214004720692368, "grad_norm": 0.22768102749279248, "learning_rate": 1.5922452971880848e-05, "loss": 0.267, "step": 1970 }, { "epoch": 6.217151848937844, "grad_norm": 0.22470980313296637, "learning_rate": 1.5905140652690935e-05, "loss": 0.2751, "step": 1971 }, { "epoch": 6.220298977183321, "grad_norm": 0.21601930900048635, "learning_rate": 1.5887834699340288e-05, "loss": 0.2687, "step": 1972 }, { "epoch": 6.223446105428796, "grad_norm": 0.23789145044732735, "learning_rate": 1.587053512990431e-05, "loss": 0.2648, "step": 1973 }, { "epoch": 6.226593233674272, "grad_norm": 0.20645827162898345, "learning_rate": 1.5853241962451688e-05, "loss": 0.2656, "step": 1974 }, { "epoch": 6.229740361919748, "grad_norm": 0.23749596259128172, "learning_rate": 1.5835955215044466e-05, "loss": 0.2649, "step": 1975 }, { "epoch": 6.232887490165224, "grad_norm": 0.21313415509044598, "learning_rate": 1.581867490573797e-05, "loss": 0.2724, "step": 1976 }, { "epoch": 6.2360346184107005, "grad_norm": 0.23951028716861328, "learning_rate": 1.580140105258079e-05, "loss": 0.2706, "step": 1977 }, { "epoch": 6.239181746656176, "grad_norm": 0.22264880703435372, "learning_rate": 1.5784133673614787e-05, "loss": 0.2745, "step": 1978 }, { "epoch": 6.242328874901652, "grad_norm": 0.21806089788197308, "learning_rate": 1.576687278687504e-05, "loss": 0.2714, "step": 1979 }, { "epoch": 6.245476003147128, "grad_norm": 0.21519597916517344, "learning_rate": 1.5749618410389884e-05, "loss": 0.2749, "step": 1980 }, { "epoch": 6.248623131392605, "grad_norm": 0.23270010761799506, "learning_rate": 1.5732370562180826e-05, "loss": 0.2656, "step": 1981 }, { "epoch": 6.25177025963808, "grad_norm": 0.20586789811939893, "learning_rate": 1.5715129260262556e-05, "loss": 0.2695, "step": 1982 }, { "epoch": 6.254917387883556, "grad_norm": 0.2437679513570022, "learning_rate": 1.5697894522642928e-05, "loss": 0.2748, "step": 1983 }, { "epoch": 6.258064516129032, "grad_norm": 0.2281678596319344, "learning_rate": 1.568066636732295e-05, "loss": 0.2608, "step": 1984 }, { "epoch": 6.261211644374509, "grad_norm": 0.2330467459753673, "learning_rate": 1.566344481229674e-05, "loss": 0.2582, "step": 1985 }, { "epoch": 6.2643587726199845, "grad_norm": 0.21485682537512538, "learning_rate": 1.564622987555154e-05, "loss": 0.2753, "step": 1986 }, { "epoch": 6.26750590086546, "grad_norm": 0.22237525288016594, "learning_rate": 1.5629021575067662e-05, "loss": 0.2681, "step": 1987 }, { "epoch": 6.270653029110936, "grad_norm": 0.22728442867475201, "learning_rate": 1.5611819928818502e-05, "loss": 0.2714, "step": 1988 }, { "epoch": 6.273800157356412, "grad_norm": 0.22800474162990042, "learning_rate": 1.5594624954770494e-05, "loss": 0.2708, "step": 1989 }, { "epoch": 6.276947285601889, "grad_norm": 0.22113656266743126, "learning_rate": 1.5577436670883108e-05, "loss": 0.2673, "step": 1990 }, { "epoch": 6.280094413847364, "grad_norm": 0.21824707276540103, "learning_rate": 1.5560255095108824e-05, "loss": 0.2755, "step": 1991 }, { "epoch": 6.28324154209284, "grad_norm": 0.22677192743914226, "learning_rate": 1.5543080245393128e-05, "loss": 0.2695, "step": 1992 }, { "epoch": 6.286388670338316, "grad_norm": 0.22384148599348552, "learning_rate": 1.552591213967446e-05, "loss": 0.2693, "step": 1993 }, { "epoch": 6.289535798583792, "grad_norm": 0.2231213061535353, "learning_rate": 1.5508750795884222e-05, "loss": 0.2743, "step": 1994 }, { "epoch": 6.2926829268292686, "grad_norm": 0.21524551643992698, "learning_rate": 1.5491596231946764e-05, "loss": 0.2615, "step": 1995 }, { "epoch": 6.295830055074744, "grad_norm": 0.21550805228949638, "learning_rate": 1.5474448465779355e-05, "loss": 0.2721, "step": 1996 }, { "epoch": 6.29897718332022, "grad_norm": 0.2157668033969489, "learning_rate": 1.5457307515292152e-05, "loss": 0.268, "step": 1997 }, { "epoch": 6.302124311565696, "grad_norm": 0.21835492722502536, "learning_rate": 1.5440173398388202e-05, "loss": 0.2667, "step": 1998 }, { "epoch": 6.305271439811173, "grad_norm": 0.2116868929153955, "learning_rate": 1.5423046132963407e-05, "loss": 0.2646, "step": 1999 }, { "epoch": 6.3084185680566485, "grad_norm": 0.23996177968549967, "learning_rate": 1.5405925736906507e-05, "loss": 0.2681, "step": 2000 }, { "epoch": 6.311565696302124, "grad_norm": 0.227126159207544, "learning_rate": 1.5388812228099105e-05, "loss": 0.268, "step": 2001 }, { "epoch": 6.3147128245476, "grad_norm": 0.22790212437206483, "learning_rate": 1.5371705624415566e-05, "loss": 0.2693, "step": 2002 }, { "epoch": 6.317859952793077, "grad_norm": 0.21628336152215077, "learning_rate": 1.535460594372307e-05, "loss": 0.2712, "step": 2003 }, { "epoch": 6.321007081038553, "grad_norm": 0.2271127802784349, "learning_rate": 1.533751320388154e-05, "loss": 0.2687, "step": 2004 }, { "epoch": 6.324154209284028, "grad_norm": 0.21421542101219174, "learning_rate": 1.5320427422743685e-05, "loss": 0.2718, "step": 2005 }, { "epoch": 6.327301337529504, "grad_norm": 0.24451271038131359, "learning_rate": 1.5303348618154915e-05, "loss": 0.2623, "step": 2006 }, { "epoch": 6.33044846577498, "grad_norm": 0.20321144790178836, "learning_rate": 1.5286276807953365e-05, "loss": 0.2693, "step": 2007 }, { "epoch": 6.333595594020457, "grad_norm": 0.24302047081282138, "learning_rate": 1.5269212009969868e-05, "loss": 0.2725, "step": 2008 }, { "epoch": 6.3367427222659325, "grad_norm": 0.21613725540205522, "learning_rate": 1.5252154242027932e-05, "loss": 0.2695, "step": 2009 }, { "epoch": 6.339889850511408, "grad_norm": 0.20743059537745503, "learning_rate": 1.5235103521943719e-05, "loss": 0.2729, "step": 2010 }, { "epoch": 6.343036978756884, "grad_norm": 0.2188470883353957, "learning_rate": 1.5218059867526025e-05, "loss": 0.2633, "step": 2011 }, { "epoch": 6.34618410700236, "grad_norm": 0.2077384192612987, "learning_rate": 1.5201023296576281e-05, "loss": 0.2749, "step": 2012 }, { "epoch": 6.349331235247837, "grad_norm": 0.21085206599076245, "learning_rate": 1.5183993826888506e-05, "loss": 0.28, "step": 2013 }, { "epoch": 6.352478363493312, "grad_norm": 0.20963113069856626, "learning_rate": 1.5166971476249299e-05, "loss": 0.2699, "step": 2014 }, { "epoch": 6.355625491738788, "grad_norm": 0.21152418642222406, "learning_rate": 1.5149956262437848e-05, "loss": 0.2691, "step": 2015 }, { "epoch": 6.358772619984264, "grad_norm": 0.21798400408864652, "learning_rate": 1.5132948203225866e-05, "loss": 0.2701, "step": 2016 }, { "epoch": 6.361919748229741, "grad_norm": 0.212384884342032, "learning_rate": 1.5115947316377591e-05, "loss": 0.2714, "step": 2017 }, { "epoch": 6.3650668764752165, "grad_norm": 0.21764729442192574, "learning_rate": 1.5098953619649779e-05, "loss": 0.2706, "step": 2018 }, { "epoch": 6.368214004720692, "grad_norm": 0.20976149383277362, "learning_rate": 1.5081967130791672e-05, "loss": 0.2715, "step": 2019 }, { "epoch": 6.371361132966168, "grad_norm": 0.21234247868680792, "learning_rate": 1.5064987867544982e-05, "loss": 0.2665, "step": 2020 }, { "epoch": 6.374508261211645, "grad_norm": 0.21051751829878967, "learning_rate": 1.5048015847643887e-05, "loss": 0.2672, "step": 2021 }, { "epoch": 6.377655389457121, "grad_norm": 0.20516874410918237, "learning_rate": 1.5031051088814982e-05, "loss": 0.2634, "step": 2022 }, { "epoch": 6.380802517702596, "grad_norm": 0.1999457276853811, "learning_rate": 1.5014093608777294e-05, "loss": 0.2738, "step": 2023 }, { "epoch": 6.383949645948072, "grad_norm": 0.21305496050001185, "learning_rate": 1.4997143425242229e-05, "loss": 0.2737, "step": 2024 }, { "epoch": 6.387096774193548, "grad_norm": 0.20672780819338069, "learning_rate": 1.4980200555913586e-05, "loss": 0.2718, "step": 2025 }, { "epoch": 6.390243902439025, "grad_norm": 0.21450892009300504, "learning_rate": 1.4963265018487523e-05, "loss": 0.2654, "step": 2026 }, { "epoch": 6.3933910306845005, "grad_norm": 0.20320195505647742, "learning_rate": 1.4946336830652533e-05, "loss": 0.2658, "step": 2027 }, { "epoch": 6.396538158929976, "grad_norm": 0.20392713465924406, "learning_rate": 1.492941601008945e-05, "loss": 0.2746, "step": 2028 }, { "epoch": 6.399685287175452, "grad_norm": 0.20959876954254805, "learning_rate": 1.4912502574471384e-05, "loss": 0.2747, "step": 2029 }, { "epoch": 6.402832415420928, "grad_norm": 0.2108786736914976, "learning_rate": 1.4895596541463771e-05, "loss": 0.2701, "step": 2030 }, { "epoch": 6.405979543666405, "grad_norm": 0.22118213608216855, "learning_rate": 1.4878697928724273e-05, "loss": 0.272, "step": 2031 }, { "epoch": 6.4091266719118805, "grad_norm": 0.203568518365141, "learning_rate": 1.486180675390283e-05, "loss": 0.2659, "step": 2032 }, { "epoch": 6.412273800157356, "grad_norm": 0.2227618023805204, "learning_rate": 1.484492303464161e-05, "loss": 0.2717, "step": 2033 }, { "epoch": 6.415420928402832, "grad_norm": 0.21398359499500041, "learning_rate": 1.482804678857498e-05, "loss": 0.2648, "step": 2034 }, { "epoch": 6.418568056648309, "grad_norm": 0.22308492477910258, "learning_rate": 1.4811178033329516e-05, "loss": 0.2642, "step": 2035 }, { "epoch": 6.421715184893785, "grad_norm": 0.21249595991484213, "learning_rate": 1.4794316786523962e-05, "loss": 0.2683, "step": 2036 }, { "epoch": 6.42486231313926, "grad_norm": 0.21068800662577022, "learning_rate": 1.4777463065769224e-05, "loss": 0.2701, "step": 2037 }, { "epoch": 6.428009441384736, "grad_norm": 0.21437199711857782, "learning_rate": 1.4760616888668353e-05, "loss": 0.2747, "step": 2038 }, { "epoch": 6.431156569630213, "grad_norm": 0.22330541645842986, "learning_rate": 1.4743778272816504e-05, "loss": 0.2704, "step": 2039 }, { "epoch": 6.434303697875689, "grad_norm": 0.2062736384947792, "learning_rate": 1.4726947235800952e-05, "loss": 0.272, "step": 2040 }, { "epoch": 6.4374508261211645, "grad_norm": 0.22521586255432074, "learning_rate": 1.4710123795201039e-05, "loss": 0.278, "step": 2041 }, { "epoch": 6.44059795436664, "grad_norm": 0.203912107820357, "learning_rate": 1.4693307968588194e-05, "loss": 0.2711, "step": 2042 }, { "epoch": 6.443745082612116, "grad_norm": 0.21931119328761017, "learning_rate": 1.4676499773525863e-05, "loss": 0.2663, "step": 2043 }, { "epoch": 6.446892210857593, "grad_norm": 0.2094814854379808, "learning_rate": 1.4659699227569566e-05, "loss": 0.2695, "step": 2044 }, { "epoch": 6.450039339103069, "grad_norm": 0.23117534470233453, "learning_rate": 1.464290634826679e-05, "loss": 0.2692, "step": 2045 }, { "epoch": 6.453186467348544, "grad_norm": 0.21379505139811097, "learning_rate": 1.4626121153157046e-05, "loss": 0.2768, "step": 2046 }, { "epoch": 6.45633359559402, "grad_norm": 0.217663110855076, "learning_rate": 1.4609343659771793e-05, "loss": 0.2713, "step": 2047 }, { "epoch": 6.459480723839496, "grad_norm": 0.2040638494691249, "learning_rate": 1.4592573885634464e-05, "loss": 0.2654, "step": 2048 }, { "epoch": 6.462627852084973, "grad_norm": 0.2303747870401908, "learning_rate": 1.4575811848260429e-05, "loss": 0.2749, "step": 2049 }, { "epoch": 6.4657749803304485, "grad_norm": 0.20958637817569242, "learning_rate": 1.4559057565156964e-05, "loss": 0.2708, "step": 2050 }, { "epoch": 6.468922108575924, "grad_norm": 0.2224204366720408, "learning_rate": 1.4542311053823257e-05, "loss": 0.2748, "step": 2051 }, { "epoch": 6.4720692368214, "grad_norm": 0.21278520060079634, "learning_rate": 1.4525572331750373e-05, "loss": 0.2674, "step": 2052 }, { "epoch": 6.475216365066877, "grad_norm": 0.21995573974288046, "learning_rate": 1.4508841416421256e-05, "loss": 0.2696, "step": 2053 }, { "epoch": 6.478363493312353, "grad_norm": 0.222067909671715, "learning_rate": 1.4492118325310673e-05, "loss": 0.2735, "step": 2054 }, { "epoch": 6.481510621557828, "grad_norm": 0.21469400060674615, "learning_rate": 1.4475403075885233e-05, "loss": 0.2738, "step": 2055 }, { "epoch": 6.484657749803304, "grad_norm": 0.2220259001976218, "learning_rate": 1.445869568560335e-05, "loss": 0.2655, "step": 2056 }, { "epoch": 6.487804878048781, "grad_norm": 0.22065015324342255, "learning_rate": 1.4441996171915241e-05, "loss": 0.2703, "step": 2057 }, { "epoch": 6.490952006294257, "grad_norm": 0.2233908978556124, "learning_rate": 1.4425304552262876e-05, "loss": 0.2749, "step": 2058 }, { "epoch": 6.4940991345397325, "grad_norm": 0.21860192593438782, "learning_rate": 1.4408620844079998e-05, "loss": 0.2691, "step": 2059 }, { "epoch": 6.497246262785208, "grad_norm": 0.21630356299188297, "learning_rate": 1.4391945064792076e-05, "loss": 0.2699, "step": 2060 }, { "epoch": 6.500393391030684, "grad_norm": 0.20468468546139096, "learning_rate": 1.4375277231816309e-05, "loss": 0.2659, "step": 2061 }, { "epoch": 6.503540519276161, "grad_norm": 0.2266198893281483, "learning_rate": 1.435861736256158e-05, "loss": 0.2636, "step": 2062 }, { "epoch": 6.506687647521637, "grad_norm": 0.210355804203251, "learning_rate": 1.4341965474428463e-05, "loss": 0.281, "step": 2063 }, { "epoch": 6.5098347757671124, "grad_norm": 0.2104007529783895, "learning_rate": 1.4325321584809193e-05, "loss": 0.2745, "step": 2064 }, { "epoch": 6.512981904012588, "grad_norm": 0.21454334831641367, "learning_rate": 1.4308685711087664e-05, "loss": 0.2714, "step": 2065 }, { "epoch": 6.516129032258064, "grad_norm": 0.20461473363605256, "learning_rate": 1.4292057870639387e-05, "loss": 0.2737, "step": 2066 }, { "epoch": 6.519276160503541, "grad_norm": 0.22229813634194787, "learning_rate": 1.4275438080831468e-05, "loss": 0.2713, "step": 2067 }, { "epoch": 6.522423288749017, "grad_norm": 0.20801329570201357, "learning_rate": 1.4258826359022639e-05, "loss": 0.2664, "step": 2068 }, { "epoch": 6.525570416994492, "grad_norm": 0.2141355735304912, "learning_rate": 1.4242222722563166e-05, "loss": 0.2692, "step": 2069 }, { "epoch": 6.528717545239968, "grad_norm": 0.21674575849736738, "learning_rate": 1.4225627188794913e-05, "loss": 0.2735, "step": 2070 }, { "epoch": 6.531864673485445, "grad_norm": 0.23378921154557367, "learning_rate": 1.4209039775051233e-05, "loss": 0.2779, "step": 2071 }, { "epoch": 6.535011801730921, "grad_norm": 0.20405908727514255, "learning_rate": 1.4192460498657035e-05, "loss": 0.2778, "step": 2072 }, { "epoch": 6.5381589299763965, "grad_norm": 0.2167015673737203, "learning_rate": 1.4175889376928717e-05, "loss": 0.2674, "step": 2073 }, { "epoch": 6.541306058221872, "grad_norm": 0.22602621445184934, "learning_rate": 1.415932642717416e-05, "loss": 0.2776, "step": 2074 }, { "epoch": 6.544453186467349, "grad_norm": 0.2103436914815303, "learning_rate": 1.4142771666692716e-05, "loss": 0.2748, "step": 2075 }, { "epoch": 6.547600314712825, "grad_norm": 0.22284944224802702, "learning_rate": 1.4126225112775163e-05, "loss": 0.2703, "step": 2076 }, { "epoch": 6.550747442958301, "grad_norm": 0.20859337785195634, "learning_rate": 1.4109686782703729e-05, "loss": 0.2751, "step": 2077 }, { "epoch": 6.553894571203776, "grad_norm": 0.20898420028915374, "learning_rate": 1.4093156693752041e-05, "loss": 0.2722, "step": 2078 }, { "epoch": 6.557041699449252, "grad_norm": 0.21921481305234924, "learning_rate": 1.407663486318513e-05, "loss": 0.2743, "step": 2079 }, { "epoch": 6.560188827694729, "grad_norm": 0.19770871033798984, "learning_rate": 1.4060121308259386e-05, "loss": 0.2682, "step": 2080 }, { "epoch": 6.563335955940205, "grad_norm": 0.20251135019019187, "learning_rate": 1.4043616046222562e-05, "loss": 0.2796, "step": 2081 }, { "epoch": 6.5664830841856805, "grad_norm": 0.20589764192052976, "learning_rate": 1.4027119094313766e-05, "loss": 0.268, "step": 2082 }, { "epoch": 6.569630212431156, "grad_norm": 0.20962471096621335, "learning_rate": 1.4010630469763386e-05, "loss": 0.2689, "step": 2083 }, { "epoch": 6.572777340676632, "grad_norm": 0.2077048133726809, "learning_rate": 1.3994150189793165e-05, "loss": 0.2666, "step": 2084 }, { "epoch": 6.575924468922109, "grad_norm": 0.21367970260930136, "learning_rate": 1.397767827161608e-05, "loss": 0.2668, "step": 2085 }, { "epoch": 6.579071597167585, "grad_norm": 0.20369044452822496, "learning_rate": 1.3961214732436407e-05, "loss": 0.2717, "step": 2086 }, { "epoch": 6.58221872541306, "grad_norm": 0.20608659783474267, "learning_rate": 1.3944759589449657e-05, "loss": 0.2662, "step": 2087 }, { "epoch": 6.585365853658536, "grad_norm": 0.21167921476702603, "learning_rate": 1.3928312859842592e-05, "loss": 0.2548, "step": 2088 }, { "epoch": 6.588512981904013, "grad_norm": 0.2119572678076718, "learning_rate": 1.3911874560793149e-05, "loss": 0.2686, "step": 2089 }, { "epoch": 6.591660110149489, "grad_norm": 0.23046584602252454, "learning_rate": 1.3895444709470485e-05, "loss": 0.2691, "step": 2090 }, { "epoch": 6.5948072383949645, "grad_norm": 0.2152670153636726, "learning_rate": 1.387902332303494e-05, "loss": 0.2789, "step": 2091 }, { "epoch": 6.59795436664044, "grad_norm": 0.21392485708596343, "learning_rate": 1.3862610418637988e-05, "loss": 0.276, "step": 2092 }, { "epoch": 6.601101494885917, "grad_norm": 0.2251993808341726, "learning_rate": 1.384620601342227e-05, "loss": 0.2731, "step": 2093 }, { "epoch": 6.604248623131393, "grad_norm": 0.2105054752455257, "learning_rate": 1.3829810124521528e-05, "loss": 0.2712, "step": 2094 }, { "epoch": 6.607395751376869, "grad_norm": 0.22197663382801497, "learning_rate": 1.3813422769060628e-05, "loss": 0.269, "step": 2095 }, { "epoch": 6.610542879622344, "grad_norm": 0.21251503803497962, "learning_rate": 1.37970439641555e-05, "loss": 0.2673, "step": 2096 }, { "epoch": 6.61369000786782, "grad_norm": 0.22088548865063007, "learning_rate": 1.3780673726913168e-05, "loss": 0.2741, "step": 2097 }, { "epoch": 6.616837136113297, "grad_norm": 0.2100559518908244, "learning_rate": 1.37643120744317e-05, "loss": 0.2809, "step": 2098 }, { "epoch": 6.619984264358773, "grad_norm": 0.21846715779185052, "learning_rate": 1.3747959023800181e-05, "loss": 0.2712, "step": 2099 }, { "epoch": 6.6231313926042485, "grad_norm": 0.2051793253015262, "learning_rate": 1.3731614592098735e-05, "loss": 0.274, "step": 2100 }, { "epoch": 6.626278520849724, "grad_norm": 0.21998385302968826, "learning_rate": 1.3715278796398468e-05, "loss": 0.2668, "step": 2101 }, { "epoch": 6.6294256490952, "grad_norm": 0.20524755125389635, "learning_rate": 1.3698951653761487e-05, "loss": 0.2726, "step": 2102 }, { "epoch": 6.632572777340677, "grad_norm": 0.21310125591169676, "learning_rate": 1.3682633181240826e-05, "loss": 0.2731, "step": 2103 }, { "epoch": 6.635719905586153, "grad_norm": 0.20373328332909574, "learning_rate": 1.3666323395880493e-05, "loss": 0.2786, "step": 2104 }, { "epoch": 6.6388670338316285, "grad_norm": 0.2194259189312518, "learning_rate": 1.3650022314715412e-05, "loss": 0.271, "step": 2105 }, { "epoch": 6.642014162077104, "grad_norm": 0.2044629306655923, "learning_rate": 1.3633729954771414e-05, "loss": 0.2768, "step": 2106 }, { "epoch": 6.645161290322581, "grad_norm": 0.2071870556335287, "learning_rate": 1.3617446333065234e-05, "loss": 0.2742, "step": 2107 }, { "epoch": 6.648308418568057, "grad_norm": 0.20950878944876208, "learning_rate": 1.3601171466604452e-05, "loss": 0.274, "step": 2108 }, { "epoch": 6.651455546813533, "grad_norm": 0.2048060795347047, "learning_rate": 1.3584905372387542e-05, "loss": 0.2744, "step": 2109 }, { "epoch": 6.654602675059008, "grad_norm": 0.20307264689129167, "learning_rate": 1.356864806740378e-05, "loss": 0.2718, "step": 2110 }, { "epoch": 6.657749803304485, "grad_norm": 0.20870683975446555, "learning_rate": 1.3552399568633287e-05, "loss": 0.2751, "step": 2111 }, { "epoch": 6.660896931549961, "grad_norm": 0.20074881013703952, "learning_rate": 1.3536159893046969e-05, "loss": 0.2724, "step": 2112 }, { "epoch": 6.664044059795437, "grad_norm": 0.20014382071858103, "learning_rate": 1.3519929057606526e-05, "loss": 0.2693, "step": 2113 }, { "epoch": 6.6671911880409125, "grad_norm": 0.20804817092817526, "learning_rate": 1.3503707079264432e-05, "loss": 0.274, "step": 2114 }, { "epoch": 6.670338316286388, "grad_norm": 0.20493619242075548, "learning_rate": 1.348749397496388e-05, "loss": 0.2769, "step": 2115 }, { "epoch": 6.673485444531865, "grad_norm": 0.2099027610431233, "learning_rate": 1.3471289761638842e-05, "loss": 0.2764, "step": 2116 }, { "epoch": 6.676632572777341, "grad_norm": 0.193445232337728, "learning_rate": 1.345509445621396e-05, "loss": 0.2695, "step": 2117 }, { "epoch": 6.679779701022817, "grad_norm": 0.2096470666573266, "learning_rate": 1.34389080756046e-05, "loss": 0.2829, "step": 2118 }, { "epoch": 6.682926829268292, "grad_norm": 0.20576106019634247, "learning_rate": 1.342273063671678e-05, "loss": 0.2756, "step": 2119 }, { "epoch": 6.686073957513768, "grad_norm": 0.21501849543327453, "learning_rate": 1.3406562156447211e-05, "loss": 0.2727, "step": 2120 }, { "epoch": 6.689221085759245, "grad_norm": 0.1993115840566364, "learning_rate": 1.339040265168322e-05, "loss": 0.2663, "step": 2121 }, { "epoch": 6.692368214004721, "grad_norm": 0.21226796949929252, "learning_rate": 1.337425213930277e-05, "loss": 0.2708, "step": 2122 }, { "epoch": 6.6955153422501965, "grad_norm": 0.20469216116105463, "learning_rate": 1.3358110636174443e-05, "loss": 0.277, "step": 2123 }, { "epoch": 6.698662470495672, "grad_norm": 0.21150553812629214, "learning_rate": 1.3341978159157388e-05, "loss": 0.2726, "step": 2124 }, { "epoch": 6.701809598741149, "grad_norm": 0.2009979298846876, "learning_rate": 1.3325854725101346e-05, "loss": 0.2742, "step": 2125 }, { "epoch": 6.704956726986625, "grad_norm": 0.20585970570515025, "learning_rate": 1.3309740350846597e-05, "loss": 0.2712, "step": 2126 }, { "epoch": 6.708103855232101, "grad_norm": 0.20267136318362497, "learning_rate": 1.3293635053223976e-05, "loss": 0.2768, "step": 2127 }, { "epoch": 6.711250983477576, "grad_norm": 0.1966034773897379, "learning_rate": 1.3277538849054818e-05, "loss": 0.2685, "step": 2128 }, { "epoch": 6.714398111723053, "grad_norm": 0.19084672362126345, "learning_rate": 1.326145175515098e-05, "loss": 0.2707, "step": 2129 }, { "epoch": 6.717545239968529, "grad_norm": 0.21151915236025404, "learning_rate": 1.324537378831479e-05, "loss": 0.2762, "step": 2130 }, { "epoch": 6.720692368214005, "grad_norm": 0.19161947045159958, "learning_rate": 1.3229304965339052e-05, "loss": 0.272, "step": 2131 }, { "epoch": 6.7238394964594805, "grad_norm": 0.2057037763555039, "learning_rate": 1.3213245303007018e-05, "loss": 0.2731, "step": 2132 }, { "epoch": 6.726986624704956, "grad_norm": 0.20303632318163164, "learning_rate": 1.3197194818092359e-05, "loss": 0.2773, "step": 2133 }, { "epoch": 6.730133752950433, "grad_norm": 0.19044415445481397, "learning_rate": 1.318115352735918e-05, "loss": 0.2793, "step": 2134 }, { "epoch": 6.733280881195909, "grad_norm": 0.2073975268632104, "learning_rate": 1.3165121447561968e-05, "loss": 0.2683, "step": 2135 }, { "epoch": 6.736428009441385, "grad_norm": 0.2133081665575485, "learning_rate": 1.3149098595445604e-05, "loss": 0.2742, "step": 2136 }, { "epoch": 6.7395751376868605, "grad_norm": 0.21038693316237772, "learning_rate": 1.313308498774531e-05, "loss": 0.2712, "step": 2137 }, { "epoch": 6.742722265932336, "grad_norm": 0.21412237772602838, "learning_rate": 1.3117080641186672e-05, "loss": 0.2765, "step": 2138 }, { "epoch": 6.745869394177813, "grad_norm": 0.20678305330276317, "learning_rate": 1.3101085572485603e-05, "loss": 0.2688, "step": 2139 }, { "epoch": 6.749016522423289, "grad_norm": 0.21245293744715033, "learning_rate": 1.3085099798348306e-05, "loss": 0.2718, "step": 2140 }, { "epoch": 6.752163650668765, "grad_norm": 0.20398550728736917, "learning_rate": 1.3069123335471301e-05, "loss": 0.2714, "step": 2141 }, { "epoch": 6.755310778914241, "grad_norm": 0.2031385438667128, "learning_rate": 1.3053156200541364e-05, "loss": 0.2699, "step": 2142 }, { "epoch": 6.758457907159717, "grad_norm": 0.19773088989378942, "learning_rate": 1.303719841023553e-05, "loss": 0.2635, "step": 2143 }, { "epoch": 6.761605035405193, "grad_norm": 0.21395849634573397, "learning_rate": 1.3021249981221086e-05, "loss": 0.2771, "step": 2144 }, { "epoch": 6.764752163650669, "grad_norm": 0.19961802457486216, "learning_rate": 1.3005310930155544e-05, "loss": 0.2709, "step": 2145 }, { "epoch": 6.7678992918961445, "grad_norm": 0.21134776492595922, "learning_rate": 1.2989381273686597e-05, "loss": 0.2669, "step": 2146 }, { "epoch": 6.771046420141621, "grad_norm": 0.19864060482042745, "learning_rate": 1.2973461028452144e-05, "loss": 0.2706, "step": 2147 }, { "epoch": 6.774193548387097, "grad_norm": 0.2044619678907636, "learning_rate": 1.2957550211080259e-05, "loss": 0.2739, "step": 2148 }, { "epoch": 6.777340676632573, "grad_norm": 0.21504368793018358, "learning_rate": 1.2941648838189147e-05, "loss": 0.2674, "step": 2149 }, { "epoch": 6.780487804878049, "grad_norm": 0.20378150368432318, "learning_rate": 1.2925756926387177e-05, "loss": 0.2696, "step": 2150 }, { "epoch": 6.783634933123524, "grad_norm": 0.20600148728967427, "learning_rate": 1.2909874492272807e-05, "loss": 0.2802, "step": 2151 }, { "epoch": 6.786782061369001, "grad_norm": 0.19938138549196283, "learning_rate": 1.2894001552434626e-05, "loss": 0.2759, "step": 2152 }, { "epoch": 6.789929189614477, "grad_norm": 0.21154201382497265, "learning_rate": 1.2878138123451274e-05, "loss": 0.2731, "step": 2153 }, { "epoch": 6.793076317859953, "grad_norm": 0.19784577469812065, "learning_rate": 1.2862284221891485e-05, "loss": 0.2763, "step": 2154 }, { "epoch": 6.7962234461054285, "grad_norm": 0.21310912362182374, "learning_rate": 1.2846439864314037e-05, "loss": 0.2761, "step": 2155 }, { "epoch": 6.799370574350904, "grad_norm": 0.20695979034558215, "learning_rate": 1.283060506726772e-05, "loss": 0.2774, "step": 2156 }, { "epoch": 6.802517702596381, "grad_norm": 0.21261086091250228, "learning_rate": 1.2814779847291367e-05, "loss": 0.2758, "step": 2157 }, { "epoch": 6.805664830841857, "grad_norm": 0.20979015880101165, "learning_rate": 1.2798964220913772e-05, "loss": 0.2804, "step": 2158 }, { "epoch": 6.808811959087333, "grad_norm": 0.21761044939140664, "learning_rate": 1.278315820465376e-05, "loss": 0.2769, "step": 2159 }, { "epoch": 6.811959087332809, "grad_norm": 0.2014697385726117, "learning_rate": 1.2767361815020065e-05, "loss": 0.2783, "step": 2160 }, { "epoch": 6.815106215578285, "grad_norm": 0.21283798980232946, "learning_rate": 1.2751575068511408e-05, "loss": 0.2657, "step": 2161 }, { "epoch": 6.818253343823761, "grad_norm": 0.19962011480967196, "learning_rate": 1.2735797981616407e-05, "loss": 0.2806, "step": 2162 }, { "epoch": 6.821400472069237, "grad_norm": 0.21924693121937547, "learning_rate": 1.2720030570813608e-05, "loss": 0.2746, "step": 2163 }, { "epoch": 6.8245476003147125, "grad_norm": 0.19657838694235807, "learning_rate": 1.2704272852571455e-05, "loss": 0.2684, "step": 2164 }, { "epoch": 6.827694728560189, "grad_norm": 0.2030249463511617, "learning_rate": 1.2688524843348252e-05, "loss": 0.2722, "step": 2165 }, { "epoch": 6.830841856805665, "grad_norm": 0.20062136834665203, "learning_rate": 1.2672786559592178e-05, "loss": 0.2722, "step": 2166 }, { "epoch": 6.833988985051141, "grad_norm": 0.21259980872470255, "learning_rate": 1.2657058017741237e-05, "loss": 0.2746, "step": 2167 }, { "epoch": 6.837136113296617, "grad_norm": 0.18743513115346688, "learning_rate": 1.2641339234223282e-05, "loss": 0.2695, "step": 2168 }, { "epoch": 6.840283241542092, "grad_norm": 0.20138411396722927, "learning_rate": 1.2625630225455946e-05, "loss": 0.2764, "step": 2169 }, { "epoch": 6.843430369787569, "grad_norm": 0.20724943586989145, "learning_rate": 1.2609931007846672e-05, "loss": 0.28, "step": 2170 }, { "epoch": 6.846577498033045, "grad_norm": 0.20364398935190642, "learning_rate": 1.2594241597792678e-05, "loss": 0.2742, "step": 2171 }, { "epoch": 6.849724626278521, "grad_norm": 0.21697947841842968, "learning_rate": 1.2578562011680914e-05, "loss": 0.2722, "step": 2172 }, { "epoch": 6.8528717545239966, "grad_norm": 0.20438816521590877, "learning_rate": 1.2562892265888116e-05, "loss": 0.2742, "step": 2173 }, { "epoch": 6.856018882769473, "grad_norm": 0.2160342522382541, "learning_rate": 1.2547232376780687e-05, "loss": 0.2757, "step": 2174 }, { "epoch": 6.859166011014949, "grad_norm": 0.20593020428643655, "learning_rate": 1.2531582360714775e-05, "loss": 0.2675, "step": 2175 }, { "epoch": 6.862313139260425, "grad_norm": 0.21437695083001138, "learning_rate": 1.251594223403619e-05, "loss": 0.2693, "step": 2176 }, { "epoch": 6.865460267505901, "grad_norm": 0.2119697416305465, "learning_rate": 1.2500312013080444e-05, "loss": 0.2669, "step": 2177 }, { "epoch": 6.868607395751377, "grad_norm": 0.2060511460509206, "learning_rate": 1.2484691714172663e-05, "loss": 0.2861, "step": 2178 }, { "epoch": 6.871754523996853, "grad_norm": 0.19849969915264076, "learning_rate": 1.246908135362764e-05, "loss": 0.2758, "step": 2179 }, { "epoch": 6.874901652242329, "grad_norm": 0.20718364482758267, "learning_rate": 1.2453480947749785e-05, "loss": 0.2746, "step": 2180 }, { "epoch": 6.878048780487805, "grad_norm": 0.2009159233212597, "learning_rate": 1.2437890512833089e-05, "loss": 0.2804, "step": 2181 }, { "epoch": 6.881195908733281, "grad_norm": 0.20361754932690013, "learning_rate": 1.2422310065161162e-05, "loss": 0.265, "step": 2182 }, { "epoch": 6.884343036978757, "grad_norm": 0.20703457454924554, "learning_rate": 1.240673962100715e-05, "loss": 0.2686, "step": 2183 }, { "epoch": 6.887490165224233, "grad_norm": 0.20068143636302968, "learning_rate": 1.2391179196633776e-05, "loss": 0.2763, "step": 2184 }, { "epoch": 6.890637293469709, "grad_norm": 0.20358246660033522, "learning_rate": 1.2375628808293274e-05, "loss": 0.2792, "step": 2185 }, { "epoch": 6.893784421715185, "grad_norm": 0.2036764238431656, "learning_rate": 1.2360088472227418e-05, "loss": 0.2737, "step": 2186 }, { "epoch": 6.8969315499606605, "grad_norm": 0.20147681379716198, "learning_rate": 1.2344558204667475e-05, "loss": 0.2725, "step": 2187 }, { "epoch": 6.900078678206137, "grad_norm": 0.2071864222789661, "learning_rate": 1.2329038021834193e-05, "loss": 0.2709, "step": 2188 }, { "epoch": 6.903225806451613, "grad_norm": 0.2043649714329845, "learning_rate": 1.231352793993779e-05, "loss": 0.2738, "step": 2189 }, { "epoch": 6.906372934697089, "grad_norm": 0.20125299342080424, "learning_rate": 1.2298027975177926e-05, "loss": 0.2636, "step": 2190 }, { "epoch": 6.909520062942565, "grad_norm": 0.20814571779048338, "learning_rate": 1.2282538143743712e-05, "loss": 0.2771, "step": 2191 }, { "epoch": 6.912667191188041, "grad_norm": 0.20030130841003688, "learning_rate": 1.2267058461813649e-05, "loss": 0.2694, "step": 2192 }, { "epoch": 6.915814319433517, "grad_norm": 0.21200310708703735, "learning_rate": 1.2251588945555666e-05, "loss": 0.2725, "step": 2193 }, { "epoch": 6.918961447678993, "grad_norm": 0.20922436094459726, "learning_rate": 1.2236129611127045e-05, "loss": 0.2726, "step": 2194 }, { "epoch": 6.922108575924469, "grad_norm": 0.19704648497715532, "learning_rate": 1.2220680474674458e-05, "loss": 0.2741, "step": 2195 }, { "epoch": 6.925255704169945, "grad_norm": 0.21570986491875785, "learning_rate": 1.2205241552333922e-05, "loss": 0.2716, "step": 2196 }, { "epoch": 6.928402832415421, "grad_norm": 0.20000918248634075, "learning_rate": 1.218981286023077e-05, "loss": 0.2791, "step": 2197 }, { "epoch": 6.931549960660897, "grad_norm": 0.1997584203723108, "learning_rate": 1.2174394414479667e-05, "loss": 0.2783, "step": 2198 }, { "epoch": 6.934697088906373, "grad_norm": 0.20118408426733328, "learning_rate": 1.215898623118456e-05, "loss": 0.2736, "step": 2199 }, { "epoch": 6.937844217151849, "grad_norm": 0.19493904766515557, "learning_rate": 1.2143588326438697e-05, "loss": 0.2734, "step": 2200 }, { "epoch": 6.940991345397325, "grad_norm": 0.21288261790118718, "learning_rate": 1.2128200716324566e-05, "loss": 0.2768, "step": 2201 }, { "epoch": 6.944138473642801, "grad_norm": 0.1976856689237112, "learning_rate": 1.2112823416913936e-05, "loss": 0.2747, "step": 2202 }, { "epoch": 6.947285601888277, "grad_norm": 0.2130261035735836, "learning_rate": 1.2097456444267771e-05, "loss": 0.2677, "step": 2203 }, { "epoch": 6.950432730133753, "grad_norm": 0.19662526671296285, "learning_rate": 1.208209981443627e-05, "loss": 0.2717, "step": 2204 }, { "epoch": 6.9535798583792285, "grad_norm": 0.20222706552909867, "learning_rate": 1.2066753543458835e-05, "loss": 0.2711, "step": 2205 }, { "epoch": 6.956726986624705, "grad_norm": 0.2069577717265713, "learning_rate": 1.2051417647364021e-05, "loss": 0.2793, "step": 2206 }, { "epoch": 6.959874114870181, "grad_norm": 0.203212313831048, "learning_rate": 1.2036092142169582e-05, "loss": 0.2763, "step": 2207 }, { "epoch": 6.963021243115657, "grad_norm": 0.20509305087697424, "learning_rate": 1.2020777043882386e-05, "loss": 0.2759, "step": 2208 }, { "epoch": 6.966168371361133, "grad_norm": 0.1989672235984331, "learning_rate": 1.2005472368498457e-05, "loss": 0.2713, "step": 2209 }, { "epoch": 6.969315499606609, "grad_norm": 0.22138011183206288, "learning_rate": 1.1990178132002913e-05, "loss": 0.2692, "step": 2210 }, { "epoch": 6.972462627852085, "grad_norm": 0.2094379431636141, "learning_rate": 1.1974894350369981e-05, "loss": 0.2788, "step": 2211 }, { "epoch": 6.975609756097561, "grad_norm": 0.20436997575009863, "learning_rate": 1.195962103956298e-05, "loss": 0.2759, "step": 2212 }, { "epoch": 6.978756884343037, "grad_norm": 0.204009380946763, "learning_rate": 1.1944358215534258e-05, "loss": 0.2701, "step": 2213 }, { "epoch": 6.9819040125885135, "grad_norm": 0.20886713208010613, "learning_rate": 1.1929105894225248e-05, "loss": 0.2687, "step": 2214 }, { "epoch": 6.985051140833989, "grad_norm": 0.20522061292808225, "learning_rate": 1.1913864091566372e-05, "loss": 0.2628, "step": 2215 }, { "epoch": 6.988198269079465, "grad_norm": 0.202811286102291, "learning_rate": 1.1898632823477121e-05, "loss": 0.2757, "step": 2216 }, { "epoch": 6.991345397324941, "grad_norm": 0.2192831752403655, "learning_rate": 1.1883412105865925e-05, "loss": 0.2698, "step": 2217 }, { "epoch": 6.994492525570417, "grad_norm": 0.19233397196404134, "learning_rate": 1.1868201954630238e-05, "loss": 0.2723, "step": 2218 }, { "epoch": 6.997639653815893, "grad_norm": 0.210128078621367, "learning_rate": 1.185300238565645e-05, "loss": 0.2774, "step": 2219 }, { "epoch": 7.003147128245476, "grad_norm": 0.5714481470801931, "learning_rate": 1.183781341481991e-05, "loss": 0.4569, "step": 2220 }, { "epoch": 7.006294256490952, "grad_norm": 0.4250625166239754, "learning_rate": 1.1822635057984906e-05, "loss": 0.2112, "step": 2221 }, { "epoch": 7.009441384736428, "grad_norm": 0.2691450770861746, "learning_rate": 1.1807467331004619e-05, "loss": 0.2138, "step": 2222 }, { "epoch": 7.012588512981904, "grad_norm": 0.6396419703643319, "learning_rate": 1.179231024972115e-05, "loss": 0.2188, "step": 2223 }, { "epoch": 7.01573564122738, "grad_norm": 0.3063619660764144, "learning_rate": 1.177716382996546e-05, "loss": 0.2141, "step": 2224 }, { "epoch": 7.018882769472856, "grad_norm": 0.37212547491407416, "learning_rate": 1.1762028087557393e-05, "loss": 0.207, "step": 2225 }, { "epoch": 7.022029897718332, "grad_norm": 0.381130447699765, "learning_rate": 1.1746903038305626e-05, "loss": 0.2121, "step": 2226 }, { "epoch": 7.025177025963808, "grad_norm": 0.3005238930589458, "learning_rate": 1.1731788698007675e-05, "loss": 0.2127, "step": 2227 }, { "epoch": 7.028324154209284, "grad_norm": 0.3300998957168307, "learning_rate": 1.1716685082449879e-05, "loss": 0.2237, "step": 2228 }, { "epoch": 7.03147128245476, "grad_norm": 0.38114455718692064, "learning_rate": 1.1701592207407355e-05, "loss": 0.2176, "step": 2229 }, { "epoch": 7.034618410700236, "grad_norm": 0.30283344304928583, "learning_rate": 1.1686510088644014e-05, "loss": 0.2086, "step": 2230 }, { "epoch": 7.037765538945712, "grad_norm": 0.3331464112562727, "learning_rate": 1.167143874191254e-05, "loss": 0.2075, "step": 2231 }, { "epoch": 7.040912667191188, "grad_norm": 0.29800299120189844, "learning_rate": 1.1656378182954357e-05, "loss": 0.2052, "step": 2232 }, { "epoch": 7.044059795436664, "grad_norm": 0.2894375393244374, "learning_rate": 1.1641328427499614e-05, "loss": 0.2071, "step": 2233 }, { "epoch": 7.04720692368214, "grad_norm": 0.3155503383974769, "learning_rate": 1.1626289491267197e-05, "loss": 0.2161, "step": 2234 }, { "epoch": 7.050354051927616, "grad_norm": 0.26776854986829085, "learning_rate": 1.161126138996467e-05, "loss": 0.2022, "step": 2235 }, { "epoch": 7.053501180173092, "grad_norm": 0.27601227009065155, "learning_rate": 1.1596244139288286e-05, "loss": 0.2066, "step": 2236 }, { "epoch": 7.056648308418568, "grad_norm": 0.27656033063047314, "learning_rate": 1.1581237754922984e-05, "loss": 0.2104, "step": 2237 }, { "epoch": 7.059795436664044, "grad_norm": 0.2449882270902797, "learning_rate": 1.1566242252542325e-05, "loss": 0.2073, "step": 2238 }, { "epoch": 7.06294256490952, "grad_norm": 0.2768374274550793, "learning_rate": 1.1551257647808524e-05, "loss": 0.2102, "step": 2239 }, { "epoch": 7.066089693154996, "grad_norm": 0.26011823472626777, "learning_rate": 1.1536283956372402e-05, "loss": 0.2142, "step": 2240 }, { "epoch": 7.069236821400472, "grad_norm": 0.25819042597992947, "learning_rate": 1.1521321193873395e-05, "loss": 0.208, "step": 2241 }, { "epoch": 7.072383949645948, "grad_norm": 0.24665055592686272, "learning_rate": 1.1506369375939506e-05, "loss": 0.208, "step": 2242 }, { "epoch": 7.075531077891424, "grad_norm": 0.24751137987885644, "learning_rate": 1.1491428518187321e-05, "loss": 0.2092, "step": 2243 }, { "epoch": 7.0786782061369005, "grad_norm": 0.24624896065268903, "learning_rate": 1.1476498636221978e-05, "loss": 0.2087, "step": 2244 }, { "epoch": 7.081825334382376, "grad_norm": 0.25982701212215953, "learning_rate": 1.1461579745637143e-05, "loss": 0.2063, "step": 2245 }, { "epoch": 7.084972462627852, "grad_norm": 0.2397365742144504, "learning_rate": 1.1446671862015013e-05, "loss": 0.2151, "step": 2246 }, { "epoch": 7.088119590873328, "grad_norm": 0.23751859174229084, "learning_rate": 1.1431775000926272e-05, "loss": 0.2067, "step": 2247 }, { "epoch": 7.091266719118804, "grad_norm": 0.23996775925549246, "learning_rate": 1.1416889177930113e-05, "loss": 0.2113, "step": 2248 }, { "epoch": 7.09441384736428, "grad_norm": 0.24150863328124383, "learning_rate": 1.1402014408574177e-05, "loss": 0.2125, "step": 2249 }, { "epoch": 7.097560975609756, "grad_norm": 0.24163504765540855, "learning_rate": 1.1387150708394586e-05, "loss": 0.1962, "step": 2250 }, { "epoch": 7.100708103855232, "grad_norm": 0.24010609549944184, "learning_rate": 1.1372298092915868e-05, "loss": 0.2141, "step": 2251 }, { "epoch": 7.103855232100708, "grad_norm": 0.2454335372395361, "learning_rate": 1.1357456577651007e-05, "loss": 0.2105, "step": 2252 }, { "epoch": 7.1070023603461845, "grad_norm": 0.23394725001346658, "learning_rate": 1.1342626178101374e-05, "loss": 0.2079, "step": 2253 }, { "epoch": 7.11014948859166, "grad_norm": 0.23777743303747212, "learning_rate": 1.132780690975673e-05, "loss": 0.2114, "step": 2254 }, { "epoch": 7.113296616837136, "grad_norm": 0.22606234526414365, "learning_rate": 1.131299878809522e-05, "loss": 0.2081, "step": 2255 }, { "epoch": 7.116443745082612, "grad_norm": 0.2418578090305854, "learning_rate": 1.1298201828583332e-05, "loss": 0.2066, "step": 2256 }, { "epoch": 7.119590873328088, "grad_norm": 0.23113427714810778, "learning_rate": 1.1283416046675916e-05, "loss": 0.2102, "step": 2257 }, { "epoch": 7.122738001573564, "grad_norm": 0.2381266978689901, "learning_rate": 1.1268641457816117e-05, "loss": 0.207, "step": 2258 }, { "epoch": 7.12588512981904, "grad_norm": 0.2361934040445735, "learning_rate": 1.1253878077435436e-05, "loss": 0.2158, "step": 2259 }, { "epoch": 7.129032258064516, "grad_norm": 0.21836833345649923, "learning_rate": 1.1239125920953615e-05, "loss": 0.2134, "step": 2260 }, { "epoch": 7.132179386309992, "grad_norm": 0.24198498582441896, "learning_rate": 1.122438500377871e-05, "loss": 0.2042, "step": 2261 }, { "epoch": 7.1353265145554685, "grad_norm": 0.22713295487622734, "learning_rate": 1.1209655341307024e-05, "loss": 0.2117, "step": 2262 }, { "epoch": 7.138473642800944, "grad_norm": 0.2343503065926964, "learning_rate": 1.1194936948923103e-05, "loss": 0.2098, "step": 2263 }, { "epoch": 7.14162077104642, "grad_norm": 0.22909025791500967, "learning_rate": 1.1180229841999726e-05, "loss": 0.2106, "step": 2264 }, { "epoch": 7.144767899291896, "grad_norm": 0.227228211003999, "learning_rate": 1.1165534035897881e-05, "loss": 0.2192, "step": 2265 }, { "epoch": 7.147915027537372, "grad_norm": 0.22905608109888015, "learning_rate": 1.1150849545966766e-05, "loss": 0.2085, "step": 2266 }, { "epoch": 7.151062155782848, "grad_norm": 0.21727537194341173, "learning_rate": 1.1136176387543736e-05, "loss": 0.2122, "step": 2267 }, { "epoch": 7.154209284028324, "grad_norm": 0.23840050117066902, "learning_rate": 1.1121514575954327e-05, "loss": 0.2149, "step": 2268 }, { "epoch": 7.1573564122738, "grad_norm": 0.22511280292668318, "learning_rate": 1.1106864126512233e-05, "loss": 0.2026, "step": 2269 }, { "epoch": 7.160503540519276, "grad_norm": 0.2319999499213673, "learning_rate": 1.109222505451925e-05, "loss": 0.2045, "step": 2270 }, { "epoch": 7.1636506687647525, "grad_norm": 0.22621753505730435, "learning_rate": 1.1077597375265325e-05, "loss": 0.2024, "step": 2271 }, { "epoch": 7.166797797010228, "grad_norm": 0.22239947016703665, "learning_rate": 1.1062981104028479e-05, "loss": 0.2096, "step": 2272 }, { "epoch": 7.169944925255704, "grad_norm": 0.23183568916578862, "learning_rate": 1.1048376256074831e-05, "loss": 0.2046, "step": 2273 }, { "epoch": 7.17309205350118, "grad_norm": 0.23415363186834018, "learning_rate": 1.1033782846658567e-05, "loss": 0.2126, "step": 2274 }, { "epoch": 7.176239181746656, "grad_norm": 0.21870216739732176, "learning_rate": 1.1019200891021932e-05, "loss": 0.201, "step": 2275 }, { "epoch": 7.1793863099921325, "grad_norm": 0.23538602690513877, "learning_rate": 1.1004630404395193e-05, "loss": 0.2138, "step": 2276 }, { "epoch": 7.182533438237608, "grad_norm": 0.2278346344735737, "learning_rate": 1.0990071401996647e-05, "loss": 0.2097, "step": 2277 }, { "epoch": 7.185680566483084, "grad_norm": 0.22286482289133921, "learning_rate": 1.0975523899032603e-05, "loss": 0.2082, "step": 2278 }, { "epoch": 7.18882769472856, "grad_norm": 0.21831126634768455, "learning_rate": 1.0960987910697338e-05, "loss": 0.2098, "step": 2279 }, { "epoch": 7.191974822974037, "grad_norm": 0.23172372964239146, "learning_rate": 1.0946463452173135e-05, "loss": 0.2096, "step": 2280 }, { "epoch": 7.195121951219512, "grad_norm": 0.22322237214331764, "learning_rate": 1.0931950538630199e-05, "loss": 0.2132, "step": 2281 }, { "epoch": 7.198269079464988, "grad_norm": 0.22598175395258135, "learning_rate": 1.0917449185226702e-05, "loss": 0.2108, "step": 2282 }, { "epoch": 7.201416207710464, "grad_norm": 0.22815280201115662, "learning_rate": 1.090295940710873e-05, "loss": 0.2135, "step": 2283 }, { "epoch": 7.20456333595594, "grad_norm": 0.22423887146026672, "learning_rate": 1.0888481219410286e-05, "loss": 0.2155, "step": 2284 }, { "epoch": 7.2077104642014165, "grad_norm": 0.23680769002713375, "learning_rate": 1.087401463725326e-05, "loss": 0.2115, "step": 2285 }, { "epoch": 7.210857592446892, "grad_norm": 0.22339424747961625, "learning_rate": 1.0859559675747427e-05, "loss": 0.2073, "step": 2286 }, { "epoch": 7.214004720692368, "grad_norm": 0.22958762279363118, "learning_rate": 1.0845116349990418e-05, "loss": 0.2102, "step": 2287 }, { "epoch": 7.217151848937844, "grad_norm": 0.21905123849431263, "learning_rate": 1.083068467506772e-05, "loss": 0.2096, "step": 2288 }, { "epoch": 7.220298977183321, "grad_norm": 0.2299465638743488, "learning_rate": 1.0816264666052652e-05, "loss": 0.2103, "step": 2289 }, { "epoch": 7.223446105428796, "grad_norm": 0.22978612951320251, "learning_rate": 1.0801856338006323e-05, "loss": 0.2155, "step": 2290 }, { "epoch": 7.226593233674272, "grad_norm": 0.22975189777607816, "learning_rate": 1.0787459705977681e-05, "loss": 0.2114, "step": 2291 }, { "epoch": 7.229740361919748, "grad_norm": 0.22933284295055767, "learning_rate": 1.0773074785003426e-05, "loss": 0.2108, "step": 2292 }, { "epoch": 7.232887490165224, "grad_norm": 0.23052487817754658, "learning_rate": 1.0758701590108039e-05, "loss": 0.2054, "step": 2293 }, { "epoch": 7.2360346184107005, "grad_norm": 0.22513890179226442, "learning_rate": 1.0744340136303765e-05, "loss": 0.2069, "step": 2294 }, { "epoch": 7.239181746656176, "grad_norm": 0.22537210978153835, "learning_rate": 1.0729990438590558e-05, "loss": 0.2154, "step": 2295 }, { "epoch": 7.242328874901652, "grad_norm": 0.22711701620016747, "learning_rate": 1.0715652511956122e-05, "loss": 0.2117, "step": 2296 }, { "epoch": 7.245476003147128, "grad_norm": 0.21447246807326145, "learning_rate": 1.0701326371375842e-05, "loss": 0.2099, "step": 2297 }, { "epoch": 7.248623131392605, "grad_norm": 0.22924666327151738, "learning_rate": 1.0687012031812818e-05, "loss": 0.2059, "step": 2298 }, { "epoch": 7.25177025963808, "grad_norm": 0.2161633462452467, "learning_rate": 1.0672709508217796e-05, "loss": 0.2071, "step": 2299 }, { "epoch": 7.254917387883556, "grad_norm": 0.24671002964793948, "learning_rate": 1.0658418815529204e-05, "loss": 0.2194, "step": 2300 }, { "epoch": 7.258064516129032, "grad_norm": 0.21221688808795114, "learning_rate": 1.0644139968673101e-05, "loss": 0.2182, "step": 2301 }, { "epoch": 7.261211644374509, "grad_norm": 0.22568981603880797, "learning_rate": 1.062987298256318e-05, "loss": 0.2159, "step": 2302 }, { "epoch": 7.2643587726199845, "grad_norm": 0.22104241538483152, "learning_rate": 1.0615617872100752e-05, "loss": 0.2041, "step": 2303 }, { "epoch": 7.26750590086546, "grad_norm": 0.22669047159973574, "learning_rate": 1.06013746521747e-05, "loss": 0.2078, "step": 2304 }, { "epoch": 7.270653029110936, "grad_norm": 0.2253716719320985, "learning_rate": 1.0587143337661516e-05, "loss": 0.2125, "step": 2305 }, { "epoch": 7.273800157356412, "grad_norm": 0.22803877169918388, "learning_rate": 1.0572923943425234e-05, "loss": 0.2092, "step": 2306 }, { "epoch": 7.276947285601889, "grad_norm": 0.2213507250976366, "learning_rate": 1.0558716484317456e-05, "loss": 0.2108, "step": 2307 }, { "epoch": 7.280094413847364, "grad_norm": 0.2273540787687723, "learning_rate": 1.05445209751773e-05, "loss": 0.2134, "step": 2308 }, { "epoch": 7.28324154209284, "grad_norm": 0.2138902730182339, "learning_rate": 1.053033743083142e-05, "loss": 0.2089, "step": 2309 }, { "epoch": 7.286388670338316, "grad_norm": 0.23680899817380743, "learning_rate": 1.0516165866093974e-05, "loss": 0.2108, "step": 2310 }, { "epoch": 7.289535798583792, "grad_norm": 0.22183476822950635, "learning_rate": 1.0502006295766589e-05, "loss": 0.2174, "step": 2311 }, { "epoch": 7.2926829268292686, "grad_norm": 0.22004800892442652, "learning_rate": 1.0487858734638385e-05, "loss": 0.2151, "step": 2312 }, { "epoch": 7.295830055074744, "grad_norm": 0.22181045274812225, "learning_rate": 1.0473723197485914e-05, "loss": 0.2025, "step": 2313 }, { "epoch": 7.29897718332022, "grad_norm": 0.21908332323352983, "learning_rate": 1.0459599699073206e-05, "loss": 0.2162, "step": 2314 }, { "epoch": 7.302124311565696, "grad_norm": 0.21884697231931952, "learning_rate": 1.044548825415168e-05, "loss": 0.2129, "step": 2315 }, { "epoch": 7.305271439811173, "grad_norm": 0.2187517231572296, "learning_rate": 1.043138887746018e-05, "loss": 0.2092, "step": 2316 }, { "epoch": 7.3084185680566485, "grad_norm": 0.22546922277138795, "learning_rate": 1.041730158372496e-05, "loss": 0.2062, "step": 2317 }, { "epoch": 7.311565696302124, "grad_norm": 0.22614767597501462, "learning_rate": 1.0403226387659628e-05, "loss": 0.2141, "step": 2318 }, { "epoch": 7.3147128245476, "grad_norm": 0.22707234003611404, "learning_rate": 1.0389163303965186e-05, "loss": 0.2122, "step": 2319 }, { "epoch": 7.317859952793077, "grad_norm": 0.23186259964324954, "learning_rate": 1.0375112347329946e-05, "loss": 0.2146, "step": 2320 }, { "epoch": 7.321007081038553, "grad_norm": 0.23276792906716168, "learning_rate": 1.0361073532429594e-05, "loss": 0.2103, "step": 2321 }, { "epoch": 7.324154209284028, "grad_norm": 0.2074352547542711, "learning_rate": 1.0347046873927104e-05, "loss": 0.2104, "step": 2322 }, { "epoch": 7.327301337529504, "grad_norm": 0.2236327394327096, "learning_rate": 1.0333032386472775e-05, "loss": 0.2155, "step": 2323 }, { "epoch": 7.33044846577498, "grad_norm": 0.221050234723865, "learning_rate": 1.0319030084704175e-05, "loss": 0.2214, "step": 2324 }, { "epoch": 7.333595594020457, "grad_norm": 0.2249617592191245, "learning_rate": 1.0305039983246159e-05, "loss": 0.2054, "step": 2325 }, { "epoch": 7.3367427222659325, "grad_norm": 0.22698815261155295, "learning_rate": 1.0291062096710837e-05, "loss": 0.2071, "step": 2326 }, { "epoch": 7.339889850511408, "grad_norm": 0.2268711614187744, "learning_rate": 1.0277096439697552e-05, "loss": 0.2145, "step": 2327 }, { "epoch": 7.343036978756884, "grad_norm": 0.215143567561118, "learning_rate": 1.0263143026792883e-05, "loss": 0.207, "step": 2328 }, { "epoch": 7.34618410700236, "grad_norm": 0.22328803868837543, "learning_rate": 1.0249201872570614e-05, "loss": 0.2183, "step": 2329 }, { "epoch": 7.349331235247837, "grad_norm": 0.2218308643421254, "learning_rate": 1.0235272991591732e-05, "loss": 0.2099, "step": 2330 }, { "epoch": 7.352478363493312, "grad_norm": 0.23227296918591858, "learning_rate": 1.0221356398404398e-05, "loss": 0.2096, "step": 2331 }, { "epoch": 7.355625491738788, "grad_norm": 0.2387762137802973, "learning_rate": 1.0207452107543955e-05, "loss": 0.2065, "step": 2332 }, { "epoch": 7.358772619984264, "grad_norm": 0.22570367340945718, "learning_rate": 1.0193560133532868e-05, "loss": 0.2131, "step": 2333 }, { "epoch": 7.361919748229741, "grad_norm": 0.2306105201682074, "learning_rate": 1.017968049088076e-05, "loss": 0.2166, "step": 2334 }, { "epoch": 7.3650668764752165, "grad_norm": 0.2247866318155448, "learning_rate": 1.0165813194084375e-05, "loss": 0.2065, "step": 2335 }, { "epoch": 7.368214004720692, "grad_norm": 0.22844131668659315, "learning_rate": 1.0151958257627541e-05, "loss": 0.2094, "step": 2336 }, { "epoch": 7.371361132966168, "grad_norm": 0.23333574403162458, "learning_rate": 1.0138115695981207e-05, "loss": 0.213, "step": 2337 }, { "epoch": 7.374508261211645, "grad_norm": 0.21257237150019098, "learning_rate": 1.0124285523603365e-05, "loss": 0.2187, "step": 2338 }, { "epoch": 7.377655389457121, "grad_norm": 0.22969384430433795, "learning_rate": 1.01104677549391e-05, "loss": 0.2108, "step": 2339 }, { "epoch": 7.380802517702596, "grad_norm": 0.23754367381929004, "learning_rate": 1.0096662404420501e-05, "loss": 0.2132, "step": 2340 }, { "epoch": 7.383949645948072, "grad_norm": 0.22700013636080565, "learning_rate": 1.0082869486466729e-05, "loss": 0.2067, "step": 2341 }, { "epoch": 7.387096774193548, "grad_norm": 0.23919755857430938, "learning_rate": 1.006908901548394e-05, "loss": 0.2117, "step": 2342 }, { "epoch": 7.390243902439025, "grad_norm": 0.227136733402989, "learning_rate": 1.0055321005865277e-05, "loss": 0.2162, "step": 2343 }, { "epoch": 7.3933910306845005, "grad_norm": 0.23525073363793073, "learning_rate": 1.0041565471990897e-05, "loss": 0.2112, "step": 2344 }, { "epoch": 7.396538158929976, "grad_norm": 0.2321185458009399, "learning_rate": 1.0027822428227889e-05, "loss": 0.215, "step": 2345 }, { "epoch": 7.399685287175452, "grad_norm": 0.23528217492361306, "learning_rate": 1.0014091888930344e-05, "loss": 0.2142, "step": 2346 }, { "epoch": 7.402832415420928, "grad_norm": 0.22749689788373387, "learning_rate": 1.0000373868439248e-05, "loss": 0.2158, "step": 2347 }, { "epoch": 7.405979543666405, "grad_norm": 0.2404493638710273, "learning_rate": 9.986668381082545e-06, "loss": 0.2168, "step": 2348 }, { "epoch": 7.4091266719118805, "grad_norm": 0.22585072391780345, "learning_rate": 9.972975441175057e-06, "loss": 0.2164, "step": 2349 }, { "epoch": 7.412273800157356, "grad_norm": 0.23795916213633916, "learning_rate": 9.959295063018526e-06, "loss": 0.215, "step": 2350 }, { "epoch": 7.415420928402832, "grad_norm": 0.23204552138933593, "learning_rate": 9.945627260901571e-06, "loss": 0.2174, "step": 2351 }, { "epoch": 7.418568056648309, "grad_norm": 0.22824560817611173, "learning_rate": 9.93197204909966e-06, "loss": 0.2111, "step": 2352 }, { "epoch": 7.421715184893785, "grad_norm": 0.2358749174129253, "learning_rate": 9.918329441875129e-06, "loss": 0.2132, "step": 2353 }, { "epoch": 7.42486231313926, "grad_norm": 0.23304655894118764, "learning_rate": 9.904699453477136e-06, "loss": 0.2121, "step": 2354 }, { "epoch": 7.428009441384736, "grad_norm": 0.2305516088388655, "learning_rate": 9.891082098141667e-06, "loss": 0.2165, "step": 2355 }, { "epoch": 7.431156569630213, "grad_norm": 0.23079140563064027, "learning_rate": 9.877477390091509e-06, "loss": 0.2141, "step": 2356 }, { "epoch": 7.434303697875689, "grad_norm": 0.22387025416375533, "learning_rate": 9.863885343536238e-06, "loss": 0.2121, "step": 2357 }, { "epoch": 7.4374508261211645, "grad_norm": 0.22787402873623003, "learning_rate": 9.850305972672214e-06, "loss": 0.2203, "step": 2358 }, { "epoch": 7.44059795436664, "grad_norm": 0.22535783554358702, "learning_rate": 9.836739291682543e-06, "loss": 0.2154, "step": 2359 }, { "epoch": 7.443745082612116, "grad_norm": 0.22981126911531366, "learning_rate": 9.823185314737104e-06, "loss": 0.2156, "step": 2360 }, { "epoch": 7.446892210857593, "grad_norm": 0.23338901289009809, "learning_rate": 9.809644055992471e-06, "loss": 0.2112, "step": 2361 }, { "epoch": 7.450039339103069, "grad_norm": 0.24023663975496, "learning_rate": 9.796115529591967e-06, "loss": 0.2093, "step": 2362 }, { "epoch": 7.453186467348544, "grad_norm": 0.22580520597689485, "learning_rate": 9.78259974966559e-06, "loss": 0.2175, "step": 2363 }, { "epoch": 7.45633359559402, "grad_norm": 0.2221577009585905, "learning_rate": 9.769096730330047e-06, "loss": 0.2128, "step": 2364 }, { "epoch": 7.459480723839496, "grad_norm": 0.2314324525926755, "learning_rate": 9.755606485688695e-06, "loss": 0.2064, "step": 2365 }, { "epoch": 7.462627852084973, "grad_norm": 0.2234171977467309, "learning_rate": 9.742129029831569e-06, "loss": 0.2137, "step": 2366 }, { "epoch": 7.4657749803304485, "grad_norm": 0.23731134897981873, "learning_rate": 9.728664376835343e-06, "loss": 0.2134, "step": 2367 }, { "epoch": 7.468922108575924, "grad_norm": 0.21962718713348828, "learning_rate": 9.7152125407633e-06, "loss": 0.2108, "step": 2368 }, { "epoch": 7.4720692368214, "grad_norm": 0.2207798183423775, "learning_rate": 9.701773535665366e-06, "loss": 0.2101, "step": 2369 }, { "epoch": 7.475216365066877, "grad_norm": 0.23437916694512362, "learning_rate": 9.688347375578033e-06, "loss": 0.2154, "step": 2370 }, { "epoch": 7.478363493312353, "grad_norm": 0.22408835369735966, "learning_rate": 9.674934074524411e-06, "loss": 0.2172, "step": 2371 }, { "epoch": 7.481510621557828, "grad_norm": 0.22121992831685067, "learning_rate": 9.661533646514142e-06, "loss": 0.2088, "step": 2372 }, { "epoch": 7.484657749803304, "grad_norm": 0.21478252709139647, "learning_rate": 9.648146105543457e-06, "loss": 0.213, "step": 2373 }, { "epoch": 7.487804878048781, "grad_norm": 0.22236538402387201, "learning_rate": 9.634771465595109e-06, "loss": 0.2146, "step": 2374 }, { "epoch": 7.490952006294257, "grad_norm": 0.2329798548119093, "learning_rate": 9.62140974063838e-06, "loss": 0.2147, "step": 2375 }, { "epoch": 7.4940991345397325, "grad_norm": 0.20764196366436552, "learning_rate": 9.608060944629065e-06, "loss": 0.2158, "step": 2376 }, { "epoch": 7.497246262785208, "grad_norm": 0.22039448738264225, "learning_rate": 9.59472509150945e-06, "loss": 0.2131, "step": 2377 }, { "epoch": 7.500393391030684, "grad_norm": 0.21967232307742138, "learning_rate": 9.581402195208307e-06, "loss": 0.2155, "step": 2378 }, { "epoch": 7.503540519276161, "grad_norm": 0.23165634584475214, "learning_rate": 9.568092269640867e-06, "loss": 0.2058, "step": 2379 }, { "epoch": 7.506687647521637, "grad_norm": 0.21342819732195714, "learning_rate": 9.554795328708833e-06, "loss": 0.2212, "step": 2380 }, { "epoch": 7.5098347757671124, "grad_norm": 0.21653855605412423, "learning_rate": 9.541511386300321e-06, "loss": 0.2184, "step": 2381 }, { "epoch": 7.512981904012588, "grad_norm": 0.212878668638118, "learning_rate": 9.528240456289887e-06, "loss": 0.2191, "step": 2382 }, { "epoch": 7.516129032258064, "grad_norm": 0.22346959923074045, "learning_rate": 9.5149825525385e-06, "loss": 0.214, "step": 2383 }, { "epoch": 7.519276160503541, "grad_norm": 0.22445356716908701, "learning_rate": 9.5017376888935e-06, "loss": 0.2115, "step": 2384 }, { "epoch": 7.522423288749017, "grad_norm": 0.22584124484887466, "learning_rate": 9.488505879188638e-06, "loss": 0.2104, "step": 2385 }, { "epoch": 7.525570416994492, "grad_norm": 0.2278568808064928, "learning_rate": 9.475287137244006e-06, "loss": 0.2119, "step": 2386 }, { "epoch": 7.528717545239968, "grad_norm": 0.22171779725801655, "learning_rate": 9.462081476866061e-06, "loss": 0.2092, "step": 2387 }, { "epoch": 7.531864673485445, "grad_norm": 0.22622915992130538, "learning_rate": 9.44888891184758e-06, "loss": 0.2116, "step": 2388 }, { "epoch": 7.535011801730921, "grad_norm": 0.21608386506577046, "learning_rate": 9.435709455967696e-06, "loss": 0.2125, "step": 2389 }, { "epoch": 7.5381589299763965, "grad_norm": 0.2190850436150323, "learning_rate": 9.422543122991816e-06, "loss": 0.215, "step": 2390 }, { "epoch": 7.541306058221872, "grad_norm": 0.22266706339169948, "learning_rate": 9.409389926671652e-06, "loss": 0.2231, "step": 2391 }, { "epoch": 7.544453186467349, "grad_norm": 0.223823120767086, "learning_rate": 9.396249880745208e-06, "loss": 0.2096, "step": 2392 }, { "epoch": 7.547600314712825, "grad_norm": 0.21474334984155782, "learning_rate": 9.383122998936728e-06, "loss": 0.2211, "step": 2393 }, { "epoch": 7.550747442958301, "grad_norm": 0.22017470608910686, "learning_rate": 9.370009294956731e-06, "loss": 0.2127, "step": 2394 }, { "epoch": 7.553894571203776, "grad_norm": 0.22623985048590417, "learning_rate": 9.356908782501953e-06, "loss": 0.2079, "step": 2395 }, { "epoch": 7.557041699449252, "grad_norm": 0.21790079455618502, "learning_rate": 9.34382147525537e-06, "loss": 0.2084, "step": 2396 }, { "epoch": 7.560188827694729, "grad_norm": 0.23038553545405369, "learning_rate": 9.330747386886145e-06, "loss": 0.2144, "step": 2397 }, { "epoch": 7.563335955940205, "grad_norm": 0.22823441881568057, "learning_rate": 9.317686531049651e-06, "loss": 0.2155, "step": 2398 }, { "epoch": 7.5664830841856805, "grad_norm": 0.22875720062537966, "learning_rate": 9.30463892138744e-06, "loss": 0.2163, "step": 2399 }, { "epoch": 7.569630212431156, "grad_norm": 0.2289164728738839, "learning_rate": 9.291604571527218e-06, "loss": 0.2136, "step": 2400 }, { "epoch": 7.572777340676632, "grad_norm": 0.215177642884113, "learning_rate": 9.27858349508285e-06, "loss": 0.2091, "step": 2401 }, { "epoch": 7.575924468922109, "grad_norm": 0.2274883491248114, "learning_rate": 9.265575705654322e-06, "loss": 0.2109, "step": 2402 }, { "epoch": 7.579071597167585, "grad_norm": 0.23063099015253677, "learning_rate": 9.252581216827778e-06, "loss": 0.2007, "step": 2403 }, { "epoch": 7.58221872541306, "grad_norm": 0.2218214636743055, "learning_rate": 9.23960004217543e-06, "loss": 0.2054, "step": 2404 }, { "epoch": 7.585365853658536, "grad_norm": 0.23479587301596305, "learning_rate": 9.226632195255612e-06, "loss": 0.2109, "step": 2405 }, { "epoch": 7.588512981904013, "grad_norm": 0.22498307321621322, "learning_rate": 9.213677689612714e-06, "loss": 0.2105, "step": 2406 }, { "epoch": 7.591660110149489, "grad_norm": 0.23788611222524667, "learning_rate": 9.200736538777214e-06, "loss": 0.2082, "step": 2407 }, { "epoch": 7.5948072383949645, "grad_norm": 0.21636929208162226, "learning_rate": 9.18780875626563e-06, "loss": 0.2097, "step": 2408 }, { "epoch": 7.59795436664044, "grad_norm": 0.22912436986509432, "learning_rate": 9.174894355580514e-06, "loss": 0.208, "step": 2409 }, { "epoch": 7.601101494885917, "grad_norm": 0.22835007748766775, "learning_rate": 9.161993350210457e-06, "loss": 0.2086, "step": 2410 }, { "epoch": 7.604248623131393, "grad_norm": 0.22707280711620756, "learning_rate": 9.149105753630033e-06, "loss": 0.2137, "step": 2411 }, { "epoch": 7.607395751376869, "grad_norm": 0.23071685677491993, "learning_rate": 9.136231579299843e-06, "loss": 0.2116, "step": 2412 }, { "epoch": 7.610542879622344, "grad_norm": 0.21489892065441007, "learning_rate": 9.123370840666437e-06, "loss": 0.2108, "step": 2413 }, { "epoch": 7.61369000786782, "grad_norm": 0.22666390058768177, "learning_rate": 9.110523551162355e-06, "loss": 0.2129, "step": 2414 }, { "epoch": 7.616837136113297, "grad_norm": 0.22108033103329094, "learning_rate": 9.097689724206085e-06, "loss": 0.2147, "step": 2415 }, { "epoch": 7.619984264358773, "grad_norm": 0.22927754219244445, "learning_rate": 9.084869373202036e-06, "loss": 0.2122, "step": 2416 }, { "epoch": 7.6231313926042485, "grad_norm": 0.21679471833735905, "learning_rate": 9.072062511540583e-06, "loss": 0.2118, "step": 2417 }, { "epoch": 7.626278520849724, "grad_norm": 0.22320372473610817, "learning_rate": 9.059269152597964e-06, "loss": 0.2146, "step": 2418 }, { "epoch": 7.6294256490952, "grad_norm": 0.22512553465766197, "learning_rate": 9.046489309736348e-06, "loss": 0.212, "step": 2419 }, { "epoch": 7.632572777340677, "grad_norm": 0.2183847269049812, "learning_rate": 9.033722996303768e-06, "loss": 0.2158, "step": 2420 }, { "epoch": 7.635719905586153, "grad_norm": 0.22036959109568996, "learning_rate": 9.020970225634136e-06, "loss": 0.2164, "step": 2421 }, { "epoch": 7.6388670338316285, "grad_norm": 0.22687740918307078, "learning_rate": 9.008231011047213e-06, "loss": 0.2146, "step": 2422 }, { "epoch": 7.642014162077104, "grad_norm": 0.22039894721380168, "learning_rate": 8.995505365848605e-06, "loss": 0.2133, "step": 2423 }, { "epoch": 7.645161290322581, "grad_norm": 0.22178747844096786, "learning_rate": 8.982793303329751e-06, "loss": 0.218, "step": 2424 }, { "epoch": 7.648308418568057, "grad_norm": 0.21692147792936287, "learning_rate": 8.970094836767888e-06, "loss": 0.222, "step": 2425 }, { "epoch": 7.651455546813533, "grad_norm": 0.2138257230931039, "learning_rate": 8.957409979426072e-06, "loss": 0.2089, "step": 2426 }, { "epoch": 7.654602675059008, "grad_norm": 0.2265413919770675, "learning_rate": 8.944738744553121e-06, "loss": 0.2172, "step": 2427 }, { "epoch": 7.657749803304485, "grad_norm": 0.22297257163502948, "learning_rate": 8.93208114538365e-06, "loss": 0.2121, "step": 2428 }, { "epoch": 7.660896931549961, "grad_norm": 0.22148672698728822, "learning_rate": 8.91943719513801e-06, "loss": 0.2088, "step": 2429 }, { "epoch": 7.664044059795437, "grad_norm": 0.21736370577760047, "learning_rate": 8.906806907022311e-06, "loss": 0.2153, "step": 2430 }, { "epoch": 7.6671911880409125, "grad_norm": 0.22466767010680358, "learning_rate": 8.894190294228391e-06, "loss": 0.21, "step": 2431 }, { "epoch": 7.670338316286388, "grad_norm": 0.22112902451894795, "learning_rate": 8.881587369933799e-06, "loss": 0.2175, "step": 2432 }, { "epoch": 7.673485444531865, "grad_norm": 0.2214421346522819, "learning_rate": 8.8689981473018e-06, "loss": 0.214, "step": 2433 }, { "epoch": 7.676632572777341, "grad_norm": 0.21631613538316005, "learning_rate": 8.856422639481324e-06, "loss": 0.2084, "step": 2434 }, { "epoch": 7.679779701022817, "grad_norm": 0.22609337537385527, "learning_rate": 8.843860859607001e-06, "loss": 0.2147, "step": 2435 }, { "epoch": 7.682926829268292, "grad_norm": 0.22739242141285246, "learning_rate": 8.831312820799108e-06, "loss": 0.2177, "step": 2436 }, { "epoch": 7.686073957513768, "grad_norm": 0.22087226671865368, "learning_rate": 8.81877853616358e-06, "loss": 0.215, "step": 2437 }, { "epoch": 7.689221085759245, "grad_norm": 0.226834895900349, "learning_rate": 8.80625801879197e-06, "loss": 0.212, "step": 2438 }, { "epoch": 7.692368214004721, "grad_norm": 0.23470020627965657, "learning_rate": 8.793751281761473e-06, "loss": 0.215, "step": 2439 }, { "epoch": 7.6955153422501965, "grad_norm": 0.21858002999304377, "learning_rate": 8.781258338134882e-06, "loss": 0.2195, "step": 2440 }, { "epoch": 7.698662470495672, "grad_norm": 0.22766506836176625, "learning_rate": 8.768779200960573e-06, "loss": 0.2141, "step": 2441 }, { "epoch": 7.701809598741149, "grad_norm": 0.2416124281891341, "learning_rate": 8.756313883272518e-06, "loss": 0.206, "step": 2442 }, { "epoch": 7.704956726986625, "grad_norm": 0.22684959206739022, "learning_rate": 8.74386239809024e-06, "loss": 0.217, "step": 2443 }, { "epoch": 7.708103855232101, "grad_norm": 0.22333064815479248, "learning_rate": 8.731424758418837e-06, "loss": 0.2238, "step": 2444 }, { "epoch": 7.711250983477576, "grad_norm": 0.23426086091700454, "learning_rate": 8.719000977248909e-06, "loss": 0.2159, "step": 2445 }, { "epoch": 7.714398111723053, "grad_norm": 0.23296607438783742, "learning_rate": 8.706591067556625e-06, "loss": 0.2149, "step": 2446 }, { "epoch": 7.717545239968529, "grad_norm": 0.21675819624949147, "learning_rate": 8.694195042303631e-06, "loss": 0.2143, "step": 2447 }, { "epoch": 7.720692368214005, "grad_norm": 0.21790836958624593, "learning_rate": 8.681812914437088e-06, "loss": 0.2163, "step": 2448 }, { "epoch": 7.7238394964594805, "grad_norm": 0.2274002177866238, "learning_rate": 8.669444696889645e-06, "loss": 0.2132, "step": 2449 }, { "epoch": 7.726986624704956, "grad_norm": 0.22632872665252599, "learning_rate": 8.657090402579406e-06, "loss": 0.2117, "step": 2450 }, { "epoch": 7.730133752950433, "grad_norm": 0.22423037534942736, "learning_rate": 8.64475004440995e-06, "loss": 0.2147, "step": 2451 }, { "epoch": 7.733280881195909, "grad_norm": 0.21721365887439806, "learning_rate": 8.632423635270284e-06, "loss": 0.213, "step": 2452 }, { "epoch": 7.736428009441385, "grad_norm": 0.22467979551975023, "learning_rate": 8.620111188034862e-06, "loss": 0.2131, "step": 2453 }, { "epoch": 7.7395751376868605, "grad_norm": 0.21890044167587794, "learning_rate": 8.60781271556354e-06, "loss": 0.2233, "step": 2454 }, { "epoch": 7.742722265932336, "grad_norm": 0.23103144172132506, "learning_rate": 8.595528230701591e-06, "loss": 0.2125, "step": 2455 }, { "epoch": 7.745869394177813, "grad_norm": 0.21442112221294987, "learning_rate": 8.583257746279678e-06, "loss": 0.2132, "step": 2456 }, { "epoch": 7.749016522423289, "grad_norm": 0.23258158194634532, "learning_rate": 8.571001275113825e-06, "loss": 0.2121, "step": 2457 }, { "epoch": 7.752163650668765, "grad_norm": 0.218808737474262, "learning_rate": 8.55875883000544e-06, "loss": 0.2099, "step": 2458 }, { "epoch": 7.755310778914241, "grad_norm": 0.22662779438337177, "learning_rate": 8.546530423741258e-06, "loss": 0.2139, "step": 2459 }, { "epoch": 7.758457907159717, "grad_norm": 0.22023777628464106, "learning_rate": 8.534316069093385e-06, "loss": 0.2198, "step": 2460 }, { "epoch": 7.761605035405193, "grad_norm": 0.22730348731809383, "learning_rate": 8.52211577881922e-06, "loss": 0.2203, "step": 2461 }, { "epoch": 7.764752163650669, "grad_norm": 0.224778305802601, "learning_rate": 8.509929565661486e-06, "loss": 0.2144, "step": 2462 }, { "epoch": 7.7678992918961445, "grad_norm": 0.21387975638987688, "learning_rate": 8.497757442348194e-06, "loss": 0.2193, "step": 2463 }, { "epoch": 7.771046420141621, "grad_norm": 0.22076113317664492, "learning_rate": 8.485599421592648e-06, "loss": 0.2212, "step": 2464 }, { "epoch": 7.774193548387097, "grad_norm": 0.2191624701650899, "learning_rate": 8.473455516093427e-06, "loss": 0.2194, "step": 2465 }, { "epoch": 7.777340676632573, "grad_norm": 0.22080484859112698, "learning_rate": 8.461325738534349e-06, "loss": 0.2166, "step": 2466 }, { "epoch": 7.780487804878049, "grad_norm": 0.2286105390431359, "learning_rate": 8.449210101584495e-06, "loss": 0.2101, "step": 2467 }, { "epoch": 7.783634933123524, "grad_norm": 0.22181973516008022, "learning_rate": 8.43710861789816e-06, "loss": 0.2111, "step": 2468 }, { "epoch": 7.786782061369001, "grad_norm": 0.22834807098550453, "learning_rate": 8.42502130011487e-06, "loss": 0.2203, "step": 2469 }, { "epoch": 7.789929189614477, "grad_norm": 0.2155756950754227, "learning_rate": 8.412948160859346e-06, "loss": 0.2078, "step": 2470 }, { "epoch": 7.793076317859953, "grad_norm": 0.22460211874784158, "learning_rate": 8.400889212741506e-06, "loss": 0.2138, "step": 2471 }, { "epoch": 7.7962234461054285, "grad_norm": 0.23266223147738663, "learning_rate": 8.388844468356447e-06, "loss": 0.2082, "step": 2472 }, { "epoch": 7.799370574350904, "grad_norm": 0.22066055467773144, "learning_rate": 8.37681394028442e-06, "loss": 0.2167, "step": 2473 }, { "epoch": 7.802517702596381, "grad_norm": 0.22436154174278092, "learning_rate": 8.364797641090839e-06, "loss": 0.2219, "step": 2474 }, { "epoch": 7.805664830841857, "grad_norm": 0.22160888608683596, "learning_rate": 8.352795583326255e-06, "loss": 0.2205, "step": 2475 }, { "epoch": 7.808811959087333, "grad_norm": 0.2226551886278477, "learning_rate": 8.340807779526345e-06, "loss": 0.2176, "step": 2476 }, { "epoch": 7.811959087332809, "grad_norm": 0.21580763083372614, "learning_rate": 8.328834242211887e-06, "loss": 0.2163, "step": 2477 }, { "epoch": 7.815106215578285, "grad_norm": 0.22040811507283187, "learning_rate": 8.316874983888774e-06, "loss": 0.2107, "step": 2478 }, { "epoch": 7.818253343823761, "grad_norm": 0.2291989714865517, "learning_rate": 8.304930017047969e-06, "loss": 0.2032, "step": 2479 }, { "epoch": 7.821400472069237, "grad_norm": 0.22911848156820763, "learning_rate": 8.292999354165525e-06, "loss": 0.2082, "step": 2480 }, { "epoch": 7.8245476003147125, "grad_norm": 0.23926926990020336, "learning_rate": 8.281083007702546e-06, "loss": 0.2095, "step": 2481 }, { "epoch": 7.827694728560189, "grad_norm": 0.22512534340178053, "learning_rate": 8.26918099010518e-06, "loss": 0.2173, "step": 2482 }, { "epoch": 7.830841856805665, "grad_norm": 0.2167774809962824, "learning_rate": 8.25729331380462e-06, "loss": 0.2151, "step": 2483 }, { "epoch": 7.833988985051141, "grad_norm": 0.2129622004627707, "learning_rate": 8.245419991217063e-06, "loss": 0.2175, "step": 2484 }, { "epoch": 7.837136113296617, "grad_norm": 0.2234270903188792, "learning_rate": 8.233561034743737e-06, "loss": 0.2117, "step": 2485 }, { "epoch": 7.840283241542092, "grad_norm": 0.22836568327537848, "learning_rate": 8.221716456770838e-06, "loss": 0.2136, "step": 2486 }, { "epoch": 7.843430369787569, "grad_norm": 0.21719647596939415, "learning_rate": 8.209886269669569e-06, "loss": 0.216, "step": 2487 }, { "epoch": 7.846577498033045, "grad_norm": 0.2266420312585156, "learning_rate": 8.198070485796087e-06, "loss": 0.2156, "step": 2488 }, { "epoch": 7.849724626278521, "grad_norm": 0.2231981024827487, "learning_rate": 8.186269117491515e-06, "loss": 0.2078, "step": 2489 }, { "epoch": 7.8528717545239966, "grad_norm": 0.2216098578898215, "learning_rate": 8.174482177081914e-06, "loss": 0.2098, "step": 2490 }, { "epoch": 7.856018882769473, "grad_norm": 0.22092053778425194, "learning_rate": 8.162709676878274e-06, "loss": 0.2149, "step": 2491 }, { "epoch": 7.859166011014949, "grad_norm": 0.2181787736211054, "learning_rate": 8.15095162917651e-06, "loss": 0.2147, "step": 2492 }, { "epoch": 7.862313139260425, "grad_norm": 0.21707933494315532, "learning_rate": 8.13920804625743e-06, "loss": 0.2144, "step": 2493 }, { "epoch": 7.865460267505901, "grad_norm": 0.21885761883840674, "learning_rate": 8.12747894038675e-06, "loss": 0.2176, "step": 2494 }, { "epoch": 7.868607395751377, "grad_norm": 0.21519099154642782, "learning_rate": 8.115764323815047e-06, "loss": 0.2092, "step": 2495 }, { "epoch": 7.871754523996853, "grad_norm": 0.2218096537191133, "learning_rate": 8.10406420877778e-06, "loss": 0.2142, "step": 2496 }, { "epoch": 7.874901652242329, "grad_norm": 0.2160268035618521, "learning_rate": 8.092378607495259e-06, "loss": 0.2128, "step": 2497 }, { "epoch": 7.878048780487805, "grad_norm": 0.21717472587214887, "learning_rate": 8.080707532172621e-06, "loss": 0.2089, "step": 2498 }, { "epoch": 7.881195908733281, "grad_norm": 0.22978681303101012, "learning_rate": 8.069050994999859e-06, "loss": 0.2159, "step": 2499 }, { "epoch": 7.884343036978757, "grad_norm": 0.2145432190672201, "learning_rate": 8.057409008151747e-06, "loss": 0.2191, "step": 2500 }, { "epoch": 7.887490165224233, "grad_norm": 0.21225002883741487, "learning_rate": 8.04578158378789e-06, "loss": 0.213, "step": 2501 }, { "epoch": 7.890637293469709, "grad_norm": 0.2238498839813341, "learning_rate": 8.034168734052665e-06, "loss": 0.2166, "step": 2502 }, { "epoch": 7.893784421715185, "grad_norm": 0.20967970929834082, "learning_rate": 8.022570471075239e-06, "loss": 0.221, "step": 2503 }, { "epoch": 7.8969315499606605, "grad_norm": 0.21708326506558323, "learning_rate": 8.010986806969536e-06, "loss": 0.2168, "step": 2504 }, { "epoch": 7.900078678206137, "grad_norm": 0.2194455638945262, "learning_rate": 7.999417753834237e-06, "loss": 0.2159, "step": 2505 }, { "epoch": 7.903225806451613, "grad_norm": 0.22999330162628176, "learning_rate": 7.987863323752768e-06, "loss": 0.2152, "step": 2506 }, { "epoch": 7.906372934697089, "grad_norm": 0.23123822749524514, "learning_rate": 7.976323528793253e-06, "loss": 0.2114, "step": 2507 }, { "epoch": 7.909520062942565, "grad_norm": 0.20876015592600447, "learning_rate": 7.964798381008572e-06, "loss": 0.2187, "step": 2508 }, { "epoch": 7.912667191188041, "grad_norm": 0.23282553158404024, "learning_rate": 7.95328789243627e-06, "loss": 0.2141, "step": 2509 }, { "epoch": 7.915814319433517, "grad_norm": 0.21665198107506672, "learning_rate": 7.941792075098607e-06, "loss": 0.22, "step": 2510 }, { "epoch": 7.918961447678993, "grad_norm": 0.21353778052651864, "learning_rate": 7.930310941002498e-06, "loss": 0.2139, "step": 2511 }, { "epoch": 7.922108575924469, "grad_norm": 0.21878104539924628, "learning_rate": 7.918844502139542e-06, "loss": 0.2178, "step": 2512 }, { "epoch": 7.925255704169945, "grad_norm": 0.22382902656974832, "learning_rate": 7.907392770485981e-06, "loss": 0.2182, "step": 2513 }, { "epoch": 7.928402832415421, "grad_norm": 0.21896656616114246, "learning_rate": 7.895955758002692e-06, "loss": 0.2046, "step": 2514 }, { "epoch": 7.931549960660897, "grad_norm": 0.23683519616131976, "learning_rate": 7.884533476635183e-06, "loss": 0.2152, "step": 2515 }, { "epoch": 7.934697088906373, "grad_norm": 0.2166277071545748, "learning_rate": 7.873125938313572e-06, "loss": 0.2107, "step": 2516 }, { "epoch": 7.937844217151849, "grad_norm": 0.21951761687518714, "learning_rate": 7.86173315495258e-06, "loss": 0.2118, "step": 2517 }, { "epoch": 7.940991345397325, "grad_norm": 0.2309665891777021, "learning_rate": 7.850355138451522e-06, "loss": 0.2136, "step": 2518 }, { "epoch": 7.944138473642801, "grad_norm": 0.22022379458796373, "learning_rate": 7.83899190069429e-06, "loss": 0.2173, "step": 2519 }, { "epoch": 7.947285601888277, "grad_norm": 0.2174782268352602, "learning_rate": 7.827643453549325e-06, "loss": 0.2192, "step": 2520 }, { "epoch": 7.950432730133753, "grad_norm": 0.22169879590437622, "learning_rate": 7.816309808869637e-06, "loss": 0.2184, "step": 2521 }, { "epoch": 7.9535798583792285, "grad_norm": 0.22065198802434965, "learning_rate": 7.804990978492774e-06, "loss": 0.2114, "step": 2522 }, { "epoch": 7.956726986624705, "grad_norm": 0.2165885857788757, "learning_rate": 7.793686974240795e-06, "loss": 0.2132, "step": 2523 }, { "epoch": 7.959874114870181, "grad_norm": 0.22336276390335888, "learning_rate": 7.782397807920297e-06, "loss": 0.2137, "step": 2524 }, { "epoch": 7.963021243115657, "grad_norm": 0.21596192334489273, "learning_rate": 7.771123491322353e-06, "loss": 0.2162, "step": 2525 }, { "epoch": 7.966168371361133, "grad_norm": 0.2122590682034009, "learning_rate": 7.759864036222556e-06, "loss": 0.2154, "step": 2526 }, { "epoch": 7.969315499606609, "grad_norm": 0.2247749539500636, "learning_rate": 7.748619454380947e-06, "loss": 0.2143, "step": 2527 }, { "epoch": 7.972462627852085, "grad_norm": 0.22164707239632309, "learning_rate": 7.737389757542051e-06, "loss": 0.22, "step": 2528 }, { "epoch": 7.975609756097561, "grad_norm": 0.21942953911808707, "learning_rate": 7.72617495743485e-06, "loss": 0.2142, "step": 2529 }, { "epoch": 7.978756884343037, "grad_norm": 0.2169333255982246, "learning_rate": 7.714975065772747e-06, "loss": 0.2167, "step": 2530 }, { "epoch": 7.9819040125885135, "grad_norm": 0.226087767263674, "learning_rate": 7.70379009425359e-06, "loss": 0.219, "step": 2531 }, { "epoch": 7.985051140833989, "grad_norm": 0.22119104996323627, "learning_rate": 7.692620054559641e-06, "loss": 0.2148, "step": 2532 }, { "epoch": 7.988198269079465, "grad_norm": 0.22663737097763387, "learning_rate": 7.681464958357565e-06, "loss": 0.2134, "step": 2533 }, { "epoch": 7.991345397324941, "grad_norm": 0.22890196789986905, "learning_rate": 7.670324817298414e-06, "loss": 0.2118, "step": 2534 }, { "epoch": 7.994492525570417, "grad_norm": 0.21331469668678782, "learning_rate": 7.659199643017628e-06, "loss": 0.2174, "step": 2535 }, { "epoch": 7.997639653815893, "grad_norm": 0.2216721447509053, "learning_rate": 7.648089447135005e-06, "loss": 0.2133, "step": 2536 }, { "epoch": 8.003147128245477, "grad_norm": 0.4349977764263755, "learning_rate": 7.63699424125471e-06, "loss": 0.3762, "step": 2537 }, { "epoch": 8.006294256490952, "grad_norm": 0.26742441223502816, "learning_rate": 7.62591403696525e-06, "loss": 0.1644, "step": 2538 }, { "epoch": 8.009441384736428, "grad_norm": 0.388106689437148, "learning_rate": 7.614848845839449e-06, "loss": 0.1651, "step": 2539 }, { "epoch": 8.012588512981903, "grad_norm": 0.5323255528110843, "learning_rate": 7.603798679434472e-06, "loss": 0.1682, "step": 2540 }, { "epoch": 8.01573564122738, "grad_norm": 0.2923949539924144, "learning_rate": 7.592763549291768e-06, "loss": 0.1656, "step": 2541 }, { "epoch": 8.018882769472857, "grad_norm": 0.38423574858459186, "learning_rate": 7.58174346693711e-06, "loss": 0.1721, "step": 2542 }, { "epoch": 8.022029897718332, "grad_norm": 0.3833329737299957, "learning_rate": 7.570738443880521e-06, "loss": 0.1648, "step": 2543 }, { "epoch": 8.025177025963808, "grad_norm": 0.30621817890570635, "learning_rate": 7.559748491616319e-06, "loss": 0.1675, "step": 2544 }, { "epoch": 8.028324154209285, "grad_norm": 0.3082779392109665, "learning_rate": 7.54877362162308e-06, "loss": 0.1712, "step": 2545 }, { "epoch": 8.03147128245476, "grad_norm": 0.3706427459165308, "learning_rate": 7.537813845363604e-06, "loss": 0.1665, "step": 2546 }, { "epoch": 8.034618410700237, "grad_norm": 0.32385321291038643, "learning_rate": 7.5268691742849665e-06, "loss": 0.1657, "step": 2547 }, { "epoch": 8.037765538945711, "grad_norm": 0.25807590952385684, "learning_rate": 7.5159396198184246e-06, "loss": 0.1641, "step": 2548 }, { "epoch": 8.040912667191188, "grad_norm": 0.2867986489695364, "learning_rate": 7.505025193379478e-06, "loss": 0.1646, "step": 2549 }, { "epoch": 8.044059795436665, "grad_norm": 0.29169617877555604, "learning_rate": 7.494125906367801e-06, "loss": 0.1613, "step": 2550 }, { "epoch": 8.04720692368214, "grad_norm": 0.2527672771226682, "learning_rate": 7.48324177016728e-06, "loss": 0.1572, "step": 2551 }, { "epoch": 8.050354051927616, "grad_norm": 0.2836106342803346, "learning_rate": 7.47237279614595e-06, "loss": 0.1687, "step": 2552 }, { "epoch": 8.053501180173091, "grad_norm": 0.2975001770525274, "learning_rate": 7.461518995656034e-06, "loss": 0.164, "step": 2553 }, { "epoch": 8.056648308418568, "grad_norm": 0.2735520742422293, "learning_rate": 7.450680380033897e-06, "loss": 0.1683, "step": 2554 }, { "epoch": 8.059795436664045, "grad_norm": 0.2449222310949691, "learning_rate": 7.439856960600038e-06, "loss": 0.1634, "step": 2555 }, { "epoch": 8.06294256490952, "grad_norm": 0.26957950448412327, "learning_rate": 7.429048748659098e-06, "loss": 0.164, "step": 2556 }, { "epoch": 8.066089693154996, "grad_norm": 0.2468154213899654, "learning_rate": 7.418255755499817e-06, "loss": 0.1635, "step": 2557 }, { "epoch": 8.069236821400471, "grad_norm": 0.24374149361084566, "learning_rate": 7.407477992395058e-06, "loss": 0.1653, "step": 2558 }, { "epoch": 8.072383949645948, "grad_norm": 0.2639345175746216, "learning_rate": 7.396715470601759e-06, "loss": 0.1654, "step": 2559 }, { "epoch": 8.075531077891425, "grad_norm": 0.2592056292755547, "learning_rate": 7.385968201360953e-06, "loss": 0.17, "step": 2560 }, { "epoch": 8.0786782061369, "grad_norm": 0.24098620754378253, "learning_rate": 7.375236195897737e-06, "loss": 0.1598, "step": 2561 }, { "epoch": 8.081825334382376, "grad_norm": 0.2392626040847066, "learning_rate": 7.364519465421265e-06, "loss": 0.1664, "step": 2562 }, { "epoch": 8.084972462627853, "grad_norm": 0.23598245288642505, "learning_rate": 7.353818021124745e-06, "loss": 0.1676, "step": 2563 }, { "epoch": 8.088119590873328, "grad_norm": 0.2434889824957857, "learning_rate": 7.343131874185396e-06, "loss": 0.1528, "step": 2564 }, { "epoch": 8.091266719118805, "grad_norm": 0.23185343146126583, "learning_rate": 7.332461035764492e-06, "loss": 0.162, "step": 2565 }, { "epoch": 8.09441384736428, "grad_norm": 0.24759550291938406, "learning_rate": 7.32180551700729e-06, "loss": 0.1643, "step": 2566 }, { "epoch": 8.097560975609756, "grad_norm": 0.24247858902191025, "learning_rate": 7.311165329043064e-06, "loss": 0.1676, "step": 2567 }, { "epoch": 8.100708103855233, "grad_norm": 0.23072668662855816, "learning_rate": 7.300540482985061e-06, "loss": 0.1659, "step": 2568 }, { "epoch": 8.103855232100708, "grad_norm": 0.23644287711658252, "learning_rate": 7.289930989930518e-06, "loss": 0.1628, "step": 2569 }, { "epoch": 8.107002360346184, "grad_norm": 0.22627740006753835, "learning_rate": 7.279336860960633e-06, "loss": 0.1606, "step": 2570 }, { "epoch": 8.11014948859166, "grad_norm": 0.2291150949130993, "learning_rate": 7.26875810714055e-06, "loss": 0.1654, "step": 2571 }, { "epoch": 8.113296616837136, "grad_norm": 0.28213249790664724, "learning_rate": 7.25819473951936e-06, "loss": 0.1754, "step": 2572 }, { "epoch": 8.116443745082613, "grad_norm": 0.22998099293791194, "learning_rate": 7.247646769130079e-06, "loss": 0.1657, "step": 2573 }, { "epoch": 8.119590873328088, "grad_norm": 0.228994767720687, "learning_rate": 7.237114206989646e-06, "loss": 0.1612, "step": 2574 }, { "epoch": 8.122738001573564, "grad_norm": 0.22828083602687194, "learning_rate": 7.226597064098905e-06, "loss": 0.1579, "step": 2575 }, { "epoch": 8.12588512981904, "grad_norm": 0.2296606678197253, "learning_rate": 7.216095351442604e-06, "loss": 0.164, "step": 2576 }, { "epoch": 8.129032258064516, "grad_norm": 0.22801101601758597, "learning_rate": 7.205609079989353e-06, "loss": 0.1659, "step": 2577 }, { "epoch": 8.132179386309993, "grad_norm": 0.2375397466097386, "learning_rate": 7.195138260691652e-06, "loss": 0.1615, "step": 2578 }, { "epoch": 8.135326514555468, "grad_norm": 0.24227397129094566, "learning_rate": 7.184682904485862e-06, "loss": 0.1659, "step": 2579 }, { "epoch": 8.138473642800944, "grad_norm": 0.23277206582166304, "learning_rate": 7.1742430222921834e-06, "loss": 0.1593, "step": 2580 }, { "epoch": 8.141620771046421, "grad_norm": 0.22865926008781765, "learning_rate": 7.163818625014662e-06, "loss": 0.1615, "step": 2581 }, { "epoch": 8.144767899291896, "grad_norm": 0.2294968243718884, "learning_rate": 7.1534097235411674e-06, "loss": 0.1643, "step": 2582 }, { "epoch": 8.147915027537373, "grad_norm": 0.22695319257199334, "learning_rate": 7.143016328743384e-06, "loss": 0.1676, "step": 2583 }, { "epoch": 8.151062155782848, "grad_norm": 0.2305261334568713, "learning_rate": 7.132638451476801e-06, "loss": 0.1716, "step": 2584 }, { "epoch": 8.154209284028324, "grad_norm": 0.22658443236517037, "learning_rate": 7.122276102580698e-06, "loss": 0.1693, "step": 2585 }, { "epoch": 8.157356412273801, "grad_norm": 0.23678326060091193, "learning_rate": 7.111929292878147e-06, "loss": 0.1659, "step": 2586 }, { "epoch": 8.160503540519276, "grad_norm": 0.23144801338294188, "learning_rate": 7.101598033175973e-06, "loss": 0.1667, "step": 2587 }, { "epoch": 8.163650668764753, "grad_norm": 0.2255681221831429, "learning_rate": 7.091282334264773e-06, "loss": 0.1684, "step": 2588 }, { "epoch": 8.166797797010227, "grad_norm": 0.23550516194771806, "learning_rate": 7.080982206918873e-06, "loss": 0.1624, "step": 2589 }, { "epoch": 8.169944925255704, "grad_norm": 0.23560675125042624, "learning_rate": 7.070697661896368e-06, "loss": 0.1597, "step": 2590 }, { "epoch": 8.17309205350118, "grad_norm": 0.231816404825124, "learning_rate": 7.060428709939047e-06, "loss": 0.1648, "step": 2591 }, { "epoch": 8.176239181746656, "grad_norm": 0.23355018538739725, "learning_rate": 7.050175361772427e-06, "loss": 0.1626, "step": 2592 }, { "epoch": 8.179386309992132, "grad_norm": 0.24417933186787055, "learning_rate": 7.039937628105717e-06, "loss": 0.1651, "step": 2593 }, { "epoch": 8.182533438237607, "grad_norm": 0.22574700897136932, "learning_rate": 7.029715519631832e-06, "loss": 0.1671, "step": 2594 }, { "epoch": 8.185680566483084, "grad_norm": 0.2304324967468494, "learning_rate": 7.019509047027362e-06, "loss": 0.1672, "step": 2595 }, { "epoch": 8.18882769472856, "grad_norm": 0.2287503066121252, "learning_rate": 7.0093182209525525e-06, "loss": 0.1627, "step": 2596 }, { "epoch": 8.191974822974036, "grad_norm": 0.2242905190122735, "learning_rate": 6.9991430520513306e-06, "loss": 0.1577, "step": 2597 }, { "epoch": 8.195121951219512, "grad_norm": 0.2475609011866222, "learning_rate": 6.988983550951245e-06, "loss": 0.1644, "step": 2598 }, { "epoch": 8.198269079464989, "grad_norm": 0.23890330471937485, "learning_rate": 6.9788397282635044e-06, "loss": 0.1644, "step": 2599 }, { "epoch": 8.201416207710464, "grad_norm": 0.2386410693285585, "learning_rate": 6.968711594582919e-06, "loss": 0.164, "step": 2600 }, { "epoch": 8.20456333595594, "grad_norm": 0.2389681606873896, "learning_rate": 6.958599160487927e-06, "loss": 0.1623, "step": 2601 }, { "epoch": 8.207710464201416, "grad_norm": 0.22261291520501994, "learning_rate": 6.948502436540572e-06, "loss": 0.159, "step": 2602 }, { "epoch": 8.210857592446892, "grad_norm": 0.22847054610493028, "learning_rate": 6.93842143328647e-06, "loss": 0.1602, "step": 2603 }, { "epoch": 8.214004720692369, "grad_norm": 0.233317294983424, "learning_rate": 6.928356161254845e-06, "loss": 0.162, "step": 2604 }, { "epoch": 8.217151848937844, "grad_norm": 0.23035769556977229, "learning_rate": 6.91830663095846e-06, "loss": 0.1653, "step": 2605 }, { "epoch": 8.22029897718332, "grad_norm": 0.2348863615456292, "learning_rate": 6.908272852893666e-06, "loss": 0.1708, "step": 2606 }, { "epoch": 8.223446105428796, "grad_norm": 0.22854836342550924, "learning_rate": 6.898254837540333e-06, "loss": 0.169, "step": 2607 }, { "epoch": 8.226593233674272, "grad_norm": 0.22732100111251707, "learning_rate": 6.888252595361895e-06, "loss": 0.164, "step": 2608 }, { "epoch": 8.229740361919749, "grad_norm": 0.2211306149730069, "learning_rate": 6.878266136805284e-06, "loss": 0.1649, "step": 2609 }, { "epoch": 8.232887490165224, "grad_norm": 0.23144901547851038, "learning_rate": 6.86829547230097e-06, "loss": 0.1672, "step": 2610 }, { "epoch": 8.2360346184107, "grad_norm": 0.23413830665280178, "learning_rate": 6.858340612262916e-06, "loss": 0.1644, "step": 2611 }, { "epoch": 8.239181746656175, "grad_norm": 0.2245328322035716, "learning_rate": 6.848401567088575e-06, "loss": 0.1623, "step": 2612 }, { "epoch": 8.242328874901652, "grad_norm": 0.2351914752245803, "learning_rate": 6.838478347158893e-06, "loss": 0.1568, "step": 2613 }, { "epoch": 8.245476003147129, "grad_norm": 0.22741744869087863, "learning_rate": 6.828570962838271e-06, "loss": 0.1647, "step": 2614 }, { "epoch": 8.248623131392604, "grad_norm": 0.234581482295964, "learning_rate": 6.81867942447459e-06, "loss": 0.1625, "step": 2615 }, { "epoch": 8.25177025963808, "grad_norm": 0.23787978249548633, "learning_rate": 6.808803742399162e-06, "loss": 0.1643, "step": 2616 }, { "epoch": 8.254917387883557, "grad_norm": 0.22558874923524821, "learning_rate": 6.798943926926748e-06, "loss": 0.1655, "step": 2617 }, { "epoch": 8.258064516129032, "grad_norm": 0.23066847653534014, "learning_rate": 6.7890999883555365e-06, "loss": 0.1598, "step": 2618 }, { "epoch": 8.261211644374509, "grad_norm": 0.24121585112670477, "learning_rate": 6.779271936967129e-06, "loss": 0.1671, "step": 2619 }, { "epoch": 8.264358772619984, "grad_norm": 0.23631017657037634, "learning_rate": 6.769459783026544e-06, "loss": 0.1662, "step": 2620 }, { "epoch": 8.26750590086546, "grad_norm": 0.2391507536037999, "learning_rate": 6.759663536782177e-06, "loss": 0.1666, "step": 2621 }, { "epoch": 8.270653029110937, "grad_norm": 0.22647269595449845, "learning_rate": 6.74988320846583e-06, "loss": 0.1646, "step": 2622 }, { "epoch": 8.273800157356412, "grad_norm": 0.23536140779260983, "learning_rate": 6.740118808292657e-06, "loss": 0.174, "step": 2623 }, { "epoch": 8.276947285601889, "grad_norm": 0.2186614203495139, "learning_rate": 6.730370346461198e-06, "loss": 0.1717, "step": 2624 }, { "epoch": 8.280094413847364, "grad_norm": 0.2322550042532507, "learning_rate": 6.720637833153325e-06, "loss": 0.1659, "step": 2625 }, { "epoch": 8.28324154209284, "grad_norm": 0.23867083019820706, "learning_rate": 6.710921278534269e-06, "loss": 0.164, "step": 2626 }, { "epoch": 8.286388670338317, "grad_norm": 0.23732190600347555, "learning_rate": 6.7012206927525926e-06, "loss": 0.1683, "step": 2627 }, { "epoch": 8.289535798583792, "grad_norm": 0.23792842922753502, "learning_rate": 6.69153608594016e-06, "loss": 0.1552, "step": 2628 }, { "epoch": 8.292682926829269, "grad_norm": 0.2310163348962354, "learning_rate": 6.681867468212171e-06, "loss": 0.1669, "step": 2629 }, { "epoch": 8.295830055074743, "grad_norm": 0.22593964960603174, "learning_rate": 6.672214849667107e-06, "loss": 0.1649, "step": 2630 }, { "epoch": 8.29897718332022, "grad_norm": 0.23274451068025068, "learning_rate": 6.66257824038675e-06, "loss": 0.1644, "step": 2631 }, { "epoch": 8.302124311565697, "grad_norm": 0.2276653833702071, "learning_rate": 6.652957650436149e-06, "loss": 0.1631, "step": 2632 }, { "epoch": 8.305271439811172, "grad_norm": 0.22912711462824803, "learning_rate": 6.643353089863644e-06, "loss": 0.1673, "step": 2633 }, { "epoch": 8.308418568056648, "grad_norm": 0.2374453905619608, "learning_rate": 6.633764568700805e-06, "loss": 0.1633, "step": 2634 }, { "epoch": 8.311565696302125, "grad_norm": 0.23454962799092569, "learning_rate": 6.624192096962468e-06, "loss": 0.1578, "step": 2635 }, { "epoch": 8.3147128245476, "grad_norm": 0.22400278973493876, "learning_rate": 6.614635684646704e-06, "loss": 0.1665, "step": 2636 }, { "epoch": 8.317859952793077, "grad_norm": 0.2367047312346527, "learning_rate": 6.6050953417348e-06, "loss": 0.1659, "step": 2637 }, { "epoch": 8.321007081038552, "grad_norm": 0.2396724707707527, "learning_rate": 6.595571078191273e-06, "loss": 0.1618, "step": 2638 }, { "epoch": 8.324154209284028, "grad_norm": 0.23140731829793698, "learning_rate": 6.586062903963832e-06, "loss": 0.1653, "step": 2639 }, { "epoch": 8.327301337529505, "grad_norm": 0.2402337835141489, "learning_rate": 6.576570828983397e-06, "loss": 0.1685, "step": 2640 }, { "epoch": 8.33044846577498, "grad_norm": 0.24165802205944656, "learning_rate": 6.5670948631640575e-06, "loss": 0.1714, "step": 2641 }, { "epoch": 8.333595594020457, "grad_norm": 0.22826464282932363, "learning_rate": 6.557635016403086e-06, "loss": 0.1655, "step": 2642 }, { "epoch": 8.336742722265932, "grad_norm": 0.232516141417856, "learning_rate": 6.548191298580923e-06, "loss": 0.1644, "step": 2643 }, { "epoch": 8.339889850511408, "grad_norm": 0.22800084761954714, "learning_rate": 6.538763719561149e-06, "loss": 0.1725, "step": 2644 }, { "epoch": 8.343036978756885, "grad_norm": 0.23124756998898505, "learning_rate": 6.529352289190507e-06, "loss": 0.1669, "step": 2645 }, { "epoch": 8.34618410700236, "grad_norm": 0.23164406300800636, "learning_rate": 6.51995701729885e-06, "loss": 0.1606, "step": 2646 }, { "epoch": 8.349331235247837, "grad_norm": 0.23456181103996557, "learning_rate": 6.510577913699186e-06, "loss": 0.1626, "step": 2647 }, { "epoch": 8.352478363493312, "grad_norm": 0.22874955849477083, "learning_rate": 6.501214988187601e-06, "loss": 0.1624, "step": 2648 }, { "epoch": 8.355625491738788, "grad_norm": 0.23854686972912492, "learning_rate": 6.491868250543312e-06, "loss": 0.1642, "step": 2649 }, { "epoch": 8.358772619984265, "grad_norm": 0.2372645112138056, "learning_rate": 6.4825377105286044e-06, "loss": 0.1655, "step": 2650 }, { "epoch": 8.36191974822974, "grad_norm": 0.2319500615137312, "learning_rate": 6.473223377888865e-06, "loss": 0.1701, "step": 2651 }, { "epoch": 8.365066876475217, "grad_norm": 0.23635738733339692, "learning_rate": 6.463925262352549e-06, "loss": 0.1648, "step": 2652 }, { "epoch": 8.368214004720693, "grad_norm": 0.22916613347850073, "learning_rate": 6.454643373631161e-06, "loss": 0.167, "step": 2653 }, { "epoch": 8.371361132966168, "grad_norm": 0.2343271870579212, "learning_rate": 6.445377721419274e-06, "loss": 0.1687, "step": 2654 }, { "epoch": 8.374508261211645, "grad_norm": 0.23188613838557393, "learning_rate": 6.436128315394487e-06, "loss": 0.1626, "step": 2655 }, { "epoch": 8.37765538945712, "grad_norm": 0.23834493012382535, "learning_rate": 6.426895165217448e-06, "loss": 0.17, "step": 2656 }, { "epoch": 8.380802517702596, "grad_norm": 0.23537207044394082, "learning_rate": 6.417678280531808e-06, "loss": 0.1623, "step": 2657 }, { "epoch": 8.383949645948073, "grad_norm": 0.23842873789085556, "learning_rate": 6.408477670964244e-06, "loss": 0.1671, "step": 2658 }, { "epoch": 8.387096774193548, "grad_norm": 0.22315005761815868, "learning_rate": 6.399293346124427e-06, "loss": 0.1648, "step": 2659 }, { "epoch": 8.390243902439025, "grad_norm": 0.2369572676127876, "learning_rate": 6.390125315605016e-06, "loss": 0.1669, "step": 2660 }, { "epoch": 8.3933910306845, "grad_norm": 0.23656957128379635, "learning_rate": 6.380973588981662e-06, "loss": 0.1658, "step": 2661 }, { "epoch": 8.396538158929976, "grad_norm": 0.23549710286212458, "learning_rate": 6.371838175812977e-06, "loss": 0.165, "step": 2662 }, { "epoch": 8.399685287175453, "grad_norm": 0.23821077849443947, "learning_rate": 6.362719085640544e-06, "loss": 0.1644, "step": 2663 }, { "epoch": 8.402832415420928, "grad_norm": 0.23812845227381751, "learning_rate": 6.353616327988885e-06, "loss": 0.1695, "step": 2664 }, { "epoch": 8.405979543666405, "grad_norm": 0.23151702473551566, "learning_rate": 6.344529912365477e-06, "loss": 0.1664, "step": 2665 }, { "epoch": 8.40912667191188, "grad_norm": 0.23515427535640315, "learning_rate": 6.335459848260712e-06, "loss": 0.1628, "step": 2666 }, { "epoch": 8.412273800157356, "grad_norm": 0.24148276293516086, "learning_rate": 6.326406145147919e-06, "loss": 0.165, "step": 2667 }, { "epoch": 8.415420928402833, "grad_norm": 0.22961657881825784, "learning_rate": 6.3173688124833354e-06, "loss": 0.1566, "step": 2668 }, { "epoch": 8.418568056648308, "grad_norm": 0.23330366260196786, "learning_rate": 6.3083478597060895e-06, "loss": 0.1679, "step": 2669 }, { "epoch": 8.421715184893785, "grad_norm": 0.23139753589023687, "learning_rate": 6.299343296238215e-06, "loss": 0.1715, "step": 2670 }, { "epoch": 8.424862313139261, "grad_norm": 0.23233022652711008, "learning_rate": 6.290355131484619e-06, "loss": 0.1625, "step": 2671 }, { "epoch": 8.428009441384736, "grad_norm": 0.23192930432672745, "learning_rate": 6.281383374833088e-06, "loss": 0.1661, "step": 2672 }, { "epoch": 8.431156569630213, "grad_norm": 0.237960894230701, "learning_rate": 6.272428035654258e-06, "loss": 0.1664, "step": 2673 }, { "epoch": 8.434303697875688, "grad_norm": 0.23781605768359015, "learning_rate": 6.263489123301633e-06, "loss": 0.1682, "step": 2674 }, { "epoch": 8.437450826121164, "grad_norm": 0.23308152261123055, "learning_rate": 6.254566647111552e-06, "loss": 0.1684, "step": 2675 }, { "epoch": 8.440597954366641, "grad_norm": 0.24050735086700006, "learning_rate": 6.2456606164031865e-06, "loss": 0.1691, "step": 2676 }, { "epoch": 8.443745082612116, "grad_norm": 0.2332745071775824, "learning_rate": 6.23677104047854e-06, "loss": 0.1684, "step": 2677 }, { "epoch": 8.446892210857593, "grad_norm": 0.23439482464949807, "learning_rate": 6.22789792862241e-06, "loss": 0.1644, "step": 2678 }, { "epoch": 8.450039339103068, "grad_norm": 0.22619429799525462, "learning_rate": 6.219041290102423e-06, "loss": 0.1633, "step": 2679 }, { "epoch": 8.453186467348544, "grad_norm": 0.23111017539558812, "learning_rate": 6.210201134168976e-06, "loss": 0.1686, "step": 2680 }, { "epoch": 8.456333595594021, "grad_norm": 0.23077386898729263, "learning_rate": 6.201377470055274e-06, "loss": 0.1643, "step": 2681 }, { "epoch": 8.459480723839496, "grad_norm": 0.2314358817989027, "learning_rate": 6.192570306977274e-06, "loss": 0.1659, "step": 2682 }, { "epoch": 8.462627852084973, "grad_norm": 0.2429161146850131, "learning_rate": 6.183779654133711e-06, "loss": 0.1658, "step": 2683 }, { "epoch": 8.465774980330448, "grad_norm": 0.22921232924755053, "learning_rate": 6.175005520706083e-06, "loss": 0.1753, "step": 2684 }, { "epoch": 8.468922108575924, "grad_norm": 0.23660956898638807, "learning_rate": 6.166247915858612e-06, "loss": 0.1641, "step": 2685 }, { "epoch": 8.472069236821401, "grad_norm": 0.23359602867885043, "learning_rate": 6.157506848738281e-06, "loss": 0.1663, "step": 2686 }, { "epoch": 8.475216365066876, "grad_norm": 0.22916515054110842, "learning_rate": 6.148782328474779e-06, "loss": 0.1681, "step": 2687 }, { "epoch": 8.478363493312353, "grad_norm": 0.2341655050876696, "learning_rate": 6.1400743641805295e-06, "loss": 0.1637, "step": 2688 }, { "epoch": 8.48151062155783, "grad_norm": 0.2326657826844374, "learning_rate": 6.131382964950646e-06, "loss": 0.1714, "step": 2689 }, { "epoch": 8.484657749803304, "grad_norm": 0.22139519017417883, "learning_rate": 6.122708139862964e-06, "loss": 0.1644, "step": 2690 }, { "epoch": 8.487804878048781, "grad_norm": 0.2358515865230787, "learning_rate": 6.114049897977987e-06, "loss": 0.1678, "step": 2691 }, { "epoch": 8.490952006294256, "grad_norm": 0.24072618363900117, "learning_rate": 6.105408248338907e-06, "loss": 0.1652, "step": 2692 }, { "epoch": 8.494099134539733, "grad_norm": 0.23833730230772185, "learning_rate": 6.0967831999715895e-06, "loss": 0.1653, "step": 2693 }, { "epoch": 8.49724626278521, "grad_norm": 0.23569737870390947, "learning_rate": 6.088174761884547e-06, "loss": 0.1676, "step": 2694 }, { "epoch": 8.500393391030684, "grad_norm": 0.2362096527557548, "learning_rate": 6.079582943068963e-06, "loss": 0.1613, "step": 2695 }, { "epoch": 8.50354051927616, "grad_norm": 0.2276250499406588, "learning_rate": 6.07100775249864e-06, "loss": 0.1679, "step": 2696 }, { "epoch": 8.506687647521636, "grad_norm": 0.23255051930242077, "learning_rate": 6.062449199130038e-06, "loss": 0.158, "step": 2697 }, { "epoch": 8.509834775767112, "grad_norm": 0.23823745767581703, "learning_rate": 6.053907291902215e-06, "loss": 0.1675, "step": 2698 }, { "epoch": 8.51298190401259, "grad_norm": 0.22780299459891643, "learning_rate": 6.04538203973686e-06, "loss": 0.167, "step": 2699 }, { "epoch": 8.516129032258064, "grad_norm": 0.23195173113273695, "learning_rate": 6.036873451538268e-06, "loss": 0.1604, "step": 2700 }, { "epoch": 8.51927616050354, "grad_norm": 0.22611245875521443, "learning_rate": 6.02838153619331e-06, "loss": 0.1672, "step": 2701 }, { "epoch": 8.522423288749017, "grad_norm": 0.22775959921131073, "learning_rate": 6.019906302571467e-06, "loss": 0.1641, "step": 2702 }, { "epoch": 8.525570416994492, "grad_norm": 0.23041588221663856, "learning_rate": 6.011447759524776e-06, "loss": 0.1683, "step": 2703 }, { "epoch": 8.528717545239969, "grad_norm": 0.22838434499944277, "learning_rate": 6.003005915887853e-06, "loss": 0.1637, "step": 2704 }, { "epoch": 8.531864673485444, "grad_norm": 0.23218601962132057, "learning_rate": 5.99458078047787e-06, "loss": 0.1701, "step": 2705 }, { "epoch": 8.53501180173092, "grad_norm": 0.2416174473030719, "learning_rate": 5.986172362094551e-06, "loss": 0.1653, "step": 2706 }, { "epoch": 8.538158929976397, "grad_norm": 0.232552051298067, "learning_rate": 5.977780669520149e-06, "loss": 0.1673, "step": 2707 }, { "epoch": 8.541306058221872, "grad_norm": 0.2345354384269151, "learning_rate": 5.96940571151946e-06, "loss": 0.1597, "step": 2708 }, { "epoch": 8.544453186467349, "grad_norm": 0.2352724122845352, "learning_rate": 5.961047496839797e-06, "loss": 0.17, "step": 2709 }, { "epoch": 8.547600314712824, "grad_norm": 0.2347397790865505, "learning_rate": 5.952706034210978e-06, "loss": 0.1654, "step": 2710 }, { "epoch": 8.5507474429583, "grad_norm": 0.23099011233464176, "learning_rate": 5.944381332345337e-06, "loss": 0.1693, "step": 2711 }, { "epoch": 8.553894571203777, "grad_norm": 0.23436885807183397, "learning_rate": 5.93607339993769e-06, "loss": 0.1625, "step": 2712 }, { "epoch": 8.557041699449252, "grad_norm": 0.23460704187792045, "learning_rate": 5.92778224566535e-06, "loss": 0.1661, "step": 2713 }, { "epoch": 8.560188827694729, "grad_norm": 0.23592554127140355, "learning_rate": 5.919507878188092e-06, "loss": 0.1681, "step": 2714 }, { "epoch": 8.563335955940204, "grad_norm": 0.22436344128981697, "learning_rate": 5.9112503061481685e-06, "loss": 0.1681, "step": 2715 }, { "epoch": 8.56648308418568, "grad_norm": 0.23478075715615096, "learning_rate": 5.903009538170289e-06, "loss": 0.1697, "step": 2716 }, { "epoch": 8.569630212431157, "grad_norm": 0.23791774157827272, "learning_rate": 5.894785582861606e-06, "loss": 0.1679, "step": 2717 }, { "epoch": 8.572777340676632, "grad_norm": 0.2411394751490439, "learning_rate": 5.886578448811714e-06, "loss": 0.167, "step": 2718 }, { "epoch": 8.575924468922109, "grad_norm": 0.22846188636305548, "learning_rate": 5.878388144592642e-06, "loss": 0.1691, "step": 2719 }, { "epoch": 8.579071597167584, "grad_norm": 0.2385881186153001, "learning_rate": 5.8702146787588435e-06, "loss": 0.1655, "step": 2720 }, { "epoch": 8.58221872541306, "grad_norm": 0.24030991985714578, "learning_rate": 5.862058059847169e-06, "loss": 0.1724, "step": 2721 }, { "epoch": 8.585365853658537, "grad_norm": 0.22562331859076523, "learning_rate": 5.8539182963768935e-06, "loss": 0.1673, "step": 2722 }, { "epoch": 8.588512981904012, "grad_norm": 0.22799397594939672, "learning_rate": 5.845795396849671e-06, "loss": 0.1625, "step": 2723 }, { "epoch": 8.591660110149489, "grad_norm": 0.22658460250777282, "learning_rate": 5.837689369749554e-06, "loss": 0.1672, "step": 2724 }, { "epoch": 8.594807238394965, "grad_norm": 0.23287534822002212, "learning_rate": 5.829600223542965e-06, "loss": 0.167, "step": 2725 }, { "epoch": 8.59795436664044, "grad_norm": 0.23513451530619064, "learning_rate": 5.821527966678693e-06, "loss": 0.1604, "step": 2726 }, { "epoch": 8.601101494885917, "grad_norm": 0.24331885939351217, "learning_rate": 5.8134726075878965e-06, "loss": 0.1669, "step": 2727 }, { "epoch": 8.604248623131392, "grad_norm": 0.23480791746582516, "learning_rate": 5.805434154684075e-06, "loss": 0.1631, "step": 2728 }, { "epoch": 8.607395751376869, "grad_norm": 0.2313813224235784, "learning_rate": 5.797412616363077e-06, "loss": 0.1718, "step": 2729 }, { "epoch": 8.610542879622345, "grad_norm": 0.23469324508654915, "learning_rate": 5.789408001003079e-06, "loss": 0.1645, "step": 2730 }, { "epoch": 8.61369000786782, "grad_norm": 0.22766690046481194, "learning_rate": 5.781420316964586e-06, "loss": 0.1641, "step": 2731 }, { "epoch": 8.616837136113297, "grad_norm": 0.22244068111472892, "learning_rate": 5.773449572590417e-06, "loss": 0.1677, "step": 2732 }, { "epoch": 8.619984264358772, "grad_norm": 0.22626928201892044, "learning_rate": 5.7654957762056994e-06, "loss": 0.1658, "step": 2733 }, { "epoch": 8.623131392604249, "grad_norm": 0.23558057122663417, "learning_rate": 5.7575589361178645e-06, "loss": 0.1623, "step": 2734 }, { "epoch": 8.626278520849725, "grad_norm": 0.22952982394885552, "learning_rate": 5.749639060616618e-06, "loss": 0.1654, "step": 2735 }, { "epoch": 8.6294256490952, "grad_norm": 0.2260057004402793, "learning_rate": 5.74173615797396e-06, "loss": 0.1611, "step": 2736 }, { "epoch": 8.632572777340677, "grad_norm": 0.22660148535255212, "learning_rate": 5.733850236444161e-06, "loss": 0.1654, "step": 2737 }, { "epoch": 8.635719905586154, "grad_norm": 0.2275035237099505, "learning_rate": 5.725981304263756e-06, "loss": 0.1704, "step": 2738 }, { "epoch": 8.638867033831628, "grad_norm": 0.23670395896446667, "learning_rate": 5.718129369651524e-06, "loss": 0.1683, "step": 2739 }, { "epoch": 8.642014162077105, "grad_norm": 0.2373653142814522, "learning_rate": 5.710294440808507e-06, "loss": 0.1721, "step": 2740 }, { "epoch": 8.64516129032258, "grad_norm": 0.23104346401189593, "learning_rate": 5.702476525917979e-06, "loss": 0.1663, "step": 2741 }, { "epoch": 8.648308418568057, "grad_norm": 0.23283488730211302, "learning_rate": 5.6946756331454354e-06, "loss": 0.1668, "step": 2742 }, { "epoch": 8.651455546813533, "grad_norm": 0.2311838977095239, "learning_rate": 5.6868917706386105e-06, "loss": 0.1747, "step": 2743 }, { "epoch": 8.654602675059008, "grad_norm": 0.2241606956720152, "learning_rate": 5.67912494652743e-06, "loss": 0.1641, "step": 2744 }, { "epoch": 8.657749803304485, "grad_norm": 0.23193312446144088, "learning_rate": 5.671375168924041e-06, "loss": 0.1696, "step": 2745 }, { "epoch": 8.66089693154996, "grad_norm": 0.23447145957791274, "learning_rate": 5.663642445922777e-06, "loss": 0.1699, "step": 2746 }, { "epoch": 8.664044059795437, "grad_norm": 0.22987054795106973, "learning_rate": 5.655926785600158e-06, "loss": 0.1612, "step": 2747 }, { "epoch": 8.667191188040913, "grad_norm": 0.2352484784071021, "learning_rate": 5.648228196014888e-06, "loss": 0.1674, "step": 2748 }, { "epoch": 8.670338316286388, "grad_norm": 0.23150853204839367, "learning_rate": 5.640546685207842e-06, "loss": 0.1677, "step": 2749 }, { "epoch": 8.673485444531865, "grad_norm": 0.2319763405157472, "learning_rate": 5.632882261202054e-06, "loss": 0.1627, "step": 2750 }, { "epoch": 8.67663257277734, "grad_norm": 0.23040198433732714, "learning_rate": 5.625234932002706e-06, "loss": 0.1641, "step": 2751 }, { "epoch": 8.679779701022817, "grad_norm": 0.24040413406804584, "learning_rate": 5.617604705597136e-06, "loss": 0.166, "step": 2752 }, { "epoch": 8.682926829268293, "grad_norm": 0.23110146532134304, "learning_rate": 5.609991589954809e-06, "loss": 0.1683, "step": 2753 }, { "epoch": 8.686073957513768, "grad_norm": 0.22419932903394638, "learning_rate": 5.602395593027327e-06, "loss": 0.1716, "step": 2754 }, { "epoch": 8.689221085759245, "grad_norm": 0.22482368034008485, "learning_rate": 5.594816722748403e-06, "loss": 0.1612, "step": 2755 }, { "epoch": 8.69236821400472, "grad_norm": 0.22947954480527974, "learning_rate": 5.58725498703387e-06, "loss": 0.1703, "step": 2756 }, { "epoch": 8.695515342250197, "grad_norm": 0.234698861914943, "learning_rate": 5.579710393781666e-06, "loss": 0.168, "step": 2757 }, { "epoch": 8.698662470495673, "grad_norm": 0.24098945022071477, "learning_rate": 5.5721829508718095e-06, "loss": 0.1665, "step": 2758 }, { "epoch": 8.701809598741148, "grad_norm": 0.2371662907560327, "learning_rate": 5.564672666166425e-06, "loss": 0.1667, "step": 2759 }, { "epoch": 8.704956726986625, "grad_norm": 0.23429888837701635, "learning_rate": 5.557179547509703e-06, "loss": 0.1718, "step": 2760 }, { "epoch": 8.708103855232102, "grad_norm": 0.23129876881925154, "learning_rate": 5.549703602727912e-06, "loss": 0.1746, "step": 2761 }, { "epoch": 8.711250983477576, "grad_norm": 0.23080453324233027, "learning_rate": 5.542244839629379e-06, "loss": 0.1654, "step": 2762 }, { "epoch": 8.714398111723053, "grad_norm": 0.23544727546366193, "learning_rate": 5.534803266004491e-06, "loss": 0.1698, "step": 2763 }, { "epoch": 8.717545239968528, "grad_norm": 0.23363093531020357, "learning_rate": 5.527378889625668e-06, "loss": 0.1647, "step": 2764 }, { "epoch": 8.720692368214005, "grad_norm": 0.23435894467290583, "learning_rate": 5.519971718247384e-06, "loss": 0.163, "step": 2765 }, { "epoch": 8.723839496459481, "grad_norm": 0.23005445022151622, "learning_rate": 5.512581759606137e-06, "loss": 0.1648, "step": 2766 }, { "epoch": 8.726986624704956, "grad_norm": 0.22957271730336687, "learning_rate": 5.50520902142044e-06, "loss": 0.1666, "step": 2767 }, { "epoch": 8.730133752950433, "grad_norm": 0.22954716917821802, "learning_rate": 5.497853511390836e-06, "loss": 0.1688, "step": 2768 }, { "epoch": 8.733280881195908, "grad_norm": 0.22392571792472526, "learning_rate": 5.490515237199852e-06, "loss": 0.17, "step": 2769 }, { "epoch": 8.736428009441385, "grad_norm": 0.2355921505287895, "learning_rate": 5.483194206512034e-06, "loss": 0.1662, "step": 2770 }, { "epoch": 8.739575137686861, "grad_norm": 0.23974943062847195, "learning_rate": 5.475890426973903e-06, "loss": 0.1694, "step": 2771 }, { "epoch": 8.742722265932336, "grad_norm": 0.2338669519112205, "learning_rate": 5.46860390621397e-06, "loss": 0.1659, "step": 2772 }, { "epoch": 8.745869394177813, "grad_norm": 0.23127783986823608, "learning_rate": 5.461334651842721e-06, "loss": 0.1664, "step": 2773 }, { "epoch": 8.74901652242329, "grad_norm": 0.23190110954348572, "learning_rate": 5.454082671452597e-06, "loss": 0.1676, "step": 2774 }, { "epoch": 8.752163650668765, "grad_norm": 0.23152867010137812, "learning_rate": 5.446847972618009e-06, "loss": 0.1635, "step": 2775 }, { "epoch": 8.755310778914241, "grad_norm": 0.23464418647093355, "learning_rate": 5.439630562895311e-06, "loss": 0.1601, "step": 2776 }, { "epoch": 8.758457907159716, "grad_norm": 0.22904054716255184, "learning_rate": 5.43243044982281e-06, "loss": 0.1658, "step": 2777 }, { "epoch": 8.761605035405193, "grad_norm": 0.22867471399286618, "learning_rate": 5.425247640920726e-06, "loss": 0.1677, "step": 2778 }, { "epoch": 8.76475216365067, "grad_norm": 0.22791012026392768, "learning_rate": 5.418082143691229e-06, "loss": 0.1732, "step": 2779 }, { "epoch": 8.767899291896144, "grad_norm": 0.21856098390948892, "learning_rate": 5.410933965618389e-06, "loss": 0.1648, "step": 2780 }, { "epoch": 8.771046420141621, "grad_norm": 0.22405701257784277, "learning_rate": 5.4038031141682e-06, "loss": 0.1597, "step": 2781 }, { "epoch": 8.774193548387096, "grad_norm": 0.23307681739252337, "learning_rate": 5.396689596788556e-06, "loss": 0.1675, "step": 2782 }, { "epoch": 8.777340676632573, "grad_norm": 0.23277974511466729, "learning_rate": 5.389593420909237e-06, "loss": 0.1657, "step": 2783 }, { "epoch": 8.78048780487805, "grad_norm": 0.2249371763607525, "learning_rate": 5.382514593941926e-06, "loss": 0.1667, "step": 2784 }, { "epoch": 8.783634933123524, "grad_norm": 0.23060309789801092, "learning_rate": 5.375453123280171e-06, "loss": 0.1567, "step": 2785 }, { "epoch": 8.786782061369001, "grad_norm": 0.22431101983410784, "learning_rate": 5.368409016299404e-06, "loss": 0.1646, "step": 2786 }, { "epoch": 8.789929189614476, "grad_norm": 0.23707669405318257, "learning_rate": 5.36138228035691e-06, "loss": 0.1686, "step": 2787 }, { "epoch": 8.793076317859953, "grad_norm": 0.22826592786983121, "learning_rate": 5.3543729227918375e-06, "loss": 0.167, "step": 2788 }, { "epoch": 8.79622344610543, "grad_norm": 0.22647434359880703, "learning_rate": 5.34738095092519e-06, "loss": 0.1671, "step": 2789 }, { "epoch": 8.799370574350904, "grad_norm": 0.23277753509076599, "learning_rate": 5.340406372059793e-06, "loss": 0.1694, "step": 2790 }, { "epoch": 8.802517702596381, "grad_norm": 0.22966481306572614, "learning_rate": 5.33344919348033e-06, "loss": 0.1677, "step": 2791 }, { "epoch": 8.805664830841856, "grad_norm": 0.2345330156945199, "learning_rate": 5.3265094224532925e-06, "loss": 0.1662, "step": 2792 }, { "epoch": 8.808811959087333, "grad_norm": 0.24220581762981963, "learning_rate": 5.319587066227e-06, "loss": 0.1616, "step": 2793 }, { "epoch": 8.81195908733281, "grad_norm": 0.23003977776252021, "learning_rate": 5.312682132031575e-06, "loss": 0.1692, "step": 2794 }, { "epoch": 8.815106215578284, "grad_norm": 0.22663651876262159, "learning_rate": 5.3057946270789504e-06, "loss": 0.171, "step": 2795 }, { "epoch": 8.818253343823761, "grad_norm": 0.23957359702108452, "learning_rate": 5.298924558562852e-06, "loss": 0.1653, "step": 2796 }, { "epoch": 8.821400472069238, "grad_norm": 0.229995530933423, "learning_rate": 5.292071933658794e-06, "loss": 0.1608, "step": 2797 }, { "epoch": 8.824547600314713, "grad_norm": 0.23340637605795178, "learning_rate": 5.2852367595240735e-06, "loss": 0.1727, "step": 2798 }, { "epoch": 8.82769472856019, "grad_norm": 0.22650780265142523, "learning_rate": 5.278419043297756e-06, "loss": 0.1727, "step": 2799 }, { "epoch": 8.830841856805664, "grad_norm": 0.23469188183263542, "learning_rate": 5.271618792100679e-06, "loss": 0.1595, "step": 2800 }, { "epoch": 8.83398898505114, "grad_norm": 0.22902689381057262, "learning_rate": 5.264836013035435e-06, "loss": 0.1666, "step": 2801 }, { "epoch": 8.837136113296618, "grad_norm": 0.24601196499502342, "learning_rate": 5.25807071318637e-06, "loss": 0.1675, "step": 2802 }, { "epoch": 8.840283241542092, "grad_norm": 0.23570422675260183, "learning_rate": 5.251322899619565e-06, "loss": 0.1632, "step": 2803 }, { "epoch": 8.84343036978757, "grad_norm": 0.23861546188691643, "learning_rate": 5.2445925793828504e-06, "loss": 0.1711, "step": 2804 }, { "epoch": 8.846577498033044, "grad_norm": 0.22082821092360364, "learning_rate": 5.237879759505778e-06, "loss": 0.1712, "step": 2805 }, { "epoch": 8.84972462627852, "grad_norm": 0.23120537316015438, "learning_rate": 5.2311844469996205e-06, "loss": 0.1664, "step": 2806 }, { "epoch": 8.852871754523997, "grad_norm": 0.23260665872741915, "learning_rate": 5.224506648857374e-06, "loss": 0.1557, "step": 2807 }, { "epoch": 8.856018882769472, "grad_norm": 0.2310320953143728, "learning_rate": 5.217846372053722e-06, "loss": 0.1701, "step": 2808 }, { "epoch": 8.859166011014949, "grad_norm": 0.23800631732996927, "learning_rate": 5.211203623545071e-06, "loss": 0.166, "step": 2809 }, { "epoch": 8.862313139260426, "grad_norm": 0.2230982585460558, "learning_rate": 5.204578410269503e-06, "loss": 0.1748, "step": 2810 }, { "epoch": 8.8654602675059, "grad_norm": 0.23177916380091554, "learning_rate": 5.197970739146792e-06, "loss": 0.1667, "step": 2811 }, { "epoch": 8.868607395751377, "grad_norm": 0.23008230747581662, "learning_rate": 5.191380617078389e-06, "loss": 0.1702, "step": 2812 }, { "epoch": 8.871754523996852, "grad_norm": 0.22716372034675994, "learning_rate": 5.184808050947413e-06, "loss": 0.1627, "step": 2813 }, { "epoch": 8.874901652242329, "grad_norm": 0.23335158310105836, "learning_rate": 5.178253047618657e-06, "loss": 0.1723, "step": 2814 }, { "epoch": 8.878048780487806, "grad_norm": 0.22629652030237907, "learning_rate": 5.171715613938553e-06, "loss": 0.1665, "step": 2815 }, { "epoch": 8.88119590873328, "grad_norm": 0.23433706877773142, "learning_rate": 5.165195756735199e-06, "loss": 0.1742, "step": 2816 }, { "epoch": 8.884343036978757, "grad_norm": 0.2410492157482833, "learning_rate": 5.158693482818321e-06, "loss": 0.173, "step": 2817 }, { "epoch": 8.887490165224232, "grad_norm": 0.2352594527132892, "learning_rate": 5.152208798979295e-06, "loss": 0.1581, "step": 2818 }, { "epoch": 8.890637293469709, "grad_norm": 0.23014025030306132, "learning_rate": 5.145741711991104e-06, "loss": 0.1674, "step": 2819 }, { "epoch": 8.893784421715186, "grad_norm": 0.23167065606455844, "learning_rate": 5.139292228608378e-06, "loss": 0.1646, "step": 2820 }, { "epoch": 8.89693154996066, "grad_norm": 0.2399892401864134, "learning_rate": 5.1328603555673375e-06, "loss": 0.1614, "step": 2821 }, { "epoch": 8.900078678206137, "grad_norm": 0.24240253361943384, "learning_rate": 5.126446099585824e-06, "loss": 0.1671, "step": 2822 }, { "epoch": 8.903225806451612, "grad_norm": 0.23174387029737067, "learning_rate": 5.120049467363275e-06, "loss": 0.1625, "step": 2823 }, { "epoch": 8.906372934697089, "grad_norm": 0.23155008484418904, "learning_rate": 5.1136704655807145e-06, "loss": 0.1689, "step": 2824 }, { "epoch": 8.909520062942565, "grad_norm": 0.2340628919114927, "learning_rate": 5.107309100900762e-06, "loss": 0.1623, "step": 2825 }, { "epoch": 8.91266719118804, "grad_norm": 0.22856119137017192, "learning_rate": 5.100965379967606e-06, "loss": 0.1634, "step": 2826 }, { "epoch": 8.915814319433517, "grad_norm": 0.238656910481426, "learning_rate": 5.094639309407021e-06, "loss": 0.1654, "step": 2827 }, { "epoch": 8.918961447678992, "grad_norm": 0.23793602636902197, "learning_rate": 5.0883308958263255e-06, "loss": 0.1695, "step": 2828 }, { "epoch": 8.922108575924469, "grad_norm": 0.23552997928550543, "learning_rate": 5.082040145814413e-06, "loss": 0.1634, "step": 2829 }, { "epoch": 8.925255704169945, "grad_norm": 0.2345591111424614, "learning_rate": 5.075767065941728e-06, "loss": 0.1712, "step": 2830 }, { "epoch": 8.92840283241542, "grad_norm": 0.23815624814766417, "learning_rate": 5.069511662760245e-06, "loss": 0.1653, "step": 2831 }, { "epoch": 8.931549960660897, "grad_norm": 0.2339825850851606, "learning_rate": 5.063273942803491e-06, "loss": 0.1713, "step": 2832 }, { "epoch": 8.934697088906374, "grad_norm": 0.23772485751502045, "learning_rate": 5.057053912586512e-06, "loss": 0.1616, "step": 2833 }, { "epoch": 8.937844217151849, "grad_norm": 0.23732466399568317, "learning_rate": 5.050851578605892e-06, "loss": 0.1699, "step": 2834 }, { "epoch": 8.940991345397325, "grad_norm": 0.23247387261686783, "learning_rate": 5.044666947339716e-06, "loss": 0.1677, "step": 2835 }, { "epoch": 8.9441384736428, "grad_norm": 0.22590208575695253, "learning_rate": 5.038500025247589e-06, "loss": 0.1676, "step": 2836 }, { "epoch": 8.947285601888277, "grad_norm": 0.23597275765287926, "learning_rate": 5.032350818770616e-06, "loss": 0.1677, "step": 2837 }, { "epoch": 8.950432730133754, "grad_norm": 0.23033651705619715, "learning_rate": 5.0262193343314e-06, "loss": 0.1686, "step": 2838 }, { "epoch": 8.953579858379229, "grad_norm": 0.2298946051218498, "learning_rate": 5.020105578334038e-06, "loss": 0.1693, "step": 2839 }, { "epoch": 8.956726986624705, "grad_norm": 0.22515833749826317, "learning_rate": 5.014009557164099e-06, "loss": 0.1658, "step": 2840 }, { "epoch": 8.95987411487018, "grad_norm": 0.22997911948352323, "learning_rate": 5.0079312771886425e-06, "loss": 0.1709, "step": 2841 }, { "epoch": 8.963021243115657, "grad_norm": 0.23065097782750338, "learning_rate": 5.001870744756182e-06, "loss": 0.1645, "step": 2842 }, { "epoch": 8.966168371361134, "grad_norm": 0.23243456881790153, "learning_rate": 4.995827966196714e-06, "loss": 0.1715, "step": 2843 }, { "epoch": 8.969315499606608, "grad_norm": 0.22702087926588302, "learning_rate": 4.9898029478216735e-06, "loss": 0.1656, "step": 2844 }, { "epoch": 8.972462627852085, "grad_norm": 0.23667845742674357, "learning_rate": 4.983795695923958e-06, "loss": 0.1665, "step": 2845 }, { "epoch": 8.975609756097562, "grad_norm": 0.23376278457824037, "learning_rate": 4.977806216777904e-06, "loss": 0.1649, "step": 2846 }, { "epoch": 8.978756884343037, "grad_norm": 0.22714146711985553, "learning_rate": 4.971834516639281e-06, "loss": 0.17, "step": 2847 }, { "epoch": 8.981904012588513, "grad_norm": 0.23000407639648165, "learning_rate": 4.965880601745301e-06, "loss": 0.1658, "step": 2848 }, { "epoch": 8.985051140833988, "grad_norm": 0.23490571123631973, "learning_rate": 4.959944478314586e-06, "loss": 0.1637, "step": 2849 }, { "epoch": 8.988198269079465, "grad_norm": 0.2357626917407318, "learning_rate": 4.954026152547187e-06, "loss": 0.1643, "step": 2850 }, { "epoch": 8.991345397324942, "grad_norm": 0.22454926371991393, "learning_rate": 4.948125630624556e-06, "loss": 0.1712, "step": 2851 }, { "epoch": 8.994492525570417, "grad_norm": 0.22383263709497628, "learning_rate": 4.9422429187095586e-06, "loss": 0.1707, "step": 2852 }, { "epoch": 8.997639653815893, "grad_norm": 0.23402521859638922, "learning_rate": 4.936378022946449e-06, "loss": 0.1627, "step": 2853 }, { "epoch": 9.003147128245477, "grad_norm": 0.7259930729062384, "learning_rate": 4.930530949460883e-06, "loss": 0.3053, "step": 2854 }, { "epoch": 9.006294256490952, "grad_norm": 0.2562706446384713, "learning_rate": 4.924701704359899e-06, "loss": 0.1322, "step": 2855 }, { "epoch": 9.009441384736428, "grad_norm": 0.2854052200203414, "learning_rate": 4.918890293731908e-06, "loss": 0.1333, "step": 2856 }, { "epoch": 9.012588512981903, "grad_norm": 0.4691429203587427, "learning_rate": 4.9130967236467026e-06, "loss": 0.1374, "step": 2857 }, { "epoch": 9.01573564122738, "grad_norm": 0.4028184182051083, "learning_rate": 4.907321000155432e-06, "loss": 0.1364, "step": 2858 }, { "epoch": 9.018882769472857, "grad_norm": 0.27399671132513376, "learning_rate": 4.901563129290619e-06, "loss": 0.137, "step": 2859 }, { "epoch": 9.022029897718332, "grad_norm": 0.3360542252051378, "learning_rate": 4.895823117066122e-06, "loss": 0.1385, "step": 2860 }, { "epoch": 9.025177025963808, "grad_norm": 0.3362833473207967, "learning_rate": 4.890100969477159e-06, "loss": 0.1308, "step": 2861 }, { "epoch": 9.028324154209285, "grad_norm": 0.280838008338128, "learning_rate": 4.884396692500293e-06, "loss": 0.1274, "step": 2862 }, { "epoch": 9.03147128245476, "grad_norm": 0.27276504949498936, "learning_rate": 4.878710292093409e-06, "loss": 0.1293, "step": 2863 }, { "epoch": 9.034618410700237, "grad_norm": 0.30411776357879683, "learning_rate": 4.8730417741957306e-06, "loss": 0.1315, "step": 2864 }, { "epoch": 9.037765538945711, "grad_norm": 0.3071835435610242, "learning_rate": 4.867391144727798e-06, "loss": 0.1292, "step": 2865 }, { "epoch": 9.040912667191188, "grad_norm": 0.2774412987717509, "learning_rate": 4.861758409591474e-06, "loss": 0.1352, "step": 2866 }, { "epoch": 9.044059795436665, "grad_norm": 0.24821320889899118, "learning_rate": 4.8561435746699224e-06, "loss": 0.132, "step": 2867 }, { "epoch": 9.04720692368214, "grad_norm": 0.260501036335888, "learning_rate": 4.85054664582762e-06, "loss": 0.1298, "step": 2868 }, { "epoch": 9.050354051927616, "grad_norm": 0.25950354132313963, "learning_rate": 4.844967628910332e-06, "loss": 0.1382, "step": 2869 }, { "epoch": 9.053501180173091, "grad_norm": 0.254911463794064, "learning_rate": 4.839406529745122e-06, "loss": 0.1331, "step": 2870 }, { "epoch": 9.056648308418568, "grad_norm": 0.2568698001440897, "learning_rate": 4.833863354140345e-06, "loss": 0.1313, "step": 2871 }, { "epoch": 9.059795436664045, "grad_norm": 0.25834630400276143, "learning_rate": 4.828338107885621e-06, "loss": 0.1339, "step": 2872 }, { "epoch": 9.06294256490952, "grad_norm": 0.2626115567381949, "learning_rate": 4.822830796751856e-06, "loss": 0.1398, "step": 2873 }, { "epoch": 9.066089693154996, "grad_norm": 0.23847349899918344, "learning_rate": 4.817341426491213e-06, "loss": 0.1304, "step": 2874 }, { "epoch": 9.069236821400471, "grad_norm": 0.25139369715829973, "learning_rate": 4.811870002837126e-06, "loss": 0.1309, "step": 2875 }, { "epoch": 9.072383949645948, "grad_norm": 0.2453860537308011, "learning_rate": 4.806416531504274e-06, "loss": 0.135, "step": 2876 }, { "epoch": 9.075531077891425, "grad_norm": 0.25404710592240104, "learning_rate": 4.800981018188602e-06, "loss": 0.1266, "step": 2877 }, { "epoch": 9.0786782061369, "grad_norm": 0.22754008276241747, "learning_rate": 4.79556346856728e-06, "loss": 0.1357, "step": 2878 }, { "epoch": 9.081825334382376, "grad_norm": 0.25168887931836287, "learning_rate": 4.79016388829873e-06, "loss": 0.1341, "step": 2879 }, { "epoch": 9.084972462627853, "grad_norm": 0.2502383435030883, "learning_rate": 4.784782283022597e-06, "loss": 0.1352, "step": 2880 }, { "epoch": 9.088119590873328, "grad_norm": 0.25986754177545734, "learning_rate": 4.7794186583597544e-06, "loss": 0.132, "step": 2881 }, { "epoch": 9.091266719118805, "grad_norm": 0.22662605665615443, "learning_rate": 4.774073019912298e-06, "loss": 0.1366, "step": 2882 }, { "epoch": 9.09441384736428, "grad_norm": 0.2304079612052606, "learning_rate": 4.7687453732635305e-06, "loss": 0.131, "step": 2883 }, { "epoch": 9.097560975609756, "grad_norm": 0.23596467115393177, "learning_rate": 4.763435723977974e-06, "loss": 0.1311, "step": 2884 }, { "epoch": 9.100708103855233, "grad_norm": 0.23983207466582687, "learning_rate": 4.7581440776013425e-06, "loss": 0.1295, "step": 2885 }, { "epoch": 9.103855232100708, "grad_norm": 0.22975847317016954, "learning_rate": 4.752870439660551e-06, "loss": 0.1321, "step": 2886 }, { "epoch": 9.107002360346184, "grad_norm": 0.23955053071281487, "learning_rate": 4.747614815663711e-06, "loss": 0.1355, "step": 2887 }, { "epoch": 9.11014948859166, "grad_norm": 0.2342749597252554, "learning_rate": 4.742377211100105e-06, "loss": 0.1302, "step": 2888 }, { "epoch": 9.113296616837136, "grad_norm": 0.24153308674358212, "learning_rate": 4.7371576314402135e-06, "loss": 0.1264, "step": 2889 }, { "epoch": 9.116443745082613, "grad_norm": 0.2321425269869848, "learning_rate": 4.731956082135669e-06, "loss": 0.1268, "step": 2890 }, { "epoch": 9.119590873328088, "grad_norm": 0.23212954304734737, "learning_rate": 4.726772568619297e-06, "loss": 0.1325, "step": 2891 }, { "epoch": 9.122738001573564, "grad_norm": 0.23275541822697182, "learning_rate": 4.721607096305063e-06, "loss": 0.1239, "step": 2892 }, { "epoch": 9.12588512981904, "grad_norm": 0.23593940968545418, "learning_rate": 4.716459670588102e-06, "loss": 0.1332, "step": 2893 }, { "epoch": 9.129032258064516, "grad_norm": 0.24159409253703298, "learning_rate": 4.711330296844695e-06, "loss": 0.1337, "step": 2894 }, { "epoch": 9.132179386309993, "grad_norm": 0.225298246971766, "learning_rate": 4.706218980432269e-06, "loss": 0.1332, "step": 2895 }, { "epoch": 9.135326514555468, "grad_norm": 0.23345137024396634, "learning_rate": 4.701125726689394e-06, "loss": 0.1289, "step": 2896 }, { "epoch": 9.138473642800944, "grad_norm": 0.22765284284624834, "learning_rate": 4.69605054093577e-06, "loss": 0.1332, "step": 2897 }, { "epoch": 9.141620771046421, "grad_norm": 0.2365079564611352, "learning_rate": 4.690993428472231e-06, "loss": 0.1353, "step": 2898 }, { "epoch": 9.144767899291896, "grad_norm": 0.22894145213619913, "learning_rate": 4.685954394580723e-06, "loss": 0.1316, "step": 2899 }, { "epoch": 9.147915027537373, "grad_norm": 0.2316844787210736, "learning_rate": 4.680933444524327e-06, "loss": 0.1319, "step": 2900 }, { "epoch": 9.151062155782848, "grad_norm": 0.2295088666150164, "learning_rate": 4.675930583547219e-06, "loss": 0.1352, "step": 2901 }, { "epoch": 9.154209284028324, "grad_norm": 0.22737542176781833, "learning_rate": 4.670945816874691e-06, "loss": 0.1362, "step": 2902 }, { "epoch": 9.157356412273801, "grad_norm": 0.23483198051992082, "learning_rate": 4.66597914971314e-06, "loss": 0.123, "step": 2903 }, { "epoch": 9.160503540519276, "grad_norm": 0.23493971809693678, "learning_rate": 4.661030587250045e-06, "loss": 0.1345, "step": 2904 }, { "epoch": 9.163650668764753, "grad_norm": 0.23898547282251156, "learning_rate": 4.656100134653988e-06, "loss": 0.1289, "step": 2905 }, { "epoch": 9.166797797010227, "grad_norm": 0.22591766013667208, "learning_rate": 4.65118779707463e-06, "loss": 0.1365, "step": 2906 }, { "epoch": 9.169944925255704, "grad_norm": 0.22651931489444632, "learning_rate": 4.646293579642716e-06, "loss": 0.1372, "step": 2907 }, { "epoch": 9.17309205350118, "grad_norm": 0.22780380035435008, "learning_rate": 4.641417487470058e-06, "loss": 0.135, "step": 2908 }, { "epoch": 9.176239181746656, "grad_norm": 0.23276681424595513, "learning_rate": 4.636559525649546e-06, "loss": 0.1362, "step": 2909 }, { "epoch": 9.179386309992132, "grad_norm": 0.23869234341896103, "learning_rate": 4.631719699255123e-06, "loss": 0.1352, "step": 2910 }, { "epoch": 9.182533438237607, "grad_norm": 0.23080606084113706, "learning_rate": 4.626898013341801e-06, "loss": 0.1347, "step": 2911 }, { "epoch": 9.185680566483084, "grad_norm": 0.23836084114678538, "learning_rate": 4.622094472945639e-06, "loss": 0.1246, "step": 2912 }, { "epoch": 9.18882769472856, "grad_norm": 0.2404240676609406, "learning_rate": 4.6173090830837434e-06, "loss": 0.1325, "step": 2913 }, { "epoch": 9.191974822974036, "grad_norm": 0.2123536854744844, "learning_rate": 4.612541848754265e-06, "loss": 0.1355, "step": 2914 }, { "epoch": 9.195121951219512, "grad_norm": 0.23581558553764673, "learning_rate": 4.60779277493639e-06, "loss": 0.1314, "step": 2915 }, { "epoch": 9.198269079464989, "grad_norm": 0.23688342974144003, "learning_rate": 4.6030618665903425e-06, "loss": 0.1317, "step": 2916 }, { "epoch": 9.201416207710464, "grad_norm": 0.22422105201679832, "learning_rate": 4.598349128657362e-06, "loss": 0.1276, "step": 2917 }, { "epoch": 9.20456333595594, "grad_norm": 0.22578943792556588, "learning_rate": 4.593654566059721e-06, "loss": 0.1339, "step": 2918 }, { "epoch": 9.207710464201416, "grad_norm": 0.24065645526761678, "learning_rate": 4.588978183700705e-06, "loss": 0.1265, "step": 2919 }, { "epoch": 9.210857592446892, "grad_norm": 0.22356993270115313, "learning_rate": 4.584319986464608e-06, "loss": 0.1282, "step": 2920 }, { "epoch": 9.214004720692369, "grad_norm": 0.22840256425011418, "learning_rate": 4.579679979216736e-06, "loss": 0.1354, "step": 2921 }, { "epoch": 9.217151848937844, "grad_norm": 0.22189263858930341, "learning_rate": 4.575058166803388e-06, "loss": 0.1292, "step": 2922 }, { "epoch": 9.22029897718332, "grad_norm": 0.2373676925550199, "learning_rate": 4.570454554051869e-06, "loss": 0.1308, "step": 2923 }, { "epoch": 9.223446105428796, "grad_norm": 0.23466096466526548, "learning_rate": 4.565869145770464e-06, "loss": 0.1307, "step": 2924 }, { "epoch": 9.226593233674272, "grad_norm": 0.22341580256810417, "learning_rate": 4.561301946748457e-06, "loss": 0.1356, "step": 2925 }, { "epoch": 9.229740361919749, "grad_norm": 0.2337076552757127, "learning_rate": 4.5567529617561015e-06, "loss": 0.1351, "step": 2926 }, { "epoch": 9.232887490165224, "grad_norm": 0.22744541315055633, "learning_rate": 4.552222195544636e-06, "loss": 0.1312, "step": 2927 }, { "epoch": 9.2360346184107, "grad_norm": 0.23338172402658214, "learning_rate": 4.547709652846264e-06, "loss": 0.1284, "step": 2928 }, { "epoch": 9.239181746656175, "grad_norm": 0.23508676122090486, "learning_rate": 4.543215338374159e-06, "loss": 0.1317, "step": 2929 }, { "epoch": 9.242328874901652, "grad_norm": 0.24601704848722142, "learning_rate": 4.538739256822453e-06, "loss": 0.1338, "step": 2930 }, { "epoch": 9.245476003147129, "grad_norm": 0.22389912494058156, "learning_rate": 4.5342814128662376e-06, "loss": 0.1316, "step": 2931 }, { "epoch": 9.248623131392604, "grad_norm": 0.2263022423534222, "learning_rate": 4.529841811161555e-06, "loss": 0.1321, "step": 2932 }, { "epoch": 9.25177025963808, "grad_norm": 0.23229249077068176, "learning_rate": 4.5254204563453866e-06, "loss": 0.1347, "step": 2933 }, { "epoch": 9.254917387883557, "grad_norm": 0.2310347838411131, "learning_rate": 4.521017353035675e-06, "loss": 0.131, "step": 2934 }, { "epoch": 9.258064516129032, "grad_norm": 0.23080904694053458, "learning_rate": 4.5166325058312745e-06, "loss": 0.1358, "step": 2935 }, { "epoch": 9.261211644374509, "grad_norm": 0.22383054119713772, "learning_rate": 4.512265919311992e-06, "loss": 0.1348, "step": 2936 }, { "epoch": 9.264358772619984, "grad_norm": 0.23282253286973853, "learning_rate": 4.5079175980385546e-06, "loss": 0.1291, "step": 2937 }, { "epoch": 9.26750590086546, "grad_norm": 0.22462772007805537, "learning_rate": 4.503587546552607e-06, "loss": 0.1326, "step": 2938 }, { "epoch": 9.270653029110937, "grad_norm": 0.2404642052007977, "learning_rate": 4.49927576937672e-06, "loss": 0.1353, "step": 2939 }, { "epoch": 9.273800157356412, "grad_norm": 0.22056179286469124, "learning_rate": 4.494982271014371e-06, "loss": 0.1327, "step": 2940 }, { "epoch": 9.276947285601889, "grad_norm": 0.2366854429506929, "learning_rate": 4.490707055949954e-06, "loss": 0.1324, "step": 2941 }, { "epoch": 9.280094413847364, "grad_norm": 0.2334841351792365, "learning_rate": 4.4864501286487574e-06, "loss": 0.1303, "step": 2942 }, { "epoch": 9.28324154209284, "grad_norm": 0.2411585813043938, "learning_rate": 4.482211493556974e-06, "loss": 0.1346, "step": 2943 }, { "epoch": 9.286388670338317, "grad_norm": 0.21989262346519506, "learning_rate": 4.4779911551016934e-06, "loss": 0.1316, "step": 2944 }, { "epoch": 9.289535798583792, "grad_norm": 0.22995367913581893, "learning_rate": 4.473789117690887e-06, "loss": 0.1277, "step": 2945 }, { "epoch": 9.292682926829269, "grad_norm": 0.22756421143574948, "learning_rate": 4.469605385713421e-06, "loss": 0.1351, "step": 2946 }, { "epoch": 9.295830055074743, "grad_norm": 0.23592901330387175, "learning_rate": 4.465439963539034e-06, "loss": 0.1289, "step": 2947 }, { "epoch": 9.29897718332022, "grad_norm": 0.22539025532476353, "learning_rate": 4.4612928555183486e-06, "loss": 0.1348, "step": 2948 }, { "epoch": 9.302124311565697, "grad_norm": 0.22791556449603953, "learning_rate": 4.45716406598285e-06, "loss": 0.1372, "step": 2949 }, { "epoch": 9.305271439811172, "grad_norm": 0.23492214436922185, "learning_rate": 4.453053599244903e-06, "loss": 0.1378, "step": 2950 }, { "epoch": 9.308418568056648, "grad_norm": 0.2245331885362177, "learning_rate": 4.448961459597719e-06, "loss": 0.1334, "step": 2951 }, { "epoch": 9.311565696302125, "grad_norm": 0.22809869975483035, "learning_rate": 4.444887651315381e-06, "loss": 0.1296, "step": 2952 }, { "epoch": 9.3147128245476, "grad_norm": 0.2324760160461881, "learning_rate": 4.440832178652819e-06, "loss": 0.1334, "step": 2953 }, { "epoch": 9.317859952793077, "grad_norm": 0.2446173081810758, "learning_rate": 4.436795045845812e-06, "loss": 0.1313, "step": 2954 }, { "epoch": 9.321007081038552, "grad_norm": 0.2320500946229002, "learning_rate": 4.432776257110989e-06, "loss": 0.1356, "step": 2955 }, { "epoch": 9.324154209284028, "grad_norm": 0.2448068713490391, "learning_rate": 4.428775816645813e-06, "loss": 0.1329, "step": 2956 }, { "epoch": 9.327301337529505, "grad_norm": 0.22982431355817806, "learning_rate": 4.424793728628586e-06, "loss": 0.134, "step": 2957 }, { "epoch": 9.33044846577498, "grad_norm": 0.22806790138954225, "learning_rate": 4.420829997218441e-06, "loss": 0.1362, "step": 2958 }, { "epoch": 9.333595594020457, "grad_norm": 0.23824485969121148, "learning_rate": 4.416884626555339e-06, "loss": 0.1308, "step": 2959 }, { "epoch": 9.336742722265932, "grad_norm": 0.23917326850947188, "learning_rate": 4.412957620760065e-06, "loss": 0.1235, "step": 2960 }, { "epoch": 9.339889850511408, "grad_norm": 0.23389654883715155, "learning_rate": 4.409048983934219e-06, "loss": 0.1384, "step": 2961 }, { "epoch": 9.343036978756885, "grad_norm": 0.2410002101605033, "learning_rate": 4.405158720160217e-06, "loss": 0.1391, "step": 2962 }, { "epoch": 9.34618410700236, "grad_norm": 0.2463484092587633, "learning_rate": 4.4012868335012865e-06, "loss": 0.1313, "step": 2963 }, { "epoch": 9.349331235247837, "grad_norm": 0.23216314413671288, "learning_rate": 4.3974333280014605e-06, "loss": 0.1321, "step": 2964 }, { "epoch": 9.352478363493312, "grad_norm": 0.25795328112436694, "learning_rate": 4.393598207685572e-06, "loss": 0.134, "step": 2965 }, { "epoch": 9.355625491738788, "grad_norm": 0.22300360535493108, "learning_rate": 4.389781476559255e-06, "loss": 0.1358, "step": 2966 }, { "epoch": 9.358772619984265, "grad_norm": 0.2361759272681179, "learning_rate": 4.385983138608928e-06, "loss": 0.1355, "step": 2967 }, { "epoch": 9.36191974822974, "grad_norm": 0.2393199934322128, "learning_rate": 4.38220319780181e-06, "loss": 0.1323, "step": 2968 }, { "epoch": 9.365066876475217, "grad_norm": 0.2357587424561091, "learning_rate": 4.378441658085899e-06, "loss": 0.1358, "step": 2969 }, { "epoch": 9.368214004720693, "grad_norm": 0.24506755276712727, "learning_rate": 4.374698523389971e-06, "loss": 0.1251, "step": 2970 }, { "epoch": 9.371361132966168, "grad_norm": 0.23734868049410257, "learning_rate": 4.370973797623585e-06, "loss": 0.1355, "step": 2971 }, { "epoch": 9.374508261211645, "grad_norm": 0.23479098741562127, "learning_rate": 4.367267484677067e-06, "loss": 0.1332, "step": 2972 }, { "epoch": 9.37765538945712, "grad_norm": 0.2301962617124699, "learning_rate": 4.363579588421517e-06, "loss": 0.1282, "step": 2973 }, { "epoch": 9.380802517702596, "grad_norm": 0.24990595732884477, "learning_rate": 4.3599101127087944e-06, "loss": 0.1287, "step": 2974 }, { "epoch": 9.383949645948073, "grad_norm": 0.24405837184477827, "learning_rate": 4.356259061371524e-06, "loss": 0.1322, "step": 2975 }, { "epoch": 9.387096774193548, "grad_norm": 0.23354503269275637, "learning_rate": 4.3526264382230806e-06, "loss": 0.1301, "step": 2976 }, { "epoch": 9.390243902439025, "grad_norm": 0.22497899824756662, "learning_rate": 4.349012247057597e-06, "loss": 0.1341, "step": 2977 }, { "epoch": 9.3933910306845, "grad_norm": 0.233509003106069, "learning_rate": 4.345416491649954e-06, "loss": 0.1291, "step": 2978 }, { "epoch": 9.396538158929976, "grad_norm": 0.23591177828002854, "learning_rate": 4.3418391757557745e-06, "loss": 0.1311, "step": 2979 }, { "epoch": 9.399685287175453, "grad_norm": 0.22958226547889216, "learning_rate": 4.338280303111426e-06, "loss": 0.1321, "step": 2980 }, { "epoch": 9.402832415420928, "grad_norm": 0.24045468326756445, "learning_rate": 4.334739877434006e-06, "loss": 0.1326, "step": 2981 }, { "epoch": 9.405979543666405, "grad_norm": 0.24332656727987065, "learning_rate": 4.33121790242135e-06, "loss": 0.1353, "step": 2982 }, { "epoch": 9.40912667191188, "grad_norm": 0.2318763713593753, "learning_rate": 4.327714381752023e-06, "loss": 0.1309, "step": 2983 }, { "epoch": 9.412273800157356, "grad_norm": 0.23460111972050288, "learning_rate": 4.32422931908531e-06, "loss": 0.1306, "step": 2984 }, { "epoch": 9.415420928402833, "grad_norm": 0.23033494597201723, "learning_rate": 4.320762718061228e-06, "loss": 0.1341, "step": 2985 }, { "epoch": 9.418568056648308, "grad_norm": 0.24542818156662408, "learning_rate": 4.317314582300496e-06, "loss": 0.1324, "step": 2986 }, { "epoch": 9.421715184893785, "grad_norm": 0.2363958350642206, "learning_rate": 4.313884915404562e-06, "loss": 0.1346, "step": 2987 }, { "epoch": 9.424862313139261, "grad_norm": 0.24392888891052444, "learning_rate": 4.3104737209555735e-06, "loss": 0.1293, "step": 2988 }, { "epoch": 9.428009441384736, "grad_norm": 0.2270478248172354, "learning_rate": 4.30708100251639e-06, "loss": 0.1342, "step": 2989 }, { "epoch": 9.431156569630213, "grad_norm": 0.2419755010247926, "learning_rate": 4.3037067636305695e-06, "loss": 0.1361, "step": 2990 }, { "epoch": 9.434303697875688, "grad_norm": 0.24302820086496213, "learning_rate": 4.3003510078223735e-06, "loss": 0.1357, "step": 2991 }, { "epoch": 9.437450826121164, "grad_norm": 0.22889930071492856, "learning_rate": 4.297013738596754e-06, "loss": 0.1326, "step": 2992 }, { "epoch": 9.440597954366641, "grad_norm": 0.2470105830931397, "learning_rate": 4.293694959439357e-06, "loss": 0.1307, "step": 2993 }, { "epoch": 9.443745082612116, "grad_norm": 0.24509387802011043, "learning_rate": 4.290394673816518e-06, "loss": 0.1351, "step": 2994 }, { "epoch": 9.446892210857593, "grad_norm": 0.24182787680185575, "learning_rate": 4.287112885175252e-06, "loss": 0.1392, "step": 2995 }, { "epoch": 9.450039339103068, "grad_norm": 0.2467508126477943, "learning_rate": 4.283849596943258e-06, "loss": 0.1263, "step": 2996 }, { "epoch": 9.453186467348544, "grad_norm": 0.2316135678434177, "learning_rate": 4.280604812528912e-06, "loss": 0.1324, "step": 2997 }, { "epoch": 9.456333595594021, "grad_norm": 0.2268465070914208, "learning_rate": 4.277378535321262e-06, "loss": 0.1328, "step": 2998 }, { "epoch": 9.459480723839496, "grad_norm": 0.2332807858816893, "learning_rate": 4.274170768690028e-06, "loss": 0.1373, "step": 2999 }, { "epoch": 9.462627852084973, "grad_norm": 0.22900858658201778, "learning_rate": 4.270981515985594e-06, "loss": 0.1329, "step": 3000 }, { "epoch": 9.465774980330448, "grad_norm": 0.2369332302133477, "learning_rate": 4.26781078053901e-06, "loss": 0.1314, "step": 3001 }, { "epoch": 9.468922108575924, "grad_norm": 0.24455011847922023, "learning_rate": 4.264658565661981e-06, "loss": 0.1285, "step": 3002 }, { "epoch": 9.472069236821401, "grad_norm": 0.239435503067824, "learning_rate": 4.261524874646873e-06, "loss": 0.1332, "step": 3003 }, { "epoch": 9.475216365066876, "grad_norm": 0.2492226908546605, "learning_rate": 4.258409710766699e-06, "loss": 0.1278, "step": 3004 }, { "epoch": 9.478363493312353, "grad_norm": 0.23444315425803552, "learning_rate": 4.255313077275127e-06, "loss": 0.1376, "step": 3005 }, { "epoch": 9.48151062155783, "grad_norm": 0.23211757532136212, "learning_rate": 4.252234977406469e-06, "loss": 0.1327, "step": 3006 }, { "epoch": 9.484657749803304, "grad_norm": 0.2313035945554957, "learning_rate": 4.249175414375676e-06, "loss": 0.1335, "step": 3007 }, { "epoch": 9.487804878048781, "grad_norm": 0.24305332604029584, "learning_rate": 4.246134391378343e-06, "loss": 0.1288, "step": 3008 }, { "epoch": 9.490952006294256, "grad_norm": 0.2416576523834277, "learning_rate": 4.243111911590694e-06, "loss": 0.1346, "step": 3009 }, { "epoch": 9.494099134539733, "grad_norm": 0.23223887323478282, "learning_rate": 4.240107978169594e-06, "loss": 0.1357, "step": 3010 }, { "epoch": 9.49724626278521, "grad_norm": 0.23335311574377063, "learning_rate": 4.23712259425253e-06, "loss": 0.1336, "step": 3011 }, { "epoch": 9.500393391030684, "grad_norm": 0.23028601506740967, "learning_rate": 4.234155762957619e-06, "loss": 0.1367, "step": 3012 }, { "epoch": 9.50354051927616, "grad_norm": 0.2446292329288432, "learning_rate": 4.231207487383596e-06, "loss": 0.1363, "step": 3013 }, { "epoch": 9.506687647521636, "grad_norm": 0.24503357102458362, "learning_rate": 4.228277770609821e-06, "loss": 0.1386, "step": 3014 }, { "epoch": 9.509834775767112, "grad_norm": 0.2445600531818062, "learning_rate": 4.225366615696263e-06, "loss": 0.1369, "step": 3015 }, { "epoch": 9.51298190401259, "grad_norm": 0.23791913882537596, "learning_rate": 4.222474025683514e-06, "loss": 0.1346, "step": 3016 }, { "epoch": 9.516129032258064, "grad_norm": 0.2429591101645711, "learning_rate": 4.219600003592767e-06, "loss": 0.1307, "step": 3017 }, { "epoch": 9.51927616050354, "grad_norm": 0.3968467791341316, "learning_rate": 4.2167445524258226e-06, "loss": 0.1379, "step": 3018 }, { "epoch": 9.522423288749017, "grad_norm": 0.23354286684086903, "learning_rate": 4.213907675165086e-06, "loss": 0.1312, "step": 3019 }, { "epoch": 9.525570416994492, "grad_norm": 0.23749531424478923, "learning_rate": 4.2110893747735655e-06, "loss": 0.1308, "step": 3020 }, { "epoch": 9.528717545239969, "grad_norm": 0.24147254873584723, "learning_rate": 4.2082896541948675e-06, "loss": 0.1374, "step": 3021 }, { "epoch": 9.531864673485444, "grad_norm": 0.252926435908387, "learning_rate": 4.205508516353183e-06, "loss": 0.139, "step": 3022 }, { "epoch": 9.53501180173092, "grad_norm": 0.24123310444093413, "learning_rate": 4.202745964153305e-06, "loss": 0.1296, "step": 3023 }, { "epoch": 9.538158929976397, "grad_norm": 0.22888213642637095, "learning_rate": 4.200002000480605e-06, "loss": 0.1333, "step": 3024 }, { "epoch": 9.541306058221872, "grad_norm": 0.23412443013585427, "learning_rate": 4.197276628201048e-06, "loss": 0.1357, "step": 3025 }, { "epoch": 9.544453186467349, "grad_norm": 0.2506322070152408, "learning_rate": 4.194569850161179e-06, "loss": 0.1351, "step": 3026 }, { "epoch": 9.547600314712824, "grad_norm": 0.2314307114246427, "learning_rate": 4.191881669188117e-06, "loss": 0.1377, "step": 3027 }, { "epoch": 9.5507474429583, "grad_norm": 0.2383274553063989, "learning_rate": 4.1892120880895605e-06, "loss": 0.1333, "step": 3028 }, { "epoch": 9.553894571203777, "grad_norm": 0.23157419286047795, "learning_rate": 4.186561109653784e-06, "loss": 0.1401, "step": 3029 }, { "epoch": 9.557041699449252, "grad_norm": 0.2454714122459766, "learning_rate": 4.1839287366496285e-06, "loss": 0.1351, "step": 3030 }, { "epoch": 9.560188827694729, "grad_norm": 0.23213441228577894, "learning_rate": 4.181314971826502e-06, "loss": 0.1349, "step": 3031 }, { "epoch": 9.563335955940204, "grad_norm": 0.2383713805883692, "learning_rate": 4.178719817914378e-06, "loss": 0.1322, "step": 3032 }, { "epoch": 9.56648308418568, "grad_norm": 0.23809296540852945, "learning_rate": 4.176143277623796e-06, "loss": 0.1236, "step": 3033 }, { "epoch": 9.569630212431157, "grad_norm": 0.2452859142596891, "learning_rate": 4.1735853536458455e-06, "loss": 0.1334, "step": 3034 }, { "epoch": 9.572777340676632, "grad_norm": 0.2528048789801201, "learning_rate": 4.1710460486521795e-06, "loss": 0.1345, "step": 3035 }, { "epoch": 9.575924468922109, "grad_norm": 0.24602308858149582, "learning_rate": 4.168525365295002e-06, "loss": 0.1352, "step": 3036 }, { "epoch": 9.579071597167584, "grad_norm": 0.24143788400432517, "learning_rate": 4.166023306207066e-06, "loss": 0.1344, "step": 3037 }, { "epoch": 9.58221872541306, "grad_norm": 0.24049001092299743, "learning_rate": 4.163539874001671e-06, "loss": 0.1361, "step": 3038 }, { "epoch": 9.585365853658537, "grad_norm": 0.23227717157473632, "learning_rate": 4.161075071272668e-06, "loss": 0.1305, "step": 3039 }, { "epoch": 9.588512981904012, "grad_norm": 0.2350438398401355, "learning_rate": 4.158628900594442e-06, "loss": 0.1313, "step": 3040 }, { "epoch": 9.591660110149489, "grad_norm": 0.2374230007231774, "learning_rate": 4.156201364521924e-06, "loss": 0.1356, "step": 3041 }, { "epoch": 9.594807238394965, "grad_norm": 0.23945340129922146, "learning_rate": 4.1537924655905785e-06, "loss": 0.137, "step": 3042 }, { "epoch": 9.59795436664044, "grad_norm": 0.23799731806867103, "learning_rate": 4.151402206316405e-06, "loss": 0.1294, "step": 3043 }, { "epoch": 9.601101494885917, "grad_norm": 0.2361952391608318, "learning_rate": 4.1490305891959334e-06, "loss": 0.1312, "step": 3044 }, { "epoch": 9.604248623131392, "grad_norm": 0.24879673775767014, "learning_rate": 4.146677616706226e-06, "loss": 0.1305, "step": 3045 }, { "epoch": 9.607395751376869, "grad_norm": 0.23579330579462654, "learning_rate": 4.144343291304867e-06, "loss": 0.1342, "step": 3046 }, { "epoch": 9.610542879622345, "grad_norm": 0.2293828810426797, "learning_rate": 4.14202761542997e-06, "loss": 0.1397, "step": 3047 }, { "epoch": 9.61369000786782, "grad_norm": 0.22998664945318115, "learning_rate": 4.139730591500165e-06, "loss": 0.1343, "step": 3048 }, { "epoch": 9.616837136113297, "grad_norm": 0.22872577120241833, "learning_rate": 4.137452221914602e-06, "loss": 0.1315, "step": 3049 }, { "epoch": 9.619984264358772, "grad_norm": 0.25033210170902204, "learning_rate": 4.135192509052947e-06, "loss": 0.1324, "step": 3050 }, { "epoch": 9.623131392604249, "grad_norm": 0.23939961130535323, "learning_rate": 4.132951455275385e-06, "loss": 0.1347, "step": 3051 }, { "epoch": 9.626278520849725, "grad_norm": 0.24037659462353853, "learning_rate": 4.130729062922602e-06, "loss": 0.1323, "step": 3052 }, { "epoch": 9.6294256490952, "grad_norm": 0.22674798888307743, "learning_rate": 4.1285253343158045e-06, "loss": 0.1418, "step": 3053 }, { "epoch": 9.632572777340677, "grad_norm": 0.24399535601791178, "learning_rate": 4.126340271756696e-06, "loss": 0.1345, "step": 3054 }, { "epoch": 9.635719905586154, "grad_norm": 0.2380467339574685, "learning_rate": 4.1241738775274875e-06, "loss": 0.1322, "step": 3055 }, { "epoch": 9.638867033831628, "grad_norm": 0.2291006675130497, "learning_rate": 4.122026153890896e-06, "loss": 0.1365, "step": 3056 }, { "epoch": 9.642014162077105, "grad_norm": 0.23764446572668885, "learning_rate": 4.119897103090129e-06, "loss": 0.1362, "step": 3057 }, { "epoch": 9.64516129032258, "grad_norm": 0.24054188658200384, "learning_rate": 4.117786727348898e-06, "loss": 0.135, "step": 3058 }, { "epoch": 9.648308418568057, "grad_norm": 0.22923630142098805, "learning_rate": 4.1156950288714084e-06, "loss": 0.1376, "step": 3059 }, { "epoch": 9.651455546813533, "grad_norm": 0.23674042582246743, "learning_rate": 4.113622009842354e-06, "loss": 0.138, "step": 3060 }, { "epoch": 9.654602675059008, "grad_norm": 0.23241455559511415, "learning_rate": 4.111567672426922e-06, "loss": 0.1394, "step": 3061 }, { "epoch": 9.657749803304485, "grad_norm": 0.22726828049867015, "learning_rate": 4.109532018770787e-06, "loss": 0.1299, "step": 3062 }, { "epoch": 9.66089693154996, "grad_norm": 0.2320797444398632, "learning_rate": 4.107515051000108e-06, "loss": 0.1364, "step": 3063 }, { "epoch": 9.664044059795437, "grad_norm": 0.237128823718978, "learning_rate": 4.105516771221528e-06, "loss": 0.1312, "step": 3064 }, { "epoch": 9.667191188040913, "grad_norm": 0.24882016641887955, "learning_rate": 4.10353718152217e-06, "loss": 0.1342, "step": 3065 }, { "epoch": 9.670338316286388, "grad_norm": 0.23873796840310055, "learning_rate": 4.1015762839696396e-06, "loss": 0.1345, "step": 3066 }, { "epoch": 9.673485444531865, "grad_norm": 0.2391341419187129, "learning_rate": 4.099634080612016e-06, "loss": 0.1324, "step": 3067 }, { "epoch": 9.67663257277734, "grad_norm": 0.23289972746538012, "learning_rate": 4.097710573477852e-06, "loss": 0.1389, "step": 3068 }, { "epoch": 9.679779701022817, "grad_norm": 0.23306239327347605, "learning_rate": 4.095805764576177e-06, "loss": 0.1362, "step": 3069 }, { "epoch": 9.682926829268293, "grad_norm": 0.24480810822083465, "learning_rate": 4.093919655896484e-06, "loss": 0.1278, "step": 3070 }, { "epoch": 9.686073957513768, "grad_norm": 0.23785192923431273, "learning_rate": 4.092052249408746e-06, "loss": 0.1325, "step": 3071 }, { "epoch": 9.689221085759245, "grad_norm": 0.23076825052848043, "learning_rate": 4.090203547063389e-06, "loss": 0.136, "step": 3072 }, { "epoch": 9.69236821400472, "grad_norm": 0.2339128976885369, "learning_rate": 4.0883735507913105e-06, "loss": 0.1324, "step": 3073 }, { "epoch": 9.695515342250197, "grad_norm": 0.24873730826363002, "learning_rate": 4.0865622625038725e-06, "loss": 0.1305, "step": 3074 }, { "epoch": 9.698662470495673, "grad_norm": 0.23691902367088646, "learning_rate": 4.08476968409289e-06, "loss": 0.1324, "step": 3075 }, { "epoch": 9.701809598741148, "grad_norm": 0.24004736754782502, "learning_rate": 4.0829958174306435e-06, "loss": 0.1395, "step": 3076 }, { "epoch": 9.704956726986625, "grad_norm": 0.23649363473883206, "learning_rate": 4.081240664369862e-06, "loss": 0.1297, "step": 3077 }, { "epoch": 9.708103855232102, "grad_norm": 0.2419505558569642, "learning_rate": 4.079504226743739e-06, "loss": 0.136, "step": 3078 }, { "epoch": 9.711250983477576, "grad_norm": 0.23502797156800156, "learning_rate": 4.077786506365911e-06, "loss": 0.1334, "step": 3079 }, { "epoch": 9.714398111723053, "grad_norm": 0.21960473813286804, "learning_rate": 4.076087505030471e-06, "loss": 0.1356, "step": 3080 }, { "epoch": 9.717545239968528, "grad_norm": 0.23767379913931116, "learning_rate": 4.074407224511955e-06, "loss": 0.1325, "step": 3081 }, { "epoch": 9.720692368214005, "grad_norm": 0.2296867126057468, "learning_rate": 4.072745666565352e-06, "loss": 0.1345, "step": 3082 }, { "epoch": 9.723839496459481, "grad_norm": 0.23047397670360414, "learning_rate": 4.071102832926097e-06, "loss": 0.1302, "step": 3083 }, { "epoch": 9.726986624704956, "grad_norm": 0.23998182867200016, "learning_rate": 4.0694787253100585e-06, "loss": 0.1338, "step": 3084 }, { "epoch": 9.730133752950433, "grad_norm": 0.2323852688580098, "learning_rate": 4.067873345413555e-06, "loss": 0.1315, "step": 3085 }, { "epoch": 9.733280881195908, "grad_norm": 0.24171313095121516, "learning_rate": 4.066286694913345e-06, "loss": 0.1341, "step": 3086 }, { "epoch": 9.736428009441385, "grad_norm": 0.22679821098821545, "learning_rate": 4.064718775466618e-06, "loss": 0.1269, "step": 3087 }, { "epoch": 9.739575137686861, "grad_norm": 0.23502911107275046, "learning_rate": 4.063169588711004e-06, "loss": 0.1345, "step": 3088 }, { "epoch": 9.742722265932336, "grad_norm": 0.23836190822770764, "learning_rate": 4.0616391362645715e-06, "loss": 0.1346, "step": 3089 }, { "epoch": 9.745869394177813, "grad_norm": 0.2278799148478252, "learning_rate": 4.060127419725812e-06, "loss": 0.1367, "step": 3090 }, { "epoch": 9.74901652242329, "grad_norm": 0.2456917732347001, "learning_rate": 4.058634440673658e-06, "loss": 0.1326, "step": 3091 }, { "epoch": 9.752163650668765, "grad_norm": 0.23836719259607256, "learning_rate": 4.057160200667464e-06, "loss": 0.1308, "step": 3092 }, { "epoch": 9.755310778914241, "grad_norm": 0.2429286720699004, "learning_rate": 4.055704701247018e-06, "loss": 0.1327, "step": 3093 }, { "epoch": 9.758457907159716, "grad_norm": 0.23875447802288385, "learning_rate": 4.05426794393253e-06, "loss": 0.1314, "step": 3094 }, { "epoch": 9.761605035405193, "grad_norm": 0.2278467270000926, "learning_rate": 4.052849930224636e-06, "loss": 0.1356, "step": 3095 }, { "epoch": 9.76475216365067, "grad_norm": 0.23233849614758936, "learning_rate": 4.051450661604395e-06, "loss": 0.1311, "step": 3096 }, { "epoch": 9.767899291896144, "grad_norm": 0.23866830169952308, "learning_rate": 4.0500701395332875e-06, "loss": 0.1297, "step": 3097 }, { "epoch": 9.771046420141621, "grad_norm": 0.23851746320196832, "learning_rate": 4.0487083654532165e-06, "loss": 0.1357, "step": 3098 }, { "epoch": 9.774193548387096, "grad_norm": 0.2327507724699466, "learning_rate": 4.047365340786496e-06, "loss": 0.1338, "step": 3099 }, { "epoch": 9.777340676632573, "grad_norm": 0.2293150922887092, "learning_rate": 4.046041066935868e-06, "loss": 0.1295, "step": 3100 }, { "epoch": 9.78048780487805, "grad_norm": 0.2391233027321985, "learning_rate": 4.044735545284482e-06, "loss": 0.1389, "step": 3101 }, { "epoch": 9.783634933123524, "grad_norm": 0.23945974117378163, "learning_rate": 4.043448777195901e-06, "loss": 0.1381, "step": 3102 }, { "epoch": 9.786782061369001, "grad_norm": 0.22786053750451218, "learning_rate": 4.042180764014107e-06, "loss": 0.1385, "step": 3103 }, { "epoch": 9.789929189614476, "grad_norm": 0.23783743029557908, "learning_rate": 4.040931507063487e-06, "loss": 0.1366, "step": 3104 }, { "epoch": 9.793076317859953, "grad_norm": 0.24131425400897927, "learning_rate": 4.039701007648843e-06, "loss": 0.1336, "step": 3105 }, { "epoch": 9.79622344610543, "grad_norm": 0.24372846124327077, "learning_rate": 4.0384892670553795e-06, "loss": 0.1383, "step": 3106 }, { "epoch": 9.799370574350904, "grad_norm": 0.23973719599414378, "learning_rate": 4.0372962865487145e-06, "loss": 0.1326, "step": 3107 }, { "epoch": 9.802517702596381, "grad_norm": 0.24009549823324744, "learning_rate": 4.036122067374869e-06, "loss": 0.1338, "step": 3108 }, { "epoch": 9.805664830841856, "grad_norm": 0.23976142453197466, "learning_rate": 4.034966610760265e-06, "loss": 0.1401, "step": 3109 }, { "epoch": 9.808811959087333, "grad_norm": 0.22620166570113598, "learning_rate": 4.033829917911736e-06, "loss": 0.138, "step": 3110 }, { "epoch": 9.81195908733281, "grad_norm": 0.23910385806970655, "learning_rate": 4.032711990016509e-06, "loss": 0.1319, "step": 3111 }, { "epoch": 9.815106215578284, "grad_norm": 0.22753533073166138, "learning_rate": 4.031612828242216e-06, "loss": 0.1338, "step": 3112 }, { "epoch": 9.818253343823761, "grad_norm": 0.2421208770020626, "learning_rate": 4.030532433736889e-06, "loss": 0.1365, "step": 3113 }, { "epoch": 9.821400472069238, "grad_norm": 0.23782192539849972, "learning_rate": 4.029470807628956e-06, "loss": 0.1356, "step": 3114 }, { "epoch": 9.824547600314713, "grad_norm": 0.23934965568846245, "learning_rate": 4.028427951027245e-06, "loss": 0.1358, "step": 3115 }, { "epoch": 9.82769472856019, "grad_norm": 0.23281488681008186, "learning_rate": 4.027403865020977e-06, "loss": 0.1308, "step": 3116 }, { "epoch": 9.830841856805664, "grad_norm": 0.2387332651986393, "learning_rate": 4.026398550679772e-06, "loss": 0.1317, "step": 3117 }, { "epoch": 9.83398898505114, "grad_norm": 0.24160259965898973, "learning_rate": 4.025412009053636e-06, "loss": 0.1364, "step": 3118 }, { "epoch": 9.837136113296618, "grad_norm": 0.23512080872741115, "learning_rate": 4.0244442411729775e-06, "loss": 0.135, "step": 3119 }, { "epoch": 9.840283241542092, "grad_norm": 0.23515000349024553, "learning_rate": 4.02349524804859e-06, "loss": 0.1424, "step": 3120 }, { "epoch": 9.84343036978757, "grad_norm": 0.24040363807543386, "learning_rate": 4.02256503067166e-06, "loss": 0.1326, "step": 3121 }, { "epoch": 9.846577498033044, "grad_norm": 0.23796510922943842, "learning_rate": 4.021653590013759e-06, "loss": 0.1402, "step": 3122 }, { "epoch": 9.84972462627852, "grad_norm": 0.2405813863455342, "learning_rate": 4.020760927026856e-06, "loss": 0.1382, "step": 3123 }, { "epoch": 9.852871754523997, "grad_norm": 0.2476312470454177, "learning_rate": 4.019887042643299e-06, "loss": 0.1308, "step": 3124 }, { "epoch": 9.856018882769472, "grad_norm": 0.23636778802712008, "learning_rate": 4.019031937775827e-06, "loss": 0.1351, "step": 3125 }, { "epoch": 9.859166011014949, "grad_norm": 0.23409257155120913, "learning_rate": 4.01819561331756e-06, "loss": 0.1376, "step": 3126 }, { "epoch": 9.862313139260426, "grad_norm": 0.234688041297315, "learning_rate": 4.017378070142011e-06, "loss": 0.131, "step": 3127 }, { "epoch": 9.8654602675059, "grad_norm": 0.23250184458501116, "learning_rate": 4.016579309103068e-06, "loss": 0.1312, "step": 3128 }, { "epoch": 9.868607395751377, "grad_norm": 0.2334349376577723, "learning_rate": 4.015799331035007e-06, "loss": 0.1323, "step": 3129 }, { "epoch": 9.871754523996852, "grad_norm": 0.23703297054055375, "learning_rate": 4.015038136752481e-06, "loss": 0.1343, "step": 3130 }, { "epoch": 9.874901652242329, "grad_norm": 0.24196689257467524, "learning_rate": 4.01429572705053e-06, "loss": 0.1355, "step": 3131 }, { "epoch": 9.878048780487806, "grad_norm": 0.24673350426408439, "learning_rate": 4.013572102704572e-06, "loss": 0.1323, "step": 3132 }, { "epoch": 9.88119590873328, "grad_norm": 0.23280087204706737, "learning_rate": 4.012867264470404e-06, "loss": 0.1336, "step": 3133 }, { "epoch": 9.884343036978757, "grad_norm": 0.24120088645534854, "learning_rate": 4.0121812130842e-06, "loss": 0.14, "step": 3134 }, { "epoch": 9.887490165224232, "grad_norm": 0.2337805163325715, "learning_rate": 4.0115139492625134e-06, "loss": 0.1361, "step": 3135 }, { "epoch": 9.890637293469709, "grad_norm": 0.23532202894035434, "learning_rate": 4.0108654737022755e-06, "loss": 0.1335, "step": 3136 }, { "epoch": 9.893784421715186, "grad_norm": 0.24125197048086924, "learning_rate": 4.010235787080794e-06, "loss": 0.1378, "step": 3137 }, { "epoch": 9.89693154996066, "grad_norm": 0.24669623748502031, "learning_rate": 4.00962489005575e-06, "loss": 0.1326, "step": 3138 }, { "epoch": 9.900078678206137, "grad_norm": 0.24343818316183338, "learning_rate": 4.009032783265204e-06, "loss": 0.1348, "step": 3139 }, { "epoch": 9.903225806451612, "grad_norm": 0.22541573373892404, "learning_rate": 4.008459467327586e-06, "loss": 0.1334, "step": 3140 }, { "epoch": 9.906372934697089, "grad_norm": 0.24314992873545332, "learning_rate": 4.007904942841702e-06, "loss": 0.1333, "step": 3141 }, { "epoch": 9.909520062942565, "grad_norm": 0.24444507569036877, "learning_rate": 4.007369210386732e-06, "loss": 0.1355, "step": 3142 }, { "epoch": 9.91266719118804, "grad_norm": 0.23456707600284918, "learning_rate": 4.006852270522226e-06, "loss": 0.1373, "step": 3143 }, { "epoch": 9.915814319433517, "grad_norm": 0.22873767694094543, "learning_rate": 4.006354123788107e-06, "loss": 0.1382, "step": 3144 }, { "epoch": 9.918961447678992, "grad_norm": 0.23735618369859499, "learning_rate": 4.00587477070467e-06, "loss": 0.1379, "step": 3145 }, { "epoch": 9.922108575924469, "grad_norm": 0.23956072263659126, "learning_rate": 4.005414211772583e-06, "loss": 0.1371, "step": 3146 }, { "epoch": 9.925255704169945, "grad_norm": 0.23043205951102974, "learning_rate": 4.004972447472878e-06, "loss": 0.1327, "step": 3147 }, { "epoch": 9.92840283241542, "grad_norm": 0.2344662864402095, "learning_rate": 4.00454947826696e-06, "loss": 0.1319, "step": 3148 }, { "epoch": 9.931549960660897, "grad_norm": 0.24245229721032507, "learning_rate": 4.0041453045966055e-06, "loss": 0.1383, "step": 3149 }, { "epoch": 9.934697088906374, "grad_norm": 0.24522019444100665, "learning_rate": 4.003759926883958e-06, "loss": 0.1346, "step": 3150 }, { "epoch": 9.937844217151849, "grad_norm": 0.23782263325071246, "learning_rate": 4.003393345531529e-06, "loss": 0.145, "step": 3151 }, { "epoch": 9.940991345397325, "grad_norm": 0.24011271527037592, "learning_rate": 4.0030455609221975e-06, "loss": 0.1341, "step": 3152 }, { "epoch": 9.9441384736428, "grad_norm": 0.23685295626777536, "learning_rate": 4.0027165734192115e-06, "loss": 0.1343, "step": 3153 }, { "epoch": 9.947285601888277, "grad_norm": 0.24415877369974173, "learning_rate": 4.002406383366186e-06, "loss": 0.1343, "step": 3154 }, { "epoch": 9.950432730133754, "grad_norm": 0.23530638296528372, "learning_rate": 4.0021149910871e-06, "loss": 0.1344, "step": 3155 }, { "epoch": 9.953579858379229, "grad_norm": 0.24560555060327768, "learning_rate": 4.001842396886302e-06, "loss": 0.1364, "step": 3156 }, { "epoch": 9.956726986624705, "grad_norm": 0.23476225232747105, "learning_rate": 4.001588601048508e-06, "loss": 0.1339, "step": 3157 }, { "epoch": 9.95987411487018, "grad_norm": 0.2326742437614356, "learning_rate": 4.0013536038387946e-06, "loss": 0.138, "step": 3158 }, { "epoch": 9.963021243115657, "grad_norm": 0.23396824661309967, "learning_rate": 4.00113740550261e-06, "loss": 0.133, "step": 3159 }, { "epoch": 9.966168371361134, "grad_norm": 0.23890877834056157, "learning_rate": 4.000940006265763e-06, "loss": 0.1362, "step": 3160 }, { "epoch": 9.969315499606608, "grad_norm": 0.232863057932829, "learning_rate": 4.000761406334429e-06, "loss": 0.1298, "step": 3161 }, { "epoch": 9.972462627852085, "grad_norm": 0.24073916528859923, "learning_rate": 4.000601605895147e-06, "loss": 0.1449, "step": 3162 }, { "epoch": 9.975609756097562, "grad_norm": 0.23431234155479022, "learning_rate": 4.000460605114827e-06, "loss": 0.1384, "step": 3163 }, { "epoch": 9.978756884343037, "grad_norm": 0.2290325302172204, "learning_rate": 4.000338404140736e-06, "loss": 0.1353, "step": 3164 }, { "epoch": 9.981904012588513, "grad_norm": 0.2480177908337849, "learning_rate": 4.00023500310051e-06, "loss": 0.1325, "step": 3165 }, { "epoch": 9.985051140833988, "grad_norm": 0.23707047681989588, "learning_rate": 4.000150402102143e-06, "loss": 0.1358, "step": 3166 }, { "epoch": 9.988198269079465, "grad_norm": 0.25048005095022025, "learning_rate": 4.000084601234001e-06, "loss": 0.1356, "step": 3167 }, { "epoch": 9.991345397324942, "grad_norm": 0.24387714398171834, "learning_rate": 4.000037600564808e-06, "loss": 0.1329, "step": 3168 }, { "epoch": 9.994492525570417, "grad_norm": 0.24315440115509737, "learning_rate": 4.000009400143658e-06, "loss": 0.137, "step": 3169 }, { "epoch": 9.997639653815893, "grad_norm": 0.22876527267837543, "learning_rate": 4.000000000000001e-06, "loss": 0.1403, "step": 3170 } ], "logging_steps": 1, "max_steps": 3170, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 634, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0340729464569725e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }